diff --git a/bindings/python/tests/bindings/test_processors.py b/bindings/python/tests/bindings/test_processors.py
index 0cefec6..b7bcb5e 100644
--- a/bindings/python/tests/bindings/test_processors.py
+++ b/bindings/python/tests/bindings/test_processors.py
@@ -19,7 +19,7 @@ class TestBertProcessing:
         tokenizer.add_tokens(["my", "name", "is", "john", "pair"])
         tokenizer.post_processor = BertProcessing(("[SEP]", 0), ("[CLS]", 1))
 
-        output = tokenizer.encode("my name", "pair")
+        output = tokenizer.encode(("my name", "pair"))
         assert output.tokens == ["[CLS]", "my", "name", "[SEP]", "pair", "[SEP]"]
         assert output.ids == [1, 2, 3, 0, 6, 0]
 
@@ -37,7 +37,7 @@ class TestRobertaProcessing:
         tokenizer.add_tokens(["my", "name", "is", "john", "pair"])
         tokenizer.post_processor = RobertaProcessing(("</s>", 1), ("<s>", 0))
 
-        output = tokenizer.encode("my name", "pair")
+        output = tokenizer.encode(("my name", "pair"))
         assert output.tokens == ["<s>", "my", "name", "</s>", "</s>", "pair", "</s>"]
         assert output.ids == [0, 2, 3, 1, 1, 6, 1]
 
diff --git a/bindings/python/tests/bindings/test_tokenizer.py b/bindings/python/tests/bindings/test_tokenizer.py
index 6c8f18c..0dd271a 100644
--- a/bindings/python/tests/bindings/test_tokenizer.py
+++ b/bindings/python/tests/bindings/test_tokenizer.py
@@ -102,9 +102,13 @@ class TestTokenizer:
         assert type(output.overflowing) == list
 
         # Can encode a pair of sequences
-        output = tokenizer.encode("my name is john", "pair")
+        output = tokenizer.encode(("my name is john", "pair"))
         assert output.tokens == ["my", "name", "is", "john", "pair"]
 
+        # Can encode a single pre-tokenized sequence
+        output = tokenizer.encode(["my", "name", "is", "john"])
+        assert output.tokens == ["my", "name", "is", "john"]
+
         # Can encode a batch with both a single sequence and a pair of sequences
         output = tokenizer.encode_batch(["my name is john", ("my name is john", "pair")])
         assert len(output) == 2
@@ -136,7 +140,7 @@ class TestTokenizer:
         assert output.tokens == ["my", "name"]
 
         # Can truncate pair sequences as well
-        output = tokenizer.encode("my name is john", "pair")
+        output = tokenizer.encode(("my name is john", "pair"))
         assert output.tokens == ["my", "pair"]
 
     def test_padding(self):
@@ -156,7 +160,7 @@ class TestTokenizer:
         tokenizer.enable_padding(max_length=4)
         output = tokenizer.encode("my name")
         assert output.tokens == ["my", "name", "[PAD]", "[PAD]"]
-        output = tokenizer.encode("my name", "pair")
+        output = tokenizer.encode(("my name", "pair"))
         assert output.tokens == ["my", "name", "pair", "[PAD]"]
 
     def test_decode(self):
diff --git a/bindings/python/tests/implementations/test_bert_wordpiece.py b/bindings/python/tests/implementations/test_bert_wordpiece.py
index fb241cd..91a7e78 100644
--- a/bindings/python/tests/implementations/test_bert_wordpiece.py
+++ b/bindings/python/tests/implementations/test_bert_wordpiece.py
@@ -7,14 +7,14 @@ class TestBertWordPieceBPE:
         tokenizer = BertWordPieceTokenizer(bert_files["vocab"])
 
         # Encode with special tokens by default
-        output = tokenizer.encode("My name is John", "pair")
+        output = tokenizer.encode(("My name is John", "pair"))
         assert output.ids == [101, 2026, 2171, 2003, 2198, 102, 3940, 102]
         assert output.tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]", "pair", "[SEP]"]
         assert output.offsets == [(0, 0), (0, 2), (3, 7), (8, 10), (11, 15), (0, 0), (0, 4), (0, 0)]
         assert output.type_ids == [0, 0, 0, 0, 0, 0, 1, 1]
 
         # Can encode without the special tokens
-        output = tokenizer.encode("My name is John", "pair", add_special_tokens=False)
+        output = tokenizer.encode(("My name is John", "pair"), add_special_tokens=False)
         assert output.ids == [2026, 2171, 2003, 2198, 3940]
         assert output.tokens == ["my", "name", "is", "john", "pair"]
         assert output.offsets == [(0, 2), (3, 7), (8, 10), (11, 15), (0, 4)]
diff --git a/bindings/python/tests/implementations/test_char_bpe.py b/bindings/python/tests/implementations/test_char_bpe.py
index 66b45f4..8a3bd3a 100644
--- a/bindings/python/tests/implementations/test_char_bpe.py
+++ b/bindings/python/tests/implementations/test_char_bpe.py
@@ -6,7 +6,7 @@ class TestBertWordPieceBPE:
     def test_basic_encode(self, openai_files):
         tokenizer = CharBPETokenizer(openai_files["vocab"], openai_files["merges"])
 
-        output = tokenizer.encode("My name is John", "pair")
+        output = tokenizer.encode(("My name is John", "pair"))
         assert output.ids == [0, 253, 1362, 544, 0, 7, 12662, 2688]
         assert output.tokens == [
             "<unk>",
@@ -32,7 +32,7 @@ class TestBertWordPieceBPE:
     def test_lowercase(self, openai_files):
         tokenizer = CharBPETokenizer(openai_files["vocab"], openai_files["merges"], lowercase=True)
 
-        output = tokenizer.encode("My name is John", "pair", add_special_tokens=False)
+        output = tokenizer.encode(("My name is John", "pair"), add_special_tokens=False)
         assert output.ids == [547, 1362, 544, 2476, 2688]
         assert output.tokens == ["my", "name", "is", "john", "pair"]
         assert output.offsets == [(0, 2), (3, 7), (8, 10), (11, 15), (0, 4)]