From 05829b4c1cfa2c2f2a2d721d7fc937759182cb9c Mon Sep 17 00:00:00 2001 From: Lysandre Date: Wed, 21 Apr 2021 18:09:36 -0400 Subject: [PATCH] Revert "Fix SPM conversions" This reverts commit e1ffe397649838bb7a053fc5c26843248a18b5d5. --- bindings/python/scripts/convert.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/bindings/python/scripts/convert.py b/bindings/python/scripts/convert.py index 6c812f8..4847bcf 100644 --- a/bindings/python/scripts/convert.py +++ b/bindings/python/scripts/convert.py @@ -131,7 +131,7 @@ class AlbertConverter(SpmConverter): ] def normalizer(self, proto): - normalizers = [Replace("``", '"'), Replace("''", '"')] + normalizers = [Replace("``", '"'), Replace("''", '"'), Replace(Regex(" {2,}"), " ")] if not self.original_tokenizer.keep_accents: normalizers.append(NFKD()) normalizers.append(StripAccents()) @@ -140,7 +140,6 @@ class AlbertConverter(SpmConverter): precompiled_charsmap = proto.normalizer_spec.precompiled_charsmap normalizers.append(Precompiled(precompiled_charsmap)) - normalizers.append(Replace(Regex(" {2,}"), " ")) return Sequence(normalizers) def post_processor(self, tokenizer): @@ -268,7 +267,7 @@ class XLNetConverter(SpmConverter): ] def normalizer(self, proto): - normalizers = [Replace("``", '"'), Replace("''", '"')] + normalizers = [Replace("``", '"'), Replace("''", '"'), Replace(Regex(" {2,}"), " ")] if not self.original_tokenizer.keep_accents: normalizers.append(NFKD()) normalizers.append(StripAccents()) @@ -277,7 +276,6 @@ class XLNetConverter(SpmConverter): precompiled_charsmap = proto.normalizer_spec.precompiled_charsmap normalizers.append(Precompiled(precompiled_charsmap)) - normalizers.append(Replace(Regex(" {2,}"), " ")) return Sequence(normalizers) def post_processor(self, tokenizer):