diff --git a/bindings/python/scripts/convert.py b/bindings/python/scripts/convert.py index 4847bcf..6c812f8 100644 --- a/bindings/python/scripts/convert.py +++ b/bindings/python/scripts/convert.py @@ -131,7 +131,7 @@ class AlbertConverter(SpmConverter): ] def normalizer(self, proto): - normalizers = [Replace("``", '"'), Replace("''", '"'), Replace(Regex(" {2,}"), " ")] + normalizers = [Replace("``", '"'), Replace("''", '"')] if not self.original_tokenizer.keep_accents: normalizers.append(NFKD()) normalizers.append(StripAccents()) @@ -140,6 +140,7 @@ class AlbertConverter(SpmConverter): precompiled_charsmap = proto.normalizer_spec.precompiled_charsmap normalizers.append(Precompiled(precompiled_charsmap)) + normalizers.append(Replace(Regex(" {2,}"), " ")) return Sequence(normalizers) def post_processor(self, tokenizer): @@ -267,7 +268,7 @@ class XLNetConverter(SpmConverter): ] def normalizer(self, proto): - normalizers = [Replace("``", '"'), Replace("''", '"'), Replace(Regex(" {2,}"), " ")] + normalizers = [Replace("``", '"'), Replace("''", '"')] if not self.original_tokenizer.keep_accents: normalizers.append(NFKD()) normalizers.append(StripAccents()) @@ -276,6 +277,7 @@ class XLNetConverter(SpmConverter): precompiled_charsmap = proto.normalizer_spec.precompiled_charsmap normalizers.append(Precompiled(precompiled_charsmap)) + normalizers.append(Replace(Regex(" {2,}"), " ")) return Sequence(normalizers) def post_processor(self, tokenizer):