Fix SPM conversions

This commit is contained in:
Lysandre 2021-04-21 18:09:36 -04:00 committed by Torsten Scholak
parent 41bf8695a9
commit 319bf0445d
No known key found for this signature in database
GPG Key ID: EF135E6C40866D80

View File

@@ -131,7 +131,7 @@ class AlbertConverter(SpmConverter):
]
def normalizer(self, proto):
normalizers = [Replace("``", '"'), Replace("''", '"'), Replace(Regex(" {2,}"), " ")]
normalizers = [Replace("``", '"'), Replace("''", '"')]
if not self.original_tokenizer.keep_accents:
normalizers.append(NFKD())
normalizers.append(StripAccents())
@@ -140,6 +140,7 @@ class AlbertConverter(SpmConverter):
precompiled_charsmap = proto.normalizer_spec.precompiled_charsmap
normalizers.append(Precompiled(precompiled_charsmap))
normalizers.append(Replace(Regex(" {2,}"), " "))
return Sequence(normalizers)
def post_processor(self, tokenizer):
@@ -267,7 +268,7 @@ class XLNetConverter(SpmConverter):
]
def normalizer(self, proto):
normalizers = [Replace("``", '"'), Replace("''", '"'), Replace(Regex(" {2,}"), " ")]
normalizers = [Replace("``", '"'), Replace("''", '"')]
if not self.original_tokenizer.keep_accents:
normalizers.append(NFKD())
normalizers.append(StripAccents())
@@ -276,6 +277,7 @@ class XLNetConverter(SpmConverter):
precompiled_charsmap = proto.normalizer_spec.precompiled_charsmap
normalizers.append(Precompiled(precompiled_charsmap))
normalizers.append(Replace(Regex(" {2,}"), " "))
return Sequence(normalizers)
def post_processor(self, tokenizer):