mirror of
https://github.com/hasktorch/tokenizers.git
synced 2024-10-26 06:00:08 +03:00
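Fix SPM conversions (in AlbertConverter and XLNetConverter, apply the Replace(Regex(" {2,}"), " ") normalizer after Precompiled instead of at the start of the normalizer list)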
This commit is contained in:
parent
32b3b7a0f2
commit
e1ffe39764
@@ -131,7 +131,7 @@ class AlbertConverter(SpmConverter):
|
||||
]
|
||||
|
||||
def normalizer(self, proto):
|
||||
normalizers = [Replace("``", '"'), Replace("''", '"'), Replace(Regex(" {2,}"), " ")]
|
||||
normalizers = [Replace("``", '"'), Replace("''", '"')]
|
||||
if not self.original_tokenizer.keep_accents:
|
||||
normalizers.append(NFKD())
|
||||
normalizers.append(StripAccents())
|
||||
@@ -140,6 +140,7 @@ class AlbertConverter(SpmConverter):
|
||||
|
||||
precompiled_charsmap = proto.normalizer_spec.precompiled_charsmap
|
||||
normalizers.append(Precompiled(precompiled_charsmap))
|
||||
normalizers.append(Replace(Regex(" {2,}"), " "))
|
||||
return Sequence(normalizers)
|
||||
|
||||
def post_processor(self, tokenizer):
|
||||
@@ -267,7 +268,7 @@ class XLNetConverter(SpmConverter):
|
||||
]
|
||||
|
||||
def normalizer(self, proto):
|
||||
normalizers = [Replace("``", '"'), Replace("''", '"'), Replace(Regex(" {2,}"), " ")]
|
||||
normalizers = [Replace("``", '"'), Replace("''", '"')]
|
||||
if not self.original_tokenizer.keep_accents:
|
||||
normalizers.append(NFKD())
|
||||
normalizers.append(StripAccents())
|
||||
@@ -276,6 +277,7 @@ class XLNetConverter(SpmConverter):
|
||||
|
||||
precompiled_charsmap = proto.normalizer_spec.precompiled_charsmap
|
||||
normalizers.append(Precompiled(precompiled_charsmap))
|
||||
normalizers.append(Replace(Regex(" {2,}"), " "))
|
||||
return Sequence(normalizers)
|
||||
|
||||
def post_processor(self, tokenizer):
|
||||
|
Loading…
Reference in New Issue
Block a user