mirror of
https://github.com/hasktorch/tokenizers.git
synced 2024-10-26 06:00:08 +03:00
Fix SPM conversions
This commit is contained in:
parent
41bf8695a9
commit
319bf0445d
@@ -131,7 +131,7 @@ class AlbertConverter(SpmConverter):
|
|||||||
]
|
]
|
||||||
|
|
||||||
def normalizer(self, proto):
|
def normalizer(self, proto):
|
||||||
normalizers = [Replace("``", '"'), Replace("''", '"'), Replace(Regex(" {2,}"), " ")]
|
normalizers = [Replace("``", '"'), Replace("''", '"')]
|
||||||
if not self.original_tokenizer.keep_accents:
|
if not self.original_tokenizer.keep_accents:
|
||||||
normalizers.append(NFKD())
|
normalizers.append(NFKD())
|
||||||
normalizers.append(StripAccents())
|
normalizers.append(StripAccents())
|
||||||
@@ -140,6 +140,7 @@ class AlbertConverter(SpmConverter):
|
|||||||
|
|
||||||
precompiled_charsmap = proto.normalizer_spec.precompiled_charsmap
|
precompiled_charsmap = proto.normalizer_spec.precompiled_charsmap
|
||||||
normalizers.append(Precompiled(precompiled_charsmap))
|
normalizers.append(Precompiled(precompiled_charsmap))
|
||||||
|
normalizers.append(Replace(Regex(" {2,}"), " "))
|
||||||
return Sequence(normalizers)
|
return Sequence(normalizers)
|
||||||
|
|
||||||
def post_processor(self, tokenizer):
|
def post_processor(self, tokenizer):
|
||||||
@@ -267,7 +268,7 @@ class XLNetConverter(SpmConverter):
|
|||||||
]
|
]
|
||||||
|
|
||||||
def normalizer(self, proto):
|
def normalizer(self, proto):
|
||||||
normalizers = [Replace("``", '"'), Replace("''", '"'), Replace(Regex(" {2,}"), " ")]
|
normalizers = [Replace("``", '"'), Replace("''", '"')]
|
||||||
if not self.original_tokenizer.keep_accents:
|
if not self.original_tokenizer.keep_accents:
|
||||||
normalizers.append(NFKD())
|
normalizers.append(NFKD())
|
||||||
normalizers.append(StripAccents())
|
normalizers.append(StripAccents())
|
||||||
@@ -276,6 +277,7 @@ class XLNetConverter(SpmConverter):
|
|||||||
|
|
||||||
precompiled_charsmap = proto.normalizer_spec.precompiled_charsmap
|
precompiled_charsmap = proto.normalizer_spec.precompiled_charsmap
|
||||||
normalizers.append(Precompiled(precompiled_charsmap))
|
normalizers.append(Precompiled(precompiled_charsmap))
|
||||||
|
normalizers.append(Replace(Regex(" {2,}"), " "))
|
||||||
return Sequence(normalizers)
|
return Sequence(normalizers)
|
||||||
|
|
||||||
def post_processor(self, tokenizer):
|
def post_processor(self, tokenizer):
|
||||||
|
Loading…
Reference in New Issue
Block a user