mirror of
https://github.com/google/sentencepiece.git
synced 2025-01-06 09:19:12 +03:00
Uses NMT_NFKC rule by default.
This commit is contained in:
parent
a574ce183c
commit
a65ca0d829
@ -155525,9 +155525,15 @@ FB9 F90 FB5 # ྐྵ => ྐྵ
|
||||
33FD 33 30 65E5 # ㏽ => 30日
|
||||
33FE 33 31 65E5 # ㏾ => 31日
|
||||
33FF 67 61 6C # ㏿ => gal
|
||||
A69C 44A # ꚜ => ъ
|
||||
A69D 44C # ꚝ => ь
|
||||
A770 A76F # ꝰ => ꝯ
|
||||
A7F8 126 # ꟸ => Ħ
|
||||
A7F9 153 # ꟹ => œ
|
||||
AB5C A727 # ꭜ => ꜧ
|
||||
AB5D AB37 # ꭝ => ꬷ
|
||||
AB5E 26B # ꭞ => ɫ
|
||||
AB5F AB52 # ꭟ => ꭒ
|
||||
F900 8C48 # 豈 => 豈
|
||||
F901 66F4 # 更 => 更
|
||||
F902 8ECA # 車 => 車
|
||||
@ -212802,6 +212808,13 @@ FFEE 25CB # ○ => ○
|
||||
110A5 110BA 110AB # 𑂫 => 𑂫
|
||||
11131 11127 1112E # 𑄮 => 𑄮
|
||||
11132 11127 1112F # 𑄯 => 𑄯
|
||||
11347 1133E 1134B # 𑍋 => 𑍋
|
||||
11347 11357 1134C # 𑍌 => 𑍌
|
||||
114B9 114B0 114BC # 𑒼 => 𑒼
|
||||
114B9 114BA 114BB # 𑒻 => 𑒻
|
||||
114B9 114BD 114BE # 𑒾 => 𑒾
|
||||
115B8 115AF 115BA # 𑖺 => 𑖺
|
||||
115B9 115AF 115BB # 𑖻 => 𑖻
|
||||
1D15E 1D157 1D165 # 𝅗𝅥 => 𝅗𝅥
|
||||
1D15F 1D158 1D165 # 𝅘𝅥 => 𝅘𝅥
|
||||
1D160 1D158 1D165 1D16E # 𝅘𝅥𝅮 => 𝅘𝅥𝅮
|
||||
|
Can't render this file because it is too large.
|
224679
data/nmt_nfkc.tsv
Normal file
224679
data/nmt_nfkc.tsv
Normal file
File diff suppressed because it is too large
Load Diff
@ -344,6 +344,44 @@ util::Status Builder::BuildNFKCMap(CharsMap *chars_map) {
|
||||
return util::OkStatus();
|
||||
}
|
||||
|
||||
// Builds the NMT variant of the NFKC normalization map.
//
// When ENABLE_NFKC_COMPILE is defined, the map produced by BuildNFKCMap() is
// used as the starting point, a fixed list of code points is remapped to
// U+0020 (SPACE), the entry for U+FF5E is erased, RemoveRedundantMap() is
// applied, and the result is moved into `*chars_map`.
//
// When ENABLE_NFKC_COMPILE is not defined, only an error is logged:
// `*chars_map` is left unmodified and util::OkStatus() is still returned.
util::Status Builder::BuildNmtNFKCMap(CharsMap *chars_map) {
|
||||
#ifdef ENABLE_NFKC_COMPILE
|
||||
LOG(INFO) << "Running BuildNmtNFKCMap";
|
||||
|
||||
// Work on a local map so `*chars_map` is only assigned once every step
// below has completed.
CharsMap nfkc_map;
|
||||
// RETURN_IF_ERROR is defined elsewhere; presumably it returns the non-OK
// status to the caller -- confirm against its definition.
RETURN_IF_ERROR(Builder::BuildNFKCMap(&nfkc_map));
|
||||
|
||||
// Other code points considered as whitespace: each single-code-point key
// below is (re)mapped to U+0020 (SPACE).
|
||||
nfkc_map[{0x9}] = {0x20}; // TAB
|
||||
nfkc_map[{0xA}] = {0x20}; // LINE FEED
|
||||
nfkc_map[{0xC}] = {0x20}; // FORM FEED
|
||||
nfkc_map[{0xD}] = {0x20}; // CARRIAGE RETURN
|
||||
nfkc_map[{0x1680}] = {0x20}; // OGHAM SPACE MARK
|
||||
nfkc_map[{0x200B}] = {0x20}; // ZERO WIDTH SPACE
|
||||
nfkc_map[{0x200E}] = {0x20}; // LEFT-TO-RIGHT MARK
|
||||
nfkc_map[{0x200F}] = {0x20}; // RIGHT-TO-LEFT MARK
|
||||
nfkc_map[{0x2028}] = {0x20}; // LINE SEPARATOR
|
||||
nfkc_map[{0x2029}] = {0x20}; // PARAGRAPH SEPARATOR
|
||||
// NOTE(review): U+2581 is presumably remapped because it serves as the
// library's whitespace marker -- confirm.
nfkc_map[{0x2581}] = {0x20}; // LOWER ONE EIGHTH BLOCK
|
||||
nfkc_map[{0xFEFF}] = {0x20}; // ZERO WIDTH NO-BREAK SPACE
|
||||
nfkc_map[{0xFFFD}] = {0x20}; // REPLACEMENT CHARACTER
|
||||
|
||||
// Do not normalize FULLWIDTH TILDE (U+FF5E), since FULLWIDTH TILDE
|
||||
// and the half-width tilde are used differently in Japanese.
|
||||
nfkc_map.erase({0xFF5E});
|
||||
|
||||
// Post-process the edited map with RemoveRedundantMap() (defined elsewhere).
RETURN_IF_ERROR(RemoveRedundantMap(&nfkc_map));
|
||||
|
||||
// Hand the finished map to the caller without copying it.
*chars_map = std::move(nfkc_map);
|
||||
|
||||
#else
|
||||
// NFKC compilation support was not built in: report it, but note that the
// function still falls through to the OK return below.
LOG(ERROR) << "NFKC compile is not enabled."
|
||||
<< " rebuild with ./configure --enable-nfkc-compile";
|
||||
#endif
|
||||
|
||||
return util::OkStatus();
|
||||
}
|
||||
|
||||
// static
|
||||
util::Status Builder::LoadCharsMap(StringPiece filename, CharsMap *chars_map) {
|
||||
LOG(INFO) << "Loading maping file: " << filename.data();
|
||||
|
@ -89,6 +89,10 @@ class Builder {
|
||||
// TODO(taku): Make NFC, NFD, and NFKD mapping if necessary.
|
||||
static util::Status BuildNFKCMap(CharsMap *chars_map);
|
||||
|
||||
// Makes an NFKC-based mapping with NMT-specific modifications around
|
||||
// whitespaces: a fixed set of code points treated as whitespace is mapped
// to U+0020 (SPACE), and the entry for U+FF5E (FULLWIDTH TILDE) is removed
// so that it is not normalized. The result is stored in `*chars_map`.
|
||||
static util::Status BuildNmtNFKCMap(CharsMap *chars_map);
|
||||
|
||||
// Builds the chars map saved in `filename`.
|
||||
// Format:
|
||||
// src_uchar1 src_uchar2 ... <tab> trg_uchar1 trg_uchar2...
|
||||
|
@ -60,7 +60,8 @@ TEST(BuilderTest, BuildNFKCMapTest) {
|
||||
|
||||
TEST(BuilderTest, GetPrecompiledCharsMapTest) {
|
||||
{
|
||||
const NormalizerSpec spec = SentencePieceTrainer::GetNormalizerSpec("nfkc");
|
||||
const NormalizerSpec spec =
|
||||
SentencePieceTrainer::GetNormalizerSpec("nmt_nfkc");
|
||||
const Normalizer normalizer(spec);
|
||||
EXPECT_EQ(WS "ABC", normalizer.Normalize("ABC"));
|
||||
EXPECT_EQ(WS "(株)", normalizer.Normalize("㈱"));
|
||||
|
@ -66,7 +66,8 @@ int main(int argc, char **argv) {
|
||||
|
||||
const std::vector<
|
||||
std::pair<std::string, std::function<Status(Builder::CharsMap *)>>>
|
||||
kRuleList = {{"nfkc", Builder::BuildNFKCMap}};
|
||||
kRuleList = {{"nfkc", Builder::BuildNFKCMap},
|
||||
{"nmt_nfkc", Builder::BuildNmtNFKCMap}};
|
||||
|
||||
constexpr char kHeader[] =
|
||||
R"(#ifndef NORMALIZATION_RULE_H_
|
||||
|
29753
src/normalization_rule.h
29753
src/normalization_rule.h
File diff suppressed because it is too large
Load Diff
@ -30,7 +30,7 @@ namespace {
|
||||
#define RC "\xEF\xBF\xBD"
|
||||
|
||||
NormalizerSpec MakeDefaultSpec() {
|
||||
return SentencePieceTrainer::GetNormalizerSpec("nfkc");
|
||||
return SentencePieceTrainer::GetNormalizerSpec("nmt_nfkc");
|
||||
}
|
||||
} // namespace
|
||||
|
||||
|
@ -104,7 +104,7 @@ std::vector<std::string> GetSpVec(const SentencePieceText &spt) {
|
||||
}
|
||||
|
||||
NormalizerSpec MakeDefaultNormalizerSpec() {
|
||||
return SentencePieceTrainer::GetNormalizerSpec("nfkc");
|
||||
return SentencePieceTrainer::GetNormalizerSpec("nmt_nfkc");
|
||||
}
|
||||
|
||||
TEST(SentencepieceProcessorTest, StatusTest) {
|
||||
|
@ -27,7 +27,7 @@
|
||||
|
||||
namespace sentencepiece {
|
||||
namespace {
|
||||
static constexpr char kDefaultNormalizerName[] = "nfkc";
|
||||
static constexpr char kDefaultNormalizerName[] = "nmt_nfkc";
|
||||
} // namespace
|
||||
|
||||
// static
|
||||
|
@ -64,7 +64,7 @@ DEFINE_bool(split_by_whitespace, kDefaultTrainerSpec.split_by_whitespace(),
|
||||
DEFINE_string(control_symbols, "", "comma separated list of control symbols");
|
||||
DEFINE_string(user_defined_symbols, "",
|
||||
"comma separated list of user defined symbols");
|
||||
DEFINE_string(normalization_rule_name, "nfkc",
|
||||
DEFINE_string(normalization_rule_name, "nmt_nfkc",
|
||||
"Normalization rule name. "
|
||||
"Choose from nfkc or identity");
|
||||
DEFINE_string(normalization_rule_tsv, "", "Normalization rule TSV file. ");
|
||||
|
Loading…
Reference in New Issue
Block a user