Uses NMT_NFKC rule by default.

This commit is contained in:
Taku Kudo 2018-06-10 01:15:34 +09:00
parent a574ce183c
commit a65ca0d829
11 changed files with 245532 additions and 8969 deletions

View File

@ -155525,9 +155525,15 @@ FB9 F90 FB5 # ྐྵ => ྐྵ
33FD 33 30 65E5 # ㏽ => 30日
33FE 33 31 65E5 # ㏾ => 31日
33FF 67 61 6C # ㏿ => gal
A69C 44A # ꚜ => ъ
A69D 44C # ꚝ => ь
A770 A76F # ꝰ => ꝯ
A7F8 126 # ꟸ => Ħ
A7F9 153 # ꟹ => œ
AB5C A727 # ꭜ => ꜧ
AB5D AB37 # ꭝ => ꬷ
AB5E 26B # ꭞ => ɫ
AB5F AB52 # ꭟ =>
F900 8C48 # 豈 => 豈
F901 66F4 # 更 => 更
F902 8ECA # 車 => 車
@ -212802,6 +212808,13 @@ FFEE 25CB # ○ => ○
110A5 110BA 110AB # 𑂫 => 𑂫
11131 11127 1112E # 𑄮 => 𑄮
11132 11127 1112F # 𑄯 => 𑄯
11347 1133E 1134B # 𑍋 => 𑍋
11347 11357 1134C # 𑍌 => 𑍌
114B9 114B0 114BC # 𑒼 => 𑒼
114B9 114BA 114BB # 𑒻 => 𑒻
114B9 114BD 114BE # 𑒾 => 𑒾
115B8 115AF 115BA # 𑖺 => 𑖺
115B9 115AF 115BB # 𑖻 => 𑖻
1D15E 1D157 1D165 # 𝅗𝅥 => 𝅗𝅥
1D15F 1D158 1D165 # 𝅘𝅥 => 𝅘𝅥
1D160 1D158 1D165 1D16E # 𝅘𝅥𝅮 => 𝅘𝅥𝅮

Can't render this file because it is too large.

224679
data/nmt_nfkc.tsv Normal file

File diff suppressed because it is too large Load Diff

View File

@ -344,6 +344,44 @@ util::Status Builder::BuildNFKCMap(CharsMap *chars_map) {
return util::OkStatus();
}
// Fills `chars_map` with an NFKC-derived table adjusted for NMT use:
//   * starts from the table produced by BuildNFKCMap,
//   * maps a fixed set of extra code points to U+0020 (SPACE),
//   * drops the rule whose source is U+FF5E,
//   * then runs RemoveRedundantMap over the result.
// All of the above is compiled only when ENABLE_NFKC_COMPILE is defined;
// otherwise an error is logged and `chars_map` is not written to. The
// function's own final return value is util::OkStatus() in both cases.
util::Status Builder::BuildNmtNFKCMap(CharsMap *chars_map) {
#ifdef ENABLE_NFKC_COMPILE
  LOG(INFO) << "Running BuildNmtNFKCMap";

  CharsMap nmt_map;
  RETURN_IF_ERROR(Builder::BuildNFKCMap(&nmt_map));

  // Additional code points that are treated as whitespace.
  const CharsMap::key_type kSpaceLike[] = {
      {0x9},     // TAB
      {0xA},     // LINE FEED
      {0xC},     // FORM FEED
      {0xD},     // CARRIAGE RETURN
      {0x1680},  // OGHAM SPACE MARK
      {0x200B},  // ZERO WIDTH SPACE
      {0x200E},  // LEFT-TO-RIGHT MARK
      {0x200F},  // RIGHT-TO-LEFT MARK
      {0x2028},  // LINE SEPARATOR
      {0x2029},  // PARAGRAPH SEPARATOR
      {0x2581},  // LOWER ONE EIGHTH BLOCK
      {0xFEFF},  // ZERO WIDTH NO-BREAK SPACE
      {0xFFFD},  // REPLACEMENT CHARACTER
  };
  const CharsMap::mapped_type kSpace = {0x20};
  for (const auto &src : kSpaceLike) {
    nmt_map[src] = kSpace;
  }

  // Keep FULL_WIDTH TILDE as is: in Japanese it is used differently from
  // HALF_WIDTH TILDE, so the two must not be merged.
  nmt_map.erase({0xFF5E});

  RETURN_IF_ERROR(RemoveRedundantMap(&nmt_map));
  *chars_map = std::move(nmt_map);
#else
  LOG(ERROR) << "NFKC compile is not enabled."
             << " rebuild with ./configure --enable-nfkc-compile";
#endif
  return util::OkStatus();
}
// static
util::Status Builder::LoadCharsMap(StringPiece filename, CharsMap *chars_map) {
LOG(INFO) << "Loading maping file: " << filename.data();

View File

@ -89,6 +89,10 @@ class Builder {
// TODO(taku): Make NFC, NFD, and NFKD mapping if necessary.
static util::Status BuildNFKCMap(CharsMap *chars_map);
// Makes an NFKC-based mapping with NMT specific modifications around
// whitespaces.
// On top of the BuildNFKCMap rules, a set of extra code points (TAB, LINE
// FEED, CARRIAGE RETURN, ZERO WIDTH SPACE, U+2581, U+FEFF, U+FFFD, ...) is
// mapped to U+0020, and the rule for FULL_WIDTH TILDE (U+FF5E) is removed so
// that it is kept unnormalized.
// The mapping is only generated when built with ENABLE_NFKC_COMPILE;
// otherwise an error is logged and `chars_map` is left unmodified.
static util::Status BuildNmtNFKCMap(CharsMap *chars_map);
// Builds Chars map saved in `filename`.
// Format:
// src_uchar1 src_uchar2 ... <tab> trg_uchar1 trg_uchar2...

View File

@ -60,7 +60,8 @@ TEST(BuilderTest, BuildNFKCMapTest) {
TEST(BuilderTest, GetPrecompiledCharsMapTest) {
{
const NormalizerSpec spec = SentencePieceTrainer::GetNormalizerSpec("nfkc");
const NormalizerSpec spec =
SentencePieceTrainer::GetNormalizerSpec("nmt_nfkc");
const Normalizer normalizer(spec);
EXPECT_EQ(WS "ABC", normalizer.Normalize(""));
EXPECT_EQ(WS "(株)", normalizer.Normalize(""));

View File

@ -66,7 +66,8 @@ int main(int argc, char **argv) {
const std::vector<
std::pair<std::string, std::function<Status(Builder::CharsMap *)>>>
kRuleList = {{"nfkc", Builder::BuildNFKCMap}};
kRuleList = {{"nfkc", Builder::BuildNFKCMap},
{"nmt_nfkc", Builder::BuildNmtNFKCMap}};
constexpr char kHeader[] =
R"(#ifndef NORMALIZATION_RULE_H_

File diff suppressed because it is too large Load Diff

View File

@ -30,7 +30,7 @@ namespace {
#define RC "\xEF\xBF\xBD"
// Returns the normalizer spec registered under the "nmt_nfkc" rule name,
// which is the default rule. The former "nfkc" return preceded this one and
// made it unreachable, so it has been dropped.
NormalizerSpec MakeDefaultSpec() {
  return SentencePieceTrainer::GetNormalizerSpec("nmt_nfkc");
}
} // namespace

View File

@ -104,7 +104,7 @@ std::vector<std::string> GetSpVec(const SentencePieceText &spt) {
}
// Returns the normalizer spec registered under the "nmt_nfkc" rule name,
// which is the default rule. The former "nfkc" return preceded this one and
// made it unreachable, so it has been dropped.
NormalizerSpec MakeDefaultNormalizerSpec() {
  return SentencePieceTrainer::GetNormalizerSpec("nmt_nfkc");
}
TEST(SentencepieceProcessorTest, StatusTest) {

View File

@ -27,7 +27,7 @@
namespace sentencepiece {
namespace {
// Name of the default normalization rule. Defined exactly once: the earlier
// "nfkc" definition duplicated this symbol and has been removed.
static constexpr char kDefaultNormalizerName[] = "nmt_nfkc";
}  // namespace
// static

View File

@ -64,7 +64,7 @@ DEFINE_bool(split_by_whitespace, kDefaultTrainerSpec.split_by_whitespace(),
DEFINE_string(control_symbols, "", "comma separated list of control symbols");
DEFINE_string(user_defined_symbols, "",
"comma separated list of user defined symbols");
// Flag selecting the normalization rule; defaults to "nmt_nfkc". The help
// text lists the new default alongside the previously documented choices.
DEFINE_string(normalization_rule_name, "nmt_nfkc",
              "Normalization rule name. "
              "Choose from nmt_nfkc, nfkc, or identity");
DEFINE_string(normalization_rule_tsv, "", "Normalization rule TSV file. ");