Added normalization with Unicode case folding

This commit is contained in:
Taku Kudo 2018-06-29 15:17:18 +09:00
parent f4d0ddce6d
commit 573586854e
7 changed files with 476007 additions and 11 deletions

225823
data/nfkc_cf.tsv Normal file

File diff suppressed because it is too large Load Diff

225823
data/nmt_nfkc_cf.tsv Normal file

File diff suppressed because it is too large Load Diff

View File

@ -383,6 +383,61 @@ util::Status Builder::BuildNmtNFKCMap(CharsMap *chars_map) {
return util::OkStatus();
}
// static
// Layers Unicode case folding (ICU `u_foldCase` with `U_FOLD_CASE_DEFAULT`)
// on top of the rules already stored in `chars_map`:
//   1. every code point on the target side of an existing rule is replaced
//      by its folded form;
//   2. every Unicode character in [U+0001, U+10FFFF] that is not already a
//      single-code-point source in the map, and whose folded form differs
//      from itself, gets a new rule `{cp} -> {fold(cp)}`.
// RemoveRedundantMap() is then applied to the merged map.
//
// Returns the status produced through RETURN_IF_ERROR, or OK otherwise.
// When the binary is built without ENABLE_NFKC_COMPILE, `chars_map` is left
// untouched, an error is logged (mirroring the other Build*Map functions in
// this file) and OK is returned.
util::Status Builder::MergeUnicodeCaseFoldMap(Builder::CharsMap *chars_map) {
#ifdef ENABLE_NFKC_COMPILE
  // Step 1: fold the targets in place. The key is never modified, so the
  // map ordering is unaffected.
  for (auto &rule : *chars_map) {
    for (char32 &uc : rule.second) {
      uc = u_foldCase(uc, U_FOLD_CASE_DEFAULT);
    }
  }

  // Step 2: add folding rules for characters the map does not cover yet.
  // Existing single-code-point sources are skipped because their targets
  // were already folded in step 1.
  constexpr char32 kMaxUnicode = 0x10FFFF;
  for (char32 cp = 1; cp <= kMaxUnicode; ++cp) {
    if (!U_IS_UNICODE_CHAR(cp)) {
      continue;
    }
    if (chars_map->find({cp}) != chars_map->end()) {
      continue;
    }
    const char32 folded = u_foldCase(cp, U_FOLD_CASE_DEFAULT);
    if (folded != cp) {
      (*chars_map)[{cp}] = {folded};
    }
  }

  RETURN_IF_ERROR(RemoveRedundantMap(chars_map));
#else
  LOG(ERROR) << "Unicode case folding is not enabled."
             << " rebuild with ./configure --enable-nfkc-compile";
#endif
  return util::OkStatus();
}
// static
// Produces the "NFKC + Unicode case folding" rule set.
//
// The rules are assembled in a local map: BuildNFKCMap() fills it, then
// MergeUnicodeCaseFoldMap() adds the case-folding layer. `*chars_map` is
// assigned only after both calls, so it is not written to before that point.
//
// Without ENABLE_NFKC_COMPILE nothing is built: an error is logged,
// `*chars_map` is not touched, and OK is still returned.
util::Status Builder::BuildNFKC_CFMap(CharsMap *chars_map) {
#ifdef ENABLE_NFKC_COMPILE
  CharsMap folded_rules;
  RETURN_IF_ERROR(BuildNFKCMap(&folded_rules));
  RETURN_IF_ERROR(MergeUnicodeCaseFoldMap(&folded_rules));
  *chars_map = std::move(folded_rules);
#else
  LOG(ERROR) << "NFKC_CF compile is not enabled."
             << " rebuild with ./configure --enable-nfkc-compile";
#endif
  return util::OkStatus();
}
// static
// Produces the "NMT NFKC + Unicode case folding" rule set.
//
// The rules are assembled in a local map: BuildNmtNFKCMap() fills it, then
// MergeUnicodeCaseFoldMap() adds the case-folding layer. `*chars_map` is
// assigned only after both calls, so it is not written to before that point.
//
// Without ENABLE_NFKC_COMPILE nothing is built: an error is logged,
// `*chars_map` is not touched, and OK is still returned.
util::Status Builder::BuildNmtNFKC_CFMap(CharsMap *chars_map) {
#ifdef ENABLE_NFKC_COMPILE
  CharsMap folded_rules;
  RETURN_IF_ERROR(BuildNmtNFKCMap(&folded_rules));
  RETURN_IF_ERROR(MergeUnicodeCaseFoldMap(&folded_rules));
  *chars_map = std::move(folded_rules);
#else
  LOG(ERROR) << "NMT_NFKC_CF compile is not enabled."
             << " rebuild with ./configure --enable-nfkc-compile";
#endif
  return util::OkStatus();
}
// static
util::Status Builder::LoadCharsMap(absl::string_view filename,
CharsMap *chars_map) {

View File

@ -46,7 +46,8 @@ class Builder {
std::string *output);
// Decompiles `blob` into `chars_map`.
static util::Status DecompileCharsMap(absl::string_view blob, CharsMap *chars_map);
static util::Status DecompileCharsMap(absl::string_view blob,
CharsMap *chars_map);
// Returns a pre-compiled binary index with `name`.
static util::Status GetPrecompiledCharsMap(const std::string &name,
@ -93,11 +94,21 @@ class Builder {
// whitespaces.
static util::Status BuildNmtNFKCMap(CharsMap *chars_map);
// Merges Unicode case folding into `chars_map` in place: the targets of
// existing rules are case-folded, and a `{cp} -> {folded cp}` rule is added
// for every Unicode character that is not yet a source and whose folded
// form differs from itself. Redundant rules are removed afterwards.
// Only has an effect when built with ENABLE_NFKC_COMPILE.
static util::Status MergeUnicodeCaseFoldMap(CharsMap *chars_map);
// Makes the NFKC map combined with Unicode case folding (BuildNFKCMap
// followed by MergeUnicodeCaseFoldMap) and stores it in `chars_map`.
static util::Status BuildNFKC_CFMap(CharsMap *chars_map);
// Makes the NMT NFKC map combined with Unicode case folding
// (BuildNmtNFKCMap followed by MergeUnicodeCaseFoldMap) and stores it in
// `chars_map`.
static util::Status BuildNmtNFKC_CFMap(CharsMap *chars_map);
// Builds a chars map from the rules saved in `filename`.
// Format:
// src_uchar1 src_uchar2 ... <tab> trg_uchar1 trg_uchar2...
// Each (src|trg)_ucharX must be a Unicode code point written in hexadecimal.
static util::Status LoadCharsMap(absl::string_view filename, CharsMap *chars_map);
static util::Status LoadCharsMap(absl::string_view filename,
CharsMap *chars_map);
// Saves Chars map to `filename` as TSV.
static util::Status SaveCharsMap(absl::string_view filename,

View File

@ -68,6 +68,22 @@ TEST(BuilderTest, GetPrecompiledCharsMapTest) {
EXPECT_EQ(WS "グーグル", normalizer.Normalize("グーグル"));
}
{
  // "nfkc_cf": NFKC followed by Unicode case folding.
  const NormalizerSpec spec =
      SentencePieceTrainer::GetNormalizerSpec("nfkc_cf");
  const Normalizer normalizer(spec);
  // Full-width Latin capitals: NFKC maps them to ASCII "ABC" and case
  // folding lowers them to "abc". The input literal here was empty, which
  // cannot normalize to "abc".
  // NOTE(review): the full-width input is a reconstruction of the lost
  // literal -- confirm against the upstream test.
  EXPECT_EQ(WS "abc", normalizer.Normalize("ＡＢＣ"));
  // Plain ASCII capitals are only affected by the case-folding step.
  EXPECT_EQ(WS "abc", normalizer.Normalize("ABC"));
}
{
  // "nmt_nfkc_cf": NMT NFKC followed by Unicode case folding.
  const NormalizerSpec spec =
      SentencePieceTrainer::GetNormalizerSpec("nmt_nfkc_cf");
  const Normalizer normalizer(spec);
  // Full-width Latin capitals: NFKC maps them to ASCII "ABC" and case
  // folding lowers them to "abc". The input literal here was empty, which
  // cannot normalize to "abc".
  // NOTE(review): the full-width input is a reconstruction of the lost
  // literal -- confirm against the upstream test.
  EXPECT_EQ(WS "abc", normalizer.Normalize("ＡＢＣ"));
  // Plain ASCII capitals are only affected by the case-folding step.
  EXPECT_EQ(WS "abc", normalizer.Normalize("ABC"));
}
{
const NormalizerSpec spec =
SentencePieceTrainer::GetNormalizerSpec("identity");

View File

@ -42,7 +42,7 @@ std::string ToHexData(absl::string_view data) {
const size_t bucket_size =
std::min<size_t>(end - begin, kNumOfBytesOnOneLine -
output_count % kNumOfBytesOnOneLine);
if (output_count % kNumOfBytesOnOneLine == 0) {
if (output_count % kNumOfBytesOnOneLine == 0 && bucket_size > 0) {
os << "\"";
}
for (size_t i = 0; i < bucket_size; ++i) {
@ -50,7 +50,7 @@ std::string ToHexData(absl::string_view data) {
++begin;
}
output_count += bucket_size;
if (output_count % kNumOfBytesOnOneLine == 0) {
if (output_count % kNumOfBytesOnOneLine == 0 && bucket_size > 0) {
os << "\"\n";
}
}
@ -67,7 +67,9 @@ int main(int argc, char **argv) {
const std::vector<
std::pair<std::string, std::function<Status(Builder::CharsMap *)>>>
kRuleList = {{"nfkc", Builder::BuildNFKCMap},
{"nmt_nfkc", Builder::BuildNmtNFKCMap}};
{"nmt_nfkc", Builder::BuildNmtNFKCMap},
// "nfkc_cf" must be generated from the plain NFKC map; binding it to the
// NMT builder would make it a duplicate of "nmt_nfkc_cf".
{"nfkc_cf", Builder::BuildNFKC_CFMap},
{"nmt_nfkc_cf", Builder::BuildNmtNFKC_CFMap}};
constexpr char kHeader[] =
R"(#ifndef NORMALIZATION_RULE_H_

File diff suppressed because it is too large Load Diff