mirror of
https://github.com/google/sentencepiece.git
synced 2025-01-06 09:19:12 +03:00
Added normalization with Unicode case folding
This commit is contained in:
parent
f4d0ddce6d
commit
573586854e
225823
data/nfkc_cf.tsv
Normal file
225823
data/nfkc_cf.tsv
Normal file
File diff suppressed because it is too large
Load Diff
225823
data/nmt_nfkc_cf.tsv
Normal file
225823
data/nmt_nfkc_cf.tsv
Normal file
File diff suppressed because it is too large
Load Diff
@ -383,6 +383,61 @@ util::Status Builder::BuildNmtNFKCMap(CharsMap *chars_map) {
|
||||
return util::OkStatus();
|
||||
}
|
||||
|
||||
// static
|
||||
util::Status Builder::MergeUnicodeCaseFoldMap(Builder::CharsMap *chars_map) {
|
||||
#ifdef ENABLE_NFKC_COMPILE
|
||||
for (auto &c : *chars_map) {
|
||||
std::vector<char32> trg;
|
||||
for (char32 c : c.second) trg.push_back(u_foldCase(c, U_FOLD_CASE_DEFAULT));
|
||||
c.second = trg;
|
||||
}
|
||||
|
||||
constexpr int kMaxUnicode = 0x10FFFF;
|
||||
for (char32 cp = 1; cp <= kMaxUnicode; ++cp) {
|
||||
if (!U_IS_UNICODE_CHAR(cp)) {
|
||||
continue;
|
||||
}
|
||||
if (chars_map->find({cp}) != chars_map->end()) continue;
|
||||
const char32 trg = u_foldCase(cp, U_FOLD_CASE_DEFAULT);
|
||||
if (trg != cp) (*chars_map)[{cp}] = {trg};
|
||||
}
|
||||
|
||||
RETURN_IF_ERROR(RemoveRedundantMap(chars_map));
|
||||
#endif
|
||||
|
||||
return util::OkStatus();
|
||||
}
|
||||
|
||||
// static
|
||||
util::Status Builder::BuildNFKC_CFMap(CharsMap *chars_map) {
|
||||
#ifdef ENABLE_NFKC_COMPILE
|
||||
CharsMap nfkc_map;
|
||||
RETURN_IF_ERROR(Builder::BuildNFKCMap(&nfkc_map));
|
||||
RETURN_IF_ERROR(Builder::MergeUnicodeCaseFoldMap(&nfkc_map));
|
||||
*chars_map = std::move(nfkc_map);
|
||||
#else
|
||||
LOG(ERROR) << "NFKC_CF compile is not enabled."
|
||||
<< " rebuild with ./configure --enable-nfkc-compile";
|
||||
#endif
|
||||
|
||||
return util::OkStatus();
|
||||
}
|
||||
|
||||
// static
|
||||
util::Status Builder::BuildNmtNFKC_CFMap(CharsMap *chars_map) {
|
||||
#ifdef ENABLE_NFKC_COMPILE
|
||||
CharsMap nfkc_map;
|
||||
RETURN_IF_ERROR(Builder::BuildNmtNFKCMap(&nfkc_map));
|
||||
RETURN_IF_ERROR(Builder::MergeUnicodeCaseFoldMap(&nfkc_map));
|
||||
*chars_map = std::move(nfkc_map);
|
||||
#else
|
||||
LOG(ERROR) << "NMT_NFKC_CF compile is not enabled."
|
||||
<< " rebuild with ./configure --enable-nfkc-compile";
|
||||
#endif
|
||||
|
||||
return util::OkStatus();
|
||||
}
|
||||
|
||||
// static
|
||||
util::Status Builder::LoadCharsMap(absl::string_view filename,
|
||||
CharsMap *chars_map) {
|
||||
|
@ -46,7 +46,8 @@ class Builder {
|
||||
std::string *output);
|
||||
|
||||
// Decompiles `blob` into `chars_map`.
|
||||
static util::Status DecompileCharsMap(absl::string_view blob, CharsMap *chars_map);
|
||||
static util::Status DecompileCharsMap(absl::string_view blob,
|
||||
CharsMap *chars_map);
|
||||
|
||||
// Returns a pre-compiled binary index with `name`.
|
||||
static util::Status GetPrecompiledCharsMap(const std::string &name,
|
||||
@ -93,11 +94,21 @@ class Builder {
|
||||
// whitespaces.
|
||||
static util::Status BuildNmtNFKCMap(CharsMap *chars_map);
|
||||
|
||||
// Merges the Unicode case folding mappings into `chars_map`.
|
||||
static util::Status MergeUnicodeCaseFoldMap(CharsMap *chars_map);
|
||||
|
||||
// Makes NFKC with Unicode case folding.
|
||||
static util::Status BuildNFKC_CFMap(CharsMap *chars_map);
|
||||
|
||||
// Makes NMT NFKC with Unicode case folding.
|
||||
static util::Status BuildNmtNFKC_CFMap(CharsMap *chars_map);
|
||||
|
||||
// Loads the chars map saved in `filename`.
|
||||
// Format:
|
||||
// src_uchar1 src_uchar2 ... <tab> trg_uchar1 trg_uchar2...
|
||||
// (src|trg)_ucharX must be a hex of Unicode code point.
|
||||
static util::Status LoadCharsMap(absl::string_view filename, CharsMap *chars_map);
|
||||
static util::Status LoadCharsMap(absl::string_view filename,
|
||||
CharsMap *chars_map);
|
||||
|
||||
// Saves Chars map to `filename` as TSV.
|
||||
static util::Status SaveCharsMap(absl::string_view filename,
|
||||
|
@ -68,6 +68,22 @@ TEST(BuilderTest, GetPrecompiledCharsMapTest) {
|
||||
EXPECT_EQ(WS "グーグル", normalizer.Normalize("グーグル"));
|
||||
}
|
||||
|
||||
{
|
||||
const NormalizerSpec spec =
|
||||
SentencePieceTrainer::GetNormalizerSpec("nfkc_cf");
|
||||
const Normalizer normalizer(spec);
|
||||
EXPECT_EQ(WS "abc", normalizer.Normalize("ABC"));
|
||||
EXPECT_EQ(WS "abc", normalizer.Normalize("ABC"));
|
||||
}
|
||||
|
||||
{
|
||||
const NormalizerSpec spec =
|
||||
SentencePieceTrainer::GetNormalizerSpec("nmt_nfkc_cf");
|
||||
const Normalizer normalizer(spec);
|
||||
EXPECT_EQ(WS "abc", normalizer.Normalize("ABC"));
|
||||
EXPECT_EQ(WS "abc", normalizer.Normalize("ABC"));
|
||||
}
|
||||
|
||||
{
|
||||
const NormalizerSpec spec =
|
||||
SentencePieceTrainer::GetNormalizerSpec("identity");
|
||||
|
@ -42,7 +42,7 @@ std::string ToHexData(absl::string_view data) {
|
||||
const size_t bucket_size =
|
||||
std::min<size_t>(end - begin, kNumOfBytesOnOneLine -
|
||||
output_count % kNumOfBytesOnOneLine);
|
||||
if (output_count % kNumOfBytesOnOneLine == 0) {
|
||||
if (output_count % kNumOfBytesOnOneLine == 0 && bucket_size > 0) {
|
||||
os << "\"";
|
||||
}
|
||||
for (size_t i = 0; i < bucket_size; ++i) {
|
||||
@ -50,7 +50,7 @@ std::string ToHexData(absl::string_view data) {
|
||||
++begin;
|
||||
}
|
||||
output_count += bucket_size;
|
||||
if (output_count % kNumOfBytesOnOneLine == 0) {
|
||||
if (output_count % kNumOfBytesOnOneLine == 0 && bucket_size > 0) {
|
||||
os << "\"\n";
|
||||
}
|
||||
}
|
||||
@ -67,7 +67,9 @@ int main(int argc, char **argv) {
|
||||
const std::vector<
|
||||
std::pair<std::string, std::function<Status(Builder::CharsMap *)>>>
|
||||
kRuleList = {{"nfkc", Builder::BuildNFKCMap},
|
||||
{"nmt_nfkc", Builder::BuildNmtNFKCMap}};
|
||||
{"nmt_nfkc", Builder::BuildNmtNFKCMap},
|
||||
{"nfkc_cf", Builder::BuildNmtNFKC_CFMap},
|
||||
{"nmt_nfkc_cf", Builder::BuildNmtNFKC_CFMap}};
|
||||
|
||||
constexpr char kHeader[] =
|
||||
R"(#ifndef NORMALIZATION_RULE_H_
|
||||
|
24278
src/normalization_rule.h
24278
src/normalization_rule.h
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user