add optional NFKD support.

This commit is contained in:
Taku Kudo 2022-05-29 11:43:42 +09:00
parent 9f3ed99f5c
commit 9ca65fa9b6
4 changed files with 16949 additions and 11 deletions

16908
data/nfkd.tsv Normal file

File diff suppressed because it is too large Load Diff

View File

@ -12,11 +12,12 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "builder.h"
#include <algorithm>
#include <functional>
#include <utility>
#include "builder.h"
#include "filesystem.h"
#include "third_party/absl/strings/str_join.h"
#include "third_party/absl/strings/str_replace.h"
@ -47,6 +48,12 @@ constexpr int kMaxUnicode = 0x10FFFF;
static constexpr char kDefaultNormalizerName[] = "nfkc";
#ifndef ENABLE_NFKC_COMPILE
// Shared error message logged by the Build*Map functions in this file when
// the binary was built without ENABLE_NFKC_COMPILE. It is only defined in
// that configuration because only the `#else` branches reference it.
static constexpr char kCompileError[] =
"NFK compile is not enabled. rebuild with ./configure "
"--enable-nfkc-compile";
#endif
#ifdef ENABLE_NFKC_COMPILE
// Normalize `input` with ICU's normalizer with `mode`.
Builder::Chars UnicodeNormalize(UNormalizationMode mode,
@ -338,8 +345,7 @@ util::Status Builder::BuildNFKCMap(CharsMap *chars_map) {
*chars_map = std::move(nfkc_map);
#else
LOG(ERROR) << "NFKC compile is not enabled."
<< " rebuild with ./configure --enable-nfkc-compile";
LOG(ERROR) << kCompileError;
#endif
return util::OkStatus();
@ -412,8 +418,7 @@ util::Status Builder::BuildNmtNFKCMap(CharsMap *chars_map) {
*chars_map = std::move(nfkc_map);
#else
LOG(ERROR) << "NFKC compile is not enabled."
<< " rebuild with ./configure --enable-nfkc-compile";
LOG(ERROR) << kCompileError;
#endif
return util::OkStatus();
@ -452,8 +457,7 @@ util::Status Builder::BuildNFKC_CFMap(CharsMap *chars_map) {
RETURN_IF_ERROR(Builder::MergeUnicodeCaseFoldMap(&nfkc_map));
*chars_map = std::move(nfkc_map);
#else
LOG(ERROR) << "NFKC_CF compile is not enabled."
<< " rebuild with ./configure --enable-nfkc-compile";
LOG(ERROR) << kCompileError;
#endif
return util::OkStatus();
@ -467,13 +471,31 @@ util::Status Builder::BuildNmtNFKC_CFMap(CharsMap *chars_map) {
RETURN_IF_ERROR(Builder::MergeUnicodeCaseFoldMap(&nfkc_map));
*chars_map = std::move(nfkc_map);
#else
LOG(ERROR) << "NMT_NFKC_CF compile is not enabled."
<< " rebuild with ./configure --enable-nfkc-compile";
LOG(ERROR) << kCompileError;
#endif
return util::OkStatus();
}
// static
// Builds the NFKD normalization map into `chars_map`.
//
// When ENABLE_NFKC_COMPILE is defined, every code point in [1, 0x10FFFF] that
// U_IS_UNICODE_CHAR accepts is normalized with ToNFKD, and an entry
// {cp} -> NFKD(cp) is recorded whenever the normalized sequence differs from
// the single code point itself. `*chars_map` is replaced with the result
// (previous contents are discarded), matching the other Build*Map functions.
//
// When ENABLE_NFKC_COMPILE is not defined, `*chars_map` is left untouched and
// kCompileError is logged.
//
// Always returns util::OkStatus().
util::Status Builder::BuildNFKDMap(CharsMap *chars_map) {
#ifdef ENABLE_NFKC_COMPILE
  constexpr int kMaxUnicode = 0x10FFFF;

  // Accumulate into a local map so that stale entries in the caller's map
  // never leak into the result.
  CharsMap nfkd_map;
  for (char32 cp = 1; cp <= kMaxUnicode; ++cp) {
    // Skip code points that are not regular Unicode characters.
    if (!U_IS_UNICODE_CHAR(cp)) {
      continue;
    }
    auto nfkd = ToNFKD({cp});
    // Keep only real rewrites: multi-character expansions, or a single
    // character that differs from the source. Empty results and identity
    // mappings are dropped.
    if (nfkd.size() >= 2 || (nfkd.size() == 1 && nfkd[0] != cp)) {
      nfkd_map[{cp}] = std::move(nfkd);
    }
  }
  *chars_map = std::move(nfkd_map);
#else
  LOG(ERROR) << kCompileError;
#endif
  return util::OkStatus();
}
// static
util::Status Builder::LoadCharsMap(absl::string_view filename,
CharsMap *chars_map) {

View File

@ -104,6 +104,9 @@ class Builder {
// Makes NMT NFKC with Unicode case folding.
static util::Status BuildNmtNFKC_CFMap(CharsMap *chars_map);
// Makes the NFKD map: each Unicode character whose NFKD form differs from
// the character itself is mapped to that NFKD form.
static util::Status BuildNFKDMap(CharsMap *chars_map);
// Builds Chars map saved in `filename`.
// Format:
// src_uchar1 src_uchar2 ... <tab> trg_uchar1 trg_uchar2...

View File

@ -164,7 +164,8 @@ int main(int argc, char **argv) {
kRuleList = {{"nfkc", Builder::BuildNFKCMap},
{"nmt_nfkc", Builder::BuildNmtNFKCMap},
{"nfkc_cf", Builder::BuildNFKC_CFMap},
{"nmt_nfkc_cf", Builder::BuildNmtNFKC_CFMap}};
{"nmt_nfkc_cf", Builder::BuildNmtNFKC_CFMap},
{"nfkd", Builder::BuildNFKDMap}};
std::vector<std::pair<std::string, std::string>> data;
for (const auto &p : kRuleList) {
@ -174,10 +175,14 @@ int main(int argc, char **argv) {
// Write Header.
std::string index;
CHECK_OK(Builder::CompileCharsMap(normalized_map, &index));
data.emplace_back(p.first, index);
// Write TSV file.
CHECK_OK(Builder::SaveCharsMap(p.first + ".tsv", normalized_map));
// Do not make NFKD map as it is optionally created.
if (p.first.find("nfkd") != std::string::npos) continue;
data.emplace_back(p.first, index);
}
if (absl::GetFlag(FLAGS_output_precompiled_header)) {