mirror of
https://github.com/google/sentencepiece.git
synced 2024-09-11 10:55:42 +03:00
add optional NFKD support.
This commit is contained in:
parent
9f3ed99f5c
commit
9ca65fa9b6
16908
data/nfkd.tsv
Normal file
16908
data/nfkd.tsv
Normal file
File diff suppressed because it is too large
Load Diff
@ -12,11 +12,12 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "builder.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <functional>
|
||||
#include <utility>
|
||||
|
||||
#include "builder.h"
|
||||
#include "filesystem.h"
|
||||
#include "third_party/absl/strings/str_join.h"
|
||||
#include "third_party/absl/strings/str_replace.h"
|
||||
@ -47,6 +48,12 @@ constexpr int kMaxUnicode = 0x10FFFF;
|
||||
|
||||
static constexpr char kDefaultNormalizerName[] = "nfkc";
|
||||
|
||||
#ifndef ENABLE_NFKC_COMPILE
// Message logged by the Build*Map functions in builds where
// ENABLE_NFKC_COMPILE is not defined.
static constexpr char kCompileError[] =
    "NFK compile is not enabled. "
    "rebuild with ./configure --enable-nfkc-compile";
#endif
|
||||
|
||||
#ifdef ENABLE_NFKC_COMPILE
|
||||
// Normalize `input` with ICU's normalizer with `mode`.
|
||||
Builder::Chars UnicodeNormalize(UNormalizationMode mode,
|
||||
@ -338,8 +345,7 @@ util::Status Builder::BuildNFKCMap(CharsMap *chars_map) {
|
||||
*chars_map = std::move(nfkc_map);
|
||||
|
||||
#else
|
||||
LOG(ERROR) << "NFKC compile is not enabled."
|
||||
<< " rebuild with ./configure --enable-nfkc-compile";
|
||||
LOG(ERROR) << kCompileError;
|
||||
#endif
|
||||
|
||||
return util::OkStatus();
|
||||
@ -412,8 +418,7 @@ util::Status Builder::BuildNmtNFKCMap(CharsMap *chars_map) {
|
||||
*chars_map = std::move(nfkc_map);
|
||||
|
||||
#else
|
||||
LOG(ERROR) << "NFKC compile is not enabled."
|
||||
<< " rebuild with ./configure --enable-nfkc-compile";
|
||||
LOG(ERROR) << kCompileError;
|
||||
#endif
|
||||
|
||||
return util::OkStatus();
|
||||
@ -452,8 +457,7 @@ util::Status Builder::BuildNFKC_CFMap(CharsMap *chars_map) {
|
||||
RETURN_IF_ERROR(Builder::MergeUnicodeCaseFoldMap(&nfkc_map));
|
||||
*chars_map = std::move(nfkc_map);
|
||||
#else
|
||||
LOG(ERROR) << "NFKC_CF compile is not enabled."
|
||||
<< " rebuild with ./configure --enable-nfkc-compile";
|
||||
LOG(ERROR) << kCompileError;
|
||||
#endif
|
||||
|
||||
return util::OkStatus();
|
||||
@ -467,13 +471,31 @@ util::Status Builder::BuildNmtNFKC_CFMap(CharsMap *chars_map) {
|
||||
RETURN_IF_ERROR(Builder::MergeUnicodeCaseFoldMap(&nfkc_map));
|
||||
*chars_map = std::move(nfkc_map);
|
||||
#else
|
||||
LOG(ERROR) << "NMT_NFKC_CF compile is not enabled."
|
||||
<< " rebuild with ./configure --enable-nfkc-compile";
|
||||
LOG(ERROR) << kCompileError;
|
||||
#endif
|
||||
|
||||
return util::OkStatus();
|
||||
}
|
||||
|
||||
// static
|
||||
util::Status Builder::BuildNFKDMap(CharsMap *chars_map) {
|
||||
#ifdef ENABLE_NFKC_COMPILE
|
||||
constexpr int kMaxUnicode = 0x10FFFF;
|
||||
for (char32 cp = 1; cp <= kMaxUnicode; ++cp) {
|
||||
if (!U_IS_UNICODE_CHAR(cp)) {
|
||||
continue;
|
||||
}
|
||||
const auto nfkd = ToNFKD({cp});
|
||||
if (nfkd.size() >= 2 || (nfkd.size() == 1 && nfkd[0] != cp)) {
|
||||
(*chars_map)[{cp}] = nfkd;
|
||||
}
|
||||
}
|
||||
#else
|
||||
LOG(ERROR) << kCompileError;
|
||||
#endif
|
||||
return util::OkStatus();
|
||||
}
|
||||
|
||||
// static
|
||||
util::Status Builder::LoadCharsMap(absl::string_view filename,
|
||||
CharsMap *chars_map) {
|
||||
|
@ -104,6 +104,9 @@ class Builder {
|
||||
// Makes NMT NFKC with Unicode case folding.
|
||||
static util::Status BuildNmtNFKC_CFMap(CharsMap *chars_map);
|
||||
|
||||
// Makes NFKD map. Built directly from Unicode code points; takes no NFKC input.
|
||||
static util::Status BuildNFKDMap(CharsMap *chars_map);
|
||||
|
||||
// Builds the chars map saved in `filename`.
|
||||
// Format:
|
||||
// src_uchar1 src_uchar2 ... <tab> trg_uchar1 trg_uchar2...
|
||||
|
@ -164,7 +164,8 @@ int main(int argc, char **argv) {
|
||||
kRuleList = {{"nfkc", Builder::BuildNFKCMap},
|
||||
{"nmt_nfkc", Builder::BuildNmtNFKCMap},
|
||||
{"nfkc_cf", Builder::BuildNFKC_CFMap},
|
||||
{"nmt_nfkc_cf", Builder::BuildNmtNFKC_CFMap}};
|
||||
{"nmt_nfkc_cf", Builder::BuildNmtNFKC_CFMap},
|
||||
{"nfkd", Builder::BuildNFKDMap}};
|
||||
|
||||
std::vector<std::pair<std::string, std::string>> data;
|
||||
for (const auto &p : kRuleList) {
|
||||
@ -174,10 +175,14 @@ int main(int argc, char **argv) {
|
||||
// Write Header.
|
||||
std::string index;
|
||||
CHECK_OK(Builder::CompileCharsMap(normalized_map, &index));
|
||||
data.emplace_back(p.first, index);
|
||||
|
||||
// Write TSV file.
|
||||
CHECK_OK(Builder::SaveCharsMap(p.first + ".tsv", normalized_map));
|
||||
|
||||
// Do not add the NFKD map to `data`: it is optional, so only its TSV is written.
|
||||
if (p.first.find("nfkd") != std::string::npos) continue;
|
||||
|
||||
data.emplace_back(p.first, index);
|
||||
}
|
||||
|
||||
if (absl::GetFlag(FLAGS_output_precompiled_header)) {
|
||||
|
Loading…
Reference in New Issue
Block a user