mirror of
https://github.com/google/sentencepiece.git
synced 2024-09-11 10:55:42 +03:00
add optional NFKD support.
This commit is contained in:
parent
9f3ed99f5c
commit
9ca65fa9b6
16908
data/nfkd.tsv
Normal file
16908
data/nfkd.tsv
Normal file
File diff suppressed because it is too large
Load Diff
@ -12,11 +12,12 @@
|
|||||||
// See the License for the specific language governing permissions and
|
// See the License for the specific language governing permissions and
|
||||||
// limitations under the License.
|
// limitations under the License.
|
||||||
|
|
||||||
|
#include "builder.h"
|
||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <functional>
|
#include <functional>
|
||||||
#include <utility>
|
#include <utility>
|
||||||
|
|
||||||
#include "builder.h"
|
|
||||||
#include "filesystem.h"
|
#include "filesystem.h"
|
||||||
#include "third_party/absl/strings/str_join.h"
|
#include "third_party/absl/strings/str_join.h"
|
||||||
#include "third_party/absl/strings/str_replace.h"
|
#include "third_party/absl/strings/str_replace.h"
|
||||||
@ -47,6 +48,12 @@ constexpr int kMaxUnicode = 0x10FFFF;
|
|||||||
|
|
||||||
static constexpr char kDefaultNormalizerName[] = "nfkc";
|
static constexpr char kDefaultNormalizerName[] = "nfkc";
|
||||||
|
|
||||||
|
#ifndef ENABLE_NFKC_COMPILE
|
||||||
|
static constexpr char kCompileError[] =
|
||||||
|
"NFK compile is not enabled. rebuild with ./configure "
|
||||||
|
"--enable-nfkc-compile";
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef ENABLE_NFKC_COMPILE
|
#ifdef ENABLE_NFKC_COMPILE
|
||||||
// Normalize `input` with ICU's normalizer with `mode`.
|
// Normalize `input` with ICU's normalizer with `mode`.
|
||||||
Builder::Chars UnicodeNormalize(UNormalizationMode mode,
|
Builder::Chars UnicodeNormalize(UNormalizationMode mode,
|
||||||
@ -338,8 +345,7 @@ util::Status Builder::BuildNFKCMap(CharsMap *chars_map) {
|
|||||||
*chars_map = std::move(nfkc_map);
|
*chars_map = std::move(nfkc_map);
|
||||||
|
|
||||||
#else
|
#else
|
||||||
LOG(ERROR) << "NFKC compile is not enabled."
|
LOG(ERROR) << kCompileError;
|
||||||
<< " rebuild with ./configure --enable-nfkc-compile";
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
return util::OkStatus();
|
return util::OkStatus();
|
||||||
@ -412,8 +418,7 @@ util::Status Builder::BuildNmtNFKCMap(CharsMap *chars_map) {
|
|||||||
*chars_map = std::move(nfkc_map);
|
*chars_map = std::move(nfkc_map);
|
||||||
|
|
||||||
#else
|
#else
|
||||||
LOG(ERROR) << "NFKC compile is not enabled."
|
LOG(ERROR) << kCompileError;
|
||||||
<< " rebuild with ./configure --enable-nfkc-compile";
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
return util::OkStatus();
|
return util::OkStatus();
|
||||||
@ -452,8 +457,7 @@ util::Status Builder::BuildNFKC_CFMap(CharsMap *chars_map) {
|
|||||||
RETURN_IF_ERROR(Builder::MergeUnicodeCaseFoldMap(&nfkc_map));
|
RETURN_IF_ERROR(Builder::MergeUnicodeCaseFoldMap(&nfkc_map));
|
||||||
*chars_map = std::move(nfkc_map);
|
*chars_map = std::move(nfkc_map);
|
||||||
#else
|
#else
|
||||||
LOG(ERROR) << "NFKC_CF compile is not enabled."
|
LOG(ERROR) << kCompileError;
|
||||||
<< " rebuild with ./configure --enable-nfkc-compile";
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
return util::OkStatus();
|
return util::OkStatus();
|
||||||
@ -467,13 +471,31 @@ util::Status Builder::BuildNmtNFKC_CFMap(CharsMap *chars_map) {
|
|||||||
RETURN_IF_ERROR(Builder::MergeUnicodeCaseFoldMap(&nfkc_map));
|
RETURN_IF_ERROR(Builder::MergeUnicodeCaseFoldMap(&nfkc_map));
|
||||||
*chars_map = std::move(nfkc_map);
|
*chars_map = std::move(nfkc_map);
|
||||||
#else
|
#else
|
||||||
LOG(ERROR) << "NMT_NFKC_CF compile is not enabled."
|
LOG(ERROR) << kCompileError;
|
||||||
<< " rebuild with ./configure --enable-nfkc-compile";
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
return util::OkStatus();
|
return util::OkStatus();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// static
|
||||||
|
util::Status Builder::BuildNFKDMap(CharsMap *chars_map) {
|
||||||
|
#ifdef ENABLE_NFKC_COMPILE
|
||||||
|
constexpr int kMaxUnicode = 0x10FFFF;
|
||||||
|
for (char32 cp = 1; cp <= kMaxUnicode; ++cp) {
|
||||||
|
if (!U_IS_UNICODE_CHAR(cp)) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
const auto nfkd = ToNFKD({cp});
|
||||||
|
if (nfkd.size() >= 2 || (nfkd.size() == 1 && nfkd[0] != cp)) {
|
||||||
|
(*chars_map)[{cp}] = nfkd;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
LOG(ERROR) << kCompileError;
|
||||||
|
#endif
|
||||||
|
return util::OkStatus();
|
||||||
|
}
|
||||||
|
|
||||||
// static
|
// static
|
||||||
util::Status Builder::LoadCharsMap(absl::string_view filename,
|
util::Status Builder::LoadCharsMap(absl::string_view filename,
|
||||||
CharsMap *chars_map) {
|
CharsMap *chars_map) {
|
||||||
|
@ -104,6 +104,9 @@ class Builder {
|
|||||||
// Makes NMT NFKC with Unicode case folding.
|
// Makes NMT NFKC with Unicode case folding.
|
||||||
static util::Status BuildNmtNFKC_CFMap(CharsMap *chars_map);
|
static util::Status BuildNmtNFKC_CFMap(CharsMap *chars_map);
|
||||||
|
|
||||||
|
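// Makes NFKD map: every Unicode character whose NFKD form differs from the
// character itself is mapped to that form. Existing NFKC maps are not read.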
|
||||||
|
static util::Status BuildNFKDMap(CharsMap *chars_map);
|
||||||
|
|
||||||
// Builds Chars map saved in `filename`.
|
// Builds Chars map saved in `filename`.
|
||||||
// Format:
|
// Format:
|
||||||
// src_uchar1 src_uchar2 ... <tab> trg_uchar1 trg_uchar2...
|
// src_uchar1 src_uchar2 ... <tab> trg_uchar1 trg_uchar2...
|
||||||
|
@ -164,7 +164,8 @@ int main(int argc, char **argv) {
|
|||||||
kRuleList = {{"nfkc", Builder::BuildNFKCMap},
|
kRuleList = {{"nfkc", Builder::BuildNFKCMap},
|
||||||
{"nmt_nfkc", Builder::BuildNmtNFKCMap},
|
{"nmt_nfkc", Builder::BuildNmtNFKCMap},
|
||||||
{"nfkc_cf", Builder::BuildNFKC_CFMap},
|
{"nfkc_cf", Builder::BuildNFKC_CFMap},
|
||||||
{"nmt_nfkc_cf", Builder::BuildNmtNFKC_CFMap}};
|
{"nmt_nfkc_cf", Builder::BuildNmtNFKC_CFMap},
|
||||||
|
{"nfkd", Builder::BuildNFKDMap}};
|
||||||
|
|
||||||
std::vector<std::pair<std::string, std::string>> data;
|
std::vector<std::pair<std::string, std::string>> data;
|
||||||
for (const auto &p : kRuleList) {
|
for (const auto &p : kRuleList) {
|
||||||
@ -174,10 +175,14 @@ int main(int argc, char **argv) {
|
|||||||
// Write Header.
|
// Write Header.
|
||||||
std::string index;
|
std::string index;
|
||||||
CHECK_OK(Builder::CompileCharsMap(normalized_map, &index));
|
CHECK_OK(Builder::CompileCharsMap(normalized_map, &index));
|
||||||
data.emplace_back(p.first, index);
|
|
||||||
|
|
||||||
// Write TSV file.
|
// Write TSV file.
|
||||||
CHECK_OK(Builder::SaveCharsMap(p.first + ".tsv", normalized_map));
|
CHECK_OK(Builder::SaveCharsMap(p.first + ".tsv", normalized_map));
|
||||||
|
|
||||||
|
// Do not add the NFKD map to `data`: NFKD support is optional, so only its
// TSV file (written above) is produced.
|
||||||
|
if (p.first.find("nfkd") != std::string::npos) continue;
|
||||||
|
|
||||||
|
data.emplace_back(p.first, index);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (absl::GetFlag(FLAGS_output_precompiled_header)) {
|
if (absl::GetFlag(FLAGS_output_precompiled_header)) {
|
||||||
|
Loading…
Reference in New Issue
Block a user