Allows loading precomputed seed sentencepieces for unigram from a file.

This commit is contained in:
Taku Kudo 2024-01-28 16:17:08 +00:00
parent 0fe7add363
commit 53de76561c
7 changed files with 544 additions and 351 deletions

View File

@ -285,74 +285,74 @@ class TrainerSpec::_Internal {
(*has_bits)[0] |= 1u;
}
static void set_has_model_type(HasBits* has_bits) {
(*has_bits)[0] |= 8388608u;
}
static void set_has_vocab_size(HasBits* has_bits) {
(*has_bits)[0] |= 16777216u;
}
static void set_has_self_test_sample_size(HasBits* has_bits) {
(*has_bits)[0] |= 512u;
}
static void set_has_enable_differential_privacy(HasBits* has_bits) {
(*has_bits)[0] |= 8192u;
}
static void set_has_differential_privacy_noise_level(HasBits* has_bits) {
(*has_bits)[0] |= 2097152u;
}
static void set_has_differential_privacy_clipping_threshold(HasBits* has_bits) {
(*has_bits)[0] |= 4194304u;
}
static void set_has_character_coverage(HasBits* has_bits) {
static void set_has_vocab_size(HasBits* has_bits) {
(*has_bits)[0] |= 33554432u;
}
static void set_has_input_sentence_size(HasBits* has_bits) {
(*has_bits)[0] |= 2048u;
}
static void set_has_shuffle_input_sentence(HasBits* has_bits) {
(*has_bits)[1] |= 1u;
}
static void set_has_mining_sentence_size(HasBits* has_bits) {
static void set_has_self_test_sample_size(HasBits* has_bits) {
(*has_bits)[0] |= 1024u;
}
static void set_has_training_sentence_size(HasBits* has_bits) {
(*has_bits)[0] |= 4096u;
}
static void set_has_seed_sentencepiece_size(HasBits* has_bits) {
(*has_bits)[0] |= 67108864u;
}
static void set_has_shrinking_factor(HasBits* has_bits) {
(*has_bits)[0] |= 134217728u;
}
static void set_has_max_sentence_length(HasBits* has_bits) {
(*has_bits)[0] |= 1073741824u;
}
static void set_has_num_threads(HasBits* has_bits) {
(*has_bits)[0] |= 268435456u;
}
static void set_has_num_sub_iterations(HasBits* has_bits) {
(*has_bits)[0] |= 536870912u;
}
static void set_has_max_sentencepiece_length(HasBits* has_bits) {
(*has_bits)[0] |= 2147483648u;
}
static void set_has_split_by_unicode_script(HasBits* has_bits) {
(*has_bits)[1] |= 2u;
}
static void set_has_split_by_number(HasBits* has_bits) {
(*has_bits)[1] |= 4u;
}
static void set_has_split_by_whitespace(HasBits* has_bits) {
(*has_bits)[1] |= 8u;
}
static void set_has_treat_whitespace_as_suffix(HasBits* has_bits) {
static void set_has_enable_differential_privacy(HasBits* has_bits) {
(*has_bits)[0] |= 16384u;
}
static void set_has_allow_whitespace_only_pieces(HasBits* has_bits) {
static void set_has_differential_privacy_noise_level(HasBits* has_bits) {
(*has_bits)[0] |= 4194304u;
}
static void set_has_differential_privacy_clipping_threshold(HasBits* has_bits) {
(*has_bits)[0] |= 8388608u;
}
static void set_has_character_coverage(HasBits* has_bits) {
(*has_bits)[0] |= 67108864u;
}
static void set_has_input_sentence_size(HasBits* has_bits) {
(*has_bits)[0] |= 4096u;
}
static void set_has_shuffle_input_sentence(HasBits* has_bits) {
(*has_bits)[1] |= 2u;
}
static void set_has_mining_sentence_size(HasBits* has_bits) {
(*has_bits)[0] |= 2048u;
}
static void set_has_training_sentence_size(HasBits* has_bits) {
(*has_bits)[0] |= 8192u;
}
static void set_has_seed_sentencepiece_size(HasBits* has_bits) {
(*has_bits)[0] |= 134217728u;
}
static void set_has_shrinking_factor(HasBits* has_bits) {
(*has_bits)[0] |= 268435456u;
}
static void set_has_max_sentence_length(HasBits* has_bits) {
(*has_bits)[0] |= 2147483648u;
}
static void set_has_num_threads(HasBits* has_bits) {
(*has_bits)[0] |= 536870912u;
}
static void set_has_num_sub_iterations(HasBits* has_bits) {
(*has_bits)[0] |= 1073741824u;
}
static void set_has_max_sentencepiece_length(HasBits* has_bits) {
(*has_bits)[1] |= 1u;
}
static void set_has_split_by_unicode_script(HasBits* has_bits) {
(*has_bits)[1] |= 4u;
}
static void set_has_split_by_number(HasBits* has_bits) {
(*has_bits)[1] |= 8u;
}
static void set_has_split_by_whitespace(HasBits* has_bits) {
(*has_bits)[1] |= 16u;
}
static void set_has_treat_whitespace_as_suffix(HasBits* has_bits) {
(*has_bits)[0] |= 32768u;
}
static void set_has_split_digits(HasBits* has_bits) {
static void set_has_allow_whitespace_only_pieces(HasBits* has_bits) {
(*has_bits)[0] |= 65536u;
}
static void set_has_split_digits(HasBits* has_bits) {
(*has_bits)[0] |= 131072u;
}
static void set_has_pretokenization_delimiter(HasBits* has_bits) {
(*has_bits)[0] |= 256u;
}
@ -360,29 +360,29 @@ class TrainerSpec::_Internal {
(*has_bits)[0] |= 4u;
}
static void set_has_byte_fallback(HasBits* has_bits) {
(*has_bits)[0] |= 131072u;
}
static void set_has_vocabulary_output_piece_score(HasBits* has_bits) {
(*has_bits)[1] |= 16u;
}
static void set_has_hard_vocab_limit(HasBits* has_bits) {
(*has_bits)[1] |= 32u;
}
static void set_has_use_all_vocab(HasBits* has_bits) {
(*has_bits)[0] |= 262144u;
}
static void set_has_unk_id(HasBits* has_bits) {
(*has_bits)[0] |= 1048576u;
static void set_has_vocabulary_output_piece_score(HasBits* has_bits) {
(*has_bits)[1] |= 32u;
}
static void set_has_bos_id(HasBits* has_bits) {
static void set_has_hard_vocab_limit(HasBits* has_bits) {
(*has_bits)[1] |= 64u;
}
static void set_has_eos_id(HasBits* has_bits) {
static void set_has_use_all_vocab(HasBits* has_bits) {
(*has_bits)[0] |= 524288u;
}
static void set_has_unk_id(HasBits* has_bits) {
(*has_bits)[0] |= 2097152u;
}
static void set_has_bos_id(HasBits* has_bits) {
(*has_bits)[1] |= 128u;
}
static void set_has_pad_id(HasBits* has_bits) {
static void set_has_eos_id(HasBits* has_bits) {
(*has_bits)[1] |= 256u;
}
static void set_has_pad_id(HasBits* has_bits) {
(*has_bits)[1] |= 512u;
}
static void set_has_unk_piece(HasBits* has_bits) {
(*has_bits)[0] |= 16u;
}
@ -399,7 +399,10 @@ class TrainerSpec::_Internal {
(*has_bits)[0] |= 8u;
}
static void set_has_train_extremely_large_corpus(HasBits* has_bits) {
(*has_bits)[0] |= 524288u;
(*has_bits)[0] |= 1048576u;
}
// Marks the optional field seed_sentencepieces_file as present by OR-ing
// bit 512 (0x00000200) into word 0 of the has-bits array.
// Generated code: the bit position is assigned by protoc and must not be
// edited by hand.
static void set_has_seed_sentencepieces_file(HasBits* has_bits) {
(*has_bits)[0] |= 512u;
}
};
@ -473,6 +476,11 @@ TrainerSpec::TrainerSpec(const TrainerSpec& from)
pretokenization_delimiter_.Set(::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr::EmptyDefault{}, from._internal_pretokenization_delimiter(),
GetArena());
}
seed_sentencepieces_file_.UnsafeSetDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited());
if (from._internal_has_seed_sentencepieces_file()) {
seed_sentencepieces_file_.Set(::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr::EmptyDefault{}, from._internal_seed_sentencepieces_file(),
GetArena());
}
::memcpy(&self_test_sample_size_, &from.self_test_sample_size_,
static_cast<size_t>(reinterpret_cast<char*>(&pad_id_) -
reinterpret_cast<char*>(&self_test_sample_size_)) + sizeof(pad_id_));
@ -490,6 +498,7 @@ void TrainerSpec::SharedCtor() {
eos_piece_.UnsafeSetDefault(nullptr);
pad_piece_.UnsafeSetDefault(nullptr);
pretokenization_delimiter_.UnsafeSetDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited());
seed_sentencepieces_file_.UnsafeSetDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited());
::memset(reinterpret_cast<char*>(this) + static_cast<size_t>(
reinterpret_cast<char*>(&self_test_sample_size_) - reinterpret_cast<char*>(this)),
0, static_cast<size_t>(reinterpret_cast<char*>(&differential_privacy_clipping_threshold_) -
@ -531,6 +540,7 @@ void TrainerSpec::SharedDtor() {
eos_piece_.DestroyNoArena(nullptr);
pad_piece_.DestroyNoArena(nullptr);
pretokenization_delimiter_.DestroyNoArena(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited());
seed_sentencepieces_file_.DestroyNoArena(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited());
}
void TrainerSpec::ArenaDtor(void* object) {
@ -586,21 +596,26 @@ void TrainerSpec::Clear() {
pad_piece_.ClearToDefault(::sentencepiece::TrainerSpec::_i_give_permission_to_break_this_code_default_pad_piece_, GetArena());
}
}
if (cached_has_bits & 0x00000100u) {
pretokenization_delimiter_.ClearNonDefaultToEmpty();
if (cached_has_bits & 0x00000300u) {
if (cached_has_bits & 0x00000100u) {
pretokenization_delimiter_.ClearNonDefaultToEmpty();
}
if (cached_has_bits & 0x00000200u) {
seed_sentencepieces_file_.ClearNonDefaultToEmpty();
}
}
if (cached_has_bits & 0x0000fe00u) {
if (cached_has_bits & 0x0000fc00u) {
::memset(&self_test_sample_size_, 0, static_cast<size_t>(
reinterpret_cast<char*>(&allow_whitespace_only_pieces_) -
reinterpret_cast<char*>(&self_test_sample_size_)) + sizeof(allow_whitespace_only_pieces_));
reinterpret_cast<char*>(&treat_whitespace_as_suffix_) -
reinterpret_cast<char*>(&self_test_sample_size_)) + sizeof(treat_whitespace_as_suffix_));
}
if (cached_has_bits & 0x00ff0000u) {
::memset(&split_digits_, 0, static_cast<size_t>(
::memset(&allow_whitespace_only_pieces_, 0, static_cast<size_t>(
reinterpret_cast<char*>(&differential_privacy_clipping_threshold_) -
reinterpret_cast<char*>(&split_digits_)) + sizeof(differential_privacy_clipping_threshold_));
model_type_ = 1;
reinterpret_cast<char*>(&allow_whitespace_only_pieces_)) + sizeof(differential_privacy_clipping_threshold_));
}
if (cached_has_bits & 0xff000000u) {
model_type_ = 1;
vocab_size_ = 8000;
character_coverage_ = 0.9995f;
seed_sentencepiece_size_ = 1000000;
@ -608,10 +623,10 @@ void TrainerSpec::Clear() {
num_threads_ = 16;
num_sub_iterations_ = 2;
max_sentence_length_ = 4192;
max_sentencepiece_length_ = 16;
}
cached_has_bits = _has_bits_[1];
if (cached_has_bits & 0x000000ffu) {
max_sentencepiece_length_ = 16;
shuffle_input_sentence_ = true;
split_by_unicode_script_ = true;
split_by_number_ = true;
@ -619,9 +634,11 @@ void TrainerSpec::Clear() {
vocabulary_output_piece_score_ = true;
hard_vocab_limit_ = true;
bos_id_ = 1;
eos_id_ = 2;
}
pad_id_ = -1;
if (cached_has_bits & 0x00000300u) {
eos_id_ = 2;
pad_id_ = -1;
}
_has_bits_.Clear();
_internal_metadata_.Clear<std::string>();
}
@ -1017,6 +1034,14 @@ const char* TrainerSpec::_InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID
CHK_(ptr);
} else goto handle_unusual;
continue;
// optional string seed_sentencepieces_file = 54 [default = ""];
case 54:
if (PROTOBUF_PREDICT_TRUE(static_cast<::PROTOBUF_NAMESPACE_ID::uint8>(tag) == 178)) {
auto str = _internal_mutable_seed_sentencepieces_file();
ptr = ::PROTOBUF_NAMESPACE_ID::internal::InlineGreedyStringParser(str, ptr, ctx);
CHK_(ptr);
} else goto handle_unusual;
continue;
default: {
handle_unusual:
if ((tag & 7) == 4 || tag == 0) {
@ -1065,14 +1090,14 @@ failure:
}
// optional .sentencepiece.TrainerSpec.ModelType model_type = 3 [default = UNIGRAM];
if (cached_has_bits & 0x00800000u) {
if (cached_has_bits & 0x01000000u) {
target = stream->EnsureSpace(target);
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteEnumToArray(
3, this->_internal_model_type(), target);
}
// optional int32 vocab_size = 4 [default = 8000];
if (cached_has_bits & 0x01000000u) {
if (cached_has_bits & 0x02000000u) {
target = stream->EnsureSpace(target);
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteInt32ToArray(4, this->_internal_vocab_size(), target);
}
@ -1084,7 +1109,7 @@ failure:
}
// optional int32 self_test_sample_size = 6 [default = 0];
if (cached_has_bits & 0x00000200u) {
if (cached_has_bits & 0x00000400u) {
target = stream->EnsureSpace(target);
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteInt32ToArray(6, this->_internal_self_test_sample_size(), target);
}
@ -1096,107 +1121,105 @@ failure:
}
// optional float character_coverage = 10 [default = 0.9995];
if (cached_has_bits & 0x02000000u) {
if (cached_has_bits & 0x04000000u) {
target = stream->EnsureSpace(target);
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteFloatToArray(10, this->_internal_character_coverage(), target);
}
// optional uint64 input_sentence_size = 11 [default = 0];
if (cached_has_bits & 0x00000800u) {
if (cached_has_bits & 0x00001000u) {
target = stream->EnsureSpace(target);
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteUInt64ToArray(11, this->_internal_input_sentence_size(), target);
}
// optional int32 mining_sentence_size = 12 [deprecated = true];
if (cached_has_bits & 0x00000400u) {
if (cached_has_bits & 0x00000800u) {
target = stream->EnsureSpace(target);
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteInt32ToArray(12, this->_internal_mining_sentence_size(), target);
}
// optional int32 training_sentence_size = 13 [deprecated = true];
if (cached_has_bits & 0x00001000u) {
if (cached_has_bits & 0x00002000u) {
target = stream->EnsureSpace(target);
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteInt32ToArray(13, this->_internal_training_sentence_size(), target);
}
// optional int32 seed_sentencepiece_size = 14 [default = 1000000];
if (cached_has_bits & 0x04000000u) {
if (cached_has_bits & 0x08000000u) {
target = stream->EnsureSpace(target);
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteInt32ToArray(14, this->_internal_seed_sentencepiece_size(), target);
}
// optional float shrinking_factor = 15 [default = 0.75];
if (cached_has_bits & 0x08000000u) {
if (cached_has_bits & 0x10000000u) {
target = stream->EnsureSpace(target);
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteFloatToArray(15, this->_internal_shrinking_factor(), target);
}
// optional int32 num_threads = 16 [default = 16];
if (cached_has_bits & 0x10000000u) {
if (cached_has_bits & 0x20000000u) {
target = stream->EnsureSpace(target);
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteInt32ToArray(16, this->_internal_num_threads(), target);
}
// optional int32 num_sub_iterations = 17 [default = 2];
if (cached_has_bits & 0x20000000u) {
if (cached_has_bits & 0x40000000u) {
target = stream->EnsureSpace(target);
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteInt32ToArray(17, this->_internal_num_sub_iterations(), target);
}
// optional int32 max_sentence_length = 18 [default = 4192];
if (cached_has_bits & 0x40000000u) {
if (cached_has_bits & 0x80000000u) {
target = stream->EnsureSpace(target);
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteInt32ToArray(18, this->_internal_max_sentence_length(), target);
}
cached_has_bits = _has_bits_[1];
// optional bool shuffle_input_sentence = 19 [default = true];
if (cached_has_bits & 0x00000001u) {
if (cached_has_bits & 0x00000002u) {
target = stream->EnsureSpace(target);
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteBoolToArray(19, this->_internal_shuffle_input_sentence(), target);
}
cached_has_bits = _has_bits_[0];
// optional int32 max_sentencepiece_length = 20 [default = 16];
if (cached_has_bits & 0x80000000u) {
if (cached_has_bits & 0x00000001u) {
target = stream->EnsureSpace(target);
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteInt32ToArray(20, this->_internal_max_sentencepiece_length(), target);
}
cached_has_bits = _has_bits_[1];
// optional bool split_by_unicode_script = 21 [default = true];
if (cached_has_bits & 0x00000002u) {
if (cached_has_bits & 0x00000004u) {
target = stream->EnsureSpace(target);
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteBoolToArray(21, this->_internal_split_by_unicode_script(), target);
}
// optional bool split_by_whitespace = 22 [default = true];
if (cached_has_bits & 0x00000008u) {
if (cached_has_bits & 0x00000010u) {
target = stream->EnsureSpace(target);
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteBoolToArray(22, this->_internal_split_by_whitespace(), target);
}
// optional bool split_by_number = 23 [default = true];
if (cached_has_bits & 0x00000004u) {
if (cached_has_bits & 0x00000008u) {
target = stream->EnsureSpace(target);
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteBoolToArray(23, this->_internal_split_by_number(), target);
}
cached_has_bits = _has_bits_[0];
// optional bool treat_whitespace_as_suffix = 24 [default = false];
if (cached_has_bits & 0x00004000u) {
if (cached_has_bits & 0x00008000u) {
target = stream->EnsureSpace(target);
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteBoolToArray(24, this->_internal_treat_whitespace_as_suffix(), target);
}
// optional bool split_digits = 25 [default = false];
if (cached_has_bits & 0x00010000u) {
if (cached_has_bits & 0x00020000u) {
target = stream->EnsureSpace(target);
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteBoolToArray(25, this->_internal_split_digits(), target);
}
// optional bool allow_whitespace_only_pieces = 26 [default = false];
if (cached_has_bits & 0x00008000u) {
if (cached_has_bits & 0x00010000u) {
target = stream->EnsureSpace(target);
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteBoolToArray(26, this->_internal_allow_whitespace_only_pieces(), target);
}
@ -1215,26 +1238,26 @@ failure:
cached_has_bits = _has_bits_[1];
// optional bool vocabulary_output_piece_score = 32 [default = true];
if (cached_has_bits & 0x00000010u) {
if (cached_has_bits & 0x00000020u) {
target = stream->EnsureSpace(target);
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteBoolToArray(32, this->_internal_vocabulary_output_piece_score(), target);
}
// optional bool hard_vocab_limit = 33 [default = true];
if (cached_has_bits & 0x00000020u) {
if (cached_has_bits & 0x00000040u) {
target = stream->EnsureSpace(target);
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteBoolToArray(33, this->_internal_hard_vocab_limit(), target);
}
cached_has_bits = _has_bits_[0];
// optional bool use_all_vocab = 34 [default = false];
if (cached_has_bits & 0x00040000u) {
if (cached_has_bits & 0x00080000u) {
target = stream->EnsureSpace(target);
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteBoolToArray(34, this->_internal_use_all_vocab(), target);
}
// optional bool byte_fallback = 35 [default = false];
if (cached_has_bits & 0x00020000u) {
if (cached_has_bits & 0x00040000u) {
target = stream->EnsureSpace(target);
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteBoolToArray(35, this->_internal_byte_fallback(), target);
}
@ -1246,26 +1269,26 @@ failure:
}
// optional int32 unk_id = 40 [default = 0];
if (cached_has_bits & 0x00100000u) {
if (cached_has_bits & 0x00200000u) {
target = stream->EnsureSpace(target);
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteInt32ToArray(40, this->_internal_unk_id(), target);
}
cached_has_bits = _has_bits_[1];
// optional int32 bos_id = 41 [default = 1];
if (cached_has_bits & 0x00000040u) {
if (cached_has_bits & 0x00000080u) {
target = stream->EnsureSpace(target);
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteInt32ToArray(41, this->_internal_bos_id(), target);
}
// optional int32 eos_id = 42 [default = 2];
if (cached_has_bits & 0x00000080u) {
if (cached_has_bits & 0x00000100u) {
target = stream->EnsureSpace(target);
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteInt32ToArray(42, this->_internal_eos_id(), target);
}
// optional int32 pad_id = 43 [default = -1];
if (cached_has_bits & 0x00000100u) {
if (cached_has_bits & 0x00000200u) {
target = stream->EnsureSpace(target);
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteInt32ToArray(43, this->_internal_pad_id(), target);
}
@ -1302,25 +1325,25 @@ failure:
}
// optional bool train_extremely_large_corpus = 49 [default = false];
if (cached_has_bits & 0x00080000u) {
if (cached_has_bits & 0x00100000u) {
target = stream->EnsureSpace(target);
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteBoolToArray(49, this->_internal_train_extremely_large_corpus(), target);
}
// optional bool enable_differential_privacy = 50 [default = false];
if (cached_has_bits & 0x00002000u) {
if (cached_has_bits & 0x00004000u) {
target = stream->EnsureSpace(target);
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteBoolToArray(50, this->_internal_enable_differential_privacy(), target);
}
// optional float differential_privacy_noise_level = 51 [default = 0];
if (cached_has_bits & 0x00200000u) {
if (cached_has_bits & 0x00400000u) {
target = stream->EnsureSpace(target);
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteFloatToArray(51, this->_internal_differential_privacy_noise_level(), target);
}
// optional uint64 differential_privacy_clipping_threshold = 52 [default = 0];
if (cached_has_bits & 0x00400000u) {
if (cached_has_bits & 0x00800000u) {
target = stream->EnsureSpace(target);
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteUInt64ToArray(52, this->_internal_differential_privacy_clipping_threshold(), target);
}
@ -1331,6 +1354,12 @@ failure:
53, this->_internal_pretokenization_delimiter(), target);
}
// optional string seed_sentencepieces_file = 54 [default = ""];
if (cached_has_bits & 0x00000200u) {
target = stream->WriteStringMaybeAliased(
54, this->_internal_seed_sentencepieces_file(), target);
}
// Extension range [200, 536870912)
target = _extensions_._InternalSerialize(
200, 536870912, target, stream);
@ -1452,205 +1481,214 @@ size_t TrainerSpec::ByteSizeLong() const {
this->_internal_pretokenization_delimiter());
}
// optional int32 self_test_sample_size = 6 [default = 0];
// optional string seed_sentencepieces_file = 54 [default = ""];
if (cached_has_bits & 0x00000200u) {
total_size += 2 +
::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::StringSize(
this->_internal_seed_sentencepieces_file());
}
// optional int32 self_test_sample_size = 6 [default = 0];
if (cached_has_bits & 0x00000400u) {
total_size += 1 +
::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::Int32Size(
this->_internal_self_test_sample_size());
}
// optional int32 mining_sentence_size = 12 [deprecated = true];
if (cached_has_bits & 0x00000400u) {
if (cached_has_bits & 0x00000800u) {
total_size += 1 +
::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::Int32Size(
this->_internal_mining_sentence_size());
}
// optional uint64 input_sentence_size = 11 [default = 0];
if (cached_has_bits & 0x00000800u) {
if (cached_has_bits & 0x00001000u) {
total_size += 1 +
::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::UInt64Size(
this->_internal_input_sentence_size());
}
// optional int32 training_sentence_size = 13 [deprecated = true];
if (cached_has_bits & 0x00001000u) {
if (cached_has_bits & 0x00002000u) {
total_size += 1 +
::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::Int32Size(
this->_internal_training_sentence_size());
}
// optional bool enable_differential_privacy = 50 [default = false];
if (cached_has_bits & 0x00002000u) {
total_size += 2 + 1;
}
// optional bool treat_whitespace_as_suffix = 24 [default = false];
if (cached_has_bits & 0x00004000u) {
total_size += 2 + 1;
}
// optional bool allow_whitespace_only_pieces = 26 [default = false];
// optional bool treat_whitespace_as_suffix = 24 [default = false];
if (cached_has_bits & 0x00008000u) {
total_size += 2 + 1;
}
}
if (cached_has_bits & 0x00ff0000u) {
// optional bool split_digits = 25 [default = false];
// optional bool allow_whitespace_only_pieces = 26 [default = false];
if (cached_has_bits & 0x00010000u) {
total_size += 2 + 1;
}
// optional bool byte_fallback = 35 [default = false];
// optional bool split_digits = 25 [default = false];
if (cached_has_bits & 0x00020000u) {
total_size += 2 + 1;
}
// optional bool use_all_vocab = 34 [default = false];
// optional bool byte_fallback = 35 [default = false];
if (cached_has_bits & 0x00040000u) {
total_size += 2 + 1;
}
// optional bool train_extremely_large_corpus = 49 [default = false];
// optional bool use_all_vocab = 34 [default = false];
if (cached_has_bits & 0x00080000u) {
total_size += 2 + 1;
}
// optional int32 unk_id = 40 [default = 0];
// optional bool train_extremely_large_corpus = 49 [default = false];
if (cached_has_bits & 0x00100000u) {
total_size += 2 + 1;
}
// optional int32 unk_id = 40 [default = 0];
if (cached_has_bits & 0x00200000u) {
total_size += 2 +
::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::Int32Size(
this->_internal_unk_id());
}
// optional float differential_privacy_noise_level = 51 [default = 0];
if (cached_has_bits & 0x00200000u) {
if (cached_has_bits & 0x00400000u) {
total_size += 2 + 4;
}
// optional uint64 differential_privacy_clipping_threshold = 52 [default = 0];
if (cached_has_bits & 0x00400000u) {
if (cached_has_bits & 0x00800000u) {
total_size += 2 +
::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::UInt64Size(
this->_internal_differential_privacy_clipping_threshold());
}
}
if (cached_has_bits & 0xff000000u) {
// optional .sentencepiece.TrainerSpec.ModelType model_type = 3 [default = UNIGRAM];
if (cached_has_bits & 0x00800000u) {
if (cached_has_bits & 0x01000000u) {
total_size += 1 +
::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::EnumSize(this->_internal_model_type());
}
}
if (cached_has_bits & 0xff000000u) {
// optional int32 vocab_size = 4 [default = 8000];
if (cached_has_bits & 0x01000000u) {
if (cached_has_bits & 0x02000000u) {
total_size += 1 +
::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::Int32Size(
this->_internal_vocab_size());
}
// optional float character_coverage = 10 [default = 0.9995];
if (cached_has_bits & 0x02000000u) {
if (cached_has_bits & 0x04000000u) {
total_size += 1 + 4;
}
// optional int32 seed_sentencepiece_size = 14 [default = 1000000];
if (cached_has_bits & 0x04000000u) {
if (cached_has_bits & 0x08000000u) {
total_size += 1 +
::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::Int32Size(
this->_internal_seed_sentencepiece_size());
}
// optional float shrinking_factor = 15 [default = 0.75];
if (cached_has_bits & 0x08000000u) {
if (cached_has_bits & 0x10000000u) {
total_size += 1 + 4;
}
// optional int32 num_threads = 16 [default = 16];
if (cached_has_bits & 0x10000000u) {
if (cached_has_bits & 0x20000000u) {
total_size += 2 +
::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::Int32Size(
this->_internal_num_threads());
}
// optional int32 num_sub_iterations = 17 [default = 2];
if (cached_has_bits & 0x20000000u) {
if (cached_has_bits & 0x40000000u) {
total_size += 2 +
::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::Int32Size(
this->_internal_num_sub_iterations());
}
// optional int32 max_sentence_length = 18 [default = 4192];
if (cached_has_bits & 0x40000000u) {
if (cached_has_bits & 0x80000000u) {
total_size += 2 +
::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::Int32Size(
this->_internal_max_sentence_length());
}
}
cached_has_bits = _has_bits_[1];
if (cached_has_bits & 0x000000ffu) {
// optional int32 max_sentencepiece_length = 20 [default = 16];
if (cached_has_bits & 0x80000000u) {
if (cached_has_bits & 0x00000001u) {
total_size += 2 +
::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::Int32Size(
this->_internal_max_sentencepiece_length());
}
}
cached_has_bits = _has_bits_[1];
if (cached_has_bits & 0x000000ffu) {
// optional bool shuffle_input_sentence = 19 [default = true];
if (cached_has_bits & 0x00000001u) {
total_size += 2 + 1;
}
// optional bool split_by_unicode_script = 21 [default = true];
if (cached_has_bits & 0x00000002u) {
total_size += 2 + 1;
}
// optional bool split_by_number = 23 [default = true];
// optional bool split_by_unicode_script = 21 [default = true];
if (cached_has_bits & 0x00000004u) {
total_size += 2 + 1;
}
// optional bool split_by_whitespace = 22 [default = true];
// optional bool split_by_number = 23 [default = true];
if (cached_has_bits & 0x00000008u) {
total_size += 2 + 1;
}
// optional bool vocabulary_output_piece_score = 32 [default = true];
// optional bool split_by_whitespace = 22 [default = true];
if (cached_has_bits & 0x00000010u) {
total_size += 2 + 1;
}
// optional bool hard_vocab_limit = 33 [default = true];
// optional bool vocabulary_output_piece_score = 32 [default = true];
if (cached_has_bits & 0x00000020u) {
total_size += 2 + 1;
}
// optional int32 bos_id = 41 [default = 1];
// optional bool hard_vocab_limit = 33 [default = true];
if (cached_has_bits & 0x00000040u) {
total_size += 2 + 1;
}
// optional int32 bos_id = 41 [default = 1];
if (cached_has_bits & 0x00000080u) {
total_size += 2 +
::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::Int32Size(
this->_internal_bos_id());
}
}
if (cached_has_bits & 0x00000300u) {
// optional int32 eos_id = 42 [default = 2];
if (cached_has_bits & 0x00000080u) {
if (cached_has_bits & 0x00000100u) {
total_size += 2 +
::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::Int32Size(
this->_internal_eos_id());
}
}
// optional int32 pad_id = 43 [default = -1];
if (cached_has_bits & 0x00000100u) {
total_size += 2 +
::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::Int32Size(
this->_internal_pad_id());
}
// optional int32 pad_id = 43 [default = -1];
if (cached_has_bits & 0x00000200u) {
total_size += 2 +
::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::Int32Size(
this->_internal_pad_id());
}
}
if (PROTOBUF_PREDICT_FALSE(_internal_metadata_.have_unknown_fields())) {
total_size += _internal_metadata_.unknown_fields<std::string>(::PROTOBUF_NAMESPACE_ID::internal::GetEmptyString).size();
}
@ -1709,112 +1747,118 @@ void TrainerSpec::MergeFrom(const TrainerSpec& from) {
_internal_set_pretokenization_delimiter(from._internal_pretokenization_delimiter());
}
if (cached_has_bits & 0x00000200u) {
self_test_sample_size_ = from.self_test_sample_size_;
_internal_set_seed_sentencepieces_file(from._internal_seed_sentencepieces_file());
}
if (cached_has_bits & 0x00000400u) {
mining_sentence_size_ = from.mining_sentence_size_;
self_test_sample_size_ = from.self_test_sample_size_;
}
if (cached_has_bits & 0x00000800u) {
input_sentence_size_ = from.input_sentence_size_;
mining_sentence_size_ = from.mining_sentence_size_;
}
if (cached_has_bits & 0x00001000u) {
training_sentence_size_ = from.training_sentence_size_;
input_sentence_size_ = from.input_sentence_size_;
}
if (cached_has_bits & 0x00002000u) {
enable_differential_privacy_ = from.enable_differential_privacy_;
training_sentence_size_ = from.training_sentence_size_;
}
if (cached_has_bits & 0x00004000u) {
treat_whitespace_as_suffix_ = from.treat_whitespace_as_suffix_;
enable_differential_privacy_ = from.enable_differential_privacy_;
}
if (cached_has_bits & 0x00008000u) {
allow_whitespace_only_pieces_ = from.allow_whitespace_only_pieces_;
treat_whitespace_as_suffix_ = from.treat_whitespace_as_suffix_;
}
_has_bits_[0] |= cached_has_bits;
}
if (cached_has_bits & 0x00ff0000u) {
if (cached_has_bits & 0x00010000u) {
split_digits_ = from.split_digits_;
allow_whitespace_only_pieces_ = from.allow_whitespace_only_pieces_;
}
if (cached_has_bits & 0x00020000u) {
byte_fallback_ = from.byte_fallback_;
split_digits_ = from.split_digits_;
}
if (cached_has_bits & 0x00040000u) {
use_all_vocab_ = from.use_all_vocab_;
byte_fallback_ = from.byte_fallback_;
}
if (cached_has_bits & 0x00080000u) {
train_extremely_large_corpus_ = from.train_extremely_large_corpus_;
use_all_vocab_ = from.use_all_vocab_;
}
if (cached_has_bits & 0x00100000u) {
unk_id_ = from.unk_id_;
train_extremely_large_corpus_ = from.train_extremely_large_corpus_;
}
if (cached_has_bits & 0x00200000u) {
differential_privacy_noise_level_ = from.differential_privacy_noise_level_;
unk_id_ = from.unk_id_;
}
if (cached_has_bits & 0x00400000u) {
differential_privacy_clipping_threshold_ = from.differential_privacy_clipping_threshold_;
differential_privacy_noise_level_ = from.differential_privacy_noise_level_;
}
if (cached_has_bits & 0x00800000u) {
model_type_ = from.model_type_;
differential_privacy_clipping_threshold_ = from.differential_privacy_clipping_threshold_;
}
_has_bits_[0] |= cached_has_bits;
}
if (cached_has_bits & 0xff000000u) {
if (cached_has_bits & 0x01000000u) {
vocab_size_ = from.vocab_size_;
model_type_ = from.model_type_;
}
if (cached_has_bits & 0x02000000u) {
character_coverage_ = from.character_coverage_;
vocab_size_ = from.vocab_size_;
}
if (cached_has_bits & 0x04000000u) {
seed_sentencepiece_size_ = from.seed_sentencepiece_size_;
character_coverage_ = from.character_coverage_;
}
if (cached_has_bits & 0x08000000u) {
shrinking_factor_ = from.shrinking_factor_;
seed_sentencepiece_size_ = from.seed_sentencepiece_size_;
}
if (cached_has_bits & 0x10000000u) {
num_threads_ = from.num_threads_;
shrinking_factor_ = from.shrinking_factor_;
}
if (cached_has_bits & 0x20000000u) {
num_sub_iterations_ = from.num_sub_iterations_;
num_threads_ = from.num_threads_;
}
if (cached_has_bits & 0x40000000u) {
max_sentence_length_ = from.max_sentence_length_;
num_sub_iterations_ = from.num_sub_iterations_;
}
if (cached_has_bits & 0x80000000u) {
max_sentencepiece_length_ = from.max_sentencepiece_length_;
max_sentence_length_ = from.max_sentence_length_;
}
_has_bits_[0] |= cached_has_bits;
}
cached_has_bits = from._has_bits_[1];
if (cached_has_bits & 0x000000ffu) {
if (cached_has_bits & 0x00000001u) {
shuffle_input_sentence_ = from.shuffle_input_sentence_;
max_sentencepiece_length_ = from.max_sentencepiece_length_;
}
if (cached_has_bits & 0x00000002u) {
split_by_unicode_script_ = from.split_by_unicode_script_;
shuffle_input_sentence_ = from.shuffle_input_sentence_;
}
if (cached_has_bits & 0x00000004u) {
split_by_number_ = from.split_by_number_;
split_by_unicode_script_ = from.split_by_unicode_script_;
}
if (cached_has_bits & 0x00000008u) {
split_by_whitespace_ = from.split_by_whitespace_;
split_by_number_ = from.split_by_number_;
}
if (cached_has_bits & 0x00000010u) {
vocabulary_output_piece_score_ = from.vocabulary_output_piece_score_;
split_by_whitespace_ = from.split_by_whitespace_;
}
if (cached_has_bits & 0x00000020u) {
hard_vocab_limit_ = from.hard_vocab_limit_;
vocabulary_output_piece_score_ = from.vocabulary_output_piece_score_;
}
if (cached_has_bits & 0x00000040u) {
bos_id_ = from.bos_id_;
hard_vocab_limit_ = from.hard_vocab_limit_;
}
if (cached_has_bits & 0x00000080u) {
eos_id_ = from.eos_id_;
bos_id_ = from.bos_id_;
}
_has_bits_[1] |= cached_has_bits;
}
if (cached_has_bits & 0x00000100u) {
_internal_set_pad_id(from._internal_pad_id());
if (cached_has_bits & 0x00000300u) {
if (cached_has_bits & 0x00000100u) {
eos_id_ = from.eos_id_;
}
if (cached_has_bits & 0x00000200u) {
pad_id_ = from.pad_id_;
}
_has_bits_[1] |= cached_has_bits;
}
}
@ -1852,6 +1896,7 @@ void TrainerSpec::InternalSwap(TrainerSpec* other) {
eos_piece_.Swap(&other->eos_piece_, nullptr, GetArena());
pad_piece_.Swap(&other->pad_piece_, nullptr, GetArena());
pretokenization_delimiter_.Swap(&other->pretokenization_delimiter_, &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
seed_sentencepieces_file_.Swap(&other->seed_sentencepieces_file_, &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
::PROTOBUF_NAMESPACE_ID::internal::memswap<
PROTOBUF_FIELD_OFFSET(TrainerSpec, differential_privacy_clipping_threshold_)
+ sizeof(TrainerSpec::differential_privacy_clipping_threshold_)

View File

@ -274,6 +274,7 @@ class TrainerSpec PROTOBUF_FINAL :
kEosPieceFieldNumber = 47,
kPadPieceFieldNumber = 48,
kPretokenizationDelimiterFieldNumber = 53,
kSeedSentencepiecesFileFieldNumber = 54,
kSelfTestSampleSizeFieldNumber = 6,
kMiningSentenceSizeFieldNumber = 12,
kInputSentenceSizeFieldNumber = 11,
@ -583,6 +584,26 @@ class TrainerSpec PROTOBUF_FINAL :
std::string* _internal_mutable_pretokenization_delimiter();
public:
// optional string seed_sentencepieces_file = 54 [default = ""];
bool has_seed_sentencepieces_file() const;
private:
bool _internal_has_seed_sentencepieces_file() const;
public:
void clear_seed_sentencepieces_file();
const std::string& seed_sentencepieces_file() const;
void set_seed_sentencepieces_file(const std::string& value);
void set_seed_sentencepieces_file(std::string&& value);
void set_seed_sentencepieces_file(const char* value);
void set_seed_sentencepieces_file(const char* value, size_t size);
std::string* mutable_seed_sentencepieces_file();
std::string* release_seed_sentencepieces_file();
void set_allocated_seed_sentencepieces_file(std::string* seed_sentencepieces_file);
private:
const std::string& _internal_seed_sentencepieces_file() const;
void _internal_set_seed_sentencepieces_file(const std::string& value);
std::string* _internal_mutable_seed_sentencepieces_file();
public:
// optional int32 self_test_sample_size = 6 [default = 0];
bool has_self_test_sample_size() const;
private:
@ -1029,6 +1050,7 @@ class TrainerSpec PROTOBUF_FINAL :
static const ::PROTOBUF_NAMESPACE_ID::internal::LazyString _i_give_permission_to_break_this_code_default_pad_piece_;
::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr pad_piece_;
::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr pretokenization_delimiter_;
::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr seed_sentencepieces_file_;
::PROTOBUF_NAMESPACE_ID::int32 self_test_sample_size_;
::PROTOBUF_NAMESPACE_ID::int32 mining_sentence_size_;
::PROTOBUF_NAMESPACE_ID::uint64 input_sentence_size_;
@ -2262,7 +2284,7 @@ inline void TrainerSpec::set_allocated_model_prefix(std::string* model_prefix) {
// optional .sentencepiece.TrainerSpec.ModelType model_type = 3 [default = UNIGRAM];
inline bool TrainerSpec::_internal_has_model_type() const {
bool value = (_has_bits_[0] & 0x00800000u) != 0;
bool value = (_has_bits_[0] & 0x01000000u) != 0;
return value;
}
inline bool TrainerSpec::has_model_type() const {
@ -2270,7 +2292,7 @@ inline bool TrainerSpec::has_model_type() const {
}
inline void TrainerSpec::clear_model_type() {
model_type_ = 1;
_has_bits_[0] &= ~0x00800000u;
_has_bits_[0] &= ~0x01000000u;
}
inline ::sentencepiece::TrainerSpec_ModelType TrainerSpec::_internal_model_type() const {
return static_cast< ::sentencepiece::TrainerSpec_ModelType >(model_type_);
@ -2281,7 +2303,7 @@ inline ::sentencepiece::TrainerSpec_ModelType TrainerSpec::model_type() const {
}
inline void TrainerSpec::_internal_set_model_type(::sentencepiece::TrainerSpec_ModelType value) {
assert(::sentencepiece::TrainerSpec_ModelType_IsValid(value));
_has_bits_[0] |= 0x00800000u;
_has_bits_[0] |= 0x01000000u;
model_type_ = value;
}
inline void TrainerSpec::set_model_type(::sentencepiece::TrainerSpec_ModelType value) {
@ -2291,7 +2313,7 @@ inline void TrainerSpec::set_model_type(::sentencepiece::TrainerSpec_ModelType v
// optional int32 vocab_size = 4 [default = 8000];
inline bool TrainerSpec::_internal_has_vocab_size() const {
bool value = (_has_bits_[0] & 0x01000000u) != 0;
bool value = (_has_bits_[0] & 0x02000000u) != 0;
return value;
}
inline bool TrainerSpec::has_vocab_size() const {
@ -2299,7 +2321,7 @@ inline bool TrainerSpec::has_vocab_size() const {
}
inline void TrainerSpec::clear_vocab_size() {
vocab_size_ = 8000;
_has_bits_[0] &= ~0x01000000u;
_has_bits_[0] &= ~0x02000000u;
}
inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::_internal_vocab_size() const {
return vocab_size_;
@ -2309,7 +2331,7 @@ inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::vocab_size() const {
return _internal_vocab_size();
}
inline void TrainerSpec::_internal_set_vocab_size(::PROTOBUF_NAMESPACE_ID::int32 value) {
_has_bits_[0] |= 0x01000000u;
_has_bits_[0] |= 0x02000000u;
vocab_size_ = value;
}
inline void TrainerSpec::set_vocab_size(::PROTOBUF_NAMESPACE_ID::int32 value) {
@ -2393,7 +2415,7 @@ TrainerSpec::mutable_accept_language() {
// optional int32 self_test_sample_size = 6 [default = 0];
inline bool TrainerSpec::_internal_has_self_test_sample_size() const {
bool value = (_has_bits_[0] & 0x00000200u) != 0;
bool value = (_has_bits_[0] & 0x00000400u) != 0;
return value;
}
inline bool TrainerSpec::has_self_test_sample_size() const {
@ -2401,7 +2423,7 @@ inline bool TrainerSpec::has_self_test_sample_size() const {
}
inline void TrainerSpec::clear_self_test_sample_size() {
self_test_sample_size_ = 0;
_has_bits_[0] &= ~0x00000200u;
_has_bits_[0] &= ~0x00000400u;
}
inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::_internal_self_test_sample_size() const {
return self_test_sample_size_;
@ -2411,7 +2433,7 @@ inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::self_test_sample_size() const
return _internal_self_test_sample_size();
}
inline void TrainerSpec::_internal_set_self_test_sample_size(::PROTOBUF_NAMESPACE_ID::int32 value) {
_has_bits_[0] |= 0x00000200u;
_has_bits_[0] |= 0x00000400u;
self_test_sample_size_ = value;
}
inline void TrainerSpec::set_self_test_sample_size(::PROTOBUF_NAMESPACE_ID::int32 value) {
@ -2421,7 +2443,7 @@ inline void TrainerSpec::set_self_test_sample_size(::PROTOBUF_NAMESPACE_ID::int3
// optional bool enable_differential_privacy = 50 [default = false];
inline bool TrainerSpec::_internal_has_enable_differential_privacy() const {
bool value = (_has_bits_[0] & 0x00002000u) != 0;
bool value = (_has_bits_[0] & 0x00004000u) != 0;
return value;
}
inline bool TrainerSpec::has_enable_differential_privacy() const {
@ -2429,7 +2451,7 @@ inline bool TrainerSpec::has_enable_differential_privacy() const {
}
inline void TrainerSpec::clear_enable_differential_privacy() {
enable_differential_privacy_ = false;
_has_bits_[0] &= ~0x00002000u;
_has_bits_[0] &= ~0x00004000u;
}
inline bool TrainerSpec::_internal_enable_differential_privacy() const {
return enable_differential_privacy_;
@ -2439,7 +2461,7 @@ inline bool TrainerSpec::enable_differential_privacy() const {
return _internal_enable_differential_privacy();
}
inline void TrainerSpec::_internal_set_enable_differential_privacy(bool value) {
_has_bits_[0] |= 0x00002000u;
_has_bits_[0] |= 0x00004000u;
enable_differential_privacy_ = value;
}
inline void TrainerSpec::set_enable_differential_privacy(bool value) {
@ -2449,7 +2471,7 @@ inline void TrainerSpec::set_enable_differential_privacy(bool value) {
// optional float differential_privacy_noise_level = 51 [default = 0];
inline bool TrainerSpec::_internal_has_differential_privacy_noise_level() const {
bool value = (_has_bits_[0] & 0x00200000u) != 0;
bool value = (_has_bits_[0] & 0x00400000u) != 0;
return value;
}
inline bool TrainerSpec::has_differential_privacy_noise_level() const {
@ -2457,7 +2479,7 @@ inline bool TrainerSpec::has_differential_privacy_noise_level() const {
}
inline void TrainerSpec::clear_differential_privacy_noise_level() {
differential_privacy_noise_level_ = 0;
_has_bits_[0] &= ~0x00200000u;
_has_bits_[0] &= ~0x00400000u;
}
inline float TrainerSpec::_internal_differential_privacy_noise_level() const {
return differential_privacy_noise_level_;
@ -2467,7 +2489,7 @@ inline float TrainerSpec::differential_privacy_noise_level() const {
return _internal_differential_privacy_noise_level();
}
inline void TrainerSpec::_internal_set_differential_privacy_noise_level(float value) {
_has_bits_[0] |= 0x00200000u;
_has_bits_[0] |= 0x00400000u;
differential_privacy_noise_level_ = value;
}
inline void TrainerSpec::set_differential_privacy_noise_level(float value) {
@ -2477,7 +2499,7 @@ inline void TrainerSpec::set_differential_privacy_noise_level(float value) {
// optional uint64 differential_privacy_clipping_threshold = 52 [default = 0];
inline bool TrainerSpec::_internal_has_differential_privacy_clipping_threshold() const {
bool value = (_has_bits_[0] & 0x00400000u) != 0;
bool value = (_has_bits_[0] & 0x00800000u) != 0;
return value;
}
inline bool TrainerSpec::has_differential_privacy_clipping_threshold() const {
@ -2485,7 +2507,7 @@ inline bool TrainerSpec::has_differential_privacy_clipping_threshold() const {
}
inline void TrainerSpec::clear_differential_privacy_clipping_threshold() {
differential_privacy_clipping_threshold_ = PROTOBUF_ULONGLONG(0);
_has_bits_[0] &= ~0x00400000u;
_has_bits_[0] &= ~0x00800000u;
}
inline ::PROTOBUF_NAMESPACE_ID::uint64 TrainerSpec::_internal_differential_privacy_clipping_threshold() const {
return differential_privacy_clipping_threshold_;
@ -2495,7 +2517,7 @@ inline ::PROTOBUF_NAMESPACE_ID::uint64 TrainerSpec::differential_privacy_clippin
return _internal_differential_privacy_clipping_threshold();
}
inline void TrainerSpec::_internal_set_differential_privacy_clipping_threshold(::PROTOBUF_NAMESPACE_ID::uint64 value) {
_has_bits_[0] |= 0x00400000u;
_has_bits_[0] |= 0x00800000u;
differential_privacy_clipping_threshold_ = value;
}
inline void TrainerSpec::set_differential_privacy_clipping_threshold(::PROTOBUF_NAMESPACE_ID::uint64 value) {
@ -2505,7 +2527,7 @@ inline void TrainerSpec::set_differential_privacy_clipping_threshold(::PROTOBUF_
// optional float character_coverage = 10 [default = 0.9995];
inline bool TrainerSpec::_internal_has_character_coverage() const {
bool value = (_has_bits_[0] & 0x02000000u) != 0;
bool value = (_has_bits_[0] & 0x04000000u) != 0;
return value;
}
inline bool TrainerSpec::has_character_coverage() const {
@ -2513,7 +2535,7 @@ inline bool TrainerSpec::has_character_coverage() const {
}
inline void TrainerSpec::clear_character_coverage() {
character_coverage_ = 0.9995f;
_has_bits_[0] &= ~0x02000000u;
_has_bits_[0] &= ~0x04000000u;
}
inline float TrainerSpec::_internal_character_coverage() const {
return character_coverage_;
@ -2523,7 +2545,7 @@ inline float TrainerSpec::character_coverage() const {
return _internal_character_coverage();
}
inline void TrainerSpec::_internal_set_character_coverage(float value) {
_has_bits_[0] |= 0x02000000u;
_has_bits_[0] |= 0x04000000u;
character_coverage_ = value;
}
inline void TrainerSpec::set_character_coverage(float value) {
@ -2533,7 +2555,7 @@ inline void TrainerSpec::set_character_coverage(float value) {
// optional uint64 input_sentence_size = 11 [default = 0];
inline bool TrainerSpec::_internal_has_input_sentence_size() const {
bool value = (_has_bits_[0] & 0x00000800u) != 0;
bool value = (_has_bits_[0] & 0x00001000u) != 0;
return value;
}
inline bool TrainerSpec::has_input_sentence_size() const {
@ -2541,7 +2563,7 @@ inline bool TrainerSpec::has_input_sentence_size() const {
}
inline void TrainerSpec::clear_input_sentence_size() {
input_sentence_size_ = PROTOBUF_ULONGLONG(0);
_has_bits_[0] &= ~0x00000800u;
_has_bits_[0] &= ~0x00001000u;
}
inline ::PROTOBUF_NAMESPACE_ID::uint64 TrainerSpec::_internal_input_sentence_size() const {
return input_sentence_size_;
@ -2551,7 +2573,7 @@ inline ::PROTOBUF_NAMESPACE_ID::uint64 TrainerSpec::input_sentence_size() const
return _internal_input_sentence_size();
}
inline void TrainerSpec::_internal_set_input_sentence_size(::PROTOBUF_NAMESPACE_ID::uint64 value) {
_has_bits_[0] |= 0x00000800u;
_has_bits_[0] |= 0x00001000u;
input_sentence_size_ = value;
}
inline void TrainerSpec::set_input_sentence_size(::PROTOBUF_NAMESPACE_ID::uint64 value) {
@ -2561,7 +2583,7 @@ inline void TrainerSpec::set_input_sentence_size(::PROTOBUF_NAMESPACE_ID::uint64
// optional bool shuffle_input_sentence = 19 [default = true];
inline bool TrainerSpec::_internal_has_shuffle_input_sentence() const {
bool value = (_has_bits_[1] & 0x00000001u) != 0;
bool value = (_has_bits_[1] & 0x00000002u) != 0;
return value;
}
inline bool TrainerSpec::has_shuffle_input_sentence() const {
@ -2569,7 +2591,7 @@ inline bool TrainerSpec::has_shuffle_input_sentence() const {
}
inline void TrainerSpec::clear_shuffle_input_sentence() {
shuffle_input_sentence_ = true;
_has_bits_[1] &= ~0x00000001u;
_has_bits_[1] &= ~0x00000002u;
}
inline bool TrainerSpec::_internal_shuffle_input_sentence() const {
return shuffle_input_sentence_;
@ -2579,7 +2601,7 @@ inline bool TrainerSpec::shuffle_input_sentence() const {
return _internal_shuffle_input_sentence();
}
inline void TrainerSpec::_internal_set_shuffle_input_sentence(bool value) {
_has_bits_[1] |= 0x00000001u;
_has_bits_[1] |= 0x00000002u;
shuffle_input_sentence_ = value;
}
inline void TrainerSpec::set_shuffle_input_sentence(bool value) {
@ -2589,7 +2611,7 @@ inline void TrainerSpec::set_shuffle_input_sentence(bool value) {
// optional int32 mining_sentence_size = 12 [deprecated = true];
inline bool TrainerSpec::_internal_has_mining_sentence_size() const {
bool value = (_has_bits_[0] & 0x00000400u) != 0;
bool value = (_has_bits_[0] & 0x00000800u) != 0;
return value;
}
inline bool TrainerSpec::has_mining_sentence_size() const {
@ -2597,7 +2619,7 @@ inline bool TrainerSpec::has_mining_sentence_size() const {
}
inline void TrainerSpec::clear_mining_sentence_size() {
mining_sentence_size_ = 0;
_has_bits_[0] &= ~0x00000400u;
_has_bits_[0] &= ~0x00000800u;
}
inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::_internal_mining_sentence_size() const {
return mining_sentence_size_;
@ -2607,7 +2629,7 @@ inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::mining_sentence_size() const
return _internal_mining_sentence_size();
}
inline void TrainerSpec::_internal_set_mining_sentence_size(::PROTOBUF_NAMESPACE_ID::int32 value) {
_has_bits_[0] |= 0x00000400u;
_has_bits_[0] |= 0x00000800u;
mining_sentence_size_ = value;
}
inline void TrainerSpec::set_mining_sentence_size(::PROTOBUF_NAMESPACE_ID::int32 value) {
@ -2617,7 +2639,7 @@ inline void TrainerSpec::set_mining_sentence_size(::PROTOBUF_NAMESPACE_ID::int32
// optional int32 training_sentence_size = 13 [deprecated = true];
inline bool TrainerSpec::_internal_has_training_sentence_size() const {
bool value = (_has_bits_[0] & 0x00001000u) != 0;
bool value = (_has_bits_[0] & 0x00002000u) != 0;
return value;
}
inline bool TrainerSpec::has_training_sentence_size() const {
@ -2625,7 +2647,7 @@ inline bool TrainerSpec::has_training_sentence_size() const {
}
inline void TrainerSpec::clear_training_sentence_size() {
training_sentence_size_ = 0;
_has_bits_[0] &= ~0x00001000u;
_has_bits_[0] &= ~0x00002000u;
}
inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::_internal_training_sentence_size() const {
return training_sentence_size_;
@ -2635,7 +2657,7 @@ inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::training_sentence_size() cons
return _internal_training_sentence_size();
}
inline void TrainerSpec::_internal_set_training_sentence_size(::PROTOBUF_NAMESPACE_ID::int32 value) {
_has_bits_[0] |= 0x00001000u;
_has_bits_[0] |= 0x00002000u;
training_sentence_size_ = value;
}
inline void TrainerSpec::set_training_sentence_size(::PROTOBUF_NAMESPACE_ID::int32 value) {
@ -2645,7 +2667,7 @@ inline void TrainerSpec::set_training_sentence_size(::PROTOBUF_NAMESPACE_ID::int
// optional int32 seed_sentencepiece_size = 14 [default = 1000000];
inline bool TrainerSpec::_internal_has_seed_sentencepiece_size() const {
bool value = (_has_bits_[0] & 0x04000000u) != 0;
bool value = (_has_bits_[0] & 0x08000000u) != 0;
return value;
}
inline bool TrainerSpec::has_seed_sentencepiece_size() const {
@ -2653,7 +2675,7 @@ inline bool TrainerSpec::has_seed_sentencepiece_size() const {
}
inline void TrainerSpec::clear_seed_sentencepiece_size() {
seed_sentencepiece_size_ = 1000000;
_has_bits_[0] &= ~0x04000000u;
_has_bits_[0] &= ~0x08000000u;
}
inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::_internal_seed_sentencepiece_size() const {
return seed_sentencepiece_size_;
@ -2663,7 +2685,7 @@ inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::seed_sentencepiece_size() con
return _internal_seed_sentencepiece_size();
}
inline void TrainerSpec::_internal_set_seed_sentencepiece_size(::PROTOBUF_NAMESPACE_ID::int32 value) {
_has_bits_[0] |= 0x04000000u;
_has_bits_[0] |= 0x08000000u;
seed_sentencepiece_size_ = value;
}
inline void TrainerSpec::set_seed_sentencepiece_size(::PROTOBUF_NAMESPACE_ID::int32 value) {
@ -2673,7 +2695,7 @@ inline void TrainerSpec::set_seed_sentencepiece_size(::PROTOBUF_NAMESPACE_ID::in
// optional float shrinking_factor = 15 [default = 0.75];
inline bool TrainerSpec::_internal_has_shrinking_factor() const {
bool value = (_has_bits_[0] & 0x08000000u) != 0;
bool value = (_has_bits_[0] & 0x10000000u) != 0;
return value;
}
inline bool TrainerSpec::has_shrinking_factor() const {
@ -2681,7 +2703,7 @@ inline bool TrainerSpec::has_shrinking_factor() const {
}
inline void TrainerSpec::clear_shrinking_factor() {
shrinking_factor_ = 0.75f;
_has_bits_[0] &= ~0x08000000u;
_has_bits_[0] &= ~0x10000000u;
}
inline float TrainerSpec::_internal_shrinking_factor() const {
return shrinking_factor_;
@ -2691,7 +2713,7 @@ inline float TrainerSpec::shrinking_factor() const {
return _internal_shrinking_factor();
}
inline void TrainerSpec::_internal_set_shrinking_factor(float value) {
_has_bits_[0] |= 0x08000000u;
_has_bits_[0] |= 0x10000000u;
shrinking_factor_ = value;
}
inline void TrainerSpec::set_shrinking_factor(float value) {
@ -2701,7 +2723,7 @@ inline void TrainerSpec::set_shrinking_factor(float value) {
// optional int32 max_sentence_length = 18 [default = 4192];
inline bool TrainerSpec::_internal_has_max_sentence_length() const {
bool value = (_has_bits_[0] & 0x40000000u) != 0;
bool value = (_has_bits_[0] & 0x80000000u) != 0;
return value;
}
inline bool TrainerSpec::has_max_sentence_length() const {
@ -2709,7 +2731,7 @@ inline bool TrainerSpec::has_max_sentence_length() const {
}
inline void TrainerSpec::clear_max_sentence_length() {
max_sentence_length_ = 4192;
_has_bits_[0] &= ~0x40000000u;
_has_bits_[0] &= ~0x80000000u;
}
inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::_internal_max_sentence_length() const {
return max_sentence_length_;
@ -2719,7 +2741,7 @@ inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::max_sentence_length() const {
return _internal_max_sentence_length();
}
inline void TrainerSpec::_internal_set_max_sentence_length(::PROTOBUF_NAMESPACE_ID::int32 value) {
_has_bits_[0] |= 0x40000000u;
_has_bits_[0] |= 0x80000000u;
max_sentence_length_ = value;
}
inline void TrainerSpec::set_max_sentence_length(::PROTOBUF_NAMESPACE_ID::int32 value) {
@ -2729,7 +2751,7 @@ inline void TrainerSpec::set_max_sentence_length(::PROTOBUF_NAMESPACE_ID::int32
// optional int32 num_threads = 16 [default = 16];
inline bool TrainerSpec::_internal_has_num_threads() const {
bool value = (_has_bits_[0] & 0x10000000u) != 0;
bool value = (_has_bits_[0] & 0x20000000u) != 0;
return value;
}
inline bool TrainerSpec::has_num_threads() const {
@ -2737,7 +2759,7 @@ inline bool TrainerSpec::has_num_threads() const {
}
inline void TrainerSpec::clear_num_threads() {
num_threads_ = 16;
_has_bits_[0] &= ~0x10000000u;
_has_bits_[0] &= ~0x20000000u;
}
inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::_internal_num_threads() const {
return num_threads_;
@ -2747,7 +2769,7 @@ inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::num_threads() const {
return _internal_num_threads();
}
inline void TrainerSpec::_internal_set_num_threads(::PROTOBUF_NAMESPACE_ID::int32 value) {
_has_bits_[0] |= 0x10000000u;
_has_bits_[0] |= 0x20000000u;
num_threads_ = value;
}
inline void TrainerSpec::set_num_threads(::PROTOBUF_NAMESPACE_ID::int32 value) {
@ -2757,7 +2779,7 @@ inline void TrainerSpec::set_num_threads(::PROTOBUF_NAMESPACE_ID::int32 value) {
// optional int32 num_sub_iterations = 17 [default = 2];
inline bool TrainerSpec::_internal_has_num_sub_iterations() const {
bool value = (_has_bits_[0] & 0x20000000u) != 0;
bool value = (_has_bits_[0] & 0x40000000u) != 0;
return value;
}
inline bool TrainerSpec::has_num_sub_iterations() const {
@ -2765,7 +2787,7 @@ inline bool TrainerSpec::has_num_sub_iterations() const {
}
inline void TrainerSpec::clear_num_sub_iterations() {
num_sub_iterations_ = 2;
_has_bits_[0] &= ~0x20000000u;
_has_bits_[0] &= ~0x40000000u;
}
inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::_internal_num_sub_iterations() const {
return num_sub_iterations_;
@ -2775,7 +2797,7 @@ inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::num_sub_iterations() const {
return _internal_num_sub_iterations();
}
inline void TrainerSpec::_internal_set_num_sub_iterations(::PROTOBUF_NAMESPACE_ID::int32 value) {
_has_bits_[0] |= 0x20000000u;
_has_bits_[0] |= 0x40000000u;
num_sub_iterations_ = value;
}
inline void TrainerSpec::set_num_sub_iterations(::PROTOBUF_NAMESPACE_ID::int32 value) {
@ -2785,7 +2807,7 @@ inline void TrainerSpec::set_num_sub_iterations(::PROTOBUF_NAMESPACE_ID::int32 v
// optional int32 max_sentencepiece_length = 20 [default = 16];
inline bool TrainerSpec::_internal_has_max_sentencepiece_length() const {
bool value = (_has_bits_[0] & 0x80000000u) != 0;
bool value = (_has_bits_[1] & 0x00000001u) != 0;
return value;
}
inline bool TrainerSpec::has_max_sentencepiece_length() const {
@ -2793,7 +2815,7 @@ inline bool TrainerSpec::has_max_sentencepiece_length() const {
}
inline void TrainerSpec::clear_max_sentencepiece_length() {
max_sentencepiece_length_ = 16;
_has_bits_[0] &= ~0x80000000u;
_has_bits_[1] &= ~0x00000001u;
}
inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::_internal_max_sentencepiece_length() const {
return max_sentencepiece_length_;
@ -2803,7 +2825,7 @@ inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::max_sentencepiece_length() co
return _internal_max_sentencepiece_length();
}
inline void TrainerSpec::_internal_set_max_sentencepiece_length(::PROTOBUF_NAMESPACE_ID::int32 value) {
_has_bits_[0] |= 0x80000000u;
_has_bits_[1] |= 0x00000001u;
max_sentencepiece_length_ = value;
}
inline void TrainerSpec::set_max_sentencepiece_length(::PROTOBUF_NAMESPACE_ID::int32 value) {
@ -2813,7 +2835,7 @@ inline void TrainerSpec::set_max_sentencepiece_length(::PROTOBUF_NAMESPACE_ID::i
// optional bool split_by_unicode_script = 21 [default = true];
inline bool TrainerSpec::_internal_has_split_by_unicode_script() const {
bool value = (_has_bits_[1] & 0x00000002u) != 0;
bool value = (_has_bits_[1] & 0x00000004u) != 0;
return value;
}
inline bool TrainerSpec::has_split_by_unicode_script() const {
@ -2821,7 +2843,7 @@ inline bool TrainerSpec::has_split_by_unicode_script() const {
}
inline void TrainerSpec::clear_split_by_unicode_script() {
split_by_unicode_script_ = true;
_has_bits_[1] &= ~0x00000002u;
_has_bits_[1] &= ~0x00000004u;
}
inline bool TrainerSpec::_internal_split_by_unicode_script() const {
return split_by_unicode_script_;
@ -2831,7 +2853,7 @@ inline bool TrainerSpec::split_by_unicode_script() const {
return _internal_split_by_unicode_script();
}
inline void TrainerSpec::_internal_set_split_by_unicode_script(bool value) {
_has_bits_[1] |= 0x00000002u;
_has_bits_[1] |= 0x00000004u;
split_by_unicode_script_ = value;
}
inline void TrainerSpec::set_split_by_unicode_script(bool value) {
@ -2841,7 +2863,7 @@ inline void TrainerSpec::set_split_by_unicode_script(bool value) {
// optional bool split_by_number = 23 [default = true];
inline bool TrainerSpec::_internal_has_split_by_number() const {
bool value = (_has_bits_[1] & 0x00000004u) != 0;
bool value = (_has_bits_[1] & 0x00000008u) != 0;
return value;
}
inline bool TrainerSpec::has_split_by_number() const {
@ -2849,7 +2871,7 @@ inline bool TrainerSpec::has_split_by_number() const {
}
inline void TrainerSpec::clear_split_by_number() {
split_by_number_ = true;
_has_bits_[1] &= ~0x00000004u;
_has_bits_[1] &= ~0x00000008u;
}
inline bool TrainerSpec::_internal_split_by_number() const {
return split_by_number_;
@ -2859,7 +2881,7 @@ inline bool TrainerSpec::split_by_number() const {
return _internal_split_by_number();
}
inline void TrainerSpec::_internal_set_split_by_number(bool value) {
_has_bits_[1] |= 0x00000004u;
_has_bits_[1] |= 0x00000008u;
split_by_number_ = value;
}
inline void TrainerSpec::set_split_by_number(bool value) {
@ -2869,7 +2891,7 @@ inline void TrainerSpec::set_split_by_number(bool value) {
// optional bool split_by_whitespace = 22 [default = true];
inline bool TrainerSpec::_internal_has_split_by_whitespace() const {
bool value = (_has_bits_[1] & 0x00000008u) != 0;
bool value = (_has_bits_[1] & 0x00000010u) != 0;
return value;
}
inline bool TrainerSpec::has_split_by_whitespace() const {
@ -2877,7 +2899,7 @@ inline bool TrainerSpec::has_split_by_whitespace() const {
}
inline void TrainerSpec::clear_split_by_whitespace() {
split_by_whitespace_ = true;
_has_bits_[1] &= ~0x00000008u;
_has_bits_[1] &= ~0x00000010u;
}
inline bool TrainerSpec::_internal_split_by_whitespace() const {
return split_by_whitespace_;
@ -2887,7 +2909,7 @@ inline bool TrainerSpec::split_by_whitespace() const {
return _internal_split_by_whitespace();
}
inline void TrainerSpec::_internal_set_split_by_whitespace(bool value) {
_has_bits_[1] |= 0x00000008u;
_has_bits_[1] |= 0x00000010u;
split_by_whitespace_ = value;
}
inline void TrainerSpec::set_split_by_whitespace(bool value) {
@ -2897,7 +2919,7 @@ inline void TrainerSpec::set_split_by_whitespace(bool value) {
// optional bool treat_whitespace_as_suffix = 24 [default = false];
inline bool TrainerSpec::_internal_has_treat_whitespace_as_suffix() const {
bool value = (_has_bits_[0] & 0x00004000u) != 0;
bool value = (_has_bits_[0] & 0x00008000u) != 0;
return value;
}
inline bool TrainerSpec::has_treat_whitespace_as_suffix() const {
@ -2905,7 +2927,7 @@ inline bool TrainerSpec::has_treat_whitespace_as_suffix() const {
}
inline void TrainerSpec::clear_treat_whitespace_as_suffix() {
treat_whitespace_as_suffix_ = false;
_has_bits_[0] &= ~0x00004000u;
_has_bits_[0] &= ~0x00008000u;
}
inline bool TrainerSpec::_internal_treat_whitespace_as_suffix() const {
return treat_whitespace_as_suffix_;
@ -2915,7 +2937,7 @@ inline bool TrainerSpec::treat_whitespace_as_suffix() const {
return _internal_treat_whitespace_as_suffix();
}
inline void TrainerSpec::_internal_set_treat_whitespace_as_suffix(bool value) {
_has_bits_[0] |= 0x00004000u;
_has_bits_[0] |= 0x00008000u;
treat_whitespace_as_suffix_ = value;
}
inline void TrainerSpec::set_treat_whitespace_as_suffix(bool value) {
@ -2925,7 +2947,7 @@ inline void TrainerSpec::set_treat_whitespace_as_suffix(bool value) {
// optional bool allow_whitespace_only_pieces = 26 [default = false];
inline bool TrainerSpec::_internal_has_allow_whitespace_only_pieces() const {
bool value = (_has_bits_[0] & 0x00008000u) != 0;
bool value = (_has_bits_[0] & 0x00010000u) != 0;
return value;
}
inline bool TrainerSpec::has_allow_whitespace_only_pieces() const {
@ -2933,7 +2955,7 @@ inline bool TrainerSpec::has_allow_whitespace_only_pieces() const {
}
inline void TrainerSpec::clear_allow_whitespace_only_pieces() {
allow_whitespace_only_pieces_ = false;
_has_bits_[0] &= ~0x00008000u;
_has_bits_[0] &= ~0x00010000u;
}
inline bool TrainerSpec::_internal_allow_whitespace_only_pieces() const {
return allow_whitespace_only_pieces_;
@ -2943,7 +2965,7 @@ inline bool TrainerSpec::allow_whitespace_only_pieces() const {
return _internal_allow_whitespace_only_pieces();
}
inline void TrainerSpec::_internal_set_allow_whitespace_only_pieces(bool value) {
_has_bits_[0] |= 0x00008000u;
_has_bits_[0] |= 0x00010000u;
allow_whitespace_only_pieces_ = value;
}
inline void TrainerSpec::set_allow_whitespace_only_pieces(bool value) {
@ -2953,7 +2975,7 @@ inline void TrainerSpec::set_allow_whitespace_only_pieces(bool value) {
// optional bool split_digits = 25 [default = false];
inline bool TrainerSpec::_internal_has_split_digits() const {
bool value = (_has_bits_[0] & 0x00010000u) != 0;
bool value = (_has_bits_[0] & 0x00020000u) != 0;
return value;
}
inline bool TrainerSpec::has_split_digits() const {
@ -2961,7 +2983,7 @@ inline bool TrainerSpec::has_split_digits() const {
}
inline void TrainerSpec::clear_split_digits() {
split_digits_ = false;
_has_bits_[0] &= ~0x00010000u;
_has_bits_[0] &= ~0x00020000u;
}
inline bool TrainerSpec::_internal_split_digits() const {
return split_digits_;
@ -2971,7 +2993,7 @@ inline bool TrainerSpec::split_digits() const {
return _internal_split_digits();
}
inline void TrainerSpec::_internal_set_split_digits(bool value) {
_has_bits_[0] |= 0x00010000u;
_has_bits_[0] |= 0x00020000u;
split_digits_ = value;
}
inline void TrainerSpec::set_split_digits(bool value) {
@ -3275,7 +3297,7 @@ inline void TrainerSpec::set_allocated_required_chars(std::string* required_char
// optional bool byte_fallback = 35 [default = false];
inline bool TrainerSpec::_internal_has_byte_fallback() const {
bool value = (_has_bits_[0] & 0x00020000u) != 0;
bool value = (_has_bits_[0] & 0x00040000u) != 0;
return value;
}
inline bool TrainerSpec::has_byte_fallback() const {
@ -3283,7 +3305,7 @@ inline bool TrainerSpec::has_byte_fallback() const {
}
inline void TrainerSpec::clear_byte_fallback() {
byte_fallback_ = false;
_has_bits_[0] &= ~0x00020000u;
_has_bits_[0] &= ~0x00040000u;
}
inline bool TrainerSpec::_internal_byte_fallback() const {
return byte_fallback_;
@ -3293,7 +3315,7 @@ inline bool TrainerSpec::byte_fallback() const {
return _internal_byte_fallback();
}
inline void TrainerSpec::_internal_set_byte_fallback(bool value) {
_has_bits_[0] |= 0x00020000u;
_has_bits_[0] |= 0x00040000u;
byte_fallback_ = value;
}
inline void TrainerSpec::set_byte_fallback(bool value) {
@ -3303,7 +3325,7 @@ inline void TrainerSpec::set_byte_fallback(bool value) {
// optional bool vocabulary_output_piece_score = 32 [default = true];
inline bool TrainerSpec::_internal_has_vocabulary_output_piece_score() const {
bool value = (_has_bits_[1] & 0x00000010u) != 0;
bool value = (_has_bits_[1] & 0x00000020u) != 0;
return value;
}
inline bool TrainerSpec::has_vocabulary_output_piece_score() const {
@ -3311,7 +3333,7 @@ inline bool TrainerSpec::has_vocabulary_output_piece_score() const {
}
inline void TrainerSpec::clear_vocabulary_output_piece_score() {
vocabulary_output_piece_score_ = true;
_has_bits_[1] &= ~0x00000010u;
_has_bits_[1] &= ~0x00000020u;
}
inline bool TrainerSpec::_internal_vocabulary_output_piece_score() const {
return vocabulary_output_piece_score_;
@ -3321,7 +3343,7 @@ inline bool TrainerSpec::vocabulary_output_piece_score() const {
return _internal_vocabulary_output_piece_score();
}
inline void TrainerSpec::_internal_set_vocabulary_output_piece_score(bool value) {
_has_bits_[1] |= 0x00000010u;
_has_bits_[1] |= 0x00000020u;
vocabulary_output_piece_score_ = value;
}
inline void TrainerSpec::set_vocabulary_output_piece_score(bool value) {
@ -3331,7 +3353,7 @@ inline void TrainerSpec::set_vocabulary_output_piece_score(bool value) {
// optional bool hard_vocab_limit = 33 [default = true];
inline bool TrainerSpec::_internal_has_hard_vocab_limit() const {
bool value = (_has_bits_[1] & 0x00000020u) != 0;
bool value = (_has_bits_[1] & 0x00000040u) != 0;
return value;
}
inline bool TrainerSpec::has_hard_vocab_limit() const {
@ -3339,7 +3361,7 @@ inline bool TrainerSpec::has_hard_vocab_limit() const {
}
inline void TrainerSpec::clear_hard_vocab_limit() {
hard_vocab_limit_ = true;
_has_bits_[1] &= ~0x00000020u;
_has_bits_[1] &= ~0x00000040u;
}
inline bool TrainerSpec::_internal_hard_vocab_limit() const {
return hard_vocab_limit_;
@ -3349,7 +3371,7 @@ inline bool TrainerSpec::hard_vocab_limit() const {
return _internal_hard_vocab_limit();
}
inline void TrainerSpec::_internal_set_hard_vocab_limit(bool value) {
_has_bits_[1] |= 0x00000020u;
_has_bits_[1] |= 0x00000040u;
hard_vocab_limit_ = value;
}
inline void TrainerSpec::set_hard_vocab_limit(bool value) {
@ -3359,7 +3381,7 @@ inline void TrainerSpec::set_hard_vocab_limit(bool value) {
// optional bool use_all_vocab = 34 [default = false];
inline bool TrainerSpec::_internal_has_use_all_vocab() const {
bool value = (_has_bits_[0] & 0x00040000u) != 0;
bool value = (_has_bits_[0] & 0x00080000u) != 0;
return value;
}
inline bool TrainerSpec::has_use_all_vocab() const {
@ -3367,7 +3389,7 @@ inline bool TrainerSpec::has_use_all_vocab() const {
}
inline void TrainerSpec::clear_use_all_vocab() {
use_all_vocab_ = false;
_has_bits_[0] &= ~0x00040000u;
_has_bits_[0] &= ~0x00080000u;
}
inline bool TrainerSpec::_internal_use_all_vocab() const {
return use_all_vocab_;
@ -3377,7 +3399,7 @@ inline bool TrainerSpec::use_all_vocab() const {
return _internal_use_all_vocab();
}
inline void TrainerSpec::_internal_set_use_all_vocab(bool value) {
_has_bits_[0] |= 0x00040000u;
_has_bits_[0] |= 0x00080000u;
use_all_vocab_ = value;
}
inline void TrainerSpec::set_use_all_vocab(bool value) {
@ -3387,7 +3409,7 @@ inline void TrainerSpec::set_use_all_vocab(bool value) {
// optional int32 unk_id = 40 [default = 0];
inline bool TrainerSpec::_internal_has_unk_id() const {
bool value = (_has_bits_[0] & 0x00100000u) != 0;
bool value = (_has_bits_[0] & 0x00200000u) != 0;
return value;
}
inline bool TrainerSpec::has_unk_id() const {
@ -3395,7 +3417,7 @@ inline bool TrainerSpec::has_unk_id() const {
}
inline void TrainerSpec::clear_unk_id() {
unk_id_ = 0;
_has_bits_[0] &= ~0x00100000u;
_has_bits_[0] &= ~0x00200000u;
}
inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::_internal_unk_id() const {
return unk_id_;
@ -3405,7 +3427,7 @@ inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::unk_id() const {
return _internal_unk_id();
}
inline void TrainerSpec::_internal_set_unk_id(::PROTOBUF_NAMESPACE_ID::int32 value) {
_has_bits_[0] |= 0x00100000u;
_has_bits_[0] |= 0x00200000u;
unk_id_ = value;
}
inline void TrainerSpec::set_unk_id(::PROTOBUF_NAMESPACE_ID::int32 value) {
@ -3415,7 +3437,7 @@ inline void TrainerSpec::set_unk_id(::PROTOBUF_NAMESPACE_ID::int32 value) {
// optional int32 bos_id = 41 [default = 1];
inline bool TrainerSpec::_internal_has_bos_id() const {
bool value = (_has_bits_[1] & 0x00000040u) != 0;
bool value = (_has_bits_[1] & 0x00000080u) != 0;
return value;
}
inline bool TrainerSpec::has_bos_id() const {
@ -3423,7 +3445,7 @@ inline bool TrainerSpec::has_bos_id() const {
}
inline void TrainerSpec::clear_bos_id() {
bos_id_ = 1;
_has_bits_[1] &= ~0x00000040u;
_has_bits_[1] &= ~0x00000080u;
}
inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::_internal_bos_id() const {
return bos_id_;
@ -3433,7 +3455,7 @@ inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::bos_id() const {
return _internal_bos_id();
}
inline void TrainerSpec::_internal_set_bos_id(::PROTOBUF_NAMESPACE_ID::int32 value) {
_has_bits_[1] |= 0x00000040u;
_has_bits_[1] |= 0x00000080u;
bos_id_ = value;
}
inline void TrainerSpec::set_bos_id(::PROTOBUF_NAMESPACE_ID::int32 value) {
@ -3443,7 +3465,7 @@ inline void TrainerSpec::set_bos_id(::PROTOBUF_NAMESPACE_ID::int32 value) {
// optional int32 eos_id = 42 [default = 2];
inline bool TrainerSpec::_internal_has_eos_id() const {
bool value = (_has_bits_[1] & 0x00000080u) != 0;
bool value = (_has_bits_[1] & 0x00000100u) != 0;
return value;
}
inline bool TrainerSpec::has_eos_id() const {
@ -3451,7 +3473,7 @@ inline bool TrainerSpec::has_eos_id() const {
}
inline void TrainerSpec::clear_eos_id() {
eos_id_ = 2;
_has_bits_[1] &= ~0x00000080u;
_has_bits_[1] &= ~0x00000100u;
}
inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::_internal_eos_id() const {
return eos_id_;
@ -3461,7 +3483,7 @@ inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::eos_id() const {
return _internal_eos_id();
}
inline void TrainerSpec::_internal_set_eos_id(::PROTOBUF_NAMESPACE_ID::int32 value) {
_has_bits_[1] |= 0x00000080u;
_has_bits_[1] |= 0x00000100u;
eos_id_ = value;
}
inline void TrainerSpec::set_eos_id(::PROTOBUF_NAMESPACE_ID::int32 value) {
@ -3471,7 +3493,7 @@ inline void TrainerSpec::set_eos_id(::PROTOBUF_NAMESPACE_ID::int32 value) {
// optional int32 pad_id = 43 [default = -1];
inline bool TrainerSpec::_internal_has_pad_id() const {
bool value = (_has_bits_[1] & 0x00000100u) != 0;
bool value = (_has_bits_[1] & 0x00000200u) != 0;
return value;
}
inline bool TrainerSpec::has_pad_id() const {
@ -3479,7 +3501,7 @@ inline bool TrainerSpec::has_pad_id() const {
}
inline void TrainerSpec::clear_pad_id() {
pad_id_ = -1;
_has_bits_[1] &= ~0x00000100u;
_has_bits_[1] &= ~0x00000200u;
}
inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::_internal_pad_id() const {
return pad_id_;
@ -3489,7 +3511,7 @@ inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::pad_id() const {
return _internal_pad_id();
}
inline void TrainerSpec::_internal_set_pad_id(::PROTOBUF_NAMESPACE_ID::int32 value) {
_has_bits_[1] |= 0x00000100u;
_has_bits_[1] |= 0x00000200u;
pad_id_ = value;
}
inline void TrainerSpec::set_pad_id(::PROTOBUF_NAMESPACE_ID::int32 value) {
@ -3869,7 +3891,7 @@ inline void TrainerSpec::set_allocated_unk_surface(std::string* unk_surface) {
// optional bool train_extremely_large_corpus = 49 [default = false];
inline bool TrainerSpec::_internal_has_train_extremely_large_corpus() const {
bool value = (_has_bits_[0] & 0x00080000u) != 0;
bool value = (_has_bits_[0] & 0x00100000u) != 0;
return value;
}
inline bool TrainerSpec::has_train_extremely_large_corpus() const {
@ -3877,7 +3899,7 @@ inline bool TrainerSpec::has_train_extremely_large_corpus() const {
}
inline void TrainerSpec::clear_train_extremely_large_corpus() {
train_extremely_large_corpus_ = false;
_has_bits_[0] &= ~0x00080000u;
_has_bits_[0] &= ~0x00100000u;
}
inline bool TrainerSpec::_internal_train_extremely_large_corpus() const {
return train_extremely_large_corpus_;
@ -3887,7 +3909,7 @@ inline bool TrainerSpec::train_extremely_large_corpus() const {
return _internal_train_extremely_large_corpus();
}
inline void TrainerSpec::_internal_set_train_extremely_large_corpus(bool value) {
_has_bits_[0] |= 0x00080000u;
_has_bits_[0] |= 0x00100000u;
train_extremely_large_corpus_ = value;
}
inline void TrainerSpec::set_train_extremely_large_corpus(bool value) {
@ -3895,6 +3917,79 @@ inline void TrainerSpec::set_train_extremely_large_corpus(bool value) {
// @@protoc_insertion_point(field_set:sentencepiece.TrainerSpec.train_extremely_large_corpus)
}
// optional string seed_sentencepieces_file = 54 [default = ""];
// Presence test for seed_sentencepieces_file: true when mask 0x00000200 is
// set in word 0 of _has_bits_.
inline bool TrainerSpec::_internal_has_seed_sentencepieces_file() const {
  return (_has_bits_[0] & 0x00000200u) != 0;
}
// Public presence query; forwards to the internal has-bit check.
inline bool TrainerSpec::has_seed_sentencepieces_file() const {
  const bool is_present = _internal_has_seed_sentencepieces_file();
  return is_present;
}
// Resets seed_sentencepieces_file: drops its presence bit and empties the
// stored string.
inline void TrainerSpec::clear_seed_sentencepieces_file() {
  _has_bits_[0] &= ~0x00000200u;
  seed_sentencepieces_file_.ClearToEmpty();
}
// Read accessor: returns a reference to the stored seed_sentencepieces_file
// string via the internal getter.
inline const std::string& TrainerSpec::seed_sentencepieces_file() const {
  // @@protoc_insertion_point(field_get:sentencepiece.TrainerSpec.seed_sentencepieces_file)
  const std::string& current = _internal_seed_sentencepieces_file();
  return current;
}
// Copying setter: stores `value` through the internal setter, which also
// marks the field as present.
inline void TrainerSpec::set_seed_sentencepieces_file(
    const std::string& value) {
  _internal_set_seed_sentencepieces_file(value);
  // @@protoc_insertion_point(field_set:sentencepiece.TrainerSpec.seed_sentencepieces_file)
}
// Hands out a writable pointer to the field; the internal helper it calls
// sets the presence bit before returning the string.
inline std::string* TrainerSpec::mutable_seed_sentencepieces_file() {
  // @@protoc_insertion_point(field_mutable:sentencepiece.TrainerSpec.seed_sentencepieces_file)
  std::string* const writable = _internal_mutable_seed_sentencepieces_file();
  return writable;
}
// Internal getter: reads the string held by seed_sentencepieces_file_.
inline const std::string& TrainerSpec::_internal_seed_sentencepieces_file() const {
  const std::string& stored = seed_sentencepieces_file_.Get();
  return stored;
}
// Internal setter: marks the field present (mask 0x00000200 in word 0) and
// copies `value` into seed_sentencepieces_file_.
inline void TrainerSpec::_internal_set_seed_sentencepieces_file(
    const std::string& value) {
  _has_bits_[0] |= 0x00000200u;
  seed_sentencepieces_file_.Set(
      ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr::EmptyDefault{},
      value,
      GetArena());
}
// Moving setter: marks the field present and moves `value` into
// seed_sentencepieces_file_.
inline void TrainerSpec::set_seed_sentencepieces_file(std::string&& value) {
  _has_bits_[0] |= 0x00000200u;
  seed_sentencepieces_file_.Set(
      ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr::EmptyDefault{},
      ::std::move(value),
      GetArena());
  // @@protoc_insertion_point(field_set_rvalue:sentencepiece.TrainerSpec.seed_sentencepieces_file)
}
// C-string setter: `value` must be non-null (debug-checked). Marks the field
// present and stores a std::string built from `value`.
inline void TrainerSpec::set_seed_sentencepieces_file(const char* value) {
  GOOGLE_DCHECK(value != nullptr);
  _has_bits_[0] |= 0x00000200u;
  ::std::string copied(value);
  seed_sentencepieces_file_.Set(
      ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr::EmptyDefault{},
      ::std::move(copied),
      GetArena());
  // @@protoc_insertion_point(field_set_char:sentencepiece.TrainerSpec.seed_sentencepieces_file)
}
// Buffer setter: marks the field present and stores a std::string built from
// the first `size` bytes at `value`.
inline void TrainerSpec::set_seed_sentencepieces_file(const char* value,
    size_t size) {
  _has_bits_[0] |= 0x00000200u;
  // `value` is already a const char*, so it is passed to the string
  // constructor directly.
  ::std::string copied(value, size);
  seed_sentencepieces_file_.Set(
      ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr::EmptyDefault{},
      ::std::move(copied),
      GetArena());
  // @@protoc_insertion_point(field_set_pointer:sentencepiece.TrainerSpec.seed_sentencepieces_file)
}
// Internal mutable accessor: sets the presence bit, then returns the writable
// string obtained from seed_sentencepieces_file_.Mutable().
inline std::string* TrainerSpec::_internal_mutable_seed_sentencepieces_file() {
  _has_bits_[0] |= 0x00000200u;
  std::string* const writable = seed_sentencepieces_file_.Mutable(
      ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr::EmptyDefault{},
      GetArena());
  return writable;
}
// Releases the stored string to the caller. Returns nullptr when the field is
// not present; otherwise clears the presence bit and returns the result of
// ReleaseNonDefault().
inline std::string* TrainerSpec::release_seed_sentencepieces_file() {
  // @@protoc_insertion_point(field_release:sentencepiece.TrainerSpec.seed_sentencepieces_file)
  if (_internal_has_seed_sentencepieces_file()) {
    _has_bits_[0] &= ~0x00000200u;
    return seed_sentencepieces_file_.ReleaseNonDefault(
        &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(),
        GetArena());
  }
  return nullptr;
}
// Installs a caller-provided string as the field's storage. A null pointer
// clears the presence bit; any other pointer sets it. The pointer is then
// passed to SetAllocated() in both cases.
inline void TrainerSpec::set_allocated_seed_sentencepieces_file(std::string* seed_sentencepieces_file) {
  if (seed_sentencepieces_file == nullptr) {
    _has_bits_[0] &= ~0x00000200u;
  } else {
    _has_bits_[0] |= 0x00000200u;
  }
  seed_sentencepieces_file_.SetAllocated(
      &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(),
      seed_sentencepieces_file,
      GetArena());
  // @@protoc_insertion_point(field_set_allocated:sentencepiece.TrainerSpec.seed_sentencepieces_file)
}
// -------------------------------------------------------------------
// NormalizerSpec

View File

@ -20,7 +20,7 @@ option optimize_for = LITE_RUNTIME;
package sentencepiece;
// TrainerSpec encodes various parameters for SentencePiece training.
// Next id: 54
// Next id: 55
message TrainerSpec {
///////////////////////////////////////////////////////////////////
// General parameters
@ -232,6 +232,10 @@ message TrainerSpec {
// is increased memory usage.
optional bool train_extremely_large_corpus = 49 [default = false];
// Path to a file of precomputed seed sentencepieces. Each line holds one
// entry in the form: seed sentencepiece <tab> frequency.
optional string seed_sentencepieces_file = 54 [default = ""];
// Customized extensions: the range of field numbers
// are open to third-party extensions.
extensions 200 to max;

View File

@ -153,6 +153,7 @@ inline std::string PrintProto(const TrainerSpec &message,
PRINT_PARAM(byte_fallback);
PRINT_PARAM(vocabulary_output_piece_score);
PRINT_PARAM(train_extremely_large_corpus);
PRINT_PARAM(seed_sentencepieces_file);
PRINT_PARAM(hard_vocab_limit);
PRINT_PARAM(use_all_vocab);
PRINT_PARAM(unk_id);
@ -233,6 +234,7 @@ util::Status SentencePieceTrainer::SetProtoField(absl::string_view name,
PARSE_BOOL(hard_vocab_limit);
PARSE_BOOL(vocabulary_output_piece_score);
PARSE_BOOL(train_extremely_large_corpus);
PARSE_STRING(seed_sentencepieces_file);
PARSE_BOOL(use_all_vocab);
PARSE_INT32(unk_id);
PARSE_INT32(bos_id);

View File

@ -57,6 +57,8 @@ ABSL_FLAG(bool, shuffle_input_sentence,
ABSL_FLAG(int32, seed_sentencepiece_size,
kDefaultTrainerSpec.seed_sentencepiece_size(),
"the size of seed sentencepieces");
// Command-line flag naming a file from which seed sentencepieces are loaded.
// Its default value is the empty string.
ABSL_FLAG(std::string, seed_sentencepieces_file, "",
"file to load seed sentencepieces from");
ABSL_FLAG(double, shrinking_factor, kDefaultTrainerSpec.shrinking_factor(),
"Keeps top shrinking_factor pieces with respect to the loss");
ABSL_FLAG(int32, num_threads, kDefaultTrainerSpec.num_threads(),
@ -221,6 +223,7 @@ int main(int argc, char *argv[]) {
SetTrainerSpecFromFlag(input_sentence_size);
SetTrainerSpecFromFlag(shuffle_input_sentence);
SetTrainerSpecFromFlag(seed_sentencepiece_size);
SetTrainerSpecFromFlag(seed_sentencepieces_file);
SetTrainerSpecFromFlag(shrinking_factor);
SetTrainerSpecFromFlag(num_threads);
SetTrainerSpecFromFlag(num_sub_iterations);

View File

@ -58,6 +58,11 @@ util::Status VerifySpec(const TrainerSpec &trainer_spec) {
<< "--use_all_vocab=true is valid for WORD/CHAR model.";
}
if (!trainer_spec.seed_sentencepieces_file().empty()) {
CHECK_OR_RETURN(trainer_spec.model_type() == TrainerSpec::UNIGRAM)
<< "seed_sentencepieces_file is only supported for UNIGRAM model.";
}
// Passes the inclusive range test `minval <= variable <= maxval` to
// CHECK_OR_RETURN. The arguments are substituted without extra parentheses and
// `variable` appears twice in the expansion, so callers should pass simple,
// side-effect-free expressions.
#define CHECK_RANGE(variable, minval, maxval) \
CHECK_OR_RETURN(variable >= minval && variable <= maxval)

View File

@ -24,13 +24,16 @@
#include <utility>
#include <vector>
#include "filesystem.h"
#include "normalizer.h"
#include "pretokenizer_for_training.h"
#include "sentencepiece_trainer.h"
#include "third_party/absl/container/flat_hash_map.h"
#include "third_party/absl/strings/numbers.h"
#include "third_party/absl/strings/str_replace.h"
#include "third_party/absl/strings/str_split.h"
#include "third_party/esaxx/esa.hxx" // Suffix array library.
#include "trainer_interface.h"
#include "unicode_script.h"
#include "util.h"
@ -204,68 +207,104 @@ TrainerModel::SentencePieces Trainer::MakeSeedSentencePiecesInternal() {
}
}
CHECK_LE(array.size(),
static_cast<size_t>(std::numeric_limits<node_int_type>::max()))
<< "Input corpus too large, try with train_extremely_large_corpus=true";
const node_int_type n = array.size();
std::vector<node_int_type> SA(n); // suffix array
std::vector<node_int_type> L(n); // left boundaries of internal node
std::vector<node_int_type> R(n); // right boundaries of internal node
std::vector<node_int_type> D(n); // depths of internal node
// Makes a suffix array to extract all sub strings occurring
// more than 2 times in the sentence.
constexpr node_int_type kAlphabetSize = 0x110000; // All UCS4 range.
node_int_type node_num = 0;
LOG(INFO) << "Making suffix array...";
CHECK_EQ(0, esaxx(array.begin(), SA.begin(), L.begin(), R.begin(), D.begin(),
n, kAlphabetSize, node_num));
LOG(INFO) << "Extracting frequent sub strings... node_num=" << node_num;
BoundedPriorityQueue<node_int_type> queue(
static_cast<size_t>(trainer_spec_.seed_sentencepiece_size()));
for (node_int_type i = 0; i < node_num; ++i) {
const node_int_type offset = SA[L[i]];
const node_int_type len = D[i];
if (len <= 1 || offset >= array.size() || offset + len >= array.size()) {
continue;
}
const char32 *begin = &array[offset];
const char32 *end = &array[offset + len];
// Skips if a substring contains a sentence boundary.
if (std::find(begin, end, kSentenceBoundary) != end) {
continue;
}
const UnicodeText uw(begin, end);
if (!IsValidSentencePiece(uw)) {
continue;
}
// character-wise coverage is the default score.
const node_int_type freq = R[i] - L[i];
const node_int_type score = freq * len;
queue.push(i, score);
}
// all_chars must be included in the seed sentencepieces.
TrainerModel::SentencePieces seed_sentencepieces;
for (const auto &it : Sorted(all_chars)) {
seed_sentencepieces.emplace_back(it);
}
for (const auto &p : queue.get()) {
const node_int_type offset = SA[L[p.first]];
const node_int_type len = D[p.first];
CHECK_GT(len, 0);
const char32 *begin = &array[offset];
const char32 *end = &array[offset + len];
const UnicodeText uw(begin, end);
const std::string w = string_util::UnicodeTextToUTF8(uw);
CHECK(IsValidSentencePiece(uw)); // just in case.
CHECK(!port::ContainsKey(all_chars, w));
seed_sentencepieces.emplace_back(w, p.second);
// Seed pieces come either from a user-supplied file or, when no file is
// given, from frequent substrings of the training corpus found with a suffix
// array.
if (!trainer_spec_.seed_sentencepieces_file().empty()) {
  auto seed_sentencepieces_file = sentencepiece::filesystem::NewReadableFile(
      trainer_spec_.seed_sentencepieces_file());
  // Fail loudly if the file could not be opened, rather than continuing with
  // whatever ReadLine() yields from a bad handle.
  CHECK(seed_sentencepieces_file->status().ok())
      << "Could not open seed_sentencepieces_file: "
      << trainer_spec_.seed_sentencepieces_file() << " "
      << seed_sentencepieces_file->status().ToString();

  std::string line;
  int64_t freq = 1;
  int skipped_sentencepieces = 0;
  // Each line is expected to be "<sentencepiece>\t<frequency>".
  while (seed_sentencepieces_file->ReadLine(&line)) {
    const std::vector<std::string> fields = absl::StrSplit(line, '\t');
    CHECK_GE(fields.size(), 2)
        << "Expected <sentencepiece>\\t<frequency>; line: " << line;
    const auto &seed_sentencepiece = fields[0];
    CHECK(absl::SimpleAtoi(fields[1], &freq))
        << "Could not parse the frequency; line: " << line;

    const UnicodeText uw = string_util::UTF8ToUnicodeText(seed_sentencepiece);
    if (!IsValidSentencePiece(uw)) {
      ++skipped_sentencepieces;
      continue;
    }

    // Initialise score of a piece by character coverage.
    seed_sentencepieces.emplace_back(seed_sentencepiece, freq * uw.size());
    if (seed_sentencepieces.size() % 1000000 == 0) {
      LOG(INFO) << "loaded " << seed_sentencepieces.size()
                << " seed sentencepieces";
    }
  }
  LOG(INFO) << "skipped " << skipped_sentencepieces << " seed sentencepieces";

  // Take highest scoring pieces as initial vocab.
  // NOTE(review): seed_sentencepieces may already hold entries appended before
  // this block, so the truncation below can drop those as well. Confirm that
  // this is intended.
  seed_sentencepieces = Sorted(seed_sentencepieces);
  seed_sentencepieces.resize(std::min<size_t>(
      trainer_spec_.seed_sentencepiece_size(), seed_sentencepieces.size()));
  LOG(INFO) << "Initialized " << seed_sentencepieces.size()
            << " seed sentencepieces from file.";
} else {
  CHECK_LE(array.size(),
           static_cast<size_t>(std::numeric_limits<node_int_type>::max()))
      << "Input corpus too large, try with train_extremely_large_corpus=true";
  const node_int_type n = array.size();

  std::vector<node_int_type> SA(n);  // suffix array
  std::vector<node_int_type> L(n);   // left boundaries of internal node
  std::vector<node_int_type> R(n);   // right boundaries of internal node
  std::vector<node_int_type> D(n);   // depths of internal node

  // Makes a suffix array to extract all sub strings occurring
  // more than 2 times in the sentence.
  constexpr node_int_type kAlphabetSize = 0x110000;  // All UCS4 range.
  node_int_type node_num = 0;
  LOG(INFO) << "Making suffix array...";
  CHECK_EQ(0, esaxx(array.begin(), SA.begin(), L.begin(), R.begin(),
                    D.begin(), n, kAlphabetSize, node_num));

  LOG(INFO) << "Extracting frequent sub strings... node_num=" << node_num;

  BoundedPriorityQueue<node_int_type> queue(
      static_cast<size_t>(trainer_spec_.seed_sentencepiece_size()));

  for (node_int_type i = 0; i < node_num; ++i) {
    const node_int_type offset = SA[L[i]];
    const node_int_type len = D[i];
    // Skip single-character nodes, and nodes whose [offset, offset + len]
    // span does not lie inside `array`: the pointers formed below index the
    // buffer at both ends of that span.
    if (len <= 1 || offset >= array.size() || offset + len >= array.size()) {
      continue;
    }
    const char32 *begin = &array[offset];
    const char32 *end = &array[offset + len];

    // Skips if a substring contains a sentence boundary.
    if (std::find(begin, end, kSentenceBoundary) != end) {
      continue;
    }

    const UnicodeText uw(begin, end);
    if (!IsValidSentencePiece(uw)) {
      continue;
    }

    // character-wise coverage is the default score.
    const node_int_type freq = R[i] - L[i];
    const node_int_type score = freq * len;
    queue.push(i, score);
  }

  // Convert every node kept by the bounded queue back to UTF-8 and append it
  // with its score.
  for (const auto &p : queue.get()) {
    const node_int_type offset = SA[L[p.first]];
    const node_int_type len = D[p.first];
    CHECK_GT(len, 0);
    const char32 *begin = &array[offset];
    const char32 *end = &array[offset + len];
    const UnicodeText uw(begin, end);
    const std::string w = string_util::UnicodeTextToUTF8(uw);
    CHECK(IsValidSentencePiece(uw));  // just in case.
    CHECK(!port::ContainsKey(all_chars, w));
    seed_sentencepieces.emplace_back(w, p.second);
  }
}
ToLogProb(seed_sentencepieces.begin(), seed_sentencepieces.end());