change the type of input_sentence_size from int32 to uint64

This commit is contained in:
Taku Kudo 2021-01-08 16:20:57 +09:00
parent da6f3a893c
commit 8eaa672a37
8 changed files with 61 additions and 61 deletions

View File

@ -19,7 +19,7 @@ DESCRIPTOR = _descriptor.FileDescriptor(
syntax='proto2',
serialized_options=b'H\003',
create_key=_descriptor._internal_create_key,
serialized_pb=b'\n\x19sentencepiece_model.proto\x12\rsentencepiece\"\xa1\n\n\x0bTrainerSpec\x12\r\n\x05input\x18\x01 \x03(\t\x12\x14\n\x0cinput_format\x18\x07 \x01(\t\x12\x14\n\x0cmodel_prefix\x18\x02 \x01(\t\x12\x41\n\nmodel_type\x18\x03 \x01(\x0e\x32$.sentencepiece.TrainerSpec.ModelType:\x07UNIGRAM\x12\x18\n\nvocab_size\x18\x04 \x01(\x05:\x04\x38\x30\x30\x30\x12\x17\n\x0f\x61\x63\x63\x65pt_language\x18\x05 \x03(\t\x12 \n\x15self_test_sample_size\x18\x06 \x01(\x05:\x01\x30\x12\"\n\x12\x63haracter_coverage\x18\n \x01(\x02:\x06\x30.9995\x12\x1e\n\x13input_sentence_size\x18\x0b \x01(\x05:\x01\x30\x12$\n\x16shuffle_input_sentence\x18\x13 \x01(\x08:\x04true\x12 \n\x14mining_sentence_size\x18\x0c \x01(\x05\x42\x02\x18\x01\x12\"\n\x16training_sentence_size\x18\r \x01(\x05\x42\x02\x18\x01\x12(\n\x17seed_sentencepiece_size\x18\x0e \x01(\x05:\x07\x31\x30\x30\x30\x30\x30\x30\x12\x1e\n\x10shrinking_factor\x18\x0f \x01(\x02:\x04\x30.75\x12!\n\x13max_sentence_length\x18\x12 \x01(\x05:\x04\x34\x31\x39\x32\x12\x17\n\x0bnum_threads\x18\x10 \x01(\x05:\x02\x31\x36\x12\x1d\n\x12num_sub_iterations\x18\x11 \x01(\x05:\x01\x32\x12$\n\x18max_sentencepiece_length\x18\x14 \x01(\x05:\x02\x31\x36\x12%\n\x17split_by_unicode_script\x18\x15 \x01(\x08:\x04true\x12\x1d\n\x0fsplit_by_number\x18\x17 \x01(\x08:\x04true\x12!\n\x13split_by_whitespace\x18\x16 \x01(\x08:\x04true\x12)\n\x1atreat_whitespace_as_suffix\x18\x18 \x01(\x08:\x05\x66\x61lse\x12\x1b\n\x0csplit_digits\x18\x19 \x01(\x08:\x05\x66\x61lse\x12\x17\n\x0f\x63ontrol_symbols\x18\x1e \x03(\t\x12\x1c\n\x14user_defined_symbols\x18\x1f \x03(\t\x12\x16\n\x0erequired_chars\x18$ \x01(\t\x12\x1c\n\rbyte_fallback\x18# \x01(\x08:\x05\x66\x61lse\x12+\n\x1dvocabulary_output_piece_score\x18 \x01(\x08:\x04true\x12\x1e\n\x10hard_vocab_limit\x18! 
\x01(\x08:\x04true\x12\x1c\n\ruse_all_vocab\x18\" \x01(\x08:\x05\x66\x61lse\x12\x11\n\x06unk_id\x18( \x01(\x05:\x01\x30\x12\x11\n\x06\x62os_id\x18) \x01(\x05:\x01\x31\x12\x11\n\x06\x65os_id\x18* \x01(\x05:\x01\x32\x12\x12\n\x06pad_id\x18+ \x01(\x05:\x02-1\x12\x18\n\tunk_piece\x18- \x01(\t:\x05<unk>\x12\x16\n\tbos_piece\x18. \x01(\t:\x03<s>\x12\x17\n\teos_piece\x18/ \x01(\t:\x04</s>\x12\x18\n\tpad_piece\x18\x30 \x01(\t:\x05<pad>\x12\x1a\n\x0bunk_surface\x18, \x01(\t:\x05 \xe2\x81\x87 \x12+\n\x1ctrain_extremely_large_corpus\x18\x31 \x01(\x08:\x05\x66\x61lse\"5\n\tModelType\x12\x0b\n\x07UNIGRAM\x10\x01\x12\x07\n\x03\x42PE\x10\x02\x12\x08\n\x04WORD\x10\x03\x12\x08\n\x04\x43HAR\x10\x04*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\"\xd1\x01\n\x0eNormalizerSpec\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x1c\n\x14precompiled_charsmap\x18\x02 \x01(\x0c\x12\x1e\n\x10\x61\x64\x64_dummy_prefix\x18\x03 \x01(\x08:\x04true\x12&\n\x18remove_extra_whitespaces\x18\x04 \x01(\x08:\x04true\x12 \n\x12\x65scape_whitespaces\x18\x05 \x01(\x08:\x04true\x12\x1e\n\x16normalization_rule_tsv\x18\x06 \x01(\t*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\"y\n\x0cSelfTestData\x12\x33\n\x07samples\x18\x01 \x03(\x0b\x32\".sentencepiece.SelfTestData.Sample\x1a)\n\x06Sample\x12\r\n\x05input\x18\x01 \x01(\t\x12\x10\n\x08\x65xpected\x18\x02 \x01(\t*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\"\xfe\x03\n\nModelProto\x12\x37\n\x06pieces\x18\x01 \x03(\x0b\x32\'.sentencepiece.ModelProto.SentencePiece\x12\x30\n\x0ctrainer_spec\x18\x02 \x01(\x0b\x32\x1a.sentencepiece.TrainerSpec\x12\x36\n\x0fnormalizer_spec\x18\x03 \x01(\x0b\x32\x1d.sentencepiece.NormalizerSpec\x12\x33\n\x0eself_test_data\x18\x04 \x01(\x0b\x32\x1b.sentencepiece.SelfTestData\x12\x38\n\x11\x64\x65normalizer_spec\x18\x05 \x01(\x0b\x32\x1d.sentencepiece.NormalizerSpec\x1a\xd2\x01\n\rSentencePiece\x12\r\n\x05piece\x18\x01 \x01(\t\x12\r\n\x05score\x18\x02 \x01(\x02\x12\x42\n\x04type\x18\x03 
\x01(\x0e\x32,.sentencepiece.ModelProto.SentencePiece.Type:\x06NORMAL\"T\n\x04Type\x12\n\n\x06NORMAL\x10\x01\x12\x0b\n\x07UNKNOWN\x10\x02\x12\x0b\n\x07\x43ONTROL\x10\x03\x12\x10\n\x0cUSER_DEFINED\x10\x04\x12\x08\n\x04\x42YTE\x10\x06\x12\n\n\x06UNUSED\x10\x05*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\x42\x02H\x03'
serialized_pb=b'\n\x19sentencepiece_model.proto\x12\rsentencepiece\"\xa1\n\n\x0bTrainerSpec\x12\r\n\x05input\x18\x01 \x03(\t\x12\x14\n\x0cinput_format\x18\x07 \x01(\t\x12\x14\n\x0cmodel_prefix\x18\x02 \x01(\t\x12\x41\n\nmodel_type\x18\x03 \x01(\x0e\x32$.sentencepiece.TrainerSpec.ModelType:\x07UNIGRAM\x12\x18\n\nvocab_size\x18\x04 \x01(\x05:\x04\x38\x30\x30\x30\x12\x17\n\x0f\x61\x63\x63\x65pt_language\x18\x05 \x03(\t\x12 \n\x15self_test_sample_size\x18\x06 \x01(\x05:\x01\x30\x12\"\n\x12\x63haracter_coverage\x18\n \x01(\x02:\x06\x30.9995\x12\x1e\n\x13input_sentence_size\x18\x0b \x01(\x04:\x01\x30\x12$\n\x16shuffle_input_sentence\x18\x13 \x01(\x08:\x04true\x12 \n\x14mining_sentence_size\x18\x0c \x01(\x05\x42\x02\x18\x01\x12\"\n\x16training_sentence_size\x18\r \x01(\x05\x42\x02\x18\x01\x12(\n\x17seed_sentencepiece_size\x18\x0e \x01(\x05:\x07\x31\x30\x30\x30\x30\x30\x30\x12\x1e\n\x10shrinking_factor\x18\x0f \x01(\x02:\x04\x30.75\x12!\n\x13max_sentence_length\x18\x12 \x01(\x05:\x04\x34\x31\x39\x32\x12\x17\n\x0bnum_threads\x18\x10 \x01(\x05:\x02\x31\x36\x12\x1d\n\x12num_sub_iterations\x18\x11 \x01(\x05:\x01\x32\x12$\n\x18max_sentencepiece_length\x18\x14 \x01(\x05:\x02\x31\x36\x12%\n\x17split_by_unicode_script\x18\x15 \x01(\x08:\x04true\x12\x1d\n\x0fsplit_by_number\x18\x17 \x01(\x08:\x04true\x12!\n\x13split_by_whitespace\x18\x16 \x01(\x08:\x04true\x12)\n\x1atreat_whitespace_as_suffix\x18\x18 \x01(\x08:\x05\x66\x61lse\x12\x1b\n\x0csplit_digits\x18\x19 \x01(\x08:\x05\x66\x61lse\x12\x17\n\x0f\x63ontrol_symbols\x18\x1e \x03(\t\x12\x1c\n\x14user_defined_symbols\x18\x1f \x03(\t\x12\x16\n\x0erequired_chars\x18$ \x01(\t\x12\x1c\n\rbyte_fallback\x18# \x01(\x08:\x05\x66\x61lse\x12+\n\x1dvocabulary_output_piece_score\x18 \x01(\x08:\x04true\x12\x1e\n\x10hard_vocab_limit\x18! 
\x01(\x08:\x04true\x12\x1c\n\ruse_all_vocab\x18\" \x01(\x08:\x05\x66\x61lse\x12\x11\n\x06unk_id\x18( \x01(\x05:\x01\x30\x12\x11\n\x06\x62os_id\x18) \x01(\x05:\x01\x31\x12\x11\n\x06\x65os_id\x18* \x01(\x05:\x01\x32\x12\x12\n\x06pad_id\x18+ \x01(\x05:\x02-1\x12\x18\n\tunk_piece\x18- \x01(\t:\x05<unk>\x12\x16\n\tbos_piece\x18. \x01(\t:\x03<s>\x12\x17\n\teos_piece\x18/ \x01(\t:\x04</s>\x12\x18\n\tpad_piece\x18\x30 \x01(\t:\x05<pad>\x12\x1a\n\x0bunk_surface\x18, \x01(\t:\x05 \xe2\x81\x87 \x12+\n\x1ctrain_extremely_large_corpus\x18\x31 \x01(\x08:\x05\x66\x61lse\"5\n\tModelType\x12\x0b\n\x07UNIGRAM\x10\x01\x12\x07\n\x03\x42PE\x10\x02\x12\x08\n\x04WORD\x10\x03\x12\x08\n\x04\x43HAR\x10\x04*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\"\xd1\x01\n\x0eNormalizerSpec\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x1c\n\x14precompiled_charsmap\x18\x02 \x01(\x0c\x12\x1e\n\x10\x61\x64\x64_dummy_prefix\x18\x03 \x01(\x08:\x04true\x12&\n\x18remove_extra_whitespaces\x18\x04 \x01(\x08:\x04true\x12 \n\x12\x65scape_whitespaces\x18\x05 \x01(\x08:\x04true\x12\x1e\n\x16normalization_rule_tsv\x18\x06 \x01(\t*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\"y\n\x0cSelfTestData\x12\x33\n\x07samples\x18\x01 \x03(\x0b\x32\".sentencepiece.SelfTestData.Sample\x1a)\n\x06Sample\x12\r\n\x05input\x18\x01 \x01(\t\x12\x10\n\x08\x65xpected\x18\x02 \x01(\t*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\"\xfe\x03\n\nModelProto\x12\x37\n\x06pieces\x18\x01 \x03(\x0b\x32\'.sentencepiece.ModelProto.SentencePiece\x12\x30\n\x0ctrainer_spec\x18\x02 \x01(\x0b\x32\x1a.sentencepiece.TrainerSpec\x12\x36\n\x0fnormalizer_spec\x18\x03 \x01(\x0b\x32\x1d.sentencepiece.NormalizerSpec\x12\x33\n\x0eself_test_data\x18\x04 \x01(\x0b\x32\x1b.sentencepiece.SelfTestData\x12\x38\n\x11\x64\x65normalizer_spec\x18\x05 \x01(\x0b\x32\x1d.sentencepiece.NormalizerSpec\x1a\xd2\x01\n\rSentencePiece\x12\r\n\x05piece\x18\x01 \x01(\t\x12\r\n\x05score\x18\x02 \x01(\x02\x12\x42\n\x04type\x18\x03 
\x01(\x0e\x32,.sentencepiece.ModelProto.SentencePiece.Type:\x06NORMAL\"T\n\x04Type\x12\n\n\x06NORMAL\x10\x01\x12\x0b\n\x07UNKNOWN\x10\x02\x12\x0b\n\x07\x43ONTROL\x10\x03\x12\x10\n\x0cUSER_DEFINED\x10\x04\x12\x08\n\x04\x42YTE\x10\x06\x12\n\n\x06UNUSED\x10\x05*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\x42\x02H\x03'
)
@ -171,7 +171,7 @@ _TRAINERSPEC = _descriptor.Descriptor(
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
_descriptor.FieldDescriptor(
name='input_sentence_size', full_name='sentencepiece.TrainerSpec.input_sentence_size', index=8,
number=11, type=5, cpp_type=1, label=1,
number=11, type=4, cpp_type=4, label=1,
has_default_value=True, default_value=0,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,

View File

@ -297,13 +297,13 @@ class TrainerSpec::_Internal {
(*has_bits)[0] |= 1048576u;
}
static void set_has_input_sentence_size(HasBits* has_bits) {
(*has_bits)[0] |= 512u;
(*has_bits)[0] |= 1024u;
}
static void set_has_shuffle_input_sentence(HasBits* has_bits) {
(*has_bits)[0] |= 134217728u;
}
static void set_has_mining_sentence_size(HasBits* has_bits) {
(*has_bits)[0] |= 1024u;
(*has_bits)[0] |= 512u;
}
static void set_has_training_sentence_size(HasBits* has_bits) {
(*has_bits)[0] |= 2048u;
@ -686,7 +686,7 @@ const char* TrainerSpec::_InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID
ptr += sizeof(float);
} else goto handle_unusual;
continue;
// optional int32 input_sentence_size = 11 [default = 0];
// optional uint64 input_sentence_size = 11 [default = 0];
case 11:
if (PROTOBUF_PREDICT_TRUE(static_cast<::PROTOBUF_NAMESPACE_ID::uint8>(tag) == 88)) {
_Internal::set_has_input_sentence_size(&_has_bits_);
@ -1036,14 +1036,14 @@ failure:
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteFloatToArray(10, this->_internal_character_coverage(), target);
}
// optional int32 input_sentence_size = 11 [default = 0];
if (cached_has_bits & 0x00000200u) {
// optional uint64 input_sentence_size = 11 [default = 0];
if (cached_has_bits & 0x00000400u) {
target = stream->EnsureSpace(target);
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteInt32ToArray(11, this->_internal_input_sentence_size(), target);
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteUInt64ToArray(11, this->_internal_input_sentence_size(), target);
}
// optional int32 mining_sentence_size = 12 [deprecated = true];
if (cached_has_bits & 0x00000400u) {
if (cached_has_bits & 0x00000200u) {
target = stream->EnsureSpace(target);
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteInt32ToArray(12, this->_internal_mining_sentence_size(), target);
}
@ -1353,18 +1353,18 @@ size_t TrainerSpec::ByteSizeLong() const {
this->_internal_self_test_sample_size());
}
// optional int32 input_sentence_size = 11 [default = 0];
// optional int32 mining_sentence_size = 12 [deprecated = true];
if (cached_has_bits & 0x00000200u) {
total_size += 1 +
::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::Int32Size(
this->_internal_input_sentence_size());
this->_internal_mining_sentence_size());
}
// optional int32 mining_sentence_size = 12 [deprecated = true];
// optional uint64 input_sentence_size = 11 [default = 0];
if (cached_has_bits & 0x00000400u) {
total_size += 1 +
::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::Int32Size(
this->_internal_mining_sentence_size());
::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::UInt64Size(
this->_internal_input_sentence_size());
}
// optional int32 training_sentence_size = 13 [deprecated = true];
@ -1581,10 +1581,10 @@ void TrainerSpec::MergeFrom(const TrainerSpec& from) {
self_test_sample_size_ = from.self_test_sample_size_;
}
if (cached_has_bits & 0x00000200u) {
input_sentence_size_ = from.input_sentence_size_;
mining_sentence_size_ = from.mining_sentence_size_;
}
if (cached_has_bits & 0x00000400u) {
mining_sentence_size_ = from.mining_sentence_size_;
input_sentence_size_ = from.input_sentence_size_;
}
if (cached_has_bits & 0x00000800u) {
training_sentence_size_ = from.training_sentence_size_;

View File

@ -274,8 +274,8 @@ class TrainerSpec PROTOBUF_FINAL :
kEosPieceFieldNumber = 47,
kPadPieceFieldNumber = 48,
kSelfTestSampleSizeFieldNumber = 6,
kInputSentenceSizeFieldNumber = 11,
kMiningSentenceSizeFieldNumber = 12,
kInputSentenceSizeFieldNumber = 11,
kTrainingSentenceSizeFieldNumber = 13,
kTreatWhitespaceAsSuffixFieldNumber = 24,
kSplitDigitsFieldNumber = 25,
@ -571,19 +571,6 @@ class TrainerSpec PROTOBUF_FINAL :
void _internal_set_self_test_sample_size(::PROTOBUF_NAMESPACE_ID::int32 value);
public:
// optional int32 input_sentence_size = 11 [default = 0];
bool has_input_sentence_size() const;
private:
bool _internal_has_input_sentence_size() const;
public:
void clear_input_sentence_size();
::PROTOBUF_NAMESPACE_ID::int32 input_sentence_size() const;
void set_input_sentence_size(::PROTOBUF_NAMESPACE_ID::int32 value);
private:
::PROTOBUF_NAMESPACE_ID::int32 _internal_input_sentence_size() const;
void _internal_set_input_sentence_size(::PROTOBUF_NAMESPACE_ID::int32 value);
public:
// optional int32 mining_sentence_size = 12 [deprecated = true];
PROTOBUF_DEPRECATED bool has_mining_sentence_size() const;
private:
@ -597,6 +584,19 @@ class TrainerSpec PROTOBUF_FINAL :
void _internal_set_mining_sentence_size(::PROTOBUF_NAMESPACE_ID::int32 value);
public:
// optional uint64 input_sentence_size = 11 [default = 0];
bool has_input_sentence_size() const;
private:
bool _internal_has_input_sentence_size() const;
public:
void clear_input_sentence_size();
::PROTOBUF_NAMESPACE_ID::uint64 input_sentence_size() const;
void set_input_sentence_size(::PROTOBUF_NAMESPACE_ID::uint64 value);
private:
::PROTOBUF_NAMESPACE_ID::uint64 _internal_input_sentence_size() const;
void _internal_set_input_sentence_size(::PROTOBUF_NAMESPACE_ID::uint64 value);
public:
// optional int32 training_sentence_size = 13 [deprecated = true];
PROTOBUF_DEPRECATED bool has_training_sentence_size() const;
private:
@ -952,8 +952,8 @@ class TrainerSpec PROTOBUF_FINAL :
static const ::PROTOBUF_NAMESPACE_ID::internal::LazyString _i_give_permission_to_break_this_code_default_pad_piece_;
::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr pad_piece_;
::PROTOBUF_NAMESPACE_ID::int32 self_test_sample_size_;
::PROTOBUF_NAMESPACE_ID::int32 input_sentence_size_;
::PROTOBUF_NAMESPACE_ID::int32 mining_sentence_size_;
::PROTOBUF_NAMESPACE_ID::uint64 input_sentence_size_;
::PROTOBUF_NAMESPACE_ID::int32 training_sentence_size_;
bool treat_whitespace_as_suffix_;
bool split_digits_;
@ -2365,30 +2365,30 @@ inline void TrainerSpec::set_character_coverage(float value) {
// @@protoc_insertion_point(field_set:sentencepiece.TrainerSpec.character_coverage)
}
// optional int32 input_sentence_size = 11 [default = 0];
// optional uint64 input_sentence_size = 11 [default = 0];
inline bool TrainerSpec::_internal_has_input_sentence_size() const {
bool value = (_has_bits_[0] & 0x00000200u) != 0;
bool value = (_has_bits_[0] & 0x00000400u) != 0;
return value;
}
inline bool TrainerSpec::has_input_sentence_size() const {
return _internal_has_input_sentence_size();
}
inline void TrainerSpec::clear_input_sentence_size() {
input_sentence_size_ = 0;
_has_bits_[0] &= ~0x00000200u;
input_sentence_size_ = PROTOBUF_ULONGLONG(0);
_has_bits_[0] &= ~0x00000400u;
}
inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::_internal_input_sentence_size() const {
inline ::PROTOBUF_NAMESPACE_ID::uint64 TrainerSpec::_internal_input_sentence_size() const {
return input_sentence_size_;
}
inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::input_sentence_size() const {
inline ::PROTOBUF_NAMESPACE_ID::uint64 TrainerSpec::input_sentence_size() const {
// @@protoc_insertion_point(field_get:sentencepiece.TrainerSpec.input_sentence_size)
return _internal_input_sentence_size();
}
inline void TrainerSpec::_internal_set_input_sentence_size(::PROTOBUF_NAMESPACE_ID::int32 value) {
_has_bits_[0] |= 0x00000200u;
inline void TrainerSpec::_internal_set_input_sentence_size(::PROTOBUF_NAMESPACE_ID::uint64 value) {
_has_bits_[0] |= 0x00000400u;
input_sentence_size_ = value;
}
inline void TrainerSpec::set_input_sentence_size(::PROTOBUF_NAMESPACE_ID::int32 value) {
inline void TrainerSpec::set_input_sentence_size(::PROTOBUF_NAMESPACE_ID::uint64 value) {
_internal_set_input_sentence_size(value);
// @@protoc_insertion_point(field_set:sentencepiece.TrainerSpec.input_sentence_size)
}
@ -2423,7 +2423,7 @@ inline void TrainerSpec::set_shuffle_input_sentence(bool value) {
// optional int32 mining_sentence_size = 12 [deprecated = true];
inline bool TrainerSpec::_internal_has_mining_sentence_size() const {
bool value = (_has_bits_[0] & 0x00000400u) != 0;
bool value = (_has_bits_[0] & 0x00000200u) != 0;
return value;
}
inline bool TrainerSpec::has_mining_sentence_size() const {
@ -2431,7 +2431,7 @@ inline bool TrainerSpec::has_mining_sentence_size() const {
}
inline void TrainerSpec::clear_mining_sentence_size() {
mining_sentence_size_ = 0;
_has_bits_[0] &= ~0x00000400u;
_has_bits_[0] &= ~0x00000200u;
}
inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::_internal_mining_sentence_size() const {
return mining_sentence_size_;
@ -2441,7 +2441,7 @@ inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::mining_sentence_size() const
return _internal_mining_sentence_size();
}
inline void TrainerSpec::_internal_set_mining_sentence_size(::PROTOBUF_NAMESPACE_ID::int32 value) {
_has_bits_[0] |= 0x00000400u;
_has_bits_[0] |= 0x00000200u;
mining_sentence_size_ = value;
}
inline void TrainerSpec::set_mining_sentence_size(::PROTOBUF_NAMESPACE_ID::int32 value) {

View File

@ -19,8 +19,8 @@
ABSL_FLAG(int32, int32_f, 10, "int32_flags");
ABSL_FLAG(bool, bool_f, false, "bool_flags");
ABSL_FLAG(int64, int64_f, 20, "int64_flags");
ABSL_FLAG(uint64, uint64_f, 30, "uint64_flags");
ABSL_FLAG(int64, int64_f, 9223372036854775807LL, "int64_flags");
ABSL_FLAG(uint64, uint64_f, 18446744073709551615ULL, "uint64_flags");
ABSL_FLAG(double, double_f, 40.0, "double_flags");
ABSL_FLAG(std::string, string_f, "str", "string_flags");
@ -33,8 +33,8 @@ namespace absl {
TEST(FlagsTest, DefaultValueTest) {
EXPECT_EQ(10, absl::GetFlag(FLAGS_int32_f));
EXPECT_EQ(false, absl::GetFlag(FLAGS_bool_f));
EXPECT_EQ(20, absl::GetFlag(FLAGS_int64_f));
EXPECT_EQ(30, absl::GetFlag(FLAGS_uint64_f));
EXPECT_EQ(9223372036854775807LL, absl::GetFlag(FLAGS_int64_f));
EXPECT_EQ(18446744073709551615ULL, absl::GetFlag(FLAGS_uint64_f));
EXPECT_EQ(40.0, absl::GetFlag(FLAGS_double_f));
EXPECT_EQ("str", absl::GetFlag(FLAGS_string_f));
}

View File

@ -74,7 +74,7 @@ message TrainerSpec {
// Maximum number of sentences the trainer loads from the `input` parameter.
// The trainer simply loads the `input` files in sequence.
// It is better to shuffle the input corpus randomly.
optional int32 input_sentence_size = 11 [default = 0];
optional uint64 input_sentence_size = 11 [default = 0];
optional bool shuffle_input_sentence = 19 [default = true];
// Maximum size of sentences to make seed sentence pieces.

View File

@ -47,7 +47,8 @@ ABSL_FLAG(int32, self_test_sample_size,
"the size of self test samples");
ABSL_FLAG(double, character_coverage, kDefaultTrainerSpec.character_coverage(),
"character coverage to determine the minimum symbols");
ABSL_FLAG(int32, input_sentence_size, kDefaultTrainerSpec.input_sentence_size(),
ABSL_FLAG(std::uint64_t, input_sentence_size,
kDefaultTrainerSpec.input_sentence_size(),
"maximum size of sentences the trainer loads");
ABSL_FLAG(bool, shuffle_input_sentence,
kDefaultTrainerSpec.shuffle_input_sentence(),

View File

@ -12,6 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "trainer_interface.h"
#include <cstdlib>
#include <memory>
#include <set>
@ -32,7 +34,6 @@
#include "third_party/absl/strings/str_format.h"
#include "third_party/absl/strings/str_join.h"
#include "third_party/absl/strings/str_split.h"
#include "trainer_interface.h"
#include "unicode_script.h"
#include "util.h"
@ -120,16 +121,14 @@ class SentenceSelector {
}
bool Add(const std::pair<std::string, int64> &sentence) {
if (spec_->input_sentence_size() <= 0) {
if (spec_->input_sentence_size() == 0) {
sentences_->emplace_back(sentence);
} else {
if (spec_->shuffle_input_sentence()) {
sampler_->Add(sentence);
} else {
sentences_->emplace_back(sentence);
if (sentences_->size() >=
static_cast<size_t>(spec_->input_sentence_size()))
return false;
if (sentences_->size() >= spec_->input_sentence_size()) return false;
}
}

View File

@ -307,9 +307,9 @@ std::mt19937 *GetRandomGenerator();
template <typename T>
class ReservoirSampler {
public:
explicit ReservoirSampler(std::vector<T> *sampled, size_t size)
explicit ReservoirSampler(std::vector<T> *sampled, uint64 size)
: sampled_(sampled), size_(size), engine_(GetRandomGeneratorSeed()) {}
explicit ReservoirSampler(std::vector<T> *sampled, size_t size, size_t seed)
explicit ReservoirSampler(std::vector<T> *sampled, uint64 size, uint64 seed)
: sampled_(sampled), size_(size), engine_(seed) {}
virtual ~ReservoirSampler() {}
@ -320,18 +320,18 @@ class ReservoirSampler {
if (sampled_->size() < size_) {
sampled_->push_back(item);
} else {
std::uniform_int_distribution<size_t> dist(0, total_ - 1);
const size_t n = dist(engine_);
std::uniform_int_distribution<uint64> dist(0, total_ - 1);
const uint64 n = dist(engine_);
if (n < sampled_->size()) (*sampled_)[n] = item;
}
}
size_t total_size() const { return total_; }
uint64 total_size() const { return total_; }
private:
std::vector<T> *sampled_ = nullptr;
size_t size_ = 0;
size_t total_ = 0;
uint64 size_ = 0;
uint64 total_ = 0;
std::mt19937 engine_;
};