mirror of
https://github.com/google/sentencepiece.git
synced 2024-09-19 06:40:00 +03:00
change the type of input_sentence_size from int32 to uint64
This commit is contained in:
parent
da6f3a893c
commit
8eaa672a37
@ -19,7 +19,7 @@ DESCRIPTOR = _descriptor.FileDescriptor(
|
||||
syntax='proto2',
|
||||
serialized_options=b'H\003',
|
||||
create_key=_descriptor._internal_create_key,
|
||||
serialized_pb=b'\n\x19sentencepiece_model.proto\x12\rsentencepiece\"\xa1\n\n\x0bTrainerSpec\x12\r\n\x05input\x18\x01 \x03(\t\x12\x14\n\x0cinput_format\x18\x07 \x01(\t\x12\x14\n\x0cmodel_prefix\x18\x02 \x01(\t\x12\x41\n\nmodel_type\x18\x03 \x01(\x0e\x32$.sentencepiece.TrainerSpec.ModelType:\x07UNIGRAM\x12\x18\n\nvocab_size\x18\x04 \x01(\x05:\x04\x38\x30\x30\x30\x12\x17\n\x0f\x61\x63\x63\x65pt_language\x18\x05 \x03(\t\x12 \n\x15self_test_sample_size\x18\x06 \x01(\x05:\x01\x30\x12\"\n\x12\x63haracter_coverage\x18\n \x01(\x02:\x06\x30.9995\x12\x1e\n\x13input_sentence_size\x18\x0b \x01(\x05:\x01\x30\x12$\n\x16shuffle_input_sentence\x18\x13 \x01(\x08:\x04true\x12 \n\x14mining_sentence_size\x18\x0c \x01(\x05\x42\x02\x18\x01\x12\"\n\x16training_sentence_size\x18\r \x01(\x05\x42\x02\x18\x01\x12(\n\x17seed_sentencepiece_size\x18\x0e \x01(\x05:\x07\x31\x30\x30\x30\x30\x30\x30\x12\x1e\n\x10shrinking_factor\x18\x0f \x01(\x02:\x04\x30.75\x12!\n\x13max_sentence_length\x18\x12 \x01(\x05:\x04\x34\x31\x39\x32\x12\x17\n\x0bnum_threads\x18\x10 \x01(\x05:\x02\x31\x36\x12\x1d\n\x12num_sub_iterations\x18\x11 \x01(\x05:\x01\x32\x12$\n\x18max_sentencepiece_length\x18\x14 \x01(\x05:\x02\x31\x36\x12%\n\x17split_by_unicode_script\x18\x15 \x01(\x08:\x04true\x12\x1d\n\x0fsplit_by_number\x18\x17 \x01(\x08:\x04true\x12!\n\x13split_by_whitespace\x18\x16 \x01(\x08:\x04true\x12)\n\x1atreat_whitespace_as_suffix\x18\x18 \x01(\x08:\x05\x66\x61lse\x12\x1b\n\x0csplit_digits\x18\x19 \x01(\x08:\x05\x66\x61lse\x12\x17\n\x0f\x63ontrol_symbols\x18\x1e \x03(\t\x12\x1c\n\x14user_defined_symbols\x18\x1f \x03(\t\x12\x16\n\x0erequired_chars\x18$ \x01(\t\x12\x1c\n\rbyte_fallback\x18# \x01(\x08:\x05\x66\x61lse\x12+\n\x1dvocabulary_output_piece_score\x18 \x01(\x08:\x04true\x12\x1e\n\x10hard_vocab_limit\x18! \x01(\x08:\x04true\x12\x1c\n\ruse_all_vocab\x18\" \x01(\x08:\x05\x66\x61lse\x12\x11\n\x06unk_id\x18( \x01(\x05:\x01\x30\x12\x11\n\x06\x62os_id\x18) \x01(\x05:\x01\x31\x12\x11\n\x06\x65os_id\x18* \x01(\x05:\x01\x32\x12\x12\n\x06pad_id\x18+ \x01(\x05:\x02-1\x12\x18\n\tunk_piece\x18- \x01(\t:\x05<unk>\x12\x16\n\tbos_piece\x18. \x01(\t:\x03<s>\x12\x17\n\teos_piece\x18/ \x01(\t:\x04</s>\x12\x18\n\tpad_piece\x18\x30 \x01(\t:\x05<pad>\x12\x1a\n\x0bunk_surface\x18, \x01(\t:\x05 \xe2\x81\x87 \x12+\n\x1ctrain_extremely_large_corpus\x18\x31 \x01(\x08:\x05\x66\x61lse\"5\n\tModelType\x12\x0b\n\x07UNIGRAM\x10\x01\x12\x07\n\x03\x42PE\x10\x02\x12\x08\n\x04WORD\x10\x03\x12\x08\n\x04\x43HAR\x10\x04*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\"\xd1\x01\n\x0eNormalizerSpec\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x1c\n\x14precompiled_charsmap\x18\x02 \x01(\x0c\x12\x1e\n\x10\x61\x64\x64_dummy_prefix\x18\x03 \x01(\x08:\x04true\x12&\n\x18remove_extra_whitespaces\x18\x04 \x01(\x08:\x04true\x12 \n\x12\x65scape_whitespaces\x18\x05 \x01(\x08:\x04true\x12\x1e\n\x16normalization_rule_tsv\x18\x06 \x01(\t*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\"y\n\x0cSelfTestData\x12\x33\n\x07samples\x18\x01 \x03(\x0b\x32\".sentencepiece.SelfTestData.Sample\x1a)\n\x06Sample\x12\r\n\x05input\x18\x01 \x01(\t\x12\x10\n\x08\x65xpected\x18\x02 \x01(\t*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\"\xfe\x03\n\nModelProto\x12\x37\n\x06pieces\x18\x01 \x03(\x0b\x32\'.sentencepiece.ModelProto.SentencePiece\x12\x30\n\x0ctrainer_spec\x18\x02 \x01(\x0b\x32\x1a.sentencepiece.TrainerSpec\x12\x36\n\x0fnormalizer_spec\x18\x03 \x01(\x0b\x32\x1d.sentencepiece.NormalizerSpec\x12\x33\n\x0eself_test_data\x18\x04 \x01(\x0b\x32\x1b.sentencepiece.SelfTestData\x12\x38\n\x11\x64\x65normalizer_spec\x18\x05 \x01(\x0b\x32\x1d.sentencepiece.NormalizerSpec\x1a\xd2\x01\n\rSentencePiece\x12\r\n\x05piece\x18\x01 \x01(\t\x12\r\n\x05score\x18\x02 \x01(\x02\x12\x42\n\x04type\x18\x03 \x01(\x0e\x32,.sentencepiece.ModelProto.SentencePiece.Type:\x06NORMAL\"T\n\x04Type\x12\n\n\x06NORMAL\x10\x01\x12\x0b\n\x07UNKNOWN\x10\x02\x12\x0b\n\x07\x43ONTROL\x10\x03\x12\x10\n\x0cUSER_DEFINED\x10\x04\x12\x08\n\x04\x42YTE\x10\x06\x12\n\n\x06UNUSED\x10\x05*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\x42\x02H\x03'
|
||||
serialized_pb=b'\n\x19sentencepiece_model.proto\x12\rsentencepiece\"\xa1\n\n\x0bTrainerSpec\x12\r\n\x05input\x18\x01 \x03(\t\x12\x14\n\x0cinput_format\x18\x07 \x01(\t\x12\x14\n\x0cmodel_prefix\x18\x02 \x01(\t\x12\x41\n\nmodel_type\x18\x03 \x01(\x0e\x32$.sentencepiece.TrainerSpec.ModelType:\x07UNIGRAM\x12\x18\n\nvocab_size\x18\x04 \x01(\x05:\x04\x38\x30\x30\x30\x12\x17\n\x0f\x61\x63\x63\x65pt_language\x18\x05 \x03(\t\x12 \n\x15self_test_sample_size\x18\x06 \x01(\x05:\x01\x30\x12\"\n\x12\x63haracter_coverage\x18\n \x01(\x02:\x06\x30.9995\x12\x1e\n\x13input_sentence_size\x18\x0b \x01(\x04:\x01\x30\x12$\n\x16shuffle_input_sentence\x18\x13 \x01(\x08:\x04true\x12 \n\x14mining_sentence_size\x18\x0c \x01(\x05\x42\x02\x18\x01\x12\"\n\x16training_sentence_size\x18\r \x01(\x05\x42\x02\x18\x01\x12(\n\x17seed_sentencepiece_size\x18\x0e \x01(\x05:\x07\x31\x30\x30\x30\x30\x30\x30\x12\x1e\n\x10shrinking_factor\x18\x0f \x01(\x02:\x04\x30.75\x12!\n\x13max_sentence_length\x18\x12 \x01(\x05:\x04\x34\x31\x39\x32\x12\x17\n\x0bnum_threads\x18\x10 \x01(\x05:\x02\x31\x36\x12\x1d\n\x12num_sub_iterations\x18\x11 \x01(\x05:\x01\x32\x12$\n\x18max_sentencepiece_length\x18\x14 \x01(\x05:\x02\x31\x36\x12%\n\x17split_by_unicode_script\x18\x15 \x01(\x08:\x04true\x12\x1d\n\x0fsplit_by_number\x18\x17 \x01(\x08:\x04true\x12!\n\x13split_by_whitespace\x18\x16 \x01(\x08:\x04true\x12)\n\x1atreat_whitespace_as_suffix\x18\x18 \x01(\x08:\x05\x66\x61lse\x12\x1b\n\x0csplit_digits\x18\x19 \x01(\x08:\x05\x66\x61lse\x12\x17\n\x0f\x63ontrol_symbols\x18\x1e \x03(\t\x12\x1c\n\x14user_defined_symbols\x18\x1f \x03(\t\x12\x16\n\x0erequired_chars\x18$ \x01(\t\x12\x1c\n\rbyte_fallback\x18# \x01(\x08:\x05\x66\x61lse\x12+\n\x1dvocabulary_output_piece_score\x18 \x01(\x08:\x04true\x12\x1e\n\x10hard_vocab_limit\x18! \x01(\x08:\x04true\x12\x1c\n\ruse_all_vocab\x18\" \x01(\x08:\x05\x66\x61lse\x12\x11\n\x06unk_id\x18( \x01(\x05:\x01\x30\x12\x11\n\x06\x62os_id\x18) \x01(\x05:\x01\x31\x12\x11\n\x06\x65os_id\x18* \x01(\x05:\x01\x32\x12\x12\n\x06pad_id\x18+ \x01(\x05:\x02-1\x12\x18\n\tunk_piece\x18- \x01(\t:\x05<unk>\x12\x16\n\tbos_piece\x18. \x01(\t:\x03<s>\x12\x17\n\teos_piece\x18/ \x01(\t:\x04</s>\x12\x18\n\tpad_piece\x18\x30 \x01(\t:\x05<pad>\x12\x1a\n\x0bunk_surface\x18, \x01(\t:\x05 \xe2\x81\x87 \x12+\n\x1ctrain_extremely_large_corpus\x18\x31 \x01(\x08:\x05\x66\x61lse\"5\n\tModelType\x12\x0b\n\x07UNIGRAM\x10\x01\x12\x07\n\x03\x42PE\x10\x02\x12\x08\n\x04WORD\x10\x03\x12\x08\n\x04\x43HAR\x10\x04*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\"\xd1\x01\n\x0eNormalizerSpec\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x1c\n\x14precompiled_charsmap\x18\x02 \x01(\x0c\x12\x1e\n\x10\x61\x64\x64_dummy_prefix\x18\x03 \x01(\x08:\x04true\x12&\n\x18remove_extra_whitespaces\x18\x04 \x01(\x08:\x04true\x12 \n\x12\x65scape_whitespaces\x18\x05 \x01(\x08:\x04true\x12\x1e\n\x16normalization_rule_tsv\x18\x06 \x01(\t*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\"y\n\x0cSelfTestData\x12\x33\n\x07samples\x18\x01 \x03(\x0b\x32\".sentencepiece.SelfTestData.Sample\x1a)\n\x06Sample\x12\r\n\x05input\x18\x01 \x01(\t\x12\x10\n\x08\x65xpected\x18\x02 \x01(\t*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\"\xfe\x03\n\nModelProto\x12\x37\n\x06pieces\x18\x01 \x03(\x0b\x32\'.sentencepiece.ModelProto.SentencePiece\x12\x30\n\x0ctrainer_spec\x18\x02 \x01(\x0b\x32\x1a.sentencepiece.TrainerSpec\x12\x36\n\x0fnormalizer_spec\x18\x03 \x01(\x0b\x32\x1d.sentencepiece.NormalizerSpec\x12\x33\n\x0eself_test_data\x18\x04 \x01(\x0b\x32\x1b.sentencepiece.SelfTestData\x12\x38\n\x11\x64\x65normalizer_spec\x18\x05 \x01(\x0b\x32\x1d.sentencepiece.NormalizerSpec\x1a\xd2\x01\n\rSentencePiece\x12\r\n\x05piece\x18\x01 \x01(\t\x12\r\n\x05score\x18\x02 \x01(\x02\x12\x42\n\x04type\x18\x03 \x01(\x0e\x32,.sentencepiece.ModelProto.SentencePiece.Type:\x06NORMAL\"T\n\x04Type\x12\n\n\x06NORMAL\x10\x01\x12\x0b\n\x07UNKNOWN\x10\x02\x12\x0b\n\x07\x43ONTROL\x10\x03\x12\x10\n\x0cUSER_DEFINED\x10\x04\x12\x08\n\x04\x42YTE\x10\x06\x12\n\n\x06UNUSED\x10\x05*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\x42\x02H\x03'
|
||||
)
|
||||
|
||||
|
||||
@ -171,7 +171,7 @@ _TRAINERSPEC = _descriptor.Descriptor(
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='input_sentence_size', full_name='sentencepiece.TrainerSpec.input_sentence_size', index=8,
|
||||
number=11, type=5, cpp_type=1, label=1,
|
||||
number=11, type=4, cpp_type=4, label=1,
|
||||
has_default_value=True, default_value=0,
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
|
@ -297,13 +297,13 @@ class TrainerSpec::_Internal {
|
||||
(*has_bits)[0] |= 1048576u;
|
||||
}
|
||||
static void set_has_input_sentence_size(HasBits* has_bits) {
|
||||
(*has_bits)[0] |= 512u;
|
||||
(*has_bits)[0] |= 1024u;
|
||||
}
|
||||
static void set_has_shuffle_input_sentence(HasBits* has_bits) {
|
||||
(*has_bits)[0] |= 134217728u;
|
||||
}
|
||||
static void set_has_mining_sentence_size(HasBits* has_bits) {
|
||||
(*has_bits)[0] |= 1024u;
|
||||
(*has_bits)[0] |= 512u;
|
||||
}
|
||||
static void set_has_training_sentence_size(HasBits* has_bits) {
|
||||
(*has_bits)[0] |= 2048u;
|
||||
@ -686,7 +686,7 @@ const char* TrainerSpec::_InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID
|
||||
ptr += sizeof(float);
|
||||
} else goto handle_unusual;
|
||||
continue;
|
||||
// optional int32 input_sentence_size = 11 [default = 0];
|
||||
// optional uint64 input_sentence_size = 11 [default = 0];
|
||||
case 11:
|
||||
if (PROTOBUF_PREDICT_TRUE(static_cast<::PROTOBUF_NAMESPACE_ID::uint8>(tag) == 88)) {
|
||||
_Internal::set_has_input_sentence_size(&_has_bits_);
|
||||
@ -1036,14 +1036,14 @@ failure:
|
||||
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteFloatToArray(10, this->_internal_character_coverage(), target);
|
||||
}
|
||||
|
||||
// optional int32 input_sentence_size = 11 [default = 0];
|
||||
if (cached_has_bits & 0x00000200u) {
|
||||
// optional uint64 input_sentence_size = 11 [default = 0];
|
||||
if (cached_has_bits & 0x00000400u) {
|
||||
target = stream->EnsureSpace(target);
|
||||
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteInt32ToArray(11, this->_internal_input_sentence_size(), target);
|
||||
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteUInt64ToArray(11, this->_internal_input_sentence_size(), target);
|
||||
}
|
||||
|
||||
// optional int32 mining_sentence_size = 12 [deprecated = true];
|
||||
if (cached_has_bits & 0x00000400u) {
|
||||
if (cached_has_bits & 0x00000200u) {
|
||||
target = stream->EnsureSpace(target);
|
||||
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteInt32ToArray(12, this->_internal_mining_sentence_size(), target);
|
||||
}
|
||||
@ -1353,18 +1353,18 @@ size_t TrainerSpec::ByteSizeLong() const {
|
||||
this->_internal_self_test_sample_size());
|
||||
}
|
||||
|
||||
// optional int32 input_sentence_size = 11 [default = 0];
|
||||
// optional int32 mining_sentence_size = 12 [deprecated = true];
|
||||
if (cached_has_bits & 0x00000200u) {
|
||||
total_size += 1 +
|
||||
::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::Int32Size(
|
||||
this->_internal_input_sentence_size());
|
||||
this->_internal_mining_sentence_size());
|
||||
}
|
||||
|
||||
// optional int32 mining_sentence_size = 12 [deprecated = true];
|
||||
// optional uint64 input_sentence_size = 11 [default = 0];
|
||||
if (cached_has_bits & 0x00000400u) {
|
||||
total_size += 1 +
|
||||
::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::Int32Size(
|
||||
this->_internal_mining_sentence_size());
|
||||
::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::UInt64Size(
|
||||
this->_internal_input_sentence_size());
|
||||
}
|
||||
|
||||
// optional int32 training_sentence_size = 13 [deprecated = true];
|
||||
@ -1581,10 +1581,10 @@ void TrainerSpec::MergeFrom(const TrainerSpec& from) {
|
||||
self_test_sample_size_ = from.self_test_sample_size_;
|
||||
}
|
||||
if (cached_has_bits & 0x00000200u) {
|
||||
input_sentence_size_ = from.input_sentence_size_;
|
||||
mining_sentence_size_ = from.mining_sentence_size_;
|
||||
}
|
||||
if (cached_has_bits & 0x00000400u) {
|
||||
mining_sentence_size_ = from.mining_sentence_size_;
|
||||
input_sentence_size_ = from.input_sentence_size_;
|
||||
}
|
||||
if (cached_has_bits & 0x00000800u) {
|
||||
training_sentence_size_ = from.training_sentence_size_;
|
||||
|
@ -274,8 +274,8 @@ class TrainerSpec PROTOBUF_FINAL :
|
||||
kEosPieceFieldNumber = 47,
|
||||
kPadPieceFieldNumber = 48,
|
||||
kSelfTestSampleSizeFieldNumber = 6,
|
||||
kInputSentenceSizeFieldNumber = 11,
|
||||
kMiningSentenceSizeFieldNumber = 12,
|
||||
kInputSentenceSizeFieldNumber = 11,
|
||||
kTrainingSentenceSizeFieldNumber = 13,
|
||||
kTreatWhitespaceAsSuffixFieldNumber = 24,
|
||||
kSplitDigitsFieldNumber = 25,
|
||||
@ -571,19 +571,6 @@ class TrainerSpec PROTOBUF_FINAL :
|
||||
void _internal_set_self_test_sample_size(::PROTOBUF_NAMESPACE_ID::int32 value);
|
||||
public:
|
||||
|
||||
// optional int32 input_sentence_size = 11 [default = 0];
|
||||
bool has_input_sentence_size() const;
|
||||
private:
|
||||
bool _internal_has_input_sentence_size() const;
|
||||
public:
|
||||
void clear_input_sentence_size();
|
||||
::PROTOBUF_NAMESPACE_ID::int32 input_sentence_size() const;
|
||||
void set_input_sentence_size(::PROTOBUF_NAMESPACE_ID::int32 value);
|
||||
private:
|
||||
::PROTOBUF_NAMESPACE_ID::int32 _internal_input_sentence_size() const;
|
||||
void _internal_set_input_sentence_size(::PROTOBUF_NAMESPACE_ID::int32 value);
|
||||
public:
|
||||
|
||||
// optional int32 mining_sentence_size = 12 [deprecated = true];
|
||||
PROTOBUF_DEPRECATED bool has_mining_sentence_size() const;
|
||||
private:
|
||||
@ -597,6 +584,19 @@ class TrainerSpec PROTOBUF_FINAL :
|
||||
void _internal_set_mining_sentence_size(::PROTOBUF_NAMESPACE_ID::int32 value);
|
||||
public:
|
||||
|
||||
// optional uint64 input_sentence_size = 11 [default = 0];
|
||||
bool has_input_sentence_size() const;
|
||||
private:
|
||||
bool _internal_has_input_sentence_size() const;
|
||||
public:
|
||||
void clear_input_sentence_size();
|
||||
::PROTOBUF_NAMESPACE_ID::uint64 input_sentence_size() const;
|
||||
void set_input_sentence_size(::PROTOBUF_NAMESPACE_ID::uint64 value);
|
||||
private:
|
||||
::PROTOBUF_NAMESPACE_ID::uint64 _internal_input_sentence_size() const;
|
||||
void _internal_set_input_sentence_size(::PROTOBUF_NAMESPACE_ID::uint64 value);
|
||||
public:
|
||||
|
||||
// optional int32 training_sentence_size = 13 [deprecated = true];
|
||||
PROTOBUF_DEPRECATED bool has_training_sentence_size() const;
|
||||
private:
|
||||
@ -952,8 +952,8 @@ class TrainerSpec PROTOBUF_FINAL :
|
||||
static const ::PROTOBUF_NAMESPACE_ID::internal::LazyString _i_give_permission_to_break_this_code_default_pad_piece_;
|
||||
::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr pad_piece_;
|
||||
::PROTOBUF_NAMESPACE_ID::int32 self_test_sample_size_;
|
||||
::PROTOBUF_NAMESPACE_ID::int32 input_sentence_size_;
|
||||
::PROTOBUF_NAMESPACE_ID::int32 mining_sentence_size_;
|
||||
::PROTOBUF_NAMESPACE_ID::uint64 input_sentence_size_;
|
||||
::PROTOBUF_NAMESPACE_ID::int32 training_sentence_size_;
|
||||
bool treat_whitespace_as_suffix_;
|
||||
bool split_digits_;
|
||||
@ -2365,30 +2365,30 @@ inline void TrainerSpec::set_character_coverage(float value) {
|
||||
// @@protoc_insertion_point(field_set:sentencepiece.TrainerSpec.character_coverage)
|
||||
}
|
||||
|
||||
// optional int32 input_sentence_size = 11 [default = 0];
|
||||
// optional uint64 input_sentence_size = 11 [default = 0];
|
||||
inline bool TrainerSpec::_internal_has_input_sentence_size() const {
|
||||
bool value = (_has_bits_[0] & 0x00000200u) != 0;
|
||||
bool value = (_has_bits_[0] & 0x00000400u) != 0;
|
||||
return value;
|
||||
}
|
||||
inline bool TrainerSpec::has_input_sentence_size() const {
|
||||
return _internal_has_input_sentence_size();
|
||||
}
|
||||
inline void TrainerSpec::clear_input_sentence_size() {
|
||||
input_sentence_size_ = 0;
|
||||
_has_bits_[0] &= ~0x00000200u;
|
||||
input_sentence_size_ = PROTOBUF_ULONGLONG(0);
|
||||
_has_bits_[0] &= ~0x00000400u;
|
||||
}
|
||||
inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::_internal_input_sentence_size() const {
|
||||
inline ::PROTOBUF_NAMESPACE_ID::uint64 TrainerSpec::_internal_input_sentence_size() const {
|
||||
return input_sentence_size_;
|
||||
}
|
||||
inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::input_sentence_size() const {
|
||||
inline ::PROTOBUF_NAMESPACE_ID::uint64 TrainerSpec::input_sentence_size() const {
|
||||
// @@protoc_insertion_point(field_get:sentencepiece.TrainerSpec.input_sentence_size)
|
||||
return _internal_input_sentence_size();
|
||||
}
|
||||
inline void TrainerSpec::_internal_set_input_sentence_size(::PROTOBUF_NAMESPACE_ID::int32 value) {
|
||||
_has_bits_[0] |= 0x00000200u;
|
||||
inline void TrainerSpec::_internal_set_input_sentence_size(::PROTOBUF_NAMESPACE_ID::uint64 value) {
|
||||
_has_bits_[0] |= 0x00000400u;
|
||||
input_sentence_size_ = value;
|
||||
}
|
||||
inline void TrainerSpec::set_input_sentence_size(::PROTOBUF_NAMESPACE_ID::int32 value) {
|
||||
inline void TrainerSpec::set_input_sentence_size(::PROTOBUF_NAMESPACE_ID::uint64 value) {
|
||||
_internal_set_input_sentence_size(value);
|
||||
// @@protoc_insertion_point(field_set:sentencepiece.TrainerSpec.input_sentence_size)
|
||||
}
|
||||
@ -2423,7 +2423,7 @@ inline void TrainerSpec::set_shuffle_input_sentence(bool value) {
|
||||
|
||||
// optional int32 mining_sentence_size = 12 [deprecated = true];
|
||||
inline bool TrainerSpec::_internal_has_mining_sentence_size() const {
|
||||
bool value = (_has_bits_[0] & 0x00000400u) != 0;
|
||||
bool value = (_has_bits_[0] & 0x00000200u) != 0;
|
||||
return value;
|
||||
}
|
||||
inline bool TrainerSpec::has_mining_sentence_size() const {
|
||||
@ -2431,7 +2431,7 @@ inline bool TrainerSpec::has_mining_sentence_size() const {
|
||||
}
|
||||
inline void TrainerSpec::clear_mining_sentence_size() {
|
||||
mining_sentence_size_ = 0;
|
||||
_has_bits_[0] &= ~0x00000400u;
|
||||
_has_bits_[0] &= ~0x00000200u;
|
||||
}
|
||||
inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::_internal_mining_sentence_size() const {
|
||||
return mining_sentence_size_;
|
||||
@ -2441,7 +2441,7 @@ inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::mining_sentence_size() const
|
||||
return _internal_mining_sentence_size();
|
||||
}
|
||||
inline void TrainerSpec::_internal_set_mining_sentence_size(::PROTOBUF_NAMESPACE_ID::int32 value) {
|
||||
_has_bits_[0] |= 0x00000400u;
|
||||
_has_bits_[0] |= 0x00000200u;
|
||||
mining_sentence_size_ = value;
|
||||
}
|
||||
inline void TrainerSpec::set_mining_sentence_size(::PROTOBUF_NAMESPACE_ID::int32 value) {
|
||||
|
@ -19,8 +19,8 @@
|
||||
|
||||
ABSL_FLAG(int32, int32_f, 10, "int32_flags");
|
||||
ABSL_FLAG(bool, bool_f, false, "bool_flags");
|
||||
ABSL_FLAG(int64, int64_f, 20, "int64_flags");
|
||||
ABSL_FLAG(uint64, uint64_f, 30, "uint64_flags");
|
||||
ABSL_FLAG(int64, int64_f, 9223372036854775807LL, "int64_flags");
|
||||
ABSL_FLAG(uint64, uint64_f, 18446744073709551615ULL, "uint64_flags");
|
||||
ABSL_FLAG(double, double_f, 40.0, "double_flags");
|
||||
ABSL_FLAG(std::string, string_f, "str", "string_flags");
|
||||
|
||||
@ -33,8 +33,8 @@ namespace absl {
|
||||
TEST(FlagsTest, DefaultValueTest) {
|
||||
EXPECT_EQ(10, absl::GetFlag(FLAGS_int32_f));
|
||||
EXPECT_EQ(false, absl::GetFlag(FLAGS_bool_f));
|
||||
EXPECT_EQ(20, absl::GetFlag(FLAGS_int64_f));
|
||||
EXPECT_EQ(30, absl::GetFlag(FLAGS_uint64_f));
|
||||
EXPECT_EQ(9223372036854775807LL, absl::GetFlag(FLAGS_int64_f));
|
||||
EXPECT_EQ(18446744073709551615ULL, absl::GetFlag(FLAGS_uint64_f));
|
||||
EXPECT_EQ(40.0, absl::GetFlag(FLAGS_double_f));
|
||||
EXPECT_EQ("str", absl::GetFlag(FLAGS_string_f));
|
||||
}
|
||||
|
@ -74,7 +74,7 @@ message TrainerSpec {
|
||||
// Maximum size of sentences the trainer loads from `input` parameter.
|
||||
// Trainer simply loads the `input` files in sequence.
|
||||
// It is better to shuffle the input corpus randomly.
|
||||
optional int32 input_sentence_size = 11 [default = 0];
|
||||
optional uint64 input_sentence_size = 11 [default = 0];
|
||||
optional bool shuffle_input_sentence = 19 [default = true];
|
||||
|
||||
// Maximum size of sentences to make seed sentence pieces.
|
||||
|
@ -47,7 +47,8 @@ ABSL_FLAG(int32, self_test_sample_size,
|
||||
"the size of self test samples");
|
||||
ABSL_FLAG(double, character_coverage, kDefaultTrainerSpec.character_coverage(),
|
||||
"character coverage to determine the minimum symbols");
|
||||
ABSL_FLAG(int32, input_sentence_size, kDefaultTrainerSpec.input_sentence_size(),
|
||||
ABSL_FLAG(std::uint64_t, input_sentence_size,
|
||||
kDefaultTrainerSpec.input_sentence_size(),
|
||||
"maximum size of sentences the trainer loads");
|
||||
ABSL_FLAG(bool, shuffle_input_sentence,
|
||||
kDefaultTrainerSpec.shuffle_input_sentence(),
|
||||
|
@ -12,6 +12,8 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.!
|
||||
|
||||
#include "trainer_interface.h"
|
||||
|
||||
#include <cstdlib>
|
||||
#include <memory>
|
||||
#include <set>
|
||||
@ -32,7 +34,6 @@
|
||||
#include "third_party/absl/strings/str_format.h"
|
||||
#include "third_party/absl/strings/str_join.h"
|
||||
#include "third_party/absl/strings/str_split.h"
|
||||
#include "trainer_interface.h"
|
||||
#include "unicode_script.h"
|
||||
#include "util.h"
|
||||
|
||||
@ -120,16 +121,14 @@ class SentenceSelector {
|
||||
}
|
||||
|
||||
bool Add(const std::pair<std::string, int64> &sentence) {
|
||||
if (spec_->input_sentence_size() <= 0) {
|
||||
if (spec_->input_sentence_size() == 0) {
|
||||
sentences_->emplace_back(sentence);
|
||||
} else {
|
||||
if (spec_->shuffle_input_sentence()) {
|
||||
sampler_->Add(sentence);
|
||||
} else {
|
||||
sentences_->emplace_back(sentence);
|
||||
if (sentences_->size() >=
|
||||
static_cast<size_t>(spec_->input_sentence_size()))
|
||||
return false;
|
||||
if (sentences_->size() >= spec_->input_sentence_size()) return false;
|
||||
}
|
||||
}
|
||||
|
||||
|
14
src/util.h
14
src/util.h
@ -307,9 +307,9 @@ std::mt19937 *GetRandomGenerator();
|
||||
template <typename T>
|
||||
class ReservoirSampler {
|
||||
public:
|
||||
explicit ReservoirSampler(std::vector<T> *sampled, size_t size)
|
||||
explicit ReservoirSampler(std::vector<T> *sampled, uint64 size)
|
||||
: sampled_(sampled), size_(size), engine_(GetRandomGeneratorSeed()) {}
|
||||
explicit ReservoirSampler(std::vector<T> *sampled, size_t size, size_t seed)
|
||||
explicit ReservoirSampler(std::vector<T> *sampled, uint64 size, uint64 seed)
|
||||
: sampled_(sampled), size_(size), engine_(seed) {}
|
||||
virtual ~ReservoirSampler() {}
|
||||
|
||||
@ -320,18 +320,18 @@ class ReservoirSampler {
|
||||
if (sampled_->size() < size_) {
|
||||
sampled_->push_back(item);
|
||||
} else {
|
||||
std::uniform_int_distribution<size_t> dist(0, total_ - 1);
|
||||
const size_t n = dist(engine_);
|
||||
std::uniform_int_distribution<uint64> dist(0, total_ - 1);
|
||||
const uint64 n = dist(engine_);
|
||||
if (n < sampled_->size()) (*sampled_)[n] = item;
|
||||
}
|
||||
}
|
||||
|
||||
size_t total_size() const { return total_; }
|
||||
uint64 total_size() const { return total_; }
|
||||
|
||||
private:
|
||||
std::vector<T> *sampled_ = nullptr;
|
||||
size_t size_ = 0;
|
||||
size_t total_ = 0;
|
||||
uint64 size_ = 0;
|
||||
uint64 total_ = 0;
|
||||
std::mt19937 engine_;
|
||||
};
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user