mirror of
https://github.com/google/sentencepiece.git
synced 2025-01-03 22:36:34 +03:00
add pretokenization_delimiter options. Initialize seed pieces more accurately.
This commit is contained in:
parent
6c9fd791cf
commit
e58bb684d0
@ -19,7 +19,7 @@ DESCRIPTOR = _descriptor.FileDescriptor(
|
||||
syntax='proto2',
|
||||
serialized_options=b'H\003',
|
||||
create_key=_descriptor._internal_create_key,
|
||||
serialized_pb=b'\n\x19sentencepiece_model.proto\x12\rsentencepiece\"\xdb\x0b\n\x0bTrainerSpec\x12\r\n\x05input\x18\x01 \x03(\t\x12\x14\n\x0cinput_format\x18\x07 \x01(\t\x12\x14\n\x0cmodel_prefix\x18\x02 \x01(\t\x12\x41\n\nmodel_type\x18\x03 \x01(\x0e\x32$.sentencepiece.TrainerSpec.ModelType:\x07UNIGRAM\x12\x18\n\nvocab_size\x18\x04 \x01(\x05:\x04\x38\x30\x30\x30\x12\x17\n\x0f\x61\x63\x63\x65pt_language\x18\x05 \x03(\t\x12 \n\x15self_test_sample_size\x18\x06 \x01(\x05:\x01\x30\x12*\n\x1b\x65nable_differential_privacy\x18\x32 \x01(\x08:\x05\x66\x61lse\x12+\n differential_privacy_noise_level\x18\x33 \x01(\x02:\x01\x30\x12\x32\n\'differential_privacy_clipping_threshold\x18\x34 \x01(\x04:\x01\x30\x12\"\n\x12\x63haracter_coverage\x18\n \x01(\x02:\x06\x30.9995\x12\x1e\n\x13input_sentence_size\x18\x0b \x01(\x04:\x01\x30\x12$\n\x16shuffle_input_sentence\x18\x13 \x01(\x08:\x04true\x12 \n\x14mining_sentence_size\x18\x0c \x01(\x05\x42\x02\x18\x01\x12\"\n\x16training_sentence_size\x18\r \x01(\x05\x42\x02\x18\x01\x12(\n\x17seed_sentencepiece_size\x18\x0e \x01(\x05:\x07\x31\x30\x30\x30\x30\x30\x30\x12\x1e\n\x10shrinking_factor\x18\x0f \x01(\x02:\x04\x30.75\x12!\n\x13max_sentence_length\x18\x12 \x01(\x05:\x04\x34\x31\x39\x32\x12\x17\n\x0bnum_threads\x18\x10 \x01(\x05:\x02\x31\x36\x12\x1d\n\x12num_sub_iterations\x18\x11 \x01(\x05:\x01\x32\x12$\n\x18max_sentencepiece_length\x18\x14 \x01(\x05:\x02\x31\x36\x12%\n\x17split_by_unicode_script\x18\x15 \x01(\x08:\x04true\x12\x1d\n\x0fsplit_by_number\x18\x17 \x01(\x08:\x04true\x12!\n\x13split_by_whitespace\x18\x16 \x01(\x08:\x04true\x12)\n\x1atreat_whitespace_as_suffix\x18\x18 \x01(\x08:\x05\x66\x61lse\x12+\n\x1c\x61llow_whitespace_only_pieces\x18\x1a \x01(\x08:\x05\x66\x61lse\x12\x1b\n\x0csplit_digits\x18\x19 \x01(\x08:\x05\x66\x61lse\x12\x17\n\x0f\x63ontrol_symbols\x18\x1e \x03(\t\x12\x1c\n\x14user_defined_symbols\x18\x1f \x03(\t\x12\x16\n\x0erequired_chars\x18$ \x01(\t\x12\x1c\n\rbyte_fallback\x18# 
\x01(\x08:\x05\x66\x61lse\x12+\n\x1dvocabulary_output_piece_score\x18 \x01(\x08:\x04true\x12\x1e\n\x10hard_vocab_limit\x18! \x01(\x08:\x04true\x12\x1c\n\ruse_all_vocab\x18\" \x01(\x08:\x05\x66\x61lse\x12\x11\n\x06unk_id\x18( \x01(\x05:\x01\x30\x12\x11\n\x06\x62os_id\x18) \x01(\x05:\x01\x31\x12\x11\n\x06\x65os_id\x18* \x01(\x05:\x01\x32\x12\x12\n\x06pad_id\x18+ \x01(\x05:\x02-1\x12\x18\n\tunk_piece\x18- \x01(\t:\x05<unk>\x12\x16\n\tbos_piece\x18. \x01(\t:\x03<s>\x12\x17\n\teos_piece\x18/ \x01(\t:\x04</s>\x12\x18\n\tpad_piece\x18\x30 \x01(\t:\x05<pad>\x12\x1a\n\x0bunk_surface\x18, \x01(\t:\x05 \xe2\x81\x87 \x12+\n\x1ctrain_extremely_large_corpus\x18\x31 \x01(\x08:\x05\x66\x61lse\"5\n\tModelType\x12\x0b\n\x07UNIGRAM\x10\x01\x12\x07\n\x03\x42PE\x10\x02\x12\x08\n\x04WORD\x10\x03\x12\x08\n\x04\x43HAR\x10\x04*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\"\xd1\x01\n\x0eNormalizerSpec\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x1c\n\x14precompiled_charsmap\x18\x02 \x01(\x0c\x12\x1e\n\x10\x61\x64\x64_dummy_prefix\x18\x03 \x01(\x08:\x04true\x12&\n\x18remove_extra_whitespaces\x18\x04 \x01(\x08:\x04true\x12 \n\x12\x65scape_whitespaces\x18\x05 \x01(\x08:\x04true\x12\x1e\n\x16normalization_rule_tsv\x18\x06 \x01(\t*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\"y\n\x0cSelfTestData\x12\x33\n\x07samples\x18\x01 \x03(\x0b\x32\".sentencepiece.SelfTestData.Sample\x1a)\n\x06Sample\x12\r\n\x05input\x18\x01 \x01(\t\x12\x10\n\x08\x65xpected\x18\x02 \x01(\t*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\"\xfe\x03\n\nModelProto\x12\x37\n\x06pieces\x18\x01 \x03(\x0b\x32\'.sentencepiece.ModelProto.SentencePiece\x12\x30\n\x0ctrainer_spec\x18\x02 \x01(\x0b\x32\x1a.sentencepiece.TrainerSpec\x12\x36\n\x0fnormalizer_spec\x18\x03 \x01(\x0b\x32\x1d.sentencepiece.NormalizerSpec\x12\x33\n\x0eself_test_data\x18\x04 \x01(\x0b\x32\x1b.sentencepiece.SelfTestData\x12\x38\n\x11\x64\x65normalizer_spec\x18\x05 \x01(\x0b\x32\x1d.sentencepiece.NormalizerSpec\x1a\xd2\x01\n\rSentencePiece\x12\r\n\x05piece\x18\x01 
\x01(\t\x12\r\n\x05score\x18\x02 \x01(\x02\x12\x42\n\x04type\x18\x03 \x01(\x0e\x32,.sentencepiece.ModelProto.SentencePiece.Type:\x06NORMAL\"T\n\x04Type\x12\n\n\x06NORMAL\x10\x01\x12\x0b\n\x07UNKNOWN\x10\x02\x12\x0b\n\x07\x43ONTROL\x10\x03\x12\x10\n\x0cUSER_DEFINED\x10\x04\x12\x08\n\x04\x42YTE\x10\x06\x12\n\n\x06UNUSED\x10\x05*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\x42\x02H\x03'
|
||||
serialized_pb=b'\n\x19sentencepiece_model.proto\x12\rsentencepiece\"\x80\x0c\n\x0bTrainerSpec\x12\r\n\x05input\x18\x01 \x03(\t\x12\x14\n\x0cinput_format\x18\x07 \x01(\t\x12\x14\n\x0cmodel_prefix\x18\x02 \x01(\t\x12\x41\n\nmodel_type\x18\x03 \x01(\x0e\x32$.sentencepiece.TrainerSpec.ModelType:\x07UNIGRAM\x12\x18\n\nvocab_size\x18\x04 \x01(\x05:\x04\x38\x30\x30\x30\x12\x17\n\x0f\x61\x63\x63\x65pt_language\x18\x05 \x03(\t\x12 \n\x15self_test_sample_size\x18\x06 \x01(\x05:\x01\x30\x12*\n\x1b\x65nable_differential_privacy\x18\x32 \x01(\x08:\x05\x66\x61lse\x12+\n differential_privacy_noise_level\x18\x33 \x01(\x02:\x01\x30\x12\x32\n\'differential_privacy_clipping_threshold\x18\x34 \x01(\x04:\x01\x30\x12\"\n\x12\x63haracter_coverage\x18\n \x01(\x02:\x06\x30.9995\x12\x1e\n\x13input_sentence_size\x18\x0b \x01(\x04:\x01\x30\x12$\n\x16shuffle_input_sentence\x18\x13 \x01(\x08:\x04true\x12 \n\x14mining_sentence_size\x18\x0c \x01(\x05\x42\x02\x18\x01\x12\"\n\x16training_sentence_size\x18\r \x01(\x05\x42\x02\x18\x01\x12(\n\x17seed_sentencepiece_size\x18\x0e \x01(\x05:\x07\x31\x30\x30\x30\x30\x30\x30\x12\x1e\n\x10shrinking_factor\x18\x0f \x01(\x02:\x04\x30.75\x12!\n\x13max_sentence_length\x18\x12 \x01(\x05:\x04\x34\x31\x39\x32\x12\x17\n\x0bnum_threads\x18\x10 \x01(\x05:\x02\x31\x36\x12\x1d\n\x12num_sub_iterations\x18\x11 \x01(\x05:\x01\x32\x12$\n\x18max_sentencepiece_length\x18\x14 \x01(\x05:\x02\x31\x36\x12%\n\x17split_by_unicode_script\x18\x15 \x01(\x08:\x04true\x12\x1d\n\x0fsplit_by_number\x18\x17 \x01(\x08:\x04true\x12!\n\x13split_by_whitespace\x18\x16 \x01(\x08:\x04true\x12)\n\x1atreat_whitespace_as_suffix\x18\x18 \x01(\x08:\x05\x66\x61lse\x12+\n\x1c\x61llow_whitespace_only_pieces\x18\x1a \x01(\x08:\x05\x66\x61lse\x12\x1b\n\x0csplit_digits\x18\x19 \x01(\x08:\x05\x66\x61lse\x12#\n\x19pretokenization_delimiter\x18\x35 \x01(\t:\x00\x12\x17\n\x0f\x63ontrol_symbols\x18\x1e \x03(\t\x12\x1c\n\x14user_defined_symbols\x18\x1f \x03(\t\x12\x16\n\x0erequired_chars\x18$ 
\x01(\t\x12\x1c\n\rbyte_fallback\x18# \x01(\x08:\x05\x66\x61lse\x12+\n\x1dvocabulary_output_piece_score\x18 \x01(\x08:\x04true\x12\x1e\n\x10hard_vocab_limit\x18! \x01(\x08:\x04true\x12\x1c\n\ruse_all_vocab\x18\" \x01(\x08:\x05\x66\x61lse\x12\x11\n\x06unk_id\x18( \x01(\x05:\x01\x30\x12\x11\n\x06\x62os_id\x18) \x01(\x05:\x01\x31\x12\x11\n\x06\x65os_id\x18* \x01(\x05:\x01\x32\x12\x12\n\x06pad_id\x18+ \x01(\x05:\x02-1\x12\x18\n\tunk_piece\x18- \x01(\t:\x05<unk>\x12\x16\n\tbos_piece\x18. \x01(\t:\x03<s>\x12\x17\n\teos_piece\x18/ \x01(\t:\x04</s>\x12\x18\n\tpad_piece\x18\x30 \x01(\t:\x05<pad>\x12\x1a\n\x0bunk_surface\x18, \x01(\t:\x05 \xe2\x81\x87 \x12+\n\x1ctrain_extremely_large_corpus\x18\x31 \x01(\x08:\x05\x66\x61lse\"5\n\tModelType\x12\x0b\n\x07UNIGRAM\x10\x01\x12\x07\n\x03\x42PE\x10\x02\x12\x08\n\x04WORD\x10\x03\x12\x08\n\x04\x43HAR\x10\x04*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\"\xd1\x01\n\x0eNormalizerSpec\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x1c\n\x14precompiled_charsmap\x18\x02 \x01(\x0c\x12\x1e\n\x10\x61\x64\x64_dummy_prefix\x18\x03 \x01(\x08:\x04true\x12&\n\x18remove_extra_whitespaces\x18\x04 \x01(\x08:\x04true\x12 \n\x12\x65scape_whitespaces\x18\x05 \x01(\x08:\x04true\x12\x1e\n\x16normalization_rule_tsv\x18\x06 \x01(\t*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\"y\n\x0cSelfTestData\x12\x33\n\x07samples\x18\x01 \x03(\x0b\x32\".sentencepiece.SelfTestData.Sample\x1a)\n\x06Sample\x12\r\n\x05input\x18\x01 \x01(\t\x12\x10\n\x08\x65xpected\x18\x02 \x01(\t*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\"\xfe\x03\n\nModelProto\x12\x37\n\x06pieces\x18\x01 \x03(\x0b\x32\'.sentencepiece.ModelProto.SentencePiece\x12\x30\n\x0ctrainer_spec\x18\x02 \x01(\x0b\x32\x1a.sentencepiece.TrainerSpec\x12\x36\n\x0fnormalizer_spec\x18\x03 \x01(\x0b\x32\x1d.sentencepiece.NormalizerSpec\x12\x33\n\x0eself_test_data\x18\x04 \x01(\x0b\x32\x1b.sentencepiece.SelfTestData\x12\x38\n\x11\x64\x65normalizer_spec\x18\x05 
\x01(\x0b\x32\x1d.sentencepiece.NormalizerSpec\x1a\xd2\x01\n\rSentencePiece\x12\r\n\x05piece\x18\x01 \x01(\t\x12\r\n\x05score\x18\x02 \x01(\x02\x12\x42\n\x04type\x18\x03 \x01(\x0e\x32,.sentencepiece.ModelProto.SentencePiece.Type:\x06NORMAL\"T\n\x04Type\x12\n\n\x06NORMAL\x10\x01\x12\x0b\n\x07UNKNOWN\x10\x02\x12\x0b\n\x07\x43ONTROL\x10\x03\x12\x10\n\x0cUSER_DEFINED\x10\x04\x12\x08\n\x04\x42YTE\x10\x06\x12\n\n\x06UNUSED\x10\x05*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\x42\x02H\x03'
|
||||
)
|
||||
|
||||
|
||||
@ -54,8 +54,8 @@ _TRAINERSPEC_MODELTYPE = _descriptor.EnumDescriptor(
|
||||
],
|
||||
containing_type=None,
|
||||
serialized_options=None,
|
||||
serialized_start=1480,
|
||||
serialized_end=1533,
|
||||
serialized_start=1517,
|
||||
serialized_end=1570,
|
||||
)
|
||||
_sym_db.RegisterEnumDescriptor(_TRAINERSPEC_MODELTYPE)
|
||||
|
||||
@ -99,8 +99,8 @@ _MODELPROTO_SENTENCEPIECE_TYPE = _descriptor.EnumDescriptor(
|
||||
],
|
||||
containing_type=None,
|
||||
serialized_options=None,
|
||||
serialized_start=2286,
|
||||
serialized_end=2370,
|
||||
serialized_start=2323,
|
||||
serialized_end=2407,
|
||||
)
|
||||
_sym_db.RegisterEnumDescriptor(_MODELPROTO_SENTENCEPIECE_TYPE)
|
||||
|
||||
@ -303,119 +303,126 @@ _TRAINERSPEC = _descriptor.Descriptor(
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='control_symbols', full_name='sentencepiece.TrainerSpec.control_symbols', index=27,
|
||||
name='pretokenization_delimiter', full_name='sentencepiece.TrainerSpec.pretokenization_delimiter', index=27,
|
||||
number=53, type=9, cpp_type=9, label=1,
|
||||
has_default_value=True, default_value=b"".decode('utf-8'),
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='control_symbols', full_name='sentencepiece.TrainerSpec.control_symbols', index=28,
|
||||
number=30, type=9, cpp_type=9, label=3,
|
||||
has_default_value=False, default_value=[],
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='user_defined_symbols', full_name='sentencepiece.TrainerSpec.user_defined_symbols', index=28,
|
||||
name='user_defined_symbols', full_name='sentencepiece.TrainerSpec.user_defined_symbols', index=29,
|
||||
number=31, type=9, cpp_type=9, label=3,
|
||||
has_default_value=False, default_value=[],
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='required_chars', full_name='sentencepiece.TrainerSpec.required_chars', index=29,
|
||||
name='required_chars', full_name='sentencepiece.TrainerSpec.required_chars', index=30,
|
||||
number=36, type=9, cpp_type=9, label=1,
|
||||
has_default_value=False, default_value=b"".decode('utf-8'),
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='byte_fallback', full_name='sentencepiece.TrainerSpec.byte_fallback', index=30,
|
||||
name='byte_fallback', full_name='sentencepiece.TrainerSpec.byte_fallback', index=31,
|
||||
number=35, type=8, cpp_type=7, label=1,
|
||||
has_default_value=True, default_value=False,
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='vocabulary_output_piece_score', full_name='sentencepiece.TrainerSpec.vocabulary_output_piece_score', index=31,
|
||||
name='vocabulary_output_piece_score', full_name='sentencepiece.TrainerSpec.vocabulary_output_piece_score', index=32,
|
||||
number=32, type=8, cpp_type=7, label=1,
|
||||
has_default_value=True, default_value=True,
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='hard_vocab_limit', full_name='sentencepiece.TrainerSpec.hard_vocab_limit', index=32,
|
||||
name='hard_vocab_limit', full_name='sentencepiece.TrainerSpec.hard_vocab_limit', index=33,
|
||||
number=33, type=8, cpp_type=7, label=1,
|
||||
has_default_value=True, default_value=True,
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='use_all_vocab', full_name='sentencepiece.TrainerSpec.use_all_vocab', index=33,
|
||||
name='use_all_vocab', full_name='sentencepiece.TrainerSpec.use_all_vocab', index=34,
|
||||
number=34, type=8, cpp_type=7, label=1,
|
||||
has_default_value=True, default_value=False,
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='unk_id', full_name='sentencepiece.TrainerSpec.unk_id', index=34,
|
||||
name='unk_id', full_name='sentencepiece.TrainerSpec.unk_id', index=35,
|
||||
number=40, type=5, cpp_type=1, label=1,
|
||||
has_default_value=True, default_value=0,
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='bos_id', full_name='sentencepiece.TrainerSpec.bos_id', index=35,
|
||||
name='bos_id', full_name='sentencepiece.TrainerSpec.bos_id', index=36,
|
||||
number=41, type=5, cpp_type=1, label=1,
|
||||
has_default_value=True, default_value=1,
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='eos_id', full_name='sentencepiece.TrainerSpec.eos_id', index=36,
|
||||
name='eos_id', full_name='sentencepiece.TrainerSpec.eos_id', index=37,
|
||||
number=42, type=5, cpp_type=1, label=1,
|
||||
has_default_value=True, default_value=2,
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='pad_id', full_name='sentencepiece.TrainerSpec.pad_id', index=37,
|
||||
name='pad_id', full_name='sentencepiece.TrainerSpec.pad_id', index=38,
|
||||
number=43, type=5, cpp_type=1, label=1,
|
||||
has_default_value=True, default_value=-1,
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='unk_piece', full_name='sentencepiece.TrainerSpec.unk_piece', index=38,
|
||||
name='unk_piece', full_name='sentencepiece.TrainerSpec.unk_piece', index=39,
|
||||
number=45, type=9, cpp_type=9, label=1,
|
||||
has_default_value=True, default_value=b"<unk>".decode('utf-8'),
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='bos_piece', full_name='sentencepiece.TrainerSpec.bos_piece', index=39,
|
||||
name='bos_piece', full_name='sentencepiece.TrainerSpec.bos_piece', index=40,
|
||||
number=46, type=9, cpp_type=9, label=1,
|
||||
has_default_value=True, default_value=b"<s>".decode('utf-8'),
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='eos_piece', full_name='sentencepiece.TrainerSpec.eos_piece', index=40,
|
||||
name='eos_piece', full_name='sentencepiece.TrainerSpec.eos_piece', index=41,
|
||||
number=47, type=9, cpp_type=9, label=1,
|
||||
has_default_value=True, default_value=b"</s>".decode('utf-8'),
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='pad_piece', full_name='sentencepiece.TrainerSpec.pad_piece', index=41,
|
||||
name='pad_piece', full_name='sentencepiece.TrainerSpec.pad_piece', index=42,
|
||||
number=48, type=9, cpp_type=9, label=1,
|
||||
has_default_value=True, default_value=b"<pad>".decode('utf-8'),
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='unk_surface', full_name='sentencepiece.TrainerSpec.unk_surface', index=42,
|
||||
name='unk_surface', full_name='sentencepiece.TrainerSpec.unk_surface', index=43,
|
||||
number=44, type=9, cpp_type=9, label=1,
|
||||
has_default_value=True, default_value=b" \342\201\207 ".decode('utf-8'),
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='train_extremely_large_corpus', full_name='sentencepiece.TrainerSpec.train_extremely_large_corpus', index=43,
|
||||
name='train_extremely_large_corpus', full_name='sentencepiece.TrainerSpec.train_extremely_large_corpus', index=44,
|
||||
number=49, type=8, cpp_type=7, label=1,
|
||||
has_default_value=True, default_value=False,
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
@ -435,7 +442,7 @@ _TRAINERSPEC = _descriptor.Descriptor(
|
||||
oneofs=[
|
||||
],
|
||||
serialized_start=45,
|
||||
serialized_end=1544,
|
||||
serialized_end=1581,
|
||||
)
|
||||
|
||||
|
||||
@ -501,8 +508,8 @@ _NORMALIZERSPEC = _descriptor.Descriptor(
|
||||
extension_ranges=[(200, 536870912), ],
|
||||
oneofs=[
|
||||
],
|
||||
serialized_start=1547,
|
||||
serialized_end=1756,
|
||||
serialized_start=1584,
|
||||
serialized_end=1793,
|
||||
)
|
||||
|
||||
|
||||
@ -540,8 +547,8 @@ _SELFTESTDATA_SAMPLE = _descriptor.Descriptor(
|
||||
extension_ranges=[],
|
||||
oneofs=[
|
||||
],
|
||||
serialized_start=1827,
|
||||
serialized_end=1868,
|
||||
serialized_start=1864,
|
||||
serialized_end=1905,
|
||||
)
|
||||
|
||||
_SELFTESTDATA = _descriptor.Descriptor(
|
||||
@ -571,8 +578,8 @@ _SELFTESTDATA = _descriptor.Descriptor(
|
||||
extension_ranges=[(200, 536870912), ],
|
||||
oneofs=[
|
||||
],
|
||||
serialized_start=1758,
|
||||
serialized_end=1879,
|
||||
serialized_start=1795,
|
||||
serialized_end=1916,
|
||||
)
|
||||
|
||||
|
||||
@ -618,8 +625,8 @@ _MODELPROTO_SENTENCEPIECE = _descriptor.Descriptor(
|
||||
extension_ranges=[(200, 536870912), ],
|
||||
oneofs=[
|
||||
],
|
||||
serialized_start=2171,
|
||||
serialized_end=2381,
|
||||
serialized_start=2208,
|
||||
serialized_end=2418,
|
||||
)
|
||||
|
||||
_MODELPROTO = _descriptor.Descriptor(
|
||||
@ -677,8 +684,8 @@ _MODELPROTO = _descriptor.Descriptor(
|
||||
extension_ranges=[(200, 536870912), ],
|
||||
oneofs=[
|
||||
],
|
||||
serialized_start=1882,
|
||||
serialized_end=2392,
|
||||
serialized_start=1919,
|
||||
serialized_end=2429,
|
||||
)
|
||||
|
||||
_TRAINERSPEC.fields_by_name['model_type'].enum_type = _TRAINERSPEC_MODELTYPE
|
||||
|
@ -63,7 +63,7 @@ if (SPM_USE_BUILTIN_PROTOBUF)
|
||||
if (MSVC)
|
||||
add_definitions("/DHAVE_PTHREAD /wd4018 /wd4514")
|
||||
else()
|
||||
add_definitions("-pthread -DHAVE_PTHREAD=1 -Wno-sign-compare")
|
||||
add_definitions("-pthread -DHAVE_PTHREAD=1 -Wno-sign-compare -Wno-deprecated-declarations")
|
||||
endif()
|
||||
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../third_party/protobuf-lite)
|
||||
include_directories(builtin_pb)
|
||||
|
@ -285,101 +285,104 @@ class TrainerSpec::_Internal {
|
||||
(*has_bits)[0] |= 1u;
|
||||
}
|
||||
static void set_has_model_type(HasBits* has_bits) {
|
||||
(*has_bits)[0] |= 4194304u;
|
||||
}
|
||||
static void set_has_vocab_size(HasBits* has_bits) {
|
||||
(*has_bits)[0] |= 8388608u;
|
||||
}
|
||||
static void set_has_self_test_sample_size(HasBits* has_bits) {
|
||||
(*has_bits)[0] |= 256u;
|
||||
}
|
||||
static void set_has_enable_differential_privacy(HasBits* has_bits) {
|
||||
(*has_bits)[0] |= 4096u;
|
||||
}
|
||||
static void set_has_differential_privacy_noise_level(HasBits* has_bits) {
|
||||
(*has_bits)[0] |= 1048576u;
|
||||
}
|
||||
static void set_has_differential_privacy_clipping_threshold(HasBits* has_bits) {
|
||||
(*has_bits)[0] |= 2097152u;
|
||||
}
|
||||
static void set_has_character_coverage(HasBits* has_bits) {
|
||||
static void set_has_vocab_size(HasBits* has_bits) {
|
||||
(*has_bits)[0] |= 16777216u;
|
||||
}
|
||||
static void set_has_input_sentence_size(HasBits* has_bits) {
|
||||
(*has_bits)[0] |= 1024u;
|
||||
}
|
||||
static void set_has_shuffle_input_sentence(HasBits* has_bits) {
|
||||
(*has_bits)[0] |= 2147483648u;
|
||||
}
|
||||
static void set_has_mining_sentence_size(HasBits* has_bits) {
|
||||
static void set_has_self_test_sample_size(HasBits* has_bits) {
|
||||
(*has_bits)[0] |= 512u;
|
||||
}
|
||||
static void set_has_training_sentence_size(HasBits* has_bits) {
|
||||
(*has_bits)[0] |= 2048u;
|
||||
}
|
||||
static void set_has_seed_sentencepiece_size(HasBits* has_bits) {
|
||||
(*has_bits)[0] |= 33554432u;
|
||||
}
|
||||
static void set_has_shrinking_factor(HasBits* has_bits) {
|
||||
(*has_bits)[0] |= 67108864u;
|
||||
}
|
||||
static void set_has_max_sentence_length(HasBits* has_bits) {
|
||||
(*has_bits)[0] |= 536870912u;
|
||||
}
|
||||
static void set_has_num_threads(HasBits* has_bits) {
|
||||
(*has_bits)[0] |= 134217728u;
|
||||
}
|
||||
static void set_has_num_sub_iterations(HasBits* has_bits) {
|
||||
(*has_bits)[0] |= 268435456u;
|
||||
}
|
||||
static void set_has_max_sentencepiece_length(HasBits* has_bits) {
|
||||
(*has_bits)[0] |= 1073741824u;
|
||||
}
|
||||
static void set_has_split_by_unicode_script(HasBits* has_bits) {
|
||||
(*has_bits)[1] |= 1u;
|
||||
}
|
||||
static void set_has_split_by_number(HasBits* has_bits) {
|
||||
(*has_bits)[1] |= 2u;
|
||||
}
|
||||
static void set_has_split_by_whitespace(HasBits* has_bits) {
|
||||
(*has_bits)[1] |= 4u;
|
||||
}
|
||||
static void set_has_treat_whitespace_as_suffix(HasBits* has_bits) {
|
||||
static void set_has_enable_differential_privacy(HasBits* has_bits) {
|
||||
(*has_bits)[0] |= 8192u;
|
||||
}
|
||||
static void set_has_allow_whitespace_only_pieces(HasBits* has_bits) {
|
||||
static void set_has_differential_privacy_noise_level(HasBits* has_bits) {
|
||||
(*has_bits)[0] |= 2097152u;
|
||||
}
|
||||
static void set_has_differential_privacy_clipping_threshold(HasBits* has_bits) {
|
||||
(*has_bits)[0] |= 4194304u;
|
||||
}
|
||||
static void set_has_character_coverage(HasBits* has_bits) {
|
||||
(*has_bits)[0] |= 33554432u;
|
||||
}
|
||||
static void set_has_input_sentence_size(HasBits* has_bits) {
|
||||
(*has_bits)[0] |= 2048u;
|
||||
}
|
||||
static void set_has_shuffle_input_sentence(HasBits* has_bits) {
|
||||
(*has_bits)[1] |= 1u;
|
||||
}
|
||||
static void set_has_mining_sentence_size(HasBits* has_bits) {
|
||||
(*has_bits)[0] |= 1024u;
|
||||
}
|
||||
static void set_has_training_sentence_size(HasBits* has_bits) {
|
||||
(*has_bits)[0] |= 4096u;
|
||||
}
|
||||
static void set_has_seed_sentencepiece_size(HasBits* has_bits) {
|
||||
(*has_bits)[0] |= 67108864u;
|
||||
}
|
||||
static void set_has_shrinking_factor(HasBits* has_bits) {
|
||||
(*has_bits)[0] |= 134217728u;
|
||||
}
|
||||
static void set_has_max_sentence_length(HasBits* has_bits) {
|
||||
(*has_bits)[0] |= 1073741824u;
|
||||
}
|
||||
static void set_has_num_threads(HasBits* has_bits) {
|
||||
(*has_bits)[0] |= 268435456u;
|
||||
}
|
||||
static void set_has_num_sub_iterations(HasBits* has_bits) {
|
||||
(*has_bits)[0] |= 536870912u;
|
||||
}
|
||||
static void set_has_max_sentencepiece_length(HasBits* has_bits) {
|
||||
(*has_bits)[0] |= 2147483648u;
|
||||
}
|
||||
static void set_has_split_by_unicode_script(HasBits* has_bits) {
|
||||
(*has_bits)[1] |= 2u;
|
||||
}
|
||||
static void set_has_split_by_number(HasBits* has_bits) {
|
||||
(*has_bits)[1] |= 4u;
|
||||
}
|
||||
static void set_has_split_by_whitespace(HasBits* has_bits) {
|
||||
(*has_bits)[1] |= 8u;
|
||||
}
|
||||
static void set_has_treat_whitespace_as_suffix(HasBits* has_bits) {
|
||||
(*has_bits)[0] |= 16384u;
|
||||
}
|
||||
static void set_has_split_digits(HasBits* has_bits) {
|
||||
static void set_has_allow_whitespace_only_pieces(HasBits* has_bits) {
|
||||
(*has_bits)[0] |= 32768u;
|
||||
}
|
||||
static void set_has_split_digits(HasBits* has_bits) {
|
||||
(*has_bits)[0] |= 65536u;
|
||||
}
|
||||
static void set_has_pretokenization_delimiter(HasBits* has_bits) {
|
||||
(*has_bits)[0] |= 256u;
|
||||
}
|
||||
static void set_has_required_chars(HasBits* has_bits) {
|
||||
(*has_bits)[0] |= 4u;
|
||||
}
|
||||
static void set_has_byte_fallback(HasBits* has_bits) {
|
||||
(*has_bits)[0] |= 65536u;
|
||||
}
|
||||
static void set_has_vocabulary_output_piece_score(HasBits* has_bits) {
|
||||
(*has_bits)[1] |= 8u;
|
||||
}
|
||||
static void set_has_hard_vocab_limit(HasBits* has_bits) {
|
||||
(*has_bits)[1] |= 16u;
|
||||
}
|
||||
static void set_has_use_all_vocab(HasBits* has_bits) {
|
||||
(*has_bits)[0] |= 131072u;
|
||||
}
|
||||
static void set_has_unk_id(HasBits* has_bits) {
|
||||
(*has_bits)[0] |= 524288u;
|
||||
static void set_has_vocabulary_output_piece_score(HasBits* has_bits) {
|
||||
(*has_bits)[1] |= 16u;
|
||||
}
|
||||
static void set_has_bos_id(HasBits* has_bits) {
|
||||
static void set_has_hard_vocab_limit(HasBits* has_bits) {
|
||||
(*has_bits)[1] |= 32u;
|
||||
}
|
||||
static void set_has_eos_id(HasBits* has_bits) {
|
||||
static void set_has_use_all_vocab(HasBits* has_bits) {
|
||||
(*has_bits)[0] |= 262144u;
|
||||
}
|
||||
static void set_has_unk_id(HasBits* has_bits) {
|
||||
(*has_bits)[0] |= 1048576u;
|
||||
}
|
||||
static void set_has_bos_id(HasBits* has_bits) {
|
||||
(*has_bits)[1] |= 64u;
|
||||
}
|
||||
static void set_has_pad_id(HasBits* has_bits) {
|
||||
static void set_has_eos_id(HasBits* has_bits) {
|
||||
(*has_bits)[1] |= 128u;
|
||||
}
|
||||
static void set_has_pad_id(HasBits* has_bits) {
|
||||
(*has_bits)[1] |= 256u;
|
||||
}
|
||||
static void set_has_unk_piece(HasBits* has_bits) {
|
||||
(*has_bits)[0] |= 16u;
|
||||
}
|
||||
@ -396,7 +399,7 @@ class TrainerSpec::_Internal {
|
||||
(*has_bits)[0] |= 8u;
|
||||
}
|
||||
static void set_has_train_extremely_large_corpus(HasBits* has_bits) {
|
||||
(*has_bits)[0] |= 262144u;
|
||||
(*has_bits)[0] |= 524288u;
|
||||
}
|
||||
};
|
||||
|
||||
@ -465,6 +468,11 @@ TrainerSpec::TrainerSpec(const TrainerSpec& from)
|
||||
pad_piece_.Set(::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr::NonEmptyDefault{}, from._internal_pad_piece(),
|
||||
GetArena());
|
||||
}
|
||||
pretokenization_delimiter_.UnsafeSetDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited());
|
||||
if (from._internal_has_pretokenization_delimiter()) {
|
||||
pretokenization_delimiter_.Set(::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr::EmptyDefault{}, from._internal_pretokenization_delimiter(),
|
||||
GetArena());
|
||||
}
|
||||
::memcpy(&self_test_sample_size_, &from.self_test_sample_size_,
|
||||
static_cast<size_t>(reinterpret_cast<char*>(&pad_id_) -
|
||||
reinterpret_cast<char*>(&self_test_sample_size_)) + sizeof(pad_id_));
|
||||
@ -481,6 +489,7 @@ void TrainerSpec::SharedCtor() {
|
||||
bos_piece_.UnsafeSetDefault(nullptr);
|
||||
eos_piece_.UnsafeSetDefault(nullptr);
|
||||
pad_piece_.UnsafeSetDefault(nullptr);
|
||||
pretokenization_delimiter_.UnsafeSetDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited());
|
||||
::memset(reinterpret_cast<char*>(this) + static_cast<size_t>(
|
||||
reinterpret_cast<char*>(&self_test_sample_size_) - reinterpret_cast<char*>(this)),
|
||||
0, static_cast<size_t>(reinterpret_cast<char*>(&differential_privacy_clipping_threshold_) -
|
||||
@ -521,6 +530,7 @@ void TrainerSpec::SharedDtor() {
|
||||
bos_piece_.DestroyNoArena(nullptr);
|
||||
eos_piece_.DestroyNoArena(nullptr);
|
||||
pad_piece_.DestroyNoArena(nullptr);
|
||||
pretokenization_delimiter_.DestroyNoArena(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited());
|
||||
}
|
||||
|
||||
void TrainerSpec::ArenaDtor(void* object) {
|
||||
@ -576,19 +586,22 @@ void TrainerSpec::Clear() {
|
||||
pad_piece_.ClearToDefault(::sentencepiece::TrainerSpec::_i_give_permission_to_break_this_code_default_pad_piece_, GetArena());
|
||||
}
|
||||
}
|
||||
if (cached_has_bits & 0x0000ff00u) {
|
||||
if (cached_has_bits & 0x00000100u) {
|
||||
pretokenization_delimiter_.ClearNonDefaultToEmpty();
|
||||
}
|
||||
if (cached_has_bits & 0x0000fe00u) {
|
||||
::memset(&self_test_sample_size_, 0, static_cast<size_t>(
|
||||
reinterpret_cast<char*>(&split_digits_) -
|
||||
reinterpret_cast<char*>(&self_test_sample_size_)) + sizeof(split_digits_));
|
||||
reinterpret_cast<char*>(&allow_whitespace_only_pieces_) -
|
||||
reinterpret_cast<char*>(&self_test_sample_size_)) + sizeof(allow_whitespace_only_pieces_));
|
||||
}
|
||||
if (cached_has_bits & 0x00ff0000u) {
|
||||
::memset(&byte_fallback_, 0, static_cast<size_t>(
|
||||
::memset(&split_digits_, 0, static_cast<size_t>(
|
||||
reinterpret_cast<char*>(&differential_privacy_clipping_threshold_) -
|
||||
reinterpret_cast<char*>(&byte_fallback_)) + sizeof(differential_privacy_clipping_threshold_));
|
||||
reinterpret_cast<char*>(&split_digits_)) + sizeof(differential_privacy_clipping_threshold_));
|
||||
model_type_ = 1;
|
||||
vocab_size_ = 8000;
|
||||
}
|
||||
if (cached_has_bits & 0xff000000u) {
|
||||
vocab_size_ = 8000;
|
||||
character_coverage_ = 0.9995f;
|
||||
seed_sentencepiece_size_ = 1000000;
|
||||
shrinking_factor_ = 0.75f;
|
||||
@ -596,10 +609,10 @@ void TrainerSpec::Clear() {
|
||||
num_sub_iterations_ = 2;
|
||||
max_sentence_length_ = 4192;
|
||||
max_sentencepiece_length_ = 16;
|
||||
shuffle_input_sentence_ = true;
|
||||
}
|
||||
cached_has_bits = _has_bits_[1];
|
||||
if (cached_has_bits & 0x000000ffu) {
|
||||
shuffle_input_sentence_ = true;
|
||||
split_by_unicode_script_ = true;
|
||||
split_by_number_ = true;
|
||||
split_by_whitespace_ = true;
|
||||
@ -607,8 +620,8 @@ void TrainerSpec::Clear() {
|
||||
hard_vocab_limit_ = true;
|
||||
bos_id_ = 1;
|
||||
eos_id_ = 2;
|
||||
pad_id_ = -1;
|
||||
}
|
||||
pad_id_ = -1;
|
||||
_has_bits_.Clear();
|
||||
_internal_metadata_.Clear<std::string>();
|
||||
}
|
||||
@ -996,6 +1009,14 @@ const char* TrainerSpec::_InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID
|
||||
CHK_(ptr);
|
||||
} else goto handle_unusual;
|
||||
continue;
|
||||
// optional string pretokenization_delimiter = 53 [default = ""];
|
||||
case 53:
|
||||
if (PROTOBUF_PREDICT_TRUE(static_cast<::PROTOBUF_NAMESPACE_ID::uint8>(tag) == 170)) {
|
||||
auto str = _internal_mutable_pretokenization_delimiter();
|
||||
ptr = ::PROTOBUF_NAMESPACE_ID::internal::InlineGreedyStringParser(str, ptr, ctx);
|
||||
CHK_(ptr);
|
||||
} else goto handle_unusual;
|
||||
continue;
|
||||
default: {
|
||||
handle_unusual:
|
||||
if ((tag & 7) == 4 || tag == 0) {
|
||||
@ -1044,14 +1065,14 @@ failure:
|
||||
}
|
||||
|
||||
// optional .sentencepiece.TrainerSpec.ModelType model_type = 3 [default = UNIGRAM];
|
||||
if (cached_has_bits & 0x00400000u) {
|
||||
if (cached_has_bits & 0x00800000u) {
|
||||
target = stream->EnsureSpace(target);
|
||||
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteEnumToArray(
|
||||
3, this->_internal_model_type(), target);
|
||||
}
|
||||
|
||||
// optional int32 vocab_size = 4 [default = 8000];
|
||||
if (cached_has_bits & 0x00800000u) {
|
||||
if (cached_has_bits & 0x01000000u) {
|
||||
target = stream->EnsureSpace(target);
|
||||
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteInt32ToArray(4, this->_internal_vocab_size(), target);
|
||||
}
|
||||
@ -1063,7 +1084,7 @@ failure:
|
||||
}
|
||||
|
||||
// optional int32 self_test_sample_size = 6 [default = 0];
|
||||
if (cached_has_bits & 0x00000100u) {
|
||||
if (cached_has_bits & 0x00000200u) {
|
||||
target = stream->EnsureSpace(target);
|
||||
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteInt32ToArray(6, this->_internal_self_test_sample_size(), target);
|
||||
}
|
||||
@ -1075,105 +1096,107 @@ failure:
|
||||
}
|
||||
|
||||
// optional float character_coverage = 10 [default = 0.9995];
|
||||
if (cached_has_bits & 0x01000000u) {
|
||||
if (cached_has_bits & 0x02000000u) {
|
||||
target = stream->EnsureSpace(target);
|
||||
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteFloatToArray(10, this->_internal_character_coverage(), target);
|
||||
}
|
||||
|
||||
// optional uint64 input_sentence_size = 11 [default = 0];
|
||||
if (cached_has_bits & 0x00000400u) {
|
||||
if (cached_has_bits & 0x00000800u) {
|
||||
target = stream->EnsureSpace(target);
|
||||
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteUInt64ToArray(11, this->_internal_input_sentence_size(), target);
|
||||
}
|
||||
|
||||
// optional int32 mining_sentence_size = 12 [deprecated = true];
|
||||
if (cached_has_bits & 0x00000200u) {
|
||||
if (cached_has_bits & 0x00000400u) {
|
||||
target = stream->EnsureSpace(target);
|
||||
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteInt32ToArray(12, this->_internal_mining_sentence_size(), target);
|
||||
}
|
||||
|
||||
// optional int32 training_sentence_size = 13 [deprecated = true];
|
||||
if (cached_has_bits & 0x00000800u) {
|
||||
if (cached_has_bits & 0x00001000u) {
|
||||
target = stream->EnsureSpace(target);
|
||||
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteInt32ToArray(13, this->_internal_training_sentence_size(), target);
|
||||
}
|
||||
|
||||
// optional int32 seed_sentencepiece_size = 14 [default = 1000000];
|
||||
if (cached_has_bits & 0x02000000u) {
|
||||
if (cached_has_bits & 0x04000000u) {
|
||||
target = stream->EnsureSpace(target);
|
||||
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteInt32ToArray(14, this->_internal_seed_sentencepiece_size(), target);
|
||||
}
|
||||
|
||||
// optional float shrinking_factor = 15 [default = 0.75];
|
||||
if (cached_has_bits & 0x04000000u) {
|
||||
if (cached_has_bits & 0x08000000u) {
|
||||
target = stream->EnsureSpace(target);
|
||||
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteFloatToArray(15, this->_internal_shrinking_factor(), target);
|
||||
}
|
||||
|
||||
// optional int32 num_threads = 16 [default = 16];
|
||||
if (cached_has_bits & 0x08000000u) {
|
||||
if (cached_has_bits & 0x10000000u) {
|
||||
target = stream->EnsureSpace(target);
|
||||
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteInt32ToArray(16, this->_internal_num_threads(), target);
|
||||
}
|
||||
|
||||
// optional int32 num_sub_iterations = 17 [default = 2];
|
||||
if (cached_has_bits & 0x10000000u) {
|
||||
if (cached_has_bits & 0x20000000u) {
|
||||
target = stream->EnsureSpace(target);
|
||||
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteInt32ToArray(17, this->_internal_num_sub_iterations(), target);
|
||||
}
|
||||
|
||||
// optional int32 max_sentence_length = 18 [default = 4192];
|
||||
if (cached_has_bits & 0x20000000u) {
|
||||
if (cached_has_bits & 0x40000000u) {
|
||||
target = stream->EnsureSpace(target);
|
||||
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteInt32ToArray(18, this->_internal_max_sentence_length(), target);
|
||||
}
|
||||
|
||||
cached_has_bits = _has_bits_[1];
|
||||
// optional bool shuffle_input_sentence = 19 [default = true];
|
||||
if (cached_has_bits & 0x80000000u) {
|
||||
if (cached_has_bits & 0x00000001u) {
|
||||
target = stream->EnsureSpace(target);
|
||||
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteBoolToArray(19, this->_internal_shuffle_input_sentence(), target);
|
||||
}
|
||||
|
||||
cached_has_bits = _has_bits_[0];
|
||||
// optional int32 max_sentencepiece_length = 20 [default = 16];
|
||||
if (cached_has_bits & 0x40000000u) {
|
||||
if (cached_has_bits & 0x80000000u) {
|
||||
target = stream->EnsureSpace(target);
|
||||
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteInt32ToArray(20, this->_internal_max_sentencepiece_length(), target);
|
||||
}
|
||||
|
||||
cached_has_bits = _has_bits_[1];
|
||||
// optional bool split_by_unicode_script = 21 [default = true];
|
||||
if (cached_has_bits & 0x00000001u) {
|
||||
if (cached_has_bits & 0x00000002u) {
|
||||
target = stream->EnsureSpace(target);
|
||||
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteBoolToArray(21, this->_internal_split_by_unicode_script(), target);
|
||||
}
|
||||
|
||||
// optional bool split_by_whitespace = 22 [default = true];
|
||||
if (cached_has_bits & 0x00000004u) {
|
||||
if (cached_has_bits & 0x00000008u) {
|
||||
target = stream->EnsureSpace(target);
|
||||
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteBoolToArray(22, this->_internal_split_by_whitespace(), target);
|
||||
}
|
||||
|
||||
// optional bool split_by_number = 23 [default = true];
|
||||
if (cached_has_bits & 0x00000002u) {
|
||||
if (cached_has_bits & 0x00000004u) {
|
||||
target = stream->EnsureSpace(target);
|
||||
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteBoolToArray(23, this->_internal_split_by_number(), target);
|
||||
}
|
||||
|
||||
cached_has_bits = _has_bits_[0];
|
||||
// optional bool treat_whitespace_as_suffix = 24 [default = false];
|
||||
if (cached_has_bits & 0x00002000u) {
|
||||
if (cached_has_bits & 0x00004000u) {
|
||||
target = stream->EnsureSpace(target);
|
||||
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteBoolToArray(24, this->_internal_treat_whitespace_as_suffix(), target);
|
||||
}
|
||||
|
||||
// optional bool split_digits = 25 [default = false];
|
||||
if (cached_has_bits & 0x00008000u) {
|
||||
if (cached_has_bits & 0x00010000u) {
|
||||
target = stream->EnsureSpace(target);
|
||||
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteBoolToArray(25, this->_internal_split_digits(), target);
|
||||
}
|
||||
|
||||
// optional bool allow_whitespace_only_pieces = 26 [default = false];
|
||||
if (cached_has_bits & 0x00004000u) {
|
||||
if (cached_has_bits & 0x00008000u) {
|
||||
target = stream->EnsureSpace(target);
|
||||
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteBoolToArray(26, this->_internal_allow_whitespace_only_pieces(), target);
|
||||
}
|
||||
@ -1192,26 +1215,26 @@ failure:
|
||||
|
||||
cached_has_bits = _has_bits_[1];
|
||||
// optional bool vocabulary_output_piece_score = 32 [default = true];
|
||||
if (cached_has_bits & 0x00000008u) {
|
||||
if (cached_has_bits & 0x00000010u) {
|
||||
target = stream->EnsureSpace(target);
|
||||
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteBoolToArray(32, this->_internal_vocabulary_output_piece_score(), target);
|
||||
}
|
||||
|
||||
// optional bool hard_vocab_limit = 33 [default = true];
|
||||
if (cached_has_bits & 0x00000010u) {
|
||||
if (cached_has_bits & 0x00000020u) {
|
||||
target = stream->EnsureSpace(target);
|
||||
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteBoolToArray(33, this->_internal_hard_vocab_limit(), target);
|
||||
}
|
||||
|
||||
cached_has_bits = _has_bits_[0];
|
||||
// optional bool use_all_vocab = 34 [default = false];
|
||||
if (cached_has_bits & 0x00020000u) {
|
||||
if (cached_has_bits & 0x00040000u) {
|
||||
target = stream->EnsureSpace(target);
|
||||
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteBoolToArray(34, this->_internal_use_all_vocab(), target);
|
||||
}
|
||||
|
||||
// optional bool byte_fallback = 35 [default = false];
|
||||
if (cached_has_bits & 0x00010000u) {
|
||||
if (cached_has_bits & 0x00020000u) {
|
||||
target = stream->EnsureSpace(target);
|
||||
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteBoolToArray(35, this->_internal_byte_fallback(), target);
|
||||
}
|
||||
@ -1223,26 +1246,26 @@ failure:
|
||||
}
|
||||
|
||||
// optional int32 unk_id = 40 [default = 0];
|
||||
if (cached_has_bits & 0x00080000u) {
|
||||
if (cached_has_bits & 0x00100000u) {
|
||||
target = stream->EnsureSpace(target);
|
||||
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteInt32ToArray(40, this->_internal_unk_id(), target);
|
||||
}
|
||||
|
||||
cached_has_bits = _has_bits_[1];
|
||||
// optional int32 bos_id = 41 [default = 1];
|
||||
if (cached_has_bits & 0x00000020u) {
|
||||
if (cached_has_bits & 0x00000040u) {
|
||||
target = stream->EnsureSpace(target);
|
||||
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteInt32ToArray(41, this->_internal_bos_id(), target);
|
||||
}
|
||||
|
||||
// optional int32 eos_id = 42 [default = 2];
|
||||
if (cached_has_bits & 0x00000040u) {
|
||||
if (cached_has_bits & 0x00000080u) {
|
||||
target = stream->EnsureSpace(target);
|
||||
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteInt32ToArray(42, this->_internal_eos_id(), target);
|
||||
}
|
||||
|
||||
// optional int32 pad_id = 43 [default = -1];
|
||||
if (cached_has_bits & 0x00000080u) {
|
||||
if (cached_has_bits & 0x00000100u) {
|
||||
target = stream->EnsureSpace(target);
|
||||
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteInt32ToArray(43, this->_internal_pad_id(), target);
|
||||
}
|
||||
@ -1279,29 +1302,35 @@ failure:
|
||||
}
|
||||
|
||||
// optional bool train_extremely_large_corpus = 49 [default = false];
|
||||
if (cached_has_bits & 0x00040000u) {
|
||||
if (cached_has_bits & 0x00080000u) {
|
||||
target = stream->EnsureSpace(target);
|
||||
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteBoolToArray(49, this->_internal_train_extremely_large_corpus(), target);
|
||||
}
|
||||
|
||||
// optional bool enable_differential_privacy = 50 [default = false];
|
||||
if (cached_has_bits & 0x00001000u) {
|
||||
if (cached_has_bits & 0x00002000u) {
|
||||
target = stream->EnsureSpace(target);
|
||||
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteBoolToArray(50, this->_internal_enable_differential_privacy(), target);
|
||||
}
|
||||
|
||||
// optional float differential_privacy_noise_level = 51 [default = 0];
|
||||
if (cached_has_bits & 0x00100000u) {
|
||||
if (cached_has_bits & 0x00200000u) {
|
||||
target = stream->EnsureSpace(target);
|
||||
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteFloatToArray(51, this->_internal_differential_privacy_noise_level(), target);
|
||||
}
|
||||
|
||||
// optional uint64 differential_privacy_clipping_threshold = 52 [default = 0];
|
||||
if (cached_has_bits & 0x00200000u) {
|
||||
if (cached_has_bits & 0x00400000u) {
|
||||
target = stream->EnsureSpace(target);
|
||||
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteUInt64ToArray(52, this->_internal_differential_privacy_clipping_threshold(), target);
|
||||
}
|
||||
|
||||
// optional string pretokenization_delimiter = 53 [default = ""];
|
||||
if (cached_has_bits & 0x00000100u) {
|
||||
target = stream->WriteStringMaybeAliased(
|
||||
53, this->_internal_pretokenization_delimiter(), target);
|
||||
}
|
||||
|
||||
// Extension range [200, 536870912)
|
||||
target = _extensions_._InternalSerialize(
|
||||
200, 536870912, target, stream);
|
||||
@ -1416,205 +1445,212 @@ size_t TrainerSpec::ByteSizeLong() const {
|
||||
|
||||
}
|
||||
if (cached_has_bits & 0x0000ff00u) {
|
||||
// optional int32 self_test_sample_size = 6 [default = 0];
|
||||
// optional string pretokenization_delimiter = 53 [default = ""];
|
||||
if (cached_has_bits & 0x00000100u) {
|
||||
total_size += 2 +
|
||||
::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::StringSize(
|
||||
this->_internal_pretokenization_delimiter());
|
||||
}
|
||||
|
||||
// optional int32 self_test_sample_size = 6 [default = 0];
|
||||
if (cached_has_bits & 0x00000200u) {
|
||||
total_size += 1 +
|
||||
::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::Int32Size(
|
||||
this->_internal_self_test_sample_size());
|
||||
}
|
||||
|
||||
// optional int32 mining_sentence_size = 12 [deprecated = true];
|
||||
if (cached_has_bits & 0x00000200u) {
|
||||
if (cached_has_bits & 0x00000400u) {
|
||||
total_size += 1 +
|
||||
::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::Int32Size(
|
||||
this->_internal_mining_sentence_size());
|
||||
}
|
||||
|
||||
// optional uint64 input_sentence_size = 11 [default = 0];
|
||||
if (cached_has_bits & 0x00000400u) {
|
||||
if (cached_has_bits & 0x00000800u) {
|
||||
total_size += 1 +
|
||||
::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::UInt64Size(
|
||||
this->_internal_input_sentence_size());
|
||||
}
|
||||
|
||||
// optional int32 training_sentence_size = 13 [deprecated = true];
|
||||
if (cached_has_bits & 0x00000800u) {
|
||||
if (cached_has_bits & 0x00001000u) {
|
||||
total_size += 1 +
|
||||
::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::Int32Size(
|
||||
this->_internal_training_sentence_size());
|
||||
}
|
||||
|
||||
// optional bool enable_differential_privacy = 50 [default = false];
|
||||
if (cached_has_bits & 0x00001000u) {
|
||||
total_size += 2 + 1;
|
||||
}
|
||||
|
||||
// optional bool treat_whitespace_as_suffix = 24 [default = false];
|
||||
if (cached_has_bits & 0x00002000u) {
|
||||
total_size += 2 + 1;
|
||||
}
|
||||
|
||||
// optional bool allow_whitespace_only_pieces = 26 [default = false];
|
||||
// optional bool treat_whitespace_as_suffix = 24 [default = false];
|
||||
if (cached_has_bits & 0x00004000u) {
|
||||
total_size += 2 + 1;
|
||||
}
|
||||
|
||||
// optional bool split_digits = 25 [default = false];
|
||||
// optional bool allow_whitespace_only_pieces = 26 [default = false];
|
||||
if (cached_has_bits & 0x00008000u) {
|
||||
total_size += 2 + 1;
|
||||
}
|
||||
|
||||
}
|
||||
if (cached_has_bits & 0x00ff0000u) {
|
||||
// optional bool byte_fallback = 35 [default = false];
|
||||
// optional bool split_digits = 25 [default = false];
|
||||
if (cached_has_bits & 0x00010000u) {
|
||||
total_size += 2 + 1;
|
||||
}
|
||||
|
||||
// optional bool use_all_vocab = 34 [default = false];
|
||||
// optional bool byte_fallback = 35 [default = false];
|
||||
if (cached_has_bits & 0x00020000u) {
|
||||
total_size += 2 + 1;
|
||||
}
|
||||
|
||||
// optional bool train_extremely_large_corpus = 49 [default = false];
|
||||
// optional bool use_all_vocab = 34 [default = false];
|
||||
if (cached_has_bits & 0x00040000u) {
|
||||
total_size += 2 + 1;
|
||||
}
|
||||
|
||||
// optional int32 unk_id = 40 [default = 0];
|
||||
// optional bool train_extremely_large_corpus = 49 [default = false];
|
||||
if (cached_has_bits & 0x00080000u) {
|
||||
total_size += 2 + 1;
|
||||
}
|
||||
|
||||
// optional int32 unk_id = 40 [default = 0];
|
||||
if (cached_has_bits & 0x00100000u) {
|
||||
total_size += 2 +
|
||||
::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::Int32Size(
|
||||
this->_internal_unk_id());
|
||||
}
|
||||
|
||||
// optional float differential_privacy_noise_level = 51 [default = 0];
|
||||
if (cached_has_bits & 0x00100000u) {
|
||||
if (cached_has_bits & 0x00200000u) {
|
||||
total_size += 2 + 4;
|
||||
}
|
||||
|
||||
// optional uint64 differential_privacy_clipping_threshold = 52 [default = 0];
|
||||
if (cached_has_bits & 0x00200000u) {
|
||||
if (cached_has_bits & 0x00400000u) {
|
||||
total_size += 2 +
|
||||
::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::UInt64Size(
|
||||
this->_internal_differential_privacy_clipping_threshold());
|
||||
}
|
||||
|
||||
// optional .sentencepiece.TrainerSpec.ModelType model_type = 3 [default = UNIGRAM];
|
||||
if (cached_has_bits & 0x00400000u) {
|
||||
if (cached_has_bits & 0x00800000u) {
|
||||
total_size += 1 +
|
||||
::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::EnumSize(this->_internal_model_type());
|
||||
}
|
||||
|
||||
}
|
||||
if (cached_has_bits & 0xff000000u) {
|
||||
// optional int32 vocab_size = 4 [default = 8000];
|
||||
if (cached_has_bits & 0x00800000u) {
|
||||
if (cached_has_bits & 0x01000000u) {
|
||||
total_size += 1 +
|
||||
::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::Int32Size(
|
||||
this->_internal_vocab_size());
|
||||
}
|
||||
|
||||
}
|
||||
if (cached_has_bits & 0xff000000u) {
|
||||
// optional float character_coverage = 10 [default = 0.9995];
|
||||
if (cached_has_bits & 0x01000000u) {
|
||||
if (cached_has_bits & 0x02000000u) {
|
||||
total_size += 1 + 4;
|
||||
}
|
||||
|
||||
// optional int32 seed_sentencepiece_size = 14 [default = 1000000];
|
||||
if (cached_has_bits & 0x02000000u) {
|
||||
if (cached_has_bits & 0x04000000u) {
|
||||
total_size += 1 +
|
||||
::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::Int32Size(
|
||||
this->_internal_seed_sentencepiece_size());
|
||||
}
|
||||
|
||||
// optional float shrinking_factor = 15 [default = 0.75];
|
||||
if (cached_has_bits & 0x04000000u) {
|
||||
if (cached_has_bits & 0x08000000u) {
|
||||
total_size += 1 + 4;
|
||||
}
|
||||
|
||||
// optional int32 num_threads = 16 [default = 16];
|
||||
if (cached_has_bits & 0x08000000u) {
|
||||
if (cached_has_bits & 0x10000000u) {
|
||||
total_size += 2 +
|
||||
::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::Int32Size(
|
||||
this->_internal_num_threads());
|
||||
}
|
||||
|
||||
// optional int32 num_sub_iterations = 17 [default = 2];
|
||||
if (cached_has_bits & 0x10000000u) {
|
||||
if (cached_has_bits & 0x20000000u) {
|
||||
total_size += 2 +
|
||||
::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::Int32Size(
|
||||
this->_internal_num_sub_iterations());
|
||||
}
|
||||
|
||||
// optional int32 max_sentence_length = 18 [default = 4192];
|
||||
if (cached_has_bits & 0x20000000u) {
|
||||
if (cached_has_bits & 0x40000000u) {
|
||||
total_size += 2 +
|
||||
::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::Int32Size(
|
||||
this->_internal_max_sentence_length());
|
||||
}
|
||||
|
||||
// optional int32 max_sentencepiece_length = 20 [default = 16];
|
||||
if (cached_has_bits & 0x40000000u) {
|
||||
if (cached_has_bits & 0x80000000u) {
|
||||
total_size += 2 +
|
||||
::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::Int32Size(
|
||||
this->_internal_max_sentencepiece_length());
|
||||
}
|
||||
|
||||
// optional bool shuffle_input_sentence = 19 [default = true];
|
||||
if (cached_has_bits & 0x80000000u) {
|
||||
total_size += 2 + 1;
|
||||
}
|
||||
|
||||
}
|
||||
cached_has_bits = _has_bits_[1];
|
||||
if (cached_has_bits & 0x000000ffu) {
|
||||
// optional bool split_by_unicode_script = 21 [default = true];
|
||||
// optional bool shuffle_input_sentence = 19 [default = true];
|
||||
if (cached_has_bits & 0x00000001u) {
|
||||
total_size += 2 + 1;
|
||||
}
|
||||
|
||||
// optional bool split_by_number = 23 [default = true];
|
||||
// optional bool split_by_unicode_script = 21 [default = true];
|
||||
if (cached_has_bits & 0x00000002u) {
|
||||
total_size += 2 + 1;
|
||||
}
|
||||
|
||||
// optional bool split_by_whitespace = 22 [default = true];
|
||||
// optional bool split_by_number = 23 [default = true];
|
||||
if (cached_has_bits & 0x00000004u) {
|
||||
total_size += 2 + 1;
|
||||
}
|
||||
|
||||
// optional bool vocabulary_output_piece_score = 32 [default = true];
|
||||
// optional bool split_by_whitespace = 22 [default = true];
|
||||
if (cached_has_bits & 0x00000008u) {
|
||||
total_size += 2 + 1;
|
||||
}
|
||||
|
||||
// optional bool hard_vocab_limit = 33 [default = true];
|
||||
// optional bool vocabulary_output_piece_score = 32 [default = true];
|
||||
if (cached_has_bits & 0x00000010u) {
|
||||
total_size += 2 + 1;
|
||||
}
|
||||
|
||||
// optional int32 bos_id = 41 [default = 1];
|
||||
// optional bool hard_vocab_limit = 33 [default = true];
|
||||
if (cached_has_bits & 0x00000020u) {
|
||||
total_size += 2 + 1;
|
||||
}
|
||||
|
||||
// optional int32 bos_id = 41 [default = 1];
|
||||
if (cached_has_bits & 0x00000040u) {
|
||||
total_size += 2 +
|
||||
::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::Int32Size(
|
||||
this->_internal_bos_id());
|
||||
}
|
||||
|
||||
// optional int32 eos_id = 42 [default = 2];
|
||||
if (cached_has_bits & 0x00000040u) {
|
||||
if (cached_has_bits & 0x00000080u) {
|
||||
total_size += 2 +
|
||||
::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::Int32Size(
|
||||
this->_internal_eos_id());
|
||||
}
|
||||
|
||||
// optional int32 pad_id = 43 [default = -1];
|
||||
if (cached_has_bits & 0x00000080u) {
|
||||
total_size += 2 +
|
||||
::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::Int32Size(
|
||||
this->_internal_pad_id());
|
||||
}
|
||||
|
||||
}
|
||||
// optional int32 pad_id = 43 [default = -1];
|
||||
if (cached_has_bits & 0x00000100u) {
|
||||
total_size += 2 +
|
||||
::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::Int32Size(
|
||||
this->_internal_pad_id());
|
||||
}
|
||||
|
||||
if (PROTOBUF_PREDICT_FALSE(_internal_metadata_.have_unknown_fields())) {
|
||||
total_size += _internal_metadata_.unknown_fields<std::string>(::PROTOBUF_NAMESPACE_ID::internal::GetEmptyString).size();
|
||||
}
|
||||
@ -1670,113 +1706,116 @@ void TrainerSpec::MergeFrom(const TrainerSpec& from) {
|
||||
}
|
||||
if (cached_has_bits & 0x0000ff00u) {
|
||||
if (cached_has_bits & 0x00000100u) {
|
||||
self_test_sample_size_ = from.self_test_sample_size_;
|
||||
_internal_set_pretokenization_delimiter(from._internal_pretokenization_delimiter());
|
||||
}
|
||||
if (cached_has_bits & 0x00000200u) {
|
||||
mining_sentence_size_ = from.mining_sentence_size_;
|
||||
self_test_sample_size_ = from.self_test_sample_size_;
|
||||
}
|
||||
if (cached_has_bits & 0x00000400u) {
|
||||
input_sentence_size_ = from.input_sentence_size_;
|
||||
mining_sentence_size_ = from.mining_sentence_size_;
|
||||
}
|
||||
if (cached_has_bits & 0x00000800u) {
|
||||
training_sentence_size_ = from.training_sentence_size_;
|
||||
input_sentence_size_ = from.input_sentence_size_;
|
||||
}
|
||||
if (cached_has_bits & 0x00001000u) {
|
||||
enable_differential_privacy_ = from.enable_differential_privacy_;
|
||||
training_sentence_size_ = from.training_sentence_size_;
|
||||
}
|
||||
if (cached_has_bits & 0x00002000u) {
|
||||
treat_whitespace_as_suffix_ = from.treat_whitespace_as_suffix_;
|
||||
enable_differential_privacy_ = from.enable_differential_privacy_;
|
||||
}
|
||||
if (cached_has_bits & 0x00004000u) {
|
||||
allow_whitespace_only_pieces_ = from.allow_whitespace_only_pieces_;
|
||||
treat_whitespace_as_suffix_ = from.treat_whitespace_as_suffix_;
|
||||
}
|
||||
if (cached_has_bits & 0x00008000u) {
|
||||
split_digits_ = from.split_digits_;
|
||||
allow_whitespace_only_pieces_ = from.allow_whitespace_only_pieces_;
|
||||
}
|
||||
_has_bits_[0] |= cached_has_bits;
|
||||
}
|
||||
if (cached_has_bits & 0x00ff0000u) {
|
||||
if (cached_has_bits & 0x00010000u) {
|
||||
byte_fallback_ = from.byte_fallback_;
|
||||
split_digits_ = from.split_digits_;
|
||||
}
|
||||
if (cached_has_bits & 0x00020000u) {
|
||||
use_all_vocab_ = from.use_all_vocab_;
|
||||
byte_fallback_ = from.byte_fallback_;
|
||||
}
|
||||
if (cached_has_bits & 0x00040000u) {
|
||||
train_extremely_large_corpus_ = from.train_extremely_large_corpus_;
|
||||
use_all_vocab_ = from.use_all_vocab_;
|
||||
}
|
||||
if (cached_has_bits & 0x00080000u) {
|
||||
unk_id_ = from.unk_id_;
|
||||
train_extremely_large_corpus_ = from.train_extremely_large_corpus_;
|
||||
}
|
||||
if (cached_has_bits & 0x00100000u) {
|
||||
differential_privacy_noise_level_ = from.differential_privacy_noise_level_;
|
||||
unk_id_ = from.unk_id_;
|
||||
}
|
||||
if (cached_has_bits & 0x00200000u) {
|
||||
differential_privacy_clipping_threshold_ = from.differential_privacy_clipping_threshold_;
|
||||
differential_privacy_noise_level_ = from.differential_privacy_noise_level_;
|
||||
}
|
||||
if (cached_has_bits & 0x00400000u) {
|
||||
model_type_ = from.model_type_;
|
||||
differential_privacy_clipping_threshold_ = from.differential_privacy_clipping_threshold_;
|
||||
}
|
||||
if (cached_has_bits & 0x00800000u) {
|
||||
vocab_size_ = from.vocab_size_;
|
||||
model_type_ = from.model_type_;
|
||||
}
|
||||
_has_bits_[0] |= cached_has_bits;
|
||||
}
|
||||
if (cached_has_bits & 0xff000000u) {
|
||||
if (cached_has_bits & 0x01000000u) {
|
||||
character_coverage_ = from.character_coverage_;
|
||||
vocab_size_ = from.vocab_size_;
|
||||
}
|
||||
if (cached_has_bits & 0x02000000u) {
|
||||
seed_sentencepiece_size_ = from.seed_sentencepiece_size_;
|
||||
character_coverage_ = from.character_coverage_;
|
||||
}
|
||||
if (cached_has_bits & 0x04000000u) {
|
||||
shrinking_factor_ = from.shrinking_factor_;
|
||||
seed_sentencepiece_size_ = from.seed_sentencepiece_size_;
|
||||
}
|
||||
if (cached_has_bits & 0x08000000u) {
|
||||
num_threads_ = from.num_threads_;
|
||||
shrinking_factor_ = from.shrinking_factor_;
|
||||
}
|
||||
if (cached_has_bits & 0x10000000u) {
|
||||
num_sub_iterations_ = from.num_sub_iterations_;
|
||||
num_threads_ = from.num_threads_;
|
||||
}
|
||||
if (cached_has_bits & 0x20000000u) {
|
||||
max_sentence_length_ = from.max_sentence_length_;
|
||||
num_sub_iterations_ = from.num_sub_iterations_;
|
||||
}
|
||||
if (cached_has_bits & 0x40000000u) {
|
||||
max_sentencepiece_length_ = from.max_sentencepiece_length_;
|
||||
max_sentence_length_ = from.max_sentence_length_;
|
||||
}
|
||||
if (cached_has_bits & 0x80000000u) {
|
||||
shuffle_input_sentence_ = from.shuffle_input_sentence_;
|
||||
max_sentencepiece_length_ = from.max_sentencepiece_length_;
|
||||
}
|
||||
_has_bits_[0] |= cached_has_bits;
|
||||
}
|
||||
cached_has_bits = from._has_bits_[1];
|
||||
if (cached_has_bits & 0x000000ffu) {
|
||||
if (cached_has_bits & 0x00000001u) {
|
||||
split_by_unicode_script_ = from.split_by_unicode_script_;
|
||||
shuffle_input_sentence_ = from.shuffle_input_sentence_;
|
||||
}
|
||||
if (cached_has_bits & 0x00000002u) {
|
||||
split_by_number_ = from.split_by_number_;
|
||||
split_by_unicode_script_ = from.split_by_unicode_script_;
|
||||
}
|
||||
if (cached_has_bits & 0x00000004u) {
|
||||
split_by_whitespace_ = from.split_by_whitespace_;
|
||||
split_by_number_ = from.split_by_number_;
|
||||
}
|
||||
if (cached_has_bits & 0x00000008u) {
|
||||
vocabulary_output_piece_score_ = from.vocabulary_output_piece_score_;
|
||||
split_by_whitespace_ = from.split_by_whitespace_;
|
||||
}
|
||||
if (cached_has_bits & 0x00000010u) {
|
||||
hard_vocab_limit_ = from.hard_vocab_limit_;
|
||||
vocabulary_output_piece_score_ = from.vocabulary_output_piece_score_;
|
||||
}
|
||||
if (cached_has_bits & 0x00000020u) {
|
||||
bos_id_ = from.bos_id_;
|
||||
hard_vocab_limit_ = from.hard_vocab_limit_;
|
||||
}
|
||||
if (cached_has_bits & 0x00000040u) {
|
||||
eos_id_ = from.eos_id_;
|
||||
bos_id_ = from.bos_id_;
|
||||
}
|
||||
if (cached_has_bits & 0x00000080u) {
|
||||
pad_id_ = from.pad_id_;
|
||||
eos_id_ = from.eos_id_;
|
||||
}
|
||||
_has_bits_[1] |= cached_has_bits;
|
||||
}
|
||||
if (cached_has_bits & 0x00000100u) {
|
||||
_internal_set_pad_id(from._internal_pad_id());
|
||||
}
|
||||
}
|
||||
|
||||
void TrainerSpec::CopyFrom(const TrainerSpec& from) {
|
||||
@ -1812,6 +1851,7 @@ void TrainerSpec::InternalSwap(TrainerSpec* other) {
|
||||
bos_piece_.Swap(&other->bos_piece_, nullptr, GetArena());
|
||||
eos_piece_.Swap(&other->eos_piece_, nullptr, GetArena());
|
||||
pad_piece_.Swap(&other->pad_piece_, nullptr, GetArena());
|
||||
pretokenization_delimiter_.Swap(&other->pretokenization_delimiter_, &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
|
||||
::PROTOBUF_NAMESPACE_ID::internal::memswap<
|
||||
PROTOBUF_FIELD_OFFSET(TrainerSpec, differential_privacy_clipping_threshold_)
|
||||
+ sizeof(TrainerSpec::differential_privacy_clipping_threshold_)
|
||||
|
@ -273,6 +273,7 @@ class TrainerSpec PROTOBUF_FINAL :
|
||||
kBosPieceFieldNumber = 46,
|
||||
kEosPieceFieldNumber = 47,
|
||||
kPadPieceFieldNumber = 48,
|
||||
kPretokenizationDelimiterFieldNumber = 53,
|
||||
kSelfTestSampleSizeFieldNumber = 6,
|
||||
kMiningSentenceSizeFieldNumber = 12,
|
||||
kInputSentenceSizeFieldNumber = 11,
|
||||
@ -562,6 +563,26 @@ class TrainerSpec PROTOBUF_FINAL :
|
||||
std::string* _internal_mutable_pad_piece();
|
||||
public:
|
||||
|
||||
// optional string pretokenization_delimiter = 53 [default = ""];
|
||||
bool has_pretokenization_delimiter() const;
|
||||
private:
|
||||
bool _internal_has_pretokenization_delimiter() const;
|
||||
public:
|
||||
void clear_pretokenization_delimiter();
|
||||
const std::string& pretokenization_delimiter() const;
|
||||
void set_pretokenization_delimiter(const std::string& value);
|
||||
void set_pretokenization_delimiter(std::string&& value);
|
||||
void set_pretokenization_delimiter(const char* value);
|
||||
void set_pretokenization_delimiter(const char* value, size_t size);
|
||||
std::string* mutable_pretokenization_delimiter();
|
||||
std::string* release_pretokenization_delimiter();
|
||||
void set_allocated_pretokenization_delimiter(std::string* pretokenization_delimiter);
|
||||
private:
|
||||
const std::string& _internal_pretokenization_delimiter() const;
|
||||
void _internal_set_pretokenization_delimiter(const std::string& value);
|
||||
std::string* _internal_mutable_pretokenization_delimiter();
|
||||
public:
|
||||
|
||||
// optional int32 self_test_sample_size = 6 [default = 0];
|
||||
bool has_self_test_sample_size() const;
|
||||
private:
|
||||
@ -1007,6 +1028,7 @@ class TrainerSpec PROTOBUF_FINAL :
|
||||
::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr eos_piece_;
|
||||
static const ::PROTOBUF_NAMESPACE_ID::internal::LazyString _i_give_permission_to_break_this_code_default_pad_piece_;
|
||||
::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr pad_piece_;
|
||||
::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr pretokenization_delimiter_;
|
||||
::PROTOBUF_NAMESPACE_ID::int32 self_test_sample_size_;
|
||||
::PROTOBUF_NAMESPACE_ID::int32 mining_sentence_size_;
|
||||
::PROTOBUF_NAMESPACE_ID::uint64 input_sentence_size_;
|
||||
@ -2240,7 +2262,7 @@ inline void TrainerSpec::set_allocated_model_prefix(std::string* model_prefix) {
|
||||
|
||||
// optional .sentencepiece.TrainerSpec.ModelType model_type = 3 [default = UNIGRAM];
|
||||
inline bool TrainerSpec::_internal_has_model_type() const {
|
||||
bool value = (_has_bits_[0] & 0x00400000u) != 0;
|
||||
bool value = (_has_bits_[0] & 0x00800000u) != 0;
|
||||
return value;
|
||||
}
|
||||
inline bool TrainerSpec::has_model_type() const {
|
||||
@ -2248,7 +2270,7 @@ inline bool TrainerSpec::has_model_type() const {
|
||||
}
|
||||
inline void TrainerSpec::clear_model_type() {
|
||||
model_type_ = 1;
|
||||
_has_bits_[0] &= ~0x00400000u;
|
||||
_has_bits_[0] &= ~0x00800000u;
|
||||
}
|
||||
inline ::sentencepiece::TrainerSpec_ModelType TrainerSpec::_internal_model_type() const {
|
||||
return static_cast< ::sentencepiece::TrainerSpec_ModelType >(model_type_);
|
||||
@ -2259,7 +2281,7 @@ inline ::sentencepiece::TrainerSpec_ModelType TrainerSpec::model_type() const {
|
||||
}
|
||||
inline void TrainerSpec::_internal_set_model_type(::sentencepiece::TrainerSpec_ModelType value) {
|
||||
assert(::sentencepiece::TrainerSpec_ModelType_IsValid(value));
|
||||
_has_bits_[0] |= 0x00400000u;
|
||||
_has_bits_[0] |= 0x00800000u;
|
||||
model_type_ = value;
|
||||
}
|
||||
inline void TrainerSpec::set_model_type(::sentencepiece::TrainerSpec_ModelType value) {
|
||||
@ -2269,7 +2291,7 @@ inline void TrainerSpec::set_model_type(::sentencepiece::TrainerSpec_ModelType v
|
||||
|
||||
// optional int32 vocab_size = 4 [default = 8000];
|
||||
inline bool TrainerSpec::_internal_has_vocab_size() const {
|
||||
bool value = (_has_bits_[0] & 0x00800000u) != 0;
|
||||
bool value = (_has_bits_[0] & 0x01000000u) != 0;
|
||||
return value;
|
||||
}
|
||||
inline bool TrainerSpec::has_vocab_size() const {
|
||||
@ -2277,7 +2299,7 @@ inline bool TrainerSpec::has_vocab_size() const {
|
||||
}
|
||||
inline void TrainerSpec::clear_vocab_size() {
|
||||
vocab_size_ = 8000;
|
||||
_has_bits_[0] &= ~0x00800000u;
|
||||
_has_bits_[0] &= ~0x01000000u;
|
||||
}
|
||||
inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::_internal_vocab_size() const {
|
||||
return vocab_size_;
|
||||
@ -2287,7 +2309,7 @@ inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::vocab_size() const {
|
||||
return _internal_vocab_size();
|
||||
}
|
||||
inline void TrainerSpec::_internal_set_vocab_size(::PROTOBUF_NAMESPACE_ID::int32 value) {
|
||||
_has_bits_[0] |= 0x00800000u;
|
||||
_has_bits_[0] |= 0x01000000u;
|
||||
vocab_size_ = value;
|
||||
}
|
||||
inline void TrainerSpec::set_vocab_size(::PROTOBUF_NAMESPACE_ID::int32 value) {
|
||||
@ -2371,7 +2393,7 @@ TrainerSpec::mutable_accept_language() {
|
||||
|
||||
// optional int32 self_test_sample_size = 6 [default = 0];
|
||||
inline bool TrainerSpec::_internal_has_self_test_sample_size() const {
|
||||
bool value = (_has_bits_[0] & 0x00000100u) != 0;
|
||||
bool value = (_has_bits_[0] & 0x00000200u) != 0;
|
||||
return value;
|
||||
}
|
||||
inline bool TrainerSpec::has_self_test_sample_size() const {
|
||||
@ -2379,7 +2401,7 @@ inline bool TrainerSpec::has_self_test_sample_size() const {
|
||||
}
|
||||
inline void TrainerSpec::clear_self_test_sample_size() {
|
||||
self_test_sample_size_ = 0;
|
||||
_has_bits_[0] &= ~0x00000100u;
|
||||
_has_bits_[0] &= ~0x00000200u;
|
||||
}
|
||||
inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::_internal_self_test_sample_size() const {
|
||||
return self_test_sample_size_;
|
||||
@ -2389,7 +2411,7 @@ inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::self_test_sample_size() const
|
||||
return _internal_self_test_sample_size();
|
||||
}
|
||||
inline void TrainerSpec::_internal_set_self_test_sample_size(::PROTOBUF_NAMESPACE_ID::int32 value) {
|
||||
_has_bits_[0] |= 0x00000100u;
|
||||
_has_bits_[0] |= 0x00000200u;
|
||||
self_test_sample_size_ = value;
|
||||
}
|
||||
inline void TrainerSpec::set_self_test_sample_size(::PROTOBUF_NAMESPACE_ID::int32 value) {
|
||||
@ -2399,7 +2421,7 @@ inline void TrainerSpec::set_self_test_sample_size(::PROTOBUF_NAMESPACE_ID::int3
|
||||
|
||||
// optional bool enable_differential_privacy = 50 [default = false];
|
||||
inline bool TrainerSpec::_internal_has_enable_differential_privacy() const {
|
||||
bool value = (_has_bits_[0] & 0x00001000u) != 0;
|
||||
bool value = (_has_bits_[0] & 0x00002000u) != 0;
|
||||
return value;
|
||||
}
|
||||
inline bool TrainerSpec::has_enable_differential_privacy() const {
|
||||
@ -2407,7 +2429,7 @@ inline bool TrainerSpec::has_enable_differential_privacy() const {
|
||||
}
|
||||
inline void TrainerSpec::clear_enable_differential_privacy() {
|
||||
enable_differential_privacy_ = false;
|
||||
_has_bits_[0] &= ~0x00001000u;
|
||||
_has_bits_[0] &= ~0x00002000u;
|
||||
}
|
||||
inline bool TrainerSpec::_internal_enable_differential_privacy() const {
|
||||
return enable_differential_privacy_;
|
||||
@ -2417,7 +2439,7 @@ inline bool TrainerSpec::enable_differential_privacy() const {
|
||||
return _internal_enable_differential_privacy();
|
||||
}
|
||||
inline void TrainerSpec::_internal_set_enable_differential_privacy(bool value) {
|
||||
_has_bits_[0] |= 0x00001000u;
|
||||
_has_bits_[0] |= 0x00002000u;
|
||||
enable_differential_privacy_ = value;
|
||||
}
|
||||
inline void TrainerSpec::set_enable_differential_privacy(bool value) {
|
||||
@ -2427,7 +2449,7 @@ inline void TrainerSpec::set_enable_differential_privacy(bool value) {
|
||||
|
||||
// optional float differential_privacy_noise_level = 51 [default = 0];
|
||||
inline bool TrainerSpec::_internal_has_differential_privacy_noise_level() const {
|
||||
bool value = (_has_bits_[0] & 0x00100000u) != 0;
|
||||
bool value = (_has_bits_[0] & 0x00200000u) != 0;
|
||||
return value;
|
||||
}
|
||||
inline bool TrainerSpec::has_differential_privacy_noise_level() const {
|
||||
@ -2435,7 +2457,7 @@ inline bool TrainerSpec::has_differential_privacy_noise_level() const {
|
||||
}
|
||||
inline void TrainerSpec::clear_differential_privacy_noise_level() {
|
||||
differential_privacy_noise_level_ = 0;
|
||||
_has_bits_[0] &= ~0x00100000u;
|
||||
_has_bits_[0] &= ~0x00200000u;
|
||||
}
|
||||
inline float TrainerSpec::_internal_differential_privacy_noise_level() const {
|
||||
return differential_privacy_noise_level_;
|
||||
@ -2445,7 +2467,7 @@ inline float TrainerSpec::differential_privacy_noise_level() const {
|
||||
return _internal_differential_privacy_noise_level();
|
||||
}
|
||||
inline void TrainerSpec::_internal_set_differential_privacy_noise_level(float value) {
|
||||
_has_bits_[0] |= 0x00100000u;
|
||||
_has_bits_[0] |= 0x00200000u;
|
||||
differential_privacy_noise_level_ = value;
|
||||
}
|
||||
inline void TrainerSpec::set_differential_privacy_noise_level(float value) {
|
||||
@ -2455,7 +2477,7 @@ inline void TrainerSpec::set_differential_privacy_noise_level(float value) {
|
||||
|
||||
// optional uint64 differential_privacy_clipping_threshold = 52 [default = 0];
|
||||
inline bool TrainerSpec::_internal_has_differential_privacy_clipping_threshold() const {
|
||||
bool value = (_has_bits_[0] & 0x00200000u) != 0;
|
||||
bool value = (_has_bits_[0] & 0x00400000u) != 0;
|
||||
return value;
|
||||
}
|
||||
inline bool TrainerSpec::has_differential_privacy_clipping_threshold() const {
|
||||
@ -2463,7 +2485,7 @@ inline bool TrainerSpec::has_differential_privacy_clipping_threshold() const {
|
||||
}
|
||||
inline void TrainerSpec::clear_differential_privacy_clipping_threshold() {
|
||||
differential_privacy_clipping_threshold_ = PROTOBUF_ULONGLONG(0);
|
||||
_has_bits_[0] &= ~0x00200000u;
|
||||
_has_bits_[0] &= ~0x00400000u;
|
||||
}
|
||||
inline ::PROTOBUF_NAMESPACE_ID::uint64 TrainerSpec::_internal_differential_privacy_clipping_threshold() const {
|
||||
return differential_privacy_clipping_threshold_;
|
||||
@ -2473,7 +2495,7 @@ inline ::PROTOBUF_NAMESPACE_ID::uint64 TrainerSpec::differential_privacy_clippin
|
||||
return _internal_differential_privacy_clipping_threshold();
|
||||
}
|
||||
inline void TrainerSpec::_internal_set_differential_privacy_clipping_threshold(::PROTOBUF_NAMESPACE_ID::uint64 value) {
|
||||
_has_bits_[0] |= 0x00200000u;
|
||||
_has_bits_[0] |= 0x00400000u;
|
||||
differential_privacy_clipping_threshold_ = value;
|
||||
}
|
||||
inline void TrainerSpec::set_differential_privacy_clipping_threshold(::PROTOBUF_NAMESPACE_ID::uint64 value) {
|
||||
@ -2483,7 +2505,7 @@ inline void TrainerSpec::set_differential_privacy_clipping_threshold(::PROTOBUF_
|
||||
|
||||
// optional float character_coverage = 10 [default = 0.9995];
|
||||
inline bool TrainerSpec::_internal_has_character_coverage() const {
|
||||
bool value = (_has_bits_[0] & 0x01000000u) != 0;
|
||||
bool value = (_has_bits_[0] & 0x02000000u) != 0;
|
||||
return value;
|
||||
}
|
||||
inline bool TrainerSpec::has_character_coverage() const {
|
||||
@ -2491,7 +2513,7 @@ inline bool TrainerSpec::has_character_coverage() const {
|
||||
}
|
||||
inline void TrainerSpec::clear_character_coverage() {
|
||||
character_coverage_ = 0.9995f;
|
||||
_has_bits_[0] &= ~0x01000000u;
|
||||
_has_bits_[0] &= ~0x02000000u;
|
||||
}
|
||||
inline float TrainerSpec::_internal_character_coverage() const {
|
||||
return character_coverage_;
|
||||
@ -2501,7 +2523,7 @@ inline float TrainerSpec::character_coverage() const {
|
||||
return _internal_character_coverage();
|
||||
}
|
||||
inline void TrainerSpec::_internal_set_character_coverage(float value) {
|
||||
_has_bits_[0] |= 0x01000000u;
|
||||
_has_bits_[0] |= 0x02000000u;
|
||||
character_coverage_ = value;
|
||||
}
|
||||
inline void TrainerSpec::set_character_coverage(float value) {
|
||||
@ -2511,7 +2533,7 @@ inline void TrainerSpec::set_character_coverage(float value) {
|
||||
|
||||
// optional uint64 input_sentence_size = 11 [default = 0];
|
||||
inline bool TrainerSpec::_internal_has_input_sentence_size() const {
|
||||
bool value = (_has_bits_[0] & 0x00000400u) != 0;
|
||||
bool value = (_has_bits_[0] & 0x00000800u) != 0;
|
||||
return value;
|
||||
}
|
||||
inline bool TrainerSpec::has_input_sentence_size() const {
|
||||
@ -2519,7 +2541,7 @@ inline bool TrainerSpec::has_input_sentence_size() const {
|
||||
}
|
||||
inline void TrainerSpec::clear_input_sentence_size() {
|
||||
input_sentence_size_ = PROTOBUF_ULONGLONG(0);
|
||||
_has_bits_[0] &= ~0x00000400u;
|
||||
_has_bits_[0] &= ~0x00000800u;
|
||||
}
|
||||
inline ::PROTOBUF_NAMESPACE_ID::uint64 TrainerSpec::_internal_input_sentence_size() const {
|
||||
return input_sentence_size_;
|
||||
@ -2529,7 +2551,7 @@ inline ::PROTOBUF_NAMESPACE_ID::uint64 TrainerSpec::input_sentence_size() const
|
||||
return _internal_input_sentence_size();
|
||||
}
|
||||
inline void TrainerSpec::_internal_set_input_sentence_size(::PROTOBUF_NAMESPACE_ID::uint64 value) {
|
||||
_has_bits_[0] |= 0x00000400u;
|
||||
_has_bits_[0] |= 0x00000800u;
|
||||
input_sentence_size_ = value;
|
||||
}
|
||||
inline void TrainerSpec::set_input_sentence_size(::PROTOBUF_NAMESPACE_ID::uint64 value) {
|
||||
@ -2539,7 +2561,7 @@ inline void TrainerSpec::set_input_sentence_size(::PROTOBUF_NAMESPACE_ID::uint64
|
||||
|
||||
// optional bool shuffle_input_sentence = 19 [default = true];
|
||||
inline bool TrainerSpec::_internal_has_shuffle_input_sentence() const {
|
||||
bool value = (_has_bits_[0] & 0x80000000u) != 0;
|
||||
bool value = (_has_bits_[1] & 0x00000001u) != 0;
|
||||
return value;
|
||||
}
|
||||
inline bool TrainerSpec::has_shuffle_input_sentence() const {
|
||||
@ -2547,7 +2569,7 @@ inline bool TrainerSpec::has_shuffle_input_sentence() const {
|
||||
}
|
||||
inline void TrainerSpec::clear_shuffle_input_sentence() {
|
||||
shuffle_input_sentence_ = true;
|
||||
_has_bits_[0] &= ~0x80000000u;
|
||||
_has_bits_[1] &= ~0x00000001u;
|
||||
}
|
||||
inline bool TrainerSpec::_internal_shuffle_input_sentence() const {
|
||||
return shuffle_input_sentence_;
|
||||
@ -2557,7 +2579,7 @@ inline bool TrainerSpec::shuffle_input_sentence() const {
|
||||
return _internal_shuffle_input_sentence();
|
||||
}
|
||||
inline void TrainerSpec::_internal_set_shuffle_input_sentence(bool value) {
|
||||
_has_bits_[0] |= 0x80000000u;
|
||||
_has_bits_[1] |= 0x00000001u;
|
||||
shuffle_input_sentence_ = value;
|
||||
}
|
||||
inline void TrainerSpec::set_shuffle_input_sentence(bool value) {
|
||||
@ -2567,7 +2589,7 @@ inline void TrainerSpec::set_shuffle_input_sentence(bool value) {
|
||||
|
||||
// optional int32 mining_sentence_size = 12 [deprecated = true];
|
||||
inline bool TrainerSpec::_internal_has_mining_sentence_size() const {
|
||||
bool value = (_has_bits_[0] & 0x00000200u) != 0;
|
||||
bool value = (_has_bits_[0] & 0x00000400u) != 0;
|
||||
return value;
|
||||
}
|
||||
inline bool TrainerSpec::has_mining_sentence_size() const {
|
||||
@ -2575,7 +2597,7 @@ inline bool TrainerSpec::has_mining_sentence_size() const {
|
||||
}
|
||||
inline void TrainerSpec::clear_mining_sentence_size() {
|
||||
mining_sentence_size_ = 0;
|
||||
_has_bits_[0] &= ~0x00000200u;
|
||||
_has_bits_[0] &= ~0x00000400u;
|
||||
}
|
||||
inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::_internal_mining_sentence_size() const {
|
||||
return mining_sentence_size_;
|
||||
@ -2585,7 +2607,7 @@ inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::mining_sentence_size() const
|
||||
return _internal_mining_sentence_size();
|
||||
}
|
||||
inline void TrainerSpec::_internal_set_mining_sentence_size(::PROTOBUF_NAMESPACE_ID::int32 value) {
|
||||
_has_bits_[0] |= 0x00000200u;
|
||||
_has_bits_[0] |= 0x00000400u;
|
||||
mining_sentence_size_ = value;
|
||||
}
|
||||
inline void TrainerSpec::set_mining_sentence_size(::PROTOBUF_NAMESPACE_ID::int32 value) {
|
||||
@ -2595,7 +2617,7 @@ inline void TrainerSpec::set_mining_sentence_size(::PROTOBUF_NAMESPACE_ID::int32
|
||||
|
||||
// optional int32 training_sentence_size = 13 [deprecated = true];
|
||||
inline bool TrainerSpec::_internal_has_training_sentence_size() const {
|
||||
bool value = (_has_bits_[0] & 0x00000800u) != 0;
|
||||
bool value = (_has_bits_[0] & 0x00001000u) != 0;
|
||||
return value;
|
||||
}
|
||||
inline bool TrainerSpec::has_training_sentence_size() const {
|
||||
@ -2603,7 +2625,7 @@ inline bool TrainerSpec::has_training_sentence_size() const {
|
||||
}
|
||||
inline void TrainerSpec::clear_training_sentence_size() {
|
||||
training_sentence_size_ = 0;
|
||||
_has_bits_[0] &= ~0x00000800u;
|
||||
_has_bits_[0] &= ~0x00001000u;
|
||||
}
|
||||
inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::_internal_training_sentence_size() const {
|
||||
return training_sentence_size_;
|
||||
@ -2613,7 +2635,7 @@ inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::training_sentence_size() cons
|
||||
return _internal_training_sentence_size();
|
||||
}
|
||||
inline void TrainerSpec::_internal_set_training_sentence_size(::PROTOBUF_NAMESPACE_ID::int32 value) {
|
||||
_has_bits_[0] |= 0x00000800u;
|
||||
_has_bits_[0] |= 0x00001000u;
|
||||
training_sentence_size_ = value;
|
||||
}
|
||||
inline void TrainerSpec::set_training_sentence_size(::PROTOBUF_NAMESPACE_ID::int32 value) {
|
||||
@ -2623,7 +2645,7 @@ inline void TrainerSpec::set_training_sentence_size(::PROTOBUF_NAMESPACE_ID::int
|
||||
|
||||
// optional int32 seed_sentencepiece_size = 14 [default = 1000000];
|
||||
inline bool TrainerSpec::_internal_has_seed_sentencepiece_size() const {
|
||||
bool value = (_has_bits_[0] & 0x02000000u) != 0;
|
||||
bool value = (_has_bits_[0] & 0x04000000u) != 0;
|
||||
return value;
|
||||
}
|
||||
inline bool TrainerSpec::has_seed_sentencepiece_size() const {
|
||||
@ -2631,7 +2653,7 @@ inline bool TrainerSpec::has_seed_sentencepiece_size() const {
|
||||
}
|
||||
inline void TrainerSpec::clear_seed_sentencepiece_size() {
|
||||
seed_sentencepiece_size_ = 1000000;
|
||||
_has_bits_[0] &= ~0x02000000u;
|
||||
_has_bits_[0] &= ~0x04000000u;
|
||||
}
|
||||
inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::_internal_seed_sentencepiece_size() const {
|
||||
return seed_sentencepiece_size_;
|
||||
@ -2641,7 +2663,7 @@ inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::seed_sentencepiece_size() con
|
||||
return _internal_seed_sentencepiece_size();
|
||||
}
|
||||
inline void TrainerSpec::_internal_set_seed_sentencepiece_size(::PROTOBUF_NAMESPACE_ID::int32 value) {
|
||||
_has_bits_[0] |= 0x02000000u;
|
||||
_has_bits_[0] |= 0x04000000u;
|
||||
seed_sentencepiece_size_ = value;
|
||||
}
|
||||
inline void TrainerSpec::set_seed_sentencepiece_size(::PROTOBUF_NAMESPACE_ID::int32 value) {
|
||||
@ -2651,7 +2673,7 @@ inline void TrainerSpec::set_seed_sentencepiece_size(::PROTOBUF_NAMESPACE_ID::in
|
||||
|
||||
// optional float shrinking_factor = 15 [default = 0.75];
|
||||
inline bool TrainerSpec::_internal_has_shrinking_factor() const {
|
||||
bool value = (_has_bits_[0] & 0x04000000u) != 0;
|
||||
bool value = (_has_bits_[0] & 0x08000000u) != 0;
|
||||
return value;
|
||||
}
|
||||
inline bool TrainerSpec::has_shrinking_factor() const {
|
||||
@ -2659,7 +2681,7 @@ inline bool TrainerSpec::has_shrinking_factor() const {
|
||||
}
|
||||
inline void TrainerSpec::clear_shrinking_factor() {
|
||||
shrinking_factor_ = 0.75f;
|
||||
_has_bits_[0] &= ~0x04000000u;
|
||||
_has_bits_[0] &= ~0x08000000u;
|
||||
}
|
||||
inline float TrainerSpec::_internal_shrinking_factor() const {
|
||||
return shrinking_factor_;
|
||||
@ -2669,7 +2691,7 @@ inline float TrainerSpec::shrinking_factor() const {
|
||||
return _internal_shrinking_factor();
|
||||
}
|
||||
inline void TrainerSpec::_internal_set_shrinking_factor(float value) {
|
||||
_has_bits_[0] |= 0x04000000u;
|
||||
_has_bits_[0] |= 0x08000000u;
|
||||
shrinking_factor_ = value;
|
||||
}
|
||||
inline void TrainerSpec::set_shrinking_factor(float value) {
|
||||
@ -2679,7 +2701,7 @@ inline void TrainerSpec::set_shrinking_factor(float value) {
|
||||
|
||||
// optional int32 max_sentence_length = 18 [default = 4192];
|
||||
inline bool TrainerSpec::_internal_has_max_sentence_length() const {
|
||||
bool value = (_has_bits_[0] & 0x20000000u) != 0;
|
||||
bool value = (_has_bits_[0] & 0x40000000u) != 0;
|
||||
return value;
|
||||
}
|
||||
inline bool TrainerSpec::has_max_sentence_length() const {
|
||||
@ -2687,7 +2709,7 @@ inline bool TrainerSpec::has_max_sentence_length() const {
|
||||
}
|
||||
inline void TrainerSpec::clear_max_sentence_length() {
|
||||
max_sentence_length_ = 4192;
|
||||
_has_bits_[0] &= ~0x20000000u;
|
||||
_has_bits_[0] &= ~0x40000000u;
|
||||
}
|
||||
inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::_internal_max_sentence_length() const {
|
||||
return max_sentence_length_;
|
||||
@ -2697,7 +2719,7 @@ inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::max_sentence_length() const {
|
||||
return _internal_max_sentence_length();
|
||||
}
|
||||
inline void TrainerSpec::_internal_set_max_sentence_length(::PROTOBUF_NAMESPACE_ID::int32 value) {
|
||||
_has_bits_[0] |= 0x20000000u;
|
||||
_has_bits_[0] |= 0x40000000u;
|
||||
max_sentence_length_ = value;
|
||||
}
|
||||
inline void TrainerSpec::set_max_sentence_length(::PROTOBUF_NAMESPACE_ID::int32 value) {
|
||||
@ -2707,7 +2729,7 @@ inline void TrainerSpec::set_max_sentence_length(::PROTOBUF_NAMESPACE_ID::int32
|
||||
|
||||
// optional int32 num_threads = 16 [default = 16];
|
||||
inline bool TrainerSpec::_internal_has_num_threads() const {
|
||||
bool value = (_has_bits_[0] & 0x08000000u) != 0;
|
||||
bool value = (_has_bits_[0] & 0x10000000u) != 0;
|
||||
return value;
|
||||
}
|
||||
inline bool TrainerSpec::has_num_threads() const {
|
||||
@ -2715,7 +2737,7 @@ inline bool TrainerSpec::has_num_threads() const {
|
||||
}
|
||||
inline void TrainerSpec::clear_num_threads() {
|
||||
num_threads_ = 16;
|
||||
_has_bits_[0] &= ~0x08000000u;
|
||||
_has_bits_[0] &= ~0x10000000u;
|
||||
}
|
||||
inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::_internal_num_threads() const {
|
||||
return num_threads_;
|
||||
@ -2725,7 +2747,7 @@ inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::num_threads() const {
|
||||
return _internal_num_threads();
|
||||
}
|
||||
inline void TrainerSpec::_internal_set_num_threads(::PROTOBUF_NAMESPACE_ID::int32 value) {
|
||||
_has_bits_[0] |= 0x08000000u;
|
||||
_has_bits_[0] |= 0x10000000u;
|
||||
num_threads_ = value;
|
||||
}
|
||||
inline void TrainerSpec::set_num_threads(::PROTOBUF_NAMESPACE_ID::int32 value) {
|
||||
@ -2735,7 +2757,7 @@ inline void TrainerSpec::set_num_threads(::PROTOBUF_NAMESPACE_ID::int32 value) {
|
||||
|
||||
// optional int32 num_sub_iterations = 17 [default = 2];
|
||||
inline bool TrainerSpec::_internal_has_num_sub_iterations() const {
|
||||
bool value = (_has_bits_[0] & 0x10000000u) != 0;
|
||||
bool value = (_has_bits_[0] & 0x20000000u) != 0;
|
||||
return value;
|
||||
}
|
||||
inline bool TrainerSpec::has_num_sub_iterations() const {
|
||||
@ -2743,7 +2765,7 @@ inline bool TrainerSpec::has_num_sub_iterations() const {
|
||||
}
|
||||
inline void TrainerSpec::clear_num_sub_iterations() {
|
||||
num_sub_iterations_ = 2;
|
||||
_has_bits_[0] &= ~0x10000000u;
|
||||
_has_bits_[0] &= ~0x20000000u;
|
||||
}
|
||||
inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::_internal_num_sub_iterations() const {
|
||||
return num_sub_iterations_;
|
||||
@ -2753,7 +2775,7 @@ inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::num_sub_iterations() const {
|
||||
return _internal_num_sub_iterations();
|
||||
}
|
||||
inline void TrainerSpec::_internal_set_num_sub_iterations(::PROTOBUF_NAMESPACE_ID::int32 value) {
|
||||
_has_bits_[0] |= 0x10000000u;
|
||||
_has_bits_[0] |= 0x20000000u;
|
||||
num_sub_iterations_ = value;
|
||||
}
|
||||
inline void TrainerSpec::set_num_sub_iterations(::PROTOBUF_NAMESPACE_ID::int32 value) {
|
||||
@ -2763,7 +2785,7 @@ inline void TrainerSpec::set_num_sub_iterations(::PROTOBUF_NAMESPACE_ID::int32 v
|
||||
|
||||
// optional int32 max_sentencepiece_length = 20 [default = 16];
|
||||
inline bool TrainerSpec::_internal_has_max_sentencepiece_length() const {
|
||||
bool value = (_has_bits_[0] & 0x40000000u) != 0;
|
||||
bool value = (_has_bits_[0] & 0x80000000u) != 0;
|
||||
return value;
|
||||
}
|
||||
inline bool TrainerSpec::has_max_sentencepiece_length() const {
|
||||
@ -2771,7 +2793,7 @@ inline bool TrainerSpec::has_max_sentencepiece_length() const {
|
||||
}
|
||||
inline void TrainerSpec::clear_max_sentencepiece_length() {
|
||||
max_sentencepiece_length_ = 16;
|
||||
_has_bits_[0] &= ~0x40000000u;
|
||||
_has_bits_[0] &= ~0x80000000u;
|
||||
}
|
||||
inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::_internal_max_sentencepiece_length() const {
|
||||
return max_sentencepiece_length_;
|
||||
@ -2781,7 +2803,7 @@ inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::max_sentencepiece_length() co
|
||||
return _internal_max_sentencepiece_length();
|
||||
}
|
||||
inline void TrainerSpec::_internal_set_max_sentencepiece_length(::PROTOBUF_NAMESPACE_ID::int32 value) {
|
||||
_has_bits_[0] |= 0x40000000u;
|
||||
_has_bits_[0] |= 0x80000000u;
|
||||
max_sentencepiece_length_ = value;
|
||||
}
|
||||
inline void TrainerSpec::set_max_sentencepiece_length(::PROTOBUF_NAMESPACE_ID::int32 value) {
|
||||
@ -2791,7 +2813,7 @@ inline void TrainerSpec::set_max_sentencepiece_length(::PROTOBUF_NAMESPACE_ID::i
|
||||
|
||||
// optional bool split_by_unicode_script = 21 [default = true];
|
||||
inline bool TrainerSpec::_internal_has_split_by_unicode_script() const {
|
||||
bool value = (_has_bits_[1] & 0x00000001u) != 0;
|
||||
bool value = (_has_bits_[1] & 0x00000002u) != 0;
|
||||
return value;
|
||||
}
|
||||
inline bool TrainerSpec::has_split_by_unicode_script() const {
|
||||
@ -2799,7 +2821,7 @@ inline bool TrainerSpec::has_split_by_unicode_script() const {
|
||||
}
|
||||
inline void TrainerSpec::clear_split_by_unicode_script() {
|
||||
split_by_unicode_script_ = true;
|
||||
_has_bits_[1] &= ~0x00000001u;
|
||||
_has_bits_[1] &= ~0x00000002u;
|
||||
}
|
||||
inline bool TrainerSpec::_internal_split_by_unicode_script() const {
|
||||
return split_by_unicode_script_;
|
||||
@ -2809,7 +2831,7 @@ inline bool TrainerSpec::split_by_unicode_script() const {
|
||||
return _internal_split_by_unicode_script();
|
||||
}
|
||||
inline void TrainerSpec::_internal_set_split_by_unicode_script(bool value) {
|
||||
_has_bits_[1] |= 0x00000001u;
|
||||
_has_bits_[1] |= 0x00000002u;
|
||||
split_by_unicode_script_ = value;
|
||||
}
|
||||
inline void TrainerSpec::set_split_by_unicode_script(bool value) {
|
||||
@ -2819,7 +2841,7 @@ inline void TrainerSpec::set_split_by_unicode_script(bool value) {
|
||||
|
||||
// optional bool split_by_number = 23 [default = true];
|
||||
inline bool TrainerSpec::_internal_has_split_by_number() const {
|
||||
bool value = (_has_bits_[1] & 0x00000002u) != 0;
|
||||
bool value = (_has_bits_[1] & 0x00000004u) != 0;
|
||||
return value;
|
||||
}
|
||||
inline bool TrainerSpec::has_split_by_number() const {
|
||||
@ -2827,7 +2849,7 @@ inline bool TrainerSpec::has_split_by_number() const {
|
||||
}
|
||||
inline void TrainerSpec::clear_split_by_number() {
|
||||
split_by_number_ = true;
|
||||
_has_bits_[1] &= ~0x00000002u;
|
||||
_has_bits_[1] &= ~0x00000004u;
|
||||
}
|
||||
inline bool TrainerSpec::_internal_split_by_number() const {
|
||||
return split_by_number_;
|
||||
@ -2837,7 +2859,7 @@ inline bool TrainerSpec::split_by_number() const {
|
||||
return _internal_split_by_number();
|
||||
}
|
||||
inline void TrainerSpec::_internal_set_split_by_number(bool value) {
|
||||
_has_bits_[1] |= 0x00000002u;
|
||||
_has_bits_[1] |= 0x00000004u;
|
||||
split_by_number_ = value;
|
||||
}
|
||||
inline void TrainerSpec::set_split_by_number(bool value) {
|
||||
@ -2847,7 +2869,7 @@ inline void TrainerSpec::set_split_by_number(bool value) {
|
||||
|
||||
// optional bool split_by_whitespace = 22 [default = true];
|
||||
inline bool TrainerSpec::_internal_has_split_by_whitespace() const {
|
||||
bool value = (_has_bits_[1] & 0x00000004u) != 0;
|
||||
bool value = (_has_bits_[1] & 0x00000008u) != 0;
|
||||
return value;
|
||||
}
|
||||
inline bool TrainerSpec::has_split_by_whitespace() const {
|
||||
@ -2855,7 +2877,7 @@ inline bool TrainerSpec::has_split_by_whitespace() const {
|
||||
}
|
||||
inline void TrainerSpec::clear_split_by_whitespace() {
|
||||
split_by_whitespace_ = true;
|
||||
_has_bits_[1] &= ~0x00000004u;
|
||||
_has_bits_[1] &= ~0x00000008u;
|
||||
}
|
||||
inline bool TrainerSpec::_internal_split_by_whitespace() const {
|
||||
return split_by_whitespace_;
|
||||
@ -2865,7 +2887,7 @@ inline bool TrainerSpec::split_by_whitespace() const {
|
||||
return _internal_split_by_whitespace();
|
||||
}
|
||||
inline void TrainerSpec::_internal_set_split_by_whitespace(bool value) {
|
||||
_has_bits_[1] |= 0x00000004u;
|
||||
_has_bits_[1] |= 0x00000008u;
|
||||
split_by_whitespace_ = value;
|
||||
}
|
||||
inline void TrainerSpec::set_split_by_whitespace(bool value) {
|
||||
@ -2875,7 +2897,7 @@ inline void TrainerSpec::set_split_by_whitespace(bool value) {
|
||||
|
||||
// optional bool treat_whitespace_as_suffix = 24 [default = false];
|
||||
inline bool TrainerSpec::_internal_has_treat_whitespace_as_suffix() const {
|
||||
bool value = (_has_bits_[0] & 0x00002000u) != 0;
|
||||
bool value = (_has_bits_[0] & 0x00004000u) != 0;
|
||||
return value;
|
||||
}
|
||||
inline bool TrainerSpec::has_treat_whitespace_as_suffix() const {
|
||||
@ -2883,7 +2905,7 @@ inline bool TrainerSpec::has_treat_whitespace_as_suffix() const {
|
||||
}
|
||||
inline void TrainerSpec::clear_treat_whitespace_as_suffix() {
|
||||
treat_whitespace_as_suffix_ = false;
|
||||
_has_bits_[0] &= ~0x00002000u;
|
||||
_has_bits_[0] &= ~0x00004000u;
|
||||
}
|
||||
inline bool TrainerSpec::_internal_treat_whitespace_as_suffix() const {
|
||||
return treat_whitespace_as_suffix_;
|
||||
@ -2893,7 +2915,7 @@ inline bool TrainerSpec::treat_whitespace_as_suffix() const {
|
||||
return _internal_treat_whitespace_as_suffix();
|
||||
}
|
||||
inline void TrainerSpec::_internal_set_treat_whitespace_as_suffix(bool value) {
|
||||
_has_bits_[0] |= 0x00002000u;
|
||||
_has_bits_[0] |= 0x00004000u;
|
||||
treat_whitespace_as_suffix_ = value;
|
||||
}
|
||||
inline void TrainerSpec::set_treat_whitespace_as_suffix(bool value) {
|
||||
@ -2903,7 +2925,7 @@ inline void TrainerSpec::set_treat_whitespace_as_suffix(bool value) {
|
||||
|
||||
// optional bool allow_whitespace_only_pieces = 26 [default = false];
|
||||
inline bool TrainerSpec::_internal_has_allow_whitespace_only_pieces() const {
|
||||
bool value = (_has_bits_[0] & 0x00004000u) != 0;
|
||||
bool value = (_has_bits_[0] & 0x00008000u) != 0;
|
||||
return value;
|
||||
}
|
||||
inline bool TrainerSpec::has_allow_whitespace_only_pieces() const {
|
||||
@ -2911,7 +2933,7 @@ inline bool TrainerSpec::has_allow_whitespace_only_pieces() const {
|
||||
}
|
||||
inline void TrainerSpec::clear_allow_whitespace_only_pieces() {
|
||||
allow_whitespace_only_pieces_ = false;
|
||||
_has_bits_[0] &= ~0x00004000u;
|
||||
_has_bits_[0] &= ~0x00008000u;
|
||||
}
|
||||
inline bool TrainerSpec::_internal_allow_whitespace_only_pieces() const {
|
||||
return allow_whitespace_only_pieces_;
|
||||
@ -2921,7 +2943,7 @@ inline bool TrainerSpec::allow_whitespace_only_pieces() const {
|
||||
return _internal_allow_whitespace_only_pieces();
|
||||
}
|
||||
inline void TrainerSpec::_internal_set_allow_whitespace_only_pieces(bool value) {
|
||||
_has_bits_[0] |= 0x00004000u;
|
||||
_has_bits_[0] |= 0x00008000u;
|
||||
allow_whitespace_only_pieces_ = value;
|
||||
}
|
||||
inline void TrainerSpec::set_allow_whitespace_only_pieces(bool value) {
|
||||
@ -2931,7 +2953,7 @@ inline void TrainerSpec::set_allow_whitespace_only_pieces(bool value) {
|
||||
|
||||
// optional bool split_digits = 25 [default = false];
|
||||
inline bool TrainerSpec::_internal_has_split_digits() const {
|
||||
bool value = (_has_bits_[0] & 0x00008000u) != 0;
|
||||
bool value = (_has_bits_[0] & 0x00010000u) != 0;
|
||||
return value;
|
||||
}
|
||||
inline bool TrainerSpec::has_split_digits() const {
|
||||
@ -2939,7 +2961,7 @@ inline bool TrainerSpec::has_split_digits() const {
|
||||
}
|
||||
inline void TrainerSpec::clear_split_digits() {
|
||||
split_digits_ = false;
|
||||
_has_bits_[0] &= ~0x00008000u;
|
||||
_has_bits_[0] &= ~0x00010000u;
|
||||
}
|
||||
inline bool TrainerSpec::_internal_split_digits() const {
|
||||
return split_digits_;
|
||||
@ -2949,7 +2971,7 @@ inline bool TrainerSpec::split_digits() const {
|
||||
return _internal_split_digits();
|
||||
}
|
||||
inline void TrainerSpec::_internal_set_split_digits(bool value) {
|
||||
_has_bits_[0] |= 0x00008000u;
|
||||
_has_bits_[0] |= 0x00010000u;
|
||||
split_digits_ = value;
|
||||
}
|
||||
inline void TrainerSpec::set_split_digits(bool value) {
|
||||
@ -2957,6 +2979,79 @@ inline void TrainerSpec::set_split_digits(bool value) {
|
||||
// @@protoc_insertion_point(field_set:sentencepiece.TrainerSpec.split_digits)
|
||||
}
|
||||
|
||||
// optional string pretokenization_delimiter = 53 [default = ""];
|
||||
inline bool TrainerSpec::_internal_has_pretokenization_delimiter() const {
|
||||
bool value = (_has_bits_[0] & 0x00000100u) != 0;
|
||||
return value;
|
||||
}
|
||||
inline bool TrainerSpec::has_pretokenization_delimiter() const {
|
||||
return _internal_has_pretokenization_delimiter();
|
||||
}
|
||||
inline void TrainerSpec::clear_pretokenization_delimiter() {
|
||||
pretokenization_delimiter_.ClearToEmpty();
|
||||
_has_bits_[0] &= ~0x00000100u;
|
||||
}
|
||||
inline const std::string& TrainerSpec::pretokenization_delimiter() const {
|
||||
// @@protoc_insertion_point(field_get:sentencepiece.TrainerSpec.pretokenization_delimiter)
|
||||
return _internal_pretokenization_delimiter();
|
||||
}
|
||||
inline void TrainerSpec::set_pretokenization_delimiter(const std::string& value) {
|
||||
_internal_set_pretokenization_delimiter(value);
|
||||
// @@protoc_insertion_point(field_set:sentencepiece.TrainerSpec.pretokenization_delimiter)
|
||||
}
|
||||
inline std::string* TrainerSpec::mutable_pretokenization_delimiter() {
|
||||
// @@protoc_insertion_point(field_mutable:sentencepiece.TrainerSpec.pretokenization_delimiter)
|
||||
return _internal_mutable_pretokenization_delimiter();
|
||||
}
|
||||
inline const std::string& TrainerSpec::_internal_pretokenization_delimiter() const {
|
||||
return pretokenization_delimiter_.Get();
|
||||
}
|
||||
inline void TrainerSpec::_internal_set_pretokenization_delimiter(const std::string& value) {
|
||||
_has_bits_[0] |= 0x00000100u;
|
||||
pretokenization_delimiter_.Set(::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr::EmptyDefault{}, value, GetArena());
|
||||
}
|
||||
inline void TrainerSpec::set_pretokenization_delimiter(std::string&& value) {
|
||||
_has_bits_[0] |= 0x00000100u;
|
||||
pretokenization_delimiter_.Set(
|
||||
::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr::EmptyDefault{}, ::std::move(value), GetArena());
|
||||
// @@protoc_insertion_point(field_set_rvalue:sentencepiece.TrainerSpec.pretokenization_delimiter)
|
||||
}
|
||||
inline void TrainerSpec::set_pretokenization_delimiter(const char* value) {
|
||||
GOOGLE_DCHECK(value != nullptr);
|
||||
_has_bits_[0] |= 0x00000100u;
|
||||
pretokenization_delimiter_.Set(::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr::EmptyDefault{}, ::std::string(value), GetArena());
|
||||
// @@protoc_insertion_point(field_set_char:sentencepiece.TrainerSpec.pretokenization_delimiter)
|
||||
}
|
||||
inline void TrainerSpec::set_pretokenization_delimiter(const char* value,
|
||||
size_t size) {
|
||||
_has_bits_[0] |= 0x00000100u;
|
||||
pretokenization_delimiter_.Set(::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr::EmptyDefault{}, ::std::string(
|
||||
reinterpret_cast<const char*>(value), size), GetArena());
|
||||
// @@protoc_insertion_point(field_set_pointer:sentencepiece.TrainerSpec.pretokenization_delimiter)
|
||||
}
|
||||
inline std::string* TrainerSpec::_internal_mutable_pretokenization_delimiter() {
|
||||
_has_bits_[0] |= 0x00000100u;
|
||||
return pretokenization_delimiter_.Mutable(::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr::EmptyDefault{}, GetArena());
|
||||
}
|
||||
inline std::string* TrainerSpec::release_pretokenization_delimiter() {
|
||||
// @@protoc_insertion_point(field_release:sentencepiece.TrainerSpec.pretokenization_delimiter)
|
||||
if (!_internal_has_pretokenization_delimiter()) {
|
||||
return nullptr;
|
||||
}
|
||||
_has_bits_[0] &= ~0x00000100u;
|
||||
return pretokenization_delimiter_.ReleaseNonDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
|
||||
}
|
||||
// Hands `pretokenization_delimiter` to the field via SetAllocated(). A null
// pointer clears the has-bit; a non-null pointer sets it.
inline void TrainerSpec::set_allocated_pretokenization_delimiter(std::string* pretokenization_delimiter) {
  if (pretokenization_delimiter == nullptr) {
    _has_bits_[0] &= ~0x00000100u;
  } else {
    _has_bits_[0] |= 0x00000100u;
  }
  pretokenization_delimiter_.SetAllocated(
      &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(),
      pretokenization_delimiter, GetArena());
  // @@protoc_insertion_point(field_set_allocated:sentencepiece.TrainerSpec.pretokenization_delimiter)
}
|
||||
|
||||
// repeated string control_symbols = 30;
|
||||
inline int TrainerSpec::_internal_control_symbols_size() const {
|
||||
return control_symbols_.size();
|
||||
@ -3180,7 +3275,7 @@ inline void TrainerSpec::set_allocated_required_chars(std::string* required_char
|
||||
|
||||
// optional bool byte_fallback = 35 [default = false];
|
||||
inline bool TrainerSpec::_internal_has_byte_fallback() const {
|
||||
bool value = (_has_bits_[0] & 0x00010000u) != 0;
|
||||
bool value = (_has_bits_[0] & 0x00020000u) != 0;
|
||||
return value;
|
||||
}
|
||||
inline bool TrainerSpec::has_byte_fallback() const {
|
||||
@ -3188,7 +3283,7 @@ inline bool TrainerSpec::has_byte_fallback() const {
|
||||
}
|
||||
inline void TrainerSpec::clear_byte_fallback() {
|
||||
byte_fallback_ = false;
|
||||
_has_bits_[0] &= ~0x00010000u;
|
||||
_has_bits_[0] &= ~0x00020000u;
|
||||
}
|
||||
inline bool TrainerSpec::_internal_byte_fallback() const {
|
||||
return byte_fallback_;
|
||||
@ -3198,7 +3293,7 @@ inline bool TrainerSpec::byte_fallback() const {
|
||||
return _internal_byte_fallback();
|
||||
}
|
||||
inline void TrainerSpec::_internal_set_byte_fallback(bool value) {
|
||||
_has_bits_[0] |= 0x00010000u;
|
||||
_has_bits_[0] |= 0x00020000u;
|
||||
byte_fallback_ = value;
|
||||
}
|
||||
inline void TrainerSpec::set_byte_fallback(bool value) {
|
||||
@ -3208,7 +3303,7 @@ inline void TrainerSpec::set_byte_fallback(bool value) {
|
||||
|
||||
// optional bool vocabulary_output_piece_score = 32 [default = true];
|
||||
inline bool TrainerSpec::_internal_has_vocabulary_output_piece_score() const {
|
||||
bool value = (_has_bits_[1] & 0x00000008u) != 0;
|
||||
bool value = (_has_bits_[1] & 0x00000010u) != 0;
|
||||
return value;
|
||||
}
|
||||
inline bool TrainerSpec::has_vocabulary_output_piece_score() const {
|
||||
@ -3216,7 +3311,7 @@ inline bool TrainerSpec::has_vocabulary_output_piece_score() const {
|
||||
}
|
||||
inline void TrainerSpec::clear_vocabulary_output_piece_score() {
|
||||
vocabulary_output_piece_score_ = true;
|
||||
_has_bits_[1] &= ~0x00000008u;
|
||||
_has_bits_[1] &= ~0x00000010u;
|
||||
}
|
||||
inline bool TrainerSpec::_internal_vocabulary_output_piece_score() const {
|
||||
return vocabulary_output_piece_score_;
|
||||
@ -3226,7 +3321,7 @@ inline bool TrainerSpec::vocabulary_output_piece_score() const {
|
||||
return _internal_vocabulary_output_piece_score();
|
||||
}
|
||||
inline void TrainerSpec::_internal_set_vocabulary_output_piece_score(bool value) {
|
||||
_has_bits_[1] |= 0x00000008u;
|
||||
_has_bits_[1] |= 0x00000010u;
|
||||
vocabulary_output_piece_score_ = value;
|
||||
}
|
||||
inline void TrainerSpec::set_vocabulary_output_piece_score(bool value) {
|
||||
@ -3236,7 +3331,7 @@ inline void TrainerSpec::set_vocabulary_output_piece_score(bool value) {
|
||||
|
||||
// optional bool hard_vocab_limit = 33 [default = true];
|
||||
inline bool TrainerSpec::_internal_has_hard_vocab_limit() const {
|
||||
bool value = (_has_bits_[1] & 0x00000010u) != 0;
|
||||
bool value = (_has_bits_[1] & 0x00000020u) != 0;
|
||||
return value;
|
||||
}
|
||||
inline bool TrainerSpec::has_hard_vocab_limit() const {
|
||||
@ -3244,7 +3339,7 @@ inline bool TrainerSpec::has_hard_vocab_limit() const {
|
||||
}
|
||||
inline void TrainerSpec::clear_hard_vocab_limit() {
|
||||
hard_vocab_limit_ = true;
|
||||
_has_bits_[1] &= ~0x00000010u;
|
||||
_has_bits_[1] &= ~0x00000020u;
|
||||
}
|
||||
inline bool TrainerSpec::_internal_hard_vocab_limit() const {
|
||||
return hard_vocab_limit_;
|
||||
@ -3254,7 +3349,7 @@ inline bool TrainerSpec::hard_vocab_limit() const {
|
||||
return _internal_hard_vocab_limit();
|
||||
}
|
||||
inline void TrainerSpec::_internal_set_hard_vocab_limit(bool value) {
|
||||
_has_bits_[1] |= 0x00000010u;
|
||||
_has_bits_[1] |= 0x00000020u;
|
||||
hard_vocab_limit_ = value;
|
||||
}
|
||||
inline void TrainerSpec::set_hard_vocab_limit(bool value) {
|
||||
@ -3264,7 +3359,7 @@ inline void TrainerSpec::set_hard_vocab_limit(bool value) {
|
||||
|
||||
// optional bool use_all_vocab = 34 [default = false];
|
||||
inline bool TrainerSpec::_internal_has_use_all_vocab() const {
|
||||
bool value = (_has_bits_[0] & 0x00020000u) != 0;
|
||||
bool value = (_has_bits_[0] & 0x00040000u) != 0;
|
||||
return value;
|
||||
}
|
||||
inline bool TrainerSpec::has_use_all_vocab() const {
|
||||
@ -3272,7 +3367,7 @@ inline bool TrainerSpec::has_use_all_vocab() const {
|
||||
}
|
||||
inline void TrainerSpec::clear_use_all_vocab() {
|
||||
use_all_vocab_ = false;
|
||||
_has_bits_[0] &= ~0x00020000u;
|
||||
_has_bits_[0] &= ~0x00040000u;
|
||||
}
|
||||
inline bool TrainerSpec::_internal_use_all_vocab() const {
|
||||
return use_all_vocab_;
|
||||
@ -3282,7 +3377,7 @@ inline bool TrainerSpec::use_all_vocab() const {
|
||||
return _internal_use_all_vocab();
|
||||
}
|
||||
inline void TrainerSpec::_internal_set_use_all_vocab(bool value) {
|
||||
_has_bits_[0] |= 0x00020000u;
|
||||
_has_bits_[0] |= 0x00040000u;
|
||||
use_all_vocab_ = value;
|
||||
}
|
||||
inline void TrainerSpec::set_use_all_vocab(bool value) {
|
||||
@ -3292,7 +3387,7 @@ inline void TrainerSpec::set_use_all_vocab(bool value) {
|
||||
|
||||
// optional int32 unk_id = 40 [default = 0];
|
||||
inline bool TrainerSpec::_internal_has_unk_id() const {
|
||||
bool value = (_has_bits_[0] & 0x00080000u) != 0;
|
||||
bool value = (_has_bits_[0] & 0x00100000u) != 0;
|
||||
return value;
|
||||
}
|
||||
inline bool TrainerSpec::has_unk_id() const {
|
||||
@ -3300,7 +3395,7 @@ inline bool TrainerSpec::has_unk_id() const {
|
||||
}
|
||||
inline void TrainerSpec::clear_unk_id() {
|
||||
unk_id_ = 0;
|
||||
_has_bits_[0] &= ~0x00080000u;
|
||||
_has_bits_[0] &= ~0x00100000u;
|
||||
}
|
||||
inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::_internal_unk_id() const {
|
||||
return unk_id_;
|
||||
@ -3310,7 +3405,7 @@ inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::unk_id() const {
|
||||
return _internal_unk_id();
|
||||
}
|
||||
inline void TrainerSpec::_internal_set_unk_id(::PROTOBUF_NAMESPACE_ID::int32 value) {
|
||||
_has_bits_[0] |= 0x00080000u;
|
||||
_has_bits_[0] |= 0x00100000u;
|
||||
unk_id_ = value;
|
||||
}
|
||||
inline void TrainerSpec::set_unk_id(::PROTOBUF_NAMESPACE_ID::int32 value) {
|
||||
@ -3320,7 +3415,7 @@ inline void TrainerSpec::set_unk_id(::PROTOBUF_NAMESPACE_ID::int32 value) {
|
||||
|
||||
// optional int32 bos_id = 41 [default = 1];
|
||||
inline bool TrainerSpec::_internal_has_bos_id() const {
|
||||
bool value = (_has_bits_[1] & 0x00000020u) != 0;
|
||||
bool value = (_has_bits_[1] & 0x00000040u) != 0;
|
||||
return value;
|
||||
}
|
||||
inline bool TrainerSpec::has_bos_id() const {
|
||||
@ -3328,7 +3423,7 @@ inline bool TrainerSpec::has_bos_id() const {
|
||||
}
|
||||
inline void TrainerSpec::clear_bos_id() {
|
||||
bos_id_ = 1;
|
||||
_has_bits_[1] &= ~0x00000020u;
|
||||
_has_bits_[1] &= ~0x00000040u;
|
||||
}
|
||||
inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::_internal_bos_id() const {
|
||||
return bos_id_;
|
||||
@ -3338,7 +3433,7 @@ inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::bos_id() const {
|
||||
return _internal_bos_id();
|
||||
}
|
||||
inline void TrainerSpec::_internal_set_bos_id(::PROTOBUF_NAMESPACE_ID::int32 value) {
|
||||
_has_bits_[1] |= 0x00000020u;
|
||||
_has_bits_[1] |= 0x00000040u;
|
||||
bos_id_ = value;
|
||||
}
|
||||
inline void TrainerSpec::set_bos_id(::PROTOBUF_NAMESPACE_ID::int32 value) {
|
||||
@ -3348,7 +3443,7 @@ inline void TrainerSpec::set_bos_id(::PROTOBUF_NAMESPACE_ID::int32 value) {
|
||||
|
||||
// optional int32 eos_id = 42 [default = 2];
|
||||
inline bool TrainerSpec::_internal_has_eos_id() const {
|
||||
bool value = (_has_bits_[1] & 0x00000040u) != 0;
|
||||
bool value = (_has_bits_[1] & 0x00000080u) != 0;
|
||||
return value;
|
||||
}
|
||||
inline bool TrainerSpec::has_eos_id() const {
|
||||
@ -3356,7 +3451,7 @@ inline bool TrainerSpec::has_eos_id() const {
|
||||
}
|
||||
inline void TrainerSpec::clear_eos_id() {
|
||||
eos_id_ = 2;
|
||||
_has_bits_[1] &= ~0x00000040u;
|
||||
_has_bits_[1] &= ~0x00000080u;
|
||||
}
|
||||
inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::_internal_eos_id() const {
|
||||
return eos_id_;
|
||||
@ -3366,7 +3461,7 @@ inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::eos_id() const {
|
||||
return _internal_eos_id();
|
||||
}
|
||||
inline void TrainerSpec::_internal_set_eos_id(::PROTOBUF_NAMESPACE_ID::int32 value) {
|
||||
_has_bits_[1] |= 0x00000040u;
|
||||
_has_bits_[1] |= 0x00000080u;
|
||||
eos_id_ = value;
|
||||
}
|
||||
inline void TrainerSpec::set_eos_id(::PROTOBUF_NAMESPACE_ID::int32 value) {
|
||||
@ -3376,7 +3471,7 @@ inline void TrainerSpec::set_eos_id(::PROTOBUF_NAMESPACE_ID::int32 value) {
|
||||
|
||||
// optional int32 pad_id = 43 [default = -1];
|
||||
inline bool TrainerSpec::_internal_has_pad_id() const {
|
||||
bool value = (_has_bits_[1] & 0x00000080u) != 0;
|
||||
bool value = (_has_bits_[1] & 0x00000100u) != 0;
|
||||
return value;
|
||||
}
|
||||
inline bool TrainerSpec::has_pad_id() const {
|
||||
@ -3384,7 +3479,7 @@ inline bool TrainerSpec::has_pad_id() const {
|
||||
}
|
||||
inline void TrainerSpec::clear_pad_id() {
|
||||
pad_id_ = -1;
|
||||
_has_bits_[1] &= ~0x00000080u;
|
||||
_has_bits_[1] &= ~0x00000100u;
|
||||
}
|
||||
inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::_internal_pad_id() const {
|
||||
return pad_id_;
|
||||
@ -3394,7 +3489,7 @@ inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::pad_id() const {
|
||||
return _internal_pad_id();
|
||||
}
|
||||
inline void TrainerSpec::_internal_set_pad_id(::PROTOBUF_NAMESPACE_ID::int32 value) {
|
||||
_has_bits_[1] |= 0x00000080u;
|
||||
_has_bits_[1] |= 0x00000100u;
|
||||
pad_id_ = value;
|
||||
}
|
||||
inline void TrainerSpec::set_pad_id(::PROTOBUF_NAMESPACE_ID::int32 value) {
|
||||
@ -3774,7 +3869,7 @@ inline void TrainerSpec::set_allocated_unk_surface(std::string* unk_surface) {
|
||||
|
||||
// optional bool train_extremely_large_corpus = 49 [default = false];
|
||||
inline bool TrainerSpec::_internal_has_train_extremely_large_corpus() const {
|
||||
bool value = (_has_bits_[0] & 0x00040000u) != 0;
|
||||
bool value = (_has_bits_[0] & 0x00080000u) != 0;
|
||||
return value;
|
||||
}
|
||||
inline bool TrainerSpec::has_train_extremely_large_corpus() const {
|
||||
@ -3782,7 +3877,7 @@ inline bool TrainerSpec::has_train_extremely_large_corpus() const {
|
||||
}
|
||||
inline void TrainerSpec::clear_train_extremely_large_corpus() {
|
||||
train_extremely_large_corpus_ = false;
|
||||
_has_bits_[0] &= ~0x00040000u;
|
||||
_has_bits_[0] &= ~0x00080000u;
|
||||
}
|
||||
inline bool TrainerSpec::_internal_train_extremely_large_corpus() const {
|
||||
return train_extremely_large_corpus_;
|
||||
@ -3792,7 +3887,7 @@ inline bool TrainerSpec::train_extremely_large_corpus() const {
|
||||
return _internal_train_extremely_large_corpus();
|
||||
}
|
||||
inline void TrainerSpec::_internal_set_train_extremely_large_corpus(bool value) {
|
||||
_has_bits_[0] |= 0x00040000u;
|
||||
_has_bits_[0] |= 0x00080000u;
|
||||
train_extremely_large_corpus_ = value;
|
||||
}
|
||||
inline void TrainerSpec::set_train_extremely_large_corpus(bool value) {
|
||||
|
@ -11,9 +11,10 @@
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.!
|
||||
#include "pretokenizer_for_training.h"
|
||||
|
||||
#include <string>
|
||||
|
||||
#include "pretokenizer_for_training.h"
|
||||
#include "third_party/absl/strings/str_replace.h"
|
||||
|
||||
namespace sentencepiece {
|
||||
@ -24,10 +25,9 @@ namespace {
|
||||
// defined them explicitly to avoid the dependency to trainer_interface.
|
||||
// Currently, we have no separated build rules.
|
||||
const char kWSStr[] = "\xe2\x96\x81";
|
||||
const char kUPPBoundaryStr[] = "\t";
|
||||
} // namespace
|
||||
|
||||
std::string PretokenizerForTrainingInterface::PreTokenize(
|
||||
std::vector<std::string> PretokenizerForTrainingInterface::PreTokenize(
|
||||
absl::string_view text) const {
|
||||
return Postprocess(Tokenize(Preprocess(text)));
|
||||
}
|
||||
@ -40,14 +40,17 @@ std::string PretokenizerForTrainingInterface::Preprocess(
|
||||
}
|
||||
|
||||
// static
|
||||
std::string PretokenizerForTrainingInterface::Postprocess(
|
||||
std::vector<std::string> PretokenizerForTrainingInterface::Postprocess(
|
||||
const SentencePieceText &spt) {
|
||||
// Inserts kUPPBoundaryStr before/after of token boundaries.
|
||||
std::vector<std::string> result;
|
||||
std::string output;
|
||||
|
||||
int prev = 0;
|
||||
for (const auto &piece : spt.pieces()) {
|
||||
if (prev == piece.begin() && piece.begin() != 0) {
|
||||
output += kUPPBoundaryStr;
|
||||
result.push_back(output);
|
||||
output.clear();
|
||||
} else {
|
||||
output.append(piece.begin() - prev, ' ');
|
||||
}
|
||||
@ -55,8 +58,11 @@ std::string PretokenizerForTrainingInterface::Postprocess(
|
||||
prev = piece.end();
|
||||
}
|
||||
|
||||
// Restores kWSStr.
|
||||
return absl::StrReplaceAll(output, {{" ", kWSStr}});
|
||||
if (!output.empty()) result.push_back(output);
|
||||
|
||||
for (auto &w : result) w = absl::StrReplaceAll(w, {{" ", kWSStr}});
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
} // namespace pretokenizer
|
||||
|
@ -44,7 +44,7 @@ class PretokenizerForTrainingInterface {
|
||||
// segmentation: piece[0] = {0, 1}, piece[1] = {2, 6},
|
||||
// piece[2] = {7, 15}, piece[3] = {15, 20}
|
||||
// output: I love sentence<tab>piece.
|
||||
std::string PreTokenize(absl::string_view text) const;
|
||||
std::vector<std::string> PreTokenize(absl::string_view text) const;
|
||||
|
||||
// Returns pre-tokenized result.
|
||||
// Note that the pre-tokenized constraint is specified with the
|
||||
@ -54,7 +54,7 @@ class PretokenizerForTrainingInterface {
|
||||
|
||||
private:
|
||||
static std::string Preprocess(absl::string_view text);
|
||||
static std::string Postprocess(const SentencePieceText &spt);
|
||||
static std::vector<std::string> Postprocess(const SentencePieceText &spt);
|
||||
};
|
||||
|
||||
} // namespace pretokenizer
|
||||
|
@ -12,8 +12,11 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.!
|
||||
#include "pretokenizer_for_training.h"
|
||||
|
||||
#include "testharness.h"
|
||||
#include "third_party/absl/strings/str_cat.h"
|
||||
#include "third_party/absl/strings/str_join.h"
|
||||
#include "third_party/absl/strings/str_split.h"
|
||||
#include "trainer_interface.h"
|
||||
|
||||
namespace sentencepiece {
|
||||
@ -64,9 +67,11 @@ TEST(PretokenizerForTrainingTest, BaseTest) {
|
||||
|
||||
mock.SetOutput(spt);
|
||||
|
||||
EXPECT_EQ(absl::StrCat("I", TrainerInterface::kWSStr, "love",
|
||||
TrainerInterface::kWSStr, "sentence\tpiece"),
|
||||
mock.PreTokenize("I love sentencepiece"));
|
||||
const auto expected =
|
||||
absl::StrCat("I", TrainerInterface::kWSStr, "love",
|
||||
TrainerInterface::kWSStr, "sentence||||piece");
|
||||
EXPECT_EQ(expected,
|
||||
absl::StrJoin(mock.PreTokenize("I love sentencepiece"), "||||"));
|
||||
}
|
||||
|
||||
{
|
||||
@ -94,7 +99,9 @@ TEST(PretokenizerForTrainingTest, BaseTest) {
|
||||
|
||||
mock.SetOutput(spt);
|
||||
|
||||
EXPECT_EQ("これ\tは\tペン\tです", mock.PreTokenize("これはペンです"));
|
||||
const auto expected = "これ||||は||||ペン||||です";
|
||||
EXPECT_EQ(expected,
|
||||
absl::StrJoin(mock.PreTokenize("これはペンです"), "||||"));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -20,7 +20,7 @@ option optimize_for = LITE_RUNTIME;
|
||||
package sentencepiece;
|
||||
|
||||
// TrainerSpec encodes various parameters for SentencePiece training.
|
||||
// Next id: 53
|
||||
// Next id: 54
|
||||
message TrainerSpec {
|
||||
///////////////////////////////////////////////////////////////////
|
||||
// General parameters
|
||||
@ -157,6 +157,13 @@ message TrainerSpec {
|
||||
// Split all digits (0-9) into separate pieces.
|
||||
optional bool split_digits = 25 [default = false];
|
||||
|
||||
// Defines the pre-tokenization delimiter.
|
||||
// When specified, pieces crossing this delimiter are not included
|
||||
// in the vocab. Then the delimiter string is virtually ignored
|
||||
// during the training. This field allows constraints on the vocabulary
|
||||
// selection. Note that this field is only available in unigram mode.
|
||||
optional string pretokenization_delimiter = 53 [ default = ""];
|
||||
|
||||
///////////////////////////////////////////////////////////////////
|
||||
// Vocabulary management
|
||||
//
|
||||
|
@ -144,6 +144,7 @@ inline std::string PrintProto(const TrainerSpec &message,
|
||||
PRINT_PARAM(split_by_number);
|
||||
PRINT_PARAM(split_by_whitespace);
|
||||
PRINT_PARAM(split_digits);
|
||||
PRINT_PARAM(pretokenization_delimiter);
|
||||
PRINT_PARAM(treat_whitespace_as_suffix);
|
||||
PRINT_PARAM(allow_whitespace_only_pieces);
|
||||
PRINT_REPEATED_STRING(control_symbols);
|
||||
@ -222,6 +223,7 @@ util::Status SentencePieceTrainer::SetProtoField(absl::string_view name,
|
||||
PARSE_BOOL(split_by_number);
|
||||
PARSE_BOOL(split_by_whitespace);
|
||||
PARSE_BOOL(split_digits);
|
||||
PARSE_STRING(pretokenization_delimiter);
|
||||
PARSE_BOOL(treat_whitespace_as_suffix);
|
||||
PARSE_BOOL(allow_whitespace_only_pieces);
|
||||
PARSE_REPEATED_STRING(control_symbols);
|
||||
|
@ -77,6 +77,9 @@ ABSL_FLAG(bool, split_by_whitespace, kDefaultTrainerSpec.split_by_whitespace(),
|
||||
"use a white space to split sentence pieces");
|
||||
ABSL_FLAG(bool, split_digits, kDefaultTrainerSpec.split_digits(),
|
||||
"split all digits (0-9) into separate pieces");
|
||||
ABSL_FLAG(std::string, pretokenization_delimiter,
|
||||
kDefaultTrainerSpec.pretokenization_delimiter(),
|
||||
"specifies the delimiter of pre-tokenization");
|
||||
ABSL_FLAG(bool, treat_whitespace_as_suffix,
|
||||
kDefaultTrainerSpec.treat_whitespace_as_suffix(),
|
||||
"treat whitespace marker as suffix instead of prefix.");
|
||||
@ -227,6 +230,7 @@ int main(int argc, char *argv[]) {
|
||||
SetTrainerSpecFromFlag(split_by_whitespace);
|
||||
SetTrainerSpecFromFlag(split_by_number);
|
||||
SetTrainerSpecFromFlag(split_digits);
|
||||
SetTrainerSpecFromFlag(pretokenization_delimiter);
|
||||
SetTrainerSpecFromFlag(byte_fallback);
|
||||
SetTrainerSpecFromFlag(treat_whitespace_as_suffix);
|
||||
SetTrainerSpecFromFlag(allow_whitespace_only_pieces);
|
||||
|
@ -81,7 +81,8 @@ util::Status VerifySpec(const TrainerSpec &trainer_spec) {
|
||||
CHECK_OR_RETURN(!trainer_spec.eos_piece().empty());
|
||||
CHECK_OR_RETURN(!trainer_spec.pad_piece().empty());
|
||||
|
||||
if (SentencePieceTrainer::GetPretokenizerForTraining()) {
|
||||
if (SentencePieceTrainer::GetPretokenizerForTraining() ||
|
||||
!trainer_spec.pretokenization_delimiter().empty()) {
|
||||
CHECK_EQ_OR_RETURN(TrainerSpec::UNIGRAM, trainer_spec.model_type())
|
||||
<< "PretokenizerForTraining is only supported in UNIGRAM mode.";
|
||||
}
|
||||
|
@ -461,7 +461,7 @@ std::vector<Lattice::LatticePathWithScore> Lattice::NBest(size_t nbest_size,
|
||||
} else {
|
||||
hyp->gx = lnode->score + top->gx; // just adds node->score
|
||||
hyp->fx =
|
||||
lnode->backtrace_score + top->gx; // backtrace_score is h(node).
|
||||
lnode->backtrace_score + hyp->gx; // backtrace_score is h(node).
|
||||
}
|
||||
hyp->next = top;
|
||||
agenda.push(hyp);
|
||||
|
@ -28,7 +28,10 @@
|
||||
#include "pretokenizer_for_training.h"
|
||||
#include "sentencepiece_trainer.h"
|
||||
#include "third_party/absl/container/flat_hash_map.h"
|
||||
#include "third_party/absl/container/flat_hash_set.h"
|
||||
#include "third_party/absl/memory/memory.h"
|
||||
#include "third_party/absl/strings/str_replace.h"
|
||||
#include "third_party/absl/strings/str_split.h"
|
||||
#include "third_party/esaxx/esa.hxx" // Suffix array library.
|
||||
#include "unicode_script.h"
|
||||
#include "util.h"
|
||||
@ -37,6 +40,9 @@ namespace sentencepiece {
|
||||
namespace unigram {
|
||||
namespace {
|
||||
|
||||
constexpr char32 kSentenceBoundary = 0x0000;
|
||||
constexpr char32 kWsMarker = 0x2581;
|
||||
|
||||
double Digamma(double x) {
|
||||
double result = 0.0;
|
||||
for (; x < 7; ++x) result -= 1 / x;
|
||||
@ -60,6 +66,63 @@ void ToLogProb(IT begin, IT end) {
|
||||
it->second = std::log(static_cast<double>(it->second)) - logsum;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
std::vector<std::pair<const T *, const T *>> SplitBySentenceBoundary(
|
||||
const T *begin, const T *end) {
|
||||
std::vector<std::pair<const T *, const T *>> result;
|
||||
|
||||
while (begin < end) {
|
||||
const auto *p = std::find(begin, end, static_cast<T>(kSentenceBoundary));
|
||||
if (p != end) {
|
||||
result.emplace_back(begin, p);
|
||||
begin = p + 1;
|
||||
} else {
|
||||
result.emplace_back(begin, end);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
template <class T>
|
||||
class BoundedPriorityQueue {
|
||||
public:
|
||||
explicit BoundedPriorityQueue(size_t size) : size_(size) {}
|
||||
~BoundedPriorityQueue() = default;
|
||||
|
||||
void push(const T &elem, int64 score) {
|
||||
if (queue_.size() > 4 * size_) resize();
|
||||
if (queue_.size() >= size_ && queue_[size_ - 1].second > score) return;
|
||||
queue_.emplace_back(elem, score);
|
||||
}
|
||||
|
||||
const std::vector<std::pair<T, int64>> &get() {
|
||||
resize();
|
||||
return queue_;
|
||||
}
|
||||
|
||||
private:
|
||||
void resize() {
|
||||
std::sort(queue_.begin(), queue_.end(), [](const auto &p1, const auto &p2) {
|
||||
return (p1.second > p2.second ||
|
||||
(p1.second == p2.second && p1.first < p2.first));
|
||||
});
|
||||
|
||||
absl::flat_hash_set<absl::string_view> dup;
|
||||
std::vector<std::pair<T, int64>> new_queue;
|
||||
for (auto &p : queue_) {
|
||||
if (dup.insert(p.first).second) new_queue.emplace_back(std::move(p));
|
||||
if (new_queue.size() == size_) break;
|
||||
}
|
||||
queue_ = std::move(new_queue);
|
||||
}
|
||||
|
||||
size_t size_ = 0;
|
||||
std::vector<std::pair<T, int64>> queue_;
|
||||
};
|
||||
|
||||
} // namespace
|
||||
|
||||
TrainerModel::TrainerModel(const TrainerSpec &trainer_spec,
|
||||
@ -96,7 +159,7 @@ void TrainerModel::SetSentencePieces(SentencePieces &&sentencepieces) {
|
||||
CHECK(status().ok());
|
||||
}
|
||||
|
||||
TrainerModel::SentencePieces Trainer::MakeSeedSentencePieces() const {
|
||||
TrainerModel::SentencePieces Trainer::MakeSeedSentencePieces() {
|
||||
return trainer_spec_.train_extremely_large_corpus()
|
||||
? MakeSeedSentencePiecesInternal<int64>()
|
||||
: MakeSeedSentencePiecesInternal<int32>();
|
||||
@ -104,7 +167,7 @@ TrainerModel::SentencePieces Trainer::MakeSeedSentencePieces() const {
|
||||
|
||||
// Returns seed sentencepieces for EM training.
|
||||
template <typename node_int_type>
|
||||
TrainerModel::SentencePieces Trainer::MakeSeedSentencePiecesInternal() const {
|
||||
TrainerModel::SentencePieces Trainer::MakeSeedSentencePiecesInternal() {
|
||||
CHECK(!sentences_.empty());
|
||||
CHECK(!required_chars_.empty());
|
||||
|
||||
@ -112,14 +175,43 @@ TrainerModel::SentencePieces Trainer::MakeSeedSentencePiecesInternal() const {
|
||||
// Pretokenizer is used as a constraint of piece extractions.
|
||||
const auto *pretokenizer = SentencePieceTrainer::GetPretokenizerForTraining();
|
||||
|
||||
auto pretokenize_or_rewrite = [&](std::pair<std::string, int64> *w) {
|
||||
if (pretokenizer) {
|
||||
std::vector<char32> chars;
|
||||
for (const auto &w : pretokenizer->PreTokenize(w->first)) {
|
||||
for (const auto &c : string_util::UTF8ToUnicodeText(w)) {
|
||||
chars.push_back(c);
|
||||
}
|
||||
chars.push_back(kSentenceBoundary);
|
||||
}
|
||||
return chars;
|
||||
} else if (!trainer_spec_.pretokenization_delimiter().empty()) {
|
||||
// When delimiter is specified, tokenize the input with the delimiter.
|
||||
// For EM training, we assume that the delimiter doesn't exist and
|
||||
// rewrite the original sentence.
|
||||
std::vector<char32> chars;
|
||||
absl::string_view delimiter = trainer_spec_.pretokenization_delimiter();
|
||||
for (const auto &w : absl::StrSplit(w->first, delimiter)) {
|
||||
for (const auto &c : string_util::UTF8ToUnicodeText(w)) {
|
||||
chars.push_back(c);
|
||||
}
|
||||
chars.push_back(kSentenceBoundary);
|
||||
}
|
||||
// Removes the delimiter.
|
||||
w->first = absl::StrReplaceAll(w->first, {{delimiter, ""}});
|
||||
return chars;
|
||||
}
|
||||
return string_util::UTF8ToUnicodeText(w->first);
|
||||
};
|
||||
|
||||
// Merges all sentences into one array with 0x0000 delimiter.
|
||||
std::vector<char32> array;
|
||||
absl::flat_hash_map<std::string, int64> all_chars;
|
||||
constexpr char32 kSentenceBoundary = 0x0000;
|
||||
|
||||
for (const auto &w : sentences_) {
|
||||
const auto ut = string_util::UTF8ToUnicodeText(
|
||||
pretokenizer ? pretokenizer->PreTokenize(w.first) : w.first);
|
||||
const bool is_tsv = trainer_spec_.input_format() == "tsv";
|
||||
|
||||
for (auto &w : sentences_) {
|
||||
const auto ut = pretokenize_or_rewrite(&w);
|
||||
for (const auto &c : ut) {
|
||||
array.push_back(c);
|
||||
if (c != kUNKChar && c != kSentenceBoundary) {
|
||||
@ -127,6 +219,15 @@ TrainerModel::SentencePieces Trainer::MakeSeedSentencePiecesInternal() const {
|
||||
}
|
||||
}
|
||||
array.push_back(kSentenceBoundary); // sentence boundary marker.
|
||||
|
||||
// Naive workaround to over-sample the input.
|
||||
// In TSV mode, the frequency field is not used to extract the seed piece.
|
||||
// We can at least extract all pieces by copying the input because
|
||||
// the occurrence count then becomes greater than or equal to 2.
|
||||
if (is_tsv) {
|
||||
for (const auto &c : ut) array.push_back(c);
|
||||
array.push_back(kSentenceBoundary);
|
||||
}
|
||||
}
|
||||
|
||||
CHECK_LE(array.size(),
|
||||
@ -147,29 +248,42 @@ TrainerModel::SentencePieces Trainer::MakeSeedSentencePiecesInternal() const {
|
||||
CHECK_EQ(0, esaxx(array.begin(), SA.begin(), L.begin(), R.begin(), D.begin(),
|
||||
n, kAlphabetSize, node_num));
|
||||
|
||||
LOG(INFO) << "Extracting frequent sub strings...";
|
||||
std::vector<std::pair<node_int_type, node_int_type>> substr_index;
|
||||
LOG(INFO) << "Extracting frequent sub strings... node_num=" << node_num;
|
||||
|
||||
BoundedPriorityQueue<std::string> queue(
|
||||
static_cast<size_t>(trainer_spec_.seed_sentencepiece_size()));
|
||||
|
||||
for (node_int_type i = 0; i < node_num; ++i) {
|
||||
const node_int_type offset = SA[L[i]];
|
||||
const node_int_type len = D[i];
|
||||
if (len <= 1) {
|
||||
continue;
|
||||
}
|
||||
const char32 *begin = &array[0] + offset;
|
||||
const char32 *end = &array[0] + offset + len;
|
||||
// Skips if a substring contains a sentence boundary.
|
||||
if (std::find(begin, end, kSentenceBoundary) != end) {
|
||||
continue;
|
||||
}
|
||||
const UnicodeText uw(begin, end);
|
||||
if (!IsValidSentencePiece(uw)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// character-wise coverage is the default score.
|
||||
const node_int_type freq = R[i] - L[i];
|
||||
const node_int_type score = freq * len;
|
||||
substr_index.emplace_back(i, score);
|
||||
for (const auto &p :
|
||||
SplitBySentenceBoundary(&array[offset], &array[offset + len])) {
|
||||
if (p.first == p.second) continue;
|
||||
const auto [begin, end] = NormalizeRange(p.first, p.second);
|
||||
|
||||
const UnicodeText uw(begin, end);
|
||||
if (uw.size() <= 1 || !IsValidSentencePiece(uw)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// character-wise coverage is the default score.
|
||||
const node_int_type freq = R[i] - L[i];
|
||||
const node_int_type score = freq * freq;
|
||||
|
||||
const auto w = string_util::UnicodeTextToUTF8(uw);
|
||||
queue.push(w, score);
|
||||
|
||||
const auto subpieces =
|
||||
SplitIntoWords(w, trainer_spec_.treat_whitespace_as_suffix(),
|
||||
trainer_spec_.allow_whitespace_only_pieces());
|
||||
if (subpieces.size() > 1) {
|
||||
for (const auto &s : subpieces) queue.push(std::string(s), score);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// all_chars must be included in the seed sentencepieces.
|
||||
@ -178,22 +292,8 @@ TrainerModel::SentencePieces Trainer::MakeSeedSentencePiecesInternal() const {
|
||||
seed_sentencepieces.emplace_back(it);
|
||||
}
|
||||
|
||||
// Sort by the coverage of sub strings.
|
||||
for (const auto &p : Sorted(substr_index)) {
|
||||
const node_int_type offset = SA[L[p.first]];
|
||||
const node_int_type len = D[p.first];
|
||||
CHECK_GT(len, 0);
|
||||
const char32 *begin = &array[offset];
|
||||
const char32 *end = &array[offset + len];
|
||||
const UnicodeText uw(begin, end);
|
||||
CHECK(IsValidSentencePiece(uw)); // just in case.
|
||||
const std::string w = string_util::UnicodeTextToUTF8(uw);
|
||||
if (seed_sentencepieces.size() ==
|
||||
static_cast<size_t>(trainer_spec_.seed_sentencepiece_size())) {
|
||||
break;
|
||||
}
|
||||
CHECK(!port::ContainsKey(all_chars, w));
|
||||
seed_sentencepieces.emplace_back(w, p.second);
|
||||
for (const auto &p : queue.get()) {
|
||||
seed_sentencepieces.emplace_back(p);
|
||||
}
|
||||
|
||||
ToLogProb(seed_sentencepieces.begin(), seed_sentencepieces.end());
|
||||
@ -430,6 +530,22 @@ TrainerModel::SentencePieces Trainer::PruneSentencePieces(
|
||||
return new_sentencepieces;
|
||||
}
|
||||
|
||||
std::pair<const char32 *, const char32 *> Trainer::NormalizeRange(
|
||||
const char32 *begin, const char32 *end) const {
|
||||
if (trainer_spec_.treat_whitespace_as_suffix()) {
|
||||
while ((*begin == kSentenceBoundary || *begin == kWsMarker) &&
|
||||
begin + 1 < end)
|
||||
++begin;
|
||||
while (*(end - 1) == kSentenceBoundary && begin + 1 < end) --end;
|
||||
} else {
|
||||
while (*begin == kSentenceBoundary && begin + 1 < end) ++begin;
|
||||
while ((*(end - 1) == kSentenceBoundary || *(end - 1) == kWsMarker) &&
|
||||
begin + 1 < end)
|
||||
--end;
|
||||
}
|
||||
return std::make_pair(begin, end);
|
||||
}
|
||||
|
||||
TrainerModel::SentencePieces Trainer::FinalizeSentencePieces(
|
||||
const TrainerModel &model) const {
|
||||
const auto &sentencepieces = model.GetSentencePieces();
|
||||
|
@ -68,7 +68,7 @@ class Trainer : public TrainerInterface {
|
||||
: TrainerInterface::TrainerInterface(trainer_spec, normalizer_spec,
|
||||
denormalizer_spec) {}
|
||||
|
||||
TrainerModel::SentencePieces MakeSeedSentencePieces() const;
|
||||
TrainerModel::SentencePieces MakeSeedSentencePieces();
|
||||
|
||||
util::Status Train() override;
|
||||
|
||||
@ -80,7 +80,7 @@ class Trainer : public TrainerInterface {
|
||||
// node_int_type should be of integer type (int32 or int64),
|
||||
// determined by train_extremely_large_corpus.
|
||||
template <typename node_int_type>
|
||||
TrainerModel::SentencePieces MakeSeedSentencePiecesInternal() const;
|
||||
TrainerModel::SentencePieces MakeSeedSentencePiecesInternal();
|
||||
|
||||
// Executes the E step of EM and returns expected count.
|
||||
// The index of return array is the vocab id.
|
||||
@ -105,6 +105,9 @@ class Trainer : public TrainerInterface {
|
||||
TrainerModel::SentencePieces FinalizeSentencePieces(
|
||||
const TrainerModel &model) const;
|
||||
|
||||
// Returns the sub-range of [begin, end) left after trimming marker
// characters from both ends: kSentenceBoundary on either side, plus
// kWsMarker at the front when treat_whitespace_as_suffix() is set or at the
// back otherwise.  Trimming stops once one character remains.  Callers are
// expected to pass a non-empty range.
std::pair<const char32 *, const char32 *> NormalizeRange(
|
||||
const char32 *begin, const char32 *end) const;
|
||||
|
||||
// When the size of SentencePieces becomes less than desired_vocab_size_,
|
||||
// break the main training loop. desired_vocab_size_ = 1.1 * vocab_size_
|
||||
// for now.
|
||||
|
@ -117,11 +117,13 @@ TEST(UnigramTrainerTest, BasicTest) {
|
||||
30);
|
||||
|
||||
// Check seed pieces.
|
||||
EXPECT_EQ(27, res.seed_pieces_and_probs.size());
|
||||
EXPECT_EQ(63, res.seed_pieces_and_probs.size());
|
||||
|
||||
// Check final pieces.
|
||||
EXPECT_EQ("i a n y m l e apple ve O P r t g an v ▁ A b le ▁an p d h",
|
||||
res.sentence_pieces);
|
||||
EXPECT_EQ(
|
||||
"Overly Pineapple magnanimity Available ▁an a ▁ b A t g r P O v m y p n "
|
||||
"l d e h i",
|
||||
res.sentence_pieces);
|
||||
}
|
||||
|
||||
TEST(UnigramTrainerTest, BasicDPTest) {
|
||||
@ -132,8 +134,7 @@ TEST(UnigramTrainerTest, BasicDPTest) {
|
||||
"Overly \t 6", "Available \t 5"},
|
||||
22, true /*use_dp*/, 0 /*dp_noise*/, 4 /*dp_clipping*/);
|
||||
|
||||
// Got 16 instead of 27 seeds.
|
||||
EXPECT_EQ(16, res.seed_pieces_and_probs.size());
|
||||
EXPECT_EQ(49, res.seed_pieces_and_probs.size());
|
||||
|
||||
// And they are equiv to if the last sentence was not there.
|
||||
const auto& res_nodp = RunTrainer(
|
||||
@ -191,12 +192,12 @@ TEST(UnigramTrainerTest, EndToEndTest) {
|
||||
.ok());
|
||||
// TODO(taku): Temporarily disable this test on Windows.
|
||||
#ifndef OS_WIN
|
||||
EXPECT_EQ(WS
|
||||
" 吾輩 《 わが はい 》 は 猫 である 。 名前 はまだ 無い 。 "
|
||||
"どこ で 生 れた か とん と 見当 《 けん とう 》 が つか ぬ 。 "
|
||||
"何でも 薄 暗 い じめ じめ した 所で ニャーニャー "
|
||||
"泣 い ていた 事 だけは 記憶 している 。",
|
||||
absl::StrJoin(tok, " "));
|
||||
EXPECT_EQ(
|
||||
WS
|
||||
" 吾輩 《 わ が は い 》 は猫である 。 名前は まだ 無 い 。 どこ で 生れ "
|
||||
"た か とん と 見当 《 けん とう 》 が つか ぬ 。 何でも 薄 暗 い じめ "
|
||||
"じめ した 所で ニャーニャー 泣 い ていた 事 だけ は記憶している 。",
|
||||
absl::StrJoin(tok, " "));
|
||||
#endif
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user