Add pretokenization_delimiter option. Initialize seed pieces more accurately.

This commit is contained in:
Taku Kudo 2023-04-10 02:11:37 +00:00
parent 6c9fd791cf
commit e58bb684d0
15 changed files with 676 additions and 387 deletions

View File

@ -19,7 +19,7 @@ DESCRIPTOR = _descriptor.FileDescriptor(
syntax='proto2',
serialized_options=b'H\003',
create_key=_descriptor._internal_create_key,
serialized_pb=b'\n\x19sentencepiece_model.proto\x12\rsentencepiece\"\xdb\x0b\n\x0bTrainerSpec\x12\r\n\x05input\x18\x01 \x03(\t\x12\x14\n\x0cinput_format\x18\x07 \x01(\t\x12\x14\n\x0cmodel_prefix\x18\x02 \x01(\t\x12\x41\n\nmodel_type\x18\x03 \x01(\x0e\x32$.sentencepiece.TrainerSpec.ModelType:\x07UNIGRAM\x12\x18\n\nvocab_size\x18\x04 \x01(\x05:\x04\x38\x30\x30\x30\x12\x17\n\x0f\x61\x63\x63\x65pt_language\x18\x05 \x03(\t\x12 \n\x15self_test_sample_size\x18\x06 \x01(\x05:\x01\x30\x12*\n\x1b\x65nable_differential_privacy\x18\x32 \x01(\x08:\x05\x66\x61lse\x12+\n differential_privacy_noise_level\x18\x33 \x01(\x02:\x01\x30\x12\x32\n\'differential_privacy_clipping_threshold\x18\x34 \x01(\x04:\x01\x30\x12\"\n\x12\x63haracter_coverage\x18\n \x01(\x02:\x06\x30.9995\x12\x1e\n\x13input_sentence_size\x18\x0b \x01(\x04:\x01\x30\x12$\n\x16shuffle_input_sentence\x18\x13 \x01(\x08:\x04true\x12 \n\x14mining_sentence_size\x18\x0c \x01(\x05\x42\x02\x18\x01\x12\"\n\x16training_sentence_size\x18\r \x01(\x05\x42\x02\x18\x01\x12(\n\x17seed_sentencepiece_size\x18\x0e \x01(\x05:\x07\x31\x30\x30\x30\x30\x30\x30\x12\x1e\n\x10shrinking_factor\x18\x0f \x01(\x02:\x04\x30.75\x12!\n\x13max_sentence_length\x18\x12 \x01(\x05:\x04\x34\x31\x39\x32\x12\x17\n\x0bnum_threads\x18\x10 \x01(\x05:\x02\x31\x36\x12\x1d\n\x12num_sub_iterations\x18\x11 \x01(\x05:\x01\x32\x12$\n\x18max_sentencepiece_length\x18\x14 \x01(\x05:\x02\x31\x36\x12%\n\x17split_by_unicode_script\x18\x15 \x01(\x08:\x04true\x12\x1d\n\x0fsplit_by_number\x18\x17 \x01(\x08:\x04true\x12!\n\x13split_by_whitespace\x18\x16 \x01(\x08:\x04true\x12)\n\x1atreat_whitespace_as_suffix\x18\x18 \x01(\x08:\x05\x66\x61lse\x12+\n\x1c\x61llow_whitespace_only_pieces\x18\x1a \x01(\x08:\x05\x66\x61lse\x12\x1b\n\x0csplit_digits\x18\x19 \x01(\x08:\x05\x66\x61lse\x12\x17\n\x0f\x63ontrol_symbols\x18\x1e \x03(\t\x12\x1c\n\x14user_defined_symbols\x18\x1f \x03(\t\x12\x16\n\x0erequired_chars\x18$ \x01(\t\x12\x1c\n\rbyte_fallback\x18# 
\x01(\x08:\x05\x66\x61lse\x12+\n\x1dvocabulary_output_piece_score\x18 \x01(\x08:\x04true\x12\x1e\n\x10hard_vocab_limit\x18! \x01(\x08:\x04true\x12\x1c\n\ruse_all_vocab\x18\" \x01(\x08:\x05\x66\x61lse\x12\x11\n\x06unk_id\x18( \x01(\x05:\x01\x30\x12\x11\n\x06\x62os_id\x18) \x01(\x05:\x01\x31\x12\x11\n\x06\x65os_id\x18* \x01(\x05:\x01\x32\x12\x12\n\x06pad_id\x18+ \x01(\x05:\x02-1\x12\x18\n\tunk_piece\x18- \x01(\t:\x05<unk>\x12\x16\n\tbos_piece\x18. \x01(\t:\x03<s>\x12\x17\n\teos_piece\x18/ \x01(\t:\x04</s>\x12\x18\n\tpad_piece\x18\x30 \x01(\t:\x05<pad>\x12\x1a\n\x0bunk_surface\x18, \x01(\t:\x05 \xe2\x81\x87 \x12+\n\x1ctrain_extremely_large_corpus\x18\x31 \x01(\x08:\x05\x66\x61lse\"5\n\tModelType\x12\x0b\n\x07UNIGRAM\x10\x01\x12\x07\n\x03\x42PE\x10\x02\x12\x08\n\x04WORD\x10\x03\x12\x08\n\x04\x43HAR\x10\x04*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\"\xd1\x01\n\x0eNormalizerSpec\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x1c\n\x14precompiled_charsmap\x18\x02 \x01(\x0c\x12\x1e\n\x10\x61\x64\x64_dummy_prefix\x18\x03 \x01(\x08:\x04true\x12&\n\x18remove_extra_whitespaces\x18\x04 \x01(\x08:\x04true\x12 \n\x12\x65scape_whitespaces\x18\x05 \x01(\x08:\x04true\x12\x1e\n\x16normalization_rule_tsv\x18\x06 \x01(\t*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\"y\n\x0cSelfTestData\x12\x33\n\x07samples\x18\x01 \x03(\x0b\x32\".sentencepiece.SelfTestData.Sample\x1a)\n\x06Sample\x12\r\n\x05input\x18\x01 \x01(\t\x12\x10\n\x08\x65xpected\x18\x02 \x01(\t*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\"\xfe\x03\n\nModelProto\x12\x37\n\x06pieces\x18\x01 \x03(\x0b\x32\'.sentencepiece.ModelProto.SentencePiece\x12\x30\n\x0ctrainer_spec\x18\x02 \x01(\x0b\x32\x1a.sentencepiece.TrainerSpec\x12\x36\n\x0fnormalizer_spec\x18\x03 \x01(\x0b\x32\x1d.sentencepiece.NormalizerSpec\x12\x33\n\x0eself_test_data\x18\x04 \x01(\x0b\x32\x1b.sentencepiece.SelfTestData\x12\x38\n\x11\x64\x65normalizer_spec\x18\x05 \x01(\x0b\x32\x1d.sentencepiece.NormalizerSpec\x1a\xd2\x01\n\rSentencePiece\x12\r\n\x05piece\x18\x01 
\x01(\t\x12\r\n\x05score\x18\x02 \x01(\x02\x12\x42\n\x04type\x18\x03 \x01(\x0e\x32,.sentencepiece.ModelProto.SentencePiece.Type:\x06NORMAL\"T\n\x04Type\x12\n\n\x06NORMAL\x10\x01\x12\x0b\n\x07UNKNOWN\x10\x02\x12\x0b\n\x07\x43ONTROL\x10\x03\x12\x10\n\x0cUSER_DEFINED\x10\x04\x12\x08\n\x04\x42YTE\x10\x06\x12\n\n\x06UNUSED\x10\x05*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\x42\x02H\x03'
serialized_pb=b'\n\x19sentencepiece_model.proto\x12\rsentencepiece\"\x80\x0c\n\x0bTrainerSpec\x12\r\n\x05input\x18\x01 \x03(\t\x12\x14\n\x0cinput_format\x18\x07 \x01(\t\x12\x14\n\x0cmodel_prefix\x18\x02 \x01(\t\x12\x41\n\nmodel_type\x18\x03 \x01(\x0e\x32$.sentencepiece.TrainerSpec.ModelType:\x07UNIGRAM\x12\x18\n\nvocab_size\x18\x04 \x01(\x05:\x04\x38\x30\x30\x30\x12\x17\n\x0f\x61\x63\x63\x65pt_language\x18\x05 \x03(\t\x12 \n\x15self_test_sample_size\x18\x06 \x01(\x05:\x01\x30\x12*\n\x1b\x65nable_differential_privacy\x18\x32 \x01(\x08:\x05\x66\x61lse\x12+\n differential_privacy_noise_level\x18\x33 \x01(\x02:\x01\x30\x12\x32\n\'differential_privacy_clipping_threshold\x18\x34 \x01(\x04:\x01\x30\x12\"\n\x12\x63haracter_coverage\x18\n \x01(\x02:\x06\x30.9995\x12\x1e\n\x13input_sentence_size\x18\x0b \x01(\x04:\x01\x30\x12$\n\x16shuffle_input_sentence\x18\x13 \x01(\x08:\x04true\x12 \n\x14mining_sentence_size\x18\x0c \x01(\x05\x42\x02\x18\x01\x12\"\n\x16training_sentence_size\x18\r \x01(\x05\x42\x02\x18\x01\x12(\n\x17seed_sentencepiece_size\x18\x0e \x01(\x05:\x07\x31\x30\x30\x30\x30\x30\x30\x12\x1e\n\x10shrinking_factor\x18\x0f \x01(\x02:\x04\x30.75\x12!\n\x13max_sentence_length\x18\x12 \x01(\x05:\x04\x34\x31\x39\x32\x12\x17\n\x0bnum_threads\x18\x10 \x01(\x05:\x02\x31\x36\x12\x1d\n\x12num_sub_iterations\x18\x11 \x01(\x05:\x01\x32\x12$\n\x18max_sentencepiece_length\x18\x14 \x01(\x05:\x02\x31\x36\x12%\n\x17split_by_unicode_script\x18\x15 \x01(\x08:\x04true\x12\x1d\n\x0fsplit_by_number\x18\x17 \x01(\x08:\x04true\x12!\n\x13split_by_whitespace\x18\x16 \x01(\x08:\x04true\x12)\n\x1atreat_whitespace_as_suffix\x18\x18 \x01(\x08:\x05\x66\x61lse\x12+\n\x1c\x61llow_whitespace_only_pieces\x18\x1a \x01(\x08:\x05\x66\x61lse\x12\x1b\n\x0csplit_digits\x18\x19 \x01(\x08:\x05\x66\x61lse\x12#\n\x19pretokenization_delimiter\x18\x35 \x01(\t:\x00\x12\x17\n\x0f\x63ontrol_symbols\x18\x1e \x03(\t\x12\x1c\n\x14user_defined_symbols\x18\x1f \x03(\t\x12\x16\n\x0erequired_chars\x18$ 
\x01(\t\x12\x1c\n\rbyte_fallback\x18# \x01(\x08:\x05\x66\x61lse\x12+\n\x1dvocabulary_output_piece_score\x18 \x01(\x08:\x04true\x12\x1e\n\x10hard_vocab_limit\x18! \x01(\x08:\x04true\x12\x1c\n\ruse_all_vocab\x18\" \x01(\x08:\x05\x66\x61lse\x12\x11\n\x06unk_id\x18( \x01(\x05:\x01\x30\x12\x11\n\x06\x62os_id\x18) \x01(\x05:\x01\x31\x12\x11\n\x06\x65os_id\x18* \x01(\x05:\x01\x32\x12\x12\n\x06pad_id\x18+ \x01(\x05:\x02-1\x12\x18\n\tunk_piece\x18- \x01(\t:\x05<unk>\x12\x16\n\tbos_piece\x18. \x01(\t:\x03<s>\x12\x17\n\teos_piece\x18/ \x01(\t:\x04</s>\x12\x18\n\tpad_piece\x18\x30 \x01(\t:\x05<pad>\x12\x1a\n\x0bunk_surface\x18, \x01(\t:\x05 \xe2\x81\x87 \x12+\n\x1ctrain_extremely_large_corpus\x18\x31 \x01(\x08:\x05\x66\x61lse\"5\n\tModelType\x12\x0b\n\x07UNIGRAM\x10\x01\x12\x07\n\x03\x42PE\x10\x02\x12\x08\n\x04WORD\x10\x03\x12\x08\n\x04\x43HAR\x10\x04*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\"\xd1\x01\n\x0eNormalizerSpec\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x1c\n\x14precompiled_charsmap\x18\x02 \x01(\x0c\x12\x1e\n\x10\x61\x64\x64_dummy_prefix\x18\x03 \x01(\x08:\x04true\x12&\n\x18remove_extra_whitespaces\x18\x04 \x01(\x08:\x04true\x12 \n\x12\x65scape_whitespaces\x18\x05 \x01(\x08:\x04true\x12\x1e\n\x16normalization_rule_tsv\x18\x06 \x01(\t*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\"y\n\x0cSelfTestData\x12\x33\n\x07samples\x18\x01 \x03(\x0b\x32\".sentencepiece.SelfTestData.Sample\x1a)\n\x06Sample\x12\r\n\x05input\x18\x01 \x01(\t\x12\x10\n\x08\x65xpected\x18\x02 \x01(\t*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\"\xfe\x03\n\nModelProto\x12\x37\n\x06pieces\x18\x01 \x03(\x0b\x32\'.sentencepiece.ModelProto.SentencePiece\x12\x30\n\x0ctrainer_spec\x18\x02 \x01(\x0b\x32\x1a.sentencepiece.TrainerSpec\x12\x36\n\x0fnormalizer_spec\x18\x03 \x01(\x0b\x32\x1d.sentencepiece.NormalizerSpec\x12\x33\n\x0eself_test_data\x18\x04 \x01(\x0b\x32\x1b.sentencepiece.SelfTestData\x12\x38\n\x11\x64\x65normalizer_spec\x18\x05 
\x01(\x0b\x32\x1d.sentencepiece.NormalizerSpec\x1a\xd2\x01\n\rSentencePiece\x12\r\n\x05piece\x18\x01 \x01(\t\x12\r\n\x05score\x18\x02 \x01(\x02\x12\x42\n\x04type\x18\x03 \x01(\x0e\x32,.sentencepiece.ModelProto.SentencePiece.Type:\x06NORMAL\"T\n\x04Type\x12\n\n\x06NORMAL\x10\x01\x12\x0b\n\x07UNKNOWN\x10\x02\x12\x0b\n\x07\x43ONTROL\x10\x03\x12\x10\n\x0cUSER_DEFINED\x10\x04\x12\x08\n\x04\x42YTE\x10\x06\x12\n\n\x06UNUSED\x10\x05*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\x42\x02H\x03'
)
@ -54,8 +54,8 @@ _TRAINERSPEC_MODELTYPE = _descriptor.EnumDescriptor(
],
containing_type=None,
serialized_options=None,
serialized_start=1480,
serialized_end=1533,
serialized_start=1517,
serialized_end=1570,
)
_sym_db.RegisterEnumDescriptor(_TRAINERSPEC_MODELTYPE)
@ -99,8 +99,8 @@ _MODELPROTO_SENTENCEPIECE_TYPE = _descriptor.EnumDescriptor(
],
containing_type=None,
serialized_options=None,
serialized_start=2286,
serialized_end=2370,
serialized_start=2323,
serialized_end=2407,
)
_sym_db.RegisterEnumDescriptor(_MODELPROTO_SENTENCEPIECE_TYPE)
@ -303,119 +303,126 @@ _TRAINERSPEC = _descriptor.Descriptor(
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
_descriptor.FieldDescriptor(
name='control_symbols', full_name='sentencepiece.TrainerSpec.control_symbols', index=27,
name='pretokenization_delimiter', full_name='sentencepiece.TrainerSpec.pretokenization_delimiter', index=27,
number=53, type=9, cpp_type=9, label=1,
has_default_value=True, default_value=b"".decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
_descriptor.FieldDescriptor(
name='control_symbols', full_name='sentencepiece.TrainerSpec.control_symbols', index=28,
number=30, type=9, cpp_type=9, label=3,
has_default_value=False, default_value=[],
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
_descriptor.FieldDescriptor(
name='user_defined_symbols', full_name='sentencepiece.TrainerSpec.user_defined_symbols', index=28,
name='user_defined_symbols', full_name='sentencepiece.TrainerSpec.user_defined_symbols', index=29,
number=31, type=9, cpp_type=9, label=3,
has_default_value=False, default_value=[],
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
_descriptor.FieldDescriptor(
name='required_chars', full_name='sentencepiece.TrainerSpec.required_chars', index=29,
name='required_chars', full_name='sentencepiece.TrainerSpec.required_chars', index=30,
number=36, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=b"".decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
_descriptor.FieldDescriptor(
name='byte_fallback', full_name='sentencepiece.TrainerSpec.byte_fallback', index=30,
name='byte_fallback', full_name='sentencepiece.TrainerSpec.byte_fallback', index=31,
number=35, type=8, cpp_type=7, label=1,
has_default_value=True, default_value=False,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
_descriptor.FieldDescriptor(
name='vocabulary_output_piece_score', full_name='sentencepiece.TrainerSpec.vocabulary_output_piece_score', index=31,
name='vocabulary_output_piece_score', full_name='sentencepiece.TrainerSpec.vocabulary_output_piece_score', index=32,
number=32, type=8, cpp_type=7, label=1,
has_default_value=True, default_value=True,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
_descriptor.FieldDescriptor(
name='hard_vocab_limit', full_name='sentencepiece.TrainerSpec.hard_vocab_limit', index=32,
name='hard_vocab_limit', full_name='sentencepiece.TrainerSpec.hard_vocab_limit', index=33,
number=33, type=8, cpp_type=7, label=1,
has_default_value=True, default_value=True,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
_descriptor.FieldDescriptor(
name='use_all_vocab', full_name='sentencepiece.TrainerSpec.use_all_vocab', index=33,
name='use_all_vocab', full_name='sentencepiece.TrainerSpec.use_all_vocab', index=34,
number=34, type=8, cpp_type=7, label=1,
has_default_value=True, default_value=False,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
_descriptor.FieldDescriptor(
name='unk_id', full_name='sentencepiece.TrainerSpec.unk_id', index=34,
name='unk_id', full_name='sentencepiece.TrainerSpec.unk_id', index=35,
number=40, type=5, cpp_type=1, label=1,
has_default_value=True, default_value=0,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
_descriptor.FieldDescriptor(
name='bos_id', full_name='sentencepiece.TrainerSpec.bos_id', index=35,
name='bos_id', full_name='sentencepiece.TrainerSpec.bos_id', index=36,
number=41, type=5, cpp_type=1, label=1,
has_default_value=True, default_value=1,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
_descriptor.FieldDescriptor(
name='eos_id', full_name='sentencepiece.TrainerSpec.eos_id', index=36,
name='eos_id', full_name='sentencepiece.TrainerSpec.eos_id', index=37,
number=42, type=5, cpp_type=1, label=1,
has_default_value=True, default_value=2,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
_descriptor.FieldDescriptor(
name='pad_id', full_name='sentencepiece.TrainerSpec.pad_id', index=37,
name='pad_id', full_name='sentencepiece.TrainerSpec.pad_id', index=38,
number=43, type=5, cpp_type=1, label=1,
has_default_value=True, default_value=-1,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
_descriptor.FieldDescriptor(
name='unk_piece', full_name='sentencepiece.TrainerSpec.unk_piece', index=38,
name='unk_piece', full_name='sentencepiece.TrainerSpec.unk_piece', index=39,
number=45, type=9, cpp_type=9, label=1,
has_default_value=True, default_value=b"<unk>".decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
_descriptor.FieldDescriptor(
name='bos_piece', full_name='sentencepiece.TrainerSpec.bos_piece', index=39,
name='bos_piece', full_name='sentencepiece.TrainerSpec.bos_piece', index=40,
number=46, type=9, cpp_type=9, label=1,
has_default_value=True, default_value=b"<s>".decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
_descriptor.FieldDescriptor(
name='eos_piece', full_name='sentencepiece.TrainerSpec.eos_piece', index=40,
name='eos_piece', full_name='sentencepiece.TrainerSpec.eos_piece', index=41,
number=47, type=9, cpp_type=9, label=1,
has_default_value=True, default_value=b"</s>".decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
_descriptor.FieldDescriptor(
name='pad_piece', full_name='sentencepiece.TrainerSpec.pad_piece', index=41,
name='pad_piece', full_name='sentencepiece.TrainerSpec.pad_piece', index=42,
number=48, type=9, cpp_type=9, label=1,
has_default_value=True, default_value=b"<pad>".decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
_descriptor.FieldDescriptor(
name='unk_surface', full_name='sentencepiece.TrainerSpec.unk_surface', index=42,
name='unk_surface', full_name='sentencepiece.TrainerSpec.unk_surface', index=43,
number=44, type=9, cpp_type=9, label=1,
has_default_value=True, default_value=b" \342\201\207 ".decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
_descriptor.FieldDescriptor(
name='train_extremely_large_corpus', full_name='sentencepiece.TrainerSpec.train_extremely_large_corpus', index=43,
name='train_extremely_large_corpus', full_name='sentencepiece.TrainerSpec.train_extremely_large_corpus', index=44,
number=49, type=8, cpp_type=7, label=1,
has_default_value=True, default_value=False,
message_type=None, enum_type=None, containing_type=None,
@ -435,7 +442,7 @@ _TRAINERSPEC = _descriptor.Descriptor(
oneofs=[
],
serialized_start=45,
serialized_end=1544,
serialized_end=1581,
)
@ -501,8 +508,8 @@ _NORMALIZERSPEC = _descriptor.Descriptor(
extension_ranges=[(200, 536870912), ],
oneofs=[
],
serialized_start=1547,
serialized_end=1756,
serialized_start=1584,
serialized_end=1793,
)
@ -540,8 +547,8 @@ _SELFTESTDATA_SAMPLE = _descriptor.Descriptor(
extension_ranges=[],
oneofs=[
],
serialized_start=1827,
serialized_end=1868,
serialized_start=1864,
serialized_end=1905,
)
_SELFTESTDATA = _descriptor.Descriptor(
@ -571,8 +578,8 @@ _SELFTESTDATA = _descriptor.Descriptor(
extension_ranges=[(200, 536870912), ],
oneofs=[
],
serialized_start=1758,
serialized_end=1879,
serialized_start=1795,
serialized_end=1916,
)
@ -618,8 +625,8 @@ _MODELPROTO_SENTENCEPIECE = _descriptor.Descriptor(
extension_ranges=[(200, 536870912), ],
oneofs=[
],
serialized_start=2171,
serialized_end=2381,
serialized_start=2208,
serialized_end=2418,
)
_MODELPROTO = _descriptor.Descriptor(
@ -677,8 +684,8 @@ _MODELPROTO = _descriptor.Descriptor(
extension_ranges=[(200, 536870912), ],
oneofs=[
],
serialized_start=1882,
serialized_end=2392,
serialized_start=1919,
serialized_end=2429,
)
_TRAINERSPEC.fields_by_name['model_type'].enum_type = _TRAINERSPEC_MODELTYPE

View File

@ -63,7 +63,7 @@ if (SPM_USE_BUILTIN_PROTOBUF)
if (MSVC)
add_definitions("/DHAVE_PTHREAD /wd4018 /wd4514")
else()
add_definitions("-pthread -DHAVE_PTHREAD=1 -Wno-sign-compare")
add_definitions("-pthread -DHAVE_PTHREAD=1 -Wno-sign-compare -Wno-deprecated-declarations")
endif()
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../third_party/protobuf-lite)
include_directories(builtin_pb)

View File

@ -285,101 +285,104 @@ class TrainerSpec::_Internal {
(*has_bits)[0] |= 1u;
}
static void set_has_model_type(HasBits* has_bits) {
(*has_bits)[0] |= 4194304u;
}
static void set_has_vocab_size(HasBits* has_bits) {
(*has_bits)[0] |= 8388608u;
}
static void set_has_self_test_sample_size(HasBits* has_bits) {
(*has_bits)[0] |= 256u;
}
static void set_has_enable_differential_privacy(HasBits* has_bits) {
(*has_bits)[0] |= 4096u;
}
static void set_has_differential_privacy_noise_level(HasBits* has_bits) {
(*has_bits)[0] |= 1048576u;
}
static void set_has_differential_privacy_clipping_threshold(HasBits* has_bits) {
(*has_bits)[0] |= 2097152u;
}
static void set_has_character_coverage(HasBits* has_bits) {
static void set_has_vocab_size(HasBits* has_bits) {
(*has_bits)[0] |= 16777216u;
}
static void set_has_input_sentence_size(HasBits* has_bits) {
(*has_bits)[0] |= 1024u;
}
static void set_has_shuffle_input_sentence(HasBits* has_bits) {
(*has_bits)[0] |= 2147483648u;
}
static void set_has_mining_sentence_size(HasBits* has_bits) {
static void set_has_self_test_sample_size(HasBits* has_bits) {
(*has_bits)[0] |= 512u;
}
static void set_has_training_sentence_size(HasBits* has_bits) {
(*has_bits)[0] |= 2048u;
}
static void set_has_seed_sentencepiece_size(HasBits* has_bits) {
(*has_bits)[0] |= 33554432u;
}
static void set_has_shrinking_factor(HasBits* has_bits) {
(*has_bits)[0] |= 67108864u;
}
static void set_has_max_sentence_length(HasBits* has_bits) {
(*has_bits)[0] |= 536870912u;
}
static void set_has_num_threads(HasBits* has_bits) {
(*has_bits)[0] |= 134217728u;
}
static void set_has_num_sub_iterations(HasBits* has_bits) {
(*has_bits)[0] |= 268435456u;
}
static void set_has_max_sentencepiece_length(HasBits* has_bits) {
(*has_bits)[0] |= 1073741824u;
}
static void set_has_split_by_unicode_script(HasBits* has_bits) {
(*has_bits)[1] |= 1u;
}
static void set_has_split_by_number(HasBits* has_bits) {
(*has_bits)[1] |= 2u;
}
static void set_has_split_by_whitespace(HasBits* has_bits) {
(*has_bits)[1] |= 4u;
}
static void set_has_treat_whitespace_as_suffix(HasBits* has_bits) {
static void set_has_enable_differential_privacy(HasBits* has_bits) {
(*has_bits)[0] |= 8192u;
}
static void set_has_allow_whitespace_only_pieces(HasBits* has_bits) {
static void set_has_differential_privacy_noise_level(HasBits* has_bits) {
(*has_bits)[0] |= 2097152u;
}
static void set_has_differential_privacy_clipping_threshold(HasBits* has_bits) {
(*has_bits)[0] |= 4194304u;
}
static void set_has_character_coverage(HasBits* has_bits) {
(*has_bits)[0] |= 33554432u;
}
static void set_has_input_sentence_size(HasBits* has_bits) {
(*has_bits)[0] |= 2048u;
}
static void set_has_shuffle_input_sentence(HasBits* has_bits) {
(*has_bits)[1] |= 1u;
}
static void set_has_mining_sentence_size(HasBits* has_bits) {
(*has_bits)[0] |= 1024u;
}
static void set_has_training_sentence_size(HasBits* has_bits) {
(*has_bits)[0] |= 4096u;
}
static void set_has_seed_sentencepiece_size(HasBits* has_bits) {
(*has_bits)[0] |= 67108864u;
}
static void set_has_shrinking_factor(HasBits* has_bits) {
(*has_bits)[0] |= 134217728u;
}
static void set_has_max_sentence_length(HasBits* has_bits) {
(*has_bits)[0] |= 1073741824u;
}
static void set_has_num_threads(HasBits* has_bits) {
(*has_bits)[0] |= 268435456u;
}
static void set_has_num_sub_iterations(HasBits* has_bits) {
(*has_bits)[0] |= 536870912u;
}
static void set_has_max_sentencepiece_length(HasBits* has_bits) {
(*has_bits)[0] |= 2147483648u;
}
static void set_has_split_by_unicode_script(HasBits* has_bits) {
(*has_bits)[1] |= 2u;
}
static void set_has_split_by_number(HasBits* has_bits) {
(*has_bits)[1] |= 4u;
}
static void set_has_split_by_whitespace(HasBits* has_bits) {
(*has_bits)[1] |= 8u;
}
static void set_has_treat_whitespace_as_suffix(HasBits* has_bits) {
(*has_bits)[0] |= 16384u;
}
static void set_has_split_digits(HasBits* has_bits) {
static void set_has_allow_whitespace_only_pieces(HasBits* has_bits) {
(*has_bits)[0] |= 32768u;
}
static void set_has_split_digits(HasBits* has_bits) {
(*has_bits)[0] |= 65536u;
}
static void set_has_pretokenization_delimiter(HasBits* has_bits) {
(*has_bits)[0] |= 256u;
}
static void set_has_required_chars(HasBits* has_bits) {
(*has_bits)[0] |= 4u;
}
static void set_has_byte_fallback(HasBits* has_bits) {
(*has_bits)[0] |= 65536u;
}
static void set_has_vocabulary_output_piece_score(HasBits* has_bits) {
(*has_bits)[1] |= 8u;
}
static void set_has_hard_vocab_limit(HasBits* has_bits) {
(*has_bits)[1] |= 16u;
}
static void set_has_use_all_vocab(HasBits* has_bits) {
(*has_bits)[0] |= 131072u;
}
static void set_has_unk_id(HasBits* has_bits) {
(*has_bits)[0] |= 524288u;
static void set_has_vocabulary_output_piece_score(HasBits* has_bits) {
(*has_bits)[1] |= 16u;
}
static void set_has_bos_id(HasBits* has_bits) {
static void set_has_hard_vocab_limit(HasBits* has_bits) {
(*has_bits)[1] |= 32u;
}
static void set_has_eos_id(HasBits* has_bits) {
static void set_has_use_all_vocab(HasBits* has_bits) {
(*has_bits)[0] |= 262144u;
}
static void set_has_unk_id(HasBits* has_bits) {
(*has_bits)[0] |= 1048576u;
}
static void set_has_bos_id(HasBits* has_bits) {
(*has_bits)[1] |= 64u;
}
static void set_has_pad_id(HasBits* has_bits) {
static void set_has_eos_id(HasBits* has_bits) {
(*has_bits)[1] |= 128u;
}
static void set_has_pad_id(HasBits* has_bits) {
(*has_bits)[1] |= 256u;
}
static void set_has_unk_piece(HasBits* has_bits) {
(*has_bits)[0] |= 16u;
}
@ -396,7 +399,7 @@ class TrainerSpec::_Internal {
(*has_bits)[0] |= 8u;
}
static void set_has_train_extremely_large_corpus(HasBits* has_bits) {
(*has_bits)[0] |= 262144u;
(*has_bits)[0] |= 524288u;
}
};
@ -465,6 +468,11 @@ TrainerSpec::TrainerSpec(const TrainerSpec& from)
pad_piece_.Set(::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr::NonEmptyDefault{}, from._internal_pad_piece(),
GetArena());
}
pretokenization_delimiter_.UnsafeSetDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited());
if (from._internal_has_pretokenization_delimiter()) {
pretokenization_delimiter_.Set(::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr::EmptyDefault{}, from._internal_pretokenization_delimiter(),
GetArena());
}
::memcpy(&self_test_sample_size_, &from.self_test_sample_size_,
static_cast<size_t>(reinterpret_cast<char*>(&pad_id_) -
reinterpret_cast<char*>(&self_test_sample_size_)) + sizeof(pad_id_));
@ -481,6 +489,7 @@ void TrainerSpec::SharedCtor() {
bos_piece_.UnsafeSetDefault(nullptr);
eos_piece_.UnsafeSetDefault(nullptr);
pad_piece_.UnsafeSetDefault(nullptr);
pretokenization_delimiter_.UnsafeSetDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited());
::memset(reinterpret_cast<char*>(this) + static_cast<size_t>(
reinterpret_cast<char*>(&self_test_sample_size_) - reinterpret_cast<char*>(this)),
0, static_cast<size_t>(reinterpret_cast<char*>(&differential_privacy_clipping_threshold_) -
@ -521,6 +530,7 @@ void TrainerSpec::SharedDtor() {
bos_piece_.DestroyNoArena(nullptr);
eos_piece_.DestroyNoArena(nullptr);
pad_piece_.DestroyNoArena(nullptr);
pretokenization_delimiter_.DestroyNoArena(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited());
}
void TrainerSpec::ArenaDtor(void* object) {
@ -576,19 +586,22 @@ void TrainerSpec::Clear() {
pad_piece_.ClearToDefault(::sentencepiece::TrainerSpec::_i_give_permission_to_break_this_code_default_pad_piece_, GetArena());
}
}
if (cached_has_bits & 0x0000ff00u) {
if (cached_has_bits & 0x00000100u) {
pretokenization_delimiter_.ClearNonDefaultToEmpty();
}
if (cached_has_bits & 0x0000fe00u) {
::memset(&self_test_sample_size_, 0, static_cast<size_t>(
reinterpret_cast<char*>(&split_digits_) -
reinterpret_cast<char*>(&self_test_sample_size_)) + sizeof(split_digits_));
reinterpret_cast<char*>(&allow_whitespace_only_pieces_) -
reinterpret_cast<char*>(&self_test_sample_size_)) + sizeof(allow_whitespace_only_pieces_));
}
if (cached_has_bits & 0x00ff0000u) {
::memset(&byte_fallback_, 0, static_cast<size_t>(
::memset(&split_digits_, 0, static_cast<size_t>(
reinterpret_cast<char*>(&differential_privacy_clipping_threshold_) -
reinterpret_cast<char*>(&byte_fallback_)) + sizeof(differential_privacy_clipping_threshold_));
reinterpret_cast<char*>(&split_digits_)) + sizeof(differential_privacy_clipping_threshold_));
model_type_ = 1;
vocab_size_ = 8000;
}
if (cached_has_bits & 0xff000000u) {
vocab_size_ = 8000;
character_coverage_ = 0.9995f;
seed_sentencepiece_size_ = 1000000;
shrinking_factor_ = 0.75f;
@ -596,10 +609,10 @@ void TrainerSpec::Clear() {
num_sub_iterations_ = 2;
max_sentence_length_ = 4192;
max_sentencepiece_length_ = 16;
shuffle_input_sentence_ = true;
}
cached_has_bits = _has_bits_[1];
if (cached_has_bits & 0x000000ffu) {
shuffle_input_sentence_ = true;
split_by_unicode_script_ = true;
split_by_number_ = true;
split_by_whitespace_ = true;
@ -607,8 +620,8 @@ void TrainerSpec::Clear() {
hard_vocab_limit_ = true;
bos_id_ = 1;
eos_id_ = 2;
pad_id_ = -1;
}
pad_id_ = -1;
_has_bits_.Clear();
_internal_metadata_.Clear<std::string>();
}
@ -996,6 +1009,14 @@ const char* TrainerSpec::_InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID
CHK_(ptr);
} else goto handle_unusual;
continue;
// optional string pretokenization_delimiter = 53 [default = ""];
case 53:
if (PROTOBUF_PREDICT_TRUE(static_cast<::PROTOBUF_NAMESPACE_ID::uint8>(tag) == 170)) {
auto str = _internal_mutable_pretokenization_delimiter();
ptr = ::PROTOBUF_NAMESPACE_ID::internal::InlineGreedyStringParser(str, ptr, ctx);
CHK_(ptr);
} else goto handle_unusual;
continue;
default: {
handle_unusual:
if ((tag & 7) == 4 || tag == 0) {
@ -1044,14 +1065,14 @@ failure:
}
// optional .sentencepiece.TrainerSpec.ModelType model_type = 3 [default = UNIGRAM];
if (cached_has_bits & 0x00400000u) {
if (cached_has_bits & 0x00800000u) {
target = stream->EnsureSpace(target);
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteEnumToArray(
3, this->_internal_model_type(), target);
}
// optional int32 vocab_size = 4 [default = 8000];
if (cached_has_bits & 0x00800000u) {
if (cached_has_bits & 0x01000000u) {
target = stream->EnsureSpace(target);
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteInt32ToArray(4, this->_internal_vocab_size(), target);
}
@ -1063,7 +1084,7 @@ failure:
}
// optional int32 self_test_sample_size = 6 [default = 0];
if (cached_has_bits & 0x00000100u) {
if (cached_has_bits & 0x00000200u) {
target = stream->EnsureSpace(target);
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteInt32ToArray(6, this->_internal_self_test_sample_size(), target);
}
@ -1075,105 +1096,107 @@ failure:
}
// optional float character_coverage = 10 [default = 0.9995];
if (cached_has_bits & 0x01000000u) {
if (cached_has_bits & 0x02000000u) {
target = stream->EnsureSpace(target);
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteFloatToArray(10, this->_internal_character_coverage(), target);
}
// optional uint64 input_sentence_size = 11 [default = 0];
if (cached_has_bits & 0x00000400u) {
if (cached_has_bits & 0x00000800u) {
target = stream->EnsureSpace(target);
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteUInt64ToArray(11, this->_internal_input_sentence_size(), target);
}
// optional int32 mining_sentence_size = 12 [deprecated = true];
if (cached_has_bits & 0x00000200u) {
if (cached_has_bits & 0x00000400u) {
target = stream->EnsureSpace(target);
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteInt32ToArray(12, this->_internal_mining_sentence_size(), target);
}
// optional int32 training_sentence_size = 13 [deprecated = true];
if (cached_has_bits & 0x00000800u) {
if (cached_has_bits & 0x00001000u) {
target = stream->EnsureSpace(target);
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteInt32ToArray(13, this->_internal_training_sentence_size(), target);
}
// optional int32 seed_sentencepiece_size = 14 [default = 1000000];
if (cached_has_bits & 0x02000000u) {
if (cached_has_bits & 0x04000000u) {
target = stream->EnsureSpace(target);
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteInt32ToArray(14, this->_internal_seed_sentencepiece_size(), target);
}
// optional float shrinking_factor = 15 [default = 0.75];
if (cached_has_bits & 0x04000000u) {
if (cached_has_bits & 0x08000000u) {
target = stream->EnsureSpace(target);
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteFloatToArray(15, this->_internal_shrinking_factor(), target);
}
// optional int32 num_threads = 16 [default = 16];
if (cached_has_bits & 0x08000000u) {
if (cached_has_bits & 0x10000000u) {
target = stream->EnsureSpace(target);
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteInt32ToArray(16, this->_internal_num_threads(), target);
}
// optional int32 num_sub_iterations = 17 [default = 2];
if (cached_has_bits & 0x10000000u) {
if (cached_has_bits & 0x20000000u) {
target = stream->EnsureSpace(target);
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteInt32ToArray(17, this->_internal_num_sub_iterations(), target);
}
// optional int32 max_sentence_length = 18 [default = 4192];
if (cached_has_bits & 0x20000000u) {
if (cached_has_bits & 0x40000000u) {
target = stream->EnsureSpace(target);
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteInt32ToArray(18, this->_internal_max_sentence_length(), target);
}
cached_has_bits = _has_bits_[1];
// optional bool shuffle_input_sentence = 19 [default = true];
if (cached_has_bits & 0x80000000u) {
if (cached_has_bits & 0x00000001u) {
target = stream->EnsureSpace(target);
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteBoolToArray(19, this->_internal_shuffle_input_sentence(), target);
}
cached_has_bits = _has_bits_[0];
// optional int32 max_sentencepiece_length = 20 [default = 16];
if (cached_has_bits & 0x40000000u) {
if (cached_has_bits & 0x80000000u) {
target = stream->EnsureSpace(target);
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteInt32ToArray(20, this->_internal_max_sentencepiece_length(), target);
}
cached_has_bits = _has_bits_[1];
// optional bool split_by_unicode_script = 21 [default = true];
if (cached_has_bits & 0x00000001u) {
if (cached_has_bits & 0x00000002u) {
target = stream->EnsureSpace(target);
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteBoolToArray(21, this->_internal_split_by_unicode_script(), target);
}
// optional bool split_by_whitespace = 22 [default = true];
if (cached_has_bits & 0x00000004u) {
if (cached_has_bits & 0x00000008u) {
target = stream->EnsureSpace(target);
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteBoolToArray(22, this->_internal_split_by_whitespace(), target);
}
// optional bool split_by_number = 23 [default = true];
if (cached_has_bits & 0x00000002u) {
if (cached_has_bits & 0x00000004u) {
target = stream->EnsureSpace(target);
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteBoolToArray(23, this->_internal_split_by_number(), target);
}
cached_has_bits = _has_bits_[0];
// optional bool treat_whitespace_as_suffix = 24 [default = false];
if (cached_has_bits & 0x00002000u) {
if (cached_has_bits & 0x00004000u) {
target = stream->EnsureSpace(target);
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteBoolToArray(24, this->_internal_treat_whitespace_as_suffix(), target);
}
// optional bool split_digits = 25 [default = false];
if (cached_has_bits & 0x00008000u) {
if (cached_has_bits & 0x00010000u) {
target = stream->EnsureSpace(target);
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteBoolToArray(25, this->_internal_split_digits(), target);
}
// optional bool allow_whitespace_only_pieces = 26 [default = false];
if (cached_has_bits & 0x00004000u) {
if (cached_has_bits & 0x00008000u) {
target = stream->EnsureSpace(target);
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteBoolToArray(26, this->_internal_allow_whitespace_only_pieces(), target);
}
@ -1192,26 +1215,26 @@ failure:
cached_has_bits = _has_bits_[1];
// optional bool vocabulary_output_piece_score = 32 [default = true];
if (cached_has_bits & 0x00000008u) {
if (cached_has_bits & 0x00000010u) {
target = stream->EnsureSpace(target);
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteBoolToArray(32, this->_internal_vocabulary_output_piece_score(), target);
}
// optional bool hard_vocab_limit = 33 [default = true];
if (cached_has_bits & 0x00000010u) {
if (cached_has_bits & 0x00000020u) {
target = stream->EnsureSpace(target);
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteBoolToArray(33, this->_internal_hard_vocab_limit(), target);
}
cached_has_bits = _has_bits_[0];
// optional bool use_all_vocab = 34 [default = false];
if (cached_has_bits & 0x00020000u) {
if (cached_has_bits & 0x00040000u) {
target = stream->EnsureSpace(target);
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteBoolToArray(34, this->_internal_use_all_vocab(), target);
}
// optional bool byte_fallback = 35 [default = false];
if (cached_has_bits & 0x00010000u) {
if (cached_has_bits & 0x00020000u) {
target = stream->EnsureSpace(target);
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteBoolToArray(35, this->_internal_byte_fallback(), target);
}
@ -1223,26 +1246,26 @@ failure:
}
// optional int32 unk_id = 40 [default = 0];
if (cached_has_bits & 0x00080000u) {
if (cached_has_bits & 0x00100000u) {
target = stream->EnsureSpace(target);
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteInt32ToArray(40, this->_internal_unk_id(), target);
}
cached_has_bits = _has_bits_[1];
// optional int32 bos_id = 41 [default = 1];
if (cached_has_bits & 0x00000020u) {
if (cached_has_bits & 0x00000040u) {
target = stream->EnsureSpace(target);
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteInt32ToArray(41, this->_internal_bos_id(), target);
}
// optional int32 eos_id = 42 [default = 2];
if (cached_has_bits & 0x00000040u) {
if (cached_has_bits & 0x00000080u) {
target = stream->EnsureSpace(target);
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteInt32ToArray(42, this->_internal_eos_id(), target);
}
// optional int32 pad_id = 43 [default = -1];
if (cached_has_bits & 0x00000080u) {
if (cached_has_bits & 0x00000100u) {
target = stream->EnsureSpace(target);
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteInt32ToArray(43, this->_internal_pad_id(), target);
}
@ -1279,29 +1302,35 @@ failure:
}
// optional bool train_extremely_large_corpus = 49 [default = false];
if (cached_has_bits & 0x00040000u) {
if (cached_has_bits & 0x00080000u) {
target = stream->EnsureSpace(target);
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteBoolToArray(49, this->_internal_train_extremely_large_corpus(), target);
}
// optional bool enable_differential_privacy = 50 [default = false];
if (cached_has_bits & 0x00001000u) {
if (cached_has_bits & 0x00002000u) {
target = stream->EnsureSpace(target);
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteBoolToArray(50, this->_internal_enable_differential_privacy(), target);
}
// optional float differential_privacy_noise_level = 51 [default = 0];
if (cached_has_bits & 0x00100000u) {
if (cached_has_bits & 0x00200000u) {
target = stream->EnsureSpace(target);
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteFloatToArray(51, this->_internal_differential_privacy_noise_level(), target);
}
// optional uint64 differential_privacy_clipping_threshold = 52 [default = 0];
if (cached_has_bits & 0x00200000u) {
if (cached_has_bits & 0x00400000u) {
target = stream->EnsureSpace(target);
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteUInt64ToArray(52, this->_internal_differential_privacy_clipping_threshold(), target);
}
// optional string pretokenization_delimiter = 53 [default = ""];
if (cached_has_bits & 0x00000100u) {
target = stream->WriteStringMaybeAliased(
53, this->_internal_pretokenization_delimiter(), target);
}
// Extension range [200, 536870912)
target = _extensions_._InternalSerialize(
200, 536870912, target, stream);
@ -1416,205 +1445,212 @@ size_t TrainerSpec::ByteSizeLong() const {
}
if (cached_has_bits & 0x0000ff00u) {
// optional int32 self_test_sample_size = 6 [default = 0];
// optional string pretokenization_delimiter = 53 [default = ""];
if (cached_has_bits & 0x00000100u) {
total_size += 2 +
::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::StringSize(
this->_internal_pretokenization_delimiter());
}
// optional int32 self_test_sample_size = 6 [default = 0];
if (cached_has_bits & 0x00000200u) {
total_size += 1 +
::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::Int32Size(
this->_internal_self_test_sample_size());
}
// optional int32 mining_sentence_size = 12 [deprecated = true];
if (cached_has_bits & 0x00000200u) {
if (cached_has_bits & 0x00000400u) {
total_size += 1 +
::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::Int32Size(
this->_internal_mining_sentence_size());
}
// optional uint64 input_sentence_size = 11 [default = 0];
if (cached_has_bits & 0x00000400u) {
if (cached_has_bits & 0x00000800u) {
total_size += 1 +
::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::UInt64Size(
this->_internal_input_sentence_size());
}
// optional int32 training_sentence_size = 13 [deprecated = true];
if (cached_has_bits & 0x00000800u) {
if (cached_has_bits & 0x00001000u) {
total_size += 1 +
::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::Int32Size(
this->_internal_training_sentence_size());
}
// optional bool enable_differential_privacy = 50 [default = false];
if (cached_has_bits & 0x00001000u) {
total_size += 2 + 1;
}
// optional bool treat_whitespace_as_suffix = 24 [default = false];
if (cached_has_bits & 0x00002000u) {
total_size += 2 + 1;
}
// optional bool allow_whitespace_only_pieces = 26 [default = false];
// optional bool treat_whitespace_as_suffix = 24 [default = false];
if (cached_has_bits & 0x00004000u) {
total_size += 2 + 1;
}
// optional bool split_digits = 25 [default = false];
// optional bool allow_whitespace_only_pieces = 26 [default = false];
if (cached_has_bits & 0x00008000u) {
total_size += 2 + 1;
}
}
if (cached_has_bits & 0x00ff0000u) {
// optional bool byte_fallback = 35 [default = false];
// optional bool split_digits = 25 [default = false];
if (cached_has_bits & 0x00010000u) {
total_size += 2 + 1;
}
// optional bool use_all_vocab = 34 [default = false];
// optional bool byte_fallback = 35 [default = false];
if (cached_has_bits & 0x00020000u) {
total_size += 2 + 1;
}
// optional bool train_extremely_large_corpus = 49 [default = false];
// optional bool use_all_vocab = 34 [default = false];
if (cached_has_bits & 0x00040000u) {
total_size += 2 + 1;
}
// optional int32 unk_id = 40 [default = 0];
// optional bool train_extremely_large_corpus = 49 [default = false];
if (cached_has_bits & 0x00080000u) {
total_size += 2 + 1;
}
// optional int32 unk_id = 40 [default = 0];
if (cached_has_bits & 0x00100000u) {
total_size += 2 +
::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::Int32Size(
this->_internal_unk_id());
}
// optional float differential_privacy_noise_level = 51 [default = 0];
if (cached_has_bits & 0x00100000u) {
if (cached_has_bits & 0x00200000u) {
total_size += 2 + 4;
}
// optional uint64 differential_privacy_clipping_threshold = 52 [default = 0];
if (cached_has_bits & 0x00200000u) {
if (cached_has_bits & 0x00400000u) {
total_size += 2 +
::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::UInt64Size(
this->_internal_differential_privacy_clipping_threshold());
}
// optional .sentencepiece.TrainerSpec.ModelType model_type = 3 [default = UNIGRAM];
if (cached_has_bits & 0x00400000u) {
if (cached_has_bits & 0x00800000u) {
total_size += 1 +
::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::EnumSize(this->_internal_model_type());
}
}
if (cached_has_bits & 0xff000000u) {
// optional int32 vocab_size = 4 [default = 8000];
if (cached_has_bits & 0x00800000u) {
if (cached_has_bits & 0x01000000u) {
total_size += 1 +
::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::Int32Size(
this->_internal_vocab_size());
}
}
if (cached_has_bits & 0xff000000u) {
// optional float character_coverage = 10 [default = 0.9995];
if (cached_has_bits & 0x01000000u) {
if (cached_has_bits & 0x02000000u) {
total_size += 1 + 4;
}
// optional int32 seed_sentencepiece_size = 14 [default = 1000000];
if (cached_has_bits & 0x02000000u) {
if (cached_has_bits & 0x04000000u) {
total_size += 1 +
::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::Int32Size(
this->_internal_seed_sentencepiece_size());
}
// optional float shrinking_factor = 15 [default = 0.75];
if (cached_has_bits & 0x04000000u) {
if (cached_has_bits & 0x08000000u) {
total_size += 1 + 4;
}
// optional int32 num_threads = 16 [default = 16];
if (cached_has_bits & 0x08000000u) {
if (cached_has_bits & 0x10000000u) {
total_size += 2 +
::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::Int32Size(
this->_internal_num_threads());
}
// optional int32 num_sub_iterations = 17 [default = 2];
if (cached_has_bits & 0x10000000u) {
if (cached_has_bits & 0x20000000u) {
total_size += 2 +
::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::Int32Size(
this->_internal_num_sub_iterations());
}
// optional int32 max_sentence_length = 18 [default = 4192];
if (cached_has_bits & 0x20000000u) {
if (cached_has_bits & 0x40000000u) {
total_size += 2 +
::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::Int32Size(
this->_internal_max_sentence_length());
}
// optional int32 max_sentencepiece_length = 20 [default = 16];
if (cached_has_bits & 0x40000000u) {
if (cached_has_bits & 0x80000000u) {
total_size += 2 +
::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::Int32Size(
this->_internal_max_sentencepiece_length());
}
// optional bool shuffle_input_sentence = 19 [default = true];
if (cached_has_bits & 0x80000000u) {
total_size += 2 + 1;
}
}
cached_has_bits = _has_bits_[1];
if (cached_has_bits & 0x000000ffu) {
// optional bool split_by_unicode_script = 21 [default = true];
// optional bool shuffle_input_sentence = 19 [default = true];
if (cached_has_bits & 0x00000001u) {
total_size += 2 + 1;
}
// optional bool split_by_number = 23 [default = true];
// optional bool split_by_unicode_script = 21 [default = true];
if (cached_has_bits & 0x00000002u) {
total_size += 2 + 1;
}
// optional bool split_by_whitespace = 22 [default = true];
// optional bool split_by_number = 23 [default = true];
if (cached_has_bits & 0x00000004u) {
total_size += 2 + 1;
}
// optional bool vocabulary_output_piece_score = 32 [default = true];
// optional bool split_by_whitespace = 22 [default = true];
if (cached_has_bits & 0x00000008u) {
total_size += 2 + 1;
}
// optional bool hard_vocab_limit = 33 [default = true];
// optional bool vocabulary_output_piece_score = 32 [default = true];
if (cached_has_bits & 0x00000010u) {
total_size += 2 + 1;
}
// optional int32 bos_id = 41 [default = 1];
// optional bool hard_vocab_limit = 33 [default = true];
if (cached_has_bits & 0x00000020u) {
total_size += 2 + 1;
}
// optional int32 bos_id = 41 [default = 1];
if (cached_has_bits & 0x00000040u) {
total_size += 2 +
::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::Int32Size(
this->_internal_bos_id());
}
// optional int32 eos_id = 42 [default = 2];
if (cached_has_bits & 0x00000040u) {
if (cached_has_bits & 0x00000080u) {
total_size += 2 +
::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::Int32Size(
this->_internal_eos_id());
}
// optional int32 pad_id = 43 [default = -1];
if (cached_has_bits & 0x00000080u) {
total_size += 2 +
::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::Int32Size(
this->_internal_pad_id());
}
}
// optional int32 pad_id = 43 [default = -1];
if (cached_has_bits & 0x00000100u) {
total_size += 2 +
::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::Int32Size(
this->_internal_pad_id());
}
if (PROTOBUF_PREDICT_FALSE(_internal_metadata_.have_unknown_fields())) {
total_size += _internal_metadata_.unknown_fields<std::string>(::PROTOBUF_NAMESPACE_ID::internal::GetEmptyString).size();
}
@ -1670,113 +1706,116 @@ void TrainerSpec::MergeFrom(const TrainerSpec& from) {
}
if (cached_has_bits & 0x0000ff00u) {
if (cached_has_bits & 0x00000100u) {
self_test_sample_size_ = from.self_test_sample_size_;
_internal_set_pretokenization_delimiter(from._internal_pretokenization_delimiter());
}
if (cached_has_bits & 0x00000200u) {
mining_sentence_size_ = from.mining_sentence_size_;
self_test_sample_size_ = from.self_test_sample_size_;
}
if (cached_has_bits & 0x00000400u) {
input_sentence_size_ = from.input_sentence_size_;
mining_sentence_size_ = from.mining_sentence_size_;
}
if (cached_has_bits & 0x00000800u) {
training_sentence_size_ = from.training_sentence_size_;
input_sentence_size_ = from.input_sentence_size_;
}
if (cached_has_bits & 0x00001000u) {
enable_differential_privacy_ = from.enable_differential_privacy_;
training_sentence_size_ = from.training_sentence_size_;
}
if (cached_has_bits & 0x00002000u) {
treat_whitespace_as_suffix_ = from.treat_whitespace_as_suffix_;
enable_differential_privacy_ = from.enable_differential_privacy_;
}
if (cached_has_bits & 0x00004000u) {
allow_whitespace_only_pieces_ = from.allow_whitespace_only_pieces_;
treat_whitespace_as_suffix_ = from.treat_whitespace_as_suffix_;
}
if (cached_has_bits & 0x00008000u) {
split_digits_ = from.split_digits_;
allow_whitespace_only_pieces_ = from.allow_whitespace_only_pieces_;
}
_has_bits_[0] |= cached_has_bits;
}
if (cached_has_bits & 0x00ff0000u) {
if (cached_has_bits & 0x00010000u) {
byte_fallback_ = from.byte_fallback_;
split_digits_ = from.split_digits_;
}
if (cached_has_bits & 0x00020000u) {
use_all_vocab_ = from.use_all_vocab_;
byte_fallback_ = from.byte_fallback_;
}
if (cached_has_bits & 0x00040000u) {
train_extremely_large_corpus_ = from.train_extremely_large_corpus_;
use_all_vocab_ = from.use_all_vocab_;
}
if (cached_has_bits & 0x00080000u) {
unk_id_ = from.unk_id_;
train_extremely_large_corpus_ = from.train_extremely_large_corpus_;
}
if (cached_has_bits & 0x00100000u) {
differential_privacy_noise_level_ = from.differential_privacy_noise_level_;
unk_id_ = from.unk_id_;
}
if (cached_has_bits & 0x00200000u) {
differential_privacy_clipping_threshold_ = from.differential_privacy_clipping_threshold_;
differential_privacy_noise_level_ = from.differential_privacy_noise_level_;
}
if (cached_has_bits & 0x00400000u) {
model_type_ = from.model_type_;
differential_privacy_clipping_threshold_ = from.differential_privacy_clipping_threshold_;
}
if (cached_has_bits & 0x00800000u) {
vocab_size_ = from.vocab_size_;
model_type_ = from.model_type_;
}
_has_bits_[0] |= cached_has_bits;
}
if (cached_has_bits & 0xff000000u) {
if (cached_has_bits & 0x01000000u) {
character_coverage_ = from.character_coverage_;
vocab_size_ = from.vocab_size_;
}
if (cached_has_bits & 0x02000000u) {
seed_sentencepiece_size_ = from.seed_sentencepiece_size_;
character_coverage_ = from.character_coverage_;
}
if (cached_has_bits & 0x04000000u) {
shrinking_factor_ = from.shrinking_factor_;
seed_sentencepiece_size_ = from.seed_sentencepiece_size_;
}
if (cached_has_bits & 0x08000000u) {
num_threads_ = from.num_threads_;
shrinking_factor_ = from.shrinking_factor_;
}
if (cached_has_bits & 0x10000000u) {
num_sub_iterations_ = from.num_sub_iterations_;
num_threads_ = from.num_threads_;
}
if (cached_has_bits & 0x20000000u) {
max_sentence_length_ = from.max_sentence_length_;
num_sub_iterations_ = from.num_sub_iterations_;
}
if (cached_has_bits & 0x40000000u) {
max_sentencepiece_length_ = from.max_sentencepiece_length_;
max_sentence_length_ = from.max_sentence_length_;
}
if (cached_has_bits & 0x80000000u) {
shuffle_input_sentence_ = from.shuffle_input_sentence_;
max_sentencepiece_length_ = from.max_sentencepiece_length_;
}
_has_bits_[0] |= cached_has_bits;
}
cached_has_bits = from._has_bits_[1];
if (cached_has_bits & 0x000000ffu) {
if (cached_has_bits & 0x00000001u) {
split_by_unicode_script_ = from.split_by_unicode_script_;
shuffle_input_sentence_ = from.shuffle_input_sentence_;
}
if (cached_has_bits & 0x00000002u) {
split_by_number_ = from.split_by_number_;
split_by_unicode_script_ = from.split_by_unicode_script_;
}
if (cached_has_bits & 0x00000004u) {
split_by_whitespace_ = from.split_by_whitespace_;
split_by_number_ = from.split_by_number_;
}
if (cached_has_bits & 0x00000008u) {
vocabulary_output_piece_score_ = from.vocabulary_output_piece_score_;
split_by_whitespace_ = from.split_by_whitespace_;
}
if (cached_has_bits & 0x00000010u) {
hard_vocab_limit_ = from.hard_vocab_limit_;
vocabulary_output_piece_score_ = from.vocabulary_output_piece_score_;
}
if (cached_has_bits & 0x00000020u) {
bos_id_ = from.bos_id_;
hard_vocab_limit_ = from.hard_vocab_limit_;
}
if (cached_has_bits & 0x00000040u) {
eos_id_ = from.eos_id_;
bos_id_ = from.bos_id_;
}
if (cached_has_bits & 0x00000080u) {
pad_id_ = from.pad_id_;
eos_id_ = from.eos_id_;
}
_has_bits_[1] |= cached_has_bits;
}
if (cached_has_bits & 0x00000100u) {
_internal_set_pad_id(from._internal_pad_id());
}
}
void TrainerSpec::CopyFrom(const TrainerSpec& from) {
@ -1812,6 +1851,7 @@ void TrainerSpec::InternalSwap(TrainerSpec* other) {
bos_piece_.Swap(&other->bos_piece_, nullptr, GetArena());
eos_piece_.Swap(&other->eos_piece_, nullptr, GetArena());
pad_piece_.Swap(&other->pad_piece_, nullptr, GetArena());
pretokenization_delimiter_.Swap(&other->pretokenization_delimiter_, &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
::PROTOBUF_NAMESPACE_ID::internal::memswap<
PROTOBUF_FIELD_OFFSET(TrainerSpec, differential_privacy_clipping_threshold_)
+ sizeof(TrainerSpec::differential_privacy_clipping_threshold_)

View File

@ -273,6 +273,7 @@ class TrainerSpec PROTOBUF_FINAL :
kBosPieceFieldNumber = 46,
kEosPieceFieldNumber = 47,
kPadPieceFieldNumber = 48,
kPretokenizationDelimiterFieldNumber = 53,
kSelfTestSampleSizeFieldNumber = 6,
kMiningSentenceSizeFieldNumber = 12,
kInputSentenceSizeFieldNumber = 11,
@ -562,6 +563,26 @@ class TrainerSpec PROTOBUF_FINAL :
std::string* _internal_mutable_pad_piece();
public:
// optional string pretokenization_delimiter = 53 [default = ""];
bool has_pretokenization_delimiter() const;
private:
bool _internal_has_pretokenization_delimiter() const;
public:
void clear_pretokenization_delimiter();
const std::string& pretokenization_delimiter() const;
void set_pretokenization_delimiter(const std::string& value);
void set_pretokenization_delimiter(std::string&& value);
void set_pretokenization_delimiter(const char* value);
void set_pretokenization_delimiter(const char* value, size_t size);
std::string* mutable_pretokenization_delimiter();
std::string* release_pretokenization_delimiter();
void set_allocated_pretokenization_delimiter(std::string* pretokenization_delimiter);
private:
const std::string& _internal_pretokenization_delimiter() const;
void _internal_set_pretokenization_delimiter(const std::string& value);
std::string* _internal_mutable_pretokenization_delimiter();
public:
// optional int32 self_test_sample_size = 6 [default = 0];
bool has_self_test_sample_size() const;
private:
@ -1007,6 +1028,7 @@ class TrainerSpec PROTOBUF_FINAL :
::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr eos_piece_;
static const ::PROTOBUF_NAMESPACE_ID::internal::LazyString _i_give_permission_to_break_this_code_default_pad_piece_;
::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr pad_piece_;
::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr pretokenization_delimiter_;
::PROTOBUF_NAMESPACE_ID::int32 self_test_sample_size_;
::PROTOBUF_NAMESPACE_ID::int32 mining_sentence_size_;
::PROTOBUF_NAMESPACE_ID::uint64 input_sentence_size_;
@ -2240,7 +2262,7 @@ inline void TrainerSpec::set_allocated_model_prefix(std::string* model_prefix) {
// optional .sentencepiece.TrainerSpec.ModelType model_type = 3 [default = UNIGRAM];
inline bool TrainerSpec::_internal_has_model_type() const {
bool value = (_has_bits_[0] & 0x00400000u) != 0;
bool value = (_has_bits_[0] & 0x00800000u) != 0;
return value;
}
inline bool TrainerSpec::has_model_type() const {
@ -2248,7 +2270,7 @@ inline bool TrainerSpec::has_model_type() const {
}
inline void TrainerSpec::clear_model_type() {
model_type_ = 1;
_has_bits_[0] &= ~0x00400000u;
_has_bits_[0] &= ~0x00800000u;
}
inline ::sentencepiece::TrainerSpec_ModelType TrainerSpec::_internal_model_type() const {
return static_cast< ::sentencepiece::TrainerSpec_ModelType >(model_type_);
@ -2259,7 +2281,7 @@ inline ::sentencepiece::TrainerSpec_ModelType TrainerSpec::model_type() const {
}
inline void TrainerSpec::_internal_set_model_type(::sentencepiece::TrainerSpec_ModelType value) {
assert(::sentencepiece::TrainerSpec_ModelType_IsValid(value));
_has_bits_[0] |= 0x00400000u;
_has_bits_[0] |= 0x00800000u;
model_type_ = value;
}
inline void TrainerSpec::set_model_type(::sentencepiece::TrainerSpec_ModelType value) {
@ -2269,7 +2291,7 @@ inline void TrainerSpec::set_model_type(::sentencepiece::TrainerSpec_ModelType v
// optional int32 vocab_size = 4 [default = 8000];
inline bool TrainerSpec::_internal_has_vocab_size() const {
bool value = (_has_bits_[0] & 0x00800000u) != 0;
bool value = (_has_bits_[0] & 0x01000000u) != 0;
return value;
}
inline bool TrainerSpec::has_vocab_size() const {
@ -2277,7 +2299,7 @@ inline bool TrainerSpec::has_vocab_size() const {
}
inline void TrainerSpec::clear_vocab_size() {
vocab_size_ = 8000;
_has_bits_[0] &= ~0x00800000u;
_has_bits_[0] &= ~0x01000000u;
}
inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::_internal_vocab_size() const {
return vocab_size_;
@ -2287,7 +2309,7 @@ inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::vocab_size() const {
return _internal_vocab_size();
}
inline void TrainerSpec::_internal_set_vocab_size(::PROTOBUF_NAMESPACE_ID::int32 value) {
_has_bits_[0] |= 0x00800000u;
_has_bits_[0] |= 0x01000000u;
vocab_size_ = value;
}
inline void TrainerSpec::set_vocab_size(::PROTOBUF_NAMESPACE_ID::int32 value) {
@ -2371,7 +2393,7 @@ TrainerSpec::mutable_accept_language() {
// optional int32 self_test_sample_size = 6 [default = 0];
inline bool TrainerSpec::_internal_has_self_test_sample_size() const {
bool value = (_has_bits_[0] & 0x00000100u) != 0;
bool value = (_has_bits_[0] & 0x00000200u) != 0;
return value;
}
inline bool TrainerSpec::has_self_test_sample_size() const {
@ -2379,7 +2401,7 @@ inline bool TrainerSpec::has_self_test_sample_size() const {
}
inline void TrainerSpec::clear_self_test_sample_size() {
self_test_sample_size_ = 0;
_has_bits_[0] &= ~0x00000100u;
_has_bits_[0] &= ~0x00000200u;
}
inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::_internal_self_test_sample_size() const {
return self_test_sample_size_;
@ -2389,7 +2411,7 @@ inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::self_test_sample_size() const
return _internal_self_test_sample_size();
}
inline void TrainerSpec::_internal_set_self_test_sample_size(::PROTOBUF_NAMESPACE_ID::int32 value) {
_has_bits_[0] |= 0x00000100u;
_has_bits_[0] |= 0x00000200u;
self_test_sample_size_ = value;
}
inline void TrainerSpec::set_self_test_sample_size(::PROTOBUF_NAMESPACE_ID::int32 value) {
@ -2399,7 +2421,7 @@ inline void TrainerSpec::set_self_test_sample_size(::PROTOBUF_NAMESPACE_ID::int3
// optional bool enable_differential_privacy = 50 [default = false];
inline bool TrainerSpec::_internal_has_enable_differential_privacy() const {
bool value = (_has_bits_[0] & 0x00001000u) != 0;
bool value = (_has_bits_[0] & 0x00002000u) != 0;
return value;
}
inline bool TrainerSpec::has_enable_differential_privacy() const {
@ -2407,7 +2429,7 @@ inline bool TrainerSpec::has_enable_differential_privacy() const {
}
inline void TrainerSpec::clear_enable_differential_privacy() {
enable_differential_privacy_ = false;
_has_bits_[0] &= ~0x00001000u;
_has_bits_[0] &= ~0x00002000u;
}
inline bool TrainerSpec::_internal_enable_differential_privacy() const {
return enable_differential_privacy_;
@ -2417,7 +2439,7 @@ inline bool TrainerSpec::enable_differential_privacy() const {
return _internal_enable_differential_privacy();
}
inline void TrainerSpec::_internal_set_enable_differential_privacy(bool value) {
_has_bits_[0] |= 0x00001000u;
_has_bits_[0] |= 0x00002000u;
enable_differential_privacy_ = value;
}
inline void TrainerSpec::set_enable_differential_privacy(bool value) {
@ -2427,7 +2449,7 @@ inline void TrainerSpec::set_enable_differential_privacy(bool value) {
// optional float differential_privacy_noise_level = 51 [default = 0];
inline bool TrainerSpec::_internal_has_differential_privacy_noise_level() const {
bool value = (_has_bits_[0] & 0x00100000u) != 0;
bool value = (_has_bits_[0] & 0x00200000u) != 0;
return value;
}
inline bool TrainerSpec::has_differential_privacy_noise_level() const {
@ -2435,7 +2457,7 @@ inline bool TrainerSpec::has_differential_privacy_noise_level() const {
}
inline void TrainerSpec::clear_differential_privacy_noise_level() {
differential_privacy_noise_level_ = 0;
_has_bits_[0] &= ~0x00100000u;
_has_bits_[0] &= ~0x00200000u;
}
inline float TrainerSpec::_internal_differential_privacy_noise_level() const {
return differential_privacy_noise_level_;
@ -2445,7 +2467,7 @@ inline float TrainerSpec::differential_privacy_noise_level() const {
return _internal_differential_privacy_noise_level();
}
inline void TrainerSpec::_internal_set_differential_privacy_noise_level(float value) {
_has_bits_[0] |= 0x00100000u;
_has_bits_[0] |= 0x00200000u;
differential_privacy_noise_level_ = value;
}
inline void TrainerSpec::set_differential_privacy_noise_level(float value) {
@ -2455,7 +2477,7 @@ inline void TrainerSpec::set_differential_privacy_noise_level(float value) {
// optional uint64 differential_privacy_clipping_threshold = 52 [default = 0];
inline bool TrainerSpec::_internal_has_differential_privacy_clipping_threshold() const {
bool value = (_has_bits_[0] & 0x00200000u) != 0;
bool value = (_has_bits_[0] & 0x00400000u) != 0;
return value;
}
inline bool TrainerSpec::has_differential_privacy_clipping_threshold() const {
@ -2463,7 +2485,7 @@ inline bool TrainerSpec::has_differential_privacy_clipping_threshold() const {
}
inline void TrainerSpec::clear_differential_privacy_clipping_threshold() {
differential_privacy_clipping_threshold_ = PROTOBUF_ULONGLONG(0);
_has_bits_[0] &= ~0x00200000u;
_has_bits_[0] &= ~0x00400000u;
}
inline ::PROTOBUF_NAMESPACE_ID::uint64 TrainerSpec::_internal_differential_privacy_clipping_threshold() const {
return differential_privacy_clipping_threshold_;
@ -2473,7 +2495,7 @@ inline ::PROTOBUF_NAMESPACE_ID::uint64 TrainerSpec::differential_privacy_clippin
return _internal_differential_privacy_clipping_threshold();
}
inline void TrainerSpec::_internal_set_differential_privacy_clipping_threshold(::PROTOBUF_NAMESPACE_ID::uint64 value) {
_has_bits_[0] |= 0x00200000u;
_has_bits_[0] |= 0x00400000u;
differential_privacy_clipping_threshold_ = value;
}
inline void TrainerSpec::set_differential_privacy_clipping_threshold(::PROTOBUF_NAMESPACE_ID::uint64 value) {
@ -2483,7 +2505,7 @@ inline void TrainerSpec::set_differential_privacy_clipping_threshold(::PROTOBUF_
// optional float character_coverage = 10 [default = 0.9995];
inline bool TrainerSpec::_internal_has_character_coverage() const {
bool value = (_has_bits_[0] & 0x01000000u) != 0;
bool value = (_has_bits_[0] & 0x02000000u) != 0;
return value;
}
inline bool TrainerSpec::has_character_coverage() const {
@ -2491,7 +2513,7 @@ inline bool TrainerSpec::has_character_coverage() const {
}
inline void TrainerSpec::clear_character_coverage() {
character_coverage_ = 0.9995f;
_has_bits_[0] &= ~0x01000000u;
_has_bits_[0] &= ~0x02000000u;
}
inline float TrainerSpec::_internal_character_coverage() const {
return character_coverage_;
@ -2501,7 +2523,7 @@ inline float TrainerSpec::character_coverage() const {
return _internal_character_coverage();
}
inline void TrainerSpec::_internal_set_character_coverage(float value) {
_has_bits_[0] |= 0x01000000u;
_has_bits_[0] |= 0x02000000u;
character_coverage_ = value;
}
inline void TrainerSpec::set_character_coverage(float value) {
@ -2511,7 +2533,7 @@ inline void TrainerSpec::set_character_coverage(float value) {
// optional uint64 input_sentence_size = 11 [default = 0];
inline bool TrainerSpec::_internal_has_input_sentence_size() const {
bool value = (_has_bits_[0] & 0x00000400u) != 0;
bool value = (_has_bits_[0] & 0x00000800u) != 0;
return value;
}
inline bool TrainerSpec::has_input_sentence_size() const {
@ -2519,7 +2541,7 @@ inline bool TrainerSpec::has_input_sentence_size() const {
}
inline void TrainerSpec::clear_input_sentence_size() {
input_sentence_size_ = PROTOBUF_ULONGLONG(0);
_has_bits_[0] &= ~0x00000400u;
_has_bits_[0] &= ~0x00000800u;
}
inline ::PROTOBUF_NAMESPACE_ID::uint64 TrainerSpec::_internal_input_sentence_size() const {
return input_sentence_size_;
@ -2529,7 +2551,7 @@ inline ::PROTOBUF_NAMESPACE_ID::uint64 TrainerSpec::input_sentence_size() const
return _internal_input_sentence_size();
}
inline void TrainerSpec::_internal_set_input_sentence_size(::PROTOBUF_NAMESPACE_ID::uint64 value) {
_has_bits_[0] |= 0x00000400u;
_has_bits_[0] |= 0x00000800u;
input_sentence_size_ = value;
}
inline void TrainerSpec::set_input_sentence_size(::PROTOBUF_NAMESPACE_ID::uint64 value) {
@ -2539,7 +2561,7 @@ inline void TrainerSpec::set_input_sentence_size(::PROTOBUF_NAMESPACE_ID::uint64
// optional bool shuffle_input_sentence = 19 [default = true];
inline bool TrainerSpec::_internal_has_shuffle_input_sentence() const {
bool value = (_has_bits_[0] & 0x80000000u) != 0;
bool value = (_has_bits_[1] & 0x00000001u) != 0;
return value;
}
inline bool TrainerSpec::has_shuffle_input_sentence() const {
@ -2547,7 +2569,7 @@ inline bool TrainerSpec::has_shuffle_input_sentence() const {
}
inline void TrainerSpec::clear_shuffle_input_sentence() {
shuffle_input_sentence_ = true;
_has_bits_[0] &= ~0x80000000u;
_has_bits_[1] &= ~0x00000001u;
}
inline bool TrainerSpec::_internal_shuffle_input_sentence() const {
return shuffle_input_sentence_;
@ -2557,7 +2579,7 @@ inline bool TrainerSpec::shuffle_input_sentence() const {
return _internal_shuffle_input_sentence();
}
inline void TrainerSpec::_internal_set_shuffle_input_sentence(bool value) {
_has_bits_[0] |= 0x80000000u;
_has_bits_[1] |= 0x00000001u;
shuffle_input_sentence_ = value;
}
inline void TrainerSpec::set_shuffle_input_sentence(bool value) {
@ -2567,7 +2589,7 @@ inline void TrainerSpec::set_shuffle_input_sentence(bool value) {
// optional int32 mining_sentence_size = 12 [deprecated = true];
inline bool TrainerSpec::_internal_has_mining_sentence_size() const {
bool value = (_has_bits_[0] & 0x00000200u) != 0;
bool value = (_has_bits_[0] & 0x00000400u) != 0;
return value;
}
inline bool TrainerSpec::has_mining_sentence_size() const {
@ -2575,7 +2597,7 @@ inline bool TrainerSpec::has_mining_sentence_size() const {
}
inline void TrainerSpec::clear_mining_sentence_size() {
mining_sentence_size_ = 0;
_has_bits_[0] &= ~0x00000200u;
_has_bits_[0] &= ~0x00000400u;
}
inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::_internal_mining_sentence_size() const {
return mining_sentence_size_;
@ -2585,7 +2607,7 @@ inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::mining_sentence_size() const
return _internal_mining_sentence_size();
}
inline void TrainerSpec::_internal_set_mining_sentence_size(::PROTOBUF_NAMESPACE_ID::int32 value) {
_has_bits_[0] |= 0x00000200u;
_has_bits_[0] |= 0x00000400u;
mining_sentence_size_ = value;
}
inline void TrainerSpec::set_mining_sentence_size(::PROTOBUF_NAMESPACE_ID::int32 value) {
@ -2595,7 +2617,7 @@ inline void TrainerSpec::set_mining_sentence_size(::PROTOBUF_NAMESPACE_ID::int32
// optional int32 training_sentence_size = 13 [deprecated = true];
inline bool TrainerSpec::_internal_has_training_sentence_size() const {
bool value = (_has_bits_[0] & 0x00000800u) != 0;
bool value = (_has_bits_[0] & 0x00001000u) != 0;
return value;
}
inline bool TrainerSpec::has_training_sentence_size() const {
@ -2603,7 +2625,7 @@ inline bool TrainerSpec::has_training_sentence_size() const {
}
inline void TrainerSpec::clear_training_sentence_size() {
training_sentence_size_ = 0;
_has_bits_[0] &= ~0x00000800u;
_has_bits_[0] &= ~0x00001000u;
}
inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::_internal_training_sentence_size() const {
return training_sentence_size_;
@ -2613,7 +2635,7 @@ inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::training_sentence_size() cons
return _internal_training_sentence_size();
}
inline void TrainerSpec::_internal_set_training_sentence_size(::PROTOBUF_NAMESPACE_ID::int32 value) {
_has_bits_[0] |= 0x00000800u;
_has_bits_[0] |= 0x00001000u;
training_sentence_size_ = value;
}
inline void TrainerSpec::set_training_sentence_size(::PROTOBUF_NAMESPACE_ID::int32 value) {
@ -2623,7 +2645,7 @@ inline void TrainerSpec::set_training_sentence_size(::PROTOBUF_NAMESPACE_ID::int
// optional int32 seed_sentencepiece_size = 14 [default = 1000000];
inline bool TrainerSpec::_internal_has_seed_sentencepiece_size() const {
bool value = (_has_bits_[0] & 0x02000000u) != 0;
bool value = (_has_bits_[0] & 0x04000000u) != 0;
return value;
}
inline bool TrainerSpec::has_seed_sentencepiece_size() const {
@ -2631,7 +2653,7 @@ inline bool TrainerSpec::has_seed_sentencepiece_size() const {
}
inline void TrainerSpec::clear_seed_sentencepiece_size() {
seed_sentencepiece_size_ = 1000000;
_has_bits_[0] &= ~0x02000000u;
_has_bits_[0] &= ~0x04000000u;
}
inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::_internal_seed_sentencepiece_size() const {
return seed_sentencepiece_size_;
@ -2641,7 +2663,7 @@ inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::seed_sentencepiece_size() con
return _internal_seed_sentencepiece_size();
}
inline void TrainerSpec::_internal_set_seed_sentencepiece_size(::PROTOBUF_NAMESPACE_ID::int32 value) {
_has_bits_[0] |= 0x02000000u;
_has_bits_[0] |= 0x04000000u;
seed_sentencepiece_size_ = value;
}
inline void TrainerSpec::set_seed_sentencepiece_size(::PROTOBUF_NAMESPACE_ID::int32 value) {
@ -2651,7 +2673,7 @@ inline void TrainerSpec::set_seed_sentencepiece_size(::PROTOBUF_NAMESPACE_ID::in
// optional float shrinking_factor = 15 [default = 0.75];
inline bool TrainerSpec::_internal_has_shrinking_factor() const {
bool value = (_has_bits_[0] & 0x04000000u) != 0;
bool value = (_has_bits_[0] & 0x08000000u) != 0;
return value;
}
inline bool TrainerSpec::has_shrinking_factor() const {
@ -2659,7 +2681,7 @@ inline bool TrainerSpec::has_shrinking_factor() const {
}
inline void TrainerSpec::clear_shrinking_factor() {
shrinking_factor_ = 0.75f;
_has_bits_[0] &= ~0x04000000u;
_has_bits_[0] &= ~0x08000000u;
}
inline float TrainerSpec::_internal_shrinking_factor() const {
return shrinking_factor_;
@ -2669,7 +2691,7 @@ inline float TrainerSpec::shrinking_factor() const {
return _internal_shrinking_factor();
}
inline void TrainerSpec::_internal_set_shrinking_factor(float value) {
_has_bits_[0] |= 0x04000000u;
_has_bits_[0] |= 0x08000000u;
shrinking_factor_ = value;
}
inline void TrainerSpec::set_shrinking_factor(float value) {
@ -2679,7 +2701,7 @@ inline void TrainerSpec::set_shrinking_factor(float value) {
// optional int32 max_sentence_length = 18 [default = 4192];
inline bool TrainerSpec::_internal_has_max_sentence_length() const {
bool value = (_has_bits_[0] & 0x20000000u) != 0;
bool value = (_has_bits_[0] & 0x40000000u) != 0;
return value;
}
inline bool TrainerSpec::has_max_sentence_length() const {
@ -2687,7 +2709,7 @@ inline bool TrainerSpec::has_max_sentence_length() const {
}
inline void TrainerSpec::clear_max_sentence_length() {
max_sentence_length_ = 4192;
_has_bits_[0] &= ~0x20000000u;
_has_bits_[0] &= ~0x40000000u;
}
inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::_internal_max_sentence_length() const {
return max_sentence_length_;
@ -2697,7 +2719,7 @@ inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::max_sentence_length() const {
return _internal_max_sentence_length();
}
inline void TrainerSpec::_internal_set_max_sentence_length(::PROTOBUF_NAMESPACE_ID::int32 value) {
_has_bits_[0] |= 0x20000000u;
_has_bits_[0] |= 0x40000000u;
max_sentence_length_ = value;
}
inline void TrainerSpec::set_max_sentence_length(::PROTOBUF_NAMESPACE_ID::int32 value) {
@ -2707,7 +2729,7 @@ inline void TrainerSpec::set_max_sentence_length(::PROTOBUF_NAMESPACE_ID::int32
// optional int32 num_threads = 16 [default = 16];
inline bool TrainerSpec::_internal_has_num_threads() const {
bool value = (_has_bits_[0] & 0x08000000u) != 0;
bool value = (_has_bits_[0] & 0x10000000u) != 0;
return value;
}
inline bool TrainerSpec::has_num_threads() const {
@ -2715,7 +2737,7 @@ inline bool TrainerSpec::has_num_threads() const {
}
inline void TrainerSpec::clear_num_threads() {
num_threads_ = 16;
_has_bits_[0] &= ~0x08000000u;
_has_bits_[0] &= ~0x10000000u;
}
inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::_internal_num_threads() const {
return num_threads_;
@ -2725,7 +2747,7 @@ inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::num_threads() const {
return _internal_num_threads();
}
inline void TrainerSpec::_internal_set_num_threads(::PROTOBUF_NAMESPACE_ID::int32 value) {
_has_bits_[0] |= 0x08000000u;
_has_bits_[0] |= 0x10000000u;
num_threads_ = value;
}
inline void TrainerSpec::set_num_threads(::PROTOBUF_NAMESPACE_ID::int32 value) {
@ -2735,7 +2757,7 @@ inline void TrainerSpec::set_num_threads(::PROTOBUF_NAMESPACE_ID::int32 value) {
// optional int32 num_sub_iterations = 17 [default = 2];
inline bool TrainerSpec::_internal_has_num_sub_iterations() const {
bool value = (_has_bits_[0] & 0x10000000u) != 0;
bool value = (_has_bits_[0] & 0x20000000u) != 0;
return value;
}
inline bool TrainerSpec::has_num_sub_iterations() const {
@ -2743,7 +2765,7 @@ inline bool TrainerSpec::has_num_sub_iterations() const {
}
inline void TrainerSpec::clear_num_sub_iterations() {
num_sub_iterations_ = 2;
_has_bits_[0] &= ~0x10000000u;
_has_bits_[0] &= ~0x20000000u;
}
inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::_internal_num_sub_iterations() const {
return num_sub_iterations_;
@ -2753,7 +2775,7 @@ inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::num_sub_iterations() const {
return _internal_num_sub_iterations();
}
inline void TrainerSpec::_internal_set_num_sub_iterations(::PROTOBUF_NAMESPACE_ID::int32 value) {
_has_bits_[0] |= 0x10000000u;
_has_bits_[0] |= 0x20000000u;
num_sub_iterations_ = value;
}
inline void TrainerSpec::set_num_sub_iterations(::PROTOBUF_NAMESPACE_ID::int32 value) {
@ -2763,7 +2785,7 @@ inline void TrainerSpec::set_num_sub_iterations(::PROTOBUF_NAMESPACE_ID::int32 v
// optional int32 max_sentencepiece_length = 20 [default = 16];
inline bool TrainerSpec::_internal_has_max_sentencepiece_length() const {
bool value = (_has_bits_[0] & 0x40000000u) != 0;
bool value = (_has_bits_[0] & 0x80000000u) != 0;
return value;
}
inline bool TrainerSpec::has_max_sentencepiece_length() const {
@ -2771,7 +2793,7 @@ inline bool TrainerSpec::has_max_sentencepiece_length() const {
}
inline void TrainerSpec::clear_max_sentencepiece_length() {
max_sentencepiece_length_ = 16;
_has_bits_[0] &= ~0x40000000u;
_has_bits_[0] &= ~0x80000000u;
}
inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::_internal_max_sentencepiece_length() const {
return max_sentencepiece_length_;
@ -2781,7 +2803,7 @@ inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::max_sentencepiece_length() co
return _internal_max_sentencepiece_length();
}
inline void TrainerSpec::_internal_set_max_sentencepiece_length(::PROTOBUF_NAMESPACE_ID::int32 value) {
_has_bits_[0] |= 0x40000000u;
_has_bits_[0] |= 0x80000000u;
max_sentencepiece_length_ = value;
}
inline void TrainerSpec::set_max_sentencepiece_length(::PROTOBUF_NAMESPACE_ID::int32 value) {
@ -2791,7 +2813,7 @@ inline void TrainerSpec::set_max_sentencepiece_length(::PROTOBUF_NAMESPACE_ID::i
// optional bool split_by_unicode_script = 21 [default = true];
inline bool TrainerSpec::_internal_has_split_by_unicode_script() const {
bool value = (_has_bits_[1] & 0x00000001u) != 0;
bool value = (_has_bits_[1] & 0x00000002u) != 0;
return value;
}
inline bool TrainerSpec::has_split_by_unicode_script() const {
@ -2799,7 +2821,7 @@ inline bool TrainerSpec::has_split_by_unicode_script() const {
}
inline void TrainerSpec::clear_split_by_unicode_script() {
split_by_unicode_script_ = true;
_has_bits_[1] &= ~0x00000001u;
_has_bits_[1] &= ~0x00000002u;
}
inline bool TrainerSpec::_internal_split_by_unicode_script() const {
return split_by_unicode_script_;
@ -2809,7 +2831,7 @@ inline bool TrainerSpec::split_by_unicode_script() const {
return _internal_split_by_unicode_script();
}
inline void TrainerSpec::_internal_set_split_by_unicode_script(bool value) {
_has_bits_[1] |= 0x00000001u;
_has_bits_[1] |= 0x00000002u;
split_by_unicode_script_ = value;
}
inline void TrainerSpec::set_split_by_unicode_script(bool value) {
@ -2819,7 +2841,7 @@ inline void TrainerSpec::set_split_by_unicode_script(bool value) {
// optional bool split_by_number = 23 [default = true];
inline bool TrainerSpec::_internal_has_split_by_number() const {
bool value = (_has_bits_[1] & 0x00000002u) != 0;
bool value = (_has_bits_[1] & 0x00000004u) != 0;
return value;
}
inline bool TrainerSpec::has_split_by_number() const {
@ -2827,7 +2849,7 @@ inline bool TrainerSpec::has_split_by_number() const {
}
inline void TrainerSpec::clear_split_by_number() {
split_by_number_ = true;
_has_bits_[1] &= ~0x00000002u;
_has_bits_[1] &= ~0x00000004u;
}
inline bool TrainerSpec::_internal_split_by_number() const {
return split_by_number_;
@ -2837,7 +2859,7 @@ inline bool TrainerSpec::split_by_number() const {
return _internal_split_by_number();
}
inline void TrainerSpec::_internal_set_split_by_number(bool value) {
_has_bits_[1] |= 0x00000002u;
_has_bits_[1] |= 0x00000004u;
split_by_number_ = value;
}
inline void TrainerSpec::set_split_by_number(bool value) {
@ -2847,7 +2869,7 @@ inline void TrainerSpec::set_split_by_number(bool value) {
// optional bool split_by_whitespace = 22 [default = true];
inline bool TrainerSpec::_internal_has_split_by_whitespace() const {
bool value = (_has_bits_[1] & 0x00000004u) != 0;
bool value = (_has_bits_[1] & 0x00000008u) != 0;
return value;
}
inline bool TrainerSpec::has_split_by_whitespace() const {
@ -2855,7 +2877,7 @@ inline bool TrainerSpec::has_split_by_whitespace() const {
}
inline void TrainerSpec::clear_split_by_whitespace() {
split_by_whitespace_ = true;
_has_bits_[1] &= ~0x00000004u;
_has_bits_[1] &= ~0x00000008u;
}
inline bool TrainerSpec::_internal_split_by_whitespace() const {
return split_by_whitespace_;
@ -2865,7 +2887,7 @@ inline bool TrainerSpec::split_by_whitespace() const {
return _internal_split_by_whitespace();
}
inline void TrainerSpec::_internal_set_split_by_whitespace(bool value) {
_has_bits_[1] |= 0x00000004u;
_has_bits_[1] |= 0x00000008u;
split_by_whitespace_ = value;
}
inline void TrainerSpec::set_split_by_whitespace(bool value) {
@ -2875,7 +2897,7 @@ inline void TrainerSpec::set_split_by_whitespace(bool value) {
// optional bool treat_whitespace_as_suffix = 24 [default = false];
inline bool TrainerSpec::_internal_has_treat_whitespace_as_suffix() const {
bool value = (_has_bits_[0] & 0x00002000u) != 0;
bool value = (_has_bits_[0] & 0x00004000u) != 0;
return value;
}
inline bool TrainerSpec::has_treat_whitespace_as_suffix() const {
@ -2883,7 +2905,7 @@ inline bool TrainerSpec::has_treat_whitespace_as_suffix() const {
}
inline void TrainerSpec::clear_treat_whitespace_as_suffix() {
treat_whitespace_as_suffix_ = false;
_has_bits_[0] &= ~0x00002000u;
_has_bits_[0] &= ~0x00004000u;
}
inline bool TrainerSpec::_internal_treat_whitespace_as_suffix() const {
return treat_whitespace_as_suffix_;
@ -2893,7 +2915,7 @@ inline bool TrainerSpec::treat_whitespace_as_suffix() const {
return _internal_treat_whitespace_as_suffix();
}
inline void TrainerSpec::_internal_set_treat_whitespace_as_suffix(bool value) {
_has_bits_[0] |= 0x00002000u;
_has_bits_[0] |= 0x00004000u;
treat_whitespace_as_suffix_ = value;
}
inline void TrainerSpec::set_treat_whitespace_as_suffix(bool value) {
@ -2903,7 +2925,7 @@ inline void TrainerSpec::set_treat_whitespace_as_suffix(bool value) {
// optional bool allow_whitespace_only_pieces = 26 [default = false];
inline bool TrainerSpec::_internal_has_allow_whitespace_only_pieces() const {
bool value = (_has_bits_[0] & 0x00004000u) != 0;
bool value = (_has_bits_[0] & 0x00008000u) != 0;
return value;
}
inline bool TrainerSpec::has_allow_whitespace_only_pieces() const {
@ -2911,7 +2933,7 @@ inline bool TrainerSpec::has_allow_whitespace_only_pieces() const {
}
inline void TrainerSpec::clear_allow_whitespace_only_pieces() {
allow_whitespace_only_pieces_ = false;
_has_bits_[0] &= ~0x00004000u;
_has_bits_[0] &= ~0x00008000u;
}
inline bool TrainerSpec::_internal_allow_whitespace_only_pieces() const {
return allow_whitespace_only_pieces_;
@ -2921,7 +2943,7 @@ inline bool TrainerSpec::allow_whitespace_only_pieces() const {
return _internal_allow_whitespace_only_pieces();
}
inline void TrainerSpec::_internal_set_allow_whitespace_only_pieces(bool value) {
_has_bits_[0] |= 0x00004000u;
_has_bits_[0] |= 0x00008000u;
allow_whitespace_only_pieces_ = value;
}
inline void TrainerSpec::set_allow_whitespace_only_pieces(bool value) {
@ -2931,7 +2953,7 @@ inline void TrainerSpec::set_allow_whitespace_only_pieces(bool value) {
// optional bool split_digits = 25 [default = false];
inline bool TrainerSpec::_internal_has_split_digits() const {
bool value = (_has_bits_[0] & 0x00008000u) != 0;
bool value = (_has_bits_[0] & 0x00010000u) != 0;
return value;
}
inline bool TrainerSpec::has_split_digits() const {
@ -2939,7 +2961,7 @@ inline bool TrainerSpec::has_split_digits() const {
}
inline void TrainerSpec::clear_split_digits() {
split_digits_ = false;
_has_bits_[0] &= ~0x00008000u;
_has_bits_[0] &= ~0x00010000u;
}
inline bool TrainerSpec::_internal_split_digits() const {
return split_digits_;
@ -2949,7 +2971,7 @@ inline bool TrainerSpec::split_digits() const {
return _internal_split_digits();
}
inline void TrainerSpec::_internal_set_split_digits(bool value) {
_has_bits_[0] |= 0x00008000u;
_has_bits_[0] |= 0x00010000u;
split_digits_ = value;
}
inline void TrainerSpec::set_split_digits(bool value) {
@ -2957,6 +2979,79 @@ inline void TrainerSpec::set_split_digits(bool value) {
// @@protoc_insertion_point(field_set:sentencepiece.TrainerSpec.split_digits)
}
// optional string pretokenization_delimiter = 53 [default = ""];
// Reports whether bit 0x00000100 of _has_bits_[0] is set; the other
// pretokenization_delimiter accessors set and clear that same bit.
inline bool TrainerSpec::_internal_has_pretokenization_delimiter() const {
  return (_has_bits_[0] & 0x00000100u) != 0;
}
// Public presence query; simply forwards to the _internal_has_ variant.
inline bool TrainerSpec::has_pretokenization_delimiter() const {
  const bool is_present = _internal_has_pretokenization_delimiter();
  return is_present;
}
// Empties the stored string and drops the field's has-bit
// (0x00000100 in _has_bits_[0]).
inline void TrainerSpec::clear_pretokenization_delimiter() {
  pretokenization_delimiter_.ClearToEmpty();
  _has_bits_[0] = _has_bits_[0] & ~0x00000100u;
}
// Public read accessor; returns the reference obtained from the internal
// getter.
inline const std::string& TrainerSpec::pretokenization_delimiter() const {
  // @@protoc_insertion_point(field_get:sentencepiece.TrainerSpec.pretokenization_delimiter)
  const std::string& current = _internal_pretokenization_delimiter();
  return current;
}
inline void TrainerSpec::set_pretokenization_delimiter(const std::string& value) {
_internal_set_pretokenization_delimiter(value);
// @@protoc_insertion_point(field_set:sentencepiece.TrainerSpec.pretokenization_delimiter)
}
// Public mutable accessor; returns the pointer produced by the
// _internal_mutable_ variant (which also raises the has-bit).
inline std::string* TrainerSpec::mutable_pretokenization_delimiter() {
  // @@protoc_insertion_point(field_mutable:sentencepiece.TrainerSpec.pretokenization_delimiter)
  std::string* const editable = _internal_mutable_pretokenization_delimiter();
  return editable;
}
// Internal read accessor: exposes the string held by
// pretokenization_delimiter_ without touching the has-bits.
inline const std::string& TrainerSpec::_internal_pretokenization_delimiter() const {
  const std::string& stored = pretokenization_delimiter_.Get();
  return stored;
}
inline void TrainerSpec::_internal_set_pretokenization_delimiter(const std::string& value) {
_has_bits_[0] |= 0x00000100u;
pretokenization_delimiter_.Set(::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr::EmptyDefault{}, value, GetArena());
}
inline void TrainerSpec::set_pretokenization_delimiter(std::string&& value) {
_has_bits_[0] |= 0x00000100u;
pretokenization_delimiter_.Set(
::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr::EmptyDefault{}, ::std::move(value), GetArena());
// @@protoc_insertion_point(field_set_rvalue:sentencepiece.TrainerSpec.pretokenization_delimiter)
}
inline void TrainerSpec::set_pretokenization_delimiter(const char* value) {
GOOGLE_DCHECK(value != nullptr);
_has_bits_[0] |= 0x00000100u;
pretokenization_delimiter_.Set(::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr::EmptyDefault{}, ::std::string(value), GetArena());
// @@protoc_insertion_point(field_set_char:sentencepiece.TrainerSpec.pretokenization_delimiter)
}
inline void TrainerSpec::set_pretokenization_delimiter(const char* value,
size_t size) {
_has_bits_[0] |= 0x00000100u;
pretokenization_delimiter_.Set(::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr::EmptyDefault{}, ::std::string(
reinterpret_cast<const char*>(value), size), GetArena());
// @@protoc_insertion_point(field_set_pointer:sentencepiece.TrainerSpec.pretokenization_delimiter)
}
inline std::string* TrainerSpec::_internal_mutable_pretokenization_delimiter() {
_has_bits_[0] |= 0x00000100u;
return pretokenization_delimiter_.Mutable(::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr::EmptyDefault{}, GetArena());
}
// Returns nullptr when the has-bit is not set. Otherwise clears the has-bit
// and hands back the pointer produced by ReleaseNonDefault().
inline std::string* TrainerSpec::release_pretokenization_delimiter() {
  // @@protoc_insertion_point(field_release:sentencepiece.TrainerSpec.pretokenization_delimiter)
  if (_internal_has_pretokenization_delimiter()) {
    _has_bits_[0] &= ~0x00000100u;
    return pretokenization_delimiter_.ReleaseNonDefault(
        &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
  }
  return nullptr;
}
// Sets the has-bit when a non-null string is supplied and clears it otherwise,
// then forwards the pointer to the string holder's SetAllocated().
inline void TrainerSpec::set_allocated_pretokenization_delimiter(std::string* pretokenization_delimiter) {
  const bool supplied = (pretokenization_delimiter != nullptr);
  _has_bits_[0] = supplied ? (_has_bits_[0] | 0x00000100u)
                           : (_has_bits_[0] & ~0x00000100u);
  pretokenization_delimiter_.SetAllocated(
      &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(),
      pretokenization_delimiter, GetArena());
  // @@protoc_insertion_point(field_set_allocated:sentencepiece.TrainerSpec.pretokenization_delimiter)
}
// repeated string control_symbols = 30;
inline int TrainerSpec::_internal_control_symbols_size() const {
return control_symbols_.size();
@ -3180,7 +3275,7 @@ inline void TrainerSpec::set_allocated_required_chars(std::string* required_char
// optional bool byte_fallback = 35 [default = false];
inline bool TrainerSpec::_internal_has_byte_fallback() const {
bool value = (_has_bits_[0] & 0x00010000u) != 0;
bool value = (_has_bits_[0] & 0x00020000u) != 0;
return value;
}
inline bool TrainerSpec::has_byte_fallback() const {
@ -3188,7 +3283,7 @@ inline bool TrainerSpec::has_byte_fallback() const {
}
inline void TrainerSpec::clear_byte_fallback() {
byte_fallback_ = false;
_has_bits_[0] &= ~0x00010000u;
_has_bits_[0] &= ~0x00020000u;
}
inline bool TrainerSpec::_internal_byte_fallback() const {
return byte_fallback_;
@ -3198,7 +3293,7 @@ inline bool TrainerSpec::byte_fallback() const {
return _internal_byte_fallback();
}
inline void TrainerSpec::_internal_set_byte_fallback(bool value) {
_has_bits_[0] |= 0x00010000u;
_has_bits_[0] |= 0x00020000u;
byte_fallback_ = value;
}
inline void TrainerSpec::set_byte_fallback(bool value) {
@ -3208,7 +3303,7 @@ inline void TrainerSpec::set_byte_fallback(bool value) {
// optional bool vocabulary_output_piece_score = 32 [default = true];
inline bool TrainerSpec::_internal_has_vocabulary_output_piece_score() const {
bool value = (_has_bits_[1] & 0x00000008u) != 0;
bool value = (_has_bits_[1] & 0x00000010u) != 0;
return value;
}
inline bool TrainerSpec::has_vocabulary_output_piece_score() const {
@ -3216,7 +3311,7 @@ inline bool TrainerSpec::has_vocabulary_output_piece_score() const {
}
inline void TrainerSpec::clear_vocabulary_output_piece_score() {
vocabulary_output_piece_score_ = true;
_has_bits_[1] &= ~0x00000008u;
_has_bits_[1] &= ~0x00000010u;
}
inline bool TrainerSpec::_internal_vocabulary_output_piece_score() const {
return vocabulary_output_piece_score_;
@ -3226,7 +3321,7 @@ inline bool TrainerSpec::vocabulary_output_piece_score() const {
return _internal_vocabulary_output_piece_score();
}
inline void TrainerSpec::_internal_set_vocabulary_output_piece_score(bool value) {
_has_bits_[1] |= 0x00000008u;
_has_bits_[1] |= 0x00000010u;
vocabulary_output_piece_score_ = value;
}
inline void TrainerSpec::set_vocabulary_output_piece_score(bool value) {
@ -3236,7 +3331,7 @@ inline void TrainerSpec::set_vocabulary_output_piece_score(bool value) {
// optional bool hard_vocab_limit = 33 [default = true];
inline bool TrainerSpec::_internal_has_hard_vocab_limit() const {
bool value = (_has_bits_[1] & 0x00000010u) != 0;
bool value = (_has_bits_[1] & 0x00000020u) != 0;
return value;
}
inline bool TrainerSpec::has_hard_vocab_limit() const {
@ -3244,7 +3339,7 @@ inline bool TrainerSpec::has_hard_vocab_limit() const {
}
inline void TrainerSpec::clear_hard_vocab_limit() {
hard_vocab_limit_ = true;
_has_bits_[1] &= ~0x00000010u;
_has_bits_[1] &= ~0x00000020u;
}
inline bool TrainerSpec::_internal_hard_vocab_limit() const {
return hard_vocab_limit_;
@ -3254,7 +3349,7 @@ inline bool TrainerSpec::hard_vocab_limit() const {
return _internal_hard_vocab_limit();
}
inline void TrainerSpec::_internal_set_hard_vocab_limit(bool value) {
_has_bits_[1] |= 0x00000010u;
_has_bits_[1] |= 0x00000020u;
hard_vocab_limit_ = value;
}
inline void TrainerSpec::set_hard_vocab_limit(bool value) {
@ -3264,7 +3359,7 @@ inline void TrainerSpec::set_hard_vocab_limit(bool value) {
// optional bool use_all_vocab = 34 [default = false];
inline bool TrainerSpec::_internal_has_use_all_vocab() const {
bool value = (_has_bits_[0] & 0x00020000u) != 0;
bool value = (_has_bits_[0] & 0x00040000u) != 0;
return value;
}
inline bool TrainerSpec::has_use_all_vocab() const {
@ -3272,7 +3367,7 @@ inline bool TrainerSpec::has_use_all_vocab() const {
}
inline void TrainerSpec::clear_use_all_vocab() {
use_all_vocab_ = false;
_has_bits_[0] &= ~0x00020000u;
_has_bits_[0] &= ~0x00040000u;
}
inline bool TrainerSpec::_internal_use_all_vocab() const {
return use_all_vocab_;
@ -3282,7 +3377,7 @@ inline bool TrainerSpec::use_all_vocab() const {
return _internal_use_all_vocab();
}
inline void TrainerSpec::_internal_set_use_all_vocab(bool value) {
_has_bits_[0] |= 0x00020000u;
_has_bits_[0] |= 0x00040000u;
use_all_vocab_ = value;
}
inline void TrainerSpec::set_use_all_vocab(bool value) {
@ -3292,7 +3387,7 @@ inline void TrainerSpec::set_use_all_vocab(bool value) {
// optional int32 unk_id = 40 [default = 0];
inline bool TrainerSpec::_internal_has_unk_id() const {
bool value = (_has_bits_[0] & 0x00080000u) != 0;
bool value = (_has_bits_[0] & 0x00100000u) != 0;
return value;
}
inline bool TrainerSpec::has_unk_id() const {
@ -3300,7 +3395,7 @@ inline bool TrainerSpec::has_unk_id() const {
}
inline void TrainerSpec::clear_unk_id() {
unk_id_ = 0;
_has_bits_[0] &= ~0x00080000u;
_has_bits_[0] &= ~0x00100000u;
}
inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::_internal_unk_id() const {
return unk_id_;
@ -3310,7 +3405,7 @@ inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::unk_id() const {
return _internal_unk_id();
}
inline void TrainerSpec::_internal_set_unk_id(::PROTOBUF_NAMESPACE_ID::int32 value) {
_has_bits_[0] |= 0x00080000u;
_has_bits_[0] |= 0x00100000u;
unk_id_ = value;
}
inline void TrainerSpec::set_unk_id(::PROTOBUF_NAMESPACE_ID::int32 value) {
@ -3320,7 +3415,7 @@ inline void TrainerSpec::set_unk_id(::PROTOBUF_NAMESPACE_ID::int32 value) {
// optional int32 bos_id = 41 [default = 1];
inline bool TrainerSpec::_internal_has_bos_id() const {
bool value = (_has_bits_[1] & 0x00000020u) != 0;
bool value = (_has_bits_[1] & 0x00000040u) != 0;
return value;
}
inline bool TrainerSpec::has_bos_id() const {
@ -3328,7 +3423,7 @@ inline bool TrainerSpec::has_bos_id() const {
}
inline void TrainerSpec::clear_bos_id() {
bos_id_ = 1;
_has_bits_[1] &= ~0x00000020u;
_has_bits_[1] &= ~0x00000040u;
}
inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::_internal_bos_id() const {
return bos_id_;
@ -3338,7 +3433,7 @@ inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::bos_id() const {
return _internal_bos_id();
}
inline void TrainerSpec::_internal_set_bos_id(::PROTOBUF_NAMESPACE_ID::int32 value) {
_has_bits_[1] |= 0x00000020u;
_has_bits_[1] |= 0x00000040u;
bos_id_ = value;
}
inline void TrainerSpec::set_bos_id(::PROTOBUF_NAMESPACE_ID::int32 value) {
@ -3348,7 +3443,7 @@ inline void TrainerSpec::set_bos_id(::PROTOBUF_NAMESPACE_ID::int32 value) {
// optional int32 eos_id = 42 [default = 2];
inline bool TrainerSpec::_internal_has_eos_id() const {
bool value = (_has_bits_[1] & 0x00000040u) != 0;
bool value = (_has_bits_[1] & 0x00000080u) != 0;
return value;
}
inline bool TrainerSpec::has_eos_id() const {
@ -3356,7 +3451,7 @@ inline bool TrainerSpec::has_eos_id() const {
}
inline void TrainerSpec::clear_eos_id() {
eos_id_ = 2;
_has_bits_[1] &= ~0x00000040u;
_has_bits_[1] &= ~0x00000080u;
}
inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::_internal_eos_id() const {
return eos_id_;
@ -3366,7 +3461,7 @@ inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::eos_id() const {
return _internal_eos_id();
}
inline void TrainerSpec::_internal_set_eos_id(::PROTOBUF_NAMESPACE_ID::int32 value) {
_has_bits_[1] |= 0x00000040u;
_has_bits_[1] |= 0x00000080u;
eos_id_ = value;
}
inline void TrainerSpec::set_eos_id(::PROTOBUF_NAMESPACE_ID::int32 value) {
@ -3376,7 +3471,7 @@ inline void TrainerSpec::set_eos_id(::PROTOBUF_NAMESPACE_ID::int32 value) {
// optional int32 pad_id = 43 [default = -1];
inline bool TrainerSpec::_internal_has_pad_id() const {
bool value = (_has_bits_[1] & 0x00000080u) != 0;
bool value = (_has_bits_[1] & 0x00000100u) != 0;
return value;
}
inline bool TrainerSpec::has_pad_id() const {
@ -3384,7 +3479,7 @@ inline bool TrainerSpec::has_pad_id() const {
}
inline void TrainerSpec::clear_pad_id() {
pad_id_ = -1;
_has_bits_[1] &= ~0x00000080u;
_has_bits_[1] &= ~0x00000100u;
}
inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::_internal_pad_id() const {
return pad_id_;
@ -3394,7 +3489,7 @@ inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::pad_id() const {
return _internal_pad_id();
}
inline void TrainerSpec::_internal_set_pad_id(::PROTOBUF_NAMESPACE_ID::int32 value) {
_has_bits_[1] |= 0x00000080u;
_has_bits_[1] |= 0x00000100u;
pad_id_ = value;
}
inline void TrainerSpec::set_pad_id(::PROTOBUF_NAMESPACE_ID::int32 value) {
@ -3774,7 +3869,7 @@ inline void TrainerSpec::set_allocated_unk_surface(std::string* unk_surface) {
// optional bool train_extremely_large_corpus = 49 [default = false];
inline bool TrainerSpec::_internal_has_train_extremely_large_corpus() const {
bool value = (_has_bits_[0] & 0x00040000u) != 0;
bool value = (_has_bits_[0] & 0x00080000u) != 0;
return value;
}
inline bool TrainerSpec::has_train_extremely_large_corpus() const {
@ -3782,7 +3877,7 @@ inline bool TrainerSpec::has_train_extremely_large_corpus() const {
}
inline void TrainerSpec::clear_train_extremely_large_corpus() {
train_extremely_large_corpus_ = false;
_has_bits_[0] &= ~0x00040000u;
_has_bits_[0] &= ~0x00080000u;
}
inline bool TrainerSpec::_internal_train_extremely_large_corpus() const {
return train_extremely_large_corpus_;
@ -3792,7 +3887,7 @@ inline bool TrainerSpec::train_extremely_large_corpus() const {
return _internal_train_extremely_large_corpus();
}
inline void TrainerSpec::_internal_set_train_extremely_large_corpus(bool value) {
_has_bits_[0] |= 0x00040000u;
_has_bits_[0] |= 0x00080000u;
train_extremely_large_corpus_ = value;
}
inline void TrainerSpec::set_train_extremely_large_corpus(bool value) {

View File

@ -11,9 +11,10 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.!
#include "pretokenizer_for_training.h"
#include <string>
#include "pretokenizer_for_training.h"
#include "third_party/absl/strings/str_replace.h"
namespace sentencepiece {
@ -24,10 +25,9 @@ namespace {
// defined them explicitly to avoid the dependency on trainer_interface.
// Currently, we have no separated build rules.
const char kWSStr[] = "\xe2\x96\x81";
const char kUPPBoundaryStr[] = "\t";
} // namespace
std::string PretokenizerForTrainingInterface::PreTokenize(
std::vector<std::string> PretokenizerForTrainingInterface::PreTokenize(
absl::string_view text) const {
return Postprocess(Tokenize(Preprocess(text)));
}
@ -40,14 +40,17 @@ std::string PretokenizerForTrainingInterface::Preprocess(
}
// static
std::string PretokenizerForTrainingInterface::Postprocess(
std::vector<std::string> PretokenizerForTrainingInterface::Postprocess(
const SentencePieceText &spt) {
// Inserts kUPPBoundaryStr before/after of token boundaries.
std::vector<std::string> result;
std::string output;
int prev = 0;
for (const auto &piece : spt.pieces()) {
if (prev == piece.begin() && piece.begin() != 0) {
output += kUPPBoundaryStr;
result.push_back(output);
output.clear();
} else {
output.append(piece.begin() - prev, ' ');
}
@ -55,8 +58,11 @@ std::string PretokenizerForTrainingInterface::Postprocess(
prev = piece.end();
}
// Restores kWSStr.
return absl::StrReplaceAll(output, {{" ", kWSStr}});
if (!output.empty()) result.push_back(output);
for (auto &w : result) w = absl::StrReplaceAll(w, {{" ", kWSStr}});
return result;
}
} // namespace pretokenizer

View File

@ -44,7 +44,7 @@ class PretokenizerForTrainingInterface {
// segmentation: piece[0] = {0, 1}, piece[1] = {2, 6},
// piece[2] = {7, 15}, piece[3] = {15, 20}
// output: I love sentence<tab>piece.
std::string PreTokenize(absl::string_view text) const;
std::vector<std::string> PreTokenize(absl::string_view text) const;
// Returns pre-tokenized result.
// Note that the pre-tokenized constraint is specified with the
@ -54,7 +54,7 @@ class PretokenizerForTrainingInterface {
private:
static std::string Preprocess(absl::string_view text);
static std::string Postprocess(const SentencePieceText &spt);
static std::vector<std::string> Postprocess(const SentencePieceText &spt);
};
} // namespace pretokenizer

View File

@ -12,8 +12,11 @@
// See the License for the specific language governing permissions and
// limitations under the License.!
#include "pretokenizer_for_training.h"
#include "testharness.h"
#include "third_party/absl/strings/str_cat.h"
#include "third_party/absl/strings/str_join.h"
#include "third_party/absl/strings/str_split.h"
#include "trainer_interface.h"
namespace sentencepiece {
@ -64,9 +67,11 @@ TEST(PretokenizerForTrainingTest, BaseTest) {
mock.SetOutput(spt);
EXPECT_EQ(absl::StrCat("I", TrainerInterface::kWSStr, "love",
TrainerInterface::kWSStr, "sentence\tpiece"),
mock.PreTokenize("I love sentencepiece"));
const auto expected =
absl::StrCat("I", TrainerInterface::kWSStr, "love",
TrainerInterface::kWSStr, "sentence||||piece");
EXPECT_EQ(expected,
absl::StrJoin(mock.PreTokenize("I love sentencepiece"), "||||"));
}
{
@ -94,7 +99,9 @@ TEST(PretokenizerForTrainingTest, BaseTest) {
mock.SetOutput(spt);
EXPECT_EQ("これ\t\tペン\tです", mock.PreTokenize("これはペンです"));
const auto expected = "これ||||は||||ペン||||です";
EXPECT_EQ(expected,
absl::StrJoin(mock.PreTokenize("これはペンです"), "||||"));
}
}

View File

@ -20,7 +20,7 @@ option optimize_for = LITE_RUNTIME;
package sentencepiece;
// TrainerSpec encodes a various parameters for SentencePiece training.
// Next id: 53
// Next id: 54
message TrainerSpec {
///////////////////////////////////////////////////////////////////
// General parameters
@ -157,6 +157,13 @@ message TrainerSpec {
// Split all digits (0-9) into separate pieces.
optional bool split_digits = 25 [default = false];
// Defines the pre-tokenization delimiter.
// When specified, no piece crossing this delimiter is included in the
// vocab. The delimiter string itself is virtually ignored during
// training. This field allows constraints on the vocabulary selection.
// Note that this field is only available in unigram mode.
optional string pretokenization_delimiter = 53 [ default = ""];
///////////////////////////////////////////////////////////////////
// Vocabulary management
//

View File

@ -144,6 +144,7 @@ inline std::string PrintProto(const TrainerSpec &message,
PRINT_PARAM(split_by_number);
PRINT_PARAM(split_by_whitespace);
PRINT_PARAM(split_digits);
PRINT_PARAM(pretokenization_delimiter);
PRINT_PARAM(treat_whitespace_as_suffix);
PRINT_PARAM(allow_whitespace_only_pieces);
PRINT_REPEATED_STRING(control_symbols);
@ -222,6 +223,7 @@ util::Status SentencePieceTrainer::SetProtoField(absl::string_view name,
PARSE_BOOL(split_by_number);
PARSE_BOOL(split_by_whitespace);
PARSE_BOOL(split_digits);
PARSE_STRING(pretokenization_delimiter);
PARSE_BOOL(treat_whitespace_as_suffix);
PARSE_BOOL(allow_whitespace_only_pieces);
PARSE_REPEATED_STRING(control_symbols);

View File

@ -77,6 +77,9 @@ ABSL_FLAG(bool, split_by_whitespace, kDefaultTrainerSpec.split_by_whitespace(),
"use a white space to split sentence pieces");
ABSL_FLAG(bool, split_digits, kDefaultTrainerSpec.split_digits(),
"split all digits (0-9) into separate pieces");
ABSL_FLAG(std::string, pretokenization_delimiter,
kDefaultTrainerSpec.pretokenization_delimiter(),
"specifies the delimiter of pre-tokenization");
ABSL_FLAG(bool, treat_whitespace_as_suffix,
kDefaultTrainerSpec.treat_whitespace_as_suffix(),
"treat whitespace marker as suffix instead of prefix.");
@ -227,6 +230,7 @@ int main(int argc, char *argv[]) {
SetTrainerSpecFromFlag(split_by_whitespace);
SetTrainerSpecFromFlag(split_by_number);
SetTrainerSpecFromFlag(split_digits);
SetTrainerSpecFromFlag(pretokenization_delimiter);
SetTrainerSpecFromFlag(byte_fallback);
SetTrainerSpecFromFlag(treat_whitespace_as_suffix);
SetTrainerSpecFromFlag(allow_whitespace_only_pieces);

View File

@ -81,7 +81,8 @@ util::Status VerifySpec(const TrainerSpec &trainer_spec) {
CHECK_OR_RETURN(!trainer_spec.eos_piece().empty());
CHECK_OR_RETURN(!trainer_spec.pad_piece().empty());
if (SentencePieceTrainer::GetPretokenizerForTraining()) {
if (SentencePieceTrainer::GetPretokenizerForTraining() ||
!trainer_spec.pretokenization_delimiter().empty()) {
CHECK_EQ_OR_RETURN(TrainerSpec::UNIGRAM, trainer_spec.model_type())
<< "PretokenizerForTraining is only supported in UNIGRAM mode.";
}

View File

@ -461,7 +461,7 @@ std::vector<Lattice::LatticePathWithScore> Lattice::NBest(size_t nbest_size,
} else {
hyp->gx = lnode->score + top->gx; // just adds node->score
hyp->fx =
lnode->backtrace_score + top->gx; // backtrace_score is h(node).
lnode->backtrace_score + hyp->gx; // backtrace_score is h(node).
}
hyp->next = top;
agenda.push(hyp);

View File

@ -28,7 +28,10 @@
#include "pretokenizer_for_training.h"
#include "sentencepiece_trainer.h"
#include "third_party/absl/container/flat_hash_map.h"
#include "third_party/absl/container/flat_hash_set.h"
#include "third_party/absl/memory/memory.h"
#include "third_party/absl/strings/str_replace.h"
#include "third_party/absl/strings/str_split.h"
#include "third_party/esaxx/esa.hxx" // Suffix array library.
#include "unicode_script.h"
#include "util.h"
@ -37,6 +40,9 @@ namespace sentencepiece {
namespace unigram {
namespace {
constexpr char32 kSentenceBoundary = 0x0000;
constexpr char32 kWsMarker = 0x2581;
double Digamma(double x) {
double result = 0.0;
for (; x < 7; ++x) result -= 1 / x;
@ -60,6 +66,63 @@ void ToLogProb(IT begin, IT end) {
it->second = std::log(static_cast<double>(it->second)) - logsum;
}
}
// Splits the range [begin, end) at every element equal to kSentenceBoundary
// and returns the resulting segments as (first, last) pointer pairs into the
// original range. The boundary elements themselves are not part of any
// segment.
//
// A boundary at the very start, or two adjacent boundaries, yields an empty
// segment (first == last). A boundary that is the last element of the range
// does not produce a trailing empty segment. An empty input range yields an
// empty result.
template <typename T>
std::vector<std::pair<const T *, const T *>> SplitBySentenceBoundary(
    const T *begin, const T *end) {
  std::vector<std::pair<const T *, const T *>> segments;
  const T boundary = static_cast<T>(kSentenceBoundary);
  for (const T *cursor = begin; cursor < end;) {
    const T *stop = std::find(cursor, end, boundary);
    segments.emplace_back(cursor, stop);
    // No more boundaries: the segment just added was the last one.
    if (stop == end) break;
    // Resume right after the boundary element.
    cursor = stop + 1;
  }
  return segments;
}
template <class T>
class BoundedPriorityQueue {
public:
explicit BoundedPriorityQueue(size_t size) : size_(size) {}
~BoundedPriorityQueue() = default;
void push(const T &elem, int64 score) {
if (queue_.size() > 4 * size_) resize();
if (queue_.size() >= size_ && queue_[size_ - 1].second > score) return;
queue_.emplace_back(elem, score);
}
const std::vector<std::pair<T, int64>> &get() {
resize();
return queue_;
}
private:
void resize() {
std::sort(queue_.begin(), queue_.end(), [](const auto &p1, const auto &p2) {
return (p1.second > p2.second ||
(p1.second == p2.second && p1.first < p2.first));
});
absl::flat_hash_set<absl::string_view> dup;
std::vector<std::pair<T, int64>> new_queue;
for (auto &p : queue_) {
if (dup.insert(p.first).second) new_queue.emplace_back(std::move(p));
if (new_queue.size() == size_) break;
}
queue_ = std::move(new_queue);
}
size_t size_ = 0;
std::vector<std::pair<T, int64>> queue_;
};
} // namespace
TrainerModel::TrainerModel(const TrainerSpec &trainer_spec,
@ -96,7 +159,7 @@ void TrainerModel::SetSentencePieces(SentencePieces &&sentencepieces) {
CHECK(status().ok());
}
TrainerModel::SentencePieces Trainer::MakeSeedSentencePieces() const {
TrainerModel::SentencePieces Trainer::MakeSeedSentencePieces() {
return trainer_spec_.train_extremely_large_corpus()
? MakeSeedSentencePiecesInternal<int64>()
: MakeSeedSentencePiecesInternal<int32>();
@ -104,7 +167,7 @@ TrainerModel::SentencePieces Trainer::MakeSeedSentencePieces() const {
// Returns seed sentencepieces for EM training.
template <typename node_int_type>
TrainerModel::SentencePieces Trainer::MakeSeedSentencePiecesInternal() const {
TrainerModel::SentencePieces Trainer::MakeSeedSentencePiecesInternal() {
CHECK(!sentences_.empty());
CHECK(!required_chars_.empty());
@ -112,14 +175,43 @@ TrainerModel::SentencePieces Trainer::MakeSeedSentencePiecesInternal() const {
// Pretokenizer is used as a constraint of piece extractions.
const auto *pretokenizer = SentencePieceTrainer::GetPretokenizerForTraining();
auto pretokenize_or_rewrite = [&](std::pair<std::string, int64> *w) {
if (pretokenizer) {
std::vector<char32> chars;
for (const auto &w : pretokenizer->PreTokenize(w->first)) {
for (const auto &c : string_util::UTF8ToUnicodeText(w)) {
chars.push_back(c);
}
chars.push_back(kSentenceBoundary);
}
return chars;
} else if (!trainer_spec_.pretokenization_delimiter().empty()) {
// When delimiter is specified, tokenize the input with the delimiter.
// For EM training, we assume that the delimiter doesn't exist and
// rewrite the original sentence.
std::vector<char32> chars;
absl::string_view delimiter = trainer_spec_.pretokenization_delimiter();
for (const auto &w : absl::StrSplit(w->first, delimiter)) {
for (const auto &c : string_util::UTF8ToUnicodeText(w)) {
chars.push_back(c);
}
chars.push_back(kSentenceBoundary);
}
// Removes the delimiter.
w->first = absl::StrReplaceAll(w->first, {{delimiter, ""}});
return chars;
}
return string_util::UTF8ToUnicodeText(w->first);
};
// Merges all sentences into one array with 0x0000 delimiter.
std::vector<char32> array;
absl::flat_hash_map<std::string, int64> all_chars;
constexpr char32 kSentenceBoundary = 0x0000;
for (const auto &w : sentences_) {
const auto ut = string_util::UTF8ToUnicodeText(
pretokenizer ? pretokenizer->PreTokenize(w.first) : w.first);
const bool is_tsv = trainer_spec_.input_format() == "tsv";
for (auto &w : sentences_) {
const auto ut = pretokenize_or_rewrite(&w);
for (const auto &c : ut) {
array.push_back(c);
if (c != kUNKChar && c != kSentenceBoundary) {
@ -127,6 +219,15 @@ TrainerModel::SentencePieces Trainer::MakeSeedSentencePiecesInternal() const {
}
}
array.push_back(kSentenceBoundary); // sentence boundary marker.
// Naive workaround to over-sample the input.
// In TSV mode, the frequency field is not used to extract the seed piece.
// we can at least extract all pieces by copying the input because
// the occurrence gets at least larger than or equals to 2.
if (is_tsv) {
for (const auto &c : ut) array.push_back(c);
array.push_back(kSentenceBoundary);
}
}
CHECK_LE(array.size(),
@ -147,29 +248,42 @@ TrainerModel::SentencePieces Trainer::MakeSeedSentencePiecesInternal() const {
CHECK_EQ(0, esaxx(array.begin(), SA.begin(), L.begin(), R.begin(), D.begin(),
n, kAlphabetSize, node_num));
LOG(INFO) << "Extracting frequent sub strings...";
std::vector<std::pair<node_int_type, node_int_type>> substr_index;
LOG(INFO) << "Extracting frequent sub strings... node_num=" << node_num;
BoundedPriorityQueue<std::string> queue(
static_cast<size_t>(trainer_spec_.seed_sentencepiece_size()));
for (node_int_type i = 0; i < node_num; ++i) {
const node_int_type offset = SA[L[i]];
const node_int_type len = D[i];
if (len <= 1) {
continue;
}
const char32 *begin = &array[0] + offset;
const char32 *end = &array[0] + offset + len;
// Skips if a substring contains a sentence boundary.
if (std::find(begin, end, kSentenceBoundary) != end) {
continue;
}
const UnicodeText uw(begin, end);
if (!IsValidSentencePiece(uw)) {
continue;
}
// character-wise coverage is the default score.
const node_int_type freq = R[i] - L[i];
const node_int_type score = freq * len;
substr_index.emplace_back(i, score);
for (const auto &p :
SplitBySentenceBoundary(&array[offset], &array[offset + len])) {
if (p.first == p.second) continue;
const auto [begin, end] = NormalizeRange(p.first, p.second);
const UnicodeText uw(begin, end);
if (uw.size() <= 1 || !IsValidSentencePiece(uw)) {
continue;
}
// character-wise coverage is the default score.
const node_int_type freq = R[i] - L[i];
const node_int_type score = freq * freq;
const auto w = string_util::UnicodeTextToUTF8(uw);
queue.push(w, score);
const auto subpieces =
SplitIntoWords(w, trainer_spec_.treat_whitespace_as_suffix(),
trainer_spec_.allow_whitespace_only_pieces());
if (subpieces.size() > 1) {
for (const auto &s : subpieces) queue.push(std::string(s), score);
}
}
}
// all_chars must be included in the seed sentencepieces.
@ -178,22 +292,8 @@ TrainerModel::SentencePieces Trainer::MakeSeedSentencePiecesInternal() const {
seed_sentencepieces.emplace_back(it);
}
// Sort by the coverage of sub strings.
for (const auto &p : Sorted(substr_index)) {
const node_int_type offset = SA[L[p.first]];
const node_int_type len = D[p.first];
CHECK_GT(len, 0);
const char32 *begin = &array[offset];
const char32 *end = &array[offset + len];
const UnicodeText uw(begin, end);
CHECK(IsValidSentencePiece(uw)); // just in case.
const std::string w = string_util::UnicodeTextToUTF8(uw);
if (seed_sentencepieces.size() ==
static_cast<size_t>(trainer_spec_.seed_sentencepiece_size())) {
break;
}
CHECK(!port::ContainsKey(all_chars, w));
seed_sentencepieces.emplace_back(w, p.second);
for (const auto &p : queue.get()) {
seed_sentencepieces.emplace_back(p);
}
ToLogProb(seed_sentencepieces.begin(), seed_sentencepieces.end());
@ -430,6 +530,22 @@ TrainerModel::SentencePieces Trainer::PruneSentencePieces(
return new_sentencepieces;
}
std::pair<const char32 *, const char32 *> Trainer::NormalizeRange(
const char32 *begin, const char32 *end) const {
if (trainer_spec_.treat_whitespace_as_suffix()) {
while ((*begin == kSentenceBoundary || *begin == kWsMarker) &&
begin + 1 < end)
++begin;
while (*(end - 1) == kSentenceBoundary && begin + 1 < end) --end;
} else {
while (*begin == kSentenceBoundary && begin + 1 < end) ++begin;
while ((*(end - 1) == kSentenceBoundary || *(end - 1) == kWsMarker) &&
begin + 1 < end)
--end;
}
return std::make_pair(begin, end);
}
TrainerModel::SentencePieces Trainer::FinalizeSentencePieces(
const TrainerModel &model) const {
const auto &sentencepieces = model.GetSentencePieces();

View File

@ -68,7 +68,7 @@ class Trainer : public TrainerInterface {
: TrainerInterface::TrainerInterface(trainer_spec, normalizer_spec,
denormalizer_spec) {}
TrainerModel::SentencePieces MakeSeedSentencePieces() const;
TrainerModel::SentencePieces MakeSeedSentencePieces();
util::Status Train() override;
@ -80,7 +80,7 @@ class Trainer : public TrainerInterface {
// node_int_type should be of integer type (int32 or int64),
// determined by train_extremely_large_corpus.
template <typename node_int_type>
TrainerModel::SentencePieces MakeSeedSentencePiecesInternal() const;
TrainerModel::SentencePieces MakeSeedSentencePiecesInternal();
// Executes the E step of EM and returns expected count.
// The index of return array is the vocab id.
@ -105,6 +105,9 @@ class Trainer : public TrainerInterface {
TrainerModel::SentencePieces FinalizeSentencePieces(
const TrainerModel &model) const;
std::pair<const char32 *, const char32 *> NormalizeRange(
const char32 *begin, const char32 *end) const;
// When the size of SentencePieces becomes less than desired_vocab_size_,
// break the main training loop. desired_vocab_size_ = 1.1 * vocab_size_
// for now.

View File

@ -117,11 +117,13 @@ TEST(UnigramTrainerTest, BasicTest) {
30);
// Check seed pieces.
EXPECT_EQ(27, res.seed_pieces_and_probs.size());
EXPECT_EQ(63, res.seed_pieces_and_probs.size());
// Check final pieces.
EXPECT_EQ("i a n y m l e apple ve O P r t g an v ▁ A b le ▁an p d h",
res.sentence_pieces);
EXPECT_EQ(
"Overly Pineapple magnanimity Available ▁an a ▁ b A t g r P O v m y p n "
"l d e h i",
res.sentence_pieces);
}
TEST(UnigramTrainerTest, BasicDPTest) {
@ -132,8 +134,7 @@ TEST(UnigramTrainerTest, BasicDPTest) {
"Overly \t 6", "Available \t 5"},
22, true /*use_dp*/, 0 /*dp_noise*/, 4 /*dp_clipping*/);
// Got 16 instead of 27 seeds.
EXPECT_EQ(16, res.seed_pieces_and_probs.size());
EXPECT_EQ(49, res.seed_pieces_and_probs.size());
// And they are equiv to if the last sentence was not there.
const auto& res_nodp = RunTrainer(
@ -191,12 +192,12 @@ TEST(UnigramTrainerTest, EndToEndTest) {
.ok());
// TODO(taku): Temporarily disable this test on Windows.
#ifndef OS_WIN
EXPECT_EQ(WS
" 吾輩 《 わが はい 》 は 猫 である 。 名前 はまだ 無い 。 "
"どこ で 生 れた か とん と 見当 《 けん とう 》 が つか ぬ 。 "
"何でも 薄 暗 い じめ じめ した 所で ニャーニャー "
"泣 い ていた 事 だけ 記憶 している 。",
absl::StrJoin(tok, " "));
EXPECT_EQ(
WS
" 吾輩 《 わ が は い 》 は猫である 。 名前は まだ 無 い 。 どこ で 生れ "
"た か とん と 見当 《 けん とう 》 が つか ぬ 。 何でも 薄 暗 い じめ "
"じめ した 所で ニャーニャー 泣 い ていた 事 だけ 記憶している 。",
absl::StrJoin(tok, " "));
#endif
}