diff --git a/python/src/sentencepiece/sentencepiece_model_pb2.py b/python/src/sentencepiece/sentencepiece_model_pb2.py index c6ff5a6..3b824d4 100644 --- a/python/src/sentencepiece/sentencepiece_model_pb2.py +++ b/python/src/sentencepiece/sentencepiece_model_pb2.py @@ -19,7 +19,7 @@ DESCRIPTOR = _descriptor.FileDescriptor( syntax='proto2', serialized_options=b'H\003', create_key=_descriptor._internal_create_key, - serialized_pb=b'\n\x19sentencepiece_model.proto\x12\rsentencepiece\"\xdb\x0b\n\x0bTrainerSpec\x12\r\n\x05input\x18\x01 \x03(\t\x12\x14\n\x0cinput_format\x18\x07 \x01(\t\x12\x14\n\x0cmodel_prefix\x18\x02 \x01(\t\x12\x41\n\nmodel_type\x18\x03 \x01(\x0e\x32$.sentencepiece.TrainerSpec.ModelType:\x07UNIGRAM\x12\x18\n\nvocab_size\x18\x04 \x01(\x05:\x04\x38\x30\x30\x30\x12\x17\n\x0f\x61\x63\x63\x65pt_language\x18\x05 \x03(\t\x12 \n\x15self_test_sample_size\x18\x06 \x01(\x05:\x01\x30\x12*\n\x1b\x65nable_differential_privacy\x18\x32 \x01(\x08:\x05\x66\x61lse\x12+\n differential_privacy_noise_level\x18\x33 \x01(\x02:\x01\x30\x12\x32\n\'differential_privacy_clipping_threshold\x18\x34 \x01(\x04:\x01\x30\x12\"\n\x12\x63haracter_coverage\x18\n \x01(\x02:\x06\x30.9995\x12\x1e\n\x13input_sentence_size\x18\x0b \x01(\x04:\x01\x30\x12$\n\x16shuffle_input_sentence\x18\x13 \x01(\x08:\x04true\x12 \n\x14mining_sentence_size\x18\x0c \x01(\x05\x42\x02\x18\x01\x12\"\n\x16training_sentence_size\x18\r \x01(\x05\x42\x02\x18\x01\x12(\n\x17seed_sentencepiece_size\x18\x0e \x01(\x05:\x07\x31\x30\x30\x30\x30\x30\x30\x12\x1e\n\x10shrinking_factor\x18\x0f \x01(\x02:\x04\x30.75\x12!\n\x13max_sentence_length\x18\x12 \x01(\x05:\x04\x34\x31\x39\x32\x12\x17\n\x0bnum_threads\x18\x10 \x01(\x05:\x02\x31\x36\x12\x1d\n\x12num_sub_iterations\x18\x11 \x01(\x05:\x01\x32\x12$\n\x18max_sentencepiece_length\x18\x14 \x01(\x05:\x02\x31\x36\x12%\n\x17split_by_unicode_script\x18\x15 \x01(\x08:\x04true\x12\x1d\n\x0fsplit_by_number\x18\x17 \x01(\x08:\x04true\x12!\n\x13split_by_whitespace\x18\x16 \x01(\x08:\x04true\x12)\n\x1atreat_whitespace_as_suffix\x18\x18 \x01(\x08:\x05\x66\x61lse\x12+\n\x1c\x61llow_whitespace_only_pieces\x18\x1a \x01(\x08:\x05\x66\x61lse\x12\x1b\n\x0csplit_digits\x18\x19 \x01(\x08:\x05\x66\x61lse\x12\x17\n\x0f\x63ontrol_symbols\x18\x1e \x03(\t\x12\x1c\n\x14user_defined_symbols\x18\x1f \x03(\t\x12\x16\n\x0erequired_chars\x18$ \x01(\t\x12\x1c\n\rbyte_fallback\x18# \x01(\x08:\x05\x66\x61lse\x12+\n\x1dvocabulary_output_piece_score\x18 \x01(\x08:\x04true\x12\x1e\n\x10hard_vocab_limit\x18! \x01(\x08:\x04true\x12\x1c\n\ruse_all_vocab\x18\" \x01(\x08:\x05\x66\x61lse\x12\x11\n\x06unk_id\x18( \x01(\x05:\x01\x30\x12\x11\n\x06\x62os_id\x18) \x01(\x05:\x01\x31\x12\x11\n\x06\x65os_id\x18* \x01(\x05:\x01\x32\x12\x12\n\x06pad_id\x18+ \x01(\x05:\x02-1\x12\x18\n\tunk_piece\x18- \x01(\t:\x05\x12\x16\n\tbos_piece\x18. \x01(\t:\x03\x12\x17\n\teos_piece\x18/ \x01(\t:\x04\x12\x18\n\tpad_piece\x18\x30 \x01(\t:\x05\x12\x1a\n\x0bunk_surface\x18, \x01(\t:\x05 \xe2\x81\x87 \x12+\n\x1ctrain_extremely_large_corpus\x18\x31 \x01(\x08:\x05\x66\x61lse\"5\n\tModelType\x12\x0b\n\x07UNIGRAM\x10\x01\x12\x07\n\x03\x42PE\x10\x02\x12\x08\n\x04WORD\x10\x03\x12\x08\n\x04\x43HAR\x10\x04*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\"\xd1\x01\n\x0eNormalizerSpec\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x1c\n\x14precompiled_charsmap\x18\x02 \x01(\x0c\x12\x1e\n\x10\x61\x64\x64_dummy_prefix\x18\x03 \x01(\x08:\x04true\x12&\n\x18remove_extra_whitespaces\x18\x04 \x01(\x08:\x04true\x12 \n\x12\x65scape_whitespaces\x18\x05 \x01(\x08:\x04true\x12\x1e\n\x16normalization_rule_tsv\x18\x06 \x01(\t*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\"y\n\x0cSelfTestData\x12\x33\n\x07samples\x18\x01 \x03(\x0b\x32\".sentencepiece.SelfTestData.Sample\x1a)\n\x06Sample\x12\r\n\x05input\x18\x01 \x01(\t\x12\x10\n\x08\x65xpected\x18\x02 \x01(\t*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\"\xfe\x03\n\nModelProto\x12\x37\n\x06pieces\x18\x01 \x03(\x0b\x32\'.sentencepiece.ModelProto.SentencePiece\x12\x30\n\x0ctrainer_spec\x18\x02 \x01(\x0b\x32\x1a.sentencepiece.TrainerSpec\x12\x36\n\x0fnormalizer_spec\x18\x03 \x01(\x0b\x32\x1d.sentencepiece.NormalizerSpec\x12\x33\n\x0eself_test_data\x18\x04 \x01(\x0b\x32\x1b.sentencepiece.SelfTestData\x12\x38\n\x11\x64\x65normalizer_spec\x18\x05 \x01(\x0b\x32\x1d.sentencepiece.NormalizerSpec\x1a\xd2\x01\n\rSentencePiece\x12\r\n\x05piece\x18\x01 \x01(\t\x12\r\n\x05score\x18\x02 \x01(\x02\x12\x42\n\x04type\x18\x03 \x01(\x0e\x32,.sentencepiece.ModelProto.SentencePiece.Type:\x06NORMAL\"T\n\x04Type\x12\n\n\x06NORMAL\x10\x01\x12\x0b\n\x07UNKNOWN\x10\x02\x12\x0b\n\x07\x43ONTROL\x10\x03\x12\x10\n\x0cUSER_DEFINED\x10\x04\x12\x08\n\x04\x42YTE\x10\x06\x12\n\n\x06UNUSED\x10\x05*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\x42\x02H\x03' + serialized_pb=b'\n\x19sentencepiece_model.proto\x12\rsentencepiece\"\x80\x0c\n\x0bTrainerSpec\x12\r\n\x05input\x18\x01 \x03(\t\x12\x14\n\x0cinput_format\x18\x07 \x01(\t\x12\x14\n\x0cmodel_prefix\x18\x02 \x01(\t\x12\x41\n\nmodel_type\x18\x03 \x01(\x0e\x32$.sentencepiece.TrainerSpec.ModelType:\x07UNIGRAM\x12\x18\n\nvocab_size\x18\x04 \x01(\x05:\x04\x38\x30\x30\x30\x12\x17\n\x0f\x61\x63\x63\x65pt_language\x18\x05 \x03(\t\x12 \n\x15self_test_sample_size\x18\x06 \x01(\x05:\x01\x30\x12*\n\x1b\x65nable_differential_privacy\x18\x32 \x01(\x08:\x05\x66\x61lse\x12+\n differential_privacy_noise_level\x18\x33 \x01(\x02:\x01\x30\x12\x32\n\'differential_privacy_clipping_threshold\x18\x34 \x01(\x04:\x01\x30\x12\"\n\x12\x63haracter_coverage\x18\n \x01(\x02:\x06\x30.9995\x12\x1e\n\x13input_sentence_size\x18\x0b \x01(\x04:\x01\x30\x12$\n\x16shuffle_input_sentence\x18\x13 \x01(\x08:\x04true\x12 \n\x14mining_sentence_size\x18\x0c \x01(\x05\x42\x02\x18\x01\x12\"\n\x16training_sentence_size\x18\r \x01(\x05\x42\x02\x18\x01\x12(\n\x17seed_sentencepiece_size\x18\x0e \x01(\x05:\x07\x31\x30\x30\x30\x30\x30\x30\x12\x1e\n\x10shrinking_factor\x18\x0f \x01(\x02:\x04\x30.75\x12!\n\x13max_sentence_length\x18\x12 \x01(\x05:\x04\x34\x31\x39\x32\x12\x17\n\x0bnum_threads\x18\x10 \x01(\x05:\x02\x31\x36\x12\x1d\n\x12num_sub_iterations\x18\x11 \x01(\x05:\x01\x32\x12$\n\x18max_sentencepiece_length\x18\x14 \x01(\x05:\x02\x31\x36\x12%\n\x17split_by_unicode_script\x18\x15 \x01(\x08:\x04true\x12\x1d\n\x0fsplit_by_number\x18\x17 \x01(\x08:\x04true\x12!\n\x13split_by_whitespace\x18\x16 \x01(\x08:\x04true\x12)\n\x1atreat_whitespace_as_suffix\x18\x18 \x01(\x08:\x05\x66\x61lse\x12+\n\x1c\x61llow_whitespace_only_pieces\x18\x1a \x01(\x08:\x05\x66\x61lse\x12\x1b\n\x0csplit_digits\x18\x19 \x01(\x08:\x05\x66\x61lse\x12#\n\x19pretokenization_delimiter\x18\x35 \x01(\t:\x00\x12\x17\n\x0f\x63ontrol_symbols\x18\x1e \x03(\t\x12\x1c\n\x14user_defined_symbols\x18\x1f \x03(\t\x12\x16\n\x0erequired_chars\x18$ \x01(\t\x12\x1c\n\rbyte_fallback\x18# \x01(\x08:\x05\x66\x61lse\x12+\n\x1dvocabulary_output_piece_score\x18 \x01(\x08:\x04true\x12\x1e\n\x10hard_vocab_limit\x18! \x01(\x08:\x04true\x12\x1c\n\ruse_all_vocab\x18\" \x01(\x08:\x05\x66\x61lse\x12\x11\n\x06unk_id\x18( \x01(\x05:\x01\x30\x12\x11\n\x06\x62os_id\x18) \x01(\x05:\x01\x31\x12\x11\n\x06\x65os_id\x18* \x01(\x05:\x01\x32\x12\x12\n\x06pad_id\x18+ \x01(\x05:\x02-1\x12\x18\n\tunk_piece\x18- \x01(\t:\x05\x12\x16\n\tbos_piece\x18. \x01(\t:\x03\x12\x17\n\teos_piece\x18/ \x01(\t:\x04\x12\x18\n\tpad_piece\x18\x30 \x01(\t:\x05\x12\x1a\n\x0bunk_surface\x18, \x01(\t:\x05 \xe2\x81\x87 \x12+\n\x1ctrain_extremely_large_corpus\x18\x31 \x01(\x08:\x05\x66\x61lse\"5\n\tModelType\x12\x0b\n\x07UNIGRAM\x10\x01\x12\x07\n\x03\x42PE\x10\x02\x12\x08\n\x04WORD\x10\x03\x12\x08\n\x04\x43HAR\x10\x04*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\"\xd1\x01\n\x0eNormalizerSpec\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x1c\n\x14precompiled_charsmap\x18\x02 \x01(\x0c\x12\x1e\n\x10\x61\x64\x64_dummy_prefix\x18\x03 \x01(\x08:\x04true\x12&\n\x18remove_extra_whitespaces\x18\x04 \x01(\x08:\x04true\x12 \n\x12\x65scape_whitespaces\x18\x05 \x01(\x08:\x04true\x12\x1e\n\x16normalization_rule_tsv\x18\x06 \x01(\t*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\"y\n\x0cSelfTestData\x12\x33\n\x07samples\x18\x01 \x03(\x0b\x32\".sentencepiece.SelfTestData.Sample\x1a)\n\x06Sample\x12\r\n\x05input\x18\x01 \x01(\t\x12\x10\n\x08\x65xpected\x18\x02 \x01(\t*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\"\xfe\x03\n\nModelProto\x12\x37\n\x06pieces\x18\x01 \x03(\x0b\x32\'.sentencepiece.ModelProto.SentencePiece\x12\x30\n\x0ctrainer_spec\x18\x02 \x01(\x0b\x32\x1a.sentencepiece.TrainerSpec\x12\x36\n\x0fnormalizer_spec\x18\x03 \x01(\x0b\x32\x1d.sentencepiece.NormalizerSpec\x12\x33\n\x0eself_test_data\x18\x04 \x01(\x0b\x32\x1b.sentencepiece.SelfTestData\x12\x38\n\x11\x64\x65normalizer_spec\x18\x05 \x01(\x0b\x32\x1d.sentencepiece.NormalizerSpec\x1a\xd2\x01\n\rSentencePiece\x12\r\n\x05piece\x18\x01 \x01(\t\x12\r\n\x05score\x18\x02 \x01(\x02\x12\x42\n\x04type\x18\x03 \x01(\x0e\x32,.sentencepiece.ModelProto.SentencePiece.Type:\x06NORMAL\"T\n\x04Type\x12\n\n\x06NORMAL\x10\x01\x12\x0b\n\x07UNKNOWN\x10\x02\x12\x0b\n\x07\x43ONTROL\x10\x03\x12\x10\n\x0cUSER_DEFINED\x10\x04\x12\x08\n\x04\x42YTE\x10\x06\x12\n\n\x06UNUSED\x10\x05*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\x42\x02H\x03' ) @@ -54,8 +54,8 @@ _TRAINERSPEC_MODELTYPE = _descriptor.EnumDescriptor( ], containing_type=None, serialized_options=None, - serialized_start=1480, - serialized_end=1533, + serialized_start=1517, + serialized_end=1570, ) _sym_db.RegisterEnumDescriptor(_TRAINERSPEC_MODELTYPE) @@ -99,8 +99,8 @@ _MODELPROTO_SENTENCEPIECE_TYPE = _descriptor.EnumDescriptor( ], containing_type=None, serialized_options=None, - serialized_start=2286, - serialized_end=2370, + serialized_start=2323, + serialized_end=2407, ) _sym_db.RegisterEnumDescriptor(_MODELPROTO_SENTENCEPIECE_TYPE) @@ -303,119 +303,126 @@ _TRAINERSPEC = _descriptor.Descriptor( is_extension=False, extension_scope=None, serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( - name='control_symbols', full_name='sentencepiece.TrainerSpec.control_symbols', index=27, + name='pretokenization_delimiter', full_name='sentencepiece.TrainerSpec.pretokenization_delimiter', index=27, + number=53, type=9, cpp_type=9, label=1, + has_default_value=True, default_value=b"".decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='control_symbols', full_name='sentencepiece.TrainerSpec.control_symbols', index=28, number=30, type=9, cpp_type=9, label=3, has_default_value=False, default_value=[], message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( - name='user_defined_symbols', full_name='sentencepiece.TrainerSpec.user_defined_symbols', index=28, + name='user_defined_symbols', full_name='sentencepiece.TrainerSpec.user_defined_symbols', index=29, number=31, type=9, cpp_type=9, label=3, has_default_value=False, default_value=[], message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( - name='required_chars', full_name='sentencepiece.TrainerSpec.required_chars', index=29, + name='required_chars', full_name='sentencepiece.TrainerSpec.required_chars', index=30, number=36, type=9, cpp_type=9, label=1, has_default_value=False, default_value=b"".decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( - name='byte_fallback', full_name='sentencepiece.TrainerSpec.byte_fallback', index=30, + name='byte_fallback', full_name='sentencepiece.TrainerSpec.byte_fallback', index=31, number=35, type=8, cpp_type=7, label=1, has_default_value=True, default_value=False, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( - name='vocabulary_output_piece_score', full_name='sentencepiece.TrainerSpec.vocabulary_output_piece_score', index=31, + name='vocabulary_output_piece_score', full_name='sentencepiece.TrainerSpec.vocabulary_output_piece_score', index=32, number=32, type=8, cpp_type=7, label=1, has_default_value=True, default_value=True, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( - name='hard_vocab_limit', full_name='sentencepiece.TrainerSpec.hard_vocab_limit', index=32, + name='hard_vocab_limit', full_name='sentencepiece.TrainerSpec.hard_vocab_limit', index=33, number=33, type=8, cpp_type=7, label=1, has_default_value=True, default_value=True, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( - name='use_all_vocab', full_name='sentencepiece.TrainerSpec.use_all_vocab', index=33, + name='use_all_vocab', full_name='sentencepiece.TrainerSpec.use_all_vocab', index=34, number=34, type=8, cpp_type=7, label=1, has_default_value=True, default_value=False, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( - name='unk_id', full_name='sentencepiece.TrainerSpec.unk_id', index=34, + name='unk_id', full_name='sentencepiece.TrainerSpec.unk_id', index=35, number=40, type=5, cpp_type=1, label=1, has_default_value=True, default_value=0, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( - name='bos_id', full_name='sentencepiece.TrainerSpec.bos_id', index=35, + name='bos_id', full_name='sentencepiece.TrainerSpec.bos_id', index=36, number=41, type=5, cpp_type=1, label=1, has_default_value=True, default_value=1, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( - name='eos_id', full_name='sentencepiece.TrainerSpec.eos_id', index=36, + name='eos_id', full_name='sentencepiece.TrainerSpec.eos_id', index=37, number=42, type=5, cpp_type=1, label=1, has_default_value=True, default_value=2, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( - name='pad_id', full_name='sentencepiece.TrainerSpec.pad_id', index=37, + name='pad_id', full_name='sentencepiece.TrainerSpec.pad_id', index=38, number=43, type=5, cpp_type=1, label=1, has_default_value=True, default_value=-1, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( - name='unk_piece', full_name='sentencepiece.TrainerSpec.unk_piece', index=38, + name='unk_piece', full_name='sentencepiece.TrainerSpec.unk_piece', index=39, number=45, type=9, cpp_type=9, label=1, has_default_value=True, default_value=b"".decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( - name='bos_piece', full_name='sentencepiece.TrainerSpec.bos_piece', index=39, + name='bos_piece', full_name='sentencepiece.TrainerSpec.bos_piece', index=40, number=46, type=9, cpp_type=9, label=1, has_default_value=True, default_value=b"".decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( - name='eos_piece', full_name='sentencepiece.TrainerSpec.eos_piece', index=40, + name='eos_piece', full_name='sentencepiece.TrainerSpec.eos_piece', index=41, number=47, type=9, cpp_type=9, label=1, has_default_value=True, default_value=b"".decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( - name='pad_piece', full_name='sentencepiece.TrainerSpec.pad_piece', index=41, + name='pad_piece', full_name='sentencepiece.TrainerSpec.pad_piece', index=42, number=48, type=9, cpp_type=9, label=1, has_default_value=True, default_value=b"".decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( - name='unk_surface', full_name='sentencepiece.TrainerSpec.unk_surface', index=42, + name='unk_surface', full_name='sentencepiece.TrainerSpec.unk_surface', index=43, number=44, type=9, cpp_type=9, label=1, has_default_value=True, default_value=b" \342\201\207 ".decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( - name='train_extremely_large_corpus', full_name='sentencepiece.TrainerSpec.train_extremely_large_corpus', index=43, + name='train_extremely_large_corpus', full_name='sentencepiece.TrainerSpec.train_extremely_large_corpus', index=44, number=49, type=8, cpp_type=7, label=1, has_default_value=True, default_value=False, message_type=None, enum_type=None, containing_type=None, @@ -435,7 +442,7 @@ _TRAINERSPEC = _descriptor.Descriptor( oneofs=[ ], serialized_start=45, - serialized_end=1544, + serialized_end=1581, ) @@ -501,8 +508,8 @@ _NORMALIZERSPEC = _descriptor.Descriptor( extension_ranges=[(200, 536870912), ], oneofs=[ ], - serialized_start=1547, - serialized_end=1756, + serialized_start=1584, + serialized_end=1793, ) @@ -540,8 +547,8 @@ _SELFTESTDATA_SAMPLE = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=1827, - serialized_end=1868, + serialized_start=1864, + serialized_end=1905, ) _SELFTESTDATA = _descriptor.Descriptor( @@ -571,8 +578,8 @@ _SELFTESTDATA = _descriptor.Descriptor( extension_ranges=[(200, 536870912), ], oneofs=[ ], - serialized_start=1758, - serialized_end=1879, + serialized_start=1795, + serialized_end=1916, ) @@ -618,8 +625,8 @@ _MODELPROTO_SENTENCEPIECE = _descriptor.Descriptor( extension_ranges=[(200, 536870912), ], oneofs=[ ], - serialized_start=2171, - serialized_end=2381, + serialized_start=2208, + serialized_end=2418, ) _MODELPROTO = _descriptor.Descriptor( @@ -677,8 +684,8 @@ _MODELPROTO = _descriptor.Descriptor( extension_ranges=[(200, 536870912), ], oneofs=[ ], - serialized_start=1882, - serialized_end=2392, + serialized_start=1919, + serialized_end=2429, ) _TRAINERSPEC.fields_by_name['model_type'].enum_type = _TRAINERSPEC_MODELTYPE diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 216e0c6..c272b6e 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -63,7 +63,7 @@ if (SPM_USE_BUILTIN_PROTOBUF) if (MSVC) add_definitions("/DHAVE_PTHREAD /wd4018 /wd4514") else() - add_definitions("-pthread -DHAVE_PTHREAD=1 -Wno-sign-compare") + add_definitions("-pthread -DHAVE_PTHREAD=1 -Wno-sign-compare -Wno-deprecated-declarations") endif() include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../third_party/protobuf-lite) include_directories(builtin_pb) diff --git a/src/builtin_pb/sentencepiece_model.pb.cc b/src/builtin_pb/sentencepiece_model.pb.cc index a844938..669cac2 100644 --- a/src/builtin_pb/sentencepiece_model.pb.cc +++ b/src/builtin_pb/sentencepiece_model.pb.cc @@ -285,101 +285,104 @@ class TrainerSpec::_Internal { (*has_bits)[0] |= 1u; } static void set_has_model_type(HasBits* has_bits) { - (*has_bits)[0] |= 4194304u; - } - static void set_has_vocab_size(HasBits* has_bits) { (*has_bits)[0] |= 8388608u; } - static void set_has_self_test_sample_size(HasBits* has_bits) { - (*has_bits)[0] |= 256u; - } - static void set_has_enable_differential_privacy(HasBits* has_bits) { - (*has_bits)[0] |= 4096u; - } - static void set_has_differential_privacy_noise_level(HasBits* has_bits) { - (*has_bits)[0] |= 1048576u; - } - static void set_has_differential_privacy_clipping_threshold(HasBits* has_bits) { - (*has_bits)[0] |= 2097152u; - } - static void set_has_character_coverage(HasBits* has_bits) { + static void set_has_vocab_size(HasBits* has_bits) { (*has_bits)[0] |= 16777216u; } - static void set_has_input_sentence_size(HasBits* has_bits) { - (*has_bits)[0] |= 1024u; - } - static void set_has_shuffle_input_sentence(HasBits* has_bits) { - (*has_bits)[0] |= 2147483648u; - } - static void set_has_mining_sentence_size(HasBits* has_bits) { + static void set_has_self_test_sample_size(HasBits* has_bits) { (*has_bits)[0] |= 512u; } - static void set_has_training_sentence_size(HasBits* has_bits) { - (*has_bits)[0] |= 2048u; - } - static void set_has_seed_sentencepiece_size(HasBits* has_bits) { - (*has_bits)[0] |= 33554432u; - } - static void set_has_shrinking_factor(HasBits* has_bits) { - (*has_bits)[0] |= 67108864u; - } - static void set_has_max_sentence_length(HasBits* has_bits) { - (*has_bits)[0] |= 536870912u; - } - static void set_has_num_threads(HasBits* has_bits) { - (*has_bits)[0] |= 134217728u; - } - static void set_has_num_sub_iterations(HasBits* has_bits) { - (*has_bits)[0] |= 268435456u; - } - static void set_has_max_sentencepiece_length(HasBits* has_bits) { - (*has_bits)[0] |= 1073741824u; - } - static void set_has_split_by_unicode_script(HasBits* has_bits) { - (*has_bits)[1] |= 1u; - } - static void set_has_split_by_number(HasBits* has_bits) { - (*has_bits)[1] |= 2u; - } - static void set_has_split_by_whitespace(HasBits* has_bits) { - (*has_bits)[1] |= 4u; - } - static void set_has_treat_whitespace_as_suffix(HasBits* has_bits) { + static void set_has_enable_differential_privacy(HasBits* has_bits) { (*has_bits)[0] |= 8192u; } - static void set_has_allow_whitespace_only_pieces(HasBits* has_bits) { + static void set_has_differential_privacy_noise_level(HasBits* has_bits) { + (*has_bits)[0] |= 2097152u; + } + static void set_has_differential_privacy_clipping_threshold(HasBits* has_bits) { + (*has_bits)[0] |= 4194304u; + } + static void set_has_character_coverage(HasBits* has_bits) { + (*has_bits)[0] |= 33554432u; + } + static void set_has_input_sentence_size(HasBits* has_bits) { + (*has_bits)[0] |= 2048u; + } + static void set_has_shuffle_input_sentence(HasBits* has_bits) { + (*has_bits)[1] |= 1u; + } + static void set_has_mining_sentence_size(HasBits* has_bits) { + (*has_bits)[0] |= 1024u; + } + static void set_has_training_sentence_size(HasBits* has_bits) { + (*has_bits)[0] |= 4096u; + } + static void set_has_seed_sentencepiece_size(HasBits* has_bits) { + (*has_bits)[0] |= 67108864u; + } + static void set_has_shrinking_factor(HasBits* has_bits) { + (*has_bits)[0] |= 134217728u; + } + static void set_has_max_sentence_length(HasBits* has_bits) { + (*has_bits)[0] |= 1073741824u; + } + static void set_has_num_threads(HasBits* has_bits) { + (*has_bits)[0] |= 268435456u; + } + static void set_has_num_sub_iterations(HasBits* has_bits) { + (*has_bits)[0] |= 536870912u; + } + static void set_has_max_sentencepiece_length(HasBits* has_bits) { + (*has_bits)[0] |= 2147483648u; + } + static void set_has_split_by_unicode_script(HasBits* has_bits) { + (*has_bits)[1] |= 2u; + } + static void set_has_split_by_number(HasBits* has_bits) { + (*has_bits)[1] |= 4u; + } + static void set_has_split_by_whitespace(HasBits* has_bits) { + (*has_bits)[1] |= 8u; + } + static void set_has_treat_whitespace_as_suffix(HasBits* has_bits) { (*has_bits)[0] |= 16384u; } - static void set_has_split_digits(HasBits* has_bits) { + static void set_has_allow_whitespace_only_pieces(HasBits* has_bits) { (*has_bits)[0] |= 32768u; } + static void set_has_split_digits(HasBits* has_bits) { + (*has_bits)[0] |= 65536u; + } + static void set_has_pretokenization_delimiter(HasBits* has_bits) { + (*has_bits)[0] |= 256u; + } static void set_has_required_chars(HasBits* has_bits) { (*has_bits)[0] |= 4u; } static void set_has_byte_fallback(HasBits* has_bits) { - (*has_bits)[0] |= 65536u; - } - static void set_has_vocabulary_output_piece_score(HasBits* has_bits) { - (*has_bits)[1] |= 8u; - } - static void set_has_hard_vocab_limit(HasBits* has_bits) { - (*has_bits)[1] |= 16u; - } - static void set_has_use_all_vocab(HasBits* has_bits) { (*has_bits)[0] |= 131072u; } - static void set_has_unk_id(HasBits* has_bits) { - (*has_bits)[0] |= 524288u; + static void set_has_vocabulary_output_piece_score(HasBits* has_bits) { + (*has_bits)[1] |= 16u; } - static void set_has_bos_id(HasBits* has_bits) { + static void set_has_hard_vocab_limit(HasBits* has_bits) { (*has_bits)[1] |= 32u; } - static void set_has_eos_id(HasBits* has_bits) { + static void set_has_use_all_vocab(HasBits* has_bits) { + (*has_bits)[0] |= 262144u; + } + static void set_has_unk_id(HasBits* has_bits) { + (*has_bits)[0] |= 1048576u; + } + static void set_has_bos_id(HasBits* has_bits) { (*has_bits)[1] |= 64u; } - static void set_has_pad_id(HasBits* has_bits) { + static void set_has_eos_id(HasBits* has_bits) { (*has_bits)[1] |= 128u; } + static void set_has_pad_id(HasBits* has_bits) { + (*has_bits)[1] |= 256u; + } static void set_has_unk_piece(HasBits* has_bits) { (*has_bits)[0] |= 16u; } @@ -396,7 +399,7 @@ class TrainerSpec::_Internal { (*has_bits)[0] |= 8u; } static void set_has_train_extremely_large_corpus(HasBits* has_bits) { - (*has_bits)[0] |= 262144u; + (*has_bits)[0] |= 524288u; } }; @@ -465,6 +468,11 @@ TrainerSpec::TrainerSpec(const TrainerSpec& from) pad_piece_.Set(::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr::NonEmptyDefault{}, from._internal_pad_piece(), GetArena()); } + pretokenization_delimiter_.UnsafeSetDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited()); + if (from._internal_has_pretokenization_delimiter()) { + pretokenization_delimiter_.Set(::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr::EmptyDefault{}, from._internal_pretokenization_delimiter(), + GetArena()); + } ::memcpy(&self_test_sample_size_, &from.self_test_sample_size_, static_cast(reinterpret_cast(&pad_id_) - reinterpret_cast(&self_test_sample_size_)) + sizeof(pad_id_)); @@ -481,6 +489,7 @@ void TrainerSpec::SharedCtor() { bos_piece_.UnsafeSetDefault(nullptr); eos_piece_.UnsafeSetDefault(nullptr); pad_piece_.UnsafeSetDefault(nullptr); + pretokenization_delimiter_.UnsafeSetDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited()); ::memset(reinterpret_cast(this) + static_cast( reinterpret_cast(&self_test_sample_size_) - reinterpret_cast(this)), 0, static_cast(reinterpret_cast(&differential_privacy_clipping_threshold_) - @@ -521,6 +530,7 @@ void TrainerSpec::SharedDtor() { bos_piece_.DestroyNoArena(nullptr); eos_piece_.DestroyNoArena(nullptr); pad_piece_.DestroyNoArena(nullptr); + pretokenization_delimiter_.DestroyNoArena(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited()); } void TrainerSpec::ArenaDtor(void* object) { @@ -576,19 +586,22 @@ void TrainerSpec::Clear() { pad_piece_.ClearToDefault(::sentencepiece::TrainerSpec::_i_give_permission_to_break_this_code_default_pad_piece_, GetArena()); } } - if (cached_has_bits & 0x0000ff00u) { + if (cached_has_bits & 0x00000100u) { + pretokenization_delimiter_.ClearNonDefaultToEmpty(); + } + if (cached_has_bits & 0x0000fe00u) { ::memset(&self_test_sample_size_, 0, static_cast( - reinterpret_cast(&split_digits_) - - reinterpret_cast(&self_test_sample_size_)) + sizeof(split_digits_)); + reinterpret_cast(&allow_whitespace_only_pieces_) - + reinterpret_cast(&self_test_sample_size_)) + sizeof(allow_whitespace_only_pieces_)); } if (cached_has_bits & 0x00ff0000u) { - ::memset(&byte_fallback_, 0, static_cast( + ::memset(&split_digits_, 0, static_cast( reinterpret_cast(&differential_privacy_clipping_threshold_) - - reinterpret_cast(&byte_fallback_)) + sizeof(differential_privacy_clipping_threshold_)); + reinterpret_cast(&split_digits_)) + sizeof(differential_privacy_clipping_threshold_)); model_type_ = 1; - vocab_size_ = 8000; } if (cached_has_bits & 0xff000000u) { + vocab_size_ = 8000; character_coverage_ = 0.9995f; seed_sentencepiece_size_ = 1000000; shrinking_factor_ = 0.75f; @@ -596,10 +609,10 @@ void TrainerSpec::Clear() { num_sub_iterations_ = 2; max_sentence_length_ = 4192; max_sentencepiece_length_ = 16; - shuffle_input_sentence_ = true; } cached_has_bits = _has_bits_[1]; if (cached_has_bits & 0x000000ffu) { + shuffle_input_sentence_ = true; split_by_unicode_script_ = true; split_by_number_ = true; split_by_whitespace_ = true; @@ -607,8 +620,8 @@ void TrainerSpec::Clear() { hard_vocab_limit_ = true; bos_id_ = 1; eos_id_ = 2; - pad_id_ = -1; } + pad_id_ = -1; _has_bits_.Clear(); _internal_metadata_.Clear(); } @@ -996,6 +1009,14 @@ const char* TrainerSpec::_InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID CHK_(ptr); } else goto handle_unusual; continue; + // optional string pretokenization_delimiter = 53 [default = ""]; + case 53: + if (PROTOBUF_PREDICT_TRUE(static_cast<::PROTOBUF_NAMESPACE_ID::uint8>(tag) == 170)) { + auto str = _internal_mutable_pretokenization_delimiter(); + ptr = ::PROTOBUF_NAMESPACE_ID::internal::InlineGreedyStringParser(str, ptr, ctx); + CHK_(ptr); + } else goto handle_unusual; + continue; default: { handle_unusual: if ((tag & 7) == 4 || tag == 0) { @@ -1044,14 +1065,14 @@ failure: } // optional .sentencepiece.TrainerSpec.ModelType model_type = 3 [default = UNIGRAM]; - if (cached_has_bits & 0x00400000u) { + if (cached_has_bits & 0x00800000u) { target = stream->EnsureSpace(target); target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteEnumToArray( 3, this->_internal_model_type(), target); } // optional int32 vocab_size = 4 [default = 8000]; - if (cached_has_bits & 0x00800000u) { + if (cached_has_bits & 0x01000000u) { target = stream->EnsureSpace(target); target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteInt32ToArray(4, this->_internal_vocab_size(), target); } @@ -1063,7 +1084,7 @@ failure: } // optional int32 self_test_sample_size = 6 [default = 0]; - if (cached_has_bits & 0x00000100u) { + if (cached_has_bits & 0x00000200u) { target = stream->EnsureSpace(target); target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteInt32ToArray(6, this->_internal_self_test_sample_size(), target); } @@ -1075,105 +1096,107 @@ failure: } // optional float character_coverage = 10 [default = 0.9995]; - if (cached_has_bits & 0x01000000u) { + if (cached_has_bits & 0x02000000u) { target = stream->EnsureSpace(target); target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteFloatToArray(10, this->_internal_character_coverage(), target); } // optional uint64 input_sentence_size = 11 [default = 0]; - if (cached_has_bits & 0x00000400u) { + if (cached_has_bits & 0x00000800u) { target = stream->EnsureSpace(target); target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteUInt64ToArray(11, this->_internal_input_sentence_size(), target); } // optional int32 mining_sentence_size = 12 [deprecated = true]; - if (cached_has_bits & 0x00000200u) { + if (cached_has_bits & 0x00000400u) { target = stream->EnsureSpace(target); target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteInt32ToArray(12, this->_internal_mining_sentence_size(), target); } // optional int32 training_sentence_size = 13 [deprecated = true]; - if (cached_has_bits & 0x00000800u) { + if (cached_has_bits & 0x00001000u) { target = stream->EnsureSpace(target); target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteInt32ToArray(13, this->_internal_training_sentence_size(), target); } // optional int32 seed_sentencepiece_size = 14 [default = 1000000]; - if (cached_has_bits & 0x02000000u) { + if (cached_has_bits & 0x04000000u) { target = stream->EnsureSpace(target); target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteInt32ToArray(14, this->_internal_seed_sentencepiece_size(), target); } // optional float shrinking_factor = 15 [default = 0.75]; - if (cached_has_bits & 0x04000000u) { + if (cached_has_bits & 0x08000000u) { target = stream->EnsureSpace(target); target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteFloatToArray(15, this->_internal_shrinking_factor(), target); } // optional int32 num_threads = 16 [default = 16]; - if (cached_has_bits & 0x08000000u) { + if (cached_has_bits & 0x10000000u) { target = stream->EnsureSpace(target); target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteInt32ToArray(16, this->_internal_num_threads(), target); } // optional int32 num_sub_iterations = 17 [default = 2]; - if (cached_has_bits & 0x10000000u) { + if (cached_has_bits & 0x20000000u) { target = stream->EnsureSpace(target); target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteInt32ToArray(17, this->_internal_num_sub_iterations(), target); } // optional int32 max_sentence_length = 18 [default = 4192]; - if (cached_has_bits & 0x20000000u) { + if (cached_has_bits & 0x40000000u) { target = stream->EnsureSpace(target); target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteInt32ToArray(18, this->_internal_max_sentence_length(), target); } + cached_has_bits = _has_bits_[1]; // optional bool shuffle_input_sentence = 19 [default = true]; - if (cached_has_bits & 0x80000000u) { + if (cached_has_bits & 0x00000001u) { target = stream->EnsureSpace(target); target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteBoolToArray(19, this->_internal_shuffle_input_sentence(), target); } + cached_has_bits = _has_bits_[0]; // optional int32 max_sentencepiece_length = 20 [default = 16]; - if (cached_has_bits & 0x40000000u) { + if (cached_has_bits & 0x80000000u) { target = stream->EnsureSpace(target); target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteInt32ToArray(20, this->_internal_max_sentencepiece_length(), target); } cached_has_bits = _has_bits_[1]; // optional bool split_by_unicode_script = 21 [default = true]; - if (cached_has_bits & 0x00000001u) { + if (cached_has_bits & 0x00000002u) { target = stream->EnsureSpace(target); target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteBoolToArray(21, this->_internal_split_by_unicode_script(), target); } // optional bool split_by_whitespace = 22 [default = true]; - if (cached_has_bits & 0x00000004u) { + if (cached_has_bits & 0x00000008u) { target = stream->EnsureSpace(target); target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteBoolToArray(22, this->_internal_split_by_whitespace(), target); } // optional bool split_by_number = 23 [default = true]; - if (cached_has_bits & 0x00000002u) { + if (cached_has_bits & 0x00000004u) { target = stream->EnsureSpace(target); target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteBoolToArray(23, this->_internal_split_by_number(), target); } cached_has_bits = _has_bits_[0]; // optional bool treat_whitespace_as_suffix = 24 [default = false]; - if (cached_has_bits & 0x00002000u) { + if (cached_has_bits & 0x00004000u) { target = stream->EnsureSpace(target); target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteBoolToArray(24, this->_internal_treat_whitespace_as_suffix(), target); } // optional bool split_digits = 25 [default = false]; - if (cached_has_bits & 0x00008000u) { + if (cached_has_bits & 0x00010000u) { target = stream->EnsureSpace(target); target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteBoolToArray(25, this->_internal_split_digits(), target); } // optional bool allow_whitespace_only_pieces = 26 [default = false]; - if (cached_has_bits & 0x00004000u) { + if (cached_has_bits & 0x00008000u) { target = stream->EnsureSpace(target); target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteBoolToArray(26, this->_internal_allow_whitespace_only_pieces(), target); } @@ -1192,26 +1215,26 @@ failure: cached_has_bits = _has_bits_[1]; // optional bool vocabulary_output_piece_score = 32 [default = true]; - if (cached_has_bits & 0x00000008u) { + if (cached_has_bits & 0x00000010u) { target = stream->EnsureSpace(target); target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteBoolToArray(32, this->_internal_vocabulary_output_piece_score(), target); } // optional bool hard_vocab_limit = 33 [default = true]; - if (cached_has_bits & 0x00000010u) { + if (cached_has_bits & 0x00000020u) { target = stream->EnsureSpace(target); target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteBoolToArray(33, this->_internal_hard_vocab_limit(), target); } cached_has_bits = _has_bits_[0]; // optional bool use_all_vocab = 34 [default = false]; - if (cached_has_bits & 0x00020000u) { + if (cached_has_bits & 0x00040000u) { target = stream->EnsureSpace(target); target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteBoolToArray(34, this->_internal_use_all_vocab(), target); } // optional bool byte_fallback = 35 [default = false]; - if (cached_has_bits & 0x00010000u) { + if (cached_has_bits & 0x00020000u) { target = stream->EnsureSpace(target); target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteBoolToArray(35, this->_internal_byte_fallback(), target); } @@ -1223,26 +1246,26 @@ failure: } // optional int32 unk_id = 40 [default = 0]; - if (cached_has_bits & 0x00080000u) { + if (cached_has_bits & 0x00100000u) { target = stream->EnsureSpace(target); target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteInt32ToArray(40, this->_internal_unk_id(), target); } cached_has_bits = _has_bits_[1]; // optional int32 bos_id = 41 [default = 1]; - if (cached_has_bits & 0x00000020u) { + if (cached_has_bits & 0x00000040u) { target = stream->EnsureSpace(target); target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteInt32ToArray(41, this->_internal_bos_id(), target); } // optional int32 eos_id = 42 [default = 2]; - if (cached_has_bits & 0x00000040u) { + if (cached_has_bits & 0x00000080u) { target = stream->EnsureSpace(target); target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteInt32ToArray(42, this->_internal_eos_id(), target); } // optional int32 pad_id = 43 [default = -1]; - if (cached_has_bits & 0x00000080u) { + if (cached_has_bits & 0x00000100u) { target = stream->EnsureSpace(target); target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteInt32ToArray(43, this->_internal_pad_id(), target); } @@ -1279,29 +1302,35 @@ failure: } // optional bool train_extremely_large_corpus = 49 [default = false]; - if (cached_has_bits & 0x00040000u) { + if (cached_has_bits & 0x00080000u) { target = stream->EnsureSpace(target); target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteBoolToArray(49, this->_internal_train_extremely_large_corpus(), target); } // optional bool enable_differential_privacy = 50 [default = false]; - if (cached_has_bits & 0x00001000u) { + if (cached_has_bits & 0x00002000u) { target = stream->EnsureSpace(target); target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteBoolToArray(50, this->_internal_enable_differential_privacy(), target); } // optional float differential_privacy_noise_level = 51 [default = 0]; - if (cached_has_bits & 0x00100000u) { + if (cached_has_bits & 0x00200000u) { target = stream->EnsureSpace(target); target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteFloatToArray(51, this->_internal_differential_privacy_noise_level(), target); } // optional uint64 differential_privacy_clipping_threshold = 52 [default = 0]; - if (cached_has_bits & 0x00200000u) { + if (cached_has_bits & 0x00400000u) { target = stream->EnsureSpace(target); target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteUInt64ToArray(52, this->_internal_differential_privacy_clipping_threshold(), target); } + // optional string pretokenization_delimiter = 53 [default = ""]; + if (cached_has_bits & 0x00000100u) { + target = stream->WriteStringMaybeAliased( + 53, this->_internal_pretokenization_delimiter(), target); + } + // Extension range [200, 536870912) target = _extensions_._InternalSerialize( 200, 536870912, target, stream); @@ -1416,205 +1445,212 @@ size_t TrainerSpec::ByteSizeLong() const { } if (cached_has_bits & 0x0000ff00u) { - // optional int32 self_test_sample_size = 6 [default = 0]; + // optional string pretokenization_delimiter = 53 [default = ""]; if (cached_has_bits & 0x00000100u) { + total_size += 2 + + ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::StringSize( + this->_internal_pretokenization_delimiter()); + } + + // optional int32 self_test_sample_size = 6 [default = 0]; + if (cached_has_bits & 0x00000200u) { total_size += 1 + ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::Int32Size( this->_internal_self_test_sample_size()); } // optional int32 mining_sentence_size = 12 [deprecated = true]; - if (cached_has_bits & 0x00000200u) { + if (cached_has_bits & 0x00000400u) { total_size += 1 + ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::Int32Size( this->_internal_mining_sentence_size()); } // optional uint64 input_sentence_size = 11 [default = 0]; - if (cached_has_bits & 0x00000400u) { + if (cached_has_bits & 0x00000800u) { total_size += 1 + ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::UInt64Size( this->_internal_input_sentence_size()); } // optional int32 training_sentence_size = 13 [deprecated = true]; - if (cached_has_bits & 0x00000800u) { + if (cached_has_bits & 0x00001000u) { total_size += 1 + ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::Int32Size( this->_internal_training_sentence_size()); } // optional bool enable_differential_privacy = 50 [default = false]; - if (cached_has_bits & 0x00001000u) { - total_size += 2 + 1; - } - - // optional bool treat_whitespace_as_suffix = 24 [default = false]; if (cached_has_bits & 0x00002000u) { total_size += 2 + 1; } - // optional bool allow_whitespace_only_pieces = 26 [default = false]; + // optional bool treat_whitespace_as_suffix = 24 [default = false]; if (cached_has_bits & 0x00004000u) { total_size += 2 + 1; } - // optional bool split_digits = 25 [default = false]; + // optional bool allow_whitespace_only_pieces = 26 [default = false]; if (cached_has_bits & 0x00008000u) { total_size += 2 + 1; } } if (cached_has_bits & 0x00ff0000u) { - // optional bool byte_fallback = 35 [default = false]; + // optional bool split_digits = 25 [default = false]; if (cached_has_bits & 0x00010000u) { total_size += 2 + 1; } - // optional bool use_all_vocab = 34 [default = false]; + // optional bool byte_fallback = 35 [default = false]; if (cached_has_bits & 0x00020000u) { total_size += 2 + 1; } - // optional bool train_extremely_large_corpus = 49 [default = false]; + // optional bool use_all_vocab = 34 [default = false]; if (cached_has_bits & 0x00040000u) { total_size += 2 + 1; } - // optional int32 unk_id = 40 [default = 0]; + // optional bool train_extremely_large_corpus = 49 [default = false]; if (cached_has_bits & 0x00080000u) { + total_size += 2 + 1; + } + + // optional int32 unk_id = 40 [default = 0]; + if (cached_has_bits & 0x00100000u) { total_size += 2 + ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::Int32Size( this->_internal_unk_id()); } // optional float differential_privacy_noise_level = 51 [default = 0]; - if (cached_has_bits & 0x00100000u) { + if (cached_has_bits & 0x00200000u) { total_size += 2 + 4; } // optional uint64 differential_privacy_clipping_threshold = 52 [default = 0]; - if (cached_has_bits & 0x00200000u) { + if (cached_has_bits & 0x00400000u) { total_size += 2 + ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::UInt64Size( this->_internal_differential_privacy_clipping_threshold()); } // optional .sentencepiece.TrainerSpec.ModelType model_type = 3 [default = UNIGRAM]; - if (cached_has_bits & 0x00400000u) { + if (cached_has_bits & 0x00800000u) { total_size += 1 + ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::EnumSize(this->_internal_model_type()); } + } + if (cached_has_bits & 0xff000000u) { // optional int32 vocab_size = 4 [default = 8000]; - if (cached_has_bits & 0x00800000u) { + if (cached_has_bits & 0x01000000u) { total_size += 1 + ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::Int32Size( this->_internal_vocab_size()); } - } - if (cached_has_bits & 0xff000000u) { // optional float character_coverage = 10 [default = 0.9995]; - if (cached_has_bits & 0x01000000u) { + if (cached_has_bits & 0x02000000u) { total_size += 1 + 4; } // optional int32 seed_sentencepiece_size = 14 [default = 1000000]; - if (cached_has_bits & 0x02000000u) { + if (cached_has_bits & 0x04000000u) { total_size += 1 + ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::Int32Size( this->_internal_seed_sentencepiece_size()); } // optional float shrinking_factor = 15 [default = 0.75]; - if (cached_has_bits & 0x04000000u) { + if (cached_has_bits & 0x08000000u) { total_size += 1 + 4; } // optional int32 num_threads = 16 [default = 16]; - if (cached_has_bits & 0x08000000u) { + if (cached_has_bits & 0x10000000u) { total_size += 2 + ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::Int32Size( this->_internal_num_threads()); } // optional int32 num_sub_iterations = 17 [default = 2]; - if (cached_has_bits & 0x10000000u) { + if (cached_has_bits & 0x20000000u) { total_size += 2 + ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::Int32Size( this->_internal_num_sub_iterations()); } // optional int32 max_sentence_length = 18 [default = 4192]; - if (cached_has_bits & 0x20000000u) { + if (cached_has_bits & 0x40000000u) { total_size += 2 + ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::Int32Size( this->_internal_max_sentence_length()); } // optional int32 max_sentencepiece_length = 20 [default = 16]; - if (cached_has_bits & 0x40000000u) { + if (cached_has_bits & 0x80000000u) { total_size += 2 + ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::Int32Size( this->_internal_max_sentencepiece_length()); } - // optional bool shuffle_input_sentence = 19 [default = true]; - if (cached_has_bits & 0x80000000u) { - total_size += 2 + 1; - } - } cached_has_bits = _has_bits_[1]; if (cached_has_bits & 0x000000ffu) { - // optional bool split_by_unicode_script = 21 [default = true]; + // optional bool shuffle_input_sentence = 19 [default = true]; if (cached_has_bits & 0x00000001u) { total_size += 2 + 1; } - // optional bool split_by_number = 23 [default = true]; + // optional bool split_by_unicode_script = 21 [default = true]; if (cached_has_bits & 0x00000002u) { total_size += 2 + 1; } - // optional bool split_by_whitespace = 22 [default = true]; + // optional bool split_by_number = 23 [default = true]; if (cached_has_bits & 0x00000004u) { total_size += 2 + 1; } - // optional bool vocabulary_output_piece_score = 32 [default = true]; + // optional bool split_by_whitespace = 22 [default = true]; if (cached_has_bits & 0x00000008u) { total_size += 2 + 1; } - // optional bool hard_vocab_limit = 33 [default = true]; + // optional bool vocabulary_output_piece_score = 32 [default = true]; if (cached_has_bits & 0x00000010u) { total_size += 2 + 1; } - // optional int32 bos_id = 41 [default = 1]; + // optional bool hard_vocab_limit = 33 [default = true]; if (cached_has_bits & 0x00000020u) { + total_size += 2 + 1; + } + + // optional int32 bos_id = 41 [default = 1]; + if (cached_has_bits & 0x00000040u) { total_size += 2 + ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::Int32Size( this->_internal_bos_id()); } // optional int32 eos_id = 42 [default = 2]; - if (cached_has_bits & 0x00000040u) { + if (cached_has_bits & 0x00000080u) { total_size += 2 + ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::Int32Size( this->_internal_eos_id()); } - // optional int32 pad_id = 43 [default = -1]; - if (cached_has_bits & 0x00000080u) { - total_size += 2 + - ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::Int32Size( - this->_internal_pad_id()); - } - } + // optional int32 pad_id = 43 [default = -1]; + if (cached_has_bits & 0x00000100u) { + total_size += 2 + + ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::Int32Size( + this->_internal_pad_id()); + } + if (PROTOBUF_PREDICT_FALSE(_internal_metadata_.have_unknown_fields())) { total_size += _internal_metadata_.unknown_fields(::PROTOBUF_NAMESPACE_ID::internal::GetEmptyString).size(); } @@ -1670,113 +1706,116 @@ void TrainerSpec::MergeFrom(const TrainerSpec& from) { } if (cached_has_bits & 0x0000ff00u) { if (cached_has_bits & 0x00000100u) { - self_test_sample_size_ = from.self_test_sample_size_; + _internal_set_pretokenization_delimiter(from._internal_pretokenization_delimiter()); } if (cached_has_bits & 0x00000200u) { - mining_sentence_size_ = from.mining_sentence_size_; + self_test_sample_size_ = from.self_test_sample_size_; } if (cached_has_bits & 0x00000400u) { - input_sentence_size_ = from.input_sentence_size_; + mining_sentence_size_ = from.mining_sentence_size_; } if (cached_has_bits & 0x00000800u) { - training_sentence_size_ = from.training_sentence_size_; + input_sentence_size_ = from.input_sentence_size_; } if (cached_has_bits & 0x00001000u) { - enable_differential_privacy_ = from.enable_differential_privacy_; + training_sentence_size_ = from.training_sentence_size_; } if (cached_has_bits & 0x00002000u) { - treat_whitespace_as_suffix_ = from.treat_whitespace_as_suffix_; + enable_differential_privacy_ = from.enable_differential_privacy_; } if (cached_has_bits & 0x00004000u) { - allow_whitespace_only_pieces_ = from.allow_whitespace_only_pieces_; + treat_whitespace_as_suffix_ = from.treat_whitespace_as_suffix_; } if (cached_has_bits & 0x00008000u) { - split_digits_ = from.split_digits_; + allow_whitespace_only_pieces_ = from.allow_whitespace_only_pieces_; } _has_bits_[0] |= cached_has_bits; } if (cached_has_bits & 0x00ff0000u) { if (cached_has_bits & 0x00010000u) { - byte_fallback_ = from.byte_fallback_; + split_digits_ = from.split_digits_; } if (cached_has_bits & 0x00020000u) { - use_all_vocab_ = from.use_all_vocab_; + byte_fallback_ = from.byte_fallback_; } if (cached_has_bits & 0x00040000u) { - train_extremely_large_corpus_ = from.train_extremely_large_corpus_; + use_all_vocab_ = from.use_all_vocab_; } if (cached_has_bits & 0x00080000u) { - unk_id_ = from.unk_id_; + train_extremely_large_corpus_ = from.train_extremely_large_corpus_; } if (cached_has_bits & 0x00100000u) { - differential_privacy_noise_level_ = from.differential_privacy_noise_level_; + unk_id_ = from.unk_id_; } if (cached_has_bits & 0x00200000u) { - differential_privacy_clipping_threshold_ = from.differential_privacy_clipping_threshold_; + differential_privacy_noise_level_ = from.differential_privacy_noise_level_; } if (cached_has_bits & 0x00400000u) { - model_type_ = from.model_type_; + differential_privacy_clipping_threshold_ = from.differential_privacy_clipping_threshold_; } if (cached_has_bits & 0x00800000u) { - vocab_size_ = from.vocab_size_; + model_type_ = from.model_type_; } _has_bits_[0] |= cached_has_bits; } if (cached_has_bits & 0xff000000u) { if (cached_has_bits & 0x01000000u) { - character_coverage_ = from.character_coverage_; + vocab_size_ = from.vocab_size_; } if (cached_has_bits & 0x02000000u) { - seed_sentencepiece_size_ = from.seed_sentencepiece_size_; + character_coverage_ = from.character_coverage_; } if (cached_has_bits & 0x04000000u) { - shrinking_factor_ = from.shrinking_factor_; + seed_sentencepiece_size_ = from.seed_sentencepiece_size_; } if (cached_has_bits & 0x08000000u) { - num_threads_ = from.num_threads_; + shrinking_factor_ = from.shrinking_factor_; } if (cached_has_bits & 0x10000000u) { - num_sub_iterations_ = from.num_sub_iterations_; + num_threads_ = from.num_threads_; } if (cached_has_bits & 0x20000000u) { - max_sentence_length_ = from.max_sentence_length_; + num_sub_iterations_ = from.num_sub_iterations_; } if (cached_has_bits & 0x40000000u) { - max_sentencepiece_length_ = from.max_sentencepiece_length_; + max_sentence_length_ = from.max_sentence_length_; } if (cached_has_bits & 0x80000000u) { - shuffle_input_sentence_ = from.shuffle_input_sentence_; + max_sentencepiece_length_ = from.max_sentencepiece_length_; } _has_bits_[0] |= cached_has_bits; } cached_has_bits = from._has_bits_[1]; if (cached_has_bits & 0x000000ffu) { if (cached_has_bits & 0x00000001u) { - split_by_unicode_script_ = from.split_by_unicode_script_; + shuffle_input_sentence_ = from.shuffle_input_sentence_; } if (cached_has_bits & 0x00000002u) { - split_by_number_ = from.split_by_number_; + split_by_unicode_script_ = from.split_by_unicode_script_; } if (cached_has_bits & 0x00000004u) { - split_by_whitespace_ = from.split_by_whitespace_; + split_by_number_ = from.split_by_number_; } if (cached_has_bits & 0x00000008u) { - vocabulary_output_piece_score_ = from.vocabulary_output_piece_score_; + split_by_whitespace_ = from.split_by_whitespace_; } if (cached_has_bits & 0x00000010u) { - hard_vocab_limit_ = from.hard_vocab_limit_; + vocabulary_output_piece_score_ = from.vocabulary_output_piece_score_; } if (cached_has_bits & 0x00000020u) { - bos_id_ = from.bos_id_; + hard_vocab_limit_ = from.hard_vocab_limit_; } if (cached_has_bits & 0x00000040u) { - eos_id_ = from.eos_id_; + bos_id_ = from.bos_id_; } if (cached_has_bits & 0x00000080u) { - pad_id_ = from.pad_id_; + eos_id_ = from.eos_id_; } _has_bits_[1] |= cached_has_bits; } + if (cached_has_bits & 0x00000100u) { + _internal_set_pad_id(from._internal_pad_id()); + } } void TrainerSpec::CopyFrom(const TrainerSpec& from) { @@ -1812,6 +1851,7 @@ void TrainerSpec::InternalSwap(TrainerSpec* other) { bos_piece_.Swap(&other->bos_piece_, nullptr, GetArena()); eos_piece_.Swap(&other->eos_piece_, nullptr, GetArena()); pad_piece_.Swap(&other->pad_piece_, nullptr, GetArena()); + pretokenization_delimiter_.Swap(&other->pretokenization_delimiter_, &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); ::PROTOBUF_NAMESPACE_ID::internal::memswap< PROTOBUF_FIELD_OFFSET(TrainerSpec, differential_privacy_clipping_threshold_) + sizeof(TrainerSpec::differential_privacy_clipping_threshold_) diff --git a/src/builtin_pb/sentencepiece_model.pb.h b/src/builtin_pb/sentencepiece_model.pb.h index 84450e6..e1f4a12 100644 --- a/src/builtin_pb/sentencepiece_model.pb.h +++ b/src/builtin_pb/sentencepiece_model.pb.h @@ -273,6 +273,7 @@ class TrainerSpec PROTOBUF_FINAL : kBosPieceFieldNumber = 46, kEosPieceFieldNumber = 47, kPadPieceFieldNumber = 48, + kPretokenizationDelimiterFieldNumber = 53, kSelfTestSampleSizeFieldNumber = 6, kMiningSentenceSizeFieldNumber = 12, kInputSentenceSizeFieldNumber = 11, @@ -562,6 +563,26 @@ class TrainerSpec PROTOBUF_FINAL : std::string* _internal_mutable_pad_piece(); public: + // optional string pretokenization_delimiter = 53 [default = ""]; + bool has_pretokenization_delimiter() const; + private: + bool _internal_has_pretokenization_delimiter() const; + public: + void clear_pretokenization_delimiter(); + const std::string& pretokenization_delimiter() const; + void set_pretokenization_delimiter(const std::string& value); + void set_pretokenization_delimiter(std::string&& value); + void set_pretokenization_delimiter(const char* value); + void set_pretokenization_delimiter(const char* value, size_t size); + std::string* mutable_pretokenization_delimiter(); + std::string* release_pretokenization_delimiter(); + void set_allocated_pretokenization_delimiter(std::string* pretokenization_delimiter); + private: + const std::string& _internal_pretokenization_delimiter() const; + void _internal_set_pretokenization_delimiter(const std::string& value); + std::string* _internal_mutable_pretokenization_delimiter(); + public: + // optional int32 self_test_sample_size = 6 [default = 0]; bool has_self_test_sample_size() const; private: @@ -1007,6 +1028,7 @@ class TrainerSpec PROTOBUF_FINAL : ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr eos_piece_; static const ::PROTOBUF_NAMESPACE_ID::internal::LazyString _i_give_permission_to_break_this_code_default_pad_piece_; ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr pad_piece_; + ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr pretokenization_delimiter_; ::PROTOBUF_NAMESPACE_ID::int32 self_test_sample_size_; ::PROTOBUF_NAMESPACE_ID::int32 mining_sentence_size_; ::PROTOBUF_NAMESPACE_ID::uint64 input_sentence_size_; @@ -2240,7 +2262,7 @@ inline void TrainerSpec::set_allocated_model_prefix(std::string* model_prefix) { // optional .sentencepiece.TrainerSpec.ModelType model_type = 3 [default = UNIGRAM]; inline bool TrainerSpec::_internal_has_model_type() const { - bool value = (_has_bits_[0] & 0x00400000u) != 0; + bool value = (_has_bits_[0] & 0x00800000u) != 0; return value; } inline bool TrainerSpec::has_model_type() const { @@ -2248,7 +2270,7 @@ inline bool TrainerSpec::has_model_type() const { } inline void TrainerSpec::clear_model_type() { model_type_ = 1; - _has_bits_[0] &= ~0x00400000u; + _has_bits_[0] &= ~0x00800000u; } inline ::sentencepiece::TrainerSpec_ModelType TrainerSpec::_internal_model_type() const { return static_cast< ::sentencepiece::TrainerSpec_ModelType >(model_type_); @@ -2259,7 +2281,7 @@ inline ::sentencepiece::TrainerSpec_ModelType TrainerSpec::model_type() const { } inline void TrainerSpec::_internal_set_model_type(::sentencepiece::TrainerSpec_ModelType value) { assert(::sentencepiece::TrainerSpec_ModelType_IsValid(value)); - _has_bits_[0] |= 0x00400000u; + _has_bits_[0] |= 0x00800000u; model_type_ = value; } inline void TrainerSpec::set_model_type(::sentencepiece::TrainerSpec_ModelType value) { @@ -2269,7 +2291,7 @@ inline void TrainerSpec::set_model_type(::sentencepiece::TrainerSpec_ModelType v // optional int32 vocab_size = 4 [default = 8000]; inline bool TrainerSpec::_internal_has_vocab_size() const { - bool value = (_has_bits_[0] & 0x00800000u) != 0; + bool value = (_has_bits_[0] & 0x01000000u) != 0; return value; } inline bool TrainerSpec::has_vocab_size() const { @@ -2277,7 +2299,7 @@ inline bool TrainerSpec::has_vocab_size() const { } inline void TrainerSpec::clear_vocab_size() { vocab_size_ = 8000; - _has_bits_[0] &= ~0x00800000u; + _has_bits_[0] &= ~0x01000000u; } inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::_internal_vocab_size() const { return vocab_size_; @@ -2287,7 +2309,7 @@ inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::vocab_size() const { return _internal_vocab_size(); } inline void TrainerSpec::_internal_set_vocab_size(::PROTOBUF_NAMESPACE_ID::int32 value) { - _has_bits_[0] |= 0x00800000u; + _has_bits_[0] |= 0x01000000u; vocab_size_ = value; } inline void TrainerSpec::set_vocab_size(::PROTOBUF_NAMESPACE_ID::int32 value) { @@ -2371,7 +2393,7 @@ TrainerSpec::mutable_accept_language() { // optional int32 self_test_sample_size = 6 [default = 0]; inline bool TrainerSpec::_internal_has_self_test_sample_size() const { - bool value = (_has_bits_[0] & 0x00000100u) != 0; + bool value = (_has_bits_[0] & 0x00000200u) != 0; return value; } inline bool TrainerSpec::has_self_test_sample_size() const { @@ -2379,7 +2401,7 @@ inline bool TrainerSpec::has_self_test_sample_size() const { } inline void TrainerSpec::clear_self_test_sample_size() { self_test_sample_size_ = 0; - _has_bits_[0] &= ~0x00000100u; + _has_bits_[0] &= ~0x00000200u; } inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::_internal_self_test_sample_size() const { return self_test_sample_size_; @@ -2389,7 +2411,7 @@ inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::self_test_sample_size() const return _internal_self_test_sample_size(); } inline void TrainerSpec::_internal_set_self_test_sample_size(::PROTOBUF_NAMESPACE_ID::int32 value) { - _has_bits_[0] |= 0x00000100u; + _has_bits_[0] |= 0x00000200u; self_test_sample_size_ = value; } inline void TrainerSpec::set_self_test_sample_size(::PROTOBUF_NAMESPACE_ID::int32 value) { @@ -2399,7 +2421,7 @@ inline void TrainerSpec::set_self_test_sample_size(::PROTOBUF_NAMESPACE_ID::int3 // optional bool enable_differential_privacy = 50 [default = false]; inline bool TrainerSpec::_internal_has_enable_differential_privacy() const { - bool value = (_has_bits_[0] & 0x00001000u) != 0; + bool value = (_has_bits_[0] & 0x00002000u) != 0; return value; } inline bool TrainerSpec::has_enable_differential_privacy() const { @@ -2407,7 +2429,7 @@ inline bool TrainerSpec::has_enable_differential_privacy() const { } inline void TrainerSpec::clear_enable_differential_privacy() { enable_differential_privacy_ = false; - _has_bits_[0] &= ~0x00001000u; + _has_bits_[0] &= ~0x00002000u; } inline bool TrainerSpec::_internal_enable_differential_privacy() const { return enable_differential_privacy_; @@ -2417,7 +2439,7 @@ inline bool TrainerSpec::enable_differential_privacy() const { return _internal_enable_differential_privacy(); } inline void TrainerSpec::_internal_set_enable_differential_privacy(bool value) { - _has_bits_[0] |= 0x00001000u; + _has_bits_[0] |= 0x00002000u; enable_differential_privacy_ = value; } inline void TrainerSpec::set_enable_differential_privacy(bool value) { @@ -2427,7 +2449,7 @@ inline void TrainerSpec::set_enable_differential_privacy(bool value) { // optional float differential_privacy_noise_level = 51 [default = 0]; inline bool TrainerSpec::_internal_has_differential_privacy_noise_level() const { - bool value = (_has_bits_[0] & 0x00100000u) != 0; + bool value = (_has_bits_[0] & 0x00200000u) != 0; return value; } inline bool TrainerSpec::has_differential_privacy_noise_level() const { @@ -2435,7 +2457,7 @@ inline bool TrainerSpec::has_differential_privacy_noise_level() const { } inline void TrainerSpec::clear_differential_privacy_noise_level() { differential_privacy_noise_level_ = 0; - _has_bits_[0] &= ~0x00100000u; + _has_bits_[0] &= ~0x00200000u; } inline float TrainerSpec::_internal_differential_privacy_noise_level() const { return differential_privacy_noise_level_; @@ -2445,7 +2467,7 @@ inline float TrainerSpec::differential_privacy_noise_level() const { return _internal_differential_privacy_noise_level(); } inline void TrainerSpec::_internal_set_differential_privacy_noise_level(float value) { - _has_bits_[0] |= 0x00100000u; + _has_bits_[0] |= 0x00200000u; differential_privacy_noise_level_ = value; } inline void TrainerSpec::set_differential_privacy_noise_level(float value) { @@ -2455,7 +2477,7 @@ inline void TrainerSpec::set_differential_privacy_noise_level(float value) { // optional uint64 differential_privacy_clipping_threshold = 52 [default = 0]; inline bool TrainerSpec::_internal_has_differential_privacy_clipping_threshold() const { - bool value = (_has_bits_[0] & 0x00200000u) != 0; + bool value = (_has_bits_[0] & 0x00400000u) != 0; return value; } inline bool TrainerSpec::has_differential_privacy_clipping_threshold() const { @@ -2463,7 +2485,7 @@ inline bool TrainerSpec::has_differential_privacy_clipping_threshold() const { } inline void TrainerSpec::clear_differential_privacy_clipping_threshold() { differential_privacy_clipping_threshold_ = PROTOBUF_ULONGLONG(0); - _has_bits_[0] &= ~0x00200000u; + _has_bits_[0] &= ~0x00400000u; } inline ::PROTOBUF_NAMESPACE_ID::uint64 TrainerSpec::_internal_differential_privacy_clipping_threshold() const { return differential_privacy_clipping_threshold_; @@ -2473,7 +2495,7 @@ inline ::PROTOBUF_NAMESPACE_ID::uint64 TrainerSpec::differential_privacy_clippin return _internal_differential_privacy_clipping_threshold(); } inline void TrainerSpec::_internal_set_differential_privacy_clipping_threshold(::PROTOBUF_NAMESPACE_ID::uint64 value) { - _has_bits_[0] |= 0x00200000u; + _has_bits_[0] |= 0x00400000u; differential_privacy_clipping_threshold_ = value; } inline void TrainerSpec::set_differential_privacy_clipping_threshold(::PROTOBUF_NAMESPACE_ID::uint64 value) { @@ -2483,7 +2505,7 @@ inline void TrainerSpec::set_differential_privacy_clipping_threshold(::PROTOBUF_ // optional float character_coverage = 10 [default = 0.9995]; inline bool TrainerSpec::_internal_has_character_coverage() const { - bool value = (_has_bits_[0] & 0x01000000u) != 0; + bool value = (_has_bits_[0] & 0x02000000u) != 0; return value; } inline bool TrainerSpec::has_character_coverage() const { @@ -2491,7 +2513,7 @@ inline bool TrainerSpec::has_character_coverage() const { } inline void TrainerSpec::clear_character_coverage() { character_coverage_ = 0.9995f; - _has_bits_[0] &= ~0x01000000u; + _has_bits_[0] &= ~0x02000000u; } inline float TrainerSpec::_internal_character_coverage() const { return character_coverage_; @@ -2501,7 +2523,7 @@ inline float TrainerSpec::character_coverage() const { return _internal_character_coverage(); } inline void TrainerSpec::_internal_set_character_coverage(float value) { - _has_bits_[0] |= 0x01000000u; + _has_bits_[0] |= 0x02000000u; character_coverage_ = value; } inline void TrainerSpec::set_character_coverage(float value) { @@ -2511,7 +2533,7 @@ inline void TrainerSpec::set_character_coverage(float value) { // optional uint64 input_sentence_size = 11 [default = 0]; inline bool TrainerSpec::_internal_has_input_sentence_size() const { - bool value = (_has_bits_[0] & 0x00000400u) != 0; + bool value = (_has_bits_[0] & 0x00000800u) != 0; return value; } inline bool TrainerSpec::has_input_sentence_size() const { @@ -2519,7 +2541,7 @@ inline bool TrainerSpec::has_input_sentence_size() const { } inline void TrainerSpec::clear_input_sentence_size() { input_sentence_size_ = PROTOBUF_ULONGLONG(0); - _has_bits_[0] &= ~0x00000400u; + _has_bits_[0] &= ~0x00000800u; } inline ::PROTOBUF_NAMESPACE_ID::uint64 TrainerSpec::_internal_input_sentence_size() const { return input_sentence_size_; @@ -2529,7 +2551,7 @@ inline ::PROTOBUF_NAMESPACE_ID::uint64 TrainerSpec::input_sentence_size() const return _internal_input_sentence_size(); } inline void TrainerSpec::_internal_set_input_sentence_size(::PROTOBUF_NAMESPACE_ID::uint64 value) { - _has_bits_[0] |= 0x00000400u; + _has_bits_[0] |= 0x00000800u; input_sentence_size_ = value; } inline void TrainerSpec::set_input_sentence_size(::PROTOBUF_NAMESPACE_ID::uint64 value) { @@ -2539,7 +2561,7 @@ inline void TrainerSpec::set_input_sentence_size(::PROTOBUF_NAMESPACE_ID::uint64 // optional bool shuffle_input_sentence = 19 [default = true]; inline bool TrainerSpec::_internal_has_shuffle_input_sentence() const { - bool value = (_has_bits_[0] & 0x80000000u) != 0; + bool value = (_has_bits_[1] & 0x00000001u) != 0; return value; } inline bool TrainerSpec::has_shuffle_input_sentence() const { @@ -2547,7 +2569,7 @@ inline bool TrainerSpec::has_shuffle_input_sentence() const { } inline void TrainerSpec::clear_shuffle_input_sentence() { shuffle_input_sentence_ = true; - _has_bits_[0] &= ~0x80000000u; + _has_bits_[1] &= ~0x00000001u; } inline bool TrainerSpec::_internal_shuffle_input_sentence() const { return shuffle_input_sentence_; @@ -2557,7 +2579,7 @@ inline bool TrainerSpec::shuffle_input_sentence() const { return _internal_shuffle_input_sentence(); } inline void TrainerSpec::_internal_set_shuffle_input_sentence(bool value) { - _has_bits_[0] |= 0x80000000u; + _has_bits_[1] |= 0x00000001u; shuffle_input_sentence_ = value; } inline void TrainerSpec::set_shuffle_input_sentence(bool value) { @@ -2567,7 +2589,7 @@ inline void TrainerSpec::set_shuffle_input_sentence(bool value) { // optional int32 mining_sentence_size = 12 [deprecated = true]; inline bool TrainerSpec::_internal_has_mining_sentence_size() const { - bool value = (_has_bits_[0] & 0x00000200u) != 0; + bool value = (_has_bits_[0] & 0x00000400u) != 0; return value; } inline bool TrainerSpec::has_mining_sentence_size() const { @@ -2575,7 +2597,7 @@ inline bool TrainerSpec::has_mining_sentence_size() const { } inline void TrainerSpec::clear_mining_sentence_size() { mining_sentence_size_ = 0; - _has_bits_[0] &= ~0x00000200u; + _has_bits_[0] &= ~0x00000400u; } inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::_internal_mining_sentence_size() const { return mining_sentence_size_; @@ -2585,7 +2607,7 @@ inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::mining_sentence_size() const return _internal_mining_sentence_size(); } inline void TrainerSpec::_internal_set_mining_sentence_size(::PROTOBUF_NAMESPACE_ID::int32 value) { - _has_bits_[0] |= 0x00000200u; + _has_bits_[0] |= 0x00000400u; mining_sentence_size_ = value; } inline void TrainerSpec::set_mining_sentence_size(::PROTOBUF_NAMESPACE_ID::int32 value) { @@ -2595,7 +2617,7 @@ inline void TrainerSpec::set_mining_sentence_size(::PROTOBUF_NAMESPACE_ID::int32 // optional int32 training_sentence_size = 13 [deprecated = true]; inline bool TrainerSpec::_internal_has_training_sentence_size() const { - bool value = (_has_bits_[0] & 0x00000800u) != 0; + bool value = (_has_bits_[0] & 0x00001000u) != 0; return value; } inline bool TrainerSpec::has_training_sentence_size() const { @@ -2603,7 +2625,7 @@ inline bool TrainerSpec::has_training_sentence_size() const { } inline void TrainerSpec::clear_training_sentence_size() { training_sentence_size_ = 0; - _has_bits_[0] &= ~0x00000800u; + _has_bits_[0] &= ~0x00001000u; } inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::_internal_training_sentence_size() const { return training_sentence_size_; @@ -2613,7 +2635,7 @@ inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::training_sentence_size() cons return _internal_training_sentence_size(); } inline void TrainerSpec::_internal_set_training_sentence_size(::PROTOBUF_NAMESPACE_ID::int32 value) { - _has_bits_[0] |= 0x00000800u; + _has_bits_[0] |= 0x00001000u; training_sentence_size_ = value; } inline void TrainerSpec::set_training_sentence_size(::PROTOBUF_NAMESPACE_ID::int32 value) { @@ -2623,7 +2645,7 @@ inline void TrainerSpec::set_training_sentence_size(::PROTOBUF_NAMESPACE_ID::int // optional int32 seed_sentencepiece_size = 14 [default = 1000000]; inline bool TrainerSpec::_internal_has_seed_sentencepiece_size() const { - bool value = (_has_bits_[0] & 0x02000000u) != 0; + bool value = (_has_bits_[0] & 0x04000000u) != 0; return value; } inline bool TrainerSpec::has_seed_sentencepiece_size() const { @@ -2631,7 +2653,7 @@ inline bool TrainerSpec::has_seed_sentencepiece_size() const { } inline void TrainerSpec::clear_seed_sentencepiece_size() { seed_sentencepiece_size_ = 1000000; - _has_bits_[0] &= ~0x02000000u; + _has_bits_[0] &= ~0x04000000u; } inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::_internal_seed_sentencepiece_size() const { return seed_sentencepiece_size_; @@ -2641,7 +2663,7 @@ inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::seed_sentencepiece_size() con return _internal_seed_sentencepiece_size(); } inline void TrainerSpec::_internal_set_seed_sentencepiece_size(::PROTOBUF_NAMESPACE_ID::int32 value) { - _has_bits_[0] |= 0x02000000u; + _has_bits_[0] |= 0x04000000u; seed_sentencepiece_size_ = value; } inline void TrainerSpec::set_seed_sentencepiece_size(::PROTOBUF_NAMESPACE_ID::int32 value) { @@ -2651,7 +2673,7 @@ inline void TrainerSpec::set_seed_sentencepiece_size(::PROTOBUF_NAMESPACE_ID::in // optional float shrinking_factor = 15 [default = 0.75]; inline bool TrainerSpec::_internal_has_shrinking_factor() const { - bool value = (_has_bits_[0] & 0x04000000u) != 0; + bool value = (_has_bits_[0] & 0x08000000u) != 0; return value; } inline bool TrainerSpec::has_shrinking_factor() const { @@ -2659,7 +2681,7 @@ inline bool TrainerSpec::has_shrinking_factor() const { } inline void TrainerSpec::clear_shrinking_factor() { shrinking_factor_ = 0.75f; - _has_bits_[0] &= ~0x04000000u; + _has_bits_[0] &= ~0x08000000u; } inline float TrainerSpec::_internal_shrinking_factor() const { return shrinking_factor_; @@ -2669,7 +2691,7 @@ inline float TrainerSpec::shrinking_factor() const { return _internal_shrinking_factor(); } inline void TrainerSpec::_internal_set_shrinking_factor(float value) { - _has_bits_[0] |= 0x04000000u; + _has_bits_[0] |= 0x08000000u; shrinking_factor_ = value; } inline void TrainerSpec::set_shrinking_factor(float value) { @@ -2679,7 +2701,7 @@ inline void TrainerSpec::set_shrinking_factor(float value) { // optional int32 max_sentence_length = 18 [default = 4192]; inline bool TrainerSpec::_internal_has_max_sentence_length() const { - bool value = (_has_bits_[0] & 0x20000000u) != 0; + bool value = (_has_bits_[0] & 0x40000000u) != 0; return value; } inline bool TrainerSpec::has_max_sentence_length() const { @@ -2687,7 +2709,7 @@ inline bool TrainerSpec::has_max_sentence_length() const { } inline void TrainerSpec::clear_max_sentence_length() { max_sentence_length_ = 4192; - _has_bits_[0] &= ~0x20000000u; + _has_bits_[0] &= ~0x40000000u; } inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::_internal_max_sentence_length() const { return max_sentence_length_; @@ -2697,7 +2719,7 @@ inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::max_sentence_length() const { return _internal_max_sentence_length(); } inline void TrainerSpec::_internal_set_max_sentence_length(::PROTOBUF_NAMESPACE_ID::int32 value) { - _has_bits_[0] |= 0x20000000u; + _has_bits_[0] |= 0x40000000u; max_sentence_length_ = value; } inline void TrainerSpec::set_max_sentence_length(::PROTOBUF_NAMESPACE_ID::int32 value) { @@ -2707,7 +2729,7 @@ inline void TrainerSpec::set_max_sentence_length(::PROTOBUF_NAMESPACE_ID::int32 // optional int32 num_threads = 16 [default = 16]; inline bool TrainerSpec::_internal_has_num_threads() const { - bool value = (_has_bits_[0] & 0x08000000u) != 0; + bool value = (_has_bits_[0] & 0x10000000u) != 0; return value; } inline bool TrainerSpec::has_num_threads() const { @@ -2715,7 +2737,7 @@ inline bool TrainerSpec::has_num_threads() const { } inline void TrainerSpec::clear_num_threads() { num_threads_ = 16; - _has_bits_[0] &= ~0x08000000u; + _has_bits_[0] &= ~0x10000000u; } inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::_internal_num_threads() const { return num_threads_; @@ -2725,7 +2747,7 @@ inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::num_threads() const { return _internal_num_threads(); } inline void TrainerSpec::_internal_set_num_threads(::PROTOBUF_NAMESPACE_ID::int32 value) { - _has_bits_[0] |= 0x08000000u; + _has_bits_[0] |= 0x10000000u; num_threads_ = value; } inline void TrainerSpec::set_num_threads(::PROTOBUF_NAMESPACE_ID::int32 value) { @@ -2735,7 +2757,7 @@ inline void TrainerSpec::set_num_threads(::PROTOBUF_NAMESPACE_ID::int32 value) { // optional int32 num_sub_iterations = 17 [default = 2]; inline bool TrainerSpec::_internal_has_num_sub_iterations() const { - bool value = (_has_bits_[0] & 0x10000000u) != 0; + bool value = (_has_bits_[0] & 0x20000000u) != 0; return value; } inline bool TrainerSpec::has_num_sub_iterations() const { @@ -2743,7 +2765,7 @@ inline bool TrainerSpec::has_num_sub_iterations() const { } inline void TrainerSpec::clear_num_sub_iterations() { num_sub_iterations_ = 2; - _has_bits_[0] &= ~0x10000000u; + _has_bits_[0] &= ~0x20000000u; } inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::_internal_num_sub_iterations() const { return num_sub_iterations_; @@ -2753,7 +2775,7 @@ inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::num_sub_iterations() const { return _internal_num_sub_iterations(); } inline void TrainerSpec::_internal_set_num_sub_iterations(::PROTOBUF_NAMESPACE_ID::int32 value) { - _has_bits_[0] |= 0x10000000u; + _has_bits_[0] |= 0x20000000u; num_sub_iterations_ = value; } inline void TrainerSpec::set_num_sub_iterations(::PROTOBUF_NAMESPACE_ID::int32 value) { @@ -2763,7 +2785,7 @@ inline void TrainerSpec::set_num_sub_iterations(::PROTOBUF_NAMESPACE_ID::int32 v // optional int32 max_sentencepiece_length = 20 [default = 16]; inline bool TrainerSpec::_internal_has_max_sentencepiece_length() const { - bool value = (_has_bits_[0] & 0x40000000u) != 0; + bool value = (_has_bits_[0] & 0x80000000u) != 0; return value; } inline bool TrainerSpec::has_max_sentencepiece_length() const { @@ -2771,7 +2793,7 @@ inline bool TrainerSpec::has_max_sentencepiece_length() const { } inline void TrainerSpec::clear_max_sentencepiece_length() { max_sentencepiece_length_ = 16; - _has_bits_[0] &= ~0x40000000u; + _has_bits_[0] &= ~0x80000000u; } inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::_internal_max_sentencepiece_length() const { return max_sentencepiece_length_; @@ -2781,7 +2803,7 @@ inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::max_sentencepiece_length() co return _internal_max_sentencepiece_length(); } inline void TrainerSpec::_internal_set_max_sentencepiece_length(::PROTOBUF_NAMESPACE_ID::int32 value) { - _has_bits_[0] |= 0x40000000u; + _has_bits_[0] |= 0x80000000u; max_sentencepiece_length_ = value; } inline void TrainerSpec::set_max_sentencepiece_length(::PROTOBUF_NAMESPACE_ID::int32 value) { @@ -2791,7 +2813,7 @@ inline void TrainerSpec::set_max_sentencepiece_length(::PROTOBUF_NAMESPACE_ID::i // optional bool split_by_unicode_script = 21 [default = true]; inline bool TrainerSpec::_internal_has_split_by_unicode_script() const { - bool value = (_has_bits_[1] & 0x00000001u) != 0; + bool value = (_has_bits_[1] & 0x00000002u) != 0; return value; } inline bool TrainerSpec::has_split_by_unicode_script() const { @@ -2799,7 +2821,7 @@ inline bool TrainerSpec::has_split_by_unicode_script() const { } inline void TrainerSpec::clear_split_by_unicode_script() { split_by_unicode_script_ = true; - _has_bits_[1] &= ~0x00000001u; + _has_bits_[1] &= ~0x00000002u; } inline bool TrainerSpec::_internal_split_by_unicode_script() const { return split_by_unicode_script_; @@ -2809,7 +2831,7 @@ inline bool TrainerSpec::split_by_unicode_script() const { return _internal_split_by_unicode_script(); } inline void TrainerSpec::_internal_set_split_by_unicode_script(bool value) { - _has_bits_[1] |= 0x00000001u; + _has_bits_[1] |= 0x00000002u; split_by_unicode_script_ = value; } inline void TrainerSpec::set_split_by_unicode_script(bool value) { @@ -2819,7 +2841,7 @@ inline void TrainerSpec::set_split_by_unicode_script(bool value) { // optional bool split_by_number = 23 [default = true]; inline bool TrainerSpec::_internal_has_split_by_number() const { - bool value = (_has_bits_[1] & 0x00000002u) != 0; + bool value = (_has_bits_[1] & 0x00000004u) != 0; return value; } inline bool TrainerSpec::has_split_by_number() const { @@ -2827,7 +2849,7 @@ inline bool TrainerSpec::has_split_by_number() const { } inline void TrainerSpec::clear_split_by_number() { split_by_number_ = true; - _has_bits_[1] &= ~0x00000002u; + _has_bits_[1] &= ~0x00000004u; } inline bool TrainerSpec::_internal_split_by_number() const { return split_by_number_; @@ -2837,7 +2859,7 @@ inline bool TrainerSpec::split_by_number() const { return _internal_split_by_number(); } inline void TrainerSpec::_internal_set_split_by_number(bool value) { - _has_bits_[1] |= 0x00000002u; + _has_bits_[1] |= 0x00000004u; split_by_number_ = value; } inline void TrainerSpec::set_split_by_number(bool value) { @@ -2847,7 +2869,7 @@ inline void TrainerSpec::set_split_by_number(bool value) { // optional bool split_by_whitespace = 22 [default = true]; inline bool TrainerSpec::_internal_has_split_by_whitespace() const { - bool value = (_has_bits_[1] & 0x00000004u) != 0; + bool value = (_has_bits_[1] & 0x00000008u) != 0; return value; } inline bool TrainerSpec::has_split_by_whitespace() const { @@ -2855,7 +2877,7 @@ inline bool TrainerSpec::has_split_by_whitespace() const { } inline void TrainerSpec::clear_split_by_whitespace() { split_by_whitespace_ = true; - _has_bits_[1] &= ~0x00000004u; + _has_bits_[1] &= ~0x00000008u; } inline bool TrainerSpec::_internal_split_by_whitespace() const { return split_by_whitespace_; @@ -2865,7 +2887,7 @@ inline bool TrainerSpec::split_by_whitespace() const { return _internal_split_by_whitespace(); } inline void TrainerSpec::_internal_set_split_by_whitespace(bool value) { - _has_bits_[1] |= 0x00000004u; + _has_bits_[1] |= 0x00000008u; split_by_whitespace_ = value; } inline void TrainerSpec::set_split_by_whitespace(bool value) { @@ -2875,7 +2897,7 @@ inline void TrainerSpec::set_split_by_whitespace(bool value) { // optional bool treat_whitespace_as_suffix = 24 [default = false]; inline bool TrainerSpec::_internal_has_treat_whitespace_as_suffix() const { - bool value = (_has_bits_[0] & 0x00002000u) != 0; + bool value = (_has_bits_[0] & 0x00004000u) != 0; return value; } inline bool TrainerSpec::has_treat_whitespace_as_suffix() const { @@ -2883,7 +2905,7 @@ inline bool TrainerSpec::has_treat_whitespace_as_suffix() const { } inline void TrainerSpec::clear_treat_whitespace_as_suffix() { treat_whitespace_as_suffix_ = false; - _has_bits_[0] &= ~0x00002000u; + _has_bits_[0] &= ~0x00004000u; } inline bool TrainerSpec::_internal_treat_whitespace_as_suffix() const { return treat_whitespace_as_suffix_; @@ -2893,7 +2915,7 @@ inline bool TrainerSpec::treat_whitespace_as_suffix() const { return _internal_treat_whitespace_as_suffix(); } inline void TrainerSpec::_internal_set_treat_whitespace_as_suffix(bool value) { - _has_bits_[0] |= 0x00002000u; + _has_bits_[0] |= 0x00004000u; treat_whitespace_as_suffix_ = value; } inline void TrainerSpec::set_treat_whitespace_as_suffix(bool value) { @@ -2903,7 +2925,7 @@ inline void TrainerSpec::set_treat_whitespace_as_suffix(bool value) { // optional bool allow_whitespace_only_pieces = 26 [default = false]; inline bool TrainerSpec::_internal_has_allow_whitespace_only_pieces() const { - bool value = (_has_bits_[0] & 0x00004000u) != 0; + bool value = (_has_bits_[0] & 0x00008000u) != 0; return value; } inline bool TrainerSpec::has_allow_whitespace_only_pieces() const { @@ -2911,7 +2933,7 @@ inline bool TrainerSpec::has_allow_whitespace_only_pieces() const { } inline void TrainerSpec::clear_allow_whitespace_only_pieces() { allow_whitespace_only_pieces_ = false; - _has_bits_[0] &= ~0x00004000u; + _has_bits_[0] &= ~0x00008000u; } inline bool TrainerSpec::_internal_allow_whitespace_only_pieces() const { return allow_whitespace_only_pieces_; @@ -2921,7 +2943,7 @@ inline bool TrainerSpec::allow_whitespace_only_pieces() const { return _internal_allow_whitespace_only_pieces(); } inline void TrainerSpec::_internal_set_allow_whitespace_only_pieces(bool value) { - _has_bits_[0] |= 0x00004000u; + _has_bits_[0] |= 0x00008000u; allow_whitespace_only_pieces_ = value; } inline void TrainerSpec::set_allow_whitespace_only_pieces(bool value) { @@ -2931,7 +2953,7 @@ inline void TrainerSpec::set_allow_whitespace_only_pieces(bool value) { // optional bool split_digits = 25 [default = false]; inline bool TrainerSpec::_internal_has_split_digits() const { - bool value = (_has_bits_[0] & 0x00008000u) != 0; + bool value = (_has_bits_[0] & 0x00010000u) != 0; return value; } inline bool TrainerSpec::has_split_digits() const { @@ -2939,7 +2961,7 @@ inline bool TrainerSpec::has_split_digits() const { } inline void TrainerSpec::clear_split_digits() { split_digits_ = false; - _has_bits_[0] &= ~0x00008000u; + _has_bits_[0] &= ~0x00010000u; } inline bool TrainerSpec::_internal_split_digits() const { return split_digits_; @@ -2949,7 +2971,7 @@ inline bool TrainerSpec::split_digits() const { return _internal_split_digits(); } inline void TrainerSpec::_internal_set_split_digits(bool value) { - _has_bits_[0] |= 0x00008000u; + _has_bits_[0] |= 0x00010000u; split_digits_ = value; } inline void TrainerSpec::set_split_digits(bool value) { @@ -2957,6 +2979,79 @@ inline void TrainerSpec::set_split_digits(bool value) { // @@protoc_insertion_point(field_set:sentencepiece.TrainerSpec.split_digits) } +// optional string pretokenization_delimiter = 53 [default = ""]; +inline bool TrainerSpec::_internal_has_pretokenization_delimiter() const { + bool value = (_has_bits_[0] & 0x00000100u) != 0; + return value; +} +inline bool TrainerSpec::has_pretokenization_delimiter() const { + return _internal_has_pretokenization_delimiter(); +} +inline void TrainerSpec::clear_pretokenization_delimiter() { + pretokenization_delimiter_.ClearToEmpty(); + _has_bits_[0] &= ~0x00000100u; +} +inline const std::string& TrainerSpec::pretokenization_delimiter() const { + // @@protoc_insertion_point(field_get:sentencepiece.TrainerSpec.pretokenization_delimiter) + return _internal_pretokenization_delimiter(); +} +inline void TrainerSpec::set_pretokenization_delimiter(const std::string& value) { + _internal_set_pretokenization_delimiter(value); + // @@protoc_insertion_point(field_set:sentencepiece.TrainerSpec.pretokenization_delimiter) +} +inline std::string* TrainerSpec::mutable_pretokenization_delimiter() { + // @@protoc_insertion_point(field_mutable:sentencepiece.TrainerSpec.pretokenization_delimiter) + return _internal_mutable_pretokenization_delimiter(); +} +inline const std::string& TrainerSpec::_internal_pretokenization_delimiter() const { + return pretokenization_delimiter_.Get(); +} +inline void TrainerSpec::_internal_set_pretokenization_delimiter(const std::string& value) { + _has_bits_[0] |= 0x00000100u; + pretokenization_delimiter_.Set(::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr::EmptyDefault{}, value, GetArena()); +} +inline void TrainerSpec::set_pretokenization_delimiter(std::string&& value) { + _has_bits_[0] |= 0x00000100u; + pretokenization_delimiter_.Set( + ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr::EmptyDefault{}, ::std::move(value), GetArena()); + // @@protoc_insertion_point(field_set_rvalue:sentencepiece.TrainerSpec.pretokenization_delimiter) +} +inline void TrainerSpec::set_pretokenization_delimiter(const char* value) { + GOOGLE_DCHECK(value != nullptr); + _has_bits_[0] |= 0x00000100u; + pretokenization_delimiter_.Set(::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr::EmptyDefault{}, ::std::string(value), GetArena()); + // @@protoc_insertion_point(field_set_char:sentencepiece.TrainerSpec.pretokenization_delimiter) +} +inline void TrainerSpec::set_pretokenization_delimiter(const char* value, + size_t size) { + _has_bits_[0] |= 0x00000100u; + pretokenization_delimiter_.Set(::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr::EmptyDefault{}, ::std::string( + reinterpret_cast(value), size), GetArena()); + // @@protoc_insertion_point(field_set_pointer:sentencepiece.TrainerSpec.pretokenization_delimiter) +} +inline std::string* TrainerSpec::_internal_mutable_pretokenization_delimiter() { + _has_bits_[0] |= 0x00000100u; + return pretokenization_delimiter_.Mutable(::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr::EmptyDefault{}, GetArena()); +} +inline std::string* TrainerSpec::release_pretokenization_delimiter() { + // @@protoc_insertion_point(field_release:sentencepiece.TrainerSpec.pretokenization_delimiter) + if (!_internal_has_pretokenization_delimiter()) { + return nullptr; + } + _has_bits_[0] &= ~0x00000100u; + return pretokenization_delimiter_.ReleaseNonDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline void TrainerSpec::set_allocated_pretokenization_delimiter(std::string* pretokenization_delimiter) { + if (pretokenization_delimiter != nullptr) { + _has_bits_[0] |= 0x00000100u; + } else { + _has_bits_[0] &= ~0x00000100u; + } + pretokenization_delimiter_.SetAllocated(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), pretokenization_delimiter, + GetArena()); + // @@protoc_insertion_point(field_set_allocated:sentencepiece.TrainerSpec.pretokenization_delimiter) +} + // repeated string control_symbols = 30; inline int TrainerSpec::_internal_control_symbols_size() const { return control_symbols_.size(); @@ -3180,7 +3275,7 @@ inline void TrainerSpec::set_allocated_required_chars(std::string* required_char // optional bool byte_fallback = 35 [default = false]; inline bool TrainerSpec::_internal_has_byte_fallback() const { - bool value = (_has_bits_[0] & 0x00010000u) != 0; + bool value = (_has_bits_[0] & 0x00020000u) != 0; return value; } inline bool TrainerSpec::has_byte_fallback() const { @@ -3188,7 +3283,7 @@ inline bool TrainerSpec::has_byte_fallback() const { } inline void TrainerSpec::clear_byte_fallback() { byte_fallback_ = false; - _has_bits_[0] &= ~0x00010000u; + _has_bits_[0] &= ~0x00020000u; } inline bool TrainerSpec::_internal_byte_fallback() const { return byte_fallback_; @@ -3198,7 +3293,7 @@ inline bool TrainerSpec::byte_fallback() const { return _internal_byte_fallback(); } inline void TrainerSpec::_internal_set_byte_fallback(bool value) { - _has_bits_[0] |= 0x00010000u; + _has_bits_[0] |= 0x00020000u; byte_fallback_ = value; } inline void TrainerSpec::set_byte_fallback(bool value) { @@ -3208,7 +3303,7 @@ inline void TrainerSpec::set_byte_fallback(bool value) { // optional bool vocabulary_output_piece_score = 32 [default = true]; inline bool TrainerSpec::_internal_has_vocabulary_output_piece_score() const { - bool value = (_has_bits_[1] & 0x00000008u) != 0; + bool value = (_has_bits_[1] & 0x00000010u) != 0; return value; } inline bool TrainerSpec::has_vocabulary_output_piece_score() const { @@ -3216,7 +3311,7 @@ inline bool TrainerSpec::has_vocabulary_output_piece_score() const { } inline void TrainerSpec::clear_vocabulary_output_piece_score() { vocabulary_output_piece_score_ = true; - _has_bits_[1] &= ~0x00000008u; + _has_bits_[1] &= ~0x00000010u; } inline bool TrainerSpec::_internal_vocabulary_output_piece_score() const { return vocabulary_output_piece_score_; @@ -3226,7 +3321,7 @@ inline bool TrainerSpec::vocabulary_output_piece_score() const { return _internal_vocabulary_output_piece_score(); } inline void TrainerSpec::_internal_set_vocabulary_output_piece_score(bool value) { - _has_bits_[1] |= 0x00000008u; + _has_bits_[1] |= 0x00000010u; vocabulary_output_piece_score_ = value; } inline void TrainerSpec::set_vocabulary_output_piece_score(bool value) { @@ -3236,7 +3331,7 @@ inline void TrainerSpec::set_vocabulary_output_piece_score(bool value) { // optional bool hard_vocab_limit = 33 [default = true]; inline bool TrainerSpec::_internal_has_hard_vocab_limit() const { - bool value = (_has_bits_[1] & 0x00000010u) != 0; + bool value = (_has_bits_[1] & 0x00000020u) != 0; return value; } inline bool TrainerSpec::has_hard_vocab_limit() const { @@ -3244,7 +3339,7 @@ inline bool TrainerSpec::has_hard_vocab_limit() const { } inline void TrainerSpec::clear_hard_vocab_limit() { hard_vocab_limit_ = true; - _has_bits_[1] &= ~0x00000010u; + _has_bits_[1] &= ~0x00000020u; } inline bool TrainerSpec::_internal_hard_vocab_limit() const { return hard_vocab_limit_; @@ -3254,7 +3349,7 @@ inline bool TrainerSpec::hard_vocab_limit() const { return _internal_hard_vocab_limit(); } inline void TrainerSpec::_internal_set_hard_vocab_limit(bool value) { - _has_bits_[1] |= 0x00000010u; + _has_bits_[1] |= 0x00000020u; hard_vocab_limit_ = value; } inline void TrainerSpec::set_hard_vocab_limit(bool value) { @@ -3264,7 +3359,7 @@ inline void TrainerSpec::set_hard_vocab_limit(bool value) { // optional bool use_all_vocab = 34 [default = false]; inline bool TrainerSpec::_internal_has_use_all_vocab() const { - bool value = (_has_bits_[0] & 0x00020000u) != 0; + bool value = (_has_bits_[0] & 0x00040000u) != 0; return value; } inline bool TrainerSpec::has_use_all_vocab() const { @@ -3272,7 +3367,7 @@ inline bool TrainerSpec::has_use_all_vocab() const { } inline void TrainerSpec::clear_use_all_vocab() { use_all_vocab_ = false; - _has_bits_[0] &= ~0x00020000u; + _has_bits_[0] &= ~0x00040000u; } inline bool TrainerSpec::_internal_use_all_vocab() const { return use_all_vocab_; @@ -3282,7 +3377,7 @@ inline bool TrainerSpec::use_all_vocab() const { return _internal_use_all_vocab(); } inline void TrainerSpec::_internal_set_use_all_vocab(bool value) { - _has_bits_[0] |= 0x00020000u; + _has_bits_[0] |= 0x00040000u; use_all_vocab_ = value; } inline void TrainerSpec::set_use_all_vocab(bool value) { @@ -3292,7 +3387,7 @@ inline void TrainerSpec::set_use_all_vocab(bool value) { // optional int32 unk_id = 40 [default = 0]; inline bool TrainerSpec::_internal_has_unk_id() const { - bool value = (_has_bits_[0] & 0x00080000u) != 0; + bool value = (_has_bits_[0] & 0x00100000u) != 0; return value; } inline bool TrainerSpec::has_unk_id() const { @@ -3300,7 +3395,7 @@ inline bool TrainerSpec::has_unk_id() const { } inline void TrainerSpec::clear_unk_id() { unk_id_ = 0; - _has_bits_[0] &= ~0x00080000u; + _has_bits_[0] &= ~0x00100000u; } inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::_internal_unk_id() const { return unk_id_; @@ -3310,7 +3405,7 @@ inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::unk_id() const { return _internal_unk_id(); } inline void TrainerSpec::_internal_set_unk_id(::PROTOBUF_NAMESPACE_ID::int32 value) { - _has_bits_[0] |= 0x00080000u; + _has_bits_[0] |= 0x00100000u; unk_id_ = value; } inline void TrainerSpec::set_unk_id(::PROTOBUF_NAMESPACE_ID::int32 value) { @@ -3320,7 +3415,7 @@ inline void TrainerSpec::set_unk_id(::PROTOBUF_NAMESPACE_ID::int32 value) { // optional int32 bos_id = 41 [default = 1]; inline bool TrainerSpec::_internal_has_bos_id() const { - bool value = (_has_bits_[1] & 0x00000020u) != 0; + bool value = (_has_bits_[1] & 0x00000040u) != 0; return value; } inline bool TrainerSpec::has_bos_id() const { @@ -3328,7 +3423,7 @@ inline bool TrainerSpec::has_bos_id() const { } inline void TrainerSpec::clear_bos_id() { bos_id_ = 1; - _has_bits_[1] &= ~0x00000020u; + _has_bits_[1] &= ~0x00000040u; } inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::_internal_bos_id() const { return bos_id_; @@ -3338,7 +3433,7 @@ inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::bos_id() const { return _internal_bos_id(); } inline void TrainerSpec::_internal_set_bos_id(::PROTOBUF_NAMESPACE_ID::int32 value) { - _has_bits_[1] |= 0x00000020u; + _has_bits_[1] |= 0x00000040u; bos_id_ = value; } inline void TrainerSpec::set_bos_id(::PROTOBUF_NAMESPACE_ID::int32 value) { @@ -3348,7 +3443,7 @@ inline void TrainerSpec::set_bos_id(::PROTOBUF_NAMESPACE_ID::int32 value) { // optional int32 eos_id = 42 [default = 2]; inline bool TrainerSpec::_internal_has_eos_id() const { - bool value = (_has_bits_[1] & 0x00000040u) != 0; + bool value = (_has_bits_[1] & 0x00000080u) != 0; return value; } inline bool TrainerSpec::has_eos_id() const { @@ -3356,7 +3451,7 @@ inline bool TrainerSpec::has_eos_id() const { } inline void TrainerSpec::clear_eos_id() { eos_id_ = 2; - _has_bits_[1] &= ~0x00000040u; + _has_bits_[1] &= ~0x00000080u; } inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::_internal_eos_id() const { return eos_id_; @@ -3366,7 +3461,7 @@ inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::eos_id() const { return _internal_eos_id(); } inline void TrainerSpec::_internal_set_eos_id(::PROTOBUF_NAMESPACE_ID::int32 value) { - _has_bits_[1] |= 0x00000040u; + _has_bits_[1] |= 0x00000080u; eos_id_ = value; } inline void TrainerSpec::set_eos_id(::PROTOBUF_NAMESPACE_ID::int32 value) { @@ -3376,7 +3471,7 @@ inline void TrainerSpec::set_eos_id(::PROTOBUF_NAMESPACE_ID::int32 value) { // optional int32 pad_id = 43 [default = -1]; inline bool TrainerSpec::_internal_has_pad_id() const { - bool value = (_has_bits_[1] & 0x00000080u) != 0; + bool value = (_has_bits_[1] & 0x00000100u) != 0; return value; } inline bool TrainerSpec::has_pad_id() const { @@ -3384,7 +3479,7 @@ inline bool TrainerSpec::has_pad_id() const { } inline void TrainerSpec::clear_pad_id() { pad_id_ = -1; - _has_bits_[1] &= ~0x00000080u; + _has_bits_[1] &= ~0x00000100u; } inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::_internal_pad_id() const { return pad_id_; @@ -3394,7 +3489,7 @@ inline ::PROTOBUF_NAMESPACE_ID::int32 TrainerSpec::pad_id() const { return _internal_pad_id(); } inline void TrainerSpec::_internal_set_pad_id(::PROTOBUF_NAMESPACE_ID::int32 value) { - _has_bits_[1] |= 0x00000080u; + _has_bits_[1] |= 0x00000100u; pad_id_ = value; } inline void TrainerSpec::set_pad_id(::PROTOBUF_NAMESPACE_ID::int32 value) { @@ -3774,7 +3869,7 @@ inline void TrainerSpec::set_allocated_unk_surface(std::string* unk_surface) { // optional bool train_extremely_large_corpus = 49 [default = false]; inline bool TrainerSpec::_internal_has_train_extremely_large_corpus() const { - bool value = (_has_bits_[0] & 0x00040000u) != 0; + bool value = (_has_bits_[0] & 0x00080000u) != 0; return value; } inline bool TrainerSpec::has_train_extremely_large_corpus() const { @@ -3782,7 +3877,7 @@ inline bool TrainerSpec::has_train_extremely_large_corpus() const { } inline void TrainerSpec::clear_train_extremely_large_corpus() { train_extremely_large_corpus_ = false; - _has_bits_[0] &= ~0x00040000u; + _has_bits_[0] &= ~0x00080000u; } inline bool TrainerSpec::_internal_train_extremely_large_corpus() const { return train_extremely_large_corpus_; @@ -3792,7 +3887,7 @@ inline bool TrainerSpec::train_extremely_large_corpus() const { return _internal_train_extremely_large_corpus(); } inline void TrainerSpec::_internal_set_train_extremely_large_corpus(bool value) { - _has_bits_[0] |= 0x00040000u; + _has_bits_[0] |= 0x00080000u; train_extremely_large_corpus_ = value; } inline void TrainerSpec::set_train_extremely_large_corpus(bool value) { diff --git a/src/pretokenizer_for_training.cc b/src/pretokenizer_for_training.cc index 049658e..d4f492c 100644 --- a/src/pretokenizer_for_training.cc +++ b/src/pretokenizer_for_training.cc @@ -11,9 +11,10 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License.! +#include "pretokenizer_for_training.h" + #include -#include "pretokenizer_for_training.h" #include "third_party/absl/strings/str_replace.h" namespace sentencepiece { @@ -24,10 +25,9 @@ namespace { // defined them explicitly to avoid the dependency to trainier_interface. // Currently, we have no separated build rules. const char kWSStr[] = "\xe2\x96\x81"; -const char kUPPBoundaryStr[] = "\t"; } // namespace -std::string PretokenizerForTrainingInterface::PreTokenize( +std::vector PretokenizerForTrainingInterface::PreTokenize( absl::string_view text) const { return Postprocess(Tokenize(Preprocess(text))); } @@ -40,14 +40,17 @@ std::string PretokenizerForTrainingInterface::Preprocess( } // static -std::string PretokenizerForTrainingInterface::Postprocess( +std::vector PretokenizerForTrainingInterface::Postprocess( const SentencePieceText &spt) { // Inserts kUPPBoundaryStr before/after of token boundaries. + std::vector result; std::string output; + int prev = 0; for (const auto &piece : spt.pieces()) { if (prev == piece.begin() && piece.begin() != 0) { - output += kUPPBoundaryStr; + result.push_back(output); + output.clear(); } else { output.append(piece.begin() - prev, ' '); } @@ -55,8 +58,11 @@ std::string PretokenizerForTrainingInterface::Postprocess( prev = piece.end(); } - // Restores kWSStr. - return absl::StrReplaceAll(output, {{" ", kWSStr}}); + if (!output.empty()) result.push_back(output); + + for (auto &w : result) w = absl::StrReplaceAll(w, {{" ", kWSStr}}); + + return result; } } // namespace pretokenizer diff --git a/src/pretokenizer_for_training.h b/src/pretokenizer_for_training.h index 2d3bc82..fa54f95 100644 --- a/src/pretokenizer_for_training.h +++ b/src/pretokenizer_for_training.h @@ -44,7 +44,7 @@ class PretokenizerForTrainingInterface { // segmentation: piece[0] = {0, 1}, piece[1] = {2, 6}, // piece[2] = {7, 15}, piece[3] = {15, 20} // output: I love sentencepiece. - std::string PreTokenize(absl::string_view text) const; + std::vector PreTokenize(absl::string_view text) const; // Returns pre-tokenized result. // Note that the pre-tokenized constraint is specified with the @@ -54,7 +54,7 @@ class PretokenizerForTrainingInterface { private: static std::string Preprocess(absl::string_view text); - static std::string Postprocess(const SentencePieceText &spt); + static std::vector Postprocess(const SentencePieceText &spt); }; } // namespace pretokenizer diff --git a/src/pretokenizer_for_training_test.cc b/src/pretokenizer_for_training_test.cc index 80f4787..99db0c5 100644 --- a/src/pretokenizer_for_training_test.cc +++ b/src/pretokenizer_for_training_test.cc @@ -12,8 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License.! #include "pretokenizer_for_training.h" + #include "testharness.h" #include "third_party/absl/strings/str_cat.h" +#include "third_party/absl/strings/str_join.h" +#include "third_party/absl/strings/str_split.h" #include "trainer_interface.h" namespace sentencepiece { @@ -64,9 +67,11 @@ TEST(PretokenizerForTrainingTest, BaseTest) { mock.SetOutput(spt); - EXPECT_EQ(absl::StrCat("I", TrainerInterface::kWSStr, "love", - TrainerInterface::kWSStr, "sentence\tpiece"), - mock.PreTokenize("I love sentencepiece")); + const auto expected = + absl::StrCat("I", TrainerInterface::kWSStr, "love", + TrainerInterface::kWSStr, "sentence||||piece"); + EXPECT_EQ(expected, + absl::StrJoin(mock.PreTokenize("I love sentencepiece"), "||||")); } { @@ -94,7 +99,9 @@ TEST(PretokenizerForTrainingTest, BaseTest) { mock.SetOutput(spt); - EXPECT_EQ("これ\tは\tペン\tです", mock.PreTokenize("これはペンです")); + const auto expected = "これ||||は||||ペン||||です"; + EXPECT_EQ(expected, + absl::StrJoin(mock.PreTokenize("これはペンです"), "||||")); } } diff --git a/src/sentencepiece_model.proto b/src/sentencepiece_model.proto index b6c1224..826f72d 100644 --- a/src/sentencepiece_model.proto +++ b/src/sentencepiece_model.proto @@ -20,7 +20,7 @@ option optimize_for = LITE_RUNTIME; package sentencepiece; // TrainerSpec encodes a various parameters for SentencePiece training. -// Next id: 53 +// Next id: 54 message TrainerSpec { /////////////////////////////////////////////////////////////////// // General parameters @@ -157,6 +157,13 @@ message TrainerSpec { // Split all digits (0-9) into separate pieces. optional bool split_digits = 25 [default = false]; + // Defines the pre-tokenization delimiter. + // When specified, no pieces crossing this delimiter is not included + // in the vocab. Then the delimiter string is virtually ignored + // during the training. This field can allows constraints on the vocabulary + // selection. Note that this field is available on unigram mode. + optional string pretokenization_delimiter = 53 [ default = ""]; + /////////////////////////////////////////////////////////////////// // Vocabulary management // diff --git a/src/spec_parser.h b/src/spec_parser.h index de8f72f..c5f0582 100644 --- a/src/spec_parser.h +++ b/src/spec_parser.h @@ -144,6 +144,7 @@ inline std::string PrintProto(const TrainerSpec &message, PRINT_PARAM(split_by_number); PRINT_PARAM(split_by_whitespace); PRINT_PARAM(split_digits); + PRINT_PARAM(pretokenization_delimiter); PRINT_PARAM(treat_whitespace_as_suffix); PRINT_PARAM(allow_whitespace_only_pieces); PRINT_REPEATED_STRING(control_symbols); @@ -222,6 +223,7 @@ util::Status SentencePieceTrainer::SetProtoField(absl::string_view name, PARSE_BOOL(split_by_number); PARSE_BOOL(split_by_whitespace); PARSE_BOOL(split_digits); + PARSE_STRING(pretokenization_delimiter); PARSE_BOOL(treat_whitespace_as_suffix); PARSE_BOOL(allow_whitespace_only_pieces); PARSE_REPEATED_STRING(control_symbols); diff --git a/src/spm_train_main.cc b/src/spm_train_main.cc index 6ab634d..34369cd 100644 --- a/src/spm_train_main.cc +++ b/src/spm_train_main.cc @@ -77,6 +77,9 @@ ABSL_FLAG(bool, split_by_whitespace, kDefaultTrainerSpec.split_by_whitespace(), "use a white space to split sentence pieces"); ABSL_FLAG(bool, split_digits, kDefaultTrainerSpec.split_digits(), "split all digits (0-9) into separate pieces"); +ABSL_FLAG(std::string, pretokenization_delimiter, + kDefaultTrainerSpec.pretokenization_delimiter(), + "specifies the delimiter of pre-tokenization"); ABSL_FLAG(bool, treat_whitespace_as_suffix, kDefaultTrainerSpec.treat_whitespace_as_suffix(), "treat whitespace marker as suffix instead of prefix."); @@ -227,6 +230,7 @@ int main(int argc, char *argv[]) { SetTrainerSpecFromFlag(split_by_whitespace); SetTrainerSpecFromFlag(split_by_number); SetTrainerSpecFromFlag(split_digits); + SetTrainerSpecFromFlag(pretokenization_delimiter); SetTrainerSpecFromFlag(byte_fallback); SetTrainerSpecFromFlag(treat_whitespace_as_suffix); SetTrainerSpecFromFlag(allow_whitespace_only_pieces); diff --git a/src/trainer_interface.cc b/src/trainer_interface.cc index fb4087a..ab1bf8a 100644 --- a/src/trainer_interface.cc +++ b/src/trainer_interface.cc @@ -81,7 +81,8 @@ util::Status VerifySpec(const TrainerSpec &trainer_spec) { CHECK_OR_RETURN(!trainer_spec.eos_piece().empty()); CHECK_OR_RETURN(!trainer_spec.pad_piece().empty()); - if (SentencePieceTrainer::GetPretokenizerForTraining()) { + if (SentencePieceTrainer::GetPretokenizerForTraining() || + !trainer_spec.pretokenization_delimiter().empty()) { CHECK_EQ_OR_RETURN(TrainerSpec::UNIGRAM, trainer_spec.model_type()) << "PretokenizerForTraining is only supported in UNIGRAM mode."; } diff --git a/src/unigram_model.cc b/src/unigram_model.cc index d9f1ce9..9592053 100644 --- a/src/unigram_model.cc +++ b/src/unigram_model.cc @@ -461,7 +461,7 @@ std::vector Lattice::NBest(size_t nbest_size, } else { hyp->gx = lnode->score + top->gx; // just adds node->score hyp->fx = - lnode->backtrace_score + top->gx; // backtrace_score is h(node). + lnode->backtrace_score + hyp->gx; // backtrace_score is h(node). } hyp->next = top; agenda.push(hyp); diff --git a/src/unigram_model_trainer.cc b/src/unigram_model_trainer.cc index ee7921c..4e9e0b3 100644 --- a/src/unigram_model_trainer.cc +++ b/src/unigram_model_trainer.cc @@ -28,7 +28,10 @@ #include "pretokenizer_for_training.h" #include "sentencepiece_trainer.h" #include "third_party/absl/container/flat_hash_map.h" +#include "third_party/absl/container/flat_hash_set.h" #include "third_party/absl/memory/memory.h" +#include "third_party/absl/strings/str_replace.h" +#include "third_party/absl/strings/str_split.h" #include "third_party/esaxx/esa.hxx" // Suffix array library. #include "unicode_script.h" #include "util.h" @@ -37,6 +40,9 @@ namespace sentencepiece { namespace unigram { namespace { +constexpr char32 kSentenceBoundary = 0x0000; +constexpr char32 kWsMarker = 0x2581; + double Digamma(double x) { double result = 0.0; for (; x < 7; ++x) result -= 1 / x; @@ -60,6 +66,63 @@ void ToLogProb(IT begin, IT end) { it->second = std::log(static_cast(it->second)) - logsum; } } + +template +std::vector> SplitBySentenceBoundary( + const T *begin, const T *end) { + std::vector> result; + + while (begin < end) { + const auto *p = std::find(begin, end, static_cast(kSentenceBoundary)); + if (p != end) { + result.emplace_back(begin, p); + begin = p + 1; + } else { + result.emplace_back(begin, end); + break; + } + } + + return result; +} + +template +class BoundedPriorityQueue { + public: + explicit BoundedPriorityQueue(size_t size) : size_(size) {} + ~BoundedPriorityQueue() = default; + + void push(const T &elem, int64 score) { + if (queue_.size() > 4 * size_) resize(); + if (queue_.size() >= size_ && queue_[size_ - 1].second > score) return; + queue_.emplace_back(elem, score); + } + + const std::vector> &get() { + resize(); + return queue_; + } + + private: + void resize() { + std::sort(queue_.begin(), queue_.end(), [](const auto &p1, const auto &p2) { + return (p1.second > p2.second || + (p1.second == p2.second && p1.first < p2.first)); + }); + + absl::flat_hash_set dup; + std::vector> new_queue; + for (auto &p : queue_) { + if (dup.insert(p.first).second) new_queue.emplace_back(std::move(p)); + if (new_queue.size() == size_) break; + } + queue_ = std::move(new_queue); + } + + size_t size_ = 0; + std::vector> queue_; +}; + } // namespace TrainerModel::TrainerModel(const TrainerSpec &trainer_spec, @@ -96,7 +159,7 @@ void TrainerModel::SetSentencePieces(SentencePieces &&sentencepieces) { CHECK(status().ok()); } -TrainerModel::SentencePieces Trainer::MakeSeedSentencePieces() const { +TrainerModel::SentencePieces Trainer::MakeSeedSentencePieces() { return trainer_spec_.train_extremely_large_corpus() ? MakeSeedSentencePiecesInternal() : MakeSeedSentencePiecesInternal(); @@ -104,7 +167,7 @@ TrainerModel::SentencePieces Trainer::MakeSeedSentencePieces() const { // Returns seed sentencepieces for EM training. template -TrainerModel::SentencePieces Trainer::MakeSeedSentencePiecesInternal() const { +TrainerModel::SentencePieces Trainer::MakeSeedSentencePiecesInternal() { CHECK(!sentences_.empty()); CHECK(!required_chars_.empty()); @@ -112,14 +175,43 @@ TrainerModel::SentencePieces Trainer::MakeSeedSentencePiecesInternal() const { // Pretokenizer is used as a constraint of piece extractions. const auto *pretokenizer = SentencePieceTrainer::GetPretokenizerForTraining(); + auto pretokenize_or_rewrite = [&](std::pair *w) { + if (pretokenizer) { + std::vector chars; + for (const auto &w : pretokenizer->PreTokenize(w->first)) { + for (const auto &c : string_util::UTF8ToUnicodeText(w)) { + chars.push_back(c); + } + chars.push_back(kSentenceBoundary); + } + return chars; + } else if (!trainer_spec_.pretokenization_delimiter().empty()) { + // When delimiter is specified, tokenize the input with the delimiter. + // For EM training, we assume that the delimiter doesn't exist and + // rewrite the original sentence. + std::vector chars; + absl::string_view delimiter = trainer_spec_.pretokenization_delimiter(); + for (const auto &w : absl::StrSplit(w->first, delimiter)) { + for (const auto &c : string_util::UTF8ToUnicodeText(w)) { + chars.push_back(c); + } + chars.push_back(kSentenceBoundary); + } + // Removes the delimiter. + w->first = absl::StrReplaceAll(w->first, {{delimiter, ""}}); + return chars; + } + return string_util::UTF8ToUnicodeText(w->first); + }; + // Merges all sentences into one array with 0x0000 delimiter. std::vector array; absl::flat_hash_map all_chars; - constexpr char32 kSentenceBoundary = 0x0000; - for (const auto &w : sentences_) { - const auto ut = string_util::UTF8ToUnicodeText( - pretokenizer ? pretokenizer->PreTokenize(w.first) : w.first); + const bool is_tsv = trainer_spec_.input_format() == "tsv"; + + for (auto &w : sentences_) { + const auto ut = pretokenize_or_rewrite(&w); for (const auto &c : ut) { array.push_back(c); if (c != kUNKChar && c != kSentenceBoundary) { @@ -127,6 +219,15 @@ TrainerModel::SentencePieces Trainer::MakeSeedSentencePiecesInternal() const { } } array.push_back(kSentenceBoundary); // sentence boundary marker. + + // Naive workaround to over-sample the input. + // In TSV mode, the frequency field is not used to extract the seed piece. + // we can at least extract all pieces by copying the input because + // the occurrence gets at least larger than or equals to 2. + if (is_tsv) { + for (const auto &c : ut) array.push_back(c); + array.push_back(kSentenceBoundary); + } } CHECK_LE(array.size(), @@ -147,29 +248,42 @@ TrainerModel::SentencePieces Trainer::MakeSeedSentencePiecesInternal() const { CHECK_EQ(0, esaxx(array.begin(), SA.begin(), L.begin(), R.begin(), D.begin(), n, kAlphabetSize, node_num)); - LOG(INFO) << "Extracting frequent sub strings..."; - std::vector> substr_index; + LOG(INFO) << "Extracting frequent sub strings... node_num=" << node_num; + + BoundedPriorityQueue queue( + static_cast(trainer_spec_.seed_sentencepiece_size())); + for (node_int_type i = 0; i < node_num; ++i) { const node_int_type offset = SA[L[i]]; const node_int_type len = D[i]; if (len <= 1) { continue; } - const char32 *begin = &array[0] + offset; - const char32 *end = &array[0] + offset + len; - // Skips if a substring contains a sentence boundary. - if (std::find(begin, end, kSentenceBoundary) != end) { - continue; - } - const UnicodeText uw(begin, end); - if (!IsValidSentencePiece(uw)) { - continue; - } - // character-wise coverage is the default score. - const node_int_type freq = R[i] - L[i]; - const node_int_type score = freq * len; - substr_index.emplace_back(i, score); + for (const auto &p : + SplitBySentenceBoundary(&array[offset], &array[offset + len])) { + if (p.first == p.second) continue; + const auto [begin, end] = NormalizeRange(p.first, p.second); + + const UnicodeText uw(begin, end); + if (uw.size() <= 1 || !IsValidSentencePiece(uw)) { + continue; + } + + // character-wise coverage is the default score. + const node_int_type freq = R[i] - L[i]; + const node_int_type score = freq * freq; + + const auto w = string_util::UnicodeTextToUTF8(uw); + queue.push(w, score); + + const auto subpieces = + SplitIntoWords(w, trainer_spec_.treat_whitespace_as_suffix(), + trainer_spec_.allow_whitespace_only_pieces()); + if (subpieces.size() > 1) { + for (const auto &s : subpieces) queue.push(std::string(s), score); + } + } } // all_chars must be included in the seed sentencepieces. @@ -178,22 +292,8 @@ TrainerModel::SentencePieces Trainer::MakeSeedSentencePiecesInternal() const { seed_sentencepieces.emplace_back(it); } - // Sort by the coverage of sub strings. - for (const auto &p : Sorted(substr_index)) { - const node_int_type offset = SA[L[p.first]]; - const node_int_type len = D[p.first]; - CHECK_GT(len, 0); - const char32 *begin = &array[offset]; - const char32 *end = &array[offset + len]; - const UnicodeText uw(begin, end); - CHECK(IsValidSentencePiece(uw)); // just in case. - const std::string w = string_util::UnicodeTextToUTF8(uw); - if (seed_sentencepieces.size() == - static_cast(trainer_spec_.seed_sentencepiece_size())) { - break; - } - CHECK(!port::ContainsKey(all_chars, w)); - seed_sentencepieces.emplace_back(w, p.second); + for (const auto &p : queue.get()) { + seed_sentencepieces.emplace_back(p); } ToLogProb(seed_sentencepieces.begin(), seed_sentencepieces.end()); @@ -430,6 +530,22 @@ TrainerModel::SentencePieces Trainer::PruneSentencePieces( return new_sentencepieces; } +std::pair Trainer::NormalizeRange( + const char32 *begin, const char32 *end) const { + if (trainer_spec_.treat_whitespace_as_suffix()) { + while ((*begin == kSentenceBoundary || *begin == kWsMarker) && + begin + 1 < end) + ++begin; + while (*(end - 1) == kSentenceBoundary && begin + 1 < end) --end; + } else { + while (*begin == kSentenceBoundary && begin + 1 < end) ++begin; + while ((*(end - 1) == kSentenceBoundary || *(end - 1) == kWsMarker) && + begin + 1 < end) + --end; + } + return std::make_pair(begin, end); +} + TrainerModel::SentencePieces Trainer::FinalizeSentencePieces( const TrainerModel &model) const { const auto &sentencepieces = model.GetSentencePieces(); diff --git a/src/unigram_model_trainer.h b/src/unigram_model_trainer.h index 9593e31..a031286 100644 --- a/src/unigram_model_trainer.h +++ b/src/unigram_model_trainer.h @@ -68,7 +68,7 @@ class Trainer : public TrainerInterface { : TrainerInterface::TrainerInterface(trainer_spec, normalizer_spec, denormalizer_spec) {} - TrainerModel::SentencePieces MakeSeedSentencePieces() const; + TrainerModel::SentencePieces MakeSeedSentencePieces(); util::Status Train() override; @@ -80,7 +80,7 @@ class Trainer : public TrainerInterface { // node_int_type should be of integer type (int32 or int64), // determined by train_extremely_large_corpus. template - TrainerModel::SentencePieces MakeSeedSentencePiecesInternal() const; + TrainerModel::SentencePieces MakeSeedSentencePiecesInternal(); // Executes the E step of EM and returns expected count. // The index of return array is the vocab id. @@ -105,6 +105,9 @@ class Trainer : public TrainerInterface { TrainerModel::SentencePieces FinalizeSentencePieces( const TrainerModel &model) const; + std::pair NormalizeRange( + const char32 *begin, const char32 *end) const; + // When the size of SentencePieces becomes less than desired_vocab_size_, // break the main training loop. desired_vocab_size_ = 1.1 * vocab_size_ // for now. diff --git a/src/unigram_model_trainer_test.cc b/src/unigram_model_trainer_test.cc index 059772b..7b7ecc8 100644 --- a/src/unigram_model_trainer_test.cc +++ b/src/unigram_model_trainer_test.cc @@ -117,11 +117,13 @@ TEST(UnigramTrainerTest, BasicTest) { 30); // Check seed pieces. - EXPECT_EQ(27, res.seed_pieces_and_probs.size()); + EXPECT_EQ(63, res.seed_pieces_and_probs.size()); // Check final pieces. - EXPECT_EQ("i a n y m l e apple ve O P r t g an v ▁ A b le ▁an p d h", - res.sentence_pieces); + EXPECT_EQ( + "Overly Pineapple magnanimity Available ▁an a ▁ b A t g r P O v m y p n " + "l d e h i", + res.sentence_pieces); } TEST(UnigramTrainerTest, BasicDPTest) { @@ -132,8 +134,7 @@ TEST(UnigramTrainerTest, BasicDPTest) { "Overly \t 6", "Available \t 5"}, 22, true /*use_dp*/, 0 /*dp_noise*/, 4 /*dp_clipping*/); - // Got 16 instead of 27 seeds. - EXPECT_EQ(16, res.seed_pieces_and_probs.size()); + EXPECT_EQ(49, res.seed_pieces_and_probs.size()); // And they are equiv to if the last sentence was not there. const auto& res_nodp = RunTrainer( @@ -191,12 +192,12 @@ TEST(UnigramTrainerTest, EndToEndTest) { .ok()); // TODO(taku): Temporally disable this test on Windows. #ifndef OS_WIN - EXPECT_EQ(WS - " 吾輩 《 わが はい 》 は 猫 である 。 名前 はまだ 無い 。 " - "どこ で 生 れた か とん と 見当 《 けん とう 》 が つか ぬ 。 " - "何でも 薄 暗 い じめ じめ した 所で ニャーニャー " - "泣 い ていた 事 だけは 記憶 している 。", - absl::StrJoin(tok, " ")); + EXPECT_EQ( + WS + " 吾輩 《 わ が は い 》 は猫である 。 名前は まだ 無 い 。 どこ で 生れ " + "た か とん と 見当 《 けん とう 》 が つか ぬ 。 何でも 薄 暗 い じめ " + "じめ した 所で ニャーニャー 泣 い ていた 事 だけ は記憶している 。", + absl::StrJoin(tok, " ")); #endif }