diff --git a/VERSION.txt b/VERSION.txt index e982215..9c178d3 100644 --- a/VERSION.txt +++ b/VERSION.txt @@ -1 +1 @@ -0.1.94 +0.1.95 diff --git a/python/VERSION.txt b/python/VERSION.txt index e982215..9c178d3 100644 --- a/python/VERSION.txt +++ b/python/VERSION.txt @@ -1 +1 @@ -0.1.94 +0.1.95 diff --git a/python/make_py_wheel.sh b/python/make_py_wheel.sh index 79aeb45..18339e4 100755 --- a/python/make_py_wheel.sh +++ b/python/make_py_wheel.sh @@ -41,6 +41,7 @@ build() { for i in /opt/python/* do + export LD_LIBRARY_PATH=/usr/local/lib:/usr/lib $i/bin/python setup.py bdist strip build/*/*/*.so $i/bin/python setup.py bdist_wheel diff --git a/python/src/sentencepiece/sentencepiece_model_pb2.py b/python/src/sentencepiece/sentencepiece_model_pb2.py index 084b896..940a079 100644 --- a/python/src/sentencepiece/sentencepiece_model_pb2.py +++ b/python/src/sentencepiece/sentencepiece_model_pb2.py @@ -1,13 +1,11 @@ +# -*- coding: utf-8 -*- # Generated by the protocol buffer compiler. DO NOT EDIT! # source: sentencepiece_model.proto - -import sys -_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1')) +"""Generated protocol buffer code.""" from google.protobuf import descriptor as _descriptor from google.protobuf import message as _message from google.protobuf import reflection as _reflection from google.protobuf import symbol_database as _symbol_database -from google.protobuf import descriptor_pb2 # @@protoc_insertion_point(imports) _sym_db = _symbol_database.Default() @@ -19,9 +17,10 @@ DESCRIPTOR = _descriptor.FileDescriptor( name='sentencepiece_model.proto', package='sentencepiece', syntax='proto2', - serialized_pb=_b('\n\x19sentencepiece_model.proto\x12\rsentencepiece\"\xf4\x08\n\x0bTrainerSpec\x12\r\n\x05input\x18\x01 \x03(\t\x12\x14\n\x0cinput_format\x18\x07 \x01(\t\x12\x14\n\x0cmodel_prefix\x18\x02 \x01(\t\x12\x41\n\nmodel_type\x18\x03 \x01(\x0e\x32$.sentencepiece.TrainerSpec.ModelType:\x07UNIGRAM\x12\x18\n\nvocab_size\x18\x04 \x01(\x05:\x04\x38\x30\x30\x30\x12\x17\n\x0f\x61\x63\x63\x65pt_language\x18\x05 \x03(\t\x12 \n\x15self_test_sample_size\x18\x06 \x01(\x05:\x01\x30\x12\"\n\x12\x63haracter_coverage\x18\n \x01(\x02:\x06\x30.9995\x12\x1e\n\x13input_sentence_size\x18\x0b \x01(\x05:\x01\x30\x12$\n\x16shuffle_input_sentence\x18\x13 \x01(\x08:\x04true\x12 \n\x14mining_sentence_size\x18\x0c \x01(\x05\x42\x02\x18\x01\x12\"\n\x16training_sentence_size\x18\r \x01(\x05\x42\x02\x18\x01\x12(\n\x17seed_sentencepiece_size\x18\x0e \x01(\x05:\x07\x31\x30\x30\x30\x30\x30\x30\x12\x1e\n\x10shrinking_factor\x18\x0f \x01(\x02:\x04\x30.75\x12!\n\x13max_sentence_length\x18\x12 \x01(\x05:\x04\x34\x31\x39\x32\x12\x17\n\x0bnum_threads\x18\x10 \x01(\x05:\x02\x31\x36\x12\x1d\n\x12num_sub_iterations\x18\x11 \x01(\x05:\x01\x32\x12$\n\x18max_sentencepiece_length\x18\x14 \x01(\x05:\x02\x31\x36\x12%\n\x17split_by_unicode_script\x18\x15 \x01(\x08:\x04true\x12\x1d\n\x0fsplit_by_number\x18\x17 \x01(\x08:\x04true\x12!\n\x13split_by_whitespace\x18\x16 \x01(\x08:\x04true\x12)\n\x1atreat_whitespace_as_suffix\x18\x18 \x01(\x08:\x05\x66\x61lse\x12\x17\n\x0f\x63ontrol_symbols\x18\x1e \x03(\t\x12\x1c\n\x14user_defined_symbols\x18\x1f \x03(\t\x12\x1e\n\x10hard_vocab_limit\x18! \x01(\x08:\x04true\x12\x1c\n\ruse_all_vocab\x18\" \x01(\x08:\x05\x66\x61lse\x12\x11\n\x06unk_id\x18( \x01(\x05:\x01\x30\x12\x11\n\x06\x62os_id\x18) \x01(\x05:\x01\x31\x12\x11\n\x06\x65os_id\x18* \x01(\x05:\x01\x32\x12\x12\n\x06pad_id\x18+ \x01(\x05:\x02-1\x12\x18\n\tunk_piece\x18- \x01(\t:\x05\x12\x16\n\tbos_piece\x18. \x01(\t:\x03\x12\x17\n\teos_piece\x18/ \x01(\t:\x04\x12\x18\n\tpad_piece\x18\x30 \x01(\t:\x05\x12\x1a\n\x0bunk_surface\x18, \x01(\t:\x05 \xe2\x81\x87 \"5\n\tModelType\x12\x0b\n\x07UNIGRAM\x10\x01\x12\x07\n\x03\x42PE\x10\x02\x12\x08\n\x04WORD\x10\x03\x12\x08\n\x04\x43HAR\x10\x04*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\"\xd1\x01\n\x0eNormalizerSpec\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x1c\n\x14precompiled_charsmap\x18\x02 \x01(\x0c\x12\x1e\n\x10\x61\x64\x64_dummy_prefix\x18\x03 \x01(\x08:\x04true\x12&\n\x18remove_extra_whitespaces\x18\x04 \x01(\x08:\x04true\x12 \n\x12\x65scape_whitespaces\x18\x05 \x01(\x08:\x04true\x12\x1e\n\x16normalization_rule_tsv\x18\x06 \x01(\t*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\"y\n\x0cSelfTestData\x12\x33\n\x07samples\x18\x01 \x03(\x0b\x32\".sentencepiece.SelfTestData.Sample\x1a)\n\x06Sample\x12\r\n\x05input\x18\x01 \x01(\t\x12\x10\n\x08\x65xpected\x18\x02 \x01(\t*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\"\xba\x03\n\nModelProto\x12\x37\n\x06pieces\x18\x01 \x03(\x0b\x32\'.sentencepiece.ModelProto.SentencePiece\x12\x30\n\x0ctrainer_spec\x18\x02 \x01(\x0b\x32\x1a.sentencepiece.TrainerSpec\x12\x36\n\x0fnormalizer_spec\x18\x03 \x01(\x0b\x32\x1d.sentencepiece.NormalizerSpec\x12\x33\n\x0eself_test_data\x18\x04 \x01(\x0b\x32\x1b.sentencepiece.SelfTestData\x1a\xc8\x01\n\rSentencePiece\x12\r\n\x05piece\x18\x01 \x01(\t\x12\r\n\x05score\x18\x02 \x01(\x02\x12\x42\n\x04type\x18\x03 \x01(\x0e\x32,.sentencepiece.ModelProto.SentencePiece.Type:\x06NORMAL\"J\n\x04Type\x12\n\n\x06NORMAL\x10\x01\x12\x0b\n\x07UNKNOWN\x10\x02\x12\x0b\n\x07\x43ONTROL\x10\x03\x12\x10\n\x0cUSER_DEFINED\x10\x04\x12\n\n\x06UNUSED\x10\x05*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\x42\x02H\x03') + serialized_options=b'H\003', + create_key=_descriptor._internal_create_key, + serialized_pb=b'\n\x19sentencepiece_model.proto\x12\rsentencepiece\"\xa1\n\n\x0bTrainerSpec\x12\r\n\x05input\x18\x01 \x03(\t\x12\x14\n\x0cinput_format\x18\x07 \x01(\t\x12\x14\n\x0cmodel_prefix\x18\x02 \x01(\t\x12\x41\n\nmodel_type\x18\x03 \x01(\x0e\x32$.sentencepiece.TrainerSpec.ModelType:\x07UNIGRAM\x12\x18\n\nvocab_size\x18\x04 \x01(\x05:\x04\x38\x30\x30\x30\x12\x17\n\x0f\x61\x63\x63\x65pt_language\x18\x05 \x03(\t\x12 \n\x15self_test_sample_size\x18\x06 \x01(\x05:\x01\x30\x12\"\n\x12\x63haracter_coverage\x18\n \x01(\x02:\x06\x30.9995\x12\x1e\n\x13input_sentence_size\x18\x0b \x01(\x05:\x01\x30\x12$\n\x16shuffle_input_sentence\x18\x13 \x01(\x08:\x04true\x12 \n\x14mining_sentence_size\x18\x0c \x01(\x05\x42\x02\x18\x01\x12\"\n\x16training_sentence_size\x18\r \x01(\x05\x42\x02\x18\x01\x12(\n\x17seed_sentencepiece_size\x18\x0e \x01(\x05:\x07\x31\x30\x30\x30\x30\x30\x30\x12\x1e\n\x10shrinking_factor\x18\x0f \x01(\x02:\x04\x30.75\x12!\n\x13max_sentence_length\x18\x12 \x01(\x05:\x04\x34\x31\x39\x32\x12\x17\n\x0bnum_threads\x18\x10 \x01(\x05:\x02\x31\x36\x12\x1d\n\x12num_sub_iterations\x18\x11 \x01(\x05:\x01\x32\x12$\n\x18max_sentencepiece_length\x18\x14 \x01(\x05:\x02\x31\x36\x12%\n\x17split_by_unicode_script\x18\x15 \x01(\x08:\x04true\x12\x1d\n\x0fsplit_by_number\x18\x17 \x01(\x08:\x04true\x12!\n\x13split_by_whitespace\x18\x16 \x01(\x08:\x04true\x12)\n\x1atreat_whitespace_as_suffix\x18\x18 \x01(\x08:\x05\x66\x61lse\x12\x1b\n\x0csplit_digits\x18\x19 \x01(\x08:\x05\x66\x61lse\x12\x17\n\x0f\x63ontrol_symbols\x18\x1e \x03(\t\x12\x1c\n\x14user_defined_symbols\x18\x1f \x03(\t\x12\x16\n\x0erequired_chars\x18$ \x01(\t\x12\x1c\n\rbyte_fallback\x18# \x01(\x08:\x05\x66\x61lse\x12+\n\x1dvocabulary_output_piece_score\x18 \x01(\x08:\x04true\x12\x1e\n\x10hard_vocab_limit\x18! \x01(\x08:\x04true\x12\x1c\n\ruse_all_vocab\x18\" \x01(\x08:\x05\x66\x61lse\x12\x11\n\x06unk_id\x18( \x01(\x05:\x01\x30\x12\x11\n\x06\x62os_id\x18) \x01(\x05:\x01\x31\x12\x11\n\x06\x65os_id\x18* \x01(\x05:\x01\x32\x12\x12\n\x06pad_id\x18+ \x01(\x05:\x02-1\x12\x18\n\tunk_piece\x18- \x01(\t:\x05\x12\x16\n\tbos_piece\x18. \x01(\t:\x03\x12\x17\n\teos_piece\x18/ \x01(\t:\x04\x12\x18\n\tpad_piece\x18\x30 \x01(\t:\x05\x12\x1a\n\x0bunk_surface\x18, \x01(\t:\x05 \xe2\x81\x87 \x12+\n\x1ctrain_extremely_large_corpus\x18\x31 \x01(\x08:\x05\x66\x61lse\"5\n\tModelType\x12\x0b\n\x07UNIGRAM\x10\x01\x12\x07\n\x03\x42PE\x10\x02\x12\x08\n\x04WORD\x10\x03\x12\x08\n\x04\x43HAR\x10\x04*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\"\xd1\x01\n\x0eNormalizerSpec\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x1c\n\x14precompiled_charsmap\x18\x02 \x01(\x0c\x12\x1e\n\x10\x61\x64\x64_dummy_prefix\x18\x03 \x01(\x08:\x04true\x12&\n\x18remove_extra_whitespaces\x18\x04 \x01(\x08:\x04true\x12 \n\x12\x65scape_whitespaces\x18\x05 \x01(\x08:\x04true\x12\x1e\n\x16normalization_rule_tsv\x18\x06 \x01(\t*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\"y\n\x0cSelfTestData\x12\x33\n\x07samples\x18\x01 \x03(\x0b\x32\".sentencepiece.SelfTestData.Sample\x1a)\n\x06Sample\x12\r\n\x05input\x18\x01 \x01(\t\x12\x10\n\x08\x65xpected\x18\x02 \x01(\t*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\"\xfe\x03\n\nModelProto\x12\x37\n\x06pieces\x18\x01 \x03(\x0b\x32\'.sentencepiece.ModelProto.SentencePiece\x12\x30\n\x0ctrainer_spec\x18\x02 \x01(\x0b\x32\x1a.sentencepiece.TrainerSpec\x12\x36\n\x0fnormalizer_spec\x18\x03 \x01(\x0b\x32\x1d.sentencepiece.NormalizerSpec\x12\x33\n\x0eself_test_data\x18\x04 \x01(\x0b\x32\x1b.sentencepiece.SelfTestData\x12\x38\n\x11\x64\x65normalizer_spec\x18\x05 \x01(\x0b\x32\x1d.sentencepiece.NormalizerSpec\x1a\xd2\x01\n\rSentencePiece\x12\r\n\x05piece\x18\x01 \x01(\t\x12\r\n\x05score\x18\x02 \x01(\x02\x12\x42\n\x04type\x18\x03 \x01(\x0e\x32,.sentencepiece.ModelProto.SentencePiece.Type:\x06NORMAL\"T\n\x04Type\x12\n\n\x06NORMAL\x10\x01\x12\x0b\n\x07UNKNOWN\x10\x02\x12\x0b\n\x07\x43ONTROL\x10\x03\x12\x10\n\x0cUSER_DEFINED\x10\x04\x12\x08\n\x04\x42YTE\x10\x06\x12\n\n\x06UNUSED\x10\x05*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\x42\x02H\x03' ) -_sym_db.RegisterFileDescriptor(DESCRIPTOR) @@ -30,28 +29,33 @@ _TRAINERSPEC_MODELTYPE = _descriptor.EnumDescriptor( full_name='sentencepiece.TrainerSpec.ModelType', filename=None, file=DESCRIPTOR, + create_key=_descriptor._internal_create_key, values=[ _descriptor.EnumValueDescriptor( name='UNIGRAM', index=0, number=1, - options=None, - type=None), + serialized_options=None, + type=None, + create_key=_descriptor._internal_create_key), _descriptor.EnumValueDescriptor( name='BPE', index=1, number=2, - options=None, - type=None), + serialized_options=None, + type=None, + create_key=_descriptor._internal_create_key), _descriptor.EnumValueDescriptor( name='WORD', index=2, number=3, - options=None, - type=None), + serialized_options=None, + type=None, + create_key=_descriptor._internal_create_key), _descriptor.EnumValueDescriptor( name='CHAR', index=3, number=4, - options=None, - type=None), + serialized_options=None, + type=None, + create_key=_descriptor._internal_create_key), ], containing_type=None, - options=None, - serialized_start=1121, - serialized_end=1174, + serialized_options=None, + serialized_start=1294, + serialized_end=1347, ) _sym_db.RegisterEnumDescriptor(_TRAINERSPEC_MODELTYPE) @@ -60,32 +64,43 @@ _MODELPROTO_SENTENCEPIECE_TYPE = _descriptor.EnumDescriptor( full_name='sentencepiece.ModelProto.SentencePiece.Type', filename=None, file=DESCRIPTOR, + create_key=_descriptor._internal_create_key, values=[ _descriptor.EnumValueDescriptor( name='NORMAL', index=0, number=1, - options=None, - type=None), + serialized_options=None, + type=None, + create_key=_descriptor._internal_create_key), _descriptor.EnumValueDescriptor( name='UNKNOWN', index=1, number=2, - options=None, - type=None), + serialized_options=None, + type=None, + create_key=_descriptor._internal_create_key), _descriptor.EnumValueDescriptor( name='CONTROL', index=2, number=3, - options=None, - type=None), + serialized_options=None, + type=None, + create_key=_descriptor._internal_create_key), _descriptor.EnumValueDescriptor( name='USER_DEFINED', index=3, number=4, - options=None, - type=None), + serialized_options=None, + type=None, + create_key=_descriptor._internal_create_key), _descriptor.EnumValueDescriptor( - name='UNUSED', index=4, number=5, - options=None, - type=None), + name='BYTE', index=4, number=6, + serialized_options=None, + type=None, + create_key=_descriptor._internal_create_key), + _descriptor.EnumValueDescriptor( + name='UNUSED', index=5, number=5, + serialized_options=None, + type=None, + create_key=_descriptor._internal_create_key), ], containing_type=None, - options=None, - serialized_start=1869, - serialized_end=1943, + serialized_options=None, + serialized_start=2100, + serialized_end=2184, ) _sym_db.RegisterEnumDescriptor(_MODELPROTO_SENTENCEPIECE_TYPE) @@ -96,6 +111,7 @@ _TRAINERSPEC = _descriptor.Descriptor( filename=None, file=DESCRIPTOR, containing_type=None, + create_key=_descriptor._internal_create_key, fields=[ _descriptor.FieldDescriptor( name='input', full_name='sentencepiece.TrainerSpec.input', index=0, @@ -103,245 +119,280 @@ _TRAINERSPEC = _descriptor.Descriptor( has_default_value=False, default_value=[], message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - options=None), + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( name='input_format', full_name='sentencepiece.TrainerSpec.input_format', index=1, number=7, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), + has_default_value=False, default_value=b"".decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - options=None), + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( name='model_prefix', full_name='sentencepiece.TrainerSpec.model_prefix', index=2, number=2, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), + has_default_value=False, default_value=b"".decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - options=None), + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( name='model_type', full_name='sentencepiece.TrainerSpec.model_type', index=3, number=3, type=14, cpp_type=8, label=1, has_default_value=True, default_value=1, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - options=None), + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( name='vocab_size', full_name='sentencepiece.TrainerSpec.vocab_size', index=4, number=4, type=5, cpp_type=1, label=1, has_default_value=True, default_value=8000, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - options=None), + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( name='accept_language', full_name='sentencepiece.TrainerSpec.accept_language', index=5, number=5, type=9, cpp_type=9, label=3, has_default_value=False, default_value=[], message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - options=None), + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( name='self_test_sample_size', full_name='sentencepiece.TrainerSpec.self_test_sample_size', index=6, number=6, type=5, cpp_type=1, label=1, has_default_value=True, default_value=0, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - options=None), + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( name='character_coverage', full_name='sentencepiece.TrainerSpec.character_coverage', index=7, number=10, type=2, cpp_type=6, label=1, has_default_value=True, default_value=float(0.9995), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - options=None), + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( name='input_sentence_size', full_name='sentencepiece.TrainerSpec.input_sentence_size', index=8, number=11, type=5, cpp_type=1, label=1, has_default_value=True, default_value=0, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - options=None), + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( name='shuffle_input_sentence', full_name='sentencepiece.TrainerSpec.shuffle_input_sentence', index=9, number=19, type=8, cpp_type=7, label=1, has_default_value=True, default_value=True, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - options=None), + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( name='mining_sentence_size', full_name='sentencepiece.TrainerSpec.mining_sentence_size', index=10, number=12, type=5, cpp_type=1, label=1, has_default_value=False, default_value=0, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - options=_descriptor._ParseOptions(descriptor_pb2.FieldOptions(), _b('\030\001'))), + serialized_options=b'\030\001', file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( name='training_sentence_size', full_name='sentencepiece.TrainerSpec.training_sentence_size', index=11, number=13, type=5, cpp_type=1, label=1, has_default_value=False, default_value=0, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - options=_descriptor._ParseOptions(descriptor_pb2.FieldOptions(), _b('\030\001'))), + serialized_options=b'\030\001', file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( name='seed_sentencepiece_size', full_name='sentencepiece.TrainerSpec.seed_sentencepiece_size', index=12, number=14, type=5, cpp_type=1, label=1, has_default_value=True, default_value=1000000, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - options=None), + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( name='shrinking_factor', full_name='sentencepiece.TrainerSpec.shrinking_factor', index=13, number=15, type=2, cpp_type=6, label=1, has_default_value=True, default_value=float(0.75), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - options=None), + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( name='max_sentence_length', full_name='sentencepiece.TrainerSpec.max_sentence_length', index=14, number=18, type=5, cpp_type=1, label=1, has_default_value=True, default_value=4192, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - options=None), + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( name='num_threads', full_name='sentencepiece.TrainerSpec.num_threads', index=15, number=16, type=5, cpp_type=1, label=1, has_default_value=True, default_value=16, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - options=None), + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( name='num_sub_iterations', full_name='sentencepiece.TrainerSpec.num_sub_iterations', index=16, number=17, type=5, cpp_type=1, label=1, has_default_value=True, default_value=2, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - options=None), + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( name='max_sentencepiece_length', full_name='sentencepiece.TrainerSpec.max_sentencepiece_length', index=17, number=20, type=5, cpp_type=1, label=1, has_default_value=True, default_value=16, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - options=None), + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( name='split_by_unicode_script', full_name='sentencepiece.TrainerSpec.split_by_unicode_script', index=18, number=21, type=8, cpp_type=7, label=1, has_default_value=True, default_value=True, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - options=None), + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( name='split_by_number', full_name='sentencepiece.TrainerSpec.split_by_number', index=19, number=23, type=8, cpp_type=7, label=1, has_default_value=True, default_value=True, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - options=None), + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( name='split_by_whitespace', full_name='sentencepiece.TrainerSpec.split_by_whitespace', index=20, number=22, type=8, cpp_type=7, label=1, has_default_value=True, default_value=True, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - options=None), + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( name='treat_whitespace_as_suffix', full_name='sentencepiece.TrainerSpec.treat_whitespace_as_suffix', index=21, number=24, type=8, cpp_type=7, label=1, has_default_value=True, default_value=False, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - options=None), + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( - name='control_symbols', full_name='sentencepiece.TrainerSpec.control_symbols', index=22, + name='split_digits', full_name='sentencepiece.TrainerSpec.split_digits', index=22, + number=25, type=8, cpp_type=7, label=1, + has_default_value=True, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='control_symbols', full_name='sentencepiece.TrainerSpec.control_symbols', index=23, number=30, type=9, cpp_type=9, label=3, has_default_value=False, default_value=[], message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - options=None), + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( - name='user_defined_symbols', full_name='sentencepiece.TrainerSpec.user_defined_symbols', index=23, + name='user_defined_symbols', full_name='sentencepiece.TrainerSpec.user_defined_symbols', index=24, number=31, type=9, cpp_type=9, label=3, has_default_value=False, default_value=[], message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - options=None), + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( - name='hard_vocab_limit', full_name='sentencepiece.TrainerSpec.hard_vocab_limit', index=24, + name='required_chars', full_name='sentencepiece.TrainerSpec.required_chars', index=25, + number=36, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=b"".decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='byte_fallback', full_name='sentencepiece.TrainerSpec.byte_fallback', index=26, + number=35, type=8, cpp_type=7, label=1, + has_default_value=True, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='vocabulary_output_piece_score', full_name='sentencepiece.TrainerSpec.vocabulary_output_piece_score', index=27, + number=32, type=8, cpp_type=7, label=1, + has_default_value=True, default_value=True, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='hard_vocab_limit', full_name='sentencepiece.TrainerSpec.hard_vocab_limit', index=28, number=33, type=8, cpp_type=7, label=1, has_default_value=True, default_value=True, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - options=None), + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( - name='use_all_vocab', full_name='sentencepiece.TrainerSpec.use_all_vocab', index=25, + name='use_all_vocab', full_name='sentencepiece.TrainerSpec.use_all_vocab', index=29, number=34, type=8, cpp_type=7, label=1, has_default_value=True, default_value=False, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - options=None), + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( - name='unk_id', full_name='sentencepiece.TrainerSpec.unk_id', index=26, + name='unk_id', full_name='sentencepiece.TrainerSpec.unk_id', index=30, number=40, type=5, cpp_type=1, label=1, has_default_value=True, default_value=0, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - options=None), + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( - name='bos_id', full_name='sentencepiece.TrainerSpec.bos_id', index=27, + name='bos_id', full_name='sentencepiece.TrainerSpec.bos_id', index=31, number=41, type=5, cpp_type=1, label=1, has_default_value=True, default_value=1, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - options=None), + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( - name='eos_id', full_name='sentencepiece.TrainerSpec.eos_id', index=28, + name='eos_id', full_name='sentencepiece.TrainerSpec.eos_id', index=32, number=42, type=5, cpp_type=1, label=1, has_default_value=True, default_value=2, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - options=None), + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( - name='pad_id', full_name='sentencepiece.TrainerSpec.pad_id', index=29, + name='pad_id', full_name='sentencepiece.TrainerSpec.pad_id', index=33, number=43, type=5, cpp_type=1, label=1, has_default_value=True, default_value=-1, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - options=None), + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( - name='unk_piece', full_name='sentencepiece.TrainerSpec.unk_piece', index=30, + name='unk_piece', full_name='sentencepiece.TrainerSpec.unk_piece', index=34, number=45, type=9, cpp_type=9, label=1, - has_default_value=True, default_value=_b("").decode('utf-8'), + has_default_value=True, default_value=b"".decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - options=None), + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( - name='bos_piece', full_name='sentencepiece.TrainerSpec.bos_piece', index=31, + name='bos_piece', full_name='sentencepiece.TrainerSpec.bos_piece', index=35, number=46, type=9, cpp_type=9, label=1, - has_default_value=True, default_value=_b("").decode('utf-8'), + has_default_value=True, default_value=b"".decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - options=None), + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( - name='eos_piece', full_name='sentencepiece.TrainerSpec.eos_piece', index=32, + name='eos_piece', full_name='sentencepiece.TrainerSpec.eos_piece', index=36, number=47, type=9, cpp_type=9, label=1, - has_default_value=True, default_value=_b("").decode('utf-8'), + has_default_value=True, default_value=b"".decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - options=None), + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( - name='pad_piece', full_name='sentencepiece.TrainerSpec.pad_piece', index=33, + name='pad_piece', full_name='sentencepiece.TrainerSpec.pad_piece', index=37, number=48, type=9, cpp_type=9, label=1, - has_default_value=True, default_value=_b("").decode('utf-8'), + has_default_value=True, default_value=b"".decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - options=None), + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( - name='unk_surface', full_name='sentencepiece.TrainerSpec.unk_surface', index=34, + name='unk_surface', full_name='sentencepiece.TrainerSpec.unk_surface', index=38, number=44, type=9, cpp_type=9, label=1, - has_default_value=True, default_value=_b(" \342\201\207 ").decode('utf-8'), + has_default_value=True, default_value=b" \342\201\207 ".decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - options=None), + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='train_extremely_large_corpus', full_name='sentencepiece.TrainerSpec.train_extremely_large_corpus', index=39, + number=49, type=8, cpp_type=7, label=1, + has_default_value=True, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), ], extensions=[ ], @@ -349,14 +400,14 @@ _TRAINERSPEC = _descriptor.Descriptor( enum_types=[ _TRAINERSPEC_MODELTYPE, ], - options=None, + serialized_options=None, is_extendable=True, syntax='proto2', extension_ranges=[(200, 536870912), ], oneofs=[ ], serialized_start=45, - serialized_end=1185, + serialized_end=1358, ) @@ -366,63 +417,64 @@ _NORMALIZERSPEC = _descriptor.Descriptor( filename=None, file=DESCRIPTOR, containing_type=None, + create_key=_descriptor._internal_create_key, fields=[ _descriptor.FieldDescriptor( name='name', full_name='sentencepiece.NormalizerSpec.name', index=0, number=1, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), + has_default_value=False, default_value=b"".decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - options=None), + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( name='precompiled_charsmap', full_name='sentencepiece.NormalizerSpec.precompiled_charsmap', index=1, number=2, type=12, cpp_type=9, label=1, - has_default_value=False, default_value=_b(""), + has_default_value=False, default_value=b"", message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - options=None), + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( name='add_dummy_prefix', full_name='sentencepiece.NormalizerSpec.add_dummy_prefix', index=2, number=3, type=8, cpp_type=7, label=1, has_default_value=True, default_value=True, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - options=None), + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( name='remove_extra_whitespaces', full_name='sentencepiece.NormalizerSpec.remove_extra_whitespaces', index=3, number=4, type=8, cpp_type=7, label=1, has_default_value=True, default_value=True, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - options=None), + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( name='escape_whitespaces', full_name='sentencepiece.NormalizerSpec.escape_whitespaces', index=4, number=5, type=8, cpp_type=7, label=1, has_default_value=True, default_value=True, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - options=None), + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( name='normalization_rule_tsv', full_name='sentencepiece.NormalizerSpec.normalization_rule_tsv', index=5, number=6, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), + has_default_value=False, default_value=b"".decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - options=None), + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), ], extensions=[ ], nested_types=[], enum_types=[ ], - options=None, + serialized_options=None, is_extendable=True, syntax='proto2', extension_ranges=[(200, 536870912), ], oneofs=[ ], - serialized_start=1188, - serialized_end=1397, + serialized_start=1361, + serialized_end=1570, ) @@ -432,35 +484,36 @@ _SELFTESTDATA_SAMPLE = _descriptor.Descriptor( filename=None, file=DESCRIPTOR, containing_type=None, + create_key=_descriptor._internal_create_key, fields=[ _descriptor.FieldDescriptor( name='input', full_name='sentencepiece.SelfTestData.Sample.input', index=0, number=1, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), + has_default_value=False, default_value=b"".decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - options=None), + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( name='expected', full_name='sentencepiece.SelfTestData.Sample.expected', index=1, number=2, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), + has_default_value=False, default_value=b"".decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - options=None), + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), ], extensions=[ ], nested_types=[], enum_types=[ ], - options=None, + serialized_options=None, is_extendable=False, syntax='proto2', extension_ranges=[], oneofs=[ ], - serialized_start=1468, - serialized_end=1509, + serialized_start=1641, + serialized_end=1682, ) _SELFTESTDATA = _descriptor.Descriptor( @@ -469,6 +522,7 @@ _SELFTESTDATA = _descriptor.Descriptor( filename=None, file=DESCRIPTOR, containing_type=None, + create_key=_descriptor._internal_create_key, fields=[ _descriptor.FieldDescriptor( name='samples', full_name='sentencepiece.SelfTestData.samples', index=0, @@ -476,21 +530,21 @@ _SELFTESTDATA = _descriptor.Descriptor( has_default_value=False, default_value=[], message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - options=None), + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), ], extensions=[ ], nested_types=[_SELFTESTDATA_SAMPLE, ], enum_types=[ ], - options=None, + serialized_options=None, is_extendable=True, syntax='proto2', extension_ranges=[(200, 536870912), ], oneofs=[ ], - serialized_start=1399, - serialized_end=1520, + serialized_start=1572, + serialized_end=1693, ) @@ -500,28 +554,29 @@ _MODELPROTO_SENTENCEPIECE = _descriptor.Descriptor( filename=None, file=DESCRIPTOR, containing_type=None, + create_key=_descriptor._internal_create_key, fields=[ _descriptor.FieldDescriptor( name='piece', full_name='sentencepiece.ModelProto.SentencePiece.piece', index=0, number=1, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), + has_default_value=False, default_value=b"".decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - options=None), + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( name='score', full_name='sentencepiece.ModelProto.SentencePiece.score', index=1, number=2, type=2, cpp_type=6, label=1, has_default_value=False, default_value=float(0), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - options=None), + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( name='type', full_name='sentencepiece.ModelProto.SentencePiece.type', index=2, number=3, type=14, cpp_type=8, label=1, has_default_value=True, default_value=1, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - options=None), + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), ], extensions=[ ], @@ -529,14 +584,14 @@ _MODELPROTO_SENTENCEPIECE = _descriptor.Descriptor( enum_types=[ _MODELPROTO_SENTENCEPIECE_TYPE, ], - options=None, + serialized_options=None, is_extendable=True, syntax='proto2', extension_ranges=[(200, 536870912), ], oneofs=[ ], - serialized_start=1754, - serialized_end=1954, + serialized_start=1985, + serialized_end=2195, ) _MODELPROTO = _descriptor.Descriptor( @@ -545,6 +600,7 @@ _MODELPROTO = _descriptor.Descriptor( filename=None, file=DESCRIPTOR, containing_type=None, + create_key=_descriptor._internal_create_key, fields=[ _descriptor.FieldDescriptor( name='pieces', full_name='sentencepiece.ModelProto.pieces', index=0, @@ -552,42 +608,49 @@ _MODELPROTO = _descriptor.Descriptor( has_default_value=False, default_value=[], message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - options=None), + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( name='trainer_spec', full_name='sentencepiece.ModelProto.trainer_spec', index=1, number=2, type=11, cpp_type=10, label=1, has_default_value=False, default_value=None, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - options=None), + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( name='normalizer_spec', full_name='sentencepiece.ModelProto.normalizer_spec', index=2, number=3, type=11, cpp_type=10, label=1, has_default_value=False, default_value=None, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - options=None), + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( name='self_test_data', full_name='sentencepiece.ModelProto.self_test_data', index=3, number=4, type=11, cpp_type=10, label=1, has_default_value=False, default_value=None, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - options=None), + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='denormalizer_spec', full_name='sentencepiece.ModelProto.denormalizer_spec', index=4, + number=5, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), ], extensions=[ ], nested_types=[_MODELPROTO_SENTENCEPIECE, ], enum_types=[ ], - options=None, + serialized_options=None, is_extendable=True, syntax='proto2', extension_ranges=[(200, 536870912), ], oneofs=[ ], - serialized_start=1523, - serialized_end=1965, + serialized_start=1696, + serialized_end=2206, ) _TRAINERSPEC.fields_by_name['model_type'].enum_type = _TRAINERSPEC_MODELTYPE @@ -601,60 +664,59 @@ _MODELPROTO.fields_by_name['pieces'].message_type = _MODELPROTO_SENTENCEPIECE _MODELPROTO.fields_by_name['trainer_spec'].message_type = _TRAINERSPEC _MODELPROTO.fields_by_name['normalizer_spec'].message_type = _NORMALIZERSPEC _MODELPROTO.fields_by_name['self_test_data'].message_type = _SELFTESTDATA +_MODELPROTO.fields_by_name['denormalizer_spec'].message_type = _NORMALIZERSPEC DESCRIPTOR.message_types_by_name['TrainerSpec'] = _TRAINERSPEC DESCRIPTOR.message_types_by_name['NormalizerSpec'] = _NORMALIZERSPEC DESCRIPTOR.message_types_by_name['SelfTestData'] = _SELFTESTDATA DESCRIPTOR.message_types_by_name['ModelProto'] = _MODELPROTO +_sym_db.RegisterFileDescriptor(DESCRIPTOR) -TrainerSpec = _reflection.GeneratedProtocolMessageType('TrainerSpec', (_message.Message,), dict( - DESCRIPTOR = _TRAINERSPEC, - __module__ = 'sentencepiece_model_pb2' +TrainerSpec = _reflection.GeneratedProtocolMessageType('TrainerSpec', (_message.Message,), { + 'DESCRIPTOR' : _TRAINERSPEC, + '__module__' : 'sentencepiece_model_pb2' # @@protoc_insertion_point(class_scope:sentencepiece.TrainerSpec) - )) + }) _sym_db.RegisterMessage(TrainerSpec) -NormalizerSpec = _reflection.GeneratedProtocolMessageType('NormalizerSpec', (_message.Message,), dict( - DESCRIPTOR = _NORMALIZERSPEC, - __module__ = 'sentencepiece_model_pb2' +NormalizerSpec = _reflection.GeneratedProtocolMessageType('NormalizerSpec', (_message.Message,), { + 'DESCRIPTOR' : _NORMALIZERSPEC, + '__module__' : 'sentencepiece_model_pb2' # @@protoc_insertion_point(class_scope:sentencepiece.NormalizerSpec) - )) + }) _sym_db.RegisterMessage(NormalizerSpec) -SelfTestData = _reflection.GeneratedProtocolMessageType('SelfTestData', (_message.Message,), dict( +SelfTestData = _reflection.GeneratedProtocolMessageType('SelfTestData', (_message.Message,), { - Sample = _reflection.GeneratedProtocolMessageType('Sample', (_message.Message,), dict( - DESCRIPTOR = _SELFTESTDATA_SAMPLE, - __module__ = 'sentencepiece_model_pb2' + 'Sample' : _reflection.GeneratedProtocolMessageType('Sample', (_message.Message,), { + 'DESCRIPTOR' : _SELFTESTDATA_SAMPLE, + '__module__' : 'sentencepiece_model_pb2' # @@protoc_insertion_point(class_scope:sentencepiece.SelfTestData.Sample) - )) + }) , - DESCRIPTOR = _SELFTESTDATA, - __module__ = 'sentencepiece_model_pb2' + 'DESCRIPTOR' : _SELFTESTDATA, + '__module__' : 'sentencepiece_model_pb2' # @@protoc_insertion_point(class_scope:sentencepiece.SelfTestData) - )) + }) _sym_db.RegisterMessage(SelfTestData) _sym_db.RegisterMessage(SelfTestData.Sample) -ModelProto = _reflection.GeneratedProtocolMessageType('ModelProto', (_message.Message,), dict( +ModelProto = _reflection.GeneratedProtocolMessageType('ModelProto', (_message.Message,), { - SentencePiece = _reflection.GeneratedProtocolMessageType('SentencePiece', (_message.Message,), dict( - DESCRIPTOR = _MODELPROTO_SENTENCEPIECE, - __module__ = 'sentencepiece_model_pb2' + 'SentencePiece' : _reflection.GeneratedProtocolMessageType('SentencePiece', (_message.Message,), { + 'DESCRIPTOR' : _MODELPROTO_SENTENCEPIECE, + '__module__' : 'sentencepiece_model_pb2' # @@protoc_insertion_point(class_scope:sentencepiece.ModelProto.SentencePiece) - )) + }) , - DESCRIPTOR = _MODELPROTO, - __module__ = 'sentencepiece_model_pb2' + 'DESCRIPTOR' : _MODELPROTO, + '__module__' : 'sentencepiece_model_pb2' # @@protoc_insertion_point(class_scope:sentencepiece.ModelProto) - )) + }) _sym_db.RegisterMessage(ModelProto) _sym_db.RegisterMessage(ModelProto.SentencePiece) -DESCRIPTOR.has_options = True -DESCRIPTOR._options = _descriptor._ParseOptions(descriptor_pb2.FileOptions(), _b('H\003')) -_TRAINERSPEC.fields_by_name['mining_sentence_size'].has_options = True -_TRAINERSPEC.fields_by_name['mining_sentence_size']._options = _descriptor._ParseOptions(descriptor_pb2.FieldOptions(), _b('\030\001')) -_TRAINERSPEC.fields_by_name['training_sentence_size'].has_options = True -_TRAINERSPEC.fields_by_name['training_sentence_size']._options = _descriptor._ParseOptions(descriptor_pb2.FieldOptions(), _b('\030\001')) +DESCRIPTOR._options = None +_TRAINERSPEC.fields_by_name['mining_sentence_size']._options = None +_TRAINERSPEC.fields_by_name['training_sentence_size']._options = None # @@protoc_insertion_point(module_scope) diff --git a/python/src/sentencepiece/sentencepiece_pb2.py b/python/src/sentencepiece/sentencepiece_pb2.py index 8347974..00f1b28 100644 --- a/python/src/sentencepiece/sentencepiece_pb2.py +++ b/python/src/sentencepiece/sentencepiece_pb2.py @@ -1,13 +1,11 @@ +# -*- coding: utf-8 -*- # Generated by the protocol buffer compiler. DO NOT EDIT! # source: sentencepiece.proto - -import sys -_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1')) +"""Generated protocol buffer code.""" from google.protobuf import descriptor as _descriptor from google.protobuf import message as _message from google.protobuf import reflection as _reflection from google.protobuf import symbol_database as _symbol_database -from google.protobuf import descriptor_pb2 # @@protoc_insertion_point(imports) _sym_db = _symbol_database.Default() @@ -19,9 +17,10 @@ DESCRIPTOR = _descriptor.FileDescriptor( name='sentencepiece.proto', package='sentencepiece', syntax='proto2', - serialized_pb=_b('\n\x13sentencepiece.proto\x12\rsentencepiece\"\xdf\x01\n\x11SentencePieceText\x12\x0c\n\x04text\x18\x01 \x01(\t\x12>\n\x06pieces\x18\x02 \x03(\x0b\x32..sentencepiece.SentencePieceText.SentencePiece\x12\r\n\x05score\x18\x03 \x01(\x02\x1a\x62\n\rSentencePiece\x12\r\n\x05piece\x18\x01 \x01(\t\x12\n\n\x02id\x18\x02 \x01(\r\x12\x0f\n\x07surface\x18\x03 \x01(\t\x12\r\n\x05\x62\x65gin\x18\x04 \x01(\r\x12\x0b\n\x03\x65nd\x18\x05 \x01(\r*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\"J\n\x16NBestSentencePieceText\x12\x30\n\x06nbests\x18\x01 \x03(\x0b\x32 .sentencepiece.SentencePieceTextB\x02H\x03') + serialized_options=b'H\003', + create_key=_descriptor._internal_create_key, + serialized_pb=b'\n\x13sentencepiece.proto\x12\rsentencepiece\"\xdf\x01\n\x11SentencePieceText\x12\x0c\n\x04text\x18\x01 \x01(\t\x12>\n\x06pieces\x18\x02 \x03(\x0b\x32..sentencepiece.SentencePieceText.SentencePiece\x12\r\n\x05score\x18\x03 \x01(\x02\x1a\x62\n\rSentencePiece\x12\r\n\x05piece\x18\x01 \x01(\t\x12\n\n\x02id\x18\x02 \x01(\r\x12\x0f\n\x07surface\x18\x03 \x01(\t\x12\r\n\x05\x62\x65gin\x18\x04 \x01(\r\x12\x0b\n\x03\x65nd\x18\x05 \x01(\r*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\"J\n\x16NBestSentencePieceText\x12\x30\n\x06nbests\x18\x01 \x03(\x0b\x32 .sentencepiece.SentencePieceTextB\x02H\x03' ) -_sym_db.RegisterFileDescriptor(DESCRIPTOR) @@ -32,49 +31,50 @@ _SENTENCEPIECETEXT_SENTENCEPIECE = _descriptor.Descriptor( filename=None, file=DESCRIPTOR, containing_type=None, + create_key=_descriptor._internal_create_key, fields=[ _descriptor.FieldDescriptor( name='piece', full_name='sentencepiece.SentencePieceText.SentencePiece.piece', index=0, number=1, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), + has_default_value=False, default_value=b"".decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - options=None), + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( name='id', full_name='sentencepiece.SentencePieceText.SentencePiece.id', index=1, number=2, type=13, cpp_type=3, label=1, has_default_value=False, default_value=0, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - options=None), + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( name='surface', full_name='sentencepiece.SentencePieceText.SentencePiece.surface', index=2, number=3, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), + has_default_value=False, default_value=b"".decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - options=None), + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( name='begin', full_name='sentencepiece.SentencePieceText.SentencePiece.begin', index=3, number=4, type=13, cpp_type=3, label=1, has_default_value=False, default_value=0, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - options=None), + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( name='end', full_name='sentencepiece.SentencePieceText.SentencePiece.end', index=4, number=5, type=13, cpp_type=3, label=1, has_default_value=False, default_value=0, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - options=None), + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), ], extensions=[ ], nested_types=[], enum_types=[ ], - options=None, + serialized_options=None, is_extendable=True, syntax='proto2', extension_ranges=[(200, 536870912), ], @@ -90,35 +90,36 @@ _SENTENCEPIECETEXT = _descriptor.Descriptor( filename=None, file=DESCRIPTOR, containing_type=None, + create_key=_descriptor._internal_create_key, fields=[ _descriptor.FieldDescriptor( name='text', full_name='sentencepiece.SentencePieceText.text', index=0, number=1, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), + has_default_value=False, default_value=b"".decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - options=None), + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( name='pieces', full_name='sentencepiece.SentencePieceText.pieces', index=1, number=2, type=11, cpp_type=10, label=3, has_default_value=False, default_value=[], message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - options=None), + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( name='score', full_name='sentencepiece.SentencePieceText.score', index=2, number=3, type=2, cpp_type=6, label=1, has_default_value=False, default_value=float(0), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - options=None), + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), ], extensions=[ ], nested_types=[_SENTENCEPIECETEXT_SENTENCEPIECE, ], enum_types=[ ], - options=None, + serialized_options=None, is_extendable=True, syntax='proto2', extension_ranges=[(200, 536870912), ], @@ -135,6 +136,7 @@ _NBESTSENTENCEPIECETEXT = _descriptor.Descriptor( filename=None, file=DESCRIPTOR, containing_type=None, + create_key=_descriptor._internal_create_key, fields=[ _descriptor.FieldDescriptor( name='nbests', full_name='sentencepiece.NBestSentencePieceText.nbests', index=0, @@ -142,14 +144,14 @@ _NBESTSENTENCEPIECETEXT = _descriptor.Descriptor( has_default_value=False, default_value=[], message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - options=None), + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), ], extensions=[ ], nested_types=[], enum_types=[ ], - options=None, + serialized_options=None, is_extendable=False, syntax='proto2', extension_ranges=[], @@ -164,30 +166,30 @@ _SENTENCEPIECETEXT.fields_by_name['pieces'].message_type = _SENTENCEPIECETEXT_SE _NBESTSENTENCEPIECETEXT.fields_by_name['nbests'].message_type = _SENTENCEPIECETEXT DESCRIPTOR.message_types_by_name['SentencePieceText'] = _SENTENCEPIECETEXT DESCRIPTOR.message_types_by_name['NBestSentencePieceText'] = _NBESTSENTENCEPIECETEXT +_sym_db.RegisterFileDescriptor(DESCRIPTOR) -SentencePieceText = _reflection.GeneratedProtocolMessageType('SentencePieceText', (_message.Message,), dict( +SentencePieceText = _reflection.GeneratedProtocolMessageType('SentencePieceText', (_message.Message,), { - SentencePiece = _reflection.GeneratedProtocolMessageType('SentencePiece', (_message.Message,), dict( - DESCRIPTOR = _SENTENCEPIECETEXT_SENTENCEPIECE, - __module__ = 'sentencepiece_pb2' + 'SentencePiece' : _reflection.GeneratedProtocolMessageType('SentencePiece', (_message.Message,), { + 'DESCRIPTOR' : _SENTENCEPIECETEXT_SENTENCEPIECE, + '__module__' : 'sentencepiece_pb2' # @@protoc_insertion_point(class_scope:sentencepiece.SentencePieceText.SentencePiece) - )) + }) , - DESCRIPTOR = _SENTENCEPIECETEXT, - __module__ = 'sentencepiece_pb2' + 'DESCRIPTOR' : _SENTENCEPIECETEXT, + '__module__' : 'sentencepiece_pb2' # @@protoc_insertion_point(class_scope:sentencepiece.SentencePieceText) - )) + }) _sym_db.RegisterMessage(SentencePieceText) _sym_db.RegisterMessage(SentencePieceText.SentencePiece) -NBestSentencePieceText = _reflection.GeneratedProtocolMessageType('NBestSentencePieceText', (_message.Message,), dict( - DESCRIPTOR = _NBESTSENTENCEPIECETEXT, - __module__ = 'sentencepiece_pb2' +NBestSentencePieceText = _reflection.GeneratedProtocolMessageType('NBestSentencePieceText', (_message.Message,), { + 'DESCRIPTOR' : _NBESTSENTENCEPIECETEXT, + '__module__' : 'sentencepiece_pb2' # @@protoc_insertion_point(class_scope:sentencepiece.NBestSentencePieceText) - )) + }) _sym_db.RegisterMessage(NBestSentencePieceText) -DESCRIPTOR.has_options = True -DESCRIPTOR._options = _descriptor._ParseOptions(descriptor_pb2.FileOptions(), _b('H\003')) +DESCRIPTOR._options = None # @@protoc_insertion_point(module_scope)