mirror of
https://github.com/google/sentencepiece.git
synced 2024-09-11 10:55:42 +03:00
176 lines
5.9 KiB
Perl
Executable File
176 lines
5.9 KiB
Perl
Executable File
#!/usr/bin/perl
|
|
|
|
# Copyright 2018 Google Inc.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
# Generate spec_parser.h from sentencepiece_model.proto
|
|
#
|
|
# usage: ./gen_spec_parser.pl sentencepiece_model.proto > spec_parser.h
|
|
|
|
use strict;
|
|
use warnings;
|
|
|
|
# Prints C++ "PrintProto" helper functions, wrapped in an anonymous
# namespace, for the TrainerSpec / NormalizerSpec messages found in a .proto
# file.  Output goes to the currently selected output handle (normally
# STDOUT).
#
# Argument:
#   $filename - path of the .proto file to scan.
#
# Dies if the file cannot be opened.
#
# The scan is purely line based: a line starting with "message NAME" opens a
# message, a line starting with "}" closes it, "enum NAME" starts an enum
# whose values are read up to the next line containing "}", and
# "optional|repeated TYPE NAME = N" declares a field.
#
# Note: the sub is declared without the former empty prototype "()" because
# it takes one argument; calls written as &ProcessPrinter($file) keep working.
sub ProcessPrinter {
  my ($filename) = @_;
  my $classname = "";
  my $valid = 0;  # True while inside a message we generate code for.
  my %enum;       # Names of all enum types seen so far.

  # Three-argument open with a lexical handle: the file name can never be
  # interpreted as an open mode or a pipe, and the handle is private to this
  # sub.
  open(my $fh, '<', $filename) or die "cannot open $filename: $!";

  print "namespace {\n";
  while (my $line = <$fh>) {
    chomp $line;
    if ($line =~ /^\s*message (\S+)/) {
      $classname = $1;
      $valid = 0;
      if ($classname =~ /(TrainerSpec|NormalizerSpec)/) {
        print "inline std::string PrintProto(const ${classname} &message) {\n";
        print " std::ostringstream os;\n\n";
        print " os << \"${classname} {\\n\";\n";
        $valid = 1;
      }
    } elsif ($line =~ /^\s*}/) {
      next if (!$valid);
      print " os << \"}\\n\";\n";
      print "\n return os.str();\n";
      print "}\n\n";
    } elsif ($line =~ /enum\s*(\S+)/) {
      my $name = $1;
      # Remember every enum name, even outside the selected messages, so
      # that fields of this type can be recognized later.
      $enum{$name} = 1;
      next if (!$valid);
      # "${classname}::" must be written with braces: a bare "$classname::"
      # inside a double-quoted string is parsed as the package variable
      # named "classname::", which drops the class qualifier from the output.
      print " static const std::map<${classname}::${name}, std::string> k${name}_Map = { ";
      # Consume the enum body here so that its closing "}" is not mistaken
      # for the end of the enclosing message.
      while (my $enum_line = <$fh>) {
        if ($enum_line =~ /(\S+)\s*=\s*(\d+)/) {
          print "{${classname}::$1, \"$1\"}, ";
        } elsif ($enum_line =~ /}/) {
          print " };\n";
          last;
        }
      }
    } elsif ($line =~ /\s*(repeated|optional)\s+(\S+)\s+(\S+)\s*=\s*(\d+)/) {
      # Save the captures before running any other pattern match.
      my ($opt, $type, $name) = ($1, $2, $3);
      next if ($line =~ /deprecated = true/);
      next if (!$valid);
      if ($type =~ /(int|double|float|bool|string)/) {
        # Scalar-like types are streamed directly.
        if ($opt eq "optional") {
          print " os << \" ${name}: \" << message.${name}() << \"\\n\";\n";
        } else {
          print " for (const auto &v : message.${name}())\n";
          print " os << \" ${name}: \" << v << \"\\n\";\n";
        }
      } elsif (defined $enum{$type}) {
        # Enum values are printed by name through the k<Type>_Map table.
        if ($opt eq "optional") {
          print " {\n";
          print " const auto it = k${type}_Map.find(message.${name}());\n";
          print " if (it == k${type}_Map.end())\n";
          print " os << \" ${name}: unknown\\n\";\n";
          print " else\n";
          print " os << \" ${name}: \" << it->second << \"\\n\";\n";
          print " }\n";
        } else {
          print " for (const auto &v : message.${name}()) {\n";
          print " const auto it = k${type}_Map.find(v);\n";
          print " if (it == k${type}_Map.end())\n";
          print " os << \" ${name}: unknown\\n\";\n";
          print " else\n";
          print " os << \" ${name}: \" << it->second << \"\\n\";\n";
          print " }\n";
        }
      }
    }
  }
  print "} // namespace\n\n";
  close($fh);
}
|
|
|
|
# Prints C++ "SentencePieceTrainer::SetProtoField" definitions for the
# TrainerSpec / NormalizerSpec messages found in a .proto file.  Each
# generated function compares its `name` argument against every
# non-deprecated field and stores the parsed `value` into the message.
# Output goes to the currently selected output handle (normally STDOUT).
#
# Argument:
#   $filename - path of the .proto file to scan.
#
# Dies if the file cannot be opened.
#
# The scan is purely line based: a line starting with "message NAME" opens a
# message, a line starting with "}" closes it, "enum NAME" starts an enum
# whose values are read up to the next line containing "}", and
# "optional|repeated TYPE NAME = N" declares a field.
#
# Note: the sub is declared without the former empty prototype "()" because
# it takes one argument; calls written as &ProcessParser($file) keep working.
sub ProcessParser {
  my ($filename) = @_;
  my $classname = "";
  my $valid = 0;  # True while inside a message we generate code for.
  my %enum;       # Names of all enum types seen so far.

  # Three-argument open with a lexical handle: the file name can never be
  # interpreted as an open mode or a pipe, and the handle is private to this
  # sub.
  open(my $fh, '<', $filename) or die "cannot open $filename: $!";

  while (my $line = <$fh>) {
    if ($line =~ /^\s*message (\S+)/) {
      $classname = $1;
      $valid = 0;
      if ($classname =~ /(TrainerSpec|NormalizerSpec)/) {
        print "util::Status SentencePieceTrainer::SetProtoField(const std::string& name, const std::string& value, ${classname} *message) {\n";
        print " CHECK_OR_RETURN(message);\n\n";
        $valid = 1;
      }
    } elsif ($line =~ /^\s*}/) {
      next if (!$valid);
      # No field matched `name`: the generated function ends with an error.
      print " return util::StatusBuilder(util::error::NOT_FOUND)\n";
      print " << \"unknown field name \\\"\" << name << \"\\\" in ${classname}.\";\n";
      print "}\n\n";
    } elsif ($line =~ /enum\s*(\S+)/) {
      my $name = $1;
      # Remember every enum name, even outside the selected messages, so
      # that fields of this type can be recognized later.
      $enum{$name} = 1;
      next if (!$valid);
      # "${classname}::" must be written with braces: a bare "$classname::"
      # inside a double-quoted string is parsed as the package variable
      # named "classname::", which drops the class qualifier from the output.
      print " static const std::map <std::string, ${classname}::${name}> k${name}_Map = { ";
      # Consume the enum body here so that its closing "}" is not mistaken
      # for the end of the enclosing message.
      while (my $enum_line = <$fh>) {
        if ($enum_line =~ /(\S+)\s*=\s*(\d+)/) {
          print "{\"$1\", ${classname}::$1}, ";
        } elsif ($enum_line =~ /}/) {
          print " };\n\n";
          last;
        }
      }
    } elsif ($line =~ /\s*(repeated|optional)\s+(\S+)\s+(\S+)\s*=\s*(\d+)/) {
      # Save the captures before running any other pattern match.
      my ($opt, $type, $name) = ($1, $2, $3);
      next if ($line =~ /deprecated = true/);
      next if (!$valid);
      # Optional fields use the set_<name> accessor, repeated ones add_<name>.
      my $func_prefix = $opt eq "optional" ? "set_" : "add_";
      my $body = "";
      if ($type =~ /(int|double|float|bool)/) {
        # An empty value is parsed as "true" for bool fields and as the
        # empty string for the other numeric types.
        my $empty = $type eq "bool" ? "\"true\"" : "\"\"";
        $body =
            "${type} v;\n" .
            " if (!string_util::lexical_cast(val.empty() ? ${empty} : val, &v))\n" .
            " return util::StatusBuilder(util::error::INVALID_ARGUMENT) << \"cannot parse \\\"\" << val << \"\\\" as ${type}.\";\n" .
            " message->${func_prefix}${name}(v);\n";
      } elsif ($type =~ /string/) {
        $body = "message->${func_prefix}${name}(val);\n";
      } elsif ($type =~ /bytes/) {
        $body = "message->${func_prefix}${name}(val.data(), val.size());\n";
      } elsif (defined $enum{$type}) {
        # Enum values are looked up by upper-cased name in k<Type>_Map.
        $body = "const auto it = k${type}_Map.find(string_util::ToUpper(val));\n" .
            " if (it == k${type}_Map.end())\n" .
            " return util::StatusBuilder(util::error::INVALID_ARGUMENT) << \"unknown enumeration value of \\\"\" << val << \"\\\" as ${type}.\";\n" .
            " message->${func_prefix}${name}(it->second);\n";
      }
      print " if (name == \"${name}\") {\n";
      if ($opt eq "repeated") {
        # Repeated fields take a comma separated list of values.
        print " for (const auto &val : string_util::Split(value, \",\")) {\n";
        print " ${body}";
        print " }\n";
      } else {
        print " const auto &val = value;\n";
        print " ${body}";
      }
      print " return util::OkStatus();\n";
      print " }\n\n";
    }
  }
  close($fh);
}
|
|
|
|
# Driver: for every .proto path given on the command line, emit the printer
# helpers first and the field parsers second.
foreach my $proto_path (@ARGV) {
  &ProcessPrinter($proto_path);
  &ProcessParser($proto_path);
}
|