sentencepiece/data/gen_spec_parser.pl
2019-01-08 19:25:11 +09:00

176 lines
5.9 KiB
Perl
Executable File

#!/usr/bin/perl
# Copyright 2018 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Generates spec_parser.h from sentencepiece_model.proto.
#
# Usage: ./gen_spec_parser.pl sentencepiece_model.proto > spec_parser.h
use strict;
use warnings;
# ProcessPrinter($filename)
#
# Reads the .proto file at $filename and writes C++ source to the currently
# selected output handle (STDOUT by default).  For every message whose name
# matches TrainerSpec or NormalizerSpec it emits an inline
# `std::string PrintProto(const <Message> &message)` function, all wrapped in
# an anonymous namespace.
#
# Line handling, in order of precedence:
#   * `message X`           - starts a new message; output is enabled only
#                             for the two spec messages.
#   * a line starting `}`   - closes the function of an enabled message.
#   * `enum X`              - records X as an enum name; inside an enabled
#                             message also emits a value -> name map.
#   * `optional|repeated T N = <num>` - emits code that streams field N.
#     Fields marked `deprecated = true` are skipped.
#
# Dies if the file cannot be opened.
sub ProcessPrinter {
  my ($filename) = @_;
  my $classname = "";
  my $valid = 0;   # true while inside a message we generate code for
  my %enum;        # enum type names seen so far

  # Three-argument open with a lexical handle: the file name can never be
  # interpreted as a mode or a pipe, and the handle is private to this sub.
  open(my $fh, '<', $filename) or die "cannot open $filename: $!";

  print "namespace {\n";
  while (my $line = <$fh>) {
    chomp $line;
    if ($line =~ /^\s*message (\S+)/) {
      $classname = $1;
      $valid = 0;
      if ($classname =~ /(TrainerSpec|NormalizerSpec)/) {
        print "inline std::string PrintProto(const ${classname} &message) {\n";
        print " std::ostringstream os;\n\n";
        print " os << \"${classname} {\\n\";\n";
        $valid = 1;
      }
    } elsif ($line =~ /^\s*}/) {
      next if (!$valid);
      print " os << \"}\\n\";\n";
      print "\n return os.str();\n";
      print "}\n\n";
    } elsif ($line =~ /^\s*enum\s+(\S+)/) {
      # Anchored at line start so that the word "enum" inside a trailing
      # comment on another line does not start enum mode.
      my $name = $1;
      $enum{$name} = 1;
      next if (!$valid);
      # Braces around the variable names are required: "$classname::$name"
      # would be parsed as the package variable $classname:: and the class
      # qualifier would silently vanish from the generated code.
      print " static const std::map<${classname}::${name}, std::string> k${name}_Map = { ";
      # Consume the enum body up to and including its closing brace.
      while (my $entry = <$fh>) {
        if ($entry =~ /(\S+)\s*=\s*(\d+)/) {
          my $value = $1;
          print "{${classname}::${value}, \"${value}\"}, ";
        } elsif ($entry =~ /}/) {
          print " };\n";
          last;
        }
      }
    } elsif ($line =~ /\s*(repeated|optional)\s+(\S+)\s+(\S+)\s*=\s*(\d+)/) {
      my ($opt, $type, $name) = ($1, $2, $3);
      next if ($line =~ /deprecated = true/);
      next if (!$valid);
      if ($type =~ /(int|double|float|bool|string)/) {
        # Scalar-like types are streamed directly.
        if ($opt eq "optional") {
          print " os << \" ${name}: \" << message.${name}() << \"\\n\";\n";
        } else {
          print " for (const auto &v : message.${name}())\n";
          print " os << \" ${name}: \" << v << \"\\n\";\n";
        }
      } elsif (defined $enum{$type}) {
        # Enum fields are printed by name via the map emitted above.
        if ($opt eq "optional") {
          print " {\n";
          print " const auto it = k${type}_Map.find(message.${name}());\n";
          print " if (it == k${type}_Map.end())\n";
          print " os << \" ${name}: unknown\\n\";\n";
          print " else\n";
          print " os << \" ${name}: \" << it->second << \"\\n\";\n";
          print " }\n";
        } else {
          print " for (const auto &v : message.${name}()) {\n";
          print " const auto it = k${type}_Map.find(v);\n";
          print " if (it == k${type}_Map.end())\n";
          print " os << \" ${name}: unknown\\n\";\n";
          print " else\n";
          print " os << \" ${name}: \" << it->second << \"\\n\";\n";
          print " }\n";
        }
      }
    }
  }
  print "} // namespace\n\n";
  close($fh);
}
# ProcessParser($filename)
#
# Reads the .proto file at $filename and writes C++ source to the currently
# selected output handle (STDOUT by default).  For every message whose name
# matches TrainerSpec or NormalizerSpec it emits a definition of
# `SentencePieceTrainer::SetProtoField(name, value, <Message> *message)`
# containing one `if (name == "<field>")` branch per non-deprecated
# optional/repeated field, followed by a NOT_FOUND fallthrough.
#
# Generated per-field behaviour:
#   * int/double/float/bool - parsed with string_util::lexical_cast; an
#                             empty value means "true" for bool fields.
#   * string / bytes        - passed through to the setter.
#   * known enum types      - looked up (upper-cased) in a name -> value map.
#   * repeated fields       - the value is split on "," and each piece is
#                             added with add_<field>; optional fields use
#                             set_<field>.
#
# Dies if the file cannot be opened.
sub ProcessParser {
  my ($filename) = @_;
  my $classname = "";
  my $valid = 0;   # true while inside a message we generate code for
  my %enum;        # enum type names seen so far

  # Three-argument open with a lexical handle: the file name can never be
  # interpreted as a mode or a pipe, and the handle is private to this sub.
  open(my $fh, '<', $filename) or die "cannot open $filename: $!";

  while (my $line = <$fh>) {
    if ($line =~ /^\s*message (\S+)/) {
      $classname = $1;
      $valid = 0;
      if ($classname =~ /(TrainerSpec|NormalizerSpec)/) {
        print "util::Status SentencePieceTrainer::SetProtoField(const std::string& name, const std::string& value, ${classname} *message) {\n";
        print " CHECK_OR_RETURN(message);\n\n";
        $valid = 1;
      }
    } elsif ($line =~ /^\s*}/) {
      next if (!$valid);
      # No branch matched the requested field name.
      print " return util::StatusBuilder(util::error::NOT_FOUND)\n";
      print " << \"unknown field name \\\"\" << name << \"\\\" in ${classname}.\";\n";
      print "}\n\n";
    } elsif ($line =~ /^\s*enum\s+(\S+)/) {
      # Anchored at line start so that the word "enum" inside a trailing
      # comment on another line does not start enum mode.
      my $name = $1;
      $enum{$name} = 1;
      next if (!$valid);
      # Braces around the variable names are required: "$classname::$name"
      # would be parsed as the package variable $classname:: and the class
      # qualifier would silently vanish from the generated code.
      print " static const std::map <std::string, ${classname}::${name}> k${name}_Map = { ";
      # Consume the enum body up to and including its closing brace.
      while (my $entry = <$fh>) {
        if ($entry =~ /(\S+)\s*=\s*(\d+)/) {
          my $value = $1;
          print "{\"${value}\", ${classname}::${value}}, ";
        } elsif ($entry =~ /}/) {
          print " };\n\n";
          last;
        }
      }
    } elsif ($line =~ /\s*(repeated|optional)\s+(\S+)\s+(\S+)\s*=\s*(\d+)/) {
      my ($opt, $type, $name) = ($1, $2, $3);
      next if ($line =~ /deprecated = true/);
      next if (!$valid);
      my $func_prefix = $opt eq "optional" ? "set_" : "add_";
      # C++ statements that convert `val` and store it into the message.
      # Stays empty for field types none of the branches recognise.
      my $body = "";
      if ($type =~ /(int|double|float|bool)/) {
        # A flag given without a value is treated as "true" for booleans.
        my $empty = $type eq "bool" ? "\"true\"" : "\"\"";
        $body =
          "${type} v;\n" .
          " if (!string_util::lexical_cast(val.empty() ? ${empty} : val, &v))\n" .
          " return util::StatusBuilder(util::error::INVALID_ARGUMENT) << \"cannot parse \\\"\" << val << \"\\\" as ${type}.\";\n" .
          " message->${func_prefix}${name}(v);\n";
      } elsif ($type =~ /string/) {
        $body = "message->${func_prefix}${name}(val);\n";
      } elsif ($type =~ /bytes/) {
        $body = "message->${func_prefix}${name}(val.data(), val.size());\n";
      } elsif (defined $enum{$type}) {
        $body = "const auto it = k${type}_Map.find(string_util::ToUpper(val));\n" .
          " if (it == k${type}_Map.end())\n" .
          " return util::StatusBuilder(util::error::INVALID_ARGUMENT) << \"unknown enumeration value of \\\"\" << val << \"\\\" as ${type}.\";\n" .
          " message->${func_prefix}${name}(it->second);\n";
      }
      print " if (name == \"${name}\") {\n";
      if ($opt eq "repeated") {
        print " for (const auto &val : string_util::Split(value, \",\")) {\n";
        print " ${body}";
        print " }\n";
      } else {
        print " const auto &val = value;\n";
        print " ${body}";
      }
      print " return util::OkStatus();\n";
      print " }\n\n";
    }
  }
  close($fh);
}
# Driver: for each .proto path given on the command line, emit the printer
# helpers first and the parser definitions second.  The &-sigil call form is
# used deliberately, since it ignores any prototype attached to the subs.
foreach my $proto_path (@ARGV) {
  &ProcessPrinter($proto_path);
  &ProcessParser($proto_path);
}