mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-27 22:14:57 +03:00
barebone support for quality estimation in experiment.perl
This commit is contained in:
parent
ea306f62b7
commit
defbf8d7c3
@ -1540,6 +1540,150 @@ analysis-precision
|
||||
rerun-on-change: precision-by-coverage-base
|
||||
final-model: yes
|
||||
|
||||
[QUALITY-ESTIMATION] single
|
||||
tokenize-input
|
||||
in: raw-input
|
||||
out: tokenized-input
|
||||
default-name: quality-estimation/input.tok
|
||||
pass-unless: input-tokenizer
|
||||
template: $input-tokenizer < IN > OUT
|
||||
tokenize-input-devtest
|
||||
in: raw-input-devtest
|
||||
out: tokenized-input-devtest
|
||||
default-name: quality-estimation/input.devtest.tok
|
||||
pass-unless: input-tokenizer
|
||||
template: $input-tokenizer < IN > OUT
|
||||
lowercase-input
|
||||
in: tokenized-input
|
||||
out: truecased-input
|
||||
default-name: quality-estimation/input.lc
|
||||
pass-unless: input-lowercaser
|
||||
ignore-if: input-truecaser
|
||||
template: $input-lowercaser < IN > OUT
|
||||
lowercase-input-devtest
|
||||
in: tokenized-input-devtest
|
||||
out: truecased-input-devtest
|
||||
default-name: quality-estimation/input.devtest.lc
|
||||
pass-unless: input-lowercaser
|
||||
ignore-if: input-truecaser
|
||||
template: $input-lowercaser < IN > OUT
|
||||
truecase-input
|
||||
in: tokenized-input TRUECASER:truecase-model
|
||||
out: truecased-input
|
||||
rerun-on-change: input-truecaser
|
||||
default-name: quality-estimation/input.tc
|
||||
ignore-unless: input-truecaser
|
||||
template: $input-truecaser -model IN1.$input-extension < IN > OUT
|
||||
truecase-input-devtest
|
||||
in: tokenized-input-devtest TRUECASER:truecase-model
|
||||
out: truecased-input-devtest
|
||||
rerun-on-change: input-truecaser
|
||||
ignore-unless: input-truecaser
|
||||
default-name: quality-estimation/input.devtest.tc
|
||||
template: $input-truecaser -model IN1.$input-extension < IN > OUT
|
||||
split-input
|
||||
in: truecased-input SPLITTER:splitter-model
|
||||
out: split-input
|
||||
rerun-on-change: input-splitter
|
||||
default-name: quality-estimation/input.split
|
||||
pass-unless: input-splitter
|
||||
template: $input-splitter -model IN1.$input-extension < IN > OUT
|
||||
split-input-devtest
|
||||
in: truecased-input-devtest SPLITTER:splitter-model
|
||||
out: split-input-devtest
|
||||
rerun-on-change: input-splitter
|
||||
default-name: quality-estimation/input.devtest.split
|
||||
pass-unless: input-splitter
|
||||
template: $input-splitter -model IN1.$input-extension < IN > OUT
|
||||
tokenize-reference
|
||||
in: raw-reference
|
||||
out: tokenized-reference
|
||||
default-name: quality-estimation/reference.tok
|
||||
pass-unless: output-tokenizer
|
||||
multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
|
||||
template: $output-tokenizer < IN > OUT
|
||||
tokenize-reference-devtest
|
||||
in: raw-reference-devtest
|
||||
out: tokenized-reference-devtest
|
||||
default-name: quality-estimation/reference.devtest.tok
|
||||
pass-unless: output-tokenizer
|
||||
multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
|
||||
template: $output-tokenizer < IN > OUT
|
||||
lowercase-reference
|
||||
in: tokenized-reference
|
||||
out: truecased-reference
|
||||
default-name: quality-estimation/reference.lc
|
||||
pass-unless: output-lowercaser
|
||||
ignore-if: output-truecaser
|
||||
multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
|
||||
template: $output-lowercaser < IN > OUT
|
||||
lowercase-reference-devtest
|
||||
in: tokenized-reference-devtest
|
||||
out: truecased-reference-devtest
|
||||
default-name: quality-estimation/reference.devtest.lc
|
||||
pass-unless: output-lowercaser
|
||||
ignore-if: output-truecaser
|
||||
multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
|
||||
template: $output-lowercaser < IN > OUT
|
||||
truecase-reference
|
||||
in: tokenized-reference TRUECASER:truecase-model
|
||||
out: truecased-reference
|
||||
rerun-on-change: output-truecaser
|
||||
default-name: quality-estimation/reference.tc
|
||||
ignore-unless: output-truecaser
|
||||
multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
|
||||
template: $output-truecaser -model IN1.$output-extension < IN > OUT
|
||||
truecase-reference-devtest
|
||||
in: tokenized-reference-devtest TRUECASER:truecase-model
|
||||
out: truecased-reference-devtest
|
||||
rerun-on-change: output-truecaser
|
||||
default-name: quality-estimation/reference.devtest.tc
|
||||
ignore-unless: output-truecaser
|
||||
multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
|
||||
template: $output-truecaser -model IN1.$output-extension < IN > OUT
|
||||
decode
|
||||
in: TUNING:config-with-reused-weights split-input
|
||||
out: rich-output
|
||||
default-name: quality-estimation/output
|
||||
template: $decoder -v 0 -tt -f IN < IN1 > OUT
|
||||
error: Translation was not performed correctly
|
||||
not-error: trans: No such file or directory
|
||||
decode-devtest
|
||||
in: TUNING:config-with-reused-weights split-input-devtest
|
||||
out: rich-output-devtest
|
||||
default-name: quality-estimation/output-devtest
|
||||
template: $decoder -v 0 -tt -f IN < IN1 > OUT
|
||||
error: Translation was not performed correctly
|
||||
not-error: trans: No such file or directory
|
||||
remove-markup
|
||||
in: rich-output
|
||||
out: cleaned-output
|
||||
default-name: quality-estimation/tokenized-output
|
||||
template: $moses-script-dir/ems/support/remove-segmentation-markup.perl < IN > OUT
|
||||
remove-markup-devtest
|
||||
in: rich-output-devtest
|
||||
out: cleaned-output-devtest
|
||||
default-name: quality-estimation/tokenized-output-devtest
|
||||
template: $moses-script-dir/ems/support/remove-segmentation-markup.perl < IN > OUT
|
||||
score-output
|
||||
in: cleaned-output truecased-reference
|
||||
out: scored-output
|
||||
default-name: quality-estimation/output-scored
|
||||
tmp-name: quality-estimation/ter
|
||||
template: mkdir TMP ; $moses-script-dir/ems/support/ter.perl $tercom IN IN1 TMP > OUT
|
||||
score-output-devtest
|
||||
in: cleaned-output-devtest truecased-reference-devtest
|
||||
out: scored-output-devtest
|
||||
default-name: quality-estimation/output-scored-devtest
|
||||
tmp-name: quality-estimation/ter-devtest
|
||||
template: mkdir TMP ; $moses-script-dir/ems/support/ter.perl $tercom IN IN1 TMP > OUT
|
||||
train
|
||||
in: input rich-output scored-output input-devtest rich-output-devtest scored-output-devtest
|
||||
out: quality-estimation-model
|
||||
default-name: quality-estimation/model
|
||||
template: $trainer --train-rich IN1 --train-ter IN2 --eval-rich IN4 --eval-ter IN5 --model OUT
|
||||
final-model: yes
|
||||
|
||||
[REPORTING] single
|
||||
report
|
||||
in: EVALUATION:nist-bleu-score EVALUATION:nist-bleu-c-score EVALUATION:bolt-bleu-score EVALUATION:bolt-bleu-c-score EVALUATION:multi-bleu-score EVALUATION:multi-bleu-c-score EVALUATION:multi-bleu-detok-score EVALUATION:multi-bleu-c-detok-score EVALUATION:meteor-score EVALUATION:ter-score EVALUATION:wer-score EVALUATION:ibm-bleu-score EVALUATION:ibm-bleu-c-score EVALUATION:analysis EVALUATION:analysis-coverage EVALUATION:analysis-prec TRAINING:biconcor-model EVALUATION:wade-analysis
|
||||
|
42
scripts/ems/support/create-xml.perl
Executable file
42
scripts/ems/support/create-xml.perl
Executable file
@ -0,0 +1,42 @@
|
||||
#!/usr/bin/env perl
|
||||
#
|
||||
# This file is part of moses. Its use is licensed under the GNU Lesser General
|
||||
# Public License version 2.1 or, at your option, any later version.
|
||||
|
||||
use warnings;
|
||||
use strict;
|
||||
|
||||
# Wrap plain text from STDIN in a minimal XML document: one <seg> element per
# input line, numbered from 1, inside a single <doc>, enclosed in a set element
# chosen by the first command-line argument. Only the first letter of the
# argument matters and it is matched case-insensitively:
#   s... -> <srcset>    t... -> <tstset>    r... -> <refset>
#
# Usage: create-xml.perl (source|target|ref) < text > xml
#
# Segment text is copied verbatim; no XML escaping is applied here.
# Dies with a usage error if the argument is missing or not recognised.

# Header lines and closing tag for each set type, keyed by the lower-cased
# first letter of the argument. Keeping both in one table means the header and
# the footer are selected by a single lookup and cannot drift apart.
my %envelope_for = (
    s => {
        header => "<srcset setid=\"test\" srclang=\"any\">\n"
                . "<doc docid=\"doc\">\n",
        footer => "</srcset>\n",
    },
    t => {
        header => "<tstset setid=\"test\" tgtlang=\"any\" srclang=\"any\">\n"
                . "<doc sysid=\"moses\" docid=\"doc\">\n",
        footer => "</tstset>\n",
    },
    r => {
        header => "<refset setid=\"test\" tgtlang=\"any\" srclang=\"any\">\n"
                . "<doc sysid=\"ref\" docid=\"doc\">\n",
        footer => "</refset>\n",
    },
);

my ($type) = @ARGV;

# A missing argument is handled like an unknown one, so the usage error is
# reported without "uninitialized value" warnings being printed first.
my $type_key = defined $type ? lc substr($type, 0, 1) : '';
my $envelope = $envelope_for{$type_key}
    or die("ERROR: specify source / target / ref");

print $envelope->{header};

my $seg_id = 0;
while (my $line = <STDIN>) {
    chomp $line;
    $seg_id++;
    print "<seg id=\"$seg_id\">$line</seg>\n";
}

print "</doc>\n";
print $envelope->{footer};
|
@ -9,7 +9,16 @@ use strict;
|
||||
$|++;
|
||||
|
||||
while(<STDIN>) {
|
||||
s/ \|\d+\-\d+\| / /g;
|
||||
s/ \|\d+\-\d+\|$//;
|
||||
print $_;
|
||||
chop;
|
||||
s/\|[^\|]+\|//g;
|
||||
s/\s+/ /g;
|
||||
s/^ //;
|
||||
s/ $//;
|
||||
print $_."\n";
|
||||
}
|
||||
|
||||
#while(<STDIN>) {
|
||||
# s/ \|\d+\-\d+\| / /g;
|
||||
# s/ \|\d+\-\d+\|$//;
|
||||
# print $_;
|
||||
#}
|
||||
|
15
scripts/ems/support/ter.perl
Normal file
15
scripts/ems/support/ter.perl
Normal file
@ -0,0 +1,15 @@
|
||||
#!/usr/bin/env perl
|
||||
#
|
||||
# This file is part of moses. Its use is licensed under the GNU Lesser General
|
||||
# Public License version 2.1 or, at your option, any later version.
|
||||
|
||||
use strict;
|
||||
use FindBin qw($RealBin);
|
||||
|
||||
# Score a hypothesis file against a reference file with the tercom jar and
# print the contents of the resulting "<tmp>/out.ter" file on STDOUT.
#
# Usage: ter.perl tercom.jar hypothesis reference tmp-dir > scores
#
#   tercom.jar   jar file handed to "java -jar"
#   hypothesis   plain-text file, one segment per line
#   reference    plain-text file, one segment per line
#   tmp-dir      working directory (created if missing); receives the XML
#                versions of both inputs ("hyp", "ref") and tercom's output
#
# Every external step is run without a shell, so paths containing spaces or
# shell metacharacters are passed through intact, and every step is checked:
# the script dies with a non-zero status as soon as one of them fails instead
# of silently printing nothing.
use warnings;
use File::Path qw(make_path);

my ($jar, $hyp, $ref, $tmp) = @ARGV;
die "usage: ter.perl tercom.jar hypothesis reference tmp-dir\n"
    unless defined $tmp;

# Equivalent of "mkdir -p": creates missing parents, accepts an existing dir.
make_path($tmp);
-d $tmp or die "ERROR: cannot create directory $tmp\n";

wrap_as_xml('test', $hyp, "$tmp/hyp");
wrap_as_xml('ref',  $ref, "$tmp/ref");

# List-form pipe open runs java directly (no shell). Its STDOUT is read and
# thrown away, as the original backtick call in void context did, so that only
# the contents of out.ter reach our own STDOUT.
open(my $tercom, '-|',
     'java', '-jar', $jar,
     '-h', "$tmp/hyp", '-r', "$tmp/ref",
     '-o', 'ter', '-n', "$tmp/out")
    or die "ERROR: cannot run java: $!\n";
1 while <$tercom>;
close($tercom)
    or die "ERROR: java -jar $jar failed (wait status $?)\n";

open(my $scores, '<', "$tmp/out.ter")
    or die "ERROR: cannot read $tmp/out.ter: $!\n";
print while <$scores>;
close($scores);

# Run create-xml.perl (located next to this script) with the given set type,
# reading from $in_path and writing to $out_path. The redirections are set up
# in the child process itself rather than through a shell command line. The
# script is started with the current perl interpreter ($^X), so it does not
# need its executable bit set. Dies if the child cannot be started or exits
# with a non-zero status.
sub wrap_as_xml {
    my ($set_type, $in_path, $out_path) = @_;

    my $pid = fork();
    die "ERROR: cannot fork: $!\n" unless defined $pid;

    if ($pid == 0) {
        # Child: rewire STDIN/STDOUT, then replace ourselves with the script.
        open(STDIN, '<', $in_path)
            or die "ERROR: cannot read $in_path: $!\n";
        open(STDOUT, '>', $out_path)
            or die "ERROR: cannot write $out_path: $!\n";
        exec($^X, "$RealBin/create-xml.perl", $set_type)
            or die "ERROR: cannot exec $RealBin/create-xml.perl: $!\n";
    }

    waitpid($pid, 0);
    die "ERROR: create-xml.perl $set_type failed on $in_path (wait status $?)\n"
        if $? != 0;
    return;
}
|
||||
|
Loading…
Reference in New Issue
Block a user