Interpolated OSM

This commit is contained in:
Nadir 2015-10-07 13:57:32 +01:00
parent a4b585ca11
commit 2ec6fed898
2 changed files with 163 additions and 6 deletions

View File

@ -14,7 +14,7 @@ print STDERR "Training OSM - Start\n".`date`;
my $ORDER = 5;
my $OUT_DIR = "/tmp/osm.$$";
my $___FACTOR_DELIMITER = "|";
my ($MOSES_SRC_DIR,$CORPUS_F,$CORPUS_E,$ALIGNMENT,$SRILM_DIR,$FACTOR,$LMPLZ);
my ($MOSES_SRC_DIR,$CORPUS_F,$CORPUS_E,$ALIGNMENT,$SRILM_DIR,$FACTOR,$LMPLZ,$DOMAIN,$TUNE,$INP_EXT,$OP_EXT);
my $cmd;
@ -29,6 +29,10 @@ die("ERROR: wrong syntax when invoking OSM-Train.perl")
'alignment=s' => \$ALIGNMENT,
'order=i' => \$ORDER,
'factor=s' => \$FACTOR,
'input-extension=s' => \$INP_EXT,
'output-extension=s' => \$OP_EXT,
'tune=s' => \$TUNE,
'domain=s' => \$DOMAIN,
'srilm-dir=s' => \$SRILM_DIR,
'lmplz=s' => \$LMPLZ,
'out-dir=s' => \$OUT_DIR);
@ -74,19 +78,172 @@ if (defined($FACTOR)) {
`ln -s $corpus_stem_f.$factor_val.$ext_f $OUT_DIR/$factor_val/f`;
`ln -s $corpus_stem_e.$factor_val.$ext_e $OUT_DIR/$factor_val/e`;
create_model($factor_val);
if (defined($TUNE) && defined($DOMAIN) && $factor_val eq "0-0")
{
die("ERROR: For Interpolated OSM model, you need SRILM")
unless -e $SRILM_DIR;
`mkdir $OUT_DIR/TUNE`;
`$MOSES_SRC_DIR/scripts/training/reduce-factors.perl --corpus $TUNE.$INP_EXT --reduced $OUT_DIR/TUNE/tune.$INP_EXT --factor 0`;
`$MOSES_SRC_DIR/scripts/training/reduce-factors.perl --corpus $TUNE.$OP_EXT --reduced $OUT_DIR/TUNE/tune.$OP_EXT --factor 0`;
create_interpolated_model($factor_val);
}
else
{
create_model($factor_val);
}
}
}
else {
    # Unfactored corpus: expose both training sides under fixed names in the
    # working directory.
    `ln -s $CORPUS_F $OUT_DIR/f`;
    `ln -s $CORPUS_E $OUT_DIR/e`;

    # The model is built exactly once, by whichever branch applies below.
    if (defined($TUNE) && defined($DOMAIN)) {
        # Interpolated model: create_interpolated_model() hands $SRILM_DIR to
        # interpolate-lm.perl, so it must be set and exist.
        die("ERROR: For Interpolated OSM model, you need SRILM")
            unless defined($SRILM_DIR) && -e $SRILM_DIR;

        `mkdir $OUT_DIR/TUNE`;
        # Plain copies of the tuning set.  "--reduced" is an option of
        # reduce-factors.perl (used in the factored branch), not of cp;
        # passing it to cp makes the copy fail.
        `cp $TUNE.$INP_EXT $OUT_DIR/TUNE/tune.$INP_EXT`;
        `cp $TUNE.$OP_EXT $OUT_DIR/TUNE/tune.$OP_EXT`;
        create_interpolated_model("");
    }
    else {
        create_model("");
    }
}
# All models are built: print the completion marker followed by the output
# of `date` on STDOUT.
print "Training OSM - End", scalar(`date`);
# Read the domain file named by the file-level $DOMAIN.
#
# Each non-blank line holds a line count followed by a sub-corpus name,
# separated by whitespace.  Callers in this file treat the count as the
# number of the last corpus line belonging to that sub-corpus.
#
# Returns a flat list (name1, count1, name2, count2, ...) in file order.
# Dies if the file cannot be opened or a non-blank line is malformed.
sub read_domain_file{
    open(my $fh, '<:encoding(UTF-8)', $DOMAIN)
        or die "Could not open file '$DOMAIN' $!";

    my @corpora;
    while (my $row = <$fh>) {
        chomp $row;

        # split ' ' skips leading whitespace and splits on any whitespace
        # run, so tabs, repeated blanks and a trailing "\r" are tolerated.
        my ($num, $dom) = split(' ', $row);

        # Blank line: nothing to record.
        next unless defined $num;

        # The count ends up in shell commands and arithmetic downstream, so
        # reject anything that is not a plain non-negative integer.
        die "Malformed line $. in domain file '$DOMAIN': '$row'\n"
            unless defined $dom && $num =~ /\A[0-9]+\z/;

        push @corpora, $dom, $num;
    }
    close($fh);

    return @corpora;
}
# Run one shell command through backticks (the child's STDOUT is captured
# and discarded, as elsewhere in this script).  When $echo is true the
# command is first announced on STDERR.  A non-zero exit status is reported
# on STDERR but is not fatal.  Returns true if the command exited with 0.
sub _osm_run_command {
    my ($command, $echo) = @_;
    print STDERR "Executing: $command\n" if $echo;
    `$command`;
    my $status = $?;
    print STDERR "WARNING: command exited with status $status: $command\n"
        if $status != 0;
    return $status == 0;
}

# Build an interpolated operation sequence model.
#
#   $factor_val - sub-directory of $OUT_DIR holding the training sides "e"
#                 and "f" ("" for an unfactored corpus).
#
# Steps:
#   1. For every (name, last-line) pair from read_domain_file(), cut that
#      slice out of e, f and $OUT_DIR/align into
#      $OUT_DIR/$factor_val/<name>/, then train an operation LM on it.
#   2. Convert the tuning set in $OUT_DIR/TUNE into an operation corpus.
#   3. Interpolate the per-domain LMs with interpolate-lm.perl, tuned on
#      that operation corpus, into $OUT_DIR/$factor_val/operationLM.
#   4. Binarize the result with build_binary (dies if that step fails).
#
# Dies on a malformed or empty domain list.
sub create_interpolated_model{
    my ($factor_val) = @_;

    my @corpora = read_domain_file();
    my @domain_lms;
    my $prev_end = 0;   # last corpus line consumed by the previous domain

    for (my $i = 0; $i < scalar(@corpora); $i += 2) {
        my ($domain, $end_line) = @corpora[$i, $i + 1];

        # A blank line in the domain file shows up as an undefined pair.
        next if !defined($domain) && !defined($end_line);

        # An empty name would make "$dName/e" resolve to the training
        # corpus itself, and a non-numeric count would produce a broken
        # head/tail command, so refuse to continue.
        die("ERROR: malformed entry in domain file '$DOMAIN'")
            unless defined($domain) && $domain ne ""
                && defined($end_line) && $end_line =~ /\A[0-9]+\z/;

        my $num_lines = $end_line - $prev_end;
        die("ERROR: line counts in domain file '$DOMAIN' must not decrease")
            if $num_lines < 0;

        my $dName = "$OUT_DIR/$factor_val/$domain";
        # -p: do not fail when the directory is left over from an earlier run.
        _osm_run_command("mkdir -p $dName");

        # Lines ($prev_end + 1) .. $end_line belong to this domain.
        for my $side ("e", "f") {
            _osm_run_command("head -n $end_line $OUT_DIR/$factor_val/$side | tail -n $num_lines > $dName/$side");
        }
        _osm_run_command("head -n $end_line $OUT_DIR/align | tail -n $num_lines > $dName/align");

        print STDERR "Extracting Singletons\n";
        _osm_run_command("$MOSES_SRC_DIR/scripts/OSM/extract-singletons.perl $dName/e $dName/f $dName/align > $dName/Singletons", 1);

        print STDERR "Converting Bilingual Sentence Pair into Operation Corpus\n";
        _osm_run_command("$MOSES_SRC_DIR/bin/generateSequences $dName/e $dName/f $dName/align $dName/Singletons > $dName/opCorpus", 1);

        print STDERR "Learning Operation Sequence Translation Model\n";
        if (defined($SRILM_DIR)) {
            _osm_run_command("$SRILM_DIR/ngram-count -kndiscount -order $ORDER -unk -text $dName/opCorpus -lm $dName/operationLM 2>> /dev/stderr", 1);
        }
        else {
            _osm_run_command("$LMPLZ -T $OUT_DIR --order $ORDER --text $dName/opCorpus --arpa $dName/operationLM --prune 0 0 1 2>> /dev/stderr", 1);
        }

        push @domain_lms, "$dName/operationLM";
        $prev_end = $end_line;
    }

    # Without at least one domain LM the --lm argument below would be empty.
    die("ERROR: domain file '$DOMAIN' does not list any sub-corpora")
        unless @domain_lms;

    # Turn the tuning set into an operation corpus.
    _osm_run_command("$MOSES_SRC_DIR/scripts/OSM/flipAlignment.perl $TUNE.align > $OUT_DIR/TUNE/tune.align");

    print STDERR "Extracting Singletons\n";
    _osm_run_command("$MOSES_SRC_DIR/scripts/OSM/extract-singletons.perl $OUT_DIR/TUNE/tune.$OP_EXT $OUT_DIR/TUNE/tune.$INP_EXT $OUT_DIR/TUNE/tune.align > $OUT_DIR/TUNE/Singletons", 1);

    print STDERR "Converting Bilingual Sentence Pair into Operation Corpus\n";
    _osm_run_command("$MOSES_SRC_DIR/bin/generateSequences $OUT_DIR/TUNE/tune.$OP_EXT $OUT_DIR/TUNE/tune.$INP_EXT $OUT_DIR/TUNE/tune.align $OUT_DIR/TUNE/Singletons > $OUT_DIR/TUNE/tune.opCorpus", 1);

    # The per-domain LMs are passed as one comma-separated --lm argument.
    print STDERR "Interpolating OSM Models\n";
    _osm_run_command("$MOSES_SRC_DIR/scripts/ems/support/interpolate-lm.perl --tuning $OUT_DIR/TUNE/tune.opCorpus --name $OUT_DIR/$factor_val/operationLM --srilm $SRILM_DIR --lm " . join(",", @domain_lms), 1);

    print STDERR "Binarizing\n";
    my $binarize = "$MOSES_SRC_DIR/bin/build_binary $OUT_DIR/$factor_val/operationLM $OUT_DIR/$factor_val/operationLM.bin";
    print STDERR "Executing: $binarize\n";
    system($binarize) == 0 or die("system $binarize failed: $?");
}
sub create_model{
my ($factor_val) = @_;

View File

@ -533,7 +533,7 @@ build-domains
in: CORPUS:post-split-factorized-stem
out: domains
default-name: model/domains
ignore-unless: domain-features mml-filter-corpora
ignore-unless: domain-features mml-filter-corpora operation-sequence-model
template: $moses-script-dir/ems/support/build-domain-file-from-subcorpora.perl $input-extension IN > OUT
final-model: yes
mml-score
@ -698,11 +698,11 @@ pcfg-score
pass-unless: use-pcfg-feature
template: ln -s IN.$input-extension OUT.$input-extension ; $moses-bin-dir/pcfg-score IN1.$output-extension < IN.$output-extension > OUT.$output-extension
build-osm
in: corpus word-alignment
in: corpus word-alignment domains
out: osm-model
ignore-unless: operation-sequence-model
rerun-on-change: operation-sequence-model training-options script giza-settings operation-sequence-model-settings
template: $moses-script-dir/OSM/OSM-Train.perl --corpus-f IN0.$input-extension --corpus-e IN0.$output-extension --alignment IN1.$alignment-symmetrization-method --order $operation-sequence-model-order --out-dir OUT --moses-src-dir $moses-src-dir $operation-sequence-model-settings
template: $moses-script-dir/OSM/OSM-Train.perl --corpus-f IN0.$input-extension --corpus-e IN0.$output-extension --alignment IN1.$alignment-symmetrization-method --order $operation-sequence-model-order --out-dir OUT --moses-src-dir $moses-src-dir --input-extension $input-extension --output-extension $output-extension $operation-sequence-model-settings --domain IN2
default-name: model/OSM
build-transliteration-model
in: corpus word-alignment