osm tweaks and morfessor wrapper

This commit is contained in:
Hieu Hoang 2015-05-12 20:19:39 +04:00
parent a922245864
commit abfc0671a3
4 changed files with 61 additions and 12 deletions

View File

@ -12,7 +12,9 @@ my $ORDER = 5;
my $OUT_DIR = "/tmp/osm.$$";
my $___FACTOR_DELIMITER = "|";
my ($MOSES_SRC_DIR,$CORPUS_F,$CORPUS_E,$ALIGNMENT,$SRILM_DIR,$FACTOR,$LMPLZ);
$LMPLZ = "$RealBin/../../lmplz";
$LMPLZ = "$RealBin/../../bin/lmplz";
my $cmd;
# utilities
my $ZCAT = "gzip -cd";
@ -85,22 +87,31 @@ my ($factor_val) = @_;
print "Creating Model ".$factor_val."\n";
print "Extracting Singletons\n";
`$MOSES_SRC_DIR/scripts/OSM/extract-singletons.perl $OUT_DIR/$factor_val/e $OUT_DIR/$factor_val/f $OUT_DIR/align > $OUT_DIR/$factor_val/Singletons`;
$cmd = "$MOSES_SRC_DIR/scripts/OSM/extract-singletons.perl $OUT_DIR/$factor_val/e $OUT_DIR/$factor_val/f $OUT_DIR/align > $OUT_DIR/$factor_val/Singletons";
print STDERR "Executing: $cmd\n";
`$cmd`;
print "Converting Bilingual Sentence Pair into Operation Corpus\n";
`$MOSES_SRC_DIR/bin/generateSequences $OUT_DIR/$factor_val/e $OUT_DIR/$factor_val/f $OUT_DIR/align $OUT_DIR/$factor_val/Singletons > $OUT_DIR/$factor_val/opCorpus`;
$cmd = "$MOSES_SRC_DIR/bin/generateSequences $OUT_DIR/$factor_val/e $OUT_DIR/$factor_val/f $OUT_DIR/align $OUT_DIR/$factor_val/Singletons > $OUT_DIR/$factor_val/opCorpus";
print STDERR "Executing: $cmd\n";
`$cmd`;
print "Learning Operation Sequence Translation Model\n";
if (defined($SRILM_DIR)) {
`$SRILM_DIR/ngram-count -kndiscount -order $ORDER -unk -text $OUT_DIR/$factor_val/opCorpus -lm $OUT_DIR/$factor_val/operationLM`;
$cmd = "$SRILM_DIR/ngram-count -kndiscount -order $ORDER -unk -text $OUT_DIR/$factor_val/opCorpus -lm $OUT_DIR/$factor_val/operationLM";
print STDERR "Executing: $cmd\n";
`$cmd`;
}
else {
`$LMPLZ --order $ORDER --text $OUT_DIR/$factor_val/opCorpus --arpa $OUT_DIR/$factor_val/operationLM --prune 0 0 1`;
$cmd = "$LMPLZ --order $ORDER --text $OUT_DIR/$factor_val/opCorpus --arpa $OUT_DIR/$factor_val/operationLM --prune 0 0 1";
print STDERR "Executing: $cmd\n";
`$cmd`;
}
print "Binarizing\n";
`$MOSES_SRC_DIR/bin/build_binary $OUT_DIR/$factor_val/operationLM $OUT_DIR/$factor_val/operationLM.bin`;
$cmd = "$MOSES_SRC_DIR/bin/build_binary $OUT_DIR/$factor_val/operationLM $OUT_DIR/$factor_val/operationLM.bin";
print STDERR "Executing: $cmd\n";
`$cmd`;
}
@ -211,4 +222,4 @@ sub safesystem {
print STDERR "Exit code: $exitcode\n" if $exitcode;
return ! $exitcode;
}
}
}

View File

@ -1,12 +1,13 @@
#!/usr/bin/env perl
#use strict;
use warnings;
use Getopt::Std;
getopts('q');
$target = shift;
$source = shift;
$align = shift or die "
my $target = shift;
my $source = shift;
my $align = shift or die "
Usage: extract-singletons.perl target source align
";

View File

@ -602,7 +602,7 @@ build-osm
out: osm-model
ignore-unless: operation-sequence-model
rerun-on-change: operation-sequence-model training-options script giza-settings operation-sequence-model-settings
template: $moses-script-dir/OSM/OSM-Train.perl --corpus-f IN0.$input-extension --corpus-e IN0.$output-extension --alignment IN1.$alignment-symmetrization-method --order $operation-sequence-model-order --out-dir OUT --moses-src-dir $moses-src-dir --srilm-dir $srilm-dir $operation-sequence-model-settings
template: $moses-script-dir/OSM/OSM-Train.perl --corpus-f IN0.$input-extension --corpus-e IN0.$output-extension --alignment IN1.$alignment-symmetrization-method --order $operation-sequence-model-order --out-dir OUT --moses-src-dir $moses-src-dir $operation-sequence-model-settings
default-name: model/OSM
build-transliteration-model
in: corpus word-alignment

View File

@ -0,0 +1,37 @@
#!/usr/bin/env perl
#
# Wrapper around the `morfessor-segment` command-line tool.
#
# Reads text from standard input, runs it through morfessor-segment with the
# given model, and prints the result to standard output.
#
# Options:
#   --model=FILE          (required) model file, passed to morfessor-segment
#                         via -L.
#   --morfessor-dir=DIR   (optional) if given, DIR is exported as PYTHONPATH
#                         and the tool is run from DIR/scripts/; otherwise
#                         morfessor-segment is looked up on PATH.

use warnings;
use strict;
use Getopt::Long "GetOptions";
use File::Temp qw(tempfile);

my $MORF_DIR;
my $MODEL;

GetOptions('morfessor-dir=s' => \$MORF_DIR,
           'model=s' => \$MODEL);

die("Must provide --model=s argument") if (!defined($MODEL));

my $cmd = "";
if (defined($MORF_DIR)) {
  $cmd .= "PYTHONPATH=$MORF_DIR $MORF_DIR/scripts/";
}

# Use an unpredictable temp file name instead of /tmp/morf.<pid>, and have it
# removed automatically when the script exits (UNLINK => 1). Only the name is
# needed here: the shell redirection below does the writing.
my ($tmp_fh, $TMP_FILE) = tempfile("morf.XXXXXX", TMPDIR => 1, UNLINK => 1);
close($tmp_fh) or die("Can't close file $TMP_FILE: $!");

# The sed stage strips one trailing space from each output line.
# NOTE(review): $MODEL and $MORF_DIR are interpolated into a shell command
# unquoted, so values containing spaces or shell metacharacters will break it.
$cmd .= "morfessor-segment "
       ."-L $MODEL "
       ."--output-format \"{analysis} \" "
       ."--output-format-separator \" \" "
       ."--output-newlines "
       ."/dev/stdin "
       ."| sed 's/ \$//' > $TMP_FILE";
print STDERR "Executing: $cmd\n";

# Run through the shell and fail loudly instead of silently emitting empty or
# partial output. The status seen here is the one the shell reports for the
# whole pipeline.
system($cmd) == 0
  or die("Command failed (wait status $?): $cmd");

# Copy the segmented text to standard output.
open(my $fh, '<', $TMP_FILE) or die("Can't open file $TMP_FILE: $!");
while (my $line = <$fh>) {
  print $line;
}
close($fh);