mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-28 14:32:38 +03:00
osm tweaks and morfessor wrapper
This commit is contained in:
parent
a922245864
commit
abfc0671a3
@ -12,7 +12,9 @@ my $ORDER = 5;
|
||||
my $OUT_DIR = "/tmp/osm.$$";
|
||||
my $___FACTOR_DELIMITER = "|";
|
||||
my ($MOSES_SRC_DIR,$CORPUS_F,$CORPUS_E,$ALIGNMENT,$SRILM_DIR,$FACTOR,$LMPLZ);
|
||||
$LMPLZ = "$RealBin/../../lmplz";
|
||||
$LMPLZ = "$RealBin/../../bin/lmplz";
|
||||
|
||||
my $cmd;
|
||||
|
||||
# utilities
|
||||
my $ZCAT = "gzip -cd";
|
||||
@ -85,22 +87,31 @@ my ($factor_val) = @_;
|
||||
print "Creating Model ".$factor_val."\n";
|
||||
|
||||
print "Extracting Singletons\n";
|
||||
`$MOSES_SRC_DIR/scripts/OSM/extract-singletons.perl $OUT_DIR/$factor_val/e $OUT_DIR/$factor_val/f $OUT_DIR/align > $OUT_DIR/$factor_val/Singletons`;
|
||||
$cmd = "$MOSES_SRC_DIR/scripts/OSM/extract-singletons.perl $OUT_DIR/$factor_val/e $OUT_DIR/$factor_val/f $OUT_DIR/align > $OUT_DIR/$factor_val/Singletons";
|
||||
print STDERR "Executing: $cmd\n";
|
||||
`$cmd`;
|
||||
|
||||
print "Converting Bilingual Sentence Pair into Operation Corpus\n";
|
||||
`$MOSES_SRC_DIR/bin/generateSequences $OUT_DIR/$factor_val/e $OUT_DIR/$factor_val/f $OUT_DIR/align $OUT_DIR/$factor_val/Singletons > $OUT_DIR/$factor_val/opCorpus`;
|
||||
$cmd = "$MOSES_SRC_DIR/bin/generateSequences $OUT_DIR/$factor_val/e $OUT_DIR/$factor_val/f $OUT_DIR/align $OUT_DIR/$factor_val/Singletons > $OUT_DIR/$factor_val/opCorpus";
|
||||
print STDERR "Executing: $cmd\n";
|
||||
`$cmd`;
|
||||
|
||||
print "Learning Operation Sequence Translation Model\n";
|
||||
if (defined($SRILM_DIR)) {
|
||||
`$SRILM_DIR/ngram-count -kndiscount -order $ORDER -unk -text $OUT_DIR/$factor_val/opCorpus -lm $OUT_DIR/$factor_val/operationLM`;
|
||||
$cmd = "$SRILM_DIR/ngram-count -kndiscount -order $ORDER -unk -text $OUT_DIR/$factor_val/opCorpus -lm $OUT_DIR/$factor_val/operationLM";
|
||||
print STDERR "Executing: $cmd\n";
|
||||
`$cmd`;
|
||||
}
|
||||
else {
|
||||
`$LMPLZ --order $ORDER --text $OUT_DIR/$factor_val/opCorpus --arpa $OUT_DIR/$factor_val/operationLM --prune 0 0 1`;
|
||||
$cmd = "$LMPLZ --order $ORDER --text $OUT_DIR/$factor_val/opCorpus --arpa $OUT_DIR/$factor_val/operationLM --prune 0 0 1";
|
||||
print STDERR "Executing: $cmd\n";
|
||||
`$cmd`;
|
||||
}
|
||||
|
||||
print "Binarizing\n";
|
||||
`$MOSES_SRC_DIR/bin/build_binary $OUT_DIR/$factor_val/operationLM $OUT_DIR/$factor_val/operationLM.bin`;
|
||||
|
||||
$cmd = "$MOSES_SRC_DIR/bin/build_binary $OUT_DIR/$factor_val/operationLM $OUT_DIR/$factor_val/operationLM.bin";
|
||||
print STDERR "Executing: $cmd\n";
|
||||
`$cmd`;
|
||||
|
||||
}
|
||||
|
||||
|
@ -1,12 +1,13 @@
|
||||
#!/usr/bin/env perl
|
||||
|
||||
#use strict;
|
||||
use warnings;
|
||||
use Getopt::Std;
|
||||
getopts('q');
|
||||
|
||||
$target = shift;
|
||||
$source = shift;
|
||||
$align = shift or die "
|
||||
my $target = shift;
|
||||
my $source = shift;
|
||||
my $align = shift or die "
|
||||
Usage: extract-singletons.perl target source align
|
||||
|
||||
";
|
||||
|
@ -602,7 +602,7 @@ build-osm
|
||||
out: osm-model
|
||||
ignore-unless: operation-sequence-model
|
||||
rerun-on-change: operation-sequence-model training-options script giza-settings operation-sequence-model-settings
|
||||
template: $moses-script-dir/OSM/OSM-Train.perl --corpus-f IN0.$input-extension --corpus-e IN0.$output-extension --alignment IN1.$alignment-symmetrization-method --order $operation-sequence-model-order --out-dir OUT --moses-src-dir $moses-src-dir --srilm-dir $srilm-dir $operation-sequence-model-settings
|
||||
template: $moses-script-dir/OSM/OSM-Train.perl --corpus-f IN0.$input-extension --corpus-e IN0.$output-extension --alignment IN1.$alignment-symmetrization-method --order $operation-sequence-model-order --out-dir OUT --moses-src-dir $moses-src-dir $operation-sequence-model-settings
|
||||
default-name: model/OSM
|
||||
build-transliteration-model
|
||||
in: corpus word-alignment
|
||||
|
37
scripts/training/wrappers/morfessor-wrapper.perl
Executable file
37
scripts/training/wrappers/morfessor-wrapper.perl
Executable file
@ -0,0 +1,37 @@
|
||||
#!/usr/bin/env perl

# Wrapper around the `morfessor-segment` command-line tool.
#
# Reads text from STDIN, segments it with the Morfessor model given by
# --model, and prints the segmented text to STDOUT with the trailing space
# of every line stripped.
#
# Options:
#   --model=FILE          model file passed to `morfessor-segment -L` (required)
#   --morfessor-dir=DIR   optional; when given, DIR is put on PYTHONPATH and
#                         the tool is run from DIR/scripts/
#
# The exact command line is echoed to STDERR before it is executed.

use warnings;
use strict;
use Getopt::Long "GetOptions";
use File::Temp qw(tempfile);

my $MORF_DIR;
my $MODEL;

GetOptions('morfessor-dir=s' => \$MORF_DIR,
           'model=s' => \$MODEL);

die("Must provide --model=s argument") if (!defined($MODEL));

my $cmd = "";

if (defined($MORF_DIR)) {
  $cmd .= "PYTHONPATH=$MORF_DIR $MORF_DIR/scripts/";
}

# Use a securely created, uniquely named temporary file instead of the
# predictable /tmp/morf.<pid>; UNLINK removes it when the script exits.
my ($tmp_fh, $TMP_FILE) = tempfile('morf.XXXXXX', TMPDIR => 1, UNLINK => 1);
close($tmp_fh) or die("Can't close file $TMP_FILE: $!");

# NOTE(review): $MODEL and $MORF_DIR are interpolated into a shell command
# unquoted, so paths containing whitespace or shell metacharacters will
# break the command.
$cmd .= "morfessor-segment "
  ."-L $MODEL "
  ."--output-format \"{analysis} \" "
  ."--output-format-separator \" \" "
  ."--output-newlines "
  ."/dev/stdin "
  ."| sed 's/ \$//' > $TMP_FILE";
print STDERR "Executing: $cmd\n";

# The command's stdout is redirected into the temp file, so nothing needs
# to be captured here. Because the command is a pipeline, $? reflects the
# last stage (sed) or a failure of the shell itself.
system($cmd);
die("Command failed (wait status $?): $cmd") if ($? != 0);

# Copy the segmented text to STDOUT.
open(my $in_fh, '<', $TMP_FILE) or die("Can't open file $TMP_FILE: $!");
while (my $line = <$in_fh>) {
  print "$line";
}
close($in_fh);
|
Loading…
Reference in New Issue
Block a user