mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-26 21:42:19 +03:00
OSM Training Script
This commit is contained in:
parent
59eac56012
commit
82d6105f05
33
scripts/OSM/OSM-Train.sh
Executable file
33
scripts/OSM/OSM-Train.sh
Executable file
@ -0,0 +1,33 @@
|
||||
#!/bin/sh
#
# OSM-Train.sh -- train an Operation Sequence Model (OSM).
#
# Usage: OSM-Train.sh E_CORPUS F_CORPUS ALIGNMENT ORDER WORK_DIR
#
#   E_CORPUS   corpus file, symlinked into WORK_DIR as "e"
#   F_CORPUS   corpus file, symlinked into WORK_DIR as "f"
#   ALIGNMENT  word-alignment file; it is run through flipAlignment and the
#              result is stored as WORK_DIR/align
#   ORDER      n-gram order handed to ngram-count
#   WORK_DIR   directory that receives every intermediate and final file
#
# Files written to WORK_DIR: e, f (symlinks), align, Singletons, opCorpus,
# operationLM<ORDER> and operationLM<ORDER>.bin.
#
# The helper programs (flipAlignment, extract-singletons.perl,
# generateSequences, ../../bin/build_binary) are located relative to the
# directory of this script, so the script no longer has to be started from
# inside that directory.

# Stop at the first failing step instead of feeding broken intermediate
# files into the later stages.
set -e

if [ "$#" -lt 5 ]; then
    echo "Usage: $0 e-corpus f-corpus alignment lm-order work-dir" >&2
    exit 1
fi

e_corpus=$1
f_corpus=$2
alignment=$3
order=$4
work_dir=$5

script_dir=$(dirname "$0")

# Site-specific location appended to PATH; ngram-count below is found via
# PATH (presumably the SRILM installation -- adjust for your machine).
PATH=$PATH:/fs/hel1/nadir/SRILM/bin/i686-m64/

# Print the path a symlink created inside WORK_DIR should point at.
# A relative symlink target is resolved relative to the directory holding
# the link, not the current directory, so a relative corpus path that exists
# from here is made absolute.  Anything else is passed through unchanged.
link_target() {
    case "$1" in
        /*) printf '%s\n' "$1" ;;
        *)
            if [ -e "$1" ]; then
                printf '%s/%s\n' "$PWD" "$1"
            else
                printf '%s\n' "$1"
            fi
            ;;
    esac
}

echo 'Training OSM - Start'
date

mkdir -p "$work_dir"

# Remove leftovers of a previous run; -f keeps a first run quiet.
rm -f "$work_dir/e" "$work_dir/f" "$work_dir/align"

ln -s "$(link_target "$e_corpus")" "$work_dir/e"
ln -s "$(link_target "$f_corpus")" "$work_dir/f"

"$script_dir/flipAlignment" "$alignment" > "$work_dir/align"

echo 'Extracting Singletons'

"$script_dir/extract-singletons.perl" "$work_dir/e" "$work_dir/f" "$work_dir/align" > "$work_dir/Singletons"

echo 'Converting Bilingual Sentence Pair into Operation Corpus'

# Generates Operation Corpus
"$script_dir/generateSequences" "$work_dir/e" "$work_dir/f" "$work_dir/align" "$work_dir/Singletons" > "$work_dir/opCorpus"

echo 'Learning Operation Sequence Translation Model'

ngram-count -kndiscount -order "$order" -unk -text "$work_dir/opCorpus" -lm "$work_dir/operationLM$order"

"$script_dir/../../bin/build_binary" -i "$work_dir/operationLM$order" "$work_dir/operationLM$order.bin"

echo 'Training OSM - End'
date
|
||||
|
46
scripts/OSM/extract-singletons.perl
Executable file
46
scripts/OSM/extract-singletons.perl
Executable file
@ -0,0 +1,46 @@
|
||||
#!/usr/bin/perl
#
# extract-singletons.perl -- list words that occur in exactly one alignment
# link of the whole corpus, where that link is one-to-one and connects two
# identical words.
#
# Usage: extract-singletons.perl [-q] target source align
#
#   target  target-side corpus, one whitespace-tokenised sentence per line
#   source  source-side corpus, line-parallel to target
#   align   line-parallel alignment file; each line is a whitespace-separated
#           list of indices read in pairs: target position, source position
#   -q      quiet: do not print the progress counter to STDERR
#
# Output (STDOUT): the qualifying words, sorted, one per line.
#
# Dies if a file cannot be opened, if source or align has fewer lines than
# target, or if an alignment line holds an odd number of tokens.

use strict;
use warnings;
use Getopt::Std;

our $opt_q;    # set by getopts('q')
getopts('q');

# Count arguments instead of testing truthiness, so a file literally named
# "0" is accepted.  Extra arguments are ignored.
@ARGV >= 3 or die "
Usage: extract-singletons.perl target source align

";
my ($target, $source, $align) = @ARGV;

open(my $target_fh, '<', $target) or die "Error: unable to open target file \"$target\"!\n";
open(my $source_fh, '<', $source) or die "Error: unable to open source file \"$source\"!\n";
open(my $align_fh,  '<', $align)  or die "Error: unable to open alignment file \"$align\"!\n";

# Per source word: +1 for every one-to-one link between identical words,
# +2 for every other link.  A final value of exactly 1 therefore means the
# word was seen in a single link, and that link qualified.
my %count;
my $line_no = 0;

while (my $target_line = <$target_fh>) {
    $line_no++;
    unless (defined $opt_q) {
        print STDERR "\r$line_no" if $line_no % 1000 == 0;
    }

    my $source_line = <$source_fh>;
    defined $source_line
        or die "Error: source file \"$source\" ends before target file \"$target\" (line $line_no)\n";
    my $align_line = <$align_fh>;
    defined $align_line
        or die "Error: alignment file \"$align\" ends before target file \"$target\" (line $line_no)\n";

    my @T = split ' ', $target_line;
    my @S = split ' ', $source_line;
    my @A = split ' ', $align_line;

    @A % 2 == 0
        or die "Error: odd number of alignment tokens in \"$align\" (line $line_no)\n";

    # First pass: how many links touch each target / source position.
    my (@source_links, @target_links);
    for (my $i = 0; $i < @A; $i += 2) {
        $target_links[ $A[$i] ]++;
        $source_links[ $A[ $i + 1 ] ]++;
    }

    # Second pass: score the source word of every link.
    for (my $i = 0; $i < @A; $i += 2) {
        my ($t_idx, $s_idx) = @A[ $i, $i + 1 ];
        my $t_word = $T[$t_idx];
        my $s_word = $S[$s_idx];

        # A source index beyond the sentence has no word to score; without
        # this guard it would be counted under the empty string and could be
        # printed as a bogus empty singleton.
        next unless defined $s_word;

        if (   $target_links[$t_idx] == 1
            && $source_links[$s_idx] == 1
            && defined $t_word
            && $t_word eq $s_word)
        {
            $count{$s_word}++;       # Print this if it only occurs here
        }
        else {
            $count{$s_word} += 2;    # Don't print this
        }
    }
}

close($target_fh);
close($source_fh);
close($align_fh);

foreach my $w (sort keys %count) {
    print "$w\n" if $count{$w} == 1;
}
|
BIN
scripts/OSM/flipAlignment
Executable file
BIN
scripts/OSM/flipAlignment
Executable file
Binary file not shown.
BIN
scripts/OSM/generateSequences
Executable file
BIN
scripts/OSM/generateSequences
Executable file
Binary file not shown.
Loading…
Reference in New Issue
Block a user