OSM Training Script

This commit is contained in:
Nadir Durrani 2013-07-02 13:59:47 +01:00
parent 59eac56012
commit 82d6105f05
4 changed files with 79 additions and 0 deletions

33
scripts/OSM/OSM-Train.sh Executable file
View File

@ -0,0 +1,33 @@
#!/bin/sh
# Train an Operation Sequence Model (OSM).
#
# Usage: OSM-Train.sh CORPUS_E CORPUS_F ALIGNMENT LM_ORDER WORK_DIR
#   $1  corpus linked as WORK_DIR/e (handed to extract-singletons.perl as
#       its "target" argument)
#   $2  corpus linked as WORK_DIR/f (handed to extract-singletons.perl as
#       its "source" argument)
#   $3  word-alignment file; it is run through ./flipAlignment first
#   $4  n-gram order given to ngram-count
#   $5  working directory that receives every intermediate and output file
#
# Outputs in WORK_DIR: e, f (symlinks), align, Singletons, opCorpus,
# operationLM<order> and operationLM<order>.bin.
#
# The helper programs are referenced by relative path (./flipAlignment,
# ./extract-singletons.perl, ./generateSequences, ../../bin/build_binary),
# so the script has to be started from the directory that contains them.

# NOTE(review): site-specific SRILM location; adjust for the local install.
PATH=$PATH:/fs/hel1/nadir/SRILM/bin/i686-m64/

if [ $# -lt 5 ]; then
    echo "Usage: $0 corpus-e corpus-f alignment lm-order work-dir" >&2
    exit 1
fi

# Print the argument as an absolute path.  A symlink target is resolved
# relative to the directory holding the link, so a relative corpus path
# would otherwise dangle whenever WORK_DIR is not the current directory.
absolute_path() {
    case "$1" in
        /*) printf '%s\n' "$1" ;;
        *)  printf '%s\n' "$PWD/$1" ;;
    esac
}

echo 'Training OSM - Start'
date

# Make sure the working directory exists before anything is written to it.
mkdir -p "$5" || exit 1

# -f: stay quiet on a first run, when none of these files exist yet.
rm -f "$5/e" "$5/f" "$5/align"
ln -s "$(absolute_path "$1")" "$5/e"
ln -s "$(absolute_path "$2")" "$5/f"
./flipAlignment "$3" > "$5/align"

echo 'Extracting Singletons'
./extract-singletons.perl "$5/e" "$5/f" "$5/align" > "$5/Singletons"

echo 'Converting Bilingual Sentence Pair into Operation Corpus'
./generateSequences "$5/e" "$5/f" "$5/align" "$5/Singletons" > "$5/opCorpus" # Generates Operation Corpus

echo 'Learning Operation Sequence Translation Model'
ngram-count -kndiscount -order "$4" -unk -text "$5/opCorpus" -lm "$5/operationLM$4"
../../bin/build_binary -i "$5/operationLM$4" "$5/operationLM$4.bin"

echo 'Training OSM - End'
date

View File

@ -0,0 +1,46 @@
#!/usr/bin/perl
# extract-singletons.perl -- list source words that occur in exactly one
# alignment link in the whole corpus, where that link is one-to-one and the
# linked target word is spelled identically.
#
# Usage: extract-singletons.perl [-q] target source align
#   target, source  sentence-per-line, whitespace-tokenised corpora
#   align           one line per sentence pair: whitespace-separated integers
#                   read in pairs as "target_index source_index ..."
#                   (0-based positions into the token arrays)
#   -q              suppress the progress counter written to STDERR
#
# Output (STDOUT): the qualifying source words, one per line, sorted.
use strict;
use warnings;
use Getopt::Std;

my %opts;
getopts('q', \%opts);

my $target = shift;
my $source = shift;
my $align = shift or die "
Usage: extract-singletons.perl target source align
";

# Three-argument open: the file name is never interpreted as a mode or pipe.
open(my $target_fh, '<', $target) or die "Error: unable to open target file \"$target\"!\n";
open(my $source_fh, '<', $source) or die "Error: unable to open source file \"$source\"!\n";
open(my $align_fh,  '<', $align)  or die "Error: unable to open alignment file \"$align\"!\n";

# Per source word: +1 for every one-to-one link to an identical target word,
# +2 for every other link.  A final value of exactly 1 therefore means "one
# link in total, and it was a one-to-one identity link".
my %count;
my $line_no = 0;

while (my $target_line = <$target_fh>) {
    unless (defined $opts{q}) {
        print STDERR "\r$line_no" if ++$line_no % 1000 == 0;
    }

    my $source_line = <$source_fh>;
    my $align_line  = <$align_fh>;
    unless (defined $source_line && defined $align_line) {
        # The three files are read in lock step; once one of them runs out
        # no further links can be counted, so stop and report the mismatch.
        warn "\nWarning: source or alignment file has fewer lines than target file (target line $.)\n";
        last;
    }

    my @target_words = split ' ', $target_line;
    my @source_words = split ' ', $source_line;
    my @links        = split ' ', $align_line;

    # Number of links touching each target / source position in this pair.
    my (@source_links, @target_links);

    # "$i < $#links" visits complete (target, source) pairs only; a dangling
    # odd token at the end of the line is ignored.
    for (my $i = 0; $i < $#links; $i += 2) {
        $target_links[ $links[$i] ]++;
        $source_links[ $links[ $i + 1 ] ]++;
    }

    for (my $i = 0; $i < $#links; $i += 2) {
        my $t_idx = $links[$i];
        my $s_idx = $links[ $i + 1 ];

        # A link past the end of the source sentence names no word; skip it
        # rather than counting an empty key.
        my $source_word = $source_words[$s_idx];
        next unless defined $source_word;

        my $target_word = $target_words[$t_idx];

        if (   defined $target_word
            && $target_links[$t_idx] == 1
            && $source_links[$s_idx] == 1
            && $target_word eq $source_word)
        {
            $count{$source_word}++;       # Print this if it only occurs here
        }
        else {
            $count{$source_word} += 2;    # Don't print this
        }
    }
}

close $target_fh;
close $source_fh;
close $align_fh;

foreach my $word (sort keys %count) {
    print "$word\n" if $count{$word} == 1;
}

BIN
scripts/OSM/flipAlignment Executable file

Binary file not shown.

BIN
scripts/OSM/generateSequences Executable file

Binary file not shown.