Transliteration Scripts

This commit is contained in:
Nadir Durrani 2013-12-02 14:49:21 +00:00
parent 1caadce208
commit c527f0db8d
5 changed files with 907 additions and 0 deletions

316
scripts/Transliteration/clean.pl Executable file
View File

@ -0,0 +1,316 @@
#!/usr/bin/perl
#input hindi word urdu word, delete all those entries that have number on any side
use utf8;
use Getopt::Std;
use IO::Handle;
binmode(STDIN, ':utf8');
binmode(STDOUT, ':utf8');
binmode(STDERR, ':utf8');
use open qw(:std :utf8);
# Character-frequency tables filled by charFreqFilterPreprocess().  They are
# hashes, so reset the hashes (assigning () to same-named scalars did nothing).
%srcHash = ();
%trgHash = ();

# The language pair is read from the file name: ".../1-1.hi-ur" gives source
# mark "hi" and target mark "ur".
$file = $ARGV[0];
die "Usage: clean.pl <path/to/1-1.SRC-TRG>\n" unless defined $file;
@f0 = split(/\//, $file);        # if file name has a path
@f1 = split(/\./, $f0[$#f0]);    # last element would be the file name
# Split on the first hyphen only, so a hyphenated target code such as
# "pt-br" stays intact and can match the Latin-script list below.
@f2 = split(/\-/, $f1[1], 2);
$srcMark = $f2[0];
$trgMark = $f2[1];

# Language codes treated as Latin-script.
my %latinScript = map { $_ => 1 } qw(en de es fr it nl pt-br ro sl tr);

$lang  = 0;    # number of Latin-script sides (0, 1 or 2)
$lang1 = 1;    # 1 = source is non-Latin
$lang2 = 1;    # 1 = target is non-Latin
if (defined $srcMark && $latinScript{$srcMark})
{
    print STDERR "Source is Latin\n";
    $lang1 = 0;
    $lang = $lang + 1;
}
if (defined $trgMark && $latinScript{$trgMark})
{
    print STDERR "Target is Latin\n";
    $lang2 = 0;
    $lang = $lang + 1;
}

if ($lang == 2)
{
    print STDERR "No Transliteration Module Possible\n";
}
else
{
    print STDERR "will run Transliteration module\n";
    print STDERR "Three preprocessing steps to do:\n 1) Delete Symbol \t 2) Delete Latin from non-Latin langauge \t 3) Character Frequency based filtering\n";
    print STDERR "STARTING 1 and 2 ...\n";
    # Three-argument open with an explicit failure check; the I/O layer still
    # comes from "use open qw(:std :utf8)".
    open($IN, '<', $file) or die "Cannot open $file: $!\n";
    while (<$IN>)
    {
        chomp;
        # Each filter returns 1 to keep the pair; anything else drops it.
        next unless deleteSymbol($_) == 1;
        next unless deleteEnglish($lang1, $lang2, $_) == 1;
        push(@inputArr, $_);
        charFreqFilterPreprocess($_);
    }
    close($IN);
}

print STDERR "DONE 1 and 2\nSTARTING 3) Preprocessing for Character filtering...\n";
charFreqFilterPreprocess2();
print STDERR "DONE 3\n";

# Emit every surviving pair that passes the character-frequency filter.
foreach (@inputArr)
{
    charFreqFilter($_);
}
###############################Delete English##################################
# deleteEnglish($srcNonLatin, $trgNonLatin, $line)
#
# Decides whether a tab-separated "source<TAB>target" pair survives the
# Latin-letter filter.
#   $srcNonLatin - 1 when the source side is a non-Latin-script language
#   $trgNonLatin - 1 when the target side is a non-Latin-script language
#   $line        - the pair to test; falls back to $_ when omitted
# Returns 1 to keep the pair and 0 to drop it.  A pair is dropped when a side
# flagged as non-Latin contains an ASCII letter; when both sides are flagged
# the whole line is checked.  When neither side is flagged the result is 0.
sub deleteEnglish{
    my ($srcNonLatin, $trgNonLatin, $line) = @_;
    $line = $_ unless defined $line;    # callers that only set $_ keep working
    $line = '' unless defined $line;
    my ($srcWord, $trgWord) = split(/\t/, $line);
    my $latinLetter = qr/[A-Za-z]/;

    if ($srcNonLatin == 1 && $trgNonLatin == 1)
    {
        return ($line =~ $latinLetter) ? 0 : 1;
    }
    elsif ($srcNonLatin == 0 && $trgNonLatin == 1)
    {
        return (defined $trgWord && $trgWord =~ $latinLetter) ? 0 : 1;
    }
    elsif ($srcNonLatin == 1 && $trgNonLatin == 0)
    {
        return (defined $srcWord && $srcWord =~ $latinLetter) ? 0 : 1;
    }
    return 0;
}
###############################Delete Symbol##################################
# deleteSymbol($line)
#
# Decides whether a tab-separated "source<TAB>target" pair survives the
# symbol filter.  $line falls back to $_ when omitted.
# Returns 1 to keep the pair and 0 to drop it.  A pair is dropped when the
# line contains a digit or one of ? ! @ . # % $ - " ( ) & ; \ * + , < >,
# when either side is missing or shorter than three characters, or when both
# sides are identical.
sub deleteSymbol{
    my ($line) = @_;
    $line = $_ unless defined $line;    # callers that only set $_ keep working
    $line = '' unless defined $line;

    return 0 if $line =~ /\d/;
    return 0 if $line =~ /[?!\@.\#%\$\-"()&;\\*+,<>]/;

    my ($src, $trg) = split(/\t/, $line);
    return 0 unless defined $src && defined $trg;
    return 0 if $src eq $trg;
    return 0 if length($src) < 3 || length($trg) < 3;
    return 1;
}
#################################Char Frequency Filter Preprocess########################
# charFreqFilterPreprocess($line)
#
# Adds the characters of one "source<TAB>target" pair to the global frequency
# tables %srcHash and %trgHash.  Both sides are lower-cased first; a pair
# whose lower-cased sides are identical contributes nothing.  $line falls
# back to $_ when omitted.
#
# Every occurrence counts as one, so a character seen once has frequency 1
# (the first occurrence used to be stored as 0).
sub charFreqFilterPreprocess{
    my ($line) = @_;
    $line = $_ unless defined $line;    # callers that only set $_ keep working
    $line = '' unless defined $line;

    my ($src, $trg) = split(/\t/, $line);
    my $srcWrd = defined $src ? lc($src) : '';
    my $trgWrd = defined $trg ? lc($trg) : '';
    return if $srcWrd eq $trgWrd;

    $srcHash{$_}++ foreach split(//, $srcWrd);
    $trgHash{$_}++ foreach split(//, $trgWrd);
    return;
}
##############################Preprocess Two#############################
# charFreqFilterPreprocess2()
#
# Turns the global frequency tables into character sets, once for the source
# side (%srcHash -> %srcChar / %srcBadChar) and once for the target side
# (%trgHash -> %trgChar / %trgBadChar):
#   * the 30 most frequent characters become "good" characters;
#   * from rank 30 onwards a character becomes "bad" when its frequency is
#     below 0.5% of the top frequency, or when its rank is above 50.
sub charFreqFilterPreprocess2{
    my @sides = (
        [\%srcHash, \%srcChar, \%srcBadChar],
        [\%trgHash, \%trgChar, \%trgBadChar],
    );
    foreach my $side (@sides)
    {
        my ($freq, $good, $bad) = @$side;
        # Most frequent character first.
        my @ranked = sort { $freq->{$b} <=> $freq->{$a} } keys %$freq;
        my $cutoff = $freq->{$ranked[0]} * 0.005;
        foreach my $rank (0 .. $#ranked)
        {
            my $ch = $ranked[$rank];
            if ($rank < 30)
            {
                $good->{$ch} = 1;
            }
            elsif ($freq->{$ch} < $cutoff || $rank > 50)
            {
                $bad->{$ch} = 1;
            }
        }
    }
}
###############################CharFreqFiltering###################################
# charFreqFilter($pair)
#
# Final filter for one "source<TAB>target" pair, using the character sets
# built by charFreqFilterPreprocess2().  The pair is printed to STDOUT, one
# side per line with the characters separated by single spaces, only when
#   * no lower-cased source character is in %srcBadChar,
#   * no lower-cased target character is in %trgBadChar,
#   * at least one lower-cased source character is in %srcChar, and
#   * at least one lower-cased target character is in %trgChar.
# The printed text keeps the original letter case.
sub charFreqFilter{
    my ($pair) = @_;
    my ($srcOrig, $trgOrig) = split(/\t/, $pair);
    my @srcLetters = split(//, lc($srcOrig));
    my @trgLetters = split(//, lc($trgOrig));

    # Reject on any rare ("bad") character, source side first.
    return if grep { exists $srcBadChar{$_} } @srcLetters;
    return if grep { exists $trgBadChar{$_} } @trgLetters;

    # Require at least one frequent ("good") character on each side.
    return unless grep { exists $srcChar{$_} } @srcLetters;
    return unless grep { exists $trgChar{$_} } @trgLetters;

    my $spacedSrc = join(" ", split(//, $srcOrig));
    my $spacedTrg = join(" ", split(//, $trgOrig));
    print "$spacedSrc\n$spacedTrg\n";
    return;
}

View File

@ -0,0 +1,86 @@
#!/usr/bin/perl -w
use strict;
use utf8;
use Getopt::Std;
use IO::Handle;
binmode(STDIN, ':utf8');
binmode(STDOUT, ':utf8');
binmode(STDERR, ':utf8');
# Usage: corpusCreator.pl <dir> <pairs-file> <input-ext> <output-ext>
#
# Reads tab-separated "source<TAB>target" pairs from <dir>/<pairs-file> and
# writes character-segmented corpora:
#   <dir>/training/corpus.<ext>   every pair
#   <dir>/training/corpusA.<ext>  every pair not held out for tuning
#   <dir>/tuning/input|reference  every 5th pair, at most 1000 of them
# The corpusA files are deleted again when fewer than 6000 pairs were read.
die "Usage: corpusCreator.pl <dir> <pairs-file> <input-ext> <output-ext>\n"
    unless @ARGV >= 4;
my ($tPath, $tFile, $inp_ext, $op_ext) = @ARGV;

my $DEV_STRIDE      = 5;       # every 5th pair is a tuning candidate
my $DEV_LIMIT       = 1000;    # maximum number of tuning pairs
my $MIN_SPLIT_PAIRS = 6000;    # below this, the corpusA files are removed

# Built-in mkdir instead of a shell call, so odd characters in the path are
# harmless; an already existing directory is fine.
foreach my $dir ("$tPath/training", "$tPath/tuning")
{
    next if -d $dir;
    mkdir($dir) or die "Can't create $dir: $!\n";
}

my @source;
my @target;

open(my $pairs_fh, "<:encoding(UTF-8)", "$tPath/$tFile") or die "Can't open $tPath/$tFile: $!\n";
open(my $all_src_fh, ">:encoding(UTF-8)", "$tPath/training/corpus.$inp_ext") or die "Can't open $tPath/training/corpus.$inp_ext: $!\n";
open(my $all_tgt_fh, ">:encoding(UTF-8)", "$tPath/training/corpus.$op_ext") or die "Can't open $tPath/training/corpus.$op_ext: $!\n";
while (my $line = <$pairs_fh>)
{
    chomp $line;
    my ($src, $tgt) = split(/\t/, $line);
    # A line without both sides would put an empty sentence into the corpus.
    next unless defined $tgt && length($src) && length($tgt);
    my $s = join(' ', split(//, $src));
    my $t = join(' ', split(//, $tgt));
    print $all_src_fh "$s\n";
    print $all_tgt_fh "$t\n";
    push(@source, $s);
    push(@target, $t);
}
close($pairs_fh);
close($all_src_fh) or die "Can't write $tPath/training/corpus.$inp_ext: $!\n";
close($all_tgt_fh) or die "Can't write $tPath/training/corpus.$op_ext: $!\n";

open(my $train_src_fh, ">:encoding(UTF-8)", "$tPath/training/corpusA.$inp_ext") or die "Can't open $tPath/training/corpusA.$inp_ext: $!\n";
open(my $train_tgt_fh, ">:encoding(UTF-8)", "$tPath/training/corpusA.$op_ext") or die "Can't open $tPath/training/corpusA.$op_ext: $!\n";
open(my $dev_src_fh, ">:encoding(UTF-8)", "$tPath/tuning/input") or die "Can't open $tPath/tuning/input: $!\n";
open(my $dev_tgt_fh, ">:encoding(UTF-8)", "$tPath/tuning/reference") or die "Can't open $tPath/tuning/reference: $!\n";

my $corpus_size = @source;
my $dev_size = 0;
foreach my $i (0 .. $#source)
{
    # Pairs at index 4, 9, 14, ... go to the tuning set until it is full.
    if ($i % $DEV_STRIDE == $DEV_STRIDE - 1 && $dev_size < $DEV_LIMIT)
    {
        print $dev_src_fh "$source[$i]\n";
        print $dev_tgt_fh "$target[$i]\n";
        $dev_size++;
    }
    else
    {
        print $train_src_fh "$source[$i]\n";
        print $train_tgt_fh "$target[$i]\n";
    }
}
close($train_src_fh) or die "Can't write $tPath/training/corpusA.$inp_ext: $!\n";
close($train_tgt_fh) or die "Can't write $tPath/training/corpusA.$op_ext: $!\n";
close($dev_src_fh) or die "Can't write $tPath/tuning/input: $!\n";
close($dev_tgt_fh) or die "Can't write $tPath/tuning/reference: $!\n";

if ($corpus_size < $MIN_SPLIT_PAIRS)
{
    unlink("$tPath/training/corpusA.$inp_ext", "$tPath/training/corpusA.$op_ext");
}

View File

@ -0,0 +1,169 @@
#!/usr/bin/perl -w
use strict;
use utf8;
use Getopt::Long "GetOptions";
use FindBin qw($RealBin);
use IO::Handle;
use File::Basename;
binmode(STDIN, ':utf8');
binmode(STDOUT, ':utf8');
binmode(STDERR, ':utf8');
my $OUT_DIR = "/tmp/Transliteration-Phrase-Table.$$";
my ($MOSES_SRC_DIR,$TRANSLIT_MODEL,$OOV_FILE,$EXTERNAL_BIN_DIR, $INPUT_EXTENSION, $OUTPUT_EXTENSION);

die("ERROR: wrong syntax when invoking train-transliteration-PT.pl")
    unless GetOptions('moses-src-dir=s' => \$MOSES_SRC_DIR,
                      'external-bin-dir=s' => \$EXTERNAL_BIN_DIR,
                      'transliteration-model-dir=s' => \$TRANSLIT_MODEL,
                      'input-extension=s' => \$INPUT_EXTENSION,
                      'output-extension=s' => \$OUTPUT_EXTENSION,
                      'out-dir=s' => \$OUT_DIR,
                      'oov-file=s' => \$OOV_FILE);

# check if the files are in place
# --external-bin-dir is checked too: the message lists it as required and
# run_transliteration() interpolates it into the train-model.perl command.
die("ERROR: you need to define --moses-src-dir --external-bin-dir, --transliteration-model-dir, --oov-file, --input-extension, --output-extension")
    unless (defined($MOSES_SRC_DIR) &&
            defined($EXTERNAL_BIN_DIR) &&
            defined($TRANSLIT_MODEL) &&
            defined($OOV_FILE) &&
            defined($INPUT_EXTENSION) &&
            defined($OUTPUT_EXTENSION));
die("ERROR: could not find Transliteration Model '$TRANSLIT_MODEL'")
    unless -e $TRANSLIT_MODEL;
die("ERROR: could not find OOV file '$OOV_FILE'")
    unless -e $OOV_FILE;

# Everything for this run lives under <out-dir>/<oov file name>/.
my $UNK_FILE_NAME = basename($OOV_FILE);
my $WORK_DIR = "$OUT_DIR/$UNK_FILE_NAME";

# List-form system() bypasses the shell and lets failures be reported.
system('mkdir', '-p', "$WORK_DIR/training") == 0
    or die("ERROR: could not create directory '$WORK_DIR/training'");
system('cp', $OOV_FILE, "$WORK_DIR/$UNK_FILE_NAME") == 0
    or die("ERROR: could not copy '$OOV_FILE' to '$WORK_DIR/$UNK_FILE_NAME'");

my $translitFile = "$WORK_DIR/$UNK_FILE_NAME.translit";

print "Preparing for Transliteration\n";
prepare_for_transliteration ($OOV_FILE , $translitFile);
print "Run Transliteration\n";
run_transliteration ($MOSES_SRC_DIR , $EXTERNAL_BIN_DIR , $TRANSLIT_MODEL , $translitFile);
print "Form Transliteration Corpus\n";
form_corpus ($translitFile , $translitFile.".op.nBest" , $OUT_DIR);
################### Read the UNK word file and prepare for Transliteration ###############################
# prepare_for_transliteration($testFile, $translitFile)
#
# Reads the whitespace-separated words in $testFile (UTF-8) and writes each
# distinct word once to $translitFile, in order of first appearance, one word
# per line with its characters separated by single spaces.
# Dies when either file cannot be opened or the output cannot be written.
sub prepare_for_transliteration
{
    my ($testFile, $translitFile) = @_;
    my %seen;
    my @unknown;

    open(my $in, "<:encoding(UTF-8)", $testFile) or die "Can't open $testFile: $!\n";
    while (my $line = <$in>)
    {
        chomp $line;
        # split(' ') skips leading whitespace and collapses runs, so repeated
        # spaces never produce an empty "word" (and an empty output line).
        foreach my $word (split(' ', $line))
        {
            push(@unknown, $word) unless $seen{$word}++;
        }
    }
    close($in);

    open(my $out, ">:encoding(UTF-8)", $translitFile) or die "Can't open $translitFile: $!\n";
    foreach my $word (@unknown)
    {
        my $src = join(' ', split(//, $word));
        print $out "$src\n";
    }
    close($out) or die "Can't write $translitFile: $!\n";
}
################### Run Transliteration Module to Obtain Transliterations ###############################
# run_transliteration($MOSES_SRC, $EXTERNAL_BIN_DIR, $TRANSLIT_MODEL, $eval_file)
#
# Runs a fixed sequence of external Moses commands on $eval_file:
#   1. train-model.perl from step 9, writing the config $eval_file.moses.table.ini
#   2. filter-model-given-input.pl, producing $eval_file.filtered
#   3. substitute-filtered-tables-and-weights.perl, producing $eval_file.filtered.ini
#   4. the moses decoder, writing $eval_file.op and the 50-best list
#      $eval_file.op.nBest
# Also reads the file-level $INPUT_EXTENSION and $OUTPUT_EXTENSION.
# Every command runs through backticks and its output and exit status are
# discarded, so a failing step is not detected here.
sub run_transliteration
{
my @list = @_;
my $MOSES_SRC = $list[0];
my $EXTERNAL_BIN_DIR = $list[1];
my $TRANSLIT_MODEL = $list[2];
my $eval_file = $list[3];
# Create an empty file; its path is passed below both as the -config target
# and as the file-name part of the -lm argument.
`touch $eval_file.moses.table.ini`;
print "Filter Table\n";
`$MOSES_SRC/scripts/training/train-model.perl -mgiza -mgiza-cpus 10 -dont-zip -first-step 9 -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 -reordering msd-bidirectional-fe -score-options '--KneserNey' -phrase-translation-table $TRANSLIT_MODEL/model/phrase-table -reordering-table $TRANSLIT_MODEL/model/reordering-table -config $eval_file.moses.table.ini -lm 0:3:$eval_file.moses.table.ini:8`;
`$MOSES_SRC/scripts/training/filter-model-given-input.pl $eval_file.filtered $eval_file.moses.table.ini $eval_file -Binarizer "$MOSES_SRC/bin/processPhraseTable"`;
# The temporary config is no longer needed once the tables are filtered.
`rm $eval_file.moses.table.ini`;
print "Apply Filter\n";
`$MOSES_SRC/scripts/ems/support/substitute-filtered-tables-and-weights.perl $eval_file.filtered/moses.ini $TRANSLIT_MODEL/model/moses.ini $TRANSLIT_MODEL/tuning/moses.tuned.ini $eval_file.filtered.ini`;
# Decode with distortion limit 0, dropping unknown tokens.
`$MOSES_SRC/bin/moses -search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000 -threads 16 -drop-unknown -distortion-limit 0 -n-best-list $eval_file.op.nBest 50 -f $eval_file.filtered.ini < $eval_file > $eval_file.op`;
}
################### Read the output of Transliteration Model and Form Corpus ###############################
# form_corpus($inp_file, $testFile, $EVAL_DIR)
#
# Reads the n-best list $testFile (UTF-8), whose lines have the form
#   <id> ||| <space-separated hypothesis> ||| <feature scores> ||| <total> ...
# and prints, for every line, the hypothesis with its spaces removed followed
# by " \t " and the first token after the third "|||".
# $inp_file and $EVAL_DIR are accepted for the caller's sake but not used.
#
# Lines without the expected "|||" separators are reported on STDERR and
# skipped; scanning is bounded by the number of tokens so such a line cannot
# keep the loops running forever.
sub form_corpus
{
    my ($inp_file, $testFile, $EVAL_DIR) = @_;

    open(my $nbest, "<:encoding(UTF-8)", $testFile) or die "Can't open $testFile: $!\n";
    while (my $line = <$nbest>)
    {
        chomp $line;
        my @words = split(/ /, $line);
        my $i = 2;    # tokens 0 and 1 are the sentence id and the first "|||"
        my $thisStr = "";
        # Glue the hypothesis characters back together.
        while ($i < @words && $words[$i] ne "|||")
        {
            $thisStr = $thisStr . $words[$i];
            $i++;
        }
        $i++;
        # Skip the feature-score field.
        while ($i < @words && $words[$i] ne "|||")
        {
            $i++;
        }
        $i++;
        if ($i >= @words)
        {
            warn "Skipping malformed n-best line $. in $testFile\n";
            next;
        }
        my $prob = $words[$i];
        print "$thisStr \t $prob\n";
    }
    close($nbest);
}

View File

@ -0,0 +1,30 @@
#!/usr/bin/perl
use utf8;
require Encode;
use IO::Handle;
# Usage: echo <threshold> | threshold.pl <pair-probs file>
#
# Reads a numeric threshold from the first line of STDIN, then prints
# "field0<TAB>field1" for every line of the tab-separated file whose last
# field is numerically below the threshold.
$input = <STDIN>;
die "Error: missing threshold on standard input!\n" unless defined $input;
chomp $input;
#print $input;
$filename = shift or die "Error: missing hindi urdu file argument!\n";
# Three-argument open: the file name is never interpreted as a mode or pipe.
open(my $pairs, '<', $filename) or die "Error: unable to open file \"$filename\"!\n";
binmode(STDIN, ':utf8');
binmode(STDOUT, ':utf8');
binmode(STDERR, ':utf8');
binmode($pairs, ':utf8');
while (my $line = <$pairs>)
{
    chomp $line;
    my @F = split(/\t/, $line);
    # Two words plus a score are needed; a blank or short line would
    # otherwise compare as 0 and be printed as a bogus pair.
    next if @F < 3;
    if ($F[$#F] < $input)
    {
        print "$F[0]\t$F[1]\n";
    }
}
close($pairs);

View File

@ -0,0 +1,306 @@
#!/usr/bin/perl -w
use utf8;
use strict;
use Getopt::Long "GetOptions";
use FindBin qw($RealBin);
binmode(STDIN, ':utf8');
binmode(STDOUT, ':utf8');
binmode(STDERR, ':utf8');
print STDERR "Training Transliteration Module - Start\n".`date`;

my $ORDER = 5;
my $OUT_DIR = "/tmp/Transliteration-Model.$$";
my $___FACTOR_DELIMITER = "|";
my ($MOSES_SRC_DIR,$CORPUS_F,$CORPUS_E,$ALIGNMENT,$SRILM_DIR,$FACTOR,$EXTERNAL_BIN_DIR,$INPUT_EXTENSION, $OUTPUT_EXTENSION);

# utilities
my $ZCAT = "gzip -cd";
my $BZCAT = "bzcat";

die("ERROR: wrong syntax when invoking TransliterationModel.perl")
    unless GetOptions('moses-src-dir=s' => \$MOSES_SRC_DIR,
                      'external-bin-dir=s' => \$EXTERNAL_BIN_DIR,
                      'input-extension=s' => \$INPUT_EXTENSION,
                      'output-extension=s' => \$OUTPUT_EXTENSION,
                      'corpus-f=s' => \$CORPUS_F,
                      'corpus-e=s' => \$CORPUS_E,
                      'alignment=s' => \$ALIGNMENT,
                      'order=i' => \$ORDER,
                      'factor=s' => \$FACTOR,
                      'srilm-dir=s' => \$SRILM_DIR,
                      'out-dir=s' => \$OUT_DIR);

# check if the files are in place
die("ERROR: you need to define --corpus-e, --corpus-f, --alignment, --srilm-dir, --moses-src-dir --external-bin-dir, --input-extension and --output-extension")
    unless (defined($MOSES_SRC_DIR) &&
            defined($CORPUS_F) &&
            defined($CORPUS_E) &&
            defined($ALIGNMENT) &&
            defined($INPUT_EXTENSION) &&
            defined($OUTPUT_EXTENSION) &&
            defined($EXTERNAL_BIN_DIR) &&
            defined($SRILM_DIR));
die("ERROR: could not find input corpus file '$CORPUS_F'")
    unless -e $CORPUS_F;
die("ERROR: could not find output corpus file '$CORPUS_E'")
    unless -e $CORPUS_E;
die("ERROR: could not find algnment file '$ALIGNMENT'")
    unless -e $ALIGNMENT;

# create factors
`mkdir $OUT_DIR`;
if (defined($FACTOR)) {
    my @factor_values = split(',', $FACTOR);
    foreach my $factor_val (@factor_values) {
        `mkdir $OUT_DIR/$factor_val`;
        my ($factor_f,$factor_e) = split(/\-/,$factor_val);

        # Split each corpus path into stem and extension.  The extension is
        # the second capture group (not $OUT_DIR), and the match is checked
        # so a stale $1/$2 can never be used.
        my ($corpus_stem_f,$ext_f) = $CORPUS_F =~ /^(.+)\.([^\.]+)/
            or die("ERROR: input corpus file '$CORPUS_F' has no extension");
        my ($corpus_stem_e,$ext_e) = $CORPUS_E =~ /^(.+)\.([^\.]+)/
            or die("ERROR: output corpus file '$CORPUS_E' has no extension");

        reduce_factors($CORPUS_F,"$corpus_stem_f.$factor_val.$ext_f",$factor_f);
        reduce_factors($CORPUS_E,"$corpus_stem_e.$factor_val.$ext_e",$factor_e);
        `ln -s $corpus_stem_f.$factor_val.$ext_f $OUT_DIR/$factor_val/f`;
        `ln -s $corpus_stem_e.$factor_val.$ext_e $OUT_DIR/$factor_val/e`;
        `ln -s $ALIGNMENT $OUT_DIR/$factor_val/a`;
        mine_transliterations($factor_val, $INPUT_EXTENSION, $OUTPUT_EXTENSION);
    }
}
else {
    `ln -s $CORPUS_F $OUT_DIR/f`;
    `ln -s $CORPUS_E $OUT_DIR/e`;
    `ln -s $ALIGNMENT $OUT_DIR/a`;
    mine_transliterations("", $INPUT_EXTENSION, $OUTPUT_EXTENSION);
}

train_transliteration_module();
retrain_transliteration_module();

# create model
print "Training Transliteration Module - End ".`date`;
# learn_transliteration_model($t)
#
# Trains the character-level model from $OUT_DIR/training/corpus$t.* by
# running train-model.perl step by step (steps 1-7 and 9), building the
# target language model with SRILM ngram-count, and binarising it with
# build_binary.  $t is the corpus suffix: "A" selects corpusA, "" selects
# corpus.
#
# The language-model order comes from the --order option ($ORDER, default 5)
# instead of a hard-coded 5, both for ngram-count and for the -lm entry of
# the generated moses.ini.
#
# Every command runs through backticks and its output and exit status are
# discarded, so a failing step is not detected here.
sub learn_transliteration_model{
    my ($t) = @_;

    # Pieces shared by every train-model.perl call; each command is
    # "$train <step range> $common <step-specific options>".
    my $train  = "$MOSES_SRC_DIR/scripts/training/train-model.perl -mgiza -mgiza-cpus 10 -dont-zip";
    my $common = "-external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 -reordering msd-bidirectional-fe -score-options '--KneserNey'";
    my $corpus = "$OUT_DIR/training/corpus$t";

    `cp $corpus.$OUTPUT_EXTENSION $OUT_DIR/lm/target`;

    print "Align Corpus\n";
    `$train -last-step 1 $common -corpus $corpus -corpus-dir $OUT_DIR/training/prepared`;
    `$train -first-step 2 -last-step 2 $common -corpus-dir $OUT_DIR/training/prepared -giza-e2f $OUT_DIR/training/giza -direction 2`;
    `$train -first-step 2 -last-step 2 $common -corpus-dir $OUT_DIR/training/prepared -giza-f2e $OUT_DIR/training/giza-inverse -direction 1`;
    `$train -first-step 3 -last-step 3 $common -giza-e2f $OUT_DIR/training/giza -giza-f2e $OUT_DIR/training/giza-inverse -alignment-file $OUT_DIR/model/aligned -alignment-stem $OUT_DIR/model/aligned -alignment grow-diag-final-and`;

    print "Train Translation Models\n";
    `$train -first-step 4 -last-step 4 $common -lexical-file $OUT_DIR/model/lex -alignment-file $OUT_DIR/model/aligned -alignment-stem $OUT_DIR/model/aligned -corpus $corpus`;
    `$train -first-step 5 -last-step 5 $common -alignment-file $OUT_DIR/model/aligned -alignment-stem $OUT_DIR/model/aligned -extract-file $OUT_DIR/model/extract -corpus $corpus`;
    `$train -first-step 6 -last-step 6 $common -extract-file $OUT_DIR/model/extract -lexical-file $OUT_DIR/model/lex -phrase-translation-table $OUT_DIR/model/phrase-table`;
    `$train -first-step 7 -last-step 7 $common -extract-file $OUT_DIR/model/extract -reordering-table $OUT_DIR/model/reordering-table`;

    print "Train Language Models\n";
    `$SRILM_DIR/ngram-count -order ${ORDER} -interpolate -kndiscount -addsmooth1 0.0 -unk -text $OUT_DIR/lm/target -lm $OUT_DIR/lm/targetLM`;
    `$MOSES_SRC_DIR/bin/build_binary $OUT_DIR/lm/targetLM $OUT_DIR/lm/targetLM.bin`;

    print "Create Config File\n";
    `$train -first-step 9 $common -phrase-translation-table $OUT_DIR/model/phrase-table -reordering-table $OUT_DIR/model/reordering-table -config $OUT_DIR/model/moses.ini -lm 0:${ORDER}:$OUT_DIR/lm/targetLM.bin:8`;
}
# retrain_transliteration_module()
#
# When $OUT_DIR/training/corpusA.$OUTPUT_EXTENSION exists, removes the model,
# lm, giza, giza-inverse and prepared directories, recreates empty model and
# lm directories, and trains again on the corpus without a suffix.
# Does nothing when that file is absent.
sub retrain_transliteration_module{
    return unless -e "$OUT_DIR/training/corpusA.$OUTPUT_EXTENSION";

    foreach my $stale (qw(model lm training/giza training/giza-inverse training/prepared))
    {
        `rm -r $OUT_DIR/$stale`;
    }
    foreach my $fresh (qw(model lm))
    {
        `mkdir $OUT_DIR/$fresh`;
    }
    learn_transliteration_model("");
}
# train_transliteration_module()
#
# Builds and tunes the transliteration model under $OUT_DIR:
#   1. creates the model and lm directories;
#   2. runs corpusCreator.pl on the mined pairs;
#   3. trains on corpusA when corpusCreator.pl left it in place, otherwise on
#      the corpus without a suffix;
#   4. filters the model for tuning/input, runs mert-moses.pl, and writes
#      tuning/moses.tuned.ini with substitute-weights.perl.
# Every command runs through backticks and its output and exit status are
# discarded, so a failing step is not detected here.
sub train_transliteration_module{
`mkdir $OUT_DIR/model`;
`mkdir $OUT_DIR/lm`;
print "Preparing Corpus\n";
`$MOSES_SRC_DIR/scripts/Transliteration/corpusCreator.pl $OUT_DIR 1-1.$INPUT_EXTENSION-$OUTPUT_EXTENSION.mined-pairs $INPUT_EXTENSION $OUTPUT_EXTENSION`;
# Prefer the corpus variant with the "A" suffix when it exists.
if (-e "$OUT_DIR/training/corpusA.$OUTPUT_EXTENSION")
{
learn_transliteration_model("A");
}
else
{
learn_transliteration_model("");
}
print "Running Tuning for Transliteration Module\n";
# Create an empty file; its path is passed below both as the -config target
# and as the file-name part of the -lm argument.
`touch $OUT_DIR/tuning/moses.table.ini`;
`$MOSES_SRC_DIR/scripts/training/train-model.perl -mgiza -mgiza-cpus 10 -dont-zip -first-step 9 -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 -reordering msd-bidirectional-fe -score-options '--KneserNey' -phrase-translation-table $OUT_DIR/model/phrase-table -reordering-table $OUT_DIR/model/reordering-table -config $OUT_DIR/tuning/moses.table.ini -lm 0:3:$OUT_DIR/tuning/moses.table.ini:8`;
`$MOSES_SRC_DIR/scripts/training/filter-model-given-input.pl $OUT_DIR/tuning/filtered $OUT_DIR/tuning/moses.table.ini $OUT_DIR/tuning/input -Binarizer "$MOSES_SRC_DIR/bin/processPhraseTable"`;
# The temporary config is no longer needed once the tables are filtered.
`rm $OUT_DIR/tuning/moses.table.ini`;
`$MOSES_SRC_DIR/scripts/ems/support/substitute-filtered-tables.perl $OUT_DIR/tuning/filtered/moses.ini < $OUT_DIR/model/moses.ini > $OUT_DIR/tuning/moses.filtered.ini`;
`$MOSES_SRC_DIR/scripts/training/mert-moses.pl $OUT_DIR/tuning/input $OUT_DIR/tuning/reference $MOSES_SRC_DIR/bin/moses $OUT_DIR/tuning/moses.filtered.ini --nbest 100 --working-dir $OUT_DIR/tuning/tmp --decoder-flags "-threads 16 -drop-unknown -v 0 -distortion-limit 0" --rootdir $MOSES_SRC_DIR/scripts -mertdir $MOSES_SRC_DIR/mert -threads=16 --no-filter-phrase-table`;
# Keep the tuned configuration next to the tuning data, then combine it with
# the model configuration into tuning/moses.tuned.ini.
`cp $OUT_DIR/tuning/tmp/moses.ini $OUT_DIR/tuning/moses.ini`;
`$MOSES_SRC_DIR/scripts/ems/support/substitute-weights.perl $OUT_DIR/model/moses.ini $OUT_DIR/tuning/moses.ini $OUT_DIR/tuning/moses.tuned.ini`;
}
# mine_transliterations($factor_val, $inp_ext, $op_ext)
#
# Mines transliteration pairs from the f/e/a files in $OUT_DIR/$factor_val:
# extracts 1-1 alignments, cleans them with clean.pl, runs TMining unless a
# pair-probs file is already in place, and finally selects pairs with
# threshold.pl fed the value 0.5.  All intermediate files share the stem
# $OUT_DIR/$factor_val/1-1.$inp_ext-$op_ext.
sub mine_transliterations{
    my ($factor_val, $inp_ext, $op_ext) = @_;
    my $dir  = "$OUT_DIR/$factor_val";
    my $stem = "$dir/1-1.${inp_ext}-${op_ext}";

    print "Creating Model $factor_val\n";
    print "Extracting 1-1 Alignments\n";
    `$MOSES_SRC_DIR/bin/1-1-Extraction $dir/f $dir/e $dir/a > $stem`;

    print "Cleaning the list for Miner\n";
    `$MOSES_SRC_DIR/scripts/Transliteration/clean.pl $stem > $stem.cleaned`;

    if (! -e "$stem.pair-probs")
    {
        print "Extracting Transliteration Pairs \n";
        `$MOSES_SRC_DIR/bin/TMining $stem.cleaned > $stem.pair-probs`;
    }
    else
    {
        print STDERR "1-1.${inp_ext}-${op_ext}.pair-probs in place, reusing\n";
    }

    print "Selecting Transliteration Pairs with threshold 0.5 \n";
    `echo 0.5 | $MOSES_SRC_DIR/scripts/Transliteration/threshold.pl $stem.pair-probs > $stem.mined-pairs`;
}
# from train-model.perl
# reduce_factors($full, $reduced, $factors)
#
# Writes to $reduced a copy of the factored corpus $full that keeps only the
# factors listed in $factors (comma-separated indices), joined by "|".
#   * If $reduced or $reduced.gz already exists it is reused.
#   * If the requested factors are exactly the ones present in the first
#     token of $full, a symlink is created instead of a copy.
#   * While "$reduced.lock" exists the sub waits; the lock is created while
#     this process writes $reduced and removed afterwards, including when
#     the sub dies part-way through.
# Dies when $full is empty, a token lacks a requested factor, or a file
# cannot be created.
sub reduce_factors {
    my ($full,$reduced,$factors) = @_;
    my @INCLUDE = sort {$a <=> $b} split(/,/,$factors);

    print "Reducing factors to produce $reduced @ ".`date`;
    while(-e $reduced.".lock") {
        sleep(10);
    }
    if (-e $reduced) {
        print STDERR " $reduced in place, reusing\n";
        return;
    }
    if (-e $reduced.".gz") {
        print STDERR " $reduced.gz in place, reusing\n";
        return;
    }

    # peek at input, to check if we are asked to produce exactly the
    # available factors
    my $inh = open_or_zcat($full);
    my $firstline = <$inh>;
    die "Corpus file $full is empty" unless $firstline;
    close $inh;
    # pick first word
    $firstline =~ s/^\s*//;
    $firstline =~ s/\s.*//;
    # count factors
    my $maxfactorindex = $firstline =~ tr/|/|/;
    if (join(",", @INCLUDE) eq join(",", 0..$maxfactorindex)) {
        # create just symlink; preserving compression
        my $realfull = $full;
        if (!-e $realfull && -e $realfull.".gz") {
            $realfull .= ".gz";
            $reduced =~ s/(\.gz)?$/.gz/;
        }
        # Built-in symlink: no shell quoting issues and no helper needed.
        symlink($realfull, $reduced)
            or die "Failed to create symlink $realfull -> $reduced";
        return;
    }

    # The default is to select the needed factors
    my $lock = $reduced.".lock";
    open(my $lockfh, '>>', $lock) or die "ERROR: Can't create $lock";
    close($lockfh);
    # Remove the lock before dying so waiting processes are not stuck.
    my $fail = sub { unlink($lock); die @_; };

    my $in = open_or_zcat($full);
    open(my $out, '>', $reduced) or $fail->("ERROR: Can't write $reduced");
    my $nr = 0;
    while (my $line = <$in>) {
        $nr++;
        print STDERR "." if $nr % 10000 == 0;
        print STDERR "($nr)" if $nr % 100000 == 0;
        chomp $line;
        $line =~ s/ +/ /g;
        $line =~ s/^ //;
        $line =~ s/ $//;
        my @reduced_tokens;
        foreach my $token (split(' ', $line)) {
            # \Q causes to disable metacharacters in regex
            my @FACTOR = split(/\Q$___FACTOR_DELIMITER/, $token);
            my @kept;
            foreach my $outfactor (@INCLUDE) {
                my $value = $FACTOR[$outfactor];
                $fail->("ERROR: Couldn't find factor $outfactor in token \"$token\" in $full LINE $nr")
                    if !defined $value;
                push(@kept, $value);
            }
            push(@reduced_tokens, join("|", @kept));
        }
        print $out join(" ", @reduced_tokens), "\n";
    }
    print STDERR "\n";
    close($out) or $fail->("ERROR: Can't write $reduced");
    close($in);
    unlink($lock);
}
# open_or_zcat($fn)
#
# Opens $fn for reading and returns the file handle.  When $fn does not exist
# but "$fn.gz" or "$fn.bz2" does, that file is used instead.  Names ending in
# .gz are read through $ZCAT and names ending in .bz2 through $BZCAT;
# anything else is opened directly.
# Dies when the file or the decompressor cannot be opened.
sub open_or_zcat {
    my $fn = shift;
    $fn = $fn.".gz" if ! -e $fn && -e $fn.".gz";
    $fn = $fn.".bz2" if ! -e $fn && -e $fn.".bz2";

    my $hdl;
    if ($fn =~ /\.bz2$/) {
        # List-form pipe open: the file name never passes through a shell.
        my @cmd = (split(' ', $BZCAT), $fn);
        open($hdl, '-|', @cmd) or die "Can't read $fn ($BZCAT $fn|): $!";
    } elsif ($fn =~ /\.gz$/) {
        my @cmd = (split(' ', $ZCAT), $fn);
        open($hdl, '-|', @cmd) or die "Can't read $fn ($ZCAT $fn|): $!";
    } else {
        # Three-argument open: whitespace or mode characters in the name are
        # taken literally.
        open($hdl, '<', $fn) or die "Can't read $fn ($fn): $!";
    }
    return $hdl;
}