mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2025-01-08 12:36:35 +03:00
6a27dc4f17
git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@4084 1f5c12ca-751b-0410-a591-d2e778427230
125 lines
3.1 KiB
Perl
Executable File
125 lines
3.1 KiB
Perl
Executable File
#! /usr/bin/perl

# Splits a parallel corpus into slices, runs the training command on every
# slice in its own child process and concatenates the per-slice alignment
# files into one output file.
#
# usage:
#   giza-parallel.perl NUM-PARALLEL SPLIT-CMD TRAIN-CMD INPUT-EXT OUTPUT-EXT CORPUS ALIGN
#
#   NUM-PARALLEL  number of corpus slices, one child process per slice
#   SPLIT-CMD     command used to cut the corpus files; it is called with
#                 the options -d, -l and -a
#   TRAIN-CMD     training script that is run on every slice
#   INPUT-EXT     extension of the source side, read from CORPUS.INPUT-EXT
#   OUTPUT-EXT    extension of the target side, read from CORPUS.OUTPUT-EXT
#   CORPUS        corpus path without the extension
#   ALIGN         file that receives the concatenated alignments
#
# example
# ~/giza-parallel.perl 10 split ~/workspace/sourceforge/trunk/scripts/training/train-model.perl ar en train align

use strict;
use File::Basename;

sub NumStr($);

print "Started ".localtime() ."\n";

# All seven arguments are required; without this check a missing one only
# shows up later as a broken shell command.
if (@ARGV < 7)
{
  die "usage: $0 num-parallel split-cmd train-cmd input-ext output-ext corpus align\n";
}

my $numParallel = $ARGV[0];
my $splitCmd = $ARGV[1];
my $trainCmd = $ARGV[2];
my $inputExt = $ARGV[3];
my $outputExt = $ARGV[4];
my $corpus = $ARGV[5];
my $align = $ARGV[6];

# The slice size is obtained by dividing by this value, so it has to be a
# positive integer.
unless ($numParallel =~ /\A\d+\z/ && $numParallel > 0)
{
  die "num-parallel must be a positive integer, got '$numParallel'\n";
}

# Working directory for the slices, created next to the alignment file.
# The process id in the name keeps concurrent runs apart.
my $TMPDIR=dirname($align) ."/tmp.$$";
mkdir($TMPDIR) or die "cannot create directory $TMPDIR: $!\n";

# Handed to the training command as -scripts-root-dir.
my $scriptDir=dirname($trainCmd) ."/..";

# split corpus file
#
# Both sides are cut with the same number of lines per slice, so slice N of
# the source lines up with slice N of the target.

# Count the lines directly instead of parsing the output of wc.
my $sourceFile = "$corpus.$inputExt";
open(my $sourceFh, '<', $sourceFile) or die "cannot read $sourceFile: $!\n";
my $totalLines = 0;
while (my $line = <$sourceFh>)
{
  ++$totalLines;
}
close($sourceFh);
die "$sourceFile contains no lines\n" if $totalLines == 0;

# Ceiling division. The former int(total / n) + 1 made the slices one line
# too long whenever the corpus divided evenly, which left the last slice
# missing (10 lines in 5 parts gave 3 lines per slice, i.e. only 4 files).
my $linesPerSplit = int(($totalLines + $numParallel - 1) / $numParallel);

foreach my $side (["source", $inputExt], ["target", $outputExt])
{
  my ($sideName, $sideExt) = @$side;
  my $splitCall = "$splitCmd -d -l $linesPerSplit -a 5 $corpus.$sideExt $TMPDIR/$sideName.";
  `$splitCall`;
  die "split failed (status $?): $splitCall\n" if $? != 0;
}

# Move <side>.<number> to <number>.<side>, the naming the training command
# is given through -corpus, -f and -e. Even with ceiling division fewer
# slices than requested can come out (11 lines in 5 parts gives 4 files),
# so only the slices that really exist are counted.
my $numSlices = 0;
for (my $i = 0; $i < $numParallel; ++$i)
{
  my $numStr = NumStr($i);
  last unless -e "$TMPDIR/source.$numStr";

  rename("$TMPDIR/source.$numStr", "$TMPDIR/$numStr.source")
    or die "cannot rename $TMPDIR/source.$numStr: $!\n";
  rename("$TMPDIR/target.$numStr", "$TMPDIR/$numStr.target")
    or die "cannot rename $TMPDIR/target.$numStr: $!\n";
  ++$numSlices;
}

die "splitting $sourceFile produced no slices in $TMPDIR\n" if $numSlices == 0;

if ($numSlices < $numParallel)
{
  # The rest of the script loops over $numParallel, so lower it to the number
  # of slices that exist instead of working on missing files.
  print STDERR "only $numSlices slices were produced, running $numSlices processes instead of $numParallel\n";
  $numParallel = $numSlices;
}

#fork & run giza & friends
#
# One child process is started per slice. Each child runs the training
# command four times in a row: up to step 1, step 2 with -direction 2,
# step 2 with -direction 1, and step 3, which is given the alignment file
# name $TMPDIR/aligned.<number>. The parent waits for all children.

# Unbuffered output: nothing pending is duplicated into the children and the
# command lines are printed before the commands run.
$| = 1;

my @childs;
my $forkError;
for (my $i = 0; $i < $numParallel; ++$i)
{
  my $pid = fork();

  if (!defined($pid))
  { # fork failed. Without this test undef compares equal to 0 and the
    # parent itself would run the child branch and exit.
    $forkError = "$!";
    last;
  }
  elsif ($pid == 0)
  { # child
    my $numStr = NumStr($i);
    my $sharedOpts = "-scripts-root-dir $scriptDir -f source -e target -alignment grow-diag-final-and -parts 3 -reordering msd-bidirectional-fe";

    my @steps = (
      "$trainCmd -dont-zip -last-step 1 $sharedOpts -corpus $TMPDIR/$numStr -corpus-dir $TMPDIR/prepared.$numStr \n",
      "$trainCmd -dont-zip -first-step 2 -last-step 2 $sharedOpts -corpus-dir $TMPDIR/prepared.$numStr -giza-e2f $TMPDIR/giza.$numStr -direction 2 \n",
      "$trainCmd -dont-zip -first-step 2 -last-step 2 $sharedOpts -corpus-dir $TMPDIR/prepared.$numStr -giza-f2e $TMPDIR/giza-inverse.$numStr -direction 1 \n",
      "$trainCmd -dont-zip -first-step 3 -last-step 3 $sharedOpts -giza-e2f $TMPDIR/giza.$numStr -giza-f2e $TMPDIR/giza-inverse.$numStr -alignment-file $TMPDIR/aligned.$numStr -alignment grow-diag-final-and \n",
    );

    foreach my $cmd (@steps)
    {
      print $cmd;
      # Backticks keep the standard output of the command off the console.
      `$cmd`;
      if ($? != 0)
      {
        # Every step reads what the previous one wrote, so stop at the first
        # failure and report it to the parent through the exit status.
        print STDERR "slice $numStr: command failed (status $?): $cmd";
        exit(1);
      }
    }

    exit(0);
  }
  else
  { # parent
    push(@childs, $pid);
  }
}

# wait until every child has finished
my $numFailed = 0;
foreach my $childPid (@childs)
{
  waitpid($childPid, 0);
  ++$numFailed if $? != 0;
}

# Stop here on any failure: concatenating an incomplete set of alignment
# files would give a result that no longer matches the corpus line by line.
die "fork failed: $forkError\n" if defined($forkError);
die "$numFailed of ".scalar(@childs)." child processes failed\n" if $numFailed > 0;

# cat all aligned files together. Voila
#
# The per-slice files $TMPDIR/aligned.<number>.grow-diag-final-and are
# concatenated in slice order into $align.
my $catCmd = "cat ";
foreach my $slice (0 .. $numParallel - 1)
{
  $catCmd .= "$TMPDIR/aligned.".NumStr($slice).".grow-diag-final-and ";
}
$catCmd .= " > $align \n";

print $catCmd;
`$catCmd`;

# cat exits with a non-zero status when a slice file is missing; $align is
# incomplete in that case, so do not finish as if everything went well.
die "could not build $align (status $?): $catCmd" if $? != 0;

# Returns the non-negative integer $i as a decimal string, padded on the
# left with zeros to at least five digits (7 -> "00007", 12345 -> "12345").
# Larger numbers are returned without padding. The width equals the suffix
# length given to the split command (-a 5).
#
# The ($) prototype is kept so the definition agrees with the forward
# declaration at the top of the file.
sub NumStr($)
{
  my $i = shift;
  return sprintf("%05d", $i);
}
