2011-06-21 20:20:22 +04:00
|
|
|
#! /usr/bin/perl
|
|
|
|
|
2011-06-21 20:47:22 +04:00
|
|
|
# example
|
2011-06-23 06:25:49 +04:00
|
|
|
# ./extract-parallel.perl 8 ./coreutils-8.9/src/split "./coreutils-8.9/src/sort --batch-size=253" ./extract ./corpus.5.en ./corpus.5.ar ./align.ar-en.grow-diag-final-and ./extracted 7 --NoFileLimit orientation
|
2011-06-21 20:47:22 +04:00
|
|
|
|
2011-06-21 20:20:22 +04:00
|
|
|
use strict;
|
2011-06-22 01:31:47 +04:00
|
|
|
use File::Basename;
|
2011-06-21 20:20:22 +04:00
|
|
|
|
|
|
|
sub NumStr($);
|
|
|
|
|
2011-06-22 01:31:47 +04:00
|
|
|
# Command-line layout:
#   ARGV[0]   number of parallel workers (and of pieces the inputs are split into)
#   ARGV[1]   split command
#   ARGV[2]   sort command (may carry its own options)
#   ARGV[3]   extract command
#   ARGV[4-7] target, source, alignment and output prefix: the four positional
#             arguments handed to the extract command, in that order
#   ARGV[8..] anything else is forwarded verbatim to the extract command
#
# Fail early on a short argument list: with missing arguments the later
# `wc -l` would be run without a file name and the split size computation
# would divide by an undefined worker count.
if (@ARGV < 8)
{
  die "usage: $0 num-parallel split-cmd sort-cmd extract-cmd target source align extract [extract-args ...]\n";
}

print "Started ".localtime() ."\n";

my $numParallel = $ARGV[0];

# The worker count is used as a divisor and as a loop bound, so it has to be
# a positive whole number.
unless ($numParallel =~ /^\d+$/ && $numParallel > 0)
{
  die "num-parallel must be a positive integer, got '$numParallel'\n";
}

my $splitCmd = $ARGV[1];
my $sortCmd = $ARGV[2];
my $extractCmd = $ARGV[3];

my $target = $ARGV[4]; # 1st argument of the extract command
my $source = $ARGV[5]; # 2nd argument of the extract command
my $align = $ARGV[6]; # 3rd argument of the extract command
my $extract = $ARGV[7]; # 4th argument of the extract command

# Every remaining argument is followed by a single space, so the result can
# be appended directly to a command line.
my $otherExtractArgs = join("", map { "$_ " } @ARGV[8 .. $#ARGV]);
|
2011-06-21 20:20:22 +04:00
|
|
|
|
2011-06-22 01:31:47 +04:00
|
|
|
# Scratch directory for the split inputs and the per-piece outputs. It is
# created next to the final output file and tagged with this process's PID.
my $TMPDIR=dirname($extract) ."/tmp.$$";
-d $TMPDIR or mkdir($TMPDIR) or die "Cannot create temporary directory $TMPDIR: $!\n";

# `wc -l FILE` prints the line count first (possibly padded), then the file
# name; take the first run of digits and make sure wc actually succeeded.
my ($totalLines) = `wc -l $align` =~ /(\d+)/;
die "Cannot count the lines of $align\n" unless $? == 0 && defined $totalLines;

# The +1 guarantees that the inputs are cut into at most $numParallel pieces.
my $linesPerSplit = int($totalLines / $numParallel) + 1;

print "total=$totalLines line-per-split=$linesPerSplit \n";

# Cut the three parallel files into pieces with identical line ranges.
# -d and -a 5 make the split command name the pieces with five-digit numeric
# suffixes (target.00000, target.00001, ...).
my $cmd;
foreach my $part ([$target, "target"], [$source, "source"], [$align, "align"])
{
  my ($file, $prefix) = @$part;

  $cmd = "$splitCmd -d -l $linesPerSplit -a 5 $file $TMPDIR/$prefix.";
  `$cmd`;
  die "Split command failed (status $?): $cmd\n" if $? != 0;
}
|
|
|
|
|
|
|
|
# run extract
# One child process is forked per piece. Each child runs the extract command
# on its piece and then sorts the three files it is expected to produce
# (extract.N, extract.N.inv, extract.N.o) into *.sorted files.
my $isParent = 1;
my @childs;
for (my $i = 0; $i < $numParallel; ++$i)
{
  my $pid = fork();

  # fork() returns undef on failure. Without this check undef would compare
  # equal to 0 and the parent would run the child branch and exit.
  if (!defined $pid)
  {
    my $forkError = $!;
    waitpid($_, 0) foreach @childs; # do not leave running workers behind
    die "Cannot fork worker for piece $i: $forkError\n";
  }

  if ($pid == 0)
  { # child
    $isParent = 0;
    my $numStr = NumStr($i);
    my $extractFailed = 0;

    my $cmd = "$extractCmd $TMPDIR/target.$numStr $TMPDIR/source.$numStr $TMPDIR/align.$numStr $TMPDIR/extract.$numStr $otherExtractArgs \n";
    print $cmd;
    `$cmd`;
    if ($? != 0)
    {
      warn "Extract command failed (status $?): $cmd";
      $extractFailed = 1;
    }

    # The sorts are run unconditionally: the shell redirection creates the
    # *.sorted file even when the input is missing, and the merge step
    # expects one *.sorted file per piece and kind.
    foreach my $suffix ("", ".inv", ".o")
    {
      $cmd = "LC_ALL=C $sortCmd -T $TMPDIR $TMPDIR/extract.$numStr$suffix > $TMPDIR/extract.$numStr$suffix.sorted \n";
      print $cmd;
      `$cmd`;
    }

    # Report an extract failure to the parent through the exit status.
    exit($extractFailed);
  }
  else
  { # parent
    push(@childs, $pid);
  }
}

# wait until every worker has finished
if ($isParent)
{
  foreach my $child (@childs)
  {
    waitpid($child, 0);
    warn "Extract worker $child ended with status $?\n" if $? != 0;
  }
}
else
{
  # Children always exit inside the loop above.
  die "shouldn't be here";
}
|
2011-06-21 20:20:22 +04:00
|
|
|
|
2011-06-21 20:38:15 +04:00
|
|
|
# merge
# Each kind of per-piece output (plain, .inv, .o) is already sorted, so a
# single `sort -m` per kind merges the pieces into the final
# $extract<suffix>.sorted file.
my @numStrs;
for (my $i = 0; $i < $numParallel; ++$i)
{
  push(@numStrs, NumStr($i));
}

my @mergeCmds;
foreach my $suffix ("", ".inv", ".o")
{
  my $pieces = join("", map { "$TMPDIR/extract.$_$suffix.sorted " } @numStrs);
  push(@mergeCmds, "LC_ALL=C $sortCmd -m $pieces> $extract$suffix.sorted \n");
}

# Echo all three commands before running any of them.
print $_ foreach @mergeCmds;

foreach my $mergeCmd (@mergeCmds)
{
  `$mergeCmd`;
  # A failed merge leaves a truncated result file; stop here so the problem
  # is noticed and the temporary directory is kept for inspection.
  die "Merge failed (status $?): $mergeCmd" if $? != 0;
}
|
2011-06-21 20:20:22 +04:00
|
|
|
|
2011-06-22 01:34:18 +04:00
|
|
|
# Throw away the scratch directory (echoing the command first), then log the
# time at which the whole run ended.
$cmd = sprintf("rm -rf %s \n", $TMPDIR);
print $cmd;
qx{$cmd};

my $finishedAt = localtime();
print "Finished $finishedAt\n";
|
|
|
|
|
|
|
|
|
2011-06-21 20:20:22 +04:00
|
|
|
# NumStr($i)
#
# Returns the piece index $i as a decimal string, zero-padded on the left to
# at least five digits (0 -> "00000", 42 -> "00042"). Values with more than
# five digits are returned unpadded. The width matches the `-a 5` suffix
# length given to the split command, so the result can be used to address
# the split pieces.
#
# The ($) prototype is kept so the definition agrees with the forward
# declaration near the top of the file.
sub NumStr($)
{
  my $i = shift;
  return sprintf("%05d", $i);
}
|
|
|
|
|