2015-04-30 14:35:15 +03:00
|
|
|
#!/usr/bin/env perl
|
|
|
|
|
|
|
|
use warnings;
|
|
|
|
use strict;
|
|
|
|
use File::Temp qw/tempfile/;
|
|
|
|
use Getopt::Long "GetOptions";
|
|
|
|
use File::Basename;
|
|
|
|
use FindBin qw($RealBin);
|
|
|
|
use Cwd 'abs_path';
|
|
|
|
|
|
|
|
# Forward declaration of GetFactors.
# NOTE(review): no definition of, or call to, GetFactors is visible in this
# part of the file -- confirm it is still needed.
sub GetFactors;
|
|
|
|
|
|
|
|
|
|
|
|
# Settings filled in from the command line (see the GetOptions call below).

# Options that carry a default value.
my $TMPDIR       = 'tmp';   # --tmpdir: base directory for the working files
my $KEEP_TMP     = 0;       # --keep-tmp: flag, off by default
my $USE_PARALLEL = 1;       # --use-parallel: non-zero selects the split + parallel path

# Options with no default; they stay undefined until given on the command line.
my ($MADA_DIR, $CONFIG, $SCHEME);

# --factors: raw comma-separated string and the list it is split into.
my $FACTORS_STR;
my @FACTORS;
|
|
|
|
|
|
|
|
# Parse the command line into the option variables declared above.
# GetOptions returns false when it meets an option it cannot handle.
unless (
    GetOptions(
        'tmpdir=s'       => \$TMPDIR,
        'keep-tmp'       => \$KEEP_TMP,
        'mada-dir=s'     => \$MADA_DIR,
        'factors=s'      => \$FACTORS_STR,
        'config=s'       => \$CONFIG,
        'scheme=s'       => \$SCHEME,
        'use-parallel=i' => \$USE_PARALLEL,
    )
  )
{
    die("ERROR: unknown options");
}
|
|
|
|
|
|
|
|
# Validate the required options and derive the remaining settings.

die("must have -scheme arg") unless defined($SCHEME);

# $MADA_DIR is interpolated into the default config path below and into every
# MADAMIRA command line later on, so refuse to continue without it instead of
# building paths and commands from an undefined value.
die("must have -mada-dir arg") unless defined($MADA_DIR);

# Fall back to the sample configuration inside the MADAMIRA directory.
if (!defined($CONFIG)) {
    $CONFIG = "$MADA_DIR/samples/sampleConfigFile.xml";
}

# Make the temp directory absolute: the MADAMIRA commands are run after a
# "cd $MADA_DIR", so a relative path would no longer point at the same place.
{
    # abs_path() may return undef when the path cannot be resolved (for
    # example when a parent directory does not exist).  Continuing with undef
    # would put the working directory at "/madamira.<pid>".
    my $resolved = abs_path($TMPDIR);
    die("cannot resolve tmpdir '$TMPDIR': $!") unless defined($resolved);
    $TMPDIR = $resolved;
}
print STDERR "TMPDIR=$TMPDIR \n";

# --factors is a comma-separated list.
if (defined($FACTORS_STR)) {
    @FACTORS = split(",", $FACTORS_STR);
}
|
|
|
|
|
|
|
|
#binmode(STDIN, ":utf8");
|
|
|
|
#binmode(STDOUT, ":utf8");
|
|
|
|
|
|
|
|
# All working files live in a subdirectory named after this process's PID ($$).
$TMPDIR = "$TMPDIR/madamira.$$";

# Create the working tree: the root, split/ for the input chunks and out/ for
# the MADAMIRA output.  The list form of system() passes the path to mkdir
# without going through a shell, and the exit status is checked so a failure
# is reported here rather than surfacing later as missing files.
for my $dir ($TMPDIR, "$TMPDIR/split", "$TMPDIR/out") {
    system("mkdir", "-p", $dir) == 0
        or die("mkdir -p '$dir' failed (status $?)");
}
|
|
|
|
|
|
|
|
# Copy everything on STDIN into the working input file, which is what the
# MADAMIRA commands further down read.
my $infile = "$TMPDIR/input";
print STDERR $infile."\n";

# Three-argument open with a lexical handle; every step is checked so an
# unwritable temp directory or a full disk stops the run immediately.
open(my $input_fh, '>', $infile)
    or die("cannot open '$infile' for writing: $!");
while (my $line = <STDIN>) {
    print {$input_fh} $line
        or die("cannot write to '$infile': $!");
}
# Buffered write errors only surface at close.
close($input_fh)
    or die("cannot close '$infile': $!");
|
|
|
|
|
|
|
|
# Run MADAMIRA over the input file and collect the tokenized text for the
# requested scheme in "$infile.mada".
my $cmd;

{
    # Run a command line through the shell with backticks, so the child's
    # stdout is captured instead of leaking into this script's own stdout,
    # and die when the command exits with a non-zero status.  Without this
    # check a failed split/java/cat would silently yield missing or partial
    # output.
    my $run_checked = sub {
        my ($command) = @_;
        my $captured = `$command`;
        die("command failed (wait status $?): $command") if $? != 0;
        return $captured;
    };

    # Java invocation shared by the parallel and the single-process paths.
    my $madamira_java =
        "java -Xmx2500m -Xms2500m -XX:NewRatio=3 -jar $MADA_DIR/MADAMIRA.jar";

    if (-z $infile) {
        # Empty input: there is nothing to tokenize.  Produce an empty result
        # file directly; with no input lines the split step would leave no
        # chunk files for the x* globs below to match.
        open(my $empty_fh, '>', "$infile.mada")
            or die("cannot create '$infile.mada': $!");
        close($empty_fh)
            or die("cannot close '$infile.mada': $!");
    }
    elsif ($USE_PARALLEL) {
        # split input file
        # Use gsplit when "gsplit --help" prints anything, otherwise split.
        # NOTE(review): presumably because -d (numeric suffixes) needs GNU
        # split, which some systems install as gsplit -- confirm.
        my $SPLIT_EXEC = `gsplit --help 2>/dev/null` ? 'gsplit' : 'split';

        # Chunks of 10000 lines with 7-character numeric suffixes: split/x0000000 ...
        $cmd = "$SPLIT_EXEC -l 10000 -a 7 -d $TMPDIR/input $TMPDIR/split/x";
        $run_checked->($cmd);

        # One MADAMIRA run per chunk, four at a time, via "parallel".
        $cmd = "cd $MADA_DIR && parallel --jobs 4 $madamira_java -rawinput {} -rawoutdir $TMPDIR/out -rawconfig $CONFIG ::: $TMPDIR/split/x*";
        print STDERR "Executing: $cmd\n";
        $run_checked->($cmd);

        # The shell expands x* in sorted order, and the fixed-width numeric
        # suffixes make that the original chunk order.
        $cmd = "cat $TMPDIR/out/x*.$SCHEME.tok > $infile.mada";
        print STDERR "Executing: $cmd\n";
        $run_checked->($cmd);
    }
    else {
        # Single MADAMIRA run over the whole input file.
        $cmd = "cd $MADA_DIR && $madamira_java -rawinput $infile -rawoutdir $TMPDIR/out -rawconfig $CONFIG";
        print STDERR "Executing: $cmd\n";
        $run_checked->($cmd);

        $cmd = "cat $TMPDIR/out/input.$SCHEME.tok > $infile.mada";
        print STDERR "Executing: $cmd\n";
        $run_checked->($cmd);
    }
}
|
2015-04-30 14:35:15 +03:00
|
|
|
|
|
|
|
# get stuff out of mada output
# Copy the collected MADAMIRA output to STDOUT line by line.  No I/O layer is
# set on the handle, so the bytes are passed through as they are.
# Three-argument open with a lexical handle; a missing or unreadable result
# file is now a fatal error instead of silently producing empty output.
open(my $mada_fh, '<', "$infile.mada")
    or die("cannot open '$infile.mada' for reading: $!");
while (my $line = <$mada_fh>) {
    # chomp + explicit "\n" guarantees a trailing newline on the last line.
    chomp($line);
    print "$line\n";
}
close($mada_fh);
|
|
|
|
|
|
|
|
|
|
|
|
# Temporary-directory cleanup.  The removal command is commented out, so the
# working directory is currently left in place whether or not --keep-tmp is set.
# NOTE(review): confirm whether the rm should be re-enabled.
unless ($KEEP_TMP) {
    # `rm -rf $TMPDIR`;
}
|
|
|
|
|