use pigz rather than gzip if it exists

This commit is contained in:
Hieu Hoang 2015-01-13 15:16:22 +00:00
parent 542a65a16e
commit 90d4b2d713
3 changed files with 45 additions and 18 deletions

View File

@ -32,6 +32,15 @@ my $glueFile;
my $phraseOrientation = 0;
my $phraseOrientationPriorsFile;
# Pick the compressor used for the gzip-format streams this script writes.
# pigz (a parallel, gzip-compatible compressor) is preferred when an
# executable file named "pigz" exists in one of the $PATH directories;
# otherwise plain gzip is used.  Searching $PATH, rather than testing one
# fixed location, also finds pigz installed outside /usr/bin, and the -x
# test avoids selecting a file the shell could not actually run.
my $GZIP_EXEC = 'gzip';
foreach my $dir (split(/:/, defined($ENV{PATH}) ? $ENV{PATH} : '')) {
  # An empty $PATH element means the current directory.
  my $candidate = ($dir eq '' ? '.' : $dir) . '/pigz';
  if (-f $candidate && -x $candidate) {
    $GZIP_EXEC = 'pigz';
    last;
  }
}
print STDERR "using $GZIP_EXEC \n";
for (my $i = 8; $i < $#ARGV + 1; ++$i)
{
$makeTTable = 0 if $ARGV[$i] eq "--NoTTable";
@ -178,11 +187,11 @@ if (defined($baselineExtract)) {
$catOCmd .= "$baselineExtract.o$sorted.gz ";
}
$catCmd .= " | LC_ALL=C $sortCmd -T $TMPDIR 2>> /dev/stderr | gzip -c > $extract.sorted.gz 2>> /dev/stderr \n";
$catInvCmd .= " | LC_ALL=C $sortCmd -T $TMPDIR 2>> /dev/stderr | gzip -c > $extract.inv.sorted.gz 2>> /dev/stderr \n";
$catOCmd .= " | LC_ALL=C $sortCmd -T $TMPDIR 2>> /dev/stderr | gzip -c > $extract.o.sorted.gz 2>> /dev/stderr \n";
$catContextCmd .= " | LC_ALL=C $sortCmd -T $TMPDIR 2>> /dev/stderr | uniq | gzip -c > $extract.context.sorted.gz 2>> /dev/stderr \n";
$catContextInvCmd .= " | LC_ALL=C $sortCmd -T $TMPDIR 2>> /dev/stderr | uniq | gzip -c > $extract.context.inv.sorted.gz 2>> /dev/stderr \n";
$catCmd .= " | LC_ALL=C $sortCmd -T $TMPDIR 2>> /dev/stderr | $GZIP_EXEC -c > $extract.sorted.gz 2>> /dev/stderr \n";
$catInvCmd .= " | LC_ALL=C $sortCmd -T $TMPDIR 2>> /dev/stderr | $GZIP_EXEC -c > $extract.inv.sorted.gz 2>> /dev/stderr \n";
$catOCmd .= " | LC_ALL=C $sortCmd -T $TMPDIR 2>> /dev/stderr | $GZIP_EXEC -c > $extract.o.sorted.gz 2>> /dev/stderr \n";
$catContextCmd .= " | LC_ALL=C $sortCmd -T $TMPDIR 2>> /dev/stderr | uniq | $GZIP_EXEC -c > $extract.context.sorted.gz 2>> /dev/stderr \n";
$catContextInvCmd .= " | LC_ALL=C $sortCmd -T $TMPDIR 2>> /dev/stderr | uniq | $GZIP_EXEC -c > $extract.context.inv.sorted.gz 2>> /dev/stderr \n";
@children = ();

View File

@ -13,6 +13,15 @@ sub GetSourcePhrase($);
sub NumStr($);
sub CutContextFile($$$);
# Pick the compressor used for the gzip-format files this script writes.
# pigz (a parallel, gzip-compatible compressor) is preferred when an
# executable file named "pigz" exists in one of the $PATH directories;
# otherwise plain gzip is used.  Searching $PATH, rather than testing one
# fixed location, also finds pigz installed outside /usr/bin, and the -x
# test avoids selecting a file the shell could not actually run.
my $GZIP_EXEC = 'gzip';
foreach my $dir (split(/:/, defined($ENV{PATH}) ? $ENV{PATH} : '')) {
  # An empty $PATH element means the current directory.
  my $candidate = ($dir eq '' ? '.' : $dir) . '/pigz';
  if (-f $candidate && -x $candidate) {
    $GZIP_EXEC = 'pigz';
    last;
  }
}
print STDERR "using $GZIP_EXEC \n";
#my $EXTRACT_SPLIT_LINES = 5000000;
my $EXTRACT_SPLIT_LINES = 50000000;
@ -100,7 +109,7 @@ else
}
my $filePath = "$TMPDIR/extract.$fileCount.gz";
open (OUT, "| gzip -c > $filePath") or die "error starting gzip $!";
open (OUT, "| $GZIP_EXEC -c > $filePath") or die "error starting $GZIP_EXEC $!";
my $lineCount = 0;
my $line;
@ -133,7 +142,7 @@ else
++$fileCount;
my $filePath = $fileCount;
$filePath = "$TMPDIR/extract.$filePath.gz";
open (OUT, "| gzip -c > $filePath") or die "error starting gzip $!";
open (OUT, "| $GZIP_EXEC -c > $filePath") or die "error starting $GZIP_EXEC $!";
}
}
else
@ -175,7 +184,7 @@ for (my $i = 0; $i < $fileCount; ++$i)
$cmd .= "zcat $TMPDIR/phrase-table.half.$numStr.gz | $FlexibilityCmd $TMPDIR/extract.context.$i.gz";
$cmd .= " --Inverse" if ($otherExtractArgs =~ /--Inverse/);
$cmd .= " --Hierarchical" if ($otherExtractArgs =~ /--Hierarchical/);
$cmd .= " | gzip -c > $TMPDIR/phrase-table.half.$numStr.flex.gz\n";
$cmd .= " | $GZIP_EXEC -c > $TMPDIR/phrase-table.half.$numStr.flex.gz\n";
$cmd .= "mv $TMPDIR/phrase-table.half.$numStr.flex.gz $TMPDIR/phrase-table.half.$numStr.gz\n";
}
@ -219,7 +228,7 @@ else
$cmd .= "| LC_ALL=C $sortCmd -T $TMPDIR ";
}
$cmd .= " | gzip -c > $ptHalf 2>> /dev/stderr ";
$cmd .= " | $GZIP_EXEC -c > $ptHalf 2>> /dev/stderr ";
}
print STDERR $cmd;
systemCheck($cmd);
@ -356,7 +365,7 @@ sub CutContextFile($$$)
my $sourcePhrase;
my $filePath = "$TMPDIR/extract.context.$fileCount.gz";
open (OUT_CONTEXT, "| gzip -c > $filePath") or die "error starting gzip $!";
open (OUT_CONTEXT, "| $GZIP_EXEC -c > $filePath") or die "error starting $GZIP_EXEC $!";
if ($lastline ne "") {
print OUT_CONTEXT "$lastline\n";

View File

@ -306,6 +306,15 @@ else {
$SORT_EXEC = 'sort';
}
# Pick the program used wherever this script compresses gzip-format data.
# pigz (a parallel, gzip-compatible compressor) is preferred when an
# executable file named "pigz" exists in one of the $PATH directories;
# otherwise plain gzip is used.  Searching $PATH, rather than testing one
# fixed location, also finds pigz installed outside /usr/bin, and the -x
# test avoids selecting a file the shell could not actually run.
my $GZIP_EXEC = 'gzip';
foreach my $dir (split(/:/, defined($ENV{PATH}) ? $ENV{PATH} : '')) {
  # An empty $PATH element means the current directory.
  my $candidate = ($dir eq '' ? '.' : $dir) . '/pigz';
  if (-f $candidate && -x $candidate) {
    $GZIP_EXEC = 'pigz';
    last;
  }
}
print STDERR "using $GZIP_EXEC \n";
my $__SORT_BUFFER_SIZE = "";
$__SORT_BUFFER_SIZE = "-S $_SORT_BUFFER_SIZE" if $_SORT_BUFFER_SIZE;
@ -353,7 +362,7 @@ my $PHRASE_CONSOLIDATE = "$SCRIPTS_ROOTDIR/../bin/consolidate";
my $FLEX_SCORER = "$SCRIPTS_ROOTDIR/training/flexibility_score.py";
# utilities
my $ZCAT = "gzip -cd";
my $ZCAT = "$GZIP_EXEC -cd";
my $BZCAT = "bzcat";
# do a sanity check to make sure we can find the necessary binaries since
@ -1209,7 +1218,7 @@ sub run_single_giza {
die "ERROR: Giza did not produce the output file $dir/$f-$e.$___GIZA_EXTENSION. Is your corpus clean (reasonably-sized sentences)?"
if ! -e "$dir/$f-$e.$___GIZA_EXTENSION";
safesystem("rm -f $dir/$f-$e.$___GIZA_EXTENSION.gz") or die;
safesystem("gzip $dir/$f-$e.$___GIZA_EXTENSION") or die;
safesystem("$GZIP_EXEC $dir/$f-$e.$___GIZA_EXTENSION") or die;
}
sub run_single_snt2cooc {
@ -1486,9 +1495,9 @@ sub extract_phrase {
if (defined($_BASELINE_EXTRACT) && $PHRASE_EXTRACT !~ /extract-parallel.perl/) {
print STDERR "merging with baseline extract from $_BASELINE_EXTRACT\n";
safesystem("$ZCAT $_BASELINE_EXTRACT.gz $extract_file$suffix.gz | gzip > $extract_file.gz");
safesystem("$ZCAT $_BASELINE_EXTRACT.inv.gz $extract_file$suffix.inv.gz | gzip > $extract_file.inv.gz");
safesystem("$ZCAT $_BASELINE_EXTRACT.o.gz $extract_file$suffix.o.gz | gzip > $extract_file.o.gz")
safesystem("$ZCAT $_BASELINE_EXTRACT.gz $extract_file$suffix.gz | $GZIP_EXEC > $extract_file.gz");
safesystem("$ZCAT $_BASELINE_EXTRACT.inv.gz $extract_file$suffix.inv.gz | $GZIP_EXEC > $extract_file.inv.gz");
safesystem("$ZCAT $_BASELINE_EXTRACT.o.gz $extract_file$suffix.o.gz | $GZIP_EXEC > $extract_file.o.gz")
if -e "$extract_file$suffix.o.gz";
safesystem("rm $extract_file$suffix.gz");
safesystem("rm $extract_file$suffix.inv.gz");
@ -1681,7 +1690,7 @@ sub score_phrase_phrase_extract {
$cmd .= " --KneserNey $ttable_file.half.f2e.gz.coc" if $KNESER_NEY;
$cmd .= " --SourceLabels $_GHKM_SOURCE_LABELS_FILE" if $_GHKM_SOURCE_LABELS && defined($_GHKM_SOURCE_LABELS_FILE);
$cmd .= " | gzip -c > $ttable_file.gz";
$cmd .= " | $GZIP_EXEC -c > $ttable_file.gz";
safesystem($cmd) or die "ERROR: Consolidating the two phrase table halves failed";
if (! $debug) { safesystem("rm -f $ttable_file.half.*") or die("ERROR"); }
@ -1698,7 +1707,7 @@ sub score_phrase_memscore {
# The output is sorted to avoid breaking scripts that rely on the
# sorting behaviour of the previous scoring algorithm.
my $cmd = "$MEMSCORE $options | LC_ALL=C sort $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE -T $___TEMP_DIR | gzip >$ttable_file.gz";
my $cmd = "$MEMSCORE $options | LC_ALL=C sort $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE -T $___TEMP_DIR | $GZIP_EXEC >$ttable_file.gz";
if (-e "$extract_file.gz") {
$cmd = "$ZCAT $extract_file.gz | ".$cmd;
} else {
@ -1863,7 +1872,7 @@ sub get_generation {
}
close(GEN);
safesystem("rm -f $file.gz") or die("ERROR");
safesystem("gzip $file") or die("ERROR");
safesystem("$GZIP_EXEC $file") or die("ERROR");
}
### (9) CREATE CONFIGURATION FILE