mosesdecoder/scripts/recaser/train-recaser.perl
Jeroen Vermeulen ef028446f3 Add license notices to scripts.
This is not pleasant to read (and much, much less pleasant to write!) but
sort of necessary in an open project.  Right now it's quite hard to figure
out what is licensed how, which doesn't matter much to most people but can
suddenly become very important when people want to know what they're being
allowed to do.

I kept the notices as short as I could.  As far as I could see, everything
without a clear license notice is LGPL v2.1 or later.
2015-05-29 18:30:26 +07:00

224 lines
7.7 KiB
Perl
Executable File

#!/usr/bin/env perl
#
# This file is part of moses. Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.
# $Id$
use warnings;
use strict;
use FindBin qw($Bin);
use Getopt::Long "GetOptions";
binmode(STDIN, ":utf8");
binmode(STDOUT, ":utf8");
# apply switches
my ($DIR,$CORPUS,$SCRIPTS_ROOT_DIR,$CONFIG,$HELP,$ERROR);
my $LM = "KENLM"; # KENLM is default.
my $BUILD_LM = "build-lm.sh";
my $BUILD_KENLM = "$Bin/../../bin/lmplz";
my $BUILD_BINARY = "$Bin/../../bin/build_binary";
my $EXTRACT = "$Bin/../../bin/extract";
my $SCORE = "$Bin/../../bin/score";
my $CONSOLIDATE_DIRECT = "$Bin/../../bin/consolidate-direct";
my $NGRAM_COUNT = "ngram-count";
my $TRAIN_SCRIPT = "$Bin/../training/train-model.perl";
my $MAX_LEN = 1;
my $FIRST_STEP = 1;
my $LAST_STEP = 11;
$ERROR = "training Aborted."
unless &GetOptions('first-step=i' => \$FIRST_STEP,
'last-step=i' => \$LAST_STEP,
'corpus=s' => \$CORPUS,
'config=s' => \$CONFIG,
'dir=s' => \$DIR,
'ngram-count=s' => \$NGRAM_COUNT,
'build-lm=s' => \$BUILD_LM,
'build-kenlm=s' => \$BUILD_KENLM,
'lm=s' => \$LM,
'train-script=s' => \$TRAIN_SCRIPT,
'scripts-root-dir=s' => \$SCRIPTS_ROOT_DIR,
'max-len=i' => \$MAX_LEN,
'help' => \$HELP);
# check and set default to unset parameters
$ERROR = "please specify working dir --dir" unless defined($DIR) || defined($HELP);
$ERROR = "please specify --corpus" if !defined($CORPUS) && !defined($HELP)
&& $FIRST_STEP <= 2 && $LAST_STEP >= 1;
if ($HELP || $ERROR) {
if ($ERROR) {
print STDERR "ERROR: " . $ERROR . "\n";
}
print STDERR "Usage: $0 --dir /output/recaser --corpus /Cased/corpus/files [options ...]";
print STDERR "\n\nOptions:
== MANDATORY ==
--dir=dir ... outputted recaser directory.
--corpus=file ... inputted cased corpus.
== OPTIONAL ==
= Recaser Training configuration =
--train-script=file ... path to the train script (default: train-factored-phrase-model.perl in \$PATH).
--config=config ... training script configuration.
--scripts-root-dir=dir ... scripts directory.
--max-len=int ... max phrase length (default: 1).
= Language Model Training configuration =
--lm=[IRSTLM,SRILM,KENLM] ... language model (default: KENLM).
--build-lm=file ... path to build-lm.sh if not in \$PATH (used only with --lm=IRSTLM).
--ngram-count=file ... path to ngram-count.sh if not in \$PATH (used only with --lm=SRILM).
= Steps this script will perform =
(1) Truecasing;
(2) Language Model Training;
(3) Data Preparation
(4-10) Recaser Model Training;
(11) Cleanup.
--first-step=[1-11] ... step where script starts (default: 1).
--last-step=[1-11] ... step where script ends (default: 11).
--help ... this usage output.\n";
if ($ERROR) {
exit(1);
}
else {
exit(0);
}
}
# main loop
`mkdir -p $DIR`;
&truecase() if $FIRST_STEP == 1;
$CORPUS = "$DIR/aligned.truecased" if (-e "$DIR/aligned.truecased");
&train_lm() if $FIRST_STEP <= 2;
&prepare_data() if $FIRST_STEP <= 3 && $LAST_STEP >= 3;
&train_recase_model() if $FIRST_STEP <= 10 && $LAST_STEP >= 3;
&cleanup() if $LAST_STEP == 11;
exit(0);
### subs ###
sub truecase {
print STDERR "(1) Truecase data @ ".`date`;
print STDERR "(1) To build model without truecasing, use --first-step 2, and make sure $DIR/aligned.truecased does not exist\n";
my $cmd = "$Bin/train-truecaser.perl --model $DIR/truecaser_model --corpus $CORPUS";
print STDERR $cmd."\n";
system($cmd) == 0 || die("Training truecaser died with error " . ($? >> 8) . "\n");
$cmd = "$Bin/truecase.perl --model $DIR/truecaser_model < $CORPUS > $DIR/aligned.truecased";
print STDERR $cmd."\n";
system($cmd) == 0 || die("Applying truecaser died with error " . ($? >> 8) . "\n");
}
sub train_lm {
print STDERR "(2) Train language model on cased data @ ".`date`;
my $cmd = "";
if (uc $LM eq "IRSTLM") {
$cmd = "$BUILD_LM -t /tmp -i $CORPUS -n 3 -o $DIR/cased.irstlm.gz";
}
elsif (uc $LM eq "SRILM") {
$LM = "SRILM";
$cmd = "$NGRAM_COUNT -text $CORPUS -lm $DIR/cased.srilm.gz -interpolate -kndiscount";
}
else {
$LM = "KENLM";
$cmd = "$BUILD_KENLM --prune 0 0 1 -S 5% -T $DIR/lmtmp --order 3 --text $CORPUS --arpa $DIR/cased.kenlm.arpa.gz";
}
print STDERR "** Using $LM **" . "\n";
print STDERR $cmd."\n";
system($cmd) == 0 || die("Language model training failed with error " . ($? >> 8) . "\n");
if ($LM eq "KENLM") {
system("$BUILD_BINARY $DIR/cased.kenlm.arpa.gz $DIR/cased.kenlm ; rm $DIR/cased.kenlm.arpa.gz");
}
}
sub prepare_data {
print STDERR "\n(3) Preparing data for training recasing model @ ".`date`;
open(CORPUS,$CORPUS);
binmode(CORPUS, ":utf8");
open(CASED,">$DIR/aligned.cased");
binmode(CASED, ":utf8");
print "$DIR/aligned.lowercased\n";
open(LOWERCASED,">$DIR/aligned.lowercased");
binmode(LOWERCASED, ":utf8");
open(ALIGNMENT,">$DIR/aligned.a");
while(<CORPUS>) {
next if length($_)>2000;
s/\x{0}//g;
s/\|//g;
s/ +/ /g;
s/^ //;
s/ [\r\n]*$/\n/;
next if /^$/;
print CASED $_;
print LOWERCASED lc($_);
my $i=0;
foreach (split) {
print ALIGNMENT "$i-$i ";
$i++;
}
print ALIGNMENT "\n";
}
close(CORPUS);
close(CASED);
close(LOWERCASED);
close(ALIGNMENT);
}
sub train_recase_model {
print STDERR "\n(4) Training recasing model @ ".`date`;
my $first = $FIRST_STEP;
$first = 4 if $first < 4;
if ($MAX_LEN == 1) {
my $cmd = "$EXTRACT $DIR/aligned.cased $DIR/aligned.lowercased $DIR/aligned.a $DIR/extract 1";
system($cmd) == 0 || die("ERROR: extract (special case max-len 1) failed: $cmd");
$cmd = "sort -S 2G $DIR/extract > $DIR/extract.sorted";
system($cmd) == 0 || die("ERROR: sort extract (special case max-len 1) failed: $cmd");
$cmd = "$SCORE $DIR/extract.sorted /dev/null $DIR/phrase-table-half --NoLex";
system($cmd) == 0 || die("ERROR: score (special case max-len 1) failed: $cmd");
$cmd = "$CONSOLIDATE_DIRECT $DIR/phrase-table-half $DIR/phrase-table";
system($cmd) == 0 || die("ERROR: consolidate-direct (special case max-len 1) failed: $cmd");
system("rm $DIR/phrase-table-half");
system("gzip $DIR/phrase-table");
$first = 9;
}
my $cmd = "$TRAIN_SCRIPT --root-dir $DIR --model-dir $DIR --first-step $first --alignment a --corpus $DIR/aligned --f lowercased --e cased --max-phrase-length $MAX_LEN";
if ($MAX_LEN == 1) {
$cmd .= " --score-options='--NoLex --OnlyDirect'";
}
else {
$cmd .= " --score-options='--OnlyDirect'";
}
if (uc $LM eq "IRSTLM") {
$cmd .= " --lm 0:3:$DIR/cased.irstlm.gz:1";
}
elsif (uc $LM eq "SRILM") {
$cmd .= " --lm 0:3:$DIR/cased.srilm.gz:8";
}
else {
$cmd .= " --lm 0:3:$DIR/cased.kenlm:8";
}
$cmd .= " -config $CONFIG" if $CONFIG;
print STDERR $cmd."\n";
system($cmd) == 0 || die("Recaser model training failed with error " . ($? >> 8) . "\n");
}
sub cleanup {
print STDERR "\n(11) Cleaning up @ ".`date`;
`rm -f $DIR/extract*`;
my $clean_1 = $?;
`rm -f $DIR/aligned*`;
my $clean_2 = $?;
`rm -f $DIR/lex*`;
my $clean_3 = $?;
`rm -f $DIR/truecaser_model`;
my $clean_4 = $?;
if ($clean_1 + $clean_2 + $clean_3 + $clean_4 != 0) {
print STDERR "Training successful but some files could not be cleaned.\n";
}
}