2015-04-02 16:38:56 +03:00
|
|
|
#!/usr/bin/env perl
|
2007-02-13 22:22:35 +03:00
|
|
|
|
2010-10-11 15:32:27 +04:00
|
|
|
# $Id$
|
2007-02-13 22:22:35 +03:00
|
|
|
use strict;
|
2012-07-11 18:27:00 +04:00
|
|
|
use FindBin qw($Bin);
|
2007-02-13 22:22:35 +03:00
|
|
|
use Getopt::Long "GetOptions";
|
|
|
|
|
|
|
|
# Apply the :utf8 layer to both standard streams used by this script.
for my $stream (\*STDIN, \*STDOUT) {
    binmode($stream, ":utf8");
}
|
|
|
|
|
|
|
|
# apply switches
#
# File-level settings. Everything listed in @switches below can be
# overridden from the command line; the remaining tool paths are fixed
# relative to the directory containing this script ($Bin).

# Values with no default: they stay undefined unless given on the command line.
my ($DIR, $CORPUS, $SCRIPTS_ROOT_DIR, $CONFIG, $HELP, $ERROR);

# Language model toolkit selection and the trainer used for each toolkit.
my $LM          = "KENLM"; # KENLM is default.
my $BUILD_KENLM = "$Bin/../../bin/lmplz";
my $BUILD_LM    = "build-lm.sh";
my $NGRAM_COUNT = "ngram-count";

# Helper binaries with fixed locations (no command-line switch).
my $BUILD_BINARY       = "$Bin/../../bin/build_binary";
my $EXTRACT            = "$Bin/../../bin/extract";
my $SCORE              = "$Bin/../../bin/score";
my $CONSOLIDATE_DIRECT = "$Bin/../../bin/consolidate-direct";

# Recaser model training script, phrase length limit and step range.
my $TRAIN_SCRIPT = "$Bin/../training/train-model.perl";
my $MAX_LEN      = 1;
my $FIRST_STEP   = 1;
my $LAST_STEP    = 11;

# Getopt::Long specification => variable that receives the value.
my @switches = (
    'first-step=i'       => \$FIRST_STEP,
    'last-step=i'        => \$LAST_STEP,
    'corpus=s'           => \$CORPUS,
    'config=s'           => \$CONFIG,
    'dir=s'              => \$DIR,
    'ngram-count=s'      => \$NGRAM_COUNT,
    'build-lm=s'         => \$BUILD_LM,
    'build-kenlm=s'      => \$BUILD_KENLM,
    'lm=s'               => \$LM,
    'train-script=s'     => \$TRAIN_SCRIPT,
    'scripts-root-dir=s' => \$SCRIPTS_ROOT_DIR,
    'max-len=i'          => \$MAX_LEN,
    'help'               => \$HELP,
);

# A false return from GetOptions means the command line could not be parsed.
if (!GetOptions(@switches)) {
    $ERROR = "training Aborted.";
}
|
2007-02-13 22:22:35 +03:00
|
|
|
|
|
|
|
# check and set default to unset parameters
#
# Each failing check overwrites $ERROR, so only the message of the last
# failing check is reported. Nothing is required when --help was given.
my $help_requested = defined($HELP);

if (!defined($DIR) && !$help_requested) {
    $ERROR = "please specify working dir --dir";
}

# The corpus is only demanded when the selected step range starts at step 1 or 2.
if (   !defined($CORPUS)
    && !$help_requested
    && $FIRST_STEP <= 2
    && $LAST_STEP >= 1)
{
    $ERROR = "please specify --corpus";
}
|
|
|
|
|
2011-11-25 21:21:55 +04:00
|
|
|
# Print the usage text and stop when --help was requested or an option
# error was recorded in $ERROR. Exits with status 1 on error, 0 for --help.
#
# The help text is kept in sync with the option table and defaults:
#   * --train-script defaults to train-model.perl next to this script,
#   * --build-kenlm is accepted and is therefore listed,
#   * the SRILM tool is called ngram-count (there is no ".sh" suffix).
if ($HELP || $ERROR) {
    if ($ERROR) {
        print STDERR "ERROR: " . $ERROR . "\n";
    }

    print STDERR <<"END_USAGE";
Usage: $0 --dir /output/recaser --corpus /Cased/corpus/files [options ...]

Options:
== MANDATORY ==
--dir=dir ... outputted recaser directory.
--corpus=file ... inputted cased corpus.

== OPTIONAL ==
= Recaser Training configuration =
--train-script=file ... path to the train script (default: $Bin/../training/train-model.perl).
--config=config ... training script configuration.
--scripts-root-dir=dir ... scripts directory.
--max-len=int ... max phrase length (default: 1).

= Language Model Training configuration =
--lm=[IRSTLM,SRILM,KENLM] ... language model (default: KENLM).
--build-lm=file ... path to build-lm.sh if not in \$PATH (used only with --lm=IRSTLM).
--ngram-count=file ... path to ngram-count if not in \$PATH (used only with --lm=SRILM).
--build-kenlm=file ... path to lmplz (default: $Bin/../../bin/lmplz; used only with --lm=KENLM).

= Steps this script will perform =
(1) Truecasing;
(2) Language Model Training;
(3) Data Preparation
(4-10) Recaser Model Training;
(11) Cleanup.
--first-step=[1-11] ... step where script starts (default: 1).
--last-step=[1-11] ... step where script ends (default: 11).

--help ... this usage output.
END_USAGE

    exit($ERROR ? 1 : 0);
}
|
|
|
|
|
2007-02-13 22:22:35 +03:00
|
|
|
# main loop
#
# Runs the requested range of steps in order:
#   (1) truecasing, (2) language model training, (3) data preparation,
#   (4-10) recaser model training, (11) cleanup.
# A step runs only if it lies between --first-step and --last-step.

# Create the working directory and stop early if that is not possible,
# since every later step writes into it.
system("mkdir -p $DIR") == 0
    || die("Could not create working directory $DIR (error " . ($? >> 8) . ")\n");

truecase() if $FIRST_STEP == 1;

# Use the truecased corpus whenever one exists in the working directory,
# including one left over from a previous run.
$CORPUS = "$DIR/aligned.truecased" if (-e "$DIR/aligned.truecased");

# Language model training is step 2, so it is skipped with --last-step 1.
train_lm() if $FIRST_STEP <= 2 && $LAST_STEP >= 2;

prepare_data() if $FIRST_STEP <= 3 && $LAST_STEP >= 3;

# Model training covers steps 4-10, so --last-step 3 stops after data preparation.
train_recase_model() if $FIRST_STEP <= 10 && $LAST_STEP >= 4;

cleanup() if $LAST_STEP == 11;

exit(0);
|
|
|
|
|
2007-02-13 22:22:35 +03:00
|
|
|
### subs ###
|
|
|
|
|
|
|
|
# Step 1: train a truecaser model on the corpus, then apply it.
#
# Reads the file-level $Bin, $DIR and $CORPUS. Writes the model to
# $DIR/truecaser_model and the truecased text to $DIR/aligned.truecased.
# Each command is echoed to STDERR before it runs; dies with the command's
# exit code if either command fails.
sub truecase {
    my $started_at = `date`;
    print STDERR "(1) Truecase data @ " . $started_at;
    print STDERR "(1) To build model without truecasing, use --first-step 2, and make sure $DIR/aligned.truecased does not exist\n";

    # [ label used in the failure message, shell command ]
    my @stages = (
        [ "Training truecaser",
          "$Bin/train-truecaser.perl --model $DIR/truecaser_model --corpus $CORPUS" ],
        [ "Applying truecaser",
          "$Bin/truecase.perl --model $DIR/truecaser_model < $CORPUS > $DIR/aligned.truecased" ],
    );

    for my $stage (@stages) {
        my ($label, $command) = @$stage;
        print STDERR $command . "\n";
        if (system($command) != 0) {
            die("$label died with error " . ($? >> 8) . "\n");
        }
    }
}
|
|
|
|
|
|
|
|
# Step 2: train a 3-gram language model on the cased corpus.
#
# The toolkit is chosen by the file-level $LM (case-insensitive):
#   IRSTLM -> $BUILD_LM,    output $DIR/cased.irstlm.gz
#   SRILM  -> $NGRAM_COUNT, output $DIR/cased.srilm.gz
#   other  -> $BUILD_KENLM (KenLM), output $DIR/cased.kenlm (binarized)
# $LM is normalized to "SRILM" or "KENLM" in those two branches.
#
# Dies if training fails. For KenLM it also dies if binarization fails;
# the intermediate ARPA file is removed only after binarization succeeded,
# so a failed run keeps the file for inspection.
sub train_lm {
    print STDERR "(2) Train language model on cased data @ ".`date`;

    my $cmd = "";
    if (uc $LM eq "IRSTLM") {
        $cmd = "$BUILD_LM -t /tmp -i $CORPUS -n 3 -o $DIR/cased.irstlm.gz";
    }
    elsif (uc $LM eq "SRILM") {
        $LM = "SRILM";
        $cmd = "$NGRAM_COUNT -text $CORPUS -lm $DIR/cased.srilm.gz -interpolate -kndiscount";
    }
    else {
        # Any unrecognized value falls back to KenLM.
        $LM = "KENLM";
        $cmd = "$BUILD_KENLM --prune 0 0 1 -S 5% -T $DIR/lmtmp --order 3 --text $CORPUS --arpa $DIR/cased.kenlm.arpa.gz";
    }

    print STDERR "** Using $LM **" . "\n";
    print STDERR $cmd."\n";
    system($cmd) == 0 || die("Language model training failed with error " . ($? >> 8) . "\n");

    if ($LM eq "KENLM") {
        my $arpa = "$DIR/cased.kenlm.arpa.gz";
        my $binarize = "$BUILD_BINARY $arpa $DIR/cased.kenlm";
        print STDERR $binarize."\n";
        system($binarize) == 0
            || die("Binarizing KenLM language model failed with error " . ($? >> 8) . "\n");

        # The ARPA file is only an intermediate; failing to delete it is not fatal.
        system("rm $arpa") == 0
            || print STDERR "WARNING: could not remove $arpa\n";
    }
}
|
|
|
|
|
|
|
|
# Step 3: turn the cased corpus into a trivially aligned parallel corpus.
#
# Reads the file-level $CORPUS and writes three line-parallel files in $DIR:
#   aligned.cased      - cleaned input line
#   aligned.lowercased - the same line, lowercased
#   aligned.a          - one "i-i" pair per token (0-0 1-1 ...)
#
# Cleaning per line: lines longer than 2000 characters are skipped; NUL and
# "|" characters are removed; runs of spaces are collapsed; a leading space
# and a trailing space are stripped; lines that end up empty are skipped.
#
# Dies if any file cannot be opened or if an output file cannot be closed
# (buffered write errors surface at close).
sub prepare_data {
    print STDERR "\n(3) Preparing data for training recasing model @ ".`date`;

    open(my $corpus_fh, "<:utf8", $CORPUS)
        || die("Cannot read corpus $CORPUS: $!\n");
    open(my $cased_fh, ">:utf8", "$DIR/aligned.cased")
        || die("Cannot write $DIR/aligned.cased: $!\n");

    # Kept from the original script: the lowercased file name goes to STDOUT.
    print "$DIR/aligned.lowercased\n";

    open(my $lower_fh, ">:utf8", "$DIR/aligned.lowercased")
        || die("Cannot write $DIR/aligned.lowercased: $!\n");
    open(my $align_fh, ">", "$DIR/aligned.a")
        || die("Cannot write $DIR/aligned.a: $!\n");

    while (my $line = <$corpus_fh>) {
        next if length($line) > 2000;

        $line =~ s/\x{0}//g;
        $line =~ s/\|//g;
        $line =~ s/ +/ /g;
        $line =~ s/^ //;
        $line =~ s/ [\r\n]*$/\n/;
        next if $line =~ /^$/;

        print {$cased_fh} $line;
        print {$lower_fh} lc($line);

        # Token i of the lowercased side aligns with token i of the cased side.
        my @tokens = split(' ', $line);
        print {$align_fh} join("", map { "$_-$_ " } 0 .. $#tokens), "\n";
    }

    close($corpus_fh);
    close($cased_fh) || die("Error writing $DIR/aligned.cased: $!\n");
    close($lower_fh) || die("Error writing $DIR/aligned.lowercased: $!\n");
    close($align_fh) || die("Error writing $DIR/aligned.a: $!\n");
}
|
|
|
|
|
|
|
|
# Steps 4-10: train the recasing translation model with $TRAIN_SCRIPT.
#
# Uses the files written by prepare_data ($DIR/aligned.{lowercased,cased,a})
# and the language model written by train_lm. Training starts at
# max($FIRST_STEP, 4).
#
# Special case $MAX_LEN == 1: the phrase table is built directly with the
# extract / sort / score / consolidate-direct tools and gzipped, and the
# train script is started at step 9. Every command in that pipeline is
# checked. The phrase table is compressed with "gzip -f" so that a re-run
# replaces an existing phrase-table.gz instead of leaving a stale one.
#
# Dies if any pipeline command or the train script fails.
sub train_recase_model {
    print STDERR "\n(4) Training recasing model @ ".`date`;

    my $first = $FIRST_STEP;
    $first = 4 if $first < 4;

    if ($MAX_LEN == 1) {
        # [ name used in the failure message, shell command ]
        my @pipeline = (
            [ "extract",
              "$EXTRACT $DIR/aligned.cased $DIR/aligned.lowercased $DIR/aligned.a $DIR/extract 1" ],
            [ "sort extract",
              "sort -S 2G $DIR/extract > $DIR/extract.sorted" ],
            [ "score",
              "$SCORE $DIR/extract.sorted /dev/null $DIR/phrase-table-half --NoLex" ],
            [ "consolidate-direct",
              "$CONSOLIDATE_DIRECT $DIR/phrase-table-half $DIR/phrase-table" ],
        );
        for my $stage (@pipeline) {
            my ($name, $stage_cmd) = @$stage;
            system($stage_cmd) == 0
                || die("ERROR: $name (special case max-len 1) failed: $stage_cmd");
        }

        # The half table is only an intermediate; a leftover file is harmless.
        system("rm $DIR/phrase-table-half") == 0
            || print STDERR "WARNING: could not remove $DIR/phrase-table-half\n";

        my $gzip_cmd = "gzip -f $DIR/phrase-table";
        system($gzip_cmd) == 0
            || die("ERROR: gzip phrase table (special case max-len 1) failed: $gzip_cmd");

        $first = 9;
    }

    my $cmd = "$TRAIN_SCRIPT --root-dir $DIR --model-dir $DIR --first-step $first --alignment a --corpus $DIR/aligned --f lowercased --e cased --max-phrase-length $MAX_LEN";

    if ($MAX_LEN == 1) {
        $cmd .= " --score-options='--NoLex --OnlyDirect'";
    }
    else {
        $cmd .= " --score-options='--OnlyDirect'";
    }

    # The language model file and the trailing type code depend on the toolkit.
    if (uc $LM eq "IRSTLM") {
        $cmd .= " --lm 0:3:$DIR/cased.irstlm.gz:1";
    }
    elsif (uc $LM eq "SRILM") {
        $cmd .= " --lm 0:3:$DIR/cased.srilm.gz:8";
    }
    else {
        $cmd .= " --lm 0:3:$DIR/cased.kenlm:8";
    }

    $cmd .= " -config $CONFIG" if $CONFIG;

    print STDERR $cmd."\n";
    system($cmd) == 0 || die("Recaser model training failed with error " . ($? >> 8) . "\n");
}
|
|
|
|
|
|
|
|
# Step 11: delete intermediate files from the working directory $DIR.
#
# Removes extract*, aligned*, lex* and truecaser_model with "rm -f".
# The wait statuses of the four commands are added up; if the sum is
# non-zero a notice is printed to STDERR. The sub never dies.
sub cleanup {
    print STDERR "\n(11) Cleaning up @ ".`date`;

    my $status_sum = 0;
    for my $leftover ("extract*", "aligned*", "lex*", "truecaser_model") {
        `rm -f $DIR/$leftover`;
        $status_sum += $?;
    }

    if ($status_sum != 0) {
        print STDERR "Training successful but some files could not be cleaned.\n";
    }
}
|