Allow the truecaser to work on uncased ASR input; enable this mode by passing the -a flag

This commit is contained in:
Lexi Birch 2015-06-08 16:58:50 +01:00
parent 9266d65304
commit 501c51947b

View File

@ -8,11 +8,14 @@ binmode(STDIN, ":utf8");
binmode(STDOUT, ":utf8"); binmode(STDOUT, ":utf8");
# apply switches # apply switches
my ($MODEL, $UNBUFFERED); # ASR input has no case: make sure it is lowercased, and make sure known words are cased, e.g. 'i' is uppercased even if 'i' is known
die("truecase.perl --model MODEL [-b] < in > out") my ($MODEL, $UNBUFFERED, $ASR);
unless &GetOptions('model=s' => \$MODEL,'b|unbuffered' => \$UNBUFFERED) die("truecase.perl --model MODEL [-b] [-a] < in > out")
unless &GetOptions('model=s' => \$MODEL,'b|unbuffered' => \$UNBUFFERED, 'a|asr' => \$ASR)
&& defined($MODEL); && defined($MODEL);
if (defined($UNBUFFERED) && $UNBUFFERED) { $|=1; } if (defined($UNBUFFERED) && $UNBUFFERED) { $|=1; }
my $asr = 0;
if (defined($ASR) && $ASR) { $asr = 1; }
my (%BEST,%KNOWN); my (%BEST,%KNOWN);
open(MODEL,$MODEL) || die("ERROR: could not open '$MODEL'"); open(MODEL,$MODEL) || die("ERROR: could not open '$MODEL'");
@ -20,10 +23,12 @@ binmode(MODEL, ":utf8");
while(<MODEL>) { while(<MODEL>) {
my ($word,@OPTIONS) = split; my ($word,@OPTIONS) = split;
$BEST{ lc($word) } = $word; $BEST{ lc($word) } = $word;
if ($asr == 0) {
$KNOWN{ $word } = 1; $KNOWN{ $word } = 1;
for(my $i=1;$i<$#OPTIONS;$i+=2) { for(my $i=1;$i<$#OPTIONS;$i+=2) {
$KNOWN{ $OPTIONS[$i] } = 1; $KNOWN{ $OPTIONS[$i] } = 1;
} }
}
} }
close(MODEL); close(MODEL);
@ -49,6 +54,9 @@ while(<STDIN>) {
$word = $$WORD[$i]; $word = $$WORD[$i];
$otherfactors = ""; $otherfactors = "";
} }
if ($asr){
$word = lc($word); #make sure ASR output is not uc
}
if ($sentence_start && defined($BEST{lc($word)})) { if ($sentence_start && defined($BEST{lc($word)})) {
print $BEST{lc($word)}; # truecase sentence start print $BEST{lc($word)}; # truecase sentence start