From 501c51947b192e8559fa35d820ebd951566bebba Mon Sep 17 00:00:00 2001 From: Lexi Birch Date: Mon, 8 Jun 2015 16:58:50 +0100 Subject: [PATCH] Allow the truecaser to work on uncased ASR input (pass the -a flag) --- scripts/recaser/truecase.perl | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/scripts/recaser/truecase.perl b/scripts/recaser/truecase.perl index 0a4d366e0..7b3dc20fb 100755 --- a/scripts/recaser/truecase.perl +++ b/scripts/recaser/truecase.perl @@ -8,11 +8,14 @@ binmode(STDIN, ":utf8"); binmode(STDOUT, ":utf8"); # apply switches -my ($MODEL, $UNBUFFERED); -die("truecase.perl --model MODEL [-b] < in > out") - unless &GetOptions('model=s' => \$MODEL,'b|unbuffered' => \$UNBUFFERED) +# ASR input has no case: make sure it is lowercased, and make sure known words are still cased, e.g. 'i' is uppercased even if 'i' is known +my ($MODEL, $UNBUFFERED, $ASR); +die("truecase.perl --model MODEL [-b] [-a] < in > out") + unless &GetOptions('model=s' => \$MODEL,'b|unbuffered' => \$UNBUFFERED, 'a|asr' => \$ASR) && defined($MODEL); if (defined($UNBUFFERED) && $UNBUFFERED) { $|=1; } +my $asr = 0; +if (defined($ASR) && $ASR) { $asr = 1; } my (%BEST,%KNOWN); open(MODEL,$MODEL) || die("ERROR: could not open '$MODEL'"); @@ -20,9 +23,11 @@ binmode(MODEL, ":utf8"); while() { my ($word,@OPTIONS) = split; $BEST{ lc($word) } = $word; - $KNOWN{ $word } = 1; - for(my $i=1;$i<$#OPTIONS;$i+=2) { - $KNOWN{ $OPTIONS[$i] } = 1; + if ($asr == 0) { + $KNOWN{ $word } = 1; + for(my $i=1;$i<$#OPTIONS;$i+=2) { + $KNOWN{ $OPTIONS[$i] } = 1; + } } } close(MODEL); @@ -49,6 +54,9 @@ while() { $word = $$WORD[$i]; $otherfactors = ""; } + if ($asr){ + $word = lc($word); # make sure ASR output is not uppercase + } if ($sentence_start && defined($BEST{lc($word)})) { print $BEST{lc($word)}; # truecase sentence start