Factor for OOV words is now 0 rather than <unk>, because <unk> interferes with source input. Add an extra argument that controls whether input words are lowercased.

This commit is contained in:
Hieu Hoang 2014-07-13 02:54:58 +01:00
parent a402523ef5
commit a3bd695cd4

View File

@ -2,7 +2,7 @@
use strict;
my ($cluster_file,$in,$out,$tmp) = @ARGV;
my ($lowercase, $cluster_file,$in,$out,$tmp) = @ARGV;
my $CLUSTER = &read_cluster_from_mkcls($cluster_file);
@ -17,7 +17,10 @@ while(<IN>) {
s/ $//;
my $first = 1;
foreach my $word (split) {
my $cluster = defined($$CLUSTER{$word}) ? $$CLUSTER{$word} : "<unk>";
if ($lowercase) {
$word = lc($word);
}
my $cluster = defined($$CLUSTER{$word}) ? $$CLUSTER{$word} : "0";
print OUT " " unless $first;
print OUT $cluster;
$first = 0;