mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2025-01-07 12:10:36 +03:00
a25193cc5d
This is lint reported by the new lint-checking functionality in beautify.py. (We can change to a different lint checker if we have a better one, but it would probably still flag these same problems.) Lint checking can help a lot, but only if we get the lint under control.
60 lines
1.1 KiB
Perl
Executable File
60 lines
1.1 KiB
Perl
Executable File
#!/usr/bin/env perl

# Replace every whitespace-separated word of a text file by its cluster id.
#
# Arguments (positional, taken from @ARGV):
#   lowercase     either a flag (true: lowercase each word before lookup) or,
#                 if the value contains a '/', a command that is run with the
#                 input file on its standard input and whose output is used
#                 as the text to convert
#   cluster_file  file with one "word cluster" pair per line
#   in            input text file
#   out           output file; one line per input line, cluster ids joined by
#                 single spaces, "0" for words without a cluster entry
#   tmp           accepted for compatibility with callers; not used here

use warnings;
use strict;

my ($lowercase, $cluster_file, $in, $out, $tmp) = @ARGV;
die("ERROR: syntax: $0 lowercase cluster-file in out [tmp]\n")
    unless defined($out);

my $CLUSTER = read_cluster_from_mkcls($cluster_file);

my $in_fh;
# is $lowercase a script?
if ($lowercase =~ /\//) {
    # NOTE(review): this command line is interpreted by the shell, so both
    # $lowercase and $in must come from a trusted caller. The redirect is
    # what feeds the input file to the script, hence no list-form open.
    open($in_fh, "$lowercase < $in|") || die("ERROR: could not open input");
    # the external command already did the lowercasing
    $lowercase = 0;
}
else {
    open($in_fh, '<', $in) || die("ERROR: could not open input");
}
binmode($in_fh, ":utf8");

open(my $out_fh, '>', $out)
    || die("ERROR: could not open output '$out': $!");
binmode($out_fh, ":utf8");

while (my $line = <$in_fh>) {
    # chomp (not chop): a final line without a newline must keep its last
    # character
    chomp($line);

    my @cluster_ids;
    # split ' ' skips leading whitespace, splits on whitespace runs and drops
    # trailing empty fields, so no separate normalization is needed
    foreach my $word (split(' ', $line)) {
        # if lowercase is a flag
        $word = lc($word) if $lowercase;
        push @cluster_ids,
            defined($$CLUSTER{$word}) ? $$CLUSTER{$word} : "0";
    }
    print $out_fh join(" ", @cluster_ids), "\n";
}

# buffered write errors only surface when the handle is closed
close($out_fh) || die("ERROR: could not write output '$out': $!");
close($in_fh);
|
|
|
|
# Read a word-to-cluster mapping from $file.
#
# Each line is split on whitespace; the first field is the word and the
# second its cluster. Blank lines are ignored. The file is read through the
# :utf8 layer.
#
# Returns a reference to a hash mapping word => cluster.
# Dies if the file cannot be opened.
sub read_cluster_from_mkcls {
    my ($file) = @_;
    my %CLUSTER;
    open(my $cluster_fh, '<', $file)
        || die("ERROR: could not open cluster file '$file'");
    binmode($cluster_fh, ":utf8");
    while (my $line = <$cluster_fh>) {
        # split ' ' discards the line terminator together with any other
        # surrounding whitespace, so a last line without a trailing newline
        # keeps its final character
        my ($word, $cluster) = split(' ', $line);
        # blank line: nothing to store
        next unless defined($word);
        $CLUSTER{$word} = $cluster;
    }
    close($cluster_fh);
    return \%CLUSTER;
}
|
|
|
|
# Placeholder with no implementation: ignores its arguments and returns
# nothing (empty list in list context, undef in scalar context).
sub add_cluster_to_string {
    return;
}
|
|
|
|
|