#!/usr/bin/perl -w # $Id$ use strict; use Getopt::Long "GetOptions"; my ($SRC,$INFILE,$RECASE_MODEL,$UNBUFFERED); my $MOSES = "moses"; my $LANGUAGE = "en"; # English by default; die("recase.perl --in file --model ini-file > out") unless &GetOptions('in=s' => \$INFILE, 'headline=s' => \$SRC, 'lang=s' => \$LANGUAGE, 'moses=s' => \$MOSES, 'model=s' => \$RECASE_MODEL, 'b|unbuffered' => \$UNBUFFERED) && defined($INFILE) && defined($RECASE_MODEL); if (defined($UNBUFFERED) && $UNBUFFERED) { $|=1; } my %treated_languages = map { ($_,1) } qw/en cs/; die "I don't know any rules for $LANGUAGE. Use 'en' as the default." if ! defined $treated_languages{$LANGUAGE}; # lowercase even in headline my %ALWAYS_LOWER; if ($LANGUAGE eq "en" ) { foreach ("a","after","against","al-.+","and","any","as","at","be","because","between","by","during","el-.+","for","from","his","in","is","its","last","not","of","off","on","than","the","their","this","to","was","were","which","will","with") { $ALWAYS_LOWER{$_} = 1; } } # find out about the headlines my @HEADLINE; if (defined($SRC)) { open(SRC,$SRC); my $headline_flag = 0; while() { $headline_flag = 1 if //; $headline_flag = 0 if /<.hl>/; next unless /^) { chomp; s/\s+$//; my @WORD = split(/ /); # uppercase initial word &uppercase(\$WORD[0]); if ($LANGUAGE ne "cs") { # uppercase after period # unless in Czech where '.' is used after all ordinals for(my $i=1;$i