#!/usr/bin/perl -Tw
use warnings;
use strict;
$|++;

# file: translate.cgi

# Herve Saint-Amand
# saintamh [o] yahoo, com
# Universitaet des Saarlandes
# Mon May 12 14:10:54 2008

# This CGI script takes a web page URL as a parameter, fetches that page,
# translates it using the Moses decoder, and displays the translated version
# to the user, similarly to how Google or BabelFish translate web pages.

# I don't think I've ever written anything with such a high comment/code ratio,
# so hopefully it should be understandable. Just read top to bottom.


# TODO:
#
# - if the document contains <a name='anchor'></a> it will be lost
# - don't insert spaces everywhere around soft tags
# - charset autodetection would be nice, but it's not trivial

#------------------------------------------------------------------------------
# includes

use CGI;
use CGI::Carp qw/fatalsToBrowser/;

# we use the 2nd perl thread API. I think this means you need perl 5.6 or
# higher, compiled with thread support
use threads;
use threads::shared;

use Encode;
use HTML::Entities;
use HTML::Parser;
use LWP::UserAgent;
use URI;
use URI::Escape;

use lib 'lib';
use RemoteProcess;
use Subprocess;

#------------------------------------------------------------------------------
# constants, config

# In order to run this script, you must first start Moses as a sort of daemon
# process that accepts connections on some INET port, reads the sentences sent
# to it one line at a time and returns translations. The daemon.pl script that
# comes with this script does just that -- starts an instance of Moses and
# 'plugs' it to the net so it can be used from other machines or just other
# processes on the same machine.
#
# This list here indicates where to find these instances of Moses. May be
# localhost, or may be separate machines.
#
# On the current UniSaar setup we use SSH tunneling to connect to other hosts,
# so from this script's POV they're all localhost. These ports are actually
# forwarded to other machines. There wouldn't be much point in running 16
# instances of Moses on the same machine.

my @MOSES_ADDRESSES = map "localhost:90$_",
    qw/01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16/;
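# (This map expands to the 16 addresses 'localhost:9001' through
# 'localhost:9016', one per Moses daemon we expect to be listening.)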


# The tokenizer tries to adapt its rules depending on the language it's dealing
# with, so we indicate that here.

my $INPUT_LANG  = 'fr';
my $OUTPUT_LANG = 'en';


# In order to tokenize and detokenize strings in a way that stays consistent
# with how it is done in the rest of the Moses system, we use the scripts that
# come with Moses as external processes. These are the commands we must run to
# start them.

my @TOKENIZER_CMD   = ('./bin/tokenizer.perl',   '-l', $INPUT_LANG);
my @DETOKENIZER_CMD = ('./bin/detokenizer.perl', '-l', $OUTPUT_LANG);
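# (For illustration only -- the exact output depends on tokenizer.perl -- but a
# line like 'Bonjour, le monde!' is expected to come back roughly as
# 'Bonjour , le monde !', and the detokenizer later undoes that spacing.)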


# We call 'soft tags' HTML tags whose presence is tolerated inside
# sentences. All other tags are assumed to be sentence-breakers and will be
# used to chop up documents into independent sentences. These few, however, are
# allowed within sentences.

my %SOFT_TAGS = map {$_ => 1} qw/a b i u em font blink tt acronym/;


# We call 'verbatim tags' HTML tags whose entire data is to be left untouched
# and reprinted as-is. These also happen to be tags whose content is typically
# not printed by the browser.

my %VERBATIM_TAGS = map {$_ => 1} qw/script style/;


# Some HTML tags have attributes that contain URLs. Since we'll be displaying
# the page on another server than its usual source server, relative paths will
# be broken, so we need to make all URLs absolute. These are the attributes
# that will be so modified.

my %URL_ATTRS = %{{
    a      => 'href',
    img    => 'src',
    form   => 'action',
    link   => 'href',
    script => 'src',
}};


# Some HTML tags have attributes that can contain free text that is displayed
# to the user. Data in attributes is not usually translated, but these should
# be.
#
# Note that for implementation reasons these will always be treated as hard,
# sentence-splitting tags. This could be changed but would require a
# substantial re-write of this script.

my %TEXT_ATTR = %{{ input => [qw/value/], img => [qw/alt title/], }};


# Sentence splitting within a paragraph or block of text is done after
# tokenizing. Tokens matched by this regex will be considered to end a
# sentence, and hence be used in splitting the text into sentences.

my $RE_EOS_TOKEN = qr/^(?:\.+|[\?!:;])$/;
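# (e.g. the tokens '.', '...', '?', '!', ':' and ';' all match; a token such as
# 'etc.' does not, since the regex only matches tokens made up purely of these
# punctuation characters.)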

# This regex also matches sentence-ending tokens, but tokens matched by this
# one will not be included in the sentence itself. Tokens matched by the
# previous regex will be sent to Moses as part of the end of the sentence.
# Tokens matched by this one will never be sent to Moses, which is why the pipe
# symbol, which Moses doesn't seem to like, must be in here.

my $RE_SPLIT_TOKEN = qr!^[\|\-]+$!;
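# (e.g. the tokens '|', '-' and '---' all match; such a token ends the current
# sentence and is stitched back into the output afterwards instead of being
# sent through Moses.)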

#------------------------------------------------------------------------------
# global vars

# In cleaner code there wouldn't be global variables, but it simplified things
# to put these here. Eventually I wouldn't mind removing this section.


# This array is very central to the way this script works. The document will be
# chopped up into a list of 'segments'. Each segment is either some HTML code
# and whitespace which we don't translate or manipulate in any way, or a bit of
# text to be translated. It's as if we highlighted in the HTML source the bits
# of text that needed translation, and made each stripe of highlighter, and
# each length of text between them, a segment.
#
# Segments that are untouched HTML are simply strings. If the whole document
# contained no translatable text, this array would only contain strings.
#
# Segments that contain text to be translated are represented as arrayrefs. The
# first element of that arrayref is the text to be translated, with any soft
# tags within it replaced by placeholders of the type MOSESOPENTAG4. The
# remaining elements contain the necessary info to reinsert these tags. The
# placeholders are numbered, and the i-th placeholder corresponds to the
# (i+1)-th element in the arrayref (element 0 being the text). That element is
# itself an array ref, whose first element is the tag name and second element
# is a hashref of attributes.
#
# So this document:
#
#   <p>This is <a href="somewhere">a link</a> but it's not <b>bold</b></p>
#
# would be represented by this @segments array:
#
#   0: "<p>"
#   1: [ 0: "This is MOSESOPENTAG0 a link MOSESCLOSETAG0 but it's not" .
#           " MOSESOPENTAG1 bold MOSESCLOSETAG1"
#        1: [ "a", { href => "somewhere" } ]
#        2: [ "b", {} ] ]
#   2: "</p>"
#
# Finally, there's one hack to be mentioned: text in %TEXT_ATTR attributes
# (defined above) also goes into a segment of its own. Since this text does
# not contain tags, and to signal that the code for the popup containing
# source text should not be inserted around this text, we replace the tag
# information by the "__NOPOPUP__" string. So this document:
#
#   <img src="blah" alt="This describes the image">
#
# would correspond to this @segments array:
#
#   0: "<img src=\"blah\" alt=\""
#   1: [ "This describes the image", "__NOPOPUP__" ]
#   2: "\">"
#
# This is a horrible hack. Yes.

my @segments;


# Finally, since this script is run in 'tainted' mode (-T switch) for basic
# security reasons, and since we'll be launching subprocesses, we need to make
# sure the PATH is clean, otherwise Perl will refuse to do the system() calls.

$ENV{PATH} = '';

#------------------------------------------------------------------------------
# Fetch the source page

# get value of URL param, make sure it's absolute
my $url = CGI->new->param ('url');
die "No URL?" unless $url;
$url = "http://$url" unless ($url =~ m!^[a-z]+://!);

# configure Web client
my $lwp = new LWP::UserAgent (%{{
    agent   => $ENV{HTTP_USER_AGENT} || 'Mozilla/5.0',
    timeout => 5,
}});

# fetch the web page we want to translate
my $res = $lwp->get ($url);
die "Couldn't fetch page: " . $res->status_line unless $res->is_success;
my $html = $res->decoded_content;

# Find the page's base URL. It may be different from the URL given to us as a
# parameter if, for instance, that URL redirects to a different one, or the
# document contains a <base> tag.
my $base_url = $res->base;

# Decode entities, except some basics, because decoding those would confuse our
# parsing. We need this because Moses won't understand the entities. It
# sometimes introduces minor display bugs, though. TODO: decode only
# alphanumerical entities?
$html =~ s/&((?:lt|gt);?)/&amp;$1/g;
$html = decode_entities ($html);
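# (e.g. an entity like '&eacute;' comes out as a literal 'é' for Moses to see,
# while '&lt;' is first turned into '&amp;lt;' above and so survives decoding
# as '&lt;', keeping it from being mistaken for real markup.)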

# Start printing HTML page
print "Content-Type: text/html; charset=UTF-8\n\n";

#------------------------------------------------------------------------------
# Parser stack and state management

# We're going to use a callback parser to parse the HTML file. As we walk the
# HTML tree we maintain a buffer containing the current block of text to be
# translated. These state variables contain that. The buffer is repeatedly
# emptied and its contents pushed onto @segments.
#
# We also remove 'soft' tags from the text as we append it to the buffer,
# replace them with placeholders, and save info about the tags we set aside in
# @buf_tag_index. @buf_tag_stack keeps track of 'currently open' tags, so that
# we can match closing tags to their opening tags.

my $buf_text_has_content = 0;
my $buf_text = '';
my @buf_tag_index;
my @buf_tag_stack;

my $in_verbatim = 0;


# This is called when we find soft tags within text to be translated. Arguments
# are the tag name, a hash of tag attributes, and a boolean telling us whether
# it's an opening or closing tag.
#
# We perform lookups in the above state variables, save the tag info in them if
# necessary, and return a string which is the placeholder to replace that tag.

sub make_placeholder {
    my ($tag, $attr, $closing) = @_;
    my $placeholder = '';

    if ($closing) {

        # try to match closing tags with their opening sibling
        foreach my $i (reverse 0 .. $#buf_tag_stack) {
            if ($buf_tag_stack[$i][0] eq $tag) {
                $placeholder = 'MOSESCLOSETAG' . $buf_tag_stack[$i][1];
                splice (@buf_tag_stack, $i, 1);
                last;
            }
        }

        # lone closing tags are added to the index but not the stack
        if (!$placeholder) {
            push (@buf_tag_index, [ $tag, $attr ]);
            $placeholder = 'MOSESCLOSETAG' . $#buf_tag_index;
        }

    } else {
        # opening tags are added to the index and the stack
        push (@buf_tag_index, [ $tag, $attr ]);
        push (@buf_tag_stack, [ $tag, $#buf_tag_index ]);
        $placeholder = 'MOSESOPENTAG' . $#buf_tag_index;
    }

    return $placeholder;
}
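# (For example, starting from an empty buffer, make_placeholder ('a',
# {href => '...'}, 0) returns 'MOSESOPENTAG0' and records the tag in
# @buf_tag_index; the matching call make_placeholder ('a', {}, 1) then returns
# 'MOSESCLOSETAG0'.)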


# When we hit a hard tag, we call this to save any current text segment we have
# to the @segments array.

sub flush_buf_text {
    if ($buf_text_has_content || @buf_tag_index) {
        push (@segments, [ $buf_text, @buf_tag_index ] );
    } else {
        push (@segments, $buf_text);
    }

    $buf_text = '';
    @buf_tag_index = ();
    @buf_tag_stack = ();
    $buf_text_has_content = 0;
}

#------------------------------------------------------------------------------
# HTML parser

# Parser callback for when we hit an opening or closing tag
sub start_and_end_h {
    my ($tag, $attr, $closing) = @_;

    # keep track of whether we're in a verbatim segment
    $in_verbatim = $closing ? 0 : $tag
        if $VERBATIM_TAGS{$tag};

    # make links absolute
    my $url_attr = $URL_ATTRS{$tag};
    &make_link_absolute ($tag, $attr, $url_attr)
        if ($url_attr && $attr->{$url_attr});

    # textual attributes require some trickery - FIXME this duplicates some of
    # &print_tag
    if ($TEXT_ATTR{$tag}) {
        &flush_buf_text ();
        my $found = 0;

        # there's an example of how this works in the comments that precede the
        # declaration of @segments, above
        foreach my $text_attr (@{$TEXT_ATTR{$tag}}) {
            if ($attr->{$text_attr}) {
                push (@segments, ($found ? '"' : "<$tag") . " $text_attr=\"");
                push (@segments, [ $attr->{$text_attr}, '__NOPOPUP__' ]);
                delete $attr->{$text_attr};
                $found = 1;
            }
        }

        if ($found) {
            my $self_close = delete $attr->{'/'} ? 1 : 0;
            push (@segments, "\"" . join ('', map {
                (my $v = $attr->{$_}) =~ s/\"/&\#34;/g;
                " $_=\"$v\"";
            } keys %{$attr}) . ($self_close ? ' /' : '') . '>');
        } else {
            push (@segments, &print_tag ($tag, $attr, $closing));
        }

    # if the tag is soft we buffer it, if it's hard we flush the buffer out
    } elsif ($SOFT_TAGS{$tag}) {
        my $placeholder = &make_placeholder ($tag, $attr, $closing);
        $buf_text .= ' ' . $placeholder . ' ';
    } else {
        &flush_buf_text ();
        push (@segments, &print_tag ($tag, $attr, $closing));
    }

    # add a <base> tag at the beginning of the <head> (do we need this?)
    push (@segments, "<base href='$base_url'>\n")
        if ($tag eq 'head' && !$closing);
}


# parser callback for text segments
sub text_h {
    my ($text) = @_;

    if ($in_verbatim) {
        # when in verbatim mode (in <script> or <style> tags), everything just
        # gets reprinted as-is

        # .. except this
        $text =~ s/\@import\s+\"([^\n\"]+)\"/
            '@import "' . URI->new_abs($1, $base_url)->as_string . '"';
        /ge;

        push (@segments, $text);

    } else {
        # otherwise add the text to the sentence buffer
        $buf_text .= $text;
        $buf_text_has_content ||= ($text =~ /\p{IsAlnum}/);
    }
}

sub rest_h {
    my ($text) = @_;
    &flush_buf_text ();
    push (@segments, $text);
}


my $parser = HTML::Parser->new (%{{
    start_h       => [\&start_and_end_h, 'tagname, attr' ],
    text_h        => [\&text_h,          'text'          ],
    declaration_h => [\&rest_h,          'text'          ],
    comment_h     => [\&rest_h,          'text'          ],

    end_h         => [sub {
        &start_and_end_h (shift, {}, 1);
    }, 'tagname' ],
}});

# parse it into @segments
$parser->parse ($html);
undef $parser;

#------------------------------------------------------------------------------
# Run translation threads

# We have now parsed the whole document into the @segments array. Now we start
# the actual translation process.
#
# We start one thread for each Moses host defined in the configuration above.
# All threads will then race to translate text segments, working down the
# @segments array. They also print segments as soon as a sequence of segments
# is done.


# These are the variables that are shared between threads and used for
# synchronisation.

my @input       :shared = map { ref $_ ? $_->[0] : undef } @segments;
my @output      :shared = map { ref $_ ? undef : $_ } @segments;
my $next_job_i  :shared = 0;
my $num_printed :shared = 0;
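# (@input holds the source text for translatable segments and undef for plain
# HTML segments; @output starts out with the HTML segments already filled in
# and undef wherever a translation is still needed, so a slot counts as done as
# soon as it is defined.)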


# This sub will be run in parallel by the threads
my $thread_body = sub {
    my ($moses_i) = @_;

    # each thread uses its own tokenizer and detokenizer subprocess
    # (FIXME -- isn't this hugely inefficient?)
    my $tokenizer   = new Subprocess (@TOKENIZER_CMD);
    my $detokenizer = new Subprocess (@DETOKENIZER_CMD);
    $tokenizer->start;
    $detokenizer->start;

    # each thread also connects to its own Moses server
    my ($host, $port) = split /:/, $MOSES_ADDRESSES[$moses_i];
    my $moses = new RemoteProcess ($host, $port) ||
        die "Can't connect to '$host:$port'";
    $moses->start;

    for (;;) {

        # Snatch the next unassigned job from the queue
        my $job_i;
        { lock $next_job_i; $job_i = $next_job_i++; }
        last if ($job_i > $#input);

        # If it's a text job, translate it, otherwise just don't do anything
        $output[$job_i] = &translate_text_with_placeholders
            ($input[$job_i], $moses, $tokenizer, $detokenizer)
            if (!defined $output[$job_i]);

        # Print out any sequential block of done jobs
        lock $num_printed;
        while ($num_printed < @input && defined $output[$num_printed]) {
            my $print;

            if (ref $segments[$num_printed]) {

                # replace placeholders by the original tags
                my @buf_tag_index = @{$segments[$num_printed]};
                shift @buf_tag_index;
                $print = &replace_placeholders_by_tags
                    ($output[$num_printed], @buf_tag_index);

                # wrap in code to popup the original text onmouseover
                if ($buf_tag_index[0] ne '__NOPOPUP__') {
                    $print = &add_original_text_popup
                        ($input[$num_printed], $print);
                } else {
                    $print =~ s/\"/&\#34;/g;
                }

            } else {
                # HTML segments are just printed as-is
                $print = $segments[$num_printed];
            }

            print encode ('UTF-8', $print);
            $num_printed++;
        }
    }
};

if (@MOSES_ADDRESSES == 1) {

    # If there's only one instance of Moses, there's no point in forking a
    # single thread and waiting for it to complete, so we just run the thread
    # code directly in the main thread
    $thread_body->(0);

} else {

    # Start all threads and wait for them all to finish
    my @threads = map {
        threads->create ($thread_body, $_);
    } (0 .. $#MOSES_ADDRESSES);
    $_->join foreach @threads;

}

#------------------------------------------------------------------------------
# Translation subs


# This sub is called by the translation thread for each text segment. The
# arguments are the input text and pointers to the various external processes
# needed for processing.
#
# At this stage the input text contains placeholders that look like
# "MOSESOPENTAG2". We don't need to know which tag they stand for, but we do
# need to set them aside, translate the remaining plain text, and reinsert them
# at the correct place in the translation.

sub translate_text_with_placeholders {
    my ($input_text, $moses, $tokenizer, $detokenizer) = @_;
    my $traced_text = '';

    # Start by tokenizing the text, with placeholders still in it. The
    # placeholders are designed to be interpreted as individual tokens by the
    # tokenizer.
    my @tokens = split /\s+/, $tokenizer->do_line ($input_text);

    # remove placeholders, and for each remaining token, make a list of the
    # tags that cover it
    @tokens = ('START', @tokens, 'END');
    my @tags_over_token = &_extract_placeholders (\@tokens);
    @tokens = @tokens[1 .. $#tokens-1];

    # translate sentence by sentence
    my $token_base_i = 0;
    while (@tokens > 0) {

        # take a string of tokens up to the next sentence-ending token
        my (@s_tokens, $split_token);
        while (@tokens > 0) {
            if ($tokens[0] =~ $RE_EOS_TOKEN) {
                push (@s_tokens, shift @tokens);
                last;
            } elsif ($tokens[0] =~ $RE_SPLIT_TOKEN) {
                $split_token = shift @tokens;
                last;
            } else {
                push (@s_tokens, shift @tokens);
            }
        }

        # Join together tokens into a plain text string. This is now ready to
        # be shipped to Moses: all tags and placeholders have been removed,
        # and it's a single sentence. We also lowercase as needed, and make
        # a note of whether we did.
        my $s_input_text = join (' ', @s_tokens);
        my $was_ucfirst =
            ($s_input_text =~ s/^(\p{IsUpper})(?=\p{IsLower})/lc $1;/e);
        my $was_allcaps =
            ($s_input_text =~ s/^([\p{IsUpper}\P{IsAlpha}]+)$/lc $1;/e);
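        # (e.g. "Bonjour tout le monde ." becomes "bonjour tout le monde ." and
        # $was_ucfirst is set, so the capital letter can be restored on the
        # translated sentence below.)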

        # Translate the plain text sentence
        # my $s_traced_text = &_translate_text_pig_latin ($s_input_text);
        my $s_traced_text = &_translate_text_moses ($s_input_text, $moses);

        # Early post-translation formatting fixes
        $s_traced_text .= " $split_token" if $split_token;
        $s_traced_text = ucfirst $s_traced_text if $was_ucfirst;
        $s_traced_text = uc $s_traced_text if $was_allcaps;

        # Update trace numbers to fit in the Grand Scheme of Things
        $s_traced_text =~ s{\s*\|(\d+)-(\d+)\|}{
            ' |' . ($1+$token_base_i) . '-' . ($2+$token_base_i) . '| ';
        }ge;
        $token_base_i += @s_tokens + ($split_token ? 1 : 0);
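        # (Moses numbers tokens within each sentence from 0, so if the first
        # sentence used tokens 0-4, a trace |0-1| in the second sentence is
        # shifted to |5-6| so that it indexes into the segment as a whole.)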

        $traced_text .= $s_traced_text . ' ';
    }

    # Apply to every segment in the traced output the union of all tags
    # that covered tokens in the corresponding source segment
    my $output_text = &_reinsert_placeholders
        ($traced_text, @tags_over_token);

    # Try to remove spaces inserted by the tokenizer
    $output_text = $detokenizer->do_line ($output_text);

    return $output_text;
}


# This sub takes an array of tokens, some of which are placeholders for
# formatting tags. Some of these tag placeholders are for opening tags, some
# are for closing tags. What we do here is we remove all these placeholders
# from the list and create an index of which of the remaining tokens are
# covered by which tags (by which we mean, inside their scope).
#
# So for instance if the given array looks like this:
#
#   [ "MOSESOPENTAG0", "MOSESOPENTAG1", "Hello", "MOSESCLOSETAG1",
#     "MOSESOPENTAG2", "world", "MOSESCLOSETAG2", "MOSESCLOSETAG0" ]
#
# after executing this sub the array will look like this:
#
#   [ "Hello", "world" ]
#
# and the @tags_over_token index will have been created, containing this:
#
#   [ [0,1], [0,2] ]
#
# indicating that the first token ("Hello") is covered by tags 0 and 1, and
# that the 2nd token ("world") is covered by tags 0 and 2.

sub _extract_placeholders {
    my ($tokens) = @_;
    my @tags_over_token = ([]);

    while (@tags_over_token <= @$tokens) {
        my $i = $#tags_over_token;
        my @t = @{$tags_over_token[$i]};

        if ($tokens->[$i] =~ /^MOSESOPENTAG(\d+)$/) {
            $tags_over_token[$i] = [@t, $1];
            splice (@{$tokens}, $i, 1);
        } elsif ($tokens->[$i] =~ /^MOSESCLOSETAG(\d+)$/) {
            if (grep $_ == $1, @t) {
                $tags_over_token[$i] = [grep $_ != $1, @t];
            } else {
                push (@{$tags_over_token[$_]}, $1) foreach (0 .. $i-1);
            }
            splice (@{$tokens}, $i, 1);
        } else {
            push (@tags_over_token, [@t]);
        }
    }

    return @tags_over_token;
}


# This sub does pretty much the opposite of the preceding sub. It gets as
# argument the traced text output by Moses and the @tags_over_token array
# computed by the preceding sub. The traced text looks something like this:
#
#   Hallo |0-0| Welt |1-1|
#
# For each such segment which is between two traces, we will want to apply
# to it the union of all tags that were over the corresponding source text.
#
# This sub does that, and returns the string, minus traces, plus reinserted
# placeholders.
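#
# (Continuing the example above, with tags 0 and 1 over the first source token
# and tags 0 and 2 over the second, the returned string would be, up to
# whitespace and the ordering of tags opened at the same point, roughly:
#
#   MOSESOPENTAG0 MOSESOPENTAG1 Hallo MOSESCLOSETAG1 MOSESOPENTAG2 Welt
#   MOSESCLOSETAG2 MOSESCLOSETAG0
#
# which replace_placeholders_by_tags below then turns back into real tags.)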

sub _reinsert_placeholders {
    my ($traced_text, @tags_over_token) = @_;

    my %cur_open_tags = map {$_ => 1} @{$tags_over_token[0]};
    my $output_text = '';

    while ($traced_text =~ s/^(.+?)\s*\|(\d+)-+(\d+)\|\s*//) {
        my ($segment, $from, $to) = ($1, $2+1, $3+1);

        # list all tags that cover the source segment
        my %segment_tags = map {$_ => 1} map {
            @{$tags_over_token[$_]};
        } ($from .. $to);

        $output_text .= " MOSESCLOSETAG$_ "
            foreach (grep !$segment_tags{$_}, keys %cur_open_tags);
        $output_text .= " MOSESOPENTAG$_ "
            foreach (grep !$cur_open_tags{$_}, keys %segment_tags);
        %cur_open_tags = %segment_tags;

        $output_text .= " $segment ";
    }

    my %final_tags = map {$_ => 1} @{$tags_over_token[-1]};
    $output_text .= " MOSESCLOSETAG$_ "
        foreach (grep !$final_tags{$_}, keys %cur_open_tags);
    $output_text .= " MOSESOPENTAG$_ "
        foreach (grep !$cur_open_tags{$_}, keys %final_tags);

    $output_text .= $traced_text;
    return $output_text;
}


# Finally this one replaces the placeholders by the actual tags.

sub replace_placeholders_by_tags {
    my ($buf_text, @buf_tag_index) = @_;

    # replace the placeholders by the original tags
    $buf_text =~ s{MOSES(OPEN|CLOSE)TAG(\d+)}{
        &print_tag (@{$buf_tag_index[$2]}, $1 eq 'CLOSE');
    }ge;

    return $buf_text;
}

#------------------------------------------------------------------------------
# Interfaces to actual plain-text translators. These take a plain string and
# return a traced (Moses-style) translation


# This sub is used when you want to debug everything in this script except the
# actual translation. Translates to Pig Latin.

sub _translate_text_pig_latin {
    my ($text) = @_;

    $text =~ s/\b([bcdfhj-np-tv-z]+)([a-z]+)/
        ($1 eq ucfirst $1 ? ucfirst $2 : $2) .
        ($2 eq lc $2 ? lc $1 : $1) .
        'ay';
    /gei;
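    # (e.g. "Hello world" becomes "Ellohay orldway" here, before the fake
    # traces are added below.)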

    # insert fake traces
    my $i = -1;
    $text .= ' ';
    $text =~ s/\s+/$i++; " |$i-$i| "/ge;

    return $text;
}


# This one, given a handle to a Moses subprocess, will use that to translate
# the text. Not much to see here actually.

sub _translate_text_moses {
    my ($text, $moses) = @_;

    my $traced_text = $moses->do_line ($text);
    unless ($traced_text) {
        my @tokens = split /\s+/, $text;

        # insert a fake trace if for some reason moses didn't return one
        # (which most likely indicates something is quite wrong)
        $traced_text = $text . " |0-$#tokens|";
    }

    return $traced_text;
}

#------------------------------------------------------------------------------
# basic HTML manipulation subs

sub make_link_absolute {
    my ($tag_name, $attr_hash, $attr_name) = @_;

    # make it absolute
    $attr_hash->{$attr_name} = URI->new_abs
        ($attr_hash->{$attr_name}, $base_url)->as_string;

    # make it point back to us if it's a link
    if ($tag_name eq 'a') {
        $attr_hash->{$attr_name} = 'index.cgi?url=' .
            uri_escape ($attr_hash->{$attr_name});
        $attr_hash->{target} = '_top';
    }
}
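# (For example, an <a href="page.html"> on a page at http://example.com/dir/
# first becomes http://example.com/dir/page.html and is then rewritten to
# roughly 'index.cgi?url=http%3A%2F%2Fexample.com%2Fdir%2Fpage.html', so
# following the link keeps the user inside the translation interface.)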

sub print_tag {
    my ($tag_name, $attr_hash, $closing) = @_;
    my $self_close = $attr_hash->{'/'} ? 1 : 0;

    return '<' . ($closing ? '/' : '') . $tag_name .
        ($closing ? '' : join ('', map {
            my $v = $attr_hash->{$_};
            $v =~ s/\"/&\#34;/g;
            " $_=\"$v\"";
        } keys %{$attr_hash})) .
        ($self_close ? ' /' : '') . '>';
}
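# (e.g. print_tag ('a', { href => 'x' }, 0) gives '<a href="x">' and
# print_tag ('a', {}, 1) gives '</a>'; double quotes inside attribute values
# are escaped as &#34; on the way out.)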

sub add_original_text_popup {
    my ($input_text, $output_html) = @_;

    $input_text =~ s/\"/&\#34;/g;
    $input_text =~ s/MOSES(?:OPEN|CLOSE)TAG\d+//g;
    $input_text =~ s/^\s+//;
    $input_text =~ s/\s+$//;
    $input_text =~ s/\s+/ /g;

    # Using this technique for displaying the source text pop-up means we don't
    # have to fiddle with JavaScript, but it also means you need the LongTitles
    # extension installed if using Firefox.. *I* happen to have it, so..
    return "<span title=\"$input_text\">$output_html</span>";
}

#------------------------------------------------------------------------------
# conclusion

# stop the top frame counter
my $num_sentences = grep ref $_, @segments;
print "<script> top.numSentences = $num_sentences </script>\n";

#------------------------------------------------------------------------------