#!/usr/bin/perl -Tw use warnings; use strict; $|++; # file: translate.cgi # Herve Saint-Amand # saintamh [o] yahoo, com # Universitaet des Saarlandes # Mon May 12 14:10:54 2008 # This CGI script takes a web page URL as a parameter, fetches that page, # translates it using the Moses decoder, and displays the translated version # to the user, similarily to how Google or BabelFish translate web pages. # I don't think I've ever written anything with such a high comment/code ratio, # so hopefully it should be understandable. Just read top to bottom. # TODO: # # - if the document contains it will be lost # - don't insert spaces everywhere around soft tags # - charset autodetection would be nice, but it's not trivial #------------------------------------------------------------------------------ # includes use CGI; use CGI::Carp qw/fatalsToBrowser/; # we use the 2nd perl thread API. I think this means you need perl 5.6 or # higher, compiled with thread support use threads; use threads::shared; use Encode; use HTML::Entities; use HTML::Parser; use LWP::UserAgent; use URI; use URI::Escape; use lib 'lib'; use RemoteProcess; use Subprocess; #------------------------------------------------------------------------------ # constants, config # In order to run this script, you must first start Moses as a sort of daemon # process that accepts connections on some INET port, reads the sentences sent # to it one line at a time and returns translations. The daemon.pl script that # comes with this script does just that -- starts an instance of Moses and # 'plugs' it to the net so it can be used from other machines or just other # processes on the same machine. # # This list here indicates where to find these instances of Moses. May be # localhost, or may be separate machines. # # On the current UniSaar setup we use SSH tunneling to connect to other hosts, # so from this script's POV they're all localhost. These ports are actually # forwarded to other machines. 
# There wouldn't be much point in running 16 instances of Moses on the same
# machine.
my @MOSES_ADDRESSES = map { sprintf ('localhost:90%02d', $_) } (1 .. 16);

# The tokenizer adapts its rules to the language it is dealing with, so the
# source and target languages are declared here.
my $INPUT_LANG  = 'fr';
my $OUTPUT_LANG = 'en';

# To tokenize and detokenize strings consistently with the rest of the Moses
# system, we run the scripts that ship with Moses as external processes. These
# are the command lines used to start them.
my @TOKENIZER_CMD   = ('./bin/tokenizer.perl',   '-l', $INPUT_LANG);
my @DETOKENIZER_CMD = ('./bin/detokenizer.perl', '-l', $OUTPUT_LANG);

# 'Soft tags' are HTML tags whose presence is tolerated inside sentences. All
# other tags are assumed to be sentence-breakers and are used to chop up
# documents into independent sentences. Only these few are allowed within
# sentences.
my %SOFT_TAGS = map { ($_ => 1) } qw(a b i u em font blink tt acronym);

# 'Verbatim tags' are HTML tags whose entire data is to be left untouched and
# reprinted as-is. These also happen to be tags whose content is typically not
# printed by the browser.
my %VERBATIM_TAGS = map { ($_ => 1) } qw(script style);

# Some HTML tags have attributes that contain URLs. Since the page will be
# displayed from another server than its usual source server, relative paths
# would be broken, so all URLs must be made absolute. This maps each such tag
# to the attribute that gets rewritten.
my %URL_ATTRS = (
    a      => 'href',
    img    => 'src',
    form   => 'action',
    link   => 'href',
    script => 'src',
);

# Some HTML tags have attributes that can contain free text that is displayed
# to the user. Data in attributes is not usually translated, but these should
# be.
#
# Note that for implementation reasons these will always be treated as hard,
# sentence-splitting tags. This could be changed but would require a
# substantial re-write of this script.
my %TEXT_ATTR = (
    input => [ 'value' ],
    img   => [ 'alt', 'title' ],
);

# Sentence splitting within a paragraph or block of text is done after
# tokenizing. Tokens matched by this regex are considered to end a sentence,
# and hence are used in splitting the text into sentences.
my $RE_EOS_TOKEN = qr/^(?:\.+|[\?!:;])$/;

# This regex also matches sentence-ending tokens, but tokens matched by this
# one are not included in the sentence itself. Tokens matched by the previous
# regex are sent to Moses as part of the end of the sentence; tokens matched by
# this one are never sent to Moses. Which is why the pipe symbol, which Moses
# doesn't seem to like, must be in here.
my $RE_SPLIT_TOKEN = qr!^[\|\-]+$!;

#------------------------------------------------------------------------------
# global vars

# In cleaner code there wouldn't be global variables, but it simplified things
# to put these here. Eventually I wouldn't mind removing this section.

# This array is very central to the way this script works. The document will be
# chopped up into a list of 'segments'. Each segment is either some HTML code
# and whitespace which we don't translate or manipulate in any way, or a bit of
# text to be translated. It's as if we highlighted in the HTML source the bits
# of text that needed translation, and made each stripe of highlighter, and
# each length of text between them, a segment.
#
# Segments that are untouched HTML are simply strings. If the whole document
# contained no translatable text, this array would only contain strings.
#
# Segments that contain text to be translated are represented as arrayrefs. The
# first element of that arrayref is the text to be translated, with any soft
# tags within it replaced by placeholders of the type MOSESOPENTAG4. The
# remaining elements contain the necessary info to reinsert these tags.
# The placeholders are numbered, and the i-th placeholder corresponds to the
# (i+1)-th element in the arrayref (element 0 being the text). That element is
# itself an array ref, whose first element is the tag name and second element
# is a hashref of attributes.
#
# So this document:
#
#   <p>This is <a href="somewhere">a link</a> but it's not <b>bold</b></p>
#
# would be represented by this @segments array:
#
#   0:  "<p>"
#   1:  [ 0: "This is MOSESOPENTAG0 a link MOSESCLOSETAG0 but it's not" .
#            " MOSESOPENTAG1 bold MOSESCLOSETAG1"
#         1: [ "a", { href => "somewhere" } ]
#         2: [ "b", {} ] ]
#   2:  "</p>"
#
# Finally, there's one hack to be mentioned: text in %TEXT_ATTR attributes
# (defined above) also goes into a segment of its own. Since this text does
# not contain tags, and to signal that the code for the popup containing
# source text should not be inserted around this text, we replace the tag
# information by the "__NOPOPUP__" string. So this document:
#
#   <img src="http://host/pic.png" alt="This describes the image">
#
# would correspond to this @segments array:
#
#   0:  "<img alt=\""
#   1:  [ "This describes the image", "__NOPOPUP__" ]
#   2:  "\" src=\"http://host/pic.png\">"
#
# (Empty-string segments produced by buffer flushes are omitted from both
# listings.)
#
# This is a horrible hack. Yes.
#
# NOTE(review): the HTML markup in the two example documents and in the string
# segments was missing from this copy of the file. It has been reconstructed
# from the segment listing and from what the tag callback pushes; confirm
# against the upstream source.
my @segments;

# Finally, since this script is run in 'tainted' mode (-T switch) for basic
# security reasons, and we'll be launching subprocesses, we need to make sure
# the PATH is clean, otherwise Perl will refuse to do the system() calls.
$ENV{PATH} = '';


#------------------------------------------------------------------------------
# Fetch the source page

# Get the value of the URL param and make sure it's absolute. URI schemes are
# case-insensitive, so "HTTP://host/" must not get a second "http://" prefix.
my $url = CGI->new->param ('url');
die "No URL?" unless $url;
$url = "http://$url" unless ($url =~ m!^[a-z][a-z0-9+.\-]*://!i);

# Configure the Web client. The URL comes straight from the request, so only
# web schemes are allowed: without this restriction a "file:///..." URL would
# make the script read and display files local to this server.
my $lwp = LWP::UserAgent->new (
    agent             => $ENV{HTTP_USER_AGENT} || 'Mozilla/5.0',
    timeout           => 5,
    protocols_allowed => [ 'http', 'https' ],
);

# fetch the web page we want to translate
my $res = $lwp->get ($url);
die "Couldn't fetch page: " . $res->status_line unless $res->is_success;

# decoded_content returns undef when the body cannot be decoded; fail early
# with a clear message rather than with warnings further down.
my $html = $res->decoded_content;
die "Couldn't decode page content" unless defined $html;

# Find the page's base url. It may be different than the URL given to us as
# parameter if for instance that URL redirects to a different one, or the
# document contains a <base> tag.
my $base_url = $res->base;

# Decode entities, except some basics because it confuses our parsing. We need
# this because Moses won't understand the entities. It sometimes introduces
# minor display bugs, though. TODO: decode only alphanumerical entities?
# Protect the escaped angle brackets before decoding everything else: turning
# "&lt;" into "&amp;lt;" makes decode_entities() yield the text "&lt;" instead
# of a raw "<", which would confuse the HTML parsing that follows.
$html =~ s/&((?:lt|gt);?)/&amp;$1/g;
$html = decode_entities ($html);

# Start printing HTML page
print "Content-Type: text/html; charset=UTF-8\n\n";


#------------------------------------------------------------------------------
# Parser stack and state management

# We're going to use a callback parser to parse the HTML file. As we walk the
# HTML tree we maintain a buffer containing the current block of text to be
# translated. These state variables contain that. The buffer is repeatedly
# emptied and its contents pushed onto @segments.
#
# We also remove 'soft' tags from the text as we append it to the buffer,
# replace them with placeholders, and save info about the tags we set aside in
# @buf_tag_index. @buf_tag_stack keeps track of 'currently open' tags, so that
# we can match closing tags to their opening tags.

my $buf_text_has_content = 0;
my $buf_text = '';
my @buf_tag_index;
my @buf_tag_stack;
my $in_verbatim = 0;


# This is called when we find soft tags within text to be translated. Arguments
# are the tag name, a hash of tag attributes, and a boolean telling us whether
# it's an opening or closing tag.
#
# We perform lookups in the above state variables, save the tag info in them if
# necessary, and return a string which is the placeholder to replace that tag.

sub make_placeholder {
    my ($tag, $attr, $closing) = @_;

    if ($closing) {

        # try to match closing tags with their opening sibling, most recently
        # opened first; a matched tag reuses the number of its opening tag
        foreach my $i (reverse 0 .. $#buf_tag_stack) {
            next unless $buf_tag_stack[$i][0] eq $tag;
            my $index = $buf_tag_stack[$i][1];
            splice (@buf_tag_stack, $i, 1);
            return 'MOSESCLOSETAG' . $index;
        }

        # lone closing tags are added to the index but not the stack
        push (@buf_tag_index, [ $tag, $attr ]);
        return 'MOSESCLOSETAG' . $#buf_tag_index;
    }

    # opening tags are added to the index and the stack
    push (@buf_tag_index, [ $tag, $attr ]);
    push (@buf_tag_stack, [ $tag, $#buf_tag_index ]);
    return 'MOSESOPENTAG' . $#buf_tag_index;
}


# When we hit a hard tag, we call this to save any current text segment we have
# to the @segments array. A buffer with content or soft tags is stored as an
# arrayref (text followed by the tag index); anything else is stored as a plain
# string. All buffer state is then reset.

sub flush_buf_text {
    if ($buf_text_has_content || @buf_tag_index) {
        push (@segments, [ $buf_text, @buf_tag_index ] );
    } else {
        push (@segments, $buf_text);
    }

    $buf_text = '';
    @buf_tag_index = ();
    @buf_tag_stack = ();
    $buf_text_has_content = 0;
}


#------------------------------------------------------------------------------
# HTML parser

# Parser callback for when we hit an opening or closing tag. Arguments are the
# tag name, a hashref of its attributes, and a boolean that is true for closing
# tags.

sub start_and_end_h {
    my ($tag, $attr, $closing) = @_;

    # keep track of whether we're in a verbatim segment: holds the tag name
    # while such a tag is open, and 0 once it is closed
    $in_verbatim = $closing ? 0 : $tag if $VERBATIM_TAGS{$tag};

    # make links absolute
    my $url_attr = $URL_ATTRS{$tag};
    make_link_absolute ($tag, $attr, $url_attr)
        if ($url_attr && $attr->{$url_attr});

    # textual attributes require some trickery - FIXME this duplicates some of
    # print_tag
    if ($TEXT_ATTR{$tag}) {
        flush_buf_text ();
        my $found = 0;

        # Each translatable attribute becomes three kinds of segment: the
        # markup leading up to its opening quote, the text itself (flagged
        # '__NOPOPUP__'), and later the closing quote.
        foreach my $text_attr (@{$TEXT_ATTR{$tag}}) {
            if ($attr->{$text_attr}) {
                push (@segments, ($found ? '"' : "<$tag") . " $text_attr=\"");
                push (@segments, [ $attr->{$text_attr}, '__NOPOPUP__' ]);
                delete $attr->{$text_attr};
                $found = 1;
            }
        }

        if ($found) {
            # close the last translated attribute, then print what is left of
            # the tag; a '/' pseudo-attribute marks a self-closing tag
            my $self_close = delete $attr->{'/'} ? 1 : 0;
            my $other_attrs = join ('', map {
                (my $v = $attr->{$_}) =~ s/\"/&\#34;/g;
                " $_=\"$v\"";
            } keys %{$attr});
            push (@segments,
                  "\"" . $other_attrs . ($self_close ? ' /' : '') . '>');
        } else {
            push (@segments, print_tag ($tag, $attr, $closing));
        }

    # if the tag is soft we buffer it, if it's hard we flush the buffer out
    } elsif ($SOFT_TAGS{$tag}) {
        my $placeholder = make_placeholder ($tag, $attr, $closing);
        $buf_text .= ' ' . $placeholder . ' ';

    } else {
        flush_buf_text ();
        push (@segments, print_tag ($tag, $attr, $closing));
    }

    # add a tag at the beginning of the head (do we need this?)
    # NOTE(review): this string literal looks like it lost some markup in this
    # copy of the file (presumably a <base> tag built from $base_url). It is
    # left exactly as found; confirm against the upstream source.
    push (@segments, "\n") if ($tag eq 'head' && !$closing);
}


# NOTE(review): the definition of text_h is truncated in this copy of the
# file. Its remnant is kept verbatim on the next line and was not modified.
# parser callback for text segments sub text_h { my ($text) = @_; if ($in_verbatim) { # when in verbatim mode (in \n"; #------------------------------------------------------------------------------