#!/usr/bin/perl -Tw
use warnings;
use strict;

# Turn on autoflush for the currently selected output handle.
$| = 1;

# file: translate.cgi

# Herve Saint-Amand
# saintamh [o] yahoo, com
# Universitaet des Saarlandes
# Mon May 12 14:10:54 2008

# This CGI script takes a web page URL as a parameter, fetches that page,
# translates it using the Moses decoder, and displays the translated version
# to the user, similarly to how Google or BabelFish translate web pages.

# I don't think I've ever written anything with such a high comment/code ratio,
# so hopefully it should be understandable. Just read top to bottom.


# TODO:
#
# - if the document contains [tag name missing from this copy of the
#   source -- TODO restore] it will be lost
# - don't insert spaces everywhere around soft tags
# - charset autodetection would be nice, but it's not trivial

#------------------------------------------------------------------------------
# includes

use CGI;
use CGI::Carp qw(fatalsToBrowser);

# we use the 2nd perl thread API. I think this means you need perl 5.6 or
# higher, compiled with thread support
use threads;
use threads::shared;

use Encode;
use HTML::Entities;
use HTML::Parser;
use LWP::UserAgent;
use URI;
use URI::Escape;

# Project-local modules, looked up in ./lib
use lib 'lib';
use RemoteProcess;
use Subprocess;

#------------------------------------------------------------------------------
# constants, config

# In order to run this script, you must first start Moses as a sort of daemon
# process that accepts connections on some INET port, reads the sentences sent
# to it one line at a time and returns translations. The daemon.pl script that
# comes with this script does just that -- starts an instance of Moses and
# 'plugs' it to the net so it can be used from other machines or just other
# processes on the same machine.
#
# This list here indicates where to find these instances of Moses. May be
# localhost, or may be separate machines.
#
# On the current UniSaar setup we use SSH tunneling to connect to other hosts,
# so from this script's POV they're all localhost. These ports are actually
# forwarded to other machines.
# There wouldn't be much point in running 16
# instances of Moses on the same machine.
#
# Addresses are localhost:9001 through localhost:9016.
my @MOSES_ADDRESSES = map { sprintf 'localhost:90%02d', $_ } 1 .. 16;

# The tokenizer tries to adapt its rules depending on the language it's dealing
# with, so we indicate that here.
my $INPUT_LANG  = 'fr';
my $OUTPUT_LANG = 'en';

# In order to tokenize and detokenize strings in a way that stays consistent
# with how it is done in the rest of the Moses system, we use the scripts that
# come with Moses as external processes. These are the commands we must run to
# start them (program path first, then its arguments).
my @TOKENIZER_CMD   = ('./bin/tokenizer.perl',   '-l', $INPUT_LANG);
my @DETOKENIZER_CMD = ('./bin/detokenizer.perl', '-l', $OUTPUT_LANG);

# We call 'soft tags' HTML tags whose presence is tolerated inside
# sentences. All other tags are assumed to be sentence-breakers and will be
# used to chop up documents into independent sentences. These few, however, are
# allowed within sentences.
my %SOFT_TAGS;
$SOFT_TAGS{$_} = 1 for qw(a b i u em font blink tt acronym);

# We call 'verbatim tags' HTML tags whose entire data is to be left untouched
# and reprinted as-is. These also happen to be tags whose content is typically
# not printed by the browser.
my %VERBATIM_TAGS;
$VERBATIM_TAGS{$_} = 1 for qw(script style);

# Some HTML tags have attributes that contain URLs. Since we'll be displaying
# the page on another server than its usual source server, relative paths will
# be broken, so we need to make all URLs absolute. These are the attributes
# that will be so modified (tag name => attribute name).
my %URL_ATTRS = (
    a      => 'href',
    img    => 'src',
    form   => 'action',
    link   => 'href',
    script => 'src',
);

# Some HTML tags have attributes that can contain free text that is displayed
# to the user. Data in attributes is not usually translated, but these should
# be.
#
# Note that for implementation reasons these will always be treated as hard,
# sentence-splitting tags. This could be changed but would require a
# substantial re-write of this script.
# Tag name => list of attributes of that tag whose text is to be translated.
my %TEXT_ATTR = (
    input => ['value'],
    img   => ['alt', 'title'],
);

# Sentence splitting within a paragraph or block of text is done after
# tokenizing. Tokens matched by this regex will be considered to end a
# sentence, and hence be used in splitting the text into sentences.
# It matches a token made only of dots, or a single one of ? ! : ;
my $RE_EOS_TOKEN = qr/^(?:\.+|[\?!:;])$/;

# This regex also matches sentence-ending tokens, but tokens matched by this
# one will not be included in the sentence itself. Tokens matched by the
# previous regex will be sent to Moses as part of the end of the sentence.
# Tokens matched by this one will never be sent to Moses. Which is why the pipe
# symbol, which Moses doesn't seem to like, must be in here.
# It matches a token made only of pipe and/or dash characters.
my $RE_SPLIT_TOKEN = qr!^[\|\-]+$!;

#------------------------------------------------------------------------------
# global vars

# In cleaner code there wouldn't be global variables, but it simplified things
# to put these here. Eventually I wouldn't mind removing this section.

# This array is very central to the way this script works. The document will be
# chopped up into a list of 'segments'. Each segment is either some HTML code
# and whitespace which we don't translate or manipulate in any way, or a bit of
# text to be translated. It's as if we highlighted in the HTML source the bits
# of text that needed translation, and made each stripe of highlighter, and
# each length of text between them, a segment.
#
# Segments that are untouched HTML are simply strings. If the whole document
# contained no translatable text, this array would only contain strings.
#
# Segments that contain text to be translated are represented as arrayrefs. The
# first element of that arrayref is the text to be translated, with any soft
# tags within it replaced by placeholders of the type MOSESOPENTAG4. The
# remaining elements contain the necessary info to reinsert these tags.
# The placeholders are numbered, and the i-th placeholder corresponds to the
# (i+1)-th element in the arrayref (element 0 being the text). That element is
# itself an array ref, whose first element is the tag name and second element
# is a hashref of attributes.
#
# NOTE(review): the HTML markup in the two examples below was missing from the
# copy of this file under review and has been reconstructed from the
# surrounding description (the <a>/<b> tags come from the segment data; the
# enclosing <p> and the <img> example are illustrative). Confirm against the
# parser code before relying on the exact split points.
#
# So this document:
#
#     <p>This is <a href="somewhere">a link</a> but it's not <b>bold</b></p>
#
# would be represented by this @segments array:
#
#     0: "<p>"
#     1: [ 0: "This is MOSESOPENTAG0 a link MOSESCLOSETAG0 but it's not" .
#             " MOSESOPENTAG1 bold MOSESCLOSETAG1"
#          1: [ "a", { href => "somewhere" } ]
#          2: [ "b", {} ] ]
#     2: "</p>"
#
# Finally, there's one hack to be mentioned: text in %TEXT_ATTR attributes
# (defined above) also goes into a segment of its own. Since this text does
# not contain tags, and to signal that the code for the popup containing
# source text should not be inserted around this text, we replace the tag
# information by the "__NOPOPUP__" string. So this document:
#
#     <img src="blah" alt="Some text">
#
# would correspond to this @segments array:
#
#     0: "<img src=\"blah\" alt=\""
#     1: [ "Some text", "__NOPOPUP__" ]
#     2: "\">"
#
# This is a horrible hack. Yes.
my @segments;

# Finally, since this script is run in 'tainted' mode (-T switch) for basic
# security reasons, and we'll be launching subprocesses, we need to make
# sure the PATH is clean otherwise Perl will refuse to do the system() calls.
# perlsec recommends also removing the other shell-influencing variables.
$ENV{PATH} = '';
delete @ENV{qw(IFS CDPATH ENV BASH_ENV)};

#------------------------------------------------------------------------------
# Fetch the source page

# get value of URL param, make sure it's absolute. The scheme test is
# case-insensitive so that e.g. "HTTP://host/" is not turned into
# "http://HTTP://host/".
my $url = CGI->new->param('url');
die "No URL?" unless $url;
$url = "http://$url" unless $url =~ m!^[a-z][a-z0-9+.\-]*://!i;

# configure Web client. The URL comes straight from the user, so only web
# schemes are allowed; without this LWP would happily serve file:// URLs and
# expose local files.
# NOTE(review): this does not stop requests to internal hosts (for instance
# the Moses ports on localhost) -- consider a host allow/deny list.
my $lwp = LWP::UserAgent->new(
    agent             => $ENV{HTTP_USER_AGENT} || 'Mozilla/5.0',
    timeout           => 5,
    protocols_allowed => ['http', 'https'],
);

# fetch the web page we want to translate
my $res = $lwp->get($url);
die "Couldn't fetch page: " . $res->status_line unless $res->is_success;

# decoded_content returns undef when the body cannot be decoded; fail loudly
# here rather than process an undefined document.
my $html = $res->decoded_content;
die "Couldn't decode page content" unless defined $html;

# Find the page's base url. It may be different than the URL given to us as
# parameter if for instance that URL redirects to a different one, or the
# document contains a