2014-04-03 23:35:26 +04:00
|
|
|
#!/usr/bin/env python
|
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
2014-04-03 23:38:14 +04:00
|
|
|
# Written by Ulrich Germann on the basis of contrib/server/client.py.
|
|
|
|
# This script simulates post-editing of MT output and incrementally
|
|
|
|
# updates the dynamic phrase tables in the moses server.
|
2014-04-03 23:35:26 +04:00
|
|
|
|
2015-05-16 13:26:56 +03:00
|
|
|
import argparse
|
|
|
|
import os
|
|
|
|
import sys
|
|
|
|
import time
|
|
|
|
import xmlrpclib
|
2014-05-19 00:48:17 +04:00
|
|
|
import moses
|
2015-05-16 13:26:56 +03:00
|
|
|
from subprocess import (
|
|
|
|
PIPE,
|
|
|
|
Popen,
|
|
|
|
)
|
|
|
|
|
|
|
|
|
2014-05-19 00:48:17 +04:00
|
|
|
# Module-level handle to the moses server. The main section below either
# connects it to an external server or starts a local one, and translate()
# polls mserver.process to find out whether that server is still running.
mserver = moses.MosesServer()
|
2014-04-03 23:35:26 +04:00
|
|
|
|
2014-04-03 23:38:14 +04:00
|
|
|
# We must perform some custom argument processing, as moses parameter
# specifications do not comply with the conventions used in standard
# argument parsing packages. Arguments meant for this script are enclosed
# between an isolated "--[" and an isolated "--]"; arguments outside such
# a pair are treated as moses arguments (see split_args below).
|
2015-05-16 13:26:56 +03:00
|
|
|
|
|
|
|
|
2014-04-03 23:35:26 +04:00
|
|
|
def split_args(all_args):
    """
    Split argument list all_args into arguments specific to this script and
    arguments relating to the moses server.

    Arguments enclosed between an isolated "--[" and an isolated "--]" are
    script arguments; everything outside such a pair is a moses argument.
    A few moses options are intercepted and translated into script options:

      -i FILE / -input-file FILE  ->  -i FILE
      -inputtype 0                ->  dropped (plain text is the default)
      -n-best-list FILE SIZE      ->  --nbest SIZE --nbest-file FILE
      -n-best-distinct            ->  -u

    Returns the pair (my_args, mo_args).

    Raises Exception for -inputtype values other than "0" and for
    -lattice-samples (neither is supported yet), and IndexError if an
    intercepted option is not followed by the value(s) it requires.
    """
    my_args = []
    mo_args = []
    # Arguments go to the moses list until a "--[" switches the target.
    arglist = mo_args

    def values_after(pos, count):
        """Return the count values that follow the option at index pos."""
        found = all_args[pos + 1:pos + 1 + count]
        if len(found) < count:
            # Same exception type as the plain out-of-range lookup this
            # replaces, but with a message that names the culprit.
            raise IndexError(
                "FATAL ERROR: option %s expects %d value(s)."
                % (all_args[pos], count))
        return found

    # IMPORTANT: the code below must be coordinated with
    # - the evolution of moses command line arguments
    # - mert-moses.pl
    i = 0
    while i < len(all_args):
        arg = all_args[i]
        if arg == "--[":
            arglist = my_args
        elif arg == "--]":
            arglist = mo_args
        elif arg in ("-i", "-input-file"):
            (input_file,) = values_after(i, 1)
            my_args.extend(["-i", input_file])
            i += 1
        elif arg == "-inputtype":
            (input_type,) = values_after(i, 1)
            if input_type != "0":
                # Not yet supported! Therefore:
                errmsg = (
                    "FATAL ERROR: "
                    "%s only supports plain text input at this point."
                    % sys.argv[0])
                raise Exception(errmsg)
            # Plain text input needs no script option; skip the value.
            i += 1
        elif arg == "-lattice-samples":
            # This is not yet supported! Therefore:
            errmsg = (
                "FATAL ERROR: %s does not yet support lattice sampling."
                % sys.argv[0])
            raise Exception(errmsg)
        elif arg == "-n-best-list":
            # moses order is FILE then SIZE.
            nbest_file, nbest_size = values_after(i, 2)
            my_args.extend(["--nbest", nbest_size])
            my_args.extend(["--nbest-file", nbest_file])
            i += 2
        elif arg == "-n-best-distinct":
            my_args.extend(["-u"])
        else:
            arglist.append(arg)
        i += 1
    return my_args, mo_args
|
|
|
|
|
|
|
|
|
2014-04-03 23:35:26 +04:00
|
|
|
def _flag_value(text):
    """Convert the optional value of a boolean option to a bool.

    The empty string, "0", "false", "no" and "off" (in any letter case)
    mean False; every other value means True, as it did when these options
    were declared with type=bool.
    """
    return text.strip().lower() not in ("", "0", "false", "no", "off")


def interpret_args(my_args):
    """
    Parse script-specific argument list.

    Returns the argparse namespace with the attributes servercmd, url,
    port, input, ref, aln, output, debug, A, G, T, F, nbest, nbestFile
    and U. As usual for argparse, invalid arguments terminate the script.
    """
    aparser = argparse.ArgumentParser()

    # -G, -T and -u used to be declared with type=bool, which demands a
    # value and turns any non-empty string (including "False") into True.
    # split_args() emits a bare "-u", so these options must work without a
    # value; an explicit value is still accepted for backward compatibility.
    optional_bool = dict(
        nargs="?", const=True, default=False, type=_flag_value)

    # Server.
    aparser.add_argument(
        "-s", "--server-cmd", default="mosesserver", dest="servercmd",
        help="Path to moses server command.")
    aparser.add_argument(
        "--url", help="URL of external moses server.")
    aparser.add_argument(
        "-p", "--port", type=int, default=7447,
        help="Port number to be used for server.")

    # Input / output.
    aparser.add_argument(
        "-i", "--input", default='-', help="source file")
    aparser.add_argument(
        "-r", "--ref", default=None, help="Reference translation.")
    aparser.add_argument(
        "-a", "--aln", default=None, help="Alignment.")
    aparser.add_argument(
        "-o", "--output", default="-", help="Output file.")
    aparser.add_argument(
        "-d", "--debug", action='store_true', help="Debug mode.")

    # Moses reporting options.
    aparser.add_argument(
        "-A", "--with-alignment", dest="A", action='store_true',
        help="Include alignment in output.")
    aparser.add_argument(
        "-G", "--with-graph", dest="G",
        help="Include search graph info in output.", **optional_bool)
    aparser.add_argument(
        "-T", "--with-transopt", dest="T",
        help="Include translation options info in output.",
        **optional_bool)
    aparser.add_argument(
        "-F", "--report-all-factors", action="store_true", dest="F",
        help="Report all factors.")
    aparser.add_argument(
        "-n", "--nbest", type=int, dest="nbest", default=0,
        help="Size of nbest list.")
    # The default stays 0 (falsy) so that existing truth tests on
    # nbestFile keep working.
    aparser.add_argument(
        "-N", "--nbest-file", dest="nbestFile", default=0,
        help="Output file for nbest list.")
    aparser.add_argument(
        "-u", "--nbest-distinct", dest="U",
        help="Request distinct entries in the nbest list.",
        **optional_bool)

    return aparser.parse_args(my_args)
|
2015-05-16 13:26:56 +03:00
|
|
|
|
|
|
|
|
2014-05-19 00:48:17 +04:00
|
|
|
def _server_has_exited():
    """Report on stderr and return True if the local moses server exited."""
    # NOTE(review): presumably mserver.process is only set for a server
    # started by this script (not when connected via --url) -- confirm
    # against moses.MosesServer. A missing process counts as "running".
    process = getattr(mserver, 'process', None)
    if process is None:
        return False
    serverstatus = process.poll()
    if serverstatus is None:
        return False
    # NOTE(review): the code/signal split assumes a wait()-style status
    # word; confirm what mserver.process.poll() actually returns.
    print >>sys.stderr, (
        "Oopsidaisy, server exited with code %d (signal %d)"
        % (serverstatus / 256, serverstatus % 256))
    return True


def translate(proxy, args, line):
    """
    Send one line of text to the translation server and return its reply.

    proxy -- object whose translate(param) method performs the request.
    args  -- parsed script arguments; the attributes A, T, F, nbest and U
             select which extras are requested from the server.
    line  -- text to translate, as str or unicode; it is stripped first.

    Returns whatever proxy.translate() returns.

    Raises Exception if line is neither str nor unicode, or if the server
    could not be reached. An xmlrpclib.Error is reported on stderr and
    terminates the script with exit status 1.
    """
    if type(line) is unicode:
        param = {'text': line.strip().encode('utf8')}
    elif type(line) is str:
        param = {'text': line.strip()}
    else:
        raise Exception("Can't handle input")

    if args.A:
        param['align'] = True
    if args.T:
        param['topt'] = True
    if args.F:
        param['report-all-factors'] = True
    if args.nbest:
        param['nbest'] = int(args.nbest)
        param['add-score-breakdown'] = True
    if args.U:
        param['nbest-distinct'] = True

    attempts = 0
    while attempts < 20:
        t1 = time.time()
        try:
            return proxy.translate(param)
        except xmlrpclib.Error as e:
            # Covers xmlrpclib.Fault, ProtocolError and ResponseError.
            sys.stderr.flush()
            print >>sys.stderr, " XMLRPC error:", e
            print >>sys.stderr, "Input was"
            print >>sys.stderr, param
            sys.exit(1)
        except IOError as e:
            print >>sys.stderr, (
                "I/O error({0}): {1}".format(e.errno, e.strerror))
            # I/O errors are still retried without counting them as
            # attempts, but no longer once the server itself is gone.
            if _server_has_exited():
                break
            time.sleep(5)
        except Exception:
            # Was a bare "except:", which also swallowed KeyboardInterrupt
            # and SystemExit.
            if _server_has_exited():
                # Retrying a dead server used to loop forever here.
                break
            print >>sys.stderr, (
                "Connection failed after %f seconds" % (time.time() - t1))
            attempts += 1
            # Back off once the first ten attempts have failed.
            time.sleep(10 if attempts > 10 else 5)
    raise Exception("Exception: could not reach translation server.")
|
2015-05-16 13:26:56 +03:00
|
|
|
|
2014-04-03 23:35:26 +04:00
|
|
|
|
|
|
|
def read_data(fname):
    """
    Read and return data (source, target or alignment) from file fname.

    Files whose name ends in ".gz" are decompressed on the fly. The result
    is a list with one entry per line of the file, each stripped of leading
    and trailing whitespace. Blank lines are kept, so that the entries of
    the source, reference and alignment files stay aligned by index.

    Raises IOError if the file cannot be read.
    """
    if fname.endswith(".gz"):
        # The gzip module replaces the former external "zcat" call: it does
        # not need an extra program and reports unreadable input instead of
        # silently returning a single empty line.
        import gzip
        stream = gzip.open(fname)
    else:
        stream = open(fname)
    try:
        return [x.strip() for x in stream.readlines()]
    finally:
        # The file handle used to be left open.
        stream.close()
|
|
|
|
|
2015-05-16 13:26:56 +03:00
|
|
|
|
|
|
|
def repack_result(idx, result):
|
2014-04-03 23:35:26 +04:00
|
|
|
global args
|
|
|
|
if args.nbest:
|
|
|
|
for h in result['nbest']:
|
2015-05-16 13:26:56 +03:00
|
|
|
fields = [idx, h['hyp'], h['fvals'], h['totalScore']]
|
2014-05-20 00:41:32 +04:00
|
|
|
for i in xrange(len(fields)):
|
|
|
|
if type(fields[i]) is unicode:
|
|
|
|
fields[i] = fields[i].encode('utf-8')
|
|
|
|
pass
|
|
|
|
pass
|
2015-05-16 13:26:56 +03:00
|
|
|
# Print fields.
|
|
|
|
print >>NBestFile, "%d ||| %s ||| %s ||| %f" % tuple(fields)
|
2014-04-03 23:35:26 +04:00
|
|
|
pass
|
|
|
|
if 'align' in result:
|
|
|
|
t = result['text'].split()
|
|
|
|
span = ''
|
|
|
|
i = 0
|
|
|
|
k = 0
|
|
|
|
for a in result['align']:
|
|
|
|
k = a['tgt-start']
|
2015-05-16 13:26:56 +03:00
|
|
|
if k:
|
|
|
|
print " ".join(t[i:k]).encode('utf8'), span,
|
2014-04-03 23:35:26 +04:00
|
|
|
i = k
|
2015-05-16 13:26:56 +03:00
|
|
|
span = "|%d %d|" % (a['src-start'], a['src-end'])
|
|
|
|
print " ".join(t[k:]).encode('utf8'), span
|
2014-04-03 23:35:26 +04:00
|
|
|
else:
|
2014-05-19 00:48:17 +04:00
|
|
|
print result['text'].encode('utf8')
|
2015-05-16 13:26:56 +03:00
|
|
|
|
2014-04-03 23:35:26 +04:00
|
|
|
|
|
|
|
if __name__ == "__main__":
|
|
|
|
my_args, mo_args = split_args(sys.argv[1:])
|
2014-04-03 23:38:14 +04:00
|
|
|
|
2014-05-20 00:41:32 +04:00
|
|
|
# print "MY ARGS", my_args
|
|
|
|
# print "MO_ARGS", mo_args
|
|
|
|
|
2014-04-03 23:35:26 +04:00
|
|
|
global args
|
|
|
|
args = interpret_args(my_args)
|
2014-05-19 00:48:17 +04:00
|
|
|
|
2014-04-03 23:38:14 +04:00
|
|
|
if "-show-weights" in mo_args:
|
2015-05-16 13:26:56 +03:00
|
|
|
# This is for use during tuning, where moses is called to get a list
|
|
|
|
# of feature names.
|
|
|
|
devnull = open(os.devnull, "w")
|
|
|
|
mo = Popen(mserver.cmd + mo_args, stdout=PIPE, stderr=devnull)
|
2014-04-03 23:38:14 +04:00
|
|
|
print mo.communicate()[0].strip()
|
|
|
|
sys.exit(0)
|
|
|
|
pass
|
|
|
|
|
2014-04-03 23:35:26 +04:00
|
|
|
if args.nbest:
|
|
|
|
if args.nbestFile:
|
2015-05-16 13:26:56 +03:00
|
|
|
NBestFile = open(args.nbestFile, "w")
|
2014-04-03 23:35:26 +04:00
|
|
|
else:
|
|
|
|
NBestFile = sys.stdout
|
|
|
|
pass
|
|
|
|
pass
|
2014-05-19 00:48:17 +04:00
|
|
|
|
|
|
|
ref = None
|
2014-04-03 23:35:26 +04:00
|
|
|
aln = None
|
2015-05-16 13:26:56 +03:00
|
|
|
if args.ref:
|
|
|
|
ref = read_data(args.ref)
|
|
|
|
if args.aln:
|
|
|
|
aln = read_data(args.aln)
|
2014-04-03 23:35:26 +04:00
|
|
|
|
2014-07-09 05:41:28 +04:00
|
|
|
if ref and aln:
|
|
|
|
try:
|
|
|
|
mo_args.index("--serial")
|
|
|
|
except:
|
|
|
|
mo_args.append("--serial")
|
|
|
|
pass
|
|
|
|
pass
|
|
|
|
|
|
|
|
if args.url:
|
|
|
|
mserver.connect(args.url)
|
|
|
|
else:
|
|
|
|
mserver.start(args=mo_args, port=args.port, debug=args.debug)
|
|
|
|
pass
|
|
|
|
|
2014-04-03 23:35:26 +04:00
|
|
|
if (args.input == "-"):
|
|
|
|
line = sys.stdin.readline()
|
2014-05-20 00:41:32 +04:00
|
|
|
idx = 0
|
2014-04-03 23:35:26 +04:00
|
|
|
while line:
|
2015-05-16 13:26:56 +03:00
|
|
|
result = translate(mserver.proxy, args, line)
|
|
|
|
repack_result(idx, result)
|
2014-04-03 23:35:26 +04:00
|
|
|
line = sys.stdin.readline()
|
2014-05-20 00:41:32 +04:00
|
|
|
idx += 1
|
2014-04-03 23:35:26 +04:00
|
|
|
else:
|
2014-05-19 00:48:17 +04:00
|
|
|
src = read_data(args.input)
|
2014-04-03 23:35:26 +04:00
|
|
|
for i in xrange(len(src)):
|
2015-05-16 13:26:56 +03:00
|
|
|
result = translate(mserver.proxy, args, src[i])
|
|
|
|
repack_result(i, result)
|
2014-05-20 00:41:32 +04:00
|
|
|
if args.debug:
|
|
|
|
print >>sys.stderr, result['text'].encode('utf-8')
|
|
|
|
pass
|
2015-05-16 13:26:56 +03:00
|
|
|
if ref and aln:
|
|
|
|
result = mserver.proxy.updater({
|
|
|
|
'source': src[i],
|
|
|
|
'target': ref[i],
|
|
|
|
'alignment': aln[i],
|
|
|
|
})
|