mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-10-26 11:28:48 +03:00
Simulated post-editing merge: XML update, parallel SPE script, MERT
This commit is contained in:
parent
35c346378e
commit
e7c36ee804
@ -30,6 +30,9 @@
|
||||
#include "TargetPhrase.h"
|
||||
#include "ReorderingConstraint.h"
|
||||
#include "FactorCollection.h"
|
||||
#if PT_UG
|
||||
#include "TranslationModel/UG/mmsapt.h"
|
||||
#endif
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
@ -306,6 +309,38 @@ bool ProcessAndStripXMLTags(string &line, vector<XmlOption*> &res, ReorderingCon
|
||||
placeholders.push_back(std::pair<size_t, std::string>(startPos, entity));
|
||||
}
|
||||
|
||||
// update: add new aligned sentence pair to Mmsapt identified by name
|
||||
else if (tagName == "update") {
|
||||
#if PT_UG
|
||||
// get model name and aligned sentence pair
|
||||
string pdName = ParseXmlTagAttribute(tagContent,"name");
|
||||
string source = ParseXmlTagAttribute(tagContent,"source");
|
||||
string target = ParseXmlTagAttribute(tagContent,"target");
|
||||
string alignment = ParseXmlTagAttribute(tagContent,"alignment");
|
||||
// find PhraseDictionary by name
|
||||
const vector<PhraseDictionary*> &pds = PhraseDictionary::GetColl();
|
||||
PhraseDictionary* pd = NULL;
|
||||
for (vector<PhraseDictionary*>::const_iterator i = pds.begin(); i != pds.end(); ++i) {
|
||||
PhraseDictionary* curPd = *i;
|
||||
if (curPd->GetScoreProducerDescription() == pdName) {
|
||||
pd = curPd;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (pd == NULL) {
|
||||
TRACE_ERR("ERROR: No PhraseDictionary with name " << pdName << ", no update" << endl);
|
||||
return false;
|
||||
}
|
||||
// update model
|
||||
VERBOSE(1,"Updating " << pdName << " ||| " << source << " ||| " << target << " ||| " << alignment << endl);
|
||||
Mmsapt* pdsa = reinterpret_cast<Mmsapt*>(pd);
|
||||
pdsa->add(source, target, alignment);
|
||||
#else
|
||||
TRACE_ERR("ERROR: recompile with --with-mm to update PhraseDictionary at runtime" << endl);
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
|
||||
// default: opening tag that specifies translation options
|
||||
else {
|
||||
if (startPos > endPos) {
|
||||
@ -361,7 +396,7 @@ bool ProcessAndStripXMLTags(string &line, vector<XmlOption*> &res, ReorderingCon
|
||||
float scoreValue = FloorScore(TransformScore(probValue));
|
||||
|
||||
WordsRange range(startPos + offset,endPos-1 + offset); // span covered by phrase
|
||||
TargetPhrase targetPhrase(NULL);
|
||||
TargetPhrase targetPhrase;
|
||||
// targetPhrase.CreateFromString(Output, outputFactorOrder,altTexts[i],factorDelimiter, NULL);
|
||||
targetPhrase.CreateFromString(Output, outputFactorOrder,altTexts[i], NULL);
|
||||
|
||||
|
320
scripts/generic/moses_sim_pe.py
Executable file
320
scripts/generic/moses_sim_pe.py
Executable file
@ -0,0 +1,320 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# Written by Michael Denkowski
|
||||
|
||||
# This script parallelizes decoding with simulated post-editing via moses XML
|
||||
# input. Memory mapped dynamic phrase tables (Ulrich Germann, doc/Mmsapt.howto)
|
||||
# and language models (Kenneth Heafield, lm) allow separate moses processes to
|
||||
# share resources, facilitating memory efficient parallel decoding. Input is
|
||||
# divided into batches, each of which is decoded sequentially. Each batch pre-
|
||||
# loads the data from previous batches.
|
||||
|
||||
# To use in tuning, run mert-moses.pl with --spe-symal=SYMAL where SYMAL is the
|
||||
# alignment from input to references. Specify the number of jobs with
|
||||
# --decoder-flags="-threads N".
|
||||
|
||||
import gzip
|
||||
import itertools
|
||||
import math
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
import threading
|
||||
|
||||
# Example call from mert-moses.pl
|
||||
# moses [decoder flags] -config moses.ini -inputtype 0 -weight-overwrite '[text with spaces]' -n-best-list run1.best100.out 100 -input-file tune.src > run1.out
|
||||
|
||||
HELP = '''Moses with simulated post-editing
|
||||
|
||||
Usage: {} moses-cmd -config moses.ini -input-file text.src -ref text.tgt -symal text.src-tgt.symal [options] [decoder flags]
|
||||
|
||||
Options:
|
||||
-threads N: number of decoders to run in parallel (default read from moses.ini, 1 if not present)
|
||||
-n-best-list nbest.out N: location and size of N-best list
|
||||
-show-weights: for mert-moses.pl, just call moses and exit
|
||||
-tmp: location of temp directory (default /tmp)
|
||||
|
||||
Other options (decoder flags) are passed through to moses-cmd\n'''
|
||||
|
||||
# Provides progress bar
|
||||
class Progress:
    """Progress indicator written to stderr that is safe to share between threads.

    Writes one '.' for every 100 reported items and the running total after
    every 1000. All access to the counter is serialized through ``self.lock``.
    """

    def __init__(self):
        # Number of items reported so far
        self.i = 0
        self.lock = threading.Lock()

    def inc(self):
        """Record one finished item and update the display if a mark is reached."""
        # ``with`` guarantees the lock is released even if writing to stderr
        # raises; a bare acquire()/release() pair would leave the lock held and
        # block every other thread that reports progress.
        with self.lock:
            self.i += 1
            if self.i % 100 == 0:
                sys.stderr.write('.')
                if self.i % 1000 == 0:
                    sys.stderr.write(' [{}]\n'.format(self.i))
                sys.stderr.flush()

    def done(self):
        """Terminate the current line unless inc() has just ended it."""
        with self.lock:
            if self.i % 1000 != 0:
                sys.stderr.write('\n')
|
||||
|
||||
# Run with atomic (synchronous) I/O
|
||||
def atomic_io(cmd, in_file, out_file, err_file, prog=None):
    """Run ``cmd`` and feed it ``in_file`` one line at a time, in lock step.

    For every input line exactly one line is read back from the process and
    appended to ``out_file`` before the next line is sent, so the process only
    ever sees a line after it has answered the previous one.

    cmd: argument list passed to subprocess.Popen
    in_file: path of the input text, one request per line
    out_file: path receiving one line of process output per input line
    err_file: path receiving the process's stderr
    prog: optional object whose inc() is called once per processed line
    """
    # Lines are passed through as raw bytes: nothing here needs to interpret
    # the text, and it keeps the pipes independent of any locale/encoding.
    with open(in_file, 'rb') as inp, open(out_file, 'wb') as out, open(err_file, 'wb') as err:
        p = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=err)
        try:
            for line in iter(inp.readline, b''):
                p.stdin.write(line)
                # Without an explicit flush a buffered pipe would hold the line
                # back while we block on readline() below, i.e. deadlock.
                p.stdin.flush()
                out.write(p.stdout.readline())
                out.flush()
                if prog:
                    prog.inc()
        finally:
            # Always signal EOF and reap the child, also when the loop failed,
            # so no decoder process or pipe is left behind.
            try:
                p.stdin.close()
            finally:
                p.wait()
                p.stdout.close()
|
||||
|
||||
# Open plain or gzipped text
|
||||
def gzopen(f):
    """Return a readable handle for path ``f``.

    Paths ending in '.gz' are opened through gzip, everything else as a
    regular text file.
    """
    if not f.endswith('.gz'):
        return open(f, 'r')
    return gzip.open(f, 'rb')
|
||||
|
||||
# Line count (like `wc -l`)
|
||||
def wc(f):
    """Return the number of lines in ``f`` (plain or gzipped, see gzopen)."""
    handle = gzopen(f)
    try:
        return sum(1 for _ in handle)
    finally:
        # The handle used to be left for the garbage collector; close it
        # explicitly so repeated calls do not accumulate open files.
        handle.close()
|
||||
|
||||
# Write lines to gzipped file
|
||||
def write_gzfile(lines, f):
    """Write each item of ``lines`` to the gzipped file ``f``, one per line.

    lines: iterable of strings without trailing newline
    f: path of the gzip file to create (overwritten if it exists)
    """
    # try/finally closes the file (and thereby finishes the gzip stream) even
    # when a write fails part-way through.
    out = gzip.open(f, 'wb')
    try:
        for line in lines:
            data = '{}\n'.format(line)
            # The file is opened in binary mode; where str is a text type it
            # has to be encoded first. Where str already is bytes this is a
            # no-op and the output is unchanged.
            if not isinstance(data, bytes):
                data = data.encode('utf-8')
            out.write(data)
    finally:
        out.close()
|
||||
|
||||
def main(argv):
    """Decode a text with simulated post-editing using parallel moses processes.

    argv: full command line; argv[1] is the moses executable and the remaining
    items are the options described in HELP plus decoder flags that are passed
    through unchanged.

    The input is split into one batch per job. Every batch gets its own
    moses.ini and XML input file in a temporary work directory; batches after
    the first additionally get the sentence pairs of all earlier batches as
    "extra" data for the Mmsapt phrase table. Decoder output (and the N-best
    list, if requested) of all batches is concatenated in input order.

    Exits with status 2 on a malformed command line and 1 on invalid inputs.
    """

    # Defaults
    moses_ini = None
    moses_ini_lines = None
    text_src = None
    text_tgt = None
    text_symal = None
    text_len = None
    threads_found = False
    threads = 1
    n_best_out = None
    n_best_size = None
    tmp_dir = '/tmp'
    xml_found = False
    xml_input = 'exclusive'
    show_weights = False
    mmsapt_name = None
    mmsapt_l1 = None
    mmsapt_l2 = None

    # Decoder command
    cmd = argv[1:]

    # Parse special options and remove from cmd
    try:
        i = 1
        while i < len(cmd):
            if cmd[i] in ('-f', '-config'):
                moses_ini = cmd[i + 1]
                cmd = cmd[:i] + cmd[i + 2:]
            elif cmd[i] in ('-i', '-input-file'):
                text_src = cmd[i + 1]
                cmd = cmd[:i] + cmd[i + 2:]
            elif cmd[i] == '-ref':
                text_tgt = cmd[i + 1]
                cmd = cmd[:i] + cmd[i + 2:]
            elif cmd[i] == '-symal':
                text_symal = cmd[i + 1]
                cmd = cmd[:i] + cmd[i + 2:]
            elif cmd[i] in ('-th', '-threads'):
                threads_found = True
                threads = int(cmd[i + 1])
                cmd = cmd[:i] + cmd[i + 2:]
            elif cmd[i] == '-n-best-list':
                n_best_out = cmd[i + 1]
                n_best_size = cmd[i + 2]
                cmd = cmd[:i] + cmd[i + 3:]
            elif cmd[i] == '-tmp':
                tmp_dir = cmd[i + 1]
                cmd = cmd[:i] + cmd[i + 2:]
            # Handled specially to make sure XML input is turned on somewhere
            elif cmd[i] in ('-xi', '-xml-input'):
                xml_found = True
                xml_input = cmd[i + 1]
                cmd = cmd[:i] + cmd[i + 2:]
            # Handled specially for mert-moses.pl
            elif cmd[i] == '-show-weights':
                show_weights = True
                # Do not remove from cmd
                i += 1
            else:
                i += 1
    except IndexError:
        # An option at the end of the command line is missing its value(s)
        sys.stderr.write(HELP.format(argv[0]))
        sys.exit(2)

    # Read moses.ini
    # Indices of the Mmsapt feature lines, which get the per-job "extra" suffix
    mmsapt_line_ids = set()
    if moses_ini:
        with open(moses_ini, 'r') as moses_ini_in:
            moses_ini_lines = [line.strip() for line in moses_ini_in]
        i = 0
        while i < len(moses_ini_lines):
            # PhraseDictionaryBitextSampling name=TranslationModel0 output-factor=0 num-features=7 path=corpus. L1=src L2=tgt pfwd=g pbwd=g smooth=0 sample=1000 workers=1
            if moses_ini_lines[i].startswith('PhraseDictionaryBitextSampling'):
                for pair in moses_ini_lines[i].split()[1:]:
                    # partition() tolerates tokens without '=' and values that
                    # themselves contain '='
                    (k, _, v) = pair.partition('=')
                    if k == 'name':
                        mmsapt_name = v
                    elif k == 'L1':
                        mmsapt_l1 = v
                    elif k == 'L2':
                        mmsapt_l2 = v
                mmsapt_line_ids.add(i)
            # [threads]
            # 8
            elif moses_ini_lines[i] == '[threads]':
                # Prefer command line over moses.ini
                if not threads_found:
                    threads = int(moses_ini_lines[i + 1])
                i += 1
            # [xml-input]
            # exclusive
            elif moses_ini_lines[i] == '[xml-input]':
                # Prefer command line over moses.ini
                if not xml_found:
                    xml_found = True
                    xml_input = moses_ini_lines[i + 1]
                i += 1
            i += 1

    # If mert-moses.pl passes -show-weights, just call moses
    if show_weights:
        # re-append original moses.ini
        cmd.append('-config')
        cmd.append(moses_ini)
        sys.stdout.write(subprocess.check_output(cmd))
        sys.stdout.flush()
        sys.exit(0)

    # Input length
    if text_src:
        text_len = wc(text_src)

    # Check inputs
    if not (len(cmd) > 0 and all((moses_ini, text_src, text_tgt, text_symal))):
        sys.stderr.write(HELP.format(argv[0]))
        sys.exit(2)
    if not (os.path.isfile(cmd[0]) and os.access(cmd[0], os.X_OK)):
        sys.stderr.write('Error: moses-cmd "{}" is not executable\n'.format(cmd[0]))
        sys.exit(1)
    if not mmsapt_name:
        sys.stderr.write('Error: no PhraseDictionaryBitextSampling found in {}. See http://www.statmt.org/moses/?n=Moses.AdvancedFeatures#ntoc40\n'.format(moses_ini))
        sys.exit(1)
    if wc(text_tgt) != text_len or wc(text_symal) != text_len:
        sys.stderr.write('Error: length mismatch between "{}", "{}", and "{}"\n'.format(text_src, text_tgt, text_symal))
        sys.exit(1)

    # Setup
    work_dir = tempfile.mkdtemp(prefix='moses.', dir=os.path.abspath(tmp_dir))
    # At least 1 so that empty input does not lead to a modulo by zero
    batch_size = max(1, int(math.ceil(float(text_len) / threads)))
    # Number of batches that actually exist. This can be smaller than the
    # requested number of threads (e.g. 3 lines with 8 threads), so everything
    # below iterates over jobs, not threads.
    jobs = (text_len + batch_size - 1) // batch_size

    # Report settings
    sys.stderr.write('Moses flags: {}\n'.format(' '.join('\'{}\''.format(s) if ' ' in s else s for s in cmd[1:])))
    sys.stderr.write('Mmsapt: {} {} {}\n'.format(mmsapt_name, mmsapt_l1, mmsapt_l2))
    sys.stderr.write('XML mode: {}\n'.format(xml_input))
    sys.stderr.write('Inputs: {} {} {} ({})\n'.format(text_src, text_tgt, text_symal, text_len))
    sys.stderr.write('Jobs: {}\n'.format(jobs))
    sys.stderr.write('Batch size: {}\n'.format(batch_size))
    if n_best_out:
        sys.stderr.write('N-best list: {} ({})\n'.format(n_best_out, n_best_size))
    sys.stderr.write('Temp dir: {}\n'.format(work_dir))

    # Accumulate seen lines
    src_lines = []
    tgt_lines = []
    symal_lines = []

    # Current XML source file
    xml_out = None

    # Lazy zip where available (itertools.izip), built-in zip otherwise
    izip = getattr(itertools, 'izip', zip)

    # Split into batches. Each batch after 0 gets extra files with data from previous batches.
    # Data from previous lines in the current batch is added using XML input.
    job = -1
    lc = -1
    for (src, tgt, symal) in izip(gzopen(text_src), gzopen(text_tgt), gzopen(text_symal)):
        (src, tgt, symal) = (src.strip(), tgt.strip(), symal.strip())
        lc += 1
        if lc % batch_size == 0:
            job += 1
            xml_file = os.path.join(work_dir, 'input.{}.xml'.format(job))
            extra_src_file = os.path.join(work_dir, 'extra.{}.{}.txt.gz'.format(job, mmsapt_l1))
            extra_tgt_file = os.path.join(work_dir, 'extra.{}.{}.txt.gz'.format(job, mmsapt_l2))
            extra_symal_file = os.path.join(work_dir, 'extra.{}.{}-{}.symal.gz'.format(job, mmsapt_l1, mmsapt_l2))
            if job > 0:
                xml_out.close()
                write_gzfile(src_lines, extra_src_file)
                write_gzfile(tgt_lines, extra_tgt_file)
                write_gzfile(symal_lines, extra_symal_file)
            xml_out = open(xml_file, 'w')
            with open(os.path.join(work_dir, 'moses.{}.ini'.format(job)), 'w') as moses_ini_out:
                extra = '' if job == 0 else ' extra={}'.format(os.path.join(work_dir, 'extra.{}.'.format(job)))
                # Append the suffix to the Mmsapt lines only. Running
                # str.format over the whole file would fail on (or mangle) any
                # literal '{' or '}' elsewhere in moses.ini.
                for (n, ini_line) in enumerate(moses_ini_lines):
                    if n in mmsapt_line_ids:
                        ini_line += extra
                    moses_ini_out.write('{}\n'.format(ini_line))
        src_lines.append(src)
        tgt_lines.append(tgt)
        symal_lines.append(symal)
        # Lines after first start with update tag including previous translation.
        # Translation of last line of each batch is included in extra for next batch.
        xml_out.write('{}{}\n'.format('' if lc % batch_size == 0 else '<update name="{}" source="{}" target="{}" alignment="{}" /> '.format(mmsapt_name, src_lines[-2], tgt_lines[-2], symal_lines[-2]), src))
    # No file was opened if the input is empty
    if xml_out is not None:
        xml_out.close()

    # Run decoders in parallel
    workers = []
    prog = Progress()
    for i in range(jobs):
        work_cmd = cmd[:]
        work_cmd.append('-config')
        work_cmd.append(os.path.join(work_dir, 'moses.{}.ini'.format(i)))
        # Workers use 1 CPU each
        work_cmd.append('-threads')
        work_cmd.append('1')
        # Always pass the effective XML mode. A -xml-input given on the command
        # line was removed from cmd above and would otherwise be lost; the
        # update tags written above depend on XML input being enabled.
        work_cmd.append('-xml-input')
        work_cmd.append(xml_input)
        if n_best_out:
            work_cmd.append('-n-best-list')
            work_cmd.append(os.path.join(work_dir, 'nbest.{}'.format(i)))
            work_cmd.append(str(n_best_size))
        in_file = os.path.join(work_dir, 'input.{}.xml'.format(i))
        out_file = os.path.join(work_dir, 'out.{}'.format(i))
        err_file = os.path.join(work_dir, 'err.{}'.format(i))
        t = threading.Thread(target=atomic_io, args=(work_cmd, in_file, out_file, err_file, prog))
        workers.append(t)
        t.start()
    # Wait for all to finish
    for t in workers:
        t.join()
    prog.done()

    # Gather N-best lists
    if n_best_out:
        with open(n_best_out, 'w') as out:
            for i in range(jobs):
                with open(os.path.join(work_dir, 'nbest.{}'.format(i)), 'r') as nbest_in:
                    for line in nbest_in:
                        # Sentence ids restart at 0 in every batch; shift them
                        # back to positions in the full input
                        entry = line.partition(' ')
                        out.write('{} {}'.format(int(entry[0]) + (i * batch_size), entry[2]))

    # Gather stdout
    for i in range(jobs):
        with open(os.path.join(work_dir, 'out.{}'.format(i)), 'r') as job_out:
            for line in job_out:
                sys.stdout.write(line)

    # Cleanup
    shutil.rmtree(work_dir)
|
||||
|
||||
# Script entry point: hand the unmodified command line to main() when this file
# is executed directly (nothing runs on import).
if __name__ == '__main__':
    main(sys.argv)
|
@ -160,6 +160,12 @@ my $prev_aggregate_nbl_size = -1; # number of previous step to consider when loa
|
||||
# and so on
|
||||
my $maximum_iterations = 25;
|
||||
|
||||
# Simulated post-editing
|
||||
my $___MOSES_SIM_PE = "$SCRIPTS_ROOTDIR/generic/moses_sim_pe.py";
|
||||
my $___DEV_SYMAL = undef;
|
||||
my $dev_symal_abs = undef;
|
||||
my $working_dir_abs = undef;
|
||||
|
||||
use Getopt::Long;
|
||||
GetOptions(
|
||||
"working-dir=s" => \$___WORKING_DIR,
|
||||
@ -213,7 +219,8 @@ GetOptions(
|
||||
"batch-mira-args=s" => \$batch_mira_args,
|
||||
"promix-training=s" => \$__PROMIX_TRAINING,
|
||||
"promix-table=s" => \@__PROMIX_TABLES,
|
||||
"threads=i" => \$__THREADS
|
||||
"threads=i" => \$__THREADS,
|
||||
"spe-symal=s" => \$___DEV_SYMAL
|
||||
) or exit(1);
|
||||
|
||||
# the 4 required parameters can be supplied on the command line directly
|
||||
@ -308,6 +315,8 @@ Options:
|
||||
--threads=NUMBER ... Use multi-threaded mert (must be compiled in).
|
||||
--historic-interpolation ... Interpolate optimized weights with prior iterations' weight
|
||||
(parameter sets factor [0;1] given to current weights)
|
||||
--spe-symal=SYMAL ... Use simulated post-editing when decoding.
|
||||
(SYMAL aligns input to refs)
|
||||
";
|
||||
exit 1;
|
||||
}
|
||||
@ -467,6 +476,12 @@ if ($___DECODER_FLAGS =~ /(^|\s)-(config|f) /
|
||||
die "It is forbidden to supply any of -config, -ttable-file, -distortion-file, -generation-file or -lmodel-file in the --decoder-flags.\nPlease use only the --config option to give the config file that lists all the supplementary files.";
|
||||
}
|
||||
|
||||
# Paths needed for simulated post-editing
|
||||
if ($___DEV_SYMAL) {
|
||||
$dev_symal_abs = ensure_full_path($___DEV_SYMAL);
|
||||
$working_dir_abs = ensure_full_path($___WORKING_DIR);
|
||||
}
|
||||
|
||||
# as weights are normalized in the next steps (by cmert)
|
||||
# normalize initial LAMBDAs, too
|
||||
my $need_to_normalize = 1;
|
||||
@ -1235,7 +1250,14 @@ sub run_decoder {
|
||||
safesystem("rm -rf $hypergraph_dir");
|
||||
$nbest_list_cmd = "-output-search-graph-hypergraph true gz";
|
||||
}
|
||||
$decoder_cmd = "$___DECODER $___DECODER_FLAGS -config $___CONFIG -inputtype $___INPUTTYPE $decoder_config $lsamp_cmd $nbest_list_cmd -input-file $___DEV_F > run$run.out";
|
||||
# If simulating post-editing, route command through moses_sim_pe.py
|
||||
if (defined $___DEV_SYMAL) {
|
||||
# Always use single (first) reference. Simulated post-editing undefined for multiple references.
|
||||
$decoder_cmd = "$___MOSES_SIM_PE $___DECODER $___DECODER_FLAGS -config $___CONFIG -inputtype $___INPUTTYPE $decoder_config $lsamp_cmd $nbest_list_cmd -input-file $___DEV_F -ref $references[0] -symal $dev_symal_abs -tmp $working_dir_abs > run$run.out";
|
||||
} else {
|
||||
# Default: call decoder directly
|
||||
$decoder_cmd = "$___DECODER $___DECODER_FLAGS -config $___CONFIG -inputtype $___INPUTTYPE $decoder_config $lsamp_cmd $nbest_list_cmd -input-file $___DEV_F > run$run.out";
|
||||
}
|
||||
}
|
||||
|
||||
print STDERR "Executing: $decoder_cmd \n";
|
||||
|
Loading…
Reference in New Issue
Block a user