Simulated post-editing merge: XML update, parallel SPE script, MERT

2024-10-26 11:28:48 +03:00 · 2014-08-05 14:20:00 -04:00 · 2014-08-05 14:20:00 -04:00 · e7c36ee804
commit e7c36ee804
parent 35c346378e
3 changed files with 380 additions and 3 deletions
--- a/moses/XmlOption.cpp
+++ b/moses/XmlOption.cpp
@ -30,6 +30,9 @@
 #include "TargetPhrase.h"
 #include "ReorderingConstraint.h"
 #include "FactorCollection.h"
+#if PT_UG
+#include "TranslationModel/UG/mmsapt.h"
+#endif

 namespace Moses
 {
@ -306,6 +309,38 @@ bool ProcessAndStripXMLTags(string &line, vector<XmlOption*> &res, ReorderingCon
          placeholders.push_back(std::pair<size_t, std::string>(startPos, entity));
        }

+        // update: add new aligned sentence pair to Mmsapt identified by name
+        else if (tagName == "update") {
+#if PT_UG
+            // get model name and aligned sentence pair
+            string pdName = ParseXmlTagAttribute(tagContent,"name");
+            string source = ParseXmlTagAttribute(tagContent,"source");
+            string target = ParseXmlTagAttribute(tagContent,"target");
+            string alignment = ParseXmlTagAttribute(tagContent,"alignment");
+            // find PhraseDictionary by name
+            const vector<PhraseDictionary*> &pds = PhraseDictionary::GetColl();
+            PhraseDictionary* pd = NULL;
+            for (vector<PhraseDictionary*>::const_iterator i = pds.begin(); i != pds.end(); ++i) {
+                PhraseDictionary* curPd = *i;
+                if (curPd->GetScoreProducerDescription() == pdName) {
+                    pd = curPd;
+                    break;
+                }
+            }
+            if (pd == NULL) {
+                TRACE_ERR("ERROR: No PhraseDictionary with name " << pdName << ", no update" << endl);
+                return false;
+            }
+            // update model
+            VERBOSE(1,"Updating " << pdName << " ||| " << source << " ||| " << target << " ||| " << alignment << endl);
+            Mmsapt* pdsa = reinterpret_cast<Mmsapt*>(pd);
+            pdsa->add(source, target, alignment);
+#else
+            TRACE_ERR("ERROR: recompile with --with-mm to update PhraseDictionary at runtime" << endl);
+            return false;
+#endif
+        }
+
        // default: opening tag that specifies translation options
        else {
          if (startPos > endPos) {
@ -361,7 +396,7 @@ bool ProcessAndStripXMLTags(string &line, vector<XmlOption*> &res, ReorderingCon
              float scoreValue = FloorScore(TransformScore(probValue));

              WordsRange range(startPos + offset,endPos-1 + offset); // span covered by phrase
-              TargetPhrase targetPhrase(NULL);
+              TargetPhrase targetPhrase;
              // targetPhrase.CreateFromString(Output, outputFactorOrder,altTexts[i],factorDelimiter, NULL);
              targetPhrase.CreateFromString(Output, outputFactorOrder,altTexts[i], NULL);

--- a/scripts/generic/moses_sim_pe.py
+++ b/scripts/generic/moses_sim_pe.py
@ -0,0 +1,320 @@
+#!/usr/bin/env python
+
+# Written by Michael Denkowski
+
+# This script parallelizes decoding with simulated post-editing via moses XML
+# input.  Memory mapped dynamic phrase tables (Ulrich Germann, doc/Mmsapt.howto)
+# and language models (Kenneth Heafield, lm) allow separate moses processes to
+# share resources, facilitating memory efficient parallel decoding.  Input is
+# divided into batches, each of which is decoded sequentially.  Each batch
+# preloads the data from previous batches.
+
+# To use in tuning, run mert-moses.pl with --spe-symal=SYMAL where SYMAL is the
+# alignment from input to references.  Specify the number of jobs with
+# --decoder-flags="-threads N".
+
+import gzip
+import itertools
+import math
+import os
+import shutil
+import subprocess
+import sys
+import tempfile
+import threading
+
+# Example call from mert-moses.pl
+# moses [decoder flags] -config moses.ini -inputtype 0 -weight-overwrite '[text with spaces]' -n-best-list run1.best100.out 100 -input-file tune.src > run1.out
+
+# Usage message written to stderr by main() when the moses command, config,
+# input, reference or alignment argument is missing.  The single '{}'
+# placeholder is filled with the script name (argv[0]).
+HELP = '''Moses with simulated post-editing
+
+Usage: {} moses-cmd -config moses.ini -input-file text.src -ref text.tgt -symal text.src-tgt.symal [options] [decoder flags]
+
+Options:
+    -threads N: number of decoders to run in parallel (default read from moses.ini, 1 if not present)
+    -n-best-list nbest.out N: location and size of N-best list
+    -show-weights: for mert-moses.pl, just call moses and exit
+    -tmp: location of temp directory (default /tmp)
+
+Other options (decoder flags) are passed through to moses-cmd\n'''
+
+# Provides progress bar
+class Progress:
+
+    def __init__(self):
+        self.i = 0
+        self.lock = threading.Lock()
+
+    def inc(self):
+        self.lock.acquire()
+        self.i += 1
+        if self.i % 100 == 0:
+            sys.stderr.write('.')
+            if self.i % 1000 == 0:
+                sys.stderr.write(' [{}]\n'.format(self.i))
+            sys.stderr.flush()
+        self.lock.release()
+
+    def done(self):
+        self.lock.acquire()
+        if self.i % 1000 != 0:
+            sys.stderr.write('\n')
+        self.lock.release()
+
+# Run with atomic (synchronous) I/O
+def atomic_io(cmd, in_file, out_file, err_file, prog=None):
+    with open(in_file, 'r') as inp, open(out_file, 'w') as out, open(err_file, 'w') as err:
+        p = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=err)
+        while True:
+            line = inp.readline()
+            if not line:
+                break
+            p.stdin.write(line)
+            out.write(p.stdout.readline())
+            out.flush()
+            if prog:
+                prog.inc()
+        p.stdin.close()
+        p.wait()
+
+# Open plain or gzipped text
+def gzopen(f):
+    if f.endswith('.gz'):
+        return gzip.open(f, 'rb')
+    return open(f, 'r')
+
+# Word count
+def wc(f):
+    i = 0
+    for line in gzopen(f):
+        i += 1
+    return i
+
+# Write lines to gzipped file
+def write_gzfile(lines, f):
+    out = gzip.open(f, 'wb')
+    for line in lines:
+        out.write('{}\n'.format(line))
+    out.close()
+
+def main(argv):
+
+    # Defaults
+    moses_ini = None
+    moses_ini_lines = None
+    text_src = None
+    text_tgt = None
+    text_symal = None
+    text_len = None
+    threads_found = False
+    threads = 1
+    n_best_out = None
+    n_best_size = None
+    tmp_dir = '/tmp'
+    xml_found = False
+    xml_input = 'exclusive'
+    show_weights = False
+    mmsapt_name = None
+    mmsapt_l1 = None
+    mmsapt_l2 = None
+
+    # Decoder command
+    cmd = argv[1:]
+
+    # Parse special options and remove from cmd
+    i = 1
+    while i < len(cmd):
+        if cmd[i] in ('-f', '-config'):
+            moses_ini = cmd[i + 1]
+            cmd = cmd[:i] + cmd[i + 2:]
+        elif cmd[i] in ('-i', '-input-file'):
+            text_src = cmd[i + 1]
+            cmd = cmd[:i] + cmd[i + 2:]
+        elif cmd[i] == '-ref':
+            text_tgt = cmd[i + 1]
+            cmd = cmd[:i] + cmd[i + 2:]
+        elif cmd[i] == '-symal':
+            text_symal = cmd[i + 1]
+            cmd = cmd[:i] + cmd[i + 2:]
+        elif cmd[i] in ('-th', '-threads'):
+            threads_found = True
+            threads = int(cmd[i + 1])
+            cmd = cmd[:i] + cmd[i + 2:]
+        elif cmd[i] == '-n-best-list':
+            n_best_out = cmd[i + 1]
+            n_best_size = cmd[i + 2]
+            cmd = cmd[:i] + cmd[i + 3:]
+        elif cmd[i] == '-tmp':
+            tmp_dir = cmd[i + 1]
+            cmd = cmd[:i] + cmd[i + 2:]
+        # Handled specially to make sure XML input is turned on somewhere
+        elif cmd[i] in ('-xi', '-xml-input'):
+            xml_found = True
+            xml_input = cmd[i + 1]
+            cmd = cmd[:i] + cmd[i + 2:]
+        # Handled specially for mert-moses.pl
+        elif cmd[i] == '-show-weights':
+            show_weights = True
+            # Do not remove from cmd
+            i += 1
+        else:
+            i += 1
+
+    # Read moses.ini
+    if moses_ini:
+        moses_ini_lines = [line.strip() for line in open(moses_ini, 'r')]
+        i = 0
+        while i < len(moses_ini_lines):
+            # PhraseDictionaryBitextSampling name=TranslationModel0 output-factor=0 num-features=7 path=corpus. L1=src L2=tgt pfwd=g pbwd=g smooth=0 sample=1000 workers=1
+            if moses_ini_lines[i].startswith('PhraseDictionaryBitextSampling'):
+                for (k, v) in (pair.split('=') for pair in moses_ini_lines[i].split()[1:]):
+                    if k == 'name':
+                        mmsapt_name = v
+                    elif k == 'L1':
+                        mmsapt_l1 = v
+                    elif k == 'L2':
+                        mmsapt_l2 = v
+                moses_ini_lines[i] += '{mmsapt_extra}'
+            # [threads]
+            # 8
+            elif moses_ini_lines[i] == '[threads]':
+                # Prefer command line over moses.ini
+                if not threads_found:
+                    threads = int(moses_ini_lines[i + 1])
+                i += 1
+            # [xml-input]
+            # exclusive
+            elif moses_ini_lines[i] == '[xml-input]':
+                # Prefer command line over moses.ini
+                if not xml_found:
+                    xml_found = True
+                    xml_input = moses_ini_lines[i + 1]
+                i += 1
+            i += 1
+
+    # If mert-moses.pl passes -show-weights, just call moses
+    if show_weights:
+        # re-append original moses.ini
+        cmd.append('-config')
+        cmd.append(moses_ini)
+        sys.stdout.write(subprocess.check_output(cmd))
+        sys.stdout.flush()
+        sys.exit(0)
+
+    # Input length
+    if text_src:
+        text_len = wc(text_src)
+
+    # Check inputs
+    if not (len(cmd) > 0 and all((moses_ini, text_src, text_tgt, text_symal))):
+        sys.stderr.write(HELP.format(argv[0]))
+        sys.exit(2)
+    if not (os.path.isfile(cmd[0]) and os.access(cmd[0], os.X_OK)):
+        sys.stderr.write('Error: moses-cmd "{}" is not executable\n'.format(cmd[0]))
+        sys.exit(1)
+    if not mmsapt_name:
+        sys.stderr.write('Error: no PhraseDictionaryBitextSampling found in {}.  See http://www.statmt.org/moses/?n=Moses.AdvancedFeatures#ntoc40\n'.format(moses_ini))
+        sys.exit(1)
+    if wc(text_tgt) != text_len or wc(text_symal) != text_len:
+        sys.stderr.write('Error: length mismatch between "{}", "{}", and "{}"\n'.format(text_src, text_tgt, text_symal))
+        sys.exit(1)
+    
+    # Setup
+    work_dir = tempfile.mkdtemp(prefix='moses.', dir=os.path.abspath(tmp_dir))
+    batch_size = int(math.ceil(float(text_len) / threads))
+
+    # Report settings
+    sys.stderr.write('Moses flags: {}\n'.format(' '.join('\'{}\''.format(s) if ' ' in s else s for s in cmd[1:])))
+    sys.stderr.write('Mmsapt: {} {} {}\n'.format(mmsapt_name, mmsapt_l1, mmsapt_l2))
+    sys.stderr.write('XML mode: {}\n'.format(xml_input))
+    sys.stderr.write('Inputs: {} {} {} ({})\n'.format(text_src, text_tgt, text_symal, text_len))
+    sys.stderr.write('Jobs: {}\n'.format(threads))
+    sys.stderr.write('Batch size: {}\n'.format(batch_size))
+    if n_best_out:
+        sys.stderr.write('N-best list: {} ({})\n'.format(n_best_out, n_best_size))
+    sys.stderr.write('Temp dir: {}\n'.format(work_dir))
+
+    # Accumulate seen lines
+    src_lines = []
+    tgt_lines = []
+    symal_lines = []
+
+    # Current XML source file
+    xml_out = None
+
+    # Split into batches.  Each batch after 0 gets extra files with data from previous batches.
+    # Data from previous lines in the current batch is added using XML input.
+    job = -1
+    lc = -1
+    for (src, tgt, symal) in itertools.izip(gzopen(text_src), gzopen(text_tgt), gzopen(text_symal)):
+        (src, tgt, symal) = (src.strip(), tgt.strip(), symal.strip())
+        lc += 1
+        if lc % batch_size == 0:
+            job += 1
+            xml_file = os.path.join(work_dir, 'input.{}.xml'.format(job))
+            extra_src_file = os.path.join(work_dir, 'extra.{}.{}.txt.gz'.format(job, mmsapt_l1))
+            extra_tgt_file = os.path.join(work_dir, 'extra.{}.{}.txt.gz'.format(job, mmsapt_l2))
+            extra_symal_file = os.path.join(work_dir, 'extra.{}.{}-{}.symal.gz'.format(job, mmsapt_l1, mmsapt_l2))
+            if job > 0:
+                xml_out.close()
+                write_gzfile(src_lines, extra_src_file)
+                write_gzfile(tgt_lines, extra_tgt_file)
+                write_gzfile(symal_lines, extra_symal_file)
+            xml_out = open(xml_file, 'w')
+            with open(os.path.join(work_dir, 'moses.{}.ini'.format(job)), 'w') as moses_ini_out:
+                extra = '' if job == 0 else ' extra={}'.format(os.path.join(work_dir, 'extra.{}.'.format(job)))
+                moses_ini_out.write('{}\n'.format('\n'.join(moses_ini_lines).format(mmsapt_extra=extra)))
+        src_lines.append(src)
+        tgt_lines.append(tgt)
+        symal_lines.append(symal)
+        # Lines after first start with update tag including previous translation.
+        # Translation of last line of each batch is included in extra for next batch.
+        xml_out.write('{}{}\n'.format('' if lc % batch_size == 0 else '<update name="{}" source="{}" target="{}" alignment="{}" /> '.format(mmsapt_name, src_lines[-2], tgt_lines[-2], symal_lines[-2]), src))
+    xml_out.close()
+
+    # Run decoders in parallel
+    workers = []
+    prog = Progress()
+    for i in range(threads):
+        work_cmd = cmd[:]
+        work_cmd.append('-config')
+        work_cmd.append(os.path.join(work_dir, 'moses.{}.ini'.format(i)))
+        # Workers use 1 CPU each
+        work_cmd.append('-threads')
+        work_cmd.append('1')
+        if not xml_found:
+            work_cmd.append('-xml-input')
+            work_cmd.append(xml_input)
+        if n_best_out:
+            work_cmd.append('-n-best-list')
+            work_cmd.append(os.path.join(work_dir, 'nbest.{}'.format(i)))
+            work_cmd.append(str(n_best_size))
+        in_file = os.path.join(work_dir, 'input.{}.xml'.format(i))
+        out_file = os.path.join(work_dir, 'out.{}'.format(i))
+        err_file = os.path.join(work_dir, 'err.{}'.format(i))
+        t = threading.Thread(target=atomic_io, args=(work_cmd, in_file, out_file, err_file, prog))
+        workers.append(t)
+        t.start()
+    # Wait for all to finish
+    for t in workers:
+        t.join()
+    prog.done()
+
+    # Gather N-best lists
+    if n_best_out:
+        with open(n_best_out, 'w') as out:
+            for i in range(threads):
+                for line in open(os.path.join(work_dir, 'nbest.{}'.format(i)), 'r'):
+                    entry = line.partition(' ')
+                    out.write('{} {}'.format(int(entry[0]) + (i * batch_size), entry[2]))
+
+    # Gather stdout
+    for i in range(threads):
+        for line in open(os.path.join(work_dir, 'out.{}'.format(i)), 'r'):
+            sys.stdout.write(line)
+
+    # Cleanup
+    shutil.rmtree(work_dir)
+
+if __name__ == '__main__':
+    main(sys.argv)
--- a/scripts/training/mert-moses.pl
+++ b/scripts/training/mert-moses.pl
@ -160,6 +160,12 @@ my $prev_aggregate_nbl_size = -1; # number of previous step to consider when loa
                                  # and so on
 my $maximum_iterations = 25;

+# Simulated post-editing
+my $___MOSES_SIM_PE = "$SCRIPTS_ROOTDIR/generic/moses_sim_pe.py";
+my $___DEV_SYMAL = undef;
+my $dev_symal_abs = undef;
+my $working_dir_abs = undef;
+
 use Getopt::Long;
 GetOptions(
  "working-dir=s" => \$___WORKING_DIR,
@ -213,7 +219,8 @@ GetOptions(
  "batch-mira-args=s" => \$batch_mira_args,
  "promix-training=s" => \$__PROMIX_TRAINING,
  "promix-table=s" => \@__PROMIX_TABLES,
-  "threads=i" => \$__THREADS
+  "threads=i" => \$__THREADS,
+  "spe-symal=s" => \$___DEV_SYMAL
 ) or exit(1);

 # the 4 required parameters can be supplied on the command line directly
@ -308,6 +315,8 @@ Options:
  --threads=NUMBER          ... Use multi-threaded mert (must be compiled in).
  --historic-interpolation  ... Interpolate optimized weights with prior iterations' weight
                                (parameter sets factor [0;1] given to current weights)
+  --spe-symal=SYMAL      ... Use simulated post-editing when decoding.
+                             (SYMAL aligns input to refs)
 ";
  exit 1;
 }
@ -467,6 +476,12 @@ if ($___DECODER_FLAGS =~ /(^|\s)-(config|f) /
  die "It is forbidden to supply any of -config, -ttable-file, -distortion-file, -generation-file or -lmodel-file in the --decoder-flags.\nPlease use only the --config option to give the config file that lists all the supplementary files.";
 }

+# Paths needed for simulated post-editing
+if ($___DEV_SYMAL) {
+   $dev_symal_abs = ensure_full_path($___DEV_SYMAL);
+   $working_dir_abs = ensure_full_path($___WORKING_DIR);
+}
+
 # as weights are normalized in the next steps (by cmert)
 # normalize initial LAMBDAs, too
 my $need_to_normalize = 1;
@ -1235,7 +1250,14 @@ sub run_decoder {
        safesystem("rm -rf $hypergraph_dir");
        $nbest_list_cmd = "-output-search-graph-hypergraph true gz";
      }
-      $decoder_cmd = "$___DECODER $___DECODER_FLAGS  -config $___CONFIG -inputtype $___INPUTTYPE $decoder_config $lsamp_cmd $nbest_list_cmd  -input-file $___DEV_F > run$run.out";
+      # If simulating post-editing, route command through moses_sim_pe.py
+      if (defined $___DEV_SYMAL) {
+        # Always use single (first) reference.  Simulated post-editing undefined for multiple references.
+        $decoder_cmd = "$___MOSES_SIM_PE $___DECODER $___DECODER_FLAGS  -config $___CONFIG -inputtype $___INPUTTYPE $decoder_config $lsamp_cmd $nbest_list_cmd  -input-file $___DEV_F -ref $references[0] -symal $dev_symal_abs -tmp $working_dir_abs > run$run.out";
+      } else {
+        # Default: call decoder directly
+        $decoder_cmd = "$___DECODER $___DECODER_FLAGS  -config $___CONFIG -inputtype $___INPUTTYPE $decoder_config $lsamp_cmd $nbest_list_cmd  -input-file $___DEV_F > run$run.out";
+      }
    }

    print STDERR "Executing: $decoder_cmd \n";