From 33eee96f35b07168d5912b35baed7df74d4cea23 Mon Sep 17 00:00:00 2001
From: Ulrich Germann
Date: Thu, 21 May 2015 16:36:51 +0000
Subject: [PATCH 001/108] Bug fix related to context-sensitive decoding:
 --context-string had no effect, even when --context-window was not specified.

---
 moses/IOWrapper.cpp | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/moses/IOWrapper.cpp b/moses/IOWrapper.cpp
index 29769f180..8cbf4f091 100644
--- a/moses/IOWrapper.cpp
+++ b/moses/IOWrapper.cpp
@@ -296,16 +296,19 @@ GetBufferedInput()
 }
 
 boost::shared_ptr
-IOWrapper::ReadInput()
+IOWrapper::
+ReadInput()
 {
 #ifdef WITH_THREADS
   boost::lock_guard lock(m_lock);
 #endif
   boost::shared_ptr source = GetBufferedInput();
-  if (source) {
-    source->SetTranslationId(m_currentLine++);
-    this->set_context_for(*source);
-  }
+  if (source)
+    {
+      source->SetTranslationId(m_currentLine++);
+      if (m_look_ahead || m_look_back)
+        this->set_context_for(*source);
+    }
   m_past_input.push_back(source);
   return source;
 }
@@ -344,7 +347,7 @@ set_context_for(InputType& source)
     }
   }
   // cerr << string(80,'=') << endl;
-  source.SetContext(context);
+  if (context->size()) source.SetContext(context);
 }
 

From a1678187fead90da0e19da5a71a82e421b57ff06 Mon Sep 17 00:00:00 2001
From: Rico Sennrich
Date: Fri, 22 May 2015 15:28:42 +0100
Subject: [PATCH 002/108] wrapper for stanford dependency parser

---
 .../training/wrappers/parse-en-stanford.py | 129 ++++++++++++
 1 file changed, 129 insertions(+)
 create mode 100755 scripts/training/wrappers/parse-en-stanford.py

diff --git a/scripts/training/wrappers/parse-en-stanford.py b/scripts/training/wrappers/parse-en-stanford.py
new file mode 100755
index 000000000..7d8be4bcf
--- /dev/null
+++ b/scripts/training/wrappers/parse-en-stanford.py
@@ -0,0 +1,129 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+# Author: Rico Sennrich
+
+# (hacky) wrapper around Stanford CoreNLP to produce CoNLL dependency format.
+# assumes tokenized and sentence-split text.
+
+# to get Moses XML format, first projectivize the trees, then use conll2mosesxml.py.
+
+from __future__ import print_function, unicode_literals
+import os
+import sys
+import codecs
+import argparse
+
+from collections import defaultdict
+from subprocess import Popen, PIPE
+
+# hack for python2/3 compatibility
+from io import open
+argparse.open = open
+
+
+def create_parser():
+    parser = argparse.ArgumentParser(
+        description=(
+            """Wrapper around Stanford CoreNLP to produce CoNLL dependency format.
+            Assumes that text is tokenized and has one sentence per line."""))
+
+    parser.add_argument(
+        '--stanford', type=str,
+        metavar='PATH', required=True,
+        help='path to Stanford CoreNLP')
+
+    parser.add_argument(
+        '--java', type=str, default='java',
+        metavar='PATH',
+        help='path to java executable')
+
+    parser.add_argument(
+        '--input', '-i', type=argparse.FileType('r'), default=sys.stdin,
+        metavar='PATH',
+        help="Input text (default: standard input).")
+    parser.add_argument(
+        '--output', '-o', type=argparse.FileType('w'), default=sys.stdout,
+        metavar='PATH',
+        help="Output text (default: standard output).")
+
+    return parser
+
+def process_stanford(infile, javacmd, stanfordpath):
+
+    stanford = Popen([javacmd,
+                      '-cp', os.path.join(stanfordpath, 'stanford-corenlp-3.5.0.jar') + ':' + os.path.join(stanfordpath, 'stanford-corenlp-3.5.0-models.jar'),
+                      'edu.stanford.nlp.pipeline.StanfordCoreNLP',
+                      '-annotators', 'tokenize, ssplit, pos, depparse, lemma',
+                      '-ssplit.eolonly', 'true',
+                      '-tokenize.whitespace', 'true',
+                      '-numThreads', '8',
+                      '-textFile', '-',
+                      'outFile', '-'], stdin=infile, stdout = PIPE, stderr = open('/dev/null', 'w'))
+    return stanford.stdout
+
+
+def get_sentences(instream):
+    sentence = []
+    expect = 0
+
+    for line in instream:
+        if expect == 0 and line.startswith('Sentence #'):
+            if sentence:
+                yield sentence
+            sentence = []
+            expect = 1
+
+        elif line == '\n':
+            expect = 0
+
+        elif expect == 3:
+            rel, remainder = line.split('(')
+            head, dep = remainder.split()
+            head_int = int(head.split('-')[-1][:-1])
+            dep_int = int(dep.split('-')[-1][:-1])
+            sentence[dep_int-1]['head'] = head_int
+            sentence[dep_int-1]['label'] = rel
+
+        elif expect == 2:
+            linesplit = line.split('[',1)[1].rsplit(']',1)[0].split('] [')
+            if len(linesplit) != len(sentence):
+                sys.stderr.write('Warning: mismatch in number of words in sentence\n')
+                sys.stderr.write(' '.join(w['word'] for w in sentence))
+                for i in range(len(sentence)):
+                    sentence[i]['pos'] = '-'
+                    sentence[i]['lemma'] = '-'
+                    sentence[i]['head'] = 0
+                    sentence[i]['label'] = '-'
+                expect = 0
+                continue
+            for i,w in enumerate(linesplit):
+                sentence[i]['pos'] = w.split(' PartOfSpeech=')[-1].split()[0]
+                sentence[i]['lemma'] = w.split(' Lemma=')[-1]
+            expect = 3
+
+        elif expect == 1:
+            for w in line.split():
+                sentence.append({'word':w})
+            expect = 2
+
+    if sentence:
+        yield sentence
+
+def write(sentence, outstream):
+    for i, w in enumerate(sentence):
+        outstream.write('{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\n'.format(i+1, w['word'], w['lemma'], w['pos'], w['pos'], '-', w['head'], w['label']))
+
+if __name__ == '__main__':
+    if sys.version_info < (3, 0):
+        sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
+        sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
+        sys.stdin = codecs.getreader('UTF-8')(sys.stdin)
+
+
+    parser = create_parser()
+    options = parser.parse_args()
+
+    stanford = process_stanford(options.input, options.java, options.stanford)
+    for sentence in get_sentences(codecs.getreader('UTF-8')(stanford)):
+        write(sentence, options.output)
+        options.output.write('\n')
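The wrapper added above is a plain stdin-to-stdout filter: it expects tokenized, sentence-split text on standard input and writes CoNLL-style dependencies to standard output. A minimal usage sketch, not part of the patch, follows; the corpus file names and the CoreNLP install path are placeholders, and only --stanford is required.

# Illustrative only -- file names and the CoreNLP path are assumptions.
import subprocess

with open('corpus.tok.en') as tokenized, open('corpus.conll', 'w') as conll:
    subprocess.check_call(
        ['scripts/training/wrappers/parse-en-stanford.py',
         '--stanford', '/path/to/stanford-corenlp',  # directory holding the 3.5.0 jars
         '--java', 'java'],                          # optional; defaults to 'java'
        stdin=tokenized, stdout=conll)
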

From 43527c82fc8f759b13807fb10b17a3d5fcd47561 Mon Sep 17 00:00:00 2001
From: Rico Sennrich
Date: Fri, 22 May 2015 15:31:08 +0100
Subject: [PATCH 003/108] training script for monolingual Neural LM (+bugfixes
 and usability improvements for RDLM training)

---
 .../bilingual-lm/averageNullEmbedding.py      |  16 +-
 .../training/rdlm/extract_syntactic_ngrams.py |   2 +-
 scripts/training/rdlm/train_rdlm.py           |  15 +-
 scripts/training/train-neurallm.py            | 179 ++++++++++++++
 4 files changed, 201 insertions(+), 11 deletions(-)
 create mode 100755 scripts/training/train-neurallm.py

diff --git a/scripts/training/bilingual-lm/averageNullEmbedding.py b/scripts/training/bilingual-lm/averageNullEmbedding.py
index aca03aaae..891595aff 100755
--- a/scripts/training/bilingual-lm/averageNullEmbedding.py
+++ b/scripts/training/bilingual-lm/averageNullEmbedding.py
@@ -21,16 +21,10 @@ parser.add_argument(
     required=True)
 
-options = parser.parse_args()
-
-sys.path.append(options.nplm_python_path)
-import nplm
-
-
 def load_model(model_file):
+    import nplm
     return nplm.NeuralLM.from_file(model_file)
 
-
 def get_weights(path, length):
     counter = [0] * length
     for line in open(path):
@@ -38,8 +32,9 @@ def get_weights(path, length):
         counter[last_context] += 1
     return counter
 
+def main(options):
+
-if __name__ == "__main__":
+    sys.path.append(options.nplm_python_path)
 
     model = load_model(options.input_model)
     if options.null_idx == -1:
@@ -50,3 +45,8 @@ if __name__ == "__main__":
     model.input_embeddings[options.null_idx] = numpy.average(
         numpy.array(model.input_embeddings), weights=weights, axis=0)
     model.to_file(open(options.output_model, 'w'))
+
+if __name__ == "__main__":
+
+    options = parser.parse_args()
+    main(options)
diff --git a/scripts/training/rdlm/extract_syntactic_ngrams.py b/scripts/training/rdlm/extract_syntactic_ngrams.py
index c6d4b7968..1292e90f2 100755
--- a/scripts/training/rdlm/extract_syntactic_ngrams.py
+++ b/scripts/training/rdlm/extract_syntactic_ngrams.py
@@ -133,7 +133,7 @@ def get_syntactic_ngrams(xml, options, vocab, output_vocab,
         skip_glue_labels = [
             options.glue_symbol,
             options.start_symbol,
-            options.end_symbo,
+            options.end_symbol,
         ]
         if xml.get('label') in skip_glue_labels:
             for child in xml:
diff --git a/scripts/training/rdlm/train_rdlm.py b/scripts/training/rdlm/train_rdlm.py
index ae57e8dfc..639c1b32c 100755
--- a/scripts/training/rdlm/train_rdlm.py
+++ b/scripts/training/rdlm/train_rdlm.py
@@ -23,7 +23,7 @@ parser = argparse.ArgumentParser()
 parser.add_argument(
     "--working-dir", dest="working_dir", metavar="PATH")
 parser.add_argument(
-    "--corpus", dest="corpus_stem", metavar="PATH", help="Input file.")
+    "--corpus", '-text', dest="corpus_stem", metavar="PATH", help="Input file.")
 parser.add_argument(
     "--nplm-home", dest="nplm_home", metavar="PATH", required=True,
     help="Location of NPLM.")
@@ -169,6 +169,13 @@ def prepare_vocabulary(options):
 
 def main(options):
 
+    if options.output_dir is None:
+        options.output_dir = options.working_dir
+    else:
+        # Create output dir if necessary
+        if not os.path.exists(options.output_dir):
+            os.makedirs(options.output_dir)
+
     options.ngram_size = (
         2 * options.up_context_size +
         2 * options.left_context_size +
@@ -209,6 +216,8 @@ def main(options):
         sys.stderr.write('extracting syntactic n-grams (validation file)\n')
         extract_syntactic_ngrams.main(extract_options)
         extract_options.output.close()
+    else:
+        options.validation_file = None
 
     sys.stderr.write('training neural network\n')
     train_nplm.main(options)
@@ -235,5 +244,7 @@ if __name__ == "__main__":
         sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
         sys.stdin = codecs.getreader('UTF-8')(sys.stdin)
 
-    options = parser.parse_args()
+    options = parser.parse_known_args()[0]
+    if parser.parse_known_args()[1]:
+        sys.stderr.write('Warning: unknown arguments: {0}\n'.format(parser.parse_known_args()[1]))
     main(options)
diff --git a/scripts/training/train-neurallm.py b/scripts/training/train-neurallm.py
new file mode 100755
index 000000000..2d2f12015
--- /dev/null
+++ b/scripts/training/train-neurallm.py
@@ -0,0 +1,179 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+""" train feed-forward neural network LM with NPLM tool
+resulting model can be used in Moses as feature function NeuralLM
+"""
+
+from __future__ import print_function, unicode_literals
+
+import logging
+import argparse
+import subprocess
+import sys
+import os
+import codecs
+
+# ./bilingual-lm
+sys.path.append(os.path.join(sys.path[0], 'bilingual-lm'))
+import train_nplm
+import averageNullEmbedding
+
+logging.basicConfig(
+    format='%(asctime)s %(levelname)s: %(message)s',
+    datefmt='%Y-%m-%d %H:%M:%S', level=logging.DEBUG)
+parser = argparse.ArgumentParser()
+parser.add_argument(
+    "--working-dir", dest="working_dir", metavar="PATH")
+parser.add_argument(
+    "--corpus", '-text', dest="corpus_stem", metavar="PATH", help="Input file.")
+parser.add_argument(
+    "--nplm-home", dest="nplm_home", metavar="PATH", required=True,
+    help="Location of NPLM.")
+parser.add_argument(
+    "--epochs", dest="epochs", type=int, metavar="INT",
+    help="Number of training epochs (default: %(default)s).")
+parser.add_argument(
+    "--order", dest="order", type=int, metavar="INT",
+    help="N-gram order of language model (default: %(default)s).")
+parser.add_argument(
+    "--minibatch-size", dest="minibatch_size", type=int, metavar="INT",
+    help="Minibatch size (default: %(default)s).")
+parser.add_argument(
+    "--noise", dest="noise", type=int, metavar="INT",
+    help="Number of noise samples for NCE (default: %(default)s).")
+parser.add_argument(
+    "--hidden", dest="hidden", type=int, metavar="INT",
+    help=(
+        "Size of hidden layer (0 for single hidden layer) "
+        "(default: %(default)s)"))
+parser.add_argument(
+    "--input-embedding", dest="input_embedding", type=int, metavar="INT",
+    help="Size of input embedding layer (default: %(default)s).")
+parser.add_argument(
+    "--output-embedding", dest="output_embedding", type=int, metavar="INT",
+    help="Size of output embedding layer (default: %(default)s).")
+parser.add_argument(
+    "--threads", "-t", dest="threads", type=int, metavar="INT",
+    help="Number of threads (default: %(default)s).")
+parser.add_argument(
+    "--output-model", dest="output_model", metavar="PATH",
+    help="Name of output model (default: %(default)s).")
+parser.add_argument(
+    "--output-dir", dest="output_dir", metavar="PATH",
+    help="Output directory (default: same as working-dir).")
+parser.add_argument(
+    "--config-options-file", dest="config_options_file", metavar="PATH")
+parser.add_argument(
+    "--log-file", dest="log_file", metavar="PATH",
+    help="Log file to write to (default: %(default)s).")
+parser.add_argument(
+    "--validation-corpus", dest="validation_corpus", metavar="PATH",
+    help="Validation file (default: %(default)s).")
+parser.add_argument(
+    "--activation-function", dest="activation_fn",
+    choices=['identity', 'rectifier', 'tanh', 'hardtanh'],
+    help="Activation function (default: %(default)s).")
+parser.add_argument(
+    "--learning-rate", dest="learning_rate", type=float, metavar="FLOAT",
+    help="Learning rate (default: %(default)s).")
+parser.add_argument(
+    "--words-file", dest="words_file", metavar="PATH",
+    help="Output vocabulary file (default: %(default)s).")
+parser.add_argument(
+    "--vocab-size", dest="vocab_size", type=int, metavar="INT",
+    help="Vocabulary size (default: %(default)s).")
+
+parser.set_defaults(
+    working_dir="working",
+    corpus_stem="train",
+    nplm_home="/home/bhaddow/tools/nplm",
+    epochs=2,
+    order=5,
+    minibatch_size=1000,
+    noise=100,
+    hidden=0,
+    input_embedding=150,
+    output_embedding=750,
+    threads=4,
+    output_model="train",
+    output_dir=None,
+    config_options_file="config",
+    log_file="log",
+    validation_corpus=None,
+    activation_fn="rectifier",
+    learning_rate=1,
+    words_file='vocab',
+    vocab_size=500000)
+
+def main(options):
+
+    options.ngram_size = options.order
+
+    if options.output_dir is None:
+        options.output_dir = options.working_dir
+    else:
+        # Create output dir if necessary
+        if not os.path.exists(options.output_dir):
+            os.makedirs(options.output_dir)
+
+    extraction_cmd = [os.path.join(options.nplm_home, 'src', 'prepareNeuralLM'),
+                      '--train_text', options.corpus_stem,
+                      '--ngramize', '1',
+                      '--ngram_size', str(options.ngram_size),
+                      '--vocab_size', str(options.vocab_size),
+                      '--write_words_file', os.path.join(options.working_dir, options.words_file),
+                      '--train_file', os.path.join(options.working_dir, os.path.basename(options.corpus_stem) + '.numberized')
+                      ]
+
+    sys.stderr.write('extracting n-grams\n')
+    ret = subprocess.call(extraction_cmd)
+    if ret:
+        raise Exception("preparing neural LM failed")
+
+    if options.validation_corpus:
+
+        extraction_cmd = [os.path.join(options.nplm_home, 'src', 'prepareNeuralLM'),
+                          '--train_text', options.validation_corpus,
+                          '--ngramize', '1',
+                          '--ngram_size', str(options.ngram_size),
+                          '--vocab_size', str(options.vocab_size),
+                          '--words_file', os.path.join(options.working_dir, options.words_file),
+                          '--train_file', os.path.join(options.working_dir, os.path.basename(options.validation_corpus) + '.numberized')
+                          ]
+
+        sys.stderr.write('extracting n-grams (validation file)\n')
+        ret = subprocess.call(extraction_cmd)
+        if ret:
+            raise Exception("preparing neural LM failed")
+
+    else:
+        options.validation_file = None
+
+    options.input_words_file = options.words_file
+    options.output_words_file = options.words_file
+    options.input_vocab_size = options.vocab_size
+    options.output_vocab_size = options.vocab_size
+
+    sys.stderr.write('training neural network\n')
+    train_nplm.main(options)
+
+    sys.stderr.write('averaging null words\n')
+    average_options = averageNullEmbedding.parser.parse_args(
+        ['-i', os.path.join(options.output_dir, options.output_model + '.model.nplm.' + str(options.epochs)),
+         '-o', os.path.join(options.output_dir, options.output_model + '.model.nplm'),
+         '-t', os.path.join(options.working_dir, os.path.basename(options.corpus_stem) + '.numberized'),
+         '-p', os.path.join(options.nplm_home, 'python')])
+    averageNullEmbedding.main(average_options)
+
+
+if __name__ == "__main__":
+    if sys.version_info < (3, 0):
+        sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
+        sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
+        sys.stdin = codecs.getreader('UTF-8')(sys.stdin)
+
+    options = parser.parse_known_args()[0]
+    if parser.parse_known_args()[1]:
+        sys.stderr.write('Warning: unknown arguments: {0}\n'.format(parser.parse_known_args()[1]))
+    main(options)
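For reference, a sketch of how the new training script might be invoked; it is not part of the patch, the corpus name, NPLM path, and output locations are placeholders, and every option shown mirrors an argparse definition above (anything omitted falls back to parser.set_defaults).

# Illustrative only -- paths below are assumptions; see parser.set_defaults above for defaults.
import subprocess

subprocess.check_call(
    ['scripts/training/train-neurallm.py',
     '--working-dir', 'lm.working',
     '--corpus', 'corpus.tok.en',        # tokenized monolingual training text
     '--nplm-home', '/path/to/nplm',
     '--order', '5',
     '--epochs', '2',
     '--output-dir', 'lm.out',
     '--output-model', 'neurallm'])
# Per the script above, the averaged model is written to lm.out/neurallm.model.nplm,
# which the docstring says can be used in Moses as the NeuralLM feature function.
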

From 502e72ce91e749e3e24480bb0d2692c4bf6b0b83 Mon Sep 17 00:00:00 2001
From: Hieu Hoang
Date: Sun, 24 May 2015 17:08:57 +0400
Subject: [PATCH 004/108] eclipse

---
 contrib/other-builds/extract-rules/.cproject  |   9 +-
 contrib/other-builds/extract/.cproject        |   9 +-
 contrib/other-builds/manual-label/.cproject   | 132 ----------
 contrib/other-builds/manual-label/.project    |  27 ---
 contrib/other-builds/manual-label/DeEn.cpp    |  46 ----
 contrib/other-builds/manual-label/DeEn.h      |   5 -
 .../manual-label/EnOpenNLPChunker.cpp         | 202 ----------------
 .../manual-label/EnOpenNLPChunker.h           |  29 ---
 .../manual-label/EnPhrasalVerb.cpp            | 226 ------------------
 .../other-builds/manual-label/EnPhrasalVerb.h |  11 -
 .../manual-label/LabelByInitialLetter.cpp     |  29 ---
 .../manual-label/LabelByInitialLetter.h       |   6 -
 contrib/other-builds/manual-label/Main.cpp    | 195 ---------------
 contrib/other-builds/manual-label/Main.h      |  27 ---
 contrib/other-builds/manual-label/Makefile    |  14 --
 .../manual-label/manual-label.project         | 131 ----------
 contrib/other-builds/moses/.project           |  10 -
 contrib/other-builds/score/.cproject          |   1 -
 contrib/other-builds/server/.cproject         |   5 +-
 19 files changed, 12 insertions(+), 1102 deletions(-)
 delete mode 100644 contrib/other-builds/manual-label/.cproject
 delete mode 100644 contrib/other-builds/manual-label/.project
 delete mode 100644 contrib/other-builds/manual-label/DeEn.cpp
 delete mode 100644 contrib/other-builds/manual-label/DeEn.h
 delete mode 100644 contrib/other-builds/manual-label/EnOpenNLPChunker.cpp
 delete mode 100644 contrib/other-builds/manual-label/EnOpenNLPChunker.h
 delete mode 100644 contrib/other-builds/manual-label/EnPhrasalVerb.cpp
 delete mode 100644 contrib/other-builds/manual-label/EnPhrasalVerb.h
 delete mode 100644 contrib/other-builds/manual-label/LabelByInitialLetter.cpp
 delete mode 100644 contrib/other-builds/manual-label/LabelByInitialLetter.h
 delete mode 100644 contrib/other-builds/manual-label/Main.cpp
 delete mode 100644 contrib/other-builds/manual-label/Main.h
 delete mode 100644 contrib/other-builds/manual-label/Makefile
 delete mode 100644 contrib/other-builds/manual-label/manual-label.project

diff --git a/contrib/other-builds/extract-rules/.cproject b/contrib/other-builds/extract-rules/.cproject
index e79f0f526..86e38979e 100644
--- a/contrib/other-builds/extract-rules/.cproject
+++ b/contrib/other-builds/extract-rules/.cproject
@@ -5,16 +5,16 @@
+
-
-
+
@@ -25,6 +25,7 @@
@@ -60,16 +61,16 @@
+
-
-
+
diff --git a/contrib/other-builds/extract/.cproject b/contrib/other-builds/extract/.cproject
index 10701cb6e..4c80306be 100644
--- a/contrib/other-builds/extract/.cproject
+++ b/contrib/other-builds/extract/.cproject
@@ -5,16 +5,16 @@
+
-
-
+
@@ -25,6 +25,7 @@
@@ -61,16 +62,16 @@
+
-
-
+
diff --git a/contrib/other-builds/manual-label/.cproject b/contrib/other-builds/manual-label/.cproject
deleted file mode 100644 index d9297a9fc..000000000 --- a/contrib/other-builds/manual-label/.cproject +++ /dev/null @@ -1,132 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/contrib/other-builds/manual-label/.project b/contrib/other-builds/manual-label/.project deleted file mode 100644 index d0c1dba19..000000000 --- a/contrib/other-builds/manual-label/.project +++ /dev/null @@ -1,27 +0,0 @@ - - - manual-label - - - - - - org.eclipse.cdt.managedbuilder.core.genmakebuilder - clean,full,incremental, - - - - - org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder - full,incremental, - - - - - - org.eclipse.cdt.core.cnature - org.eclipse.cdt.core.ccnature - org.eclipse.cdt.managedbuilder.core.managedBuildNature - org.eclipse.cdt.managedbuilder.core.ScannerConfigNature - - diff --git a/contrib/other-builds/manual-label/DeEn.cpp b/contrib/other-builds/manual-label/DeEn.cpp deleted file mode 100644 index ea2934c5a..000000000 --- a/contrib/other-builds/manual-label/DeEn.cpp +++ /dev/null @@ -1,46 +0,0 @@ -#include -#include "DeEn.h" -#include "Main.h" -#include "moses/Util.h" - -using namespace std; - -extern bool g_debug; - -bool Contains(const Phrase &source, int start, int end, int factor, const string &str) -{ - for (int pos = start; pos <= end; ++pos) { - bool found = IsA(source, pos, 0, factor, str); - if (found) { - return true; - } - } - return false; -} - -void LabelDeEn(const Phrase &source, ostream &out) -{ - Ranges ranges; - - // find ranges to label - for (int start = 0; start < source.size(); ++start) { - for (int end = start; end < source.size(); ++end) { - if (IsA(source, start, -1, 1, "VAFIN") - && IsA(source, end, +1, 1, "VVINF VVPP") - && !Contains(source, start, end, 1, "VAFIN VVINF VVPP VVFIN")) { - Range range(start, end, "reorder-label"); - ranges.push_back(range); - } - else if ((start == 0 || IsA(source, start, -1, 1, "$,")) - && IsA(source, end, +1, 0, "zu") - && IsA(source, end, +2, 1, "VVINF") - && !Contains(source, start, end, 1, "$,")) { - Range range(start, end, "reorder-label"); - ranges.push_back(range); - } - } - } - - OutputWithLabels(source, ranges, out); -} - diff --git a/contrib/other-builds/manual-label/DeEn.h b/contrib/other-builds/manual-label/DeEn.h deleted file mode 100644 index c24ce0079..000000000 --- a/contrib/other-builds/manual-label/DeEn.h +++ /dev/null @@ -1,5 +0,0 @@ -#pragma once - -#include "Main.h" - -void LabelDeEn(const Phrase &source, std::ostream &out); diff --git a/contrib/other-builds/manual-label/EnOpenNLPChunker.cpp b/contrib/other-builds/manual-label/EnOpenNLPChunker.cpp deleted file mode 100644 index 538aa9746..000000000 --- a/contrib/other-builds/manual-label/EnOpenNLPChunker.cpp +++ /dev/null @@ -1,202 +0,0 @@ -/* - * EnApacheChunker.cpp - * - * Created on: 28 Feb 2014 - * Author: hieu - */ -#include -#include -#include -#include -#include -#include -#include "EnOpenNLPChunker.h" -#include "moses/Util.h" - -using namespace std; -using namespace boost::algorithm; - -EnOpenNLPChunker::EnOpenNLPChunker(const std::string &openNLPPath) -:m_openNLPPath(openNLPPath) -{ - // TODO Auto-generated constructor stub - -} - -EnOpenNLPChunker::~EnOpenNLPChunker() { - // TODO Auto-generated destructor stub -} - -void EnOpenNLPChunker::Process(std::istream &in, std::ostream &out, const vector &filterList) -{ - const boost::filesystem::path - inPath = 
boost::filesystem::unique_path(), - outPath = boost::filesystem::unique_path(); - // read all input to a temp file - ofstream inFile(inPath.c_str()); - - string line; - while (getline(in, line)) { - Unescape(line); - inFile << line << endl; - } - inFile.close(); - - // execute chunker - string cmd = "cat " + inPath.native() + " | " - + m_openNLPPath + "/bin/opennlp POSTagger " - + m_openNLPPath + "/models/en-pos-maxent.bin | " - + m_openNLPPath + "/bin/opennlp ChunkerME " - + m_openNLPPath + "/models/en-chunker.bin > " - + outPath.native(); - //g << "Executing:" << cmd << endl; - int ret = system(cmd.c_str()); - - // read result of chunker and output as Moses xml trees - ifstream outFile(outPath.c_str()); - - size_t lineNum = 0; - while (getline(outFile, line)) { - //cerr << line << endl; - MosesReformat(line, out, filterList); - out << endl; - ++lineNum; - } - outFile.close(); - - // clean up temporary files - remove(inPath.c_str()); - remove(outPath.c_str()); -} - -void EnOpenNLPChunker::MosesReformat(const string &line, std::ostream &out, const vector &filterList) -{ - //cerr << "REFORMATING:" << line << endl; - bool inLabel = false; - vector toks; - Moses::Tokenize(toks, line); - for (size_t i = 0; i < toks.size(); ++i) { - const string &tok = toks[i]; - - if (tok.substr(0, 1) == "[" && tok.substr(1,1) != "_") { - // start of chunk - string label = tok.substr(1); - if (UseLabel(label, filterList)) { - out << ""; - inLabel = true; - } - } - else if (ends_with(tok, "]")) { - // end of chunk - if (tok.size() > 1) { - if (tok.substr(1,1) == "_") { - // just a word that happens to be ] - vector factors; - Moses::Tokenize(factors, tok, "_"); - assert(factors.size() == 2); - - Escape(factors[0]); - out << factors[0] << " "; - } - else { - // a word and end of tree - string word = tok.substr(0, tok.size()-1); - - vector factors; - Moses::Tokenize(factors, word, "_"); - assert(factors.size() == 2); - - Escape(factors[0]); - out << factors[0] << " "; - } - - if (inLabel) { - out << " "; - inLabel = false; - } - } - else { - if (inLabel) { - out << " "; - inLabel = false; - } - } - - } - else { - // lexical item - vector factors; - Moses::Tokenize(factors, tok, "_"); - if (factors.size() == 2) { - Escape(factors[0]); - out << factors[0] << " "; - } - else if (factors.size() == 1) { - // word is _ - assert(tok.substr(0, 2) == "__"); - out << "_ "; - } - else { - throw "Unknown format:" + tok; - } - } - } -} - -std::string -replaceAll( std::string const& original, - std::string const& before, - std::string const& after ) -{ - std::string retval; - std::string::const_iterator end = original.end(); - std::string::const_iterator current = original.begin(); - std::string::const_iterator next = - std::search( current, end, before.begin(), before.end() ); - while ( next != end ) { - retval.append( current, next ); - retval.append( after ); - current = next + before.size(); - next = std::search( current, end, before.begin(), before.end() ); - } - retval.append( current, next ); - return retval; -} - -void EnOpenNLPChunker::Escape(string &line) -{ - line = replaceAll(line, "&", "&"); - line = replaceAll(line, "|", "|"); - line = replaceAll(line, "<", "<"); - line = replaceAll(line, ">", ">"); - line = replaceAll(line, "'", "'"); - line = replaceAll(line, "\"", """); - line = replaceAll(line, "[", "["); - line = replaceAll(line, "]", "]"); -} - -void EnOpenNLPChunker::Unescape(string &line) -{ - line = replaceAll(line, "|", "|"); - line = replaceAll(line, "<", "<"); - line = replaceAll(line, ">", ">"); - 
line = replaceAll(line, """, "\""); - line = replaceAll(line, "'", "'"); - line = replaceAll(line, "[", "["); - line = replaceAll(line, "]", "]"); - line = replaceAll(line, "&", "&"); -} - -bool EnOpenNLPChunker::UseLabel(const std::string &label, const std::vector &filterList) const -{ - if (filterList.size() == 0) { - return true; - } - - for (size_t i = 0; i < filterList.size(); ++i) { - if (label == filterList[i]) { - return true; - } - } - return false; -} diff --git a/contrib/other-builds/manual-label/EnOpenNLPChunker.h b/contrib/other-builds/manual-label/EnOpenNLPChunker.h deleted file mode 100644 index df9f90e42..000000000 --- a/contrib/other-builds/manual-label/EnOpenNLPChunker.h +++ /dev/null @@ -1,29 +0,0 @@ -/* - * EnApacheChunker.h - * - * Created on: 28 Feb 2014 - * Author: hieu - */ - -#pragma once - -#include -#include -#include - -class EnOpenNLPChunker { -public: - EnOpenNLPChunker(const std::string &openNLPPath); - virtual ~EnOpenNLPChunker(); - void Process(std::istream &in, std::ostream &out, const std::vector &filterList); -protected: - const std::string m_openNLPPath; - - void Escape(std::string &line); - void Unescape(std::string &line); - - void MosesReformat(const std::string &line, std::ostream &out, const std::vector &filterList); - - bool UseLabel(const std::string &label, const std::vector &filterList) const; -}; - diff --git a/contrib/other-builds/manual-label/EnPhrasalVerb.cpp b/contrib/other-builds/manual-label/EnPhrasalVerb.cpp deleted file mode 100644 index 4bee9b941..000000000 --- a/contrib/other-builds/manual-label/EnPhrasalVerb.cpp +++ /dev/null @@ -1,226 +0,0 @@ -#include -#include -#include -#include -#include "EnPhrasalVerb.h" -#include "moses/Util.h" - -using namespace std; - -void EnPhrasalVerb(const Phrase &source, int revision, ostream &out) -{ - Ranges ranges; - - // find ranges to label - for (int start = 0; start < source.size(); ++start) { - size_t end = std::numeric_limits::max(); - - if (IsA(source, start, 0, 0, "ask asked asking")) { - end = Found(source, start, 0, "out"); - } - else if (IsA(source, start, 0, 0, "back backed backing")) { - end = Found(source, start, 0, "up"); - } - else if (IsA(source, start, 0, 0, "blow blown blew")) { - end = Found(source, start, 0, "up"); - } - else if (IsA(source, start, 0, 0, "break broke broken")) { - end = Found(source, start, 0, "down up in"); - } - else if (IsA(source, start, 0, 0, "bring brought bringing")) { - end = Found(source, start, 0, "down up in"); - } - else if (IsA(source, start, 0, 0, "call called calling")) { - end = Found(source, start, 0, "back up off"); - } - else if (IsA(source, start, 0, 0, "check checked checking")) { - end = Found(source, start, 0, "out in"); - } - else if (IsA(source, start, 0, 0, "cheer cheered cheering")) { - end = Found(source, start, 0, "up"); - } - else if (IsA(source, start, 0, 0, "clean cleaned cleaning")) { - end = Found(source, start, 0, "up"); - } - else if (IsA(source, start, 0, 0, "cross crossed crossing")) { - end = Found(source, start, 0, "out"); - } - else if (IsA(source, start, 0, 0, "cut cutting")) { - end = Found(source, start, 0, "down off out"); - } - else if (IsA(source, start, 0, 0, "do did done")) { - end = Found(source, start, 0, "over up"); - } - else if (IsA(source, start, 0, 0, "drop dropped dropping")) { - end = Found(source, start, 0, "off"); - } - else if (IsA(source, start, 0, 0, "figure figured figuring")) { - end = Found(source, start, 0, "out"); - } - else if (IsA(source, start, 0, 0, "fill filled filling")) { - end = 
Found(source, start, 0, "in out up"); - } - else if (IsA(source, start, 0, 0, "find found finding")) { - end = Found(source, start, 0, "out"); - } - else if (IsA(source, start, 0, 0, "get got getting gotten")) { - end = Found(source, start, 0, "across over back"); - } - else if (IsA(source, start, 0, 0, "give given gave giving")) { - end = Found(source, start, 0, "away back out up"); - } - else if (IsA(source, start, 0, 0, "hand handed handing")) { - end = Found(source, start, 0, "down in over"); - } - else if (IsA(source, start, 0, 0, "hold held holding")) { - end = Found(source, start, 0, "back up"); - } - else if (IsA(source, start, 0, 0, "keep kept keeping")) { - end = Found(source, start, 0, "from up"); - } - else if (IsA(source, start, 0, 0, "let letting")) { - end = Found(source, start, 0, "down in"); - } - else if (IsA(source, start, 0, 0, "look looked looking")) { - end = Found(source, start, 0, "over up"); - } - else if (IsA(source, start, 0, 0, "make made making")) { - end = Found(source, start, 0, "up"); - } - else if (IsA(source, start, 0, 0, "mix mixed mixing")) { - end = Found(source, start, 0, "up"); - } - else if (IsA(source, start, 0, 0, "pass passed passing")) { - end = Found(source, start, 0, "out up"); - } - else if (IsA(source, start, 0, 0, "pay payed paying")) { - end = Found(source, start, 0, "back"); - } - else if (IsA(source, start, 0, 0, "pick picked picking")) { - end = Found(source, start, 0, "out"); - } - else if (IsA(source, start, 0, 0, "point pointed pointing")) { - end = Found(source, start, 0, "out"); - } - else if (IsA(source, start, 0, 0, "put putting")) { - end = Found(source, start, 0, "down off out together on"); - } - else if (IsA(source, start, 0, 0, "send sending")) { - end = Found(source, start, 0, "back"); - } - else if (IsA(source, start, 0, 0, "set setting")) { - end = Found(source, start, 0, "up"); - } - else if (IsA(source, start, 0, 0, "sort sorted sorting")) { - end = Found(source, start, 0, "out"); - } - else if (IsA(source, start, 0, 0, "switch switched switching")) { - end = Found(source, start, 0, "off on"); - } - else if (IsA(source, start, 0, 0, "take took taking")) { - end = Found(source, start, 0, "apart back off out"); - } - else if (IsA(source, start, 0, 0, "tear torn tearing")) { - end = Found(source, start, 0, "up"); - } - else if (IsA(source, start, 0, 0, "think thought thinking")) { - end = Found(source, start, 0, "over"); - } - else if (IsA(source, start, 0, 0, "thrown threw thrown throwing")) { - end = Found(source, start, 0, "away"); - } - else if (IsA(source, start, 0, 0, "turn turned turning")) { - end = Found(source, start, 0, "down off on"); - } - else if (IsA(source, start, 0, 0, "try tried trying")) { - end = Found(source, start, 0, "on out"); - } - else if (IsA(source, start, 0, 0, "use used using")) { - end = Found(source, start, 0, "up"); - } - else if (IsA(source, start, 0, 0, "warm warmed warming")) { - end = Found(source, start, 0, "up"); - } - else if (IsA(source, start, 0, 0, "work worked working")) { - end = Found(source, start, 0, "out"); - } - - // found range to label - if (end != std::numeric_limits::max() && - end > start + 1) { - bool add = true; - if (revision == 1 && Exist(source, - start + 1, - end - 1, - 1, - "VB VBD VBG VBN VBP VBZ")) { - // there's a verb in between - add = false; - } - - if (add) { - Range range(start + 1, end - 1, "reorder-label"); - ranges.push_back(range); - } - } - } - - OutputWithLabels(source, ranges, out); -} - -bool Exist(const Phrase &source, int start, int end, int 
factor, const std::string &str) -{ - vector soughts = Moses::Tokenize(str, " "); - for (size_t i = start; i <= end; ++i) { - const Word &word = source[i]; - bool found = Found(word, factor, soughts); - if (found) { - return true; - } - } - - return false; -} - -size_t Found(const Phrase &source, int pos, int factor, const std::string &str) -{ - const size_t MAX_RANGE = 10; - - vector soughts = Moses::Tokenize(str, " "); - vector puncts = Moses::Tokenize(". : , ;", " "); - - - size_t maxEnd = std::min(source.size(), (size_t) pos + MAX_RANGE); - for (size_t i = pos + 1; i < maxEnd; ++i) { - const Word &word = source[i]; - bool found; - - found = Found(word, factor, puncts); - if (found) { - return std::numeric_limits::max(); - } - - found = Found(word, factor, soughts); - if (found) { - return i; - } - } - - return std::numeric_limits::max(); -} - - -bool Found(const Word &word, int factor, const vector &soughts) -{ - const string &element = word[factor]; - for (size_t i = 0; i < soughts.size(); ++i) { - const string &sought = soughts[i]; - bool found = (element == sought); - if (found) { - return true; - } - } - return false; -} - - diff --git a/contrib/other-builds/manual-label/EnPhrasalVerb.h b/contrib/other-builds/manual-label/EnPhrasalVerb.h deleted file mode 100644 index 4cb5f7348..000000000 --- a/contrib/other-builds/manual-label/EnPhrasalVerb.h +++ /dev/null @@ -1,11 +0,0 @@ -#pragma once - -#include "Main.h" - -// roll your own identification of phrasal verbs -void EnPhrasalVerb(const Phrase &source, int revision, std::ostream &out); - -bool Exist(const Phrase &source, int start, int end, int factor, const std::string &str); -size_t Found(const Phrase &source, int pos, int factor, const std::string &str); -bool Found(const Word &word, int factor, const std::vector &soughts); - diff --git a/contrib/other-builds/manual-label/LabelByInitialLetter.cpp b/contrib/other-builds/manual-label/LabelByInitialLetter.cpp deleted file mode 100644 index e4136a7ea..000000000 --- a/contrib/other-builds/manual-label/LabelByInitialLetter.cpp +++ /dev/null @@ -1,29 +0,0 @@ -#include "LabelByInitialLetter.h" -#include "Main.h" - -using namespace std; - -void LabelByInitialLetter(const Phrase &source, std::ostream &out) -{ - Ranges ranges; - - for (int start = 0; start < source.size(); ++start) { - const string &startWord = source[start][0]; - string startChar = startWord.substr(0,1); - - for (int end = start + 1; end < source.size(); ++end) { - const string &endWord = source[end][0]; - string endChar = endWord.substr(0,1); - - if (startChar == endChar) { - Range range(start, end, startChar + "-label"); - ranges.push_back(range); - } - } - } - - OutputWithLabels(source, ranges, out); - -} - - diff --git a/contrib/other-builds/manual-label/LabelByInitialLetter.h b/contrib/other-builds/manual-label/LabelByInitialLetter.h deleted file mode 100644 index ba8d34c19..000000000 --- a/contrib/other-builds/manual-label/LabelByInitialLetter.h +++ /dev/null @@ -1,6 +0,0 @@ -#pragma once - -#include "Main.h" - -void LabelByInitialLetter(const Phrase &source, std::ostream &out); - diff --git a/contrib/other-builds/manual-label/Main.cpp b/contrib/other-builds/manual-label/Main.cpp deleted file mode 100644 index 896f70590..000000000 --- a/contrib/other-builds/manual-label/Main.cpp +++ /dev/null @@ -1,195 +0,0 @@ -#include -#include -#include -#include "moses/Util.h" -#include "Main.h" -#include "DeEn.h" -#include "EnPhrasalVerb.h" -#include "EnOpenNLPChunker.h" -#include "LabelByInitialLetter.h" - -using namespace std; - 
-bool g_debug = false; - -Phrase Tokenize(const string &line); - -int main(int argc, char** argv) -{ - cerr << "Starting" << endl; - - namespace po = boost::program_options; - po::options_description desc("Options"); - desc.add_options() - ("help", "Print help messages") - - ("input,i", po::value(), "Input file. Otherwise it will read from standard in") - ("output,o", po::value(), "Output file. Otherwise it will print from standard out") - - ("source-language,s", po::value()->required(), "Source Language") - ("target-language,t", po::value()->required(), "Target Language") - ("revision,r", po::value()->default_value(0), "Revision") - ("filter", po::value(), "Only use labels from this comma-separated list") - - ("opennlp", po::value()->default_value(""), "Path to Apache OpenNLP toolkit") - - ; - - po::variables_map vm; - try - { - po::store(po::parse_command_line(argc, argv, desc), - vm); // can throw - - /** --help option - */ - if ( vm.count("help") ) - { - std::cout << "Basic Command Line Parameter App" << std::endl - << desc << std::endl; - return EXIT_SUCCESS; - } - - po::notify(vm); // throws on error, so do after help in case - // there are any problems - } - catch(po::error& e) - { - std::cerr << "ERROR: " << e.what() << std::endl << std::endl; - std::cerr << desc << std::endl; - return EXIT_FAILURE; - } - - istream *inStrm = &cin; - if (vm.count("input")) { - string inStr = vm["input"].as(); - cerr << "inStr=" << inStr << endl; - ifstream *inFile = new ifstream(inStr.c_str()); - inStrm = inFile; - } - - ostream *outStrm = &cout; - if (vm.count("output")) { - string outStr = vm["output"].as(); - cerr << "outStr=" << outStr << endl; - ostream *outFile = new ofstream(outStr.c_str()); - outStrm = outFile; - } - - vector filterList; - if (vm.count("filter")) { - string filter = vm["filter"].as(); - Moses::Tokenize(filterList, filter, ","); - } - - string sourceLang = vm["source-language"].as(); - string targetLang = vm["target-language"].as(); - int revision = vm["revision"].as(); - - cerr << sourceLang << " " << targetLang << " " << revision << endl; - - if (sourceLang == "en" && revision == 2) { - if (vm.count("opennlp") == 0) { - throw "Need path to openNLP toolkit"; - } - - string openNLPPath = vm["opennlp"].as(); - EnOpenNLPChunker chunker(openNLPPath); - chunker.Process(*inStrm, *outStrm, filterList); - } - else { - // process line-by-line - string line; - size_t lineNum = 1; - - while (getline(*inStrm, line)) { - //cerr << lineNum << ":" << line << endl; - if (lineNum % 1000 == 0) { - cerr << lineNum << " "; - } - - Phrase source = Tokenize(line); - - if (revision == 600 ) { - LabelByInitialLetter(source, *outStrm); - } - else if (sourceLang == "de" && targetLang == "en") { - LabelDeEn(source, *outStrm); - } - else if (sourceLang == "en") { - if (revision == 0 || revision == 1) { - EnPhrasalVerb(source, revision, *outStrm); - } - else if (revision == 2) { - string openNLPPath = vm["opennlp-path"].as(); - EnOpenNLPChunker chunker(openNLPPath); - } - } - - ++lineNum; - } - } - - - cerr << "Finished" << endl; - return EXIT_SUCCESS; -} - -Phrase Tokenize(const string &line) -{ - Phrase ret; - - vector toks = Moses::Tokenize(line); - for (size_t i = 0; i < toks.size(); ++i) { - Word word = Moses::Tokenize(toks[i], "|"); - ret.push_back(word); - } - - return ret; -} - -bool IsA(const Phrase &source, int pos, int offset, int factor, const string &str) -{ - pos += offset; - if (pos >= source.size() || pos < 0) { - return false; - } - - const string &word = source[pos][factor]; - vector 
soughts = Moses::Tokenize(str, " "); - for (int i = 0; i < soughts.size(); ++i) { - string &sought = soughts[i]; - bool found = (word == sought); - if (found) { - return true; - } - } - return false; -} - - -void OutputWithLabels(const Phrase &source, const Ranges ranges, ostream &out) -{ - // output sentence, with labels - for (int pos = 0; pos < source.size(); ++pos) { - // output beginning of label - for (Ranges::const_iterator iter = ranges.begin(); iter != ranges.end(); ++iter) { - const Range &range = *iter; - if (range.range.first == pos) { - out << " "; - } - } - - const Word &word = source[pos]; - out << word[0] << " "; - - for (Ranges::const_iterator iter = ranges.begin(); iter != ranges.end(); ++iter) { - const Range &range = *iter; - if (range.range.second == pos) { - out << " "; - } - } - } - out << endl; - -} diff --git a/contrib/other-builds/manual-label/Main.h b/contrib/other-builds/manual-label/Main.h deleted file mode 100644 index 036da0d45..000000000 --- a/contrib/other-builds/manual-label/Main.h +++ /dev/null @@ -1,27 +0,0 @@ -#pragma once - -#include -#include -#include -#include - -typedef std::vector Word; -typedef std::vector Phrase; - -struct Range -{ - Range(int start,int end, const std::string &l) - :range(start, end) - ,label(l) - {} - - std::pair range; - std::string label; -}; - -typedef std::list Ranges; - -bool IsA(const Phrase &source, int pos, int offset, int factor, const std::string &str); -void OutputWithLabels(const Phrase &source, const Ranges ranges, std::ostream &out); - - diff --git a/contrib/other-builds/manual-label/Makefile b/contrib/other-builds/manual-label/Makefile deleted file mode 100644 index f24d69dc7..000000000 --- a/contrib/other-builds/manual-label/Makefile +++ /dev/null @@ -1,14 +0,0 @@ -all: manual-label - -clean: - rm -f *.o manual-label - -.cpp.o: - g++ -I../../../boost/include -I../../../ -O3 -g -c $< - -OBJECTS = DeEn.o EnOpenNLPChunker.o EnPhrasalVerb.o Main.o LabelByInitialLetter.o - -manual-label: $(OBJECTS) - g++ $(OBJECTS) -L../../../boost/lib64 -lz -lboost_program_options-mt -o manual-label - - diff --git a/contrib/other-builds/manual-label/manual-label.project b/contrib/other-builds/manual-label/manual-label.project deleted file mode 100644 index 5c678561a..000000000 --- a/contrib/other-builds/manual-label/manual-label.project +++ /dev/null @@ -1,131 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - None - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - None - - - - - - - - - - - - - - - - - diff --git a/contrib/other-builds/moses/.project b/contrib/other-builds/moses/.project index 7ed5723ea..389f71297 100644 --- a/contrib/other-builds/moses/.project +++ b/contrib/other-builds/moses/.project @@ -1140,16 +1140,6 @@ 1 PARENT-3-PROJECT_LOC/moses/FF/DynamicCacheBasedLanguageModel.h - - FF/ExternalFeature.cpp - 1 - PARENT-3-PROJECT_LOC/moses/FF/ExternalFeature.cpp - - - FF/ExternalFeature.h - 1 - PARENT-3-PROJECT_LOC/moses/FF/ExternalFeature.h - FF/FFState.cpp 1 diff --git a/contrib/other-builds/score/.cproject b/contrib/other-builds/score/.cproject index 78a5e13f9..d904122eb 100644 --- a/contrib/other-builds/score/.cproject +++ b/contrib/other-builds/score/.cproject @@ -59,7 +59,6 @@ - diff --git a/contrib/other-builds/server/.cproject b/contrib/other-builds/server/.cproject index 688221af6..78c5185f9 100644 --- a/contrib/other-builds/server/.cproject +++ b/contrib/other-builds/server/.cproject @@ -75,7 +75,6 
@@ - @@ -159,10 +158,10 @@ - + - + From df5aff2d827a976edf81c3d2b3baae9a5f27eb3f Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Sun, 24 May 2015 15:12:05 +0100 Subject: [PATCH 005/108] eclipse --- contrib/other-builds/OnDiskPt/.cproject | 8 +- contrib/other-builds/manual-label/.cproject | 132 ---------- contrib/other-builds/manual-label/.project | 27 --- contrib/other-builds/manual-label/DeEn.cpp | 46 ---- contrib/other-builds/manual-label/DeEn.h | 5 - .../manual-label/EnOpenNLPChunker.cpp | 202 ---------------- .../manual-label/EnOpenNLPChunker.h | 29 --- .../manual-label/EnPhrasalVerb.cpp | 226 ------------------ .../other-builds/manual-label/EnPhrasalVerb.h | 11 - .../manual-label/LabelByInitialLetter.cpp | 29 --- .../manual-label/LabelByInitialLetter.h | 6 - contrib/other-builds/manual-label/Main.cpp | 195 --------------- contrib/other-builds/manual-label/Main.h | 27 --- contrib/other-builds/manual-label/Makefile | 14 -- .../manual-label/manual-label.project | 131 ---------- contrib/other-builds/moses/.cproject | 4 +- contrib/other-builds/moses/.project | 10 + 17 files changed, 16 insertions(+), 1086 deletions(-) delete mode 100644 contrib/other-builds/manual-label/.cproject delete mode 100644 contrib/other-builds/manual-label/.project delete mode 100644 contrib/other-builds/manual-label/DeEn.cpp delete mode 100644 contrib/other-builds/manual-label/DeEn.h delete mode 100644 contrib/other-builds/manual-label/EnOpenNLPChunker.cpp delete mode 100644 contrib/other-builds/manual-label/EnOpenNLPChunker.h delete mode 100644 contrib/other-builds/manual-label/EnPhrasalVerb.cpp delete mode 100644 contrib/other-builds/manual-label/EnPhrasalVerb.h delete mode 100644 contrib/other-builds/manual-label/LabelByInitialLetter.cpp delete mode 100644 contrib/other-builds/manual-label/LabelByInitialLetter.h delete mode 100644 contrib/other-builds/manual-label/Main.cpp delete mode 100644 contrib/other-builds/manual-label/Main.h delete mode 100644 contrib/other-builds/manual-label/Makefile delete mode 100644 contrib/other-builds/manual-label/manual-label.project diff --git a/contrib/other-builds/OnDiskPt/.cproject b/contrib/other-builds/OnDiskPt/.cproject index f551380fd..e32a5baea 100644 --- a/contrib/other-builds/OnDiskPt/.cproject +++ b/contrib/other-builds/OnDiskPt/.cproject @@ -11,12 +11,12 @@ - - + + @@ -72,13 +72,13 @@ - - + + diff --git a/contrib/other-builds/manual-label/.cproject b/contrib/other-builds/manual-label/.cproject deleted file mode 100644 index d9297a9fc..000000000 --- a/contrib/other-builds/manual-label/.cproject +++ /dev/null @@ -1,132 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/contrib/other-builds/manual-label/.project b/contrib/other-builds/manual-label/.project deleted file mode 100644 index d0c1dba19..000000000 --- a/contrib/other-builds/manual-label/.project +++ /dev/null @@ -1,27 +0,0 @@ - - - manual-label - - - - - - org.eclipse.cdt.managedbuilder.core.genmakebuilder - clean,full,incremental, - - - - - org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder - full,incremental, - - - - - - org.eclipse.cdt.core.cnature - org.eclipse.cdt.core.ccnature - org.eclipse.cdt.managedbuilder.core.managedBuildNature - org.eclipse.cdt.managedbuilder.core.ScannerConfigNature - - diff --git a/contrib/other-builds/manual-label/DeEn.cpp b/contrib/other-builds/manual-label/DeEn.cpp deleted 
file mode 100644 index ea2934c5a..000000000 --- a/contrib/other-builds/manual-label/DeEn.cpp +++ /dev/null @@ -1,46 +0,0 @@ -#include -#include "DeEn.h" -#include "Main.h" -#include "moses/Util.h" - -using namespace std; - -extern bool g_debug; - -bool Contains(const Phrase &source, int start, int end, int factor, const string &str) -{ - for (int pos = start; pos <= end; ++pos) { - bool found = IsA(source, pos, 0, factor, str); - if (found) { - return true; - } - } - return false; -} - -void LabelDeEn(const Phrase &source, ostream &out) -{ - Ranges ranges; - - // find ranges to label - for (int start = 0; start < source.size(); ++start) { - for (int end = start; end < source.size(); ++end) { - if (IsA(source, start, -1, 1, "VAFIN") - && IsA(source, end, +1, 1, "VVINF VVPP") - && !Contains(source, start, end, 1, "VAFIN VVINF VVPP VVFIN")) { - Range range(start, end, "reorder-label"); - ranges.push_back(range); - } - else if ((start == 0 || IsA(source, start, -1, 1, "$,")) - && IsA(source, end, +1, 0, "zu") - && IsA(source, end, +2, 1, "VVINF") - && !Contains(source, start, end, 1, "$,")) { - Range range(start, end, "reorder-label"); - ranges.push_back(range); - } - } - } - - OutputWithLabels(source, ranges, out); -} - diff --git a/contrib/other-builds/manual-label/DeEn.h b/contrib/other-builds/manual-label/DeEn.h deleted file mode 100644 index c24ce0079..000000000 --- a/contrib/other-builds/manual-label/DeEn.h +++ /dev/null @@ -1,5 +0,0 @@ -#pragma once - -#include "Main.h" - -void LabelDeEn(const Phrase &source, std::ostream &out); diff --git a/contrib/other-builds/manual-label/EnOpenNLPChunker.cpp b/contrib/other-builds/manual-label/EnOpenNLPChunker.cpp deleted file mode 100644 index 538aa9746..000000000 --- a/contrib/other-builds/manual-label/EnOpenNLPChunker.cpp +++ /dev/null @@ -1,202 +0,0 @@ -/* - * EnApacheChunker.cpp - * - * Created on: 28 Feb 2014 - * Author: hieu - */ -#include -#include -#include -#include -#include -#include -#include "EnOpenNLPChunker.h" -#include "moses/Util.h" - -using namespace std; -using namespace boost::algorithm; - -EnOpenNLPChunker::EnOpenNLPChunker(const std::string &openNLPPath) -:m_openNLPPath(openNLPPath) -{ - // TODO Auto-generated constructor stub - -} - -EnOpenNLPChunker::~EnOpenNLPChunker() { - // TODO Auto-generated destructor stub -} - -void EnOpenNLPChunker::Process(std::istream &in, std::ostream &out, const vector &filterList) -{ - const boost::filesystem::path - inPath = boost::filesystem::unique_path(), - outPath = boost::filesystem::unique_path(); - // read all input to a temp file - ofstream inFile(inPath.c_str()); - - string line; - while (getline(in, line)) { - Unescape(line); - inFile << line << endl; - } - inFile.close(); - - // execute chunker - string cmd = "cat " + inPath.native() + " | " - + m_openNLPPath + "/bin/opennlp POSTagger " - + m_openNLPPath + "/models/en-pos-maxent.bin | " - + m_openNLPPath + "/bin/opennlp ChunkerME " - + m_openNLPPath + "/models/en-chunker.bin > " - + outPath.native(); - //g << "Executing:" << cmd << endl; - int ret = system(cmd.c_str()); - - // read result of chunker and output as Moses xml trees - ifstream outFile(outPath.c_str()); - - size_t lineNum = 0; - while (getline(outFile, line)) { - //cerr << line << endl; - MosesReformat(line, out, filterList); - out << endl; - ++lineNum; - } - outFile.close(); - - // clean up temporary files - remove(inPath.c_str()); - remove(outPath.c_str()); -} - -void EnOpenNLPChunker::MosesReformat(const string &line, std::ostream &out, const vector &filterList) -{ - 
//cerr << "REFORMATING:" << line << endl; - bool inLabel = false; - vector toks; - Moses::Tokenize(toks, line); - for (size_t i = 0; i < toks.size(); ++i) { - const string &tok = toks[i]; - - if (tok.substr(0, 1) == "[" && tok.substr(1,1) != "_") { - // start of chunk - string label = tok.substr(1); - if (UseLabel(label, filterList)) { - out << ""; - inLabel = true; - } - } - else if (ends_with(tok, "]")) { - // end of chunk - if (tok.size() > 1) { - if (tok.substr(1,1) == "_") { - // just a word that happens to be ] - vector factors; - Moses::Tokenize(factors, tok, "_"); - assert(factors.size() == 2); - - Escape(factors[0]); - out << factors[0] << " "; - } - else { - // a word and end of tree - string word = tok.substr(0, tok.size()-1); - - vector factors; - Moses::Tokenize(factors, word, "_"); - assert(factors.size() == 2); - - Escape(factors[0]); - out << factors[0] << " "; - } - - if (inLabel) { - out << " "; - inLabel = false; - } - } - else { - if (inLabel) { - out << " "; - inLabel = false; - } - } - - } - else { - // lexical item - vector factors; - Moses::Tokenize(factors, tok, "_"); - if (factors.size() == 2) { - Escape(factors[0]); - out << factors[0] << " "; - } - else if (factors.size() == 1) { - // word is _ - assert(tok.substr(0, 2) == "__"); - out << "_ "; - } - else { - throw "Unknown format:" + tok; - } - } - } -} - -std::string -replaceAll( std::string const& original, - std::string const& before, - std::string const& after ) -{ - std::string retval; - std::string::const_iterator end = original.end(); - std::string::const_iterator current = original.begin(); - std::string::const_iterator next = - std::search( current, end, before.begin(), before.end() ); - while ( next != end ) { - retval.append( current, next ); - retval.append( after ); - current = next + before.size(); - next = std::search( current, end, before.begin(), before.end() ); - } - retval.append( current, next ); - return retval; -} - -void EnOpenNLPChunker::Escape(string &line) -{ - line = replaceAll(line, "&", "&"); - line = replaceAll(line, "|", "|"); - line = replaceAll(line, "<", "<"); - line = replaceAll(line, ">", ">"); - line = replaceAll(line, "'", "'"); - line = replaceAll(line, "\"", """); - line = replaceAll(line, "[", "["); - line = replaceAll(line, "]", "]"); -} - -void EnOpenNLPChunker::Unescape(string &line) -{ - line = replaceAll(line, "|", "|"); - line = replaceAll(line, "<", "<"); - line = replaceAll(line, ">", ">"); - line = replaceAll(line, """, "\""); - line = replaceAll(line, "'", "'"); - line = replaceAll(line, "[", "["); - line = replaceAll(line, "]", "]"); - line = replaceAll(line, "&", "&"); -} - -bool EnOpenNLPChunker::UseLabel(const std::string &label, const std::vector &filterList) const -{ - if (filterList.size() == 0) { - return true; - } - - for (size_t i = 0; i < filterList.size(); ++i) { - if (label == filterList[i]) { - return true; - } - } - return false; -} diff --git a/contrib/other-builds/manual-label/EnOpenNLPChunker.h b/contrib/other-builds/manual-label/EnOpenNLPChunker.h deleted file mode 100644 index df9f90e42..000000000 --- a/contrib/other-builds/manual-label/EnOpenNLPChunker.h +++ /dev/null @@ -1,29 +0,0 @@ -/* - * EnApacheChunker.h - * - * Created on: 28 Feb 2014 - * Author: hieu - */ - -#pragma once - -#include -#include -#include - -class EnOpenNLPChunker { -public: - EnOpenNLPChunker(const std::string &openNLPPath); - virtual ~EnOpenNLPChunker(); - void Process(std::istream &in, std::ostream &out, const std::vector &filterList); -protected: - const 
std::string m_openNLPPath; - - void Escape(std::string &line); - void Unescape(std::string &line); - - void MosesReformat(const std::string &line, std::ostream &out, const std::vector &filterList); - - bool UseLabel(const std::string &label, const std::vector &filterList) const; -}; - diff --git a/contrib/other-builds/manual-label/EnPhrasalVerb.cpp b/contrib/other-builds/manual-label/EnPhrasalVerb.cpp deleted file mode 100644 index 4bee9b941..000000000 --- a/contrib/other-builds/manual-label/EnPhrasalVerb.cpp +++ /dev/null @@ -1,226 +0,0 @@ -#include -#include -#include -#include -#include "EnPhrasalVerb.h" -#include "moses/Util.h" - -using namespace std; - -void EnPhrasalVerb(const Phrase &source, int revision, ostream &out) -{ - Ranges ranges; - - // find ranges to label - for (int start = 0; start < source.size(); ++start) { - size_t end = std::numeric_limits::max(); - - if (IsA(source, start, 0, 0, "ask asked asking")) { - end = Found(source, start, 0, "out"); - } - else if (IsA(source, start, 0, 0, "back backed backing")) { - end = Found(source, start, 0, "up"); - } - else if (IsA(source, start, 0, 0, "blow blown blew")) { - end = Found(source, start, 0, "up"); - } - else if (IsA(source, start, 0, 0, "break broke broken")) { - end = Found(source, start, 0, "down up in"); - } - else if (IsA(source, start, 0, 0, "bring brought bringing")) { - end = Found(source, start, 0, "down up in"); - } - else if (IsA(source, start, 0, 0, "call called calling")) { - end = Found(source, start, 0, "back up off"); - } - else if (IsA(source, start, 0, 0, "check checked checking")) { - end = Found(source, start, 0, "out in"); - } - else if (IsA(source, start, 0, 0, "cheer cheered cheering")) { - end = Found(source, start, 0, "up"); - } - else if (IsA(source, start, 0, 0, "clean cleaned cleaning")) { - end = Found(source, start, 0, "up"); - } - else if (IsA(source, start, 0, 0, "cross crossed crossing")) { - end = Found(source, start, 0, "out"); - } - else if (IsA(source, start, 0, 0, "cut cutting")) { - end = Found(source, start, 0, "down off out"); - } - else if (IsA(source, start, 0, 0, "do did done")) { - end = Found(source, start, 0, "over up"); - } - else if (IsA(source, start, 0, 0, "drop dropped dropping")) { - end = Found(source, start, 0, "off"); - } - else if (IsA(source, start, 0, 0, "figure figured figuring")) { - end = Found(source, start, 0, "out"); - } - else if (IsA(source, start, 0, 0, "fill filled filling")) { - end = Found(source, start, 0, "in out up"); - } - else if (IsA(source, start, 0, 0, "find found finding")) { - end = Found(source, start, 0, "out"); - } - else if (IsA(source, start, 0, 0, "get got getting gotten")) { - end = Found(source, start, 0, "across over back"); - } - else if (IsA(source, start, 0, 0, "give given gave giving")) { - end = Found(source, start, 0, "away back out up"); - } - else if (IsA(source, start, 0, 0, "hand handed handing")) { - end = Found(source, start, 0, "down in over"); - } - else if (IsA(source, start, 0, 0, "hold held holding")) { - end = Found(source, start, 0, "back up"); - } - else if (IsA(source, start, 0, 0, "keep kept keeping")) { - end = Found(source, start, 0, "from up"); - } - else if (IsA(source, start, 0, 0, "let letting")) { - end = Found(source, start, 0, "down in"); - } - else if (IsA(source, start, 0, 0, "look looked looking")) { - end = Found(source, start, 0, "over up"); - } - else if (IsA(source, start, 0, 0, "make made making")) { - end = Found(source, start, 0, "up"); - } - else if (IsA(source, start, 0, 0, "mix mixed 
mixing")) { - end = Found(source, start, 0, "up"); - } - else if (IsA(source, start, 0, 0, "pass passed passing")) { - end = Found(source, start, 0, "out up"); - } - else if (IsA(source, start, 0, 0, "pay payed paying")) { - end = Found(source, start, 0, "back"); - } - else if (IsA(source, start, 0, 0, "pick picked picking")) { - end = Found(source, start, 0, "out"); - } - else if (IsA(source, start, 0, 0, "point pointed pointing")) { - end = Found(source, start, 0, "out"); - } - else if (IsA(source, start, 0, 0, "put putting")) { - end = Found(source, start, 0, "down off out together on"); - } - else if (IsA(source, start, 0, 0, "send sending")) { - end = Found(source, start, 0, "back"); - } - else if (IsA(source, start, 0, 0, "set setting")) { - end = Found(source, start, 0, "up"); - } - else if (IsA(source, start, 0, 0, "sort sorted sorting")) { - end = Found(source, start, 0, "out"); - } - else if (IsA(source, start, 0, 0, "switch switched switching")) { - end = Found(source, start, 0, "off on"); - } - else if (IsA(source, start, 0, 0, "take took taking")) { - end = Found(source, start, 0, "apart back off out"); - } - else if (IsA(source, start, 0, 0, "tear torn tearing")) { - end = Found(source, start, 0, "up"); - } - else if (IsA(source, start, 0, 0, "think thought thinking")) { - end = Found(source, start, 0, "over"); - } - else if (IsA(source, start, 0, 0, "thrown threw thrown throwing")) { - end = Found(source, start, 0, "away"); - } - else if (IsA(source, start, 0, 0, "turn turned turning")) { - end = Found(source, start, 0, "down off on"); - } - else if (IsA(source, start, 0, 0, "try tried trying")) { - end = Found(source, start, 0, "on out"); - } - else if (IsA(source, start, 0, 0, "use used using")) { - end = Found(source, start, 0, "up"); - } - else if (IsA(source, start, 0, 0, "warm warmed warming")) { - end = Found(source, start, 0, "up"); - } - else if (IsA(source, start, 0, 0, "work worked working")) { - end = Found(source, start, 0, "out"); - } - - // found range to label - if (end != std::numeric_limits::max() && - end > start + 1) { - bool add = true; - if (revision == 1 && Exist(source, - start + 1, - end - 1, - 1, - "VB VBD VBG VBN VBP VBZ")) { - // there's a verb in between - add = false; - } - - if (add) { - Range range(start + 1, end - 1, "reorder-label"); - ranges.push_back(range); - } - } - } - - OutputWithLabels(source, ranges, out); -} - -bool Exist(const Phrase &source, int start, int end, int factor, const std::string &str) -{ - vector soughts = Moses::Tokenize(str, " "); - for (size_t i = start; i <= end; ++i) { - const Word &word = source[i]; - bool found = Found(word, factor, soughts); - if (found) { - return true; - } - } - - return false; -} - -size_t Found(const Phrase &source, int pos, int factor, const std::string &str) -{ - const size_t MAX_RANGE = 10; - - vector soughts = Moses::Tokenize(str, " "); - vector puncts = Moses::Tokenize(". 
: , ;", " "); - - - size_t maxEnd = std::min(source.size(), (size_t) pos + MAX_RANGE); - for (size_t i = pos + 1; i < maxEnd; ++i) { - const Word &word = source[i]; - bool found; - - found = Found(word, factor, puncts); - if (found) { - return std::numeric_limits::max(); - } - - found = Found(word, factor, soughts); - if (found) { - return i; - } - } - - return std::numeric_limits::max(); -} - - -bool Found(const Word &word, int factor, const vector &soughts) -{ - const string &element = word[factor]; - for (size_t i = 0; i < soughts.size(); ++i) { - const string &sought = soughts[i]; - bool found = (element == sought); - if (found) { - return true; - } - } - return false; -} - - diff --git a/contrib/other-builds/manual-label/EnPhrasalVerb.h b/contrib/other-builds/manual-label/EnPhrasalVerb.h deleted file mode 100644 index 4cb5f7348..000000000 --- a/contrib/other-builds/manual-label/EnPhrasalVerb.h +++ /dev/null @@ -1,11 +0,0 @@ -#pragma once - -#include "Main.h" - -// roll your own identification of phrasal verbs -void EnPhrasalVerb(const Phrase &source, int revision, std::ostream &out); - -bool Exist(const Phrase &source, int start, int end, int factor, const std::string &str); -size_t Found(const Phrase &source, int pos, int factor, const std::string &str); -bool Found(const Word &word, int factor, const std::vector &soughts); - diff --git a/contrib/other-builds/manual-label/LabelByInitialLetter.cpp b/contrib/other-builds/manual-label/LabelByInitialLetter.cpp deleted file mode 100644 index e4136a7ea..000000000 --- a/contrib/other-builds/manual-label/LabelByInitialLetter.cpp +++ /dev/null @@ -1,29 +0,0 @@ -#include "LabelByInitialLetter.h" -#include "Main.h" - -using namespace std; - -void LabelByInitialLetter(const Phrase &source, std::ostream &out) -{ - Ranges ranges; - - for (int start = 0; start < source.size(); ++start) { - const string &startWord = source[start][0]; - string startChar = startWord.substr(0,1); - - for (int end = start + 1; end < source.size(); ++end) { - const string &endWord = source[end][0]; - string endChar = endWord.substr(0,1); - - if (startChar == endChar) { - Range range(start, end, startChar + "-label"); - ranges.push_back(range); - } - } - } - - OutputWithLabels(source, ranges, out); - -} - - diff --git a/contrib/other-builds/manual-label/LabelByInitialLetter.h b/contrib/other-builds/manual-label/LabelByInitialLetter.h deleted file mode 100644 index ba8d34c19..000000000 --- a/contrib/other-builds/manual-label/LabelByInitialLetter.h +++ /dev/null @@ -1,6 +0,0 @@ -#pragma once - -#include "Main.h" - -void LabelByInitialLetter(const Phrase &source, std::ostream &out); - diff --git a/contrib/other-builds/manual-label/Main.cpp b/contrib/other-builds/manual-label/Main.cpp deleted file mode 100644 index 896f70590..000000000 --- a/contrib/other-builds/manual-label/Main.cpp +++ /dev/null @@ -1,195 +0,0 @@ -#include -#include -#include -#include "moses/Util.h" -#include "Main.h" -#include "DeEn.h" -#include "EnPhrasalVerb.h" -#include "EnOpenNLPChunker.h" -#include "LabelByInitialLetter.h" - -using namespace std; - -bool g_debug = false; - -Phrase Tokenize(const string &line); - -int main(int argc, char** argv) -{ - cerr << "Starting" << endl; - - namespace po = boost::program_options; - po::options_description desc("Options"); - desc.add_options() - ("help", "Print help messages") - - ("input,i", po::value(), "Input file. Otherwise it will read from standard in") - ("output,o", po::value(), "Output file. 
Otherwise it will print from standard out") - - ("source-language,s", po::value()->required(), "Source Language") - ("target-language,t", po::value()->required(), "Target Language") - ("revision,r", po::value()->default_value(0), "Revision") - ("filter", po::value(), "Only use labels from this comma-separated list") - - ("opennlp", po::value()->default_value(""), "Path to Apache OpenNLP toolkit") - - ; - - po::variables_map vm; - try - { - po::store(po::parse_command_line(argc, argv, desc), - vm); // can throw - - /** --help option - */ - if ( vm.count("help") ) - { - std::cout << "Basic Command Line Parameter App" << std::endl - << desc << std::endl; - return EXIT_SUCCESS; - } - - po::notify(vm); // throws on error, so do after help in case - // there are any problems - } - catch(po::error& e) - { - std::cerr << "ERROR: " << e.what() << std::endl << std::endl; - std::cerr << desc << std::endl; - return EXIT_FAILURE; - } - - istream *inStrm = &cin; - if (vm.count("input")) { - string inStr = vm["input"].as(); - cerr << "inStr=" << inStr << endl; - ifstream *inFile = new ifstream(inStr.c_str()); - inStrm = inFile; - } - - ostream *outStrm = &cout; - if (vm.count("output")) { - string outStr = vm["output"].as(); - cerr << "outStr=" << outStr << endl; - ostream *outFile = new ofstream(outStr.c_str()); - outStrm = outFile; - } - - vector filterList; - if (vm.count("filter")) { - string filter = vm["filter"].as(); - Moses::Tokenize(filterList, filter, ","); - } - - string sourceLang = vm["source-language"].as(); - string targetLang = vm["target-language"].as(); - int revision = vm["revision"].as(); - - cerr << sourceLang << " " << targetLang << " " << revision << endl; - - if (sourceLang == "en" && revision == 2) { - if (vm.count("opennlp") == 0) { - throw "Need path to openNLP toolkit"; - } - - string openNLPPath = vm["opennlp"].as(); - EnOpenNLPChunker chunker(openNLPPath); - chunker.Process(*inStrm, *outStrm, filterList); - } - else { - // process line-by-line - string line; - size_t lineNum = 1; - - while (getline(*inStrm, line)) { - //cerr << lineNum << ":" << line << endl; - if (lineNum % 1000 == 0) { - cerr << lineNum << " "; - } - - Phrase source = Tokenize(line); - - if (revision == 600 ) { - LabelByInitialLetter(source, *outStrm); - } - else if (sourceLang == "de" && targetLang == "en") { - LabelDeEn(source, *outStrm); - } - else if (sourceLang == "en") { - if (revision == 0 || revision == 1) { - EnPhrasalVerb(source, revision, *outStrm); - } - else if (revision == 2) { - string openNLPPath = vm["opennlp-path"].as(); - EnOpenNLPChunker chunker(openNLPPath); - } - } - - ++lineNum; - } - } - - - cerr << "Finished" << endl; - return EXIT_SUCCESS; -} - -Phrase Tokenize(const string &line) -{ - Phrase ret; - - vector toks = Moses::Tokenize(line); - for (size_t i = 0; i < toks.size(); ++i) { - Word word = Moses::Tokenize(toks[i], "|"); - ret.push_back(word); - } - - return ret; -} - -bool IsA(const Phrase &source, int pos, int offset, int factor, const string &str) -{ - pos += offset; - if (pos >= source.size() || pos < 0) { - return false; - } - - const string &word = source[pos][factor]; - vector soughts = Moses::Tokenize(str, " "); - for (int i = 0; i < soughts.size(); ++i) { - string &sought = soughts[i]; - bool found = (word == sought); - if (found) { - return true; - } - } - return false; -} - - -void OutputWithLabels(const Phrase &source, const Ranges ranges, ostream &out) -{ - // output sentence, with labels - for (int pos = 0; pos < source.size(); ++pos) { - // output beginning of 
label - for (Ranges::const_iterator iter = ranges.begin(); iter != ranges.end(); ++iter) { - const Range &range = *iter; - if (range.range.first == pos) { - out << " "; - } - } - - const Word &word = source[pos]; - out << word[0] << " "; - - for (Ranges::const_iterator iter = ranges.begin(); iter != ranges.end(); ++iter) { - const Range &range = *iter; - if (range.range.second == pos) { - out << " "; - } - } - } - out << endl; - -} diff --git a/contrib/other-builds/manual-label/Main.h b/contrib/other-builds/manual-label/Main.h deleted file mode 100644 index 036da0d45..000000000 --- a/contrib/other-builds/manual-label/Main.h +++ /dev/null @@ -1,27 +0,0 @@ -#pragma once - -#include -#include -#include -#include - -typedef std::vector Word; -typedef std::vector Phrase; - -struct Range -{ - Range(int start,int end, const std::string &l) - :range(start, end) - ,label(l) - {} - - std::pair range; - std::string label; -}; - -typedef std::list Ranges; - -bool IsA(const Phrase &source, int pos, int offset, int factor, const std::string &str); -void OutputWithLabels(const Phrase &source, const Ranges ranges, std::ostream &out); - - diff --git a/contrib/other-builds/manual-label/Makefile b/contrib/other-builds/manual-label/Makefile deleted file mode 100644 index f24d69dc7..000000000 --- a/contrib/other-builds/manual-label/Makefile +++ /dev/null @@ -1,14 +0,0 @@ -all: manual-label - -clean: - rm -f *.o manual-label - -.cpp.o: - g++ -I../../../boost/include -I../../../ -O3 -g -c $< - -OBJECTS = DeEn.o EnOpenNLPChunker.o EnPhrasalVerb.o Main.o LabelByInitialLetter.o - -manual-label: $(OBJECTS) - g++ $(OBJECTS) -L../../../boost/lib64 -lz -lboost_program_options-mt -o manual-label - - diff --git a/contrib/other-builds/manual-label/manual-label.project b/contrib/other-builds/manual-label/manual-label.project deleted file mode 100644 index 5c678561a..000000000 --- a/contrib/other-builds/manual-label/manual-label.project +++ /dev/null @@ -1,131 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - None - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - None - - - - - - - - - - - - - - - - - diff --git a/contrib/other-builds/moses/.cproject b/contrib/other-builds/moses/.cproject index 960a13947..2fd2601c6 100644 --- a/contrib/other-builds/moses/.cproject +++ b/contrib/other-builds/moses/.cproject @@ -11,11 +11,11 @@ - + @@ -79,12 +79,12 @@ - + diff --git a/contrib/other-builds/moses/.project b/contrib/other-builds/moses/.project index 389f71297..fcc6b8948 100644 --- a/contrib/other-builds/moses/.project +++ b/contrib/other-builds/moses/.project @@ -220,6 +220,16 @@ 1 PARENT-3-PROJECT_LOC/moses/ConfusionNet.h + + ContextParameters.cpp + 1 + PARENT-3-PROJECT_LOC/moses/parameters/ContextParameters.cpp + + + ContextParameters.h + 1 + PARENT-3-PROJECT_LOC/moses/parameters/ContextParameters.h + DecodeGraph.cpp 1 From da052b7f2b05f886960dd60f175b977e6f254f5e Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Sun, 24 May 2015 16:05:14 +0100 Subject: [PATCH 006/108] Removed dependency on libcurlpp, as it was difficult to link that staticly. 
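The patch below replaces the curlpp-based bias lookup with a small boost::asio HTTP client (ug_http_client.h/.cc) plus a uri_encode() helper, so nothing has to link against libcurlpp any more. A minimal usage sketch, modelled on the test program added in this patch (the URL is a placeholder and error handling is omitted):

    #include <iostream>
    #include <string>
    #include <boost/asio.hpp>
    #include "ug_http_client.h"   // added by this patch

    int main()
    {
      boost::asio::io_service io_service;
      // The single-string constructor splits the URL into server and path.
      std::string url = "bias.example.org/lookup?context="
                        + Moses::uri_encode("some text");
      Moses::http_client c(io_service, url);
      io_service.run();                       // drives resolve -> connect -> write -> read to EOF
      std::cout << c.content() << std::endl;  // body of the HTTP response
      return 0;
    }

query_bias_server() in ug_sampling_bias.cc now follows the same pattern in place of the old curlpp::Easy request.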
--- Jamroot | 2 +- .../UG/mm/test-http-client.cc | 27 +++ .../UG/mm/test-xml-escaping.cc | 13 ++ .../TranslationModel/UG/mm/ug_http_client.cc | 200 ++++++++++++++++++ moses/TranslationModel/UG/mm/ug_http_client.h | 57 +++++ .../UG/mm/ug_sampling_bias.cc | 36 ++-- moses/TranslationModel/UG/mmsapt.cpp | 10 +- 7 files changed, 321 insertions(+), 24 deletions(-) create mode 100644 moses/TranslationModel/UG/mm/test-http-client.cc create mode 100644 moses/TranslationModel/UG/mm/test-xml-escaping.cc create mode 100644 moses/TranslationModel/UG/mm/ug_http_client.cc create mode 100644 moses/TranslationModel/UG/mm/ug_http_client.h diff --git a/Jamroot b/Jamroot index 65282ff63..119c6183e 100644 --- a/Jamroot +++ b/Jamroot @@ -89,7 +89,7 @@ if [ path.exists $(home)/moses-environment.jam ] include $(TOP)/jam-files/check-environment.jam ; # get resource locations # from environment variables include $(TOP)/jam-files/xmlrpc-c.jam ; # xmlrpc-c stuff for the server -include $(TOP)/jam-files/curlpp.jam ; # curlpp stuff for bias lookup (MMT only) +# include $(TOP)/jam-files/curlpp.jam ; # curlpp stuff for bias lookup (MMT only) # exit "done" : 0 ; diff --git a/moses/TranslationModel/UG/mm/test-http-client.cc b/moses/TranslationModel/UG/mm/test-http-client.cc new file mode 100644 index 000000000..f50f3b468 --- /dev/null +++ b/moses/TranslationModel/UG/mm/test-http-client.cc @@ -0,0 +1,27 @@ +// -*- c++ -*- +#include "ug_http_client.h" + +int main(int argc, char* argv[]) +{ + try + { + if (argc != 2) + { + std::cout << "Usage: async_client \n"; + std::cout << "Example:\n"; + std::cout << " async_client www.boost.org/LICENSE_1_0.txt\n"; + return 1; + } + + boost::asio::io_service io_service; + Moses::http_client c(io_service, argv[1]); + io_service.run(); + std::cout << c.content() << std::endl; + } + catch (std::exception& e) + { + std::cout << "Exception: " << e.what() << "\n"; + } + + return 0; +} diff --git a/moses/TranslationModel/UG/mm/test-xml-escaping.cc b/moses/TranslationModel/UG/mm/test-xml-escaping.cc new file mode 100644 index 000000000..a99471a9b --- /dev/null +++ b/moses/TranslationModel/UG/mm/test-xml-escaping.cc @@ -0,0 +1,13 @@ +#include +#include +#include +#include "ug_http_client.h" + +using namespace std; +int main() +{ + string line; + while (getline(cin,line)) + cout << Moses::uri_encode(line) << endl; +} + diff --git a/moses/TranslationModel/UG/mm/ug_http_client.cc b/moses/TranslationModel/UG/mm/ug_http_client.cc new file mode 100644 index 000000000..1d6d70edb --- /dev/null +++ b/moses/TranslationModel/UG/mm/ug_http_client.cc @@ -0,0 +1,200 @@ +#include "ug_http_client.h" +namespace Moses +{ +using boost::asio::ip::tcp; + +std::string http_client::content() const { return m_content.str(); } + +http_client:: +http_client(boost::asio::io_service& io_service, + const std::string& server, const std::string& path) + : resolver_(io_service), socket_(io_service) +{ + init(server,path); +} + +http_client:: +http_client(boost::asio::io_service& io_service, std::string url) + : resolver_(io_service), socket_(io_service) +{ + size_t p = url.find("://"); + if (p < url.size()) url.erase(0,p+3); + p = url.find("/"); + if (p < url.size()) + init(url.substr(0,p),url.substr(p)); + else + init(url,"/"); +} + +void +http_client:: +init(std::string const& server, std::string const& path) +{ + // Form the request. We specify the "Connection: close" header so + // that the server will close the socket after transmitting the + // response. 
This will allow us to treat all data up until the EOF + // as the content. + + std::ostream request_stream(&request_); + request_stream << "GET " << path << " HTTP/1.0\r\n"; + request_stream << "Host: " << server << "\r\n"; + request_stream << "Accept: */*\r\n"; + request_stream << "Connection: close\r\n\r\n"; + + // Start an asynchronous resolve to translate the server and service names + // into a list of endpoints. + tcp::resolver::query query(server, "http"); + resolver_.async_resolve(query, + boost::bind(&http_client::handle_resolve, this, + boost::asio::placeholders::error, + boost::asio::placeholders::iterator)); + +} + +void +http_client:: +handle_resolve(const boost::system::error_code& err, + tcp::resolver::iterator endpoint_iterator) +{ + if (!err) + { + // Attempt a connection to the first endpoint in the list. Each endpoint + // will be tried until we successfully establish a connection. + tcp::endpoint endpoint = *endpoint_iterator; + socket_.async_connect(endpoint, + boost::bind(&http_client::handle_connect, this, + boost::asio::placeholders::error, ++endpoint_iterator)); + } + else + { + m_error << "Error: " << err.message() << "\n"; + } +} + +void +http_client:: +handle_connect(const boost::system::error_code& err, + tcp::resolver::iterator endpoint_iterator) +{ + if (!err) + { + // The connection was successful. Send the request. + boost::asio::async_write(socket_, request_, + boost::bind(&http_client::handle_write_request, this, + boost::asio::placeholders::error)); + } + else if (endpoint_iterator != tcp::resolver::iterator()) + { + // The connection failed. Try the next endpoint in the list. + socket_.close(); + tcp::endpoint endpoint = *endpoint_iterator; + socket_.async_connect(endpoint, + boost::bind(&http_client::handle_connect, this, + boost::asio::placeholders::error, ++endpoint_iterator)); + } + else m_error << "Error: " << err.message() << "\n"; +} + +void +http_client:: +handle_write_request(const boost::system::error_code& err) +{ + using namespace boost::asio; + if (err) { m_error << "Error: " << err.message() << "\n"; return; } + + // Read the response status line. The response_ streambuf will + // automatically grow to accommodate the entire line. The growth may be + // limited by passing a maximum size to the streambuf constructor. + async_read_until(socket_, response_, "\r\n", + boost::bind(&http_client::handle_read_status_line, + this, placeholders::error)); +} + +void +http_client:: +handle_read_status_line(const boost::system::error_code& err) +{ + if (err) { m_error << "Error: " << err << "\n"; return; } + + using namespace boost::asio; + // Check that response is OK. + std::istream response_stream(&response_); + response_stream >> m_http_version >> m_status_code; + std::getline(response_stream, m_status_message); + if (!response_stream || m_http_version.substr(0, 5) != "HTTP/") + m_error << "Invalid response\n"; + else if (m_status_code != 200) + m_error << "Response returned with status code " << m_status_code << "\n"; + else // Read the response headers, which are terminated by a blank line. + async_read_until(socket_, response_, "\r\n\r\n", + boost::bind(&http_client::handle_read_headers, this, + placeholders::error)); +} + + +void +http_client:: +handle_read_headers(const boost::system::error_code& err) +{ + if (err) { m_error << "Error: " << err << "\n"; return; } + + // Process the response headers. 
+ std::istream response_stream(&response_); + std::string line; + while (std::getline(response_stream, line) && line != "\r") + m_header.push_back(line); + + // Write whatever content we already have to output. + if (response_.size() > 0) + m_content << &response_; + + using namespace boost::asio; + // Start reading remaining data until EOF. + async_read(socket_, response_, transfer_at_least(1), + boost::bind(&http_client::handle_read_content, this, + placeholders::error)); +} + +void +http_client:: +handle_read_content(const boost::system::error_code& err) +{ + using namespace boost::asio; + if(!err) + { + // Write all of the data that has been read so far. + // Then continue reading remaining data until EOF. + m_content << &response_; + async_read(socket_, response_, transfer_at_least(1), + boost::bind(&http_client::handle_read_content, this, + placeholders::error)); + } + else if (err != boost::asio::error::eof) + { + m_error << "Error: " << err << "\n"; + } +} + +std::string +uri_encode(std::string const& in) +{ + char buf[3 * in.size() + 1]; + size_t i = 0; + for (unsigned char const* c = (unsigned char const*)in.c_str(); *c; ++c) + { + // cout << *c << " " << int(*c) << endl; + if (*c == ' ') buf[i++] = '+'; + else if (*c == '.' || *c == '~' || *c == '_' || *c == '-') buf[i++] = *c; + else if (*c < '0') i += sprintf(buf+i, "%%%x", int(*c)); + else if (*c <= '9') buf[i++] = *c; + else if (*c < 'A') i += sprintf(buf+i, "%%%x", int(*c)); + else if (*c <= 'Z') buf[i++] = *c; + else if (*c < 'a') i += sprintf(buf+i, "%%%x", int(*c)); + else if (*c <= 'z') buf[i++] = *c; + else i += sprintf(buf+i, "%%%x", int(*c)); + } + buf[i] = 0; + return std::string(buf); +} + +} diff --git a/moses/TranslationModel/UG/mm/ug_http_client.h b/moses/TranslationModel/UG/mm/ug_http_client.h new file mode 100644 index 000000000..53ee258f9 --- /dev/null +++ b/moses/TranslationModel/UG/mm/ug_http_client.h @@ -0,0 +1,57 @@ +// -*- c++ -*- +// Adapted by Ulrich Germann from: +// async_client.cpp +// ~~~~~~~~~~~~~~~~ +// +// Copyright (c) 2003-2011 Christopher M. Kohlhoff (chris at kohlhoff dot com) +// +// Distributed under the Boost Software License, Version 1.0. 
(See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// +#pragma once +#include +#include +#include +#include +#include +#include +#include + +namespace Moses +{ +using boost::asio::ip::tcp; + +std::string uri_encode(std::string const& in); + +class http_client +{ + std::ostringstream m_content; + std::vector m_header; + std::string m_http_version; + unsigned int m_status_code; + std::string m_status_message; + std::ostringstream m_error; + +public: + http_client(boost::asio::io_service& io_service, std::string url); + http_client(boost::asio::io_service& io_service, + const std::string& server, const std::string& path); +private: + void init(std::string const& server, std::string const& path); + void handle_resolve(const boost::system::error_code& err, + tcp::resolver::iterator endpoint_iterator); + void handle_connect(const boost::system::error_code& err, + tcp::resolver::iterator endpoint_iterator); + void handle_write_request(const boost::system::error_code& err); + void handle_read_status_line(const boost::system::error_code& err); + void handle_read_headers(const boost::system::error_code& err); + void handle_read_content(const boost::system::error_code& err); + tcp::resolver resolver_; + tcp::socket socket_; + boost::asio::streambuf request_; + boost::asio::streambuf response_; +public: + std::string content() const; +}; + +} diff --git a/moses/TranslationModel/UG/mm/ug_sampling_bias.cc b/moses/TranslationModel/UG/mm/ug_sampling_bias.cc index ebe1ce91d..31046e178 100644 --- a/moses/TranslationModel/UG/mm/ug_sampling_bias.cc +++ b/moses/TranslationModel/UG/mm/ug_sampling_bias.cc @@ -3,11 +3,15 @@ #include #include "moses/Timer.h" -#ifdef HAVE_CURLPP -#include -#include -#include -#endif +// #ifdef HAVE_CURLPP +// #include +// #include +// #include +// #endif + +// #ifdef WITH_MMT_BIAS_CLIENT +#include "ug_http_client.h" +// #endif namespace Moses { @@ -15,21 +19,17 @@ namespace Moses { using ugdiss::id_type; -#ifdef HAVE_CURLPP + // #ifdef WITH_MMT_BIAS_CLIENT std::string query_bias_server(std::string const& url, std::string const& text) { - // communicate with the bias server; resuts will be in ... - std::ostringstream os; - curlpp::Easy myRequest; - std::string query = url+curlpp::escape(text); - myRequest.setOpt(new curlpp::options::Url(query)); - curlpp::options::WriteStream ws(&os); - myRequest.setOpt(ws); // Give it to your request - myRequest.perform(); // This will output to os - return os.str(); + std::string query = url+uri_encode(text); + boost::asio::io_service io_service; + Moses::client c(io_service, query); + io_service.run(); + return c.content(); } -#endif + // #endif DocumentBias ::DocumentBias @@ -40,13 +40,13 @@ namespace Moses : m_sid2docid(sid2doc) , m_bias(docname2docid.size(), 0) { -#ifdef HAVE_CURLPP + // #ifdef HAVE_CURLPP Timer timer; if (log) timer.start(NULL); std::string json = query_bias_server(server_url, text); init_from_json(json, docname2docid, log); if (log) *log << "Bias query took " << timer << " seconds." 
<< std::endl; -#endif + // #endif } void diff --git a/moses/TranslationModel/UG/mmsapt.cpp b/moses/TranslationModel/UG/mmsapt.cpp index 4ce775877..f05c0d59b 100644 --- a/moses/TranslationModel/UG/mmsapt.cpp +++ b/moses/TranslationModel/UG/mmsapt.cpp @@ -1,8 +1,8 @@ -#ifdef HAVE_CURLPP -#include -#include -#include -#endif +// #ifdef HAVE_CURLPP +// #include +// #include +// #include +// #endif #include "mmsapt.h" #include From c82ee9a4e9fb5727b946bb0fe0f805f8ce8c044c Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Sun, 24 May 2015 16:44:41 +0100 Subject: [PATCH 007/108] Bug fix. --- moses/TranslationModel/UG/mm/ug_sampling_bias.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/moses/TranslationModel/UG/mm/ug_sampling_bias.cc b/moses/TranslationModel/UG/mm/ug_sampling_bias.cc index 31046e178..da408dfb3 100644 --- a/moses/TranslationModel/UG/mm/ug_sampling_bias.cc +++ b/moses/TranslationModel/UG/mm/ug_sampling_bias.cc @@ -25,7 +25,7 @@ namespace Moses { std::string query = url+uri_encode(text); boost::asio::io_service io_service; - Moses::client c(io_service, query); + Moses::http_client c(io_service, query); io_service.run(); return c.content(); } From 582a845524774ecea69fed7232c3cd56fe1ed3a1 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Sun, 24 May 2015 20:04:01 +0400 Subject: [PATCH 008/108] don't use zcat --- scripts/generic/score-parallel.perl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/generic/score-parallel.perl b/scripts/generic/score-parallel.perl index e911cd4a3..81bc6f7d0 100755 --- a/scripts/generic/score-parallel.perl +++ b/scripts/generic/score-parallel.perl @@ -188,7 +188,7 @@ for (my $i = 0; $i < $fileCount; ++$i) print STDERR $cmd; if ($FlexibilityScore) { - $cmd .= "zcat $TMPDIR/phrase-table.half.$numStr.gz | $FlexibilityCmd $TMPDIR/extract.context.$i.gz"; + $cmd .= "gzip -cd $TMPDIR/phrase-table.half.$numStr.gz | $FlexibilityCmd $TMPDIR/extract.context.$i.gz"; $cmd .= " --Inverse" if ($otherExtractArgs =~ /--Inverse/); $cmd .= " --Hierarchical" if ($otherExtractArgs =~ /--Hierarchical/); $cmd .= " | $GZIP_EXEC -c > $TMPDIR/phrase-table.half.$numStr.flex.gz\n"; From f6f56d11af1868e3cf0104b6ac2fc27f65f92ed4 Mon Sep 17 00:00:00 2001 From: Rico Sennrich Date: Mon, 25 May 2015 15:50:45 +0100 Subject: [PATCH 009/108] ems: parse-relax comes last in train; do same for dev/test --- scripts/ems/experiment.meta | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta index 910c0c040..dafbe4a42 100644 --- a/scripts/ems/experiment.meta +++ b/scripts/ems/experiment.meta @@ -858,22 +858,22 @@ parse-input-devtest ignore-unless: use-mira template: $input-parser < IN > OUT parse-relax-input - in: parsed-input - out: parse-relaxed-input + in: split-input + out: input default-name: tuning/input.parse-relaxed pass-unless: input-parse-relaxer pass-if: skip-parse-input-devtesteval mock-input-parser-devtesteval template: $input-parse-relaxer < IN > OUT parse-relax-input-devtest - in: parsed-input-devtest - out: parse-relaxed-input-devtest + in: split-input-devtest + out: input-devtest default-name: tuning/input.devtest.parse-relaxed pass-unless: input-parse-relaxer pass-if: skip-parse-input-devtesteval mock-input-parser-devtesteval ignore-unless: use-mira template: $input-parse-relaxer < IN > OUT factorize-input - in: parse-relaxed-input + in: parsed-input out: factorized-input default-name: tuning/input.factorized rerun-on-change: 
TRAINING:input-factors @@ -881,7 +881,7 @@ factorize-input error: can't open error: incompatible number of words in factor factorize-input-devtest - in: parse-relaxed-input-devtest + in: parsed-input-devtest out: factorized-input-devtest default-name: tuning/input.devtest.factorized rerun-on-change: TRAINING:input-factors @@ -934,14 +934,14 @@ truecase-input-devtest template: $input-truecaser -model IN1.$input-extension < IN > OUT split-input in: truecased-input SPLITTER:splitter-model - out: input + out: split-input rerun-on-change: input-splitter default-name: tuning/input.split pass-unless: input-splitter template: $input-splitter -model IN1.$input-extension < IN > OUT split-input-devtest in: truecased-input-devtest SPLITTER:splitter-model - out: input-devtest + out: split-input-devtest rerun-on-change: input-splitter default-name: tuning/input.devtest.split pass-unless: input-splitter @@ -1148,14 +1148,14 @@ parse-input pass-if: skip-parse-input-devtesteval mock-input-parser-devtesteval template: $input-parser < IN > OUT parse-relax-input - in: parsed-input - out: parse-relaxed-input + in: split-input + out: input default-name: evaluation/input.parse-relaxed pass-unless: input-parse-relaxer pass-if: skip-parse-input-devtesteval mock-input-parser-devtesteval template: $input-parse-relaxer < IN > OUT factorize-input - in: parse-relaxed-input + in: parsed-input out: factorized-input default-name: evaluation/input.factorized rerun-on-change: TRAINING:input-factors @@ -1187,7 +1187,7 @@ truecase-input template: $input-truecaser -model IN1.$input-extension < IN > OUT split-input in: truecased-input SPLITTER:splitter-model - out: input + out: split-input default-name: evaluation/input.split pass-unless: input-splitter template: $input-splitter -model IN1.$input-extension < IN > OUT From ea9b097aba6ac422d346a8766ef27607bd31e787 Mon Sep 17 00:00:00 2001 From: Jeroen Vermeulen Date: Tue, 26 May 2015 15:06:04 +0700 Subject: [PATCH 010/108] =?UTF-8?q?OutputFileStream:=20accept=20=E2=80=98-?= =?UTF-8?q?=E2=80=99=20for=20=E2=80=9Cstdout=E2=80=9D.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is a common convention: when a program gets a dash as the path of a file that it should write, it writes to standard output instead. Enhances portability to systems that don't have /dev/stdout. 
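A hypothetical caller illustrating the behaviour introduced by the diff that follows (this snippet is not part of the patch): the same code path now writes gzip-compressed output for "*.gz" names, a plain file otherwise, and standard output when the name is just "-".

    #include <stdexcept>
    #include <string>
    #include "OutputFileStream.h"

    void write_lines(const std::string &path)
    {
      Moses::OutputFileStream out;
      // path may be "table.txt", "table.gz" (compressed), or "-" (stdout).
      if (!out.Open(path)) {
        throw std::runtime_error("cannot open " + path);
      }
      out << "a line of output" << '\n';
      out.Close();   // flushes; the stream can be opened again afterwards
    }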
--- phrase-extract/OutputFileStream.cpp | 45 +++++++++++++++++------------ phrase-extract/OutputFileStream.h | 35 ++++++++++++++++++++-- 2 files changed, 60 insertions(+), 20 deletions(-) diff --git a/phrase-extract/OutputFileStream.cpp b/phrase-extract/OutputFileStream.cpp index 15c2bd73e..d7874b06f 100644 --- a/phrase-extract/OutputFileStream.cpp +++ b/phrase-extract/OutputFileStream.cpp @@ -19,6 +19,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ***********************************************************************/ +#include #include #include #include "OutputFileStream.h" @@ -32,11 +33,13 @@ namespace Moses OutputFileStream::OutputFileStream() :boost::iostreams::filtering_ostream() ,m_outFile(NULL) + ,m_open(false) { } OutputFileStream::OutputFileStream(const std::string &filePath) - : m_outFile(NULL) + :m_outFile(NULL) + ,m_open(false) { Open(filePath); } @@ -48,32 +51,38 @@ OutputFileStream::~OutputFileStream() bool OutputFileStream::Open(const std::string &filePath) { - m_outFile = new ofstream(filePath.c_str(), ios_base::out | ios_base::binary); - if (m_outFile->fail()) { - return false; + assert(!m_open); + if (filePath == std::string("-")) { + // Write to standard output. Leave m_outFile null. + this->push(std::cout); + } else { + m_outFile = new ofstream(filePath.c_str(), ios_base::out | ios_base::binary); + if (m_outFile->fail()) { + return false; + } + + if (ends_with(filePath, ".gz")) { + this->push(boost::iostreams::gzip_compressor()); + } + this->push(*m_outFile); } - if (ends_with(filePath, ".gz")) { - this->push(boost::iostreams::gzip_compressor()); - } - this->push(*m_outFile); - + m_open = true; return true; } void OutputFileStream::Close() { - if (m_outFile == NULL) { - return; - } - + if (!m_open) return; this->flush(); - this->pop(); // file + if (m_outFile) { + this->pop(); // file - m_outFile->close(); - delete m_outFile; - m_outFile = NULL; - return; + m_outFile->close(); + delete m_outFile; + m_outFile = NULL; + } + m_open = false; } diff --git a/phrase-extract/OutputFileStream.h b/phrase-extract/OutputFileStream.h index f52e36d76..b77741a73 100644 --- a/phrase-extract/OutputFileStream.h +++ b/phrase-extract/OutputFileStream.h @@ -30,19 +30,50 @@ namespace Moses { -/** Used in place of std::istream, can read zipped files if it ends in .gz +/** Version of std::ostream with transparent compression. + * + * Transparently compresses output when writing to a file whose name ends in + * ".gz". Or, writes to stdout instead of a file when given a filename + * consisting of just a dash ("-"). */ class OutputFileStream : public boost::iostreams::filtering_ostream { -protected: +private: + /** File that needs flushing & closing when we close this stream. + * + * Is NULL when no file is opened, e.g. when writing to standard output. + */ std::ofstream *m_outFile; + + /// Is this stream open? + bool m_open; + public: + /** Create an unopened OutputFileStream. + * + * Until it's been opened, nothing can be done with this stream. + */ OutputFileStream(); + /// Create an OutputFileStream, and open it by calling Open(). OutputFileStream(const std::string &filePath); virtual ~OutputFileStream(); + // TODO: Can we please just always throw an exception when this fails? + /** Open stream. + * + * If filePath is "-" (just a dash), this opens the stream for writing to + * standard output. Otherwise, it opens the given file. If the filename + * has the ".gz" suffix, output will be transparently compressed. + * + * Call Close() to close the file. 
+ * + * Returns whether opening the file was successful. It may also throw an + * exception on failure. + */ bool Open(const std::string &filePath); + + /// Flush and close stream. After this, the stream can be opened again. void Close(); }; From c086a8ee5054f5fc56298736f1e4ca2ba441c51b Mon Sep 17 00:00:00 2001 From: Phil Williams Date: Tue, 26 May 2015 16:44:13 +0100 Subject: [PATCH 011/108] Add a wrapper script for parsing English text with SENNA --- scripts/training/wrappers/parse-en-senna.perl | 149 ++++++++++++++++++ scripts/training/wrappers/senna2brackets.py | 98 ++++++++++++ 2 files changed, 247 insertions(+) create mode 100755 scripts/training/wrappers/parse-en-senna.perl create mode 100755 scripts/training/wrappers/senna2brackets.py diff --git a/scripts/training/wrappers/parse-en-senna.perl b/scripts/training/wrappers/parse-en-senna.perl new file mode 100755 index 000000000..f271633ea --- /dev/null +++ b/scripts/training/wrappers/parse-en-senna.perl @@ -0,0 +1,149 @@ +#!/usr/bin/env perl + +use strict; +use warnings; + +use autodie; +use FindBin qw($RealBin); +use Getopt::Long "GetOptions"; + +my ($SENNA, + $SENNA_DIR, + $SENNA_OPTIONS, + $SPLIT_HYPHEN, + $SPLIT_SLASH, + $MARK_SPLIT, + $BINARIZE, + $UNPARSEABLE, + $RAW_IN, + $RAW_OUT); + +$UNPARSEABLE = 0; + +die("ERROR: syntax is: parse-en-senna.perl [-senna-options OPTIONS] [-split-hyphen] [-split-slash] [-mark-split] [-binarize] [-unparseable] [-raw-in PATH] [-raw-out PATH] -senna PATH -senna-dir PATH < in > out\n") + unless &GetOptions + ('senna=s' => \$SENNA, + 'senna-dir=s' => \$SENNA_DIR, + 'senna-options=s' => \$SENNA_OPTIONS, + 'split-hyphen' => \$SPLIT_HYPHEN, + 'split-slash' => \$SPLIT_SLASH, + 'mark-split' => \$MARK_SPLIT, + 'binarize' => \$BINARIZE, + 'unparseable' => \$UNPARSEABLE, + 'raw-in=s' => \$RAW_IN, + 'raw-out=s' => \$RAW_OUT + ) + && defined($SENNA); + +die("ERROR: file not found or not executable: '$SENNA'\n") unless -x $SENNA; +die("ERROR: could not find SENNA directory: '$SENNA_DIR'\n") unless -d $SENNA_DIR; + +# Step 1: Read standard input and write two temporary files: +# +# $tmpOriginal Contains a copy of the input as-is +# +# $tmpProcessed Contains a copy of the input after pre-processing ready +# for input to SENNA + +my $tmpOriginal = "/tmp/parse-en-senna.1.$$"; +my $tmpProcessed = "/tmp/parse-en-senna.2.$$"; + +open(TMP_ORIGINAL, ">$tmpOriginal"); + +open(TMP_PROCESSED, + "| $RealBin/../../tokenizer/deescape-special-chars.perl > $tmpProcessed;"); + +while() { + print TMP_ORIGINAL $_; + + # If the line is longer than 1023 bytes (including the newline) then replace + # it with "SENTENCE_TOO_LONG\n". This is because SENNA reads lines into a + # 1024 character array and if a line is longer than 1023 characters then it + # gets read in stages and treated as multiple input lines. + my $num_bytes; + { + use bytes; + $num_bytes = length($_); + } + if ($num_bytes > 1023) { + print TMP_PROCESSED "SENTENCE_TOO_LONG\n"; + next; + } + + # Replace "-LRB-", "-RRB-", etc. with "(", ")", etc. + s/-LRB-/(/g; + s/-RRB-/)/g; + s/-LSB-/[/g; + s/-RSB-/]/g; + s/-LCB-/{/g; + s/-RCB-/}/g; + + # Unsplit hyphens. + s/ \@-\@ /-/g if $SPLIT_HYPHEN; + # Unsplit slashes. + s/ \@\/\@ /\//g if $SPLIT_SLASH; + + print TMP_PROCESSED $_; +} + +close(TMP_ORIGINAL); +close(TMP_PROCESSED); + +# Step 2: Parse $tmpProcessed then pass the raw output through a post-processing +# pipeline. + +my $pipeline = ""; + +# Stage 1: Parse input (unless given pre-parsed input via -raw-in option). 
+if (defined($RAW_IN)) { + $pipeline .= "cat \"$RAW_IN\" |"; +} else { + $pipeline .= "cat $tmpProcessed |"; + my $path = $SENNA_DIR; + # SENNA requires -path's argument to end with a slash. + if ($path !~ /\/$/) { + $path .= "/"; + } + $pipeline .= " $SENNA -path $path -usrtokens"; + $pipeline .= " $SENNA_OPTIONS" if defined($SENNA_OPTIONS); + $pipeline .= " |"; +} + +if (defined($RAW_OUT)) { + $pipeline .= " tee \"$RAW_OUT\" |"; +} + +# Stage 2: Convert SENNA output to Moses XML (via Berkeley output format) +$pipeline .= " $RealBin/senna2brackets.py --berkeley-style |"; +$pipeline .= " $RealBin/berkeleyparsed2mosesxml.perl |"; + +# Stage 3: Re-split hyphens / slashes. +if ($SPLIT_HYPHEN) { + $pipeline .= " $RealBin/syntax-hyphen-splitting.perl"; + $pipeline .= " -binarize" if $BINARIZE; + $pipeline .= " -mark-split" if $MARK_SPLIT; + $pipeline .= " |"; +} +if ($SPLIT_SLASH) { + $pipeline .= " $RealBin/syntax-hyphen-splitting.perl -slash"; + $pipeline .= " -binarize" if $BINARIZE; + $pipeline .= " -mark-split" if $MARK_SPLIT; + $pipeline .= " |"; +} + +# Run the parsing + post-processing pipeline. +open(PARSE, $pipeline); +open(TMP_ORIGINAL, $tmpOriginal); +while () { + my $parsedLine = $_; + my $originalLine = ; + if ($UNPARSEABLE == 1 && length($parsedLine) == 1) { + print $originalLine; + } else { + print $parsedLine; + } +} +close(PARSE); + +`rm $tmpOriginal`; +`rm $tmpProcessed`; diff --git a/scripts/training/wrappers/senna2brackets.py b/scripts/training/wrappers/senna2brackets.py new file mode 100755 index 000000000..28fa6d2d7 --- /dev/null +++ b/scripts/training/wrappers/senna2brackets.py @@ -0,0 +1,98 @@ +#!/usr/bin/env python + +# Read SENNA output (from stdin), extract the parse trees, and write them in +# PTB-style bracketed format (to stdout). +# +# The SENNA output is assumed to contain tokens in the first column, POS tags +# in the second column, and PSG fragments in the final column. +# +# It is also assumed that SENNA was run through the parse-en-senna.perl wrapper, +# which: +# +# - Substitutes the special "SENTENCE_TOO_LONG" token for sentences that +# exceed SENNA's hardcoded limit. +# +# - Replaces the bracket-like tokens "-LRB-", "-RRB-", etc. with "(", ")", +# etc. + +import optparse +import os +import sys + +def main(): + usage = "usage: %prog [options]" + parser = optparse.OptionParser(usage=usage) + parser.add_option("--berkeley-style", action="store_true", default=False, + dest="berkeley", + help="mimic the Berkeley Parser's output format") + (options, args) = parser.parse_args() + if len(args) > 0: + parser.error("incorrect number of arguments") + + tree = "" + for line in sys.stdin: + if line.strip() == "": + if not balanced(tree): + warn("unbalanced parentheses at line %d: " + "discarding tree" % line_num) + tree = "" + if tree == "" and options.berkeley: + print "(())" + else: + tree = beautify(tree) + if options.berkeley: + tree = berkelify(tree) + print tree + tree = "" + continue + tokens = line.split() + word, pos, frag = tokens[0], tokens[1], tokens[-1] + # Check for the special "SENTENCE_TOO_LONG" token (see + # parse-en-senna.perl) + if word == "SENTENCE_TOO_LONG": + continue + # Restore -LRB-, -RRB-, etc. 
+ if word == "(": + word = "-LRB-" + elif word == ")": + word = "-RRB-" + elif word == "[": + word = "-LSB-" + elif word == "]": + word = "-RSB-" + elif word == "{": + word = "-LCB-" + elif word == "}": + word = "-RCB-" + tree += frag.replace("*", "(%s %s)" % (pos, word)) + +def balanced(s): + num_left = 0 + num_right = 0 + for char in s: + if char == "(": + num_left += 1 + elif char == ")": + num_right += 1 + return num_left == num_right + +def beautify(tree): + s = tree.replace("(", " (") + return s.strip() + +def berkelify(tree): + if len(tree) == 0: + return tree + assert tree[0] == "(" + pos = tree.find(" (", 1) + assert pos != -1 + old_root = tree[1:pos] + return tree.replace(old_root, "TOP") + +def warn(msg): + prog_name = os.path.basename(sys.argv[0]) + sys.stderr.write("%s: warning: %s" % (prog_name, msg)) + sys.exit(1) + +if __name__ == "__main__": + main() From 842fc9780e8f86aea076cc9de39d4039039da904 Mon Sep 17 00:00:00 2001 From: Phil Williams Date: Wed, 27 May 2015 20:33:43 +0100 Subject: [PATCH 012/108] senna2brackets.py: bug fixes + clean-up --- scripts/training/wrappers/senna2brackets.py | 23 ++++++++++----------- 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/scripts/training/wrappers/senna2brackets.py b/scripts/training/wrappers/senna2brackets.py index 28fa6d2d7..4fc71ed44 100755 --- a/scripts/training/wrappers/senna2brackets.py +++ b/scripts/training/wrappers/senna2brackets.py @@ -30,19 +30,19 @@ def main(): parser.error("incorrect number of arguments") tree = "" + line_num = 0 for line in sys.stdin: + line_num += 1 + # Check for a blank line (the sentence delimiter). if line.strip() == "": if not balanced(tree): - warn("unbalanced parentheses at line %d: " + warn("unbalanced parentheses in tree ending at line %d: " "discarding tree" % line_num) tree = "" - if tree == "" and options.berkeley: - print "(())" - else: - tree = beautify(tree) - if options.berkeley: - tree = berkelify(tree) - print tree + tree = beautify(tree) + if options.berkeley: + tree = berkelify(tree) + print tree tree = "" continue tokens = line.split() @@ -81,8 +81,8 @@ def beautify(tree): return s.strip() def berkelify(tree): - if len(tree) == 0: - return tree + if tree == "": + return "(())" assert tree[0] == "(" pos = tree.find(" (", 1) assert pos != -1 @@ -91,8 +91,7 @@ def berkelify(tree): def warn(msg): prog_name = os.path.basename(sys.argv[0]) - sys.stderr.write("%s: warning: %s" % (prog_name, msg)) - sys.exit(1) + sys.stderr.write("%s: warning: %s\n" % (prog_name, msg)) if __name__ == "__main__": main() From 7ff1f9c06370ba10eb7951b86002fde171e97b7f Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Wed, 27 May 2015 20:45:55 +0100 Subject: [PATCH 013/108] Option bundling. 
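The patch below starts moving closely related decoder switches out of StaticData into small self-initialising bundles (NBestOptions, BookkeepingOptions), each of which parses only the parameters it owns. Reduced to its essentials, the pattern looks like this sketch (names follow the patch; it is not a copy of it):

    #include <cstddef>
    #include <string>

    namespace Moses {

    class Parameter;   // parsed command line / moses.ini, as in the patch

    // One bundle per concern; StaticData keeps one member of each type.
    struct NBestOptions
    {
      std::size_t nbest_size;
      std::size_t factor;
      bool enabled;        // true once an n-best list (or MBR, search-graph
                           // output, lattice samples, ...) is actually requested
      bool only_distinct;
      std::string output_file_path;

      bool init(Parameter const& param);  // reads -n-best-list, -n-best-factor, ...
    };

    } // namespace Moses

    // StaticData then forwards its accessors, e.g.
    //   bool IsNBestEnabled() const { return m_nbest_options.enabled; }
    // instead of re-deriving the answer from half a dozen separate members.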
--- moses/StaticData.cpp | 71 +++++++------------- moses/StaticData.h | 87 +++++++++++++++---------- moses/parameters/BookkeepingOptions.cpp | 18 +++++ moses/parameters/BookkeepingOptions.h | 15 +++++ moses/parameters/NBestOptions.cpp | 40 ++++++++++++ moses/parameters/NBestOptions.h | 29 +++++++++ 6 files changed, 179 insertions(+), 81 deletions(-) create mode 100644 moses/parameters/BookkeepingOptions.cpp create mode 100644 moses/parameters/BookkeepingOptions.h create mode 100644 moses/parameters/NBestOptions.cpp create mode 100644 moses/parameters/NBestOptions.h diff --git a/moses/StaticData.cpp b/moses/StaticData.cpp index 9cf97657a..ac0c3c990 100644 --- a/moses/StaticData.cpp +++ b/moses/StaticData.cpp @@ -63,8 +63,8 @@ StaticData::StaticData() : m_sourceStartPosMattersForRecombination(false) , m_requireSortingAfterSourceContext(false) , m_inputType(SentenceInput) - , m_onlyDistinctNBest(false) - , m_needAlignmentInfo(false) + // , m_onlyDistinctNBest(false) + // , m_needAlignmentInfo(false) , m_lmEnableOOVFeature(false) , m_isAlwaysCreateDirectTranslationOption(false) , m_currentWeightSetting("default") @@ -203,25 +203,26 @@ StaticData //word-to-word alignment // alignments m_parameter->SetParameter(m_PrintAlignmentInfo, "print-alignment-info", false ); - if (m_PrintAlignmentInfo) { - m_needAlignmentInfo = true; - } + + // if (m_PrintAlignmentInfo) { // => now in BookkeepingOptions::init() + // m_needAlignmentInfo = true; + // } m_parameter->SetParameter(m_wordAlignmentSort, "sort-word-alignment", NoSort); - if (m_PrintAlignmentInfoNbest) { - m_needAlignmentInfo = true; - } + // if (m_PrintAlignmentInfoNbest) { // => now in BookkeepingOptions::init() + // m_needAlignmentInfo = true; + // } params = m_parameter->GetParam("alignment-output-file"); if (params && params->size()) { m_alignmentOutputFile = Scan(params->at(0)); - m_needAlignmentInfo = true; + // m_needAlignmentInfo = true; // => now in BookkeepingOptions::init() } m_parameter->SetParameter( m_PrintID, "print-id", false ); m_parameter->SetParameter( m_PrintPassthroughInformation, "print-passthrough", false ); - m_parameter->SetParameter( m_PrintPassthroughInformationInNBest, "print-passthrough-in-n-best", false ); + // m_parameter->SetParameter( m_PrintPassthroughInformationInNBest, "print-passthrough-in-n-best", false ); // => now in BookkeepingOptions::init() // word graph params = m_parameter->GetParam("output-word-graph"); @@ -327,41 +328,7 @@ bool StaticData ::ini_nbest_options() { - const PARAM_VEC *params; - // n-best - params = m_parameter->GetParam("n-best-list"); - if (params) { - if (params->size() >= 2) { - m_nBestFilePath = params->at(0); - m_nBestSize = Scan( params->at(1) ); - m_onlyDistinctNBest=(params->size()>2 && params->at(2)=="distinct"); - } else { - std::cerr << "wrong format for switch -n-best-list file size [disinct]"; - return false; - } - } else { - m_nBestSize = 0; - } - - m_parameter->SetParameter(m_nBestFactor, "n-best-factor", 20); - - - m_parameter->SetParameter(m_PrintAlignmentInfoNbest, - "print-alignment-info-in-n-best", false ); - - // include feature names in the n-best list - m_parameter->SetParameter(m_labeledNBestList, "labeled-n-best-list", true ); - - // include word alignment in the n-best list - m_parameter->SetParameter(m_nBestIncludesSegmentation, - "include-segmentation-in-n-best", false ); - - // print all factors of output translations - m_parameter->SetParameter(m_reportAllFactorsNBest, - "report-all-factors-in-n-best", false ); - - 
m_parameter->SetParameter(m_printNBestTrees, "n-best-trees", false ); - return true; + return m_nbest_options.init(*m_parameter); } void @@ -625,8 +592,9 @@ bool StaticData::LoadData(Parameter *parameter) // input, output ini_factor_maps(); ini_input_options(); + m_bookkeeping_options.init(*parameter); + m_nbest_options.init(*parameter); // if (!ini_nbest_options()) return false; if (!ini_output_options()) return false; - if (!ini_nbest_options()) return false; // threading etc. if (!ini_performance_options()) return false; @@ -647,6 +615,17 @@ bool StaticData::LoadData(Parameter *parameter) ini_mira_options(); + // set m_nbest_options.enabled = true if necessary: + if (m_mbr || m_useLatticeMBR || m_outputSearchGraph || m_outputSearchGraphSLF + || m_mira || m_outputSearchGraphHypergraph || m_useConsensusDecoding +#ifdef HAVE_PROTOBUF + || m_outputSearchGraphPB +#endif + || m_latticeSamplesFilePath.size()) + { + m_nbest_options.enabled = true; + } + // S2T decoder m_parameter->SetParameter(m_s2tParsingAlgorithm, "s2t-parsing-algorithm", RecursiveCYKPlus); diff --git a/moses/StaticData.h b/moses/StaticData.h index 7e71f0881..2b1e37b83 100644 --- a/moses/StaticData.h +++ b/moses/StaticData.h @@ -45,6 +45,8 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #include "moses/PP/Factory.h" #include "moses/parameters/ContextParameters.h" +#include "moses/parameters/NBestOptions.h" +#include "moses/parameters/BookkeepingOptions.h" namespace Moses { @@ -95,18 +97,21 @@ protected: // 0 = no disortion (monotone in old pharaoh) bool m_reorderingConstraint; //! use additional reordering constraints bool m_useEarlyDistortionCost; - size_t - m_maxHypoStackSize //! hypothesis-stack size that triggers pruning - , m_minHypoStackDiversity //! minimum number of hypothesis in stack for each source word coverage - , m_nBestSize - , m_latticeSamplesSize - , m_nBestFactor - , m_maxNoTransOptPerCoverage - , m_maxNoPartTransOpt - , m_maxPhraseLength; + size_t m_maxHypoStackSize; //! hypothesis-stack size that triggers pruning + size_t m_minHypoStackDiversity; //! minimum number of hypothesis in stack for each source word coverage; + NBestOptions m_nbest_options; + BookkeepingOptions m_bookkeeping_options; + // size_t m_nBestSize; + // size_t m_nBestFactor; + + size_t m_latticeSamplesSize; + size_t m_maxNoTransOptPerCoverage; + size_t m_maxNoPartTransOpt; + size_t m_maxPhraseLength; - std::string m_nBestFilePath, m_latticeSamplesFilePath; - bool m_labeledNBestList,m_nBestIncludesSegmentation; + // std::string m_nBestFilePath; + std::string m_latticeSamplesFilePath; + // bool m_labeledNBestList,m_nBestIncludesSegmentation; bool m_dropUnknown; //! false = treat unknown words as unknowns, and translate them as themselves; true = drop (ignore) them bool m_markUnknown; //! 
false = treat unknown words as unknowns, and translate them as themselves; true = mark and (ignore) them bool m_wordDeletionEnabled; @@ -128,21 +133,21 @@ protected: bool m_reportSegmentation; bool m_reportSegmentationEnriched; bool m_reportAllFactors; - bool m_reportAllFactorsNBest; + // bool m_reportAllFactorsNBest; std::string m_detailedTranslationReportingFilePath; std::string m_detailedTreeFragmentsTranslationReportingFilePath; //DIMw std::string m_detailedAllTranslationReportingFilePath; - bool m_onlyDistinctNBest; + // bool m_onlyDistinctNBest; bool m_PrintAlignmentInfo; - bool m_needAlignmentInfo; - bool m_PrintAlignmentInfoNbest; + // bool m_needAlignmentInfo; // => BookkeepingOptions + // bool m_PrintAlignmentInfoNbest; bool m_PrintID; bool m_PrintPassthroughInformation; - bool m_PrintPassthroughInformationInNBest; + // bool m_PrintPassthroughInformationInNBest; std::string m_alignmentOutputFile; @@ -214,7 +219,7 @@ protected: bool m_useLegacyPT; bool m_defaultNonTermOnlyForEmptyRange; S2TParsingAlgorithm m_s2tParsingAlgorithm; - bool m_printNBestTrees; + // bool m_printNBestTrees; FeatureRegistry m_registry; PhrasePropertyFactory m_phrasePropertyFactory; @@ -361,7 +366,8 @@ public: return m_PrintPassthroughInformation; } bool IsPassthroughInNBestEnabled() const { - return m_PrintPassthroughInformationInNBest; + return m_nbest_options.include_passthrough; + // return m_PrintPassthroughInformationInNBest; } int GetMaxDistortion() const { return m_maxDistortion; @@ -410,7 +416,8 @@ public: return m_reportAllFactors; } bool GetReportAllFactorsNBest() const { - return m_reportAllFactorsNBest; + return m_nbest_options.include_all_factors; + // return m_reportAllFactorsNBest; } bool IsDetailedTranslationReportingEnabled() const { return !m_detailedTranslationReportingFilePath.empty(); @@ -430,7 +437,8 @@ public: return m_detailedTreeFragmentsTranslationReportingFilePath; } bool IsLabeledNBestList() const { - return m_labeledNBestList; + return m_nbest_options.include_feature_labels; + // return m_labeledNBestList; } bool UseMinphrInMemory() const { @@ -443,21 +451,24 @@ public: // for mert size_t GetNBestSize() const { - return m_nBestSize; + return m_nbest_options.nbest_size; + // return m_nBestSize; } const std::string &GetNBestFilePath() const { - return m_nBestFilePath; + return m_nbest_options.output_file_path; + // return m_nBestFilePath; } bool IsNBestEnabled() const { - return (!m_nBestFilePath.empty() || m_mbr || m_useLatticeMBR || m_mira || - m_outputSearchGraph || m_outputSearchGraphSLF || - m_outputSearchGraphHypergraph || m_useConsensusDecoding || -#ifdef HAVE_PROTOBUF - m_outputSearchGraphPB || -#endif - !m_latticeSamplesFilePath.empty()); + return m_nbest_options.enabled; + // return (!m_nBestFilePath.empty() || m_mbr || m_useLatticeMBR || m_mira || + // m_outputSearchGraph || m_outputSearchGraphSLF || + // m_outputSearchGraphHypergraph || m_useConsensusDecoding || + // #ifdef HAVE_PROTOBUF + // m_outputSearchGraphPB || + // #endif + // !m_latticeSamplesFilePath.empty()); } size_t GetLatticeSamplesSize() const { @@ -469,7 +480,8 @@ public: } size_t GetNBestFactor() const { - return m_nBestFactor; + return m_nbest_options.factor; + // return m_nBestFactor; } bool GetOutputWordGraph() const { return m_outputWordGraph; @@ -527,7 +539,8 @@ public: void SetWeights(const FeatureFunction* sp, const std::vector& weights); bool GetDistinctNBest() const { - return m_onlyDistinctNBest; + return m_nbest_options.only_distinct; + // return m_onlyDistinctNBest; } const std::string& 
GetFactorDelimiter() const { return m_factorDelimiter; @@ -692,7 +705,8 @@ public: const std::string &GetBinDirectory() const; bool NeedAlignmentInfo() const { - return m_needAlignmentInfo; + return m_bookkeeping_options.need_alignment_info; + // return m_needAlignmentInfo; } const std::string &GetAlignmentOutputFile() const { return m_alignmentOutputFile; @@ -701,14 +715,16 @@ public: return m_PrintAlignmentInfo; } bool PrintAlignmentInfoInNbest() const { - return m_PrintAlignmentInfoNbest; + return m_nbest_options.include_alignment_info; + // return m_PrintAlignmentInfoNbest; } WordAlignmentSort GetWordAlignmentSort() const { return m_wordAlignmentSort; } bool NBestIncludesSegmentation() const { - return m_nBestIncludesSegmentation; + return m_nbest_options.include_segmentation; + // return m_nBestIncludesSegmentation; } bool GetHasAlternateWeightSettings() const { @@ -849,7 +865,8 @@ public: } bool PrintNBestTrees() const { - return m_printNBestTrees; + return m_nbest_options.print_trees; + // return m_printNBestTrees; } bool RequireSortingAfterSourceContext() const { diff --git a/moses/parameters/BookkeepingOptions.cpp b/moses/parameters/BookkeepingOptions.cpp new file mode 100644 index 000000000..875c605bf --- /dev/null +++ b/moses/parameters/BookkeepingOptions.cpp @@ -0,0 +1,18 @@ +#include "BookkeepingOptions.h" + +namespace Moses { + bool + BookkeepingOptions:: + init(Parameter const& P) + { + bool& x = need_alignment_info; + P.SetParameter(x, "print-alignment-info", false); + if (!x) P.SetParameter(x, "print-alignment-info-in-n-best", false); + if (!x) + { + PARAM_VEC const* params = P.GetParam("alignment-output-file"); + x = params && params->size(); + } + return true; + } +} diff --git a/moses/parameters/BookkeepingOptions.h b/moses/parameters/BookkeepingOptions.h new file mode 100644 index 000000000..8e800c587 --- /dev/null +++ b/moses/parameters/BookkeepingOptions.h @@ -0,0 +1,15 @@ +// -*- mode: c++; cc-style: gnu -*- +#include "moses/Parameter.h" +// #include + +namespace Moses { + + struct BookkeepingOptions + { + bool need_alignment_info; + bool init(Parameter const& param); + }; + + + +} diff --git a/moses/parameters/NBestOptions.cpp b/moses/parameters/NBestOptions.cpp new file mode 100644 index 000000000..6ec97c91b --- /dev/null +++ b/moses/parameters/NBestOptions.cpp @@ -0,0 +1,40 @@ +// -*- mode: c++; cc-style: gnu -*- +#include "moses/Parameter.h" +#include "NBestOptions.h" + +namespace Moses { + +bool +NBestOptions:: +init(Parameter const& P) +{ + const PARAM_VEC *params; + params = P.GetParam("n-best-list"); + if (params) + { + if (params->size() >= 2) + { + output_file_path = params->at(0); + nbest_size = Scan( params->at(1) ); + only_distinct = (params->size()>2 && params->at(2)=="distinct"); + } + else + { + std::cerr << "wrong format for switch -n-best-list file size [disinct]"; + return false; + } + } + else nbest_size = 0; + + P.SetParameter(factor, "n-best-factor", 20); + P.SetParameter(include_alignment_info, "print-alignment-info-in-n-best", false ); + P.SetParameter(include_feature_labels, "labeled-n-best-list", true ); + P.SetParameter(include_segmentation, "include-segmentation-in-n-best", false ); + P.SetParameter(include_passthrough, "print-passthrough-in-n-best", false ); + P.SetParameter(include_all_factors, "report-all-factors-in-n-best", false ); + P.SetParameter(print_trees, "n-best-trees", false ); + + enabled = output_file_path.size(); + return true; +} +} // namespace Moses diff --git a/moses/parameters/NBestOptions.h 
b/moses/parameters/NBestOptions.h new file mode 100644 index 000000000..e844c1eac --- /dev/null +++ b/moses/parameters/NBestOptions.h @@ -0,0 +1,29 @@ +// -*- mode: c++; cc-style: gnu -*- +#include + +namespace Moses { + + struct NBestOptions + { + size_t nbest_size; + size_t factor; + bool enabled; + bool print_trees; + bool only_distinct; + + bool include_alignment_info; + bool include_segmentation; + bool include_feature_labels; + bool include_passthrough; + + bool include_all_factors; + + std::string output_file_path; + + bool init(Parameter const& param); + + }; + + + +} From ab2d396781e57d8d7e3526d102db24597e289fab Mon Sep 17 00:00:00 2001 From: Barry Haddow Date: Thu, 28 May 2015 17:10:21 +0100 Subject: [PATCH 014/108] Min score parameter --- scripts/training/binarize-model.perl | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/scripts/training/binarize-model.perl b/scripts/training/binarize-model.perl index 0239f5fc8..cca74f1ab 100755 --- a/scripts/training/binarize-model.perl +++ b/scripts/training/binarize-model.perl @@ -17,12 +17,14 @@ if ($SCRIPTS_ROOTDIR eq '') { } $SCRIPTS_ROOTDIR =~ s/\/training$//; -my ($binarizer, $input_config, $output_config); +my ($binarizer, $input_config, $output_config, $min_score); my $opt_hierarchical = 0; -$binarizer = "$SCRIPTS_ROOTDIR/../bin/processPhraseTable"; +$binarizer = "$SCRIPTS_ROOTDIR/../bin/processPhraseTableMin"; +$min_score = "0"; GetOptions( "Hierarchical" => \$opt_hierarchical, - "Binarizer=s" => \$binarizer + "Binarizer=s" => \$binarizer, + "MinScore=s" => \$min_score ) or exit(1); $input_config = shift; @@ -37,7 +39,9 @@ my $hierarchical = ""; $hierarchical = "-Hierarchical" if $opt_hierarchical; my $targetdir = "$output_config.tables"; -safesystem("$RealBin/filter-model-given-input.pl $targetdir $input_config /dev/null $hierarchical -nofilter -Binarizer $binarizer") || die "binarising failed"; +my $cmd = "$RealBin/filter-model-given-input.pl $targetdir $input_config /dev/null $hierarchical -nofilter -Binarizer $binarizer"; +$cmd .= "--MinScore $min_score" if (defined $min_score); +safesystem($cmd) || die "binarising failed"; safesystem("rm -f $output_config; ln -s $targetdir/moses.ini $output_config") || die "failed to link new ini file"; #FIXME: Why isn't this in a module? From c27aa193eaa3c73754c8d90dea0cd32dd5a22e7d Mon Sep 17 00:00:00 2001 From: Barry Haddow Date: Thu, 28 May 2015 17:44:26 +0100 Subject: [PATCH 015/108] Revert "Min score parameter". Doesn't work without filter. This reverts commit ab2d396781e57d8d7e3526d102db24597e289fab. 
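As an aside on the two patches above: the reverted change appends the minimum-score flag directly onto the command string ($cmd .= "--MinScore $min_score") with no separating space, so it ends up glued to the -Binarizer argument. The fragment below is only an illustrative sketch of how such an optional flag could be appended from an argument list instead; it is not part of the patch series, it reuses the variables defined in binarize-model.perl as shown in the patch, and the -MinScore spelling is taken from the patch itself rather than checked against filter-model-given-input.pl.

# Sketch only -- not part of the patches above.
# $RealBin, $targetdir, $input_config, $opt_hierarchical, $binarizer,
# $min_score and safesystem() are the names used in binarize-model.perl;
# the -MinScore flag name is an assumption taken from the patch.
my @cmd = ("$RealBin/filter-model-given-input.pl",
           $targetdir, $input_config, "/dev/null");
push @cmd, "-Hierarchical" if $opt_hierarchical;
push @cmd, "-nofilter", "-Binarizer", $binarizer;
push @cmd, "-MinScore", $min_score if defined $min_score;
safesystem(join(" ", @cmd)) || die "binarising failed";

Building the command as a list and joining it once at the end keeps a separator between every token, so a conditionally appended flag cannot run into the argument before it.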
--- scripts/training/binarize-model.perl | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/scripts/training/binarize-model.perl b/scripts/training/binarize-model.perl index cca74f1ab..0239f5fc8 100755 --- a/scripts/training/binarize-model.perl +++ b/scripts/training/binarize-model.perl @@ -17,14 +17,12 @@ if ($SCRIPTS_ROOTDIR eq '') { } $SCRIPTS_ROOTDIR =~ s/\/training$//; -my ($binarizer, $input_config, $output_config, $min_score); +my ($binarizer, $input_config, $output_config); my $opt_hierarchical = 0; -$binarizer = "$SCRIPTS_ROOTDIR/../bin/processPhraseTableMin"; -$min_score = "0"; +$binarizer = "$SCRIPTS_ROOTDIR/../bin/processPhraseTable"; GetOptions( "Hierarchical" => \$opt_hierarchical, - "Binarizer=s" => \$binarizer, - "MinScore=s" => \$min_score + "Binarizer=s" => \$binarizer ) or exit(1); $input_config = shift; @@ -39,9 +37,7 @@ my $hierarchical = ""; $hierarchical = "-Hierarchical" if $opt_hierarchical; my $targetdir = "$output_config.tables"; -my $cmd = "$RealBin/filter-model-given-input.pl $targetdir $input_config /dev/null $hierarchical -nofilter -Binarizer $binarizer"; -$cmd .= "--MinScore $min_score" if (defined $min_score); -safesystem($cmd) || die "binarising failed"; +safesystem("$RealBin/filter-model-given-input.pl $targetdir $input_config /dev/null $hierarchical -nofilter -Binarizer $binarizer") || die "binarising failed"; safesystem("rm -f $output_config; ln -s $targetdir/moses.ini $output_config") || die "failed to link new ini file"; #FIXME: Why isn't this in a module? From 26170a41790bc1dfbc01c90dbcbf2699a0fe3cd0 Mon Sep 17 00:00:00 2001 From: Jeroen Vermeulen Date: Fri, 29 May 2015 09:37:37 +0700 Subject: [PATCH 016/108] Friendlier error reporting in beautify.py. --- scripts/other/beautify.py | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) diff --git a/scripts/other/beautify.py b/scripts/other/beautify.py index f03a58ce7..0caa6b162 100755 --- a/scripts/other/beautify.py +++ b/scripts/other/beautify.py @@ -38,6 +38,17 @@ BEAUTIFY_IGNORE = '.beautify-ignore' class LintCheckFailure(Exception): """Lint was found, or the lint checker otherwise returned failure.""" + exit_code = 1 + + +class ProgramFailure(Exception): + """The program failed, but it's not a bug. No traceback.""" + exit_code = 2 + + +class CommandLineError(Exception): + """Something wrong with the command-line arguments.""" + exit_code = 3 def read_ignore_file(root_dir): @@ -52,7 +63,7 @@ def read_ignore_file(root_dir): ignore_contents = ignore_file.read() except IOError as error: if error.errno == ENOENT: - raise Exception( + raise ProgramFailure( "No .gitignore file found in %s. " "Is it really the project's root directory?" % root_dir) @@ -200,7 +211,7 @@ def check_astyle_version(verbose=False): ['astyle', '--version'], verbose=verbose, env={'LC_ALL': 'C'}) version = version.strip() if version != EXPECTED_ASTYLE_VERSION: - raise Exception( + raise ProgramFailure( "Wrong astyle version. " "Expected '%s', but got version string '%s'." % (EXPECTED_ASTYLE_VERSION, version)) @@ -226,8 +237,15 @@ def run_perltidy(source_files, verbose=False, dry_run=False): # Write "} else {", with 'else' on the same line as the braces. '--cuddled-else', ] - _, stderr = run_command( - command_line + source_files, verbose=verbose, dry_run=dry_run) + try: + _, stderr = run_command( + command_line + source_files, verbose=verbose, dry_run=dry_run) + except OSError as error: + if error.errno == ENOENT: + raise ProgramFailure( + "Could not run 'perltidy'. 
Make sure that it is installed.") + else: + raise if stderr != '': sys.stderr.write(stderr) @@ -386,7 +404,7 @@ def main(): """Find and format source files.""" args = parse_arguments() if not args.format and not args.lint: - raise Exception("Select action: --format, --lint, or both.") + raise CommandLineError("Select action: --format, --lint, or both.") ignore = read_ignore_file(args.root_dir) @@ -409,8 +427,8 @@ def main(): if __name__ == '__main__': try: main() - except LintCheckFailure as error: + except (CommandLineError, LintCheckFailure, ProgramFailure) as error: # This is a failure, but not a bug. Print a friendly error # message, not a traceback. sys.stderr.write('%s\n' % error) - sys.exit(1) + sys.exit(error.exit_code) From ef028446f3640e007215b4576a4dc52a9c9de6db Mon Sep 17 00:00:00 2001 From: Jeroen Vermeulen Date: Fri, 29 May 2015 18:30:26 +0700 Subject: [PATCH 017/108] Add license notices to scripts. This is not pleasant to read (and much, much less pleasant to write!) but sort of necessary in an open project. Right now it's quite hard to figure out what is licensed how, which doesn't matter much to most people but can suddenly become very important when people want to know what they're being allowed to do. I kept the notices as short as I could. As far as I could see, everything without a clear license notice is LGPL v2.1 or later. --- scripts/OSM/OSM-Train.perl | 3 ++ scripts/OSM/extract-singletons.perl | 3 ++ scripts/OSM/flipAlignment.perl | 3 ++ scripts/Transliteration/clean.pl | 5 ++- scripts/Transliteration/corpusCreator.pl | 3 ++ .../in-decoding-transliteration.pl | 3 ++ .../post-decoding-transliteration.pl | 3 ++ .../prepare-transliteration-phrase-table.pl | 3 ++ scripts/Transliteration/threshold.pl | 3 ++ .../train-transliteration-module.pl | 3 ++ ...trap-hypothesis-difference-significance.pl | 3 ++ scripts/analysis/extract-target-trees.py | 12 ++++-- scripts/analysis/nontranslated_words.pl | 3 ++ scripts/analysis/oov.pl | 3 ++ scripts/analysis/sentence-by-sentence.pl | 3 ++ scripts/analysis/sg2dot.perl | 2 + scripts/analysis/show-phrases-used.pl | 3 ++ scripts/analysis/smtgui/Corpus.pm | 3 ++ .../analysis/smtgui/filter-phrase-table.pl | 3 ++ scripts/analysis/smtgui/newsmtgui.cgi | 3 ++ scripts/analysis/suspicious_tokenization.pl | 3 ++ scripts/analysis/weight-scan-summarize.sh | 4 ++ scripts/analysis/weight-scan.pl | 4 ++ scripts/ems/experiment.perl | 3 ++ scripts/ems/fix-info.perl | 3 ++ scripts/ems/support/analysis.perl | 3 ++ scripts/ems/support/berkeley-process.sh | 3 ++ scripts/ems/support/berkeley-train.sh | 3 ++ .../build-domain-file-from-subcorpora.perl | 3 ++ .../ems/support/build-sparse-features.perl | 3 ++ .../support/consolidate-training-data.perl | 3 ++ scripts/ems/support/defaultconfig.py | 3 ++ scripts/ems/support/fast-align-in-parts.perl | 3 ++ .../generic-multicore-parallelizer.perl | 3 ++ scripts/ems/support/generic-parallelizer.perl | 3 ++ scripts/ems/support/input-from-sgm.perl | 3 ++ scripts/ems/support/interpolate-lm.perl | 3 ++ scripts/ems/support/lmplz-wrapper.perl | 3 ++ scripts/ems/support/mml-filter.perl | 3 ++ scripts/ems/support/mml-score.perl | 3 ++ scripts/ems/support/mml-train.perl | 3 ++ scripts/ems/support/prepare-fast-align.perl | 3 ++ scripts/ems/support/reference-from-sgm.perl | 3 ++ .../support/remove-segmentation-markup.perl | 3 ++ .../ems/support/report-experiment-scores.perl | 3 ++ .../run-command-on-multiple-refsets.perl | 3 ++ scripts/ems/support/run-wade.perl | 3 ++ scripts/ems/support/split-sentences.perl | 3 ++ 
scripts/ems/support/submit-grid.perl | 3 ++ ...ubstitute-filtered-tables-and-weights.perl | 3 ++ .../support/substitute-filtered-tables.perl | 3 ++ scripts/ems/support/substitute-weights.perl | 3 ++ .../ems/support/symmetrize-fast-align.perl | 3 ++ scripts/ems/support/thot-lm-wrapper.perl | 3 ++ .../ems/support/tree-converter-wrapper.perl | 3 ++ scripts/ems/support/wrap-xml.perl | 3 ++ scripts/ems/web/analysis.php | 5 +++ scripts/ems/web/analysis_diff.php | 4 ++ scripts/ems/web/diff.php | 5 +++ scripts/ems/web/hierarchical-segmentation.js | 4 ++ scripts/ems/web/index.php | 5 +++ scripts/ems/web/lib.php | 5 +++ scripts/ems/web/overview.php | 4 ++ scripts/ems/web/progress.perl | 3 ++ scripts/ems/web/sgviz.js | 4 ++ scripts/ems/web/sgviz.php | 6 +++ scripts/fuzzy-match/create_xml.perl | 3 ++ scripts/generic/bsbleu.py | 3 ++ scripts/generic/compound-splitter.perl | 3 ++ scripts/generic/extract-factors.pl | 3 ++ scripts/generic/extract-parallel.perl | 3 ++ scripts/generic/fsa2fsal.pl | 3 ++ scripts/generic/fsa2plf.pl | 3 ++ scripts/generic/fsal2fsa.pl | 3 ++ scripts/generic/generic-parallel.perl | 3 ++ scripts/generic/giza-parallel.perl | 3 ++ scripts/generic/lopar2pos.pl | 3 ++ scripts/generic/moses-parallel.pl | 3 ++ scripts/generic/moses_sim_pe.py | 29 ++++++++------ scripts/generic/mteval-v12.pl | 3 ++ scripts/generic/mteval-v13a.pl | 3 ++ scripts/generic/multi-bleu.perl | 3 ++ scripts/generic/ph_numbers.perl | 3 ++ scripts/generic/qsub-wrapper.pl | 3 ++ scripts/generic/reverse-alignment.perl | 3 ++ scripts/generic/score-parallel.perl | 3 ++ scripts/generic/strip-xml.perl | 3 ++ scripts/generic/trainlm-irst2.perl | 3 ++ scripts/other/beautify.py | 5 +++ scripts/other/convert-pt.perl | 3 ++ scripts/other/delete-scores.perl | 3 ++ scripts/other/gacha_filter.py | 3 ++ .../get_many_translations_from_google.perl | 3 ++ scripts/other/retain-lines.perl | 3 ++ .../other/translate_by_microsoft_bing.perl | 3 ++ scripts/recaser/detruecase.perl | 3 ++ scripts/recaser/recase.perl | 3 ++ scripts/recaser/train-recaser.perl | 3 ++ scripts/recaser/train-truecaser.perl | 3 ++ scripts/recaser/truecase.perl | 3 ++ .../MosesScriptsRegressionTesting.pm | 3 ++ scripts/regression-testing/compare-results.pl | 3 ++ .../create_localized_moses_ini.pl | 3 ++ scripts/regression-testing/modify-pars.pl | 3 ++ scripts/regression-testing/moses-virtual.pl | 3 ++ scripts/regression-testing/run-single-test.pl | 3 ++ scripts/regression-testing/run-test-suite.pl | 3 ++ scripts/server/moses.py | 3 ++ scripts/server/sim-pe.py | 10 ++++- .../tokenizer/deescape-special-chars-PTB.perl | 3 ++ scripts/tokenizer/deescape-special-chars.perl | 3 ++ scripts/tokenizer/detokenizer.perl | 3 ++ scripts/tokenizer/escape-special-chars.perl | 3 ++ scripts/tokenizer/lowercase.perl | 3 ++ scripts/tokenizer/normalize-punctuation.perl | 3 ++ scripts/tokenizer/pre-tok-clean.perl | 3 ++ scripts/tokenizer/pre-tokenizer.perl | 3 ++ scripts/tokenizer/pre_tokenize_cleaning.py | 3 ++ .../tokenizer/remove-non-printing-char.perl | 3 ++ .../replace-unicode-punctuation.perl | 3 ++ scripts/tokenizer/tokenizer.perl | 3 ++ scripts/tokenizer/tokenizer_PTB.perl | 3 ++ scripts/training/LexicalTranslationModel.pm | 3 ++ scripts/training/absolutize_moses_model.pl | 3 ++ scripts/training/analyse_moses_model.pl | 3 ++ .../bilingual-lm/averageNullEmbedding.py | 3 ++ scripts/training/bilingual-lm/extract.py | 3 ++ scripts/training/bilingual-lm/extract_test.py | 3 ++ .../training/bilingual-lm/extract_training.py | 3 ++ .../training/bilingual-lm/reduce_ngrams.py | 5 ++- 
scripts/training/bilingual-lm/test_nplm.py | 3 ++ scripts/training/bilingual-lm/train_nplm.py | 3 ++ scripts/training/binarize-model.perl | 3 ++ scripts/training/build-generation-table.perl | 3 ++ scripts/training/build-mmsapt.perl | 3 ++ scripts/training/clean-corpus-n.perl | 3 ++ scripts/training/clone_moses_model.pl | 3 ++ scripts/training/combine_factors.pl | 3 ++ scripts/training/convert-moses-ini-to-v2.perl | 3 ++ .../training/convert-moses-ini-v2-to-v1.py | 3 ++ scripts/training/corpus-sizes.perl | 3 ++ scripts/training/create_count_tables.py | 3 ++ scripts/training/exodus.perl | 3 ++ scripts/training/filter-model-given-input.pl | 3 ++ scripts/training/filter-rule-table.py | 40 ++++++++++--------- scripts/training/flexibility_score.py | 3 ++ scripts/training/giza2bal.pl | 3 ++ scripts/training/mert-moses.pl | 4 ++ scripts/training/postprocess-lopar.perl | 3 ++ .../training/rdlm/average_null_embedding.py | 3 ++ .../training/rdlm/extract_syntactic_ngrams.py | 3 ++ scripts/training/rdlm/extract_vocab.py | 3 ++ scripts/training/rdlm/train_rdlm.py | 3 ++ scripts/training/reduce-factors.perl | 3 ++ scripts/training/reduce-topt-count.pl | 3 ++ scripts/training/reduce_combine.pl | 3 ++ ...an-phrase-pairs-from-reordering-table.perl | 3 ++ scripts/training/threshold-filter.perl | 3 ++ .../training/train-global-lexicon-model.perl | 3 ++ scripts/training/train-model.perl | 3 ++ scripts/training/train-neurallm.py | 8 +++- .../adam-suffix-array/suffix-array-create.sh | 3 ++ .../adam-suffix-array/suffix-array-extract.sh | 3 ++ .../wrappers/berkeleyparsed2mosesxml.perl | 3 ++ .../wrappers/berkeleyparsed2mosesxml_PTB.perl | 3 ++ scripts/training/wrappers/conll2mosesxml.py | 3 ++ .../wrappers/filter-excluded-lines.perl | 3 ++ .../training/wrappers/find-unparseable.perl | 3 ++ scripts/training/wrappers/mada-wrapper.perl | 3 ++ scripts/training/wrappers/madamira-tok.perl | 3 ++ .../training/wrappers/madamira-wrapper.perl | 3 ++ .../make-factor-brown-cluster-mkcls.perl | 3 ++ .../wrappers/make-factor-de-lemma.perl | 3 ++ .../wrappers/make-factor-de-morph.perl | 3 ++ .../training/wrappers/make-factor-de-pos.perl | 3 ++ .../wrappers/make-factor-en-porter.perl | 3 ++ .../wrappers/make-factor-en-pos.mxpost.perl | 3 ++ .../wrappers/make-factor-pos.tree-tagger.perl | 3 ++ .../training/wrappers/make-factor-stem.perl | 3 ++ .../training/wrappers/make-factor-suffix.perl | 3 ++ .../training/wrappers/morfessor-wrapper.perl | 3 ++ .../wrappers/mosesxml2berkeleyparsed.perl | 3 ++ .../training/wrappers/mosesxml2brackets.py | 5 ++- .../training/wrappers/parse-de-berkeley.perl | 3 ++ .../training/wrappers/parse-de-bitpar.perl | 3 ++ .../training/wrappers/parse-en-collins.perl | 3 ++ scripts/training/wrappers/parse-en-egret.perl | 3 ++ scripts/training/wrappers/parse-en-senna.perl | 3 ++ .../training/wrappers/parse-en-stanford.py | 12 ++++-- scripts/training/wrappers/senna2brackets.py | 33 ++++++++------- .../wrappers/syntax-hyphen-splitting.perl | 3 ++ .../wrappers/tagger-german-chunk.perl | 3 ++ 192 files changed, 666 insertions(+), 58 deletions(-) diff --git a/scripts/OSM/OSM-Train.perl b/scripts/OSM/OSM-Train.perl index 895a821db..07ad71f68 100755 --- a/scripts/OSM/OSM-Train.perl +++ b/scripts/OSM/OSM-Train.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. 
use warnings; use strict; diff --git a/scripts/OSM/extract-singletons.perl b/scripts/OSM/extract-singletons.perl index 5a1665a8c..6295edfad 100755 --- a/scripts/OSM/extract-singletons.perl +++ b/scripts/OSM/extract-singletons.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. #use strict; use warnings; diff --git a/scripts/OSM/flipAlignment.perl b/scripts/OSM/flipAlignment.perl index b896c0a23..57a1e9bb0 100755 --- a/scripts/OSM/flipAlignment.perl +++ b/scripts/OSM/flipAlignment.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/Transliteration/clean.pl b/scripts/Transliteration/clean.pl index ccc364fc9..7a08271da 100755 --- a/scripts/Transliteration/clean.pl +++ b/scripts/Transliteration/clean.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. #input hindi word urdu word, delete all those entries that have number on any side use warnings; @@ -314,4 +317,4 @@ sub charFreqFilter{ } } } -} \ No newline at end of file +} diff --git a/scripts/Transliteration/corpusCreator.pl b/scripts/Transliteration/corpusCreator.pl index 4c62449df..ac67f5d74 100755 --- a/scripts/Transliteration/corpusCreator.pl +++ b/scripts/Transliteration/corpusCreator.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/Transliteration/in-decoding-transliteration.pl b/scripts/Transliteration/in-decoding-transliteration.pl index c3cc31f26..e8130db02 100755 --- a/scripts/Transliteration/in-decoding-transliteration.pl +++ b/scripts/Transliteration/in-decoding-transliteration.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/Transliteration/post-decoding-transliteration.pl b/scripts/Transliteration/post-decoding-transliteration.pl index 60c3200f6..2c7908085 100755 --- a/scripts/Transliteration/post-decoding-transliteration.pl +++ b/scripts/Transliteration/post-decoding-transliteration.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/Transliteration/prepare-transliteration-phrase-table.pl b/scripts/Transliteration/prepare-transliteration-phrase-table.pl index df3b1ceca..0a9f554c5 100755 --- a/scripts/Transliteration/prepare-transliteration-phrase-table.pl +++ b/scripts/Transliteration/prepare-transliteration-phrase-table.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. 
use warnings; use strict; diff --git a/scripts/Transliteration/threshold.pl b/scripts/Transliteration/threshold.pl index bf6657742..3baa8e0a7 100755 --- a/scripts/Transliteration/threshold.pl +++ b/scripts/Transliteration/threshold.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use utf8; diff --git a/scripts/Transliteration/train-transliteration-module.pl b/scripts/Transliteration/train-transliteration-module.pl index 35e4ee396..b1d4d0ff5 100755 --- a/scripts/Transliteration/train-transliteration-module.pl +++ b/scripts/Transliteration/train-transliteration-module.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use utf8; diff --git a/scripts/analysis/bootstrap-hypothesis-difference-significance.pl b/scripts/analysis/bootstrap-hypothesis-difference-significance.pl index 8e6a6255a..9a3f63d69 100755 --- a/scripts/analysis/bootstrap-hypothesis-difference-significance.pl +++ b/scripts/analysis/bootstrap-hypothesis-difference-significance.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use utf8; ############################################### diff --git a/scripts/analysis/extract-target-trees.py b/scripts/analysis/extract-target-trees.py index 3a92fdc4d..7166211d9 100755 --- a/scripts/analysis/extract-target-trees.py +++ b/scripts/analysis/extract-target-trees.py @@ -1,9 +1,13 @@ #!/usr/bin/env python - -# Usage: extract-target-trees.py [FILE] # -# Reads moses-chart's -T output from FILE or standard input and writes trees to -# standard output in Moses' XML tree format. +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. + +"""Usage: extract-target-trees.py [FILE] + +Reads moses-chart's -T output from FILE or standard input and writes trees to +standard output in Moses' XML tree format. +""" import re import sys diff --git a/scripts/analysis/nontranslated_words.pl b/scripts/analysis/nontranslated_words.pl index 51a4f9d20..7213deb76 100755 --- a/scripts/analysis/nontranslated_words.pl +++ b/scripts/analysis/nontranslated_words.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id$ # Reads a source and hypothesis file and counts equal tokens. Some of these diff --git a/scripts/analysis/oov.pl b/scripts/analysis/oov.pl index 052c9994d..9756887c9 100755 --- a/scripts/analysis/oov.pl +++ b/scripts/analysis/oov.pl @@ -1,6 +1,9 @@ #!/usr/bin/env perl # Display OOV rate of a test set against a training corpus or a phrase table. # Ondrej Bojar +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use strict; use warnings; diff --git a/scripts/analysis/sentence-by-sentence.pl b/scripts/analysis/sentence-by-sentence.pl index 72b70dc72..b9eb6e56d 100755 --- a/scripts/analysis/sentence-by-sentence.pl +++ b/scripts/analysis/sentence-by-sentence.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. 
Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id$ #sentence-by-sentence: take in a system output, with any number of factors, and a reference translation, also maybe with factors, and show each sentence and its errors diff --git a/scripts/analysis/sg2dot.perl b/scripts/analysis/sg2dot.perl index e9c1639ed..5f9a5ea1d 100755 --- a/scripts/analysis/sg2dot.perl +++ b/scripts/analysis/sg2dot.perl @@ -3,6 +3,8 @@ # Author : Loic BARRAULT # Script to convert MOSES searchgraph to DOT format # +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/analysis/show-phrases-used.pl b/scripts/analysis/show-phrases-used.pl index 522e6d3ff..9428ea9b8 100755 --- a/scripts/analysis/show-phrases-used.pl +++ b/scripts/analysis/show-phrases-used.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id$ #show-phrases-used: display all source and target phrases for each sentence in a corpus, and give average phrase length used diff --git a/scripts/analysis/smtgui/Corpus.pm b/scripts/analysis/smtgui/Corpus.pm index f050a9f6d..2391a6c15 100644 --- a/scripts/analysis/smtgui/Corpus.pm +++ b/scripts/analysis/smtgui/Corpus.pm @@ -1,5 +1,8 @@ #package Corpus: hold a bunch of sentences in any language, with translation factors and stats about individual sentences and the corpus as a whole #Evan Herbst, 7 / 25 / 06 +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. package Corpus; BEGIN diff --git a/scripts/analysis/smtgui/filter-phrase-table.pl b/scripts/analysis/smtgui/filter-phrase-table.pl index 55f2619c0..cd0f6b91b 100755 --- a/scripts/analysis/smtgui/filter-phrase-table.pl +++ b/scripts/analysis/smtgui/filter-phrase-table.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id$ #by Philipp Koehn, de-augmented by Evan Herbst diff --git a/scripts/analysis/smtgui/newsmtgui.cgi b/scripts/analysis/smtgui/newsmtgui.cgi index 32ad3a948..034ee265e 100755 --- a/scripts/analysis/smtgui/newsmtgui.cgi +++ b/scripts/analysis/smtgui/newsmtgui.cgi @@ -1,4 +1,7 @@ #!/usr/bin/perl -w +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id$ use strict; diff --git a/scripts/analysis/suspicious_tokenization.pl b/scripts/analysis/suspicious_tokenization.pl index 3ea15154e..f807153d9 100755 --- a/scripts/analysis/suspicious_tokenization.pl +++ b/scripts/analysis/suspicious_tokenization.pl @@ -2,6 +2,9 @@ # Collects and prints all n-grams that appear in the given corpus both # tokenized as well as untokenized. # Ondrej Bojar +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. 
use strict; use warnings; diff --git a/scripts/analysis/weight-scan-summarize.sh b/scripts/analysis/weight-scan-summarize.sh index 237182736..2fccb6470 100755 --- a/scripts/analysis/weight-scan-summarize.sh +++ b/scripts/analysis/weight-scan-summarize.sh @@ -1,4 +1,8 @@ #!/bin/bash +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. + # Hackish summarization of weight-scan.pl results, heavily relies on tools by # Ondrej Bojar (bojar@ufal.mff.cuni.cz), some of which need Mercury; beware. diff --git a/scripts/analysis/weight-scan.pl b/scripts/analysis/weight-scan.pl index b33360694..b51a6bcd1 100755 --- a/scripts/analysis/weight-scan.pl +++ b/scripts/analysis/weight-scan.pl @@ -1,4 +1,8 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. + # runs Moses many times changing the values of one weight, all others fixed # nbest lists are always produced to allow for comparison of real and # 'projected' BLEU (BLEU estimated from n-best lists collected at a neighouring diff --git a/scripts/ems/experiment.perl b/scripts/ems/experiment.perl index ece110fbc..a3f5310a5 100755 --- a/scripts/ems/experiment.perl +++ b/scripts/ems/experiment.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # Experiment Management System # Documentation at http://www.statmt.org/moses/?n=FactoredTraining.EMS diff --git a/scripts/ems/fix-info.perl b/scripts/ems/fix-info.perl index abe58fe83..6659027b2 100755 --- a/scripts/ems/fix-info.perl +++ b/scripts/ems/fix-info.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/support/analysis.perl b/scripts/ems/support/analysis.perl index f4d5a55b4..01bb21773 100755 --- a/scripts/ems/support/analysis.perl +++ b/scripts/ems/support/analysis.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/support/berkeley-process.sh b/scripts/ems/support/berkeley-process.sh index e68056c96..347ebba3c 100755 --- a/scripts/ems/support/berkeley-process.sh +++ b/scripts/ems/support/berkeley-process.sh @@ -1,4 +1,7 @@ #!/bin/sh +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. if [ $# -lt 8 ] then diff --git a/scripts/ems/support/berkeley-train.sh b/scripts/ems/support/berkeley-train.sh index 96f6b648c..530cf978f 100755 --- a/scripts/ems/support/berkeley-train.sh +++ b/scripts/ems/support/berkeley-train.sh @@ -1,4 +1,7 @@ #!/bin/sh +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. 
if [ $# -lt 6 ] then diff --git a/scripts/ems/support/build-domain-file-from-subcorpora.perl b/scripts/ems/support/build-domain-file-from-subcorpora.perl index 085fd2629..f45b5ba2a 100755 --- a/scripts/ems/support/build-domain-file-from-subcorpora.perl +++ b/scripts/ems/support/build-domain-file-from-subcorpora.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/support/build-sparse-features.perl b/scripts/ems/support/build-sparse-features.perl index 79fc1e394..b134cee69 100755 --- a/scripts/ems/support/build-sparse-features.perl +++ b/scripts/ems/support/build-sparse-features.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/support/consolidate-training-data.perl b/scripts/ems/support/consolidate-training-data.perl index 4ab7f82cf..2a732be77 100755 --- a/scripts/ems/support/consolidate-training-data.perl +++ b/scripts/ems/support/consolidate-training-data.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id: consolidate-training-data.perl 928 2009-09-02 02:58:01Z philipp $ diff --git a/scripts/ems/support/defaultconfig.py b/scripts/ems/support/defaultconfig.py index a118e96b3..53913da08 100644 --- a/scripts/ems/support/defaultconfig.py +++ b/scripts/ems/support/defaultconfig.py @@ -1,4 +1,7 @@ #!/usr/bin/env python2 +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. """Version of ConfigParser which accepts default values.""" diff --git a/scripts/ems/support/fast-align-in-parts.perl b/scripts/ems/support/fast-align-in-parts.perl index f777d7e52..bc340a50f 100755 --- a/scripts/ems/support/fast-align-in-parts.perl +++ b/scripts/ems/support/fast-align-in-parts.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. ####################### # Revision history diff --git a/scripts/ems/support/generic-multicore-parallelizer.perl b/scripts/ems/support/generic-multicore-parallelizer.perl index 0f7910603..d821aa114 100755 --- a/scripts/ems/support/generic-multicore-parallelizer.perl +++ b/scripts/ems/support/generic-multicore-parallelizer.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/support/generic-parallelizer.perl b/scripts/ems/support/generic-parallelizer.perl index 811a99bde..087498ccf 100755 --- a/scripts/ems/support/generic-parallelizer.perl +++ b/scripts/ems/support/generic-parallelizer.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. 
use warnings; use strict; diff --git a/scripts/ems/support/input-from-sgm.perl b/scripts/ems/support/input-from-sgm.perl index 18000581a..eb6a2e3a1 100755 --- a/scripts/ems/support/input-from-sgm.perl +++ b/scripts/ems/support/input-from-sgm.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/support/interpolate-lm.perl b/scripts/ems/support/interpolate-lm.perl index 7d52fd877..4d9a513f6 100755 --- a/scripts/ems/support/interpolate-lm.perl +++ b/scripts/ems/support/interpolate-lm.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/support/lmplz-wrapper.perl b/scripts/ems/support/lmplz-wrapper.perl index df503754f..89b2847d6 100755 --- a/scripts/ems/support/lmplz-wrapper.perl +++ b/scripts/ems/support/lmplz-wrapper.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/support/mml-filter.perl b/scripts/ems/support/mml-filter.perl index 51bc4cda5..32bca335b 100755 --- a/scripts/ems/support/mml-filter.perl +++ b/scripts/ems/support/mml-filter.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/support/mml-score.perl b/scripts/ems/support/mml-score.perl index 6f7b724ea..f88021818 100755 --- a/scripts/ems/support/mml-score.perl +++ b/scripts/ems/support/mml-score.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/support/mml-train.perl b/scripts/ems/support/mml-train.perl index dcc998711..bdf6c1c1a 100755 --- a/scripts/ems/support/mml-train.perl +++ b/scripts/ems/support/mml-train.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/support/prepare-fast-align.perl b/scripts/ems/support/prepare-fast-align.perl index 80fec36b2..68b1f0189 100755 --- a/scripts/ems/support/prepare-fast-align.perl +++ b/scripts/ems/support/prepare-fast-align.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/support/reference-from-sgm.perl b/scripts/ems/support/reference-from-sgm.perl index ebb9ae4ae..b8e1d108d 100755 --- a/scripts/ems/support/reference-from-sgm.perl +++ b/scripts/ems/support/reference-from-sgm.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. 
use warnings; use strict; diff --git a/scripts/ems/support/remove-segmentation-markup.perl b/scripts/ems/support/remove-segmentation-markup.perl index a0bd61fff..3b02bceaf 100755 --- a/scripts/ems/support/remove-segmentation-markup.perl +++ b/scripts/ems/support/remove-segmentation-markup.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/support/report-experiment-scores.perl b/scripts/ems/support/report-experiment-scores.perl index b649951ce..c859508cb 100755 --- a/scripts/ems/support/report-experiment-scores.perl +++ b/scripts/ems/support/report-experiment-scores.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id: report-experiment-scores.perl 407 2008-11-10 14:43:31Z philipp $ diff --git a/scripts/ems/support/run-command-on-multiple-refsets.perl b/scripts/ems/support/run-command-on-multiple-refsets.perl index 1e914b44b..41823b4ee 100755 --- a/scripts/ems/support/run-command-on-multiple-refsets.perl +++ b/scripts/ems/support/run-command-on-multiple-refsets.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/support/run-wade.perl b/scripts/ems/support/run-wade.perl index 175948b98..dfdb8e59d 100755 --- a/scripts/ems/support/run-wade.perl +++ b/scripts/ems/support/run-wade.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/support/split-sentences.perl b/scripts/ems/support/split-sentences.perl index 02a1e2315..f72767054 100755 --- a/scripts/ems/support/split-sentences.perl +++ b/scripts/ems/support/split-sentences.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # Based on Preprocessor written by Philipp Koehn diff --git a/scripts/ems/support/submit-grid.perl b/scripts/ems/support/submit-grid.perl index a0967f9a5..ff43cd123 100755 --- a/scripts/ems/support/submit-grid.perl +++ b/scripts/ems/support/submit-grid.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/support/substitute-filtered-tables-and-weights.perl b/scripts/ems/support/substitute-filtered-tables-and-weights.perl index 13be52c6b..2e6908ab4 100755 --- a/scripts/ems/support/substitute-filtered-tables-and-weights.perl +++ b/scripts/ems/support/substitute-filtered-tables-and-weights.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. 
use warnings; use strict; diff --git a/scripts/ems/support/substitute-filtered-tables.perl b/scripts/ems/support/substitute-filtered-tables.perl index c5ebabded..548982592 100755 --- a/scripts/ems/support/substitute-filtered-tables.perl +++ b/scripts/ems/support/substitute-filtered-tables.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; diff --git a/scripts/ems/support/substitute-weights.perl b/scripts/ems/support/substitute-weights.perl index b692f3f85..efa9338ca 100755 --- a/scripts/ems/support/substitute-weights.perl +++ b/scripts/ems/support/substitute-weights.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; diff --git a/scripts/ems/support/symmetrize-fast-align.perl b/scripts/ems/support/symmetrize-fast-align.perl index 9f7fec248..4ed3e087d 100755 --- a/scripts/ems/support/symmetrize-fast-align.perl +++ b/scripts/ems/support/symmetrize-fast-align.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/support/thot-lm-wrapper.perl b/scripts/ems/support/thot-lm-wrapper.perl index 59d483e65..ffbcb50e2 100755 --- a/scripts/ems/support/thot-lm-wrapper.perl +++ b/scripts/ems/support/thot-lm-wrapper.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/support/tree-converter-wrapper.perl b/scripts/ems/support/tree-converter-wrapper.perl index aae55991a..ae7e2c5a6 100755 --- a/scripts/ems/support/tree-converter-wrapper.perl +++ b/scripts/ems/support/tree-converter-wrapper.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/support/wrap-xml.perl b/scripts/ems/support/wrap-xml.perl index 52190309a..09ea2a2f8 100755 --- a/scripts/ems/support/wrap-xml.perl +++ b/scripts/ems/support/wrap-xml.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/web/analysis.php b/scripts/ems/web/analysis.php index 57776dd22..5e5f707f6 100644 --- a/scripts/ems/web/analysis.php +++ b/scripts/ems/web/analysis.php @@ -1,5 +1,10 @@ Search Graph Visualization, Sentence <?php $sentence ?> diff --git a/scripts/fuzzy-match/create_xml.perl b/scripts/fuzzy-match/create_xml.perl index 4ab281eae..97025d62a 100755 --- a/scripts/fuzzy-match/create_xml.perl +++ b/scripts/fuzzy-match/create_xml.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. 
binmode( STDIN, ":utf8" ); binmode( STDOUT, ":utf8" ); diff --git a/scripts/generic/bsbleu.py b/scripts/generic/bsbleu.py index 12d2201de..296900b18 100755 --- a/scripts/generic/bsbleu.py +++ b/scripts/generic/bsbleu.py @@ -1,6 +1,9 @@ #!/usr/bin/env python # compute Bleu scores with confidence intervals via boostrap resampling # written by Ulrich Germann +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. from argparse import ArgumentParser import math diff --git a/scripts/generic/compound-splitter.perl b/scripts/generic/compound-splitter.perl index b39d4d660..2ece80a60 100755 --- a/scripts/generic/compound-splitter.perl +++ b/scripts/generic/compound-splitter.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/generic/extract-factors.pl b/scripts/generic/extract-factors.pl index 38cf97bd4..2b1c51cd1 100755 --- a/scripts/generic/extract-factors.pl +++ b/scripts/generic/extract-factors.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id$ #extract-factors.pl: extract only the desired factors from a factored corpus diff --git a/scripts/generic/extract-parallel.perl b/scripts/generic/extract-parallel.perl index be30ff652..3240f24eb 100755 --- a/scripts/generic/extract-parallel.perl +++ b/scripts/generic/extract-parallel.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # example # ./extract-parallel.perl 8 ./coreutils-8.9/src/split "./coreutils-8.9/src/sort --batch-size=253" ./extract ./corpus.5.en ./corpus.5.ar ./align.ar-en.grow-diag-final-and ./extracted 7 --NoFileLimit orientation --GZOutput diff --git a/scripts/generic/fsa2fsal.pl b/scripts/generic/fsa2fsal.pl index 7dc7751ee..28ec28a26 100755 --- a/scripts/generic/fsa2fsal.pl +++ b/scripts/generic/fsa2fsal.pl @@ -4,6 +4,9 @@ # ' ' to delimit nodes (i.e. original lines). # Some rudimentary sanity checks are done on the fly. # Ondrej Bojar, bojar@ufal.mff.cuni.cz +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/generic/fsa2plf.pl b/scripts/generic/fsa2plf.pl index 07c8a4cc1..4b9474d5a 100755 --- a/scripts/generic/fsa2plf.pl +++ b/scripts/generic/fsa2plf.pl @@ -7,6 +7,9 @@ # final nodes. # Note that the output format may not contain any spaces. # Ondrej Bojar, bojar@ufal.mff.cuni.cz +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/generic/fsal2fsa.pl b/scripts/generic/fsal2fsa.pl index a21305dad..158dab5b3 100755 --- a/scripts/generic/fsal2fsa.pl +++ b/scripts/generic/fsal2fsa.pl @@ -1,6 +1,9 @@ #!/usr/bin/env perl # A very simple script that converts fsal back to fsa format (openfst lattices) # Ondrej Bojar, bojar@ufal.mff.cuni.cz +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. 
use warnings; use strict; diff --git a/scripts/generic/generic-parallel.perl b/scripts/generic/generic-parallel.perl index a9bc73d85..07f6a210a 100755 --- a/scripts/generic/generic-parallel.perl +++ b/scripts/generic/generic-parallel.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/generic/giza-parallel.perl b/scripts/generic/giza-parallel.perl index 9a6516a8f..a9921a992 100755 --- a/scripts/generic/giza-parallel.perl +++ b/scripts/generic/giza-parallel.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # example # ~/giza-parallel.perl 10 split ~/workspace/sourceforge/trunk/scripts/training/train-model.perl ar en train align diff --git a/scripts/generic/lopar2pos.pl b/scripts/generic/lopar2pos.pl index 2b9245e0f..fc2c35c7f 100755 --- a/scripts/generic/lopar2pos.pl +++ b/scripts/generic/lopar2pos.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id$ #lopar2pos: extract POSs from LOPAR output diff --git a/scripts/generic/moses-parallel.pl b/scripts/generic/moses-parallel.pl index eb51daa98..144b7d6b2 100755 --- a/scripts/generic/moses-parallel.pl +++ b/scripts/generic/moses-parallel.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id$ ####################### diff --git a/scripts/generic/moses_sim_pe.py b/scripts/generic/moses_sim_pe.py index 32f785961..3497ca558 100755 --- a/scripts/generic/moses_sim_pe.py +++ b/scripts/generic/moses_sim_pe.py @@ -1,20 +1,25 @@ #!/usr/bin/env python # Written by Michael Denkowski +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. -# This script parallelizes decoding with simulated post-editing via moses XML -# input (XML entities need to be escaped in tokenization). Memory mapped -# dynamic phrase tables (Ulrich Germann, -# www.statmt.org/moses/?n=Moses.AdvancedFeatures#ntoc40) and language models -# (Kenneth Heafield, -# http://www.statmt.org/moses/?n=FactoredTraining.BuildingLanguageModel#ntoc19) -# facilitate memory efficient multi process decoding. Input is divided into -# batches, each of which is decoded sequentially. Each batch pre-loads the -# data from previous batches. +"""Parallelize decoding with simulated post-editing via moses XML input. -# To use in tuning, run mert-moses.pl with --sim-pe=SYMAL where SYMAL is the -# alignment from input to references. Specify the number of jobs with -# --decoder-flags="-threads N". +(XML entities need to be escaped in tokenization). Memory mapped +dynamic phrase tables (Ulrich Germann, +www.statmt.org/moses/?n=Moses.AdvancedFeatures#ntoc40) and language models +(Kenneth Heafield, +http://www.statmt.org/moses/?n=FactoredTraining.BuildingLanguageModel#ntoc19) +facilitate memory efficient multi process decoding. Input is divided into +batches, each of which is decoded sequentially. Each batch pre-loads the +data from previous batches. 
+ +To use in tuning, run mert-moses.pl with --sim-pe=SYMAL where SYMAL is the +alignment from input to references. Specify the number of jobs with +--decoder-flags="-threads N". +""" import gzip import itertools diff --git a/scripts/generic/mteval-v12.pl b/scripts/generic/mteval-v12.pl index 2666c8012..b4dfbf83a 100755 --- a/scripts/generic/mteval-v12.pl +++ b/scripts/generic/mteval-v12.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/generic/mteval-v13a.pl b/scripts/generic/mteval-v13a.pl index 41a88800a..bdc2d9479 100755 --- a/scripts/generic/mteval-v13a.pl +++ b/scripts/generic/mteval-v13a.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/generic/multi-bleu.perl b/scripts/generic/multi-bleu.perl index 344f58c6f..61de10d45 100755 --- a/scripts/generic/multi-bleu.perl +++ b/scripts/generic/multi-bleu.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id$ use warnings; diff --git a/scripts/generic/ph_numbers.perl b/scripts/generic/ph_numbers.perl index 612263249..f0ae1f851 100755 --- a/scripts/generic/ph_numbers.perl +++ b/scripts/generic/ph_numbers.perl @@ -6,6 +6,9 @@ package ph_numbers; # and decoder input # # (c) 2013 TAUS +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/generic/qsub-wrapper.pl b/scripts/generic/qsub-wrapper.pl index ac3d0900a..ef9938e07 100755 --- a/scripts/generic/qsub-wrapper.pl +++ b/scripts/generic/qsub-wrapper.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id$ use warnings; diff --git a/scripts/generic/reverse-alignment.perl b/scripts/generic/reverse-alignment.perl index 681b3221e..f01acf5b0 100755 --- a/scripts/generic/reverse-alignment.perl +++ b/scripts/generic/reverse-alignment.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/generic/score-parallel.perl b/scripts/generic/score-parallel.perl index 81bc6f7d0..625b449c0 100755 --- a/scripts/generic/score-parallel.perl +++ b/scripts/generic/score-parallel.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # example # ./score-parallel.perl 8 "gsort --batch-size=253" ./score ./extract.2.sorted.gz ./lex.2.f2e ./phrase-table.2.half.f2e --GoodTuring ./phrase-table.2.coc 0 diff --git a/scripts/generic/strip-xml.perl b/scripts/generic/strip-xml.perl index c993421f0..a5dbbaa37 100755 --- a/scripts/generic/strip-xml.perl +++ b/scripts/generic/strip-xml.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. 
Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/generic/trainlm-irst2.perl b/scripts/generic/trainlm-irst2.perl index f664e96ee..8af372fac 100755 --- a/scripts/generic/trainlm-irst2.perl +++ b/scripts/generic/trainlm-irst2.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # Compatible with sri LM-creating script, eg. # ngram-count -order 5 -interpolate -wbdiscount -unk -text corpus.txt -lm lm.txt diff --git a/scripts/other/beautify.py b/scripts/other/beautify.py index 0caa6b162..56df24bc8 100755 --- a/scripts/other/beautify.py +++ b/scripts/other/beautify.py @@ -1,4 +1,9 @@ #! /usr/bin/env python +# +# Originally written in 2015 by Jeroen Vermeulen (Precision Translation Tools). +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. """Reformat project source code, and/or check for style errors ("lint"). diff --git a/scripts/other/convert-pt.perl b/scripts/other/convert-pt.perl index e087126f1..60c8cbdb2 100755 --- a/scripts/other/convert-pt.perl +++ b/scripts/other/convert-pt.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id$ # convert a phrase-table with alignment in Moses' dead-end format diff --git a/scripts/other/delete-scores.perl b/scripts/other/delete-scores.perl index ffb788867..ebaf277fa 100755 --- a/scripts/other/delete-scores.perl +++ b/scripts/other/delete-scores.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/other/gacha_filter.py b/scripts/other/gacha_filter.py index 0deb45761..af5921d41 100644 --- a/scripts/other/gacha_filter.py +++ b/scripts/other/gacha_filter.py @@ -1,4 +1,7 @@ #!/usr/bin/env python3 -*- coding: utf-8 -*- +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. """ The Gacha filter cleans out sentence pairs that have global character mean diff --git a/scripts/other/get_many_translations_from_google.perl b/scripts/other/get_many_translations_from_google.perl index 0b1436c20..ac2933296 100755 --- a/scripts/other/get_many_translations_from_google.perl +++ b/scripts/other/get_many_translations_from_google.perl @@ -5,6 +5,9 @@ # Expects one sentence per line, not tokenized! # # Ondrej Bojar, bojar@ufal.mff.cuni.cz +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/other/retain-lines.perl b/scripts/other/retain-lines.perl index f04a8ebad..c789f96c7 100755 --- a/scripts/other/retain-lines.perl +++ b/scripts/other/retain-lines.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. 
#retain lines in clean.lines-retained.1 use strict; diff --git a/scripts/other/translate_by_microsoft_bing.perl b/scripts/other/translate_by_microsoft_bing.perl index c9b1b31de..d4222878e 100755 --- a/scripts/other/translate_by_microsoft_bing.perl +++ b/scripts/other/translate_by_microsoft_bing.perl @@ -2,6 +2,9 @@ # Script implemented by Pranava Swaroop Madhyastha (a student at Charles # University, UFAL) +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use strict; use warnings; diff --git a/scripts/recaser/detruecase.perl b/scripts/recaser/detruecase.perl index b882852a0..66ca24fa2 100755 --- a/scripts/recaser/detruecase.perl +++ b/scripts/recaser/detruecase.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/recaser/recase.perl b/scripts/recaser/recase.perl index 52cec36ea..b951ca764 100755 --- a/scripts/recaser/recase.perl +++ b/scripts/recaser/recase.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id$ use warnings; diff --git a/scripts/recaser/train-recaser.perl b/scripts/recaser/train-recaser.perl index dce388bca..cb3388c38 100755 --- a/scripts/recaser/train-recaser.perl +++ b/scripts/recaser/train-recaser.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id$ use warnings; diff --git a/scripts/recaser/train-truecaser.perl b/scripts/recaser/train-truecaser.perl index 753183324..7f8909082 100755 --- a/scripts/recaser/train-truecaser.perl +++ b/scripts/recaser/train-truecaser.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id: train-recaser.perl 1326 2007-03-26 05:44:27Z bojar $ diff --git a/scripts/recaser/truecase.perl b/scripts/recaser/truecase.perl index 544b79c47..aab185ce9 100755 --- a/scripts/recaser/truecase.perl +++ b/scripts/recaser/truecase.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id: train-recaser.perl 1326 2007-03-26 05:44:27Z bojar $ diff --git a/scripts/regression-testing/MosesScriptsRegressionTesting.pm b/scripts/regression-testing/MosesScriptsRegressionTesting.pm index d8b0590c8..acc134d70 100644 --- a/scripts/regression-testing/MosesScriptsRegressionTesting.pm +++ b/scripts/regression-testing/MosesScriptsRegressionTesting.pm @@ -1,3 +1,6 @@ +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. + package MosesScriptsRegressionTesting; use strict; diff --git a/scripts/regression-testing/compare-results.pl b/scripts/regression-testing/compare-results.pl index 572431951..8f1461cec 100755 --- a/scripts/regression-testing/compare-results.pl +++ b/scripts/regression-testing/compare-results.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. 
Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/regression-testing/create_localized_moses_ini.pl b/scripts/regression-testing/create_localized_moses_ini.pl index 1d03e5ab8..3e2b6f37f 100755 --- a/scripts/regression-testing/create_localized_moses_ini.pl +++ b/scripts/regression-testing/create_localized_moses_ini.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/regression-testing/modify-pars.pl b/scripts/regression-testing/modify-pars.pl index de2df2919..7726af9e6 100755 --- a/scripts/regression-testing/modify-pars.pl +++ b/scripts/regression-testing/modify-pars.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/regression-testing/moses-virtual.pl b/scripts/regression-testing/moses-virtual.pl index 3af3c79e4..3b23b525a 100755 --- a/scripts/regression-testing/moses-virtual.pl +++ b/scripts/regression-testing/moses-virtual.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/regression-testing/run-single-test.pl b/scripts/regression-testing/run-single-test.pl index e8307da36..037de8285 100755 --- a/scripts/regression-testing/run-single-test.pl +++ b/scripts/regression-testing/run-single-test.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/regression-testing/run-test-suite.pl b/scripts/regression-testing/run-test-suite.pl index b384f8b98..a12938e61 100755 --- a/scripts/regression-testing/run-test-suite.pl +++ b/scripts/regression-testing/run-test-suite.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/server/moses.py b/scripts/server/moses.py index 7cf152187..e825ab39e 100644 --- a/scripts/server/moses.py +++ b/scripts/server/moses.py @@ -1,5 +1,8 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. """ Python utilities for moses diff --git a/scripts/server/sim-pe.py b/scripts/server/sim-pe.py index 5f1407524..6f76bf46d 100755 --- a/scripts/server/sim-pe.py +++ b/scripts/server/sim-pe.py @@ -2,8 +2,14 @@ # -*- coding: utf-8 -*- # Written by Ulrich Germann on the basis of contrib/server/client.py. -# This script simulates post-editing of MT output and incrementally -# updates the dynamic phrase tables in the moses server. +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. + +"""Simulate post-editing of MT output. + +Incrementally updates the dynamic phrase tables in the moses server. 
+""" import argparse import os diff --git a/scripts/tokenizer/deescape-special-chars-PTB.perl b/scripts/tokenizer/deescape-special-chars-PTB.perl index f9601924f..ad2529b21 100755 --- a/scripts/tokenizer/deescape-special-chars-PTB.perl +++ b/scripts/tokenizer/deescape-special-chars-PTB.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/tokenizer/deescape-special-chars.perl b/scripts/tokenizer/deescape-special-chars.perl index 002955e62..b9d1ad74c 100755 --- a/scripts/tokenizer/deescape-special-chars.perl +++ b/scripts/tokenizer/deescape-special-chars.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/tokenizer/detokenizer.perl b/scripts/tokenizer/detokenizer.perl index 3a92bd024..881b93dd1 100755 --- a/scripts/tokenizer/detokenizer.perl +++ b/scripts/tokenizer/detokenizer.perl @@ -4,6 +4,9 @@ # Sample De-Tokenizer # written by Josh Schroeder, based on code by Philipp Koehn # further modifications by Ondrej Bojar +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. binmode(STDIN, ":utf8"); binmode(STDOUT, ":utf8"); diff --git a/scripts/tokenizer/escape-special-chars.perl b/scripts/tokenizer/escape-special-chars.perl index fbbbae292..143e85490 100755 --- a/scripts/tokenizer/escape-special-chars.perl +++ b/scripts/tokenizer/escape-special-chars.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/tokenizer/lowercase.perl b/scripts/tokenizer/lowercase.perl index e5c41bbed..bc75e5e5c 100755 --- a/scripts/tokenizer/lowercase.perl +++ b/scripts/tokenizer/lowercase.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/tokenizer/normalize-punctuation.perl b/scripts/tokenizer/normalize-punctuation.perl index 13e9fd3fc..7dab7543a 100755 --- a/scripts/tokenizer/normalize-punctuation.perl +++ b/scripts/tokenizer/normalize-punctuation.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/tokenizer/pre-tok-clean.perl b/scripts/tokenizer/pre-tok-clean.perl index 900e992ee..064f7b187 100755 --- a/scripts/tokenizer/pre-tok-clean.perl +++ b/scripts/tokenizer/pre-tok-clean.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. 
use strict; diff --git a/scripts/tokenizer/pre-tokenizer.perl b/scripts/tokenizer/pre-tokenizer.perl index 514d8da8d..541ce77fb 100755 --- a/scripts/tokenizer/pre-tokenizer.perl +++ b/scripts/tokenizer/pre-tokenizer.perl @@ -3,6 +3,9 @@ # script for preprocessing language data prior to tokenization # Start by Ulrich Germann, after noticing systematic preprocessing errors # in some of the English Europarl data. +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/tokenizer/pre_tokenize_cleaning.py b/scripts/tokenizer/pre_tokenize_cleaning.py index 096a45dc4..c03af8f66 100644 --- a/scripts/tokenizer/pre_tokenize_cleaning.py +++ b/scripts/tokenizer/pre_tokenize_cleaning.py @@ -1,4 +1,7 @@ #!/usr/bin/env python -*- coding: utf-8 -*- +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. """ The Gacha filter cleans out sentence pairs that have global character mean diff --git a/scripts/tokenizer/remove-non-printing-char.perl b/scripts/tokenizer/remove-non-printing-char.perl index 9125b7691..92f6ade16 100755 --- a/scripts/tokenizer/remove-non-printing-char.perl +++ b/scripts/tokenizer/remove-non-printing-char.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use utf8; diff --git a/scripts/tokenizer/replace-unicode-punctuation.perl b/scripts/tokenizer/replace-unicode-punctuation.perl index cda69ddf7..c2c7088d6 100755 --- a/scripts/tokenizer/replace-unicode-punctuation.perl +++ b/scripts/tokenizer/replace-unicode-punctuation.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/tokenizer/tokenizer.perl b/scripts/tokenizer/tokenizer.perl index a5d4fadd3..e08bac941 100755 --- a/scripts/tokenizer/tokenizer.perl +++ b/scripts/tokenizer/tokenizer.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; diff --git a/scripts/tokenizer/tokenizer_PTB.perl b/scripts/tokenizer/tokenizer_PTB.perl index 6fff8d7f7..46b14775c 100755 --- a/scripts/tokenizer/tokenizer_PTB.perl +++ b/scripts/tokenizer/tokenizer_PTB.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # Sample Tokenizer ### Version 1.1 diff --git a/scripts/training/LexicalTranslationModel.pm b/scripts/training/LexicalTranslationModel.pm index c5dad60fb..3adc45f5e 100644 --- a/scripts/training/LexicalTranslationModel.pm +++ b/scripts/training/LexicalTranslationModel.pm @@ -1,3 +1,6 @@ +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. 
+ package LexicalTranslationModel; use strict; diff --git a/scripts/training/absolutize_moses_model.pl b/scripts/training/absolutize_moses_model.pl index bb7085895..27eccd8c7 100755 --- a/scripts/training/absolutize_moses_model.pl +++ b/scripts/training/absolutize_moses_model.pl @@ -5,6 +5,9 @@ # paths with absolute paths. # # Ondrej Bojar. +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; diff --git a/scripts/training/analyse_moses_model.pl b/scripts/training/analyse_moses_model.pl index 656f4a59b..7a5c2e701 100755 --- a/scripts/training/analyse_moses_model.pl +++ b/scripts/training/analyse_moses_model.pl @@ -4,6 +4,9 @@ # given a moses.ini file, checks the translation and generation tables and reports # statistics on ambiguity # Ondrej Bojar +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/bilingual-lm/averageNullEmbedding.py b/scripts/training/bilingual-lm/averageNullEmbedding.py index 891595aff..54c9a1bc4 100755 --- a/scripts/training/bilingual-lm/averageNullEmbedding.py +++ b/scripts/training/bilingual-lm/averageNullEmbedding.py @@ -1,4 +1,7 @@ #!/usr/bin/env python2 +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. import sys import numpy import argparse diff --git a/scripts/training/bilingual-lm/extract.py b/scripts/training/bilingual-lm/extract.py index f620edb5d..876fba9ee 100755 --- a/scripts/training/bilingual-lm/extract.py +++ b/scripts/training/bilingual-lm/extract.py @@ -1,4 +1,7 @@ #!/usr/bin/env python +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. from collections import Counter import logging diff --git a/scripts/training/bilingual-lm/extract_test.py b/scripts/training/bilingual-lm/extract_test.py index 3c9a03b85..8cade1e04 100755 --- a/scripts/training/bilingual-lm/extract_test.py +++ b/scripts/training/bilingual-lm/extract_test.py @@ -1,4 +1,7 @@ #!/usr/bin/env python +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. """Create a test corpus, using a previously pruned vocabulary.""" diff --git a/scripts/training/bilingual-lm/extract_training.py b/scripts/training/bilingual-lm/extract_training.py index bd3538188..e39a70318 100755 --- a/scripts/training/bilingual-lm/extract_training.py +++ b/scripts/training/bilingual-lm/extract_training.py @@ -1,4 +1,7 @@ #!/usr/bin/env python +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. from collections import Counter import logging diff --git a/scripts/training/bilingual-lm/reduce_ngrams.py b/scripts/training/bilingual-lm/reduce_ngrams.py index 3442fb302..4db41378d 100755 --- a/scripts/training/bilingual-lm/reduce_ngrams.py +++ b/scripts/training/bilingual-lm/reduce_ngrams.py @@ -1,6 +1,9 @@ #!/usr/bin/env python3 +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. -"""Reduces an ngrams file for training nplm to a smaller version of it. 
+"""Reduce an ngrams file for training nplm to a smaller version of it. The smaller version will have fewer ngrams. """ diff --git a/scripts/training/bilingual-lm/test_nplm.py b/scripts/training/bilingual-lm/test_nplm.py index 737266bc3..3a59fd344 100755 --- a/scripts/training/bilingual-lm/test_nplm.py +++ b/scripts/training/bilingual-lm/test_nplm.py @@ -1,4 +1,7 @@ #!/usr/bin/env python +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. import logging import optparse diff --git a/scripts/training/bilingual-lm/train_nplm.py b/scripts/training/bilingual-lm/train_nplm.py index 7bc74429e..cb5980a91 100755 --- a/scripts/training/bilingual-lm/train_nplm.py +++ b/scripts/training/bilingual-lm/train_nplm.py @@ -1,4 +1,7 @@ #!/usr/bin/env python +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. from __future__ import print_function, unicode_literals diff --git a/scripts/training/binarize-model.perl b/scripts/training/binarize-model.perl index 0239f5fc8..0131d2222 100755 --- a/scripts/training/binarize-model.perl +++ b/scripts/training/binarize-model.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # # Binarize a Moses model diff --git a/scripts/training/build-generation-table.perl b/scripts/training/build-generation-table.perl index 435f7f58e..14176908a 100755 --- a/scripts/training/build-generation-table.perl +++ b/scripts/training/build-generation-table.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id$ use warnings; diff --git a/scripts/training/build-mmsapt.perl b/scripts/training/build-mmsapt.perl index 00cbd09d6..d0c5b818e 100755 --- a/scripts/training/build-mmsapt.perl +++ b/scripts/training/build-mmsapt.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/clean-corpus-n.perl b/scripts/training/clean-corpus-n.perl index cee4c76a2..76a09e539 100755 --- a/scripts/training/clean-corpus-n.perl +++ b/scripts/training/clean-corpus-n.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id: clean-corpus-n.perl 3633 2010-10-21 09:49:27Z phkoehn $ use warnings; diff --git a/scripts/training/clone_moses_model.pl b/scripts/training/clone_moses_model.pl index bf6708fca..18dc4aa41 100755 --- a/scripts/training/clone_moses_model.pl +++ b/scripts/training/clone_moses_model.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. 
# $Id$ # given a moses.ini file, creates a fresh version of it diff --git a/scripts/training/combine_factors.pl b/scripts/training/combine_factors.pl index fa6f15db2..fcc9ab3f5 100755 --- a/scripts/training/combine_factors.pl +++ b/scripts/training/combine_factors.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id$ # given a list of files, combines them to a single corpus (sent to stdout) diff --git a/scripts/training/convert-moses-ini-to-v2.perl b/scripts/training/convert-moses-ini-to-v2.perl index e091a710d..3fdfa53a6 100755 --- a/scripts/training/convert-moses-ini-to-v2.perl +++ b/scripts/training/convert-moses-ini-to-v2.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/convert-moses-ini-v2-to-v1.py b/scripts/training/convert-moses-ini-v2-to-v1.py index 44f192efe..3ef7d7c0d 100755 --- a/scripts/training/convert-moses-ini-v2-to-v1.py +++ b/scripts/training/convert-moses-ini-v2-to-v1.py @@ -1,5 +1,8 @@ #! /usr/bin/env python # -*- coding: utf8 -*- +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 3 or, at your option, any later version. from __future__ import ( diff --git a/scripts/training/corpus-sizes.perl b/scripts/training/corpus-sizes.perl index 30ae67ebb..1a6db669b 100755 --- a/scripts/training/corpus-sizes.perl +++ b/scripts/training/corpus-sizes.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id: consolidate-training-data.perl 928 2009-09-02 02:58:01Z philipp $ diff --git a/scripts/training/create_count_tables.py b/scripts/training/create_count_tables.py index 2288c034a..12499b1d7 100755 --- a/scripts/training/create_count_tables.py +++ b/scripts/training/create_count_tables.py @@ -1,6 +1,9 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- # Author: Rico Sennrich +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # This script creates tables that store phrase pair frequencies rather than # probabilities. diff --git a/scripts/training/exodus.perl b/scripts/training/exodus.perl index bb8616007..579056ff0 100755 --- a/scripts/training/exodus.perl +++ b/scripts/training/exodus.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id$ diff --git a/scripts/training/filter-model-given-input.pl b/scripts/training/filter-model-given-input.pl index e3a34c40b..a44d9c193 100755 --- a/scripts/training/filter-model-given-input.pl +++ b/scripts/training/filter-model-given-input.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. 
# $Id$ # Given a moses.ini file and an input text prepare minimized translation diff --git a/scripts/training/filter-rule-table.py b/scripts/training/filter-rule-table.py index 14736fe1f..d28fa0c89 100755 --- a/scripts/training/filter-rule-table.py +++ b/scripts/training/filter-rule-table.py @@ -1,25 +1,29 @@ #!/usr/bin/env python # Author: Phil Williams +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. -# Usage: filter-rule-table.py [--min-non-initial-rule-count=N] INPUT -# -# Given a rule table (on stdin) and an input text, filter out rules that -# couldn't be used in parsing the input and write the resulting rule table -# to stdout. The input text is assumed to contain the same factors as -# the rule table and is assumed to be small (not more than a few thousand -# sentences): the current algorithm won't scale well to large input sets. -# -# The filtering algorithm considers a source RHS to be a sequence of -# words and gaps, which must match a sequence of words in one of the -# input sentences, with at least one input word per gap. The NT labels -# are ignored, so for example a rule with the source RHS "the JJ dog" -# would be allowed if the sequence "the slobbering dog" occurs in one of -# the input sentences, even if there's no rule to derive a JJ from -# "slobbering." (If "slobbering" were an unknown word, the 'unknown-lhs' -# decoder option would allow it to take a number of NT labels, likely -# including JJ, with varying probabilities, so removing the rule would -# be a bad idea.) +"""Usage: filter-rule-table.py [--min-non-initial-rule-count=N] INPUT + +Given a rule table (on stdin) and an input text, filter out rules that +couldn't be used in parsing the input and write the resulting rule table +to stdout. The input text is assumed to contain the same factors as +the rule table and is assumed to be small (not more than a few thousand +sentences): the current algorithm won't scale well to large input sets. + +The filtering algorithm considers a source RHS to be a sequence of +words and gaps, which must match a sequence of words in one of the +input sentences, with at least one input word per gap. The NT labels +are ignored, so for example a rule with the source RHS "the JJ dog" +would be allowed if the sequence "the slobbering dog" occurs in one of +the input sentences, even if there's no rule to derive a JJ from +"slobbering." (If "slobbering" were an unknown word, the 'unknown-lhs' +decoder option would allow it to take a number of NT labels, likely +including JJ, with varying probabilities, so removing the rule would +be a bad idea.) +""" import optparse import sys diff --git a/scripts/training/flexibility_score.py b/scripts/training/flexibility_score.py index 496184616..56d4f9425 100755 --- a/scripts/training/flexibility_score.py +++ b/scripts/training/flexibility_score.py @@ -2,6 +2,9 @@ # -*- coding: utf-8 -*- # author: Rico Sennrich +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. """Add flexibility scores to a phrase table half. diff --git a/scripts/training/giza2bal.pl b/scripts/training/giza2bal.pl index 27ba9d659..ad9edb584 100755 --- a/scripts/training/giza2bal.pl +++ b/scripts/training/giza2bal.pl @@ -6,6 +6,9 @@ #produced by giza containing the frequency of each traning sentence. #Copyright Marcello Federico, November 2004 +# +# This file is part of moses. 
Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. #use warnings; diff --git a/scripts/training/mert-moses.pl b/scripts/training/mert-moses.pl index 92e1a79ff..c73e75a87 100755 --- a/scripts/training/mert-moses.pl +++ b/scripts/training/mert-moses.pl @@ -1,4 +1,8 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. + # $Id$ # Usage: # mert-moses.pl diff --git a/scripts/training/postprocess-lopar.perl b/scripts/training/postprocess-lopar.perl index 44be9c26c..05a56a3b5 100755 --- a/scripts/training/postprocess-lopar.perl +++ b/scripts/training/postprocess-lopar.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id$ diff --git a/scripts/training/rdlm/average_null_embedding.py b/scripts/training/rdlm/average_null_embedding.py index 28abc9508..899b402c1 100755 --- a/scripts/training/rdlm/average_null_embedding.py +++ b/scripts/training/rdlm/average_null_embedding.py @@ -1,6 +1,9 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- # Author: Rico Sennrich +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. """Average embeddings of special null words for RDLM. diff --git a/scripts/training/rdlm/extract_syntactic_ngrams.py b/scripts/training/rdlm/extract_syntactic_ngrams.py index 1292e90f2..be4ed2335 100755 --- a/scripts/training/rdlm/extract_syntactic_ngrams.py +++ b/scripts/training/rdlm/extract_syntactic_ngrams.py @@ -1,6 +1,9 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- # Author: Rico Sennrich +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. """ Extract syntactic n-grams from dependency treebank in Moses XML format for diff --git a/scripts/training/rdlm/extract_vocab.py b/scripts/training/rdlm/extract_vocab.py index ed9266fd9..48e5215c3 100755 --- a/scripts/training/rdlm/extract_vocab.py +++ b/scripts/training/rdlm/extract_vocab.py @@ -1,6 +1,9 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- # Author: Rico Sennrich +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # extract 5 vocabulary files from parsed corpus in moses XML format diff --git a/scripts/training/rdlm/train_rdlm.py b/scripts/training/rdlm/train_rdlm.py index 639c1b32c..a7edbab36 100755 --- a/scripts/training/rdlm/train_rdlm.py +++ b/scripts/training/rdlm/train_rdlm.py @@ -1,5 +1,8 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. from __future__ import print_function, unicode_literals diff --git a/scripts/training/reduce-factors.perl b/scripts/training/reduce-factors.perl index 09f9c7f2b..82aed4355 100755 --- a/scripts/training/reduce-factors.perl +++ b/scripts/training/reduce-factors.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. 
use warnings; use strict; diff --git a/scripts/training/reduce-topt-count.pl b/scripts/training/reduce-topt-count.pl index f760051c4..85ce0d6d9 100755 --- a/scripts/training/reduce-topt-count.pl +++ b/scripts/training/reduce-topt-count.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # given a moses.ini, filter the phrase tables to contain # only ttable-limit options per source phrase diff --git a/scripts/training/reduce_combine.pl b/scripts/training/reduce_combine.pl index a7614f73e..2055bed5b 100755 --- a/scripts/training/reduce_combine.pl +++ b/scripts/training/reduce_combine.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id$ # given a pathname to a factored corpus, a list of (numeric) factors to keep diff --git a/scripts/training/remove-orphan-phrase-pairs-from-reordering-table.perl b/scripts/training/remove-orphan-phrase-pairs-from-reordering-table.perl index eda529393..25c5cc028 100755 --- a/scripts/training/remove-orphan-phrase-pairs-from-reordering-table.perl +++ b/scripts/training/remove-orphan-phrase-pairs-from-reordering-table.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/threshold-filter.perl b/scripts/training/threshold-filter.perl index 3e42ca795..0aed67d25 100755 --- a/scripts/training/threshold-filter.perl +++ b/scripts/training/threshold-filter.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/train-global-lexicon-model.perl b/scripts/training/train-global-lexicon-model.perl index d3c55789d..528bfbd72 100755 --- a/scripts/training/train-global-lexicon-model.perl +++ b/scripts/training/train-global-lexicon-model.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/train-model.perl b/scripts/training/train-model.perl index 5a304c2f9..b693d774d 100755 --- a/scripts/training/train-model.perl +++ b/scripts/training/train-model.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/train-neurallm.py b/scripts/training/train-neurallm.py index 2d2f12015..fec859611 100755 --- a/scripts/training/train-neurallm.py +++ b/scripts/training/train-neurallm.py @@ -1,8 +1,12 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. -""" train feed-forward neural network LM with NPLM tool -resulting model can be used in Moses as feature function NeuralLM +"""Train feed-forward neural network LM with NPLM tool. + +The resulting model can be used in Moses as feature function NeuralLM. 
""" from __future__ import print_function, unicode_literals diff --git a/scripts/training/wrappers/adam-suffix-array/suffix-array-create.sh b/scripts/training/wrappers/adam-suffix-array/suffix-array-create.sh index 238a53349..5db5e9aa9 100755 --- a/scripts/training/wrappers/adam-suffix-array/suffix-array-create.sh +++ b/scripts/training/wrappers/adam-suffix-array/suffix-array-create.sh @@ -1,4 +1,7 @@ #!/bin/bash +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # execute: ~/workspace/bin/moses-smt/scripts/training/wrappers/suffix-array-create.sh $SA_EXEC_DIR $SOURCE_CORPUS $TARGET_CORPUS $ALIGNMENT $SA_OUTPUT diff --git a/scripts/training/wrappers/adam-suffix-array/suffix-array-extract.sh b/scripts/training/wrappers/adam-suffix-array/suffix-array-extract.sh index 8c255b1b6..128ccaa9e 100755 --- a/scripts/training/wrappers/adam-suffix-array/suffix-array-extract.sh +++ b/scripts/training/wrappers/adam-suffix-array/suffix-array-extract.sh @@ -1,4 +1,7 @@ #!/bin/bash +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # execute: ~/workspace/bin/moses-smt/scripts/training/wrappers/adam-suffix-array/suffix-array-extract.sh $SA_EXEC_DIR $MODEL_DIR $INPUT_FILE $OUTPUT_DIR diff --git a/scripts/training/wrappers/berkeleyparsed2mosesxml.perl b/scripts/training/wrappers/berkeleyparsed2mosesxml.perl index 232cfefab..9c376200c 100755 --- a/scripts/training/wrappers/berkeleyparsed2mosesxml.perl +++ b/scripts/training/wrappers/berkeleyparsed2mosesxml.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/wrappers/berkeleyparsed2mosesxml_PTB.perl b/scripts/training/wrappers/berkeleyparsed2mosesxml_PTB.perl index 9e8c30d42..b8ba146c9 100755 --- a/scripts/training/wrappers/berkeleyparsed2mosesxml_PTB.perl +++ b/scripts/training/wrappers/berkeleyparsed2mosesxml_PTB.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/wrappers/conll2mosesxml.py b/scripts/training/wrappers/conll2mosesxml.py index 761037488..6473166d9 100755 --- a/scripts/training/wrappers/conll2mosesxml.py +++ b/scripts/training/wrappers/conll2mosesxml.py @@ -1,6 +1,9 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- # Author: Rico Sennrich +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. """ Takes a file in the CoNLL dependency format (from the CoNLL-X shared task on diff --git a/scripts/training/wrappers/filter-excluded-lines.perl b/scripts/training/wrappers/filter-excluded-lines.perl index dff104dba..508ab8a06 100755 --- a/scripts/training/wrappers/filter-excluded-lines.perl +++ b/scripts/training/wrappers/filter-excluded-lines.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. 
use warnings; use strict; diff --git a/scripts/training/wrappers/find-unparseable.perl b/scripts/training/wrappers/find-unparseable.perl index 00009e2e9..fd0664f1d 100755 --- a/scripts/training/wrappers/find-unparseable.perl +++ b/scripts/training/wrappers/find-unparseable.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/wrappers/mada-wrapper.perl b/scripts/training/wrappers/mada-wrapper.perl index f2cf14f40..d4124e34c 100755 --- a/scripts/training/wrappers/mada-wrapper.perl +++ b/scripts/training/wrappers/mada-wrapper.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/wrappers/madamira-tok.perl b/scripts/training/wrappers/madamira-tok.perl index 37e70079e..e9f19d53a 100755 --- a/scripts/training/wrappers/madamira-tok.perl +++ b/scripts/training/wrappers/madamira-tok.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/wrappers/madamira-wrapper.perl b/scripts/training/wrappers/madamira-wrapper.perl index 6535b6187..05ec44d7d 100755 --- a/scripts/training/wrappers/madamira-wrapper.perl +++ b/scripts/training/wrappers/madamira-wrapper.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/wrappers/make-factor-brown-cluster-mkcls.perl b/scripts/training/wrappers/make-factor-brown-cluster-mkcls.perl index 1e3a1ce3f..a8ce5f24e 100755 --- a/scripts/training/wrappers/make-factor-brown-cluster-mkcls.perl +++ b/scripts/training/wrappers/make-factor-brown-cluster-mkcls.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/wrappers/make-factor-de-lemma.perl b/scripts/training/wrappers/make-factor-de-lemma.perl index db978317e..0b93002a9 100755 --- a/scripts/training/wrappers/make-factor-de-lemma.perl +++ b/scripts/training/wrappers/make-factor-de-lemma.perl @@ -1,4 +1,7 @@ #!/usr/bin/perl -w +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use strict; use Encode; diff --git a/scripts/training/wrappers/make-factor-de-morph.perl b/scripts/training/wrappers/make-factor-de-morph.perl index 366a5a76d..d09196745 100755 --- a/scripts/training/wrappers/make-factor-de-morph.perl +++ b/scripts/training/wrappers/make-factor-de-morph.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. 
use warnings; use strict; diff --git a/scripts/training/wrappers/make-factor-de-pos.perl b/scripts/training/wrappers/make-factor-de-pos.perl index 495517352..585323bd4 100755 --- a/scripts/training/wrappers/make-factor-de-pos.perl +++ b/scripts/training/wrappers/make-factor-de-pos.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/wrappers/make-factor-en-porter.perl b/scripts/training/wrappers/make-factor-en-porter.perl index 749dc1318..7ae5fd0b3 100755 --- a/scripts/training/wrappers/make-factor-en-porter.perl +++ b/scripts/training/wrappers/make-factor-en-porter.perl @@ -1,4 +1,7 @@ #!/usr/bin/perl -w +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use strict; use FindBin qw($RealBin); diff --git a/scripts/training/wrappers/make-factor-en-pos.mxpost.perl b/scripts/training/wrappers/make-factor-en-pos.mxpost.perl index 4aa66bac6..2bff8e329 100755 --- a/scripts/training/wrappers/make-factor-en-pos.mxpost.perl +++ b/scripts/training/wrappers/make-factor-en-pos.mxpost.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/wrappers/make-factor-pos.tree-tagger.perl b/scripts/training/wrappers/make-factor-pos.tree-tagger.perl index 0ad04d4de..1e8ccd0ee 100755 --- a/scripts/training/wrappers/make-factor-pos.tree-tagger.perl +++ b/scripts/training/wrappers/make-factor-pos.tree-tagger.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/wrappers/make-factor-stem.perl b/scripts/training/wrappers/make-factor-stem.perl index 662f1d882..9bde7648f 100755 --- a/scripts/training/wrappers/make-factor-stem.perl +++ b/scripts/training/wrappers/make-factor-stem.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/wrappers/make-factor-suffix.perl b/scripts/training/wrappers/make-factor-suffix.perl index 6a59254e4..015df3874 100755 --- a/scripts/training/wrappers/make-factor-suffix.perl +++ b/scripts/training/wrappers/make-factor-suffix.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/wrappers/morfessor-wrapper.perl b/scripts/training/wrappers/morfessor-wrapper.perl index c65a2cebc..0269045a0 100755 --- a/scripts/training/wrappers/morfessor-wrapper.perl +++ b/scripts/training/wrappers/morfessor-wrapper.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. 
use warnings; use strict; diff --git a/scripts/training/wrappers/mosesxml2berkeleyparsed.perl b/scripts/training/wrappers/mosesxml2berkeleyparsed.perl index e929658ff..02bc7b88e 100755 --- a/scripts/training/wrappers/mosesxml2berkeleyparsed.perl +++ b/scripts/training/wrappers/mosesxml2berkeleyparsed.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/wrappers/mosesxml2brackets.py b/scripts/training/wrappers/mosesxml2brackets.py index 6ff1d20c9..6b90aa256 100755 --- a/scripts/training/wrappers/mosesxml2brackets.py +++ b/scripts/training/wrappers/mosesxml2brackets.py @@ -1,8 +1,11 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- # Author: Rico Sennrich +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. -# convert trees in moses XML format to PTB-style bracketed format +"""Convert trees in moses XML format to PTB-style bracketed format.""" from __future__ import print_function, unicode_literals import sys diff --git a/scripts/training/wrappers/parse-de-berkeley.perl b/scripts/training/wrappers/parse-de-berkeley.perl index 596fb3eff..f605a37ae 100755 --- a/scripts/training/wrappers/parse-de-berkeley.perl +++ b/scripts/training/wrappers/parse-de-berkeley.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/wrappers/parse-de-bitpar.perl b/scripts/training/wrappers/parse-de-bitpar.perl index 1bbcf5329..0d5346058 100755 --- a/scripts/training/wrappers/parse-de-bitpar.perl +++ b/scripts/training/wrappers/parse-de-bitpar.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/wrappers/parse-en-collins.perl b/scripts/training/wrappers/parse-en-collins.perl index 252d3d2b7..c9a960912 100755 --- a/scripts/training/wrappers/parse-en-collins.perl +++ b/scripts/training/wrappers/parse-en-collins.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/wrappers/parse-en-egret.perl b/scripts/training/wrappers/parse-en-egret.perl index 9f434063b..e97bc1ae0 100755 --- a/scripts/training/wrappers/parse-en-egret.perl +++ b/scripts/training/wrappers/parse-en-egret.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/wrappers/parse-en-senna.perl b/scripts/training/wrappers/parse-en-senna.perl index f271633ea..2df46284b 100755 --- a/scripts/training/wrappers/parse-en-senna.perl +++ b/scripts/training/wrappers/parse-en-senna.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. 
use strict; use warnings; diff --git a/scripts/training/wrappers/parse-en-stanford.py b/scripts/training/wrappers/parse-en-stanford.py index 7d8be4bcf..06b027e55 100755 --- a/scripts/training/wrappers/parse-en-stanford.py +++ b/scripts/training/wrappers/parse-en-stanford.py @@ -1,11 +1,17 @@ #!/usr/bin/python # -*- coding: utf-8 -*- # Author: Rico Sennrich +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. -# (hacky) wrapper around Stanford CoreNLP to produce CoNLL dependency format. -# assumes tokenized and sentence-split text. +""" +(Hacky) wrapper around Stanford CoreNLP to produce CoNLL dependency format. +Assumes tokenized and sentence-split text. -# to get Moses XML format, first projectivize the trees, then use conll2mosesxml.py. +To get Moses XML format, first projectivize the trees, then use +conll2mosesxml.py. +""" from __future__ import print_function, unicode_literals import os diff --git a/scripts/training/wrappers/senna2brackets.py b/scripts/training/wrappers/senna2brackets.py index 4fc71ed44..a81100277 100755 --- a/scripts/training/wrappers/senna2brackets.py +++ b/scripts/training/wrappers/senna2brackets.py @@ -1,19 +1,24 @@ #!/usr/bin/env python +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. -# Read SENNA output (from stdin), extract the parse trees, and write them in -# PTB-style bracketed format (to stdout). -# -# The SENNA output is assumed to contain tokens in the first column, POS tags -# in the second column, and PSG fragments in the final column. -# -# It is also assumed that SENNA was run through the parse-en-senna.perl wrapper, -# which: -# -# - Substitutes the special "SENTENCE_TOO_LONG" token for sentences that -# exceed SENNA's hardcoded limit. -# -# - Replaces the bracket-like tokens "-LRB-", "-RRB-", etc. with "(", ")", -# etc. +""" +Read SENNA output (from stdin), extract the parse trees, and write them in +PTB-style bracketed format (to stdout). + +The SENNA output is assumed to contain tokens in the first column, POS tags +in the second column, and PSG fragments in the final column. + +It is also assumed that SENNA was run through the parse-en-senna.perl wrapper, +which: + + - Substitutes the special "SENTENCE_TOO_LONG" token for sentences that + exceed SENNA's hardcoded limit. + + - Replaces the bracket-like tokens "-LRB-", "-RRB-", etc. with "(", ")", + etc. +""" import optparse import os diff --git a/scripts/training/wrappers/syntax-hyphen-splitting.perl b/scripts/training/wrappers/syntax-hyphen-splitting.perl index 653b410d0..1a260df10 100755 --- a/scripts/training/wrappers/syntax-hyphen-splitting.perl +++ b/scripts/training/wrappers/syntax-hyphen-splitting.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/wrappers/tagger-german-chunk.perl b/scripts/training/wrappers/tagger-german-chunk.perl index c57031889..0b707a579 100755 --- a/scripts/training/wrappers/tagger-german-chunk.perl +++ b/scripts/training/wrappers/tagger-german-chunk.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. 
use warnings; use strict; From 5d8af9c2896d86785c5db2fd3a8029ae9b741e26 Mon Sep 17 00:00:00 2001 From: Rico Sennrich Date: Fri, 29 May 2015 16:07:26 +0100 Subject: [PATCH 018/108] support memory-mapped files for NPLM training --- scripts/training/bilingual-lm/train_nplm.py | 14 ++++++--- scripts/training/rdlm/train_rdlm.py | 33 +++++++++++++++++---- scripts/training/train-neurallm.py | 33 +++++++++++++++++++-- 3 files changed, 68 insertions(+), 12 deletions(-) diff --git a/scripts/training/bilingual-lm/train_nplm.py b/scripts/training/bilingual-lm/train_nplm.py index cb5980a91..572076006 100755 --- a/scripts/training/bilingual-lm/train_nplm.py +++ b/scripts/training/bilingual-lm/train_nplm.py @@ -39,7 +39,8 @@ parser.add_argument("--input-words-file", dest="input_words_file") parser.add_argument("--output-words-file", dest="output_words_file") parser.add_argument("--input_vocab_size", dest="input_vocab_size", type=int) parser.add_argument("--output_vocab_size", dest="output_vocab_size", type=int) - +parser.add_argument("--mmap", dest="mmap", action="store_true", + help="Use memory-mapped file (for lower memory consumption).") parser.set_defaults( working_dir="working", @@ -113,6 +114,11 @@ def main(options): options.working_dir, os.path.basename(options.corpus_stem) + ".numberized") + mmap_command = [] + if options.mmap: + in_file += '.mmap' + mmap_command = ['--mmap_file', '1'] + model_prefix = os.path.join( options.output_dir, options.output_model + ".model.nplm") train_args = [ @@ -127,9 +133,9 @@ def main(options): "--input_embedding_dimension", str(options.input_embedding), "--output_embedding_dimension", str(options.output_embedding), "--num_threads", str(options.threads), - "--activation_function", - options.activation_fn, - ] + validations_command + vocab_command + "--activation_function", options.activation_fn, + "--ngram_size", str(options.ngram_size), + ] + validations_command + vocab_command + mmap_command print("Train model command: ") print(', '.join(train_args)) diff --git a/scripts/training/rdlm/train_rdlm.py b/scripts/training/rdlm/train_rdlm.py index a7edbab36..289ab405c 100755 --- a/scripts/training/rdlm/train_rdlm.py +++ b/scripts/training/rdlm/train_rdlm.py @@ -94,11 +94,14 @@ parser.add_argument( "--output-words-file", dest="output_words_file", metavar="PATH", help="Output vocabulary (default: %(default)s).") parser.add_argument( - "--input_vocab_size", dest="input_vocab_size", type=int, metavar="INT", + "--input-vocab-size", dest="input_vocab_size", type=int, metavar="INT", help="Input vocabulary size (default: %(default)s).") parser.add_argument( "--output-vocab-size", dest="output_vocab_size", type=int, metavar="INT", help="Output vocabulary size (default: %(default)s).") +parser.add_argument( + "--mmap", dest="mmap", action="store_true", + help="Use memory-mapped file (for lower memory consumption).") parser.set_defaults( @@ -195,11 +198,14 @@ def main(options): "extracting vocabulary from training text.\n") prepare_vocabulary(options) + numberized_file = os.path.basename(options.corpus_stem) + '.numberized' + train_file = numberized_file + if options.mmap: + train_file += '.mmap' + extract_options = extract_syntactic_ngrams.create_parser().parse_args([ '--input', options.corpus_stem, - '--output', os.path.join( - options.working_dir, - os.path.basename(options.corpus_stem) + '.numberized'), + '--output', os.path.join(options.working_dir, numberized_file), '--vocab', options.input_words_file, '--output_vocab', options.output_words_file, '--right_context', 
str(options.right_context_size), @@ -222,6 +228,23 @@ def main(options): else: options.validation_file = None + if options.mmap: + try: + os.remove(os.path.join(options.working_dir, train_file)) + except OSError: + pass + mmap_cmd = [os.path.join(options.nplm_home, 'src', 'createMmap'), + '--input_file', + os.path.join(options.working_dir, numberized_file), + '--output_file', + os.path.join(options.working_dir, train_file) + ] + sys.stderr.write('creating memory-mapped file\n') + sys.stderr.write('executing: ' + ', '.join(mmap_cmd) + '\n') + ret = subprocess.call(mmap_cmd) + if ret: + raise Exception("creating memory-mapped file failed") + sys.stderr.write('training neural network\n') train_nplm.main(options) @@ -234,7 +257,7 @@ def main(options): options.output_model + '.model.nplm.' + str(options.epochs)), os.path.join( options.working_dir, - os.path.basename(options.corpus_stem) + '.numberized'), + numberized_file), os.path.join(options.output_dir, options.output_model + '.model.nplm') ]) if ret: diff --git a/scripts/training/train-neurallm.py b/scripts/training/train-neurallm.py index fec859611..ae77a42af 100755 --- a/scripts/training/train-neurallm.py +++ b/scripts/training/train-neurallm.py @@ -87,6 +87,9 @@ parser.add_argument( parser.add_argument( "--vocab-size", dest="vocab_size", type=int, metavar="INT", help="Vocabulary size (default: %(default)s).") +parser.add_argument( + "--mmap", dest="mmap", action="store_true", + help="Use memory-mapped file (for lower memory consumption).") parser.set_defaults( working_dir="working", @@ -121,20 +124,43 @@ def main(options): if not os.path.exists(options.output_dir): os.makedirs(options.output_dir) + numberized_file = os.path.basename(options.corpus_stem) + '.numberized' + train_file = numberized_file + if options.mmap: + train_file += '.mmap' + extraction_cmd = [os.path.join(options.nplm_home, 'src', 'prepareNeuralLM'), '--train_text', options.corpus_stem, '--ngramize', '1', '--ngram_size', str(options.ngram_size), '--vocab_size', str(options.vocab_size), '--write_words_file', os.path.join(options.working_dir, options.words_file), - '--train_file', os.path.join(options.working_dir, os.path.basename(options.corpus_stem) + '.numberized') + '--train_file', os.path.join(options.working_dir, numberized_file) ] sys.stderr.write('extracting n-grams\n') + sys.stderr.write('executing: ' + ', '.join(extraction_cmd) + '\n') ret = subprocess.call(extraction_cmd) if ret: raise Exception("preparing neural LM failed") - + + if options.mmap: + try: + os.remove(os.path.join(options.working_dir, train_file)) + except OSError: + pass + mmap_cmd = [os.path.join(options.nplm_home, 'src', 'createMmap'), + '--input_file', + os.path.join(options.working_dir, numberized_file), + '--output_file', + os.path.join(options.working_dir, train_file) + ] + sys.stderr.write('creating memory-mapped file\n') + sys.stderr.write('executing: ' + ', '.join(mmap_cmd) + '\n') + ret = subprocess.call(mmap_cmd) + if ret: + raise Exception("creating memory-mapped file failed") + if options.validation_corpus: extraction_cmd = [os.path.join(options.nplm_home, 'src', 'prepareNeuralLM'), @@ -147,6 +173,7 @@ def main(options): ] sys.stderr.write('extracting n-grams (validation file)\n') + sys.stderr.write('executing: ' + ', '.join(extraction_cmd) + '\n') ret = subprocess.call(extraction_cmd) if ret: raise Exception("preparing neural LM failed") @@ -166,7 +193,7 @@ def main(options): average_options = averageNullEmbedding.parser.parse_args( ['-i', os.path.join(options.output_dir, 
options.output_model + '.model.nplm.' + str(options.epochs)), '-o', os.path.join(options.output_dir, options.output_model + '.model.nplm'), - '-t', os.path.join(options.working_dir, os.path.basename(options.corpus_stem) + '.numberized'), + '-t', os.path.join(options.working_dir, numberized_file), '-p', os.path.join(options.nplm_home, 'python')]) averageNullEmbedding.main(average_options) From 2f735998ca8755263ec8dcc30303358988519091 Mon Sep 17 00:00:00 2001 From: Phil Williams Date: Fri, 29 May 2015 18:46:02 +0100 Subject: [PATCH 019/108] Rename MosesTraining::SyntaxTree to MosesTraining::SyntaxNodeCollection This is the first step in a small-scale refactoring effort that will touch a lot of the syntax-related code in moses/phrase-extract. The end goals are: - a storage mechanism for general attribute/value pairs in XML-style tree / lattice input. E.g. the "pcfg-score" and "semantic-role" attributes in: I - consolidation of the various near-duplicate Tree / XmlTreeParser classes that have accumulated over the years (my fault) - general de-crufting --- phrase-extract/SentenceAlignmentWithSyntax.h | 4 +- phrase-extract/SyntaxTree.cpp | 48 +++---------------- phrase-extract/SyntaxTree.h | 23 +++------ phrase-extract/XmlTree.cpp | 12 ++--- phrase-extract/XmlTree.h | 2 +- phrase-extract/extract-ghkm/ExtractGHKM.cpp | 4 +- phrase-extract/extract-ghkm/ScfgRule.cpp | 8 ++-- phrase-extract/extract-ghkm/ScfgRule.h | 8 ++-- phrase-extract/extract-ghkm/XmlTreeParser.h | 2 +- phrase-extract/pcfg-common/xml_tree_parser.h | 2 +- phrase-extract/relax-parse-main.cpp | 12 ++--- phrase-extract/relax-parse.h | 8 ++-- .../syntax-common/xml_tree_parser.cc | 10 ++-- .../syntax-common/xml_tree_parser.h | 2 +- 14 files changed, 51 insertions(+), 94 deletions(-) diff --git a/phrase-extract/SentenceAlignmentWithSyntax.h b/phrase-extract/SentenceAlignmentWithSyntax.h index 8b9088770..a603f7722 100644 --- a/phrase-extract/SentenceAlignmentWithSyntax.h +++ b/phrase-extract/SentenceAlignmentWithSyntax.h @@ -36,8 +36,8 @@ namespace MosesTraining class SentenceAlignmentWithSyntax : public SentenceAlignment { public: - SyntaxTree targetTree; - SyntaxTree sourceTree; + SyntaxNodeCollection targetTree; + SyntaxNodeCollection sourceTree; std::set & m_targetLabelCollection; std::set & m_sourceLabelCollection; std::map & m_targetTopLabelCollection; diff --git a/phrase-extract/SyntaxTree.cpp b/phrase-extract/SyntaxTree.cpp index c50693e0d..7f641125e 100644 --- a/phrase-extract/SyntaxTree.cpp +++ b/phrase-extract/SyntaxTree.cpp @@ -1,6 +1,3 @@ -// $Id: SyntaxTree.cpp 1960 2008-12-15 12:52:38Z phkoehn $ -// vim:tabstop=2 - /*********************************************************************** Moses - factored phrase-based language decoder Copyright (C) 2009 University of Edinburgh @@ -29,12 +26,12 @@ namespace MosesTraining { -SyntaxTree::~SyntaxTree() +SyntaxNodeCollection::~SyntaxNodeCollection() { Clear(); } -void SyntaxTree::Clear() +void SyntaxNodeCollection::Clear() { m_top = 0; // loop through all m_nodes, delete them @@ -45,7 +42,7 @@ void SyntaxTree::Clear() m_index.clear(); } -SyntaxNode *SyntaxTree::AddNode( int startPos, int endPos, std::string label ) +SyntaxNode *SyntaxNodeCollection::AddNode( int startPos, int endPos, std::string label ) { SyntaxNode* newNode = new SyntaxNode( startPos, endPos, label ); m_nodes.push_back( newNode ); @@ -54,7 +51,7 @@ SyntaxNode *SyntaxTree::AddNode( int startPos, int endPos, std::string label ) return newNode; } -ParentNodes SyntaxTree::Parse() +ParentNodes 
SyntaxNodeCollection::Parse() { ParentNodes parents; @@ -94,12 +91,12 @@ ParentNodes SyntaxTree::Parse() return parents; } -bool SyntaxTree::HasNode( int startPos, int endPos ) const +bool SyntaxNodeCollection::HasNode( int startPos, int endPos ) const { return GetNodes( startPos, endPos).size() > 0; } -const std::vector< SyntaxNode* >& SyntaxTree::GetNodes( int startPos, int endPos ) const +const std::vector< SyntaxNode* >& SyntaxNodeCollection::GetNodes( int startPos, int endPos ) const { SyntaxTreeIndexIterator startIndex = m_index.find( startPos ); if (startIndex == m_index.end() ) @@ -112,15 +109,7 @@ const std::vector< SyntaxNode* >& SyntaxTree::GetNodes( int startPos, int endPos return endIndex->second; } -// for printing out tree -std::string SyntaxTree::ToString() const -{ - std::stringstream out; - out << *this; - return out.str(); -} - -void SyntaxTree::ConnectNodes() +void SyntaxNodeCollection::ConnectNodes() { typedef SyntaxTreeIndex2::const_reverse_iterator InnerIterator; @@ -162,27 +151,4 @@ void SyntaxTree::ConnectNodes() } } -std::ostream& operator<<(std::ostream& os, const SyntaxTree& t) -{ - size_t size = t.m_index.size(); - for(size_t length=1; length<=size; length++) { - for(size_t space=0; spaceGetLabel() + "#######"; - - os << label.substr(0,7) << " "; - } else { - os << "------- "; - } - } - os << std::endl; - } - return os; } - -} - diff --git a/phrase-extract/SyntaxTree.h b/phrase-extract/SyntaxTree.h index 6ffb5da34..649a6197b 100644 --- a/phrase-extract/SyntaxTree.h +++ b/phrase-extract/SyntaxTree.h @@ -1,6 +1,3 @@ -// $Id: SyntaxTree.h 1960 2008-12-15 12:52:38Z phkoehn $ -// vim:tabstop=2 - /*********************************************************************** Moses - factored phrase-based language decoder Copyright (C) 2009 University of Edinburgh @@ -20,12 +17,12 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ***********************************************************************/ - #pragma once -#include -#include + #include #include +#include +#include namespace MosesTraining { @@ -79,7 +76,7 @@ public: typedef std::vector< int > SplitPoints; typedef std::vector< SplitPoints > ParentNodes; -class SyntaxTree +class SyntaxNodeCollection { protected: std::vector< SyntaxNode* > m_nodes; @@ -93,14 +90,12 @@ protected: int m_size; std::vector< SyntaxNode* > m_emptyNode; - friend std::ostream& operator<<(std::ostream&, const SyntaxTree&); - public: - SyntaxTree() + SyntaxNodeCollection() : m_top(0) // m_top doesn't get set unless ConnectNodes is called. , m_size(0) {} - ~SyntaxTree(); + ~SyntaxNodeCollection(); SyntaxNode *AddNode( int startPos, int endPos, std::string label ); @@ -119,10 +114,6 @@ public: } void ConnectNodes(); void Clear(); - std::string ToString() const; }; -std::ostream& operator<<(std::ostream&, const SyntaxTree&); - -} - +} // namespace MosesTraining diff --git a/phrase-extract/XmlTree.cpp b/phrase-extract/XmlTree.cpp index 6efa1bf5c..d45fd99eb 100644 --- a/phrase-extract/XmlTree.cpp +++ b/phrase-extract/XmlTree.cpp @@ -1,6 +1,3 @@ -// $Id: XmlOption.cpp 1960 2008-12-15 12:52:38Z phkoehn $ -// vim:tabstop=2 - /*********************************************************************** Moses - factored phrase-based language decoder Copyright (C) 2006 University of Edinburgh @@ -228,7 +225,10 @@ vector TokenizeXml(const string& str) parse because we don't have the completed source parsed until after this function removes all the markup from it (CreateFromString in Sentence::Read). 
*/ -bool ProcessAndStripXMLTags(string &line, SyntaxTree &tree, set< string > &labelCollection, map< string, int > &topLabelCollection, bool unescapeSpecialChars ) +bool ProcessAndStripXMLTags(string &line, SyntaxNodeCollection &nodeCollection, + set< string > &labelCollection, + map< string, int > &topLabelCollection, + bool unescapeSpecialChars ) { //parse XML markup in translation line @@ -374,7 +374,7 @@ bool ProcessAndStripXMLTags(string &line, SyntaxTree &tree, set< string > &label cerr << "XML TAG LABEL IS: '" << label << "'" << endl; cerr << "XML SPAN IS: " << startPos << "-" << (endPos-1) << endl; } - SyntaxNode *node = tree.AddNode( startPos, endPos-1, label ); + SyntaxNode *node = nodeCollection.AddNode( startPos, endPos-1, label ); node->SetPcfgScore(pcfgScore); } } @@ -386,7 +386,7 @@ bool ProcessAndStripXMLTags(string &line, SyntaxTree &tree, set< string > &label } // collect top labels - const vector< SyntaxNode* >& topNodes = tree.GetNodes( 0, wordPos-1 ); + const vector< SyntaxNode* >& topNodes = nodeCollection.GetNodes( 0, wordPos-1 ); for( vector< SyntaxNode* >::const_iterator node = topNodes.begin(); node != topNodes.end(); node++ ) { SyntaxNode *n = *node; const string &label = n->GetLabel(); diff --git a/phrase-extract/XmlTree.h b/phrase-extract/XmlTree.h index 50b1c0acc..392192ae6 100644 --- a/phrase-extract/XmlTree.h +++ b/phrase-extract/XmlTree.h @@ -35,7 +35,7 @@ std::string Trim(const std::string& str, const std::string dropChars = " \t\n\r" std::string TrimXml(const std::string& str); bool isXmlTag(const std::string& tag); std::vector TokenizeXml(const std::string& str); -bool ProcessAndStripXMLTags(std::string &line, SyntaxTree &tree, std::set< std::string > &labelCollection, std::map< std::string, int > &topLabelCollection, bool unescape = true); +bool ProcessAndStripXMLTags(std::string &line, SyntaxNodeCollection &tree, std::set< std::string > &labelCollection, std::map< std::string, int > &topLabelCollection, bool unescape = true); std::string unescape(const std::string &str); diff --git a/phrase-extract/extract-ghkm/ExtractGHKM.cpp b/phrase-extract/extract-ghkm/ExtractGHKM.cpp index bc687ec6b..9e6aacc20 100644 --- a/phrase-extract/extract-ghkm/ExtractGHKM.cpp +++ b/phrase-extract/extract-ghkm/ExtractGHKM.cpp @@ -172,7 +172,7 @@ int ExtractGHKM::Main(int argc, char *argv[]) // Parse source tree and construct a SyntaxTree object. - MosesTraining::SyntaxTree sourceSyntaxTree; + MosesTraining::SyntaxNodeCollection sourceSyntaxTree; MosesTraining::SyntaxNode *sourceSyntaxTreeRoot=NULL; if (options.sourceLabels) { @@ -196,7 +196,7 @@ int ExtractGHKM::Main(int argc, char *argv[]) // Read source tokens. std::vector sourceTokens(ReadTokens(sourceLine)); - // Construct a source ParseTree object from the SyntaxTree object. + // Construct a source ParseTree object from the SyntaxNodeCollection object. 
std::auto_ptr sourceParseTree; if (options.sourceLabels) { diff --git a/phrase-extract/extract-ghkm/ScfgRule.cpp b/phrase-extract/extract-ghkm/ScfgRule.cpp index 01178b72c..94ff3c605 100644 --- a/phrase-extract/extract-ghkm/ScfgRule.cpp +++ b/phrase-extract/extract-ghkm/ScfgRule.cpp @@ -31,7 +31,7 @@ namespace GHKM { ScfgRule::ScfgRule(const Subgraph &fragment, - const MosesTraining::SyntaxTree *sourceSyntaxTree) + const MosesTraining::SyntaxNodeCollection *sourceSyntaxTree) : m_graphFragment(fragment) , m_sourceLHS("X", NonTerminal) , m_targetLHS(fragment.GetRoot()->GetLabel(), NonTerminal) @@ -133,9 +133,9 @@ ScfgRule::ScfgRule(const Subgraph &fragment, } } -void ScfgRule::PushSourceLabel(const MosesTraining::SyntaxTree *sourceSyntaxTree, - const Node *node, - const std::string &nonMatchingLabel) +void ScfgRule::PushSourceLabel( + const MosesTraining::SyntaxNodeCollection *sourceSyntaxTree, + const Node *node, const std::string &nonMatchingLabel) { ContiguousSpan span = Closure(node->GetSpan()); if (sourceSyntaxTree->HasNode(span.first,span.second)) { // does a source constituent match the span? diff --git a/phrase-extract/extract-ghkm/ScfgRule.h b/phrase-extract/extract-ghkm/ScfgRule.h index 94ee7b82e..b3d8ad017 100644 --- a/phrase-extract/extract-ghkm/ScfgRule.h +++ b/phrase-extract/extract-ghkm/ScfgRule.h @@ -41,7 +41,7 @@ class ScfgRule : public Rule { public: ScfgRule(const Subgraph &fragment, - const MosesTraining::SyntaxTree *sourceSyntaxTree = 0); + const MosesTraining::SyntaxNodeCollection *sourceSyntaxTree = 0); const Subgraph &GetGraphFragment() const { return m_graphFragment; @@ -78,9 +78,9 @@ public: } private: - void PushSourceLabel(const MosesTraining::SyntaxTree *sourceSyntaxTree, - const Node *node, - const std::string &nonMatchingLabel); + void PushSourceLabel( + const MosesTraining::SyntaxNodeCollection *sourceSyntaxTree, + const Node *node, const std::string &nonMatchingLabel); const Subgraph& m_graphFragment; Symbol m_sourceLHS; diff --git a/phrase-extract/extract-ghkm/XmlTreeParser.h b/phrase-extract/extract-ghkm/XmlTreeParser.h index ff0baeace..03450383a 100644 --- a/phrase-extract/extract-ghkm/XmlTreeParser.h +++ b/phrase-extract/extract-ghkm/XmlTreeParser.h @@ -58,7 +58,7 @@ private: std::set &m_labelSet; std::map &m_topLabelSet; std::string m_line; - MosesTraining::SyntaxTree m_tree; + MosesTraining::SyntaxNodeCollection m_tree; std::vector m_words; }; diff --git a/phrase-extract/pcfg-common/xml_tree_parser.h b/phrase-extract/pcfg-common/xml_tree_parser.h index 675a112d8..69754bb56 100644 --- a/phrase-extract/pcfg-common/xml_tree_parser.h +++ b/phrase-extract/pcfg-common/xml_tree_parser.h @@ -47,7 +47,7 @@ class XmlTreeParser { std::set m_labelSet; std::map m_topLabelSet; std::string m_line; - MosesTraining::SyntaxTree m_tree; + MosesTraining::SyntaxNodeCollection m_tree; std::vector m_words; }; diff --git a/phrase-extract/relax-parse-main.cpp b/phrase-extract/relax-parse-main.cpp index 5c9daa7ae..5bca886bf 100644 --- a/phrase-extract/relax-parse-main.cpp +++ b/phrase-extract/relax-parse-main.cpp @@ -43,7 +43,7 @@ int main(int argc, char* argv[]) // process into syntax tree representation set< string > labelCollection; // set of labels, not used map< string, int > topLabelCollection; // count of top labels, not used - SyntaxTree tree; + SyntaxNodeCollection tree; ProcessAndStripXMLTags( inBufferString, tree, labelCollection, topLabelCollection, false ); const vector< string > inWords = util::tokenize( inBufferString ); @@ -105,7 +105,7 @@ void init(int argc, 
char* argv[]) } } -void store( SyntaxTree &tree, const vector< string > &words ) +void store( SyntaxNodeCollection &tree, const vector< string > &words ) { // output words for( size_t i=0; i &words ) cout << endl; } -void LeftBinarize( SyntaxTree &tree, ParentNodes &parents ) +void LeftBinarize( SyntaxNodeCollection &tree, ParentNodes &parents ) { for(ParentNodes::const_iterator p = parents.begin(); p != parents.end(); p++) { const SplitPoints &point = *p; @@ -143,7 +143,7 @@ void LeftBinarize( SyntaxTree &tree, ParentNodes &parents ) } } -void RightBinarize( SyntaxTree &tree, ParentNodes &parents ) +void RightBinarize( SyntaxNodeCollection &tree, ParentNodes &parents ) { for(ParentNodes::const_iterator p = parents.begin(); p != parents.end(); p++) { const SplitPoints &point = *p; @@ -161,11 +161,11 @@ void RightBinarize( SyntaxTree &tree, ParentNodes &parents ) } } -void SAMT( SyntaxTree &tree, ParentNodes &parents ) +void SAMT( SyntaxNodeCollection &tree, ParentNodes &parents ) { int numWords = tree.GetNumWords(); - SyntaxTree newTree; // to store new nodes + SyntaxNodeCollection newTree; // to store new nodes // look through parents to combine children for(ParentNodes::const_iterator p = parents.begin(); p != parents.end(); p++) { diff --git a/phrase-extract/relax-parse.h b/phrase-extract/relax-parse.h index 9bd0bfb23..af41b0945 100644 --- a/phrase-extract/relax-parse.h +++ b/phrase-extract/relax-parse.h @@ -39,8 +39,8 @@ char SAMTLevel = 0; // functions void init(int argc, char* argv[]); -void store( MosesTraining::SyntaxTree &tree, const std::vector &words ); -void LeftBinarize( MosesTraining::SyntaxTree &tree, MosesTraining::ParentNodes &parents ); -void RightBinarize( MosesTraining::SyntaxTree &tree, MosesTraining::ParentNodes &parents ); -void SAMT( MosesTraining::SyntaxTree &tree, MosesTraining::ParentNodes &parents ); +void store( MosesTraining::SyntaxNodeCollection &tree, const std::vector &words ); +void LeftBinarize( MosesTraining::SyntaxNodeCollection &tree, MosesTraining::ParentNodes &parents ); +void RightBinarize( MosesTraining::SyntaxNodeCollection &tree, MosesTraining::ParentNodes &parents ); +void SAMT( MosesTraining::SyntaxNodeCollection &tree, MosesTraining::ParentNodes &parents ); diff --git a/phrase-extract/syntax-common/xml_tree_parser.cc b/phrase-extract/syntax-common/xml_tree_parser.cc index c6e3cd3c3..2f8a904fa 100644 --- a/phrase-extract/syntax-common/xml_tree_parser.cc +++ b/phrase-extract/syntax-common/xml_tree_parser.cc @@ -13,17 +13,17 @@ namespace Syntax { StringTree *XmlTreeParser::Parse(const std::string &line) { line_ = line; - tree_.Clear(); + node_collection_.Clear(); try { - if (!ProcessAndStripXMLTags(line_, tree_, label_set_, top_label_set_, - false)) { + if (!ProcessAndStripXMLTags(line_, node_collection_, label_set_, + top_label_set_, false)) { throw Exception(""); } } catch (const XmlException &e) { throw Exception(e.getMsg()); } - tree_.ConnectNodes(); - SyntaxNode *root = tree_.GetTop(); + node_collection_.ConnectNodes(); + SyntaxNode *root = node_collection_.GetTop(); assert(root); words_ = util::tokenize(line_); return ConvertTree(*root, words_); diff --git a/phrase-extract/syntax-common/xml_tree_parser.h b/phrase-extract/syntax-common/xml_tree_parser.h index a5563f63a..e530b84ef 100644 --- a/phrase-extract/syntax-common/xml_tree_parser.h +++ b/phrase-extract/syntax-common/xml_tree_parser.h @@ -26,7 +26,7 @@ class XmlTreeParser { std::set label_set_; std::map top_label_set_; std::string line_; - MosesTraining::SyntaxTree tree_; + 
MosesTraining::SyntaxNodeCollection node_collection_; std::vector words_; }; From 985e7bbfc30c6f124c546e769948caf22eacfc66 Mon Sep 17 00:00:00 2001 From: Phil Williams Date: Fri, 29 May 2015 20:57:25 +0100 Subject: [PATCH 020/108] Ongoing moses/phrase-extract refactoring --- phrase-extract/SentenceAlignmentWithSyntax.h | 2 +- phrase-extract/SyntaxNode.h | 75 +++++++++++++++++++ ...yntaxTree.cpp => SyntaxNodeCollection.cpp} | 7 +- .../{SyntaxTree.h => SyntaxNodeCollection.h} | 50 +------------ phrase-extract/XmlTree.cpp | 3 +- phrase-extract/XmlTree.h | 10 +-- phrase-extract/extract-ghkm/ExtractGHKM.cpp | 3 +- phrase-extract/extract-ghkm/ScfgRule.cpp | 7 +- phrase-extract/extract-ghkm/ScfgRule.h | 9 +-- phrase-extract/extract-ghkm/XmlTreeParser.h | 5 +- phrase-extract/extract-rules-main.cpp | 2 +- phrase-extract/pcfg-common/xml_tree_parser.h | 3 +- phrase-extract/relax-parse.h | 2 +- .../syntax-common/xml_tree_parser.h | 3 +- 14 files changed, 108 insertions(+), 73 deletions(-) create mode 100644 phrase-extract/SyntaxNode.h rename phrase-extract/{SyntaxTree.cpp => SyntaxNodeCollection.cpp} (96%) rename phrase-extract/{SyntaxTree.h => SyntaxNodeCollection.h} (69%) diff --git a/phrase-extract/SentenceAlignmentWithSyntax.h b/phrase-extract/SentenceAlignmentWithSyntax.h index a603f7722..604b6d0e2 100644 --- a/phrase-extract/SentenceAlignmentWithSyntax.h +++ b/phrase-extract/SentenceAlignmentWithSyntax.h @@ -28,7 +28,7 @@ #include "RuleExtractionOptions.h" #include "SentenceAlignment.h" -#include "SyntaxTree.h" +#include "SyntaxNodeCollection.h" namespace MosesTraining { diff --git a/phrase-extract/SyntaxNode.h b/phrase-extract/SyntaxNode.h new file mode 100644 index 000000000..46e0f456f --- /dev/null +++ b/phrase-extract/SyntaxNode.h @@ -0,0 +1,75 @@ +/*********************************************************************** + Moses - factored phrase-based language decoder + Copyright (C) 2009 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + ***********************************************************************/ + +#pragma once + +#include +#include +#include +#include + +namespace MosesTraining +{ + +class SyntaxNode +{ +protected: + int m_start, m_end; + std::string m_label; + std::vector< SyntaxNode* > m_children; + SyntaxNode* m_parent; + float m_pcfgScore; +public: + SyntaxNode( int startPos, int endPos, std::string label ) + :m_start(startPos) + ,m_end(endPos) + ,m_label(label) + ,m_parent(0) + ,m_pcfgScore(0.0f) { + } + int GetStart() const { + return m_start; + } + int GetEnd() const { + return m_end; + } + std::string GetLabel() const { + return m_label; + } + float GetPcfgScore() const { + return m_pcfgScore; + } + void SetPcfgScore(float score) { + m_pcfgScore = score; + } + SyntaxNode *GetParent() { + return m_parent; + } + void SetParent(SyntaxNode *parent) { + m_parent = parent; + } + void AddChild(SyntaxNode* child) { + m_children.push_back(child); + } + const std::vector< SyntaxNode* > &GetChildren() const { + return m_children; + } +}; + +} // namespace MosesTraining diff --git a/phrase-extract/SyntaxTree.cpp b/phrase-extract/SyntaxNodeCollection.cpp similarity index 96% rename from phrase-extract/SyntaxTree.cpp rename to phrase-extract/SyntaxNodeCollection.cpp index 7f641125e..099a5697f 100644 --- a/phrase-extract/SyntaxTree.cpp +++ b/phrase-extract/SyntaxNodeCollection.cpp @@ -18,7 +18,7 @@ ***********************************************************************/ -#include "SyntaxTree.h" +#include "SyntaxNodeCollection.h" #include #include @@ -42,7 +42,8 @@ void SyntaxNodeCollection::Clear() m_index.clear(); } -SyntaxNode *SyntaxNodeCollection::AddNode( int startPos, int endPos, std::string label ) +SyntaxNode *SyntaxNodeCollection::AddNode(int startPos, int endPos, + const std::string &label) { SyntaxNode* newNode = new SyntaxNode( startPos, endPos, label ); m_nodes.push_back( newNode ); @@ -151,4 +152,4 @@ void SyntaxNodeCollection::ConnectNodes() } } -} +} // namespace MosesTraining diff --git a/phrase-extract/SyntaxTree.h b/phrase-extract/SyntaxNodeCollection.h similarity index 69% rename from phrase-extract/SyntaxTree.h rename to phrase-extract/SyntaxNodeCollection.h index 649a6197b..70b14206d 100644 --- a/phrase-extract/SyntaxTree.h +++ b/phrase-extract/SyntaxNodeCollection.h @@ -24,55 +24,11 @@ #include #include +#include "SyntaxNode.h" + namespace MosesTraining { -class SyntaxNode -{ -protected: - int m_start, m_end; - std::string m_label; - std::vector< SyntaxNode* > m_children; - SyntaxNode* m_parent; - float m_pcfgScore; -public: - SyntaxNode( int startPos, int endPos, std::string label ) - :m_start(startPos) - ,m_end(endPos) - ,m_label(label) - ,m_parent(0) - ,m_pcfgScore(0.0f) { - } - int GetStart() const { - return m_start; - } - int GetEnd() const { - return m_end; - } - std::string GetLabel() const { - return m_label; - } - float GetPcfgScore() const { - return m_pcfgScore; - } - void SetPcfgScore(float score) { - m_pcfgScore = score; - } - SyntaxNode *GetParent() { - return m_parent; - } - void SetParent(SyntaxNode *parent) { - m_parent = parent; - } - void AddChild(SyntaxNode* child) { - m_children.push_back(child); - } - const std::vector< SyntaxNode* > &GetChildren() const { - return m_children; - } -}; - - typedef std::vector< int > SplitPoints; typedef std::vector< 
SplitPoints > ParentNodes; @@ -97,7 +53,7 @@ public: ~SyntaxNodeCollection(); - SyntaxNode *AddNode( int startPos, int endPos, std::string label ); + SyntaxNode *AddNode( int startPos, int endPos, const std::string &label ); SyntaxNode *GetTop() { return m_top; diff --git a/phrase-extract/XmlTree.cpp b/phrase-extract/XmlTree.cpp index d45fd99eb..0f068fca7 100644 --- a/phrase-extract/XmlTree.cpp +++ b/phrase-extract/XmlTree.cpp @@ -24,7 +24,8 @@ #include #include #include -#include "SyntaxTree.h" + +#include "SyntaxNodeCollection.h" #include "XmlException.h" using namespace std; diff --git a/phrase-extract/XmlTree.h b/phrase-extract/XmlTree.h index 392192ae6..3b5afd4dd 100644 --- a/phrase-extract/XmlTree.h +++ b/phrase-extract/XmlTree.h @@ -1,6 +1,3 @@ -// $Id: XmlOption.cpp 1960 2008-12-15 12:52:38Z phkoehn $ -// vim:tabstop=2 - /*********************************************************************** Moses - factored phrase-based language decoder Copyright (C) 2006 University of Edinburgh @@ -21,11 +18,13 @@ ***********************************************************************/ #pragma once + #include #include #include #include -#include "SyntaxTree.h" + +#include "SyntaxNodeCollection.h" namespace MosesTraining { @@ -39,5 +38,4 @@ bool ProcessAndStripXMLTags(std::string &line, SyntaxNodeCollection &tree, std:: std::string unescape(const std::string &str); -} // namespace - +} // namespace MosesTraining diff --git a/phrase-extract/extract-ghkm/ExtractGHKM.cpp b/phrase-extract/extract-ghkm/ExtractGHKM.cpp index 9e6aacc20..937d88030 100644 --- a/phrase-extract/extract-ghkm/ExtractGHKM.cpp +++ b/phrase-extract/extract-ghkm/ExtractGHKM.cpp @@ -33,7 +33,8 @@ #include "Span.h" #include "StsgRule.h" #include "StsgRuleWriter.h" -#include "SyntaxTree.h" +#include "SyntaxNode.h" +#include "SyntaxNodeCollection.h" #include "tables-core.h" #include "XmlException.h" #include "XmlTree.h" diff --git a/phrase-extract/extract-ghkm/ScfgRule.cpp b/phrase-extract/extract-ghkm/ScfgRule.cpp index 94ff3c605..918c88eeb 100644 --- a/phrase-extract/extract-ghkm/ScfgRule.cpp +++ b/phrase-extract/extract-ghkm/ScfgRule.cpp @@ -19,11 +19,12 @@ #include "ScfgRule.h" +#include + #include "Node.h" #include "Subgraph.h" -#include "SyntaxTree.h" - -#include +#include "SyntaxNode.h" +#include "SyntaxNodeCollection.h" namespace Moses { diff --git a/phrase-extract/extract-ghkm/ScfgRule.h b/phrase-extract/extract-ghkm/ScfgRule.h index b3d8ad017..c8b76114a 100644 --- a/phrase-extract/extract-ghkm/ScfgRule.h +++ b/phrase-extract/extract-ghkm/ScfgRule.h @@ -19,16 +19,16 @@ #pragma once -#include "Alignment.h" -#include "Rule.h" -#include "SyntaxTree.h" - #include #include #include #include #include +#include "Alignment.h" +#include "Rule.h" +#include "SyntaxNodeCollection.h" + namespace Moses { namespace GHKM @@ -95,4 +95,3 @@ private: } // namespace GHKM } // namespace Moses - diff --git a/phrase-extract/extract-ghkm/XmlTreeParser.h b/phrase-extract/extract-ghkm/XmlTreeParser.h index 03450383a..db9fa8bf2 100644 --- a/phrase-extract/extract-ghkm/XmlTreeParser.h +++ b/phrase-extract/extract-ghkm/XmlTreeParser.h @@ -23,14 +23,15 @@ #include "Exception.h" -#include "SyntaxTree.h" - #include #include #include #include #include +#include "SyntaxNode.h" +#include "SyntaxNodeCollection.h" + namespace Moses { namespace GHKM diff --git a/phrase-extract/extract-rules-main.cpp b/phrase-extract/extract-rules-main.cpp index 50baa4e0d..825f12d89 100644 --- a/phrase-extract/extract-rules-main.cpp +++ b/phrase-extract/extract-rules-main.cpp @@ 
-41,7 +41,7 @@ #include "HoleCollection.h" #include "RuleExist.h" #include "SentenceAlignmentWithSyntax.h" -#include "SyntaxTree.h" +#include "SyntaxNode.h" #include "tables-core.h" #include "XmlTree.h" #include "InputFileStream.h" diff --git a/phrase-extract/pcfg-common/xml_tree_parser.h b/phrase-extract/pcfg-common/xml_tree_parser.h index 69754bb56..8605c0691 100644 --- a/phrase-extract/pcfg-common/xml_tree_parser.h +++ b/phrase-extract/pcfg-common/xml_tree_parser.h @@ -28,7 +28,8 @@ #include #include "pcfg_tree.h" -#include "SyntaxTree.h" +#include "SyntaxNode.h" +#include "SyntaxNodeCollection.h" namespace MosesTraining { namespace Syntax { diff --git a/phrase-extract/relax-parse.h b/phrase-extract/relax-parse.h index af41b0945..a00aa6deb 100644 --- a/phrase-extract/relax-parse.h +++ b/phrase-extract/relax-parse.h @@ -28,7 +28,7 @@ #include #include -#include "SyntaxTree.h" +#include "SyntaxNodeCollection.h" #include "XmlTree.h" #define LINE_MAX_LENGTH 1000000 diff --git a/phrase-extract/syntax-common/xml_tree_parser.h b/phrase-extract/syntax-common/xml_tree_parser.h index e530b84ef..c84ea25ec 100644 --- a/phrase-extract/syntax-common/xml_tree_parser.h +++ b/phrase-extract/syntax-common/xml_tree_parser.h @@ -5,7 +5,8 @@ #include #include -#include "SyntaxTree.h" +#include "SyntaxNode.h" +#include "SyntaxNodeCollection.h" #include "exception.h" #include "string_tree.h" From ab9b9ae3493da391d19b98551af966b6426bd400 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Sun, 31 May 2015 21:27:55 +0400 Subject: [PATCH 021/108] 1st pass to automatically beautify --- cruise-control/test_all_new_commits.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cruise-control/test_all_new_commits.sh b/cruise-control/test_all_new_commits.sh index 1e0a9c47f..7f1520452 100755 --- a/cruise-control/test_all_new_commits.sh +++ b/cruise-control/test_all_new_commits.sh @@ -107,6 +107,7 @@ function run_single_test () { #regtest_dir=$PWD/$(basename $regtest_file .tgz) cd .. + ./scripts/other/beautify.py --format echo "## ./bjam clean" >> $longlog ./bjam clean $MCC_CONFIGURE_ARGS --with-regtest=$regtest_dir >> $longlog 2>&1 || warn "bjam clean failed, suspicious" @@ -153,8 +154,10 @@ function run_single_test () { date >> $longlog if [ -z "$err" ]; then + git commit -am "automatic daily beautifier" status="OK" else + git reset --hard HEAD status="FAIL:$err" fi echo "## Status: $status" >> $longlog From afb032014dc22cc184046fbf99fc08569781afe5 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Sun, 31 May 2015 21:51:43 +0400 Subject: [PATCH 022/108] skip perltidy. Not available on thor (Ubuntu 12.04) --- cruise-control/test_all_new_commits.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cruise-control/test_all_new_commits.sh b/cruise-control/test_all_new_commits.sh index 7f1520452..bb9305768 100755 --- a/cruise-control/test_all_new_commits.sh +++ b/cruise-control/test_all_new_commits.sh @@ -107,7 +107,7 @@ function run_single_test () { #regtest_dir=$PWD/$(basename $regtest_file .tgz) cd .. 
- ./scripts/other/beautify.py --format + ./scripts/other/beautify.py --format --skip-perltidy echo "## ./bjam clean" >> $longlog ./bjam clean $MCC_CONFIGURE_ARGS --with-regtest=$regtest_dir >> $longlog 2>&1 || warn "bjam clean failed, suspicious" @@ -190,7 +190,7 @@ done #### Main loop over all commits for i in $MCC_SCAN_BRANCHES; do - warn "On brach $i" + warn "On branch $i" git rev-list $i \ | while read commit; do first_char=$(echo $commit | grep -o '^.') From c754aef37a804c5ee74ff7b5ccbe1b4cdc80e81c Mon Sep 17 00:00:00 2001 From: Phil Williams Date: Mon, 1 Jun 2015 08:45:04 +0100 Subject: [PATCH 023/108] Oops. Fix compile error. --- phrase-extract/extract-ghkm/StsgRule.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/phrase-extract/extract-ghkm/StsgRule.cpp b/phrase-extract/extract-ghkm/StsgRule.cpp index 83398f80a..271249e1b 100644 --- a/phrase-extract/extract-ghkm/StsgRule.cpp +++ b/phrase-extract/extract-ghkm/StsgRule.cpp @@ -2,7 +2,6 @@ #include "Node.h" #include "Subgraph.h" -#include "SyntaxTree.h" #include From f3ccd68bee73e2d8dfe8b7d57c9ea2b33d0d99ae Mon Sep 17 00:00:00 2001 From: Phil Williams Date: Mon, 1 Jun 2015 10:35:50 +0100 Subject: [PATCH 024/108] Add ConstPreOrderIterator to MosesTraining::Syntax::Tree --- phrase-extract/syntax-common/tree-inl.h | 43 +++++++++++++---------- phrase-extract/syntax-common/tree.h | 28 +++++++++------ phrase-extract/syntax-common/tree_test.cc | 40 +++++++++++++++++++++ 3 files changed, 83 insertions(+), 28 deletions(-) diff --git a/phrase-extract/syntax-common/tree-inl.h b/phrase-extract/syntax-common/tree-inl.h index 2ba55df1a..9101fc490 100644 --- a/phrase-extract/syntax-common/tree-inl.h +++ b/phrase-extract/syntax-common/tree-inl.h @@ -35,23 +35,24 @@ std::size_t Tree::Depth() const { } template -class Tree::PreOrderIterator { +template +class Tree::PreOrderIter { public: - PreOrderIterator(); - PreOrderIterator(Tree &); + PreOrderIter(); + PreOrderIter(V &); - Tree &operator*() { return *node_; } - Tree *operator->() { return node_; } + V &operator*() { return *node_; } + V *operator->() { return node_; } - PreOrderIterator &operator++(); - PreOrderIterator operator++(int); + PreOrderIter &operator++(); + PreOrderIter operator++(int); - bool operator==(const Tree::PreOrderIterator &); - bool operator!=(const Tree::PreOrderIterator &); + bool operator==(const PreOrderIter &); + bool operator!=(const PreOrderIter &); private: // Pointer to the current node. - Tree *node_; + V *node_; // Stack of indices defining the position of node_ within the child vectors // of its ancestors. @@ -59,17 +60,20 @@ class Tree::PreOrderIterator { }; template -Tree::PreOrderIterator::PreOrderIterator() +template +Tree::PreOrderIter::PreOrderIter() : node_(0) { } template -Tree::PreOrderIterator::PreOrderIterator(Tree &t) +template +Tree::PreOrderIter::PreOrderIter(V &t) : node_(&t) { } template -typename Tree::PreOrderIterator &Tree::PreOrderIterator::operator++() { +template +Tree::PreOrderIter &Tree::PreOrderIter::operator++() { // If the current node has children then visit the left-most child next. if (!node_->children().empty()) { index_stack_.push(0); @@ -79,7 +83,7 @@ typename Tree::PreOrderIterator &Tree::PreOrderIterator::operator++() { // Otherwise, try node's ancestors until either a node is found with a // sibling to the right or we reach the root (in which case the traversal // is complete). 
- Tree *ancestor = node_->parent_; + V *ancestor = node_->parent_; while (ancestor) { std::size_t index = index_stack_.top(); index_stack_.pop(); @@ -95,19 +99,22 @@ typename Tree::PreOrderIterator &Tree::PreOrderIterator::operator++() { } template -typename Tree::PreOrderIterator Tree::PreOrderIterator::operator++(int) { - PreOrderIterator tmp(*this); +template +Tree::PreOrderIter Tree::PreOrderIter::operator++(int) { + PreOrderIter tmp(*this); ++*this; return tmp; } template -bool Tree::PreOrderIterator::operator==(const PreOrderIterator &rhs) { +template +bool Tree::PreOrderIter::operator==(const PreOrderIter &rhs) { return node_ == rhs.node_; } template -bool Tree::PreOrderIterator::operator!=(const PreOrderIterator &rhs) { +template +bool Tree::PreOrderIter::operator!=(const PreOrderIter &rhs) { return node_ != rhs.node_; } diff --git a/phrase-extract/syntax-common/tree.h b/phrase-extract/syntax-common/tree.h index 52adaa699..e37c2c21f 100644 --- a/phrase-extract/syntax-common/tree.h +++ b/phrase-extract/syntax-common/tree.h @@ -61,23 +61,31 @@ class Tree { // // All iterators are forward iterators. Example use: // - // Tree &root = GetMeATree(); - // for (Tree::PreOrderIterator p(root); - // p != Tree::PreOrderIterator(); ++p) { - // std::cout << p->value() << " "; + // const Tree &root = GetMeATree(); + // for (Tree::ConstPreOrderIterator p(root); + // p != Tree::ConstPreOrderIterator(); ++p) { + // std::cout << p->value() << "\n"; // } + private: + // Use templates to avoid code duplication between const and non-const + // iterators. V is the value type: either Tree or const Tree. + template class PreOrderIter; + // template class PostOrderIter; TODO + // template class LeafIter; TODO + + public: // Pre-order iterators. - class PreOrderIterator; - // class ConstPreOrderIterator; TODO + typedef PreOrderIter > PreOrderIterator; + typedef PreOrderIter > ConstPreOrderIterator; // Post-order iterators. - // class PostOrderIterator; TODO - // class ConstPostOrderIterator; TODO + // typedef PostOrderIter > PostOrderIterator; TODO + // typedef PostOrderIter > ConstPostOrderIterator; TODO // Leaf iterators (left-to-right). 
- // class LeafIterator; TODO - // class ConstLeafIterator; TODO + // typedef LeafIter > LeafIterator; TODO + // typedef LeafIter > ConstLeafIterator; TODO private: T value_; diff --git a/phrase-extract/syntax-common/tree_test.cc b/phrase-extract/syntax-common/tree_test.cc index 0a54ad3f1..198f52310 100644 --- a/phrase-extract/syntax-common/tree_test.cc +++ b/phrase-extract/syntax-common/tree_test.cc @@ -61,6 +61,46 @@ BOOST_AUTO_TEST_CASE(pre_order_2) { BOOST_REQUIRE(p == end); } +// Test Tree<>::ConstPreOrderIterator on this tree: (1 (2 (3 (4 (5) (6)))) (7)) +BOOST_AUTO_TEST_CASE(const_pre_order_1) { + boost::scoped_ptr > root(new Tree(1)); + root->children().push_back(new Tree(2)); + root->children()[0]->children().push_back(new Tree(3)); + root->children()[0]->children()[0]->children().push_back(new Tree(4)); + root->children()[0]->children()[0]->children()[0]->children().push_back( + new Tree(5)); + root->children()[0]->children()[0]->children()[0]->children().push_back( + new Tree(6)); + root->children().push_back(new Tree(7)); + root->SetParents(); + + Tree::ConstPreOrderIterator p(*root); + Tree::ConstPreOrderIterator end; + + BOOST_REQUIRE(p != end); + BOOST_REQUIRE(p->value() == 1); + ++p; + BOOST_REQUIRE(p != end); + BOOST_REQUIRE(p->value() == 2); + ++p; + BOOST_REQUIRE(p != end); + BOOST_REQUIRE(p->value() == 3); + ++p; + BOOST_REQUIRE(p != end); + BOOST_REQUIRE(p->value() == 4); + ++p; + BOOST_REQUIRE(p != end); + BOOST_REQUIRE(p->value() == 5); + ++p; + BOOST_REQUIRE(p != end); + BOOST_REQUIRE(p->value() == 6); + ++p; + BOOST_REQUIRE(p != end); + BOOST_REQUIRE(p->value() == 7); + ++p; + BOOST_REQUIRE(p == end); +} + } // namespace } // namespace Syntax } // namespace MosesTraining From bf42fa058c424b642afd91a40257bff1c4c82241 Mon Sep 17 00:00:00 2001 From: Phil Williams Date: Mon, 1 Jun 2015 11:01:00 +0100 Subject: [PATCH 025/108] Add LeafIterator and ConstLeafIterator to MosesTraining::Syntax::Tree --- phrase-extract/syntax-common/tree-inl.h | 87 +++++++++++++++++++++++ phrase-extract/syntax-common/tree.h | 6 +- phrase-extract/syntax-common/tree_test.cc | 40 +++++++++++ 3 files changed, 130 insertions(+), 3 deletions(-) diff --git a/phrase-extract/syntax-common/tree-inl.h b/phrase-extract/syntax-common/tree-inl.h index 9101fc490..811bae2d2 100644 --- a/phrase-extract/syntax-common/tree-inl.h +++ b/phrase-extract/syntax-common/tree-inl.h @@ -118,5 +118,92 @@ bool Tree::PreOrderIter::operator!=(const PreOrderIter &rhs) { return node_ != rhs.node_; } +template +template +class Tree::LeafIter { + public: + LeafIter(); + LeafIter(V &); + + V &operator*() { return *node_; } + V *operator->() { return node_; } + + LeafIter &operator++(); + LeafIter operator++(int); + + bool operator==(const LeafIter &); + bool operator!=(const LeafIter &); + + private: + // Pointer to the current node. + V *node_; + + // Stack of indices defining the position of node_ within the child vectors + // of its ancestors. + std::stack index_stack_; +}; + +template +template +Tree::LeafIter::LeafIter() + : node_(0) { +} + +template +template +Tree::LeafIter::LeafIter(V &t) + : node_(&t) { + // Navigate to the first leaf. + while (!node_->IsLeaf()) { + index_stack_.push(0); + node_ = node_->children()[0]; + } +} + +template +template +Tree::LeafIter &Tree::LeafIter::operator++() { + // Try node's ancestors until either a node is found with a sibling to the + // right or we reach the root (in which case the traversal is complete). 
+ V *ancestor = node_->parent_; + while (ancestor) { + std::size_t index = index_stack_.top(); + index_stack_.pop(); + if (index+1 < ancestor->children_.size()) { + index_stack_.push(index+1); + node_ = ancestor->children()[index+1]; + // Navigate to the first leaf. + while (!node_->IsLeaf()) { + index_stack_.push(0); + node_ = node_->children()[0]; + } + return *this; + } + ancestor = ancestor->parent_; + } + node_ = 0; + return *this; +} + +template +template +Tree::LeafIter Tree::LeafIter::operator++(int) { + LeafIter tmp(*this); + ++*this; + return tmp; +} + +template +template +bool Tree::LeafIter::operator==(const LeafIter &rhs) { + return node_ == rhs.node_; +} + +template +template +bool Tree::LeafIter::operator!=(const LeafIter &rhs) { + return node_ != rhs.node_; +} + } // namespace Syntax } // namespace MosesTraining diff --git a/phrase-extract/syntax-common/tree.h b/phrase-extract/syntax-common/tree.h index e37c2c21f..8cec07a54 100644 --- a/phrase-extract/syntax-common/tree.h +++ b/phrase-extract/syntax-common/tree.h @@ -72,7 +72,7 @@ class Tree { // iterators. V is the value type: either Tree or const Tree. template class PreOrderIter; // template class PostOrderIter; TODO - // template class LeafIter; TODO + template class LeafIter; public: // Pre-order iterators. @@ -84,8 +84,8 @@ class Tree { // typedef PostOrderIter > ConstPostOrderIterator; TODO // Leaf iterators (left-to-right). - // typedef LeafIter > LeafIterator; TODO - // typedef LeafIter > ConstLeafIterator; TODO + typedef LeafIter > LeafIterator; + typedef LeafIter > ConstLeafIterator; private: T value_; diff --git a/phrase-extract/syntax-common/tree_test.cc b/phrase-extract/syntax-common/tree_test.cc index 198f52310..8e689f000 100644 --- a/phrase-extract/syntax-common/tree_test.cc +++ b/phrase-extract/syntax-common/tree_test.cc @@ -101,6 +101,46 @@ BOOST_AUTO_TEST_CASE(const_pre_order_1) { BOOST_REQUIRE(p == end); } +// Test Tree<>::LeafIterator with a trivial, single-node tree. 
+BOOST_AUTO_TEST_CASE(leaf_1) { + boost::scoped_ptr > root(new Tree(123)); + Tree::LeafIterator p(*root); + BOOST_REQUIRE(p != Tree::LeafIterator()); + BOOST_REQUIRE(p->value() == 123); + ++p; + BOOST_REQUIRE(p == Tree::LeafIterator()); +} + +// Test Tree<>::LeafIterator on this tree: (1 (2 3) (4) (5 6 (7 8))) +BOOST_AUTO_TEST_CASE(leaf_2) { + boost::scoped_ptr > root(new Tree(1)); + root->children().push_back(new Tree(2)); + root->children()[0]->children().push_back(new Tree(3)); + root->children().push_back(new Tree(4)); + root->children().push_back(new Tree(5)); + root->children()[2]->children().push_back(new Tree(6)); + root->children()[2]->children().push_back(new Tree(7)); + root->children()[2]->children()[1]->children().push_back(new Tree(8)); + root->SetParents(); + + Tree::LeafIterator p(*root); + Tree::LeafIterator end; + + BOOST_REQUIRE(p != end); + BOOST_REQUIRE(p->value() == 3); + ++p; + BOOST_REQUIRE(p != end); + BOOST_REQUIRE(p->value() == 4); + ++p; + BOOST_REQUIRE(p != end); + BOOST_REQUIRE(p->value() == 6); + ++p; + BOOST_REQUIRE(p != end); + BOOST_REQUIRE(p->value() == 8); + ++p; + BOOST_REQUIRE(p == end); +} + } // namespace } // namespace Syntax } // namespace MosesTraining From f61091e38dc597644c76b65f3c1e0ed6cbc641ab Mon Sep 17 00:00:00 2001 From: Phil Williams Date: Mon, 1 Jun 2015 14:23:25 +0100 Subject: [PATCH 026/108] Ongoing moses/phrase-extract refactoring --- .../extract-ghkm/AlignmentGraph.cpp | 6 +- phrase-extract/extract-ghkm/AlignmentGraph.h | 2 +- phrase-extract/extract-ghkm/ExtractGHKM.cpp | 30 ++++----- phrase-extract/extract-ghkm/ExtractGHKM.h | 3 +- phrase-extract/extract-ghkm/Jamfile | 2 +- phrase-extract/extract-ghkm/ParseTree.cpp | 56 ---------------- phrase-extract/extract-ghkm/ParseTree.h | 67 ++----------------- phrase-extract/extract-ghkm/XmlTreeParser.cpp | 14 ++-- phrase-extract/extract-ghkm/XmlTreeParser.h | 3 +- 9 files changed, 32 insertions(+), 151 deletions(-) delete mode 100644 phrase-extract/extract-ghkm/ParseTree.cpp diff --git a/phrase-extract/extract-ghkm/AlignmentGraph.cpp b/phrase-extract/extract-ghkm/AlignmentGraph.cpp index 974188dbd..52a4b41db 100644 --- a/phrase-extract/extract-ghkm/AlignmentGraph.cpp +++ b/phrase-extract/extract-ghkm/AlignmentGraph.cpp @@ -212,13 +212,13 @@ Node *AlignmentGraph::CopyParseTree(const ParseTree *root) { NodeType nodeType = (root->IsLeaf()) ? 
TARGET : TREE; - std::auto_ptr n(new Node(root->GetLabel(), nodeType)); + std::auto_ptr n(new Node(root->value().GetLabel(), nodeType)); if (nodeType == TREE) { - n->SetPcfgScore(root->GetPcfgScore()); + n->SetPcfgScore(root->value().GetPcfgScore()); } - const std::vector &children = root->GetChildren(); + const std::vector &children = root->children(); std::vector childNodes; childNodes.reserve(children.size()); for (std::vector::const_iterator p(children.begin()); diff --git a/phrase-extract/extract-ghkm/AlignmentGraph.h b/phrase-extract/extract-ghkm/AlignmentGraph.h index cf26b8c27..7ae3784cd 100644 --- a/phrase-extract/extract-ghkm/AlignmentGraph.h +++ b/phrase-extract/extract-ghkm/AlignmentGraph.h @@ -23,6 +23,7 @@ #include "Alignment.h" #include "Options.h" +#include "ParseTree.h" #include #include @@ -34,7 +35,6 @@ namespace GHKM { class Node; -class ParseTree; class Subgraph; class AlignmentGraph diff --git a/phrase-extract/extract-ghkm/ExtractGHKM.cpp b/phrase-extract/extract-ghkm/ExtractGHKM.cpp index 937d88030..7891bc2c7 100644 --- a/phrase-extract/extract-ghkm/ExtractGHKM.cpp +++ b/phrase-extract/extract-ghkm/ExtractGHKM.cpp @@ -828,24 +828,22 @@ void ExtractGHKM::CollectWordLabelCounts( std::map &wordCount, std::map &wordLabel) { - std::vector leaves; - root.GetLeaves(std::back_inserter(leaves)); - for (std::vector::const_iterator p = leaves.begin(); - p != leaves.end(); ++p) { - const ParseTree &leaf = **p; - const std::string &word = leaf.GetLabel(); - const ParseTree *ancestor = leaf.GetParent(); + for (ParseTree::ConstLeafIterator p(root); + p != ParseTree::ConstLeafIterator(); ++p) { + const ParseTree &leaf = *p; + const std::string &word = leaf.value().GetLabel(); + const ParseTree *ancestor = leaf.parent(); // If unary rule elimination is enabled and this word is at the end of a // chain of unary rewrites, e.g. // PN-SB -> NE -> word // then record the constituent label at the top of the chain instead of // the part-of-speech label. 
while (!options.allowUnary && - ancestor->GetParent() && - ancestor->GetParent()->GetChildren().size() == 1) { - ancestor = ancestor->GetParent(); + ancestor->parent() && + ancestor->parent()->children().size() == 1) { + ancestor = ancestor->parent(); } - const std::string &label = ancestor->GetLabel(); + const std::string &label = ancestor->value().GetLabel(); ++wordCount[word]; wordLabel[word] = label; } @@ -854,12 +852,10 @@ void ExtractGHKM::CollectWordLabelCounts( std::vector ExtractGHKM::ReadTokens(const ParseTree &root) const { std::vector tokens; - std::vector leaves; - root.GetLeaves(std::back_inserter(leaves)); - for (std::vector::const_iterator p = leaves.begin(); - p != leaves.end(); ++p) { - const ParseTree &leaf = **p; - const std::string &word = leaf.GetLabel(); + for (ParseTree::ConstLeafIterator p(root); + p != ParseTree::ConstLeafIterator(); ++p) { + const ParseTree &leaf = *p; + const std::string &word = leaf.value().GetLabel(); tokens.push_back(word); } return tokens; diff --git a/phrase-extract/extract-ghkm/ExtractGHKM.h b/phrase-extract/extract-ghkm/ExtractGHKM.h index 68babdccf..5954e7425 100644 --- a/phrase-extract/extract-ghkm/ExtractGHKM.h +++ b/phrase-extract/extract-ghkm/ExtractGHKM.h @@ -25,6 +25,8 @@ #include #include +#include "ParseTree.h" + namespace Moses { @@ -34,7 +36,6 @@ namespace GHKM { struct Options; -class ParseTree; class ExtractGHKM { diff --git a/phrase-extract/extract-ghkm/Jamfile b/phrase-extract/extract-ghkm/Jamfile index f2d1ac5a8..4692937de 100644 --- a/phrase-extract/extract-ghkm/Jamfile +++ b/phrase-extract/extract-ghkm/Jamfile @@ -1 +1 @@ -exe extract-ghkm : [ glob *.cpp ] ..//deps ../..//boost_iostreams ../..//boost_program_options ../..//z : .. ; +exe extract-ghkm : [ glob *.cpp ] ..//syntax-common ..//deps ../..//boost_iostreams ../..//boost_program_options ../..//z : .. ; diff --git a/phrase-extract/extract-ghkm/ParseTree.cpp b/phrase-extract/extract-ghkm/ParseTree.cpp deleted file mode 100644 index f86486487..000000000 --- a/phrase-extract/extract-ghkm/ParseTree.cpp +++ /dev/null @@ -1,56 +0,0 @@ -/*********************************************************************** - Moses - statistical machine translation system - Copyright (C) 2006-2011 University of Edinburgh - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. 
- - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, write to the Free Software - Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -***********************************************************************/ - -#include "ParseTree.h" - -namespace Moses -{ -namespace GHKM -{ - -ParseTree::~ParseTree() -{ - for (std::vector::iterator p(m_children.begin()); - p != m_children.end(); ++p) { - delete *p; - } -} - -void ParseTree::SetChildren(const std::vector &children) -{ - m_children = children; -} - -void ParseTree::SetParent(ParseTree *parent) -{ - m_parent = parent; -} - -void ParseTree::AddChild(ParseTree *child) -{ - m_children.push_back(child); -} - -bool ParseTree::IsLeaf() const -{ - return m_children.empty(); -} - -} // namespace GHKM -} // namespace Moses diff --git a/phrase-extract/extract-ghkm/ParseTree.h b/phrase-extract/extract-ghkm/ParseTree.h index 694286c9d..f0b83f63f 100644 --- a/phrase-extract/extract-ghkm/ParseTree.h +++ b/phrase-extract/extract-ghkm/ParseTree.h @@ -21,75 +21,16 @@ #ifndef EXTRACT_GHKM_PARSE_TREE_H_ #define EXTRACT_GHKM_PARSE_TREE_H_ -#include -#include +#include "syntax-common/tree.h" + +#include "SyntaxNode.h" namespace Moses { namespace GHKM { -class ParseTree -{ -public: - ParseTree(const std::string &label) - : m_label(label) - , m_parent(0) - , m_pcfgScore(0.0) {} - - ~ParseTree(); - - const std::string &GetLabel() const { - return m_label; - } - const std::vector &GetChildren() const { - return m_children; - } - const ParseTree *GetParent() const { - return m_parent; - } - float GetPcfgScore() const { - return m_pcfgScore; - } - - void SetParent(ParseTree *); - void SetChildren(const std::vector &); - void SetPcfgScore(float score) { - m_pcfgScore = score; - } - - void AddChild(ParseTree *); - - bool IsLeaf() const; - - template - void GetLeaves(OutputIterator) const; - -private: - // Disallow copying - ParseTree(const ParseTree &); - ParseTree &operator=(const ParseTree &); - - std::string m_label; - std::vector m_children; - ParseTree *m_parent; - float m_pcfgScore; // log probability -}; - -template -void ParseTree::GetLeaves(OutputIterator result) const -{ - if (IsLeaf()) { - *result++ = this; - } else { - std::vector::const_iterator p = m_children.begin(); - std::vector::const_iterator end = m_children.end(); - while (p != end) { - ParseTree &child = **p++; - child.GetLeaves(result); - } - } -} +typedef MosesTraining::Syntax::Tree ParseTree; } // namespace GHKM } // namespace Moses diff --git a/phrase-extract/extract-ghkm/XmlTreeParser.cpp b/phrase-extract/extract-ghkm/XmlTreeParser.cpp index f9800c8e0..671b03a78 100644 --- a/phrase-extract/extract-ghkm/XmlTreeParser.cpp +++ b/phrase-extract/extract-ghkm/XmlTreeParser.cpp @@ -66,8 +66,7 @@ std::auto_ptr XmlTreeParser::ConvertTree( const SyntaxNode &tree, const std::vector &words) { - std::auto_ptr root(new ParseTree(tree.GetLabel())); - root->SetPcfgScore(tree.GetPcfgScore()); + std::auto_ptr root(new ParseTree(tree)); const std::vector &children = tree.GetChildren(); if (children.empty()) { if (tree.GetStart() != tree.GetEnd()) { @@ -76,16 +75,17 @@ std::auto_ptr XmlTreeParser::ConvertTree( << "-" << tree.GetEnd() << "): this is currently unsupported"; throw Exception(msg.str()); } - std::auto_ptr leaf(new ParseTree(words[tree.GetStart()])); - leaf->SetParent(root.get()); - root->AddChild(leaf.release()); + SyntaxNode value(tree.GetStart(), tree.GetStart(), words[tree.GetStart()]); + std::auto_ptr leaf(new 
ParseTree(value)); + leaf->parent() = root.get(); + root->children().push_back(leaf.release()); } else { for (std::vector::const_iterator p = children.begin(); p != children.end(); ++p) { assert(*p); std::auto_ptr child = ConvertTree(**p, words); - child->SetParent(root.get()); - root->AddChild(child.release()); + child->parent() = root.get(); + root->children().push_back(child.release()); } } return root; diff --git a/phrase-extract/extract-ghkm/XmlTreeParser.h b/phrase-extract/extract-ghkm/XmlTreeParser.h index db9fa8bf2..a82862428 100644 --- a/phrase-extract/extract-ghkm/XmlTreeParser.h +++ b/phrase-extract/extract-ghkm/XmlTreeParser.h @@ -29,6 +29,7 @@ #include #include +#include "ParseTree.h" #include "SyntaxNode.h" #include "SyntaxNodeCollection.h" @@ -37,8 +38,6 @@ namespace Moses namespace GHKM { -class ParseTree; - // Parses a string in Moses' XML parse tree format and returns a ParseTree // object. class XmlTreeParser From f37415a259f19116d90c7bc82ecf16fd8bbbf23b Mon Sep 17 00:00:00 2001 From: Phil Williams Date: Mon, 1 Jun 2015 16:40:35 +0100 Subject: [PATCH 027/108] Ongoing moses/phrase-extract refactoring --- moses/FF/PhraseOrientationFeature.cpp | 70 +++++++------- moses/FF/PhraseOrientationFeature.h | 6 +- phrase-extract/InternalStructFeature.h | 4 - phrase-extract/SyntaxTree.h | 12 +++ phrase-extract/extract-ghkm/Alignment.cpp | 4 +- phrase-extract/extract-ghkm/Alignment.h | 4 +- .../extract-ghkm/AlignmentGraph.cpp | 25 ++--- phrase-extract/extract-ghkm/AlignmentGraph.h | 17 ++-- phrase-extract/extract-ghkm/ComposedRule.cpp | 12 +-- phrase-extract/extract-ghkm/ComposedRule.h | 8 +- phrase-extract/extract-ghkm/Exception.h | 4 +- phrase-extract/extract-ghkm/ExtractGHKM.cpp | 95 ++++++++++--------- phrase-extract/extract-ghkm/ExtractGHKM.h | 19 ++-- phrase-extract/extract-ghkm/Main.cpp | 2 +- phrase-extract/extract-ghkm/Node.cpp | 4 +- phrase-extract/extract-ghkm/Node.h | 8 +- phrase-extract/extract-ghkm/Options.h | 4 +- phrase-extract/extract-ghkm/ParseTree.h | 38 -------- .../extract-ghkm/PhraseOrientation.cpp | 4 +- .../extract-ghkm/PhraseOrientation.h | 18 ++-- phrase-extract/extract-ghkm/Rule.cpp | 4 +- phrase-extract/extract-ghkm/Rule.h | 4 +- phrase-extract/extract-ghkm/ScfgRule.cpp | 14 +-- phrase-extract/extract-ghkm/ScfgRule.h | 11 +-- .../extract-ghkm/ScfgRuleWriter.cpp | 12 +-- phrase-extract/extract-ghkm/ScfgRuleWriter.h | 8 +- phrase-extract/extract-ghkm/Span.cpp | 4 +- phrase-extract/extract-ghkm/Span.h | 4 +- phrase-extract/extract-ghkm/StsgRule.cpp | 8 +- phrase-extract/extract-ghkm/StsgRule.h | 8 +- .../extract-ghkm/StsgRuleWriter.cpp | 12 +-- phrase-extract/extract-ghkm/StsgRuleWriter.h | 8 +- phrase-extract/extract-ghkm/Subgraph.cpp | 9 +- phrase-extract/extract-ghkm/Subgraph.h | 8 +- phrase-extract/extract-ghkm/XmlTreeParser.cpp | 29 +++--- phrase-extract/extract-ghkm/XmlTreeParser.h | 18 ++-- 36 files changed, 246 insertions(+), 273 deletions(-) create mode 100644 phrase-extract/SyntaxTree.h delete mode 100644 phrase-extract/extract-ghkm/ParseTree.h diff --git a/moses/FF/PhraseOrientationFeature.cpp b/moses/FF/PhraseOrientationFeature.cpp index 1c9a3f738..fea8dafad 100644 --- a/moses/FF/PhraseOrientationFeature.cpp +++ b/moses/FF/PhraseOrientationFeature.cpp @@ -134,7 +134,7 @@ void PhraseOrientationFeature::EvaluateInIsolation(const Phrase &source, if (targetPhrase.GetAlignNonTerm().GetSize() != 0) { // Initialize phrase orientation scoring object - Moses::GHKM::PhraseOrientation phraseOrientation(source.GetSize(), targetPhrase.GetSize(), + 
MosesTraining::GHKM::PhraseOrientation phraseOrientation(source.GetSize(), targetPhrase.GetSize(), targetPhrase.GetAlignTerm(), targetPhrase.GetAlignNonTerm()); PhraseOrientationFeature::ReoClassData* reoClassData = new PhraseOrientationFeature::ReoClassData(); @@ -150,7 +150,7 @@ void PhraseOrientationFeature::EvaluateInIsolation(const Phrase &source, // LEFT-TO-RIGHT DIRECTION - Moses::GHKM::PhraseOrientation::REO_CLASS l2rOrientation = phraseOrientation.GetOrientationInfo(sourceIndex,sourceIndex,Moses::GHKM::PhraseOrientation::REO_DIR_L2R); + MosesTraining::GHKM::PhraseOrientation::REO_CLASS l2rOrientation = phraseOrientation.GetOrientationInfo(sourceIndex,sourceIndex,MosesTraining::GHKM::PhraseOrientation::REO_DIR_L2R); if ( ((targetIndex == 0) || !phraseOrientation.TargetSpanIsAligned(0,targetIndex)) // boundary non-terminal in rule-initial position (left boundary) && (targetPhraseLHS != m_glueTargetLHS) ) { // and not glue rule @@ -170,7 +170,7 @@ void PhraseOrientationFeature::EvaluateInIsolation(const Phrase &source, if (reoClassData->firstNonTerminalPreviousSourceSpanIsAligned && reoClassData->firstNonTerminalFollowingSourceSpanIsAligned) { // discontinuous - l2rOrientation = Moses::GHKM::PhraseOrientation::REO_CLASS_DLEFT; + l2rOrientation = MosesTraining::GHKM::PhraseOrientation::REO_CLASS_DLEFT; } else { reoClassData->firstNonTerminalIsBoundary = true; } @@ -180,7 +180,7 @@ void PhraseOrientationFeature::EvaluateInIsolation(const Phrase &source, // RIGHT-TO-LEFT DIRECTION - Moses::GHKM::PhraseOrientation::REO_CLASS r2lOrientation = phraseOrientation.GetOrientationInfo(sourceIndex,sourceIndex,Moses::GHKM::PhraseOrientation::REO_DIR_R2L); + MosesTraining::GHKM::PhraseOrientation::REO_CLASS r2lOrientation = phraseOrientation.GetOrientationInfo(sourceIndex,sourceIndex,MosesTraining::GHKM::PhraseOrientation::REO_DIR_R2L); if ( ((targetIndex == targetPhrase.GetSize()-1) || !phraseOrientation.TargetSpanIsAligned(targetIndex,targetPhrase.GetSize()-1)) // boundary non-terminal in rule-final position (right boundary) && (targetPhraseLHS != m_glueTargetLHS) ) { // and not glue rule @@ -200,7 +200,7 @@ void PhraseOrientationFeature::EvaluateInIsolation(const Phrase &source, if (reoClassData->lastNonTerminalPreviousSourceSpanIsAligned && reoClassData->lastNonTerminalFollowingSourceSpanIsAligned) { // discontinuous - r2lOrientation = Moses::GHKM::PhraseOrientation::REO_CLASS_DLEFT; + r2lOrientation = MosesTraining::GHKM::PhraseOrientation::REO_CLASS_DLEFT; } else { reoClassData->lastNonTerminalIsBoundary = true; } @@ -335,25 +335,25 @@ FFState* PhraseOrientationFeature::EvaluateWhenApplied( // LEFT-TO-RIGHT DIRECTION - Moses::GHKM::PhraseOrientation::REO_CLASS l2rOrientation = reoClassData->nonTerminalReoClassL2R[nNT]; + MosesTraining::GHKM::PhraseOrientation::REO_CLASS l2rOrientation = reoClassData->nonTerminalReoClassL2R[nNT]; IFFEATUREVERBOSE(2) { FEATUREVERBOSE(2, "l2rOrientation "); switch (l2rOrientation) { - case Moses::GHKM::PhraseOrientation::REO_CLASS_LEFT: + case MosesTraining::GHKM::PhraseOrientation::REO_CLASS_LEFT: FEATUREVERBOSE2(2, "mono" << std::endl); break; - case Moses::GHKM::PhraseOrientation::REO_CLASS_RIGHT: + case MosesTraining::GHKM::PhraseOrientation::REO_CLASS_RIGHT: FEATUREVERBOSE2(2, "swap" << std::endl); break; - case Moses::GHKM::PhraseOrientation::REO_CLASS_DLEFT: + case MosesTraining::GHKM::PhraseOrientation::REO_CLASS_DLEFT: FEATUREVERBOSE2(2, "dleft" << std::endl); break; - case Moses::GHKM::PhraseOrientation::REO_CLASS_DRIGHT: + case 
MosesTraining::GHKM::PhraseOrientation::REO_CLASS_DRIGHT: FEATUREVERBOSE2(2, "dright" << std::endl); break; - case Moses::GHKM::PhraseOrientation::REO_CLASS_UNKNOWN: - // modelType == Moses::GHKM::PhraseOrientation::REO_MSLR + case MosesTraining::GHKM::PhraseOrientation::REO_CLASS_UNKNOWN: + // modelType == MosesTraining::GHKM::PhraseOrientation::REO_MSLR FEATUREVERBOSE2(2, "unknown->dleft" << std::endl); break; default: @@ -396,23 +396,23 @@ FFState* PhraseOrientationFeature::EvaluateWhenApplied( } else { - if ( l2rOrientation == Moses::GHKM::PhraseOrientation::REO_CLASS_LEFT ) { + if ( l2rOrientation == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_LEFT ) { newScores[0] += TransformScore(orientationPhraseProperty->GetLeftToRightProbabilityMono()); // if sub-derivation has left-boundary non-terminal: // add recursive actual score of boundary non-terminal from subderivation LeftBoundaryL2RScoreRecursive(featureID, prevState, 0x1, newScores, accumulator); - } else if ( l2rOrientation == Moses::GHKM::PhraseOrientation::REO_CLASS_RIGHT ) { + } else if ( l2rOrientation == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_RIGHT ) { newScores[1] += TransformScore(orientationPhraseProperty->GetLeftToRightProbabilitySwap()); // if sub-derivation has left-boundary non-terminal: // add recursive actual score of boundary non-terminal from subderivation LeftBoundaryL2RScoreRecursive(featureID, prevState, 0x2, newScores, accumulator); - } else if ( ( l2rOrientation == Moses::GHKM::PhraseOrientation::REO_CLASS_DLEFT ) || - ( l2rOrientation == Moses::GHKM::PhraseOrientation::REO_CLASS_DRIGHT ) || - ( l2rOrientation == Moses::GHKM::PhraseOrientation::REO_CLASS_UNKNOWN ) ) { + } else if ( ( l2rOrientation == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_DLEFT ) || + ( l2rOrientation == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_DRIGHT ) || + ( l2rOrientation == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_UNKNOWN ) ) { newScores[2] += TransformScore(orientationPhraseProperty->GetLeftToRightProbabilityDiscontinuous()); // if sub-derivation has left-boundary non-terminal: @@ -437,25 +437,25 @@ FFState* PhraseOrientationFeature::EvaluateWhenApplied( // RIGHT-TO-LEFT DIRECTION - Moses::GHKM::PhraseOrientation::REO_CLASS r2lOrientation = reoClassData->nonTerminalReoClassR2L[nNT]; + MosesTraining::GHKM::PhraseOrientation::REO_CLASS r2lOrientation = reoClassData->nonTerminalReoClassR2L[nNT]; IFFEATUREVERBOSE(2) { FEATUREVERBOSE(2, "r2lOrientation "); switch (r2lOrientation) { - case Moses::GHKM::PhraseOrientation::REO_CLASS_LEFT: + case MosesTraining::GHKM::PhraseOrientation::REO_CLASS_LEFT: FEATUREVERBOSE2(2, "mono" << std::endl); break; - case Moses::GHKM::PhraseOrientation::REO_CLASS_RIGHT: + case MosesTraining::GHKM::PhraseOrientation::REO_CLASS_RIGHT: FEATUREVERBOSE2(2, "swap" << std::endl); break; - case Moses::GHKM::PhraseOrientation::REO_CLASS_DLEFT: + case MosesTraining::GHKM::PhraseOrientation::REO_CLASS_DLEFT: FEATUREVERBOSE2(2, "dleft" << std::endl); break; - case Moses::GHKM::PhraseOrientation::REO_CLASS_DRIGHT: + case MosesTraining::GHKM::PhraseOrientation::REO_CLASS_DRIGHT: FEATUREVERBOSE2(2, "dright" << std::endl); break; - case Moses::GHKM::PhraseOrientation::REO_CLASS_UNKNOWN: - // modelType == Moses::GHKM::PhraseOrientation::REO_MSLR + case MosesTraining::GHKM::PhraseOrientation::REO_CLASS_UNKNOWN: + // modelType == MosesTraining::GHKM::PhraseOrientation::REO_MSLR FEATUREVERBOSE2(2, "unknown->dleft" << std::endl); break; default: @@ -498,23 +498,23 @@ FFState* 
PhraseOrientationFeature::EvaluateWhenApplied( } else { - if ( r2lOrientation == Moses::GHKM::PhraseOrientation::REO_CLASS_LEFT ) { + if ( r2lOrientation == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_LEFT ) { newScores[m_offsetR2LScores+0] += TransformScore(orientationPhraseProperty->GetRightToLeftProbabilityMono()); // if sub-derivation has right-boundary non-terminal: // add recursive actual score of boundary non-terminal from subderivation RightBoundaryR2LScoreRecursive(featureID, prevState, 0x1, newScores, accumulator); - } else if ( r2lOrientation == Moses::GHKM::PhraseOrientation::REO_CLASS_RIGHT ) { + } else if ( r2lOrientation == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_RIGHT ) { newScores[m_offsetR2LScores+1] += TransformScore(orientationPhraseProperty->GetRightToLeftProbabilitySwap()); // if sub-derivation has right-boundary non-terminal: // add recursive actual score of boundary non-terminal from subderivation RightBoundaryR2LScoreRecursive(featureID, prevState, 0x2, newScores, accumulator); - } else if ( ( r2lOrientation == Moses::GHKM::PhraseOrientation::REO_CLASS_DLEFT ) || - ( r2lOrientation == Moses::GHKM::PhraseOrientation::REO_CLASS_DRIGHT ) || - ( r2lOrientation == Moses::GHKM::PhraseOrientation::REO_CLASS_UNKNOWN ) ) { + } else if ( ( r2lOrientation == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_DLEFT ) || + ( r2lOrientation == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_DRIGHT ) || + ( r2lOrientation == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_UNKNOWN ) ) { newScores[m_offsetR2LScores+2] += TransformScore(orientationPhraseProperty->GetRightToLeftProbabilityDiscontinuous()); // if sub-derivation has right-boundary non-terminal: @@ -862,17 +862,17 @@ void PhraseOrientationFeature::SparseNonTerminalR2LScore(const Factor* nonTermin } -const std::string* PhraseOrientationFeature::ToString(const Moses::GHKM::PhraseOrientation::REO_CLASS o) const +const std::string* PhraseOrientationFeature::ToString(const MosesTraining::GHKM::PhraseOrientation::REO_CLASS o) const { - if ( o == Moses::GHKM::PhraseOrientation::REO_CLASS_LEFT ) { + if ( o == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_LEFT ) { return &MORIENT; - } else if ( o == Moses::GHKM::PhraseOrientation::REO_CLASS_RIGHT ) { + } else if ( o == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_RIGHT ) { return &SORIENT; - } else if ( ( o == Moses::GHKM::PhraseOrientation::REO_CLASS_DLEFT ) || - ( o == Moses::GHKM::PhraseOrientation::REO_CLASS_DRIGHT ) || - ( o == Moses::GHKM::PhraseOrientation::REO_CLASS_UNKNOWN ) ) { + } else if ( ( o == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_DLEFT ) || + ( o == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_DRIGHT ) || + ( o == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_UNKNOWN ) ) { return &DORIENT; } else { diff --git a/moses/FF/PhraseOrientationFeature.h b/moses/FF/PhraseOrientationFeature.h index 4460a1ea7..7c429dd1c 100644 --- a/moses/FF/PhraseOrientationFeature.h +++ b/moses/FF/PhraseOrientationFeature.h @@ -302,8 +302,8 @@ public: struct ReoClassData { public: - std::vector nonTerminalReoClassL2R; - std::vector nonTerminalReoClassR2L; + std::vector nonTerminalReoClassL2R; + std::vector nonTerminalReoClassR2L; bool firstNonTerminalIsBoundary; bool firstNonTerminalPreviousSourceSpanIsAligned; bool firstNonTerminalFollowingSourceSpanIsAligned; @@ -401,7 +401,7 @@ protected: ScoreComponentCollection* scoreBreakdown, const std::string* o) const; - const std::string* ToString(const Moses::GHKM::PhraseOrientation::REO_CLASS 
o) const; + const std::string* ToString(const MosesTraining::GHKM::PhraseOrientation::REO_CLASS o) const; static const std::string MORIENT; static const std::string SORIENT; diff --git a/phrase-extract/InternalStructFeature.h b/phrase-extract/InternalStructFeature.h index 2ac3ecd9d..66d61c6f9 100644 --- a/phrase-extract/InternalStructFeature.h +++ b/phrase-extract/InternalStructFeature.h @@ -10,10 +10,6 @@ #include "ScoreFeature.h" #include "extract-ghkm/Node.h" -using namespace MosesTraining; -using namespace Moses; -using namespace GHKM; - namespace MosesTraining { diff --git a/phrase-extract/SyntaxTree.h b/phrase-extract/SyntaxTree.h new file mode 100644 index 000000000..c2132fda3 --- /dev/null +++ b/phrase-extract/SyntaxTree.h @@ -0,0 +1,12 @@ +#pragma once + +#include "syntax-common/tree.h" + +#include "SyntaxNode.h" + +namespace MosesTraining +{ + +typedef Syntax::Tree SyntaxTree; + +} // namespace MosesTraining diff --git a/phrase-extract/extract-ghkm/Alignment.cpp b/phrase-extract/extract-ghkm/Alignment.cpp index 7e084e495..6f946fe5a 100644 --- a/phrase-extract/extract-ghkm/Alignment.cpp +++ b/phrase-extract/extract-ghkm/Alignment.cpp @@ -25,7 +25,7 @@ #include #include -namespace Moses +namespace MosesTraining { namespace GHKM { @@ -70,4 +70,4 @@ void FlipAlignment(Alignment &a) } } // namespace GHKM -} // namespace Moses +} // namespace MosesTraining diff --git a/phrase-extract/extract-ghkm/Alignment.h b/phrase-extract/extract-ghkm/Alignment.h index e8381a602..154e1fc4f 100644 --- a/phrase-extract/extract-ghkm/Alignment.h +++ b/phrase-extract/extract-ghkm/Alignment.h @@ -23,7 +23,7 @@ #include #include -namespace Moses +namespace MosesTraining { namespace GHKM { @@ -35,5 +35,5 @@ void ReadAlignment(const std::string &, Alignment &); void FlipAlignment(Alignment &); } // namespace GHKM -} // namespace Moses +} // namespace MosesTraining diff --git a/phrase-extract/extract-ghkm/AlignmentGraph.cpp b/phrase-extract/extract-ghkm/AlignmentGraph.cpp index 52a4b41db..3fa65656c 100644 --- a/phrase-extract/extract-ghkm/AlignmentGraph.cpp +++ b/phrase-extract/extract-ghkm/AlignmentGraph.cpp @@ -19,23 +19,24 @@ #include "AlignmentGraph.h" -#include "ComposedRule.h" -#include "Node.h" -#include "Options.h" -#include "ParseTree.h" -#include "Subgraph.h" - #include #include #include #include -namespace Moses +#include "SyntaxTree.h" + +#include "ComposedRule.h" +#include "Node.h" +#include "Options.h" +#include "Subgraph.h" + +namespace MosesTraining { namespace GHKM { -AlignmentGraph::AlignmentGraph(const ParseTree *t, +AlignmentGraph::AlignmentGraph(const SyntaxTree *t, const std::vector &s, const Alignment &a) { @@ -208,7 +209,7 @@ void AlignmentGraph::ExtractComposedRules(Node *node, const Options &options) } } -Node *AlignmentGraph::CopyParseTree(const ParseTree *root) +Node *AlignmentGraph::CopyParseTree(const SyntaxTree *root) { NodeType nodeType = (root->IsLeaf()) ? 
TARGET : TREE; @@ -218,10 +219,10 @@ Node *AlignmentGraph::CopyParseTree(const ParseTree *root) n->SetPcfgScore(root->value().GetPcfgScore()); } - const std::vector &children = root->children(); + const std::vector &children = root->children(); std::vector childNodes; childNodes.reserve(children.size()); - for (std::vector::const_iterator p(children.begin()); + for (std::vector::const_iterator p(children.begin()); p != children.end(); ++p) { Node *child = CopyParseTree(*p); child->AddParent(n.get()); @@ -385,4 +386,4 @@ Node *AlignmentGraph::DetermineAttachmentPoint(int index) } } // namespace GHKM -} // namespace Moses +} // namespace MosesTraining diff --git a/phrase-extract/extract-ghkm/AlignmentGraph.h b/phrase-extract/extract-ghkm/AlignmentGraph.h index 7ae3784cd..032b946f0 100644 --- a/phrase-extract/extract-ghkm/AlignmentGraph.h +++ b/phrase-extract/extract-ghkm/AlignmentGraph.h @@ -21,15 +21,16 @@ #ifndef EXTRACT_GHKM_ALIGNMENT_GRAPH_H_ #define EXTRACT_GHKM_ALIGNMENT_GRAPH_H_ -#include "Alignment.h" -#include "Options.h" -#include "ParseTree.h" - #include #include #include -namespace Moses +#include "SyntaxTree.h" + +#include "Alignment.h" +#include "Options.h" + +namespace MosesTraining { namespace GHKM { @@ -40,7 +41,7 @@ class Subgraph; class AlignmentGraph { public: - AlignmentGraph(const ParseTree *, + AlignmentGraph(const SyntaxTree *, const std::vector &, const Alignment &); @@ -61,7 +62,7 @@ private: AlignmentGraph(const AlignmentGraph &); AlignmentGraph &operator=(const AlignmentGraph &); - Node *CopyParseTree(const ParseTree *); + Node *CopyParseTree(const SyntaxTree *); void ComputeFrontierSet(Node *, const Options &, std::set &) const; void CalcComplementSpans(Node *); void GetTargetTreeLeaves(Node *, std::vector &); @@ -77,6 +78,6 @@ private: }; } // namespace GHKM -} // namespace Moses +} // namespace MosesTraining #endif diff --git a/phrase-extract/extract-ghkm/ComposedRule.cpp b/phrase-extract/extract-ghkm/ComposedRule.cpp index e9fc826b7..d322a255f 100644 --- a/phrase-extract/extract-ghkm/ComposedRule.cpp +++ b/phrase-extract/extract-ghkm/ComposedRule.cpp @@ -19,15 +19,15 @@ #include "ComposedRule.h" -#include "Node.h" -#include "Options.h" -#include "Subgraph.h" - #include #include #include -namespace Moses +#include "Node.h" +#include "Options.h" +#include "Subgraph.h" + +namespace MosesTraining { namespace GHKM { @@ -128,4 +128,4 @@ Subgraph ComposedRule::CreateSubgraph() } } // namespace GHKM -} // namespace Moses +} // namespace MosesTraining diff --git a/phrase-extract/extract-ghkm/ComposedRule.h b/phrase-extract/extract-ghkm/ComposedRule.h index b5f72a492..d456fd27c 100644 --- a/phrase-extract/extract-ghkm/ComposedRule.h +++ b/phrase-extract/extract-ghkm/ComposedRule.h @@ -21,12 +21,12 @@ #ifndef EXTRACT_GHKM_COMPOSED_RULE_H_ #define EXTRACT_GHKM_COMPOSED_RULE_H_ -#include "Subgraph.h" - #include #include -namespace Moses +#include "Subgraph.h" + +namespace MosesTraining { namespace GHKM { @@ -67,6 +67,6 @@ private: }; } // namespace GHKM -} // namespace Moses +} // namespace MosesTraining #endif diff --git a/phrase-extract/extract-ghkm/Exception.h b/phrase-extract/extract-ghkm/Exception.h index a1e623cd1..99e1067f4 100644 --- a/phrase-extract/extract-ghkm/Exception.h +++ b/phrase-extract/extract-ghkm/Exception.h @@ -23,7 +23,7 @@ #include -namespace Moses +namespace MosesTraining { namespace GHKM { @@ -41,6 +41,6 @@ private: }; } // namespace GHKM -} // namespace Moses +} // namespace MosesTraining #endif diff --git 
a/phrase-extract/extract-ghkm/ExtractGHKM.cpp b/phrase-extract/extract-ghkm/ExtractGHKM.cpp index 7891bc2c7..0c7dadd4d 100644 --- a/phrase-extract/extract-ghkm/ExtractGHKM.cpp +++ b/phrase-extract/extract-ghkm/ExtractGHKM.cpp @@ -19,29 +19,6 @@ #include "ExtractGHKM.h" -#include "Alignment.h" -#include "AlignmentGraph.h" -#include "Exception.h" -#include "InputFileStream.h" -#include "Node.h" -#include "OutputFileStream.h" -#include "Options.h" -#include "ParseTree.h" -#include "PhraseOrientation.h" -#include "ScfgRule.h" -#include "ScfgRuleWriter.h" -#include "Span.h" -#include "StsgRule.h" -#include "StsgRuleWriter.h" -#include "SyntaxNode.h" -#include "SyntaxNodeCollection.h" -#include "tables-core.h" -#include "XmlException.h" -#include "XmlTree.h" -#include "XmlTreeParser.h" - -#include - #include #include #include @@ -51,13 +28,40 @@ #include #include -namespace Moses +#include + +#include "InputFileStream.h" +#include "OutputFileStream.h" +#include "SyntaxNode.h" +#include "SyntaxNodeCollection.h" +#include "SyntaxTree.h" +#include "tables-core.h" +#include "XmlException.h" +#include "XmlTree.h" + +#include "Alignment.h" +#include "AlignmentGraph.h" +#include "Exception.h" +#include "Node.h" +#include "Options.h" +#include "PhraseOrientation.h" +#include "ScfgRule.h" +#include "ScfgRuleWriter.h" +#include "Span.h" +#include "StsgRule.h" +#include "StsgRuleWriter.h" +#include "XmlTreeParser.h" + +namespace MosesTraining { namespace GHKM { int ExtractGHKM::Main(int argc, char *argv[]) { + using Moses::InputFileStream; + using Moses::OutputFileStream; + // Process command-line options. Options options; ProcessOptions(argc, argv, options); @@ -158,7 +162,7 @@ int ExtractGHKM::Main(int argc, char *argv[]) std::cerr << "skipping line " << lineNum << " with empty target tree\n"; continue; } - std::auto_ptr targetParseTree; + std::auto_ptr targetParseTree; try { targetParseTree = targetXmlTreeParser.Parse(targetLine); assert(targetParseTree.get()); @@ -173,8 +177,8 @@ int ExtractGHKM::Main(int argc, char *argv[]) // Parse source tree and construct a SyntaxTree object. - MosesTraining::SyntaxNodeCollection sourceSyntaxTree; - MosesTraining::SyntaxNode *sourceSyntaxTreeRoot=NULL; + SyntaxNodeCollection sourceSyntaxTree; + SyntaxNode *sourceSyntaxTreeRoot=NULL; if (options.sourceLabels) { try { @@ -197,8 +201,9 @@ int ExtractGHKM::Main(int argc, char *argv[]) // Read source tokens. std::vector sourceTokens(ReadTokens(sourceLine)); - // Construct a source ParseTree object from the SyntaxNodeCollection object. - std::auto_ptr sourceParseTree; + // Construct a source SyntaxTree object from the SyntaxNodeCollection + // object. 
+ std::auto_ptr sourceParseTree; if (options.sourceLabels) { try { @@ -264,12 +269,12 @@ int ExtractGHKM::Main(int argc, char *argv[]) const std::vector &rules = (*p)->GetRules(); - Moses::GHKM::PhraseOrientation::REO_CLASS l2rOrientation=Moses::GHKM::PhraseOrientation::REO_CLASS_UNKNOWN, r2lOrientation=Moses::GHKM::PhraseOrientation::REO_CLASS_UNKNOWN; + PhraseOrientation::REO_CLASS l2rOrientation=PhraseOrientation::REO_CLASS_UNKNOWN, r2lOrientation=PhraseOrientation::REO_CLASS_UNKNOWN; if (options.phraseOrientation && !rules.empty()) { int sourceSpanBegin = *((*p)->GetSpan().begin()); int sourceSpanEnd = *((*p)->GetSpan().rbegin()); - l2rOrientation = phraseOrientation.GetOrientationInfo(sourceSpanBegin,sourceSpanEnd,Moses::GHKM::PhraseOrientation::REO_DIR_L2R); - r2lOrientation = phraseOrientation.GetOrientationInfo(sourceSpanBegin,sourceSpanEnd,Moses::GHKM::PhraseOrientation::REO_DIR_R2L); + l2rOrientation = phraseOrientation.GetOrientationInfo(sourceSpanBegin,sourceSpanEnd,PhraseOrientation::REO_DIR_L2R); + r2lOrientation = phraseOrientation.GetOrientationInfo(sourceSpanBegin,sourceSpanEnd,PhraseOrientation::REO_DIR_R2L); // std::cerr << "span " << sourceSpanBegin << " " << sourceSpanEnd << std::endl; // std::cerr << "phraseOrientation " << phraseOrientation.GetOrientationInfo(sourceSpanBegin,sourceSpanEnd) << std::endl; } @@ -310,8 +315,8 @@ int ExtractGHKM::Main(int argc, char *argv[]) fwdExtractStream << " "; phraseOrientation.WriteOrientation(fwdExtractStream,r2lOrientation); fwdExtractStream << "}}"; - phraseOrientation.IncrementPriorCount(Moses::GHKM::PhraseOrientation::REO_DIR_L2R,l2rOrientation,1); - phraseOrientation.IncrementPriorCount(Moses::GHKM::PhraseOrientation::REO_DIR_R2L,r2lOrientation,1); + phraseOrientation.IncrementPriorCount(PhraseOrientation::REO_DIR_L2R,l2rOrientation,1); + phraseOrientation.IncrementPriorCount(PhraseOrientation::REO_DIR_R2L,r2lOrientation,1); } fwdExtractStream << std::endl; invExtractStream << std::endl; @@ -400,7 +405,7 @@ void ExtractGHKM::OpenOutputFileOrDie(const std::string &filename, } void ExtractGHKM::OpenOutputFileOrDie(const std::string &filename, - OutputFileStream &stream) + Moses::OutputFileStream &stream) { bool ret = stream.Open(filename); if (!ret) { @@ -823,16 +828,16 @@ void ExtractGHKM::WriteSourceLabelSet( } void ExtractGHKM::CollectWordLabelCounts( - ParseTree &root, + SyntaxTree &root, const Options &options, std::map &wordCount, std::map &wordLabel) { - for (ParseTree::ConstLeafIterator p(root); - p != ParseTree::ConstLeafIterator(); ++p) { - const ParseTree &leaf = *p; + for (SyntaxTree::ConstLeafIterator p(root); + p != SyntaxTree::ConstLeafIterator(); ++p) { + const SyntaxTree &leaf = *p; const std::string &word = leaf.value().GetLabel(); - const ParseTree *ancestor = leaf.parent(); + const SyntaxTree *ancestor = leaf.parent(); // If unary rule elimination is enabled and this word is at the end of a // chain of unary rewrites, e.g. 
// PN-SB -> NE -> word @@ -849,12 +854,12 @@ void ExtractGHKM::CollectWordLabelCounts( } } -std::vector ExtractGHKM::ReadTokens(const ParseTree &root) const +std::vector ExtractGHKM::ReadTokens(const SyntaxTree &root) const { std::vector tokens; - for (ParseTree::ConstLeafIterator p(root); - p != ParseTree::ConstLeafIterator(); ++p) { - const ParseTree &leaf = *p; + for (SyntaxTree::ConstLeafIterator p(root); + p != SyntaxTree::ConstLeafIterator(); ++p) { + const SyntaxTree &leaf = *p; const std::string &word = leaf.value().GetLabel(); tokens.push_back(word); } @@ -956,4 +961,4 @@ void ExtractGHKM::StripBitParLabels( } } // namespace GHKM -} // namespace Moses +} // namespace MosesTraining diff --git a/phrase-extract/extract-ghkm/ExtractGHKM.h b/phrase-extract/extract-ghkm/ExtractGHKM.h index 5954e7425..66c4c55f8 100644 --- a/phrase-extract/extract-ghkm/ExtractGHKM.h +++ b/phrase-extract/extract-ghkm/ExtractGHKM.h @@ -25,13 +25,11 @@ #include #include -#include "ParseTree.h" +#include "OutputFileStream.h" +#include "SyntaxTree.h" -namespace Moses +namespace MosesTraining { - -class OutputFileStream; - namespace GHKM { @@ -52,9 +50,9 @@ private: void Error(const std::string &) const; void OpenInputFileOrDie(const std::string &, std::ifstream &); void OpenOutputFileOrDie(const std::string &, std::ofstream &); - void OpenOutputFileOrDie(const std::string &, OutputFileStream &); - void RecordTreeLabels(const ParseTree &, std::set &); - void CollectWordLabelCounts(ParseTree &, + void OpenOutputFileOrDie(const std::string &, Moses::OutputFileStream &); + void RecordTreeLabels(const SyntaxTree &, std::set &); + void CollectWordLabelCounts(SyntaxTree &, const Options &, std::map &, std::map &); @@ -78,7 +76,7 @@ private: std::map &outTopLabelSet) const; std::vector ReadTokens(const std::string &) const; - std::vector ReadTokens(const ParseTree &root) const; + std::vector ReadTokens(const SyntaxTree &root) const; void ProcessOptions(int, char *[], Options &) const; @@ -86,5 +84,4 @@ private: }; } // namespace GHKM -} // namespace Moses - +} // namespace MosesTraining diff --git a/phrase-extract/extract-ghkm/Main.cpp b/phrase-extract/extract-ghkm/Main.cpp index 14064406b..64b3e0f00 100644 --- a/phrase-extract/extract-ghkm/Main.cpp +++ b/phrase-extract/extract-ghkm/Main.cpp @@ -21,6 +21,6 @@ int main(int argc, char *argv[]) { - Moses::GHKM::ExtractGHKM tool; + MosesTraining::GHKM::ExtractGHKM tool; return tool.Main(argc, argv); } diff --git a/phrase-extract/extract-ghkm/Node.cpp b/phrase-extract/extract-ghkm/Node.cpp index e14d8c050..384db3306 100644 --- a/phrase-extract/extract-ghkm/Node.cpp +++ b/phrase-extract/extract-ghkm/Node.cpp @@ -21,7 +21,7 @@ #include "Subgraph.h" -namespace Moses +namespace MosesTraining { namespace GHKM { @@ -70,4 +70,4 @@ void Node::GetTargetWords(std::vector &targetWords) const } } // namespace GHKM -} // namespace Moses +} // namespace MosesTraining diff --git a/phrase-extract/extract-ghkm/Node.h b/phrase-extract/extract-ghkm/Node.h index 2eed01311..71a24b28e 100644 --- a/phrase-extract/extract-ghkm/Node.h +++ b/phrase-extract/extract-ghkm/Node.h @@ -21,14 +21,14 @@ #ifndef EXTRACT_GHKM_NODE_H_ #define EXTRACT_GHKM_NODE_H_ -#include "Span.h" - #include #include #include #include -namespace Moses +#include "Span.h" + +namespace MosesTraining { namespace GHKM { @@ -215,6 +215,6 @@ Node *Node::LowestCommonAncestor(InputIterator first, InputIterator last) } } // namespace GHKM -} // namespace Moses +} // namespace MosesTraining #endif diff --git 
a/phrase-extract/extract-ghkm/Options.h b/phrase-extract/extract-ghkm/Options.h index 00d59f9c7..f694fb55c 100644 --- a/phrase-extract/extract-ghkm/Options.h +++ b/phrase-extract/extract-ghkm/Options.h @@ -21,7 +21,7 @@ #include -namespace Moses +namespace MosesTraining { namespace GHKM { @@ -89,5 +89,5 @@ public: }; } // namespace GHKM -} // namespace Moses +} // namespace MosesTraining diff --git a/phrase-extract/extract-ghkm/ParseTree.h b/phrase-extract/extract-ghkm/ParseTree.h deleted file mode 100644 index f0b83f63f..000000000 --- a/phrase-extract/extract-ghkm/ParseTree.h +++ /dev/null @@ -1,38 +0,0 @@ -/*********************************************************************** - Moses - statistical machine translation system - Copyright (C) 2006-2011 University of Edinburgh - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, write to the Free Software - Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -***********************************************************************/ - -#pragma once -#ifndef EXTRACT_GHKM_PARSE_TREE_H_ -#define EXTRACT_GHKM_PARSE_TREE_H_ - -#include "syntax-common/tree.h" - -#include "SyntaxNode.h" - -namespace Moses -{ -namespace GHKM -{ - -typedef MosesTraining::Syntax::Tree ParseTree; - -} // namespace GHKM -} // namespace Moses - -#endif diff --git a/phrase-extract/extract-ghkm/PhraseOrientation.cpp b/phrase-extract/extract-ghkm/PhraseOrientation.cpp index 8f1356cb3..57952d580 100644 --- a/phrase-extract/extract-ghkm/PhraseOrientation.cpp +++ b/phrase-extract/extract-ghkm/PhraseOrientation.cpp @@ -26,7 +26,7 @@ #include -namespace Moses +namespace MosesTraining { namespace GHKM { @@ -469,5 +469,5 @@ void PhraseOrientation::WritePriorCounts(std::ostream& out, const REO_MODEL_TYPE } } // namespace GHKM -} // namespace Moses +} // namespace MosesTraining diff --git a/phrase-extract/extract-ghkm/PhraseOrientation.h b/phrase-extract/extract-ghkm/PhraseOrientation.h index d826c127c..572124e61 100644 --- a/phrase-extract/extract-ghkm/PhraseOrientation.h +++ b/phrase-extract/extract-ghkm/PhraseOrientation.h @@ -1,4 +1,3 @@ - /*********************************************************************** Moses - statistical machine translation system Copyright (C) 2006-2011 University of Edinburgh @@ -20,16 +19,18 @@ #pragma once -#include "Alignment.h" -#include "moses/AlignmentInfo.h" - #include #include #include #include + #include -namespace Moses +#include "moses/AlignmentInfo.h" + +#include "Alignment.h" + +namespace MosesTraining { namespace GHKM { @@ -53,8 +54,8 @@ public: PhraseOrientation(int sourceSize, int targetSize, - const AlignmentInfo &alignTerm, - const AlignmentInfo &alignNonTerm); + const Moses::AlignmentInfo &alignTerm, + const Moses::AlignmentInfo &alignNonTerm); REO_CLASS GetOrientationInfo(int startF, int endF, REO_DIR direction) const; REO_CLASS GetOrientationInfo(int startF, int startE, int endF, int endE, REO_DIR direction) const; @@ -119,5 +120,4 @@ private: }; } 
// namespace GHKM -} // namespace Moses - +} // namespace MosesTraining diff --git a/phrase-extract/extract-ghkm/Rule.cpp b/phrase-extract/extract-ghkm/Rule.cpp index da6b2ff23..1b7207c3c 100644 --- a/phrase-extract/extract-ghkm/Rule.cpp +++ b/phrase-extract/extract-ghkm/Rule.cpp @@ -3,7 +3,7 @@ #include "Node.h" #include "Subgraph.h" -namespace Moses +namespace MosesTraining { namespace GHKM { @@ -38,4 +38,4 @@ bool Rule::PartitionOrderComp(const Node *a, const Node *b) } } // namespace GHKM -} // namespace Moses +} // namespace MosesTraining diff --git a/phrase-extract/extract-ghkm/Rule.h b/phrase-extract/extract-ghkm/Rule.h index 36e24c799..b87934735 100644 --- a/phrase-extract/extract-ghkm/Rule.h +++ b/phrase-extract/extract-ghkm/Rule.h @@ -7,7 +7,7 @@ #include "Alignment.h" -namespace Moses +namespace MosesTraining { namespace GHKM { @@ -54,6 +54,6 @@ protected: }; } // namespace GHKM -} // namespace Moses +} // namespace MosesTraining #endif diff --git a/phrase-extract/extract-ghkm/ScfgRule.cpp b/phrase-extract/extract-ghkm/ScfgRule.cpp index 918c88eeb..fc960b598 100644 --- a/phrase-extract/extract-ghkm/ScfgRule.cpp +++ b/phrase-extract/extract-ghkm/ScfgRule.cpp @@ -26,13 +26,13 @@ #include "SyntaxNode.h" #include "SyntaxNodeCollection.h" -namespace Moses +namespace MosesTraining { namespace GHKM { ScfgRule::ScfgRule(const Subgraph &fragment, - const MosesTraining::SyntaxNodeCollection *sourceSyntaxTree) + const SyntaxNodeCollection *sourceSyntaxTree) : m_graphFragment(fragment) , m_sourceLHS("X", NonTerminal) , m_targetLHS(fragment.GetRoot()->GetLabel(), NonTerminal) @@ -134,13 +134,13 @@ ScfgRule::ScfgRule(const Subgraph &fragment, } } -void ScfgRule::PushSourceLabel( - const MosesTraining::SyntaxNodeCollection *sourceSyntaxTree, - const Node *node, const std::string &nonMatchingLabel) +void ScfgRule::PushSourceLabel(const SyntaxNodeCollection *sourceSyntaxTree, + const Node *node, + const std::string &nonMatchingLabel) { ContiguousSpan span = Closure(node->GetSpan()); if (sourceSyntaxTree->HasNode(span.first,span.second)) { // does a source constituent match the span? 
- std::vector sourceLabels = + std::vector sourceLabels = sourceSyntaxTree->GetNodes(span.first,span.second); if (!sourceLabels.empty()) { // store the topmost matching label from the source syntax tree @@ -197,4 +197,4 @@ void ScfgRule::UpdateSourceLabelCoocCounts(std::map< std::string, std::map #include #include @@ -30,7 +26,11 @@ #include #include -namespace Moses +#include "Alignment.h" +#include "Options.h" +#include "ScfgRule.h" + +namespace MosesTraining { namespace GHKM { @@ -229,4 +229,4 @@ void ScfgRuleWriter::WriteSymbol(const Symbol &symbol, std::ostream &out) } } // namespace GHKM -} // namespace Moses +} // namespace MosesTraining diff --git a/phrase-extract/extract-ghkm/ScfgRuleWriter.h b/phrase-extract/extract-ghkm/ScfgRuleWriter.h index 41ef9a6d2..31358c57d 100644 --- a/phrase-extract/extract-ghkm/ScfgRuleWriter.h +++ b/phrase-extract/extract-ghkm/ScfgRuleWriter.h @@ -19,11 +19,11 @@ #pragma once -#include "Subgraph.h" - #include -namespace Moses +#include "Subgraph.h" + +namespace MosesTraining { namespace GHKM { @@ -57,5 +57,5 @@ private: }; } // namespace GHKM -} // namespace Moses +} // namespace MosesTraining diff --git a/phrase-extract/extract-ghkm/Span.cpp b/phrase-extract/extract-ghkm/Span.cpp index d637ec3d2..f6636cebb 100644 --- a/phrase-extract/extract-ghkm/Span.cpp +++ b/phrase-extract/extract-ghkm/Span.cpp @@ -19,7 +19,7 @@ #include "Span.h" -namespace Moses +namespace MosesTraining { namespace GHKM { @@ -45,4 +45,4 @@ ContiguousSpan Closure(const Span &s) } } // namespace GHKM -} // namespace Moses +} // namespace MosesTraining diff --git a/phrase-extract/extract-ghkm/Span.h b/phrase-extract/extract-ghkm/Span.h index c4d146c4e..90bed416a 100644 --- a/phrase-extract/extract-ghkm/Span.h +++ b/phrase-extract/extract-ghkm/Span.h @@ -24,7 +24,7 @@ #include #include -namespace Moses +namespace MosesTraining { namespace GHKM { @@ -36,7 +36,7 @@ bool SpansIntersect(const Span &, const ContiguousSpan &); ContiguousSpan Closure(const Span &); -} // namespace Moses +} // namespace MosesTraining } // namespace GHKM #endif diff --git a/phrase-extract/extract-ghkm/StsgRule.cpp b/phrase-extract/extract-ghkm/StsgRule.cpp index 271249e1b..10368e4c0 100644 --- a/phrase-extract/extract-ghkm/StsgRule.cpp +++ b/phrase-extract/extract-ghkm/StsgRule.cpp @@ -1,11 +1,11 @@ #include "StsgRule.h" +#include + #include "Node.h" #include "Subgraph.h" -#include - -namespace Moses +namespace MosesTraining { namespace GHKM { @@ -91,4 +91,4 @@ StsgRule::StsgRule(const Subgraph &fragment) } } // namespace GHKM -} // namespace Moses +} // namespace MosesTraining diff --git a/phrase-extract/extract-ghkm/StsgRule.h b/phrase-extract/extract-ghkm/StsgRule.h index b14695c5c..a037a8d91 100644 --- a/phrase-extract/extract-ghkm/StsgRule.h +++ b/phrase-extract/extract-ghkm/StsgRule.h @@ -2,12 +2,12 @@ #ifndef EXTRACT_GHKM_STSG_RULE_H_ #define EXTRACT_GHKM_STSG_RULE_H_ +#include + #include "Rule.h" #include "Subgraph.h" -#include - -namespace Moses +namespace MosesTraining { namespace GHKM { @@ -39,6 +39,6 @@ private: }; } // namespace GHKM -} // namespace Moses +} // namespace MosesTraining #endif diff --git a/phrase-extract/extract-ghkm/StsgRuleWriter.cpp b/phrase-extract/extract-ghkm/StsgRuleWriter.cpp index a9596b65c..32953bf68 100644 --- a/phrase-extract/extract-ghkm/StsgRuleWriter.cpp +++ b/phrase-extract/extract-ghkm/StsgRuleWriter.cpp @@ -1,9 +1,5 @@ #include "StsgRuleWriter.h" -#include "Alignment.h" -#include "Options.h" -#include "StsgRule.h" - #include #include #include @@ -11,7 +7,11 @@ 
#include #include -namespace Moses +#include "Alignment.h" +#include "Options.h" +#include "StsgRule.h" + +namespace MosesTraining { namespace GHKM { @@ -92,4 +92,4 @@ void StsgRuleWriter::Write(const StsgRule &rule) } } // namespace GHKM -} // namespace Moses +} // namespace MosesTraining diff --git a/phrase-extract/extract-ghkm/StsgRuleWriter.h b/phrase-extract/extract-ghkm/StsgRuleWriter.h index efba44d2c..3f215a5c9 100644 --- a/phrase-extract/extract-ghkm/StsgRuleWriter.h +++ b/phrase-extract/extract-ghkm/StsgRuleWriter.h @@ -2,11 +2,11 @@ #ifndef EXTRACT_GHKM_STSG_RULE_WRITER_H_ #define EXTRACT_GHKM_STSG_RULE_WRITER_H_ -#include "Subgraph.h" - #include -namespace Moses +#include "Subgraph.h" + +namespace MosesTraining { namespace GHKM { @@ -36,6 +36,6 @@ private: }; } // namespace GHKM -} // namespace Moses +} // namespace MosesTraining #endif diff --git a/phrase-extract/extract-ghkm/Subgraph.cpp b/phrase-extract/extract-ghkm/Subgraph.cpp index 0d673edca..f04c6982c 100644 --- a/phrase-extract/extract-ghkm/Subgraph.cpp +++ b/phrase-extract/extract-ghkm/Subgraph.cpp @@ -18,10 +18,11 @@ ***********************************************************************/ #include -#include "Subgraph.h" -#include "Node.h" -namespace Moses +#include "Node.h" +#include "Subgraph.h" + +namespace MosesTraining { namespace GHKM { @@ -193,5 +194,5 @@ void Subgraph::RecursivelyGetPartsOfSpeech(const Node *n, std::vector #include -namespace Moses +#include "Node.h" + +namespace MosesTraining { namespace GHKM { @@ -137,5 +137,5 @@ private: }; } // namespace GHKM -} // namespace Moses +} // namespace MosesTraining diff --git a/phrase-extract/extract-ghkm/XmlTreeParser.cpp b/phrase-extract/extract-ghkm/XmlTreeParser.cpp index 671b03a78..bbf20c765 100644 --- a/phrase-extract/extract-ghkm/XmlTreeParser.cpp +++ b/phrase-extract/extract-ghkm/XmlTreeParser.cpp @@ -19,18 +19,17 @@ #include "XmlTreeParser.h" -#include "ParseTree.h" -#include "tables-core.h" -#include "XmlException.h" -#include "XmlTree.h" -#include "util/tokenize.hh" - #include #include -using namespace MosesTraining; +#include "util/tokenize.hh" -namespace Moses +#include "SyntaxTree.h" +#include "tables-core.h" +#include "XmlException.h" +#include "XmlTree.h" + +namespace MosesTraining { namespace GHKM { @@ -42,7 +41,7 @@ XmlTreeParser::XmlTreeParser(std::set &labelSet, { } -std::auto_ptr XmlTreeParser::Parse(const std::string &line) +std::auto_ptr XmlTreeParser::Parse(const std::string &line) { m_line = line; m_tree.Clear(); @@ -61,12 +60,12 @@ std::auto_ptr XmlTreeParser::Parse(const std::string &line) return ConvertTree(*root, m_words); } -// Converts a SyntaxNode tree to a Moses::GHKM::ParseTree. -std::auto_ptr XmlTreeParser::ConvertTree( +// Converts a SyntaxNode tree to a MosesTraining::GHKM::SyntaxTree. 
+std::auto_ptr XmlTreeParser::ConvertTree( const SyntaxNode &tree, const std::vector &words) { - std::auto_ptr root(new ParseTree(tree)); + std::auto_ptr root(new SyntaxTree(tree)); const std::vector &children = tree.GetChildren(); if (children.empty()) { if (tree.GetStart() != tree.GetEnd()) { @@ -76,14 +75,14 @@ std::auto_ptr XmlTreeParser::ConvertTree( throw Exception(msg.str()); } SyntaxNode value(tree.GetStart(), tree.GetStart(), words[tree.GetStart()]); - std::auto_ptr leaf(new ParseTree(value)); + std::auto_ptr leaf(new SyntaxTree(value)); leaf->parent() = root.get(); root->children().push_back(leaf.release()); } else { for (std::vector::const_iterator p = children.begin(); p != children.end(); ++p) { assert(*p); - std::auto_ptr child = ConvertTree(**p, words); + std::auto_ptr child = ConvertTree(**p, words); child->parent() = root.get(); root->children().push_back(child.release()); } @@ -92,4 +91,4 @@ std::auto_ptr XmlTreeParser::ConvertTree( } } // namespace GHKM -} // namespace Moses +} // namespace MosesTraining diff --git a/phrase-extract/extract-ghkm/XmlTreeParser.h b/phrase-extract/extract-ghkm/XmlTreeParser.h index a82862428..4e89e7167 100644 --- a/phrase-extract/extract-ghkm/XmlTreeParser.h +++ b/phrase-extract/extract-ghkm/XmlTreeParser.h @@ -21,32 +21,32 @@ #ifndef EXTRACT_GHKM_XML_TREE_PARSER_H_ #define EXTRACT_GHKM_XML_TREE_PARSER_H_ -#include "Exception.h" - #include #include #include #include #include -#include "ParseTree.h" #include "SyntaxNode.h" #include "SyntaxNodeCollection.h" +#include "SyntaxTree.h" -namespace Moses +#include "Exception.h" + +namespace MosesTraining { namespace GHKM { -// Parses a string in Moses' XML parse tree format and returns a ParseTree +// Parses a string in Moses' XML parse tree format and returns a SyntaxTree // object. class XmlTreeParser { public: XmlTreeParser(std::set &, std::map &); - std::auto_ptr Parse(const std::string &); + std::auto_ptr Parse(const std::string &); - static std::auto_ptr ConvertTree(const MosesTraining::SyntaxNode &, + static std::auto_ptr ConvertTree(const SyntaxNode &, const std::vector &); const std::vector& GetWords() { @@ -58,11 +58,11 @@ private: std::set &m_labelSet; std::map &m_topLabelSet; std::string m_line; - MosesTraining::SyntaxNodeCollection m_tree; + SyntaxNodeCollection m_tree; std::vector m_words; }; } // namespace GHKM -} // namespace Moses +} // namespace MosesTraining #endif From 8a9505d72fcf61a32e16397e46e04acc3561027b Mon Sep 17 00:00:00 2001 From: Phil Williams Date: Mon, 1 Jun 2015 16:54:12 +0100 Subject: [PATCH 028/108] Ongoing moses/phrase-extract refactoring --- phrase-extract/SyntaxNodeCollection.cpp | 61 +++++++++++++++++++ phrase-extract/SyntaxNodeCollection.h | 6 ++ phrase-extract/extract-ghkm/XmlTreeParser.cpp | 35 +++++++++-- phrase-extract/extract-ghkm/XmlTreeParser.h | 3 +- 4 files changed, 100 insertions(+), 5 deletions(-) diff --git a/phrase-extract/SyntaxNodeCollection.cpp b/phrase-extract/SyntaxNodeCollection.cpp index 099a5697f..f67bee587 100644 --- a/phrase-extract/SyntaxNodeCollection.cpp +++ b/phrase-extract/SyntaxNodeCollection.cpp @@ -23,6 +23,8 @@ #include #include +#include + namespace MosesTraining { @@ -152,4 +154,63 @@ void SyntaxNodeCollection::ConnectNodes() } } +//boost::shared_ptr SyntaxNodeCollection::ExtractTree() +std::auto_ptr SyntaxNodeCollection::ExtractTree() +{ + std::map nodeToTree; + + // Create a SyntaxTree object for each SyntaxNode. 
+ for (std::vector::const_iterator p = m_nodes.begin(); + p != m_nodes.end(); ++p) { + nodeToTree[*p] = new SyntaxTree(**p); + } + + // Connect the SyntaxTrees. + typedef SyntaxTreeIndex2::const_reverse_iterator InnerIterator; + + SyntaxTree *root = 0; + SyntaxNode *prevNode = 0; + SyntaxTree *prevTree = 0; + // Iterate over all start indices from lowest to highest. + for (SyntaxTreeIndexIterator p = m_index.begin(); p != m_index.end(); ++p) { + const SyntaxTreeIndex2 &inner = p->second; + // Iterate over all end indices from highest to lowest. + for (InnerIterator q = inner.rbegin(); q != inner.rend(); ++q) { + const std::vector &nodes = q->second; + // Iterate over all nodes that cover the same span in order of tree + // depth, top-most first. + for (std::vector::const_reverse_iterator r = nodes.rbegin(); + r != nodes.rend(); ++r) { + SyntaxNode *node = *r; + SyntaxTree *tree = nodeToTree[node]; + if (!prevNode) { + // node is the root. + root = tree; + tree->parent() = 0; + } else if (prevNode->GetStart() == node->GetStart()) { + // prevNode is the parent of node. + assert(prevNode->GetEnd() >= node->GetEnd()); + tree->parent() = prevTree; + prevTree->children().push_back(tree); + } else { + // prevNode is a descendant of node's parent. The lowest common + // ancestor of prevNode and node will be node's parent. + SyntaxTree *ancestor = prevTree->parent(); + while (ancestor->value().GetEnd() < tree->value().GetEnd()) { + ancestor = ancestor->parent(); + } + assert(ancestor); + tree->parent() = ancestor; + ancestor->children().push_back(tree); + } + prevNode = node; + prevTree = tree; + } + } + } + + //return boost::shared_ptr(root); + return std::auto_ptr(root); +} + } // namespace MosesTraining diff --git a/phrase-extract/SyntaxNodeCollection.h b/phrase-extract/SyntaxNodeCollection.h index 70b14206d..c54400ca1 100644 --- a/phrase-extract/SyntaxNodeCollection.h +++ b/phrase-extract/SyntaxNodeCollection.h @@ -24,7 +24,10 @@ #include #include +#include + #include "SyntaxNode.h" +#include "SyntaxTree.h" namespace MosesTraining { @@ -70,6 +73,9 @@ public: } void ConnectNodes(); void Clear(); + + std::auto_ptr ExtractTree(); + //boost::shared_ptr ExtractTree(); }; } // namespace MosesTraining diff --git a/phrase-extract/extract-ghkm/XmlTreeParser.cpp b/phrase-extract/extract-ghkm/XmlTreeParser.cpp index bbf20c765..83dfbd42f 100644 --- a/phrase-extract/extract-ghkm/XmlTreeParser.cpp +++ b/phrase-extract/extract-ghkm/XmlTreeParser.cpp @@ -53,11 +53,11 @@ std::auto_ptr XmlTreeParser::Parse(const std::string &line) } catch (const XmlException &e) { throw Exception(e.getMsg()); } - m_tree.ConnectNodes(); - SyntaxNode *root = m_tree.GetTop(); - assert(root); + //boost::shared_ptr root = m_tree.ExtractTree(); + std::auto_ptr root = m_tree.ExtractTree(); m_words = util::tokenize(m_line); - return ConvertTree(*root, m_words); + AttachWords(m_words, *root); + return root; } // Converts a SyntaxNode tree to a MosesTraining::GHKM::SyntaxTree. 
@@ -90,5 +90,32 @@ std::auto_ptr XmlTreeParser::ConvertTree( return root; } +void XmlTreeParser::AttachWords(const std::vector &words, + SyntaxTree &root) +{ + std::vector leaves; + leaves.reserve(words.size()); + for (SyntaxTree::LeafIterator p(root); p != SyntaxTree::LeafIterator(); ++p) { + leaves.push_back(&*p); + } + + std::vector::const_iterator q = words.begin(); + for (std::vector::iterator p = leaves.begin(); p != leaves.end(); + ++p) { + SyntaxTree *leaf = *p; + const int start = leaf->value().GetStart(); + const int end = leaf->value().GetEnd(); + if (start != end) { + std::ostringstream msg; + msg << "leaf node covers multiple words (" << start << "-" << end + << "): this is currently unsupported"; + throw Exception(msg.str()); + } + SyntaxTree *newLeaf = new SyntaxTree(SyntaxNode(start, end, *q++)); + leaf->children().push_back(newLeaf); + newLeaf->parent() = leaf; + } +} + } // namespace GHKM } // namespace MosesTraining diff --git a/phrase-extract/extract-ghkm/XmlTreeParser.h b/phrase-extract/extract-ghkm/XmlTreeParser.h index 4e89e7167..2fcdd9b56 100644 --- a/phrase-extract/extract-ghkm/XmlTreeParser.h +++ b/phrase-extract/extract-ghkm/XmlTreeParser.h @@ -54,12 +54,13 @@ public: }; private: - std::set &m_labelSet; std::map &m_topLabelSet; std::string m_line; SyntaxNodeCollection m_tree; std::vector m_words; + + void AttachWords(const std::vector &, SyntaxTree &); }; } // namespace GHKM From 25f98a446e8802398a5f06bf299ca1587aad157f Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Mon, 1 Jun 2015 18:19:34 +0100 Subject: [PATCH 029/108] Bug fix in building imTtrack directly from input stream. --- moses/TranslationModel/UG/mm/ug_im_ttrack.h | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/moses/TranslationModel/UG/mm/ug_im_ttrack.h b/moses/TranslationModel/UG/mm/ug_im_ttrack.h index 20ab653f4..503a5546c 100644 --- a/moses/TranslationModel/UG/mm/ug_im_ttrack.h +++ b/moses/TranslationModel/UG/mm/ug_im_ttrack.h @@ -57,7 +57,7 @@ namespace ugdiss public: imTtrack(boost::shared_ptr > > const& d); - imTtrack(istream& in, TokenIndex const& V, ostream* log = NULL); + imTtrack(istream& in, TokenIndex& V, ostream* log = NULL); imTtrack(size_t reserve = 0); // imTtrack(istream& in, Vocab& V); @@ -131,24 +131,30 @@ namespace ugdiss template imTtrack:: - imTtrack(istream& in, TokenIndex const& V, ostream* log) + imTtrack(istream& in, TokenIndex& V, ostream* log) : numToks(0) { myData.reset(new vector >()); string line,w; size_t linectr=0; boost::unordered_map H; - for (id_type i = 0; i < V.knownVocabSize(); ++i) - H[V[i]] = i; + // for (id_type i = 0; i < V.knownVocabSize(); ++i) + // H[V[i]] = i; while (getline(in,line)) { + // cout << line << endl; myData->push_back(vector()); if (log && ++linectr%1000000==0) *log << linectr/1000000 << "M lines of input processed" << endl; istringstream buf(line); + // cout << line << endl; while (buf>>w) - myData->back().push_back(Token(H[w])); - myData->back().resize(myData.back().size()); + { + myData->back().push_back(Token(V[w])); + // cout << w << " " << myData->back().back().id() << " " + // << V[w] << endl; + } + // myData->back().resize(myData->back().size(), Token(0)); numToks += myData->back().size(); } } From 349163f3fd915c9c61241778db3eecb36d6f526d Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Mon, 1 Jun 2015 18:21:52 +0100 Subject: [PATCH 030/108] Bug fix and in-line code documentation. 
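Since the previous patch now maps every corpus token through a dynamic TokenIndex (Token(V[w])), operator[](char const*) has to work on an index that was never open()ed from disk: the constructor now initialises startIdx and endIdx, and the lookup only binary-searches the memory-mapped entry range when that range is non-empty, falling through to the dynamic map otherwise. Below is a minimal, self-contained sketch of the patched control flow, rewritten over std::map purely for illustration; Dict, lookup and the toy id assignment are stand-ins invented for this sketch and are not part of the Moses API.

    // toy_lookup.cc -- schematic model of the patched TokenIndex lookup path
    #include <cassert>
    #include <map>
    #include <string>

    typedef std::map<std::string, unsigned> Dict;

    unsigned lookup(Dict const& fixed, Dict& extra, bool dynamic,
                    unsigned unkId, std::string const& w)
    {
      if (!fixed.empty())                  // static (memory-mapped) part present
        {
          Dict::const_iterator m = fixed.find(w);
          if (m != fixed.end()) return m->second;
          if (!dynamic) return unkId;      // static-only index: unknown word
        }
      else if (!dynamic) return (w == "NULL" ? 0 : unkId);
      Dict::iterator m = extra.find(w);    // dynamic part: register unseen words
      if (m == extra.end())
        m = extra.insert(std::make_pair(w, unsigned(fixed.size() + extra.size()))).first;
      return m->second;
    }

    int main()
    {
      Dict fixed, extra;                   // empty static index, as in the previous patch
      unsigned a = lookup(fixed, extra, true, 1, "house");
      assert(lookup(fixed, extra, true, 1, "house") == a);  // repeated lookups are stable
      return 0;
    }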
--- .../TranslationModel/UG/mm/tpt_tokenindex.cc | 32 +++++++++++++------ 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/moses/TranslationModel/UG/mm/tpt_tokenindex.cc b/moses/TranslationModel/UG/mm/tpt_tokenindex.cc index 5fc6a6acc..0be8aa082 100644 --- a/moses/TranslationModel/UG/mm/tpt_tokenindex.cc +++ b/moses/TranslationModel/UG/mm/tpt_tokenindex.cc @@ -16,7 +16,8 @@ namespace ugdiss TokenIndex:: TokenIndex(string unkToken) - : ridx(0),unkLabel(unkToken),unkId(1),numTokens(0) + : ridx(0), unkLabel(unkToken), unkId(1), numTokens(0) + , startIdx(0), endIdx(0) { lock.reset(new boost::mutex()); }; @@ -94,15 +95,25 @@ namespace ugdiss TokenIndex:: operator[](char const* p) const { - if (startIdx==endIdx && !dynamic) return strcmp(p,"NULL") && unkId; - Entry const* bla = lower_bound(startIdx,endIdx,p,comp); - if (bla != endIdx && !strcmp(comp.base+bla->offset,p)) - return bla->id; - if (!dynamic) return unkId; + if (startIdx != endIdx) + { + Entry const* bla = lower_bound(startIdx,endIdx,p,comp); + if (bla != endIdx && !strcmp(comp.base+bla->offset,p)) + return bla->id; + if (!dynamic) return unkId; + } + else if (!dynamic) return strcmp(p,"NULL") && unkId; + boost::lock_guard lk(*this->lock); - // stuff below is new as of 2011-01-30, for dynamic adding of unknown items - // IMPORTANT: numTokens is not currently not changed, it is the number of - // PRE-EXISING TOKENS, not including dynamically added Items + // stuff below is new as of 2011-01-30, for dynamic adding of + // unknown items IMPORTANT: numTokens is not currently not + // changed, it is the number of PRE-EXISING TOKENS, not including + // dynamically added Items + // if (!str2idExtra) + // { + // this->str2idExtra.reset(new map()); + // this->newWords.reset(new vector()); + // } map::value_type newItem(p,str2idExtra->size()+numTokens); pair::iterator,bool> foo = str2idExtra->insert(newItem); if (foo.second) // it actually is a new item @@ -144,10 +155,13 @@ namespace ugdiss if (!ridx.size()) { boost::lock_guard lk(*this->lock); + // Someone else (multi-threading!) may have created the + // reverse index in the meantime, so let's check again if (!ridx.size()) ridx = reverseIndex(); } if (id < ridx.size()) return ridx[id]; + boost::lock_guard lk(*this->lock); if (dynamic && id < ridx.size()+newWords->size()) return (*newWords)[id-ridx.size()].c_str(); From 99896cfd2cbbe9bdde1f16c0bf1adbbfc6579296 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Mon, 1 Jun 2015 18:22:37 +0100 Subject: [PATCH 031/108] Untangling bitext class from Moses dependencies, so that the class can be used independently of Moses again. 
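The decoder-specific pieces (ContextScope, TranslationTask, the bias/cache plumbing) move behind #ifndef NO_MOSES and into the new header ug_bitext_moses.h, while the plain prep()/lookup() overloads remain available either way, so stand-alone tools only ever see the task-free interface. The snippet below is a toy model of that compile-time split, for illustration only: the real overload pair is Bitext::lookup(ttask, phrase) versus Bitext::lookup(phrase); the types here are invented stand-ins so the example builds both with and without -DNO_MOSES.

    // split_demo.cc -- toy illustration of the NO_MOSES overload split
    #include <iostream>

    #ifndef NO_MOSES
    struct ToyTask { int id; };             // stands in for the TranslationTask handle
    void lookup(ToyTask const& t, char const* p)
    { std::cout << "task-scoped lookup (" << t.id << "): " << p << std::endl; }
    #endif

    void lookup(char const* p)              // always compiled; no decoder types needed
    { std::cout << "plain lookup: " << p << std::endl; }

    int main()
    {
    #ifndef NO_MOSES
      ToyTask t = { 42 };
      lookup(t, "maison");
    #else
      lookup("maison");
    #endif
      return 0;
    }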
--- moses/TranslationModel/UG/mm/ug_bitext.h | 186 +++--------------- .../TranslationModel/UG/mm/ug_bitext_moses.h | 88 +++++++++ .../UG/mm/ug_lexical_reordering.h | 30 ++- moses/TranslationModel/UG/mm/ug_mm_bitext.h | 5 +- moses/TranslationModel/UG/mm/ug_phrasepair.h | 6 + 5 files changed, 156 insertions(+), 159 deletions(-) create mode 100644 moses/TranslationModel/UG/mm/ug_bitext_moses.h diff --git a/moses/TranslationModel/UG/mm/ug_bitext.h b/moses/TranslationModel/UG/mm/ug_bitext.h index c1a065b0a..2d2afc3ca 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext.h +++ b/moses/TranslationModel/UG/mm/ug_bitext.h @@ -35,12 +35,18 @@ #include "moses/TranslationModel/UG/generic/sampling/Sampling.h" #include "moses/TranslationModel/UG/generic/file_io/ug_stream.h" #include "moses/TranslationModel/UG/generic/threading/ug_thread_safe_counter.h" -#include "moses/FF/LexicalReordering/LexicalReorderingState.h" +// #include "moses/FF/LexicalReordering/LexicalReorderingState.h" #include "moses/Util.h" -// #include "moses/StaticData.h" + +#ifndef NO_MOSES +// #pragma message "COMPILING WITH MOSES SUPPORT!" +#include "moses/StaticData.h" #include "moses/thread_safe_container.h" #include "moses/ContextScope.h" #include "moses/TranslationTask.h" +#else +// #pragma message "COMPILING WITHOUT MOSES SUPPORT!" +#endif #include "util/exception.hh" // #include "util/check.hh" @@ -70,6 +76,7 @@ namespace Moses { float lbop(size_t const tries, size_t const succ, float const confidence); void write_bitvector(bitvector const& v, ostream& out); +#ifndef NO_MOSES struct ContextForQuery { @@ -82,7 +89,7 @@ namespace Moses { ostream* bias_log; ContextForQuery() : bias_log(NULL) { } }; - +#endif template class Bitext @@ -140,8 +147,13 @@ namespace Moses { // prep2 launches sampling and returns immediately. 
// lookup (below) waits for the job to finish before it returns + sptr + prep2(iter const& phrase, int max_sample = -1) const; + +#ifndef NO_MOSES sptr prep2(ttasksptr const& ttask, iter const& phrase, int max_sample = -1) const; +#endif public: Bitext(size_t const max_sample = 1000, size_t const xnum_workers = 16); @@ -157,9 +169,15 @@ namespace Moses { open(string const base, string const L1, string const L2) = 0; sptr - lookup(ttasksptr const& ttask, iter const& phrase, int max_sample = -1) const; + lookup(iter const& phrase, int max_sample = -1) const; + void prep(iter const& phrase) const; +#ifndef NO_MOSES + sptr + lookup(ttasksptr const& ttask, iter const& phrase, int max_sample = -1) const; void prep(ttasksptr const& ttask, iter const& phrase) const; +#endif + void setDefaultSampleSize(size_t const max_samples); size_t getDefaultSampleSize() const; @@ -181,16 +199,7 @@ namespace Moses { void write_yawat_alignment ( id_type const sid, iter const* m1, iter const* m2, ostream& out ) const; -#if 0 - // needs to be adapted to the new API - void - lookup(std::vector const& snt, TSA& idx, - std::vector > > > >& dest, - std::vector >* pidmap = NULL, - typename PhrasePair::Scorer* scorer=NULL, - sptr const bias, - bool multithread=true) const; -#endif + string docname(id_type const sid) const; }; @@ -427,11 +436,13 @@ namespace Moses { template void Bitext:: - prep(ttasksptr const& ttask, iter const& phrase) const + prep(iter const& phrase) const { - prep2(ttask, phrase, m_default_sample_size); + prep2(phrase, m_default_sample_size); } + + // prep2 schedules a phrase for sampling, and returns immediately // the member function lookup retrieves the respective pstats instance // and waits until the sampling is finished before it returns. @@ -440,26 +451,20 @@ namespace Moses { sptr Bitext ::prep2 - ( ttasksptr const& ttask, iter const& phrase, int max_sample) const + (iter const& phrase, int max_sample) const { if (max_sample < 0) max_sample = m_default_sample_size; - sptr scope = ttask->GetScope(); - sptr context = scope->get(this); sptr bias; - if (context) bias = context->bias; sptr cache; - // - no caching for rare phrases and special requests (max_sample) // (still need to test what a good caching threshold is ...) // - use the task-specific cache when there is a sampling bias if (max_sample == int(m_default_sample_size) && phrase.approxOccurrenceCount() > m_pstats_cache_threshold) { - cache = (phrase.root == I1.get() - ? (bias ? context->cache1 : m_cache1) - : (bias ? context->cache2 : m_cache2)); - // if (bias) cerr << "Using bias." << endl; + cache = (phrase.root == I1.get() ? 
m_cache1 : m_cache2); } + sptr ret; sptr const* cached; @@ -472,9 +477,6 @@ namespace Moses { if (m_num_workers > 1) ag->add_workers(m_num_workers); } - // cerr << "NEW FREQUENT PHRASE: " - // << phrase.str(V1.get()) << " " << phrase.approxOccurrenceCount() - // << " at " << __FILE__ << ":" << __LINE__ << endl; ret = ag->add_job(this, phrase, max_sample, bias); if (cache) cache->set(phrase.getPid(),ret); UTIL_THROW_IF2(ret == NULL, "Couldn't schedule sampling job."); @@ -545,87 +547,6 @@ namespace Moses { } }; -#if 0 - template - void - Bitext:: - lookup(std::vector const& snt, TSA& idx, - std::vector > > > >& dest, - std::vector >* pidmap, - typename PhrasePair::Scorer* scorer, - sptr const& bias, bool multithread) const - { - // typedef std::vector > > > > ret_t; - - dest.clear(); - dest.resize(snt.size()); - if (pidmap) { pidmap->clear(); pidmap->resize(snt.size()); } - - // collect statistics in parallel, then build PT entries as - // the sampling finishes - bool fwd = &idx == I1.get(); - std::vector workers; // background threads doing the lookup - pplist_cache_t& C = (fwd ? m_pplist_cache1 : m_pplist_cache2); - if (C.capacity() < 100000) C.reserve(100000); - for (size_t i = 0; i < snt.size(); ++i) - { - dest[i].reserve(snt.size()-i); - typename TSA::tree_iterator m(&idx); - for (size_t k = i; k < snt.size() && m.extend(snt[k].id()); ++k) - { - uint64_t key = m.getPid(); - if (pidmap) (*pidmap)[i].push_back(key); - sptr > > pp = C.get(key); - if (pp) - dest[i].push_back(pp); - else - { - pp.reset(new std::vector >()); - C.set(key,pp); - dest[i].push_back(pp); - sptr x = prep2(m, this->default_sample_size,bias); - pstats2pplist w(m,*(fwd?T2:T1),x,*pp,scorer); - if (multithread) - { - boost::thread* t = new boost::thread(w); - workers.push_back(t); - } - else w(); - } - } - } - for (size_t w = 0; w < workers.size(); ++w) - { - workers[w]->join(); - delete workers[w]; - } - } -#endif - - template - sptr - Bitext:: - lookup(ttasksptr const& ttask, iter const& phrase, int max_sample) const - { - sptr ret = prep2(ttask, phrase, max_sample); - - UTIL_THROW_IF2(!ret, "Got NULL pointer where I expected a valid pointer."); - - // Why were we locking here? 
- if (m_num_workers <= 1) - { - boost::unique_lock guard(m_lock); - typename agenda::worker(*this->ag)(); - } - else - { - boost::unique_lock lock(ret->lock); - while (ret->in_progress) - ret->ready.wait(lock); - } - return ret; - } - template void Bitext @@ -729,27 +650,6 @@ namespace Moses { } } -#if 0 - template - sptr - Bitext:: - lookup(siter const& phrase, size_t const max_sample, - sptr const& bias) const - { - sptr ret = prep2(phrase, max_sample); - boost::unique_lock guard(m_lock); - if (this->num_workers <= 1) - typename agenda::worker(*this->ag)(); - else - { - boost::unique_lock lock(ret->lock); - while (ret->in_progress) - ret->ready.wait(lock); - } - return ret; - } -#endif - template void expand(typename Bitext::iter const& m, @@ -773,33 +673,9 @@ namespace Moses { } } -#if 0 - template - class - PStatsCache - { - typedef boost::unordered_map > my_cache_t; - boost::shared_mutex m_lock; - my_cache_t m_cache; - - public: - sptr get(Bitext::iter const& phrase) const; - - sptr - add(Bitext::iter const& phrase) const - { - uint64_t pid = phrase.getPid(); - std::pair - } - - - }; -#endif } // end of namespace bitext } // end of namespace moses #include "ug_im_bitext.h" #include "ug_mm_bitext.h" - - - +#include "ug_bitext_moses.h" diff --git a/moses/TranslationModel/UG/mm/ug_bitext_moses.h b/moses/TranslationModel/UG/mm/ug_bitext_moses.h new file mode 100644 index 000000000..539a9166d --- /dev/null +++ b/moses/TranslationModel/UG/mm/ug_bitext_moses.h @@ -0,0 +1,88 @@ +// -*- mode: c++; cc-style: moses-cc-style -*- +#pragma once +#ifndef NO_MOSES +namespace Moses { +namespace bitext { + +template +sptr +Bitext:: +lookup(ttasksptr const& ttask, iter const& phrase, int max_sample) const +{ + sptr ret = prep2(ttask, phrase, max_sample); + UTIL_THROW_IF2(!ret, "Got NULL pointer where I expected a valid pointer."); + + // Why were we locking here? + if (m_num_workers <= 1) + { + boost::unique_lock guard(m_lock); + typename agenda::worker(*this->ag)(); + } + else + { + boost::unique_lock lock(ret->lock); + while (ret->in_progress) + ret->ready.wait(lock); + } + return ret; +} + + +template +void +Bitext:: +prep(ttasksptr const& ttask, iter const& phrase) const +{ + prep2(ttask, phrase, m_default_sample_size); +} + + +// prep2 schedules a phrase for sampling, and returns immediately +// the member function lookup retrieves the respective pstats instance +// and waits until the sampling is finished before it returns. +// This allows sampling in the background +template +sptr +Bitext +::prep2 +( ttasksptr const& ttask, iter const& phrase, int max_sample) const +{ + if (max_sample < 0) max_sample = m_default_sample_size; + sptr bias; + sptr scope = ttask->GetScope(); + sptr context = scope->get(this); + if (context) bias = context->bias; + sptr cache; + // - no caching for rare phrases and special requests (max_sample) + // (still need to test what a good caching threshold is ...) + // - use the task-specific cache when there is a sampling bias + if (max_sample == int(m_default_sample_size) + && phrase.approxOccurrenceCount() > m_pstats_cache_threshold) + { + cache = (phrase.root == I1.get() + ? (bias ? context->cache1 : m_cache1) + : (bias ? 
context->cache2 : m_cache2)); + } + sptr ret; + sptr const* cached; + + if (cache && (cached = cache->get(phrase.getPid(), ret)) && *cached) + return *cached; + boost::unique_lock guard(m_lock); + if (!ag) + { + ag.reset(new agenda(*this)); + if (m_num_workers > 1) + ag->add_workers(m_num_workers); + } + ret = ag->add_job(this, phrase, max_sample, bias); + if (cache) cache->set(phrase.getPid(),ret); + UTIL_THROW_IF2(ret == NULL, "Couldn't schedule sampling job."); + return ret; +} + + + +} +} +#endif diff --git a/moses/TranslationModel/UG/mm/ug_lexical_reordering.h b/moses/TranslationModel/UG/mm/ug_lexical_reordering.h index 9004b757e..9c56e6cb5 100644 --- a/moses/TranslationModel/UG/mm/ug_lexical_reordering.h +++ b/moses/TranslationModel/UG/mm/ug_lexical_reordering.h @@ -1,9 +1,35 @@ // -*- c++ -*- #pragma once #include -#include "moses/FF/LexicalReordering/LexicalReorderingState.h" -namespace Moses { namespace bitext { +#ifndef NO_MOSES +#include "moses/FF/LexicalReordering/LexicalReorderingState.h" +#endif + +namespace Moses { +#ifdef NO_MOSES +namespace LRModel{ + + enum ModelType { Monotonic, MSD, MSLR, LeftRight, None }; + enum Direction { Forward, Backward, Bidirectional }; + + enum ReorderingType { + M = 0, // monotonic + NM = 1, // non-monotonic + S = 1, // swap + D = 2, // discontinuous + DL = 2, // discontinuous, left + DR = 3, // discontinuous, right + R = 0, // right + L = 1, // left + MAX = 3, // largest possible + NONE = 4 // largest possible + }; + +} +#endif + +namespace bitext { typedef Moses::LRModel::ReorderingType PhraseOrientation; diff --git a/moses/TranslationModel/UG/mm/ug_mm_bitext.h b/moses/TranslationModel/UG/mm/ug_mm_bitext.h index be3fdfce8..4f93d4d3c 100644 --- a/moses/TranslationModel/UG/mm/ug_mm_bitext.h +++ b/moses/TranslationModel/UG/mm/ug_mm_bitext.h @@ -45,8 +45,9 @@ namespace Moses this->m_docname2docid[docname] = docid; this->m_docname.push_back(docname); line >> b; - VERBOSE(1, "DOCUMENT MAP " << docname - << " " << a << "-" << b+a << endl); +#ifndef NO_MOSES + VERBOSE(1, "DOCUMENT MAP " << docname << " " << a << "-" << b+a << endl); +#endif for (b += a; a < b; ++a) (*this->m_sid2docid)[a] = docid; } diff --git a/moses/TranslationModel/UG/mm/ug_phrasepair.h b/moses/TranslationModel/UG/mm/ug_phrasepair.h index 53a9f761c..7e565c2df 100644 --- a/moses/TranslationModel/UG/mm/ug_phrasepair.h +++ b/moses/TranslationModel/UG/mm/ug_phrasepair.h @@ -3,7 +3,9 @@ #include #include "ug_typedefs.h" #include "ug_bitext_pstats.h" +#ifndef NO_MOSES #include "moses/FF/LexicalReordering/LexicalReorderingState.h" +#endif #include "boost/format.hpp" #include "tpt_tokenindex.h" namespace Moses @@ -52,9 +54,11 @@ namespace Moses fill_lr_vec(LRModel::Direction const& dir, LRModel::ModelType const& mdl, vector& v) const; +#ifndef NO_MOSES void print(ostream& out, TokenIndex const& V1, TokenIndex const& V2, LRModel const& LR) const; +#endif class SortByTargetIdSeq { @@ -292,6 +296,7 @@ namespace Moses } +#ifndef NO_MOSES template void PhrasePair @@ -331,5 +336,6 @@ namespace Moses } #endif } +#endif } // namespace bitext } // namespace Moses From cc800742b1163545d3c72544f5cfd6a5059eeba0 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Mon, 1 Jun 2015 18:26:27 +0100 Subject: [PATCH 032/108] Updated Makefile for local compiles. 
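For orientation, the rewritten Makefile below selects the build flavour through a `variant` variable (runtime, debug, profile, syntax), drops sources listed in `skip` that do not compile under NO_MOSES, and locates MOSES_ROOT by walking up the directory tree until it finds Jamroot. A minimal sketch of a local invocation, assuming GNU make, bash and g++ are on the PATH and that mtt-build.cc (one of the tools with a main() function) is among the scanned sources:

    # objects are written to build/debug, the binary to bin/debug/mtt-build;
    # program targets are derived from whichever .cc files define main()
    make variant=debug compiler=g++ mtt-build
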
--- moses/TranslationModel/UG/Makefile | 174 +++++++++++++++-------------- 1 file changed, 92 insertions(+), 82 deletions(-) diff --git a/moses/TranslationModel/UG/Makefile b/moses/TranslationModel/UG/Makefile index 56fad1feb..ed1dead52 100644 --- a/moses/TranslationModel/UG/Makefile +++ b/moses/TranslationModel/UG/Makefile @@ -1,7 +1,22 @@ -# Some systems apparently distinguish between shell -# variables and environment variables. The latter are -# visible to the make utility, the former apparently not, -# so we need to set them if they are not defined yet +SHELL = bash +MAKEFLAGS += --warn-undefined-variables +.DEFAULT_GOAL = all +.SUFFIXES: + +# =============================================================================== +# LOCAL ENVIRONMENT +# =============================================================================== + +# shell script snippet: +define find_moses_root +d=$$(pwd); +while [[ ! -e $$d/Jamroot && $$d != "/" ]] ; do + d=$$(dirname $$d); +done; +echo $$d +endef + +MOSES_ROOT := $(shell $(find_moses_root)) # =============================================================================== # COMPILATION PREFERENCES @@ -10,107 +25,102 @@ # OPTI: optimization level # PROF: profiler switches -CCACHE = ccache -OPTI = 3 -EXE_TAG = exe -PROF = -# PROF = -g -pg +# compiler command +compiler ?= g++ +variant ?= runtime +link ?= static -# =============================================================================== +CXX = $(shell which ccache) $(compiler) +CXXFLAGS += -DMAX_NUM_FACTORS=4 +CXXFLAGS += -DKENLM_MAX_ORDER=5 +CXXFLAGS += -DWITH_THREADS +CXXFLAGS += -DNO_MOSES +CXXFLAGS += -I${MOSES_ROOT} -I. -SHELL = bash -MAKEFLAGS += --warn-undefined-variables -.DEFAULT_GOAL = all -.SUFFIXES: +ifeq ($(variant),debug) +CXXFLAGS += -ggdb -O0 +else ifeq ($(variant),profile) +CXXFLAGS += -g -pg -O3 +else ifeq ($(variant),syntax) +CXXFLAGS += -fsyntax-only +endif + +# LDFLAGS = -L${MOSES_ROOT}/lib -L ./lib/ + +# WDIR = build/$(variant)/${HOSTTYPE}/${KERNEL} +WDIR = build/$(variant) # =============================================================================== # COMPILATION 'LOCALIZATION' -HOST ?= $(shell hostname) -HOSTTYPE ?= $(shell uname -m) -KERNEL = $(shell uname -r) +HOST ?= $(shell hostname) +HOSTTYPE ?= $(shell uname -m) +KERNEL = $(shell uname -r) -MOSES_ROOT = ${HOME}/code/mosesdecoder -WDIR = build/${HOSTTYPE}/${KERNEL}/${OPTI} -VPATH = ${HOME}/code/mosesdecoder/ -CXXFLAGS = ${PROF} -ggdb -Wall -O${OPTI} ${INCLUDES} -CXXFLAGS += -DMAX_NUM_FACTORS=4 -CXXFLAGS += -DKENLM_MAX_ORDER=5 -modirs := $(addprefix -I,$(shell find ${MOSES_ROOT}/moses ${MOSES_ROOT}/contrib -type d)) -CXXFLAGS += -I${MOSES_ROOT} -INCLUDES = -BZLIB = -BOOSTLIBTAG = +nil: -lzma = lzma -#lzma = -REQLIBS = m z pthread dl ${lzma} ${BZLIB} \ - boost_thread${BOOSTLIBTAG} \ - boost_program_options${BOOSTLIBTAG} \ - boost_system${BOOSTLIBTAG} \ - boost_filesystem${BOOSTLIBTAG} \ - boost_iostreams${BOOSTLIBTAG} z bz2 +# libraries required -# icuuc icuio icui18n \ - -LIBS = $(addprefix -l, moses ${REQLIBS}) -LIBS = $(addprefix -l, ${REQLIBS}) -LIBDIRS = -L${HOME}/code/mosesdecoder/lib -LIBDIRS += -L${HOME}/lib -PREFIX ?= . -BINDIR ?= ${PREFIX}/bin -ifeq "$(OPTI)" "0" -BINPREF = debug. 
-else -BINPREF = +LIBS = m z bz2 pthread dl ${BOOSTLIBS} +BOOSTLIBS := thread system filesystem program_options iostreams +BOOSTLIBS := $(addprefix boost_,${BOOSTLIBS}) +ifdef ($(BOOSTLIBTAG),"") +BOOSTLIBS := $(addsuffix ${BOOSTLIBTAG},${BOOSTLIBS}) endif +cc2obj = $(addsuffix .o,$(patsubst ${MOSES_ROOT}%,$(WDIR)%,\ + $(patsubst .%,$(WDIR)%,$(basename $1)))) +cc2exe = $(addprefix ./bin/$(variant)/,$(basename $(notdir $1))) +cc2trg = $(basename $(notdir $1)) -OBJ2 := +define compile -define compile - -DEP += ${WDIR}/$(basename $(notdir $1)).d -${WDIR}/$(basename $(notdir $1)).o : $1 $(wildcard $(basename $1).h) +DEP += $(basename $(call cc2obj,$1)).d +$(call cc2obj,$1): $1 @echo -e "COMPILING $1" @mkdir -p $$(@D) - ${CXX} ${CXXFLAGS} -MD -MP -c $$(abspath $$<) -o $$@ + @${CXX} ${CXXFLAGS} -MD -MP -c $$< -o $$@ endef -testprogs = test-dynamic-im-tsa try-align -programs = mtt-build mtt-dump symal2mam custom-pt mmlex-build ${testprogs} -programs += mtt-count-words +define build -all: $(addprefix ${BINDIR}/${BINPREF}, $(programs)) - @echo $^ -clean: - rm -f ${WDIR}/*.o ${WDIR}/*.d +$(call cc2trg,$1): $(call cc2exe,$1) +$(call cc2exe,$1): $(call cc2obj,$1) $(LIBOBJ) +ifneq ($(variant),syntax) + @echo -e "LINKING $$@" + @mkdir -p $${@D} + @${CXX} ${CXXFLAGS} -o $$@ $(LIBOBJ) $(addprefix -l,${LIBS}) $$< +endif -custom-pt: ${BINDIR}/${BINPREF}custom-pt - echo $^ +endef -INMOGEN = $(wildcard ${MOSES_ROOT}/moses/TranslationModel/UG/generic/*/*.cpp) -#INMOMM = $(wildcard ${MOSES_ROOT}/moses/TranslationModel/UG/mm/*.cc) -#INMOMM += $(wildcard ${MOSES_ROOT}/moses/TranslationModel/UG/mm/*.cpp) -OBJ = $(patsubst %.cc,%.o,$(wildcard $(patsubst %.h,%.cc,$(wildcard *.h)))) -OBJ += $(patsubst %.cpp,%.o,${INMOGEN}) -#OBJ += $(patsubst %.cpp,%.o,${INMOMM}) -#OBJ += $(patsubst %.cc,%.o,${INMOMM}) -EXE = $(patsubst %.cc,%.o,$(filter-out $(patsubst %.h,%.cc,$(wildcard *.h)),$(wildcard *.cc))) +# list files here that you want explicitly excluded from compilation +skip = sim-pe.cc +skip += mtt.count.cc +skip += try-align2.cc +skip += spe-check-coverage3.cc +skip += mmsapt.cpp +skip += ug_stringdist.cc +skip += ug_splice_arglist.cc +skip += ug_lexical_reordering.cc +skip += ug_sampling_bias.cc -$(foreach cpp,${INMOGEN},$(eval $(call compile,${cpp}))) -$(foreach cpp,$(wildcard *.cc),$(eval $(call compile,${cpp}))) -$(addprefix ${BINDIR}/${BINPREF}, $(programs)): $(addprefix ${WDIR}/,$(notdir ${OBJ})) -$(addprefix ${BINDIR}/${BINPREF}, $(programs)): ${MOSES_ROOT}/lib/libmoses.a -${BINDIR}/${BINPREF}%: ${WDIR}/%.o ${WDIR}/mmsapt_align.o - @mkdir -p ${BINDIR} - echo PREREQS: $^ - $(CXX) $(CXXFLAGS) -o $@ $^ ${LIBDIRS} ${LIBS} +# objects from elsewhere in the moses tree that are needed +extra = ${MOSES_ROOT}/util/exception.cc -#try-align: ${WDIR}/try-align.o ${WDIR}/tpt_tokenindex.o -# $(CXX) $(CXXFLAGS) -o $@ $^ ${LIBDIRS} +$(foreach f,$(skip),$(eval broken+=$(shell find -name $f))) -.SECONDARY: +$(info SCANNING DIRECTORY TREE FOR FILES) +find_cfiles = find -name '*.cc' -or -name '*.cpp' +CFILES = $(filter-out $(broken), $(shell $(find_cfiles))) +PROGRAMS := $(shell $(find_cfiles) | xargs grep -lP '^(int +)?main') +PROGRAMS := $(filter-out $(broken),$(PROGRAMS)) + +ALLOBJ = $(call cc2obj,$(CFILES) $(extra)) +LIBOBJ = $(call cc2obj,$(filter-out $(PROGRAMS),$(CFILES) $(extra))) + +$(foreach f,$(CFILES) $(extra),$(eval $(call compile,$f))) +$(foreach p,$(PROGRAMS),$(eval $(call build,$p))) -include $(DEP) From aa4eed93d5791f98e8fc3f51db650d2aa231cc2c Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Mon, 1 Jun 2015 
18:55:40 +0100 Subject: [PATCH 033/108] Bug fix related to getting rid of using namespace std; . --- .../UG/generic/sorting/VectorIndexSorter.h | 24 +++++++++++-------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h b/moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h index 31132c63c..f224b3bae 100644 --- a/moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h +++ b/moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h @@ -16,24 +16,28 @@ namespace Moses { - using namespace std; + // using namespace std; + + + using std::greater; + template, typename IDX_T=size_t> class VectorIndexSorter - : public binary_function + : public std::binary_function { - vector const& m_vecref; + std::vector const& m_vecref; boost::shared_ptr m_comp; public: COMP const& Compare; - VectorIndexSorter(vector const& v, COMP const& comp) + VectorIndexSorter(std::vector const& v, COMP const& comp) : m_vecref(v), Compare(comp) { } - VectorIndexSorter(vector const& v) + VectorIndexSorter(std::vector const& v) : m_vecref(v), m_comp(new COMP()), Compare(*m_comp) { } @@ -43,20 +47,20 @@ namespace Moses return (fwd == bwd ? a < b : fwd); } - boost::shared_ptr > + boost::shared_ptr > GetOrder() const; void - GetOrder(vector & order) const; + GetOrder(std::vector & order) const; }; template - boost::shared_ptr > + boost::shared_ptr > VectorIndexSorter:: GetOrder() const { - boost::shared_ptr > ret(new vector(m_vecref.size())); + boost::shared_ptr > ret(new std::vector(m_vecref.size())); get_order(*ret); return ret; } @@ -64,7 +68,7 @@ namespace Moses template void VectorIndexSorter:: - GetOrder(vector & order) const + GetOrder(std::vector & order) const { order.resize(m_vecref.size()); for (IDX_T i = 0; i < IDX_T(m_vecref.size()); ++i) order[i] = i; From 35cf55d4d25eaff8c99a6467e7fc923f35ac7aa7 Mon Sep 17 00:00:00 2001 From: Jeroen Vermeulen Date: Tue, 2 Jun 2015 15:03:18 +0700 Subject: [PATCH 034/108] Trailing spaces. --- moses/TranslationModel/UG/Makefile | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/moses/TranslationModel/UG/Makefile b/moses/TranslationModel/UG/Makefile index ed1dead52..e595609ad 100644 --- a/moses/TranslationModel/UG/Makefile +++ b/moses/TranslationModel/UG/Makefile @@ -11,8 +11,8 @@ MAKEFLAGS += --warn-undefined-variables define find_moses_root d=$$(pwd); while [[ ! 
-e $$d/Jamroot && $$d != "/" ]] ; do - d=$$(dirname $$d); -done; + d=$$(dirname $$d); +done; echo $$d endef @@ -23,14 +23,14 @@ MOSES_ROOT := $(shell $(find_moses_root)) # =============================================================================== # CCACHE: if set to ccache, use ccache to speed up compilation # OPTI: optimization level -# PROF: profiler switches +# PROF: profiler switches # compiler command compiler ?= g++ variant ?= runtime link ?= static -CXX = $(shell which ccache) $(compiler) +CXX = $(shell which ccache) $(compiler) CXXFLAGS += -DMAX_NUM_FACTORS=4 CXXFLAGS += -DKENLM_MAX_ORDER=5 CXXFLAGS += -DWITH_THREADS @@ -70,7 +70,7 @@ endif cc2obj = $(addsuffix .o,$(patsubst ${MOSES_ROOT}%,$(WDIR)%,\ $(patsubst .%,$(WDIR)%,$(basename $1)))) cc2exe = $(addprefix ./bin/$(variant)/,$(basename $(notdir $1))) -cc2trg = $(basename $(notdir $1)) +cc2trg = $(basename $(notdir $1)) define compile @@ -95,9 +95,9 @@ endif endef # list files here that you want explicitly excluded from compilation -skip = sim-pe.cc -skip += mtt.count.cc -skip += try-align2.cc +skip = sim-pe.cc +skip += mtt.count.cc +skip += try-align2.cc skip += spe-check-coverage3.cc skip += mmsapt.cpp skip += ug_stringdist.cc From 0981d2370505672d027d0f6e17890fb36286c439 Mon Sep 17 00:00:00 2001 From: Jeroen Vermeulen Date: Tue, 2 Jun 2015 16:02:39 +0700 Subject: [PATCH 035/108] Lint-fixing binge. --- .beautify-ignore | 2 + mingw/MosesGUI/addMTModel.py | 11 ++- mingw/MosesGUI/chooseMTModel.py | 31 +++++--- mingw/MosesGUI/main.py | 4 +- mingw/MosesGUI/mainWindow.py | 71 ++++++++++++------ misc/processLexicalTableMin.cpp | 2 +- misc/processPhraseTableMin.cpp | 2 +- moses/FF/GlobalLexicalModel.cpp | 2 +- moses/FF/GlobalLexicalModelUnlimited.cpp | 2 +- moses/IOWrapper.cpp | 4 +- moses/StaticData.cpp | 13 ++-- moses/StaticData.h | 2 +- moses/WordsBitmapTest.cpp | 2 +- moses/parameters/BookkeepingOptions.cpp | 2 +- moses/parameters/NBestOptions.cpp | 14 ++-- moses/parameters/NBestOptions.h | 4 +- phrase-extract/score-main.cpp | 13 +++- .../in-decoding-transliteration.pl | 29 ++++++- .../post-decoding-transliteration.pl | 75 ++++++++++++++++--- .../prepare-transliteration-phrase-table.pl | 26 ++++++- .../train-transliteration-module.pl | 70 ++++++++++++++--- scripts/ems/example/data/weight.ini | 10 +-- scripts/ems/support/berkeley-process.sh | 10 ++- 23 files changed, 299 insertions(+), 102 deletions(-) diff --git a/.beautify-ignore b/.beautify-ignore index 9acdb5824..b05524e1d 100644 --- a/.beautify-ignore +++ b/.beautify-ignore @@ -18,6 +18,8 @@ irstlm jam-files lm mingw/MosesGUI/icons_rc.py +mingw/MosesGUI/Ui_credits.py +mingw/MosesGUI/Ui_mainWindow.py moses/TranslationModel/UG phrase-extract/pcfg-common phrase-extract/syntax-common diff --git a/mingw/MosesGUI/addMTModel.py b/mingw/MosesGUI/addMTModel.py index 8d55400d5..09e6fc542 100644 --- a/mingw/MosesGUI/addMTModel.py +++ b/mingw/MosesGUI/addMTModel.py @@ -4,14 +4,17 @@ Module implementing Dialog. """ -from PyQt4.QtGui import * -from PyQt4.QtCore import * +from PyQt4.QtGui import ( + QDialog, + QFileDialog, + ) +from PyQt4.QtCore import pyqtSignature import datetime import os from Ui_addMTModel import Ui_Dialog -from util import * +from util import doAlert class AddMTModelDialog(QDialog, Ui_Dialog): @@ -88,7 +91,7 @@ class AddMTModelDialog(QDialog, Ui_Dialog): def checkEmpty(mystr): return len(str(mystr).strip()) <= 0 - #check everything + # Check everything. 
self.modelName = self.editName.text() if checkEmpty(self.modelName): doAlert("Please provide non-empty Model Name") diff --git a/mingw/MosesGUI/chooseMTModel.py b/mingw/MosesGUI/chooseMTModel.py index 95c566f1e..5702216b8 100644 --- a/mingw/MosesGUI/chooseMTModel.py +++ b/mingw/MosesGUI/chooseMTModel.py @@ -4,11 +4,18 @@ Module implementing ChooseMTModelDialog. """ -from PyQt4.QtCore import * -from PyQt4.QtGui import * -from PyQt4.QtSql import * +import sys + +from PyQt4.QtCore import ( + pyqtSignature, + QObject, + SIGNAL, + ) +from PyQt4.QtGui import QDialog +from PyQt4.QtSql import QSqlQueryModel from Ui_chooseMTModel import Ui_Dialog +from util import doAlert class ChooseMTModelDialog(QDialog, Ui_Dialog): @@ -28,14 +35,20 @@ class ChooseMTModelDialog(QDialog, Ui_Dialog): self.selTableView.hideColumn(0) self.selTableView.hideColumn(5) self.selTableView.hideColumn(6) - #change status and keep the column - QObject.connect(datamodel, SIGNAL("modelInstalled()"), self.on_datamodel_modelInstalled) + # Change status and keep the column. + QObject.connect( + datamodel, SIGNAL("modelInstalled()"), + self.on_datamodel_modelInstalled) def updateModel(self): - self.model.setQuery('SELECT ID, name, srclang, trglang, status, path, mosesini FROM models WHERE status = "READY" AND deleted != "True"', self.database) + self.model.setQuery( + 'SELECT ID, name, srclang, trglang, status, path, mosesini ' + 'FROM models ' + 'WHERE status = "READY" AND deleted != "True"', + self.database) def on_datamodel_recordUpdated(self, bRecord): - #deal with the selection changed problem + """Deal with the selection changed problem.""" try: if bRecord: current = self.selTableView.currentIndex() @@ -44,9 +57,9 @@ class ChooseMTModelDialog(QDialog, Ui_Dialog): else: self.curSelection = None else: - if not self.curSelection is None: + if self.curSelection is not None: self.selTableView.selectRow(self.curSelection) - except Exception, e: + except Exception as e: print >> sys.stderr, str(e) def on_datamodel_modelInstalled(self): diff --git a/mingw/MosesGUI/main.py b/mingw/MosesGUI/main.py index 805a7bc0c..3bab0e617 100644 --- a/mingw/MosesGUI/main.py +++ b/mingw/MosesGUI/main.py @@ -1,7 +1,6 @@ # -*- coding: utf-8 -*- -from PyQt4.QtCore import * -from PyQt4.QtGui import * +from PyQt4.QtGui import QApplication import os import sys @@ -9,7 +8,6 @@ import sys from mainWindow import MainWindow from datamodel import DataModel from moses import Moses -from util import * if __name__ == "__main__": app = QApplication(sys.argv) diff --git a/mingw/MosesGUI/mainWindow.py b/mingw/MosesGUI/mainWindow.py index 5fb031c50..e92cdbb92 100644 --- a/mingw/MosesGUI/mainWindow.py +++ b/mingw/MosesGUI/mainWindow.py @@ -4,10 +4,19 @@ Module implementing MainWindow. """ -from PyQt4.QtCore import * -from PyQt4.QtGui import * -from PyQt4.QtSql import * +from PyQt4.QtCore import ( + pyqtSignature, + QObject, + Qt, + SIGNAL, + ) +from PyQt4.QtGui import ( + QMainWindow, + QMessageBox, + QProgressDialog, + ) +import sys import threading from Ui_mainWindow import Ui_MainWindow @@ -15,7 +24,7 @@ from addMTModel import AddMTModelDialog from chooseMTModel import ChooseMTModelDialog from engine import Engine from credits import DlgCredits -from util import * +from util import doAlert class MainWindow(QMainWindow, Ui_MainWindow): @@ -54,18 +63,27 @@ class MainWindow(QMainWindow, Ui_MainWindow): Slot documentation goes here. 
""" current = self.tableView.currentIndex() - if current and current.row() >= 0: - if self.engine and self.datamodel.getRowID(current.row()) == self.engine.model['ID']: - text = '''The model is still in use, do you want to stop and delete it? -It might take a while...''' - reply = QMessageBox.question(None, 'Message', text, QMessageBox.Yes, QMessageBox.No) - if reply == QMessageBox.No: - return - t = self.stopEngine(self.engine) - t.join() - self.engine = None - self.clearPanel() - self.datamodel.delModel(current.row()) + if not current or current.row() < 0: + return + model_in_use = ( + self.engine and + self.datamodel.getRowID(current.row()) == self.engine.model['ID'] + ) + if model_in_use: + text = ( + "The model is still in use, do you want to " + "stop and delete it?\n" + "It might take a while..." + ) + reply = QMessageBox.question( + None, 'Message', text, QMessageBox.Yes, QMessageBox.No) + if reply == QMessageBox.No: + return + t = self.stopEngine(self.engine) + t.join() + self.engine = None + self.clearPanel() + self.datamodel.delModel(current.row()) @pyqtSignature("") def on_newModelBtn_clicked(self): @@ -153,17 +171,24 @@ It might take a while...''' if self.progress: self.progress.close() self.progress = None - self.progress = QProgressDialog("Model: %s" % model['name'], "Cancel", 0, self.engine.countSteps(), self) + self.progress = QProgressDialog( + "Model: %s" % model['name'], "Cancel", 0, + self.engine.countSteps(), self) self.progress.setAutoReset(True) self.progress.setAutoClose(True) self.progress.setWindowModality(Qt.WindowModal) self.progress.setWindowTitle('Loading Model...') - QObject.connect(self.progress, SIGNAL("canceled()"), self.progressCancelled) + QObject.connect( + self.progress, SIGNAL("canceled()"), self.progressCancelled) self.progress.show() - #connect engine signal - QObject.connect(self.engine, SIGNAL("stepFinished(int)"), self.engineStepFinished) - QObject.connect(self.engine, SIGNAL("loaded(bool, QString)"), self.engineLoaded) + # Connect engine signal. + QObject.connect( + self.engine, SIGNAL("stepFinished(int)"), + self.engineStepFinished) + QObject.connect( + self.engine, SIGNAL("loaded(bool, QString)"), + self.engineLoaded) def startEngineThread(): self.engine.start() @@ -225,7 +250,9 @@ It might take a while...''' if text.strip() == "": trans.append(text) else: - trans.append(self.engine.translate(text.replace('\r', ' ').strip()).decode('utf8')) + trans.append( + self.engine.translate( + text.replace('\r', ' ').strip()).decode('utf8')) self.editTrg.setText('\n'.join(trans)) except Exception, e: print >> sys.stderr, str(e) diff --git a/misc/processLexicalTableMin.cpp b/misc/processLexicalTableMin.cpp index 8d309c331..8eee489ad 100644 --- a/misc/processLexicalTableMin.cpp +++ b/misc/processLexicalTableMin.cpp @@ -54,7 +54,7 @@ int main(int argc, char** argv) bool multipleScoreTrees = true; size_t quantize = 0; - size_t threads = + size_t threads = #ifdef WITH_THREADS boost::thread::hardware_concurrency() ? boost::thread::hardware_concurrency() : #endif diff --git a/misc/processPhraseTableMin.cpp b/misc/processPhraseTableMin.cpp index 92d63433e..3948a692c 100644 --- a/misc/processPhraseTableMin.cpp +++ b/misc/processPhraseTableMin.cpp @@ -67,7 +67,7 @@ int main(int argc, char **argv) bool sortScoreIndexSet = false; size_t sortScoreIndex = 2; bool warnMe = true; - size_t threads = + size_t threads = #ifdef WITH_THREADS boost::thread::hardware_concurrency() ? 
boost::thread::hardware_concurrency() : #endif diff --git a/moses/FF/GlobalLexicalModel.cpp b/moses/FF/GlobalLexicalModel.cpp index f4df403ae..b5a07b1ef 100644 --- a/moses/FF/GlobalLexicalModel.cpp +++ b/moses/FF/GlobalLexicalModel.cpp @@ -111,7 +111,7 @@ void GlobalLexicalModel::Load() void GlobalLexicalModel::InitializeForInput(ttasksptr const& ttask) { - UTIL_THROW_IF2(ttask->GetSource()->GetType() != SentenceInput, + UTIL_THROW_IF2(ttask->GetSource()->GetType() != SentenceInput, "GlobalLexicalModel works only with sentence input."); Sentence const* s = reinterpret_cast(ttask->GetSource().get()); m_local.reset(new ThreadLocalStorage); diff --git a/moses/FF/GlobalLexicalModelUnlimited.cpp b/moses/FF/GlobalLexicalModelUnlimited.cpp index 434fa7fbb..d507054c2 100644 --- a/moses/FF/GlobalLexicalModelUnlimited.cpp +++ b/moses/FF/GlobalLexicalModelUnlimited.cpp @@ -107,7 +107,7 @@ bool GlobalLexicalModelUnlimited::Load(const std::string &filePathSource, void GlobalLexicalModelUnlimited::InitializeForInput(ttasksptr const& ttask) { - UTIL_THROW_IF2(ttask->GetSource()->GetType() != SentenceInput, + UTIL_THROW_IF2(ttask->GetSource()->GetType() != SentenceInput, "GlobalLexicalModel works only with sentence input."); Sentence const* s = reinterpret_cast(ttask->GetSource().get()); m_local.reset(new ThreadLocalStorage); diff --git a/moses/IOWrapper.cpp b/moses/IOWrapper.cpp index 8cbf4f091..d1bdeb44f 100644 --- a/moses/IOWrapper.cpp +++ b/moses/IOWrapper.cpp @@ -303,10 +303,10 @@ ReadInput() boost::lock_guard lock(m_lock); #endif boost::shared_ptr source = GetBufferedInput(); - if (source) + if (source) { source->SetTranslationId(m_currentLine++); - if (m_look_ahead || m_look_back) + if (m_look_ahead || m_look_back) this->set_context_for(*source); } m_past_input.push_back(source); diff --git a/moses/StaticData.cpp b/moses/StaticData.cpp index ac0c3c990..b41768604 100644 --- a/moses/StaticData.cpp +++ b/moses/StaticData.cpp @@ -593,7 +593,7 @@ bool StaticData::LoadData(Parameter *parameter) ini_factor_maps(); ini_input_options(); m_bookkeeping_options.init(*parameter); - m_nbest_options.init(*parameter); // if (!ini_nbest_options()) return false; + m_nbest_options.init(*parameter); // if (!ini_nbest_options()) return false; if (!ini_output_options()) return false; // threading etc. 
@@ -616,14 +616,14 @@ bool StaticData::LoadData(Parameter *parameter) ini_mira_options(); // set m_nbest_options.enabled = true if necessary: - if (m_mbr || m_useLatticeMBR || m_outputSearchGraph || m_outputSearchGraphSLF - || m_mira || m_outputSearchGraphHypergraph || m_useConsensusDecoding + if (m_mbr || m_useLatticeMBR || m_outputSearchGraph || m_outputSearchGraphSLF + || m_mira || m_outputSearchGraphHypergraph || m_useConsensusDecoding #ifdef HAVE_PROTOBUF - || m_outputSearchGraphPB + || m_outputSearchGraphPB #endif || m_latticeSamplesFilePath.size()) - { - m_nbest_options.enabled = true; + { + m_nbest_options.enabled = true; } // S2T decoder @@ -1371,4 +1371,3 @@ void StaticData::ResetWeights(const std::string &denseWeights, const std::string } } // namespace - diff --git a/moses/StaticData.h b/moses/StaticData.h index 2b1e37b83..a93e67003 100644 --- a/moses/StaticData.h +++ b/moses/StaticData.h @@ -103,7 +103,7 @@ protected: BookkeepingOptions m_bookkeeping_options; // size_t m_nBestSize; // size_t m_nBestFactor; - + size_t m_latticeSamplesSize; size_t m_maxNoTransOptPerCoverage; size_t m_maxNoPartTransOpt; diff --git a/moses/WordsBitmapTest.cpp b/moses/WordsBitmapTest.cpp index 3acd1351a..543c96bd1 100644 --- a/moses/WordsBitmapTest.cpp +++ b/moses/WordsBitmapTest.cpp @@ -40,7 +40,7 @@ BOOST_AUTO_TEST_CASE(initialise) bitvec[2] = true; bitvec[3] = true; bitvec[7] = true; - + WordsBitmap wbm2(7,bitvec); BOOST_CHECK_EQUAL(wbm2.GetSize(),7); for (size_t i = 0; i < 7; ++i) { diff --git a/moses/parameters/BookkeepingOptions.cpp b/moses/parameters/BookkeepingOptions.cpp index 875c605bf..2ab26b53c 100644 --- a/moses/parameters/BookkeepingOptions.cpp +++ b/moses/parameters/BookkeepingOptions.cpp @@ -8,7 +8,7 @@ namespace Moses { bool& x = need_alignment_info; P.SetParameter(x, "print-alignment-info", false); if (!x) P.SetParameter(x, "print-alignment-info-in-n-best", false); - if (!x) + if (!x) { PARAM_VEC const* params = P.GetParam("alignment-output-file"); x = params && params->size(); diff --git a/moses/parameters/NBestOptions.cpp b/moses/parameters/NBestOptions.cpp index 6ec97c91b..45747011a 100644 --- a/moses/parameters/NBestOptions.cpp +++ b/moses/parameters/NBestOptions.cpp @@ -10,22 +10,22 @@ init(Parameter const& P) { const PARAM_VEC *params; params = P.GetParam("n-best-list"); - if (params) + if (params) { - if (params->size() >= 2) + if (params->size() >= 2) { output_file_path = params->at(0); nbest_size = Scan( params->at(1) ); only_distinct = (params->size()>2 && params->at(2)=="distinct"); - } - else + } + else { std::cerr << "wrong format for switch -n-best-list file size [disinct]"; return false; } - } + } else nbest_size = 0; - + P.SetParameter(factor, "n-best-factor", 20); P.SetParameter(include_alignment_info, "print-alignment-info-in-n-best", false ); P.SetParameter(include_feature_labels, "labeled-n-best-list", true ); @@ -33,7 +33,7 @@ init(Parameter const& P) P.SetParameter(include_passthrough, "print-passthrough-in-n-best", false ); P.SetParameter(include_all_factors, "report-all-factors-in-n-best", false ); P.SetParameter(print_trees, "n-best-trees", false ); - + enabled = output_file_path.size(); return true; } diff --git a/moses/parameters/NBestOptions.h b/moses/parameters/NBestOptions.h index e844c1eac..6c868990c 100644 --- a/moses/parameters/NBestOptions.h +++ b/moses/parameters/NBestOptions.h @@ -19,11 +19,9 @@ namespace Moses { bool include_all_factors; std::string output_file_path; - + bool init(Parameter const& param); }; - - } diff --git 
a/phrase-extract/score-main.cpp b/phrase-extract/score-main.cpp index b65dce4ba..185c0ae9e 100644 --- a/phrase-extract/score-main.cpp +++ b/phrase-extract/score-main.cpp @@ -130,7 +130,15 @@ int main(int argc, char* argv[]) ScoreFeatureManager featureManager; if (argc < 4) { - std::cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--KneserNey] [--NoWordAlignment] [--UnalignedPenalty] [--UnalignedFunctionWordPenalty function-word-file] [--MinCountHierarchical count] [--PartsOfSpeech] [--PCFG] [--TreeFragments] [--SourceLabels] [--SourceLabelCountsLHS] [--TargetPreferenceLabels] [--UnpairedExtractFormat] [--ConditionOnTargetLHS] [--CrossedNonTerm]" << std::endl; + std::cerr << + "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] " + "[--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--KneserNey] " + "[--NoWordAlignment] [--UnalignedPenalty] " + "[--UnalignedFunctionWordPenalty function-word-file] " + "[--MinCountHierarchical count] [--PartsOfSpeech] [--PCFG] " + "[--TreeFragments] [--SourceLabels] [--SourceLabelCountsLHS] " + "[--TargetPreferenceLabels] [--UnpairedExtractFormat] " + "[--ConditionOnTargetLHS] [--CrossedNonTerm]" << std::endl; std::cerr << featureManager.usage() << std::endl; exit(1); } @@ -147,7 +155,8 @@ int main(int argc, char* argv[]) std::string fileNameLeftHandSideTargetPreferenceLabelCounts; std::string fileNameLeftHandSideRuleTargetTargetPreferenceLabelCounts; std::string fileNamePhraseOrientationPriors; - std::vector featureArgs; // all unknown args passed to feature manager + // All unknown args are passed to feature manager. + std::vector featureArgs; for(int i=4; i $TRANSLIT_MODEL/evaluation/$eval_file.op`; + `$MOSES_SRC/bin/moses \ + -search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000 \ + -threads 16 -drop-unknown -distortion-limit 0 \ + -n-best-list $TRANSLIT_MODEL/evaluation/$eval_file.op.nBest 100 \ + distinct -f $TRANSLIT_MODEL/evaluation/$eval_file.filtered.ini \ + < $TRANSLIT_MODEL/evaluation/$eval_file \ + > $TRANSLIT_MODEL/evaluation/$eval_file.op`; } diff --git a/scripts/Transliteration/post-decoding-transliteration.pl b/scripts/Transliteration/post-decoding-transliteration.pl index 2c7908085..df840c709 100755 --- a/scripts/Transliteration/post-decoding-transliteration.pl +++ b/scripts/Transliteration/post-decoding-transliteration.pl @@ -137,18 +137,39 @@ sub run_transliteration print "Filter Table\n"; - `$MOSES_SRC/scripts/training/train-model.perl -mgiza -mgiza-cpus 10 -dont-zip -first-step 9 -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 -score-options '--KneserNey' -phrase-translation-table $TRANSLIT_MODEL/model/phrase-table -config $TRANSLIT_MODEL/evaluation/$eval_file.moses.table.ini -lm 0:3:$TRANSLIT_MODEL/evaluation/$eval_file.moses.table.ini:8`; + `$MOSES_SRC/scripts/training/train-model.perl \ + -mgiza -mgiza-cpus 10 -dont-zip -first-step 9 \ + -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \ + -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \ + -score-options '--KneserNey' \ + -phrase-translation-table $TRANSLIT_MODEL/model/phrase-table \ + -config $TRANSLIT_MODEL/evaluation/$eval_file.moses.table.ini \ + -lm 0:3:$TRANSLIT_MODEL/evaluation/$eval_file.moses.table.ini:8`; - `$MOSES_SRC/scripts/training/filter-model-given-input.pl $TRANSLIT_MODEL/evaluation/$eval_file.filtered $TRANSLIT_MODEL/evaluation/$eval_file.moses.table.ini 
$TRANSLIT_MODEL/evaluation/$eval_file -Binarizer "$MOSES_SRC/bin/CreateOnDiskPt 1 1 4 100 2"`; + `$MOSES_SRC/scripts/training/filter-model-given-input.pl \ + $TRANSLIT_MODEL/evaluation/$eval_file.filtered \ + $TRANSLIT_MODEL/evaluation/$eval_file.moses.table.ini \ + $TRANSLIT_MODEL/evaluation/$eval_file \ + -Binarizer "$MOSES_SRC/bin/CreateOnDiskPt 1 1 4 100 2"`; `rm $TRANSLIT_MODEL/evaluation/$eval_file.moses.table.ini`; print "Apply Filter\n"; - `$MOSES_SRC/scripts/ems/support/substitute-filtered-tables-and-weights.perl $TRANSLIT_MODEL/evaluation/$eval_file.filtered/moses.ini $TRANSLIT_MODEL/model/moses.ini $TRANSLIT_MODEL/tuning/moses.tuned.ini $TRANSLIT_MODEL/evaluation/$eval_file.filtered.ini`; + `$MOSES_SRC/scripts/ems/support/substitute-filtered-tables-and-weights.perl \ + $TRANSLIT_MODEL/evaluation/$eval_file.filtered/moses.ini \ + $TRANSLIT_MODEL/model/moses.ini \ + $TRANSLIT_MODEL/tuning/moses.tuned.ini \ + $TRANSLIT_MODEL/evaluation/$eval_file.filtered.ini`; my $drop_stderr = $VERBOSE ? "" : " 2>/dev/null"; - `$DECODER -search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000 -threads 16 -drop-unknown -distortion-limit 0 -n-best-list $TRANSLIT_MODEL/evaluation/$eval_file.op.nBest 1000 distinct -f $TRANSLIT_MODEL/evaluation/$eval_file.filtered.ini < $TRANSLIT_MODEL/evaluation/$eval_file > $TRANSLIT_MODEL/evaluation/$eval_file.op $drop_stderr`; + `$DECODER \ + -search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000 \ + -threads 16 -drop-unknown -distortion-limit 0 \ + -n-best-list $TRANSLIT_MODEL/evaluation/$eval_file.op.nBest 1000 \ + distinct -f $TRANSLIT_MODEL/evaluation/$eval_file.filtered.ini \ + < $TRANSLIT_MODEL/evaluation/$eval_file \ + > $TRANSLIT_MODEL/evaluation/$eval_file.op $drop_stderr`; } @@ -294,22 +315,52 @@ sub run_decoder `mkdir $corpus_dir/evaluation`; - `$MOSES_SRC/scripts/training/train-model.perl -mgiza -mgiza-cpus 10 -dont-zip -first-step 9 -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 -lmodel-oov-feature "yes" -post-decoding-translit "yes" -phrase-translation-table $corpus_dir/model/phrase-table -config $corpus_dir/model/moses.ini -lm 0:5:$LM_FILE:8`; + `$MOSES_SRC/scripts/training/train-model.perl \ + -mgiza -mgiza-cpus 10 -dont-zip -first-step 9 \ + -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \ + -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \ + -lmodel-oov-feature "yes" -post-decoding-translit "yes" \ + -phrase-translation-table $corpus_dir/model/phrase-table \ + -config $corpus_dir/model/moses.ini -lm 0:5:$LM_FILE:8`; `touch $corpus_dir/evaluation/$OUTPUT_FILE_NAME.moses.table.ini`; - `$MOSES_SRC/scripts/training/train-model.perl -mgiza -mgiza-cpus 10 -dont-zip -first-step 9 -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 -lmodel-oov-feature "yes" -post-decoding-translit "yes" -phrase-translation-table $corpus_dir/model/phrase-table -config $corpus_dir/evaluation/$OUTPUT_FILE_NAME.moses.table.ini -lm 0:3:$corpus_dir/evaluation/$OUTPUT_FILE_NAME.moses.table.ini:8`; + `$MOSES_SRC/scripts/training/train-model.perl \ + -mgiza -mgiza-cpus 10 -dont-zip -first-step 9 \ + -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \ + -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \ + -lmodel-oov-feature "yes" -post-decoding-translit "yes" \ + -phrase-translation-table $corpus_dir/model/phrase-table \ + -config $corpus_dir/evaluation/$OUTPUT_FILE_NAME.moses.table.ini \ + -lm 
0:3:$corpus_dir/evaluation/$OUTPUT_FILE_NAME.moses.table.ini:8`; - `$MOSES_SRC/scripts/training/filter-model-given-input.pl $corpus_dir/evaluation/filtered $corpus_dir/evaluation/$OUTPUT_FILE_NAME.moses.table.ini $INPUT_FILE -Binarizer "$MOSES_SRC/bin/CreateOnDiskPt 1 1 4 100 2"`; + `$MOSES_SRC/scripts/training/filter-model-given-input.pl \ + $corpus_dir/evaluation/filtered \ + $corpus_dir/evaluation/$OUTPUT_FILE_NAME.moses.table.ini \ + $INPUT_FILE -Binarizer "$MOSES_SRC/bin/CreateOnDiskPt \ + 1 1 4 100 2"`; `rm $corpus_dir/evaluation/$OUTPUT_FILE_NAME.moses.table.ini`; - `$MOSES_SRC/scripts/ems/support/substitute-filtered-tables.perl $corpus_dir/evaluation/filtered/moses.ini < $corpus_dir/model/moses.ini > $corpus_dir/evaluation/moses.filtered.ini`; + `$MOSES_SRC/scripts/ems/support/substitute-filtered-tables.perl \ + $corpus_dir/evaluation/filtered/moses.ini \ + < $corpus_dir/model/moses.ini \ + > $corpus_dir/evaluation/moses.filtered.ini`; my $drop_stderr = $VERBOSE ? "" : " 2>/dev/null"; - `$DECODER -search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000 -threads 16 -feature-overwrite 'TranslationModel0 table-limit=100' -max-trans-opt-per-coverage 100 -f $corpus_dir/evaluation/moses.filtered.ini -distortion-limit 0 < $INPUT_FILE > $OUTPUT_FILE $drop_stderr`; + `$DECODER \ + -search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000 \ + -threads 16 -feature-overwrite 'TranslationModel0 table-limit=100' \ + -max-trans-opt-per-coverage 100 \ + -f $corpus_dir/evaluation/moses.filtered.ini -distortion-limit 0 \ + < $INPUT_FILE \ + > $OUTPUT_FILE $drop_stderr`; - print "$DECODER -search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000 -threads 16 -feature-overwrite 'TranslationModel0 table-limit=100' -max-trans-opt-per-coverage 100 -f $corpus_dir/evaluation/moses.filtered.ini -distortion-limit 0 < $INPUT_FILE > $OUTPUT_FILE $drop_stderr\n"; + print "$DECODER \ + -search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000 \ + -threads 16 -feature-overwrite 'TranslationModel0 table-limit=100' \ + -max-trans-opt-per-coverage 100 \ + -f $corpus_dir/evaluation/moses.filtered.ini -distortion-limit 0 \ + < $INPUT_FILE \ + > $OUTPUT_FILE $drop_stderr\n"; } - - diff --git a/scripts/Transliteration/prepare-transliteration-phrase-table.pl b/scripts/Transliteration/prepare-transliteration-phrase-table.pl index 0a9f554c5..fd8b5a978 100755 --- a/scripts/Transliteration/prepare-transliteration-phrase-table.pl +++ b/scripts/Transliteration/prepare-transliteration-phrase-table.pl @@ -103,17 +103,35 @@ sub run_transliteration print STDERR "Filter Table\n"; - `$MOSES_SRC/scripts/training/train-model.perl -mgiza -mgiza-cpus 10 -dont-zip -first-step 9 -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 -reordering msd-bidirectional-fe -score-options '--KneserNey' -phrase-translation-table $TRANSLIT_MODEL/model/phrase-table -reordering-table $TRANSLIT_MODEL/model/reordering-table -config $eval_file.moses.table.ini -lm 0:3:$eval_file.moses.table.ini:8`; + `$MOSES_SRC/scripts/training/train-model.perl \ + -mgiza -mgiza-cpus 10 -dont-zip -first-step 9 \ + -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \ + -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \ + -reordering msd-bidirectional-fe -score-options '--KneserNey' \ + -phrase-translation-table $TRANSLIT_MODEL/model/phrase-table \ + -reordering-table $TRANSLIT_MODEL/model/reordering-table \ + -config $eval_file.moses.table.ini \ + -lm 0:3:$eval_file.moses.table.ini:8`; - 
`$MOSES_SRC/scripts/training/filter-model-given-input.pl $eval_file.filtered $eval_file.moses.table.ini $eval_file -Binarizer "$MOSES_SRC/bin/CreateOnDiskPt 1 1 4 100 2"`; + `$MOSES_SRC/scripts/training/filter-model-given-input.pl \ + $eval_file.filtered $eval_file.moses.table.ini $eval_file \ + -Binarizer "$MOSES_SRC/bin/CreateOnDiskPt 1 1 4 100 2"`; `rm $eval_file.moses.table.ini`; print STDERR "Apply Filter\n"; - `$MOSES_SRC/scripts/ems/support/substitute-filtered-tables-and-weights.perl $eval_file.filtered/moses.ini $TRANSLIT_MODEL/model/moses.ini $TRANSLIT_MODEL/tuning/moses.tuned.ini $eval_file.filtered.ini`; + `$MOSES_SRC/scripts/ems/support/substitute-filtered-tables-and-weights.perl \ + $eval_file.filtered/moses.ini $TRANSLIT_MODEL/model/moses.ini \ + $TRANSLIT_MODEL/tuning/moses.tuned.ini $eval_file.filtered.ini`; - `$MOSES_SRC/bin/moses -search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000 -threads 16 -drop-unknown -distortion-limit 0 -n-best-list $eval_file.op.nBest 50 -f $eval_file.filtered.ini < $eval_file > $eval_file.op`; + `$MOSES_SRC/bin/moses \ + -search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000 \ + -threads 16 -drop-unknown -distortion-limit 0 \ + -n-best-list $eval_file.op.nBest 50 \ + -f $eval_file.filtered.ini \ + < $eval_file \ + > $eval_file.op`; } diff --git a/scripts/Transliteration/train-transliteration-module.pl b/scripts/Transliteration/train-transliteration-module.pl index b1d4d0ff5..817e2d815 100755 --- a/scripts/Transliteration/train-transliteration-module.pl +++ b/scripts/Transliteration/train-transliteration-module.pl @@ -118,31 +118,81 @@ sub learn_transliteration_model{ print "Align Corpus\n"; - `$MOSES_SRC_DIR/scripts/training/train-model.perl -mgiza -mgiza-cpus 10 -dont-zip -last-step 1 -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 -score-options '--KneserNey' -corpus $OUT_DIR/training/corpus$t -corpus-dir $OUT_DIR/training/prepared`; + `$MOSES_SRC_DIR/scripts/training/train-model.perl \ + -mgiza -mgiza-cpus 10 -dont-zip -last-step 1 \ + -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \ + -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \ + -score-options '--KneserNey' -corpus $OUT_DIR/training/corpus$t \ + -corpus-dir $OUT_DIR/training/prepared`; - `$MOSES_SRC_DIR/scripts/training/train-model.perl -mgiza -mgiza-cpus 10 -dont-zip -first-step 2 -last-step 2 -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 -score-options '--KneserNey' -corpus-dir $OUT_DIR/training/prepared -giza-e2f $OUT_DIR/training/giza -direction 2`; + `$MOSES_SRC_DIR/scripts/training/train-model.perl -mgiza -mgiza-cpus 10 \ + -dont-zip -first-step 2 -last-step 2 \ + -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \ + -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \ + -score-options '--KneserNey' -corpus-dir $OUT_DIR/training/prepared \ + -giza-e2f $OUT_DIR/training/giza -direction 2`; - `$MOSES_SRC_DIR/scripts/training/train-model.perl -mgiza -mgiza-cpus 10 -dont-zip -first-step 2 -last-step 2 -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 -score-options '--KneserNey' -corpus-dir $OUT_DIR/training/prepared -giza-f2e $OUT_DIR/training/giza-inverse -direction 1`; + `$MOSES_SRC_DIR/scripts/training/train-model.perl \ + -mgiza -mgiza-cpus 10 -dont-zip -first-step 2 -last-step 2 \ + -external-bin-dir $EXTERNAL_BIN_DIR -f 
$INPUT_EXTENSION \ + -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \ + -score-options '--KneserNey' -corpus-dir $OUT_DIR/training/prepared \ + -giza-f2e $OUT_DIR/training/giza-inverse -direction 1`; - `$MOSES_SRC_DIR/scripts/training/train-model.perl -mgiza -mgiza-cpus 10 -dont-zip -first-step 3 -last-step 3 -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 -score-options '--KneserNey' -giza-e2f $OUT_DIR/training/giza -giza-f2e $OUT_DIR/training/giza-inverse -alignment-file $OUT_DIR/model/aligned -alignment-stem $OUT_DIR/model/aligned -alignment grow-diag-final-and`; + `$MOSES_SRC_DIR/scripts/training/train-model.perl \ + -mgiza -mgiza-cpus 10 -dont-zip -first-step 3 -last-step 3 \ + -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \ + -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \ + -score-options '--KneserNey' -giza-e2f $OUT_DIR/training/giza \ + -giza-f2e $OUT_DIR/training/giza-inverse \ + -alignment-file $OUT_DIR/model/aligned \ + -alignment-stem $OUT_DIR/model/aligned -alignment grow-diag-final-and`; print "Train Translation Models\n"; - `$MOSES_SRC_DIR/scripts/training/train-model.perl -mgiza -mgiza-cpus 10 -dont-zip -first-step 4 -last-step 4 -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 -score-options '--KneserNey' -lexical-file $OUT_DIR/model/lex -alignment-file $OUT_DIR/model/aligned -alignment-stem $OUT_DIR/model/aligned -corpus $OUT_DIR/training/corpus$t`; + `$MOSES_SRC_DIR/scripts/training/train-model.perl \ + -mgiza -mgiza-cpus 10 -dont-zip -first-step 4 -last-step 4 \ + -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \ + -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \ + -score-options '--KneserNey' -lexical-file $OUT_DIR/model/lex \ + -alignment-file $OUT_DIR/model/aligned \ + -alignment-stem $OUT_DIR/model/aligned \ + -corpus $OUT_DIR/training/corpus$t`; - `$MOSES_SRC_DIR/scripts/training/train-model.perl -mgiza -mgiza-cpus 10 -dont-zip -first-step 5 -last-step 5 -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 -score-options '--KneserNey' -alignment-file $OUT_DIR/model/aligned -alignment-stem $OUT_DIR/model/aligned -extract-file $OUT_DIR/model/extract -corpus $OUT_DIR/training/corpus$t`; + `$MOSES_SRC_DIR/scripts/training/train-model.perl \ + -mgiza -mgiza-cpus 10 -dont-zip -first-step 5 -last-step 5 \ + -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \ + -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \ + -score-options '--KneserNey' -alignment-file $OUT_DIR/model/aligned \ + -alignment-stem $OUT_DIR/model/aligned -extract-file \ + $OUT_DIR/model/extract -corpus $OUT_DIR/training/corpus$t`; - `$MOSES_SRC_DIR/scripts/training/train-model.perl -mgiza -mgiza-cpus 10 -dont-zip -first-step 6 -last-step 6 -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 -score-options '--KneserNey' -extract-file $OUT_DIR/model/extract -lexical-file $OUT_DIR/model/lex -phrase-translation-table $OUT_DIR/model/phrase-table`; + `$MOSES_SRC_DIR/scripts/training/train-model.perl \ + -mgiza -mgiza-cpus 10 -dont-zip -first-step 6 -last-step 6 \ + -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \ + -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \ + -score-options '--KneserNey' -extract-file $OUT_DIR/model/extract \ + -lexical-file 
$OUT_DIR/model/lex -phrase-translation-table \ + $OUT_DIR/model/phrase-table`; print "Train Language Models\n"; - `$SRILM_DIR/ngram-count -order 5 -interpolate -kndiscount -addsmooth1 0.0 -unk -text $OUT_DIR/lm/target -lm $OUT_DIR/lm/targetLM`; + `$SRILM_DIR/ngram-count \ + -order 5 -interpolate -kndiscount -addsmooth1 0.0 -unk \ + -text $OUT_DIR/lm/target -lm $OUT_DIR/lm/targetLM`; - `$MOSES_SRC_DIR/bin/build_binary $OUT_DIR/lm/targetLM $OUT_DIR/lm/targetLM.bin`; + `$MOSES_SRC_DIR/bin/build_binary \ + $OUT_DIR/lm/targetLM $OUT_DIR/lm/targetLM.bin`; print "Create Config File\n"; - `$MOSES_SRC_DIR/scripts/training/train-model.perl -mgiza -mgiza-cpus 10 -dont-zip -first-step 9 -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 -score-options '--KneserNey' -phrase-translation-table $OUT_DIR/model/phrase-table -config $OUT_DIR/model/moses.ini -lm 0:5:$OUT_DIR/lm/targetLM.bin:8`; + `$MOSES_SRC_DIR/scripts/training/train-model.perl \ + -mgiza -mgiza-cpus 10 -dont-zip -first-step 9 \ + -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \ + -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \ + -score-options '--KneserNey' \ + -phrase-translation-table $OUT_DIR/model/phrase-table \ + -config $OUT_DIR/model/moses.ini -lm 0:5:$OUT_DIR/lm/targetLM.bin:8`; } diff --git a/scripts/ems/example/data/weight.ini b/scripts/ems/example/data/weight.ini index 4e941b662..e42fbb529 100644 --- a/scripts/ems/example/data/weight.ini +++ b/scripts/ems/example/data/weight.ini @@ -3,12 +3,12 @@ ######################### [weight] -Distortion0= 0.3 -UnknownWordPenalty0= 1 -WordPenalty0= -1 +Distortion0= 0.3 +UnknownWordPenalty0= 1 +WordPenalty0= -1 TranslationModel0= 0.2 0.2 0.2 0.2 PhrasePenalty0= 0.2 -LexicalReordering0= 0.3 0.3 0.3 0.3 0.3 0.3 -LM0= 0.5 +LexicalReordering0= 0.3 0.3 0.3 0.3 0.3 0.3 +LM0= 0.5 diff --git a/scripts/ems/support/berkeley-process.sh b/scripts/ems/support/berkeley-process.sh index 347ebba3c..4b23f0c16 100755 --- a/scripts/ems/support/berkeley-process.sh +++ b/scripts/ems/support/berkeley-process.sh @@ -28,7 +28,15 @@ shift shift shift -JAVA_CMD="/usr/local/share/java/bin/java $JAVA_OPTS -jar $JAR -Data.trainSources $INFILE.list -Main.loadParamsDir $PARAMDIR -exec.execDir $OUTNAME -Main.loadLexicalModelOnly false -Data.englishSuffix $SLANG -Data.foreignSuffix $TLANG -exec.create true -Main.saveParams false -Main.alignTraining true -Main.forwardModels HMM -Main.reverseModels HMM -Main.mode JOINT -Main.iters 0 -Data.testSources -EMWordAligner.posteriorDecodingThreshold $POSTERIOR $@" +JAVA_CMD="/usr/local/share/java/bin/java \ + $JAVA_OPTS -jar $JAR -Data.trainSources $INFILE.list \ + -Main.loadParamsDir $PARAMDIR -exec.execDir $OUTNAME \ + -Main.loadLexicalModelOnly false -Data.englishSuffix $SLANG \ + -Data.foreignSuffix $TLANG -exec.create true -Main.saveParams false \ + -Main.alignTraining true -Main.forwardModels HMM \ + -Main.reverseModels HMM -Main.mode JOINT -Main.iters 0 \ + -Data.testSources -EMWordAligner.posteriorDecodingThreshold $POSTERIOR \ + $@" echo "Running $JAVA_CMD" $JAVA_CMD From d3fb4a8002702685d322fcbcb9fc2e5797b4aeb8 Mon Sep 17 00:00:00 2001 From: Phil Williams Date: Tue, 2 Jun 2015 10:16:42 +0100 Subject: [PATCH 036/108] Ongoing moses/phrase-extract refactoring --- phrase-extract/SyntaxNode.h | 4 +++ phrase-extract/XmlTree.cpp | 34 +++++++++++++++++++ .../extract-ghkm/AlignmentGraph.cpp | 8 ++++- 3 files changed, 45 insertions(+), 1 deletion(-) diff --git a/phrase-extract/SyntaxNode.h 
b/phrase-extract/SyntaxNode.h index 46e0f456f..5f57e1790 100644 --- a/phrase-extract/SyntaxNode.h +++ b/phrase-extract/SyntaxNode.h @@ -36,6 +36,10 @@ protected: SyntaxNode* m_parent; float m_pcfgScore; public: + typedef std::map AttributeMap; + + AttributeMap attributes; + SyntaxNode( int startPos, int endPos, std::string label ) :m_start(startPos) ,m_end(endPos) diff --git a/phrase-extract/XmlTree.cpp b/phrase-extract/XmlTree.cpp index 0f068fca7..d3c5da900 100644 --- a/phrase-extract/XmlTree.cpp +++ b/phrase-extract/XmlTree.cpp @@ -80,6 +80,39 @@ string ParseXmlTagAttribute(const string& tag,const string& attributeName) return tag.substr(contentsStart,contentsEnd-contentsStart); } +// TODO Special handling of "label" attribute +// s should be a sequence of name=attribute pairs separated by whitespace. +// e.g. "label=\"S\" pcfg=\"-1.452\" foo=\"blah\\\"blah\"" +void ParseXmlTagAttributes(const std::string &s, + std::map &attributes) +{ + std::size_t begin = 0; + while (true) { + std::size_t pos = s.find('=', begin); + if (pos == std::string::npos) { + return; + } + std::string name = Trim(s.substr(begin, pos-begin)); + begin = s.find('"', pos+1); + if (begin == std::string::npos) { + throw XmlException("invalid tag content"); + } + pos = s.find('"', begin+1); + if (pos == std::string::npos) { + throw XmlException("invalid tag content"); + } + while (s[pos-1] == '\\') { + pos = s.find('"', pos+1); + if (pos == std::string::npos) { + throw XmlException("invalid tag content"); + } + } + // TODO unescape \" + attributes[name] = s.substr(begin+1, pos-begin-1); + begin = pos+1; + } +} + /** * Remove "<" and ">" from XML tag * @@ -377,6 +410,7 @@ bool ProcessAndStripXMLTags(string &line, SyntaxNodeCollection &nodeCollection, } SyntaxNode *node = nodeCollection.AddNode( startPos, endPos-1, label ); node->SetPcfgScore(pcfgScore); + ParseXmlTagAttributes(tagContent, node->attributes); } } } diff --git a/phrase-extract/extract-ghkm/AlignmentGraph.cpp b/phrase-extract/extract-ghkm/AlignmentGraph.cpp index 3fa65656c..1a3c23de5 100644 --- a/phrase-extract/extract-ghkm/AlignmentGraph.cpp +++ b/phrase-extract/extract-ghkm/AlignmentGraph.cpp @@ -216,7 +216,13 @@ Node *AlignmentGraph::CopyParseTree(const SyntaxTree *root) std::auto_ptr n(new Node(root->value().GetLabel(), nodeType)); if (nodeType == TREE) { - n->SetPcfgScore(root->value().GetPcfgScore()); + float score = 0.0f; + SyntaxNode::AttributeMap::const_iterator p = + root->value().attributes.find("pcfg"); + if (p != root->value().attributes.end()) { + score = std::atof(p->second.c_str()); + } + n->SetPcfgScore(score); } const std::vector &children = root->children(); From b3e577be769ecc80257274b3af9f5f2a2490020b Mon Sep 17 00:00:00 2001 From: Jeroen Vermeulen Date: Tue, 2 Jun 2015 17:29:32 +0700 Subject: [PATCH 037/108] Fixing lint. Only 600 or so lines of errors left! 
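The running error count presumably comes from a Python style-checker pass over the scripts tree; a hedged example of such a check, assuming the flake8 tool (not part of this repository) is installed:

    # count remaining PEP 8 violations in the GUI and training scripts
    flake8 --count mingw/MosesGUI scripts/training
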
--- scripts/recaser/train-truecaser.perl | 9 +- .../bilingual-lm/averageNullEmbedding.py | 5 +- .../training/convert-moses-ini-v2-to-v1.py | 390 +++++++++--------- scripts/training/train-neurallm.py | 74 ++-- .../wrappers/mosesxml2berkeleyparsed.perl | 12 +- scripts/training/wrappers/parse-en-senna.perl | 4 +- .../training/wrappers/parse-en-stanford.py | 52 ++- scripts/training/wrappers/senna2brackets.py | 6 + 8 files changed, 301 insertions(+), 251 deletions(-) diff --git a/scripts/recaser/train-truecaser.perl b/scripts/recaser/train-truecaser.perl index 7f8909082..4f600a640 100755 --- a/scripts/recaser/train-truecaser.perl +++ b/scripts/recaser/train-truecaser.perl @@ -8,8 +8,13 @@ # # Options: # -# --possiblyUseFirstToken : boolean option; the default behaviour (when this option is not provided) is that the first token of a sentence is ignored, on the basis that the first word of a sentence is always capitalized; if this option is provided then: a) if a sentence-initial token is *not* capitalized, then it is counted, and b) if a capitalized sentence-initial token is the only token of the segment, then it is counted, but with only 10% of the weight of a normal token. -# +# --possiblyUseFirstToken : boolean option; the default behaviour (when this +# option is not provided) is that the first token of a sentence is ignored, on +# the basis that the first word of a sentence is always capitalized; if this +# option is provided then: a) if a sentence-initial token is *not* capitalized, +# then it is counted, and b) if a capitalized sentence-initial token is the +# only token of the segment, then it is counted, but with only 10% of the +# weight of a normal token. use warnings; use strict; diff --git a/scripts/training/bilingual-lm/averageNullEmbedding.py b/scripts/training/bilingual-lm/averageNullEmbedding.py index 54c9a1bc4..bf0d465f6 100755 --- a/scripts/training/bilingual-lm/averageNullEmbedding.py +++ b/scripts/training/bilingual-lm/averageNullEmbedding.py @@ -6,6 +6,7 @@ import sys import numpy import argparse + parser = argparse.ArgumentParser( description=( "Set input embedding of token to weighted average " @@ -28,6 +29,7 @@ def load_model(model_file): import nplm return nplm.NeuralLM.from_file(model_file) + def get_weights(path, length): counter = [0] * length for line in open(path): @@ -35,6 +37,7 @@ def get_weights(path, length): counter[last_context] += 1 return counter + def main(options): sys.path.append(options.nplm_python_path) @@ -49,7 +52,7 @@ def main(options): numpy.array(model.input_embeddings), weights=weights, axis=0) model.to_file(open(options.output_model, 'w')) -if __name__ == "__main__": +if __name__ == "__main__": options = parser.parse_args() main(options) diff --git a/scripts/training/convert-moses-ini-v2-to-v1.py b/scripts/training/convert-moses-ini-v2-to-v1.py index 3ef7d7c0d..4b7cfa5fa 100755 --- a/scripts/training/convert-moses-ini-v2-to-v1.py +++ b/scripts/training/convert-moses-ini-v2-to-v1.py @@ -6,10 +6,10 @@ from __future__ import ( - absolute_import, - print_function, - unicode_literals, - ) + absolute_import, + print_function, + unicode_literals, + ) __version__ = '1.0' __license__ = 'LGPL3' @@ -19,248 +19,248 @@ import errno from sys import stdout from copy import deepcopy from os.path import ( - dirname, - basename, - exists, - realpath, - ) -from os import ( - sep, - makedirs, - ) + dirname, + basename, + exists, + realpath, + ) +from os import makedirs + root_escape = '%(escape-prefix)s' class moses2_to_ini(object): + def __init__(self, inp, out, 
escape_prefix): + self.inp = inp + self.out = out + self.escape_prefix = escape_prefix + self._config = {} - def __init__(self, inp, out, escape_prefix): - self.inp = inp - self.out = out - self.escape_prefix = escape_prefix - self._config = {} + def parse(self): + key = '' + section = None + self._config = {} + counter = 0 + with open(self.inp, 'rb') as f: + contents = f.read().decode('utf8') - def parse(self): + lines = contents.splitlines() - content = '' - key = '' - section = None - self._config = {} - counter = 0 + # Known feature/functions without attributes. + attrless_ffs = [ + 'UnknownWordPenalty', + 'WordPenalty', + 'PhrasePenalty', + 'Distortion', + ] - with open(self.inp, 'rb' ) as f: - contents = f.read().decode('utf8') + # Retrieve all values except feature/functions with attributes. + for i, line in [(i, line.strip()) for i, line in enumerate(lines) + if line.strip() and not line.strip().startswith('#')]: - lines = contents.splitlines() + if line.startswith('[') and line.endswith(']'): - # retrieve all values except feature/functions with attributes - for i, line in [(i, line.strip()) for i, line in enumerate(lines) - if line.strip() and not line.strip().startswith('#')]: + section = line.strip('] [') - if line.startswith('[') and line.endswith(']'): + if section not in self._config.keys() + ['feature', 'weight']: + # New section not in config and not a reserved section. + counter = 0 + key = section + self._config[key] = {} - section = line.strip('] [') + elif section == 'feature' and line in attrless_ffs: + # Known feature/funcions without attributes. + key = '%s0' % line + if key not in self._config: + self._config[key] = {} + self._config[key]['feature'] = line - if section not in self._config.keys() + ['feature', 'weight']: - # new section not in config and not a reserved section - counter = 0 - key = section - self._config[key] = {} + elif section == 'feature': + # Skip feature/funcions with arguments. + continue - elif section == 'feature' and line in ['UnknownWordPenalty', - 'WordPenalty', 'PhrasePenalty', 'Distortion']: - # known feature/funcions without attributes - key = '%s0' % line - if key not in self._config: - self._config[key] = {} - self._config[key]['feature'] = line + elif section == 'weight': + # Add weight value to feature sections. + config_items = [ + (key.strip(), value.strip()) + for key, value in [line.split('=', 1)] + ] + for key, value in config_items: + if key not in self._config: + self._config[key] = {} + self._config[key]['weight'] = value - elif section == 'feature': - # skip feature/funcions with artuments - continue + else: + self._config[key][counter] = line + counter += 0 - elif section == 'weight': - # add weight value to feature sections - for key, value in [(key.strip(), value.strip()) - for key, value in [line.split('=', 1)]]: - if key not in self._config: - self._config[key] = {} - self._config[key]['weight'] = value + lines[i] = '' - else: - self._config[key][counter] = line - counter += 0 + # Second, match feature/functions attributes to [weight] section + # values. + stripped_lines = [line.strip() for line in lines] + nonempty_lines = [ + line + for line in stripped_lines + if line != '' and not line.startswith('#') + ] + for i, line in enumerate(nonempty_lines): + # Add "feature" to assist creating tmpdict for feature/functions. + line = 'feature=%s' % line + tmpdict = dict([key.split('=', 1) for key in line.split()]) - lines[i] = '' + # Feature/functions 'name' attribute must match an entry in + # [weight]. 
+ if tmpdict.get('name') not in self._config: + raise RuntimeError('malformed moses.ini v2 file') - # second, match feature/functions attributes to [weight] section values - for i, line in [(i, line.strip()) for i, line in enumerate(lines) - if line.strip() and not line.strip().startswith('#')]: + config_items = [ + (key.strip(), value.strip()) + for key, value in tmpdict.items() + if key.strip() != 'name' + ] + for key, value in config_items: + self._config[tmpdict['name']][key] = value - # add "feature" to assist creating tmpdict for feature/functions - line = 'feature=%s' % line - tmpdict = dict([key.split('=',1) for key in line.split()]) + return deepcopy(self._config) - # feature/functions 'name' attribute must match an entry in [weight] - if tmpdict.get('name') not in self._config: - raise RuntimeError('malformed moses.ini v2 file') + def render(self, config): + self._config = deepcopy(config) + _config = deepcopy(config) + lines = _tolines(_config, self.escape_prefix) + if self.out == '-': + stdout.write('\n'.join(lines)) + else: + contents = '\r\n'.join(lines) + makedir(dirname(self.out)) + with open(self.out, 'wb') as f: + f.write(contents.encode('utf8')) - for key, value in [(key.strip(), value.strip()) for key, value - in tmpdict.items() if key.strip() != 'name']: + def __str__(self): + return '\n'.join(_tolines(self._config, self.escape_prefix)) - self._config[tmpdict['name']][key] = value - - return deepcopy(self._config) - - - def render(self, config): - - self._config = deepcopy(config) - - _config = deepcopy(config) - - lines = _tolines(_config, self.escape_prefix) - - if self.out == '-': - - stdout.write('\n'.join(lines)) - - else: - - contents = '\r\n'.join(lines) - - makedir(dirname(self.out)) - - with open(self.out, 'wb') as f: - f.write(contents.encode('utf8')) - - - def __str__(self): - return '\n'.join(_tolines(self._config, self.escape_prefix)) - - - @property - def config(self): - return deepcopy(self._config) + @property + def config(self): + return deepcopy(self._config) def _tolines(config, escape_prefix): - lines = [] + section_names = sorted(config) + lines = [] - # group feature/functions first - for sectionname in [sectionname for sectionname in sorted(config) - if sectionname[-1] in '0123456789']: + # Group feature/functions first. 
+ group_ffs = [ + name + for name in section_names + if name[-1].isdigit() + ] + for sectionname in group_ffs: + section = config[sectionname] + lines.append('[%s]' % sectionname) + for option, value in section.items(): + if option == 'path' \ + and escape_prefix is not None \ + and value.startswith(escape_prefix): + value = value.replace(escape_prefix, root_escape, 1) + lines.append('%s=%s' % (option, value)) + lines.append('') - section = config[sectionname] + other_ffs = [ + name + for name in section_names + if not name[-1].isdigit() + ] + for sectionname in other_ffs: + section = config[sectionname] + lines.append('[%s]' % sectionname) + for option, value in section.items(): + lines.append('%s=%s' % (option, value)) + lines.append('') - lines.append('[%s]' % sectionname) - - for option, value in section.items(): - - if option == 'path' \ - and escape_prefix is not None \ - and value.startswith(escape_prefix): - - value = value.replace(escape_prefix, root_escape, 1) - - lines.append('%s=%s' % (option, value)) - - lines.append('') - - for sectionname in [sectionname for sectionname in sorted(config) - if sectionname[-1] not in '0123456789']: - - section = config[sectionname] - - lines.append('[%s]' % sectionname) - - for option, value in section.items(): - - lines.append('%s=%s' % (option, value)) - - lines.append('') - - return deepcopy(lines) + return deepcopy(lines) def makedir(path, mode=0o777): - try: - makedirs(path, mode) - except OSError as e: - if e.errno not in [errno.EEXIST, - errno.EPERM, errno.EACCES, errno.ENOENT]: - raise + try: + makedirs(path, mode) + except OSError as e: + accepted_errors = [ + errno.EEXIST, + errno.EPERM, + errno.EACCES, + errno.ENOENT, + ] + if e.errno not in accepted_errors: + raise def get_args(): - '''Parse command-line arguments + '''Parse command-line arguments - Uses the API compatibility between the legacy - argparse.OptionParser and its replacement argparse.ArgumentParser - for functional equivelancy and nearly identical help prompt. - ''' + Uses the API compatibility between the legacy + argparse.OptionParser and its replacement argparse.ArgumentParser + for functional equivelancy and nearly identical help prompt. + ''' - description = 'Convert Moses.ini v2 file to standard INI format' - usage = '%s [arguments]' % basename(__file__) + description = 'Convert Moses.ini v2 file to standard INI format' + usage = '%s [arguments]' % basename(__file__) - try: - from argparse import ArgumentParser - except ImportError: - from optparse import OptionParser - argparser = False - escape_help = ('Optional. Path of SMT model. If provided, ' - 'escapes \"escape-prefix\" with \"%(escape-prefix)s\"') - parser = OptionParser(usage=usage, description=description) - add_argument = parser.add_option - else: - argparser = True - escape_help = ('Optional. Path of SMT model. If provided, ' - 'escape \"escape-prefix\" with \"%%(escape-prefix)s\"') - parser = ArgumentParser(usage=usage, description=description) - add_argument = parser.add_argument + try: + from argparse import ArgumentParser + except ImportError: + from optparse import OptionParser + argparser = False + escape_help = ( + "Optional. Path of SMT model. If provided, " + "escapes \"escape-prefix\" with \"%(escape-prefix)s\"") + parser = OptionParser(usage=usage, description=description) + add_argument = parser.add_option + else: + argparser = True + escape_help = ( + "Optional. Path of SMT model. 
If provided, " + "escape \"escape-prefix\" with \"%%(escape-prefix)s\"") + parser = ArgumentParser(usage=usage, description=description) + add_argument = parser.add_argument - add_argument('-i','--inp', action='store', - help='moses.ini v2 file to convert (required)') + add_argument( + '-i', '--inp', action='store', + help="moses.ini v2 file to convert (required)") - add_argument('-o','--out', action='store', default='-', - help='standard INI file (default: "-" outputs to stdout)') + add_argument( + '-o', '--out', action='store', default='-', + help="standard INI file (default: '-' outputs to stdout)") - add_argument('-r','--escape-prefix', action='store', - help=escape_help) + add_argument('-r', '--escape-prefix', action='store', help=escape_help) - if argparser: + if argparser: + args = vars(parser.parse_args()) + else: + opts = parser.parse_args() + args = vars(opts[0]) - args = vars(parser.parse_args()) + if args['inp'] is None: + parser.error('argument -i/--inp required') - else: + args['inp'] = realpath(args['inp']) - opts = parser.parse_args() - args = vars(opts[0]) + if not exists(args['inp']): + parser.error( + "argument -i/--inp invalid.\n" + "reference: %s" % args['inp']) - if args['inp'] is None: - parser.error('argument -i/--inp required') + if args['out'] != '-': + args['out'] = realpath(args['out']) - args['inp'] = realpath(args['inp']) - - if not exists(args['inp']): - parser.error('argument -i/--inp invalid.\n' - 'reference: %s' % args['inp']) - - if args['out'] != '-': - args['out'] = realpath(args['out']) - - return args + return args if __name__ == '__main__': - - args = get_args() - - converter = moses2_to_ini(**args) - - config = converter.parse() - - converter.render(config) + args = get_args() + converter = moses2_to_ini(**args) + config = converter.parse() + converter.render(config) diff --git a/scripts/training/train-neurallm.py b/scripts/training/train-neurallm.py index ae77a42af..4f0e8bdaf 100755 --- a/scripts/training/train-neurallm.py +++ b/scripts/training/train-neurallm.py @@ -23,6 +23,7 @@ sys.path.append(os.path.join(sys.path[0], 'bilingual-lm')) import train_nplm import averageNullEmbedding + logging.basicConfig( format='%(asctime)s %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S', level=logging.DEBUG) @@ -30,7 +31,8 @@ parser = argparse.ArgumentParser() parser.add_argument( "--working-dir", dest="working_dir", metavar="PATH") parser.add_argument( - "--corpus", '-text', dest="corpus_stem", metavar="PATH", help="Input file.") + "--corpus", '-text', dest="corpus_stem", metavar="PATH", + help="Input file.") parser.add_argument( "--nplm-home", dest="nplm_home", metavar="PATH", required=True, help="Location of NPLM.") @@ -113,6 +115,7 @@ parser.set_defaults( words_file='vocab', vocab_size=500000) + def main(options): options.ngram_size = options.order @@ -129,14 +132,16 @@ def main(options): if options.mmap: train_file += '.mmap' - extraction_cmd = [os.path.join(options.nplm_home, 'src', 'prepareNeuralLM'), - '--train_text', options.corpus_stem, - '--ngramize', '1', - '--ngram_size', str(options.ngram_size), - '--vocab_size', str(options.vocab_size), - '--write_words_file', os.path.join(options.working_dir, options.words_file), - '--train_file', os.path.join(options.working_dir, numberized_file) - ] + extraction_cmd = [ + os.path.join(options.nplm_home, 'src', 'prepareNeuralLM'), + '--train_text', options.corpus_stem, + '--ngramize', '1', + '--ngram_size', str(options.ngram_size), + '--vocab_size', str(options.vocab_size), + '--write_words_file', 
os.path.join( + options.working_dir, options.words_file), + '--train_file', os.path.join(options.working_dir, numberized_file) + ] sys.stderr.write('extracting n-grams\n') sys.stderr.write('executing: ' + ', '.join(extraction_cmd) + '\n') @@ -149,12 +154,13 @@ def main(options): os.remove(os.path.join(options.working_dir, train_file)) except OSError: pass - mmap_cmd = [os.path.join(options.nplm_home, 'src', 'createMmap'), - '--input_file', - os.path.join(options.working_dir, numberized_file), - '--output_file', - os.path.join(options.working_dir, train_file) - ] + mmap_cmd = [ + os.path.join(options.nplm_home, 'src', 'createMmap'), + '--input_file', + os.path.join(options.working_dir, numberized_file), + '--output_file', + os.path.join(options.working_dir, train_file) + ] sys.stderr.write('creating memory-mapped file\n') sys.stderr.write('executing: ' + ', '.join(mmap_cmd) + '\n') ret = subprocess.call(mmap_cmd) @@ -163,14 +169,18 @@ def main(options): if options.validation_corpus: - extraction_cmd = [os.path.join(options.nplm_home, 'src', 'prepareNeuralLM'), - '--train_text', options.validation_corpus, - '--ngramize', '1', - '--ngram_size', str(options.ngram_size), - '--vocab_size', str(options.vocab_size), - '--words_file', os.path.join(options.working_dir, options.words_file), - '--train_file', os.path.join(options.working_dir, os.path.basename(options.validation_corpus) + '.numberized') - ] + extraction_cmd = [ + os.path.join(options.nplm_home, 'src', 'prepareNeuralLM'), + '--train_text', options.validation_corpus, + '--ngramize', '1', + '--ngram_size', str(options.ngram_size), + '--vocab_size', str(options.vocab_size), + '--words_file', os.path.join( + options.working_dir, options.words_file), + '--train_file', os.path.join( + options.working_dir, + os.path.basename(options.validation_corpus) + '.numberized') + ] sys.stderr.write('extracting n-grams (validation file)\n') sys.stderr.write('executing: ' + ', '.join(extraction_cmd) + '\n') @@ -190,11 +200,15 @@ def main(options): train_nplm.main(options) sys.stderr.write('averaging null words\n') - average_options = averageNullEmbedding.parser.parse_args( - ['-i', os.path.join(options.output_dir, options.output_model + '.model.nplm.' + str(options.epochs)), - '-o', os.path.join(options.output_dir, options.output_model + '.model.nplm'), - '-t', os.path.join(options.working_dir, numberized_file), - '-p', os.path.join(options.nplm_home, 'python')]) + average_options = averageNullEmbedding.parser.parse_args([ + '-i', os.path.join( + options.output_dir, + options.output_model + '.model.nplm.' 
+ str(options.epochs)), + '-o', os.path.join( + options.output_dir, options.output_model + '.model.nplm'), + '-t', os.path.join(options.working_dir, numberized_file), + '-p', os.path.join(options.nplm_home, 'python'), + ]) averageNullEmbedding.main(average_options) @@ -206,5 +220,7 @@ if __name__ == "__main__": options = parser.parse_known_args()[0] if parser.parse_known_args()[1]: - sys.stderr.write('Warning: unknown arguments: {0}\n'.format(parser.parse_known_args()[1])) + sys.stderr.write( + "Warning: unknown arguments: {0}\n".format( + parser.parse_known_args()[1])) main(options) diff --git a/scripts/training/wrappers/mosesxml2berkeleyparsed.perl b/scripts/training/wrappers/mosesxml2berkeleyparsed.perl index 02bc7b88e..9449e6bc4 100755 --- a/scripts/training/wrappers/mosesxml2berkeleyparsed.perl +++ b/scripts/training/wrappers/mosesxml2berkeleyparsed.perl @@ -6,8 +6,16 @@ use warnings; use strict; -#( (NP (NP (NN resumption)) (PP (IN of) (NP (DT the) (NN session)))) ) -#( (S (@S (@S (@S (S (NP (PRP I)) (VP (VB declare) (VP (@VP (VBD resumed) (NP (@NP (NP (DT the) (NN session)) (PP (IN of) (NP (@NP (DT the) (NNP European)) (NNP Parliament)))) (VP (VBN adjourned) (PP (IN on) (NP (NNP Friday) (CD 17)))))) (NP (NNP December) (CD 1999))))) (, ,)) (CC and)) (S (NP (PRP I)) (VP (MD would) (VP (VB like) (S (ADVP (RB once) (RB again)) (VP (TO to) (VP (@VP (VB wish) (NP (PRP you))) (NP (NP (@NP (@NP (DT a) (JJ happy)) (JJ new)) (NN year)) (PP (IN in) (NP (@NP (DT the) (NN hope)) (SBAR (IN that) (S (NP (PRP you)) (VP (VBD enjoyed) (NP (@NP (@NP (DT a) (JJ pleasant)) (JJ festive)) (NN period))))))))))))))) (. .)) ) +# ( (NP (NP (NN resumption)) (PP (IN of) (NP (DT the) (NN session)))) ) +# ( (S (@S (@S (@S (S (NP (PRP I)) (VP (VB declare) (VP (@VP (VBD resumed) +# (NP (@NP (NP (DT the) (NN session)) (PP (IN of) (NP (@NP (DT the) +# (NNP European)) (NNP Parliament)))) (VP (VBN adjourned) (PP (IN on) (NP +#(NNP Friday) (CD 17)))))) (NP (NNP December) (CD 1999))))) (, ,)) (CC and)) +# (S (NP (PRP I)) (VP (MD would) (VP (VB like) (S (ADVP (RB once) (RB again)) +# (VP (TO to) (VP (@VP (VB wish) (NP (PRP you))) (NP (NP (@NP (@NP (DT a) +# (JJ happy)) (JJ new)) (NN year)) (PP (IN in) (NP (@NP (DT the) (NN hope)) +#(SBAR (IN that) (S (NP (PRP you)) (VP (VBD enjoyed) (NP (@NP (@NP (DT a) +# (JJ pleasant)) (JJ festive)) (NN period))))))))))))))) (. .)) ) while() { if (/^$/) { diff --git a/scripts/training/wrappers/parse-en-senna.perl b/scripts/training/wrappers/parse-en-senna.perl index 2df46284b..9297b127f 100755 --- a/scripts/training/wrappers/parse-en-senna.perl +++ b/scripts/training/wrappers/parse-en-senna.perl @@ -1,4 +1,4 @@ -#!/usr/bin/env perl +#!/usr/bin/env perl # # This file is part of moses. Its use is licensed under the GNU Lesser General # Public License version 2.1 or, at your option, any later version. 
@@ -66,7 +66,7 @@ while() { my $num_bytes; { use bytes; - $num_bytes = length($_); + $num_bytes = length($_); } if ($num_bytes > 1023) { print TMP_PROCESSED "SENTENCE_TOO_LONG\n"; diff --git a/scripts/training/wrappers/parse-en-stanford.py b/scripts/training/wrappers/parse-en-stanford.py index 06b027e55..f77a2d92e 100755 --- a/scripts/training/wrappers/parse-en-stanford.py +++ b/scripts/training/wrappers/parse-en-stanford.py @@ -19,7 +19,6 @@ import sys import codecs import argparse -from collections import defaultdict from subprocess import Popen, PIPE # hack for python2/3 compatibility @@ -54,17 +53,25 @@ def create_parser(): return parser + def process_stanford(infile, javacmd, stanfordpath): - stanford = Popen([javacmd, - '-cp', os.path.join(stanfordpath, 'stanford-corenlp-3.5.0.jar') + ':' + os.path.join(stanfordpath, 'stanford-corenlp-3.5.0-models.jar'), - 'edu.stanford.nlp.pipeline.StanfordCoreNLP', - '-annotators', 'tokenize, ssplit, pos, depparse, lemma', - '-ssplit.eolonly', 'true', - '-tokenize.whitespace', 'true', - '-numThreads', '8', - '-textFile', '-', - 'outFile', '-'], stdin=infile, stdout = PIPE, stderr = open('/dev/null', 'w')) + corenlp_jar = os.path.join(stanfordpath, 'stanford-corenlp-3.5.0.jar') + corenlp_models_jar = os.path.join( + stanfordpath, 'stanford-corenlp-3.5.0-models.jar') + stanford = Popen( + [ + javacmd, + '-cp', "%s:%s" % (corenlp_jar, corenlp_models_jar), + 'edu.stanford.nlp.pipeline.StanfordCoreNLP', + '-annotators', 'tokenize, ssplit, pos, depparse, lemma', + '-ssplit.eolonly', 'true', + '-tokenize.whitespace', 'true', + '-numThreads', '8', + '-textFile', '-', + 'outFile', '-', + ], + stdin=infile, stdout=PIPE, stderr=open('/dev/null', 'w')) return stanford.stdout @@ -87,13 +94,14 @@ def get_sentences(instream): head, dep = remainder.split() head_int = int(head.split('-')[-1][:-1]) dep_int = int(dep.split('-')[-1][:-1]) - sentence[dep_int-1]['head'] = head_int - sentence[dep_int-1]['label'] = rel + sentence[dep_int - 1]['head'] = head_int + sentence[dep_int - 1]['label'] = rel elif expect == 2: - linesplit = line.split('[',1)[1].rsplit(']',1)[0].split('] [') + linesplit = line.split('[', 1)[1].rsplit(']', 1)[0].split('] [') if len(linesplit) != len(sentence): - sys.stderr.write('Warning: mismatch in number of words in sentence\n') + sys.stderr.write( + "Warning: mismatch in number of words in sentence\n") sys.stderr.write(' '.join(w['word'] for w in sentence)) for i in range(len(sentence)): sentence[i]['pos'] = '-' @@ -102,22 +110,27 @@ def get_sentences(instream): sentence[i]['label'] = '-' expect = 0 continue - for i,w in enumerate(linesplit): + for i, w in enumerate(linesplit): sentence[i]['pos'] = w.split(' PartOfSpeech=')[-1].split()[0] sentence[i]['lemma'] = w.split(' Lemma=')[-1] expect = 3 elif expect == 1: for w in line.split(): - sentence.append({'word':w}) + sentence.append({'word': w}) expect = 2 if sentence: yield sentence + def write(sentence, outstream): for i, w in enumerate(sentence): - outstream.write('{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\n'.format(i+1, w['word'], w['lemma'], w['pos'], w['pos'], '-', w['head'], w['label'])) + outstream.write( + '{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\n'.format( + i + 1, w['word'], w['lemma'], w['pos'], w['pos'], '-', + w['head'], w['label'])) + if __name__ == '__main__': if sys.version_info < (3, 0): @@ -125,11 +138,10 @@ if __name__ == '__main__': sys.stdout = codecs.getwriter('UTF-8')(sys.stdout) sys.stdin = codecs.getreader('UTF-8')(sys.stdin) - parser = create_parser() options = 
parser.parse_args() stanford = process_stanford(options.input, options.java, options.stanford) for sentence in get_sentences(codecs.getreader('UTF-8')(stanford)): - write(sentence, options.output) - options.output.write('\n') + write(sentence, options.output) + options.output.write('\n') diff --git a/scripts/training/wrappers/senna2brackets.py b/scripts/training/wrappers/senna2brackets.py index a81100277..5b8495c84 100755 --- a/scripts/training/wrappers/senna2brackets.py +++ b/scripts/training/wrappers/senna2brackets.py @@ -24,6 +24,7 @@ import optparse import os import sys + def main(): usage = "usage: %prog [options]" parser = optparse.OptionParser(usage=usage) @@ -71,6 +72,7 @@ def main(): word = "-RCB-" tree += frag.replace("*", "(%s %s)" % (pos, word)) + def balanced(s): num_left = 0 num_right = 0 @@ -81,10 +83,12 @@ def balanced(s): num_right += 1 return num_left == num_right + def beautify(tree): s = tree.replace("(", " (") return s.strip() + def berkelify(tree): if tree == "": return "(())" @@ -94,9 +98,11 @@ def berkelify(tree): old_root = tree[1:pos] return tree.replace(old_root, "TOP") + def warn(msg): prog_name = os.path.basename(sys.argv[0]) sys.stderr.write("%s: warning: %s\n" % (prog_name, msg)) + if __name__ == "__main__": main() From 85c23ed7dcbbd312d1c9ea7b64177c9e80e06088 Mon Sep 17 00:00:00 2001 From: Jeroen Vermeulen Date: Tue, 2 Jun 2015 18:05:12 +0700 Subject: [PATCH 038/108] Fix some JS lint. --- scripts/ems/web/base64.js | 33 +++--- scripts/ems/web/hierarchical-segmentation.js | 115 ++++++++++--------- 2 files changed, 77 insertions(+), 71 deletions(-) diff --git a/scripts/ems/web/base64.js b/scripts/ems/web/base64.js index 67fd9ad8d..a35940c5a 100644 --- a/scripts/ems/web/base64.js +++ b/scripts/ems/web/base64.js @@ -21,16 +21,19 @@ // constants var b64chars = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/'; - var b64tab = function(bin) { - var t = {}; - for (var i = 0, l = bin.length; i < l; i++) t[bin.charAt(i)] = i; + var b64tab = (function(bin) { + var t = {}, i, l; + for (i = 0, l = bin.length; i < l; i++) { + t[bin.charAt(i)] = i; + } return t; - }(b64chars); + })(b64chars); var fromCharCode = String.fromCharCode; // encoder stuff var cb_utob = function(c) { + var cc; if (c.length < 2) { - var cc = c.charCodeAt(0); + cc = c.charCodeAt(0); return cc < 0x80 ? c : cc < 0x800 ? (fromCharCode(0xc0 | (cc >>> 6)) + fromCharCode(0x80 | (cc & 0x3f))) @@ -38,7 +41,7 @@ + fromCharCode(0x80 | ((cc >>> 6) & 0x3f)) + fromCharCode(0x80 | ( cc & 0x3f))); } else { - var cc = 0x10000 + cc = 0x10000 + (c.charCodeAt(0) - 0xD800) * 0x400 + (c.charCodeAt(1) - 0xDC00); return (fromCharCode(0xf0 | ((cc >>> 18) & 0x07)) @@ -70,19 +73,21 @@ return b.replace(/[\s\S]{1,3}/g, cb_encode); }; var _encode = buffer ? function (u) { - return (u.constructor === buffer.constructor ? u : new buffer(u)) - .toString('base64') + return ( + u.constructor === buffer.constructor ? u : new buffer(u) + ).toString('base64'); } - : function (u) { return btoa(utob(u)) } + : function (u) { return btoa(utob(u)); } ; var encode = function(u, urisafe) { - return !urisafe - ? _encode(String(u)) - : _encode(String(u)).replace(/[+\/]/g, function(m0) { - return m0 == '+' ? '-' : '_'; + return ( + !urisafe ? + _encode(String(u)) : + _encode(String(u)).replace(/[+\/]/g, function(m0) { + return (m0 === '+') ? 
'-' : '_'; }).replace(/=/g, ''); }; - var encodeURI = function(u) { return encode(u, true) }; + var encodeURI = function(u) { return encode(u, true); }; // decoder stuff var re_btou = new RegExp([ '[\xC0-\xDF][\x80-\xBF]', diff --git a/scripts/ems/web/hierarchical-segmentation.js b/scripts/ems/web/hierarchical-segmentation.js index 7f0df85ff..a1e16eff7 100644 --- a/scripts/ems/web/hierarchical-segmentation.js +++ b/scripts/ems/web/hierarchical-segmentation.js @@ -10,6 +10,63 @@ var span_count_in = []; var span_count_out = []; var current_depth = -1; +function highlightSingleNode( sentence, id, color ) { + var i, j, item; + for(i=nodeIn[sentence][id].start;i<=nodeIn[sentence][id].end;i++) { + for(j=nodeIn[sentence][id].depth;j<=max_depth[sentence];j++) { + item = "in-" + sentence + "-" + i + "-" + j; + if ($(item) !== null) { + $(item).setStyle({ backgroundColor: color, borderColor: 'red' }); + } + } + } + //$("debug").innerHTML = "highlight: "+id+", of "+nodeOut[sentence].size()+"
"; + for(i=nodeOut[sentence][id].start;i<=nodeOut[sentence][id].end;i++) { + for(j=nodeOut[sentence][id].depth;j<=max_depth[sentence];j++) { + item = "out-" + sentence + "-" + i + "-" + j; + //$("debug").innerHTML += item; + if ($(item) !== null) { + $(item).setStyle({ backgroundColor: color, borderColor: 'red' }); + } + } + } +} + +function lowlightAllNodes( sentence ) { + var i, j, item; + for(i=0;i"; - for(i=nodeOut[sentence][id].start;i<=nodeOut[sentence][id].end;i++) { - for(j=nodeOut[sentence][id].depth;j<=max_depth[sentence];j++) { - var item = "out-" + sentence + "-" + i + "-" + j; - //$("debug").innerHTML += item; - if ($(item) != null) { - $(item).setStyle({ backgroundColor: color, borderColor: 'red' }); - } - } - } -} - -function lowlightAllNodes( sentence ) { - var i, j; - for(i=0;i Date: Tue, 2 Jun 2015 13:56:03 +0100 Subject: [PATCH 039/108] Ongoing moses/phrase-extract refactoring --- phrase-extract/SyntaxNodeCollection.cpp | 4 -- phrase-extract/SyntaxNodeCollection.h | 3 -- phrase-extract/extract-ghkm/ExtractGHKM.cpp | 51 ++++++------------- phrase-extract/extract-ghkm/ScfgRule.cpp | 20 ++++---- phrase-extract/extract-ghkm/ScfgRule.h | 4 +- phrase-extract/extract-ghkm/XmlTreeParser.cpp | 39 ++------------ phrase-extract/extract-ghkm/XmlTreeParser.h | 12 +++-- 7 files changed, 38 insertions(+), 95 deletions(-) diff --git a/phrase-extract/SyntaxNodeCollection.cpp b/phrase-extract/SyntaxNodeCollection.cpp index f67bee587..60a2f6c2f 100644 --- a/phrase-extract/SyntaxNodeCollection.cpp +++ b/phrase-extract/SyntaxNodeCollection.cpp @@ -23,8 +23,6 @@ #include #include -#include - namespace MosesTraining { @@ -154,7 +152,6 @@ void SyntaxNodeCollection::ConnectNodes() } } -//boost::shared_ptr SyntaxNodeCollection::ExtractTree() std::auto_ptr SyntaxNodeCollection::ExtractTree() { std::map nodeToTree; @@ -209,7 +206,6 @@ std::auto_ptr SyntaxNodeCollection::ExtractTree() } } - //return boost::shared_ptr(root); return std::auto_ptr(root); } diff --git a/phrase-extract/SyntaxNodeCollection.h b/phrase-extract/SyntaxNodeCollection.h index c54400ca1..604b8d629 100644 --- a/phrase-extract/SyntaxNodeCollection.h +++ b/phrase-extract/SyntaxNodeCollection.h @@ -24,8 +24,6 @@ #include #include -#include - #include "SyntaxNode.h" #include "SyntaxTree.h" @@ -75,7 +73,6 @@ public: void Clear(); std::auto_ptr ExtractTree(); - //boost::shared_ptr ExtractTree(); }; } // namespace MosesTraining diff --git a/phrase-extract/extract-ghkm/ExtractGHKM.cpp b/phrase-extract/extract-ghkm/ExtractGHKM.cpp index 0c7dadd4d..43873e804 100644 --- a/phrase-extract/extract-ghkm/ExtractGHKM.cpp +++ b/phrase-extract/extract-ghkm/ExtractGHKM.cpp @@ -139,6 +139,7 @@ int ExtractGHKM::Main(int argc, char *argv[]) std::string alignmentLine; Alignment alignment; XmlTreeParser targetXmlTreeParser(targetLabelSet, targetTopLabelSet); + XmlTreeParser sourceXmlTreeParser(sourceLabelSet, sourceTopLabelSet); ScfgRuleWriter scfgWriter(fwdExtractStream, invExtractStream, options); StsgRuleWriter stsgWriter(fwdExtractStream, invExtractStream, options); size_t lineNum = options.sentenceOffset; @@ -175,39 +176,14 @@ int ExtractGHKM::Main(int argc, char *argv[]) Error(oss.str()); } - - // Parse source tree and construct a SyntaxTree object. 
- SyntaxNodeCollection sourceSyntaxTree; - SyntaxNode *sourceSyntaxTreeRoot=NULL; - - if (options.sourceLabels) { - try { - if (!ProcessAndStripXMLTags(sourceLine, sourceSyntaxTree, sourceLabelSet, sourceTopLabelSet, false)) { - throw Exception(""); - } - sourceSyntaxTree.ConnectNodes(); - sourceSyntaxTreeRoot = sourceSyntaxTree.GetTop(); - assert(sourceSyntaxTreeRoot); - } catch (const Exception &e) { - std::ostringstream oss; - oss << "Failed to parse source XML tree at line " << lineNum; - if (!e.GetMsg().empty()) { - oss << ": " << e.GetMsg(); - } - Error(oss.str()); - } - } - - // Read source tokens. - std::vector sourceTokens(ReadTokens(sourceLine)); - - // Construct a source SyntaxTree object from the SyntaxNodeCollection - // object. + // Read source tokens (and parse tree if using source labels). + std::vector sourceTokens; std::auto_ptr sourceParseTree; - - if (options.sourceLabels) { + if (!options.sourceLabels) { + sourceTokens = ReadTokens(sourceLine); + } else { try { - sourceParseTree = XmlTreeParser::ConvertTree(*sourceSyntaxTreeRoot, sourceTokens); + sourceParseTree = sourceXmlTreeParser.Parse(sourceLine); assert(sourceParseTree.get()); } catch (const Exception &e) { std::ostringstream oss; @@ -217,9 +193,9 @@ int ExtractGHKM::Main(int argc, char *argv[]) } Error(oss.str()); } + sourceTokens = sourceXmlTreeParser.GetWords(); } - // Read word alignments. try { ReadAlignment(alignmentLine, alignment); @@ -239,12 +215,14 @@ int ExtractGHKM::Main(int argc, char *argv[]) // Record word counts. if (!options.targetUnknownWordFile.empty()) { - CollectWordLabelCounts(*targetParseTree, options, targetWordCount, targetWordLabel); + CollectWordLabelCounts(*targetParseTree, options, targetWordCount, + targetWordLabel); } // Record word counts: source side. if (options.sourceLabels && !options.sourceUnknownWordFile.empty()) { - CollectWordLabelCounts(*sourceParseTree, options, sourceWordCount, sourceWordLabel); + CollectWordLabelCounts(*sourceParseTree, options, sourceWordCount, + sourceWordLabel); } // Form an alignment graph from the target tree, source words, and @@ -260,7 +238,8 @@ int ExtractGHKM::Main(int argc, char *argv[]) } // Initialize phrase orientation scoring object - PhraseOrientation phraseOrientation( sourceTokens.size(), targetXmlTreeParser.GetWords().size(), alignment); + PhraseOrientation phraseOrientation(sourceTokens.size(), + targetXmlTreeParser.GetWords().size(), alignment); // Write the rules, subject to scope pruning. const std::vector &targetNodes = graph.GetTargetNodes(); @@ -292,7 +271,7 @@ int ExtractGHKM::Main(int argc, char *argv[]) // SCFG output. 
ScfgRule *r = 0; if (options.sourceLabels) { - r = new ScfgRule(**q, &sourceSyntaxTree); + r = new ScfgRule(**q, &sourceXmlTreeParser.GetNodeCollection()); } else { r = new ScfgRule(**q); } diff --git a/phrase-extract/extract-ghkm/ScfgRule.cpp b/phrase-extract/extract-ghkm/ScfgRule.cpp index fc960b598..a6fc19dd9 100644 --- a/phrase-extract/extract-ghkm/ScfgRule.cpp +++ b/phrase-extract/extract-ghkm/ScfgRule.cpp @@ -32,12 +32,12 @@ namespace GHKM { ScfgRule::ScfgRule(const Subgraph &fragment, - const SyntaxNodeCollection *sourceSyntaxTree) + const SyntaxNodeCollection *sourceNodeCollection) : m_graphFragment(fragment) , m_sourceLHS("X", NonTerminal) , m_targetLHS(fragment.GetRoot()->GetLabel(), NonTerminal) , m_pcfgScore(fragment.GetPcfgScore()) - , m_hasSourceLabels(sourceSyntaxTree) + , m_hasSourceLabels(sourceNodeCollection) { // Source RHS @@ -82,9 +82,9 @@ ScfgRule::ScfgRule(const Subgraph &fragment, } } } - if (sourceSyntaxTree) { + if (sourceNodeCollection) { // Source syntax label - PushSourceLabel(sourceSyntaxTree,&sinkNode,"XRHS"); + PushSourceLabel(sourceNodeCollection,&sinkNode,"XRHS"); } } @@ -125,23 +125,23 @@ ScfgRule::ScfgRule(const Subgraph &fragment, } } - if (sourceSyntaxTree) { - // Source syntax label for root node (if sourceSyntaxTree available) - PushSourceLabel(sourceSyntaxTree,fragment.GetRoot(),"XLHS"); + if (sourceNodeCollection) { + // Source syntax label for root node (if sourceNodeCollection available) + PushSourceLabel(sourceNodeCollection,fragment.GetRoot(),"XLHS"); // All non-terminal spans (including the LHS) should have obtained a label // (a source-side syntactic constituent label if the span matches, "XLHS" otherwise) // assert(m_sourceLabels.size() == m_numberOfNonTerminals+1); } } -void ScfgRule::PushSourceLabel(const SyntaxNodeCollection *sourceSyntaxTree, +void ScfgRule::PushSourceLabel(const SyntaxNodeCollection *sourceNodeCollection, const Node *node, const std::string &nonMatchingLabel) { ContiguousSpan span = Closure(node->GetSpan()); - if (sourceSyntaxTree->HasNode(span.first,span.second)) { // does a source constituent match the span? + if (sourceNodeCollection->HasNode(span.first,span.second)) { // does a source constituent match the span? 
std::vector sourceLabels = - sourceSyntaxTree->GetNodes(span.first,span.second); + sourceNodeCollection->GetNodes(span.first,span.second); if (!sourceLabels.empty()) { // store the topmost matching label from the source syntax tree m_sourceLabels.push_back(sourceLabels.back()->GetLabel()); diff --git a/phrase-extract/extract-ghkm/ScfgRule.h b/phrase-extract/extract-ghkm/ScfgRule.h index c8cdbb143..439c19fd7 100644 --- a/phrase-extract/extract-ghkm/ScfgRule.h +++ b/phrase-extract/extract-ghkm/ScfgRule.h @@ -41,7 +41,7 @@ class ScfgRule : public Rule { public: ScfgRule(const Subgraph &fragment, - const SyntaxNodeCollection *sourceSyntaxTree = 0); + const SyntaxNodeCollection *sourceNodeCollection = 0); const Subgraph &GetGraphFragment() const { return m_graphFragment; @@ -78,7 +78,7 @@ public: } private: - void PushSourceLabel(const SyntaxNodeCollection *sourceSyntaxTree, + void PushSourceLabel(const SyntaxNodeCollection *sourceNodeCollection, const Node *node, const std::string &nonMatchingLabel); const Subgraph& m_graphFragment; diff --git a/phrase-extract/extract-ghkm/XmlTreeParser.cpp b/phrase-extract/extract-ghkm/XmlTreeParser.cpp index 83dfbd42f..17513fdd4 100644 --- a/phrase-extract/extract-ghkm/XmlTreeParser.cpp +++ b/phrase-extract/extract-ghkm/XmlTreeParser.cpp @@ -44,52 +44,21 @@ XmlTreeParser::XmlTreeParser(std::set &labelSet, std::auto_ptr XmlTreeParser::Parse(const std::string &line) { m_line = line; - m_tree.Clear(); + m_nodeCollection.Clear(); try { - if (!ProcessAndStripXMLTags(m_line, m_tree, m_labelSet, m_topLabelSet, - false)) { + if (!ProcessAndStripXMLTags(m_line, m_nodeCollection, m_labelSet, + m_topLabelSet, false)) { throw Exception(""); } } catch (const XmlException &e) { throw Exception(e.getMsg()); } - //boost::shared_ptr root = m_tree.ExtractTree(); - std::auto_ptr root = m_tree.ExtractTree(); + std::auto_ptr root = m_nodeCollection.ExtractTree(); m_words = util::tokenize(m_line); AttachWords(m_words, *root); return root; } -// Converts a SyntaxNode tree to a MosesTraining::GHKM::SyntaxTree. 
-std::auto_ptr XmlTreeParser::ConvertTree( - const SyntaxNode &tree, - const std::vector &words) -{ - std::auto_ptr root(new SyntaxTree(tree)); - const std::vector &children = tree.GetChildren(); - if (children.empty()) { - if (tree.GetStart() != tree.GetEnd()) { - std::ostringstream msg; - msg << "leaf node covers multiple words (" << tree.GetStart() - << "-" << tree.GetEnd() << "): this is currently unsupported"; - throw Exception(msg.str()); - } - SyntaxNode value(tree.GetStart(), tree.GetStart(), words[tree.GetStart()]); - std::auto_ptr leaf(new SyntaxTree(value)); - leaf->parent() = root.get(); - root->children().push_back(leaf.release()); - } else { - for (std::vector::const_iterator p = children.begin(); - p != children.end(); ++p) { - assert(*p); - std::auto_ptr child = ConvertTree(**p, words); - child->parent() = root.get(); - root->children().push_back(child.release()); - } - } - return root; -} - void XmlTreeParser::AttachWords(const std::vector &words, SyntaxTree &root) { diff --git a/phrase-extract/extract-ghkm/XmlTreeParser.h b/phrase-extract/extract-ghkm/XmlTreeParser.h index 2fcdd9b56..339a2bd13 100644 --- a/phrase-extract/extract-ghkm/XmlTreeParser.h +++ b/phrase-extract/extract-ghkm/XmlTreeParser.h @@ -44,20 +44,22 @@ class XmlTreeParser { public: XmlTreeParser(std::set &, std::map &); - std::auto_ptr Parse(const std::string &); - static std::auto_ptr ConvertTree(const SyntaxNode &, - const std::vector &); + std::auto_ptr Parse(const std::string &); const std::vector& GetWords() { return m_words; - }; + } + + const SyntaxNodeCollection &GetNodeCollection() const { + return m_nodeCollection; + } private: std::set &m_labelSet; std::map &m_topLabelSet; std::string m_line; - SyntaxNodeCollection m_tree; + SyntaxNodeCollection m_nodeCollection; std::vector m_words; void AttachWords(const std::vector &, SyntaxTree &); From 5ece895ab4d7fafe32d76cb2dd7bd7995cd06c7c Mon Sep 17 00:00:00 2001 From: Phil Williams Date: Tue, 2 Jun 2015 14:00:56 +0100 Subject: [PATCH 040/108] Ongoing moses/phrase-extract refactoring --- phrase-extract/SyntaxNodeCollection.h | 1 + 1 file changed, 1 insertion(+) diff --git a/phrase-extract/SyntaxNodeCollection.h b/phrase-extract/SyntaxNodeCollection.h index 604b8d629..a0d19841c 100644 --- a/phrase-extract/SyntaxNodeCollection.h +++ b/phrase-extract/SyntaxNodeCollection.h @@ -20,6 +20,7 @@ #pragma once #include +#include #include #include #include From 2f04d4a56ebab78a97b9fa9ecf4b50ef845a1bdb Mon Sep 17 00:00:00 2001 From: Phil Williams Date: Tue, 2 Jun 2015 15:23:41 +0100 Subject: [PATCH 041/108] Ongoing moses/phrase-extract refactoring --- phrase-extract/extract-ghkm/ExtractGHKM.cpp | 7 +- phrase-extract/extract-ghkm/XmlTreeParser.cpp | 90 ------------------- phrase-extract/extract-ghkm/XmlTreeParser.h | 71 --------------- .../filter-rule-table/FilterRuleTable.cpp | 11 ++- .../filter-rule-table/FilterRuleTable.h | 6 +- .../filter-rule-table/ForestTsgFilter.h | 1 - .../filter-rule-table/TreeCfgFilter.cpp | 2 +- .../filter-rule-table/TreeCfgFilter.h | 5 +- .../filter-rule-table/TreeTsgFilter.cpp | 18 ++-- .../filter-rule-table/TreeTsgFilter.h | 9 +- .../syntax-common/xml_tree_parser.cc | 71 ++++++++------- .../syntax-common/xml_tree_parser.h | 28 ++++-- 12 files changed, 91 insertions(+), 228 deletions(-) delete mode 100644 phrase-extract/extract-ghkm/XmlTreeParser.cpp delete mode 100644 phrase-extract/extract-ghkm/XmlTreeParser.h diff --git a/phrase-extract/extract-ghkm/ExtractGHKM.cpp b/phrase-extract/extract-ghkm/ExtractGHKM.cpp index 
43873e804..2293371ac 100644 --- a/phrase-extract/extract-ghkm/ExtractGHKM.cpp +++ b/phrase-extract/extract-ghkm/ExtractGHKM.cpp @@ -30,6 +30,8 @@ #include +#include "syntax-common/xml_tree_parser.h" + #include "InputFileStream.h" #include "OutputFileStream.h" #include "SyntaxNode.h" @@ -50,7 +52,6 @@ #include "Span.h" #include "StsgRule.h" #include "StsgRuleWriter.h" -#include "XmlTreeParser.h" namespace MosesTraining { @@ -138,8 +139,8 @@ int ExtractGHKM::Main(int argc, char *argv[]) std::string sourceLine; std::string alignmentLine; Alignment alignment; - XmlTreeParser targetXmlTreeParser(targetLabelSet, targetTopLabelSet); - XmlTreeParser sourceXmlTreeParser(sourceLabelSet, sourceTopLabelSet); + Syntax::XmlTreeParser targetXmlTreeParser(targetLabelSet, targetTopLabelSet); + Syntax::XmlTreeParser sourceXmlTreeParser(sourceLabelSet, sourceTopLabelSet); ScfgRuleWriter scfgWriter(fwdExtractStream, invExtractStream, options); StsgRuleWriter stsgWriter(fwdExtractStream, invExtractStream, options); size_t lineNum = options.sentenceOffset; diff --git a/phrase-extract/extract-ghkm/XmlTreeParser.cpp b/phrase-extract/extract-ghkm/XmlTreeParser.cpp deleted file mode 100644 index 17513fdd4..000000000 --- a/phrase-extract/extract-ghkm/XmlTreeParser.cpp +++ /dev/null @@ -1,90 +0,0 @@ -/*********************************************************************** - Moses - statistical machine translation system - Copyright (C) 2006-2011 University of Edinburgh - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. 
- - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, write to the Free Software - Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -***********************************************************************/ - -#include "XmlTreeParser.h" - -#include -#include - -#include "util/tokenize.hh" - -#include "SyntaxTree.h" -#include "tables-core.h" -#include "XmlException.h" -#include "XmlTree.h" - -namespace MosesTraining -{ -namespace GHKM -{ - -XmlTreeParser::XmlTreeParser(std::set &labelSet, - std::map &topLabelSet) - : m_labelSet(labelSet) - , m_topLabelSet(topLabelSet) -{ -} - -std::auto_ptr XmlTreeParser::Parse(const std::string &line) -{ - m_line = line; - m_nodeCollection.Clear(); - try { - if (!ProcessAndStripXMLTags(m_line, m_nodeCollection, m_labelSet, - m_topLabelSet, false)) { - throw Exception(""); - } - } catch (const XmlException &e) { - throw Exception(e.getMsg()); - } - std::auto_ptr root = m_nodeCollection.ExtractTree(); - m_words = util::tokenize(m_line); - AttachWords(m_words, *root); - return root; -} - -void XmlTreeParser::AttachWords(const std::vector &words, - SyntaxTree &root) -{ - std::vector leaves; - leaves.reserve(words.size()); - for (SyntaxTree::LeafIterator p(root); p != SyntaxTree::LeafIterator(); ++p) { - leaves.push_back(&*p); - } - - std::vector::const_iterator q = words.begin(); - for (std::vector::iterator p = leaves.begin(); p != leaves.end(); - ++p) { - SyntaxTree *leaf = *p; - const int start = leaf->value().GetStart(); - const int end = leaf->value().GetEnd(); - if (start != end) { - std::ostringstream msg; - msg << "leaf node covers multiple words (" << start << "-" << end - << "): this is currently unsupported"; - throw Exception(msg.str()); - } - SyntaxTree *newLeaf = new SyntaxTree(SyntaxNode(start, end, *q++)); - leaf->children().push_back(newLeaf); - newLeaf->parent() = leaf; - } -} - -} // namespace GHKM -} // namespace MosesTraining diff --git a/phrase-extract/extract-ghkm/XmlTreeParser.h b/phrase-extract/extract-ghkm/XmlTreeParser.h deleted file mode 100644 index 339a2bd13..000000000 --- a/phrase-extract/extract-ghkm/XmlTreeParser.h +++ /dev/null @@ -1,71 +0,0 @@ -/*********************************************************************** - Moses - statistical machine translation system - Copyright (C) 2006-2011 University of Edinburgh - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. 
- - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, write to the Free Software - Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -***********************************************************************/ - -#pragma once -#ifndef EXTRACT_GHKM_XML_TREE_PARSER_H_ -#define EXTRACT_GHKM_XML_TREE_PARSER_H_ - -#include -#include -#include -#include -#include - -#include "SyntaxNode.h" -#include "SyntaxNodeCollection.h" -#include "SyntaxTree.h" - -#include "Exception.h" - -namespace MosesTraining -{ -namespace GHKM -{ - -// Parses a string in Moses' XML parse tree format and returns a SyntaxTree -// object. -class XmlTreeParser -{ -public: - XmlTreeParser(std::set &, std::map &); - - std::auto_ptr Parse(const std::string &); - - const std::vector& GetWords() { - return m_words; - } - - const SyntaxNodeCollection &GetNodeCollection() const { - return m_nodeCollection; - } - -private: - std::set &m_labelSet; - std::map &m_topLabelSet; - std::string m_line; - SyntaxNodeCollection m_nodeCollection; - std::vector m_words; - - void AttachWords(const std::vector &, SyntaxTree &); -}; - -} // namespace GHKM -} // namespace MosesTraining - -#endif diff --git a/phrase-extract/filter-rule-table/FilterRuleTable.cpp b/phrase-extract/filter-rule-table/FilterRuleTable.cpp index c42c13de6..0c6f132f8 100644 --- a/phrase-extract/filter-rule-table/FilterRuleTable.cpp +++ b/phrase-extract/filter-rule-table/FilterRuleTable.cpp @@ -82,7 +82,7 @@ int FilterRuleTable::Main(int argc, char *argv[]) StringCfgFilter filter(testStrings); filter.Filter(std::cin, std::cout); } else if (testSentenceFormat == kTree) { - std::vector > testTrees; + std::vector > testTrees; ReadTestSet(testStream, testTrees); if (sourceSideRuleFormat == kCfg) { // TODO Implement TreeCfgFilter @@ -124,9 +124,11 @@ void FilterRuleTable::ReadTestSet( } void FilterRuleTable::ReadTestSet( - std::istream &input, std::vector > &sentences) + std::istream &input, std::vector > &sentences) { - XmlTreeParser parser; + std::set labelSet; + std::map topLabelSet; + XmlTreeParser parser(labelSet, topLabelSet); int lineNum = 0; std::string line; while (std::getline(input, line)) { @@ -136,7 +138,8 @@ void FilterRuleTable::ReadTestSet( << std::endl; continue; } - sentences.push_back(boost::shared_ptr(parser.Parse(line))); + sentences.push_back( + boost::shared_ptr(parser.Parse(line).release())); } } diff --git a/phrase-extract/filter-rule-table/FilterRuleTable.h b/phrase-extract/filter-rule-table/FilterRuleTable.h index 3a9489428..3077e690d 100644 --- a/phrase-extract/filter-rule-table/FilterRuleTable.h +++ b/phrase-extract/filter-rule-table/FilterRuleTable.h @@ -5,7 +5,7 @@ #include -#include "syntax-common/string_tree.h" +#include "SyntaxTree.h" #include "StringForest.h" @@ -36,7 +36,7 @@ private: void Filter(const std::vector > &); // Filter rule table (on std::cin) for test set (parse tree version). 
- void Filter(const std::vector > &); + void Filter(const std::vector > &); void ProcessOptions(int, char *[], Options &) const; @@ -46,7 +46,7 @@ private: // Read test set (tree version) void ReadTestSet(std::istream &, - std::vector > &); + std::vector > &); // Read test set (forest version) void ReadTestSet(std::istream &, diff --git a/phrase-extract/filter-rule-table/ForestTsgFilter.h b/phrase-extract/filter-rule-table/ForestTsgFilter.h index ff48b2e22..c9fe41f57 100644 --- a/phrase-extract/filter-rule-table/ForestTsgFilter.h +++ b/phrase-extract/filter-rule-table/ForestTsgFilter.h @@ -10,7 +10,6 @@ #include #include "syntax-common/numbered_set.h" -#include "syntax-common/string_tree.h" #include "syntax-common/tree.h" #include "syntax-common/tree_fragment_tokenizer.h" diff --git a/phrase-extract/filter-rule-table/TreeCfgFilter.cpp b/phrase-extract/filter-rule-table/TreeCfgFilter.cpp index cb04dc94e..dc938ac19 100644 --- a/phrase-extract/filter-rule-table/TreeCfgFilter.cpp +++ b/phrase-extract/filter-rule-table/TreeCfgFilter.cpp @@ -12,7 +12,7 @@ namespace FilterRuleTable { TreeCfgFilter::TreeCfgFilter( - const std::vector > &sentences) + const std::vector > &sentences) { } diff --git a/phrase-extract/filter-rule-table/TreeCfgFilter.h b/phrase-extract/filter-rule-table/TreeCfgFilter.h index 7dd0fa072..3434ff200 100644 --- a/phrase-extract/filter-rule-table/TreeCfgFilter.h +++ b/phrase-extract/filter-rule-table/TreeCfgFilter.h @@ -8,8 +8,9 @@ #include #include +#include "SyntaxTree.h" + #include "syntax-common/numbered_set.h" -#include "syntax-common/string_tree.h" #include "syntax-common/tree.h" #include "syntax-common/tree_fragment_tokenizer.h" @@ -29,7 +30,7 @@ class TreeCfgFilter : public CfgFilter { public: // Initialize the filter for a given set of test sentences. - TreeCfgFilter(const std::vector > &); + TreeCfgFilter(const std::vector > &); void Filter(std::istream &in, std::ostream &out); }; diff --git a/phrase-extract/filter-rule-table/TreeTsgFilter.cpp b/phrase-extract/filter-rule-table/TreeTsgFilter.cpp index 32a59fd6c..17a8dcb22 100644 --- a/phrase-extract/filter-rule-table/TreeTsgFilter.cpp +++ b/phrase-extract/filter-rule-table/TreeTsgFilter.cpp @@ -8,13 +8,13 @@ namespace FilterRuleTable { TreeTsgFilter::TreeTsgFilter( - const std::vector > &sentences) + const std::vector > &sentences) { - // Convert each StringTree to an IdTree. + // Convert each SyntaxTree to an IdTree. 
m_sentences.reserve(sentences.size()); - for (std::vector >::const_iterator p = + for (std::vector >::const_iterator p = sentences.begin(); p != sentences.end(); ++p) { - m_sentences.push_back(boost::shared_ptr(StringTreeToIdTree(**p))); + m_sentences.push_back(boost::shared_ptr(SyntaxTreeToIdTree(**p))); } m_labelToTree.resize(m_testVocab.Size()); @@ -25,15 +25,15 @@ TreeTsgFilter::TreeTsgFilter( } } -TreeTsgFilter::IdTree *TreeTsgFilter::StringTreeToIdTree(const StringTree &s) +TreeTsgFilter::IdTree *TreeTsgFilter::SyntaxTreeToIdTree(const SyntaxTree &s) { - IdTree *t = new IdTree(m_testVocab.Insert(s.value())); - const std::vector &sChildren = s.children(); + IdTree *t = new IdTree(m_testVocab.Insert(s.value().GetLabel())); + const std::vector &sChildren = s.children(); std::vector &tChildren = t->children(); tChildren.reserve(sChildren.size()); - for (std::vector::const_iterator p = sChildren.begin(); + for (std::vector::const_iterator p = sChildren.begin(); p != sChildren.end(); ++p) { - IdTree *child = StringTreeToIdTree(**p); + IdTree *child = SyntaxTreeToIdTree(**p); child->parent() = t; tChildren.push_back(child); } diff --git a/phrase-extract/filter-rule-table/TreeTsgFilter.h b/phrase-extract/filter-rule-table/TreeTsgFilter.h index 17378b552..fa11350b6 100644 --- a/phrase-extract/filter-rule-table/TreeTsgFilter.h +++ b/phrase-extract/filter-rule-table/TreeTsgFilter.h @@ -8,8 +8,9 @@ #include #include +#include "SyntaxTree.h" + #include "syntax-common/numbered_set.h" -#include "syntax-common/string_tree.h" #include "syntax-common/tree.h" #include "syntax-common/tree_fragment_tokenizer.h" @@ -29,7 +30,7 @@ class TreeTsgFilter : public TsgFilter { public: // Initialize the filter for a given set of test sentences. - TreeTsgFilter(const std::vector > &); + TreeTsgFilter(const std::vector > &); private: // Add an entry to m_labelToTree for every subtree of the given tree. @@ -41,9 +42,9 @@ private: // Try to match a fragment against a specific subtree of a test tree. bool MatchFragment(const IdTree &, const IdTree &); - // Convert a StringTree to an IdTree (wrt m_testVocab). Inserts symbols into + // Convert a SyntaxTree to an IdTree (wrt m_testVocab). Inserts symbols into // m_testVocab. 
- IdTree *StringTreeToIdTree(const StringTree &); + IdTree *SyntaxTreeToIdTree(const SyntaxTree &); std::vector > m_sentences; std::vector > m_labelToTree; diff --git a/phrase-extract/syntax-common/xml_tree_parser.cc b/phrase-extract/syntax-common/xml_tree_parser.cc index 2f8a904fa..bf3c6d87e 100644 --- a/phrase-extract/syntax-common/xml_tree_parser.cc +++ b/phrase-extract/syntax-common/xml_tree_parser.cc @@ -1,17 +1,27 @@ #include "xml_tree_parser.h" -#include "tables-core.h" -#include "XmlException.h" -#include "XmlTree.h" -#include "util/tokenize.hh" - #include #include +#include "util/tokenize.hh" + +#include "SyntaxTree.h" +#include "tables-core.h" +#include "XmlException.h" +#include "XmlTree.h" + namespace MosesTraining { namespace Syntax { -StringTree *XmlTreeParser::Parse(const std::string &line) { +XmlTreeParser::XmlTreeParser(std::set &labelSet, + std::map &topLabelSet) + : label_set_(labelSet) + , top_label_set_(topLabelSet) +{ +} + +std::auto_ptr XmlTreeParser::Parse(const std::string &line) +{ line_ = line; node_collection_.Clear(); try { @@ -22,38 +32,37 @@ StringTree *XmlTreeParser::Parse(const std::string &line) { } catch (const XmlException &e) { throw Exception(e.getMsg()); } - node_collection_.ConnectNodes(); - SyntaxNode *root = node_collection_.GetTop(); - assert(root); + std::auto_ptr root = node_collection_.ExtractTree(); words_ = util::tokenize(line_); - return ConvertTree(*root, words_); + AttachWords(words_, *root); + return root; } -// Converts a SyntaxNode tree to a StringTree. -StringTree *XmlTreeParser::ConvertTree(const SyntaxNode &tree, - const std::vector &words) { - StringTree *root = new StringTree(tree.GetLabel()); - const std::vector &children = tree.GetChildren(); - if (children.empty()) { - if (tree.GetStart() != tree.GetEnd()) { +void XmlTreeParser::AttachWords(const std::vector &words, + SyntaxTree &root) +{ + std::vector leaves; + leaves.reserve(words.size()); + for (SyntaxTree::LeafIterator p(root); p != SyntaxTree::LeafIterator(); ++p) { + leaves.push_back(&*p); + } + + std::vector::const_iterator q = words.begin(); + for (std::vector::iterator p = leaves.begin(); p != leaves.end(); + ++p) { + SyntaxTree *leaf = *p; + const int start = leaf->value().GetStart(); + const int end = leaf->value().GetEnd(); + if (start != end) { std::ostringstream msg; - msg << "leaf node covers multiple words (" << tree.GetStart() - << "-" << tree.GetEnd() << "): this is currently unsupported"; + msg << "leaf node covers multiple words (" << start << "-" << end + << "): this is currently unsupported"; throw Exception(msg.str()); } - StringTree *leaf = new StringTree(words[tree.GetStart()]); - leaf->parent() = root; - root->children().push_back(leaf); - } else { - for (std::vector::const_iterator p = children.begin(); - p != children.end(); ++p) { - assert(*p); - StringTree *child = ConvertTree(**p, words); - child->parent() = root; - root->children().push_back(child); - } + SyntaxTree *newLeaf = new SyntaxTree(SyntaxNode(start, end, *q++)); + leaf->children().push_back(newLeaf); + newLeaf->parent() = leaf; } - return root; } } // namespace Syntax diff --git a/phrase-extract/syntax-common/xml_tree_parser.h b/phrase-extract/syntax-common/xml_tree_parser.h index c84ea25ec..e0b75c830 100644 --- a/phrase-extract/syntax-common/xml_tree_parser.h +++ b/phrase-extract/syntax-common/xml_tree_parser.h @@ -1,34 +1,44 @@ #pragma once #include +#include #include #include #include #include "SyntaxNode.h" #include "SyntaxNodeCollection.h" +#include "SyntaxTree.h" #include 
"exception.h" -#include "string_tree.h" namespace MosesTraining { namespace Syntax { -// Parses a string in Moses' XML parse tree format and returns a StringTree +// Parses a string in Moses' XML parse tree format and returns a SyntaxTree // object. This is a wrapper around the ProcessAndStripXMLTags function. class XmlTreeParser { public: - StringTree *Parse(const std::string &); + XmlTreeParser(std::set &, std::map &); + + std::auto_ptr Parse(const std::string &); + + const std::vector& GetWords() { + return words_; + } + + const SyntaxNodeCollection &GetNodeCollection() const { + return node_collection_; + } private: - static StringTree *ConvertTree(const MosesTraining::SyntaxNode &, - const std::vector &); - - std::set label_set_; - std::map top_label_set_; + std::set &label_set_; + std::map &top_label_set_; std::string line_; - MosesTraining::SyntaxNodeCollection node_collection_; + SyntaxNodeCollection node_collection_; std::vector words_; + + void AttachWords(const std::vector &, SyntaxTree &); }; } // namespace Syntax From efdb8566b17d19783aa65caf22b24e48a789fbb8 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Tue, 2 Jun 2015 21:00:32 +0400 Subject: [PATCH 042/108] delete ChangeSource(). Not used --- contrib/other-builds/all.workspace | 1 - contrib/other-builds/moses/moses.project | 8 +- moses-cmd/MainVW.cpp | 3 - moses/ExportInterface.cpp | 4 - moses/FF/Factory.cpp | 2 - moses/FF/FeatureFunction.cpp | 14 ---- moses/FF/FeatureFunction.h | 6 -- moses/FF/SkeletonChangeInput.cpp | 96 ------------------------ moses/FF/SkeletonChangeInput.h | 45 ----------- 9 files changed, 3 insertions(+), 176 deletions(-) delete mode 100644 moses/FF/SkeletonChangeInput.cpp delete mode 100644 moses/FF/SkeletonChangeInput.h diff --git a/contrib/other-builds/all.workspace b/contrib/other-builds/all.workspace index 3df758293..66dafe3d2 100644 --- a/contrib/other-builds/all.workspace +++ b/contrib/other-builds/all.workspace @@ -1,6 +1,5 @@ - diff --git a/contrib/other-builds/moses/moses.project b/contrib/other-builds/moses/moses.project index 2c2affd45..f902dd1f4 100644 --- a/contrib/other-builds/moses/moses.project +++ b/contrib/other-builds/moses/moses.project @@ -1,6 +1,9 @@ + + + - - - @@ -531,8 +531,6 @@ - - diff --git a/moses-cmd/MainVW.cpp b/moses-cmd/MainVW.cpp index ac54c1ed6..c8047c201 100644 --- a/moses-cmd/MainVW.cpp +++ b/moses-cmd/MainVW.cpp @@ -151,9 +151,6 @@ int main(int argc, char** argv) ResetUserTime(); } - InputType* foo = source.get(); - FeatureFunction::CallChangeSource(foo); - // set up task of training one sentence boost::shared_ptr task; task = TrainingTask::create(source, ioWrapper); diff --git a/moses/ExportInterface.cpp b/moses/ExportInterface.cpp index 0ceeceec1..c444e98c9 100644 --- a/moses/ExportInterface.cpp +++ b/moses/ExportInterface.cpp @@ -118,8 +118,6 @@ string SimpleTranslationInterface::translate(const string &inputString) ResetUserTime(); } - FeatureFunction::CallChangeSource(&*source); - // set up task of translating one sentence boost::shared_ptr task = TranslationTask::create(source, ioWrapper); @@ -223,8 +221,6 @@ batch_run() while ((source = ioWrapper->ReadInput()) != NULL) { IFVERBOSE(1) ResetUserTime(); - FeatureFunction::CallChangeSource(source.get()); - // set up task of translating one sentence boost::shared_ptr task = TranslationTask::create(source, ioWrapper); diff --git a/moses/FF/Factory.cpp b/moses/FF/Factory.cpp index c797381ff..167e02370 100644 --- a/moses/FF/Factory.cpp +++ b/moses/FF/Factory.cpp @@ -62,7 +62,6 @@ #include "moses/LM/SkeletonLM.h" 
#include "moses/FF/SkeletonTranslationOptionListFeature.h" #include "moses/LM/BilingualLM.h" -#include "SkeletonChangeInput.h" #include "moses/TranslationModel/SkeletonPT.h" #include "moses/Syntax/InputWeightFF.h" #include "moses/Syntax/RuleTableFF.h" @@ -268,7 +267,6 @@ FeatureRegistry::FeatureRegistry() MOSES_FNAME(SkeletonStatelessFF); MOSES_FNAME(SkeletonStatefulFF); MOSES_FNAME(SkeletonLM); - MOSES_FNAME(SkeletonChangeInput); MOSES_FNAME(SkeletonTranslationOptionListFeature); MOSES_FNAME(SkeletonPT); diff --git a/moses/FF/FeatureFunction.cpp b/moses/FF/FeatureFunction.cpp index 5eab202ae..08ad26db8 100644 --- a/moses/FF/FeatureFunction.cpp +++ b/moses/FF/FeatureFunction.cpp @@ -38,20 +38,6 @@ void FeatureFunction::Destroy() RemoveAllInColl(s_staticColl); } -// The original declaration as -// void FeatureFunction::CallChangeSource(InputType *&input) -// had me a bit perplexed. Would you really want to allow -// any feature function to replace the InputType behind the -// back of the others? And change what the vector is pointing to? - -void FeatureFunction::CallChangeSource(InputType * const&input) -{ - for (size_t i = 0; i < s_staticColl.size(); ++i) { - const FeatureFunction &ff = *s_staticColl[i]; - ff.ChangeSource(input); - } -} - void FeatureFunction::SetupAll(TranslationTask const& ttask) { BOOST_FOREACH(FeatureFunction* ff, s_staticColl) diff --git a/moses/FF/FeatureFunction.h b/moses/FF/FeatureFunction.h index d3d6ab168..c95b5eb25 100644 --- a/moses/FF/FeatureFunction.h +++ b/moses/FF/FeatureFunction.h @@ -62,9 +62,6 @@ public: static FeatureFunction &FindFeatureFunction(const std::string& name); static void Destroy(); - static void CallChangeSource(InputType * const&input); - // see my note in FeatureFunction.cpp --- UG - FeatureFunction(const std::string &line, bool initializeNow); FeatureFunction(size_t numScoreComponents, const std::string &line); virtual bool IsStateless() const = 0; @@ -156,9 +153,6 @@ public: ScoreComponentCollection& scoreBreakdown, ScoreComponentCollection& estimatedFutureScore) const = 0; - // override this method if you want to change the input before decoding - virtual void ChangeSource(InputType * const&input) const { } - // for context-dependent processing static void SetupAll(TranslationTask const& task); virtual void Setup(TranslationTask const& task) const { }; diff --git a/moses/FF/SkeletonChangeInput.cpp b/moses/FF/SkeletonChangeInput.cpp deleted file mode 100644 index 7937d7771..000000000 --- a/moses/FF/SkeletonChangeInput.cpp +++ /dev/null @@ -1,96 +0,0 @@ -#include -#include "SkeletonChangeInput.h" -#include "moses/ScoreComponentCollection.h" -#include "moses/TargetPhrase.h" -#include "moses/Sentence.h" -#include "moses/FactorCollection.h" -#include "util/exception.hh" - -using namespace std; - -namespace Moses -{ -SkeletonChangeInput::SkeletonChangeInput(const std::string &line) - :StatelessFeatureFunction(2, line) -{ - ReadParameters(); -} - -void SkeletonChangeInput::EvaluateInIsolation(const Phrase &source - , const TargetPhrase &targetPhrase - , ScoreComponentCollection &scoreBreakdown - , ScoreComponentCollection &estimatedFutureScore) const -{ - // dense scores - vector newScores(m_numScoreComponents); - newScores[0] = 1.5; - newScores[1] = 0.3; - scoreBreakdown.PlusEquals(this, newScores); - - // sparse scores - scoreBreakdown.PlusEquals(this, "sparse-name", 2.4); - -} - -void SkeletonChangeInput::EvaluateWithSourceContext(const InputType &input - , const InputPath &inputPath - , const TargetPhrase &targetPhrase - , const StackVec 
*stackVec - , ScoreComponentCollection &scoreBreakdown - , ScoreComponentCollection *estimatedFutureScore) const -{ - if (targetPhrase.GetNumNonTerminals()) { - vector newScores(m_numScoreComponents); - newScores[0] = - std::numeric_limits::infinity(); - scoreBreakdown.PlusEquals(this, newScores); - } - -} - -void SkeletonChangeInput::EvaluateTranslationOptionListWithSourceContext(const InputType &input - , const TranslationOptionList &translationOptionList) const -{} - -void SkeletonChangeInput::EvaluateWhenApplied(const Hypothesis& hypo, - ScoreComponentCollection* accumulator) const -{} - -void SkeletonChangeInput::EvaluateWhenApplied(const ChartHypothesis &hypo, - ScoreComponentCollection* accumulator) const -{} - -void SkeletonChangeInput::ChangeSource(InputType* const& input) const -{ - // add factor[1] to each word. Created from first 4 letter of factor[0] - - Sentence *sentence = dynamic_cast(input); - UTIL_THROW_IF2(sentence == NULL, "Not a sentence input"); - - FactorCollection &fc = FactorCollection::Instance(); - - size_t size = sentence->GetSize(); - for (size_t i = 0; i < size; ++i) { - Word &word = sentence->Phrase::GetWord(i); - const Factor *factor0 = word[0]; - - std::string str = factor0->GetString().as_string(); - if (str.length() > 4) { - str = str.substr(0, 4); - } - - const Factor *factor1 = fc.AddFactor(str); - word.SetFactor(1, factor1); - } -} - -void SkeletonChangeInput::SetParameter(const std::string& key, const std::string& value) -{ - if (key == "arg") { - // set value here - } else { - StatelessFeatureFunction::SetParameter(key, value); - } -} - -} - diff --git a/moses/FF/SkeletonChangeInput.h b/moses/FF/SkeletonChangeInput.h deleted file mode 100644 index f8d9010ce..000000000 --- a/moses/FF/SkeletonChangeInput.h +++ /dev/null @@ -1,45 +0,0 @@ -#pragma once - -#include -#include "StatelessFeatureFunction.h" - -namespace Moses -{ - -class SkeletonChangeInput : public StatelessFeatureFunction -{ -public: - SkeletonChangeInput(const std::string &line); - - bool IsUseable(const FactorMask &mask) const { - return true; - } - - void EvaluateInIsolation(const Phrase &source - , const TargetPhrase &targetPhrase - , ScoreComponentCollection &scoreBreakdown - , ScoreComponentCollection &estimatedFutureScore) const; - - void ChangeSource(InputType* const&input) const; - - void EvaluateWithSourceContext(const InputType &input - , const InputPath &inputPath - , const TargetPhrase &targetPhrase - , const StackVec *stackVec - , ScoreComponentCollection &scoreBreakdown - , ScoreComponentCollection *estimatedFutureScore = NULL) const; - - void EvaluateTranslationOptionListWithSourceContext(const InputType &input - , const TranslationOptionList &translationOptionList) const; - - void EvaluateWhenApplied(const Hypothesis& hypo, - ScoreComponentCollection* accumulator) const; - void EvaluateWhenApplied(const ChartHypothesis &hypo, - ScoreComponentCollection* accumulator) const; - - void SetParameter(const std::string& key, const std::string& value); - -}; - -} - From 1d7ed728eec85d916ab5331f4aa20a259b047b38 Mon Sep 17 00:00:00 2001 From: Jeroen Vermeulen Date: Wed, 3 Jun 2015 00:00:57 +0700 Subject: [PATCH 043/108] =?UTF-8?q?Rename=20=E2=80=98aux=E2=80=99=20to=20?= =?UTF-8?q?=E2=80=98auxiliary=E2=80=99=20for=20Windows'=20sake.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Any filename like “aux” or “aux.*” is special in Windows, and can't be opened, dir'ed, and so on. This was causing some people problems. 
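For reference, Windows reserves the device names CON, PRN, AUX, NUL, COM1-COM9 and LPT1-LPT9 as file basenames regardless of extension or case, which is why a file called aux.m4m cannot be opened on that platform. The standalone C++ sketch below is illustrative only and not part of this patch (the helper name IsWindowsReservedName is made up); it shows how such a collision can be detected before a file is created:

    // Flags basenames that collide with a Windows reserved device name.
    // The reserved list follows Microsoft's file-naming rules; treating
    // "aux.*" as reserved mirrors the description in the commit message above.
    #include <algorithm>
    #include <cctype>
    #include <iostream>
    #include <set>
    #include <string>

    bool IsWindowsReservedName(const std::string &filename) {
      static const std::set<std::string> reserved = {
        "CON", "PRN", "AUX", "NUL",
        "COM1", "COM2", "COM3", "COM4", "COM5", "COM6", "COM7", "COM8", "COM9",
        "LPT1", "LPT2", "LPT3", "LPT4", "LPT5", "LPT6", "LPT7", "LPT8", "LPT9"};
      // Everything before the first dot is the basename: "aux.m4m" -> "aux".
      std::string stem = filename.substr(0, filename.find('.'));
      std::transform(stem.begin(), stem.end(), stem.begin(),
                     [](unsigned char c) { return static_cast<char>(std::toupper(c)); });
      return reserved.count(stem) != 0;
    }

    int main() {
      std::cout << IsWindowsReservedName("aux.m4m") << "\n";        // prints 1
      std::cout << IsWindowsReservedName("auxiliary.m4m") << "\n";  // prints 0
    }

Renaming the module to auxiliary.m4m, as this patch does, sidesteps the problem entirely without any platform-specific checks.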
--- contrib/m4m/examples/giza-vs-fast.m4m | 2 +- contrib/m4m/modules/{aux.m4m => auxiliary.m4m} | 0 contrib/m4m/modules/m4m.m4m | 2 +- contrib/m4m/modules/prepare-corpus.m4m | 12 ++++++------ 4 files changed, 8 insertions(+), 8 deletions(-) rename contrib/m4m/modules/{aux.m4m => auxiliary.m4m} (100%) diff --git a/contrib/m4m/examples/giza-vs-fast.m4m b/contrib/m4m/examples/giza-vs-fast.m4m index 3ce336611..e5e56dc2a 100644 --- a/contrib/m4m/examples/giza-vs-fast.m4m +++ b/contrib/m4m/examples/giza-vs-fast.m4m @@ -96,4 +96,4 @@ reset-lm: -rm -rf lm reset-all: reset-lm reset-aln -rm -rf $(wildcard crp/trn/*/[ct]* crp/dev/[ct]* crp/tst/[ct]*) - -rm -rf aux + -rm -rf auxiliary diff --git a/contrib/m4m/modules/aux.m4m b/contrib/m4m/modules/auxiliary.m4m similarity index 100% rename from contrib/m4m/modules/aux.m4m rename to contrib/m4m/modules/auxiliary.m4m diff --git a/contrib/m4m/modules/m4m.m4m b/contrib/m4m/modules/m4m.m4m index d6c597db9..1a88e80b5 100644 --- a/contrib/m4m/modules/m4m.m4m +++ b/contrib/m4m/modules/m4m.m4m @@ -8,7 +8,7 @@ m4mdir := $(patsubst %modules/,%,\ # $(info M4MDIR is ${m4mdir}) # m4m modules to be included -M4M_MODULES := aux init +M4M_MODULES := auxiliary init M4M_MODULES += tools moses-parameters prepare-corpus M4M_MODULES += mgiza fastalign mmbitext phrase-table moses-ini M4M_MODULES += tune-moses eval-system kenlm diff --git a/contrib/m4m/modules/prepare-corpus.m4m b/contrib/m4m/modules/prepare-corpus.m4m index 3c88069c3..2c064c9c7 100644 --- a/contrib/m4m/modules/prepare-corpus.m4m +++ b/contrib/m4m/modules/prepare-corpus.m4m @@ -40,8 +40,8 @@ endef define truecase $2/cased/%.$3.gz: caser = ${run-truecaser} -$2/cased/%.$3.gz: caser += -model ${WDIR}/aux/truecasing-model.$1 -$2/cased/%.$3.gz: | $2/tok/%.$3.gz ${WDIR}/aux/truecasing-model.$1 +$2/cased/%.$3.gz: caser += -model ${WDIR}/auxiliary/truecasing-model.$1 +$2/cased/%.$3.gz: | $2/tok/%.$3.gz ${WDIR}/auxiliary/truecasing-model.$1 $$(lock) zcat $$(word 1, $$|) | ${parallel} --pipe -k $${caser} | gzip > $$@_ mv $$@_ $$@ @@ -127,8 +127,8 @@ endef # .SECONDARY: $(call trn.tok-mno,${L1}) $(call trn.tok-pll,${L1}) # .SECONDARY: $(call trn.tok-mno,${L2}) $(call trn.tok-pll,${L2}) -#${WDIR}/aux/truecasing-model.${L1}: | $(call trn.tok-mno,${L1}) $(call trn.tok-pll,${L1}) -${WDIR}/aux/truecasing-model.${L1}: | $(call trn.tok-mno,${L1}) +#${WDIR}/auxiliary/truecasing-model.${L1}: | $(call trn.tok-mno,${L1}) $(call trn.tok-pll,${L1}) +${WDIR}/auxiliary/truecasing-model.${L1}: | $(call trn.tok-mno,${L1}) $(lock) $(if $|,,$(error Can't find training data for $@!))#' ${train-truecaser} -model $@_ -corpus <(echo $| | xargs zcat -f) @@ -136,8 +136,8 @@ ${WDIR}/aux/truecasing-model.${L1}: | $(call trn.tok-mno,${L1}) mv $@_ $@ $(unlock) -#${WDIR}/aux/truecasing-model.${L2}: | $(call trn.tok-mno,${L2}) $(call trn.tok-pll,${L2}) -${WDIR}/aux/truecasing-model.${L2}: | $(call trn.tok-mno,${L2}) +#${WDIR}/auxiliary/truecasing-model.${L2}: | $(call trn.tok-mno,${L2}) $(call trn.tok-pll,${L2}) +${WDIR}/auxiliary/truecasing-model.${L2}: | $(call trn.tok-mno,${L2}) $(lock) $(if $|,,$(error Can't find training data for $@!))#' ${train-truecaser} -model $@_ -corpus <(echo $| | xargs zcat -f) From 3ea5faead8bb21d93ada5553dcb37d2229394415 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Tue, 2 Jun 2015 21:44:58 +0400 Subject: [PATCH 044/108] codelite --- contrib/other-builds/moses/moses.project | 10 +++++++--- .../CYKPlusParser/ChartRuleLookupManagerMemory.h | 3 --- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git 
a/contrib/other-builds/moses/moses.project b/contrib/other-builds/moses/moses.project index f902dd1f4..66e0b9bad 100644 --- a/contrib/other-builds/moses/moses.project +++ b/contrib/other-builds/moses/moses.project @@ -798,6 +798,10 @@ + + + + @@ -812,9 +816,9 @@ - - - + + + diff --git a/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemory.h b/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemory.h index 84e5f085d..c8e2db2d7 100644 --- a/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemory.h +++ b/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemory.h @@ -18,8 +18,6 @@ ***********************************************************************/ #pragma once -#ifndef moses_ChartRuleLookupManagerMemory_h -#define moses_ChartRuleLookupManagerMemory_h #include @@ -97,4 +95,3 @@ private: } // namespace Moses -#endif From 6bea23357c1d5a9a50382330d14f4c734f94ac98 Mon Sep 17 00:00:00 2001 From: Phil Williams Date: Wed, 3 Jun 2015 09:28:38 +0100 Subject: [PATCH 045/108] Ongoing moses/phrase-extract refactoring --- phrase-extract/pcfg-common/pcfg_tree.h | 79 ---------- phrase-extract/pcfg-common/syntax_tree.h | 93 ------------ phrase-extract/pcfg-common/typedef.h | 1 - phrase-extract/pcfg-common/xml_tree_parser.cc | 89 ------------ phrase-extract/pcfg-common/xml_tree_parser.h | 59 -------- phrase-extract/pcfg-common/xml_tree_writer.h | 135 ------------------ phrase-extract/pcfg-extract/Jamfile | 2 +- phrase-extract/pcfg-extract/pcfg_extract.cc | 34 ++--- phrase-extract/pcfg-extract/rule_extractor.cc | 16 +-- phrase-extract/pcfg-extract/rule_extractor.h | 6 +- phrase-extract/pcfg-score/pcfg_score.cc | 19 +-- phrase-extract/pcfg-score/tree_scorer.cc | 66 +++++++-- phrase-extract/pcfg-score/tree_scorer.h | 10 +- .../syntax-common/xml_tree_parser.cc | 5 +- .../syntax-common/xml_tree_parser.h | 36 ++++- .../syntax-common/xml_tree_writer.cc | 82 +++++++++++ .../syntax-common/xml_tree_writer.h | 27 ++++ 17 files changed, 245 insertions(+), 514 deletions(-) delete mode 100644 phrase-extract/pcfg-common/pcfg_tree.h delete mode 100644 phrase-extract/pcfg-common/syntax_tree.h delete mode 100644 phrase-extract/pcfg-common/xml_tree_parser.cc delete mode 100644 phrase-extract/pcfg-common/xml_tree_parser.h delete mode 100644 phrase-extract/pcfg-common/xml_tree_writer.h create mode 100644 phrase-extract/syntax-common/xml_tree_writer.cc create mode 100644 phrase-extract/syntax-common/xml_tree_writer.h diff --git a/phrase-extract/pcfg-common/pcfg_tree.h b/phrase-extract/pcfg-common/pcfg_tree.h deleted file mode 100644 index ce28eb8dd..000000000 --- a/phrase-extract/pcfg-common/pcfg_tree.h +++ /dev/null @@ -1,79 +0,0 @@ -/*********************************************************************** - Moses - statistical machine translation system - Copyright (C) 2006-2012 University of Edinburgh - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. 
- - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, write to the Free Software - Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -***********************************************************************/ - -#pragma once -#ifndef PCFG_PCFG_TREE_H_ -#define PCFG_PCFG_TREE_H_ - -#include - -#include "syntax_tree.h" -#include "xml_tree_writer.h" - -namespace MosesTraining { -namespace Syntax { -namespace PCFG { - -template -class PcfgTreeBase : public SyntaxTreeBase { - public: - typedef std::string LabelType; - typedef SyntaxTreeBase BaseType; - - PcfgTreeBase(const LabelType &label) : BaseType(label), score_(0.0) {} - - double score() const { return score_; } - void set_score(double s) { score_ = s; } - - private: - double score_; -}; - -class PcfgTree : public PcfgTreeBase { - public: - typedef PcfgTreeBase BaseType; - PcfgTree(const BaseType::LabelType &label) : BaseType(label) {} -}; - -// Specialise XmlOutputHandler for PcfgTree. -template<> -class XmlOutputHandler { - public: - typedef std::map AttributeMap; - - void GetLabel(const PcfgTree &tree, std::string &label) const { - label = tree.label(); - } - - void GetAttributes(const PcfgTree &tree, AttributeMap &attribute_map) const { - attribute_map.clear(); - double score = tree.score(); - if (score != 0.0) { - std::ostringstream out; - out << tree.score(); - attribute_map["pcfg"] = out.str(); - } - } -}; - -} // namespace PCFG -} // namespace Syntax -} // namespace MosesTraining - -#endif diff --git a/phrase-extract/pcfg-common/syntax_tree.h b/phrase-extract/pcfg-common/syntax_tree.h deleted file mode 100644 index c0c6eaef9..000000000 --- a/phrase-extract/pcfg-common/syntax_tree.h +++ /dev/null @@ -1,93 +0,0 @@ -/*********************************************************************** - Moses - statistical machine translation system - Copyright (C) 2006-2012 University of Edinburgh - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, write to the Free Software - Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -***********************************************************************/ - -#pragma once -#ifndef PCFG_SYNTAX_TREE_H_ -#define PCFG_SYNTAX_TREE_H_ - -#include -#include - -namespace MosesTraining { -namespace Syntax { -namespace PCFG { - -// Base class for SyntaxTree, AgreementTree, and friends. 
-template -class SyntaxTreeBase { - public: - // Constructors - SyntaxTreeBase(const T &label) - : label_(label) - , children_() - , parent_(0) {} - - SyntaxTreeBase(const T &label, const std::vector &children) - : label_(label) - , children_(children) - , parent_(0) {} - - // Destructor - virtual ~SyntaxTreeBase(); - - const T &label() const { return label_; } - const DerivedType *parent() const { return parent_; } - DerivedType *parent() { return parent_; } - const std::vector &children() const { return children_; } - std::vector &children() { return children_; } - - void set_label(const T &label) { label_ = label; } - void set_parent(DerivedType *parent) { parent_ = parent; } - void set_children(const std::vector &c) { children_ = c; } - - bool IsLeaf() const { return children_.empty(); } - - bool IsPreterminal() const { - return children_.size() == 1 && children_[0]->IsLeaf(); - } - - void AddChild(DerivedType *child) { children_.push_back(child); } - - private: - T label_; - std::vector children_; - DerivedType *parent_; -}; - -template -class SyntaxTree : public SyntaxTreeBase > { - public: - typedef SyntaxTreeBase > BaseType; - SyntaxTree(const T &label) : BaseType(label) {} - SyntaxTree(const T &label, const std::vector &children) - : BaseType(label, children) {} -}; - -template -SyntaxTreeBase::~SyntaxTreeBase() { - for (std::size_t i = 0; i < children_.size(); ++i) { - delete children_[i]; - } -} - -} // namespace PCFG -} // namespace Syntax -} // namespace MosesTraining - -#endif diff --git a/phrase-extract/pcfg-common/typedef.h b/phrase-extract/pcfg-common/typedef.h index e738163df..1280b89cf 100644 --- a/phrase-extract/pcfg-common/typedef.h +++ b/phrase-extract/pcfg-common/typedef.h @@ -24,7 +24,6 @@ #include #include "syntax-common/numbered_set.h" -#include "syntax_tree.h" namespace MosesTraining { namespace Syntax { diff --git a/phrase-extract/pcfg-common/xml_tree_parser.cc b/phrase-extract/pcfg-common/xml_tree_parser.cc deleted file mode 100644 index f15a04811..000000000 --- a/phrase-extract/pcfg-common/xml_tree_parser.cc +++ /dev/null @@ -1,89 +0,0 @@ -/*********************************************************************** - Moses - statistical machine translation system - Copyright (C) 2006-2012 University of Edinburgh - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. 
- - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, write to the Free Software - Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -***********************************************************************/ - -#include "xml_tree_parser.h" - -#include -#include - -#include "tables-core.h" -#include "XmlException.h" -#include "XmlTree.h" -#include "util/tokenize.hh" - -#include "syntax-common/exception.h" - -namespace MosesTraining { -namespace Syntax { -namespace PCFG { - -XmlTreeParser::XmlTreeParser() { -} - -std::auto_ptr XmlTreeParser::Parse(const std::string &line) { - m_line = line; - m_tree.Clear(); - try { - if (!ProcessAndStripXMLTags(m_line, m_tree, m_labelSet, m_topLabelSet)) { - throw Exception(""); - } - } catch (const XmlException &e) { - throw Exception(e.getMsg()); - } - m_tree.ConnectNodes(); - SyntaxNode *root = m_tree.GetTop(); - if (!root) { - // There is no XML tree. - return std::auto_ptr(); - } - m_words = util::tokenize(m_line); - return ConvertTree(*root, m_words); -} - -// Converts a SyntaxNode tree to a Moses::PCFG::PcfgTree. -std::auto_ptr XmlTreeParser::ConvertTree( - const SyntaxNode &tree, - const std::vector &words) { - std::auto_ptr root(new PcfgTree(tree.GetLabel())); - const std::vector &children = tree.GetChildren(); - if (children.empty()) { - if (tree.GetStart() != tree.GetEnd()) { - std::ostringstream msg; - msg << "leaf node covers multiple words (" << tree.GetStart() - << "-" << tree.GetEnd() << "): this is currently unsupported"; - throw Exception(msg.str()); - } - std::auto_ptr leaf(new PcfgTree(words[tree.GetStart()])); - leaf->set_parent(root.get()); - root->AddChild(leaf.release()); - } else { - for (std::vector::const_iterator p = children.begin(); - p != children.end(); ++p) { - assert(*p); - std::auto_ptr child = ConvertTree(**p, words); - child->set_parent(root.get()); - root->AddChild(child.release()); - } - } - return root; -} - -} // namespace PCFG -} // namespace Syntax -} // namespace MosesTraining diff --git a/phrase-extract/pcfg-common/xml_tree_parser.h b/phrase-extract/pcfg-common/xml_tree_parser.h deleted file mode 100644 index 8605c0691..000000000 --- a/phrase-extract/pcfg-common/xml_tree_parser.h +++ /dev/null @@ -1,59 +0,0 @@ -/*********************************************************************** - Moses - statistical machine translation system - Copyright (C) 2006-2012 University of Edinburgh - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. 
- - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, write to the Free Software - Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -***********************************************************************/ - -#pragma once -#ifndef PCFG_XML_TREE_PARSER_H_ -#define PCFG_XML_TREE_PARSER_H_ - -#include -#include -#include -#include -#include - -#include "pcfg_tree.h" -#include "SyntaxNode.h" -#include "SyntaxNodeCollection.h" - -namespace MosesTraining { -namespace Syntax { -namespace PCFG { - -// Parses a string in Moses' XML parse tree format and returns a PcfgTree -// object. -class XmlTreeParser { - public: - XmlTreeParser(); - std::auto_ptr Parse(const std::string &); - private: - std::auto_ptr ConvertTree(const MosesTraining::SyntaxNode &, - const std::vector &); - - std::set m_labelSet; - std::map m_topLabelSet; - std::string m_line; - MosesTraining::SyntaxNodeCollection m_tree; - std::vector m_words; -}; - -} // namespace PCFG -} // namespace Syntax -} // namespace MosesTraining - -#endif diff --git a/phrase-extract/pcfg-common/xml_tree_writer.h b/phrase-extract/pcfg-common/xml_tree_writer.h deleted file mode 100644 index 8582e544f..000000000 --- a/phrase-extract/pcfg-common/xml_tree_writer.h +++ /dev/null @@ -1,135 +0,0 @@ -/*********************************************************************** - Moses - statistical machine translation system - Copyright (C) 2006-2012 University of Edinburgh - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. 
- - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, write to the Free Software - Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -***********************************************************************/ - -#pragma once -#ifndef PCFG_XML_TREE_WRITER_H_ -#define PCFG_XML_TREE_WRITER_H_ - -#include -#include -#include -#include -#include -#include - -#include "XmlTree.h" - -#include "syntax_tree.h" - -namespace MosesTraining { -namespace Syntax { -namespace PCFG { - -template -class XmlOutputHandler { - public: - typedef std::map AttributeMap; - - void GetLabel(const InputTree &, std::string &) const; - void GetAttributes(const InputTree &, AttributeMap &) const; -}; - -template -class XmlTreeWriter : public XmlOutputHandler { - public: - typedef XmlOutputHandler Base; - void Write(const InputTree &, std::ostream &) const; - private: - std::string Escape(const std::string &) const; -}; - -template -void XmlTreeWriter::Write(const InputTree &tree, - std::ostream &out) const { - assert(!tree.IsLeaf()); - - // Opening tag - - std::string label; - Base::GetLabel(tree, label); - out << "first << "=\"" << p->second << "\""; - } - - out << ">"; - - // Children - - const std::vector &children = tree.children(); - for (typename std::vector::const_iterator p = children.begin(); - p != children.end(); ++p) { - InputTree &child = **p; - if (child.IsLeaf()) { - Base::GetLabel(child, label); - out << " " << Escape(label); - } else { - out << " "; - Write(**p, out); - } - } - - // Closing tag - out << " "; - - if (tree.parent() == 0) { - out << std::endl; - } -} - -// Escapes XML special characters. -template -std::string XmlTreeWriter::Escape(const std::string &s) const { - std::string t; - std::size_t len = s.size(); - t.reserve(len); - for (std::size_t i = 0; i < len; ++i) { - if (s[i] == '<') { - t += "<"; - } else if (s[i] == '>') { - t += ">"; - } else if (s[i] == '[') { - t += "["; - } else if (s[i] == ']') { - t += "]"; - } else if (s[i] == '|') { - t += "|"; - } else if (s[i] == '&') { - t += "&"; - } else if (s[i] == '\'') { - t += "'"; - } else if (s[i] == '"') { - t += """; - } else { - t += s[i]; - } - } - return t; -} - -} // namespace PCFG -} // namespace Syntax -} // namespace MosesTraining - -#endif diff --git a/phrase-extract/pcfg-extract/Jamfile b/phrase-extract/pcfg-extract/Jamfile index 61f056599..2442b967a 100644 --- a/phrase-extract/pcfg-extract/Jamfile +++ b/phrase-extract/pcfg-extract/Jamfile @@ -1 +1 @@ -exe pcfg-extract : [ glob *.cc ] ..//pcfg-common ../..//boost_program_options : .. ; +exe pcfg-extract : [ glob *.cc ] ..//syntax-common ..//pcfg-common ../..//boost_program_options : .. 
; diff --git a/phrase-extract/pcfg-extract/pcfg_extract.cc b/phrase-extract/pcfg-extract/pcfg_extract.cc index 29d63b994..8e7a40e07 100644 --- a/phrase-extract/pcfg-extract/pcfg_extract.cc +++ b/phrase-extract/pcfg-extract/pcfg_extract.cc @@ -19,20 +19,6 @@ #include "pcfg_extract.h" -#include "options.h" -#include "rule_collection.h" -#include "rule_extractor.h" - -#include "syntax-common/exception.h" - -#include "pcfg-common/pcfg.h" -#include "pcfg-common/pcfg_tree.h" -#include "pcfg-common/syntax_tree.h" -#include "pcfg-common/typedef.h" -#include "pcfg-common/xml_tree_parser.h" - -#include - #include #include #include @@ -43,6 +29,20 @@ #include #include +#include + +#include "syntax-common/exception.h" +#include "syntax-common/xml_tree_parser.h" + +#include "SyntaxTree.h" + +#include "pcfg-common/pcfg.h" +#include "pcfg-common/typedef.h" + +#include "options.h" +#include "rule_collection.h" +#include "rule_extractor.h" + namespace MosesTraining { namespace Syntax @@ -60,10 +60,12 @@ int PcfgExtract::Main(int argc, char *argv[]) Vocabulary non_term_vocab; RuleExtractor rule_extractor(non_term_vocab); RuleCollection rule_collection; - XmlTreeParser parser; + std::set label_set; + std::map top_label_set; + XmlTreeParser parser(label_set, top_label_set); std::string line; std::size_t line_num = 0; - std::auto_ptr tree; + std::auto_ptr tree; while (std::getline(std::cin, line)) { ++line_num; try { diff --git a/phrase-extract/pcfg-extract/rule_extractor.cc b/phrase-extract/pcfg-extract/rule_extractor.cc index bd2c48c8a..39da54ef2 100644 --- a/phrase-extract/pcfg-extract/rule_extractor.cc +++ b/phrase-extract/pcfg-extract/rule_extractor.cc @@ -19,8 +19,6 @@ #include "rule_extractor.h" -#include "pcfg-common/pcfg_tree.h" - namespace MosesTraining { namespace Syntax @@ -33,21 +31,21 @@ RuleExtractor::RuleExtractor(Vocabulary &non_term_vocab) { } -void RuleExtractor::Extract(const PcfgTree &tree, RuleCollection &rc) const +void RuleExtractor::Extract(const SyntaxTree &tree, RuleCollection &rc) const { - if (tree.IsPreterminal() || tree.IsLeaf()) { + if (tree.IsLeaf() || tree.children()[0]->IsLeaf()) { return; } - std::size_t lhs = non_term_vocab_.Insert(tree.label()); + std::size_t lhs = non_term_vocab_.Insert(tree.value().GetLabel()); std::vector rhs; - const std::vector &children = tree.children(); + const std::vector &children = tree.children(); rhs.reserve(children.size()); - for (std::vector::const_iterator p(children.begin()); + for (std::vector::const_iterator p(children.begin()); p != children.end(); ++p) { - const PcfgTree &child = **p; - rhs.push_back(non_term_vocab_.Insert(child.label())); + const SyntaxTree &child = **p; + rhs.push_back(non_term_vocab_.Insert(child.value().GetLabel())); Extract(child, rc); } rc.Add(lhs, rhs); diff --git a/phrase-extract/pcfg-extract/rule_extractor.h b/phrase-extract/pcfg-extract/rule_extractor.h index f35460909..d32d76992 100644 --- a/phrase-extract/pcfg-extract/rule_extractor.h +++ b/phrase-extract/pcfg-extract/rule_extractor.h @@ -21,6 +21,8 @@ #ifndef PCFG_EXTRACT_RULE_EXTRACTOR_H_ #define PCFG_EXTRACT_RULE_EXTRACTOR_H_ +#include "SyntaxTree.h" + #include "pcfg-common/typedef.h" #include "rule_collection.h" @@ -32,14 +34,12 @@ namespace Syntax namespace PCFG { -class PcfgTree; - // Extracts PCFG rules from syntax trees and adds them to a RuleCollection. 
class RuleExtractor { public: RuleExtractor(Vocabulary &); - void Extract(const PcfgTree &, RuleCollection &) const; + void Extract(const MosesTraining::SyntaxTree &, RuleCollection &) const; private: Vocabulary &non_term_vocab_; }; diff --git a/phrase-extract/pcfg-score/pcfg_score.cc b/phrase-extract/pcfg-score/pcfg_score.cc index 314e0fb38..d656d2882 100644 --- a/phrase-extract/pcfg-score/pcfg_score.cc +++ b/phrase-extract/pcfg-score/pcfg_score.cc @@ -33,13 +33,14 @@ #include +#include "SyntaxTree.h" + #include "syntax-common/exception.h" +#include "syntax-common/xml_tree_parser.h" +#include "syntax-common/xml_tree_writer.h" #include "pcfg-common/pcfg.h" -#include "pcfg-common/pcfg_tree.h" -#include "pcfg-common/syntax_tree.h" #include "pcfg-common/typedef.h" -#include "pcfg-common/xml_tree_parser.h" namespace MosesTraining { @@ -65,15 +66,17 @@ int PcfgScore::Main(int argc, char *argv[]) // Score corpus according to PCFG. TreeScorer scorer(pcfg, non_term_vocab); - XmlTreeParser parser; - XmlTreeWriter writer; + std::set label_set; + std::map top_label_set; + XmlTreeParser parser(label_set, top_label_set); + XmlTreeWriter writer(std::cout); std::string line; std::size_t line_num = 0; - std::auto_ptr tree; + std::auto_ptr tree; while (std::getline(std::cin, line)) { ++line_num; try { - tree = parser.Parse(line); + tree = parser.Parse(line, true); } catch (Exception &e) { std::ostringstream msg; msg << "line " << line_num << ": " << e.msg(); @@ -93,7 +96,7 @@ int PcfgScore::Main(int argc, char *argv[]) std::cout << line << std::endl; continue; } - writer.Write(*tree, std::cout); + writer.Write(*tree); } return 0; diff --git a/phrase-extract/pcfg-score/tree_scorer.cc b/phrase-extract/pcfg-score/tree_scorer.cc index 74d6e79ef..61ae16e4c 100644 --- a/phrase-extract/pcfg-score/tree_scorer.cc +++ b/phrase-extract/pcfg-score/tree_scorer.cc @@ -20,6 +20,7 @@ #include "tree_scorer.h" #include +#include namespace MosesTraining { @@ -34,30 +35,41 @@ TreeScorer::TreeScorer(const Pcfg &pcfg, const Vocabulary &non_term_vocab) { } -bool TreeScorer::Score(PcfgTree &root) const +bool TreeScorer::Score(SyntaxTree &root) { - if (root.IsPreterminal() || root.IsLeaf()) { + scores_.clear(); + ZeroScores(root); + if (!CalcScores(root)) { + return false; + } + SetAttributes(root); + return true; +} + +bool TreeScorer::CalcScores(SyntaxTree &root) +{ + if (root.IsLeaf() || root.children()[0]->IsLeaf()) { return true; } - const std::vector &children = root.children(); + const std::vector &children = root.children(); double log_prob = 0.0; std::vector key; key.reserve(children.size()+1); - key.push_back(non_term_vocab_.Lookup(root.label())); + key.push_back(non_term_vocab_.Lookup(root.value().GetLabel())); - for (std::vector::const_iterator p(children.begin()); + for (std::vector::const_iterator p(children.begin()); p != children.end(); ++p) { - PcfgTree *child = *p; + SyntaxTree *child = *p; assert(!child->IsLeaf()); - key.push_back(non_term_vocab_.Lookup(child->label())); - if (!Score(*child)) { + key.push_back(non_term_vocab_.Lookup(child->value().GetLabel())); + if (!CalcScores(*child)) { return false; } - if (!child->IsPreterminal()) { - log_prob += child->score(); + if (!child->children()[0]->IsLeaf()) { + log_prob += scores_[child]; } } double rule_score; @@ -66,10 +78,42 @@ bool TreeScorer::Score(PcfgTree &root) const return false; } log_prob += rule_score; - root.set_score(log_prob); + scores_[&root] = log_prob; return true; } +void TreeScorer::SetAttributes(SyntaxTree &root) +{ + // Terminals don't need 
attributes. + if (root.IsLeaf()) { + return; + } + // Preterminals don't need attributes (they have the implicit score 0.0). + if (root.children()[0]->IsLeaf()) { + return; + } + double score = scores_[&root]; + if (score != 0.0) { + std::ostringstream out; + out << score; + root.value().attributes["pcfg"] = out.str(); + } + for (std::vector::const_iterator p(root.children().begin()); + p != root.children().end(); ++p) { + SetAttributes(**p); + } +} + +void TreeScorer::ZeroScores(SyntaxTree &root) +{ + scores_[&root] = 0.0f; + const std::vector &children = root.children(); + for (std::vector::const_iterator p(children.begin()); + p != children.end(); ++p) { + ZeroScores(**p); + } +} + } // namespace PCFG } // namespace Syntax } // namespace MosesTraining diff --git a/phrase-extract/pcfg-score/tree_scorer.h b/phrase-extract/pcfg-score/tree_scorer.h index 8b1afcc3a..cf9fdd1a3 100644 --- a/phrase-extract/pcfg-score/tree_scorer.h +++ b/phrase-extract/pcfg-score/tree_scorer.h @@ -21,8 +21,9 @@ #ifndef PCFG_SCORE_TREE_SCORER_H_ #define PCFG_SCORE_TREE_SCORER_H_ +#include "SyntaxTree.h" + #include "pcfg-common/pcfg.h" -#include "pcfg-common/pcfg_tree.h" #include "pcfg-common/typedef.h" namespace MosesTraining @@ -39,11 +40,16 @@ public: // Score tree according to PCFG. Returns false if unsuccessful (due to // missing rule). - bool Score(PcfgTree &) const; + bool Score(SyntaxTree &); private: const Pcfg &pcfg_; const Vocabulary &non_term_vocab_; + std::map scores_; + + bool CalcScores(SyntaxTree &); + void SetAttributes(SyntaxTree &); + void ZeroScores(SyntaxTree &); }; } // namespace PCFG diff --git a/phrase-extract/syntax-common/xml_tree_parser.cc b/phrase-extract/syntax-common/xml_tree_parser.cc index bf3c6d87e..6eeb110e9 100644 --- a/phrase-extract/syntax-common/xml_tree_parser.cc +++ b/phrase-extract/syntax-common/xml_tree_parser.cc @@ -20,13 +20,14 @@ XmlTreeParser::XmlTreeParser(std::set &labelSet, { } -std::auto_ptr XmlTreeParser::Parse(const std::string &line) +std::auto_ptr XmlTreeParser::Parse(const std::string &line, + bool unescape) { line_ = line; node_collection_.Clear(); try { if (!ProcessAndStripXMLTags(line_, node_collection_, label_set_, - top_label_set_, false)) { + top_label_set_, unescape)) { throw Exception(""); } } catch (const XmlException &e) { diff --git a/phrase-extract/syntax-common/xml_tree_parser.h b/phrase-extract/syntax-common/xml_tree_parser.h index e0b75c830..0f671c65a 100644 --- a/phrase-extract/syntax-common/xml_tree_parser.h +++ b/phrase-extract/syntax-common/xml_tree_parser.h @@ -15,18 +15,42 @@ namespace MosesTraining { namespace Syntax { -// Parses a string in Moses' XML parse tree format and returns a SyntaxTree -// object. This is a wrapper around the ProcessAndStripXMLTags function. +/** Parses string representations of parse trees in Moses' XML format and + * converts them to SyntaxTree objects. + * + * This is a thin wrapper around the ProcessAndStripXMLTags function. After + * calling Parse(), the output of the ProcessAndStripXMLTags function (the + * sentence, node collection, label set, and top label set) are available via + * accessors. + */ class XmlTreeParser { public: XmlTreeParser(std::set &, std::map &); - std::auto_ptr Parse(const std::string &); + //! Parse a single sentence and return a SyntaxTree (with words attached). + std::auto_ptr Parse(const std::string &, bool=false); - const std::vector& GetWords() { - return words_; - } + // TODO + //! 
Get the sentence string (see ProcessAndStripXMLTags) + //const std::string &sentence() const; + // FIXME + //! Get the sentence as a vector of tokens + const std::vector& GetWords() { return words_; } + + // TODO + //! Get the node collection (see ProcessAndStripXMLTags) + const SyntaxNodeCollection &node_collection() const; + + // TODO + //! Get the label set (see ProcessAndStripXMLTags) + const std::set &label_set() const; + + // TODO + //! Get the top label set (see ProcessAndStripXMLTags) + const std::map &top_label_set() const; + + // FIXME const SyntaxNodeCollection &GetNodeCollection() const { return node_collection_; } diff --git a/phrase-extract/syntax-common/xml_tree_writer.cc b/phrase-extract/syntax-common/xml_tree_writer.cc new file mode 100644 index 000000000..3c16cb2eb --- /dev/null +++ b/phrase-extract/syntax-common/xml_tree_writer.cc @@ -0,0 +1,82 @@ +#include "xml_tree_writer.h" + +#include +#include +#include +#include + +#include "SyntaxTree.h" +#include "XmlTree.h" + + +namespace MosesTraining { +namespace Syntax { + +void XmlTreeWriter::Write(const SyntaxTree &tree) const { + assert(!tree.IsLeaf()); + + // Opening tag + out_ << "first != "label") { + out_ << " " << p->first << "=\"" << p->second << "\""; + } + } + out_ << ">"; + + // Children + for (std::vector::const_iterator p = tree.children().begin(); + p != tree.children().end(); ++p) { + SyntaxTree &child = **p; + if (child.IsLeaf()) { + out_ << " " << Escape(child.value().GetLabel()); + } else { + out_ << " "; + Write(child); + } + } + + // Closing tag + out_ << " "; + + if (tree.parent() == 0) { + out_ << std::endl; + } +} + +// Escapes XML special characters. +std::string XmlTreeWriter::Escape(const std::string &s) const { + if (!escape_) { + return s; + } + std::string t; + std::size_t len = s.size(); + t.reserve(len); + for (std::size_t i = 0; i < len; ++i) { + if (s[i] == '<') { + t += "<"; + } else if (s[i] == '>') { + t += ">"; + } else if (s[i] == '[') { + t += "["; + } else if (s[i] == ']') { + t += "]"; + } else if (s[i] == '|') { + t += "|"; + } else if (s[i] == '&') { + t += "&"; + } else if (s[i] == '\'') { + t += "'"; + } else if (s[i] == '"') { + t += """; + } else { + t += s[i]; + } + } + return t; +} + +} // namespace Syntax +} // namespace MosesTraining diff --git a/phrase-extract/syntax-common/xml_tree_writer.h b/phrase-extract/syntax-common/xml_tree_writer.h new file mode 100644 index 000000000..b39d01fab --- /dev/null +++ b/phrase-extract/syntax-common/xml_tree_writer.h @@ -0,0 +1,27 @@ +#pragma once + +#include +#include + +#include "SyntaxTree.h" + +namespace MosesTraining { +namespace Syntax { + +class XmlTreeWriter { + public: + XmlTreeWriter(std::ostream &out, bool escape=true) + : out_(out) + , escape_(escape) {} + + void Write(const SyntaxTree &) const; + + private: + std::string Escape(const std::string &) const; + + std::ostream &out_; + bool escape_; +}; + +} // namespace Syntax +} // namespace MosesTraining From 2e21f051f217a6b835433cbc456bdcc841187ec0 Mon Sep 17 00:00:00 2001 From: Phil Williams Date: Wed, 3 Jun 2015 10:05:36 +0100 Subject: [PATCH 046/108] Ongoing moses/phrase-extract refactoring --- phrase-extract/extract-ghkm/ExtractGHKM.cpp | 43 +++++++-------- .../filter-rule-table/FilterRuleTable.cpp | 4 +- phrase-extract/pcfg-extract/pcfg_extract.cc | 4 +- phrase-extract/pcfg-score/pcfg_score.cc | 4 +- .../syntax-common/xml_tree_parser.cc | 15 ++---- .../syntax-common/xml_tree_parser.h | 53 ++++++++----------- 6 files changed, 49 insertions(+), 74 deletions(-) diff --git 
a/phrase-extract/extract-ghkm/ExtractGHKM.cpp b/phrase-extract/extract-ghkm/ExtractGHKM.cpp index 2293371ac..c48a37367 100644 --- a/phrase-extract/extract-ghkm/ExtractGHKM.cpp +++ b/phrase-extract/extract-ghkm/ExtractGHKM.cpp @@ -119,14 +119,6 @@ int ExtractGHKM::Main(int argc, char *argv[]) OpenOutputFileOrDie(options.unknownWordSoftMatchesFile, unknownWordSoftMatchesStream); } - // Target label sets for producing glue grammar. - std::set targetLabelSet; - std::map targetTopLabelSet; - - // Source label sets for producing glue grammar. - std::set sourceLabelSet; - std::map sourceTopLabelSet; - // Word count statistics for producing unknown word labels. std::map targetWordCount; std::map targetWordLabel; @@ -139,8 +131,8 @@ int ExtractGHKM::Main(int argc, char *argv[]) std::string sourceLine; std::string alignmentLine; Alignment alignment; - Syntax::XmlTreeParser targetXmlTreeParser(targetLabelSet, targetTopLabelSet); - Syntax::XmlTreeParser sourceXmlTreeParser(sourceLabelSet, sourceTopLabelSet); + Syntax::XmlTreeParser targetXmlTreeParser; + Syntax::XmlTreeParser sourceXmlTreeParser; ScfgRuleWriter scfgWriter(fwdExtractStream, invExtractStream, options); StsgRuleWriter stsgWriter(fwdExtractStream, invExtractStream, options); size_t lineNum = options.sentenceOffset; @@ -194,7 +186,7 @@ int ExtractGHKM::Main(int argc, char *argv[]) } Error(oss.str()); } - sourceTokens = sourceXmlTreeParser.GetWords(); + sourceTokens = sourceXmlTreeParser.words(); } // Read word alignments. @@ -240,7 +232,7 @@ int ExtractGHKM::Main(int argc, char *argv[]) // Initialize phrase orientation scoring object PhraseOrientation phraseOrientation(sourceTokens.size(), - targetXmlTreeParser.GetWords().size(), alignment); + targetXmlTreeParser.words().size(), alignment); // Write the rules, subject to scope pruning. const std::vector &targetNodes = graph.GetTargetNodes(); @@ -272,7 +264,7 @@ int ExtractGHKM::Main(int argc, char *argv[]) // SCFG output. 
ScfgRule *r = 0; if (options.sourceLabels) { - r = new ScfgRule(**q, &sourceXmlTreeParser.GetNodeCollection()); + r = new ScfgRule(**q, &sourceXmlTreeParser.node_collection()); } else { r = new ScfgRule(**q); } @@ -315,14 +307,14 @@ int ExtractGHKM::Main(int argc, char *argv[]) std::map sourceLabels; if (options.sourceLabels && !options.sourceLabelSetFile.empty()) { - - sourceLabelSet.insert("XLHS"); // non-matching label (left-hand side) - sourceLabelSet.insert("XRHS"); // non-matching label (right-hand side) - sourceLabelSet.insert("TOPLABEL"); // as used in the glue grammar - sourceLabelSet.insert("SOMELABEL"); // as used in the glue grammar + std::set extendedLabelSet = sourceXmlTreeParser.label_set(); + extendedLabelSet.insert("XLHS"); // non-matching label (left-hand side) + extendedLabelSet.insert("XRHS"); // non-matching label (right-hand side) + extendedLabelSet.insert("TOPLABEL"); // as used in the glue grammar + extendedLabelSet.insert("SOMELABEL"); // as used in the glue grammar size_t index = 0; - for (std::set::const_iterator iter=sourceLabelSet.begin(); - iter!=sourceLabelSet.end(); ++iter, ++index) { + for (std::set::const_iterator iter=extendedLabelSet.begin(); + iter!=extendedLabelSet.end(); ++iter, ++index) { sourceLabels.insert(std::pair(*iter,index)); } WriteSourceLabelSet(sourceLabels, sourceLabelSetStream); @@ -332,14 +324,18 @@ int ExtractGHKM::Main(int argc, char *argv[]) std::map strippedTargetTopLabelSet; if (options.stripBitParLabels && (!options.glueGrammarFile.empty() || !options.unknownWordSoftMatchesFile.empty())) { - StripBitParLabels(targetLabelSet, targetTopLabelSet, strippedTargetLabelSet, strippedTargetTopLabelSet); + StripBitParLabels(targetXmlTreeParser.label_set(), + targetXmlTreeParser.top_label_set(), + strippedTargetLabelSet, strippedTargetTopLabelSet); } if (!options.glueGrammarFile.empty()) { if (options.stripBitParLabels) { WriteGlueGrammar(strippedTargetLabelSet, strippedTargetTopLabelSet, sourceLabels, options, glueGrammarStream); } else { - WriteGlueGrammar(targetLabelSet, targetTopLabelSet, sourceLabels, options, glueGrammarStream); + WriteGlueGrammar(targetXmlTreeParser.label_set(), + targetXmlTreeParser.top_label_set(), + sourceLabels, options, glueGrammarStream); } } @@ -355,7 +351,8 @@ int ExtractGHKM::Main(int argc, char *argv[]) if (options.stripBitParLabels) { WriteUnknownWordSoftMatches(strippedTargetLabelSet, unknownWordSoftMatchesStream); } else { - WriteUnknownWordSoftMatches(targetLabelSet, unknownWordSoftMatchesStream); + WriteUnknownWordSoftMatches(targetXmlTreeParser.label_set(), + unknownWordSoftMatchesStream); } } diff --git a/phrase-extract/filter-rule-table/FilterRuleTable.cpp b/phrase-extract/filter-rule-table/FilterRuleTable.cpp index 0c6f132f8..32d2019cf 100644 --- a/phrase-extract/filter-rule-table/FilterRuleTable.cpp +++ b/phrase-extract/filter-rule-table/FilterRuleTable.cpp @@ -126,9 +126,7 @@ void FilterRuleTable::ReadTestSet( void FilterRuleTable::ReadTestSet( std::istream &input, std::vector > &sentences) { - std::set labelSet; - std::map topLabelSet; - XmlTreeParser parser(labelSet, topLabelSet); + XmlTreeParser parser; int lineNum = 0; std::string line; while (std::getline(input, line)) { diff --git a/phrase-extract/pcfg-extract/pcfg_extract.cc b/phrase-extract/pcfg-extract/pcfg_extract.cc index 8e7a40e07..87419edb7 100644 --- a/phrase-extract/pcfg-extract/pcfg_extract.cc +++ b/phrase-extract/pcfg-extract/pcfg_extract.cc @@ -60,9 +60,7 @@ int PcfgExtract::Main(int argc, char *argv[]) Vocabulary non_term_vocab; 
RuleExtractor rule_extractor(non_term_vocab); RuleCollection rule_collection; - std::set label_set; - std::map top_label_set; - XmlTreeParser parser(label_set, top_label_set); + XmlTreeParser parser; std::string line; std::size_t line_num = 0; std::auto_ptr tree; diff --git a/phrase-extract/pcfg-score/pcfg_score.cc b/phrase-extract/pcfg-score/pcfg_score.cc index d656d2882..e11f73f70 100644 --- a/phrase-extract/pcfg-score/pcfg_score.cc +++ b/phrase-extract/pcfg-score/pcfg_score.cc @@ -66,9 +66,7 @@ int PcfgScore::Main(int argc, char *argv[]) // Score corpus according to PCFG. TreeScorer scorer(pcfg, non_term_vocab); - std::set label_set; - std::map top_label_set; - XmlTreeParser parser(label_set, top_label_set); + XmlTreeParser parser; XmlTreeWriter writer(std::cout); std::string line; std::size_t line_num = 0; diff --git a/phrase-extract/syntax-common/xml_tree_parser.cc b/phrase-extract/syntax-common/xml_tree_parser.cc index 6eeb110e9..34f566a03 100644 --- a/phrase-extract/syntax-common/xml_tree_parser.cc +++ b/phrase-extract/syntax-common/xml_tree_parser.cc @@ -10,23 +10,18 @@ #include "XmlException.h" #include "XmlTree.h" +#include "exception.h" + namespace MosesTraining { namespace Syntax { -XmlTreeParser::XmlTreeParser(std::set &labelSet, - std::map &topLabelSet) - : label_set_(labelSet) - , top_label_set_(topLabelSet) -{ -} - std::auto_ptr XmlTreeParser::Parse(const std::string &line, bool unescape) { - line_ = line; + sentence_ = line; node_collection_.Clear(); try { - if (!ProcessAndStripXMLTags(line_, node_collection_, label_set_, + if (!ProcessAndStripXMLTags(sentence_, node_collection_, label_set_, top_label_set_, unescape)) { throw Exception(""); } @@ -34,7 +29,7 @@ std::auto_ptr XmlTreeParser::Parse(const std::string &line, throw Exception(e.getMsg()); } std::auto_ptr root = node_collection_.ExtractTree(); - words_ = util::tokenize(line_); + words_ = util::tokenize(sentence_); AttachWords(words_, *root); return root; } diff --git a/phrase-extract/syntax-common/xml_tree_parser.h b/phrase-extract/syntax-common/xml_tree_parser.h index 0f671c65a..48ea056b8 100644 --- a/phrase-extract/syntax-common/xml_tree_parser.h +++ b/phrase-extract/syntax-common/xml_tree_parser.h @@ -6,12 +6,9 @@ #include #include -#include "SyntaxNode.h" #include "SyntaxNodeCollection.h" #include "SyntaxTree.h" -#include "exception.h" - namespace MosesTraining { namespace Syntax { @@ -25,44 +22,36 @@ namespace Syntax { */ class XmlTreeParser { public: - XmlTreeParser(std::set &, std::map &); - //! Parse a single sentence and return a SyntaxTree (with words attached). - std::auto_ptr Parse(const std::string &, bool=false); + std::auto_ptr Parse(const std::string &, bool unescape=false); - // TODO - //! Get the sentence string (see ProcessAndStripXMLTags) - //const std::string &sentence() const; + //! Get the sentence string (as returned by ProcessAndStripXMLTags). + const std::string &sentence() const { return sentence_; } - // FIXME - //! Get the sentence as a vector of tokens - const std::vector& GetWords() { return words_; } + //! Get the sentence as a vector of words. + const std::vector &words() const { return words_; } - // TODO - //! Get the node collection (see ProcessAndStripXMLTags) - const SyntaxNodeCollection &node_collection() const; - - // TODO - //! Get the label set (see ProcessAndStripXMLTags) - const std::set &label_set() const; - - // TODO - //! 
Get the top label set (see ProcessAndStripXMLTags) - const std::map &top_label_set() const; - - // FIXME - const SyntaxNodeCollection &GetNodeCollection() const { + //! Get the node collection (as returned by ProcessAndStripXMLTags). + const SyntaxNodeCollection &node_collection() const { return node_collection_; } - private: - std::set &label_set_; - std::map &top_label_set_; - std::string line_; - SyntaxNodeCollection node_collection_; - std::vector words_; + //! Get the label set (as returned by ProcessAndStripXMLTags). + const std::set &label_set() const { return label_set_; } + //! Get the top label set (as returned by ProcessAndStripXMLTags). + const std::map &top_label_set() const { + return top_label_set_; + } + + private: void AttachWords(const std::vector &, SyntaxTree &); + + std::string sentence_; + SyntaxNodeCollection node_collection_; + std::set label_set_; + std::map top_label_set_; + std::vector words_; }; } // namespace Syntax From 5e09d3dc71ab8391c651418c01aa5c324e53683b Mon Sep 17 00:00:00 2001 From: Phil Williams Date: Wed, 3 Jun 2015 10:33:46 +0100 Subject: [PATCH 047/108] Ongoing moses/phrase-extract refactoring --- phrase-extract/SyntaxNode.h | 25 +------------- phrase-extract/SyntaxNodeCollection.cpp | 43 ------------------------- phrase-extract/SyntaxNodeCollection.h | 10 +----- phrase-extract/XmlTree.cpp | 5 --- phrase-extract/extract-rules-main.cpp | 19 ++++++++--- 5 files changed, 17 insertions(+), 85 deletions(-) diff --git a/phrase-extract/SyntaxNode.h b/phrase-extract/SyntaxNode.h index 5f57e1790..883f9724f 100644 --- a/phrase-extract/SyntaxNode.h +++ b/phrase-extract/SyntaxNode.h @@ -32,9 +32,6 @@ class SyntaxNode protected: int m_start, m_end; std::string m_label; - std::vector< SyntaxNode* > m_children; - SyntaxNode* m_parent; - float m_pcfgScore; public: typedef std::map AttributeMap; @@ -43,9 +40,7 @@ public: SyntaxNode( int startPos, int endPos, std::string label ) :m_start(startPos) ,m_end(endPos) - ,m_label(label) - ,m_parent(0) - ,m_pcfgScore(0.0f) { + ,m_label(label) { } int GetStart() const { return m_start; @@ -56,24 +51,6 @@ public: std::string GetLabel() const { return m_label; } - float GetPcfgScore() const { - return m_pcfgScore; - } - void SetPcfgScore(float score) { - m_pcfgScore = score; - } - SyntaxNode *GetParent() { - return m_parent; - } - void SetParent(SyntaxNode *parent) { - m_parent = parent; - } - void AddChild(SyntaxNode* child) { - m_children.push_back(child); - } - const std::vector< SyntaxNode* > &GetChildren() const { - return m_children; - } }; } // namespace MosesTraining diff --git a/phrase-extract/SyntaxNodeCollection.cpp b/phrase-extract/SyntaxNodeCollection.cpp index 60a2f6c2f..e1c9c44e1 100644 --- a/phrase-extract/SyntaxNodeCollection.cpp +++ b/phrase-extract/SyntaxNodeCollection.cpp @@ -33,7 +33,6 @@ SyntaxNodeCollection::~SyntaxNodeCollection() void SyntaxNodeCollection::Clear() { - m_top = 0; // loop through all m_nodes, delete them for(size_t i=0; i& SyntaxNodeCollection::GetNodes( int startPos, return endIndex->second; } -void SyntaxNodeCollection::ConnectNodes() -{ - typedef SyntaxTreeIndex2::const_reverse_iterator InnerIterator; - - SyntaxNode *prev = 0; - // Iterate over all start indices from lowest to highest. - for (SyntaxTreeIndexIterator p = m_index.begin(); p != m_index.end(); ++p) { - const SyntaxTreeIndex2 &inner = p->second; - // Iterate over all end indices from highest to lowest. 
- for (InnerIterator q = inner.rbegin(); q != inner.rend(); ++q) { - const std::vector &nodes = q->second; - // Iterate over all nodes that cover the same span in order of tree - // depth, top-most first. - for (std::vector::const_reverse_iterator r = nodes.rbegin(); - r != nodes.rend(); ++r) { - SyntaxNode *node = *r; - if (!prev) { - // node is the root. - m_top = node; - node->SetParent(0); - } else if (prev->GetStart() == node->GetStart()) { - // prev is the parent of node. - assert(prev->GetEnd() >= node->GetEnd()); - node->SetParent(prev); - prev->AddChild(node); - } else { - // prev is a descendant of node's parent. The lowest common - // ancestor of prev and node will be node's parent. - SyntaxNode *ancestor = prev->GetParent(); - while (ancestor->GetEnd() < node->GetEnd()) { - ancestor = ancestor->GetParent(); - } - assert(ancestor); - node->SetParent(ancestor); - ancestor->AddChild(node); - } - prev = node; - } - } - } -} - std::auto_ptr SyntaxNodeCollection::ExtractTree() { std::map nodeToTree; diff --git a/phrase-extract/SyntaxNodeCollection.h b/phrase-extract/SyntaxNodeCollection.h index a0d19841c..c8ca67d3d 100644 --- a/phrase-extract/SyntaxNodeCollection.h +++ b/phrase-extract/SyntaxNodeCollection.h @@ -38,7 +38,6 @@ class SyntaxNodeCollection { protected: std::vector< SyntaxNode* > m_nodes; - SyntaxNode* m_top; typedef std::map< int, std::vector< SyntaxNode* > > SyntaxTreeIndex2; typedef SyntaxTreeIndex2::const_iterator SyntaxTreeIndexIterator2; @@ -49,18 +48,12 @@ protected: std::vector< SyntaxNode* > m_emptyNode; public: - SyntaxNodeCollection() - : m_top(0) // m_top doesn't get set unless ConnectNodes is called. - , m_size(0) {} + SyntaxNodeCollection() : m_size(0) {} ~SyntaxNodeCollection(); SyntaxNode *AddNode( int startPos, int endPos, const std::string &label ); - SyntaxNode *GetTop() { - return m_top; - } - ParentNodes Parse(); bool HasNode( int startPos, int endPos ) const; const std::vector< SyntaxNode* >& GetNodes( int startPos, int endPos ) const; @@ -70,7 +63,6 @@ public: size_t GetNumWords() const { return m_size; } - void ConnectNodes(); void Clear(); std::auto_ptr ExtractTree(); diff --git a/phrase-extract/XmlTree.cpp b/phrase-extract/XmlTree.cpp index d3c5da900..ffbbd453a 100644 --- a/phrase-extract/XmlTree.cpp +++ b/phrase-extract/XmlTree.cpp @@ -398,10 +398,6 @@ bool ProcessAndStripXMLTags(string &line, SyntaxNodeCollection &nodeCollection, string label = ParseXmlTagAttribute(tagContent,"label"); labelCollection.insert( label ); - string pcfgString = ParseXmlTagAttribute(tagContent,"pcfg"); - float pcfgScore = pcfgString == "" ? 
0.0f - : std::atof(pcfgString.c_str()); - // report what we have processed so far if (0) { cerr << "XML TAG NAME IS: '" << tagName << "'" << endl; @@ -409,7 +405,6 @@ bool ProcessAndStripXMLTags(string &line, SyntaxNodeCollection &nodeCollection, cerr << "XML SPAN IS: " << startPos << "-" << (endPos-1) << endl; } SyntaxNode *node = nodeCollection.AddNode( startPos, endPos-1, label ); - node->SetPcfgScore(pcfgScore); ParseXmlTagAttributes(tagContent, node->attributes); } } diff --git a/phrase-extract/extract-rules-main.cpp b/phrase-extract/extract-rules-main.cpp index 825f12d89..8f1ff758b 100644 --- a/phrase-extract/extract-rules-main.cpp +++ b/phrase-extract/extract-rules-main.cpp @@ -110,6 +110,8 @@ void collectWordLabelCounts(SentenceAlignmentWithSyntax &sentence ); void writeGlueGrammar(const string &, RuleExtractionOptions &options, set< string > &targetLabelCollection, map< string, int > &targetTopLabelCollection); void writeUnknownWordLabel(const string &); +double getPcfgScore(const SyntaxNode &); + int main(int argc, char* argv[]) { @@ -564,8 +566,7 @@ string ExtractTask::saveTargetHieroPhrase( int startT, int endT, int startS, int } if (m_options.pcfgScore) { - double score = m_sentence.targetTree.GetNodes(currPos,hole.GetEnd(1))[labelI]->GetPcfgScore(); - logPCFGScore -= score; + logPCFGScore -= getPcfgScore(*m_sentence.targetTree.GetNodes(currPos,hole.GetEnd(1))[labelI]); } currPos = hole.GetEnd(1); @@ -689,7 +690,7 @@ void ExtractTask::saveHieroPhrase( int startT, int endT, int startS, int endS // target if (m_options.pcfgScore) { - double logPCFGScore = m_sentence.targetTree.GetNodes(startT,endT)[labelIndex[0]]->GetPcfgScore(); + double logPCFGScore = getPcfgScore(*m_sentence.targetTree.GetNodes(startT,endT)[labelIndex[0]]); rule.target = saveTargetHieroPhrase(startT, endT, startS, endS, indexT, holeColl, labelIndex, logPCFGScore, countS) + " [" + targetLabel + "]"; rule.pcfgScore = std::exp(logPCFGScore); @@ -973,7 +974,7 @@ void ExtractTask::addRule( int startT, int endT, int startS, int endS, int count rule.target += "[" + targetLabel + "]"; if (m_options.pcfgScore) { - double logPCFGScore = m_sentence.targetTree.GetNodes(startT,endT)[0]->GetPcfgScore(); + double logPCFGScore = getPcfgScore(*m_sentence.targetTree.GetNodes(startT,endT)[0]); rule.pcfgScore = std::exp(logPCFGScore); } @@ -1194,3 +1195,13 @@ void writeUnknownWordLabel(const string & fileName) outFile.close(); } + +double getPcfgScore(const SyntaxNode &node) +{ + double score = 0.0f; + SyntaxNode::AttributeMap::const_iterator p = node.attributes.find("pcfg"); + if (p != node.attributes.end()) { + score = std::atof(p->second.c_str()); + } + return score; +} From ed321791a75c6177b218a0098d184c308bc9c561 Mon Sep 17 00:00:00 2001 From: Phil Williams Date: Wed, 3 Jun 2015 11:10:45 +0100 Subject: [PATCH 048/108] Ongoing moses/phrase-extract refactoring --- phrase-extract/SyntaxNode.h | 36 +++++---------- phrase-extract/SyntaxNodeCollection.cpp | 8 ++-- phrase-extract/XmlTree.cpp | 2 +- .../extract-ghkm/AlignmentGraph.cpp | 3 +- phrase-extract/extract-ghkm/ExtractGHKM.cpp | 6 +-- phrase-extract/extract-ghkm/ScfgRule.cpp | 2 +- phrase-extract/extract-rules-main.cpp | 16 +++---- .../filter-rule-table/TreeTsgFilter.cpp | 2 +- phrase-extract/pcfg-extract/rule_extractor.cc | 4 +- phrase-extract/pcfg-score/tree_scorer.cc | 4 +- phrase-extract/relax-parse-main.cpp | 44 +++++++++---------- .../syntax-common/xml_tree_parser.cc | 6 +-- .../syntax-common/xml_tree_writer.cc | 4 +- 13 files changed, 62 insertions(+), 75 
deletions(-) diff --git a/phrase-extract/SyntaxNode.h b/phrase-extract/SyntaxNode.h index 883f9724f..f38e94713 100644 --- a/phrase-extract/SyntaxNode.h +++ b/phrase-extract/SyntaxNode.h @@ -20,37 +20,23 @@ #pragma once #include -#include #include -#include -namespace MosesTraining -{ +namespace MosesTraining { -class SyntaxNode -{ -protected: - int m_start, m_end; - std::string m_label; -public: +struct SyntaxNode { typedef std::map AttributeMap; - AttributeMap attributes; + SyntaxNode(const std::string &label_, int start_, int end_) + : label(label_) + , start(start_) + , end(end_) { + } - SyntaxNode( int startPos, int endPos, std::string label ) - :m_start(startPos) - ,m_end(endPos) - ,m_label(label) { - } - int GetStart() const { - return m_start; - } - int GetEnd() const { - return m_end; - } - std::string GetLabel() const { - return m_label; - } + std::string label; + int start; + int end; + AttributeMap attributes; }; } // namespace MosesTraining diff --git a/phrase-extract/SyntaxNodeCollection.cpp b/phrase-extract/SyntaxNodeCollection.cpp index e1c9c44e1..7421cc0ed 100644 --- a/phrase-extract/SyntaxNodeCollection.cpp +++ b/phrase-extract/SyntaxNodeCollection.cpp @@ -44,7 +44,7 @@ void SyntaxNodeCollection::Clear() SyntaxNode *SyntaxNodeCollection::AddNode(int startPos, int endPos, const std::string &label) { - SyntaxNode* newNode = new SyntaxNode( startPos, endPos, label ); + SyntaxNode* newNode = new SyntaxNode(label, startPos, endPos); m_nodes.push_back( newNode ); m_index[ startPos ][ endPos ].push_back( newNode ); m_size = std::max(endPos+1, m_size); @@ -141,16 +141,16 @@ std::auto_ptr SyntaxNodeCollection::ExtractTree() // node is the root. root = tree; tree->parent() = 0; - } else if (prevNode->GetStart() == node->GetStart()) { + } else if (prevNode->start == node->start) { // prevNode is the parent of node. - assert(prevNode->GetEnd() >= node->GetEnd()); + assert(prevNode->end >= node->end); tree->parent() = prevTree; prevTree->children().push_back(tree); } else { // prevNode is a descendant of node's parent. The lowest common // ancestor of prevNode and node will be node's parent. SyntaxTree *ancestor = prevTree->parent(); - while (ancestor->value().GetEnd() < tree->value().GetEnd()) { + while (ancestor->value().end < tree->value().end) { ancestor = ancestor->parent(); } assert(ancestor); diff --git a/phrase-extract/XmlTree.cpp b/phrase-extract/XmlTree.cpp index ffbbd453a..d8b77b6e6 100644 --- a/phrase-extract/XmlTree.cpp +++ b/phrase-extract/XmlTree.cpp @@ -419,7 +419,7 @@ bool ProcessAndStripXMLTags(string &line, SyntaxNodeCollection &nodeCollection, const vector< SyntaxNode* >& topNodes = nodeCollection.GetNodes( 0, wordPos-1 ); for( vector< SyntaxNode* >::const_iterator node = topNodes.begin(); node != topNodes.end(); node++ ) { SyntaxNode *n = *node; - const string &label = n->GetLabel(); + const string &label = n->label; if (topLabelCollection.find( label ) == topLabelCollection.end()) topLabelCollection[ label ] = 0; topLabelCollection[ label ]++; diff --git a/phrase-extract/extract-ghkm/AlignmentGraph.cpp b/phrase-extract/extract-ghkm/AlignmentGraph.cpp index 1a3c23de5..7c179295f 100644 --- a/phrase-extract/extract-ghkm/AlignmentGraph.cpp +++ b/phrase-extract/extract-ghkm/AlignmentGraph.cpp @@ -21,6 +21,7 @@ #include #include +#include #include #include @@ -213,7 +214,7 @@ Node *AlignmentGraph::CopyParseTree(const SyntaxTree *root) { NodeType nodeType = (root->IsLeaf()) ? 
TARGET : TREE; - std::auto_ptr n(new Node(root->value().GetLabel(), nodeType)); + std::auto_ptr n(new Node(root->value().label, nodeType)); if (nodeType == TREE) { float score = 0.0f; diff --git a/phrase-extract/extract-ghkm/ExtractGHKM.cpp b/phrase-extract/extract-ghkm/ExtractGHKM.cpp index c48a37367..c96cda146 100644 --- a/phrase-extract/extract-ghkm/ExtractGHKM.cpp +++ b/phrase-extract/extract-ghkm/ExtractGHKM.cpp @@ -813,7 +813,7 @@ void ExtractGHKM::CollectWordLabelCounts( for (SyntaxTree::ConstLeafIterator p(root); p != SyntaxTree::ConstLeafIterator(); ++p) { const SyntaxTree &leaf = *p; - const std::string &word = leaf.value().GetLabel(); + const std::string &word = leaf.value().label; const SyntaxTree *ancestor = leaf.parent(); // If unary rule elimination is enabled and this word is at the end of a // chain of unary rewrites, e.g. @@ -825,7 +825,7 @@ void ExtractGHKM::CollectWordLabelCounts( ancestor->parent()->children().size() == 1) { ancestor = ancestor->parent(); } - const std::string &label = ancestor->value().GetLabel(); + const std::string &label = ancestor->value().label; ++wordCount[word]; wordLabel[word] = label; } @@ -837,7 +837,7 @@ std::vector ExtractGHKM::ReadTokens(const SyntaxTree &root) const for (SyntaxTree::ConstLeafIterator p(root); p != SyntaxTree::ConstLeafIterator(); ++p) { const SyntaxTree &leaf = *p; - const std::string &word = leaf.value().GetLabel(); + const std::string &word = leaf.value().label; tokens.push_back(word); } return tokens; diff --git a/phrase-extract/extract-ghkm/ScfgRule.cpp b/phrase-extract/extract-ghkm/ScfgRule.cpp index a6fc19dd9..1a49c862e 100644 --- a/phrase-extract/extract-ghkm/ScfgRule.cpp +++ b/phrase-extract/extract-ghkm/ScfgRule.cpp @@ -144,7 +144,7 @@ void ScfgRule::PushSourceLabel(const SyntaxNodeCollection *sourceNodeCollection, sourceNodeCollection->GetNodes(span.first,span.second); if (!sourceLabels.empty()) { // store the topmost matching label from the source syntax tree - m_sourceLabels.push_back(sourceLabels.back()->GetLabel()); + m_sourceLabels.push_back(sourceLabels.back()->label); } } else { // no matching source-side syntactic constituent: store nonMatchingLabel diff --git a/phrase-extract/extract-rules-main.cpp b/phrase-extract/extract-rules-main.cpp index 8f1ff758b..e6fff965d 100644 --- a/phrase-extract/extract-rules-main.cpp +++ b/phrase-extract/extract-rules-main.cpp @@ -507,7 +507,7 @@ void ExtractTask::preprocessSourceHieroPhrase( int startT, int endT, int startS, int labelI = labelIndex[ 2+holeCount+holeTotal ]; string label = m_options.sourceSyntax ? 
- m_sentence.sourceTree.GetNodes(currPos,hole.GetEnd(0))[ labelI ]->GetLabel() : "X"; + m_sentence.sourceTree.GetNodes(currPos,hole.GetEnd(0))[ labelI ]->label : "X"; hole.SetLabel(label, 0); currPos = hole.GetEnd(0); @@ -550,7 +550,7 @@ string ExtractTask::saveTargetHieroPhrase( int startT, int endT, int startS, int int labelI = labelIndex[ 2+holeCount ]; string targetLabel; if (m_options.targetSyntax) { - targetLabel = m_sentence.targetTree.GetNodes(currPos,hole.GetEnd(1))[labelI]->GetLabel(); + targetLabel = m_sentence.targetTree.GetNodes(currPos,hole.GetEnd(1))[labelI]->label; } else if (m_options.boundaryRules && (startS == 0 || endS == countS - 1)) { targetLabel = "S"; } else { @@ -675,7 +675,7 @@ void ExtractTask::saveHieroPhrase( int startT, int endT, int startS, int endS // phrase labels string targetLabel; if (m_options.targetSyntax) { - targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[labelIndex[0] ]->GetLabel(); + targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[labelIndex[0] ]->label; } else if (m_options.boundaryRules && (startS == 0 || endS == countS - 1)) { targetLabel = "S"; } else { @@ -683,7 +683,7 @@ void ExtractTask::saveHieroPhrase( int startT, int endT, int startS, int endS } string sourceLabel = m_options.sourceSyntax ? - m_sentence.sourceTree.GetNodes(startS,endS)[ labelIndex[1] ]->GetLabel() : "X"; + m_sentence.sourceTree.GetNodes(startS,endS)[ labelIndex[1] ]->label : "X"; // create non-terms on the source side preprocessSourceHieroPhrase(startT, endT, startS, endS, indexS, holeColl, labelIndex); @@ -947,13 +947,13 @@ void ExtractTask::addRule( int startT, int endT, int startS, int endS, int count // phrase labels string targetLabel,sourceLabel; if (m_options.targetSyntax && m_options.conditionOnTargetLhs) { - sourceLabel = targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[0]->GetLabel(); + sourceLabel = targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[0]->label; } else { sourceLabel = m_options.sourceSyntax ? 
- m_sentence.sourceTree.GetNodes(startS,endS)[0]->GetLabel() : "X"; + m_sentence.sourceTree.GetNodes(startS,endS)[0]->label : "X"; if (m_options.targetSyntax) { - targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[0]->GetLabel(); + targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[0]->label; } else if (m_options.boundaryRules && (startS == 0 || endS == countS - 1)) { targetLabel = "S"; } else { @@ -1166,7 +1166,7 @@ void collectWordLabelCounts( SentenceAlignmentWithSyntax &sentence ) const vector< SyntaxNode* >& labels = sentence.targetTree.GetNodes(ti,ti); if (labels.size() > 0) { wordCount[ word ]++; - wordLabel[ word ] = labels[0]->GetLabel(); + wordLabel[ word ] = labels[0]->label; } } } diff --git a/phrase-extract/filter-rule-table/TreeTsgFilter.cpp b/phrase-extract/filter-rule-table/TreeTsgFilter.cpp index 17a8dcb22..b9c58228d 100644 --- a/phrase-extract/filter-rule-table/TreeTsgFilter.cpp +++ b/phrase-extract/filter-rule-table/TreeTsgFilter.cpp @@ -27,7 +27,7 @@ TreeTsgFilter::TreeTsgFilter( TreeTsgFilter::IdTree *TreeTsgFilter::SyntaxTreeToIdTree(const SyntaxTree &s) { - IdTree *t = new IdTree(m_testVocab.Insert(s.value().GetLabel())); + IdTree *t = new IdTree(m_testVocab.Insert(s.value().label)); const std::vector &sChildren = s.children(); std::vector &tChildren = t->children(); tChildren.reserve(sChildren.size()); diff --git a/phrase-extract/pcfg-extract/rule_extractor.cc b/phrase-extract/pcfg-extract/rule_extractor.cc index 39da54ef2..f20f2d978 100644 --- a/phrase-extract/pcfg-extract/rule_extractor.cc +++ b/phrase-extract/pcfg-extract/rule_extractor.cc @@ -37,7 +37,7 @@ void RuleExtractor::Extract(const SyntaxTree &tree, RuleCollection &rc) const return; } - std::size_t lhs = non_term_vocab_.Insert(tree.value().GetLabel()); + std::size_t lhs = non_term_vocab_.Insert(tree.value().label); std::vector rhs; const std::vector &children = tree.children(); @@ -45,7 +45,7 @@ void RuleExtractor::Extract(const SyntaxTree &tree, RuleCollection &rc) const for (std::vector::const_iterator p(children.begin()); p != children.end(); ++p) { const SyntaxTree &child = **p; - rhs.push_back(non_term_vocab_.Insert(child.value().GetLabel())); + rhs.push_back(non_term_vocab_.Insert(child.value().label)); Extract(child, rc); } rc.Add(lhs, rhs); diff --git a/phrase-extract/pcfg-score/tree_scorer.cc b/phrase-extract/pcfg-score/tree_scorer.cc index 61ae16e4c..3c6b6b0c8 100644 --- a/phrase-extract/pcfg-score/tree_scorer.cc +++ b/phrase-extract/pcfg-score/tree_scorer.cc @@ -58,13 +58,13 @@ bool TreeScorer::CalcScores(SyntaxTree &root) std::vector key; key.reserve(children.size()+1); - key.push_back(non_term_vocab_.Lookup(root.value().GetLabel())); + key.push_back(non_term_vocab_.Lookup(root.value().label)); for (std::vector::const_iterator p(children.begin()); p != children.end(); ++p) { SyntaxTree *child = *p; assert(!child->IsLeaf()); - key.push_back(non_term_vocab_.Lookup(child->value().GetLabel())); + key.push_back(non_term_vocab_.Lookup(child->value().label)); if (!CalcScores(*child)) { return false; } diff --git a/phrase-extract/relax-parse-main.cpp b/phrase-extract/relax-parse-main.cpp index 5bca886bf..4b5c2d573 100644 --- a/phrase-extract/relax-parse-main.cpp +++ b/phrase-extract/relax-parse-main.cpp @@ -118,9 +118,9 @@ void store( SyntaxNodeCollection &tree, const vector< string > &words ) // output tree nodes vector< SyntaxNode* > nodes = tree.GetAllNodes(); for( size_t i=0; iGetStart() - << "-" << nodes[i]->GetEnd() - << "\" label=\"" << nodes[i]->GetLabel() + cout << " start + << 
"-" << nodes[i]->end + << "\" label=\"" << nodes[i]->label << "\"/>"; } cout << endl; @@ -133,7 +133,7 @@ void LeftBinarize( SyntaxNodeCollection &tree, ParentNodes &parents ) if (point.size() > 3) { const vector< SyntaxNode* >& topNodes = tree.GetNodes( point[0], point[point.size()-1]-1); - string topLabel = topNodes[0]->GetLabel(); + string topLabel = topNodes[0]->label; for(size_t i=2; i& topNodes = tree.GetNodes( point[0], endPoint); - string topLabel = topNodes[0]->GetLabel(); + string topLabel = topNodes[0]->label; for(size_t i=1; iGetLabel() << "+" << tree.GetNodes(point[i+1],point[i+2]-1)[0]->GetLabel() << endl; + // cerr << "\tadding " << point[i] << ";" << point[i+1] << ";" << (point[i+2]-1) << ": " << tree.GetNodes(point[i ],point[i+1]-1)[0]->label << "+" << tree.GetNodes(point[i+1],point[i+2]-1)[0]->label << endl; newTree.AddNode( point[i],point[i+2]-1, - tree.GetNodes(point[i ],point[i+1]-1)[0]->GetLabel() + tree.GetNodes(point[i ],point[i+1]-1)[0]->label + "+" + - tree.GetNodes(point[i+1],point[i+2]-1)[0]->GetLabel() ); + tree.GetNodes(point[i+1],point[i+2]-1)[0]->label); } } if (point.size() >= 4) { int ps = point.size(); - string topLabel = tree.GetNodes(point[0],point[ps-1]-1)[0]->GetLabel(); + string topLabel = tree.GetNodes(point[0],point[ps-1]-1)[0]->label; - // cerr << "\tadding " << topLabel + "\\" + tree.GetNodes(point[0],point[1]-1)[0]->GetLabel() << endl; + // cerr << "\tadding " << topLabel + "\\" + tree.GetNodes(point[0],point[1]-1)[0]->label << endl; newTree.AddNode( point[1],point[ps-1]-1, topLabel + "\\" + - tree.GetNodes(point[0],point[1]-1)[0]->GetLabel() ); + tree.GetNodes(point[0],point[1]-1)[0]->label ); - // cerr << "\tadding " << topLabel + "/" + tree.GetNodes(point[ps-2],point[ps-1]-1)[0]->GetLabel() << endl; + // cerr << "\tadding " << topLabel + "/" + tree.GetNodes(point[ps-2],point[ps-1]-1)[0]->label << endl; newTree.AddNode( point[0],point[ps-2]-1, topLabel + "/" + - tree.GetNodes(point[ps-2],point[ps-1]-1)[0]->GetLabel() ); + tree.GetNodes(point[ps-2],point[ps-1]-1)[0]->label ); } } @@ -219,12 +219,12 @@ void SAMT( SyntaxNodeCollection &tree, ParentNodes &parents ) for(int mid=start+1; mid<=end && !done; mid++) { if (tree.HasNode(start,mid-1) && tree.HasNode(mid,end)) { - // cerr << "\tadding " << tree.GetNodes(start,mid-1)[0]->GetLabel() << "++" << tree.GetNodes(mid, end )[0]->GetLabel() << endl; + // cerr << "\tadding " << tree.GetNodes(start,mid-1)[0]->label << "++" << tree.GetNodes(mid, end )[0]->label << endl; newTree.AddNode( start, end, - tree.GetNodes(start,mid-1)[0]->GetLabel() + tree.GetNodes(start,mid-1)[0]->label + "++" + - tree.GetNodes(mid, end )[0]->GetLabel() ); + tree.GetNodes(mid, end )[0]->label ); done = true; } } @@ -234,9 +234,9 @@ void SAMT( SyntaxNodeCollection &tree, ParentNodes &parents ) for(int postEnd=end+1; postEndGetLabel() + tree.GetNodes(start,postEnd)[0]->label + "//" + - tree.GetNodes(end+1,postEnd)[0]->GetLabel() ); + tree.GetNodes(end+1,postEnd)[0]->label ); done = true; } } @@ -245,11 +245,11 @@ void SAMT( SyntaxNodeCollection &tree, ParentNodes &parents ) // if matching a constituent A left-minus constituent B: use A\\B for(int preStart=start-1; preStart>=0; preStart--) { if (tree.HasNode(preStart,end) && tree.HasNode(preStart,start-1)) { - // cerr << "\tadding " << tree.GetNodes(preStart,end )[0]->GetLabel() << "\\\\" <GetLabel() << endl; + // cerr << "\tadding " << tree.GetNodes(preStart,end )[0]->label << "\\\\" <label << endl; newTree.AddNode( start, end, - tree.GetNodes(preStart,end )[0]->GetLabel() + 
tree.GetNodes(preStart,end )[0]->label + "\\\\" + - tree.GetNodes(preStart,start-1)[0]->GetLabel() ); + tree.GetNodes(preStart,start-1)[0]->label ); done = true; } } @@ -268,6 +268,6 @@ void SAMT( SyntaxNodeCollection &tree, ParentNodes &parents ) // adding all new nodes vector< SyntaxNode* > nodes = newTree.GetAllNodes(); for( size_t i=0; iGetStart(), nodes[i]->GetEnd(), nodes[i]->GetLabel()); + tree.AddNode( nodes[i]->start, nodes[i]->end, nodes[i]->label); } } diff --git a/phrase-extract/syntax-common/xml_tree_parser.cc b/phrase-extract/syntax-common/xml_tree_parser.cc index 34f566a03..8bd511522 100644 --- a/phrase-extract/syntax-common/xml_tree_parser.cc +++ b/phrase-extract/syntax-common/xml_tree_parser.cc @@ -47,15 +47,15 @@ void XmlTreeParser::AttachWords(const std::vector &words, for (std::vector::iterator p = leaves.begin(); p != leaves.end(); ++p) { SyntaxTree *leaf = *p; - const int start = leaf->value().GetStart(); - const int end = leaf->value().GetEnd(); + const int start = leaf->value().start; + const int end = leaf->value().end; if (start != end) { std::ostringstream msg; msg << "leaf node covers multiple words (" << start << "-" << end << "): this is currently unsupported"; throw Exception(msg.str()); } - SyntaxTree *newLeaf = new SyntaxTree(SyntaxNode(start, end, *q++)); + SyntaxTree *newLeaf = new SyntaxTree(SyntaxNode(*q++, start, end)); leaf->children().push_back(newLeaf); newLeaf->parent() = leaf; } diff --git a/phrase-extract/syntax-common/xml_tree_writer.cc b/phrase-extract/syntax-common/xml_tree_writer.cc index 3c16cb2eb..d17937fa8 100644 --- a/phrase-extract/syntax-common/xml_tree_writer.cc +++ b/phrase-extract/syntax-common/xml_tree_writer.cc @@ -16,7 +16,7 @@ void XmlTreeWriter::Write(const SyntaxTree &tree) const { assert(!tree.IsLeaf()); // Opening tag - out_ << " Date: Wed, 3 Jun 2015 14:09:49 +0100 Subject: [PATCH 049/108] Ongoing moses/phrase-extract refactoring --- phrase-extract/SyntaxNodeCollection.cpp | 20 ++++++----- phrase-extract/SyntaxNodeCollection.h | 44 ++++++++++++++++--------- 2 files changed, 39 insertions(+), 25 deletions(-) diff --git a/phrase-extract/SyntaxNodeCollection.cpp b/phrase-extract/SyntaxNodeCollection.cpp index 7421cc0ed..356c49bf4 100644 --- a/phrase-extract/SyntaxNodeCollection.cpp +++ b/phrase-extract/SyntaxNodeCollection.cpp @@ -47,7 +47,7 @@ SyntaxNode *SyntaxNodeCollection::AddNode(int startPos, int endPos, SyntaxNode* newNode = new SyntaxNode(label, startPos, endPos); m_nodes.push_back( newNode ); m_index[ startPos ][ endPos ].push_back( newNode ); - m_size = std::max(endPos+1, m_size); + m_numWords = std::max(endPos+1, m_numWords); return newNode; } @@ -56,8 +56,8 @@ ParentNodes SyntaxNodeCollection::Parse() ParentNodes parents; // looping through all spans of size >= 2 - for( int length=2; length<=m_size; length++ ) { - for( int startPos = 0; startPos <= m_size-length; startPos++ ) { + for( int length=2; length<=m_numWords; length++ ) { + for( int startPos = 0; startPos <= m_numWords-length; startPos++ ) { if (HasNode( startPos, startPos+length-1 )) { // processing one (parent) span @@ -96,13 +96,14 @@ bool SyntaxNodeCollection::HasNode( int startPos, int endPos ) const return GetNodes( startPos, endPos).size() > 0; } -const std::vector< SyntaxNode* >& SyntaxNodeCollection::GetNodes( int startPos, int endPos ) const +const std::vector< SyntaxNode* >& SyntaxNodeCollection::GetNodes( + int startPos, int endPos ) const { - SyntaxTreeIndexIterator startIndex = m_index.find( startPos ); + NodeIndex::const_iterator startIndex 
= m_index.find( startPos ); if (startIndex == m_index.end() ) return m_emptyNode; - SyntaxTreeIndexIterator2 endIndex = startIndex->second.find( endPos ); + InnerNodeIndex::const_iterator endIndex = startIndex->second.find( endPos ); if (endIndex == startIndex->second.end()) return m_emptyNode; @@ -120,14 +121,15 @@ std::auto_ptr SyntaxNodeCollection::ExtractTree() } // Connect the SyntaxTrees. - typedef SyntaxTreeIndex2::const_reverse_iterator InnerIterator; + typedef NodeIndex::const_iterator OuterIterator; + typedef InnerNodeIndex::const_reverse_iterator InnerIterator; SyntaxTree *root = 0; SyntaxNode *prevNode = 0; SyntaxTree *prevTree = 0; // Iterate over all start indices from lowest to highest. - for (SyntaxTreeIndexIterator p = m_index.begin(); p != m_index.end(); ++p) { - const SyntaxTreeIndex2 &inner = p->second; + for (OuterIterator p = m_index.begin(); p != m_index.end(); ++p) { + const InnerNodeIndex &inner = p->second; // Iterate over all end indices from highest to lowest. for (InnerIterator q = inner.rbegin(); q != inner.rend(); ++q) { const std::vector &nodes = q->second; diff --git a/phrase-extract/SyntaxNodeCollection.h b/phrase-extract/SyntaxNodeCollection.h index c8ca67d3d..060192980 100644 --- a/phrase-extract/SyntaxNodeCollection.h +++ b/phrase-extract/SyntaxNodeCollection.h @@ -34,38 +34,50 @@ namespace MosesTraining typedef std::vector< int > SplitPoints; typedef std::vector< SplitPoints > ParentNodes; +/** A collection of SyntaxNodes organized by start and end position. + * + */ class SyntaxNodeCollection { -protected: - std::vector< SyntaxNode* > m_nodes; - - typedef std::map< int, std::vector< SyntaxNode* > > SyntaxTreeIndex2; - typedef SyntaxTreeIndex2::const_iterator SyntaxTreeIndexIterator2; - typedef std::map< int, SyntaxTreeIndex2 > SyntaxTreeIndex; - typedef SyntaxTreeIndex::const_iterator SyntaxTreeIndexIterator; - SyntaxTreeIndex m_index; - int m_size; - std::vector< SyntaxNode* > m_emptyNode; - public: - SyntaxNodeCollection() : m_size(0) {} + SyntaxNodeCollection() : m_numWords(0) {} ~SyntaxNodeCollection(); + //! Construct and insert a new SyntaxNode. SyntaxNode *AddNode( int startPos, int endPos, const std::string &label ); + // TODO Rename (and move?) ParentNodes Parse(); + + //! Return true iff there are one or more SyntaxNodes with the given span. bool HasNode( int startPos, int endPos ) const; + + //! Lookup the SyntaxNodes for a given span. const std::vector< SyntaxNode* >& GetNodes( int startPos, int endPos ) const; - const std::vector< SyntaxNode* >& GetAllNodes() { - return m_nodes; - }; + + //! Get a vector of pointers to all SyntaxNodes (unordered). + const std::vector< SyntaxNode* >& GetAllNodes() { return m_nodes; }; + size_t GetNumWords() const { - return m_size; + return m_numWords; } void Clear(); std::auto_ptr ExtractTree(); + +private: + typedef std::map< int, std::vector< SyntaxNode* > > InnerNodeIndex; + typedef std::map< int, InnerNodeIndex > NodeIndex; + + // Not copyable. 
+ SyntaxNodeCollection(const SyntaxNodeCollection &); + SyntaxNodeCollection &operator=(const SyntaxNodeCollection &); + + std::vector< SyntaxNode* > m_nodes; + NodeIndex m_index; + int m_numWords; + std::vector< SyntaxNode* > m_emptyNode; }; } // namespace MosesTraining From 8653bd81590d1f9f658d9560458dc72d9556e197 Mon Sep 17 00:00:00 2001 From: Phil Williams Date: Wed, 3 Jun 2015 14:20:00 +0100 Subject: [PATCH 050/108] Ongoing moses/phrase-extract refactoring --- phrase-extract/SyntaxNodeCollection.cpp | 40 ---------------------- phrase-extract/SyntaxNodeCollection.h | 6 ---- phrase-extract/relax-parse-main.cpp | 44 ++++++++++++++++++++++++- phrase-extract/relax-parse.h | 10 ++++-- 4 files changed, 50 insertions(+), 50 deletions(-) diff --git a/phrase-extract/SyntaxNodeCollection.cpp b/phrase-extract/SyntaxNodeCollection.cpp index 356c49bf4..0a344fcd7 100644 --- a/phrase-extract/SyntaxNodeCollection.cpp +++ b/phrase-extract/SyntaxNodeCollection.cpp @@ -51,46 +51,6 @@ SyntaxNode *SyntaxNodeCollection::AddNode(int startPos, int endPos, return newNode; } -ParentNodes SyntaxNodeCollection::Parse() -{ - ParentNodes parents; - - // looping through all spans of size >= 2 - for( int length=2; length<=m_numWords; length++ ) { - for( int startPos = 0; startPos <= m_numWords-length; startPos++ ) { - if (HasNode( startPos, startPos+length-1 )) { - // processing one (parent) span - - //std::cerr << "# " << startPos << "-" << (startPos+length-1) << ":"; - SplitPoints splitPoints; - splitPoints.push_back( startPos ); - //std::cerr << " " << startPos; - - int first = 1; - int covered = 0; - int found_somehing = 1; // break loop if nothing found - while( covered < length && found_somehing ) { - // find largest covering subspan (child) - // starting at last covered position - found_somehing = 0; - for( int midPos=length-first; midPos>covered; midPos-- ) { - if( HasNode( startPos+covered, startPos+midPos-1 ) ) { - covered = midPos; - splitPoints.push_back( startPos+covered ); - // std::cerr << " " << ( startPos+covered ); - first = 0; - found_somehing = 1; - } - } - } - // std::cerr << std::endl; - parents.push_back( splitPoints ); - } - } - } - return parents; -} - bool SyntaxNodeCollection::HasNode( int startPos, int endPos ) const { return GetNodes( startPos, endPos).size() > 0; diff --git a/phrase-extract/SyntaxNodeCollection.h b/phrase-extract/SyntaxNodeCollection.h index 060192980..8de151c55 100644 --- a/phrase-extract/SyntaxNodeCollection.h +++ b/phrase-extract/SyntaxNodeCollection.h @@ -31,9 +31,6 @@ namespace MosesTraining { -typedef std::vector< int > SplitPoints; -typedef std::vector< SplitPoints > ParentNodes; - /** A collection of SyntaxNodes organized by start and end position. * */ @@ -47,9 +44,6 @@ public: //! Construct and insert a new SyntaxNode. SyntaxNode *AddNode( int startPos, int endPos, const std::string &label ); - // TODO Rename (and move?) - ParentNodes Parse(); - //! Return true iff there are one or more SyntaxNodes with the given span. 
bool HasNode( int startPos, int endPos ) const; diff --git a/phrase-extract/relax-parse-main.cpp b/phrase-extract/relax-parse-main.cpp index 4b5c2d573..f7a2a271b 100644 --- a/phrase-extract/relax-parse-main.cpp +++ b/phrase-extract/relax-parse-main.cpp @@ -50,7 +50,7 @@ int main(int argc, char* argv[]) // output tree // cerr << "BEFORE:" << endl << tree; - ParentNodes parents = tree.Parse(); + ParentNodes parents = determineSplitPoints(tree); // execute selected grammar relaxation schemes if (leftBinarizeFlag) @@ -271,3 +271,45 @@ void SAMT( SyntaxNodeCollection &tree, ParentNodes &parents ) tree.AddNode( nodes[i]->start, nodes[i]->end, nodes[i]->label); } } + +ParentNodes determineSplitPoints(const SyntaxNodeCollection &nodeColl) +{ + ParentNodes parents; + + const std::size_t numWords = nodeColl.GetNumWords(); + + // looping through all spans of size >= 2 + for( int length=2; length<=numWords; length++ ) { + for( int startPos = 0; startPos <= numWords-length; startPos++ ) { + if (nodeColl.HasNode( startPos, startPos+length-1 )) { + // processing one (parent) span + + //std::cerr << "# " << startPos << "-" << (startPos+length-1) << ":"; + SplitPoints splitPoints; + splitPoints.push_back( startPos ); + //std::cerr << " " << startPos; + + int first = 1; + int covered = 0; + int found_somehing = 1; // break loop if nothing found + while( covered < length && found_somehing ) { + // find largest covering subspan (child) + // starting at last covered position + found_somehing = 0; + for( int midPos=length-first; midPos>covered; midPos-- ) { + if( nodeColl.HasNode( startPos+covered, startPos+midPos-1 ) ) { + covered = midPos; + splitPoints.push_back( startPos+covered ); + // std::cerr << " " << ( startPos+covered ); + first = 0; + found_somehing = 1; + } + } + } + // std::cerr << std::endl; + parents.push_back( splitPoints ); + } + } + } + return parents; +} diff --git a/phrase-extract/relax-parse.h b/phrase-extract/relax-parse.h index a00aa6deb..7c412646a 100644 --- a/phrase-extract/relax-parse.h +++ b/phrase-extract/relax-parse.h @@ -37,10 +37,14 @@ bool leftBinarizeFlag = false; bool rightBinarizeFlag = false; char SAMTLevel = 0; +typedef std::vector< int > SplitPoints; +typedef std::vector< SplitPoints > ParentNodes; + // functions void init(int argc, char* argv[]); +ParentNodes determineSplitPoints(const MosesTraining::SyntaxNodeCollection &); void store( MosesTraining::SyntaxNodeCollection &tree, const std::vector &words ); -void LeftBinarize( MosesTraining::SyntaxNodeCollection &tree, MosesTraining::ParentNodes &parents ); -void RightBinarize( MosesTraining::SyntaxNodeCollection &tree, MosesTraining::ParentNodes &parents ); -void SAMT( MosesTraining::SyntaxNodeCollection &tree, MosesTraining::ParentNodes &parents ); +void LeftBinarize( MosesTraining::SyntaxNodeCollection &tree, ParentNodes &parents ); +void RightBinarize( MosesTraining::SyntaxNodeCollection &tree, ParentNodes &parents ); +void SAMT( MosesTraining::SyntaxNodeCollection &tree, ParentNodes &parents ); From ca82e9a244773d834b7ffdef548f1966f040a4d5 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Thu, 4 Jun 2015 16:34:02 +0400 Subject: [PATCH 051/108] don't run beautify from cruise control. 
Not master, a particular commit --- cruise-control/test_all_new_commits.sh | 3 --- 1 file changed, 3 deletions(-) diff --git a/cruise-control/test_all_new_commits.sh b/cruise-control/test_all_new_commits.sh index bb9305768..433857eb5 100755 --- a/cruise-control/test_all_new_commits.sh +++ b/cruise-control/test_all_new_commits.sh @@ -107,8 +107,6 @@ function run_single_test () { #regtest_dir=$PWD/$(basename $regtest_file .tgz) cd .. - ./scripts/other/beautify.py --format --skip-perltidy - echo "## ./bjam clean" >> $longlog ./bjam clean $MCC_CONFIGURE_ARGS --with-regtest=$regtest_dir >> $longlog 2>&1 || warn "bjam clean failed, suspicious" @@ -154,7 +152,6 @@ function run_single_test () { date >> $longlog if [ -z "$err" ]; then - git commit -am "automatic daily beautifier" status="OK" else git reset --hard HEAD From 5696a59ae46862221901226cfd232b18ddf74357 Mon Sep 17 00:00:00 2001 From: MosesAdmin Date: Thu, 4 Jun 2015 13:41:46 +0100 Subject: [PATCH 052/108] daily automatic beautifier --- mert/Fdstream.h | 4 ++- misc/processLexicalTableMin.cpp | 8 ++--- misc/processPhraseTableMin.cpp | 8 ++--- moses/FF/GlobalLexicalModel.cpp | 2 +- moses/FF/GlobalLexicalModelUnlimited.cpp | 2 +- moses/IOWrapper.cpp | 11 +++---- moses/StaticData.cpp | 11 +++---- moses/parameters/BookkeepingOptions.cpp | 28 ++++++++-------- moses/parameters/BookkeepingOptions.h | 12 +++---- moses/parameters/NBestOptions.cpp | 26 +++++++-------- moses/parameters/NBestOptions.h | 32 +++++++++---------- phrase-extract/SyntaxNode.h | 3 +- phrase-extract/SyntaxNodeCollection.cpp | 4 +-- phrase-extract/SyntaxNodeCollection.h | 4 ++- phrase-extract/consolidate-main.cpp | 22 ++++++------- .../extract-ghkm/AlignmentGraph.cpp | 2 +- phrase-extract/extract-ghkm/ExtractGHKM.cpp | 24 +++++++------- phrase-extract/extract-main.cpp | 14 ++++---- .../filter-rule-table/FilterRuleTable.cpp | 2 +- phrase-extract/score-main.cpp | 16 +++++----- 20 files changed, 117 insertions(+), 118 deletions(-) diff --git a/mert/Fdstream.h b/mert/Fdstream.h index f6d4f039e..61529db6f 100644 --- a/mert/Fdstream.h +++ b/mert/Fdstream.h @@ -67,7 +67,9 @@ private: protected: /// For child classes only: retrieve filebuf. - __gnu_cxx::stdio_filebuf *get_filebuf() { return _filebuf; } + __gnu_cxx::stdio_filebuf *get_filebuf() { + return _filebuf; + } }; class ifdstream : public _fdstream diff --git a/misc/processLexicalTableMin.cpp b/misc/processLexicalTableMin.cpp index 8eee489ad..fac3d632c 100644 --- a/misc/processLexicalTableMin.cpp +++ b/misc/processLexicalTableMin.cpp @@ -55,10 +55,10 @@ int main(int argc, char** argv) size_t quantize = 0; size_t threads = - #ifdef WITH_THREADS - boost::thread::hardware_concurrency() ? boost::thread::hardware_concurrency() : - #endif - 1; +#ifdef WITH_THREADS + boost::thread::hardware_concurrency() ? boost::thread::hardware_concurrency() : +#endif + 1; if(1 >= argc) { printHelp(argv); diff --git a/misc/processPhraseTableMin.cpp b/misc/processPhraseTableMin.cpp index 3948a692c..a124d25df 100644 --- a/misc/processPhraseTableMin.cpp +++ b/misc/processPhraseTableMin.cpp @@ -68,10 +68,10 @@ int main(int argc, char **argv) size_t sortScoreIndex = 2; bool warnMe = true; size_t threads = - #ifdef WITH_THREADS - boost::thread::hardware_concurrency() ? boost::thread::hardware_concurrency() : - #endif - 1; +#ifdef WITH_THREADS + boost::thread::hardware_concurrency() ? 
boost::thread::hardware_concurrency() : +#endif + 1; if(1 >= argc) { printHelp(argv); diff --git a/moses/FF/GlobalLexicalModel.cpp b/moses/FF/GlobalLexicalModel.cpp index b5a07b1ef..ef3fa4691 100644 --- a/moses/FF/GlobalLexicalModel.cpp +++ b/moses/FF/GlobalLexicalModel.cpp @@ -112,7 +112,7 @@ void GlobalLexicalModel::Load() void GlobalLexicalModel::InitializeForInput(ttasksptr const& ttask) { UTIL_THROW_IF2(ttask->GetSource()->GetType() != SentenceInput, - "GlobalLexicalModel works only with sentence input."); + "GlobalLexicalModel works only with sentence input."); Sentence const* s = reinterpret_cast(ttask->GetSource().get()); m_local.reset(new ThreadLocalStorage); m_local->input = s; diff --git a/moses/FF/GlobalLexicalModelUnlimited.cpp b/moses/FF/GlobalLexicalModelUnlimited.cpp index d507054c2..675af2b6b 100644 --- a/moses/FF/GlobalLexicalModelUnlimited.cpp +++ b/moses/FF/GlobalLexicalModelUnlimited.cpp @@ -108,7 +108,7 @@ bool GlobalLexicalModelUnlimited::Load(const std::string &filePathSource, void GlobalLexicalModelUnlimited::InitializeForInput(ttasksptr const& ttask) { UTIL_THROW_IF2(ttask->GetSource()->GetType() != SentenceInput, - "GlobalLexicalModel works only with sentence input."); + "GlobalLexicalModel works only with sentence input."); Sentence const* s = reinterpret_cast(ttask->GetSource().get()); m_local.reset(new ThreadLocalStorage); m_local->input = s; diff --git a/moses/IOWrapper.cpp b/moses/IOWrapper.cpp index d1bdeb44f..94287dd0b 100644 --- a/moses/IOWrapper.cpp +++ b/moses/IOWrapper.cpp @@ -303,12 +303,11 @@ ReadInput() boost::lock_guard lock(m_lock); #endif boost::shared_ptr source = GetBufferedInput(); - if (source) - { - source->SetTranslationId(m_currentLine++); - if (m_look_ahead || m_look_back) - this->set_context_for(*source); - } + if (source) { + source->SetTranslationId(m_currentLine++); + if (m_look_ahead || m_look_back) + this->set_context_for(*source); + } m_past_input.push_back(source); return source; } diff --git a/moses/StaticData.cpp b/moses/StaticData.cpp index b41768604..6fd5ced57 100644 --- a/moses/StaticData.cpp +++ b/moses/StaticData.cpp @@ -63,8 +63,8 @@ StaticData::StaticData() : m_sourceStartPosMattersForRecombination(false) , m_requireSortingAfterSourceContext(false) , m_inputType(SentenceInput) - // , m_onlyDistinctNBest(false) - // , m_needAlignmentInfo(false) + // , m_onlyDistinctNBest(false) + // , m_needAlignmentInfo(false) , m_lmEnableOOVFeature(false) , m_isAlwaysCreateDirectTranslationOption(false) , m_currentWeightSetting("default") @@ -621,10 +621,9 @@ bool StaticData::LoadData(Parameter *parameter) #ifdef HAVE_PROTOBUF || m_outputSearchGraphPB #endif - || m_latticeSamplesFilePath.size()) - { - m_nbest_options.enabled = true; - } + || m_latticeSamplesFilePath.size()) { + m_nbest_options.enabled = true; + } // S2T decoder m_parameter->SetParameter(m_s2tParsingAlgorithm, "s2t-parsing-algorithm", diff --git a/moses/parameters/BookkeepingOptions.cpp b/moses/parameters/BookkeepingOptions.cpp index 2ab26b53c..db8fbd909 100644 --- a/moses/parameters/BookkeepingOptions.cpp +++ b/moses/parameters/BookkeepingOptions.cpp @@ -1,18 +1,18 @@ #include "BookkeepingOptions.h" -namespace Moses { - bool - BookkeepingOptions:: - init(Parameter const& P) - { - bool& x = need_alignment_info; - P.SetParameter(x, "print-alignment-info", false); - if (!x) P.SetParameter(x, "print-alignment-info-in-n-best", false); - if (!x) - { - PARAM_VEC const* params = P.GetParam("alignment-output-file"); - x = params && params->size(); - } - return true; +namespace 
Moses +{ +bool +BookkeepingOptions:: +init(Parameter const& P) +{ + bool& x = need_alignment_info; + P.SetParameter(x, "print-alignment-info", false); + if (!x) P.SetParameter(x, "print-alignment-info-in-n-best", false); + if (!x) { + PARAM_VEC const* params = P.GetParam("alignment-output-file"); + x = params && params->size(); } + return true; +} } diff --git a/moses/parameters/BookkeepingOptions.h b/moses/parameters/BookkeepingOptions.h index 8e800c587..08bc1d59d 100644 --- a/moses/parameters/BookkeepingOptions.h +++ b/moses/parameters/BookkeepingOptions.h @@ -2,13 +2,13 @@ #include "moses/Parameter.h" // #include -namespace Moses { +namespace Moses +{ - struct BookkeepingOptions - { - bool need_alignment_info; - bool init(Parameter const& param); - }; +struct BookkeepingOptions { + bool need_alignment_info; + bool init(Parameter const& param); +}; diff --git a/moses/parameters/NBestOptions.cpp b/moses/parameters/NBestOptions.cpp index 45747011a..d61a67c2f 100644 --- a/moses/parameters/NBestOptions.cpp +++ b/moses/parameters/NBestOptions.cpp @@ -2,7 +2,8 @@ #include "moses/Parameter.h" #include "NBestOptions.h" -namespace Moses { +namespace Moses +{ bool NBestOptions:: @@ -10,21 +11,16 @@ init(Parameter const& P) { const PARAM_VEC *params; params = P.GetParam("n-best-list"); - if (params) - { - if (params->size() >= 2) - { - output_file_path = params->at(0); - nbest_size = Scan( params->at(1) ); - only_distinct = (params->size()>2 && params->at(2)=="distinct"); - } - else - { - std::cerr << "wrong format for switch -n-best-list file size [disinct]"; - return false; - } + if (params) { + if (params->size() >= 2) { + output_file_path = params->at(0); + nbest_size = Scan( params->at(1) ); + only_distinct = (params->size()>2 && params->at(2)=="distinct"); + } else { + std::cerr << "wrong format for switch -n-best-list file size [disinct]"; + return false; } - else nbest_size = 0; + } else nbest_size = 0; P.SetParameter(factor, "n-best-factor", 20); P.SetParameter(include_alignment_info, "print-alignment-info-in-n-best", false ); diff --git a/moses/parameters/NBestOptions.h b/moses/parameters/NBestOptions.h index 6c868990c..bc125c2b6 100644 --- a/moses/parameters/NBestOptions.h +++ b/moses/parameters/NBestOptions.h @@ -1,27 +1,27 @@ // -*- mode: c++; cc-style: gnu -*- #include -namespace Moses { +namespace Moses +{ - struct NBestOptions - { - size_t nbest_size; - size_t factor; - bool enabled; - bool print_trees; - bool only_distinct; +struct NBestOptions { + size_t nbest_size; + size_t factor; + bool enabled; + bool print_trees; + bool only_distinct; - bool include_alignment_info; - bool include_segmentation; - bool include_feature_labels; - bool include_passthrough; + bool include_alignment_info; + bool include_segmentation; + bool include_feature_labels; + bool include_passthrough; - bool include_all_factors; + bool include_all_factors; - std::string output_file_path; + std::string output_file_path; - bool init(Parameter const& param); + bool init(Parameter const& param); - }; +}; } diff --git a/phrase-extract/SyntaxNode.h b/phrase-extract/SyntaxNode.h index f38e94713..49e2eb695 100644 --- a/phrase-extract/SyntaxNode.h +++ b/phrase-extract/SyntaxNode.h @@ -22,7 +22,8 @@ #include #include -namespace MosesTraining { +namespace MosesTraining +{ struct SyntaxNode { typedef std::map AttributeMap; diff --git a/phrase-extract/SyntaxNodeCollection.cpp b/phrase-extract/SyntaxNodeCollection.cpp index 0a344fcd7..70f52317e 100644 --- a/phrase-extract/SyntaxNodeCollection.cpp +++ 
b/phrase-extract/SyntaxNodeCollection.cpp @@ -42,7 +42,7 @@ void SyntaxNodeCollection::Clear() } SyntaxNode *SyntaxNodeCollection::AddNode(int startPos, int endPos, - const std::string &label) + const std::string &label) { SyntaxNode* newNode = new SyntaxNode(label, startPos, endPos); m_nodes.push_back( newNode ); @@ -57,7 +57,7 @@ bool SyntaxNodeCollection::HasNode( int startPos, int endPos ) const } const std::vector< SyntaxNode* >& SyntaxNodeCollection::GetNodes( - int startPos, int endPos ) const + int startPos, int endPos ) const { NodeIndex::const_iterator startIndex = m_index.find( startPos ); if (startIndex == m_index.end() ) diff --git a/phrase-extract/SyntaxNodeCollection.h b/phrase-extract/SyntaxNodeCollection.h index 8de151c55..405a77c5f 100644 --- a/phrase-extract/SyntaxNodeCollection.h +++ b/phrase-extract/SyntaxNodeCollection.h @@ -51,7 +51,9 @@ public: const std::vector< SyntaxNode* >& GetNodes( int startPos, int endPos ) const; //! Get a vector of pointers to all SyntaxNodes (unordered). - const std::vector< SyntaxNode* >& GetAllNodes() { return m_nodes; }; + const std::vector< SyntaxNode* >& GetAllNodes() { + return m_nodes; + }; size_t GetNumWords() const { return m_numWords; diff --git a/phrase-extract/consolidate-main.cpp b/phrase-extract/consolidate-main.cpp index d52e8797b..5964bf686 100644 --- a/phrase-extract/consolidate-main.cpp +++ b/phrase-extract/consolidate-main.cpp @@ -73,17 +73,17 @@ int main(int argc, char* argv[]) if (argc < 4) { std::cerr << - "syntax: " - "consolidate phrase-table.direct " - "phrase-table.indirect " - "phrase-table.consolidated " - "[--Hierarchical] [--OnlyDirect] [--PhraseCount] " - "[--GoodTuring counts-of-counts-file] " - "[--KneserNey counts-of-counts-file] [--LowCountFeature] " - "[--SourceLabels source-labels-file] " - "[--PartsOfSpeech parts-of-speech-file] " - "[--MinScore id:threshold[,id:threshold]*]" - << std::endl; + "syntax: " + "consolidate phrase-table.direct " + "phrase-table.indirect " + "phrase-table.consolidated " + "[--Hierarchical] [--OnlyDirect] [--PhraseCount] " + "[--GoodTuring counts-of-counts-file] " + "[--KneserNey counts-of-counts-file] [--LowCountFeature] " + "[--SourceLabels source-labels-file] " + "[--PartsOfSpeech parts-of-speech-file] " + "[--MinScore id:threshold[,id:threshold]*]" + << std::endl; exit(1); } const std::string fileNameDirect = argv[1]; diff --git a/phrase-extract/extract-ghkm/AlignmentGraph.cpp b/phrase-extract/extract-ghkm/AlignmentGraph.cpp index 7c179295f..9dba71331 100644 --- a/phrase-extract/extract-ghkm/AlignmentGraph.cpp +++ b/phrase-extract/extract-ghkm/AlignmentGraph.cpp @@ -219,7 +219,7 @@ Node *AlignmentGraph::CopyParseTree(const SyntaxTree *root) if (nodeType == TREE) { float score = 0.0f; SyntaxNode::AttributeMap::const_iterator p = - root->value().attributes.find("pcfg"); + root->value().attributes.find("pcfg"); if (p != root->value().attributes.end()) { score = std::atof(p->second.c_str()); } diff --git a/phrase-extract/extract-ghkm/ExtractGHKM.cpp b/phrase-extract/extract-ghkm/ExtractGHKM.cpp index c96cda146..777e56f52 100644 --- a/phrase-extract/extract-ghkm/ExtractGHKM.cpp +++ b/phrase-extract/extract-ghkm/ExtractGHKM.cpp @@ -232,7 +232,7 @@ int ExtractGHKM::Main(int argc, char *argv[]) // Initialize phrase orientation scoring object PhraseOrientation phraseOrientation(sourceTokens.size(), - targetXmlTreeParser.words().size(), alignment); + targetXmlTreeParser.words().size(), alignment); // Write the rules, subject to scope pruning. 
const std::vector &targetNodes = graph.GetTargetNodes(); @@ -413,21 +413,21 @@ void ExtractGHKM::ProcessOptions(int argc, char *argv[], << "\nThe parse tree is assumed to contain part-of-speech preterminal nodes.\n" << "\n" << "For the composed rule constraints: rule depth is the " - "maximum distance from the\nrule's root node to a sink " - "node, not counting preterminal expansions or word\n" - "alignments. Rule size is the measure defined in DeNeefe " - "et al (2007): the\nnumber of non-part-of-speech, non-leaf " - "constituent labels in the target tree.\nNode count is the " - "number of target tree nodes (excluding target words).\n" + "maximum distance from the\nrule's root node to a sink " + "node, not counting preterminal expansions or word\n" + "alignments. Rule size is the measure defined in DeNeefe " + "et al (2007): the\nnumber of non-part-of-speech, non-leaf " + "constituent labels in the target tree.\nNode count is the " + "number of target tree nodes (excluding target words).\n" << "\n" << "Scope pruning (Hopkins and Langmead, 2010) is applied to both minimal and\ncomposed rules.\n" << "\n" << "Unaligned source words are attached to the tree using the " - "following heuristic:\nif there are aligned source words to " - "both the left and the right of an unaligned\nsource word " - "then it is attached to the lowest common ancestor of its " - "nearest\nsuch left and right neighbours. Otherwise, it is " - "attached to the root of the\nparse tree.\n" + "following heuristic:\nif there are aligned source words to " + "both the left and the right of an unaligned\nsource word " + "then it is attached to the lowest common ancestor of its " + "nearest\nsuch left and right neighbours. Otherwise, it is " + "attached to the root of the\nparse tree.\n" << "\n" << "Unless the --AllowUnary option is given, unary rules containing no lexical\nsource items are eliminated using the method described in Chung et al. 
(2011).\nThe parsing algorithm used in Moses is unable to handle such rules.\n" << "\n" diff --git a/phrase-extract/extract-main.cpp b/phrase-extract/extract-main.cpp index eb44b83d1..70d4cad35 100644 --- a/phrase-extract/extract-main.cpp +++ b/phrase-extract/extract-main.cpp @@ -87,13 +87,13 @@ class ExtractTask { public: ExtractTask( - size_t id, SentenceAlignment &sentence, - PhraseExtractionOptions &initoptions, - Moses::OutputFileStream &extractFile, - Moses::OutputFileStream &extractFileInv, - Moses::OutputFileStream &extractFileOrientation, - Moses::OutputFileStream &extractFileContext, - Moses::OutputFileStream &extractFileContextInv): + size_t id, SentenceAlignment &sentence, + PhraseExtractionOptions &initoptions, + Moses::OutputFileStream &extractFile, + Moses::OutputFileStream &extractFileInv, + Moses::OutputFileStream &extractFileOrientation, + Moses::OutputFileStream &extractFileContext, + Moses::OutputFileStream &extractFileContextInv): m_sentence(sentence), m_options(initoptions), m_extractFile(extractFile), diff --git a/phrase-extract/filter-rule-table/FilterRuleTable.cpp b/phrase-extract/filter-rule-table/FilterRuleTable.cpp index 32d2019cf..89e59b3e9 100644 --- a/phrase-extract/filter-rule-table/FilterRuleTable.cpp +++ b/phrase-extract/filter-rule-table/FilterRuleTable.cpp @@ -137,7 +137,7 @@ void FilterRuleTable::ReadTestSet( continue; } sentences.push_back( - boost::shared_ptr(parser.Parse(line).release())); + boost::shared_ptr(parser.Parse(line).release())); } } diff --git a/phrase-extract/score-main.cpp b/phrase-extract/score-main.cpp index 185c0ae9e..cf28f90b9 100644 --- a/phrase-extract/score-main.cpp +++ b/phrase-extract/score-main.cpp @@ -131,14 +131,14 @@ int main(int argc, char* argv[]) ScoreFeatureManager featureManager; if (argc < 4) { std::cerr << - "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] " - "[--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--KneserNey] " - "[--NoWordAlignment] [--UnalignedPenalty] " - "[--UnalignedFunctionWordPenalty function-word-file] " - "[--MinCountHierarchical count] [--PartsOfSpeech] [--PCFG] " - "[--TreeFragments] [--SourceLabels] [--SourceLabelCountsLHS] " - "[--TargetPreferenceLabels] [--UnpairedExtractFormat] " - "[--ConditionOnTargetLHS] [--CrossedNonTerm]" << std::endl; + "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] " + "[--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--KneserNey] " + "[--NoWordAlignment] [--UnalignedPenalty] " + "[--UnalignedFunctionWordPenalty function-word-file] " + "[--MinCountHierarchical count] [--PartsOfSpeech] [--PCFG] " + "[--TreeFragments] [--SourceLabels] [--SourceLabelCountsLHS] " + "[--TargetPreferenceLabels] [--UnpairedExtractFormat] " + "[--ConditionOnTargetLHS] [--CrossedNonTerm]" << std::endl; std::cerr << featureManager.usage() << std::endl; exit(1); } From 7047c591c8414864ef13020e256e841aa217c4d2 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Thu, 4 Jun 2015 16:46:56 +0400 Subject: [PATCH 053/108] Beautify before cruise control test. Push using MosesAdmin account, must be set up with ssh. 
--- cruise-control/test_all_new_commits.sh | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/cruise-control/test_all_new_commits.sh b/cruise-control/test_all_new_commits.sh index 433857eb5..79d44d3a3 100755 --- a/cruise-control/test_all_new_commits.sh +++ b/cruise-control/test_all_new_commits.sh @@ -17,6 +17,14 @@ configname=$(basename $configf | sed 's/\.config$//') source "$configf" +# beautifier +git clone git@github.com:moses-smt/mosesdecoder.git /tmp/moses +cd /tmp/moses +./scripts/other/beautify.py --format --skip-perltidy +git commit -am "daily automatic beautifier" +git push +rm -rf /tmp/moses + [ -z "$MCC_SCAN_BRANCHES" ] \ && die "Bad config $configf; does not define MCC_SCAN_BRANCHES" From 6a09042e6abd5dab3d0cf0d358804f7a0ef9ca9a Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Thu, 4 Jun 2015 16:51:22 +0400 Subject: [PATCH 054/108] Beautify before cruise control test. Push using MosesAdmin account, must be set up with ssh. --- cruise-control/test_all_new_commits.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/cruise-control/test_all_new_commits.sh b/cruise-control/test_all_new_commits.sh index 79d44d3a3..c30afa0d1 100755 --- a/cruise-control/test_all_new_commits.sh +++ b/cruise-control/test_all_new_commits.sh @@ -24,6 +24,7 @@ cd /tmp/moses git commit -am "daily automatic beautifier" git push rm -rf /tmp/moses +cd - [ -z "$MCC_SCAN_BRANCHES" ] \ && die "Bad config $configf; does not define MCC_SCAN_BRANCHES" From f6ddc452241755733c947723a8618aab7245c8f1 Mon Sep 17 00:00:00 2001 From: Phil Williams Date: Thu, 4 Jun 2015 14:36:39 +0100 Subject: [PATCH 055/108] Ongoing moses/phrase-extract refactoring --- phrase-extract/pcfg-common/Jamfile | 1 - phrase-extract/pcfg-common/pcfg.h | 63 ------------------- phrase-extract/pcfg-common/typedef.h | 38 ----------- phrase-extract/pcfg-extract/Jamfile | 2 +- phrase-extract/pcfg-extract/pcfg_extract.cc | 5 +- phrase-extract/pcfg-extract/pcfg_extract.h | 2 +- .../pcfg-extract/rule_collection.cc | 2 +- phrase-extract/pcfg-extract/rule_collection.h | 2 +- phrase-extract/pcfg-extract/rule_extractor.h | 4 +- phrase-extract/pcfg-score/Jamfile | 2 +- phrase-extract/pcfg-score/pcfg_score.cc | 5 +- phrase-extract/pcfg-score/pcfg_score.h | 2 +- phrase-extract/pcfg-score/tree_scorer.h | 4 +- .../{pcfg-common => syntax-common}/pcfg.cc | 21 ------- phrase-extract/syntax-common/pcfg.h | 38 +++++++++++ .../{pcfg-common => syntax-common}/tool.cc | 0 .../{pcfg-common => syntax-common}/tool.h | 0 phrase-extract/syntax-common/vocabulary.h | 13 ++++ 18 files changed, 65 insertions(+), 139 deletions(-) delete mode 100644 phrase-extract/pcfg-common/Jamfile delete mode 100644 phrase-extract/pcfg-common/pcfg.h delete mode 100644 phrase-extract/pcfg-common/typedef.h rename phrase-extract/{pcfg-common => syntax-common}/pcfg.cc (69%) create mode 100644 phrase-extract/syntax-common/pcfg.h rename phrase-extract/{pcfg-common => syntax-common}/tool.cc (100%) rename phrase-extract/{pcfg-common => syntax-common}/tool.h (100%) create mode 100644 phrase-extract/syntax-common/vocabulary.h diff --git a/phrase-extract/pcfg-common/Jamfile b/phrase-extract/pcfg-common/Jamfile deleted file mode 100644 index 5669b443e..000000000 --- a/phrase-extract/pcfg-common/Jamfile +++ /dev/null @@ -1 +0,0 @@ -lib pcfg_common : [ glob *.cc ] ..//syntax-common ..//deps : .. 
; diff --git a/phrase-extract/pcfg-common/pcfg.h b/phrase-extract/pcfg-common/pcfg.h deleted file mode 100644 index c5c04cba4..000000000 --- a/phrase-extract/pcfg-common/pcfg.h +++ /dev/null @@ -1,63 +0,0 @@ -/*********************************************************************** - Moses - statistical machine translation system - Copyright (C) 2006-2012 University of Edinburgh - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, write to the Free Software - Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -***********************************************************************/ - -#pragma once -#ifndef PCFG_PCFG_H_ -#define PCFG_PCFG_H_ - -#include -#include -#include -#include - -#include "typedef.h" - -namespace MosesTraining { -namespace Syntax { -namespace PCFG { - -class Pcfg { - public: - typedef std::vector Key; - typedef std::map Map; - typedef Map::iterator iterator; - typedef Map::const_iterator const_iterator; - - Pcfg() {} - - iterator begin() { return rules_.begin(); } - const_iterator begin() const { return rules_.begin(); } - - iterator end() { return rules_.end(); } - const_iterator end() const { return rules_.end(); } - - void Add(const Key &, double); - bool Lookup(const Key &, double &) const; - void Read(std::istream &, Vocabulary &); - void Write(const Vocabulary &, std::ostream &) const; - - private: - Map rules_; -}; - -} // namespace PCFG -} // namespace Syntax -} // namespace MosesTraining - -#endif diff --git a/phrase-extract/pcfg-common/typedef.h b/phrase-extract/pcfg-common/typedef.h deleted file mode 100644 index 1280b89cf..000000000 --- a/phrase-extract/pcfg-common/typedef.h +++ /dev/null @@ -1,38 +0,0 @@ -/*********************************************************************** - Moses - statistical machine translation system - Copyright (C) 2006-2012 University of Edinburgh - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. 
- - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, write to the Free Software - Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -***********************************************************************/ - -#pragma once -#ifndef PCFG_TYPEDEF_H_ -#define PCFG_TYPEDEF_H_ - -#include - -#include "syntax-common/numbered_set.h" - -namespace MosesTraining { -namespace Syntax { -namespace PCFG { - -typedef NumberedSet Vocabulary; - -} // namespace PCFG -} // namespace Syntax -} // namespace MosesTraining - -#endif diff --git a/phrase-extract/pcfg-extract/Jamfile b/phrase-extract/pcfg-extract/Jamfile index 2442b967a..2f4ae1e7d 100644 --- a/phrase-extract/pcfg-extract/Jamfile +++ b/phrase-extract/pcfg-extract/Jamfile @@ -1 +1 @@ -exe pcfg-extract : [ glob *.cc ] ..//syntax-common ..//pcfg-common ../..//boost_program_options : .. ; +exe pcfg-extract : [ glob *.cc ] ..//syntax-common ../..//boost_program_options : .. ; diff --git a/phrase-extract/pcfg-extract/pcfg_extract.cc b/phrase-extract/pcfg-extract/pcfg_extract.cc index 87419edb7..45eb9ff3d 100644 --- a/phrase-extract/pcfg-extract/pcfg_extract.cc +++ b/phrase-extract/pcfg-extract/pcfg_extract.cc @@ -32,13 +32,12 @@ #include #include "syntax-common/exception.h" +#include "syntax-common/pcfg.h" +#include "syntax-common/vocabulary.h" #include "syntax-common/xml_tree_parser.h" #include "SyntaxTree.h" -#include "pcfg-common/pcfg.h" -#include "pcfg-common/typedef.h" - #include "options.h" #include "rule_collection.h" #include "rule_extractor.h" diff --git a/phrase-extract/pcfg-extract/pcfg_extract.h b/phrase-extract/pcfg-extract/pcfg_extract.h index 5882e45da..3b084acbe 100644 --- a/phrase-extract/pcfg-extract/pcfg_extract.h +++ b/phrase-extract/pcfg-extract/pcfg_extract.h @@ -21,7 +21,7 @@ #ifndef PCFG_EXTRACT_PCFG_EXTRACT_H_ #define PCFG_EXTRACT_PCFG_EXTRACT_H_ -#include "pcfg-common/tool.h" +#include "syntax-common/tool.h" namespace MosesTraining { diff --git a/phrase-extract/pcfg-extract/rule_collection.cc b/phrase-extract/pcfg-extract/rule_collection.cc index 9db0ce9bf..a814f82d6 100644 --- a/phrase-extract/pcfg-extract/rule_collection.cc +++ b/phrase-extract/pcfg-extract/rule_collection.cc @@ -19,7 +19,7 @@ #include "rule_collection.h" -#include "pcfg-common/pcfg.h" +#include "syntax-common/pcfg.h" #include diff --git a/phrase-extract/pcfg-extract/rule_collection.h b/phrase-extract/pcfg-extract/rule_collection.h index 3d9a9f98b..3bbc32721 100644 --- a/phrase-extract/pcfg-extract/rule_collection.h +++ b/phrase-extract/pcfg-extract/rule_collection.h @@ -25,7 +25,7 @@ #include -#include "pcfg-common/pcfg.h" +#include "syntax-common/pcfg.h" namespace MosesTraining { diff --git a/phrase-extract/pcfg-extract/rule_extractor.h b/phrase-extract/pcfg-extract/rule_extractor.h index d32d76992..91014747c 100644 --- a/phrase-extract/pcfg-extract/rule_extractor.h +++ b/phrase-extract/pcfg-extract/rule_extractor.h @@ -23,7 +23,7 @@ #include "SyntaxTree.h" -#include "pcfg-common/typedef.h" +#include "syntax-common/vocabulary.h" #include "rule_collection.h" @@ -39,7 +39,7 @@ class RuleExtractor { public: RuleExtractor(Vocabulary &); - void Extract(const MosesTraining::SyntaxTree &, RuleCollection &) const; + void Extract(const SyntaxTree &, RuleCollection &) const; private: Vocabulary &non_term_vocab_; }; diff --git a/phrase-extract/pcfg-score/Jamfile b/phrase-extract/pcfg-score/Jamfile index 45d46492a..ca321d04c 100644 --- a/phrase-extract/pcfg-score/Jamfile +++ 
b/phrase-extract/pcfg-score/Jamfile @@ -1 +1 @@ -exe pcfg-score : [ glob *.cc ] ..//pcfg-common ../..//boost_program_options : .. ; +exe pcfg-score : [ glob *.cc ] ..//syntax-common ../..//boost_program_options : .. ; diff --git a/phrase-extract/pcfg-score/pcfg_score.cc b/phrase-extract/pcfg-score/pcfg_score.cc index e11f73f70..cec84211a 100644 --- a/phrase-extract/pcfg-score/pcfg_score.cc +++ b/phrase-extract/pcfg-score/pcfg_score.cc @@ -36,12 +36,11 @@ #include "SyntaxTree.h" #include "syntax-common/exception.h" +#include "syntax-common/pcfg.h" +#include "syntax-common/vocabulary.h" #include "syntax-common/xml_tree_parser.h" #include "syntax-common/xml_tree_writer.h" -#include "pcfg-common/pcfg.h" -#include "pcfg-common/typedef.h" - namespace MosesTraining { namespace Syntax diff --git a/phrase-extract/pcfg-score/pcfg_score.h b/phrase-extract/pcfg-score/pcfg_score.h index b0b4a77cd..b691b107f 100644 --- a/phrase-extract/pcfg-score/pcfg_score.h +++ b/phrase-extract/pcfg-score/pcfg_score.h @@ -21,7 +21,7 @@ #ifndef PCFG_SCORE_PCFG_SCORE_H_ #define PCFG_SCORE_PCFG_SCORE_H_ -#include "pcfg-common/tool.h" +#include "syntax-common/tool.h" namespace MosesTraining { diff --git a/phrase-extract/pcfg-score/tree_scorer.h b/phrase-extract/pcfg-score/tree_scorer.h index cf9fdd1a3..b95d13ddb 100644 --- a/phrase-extract/pcfg-score/tree_scorer.h +++ b/phrase-extract/pcfg-score/tree_scorer.h @@ -23,8 +23,8 @@ #include "SyntaxTree.h" -#include "pcfg-common/pcfg.h" -#include "pcfg-common/typedef.h" +#include "syntax-common/vocabulary.h" +#include "syntax-common/pcfg.h" namespace MosesTraining { diff --git a/phrase-extract/pcfg-common/pcfg.cc b/phrase-extract/syntax-common/pcfg.cc similarity index 69% rename from phrase-extract/pcfg-common/pcfg.cc rename to phrase-extract/syntax-common/pcfg.cc index 988367c9b..3efe04218 100644 --- a/phrase-extract/pcfg-common/pcfg.cc +++ b/phrase-extract/syntax-common/pcfg.cc @@ -1,22 +1,3 @@ -/*********************************************************************** - Moses - statistical machine translation system - Copyright (C) 2006-2012 University of Edinburgh - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. 
- - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, write to the Free Software - Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -***********************************************************************/ - #include "pcfg.h" #include @@ -28,7 +9,6 @@ namespace MosesTraining { namespace Syntax { -namespace PCFG { void Pcfg::Add(const Key &key, double score) { rules_[key] = score; @@ -103,6 +83,5 @@ void Pcfg::Write(const Vocabulary &vocab, std::ostream &output) const { } } -} // namespace PCFG } // namespace Syntax } // namespace MosesTraining diff --git a/phrase-extract/syntax-common/pcfg.h b/phrase-extract/syntax-common/pcfg.h new file mode 100644 index 000000000..0a731cc7a --- /dev/null +++ b/phrase-extract/syntax-common/pcfg.h @@ -0,0 +1,38 @@ +#pragma once + +#include +#include +#include +#include + +#include "vocabulary.h" + +namespace MosesTraining { +namespace Syntax { + +class Pcfg { + public: + typedef std::vector Key; + typedef std::map Map; + typedef Map::iterator iterator; + typedef Map::const_iterator const_iterator; + + Pcfg() {} + + iterator begin() { return rules_.begin(); } + const_iterator begin() const { return rules_.begin(); } + + iterator end() { return rules_.end(); } + const_iterator end() const { return rules_.end(); } + + void Add(const Key &, double); + bool Lookup(const Key &, double &) const; + void Read(std::istream &, Vocabulary &); + void Write(const Vocabulary &, std::ostream &) const; + + private: + Map rules_; +}; + +} // namespace Syntax +} // namespace MosesTraining diff --git a/phrase-extract/pcfg-common/tool.cc b/phrase-extract/syntax-common/tool.cc similarity index 100% rename from phrase-extract/pcfg-common/tool.cc rename to phrase-extract/syntax-common/tool.cc diff --git a/phrase-extract/pcfg-common/tool.h b/phrase-extract/syntax-common/tool.h similarity index 100% rename from phrase-extract/pcfg-common/tool.h rename to phrase-extract/syntax-common/tool.h diff --git a/phrase-extract/syntax-common/vocabulary.h b/phrase-extract/syntax-common/vocabulary.h new file mode 100644 index 000000000..119767245 --- /dev/null +++ b/phrase-extract/syntax-common/vocabulary.h @@ -0,0 +1,13 @@ +#pragma once + +#include + +#include "numbered_set.h" + +namespace MosesTraining { +namespace Syntax { + +typedef NumberedSet Vocabulary; + +} // namespace Syntax +} // namespace MosesTraining From 721bfe823bd4142f73499be371036c775887f9d5 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Thu, 4 Jun 2015 16:07:22 +0100 Subject: [PATCH 056/108] Bug fix: m_nbestSize wasn't initialized in class TranslationRequest. --- moses/server/TranslationRequest.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/moses/server/TranslationRequest.cpp b/moses/server/TranslationRequest.cpp index 5c87eb1a7..3762fbd96 100644 --- a/moses/server/TranslationRequest.cpp +++ b/moses/server/TranslationRequest.cpp @@ -221,6 +221,7 @@ TranslationRequest:: TranslationRequest(xmlrpc_c::paramList const& paramList, boost::condition_variable& cond, boost::mutex& mut) : m_cond(cond), m_mutex(mut), m_done(false), m_paramList(paramList) + , m_nbestSize(0) { } void @@ -264,7 +265,12 @@ parse_request(std::map const& params) pdmm->SetTemporaryMultiModelWeightsVector(w); } } - + + si = params.find("nbest"); + if (si != params.end()) + m_nbestSize = xmlrpc_c::value_int(si->second); + + // // biased sampling for suffix-array-based sampling phrase table? 
// if ((si = params.find("bias")) != params.end()) // { From c6a3d8e54aa84933875160873d7bf837a6210b25 Mon Sep 17 00:00:00 2001 From: Phil Williams Date: Thu, 4 Jun 2015 16:54:31 +0100 Subject: [PATCH 057/108] Ongoing moses/phrase-extract refactoring --- phrase-extract/extract-ghkm/ExtractGHKM.cpp | 46 +---------- phrase-extract/extract-ghkm/ExtractGHKM.h | 18 ++-- .../filter-rule-table/FilterRuleTable.cpp | 18 +--- .../filter-rule-table/FilterRuleTable.h | 18 ++-- phrase-extract/pcfg-extract/pcfg_extract.cc | 2 +- phrase-extract/pcfg-score/pcfg_score.cc | 4 +- .../postprocess-egret-forests/Main.cpp | 6 +- .../PostprocessEgretForests.cpp | 50 ++++------- .../PostprocessEgretForests.h | 18 ++-- phrase-extract/score-stsg/ScoreStsg.cpp | 26 +----- phrase-extract/score-stsg/ScoreStsg.h | 15 +--- phrase-extract/syntax-common/tool.cc | 81 +++++++----------- phrase-extract/syntax-common/tool.h | 82 +++++-------------- 13 files changed, 96 insertions(+), 288 deletions(-) diff --git a/phrase-extract/extract-ghkm/ExtractGHKM.cpp b/phrase-extract/extract-ghkm/ExtractGHKM.cpp index 777e56f52..c2ee43767 100644 --- a/phrase-extract/extract-ghkm/ExtractGHKM.cpp +++ b/phrase-extract/extract-ghkm/ExtractGHKM.cpp @@ -359,39 +359,6 @@ int ExtractGHKM::Main(int argc, char *argv[]) return 0; } -void ExtractGHKM::OpenInputFileOrDie(const std::string &filename, - std::ifstream &stream) -{ - stream.open(filename.c_str()); - if (!stream) { - std::ostringstream msg; - msg << "failed to open input file: " << filename; - Error(msg.str()); - } -} - -void ExtractGHKM::OpenOutputFileOrDie(const std::string &filename, - std::ofstream &stream) -{ - stream.open(filename.c_str()); - if (!stream) { - std::ostringstream msg; - msg << "failed to open output file: " << filename; - Error(msg.str()); - } -} - -void ExtractGHKM::OpenOutputFileOrDie(const std::string &filename, - Moses::OutputFileStream &stream) -{ - bool ret = stream.Open(filename); - if (!ret) { - std::ostringstream msg; - msg << "failed to open output file: " << filename; - Error(msg.str()); - } -} - void ExtractGHKM::ProcessOptions(int argc, char *argv[], Options &options) const { @@ -401,7 +368,7 @@ void ExtractGHKM::ProcessOptions(int argc, char *argv[], // Construct the 'top' of the usage message: the bit that comes before the // options list. std::ostringstream usageTop; - usageTop << "Usage: " << GetName() + usageTop << "Usage: " << name() << " [OPTION]... TARGET SOURCE ALIGNMENT EXTRACT\n\n" << "SCFG rule extractor based on the GHKM algorithm described in\n" << "Galley et al. (2004).\n\n" @@ -547,11 +514,8 @@ void ExtractGHKM::ProcessOptions(int argc, char *argv[], // Process the command-line. po::variables_map vm; - const int optionStyle = cls::allow_long - | cls::long_allow_adjacent - | cls::long_allow_next; try { - po::store(po::command_line_parser(argc, argv).style(optionStyle). + po::store(po::command_line_parser(argc, argv).style(MosesOptionStyle()). 
options(cmdLineOptions).positional(p).run(), vm); po::notify(vm); } catch (const std::exception &e) { @@ -635,12 +599,6 @@ void ExtractGHKM::ProcessOptions(int argc, char *argv[], } } -void ExtractGHKM::Error(const std::string &msg) const -{ - std::cerr << GetName() << ": " << msg << std::endl; - std::exit(1); -} - std::vector ExtractGHKM::ReadTokens(const std::string &s) const { std::vector tokens; diff --git a/phrase-extract/extract-ghkm/ExtractGHKM.h b/phrase-extract/extract-ghkm/ExtractGHKM.h index 66c4c55f8..0d0fa8bf1 100644 --- a/phrase-extract/extract-ghkm/ExtractGHKM.h +++ b/phrase-extract/extract-ghkm/ExtractGHKM.h @@ -28,6 +28,8 @@ #include "OutputFileStream.h" #include "SyntaxTree.h" +#include "syntax-common/tool.h" + namespace MosesTraining { namespace GHKM @@ -35,22 +37,14 @@ namespace GHKM struct Options; -class ExtractGHKM +class ExtractGHKM : public Syntax::Tool { public: + ExtractGHKM() : Tool("extract-ghkm") {} - ExtractGHKM() : m_name("extract-ghkm") {} - const std::string &GetName() const { - return m_name; - } - int Main(int argc, char *argv[]); + virtual int Main(int argc, char *argv[]); private: - - void Error(const std::string &) const; - void OpenInputFileOrDie(const std::string &, std::ifstream &); - void OpenOutputFileOrDie(const std::string &, std::ofstream &); - void OpenOutputFileOrDie(const std::string &, Moses::OutputFileStream &); void RecordTreeLabels(const SyntaxTree &, std::set &); void CollectWordLabelCounts(SyntaxTree &, const Options &, @@ -79,8 +73,6 @@ private: std::vector ReadTokens(const SyntaxTree &root) const; void ProcessOptions(int, char *[], Options &) const; - - std::string m_name; }; } // namespace GHKM diff --git a/phrase-extract/filter-rule-table/FilterRuleTable.cpp b/phrase-extract/filter-rule-table/FilterRuleTable.cpp index 89e59b3e9..24c2803a7 100644 --- a/phrase-extract/filter-rule-table/FilterRuleTable.cpp +++ b/phrase-extract/filter-rule-table/FilterRuleTable.cpp @@ -167,7 +167,7 @@ void FilterRuleTable::ProcessOptions(int argc, char *argv[], // Construct the 'top' of the usage message: the bit that comes before the // options list. std::ostringstream usageTop; - usageTop << "Usage: " << GetName() + usageTop << "Usage: " << name() << " [OPTION]... MODEL TEST\n\n" << "Filter for SCFG/STSG rule tables.\n\n" << "Options"; @@ -203,11 +203,8 @@ void FilterRuleTable::ProcessOptions(int argc, char *argv[], // Process the command-line. po::variables_map vm; - const int optionStyle = cls::allow_long - | cls::long_allow_adjacent - | cls::long_allow_next; try { - po::store(po::command_line_parser(argc, argv).style(optionStyle). + po::store(po::command_line_parser(argc, argv).style(MosesOptionStyle()). 
options(cmdLineOptions).positional(p).run(), vm); po::notify(vm); } catch (const std::exception &e) { @@ -229,17 +226,6 @@ void FilterRuleTable::ProcessOptions(int argc, char *argv[], } } -void FilterRuleTable::Error(const std::string &msg) const -{ - std::cerr << GetName() << ": error: " << msg << std::endl; - std::exit(1); -} - -void FilterRuleTable::Warn(const std::string &msg) const -{ - std::cerr << GetName() << ": warning: " << msg << std::endl; -} - } // namespace FilterRuleTable } // namespace Syntax } // namespace MosesTraining diff --git a/phrase-extract/filter-rule-table/FilterRuleTable.h b/phrase-extract/filter-rule-table/FilterRuleTable.h index 3077e690d..7b51bb8fa 100644 --- a/phrase-extract/filter-rule-table/FilterRuleTable.h +++ b/phrase-extract/filter-rule-table/FilterRuleTable.h @@ -7,6 +7,8 @@ #include "SyntaxTree.h" +#include "syntax-common/tool.h" + #include "StringForest.h" namespace MosesTraining @@ -18,20 +20,14 @@ namespace FilterRuleTable struct Options; -class FilterRuleTable +class FilterRuleTable : public Tool { public: - FilterRuleTable() : m_name("filter-rule-table") {} + FilterRuleTable() : Tool("filter-rule-table") {} - const std::string &GetName() const { - return m_name; - } - - int Main(int argc, char *argv[]); + virtual int Main(int argc, char *argv[]); private: - void Error(const std::string &) const; - // Filter rule table (on std::cin) for test set (string version). void Filter(const std::vector > &); @@ -51,10 +47,6 @@ private: // Read test set (forest version) void ReadTestSet(std::istream &, std::vector > &); - - void Warn(const std::string &) const; - - std::string m_name; }; } // namespace FilterRuleTable diff --git a/phrase-extract/pcfg-extract/pcfg_extract.cc b/phrase-extract/pcfg-extract/pcfg_extract.cc index 45eb9ff3d..0e89e26be 100644 --- a/phrase-extract/pcfg-extract/pcfg_extract.cc +++ b/phrase-extract/pcfg-extract/pcfg_extract.cc @@ -118,7 +118,7 @@ void PcfgExtract::ProcessOptions(int argc, char *argv[], // Process the command-line. po::variables_map vm; try { - po::store(po::command_line_parser(argc, argv).style(CommonOptionStyle()). + po::store(po::command_line_parser(argc, argv).style(MosesOptionStyle()). options(cmd_line_options).positional(p).run(), vm); po::notify(vm); } catch (const std::exception &e) { diff --git a/phrase-extract/pcfg-score/pcfg_score.cc b/phrase-extract/pcfg-score/pcfg_score.cc index cec84211a..bdbb761f9 100644 --- a/phrase-extract/pcfg-score/pcfg_score.cc +++ b/phrase-extract/pcfg-score/pcfg_score.cc @@ -56,7 +56,7 @@ int PcfgScore::Main(int argc, char *argv[]) // Open PCFG stream. std::ifstream pcfg_stream; - OpenNamedInputOrDie(options.pcfg_file, pcfg_stream); + OpenInputFileOrDie(options.pcfg_file, pcfg_stream); // Read PCFG. Pcfg pcfg; @@ -131,7 +131,7 @@ void PcfgScore::ProcessOptions(int argc, char *argv[], Options &options) const // Process the command-line. po::variables_map vm; try { - po::store(po::command_line_parser(argc, argv).style(CommonOptionStyle()). + po::store(po::command_line_parser(argc, argv).style(MosesOptionStyle()). 
options(cmd_line_options).positional(p).run(), vm); po::notify(vm); } catch (const std::exception &e) { diff --git a/phrase-extract/postprocess-egret-forests/Main.cpp b/phrase-extract/postprocess-egret-forests/Main.cpp index ec2bab185..fead94652 100644 --- a/phrase-extract/postprocess-egret-forests/Main.cpp +++ b/phrase-extract/postprocess-egret-forests/Main.cpp @@ -5,9 +5,5 @@ int main(int argc, char *argv[]) { MosesTraining::Syntax::PostprocessEgretForests::PostprocessEgretForests tool; - try { - return tool.Main(argc, argv); - } catch (const MosesTraining::Syntax::Exception &e) { - tool.Error(e.msg()); - } + return tool.Main(argc, argv); } diff --git a/phrase-extract/postprocess-egret-forests/PostprocessEgretForests.cpp b/phrase-extract/postprocess-egret-forests/PostprocessEgretForests.cpp index d87e082dc..4911d4913 100644 --- a/phrase-extract/postprocess-egret-forests/PostprocessEgretForests.cpp +++ b/phrase-extract/postprocess-egret-forests/PostprocessEgretForests.cpp @@ -30,19 +30,23 @@ namespace PostprocessEgretForests int PostprocessEgretForests::Main(int argc, char *argv[]) { - // Process command-line options. - Options options; - ProcessOptions(argc, argv, options); + try { + // Process command-line options. + Options options; + ProcessOptions(argc, argv, options); - // Open input files. - boost::scoped_ptr splitPointParser; - std::ifstream splitPointFileStream; - if (!options.splitPointsFile.empty()) { - OpenInputFileOrDie(options.splitPointsFile, splitPointFileStream); - splitPointParser.reset(new SplitPointFileParser(splitPointFileStream)); + // Open input files. + boost::scoped_ptr splitPointParser; + std::ifstream splitPointFileStream; + if (!options.splitPointsFile.empty()) { + OpenInputFileOrDie(options.splitPointsFile, splitPointFileStream); + splitPointParser.reset(new SplitPointFileParser(splitPointFileStream)); + } + + ProcessForest(std::cin, std::cout, splitPointParser.get(), options); + } catch (const MosesTraining::Syntax::Exception &e) { + Error(e.msg()); } - - ProcessForest(std::cin, std::cout, splitPointParser.get(), options); return 0; } @@ -76,17 +80,6 @@ void PostprocessEgretForests::ProcessForest( } } -void PostprocessEgretForests::OpenInputFileOrDie(const std::string &filename, - std::ifstream &stream) -{ - stream.open(filename.c_str()); - if (!stream) { - std::ostringstream msg; - msg << "failed to open input file: " << filename; - Error(msg.str()); - } -} - void PostprocessEgretForests::ProcessOptions(int argc, char *argv[], Options &options) const { @@ -96,7 +89,7 @@ void PostprocessEgretForests::ProcessOptions(int argc, char *argv[], // Construct the 'top' of the usage message: the bit that comes before the // options list. std::ostringstream usageTop; - usageTop << "Usage: " << GetName() + usageTop << "Usage: " << name() << " [OPTION]...\n\n" << "TODO\n\n" << "Options"; @@ -132,11 +125,8 @@ void PostprocessEgretForests::ProcessOptions(int argc, char *argv[], // Process the command-line. po::variables_map vm; - const int optionStyle = cls::allow_long - | cls::long_allow_adjacent - | cls::long_allow_next; try { - po::store(po::command_line_parser(argc, argv).style(optionStyle). + po::store(po::command_line_parser(argc, argv).style(MosesOptionStyle()). 
options(cmdLineOptions).positional(p).run(), vm); po::notify(vm); } catch (const std::exception &e) { @@ -156,12 +146,6 @@ void PostprocessEgretForests::ProcessOptions(int argc, char *argv[], } } -void PostprocessEgretForests::Error(const std::string &msg) const -{ - std::cerr << GetName() << ": " << msg << std::endl; - std::exit(1); -} - } // namespace PostprocessEgretForests } // namespace Syntax } // namespace MosesTraining diff --git a/phrase-extract/postprocess-egret-forests/PostprocessEgretForests.h b/phrase-extract/postprocess-egret-forests/PostprocessEgretForests.h index 95da24c71..51970084e 100644 --- a/phrase-extract/postprocess-egret-forests/PostprocessEgretForests.h +++ b/phrase-extract/postprocess-egret-forests/PostprocessEgretForests.h @@ -4,6 +4,8 @@ #include #include +#include "syntax-common/tool.h" + namespace MosesTraining { namespace Syntax @@ -14,31 +16,21 @@ namespace PostprocessEgretForests struct Options; class SplitPointFileParser; -class PostprocessEgretForests +class PostprocessEgretForests : public Tool { public: - PostprocessEgretForests() : m_name("postprocess-egret-forests") {} + PostprocessEgretForests() : Tool("postprocess-egret-forests") {} - void Error(const std::string &) const; - - const std::string &GetName() const { - return m_name; - } - - int Main(int argc, char *argv[]); + virtual int Main(int argc, char *argv[]); private: void OneBestTree(std::istream &, std::ostream &, SplitPointFileParser *, const Options &); - void OpenInputFileOrDie(const std::string &, std::ifstream &); - void ProcessForest(std::istream &, std::ostream &, SplitPointFileParser *, const Options &); void ProcessOptions(int, char *[], Options &) const; - - std::string m_name; }; } // namespace PostprocessEgretForests diff --git a/phrase-extract/score-stsg/ScoreStsg.cpp b/phrase-extract/score-stsg/ScoreStsg.cpp index 09395e21e..f6df0d0da 100644 --- a/phrase-extract/score-stsg/ScoreStsg.cpp +++ b/phrase-extract/score-stsg/ScoreStsg.cpp @@ -35,7 +35,7 @@ namespace ScoreStsg const int ScoreStsg::kCountOfCountsMax = 10; ScoreStsg::ScoreStsg() - : m_name("score-stsg") + : Tool("score-stsg") , m_lexTable(m_srcVocab, m_tgtVocab) , m_countOfCounts(kCountOfCountsMax, 0) , m_totalDistinct(0) @@ -300,17 +300,6 @@ double ScoreStsg::ComputeLexProb(const std::vector &sourceFrontier, return lexScore; } -void ScoreStsg::OpenOutputFileOrDie(const std::string &filename, - Moses::OutputFileStream &stream) -{ - bool ret = stream.Open(filename); - if (!ret) { - std::ostringstream msg; - msg << "failed to open output file: " << filename; - Error(msg.str()); - } -} - void ScoreStsg::ProcessOptions(int argc, char *argv[], Options &options) const { namespace po = boost::program_options; @@ -319,7 +308,7 @@ void ScoreStsg::ProcessOptions(int argc, char *argv[], Options &options) const // Construct the 'top' of the usage message: the bit that comes before the // options list. std::ostringstream usageTop; - usageTop << "Usage: " << GetName() + usageTop << "Usage: " << name() << " [OPTION]... EXTRACT LEX TABLE\n\n" << "STSG rule scorer\n\n" << "Options"; @@ -386,11 +375,8 @@ void ScoreStsg::ProcessOptions(int argc, char *argv[], Options &options) const // Process the command-line. po::variables_map vm; - const int optionStyle = cls::allow_long - | cls::long_allow_adjacent - | cls::long_allow_next; try { - po::store(po::command_line_parser(argc, argv).style(optionStyle). + po::store(po::command_line_parser(argc, argv).style(MosesOptionStyle()). 
options(cmdLineOptions).positional(p).run(), vm); po::notify(vm); } catch (const std::exception &e) { @@ -440,12 +426,6 @@ void ScoreStsg::ProcessOptions(int argc, char *argv[], Options &options) const } } -void ScoreStsg::Error(const std::string &msg) const -{ - std::cerr << GetName() << ": " << msg << std::endl; - std::exit(1); -} - } // namespace ScoreStsg } // namespace Syntax } // namespace MosesTraining diff --git a/phrase-extract/score-stsg/ScoreStsg.h b/phrase-extract/score-stsg/ScoreStsg.h index 628c0080e..1757e181b 100644 --- a/phrase-extract/score-stsg/ScoreStsg.h +++ b/phrase-extract/score-stsg/ScoreStsg.h @@ -9,6 +9,8 @@ #include "ExtractionPhrasePair.h" #include "OutputFileStream.h" +#include "syntax-common/tool.h" + #include "LexicalTable.h" #include "Options.h" #include "RuleSymbol.h" @@ -25,16 +27,12 @@ namespace ScoreStsg class RuleGroup; class RuleTableWriter; -class ScoreStsg +class ScoreStsg : public Tool { public: ScoreStsg(); - const std::string &GetName() const { - return m_name; - } - - int Main(int argc, char *argv[]); + virtual int Main(int argc, char *argv[]); private: static const int kCountOfCountsMax; @@ -43,10 +41,6 @@ private: const std::vector &, const ALIGNMENT &); - void Error(const std::string &) const; - - void OpenOutputFileOrDie(const std::string &, Moses::OutputFileStream &); - void ParseAlignmentString(const std::string &, int, ALIGNMENT &); @@ -59,7 +53,6 @@ private: void TokenizeRuleHalf(const std::string &, TokenizedRuleHalf &); - std::string m_name; Options m_options; Vocabulary m_srcVocab; Vocabulary m_tgtVocab; diff --git a/phrase-extract/syntax-common/tool.cc b/phrase-extract/syntax-common/tool.cc index c41eaf9bd..e145b78be 100644 --- a/phrase-extract/syntax-common/tool.cc +++ b/phrase-extract/syntax-common/tool.cc @@ -1,64 +1,30 @@ -/*********************************************************************** - Moses - statistical machine translation system - Copyright (C) 2006-2012 University of Edinburgh - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, write to the Free Software - Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -***********************************************************************/ - #include "tool.h" +#include +#include #include +#include + namespace MosesTraining { namespace Syntax { -namespace PCFG { -std::istream &Tool::OpenInputOrDie(const std::string &filename) { - // TODO Check that function is only called once? 
- if (filename.empty() || filename == "-") { - input_ptr_ = &(std::cin); - } else { - input_file_stream_.open(filename.c_str()); - if (!input_file_stream_) { - std::ostringstream msg; - msg << "failed to open input file: " << filename; - Error(msg.str()); - } - input_ptr_ = &input_file_stream_; - } - return *input_ptr_; +int Tool::MosesOptionStyle() { + namespace cls = boost::program_options::command_line_style; + return cls::allow_long | cls::long_allow_adjacent | cls::long_allow_next; } -std::ostream &Tool::OpenOutputOrDie(const std::string &filename) { - // TODO Check that function is only called once? - if (filename.empty() || filename == "-") { - output_ptr_ = &(std::cout); - } else { - output_file_stream_.open(filename.c_str()); - if (!output_file_stream_) { - std::ostringstream msg; - msg << "failed to open output file: " << filename; - Error(msg.str()); - } - output_ptr_ = &output_file_stream_; - } - return *output_ptr_; +void Tool::Warn(const std::string &msg) const { + std::cerr << name_ << ": warning: " << msg << std::endl; } -void Tool::OpenNamedInputOrDie(const std::string &filename, - std::ifstream &stream) { +void Tool::Error(const std::string &msg) const { + std::cerr << name_ << ": error: " << msg << std::endl; + std::exit(1); +} + +void Tool::OpenInputFileOrDie(const std::string &filename, + std::ifstream &stream) { stream.open(filename.c_str()); if (!stream) { std::ostringstream msg; @@ -67,8 +33,8 @@ void Tool::OpenNamedInputOrDie(const std::string &filename, } } -void Tool::OpenNamedOutputOrDie(const std::string &filename, - std::ofstream &stream) { +void Tool::OpenOutputFileOrDie(const std::string &filename, + std::ofstream &stream) { stream.open(filename.c_str()); if (!stream) { std::ostringstream msg; @@ -77,6 +43,15 @@ void Tool::OpenNamedOutputOrDie(const std::string &filename, } } -} // namespace PCFG +void Tool::OpenOutputFileOrDie(const std::string &filename, + Moses::OutputFileStream &stream) { + bool ret = stream.Open(filename); + if (!ret) { + std::ostringstream msg; + msg << "failed to open output file: " << filename; + Error(msg.str()); + } +} + } // namespace Syntax } // namespace MosesTraining diff --git a/phrase-extract/syntax-common/tool.h b/phrase-extract/syntax-common/tool.h index 2c903a11e..e1df8025f 100644 --- a/phrase-extract/syntax-common/tool.h +++ b/phrase-extract/syntax-common/tool.h @@ -1,93 +1,53 @@ -/*********************************************************************** - Moses - statistical machine translation system - Copyright (C) 2006-2012 University of Edinburgh - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. 
- - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, write to the Free Software - Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -***********************************************************************/ - #pragma once -#ifndef PCFG_TOOL_H_ -#define PCFG_TOOL_H_ -#include #include -#include #include -#include +#include "OutputFileStream.h" namespace MosesTraining { namespace Syntax { -namespace PCFG { +/*! Base class for command-line based tools. + */ class Tool { public: virtual ~Tool() {} + //! Get the name of the tool. const std::string &name() const { return name_; } + //! Virtual main function to be provided by subclass. virtual int Main(int argc, char *argv[]) = 0; protected: Tool(const std::string &name) : name_(name) {} - // Returns the boost::program_options style that should be used by all tools. - static int CommonOptionStyle() { - namespace cls = boost::program_options::command_line_style; - return cls::default_style & (~cls::allow_guessing); - } + //! Returns a boost::program_options style that is consistent with other + //! Moses tools (extract-rules, score, etc.). + static int MosesOptionStyle(); - void Warn(const std::string &msg) const { - std::cerr << name_ << ": warning: " << msg << std::endl; - } + //! Write a formatted warning message to standard error. + void Warn(const std::string &) const; - void Error(const std::string &msg) const { - std::cerr << name_ << ": error: " << msg << std::endl; - std::exit(1); - } + //! Write a formatted error message to standard error and call exit(1). + void Error(const std::string &msg) const; - // Initialises the tool's main input stream and returns a reference that is - // valid for the remainder of the tool's lifetime. If filename is empty or - // "-" then input is standard input; otherwise it is the named file. Calls - // Error() if the file cannot be opened for reading. - std::istream &OpenInputOrDie(const std::string &filename); + //! Opens the named input file using the supplied ifstream. Calls Error() if + //! the file cannot be opened for reading. + void OpenInputFileOrDie(const std::string &, std::ifstream &); - // Initialises the tool's main output stream and returns a reference that is - // valid for the remainder of the tool's lifetime. If filename is empty or - // "-" then output is standard output; otherwise it is the named file. Calls - // Error() if the file cannot be opened for writing. - std::ostream &OpenOutputOrDie(const std::string &filename); + //! Opens the named output file using the supplied ofstream. Calls Error() if + //! the file cannot be opened for writing. + void OpenOutputFileOrDie(const std::string &, std::ofstream &); - // Opens the named input file using the supplied ifstream. Calls Error() if - // the file cannot be opened for reading. - void OpenNamedInputOrDie(const std::string &, std::ifstream &); - - // Opens the named output file using the supplied ofstream. Calls Error() if - // the file cannot be opened for writing. - void OpenNamedOutputOrDie(const std::string &, std::ofstream &); + //! Opens the named output file using the supplied OutputFileStream. Calls + //! Error() if the file cannot be opened for writing. 
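To illustrate the base class that the tools above now share: a minimal sketch of a hypothetical subclass. The name WordCount, the "word-count" tool name and its behaviour are invented; the Tool API is the one declared in this header. It assumes it is built inside phrase-extract and linked against syntax-common/tool.cc, like extract-ghkm or score-stsg; a real tool would also parse its options with boost::program_options using MosesOptionStyle(), as those tools do.

  #include <fstream>
  #include <iostream>
  #include <string>

  #include "syntax-common/tool.h"

  namespace MosesTraining {
  namespace Syntax {

  class WordCount : public Tool {
   public:
    // The name passed to Tool() is what Warn()/Error() print as a prefix.
    WordCount() : Tool("word-count") {}

    virtual int Main(int argc, char *argv[]) {
      if (argc != 2) {
        Error("usage: word-count FILE");  // writes "word-count: error: ..." and exits
      }
      std::ifstream in;
      OpenInputFileOrDie(argv[1], in);    // calls Error() if the file cannot be read
      std::size_t count = 0;
      std::string token;
      while (in >> token) {
        ++count;
      }
      std::cout << count << std::endl;
      return 0;
    }
  };

  }  // namespace Syntax
  }  // namespace MosesTraining

  int main(int argc, char *argv[]) {
    MosesTraining::Syntax::WordCount tool;
    return tool.Main(argc, argv);
  }
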
+ void OpenOutputFileOrDie(const std::string &, Moses::OutputFileStream &); private: std::string name_; - std::istream *input_ptr_; - std::ifstream input_file_stream_; - std::ostream *output_ptr_; - std::ofstream output_file_stream_; }; -} // namespace PCFG } // namespace Syntax } // namespace MosesTraining - -#endif From 42b53b7a3939e9c5b0875a2eeb53010774527a95 Mon Sep 17 00:00:00 2001 From: MosesAdmin Date: Fri, 5 Jun 2015 00:00:42 +0100 Subject: [PATCH 058/108] daily automatic beautifier --- moses/server/TranslationRequest.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/moses/server/TranslationRequest.cpp b/moses/server/TranslationRequest.cpp index 3762fbd96..cad3696d1 100644 --- a/moses/server/TranslationRequest.cpp +++ b/moses/server/TranslationRequest.cpp @@ -265,12 +265,12 @@ parse_request(std::map const& params) pdmm->SetTemporaryMultiModelWeightsVector(w); } } - + si = params.find("nbest"); if (si != params.end()) m_nbestSize = xmlrpc_c::value_int(si->second); - - + + // // biased sampling for suffix-array-based sampling phrase table? // if ((si = params.find("bias")) != params.end()) // { From c306715e828f23ffceefebde8e227fc1bd7ff4d0 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Mon, 8 Jun 2015 14:35:36 +0400 Subject: [PATCH 059/108] add back arg -always-create-direct-transoption. Seemed to have dropped out a while ago --- moses/Parameter.cpp | 1 + moses/StaticData.cpp | 1 + 2 files changed, 2 insertions(+) diff --git a/moses/Parameter.cpp b/moses/Parameter.cpp index 33441570f..cf8737e3b 100644 --- a/moses/Parameter.cpp +++ b/moses/Parameter.cpp @@ -143,6 +143,7 @@ Parameter::Parameter() AddParam(oov_opts,"mark-unknown", "mu", "mark unknown words in output"); AddParam(oov_opts,"lmodel-oov-feature", "add language model oov feature, one per model"); AddParam(oov_opts,"output-unknowns", "Output the unknown (OOV) words to the given file, one line per sentence"); + AddParam(oov_opts,"always-create-direct-transopt", "Always create a translation that translates the source word ad-verbatim"); /////////////////////////////////////////////////////////////////////////////////////// // input options diff --git a/moses/StaticData.cpp b/moses/StaticData.cpp index 6fd5ced57..8fb88c257 100644 --- a/moses/StaticData.cpp +++ b/moses/StaticData.cpp @@ -444,6 +444,7 @@ StaticData //source word deletion m_parameter->SetParameter(m_wordDeletionEnabled, "phrase-drop-allowed", false ); + m_parameter->SetParameter(m_isAlwaysCreateDirectTranslationOption, "always-create-direct-transopt", false ); } void From 501c51947b192e8559fa35d820ebd951566bebba Mon Sep 17 00:00:00 2001 From: Lexi Birch Date: Mon, 8 Jun 2015 16:58:50 +0100 Subject: [PATCH 060/108] Allowing the truecaser to work on uncased ASR input, pass the -a flag --- scripts/recaser/truecase.perl | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/scripts/recaser/truecase.perl b/scripts/recaser/truecase.perl index 0a4d366e0..7b3dc20fb 100755 --- a/scripts/recaser/truecase.perl +++ b/scripts/recaser/truecase.perl @@ -8,11 +8,14 @@ binmode(STDIN, ":utf8"); binmode(STDOUT, ":utf8"); # apply switches -my ($MODEL, $UNBUFFERED); -die("truecase.perl --model MODEL [-b] < in > out") - unless &GetOptions('model=s' => \$MODEL,'b|unbuffered' => \$UNBUFFERED) +# ASR input has no case, make sure it is lowercase, and make sure known are cased eg. 
'i' to be uppercased even if i is known +my ($MODEL, $UNBUFFERED, $ASR); +die("truecase.perl --model MODEL [-b] [-a] < in > out") + unless &GetOptions('model=s' => \$MODEL,'b|unbuffered' => \$UNBUFFERED, 'a|asr' => \$ASR) && defined($MODEL); if (defined($UNBUFFERED) && $UNBUFFERED) { $|=1; } +my $asr = 0; +if (defined($ASR) && $ASR) { $asr = 1; } my (%BEST,%KNOWN); open(MODEL,$MODEL) || die("ERROR: could not open '$MODEL'"); @@ -20,9 +23,11 @@ binmode(MODEL, ":utf8"); while() { my ($word,@OPTIONS) = split; $BEST{ lc($word) } = $word; - $KNOWN{ $word } = 1; - for(my $i=1;$i<$#OPTIONS;$i+=2) { - $KNOWN{ $OPTIONS[$i] } = 1; + if ($asr == 0) { + $KNOWN{ $word } = 1; + for(my $i=1;$i<$#OPTIONS;$i+=2) { + $KNOWN{ $OPTIONS[$i] } = 1; + } } } close(MODEL); @@ -49,6 +54,9 @@ while() { $word = $$WORD[$i]; $otherfactors = ""; } + if ($asr){ + $word = lc($word); #make sure ASR output is not uc + } if ($sentence_start && defined($BEST{lc($word)})) { print $BEST{lc($word)}; # truecase sentence start From fa51da28c5f21881b716026b69b07b0fd2e3a015 Mon Sep 17 00:00:00 2001 From: Phil Williams Date: Tue, 9 Jun 2015 16:50:27 +0100 Subject: [PATCH 061/108] moses/phrase-extract refactoring Final commit in this round of refactoring (which started with commit 2f735998...). The main changes are: - a general storage mechanism for attribute/value pairs in XML-style tree / lattice input. E.g. the "pcfg-score" and "semantic-role" attributes in: I - consolidation of the various near-duplicate Tree / XmlTreeParser classes that have accumulated over the years (my fault) - miscellaneous de-crufting --- phrase-extract/SyntaxNode.h | 3 +++ phrase-extract/SyntaxNodeCollection.h | 8 +++--- phrase-extract/XmlTree.cpp | 27 +++++++++---------- .../syntax-common/xml_tree_parser.h | 2 +- 4 files changed, 21 insertions(+), 19 deletions(-) diff --git a/phrase-extract/SyntaxNode.h b/phrase-extract/SyntaxNode.h index 49e2eb695..25a75b784 100644 --- a/phrase-extract/SyntaxNode.h +++ b/phrase-extract/SyntaxNode.h @@ -25,6 +25,9 @@ namespace MosesTraining { +/*! A node in a syntactic structure (tree, lattice, etc.). SyntaxNodes have a + * label and a span plus an arbitrary set of name/value attributes. + */ struct SyntaxNode { typedef std::map AttributeMap; diff --git a/phrase-extract/SyntaxNodeCollection.h b/phrase-extract/SyntaxNodeCollection.h index 405a77c5f..da0e1eca3 100644 --- a/phrase-extract/SyntaxNodeCollection.h +++ b/phrase-extract/SyntaxNodeCollection.h @@ -55,11 +55,13 @@ public: return m_nodes; }; - size_t GetNumWords() const { - return m_numWords; - } + //! Get the number of words (defined as 1 + the max end pos of any node). + std::size_t GetNumWords() const { return m_numWords; } + + //! Clear the container (this deletes the SyntaxNodes). void Clear(); + //! Extract a SyntaxTree (assuming the collection's nodes constitute a tree). std::auto_ptr ExtractTree(); private: diff --git a/phrase-extract/XmlTree.cpp b/phrase-extract/XmlTree.cpp index d8b77b6e6..d88c78c0b 100644 --- a/phrase-extract/XmlTree.cpp +++ b/phrase-extract/XmlTree.cpp @@ -80,7 +80,6 @@ string ParseXmlTagAttribute(const string& tag,const string& attributeName) return tag.substr(contentsStart,contentsEnd-contentsStart); } -// TODO Special handling of "label" attribute // s should be a sequence of name=attribute pairs separated by whitespace. // e.g. 
"label=\"S\" pcfg=\"-1.452\" foo=\"blah\\\"blah\"" void ParseXmlTagAttributes(const std::string &s, @@ -107,8 +106,9 @@ void ParseXmlTagAttributes(const std::string &s, throw XmlException("invalid tag content"); } } - // TODO unescape \" - attributes[name] = s.substr(begin+1, pos-begin-1); + if (name != "label" && name != "span") { + attributes[name] = s.substr(begin+1, pos-begin-1); + } begin = pos+1; } } @@ -245,20 +245,17 @@ vector TokenizeXml(const string& str) } /** - * Process a sentence with xml annotation - * Xml tags may specifiy additional/replacing translation options - * and reordering constraints + * Process a sentence with XML-style annotation of syntactic nodes. * - * \param line in: sentence, out: sentence without the xml - * \param res vector with translation options specified by xml - * \param reorderingConstraint reordering constraint zones specified by xml - * \param walls reordering constraint walls specified by xml + * \param line[in,out] in: sentence, out: sentence without the XML + * \param nodeCollection[out] the collection of SyntaxNode objects for this + * sentence + * \param labelCollection[out] label values are inserted into this set + * \param topLabelCollection[out] top labels (key) and their counts (value) + * are inserted into this map + * \param unescapeSpecialChars flag indicating whether XML special characters + * should be unescaped */ -/*TODO: we'd only have to return a vector of XML options if we dropped linking. 2-d vector - is so we can link things up afterwards. We can't create TranslationOptions as we - parse because we don't have the completed source parsed until after this function - removes all the markup from it (CreateFromString in Sentence::Read). -*/ bool ProcessAndStripXMLTags(string &line, SyntaxNodeCollection &nodeCollection, set< string > &labelCollection, map< string, int > &topLabelCollection, diff --git a/phrase-extract/syntax-common/xml_tree_parser.h b/phrase-extract/syntax-common/xml_tree_parser.h index 48ea056b8..04ad74e24 100644 --- a/phrase-extract/syntax-common/xml_tree_parser.h +++ b/phrase-extract/syntax-common/xml_tree_parser.h @@ -16,7 +16,7 @@ namespace Syntax { * converts them to SyntaxTree objects. * * This is a thin wrapper around the ProcessAndStripXMLTags function. After - * calling Parse(), the output of the ProcessAndStripXMLTags function (the + * calling Parse(), the output from the ProcessAndStripXMLTags call (the * sentence, node collection, label set, and top label set) are available via * accessors. */ From dbcc264506ca26b471b821ab8fc0b78d88457185 Mon Sep 17 00:00:00 2001 From: Jeroen Vermeulen Date: Tue, 9 Jun 2015 23:10:27 +0700 Subject: [PATCH 062/108] Remove unneeded script. Tom Hoar, the author of this script, asked me to remove it because it doesn't actually do what the current name says, and can't work without an additional script which isn't in the repository. --- .../training/convert-moses-ini-v2-to-v1.py | 266 ------------------ 1 file changed, 266 deletions(-) delete mode 100755 scripts/training/convert-moses-ini-v2-to-v1.py diff --git a/scripts/training/convert-moses-ini-v2-to-v1.py b/scripts/training/convert-moses-ini-v2-to-v1.py deleted file mode 100755 index 4b7cfa5fa..000000000 --- a/scripts/training/convert-moses-ini-v2-to-v1.py +++ /dev/null @@ -1,266 +0,0 @@ -#! /usr/bin/env python -# -*- coding: utf8 -*- -# -# This file is part of moses. Its use is licensed under the GNU Lesser General -# Public License version 3 or, at your option, any later version. 
- - -from __future__ import ( - absolute_import, - print_function, - unicode_literals, - ) - -__version__ = '1.0' -__license__ = 'LGPL3' -__source__ = 'Precision Translation Tools Pte Lte' - -import errno -from sys import stdout -from copy import deepcopy -from os.path import ( - dirname, - basename, - exists, - realpath, - ) -from os import makedirs - - -root_escape = '%(escape-prefix)s' - - -class moses2_to_ini(object): - - def __init__(self, inp, out, escape_prefix): - self.inp = inp - self.out = out - self.escape_prefix = escape_prefix - self._config = {} - - def parse(self): - key = '' - section = None - self._config = {} - counter = 0 - - with open(self.inp, 'rb') as f: - contents = f.read().decode('utf8') - - lines = contents.splitlines() - - # Known feature/functions without attributes. - attrless_ffs = [ - 'UnknownWordPenalty', - 'WordPenalty', - 'PhrasePenalty', - 'Distortion', - ] - - # Retrieve all values except feature/functions with attributes. - for i, line in [(i, line.strip()) for i, line in enumerate(lines) - if line.strip() and not line.strip().startswith('#')]: - - if line.startswith('[') and line.endswith(']'): - - section = line.strip('] [') - - if section not in self._config.keys() + ['feature', 'weight']: - # New section not in config and not a reserved section. - counter = 0 - key = section - self._config[key] = {} - - elif section == 'feature' and line in attrless_ffs: - # Known feature/funcions without attributes. - key = '%s0' % line - if key not in self._config: - self._config[key] = {} - self._config[key]['feature'] = line - - elif section == 'feature': - # Skip feature/funcions with arguments. - continue - - elif section == 'weight': - # Add weight value to feature sections. - config_items = [ - (key.strip(), value.strip()) - for key, value in [line.split('=', 1)] - ] - for key, value in config_items: - if key not in self._config: - self._config[key] = {} - self._config[key]['weight'] = value - - else: - self._config[key][counter] = line - counter += 0 - - lines[i] = '' - - # Second, match feature/functions attributes to [weight] section - # values. - stripped_lines = [line.strip() for line in lines] - nonempty_lines = [ - line - for line in stripped_lines - if line != '' and not line.startswith('#') - ] - for i, line in enumerate(nonempty_lines): - # Add "feature" to assist creating tmpdict for feature/functions. - line = 'feature=%s' % line - tmpdict = dict([key.split('=', 1) for key in line.split()]) - - # Feature/functions 'name' attribute must match an entry in - # [weight]. - if tmpdict.get('name') not in self._config: - raise RuntimeError('malformed moses.ini v2 file') - - config_items = [ - (key.strip(), value.strip()) - for key, value in tmpdict.items() - if key.strip() != 'name' - ] - for key, value in config_items: - self._config[tmpdict['name']][key] = value - - return deepcopy(self._config) - - def render(self, config): - self._config = deepcopy(config) - _config = deepcopy(config) - lines = _tolines(_config, self.escape_prefix) - if self.out == '-': - stdout.write('\n'.join(lines)) - else: - contents = '\r\n'.join(lines) - makedir(dirname(self.out)) - with open(self.out, 'wb') as f: - f.write(contents.encode('utf8')) - - def __str__(self): - return '\n'.join(_tolines(self._config, self.escape_prefix)) - - @property - def config(self): - return deepcopy(self._config) - - -def _tolines(config, escape_prefix): - - section_names = sorted(config) - lines = [] - - # Group feature/functions first. 
- group_ffs = [ - name - for name in section_names - if name[-1].isdigit() - ] - for sectionname in group_ffs: - section = config[sectionname] - lines.append('[%s]' % sectionname) - for option, value in section.items(): - if option == 'path' \ - and escape_prefix is not None \ - and value.startswith(escape_prefix): - value = value.replace(escape_prefix, root_escape, 1) - lines.append('%s=%s' % (option, value)) - lines.append('') - - other_ffs = [ - name - for name in section_names - if not name[-1].isdigit() - ] - for sectionname in other_ffs: - section = config[sectionname] - lines.append('[%s]' % sectionname) - for option, value in section.items(): - lines.append('%s=%s' % (option, value)) - lines.append('') - - return deepcopy(lines) - - -def makedir(path, mode=0o777): - try: - makedirs(path, mode) - except OSError as e: - accepted_errors = [ - errno.EEXIST, - errno.EPERM, - errno.EACCES, - errno.ENOENT, - ] - if e.errno not in accepted_errors: - raise - - -def get_args(): - '''Parse command-line arguments - - Uses the API compatibility between the legacy - argparse.OptionParser and its replacement argparse.ArgumentParser - for functional equivelancy and nearly identical help prompt. - ''' - - description = 'Convert Moses.ini v2 file to standard INI format' - usage = '%s [arguments]' % basename(__file__) - - try: - from argparse import ArgumentParser - except ImportError: - from optparse import OptionParser - argparser = False - escape_help = ( - "Optional. Path of SMT model. If provided, " - "escapes \"escape-prefix\" with \"%(escape-prefix)s\"") - parser = OptionParser(usage=usage, description=description) - add_argument = parser.add_option - else: - argparser = True - escape_help = ( - "Optional. Path of SMT model. If provided, " - "escape \"escape-prefix\" with \"%%(escape-prefix)s\"") - parser = ArgumentParser(usage=usage, description=description) - add_argument = parser.add_argument - - add_argument( - '-i', '--inp', action='store', - help="moses.ini v2 file to convert (required)") - - add_argument( - '-o', '--out', action='store', default='-', - help="standard INI file (default: '-' outputs to stdout)") - - add_argument('-r', '--escape-prefix', action='store', help=escape_help) - - if argparser: - args = vars(parser.parse_args()) - else: - opts = parser.parse_args() - args = vars(opts[0]) - - if args['inp'] is None: - parser.error('argument -i/--inp required') - - args['inp'] = realpath(args['inp']) - - if not exists(args['inp']): - parser.error( - "argument -i/--inp invalid.\n" - "reference: %s" % args['inp']) - - if args['out'] != '-': - args['out'] = realpath(args['out']) - - return args - - -if __name__ == '__main__': - args = get_args() - converter = moses2_to_ini(**args) - config = converter.parse() - converter.render(config) From 47c793ca4647a6868fabc041cbd63e5114994e3f Mon Sep 17 00:00:00 2001 From: MosesAdmin Date: Wed, 10 Jun 2015 00:00:40 +0100 Subject: [PATCH 063/108] daily automatic beautifier --- phrase-extract/SyntaxNodeCollection.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/phrase-extract/SyntaxNodeCollection.h b/phrase-extract/SyntaxNodeCollection.h index da0e1eca3..ef0989cd0 100644 --- a/phrase-extract/SyntaxNodeCollection.h +++ b/phrase-extract/SyntaxNodeCollection.h @@ -56,7 +56,9 @@ public: }; //! Get the number of words (defined as 1 + the max end pos of any node). - std::size_t GetNumWords() const { return m_numWords; } + std::size_t GetNumWords() const { + return m_numWords; + } //! 
Clear the container (this deletes the SyntaxNodes). void Clear(); From 0d54286d3f11dda748de91d0a8a2977551066826 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Thu, 11 Jun 2015 14:43:10 -0400 Subject: [PATCH 064/108] Require __SSE2__ for i386 to use SSE2 --- util/integer_to_string.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/util/integer_to_string.cc b/util/integer_to_string.cc index 32047291d..6b8766119 100644 --- a/util/integer_to_string.cc +++ b/util/integer_to_string.cc @@ -7,6 +7,7 @@ Local modifications: 4. Remove test hook 5. Non-x86 support from the branch_lut code 6. Rename functions +7. Require __SSE2__ on i386 Copyright (C) 2014 Milo Yip @@ -66,7 +67,7 @@ const char gDigitsLut[200] = { // SSE2 implementation according to http://0x80.pl/articles/sse-itoa.html // Modifications: (1) fix incorrect digits (2) accept all ranges (3) write to user provided buffer. -#if defined(i386) || defined(__amd64) || defined(_M_IX86) || defined(_M_X64) +#if defined(__amd64) || defined(_M_X64) || (defined(__SSE2__) && (defined(_M_IX86) || defined(i386))) #include From 924710f53e959f7214f740db691ebe4f7778dfd7 Mon Sep 17 00:00:00 2001 From: Jeroen Vermeulen Date: Fri, 12 Jun 2015 15:11:57 +0700 Subject: [PATCH 065/108] On MinGW use Windows _chsize_t, not ftruncate. This works around a problem when building against MinGW and then running the resulting Windows binary on WINE. (Perverse, I know.) For some reason the ftruncate() to 0 bytes succeeds, but the subsequent one to a larger size fails. Even if the size is just 1 byte. This happened where GenericModel::InitializeFromARPA called BinaryFormat::SetupJustVocab, which called MapZeroedWrite, which calls ResizeOrThrow twice; the second one failed. --- util/file.cc | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/util/file.cc b/util/file.cc index 414d0471c..046b9ff90 100644 --- a/util/file.cc +++ b/util/file.cc @@ -111,10 +111,7 @@ uint64_t SizeOrThrow(int fd) { } void ResizeOrThrow(int fd, uint64_t to) { -#if defined __MINGW32__ - // Does this handle 64-bit? - int ret = ftruncate -#elif defined(_WIN32) || defined(_WIN64) +#if defined(_WIN32) || defined(_WIN64) errno_t ret = _chsize_s #elif defined(OS_ANDROID) int ret = ftruncate64 From ffd3f2bb6e34c9eb0fba01f7a76d573c2d7105d9 Mon Sep 17 00:00:00 2001 From: XapaJIaMnu Date: Fri, 12 Jun 2015 16:21:24 +0100 Subject: [PATCH 066/108] Added basic BilingualNPLM support to EMS and an example config. 
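A quick worked example of the window arithmetic behind the order, source-window and train_order settings in the example config added below. On my reading of the BilingualNPLM feature, the NPLM is trained on n-grams that concatenate order target-side tokens with source-window tokens on either side of the aligned source token plus that token itself, so with the values used in the config:

  train_order = order + 2 * source-window + 1
              = 5 + 2 * 4 + 1
              = 14

which matches the train_order value the config sets.
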
--- scripts/ems/example/config.toy.bilinguallm | 682 ++++++++++++++++++ .../ems/example/data/weight_bilinguallm.ini | 14 + scripts/ems/experiment.meta | 17 +- 3 files changed, 712 insertions(+), 1 deletion(-) create mode 100644 scripts/ems/example/config.toy.bilinguallm create mode 100644 scripts/ems/example/data/weight_bilinguallm.ini diff --git a/scripts/ems/example/config.toy.bilinguallm b/scripts/ems/example/config.toy.bilinguallm new file mode 100644 index 000000000..37a34b70b --- /dev/null +++ b/scripts/ems/example/config.toy.bilinguallm @@ -0,0 +1,682 @@ +################################################ +### CONFIGURATION FILE FOR AN SMT EXPERIMENT ### +################################################ + +[GENERAL] + +### directory in which experiment is run +# +working-dir = /mnt/gna0/nbogoych/ems_work + +# specification of the language pair +input-extension = fr +output-extension = en +pair-extension = fr-en + +### directories that contain tools and data +# +# moses +moses-src-dir = /mnt/gna0/nbogoych/mosesdecoder +# +# moses binaries +moses-bin-dir = $moses-src-dir/bin +# +# moses scripts +moses-script-dir = $moses-src-dir/scripts +# +# directory where GIZA++/MGIZA programs resides +external-bin-dir = /home/pkoehn/statmt/bin +# +# srilm +#srilm-dir = $moses-src-dir/srilm/bin/i686 +# +# irstlm +#irstlm-dir = $moses-src-dir/irstlm/bin +# +# randlm +#randlm-dir = $moses-src-dir/randlm/bin +# +# data +toy-data = $moses-script-dir/ems/example/data + +### basic tools +# +# moses decoder +decoder = $moses-bin-dir/moses + +# conversion of rule table into binary on-disk format +ttable-binarizer = "$moses-bin-dir/CreateOnDiskPt 1 1 4 100 2" + +# tokenizers - comment out if all your data is already tokenized +input-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $input-extension" +output-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $output-extension" + +# truecasers - comment out if you do not use the truecaser +input-truecaser = $moses-script-dir/recaser/truecase.perl +output-truecaser = $moses-script-dir/recaser/truecase.perl +detruecaser = $moses-script-dir/recaser/detruecase.perl + +# lowercaser - comment out if you use truecasing +#input-lowercaser = $moses-script-dir/tokenizer/lowercase.perl +#output-lowercaser = $moses-script-dir/tokenizer/lowercase.perl + +### generic parallelizer for cluster and multi-core machines +# you may specify a script that allows the parallel execution +# parallizable steps (see meta file). you also need specify +# the number of jobs (cluster) or cores (multicore) +# +#generic-parallelizer = $moses-script-dir/ems/support/generic-parallelizer.perl +#generic-parallelizer = $moses-script-dir/ems/support/generic-multicore-parallelizer.perl + +### cluster settings (if run on a cluster machine) +# number of jobs to be submitted in parallel +# +#jobs = 10 + +# arguments to qsub when scheduling a job +#qsub-settings = "" + +# project for priviledges and usage accounting +#qsub-project = iccs_smt + +# memory and time +#qsub-memory = 4 +#qsub-hours = 48 + +### multi-core settings +# when the generic parallelizer is used, the number of cores +# specified here +cores = 8 + +################################################################# +# PARALLEL CORPUS PREPARATION: +# create a tokenized, sentence-aligned corpus, ready for training + +[CORPUS] + +### long sentences are filtered out, since they slow down GIZA++ +# and are a less reliable source of data. 
set here the maximum +# length of a sentence +# +max-sentence-length = 80 + +[CORPUS:toy] + +### command to run to get raw corpus files +# +# get-corpus-script = + +### raw corpus files (untokenized, but sentence aligned) +# +raw-stem = $toy-data/nc-5k + +### tokenized corpus files (may contain long sentences) +# +#tokenized-stem = + +### if sentence filtering should be skipped, +# point to the clean training data +# +#clean-stem = + +### if corpus preparation should be skipped, +# point to the prepared training data +# +#lowercased-stem = + +################################################################# +# LANGUAGE MODEL TRAINING + +[LM] + +### tool to be used for language model training +# kenlm training +lm-training = "$moses-script-dir/ems/support/lmplz-wrapper.perl -bin $moses-bin-dir/lmplz" +settings = "--prune '0 0 1' -T $working-dir/lm -S 20%" + +# srilm +#lm-training = $srilm-dir/ngram-count +#settings = "-interpolate -kndiscount -unk" + +# irstlm training +# msb = modified kneser ney; p=0 no singleton pruning +#lm-training = "$moses-script-dir/generic/trainlm-irst2.perl -cores $cores -irst-dir $irstlm-dir -temp-dir $working-dir/tmp" +#settings = "-s msb -p 0" + +# order of the language model +order = 5 + +### tool to be used for training randomized language model from scratch +# (more commonly, a SRILM is trained) +# +#rlm-training = "$randlm-dir/buildlm -falsepos 8 -values 8" + +### script to use for binary table format for irstlm or kenlm +# (default: no binarization) + +# irstlm +#lm-binarizer = $irstlm-dir/compile-lm + +# kenlm, also set type to 8 +lm-binarizer = $moses-bin-dir/build_binary +type = 8 + +### script to create quantized language model format (irstlm) +# (default: no quantization) +# +#lm-quantizer = $irstlm-dir/quantize-lm + +### script to use for converting into randomized table format +# (default: no randomization) +# +#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8" + +### each language model to be used has its own section here + +[LM:toy] + +### command to run to get raw corpus files +# +#get-corpus-script = "" + +### raw corpus (untokenized) +# +raw-corpus = $toy-data/nc-5k.$output-extension + +### tokenized corpus files (may contain long sentences) +# +#tokenized-corpus = + +### if corpus preparation should be skipped, +# point to the prepared language model +# +#lm = + +[LM:bilingual-lm] +#bilingual-lm +exclude-from-interpolation = true +bilingual-lm = "yes" +bilingual-lm-workdir = "bilingual" +bilingual-lm-settings = "" +order = "5" +source-window = "4" + +#actual training +train_order = "14" #this is equal to order + 2*source-window + 1 +nplm-output-dir = "nplm_out" +nplm-settings = "-l /mnt/gna0/rsennrich/tools/nplm-0.3-gpu-experimental/" + +#Config file generation: +config-feature-line = "BilingualNPLM order=$order source_window=$source-window path=$working-dir/$nplm-output-dir/train.10k.model.nplm.10 source_vocab=$working-dir/$bilingual-lm-workdir/vocab.source target_vocab=$working-dir/$bilingual-lm-workdir/vocab.target" +config-weight-line = "BilingualNPLM0= 0.1" + +################################################################# +# INTERPOLATING LANGUAGE MODELS + +[INTERPOLATED-LM] + +# if multiple language models are used, these may be combined +# by optimizing perplexity on a tuning set +# see, for instance [Koehn and Schwenk, IJCNLP 2008] + +### script to interpolate language models +# if commented out, no interpolation is performed +# +# script = $moses-script-dir/ems/support/interpolate-lm.perl + +### tuning set +# you may use 
the same set that is used for mert tuning (reference set) +# +#tuning-sgm = +#raw-tuning = +#tokenized-tuning = +#factored-tuning = +#lowercased-tuning = +#split-tuning = + +### group language models for hierarchical interpolation +# (flat interpolation is limited to 10 language models) +#group = "first,second fourth,fifth" + +### script to use for binary table format for irstlm or kenlm +# (default: no binarization) + +# irstlm +#lm-binarizer = $irstlm-dir/compile-lm + +# kenlm, also set type to 8 +lm-binarizer = $moses-bin-dir/build_binary +type = 8 + +### script to create quantized language model format (irstlm) +# (default: no quantization) +# +#lm-quantizer = $irstlm-dir/quantize-lm + +### script to use for converting into randomized table format +# (default: no randomization) +# +#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8" + +################################################################# +# MODIFIED MOORE LEWIS FILTERING + +[MML] IGNORE + +### specifications for language models to be trained +# +#lm-training = $srilm-dir/ngram-count +#lm-settings = "-interpolate -kndiscount -unk" +#lm-binarizer = $moses-src-dir/bin/build_binary +#lm-query = $moses-src-dir/bin/query +#order = 5 + +### in-/out-of-domain source/target corpora to train the 4 language model +# +# in-domain: point either to a parallel corpus +#outdomain-stem = [CORPUS:toy:clean-split-stem] + +# ... or to two separate monolingual corpora +#indomain-target = [LM:toy:lowercased-corpus] +#raw-indomain-source = $toy-data/nc-5k.$input-extension + +# point to out-of-domain parallel corpus +#outdomain-stem = [CORPUS:giga:clean-split-stem] + +# settings: number of lines sampled from the corpora to train each language model on +# (if used at all, should be small as a percentage of corpus) +#settings = "--line-count 100000" + +################################################################# +# TRANSLATION MODEL TRAINING + +[TRAINING] + +### training script to be used: either a legacy script or +# current moses training script (default) +# +script = $moses-script-dir/training/train-model.perl + +### general options +# these are options that are passed on to train-model.perl, for instance +# * "-mgiza -mgiza-cpus 8" to use mgiza instead of giza +# * "-sort-buffer-size 8G -sort-compress gzip" to reduce on-disk sorting +# * "-sort-parallel 8 -cores 8" to speed up phrase table building +# * "-parallel" for parallel execution of mkcls and giza +# +#training-options = "" + +### factored training: specify here which factors used +# if none specified, single factor training is assumed +# (one translation step, surface to surface) +# +#input-factors = word lemma pos morph +#output-factors = word lemma pos +#alignment-factors = "word -> word" +#translation-factors = "word -> word" +#reordering-factors = "word -> word" +#generation-factors = "word -> pos" +#decoding-steps = "t0, g0" + +### parallelization of data preparation step +# the two directions of the data preparation can be run in parallel +# comment out if not needed +# +parallel = yes + +### pre-computation for giza++ +# giza++ has a more efficient data structure that needs to be +# initialized with snt2cooc. if run in parallel, this may reduces +# memory requirements. 
set here the number of parts +# +#run-giza-in-parts = 5 + +### symmetrization method to obtain word alignments from giza output +# (commonly used: grow-diag-final-and) +# +alignment-symmetrization-method = grow-diag-final-and + +### use of Chris Dyer's fast align for word alignment +# +#fast-align-settings = "-d -o -v" + +### use of berkeley aligner for word alignment +# +#use-berkeley = true +#alignment-symmetrization-method = berkeley +#berkeley-train = $moses-script-dir/ems/support/berkeley-train.sh +#berkeley-process = $moses-script-dir/ems/support/berkeley-process.sh +#berkeley-jar = /your/path/to/berkeleyaligner-1.1/berkeleyaligner.jar +#berkeley-java-options = "-server -mx30000m -ea" +#berkeley-training-options = "-Main.iters 5 5 -EMWordAligner.numThreads 8" +#berkeley-process-options = "-EMWordAligner.numThreads 8" +#berkeley-posterior = 0.5 + +### use of baseline alignment model (incremental training) +# +#baseline = 68 +#baseline-alignment-model = "$working-dir/training/prepared.$baseline/$input-extension.vcb \ +# $working-dir/training/prepared.$baseline/$output-extension.vcb \ +# $working-dir/training/giza.$baseline/${output-extension}-$input-extension.cooc \ +# $working-dir/training/giza-inverse.$baseline/${input-extension}-$output-extension.cooc \ +# $working-dir/training/giza.$baseline/${output-extension}-$input-extension.thmm.5 \ +# $working-dir/training/giza.$baseline/${output-extension}-$input-extension.hhmm.5 \ +# $working-dir/training/giza-inverse.$baseline/${input-extension}-$output-extension.thmm.5 \ +# $working-dir/training/giza-inverse.$baseline/${input-extension}-$output-extension.hhmm.5" + +### if word alignment should be skipped, +# point to word alignment files +# +#word-alignment = $working-dir/model/aligned.1 + +### filtering some corpora with modified Moore-Lewis +# specify corpora to be filtered and ratio to be kept, either before or after word alignment +#mml-filter-corpora = toy +#mml-before-wa = "-proportion 0.9" +#mml-after-wa = "-proportion 0.9" + +### build memory mapped suffix array phrase table +# (binarizing the reordering table is a good idea, since filtering makes little sense) +#mmsapt = "num-features=9 pfwd=g+ pbwd=g+ smooth=0 sample=1000 workers=1" +#binarize-all = $moses-script-dir/training/binarize-model.perl + +### create a bilingual concordancer for the model +# +#biconcor = $moses-bin-dir/biconcor + +## Operation Sequence Model (OSM) +# Durrani, Schmid and Fraser. (2011): +# "A Joint Sequence Translation Model with Integrated Reordering" +# compile Moses with --max-kenlm-order=9 if higher order is required +# +#operation-sequence-model = "yes" +#operation-sequence-model-order = 5 +#operation-sequence-model-settings = "" +# +# if OSM training should be skipped, point to OSM Model +#osm-model = + +### unsupervised transliteration module +# Durrani, Sajjad, Hoang and Koehn (EACL, 2014). +# "Integrating an Unsupervised Transliteration Model +# into Statistical Machine Translation." 
+# +#transliteration-module = "yes" +#post-decoding-transliteration = "yes" + +### lexicalized reordering: specify orientation type +# (default: only distance-based reordering model) +# +lexicalized-reordering = msd-bidirectional-fe + +### hierarchical rule set +# +#hierarchical-rule-set = true + +### settings for rule extraction +# +#extract-settings = "" +max-phrase-length = 5 + +### add extracted phrases from baseline model +# +#baseline-extract = $working-dir/model/extract.$baseline +# +# requires aligned parallel corpus for re-estimating lexical translation probabilities +#baseline-corpus = $working-dir/training/corpus.$baseline +#baseline-alignment = $working-dir/model/aligned.$baseline.$alignment-symmetrization-method + +### unknown word labels (target syntax only) +# enables use of unknown word labels during decoding +# label file is generated during rule extraction +# +#use-unknown-word-labels = true + +### if phrase extraction should be skipped, +# point to stem for extract files +# +# extracted-phrases = + +### settings for rule scoring +# +score-settings = "--GoodTuring --MinScore 2:0.0001" + +### include word alignment in phrase table +# +#include-word-alignment-in-rules = yes + +### sparse lexical features +# +#sparse-features = "target-word-insertion top 50, source-word-deletion top 50, word-translation top 50 50, phrase-length" + +### domain adaptation settings +# options: sparse, any of: indicator, subset, ratio +#domain-features = "subset" + +### if phrase table training should be skipped, +# point to phrase translation table +# +# phrase-translation-table = + +### if reordering table training should be skipped, +# point to reordering table +# +# reordering-table = + +### filtering the phrase table based on significance tests +# Johnson, Martin, Foster and Kuhn. 
(2007): "Improving Translation Quality by Discarding Most of the Phrasetable" +# options: -n number of translations; -l 'a+e', 'a-e', or a positive real value -log prob threshold +#salm-index = /path/to/project/salm/Bin/Linux/Index/IndexSA.O64 +#sigtest-filter = "-l a+e -n 50" + +### if training should be skipped, +# point to a configuration file that contains +# pointers to all relevant model files +# +#config-with-reused-weights = + +##################################################### +### TUNING: finding good weights for model components + +[TUNING] + +### instead of tuning with this setting, old weights may be recycled +# specify here an old configuration file with matching weights +# +weight-config = $toy-data/weight.ini + +### tuning script to be used +# +tuning-script = $moses-script-dir/training/mert-moses.pl +tuning-settings = "-mertdir $moses-bin-dir" + +### specify the corpus used for tuning +# it should contain 1000s of sentences +# +#input-sgm = +#raw-input = +#tokenized-input = +#factorized-input = +#input = +# +#reference-sgm = +#raw-reference = +#tokenized-reference = +#factorized-reference = +#reference = + +### size of n-best list used (typically 100) +# +nbest = 100 + +### ranges for weights for random initialization +# if not specified, the tuning script will use generic ranges +# it is not clear, if this matters +# +# lambda = + +### additional flags for the filter script +# +filter-settings = "" + +### additional flags for the decoder +# +decoder-settings = "" + +### if tuning should be skipped, specify this here +# and also point to a configuration file that contains +# pointers to all relevant model files +# +#config-with-reused-weights = + +######################################################### +## RECASER: restore case, this part only trains the model + +[RECASING] IGNORE + +### training data +# raw input needs to be still tokenized, +# also also tokenized input may be specified +# +#tokenized = [LM:europarl:tokenized-corpus] + +### additinal settings +# +recasing-settings = "" +#lm-training = $srilm-dir/ngram-count +decoder = $moses-bin-dir/moses + +# already a trained recaser? 
point to config file +#recase-config = + +####################################################### +## TRUECASER: train model to truecase corpora and input + +[TRUECASER] + +### script to train truecaser models +# +trainer = $moses-script-dir/recaser/train-truecaser.perl + +### training data +# data on which truecaser is trained +# if no training data is specified, parallel corpus is used +# +# raw-stem = +# tokenized-stem = + +### trained model +# +# truecase-model = + +###################################################################### +## EVALUATION: translating a test set using the tuned system and scoring it + +[EVALUATION] + +### additional flags for the filter script +# +#filter-settings = "" + +### additional decoder settings +# switches for the Moses decoder +# common choices: +# "-threads N" for multi-threading +# "-mbr" for MBR decoding +# "-drop-unknown" for dropping unknown source words +# "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000" for cube pruning +# +decoder-settings = "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000" + +### specify size of n-best list, if produced +# +#nbest = 100 + +### multiple reference translations +# +#multiref = yes + +### prepare system output for scoring +# this may include detokenization and wrapping output in sgm +# (needed for nist-bleu, ter, meteor) +# +detokenizer = "$moses-script-dir/tokenizer/detokenizer.perl -l $output-extension" +#recaser = $moses-script-dir/recaser/recase.perl +wrapping-script = "$moses-script-dir/ems/support/wrap-xml.perl $output-extension" +#output-sgm = + +### BLEU +# +nist-bleu = $moses-script-dir/generic/mteval-v13a.pl +nist-bleu-c = "$moses-script-dir/generic/mteval-v13a.pl -c" +#multi-bleu = "$moses-script-dir/generic/multi-bleu.perl -lc" +#multi-bleu-c = $moses-script-dir/generic/multi-bleu.perl +#ibm-bleu = + +### TER: translation error rate (BBN metric) based on edit distance +# not yet integrated +# +# ter = + +### METEOR: gives credit to stem / wordnet synonym matches +# not yet integrated +# +# meteor = + +### Analysis: carry out various forms of analysis on the output +# +analysis = $moses-script-dir/ems/support/analysis.perl +# +# also report on input coverage +analyze-coverage = yes +# +# also report on phrase mappings used +report-segmentation = yes +# +# report precision of translations for each input word, broken down by +# count of input word in corpus and model +#report-precision-by-coverage = yes +# +# further precision breakdown by factor +#precision-by-coverage-factor = pos +# +# visualization of the search graph in tree-based models +#analyze-search-graph = yes + +[EVALUATION:test] + +### input data +# +input-sgm = $toy-data/test-src.$input-extension.sgm +# raw-input = +# tokenized-input = +# factorized-input = +# input = + +### reference data +# +reference-sgm = $toy-data/test-ref.$output-extension.sgm +# raw-reference = +# tokenized-reference = +# reference = + +### analysis settings +# may contain any of the general evaluation analysis settings +# specific setting: base coverage statistics on earlier run +# +#precision-by-coverage-base = $working-dir/evaluation/test.analysis.5 + +### wrapping frame +# for nist-bleu and other scoring scripts, the output needs to be wrapped +# in sgm markup (typically like the input sgm) +# +wrapping-frame = $input-sgm + +########################################## +### REPORTING: summarize evaluation scores + +[REPORTING] + +### currently no parameters for reporting section + + diff --git a/scripts/ems/example/data/weight_bilinguallm.ini 
b/scripts/ems/example/data/weight_bilinguallm.ini new file mode 100644 index 000000000..fbe26fc03 --- /dev/null +++ b/scripts/ems/example/data/weight_bilinguallm.ini @@ -0,0 +1,14 @@ +######################### +### MOSES CONFIG FILE ### +######################### + +[weight] +Distortion0= 0.3 +UnknownWordPenalty0= 1 +WordPenalty0= -1 +TranslationModel0= 0.2 0.2 0.2 0.2 +PhrasePenalty0= 0.2 +LexicalReordering0= 0.3 0.3 0.3 0.3 0.3 0.3 +LM0= 0.5 +BilingualNPLM0= 0.1 + diff --git a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta index dafbe4a42..62e38128c 100644 --- a/scripts/ems/experiment.meta +++ b/scripts/ems/experiment.meta @@ -160,6 +160,20 @@ train ignore-if: no-splitter-training [LM] multiple +prepare-bilingual-nplm + in: TRAINING:corpus TRAINING:word-alignment + out: numberized_ngrams + ignore-unless: bilingual-lm + rerun-on-change: TRAINING:corpus TRAINING:word-alignment + template: $moses-script-dir/training/bilingual-lm/extract_training.py -c IN0 -e $output-extension -f $input-extension -a IN1.$TRAINING:alignment-symmetrization-method -w $working-dir/$bilingual-lm-workdir -n $order -m $source-window $bilingual-lm-settings + default-name: LM/bilingualLM_prep +train-bilingual-lm + in: numberized_ngrams TRAINING:corpus + out: binlm + ignore-unless: bilingual-lm + rerun-on-change: numberized_ngrams + template: $moses-script-dir/training/bilingual-lm/train_nplm.py -w $working-dir/$bilingual-lm-workdir -c IN1 -r $working-dir/$nplm-output-dir -n $train_order $nplm-settings + default-name: LM/BilingualLM get-corpus in: get-corpus-script out: raw-corpus @@ -247,7 +261,7 @@ train in: stripped-corpus out: lm default-name: lm/lm - ignore-if: rlm-training custom-training + ignore-if: rlm-training custom-training bilingual-lm rerun-on-change: lm-training order settings template: $lm-training -order $order $settings -text IN -lm OUT error: cannot execute binary file @@ -293,6 +307,7 @@ binarize in: qlm out: binlm pass-unless: lm-binarizer + ignore-if: bilingual-lm rerun-on-change: lm default-name: lm/binlm template: $lm-binarizer IN OUT From 166bf7365f02cf573265d7aded822f0d08215de0 Mon Sep 17 00:00:00 2001 From: XapaJIaMnu Date: Fri, 12 Jun 2015 16:56:36 +0100 Subject: [PATCH 067/108] Forgot to update the weight config path --- scripts/ems/example/config.toy.bilinguallm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/ems/example/config.toy.bilinguallm b/scripts/ems/example/config.toy.bilinguallm index 37a34b70b..cd6880f32 100644 --- a/scripts/ems/example/config.toy.bilinguallm +++ b/scripts/ems/example/config.toy.bilinguallm @@ -483,7 +483,7 @@ score-settings = "--GoodTuring --MinScore 2:0.0001" ### instead of tuning with this setting, old weights may be recycled # specify here an old configuration file with matching weights # -weight-config = $toy-data/weight.ini +weight-config = $toy-data/weight_bilinguallm.ini ### tuning script to be used # From bd86ceffbe6d748a863e15def2443150ca360b38 Mon Sep 17 00:00:00 2001 From: Jeroen Vermeulen Date: Sat, 13 Jun 2015 21:31:53 +0700 Subject: [PATCH 068/108] Check for error when opening gzfilebuf. This replaces a segfault when a file can't be found with an exception. Not as helpful as it could be yet, but certainly better than just crashing. Also, make InputFileStream constructor from path "explicit" to avoid mistakes. 
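For illustration, a minimal self-contained sketch (not Moses code; the class and file names below are made up) of the two ideas in this patch: failing fast with an exception when an open fails, and marking the converting constructor explicit so a path string is never silently converted into a stream object.

#include <cstdio>
#include <iostream>
#include <stdexcept>
#include <string>

class FileReader
{
public:
  // 'explicit' stops silent conversions, e.g. a std::string being turned
  // into a FileReader where a function merely expects one.
  explicit FileReader(const std::string &path) : m_path(path) {
    std::FILE *f = std::fopen(path.c_str(), "rb");
    if (!f)
      // Mirror the gzfilebuf change: fail fast with an exception
      // instead of segfaulting later on a null handle.
      throw std::runtime_error("Could not open " + path + ".");
    std::fclose(f);
  }
private:
  std::string m_path;
};

void Consume(const FileReader &reader) { (void) reader; }

int main()
{
  // Consume(std::string("corpus.gz"));  // no longer compiles: the constructor is explicit
  try {
    FileReader reader(std::string("corpus.gz")); // hypothetical file name
    Consume(reader);
  } catch (const std::runtime_error &e) {
    std::cerr << e.what() << std::endl;          // clean diagnostic instead of a crash
  }
  return 0;
}
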
--- moses/InputFileStream.h | 2 +- moses/gzfilebuf.h | 3 +++ phrase-extract/InputFileStream.h | 2 +- phrase-extract/extract-mixed-syntax/InputFileStream.h | 2 +- phrase-extract/extract-mixed-syntax/gzfilebuf.h | 4 +++- phrase-extract/gzfilebuf.h | 4 +++- phrase-extract/lexical-reordering/InputFileStream.h | 2 +- phrase-extract/lexical-reordering/gzfilebuf.h | 4 +++- 8 files changed, 16 insertions(+), 7 deletions(-) diff --git a/moses/InputFileStream.h b/moses/InputFileStream.h index d53abfc23..313ddfed7 100644 --- a/moses/InputFileStream.h +++ b/moses/InputFileStream.h @@ -37,7 +37,7 @@ protected: std::streambuf *m_streambuf; public: - InputFileStream(const std::string &filePath); + explicit InputFileStream(const std::string &filePath); ~InputFileStream(); void Close(); diff --git a/moses/gzfilebuf.h b/moses/gzfilebuf.h index 2376c2875..c82092933 100644 --- a/moses/gzfilebuf.h +++ b/moses/gzfilebuf.h @@ -1,6 +1,7 @@ #ifndef moses_gzfile_buf_h #define moses_gzfile_buf_h +#include #include #include #include @@ -13,6 +14,8 @@ class gzfilebuf : public std::streambuf public: gzfilebuf(const char *filename) { _gzf = gzopen(filename, "rb"); + if (!_gzf) + throw std::runtime_error("Could not open " + std::string(filename) + "."); setg (_buff+sizeof(int), // beginning of putback area _buff+sizeof(int), // read position _buff+sizeof(int)); // end position diff --git a/phrase-extract/InputFileStream.h b/phrase-extract/InputFileStream.h index e2a31bc82..5de416237 100644 --- a/phrase-extract/InputFileStream.h +++ b/phrase-extract/InputFileStream.h @@ -37,7 +37,7 @@ protected: std::streambuf *m_streambuf; public: - InputFileStream(const std::string &filePath); + explicit InputFileStream(const std::string &filePath); ~InputFileStream(); void Close(); diff --git a/phrase-extract/extract-mixed-syntax/InputFileStream.h b/phrase-extract/extract-mixed-syntax/InputFileStream.h index e2a31bc82..5de416237 100644 --- a/phrase-extract/extract-mixed-syntax/InputFileStream.h +++ b/phrase-extract/extract-mixed-syntax/InputFileStream.h @@ -37,7 +37,7 @@ protected: std::streambuf *m_streambuf; public: - InputFileStream(const std::string &filePath); + explicit InputFileStream(const std::string &filePath); ~InputFileStream(); void Close(); diff --git a/phrase-extract/extract-mixed-syntax/gzfilebuf.h b/phrase-extract/extract-mixed-syntax/gzfilebuf.h index b5b0ce87f..4c818ddbb 100644 --- a/phrase-extract/extract-mixed-syntax/gzfilebuf.h +++ b/phrase-extract/extract-mixed-syntax/gzfilebuf.h @@ -1,6 +1,7 @@ #ifndef moses_gzfile_buf_h #define moses_gzfile_buf_h +#include #include #include #include @@ -10,7 +11,8 @@ class gzfilebuf : public std::streambuf public: gzfilebuf(const char *filename) { _gzf = gzopen(filename, "rb"); - setg (_buff+sizeof(int), // beginning of putback area + if (!_gzf) + throw std::runtime_error("Could not open " + std::string(filename) + "."); setg (_buff+sizeof(int), // beginning of putback area _buff+sizeof(int), // read position _buff+sizeof(int)); // end position } diff --git a/phrase-extract/gzfilebuf.h b/phrase-extract/gzfilebuf.h index b5b0ce87f..4c818ddbb 100644 --- a/phrase-extract/gzfilebuf.h +++ b/phrase-extract/gzfilebuf.h @@ -1,6 +1,7 @@ #ifndef moses_gzfile_buf_h #define moses_gzfile_buf_h +#include #include #include #include @@ -10,7 +11,8 @@ class gzfilebuf : public std::streambuf public: gzfilebuf(const char *filename) { _gzf = gzopen(filename, "rb"); - setg (_buff+sizeof(int), // beginning of putback area + if (!_gzf) + throw std::runtime_error("Could not open " + 
std::string(filename) + "."); setg (_buff+sizeof(int), // beginning of putback area _buff+sizeof(int), // read position _buff+sizeof(int)); // end position } diff --git a/phrase-extract/lexical-reordering/InputFileStream.h b/phrase-extract/lexical-reordering/InputFileStream.h index 1f37715fd..dcc28a60c 100755 --- a/phrase-extract/lexical-reordering/InputFileStream.h +++ b/phrase-extract/lexical-reordering/InputFileStream.h @@ -37,7 +37,7 @@ protected: std::streambuf *m_streambuf; public: - InputFileStream(const std::string &filePath); + explicit InputFileStream(const std::string &filePath); ~InputFileStream(); void Open(const std::string &filePath); diff --git a/phrase-extract/lexical-reordering/gzfilebuf.h b/phrase-extract/lexical-reordering/gzfilebuf.h index b5b0ce87f..4c818ddbb 100755 --- a/phrase-extract/lexical-reordering/gzfilebuf.h +++ b/phrase-extract/lexical-reordering/gzfilebuf.h @@ -1,6 +1,7 @@ #ifndef moses_gzfile_buf_h #define moses_gzfile_buf_h +#include #include #include #include @@ -10,7 +11,8 @@ class gzfilebuf : public std::streambuf public: gzfilebuf(const char *filename) { _gzf = gzopen(filename, "rb"); - setg (_buff+sizeof(int), // beginning of putback area + if (!_gzf) + throw std::runtime_error("Could not open " + std::string(filename) + "."); setg (_buff+sizeof(int), // beginning of putback area _buff+sizeof(int), // read position _buff+sizeof(int)); // end position } From 89c2df558ca3533369586d8d4e2c7451b2d2732e Mon Sep 17 00:00:00 2001 From: MosesAdmin Date: Sun, 14 Jun 2015 00:00:44 +0100 Subject: [PATCH 069/108] daily automatic beautifier --- phrase-extract/extract-mixed-syntax/gzfilebuf.h | 3 ++- phrase-extract/gzfilebuf.h | 3 ++- phrase-extract/lexical-reordering/gzfilebuf.h | 3 ++- 3 files changed, 6 insertions(+), 3 deletions(-) mode change 100755 => 100644 phrase-extract/lexical-reordering/gzfilebuf.h diff --git a/phrase-extract/extract-mixed-syntax/gzfilebuf.h b/phrase-extract/extract-mixed-syntax/gzfilebuf.h index 4c818ddbb..e070da306 100644 --- a/phrase-extract/extract-mixed-syntax/gzfilebuf.h +++ b/phrase-extract/extract-mixed-syntax/gzfilebuf.h @@ -12,7 +12,8 @@ public: gzfilebuf(const char *filename) { _gzf = gzopen(filename, "rb"); if (!_gzf) - throw std::runtime_error("Could not open " + std::string(filename) + "."); setg (_buff+sizeof(int), // beginning of putback area + throw std::runtime_error("Could not open " + std::string(filename) + "."); + setg (_buff+sizeof(int), // beginning of putback area _buff+sizeof(int), // read position _buff+sizeof(int)); // end position } diff --git a/phrase-extract/gzfilebuf.h b/phrase-extract/gzfilebuf.h index 4c818ddbb..e070da306 100644 --- a/phrase-extract/gzfilebuf.h +++ b/phrase-extract/gzfilebuf.h @@ -12,7 +12,8 @@ public: gzfilebuf(const char *filename) { _gzf = gzopen(filename, "rb"); if (!_gzf) - throw std::runtime_error("Could not open " + std::string(filename) + "."); setg (_buff+sizeof(int), // beginning of putback area + throw std::runtime_error("Could not open " + std::string(filename) + "."); + setg (_buff+sizeof(int), // beginning of putback area _buff+sizeof(int), // read position _buff+sizeof(int)); // end position } diff --git a/phrase-extract/lexical-reordering/gzfilebuf.h b/phrase-extract/lexical-reordering/gzfilebuf.h old mode 100755 new mode 100644 index 4c818ddbb..e070da306 --- a/phrase-extract/lexical-reordering/gzfilebuf.h +++ b/phrase-extract/lexical-reordering/gzfilebuf.h @@ -12,7 +12,8 @@ public: gzfilebuf(const char *filename) { _gzf = gzopen(filename, "rb"); if (!_gzf) - throw 
std::runtime_error("Could not open " + std::string(filename) + "."); setg (_buff+sizeof(int), // beginning of putback area + throw std::runtime_error("Could not open " + std::string(filename) + "."); + setg (_buff+sizeof(int), // beginning of putback area _buff+sizeof(int), // read position _buff+sizeof(int)); // end position } From ad8114ddb0b7af26c74680a9657c5fa6f82cf1eb Mon Sep 17 00:00:00 2001 From: Barry Haddow Date: Mon, 15 Jun 2015 16:23:12 +0100 Subject: [PATCH 070/108] capitalisation --- scripts/ems/experiment.meta | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta index 62e38128c..9edeec460 100644 --- a/scripts/ems/experiment.meta +++ b/scripts/ems/experiment.meta @@ -166,14 +166,14 @@ prepare-bilingual-nplm ignore-unless: bilingual-lm rerun-on-change: TRAINING:corpus TRAINING:word-alignment template: $moses-script-dir/training/bilingual-lm/extract_training.py -c IN0 -e $output-extension -f $input-extension -a IN1.$TRAINING:alignment-symmetrization-method -w $working-dir/$bilingual-lm-workdir -n $order -m $source-window $bilingual-lm-settings - default-name: LM/bilingualLM_prep + default-name: lm/bilingualLM_prep train-bilingual-lm in: numberized_ngrams TRAINING:corpus out: binlm ignore-unless: bilingual-lm rerun-on-change: numberized_ngrams template: $moses-script-dir/training/bilingual-lm/train_nplm.py -w $working-dir/$bilingual-lm-workdir -c IN1 -r $working-dir/$nplm-output-dir -n $train_order $nplm-settings - default-name: LM/BilingualLM + default-name: lm/bilingualLM get-corpus in: get-corpus-script out: raw-corpus From 6c0f875385ffaf827139c9a9220a2c5bfd195178 Mon Sep 17 00:00:00 2001 From: Rico Sennrich Date: Tue, 16 Jun 2015 16:19:41 +0100 Subject: [PATCH 071/108] testing the waters for c++11 please adjust your compiler options or complain if you rely on a compiler that doesn't support c++11 yet. --- Jamroot | 2 +- moses/StaticData.cpp | 3 +-- phrase-extract/ScoreFeatureTest.cpp | 20 ++++++++++---------- 3 files changed, 12 insertions(+), 13 deletions(-) diff --git a/Jamroot b/Jamroot index 119c6183e..a4957dfa2 100644 --- a/Jamroot +++ b/Jamroot @@ -108,7 +108,7 @@ external-lib z ; #lib dl : : static:static shared:shared ; #requirements += dl ; - +requirements += gcc:-std=c++0x ; if ! 
[ option.get "without-tcmalloc" : : "yes" ] && [ test_library "tcmalloc_minimal" ] { if [ option.get "full-tcmalloc" : : "yes" ] { diff --git a/moses/StaticData.cpp b/moses/StaticData.cpp index 8fb88c257..28d9f7831 100644 --- a/moses/StaticData.cpp +++ b/moses/StaticData.cpp @@ -1115,8 +1115,7 @@ void StaticData::LoadSparseWeightsFromConfig() } std::map > weights = m_parameter->GetAllWeights(); - std::map >::iterator iter; - for (iter = weights.begin(); iter != weights.end(); ++iter) { + for (auto iter = weights.begin(); iter != weights.end(); ++iter) { // this indicates that it is sparse feature if (featureNames.find(iter->first) == featureNames.end()) { UTIL_THROW_IF2(iter->second.size() != 1, "ERROR: only one weight per sparse feature allowed: " << iter->first); diff --git a/phrase-extract/ScoreFeatureTest.cpp b/phrase-extract/ScoreFeatureTest.cpp index 93a452dad..2863122dd 100644 --- a/phrase-extract/ScoreFeatureTest.cpp +++ b/phrase-extract/ScoreFeatureTest.cpp @@ -53,16 +53,16 @@ BOOST_AUTO_TEST_CASE(manager_configure_domain_except) //Check that configure rejects illegal domain arg combinations ScoreFeatureManager manager; BOOST_CHECK_THROW( - manager.configure(boost::assign::list_of("--DomainRatio")("/dev/null")("--DomainIndicator")("/dev/null")), + manager.configure({"--DomainRatio","/dev/null","--DomainIndicator","/dev/null"}), ScoreFeatureArgumentException); BOOST_CHECK_THROW( - manager.configure(boost::assign::list_of("--SparseDomainSubset")("/dev/null")("--SparseDomainRatio")("/dev/null")), + manager.configure({"--SparseDomainSubset","/dev/null","--SparseDomainRatio","/dev/null"}), ScoreFeatureArgumentException); BOOST_CHECK_THROW( - manager.configure(boost::assign::list_of("--SparseDomainBlah")("/dev/null")), + manager.configure({"--SparseDomainBlah","/dev/null"}), ScoreFeatureArgumentException); BOOST_CHECK_THROW( - manager.configure(boost::assign::list_of("--DomainSubset")), + manager.configure({"--DomainSubset"}), ScoreFeatureArgumentException); } @@ -84,16 +84,16 @@ static void checkDomainConfigured( BOOST_AUTO_TEST_CASE(manager_config_domain) { checkDomainConfigured - (boost::assign::list_of ("--DomainRatio")("/dev/null")); + ({"--DomainRatio","/dev/null"}); checkDomainConfigured - (boost::assign::list_of("--DomainIndicator")("/dev/null")); + ({"--DomainIndicator","/dev/null"}); checkDomainConfigured - (boost::assign::list_of("--DomainSubset")("/dev/null")); + ({"--DomainSubset","/dev/null"}); checkDomainConfigured - (boost::assign::list_of("--SparseDomainRatio")("/dev/null")); + ({"--SparseDomainRatio","/dev/null"}); checkDomainConfigured - (boost::assign::list_of("--SparseDomainIndicator")("/dev/null")); + ({"--SparseDomainIndicator","/dev/null"}); checkDomainConfigured - (boost::assign::list_of("--SparseDomainSubset")("/dev/null")); + ({"--SparseDomainSubset","/dev/null"}); } From 2a798c0b9f19e44c1a63c7c75f657ae15968c8d0 Mon Sep 17 00:00:00 2001 From: MosesAdmin Date: Wed, 17 Jun 2015 00:00:42 +0100 Subject: [PATCH 072/108] daily automatic beautifier --- phrase-extract/ScoreFeatureTest.cpp | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/phrase-extract/ScoreFeatureTest.cpp b/phrase-extract/ScoreFeatureTest.cpp index 2863122dd..51d4e1297 100644 --- a/phrase-extract/ScoreFeatureTest.cpp +++ b/phrase-extract/ScoreFeatureTest.cpp @@ -53,16 +53,16 @@ BOOST_AUTO_TEST_CASE(manager_configure_domain_except) //Check that configure rejects illegal domain arg combinations ScoreFeatureManager manager; BOOST_CHECK_THROW( - 
manager.configure({"--DomainRatio","/dev/null","--DomainIndicator","/dev/null"}), + manager.configure( {"--DomainRatio","/dev/null","--DomainIndicator","/dev/null"}), ScoreFeatureArgumentException); BOOST_CHECK_THROW( - manager.configure({"--SparseDomainSubset","/dev/null","--SparseDomainRatio","/dev/null"}), + manager.configure( {"--SparseDomainSubset","/dev/null","--SparseDomainRatio","/dev/null"}), ScoreFeatureArgumentException); BOOST_CHECK_THROW( - manager.configure({"--SparseDomainBlah","/dev/null"}), + manager.configure( {"--SparseDomainBlah","/dev/null"}), ScoreFeatureArgumentException); BOOST_CHECK_THROW( - manager.configure({"--DomainSubset"}), + manager.configure( {"--DomainSubset"}), ScoreFeatureArgumentException); } @@ -84,16 +84,16 @@ static void checkDomainConfigured( BOOST_AUTO_TEST_CASE(manager_config_domain) { checkDomainConfigured - ({"--DomainRatio","/dev/null"}); + ( {"--DomainRatio","/dev/null"}); checkDomainConfigured - ({"--DomainIndicator","/dev/null"}); + ( {"--DomainIndicator","/dev/null"}); checkDomainConfigured - ({"--DomainSubset","/dev/null"}); + ( {"--DomainSubset","/dev/null"}); checkDomainConfigured - ({"--SparseDomainRatio","/dev/null"}); + ( {"--SparseDomainRatio","/dev/null"}); checkDomainConfigured - ({"--SparseDomainIndicator","/dev/null"}); + ( {"--SparseDomainIndicator","/dev/null"}); checkDomainConfigured - ({"--SparseDomainSubset","/dev/null"}); + ( {"--SparseDomainSubset","/dev/null"}); } From 42c5424c86bc2f7f79b70821169dc24433e04b28 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Wed, 17 Jun 2015 10:58:47 +0400 Subject: [PATCH 073/108] 1st casualty of c++11. clang 2.6 (latest c++ compiler on osx) doesn't support list of object init --- phrase-extract/ScoreFeatureTest.cpp | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/phrase-extract/ScoreFeatureTest.cpp b/phrase-extract/ScoreFeatureTest.cpp index 51d4e1297..93a452dad 100644 --- a/phrase-extract/ScoreFeatureTest.cpp +++ b/phrase-extract/ScoreFeatureTest.cpp @@ -53,16 +53,16 @@ BOOST_AUTO_TEST_CASE(manager_configure_domain_except) //Check that configure rejects illegal domain arg combinations ScoreFeatureManager manager; BOOST_CHECK_THROW( - manager.configure( {"--DomainRatio","/dev/null","--DomainIndicator","/dev/null"}), + manager.configure(boost::assign::list_of("--DomainRatio")("/dev/null")("--DomainIndicator")("/dev/null")), ScoreFeatureArgumentException); BOOST_CHECK_THROW( - manager.configure( {"--SparseDomainSubset","/dev/null","--SparseDomainRatio","/dev/null"}), + manager.configure(boost::assign::list_of("--SparseDomainSubset")("/dev/null")("--SparseDomainRatio")("/dev/null")), ScoreFeatureArgumentException); BOOST_CHECK_THROW( - manager.configure( {"--SparseDomainBlah","/dev/null"}), + manager.configure(boost::assign::list_of("--SparseDomainBlah")("/dev/null")), ScoreFeatureArgumentException); BOOST_CHECK_THROW( - manager.configure( {"--DomainSubset"}), + manager.configure(boost::assign::list_of("--DomainSubset")), ScoreFeatureArgumentException); } @@ -84,16 +84,16 @@ static void checkDomainConfigured( BOOST_AUTO_TEST_CASE(manager_config_domain) { checkDomainConfigured - ( {"--DomainRatio","/dev/null"}); + (boost::assign::list_of ("--DomainRatio")("/dev/null")); checkDomainConfigured - ( {"--DomainIndicator","/dev/null"}); + (boost::assign::list_of("--DomainIndicator")("/dev/null")); checkDomainConfigured - ( {"--DomainSubset","/dev/null"}); + (boost::assign::list_of("--DomainSubset")("/dev/null")); checkDomainConfigured - ( 
{"--SparseDomainRatio","/dev/null"}); + (boost::assign::list_of("--SparseDomainRatio")("/dev/null")); checkDomainConfigured - ( {"--SparseDomainIndicator","/dev/null"}); + (boost::assign::list_of("--SparseDomainIndicator")("/dev/null")); checkDomainConfigured - ( {"--SparseDomainSubset","/dev/null"}); + (boost::assign::list_of("--SparseDomainSubset")("/dev/null")); } From 80f0f71d03b0348649835e674692938dc6862840 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Wed, 17 Jun 2015 11:25:27 +0400 Subject: [PATCH 074/108] Revert "1st casualty of c++11. clang 2.6 (latest c++ compiler on osx) doesn't support list of object init" This reverts commit 42c5424c86bc2f7f79b70821169dc24433e04b28. --- phrase-extract/ScoreFeatureTest.cpp | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/phrase-extract/ScoreFeatureTest.cpp b/phrase-extract/ScoreFeatureTest.cpp index 93a452dad..51d4e1297 100644 --- a/phrase-extract/ScoreFeatureTest.cpp +++ b/phrase-extract/ScoreFeatureTest.cpp @@ -53,16 +53,16 @@ BOOST_AUTO_TEST_CASE(manager_configure_domain_except) //Check that configure rejects illegal domain arg combinations ScoreFeatureManager manager; BOOST_CHECK_THROW( - manager.configure(boost::assign::list_of("--DomainRatio")("/dev/null")("--DomainIndicator")("/dev/null")), + manager.configure( {"--DomainRatio","/dev/null","--DomainIndicator","/dev/null"}), ScoreFeatureArgumentException); BOOST_CHECK_THROW( - manager.configure(boost::assign::list_of("--SparseDomainSubset")("/dev/null")("--SparseDomainRatio")("/dev/null")), + manager.configure( {"--SparseDomainSubset","/dev/null","--SparseDomainRatio","/dev/null"}), ScoreFeatureArgumentException); BOOST_CHECK_THROW( - manager.configure(boost::assign::list_of("--SparseDomainBlah")("/dev/null")), + manager.configure( {"--SparseDomainBlah","/dev/null"}), ScoreFeatureArgumentException); BOOST_CHECK_THROW( - manager.configure(boost::assign::list_of("--DomainSubset")), + manager.configure( {"--DomainSubset"}), ScoreFeatureArgumentException); } @@ -84,16 +84,16 @@ static void checkDomainConfigured( BOOST_AUTO_TEST_CASE(manager_config_domain) { checkDomainConfigured - (boost::assign::list_of ("--DomainRatio")("/dev/null")); + ( {"--DomainRatio","/dev/null"}); checkDomainConfigured - (boost::assign::list_of("--DomainIndicator")("/dev/null")); + ( {"--DomainIndicator","/dev/null"}); checkDomainConfigured - (boost::assign::list_of("--DomainSubset")("/dev/null")); + ( {"--DomainSubset","/dev/null"}); checkDomainConfigured - (boost::assign::list_of("--SparseDomainRatio")("/dev/null")); + ( {"--SparseDomainRatio","/dev/null"}); checkDomainConfigured - (boost::assign::list_of("--SparseDomainIndicator")("/dev/null")); + ( {"--SparseDomainIndicator","/dev/null"}); checkDomainConfigured - (boost::assign::list_of("--SparseDomainSubset")("/dev/null")); + ( {"--SparseDomainSubset","/dev/null"}); } From 127b860c6a7b54daa9b8808006835410510241aa Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Wed, 17 Jun 2015 11:27:50 +0400 Subject: [PATCH 075/108] false alarm. clang does support object list init. Needed to enable c++11 for all toolsets --- Jamroot | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jamroot b/Jamroot index a4957dfa2..4f76ec3ba 100644 --- a/Jamroot +++ b/Jamroot @@ -108,7 +108,7 @@ external-lib z ; #lib dl : : static:static shared:shared ; #requirements += dl ; -requirements += gcc:-std=c++0x ; +requirements += -std=c++0x ; if ! 
[ option.get "without-tcmalloc" : : "yes" ] && [ test_library "tcmalloc_minimal" ] { if [ option.get "full-tcmalloc" : : "yes" ] { From 7031992caa2bd850d2442ae99b697f01194046db Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Wed, 17 Jun 2015 11:42:46 +0400 Subject: [PATCH 076/108] use c++11 unordered set code --- phrase-extract/ScoreFeatureTest.cpp | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/phrase-extract/ScoreFeatureTest.cpp b/phrase-extract/ScoreFeatureTest.cpp index 51d4e1297..9497414be 100644 --- a/phrase-extract/ScoreFeatureTest.cpp +++ b/phrase-extract/ScoreFeatureTest.cpp @@ -25,7 +25,7 @@ #include #include -#include +#include using namespace MosesTraining; using namespace std; @@ -95,5 +95,15 @@ BOOST_AUTO_TEST_CASE(manager_config_domain) ( {"--SparseDomainIndicator","/dev/null"}); checkDomainConfigured ( {"--SparseDomainSubset","/dev/null"}); + + unordered_set s; + s.insert(4); + s.insert(7); + s.insert(4); + s.insert(1); + + for (auto i: s) { + cerr << i << " "; + } } From 425118aa5d794a43a1aff6e692c4e90c7e0f800e Mon Sep 17 00:00:00 2001 From: Barry Haddow Date: Wed, 17 Jun 2015 09:32:29 +0100 Subject: [PATCH 077/108] bugfixes - working directory --- scripts/training/train-neurallm.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/scripts/training/train-neurallm.py b/scripts/training/train-neurallm.py index 4f0e8bdaf..00da64986 100755 --- a/scripts/training/train-neurallm.py +++ b/scripts/training/train-neurallm.py @@ -187,12 +187,14 @@ def main(options): ret = subprocess.call(extraction_cmd) if ret: raise Exception("preparing neural LM failed") + options.validation_file = os.path.join( + options.working_dir, os.path.basename(options.validation_corpus)) else: options.validation_file = None - options.input_words_file = options.words_file - options.output_words_file = options.words_file + options.input_words_file = os.path.join(options.working_dir, options.words_file) + options.output_words_file = os.path.join(options.working_dir, options.words_file) options.input_vocab_size = options.vocab_size options.output_vocab_size = options.vocab_size From f29f67710e980db7f965b9b2e849b7c14dcf338d Mon Sep 17 00:00:00 2001 From: MosesAdmin Date: Thu, 18 Jun 2015 00:00:39 +0100 Subject: [PATCH 078/108] daily automatic beautifier --- phrase-extract/ScoreFeatureTest.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/phrase-extract/ScoreFeatureTest.cpp b/phrase-extract/ScoreFeatureTest.cpp index 9497414be..cc22f8630 100644 --- a/phrase-extract/ScoreFeatureTest.cpp +++ b/phrase-extract/ScoreFeatureTest.cpp @@ -95,15 +95,15 @@ BOOST_AUTO_TEST_CASE(manager_config_domain) ( {"--SparseDomainIndicator","/dev/null"}); checkDomainConfigured ( {"--SparseDomainSubset","/dev/null"}); - + unordered_set s; s.insert(4); s.insert(7); s.insert(4); s.insert(1); - - for (auto i: s) { - cerr << i << " "; + +for (auto i: s) { + cerr << i << " "; } } From 90470e878d7ee150baafbb718ee6a402f641c9a5 Mon Sep 17 00:00:00 2001 From: Phil Williams Date: Fri, 19 Jun 2015 15:58:14 +0100 Subject: [PATCH 079/108] Fix some C++11-related compilation errors (clang) --- biconcor/Vocabulary.cpp | 4 ++-- moses/TranslationModel/RuleTable/LoaderFactory.cpp | 3 +-- phrase-extract/extract-mixed-syntax/Main.cpp | 7 ++----- 3 files changed, 5 insertions(+), 9 deletions(-) diff --git a/biconcor/Vocabulary.cpp b/biconcor/Vocabulary.cpp index f0f07c97d..3879b451d 100644 --- a/biconcor/Vocabulary.cpp +++ b/biconcor/Vocabulary.cpp @@ -62,7 +62,7 @@ void 
Vocabulary::Save(const string& fileName ) const vcbFile.open( fileName.c_str(), ios::out | ios::ate | ios::trunc); if (!vcbFile) { - cerr << "Failed to open " << vcbFile << endl; + cerr << "Failed to open " << fileName << endl; exit(1); } @@ -81,7 +81,7 @@ void Vocabulary::Load(const string& fileName ) vcbFile.open(fileName.c_str()); if (!vcbFile) { - cerr << "no such file or directory: " << vcbFile << endl; + cerr << "no such file or directory: " << fileName << endl; exit(1); } diff --git a/moses/TranslationModel/RuleTable/LoaderFactory.cpp b/moses/TranslationModel/RuleTable/LoaderFactory.cpp index 66a39e3bd..5569f952c 100644 --- a/moses/TranslationModel/RuleTable/LoaderFactory.cpp +++ b/moses/TranslationModel/RuleTable/LoaderFactory.cpp @@ -40,9 +40,8 @@ std::auto_ptr RuleTableLoaderFactory::Create( { InputFileStream input(path); std::string line; - bool cont = std::getline(input, line); - if (cont) { + if (std::getline(input, line)) { std::vector tokens; Tokenize(tokens, line); if (tokens.size() == 1) { diff --git a/phrase-extract/extract-mixed-syntax/Main.cpp b/phrase-extract/extract-mixed-syntax/Main.cpp index 5d1b3e7f5..f011e6e8d 100644 --- a/phrase-extract/extract-mixed-syntax/Main.cpp +++ b/phrase-extract/extract-mixed-syntax/Main.cpp @@ -148,13 +148,10 @@ int main(int argc, char** argv) cerr << lineNum << " "; } - bool success; - success = getline(strmSource, lineSource); - if (!success) { + if (!getline(strmSource, lineSource)) { throw "Couldn't read source"; } - success = getline(strmAlignment, lineAlignment); - if (!success) { + if (!getline(strmAlignment, lineAlignment)) { throw "Couldn't read alignment"; } From 1bd10e104ce5a8e51e7336ad5bbf1c61b56a0883 Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Sun, 21 Jun 2015 18:27:56 +0200 Subject: [PATCH 080/108] workaround/cleaning for weird copy-constructor behaviour with C++11 --- .../CompactPT/BlockHashIndex.cpp | 2 +- .../LexicalReorderingTableCreator.cpp | 9 +++-- .../CompactPT/MmapAllocator.h | 12 ++++--- .../CompactPT/PhraseTableCreator.cpp | 6 ++-- .../TranslationModel/CompactPT/StringVector.h | 35 +++++++++---------- 5 files changed, 33 insertions(+), 31 deletions(-) diff --git a/moses/TranslationModel/CompactPT/BlockHashIndex.cpp b/moses/TranslationModel/CompactPT/BlockHashIndex.cpp index c90dcd6d9..27209f5bc 100644 --- a/moses/TranslationModel/CompactPT/BlockHashIndex.cpp +++ b/moses/TranslationModel/CompactPT/BlockHashIndex.cpp @@ -34,7 +34,7 @@ namespace Moses BlockHashIndex::BlockHashIndex(size_t orderBits, size_t fingerPrintBits, size_t threadsNum) : m_orderBits(orderBits), m_fingerPrintBits(fingerPrintBits), - m_fileHandle(0), m_fileHandleStart(0), m_size(0), + m_fileHandle(0), m_fileHandleStart(0), m_landmarks(true), m_size(0), m_lastSaved(-1), m_lastDropped(-1), m_numLoadedRanges(0), m_threadPool(threadsNum) { diff --git a/moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.cpp b/moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.cpp index 9fe9eec30..8e9f4fa0a 100644 --- a/moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.cpp +++ b/moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.cpp @@ -52,13 +52,12 @@ LexicalReorderingTableCreator::LexicalReorderingTableCreator( std::cerr << "Pass 1/2: Creating phrase index + Counting scores" << std::endl; m_hash.BeginSave(m_outFile); - - + if(tempfilePath.size()) { MmapAllocator allocEncoded(util::FMakeTemp(tempfilePath)); m_encodedScores = new StringVector(allocEncoded); } else { - m_encodedScores = new 
StringVector(); + m_encodedScores = new StringVector(true); } EncodeScores(); @@ -68,12 +67,12 @@ LexicalReorderingTableCreator::LexicalReorderingTableCreator( std::cerr << "Pass 2/2: Compressing scores" << std::endl; - + if(tempfilePath.size()) { MmapAllocator allocCompressed(util::FMakeTemp(tempfilePath)); m_compressedScores = new StringVector(allocCompressed); } else { - m_compressedScores = new StringVector(); + m_compressedScores = new StringVector(true); } CompressScores(); diff --git a/moses/TranslationModel/CompactPT/MmapAllocator.h b/moses/TranslationModel/CompactPT/MmapAllocator.h index 78084f883..0e04890bd 100644 --- a/moses/TranslationModel/CompactPT/MmapAllocator.h +++ b/moses/TranslationModel/CompactPT/MmapAllocator.h @@ -62,6 +62,9 @@ public: typedef std::size_t size_type; typedef std::ptrdiff_t difference_type; + MmapAllocator(MmapAllocator &&) = delete; + MmapAllocator(const MmapAllocator &&) = delete; + MmapAllocator() throw() : m_file_ptr(std::tmpfile()), m_file_desc(fileno(m_file_ptr)), m_page_size(util::SizePage()), m_map_size(0), m_data_ptr(0), @@ -151,11 +154,12 @@ public: if(!m_fixed) { util::UnmapOrThrow(p, num * sizeof(T)); } else { - size_t map_offset = (m_data_offset / m_page_size) * m_page_size; - size_t relative_offset = m_data_offset - map_offset; - util::UnmapOrThrow((pointer)((char*)p - relative_offset), num * sizeof(T)); + const size_t map_offset = (m_data_offset / m_page_size) * m_page_size; + const size_t relative_offset = m_data_offset - map_offset; + const size_t adjusted_map_size = m_map_size + relative_offset; + + util::UnmapOrThrow((pointer)((char*)p - relative_offset), adjusted_map_size); } - } void construct (pointer p, const T& value) { diff --git a/moses/TranslationModel/CompactPT/PhraseTableCreator.cpp b/moses/TranslationModel/CompactPT/PhraseTableCreator.cpp index ba1dfc578..d590ef9b3 100644 --- a/moses/TranslationModel/CompactPT/PhraseTableCreator.cpp +++ b/moses/TranslationModel/CompactPT/PhraseTableCreator.cpp @@ -130,7 +130,7 @@ PhraseTableCreator::PhraseTableCreator(std::string inPath, MmapAllocator allocCompressed(util::FMakeTemp(tempfilePath)); m_compressedTargetPhrases = new StringVector(allocCompressed); } else { - m_compressedTargetPhrases = new StringVector(); + m_compressedTargetPhrases = new StringVector(true); } CompressTargetPhrases(); @@ -203,7 +203,7 @@ void PhraseTableCreator::Save() = m_sourceSymbolsMap.begin(); it != m_sourceSymbolsMap.end(); it++) temp1[it->second] = it->first; std::sort(temp1.begin(), temp1.end()); - StringVector sourceSymbols; + StringVector sourceSymbols(true); for(std::vector::iterator it = temp1.begin(); it != temp1.end(); it++) sourceSymbols.push_back(*it); @@ -224,7 +224,7 @@ void PhraseTableCreator::Save() for(boost::unordered_map::iterator it = m_targetSymbolsMap.begin(); it != m_targetSymbolsMap.end(); it++) temp2[it->second] = it->first; - StringVector targetSymbols; + StringVector targetSymbols(true); for(std::vector::iterator it = temp2.begin(); it != temp2.end(); it++) targetSymbols.push_back(*it); diff --git a/moses/TranslationModel/CompactPT/StringVector.h b/moses/TranslationModel/CompactPT/StringVector.h index bb2bc11ef..3af970c41 100644 --- a/moses/TranslationModel/CompactPT/StringVector.h +++ b/moses/TranslationModel/CompactPT/StringVector.h @@ -147,8 +147,8 @@ public: typedef RangeIterator iterator; typedef StringIterator string_iterator; - StringVector(); - StringVector(Allocator alloc); + StringVector(bool allocate = false); + StringVector(Allocator& alloc); virtual ~StringVector() { 
delete m_charArray; @@ -203,13 +203,13 @@ public: m_memoryMapped = memoryMapped; size += std::fread(&m_sorted, sizeof(bool), 1, in) * sizeof(bool); - size += m_positions.load(in, m_memoryMapped); + size += m_positions.load(in, false); - size += loadCharArray(*m_charArray, in, m_memoryMapped); + size += loadCharArray(m_charArray, in, m_memoryMapped); return size; } - size_t loadCharArray(std::vector >& c, + size_t loadCharArray(std::vector >*& c, std::FILE* in, bool map = false) { // Can only be read into memory. Mapping not possible with std:allocator. assert(map == false); @@ -219,13 +219,13 @@ public: size_t valSize; byteSize += std::fread(&valSize, sizeof(size_t), 1, in) * sizeof(size_t); - c.resize(valSize, 0); - byteSize += std::fread(&c[0], sizeof(ValueT), valSize, in) * sizeof(ValueT); + c = new std::vector >(valSize, 0); + byteSize += std::fread(&(*c)[0], sizeof(ValueT), valSize, in) * sizeof(ValueT); return byteSize; } - size_t loadCharArray(std::vector >& c, + size_t loadCharArray(std::vector >*& c, std::FILE* in, bool map = false) { size_t byteSize = 0; @@ -235,19 +235,17 @@ public: if(map == false) { // Read data into temporary file (default constructor of MmapAllocator) // and map memory onto temporary file. Can be resized. - - c.resize(valSize, 0); - byteSize += std::fread(&c[0], sizeof(ValueT), valSize, in) * sizeof(ValueT); + c = new std::vector >(valSize, 0); + byteSize += std::fread(&(*c)[0], sizeof(ValueT), valSize, in) * sizeof(ValueT); } else { // Map it directly on specified region of file "in" starting at valPos // with length valSize * sizeof(ValueT). Mapped region cannot be resized. size_t valPos = std::ftell(in); Allocator alloc(in, valPos); - std::vector > charArrayTemp(alloc); - charArrayTemp.resize(valSize, 0); - c.swap(charArrayTemp); - + c = new std::vector >(alloc); + c->resize(valSize, 0); + byteSize += valSize * sizeof(ValueT); } @@ -369,11 +367,12 @@ OStream& operator<<(OStream &os, ValueIteratorRange cr) // StringVector template class Allocator> -StringVector::StringVector() - : m_sorted(true), m_memoryMapped(false), m_charArray(new std::vector >()) { } +StringVector::StringVector(bool allocate) + : m_sorted(true), m_memoryMapped(false), + m_charArray(allocate ? 
new std::vector >() : 0) { } template class Allocator> -StringVector::StringVector(Allocator alloc) +StringVector::StringVector(Allocator &alloc) : m_sorted(true), m_memoryMapped(false), m_charArray(new std::vector >(alloc)) { } template class Allocator> From 0f943dd9c10acf4ac0cae5b642175d763594e4b1 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Sun, 21 Jun 2015 21:16:12 +0400 Subject: [PATCH 081/108] clang compile errors --- contrib/other-builds/all.workspace | 4 ++-- contrib/other-builds/moses/moses.project | 2 +- .../CompactPT/LexicalReorderingTableCompact.cpp | 4 ++-- moses/TranslationModel/CompactPT/PhraseDecoder.cpp | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/contrib/other-builds/all.workspace b/contrib/other-builds/all.workspace index 66dafe3d2..621bafdc2 100644 --- a/contrib/other-builds/all.workspace +++ b/contrib/other-builds/all.workspace @@ -6,10 +6,10 @@ - + - + diff --git a/contrib/other-builds/moses/moses.project b/contrib/other-builds/moses/moses.project index 66e0b9bad..81072d667 100644 --- a/contrib/other-builds/moses/moses.project +++ b/contrib/other-builds/moses/moses.project @@ -814,7 +814,7 @@ - + diff --git a/moses/TranslationModel/CompactPT/LexicalReorderingTableCompact.cpp b/moses/TranslationModel/CompactPT/LexicalReorderingTableCompact.cpp index fe475507c..cd71b1776 100644 --- a/moses/TranslationModel/CompactPT/LexicalReorderingTableCompact.cpp +++ b/moses/TranslationModel/CompactPT/LexicalReorderingTableCompact.cpp @@ -78,9 +78,9 @@ GetScore(const Phrase& f, const Phrase& e, const Phrase& c) if(m_hash.GetSize() != index) { std::string scoresString; if(m_inMemory) - scoresString = m_scoresMemory[index]; + scoresString = m_scoresMemory[index].str(); else - scoresString = m_scoresMapped[index]; + scoresString = m_scoresMapped[index].str(); BitWrapper<> bitStream(scoresString); for(size_t i = 0; i < m_numScoreComponent; i++) diff --git a/moses/TranslationModel/CompactPT/PhraseDecoder.cpp b/moses/TranslationModel/CompactPT/PhraseDecoder.cpp index 3cf2f010e..54e6815a1 100644 --- a/moses/TranslationModel/CompactPT/PhraseDecoder.cpp +++ b/moses/TranslationModel/CompactPT/PhraseDecoder.cpp @@ -224,9 +224,9 @@ TargetPhraseVectorPtr PhraseDecoder::CreateTargetPhraseCollection(const Phrase & // Retrieve compressed and encoded target phrase collection std::string encodedPhraseCollection; if(m_phraseDictionary.m_inMemory) - encodedPhraseCollection = m_phraseDictionary.m_targetPhrasesMemory[sourcePhraseId]; + encodedPhraseCollection = m_phraseDictionary.m_targetPhrasesMemory[sourcePhraseId].str(); else - encodedPhraseCollection = m_phraseDictionary.m_targetPhrasesMapped[sourcePhraseId]; + encodedPhraseCollection = m_phraseDictionary.m_targetPhrasesMapped[sourcePhraseId].str(); BitWrapper<> encodedBitStream(encodedPhraseCollection); if(m_coding == PREnc && bitsLeft) From 6151003c1362f7ba12e769c3dd69bf21992ac48e Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Sun, 21 Jun 2015 19:24:43 +0200 Subject: [PATCH 082/108] Remove C++11 oddities --- moses/TranslationModel/CompactPT/MmapAllocator.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/moses/TranslationModel/CompactPT/MmapAllocator.h b/moses/TranslationModel/CompactPT/MmapAllocator.h index 0e04890bd..1d0d06f77 100644 --- a/moses/TranslationModel/CompactPT/MmapAllocator.h +++ b/moses/TranslationModel/CompactPT/MmapAllocator.h @@ -61,9 +61,6 @@ public: typedef const T& const_reference; typedef std::size_t size_type; typedef std::ptrdiff_t difference_type; - - MmapAllocator(MmapAllocator 
&&) = delete; - MmapAllocator(const MmapAllocator &&) = delete; MmapAllocator() throw() : m_file_ptr(std::tmpfile()), m_file_desc(fileno(m_file_ptr)), From e57ca5ec34c8723a73122b3e0963a1e8ff719a45 Mon Sep 17 00:00:00 2001 From: MosesAdmin Date: Mon, 22 Jun 2015 00:00:43 +0100 Subject: [PATCH 083/108] daily automatic beautifier --- .../CompactPT/LexicalReorderingTableCreator.cpp | 4 ++-- moses/TranslationModel/CompactPT/MmapAllocator.h | 4 ++-- moses/TranslationModel/CompactPT/StringVector.h | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.cpp b/moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.cpp index 8e9f4fa0a..4941d32ec 100644 --- a/moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.cpp +++ b/moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.cpp @@ -52,7 +52,7 @@ LexicalReorderingTableCreator::LexicalReorderingTableCreator( std::cerr << "Pass 1/2: Creating phrase index + Counting scores" << std::endl; m_hash.BeginSave(m_outFile); - + if(tempfilePath.size()) { MmapAllocator allocEncoded(util::FMakeTemp(tempfilePath)); m_encodedScores = new StringVector(allocEncoded); @@ -67,7 +67,7 @@ LexicalReorderingTableCreator::LexicalReorderingTableCreator( std::cerr << "Pass 2/2: Compressing scores" << std::endl; - + if(tempfilePath.size()) { MmapAllocator allocCompressed(util::FMakeTemp(tempfilePath)); m_compressedScores = new StringVector(allocCompressed); diff --git a/moses/TranslationModel/CompactPT/MmapAllocator.h b/moses/TranslationModel/CompactPT/MmapAllocator.h index 1d0d06f77..72d0c1663 100644 --- a/moses/TranslationModel/CompactPT/MmapAllocator.h +++ b/moses/TranslationModel/CompactPT/MmapAllocator.h @@ -61,7 +61,7 @@ public: typedef const T& const_reference; typedef std::size_t size_type; typedef std::ptrdiff_t difference_type; - + MmapAllocator() throw() : m_file_ptr(std::tmpfile()), m_file_desc(fileno(m_file_ptr)), m_page_size(util::SizePage()), m_map_size(0), m_data_ptr(0), @@ -154,7 +154,7 @@ public: const size_t map_offset = (m_data_offset / m_page_size) * m_page_size; const size_t relative_offset = m_data_offset - map_offset; const size_t adjusted_map_size = m_map_size + relative_offset; - + util::UnmapOrThrow((pointer)((char*)p - relative_offset), adjusted_map_size); } } diff --git a/moses/TranslationModel/CompactPT/StringVector.h b/moses/TranslationModel/CompactPT/StringVector.h index 3af970c41..aaec500f0 100644 --- a/moses/TranslationModel/CompactPT/StringVector.h +++ b/moses/TranslationModel/CompactPT/StringVector.h @@ -235,7 +235,7 @@ public: if(map == false) { // Read data into temporary file (default constructor of MmapAllocator) // and map memory onto temporary file. Can be resized. 
- c = new std::vector >(valSize, 0); + c = new std::vector >(valSize, 0); byteSize += std::fread(&(*c)[0], sizeof(ValueT), valSize, in) * sizeof(ValueT); } else { // Map it directly on specified region of file "in" starting at valPos @@ -245,7 +245,7 @@ public: Allocator alloc(in, valPos); c = new std::vector >(alloc); c->resize(valSize, 0); - + byteSize += valSize * sizeof(ValueT); } From 2a242afa346b70a6c8dc22522349300b6d28e563 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Mon, 22 Jun 2015 10:46:12 -0400 Subject: [PATCH 084/108] Didn't need header --- moses/IOWrapper.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/moses/IOWrapper.h b/moses/IOWrapper.h index f1bcefa92..e3057794f 100644 --- a/moses/IOWrapper.h +++ b/moses/IOWrapper.h @@ -61,8 +61,6 @@ POSSIBILITY OF SUCH DAMAGE. #include "moses/ChartKBestExtractor.h" #include "moses/Syntax/KBestExtractor.h" -#include "search/applied.hh" - #include namespace Moses From 0d34023aad0dbf28c28bcc17876b4016b5b1b3ea Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Wed, 24 Jun 2015 14:56:37 +0400 Subject: [PATCH 085/108] prune generation table --- misc/Jamfile | 4 ++- misc/pruneGeneration.cpp | 55 ++++++++++++++++++++++++++++++++++++++++ misc/pruneGeneration.h | 45 ++++++++++++++++++++++++++++++++ 3 files changed, 103 insertions(+), 1 deletion(-) create mode 100644 misc/pruneGeneration.cpp create mode 100644 misc/pruneGeneration.h diff --git a/misc/Jamfile b/misc/Jamfile index bfea14d58..46a18e253 100644 --- a/misc/Jamfile +++ b/misc/Jamfile @@ -14,6 +14,8 @@ exe 1-1-Extraction : 1-1-Extraction.cpp ..//boost_filesystem ../moses//moses ; exe prunePhraseTable : prunePhraseTable.cpp ..//boost_filesystem ../moses//moses ..//boost_program_options ; +exe pruneGeneration : pruneGeneration.cpp ..//boost_filesystem ../moses//moses ..//boost_program_options ; + local with-cmph = [ option.get "with-cmph" ] ; if $(with-cmph) { exe processPhraseTableMin : processPhraseTableMin.cpp ..//boost_filesystem ../moses//moses ; @@ -46,6 +48,6 @@ $(TOP)//boost_iostreams $(TOP)//boost_program_options ; -alias programs : 1-1-Extraction TMining generateSequences processLexicalTable queryLexicalTable programsMin programsProbing merge-sorted prunePhraseTable ; +alias programs : 1-1-Extraction TMining generateSequences processLexicalTable queryLexicalTable programsMin programsProbing merge-sorted prunePhraseTable pruneGeneration ; #processPhraseTable queryPhraseTable diff --git a/misc/pruneGeneration.cpp b/misc/pruneGeneration.cpp new file mode 100644 index 000000000..45873a4ac --- /dev/null +++ b/misc/pruneGeneration.cpp @@ -0,0 +1,55 @@ +#include +#include +#include +#include +#include "pruneGeneration.h" + +using namespace std; + +int main(int argc, char **argv) +{ + cerr << "Starting" << endl; + int limit = atoi(argv[1]); + + vector records; + string prevInWord; + string line; + while (getline(cin, line)) { + vector toks; + Tokenize(toks, line); + assert(toks.size() == 4); + + if (prevInWord != toks[0]) { + Output(limit, records); + records.clear(); + } + + // add new record + float prob = atof(toks[2].c_str()); + records.push_back(Rec(prob, line)); + + prevInWord = toks[0]; + } + + // last + Output(limit, records); + records.clear(); + + cerr << "Finished" << endl; +} + +void Output(int limit, vector &records) +{ + Prune(limit, records); + + for (size_t i = 0; i < limit && i < records.size(); ++i) { + const Rec &rec = records[i]; + cout << rec.line << endl; + } +} + +void Prune(int limit, std::vector &records) +{ + std::sort(records.rbegin(), records.rend()); 
+ +} diff --git a/misc/pruneGeneration.h b/misc/pruneGeneration.h new file mode 100644 index 000000000..693c5f149 --- /dev/null +++ b/misc/pruneGeneration.h @@ -0,0 +1,45 @@ +#pragma once +#include +#include + +class Rec +{ +public: + float prob; + std::string line; + + Rec(float aprob, const std::string &aline) + :prob(aprob) + ,line(aline) + {} + + inline bool operator< (const Rec &compare) const { + return prob < compare.prob; + } +}; + +//////////////////////////////////////////////////////////// + +void Output(int limit, std::vector &records); +void Prune(int limit, std::vector &records); + +//////////////////////////////////////////////////////////// +inline void Tokenize(std::vector &output + , const std::string& str + , const std::string& delimiters = " \t") +{ + // Skip delimiters at beginning. + std::string::size_type lastPos = str.find_first_not_of(delimiters, 0); + // Find first "non-delimiter". + std::string::size_type pos = str.find_first_of(delimiters, lastPos); + + while (std::string::npos != pos || std::string::npos != lastPos) { + // Found a token, add it to the vector. + output.push_back(str.substr(lastPos, pos - lastPos)); + // Skip delimiters. Note the "not_of" + lastPos = str.find_first_not_of(delimiters, pos); + // Find next "non-delimiter" + pos = str.find_first_of(delimiters, lastPos); + } +} + From bac5c2e55c1b2454328bf18207b6d9633d2b9adf Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Wed, 24 Jun 2015 16:24:12 +0400 Subject: [PATCH 086/108] compile error with gcc --- misc/pruneGeneration.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/misc/pruneGeneration.cpp b/misc/pruneGeneration.cpp index 45873a4ac..275d599df 100644 --- a/misc/pruneGeneration.cpp +++ b/misc/pruneGeneration.cpp @@ -2,6 +2,8 @@ #include #include #include +#include +#include #include "pruneGeneration.h" using namespace std; From 9936c9f264f95c02e47a6e987bea0e2026b78727 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Mon, 22 Jun 2015 10:46:12 -0400 Subject: [PATCH 087/108] Didn't need header --- moses/IOWrapper.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/moses/IOWrapper.h b/moses/IOWrapper.h index f1bcefa92..e3057794f 100644 --- a/moses/IOWrapper.h +++ b/moses/IOWrapper.h @@ -61,8 +61,6 @@ POSSIBILITY OF SUCH DAMAGE. #include "moses/ChartKBestExtractor.h" #include "moses/Syntax/KBestExtractor.h" -#include "search/applied.hh" - #include namespace Moses From d928340cd4a0a07fb8058a3a586cba2d4633c416 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Wed, 24 Jun 2015 14:34:27 +0100 Subject: [PATCH 088/108] Added context handling to TranslationRequest for moses server. 
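For illustration, a hedged client-side sketch of passing the new parameter over XML-RPC. Only the "context" key comes from this patch; the method name "translate", the "text" key, the server URL, and the xmlrpc-c clientSimple calls are assumptions about the mosesserver interface rather than part of the change itself.

#include <iostream>
#include <map>
#include <string>
#include <xmlrpc-c/base.hpp>
#include <xmlrpc-c/client_simple.hpp>

int main()
{
  // Build the single struct parameter that the server's parse_request() reads.
  std::map<std::string, xmlrpc_c::value> request;
  request["text"]    = xmlrpc_c::value_string("das ist ein kleines haus"); // assumed key
  // New in this patch: an optional context string for context-sensitive decoding.
  request["context"] = xmlrpc_c::value_string("ein haus steht am fluss .");

  xmlrpc_c::paramList params;
  params.add(xmlrpc_c::value_struct(request));

  xmlrpc_c::clientSimple client;
  xmlrpc_c::value result;
  // URL and method name are assumptions about a locally running mosesserver.
  client.call("http://localhost:8080/RPC2", "translate", params, &result);

  // 'result' now holds the server's reply struct (translation, optional n-best, ...).
  std::cout << "received a reply of XML-RPC type " << result.type() << std::endl;
  return 0;
}
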
--- moses/server/TranslationRequest.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/moses/server/TranslationRequest.cpp b/moses/server/TranslationRequest.cpp index cad3696d1..3848f81ba 100644 --- a/moses/server/TranslationRequest.cpp +++ b/moses/server/TranslationRequest.cpp @@ -1,4 +1,5 @@ #include "TranslationRequest.h" +#include "moses/ContextScope.h" #include namespace MosesServer @@ -30,6 +31,7 @@ create(xmlrpc_c::paramList const& paramList, boost::shared_ptr ret; ret.reset(new TranslationRequest(paramList,cond, mut)); ret->m_self = ret; + ret->m_scope.reset(new Moses::ContextScope); return ret; } @@ -270,7 +272,10 @@ parse_request(std::map const& params) if (si != params.end()) m_nbestSize = xmlrpc_c::value_int(si->second); - + si = params.find("context"); + if (si != params.end()) { + m_context_string = xmlrpc_c::value_string(si->second); + } // // biased sampling for suffix-array-based sampling phrase table? // if ((si = params.find("bias")) != params.end()) // { From 555f91eb7ec79cc69e1b18889fd17217d3425389 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Wed, 24 Jun 2015 18:31:05 +0400 Subject: [PATCH 089/108] codelite --- contrib/other-builds/all.workspace | 5 +- .../other-builds/moses-cmd/moses-cmd.project | 14 +-- contrib/other-builds/moses/moses.project | 4 +- .../pruneGeneration/pruneGeneration.project | 97 +++++++++++++++++++ misc/pruneGeneration.cpp | 7 +- misc/pruneGeneration.h | 1 - 6 files changed, 111 insertions(+), 17 deletions(-) create mode 100644 contrib/other-builds/pruneGeneration/pruneGeneration.project diff --git a/contrib/other-builds/all.workspace b/contrib/other-builds/all.workspace index 621bafdc2..5a7eaf114 100644 --- a/contrib/other-builds/all.workspace +++ b/contrib/other-builds/all.workspace @@ -9,7 +9,8 @@ - + + @@ -23,6 +24,7 @@ + @@ -36,6 +38,7 @@ + diff --git a/contrib/other-builds/moses-cmd/moses-cmd.project b/contrib/other-builds/moses-cmd/moses-cmd.project index ac567ffce..44a0d621f 100644 --- a/contrib/other-builds/moses-cmd/moses-cmd.project +++ b/contrib/other-builds/moses-cmd/moses-cmd.project @@ -26,13 +26,6 @@ - - - - - - - @@ -150,4 +143,11 @@
+ + + + + + + diff --git a/contrib/other-builds/moses/moses.project b/contrib/other-builds/moses/moses.project index 81072d667..0fbd942c6 100644 --- a/contrib/other-builds/moses/moses.project +++ b/contrib/other-builds/moses/moses.project @@ -793,8 +793,6 @@ - - @@ -897,4 +895,6 @@
+ + diff --git a/contrib/other-builds/pruneGeneration/pruneGeneration.project b/contrib/other-builds/pruneGeneration/pruneGeneration.project new file mode 100644 index 000000000..7060d55ea --- /dev/null +++ b/contrib/other-builds/pruneGeneration/pruneGeneration.project @@ -0,0 +1,97 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + None + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + None + + + + + + + + + + + + + + + + + + + diff --git a/misc/pruneGeneration.cpp b/misc/pruneGeneration.cpp index 275d599df..19ae2184f 100644 --- a/misc/pruneGeneration.cpp +++ b/misc/pruneGeneration.cpp @@ -42,7 +42,7 @@ int main(int argc, char **argv) void Output(int limit, vector &records) { - Prune(limit, records); + std::sort(records.rbegin(), records.rend()); for (size_t i = 0; i < limit && i < records.size(); ++i) { const Rec &rec = records[i]; @@ -50,8 +50,3 @@ void Output(int limit, vector &records) } } -void Prune(int limit, std::vector &records) -{ - std::sort(records.rbegin(), records.rend()); - -} diff --git a/misc/pruneGeneration.h b/misc/pruneGeneration.h index 693c5f149..470e607d4 100644 --- a/misc/pruneGeneration.h +++ b/misc/pruneGeneration.h @@ -21,7 +21,6 @@ public: //////////////////////////////////////////////////////////// void Output(int limit, std::vector &records); -void Prune(int limit, std::vector &records); //////////////////////////////////////////////////////////// inline void Tokenize(std::vector &output From dce0f33270bd6e169850a9337141c5af39f3f765 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Wed, 24 Jun 2015 18:35:59 +0400 Subject: [PATCH 090/108] prune generation table in ems --- scripts/ems/experiment.meta | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta index 9edeec460..4177f967e 100644 --- a/scripts/ems/experiment.meta +++ b/scripts/ems/experiment.meta @@ -739,6 +739,14 @@ build-generation-custom ignore-unless: AND generation-factors generation-corpus default-name: model/generation-table final-model: yes +generation-prune + in: generation-table + out: generation-table-pruned + rerun-on-change: TRAINING:prune-generation + pass-unless: AND TRAINING:prune-generation + default-name: model/generation-table-pruned + final-model: yes + template: $TRAINING:prune-generation < IN > OUT build-sparse in: corpus-mml-postfilter=OR=corpus-mml-prefilter=OR=corpus out: sparse @@ -747,7 +755,7 @@ build-sparse default-name: model/sparse-features template: $moses-script-dir/ems/support/build-sparse-features.perl IN $input-extension $output-extension OUT "$sparse-features" create-config - in: sigtest-filter-reordering-table sigtest-filter-phrase-translation-table transliteration-table generation-table sparse corpus-mml-prefilter=OR=corpus-mml-postfilter=OR=domains osm-model INTERPOLATED-LM:binlm LM:binlm + in: sigtest-filter-reordering-table sigtest-filter-phrase-translation-table transliteration-table generation-table-pruned sparse corpus-mml-prefilter=OR=corpus-mml-postfilter=OR=domains osm-model INTERPOLATED-LM:binlm LM:binlm out: config ignore-if: use-hiero thot rerun-on-change: decoding-steps alignment-factors translation-factors reordering-factors generation-factors lexicalized-reordering training-options script decoding-graph-backoff score-settings additional-ini mmsapt no-glue-grammar dont-tune-glue-grammar use-syntax-input-weight-feature From 78b2810cfe52d0a7246c4c376e32e4f1bc321577 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Wed, 
24 Jun 2015 18:09:22 +0100 Subject: [PATCH 091/108] Allow context server to use ports other than 80. --- .../TranslationModel/UG/mm/ug_http_client.cc | 34 +++++++++++++------ moses/TranslationModel/UG/mm/ug_http_client.h | 10 ++++-- 2 files changed, 31 insertions(+), 13 deletions(-) diff --git a/moses/TranslationModel/UG/mm/ug_http_client.cc b/moses/TranslationModel/UG/mm/ug_http_client.cc index 1d6d70edb..1bbb93b23 100644 --- a/moses/TranslationModel/UG/mm/ug_http_client.cc +++ b/moses/TranslationModel/UG/mm/ug_http_client.cc @@ -7,28 +7,40 @@ std::string http_client::content() const { return m_content.str(); } http_client:: http_client(boost::asio::io_service& io_service, - const std::string& server, const std::string& path) + std::string const& server, + std::string const& port, + std::string const& path) : resolver_(io_service), socket_(io_service) { - init(server,path); + init(server, port, path); } - + http_client:: http_client(boost::asio::io_service& io_service, std::string url) : resolver_(io_service), socket_(io_service) { - size_t p = url.find("://"); - if (p < url.size()) url.erase(0,p+3); - p = url.find("/"); + std::string server; + std::string path = "/"; + std::string port = "http"; + size_t p = url.find("://"), q; if (p < url.size()) - init(url.substr(0,p),url.substr(p)); - else - init(url,"/"); + { + port = url.substr(0,p); + url.erase(0, p+3); + } + p = std::min(url.find_first_of(":/"), url.size()); + q = std::min(url.find("/"), url.size()); + if (p < url.size() && url[p] == ':') + port = url.substr(p,q-p); + server = url.substr(0,p); + if (q < url.size()) + path = url.substr(q); + init(server, port, path); } void http_client:: -init(std::string const& server, std::string const& path) +init(std::string const& server, std::string const& port, std::string const& path) { // Form the request. We specify the "Connection: close" header so // that the server will close the socket after transmitting the @@ -43,7 +55,7 @@ init(std::string const& server, std::string const& path) // Start an asynchronous resolve to translate the server and service names // into a list of endpoints. 
- tcp::resolver::query query(server, "http"); + tcp::resolver::query query(server, port); resolver_.async_resolve(query, boost::bind(&http_client::handle_resolve, this, boost::asio::placeholders::error, diff --git a/moses/TranslationModel/UG/mm/ug_http_client.h b/moses/TranslationModel/UG/mm/ug_http_client.h index 53ee258f9..825c0c37e 100644 --- a/moses/TranslationModel/UG/mm/ug_http_client.h +++ b/moses/TranslationModel/UG/mm/ug_http_client.h @@ -35,9 +35,15 @@ class http_client public: http_client(boost::asio::io_service& io_service, std::string url); http_client(boost::asio::io_service& io_service, - const std::string& server, const std::string& path); + std::string const& server, + std::string const& port, + std::string const& path); private: - void init(std::string const& server, std::string const& path); + + void init(std::string const& server, + std::string const& port, + std::string const& path); + void handle_resolve(const boost::system::error_code& err, tcp::resolver::iterator endpoint_iterator); void handle_connect(const boost::system::error_code& err, From 4ec69fbfdff104218db16c9c1ba8c8c381c331c3 Mon Sep 17 00:00:00 2001 From: MosesAdmin Date: Thu, 25 Jun 2015 00:00:42 +0100 Subject: [PATCH 092/108] daily automatic beautifier --- misc/pruneGeneration.cpp | 10 +++++----- misc/pruneGeneration.h | 8 ++++---- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/misc/pruneGeneration.cpp b/misc/pruneGeneration.cpp index 19ae2184f..8207e287f 100644 --- a/misc/pruneGeneration.cpp +++ b/misc/pruneGeneration.cpp @@ -12,7 +12,7 @@ int main(int argc, char **argv) { cerr << "Starting" << endl; int limit = atoi(argv[1]); - + vector records; string prevInWord; string line; @@ -20,12 +20,12 @@ int main(int argc, char **argv) vector toks; Tokenize(toks, line); assert(toks.size() == 4); - + if (prevInWord != toks[0]) { Output(limit, records); records.clear(); } - + // add new record float prob = atof(toks[2].c_str()); records.push_back(Rec(prob, line)); @@ -37,13 +37,13 @@ int main(int argc, char **argv) Output(limit, records); records.clear(); - cerr << "Finished" << endl; + cerr << "Finished" << endl; } void Output(int limit, vector &records) { std::sort(records.rbegin(), records.rend()); - + for (size_t i = 0; i < limit && i < records.size(); ++i) { const Rec &rec = records[i]; cout << rec.line << endl; diff --git a/misc/pruneGeneration.h b/misc/pruneGeneration.h index 470e607d4..dae5958f8 100644 --- a/misc/pruneGeneration.h +++ b/misc/pruneGeneration.h @@ -7,12 +7,12 @@ class Rec public: float prob; std::string line; - + Rec(float aprob, const std::string &aline) - :prob(aprob) - ,line(aline) + :prob(aprob) + ,line(aline) {} - + inline bool operator< (const Rec &compare) const { return prob < compare.prob; } From c80df1212ede1c8db39fbd5fe21f11d8f2ea60f7 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Thu, 25 Jun 2015 10:48:35 +0400 Subject: [PATCH 093/108] prune multiple files at once. 
Make up for failure in ems to give the full path of the gen table --- .../other-builds/OnDiskPt/OnDiskPt.project | 31 ++++++++++---- .../extract-mixed-syntax.project | 40 ++++++++++++++----- contrib/other-builds/extract/extract.project | 31 ++++++++++---- contrib/other-builds/lm/lm.project | 31 ++++++++++---- .../other-builds/moses-cmd/moses-cmd.project | 32 +++++++-------- .../pruneGeneration/pruneGeneration.project | 9 +++-- contrib/other-builds/score/score.project | 30 +++++++------- contrib/other-builds/search/search.project | 14 +++---- contrib/other-builds/util/util.project | 12 +++--- misc/pruneGeneration.cpp | 18 ++++++--- misc/pruneGeneration.h | 4 +- 11 files changed, 165 insertions(+), 87 deletions(-) diff --git a/contrib/other-builds/OnDiskPt/OnDiskPt.project b/contrib/other-builds/OnDiskPt/OnDiskPt.project index 06f80d233..3a89ec832 100644 --- a/contrib/other-builds/OnDiskPt/OnDiskPt.project +++ b/contrib/other-builds/OnDiskPt/OnDiskPt.project @@ -1,5 +1,22 @@ + + + + + + + + @@ -27,6 +44,8 @@ + + @@ -40,9 +59,9 @@ - - - + + + @@ -72,7 +91,7 @@ - + @@ -110,7 +129,7 @@ - + @@ -118,6 +137,4 @@ - - diff --git a/contrib/other-builds/extract-mixed-syntax/extract-mixed-syntax.project b/contrib/other-builds/extract-mixed-syntax/extract-mixed-syntax.project index 83c652f8c..87d76689a 100644 --- a/contrib/other-builds/extract-mixed-syntax/extract-mixed-syntax.project +++ b/contrib/other-builds/extract-mixed-syntax/extract-mixed-syntax.project @@ -1,5 +1,22 @@ + + + + + + + + @@ -43,6 +60,10 @@ + + + + @@ -56,13 +77,14 @@ - - - + + + - - + + + @@ -94,7 +116,7 @@ - + @@ -133,7 +155,7 @@ - + @@ -141,8 +163,4 @@ - - - - diff --git a/contrib/other-builds/extract/extract.project b/contrib/other-builds/extract/extract.project index ac74607f2..d86e89035 100644 --- a/contrib/other-builds/extract/extract.project +++ b/contrib/other-builds/extract/extract.project @@ -1,5 +1,22 @@ + + + + + + + + @@ -13,6 +30,8 @@ + + @@ -26,11 +45,11 @@ - - + + - + @@ -60,7 +79,7 @@ - + @@ -99,7 +118,7 @@ - + @@ -107,6 +126,4 @@ - - diff --git a/contrib/other-builds/lm/lm.project b/contrib/other-builds/lm/lm.project index a184fe3d1..c30ebe533 100644 --- a/contrib/other-builds/lm/lm.project +++ b/contrib/other-builds/lm/lm.project @@ -1,5 +1,22 @@ + + + + + + + + @@ -27,6 +44,8 @@ + + @@ -40,9 +59,9 @@ - - - + + + @@ -72,7 +91,7 @@ - + @@ -110,7 +129,7 @@ - + @@ -118,6 +137,4 @@ - - diff --git a/contrib/other-builds/moses-cmd/moses-cmd.project b/contrib/other-builds/moses-cmd/moses-cmd.project index 44a0d621f..5303ba7c7 100644 --- a/contrib/other-builds/moses-cmd/moses-cmd.project +++ b/contrib/other-builds/moses-cmd/moses-cmd.project @@ -26,6 +26,13 @@ + + + + + + + @@ -39,20 +46,20 @@ - - - + + + - - - - - - + + + + + + @@ -143,11 +150,4 @@ - - - - - - - diff --git a/contrib/other-builds/pruneGeneration/pruneGeneration.project b/contrib/other-builds/pruneGeneration/pruneGeneration.project index 7060d55ea..39109197a 100644 --- a/contrib/other-builds/pruneGeneration/pruneGeneration.project +++ b/contrib/other-builds/pruneGeneration/pruneGeneration.project @@ -2,6 +2,10 @@ + + + + @@ -15,6 +19,7 @@ + @@ -90,8 +95,4 @@ - - - - diff --git a/contrib/other-builds/score/score.project b/contrib/other-builds/score/score.project index c88df0e78..08e0b9414 100644 --- a/contrib/other-builds/score/score.project +++ b/contrib/other-builds/score/score.project @@ -19,6 +19,10 @@ + + + + @@ -32,17 +36,17 @@ - - - + + + - - - - - - + + + + + + @@ -86,7 +90,7 @@ - + @@ -125,7 +129,7 @@ - + @@ -133,8 +137,4 @@ - - - - diff --git 
a/contrib/other-builds/search/search.project b/contrib/other-builds/search/search.project index d96252a89..8be29fd1d 100644 --- a/contrib/other-builds/search/search.project +++ b/contrib/other-builds/search/search.project @@ -10,6 +10,8 @@ + + @@ -23,9 +25,9 @@ - - - + + + @@ -55,7 +57,7 @@ - + @@ -93,7 +95,7 @@ - + @@ -101,6 +103,4 @@ - - diff --git a/contrib/other-builds/util/util.project b/contrib/other-builds/util/util.project index 1006ddb52..4bb27306e 100644 --- a/contrib/other-builds/util/util.project +++ b/contrib/other-builds/util/util.project @@ -62,6 +62,8 @@ + + @@ -75,8 +77,8 @@ - - + + @@ -105,7 +107,7 @@ - + @@ -143,7 +145,7 @@ - + @@ -151,6 +153,4 @@ - - diff --git a/misc/pruneGeneration.cpp b/misc/pruneGeneration.cpp index 8207e287f..98b21530c 100644 --- a/misc/pruneGeneration.cpp +++ b/misc/pruneGeneration.cpp @@ -1,10 +1,10 @@ #include #include -#include #include #include #include #include "pruneGeneration.h" +#include "moses/InputFileStream.h" using namespace std; @@ -13,16 +13,23 @@ int main(int argc, char **argv) cerr << "Starting" << endl; int limit = atoi(argv[1]); + Process(limit, cin, cout); + + cerr << "Finished" << endl; +} + +void Process(int limit, istream &inStrme, ostream &outStrme) +{ vector records; string prevInWord; string line; - while (getline(cin, line)) { + while (getline(inStrme, line)) { vector toks; Tokenize(toks, line); assert(toks.size() == 4); if (prevInWord != toks[0]) { - Output(limit, records); + Output(outStrme, records, limit); records.clear(); } @@ -34,13 +41,12 @@ int main(int argc, char **argv) } // last - Output(limit, records); + Output(outStrme, records, limit); records.clear(); - cerr << "Finished" << endl; } -void Output(int limit, vector &records) +void Output(ostream &outStrme, vector &records, int limit) { std::sort(records.rbegin(), records.rend()); diff --git a/misc/pruneGeneration.h b/misc/pruneGeneration.h index dae5958f8..b22d09869 100644 --- a/misc/pruneGeneration.h +++ b/misc/pruneGeneration.h @@ -1,6 +1,7 @@ #pragma once #include #include +#include class Rec { @@ -20,7 +21,8 @@ public: //////////////////////////////////////////////////////////// -void Output(int limit, std::vector &records); +void Process(int limit, std::istream &inStrme, std::ostream &outStrme); +void Output(std::ostream &outStrme, std::vector &records, int limit); //////////////////////////////////////////////////////////// inline void Tokenize(std::vector &output From 930dce10bff821431213441fa1c07c1195d916b9 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Thu, 25 Jun 2015 13:02:29 +0400 Subject: [PATCH 094/108] prune multiple files at once. 
Make up for failure in ems to give the full path of the gen table --- contrib/other-builds/moses/moses.project | 2 + .../pruneGeneration/pruneGeneration.project | 27 +++++- misc/pruneGeneration.cpp | 44 ++++++++- moses/OutputFileStream.cpp | 90 +++++++++++++++++++ moses/OutputFileStream.h | 81 +++++++++++++++++ 5 files changed, 241 insertions(+), 3 deletions(-) create mode 100644 moses/OutputFileStream.cpp create mode 100644 moses/OutputFileStream.h diff --git a/contrib/other-builds/moses/moses.project b/contrib/other-builds/moses/moses.project index 0fbd942c6..0ceb40723 100644 --- a/contrib/other-builds/moses/moses.project +++ b/contrib/other-builds/moses/moses.project @@ -775,6 +775,8 @@ + + diff --git a/contrib/other-builds/pruneGeneration/pruneGeneration.project b/contrib/other-builds/pruneGeneration/pruneGeneration.project index 39109197a..6f8a6adf5 100644 --- a/contrib/other-builds/pruneGeneration/pruneGeneration.project +++ b/contrib/other-builds/pruneGeneration/pruneGeneration.project @@ -1,5 +1,22 @@ + + + + + + + + @@ -20,8 +37,16 @@ + - + + + + + + + + diff --git a/misc/pruneGeneration.cpp b/misc/pruneGeneration.cpp index 98b21530c..e436263e9 100644 --- a/misc/pruneGeneration.cpp +++ b/misc/pruneGeneration.cpp @@ -3,8 +3,10 @@ #include #include #include +#include #include "pruneGeneration.h" #include "moses/InputFileStream.h" +#include "moses/OutputFileStream.h" using namespace std; @@ -12,8 +14,46 @@ int main(int argc, char **argv) { cerr << "Starting" << endl; int limit = atoi(argv[1]); + string inPathStem = argv[2]; + string outPathStem = argv[3]; - Process(limit, cin, cout); + namespace fs = boost::filesystem; + + //cerr << "inPathStem=" << inPathStem << endl; + fs::path p(inPathStem); + fs::path dir = p.parent_path(); + //cerr << "dir=" << dir << endl; + + fs::path fileStem = p.filename(); + string fileStemStr = fileStem.native(); + size_t fileStemStrSize = fileStemStr.size(); + //cerr << "fileStem=" << fileStemStr << endl; + + // loop thru each file in directory + fs::directory_iterator end_iter; + for( fs::directory_iterator dir_iter(dir) ; dir_iter != end_iter ; ++dir_iter) { + if (fs::is_regular_file(dir_iter->status())) { + fs::path currPath = *dir_iter; + string currPathStr = currPath.native(); + //cerr << "currPathStr=" << currPathStr << endl; + + fs::path currFile = currPath.filename(); + string currFileStr = currFile.native(); + + if (currFileStr.find(fileStemStr) == 0) { + // found gen table we need + //cerr << "found=" << currPathStr << endl; + string suffix = currFileStr.substr(fileStemStrSize, currFileStr.size() - fileStemStrSize); + string outPath = outPathStem + suffix; + cerr << "PRUNING " << currPathStr << " TO " << outPath << endl; + + Moses::InputFileStream inStrme(currPathStr); + Moses::OutputFileStream outStrme(outPath); + Process(limit, inStrme, outStrme); + + } + } + } cerr << "Finished" << endl; } @@ -52,7 +92,7 @@ void Output(ostream &outStrme, vector &records, int limit) for (size_t i = 0; i < limit && i < records.size(); ++i) { const Rec &rec = records[i]; - cout << rec.line << endl; + outStrme << rec.line << endl; } } diff --git a/moses/OutputFileStream.cpp b/moses/OutputFileStream.cpp new file mode 100644 index 000000000..d7874b06f --- /dev/null +++ b/moses/OutputFileStream.cpp @@ -0,0 +1,90 @@ +// $Id: OutputFileStream.cpp 2780 2010-01-29 17:11:17Z bojar $ + +/*********************************************************************** + Moses - factored phrase-based language decoder + Copyright (C) 2006 University of Edinburgh + + This library is 
free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + ***********************************************************************/ + +#include +#include +#include +#include "OutputFileStream.h" +#include "gzfilebuf.h" + +using namespace std; +using namespace boost::algorithm; + +namespace Moses +{ +OutputFileStream::OutputFileStream() + :boost::iostreams::filtering_ostream() + ,m_outFile(NULL) + ,m_open(false) +{ +} + +OutputFileStream::OutputFileStream(const std::string &filePath) + :m_outFile(NULL) + ,m_open(false) +{ + Open(filePath); +} + +OutputFileStream::~OutputFileStream() +{ + Close(); +} + +bool OutputFileStream::Open(const std::string &filePath) +{ + assert(!m_open); + if (filePath == std::string("-")) { + // Write to standard output. Leave m_outFile null. + this->push(std::cout); + } else { + m_outFile = new ofstream(filePath.c_str(), ios_base::out | ios_base::binary); + if (m_outFile->fail()) { + return false; + } + + if (ends_with(filePath, ".gz")) { + this->push(boost::iostreams::gzip_compressor()); + } + this->push(*m_outFile); + } + + m_open = true; + return true; +} + +void OutputFileStream::Close() +{ + if (!m_open) return; + this->flush(); + if (m_outFile) { + this->pop(); // file + + m_outFile->close(); + delete m_outFile; + m_outFile = NULL; + } + m_open = false; +} + + +} + diff --git a/moses/OutputFileStream.h b/moses/OutputFileStream.h new file mode 100644 index 000000000..b77741a73 --- /dev/null +++ b/moses/OutputFileStream.h @@ -0,0 +1,81 @@ +// $Id: InputFileStream.h 2939 2010-02-24 11:15:44Z jfouet $ + +/*********************************************************************** + Moses - factored phrase-based language decoder + Copyright (C) 2006 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + ***********************************************************************/ + +#pragma once + +#include +#include +#include +#include +#include + +namespace Moses +{ + +/** Version of std::ostream with transparent compression. + * + * Transparently compresses output when writing to a file whose name ends in + * ".gz". Or, writes to stdout instead of a file when given a filename + * consisting of just a dash ("-"). 
+ */ +class OutputFileStream : public boost::iostreams::filtering_ostream +{ +private: + /** File that needs flushing & closing when we close this stream. + * + * Is NULL when no file is opened, e.g. when writing to standard output. + */ + std::ofstream *m_outFile; + + /// Is this stream open? + bool m_open; + +public: + /** Create an unopened OutputFileStream. + * + * Until it's been opened, nothing can be done with this stream. + */ + OutputFileStream(); + + /// Create an OutputFileStream, and open it by calling Open(). + OutputFileStream(const std::string &filePath); + virtual ~OutputFileStream(); + + // TODO: Can we please just always throw an exception when this fails? + /** Open stream. + * + * If filePath is "-" (just a dash), this opens the stream for writing to + * standard output. Otherwise, it opens the given file. If the filename + * has the ".gz" suffix, output will be transparently compressed. + * + * Call Close() to close the file. + * + * Returns whether opening the file was successful. It may also throw an + * exception on failure. + */ + bool Open(const std::string &filePath); + + /// Flush and close stream. After this, the stream can be opened again. + void Close(); +}; + +} + From b83803203e94535aa4405df244ccbd32ab80ed34 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Thu, 25 Jun 2015 18:10:31 +0400 Subject: [PATCH 095/108] prune generation table in ems --- scripts/ems/experiment.meta | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta index 4177f967e..110ab39b7 100644 --- a/scripts/ems/experiment.meta +++ b/scripts/ems/experiment.meta @@ -746,7 +746,7 @@ generation-prune pass-unless: AND TRAINING:prune-generation default-name: model/generation-table-pruned final-model: yes - template: $TRAINING:prune-generation < IN > OUT + template: $TRAINING:prune-generation IN OUT build-sparse in: corpus-mml-postfilter=OR=corpus-mml-prefilter=OR=corpus out: sparse From 22cc22064c3cfcd6a762ebf8e597a3ed13642814 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Thu, 25 Jun 2015 15:17:26 +0100 Subject: [PATCH 096/108] Changed implementation of indocs (to keep track of which documents phrases come from) from vector to map. --- .../UG/mm/ug_bitext_agenda_job.h | 17 +++++++++++++---- .../TranslationModel/UG/mm/ug_bitext_jstats.cc | 2 +- moses/TranslationModel/UG/mm/ug_bitext_jstats.h | 3 ++- .../TranslationModel/UG/mm/ug_bitext_pstats.cc | 2 +- moses/TranslationModel/UG/mm/ug_bitext_pstats.h | 4 ++-- moses/TranslationModel/UG/mm/ug_phrasepair.h | 11 +++++++---- 6 files changed, 26 insertions(+), 13 deletions(-) diff --git a/moses/TranslationModel/UG/mm/ug_bitext_agenda_job.h b/moses/TranslationModel/UG/mm/ug_bitext_agenda_job.h index 0e0624351..36b9873e0 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext_agenda_job.h +++ b/moses/TranslationModel/UG/mm/ug_bitext_agenda_job.h @@ -137,7 +137,10 @@ int Bitext::agenda::job float p = (*m_bias)[sid]; id_type docid = m_bias->GetClass(sid); - uint32_t k = docid < stats->indoc.size() ? stats->indoc[docid] : 0; + + // uint32_t k = docid < stats->indoc.size() ? stats->indoc[docid] : 0; + std::map::const_iterator m = stats->indoc.find(docid); + uint32_t k = m != stats->indoc.end() ? 
m->second : 0 ; // always consider candidates from dominating documents and // from documents that have not been considered at all yet @@ -159,11 +162,17 @@ int Bitext::agenda::job e = root->getCorpus()->sntEnd(sid); *log << docid << ":" << sid << " " << size_t(k) << "/" << N << " @" << p << " => " << d << " ["; - for (size_t i = 0; i < stats->indoc.size(); ++i) + for (std::map::const_iterator m = stats->indoc.begin(); + m != stats->indoc.end(); ++m) { - if (i) *log << " "; - *log << stats->indoc[i]; + if (m != stats->indoc.begin()) *log << " "; + *log << m->first << ":" << m->second; } + // for (size_t i = 0; i < stats->indoc.size(); ++i) + // { + // if (i) *log << " "; + // *log << stats->indoc[i]; + // } *log << "] "; for (; x < e; ++x) *log << (*m_bitext->V1)[x->id()] << " "; if (!ret) *log << "SKIP"; diff --git a/moses/TranslationModel/UG/mm/ug_bitext_jstats.cc b/moses/TranslationModel/UG/mm/ug_bitext_jstats.cc index bcda9ebf3..517caf783 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext_jstats.cc +++ b/moses/TranslationModel/UG/mm/ug_bitext_jstats.cc @@ -76,7 +76,7 @@ namespace Moses ++obwd[bwd_orient]; if (docid >= 0) { - while (int(indoc.size()) <= docid) indoc.push_back(0); + // while (int(indoc.size()) <= docid) indoc.push_back(0); ++indoc[docid]; } } diff --git a/moses/TranslationModel/UG/mm/ug_bitext_jstats.h b/moses/TranslationModel/UG/mm/ug_bitext_jstats.h index dade27649..03b231487 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext_jstats.h +++ b/moses/TranslationModel/UG/mm/ug_bitext_jstats.h @@ -27,7 +27,8 @@ namespace Moses uint32_t obwd[Moses::LRModel::NONE+1]; // backward distortion type counts public: - vector indoc; // counts origin of samples (for biased sampling) + std::map indoc; + // vector indoc; // counts origin of samples (for biased sampling) jstats(); jstats(jstats const& other); uint32_t rcnt() const; // raw joint counts diff --git a/moses/TranslationModel/UG/mm/ug_bitext_pstats.cc b/moses/TranslationModel/UG/mm/ug_bitext_pstats.cc index 580d7669b..8702d9c50 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext_pstats.cc +++ b/moses/TranslationModel/UG/mm/ug_bitext_pstats.cc @@ -58,7 +58,7 @@ namespace Moses ++obwd[po_bwd]; if (docid >= 0) { - while (int(indoc.size()) <= docid) indoc.push_back(0); + // while (int(indoc.size()) <= docid) indoc.push_back(0); ++indoc[docid]; } } diff --git a/moses/TranslationModel/UG/mm/ug_bitext_pstats.h b/moses/TranslationModel/UG/mm/ug_bitext_pstats.h index 9a14e378b..e5cf4ab26 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext_pstats.h +++ b/moses/TranslationModel/UG/mm/ug_bitext_pstats.h @@ -33,8 +33,8 @@ namespace Moses uint32_t ofwd[Moses::LRModel::NONE+1]; // distribution of fwd phrase orientations uint32_t obwd[Moses::LRModel::NONE+1]; // distribution of bwd phrase orientations - std::vector indoc; // distribution over where samples came from - + // std::vector indoc; // distribution over where samples came from + std::map indoc; typedef std::map trg_map_t; trg_map_t trg; pstats(); diff --git a/moses/TranslationModel/UG/mm/ug_phrasepair.h b/moses/TranslationModel/UG/mm/ug_phrasepair.h index 7e565c2df..7f03d89df 100644 --- a/moses/TranslationModel/UG/mm/ug_phrasepair.h +++ b/moses/TranslationModel/UG/mm/ug_phrasepair.h @@ -30,7 +30,8 @@ namespace Moses std::vector aln; float score; bool inverse; - std::vector indoc; + // std::vector indoc; + std::map indoc; PhrasePair() { }; PhrasePair(PhrasePair const& o); @@ -306,10 +307,12 @@ namespace Moses out << toString (V1, this->start1, this->len1) << " ::: " << toString (V2, 
this->start2, this->len2) << " " << this->joint << " ["; - for (size_t i = 0; i < this->indoc.size(); ++i) + // for (size_t i = 0; i < this->indoc.size(); ++i) + for (std::map::const_iterator m = indoc.begin(); + m != indoc.end(); ++m) { - if (i) out << " "; - out << this->indoc[i]; + if (m != indoc.begin()) out << " "; + out << m->first << ":" << m->second; } out << "] ["; vector lrscores; From 41a11dfe8ac9e7d01e950607afdd13492113e9d5 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Thu, 25 Jun 2015 18:20:03 +0100 Subject: [PATCH 097/108] Allow ports other than 80 as the server ports for the context bias server. --- .../TranslationModel/UG/mm/ug_http_client.cc | 11 ++- .../UG/mm/ug_sampling_bias.cc | 77 ++++++++++++++++--- .../TranslationModel/UG/mm/ug_sampling_bias.h | 1 + 3 files changed, 78 insertions(+), 11 deletions(-) diff --git a/moses/TranslationModel/UG/mm/ug_http_client.cc b/moses/TranslationModel/UG/mm/ug_http_client.cc index 1bbb93b23..da8537910 100644 --- a/moses/TranslationModel/UG/mm/ug_http_client.cc +++ b/moses/TranslationModel/UG/mm/ug_http_client.cc @@ -1,4 +1,5 @@ #include "ug_http_client.h" +#include "moses/Util.h" namespace Moses { using boost::asio::ip::tcp; @@ -31,10 +32,16 @@ http_client(boost::asio::io_service& io_service, std::string url) p = std::min(url.find_first_of(":/"), url.size()); q = std::min(url.find("/"), url.size()); if (p < url.size() && url[p] == ':') - port = url.substr(p,q-p); + port = url.substr(p+1,q-p-1); server = url.substr(0,p); if (q < url.size()) path = url.substr(q); +#if 0 + std::cerr << HERE << std::endl; + std::cerr << "SERVER " << server << std::endl; + std::cerr << "PORT |" << port << "|" << std::endl; + std::cerr << "PATH " << path << std::endl; +#endif init(server, port, path); } @@ -55,7 +62,7 @@ init(std::string const& server, std::string const& port, std::string const& path // Start an asynchronous resolve to translate the server and service names // into a list of endpoints. 
- tcp::resolver::query query(server, port); + tcp::resolver::query query(server, port.c_str()); resolver_.async_resolve(query, boost::bind(&http_client::handle_resolve, this, boost::asio::placeholders::error, diff --git a/moses/TranslationModel/UG/mm/ug_sampling_bias.cc b/moses/TranslationModel/UG/mm/ug_sampling_bias.cc index da408dfb3..d54305997 100644 --- a/moses/TranslationModel/UG/mm/ug_sampling_bias.cc +++ b/moses/TranslationModel/UG/mm/ug_sampling_bias.cc @@ -2,7 +2,7 @@ #include #include #include "moses/Timer.h" - +// #include // #ifdef HAVE_CURLPP // #include // #include @@ -19,19 +19,77 @@ namespace Moses { using ugdiss::id_type; - // #ifdef WITH_MMT_BIAS_CLIENT - std::string - query_bias_server(std::string const& url, std::string const& text) + size_t ca_write_callback(void *ptr, size_t size, size_t nmemb, + std::string* response) { - std::string query = url+uri_encode(text); + char const* c = reinterpret_cast(ptr); + *response += std::string(c, size * nmemb); + return size * nmemb; + } + + std::string + query_bias_server(std::string const& server, std::string const& context) + { +#if 0 + std::string query = server + uri_encode(context); + std::string response; + + CURL* curl = curl_easy_init(); + UTIL_THROW_IF2(!curl, "Could not init curl."); + curl_easy_setopt(curl, CURLOPT_URL, query.c_str()); + curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, ca_write_callback); + curl_easy_setopt(curl, CURLOPT_WRITEDATA, &response); + CURLcode res = curl_easy_perform(curl); + curl_easy_cleanup(curl); + return response; +#else + std::string query = server+uri_encode(context); boost::asio::io_service io_service; Moses::http_client c(io_service, query); io_service.run(); - return c.content(); - } - // #endif - DocumentBias + std::string response = c.content(); + std::cerr << "SERVER RESPONSE: " << response << std::endl; + + return c.content(); +#endif + } + +// // #ifdef WITH_MMT_BIAS_CLIENT +// std::string +// query_bias_server(std::string const& url, std::string const& text) +// { +// #if 1 +// std::string query = url+uri_encode(text); +// boost::asio::io_service io_service; +// Moses::http_client c(io_service, query); +// io_service.run(); + +// std::string response = c.content(); +// std::cerr << "SERVER RESPONSE: " << response << std::endl; + +// return c.content(); +// #else +// return ""; +// #endif +// } +// // #endif + + + // std::string + // query_bias_server(std::string const& url, int const port, + // std::string const& context, + // std::string const& src_lang) + // { + // char* response + // = ca_get_context(url.c_str(), port, context.c_str(), src_lang.c_str()); + // UTIL_THROW_IF2(!response, "No response from server"); + // std::string json = response; + // free(response); + // return json; + // } + + DocumentBias ::DocumentBias ( std::vector const& sid2doc, std::map const& docname2docid, @@ -44,6 +102,7 @@ namespace Moses Timer timer; if (log) timer.start(NULL); std::string json = query_bias_server(server_url, text); + std::cerr << "SERVER RESPONSE " << json << std::endl; init_from_json(json, docname2docid, log); if (log) *log << "Bias query took " << timer << " seconds." 
<< std::endl; // #endif diff --git a/moses/TranslationModel/UG/mm/ug_sampling_bias.h b/moses/TranslationModel/UG/mm/ug_sampling_bias.h index f540ddc76..24d39689e 100644 --- a/moses/TranslationModel/UG/mm/ug_sampling_bias.h +++ b/moses/TranslationModel/UG/mm/ug_sampling_bias.h @@ -37,6 +37,7 @@ namespace Moses { std::vector const& m_sid2docid; std::vector m_bias; + // std::map m_bias; public: From faf7b51fb7ad8e382c751c832de74fda745a2f57 Mon Sep 17 00:00:00 2001 From: MosesAdmin Date: Fri, 26 Jun 2015 00:01:00 +0100 Subject: [PATCH 098/108] daily automatic beautifier --- misc/pruneGeneration.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/misc/pruneGeneration.cpp b/misc/pruneGeneration.cpp index e436263e9..d58c10ebd 100644 --- a/misc/pruneGeneration.cpp +++ b/misc/pruneGeneration.cpp @@ -31,7 +31,7 @@ int main(int argc, char **argv) // loop thru each file in directory fs::directory_iterator end_iter; - for( fs::directory_iterator dir_iter(dir) ; dir_iter != end_iter ; ++dir_iter) { + for( fs::directory_iterator dir_iter(dir) ; dir_iter != end_iter ; ++dir_iter) { if (fs::is_regular_file(dir_iter->status())) { fs::path currPath = *dir_iter; string currPathStr = currPath.native(); @@ -46,15 +46,15 @@ int main(int argc, char **argv) string suffix = currFileStr.substr(fileStemStrSize, currFileStr.size() - fileStemStrSize); string outPath = outPathStem + suffix; cerr << "PRUNING " << currPathStr << " TO " << outPath << endl; - + Moses::InputFileStream inStrme(currPathStr); Moses::OutputFileStream outStrme(outPath); Process(limit, inStrme, outStrme); - + } } } - + cerr << "Finished" << endl; } From ca5485264196fbc79e4f478e1937e95c170645e8 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Fri, 26 Jun 2015 11:37:35 +0400 Subject: [PATCH 099/108] tighten up extract-parallel on osx. Can now use gsplit and bsd split --- scripts/generic/extract-parallel.perl | 38 ++++++++++++++++++++++----- 1 file changed, 31 insertions(+), 7 deletions(-) diff --git a/scripts/generic/extract-parallel.perl b/scripts/generic/extract-parallel.perl index 3240f24eb..226dbeb6e 100755 --- a/scripts/generic/extract-parallel.perl +++ b/scripts/generic/extract-parallel.perl @@ -1,4 +1,4 @@ -#!/usr/bin/env perl +#!/usr/bin/env perl # # This file is part of moses. Its use is licensed under the GNU Lesser General # Public License version 2.1 or, at your option, any later version. @@ -15,8 +15,7 @@ sub systemCheck($); sub NumStr($); sub DigitStr($); sub CharStr($); - -my $is_osx = ($^O eq "darwin"); +sub GetSplitVersion($); my $alph = "abcdefghijklmnopqrstuvwxyz"; my @alph = (split(//,$alph)); @@ -42,7 +41,7 @@ my $baselineExtract; my $glueFile; my $phraseOrientation = 0; my $phraseOrientationPriorsFile; -my $splitCmdOption="-d"; +my $splitCmdOption = ""; my $GZIP_EXEC; if(`which pigz`) { @@ -53,6 +52,15 @@ else { } print STDERR "using $GZIP_EXEC \n"; +my $isBSDSplit = GetSplitVersion($splitCmd); +print STDERR "isBSDSplit=$isBSDSplit \n"; + +if ($isBSDSplit == 0) { + $splitCmdOption .= "-d"; +} + +my $gzOut = 0; + for (my $i = 8; $i < $#ARGV + 1; ++$i) { $makeTTable = 0 if $ARGV[$i] eq "--NoTTable"; @@ -73,11 +81,15 @@ for (my $i = 8; $i < $#ARGV + 1; ++$i) $phraseOrientationPriorsFile = $ARGV[++$i]; next; } - $splitCmdOption="",next if $ARGV[$i] eq "--NoNumericSuffix"; + if ($ARGV[$i] eq '--GZOutput') { + $gzOut = 1; + } $otherExtractArgs .= $ARGV[$i] ." 
"; } +die("Need to specify --GZOutput for parallel extract") if ($gzOut == 0); + my $cmd; my $TMPDIR=dirname($extract) ."/tmp.$$"; $cmd = "mkdir -p $TMPDIR; ls -l $TMPDIR"; @@ -272,7 +284,7 @@ if ($phraseOrientation && defined($phraseOrientationPriorsFile)) { # delete temporary files $cmd = "rm -rf $TMPDIR \n"; -`$cmd`; +systemCheck($cmd); print STDERR "Finished ".localtime() ."\n"; @@ -352,10 +364,22 @@ sub CharStr($) sub NumStr($) { my $i = shift; - if ($is_osx){ + if ($isBSDSplit){ return CharStr($i); }else{ return DigitStr($i); } } +sub GetSplitVersion($) +{ + my $splitCmd = shift; + my $retVal = system("$splitCmd -h"); + if ($retVal != 0) { + return 1; + } + else { + return 0; + } +} + From 57e213ed190a15ebfbc193e9eeb525813e92cc1a Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Fri, 26 Jun 2015 12:18:21 +0400 Subject: [PATCH 100/108] tighten up extract-parallel on osx. Can now use gsplit and bsd split --- scripts/generic/extract-parallel.perl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/generic/extract-parallel.perl b/scripts/generic/extract-parallel.perl index 226dbeb6e..2424c1bd2 100755 --- a/scripts/generic/extract-parallel.perl +++ b/scripts/generic/extract-parallel.perl @@ -374,7 +374,7 @@ sub NumStr($) sub GetSplitVersion($) { my $splitCmd = shift; - my $retVal = system("$splitCmd -h"); + my $retVal = system("$splitCmd --help"); if ($retVal != 0) { return 1; } From 82edbb98a7aa9186287f8f00dfcbbeb2906e7a5a Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Sun, 28 Jun 2015 10:40:43 +0400 Subject: [PATCH 101/108] comments in ini file about default weights --- scripts/training/train-model.perl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/training/train-model.perl b/scripts/training/train-model.perl index b693d774d..4d73ef4ee 100755 --- a/scripts/training/train-model.perl +++ b/scripts/training/train-model.perl @@ -2358,6 +2358,8 @@ sub create_ini { print INI "\n# dense weights for feature functions\n"; print INI "[weight]\n"; + print INI "# The default weights are NOT optimized for translation quality. You MUST tune the weights.\n"; + print INI "# Documentation for tuning is here: http://www.statmt.org/moses/?n=FactoredTraining.Tuning \n"; print INI "UnknownWordPenalty0= 1\n"; print INI "WordPenalty0= -1\n"; print INI "PhrasePenalty0= 0.2\n"; From f66beabf4f0dca33a6bbcc37072811e9017e19b5 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Sun, 28 Jun 2015 14:03:54 +0400 Subject: [PATCH 102/108] Generation error in EMS due to pruning. Lets see if this works. 
--- scripts/ems/experiment.meta | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta index 110ab39b7..ee6b188e8 100644 --- a/scripts/ems/experiment.meta +++ b/scripts/ems/experiment.meta @@ -743,7 +743,7 @@ generation-prune in: generation-table out: generation-table-pruned rerun-on-change: TRAINING:prune-generation - pass-unless: AND TRAINING:prune-generation + ignore-unless: AND TRAINING:prune-generation default-name: model/generation-table-pruned final-model: yes template: $TRAINING:prune-generation IN OUT From f7c3d00824e1664ba0cbfbc80ff94a82f3eb7561 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Sun, 28 Jun 2015 22:20:42 +0400 Subject: [PATCH 103/108] more testing of c++11 waters --- phrase-extract/ScoreFeatureTest.cpp | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/phrase-extract/ScoreFeatureTest.cpp b/phrase-extract/ScoreFeatureTest.cpp index cc22f8630..0ed2f71e6 100644 --- a/phrase-extract/ScoreFeatureTest.cpp +++ b/phrase-extract/ScoreFeatureTest.cpp @@ -26,6 +26,7 @@ #include #include +#include using namespace MosesTraining; using namespace std; @@ -81,6 +82,16 @@ static void checkDomainConfigured( BOOST_CHECK(manager.includeSentenceId()); } +template +T adder(T v) { + return v; +} + +template +T adder(T first, Args... args) { + return first + adder(args...); +} + BOOST_AUTO_TEST_CASE(manager_config_domain) { checkDomainConfigured @@ -102,8 +113,23 @@ BOOST_AUTO_TEST_CASE(manager_config_domain) s.insert(4); s.insert(1); -for (auto i: s) { + for (auto i: s) { cerr << i << " "; } + + unordered_map m; + m["a"] = 4; + m["ba"] = 6; + m["aabc"] = 7; + + for (auto i: m) { + cerr << i.first << "=" << i.second << " "; + } + + long sum = adder(1, 2, 3, 8, 7); + + std::string s1 = "x", s2 = "aa", s3 = "bb", s4 = "yy"; + std::string ssum = adder(s1, s2, s3, s4); + } From fba4a3e24da01a01088c95c8c85f71d551ba4634 Mon Sep 17 00:00:00 2001 From: MosesAdmin Date: Mon, 29 Jun 2015 00:00:54 +0100 Subject: [PATCH 104/108] daily automatic beautifier --- phrase-extract/ScoreFeatureTest.cpp | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/phrase-extract/ScoreFeatureTest.cpp b/phrase-extract/ScoreFeatureTest.cpp index 0ed2f71e6..94a5a0480 100644 --- a/phrase-extract/ScoreFeatureTest.cpp +++ b/phrase-extract/ScoreFeatureTest.cpp @@ -83,12 +83,14 @@ static void checkDomainConfigured( } template -T adder(T v) { +T adder(T v) +{ return v; } template -T adder(T first, Args... args) { +T adder(T first, Args... args) +{ return first + adder(args...); } @@ -113,7 +115,7 @@ BOOST_AUTO_TEST_CASE(manager_config_domain) s.insert(4); s.insert(1); - for (auto i: s) { +for (auto i: s) { cerr << i << " "; } @@ -121,15 +123,15 @@ BOOST_AUTO_TEST_CASE(manager_config_domain) m["a"] = 4; m["ba"] = 6; m["aabc"] = 7; - - for (auto i: m) { + +for (auto i: m) { cerr << i.first << "=" << i.second << " "; } - + long sum = adder(1, 2, 3, 8, 7); - std::string s1 = "x", s2 = "aa", s3 = "bb", s4 = "yy"; - std::string ssum = adder(s1, s2, s3, s4); + std::string s1 = "x", s2 = "aa", s3 = "bb", s4 = "yy"; + std::string ssum = adder(s1, s2, s3, s4); } From 5e81e4b9c37bcfe4f7828ca16bb03c28cbf4f491 Mon Sep 17 00:00:00 2001 From: Jeroen Vermeulen Date: Mon, 29 Jun 2015 12:23:53 +0700 Subject: [PATCH 105/108] Simplify unnecessarily complicated condition. 
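Note before the diff: the simplification below is behavior-preserving. The old code returned ret < 0 whenever RecombineCompare gave a non-zero result and false otherwise, and 0 < 0 is itself false, so both branches collapse into a single comparison. A tiny standalone check of that equivalence (illustrative functions, not Moses code):

#include <cassert>

// originalForm mirrors the old branching, simplifiedForm the new one-liner.
bool originalForm(int ret) {
  if (ret != 0)
    return ret < 0;
  return false;
}

bool simplifiedForm(int ret) { return ret < 0; }

int main() {
  for (int ret = -3; ret <= 3; ++ret)
    assert(originalForm(ret) == simplifiedForm(ret));  // identical for every sign of ret
  return 0;
}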
--- moses/ChartHypothesisCollection.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/moses/ChartHypothesisCollection.h b/moses/ChartHypothesisCollection.h index 37cd907d9..b2464e151 100644 --- a/moses/ChartHypothesisCollection.h +++ b/moses/ChartHypothesisCollection.h @@ -52,11 +52,7 @@ public: // shouldn't be mixing hypos with different lhs assert(hypoA->GetTargetLHS() == hypoB->GetTargetLHS()); - int ret = hypoA->RecombineCompare(*hypoB); - if (ret != 0) - return (ret < 0); - - return false; + return (hypoA->RecombineCompare(*hypoB) < 0); } }; From a374706bd4a995aa810b748f122b2d6279866088 Mon Sep 17 00:00:00 2001 From: MosesAdmin Date: Wed, 1 Jul 2015 00:00:59 +0100 Subject: [PATCH 106/108] daily automatic beautifier --- moses/server/TranslationRequest.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/moses/server/TranslationRequest.cpp b/moses/server/TranslationRequest.cpp index 3848f81ba..bc2b5032b 100644 --- a/moses/server/TranslationRequest.cpp +++ b/moses/server/TranslationRequest.cpp @@ -1,5 +1,5 @@ #include "TranslationRequest.h" -#include "moses/ContextScope.h" +#include "moses/ContextScope.h" #include namespace MosesServer From 81f337bcd838a69bf0e275c8138b173427a17d02 Mon Sep 17 00:00:00 2001 From: hieu Date: Wed, 1 Jul 2015 09:42:07 +0400 Subject: [PATCH 107/108] rollback c++11 for now --- Jamroot | 2 +- moses/StaticData.cpp | 3 ++- phrase-extract/ScoreFeatureTest.cpp | 33 ++++++++++++++++------------- 3 files changed, 21 insertions(+), 17 deletions(-) diff --git a/Jamroot b/Jamroot index 4f76ec3ba..b3544274b 100644 --- a/Jamroot +++ b/Jamroot @@ -108,7 +108,7 @@ external-lib z ; #lib dl : : static:static shared:shared ; #requirements += dl ; -requirements += -std=c++0x ; +#requirements += -std=c++0x ; if ! 
[ option.get "without-tcmalloc" : : "yes" ] && [ test_library "tcmalloc_minimal" ] { if [ option.get "full-tcmalloc" : : "yes" ] { diff --git a/moses/StaticData.cpp b/moses/StaticData.cpp index 28d9f7831..281129a2e 100644 --- a/moses/StaticData.cpp +++ b/moses/StaticData.cpp @@ -1115,7 +1115,8 @@ void StaticData::LoadSparseWeightsFromConfig() } std::map > weights = m_parameter->GetAllWeights(); - for (auto iter = weights.begin(); iter != weights.end(); ++iter) { + std::map >::iterator iter; + for (iter = weights.begin(); iter != weights.end(); ++iter) { // this indicates that it is sparse feature if (featureNames.find(iter->first) == featureNames.end()) { UTIL_THROW_IF2(iter->second.size() != 1, "ERROR: only one weight per sparse feature allowed: " << iter->first); diff --git a/phrase-extract/ScoreFeatureTest.cpp b/phrase-extract/ScoreFeatureTest.cpp index 94a5a0480..9537b970f 100644 --- a/phrase-extract/ScoreFeatureTest.cpp +++ b/phrase-extract/ScoreFeatureTest.cpp @@ -24,9 +24,10 @@ #define BOOST_TEST_MODULE MosesTrainingScoreFeature #include #include +#include -#include -#include +//#include +//#include using namespace MosesTraining; using namespace std; @@ -54,16 +55,16 @@ BOOST_AUTO_TEST_CASE(manager_configure_domain_except) //Check that configure rejects illegal domain arg combinations ScoreFeatureManager manager; BOOST_CHECK_THROW( - manager.configure( {"--DomainRatio","/dev/null","--DomainIndicator","/dev/null"}), + manager.configure(boost::assign::list_of("--DomainRatio")("/dev/null")("--DomainIndicator")("/dev/null")), ScoreFeatureArgumentException); BOOST_CHECK_THROW( - manager.configure( {"--SparseDomainSubset","/dev/null","--SparseDomainRatio","/dev/null"}), + manager.configure(boost::assign::list_of("--SparseDomainSubset")("/dev/null")("--SparseDomainRatio")("/dev/null")), ScoreFeatureArgumentException); BOOST_CHECK_THROW( - manager.configure( {"--SparseDomainBlah","/dev/null"}), + manager.configure(boost::assign::list_of("--SparseDomainBlah")("/dev/null")), ScoreFeatureArgumentException); BOOST_CHECK_THROW( - manager.configure( {"--DomainSubset"}), + manager.configure(boost::assign::list_of("--DomainSubset")), ScoreFeatureArgumentException); } @@ -97,25 +98,27 @@ T adder(T first, Args... 
args) BOOST_AUTO_TEST_CASE(manager_config_domain) { checkDomainConfigured - ( {"--DomainRatio","/dev/null"}); + (boost::assign::list_of("--DomainRatio")("/dev/null")); checkDomainConfigured - ( {"--DomainIndicator","/dev/null"}); + (boost::assign::list_of("--DomainIndicator")("/dev/null")); checkDomainConfigured - ( {"--DomainSubset","/dev/null"}); + (boost::assign::list_of("--DomainSubset")("/dev/null")); checkDomainConfigured - ( {"--SparseDomainRatio","/dev/null"}); + (boost::assign::list_of("--SparseDomainRatio")("/dev/null")); checkDomainConfigured - ( {"--SparseDomainIndicator","/dev/null"}); + (boost::assign::list_of("--SparseDomainIndicator")("/dev/null")); checkDomainConfigured - ( {"--SparseDomainSubset","/dev/null"}); + (boost::assign::list_of("--SparseDomainSubset")("/dev/null")); + /* + // C++11 testing unordered_set s; s.insert(4); s.insert(7); s.insert(4); s.insert(1); -for (auto i: s) { + for (auto i: s) { cerr << i << " "; } @@ -124,7 +127,7 @@ for (auto i: s) { m["ba"] = 6; m["aabc"] = 7; -for (auto i: m) { + for (auto i: m) { cerr << i.first << "=" << i.second << " "; } @@ -132,6 +135,6 @@ for (auto i: m) { std::string s1 = "x", s2 = "aa", s3 = "bb", s4 = "yy"; std::string ssum = adder(s1, s2, s3, s4); - + */ } From 86292f2ce332013c187afd8046a9eeec2770561e Mon Sep 17 00:00:00 2001 From: MosesAdmin Date: Thu, 2 Jul 2015 00:01:16 +0100 Subject: [PATCH 108/108] daily automatic beautifier --- moses/StaticData.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/moses/StaticData.cpp b/moses/StaticData.cpp index 281129a2e..8fb88c257 100644 --- a/moses/StaticData.cpp +++ b/moses/StaticData.cpp @@ -1115,7 +1115,7 @@ void StaticData::LoadSparseWeightsFromConfig() } std::map > weights = m_parameter->GetAllWeights(); - std::map >::iterator iter; + std::map >::iterator iter; for (iter = weights.begin(); iter != weights.end(); ++iter) { // this indicates that it is sparse feature if (featureNames.find(iter->first) == featureNames.end()) {
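Closing note: the rollback in the last patches swaps the C++11 idioms tried out in patch 103 for their C++03-compatible counterparts, boost::assign::list_of in place of brace-initializer lists and spelled-out iterator types in place of auto. A small standalone sketch of those two substitutions (variable names are illustrative, not Moses code):

#include <boost/assign/list_of.hpp>
#include <map>
#include <string>
#include <vector>

int main() {
  // C++03-compatible list construction; the C++11 form would be
  //   std::vector<std::string> args = {"--DomainRatio", "/dev/null"};
  std::vector<std::string> args =
      boost::assign::list_of<std::string>("--DomainRatio")("/dev/null");

  // Spelled-out iterator type instead of `auto`, as in the StaticData.cpp hunk above.
  std::map<std::string, std::vector<float> > weights;
  weights["sparse-feature"].push_back(0.5f);
  size_t total = 0;
  std::map<std::string, std::vector<float> >::iterator iter;
  for (iter = weights.begin(); iter != weights.end(); ++iter)
    total += iter->second.size();  // count weights across all sparse features

  return (args.size() == 2 && total == 1) ? 0 : 1;
}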