Merge branch 'master' of http://github.com/moses-smt/mosesdecoder into ranked-sampling

Conflicts:
	moses/TargetPhrase.cpp
	moses/TargetPhrase.h
Author: Ulrich Germann
Date: 2015-07-28 14:29:49 +01:00
Commit: d67723fd29
98 changed files with 1742 additions and 1039 deletions


@ -179,7 +179,7 @@ if [ option.get "with-icu" : : "yes" ]
requirements += <library>icui18n/<link>shared ;
requirements += <cxxflags>-fPIC ;
requirements += <address-model>64 ;
requirements += <runtime-link>shared ;
# requirements += <runtime-link>shared ;
}
if [ option.get "with-probing-pt" : : "yes" ]


@ -21,6 +21,11 @@ SuffixArray::SuffixArray()
m_wordInSentence(NULL),
m_sentence(NULL),
m_sentenceLength(NULL),
m_document(NULL),
m_documentName(NULL),
m_documentNameLength(0),
m_documentCount(0),
m_useDocument(false),
m_vcb(),
m_size(0),
m_sentenceCount(0) { }
@ -32,6 +37,8 @@ SuffixArray::~SuffixArray()
free(m_wordInSentence);
free(m_sentence);
free(m_sentenceLength);
free(m_document);
free(m_documentName);
}
void SuffixArray::Create(const string& fileName )
@ -46,22 +53,32 @@ void SuffixArray::Create(const string& fileName )
textFile.open(fileName.c_str());
if (!textFile) {
cerr << "no such file or directory " << fileName << endl;
cerr << "Error: no such file or directory " << fileName << endl;
exit(1);
}
// first pass through data: get size
istream *fileP = &textFile;
m_size = 0;
m_sentenceCount = 0;
m_documentCount = 0;
while(!fileP->eof()) {
SAFE_GETLINE((*fileP), line, LINE_MAX_LENGTH, '\n');
if (fileP->eof()) break;
if (m_useDocument && ProcessDocumentLine(line,0)) continue;
vector< WORD_ID > words = m_vcb.Tokenize( line );
m_size += words.size() + 1;
m_sentenceCount++;
}
textFile.close();
cerr << m_size << " words (incl. sentence boundaries)" << endl;
if (m_useDocument) {
cerr << m_documentCount << " documents" << endl;
if (m_documentCount == 0) {
cerr << "Error: no documents found, aborting." << endl;
exit(1);
}
}
// allocate memory
m_array = (WORD_ID*) calloc( sizeof( WORD_ID ), m_size );
@ -69,21 +86,31 @@ void SuffixArray::Create(const string& fileName )
m_wordInSentence = (char*) calloc( sizeof( char ), m_size );
m_sentence = (INDEX*) calloc( sizeof( INDEX ), m_size );
m_sentenceLength = (char*) calloc( sizeof( char ), m_sentenceCount );
// fill the array
int wordIndex = 0;
int sentenceId = 0;
textFile.open(fileName.c_str());
if (!textFile) {
cerr << "no such file or directory " << fileName << endl;
exit(1);
CheckAllocation(m_array != NULL, "m_array");
CheckAllocation(m_index != NULL, "m_index");
CheckAllocation(m_wordInSentence != NULL, "m_wordInSentence");
CheckAllocation(m_sentence != NULL, "m_sentence");
CheckAllocation(m_sentenceLength != NULL, "m_sentenceLength");
if (m_useDocument) {
m_document = (INDEX*) calloc( sizeof( INDEX ), m_documentCount );
m_documentName = (INDEX*) calloc( sizeof( char ), m_documentCount );
m_documentNameBuffer = (char*) calloc( sizeof( char ), m_documentNameLength );
CheckAllocation(m_document != NULL, "m_document");
CheckAllocation(m_documentName != NULL, "m_documentName");
CheckAllocation(m_documentNameBuffer != NULL, "m_documentNameBuffer");
}
// second pass through data: fill the arrays
int wordIndex = 0;
int sentenceId = 0;
m_documentNameLength = 0; // re-use as counter
m_documentCount = 0; // re-use as counter
textFile.open(fileName.c_str());
fileP = &textFile;
while(!fileP->eof()) {
SAFE_GETLINE((*fileP), line, LINE_MAX_LENGTH, '\n');
if (fileP->eof()) break;
if (m_useDocument && ProcessDocumentLine(line,sentenceId)) continue;
vector< WORD_ID > words = m_vcb.Tokenize( line );
vector< WORD_ID >::const_iterator i;
@ -105,7 +132,7 @@ void SuffixArray::Create(const string& fileName )
m_buffer = (INDEX*) calloc( sizeof( INDEX ), m_size );
if (m_buffer == NULL) {
cerr << "cannot allocate memory to m_buffer" << endl;
cerr << "Error: cannot allocate memory to m_buffer" << endl;
exit(1);
}
@ -114,6 +141,45 @@ void SuffixArray::Create(const string& fileName )
cerr << "done sorting" << endl;
}
// very specific code to deal with common crawl document ids
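// A document header line is assumed to look like (hypothetical example):
//   "d41d8cd98f00b204e9800998ecf8427e 0.85 http://example.com/page"
// i.e. a 32-character hex hash, a float score, and the document URL (= name),
// which is exactly what the three parsing stages below check for.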
bool SuffixArray::ProcessDocumentLine( const char *line, const size_t sentenceId )
{
size_t i;
// first 32 characters are hex-hash
for(i=0; i<32; i++) {
if ((line[i] < '0' || line[i] > '9') && (line[i] < 'a' || line[i] > 'f')) {
return false;
}
}
if (line[i++] != ' ') return false;
// second token is float
for (; line[i] != ' ' && line[i] != 0; i++) {
if (line[i] != '.' && (line[i] < '0' || line[i] > '9')) {
return false;
}
}
i++;
// last token is url (=name)
size_t startName = i;
for (; line[i] != ' ' && line[i] != 0; i++) {}
if (line[i] == ' ') return false;
size_t endName = i+1; // include '\0'
// second pass: record name and sentence number
if (m_document != NULL) {
m_documentName[m_documentCount] = m_documentNameLength;
for(size_t i=startName; i<endName; i++) {
m_documentNameBuffer[m_documentNameLength + i-startName] = line[i];
}
m_document[m_documentCount] = sentenceId;
}
m_documentNameLength += endName-startName;
m_documentCount++;
return true;
}
// good ol' quick sort
void SuffixArray::Sort(INDEX start, INDEX end)
{
@ -162,7 +228,6 @@ int SuffixArray::CompareIndex( INDEX a, INDEX b ) const
inline int SuffixArray::CompareWord( WORD_ID a, WORD_ID b ) const
{
// cerr << "c(" << m_vcb.GetWord(a) << ":" << m_vcb.GetWord(b) << ")=" << m_vcb.GetWord(a).compare( m_vcb.GetWord(b) ) << endl;
return m_vcb.GetWord(a).compare( m_vcb.GetWord(b) );
}
@ -272,13 +337,73 @@ void SuffixArray::List(INDEX start, INDEX end)
}
}
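// Prints one header line "QUERY\t<phrase>\t<n> matches", then one line per
// match: the matching sentence, preceded by its document name and a tab
// when a document index is in use.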
void SuffixArray::PrintSentenceMatches( const std::vector< WORD > &phrase )
{
cout << "QUERY\t";
for(size_t i=0; i<phrase.size(); i++) {
if (i>0) cout << " ";
cout << phrase[i];
}
cout << '\t';
INDEX start = 0;
INDEX end = m_size-1;
INDEX mid = FindFirst( phrase, start, end );
if (mid == m_size) { // no matches
cout << "0 matches" << endl;
return;
}
INDEX firstMatch = FindLast( phrase, mid, start, -1 );
INDEX lastMatch = FindLast( phrase, mid, end, 1 );
// loop through all matches
cout << (lastMatch-firstMatch+1) << " matches" << endl;
for(INDEX i=firstMatch; i<=lastMatch; i++) {
// get sentence information
INDEX pos = GetPosition( i );
INDEX start = pos - GetWordInSentence( pos );
char length = GetSentenceLength( GetSentence( pos ) );
// print document name
if (m_useDocument) {
INDEX sentence = GetSentence( pos );
INDEX document = GetDocument( sentence );
PrintDocumentName( document );
cout << '\t';
}
// print sentence
for(char i=0; i<length; i++) {
if (i>0) cout << " ";
cout << GetWord( start + i );
}
cout << endl;
}
}
SuffixArray::INDEX SuffixArray::GetDocument( INDEX sentence ) const
{
// binary search
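// m_document[d] holds the id of the first sentence of document d (recorded
// in ProcessDocumentLine), so the enclosing document is the last entry
// whose start is <= the given sentence id.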
INDEX min = 0;
INDEX max = m_documentCount-1;
if (sentence >= m_document[max]) {
return max;
}
while(true) {
INDEX mid = (min + max) / 2;
if (sentence >= m_document[mid] && sentence < m_document[mid+1]) {
return mid;
}
if (sentence < m_document[mid]) {
max = mid-1;
} else {
min = mid+1;
}
}
}
void SuffixArray::Save(const string& fileName ) const
{
FILE *pFile = fopen ( fileName.c_str() , "w" );
if (pFile == NULL) {
cerr << "Cannot open " << fileName << endl;
exit(1);
}
if (pFile == NULL) Error("cannot open",fileName);
fwrite( &m_size, sizeof(INDEX), 1, pFile );
fwrite( m_array, sizeof(WORD_ID), m_size, pFile ); // corpus
@ -288,6 +413,16 @@ void SuffixArray::Save(const string& fileName ) const
fwrite( &m_sentenceCount, sizeof(INDEX), 1, pFile );
fwrite( m_sentenceLength, sizeof(char), m_sentenceCount, pFile); // sentence length
char useDocument = m_useDocument; // not sure if that is needed
fwrite( &useDocument, sizeof(char), 1, pFile );
if (m_useDocument) {
fwrite( &m_documentCount, sizeof(INDEX), 1, pFile );
fwrite( m_document, sizeof(INDEX), m_documentCount, pFile );
fwrite( m_documentName, sizeof(INDEX), m_documentCount, pFile );
fwrite( &m_documentNameLength, sizeof(INDEX), 1, pFile );
fwrite( m_documentNameBuffer, sizeof(char), m_documentNameLength, pFile );
}
fclose( pFile );
m_vcb.Save( fileName + ".src-vcb" );
@ -296,56 +431,81 @@ void SuffixArray::Save(const string& fileName ) const
void SuffixArray::Load(const string& fileName )
{
FILE *pFile = fopen ( fileName.c_str() , "r" );
if (pFile == NULL) {
cerr << "no such file or directory " << fileName << endl;
exit(1);
}
if (pFile == NULL) Error("no such file or directory", fileName);
cerr << "loading from " << fileName << endl;
fread( &m_size, sizeof(INDEX), 1, pFile );
fread( &m_size, sizeof(INDEX), 1, pFile )
|| Error("could not read m_size from", fileName);
cerr << "words in corpus: " << m_size << endl;
m_array = (WORD_ID*) calloc( sizeof( WORD_ID ), m_size );
m_index = (INDEX*) calloc( sizeof( INDEX ), m_size );
m_wordInSentence = (char*) calloc( sizeof( char ), m_size );
m_sentence = (INDEX*) calloc( sizeof( INDEX ), m_size );
CheckAllocation(m_array != NULL, "m_array");
CheckAllocation(m_index != NULL, "m_index");
CheckAllocation(m_wordInSentence != NULL, "m_wordInSentence");
CheckAllocation(m_sentence != NULL, "m_sentence");
fread( m_array, sizeof(WORD_ID), m_size, pFile ) // corpus
|| Error("could not read m_array from", fileName);
fread( m_index, sizeof(INDEX), m_size, pFile ) // suffix array
|| Error("could not read m_index from", fileName);
fread( m_wordInSentence, sizeof(char), m_size, pFile) // word index
|| Error("could not read m_wordInSentence from", fileName);
fread( m_sentence, sizeof(INDEX), m_size, pFile ) // sentence index
|| Error("could not read m_sentence from", fileName);
if (m_array == NULL) {
cerr << "Error: cannot allocate memory to m_array" << endl;
exit(1);
}
if (m_index == NULL) {
cerr << "Error: cannot allocate memory to m_index" << endl;
exit(1);
}
if (m_wordInSentence == NULL) {
cerr << "Error: cannot allocate memory to m_wordInSentence" << endl;
exit(1);
}
if (m_sentence == NULL) {
cerr << "Error: cannot allocate memory to m_sentence" << endl;
exit(1);
}
fread( m_array, sizeof(WORD_ID), m_size, pFile ); // corpus
fread( m_index, sizeof(INDEX), m_size, pFile ); // suffix array
fread( m_wordInSentence, sizeof(char), m_size, pFile); // word index
fread( m_sentence, sizeof(INDEX), m_size, pFile); // sentence index
fread( &m_sentenceCount, sizeof(INDEX), 1, pFile );
fread( &m_sentenceCount, sizeof(INDEX), 1, pFile )
|| Error("could not read m_sentenceCount from", fileName);
cerr << "sentences in corpus: " << m_sentenceCount << endl;
m_sentenceLength = (char*) calloc( sizeof( char ), m_sentenceCount );
if (m_sentenceLength == NULL) {
cerr << "Error: cannot allocate memory to m_sentenceLength" << endl;
exit(1);
m_sentenceLength = (char*) calloc( sizeof( char ), m_sentenceCount );
CheckAllocation(m_sentenceLength != NULL, "m_sentenceLength");
fread( m_sentenceLength, sizeof(char), m_sentenceCount, pFile) // sentence length
|| Error("could not read m_sentenceLength from", fileName);
if (m_useDocument) { // do not read it when you do not need it
char useDocument;
fread( &useDocument, sizeof(char), 1, pFile )
|| Error("could not read m_useDocument from", fileName);
if (!useDocument) {
cerr << "Error: stored suffix array does not have a document index\n";
exit(1);
}
fread( &m_documentCount, sizeof(INDEX), 1, pFile )
|| Error("could not read m_documentCount from", fileName);
m_document = (INDEX*) calloc( sizeof( INDEX ), m_documentCount );
m_documentName = (INDEX*) calloc( sizeof( INDEX ), m_documentCount );
CheckAllocation(m_document != NULL, "m_document");
CheckAllocation(m_documentName != NULL, "m_documentName");
fread( m_document, sizeof(INDEX), m_documentCount, pFile )
|| Error("could not read m_document from", fileName);
fread( m_documentName, sizeof(INDEX), m_documentCount, pFile )
|| Error("could not read m_documentName from", fileName);
fread( &m_documentNameLength, sizeof(INDEX), 1, pFile )
|| Error("could not read m_documentNameLength from", fileName);
m_documentNameBuffer = (char*) calloc( sizeof( char ), m_documentNameLength );
CheckAllocation(m_documentNameBuffer != NULL, "m_documentNameBuffer");
fread( m_documentNameBuffer, sizeof(char), m_documentNameLength, pFile )
|| Error("could not read m_document from", fileName);
}
fread( m_sentenceLength, sizeof(char), m_sentenceCount, pFile); // sentence length
fclose( pFile );
m_vcb.Load( fileName + ".src-vcb" );
}
void SuffixArray::CheckAllocation( bool check, const char *dataStructure ) const
{
if (check) return;
cerr << "Error: could not allocate memory for " << dataStructure << endl;
exit(1);
}
bool SuffixArray::Error( const char *message, const string &fileName) const
{
cerr << "Error: " << message << " " << fileName << endl;
exit(1);
return true; // yeah, i know.
}


@ -15,6 +15,12 @@ private:
INDEX *m_sentence;
char *m_sentenceLength;
WORD_ID m_endOfSentence;
INDEX *m_document;
INDEX *m_documentName;
char *m_documentNameBuffer;
size_t m_documentNameLength;
size_t m_documentCount;
bool m_useDocument;
Vocabulary m_vcb;
INDEX m_size;
INDEX m_sentenceCount;
@ -28,6 +34,7 @@ public:
~SuffixArray();
void Create(const std::string& fileName );
bool ProcessDocumentLine( const char* const, const size_t );
void Sort(INDEX start, INDEX end);
int CompareIndex( INDEX a, INDEX b ) const;
inline int CompareWord( WORD_ID a, WORD_ID b ) const;
@ -40,6 +47,7 @@ public:
INDEX FindLast( const std::vector< WORD > &phrase, INDEX start, INDEX end, int direction );
int Match( const std::vector< WORD > &phrase, INDEX index );
void List( INDEX start, INDEX end );
void PrintSentenceMatches( const std::vector< WORD > &phrase );
inline INDEX GetPosition( INDEX index ) const {
return m_index[ index ];
}
@ -58,6 +66,17 @@ public:
inline WORD GetWord( INDEX position ) const {
return m_vcb.GetWord( m_array[position] );
}
void UseDocument() {
m_useDocument = true;
}
INDEX GetDocument( INDEX sentence ) const;
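// m_documentName[document] is an offset into m_documentNameBuffer; the name
// is stored there with its terminating '\0' (see ProcessDocumentLine).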
void PrintDocumentName( INDEX document ) {
for(INDEX i=m_documentName[ document ]; m_documentNameBuffer[i] != 0; i++) {
std::cout << m_documentNameBuffer[ i ];
}
}
void Save(const std::string& fileName ) const;
void Load(const std::string& fileName );
void CheckAllocation(bool, const char *dataStructure) const;
bool Error( const char* message, const std::string& fileName) const;
};


@ -1,4 +1,5 @@
#include "SuffixArray.h"
#include "../util/tokenize.hh"
#include <getopt.h>
using namespace std;
@ -13,10 +14,12 @@ int main(int argc, char* argv[])
string query;
string fileNameSuffix;
string fileNameSource;
int loadFlag = false;
int saveFlag = false;
int createFlag = false;
int queryFlag = false;
bool loadFlag = false;
bool saveFlag = false;
bool createFlag = false;
bool queryFlag = false;
bool querySentenceFlag = false;
int stdioFlag = false; // receive requests from STDIN, respond to STDOUT
string info = "usage: biconcor\n\t[--load model-file]\n\t[--save model-file]\n\t[--create corpus]\n\t[--query string]\n\t[--stdio]\n";
while(1) {
@ -25,11 +28,14 @@ int main(int argc, char* argv[])
{"save", required_argument, 0, 's'},
{"create", required_argument, 0, 'c'},
{"query", required_argument, 0, 'q'},
{"query-sentence", required_argument, 0, 'Q'},
{"document", required_argument, 0, 'd'},
{"stdio", no_argument, 0, 'i'},
{"stdio-sentence", no_argument, 0, 'I'},
{0, 0, 0, 0}
};
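// Newly added modes: --query-sentence/-Q prints every matching sentence via
// PrintSentenceMatches, --stdio-sentence/-I does the same for queries read
// from STDIN, and --document/-d enables the document index so that matches
// can be printed with their document names.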
int option_index = 0;
int c = getopt_long (argc, argv, "l:s:c:q:i", long_options, &option_index);
int c = getopt_long (argc, argv, "l:s:c:q:Q:iId", long_options, &option_index);
if (c == -1) break;
switch (c) {
case 'l':
@ -48,17 +54,25 @@ int main(int argc, char* argv[])
query = string(optarg);
queryFlag = true;
break;
case 'Q':
query = string(optarg);
querySentenceFlag = true;
break;
case 'i':
stdioFlag = true;
break;
case 'I':
stdioFlag = true;
querySentenceFlag = true;
break;
case 'd':
suffixArray.UseDocument();
break;
default:
cerr << info;
exit(1);
}
}
if (stdioFlag) {
queryFlag = true;
}
// check if parameter settings are legal
if (saveFlag && !createFlag) {
@ -74,7 +88,7 @@ int main(int argc, char* argv[])
exit(1);
}
// do your thing
// get suffix array
if (createFlag) {
cerr << "will create\n";
cerr << "corpus is in " << fileNameSource << endl;
@ -88,16 +102,26 @@ int main(int argc, char* argv[])
cerr << "will load from " << fileNameSuffix << endl;
suffixArray.Load( fileNameSuffix );
}
// do something with it
if (stdioFlag) {
while(true) {
string query;
if (getline(cin, query, '\n').eof()) {
return 0;
}
cout << lookup( query ) << endl;
if (querySentenceFlag) {
vector< string > queryString = util::tokenize( query.c_str() );
suffixArray.PrintSentenceMatches( queryString );
} else {
cout << lookup( query ) << endl;
}
}
} else if (queryFlag) {
cout << lookup( query ) << endl;
} else if (querySentenceFlag) {
vector< string > queryString = util::tokenize( query.c_str() );
suffixArray.PrintSentenceMatches( queryString );
}
return 0;
}
@ -105,32 +129,6 @@ int main(int argc, char* argv[])
size_t lookup( string query )
{
cerr << "query is " << query << endl;
vector< string > queryString = tokenize( query.c_str() );
vector< string > queryString = util::tokenize( query.c_str() );
return suffixArray.Count( queryString );
}
// Duplicate of definition in util/tokenize.hh.
// TODO: Can we de-duplicate this? At the time of writing biconcor does not
// use util at all.
vector<string> tokenize(const char input[])
{
vector< string > token;
bool betweenWords = true;
int start=0;
int i;
for(i = 0; input[i] != '\0'; i++) {
const bool isSpace = (input[i] == ' ' || input[i] == '\t');
if (!isSpace && betweenWords) {
start = i;
betweenWords = false;
} else if (isSpace && !betweenWords) {
token.push_back( string( input+start, i-start ) );
betweenWords = true;
}
}
if (!betweenWords)
token.push_back( string( input+start, i-start ) );
return token;
}


@ -28,14 +28,16 @@ TEST_DIR: /home/moses-speedtest/phrase_tables/tests
TEST_LOG_DIR: /home/moses-speedtest/phrase_tables/testlogs
BASEBRANCH: RELEASE-2.1.1
MOSES_PROFILER_REPO: /home/moses-speedtest/moses-standard/mosesdecoder-variant-prof
MOSES_GOOGLE_PROFILER_REPO: /home/moses-speedtest/moses-standard/mosesdecoder-variant-gperftools
</pre>
The _MOSES\_REPO\_PATH_ is the place where you have set up and built moses.
The _DROP\_CACHES\_COMM_ is the command that would b eused to drop caches. It should run without needing root access.
The _DROP\_CACHES\_COMM_ is the command that would be used to drop caches. It should run without needing root access.
_TEST\_DIR_ is the directory where all the tests will reside.
_TEST\_LOG\_DIR_ is the directory where the performance logs will be gathered. It should be created before running the testsuite for the first time.
_BASEBRANCH_ is the branch against which all new tests will be compared. It should normally be set to the latest Moses stable release.
_MOSES\_PROFILER\_REPO_ is a path to a moses repository set up and built with profiling enabled. Optional if you want to produce profiling results.
_MOSES\_GOOGLE\_PROFILER\_REPO_ is the path to a moses repository set up and built with full tcmalloc and the profiler, and with shared linking, for use with gperftools.
### Creating tests
In order to create a test one should go into the TEST_DIR and create a new folder. That folder will be used for the name of the test.
@ -45,7 +47,7 @@ An example such configuration file is **test\_config**
<pre>
Command: moses -f ... -i fff #Looks for the command in the /bin directory of the repo specified in the testsuite_config
LDPRE: ldpreloads #Comma separated LD_LIBRARY_PATH:/,
Variants: vanilla, cached, ldpre, profile #Can't have cached without ldpre or vanilla
Variants: vanilla, cached, ldpre, profile, google-profiler #Can't have cached without ldpre or vanilla
</pre>
The _Command:_ line specifies the executable (which is looked up in the /bin directory of the repo) and any arguments necessary. Before running the test, the script cds to the current test directory so you can use relative paths.
@ -61,11 +63,21 @@ The _Variants:_ line specifies what type of tests should we run. This particular
If you want to produce profiler results for some tests, you need to specify the _MOSES\_PROFILER\_REPO_ in the config
```bash
git clone https://github.com/moses-smt/mosesdecoder.git mosesdecoder-profile
cd mosesdecoder
cd mosesdecoder-profile
./bjam -j10 --with-cmph=/usr/include/ variant=profile
```
Afterwards for testcases which contain the **profile** keyword in **Variants** you will see a directory inside _TEST\_LOG\_DIR which contains the **gprof** output from every run.
Afterwards, for testcases which contain the **profile** keyword in **Variants**, you will see a directory inside _TEST\_LOG\_DIR_ which contains the **gprof** output from every run (files ending in **\_profile**).
#### Produce google profiler results.
If you want to produce google profiler results for some tests, you need to specify the _MOSES\_GOOGLE\_PROFILER\_REPO_ in the config
```bash
git clone https://github.com/moses-smt/mosesdecoder.git mosesdecoder-google-profile
cd mosesdecoder
./bjam link=shared -j10 --full-tcmalloc --with-cmph=/usr/include/
```
Afterwards, for testcases which contain the **google-profiler** keyword in **Variants**, you will see a directory inside _TEST\_LOG\_DIR_ which contains the **google-profiler** output from every run (files prefixed with **pprof**). To analyze the output you need to use [pprof](http://google-perftools.googlecode.com/svn/trunk/doc/cpuprofile.html).
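A minimal sketch of analyzing one of those files (the binary path, test name and timestamp here are illustrative examples):

```bash
# Flat text report: top functions by CPU samples for one recorded run
pprof --text mosesdecoder-google-profile/bin/moses \
  /home/moses-speedtest/phrase_tables/testlogs/mytest/pprof_28.07.2015_14:29:49_mytest_vanilla
```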
### Running tests.
Running the tests is done through the **runtests.py** script.


@ -2,6 +2,7 @@
import os
import subprocess
import time
import shutil
from argparse import ArgumentParser
from testsuite_common import processLogLine
@ -26,16 +27,21 @@ def parse_cmd():
arguments = parser.parse_args()
return arguments
def repoinit(testconfig, profiler=True):
def repoinit(testconfig, profiler=None):
"""Determines revision and sets up the repo. If given the profiler optional
argument, wil init the profiler repo instead of the default one."""
revision = ''
#Update the repo
if profiler:
if profiler == "gnu-profiler":
if testconfig.repo_prof is not None:
os.chdir(testconfig.repo_prof)
else:
raise ValueError('Profiling repo is not defined')
elif profiler == "google-profiler":
if testconfig.repo_gprof is not None:
os.chdir(testconfig.repo_gprof)
else:
raise ValueError('Profiling repo is not defined')
else:
os.chdir(testconfig.repo)
#Checkout specific branch, else maintain main branch
@ -61,9 +67,10 @@ def repoinit(testconfig, profiler=True):
class Configuration:
"""A simple class to hold all of the configuration constatns"""
def __init__(self, repo, drop_caches, tests, testlogs, basebranch, baserev, repo_prof=None):
def __init__(self, repo, drop_caches, tests, testlogs, basebranch, baserev, repo_prof=None, repo_gprof=None):
self.repo = repo
self.repo_prof = repo_prof
self.repo_gprof = repo_gprof
self.drop_caches = drop_caches
self.tests = tests
self.testlogs = testlogs
@ -88,16 +95,17 @@ class Configuration:
class Test:
"""A simple class to contain all information about tests"""
def __init__(self, name, command, ldopts, permutations, prof_command=None):
def __init__(self, name, command, ldopts, permutations, prof_command=None, gprof_command=None):
self.name = name
self.command = command
self.prof_command = prof_command
self.gprof_command = gprof_command
self.ldopts = ldopts.replace(' ', '').split(',') #Not tested yet
self.permutations = permutations
def parse_configfile(conffile, testdir, moses_repo, moses_prof_repo=None):
def parse_configfile(conffile, testdir, moses_repo, moses_prof_repo=None, moses_gprof_repo=None):
"""Parses the config file"""
command, ldopts, prof_command = '', '', None
command, ldopts, prof_command, gprof_command = '', '', None, None
permutations = []
fileopen = open(conffile, 'r')
for line in fileopen:
@ -108,8 +116,10 @@ def parse_configfile(conffile, testdir, moses_repo, moses_prof_repo=None):
if opt == 'Command:':
command = args.replace('\n', '')
if moses_prof is not None: # Get optional command for profiling
if moses_prof_repo is not None: # Get optional command for profiling
prof_command = moses_prof_repo + '/bin/' + command
if moses_gprof_repo is not None: # Get optional command for google-perftools
gprof_command = moses_gprof_repo + '/bin/' + command
command = moses_repo + '/bin/' + command
elif opt == 'LDPRE:':
ldopts = args.replace('\n', '')
@ -118,14 +128,14 @@ def parse_configfile(conffile, testdir, moses_repo, moses_prof_repo=None):
else:
raise ValueError('Unrecognized option ' + opt)
#We use the testdir as the name.
testcase = Test(testdir, command, ldopts, permutations, prof_command)
testcase = Test(testdir, command, ldopts, permutations, prof_command, gprof_command)
fileopen.close()
return testcase
def parse_testconfig(conffile):
"""Parses the config file for the whole testsuite."""
repo_path, drop_caches, tests_dir, testlog_dir = '', '', '', ''
basebranch, baserev, repo_prof_path = '', '', None
basebranch, baserev, repo_prof_path, repo_gprof_path = '', '', None, None
fileopen = open(conffile, 'r')
for line in fileopen:
line = line.split('#')[0] # Discard comments
@ -146,10 +156,12 @@ def parse_testconfig(conffile):
baserev = args.replace('\n', '')
elif opt == 'MOSES_PROFILER_REPO:': # Optional
repo_prof_path = args.replace('\n', '')
elif opt == 'MOSES_GOOGLE_PROFILER_REPO:': # Optional
repo_gprof_path = args.replace('\n', '')
else:
raise ValueError('Unrecognized option ' + opt)
config = Configuration(repo_path, drop_caches, tests_dir, testlog_dir,\
basebranch, baserev, repo_prof_path)
basebranch, baserev, repo_prof_path, repo_gprof_path)
fileopen.close()
return config
@ -160,7 +172,9 @@ def get_config():
config.additional_args(args.singletestdir, args.revision, args.branch)
revision = repoinit(config)
if config.repo_prof is not None:
repoinit(config, True)
repoinit(config, "gnu-profiler")
if config.repo_gprof is not None:
repoinit(config, "google-profiler")
config.set_revision(revision)
return config
@ -212,16 +226,27 @@ def write_gprof(command, name, variant, config):
executable_path = command.split(' ')[0] # Path to the moses binary
gprof_command = 'gprof ' + executable_path + ' ' + gmon_path + ' > ' + outputfile
subprocess.call([gprof_command], shell=True)
os.remove('gmon_path') # After we are done discard the gmon file
os.remove(gmon_path) # After we are done discard the gmon file
def execute_test(command, path, name, variant, config, profile=False):
def write_pprof(name, variant, config):
"""Copies the google-perftools profiler output to the corresponding test directory"""
output_dir = config.testlogs + '/' + name
if not os.path.exists(output_dir):
os.makedirs(output_dir)
outputfile = output_dir + '/pprof_' + time.strftime("%d.%m.%Y_%H:%M:%S") + '_' + name + '_' + variant
shutil.move("/tmp/moses.prof", outputfile)
def execute_test(command, path, name, variant, config, profile=None):
"""Executes a testcase given a whole command, path to the test file output,
name of the test and variant tested. Config is the global configuration"""
subprocess.Popen([command], stdout=None, stderr=subprocess.PIPE, shell=True).communicate()
if not profile:
if profile is None:
write_log(path, name + '_' + variant, config)
else: # Basically produce a gmon output
elif profile == "gnu-profiler": # Basically produce a gmon output
write_gprof(command, name, variant, config)
elif profile == "google-profiler":
write_pprof(name, variant, config)
def execute_tests(testcase, cur_directory, config):
@ -271,9 +296,9 @@ def execute_tests(testcase, cur_directory, config):
if 'vanilla' in testcase.permutations:
whole_command = testcase.prof_command
execute_test(whole_command, time_path, testcase.name, 'profile', config, True)
execute_test(whole_command, time_path, testcase.name, 'profile', config, "gnu-profiler")
if 'cached' in testcase.permutations:
execute_test(whole_command, time_path, testcase.name, 'profile_cached', config, True)
execute_test(whole_command, time_path, testcase.name, 'profile_cached', config, "gnu-profiler")
if 'ldpre' in testcase.permutations:
for opt in testcase.ldopts:
@ -282,13 +307,42 @@ def execute_tests(testcase, cur_directory, config):
subprocess.call([config.drop_caches], shell=True)
#Create the command for executing moses:
whole_command = 'LD_PRELOAD=' + opt + testcase.prof_command
whole_command = 'LD_PRELOAD=' + opt + " " + testcase.prof_command
variant = 'profile_ldpre_' + opt
#test normal and cached
execute_test(whole_command, time_path, testcase.name, variant, config, True)
execute_test(whole_command, time_path, testcase.name, variant, config, "gnu-profiler")
if 'cached' in testcase.permutations:
execute_test(whole_command, time_path, testcase.name, variant + '_cached', config, True)
execute_test(whole_command, time_path, testcase.name, variant + '_cached', config, "gnu-profiler")
#Google-perftools profiler
if 'google-profiler' in testcase.permutations:
subprocess.call(['sync'], shell=True) # Drop caches first
subprocess.call([config.drop_caches], shell=True)
#Create the command for executing moses
whole_command = "CPUPROFILE=/tmp/moses.prof " + testcase.gprof_command
#test normal and cached
execute_test(whole_command, time_path, testcase.name, 'vanilla', config, 'google-profiler')
if 'cached' in testcase.permutations:
execute_test(whole_command, time_path, testcase.name, 'vanilla_cached', config, 'google-profiler')
#Now perform LD_PRELOAD tests
if 'ldpre' in testcase.permutations:
for opt in testcase.ldopts:
#Clear caches
subprocess.call(['sync'], shell=True)
subprocess.call([config.drop_caches], shell=True)
#Create the command for executing moses:
whole_command = 'LD_PRELOAD=' + opt + ' CPUPROFILE=/tmp/moses.prof ' + testcase.gprof_command # rebuild per option so LD_PRELOADs don't accumulate
variant = 'ldpre_' + opt
#test normal and cached
execute_test(whole_command, time_path, testcase.name, variant, config, 'google-profiler')
if 'cached' in testcase.permutations:
execute_test(whole_command, time_path, testcase.name, variant + '_cached', config, 'google-profiler')
# Go through all the test directories and executes tests
@ -319,7 +373,7 @@ if __name__ == '__main__':
for logfile in os.listdir(CONFIG.testlogs):
logfile_name = CONFIG.testlogs + '/' + logfile
if not check_for_basever(logfile_name, CONFIG.basebranch):
if os.path.isfile(logfile_name) and not check_for_basever(logfile_name, CONFIG.basebranch):
logfile = logfile.replace('_vanilla', '')
logfile = logfile.replace('_cached', '')
logfile = logfile.replace('_ldpre', '')
@ -330,7 +384,7 @@ if __name__ == '__main__':
#Create a new configuration for base version tests:
BASECONFIG = Configuration(CONFIG.repo, CONFIG.drop_caches,\
CONFIG.tests, CONFIG.testlogs, CONFIG.basebranch,\
CONFIG.baserev, CONFIG.repo_prof)
CONFIG.baserev, CONFIG.repo_prof, CONFIG.repo_gprof)
BASECONFIG.additional_args(None, CONFIG.baserev, CONFIG.basebranch)
#Set up the repository and get its revision:
REVISION = repoinit(BASECONFIG)
@ -340,20 +394,28 @@ if __name__ == '__main__':
subprocess.call(['./previous.sh'], shell=True)
#If profiler configuration exists also init it
if BASECONFIG.repo_prof is not None:
repoinit(BASECONFIG, True)
repoinit(BASECONFIG, "gnu-profiler")
os.chdir(BASECONFIG.repo_prof)
subprocess.call(['./previous.sh'], shell=True)
if BASECONFIG.repo_gprof is not None:
repoinit(BASECONFIG, "google-profiler")
os.chdir(BASECONFIG.repo_gprof)
subprocess.call(['./previous.sh'], shell=True)
#Perform tests
for directory in FIRSTTIME:
cur_testcase = parse_configfile(BASECONFIG.tests + '/' + directory +\
'/config', directory, BASECONFIG.repo)
'/config', directory, BASECONFIG.repo, BASECONFIG.repo_prof, BASECONFIG.repo_gprof)
execute_tests(cur_testcase, directory, BASECONFIG)
#Reset back the repository to the normal configuration
repoinit(CONFIG)
if BASECONFIG.repo_prof is not None:
repoinit(CONFIG, True)
repoinit(CONFIG, "gnu-profiler")
if BASECONFIG.repo_gprof is not None:
repoinit(CONFIG, "google-profiler")
#Builds moses
os.chdir(CONFIG.repo)
@ -362,12 +424,16 @@ if __name__ == '__main__':
os.chdir(CONFIG.repo_prof)
subprocess.call(['./previous.sh'], shell=True)
if CONFIG.repo_gprof is not None:
os.chdir(CONFIG.repo_gprof)
subprocess.call(['./previous.sh'], shell=True)
if CONFIG.singletest:
TESTCASE = parse_configfile(CONFIG.tests + '/' +\
CONFIG.singletest + '/config', CONFIG.singletest, CONFIG.repo)
CONFIG.singletest + '/config', CONFIG.singletest, CONFIG.repo, CONFIG.repo_prof, CONFIG.repo_gprof)
execute_tests(TESTCASE, CONFIG.singletest, CONFIG)
else:
for directory in ALL_DIR:
cur_testcase = parse_configfile(CONFIG.tests + '/' + directory +\
'/config', directory, CONFIG.repo)
'/config', directory, CONFIG.repo, CONFIG.repo_prof, CONFIG.repo_gprof)
execute_tests(cur_testcase, directory, CONFIG)


@ -1,6 +1,9 @@
<?xml version="1.0" encoding="UTF-8"?>
<CodeLite_Project Name="pruneGeneration" InternalType="Console">
<Plugins>
<Plugin Name="qmake">
<![CDATA[00010001N0005Debug000000000000]]>
</Plugin>
<Plugin Name="CMakePlugin">
<![CDATA[[{
"name": "Debug",
@ -13,9 +16,6 @@
"parentProject": ""
}]]]>
</Plugin>
<Plugin Name="qmake">
<![CDATA[00010001N0005Debug000000000000]]>
</Plugin>
</Plugins>
<Description/>
<Dependencies/>
@ -44,8 +44,10 @@
<LibraryPath Value="../../../contrib/other-builds/moses/Debug"/>
<Library Value="boost_filesystem"/>
<Library Value="boost_system"/>
<Library Value="boost_iostreams"/>
<Library Value="moses"/>
<Library Value="z"/>
<Library Value="bz2"/>
</Linker>
<ResourceCompiler Options="" Required="no"/>
<General OutputFile="$(IntermediateDirectory)/$(ProjectName)" IntermediateDirectory="./Debug" Command="./$(ProjectName)" CommandArguments="" UseSeparateDebugArgs="no" DebugArguments="" WorkingDirectory="$(IntermediateDirectory)" PauseExecWhenProcTerminates="yes" IsGUIProgram="no" IsEnabled="yes"/>


@ -13,7 +13,7 @@ with-xmlrpc-c = [ option.get "with-xmlrpc-c" ] ;
if $(with-xmlrpc-c) {
echo While building mosesserver ... ;
echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" ;
echo "!!! You are linking the XMLRPC-C library; Do NOT use v.1.25.29 !!!" ;
echo "!!! You are linking the XMLRPC-C library; Must be v.1.32 (September 2012) or higher !!!" ;
echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" ;
build-moses-server = true ;


@ -740,20 +740,23 @@ int main(int argc, char** argv)
myRegistry.addMethod("updater", updater);
myRegistry.addMethod("optimize", optimizer);
/* CODE FOR old xmlrpc-c v. 1.32 or lower
xmlrpc_c::serverAbyss myAbyssServer(
myRegistry,
port, // TCP port on which to listen
logfile
);
/* doesn't work with xmlrpc-c v. 1.16.33 - ie very old lib on Ubuntu 12.04
*/
/* doesn't work with xmlrpc-c v. 1.16.33 - ie very old lib on Ubuntu 12.04 */
xmlrpc_c::serverAbyss myAbyssServer(
xmlrpc_c::serverAbyss::constrOpt()
.registryPtr(&myRegistry)
.registryP(&myRegistry)
.portNumber(port) // TCP port on which to listen
.logFileName(logfile)
.allowOrigin("*")
.maxConn((unsigned int)numThreads)
);
*/
XVERBOSE(1,"Listening on port " << port << endl);
if (isSerial) {


@ -5,10 +5,7 @@
#include <vector>
#include "StatisticsBasedScorer.h"
#include "moses/FF/InternalTree.h"
using Moses::TreePointer;
using Moses::InternalTree;
#include "InternalTree.h"
namespace MosesTuning
{

mert/InternalTree.cpp (new file, 110 lines)

@ -0,0 +1,110 @@
#include "InternalTree.h"
namespace MosesTuning
{
InternalTree::InternalTree(const std::string & line, const bool terminal):
m_isTerminal(terminal)
{
size_t found = line.find_first_of("[] ");
if (found == line.npos) {
m_value = line;
}
else {
AddSubTree(line, 0);
}
}
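// Parses the bracketed tree notation, e.g. (hypothetical input):
//   "[S [NP the house] [VP is small]]"
// '[' opens a nonterminal, ']' closes it, and bare tokens are terminals.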
size_t InternalTree::AddSubTree(const std::string & line, size_t pos)
{
std::string value;
char token = 0;
while (token != ']' && pos != std::string::npos) {
size_t oldpos = pos;
pos = line.find_first_of("[] ", pos);
if (pos == std::string::npos) break;
token = line[pos];
value = line.substr(oldpos,pos-oldpos);
if (token == '[') {
if (m_value.size() > 0) {
m_children.push_back(boost::make_shared<InternalTree>(value,false));
pos = m_children.back()->AddSubTree(line, pos+1);
} else {
if (value.size() > 0) {
m_value = value;
}
pos = AddSubTree(line, pos+1);
}
} else if (token == ' ' || token == ']') {
if (value.size() > 0 && !(m_value.size() > 0)) {
m_value = value;
} else if (value.size() > 0) {
m_isTerminal = false;
m_children.push_back(boost::make_shared<InternalTree>(value,true));
}
if (token == ' ') {
pos++;
}
}
if (m_children.size() > 0) {
m_isTerminal = false;
}
}
if (pos == std::string::npos) {
return line.size();
}
return std::min(line.size(),pos+1);
}
std::string InternalTree::GetString(bool start) const
{
std::string ret = "";
if (!start) {
ret += " ";
}
if (!m_isTerminal) {
ret += "[";
}
ret += m_value;
for (std::vector<TreePointer>::const_iterator it = m_children.begin(); it != m_children.end(); ++it) {
ret += (*it)->GetString(false);
}
if (!m_isTerminal) {
ret += "]";
}
return ret;
}
void InternalTree::Combine(const std::vector<TreePointer> &previous)
{
std::vector<TreePointer>::iterator it;
bool found = false;
leafNT next_leafNT(this);
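// next_leafNT successively yields (into 'it') an iterator to each
// nonterminal leaf of this tree; each such leaf is then replaced by the
// corresponding previously constructed subtree.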
for (std::vector<TreePointer>::const_iterator it_prev = previous.begin(); it_prev != previous.end(); ++it_prev) {
found = next_leafNT(it);
if (found) {
*it = *it_prev;
} else {
std::cerr << "Warning: leaf nonterminal not found in rule; why did this happen?\n";
}
}
}
}

mert/InternalTree.h (new file, 77 lines)

@ -0,0 +1,77 @@
#pragma once
#include <iostream>
#include <string>
#include <map>
#include <vector>
#include <boost/shared_ptr.hpp>
#include <boost/make_shared.hpp>
#include "util/generator.hh"
#include "util/exception.hh"
namespace MosesTuning
{
class InternalTree;
typedef boost::shared_ptr<InternalTree> TreePointer;
typedef int NTLabel;
class InternalTree
{
std::string m_value;
std::vector<TreePointer> m_children;
bool m_isTerminal;
public:
InternalTree(const std::string & line, const bool terminal = false);
InternalTree(const InternalTree & tree):
m_value(tree.m_value),
m_isTerminal(tree.m_isTerminal) {
const std::vector<TreePointer> & children = tree.m_children;
for (std::vector<TreePointer>::const_iterator it = children.begin(); it != children.end(); it++) {
m_children.push_back(boost::make_shared<InternalTree>(**it));
}
}
size_t AddSubTree(const std::string & line, size_t start);
std::string GetString(bool start = true) const;
void Combine(const std::vector<TreePointer> &previous);
const std::string & GetLabel() const {
return m_value;
}
size_t GetLength() const {
return m_children.size();
}
std::vector<TreePointer> & GetChildren() {
return m_children;
}
bool IsTerminal() const {
return m_isTerminal;
}
bool IsLeafNT() const {
return (!m_isTerminal && m_children.size() == 0);
}
};
// Python-like generator that yields next nonterminal leaf on every call
$generator(leafNT)
{
std::vector<TreePointer>::iterator it;
InternalTree* tree;
leafNT(InternalTree* root = 0): tree(root) {}
$emit(std::vector<TreePointer>::iterator)
for (it = tree->GetChildren().begin(); it !=tree->GetChildren().end(); ++it) {
if (!(*it)->IsTerminal() && (*it)->GetLength() == 0) {
$yield(it);
} else if ((*it)->GetLength() > 0) {
if ((*it).get()) { // normal pointer to same object that TreePointer points to
$restart(tree = (*it).get());
}
}
}
$stop;
};
}


@ -30,7 +30,7 @@ InterpolatedScorer.cpp
Point.cpp
PerScorer.cpp
HwcmScorer.cpp
../moses/FF/InternalTree.cpp
InternalTree.cpp
Scorer.cpp
ScorerFactory.cpp
Optimizer.cpp


@ -28,7 +28,8 @@ BaseManager::GetSource() const
}
const ttasksptr
BaseManager::GetTtask() const {
BaseManager::GetTtask() const
{
return m_ttask.lock();
}


@ -167,7 +167,14 @@ run_as_server()
myRegistry.addMethod("updater", updater);
myRegistry.addMethod("optimize", optimizer);
xmlrpc_c::serverAbyss myAbyssServer(myRegistry, port, logfile);
xmlrpc_c::serverAbyss myAbyssServer(
xmlrpc_c::serverAbyss::constrOpt()
.registryP(&myRegistry)
.portNumber(port) // TCP port on which to listen
.logFileName(logfile)
.allowOrigin("*")
.maxConn((unsigned int)num_threads)
);
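// constrOpt() is xmlrpc-c's builder-style server configuration; compared to
// the old (registry, port, logfile) constructor it also lets us set
// allowOrigin for cross-origin requests and cap connections at the thread count.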
XVERBOSE(1,"Listening on port " << port << endl);
if (isSerial) {


@ -1,27 +1,24 @@
#include "InternalTree.h"
#include "moses/StaticData.h"
namespace Moses
{
InternalTree::InternalTree(const std::string & line, size_t start, size_t len, const bool terminal):
m_value_nt(0),
m_isTerminal(terminal)
InternalTree::InternalTree(const std::string & line, size_t start, size_t len, const bool nonterminal)
{
if (len > 0) {
m_value.assign(line, start, len);
m_value.CreateFromString(Output, StaticData::Instance().GetOutputFactorOrder(), StringPiece(line).substr(start, len), nonterminal);
}
}
InternalTree::InternalTree(const std::string & line, const bool terminal):
m_value_nt(0),
m_isTerminal(terminal)
InternalTree::InternalTree(const std::string & line, const bool nonterminal)
{
size_t found = line.find_first_of("[] ");
if (found == line.npos) {
m_value = line;
m_value.CreateFromString(Output, StaticData::Instance().GetOutputFactorOrder(), line, nonterminal);
} else {
AddSubTree(line, 0);
}
@ -32,6 +29,7 @@ size_t InternalTree::AddSubTree(const std::string & line, size_t pos)
char token = 0;
size_t len = 0;
bool has_value = false;
while (token != ']' && pos != std::string::npos) {
size_t oldpos = pos;
@ -41,30 +39,27 @@ size_t InternalTree::AddSubTree(const std::string & line, size_t pos)
len = pos-oldpos;
if (token == '[') {
if (!m_value.empty()) {
m_children.push_back(boost::make_shared<InternalTree>(line, oldpos, len, false));
if (has_value) {
m_children.push_back(boost::make_shared<InternalTree>(line, oldpos, len, true));
pos = m_children.back()->AddSubTree(line, pos+1);
} else {
if (len > 0) {
m_value.assign(line, oldpos, len);
m_value.CreateFromString(Output, StaticData::Instance().GetOutputFactorOrder(), StringPiece(line).substr(oldpos, len), false);
has_value = true;
}
pos = AddSubTree(line, pos+1);
}
} else if (token == ' ' || token == ']') {
if (len > 0 && m_value.empty()) {
m_value.assign(line, oldpos, len);
if (len > 0 && !has_value) {
m_value.CreateFromString(Output, StaticData::Instance().GetOutputFactorOrder(), StringPiece(line).substr(oldpos, len), true);
has_value = true;
} else if (len > 0) {
m_isTerminal = false;
m_children.push_back(boost::make_shared<InternalTree>(line, oldpos, len, true));
m_children.push_back(boost::make_shared<InternalTree>(line, oldpos, len, false));
}
if (token == ' ') {
pos++;
}
}
if (!m_children.empty()) {
m_isTerminal = false;
}
}
if (pos == std::string::npos) {
@ -82,16 +77,16 @@ std::string InternalTree::GetString(bool start) const
ret += " ";
}
if (!m_isTerminal) {
if (!IsTerminal()) {
ret += "[";
}
ret += m_value;
ret += m_value.GetString(StaticData::Instance().GetOutputFactorOrder(), false);
for (std::vector<TreePointer>::const_iterator it = m_children.begin(); it != m_children.end(); ++it) {
ret += (*it)->GetString(false);
}
if (!m_isTerminal) {
if (!IsTerminal()) {
ret += "]";
}
return ret;
@ -120,13 +115,13 @@ void InternalTree::Unbinarize()
{
// nodes with virtual label cannot be unbinarized
if (m_value.empty() || m_value[0] == '^') {
if (m_value.GetString(0).empty() || m_value.GetString(0).as_string()[0] == '^') {
return;
}
//if node has child that is virtual node, get unbinarized list of children
for (std::vector<TreePointer>::iterator it = m_children.begin(); it != m_children.end(); ++it) {
if (!(*it)->IsTerminal() && (*it)->GetLabel()[0] == '^') {
if (!(*it)->IsTerminal() && (*it)->GetLabel().GetString(0).as_string()[0] == '^') {
std::vector<TreePointer> new_children;
GetUnbinarizedChildren(new_children);
m_children = new_children;
@ -144,8 +139,8 @@ void InternalTree::Unbinarize()
void InternalTree::GetUnbinarizedChildren(std::vector<TreePointer> &ret) const
{
for (std::vector<TreePointer>::const_iterator itx = m_children.begin(); itx != m_children.end(); ++itx) {
const std::string &label = (*itx)->GetLabel();
if (!label.empty() && label[0] == '^') {
const StringPiece label = (*itx)->GetLabel().GetString(0);
if (!label.empty() && label.as_string()[0] == '^') {
(*itx)->GetUnbinarizedChildren(ret);
} else {
ret.push_back(*itx);
@ -153,7 +148,7 @@ void InternalTree::GetUnbinarizedChildren(std::vector<TreePointer> &ret) const
}
}
bool InternalTree::FlatSearch(const std::string & label, std::vector<TreePointer>::const_iterator & it) const
bool InternalTree::FlatSearch(const Word & label, std::vector<TreePointer>::const_iterator & it) const
{
for (it = m_children.begin(); it != m_children.end(); ++it) {
if ((*it)->GetLabel() == label) {
@ -163,7 +158,7 @@ bool InternalTree::FlatSearch(const std::string & label, std::vector<TreePointer
return false;
}
bool InternalTree::RecursiveSearch(const std::string & label, std::vector<TreePointer>::const_iterator & it) const
bool InternalTree::RecursiveSearch(const Word & label, std::vector<TreePointer>::const_iterator & it) const
{
for (it = m_children.begin(); it != m_children.end(); ++it) {
if ((*it)->GetLabel() == label) {
@ -178,7 +173,7 @@ bool InternalTree::RecursiveSearch(const std::string & label, std::vector<TreePo
return false;
}
bool InternalTree::RecursiveSearch(const std::string & label, std::vector<TreePointer>::const_iterator & it, InternalTree const* &parent) const
bool InternalTree::RecursiveSearch(const Word & label, std::vector<TreePointer>::const_iterator & it, InternalTree const* &parent) const
{
for (it = m_children.begin(); it != m_children.end(); ++it) {
if ((*it)->GetLabel() == label) {
@ -194,88 +189,4 @@ bool InternalTree::RecursiveSearch(const std::string & label, std::vector<TreePo
return false;
}
bool InternalTree::FlatSearch(const NTLabel & label, std::vector<TreePointer>::const_iterator & it) const
{
for (it = m_children.begin(); it != m_children.end(); ++it) {
if ((*it)->GetNTLabel() == label) {
return true;
}
}
return false;
}
bool InternalTree::RecursiveSearch(const NTLabel & label, std::vector<TreePointer>::const_iterator & it) const
{
for (it = m_children.begin(); it != m_children.end(); ++it) {
if ((*it)->GetNTLabel() == label) {
return true;
}
std::vector<TreePointer>::const_iterator it2;
if ((*it)->RecursiveSearch(label, it2)) {
it = it2;
return true;
}
}
return false;
}
bool InternalTree::RecursiveSearch(const NTLabel & label, std::vector<TreePointer>::const_iterator & it, InternalTree const* &parent) const
{
for (it = m_children.begin(); it != m_children.end(); ++it) {
if ((*it)->GetNTLabel() == label) {
parent = this;
return true;
}
std::vector<TreePointer>::const_iterator it2;
if ((*it)->RecursiveSearch(label, it2, parent)) {
it = it2;
return true;
}
}
return false;
}
bool InternalTree::FlatSearch(const std::vector<NTLabel> & labels, std::vector<TreePointer>::const_iterator & it) const
{
for (it = m_children.begin(); it != m_children.end(); ++it) {
if (std::binary_search(labels.begin(), labels.end(), (*it)->GetNTLabel())) {
return true;
}
}
return false;
}
bool InternalTree::RecursiveSearch(const std::vector<NTLabel> & labels, std::vector<TreePointer>::const_iterator & it) const
{
for (it = m_children.begin(); it != m_children.end(); ++it) {
if (std::binary_search(labels.begin(), labels.end(), (*it)->GetNTLabel())) {
return true;
}
std::vector<TreePointer>::const_iterator it2;
if ((*it)->RecursiveSearch(labels, it2)) {
it = it2;
return true;
}
}
return false;
}
bool InternalTree::RecursiveSearch(const std::vector<NTLabel> & labels, std::vector<TreePointer>::const_iterator & it, InternalTree const* &parent) const
{
for (it = m_children.begin(); it != m_children.end(); ++it) {
if (std::binary_search(labels.begin(), labels.end(), (*it)->GetNTLabel())) {
parent = this;
return true;
}
std::vector<TreePointer>::const_iterator it2;
if ((*it)->RecursiveSearch(labels, it2, parent)) {
it = it2;
return true;
}
}
return false;
}
}


@ -5,30 +5,28 @@
#include <map>
#include <vector>
#include "FFState.h"
#include "moses/Word.h"
#include <boost/shared_ptr.hpp>
#include <boost/make_shared.hpp>
#include "util/generator.hh"
#include "util/exception.hh"
#include "util/string_piece.hh"
namespace Moses
{
class InternalTree;
typedef boost::shared_ptr<InternalTree> TreePointer;
typedef int NTLabel;
class InternalTree
{
std::string m_value;
NTLabel m_value_nt;
Word m_value;
std::vector<TreePointer> m_children;
bool m_isTerminal;
public:
InternalTree(const std::string & line, size_t start, size_t len, const bool terminal);
InternalTree(const std::string & line, const bool terminal = false);
InternalTree(const std::string & line, const bool nonterminal = true);
InternalTree(const InternalTree & tree):
m_value(tree.m_value),
m_isTerminal(tree.m_isTerminal) {
m_value(tree.m_value) {
const std::vector<TreePointer> & children = tree.m_children;
for (std::vector<TreePointer>::const_iterator it = children.begin(); it != children.end(); it++) {
m_children.push_back(boost::make_shared<InternalTree>(**it));
@ -40,20 +38,10 @@ public:
void Combine(const std::vector<TreePointer> &previous);
void Unbinarize();
void GetUnbinarizedChildren(std::vector<TreePointer> &children) const;
const std::string & GetLabel() const {
const Word & GetLabel() const {
return m_value;
}
// optionally identify label by int instead of string;
// allows abstraction if multiple nonterminal strings should map to same label.
const NTLabel & GetNTLabel() const {
return m_value_nt;
}
void SetNTLabel(NTLabel value) {
m_value_nt = value;
}
size_t GetLength() const {
return m_children.size();
}
@ -62,38 +50,22 @@ public:
}
bool IsTerminal() const {
return m_isTerminal;
return !m_value.IsNonTerminal();
}
bool IsLeafNT() const {
return (!m_isTerminal && m_children.size() == 0);
return (m_value.IsNonTerminal() && m_children.size() == 0);
}
// different methods to search a tree (either just direct children (FlatSearch) or all children (RecursiveSearch)) for constituents.
// can be used for formulating syntax constraints.
// if found, 'it' is iterator to first tree node that matches search string
bool FlatSearch(const std::string & label, std::vector<TreePointer>::const_iterator & it) const;
bool RecursiveSearch(const std::string & label, std::vector<TreePointer>::const_iterator & it) const;
bool FlatSearch(const Word & label, std::vector<TreePointer>::const_iterator & it) const;
bool RecursiveSearch(const Word & label, std::vector<TreePointer>::const_iterator & it) const;
// if found, 'it' is iterator to first tree node that matches search string, and 'parent' to its parent node
bool RecursiveSearch(const std::string & label, std::vector<TreePointer>::const_iterator & it, InternalTree const* &parent) const;
// use NTLabel for search to reduce number of string comparisons / deal with synonymous labels
// if found, 'it' is iterator to first tree node that matches search string
bool FlatSearch(const NTLabel & label, std::vector<TreePointer>::const_iterator & it) const;
bool RecursiveSearch(const NTLabel & label, std::vector<TreePointer>::const_iterator & it) const;
// if found, 'it' is iterator to first tree node that matches search string, and 'parent' to its parent node
bool RecursiveSearch(const NTLabel & label, std::vector<TreePointer>::const_iterator & it, InternalTree const* &parent) const;
// pass vector of possible labels to search
// if found, 'it' is iterator to first tree node that matches search string
bool FlatSearch(const std::vector<NTLabel> & labels, std::vector<TreePointer>::const_iterator & it) const;
bool RecursiveSearch(const std::vector<NTLabel> & labels, std::vector<TreePointer>::const_iterator & it) const;
// if found, 'it' is iterator to first tree node that matches search string, and 'parent' to its parent node
bool RecursiveSearch(const std::vector<NTLabel> & labels, std::vector<TreePointer>::const_iterator & it, InternalTree const* &parent) const;
bool RecursiveSearch(const Word & label, std::vector<TreePointer>::const_iterator & it, InternalTree const* &parent) const;
// Python-like generator that yields next nonterminal leaf on every call
$generator(leafNT) {


@ -75,7 +75,7 @@ void Model1Vocabulary::Load(const std::string& fileName)
++i;
std::vector<std::string> tokens = Tokenize(line);
UTIL_THROW_IF2(tokens.size()!=3, "Line " << i << " in " << fileName << " has wrong number of tokens.");
unsigned id = Scan<unsigned>(tokens[0]);
unsigned id = std::atoll( tokens[0].c_str() );
if (! ( (id == 1) && (tokens[1] == "UNK") )) {
const Factor* factor = factorCollection.AddFactor(tokens[1],false); // TODO: can we assume that the vocabulary is known and filter the model on loading?
bool stored = Store(factor, id);
@ -86,7 +86,7 @@ void Model1Vocabulary::Load(const std::string& fileName)
++i;
std::vector<std::string> tokens = Tokenize(line);
UTIL_THROW_IF2(tokens.size()!=3, "Line " << i << " in " << fileName << " has wrong number of tokens.");
unsigned id = Scan<unsigned>(tokens[0]);
unsigned id = std::atoll( tokens[0].c_str() );
const Factor* factor = factorCollection.AddFactor(tokens[1],false); // TODO: can we assume that the vocabulary is known and filter the model on loading?
bool stored = Store(factor, id);
UTIL_THROW_IF2(!stored, "Line " << i << " in " << fileName << " overwrites existing vocabulary entry.");
@ -105,11 +105,11 @@ void Model1LexicalTable::Load(const std::string &fileName, const Model1Vocabular
++i;
std::vector<std::string> tokens = Tokenize(line);
UTIL_THROW_IF2(tokens.size()!=3, "Line " << i << " in " << fileName << " has wrong number of tokens.");
unsigned idS = Scan<unsigned>(tokens[0]);
unsigned idT = Scan<unsigned>(tokens[1]);
unsigned idS = std::atoll( tokens[0].c_str() );
unsigned idT = std::atoll( tokens[1].c_str() );
const Factor* wordS = vcbS.GetWord(idS);
const Factor* wordT = vcbT.GetWord(idT);
float prob = Scan<float>(tokens[2]);
float prob = std::atof( tokens[2].c_str() );
if ( (wordS != NULL) && (wordT != NULL) ) {
m_ltable[ wordS ][ wordT ] = prob;
}


@ -134,7 +134,7 @@ void PhraseOrientationFeature::EvaluateInIsolation(const Phrase &source,
if (targetPhrase.GetAlignNonTerm().GetSize() != 0) {
// Initialize phrase orientation scoring object
MosesTraining::GHKM::PhraseOrientation phraseOrientation(source.GetSize(), targetPhrase.GetSize(),
MosesTraining::Syntax::GHKM::PhraseOrientation phraseOrientation(source.GetSize(), targetPhrase.GetSize(),
targetPhrase.GetAlignTerm(), targetPhrase.GetAlignNonTerm());
PhraseOrientationFeature::ReoClassData* reoClassData = new PhraseOrientationFeature::ReoClassData();
@ -150,7 +150,7 @@ void PhraseOrientationFeature::EvaluateInIsolation(const Phrase &source,
// LEFT-TO-RIGHT DIRECTION
MosesTraining::GHKM::PhraseOrientation::REO_CLASS l2rOrientation = phraseOrientation.GetOrientationInfo(sourceIndex,sourceIndex,MosesTraining::GHKM::PhraseOrientation::REO_DIR_L2R);
MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS l2rOrientation = phraseOrientation.GetOrientationInfo(sourceIndex,sourceIndex,MosesTraining::Syntax::GHKM::PhraseOrientation::REO_DIR_L2R);
if ( ((targetIndex == 0) || !phraseOrientation.TargetSpanIsAligned(0,targetIndex)) // boundary non-terminal in rule-initial position (left boundary)
&& (targetPhraseLHS != m_glueTargetLHS) ) { // and not glue rule
@ -170,7 +170,7 @@ void PhraseOrientationFeature::EvaluateInIsolation(const Phrase &source,
if (reoClassData->firstNonTerminalPreviousSourceSpanIsAligned &&
reoClassData->firstNonTerminalFollowingSourceSpanIsAligned) {
// discontinuous
l2rOrientation = MosesTraining::GHKM::PhraseOrientation::REO_CLASS_DLEFT;
l2rOrientation = MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_DLEFT;
} else {
reoClassData->firstNonTerminalIsBoundary = true;
}
@ -180,7 +180,7 @@ void PhraseOrientationFeature::EvaluateInIsolation(const Phrase &source,
// RIGHT-TO-LEFT DIRECTION
MosesTraining::GHKM::PhraseOrientation::REO_CLASS r2lOrientation = phraseOrientation.GetOrientationInfo(sourceIndex,sourceIndex,MosesTraining::GHKM::PhraseOrientation::REO_DIR_R2L);
MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS r2lOrientation = phraseOrientation.GetOrientationInfo(sourceIndex,sourceIndex,MosesTraining::Syntax::GHKM::PhraseOrientation::REO_DIR_R2L);
if ( ((targetIndex == targetPhrase.GetSize()-1) || !phraseOrientation.TargetSpanIsAligned(targetIndex,targetPhrase.GetSize()-1)) // boundary non-terminal in rule-final position (right boundary)
&& (targetPhraseLHS != m_glueTargetLHS) ) { // and not glue rule
@ -200,7 +200,7 @@ void PhraseOrientationFeature::EvaluateInIsolation(const Phrase &source,
if (reoClassData->lastNonTerminalPreviousSourceSpanIsAligned &&
reoClassData->lastNonTerminalFollowingSourceSpanIsAligned) {
// discontinuous
r2lOrientation = MosesTraining::GHKM::PhraseOrientation::REO_CLASS_DLEFT;
r2lOrientation = MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_DLEFT;
} else {
reoClassData->lastNonTerminalIsBoundary = true;
}
@ -335,25 +335,25 @@ FFState* PhraseOrientationFeature::EvaluateWhenApplied(
// LEFT-TO-RIGHT DIRECTION
MosesTraining::GHKM::PhraseOrientation::REO_CLASS l2rOrientation = reoClassData->nonTerminalReoClassL2R[nNT];
MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS l2rOrientation = reoClassData->nonTerminalReoClassL2R[nNT];
IFFEATUREVERBOSE(2) {
FEATUREVERBOSE(2, "l2rOrientation ");
switch (l2rOrientation) {
case MosesTraining::GHKM::PhraseOrientation::REO_CLASS_LEFT:
case MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_LEFT:
FEATUREVERBOSE2(2, "mono" << std::endl);
break;
case MosesTraining::GHKM::PhraseOrientation::REO_CLASS_RIGHT:
case MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_RIGHT:
FEATUREVERBOSE2(2, "swap" << std::endl);
break;
case MosesTraining::GHKM::PhraseOrientation::REO_CLASS_DLEFT:
case MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_DLEFT:
FEATUREVERBOSE2(2, "dleft" << std::endl);
break;
case MosesTraining::GHKM::PhraseOrientation::REO_CLASS_DRIGHT:
case MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_DRIGHT:
FEATUREVERBOSE2(2, "dright" << std::endl);
break;
case MosesTraining::GHKM::PhraseOrientation::REO_CLASS_UNKNOWN:
// modelType == MosesTraining::GHKM::PhraseOrientation::REO_MSLR
case MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_UNKNOWN:
// modelType == MosesTraining::Syntax::GHKM::PhraseOrientation::REO_MSLR
FEATUREVERBOSE2(2, "unknown->dleft" << std::endl);
break;
default:
@ -396,23 +396,23 @@ FFState* PhraseOrientationFeature::EvaluateWhenApplied(
} else {
if ( l2rOrientation == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_LEFT ) {
if ( l2rOrientation == MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_LEFT ) {
newScores[0] += TransformScore(orientationPhraseProperty->GetLeftToRightProbabilityMono());
// if sub-derivation has left-boundary non-terminal:
// add recursive actual score of boundary non-terminal from subderivation
LeftBoundaryL2RScoreRecursive(featureID, prevState, 0x1, newScores, accumulator);
} else if ( l2rOrientation == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_RIGHT ) {
} else if ( l2rOrientation == MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_RIGHT ) {
newScores[1] += TransformScore(orientationPhraseProperty->GetLeftToRightProbabilitySwap());
// if sub-derivation has left-boundary non-terminal:
// add recursive actual score of boundary non-terminal from subderivation
LeftBoundaryL2RScoreRecursive(featureID, prevState, 0x2, newScores, accumulator);
} else if ( ( l2rOrientation == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_DLEFT ) ||
( l2rOrientation == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_DRIGHT ) ||
( l2rOrientation == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_UNKNOWN ) ) {
} else if ( ( l2rOrientation == MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_DLEFT ) ||
( l2rOrientation == MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_DRIGHT ) ||
( l2rOrientation == MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_UNKNOWN ) ) {
newScores[2] += TransformScore(orientationPhraseProperty->GetLeftToRightProbabilityDiscontinuous());
// if sub-derivation has left-boundary non-terminal:
@ -437,25 +437,25 @@ FFState* PhraseOrientationFeature::EvaluateWhenApplied(
// RIGHT-TO-LEFT DIRECTION
MosesTraining::GHKM::PhraseOrientation::REO_CLASS r2lOrientation = reoClassData->nonTerminalReoClassR2L[nNT];
MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS r2lOrientation = reoClassData->nonTerminalReoClassR2L[nNT];
IFFEATUREVERBOSE(2) {
FEATUREVERBOSE(2, "r2lOrientation ");
switch (r2lOrientation) {
case MosesTraining::GHKM::PhraseOrientation::REO_CLASS_LEFT:
case MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_LEFT:
FEATUREVERBOSE2(2, "mono" << std::endl);
break;
case MosesTraining::GHKM::PhraseOrientation::REO_CLASS_RIGHT:
case MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_RIGHT:
FEATUREVERBOSE2(2, "swap" << std::endl);
break;
case MosesTraining::GHKM::PhraseOrientation::REO_CLASS_DLEFT:
case MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_DLEFT:
FEATUREVERBOSE2(2, "dleft" << std::endl);
break;
case MosesTraining::GHKM::PhraseOrientation::REO_CLASS_DRIGHT:
case MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_DRIGHT:
FEATUREVERBOSE2(2, "dright" << std::endl);
break;
case MosesTraining::GHKM::PhraseOrientation::REO_CLASS_UNKNOWN:
// modelType == MosesTraining::GHKM::PhraseOrientation::REO_MSLR
case MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_UNKNOWN:
// modelType == MosesTraining::Syntax::GHKM::PhraseOrientation::REO_MSLR
FEATUREVERBOSE2(2, "unknown->dleft" << std::endl);
break;
default:
@ -498,23 +498,23 @@ FFState* PhraseOrientationFeature::EvaluateWhenApplied(
} else {
if ( r2lOrientation == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_LEFT ) {
if ( r2lOrientation == MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_LEFT ) {
newScores[m_offsetR2LScores+0] += TransformScore(orientationPhraseProperty->GetRightToLeftProbabilityMono());
// if sub-derivation has right-boundary non-terminal:
// add recursive actual score of boundary non-terminal from subderivation
RightBoundaryR2LScoreRecursive(featureID, prevState, 0x1, newScores, accumulator);
} else if ( r2lOrientation == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_RIGHT ) {
} else if ( r2lOrientation == MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_RIGHT ) {
newScores[m_offsetR2LScores+1] += TransformScore(orientationPhraseProperty->GetRightToLeftProbabilitySwap());
// if sub-derivation has right-boundary non-terminal:
// add recursive actual score of boundary non-terminal from subderivation
RightBoundaryR2LScoreRecursive(featureID, prevState, 0x2, newScores, accumulator);
} else if ( ( r2lOrientation == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_DLEFT ) ||
( r2lOrientation == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_DRIGHT ) ||
( r2lOrientation == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_UNKNOWN ) ) {
} else if ( ( r2lOrientation == MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_DLEFT ) ||
( r2lOrientation == MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_DRIGHT ) ||
( r2lOrientation == MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_UNKNOWN ) ) {
newScores[m_offsetR2LScores+2] += TransformScore(orientationPhraseProperty->GetRightToLeftProbabilityDiscontinuous());
// if sub-derivation has right-boundary non-terminal:
@ -862,17 +862,17 @@ void PhraseOrientationFeature::SparseNonTerminalR2LScore(const Factor* nonTermin
}
const std::string* PhraseOrientationFeature::ToString(const MosesTraining::GHKM::PhraseOrientation::REO_CLASS o) const
const std::string* PhraseOrientationFeature::ToString(const MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS o) const
{
if ( o == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_LEFT ) {
if ( o == MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_LEFT ) {
return &MORIENT;
} else if ( o == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_RIGHT ) {
} else if ( o == MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_RIGHT ) {
return &SORIENT;
} else if ( ( o == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_DLEFT ) ||
( o == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_DRIGHT ) ||
( o == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_UNKNOWN ) ) {
} else if ( ( o == MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_DLEFT ) ||
( o == MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_DRIGHT ) ||
( o == MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_UNKNOWN ) ) {
return &DORIENT;
} else {

View File

@ -302,8 +302,8 @@ public:
struct ReoClassData {
public:
std::vector<MosesTraining::GHKM::PhraseOrientation::REO_CLASS> nonTerminalReoClassL2R;
std::vector<MosesTraining::GHKM::PhraseOrientation::REO_CLASS> nonTerminalReoClassR2L;
std::vector<MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS> nonTerminalReoClassL2R;
std::vector<MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS> nonTerminalReoClassR2L;
bool firstNonTerminalIsBoundary;
bool firstNonTerminalPreviousSourceSpanIsAligned;
bool firstNonTerminalFollowingSourceSpanIsAligned;
@ -401,7 +401,7 @@ protected:
ScoreComponentCollection* scoreBreakdown,
const std::string* o) const;
const std::string* ToString(const MosesTraining::GHKM::PhraseOrientation::REO_CLASS o) const;
const std::string* ToString(const MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS o) const;
static const std::string MORIENT;
static const std::string SORIENT;

View File

@ -16,21 +16,29 @@ namespace Moses
PhrasePairFeature::PhrasePairFeature(const std::string &line)
:StatelessFeatureFunction(0, line)
,m_unrestricted(false)
,m_simple(true)
,m_sourceContext(false)
,m_domainTrigger(false)
,m_ignorePunctuation(false)
{
std::cerr << "Initializing PhrasePairFeature.." << std::endl;
VERBOSE(1, "Initializing feature " << GetScoreProducerDescription() << " ...");
ReadParameters();
if (m_simple == 1) std::cerr << "using simple phrase pairs.. ";
if (m_sourceContext == 1) std::cerr << "using source context.. ";
if (m_domainTrigger == 1) std::cerr << "using domain triggers.. ";
if (m_simple == 1) VERBOSE(1, " Using simple phrase pairs.");
if (m_sourceContext == 1) VERBOSE(1, " Using source context.");
if (m_domainTrigger == 1) VERBOSE(1, " Using domain triggers.");
// compile a list of punctuation characters
if (m_ignorePunctuation) {
std::cerr << "ignoring punctuation for triggers.. ";
VERBOSE(1, " Ignoring punctuation for triggers.");
char punctuation[] = "\"'!?¿·()#_,.:;•&@/\\0123456789~=";
for (size_t i=0; i < sizeof(punctuation)-1; ++i)
for (size_t i=0; i < sizeof(punctuation)-1; ++i) {
m_punctuationHash[punctuation[i]] = 1;
}
}
VERBOSE(1, " Done." << std::endl);
}
void PhrasePairFeature::SetParameter(const std::string& key, const std::string& value)
@ -76,7 +84,7 @@ void PhrasePairFeature::Load()
}
inFileSource.close();
} else {
} else if (!m_unrestricted) {
// restricted source word vocabulary
ifstream inFileSource(m_filePathSource.c_str());
UTIL_THROW_IF2(!inFileSource, "could not open file " << m_filePathSource);
@ -101,8 +109,6 @@ void PhrasePairFeature::Load()
}
inFileTarget.close();*/
m_unrestricted = false;
}
}
@ -114,25 +120,6 @@ void PhrasePairFeature::EvaluateWithSourceContext(const InputType &input
, ScoreComponentCollection *estimatedFutureScore) const
{
const Phrase& source = inputPath.GetPhrase();
if (m_simple) {
ostringstream namestr;
namestr << "pp_";
namestr << source.GetWord(0).GetFactor(m_sourceFactorId)->GetString();
for (size_t i = 1; i < source.GetSize(); ++i) {
const Factor* sourceFactor = source.GetWord(i).GetFactor(m_sourceFactorId);
namestr << ",";
namestr << sourceFactor->GetString();
}
namestr << "~";
namestr << targetPhrase.GetWord(0).GetFactor(m_targetFactorId)->GetString();
for (size_t i = 1; i < targetPhrase.GetSize(); ++i) {
const Factor* targetFactor = targetPhrase.GetWord(i).GetFactor(m_targetFactorId);
namestr << ",";
namestr << targetFactor->GetString();
}
scoreBreakdown.SparsePlusEquals(namestr.str(),1);
}
if (m_domainTrigger) {
const Sentence& isnt = static_cast<const Sentence&>(input);
const bool use_topicid = isnt.GetUseTopicId();
@ -140,18 +127,18 @@ void PhrasePairFeature::EvaluateWithSourceContext(const InputType &input
// compute pair
ostringstream pair;
pair << source.GetWord(0).GetFactor(m_sourceFactorId)->GetString();
pair << ReplaceTilde( source.GetWord(0).GetFactor(m_sourceFactorId)->GetString() );
for (size_t i = 1; i < source.GetSize(); ++i) {
const Factor* sourceFactor = source.GetWord(i).GetFactor(m_sourceFactorId);
pair << ",";
pair << sourceFactor->GetString();
pair << "~";
pair << ReplaceTilde( sourceFactor->GetString() );
}
pair << "~";
pair << targetPhrase.GetWord(0).GetFactor(m_targetFactorId)->GetString();
pair << "~~";
pair << ReplaceTilde( targetPhrase.GetWord(0).GetFactor(m_targetFactorId)->GetString() );
for (size_t i = 1; i < targetPhrase.GetSize(); ++i) {
const Factor* targetFactor = targetPhrase.GetWord(i).GetFactor(m_targetFactorId);
pair << ",";
pair << targetFactor->GetString();
pair << "~";
pair << ReplaceTilde( targetFactor->GetString() );
}
if (use_topicid || use_topicid_prob) {
@ -159,7 +146,7 @@ void PhrasePairFeature::EvaluateWithSourceContext(const InputType &input
// use topicid as trigger
const long topicid = isnt.GetTopicId();
stringstream feature;
feature << "pp_";
feature << m_description << "_";
if (topicid == -1)
feature << "unk";
else
@ -173,13 +160,13 @@ void PhrasePairFeature::EvaluateWithSourceContext(const InputType &input
const vector<string> &topicid_prob = *(isnt.GetTopicIdAndProb());
if (atol(topicid_prob[0].c_str()) == -1) {
stringstream feature;
feature << "pp_unk_";
feature << m_description << "_unk_";
feature << pair.str();
scoreBreakdown.SparsePlusEquals(feature.str(), 1);
} else {
for (size_t i=0; i+1 < topicid_prob.size(); i+=2) {
stringstream feature;
feature << "pp_";
feature << m_description << "_";
feature << topicid_prob[i];
feature << "_";
feature << pair.str();
@ -193,7 +180,7 @@ void PhrasePairFeature::EvaluateWithSourceContext(const InputType &input
for (set<string>::const_iterator p = m_vocabDomain[docid].begin(); p != m_vocabDomain[docid].end(); ++p) {
string sourceTrigger = *p;
ostringstream namestr;
namestr << "pp_";
namestr << m_description << "_";
namestr << sourceTrigger;
namestr << "_";
namestr << pair.str();
@ -221,21 +208,21 @@ void PhrasePairFeature::EvaluateWithSourceContext(const InputType &input
if (m_unrestricted || sourceTriggerExists) {
ostringstream namestr;
namestr << "pp_";
namestr << m_description << "_";
namestr << sourceTrigger;
namestr << "~";
namestr << source.GetWord(0).GetFactor(m_sourceFactorId)->GetString();
namestr << ReplaceTilde( source.GetWord(0).GetFactor(m_sourceFactorId)->GetString() );
for (size_t i = 1; i < source.GetSize(); ++i) {
const Factor* sourceFactor = source.GetWord(i).GetFactor(m_sourceFactorId);
namestr << ",";
namestr << sourceFactor->GetString();
namestr << "~";
namestr << ReplaceTilde( sourceFactor->GetString() );
}
namestr << "~";
namestr << targetPhrase.GetWord(0).GetFactor(m_targetFactorId)->GetString();
namestr << "~~";
namestr << ReplaceTilde( targetPhrase.GetWord(0).GetFactor(m_targetFactorId)->GetString() );
for (size_t i = 1; i < targetPhrase.GetSize(); ++i) {
const Factor* targetFactor = targetPhrase.GetWord(i).GetFactor(m_targetFactorId);
namestr << ",";
namestr << targetFactor->GetString();
namestr << "~";
namestr << ReplaceTilde( targetFactor->GetString() );
}
scoreBreakdown.SparsePlusEquals(namestr.str(),1);
@ -244,6 +231,31 @@ void PhrasePairFeature::EvaluateWithSourceContext(const InputType &input
}
}
void PhrasePairFeature::EvaluateInIsolation(const Phrase &source
, const TargetPhrase &targetPhrase
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection &estimatedFutureScore) const
{
if (m_simple) {
ostringstream namestr;
namestr << m_description << "_";
namestr << ReplaceTilde( source.GetWord(0).GetFactor(m_sourceFactorId)->GetString() );
for (size_t i = 1; i < source.GetSize(); ++i) {
const Factor* sourceFactor = source.GetWord(i).GetFactor(m_sourceFactorId);
namestr << "~";
namestr << ReplaceTilde( sourceFactor->GetString() );
}
namestr << "~~";
namestr << ReplaceTilde( targetPhrase.GetWord(0).GetFactor(m_targetFactorId)->GetString() );
for (size_t i = 1; i < targetPhrase.GetSize(); ++i) {
const Factor* targetFactor = targetPhrase.GetWord(i).GetFactor(m_targetFactorId);
namestr << "~";
namestr << ReplaceTilde( targetFactor->GetString() );
}
scoreBreakdown.SparsePlusEquals(namestr.str(),1);
}
}
bool PhrasePairFeature::IsUseable(const FactorMask &mask) const
{
bool ret = mask[m_targetFactorId];
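
Note: the simple phrase-pair feature now fires from EvaluateInIsolation, and its name joins words with '~' and separates source from target with "~~", escaping literal tildes via ReplaceTilde (the old format used commas and a single '~'). A standalone sketch of the resulting sparse feature name, not Moses code; the prefix "pp" stands in for m_description (whose actual default is the instance name), and the words are toy data:

#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Mirrors PhrasePairFeature::ReplaceTilde from the patch.
std::string ReplaceTilde(const std::string &in) {
  std::string out = in;
  size_t pos = out.find('~');
  while (pos != std::string::npos) {
    out.replace(pos, 1, "<TILDE>");
    pos = out.find('~', pos);
  }
  return out;
}

int main() {
  std::vector<std::string> src, tgt;
  src.push_back("la"); src.push_back("maison~bleue");
  tgt.push_back("the"); tgt.push_back("blue"); tgt.push_back("house");
  std::ostringstream name;
  name << "pp" << "_" << ReplaceTilde(src[0]);
  for (size_t i = 1; i < src.size(); ++i) name << "~" << ReplaceTilde(src[i]);
  name << "~~" << ReplaceTilde(tgt[0]);
  for (size_t i = 1; i < tgt.size(); ++i) name << "~" << ReplaceTilde(tgt[i]);
  std::cout << name.str() << std::endl;
  // prints: pp_la~maison<TILDE>bleue~~the~blue~house
  return 0;
}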

View File

@ -1,5 +1,4 @@
#ifndef moses_PhrasePairFeature_h
#define moses_PhrasePairFeature_h
#pragma once
#include <stdexcept>
#include <boost/unordered_set.hpp>
@ -32,6 +31,16 @@ class PhrasePairFeature: public StatelessFeatureFunction
CharHash m_punctuationHash;
std::string m_filePathSource;
inline std::string ReplaceTilde(const StringPiece &str) const {
std::string out = str.as_string();
size_t pos = out.find('~');
while ( pos != std::string::npos ) {
out.replace(pos,1,"<TILDE>");
pos = out.find('~',pos);
}
return out;
};
public:
PhrasePairFeature(const std::string &line);
@ -43,8 +52,7 @@ public:
void EvaluateInIsolation(const Phrase &source
, const TargetPhrase &targetPhrase
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection &estimatedFutureScore) const {
}
, ScoreComponentCollection &estimatedFutureScore) const;
void EvaluateTranslationOptionListWithSourceContext(const InputType &input
, const TranslationOptionList &translationOptionList) const {
@ -69,5 +77,3 @@ public:
}
#endif

View File

@ -12,7 +12,7 @@ namespace Moses
{
RulePairUnlexicalizedSource::RulePairUnlexicalizedSource(const std::string &line)
: StatelessFeatureFunction(0, line)
: StatelessFeatureFunction(1, line)
, m_glueRules(false)
, m_nonGlueRules(true)
, m_glueTargetLHSStr("Q")
@ -81,6 +81,9 @@ void RulePairUnlexicalizedSource::EvaluateInIsolation(const Phrase &source
}
scoreBreakdown.PlusEquals(this, namestr.str(), 1);
if ( targetPhraseLHS != m_glueTargetLHS ) {
scoreBreakdown.PlusEquals(this, 1);
}
}
}

View File

@ -13,6 +13,7 @@ namespace Moses
SoftMatchingFeature::SoftMatchingFeature(const std::string &line)
: StatelessFeatureFunction(0, line)
, m_softMatches(moses_MaxNumNonterminals)
, m_scoreIdentical(true)
{
ReadParameters();
}
@ -26,6 +27,8 @@ void SoftMatchingFeature::SetParameter(const std::string& key, const std::string
} else if (key == "path") {
const std::string filePath = value;
Load(filePath);
} else if (key == "score-identical") {
m_scoreIdentical = Scan<bool>(value);
} else {
UTIL_THROW(util::Exception, "Unknown argument " << key << "=" << value);
}
@ -80,8 +83,10 @@ void SoftMatchingFeature::EvaluateWhenApplied(const ChartHypothesis& hypo,
const ChartHypothesis* prevHypo = hypo.GetPrevHypo(nonTermInd);
const Word& prevLHS = prevHypo->GetTargetLHS();
const std::string &name = GetOrSetFeatureName(word, prevLHS);
accumulator->PlusEquals(this,name,1);
if ( (word != prevLHS) || m_scoreIdentical ) {
const std::string &name = GetOrSetFeatureName(word, prevLHS);
accumulator->PlusEquals(this,name,1);
}
}
}
}
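
Note: the new score-identical switch defaults to true, which preserves the old behaviour of firing a soft-match feature even when the substituted label equals the original one; setting it to false fires only on genuine substitutions. A hypothetical moses.ini feature line (the feature name and path are illustrative):

SoftMatching name=SM0 path=soft-matches.txt score-identical=false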

View File

@ -55,6 +55,7 @@ public:
private:
mutable std::vector<std::vector<Word> > m_softMatches; // map RHS of new rule to list of possible LHS of old rule (subtree)
mutable std::vector<std::vector<std::string> > m_nameCache;
bool m_scoreIdentical;
#ifdef WITH_THREADS
//reader-writer lock

View File

@ -38,9 +38,8 @@ void SourceWordDeletionFeature::SetParameter(const std::string& key, const std::
void SourceWordDeletionFeature::Load()
{
if (m_filename == "") {
if (m_filename.empty())
return;
}
FEATUREVERBOSE(1, "Loading source word deletion word list from " << m_filename << std::endl);
ifstream inFile(m_filename.c_str());

View File

@ -13,33 +13,12 @@ void TreeStructureFeature::Load()
// syntactic constraints can be hooked in here.
m_constraints = NULL;
m_labelset = NULL;
StaticData &staticData = StaticData::InstanceNonConst();
staticData.SetTreeStructure(this);
}
// define NT labels (ints) that are mapped from strings for quicker comparison.
void TreeStructureFeature::AddNTLabels(TreePointer root) const
{
std::string label = root->GetLabel();
if (root->IsTerminal()) {
return;
}
std::map<std::string, NTLabel>::const_iterator it = m_labelset->string_to_label.find(label);
if (it != m_labelset->string_to_label.end()) {
root->SetNTLabel(it->second);
}
std::vector<TreePointer> children = root->GetChildren();
for (std::vector<TreePointer>::const_iterator it2 = children.begin(); it2 != children.end(); ++it2) {
AddNTLabels(*it2);
}
}
FFState* TreeStructureFeature::EvaluateWhenApplied(const ChartHypothesis& cur_hypo
, int featureID /* used to index the state in the previous hypotheses */
, ScoreComponentCollection* accumulator) const
@ -48,10 +27,6 @@ FFState* TreeStructureFeature::EvaluateWhenApplied(const ChartHypothesis& cur_hy
const std::string *tree = property->GetValueString();
TreePointer mytree (boost::make_shared<InternalTree>(*tree));
if (m_labelset) {
AddNTLabels(mytree);
}
//get subtrees (in target order)
std::vector<TreePointer> previous_trees;
for (size_t pos = 0; pos < cur_hypo.GetCurrTargetPhrase().GetSize(); ++pos) {
@ -70,7 +45,7 @@ FFState* TreeStructureFeature::EvaluateWhenApplied(const ChartHypothesis& cur_hy
}
mytree->Combine(previous_trees);
bool full_sentence = (mytree->GetChildren().back()->GetLabel() == "</s>" || (mytree->GetChildren().back()->GetLabel() == "SEND" && mytree->GetChildren().back()->GetChildren().back()->GetLabel() == "</s>"));
bool full_sentence = (mytree->GetChildren().back()->GetLabel() == m_send || (mytree->GetChildren().back()->GetLabel() == m_send_nt && mytree->GetChildren().back()->GetChildren().back()->GetLabel() == m_send));
if (m_binarized && full_sentence) {
mytree->Unbinarize();
}

View File

@ -4,6 +4,7 @@
#include <map>
#include "StatefulFeatureFunction.h"
#include "FFState.h"
#include "moses/Word.h"
#include "InternalTree.h"
namespace Moses
@ -35,11 +36,18 @@ class TreeStructureFeature : public StatefulFeatureFunction
SyntaxConstraints* m_constraints;
LabelSet* m_labelset;
bool m_binarized;
Word m_send;
Word m_send_nt;
public:
TreeStructureFeature(const std::string &line)
:StatefulFeatureFunction(0, line)
, m_binarized(false) {
ReadParameters();
std::vector<FactorType> factors;
factors.push_back(0);
m_send.CreateFromString(Output, factors, "</s>", false);
m_send_nt.CreateFromString(Output, factors, "SEND", true);
}
~TreeStructureFeature() {
delete m_constraints;
@ -49,8 +57,6 @@ public:
return new TreeState(TreePointer());
}
void AddNTLabels(TreePointer root) const;
bool IsUseable(const FactorMask &mask) const {
return true;
}

View File

@ -110,7 +110,8 @@ void WordTranslationFeature::Load()
}
inFileSource.close();
} else {
} else if (!m_filePathSource.empty() || !m_filePathTarget.empty()) {
return;
// restricted source word vocabulary
ifstream inFileSource(m_filePathSource.c_str());
UTIL_THROW_IF2(!inFileSource, "could not open file " << m_filePathSource);

View File

@ -213,7 +213,8 @@ RecombineCompare(const Hypothesis &compare) const
for (unsigned i = 0; i < m_ffStates.size(); ++i) {
if (m_ffStates[i] == NULL || compare.m_ffStates[i] == NULL) {
comp = m_ffStates[i] - compare.m_ffStates[i];
// TODO: Can this situation actually occur?
comp = int(m_ffStates[i] != NULL) - int(compare.m_ffStates[i] != NULL);
} else {
comp = m_ffStates[i]->Compare(*compare.m_ffStates[i]);
}
@ -234,8 +235,8 @@ EvaluateWhenApplied(StatefulFeatureFunction const& sfff,
ttasksptr const& ttask = manager.GetTtask();
m_ffStates[state_idx] = sfff.EvaluateWhenAppliedWithContext
(ttask, *this, m_prevHypo ? m_prevHypo->m_ffStates[state_idx] : NULL,
&m_currScoreBreakdown);
(ttask, *this, m_prevHypo ? m_prevHypo->m_ffStates[state_idx] : NULL,
&m_currScoreBreakdown);
}
}
@ -585,7 +586,9 @@ OutputSurface(std::ostream &out, const Hypothesis &edge,
//preface surface form with UNK if marking unknowns
const Word &word = phrase.GetWord(pos);
if(markUnknown && word.IsOOV()) {
out << "UNK" << *factor;
out << StaticData::Instance().GetUnknownWordPrefix()
<< *factor
<< StaticData::Instance().GetUnknownWordSuffix();
} else {
out << *factor;
}
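
Note: the old recombination tie-break subtracted two unrelated FFState pointers, which is undefined behaviour in C++ and yields an arbitrary sign; the replacement compares only whether each state is set. A minimal sketch of the corrected comparison, not Moses code:

#include <cstddef>

// Returns -1, 0, or +1 depending on which of two possibly-NULL states is set.
int CompareNullness(const void *a, const void *b) {
  return int(a != NULL) - int(b != NULL);
}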

View File

@ -78,9 +78,9 @@ void LanguageModel::EvaluateInIsolation(const Phrase &source
float fullScore, nGramScore;
size_t oovCount;
if (targetPhrase.HasTtaskSPtr()){
if (targetPhrase.HasTtaskSPtr()) {
CalcScoreWithContext(targetPhrase.GetTtask(), targetPhrase, fullScore, nGramScore, oovCount);
}else{
} else {
CalcScore(targetPhrase, fullScore, nGramScore, oovCount);
}
//CalcScore(targetPhrase, fullScore, nGramScore, oovCount);

View File

@ -70,7 +70,7 @@ void RDLM::Load()
static_label_null[i] = lm_label_base_instance_->lookup_input_word(numstr);
}
static_dummy_head = lm_head_base_instance_->lookup_input_word(dummy_head);
static_dummy_head = lm_head_base_instance_->lookup_input_word(dummy_head.GetString(0).as_string());
static_start_head = lm_head_base_instance_->lookup_input_word("<start_head>");
static_start_label = lm_head_base_instance_->lookup_input_word("<start_label>");
@ -211,7 +211,7 @@ void RDLM::Score(InternalTree* root, const TreePointerMap & back_pointers, boost
}
// ignore virtual nodes (in binarization; except if it's the root)
if (m_binarized && root->GetLabel()[0] == '^' && !ancestor_heads.empty()) {
if (m_binarized && root->GetLabel().GetString(0).as_string()[0] == '^' && !ancestor_heads.empty()) {
// recursion
if (root->IsLeafNT() && m_context_up > 1 && ancestor_heads.size()) {
root = back_pointers.find(root)->second.get();
@ -241,9 +241,9 @@ void RDLM::Score(InternalTree* root, const TreePointerMap & back_pointers, boost
// root of tree: score without context
if (ancestor_heads.empty() || (ancestor_heads.size() == m_context_up && ancestor_heads.back() == static_root_head)) {
std::vector<int> ngram_head_null (static_head_null);
ngram_head_null.back() = lm_head->lookup_output_word(root->GetChildren()[0]->GetLabel());
ngram_head_null.back() = lm_head->lookup_output_word(root->GetChildren()[0]->GetLabel().GetString(m_factorType).as_string());
if (m_isPretermBackoff && ngram_head_null.back() == 0) {
ngram_head_null.back() = lm_head->lookup_output_word(root->GetLabel());
ngram_head_null.back() = lm_head->lookup_output_word(root->GetLabel().GetString(m_factorType).as_string());
}
if (ancestor_heads.size() == m_context_up && ancestor_heads.back() == static_root_head) {
std::vector<int>::iterator it = ngram_head_null.begin();
@ -290,13 +290,13 @@ void RDLM::Score(InternalTree* root, const TreePointerMap & back_pointers, boost
}
std::pair<int,int> head_ids;
InternalTree* found = GetHead(root, back_pointers, head_ids);
if (found == NULL) {
bool found = GetHead(root, back_pointers, head_ids);
if (!found) {
head_ids = std::make_pair(static_dummy_head, static_dummy_head);
}
size_t context_up_nonempty = std::min(m_context_up, ancestor_heads.size());
const std::string & head_label = root->GetLabel();
const std::string & head_label = root->GetLabel().GetString(0).as_string();
bool virtual_head = false;
int reached_end = 0;
int label_idx, label_idx_out;
@ -516,7 +516,7 @@ void RDLM::Score(InternalTree* root, const TreePointerMap & back_pointers, boost
ancestor_labels.pop_back();
}
InternalTree* RDLM::GetHead(InternalTree* root, const TreePointerMap & back_pointers, std::pair<int,int> & IDs, InternalTree* head_ptr) const
bool RDLM::GetHead(InternalTree* root, const TreePointerMap & back_pointers, std::pair<int,int> & IDs) const
{
InternalTree *tree;
@ -527,52 +527,28 @@ InternalTree* RDLM::GetHead(InternalTree* root, const TreePointerMap & back_poin
tree = it->get();
}
if (m_binarized && tree->GetLabel()[0] == '^') {
head_ptr = GetHead(tree, back_pointers, IDs, head_ptr);
if (head_ptr != NULL && !m_isPTKVZ) {
return head_ptr;
if (m_binarized && tree->GetLabel().GetString(0).as_string()[0] == '^') {
bool found = GetHead(tree, back_pointers, IDs);
if (found) {
return true;
}
}
// assumption (only true for dependency parse): each constituent has a preterminal label, and corresponding terminal is head
// if constituent has multiple preterminals, first one is picked; if it has no preterminals, dummy_head is returned
else if (tree->GetLength() == 1 && tree->GetChildren()[0]->IsTerminal() && head_ptr == NULL) {
head_ptr = tree;
if (!m_isPTKVZ) {
GetIDs(head_ptr->GetChildren()[0]->GetLabel(), head_ptr->GetLabel(), IDs);
return head_ptr;
}
}
// add PTKVZ to lemma of verb
else if (m_isPTKVZ && head_ptr && tree->GetLabel() == "avz") {
InternalTree *tree2;
for (std::vector<TreePointer>::const_iterator it2 = tree->GetChildren().begin(); it2 != tree->GetChildren().end(); ++it2) {
if ((*it2)->IsLeafNT()) {
tree2 = back_pointers.find(it2->get())->second.get();
} else {
tree2 = it2->get();
}
if (tree2->GetLabel() == "PTKVZ" && tree2->GetLength() == 1 && tree2->GetChildren()[0]->IsTerminal()) {
std::string verb = tree2->GetChildren()[0]->GetLabel() + head_ptr->GetChildren()[0]->GetLabel();
GetIDs(verb, head_ptr->GetLabel(), IDs);
return head_ptr;
}
}
else if (tree->GetLength() == 1 && tree->GetChildren()[0]->IsTerminal()) {
GetIDs(tree->GetChildren()[0]->GetLabel(), tree->GetLabel(), IDs);
return true;
}
}
if (head_ptr != NULL) {
GetIDs(head_ptr->GetChildren()[0]->GetLabel(), head_ptr->GetLabel(), IDs);
}
return head_ptr;
return false;
}
void RDLM::GetChildHeadsAndLabels(InternalTree *root, const TreePointerMap & back_pointers, int reached_end, const nplm::neuralTM *lm_head, const nplm::neuralTM *lm_label, std::vector<int> & heads, std::vector<int> & labels, std::vector<int> & heads_output, std::vector<int> & labels_output) const
{
std::pair<int,int> child_ids;
InternalTree* found;
size_t j = 0;
// score start label (if enabled) for all nonterminal nodes (but not for terminal or preterminal nodes)
@ -616,13 +592,13 @@ void RDLM::GetChildHeadsAndLabels(InternalTree *root, const TreePointerMap & bac
continue;
}
found = GetHead(child, back_pointers, child_ids);
if (found == NULL) {
bool found = GetHead(child, back_pointers, child_ids);
if (!found) {
child_ids = std::make_pair(static_dummy_head, static_dummy_head);
}
labels[j] = lm_head->lookup_input_word(child->GetLabel());
labels_output[j] = lm_label->lookup_output_word(child->GetLabel());
labels[j] = lm_head->lookup_input_word(child->GetLabel().GetString(0).as_string());
labels_output[j] = lm_label->lookup_output_word(child->GetLabel().GetString(0).as_string());
heads[j] = child_ids.first;
heads_output[j] = child_ids.second;
j++;
@ -637,18 +613,18 @@ void RDLM::GetChildHeadsAndLabels(InternalTree *root, const TreePointerMap & bac
}
void RDLM::GetIDs(const std::string & head, const std::string & preterminal, std::pair<int,int> & IDs) const
void RDLM::GetIDs(const Word & head, const Word & preterminal, std::pair<int,int> & IDs) const
{
IDs.first = lm_head_base_instance_->lookup_input_word(head);
IDs.first = lm_head_base_instance_->lookup_input_word(head.GetString(m_factorType).as_string());
if (m_isPretermBackoff && IDs.first == 0) {
IDs.first = lm_head_base_instance_->lookup_input_word(preterminal);
IDs.first = lm_head_base_instance_->lookup_input_word(preterminal.GetString(0).as_string());
}
if (m_sharedVocab) {
IDs.second = IDs.first;
} else {
IDs.second = lm_head_base_instance_->lookup_output_word(head);
IDs.second = lm_head_base_instance_->lookup_output_word(head.GetString(m_factorType).as_string());
if (m_isPretermBackoff && IDs.second == 0) {
IDs.second = lm_head_base_instance_->lookup_output_word(preterminal);
IDs.second = lm_head_base_instance_->lookup_output_word(preterminal.GetString(0).as_string());
}
}
}
@ -714,8 +690,6 @@ void RDLM::SetParameter(const std::string& key, const std::string& value)
m_path_head_lm = value;
} else if (key == "path_label_lm") {
m_path_label_lm = value;
} else if (key == "ptkvz") {
m_isPTKVZ = Scan<bool>(value);
} else if (key == "backoff") {
m_isPretermBackoff = Scan<bool>(value);
} else if (key == "context_up") {
@ -744,7 +718,9 @@ void RDLM::SetParameter(const std::string& key, const std::string& value)
else
UTIL_THROW(util::Exception, "Unknown value for argument " << key << "=" << value);
} else if (key == "glue_symbol") {
m_glueSymbol = value;
m_glueSymbolString = value;
} else if (key == "factor") {
m_factorType = Scan<FactorType>(value);
} else if (key == "cache_size") {
m_cacheSize = Scan<int>(value);
} else {

View File

@ -3,6 +3,7 @@
#include "moses/FF/StatefulFeatureFunction.h"
#include "moses/FF/FFState.h"
#include "moses/FF/InternalTree.h"
#include "moses/Word.h"
#include <boost/thread/tss.hpp>
#include <boost/array.hpp>
@ -61,14 +62,14 @@ class RDLM : public StatefulFeatureFunction
nplm::neuralTM* lm_label_base_instance_;
mutable boost::thread_specific_ptr<nplm::neuralTM> lm_label_backend_;
std::string dummy_head;
std::string m_glueSymbol;
std::string m_startSymbol;
std::string m_endSymbol;
std::string m_endTag;
std::string m_glueSymbolString;
Word dummy_head;
Word m_glueSymbol;
Word m_startSymbol;
Word m_endSymbol;
Word m_endTag;
std::string m_path_head_lm;
std::string m_path_label_lm;
bool m_isPTKVZ;
bool m_isPretermBackoff;
size_t m_context_left;
size_t m_context_right;
@ -103,15 +104,12 @@ class RDLM : public StatefulFeatureFunction
int static_stop_label_output;
int static_start_label_output;
FactorType m_factorType;
public:
RDLM(const std::string &line)
: StatefulFeatureFunction(2, line)
, dummy_head("<dummy_head>")
, m_glueSymbol("Q")
, m_startSymbol("SSTART")
, m_endSymbol("SEND")
, m_endTag("</s>")
, m_isPTKVZ(false)
, m_glueSymbolString("Q")
, m_isPretermBackoff(true)
, m_context_left(3)
, m_context_right(0)
@ -122,8 +120,16 @@ public:
, m_normalizeLabelLM(false)
, m_sharedVocab(false)
, m_binarized(0)
, m_cacheSize(1000000) {
, m_cacheSize(1000000)
, m_factorType(0) {
ReadParameters();
std::vector<FactorType> factors;
factors.push_back(0);
dummy_head.CreateFromString(Output, factors, "<dummy_head>", false);
m_glueSymbol.CreateFromString(Output, factors, m_glueSymbolString, true);
m_startSymbol.CreateFromString(Output, factors, "SSTART", true);
m_endSymbol.CreateFromString(Output, factors, "SEND", true);
m_endTag.CreateFromString(Output, factors, "</s>", false);
}
~RDLM();
@ -133,9 +139,9 @@ public:
}
void Score(InternalTree* root, const TreePointerMap & back_pointers, boost::array<float,4> &score, std::vector<int> &ancestor_heads, std::vector<int> &ancestor_labels, size_t &boundary_hash, int num_virtual = 0, int rescoring_levels = 0) const;
InternalTree* GetHead(InternalTree* root, const TreePointerMap & back_pointers, std::pair<int,int> & IDs, InternalTree * head_ptr=NULL) const;
bool GetHead(InternalTree* root, const TreePointerMap & back_pointers, std::pair<int,int> & IDs) const;
void GetChildHeadsAndLabels(InternalTree *root, const TreePointerMap & back_pointers, int reached_end, const nplm::neuralTM *lm_head, const nplm::neuralTM *lm_labels, std::vector<int> & heads, std::vector<int> & labels, std::vector<int> & heads_output, std::vector<int> & labels_output) const;
void GetIDs(const std::string & head, const std::string & preterminal, std::pair<int,int> & IDs) const;
void GetIDs(const Word & head, const Word & preterminal, std::pair<int,int> & IDs) const;
void ScoreFile(std::string &path); //for debugging
void PrintInfo(std::vector<int> &ngram, nplm::neuralTM* lm) const; //for debugging
@ -192,7 +198,7 @@ public:
_end = current->GetChildren().end();
iter = current->GetChildren().begin();
// expand virtual node
while (binarized && !(*iter)->GetLabel().empty() && (*iter)->GetLabel()[0] == '^') {
while (binarized && !(*iter)->GetLabel().GetString(0).empty() && (*iter)->GetLabel().GetString(0).data()[0] == '^') {
stack.push_back(std::make_pair(current, iter));
// also go through trees or previous hypotheses to rescore nodes for which more context has become available
if ((*iter)->IsLeafNT()) {
@ -229,7 +235,7 @@ public:
}
}
// expand virtual node
while (binarized && !(*iter)->GetLabel().empty() && (*iter)->GetLabel()[0] == '^') {
while (binarized && !(*iter)->GetLabel().GetString(0).empty() && (*iter)->GetLabel().GetString(0).data()[0] == '^') {
stack.push_back(std::make_pair(current, iter));
// also go through trees or previous hypotheses to rescore nodes for which more context has become available
if ((*iter)->IsLeafNT()) {
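
Note: with the switch from plain strings to Word, the head language model now looks up the factor selected by the new factor parameter (default 0, via GetString(m_factorType)), while labels appear to be read from factor 0 throughout (GetString(0)). A hypothetical RDLM line such as "RDLM path_head_lm=... path_label_lm=... factor=1" would thus score heads on the second output factor, e.g. lemmas, without changing the label model.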

View File

@ -1737,7 +1737,9 @@ void Manager::OutputSurface(std::ostream &out, const Hypothesis &edge, const std
//preface surface form with UNK if marking unknowns
const Word &word = phrase.GetWord(pos);
if(markUnknown && word.IsOOV()) {
out << "UNK" << *factor;
out << StaticData::Instance().GetUnknownWordPrefix()
<< *factor
<< StaticData::Instance().GetUnknownWordSuffix();
} else {
out << *factor;
}

View File

@ -141,6 +141,8 @@ Parameter::Parameter()
po::options_description oov_opts("OOV Handling Options");
AddParam(oov_opts,"drop-unknown", "du", "drop unknown words instead of copying them");
AddParam(oov_opts,"mark-unknown", "mu", "mark unknown words in output");
AddParam(oov_opts,"unknown-word-prefix", "prefix to unknwon word when marked (default: 'UNK')");
AddParam(oov_opts,"unknown-word-suffix", "suffix to unknwon word when marked (default: '')");
AddParam(oov_opts,"lmodel-oov-feature", "add language model oov feature, one per model");
AddParam(oov_opts,"output-unknowns", "Output the unknown (OOV) words to the given file, one line per sentence");
AddParam(oov_opts,"always-create-direct-transopt", "Always create a translation that translates the source word ad-verbatim");

View File

@ -119,10 +119,13 @@ std::string Phrase::GetStringRep(const vector<FactorType> factorsToPrint) const
stringstream strme;
for (size_t pos = 0 ; pos < GetSize() ; pos++) {
if(markUnknown && GetWord(pos).IsOOV()) {
strme << "UNK";
if (markUnknown && GetWord(pos).IsOOV()) {
strme << StaticData::Instance().GetUnknownWordPrefix();
}
strme << GetWord(pos).GetString(factorsToPrint, (pos != GetSize()-1));
if (markUnknown && GetWord(pos).IsOOV()) {
strme << StaticData::Instance().GetUnknownWordSuffix();
}
}
return strme.str();

View File

@ -438,6 +438,8 @@ StaticData
// unknown word processing
m_parameter->SetParameter(m_dropUnknown, "drop-unknown", false );
m_parameter->SetParameter(m_markUnknown, "mark-unknown", false );
m_parameter->SetParameter<string>(m_unknownWordPrefix, "unknown-word-prefix", "UNK" );
m_parameter->SetParameter<string>(m_unknownWordSuffix, "unknown-word-suffix", "" );
m_parameter->SetParameter(m_lmEnableOOVFeature, "lmodel-oov-feature", false);

View File

@ -114,6 +114,8 @@ protected:
// bool m_labeledNBestList,m_nBestIncludesSegmentation;
bool m_dropUnknown; //! false = treat unknown words as unknowns, and translate them as themselves; true = drop (ignore) them
bool m_markUnknown; //! false = treat unknown words as unknowns, and translate them as themselves; true = mark and (ignore) them
std::string m_unknownWordPrefix;
std::string m_unknownWordSuffix;
bool m_wordDeletionEnabled;
bool m_disableDiscarding;
@ -326,6 +328,12 @@ public:
inline bool GetMarkUnknown() const {
return m_markUnknown;
}
inline std::string GetUnknownWordPrefix() const {
return m_unknownWordPrefix;
}
inline std::string GetUnknownWordSuffix() const {
return m_unknownWordSuffix;
}
inline bool GetDisableDiscarding() const {
return m_disableDiscarding;
}

View File

@ -177,7 +177,8 @@ void TargetPhrase::WriteToRulePB(hgmert::Rule* pb) const
}
#endif
bool TargetPhrase::HasTtaskSPtr() const {
bool TargetPhrase::HasTtaskSPtr() const
{
return m_ttask_flag;
}

View File

@ -103,7 +103,7 @@ namespace ugdiss
operator[](ID key) const
{
if (start==stop) return INIT(0);
Cell const* c = lower_bound(start,stop,key);
Cell const* c = std::lower_bound(start,stop,key);
return (c != stop && c->id == key ? c->val : INIT(0));
}

View File

@ -21,6 +21,7 @@
#include "ug_ttrack_base.h"
#include "num_read_write.h"
#include "ug_load_primer.h"
#include "ug_tsa_base.h"
namespace ugdiss
{
@ -193,7 +194,7 @@ namespace ugdiss
findSid(TKN const* t) const
{
id_type tokenPos = t-data;
id_type const* p = upper_bound(index,index+this->numSent,tokenPos);
id_type const* p = std::upper_bound(index,index+this->numSent,tokenPos);
assert(p>index);
return p-index-1;
}
@ -203,7 +204,7 @@ namespace ugdiss
mmTtrack<TKN>::
findSid(id_type tokenPos) const
{
id_type const* p = upper_bound(index,index+this->numSent,tokenPos);
id_type const* p = std::upper_bound(index,index+this->numSent,tokenPos);
assert(p>index);
return p-index-1;
}
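
Note: findSid locates the sentence containing a token offset by running std::upper_bound over the array of sentence start positions and stepping back one slot; the explicit std:: qualification presumably avoids relying on a using-directive or an ambiguous overload. A standalone sketch of the lookup with toy data:

#include <algorithm>
#include <cassert>

int main() {
  // index[i] holds the corpus position of the first token of sentence i
  // (a toy stand-in for mmTtrack's sentence index).
  unsigned index[] = {0, 5, 9, 14};
  unsigned tokenPos = 11;  // falls inside sentence 2 (tokens 9..13)
  const unsigned *p = std::upper_bound(index, index + 4, tokenPos);
  assert(p - index - 1 == 2);
  return 0;
}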

View File

@ -114,14 +114,14 @@ string FuzzyMatchWrapper::ExtractTM(WordIndex &wordIndex, long translationId, co
// find match ranges in suffix array
vector< vector< pair< SuffixArray::INDEX, SuffixArray::INDEX > > > match_range;
for(size_t start=0; start<input[sentenceInd].size(); start++) {
for(int start=0; start<input[sentenceInd].size(); start++) {
SuffixArray::INDEX prior_first_match = 0;
SuffixArray::INDEX prior_last_match = suffixArray->GetSize()-1;
vector< string > substring;
bool stillMatched = true;
vector< pair< SuffixArray::INDEX, SuffixArray::INDEX > > matchedAtThisStart;
//cerr << "start: " << start;
for(int word=start; stillMatched && word<input[sentenceInd].size(); word++) {
for(size_t word=start; stillMatched && word<input[sentenceInd].size(); word++) {
substring.push_back( GetVocabulary().GetWord( input[sentenceInd][word] ) );
// only look up, if needed (i.e. no unnecessary short gram lookups)
@ -163,7 +163,7 @@ string FuzzyMatchWrapper::ExtractTM(WordIndex &wordIndex, long translationId, co
count += range.second - range.first + 1;
for(SuffixArray::INDEX i=range.first; i<=range.second; i++) {
int position = suffixArray->GetPosition( i );
size_t position = suffixArray->GetPosition( i );
// sentence length mismatch
size_t sentence_id = suffixArray->GetSentence( position );
@ -261,7 +261,7 @@ string FuzzyMatchWrapper::ExtractTM(WordIndex &wordIndex, long translationId, co
// quick look: how many words are matched
int words_matched = 0;
for(int m=0; m<match.size(); m++) {
for(size_t m=0; m<match.size(); m++) {
if (match[m].min_cost <= best_cost) // makes no difference
words_matched += match[m].input_end - match[m].input_start + 1;
@ -274,7 +274,7 @@ string FuzzyMatchWrapper::ExtractTM(WordIndex &wordIndex, long translationId, co
// prune, check again how many words are matched
vector< Match > pruned = prune_matches( match, best_cost );
words_matched = 0;
for(int p=0; p<pruned.size(); p++) {
for(size_t p=0; p<pruned.size(); p++) {
words_matched += pruned[p].input_end - pruned[p].input_start + 1;
}
if (max(input_length,tm_length) - words_matched > best_cost) {
@ -323,7 +323,7 @@ string FuzzyMatchWrapper::ExtractTM(WordIndex &wordIndex, long translationId, co
// do not try to find the best ... report multiple matches
if (multiple_flag) {
for(int si=0; si<best_tm.size(); si++) {
for(size_t si=0; si<best_tm.size(); si++) {
int s = best_tm[si];
string path;
sed( input[sentenceInd], source[s], path, true );
@ -776,7 +776,7 @@ void FuzzyMatchWrapper::init_short_matches(WordIndex &wordIndex, long translatio
wordIndex.clear();
// store input words and their positions in hash map
for(int i=0; i<input.size(); i++) {
for(size_t i=0; i<input.size(); i++) {
if (wordIndex.find( input[i] ) == wordIndex.end()) {
vector< int > position_vector;
wordIndex[ input[i] ] = position_vector;
@ -799,7 +799,7 @@ void FuzzyMatchWrapper::add_short_matches(WordIndex &wordIndex, long translation
input_word_hit = wordIndex.find( tm[t_pos] );
if (input_word_hit != wordIndex.end()) {
vector< int > &position_vector = input_word_hit->second;
for(int j=0; j<position_vector.size(); j++) {
for(size_t j=0; j<position_vector.size(); j++) {
int &i_pos = position_vector[j];
// before match
@ -870,7 +870,7 @@ int FuzzyMatchWrapper::parse_matches( vector< Match > &match, int input_length,
return input_length+tm_length;
int this_best_cost = input_length + tm_length;
for(int i=0; i<match.size(); i++) {
for(size_t i=0; i<match.size(); i++) {
this_best_cost = min( this_best_cost, match[i].max_cost );
}
// cerr << "\tthis best cost: " << this_best_cost << endl;
@ -892,8 +892,8 @@ int FuzzyMatchWrapper::parse_matches( vector< Match > &match, int input_length,
vector< Match > &first_match = multi_match[ first_level ];
vector< Match > &second_match = multi_match[ second_level ];
for(int i1 = 0; i1 < first_match.size(); i1++) {
for(int i2 = 0; i2 < second_match.size(); i2++) {
for(size_t i1 = 0; i1 < first_match.size(); i1++) {
for(size_t i2 = 0; i2 < second_match.size(); i2++) {
// do not combine the same pair twice
if (first_level == second_level && i2 <= i1) {

View File

@ -28,20 +28,10 @@ TO_STRING_BODY(WordsBitmap);
bool WordsBitmap::IsAdjacent(size_t startPos, size_t endPos) const
{
if (GetNumWordsCovered() == 0) {
return true;
}
size_t first = GetFirstGapPos();
size_t last = GetLastGapPos();
if (startPos == last || endPos == first) {
return true;
}
return false;
return
GetNumWordsCovered() == 0 ||
startPos == GetFirstGapPos() ||
endPos == GetLastGapPos();
}
}

View File

@ -22,6 +22,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#ifndef moses_WordsBitmap_h
#define moses_WordsBitmap_h
#include <algorithm>
#include <limits>
#include <vector>
#include <iostream>
@ -35,50 +36,34 @@ namespace Moses
{
typedef unsigned long WordsBitmapID;
/** vector of boolean used to represent whether a word has been translated or not
*/
/** Vector of boolean to represent whether a word has been translated or not.
*
* Implemented using a vector of char, which is usually the same representation
* for the elements that a C array of bool would use. A vector of bool, or a
* Boost dynamic_bitset, could be much more efficient in theory. Unfortunately
* algorithms like std::find() are not optimized for vector<bool> on gcc or
* clang, and dynamic_bitset lacks all the optimized search operations we want.
* Only benchmarking will tell what works best. Perhaps dynamic_bitset could
* still be a dramatic improvement, if we flip the meaning of the bits around
* so we can use its find_first() and find_next() for the most common searches.
*/
class WordsBitmap
{
friend std::ostream& operator<<(std::ostream& out, const WordsBitmap& wordsBitmap);
protected:
const size_t m_size; /**< number of words in sentence */
bool *m_bitmap; /**< ticks of words that have been done */
size_t m_firstGap; /** Position of first gap, pre-calculated as it is consulted often */
private:
std::vector<char> m_bitmap; //! Ticks of words in sentence that have been done.
size_t m_firstGap; //! Cached position of first gap, or NOT_FOUND.
WordsBitmap(); // not implemented
WordsBitmap& operator= (const WordsBitmap& other);
//! set all elements to false
void Initialize() {
for (size_t pos = 0 ; pos < m_size ; pos++) {
m_bitmap[pos] = false;
}
}
//sets elements by vector
void Initialize(const std::vector<bool>& vector) {
size_t vector_size = vector.size();
bool gapFound = false;
for (size_t pos = 0 ; pos < m_size ; pos++) {
if (pos < vector_size && vector[pos] == true) m_bitmap[pos] = true;
else {
m_bitmap[pos] = false;
if (!gapFound) {
m_firstGap = pos;
gapFound = true;
}
}
}
if (!gapFound) m_firstGap = NOT_FOUND;
}
/** Update the first gap, when bits are flipped */
void UpdateFirstGap(size_t startPos, size_t endPos, bool value) {
if (value) {
//may remove gap
if (startPos <= m_firstGap && m_firstGap <= endPos) {
m_firstGap = NOT_FOUND;
for (size_t i = endPos + 1 ; i < m_size; ++i) {
for (size_t i = endPos + 1 ; i < m_bitmap.size(); ++i) {
if (!m_bitmap[i]) {
m_firstGap = i;
break;
@ -96,38 +81,35 @@ protected:
public:
//! create WordsBitmap of length size and initialise with vector
WordsBitmap(size_t size, const std::vector<bool>& initialize_vector)
:m_size (size), m_firstGap(0) {
m_bitmap = (bool*) malloc(sizeof(bool) * size);
Initialize(initialize_vector);
//! Create WordsBitmap of length size, and initialise with vector.
WordsBitmap(size_t size, const std::vector<bool>& initializer)
:m_bitmap(initializer.begin(), initializer.end()), m_firstGap(0) {
// The initializer may not be of the same length. Change to the desired
// length. If we need to add any elements, initialize them to false.
m_bitmap.resize(size, false);
// Find the first gap, and cache it.
std::vector<char>::const_iterator first_gap = std::find(
m_bitmap.begin(), m_bitmap.end(), false);
m_firstGap = (
(first_gap == m_bitmap.end()) ?
NOT_FOUND : first_gap - m_bitmap.begin());
}
//! create WordsBitmap of length size and initialise
//! Create WordsBitmap of length size and initialise.
WordsBitmap(size_t size)
:m_size (size), m_firstGap(0) {
m_bitmap = (bool*) malloc(sizeof(bool) * size);
Initialize();
:m_bitmap(size, false), m_firstGap(0) {
}
//! deep copy
//! Deep copy.
WordsBitmap(const WordsBitmap &copy)
:m_size (copy.m_size), m_firstGap(copy.m_firstGap) {
m_bitmap = (bool*) malloc(sizeof(bool) * m_size);
for (size_t pos = 0 ; pos < copy.m_size ; pos++) {
m_bitmap[pos] = copy.GetValue(pos);
}
m_firstGap = copy.m_firstGap;
:m_bitmap(copy.m_bitmap), m_firstGap(copy.m_firstGap) {
}
~WordsBitmap() {
free(m_bitmap);
}
//! count of words translated
//! Count of words translated.
size_t GetNumWordsCovered() const {
size_t count = 0;
for (size_t pos = 0 ; pos < m_size ; pos++) {
if (m_bitmap[pos])
count++;
}
return count;
return std::count(m_bitmap.begin(), m_bitmap.end(), true);
}
//! position of 1st word not yet translated, or NOT_FOUND if everything already translated
@ -138,7 +120,7 @@ public:
//! position of last word not yet translated, or NOT_FOUND if everything already translated
size_t GetLastGapPos() const {
for (int pos = (int) m_size - 1 ; pos >= 0 ; pos--) {
for (int pos = int(m_bitmap.size()) - 1 ; pos >= 0 ; pos--) {
if (!m_bitmap[pos]) {
return pos;
}
@ -150,7 +132,7 @@ public:
//! position of last translated word
size_t GetLastPos() const {
for (int pos = (int) m_size - 1 ; pos >= 0 ; pos--) {
for (int pos = int(m_bitmap.size()) - 1 ; pos >= 0 ; pos--) {
if (m_bitmap[pos]) {
return pos;
}
@ -163,7 +145,7 @@ public:
//! whether a word has been translated at a particular position
bool GetValue(size_t pos) const {
return m_bitmap[pos];
return bool(m_bitmap[pos]);
}
//! set value at a particular position
void SetValue( size_t pos, bool value ) {
@ -198,7 +180,7 @@ public:
}
//! number of elements
size_t GetSize() const {
return m_size;
return m_bitmap.size();
}
//! transitive comparison of WordsBitmap
@ -213,7 +195,8 @@ public:
if (thisSize != compareSize) {
return (thisSize < compareSize) ? -1 : 1;
}
return std::memcmp(m_bitmap, compare.m_bitmap, thisSize * sizeof(bool));
return std::memcmp(
&m_bitmap[0], &compare.m_bitmap[0], thisSize * sizeof(bool));
}
bool operator< (const WordsBitmap &compare) const {
@ -229,20 +212,20 @@ public:
}
inline size_t GetEdgeToTheRightOf(size_t r) const {
if (r+1 == m_size) return r;
while (r+1 < m_size && !m_bitmap[r+1]) {
++r;
}
return r;
if (r+1 == m_bitmap.size()) return r;
return (
std::find(m_bitmap.begin() + r + 1, m_bitmap.end(), true) -
m_bitmap.begin()
) - 1;
}
//! converts bitmap into an integer ID: it consists of two parts: the first 16 bit are the pattern between the first gap and the last word-1, the second 16 bit are the number of filled positions. enforces a sentence length limit of 65535 and a max distortion of 16
WordsBitmapID GetID() const {
assert(m_size < (1<<16));
assert(m_bitmap.size() < (1<<16));
size_t start = GetFirstGapPos();
if (start == NOT_FOUND) start = m_size; // nothing left
if (start == NOT_FOUND) start = m_bitmap.size(); // nothing left
size_t end = GetLastPos();
if (end == NOT_FOUND) end = 0; // nothing translated yet
@ -257,10 +240,10 @@ public:
//! converts bitmap into an integer ID, with an additional span covered
WordsBitmapID GetIDPlus( size_t startPos, size_t endPos ) const {
assert(m_size < (1<<16));
assert(m_bitmap.size() < (1<<16));
size_t start = GetFirstGapPos();
if (start == NOT_FOUND) start = m_size; // nothing left
if (start == NOT_FOUND) start = m_bitmap.size(); // nothing left
size_t end = GetLastPos();
if (end == NOT_FOUND) end = 0; // nothing translated yet
@ -284,8 +267,8 @@ public:
// friend
inline std::ostream& operator<<(std::ostream& out, const WordsBitmap& wordsBitmap)
{
for (size_t i = 0 ; i < wordsBitmap.m_size ; i++) {
out << (wordsBitmap.GetValue(i) ? 1 : 0);
for (size_t i = 0 ; i < wordsBitmap.m_bitmap.size() ; i++) {
out << int(wordsBitmap.GetValue(i));
}
return out;
}
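
Note: the std::find rewrite of GetEdgeToTheRightOf leans on the bitmap now being a std::vector<char>, for which std::find can use fast byte search (unlike vector<bool>, as the class comment above explains). A standalone check, not Moses code, that the old loop and the find-based version agree:

#include <algorithm>
#include <cassert>
#include <vector>

// The original scanning loop, lifted out for comparison.
size_t EdgeLoop(const std::vector<char> &bm, size_t r) {
  if (r + 1 == bm.size()) return r;
  while (r + 1 < bm.size() && !bm[r + 1]) ++r;
  return r;
}

// The std::find-based version from the patch.
size_t EdgeFind(const std::vector<char> &bm, size_t r) {
  if (r + 1 == bm.size()) return r;
  return (std::find(bm.begin() + r + 1, bm.end(), true) - bm.begin()) - 1;
}

int main() {
  std::vector<char> bm;
  bm.push_back(1); bm.push_back(0); bm.push_back(0);
  bm.push_back(1); bm.push_back(0);
  for (size_t r = 0; r < bm.size(); ++r)
    assert(EdgeLoop(bm, r) == EdgeFind(bm, r));
  return 0;
}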

View File

@ -17,6 +17,7 @@
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include <cstdlib>
#include <vector>
#include <string>
@ -123,7 +124,7 @@ int main(int argc, char* argv[])
std::cerr << "include "<< (sparseCountBinFeatureFlag ? "sparse " : "") << "count bin feature:";
int prev = 0;
while(i+1<argc && argv[i+1][0]>='0' && argv[i+1][0]<='9') {
int binCount = Moses::Scan<int>(argv[++i]);
int binCount = std::atoi( argv[++i] );
countBin.push_back( binCount );
if (prev+1 == binCount) {
std::cerr << " " << binCount;
@ -164,8 +165,8 @@ int main(int argc, char* argv[])
}
pos = single_setting.find(":");
UTIL_THROW_IF2(pos == std::string::npos, "faulty MinScore setting '" << single_setting << "' in '" << argv[i] << "'");
unsigned int field = Moses::Scan<unsigned int>( single_setting.substr(0,pos) );
float threshold = Moses::Scan<float>( single_setting.substr(pos+1) );
unsigned int field = std::atoll( single_setting.substr(0,pos).c_str() );
float threshold = std::atof( single_setting.substr(pos+1).c_str() );
if (field == 0) {
minScore0 = threshold;
std::cerr << "setting minScore0 to " << threshold << std::endl;
@ -195,9 +196,9 @@ void loadCountOfCounts( const std::string& fileNameCountOfCounts )
std::string line;
while (getline(fileCountOfCounts, line)) {
if (totalCount < 0)
totalCount = Moses::Scan<float>(line); // total number of distinct phrase pairs
totalCount = std::atof( line.c_str() ); // total number of distinct phrase pairs
else
countOfCounts.push_back( Moses::Scan<float>(line) );
countOfCounts.push_back( std::atof( line.c_str() ) );
}
fileCountOfCounts.Close();
@ -259,6 +260,7 @@ void processFiles( const std::string& fileNameDirect,
// loop through all extracted phrase translations
int i=0;
while(true) {
// Print progress dots to stderr.
i++;
if (i%100000 == 0) std::cerr << "." << std::flush;
@ -285,13 +287,13 @@ void processFiles( const std::string& fileNameDirect,
Moses::Tokenize( directCounts, itemDirect[4] );
std::vector<std::string> indirectCounts;
Moses::Tokenize( indirectCounts, itemIndirect[4] );
float countF = Moses::Scan<float>(directCounts[0]);
float countE = Moses::Scan<float>(indirectCounts[0]);
float countEF = Moses::Scan<float>(indirectCounts[1]);
float countF = std::atof( directCounts[0].c_str() );
float countE = std::atof( indirectCounts[0].c_str() );
float countEF = std::atof( indirectCounts[1].c_str() );
float n1_F, n1_E;
if (kneserNeyFlag) {
n1_F = Moses::Scan<float>(directCounts[2]);
n1_E = Moses::Scan<float>(indirectCounts[2]);
n1_F = std::atof( directCounts[2].c_str() );
n1_E = std::atof( indirectCounts[2].c_str() );
}
// Good Turing discounting
@ -436,6 +438,9 @@ void processFiles( const std::string& fileNameDirect,
fileDirect.Close();
fileIndirect.Close();
fileConsolidated.Close();
// We've been printing progress dots to stderr. End the line.
std::cerr << std::endl;
}
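
Note: replacing Moses::Scan<float> with std::atof drops error reporting; atof silently returns 0 on malformed input. If stricter parsing were ever wanted here, std::strtod exposes an end pointer that can be checked; a small hypothetical helper, not part of the patch:

#include <cstdlib>
#include <string>

// Parse a float, falling back to a default when no conversion happens.
float ParseFloatOr(const std::string &s, float fallback) {
  char *end = NULL;
  double v = std::strtod(s.c_str(), &end);
  return end == s.c_str() ? fallback : float(v);
}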

View File

@ -19,7 +19,7 @@
#include "Alignment.h"
#include "Exception.h"
#include "syntax-common/exception.h"
#include <algorithm>
#include <cassert>
@ -27,6 +27,8 @@
namespace MosesTraining
{
namespace Syntax
{
namespace GHKM
{
@ -70,4 +72,5 @@ void FlipAlignment(Alignment &a)
}
} // namespace GHKM
} // namespace Syntax
} // namespace MosesTraining

View File

@ -25,6 +25,8 @@
namespace MosesTraining
{
namespace Syntax
{
namespace GHKM
{
@ -35,5 +37,5 @@ void ReadAlignment(const std::string &, Alignment &);
void FlipAlignment(Alignment &);
} // namespace GHKM
} // namespace Syntax
} // namespace MosesTraining

View File

@ -34,6 +34,8 @@
namespace MosesTraining
{
namespace Syntax
{
namespace GHKM
{
@ -242,36 +244,24 @@ Node *AlignmentGraph::CopyParseTree(const SyntaxTree *root)
return p;
}
// Finds the set of frontier nodes. The definition of a frontier node differs
// from Galley et al's (2004) in the following ways:
//
// 1. A node with an empty span is not a frontier node (this excludes
// unaligned target subtrees).
// 2. Target word nodes are not frontier nodes.
// 3. Source word nodes are not frontier nodes.
// 4. Unless the --AllowUnary option is used, a node is not a frontier node if
// it has the same span as its parent.
// Recursively constructs the set of frontier nodes for the tree (or subtree)
// rooted at the given node.
void AlignmentGraph::ComputeFrontierSet(Node *root,
const Options &options,
std::set<Node *> &frontierSet) const
{
// Don't include word nodes or unaligned target subtrees.
// Non-tree nodes and unaligned target subtrees are not frontier nodes (and
// neither are their descendants). See the comment for the function
// AlignmentGraph::IsFrontierNode().
if (root->GetType() != TREE || root->GetSpan().empty()) {
return;
}
if (!SpansIntersect(root->GetComplementSpan(), Closure(root->GetSpan()))) {
// Unless unary rules are explicitly allowed, we use Chung et al's (2011)
// modified definition of a frontier node to eliminate the production of
// non-lexical unary rules.
assert(root->GetParents().size() <= 1);
if (options.allowUnary
|| root->GetParents().empty()
|| root->GetParents()[0]->GetSpan() != root->GetSpan()) {
frontierSet.insert(root);
}
if (IsFrontierNode(*root, options)) {
frontierSet.insert(root);
}
// Recursively check descendants.
const std::vector<Node *> &children = root->GetChildren();
for (std::vector<Node *>::const_iterator p(children.begin());
p != children.end(); ++p) {
@ -279,6 +269,37 @@ void AlignmentGraph::ComputeFrontierSet(Node *root,
}
}
// Determines whether the given node is a frontier node or not. The definition
// of a frontier node differs from Galley et al's (2004) in the following ways:
//
// 1. A node with an empty span is not a frontier node (this is to exclude
// unaligned target subtrees).
// 2. Target word nodes are not frontier nodes.
// 3. Source word nodes are not frontier nodes.
// 4. Unless the --AllowUnary option is used, a node is not a frontier node if
// it has the same span as its parent.
bool AlignmentGraph::IsFrontierNode(const Node &n, const Options &options) const
{
// Don't include word nodes or unaligned target subtrees.
if (n.GetType() != TREE || n.GetSpan().empty()) {
return false;
}
// This is the original GHKM definition of a frontier node.
if (SpansIntersect(n.GetComplementSpan(), Closure(n.GetSpan()))) {
return false;
}
// Unless unary rules are explicitly allowed, we use Chung et al's (2011)
// modified definition of a frontier node to eliminate the production of
// non-lexical unary rules.
assert(n.GetParents().size() <= 1);
if (!options.allowUnary &&
!n.GetParents().empty() &&
n.GetParents()[0]->GetSpan() == n.GetSpan()) {
return false;
}
return true;
}
void AlignmentGraph::CalcComplementSpans(Node *root)
{
Span compSpan;
@ -393,4 +414,5 @@ Node *AlignmentGraph::DetermineAttachmentPoint(int index)
}
} // namespace GHKM
} // namespace Syntax
} // namespace MosesTraining
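To make the unary condition concrete, here is a toy, self-contained sketch of clause 4 above (simplified stand-in types, not the real AlignmentGraph or Node classes): without --AllowUnary, a node covering exactly its parent's span is rejected.

#include <cstddef>
#include <iostream>
#include <utility>

typedef std::pair<int, int> ToySpan;  // toy stand-in for the real Span type

struct ToyNode {
  ToySpan span;
  const ToyNode *parent;  // NULL at the root
};

// Mirrors only clause 4 of IsFrontierNode: unless unary rules are allowed,
// a node with the same span as its parent cannot be a frontier node.
bool PassesUnaryCheck(const ToyNode &n, bool allowUnary)
{
  return allowUnary
         || n.parent == NULL
         || n.parent->span != n.span;
}

int main()
{
  ToyNode root = { ToySpan(0, 3), NULL };
  ToyNode sameSpanChild = { ToySpan(0, 3), &root };  // would yield a unary rule
  ToyNode narrowerChild = { ToySpan(0, 1), &root };

  std::cout << PassesUnaryCheck(sameSpanChild, false)   // 0: excluded
            << PassesUnaryCheck(sameSpanChild, true)    // 1: kept with --AllowUnary
            << PassesUnaryCheck(narrowerChild, false)   // 1: kept
            << std::endl;
  return 0;
}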

View File

@ -32,6 +32,8 @@
namespace MosesTraining
{
namespace Syntax
{
namespace GHKM
{
@ -64,6 +66,7 @@ private:
Node *CopyParseTree(const SyntaxTree *);
void ComputeFrontierSet(Node *, const Options &, std::set<Node *> &) const;
bool IsFrontierNode(const Node &, const Options &) const;
void CalcComplementSpans(Node *);
void GetTargetTreeLeaves(Node *, std::vector<Node *> &);
void AttachUnalignedSourceWords();
@ -78,6 +81,7 @@ private:
};
} // namespace GHKM
} // namespace Syntax
} // namespace MosesTraining
#endif

View File

@ -29,6 +29,8 @@
namespace MosesTraining
{
namespace Syntax
{
namespace GHKM
{
@ -128,4 +130,5 @@ Subgraph ComposedRule::CreateSubgraph()
}
} // namespace GHKM
} // namespace Syntax
} // namespace MosesTraining

View File

@ -28,6 +28,8 @@
namespace MosesTraining
{
namespace Syntax
{
namespace GHKM
{
@ -67,6 +69,7 @@ private:
};
} // namespace GHKM
} // namespace Syntax
} // namespace MosesTraining
#endif

View File

@ -1,46 +0,0 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#pragma once
#ifndef EXTRACT_GHKM_EXCEPTION_H_
#define EXTRACT_GHKM_EXCEPTION_H_
#include <string>
namespace MosesTraining
{
namespace GHKM
{
class Exception
{
public:
Exception(const char *msg) : m_msg(msg) {}
Exception(const std::string &msg) : m_msg(msg) {}
const std::string &GetMsg() const {
return m_msg;
}
private:
std::string m_msg;
};
} // namespace GHKM
} // namespace MosesTraining
#endif

View File

@ -30,6 +30,7 @@
#include <boost/program_options.hpp>
#include "syntax-common/exception.h"
#include "syntax-common/xml_tree_parser.h"
#include "InputFileStream.h"
@ -43,7 +44,6 @@
#include "Alignment.h"
#include "AlignmentGraph.h"
#include "Exception.h"
#include "Node.h"
#include "Options.h"
#include "PhraseOrientation.h"
@ -55,6 +55,8 @@
namespace MosesTraining
{
namespace Syntax
{
namespace GHKM
{
@ -131,8 +133,8 @@ int ExtractGHKM::Main(int argc, char *argv[])
std::string sourceLine;
std::string alignmentLine;
Alignment alignment;
Syntax::XmlTreeParser targetXmlTreeParser;
Syntax::XmlTreeParser sourceXmlTreeParser;
XmlTreeParser targetXmlTreeParser;
XmlTreeParser sourceXmlTreeParser;
ScfgRuleWriter scfgWriter(fwdExtractStream, invExtractStream, options);
StsgRuleWriter stsgWriter(fwdExtractStream, invExtractStream, options);
size_t lineNum = options.sentenceOffset;
@ -163,8 +165,8 @@ int ExtractGHKM::Main(int argc, char *argv[])
} catch (const Exception &e) {
std::ostringstream oss;
oss << "Failed to parse target XML tree at line " << lineNum;
if (!e.GetMsg().empty()) {
oss << ": " << e.GetMsg();
if (!e.msg().empty()) {
oss << ": " << e.msg();
}
Error(oss.str());
}
@ -181,8 +183,8 @@ int ExtractGHKM::Main(int argc, char *argv[])
} catch (const Exception &e) {
std::ostringstream oss;
oss << "Failed to parse source XML tree at line " << lineNum;
if (!e.GetMsg().empty()) {
oss << ": " << e.GetMsg();
if (!e.msg().empty()) {
oss << ": " << e.msg();
}
Error(oss.str());
}
@ -195,7 +197,7 @@ int ExtractGHKM::Main(int argc, char *argv[])
} catch (const Exception &e) {
std::ostringstream oss;
oss << "Failed to read alignment at line " << lineNum << ": ";
oss << e.GetMsg();
oss << e.msg();
Error(oss.str());
}
if (alignment.size() == 0) {
@ -896,4 +898,5 @@ void ExtractGHKM::StripBitParLabels(
}
} // namespace GHKM
} // namespace Syntax
} // namespace MosesTraining

View File

@ -32,12 +32,14 @@
namespace MosesTraining
{
namespace Syntax
{
namespace GHKM
{
struct Options;
class ExtractGHKM : public Syntax::Tool
class ExtractGHKM : public Tool
{
public:
ExtractGHKM() : Tool("extract-ghkm") {}
@ -76,4 +78,5 @@ private:
};
} // namespace GHKM
} // namespace Syntax
} // namespace MosesTraining

View File

@ -21,6 +21,6 @@
int main(int argc, char *argv[])
{
MosesTraining::GHKM::ExtractGHKM tool;
MosesTraining::Syntax::GHKM::ExtractGHKM tool;
return tool.Main(argc, argv);
}

View File

@ -23,6 +23,8 @@
namespace MosesTraining
{
namespace Syntax
{
namespace GHKM
{
@ -70,4 +72,5 @@ void Node::GetTargetWords(std::vector<std::string> &targetWords) const
}
} // namespace GHKM
} // namespace Syntax
} // namespace MosesTraining

View File

@ -30,6 +30,8 @@
namespace MosesTraining
{
namespace Syntax
{
namespace GHKM
{
@ -215,6 +217,7 @@ Node *Node::LowestCommonAncestor(InputIterator first, InputIterator last)
}
} // namespace GHKM
} // namespace Syntax
} // namespace MosesTraining
#endif

View File

@ -23,6 +23,8 @@
namespace MosesTraining
{
namespace Syntax
{
namespace GHKM
{
@ -89,5 +91,5 @@ public:
};
} // namespace GHKM
} // namespace Syntax
} // namespace MosesTraining

View File

@ -28,6 +28,8 @@
namespace MosesTraining
{
namespace Syntax
{
namespace GHKM
{
@ -469,5 +471,5 @@ void PhraseOrientation::WritePriorCounts(std::ostream& out, const REO_MODEL_TYPE
}
} // namespace GHKM
} // namespace Syntax
} // namespace MosesTraining

View File

@ -32,6 +32,8 @@
namespace MosesTraining
{
namespace Syntax
{
namespace GHKM
{
@ -120,4 +122,5 @@ private:
};
} // namespace GHKM
} // namespace Syntax
} // namespace MosesTraining

View File

@ -5,6 +5,8 @@
namespace MosesTraining
{
namespace Syntax
{
namespace GHKM
{
@ -38,4 +40,5 @@ bool Rule::PartitionOrderComp(const Node *a, const Node *b)
}
} // namespace GHKM
} // namespace Syntax
} // namespace MosesTraining

View File

@ -9,6 +9,8 @@
namespace MosesTraining
{
namespace Syntax
{
namespace GHKM
{
@ -54,6 +56,7 @@ protected:
};
} // namespace GHKM
} // namespace Syntax
} // namespace MosesTraining
#endif

View File

@ -28,6 +28,8 @@
namespace MosesTraining
{
namespace Syntax
{
namespace GHKM
{
@ -197,4 +199,5 @@ void ScfgRule::UpdateSourceLabelCoocCounts(std::map< std::string, std::map<std::
}
} // namespace GHKM
} // namespace Syntax
} // namespace MosesTraining

View File

@ -31,6 +31,8 @@
namespace MosesTraining
{
namespace Syntax
{
namespace GHKM
{
@ -93,4 +95,5 @@ private:
};
} // namespace GHKM
} // namespace Syntax
} // namespace MosesTraining

View File

@ -32,6 +32,8 @@
namespace MosesTraining
{
namespace Syntax
{
namespace GHKM
{
@ -229,4 +231,5 @@ void ScfgRuleWriter::WriteSymbol(const Symbol &symbol, std::ostream &out)
}
} // namespace GHKM
} // namespace Syntax
} // namespace MosesTraining

View File

@ -25,6 +25,8 @@
namespace MosesTraining
{
namespace Syntax
{
namespace GHKM
{
@ -57,5 +59,5 @@ private:
};
} // namespace GHKM
} // namespace Syntax
} // namespace MosesTraining

View File

@ -21,6 +21,8 @@
namespace MosesTraining
{
namespace Syntax
{
namespace GHKM
{
@ -45,4 +47,5 @@ ContiguousSpan Closure(const Span &s)
}
} // namespace GHKM
} // namespace Syntax
} // namespace MosesTraining

View File

@ -26,6 +26,8 @@
namespace MosesTraining
{
namespace Syntax
{
namespace GHKM
{
@ -37,6 +39,7 @@ bool SpansIntersect(const Span &, const ContiguousSpan &);
ContiguousSpan Closure(const Span &);
} // namespace GHKM
} // namespace Syntax
} // namespace MosesTraining
#endif

View File

@ -7,6 +7,8 @@
namespace MosesTraining
{
namespace Syntax
{
namespace GHKM
{
@ -91,4 +93,5 @@ StsgRule::StsgRule(const Subgraph &fragment)
}
} // namespace GHKM
} // namespace Syntax
} // namespace MosesTraining

View File

@ -9,6 +9,8 @@
namespace MosesTraining
{
namespace Syntax
{
namespace GHKM
{
@ -39,6 +41,7 @@ private:
};
} // namespace GHKM
} // namespace Syntax
} // namespace MosesTraining
#endif

View File

@ -13,6 +13,8 @@
namespace MosesTraining
{
namespace Syntax
{
namespace GHKM
{
@ -92,4 +94,5 @@ void StsgRuleWriter::Write(const StsgRule &rule)
}
} // namespace GHKM
} // namespace Syntax
} // namespace MosesTraining

View File

@ -8,6 +8,8 @@
namespace MosesTraining
{
namespace Syntax
{
namespace GHKM
{
@ -36,6 +38,7 @@ private:
};
} // namespace GHKM
} // namespace Syntax
} // namespace MosesTraining
#endif

View File

@ -24,6 +24,8 @@
namespace MosesTraining
{
namespace Syntax
{
namespace GHKM
{
@ -195,4 +197,5 @@ void Subgraph::RecursivelyGetPartsOfSpeech(const Node *n, std::vector<std::strin
}
} // namespace GHKM
} // namespace Syntax
} // namespace MosesTraining

View File

@ -26,6 +26,8 @@
namespace MosesTraining
{
namespace Syntax
{
namespace GHKM
{
@ -137,5 +139,5 @@ private:
};
} // namespace GHKM
} // namespace Syntax
} // namespace MosesTraining

View File

@ -4,6 +4,7 @@
#include <vector>
#include "extract-lex.h"
#include "InputFileStream.h"
#include "moses/Util.h"
using namespace std;
using namespace MosesTraining;
@ -53,9 +54,9 @@ int main(int argc, char* argv[])
assert(isAlign);
vector<string> toksTarget, toksSource, toksAlign;
Tokenize(toksTarget, lineTarget);
Tokenize(toksSource, lineSource);
Tokenize(toksAlign, lineAlign);
Moses::Tokenize(toksTarget, lineTarget);
Moses::Tokenize(toksSource, lineSource);
Moses::Tokenize(toksAlign, lineAlign);
/*
cerr << endl
@ -99,7 +100,7 @@ void ExtractLex::Process(vector<string> &toksTarget, vector<string> &toksSource,
const string &alignTok = *iterAlign;
vector<size_t> alignPos;
Tokenize(alignPos, alignTok, "-");
Moses::Tokenize(alignPos, alignTok, "-");
assert(alignPos.size() == 2);
if (alignPos[0] >= toksSource.size()) {

View File

@ -9,59 +9,6 @@
namespace MosesTraining
{
//! convert string to variable of type T. Used to read floats, ints, etc. from files
template<typename T>
inline T Scan(const std::string &input)
{
std::stringstream stream(input);
T ret;
stream >> ret;
return ret;
}
//! speeded up version of above
template<typename T>
inline void Scan(std::vector<T> &output, const std::vector< std::string > &input)
{
output.resize(input.size());
for (size_t i = 0 ; i < input.size() ; i++) {
output[i] = Scan<T>( input[i] );
}
}
inline void Tokenize(std::vector<std::string> &output
, const std::string& str
, const std::string& delimiters = " \t")
{
// Skip delimiters at beginning.
std::string::size_type lastPos = str.find_first_not_of(delimiters, 0);
// Find first "non-delimiter".
std::string::size_type pos = str.find_first_of(delimiters, lastPos);
while (std::string::npos != pos || std::string::npos != lastPos) {
// Found a token, add it to the vector.
output.push_back(str.substr(lastPos, pos - lastPos));
// Skip delimiters. Note the "not_of"
lastPos = str.find_first_not_of(delimiters, pos);
// Find next "non-delimiter"
pos = str.find_first_of(delimiters, lastPos);
}
}
// speeded up version of above
template<typename T>
inline void Tokenize( std::vector<T> &output
, const std::string &input
, const std::string& delimiters = " \t")
{
std::vector<std::string> stringVector;
Tokenize(stringVector, input, delimiters);
return Scan<T>(output, stringVector );
}
class WordCount
{
friend std::ostream& operator<<(std::ostream&, const WordCount&);
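The helpers removed above duplicated utilities in moses/Util.h, so the callers now use the Moses:: versions directly. A minimal usage sketch (assumes compilation inside the Moses source tree; the input literals are illustrative only):

#include <iostream>
#include <string>
#include <vector>
#include "moses/Util.h"

int main()
{
  // String overload: split on the default " \t" delimiters.
  std::vector<std::string> toks;
  Moses::Tokenize(toks, "the house ||| das Haus");

  // Typed overload: each token is converted via Scan<T>, here size_t.
  std::vector<size_t> alignPos;
  Moses::Tokenize(alignPos, "3-2", "-");

  std::cout << toks.size() << " tokens; alignment "
            << alignPos[0] << "-" << alignPos[1] << std::endl;
  return 0;
}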

View File

@ -283,6 +283,7 @@ int main(int argc, char* argv[])
string englishString, foreignString, alignmentString, weightString;
while(getline(*eFileP, englishString)) {
// Print progress dots to stderr.
i++;
if (i%10000 == 0) cerr << "." << flush;
@ -337,6 +338,9 @@ int main(int argc, char* argv[])
extractFileContextInv.Close();
}
}
// We've been printing progress dots to stderr. End the line.
cerr << endl;
}
namespace MosesTraining

View File

@ -1,2 +1,2 @@
exe lexical-reordering-score : InputFileStream.cpp reordering_classes.cpp score.cpp ../../util//kenutil ../..//z ;
exe lexical-reordering-score : InputFileStream.cpp reordering_classes.cpp score.cpp ../OutputFileStream.cpp ../..//boost_iostreams ../..//boost_filesystem ../../util//kenutil ../..//z ;

View File

@ -277,7 +277,7 @@ void Model::score_fe(const string& f, const string& e)
{
if (!fe) //Make sure we do not do anything if it is not an fe model
return;
fprintf(file,"%s ||| %s ||| ",f.c_str(),e.c_str());
outputFile << f << " ||| " << e << " |||";
//condition on the previous phrase
if (previous) {
vector<double> scores;
@ -288,9 +288,8 @@ void Model::score_fe(const string& f, const string& e)
sum += scores[i];
}
for(size_t i=0; i<scores.size(); ++i) {
fprintf(file,"%f ",scores[i]/sum);
outputFile << " " << (scores[i]/sum);
}
//fprintf(file, "||| ");
}
//condition on the next phrase
if (next) {
@ -302,17 +301,17 @@ void Model::score_fe(const string& f, const string& e)
sum += scores[i];
}
for(size_t i=0; i<scores.size(); ++i) {
fprintf(file, "%f ", scores[i]/sum);
outputFile << " " << (scores[i]/sum);
}
}
fprintf(file,"\n");
outputFile << endl;
}
void Model::score_f(const string& f)
{
if (fe) //Make sure we do not do anything if it is not an f model
return;
fprintf(file, "%s ||| ", f.c_str());
outputFile << f << " |||";
//condition on the previous phrase
if (previous) {
vector<double> scores;
@ -323,9 +322,8 @@ void Model::score_f(const string& f)
sum += scores[i];
}
for(size_t i=0; i<scores.size(); ++i) {
fprintf(file, "%f ", scores[i]/sum);
outputFile << " " << (scores[i]/sum);
}
//fprintf(file, "||| ");
}
//condition on the next phrase
if (next) {
@ -337,22 +335,16 @@ void Model::score_f(const string& f)
sum += scores[i];
}
for(size_t i=0; i<scores.size(); ++i) {
fprintf(file, "%f ", scores[i]/sum);
outputFile << " " << (scores[i]/sum);
}
}
fprintf(file, "\n");
outputFile << endl;
}
Model::Model(ModelScore* ms, Scorer* sc, const string& dir, const string& lang, const string& fn)
: modelscore(ms), scorer(sc), filename(fn)
{
file = fopen(filename.c_str(),"w");
if (!file) {
cerr << "Could not open the model output file: " << filename << endl;
exit(1);
}
outputFile.Open( (filename+".gz").c_str() );
fe = false;
if (lang.compare("fe") == 0) {
fe = true;
@ -373,28 +365,11 @@ Model::Model(ModelScore* ms, Scorer* sc, const string& dir, const string& lang,
Model::~Model()
{
fclose(file);
outputFile.Close();
delete modelscore;
delete scorer;
}
void Model::zipFile()
{
fclose(file);
file = fopen(filename.c_str(), "rb");
gzFile gzfile = gzopen((filename+".gz").c_str(),"wb");
char inbuffer[128];
int num_read;
while ((num_read = fread(inbuffer, 1, sizeof(inbuffer), file)) > 0) {
gzwrite(gzfile, inbuffer, num_read);
}
fclose(file);
gzclose(gzfile);
//Remove the unzipped file
remove(filename.c_str());
}
void Model::split_config(const string& config, string& dir, string& lang, string& orient)
{
istringstream is(config);

View File

@ -13,7 +13,7 @@
#include <fstream>
#include "util/string_piece.hh"
#include "../OutputFileStream.h"
enum ORIENTATION {MONO, SWAP, DRIGHT, DLEFT, OTHER, NOMONO};
@ -122,8 +122,8 @@ private:
ModelScore* modelscore;
Scorer* scorer;
std::FILE* file;
std::string filename;
Moses::OutputFileStream outputFile;
bool fe;
bool previous;

View File

@ -205,11 +205,10 @@ int main(int argc, char* argv[])
models[i]->score_f(f_current);
}
//Zip all files
// delete model objects (and close files)
for (size_t i=0; i<models.size(); ++i) {
models[i]->zipFile();
delete models[i];
}
return 0;
}
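The pattern adopted above, sketched standalone (assumes the phrase-extract tree, with OutputFileStream and the boost_iostreams/zlib linkage added to the Jamfile earlier in this diff): opening a path ending in .gz compresses on the fly, which is what makes the old write-then-zipFile() pass unnecessary.

#include "../OutputFileStream.h"

int main()
{
  Moses::OutputFileStream out;
  out.Open("scores.gz");        // ".gz" suffix => gzip-compressed on the fly
  out << "0.4 0.3 0.3" << "\n";
  out.Close();                  // flushes and finalizes the gzip stream
  return 0;
}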

View File

@ -120,8 +120,13 @@ void store( SyntaxNodeCollection &tree, const vector< string > &words )
for( size_t i=0; i<nodes.size(); i++ ) {
cout << " <tree span=\"" << nodes[i]->start
<< "-" << nodes[i]->end
<< "\" label=\"" << nodes[i]->label
<< "\"/>";
<< "\" label=\"" << nodes[i]->label << "\"";
for (SyntaxNode::AttributeMap::const_iterator
p = nodes[i]->attributes.begin();
p != nodes[i]->attributes.end(); ++p) {
cout << " " << p->first << "=\"" << p->second << "\"";
}
cout << "/>";
}
cout << endl;
}
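With the attribute loop above, any extra key/value pairs stored on a SyntaxNode are echoed into the emitted span tags. For illustration, a node carrying a hypothetical head attribute (not taken from the source) would render as:

<tree span="0-1" label="NP" head="house"/>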

View File

@ -19,7 +19,9 @@
#include <sstream>
#include <assert.h>
#include <cstdlib>
#include <cstring>
#include <list>
#include <map>
#include <set>
#include <vector>
@ -70,6 +72,7 @@ bool nonTermContextTarget = false;
int countOfCounts[COC_MAX+1];
int totalDistinct = 0;
float minCount = 0;
float minCountHierarchical = 0;
bool phraseOrientationPriorsFlag = false;
@ -107,7 +110,7 @@ void writeLeftHandSideLabelCounts( const boost::unordered_map<std::string,float>
const std::string &fileNameLeftHandSideSourceLabelCounts,
const std::string &fileNameLeftHandSideTargetSourceLabelCounts );
void writeLabelSet( const std::set<std::string> &labelSet, const std::string &fileName );
void processPhrasePairs( std::vector< ExtractionPhrasePair* > &phrasePairsWithSameSource, std::ostream &phraseTableFile,
void processPhrasePairs( std::list< ExtractionPhrasePair* > &phrasePairsWithSameSource, std::ostream &phraseTableFile,
const ScoreFeatureManager& featureManager, const MaybeLog& maybeLogProb );
void outputPhrasePair(const ExtractionPhrasePair &phrasePair, float, int, std::ostream &phraseTableFile, const ScoreFeatureManager &featureManager, const MaybeLog &maybeLog );
double computeLexicalTranslation( const PHRASE *phraseSource, const PHRASE *phraseTarget, const ALIGNMENT *alignmentTargetToSource );
@ -131,14 +134,28 @@ int main(int argc, char* argv[])
ScoreFeatureManager featureManager;
if (argc < 4) {
std::cerr <<
"syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] "
"[--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--KneserNey] "
"[--NoWordAlignment] [--UnalignedPenalty] "
"syntax: score extract lex phrase-table "
"[--Inverse] "
"[--Hierarchical] "
"[--LogProb] "
"[--NegLogProb] "
"[--NoLex] "
"[--GoodTuring] "
"[--KneserNey] "
"[--NoWordAlignment] "
"[--UnalignedPenalty] "
"[--UnalignedFunctionWordPenalty function-word-file] "
"[--MinCountHierarchical count] [--PartsOfSpeech] [--PCFG] "
"[--TreeFragments] [--SourceLabels] [--SourceLabelCountsLHS] "
"[--TargetPreferenceLabels] [--UnpairedExtractFormat] "
"[--ConditionOnTargetLHS] [--CrossedNonTerm]" << std::endl;
"[--MinCountHierarchical count] "
"[--PartsOfSpeech] "
"[--PCFG] "
"[--TreeFragments] "
"[--SourceLabels] "
"[--SourceLabelCountsLHS] "
"[--TargetPreferenceLabels] "
"[--UnpairedExtractFormat] "
"[--ConditionOnTargetLHS] "
"[--CrossedNonTerm]"
<< std::endl;
std::cerr << featureManager.usage() << std::endl;
exit(1);
}
@ -235,9 +252,13 @@ int main(int argc, char* argv[])
logProbFlag = true;
negLogProb = -1;
std::cerr << "using negative log-probabilities" << std::endl;
} else if (strcmp(argv[i],"--MinCount") == 0) {
minCount = std::atof( argv[++i] );
std::cerr << "dropping all phrase pairs occurring less than " << minCount << " times" << std::endl;
minCount -= 0.00001; // account for rounding
} else if (strcmp(argv[i],"--MinCountHierarchical") == 0) {
minCountHierarchical = Moses::Scan<float>( argv[++i] );
std::cerr << "dropping all phrase pairs occurring less than " << minCountHierarchical << " times" << std::endl;
minCountHierarchical = std::atof( argv[++i] );
std::cerr << "dropping all hierarchical phrase pairs occurring less than " << minCountHierarchical << " times" << std::endl;
minCountHierarchical -= 0.00001; // account for rounding
} else if (strcmp(argv[i],"--CrossedNonTerm") == 0) {
crossedNonTerm = true;
@ -325,8 +346,8 @@ int main(int argc, char* argv[])
// loop through all extracted phrase translations
std::string line, lastLine;
ExtractionPhrasePair *phrasePair = NULL;
std::vector< ExtractionPhrasePair* > phrasePairsWithSameSource;
std::vector< ExtractionPhrasePair* > phrasePairsWithSameSourceAndTarget; // required for hierarchical rules only, as non-terminal alignments might make the phrases incompatible
std::list< ExtractionPhrasePair* > phrasePairsWithSameSource;
std::list< ExtractionPhrasePair* > phrasePairsWithSameSourceAndTarget; // required for hierarchical rules only, as non-terminal alignments might make the phrases incompatible
int tmpSentenceId;
PHRASE *tmpPhraseSource, *tmpPhraseTarget;
@ -359,6 +380,7 @@ int main(int argc, char* argv[])
while ( getline(extractFile, line) ) {
// Print progress dots to stderr.
if ( ++i % 100000 == 0 ) {
std::cerr << "." << std::flush;
}
@ -389,7 +411,7 @@ int main(int argc, char* argv[])
// once the first of them has been found to have to be set to false
if ( hierarchicalFlag ) {
for ( std::vector< ExtractionPhrasePair* >::const_iterator iter = phrasePairsWithSameSourceAndTarget.begin();
for ( std::list< ExtractionPhrasePair* >::const_iterator iter = phrasePairsWithSameSourceAndTarget.begin();
iter != phrasePairsWithSameSourceAndTarget.end(); ++iter ) {
if ( (*iter)->Matches( tmpPhraseSource, tmpPhraseTarget, tmpTargetToSourceAlignment,
sourceMatch, targetMatch, alignmentMatch ) ) {
@ -419,7 +441,7 @@ int main(int argc, char* argv[])
if ( !phrasePairsWithSameSource.empty() &&
!sourceMatch ) {
processPhrasePairs( phrasePairsWithSameSource, *phraseTableFile, featureManager, maybeLogProb );
for ( std::vector< ExtractionPhrasePair* >::const_iterator iter=phrasePairsWithSameSource.begin();
for ( std::list< ExtractionPhrasePair* >::const_iterator iter=phrasePairsWithSameSource.begin();
iter!=phrasePairsWithSameSource.end(); ++iter) {
delete *iter;
}
@ -450,8 +472,11 @@ int main(int argc, char* argv[])
}
// We've been printing progress dots to stderr. End the line.
std::cerr << std::endl;
processPhrasePairs( phrasePairsWithSameSource, *phraseTableFile, featureManager, maybeLogProb );
for ( std::vector< ExtractionPhrasePair* >::const_iterator iter=phrasePairsWithSameSource.begin();
for ( std::list< ExtractionPhrasePair* >::const_iterator iter=phrasePairsWithSameSource.begin();
iter!=phrasePairsWithSameSource.end(); ++iter) {
delete *iter;
}
@ -546,7 +571,7 @@ void processLine( std::string line,
} else if (item + (includeSentenceIdFlag?-1:0) == 4) { // count
sscanf(token[j].c_str(), "%f", &count);
} else if (item + (includeSentenceIdFlag?-1:0) == 5) { // target syntax PCFG score
float pcfgScore = Moses::Scan<float>( token[j] );
float pcfgScore = std::atof( token[j].c_str() );
pcfgSum = pcfgScore * count;
}
}
@ -652,7 +677,7 @@ void writeLabelSet( const std::set<std::string> &labelSet, const std::string &fi
}
void processPhrasePairs( std::vector< ExtractionPhrasePair* > &phrasePairsWithSameSource, std::ostream &phraseTableFile,
void processPhrasePairs( std::list< ExtractionPhrasePair* > &phrasePairsWithSameSource, std::ostream &phraseTableFile,
const ScoreFeatureManager& featureManager, const MaybeLog& maybeLogProb )
{
if (phrasePairsWithSameSource.size() == 0) {
@ -664,14 +689,14 @@ void processPhrasePairs( std::vector< ExtractionPhrasePair* > &phrasePairsWithSa
//std::cerr << "phrasePairs.size() = " << phrasePairs.size() << std::endl;
// loop through phrase pairs
for ( std::vector< ExtractionPhrasePair* >::const_iterator iter=phrasePairsWithSameSource.begin();
for ( std::list< ExtractionPhrasePair* >::const_iterator iter=phrasePairsWithSameSource.begin();
iter!=phrasePairsWithSameSource.end(); ++iter) {
// add to total count
totalSource += (*iter)->GetCount();
}
// output the distinct phrase pairs, one at a time
for ( std::vector< ExtractionPhrasePair* >::const_iterator iter=phrasePairsWithSameSource.begin();
for ( std::list< ExtractionPhrasePair* >::const_iterator iter=phrasePairsWithSameSource.begin();
iter!=phrasePairsWithSameSource.end(); ++iter) {
// output this phrase pair
outputPhrasePair( **iter, totalSource, phrasePairsWithSameSource.size(), phraseTableFile, featureManager, maybeLogProb );
@ -700,16 +725,15 @@ void outputPhrasePair(const ExtractionPhrasePair &phrasePair,
countOfCounts[ countInt ]++;
}
// compute PCFG score
float pcfgScore = 0;
if (pcfgFlag && !inverseFlag) {
pcfgScore = phrasePair.GetPcfgScore() / count;
}
// output phrases
const PHRASE *phraseSource = phrasePair.GetSource();
const PHRASE *phraseTarget = phrasePair.GetTarget();
// do not output if count below threshold
if (count < minCount) {
return;
}
// do not output if hierarchical and count below threshold
if (hierarchicalFlag && count < minCountHierarchical) {
for(size_t j=0; j<phraseSource->size()-1; ++j) {
@ -718,6 +742,12 @@ void outputPhrasePair(const ExtractionPhrasePair &phrasePair,
}
}
// compute PCFG score
float pcfgScore = 0;
if (pcfgFlag && !inverseFlag) {
pcfgScore = phrasePair.GetPcfgScore() / count;
}
// source phrase (unless inverse)
if (!inverseFlag) {
printSourcePhrase(phraseSource, phraseTarget, bestAlignmentT2S, phraseTableFile);
@ -1167,7 +1197,7 @@ void LexicalTable::load( const std::string &fileName )
continue;
}
double prob = Moses::Scan<double>( token[2] );
double prob = std::atof( token[2].c_str() );
WORD_ID wordT = vcbT.storeIfNew( token[0] );
WORD_ID wordS = vcbS.storeIfNew( token[1] );
ltable[ wordS ][ wordT ] = prob;
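A small, deterministic illustration of the rounding issue that the 0.00001 adjustment to minCount and minCountHierarchical guards against (toy values, not from the source; uses C++11's std::nextafter):

#include <cmath>
#include <iostream>

int main()
{
  float minCount = 3.0f;
  // Largest representable float strictly below 3.0: a count that "should"
  // be exactly 3 can end up here after float accumulation.
  float count = std::nextafter(3.0f, 0.0f);

  std::cout << (count < minCount) << "\n";             // 1: would be dropped
  std::cout << (count < minCount - 0.00001f) << "\n";  // 0: kept after the nudge
  return 0;
}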

View File

@ -137,38 +137,38 @@ sub run_transliteration
print "Filter Table\n";
`$MOSES_SRC/scripts/training/train-model.perl \
-mgiza -mgiza-cpus 10 -dont-zip -first-step 9 \
-external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \
-e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \
-score-options '--KneserNey' \
-phrase-translation-table $TRANSLIT_MODEL/model/phrase-table \
-config $TRANSLIT_MODEL/evaluation/$eval_file.moses.table.ini \
`$MOSES_SRC/scripts/training/train-model.perl \\
-mgiza -mgiza-cpus 10 -dont-zip -first-step 9 \\
-external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \\
-e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \\
-score-options '--KneserNey' \\
-phrase-translation-table $TRANSLIT_MODEL/model/phrase-table \\
-config $TRANSLIT_MODEL/evaluation/$eval_file.moses.table.ini \\
-lm 0:3:$TRANSLIT_MODEL/evaluation/$eval_file.moses.table.ini:8`;
`$MOSES_SRC/scripts/training/filter-model-given-input.pl \
$TRANSLIT_MODEL/evaluation/$eval_file.filtered \
$TRANSLIT_MODEL/evaluation/$eval_file.moses.table.ini \
$TRANSLIT_MODEL/evaluation/$eval_file \
`$MOSES_SRC/scripts/training/filter-model-given-input.pl \\
$TRANSLIT_MODEL/evaluation/$eval_file.filtered \\
$TRANSLIT_MODEL/evaluation/$eval_file.moses.table.ini \\
$TRANSLIT_MODEL/evaluation/$eval_file \\
-Binarizer "$MOSES_SRC/bin/CreateOnDiskPt 1 1 4 100 2"`;
`rm $TRANSLIT_MODEL/evaluation/$eval_file.moses.table.ini`;
print "Apply Filter\n";
`$MOSES_SRC/scripts/ems/support/substitute-filtered-tables-and-weights.perl \
$TRANSLIT_MODEL/evaluation/$eval_file.filtered/moses.ini \
$TRANSLIT_MODEL/model/moses.ini \
$TRANSLIT_MODEL/tuning/moses.tuned.ini \
`$MOSES_SRC/scripts/ems/support/substitute-filtered-tables-and-weights.perl \\
$TRANSLIT_MODEL/evaluation/$eval_file.filtered/moses.ini \\
$TRANSLIT_MODEL/model/moses.ini \\
$TRANSLIT_MODEL/tuning/moses.tuned.ini \\
$TRANSLIT_MODEL/evaluation/$eval_file.filtered.ini`;
my $drop_stderr = $VERBOSE ? "" : " 2>/dev/null";
`$DECODER \
-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000 \
-threads 16 -drop-unknown -distortion-limit 0 \
-n-best-list $TRANSLIT_MODEL/evaluation/$eval_file.op.nBest 1000 \
distinct -f $TRANSLIT_MODEL/evaluation/$eval_file.filtered.ini \
< $TRANSLIT_MODEL/evaluation/$eval_file \
`$DECODER \\
-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000 \\
-threads 16 -drop-unknown -distortion-limit 0 \\
-n-best-list $TRANSLIT_MODEL/evaluation/$eval_file.op.nBest 1000 \\
distinct -f $TRANSLIT_MODEL/evaluation/$eval_file.filtered.ini \\
< $TRANSLIT_MODEL/evaluation/$eval_file \\
> $TRANSLIT_MODEL/evaluation/$eval_file.op $drop_stderr`;
}
@ -315,52 +315,52 @@ sub run_decoder
`mkdir $corpus_dir/evaluation`;
`$MOSES_SRC/scripts/training/train-model.perl \
-mgiza -mgiza-cpus 10 -dont-zip -first-step 9 \
-external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \
-e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \
-lmodel-oov-feature "yes" -post-decoding-translit "yes" \
-phrase-translation-table $corpus_dir/model/phrase-table \
`$MOSES_SRC/scripts/training/train-model.perl \\
-mgiza -mgiza-cpus 10 -dont-zip -first-step 9 \\
-external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \\
-e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \\
-lmodel-oov-feature "yes" -post-decoding-translit "yes" \\
-phrase-translation-table $corpus_dir/model/phrase-table \\
-config $corpus_dir/model/moses.ini -lm 0:5:$LM_FILE:8`;
`touch $corpus_dir/evaluation/$OUTPUT_FILE_NAME.moses.table.ini`;
`$MOSES_SRC/scripts/training/train-model.perl \
-mgiza -mgiza-cpus 10 -dont-zip -first-step 9 \
-external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \
-e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \
-lmodel-oov-feature "yes" -post-decoding-translit "yes" \
-phrase-translation-table $corpus_dir/model/phrase-table \
-config $corpus_dir/evaluation/$OUTPUT_FILE_NAME.moses.table.ini \
`$MOSES_SRC/scripts/training/train-model.perl \\
-mgiza -mgiza-cpus 10 -dont-zip -first-step 9 \\
-external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \\
-e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \\
-lmodel-oov-feature "yes" -post-decoding-translit "yes" \\
-phrase-translation-table $corpus_dir/model/phrase-table \\
-config $corpus_dir/evaluation/$OUTPUT_FILE_NAME.moses.table.ini \\
-lm 0:3:$corpus_dir/evaluation/$OUTPUT_FILE_NAME.moses.table.ini:8`;
`$MOSES_SRC/scripts/training/filter-model-given-input.pl \
$corpus_dir/evaluation/filtered \
$corpus_dir/evaluation/$OUTPUT_FILE_NAME.moses.table.ini \
$INPUT_FILE -Binarizer "$MOSES_SRC/bin/CreateOnDiskPt \
`$MOSES_SRC/scripts/training/filter-model-given-input.pl \\
$corpus_dir/evaluation/filtered \\
$corpus_dir/evaluation/$OUTPUT_FILE_NAME.moses.table.ini \\
$INPUT_FILE -Binarizer "$MOSES_SRC/bin/CreateOnDiskPt \\
1 1 4 100 2"`;
`rm $corpus_dir/evaluation/$OUTPUT_FILE_NAME.moses.table.ini`;
`$MOSES_SRC/scripts/ems/support/substitute-filtered-tables.perl \
$corpus_dir/evaluation/filtered/moses.ini \
< $corpus_dir/model/moses.ini \
`$MOSES_SRC/scripts/ems/support/substitute-filtered-tables.perl \\
$corpus_dir/evaluation/filtered/moses.ini \\
< $corpus_dir/model/moses.ini \\
> $corpus_dir/evaluation/moses.filtered.ini`;
my $drop_stderr = $VERBOSE ? "" : " 2>/dev/null";
`$DECODER \
-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000 \
-threads 16 -feature-overwrite 'TranslationModel0 table-limit=100' \
-max-trans-opt-per-coverage 100 \
-f $corpus_dir/evaluation/moses.filtered.ini -distortion-limit 0 \
< $INPUT_FILE \
`$DECODER \\
-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000 \\
-threads 16 -feature-overwrite 'TranslationModel0 table-limit=100' \\
-max-trans-opt-per-coverage 100 \\
-f $corpus_dir/evaluation/moses.filtered.ini -distortion-limit 0 \\
< $INPUT_FILE \\
> $OUTPUT_FILE $drop_stderr`;
print "$DECODER \
-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000 \
-threads 16 -feature-overwrite 'TranslationModel0 table-limit=100' \
-max-trans-opt-per-coverage 100 \
-f $corpus_dir/evaluation/moses.filtered.ini -distortion-limit 0 \
< $INPUT_FILE \
print "$DECODER \\
-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000 \\
-threads 16 -feature-overwrite 'TranslationModel0 table-limit=100' \\
-max-trans-opt-per-coverage 100 \\
-f $corpus_dir/evaluation/moses.filtered.ini -distortion-limit 0 \\
< $INPUT_FILE \\
> $OUTPUT_FILE $drop_stderr\n";
}

View File

@ -103,34 +103,34 @@ sub run_transliteration
print STDERR "Filter Table\n";
`$MOSES_SRC/scripts/training/train-model.perl \
-mgiza -mgiza-cpus 10 -dont-zip -first-step 9 \
-external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \
-e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \
-reordering msd-bidirectional-fe -score-options '--KneserNey' \
-phrase-translation-table $TRANSLIT_MODEL/model/phrase-table \
-reordering-table $TRANSLIT_MODEL/model/reordering-table \
-config $eval_file.moses.table.ini \
`$MOSES_SRC/scripts/training/train-model.perl \\
-mgiza -mgiza-cpus 10 -dont-zip -first-step 9 \\
-external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \\
-e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \\
-reordering msd-bidirectional-fe -score-options '--KneserNey' \\
-phrase-translation-table $TRANSLIT_MODEL/model/phrase-table \\
-reordering-table $TRANSLIT_MODEL/model/reordering-table \\
-config $eval_file.moses.table.ini \\
-lm 0:3:$eval_file.moses.table.ini:8`;
`$MOSES_SRC/scripts/training/filter-model-given-input.pl \
$eval_file.filtered $eval_file.moses.table.ini $eval_file \
`$MOSES_SRC/scripts/training/filter-model-given-input.pl \\
$eval_file.filtered $eval_file.moses.table.ini $eval_file \\
-Binarizer "$MOSES_SRC/bin/CreateOnDiskPt 1 1 4 100 2"`;
`rm $eval_file.moses.table.ini`;
print STDERR "Apply Filter\n";
`$MOSES_SRC/scripts/ems/support/substitute-filtered-tables-and-weights.perl \
$eval_file.filtered/moses.ini $TRANSLIT_MODEL/model/moses.ini \
`$MOSES_SRC/scripts/ems/support/substitute-filtered-tables-and-weights.perl \\
$eval_file.filtered/moses.ini $TRANSLIT_MODEL/model/moses.ini \\
$TRANSLIT_MODEL/tuning/moses.tuned.ini $eval_file.filtered.ini`;
`$MOSES_SRC/bin/moses \
-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000 \
-threads 16 -drop-unknown -distortion-limit 0 \
-n-best-list $eval_file.op.nBest 50 \
-f $eval_file.filtered.ini \
< $eval_file \
`$MOSES_SRC/bin/moses \\
-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000 \\
-threads 16 -drop-unknown -distortion-limit 0 \\
-n-best-list $eval_file.op.nBest 50 \\
-f $eval_file.filtered.ini \\
< $eval_file \\
> $eval_file.op`;
}

View File

@ -118,80 +118,80 @@ sub learn_transliteration_model{
print "Align Corpus\n";
`$MOSES_SRC_DIR/scripts/training/train-model.perl \
-mgiza -mgiza-cpus 10 -dont-zip -last-step 1 \
-external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \
-e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \
-score-options '--KneserNey' -corpus $OUT_DIR/training/corpus$t \
`$MOSES_SRC_DIR/scripts/training/train-model.perl \\
-mgiza -mgiza-cpus 10 -dont-zip -last-step 1 \\
-external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \\
-e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \\
-score-options '--KneserNey' -corpus $OUT_DIR/training/corpus$t \\
-corpus-dir $OUT_DIR/training/prepared`;
`$MOSES_SRC_DIR/scripts/training/train-model.perl -mgiza -mgiza-cpus 10 \
-dont-zip -first-step 2 -last-step 2 \
-external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \
-e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \
-score-options '--KneserNey' -corpus-dir $OUT_DIR/training/prepared \
`$MOSES_SRC_DIR/scripts/training/train-model.perl -mgiza -mgiza-cpus 10 \\
-dont-zip -first-step 2 -last-step 2 \\
-external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \\
-e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \\
-score-options '--KneserNey' -corpus-dir $OUT_DIR/training/prepared \\
-giza-e2f $OUT_DIR/training/giza -direction 2`;
`$MOSES_SRC_DIR/scripts/training/train-model.perl \
-mgiza -mgiza-cpus 10 -dont-zip -first-step 2 -last-step 2 \
-external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \
-e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \
-score-options '--KneserNey' -corpus-dir $OUT_DIR/training/prepared \
`$MOSES_SRC_DIR/scripts/training/train-model.perl \\
-mgiza -mgiza-cpus 10 -dont-zip -first-step 2 -last-step 2 \\
-external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \\
-e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \\
-score-options '--KneserNey' -corpus-dir $OUT_DIR/training/prepared \\
-giza-f2e $OUT_DIR/training/giza-inverse -direction 1`;
`$MOSES_SRC_DIR/scripts/training/train-model.perl \
-mgiza -mgiza-cpus 10 -dont-zip -first-step 3 -last-step 3 \
-external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \
-e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \
-score-options '--KneserNey' -giza-e2f $OUT_DIR/training/giza \
-giza-f2e $OUT_DIR/training/giza-inverse \
-alignment-file $OUT_DIR/model/aligned \
`$MOSES_SRC_DIR/scripts/training/train-model.perl \\
-mgiza -mgiza-cpus 10 -dont-zip -first-step 3 -last-step 3 \\
-external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \\
-e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \\
-score-options '--KneserNey' -giza-e2f $OUT_DIR/training/giza \\
-giza-f2e $OUT_DIR/training/giza-inverse \\
-alignment-file $OUT_DIR/model/aligned \\
-alignment-stem $OUT_DIR/model/aligned -alignment grow-diag-final-and`;
print "Train Translation Models\n";
`$MOSES_SRC_DIR/scripts/training/train-model.perl \
-mgiza -mgiza-cpus 10 -dont-zip -first-step 4 -last-step 4 \
-external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \
-e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \
-score-options '--KneserNey' -lexical-file $OUT_DIR/model/lex \
-alignment-file $OUT_DIR/model/aligned \
-alignment-stem $OUT_DIR/model/aligned \
`$MOSES_SRC_DIR/scripts/training/train-model.perl \\
-mgiza -mgiza-cpus 10 -dont-zip -first-step 4 -last-step 4 \\
-external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \\
-e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \\
-score-options '--KneserNey' -lexical-file $OUT_DIR/model/lex \\
-alignment-file $OUT_DIR/model/aligned \\
-alignment-stem $OUT_DIR/model/aligned \\
-corpus $OUT_DIR/training/corpus$t`;
`$MOSES_SRC_DIR/scripts/training/train-model.perl \
-mgiza -mgiza-cpus 10 -dont-zip -first-step 5 -last-step 5 \
-external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \
-e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \
-score-options '--KneserNey' -alignment-file $OUT_DIR/model/aligned \
-alignment-stem $OUT_DIR/model/aligned -extract-file \
`$MOSES_SRC_DIR/scripts/training/train-model.perl \\
-mgiza -mgiza-cpus 10 -dont-zip -first-step 5 -last-step 5 \\
-external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \\
-e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \\
-score-options '--KneserNey' -alignment-file $OUT_DIR/model/aligned \\
-alignment-stem $OUT_DIR/model/aligned -extract-file \\
$OUT_DIR/model/extract -corpus $OUT_DIR/training/corpus$t`;
`$MOSES_SRC_DIR/scripts/training/train-model.perl \
-mgiza -mgiza-cpus 10 -dont-zip -first-step 6 -last-step 6 \
-external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \
-e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \
-score-options '--KneserNey' -extract-file $OUT_DIR/model/extract \
-lexical-file $OUT_DIR/model/lex -phrase-translation-table \
`$MOSES_SRC_DIR/scripts/training/train-model.perl \\
-mgiza -mgiza-cpus 10 -dont-zip -first-step 6 -last-step 6 \\
-external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \\
-e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \\
-score-options '--KneserNey' -extract-file $OUT_DIR/model/extract \\
-lexical-file $OUT_DIR/model/lex -phrase-translation-table \\
$OUT_DIR/model/phrase-table`;
print "Train Language Models\n";
`$SRILM_DIR/ngram-count \
-order 5 -interpolate -kndiscount -addsmooth1 0.0 -unk \
`$SRILM_DIR/ngram-count \\
-order 5 -interpolate -kndiscount -addsmooth1 0.0 -unk \\
-text $OUT_DIR/lm/target -lm $OUT_DIR/lm/targetLM`;
`$MOSES_SRC_DIR/bin/build_binary \
`$MOSES_SRC_DIR/bin/build_binary \\
$OUT_DIR/lm/targetLM $OUT_DIR/lm/targetLM.bin`;
print "Create Config File\n";
`$MOSES_SRC_DIR/scripts/training/train-model.perl \
-mgiza -mgiza-cpus 10 -dont-zip -first-step 9 \
-external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \
-e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \
-score-options '--KneserNey' \
-phrase-translation-table $OUT_DIR/model/phrase-table \
`$MOSES_SRC_DIR/scripts/training/train-model.perl \\
-mgiza -mgiza-cpus 10 -dont-zip -first-step 9 \\
-external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \\
-e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \\
-score-options '--KneserNey' \\
-phrase-translation-table $OUT_DIR/model/phrase-table \\
-config $OUT_DIR/model/moses.ini -lm 0:5:$OUT_DIR/lm/targetLM.bin:8`;
}

View File

@ -194,21 +194,19 @@ raw-corpus = $toy-data/nc-5k.$output-extension
[LM:bilingual-lm]
#bilingual-lm
exclude-from-interpolation = true
#required settings
bilingual-lm = "yes"
bilingual-lm-workdir = "bilingual"
bilingual-lm-settings = ""
order = "5"
source-window = "4"
nplm-dir = "/mnt/gna0/rsennrich/tools/nplm-0.3-gpu-experimental/"
#actual training
train_order = "14" #this is equal to order + 2*source-window + 1
nplm-output-dir = "nplm_out"
nplm-settings = "-l /mnt/gna0/rsennrich/tools/nplm-0.3-gpu-experimental/"
# Add extra settings for ngram extraction or nplm training
#bilingual-lm-settings = ""
#nplm-settings = ""
#Config file generation:
config-feature-line = "BilingualNPLM order=$order source_window=$source-window path=$working-dir/$nplm-output-dir/train.10k.model.nplm.10 source_vocab=$working-dir/$bilingual-lm-workdir/vocab.source target_vocab=$working-dir/$bilingual-lm-workdir/vocab.target"
config-weight-line = "BilingualNPLM0= 0.1"
# Defaults to 10
#epochs = 2
#################################################################
# INTERPOLATING LANGUAGE MODELS

View File

@ -61,6 +61,7 @@ factorize
rerun-on-change: TRAINING:input-factors TRAINING:output-factors
default-name: corpus/factored
pass-unless: TRAINING:input-factors
pass-if: factorize-after-split
parallelizable: yes
error: can't open
error: incompatible number of words in factor
@ -112,6 +113,15 @@ post-split-clean-syntax
pass-unless: input-splitter output-splitter
template: $moses-script-dir/training/clean-corpus-n.perl IN $input-extension $output-extension OUT 1 10000 OUT.lines-retained --ignore-xml
error: there is a blank factor
post-split-factorize
in: clean-split-stem
out: post-split-factorized-stem
rerun-on-change: TRAINING:input-factors TRAINING:output-factors
default-name: corpus/split-factored
pass-unless: AND TRAINING:input-factors factorize-after-split
parallelizable: yes
error: can't open
error: incompatible number of words in factor
[RECASING] single
tokenize
@ -160,20 +170,24 @@ train
ignore-if: no-splitter-training
[LM] multiple
prepare-bilingual-nplm
prepare-bilingual-lm
in: TRAINING:corpus TRAINING:word-alignment
out: numberized_ngrams
ignore-unless: bilingual-lm
rerun-on-change: TRAINING:corpus TRAINING:word-alignment
template: $moses-script-dir/training/bilingual-lm/extract_training.py -c IN0 -e $output-extension -f $input-extension -a IN1.$TRAINING:alignment-symmetrization-method -w $working-dir/$bilingual-lm-workdir -n $order -m $source-window $bilingual-lm-settings
default-name: lm/bilingualLM_prep
default-name: lm/blm
train-bilingual-lm
in: numberized_ngrams TRAINING:corpus
out: binlm
ignore-unless: bilingual-lm
rerun-on-change: numberized_ngrams
template: $moses-script-dir/training/bilingual-lm/train_nplm.py -w $working-dir/$bilingual-lm-workdir -c IN1 -r $working-dir/$nplm-output-dir -n $train_order $nplm-settings
default-name: lm/bilingualLM
default-name: lm/blm
train-nplm
in: stripped-corpus
out: binlm
ignore-unless: nplm
rerun-on-change: stripped-corpus
default-name: lm/nplm
get-corpus
in: get-corpus-script
out: raw-corpus
@ -207,6 +221,7 @@ factorize
out: factorized-corpus
default-name: lm/factored
pass-unless: factors
pass-if: factorize-after-split
ignore-if: concatenate-files concatenate-files-split
parallelizable: yes
error: can't open
@ -238,8 +253,17 @@ split
pass-unless: output-splitter
ignore-if: concatenate-files concatenate-files-split
template: $output-splitter -model IN1.$output-extension < IN > OUT
strip
post-split-factorize
in: split-corpus
out: split-factorized-corpus
default-name: lm/split-factored
pass-unless: AND factors factorize-after-split
ignore-if: concatenate-files concatenate-files-split
parallelizable: yes
error: can't open
error: incompatible number of words in factor
strip
in: split-factorized-corpus
out: stripped-corpus
default-name: lm/stripped
pass-unless: mock-output-parser-lm
@ -261,7 +285,7 @@ train
in: stripped-corpus
out: lm
default-name: lm/lm
ignore-if: rlm-training custom-training bilingual-lm
ignore-if: rlm-training custom-training bilingual-lm nplm
rerun-on-change: lm-training order settings
template: $lm-training -order $order $settings -text IN -lm OUT
error: cannot execute binary file
@ -278,7 +302,7 @@ train-custom
template: $custom-training -text IN -lm OUT
final-model: yes
train-custom-syntax
in: split-corpus
in: split-factorized-corpus
out: binlm
default-name: lm/custom-lm
rerun-on-change: custom-training
@ -337,6 +361,7 @@ factorize-tuning
out: factorized-tuning
default-name: lm/interpolate-tuning.factored
pass-unless: TRAINING:output-factors
pass-if: factorize-after-split
parallelizable: yes
error: can't open
error: incompatible number of words in factor
@ -361,8 +386,16 @@ split-tuning
default-name: lm/interpolate-tuning.split
pass-unless: output-splitter
template: $output-splitter -model IN1.$output-extension < IN > OUT
post-split-factorize-tuning
in: split-tuning
out: post-split-factorized-tuning
default-name: lm/interpolate-tuning.split-factored
pass-unless: AND TRAINING:output-factors factorize-after-split
parallelizable: yes
error: can't open
error: incompatible number of words in factor
strip-tuning
in: split-tuning
in: post-split-factorized-tuning
out: stripped-tuning
default-name: lm/interpolate-tuning.stripped
pass-unless: mock-output-parser-lm
@ -490,12 +523,12 @@ train-in-mono
template: $moses-script-dir/ems/support/mml-train.perl -in-source IN -in-target IN1 -out-source IN2.$input-extension -out-target IN2.$output-extension -model OUT -lm-training "$lm-training" -order $order -lm-settings "$lm-settings" -lm-binarizer $lm-binarizer $settings
[TRAINING] single
consolidate
in: CORPUS:clean-split-stem
in: CORPUS:post-split-factorized-stem
out: corpus
default-name: corpus
template: $moses-script-dir/ems/support/consolidate-training-data.perl $input-extension $output-extension OUT IN
build-domains
in: CORPUS:clean-split-stem
in: CORPUS:post-split-factorized-stem
out: domains
default-name: model/domains
ignore-unless: domain-features mml-filter-corpora
@ -523,14 +556,14 @@ fast-align
in: prepared-data-fast-align
out: fast-alignment
rerun-on-change: fast-align-settings
ignore-if: fast-align-max-lines
ignore-if: fast-align-max-lines fast-align-save-model
template: $external-bin-dir/fast_align -i IN $fast-align-settings > OUT
default-name: fast-align
fast-align-inverse
in: prepared-data-fast-align
out: fast-alignment-inverse
rerun-on-change: fast-align-settings
ignore-if: fast-align-max-lines
ignore-if: fast-align-max-lines fast-align-save-model
template: $external-bin-dir/fast_align -i IN -r $fast-align-settings > OUT
default-name: fast-align-inverse
fast-align-in-parts
@ -539,7 +572,7 @@ fast-align-in-parts
rerun-on-change: fast-align-settings fast-align-max-lines
ignore-unless: fast-align-max-lines
tmp-name: training/tmp.fast-align
template: $moses-script-dir/ems/support/fast-align-in-parts.perl -bin $external-bin-dir/fast_align -i IN -max-lines $fast-align-max-lines -tmp TMP -settings '$fast-align-settings' > OUT
template: $moses-script-dir/ems/support/fast-align-in-parts.perl -bin $external-bin-dir/fast_align -i IN -max-lines $fast-align-max-lines -tmp TMP -settings '$fast-align-settings' -save-model '$?fast-align-save-model' -o OUT
default-name: fast-align
fast-align-in-parts-inverse
in: prepared-data-fast-align
@ -547,8 +580,24 @@ fast-align-in-parts-inverse
rerun-on-change: fast-align-settings fast-align-max-lines
ignore-unless: fast-align-max-lines
tmp-name: training/tmp.fast-align-inverse
template: $moses-script-dir/ems/support/fast-align-in-parts.perl -bin $external-bin-dir/fast_align -i IN -r -max-lines $fast-align-max-lines -tmp TMP -settings '$fast-align-settings' > OUT
template: $moses-script-dir/ems/support/fast-align-in-parts.perl -bin $external-bin-dir/fast_align -i IN -r -max-lines $fast-align-max-lines -tmp TMP -settings '$fast-align-settings' -save-model '$?fast-align-save-model' -o OUT
default-name: fast-align-inverse
fast-align-save-model
in: prepared-data-fast-align
out: fast-alignment
ignore-unless: fast-align-save-model
ignore-if: fast-align-max-lines
default-name: fast-align
tmp-name: training/tmp.fast-align
template: $external-bin-dir/fast_align -i IN $fast-align-settings -p OUT.parameters > OUT 2> OUT.log
fast-align-save-model-inverse
in: prepared-data-fast-align
out: fast-alignment-inverse
ignore-unless: fast-align-save-model
ignore-if: fast-align-max-lines
default-name: fast-align-inverse
tmp-name: training/tmp.fast-align-inverse
template: $external-bin-dir/fast_align -r -i IN $fast-align-settings -p OUT.parameters > OUT 2> OUT.log
symmetrize-fast-align
in: fast-alignment fast-alignment-inverse corpus-mml-prefilter=OR=corpus
out: word-alignment
@ -616,7 +665,7 @@ build-biconcor
final-model: yes
build-suffix-array
in: corpus-mml-postfilter=OR=word-alignment corpus-mml-postfilter=OR=corpus-mml-prefilter=OR=corpus
out: phrase-translation-table
out: sigtest-filter-phrase-translation-table
default-name: model/suffix-array
ignore-unless: suffix-array
error: usage
@ -688,11 +737,18 @@ build-ttable
final-model: yes
build-mmsapt
in: corpus-mml-postfilter=OR=word-alignment corpus-mml-postfilter=OR=corpus-mml-prefilter=OR=corpus
out: phrase-translation-table
out: sigtest-filter-phrase-translation-table
ignore-unless: mmsapt
default-name: model/phrase-table-mmsapt
template: $moses-script-dir/training/build-mmsapt.perl --alignment IN.$alignment-symmetrization-method --corpus IN1 --f $input-extension --e $output-extension --dir OUT --settings '$mmsapt'
final-model: yes
custom-phrase-table-pruning
in: phrase-translation-table
out: sigtest-filter-phrase-translation-table
ignore-unless: custom-phrase-table-pruning
ignore-if: mmsapt
template: $custom-phrase-table-pruning IN OUT
default-name: model/phrase-table-pruned
sigtest-filter-suffix-array
in: corpus-mml-postfilter=OR=corpus-mml-prefilter=OR=corpus
out: sigtest-filter-suffix-array
@ -714,7 +770,7 @@ sigtest-filter-ttable
out: sigtest-filter-phrase-translation-table
default-name: model/phrase-table-sigtest-filter
pass-unless: sigtest-filter
ignore-if: TRAINING:config
ignore-if: TRAINING:config custom-phrase-table-pruning
final-model: yes
sigtest-filter-reordering
in: reordering-table sigtest-filter-suffix-array
@ -761,6 +817,7 @@ create-config
rerun-on-change: decoding-steps alignment-factors translation-factors reordering-factors generation-factors lexicalized-reordering training-options script decoding-graph-backoff score-settings additional-ini mmsapt no-glue-grammar dont-tune-glue-grammar use-syntax-input-weight-feature
default-name: model/moses.ini
error: Unknown option
error: requires an argument
final-model: yes
binarize-config
in: config

View File

@ -853,7 +853,7 @@ sub delete_output {
`rm -r $file` if $EXECUTE;
}
# delete regular file that matches exactly
if (-e $file) {
elsif (-e $file) {
print "\tdelete file $file\n";
`rm $file` if $EXECUTE;
}
@ -864,14 +864,14 @@ sub delete_output {
foreach (`ls $dir`) {
chop;
next unless substr($_,0,length($f)) eq $f;
if (-e "$dir/$_") {
if (-d "$dir/$_") {
print "\tdelete directory $file\n";
`rm -r $dir/$_` if $EXECUTE;
}
elsif (-e "$dir/$_") {
print "\tdelete file $dir/$_\n";
`rm $dir/$_` if $EXECUTE;
}
else {
print "\tdelete directory $dir/$_\n";
`rm -r $dir/$_` if $EXECUTE;
}
}
}
@ -1119,13 +1119,13 @@ sub define_step {
next if $RE_USE[$i];
next if defined($PASS{$i});
next if &define_template($i);
if ($DO_STEP[$i] =~ /^CORPUS:(.+):factorize$/) {
if ($DO_STEP[$i] =~ /^CORPUS:(.+):(post-split-)?factorize$/) {
&define_corpus_factorize($i);
}
elsif ($DO_STEP[$i] eq 'SPLITTER:train') {
&define_splitter_train($i);
}
elsif ($DO_STEP[$i] =~ /^LM:(.+):factorize$/) {
elsif ($DO_STEP[$i] =~ /^LM:(.+):(post-split-)?factorize$/) {
&define_lm_factorize($i,$1);
}
elsif ($DO_STEP[$i] =~ /^LM:(.+):randomize$/ ||
@ -1135,6 +1135,15 @@ sub define_step {
elsif ($DO_STEP[$i] =~ /^LM:(.+):train-randomized$/) {
&define_lm_train_randomized($i,$1);
}
elsif ($DO_STEP[$i] =~ /^LM:(.+):train-bilingual-lm$/) {
&define_lm_train_bilingual_lm($i,$1);
}
elsif ($DO_STEP[$i] =~ /^LM:(.+):prepare-bilingual-lm$/) {
&define_lm_prepare_bilingual_lm($i,$1);
}
elsif ($DO_STEP[$i] =~ /^LM:(.+):train-nplm$/) {
&define_lm_train_nplm($i,$1);
}
elsif ($DO_STEP[$i] eq 'TRAINING:prepare-data') {
&define_training_prepare_data($i);
}
@ -1182,7 +1191,7 @@ sub define_step {
elsif ($DO_STEP[$i] eq 'TRAINING:create-config' || $DO_STEP[$i] eq 'TRAINING:create-config-interpolated-lm') {
&define_training_create_config($i);
}
elsif ($DO_STEP[$i] eq 'INTERPOLATED-LM:factorize-tuning') {
elsif ($DO_STEP[$i] =~ /^INTERPOLATED-LM:(post-split-)?factorize-tuning$/) {
&define_interpolated_lm_factorize_tuning($i);
}
elsif ($DO_STEP[$i] eq 'INTERPOLATED-LM:interpolate') {
@ -1777,6 +1786,95 @@ sub define_lm_train_randomized {
&create_step($step_id,$cmd);
}
sub define_lm_train_bilingual_lm {
my ($step_id,$set) = @_;
my ($working_dir, $ngrams, $corpus) = &get_output_and_input($step_id);
my $scripts = &check_backoff_and_get("LM:moses-script-dir");
my $cmd = "$scripts/training/bilingual-lm/train_nplm.py -w $working_dir -c $corpus -r $working_dir";
my $nplm_dir = &check_backoff_and_get("LM:$set:nplm-dir");
$cmd .= " -l $nplm_dir";
my ($n, $m, $total_order) = &get_bilingual_lm_order($set);
$cmd .= " -n $total_order";
my $epochs = &get_bilingual_lm_epochs($set);
$cmd .= " -e $epochs" if defined($epochs);
my $nplm_settings = &backoff_and_get("LM:$set:nplm-settings");
$cmd .= " $nplm_settings" if defined($nplm_settings);
# Create the ini file
$cmd .= "\n";
$cmd .= "$scripts/training/bilingual-lm/create_blm_ini.py -w $working_dir -n $n -m $m -x $set -e $epochs";
&create_step($step_id,$cmd);
}
sub define_lm_prepare_bilingual_lm {
my ($step_id,$set) = @_;
my ($working_dir, $corpus, $align) = &get_output_and_input($step_id);
my $scripts = &check_backoff_and_get("LM:moses-script-dir");
my $cmd = "$scripts/training/bilingual-lm/extract_training.py -w $working_dir -c $corpus";
my $input_extension = &check_backoff_and_get("GENERAL:input-extension");
my $output_extension = &check_backoff_and_get("GENERAL:output-extension");
$cmd .= " -e $output_extension -f $input_extension";
my $align_method = &check_backoff_and_get("TRAINING:alignment-symmetrization-method");
$cmd .= " -a $align.$align_method";
my ($n, $m, $total_order) = &get_bilingual_lm_order($set);
$cmd .= " -n $n -m $m";
my $bilingual_settings = &backoff_and_get("LM:$set:bilingual-lm-settings");
$cmd .= " $bilingual_settings" if defined($bilingual_settings);
&create_step($step_id,$cmd);
}
sub define_lm_train_nplm {
my ($step_id,$set) = @_;
my ($working_dir, $corpus) = &get_output_and_input($step_id);
my $scripts = &check_backoff_and_get("LM:moses-script-dir");
my $cmd = "$scripts/training/train-neurallm.py --mmap --working-dir $working_dir --corpus $corpus";
my $nplm_dir = &check_backoff_and_get("LM:$set:nplm-dir");
$cmd .= " --nplm-home $nplm_dir";
my $epochs = &backoff_and_get("LM:$set:epochs");
$epochs = 2 unless defined($epochs);
$cmd .= " --epochs $epochs";
my $nplm_settings = &backoff_and_get("LM:$set:nplm-settings");
$cmd .= " $nplm_settings" if defined($nplm_settings);
my $order = &backoff_and_get("LM:$set:order");
$order = 5 unless defined($order);
$cmd .= " --order $order";
# Create the ini file
$cmd .= "\n";
$cmd .= "$scripts/training/create_nplm_ini.py -w $working_dir -e $epochs -x $set -n $order";
&create_step($step_id,$cmd);
}
sub get_bilingual_lm_order {
my ($set) = @_;
my $order = &backoff_and_get("LM:$set:order");
$order = 5 unless defined ($order);
my $source_window = &backoff_and_get("LM:$set:source-window");
$source_window = 4 unless defined($source_window);
return ($order, $source_window, $order + 2*$source_window+1);
}
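# Background (cf. the removed "train_order" comment in the bilingual-lm
# config section): the joint n-gram spans the target history of length
# $order plus $source_window source words on either side of the aligned
# source word, hence order + 2*source-window + 1 in total.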
sub get_bilingual_lm_epochs {
my ($set) = @_;
my $epochs = &backoff_and_get("LM:$set:epochs");
$epochs = 10 unless defined($epochs);
return $epochs;
}
sub define_lm_randomize {
my ($step_id,$set_dummy) = @_;
@ -2548,7 +2646,8 @@ sub define_training_create_config {
}
# sparse lexical features provide additional content for config file
$cmd .= "-additional-ini-file $sparse_lexical_features.ini " if $sparse_lexical_features;
my @additional_ini_files;
push (@additional_ini_files, "$sparse_lexical_features.ini") if $sparse_lexical_features;
my @LM_SETS = &get_sets("LM");
my %INTERPOLATED_AWAY;
@ -2599,8 +2698,11 @@ sub define_training_create_config {
if (&get("LM:$set:config-feature-line") && &get("LM:$set:config-weight-line")) {
$feature_lines .= &get("LM:$set:config-feature-line") . ";";
$weight_lines .= &get("LM:$set:config-weight-line") . ";";
}
else {
} elsif (&get("LM:$set:nplm")) {
push(@additional_ini_files, "$lm/nplm.ini");
} elsif (&get("LM:$set:bilingual-lm")) {
push(@additional_ini_files, "$lm/blm.ini");
} else {
my $order = &check_backoff_and_get("LM:$set:order");
my $lm_file = "$lm";
@ -2629,13 +2731,17 @@ sub define_training_create_config {
}
}
if (defined($feature_lines)) {
if ($feature_lines) {
$cmd .= "-config-add-feature-lines \"$feature_lines\" ";
}
if (defined($weight_lines)) {
if ($weight_lines) {
$cmd .= "-config-add-weight-lines \"$weight_lines\" ";
}
if (@additional_ini_files) {
$cmd .= "-additional-ini-file " . join(":", @additional_ini_files);
}
&create_step($step_id,$cmd);
}
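The colon-joined list built here is unpacked again by train-model.perl's create_ini, which this commit extends with a matching split on ':' (see below). A small round-trip sketch with hypothetical paths:
my @additional_ini_files = ("lm3/nplm.ini", "lm4/blm.ini");
my $arg = join(":", @additional_ini_files);   # "lm3/nplm.ini:lm4/blm.ini"
for my $aif (split /:/, $arg) {
    print "appending contents of $aif to moses.ini\n";
}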
@ -2795,7 +2901,8 @@ sub get_interpolated_lm_sets {
my $count=0;
my $icount=0;
foreach my $set (@LM_SETS) {
next if (&get("LM:$set:exclude-from-interpolation"));
next if (&get("LM:$set:exclude-from-interpolation")) or (&get("LM:$set:bilingual-lm"))
or (&get("LM:$set:nplm"));
my $order = &check_backoff_and_get("LM:$set:order");
my $factor = 0;
@ -2831,6 +2938,7 @@ sub get_training_setting {
my $pcfg = &get("TRAINING:use-pcfg-feature");
my $baseline_alignment = &get("TRAINING:baseline-alignment-model");
my $no_glue_grammar = &get("TRAINING:no-glue-grammar");
my $mmsapt = &get("TRAINING:mmsapt");
my $xml = $source_syntax || $target_syntax;
@ -2855,6 +2963,7 @@ sub get_training_setting {
$cmd .= "-parallel " if $parallel;
$cmd .= "-pcfg " if $pcfg;
$cmd .= "-baseline-alignment-model $baseline_alignment " if defined($baseline_alignment) && ($step == 1 || $step == 2);
$cmd .= "-mmsapt " if defined($mmsapt);
# factored training
if (&backoff_and_get("TRAINING:input-factors")) {
@ -3454,12 +3563,20 @@ sub define_template {
}
$cmd =~ s/VERSION/$VERSION/g;
print "\tcmd is $cmd\n" if $VERBOSE;
while ($cmd =~ /^([\S\s]*)\$\{([^\s\/\"\']+)\}([\S\s]*)$/ ||
$cmd =~ /^([\S\s]*)\$([^\s\/\"\']+)([\S\s]*)$/) {
my ($pre,$variable,$post) = ($1,$2,$3);
$cmd = $pre
. &check_backoff_and_get(&extend_local_name($module,$set,$variable))
. $post;
# replace variables
while ($cmd =~ /^([\S\s]*)\$(\??)\{([^\s\/\"\']+)\}([\S\s]*)$/ ||
$cmd =~ /^([\S\s]*)\$(\??)([^\s\/\"\']+)([\S\s]*)$/) {
my ($pre,$optional,$variable,$post) = ($1,$2,$3,$4);
my $value;
if ($optional eq '?') {
$value = &backoff_and_get(&extend_local_name($module,$set,$variable));
$value = "" unless $value;
}
else {
$value = &check_backoff_and_get(&extend_local_name($module,$set,$variable));
}
$cmd = $pre.$value.$post;
}
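A self-contained sketch of the substitution loop above, with a hypothetical template and a settings hash standing in for check_backoff_and_get: plain ${VAR} must resolve, while the new optional form $?{VAR} falls back to the empty string:
#!/usr/bin/env perl
use strict; use warnings;
my %CONFIG = ("corpus-stem" => "/data/europarl");   # "extra-flags" deliberately unset
my $cmd = 'train.perl -corpus ${corpus-stem} $?{extra-flags}';
while ($cmd =~ /^([\S\s]*)\$(\??)\{([^\s\/\"\']+)\}([\S\s]*)$/) {
    my ($pre,$optional,$variable,$post) = ($1,$2,$3,$4);
    my $value = $CONFIG{$variable};
    die "parameter $variable not defined" if !defined($value) && $optional ne '?';
    $cmd = $pre . (defined($value) ? $value : "") . $post;
}
print "$cmd\n";   # "train.perl -corpus /data/europarl "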
# deal with pipelined commands

View File

@ -12,28 +12,33 @@ use warnings;
use strict;
use Getopt::Long qw(:config pass_through no_ignore_case permute);
my ($BIN,$IN,$MAX_LINES,$SETTINGS,$REVERSE,$TMP);
my ($BIN,$IN,$OUT,$MAX_LINES,$SETTINGS,$REVERSE,$SAVE_MODEL,$TMP);
GetOptions('bin=s' => \$BIN,
'i=s' => \$IN,
'o=s' => \$OUT,
'max-lines=i' => \$MAX_LINES,
'settings=s' => \$SETTINGS,
'save-model=s' => \$SAVE_MODEL,
'r' => \$REVERSE,
'tmp=s' => \$TMP,
) or exit(1);
die("ERROR - usage: fast-align-in-parts.perl -bin FAST_ALIGN_BIN -i PARALLEL_CORPUS -max-lines COUNT -settings CONFIG [-r] -tmp TMPDIR")
unless defined($BIN) && defined($IN) && defined($SETTINGS) && defined($TMP) && defined($MAX_LINES)
die("ERROR - usage: fast-align-in-parts.perl -bin FAST_ALIGN_BIN -i PARALLEL_CORPUS -max-lines COUNT -settings CONFIG [-r] -tmp TMPDIR [-save-model MODEL] -o ALIGNMENTS")
unless defined($BIN) && defined($IN) && defined($SETTINGS) && defined($TMP)
&& defined($MAX_LINES) && defined($OUT)
&& $MAX_LINES > 0;
die("ERROR - input file does not exist: $IN") unless -e $IN;
die("ERROR - fast_align binary does not exist: $BIN") unless -e $BIN;
$SAVE_MODEL = defined($SAVE_MODEL) && $SAVE_MODEL && $SAVE_MODEL ne 'no';
chomp(my $line_count = `cat $IN | wc -l`);
# not more than the maximal number of lines -> just run it regularly
if ($MAX_LINES > $line_count) {
my $cmd = "$BIN -i $IN $SETTINGS";
my $cmd = "$BIN -i $IN $SETTINGS >$OUT";
$cmd .= " -r" if defined($REVERSE);
$cmd .= " -p $OUT.parameters 2> $OUT.log" if $SAVE_MODEL;
safesystem($cmd) or die;
exit(0);
}
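A hypothetical invocation of the extended wrapper, now that the alignments go to -o and the model can be kept:
fast-align-in-parts.perl -bin ~/bin/fast_align -i corpus.fr-en \
  -max-lines 1000000 -settings '-d -o -v' -tmp /tmp/fa.$$ \
  -save-model yes -o corpus.align
(All paths and the fast_align settings string are placeholders.)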
@ -56,6 +61,7 @@ foreach my $input_file (@INPUT_FILES) {
# process part
my $cmd = "$BIN -i $input_file $SETTINGS";
$cmd .= " -r" if defined($REVERSE);
$cmd .= " -p $output_file.parameters 2> $output_file.log" if $SAVE_MODEL;
$cmd .= " >$output_file";
safesystem($cmd) or die;
die("ERROR: no output produced from command $cmd") unless -e $output_file;
@ -67,12 +73,63 @@ foreach my $input_file (@INPUT_FILES) {
}
# join output
$cmd = "cat $TMP/aligned-*";
$cmd = "cat $TMP/aligned-?? > $OUT";
safesystem($cmd) or die;
$cmd = "rm -r $TMP/* ; rmdir $TMP";
# join model
&join_model(scalar @INPUT_FILES) if $SAVE_MODEL;
&join_log(scalar @INPUT_FILES) if $SAVE_MODEL;
$cmd = "rm $TMP/* ; rmdir $TMP";
safesystem($cmd);
sub join_model {
my ($count) = @_;
open(CONCAT,"cat $TMP/aligned-*.parameters | LC_ALL=C sort -T $TMP -S 10%|");
open(JOINED,">$OUT.parameters");
my ($last_f,$last_e,$f,$e,$score,$merged_score);
while(<CONCAT>) {
($f,$e,$score) = split;
if (!defined($last_f) || $f ne $last_f || $e ne $last_e) {
printf JOINED "%s %s %f\n",$last_f,$last_e,log($merged_score) if defined($last_f);
$last_f = $f;
$last_e = $e;
$merged_score = 0;
}
$merged_score += exp($score)/$count;
}
printf JOINED "%s %s %f\n",$f,$e,log($merged_score) if defined($f);
close(CONCAT);
close(JOINED);
}
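join_model averages the per-part parameters in probability space rather than log space: each part contributes exp(score)/count, and only the sum is logged again. A minimal sketch with made-up numbers:
use List::Util qw(sum);
my $count  = 2;              # number of corpus parts (hypothetical)
my @scores = (-1.2, -0.8);   # log probs of one (f,e) pair, one per part
my $merged = log( sum(map { exp($_) } @scores) / $count );
printf "%f\n", $merged;      # log((e^-1.2 + e^-0.8)/2), about -0.98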
sub merge_entry {
my ($count,$f,$e,@SCORE) = @_;
my $score = 0;
foreach (@SCORE) {
$score += exp($_)/$count;
}
$score = log($score);
print JOINED "$f $e $score\n";
}
sub join_log {
my ($count) = @_;
open(CONCAT,"cat $TMP/aligned-*.log |");
my ($length,$tension,$tension_count) = (0,0,0);
while(<CONCAT>) {
$length += $1 if /expected target length = source length \* ([\d\.]+)/;
$tension += $1 if /final tension: ([\d\.]+)/ and (++$tension_count % 3 == 0);
}
close(CONCAT);
$length /= $count;
$tension /= $count;
open(JOINED,">$OUT.log");
print JOINED "expected target length = source length * $length\n";
print JOINED " final tension: $tension\n";
close(JOINED);
}
sub safesystem {
print STDERR "Executing: @_\n";
system(@_);

View File

@ -0,0 +1,52 @@
#!/usr/bin/env python
#
# This file is part of moses. Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.
import argparse
import os
import os.path
import sys
def main():
parser = argparse.ArgumentParser()
parser.add_argument("-w", "--working-dir", dest="working_dir")
parser.add_argument("-n", "--target-context", dest="n")
parser.add_argument("-m", "--source-context", dest="m")
parser.add_argument("-i", "--ini_filename", dest="ini_filename")
parser.add_argument("-x", "--name", dest="name")
parser.add_argument("-e", "--epochs", dest="epochs")
parser.set_defaults(
working_dir="working",
n = "5",
m = "4",
ini_filename = "blm.ini",
name = "comb",
epochs = "10"
)
options = parser.parse_args()
if not os.path.exists(options.working_dir):
os.makedirs(options.working_dir)
# Bit of a hack, parse the working directory to get the name
name = os.path.basename(options.working_dir).split(".")[0].split("-")[-1]
ini_filename = os.path.join(options.working_dir,options.ini_filename)
with open(ini_filename,"w") as ifh:
print>>ifh, "[feature]"
print>>ifh,"BilingualNPLM name=BLM%s order=%s source_window=%s path=%s/train.10k.model.nplm.%s source_vocab=%s/vocab.source target_vocab=%s/vocab.target" \
% (options.name,options.n, options.m, options.working_dir, options.epochs, options.working_dir, options.working_dir)
print>>ifh
print>>ifh,"[weight]"
print>>ifh,"BLM%s= 0.1" % options.name
print>>ifh
if __name__ == "__main__":
main()
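With the defaults above (name comb, n 5, m 4, epochs 10) and a hypothetical working directory lm5, the emitted blm.ini would read:
[feature]
BilingualNPLM name=BLMcomb order=5 source_window=4 path=lm5/train.10k.model.nplm.10 source_vocab=lm5/vocab.source target_vocab=lm5/vocab.target
[weight]
BLMcomb= 0.1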

View File

@ -37,9 +37,7 @@ while (defined $_) {
$nr++;
print STDERR "." if $nr % 10000 == 0;
print STDERR "($nr)" if $nr % 100000 == 0;
chomp;
s/\s+/ /g; s/^ //; s/ $//;
my @intokens = split / /;
my ($intokens,$MARKUP) = split_xml($_);
# load lines of corresponding streams and ensure equal number of words
my @lines_of_extratoks;
foreach my $factor (0..$#streams) {
@ -49,14 +47,17 @@ while (defined $_) {
chomp($line);
$line =~ s/\s+/ /g; $line =~ s/^ //; $line =~ s/ $//;
my @toks = split / /, $line;
die "Incompatible number of words in factor $factor on line $nr. ($#toks != $#intokens)"
if $#toks != $#intokens;
die "Incompatible number of words in factor $factor on line $nr. ($#toks != $#$intokens)"
if $#toks != $#$intokens;
$lines_of_extratoks[$factor] = \@toks;
}
# for every token, print the factors in the order the user wished
for(my $i=0; $i<=$#intokens; $i++) {
my $token = $intokens[$i];
for(my $i=0; $i<=$#$intokens; $i++) {
print " " if $i && $$MARKUP[$i] eq '';
print $$MARKUP[$i];
my $token = $$intokens[$i];
my @outtoken = ();
push @outtoken, $token; # add the first one
# print STDERR "Token: $token\n";
@ -69,11 +70,56 @@ while (defined $_) {
print " " if $i != 0;
print join("|", @outtoken);
}
print $$MARKUP[$#$MARKUP];
print "\n";
$_ = readline($firststream);
}
close $firststream;
print STDERR "Done.\n";
# store away xml markup
sub split_xml {
my ($line) = @_;
my (@WORD,@MARKUP);
my $i = 0;
$MARKUP[0] = "";
while($line =~ /\S/) {
# XML tag
if ($line =~ /^\s*(<\S[^>]*>)(.*)$/) {
my $potential_xml = $1;
my $line_next = $2;
# exception for factor that is an XML tag
if ($line =~ /^\S/ && scalar(@WORD)>0 && $WORD[$i-1] =~ /\|$/) {
$WORD[$i-1] .= $potential_xml;
if ($line_next =~ /^(\|+)(.*)$/) {
$WORD[$i-1] .= $1;
$line_next = $2;
}
}
else {
$MARKUP[$i] .= $potential_xml." ";
}
$line = $line_next;
}
# non-XML text
elsif ($line =~ /^\s*([^\s<>]+)(.*)$/) {
$WORD[$i++] = $1;
$MARKUP[$i] = "";
$line = $2;
}
# '<' or '>' occurs in word, but it's not an XML tag
elsif ($line =~ /^\s*(\S+)(.*)$/) {
$WORD[$i++] = $1;
$MARKUP[$i] = "";
$line = $2;
}
else {
die("ERROR: huh? $line\n");
}
}
chop($MARKUP[$#MARKUP]);
return (\@WORD,\@MARKUP);
}
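A usage sketch of split_xml on a hypothetical input line; @MARKUP carries the markup preceding each word, plus one trailing slot for markup after the last word:
my ($words, $markup) = split_xml('<seg id="1"> hello world </seg>');
# $words  : ( 'hello', 'world' )
# $markup : ( '<seg id="1"> ', '', '</seg>' )
# the main loop prints $$markup[$i] before word $i and $$markup[$#$markup] at the end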

View File

@ -0,0 +1,50 @@
#!/usr/bin/env python
#
# This file is part of moses. Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.
import argparse
import os
import os.path
import sys
def main():
parser = argparse.ArgumentParser()
parser.add_argument("-w", "--working-dir", dest="working_dir")
parser.add_argument("-n", "--order", dest="n")
parser.add_argument("-i", "--ini_filename", dest="ini_filename")
parser.add_argument("-x", "--name", dest="name")
parser.add_argument("-e", "--epochs", dest="epochs")
parser.add_argument("-f", "--factor", dest="factor")
parser.set_defaults(
working_dir="working",
n = "5",
ini_filename = "nplm.ini",
name = "neural",
epochs = "10",
factor = "0"
)
options = parser.parse_args()
if not os.path.exists(options.working_dir):
os.makedirs(options.working_dir)
ini_filename = os.path.join(options.working_dir,options.ini_filename)
with open(ini_filename,"w") as ifh:
print>>ifh, "[feature]"
print>>ifh,"NeuralLM factor=%s name=NPLM%s order=%s path=%s/train.model.nplm.%s" \
% (options.factor,options.name, options.n, options.working_dir, options.epochs)
print>>ifh
print>>ifh,"[weight]"
print>>ifh,"NPLM%s= 0.1" % options.name
print>>ifh
if __name__ == "__main__":
main()
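Analogously to blm.ini above, the defaults here (name neural, order 5, epochs 10, factor 0) with a hypothetical working directory lm3 would yield:
[feature]
NeuralLM factor=0 name=NPLMneural order=5 path=lm3/train.model.nplm.10
[weight]
NPLMneural= 0.1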

View File

@ -37,6 +37,12 @@ my $MAX_LENGTH = 10;
# utilities
my $ZCAT = "gzip -cd";
# sometimes you just have to do the right thing without asking
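# (i.e. probe whether this sort supports --compress-program, and gzip its temp files if so)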
my $sort_option = "";
if (`echo 'youcandoit' | sort --compress-program gzip 2>/dev/null` =~ /youcandoit/) {
$sort_option = "--compress-program gzip ";
}
# get optional parameters
my $opt_hierarchical = 0;
my $binarizer = undef;
@ -410,13 +416,13 @@ for(my $i=0;$i<=$#TABLE;$i++) {
# ... phrase translation model
elsif ($binarizer =~ /processPhraseTableMin/) {
#compact phrase table
my $cmd = "$catcmd $mid_file | LC_ALL=C sort -T $tempdir > $mid_file.sorted && $binarizer -in $mid_file.sorted -out $new_file -nscores $TABLE_WEIGHTS[$i] -threads $threads && rm $mid_file.sorted";
my $cmd = "$catcmd $mid_file | LC_ALL=C sort $sort_option -T $tempdir | gzip - > $mid_file.sorted.gz && $binarizer -in $mid_file.sorted.gz -out $new_file -nscores $TABLE_WEIGHTS[$i] -threads $threads && rm $mid_file.sorted.gz";
safesystem($cmd) or die "Can't binarize";
} elsif ($binarizer =~ /CreateOnDiskPt/) {
my $cmd = "$binarizer $mid_file $new_file.bin";
safesystem($cmd) or die "Can't binarize";
} else {
my $cmd = "$catcmd $mid_file | LC_ALL=C sort -T $tempdir | $binarizer -ttable 0 0 - -nscores $TABLE_WEIGHTS[$i] -out $new_file";
my $cmd = "$catcmd $mid_file | LC_ALL=C sort $sort_option -T $tempdir | $binarizer -ttable 0 0 - -nscores $TABLE_WEIGHTS[$i] -out $new_file";
safesystem($cmd) or die "Can't binarize";
}
}
@ -431,7 +437,7 @@ for(my $i=0;$i<=$#TABLE;$i++) {
$lexbin =~ s/PhraseTable/LexicalTable/;
my $cmd;
if ($lexbin =~ /processLexicalTableMin/) {
$cmd = "$catcmd $mid_file | LC_ALL=C sort -T $tempdir > $mid_file.sorted && $lexbin -in $mid_file.sorted -out $new_file -threads $threads && rm $mid_file.sorted";
$cmd = "$catcmd $mid_file | LC_ALL=C sort $sort_option -T $tempdir | gzip - > $mid_file.sorted.gz && $lexbin -in $mid_file.sorted.gz -out $new_file -threads $threads && rm $mid_file.sorted.gz";
} else {
$lexbin =~ s/^\s*(\S+)\s.+/$1/; # no options
$cmd = "$lexbin -in $mid_file -out $new_file";

View File

@ -89,11 +89,6 @@ def create_parser():
help=(
"Sentence end symbol. Will be skipped during extraction "
"(default: %(default)s)"))
parser.add_argument(
'--ptkvz', action='store_true',
help=(
"Special rule for German dependency trees: "
"concatenate separable verb prefix and verb."))
return parser
@ -107,22 +102,15 @@ def escape_text(s):
return s
def get_head(xml, add_ptkvz):
def get_head(xml):
"""Deterministic heuristic to get head of subtree."""
head = None
preterminal = None
for child in xml:
if not len(child):
if head is not None:
continue
preterminal = child.get('label')
head = escape_text(child.text.strip())
elif add_ptkvz and head and child.get('label') == 'avz':
for grandchild in child:
if grandchild.get('label') == 'PTKVZ':
head = escape_text(grandchild.text.strip()) + head
break
return head, preterminal
return head, preterminal
@ -159,7 +147,7 @@ def get_syntactic_ngrams(xml, options, vocab, output_vocab,
parent_labels = (
[vocab.get('<root_label>', 0)] * options.up_context)
head, preterminal = get_head(xml, options.ptkvz)
head, preterminal = get_head(xml)
if not head:
head = '<dummy_head>'
preterminal = head
@ -222,7 +210,7 @@ def get_syntactic_ngrams(xml, options, vocab, output_vocab,
preterminal_child = head_child
child_label = '<head_label>'
else:
head_child, preterminal_child = get_head(child, options.ptkvz)
head_child, preterminal_child = get_head(child)
child_label = child.get('label')
if head_child is None:

View File

@ -46,11 +46,6 @@ def create_parser():
parser.add_argument(
'--output', '-o', type=str, default='vocab', metavar='PREFIX',
help="Output prefix (default: 'vocab')")
parser.add_argument(
'--ptkvz', action="store_true",
help=(
"Special rule for German dependency trees: attach separable "
"verb prefixes to verb."))
return parser
@ -70,16 +65,9 @@ def get_head(xml, args):
preterminal = None
for child in xml:
if not len(child):
if head is not None:
continue
preterminal = child.get('label')
head = escape_text(child.text.strip())
elif args.ptkvz and head and child.get('label') == 'avz':
for grandchild in child:
if grandchild.get('label') == 'PTKVZ':
head = escape_text(grandchild.text.strip()) + head
break
return head, preterminal
return head, preterminal

View File

@ -1604,6 +1604,7 @@ sub extract_phrase {
$cmd .= " --InstanceWeights $_INSTANCE_WEIGHTS_FILE " if defined $_INSTANCE_WEIGHTS_FILE;
$cmd .= " --BaselineExtract $_BASELINE_EXTRACT" if defined($_BASELINE_EXTRACT) && $PHRASE_EXTRACT =~ /extract-parallel.perl/;
$cmd .= " --FlexibilityScore" if $_FLEXIBILITY_SCORE;
$cmd .= " --NoTTable" if $_MMSAPT;
map { die "File not found: $_" if ! -e $_ } ($alignment_file_e, $alignment_file_f, $alignment_file_a);
print STDERR "$cmd\n";
@ -1611,12 +1612,16 @@ sub extract_phrase {
if (defined($_BASELINE_EXTRACT) && $PHRASE_EXTRACT !~ /extract-parallel.perl/) {
print STDERR "merging with baseline extract from $_BASELINE_EXTRACT\n";
safesystem("$ZCAT $_BASELINE_EXTRACT.gz $extract_file$suffix.gz | $GZIP_EXEC > $extract_file.gz");
safesystem("$ZCAT $_BASELINE_EXTRACT.inv.gz $extract_file$suffix.inv.gz | $GZIP_EXEC > $extract_file.inv.gz");
safesystem("$ZCAT $_BASELINE_EXTRACT.gz $extract_file$suffix.gz | $GZIP_EXEC > $extract_file.gz")
if -e "$extract_file$suffix.gz";
safesystem("$ZCAT $_BASELINE_EXTRACT.inv.gz $extract_file$suffix.inv.gz | $GZIP_EXEC > $extract_file.inv.gz")
if -e "$extract_file$suffix.inv.gz";
safesystem("$ZCAT $_BASELINE_EXTRACT.o.gz $extract_file$suffix.o.gz | $GZIP_EXEC > $extract_file.o.gz")
if -e "$extract_file$suffix.o.gz";
safesystem("rm $extract_file$suffix.gz");
safesystem("rm $extract_file$suffix.inv.gz");
safesystem("rm $extract_file$suffix.gz")
if -e "$extract_file$suffix.gz";
safesystem("rm $extract_file$suffix.inv.gz")
if -e "$extract_file$suffix.inv.gz";
safesystem("rm $extract_file$suffix.o.gz")
if -e "$extract_file$suffix.o.gz";
}
@ -2343,7 +2348,9 @@ sub create_ini {
}
if ($_ADDITIONAL_INI_FILE) {
print INI "\n# additional settings\n\n";
print INI `cat $_ADDITIONAL_INI_FILE`;
for my $AIF (split (/:/, $_ADDITIONAL_INI_FILE)) {
print INI `cat $AIF`;
}
}
# feature functions and weights

View File

@ -122,10 +122,11 @@ def main(options):
if options.output_dir is None:
options.output_dir = options.working_dir
else:
# Create output dir if necessary
if not os.path.exists(options.output_dir):
os.makedirs(options.output_dir)
# Create dirs if necessary
if not os.path.exists(options.working_dir):
os.makedirs(options.working_dir)
if not os.path.exists(options.output_dir):
os.makedirs(options.output_dir)
numberized_file = os.path.basename(options.corpus_stem) + '.numberized'
train_file = numberized_file