Merge branch 'master' of https://github.com/moses-smt/mosesdecoder into mmt-dev

Conflicts: moses/Syntax/F2S/Manager-inl.h moses/TranslationModel/UG/mmsapt.cpp
2024-10-27 03:49:57 +03:00 · 2015-04-26 02:12:16 +01:00 · 2015-04-26 02:12:16 +01:00 · 0d72cdd72c
commit 0d72cdd72c
parent d773717ca0 10bd942127
222 changed files with 2014 additions and 538 deletions
--- a/biconcor/phrase-lookup.cpp
+++ b/biconcor/phrase-lookup.cpp
@ -109,14 +109,17 @@ size_t lookup( string query )
  return suffixArray.Count( queryString );
 }

+// Duplicate of definition in util/tokenize.hh.
+// TODO: Can we de-duplicate this?  At the time of writing biconcor does not
+// use util at all.
 vector<string> tokenize(const char input[])
 {
  vector< string > token;
  bool betweenWords = true;
  int start=0;
-  int i=0;
-  for(; input[i] != '\0'; i++) {
-    bool isSpace = (input[i] == ' ' || input[i] == '\t');
+  int i;
+  for(i = 0; input[i] != '\0'; i++) {
+    const bool isSpace = (input[i] == ' ' || input[i] == '\t');

    if (!isSpace && betweenWords) {
      start = i;
--- a/contrib/lmserver/examples/lmclient.cc
+++ b/contrib/lmserver/examples/lmclient.cc
@ -45,8 +45,8 @@ struct LMClient {
 	    exit(1);
    }

-    bzero((char *)&server, sizeof(server));
-    bcopy(hp->h_addr, (char *)&server.sin_addr, hp->h_length);
+    memset(&server, '\0', sizeof(server));
+    memcpy((char *)&server.sin_addr, hp->h_addr, hp->h_length);
    server.sin_family = hp->h_addrtype;
    server.sin_port = htons(port);

--- a/contrib/mada/qsub-madamira.perl
+++ b/contrib/mada/qsub-madamira.perl
@ -0,0 +1,46 @@
+#!/usr/bin/env perl
+
+# Writes one PBS job script per entry of a directory of split input files and
+# submits each script with qsub.  Every job runs MADAMIRA on its input file.
+#
+# Usage: qsub-madamira.perl <splitDir>
+#   <splitDir>  directory whose entries are each passed to MADAMIRA as -rawinput.
+#
+# Job scripts go to <parent of splitDir>/qsub; MADAMIRA is told to write its
+# output to <parent of splitDir>/out2.
+#
+# NOTE(review): the PBS working directory, PATH, module list and MADAMIRA
+# install location below are hard-coded absolute paths for one specific
+# cluster account; the script will need editing to run anywhere else.
+
+use warnings;
+use strict;
+use File::Slurp;
+use File::Basename;
+use Cwd 'abs_path';
+
+# NOTE(review): $ARGV[0] is used without checking that an argument was given.
+my $splitDir = $ARGV[0];
+$splitDir = abs_path($splitDir);
+
+# Directory entries; each one becomes a separate job.
+my @files = read_dir $splitDir;
+
+# Sibling directory of $splitDir that receives the generated job scripts.
+my $qsubDir=dirname($splitDir) ."/qsub";
+print STDERR "qsubDir=$qsubDir\n";
+`mkdir -p $qsubDir`;
+
+# Sibling directory of $splitDir passed to MADAMIRA as -rawoutdir.
+my $out2Dir=dirname($splitDir) ."/out2";
+print STDERR "out2Dir=$out2Dir\n";
+`mkdir -p $out2Dir`;
+
+for my $file ( @files ) {
+    # Progress trace: one file name per job.
+    print STDERR "$file ";
+
+    my $qsubFile = "$qsubDir/$file.sh";
+    # NOTE(review): the return value of open() is not checked, so a failure
+    # to create the job script is not reported here.
+    open(RUN_FILE, ">$qsubFile");
+    
+    # Job preamble: PBS directives, environment setup, then cd into the
+    # MADAMIRA release directory.
+    print RUN_FILE "#!/usr/bin/env bash\n" 
+	."#PBS -d/scratch/hh65/workspace/experiment/ar-en \n"
+        ."#PBS -l mem=5gb \n\n"
+	."export PATH=\"/scratch/statmt/bin:/share/apps/NYUAD/perl/gcc_4.9.1/5.20.1/bin:/share/apps/NYUAD/jdk/1.8.0_31/bin:/share/apps/NYUAD/zlib/gcc_4.9.1/1.2.8/bin:/share/apps/NYUAD/cmake/gcc_4.9.1/3.1.0-rc3/bin:/share/apps/NYUAD/boost/gcc_4.9.1/openmpi_1.8.3/1.57.0/bin:/share/apps/NYUAD/openmpi/gcc_4.9.1/1.8.3/bin:/share/apps/NYUAD/python/gcc_4.9.1/2.7.9/bin:/share/apps/NYUAD/gcc/binutils/2.21/el6/bin:/share/apps/NYUAD/gcc/gcc/4.9.1/el6/bin:/usr/lib64/qt-3.3/bin:/usr/local/bin:/bin:/usr/bin:/usr/local/sbin:/usr/sbin:/sbin:/opt/bio/ncbi/bin:/opt/bio/mpiblast/bin:/opt/bio/EMBOSS/bin:/opt/bio/clustalw/bin:/opt/bio/tcoffee/bin:/opt/bio/hmmer/bin:/opt/bio/phylip/exe:/opt/bio/mrbayes:/opt/bio/fasta:/opt/bio/glimmer/bin:/opt/bio/glimmer/scripts:/opt/bio/gromacs/bin:/opt/bio/gmap/bin:/opt/bio/tigr/bin:/opt/bio/autodocksuite/bin:/opt/bio/wgs/bin:/opt/ganglia/bin:/opt/ganglia/sbin:/opt/bin:/usr/java/latest/bin:/opt/pdsh/bin:/opt/rocks/bin:/opt/rocks/sbin:/opt/torque/bin:/opt/torque/sbin:/home/hh65/bin:/home/hh65/bin\" \n"
+
+	."module load  NYUAD/2.0 \n"
+	."module load gcc python/2.7.9 openmpi/1.8.3 boost cmake zlib jdk perl expat \n"
+
+	."cd /scratch/statmt/MADAMIRA-release-20140709-1.0 \n";
+    # The actual work: run MADAMIRA on this one input file.
+    print RUN_FILE "java -Xmx2500m -Xms2500m -XX:NewRatio=3 -jar /scratch/statmt/MADAMIRA-release-20140709-1.0/MADAMIRA.jar "
+	 ."-rawinput $splitDir/$file -rawoutdir $out2Dir -rawconfig /scratch/statmt/MADAMIRA-release-20140709-1.0/samples/sampleConfigFile.xml \n";
+
+    close(RUN_FILE);
+
+    # Submit the job; qsub's output is captured by the backticks and discarded.
+    my $cmd = "qsub $qsubFile";
+    `$cmd`;
+
+}
+
--- a/contrib/mira/Main.cpp
+++ b/contrib/mira/Main.cpp
@ -46,6 +46,7 @@ namespace mpi = boost::mpi;
 #include "moses/FF/PhrasePairFeature.h"
 #include "moses/FF/WordPenaltyProducer.h"
 #include "moses/LM/Base.h"
+#include "util/random.hh"

 using namespace Mira;
 using namespace std;
@ -54,6 +55,7 @@ namespace po = boost::program_options;

 int main(int argc, char** argv)
 {
+  util::rand_init();
  size_t rank = 0;
  size_t size = 1;
 #ifdef MPI_ENABLE
--- a/contrib/mira/Main.h
+++ b/contrib/mira/Main.h
@ -25,6 +25,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 #include "moses/Word.h"
 #include "moses/FF/FeatureFunction.h"
 #include "Decoder.h"
+#include "util/random.hh"

 typedef std::map<const Moses::FeatureFunction*, std::vector< float > > ProducerWeightMap;
 typedef std::pair<const Moses::FeatureFunction*, std::vector< float > > ProducerWeightPair;
@ -37,8 +38,7 @@ template <class T> bool from_string(T& t, const std::string& s, std::ios_base& (

 struct RandomIndex {
  ptrdiff_t operator()(ptrdiff_t max) {
-    srand(time(0));  // Initialize random number generator with current time.
-    return static_cast<ptrdiff_t> (rand() % max);
+    return util::rand_excl(max);
  }
 };

--- a/contrib/other-builds/manual-label/manual-label.project
+++ b/contrib/other-builds/manual-label/manual-label.project
@ -1,5 +1,22 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <CodeLite_Project Name="manual-label" InternalType="Console">
+  <Plugins>
+    <Plugin Name="CMakePlugin">
+      <![CDATA[[{
+  "name": "Debug",
+  "enabled": false,
+  "buildDirectory": "build",
+  "sourceDirectory": "$(ProjectPath)",
+  "generator": "",
+  "buildType": "",
+  "arguments": [],
+  "parentProject": ""
+ }]]]>
+    </Plugin>
+    <Plugin Name="qmake">
+      <![CDATA[00010001N0005Debug000000000000]]>
+    </Plugin>
+  </Plugins>
  <Description/>
  <Dependencies/>
  <VirtualDirectory Name="manual-label">
@ -14,6 +31,8 @@
    <File Name="Main.cpp"/>
    <File Name="Main.h"/>
  </VirtualDirectory>
+  <Dependencies Name="Debug"/>
+  <Dependencies Name="Release"/>
  <Settings Type="Executable">
    <GlobalSettings>
      <Compiler Options="" C_Options="" Assembler="">
@ -33,6 +52,8 @@
      <Linker Options="" Required="yes">
        <LibraryPath Value="/Users/hieu/workspace/github/mosesdecoder/boost/lib64"/>
        <Library Value="boost_program_options"/>
+        <Library Value="boost_filesystem"/>
+        <Library Value="boost_system"/>
      </Linker>
      <ResourceCompiler Options="" Required="no"/>
      <General OutputFile="$(IntermediateDirectory)/$(ProjectName)" IntermediateDirectory="./Debug" Command="./$(ProjectName)" CommandArguments="" UseSeparateDebugArgs="no" DebugArguments="" WorkingDirectory="$(IntermediateDirectory)" PauseExecWhenProcTerminates="yes" IsGUIProgram="no" IsEnabled="yes"/>
@ -107,6 +128,4 @@
      </Completion>
    </Configuration>
  </Settings>
-  <Dependencies Name="Debug"/>
-  <Dependencies Name="Release"/>
 </CodeLite_Project>
--- a/contrib/other-builds/moses-cmd/moses-cmd.project
+++ b/contrib/other-builds/moses-cmd/moses-cmd.project
@ -1,5 +1,22 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <CodeLite_Project Name="moses-cmd" InternalType="Console">
+  <Plugins>
+    <Plugin Name="CMakePlugin">
+      <![CDATA[[{
+  "name": "Debug",
+  "enabled": false,
+  "buildDirectory": "build",
+  "sourceDirectory": "$(ProjectPath)",
+  "generator": "",
+  "buildType": "",
+  "arguments": [],
+  "parentProject": ""
+ }]]]>
+    </Plugin>
+    <Plugin Name="qmake">
+      <![CDATA[00010001N0005Debug000000000000]]>
+    </Plugin>
+  </Plugins>
  <Description/>
  <Dependencies/>
  <VirtualDirectory Name="src"/>
@ -9,6 +26,14 @@
    <File Name="../../../moses-cmd/MainVW.cpp" ExcludeProjConfig="Debug"/>
    <File Name="../../../moses-cmd/MainVW.h" ExcludeProjConfig="Debug"/>
  </VirtualDirectory>
+  <Dependencies Name="Release"/>
+  <Dependencies Name="Debug">
+    <Project Name="OnDiskPt"/>
+    <Project Name="lm"/>
+    <Project Name="moses"/>
+    <Project Name="search"/>
+    <Project Name="util"/>
+  </Dependencies>
  <Settings Type="Executable">
    <GlobalSettings>
      <Compiler Options="" C_Options="" Assembler="">
@ -53,7 +78,7 @@
        <Library Value="rt"/>
      </Linker>
      <ResourceCompiler Options="" Required="no"/>
-      <General OutputFile="$(IntermediateDirectory)/$(ProjectName)" IntermediateDirectory="./Debug" Command="./$(ProjectName)" CommandArguments="" UseSeparateDebugArgs="no" DebugArguments="" WorkingDirectory="$(IntermediateDirectory)" PauseExecWhenProcTerminates="yes" IsGUIProgram="no" IsEnabled="yes"/>
+      <General OutputFile="$(IntermediateDirectory)/$(ProjectName)" IntermediateDirectory="./Debug" Command="./$(ProjectName)" CommandArguments="-f /var/folders/c4/2p48fcwx611dmkdqq44mbblm0000gn/T/ZVd8xvuJAR.ini -i /Users/hieu/workspace/github/moses-regression-tests/tests/phrase.basic-surface-binptable.oldformat/to-translate.txt" UseSeparateDebugArgs="yes" DebugArguments="" WorkingDirectory="$(IntermediateDirectory)" PauseExecWhenProcTerminates="yes" IsGUIProgram="no" IsEnabled="yes"/>
      <Environment EnvVarSetName="&lt;Use Defaults&gt;" DbgSetName="&lt;Use Defaults&gt;">
        <![CDATA[]]>
      </Environment>
@ -125,12 +150,4 @@
      </Completion>
    </Configuration>
  </Settings>
-  <Dependencies Name="Release"/>
-  <Dependencies Name="Debug">
-    <Project Name="OnDiskPt"/>
-    <Project Name="lm"/>
-    <Project Name="moses"/>
-    <Project Name="search"/>
-    <Project Name="util"/>
-  </Dependencies>
 </CodeLite_Project>
--- a/contrib/other-builds/moses/moses.project
+++ b/contrib/other-builds/moses/moses.project
@ -474,8 +474,6 @@
    <File Name="../../../moses/FF/DistortionScoreProducer.h"/>
    <File Name="../../../moses/FF/DynamicCacheBasedLanguageModel.cpp"/>
    <File Name="../../../moses/FF/DynamicCacheBasedLanguageModel.h"/>
-    <File Name="../../../moses/FF/ExternalFeature.cpp"/>
-    <File Name="../../../moses/FF/ExternalFeature.h"/>
    <File Name="../../../moses/FF/Factory.cpp"/>
    <File Name="../../../moses/FF/Factory.h"/>
    <File Name="../../../moses/FF/FeatureFunction.cpp"/>
--- a/contrib/relent-filter/src/Main.cpp
+++ b/contrib/relent-filter/src/Main.cpp
@ -42,6 +42,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 #include "RelativeEntropyCalc.h"
 #include "LexicalReordering.h"
 #include "LexicalReorderingState.h"
+#include "util/random.hh"

 #ifdef HAVE_PROTOBUF
 #include "hypergraph.pb.h"
@ -205,7 +206,7 @@ int main(int argc, char** argv)
  
  
    //initialise random numbers
-    srand(time(NULL));
+    rand_init();
  
    // set up read/writing class
    IOWrapper* ioWrapper = GetIOWrapper(staticData);
--- a/contrib/server/mosesserver.cpp
+++ b/contrib/server/mosesserver.cpp
@ -536,7 +536,7 @@ public:
 	{
 	  // should the score breakdown be reported in a more structured manner?
 	  ostringstream buf;
-	  path.GetScoreBreakdown().OutputAllFeatureScores(buf);
+	  path.GetScoreBreakdown()->OutputAllFeatureScores(buf);
 	  nBestXMLItem["fvals"] = xmlrpc_c::value_string(buf.str());
 	}

--- a/mert/Data.cpp
+++ b/mert/Data.cpp
@ -17,6 +17,7 @@
 #include "util/exception.hh"

 #include "util/file_piece.hh"
+#include "util/random.hh"
 #include "util/tokenize_piece.hh"
 #include "util/string_piece.hh"
 #include "FeatureDataIterator.h"
@ -286,7 +287,7 @@ void Data::createShards(size_t shard_count, float shard_size, const string& scor
    } else {
      //create shards by randomly sampling
      for (size_t i = 0; i < floor(shard_size+0.5); ++i) {
-        shard_contents.push_back(rand() % data_size);
+        shard_contents.push_back(util::rand_excl(data_size));
      }
    }

--- a/mert/Fdstream.h
+++ b/mert/Fdstream.h
@ -13,6 +13,8 @@
 #include <iostream>
 #include <string>

+#include "util/unistd.hh"
+
 #if defined(__GLIBCXX__) || defined(__GLIBCPP__)
 #include <ext/stdio_filebuf.h>

--- a/mert/FileStream.cpp
+++ b/mert/FileStream.cpp
@ -40,28 +40,3 @@ inputfilestream::~inputfilestream()
 void inputfilestream::close()
 {
 }
-
-outputfilestream::outputfilestream(const std::string &filePath)
-  : std::ostream(0), m_streambuf(0), m_is_good(false)
-{
-  // check if file is readable
-  std::filebuf* fb = new std::filebuf();
-  m_is_good = (fb->open(filePath.c_str(), std::ios::out) != NULL);
-
-  if (IsGzipFile(filePath)) {
-    throw runtime_error("Output to a zipped file not supported!");
-  } else {
-    m_streambuf = fb;
-  }
-  this->init(m_streambuf);
-}
-
-outputfilestream::~outputfilestream()
-{
-  delete m_streambuf;
-  m_streambuf = 0;
-}
-
-void outputfilestream::close()
-{
-}
--- a/mert/FileStream.h
+++ b/mert/FileStream.h
@ -22,20 +22,4 @@ public:
  void close();
 };

-class outputfilestream : public std::ostream
-{
-protected:
-  std::streambuf *m_streambuf;
-  bool m_is_good;
-
-public:
-  explicit outputfilestream(const std::string &filePath);
-  virtual ~outputfilestream();
-
-  bool good() const {
-    return m_is_good;
-  }
-  void close();
-};
-
 #endif // MERT_FILE_STREAM_H_
--- a/mert/ForestRescoreTest.cpp
+++ b/mert/ForestRescoreTest.cpp
@ -1,6 +1,9 @@
 #include <iostream>

+#include "util/tokenize_piece.hh"
+
 #include "ForestRescore.h"
+#include "MiraFeatureVector.h"

 #define BOOST_TEST_MODULE MertForestRescore
 #include <boost/test/unit_test.hpp>
@ -10,8 +13,7 @@
 using namespace std;
 using namespace MosesTuning;

-BOOST_AUTO_TEST_CASE(viterbi_simple_lattice)
-{
+BOOST_AUTO_TEST_CASE(viterbi_simple_lattice) {
  Vocab vocab;
  WordVec words;
  string wordStrings[] =
@ -242,5 +244,101 @@ BOOST_AUTO_TEST_CASE(viterbi_3branch_lattice)
  BOOST_CHECK_EQUAL(6, hopeHypo.bleuStats[8]);
 }

+// Regression test on a complete hypergraph read from mert/hgtest/0.gz.
+// It prunes the graph (edgeCount = 500) under a fixed weight vector, runs
+// Viterbi on the pruned graph, and compares the best hypothesis' tokens and
+// per-feature values against recorded expectations.
+BOOST_AUTO_TEST_CASE(viterbi_full_hypergraph) {
+  Vocab vocab;
+  //References
+  ReferenceSet references;
+  references.AddLine(0,"in addition to EU support for businesses , also the administration of national business support will be concentrated in four Centres for Economic Development , Transport and Environment ( ELY Centres ) , starting from mid @-@ September .",vocab); 
+  //Load the hypergraph
+  // The path is relative, so the test presumably has to be run from the
+  // repository root -- TODO confirm against the build setup.
+  Graph graph(vocab);
+  util::scoped_fd fd(util::OpenReadOrThrow("mert/hgtest/0.gz"));
+  util::FilePiece file(fd.release());
+  ReadGraph(file,graph);
+
+  //prune
+  // Fixed weights used both for pruning and for the Viterbi search below.
+  SparseVector weights;
+  weights.set("OpSequenceModel0_1",0.011187);
+  weights.set("OpSequenceModel0_2",-0.002797);
+  weights.set("OpSequenceModel0_3",0.002797);
+  weights.set("OpSequenceModel0_4",-0.000140);
+  weights.set("OpSequenceModel0_5",0.004195);
+  weights.set("Distortion0",0.041952);
+  weights.set("PhrasePenalty0",0.027968);
+  weights.set("WordPenalty0",-0.139841);
+  weights.set("UnknownWordPenalty0",1.000000);
+  weights.set("LM0",0.069920);
+  weights.set("LexicalReordering0_1",0.041952);
+  weights.set("LexicalReordering0_2",0.041952);
+  weights.set("LexicalReordering0_3",0.041952);
+  weights.set("LexicalReordering0_4",0.041952);
+  weights.set("LexicalReordering0_5",0.041952);
+  weights.set("LexicalReordering0_6",0.041952);
+  weights.set("LexicalReordering0_7",0.041952);
+  weights.set("LexicalReordering0_8",0.041952);
+  weights.set("TranslationModel0_1",0.027968);
+  weights.set("TranslationModel0_2",0.027968);
+  weights.set("TranslationModel0_3",0.027968);
+  weights.set("TranslationModel0_4",0.027968);
+  weights.set("TranslationModel0_5",0.027968);
+  weights.set("TranslationModel0_6",0.027968);
+  weights.set("TranslationModel0_7",0.027968);
+  weights.set("TranslationModel0_8",0.027968);
+  weights.set("TranslationModel0_9",0.027968);
+  weights.set("TranslationModel0_10",0.027968);
+  weights.set("TranslationModel0_11",0.027968);
+  weights.set("TranslationModel0_12",0.027968);
+  weights.set("TranslationModel0_13",0.027968);
+  size_t edgeCount = 500;
+  boost::shared_ptr<Graph> prunedGraph;
+  prunedGraph.reset(new Graph(vocab));
+  graph.Prune(prunedGraph.get(), weights, edgeCount);
+
+  // bg has 9 entries and is passed to Viterbi together with the references.
+  vector<ValType> bg(9);
+  HgHypothesis bestHypo;
+  //best hypothesis
+  Viterbi(*prunedGraph, weights, 0, references, 0, bg, &bestHypo);
+  //check output as expected
+  string expectedStr = "<s> the EU matters , but also the national matters management focus since mid @-@ September four ely @-@ centre . </s>";
+  util::TokenIter<util::SingleCharacter, true> expected(expectedStr, util::SingleCharacter(' '));
+  // Walk the hypothesis and the expected token stream in lock step.
+  // NOTE(review): `expected` is dereferenced without first checking that it
+  // still has tokens; a hypothesis longer than expectedStr would advance the
+  // iterator past its end -- confirm whether that is acceptable in a test.
+  for (size_t i = 0; i < bestHypo.text.size(); ++i) {
+    //cerr << bestHypo.text[i]->first << " ";
+    BOOST_CHECK_EQUAL(*expected,bestHypo.text[i]->first);
+    ++expected;
+  }
+  // Every expected token must have been consumed by the loop above.
+  BOOST_CHECK(!expected);
+  //cerr << endl;
+  //check scores
+  // Arguments are (expected value, actual feature value, tolerance); per the
+  // Boost.Test documentation the tolerance of BOOST_CHECK_CLOSE is a percentage.
+  BOOST_CHECK_CLOSE(-80.062,bestHypo.featureVector.get("OpSequenceModel0_1"), 0.001);
+  BOOST_CHECK_CLOSE(2,bestHypo.featureVector.get("OpSequenceModel0_2"), 0.001);
+  BOOST_CHECK_CLOSE(2,bestHypo.featureVector.get("OpSequenceModel0_3"), 0.001);
+  BOOST_CHECK_CLOSE(3,bestHypo.featureVector.get("OpSequenceModel0_4"), 0.001);
+  BOOST_CHECK_CLOSE(0,bestHypo.featureVector.get("OpSequenceModel0_5"), 0.001);
+  BOOST_CHECK_CLOSE(-6,bestHypo.featureVector.get("Distortion0"), 0.001);
+  BOOST_CHECK_CLOSE(14,bestHypo.featureVector.get("PhrasePenalty0"), 0.001);
+  BOOST_CHECK_CLOSE(-20,bestHypo.featureVector.get("WordPenalty0"), 0.001);
+  BOOST_CHECK_CLOSE(-100,bestHypo.featureVector.get("UnknownWordPenalty0"), 0.001);
+  BOOST_CHECK_CLOSE(-126.616,bestHypo.featureVector.get("LM0"), 0.001);
+  BOOST_CHECK_CLOSE(-5.2238,bestHypo.featureVector.get("LexicalReordering0_1"), 0.001);
+  BOOST_CHECK_CLOSE(-0.29515,bestHypo.featureVector.get("LexicalReordering0_2"), 0.001);
+  BOOST_CHECK_CLOSE(0,bestHypo.featureVector.get("LexicalReordering0_3"), 0.001);
+  BOOST_CHECK_CLOSE(-0.470004,bestHypo.featureVector.get("LexicalReordering0_4"), 0.001);
+  BOOST_CHECK_CLOSE(-9.28267,bestHypo.featureVector.get("LexicalReordering0_5"), 0.001);
+  BOOST_CHECK_CLOSE(-0.470004,bestHypo.featureVector.get("LexicalReordering0_6"), 0.001);
+  BOOST_CHECK_CLOSE(0,bestHypo.featureVector.get("LexicalReordering0_7"), 0.001);
+  BOOST_CHECK_CLOSE(-0.402678,bestHypo.featureVector.get("LexicalReordering0_8"), 0.001);
+  BOOST_CHECK_CLOSE(-54.3119,bestHypo.featureVector.get("TranslationModel0_1"), 0.001);
+  BOOST_CHECK_CLOSE(-62.2619,bestHypo.featureVector.get("TranslationModel0_2"), 0.001);
+  BOOST_CHECK_CLOSE(-23.8782,bestHypo.featureVector.get("TranslationModel0_3"), 0.001);
+  BOOST_CHECK_CLOSE(-25.1626,bestHypo.featureVector.get("TranslationModel0_4"), 0.001);
+  BOOST_CHECK_CLOSE(12.9986,bestHypo.featureVector.get("TranslationModel0_5"), 0.001);
+  BOOST_CHECK_CLOSE(3.99959,bestHypo.featureVector.get("TranslationModel0_6"), 0.001);
+  BOOST_CHECK_CLOSE(1.99979,bestHypo.featureVector.get("TranslationModel0_7"), 0.001);
+  BOOST_CHECK_CLOSE(1.99979,bestHypo.featureVector.get("TranslationModel0_8"), 0.001);
+  BOOST_CHECK_CLOSE(0,bestHypo.featureVector.get("TranslationModel0_9"), 0.001);
+  BOOST_CHECK_CLOSE(0,bestHypo.featureVector.get("TranslationModel0_10"), 0.001);
+  BOOST_CHECK_CLOSE(0,bestHypo.featureVector.get("TranslationModel0_11"), 0.001);
+  BOOST_CHECK_CLOSE(0.999896,bestHypo.featureVector.get("TranslationModel0_12"), 0.001);
+  BOOST_CHECK_CLOSE(7.99917,bestHypo.featureVector.get("TranslationModel0_13"), 0.001);
+}


--- a/mert/MeteorScorer.cpp
+++ b/mert/MeteorScorer.cpp
@ -18,6 +18,7 @@

 #include "ScoreStats.h"
 #include "Util.h"
+#include "util/unistd.hh"

 using namespace std;

@ -25,7 +26,7 @@ namespace MosesTuning
 {

 // Meteor supported
-#if defined(__GLIBCXX__) || defined(__GLIBCPP__)
+#if (defined(__GLIBCXX__) || defined(__GLIBCPP__)) && !defined(_WIN32)

 // for clarity
 #define CHILD_STDIN_READ pipefds_input[0]
--- a/mert/Point.cpp
+++ b/mert/Point.cpp
@ -3,6 +3,7 @@
 #include <cmath>
 #include <cstdlib>
 #include "util/exception.hh"
+#include "util/random.hh"
 #include "FeatureStats.h"
 #include "Optimizer.h"

@ -57,10 +58,8 @@ void Point::Randomize()
  UTIL_THROW_IF(m_min.size() != Point::m_dim, util::Exception, "Error");
  UTIL_THROW_IF(m_max.size() != Point::m_dim, util::Exception, "Error");

-  for (unsigned int i = 0; i < size(); i++) {
-    operator[](i) = m_min[i] +
-                    static_cast<float>(random()) / static_cast<float>(RAND_MAX) * (m_max[i] - m_min[i]);
-  }
+  for (unsigned int i = 0; i < size(); i++)
+    operator[](i) = util::rand_incl(m_min[i], m_max[i]);
 }

 double Point::operator*(const FeatureStats& F) const
--- a/mert/TODO
+++ b/mert/TODO
@ -5,11 +5,8 @@

 - check that --pairwise-ranked is compatible with all optimization metrics

- Replace the standard rand() currently used in MERT and PRO with better
-  random generators such as Boost's random generators (e.g., boost::mt19937).
-  - create a Random class to hide the details, i.e., how to generate
-    random numbers, which allows us to use custom random generators more
-    easily.
+- Use better random generators in util/random.cc, e.g. boost::mt19937.
+  - Support plugging in custom random generators.

  Pros:
  - In MERT, you might want to use the random restarting technique to avoid
--- a/mert/TimerTest.cpp
+++ b/mert/TimerTest.cpp
@ -11,7 +11,20 @@ using namespace MosesTuning;
 BOOST_AUTO_TEST_CASE(timer_basic_test)
 {
  Timer timer;
-  const int sleep_time_microsec = 40; // ad-hoc microseconds to pass unit tests.
+
+  // Sleep time.  The test will sleep for this number of microseconds, and
+  // expect the elapsed time to be noticeable.
+  // Keep this number low to avoid wasting test time sleeping, but at least as
+  // high as the Boost timer's resolution.  Tests must pass consistently, not
+  // just on lucky runs.
+#if defined(WIN32)
+  // Timer resolution on Windows seems to be a millisecond.  Anything less and
+  // the test fails consistently.
+  const int sleep_time_microsec = 1000;
+#else
+  // Unix-like systems seem to have more fine-grained clocks.
+  const int sleep_time_microsec = 40;
+#endif

  timer.start();
  BOOST_REQUIRE(timer.is_running());
--- a/mert/evaluator.cpp
+++ b/mert/evaluator.cpp
@ -1,3 +1,4 @@
+#include <cstdlib>
 #include <fstream>
 #include <iostream>
 #include <string>
@ -15,6 +16,7 @@
 #include "Timer.h"
 #include "Util.h"
 #include "Data.h"
+#include "util/random.hh"

 using namespace std;
 using namespace MosesTuning;
@ -91,17 +93,15 @@ void EvaluatorUtil::evaluate(const string& candFile, int bootstrap, bool nbest_i
  if (bootstrap) {
    vector<float> scores;
    for (int i = 0; i < bootstrap; ++i) {
-      // TODO: Use smart pointer for exceptional-safety.
-      ScoreData* scoredata = new ScoreData(g_scorer);
+      ScoreData scoredata(g_scorer);
      for (int j = 0; j < n; ++j) {
-        int randomIndex = random() % n;
-        scoredata->add(entries[randomIndex], j);
+        const int randomIndex = util::rand_excl(n);
+        scoredata.add(entries[randomIndex], j);
      }
-      g_scorer->setScoreData(scoredata);
+      g_scorer->setScoreData(&scoredata);
      candidates_t candidates(n, 0);
      float score = g_scorer->score(candidates);
      scores.push_back(score);
-      delete scoredata;
    }

    float avg = average(scores);
@ -121,15 +121,13 @@ void EvaluatorUtil::evaluate(const string& candFile, int bootstrap, bool nbest_i
    cout.precision(4);
    cout << avg << "\t[" << lb << "," << rb << "]" << endl;
  } else {
-    // TODO: Use smart pointer for exceptional-safety.
-    ScoreData* scoredata = new ScoreData(g_scorer);
+    ScoreData scoredata(g_scorer);
    for (int sid = 0; sid < n; ++sid) {
-      scoredata->add(entries[sid], sid);
+      scoredata.add(entries[sid], sid);
    }
-    g_scorer->setScoreData(scoredata);
+    g_scorer->setScoreData(&scoredata);
    candidates_t candidates(n, 0);
    float score = g_scorer->score(candidates);
-    delete scoredata;

    if (g_has_more_files) cout << candFile << "\t";
    if (g_has_more_scorers) cout << g_scorer->getName() << "\t";
@ -287,10 +285,10 @@ void InitSeed(const ProgramOption *opt)
 {
  if (opt->has_seed) {
    cerr << "Seeding random numbers with " << opt->seed << endl;
-    srandom(opt->seed);
+    util::rand_init(opt->seed);
  } else {
    cerr << "Seeding random numbers with system clock " << endl;
-    srandom(time(NULL));
+    util::rand_init();
  }
 }

--- a/mert/hgtest/0.gz
+++ b/mert/hgtest/0.gz
--- a/mert/kbmira.cpp
+++ b/mert/kbmira.cpp
@ -40,6 +40,7 @@ de recherches du Canada
 #include <boost/scoped_ptr.hpp>

 #include "util/exception.hh"
+#include "util/random.hh"

 #include "BleuScorer.h"
 #include "HopeFearDecoder.h"
@ -122,10 +123,10 @@ int main(int argc, char** argv)

  if (vm.count("random-seed")) {
    cerr << "Initialising random seed to " << seed << endl;
-    srand(seed);
+    util::rand_init(seed);
  } else {
    cerr << "Initialising random seed from system clock" << endl;
-    srand(time(NULL));
+    util::rand_init();
  }

  // Initialize weights
--- a/mert/mert.cpp
+++ b/mert/mert.cpp
@ -24,6 +24,7 @@
 #include "Types.h"
 #include "Timer.h"
 #include "Util.h"
+#include "util/random.hh"

 #include "moses/ThreadPool.h"

@ -289,10 +290,10 @@ int main(int argc, char **argv)

  if (option.has_seed) {
    cerr << "Seeding random numbers with " << option.seed << endl;
-    srandom(option.seed);
+    util::rand_init(option.seed);
  } else {
    cerr << "Seeding random numbers with system clock " << endl;
-    srandom(time(NULL));
+    util::rand_init();
  }

  if (option.sparse_weights_file.size()) ++option.pdim;
--- a/mert/pro.cpp
+++ b/mert/pro.cpp
@ -43,6 +43,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 #include "ScoreDataIterator.h"
 #include "BleuScorer.h"
 #include "Util.h"
+#include "util/random.hh"

 using namespace std;
 using namespace MosesTuning;
@ -141,10 +142,10 @@ int main(int argc, char** argv)

  if (vm.count("random-seed")) {
    cerr << "Initialising random seed to " << seed << endl;
-    srand(seed);
+    util::rand_init(seed);
  } else {
    cerr << "Initialising random seed from system clock" << endl;
-    srand(time(NULL));
+    util::rand_init();
  }

  if (scoreFiles.size() == 0 || featureFiles.size() == 0) {
@ -211,11 +212,11 @@ int main(int argc, char** argv)
    vector<float> scores;
    size_t n_translations = hypotheses.size();
    for(size_t  i=0; i<n_candidates; i++) {
-      size_t rand1 = rand() % n_translations;
+      size_t rand1 = util::rand_excl(n_translations);
      pair<size_t,size_t> translation1 = hypotheses[rand1];
      float bleu1 = smoothedSentenceBleu(scoreDataIters[translation1.first]->operator[](translation1.second), bleuSmoothing, smoothBP);

-      size_t rand2 = rand() % n_translations;
+      size_t rand2 = util::rand_excl(n_translations);
      pair<size_t,size_t> translation2 = hypotheses[rand2];
      float bleu2 = smoothedSentenceBleu(scoreDataIters[translation2.first]->operator[](translation2.second), bleuSmoothing, smoothBP);

--- a/moses-cmd/MainVW.cpp
+++ b/moses-cmd/MainVW.cpp
@ -45,6 +45,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 #include "moses/FF/StatefulFeatureFunction.h"
 #include "moses/FF/StatelessFeatureFunction.h"
 #include "moses/TrainingTask.h"
+#include "util/random.hh"

 #ifdef HAVE_PROTOBUF
 #include "hypergraph.pb.h"
@ -117,7 +118,7 @@ int main(int argc, char** argv)


    //initialise random numbers
-    srand(time(NULL));
+    util::rand_init();

    // set up read/writing class
    IFVERBOSE(1) {
--- a/moses/ExportInterface.cpp
+++ b/moses/ExportInterface.cpp
@ -27,6 +27,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 #include <sstream>
 #include <vector>

+#include "util/random.hh"
 #include "util/usage.hh"

 #ifdef WIN32
@ -91,7 +92,7 @@ SimpleTranslationInterface::SimpleTranslationInterface(const string &mosesIni):
      exit(1);
    }

-    srand(time(NULL));
+    util::rand_init();

 }

@ -185,7 +186,7 @@ batch_run()
  const StaticData& staticData = StaticData::Instance();

  //initialise random numbers
-  srand(time(NULL));
+  util::rand_init();

  IFVERBOSE(1) PrintUserTime("Created input-output object");
    
--- a/moses/FF/LexicalReordering/SparseReordering.cpp
+++ b/moses/FF/LexicalReordering/SparseReordering.cpp
@ -13,8 +13,11 @@
 #include "LexicalReordering.h"
 #include "SparseReordering.h"

+#include <boost/algorithm/string/predicate.hpp>
+

 using namespace std;
+using namespace boost::algorithm;

 namespace Moses
 {
@ -57,6 +60,7 @@ const std::string& SparseReorderingFeatureKey::Name (const string& wordListId)

 SparseReordering::SparseReordering(const map<string,string>& config, const LexicalReordering* producer)
  : m_producer(producer)
+  , m_useWeightMap(false)
 {
  static const string kSource= "source";
  static const string kTarget = "target";
@ -80,6 +84,14 @@ SparseReordering::SparseReordering(const map<string,string>& config, const Lexic
      } else {
        UTIL_THROW(util::Exception, "Sparse reordering requires source or target, not " << fields[1]);
      }
+    } else if (fields[0] == "weights") {
+      ReadWeightMap(i->second);
+      m_useWeightMap = true;
+      for (int reoType=0; reoType<=LRModel::MAX; ++reoType) {
+        ostringstream buf;
+        buf << reoType;
+        m_featureMap2.push_back(m_producer->GetFeatureName(buf.str()));
+      }

    } else if (fields[0] == "phrase") {
      m_usePhrase = true;
@ -175,8 +187,17 @@ void SparseReordering::AddFeatures(
    SparseReorderingFeatureKey key(id, type, wordFactor, false, position, side, reoType);
    FeatureMap::const_iterator fmi = m_featureMap.find(key);
    assert(fmi != m_featureMap.end());
+    if (m_useWeightMap) {
+      WeightMap::const_iterator wmi = m_weightMap.find(fmi->second.name());
+      if (wmi != m_weightMap.end()) {
+        if (wmi->second != 0) {
+          scores->SparsePlusEquals(m_featureMap2[reoType], wmi->second);
+        }
+      }
+    } else {
      scores->SparsePlusEquals(fmi->second, 1.0);
    }
+  }

  for (size_t id = 0; id < clusterMaps->size(); ++id) {
    const ClusterMap& clusterMap = (*clusterMaps)[id];
@ -186,9 +207,18 @@ void SparseReordering::AddFeatures(
      SparseReorderingFeatureKey key(id, type, clusterIter->second, true, position, side, reoType);
      FeatureMap::const_iterator fmi = m_featureMap.find(key);
      assert(fmi != m_featureMap.end());
+      if (m_useWeightMap) {
+        WeightMap::const_iterator wmi = m_weightMap.find(fmi->second.name());
+        if (wmi != m_weightMap.end()) {
+          if (wmi->second != 0) {
+            scores->SparsePlusEquals(m_featureMap2[reoType], wmi->second);
+          }
+        }
+      } else {
        scores->SparsePlusEquals(fmi->second, 1.0);
      }
    }
+  }

 }

@ -256,5 +286,29 @@ void SparseReordering::CopyScores(

 }

+
+// Loads feature weights from the file `filename` into m_weightMap.
+//
+// Each line is tokenized on the space character.  The first token is taken as
+// the weight name and the second is parsed as a float; any further tokens on
+// the line are not read.  UTIL_THROW_IF2 is triggered with a
+// "Malformed weight line" message when a line yields fewer than two tokens,
+// and with a "Duplicate weight" message when a name is already in the map.
+void SparseReordering::ReadWeightMap(const string& filename)
+{
+  util::FilePiece file(filename.c_str());
+  StringPiece line;
+  while (true) {
+    // End of input is reported by ReadLine() as EndOfFileException; that is
+    // the only way out of this loop apart from a thrown error.
+    try {
+      line = file.ReadLine();
+    } catch (const util::EndOfFileException &e) {
+      break;
+    }
+    util::TokenIter<util::SingleCharacter, true> lineIter(line,util::SingleCharacter(' '));
+    // First token: weight name.
+    UTIL_THROW_IF2(!lineIter, "Malformed weight line: '" << line << "'");
+    const std::string& name = lineIter->as_string();
+    ++lineIter;
+    // Second token: weight value.
+    UTIL_THROW_IF2(!lineIter, "Malformed weight line: '" << line << "'");
+    float weight = Moses::Scan<float>(lineIter->as_string());
+
+    // insert() leaves an existing entry untouched and reports it through
+    // .second == false, which is treated as an error here.
+    std::pair< WeightMap::iterator, bool> inserted = m_weightMap.insert( std::make_pair(name, weight) );
+    UTIL_THROW_IF2(!inserted.second, "Duplicate weight: '" << name << "'");
+  }
+}
+
+
 } //namespace

--- a/moses/FF/LexicalReordering/SparseReordering.h
+++ b/moses/FF/LexicalReordering/SparseReordering.h
@ -112,10 +112,16 @@ private:
  typedef boost::unordered_map<SparseReorderingFeatureKey, FName, HashSparseReorderingFeatureKey, EqualsSparseReorderingFeatureKey> FeatureMap;
  FeatureMap m_featureMap;

+  typedef boost::unordered_map<std::string, float> WeightMap;
+  WeightMap m_weightMap;
+  bool m_useWeightMap;
+  std::vector<FName> m_featureMap2; 
+
  void ReadWordList(const std::string& filename, const std::string& id,
                    SparseReorderingFeatureKey::Side side, std::vector<WordList>* pWordLists);
  void ReadClusterMap(const std::string& filename, const std::string& id, SparseReorderingFeatureKey::Side side, std::vector<ClusterMap>* pClusterMaps);
  void PreCalculateFeatureNames(size_t index, const std::string& id, SparseReorderingFeatureKey::Side side, const Factor* factor, bool isCluster);
+  void ReadWeightMap(const std::string& filename);

  void AddFeatures(
    SparseReorderingFeatureKey::Type type, SparseReorderingFeatureKey::Side side,
--- a/moses/FF/VW/VW.h
+++ b/moses/FF/VW/VW.h
@ -86,6 +86,10 @@ struct VWTargetSentence {
      int src = it->first;
      int tgt = it->second;

+      if (src >= m_sourceConstraints.size() || tgt >= m_targetConstraints.size()) {
+        UTIL_THROW2("VW :: alignment point out of bounds: " << src << "-" << tgt);
+      }
+
      m_sourceConstraints[src].Update(tgt);
      m_targetConstraints[tgt].Update(src);
    }
--- a/moses/HypergraphOutput.cpp
+++ b/moses/HypergraphOutput.cpp
@ -98,6 +98,7 @@ HypergraphOutput<M>::HypergraphOutput(size_t precision) :
      // If this line gives you compile errors,
      //   contact Lane Schwartz on the Moses mailing list
      m_hypergraphDir = nbestPath.parent_path().string();
+      if (m_hypergraphDir.empty()) m_hypergraphDir=".";

    } else {
      stringstream hypergraphDirName;
--- a/moses/LM/Remote.cpp
+++ b/moses/LM/Remote.cpp
@ -1,14 +1,15 @@
 #include <cstdio>
 #include <cstdlib>
+#include <cstring>
 #include <unistd.h>
 #include <sys/types.h>
-#include <sys/socket.h>
-#include <netinet/in.h>
-#include <arpa/inet.h>
-#include <netdb.h>
 #include "Remote.h"
 #include "moses/Factor.h"

+#if !defined(_WIN32) && !defined(_WIN64)
+#include <arpa/inet.h>
+#endif
+
 namespace Moses
 {

@ -41,12 +42,16 @@ bool LanguageModelRemote::start(const std::string& host, int port)
  sock = socket(AF_INET, SOCK_STREAM, 0);
  hp = gethostbyname(host.c_str());
  if (hp==NULL) {
+#if defined(_WIN32) || defined(_WIN64)
+    fprintf(stderr, "gethostbyname failed\n");
+#else
    herror("gethostbyname failed");
+#endif
    exit(1);
  }

-  bzero((char *)&server, sizeof(server));
-  bcopy(hp->h_addr, (char *)&server.sin_addr, hp->h_length);
+  memset(&server, '\0', sizeof(server));
+  memcpy((char *)&server.sin_addr, hp->h_addr, hp->h_length);
  server.sin_family = hp->h_addrtype;
  server.sin_port = htons(port);

--- a/moses/LM/Remote.h
+++ b/moses/LM/Remote.h
@ -4,9 +4,15 @@
 #include "SingleFactor.h"
 #include "moses/TypeDef.h"
 #include "moses/Factor.h"
-#include <sys/socket.h>
 #include <sys/types.h>
+
+#if defined(_WIN32) || defined(_WIN64)
+#include <winsock2.h>
+#else
+#include <sys/socket.h>
 #include <netinet/in.h>
+#include <netdb.h>
+#endif

 namespace Moses
 {
--- a/moses/Manager.cpp
+++ b/moses/Manager.cpp
@ -55,6 +55,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 #endif

 #include "util/exception.hh"
+#include "util/random.hh"

 using namespace std;

@ -426,7 +427,7 @@ void Manager::CalcLatticeSamples(size_t count, TrellisPathList &ret) const
      //cerr << endl;

      //draw the sample
-      float frandom = log((float)rand()/RAND_MAX);
+      const float frandom = log(util::rand_incl(0.0f, 1.0f));
      size_t position = 1;
      float sum = candidateScores[0];
      for (; position < candidateScores.size() && sum < frandom; ++position) {
@ -1645,7 +1646,7 @@ void Manager::OutputNBest(std::ostream& out
    out << " |||";

    // print scores with feature names
-    path.GetScoreBreakdown().OutputAllFeatureScores(out );
+    path.GetScoreBreakdown()->OutputAllFeatureScores(out);

    // total
    out << " ||| " << path.GetTotalScore();
--- a/moses/Parameter.cpp
+++ b/moses/Parameter.cpp
@ -31,6 +31,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 #include "InputFileStream.h"
 #include "StaticData.h"
 #include "util/exception.hh"
+#include "util/random.hh"
 #include <boost/program_options.hpp>


@ -1393,7 +1394,7 @@ struct Credit {
    this->contact							= contact						;
    this->currentPursuits			= currentPursuits		;
    this->areaResponsibility	= areaResponsibility;
-    this->sortId							= rand() % 1000;
+    this->sortId							= util::rand_excl(1000);
  }

  bool operator<(const Credit &other) const {
--- a/moses/Syntax/F2S/HyperTreeLoader.cpp
+++ b/moses/Syntax/F2S/HyperTreeLoader.cpp
@ -40,12 +40,12 @@ bool HyperTreeLoader::Load(const std::vector<FactorType> &input,
                           const std::vector<FactorType> &output,
                           const std::string &inFile,
                           const RuleTableFF &ff,
-                           HyperTree &trie)
+                           HyperTree &trie,
+                           boost::unordered_set<std::size_t> &sourceTermSet)
 {
  PrintUserTime(std::string("Start loading HyperTree"));

-  // const StaticData &staticData = StaticData::Instance();
-  // const std::string &factorDelimiter = staticData.GetFactorDelimiter();
+  sourceTermSet.clear();

  std::size_t count = 0;

@ -106,6 +106,7 @@ bool HyperTreeLoader::Load(const std::vector<FactorType> &input,
    // Source-side
    HyperPath sourceFragment;
    hyperPathLoader.Load(sourceString, sourceFragment);
+    ExtractSourceTerminalSetFromHyperPath(sourceFragment, sourceTermSet);

    // Target-side
    TargetPhrase *targetPhrase = new TargetPhrase(&ff);
@ -144,6 +145,23 @@ bool HyperTreeLoader::Load(const std::vector<FactorType> &input,
  return true;
 }

+// Adds to sourceTerminalSet every symbol ID found in hp.nodeSeqs that is
+// >= moses_MaxNumNonterminals and is neither HyperPath::kComma nor
+// HyperPath::kEpsilon.  Existing contents of the set are left in place.
+void HyperTreeLoader::ExtractSourceTerminalSetFromHyperPath(
+    const HyperPath &hp, boost::unordered_set<std::size_t> &sourceTerminalSet)
+{
+  typedef std::vector<HyperPath::NodeSeq>::const_iterator SeqIter;
+  typedef std::vector<std::size_t>::const_iterator SymbolIter;
+
+  for (SeqIter seq = hp.nodeSeqs.begin(); seq != hp.nodeSeqs.end(); ++seq) {
+    for (SymbolIter sym = seq->begin(); sym != seq->end(); ++sym) {
+      const std::size_t id = *sym;
+      // IDs below the nonterminal limit are not collected.
+      if (id < moses_MaxNumNonterminals) {
+        continue;
+      }
+      // Neither are the two reserved HyperPath symbols.
+      if (id == HyperPath::kComma || id == HyperPath::kEpsilon) {
+        continue;
+      }
+      sourceTerminalSet.insert(id);
+    }
+  }
+}
+
 }  // namespace F2S
 }  // namespace Syntax
 }  // namespace Moses
--- a/moses/Syntax/F2S/HyperTreeLoader.h
+++ b/moses/Syntax/F2S/HyperTreeLoader.h
@ -3,9 +3,12 @@
 #include <istream>
 #include <vector>

+#include <boost/unordered_set.hpp>
+
 #include "moses/TypeDef.h"
 #include "moses/Syntax/RuleTableFF.h"

+#include "HyperPath.h"
 #include "HyperTree.h"
 #include "HyperTreeCreator.h"

@ -23,7 +26,12 @@ public:
            const std::vector<FactorType> &output,
            const std::string &inFile,
            const RuleTableFF &,
-            HyperTree &);
+            HyperTree &,
+            boost::unordered_set<std::size_t> &);
+
+private:
+  void ExtractSourceTerminalSetFromHyperPath(
+     const HyperPath &, boost::unordered_set<std::size_t> &);
 };

 }  // namespace F2S
--- a/moses/Syntax/F2S/Manager-inl.h
+++ b/moses/Syntax/F2S/Manager-inl.h
@ -39,6 +39,7 @@ Manager<RuleMatcher>::Manager(ttasksptr const& ttask)
  if (const ForestInput *p = dynamic_cast<const ForestInput*>(&m_source)) {
    m_forest = p->GetForest();
    m_rootVertex = p->GetRootVertex();
+	m_sentenceLength = p->GetSize();
  } else if (const TreeInput *p = dynamic_cast<const TreeInput*>(&m_source)) {
    T2S::InputTreeBuilder builder;
    T2S::InputTree tmpTree;
@ -46,6 +47,7 @@ Manager<RuleMatcher>::Manager(ttasksptr const& ttask)
    boost::shared_ptr<Forest> forest = boost::make_shared<Forest>();
    m_rootVertex = T2S::InputTreeToForest(tmpTree, *forest);
    m_forest = forest;
+    m_sentenceLength = p->GetSize();
  } else {
    UTIL_THROW2("ERROR: F2S::Manager requires input to be a tree or forest");
  }
@ -83,8 +85,13 @@ void Manager<RuleMatcher>::Decode()
       p = sortedVertices.begin(); p != sortedVertices.end(); ++p) {
    const Forest::Vertex &vertex = **p;

-    // Skip terminal vertices.
+    // Skip terminal vertices (after checking if they are OOVs).
    if (vertex.incoming.empty()) {
+      if (vertex.pvertex.span.GetStartPos() > 0 &&
+          vertex.pvertex.span.GetEndPos() < m_sentenceLength-1 &&
+          IsUnknownSourceWord(vertex.pvertex.symbol)) {
+        m_oovs.insert(vertex.pvertex.symbol);
+      }
      continue;
    }

@ -190,6 +197,21 @@ void Manager<RuleMatcher>::InitializeStacks()
  }
 }

+// Returns false as soon as the ID of factor 0 of w is found in the source
+// terminal set of any RuleTableFF instance; returns true when no instance
+// contains it.
+template<typename RuleMatcher>
+bool Manager<RuleMatcher>::IsUnknownSourceWord(const Word &w) const
+{
+  const std::size_t wordId = w[0]->GetId();
+  const std::vector<RuleTableFF*> &tables = RuleTableFF::Instances();
+  for (std::vector<RuleTableFF*>::const_iterator table = tables.begin();
+       table != tables.end(); ++table) {
+    if ((*table)->GetSourceTerminalSet().count(wordId) != 0) {
+      return false;
+    }
+  }
+  return true;
+}

 template<typename RuleMatcher>
 const SHyperedge *Manager<RuleMatcher>::GetBestSHyperedge() const
--- a/moses/Syntax/F2S/Manager.h
+++ b/moses/Syntax/F2S/Manager.h
@ -50,10 +50,13 @@ private:

  void InitializeStacks();

+  bool IsUnknownSourceWord(const Word &) const;
+
  void RecombineAndSort(const std::vector<SHyperedge*> &, SVertexStack &);

  boost::shared_ptr<const Forest> m_forest;
  const Forest::Vertex *m_rootVertex;
+  std::size_t m_sentenceLength;  // Includes <s> and </s>
  PVertexToStackMap m_stackMap;
  boost::shared_ptr<HyperTree> m_glueRuleTrie;
  std::vector<boost::shared_ptr<RuleMatcher> > m_mainRuleMatchers;
--- a/moses/Syntax/RuleTableFF.cpp
+++ b/moses/Syntax/RuleTableFF.cpp
@ -35,7 +35,8 @@ void RuleTableFF::Load()
      staticData.GetSearchAlgorithm() == SyntaxT2S) {
    F2S::HyperTree *trie = new F2S::HyperTree(this);
    F2S::HyperTreeLoader loader;
-    loader.Load(m_input, m_output, m_filePath, *this, *trie);
+    loader.Load(m_input, m_output, m_filePath, *this, *trie,
+                m_sourceTerminalSet);
    m_table = trie;
  } else if (staticData.GetSearchAlgorithm() == SyntaxS2T) {
    S2TParsingAlgorithm algorithm = staticData.GetS2TParsingAlgorithm();
--- a/moses/Syntax/RuleTableFF.h
+++ b/moses/Syntax/RuleTableFF.h
@ -43,10 +43,17 @@ public:
    return 0;
  }

+  // Get the source terminal vocabulary for this table's grammar (as a set of
+  // factor IDs).  Returns a reference to the member set; nothing is copied.
+  const boost::unordered_set<std::size_t> &GetSourceTerminalSet() const {
+    return m_sourceTerminalSet;
+  }
+
 private:
  static std::vector<RuleTableFF*> s_instances;

  const RuleTable *m_table;
+  boost::unordered_set<std::size_t> m_sourceTerminalSet;
 };

 }  // Syntax
--- a/moses/TranslationModel/CompactPT/MmapAllocator.h
+++ b/moses/TranslationModel/CompactPT/MmapAllocator.h
@ -24,14 +24,18 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA

 #include <limits>
 #include <iostream>
-#include <sys/mman.h>
 #include <cstdio>
 #include <unistd.h>

-#ifndef __MMAN_PAGE_SIZE__
-#define __MMAN_PAGE_SIZE__ sysconf(_SC_PAGE_SIZE)
+#if defined(_WIN32) || defined(_WIN64)
+#include <windows.h>
+#include <io.h>
+#else
+#include <sys/mman.h>
 #endif

+#include "util/mmap.hh"
+
 namespace Moses
 {
 template <class T>
@ -60,25 +64,25 @@ public:

  MmapAllocator() throw()
    : m_file_ptr(std::tmpfile()), m_file_desc(fileno(m_file_ptr)),
-      m_page_size(__MMAN_PAGE_SIZE__), m_map_size(0), m_data_ptr(0),
+      m_page_size(util::SizePage()), m_map_size(0), m_data_ptr(0),
      m_data_offset(0), m_fixed(false), m_count(new size_t(0)) {
  }

  MmapAllocator(std::FILE* f_ptr) throw()
    : m_file_ptr(f_ptr), m_file_desc(fileno(m_file_ptr)),
-      m_page_size(__MMAN_PAGE_SIZE__), m_map_size(0), m_data_ptr(0),
+      m_page_size(util::SizePage()), m_map_size(0), m_data_ptr(0),
      m_data_offset(0), m_fixed(false), m_count(new size_t(0)) {
  }

  MmapAllocator(std::FILE* f_ptr, size_t data_offset) throw()
    : m_file_ptr(f_ptr), m_file_desc(fileno(m_file_ptr)),
-      m_page_size(__MMAN_PAGE_SIZE__), m_map_size(0), m_data_ptr(0),
+      m_page_size(util::SizePage()), m_map_size(0), m_data_ptr(0),
      m_data_offset(data_offset), m_fixed(true), m_count(new size_t(0)) {
  }

  MmapAllocator(std::string fileName) throw()
    : m_file_ptr(std::fopen(fileName.c_str(), "wb+")), m_file_desc(fileno(m_file_ptr)),
-      m_page_size(__MMAN_PAGE_SIZE__), m_map_size(0), m_data_ptr(0),
+      m_page_size(util::SizePage()), m_map_size(0), m_data_ptr(0),
      m_data_offset(0), m_fixed(false), m_count(new size_t(0)) {
  }

@ -92,7 +96,7 @@ public:

  ~MmapAllocator() throw() {
    if(m_data_ptr && *m_count == 0) {
-      munmap(m_data_ptr, m_map_size);
+      util::UnmapOrThrow(m_data_ptr, m_map_size);
      if(!m_fixed && std::ftell(m_file_ptr) != -1)
        std::fclose(m_file_ptr);
    }
@ -119,13 +123,17 @@ public:
  pointer allocate (size_type num, const void* = 0) {
    m_map_size = num * sizeof(T);

+#if defined(_WIN32) || defined(_WIN64)
+    // On Windows, MAP_SHARED is not defined and MapOrThrow ignores the flags.
+    const int map_shared = 0;
+#else
+    const int map_shared = MAP_SHARED;
+#endif
    if(!m_fixed) {
      size_t read = 0;
      read += ftruncate(m_file_desc, m_map_size);
-      m_data_ptr = (char*)mmap(0, m_map_size, PROT_READ|PROT_WRITE, MAP_SHARED,
-                               m_file_desc, 0);
-      if(m_data_ptr == MAP_FAILED)
-        std::cerr << "Error: mmapping" << std::endl;
+      m_data_ptr = (char *)util::MapOrThrow(
+        m_map_size, true, map_shared, false, m_file_desc, 0);
      return (pointer)m_data_ptr;
    } else {
      size_t map_offset = (m_data_offset / m_page_size) * m_page_size;
@ -133,8 +141,8 @@ public:

      size_t map_size = m_map_size + relative_offset;

-      m_data_ptr = (char*)mmap(0, map_size, PROT_READ, MAP_SHARED,
-                               m_file_desc, map_offset);
+      m_data_ptr = (char *)util::MapOrThrow(
+        m_map_size, false, map_shared, false, m_file_desc, map_offset);

      return (pointer)(m_data_ptr + relative_offset);
    }
@ -142,11 +150,11 @@ public:

  void deallocate (pointer p, size_type num) {
    if(!m_fixed) {
-      munmap(p, num * sizeof(T));
+      util::UnmapOrThrow(p, num * sizeof(T));
    } else {
      size_t map_offset = (m_data_offset / m_page_size) * m_page_size;
      size_t relative_offset = m_data_offset - map_offset;
-      munmap((pointer)((char*)p - relative_offset), num * sizeof(T));
+      util::UnmapOrThrow((pointer)((char*)p - relative_offset), num * sizeof(T));
    }

  }
--- a/moses/TranslationModel/DynSAInclude/FileHandler.cpp
+++ b/moses/TranslationModel/DynSAInclude/FileHandler.cpp
@ -1,7 +1,9 @@
 #include "FileHandler.h"
 #include <cstdio>

-#ifdef WIN32
+// Workaround: plain Windows does not have popen()/pclose().
+// (MinGW already #define's them, so skip the workaround there.)
+#if defined(WIN32) && !defined(__MINGW32__)
 #define popen(A, B) _popen(A, B)
 #define pclose(A) _pclose(A)
 #endif
--- a/moses/TranslationModel/DynSAInclude/hash.h
+++ b/moses/TranslationModel/DynSAInclude/hash.h
@ -6,6 +6,7 @@
 #include "utils.h"
 #include "FileHandler.h"
 #include "util/exception.hh"
+#include "util/random.hh"

 using namespace Moses;
 typedef uint64_t P;   // largest input range is 2^64
@ -162,7 +163,7 @@ void Hash_shiftAddXOR<T>::initSeeds()
 {
  v_ = new T[this->H_];
  for(count_t i=0; i < this->H_; i++)
-    v_[i] = Utils::rand<T>() + 1;
+    v_[i] = util::wide_rand<T>() + 1;
 }
 template <typename T>
 T Hash_shiftAddXOR<T>::hash(const char* s, count_t h)
@ -187,9 +188,8 @@ void UnivHash_tableXOR<T>::initSeeds()
  // fill with random values
  for(count_t j=0; j < this->H_; j++) {
    table_[j] = new T[tblLen_];
-    for(count_t i=0; i < tblLen_; i++) {
-      table_[j][i] = Utils::rand<T>(this->m_-1);
-    }
+    for(count_t i=0; i < tblLen_; i++)
+      table_[j][i] = util::wide_rand_excl(this->m_-1);
  }
 }
 template <typename T>
@ -218,7 +218,7 @@ void UnivHash_noPrimes<T>::initSeeds()
 {
  a_ = new P[this->H_];
  for(T i=0; i < this->H_; i++) {
-    a_[i] = Utils::rand<P>();
+    a_[i] = util::wide_rand<P>();
    if(a_[i] % 2 == 0) a_[i]++;  // a must be odd
  }
 }
@ -284,8 +284,8 @@ void UnivHash_linear<T>::initSeeds()
    a_[i] = new T[MAX_NGRAM_ORDER];
    b_[i] = new T[MAX_NGRAM_ORDER];
    for(count_t j=0; j < MAX_NGRAM_ORDER; j++) {
-      a_[i][j] = 1 + Utils::rand<T>();
-      b_[i][j] = Utils::rand<T>();
+      a_[i][j] = 1 + util::wide_rand<T>();
+      b_[i][j] = util::wide_rand<T>();
    }
  }
 }
--- a/moses/TranslationModel/DynSAInclude/onlineRLM.h
+++ b/moses/TranslationModel/DynSAInclude/onlineRLM.h
@ -302,7 +302,8 @@ float OnlineRLM<T>::getProb(const wordID_t* ngram, int len,
    }
    while(num_fnd > 1) { // get lower order count
      //get sub-context of size one less than length found (exluding target)
-      if(((den_val = query(&ngram[len - num_fnd], num_fnd - 1)) > 0) &&
+      den_val = query(&ngram[len - num_fnd], num_fnd - 1);
+      if((den_val > 0) &&
          (den_val >= in[len - num_fnd]) && (in[len - num_fnd] > 0)) {
        break;
      } else --num_fnd; // else backoff to lower ngram order
--- a/moses/TranslationModel/DynSAInclude/utils.h
+++ b/moses/TranslationModel/DynSAInclude/utils.h
@ -62,22 +62,6 @@ public:
      str[i] = tolower(str[i]);
    }
  }
-  // TODO: interface with decent PRG
-  template<typename T>
-  static T rand(T mod_bnd = 0) {
-    T random = 0;
-    if(sizeof(T) <= 4) {
-      random = static_cast<T>(std::rand());
-    } else if(sizeof(T) == 8) {
-      random = static_cast<T>(std::rand());
-      random <<= 31;
-      random <<= 1;
-      random |= static_cast<T>(std::rand());
-    }
-    if(mod_bnd != 0)
-      return random % mod_bnd;
-    else return random;
-  }
 };

 #endif
--- a/moses/TranslationModel/DynSuffixArray.cpp
+++ b/moses/TranslationModel/DynSuffixArray.cpp
@ -1,4 +1,6 @@
 #include "DynSuffixArray.h"
+#include "util/random.hh"
+
 #include <iostream>
 #include <boost/foreach.hpp>

@ -315,33 +317,31 @@ int DynSuffixArray::Compare(int pos1, int pos2, int max)
  return 0;
 }

+namespace
+{
+/// Helper: exchange the values stored at positions `one` and `other` of an
+/// int array.  Passing the same position twice leaves the array unchanged.
+inline void swap_ints(int array[], int one, int other)
+{
+  const int held = array[other];
+  array[other] = array[one];
+  array[one] = held;
+}
+}
+
 void DynSuffixArray::Qsort(int* array, int begin, int end)
 {
  if(end > begin) {
-    int index;
+    int index = util::rand_incl(begin, end);
    {
-      index = begin + (rand() % (end - begin + 1));
-      int pivot = array[index];
-      {
-        int tmp = array[index];
-        array[index] = array[end];
-        array[end] = tmp;
-      }
+      const int pivot = array[index];
+      swap_ints(array, index, end);
      for(int i=index=begin; i < end; ++i) {
        if (Compare(array[i], pivot, 20) <= 0) {
-          {
-            int tmp = array[index];
-            array[index] = array[i];
-            array[i] = tmp;
+          swap_ints(array, index, i);
          index++;
        }
      }
-      }
-      {
-        int tmp = array[index];
-        array[index] = array[end];
-        array[end] = tmp;
-      }
+      swap_ints(array, index, end);
    }
    Qsort(array, begin, index - 1);
    Qsort(array, index + 1,  end);
--- a/moses/TranslationModel/PhraseDictionaryMultiModelCounts.cpp
+++ b/moses/TranslationModel/PhraseDictionaryMultiModelCounts.cpp
@ -17,6 +17,7 @@ License along with this library; if not, write to the Free Software
 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 ***********************************************************************/
 #include "util/exception.hh"
+#include "util/tokenize.hh"
 #include "moses/TranslationModel/PhraseDictionaryMultiModelCounts.h"

 using namespace std;
@ -30,29 +31,6 @@ void OutputVec(const vector<T> &vec)
  cerr << endl;
 }

-// from phrase-extract/tables-core.cpp
-inline vector<string> tokenize( const char* input )
-{
-  vector< string > token;
-  bool betweenWords = true;
-  int start=0;
-  int i=0;
-  for(; input[i] != '\0'; i++) {
-    bool isSpace = (input[i] == ' ' || input[i] == '\t');
-
-    if (!isSpace && betweenWords) {
-      start = i;
-      betweenWords = false;
-    } else if (isSpace && !betweenWords) {
-      token.push_back( string( input+start, i-start ) );
-      betweenWords = true;
-    }
-  }
-  if (!betweenWords)
-    token.push_back( string( input+start, i-start ) );
-  return token;
-}
-
 namespace Moses
 {

@ -464,7 +442,7 @@ void PhraseDictionaryMultiModelCounts::LoadLexicalTable( string &fileName, lexic
    i++;
    if (i%100000 == 0) cerr << "." << flush;

-    vector<string> token = tokenize( line.c_str() );
+    const vector<string> token = util::tokenize( line );
    if (token.size() != 4) {
      cerr << "line " << i << " in " << fileName
           << " has wrong number of tokens, skipping:\n"
--- a/moses/TranslationModel/PhraseDictionaryTransliteration.cpp
+++ b/moses/TranslationModel/PhraseDictionaryTransliteration.cpp
@ -1,11 +1,11 @@
 // vim:tabstop=2
 #include <cstdlib>
-#include <boost/filesystem.hpp>

 #include "PhraseDictionaryTransliteration.h"
 #include "moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerSkeleton.h"
 #include "moses/DecodeGraph.h"
 #include "moses/DecodeStep.h"
+#include "util/tempfile.hh"

 using namespace std;

@ -70,11 +70,10 @@ void PhraseDictionaryTransliteration::GetTargetPhraseCollection(InputPath &input
    inputPath.SetTargetPhrases(*this, tpColl, NULL);
  } else {
    // TRANSLITERATE
-    const boost::filesystem::path
-        inFile = boost::filesystem::unique_path(),
-        outDir = boost::filesystem::unique_path();
+    const util::temp_file inFile;
+    const util::temp_dir outDir;

-    ofstream inStream(inFile.c_str());
+    ofstream inStream(inFile.path().c_str());
    inStream << sourcePhrase.ToString() << endl;
    inStream.close();

@ -84,14 +83,14 @@ void PhraseDictionaryTransliteration::GetTargetPhraseCollection(InputPath &input
                 " --external-bin-dir " + m_externalDir +
                 " --input-extension " + m_inputLang +
                 " --output-extension " + m_outputLang +
-                 " --oov-file " + inFile.native() +
-                 " --out-dir " + outDir.native();
+                 " --oov-file " + inFile.path() +
+                 " --out-dir " + outDir.path();

    int ret = system(cmd.c_str());
    UTIL_THROW_IF2(ret != 0, "Transliteration script error");

    TargetPhraseCollection *tpColl = new TargetPhraseCollection();
-    vector<TargetPhrase*> targetPhrases = CreateTargetPhrases(sourcePhrase, outDir.native());
+    vector<TargetPhrase*> targetPhrases = CreateTargetPhrases(sourcePhrase, outDir.path());
    vector<TargetPhrase*>::const_iterator iter;
    for (iter = targetPhrases.begin(); iter != targetPhrases.end(); ++iter) {
      TargetPhrase *tp = *iter;
@ -102,10 +101,6 @@ void PhraseDictionaryTransliteration::GetTargetPhraseCollection(InputPath &input
    cache[hash] = value;

    inputPath.SetTargetPhrases(*this, tpColl, NULL);
-
-    // clean up temporary files
-    remove(inFile.c_str());
-    boost::filesystem::remove_all(outDir);
  }
 }

--- a/moses/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.cpp
+++ b/moses/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.cpp
@ -45,6 +45,7 @@
 #include "moses/TranslationModel/fuzzy-match/SentenceAlignment.h"
 #include "util/file.hh"
 #include "util/exception.hh"
+#include "util/random.hh"

 using namespace std;

@ -62,8 +63,8 @@ char *mkdtemp(char *tempbuf)
    return NULL;
  }

-  srand((unsigned)time(0));
-  rand_value = (int)((rand() / ((double)RAND_MAX+1.0)) * 1e6);
+  util::rand_init();
+  rand_value = util::rand_excl(1e6);
  tempbase = strrchr(tempbuf, '/');
  tempbase = tempbase ? tempbase+1 : tempbuf;
  strcpy(tempbasebuf, tempbase);
@ -130,10 +131,6 @@ int removedirectoryrecursively(const char *dirname)
  struct dirent *entry;
  char path[PATH_MAX];

-  if (path == NULL) {
-    fprintf(stderr, "Out of memory error\n");
-    return 0;
-  }
  dir = opendir(dirname);
  if (dir == NULL) {
    perror("Error opendir()");
--- a/moses/TranslationModel/UG/generic/sampling/Sampling.h
+++ b/moses/TranslationModel/UG/generic/sampling/Sampling.h
@ -2,6 +2,9 @@
 #define __sampling_h
 #include <boost/dynamic_bitset.hpp>
 #include <vector>
+
+#include "util/random.hh"
+
 // Utility functions for proper sub-sampling.
 // (c) 2007-2012 Ulrich Germann

@ -9,12 +12,6 @@
 namespace Moses
 {
 using namespace std;
-inline
-size_t
-randInt(size_t N)
-{
-  return N*(rand()/(RAND_MAX+1.));
-}

 // select a random sample of size /s/ without restitution from the range of
 // integers [0,N);
@ -35,15 +32,15 @@ randomSample(vector<idx_t>& v, size_t s, size_t N)
  if (s*10<N) {
    boost::dynamic_bitset<uint64_t> check(N,0);
    for (size_t i = 0; i < v.size(); i++) {
-      size_t x = randInt(N);
-      while (check[x]) x = randInt(N);
+      size_t x = util::rand_excl(N);
+      while (check[x]) x = util::rand_excl(N);
      check[x]=true;
      v[i] = x;
    }
  } else {
    size_t m=0;
    for (size_t t = 0; m <= s && t < N; t++)
-      if (s==N || randInt(N-t) < s-m) v[m++] = t;
+      if (s==N || util::rand_excl(N-t) < s-m) v[m++] = t;
  }
 }

--- a/moses/TranslationModel/UG/mm/ug_mmbitext.cc
+++ b/moses/TranslationModel/UG/mm/ug_mmbitext.cc
@ -345,7 +345,7 @@
 // 	{
 // 	  boost::lock_guard<boost::mutex> lock(stats->lock);
 // 	  if (stats->raw_cnt == ctr) ++stats->raw_cnt;
-// 	  size_t rnum = randInt(stats->raw_cnt - ctr++);
+// 	  size_t rnum = util::rand_excl(stats->raw_cnt - ctr++);
 // 	  // cout << stats->raw_cnt << " " << ctr-1 << " " 
 // 	  // << rnum << " " << max_samples - stats->good << endl;
 // 	  if (rnum < max_samples - stats->good)
--- a/moses/TranslationModel/UG/mm/ug_tsa_array_entry.h
+++ b/moses/TranslationModel/UG/mm/ug_tsa_array_entry.h
@ -69,7 +69,7 @@ namespace ugdiss
    //   while (chosen < samplesize && next < stop)
    // 	{
    // 	  root->readEntry(next,*this);
-    // 	  if (randInt(N - sampled++) < samplesize - chosen)
+    // 	  if (util::rand_excl(N - sampled++) < samplesize - chosen)
    // 	    {
    // 	      ++chosen;
    // 	      return true;
--- a/moses/TranslationModel/UG/mm/ug_tsa_tree_iterator.h
+++ b/moses/TranslationModel/UG/mm/ug_tsa_tree_iterator.h
@ -9,6 +9,7 @@
 #include <iostream>
 #include "util/exception.hh"
 #include "moses/Util.h"
+#include "util/random.hh"
 //#include <cassert>

 // #include "ug_bv_iter.h"
@ -896,13 +897,6 @@ namespace ugdiss
    return bv;
  }

-  inline
-  size_t 
-  randInt(size_t N)
-  {
-    return size_t(N*(rand()/(RAND_MAX+1.)));
-  }
-
  /// randomly select up to N occurrences of the sequence
  template<typename Token>
  sptr<vector<typename ttrack::Position> >
@ -924,8 +918,8 @@ namespace ugdiss
        root->readEntry(I.next,I);
        
        // t: expected number of remaining samples
-        double t = (stop - I.pos)/root->aveIndexEntrySize();
-        double r = t*rand()/(RAND_MAX+1.);
+        const double t = (stop - I.pos)/root->aveIndexEntrySize();
+        const double r = util::rand_excl(t);
        if (r < N-m)
          {
            ret->at(m).offset = I.offset;
--- a/moses/TranslationModel/UG/mmsapt.cpp
+++ b/moses/TranslationModel/UG/mmsapt.cpp
@ -16,7 +16,7 @@ namespace Moses
 {
  using namespace bitext;
  using namespace std;
-  // using namespace boost;
+  using namespace boost;

  void 
  fillIdSeq(Phrase const& mophrase, size_t const ifactor,
@ -155,6 +155,10 @@ namespace Moses
    input_factor = atoi(param.insert(dflt).first->second.c_str());
    // shouldn't that be a string?
    
+    dflt = pair<string,string> ("output-factor","0");
+    output_factor = atoi(param.insert(dflt).first->second.c_str());
+    ofactor.assign(1,output_factor);
+    
    dflt = pair<string,string> ("smooth",".01");
    m_lbop_conf = atof(param.insert(dflt).first->second.c_str());

--- a/moses/TrellisPath.cpp
+++ b/moses/TrellisPath.cpp
@ -31,7 +31,6 @@ namespace Moses
 TrellisPath::TrellisPath(const Hypothesis *hypo)
  :	m_prevEdgeChanged(NOT_FOUND)
 {
-  m_scoreBreakdown					= hypo->GetScoreBreakdown();
  m_totalScore = hypo->GetTotalScore();

  // enumerate path using prevHypo
@ -41,10 +40,9 @@ TrellisPath::TrellisPath(const Hypothesis *hypo)
  }
 }

-void TrellisPath::InitScore()
+void TrellisPath::InitTotalScore()
 {
  m_totalScore		= m_path[0]->GetWinningHypo()->GetTotalScore();
-  m_scoreBreakdown= m_path[0]->GetWinningHypo()->GetScoreBreakdown();

  //calc score
  size_t sizePath = m_path.size();
@ -53,12 +51,8 @@ void TrellisPath::InitScore()
    const Hypothesis *winningHypo = hypo->GetWinningHypo();
    if (hypo != winningHypo) {
      m_totalScore = m_totalScore - winningHypo->GetTotalScore() + hypo->GetTotalScore();
-      m_scoreBreakdown.MinusEquals(winningHypo->GetScoreBreakdown());
-      m_scoreBreakdown.PlusEquals(hypo->GetScoreBreakdown());
    }
  }
-
-
 }

 TrellisPath::TrellisPath(const TrellisPath &copy, size_t edgeIndex, const Hypothesis *arc)
@ -80,7 +74,7 @@ TrellisPath::TrellisPath(const TrellisPath &copy, size_t edgeIndex, const Hypoth
    prevHypo = prevHypo->GetPrevHypo();
  }

-  InitScore();
+  InitTotalScore();
 }

 TrellisPath::TrellisPath(const vector<const Hypothesis*> edges)
@ -88,9 +82,7 @@ TrellisPath::TrellisPath(const vector<const Hypothesis*> edges)
 {
  m_path.resize(edges.size());
  copy(edges.rbegin(),edges.rend(),m_path.begin());
-  InitScore();
-
-
+  InitTotalScore();
 }


@ -172,6 +164,32 @@ void TrellisPath::CreateDeviantPaths(TrellisPathList &pathColl) const
  }
 }

+// Lazily compute, store and return the feature-score breakdown of this path.
+// The first call starts from the scores of the winning hypothesis of
+// m_path[0]; then, for every edge whose hypothesis differs from its winning
+// hypothesis, it subtracts the winner's scores and adds the edge's own.  The
+// result is kept in the mutable member m_scoreBreakdown and later calls
+// return that stored pointer.
+const boost::shared_ptr<ScoreComponentCollection> TrellisPath::GetScoreBreakdown() const
+{
+  if (m_scoreBreakdown) {
+    return m_scoreBreakdown;
+  }
+
+  const Hypothesis *firstWinner = m_path[0]->GetWinningHypo();
+
+  // Recomputed alongside the breakdown only so that the assert below can
+  // compare it with m_totalScore.
+  float recomputedTotal = firstWinner->GetTotalScore();
+
+  m_scoreBreakdown.reset(new ScoreComponentCollection());
+  m_scoreBreakdown->PlusEquals(ScoreComponentCollection(firstWinner->GetScoreBreakdown()));
+
+  for (size_t edge = 0; edge != m_path.size(); ++edge) {
+    const Hypothesis *taken = m_path[edge];
+    const Hypothesis *winner = taken->GetWinningHypo();
+    if (taken == winner) {
+      continue;
+    }
+    recomputedTotal = recomputedTotal - winner->GetTotalScore() + taken->GetTotalScore();
+    m_scoreBreakdown->MinusEquals(winner->GetScoreBreakdown());
+    m_scoreBreakdown->PlusEquals(taken->GetScoreBreakdown());
+  }
+
+  assert(recomputedTotal == m_totalScore);
+  return m_scoreBreakdown;
+}
+
 Phrase TrellisPath::GetTargetPhrase() const
 {
  Phrase targetPhrase(ARRAY_SIZE_INCR);
--- a/moses/TrellisPath.h
+++ b/moses/TrellisPath.h
@ -19,14 +19,14 @@ License along with this library; if not, write to the Free Software
 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 ***********************************************************************/

-#ifndef moses_TrellisPath_h
-#define moses_TrellisPath_h
+#pragma once

 #include <iostream>
 #include <vector>
 #include <limits>
 #include "Hypothesis.h"
 #include "TypeDef.h"
+#include <boost/shared_ptr.hpp>

 namespace Moses
 {
@ -50,13 +50,13 @@ protected:
 																	, or NOT_FOUND if this path is the best trans so consist of only hypos
 															 */

-  ScoreComponentCollection	m_scoreBreakdown;
  float m_totalScore;
+  mutable boost::shared_ptr<ScoreComponentCollection> m_scoreBreakdown;

  //Used by Manager::LatticeSample()
  explicit TrellisPath(const std::vector<const Hypothesis*> edges);

-  void InitScore();
+  void InitTotalScore();

 public:
  TrellisPath(); // not implemented
@ -91,9 +91,7 @@ public:
  //! create a list of next best paths by wiggling 1 of the node at a time.
  void CreateDeviantPaths(TrellisPathList &pathColl) const;

-  inline const ScoreComponentCollection &GetScoreBreakdown() const {
-    return m_scoreBreakdown;
-  }
+  const boost::shared_ptr<ScoreComponentCollection> GetScoreBreakdown() const;

  //! get target words range of the hypo within n-best trellis. not necessarily the same as hypo.GetCurrTargetWordsRange()
  WordsRange GetTargetWordsRange(const Hypothesis &hypo) const;
@ -123,4 +121,4 @@ inline std::ostream& operator<<(std::ostream& out, const TrellisPath& path)
 }

 }
-#endif
+
--- a/moses/Util.h
+++ b/moses/Util.h
@ -502,13 +502,11 @@ inline std::string GetFirstString(const std::string& str, int& first_pos,  const
 template<class T>
 T log_sum (T log_a, T log_b)
 {
-  T v;
  if (log_a < log_b) {
-    v = log_b+log ( 1 + exp ( log_a-log_b ));
+    return log_b + log1p(exp(log_a - log_b));
  } else {
-    v = log_a+log ( 1 + exp ( log_b-log_a ));
+    return log_a + log1p(exp(log_b - log_a));
  }
-  return ( v );
 }

 /**
--- a/moses/mbr.cpp
+++ b/moses/mbr.cpp
@ -105,13 +105,13 @@ const TrellisPath doMBR(const TrellisPathList& nBestList)
  for (iter = nBestList.begin() ; iter != nBestList.end() ; ++iter) {
    const TrellisPath &path = **iter;
    float score = StaticData::Instance().GetMBRScale()
-                  * path.GetScoreBreakdown().GetWeightedScore();
+                  * path.GetScoreBreakdown()->GetWeightedScore();
    if (maxScore < score) maxScore = score;
  }

  for (iter = nBestList.begin() ; iter != nBestList.end() ; ++iter) {
    const TrellisPath &path = **iter;
-    joint_prob = UntransformScore(StaticData::Instance().GetMBRScale() * path.GetScoreBreakdown().GetWeightedScore() - maxScore);
+    joint_prob = UntransformScore(StaticData::Instance().GetMBRScale() * path.GetScoreBreakdown()->GetWeightedScore() - maxScore);
    marginal += joint_prob;
    joint_prob_vec.push_back(joint_prob);

--- a/moses/server/TranslationRequest.cpp
+++ b/moses/server/TranslationRequest.cpp
@ -166,7 +166,7 @@ namespace MosesServer
 	  {
 	    // should the score breakdown be reported in a more structured manner?
 	    ostringstream buf;
-	    path->GetScoreBreakdown().OutputAllFeatureScores(buf);
+	    path->GetScoreBreakdown()->OutputAllFeatureScores(buf);
 	    nBestXmlItem["fvals"] = xmlrpc_c::value_string(buf.str());
 	  }
 	
--- a/phrase-extract/DomainFeature.cpp
+++ b/phrase-extract/DomainFeature.cpp
@ -2,6 +2,7 @@
 #include "ExtractionPhrasePair.h"
 #include "tables-core.h"
 #include "InputFileStream.h"
+#include "util/tokenize.hh"

 using namespace std;

@ -17,7 +18,7 @@ void Domain::load( const std::string &domainFileName )
  string line;
  while(getline(*fileP, line)) {
    // read
-    vector< string > domainSpecLine = tokenize( line.c_str() );
+    const vector< string > domainSpecLine = util::tokenize( line );
    int lineNumber;
    if (domainSpecLine.size() != 2 ||
        ! sscanf(domainSpecLine[0].c_str(), "%d", &lineNumber)) {
@ -25,7 +26,7 @@ void Domain::load( const std::string &domainFileName )
      exit(1);
    }
    // store
-    string &name = domainSpecLine[1];
+    const string &name = domainSpecLine[1];
    spec.push_back( make_pair( lineNumber, name ));
    if (name2id.find( name ) == name2id.end()) {
      name2id[ name ] = list.size();
--- a/phrase-extract/DomainFeature.h
+++ b/phrase-extract/DomainFeature.h
@ -14,8 +14,6 @@

 #include "ScoreFeature.h"

-extern std::vector<std::string> tokenize( const char*);
-
 namespace MosesTraining
 {

--- a/phrase-extract/SentenceAlignment.cpp
+++ b/phrase-extract/SentenceAlignment.cpp
@ -24,6 +24,7 @@
 #include <string>

 #include "tables-core.h"
+#include "util/tokenize.hh"

 using namespace std;

@ -40,7 +41,7 @@ void addBoundaryWords(vector<string> &phrase)

 bool SentenceAlignment::processTargetSentence(const char * targetString, int, bool boundaryRules)
 {
-  target = tokenize(targetString);
+  target = util::tokenize(targetString);
  if (boundaryRules)
    addBoundaryWords(target);
  return true;
@ -48,7 +49,7 @@ bool SentenceAlignment::processTargetSentence(const char * targetString, int, bo

 bool SentenceAlignment::processSourceSentence(const char * sourceString, int, bool boundaryRules)
 {
-  source = tokenize(sourceString);
+  source = util::tokenize(sourceString);
  if (boundaryRules)
    addBoundaryWords(source);
  return true;
@ -89,7 +90,7 @@ bool SentenceAlignment::create(const char targetString[],
  }

  // reading in alignments
-  vector<string> alignmentSequence = tokenize( alignmentString );
+  vector<string> alignmentSequence = util::tokenize( alignmentString );
  for(size_t i=0; i<alignmentSequence.size(); i++) {
    int s,t;
    // cout << "scaning " << alignmentSequence[i].c_str() << endl;
--- a/phrase-extract/SentenceAlignmentWithSyntax.cpp
+++ b/phrase-extract/SentenceAlignmentWithSyntax.cpp
@ -26,6 +26,7 @@
 #include "tables-core.h"
 #include "XmlException.h"
 #include "XmlTree.h"
+#include "util/tokenize.hh"

 using namespace std;

@ -49,7 +50,7 @@ bool SentenceAlignmentWithSyntax::processTargetSentence(const char * targetStrin
              << sentenceID << ": " << e.getMsg() << std::endl;
    return false;
  }
-  target = tokenize(targetStringCPP.c_str());
+  target = util::tokenize(targetStringCPP);
  return true;
 }

@ -70,11 +71,8 @@ bool SentenceAlignmentWithSyntax::processSourceSentence(const char * sourceStrin
              << sentenceID << ": " << e.getMsg() << std::endl;
    return false;
  }
-  source = tokenize(sourceStringCPP.c_str());
+  source = util::tokenize(sourceStringCPP);
  return true;
 }

 } // namespace
-
-
-
--- a/phrase-extract/consolidate-direct-main.cpp
+++ b/phrase-extract/consolidate-direct-main.cpp
@ -25,11 +25,10 @@
 #include <cstdlib>
 #include "InputFileStream.h"
 #include "OutputFileStream.h"
+#include "util/tokenize.hh"

 using namespace std;

-std::vector<std::string> tokenize( const char [] );
-
 vector< string > splitLine(const char *line)
 {
  vector< string > item;
@ -109,7 +108,7 @@ int main(int argc, char* argv[])
    if (! getLine(fileDirectP,  itemDirect  ))
      break;

-    vector< string > count = tokenize( itemDirect[4].c_str() );
+    const vector< string > count = util::tokenize( itemDirect[4] );
    float countEF = atof(count[0].c_str());
    float countF = atof(count[1].c_str());
    float prob = countF/countEF;
--- a/phrase-extract/consolidate-reverse-main.cpp
+++ b/phrase-extract/consolidate-reverse-main.cpp
@ -28,6 +28,7 @@

 #include "tables-core.h"
 #include "InputFileStream.h"
+#include "util/tokenize.hh"

 using namespace std;

@ -165,8 +166,8 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
    fileConsolidated << " ||| " << reverseAlignment(itemDirect[3]);

    // counts, for debugging
-    vector<string> directCounts = tokenize(itemDirect[4].c_str());
-    vector<string> indirectCounts = tokenize(itemIndirect[4].c_str());
+    const vector<string> directCounts = util::tokenize(itemDirect[4]);
+    const vector<string> indirectCounts = util::tokenize(itemIndirect[4]);
    fileConsolidated << "||| " << directCounts[0] << " " << indirectCounts[0];
    // output rule count if present in either file
    if (indirectCounts.size() > 1) {
@ -199,7 +200,6 @@ bool getLine( istream &fileP, vector< string > &item )
 vector< string > splitLine(const char *line)
 {
  vector< string > item;
-  bool betweenWords = true;
  int start=0;
  int i=0;
  for(; line[i] != '\0'; i++) {
@ -223,10 +223,10 @@ string reverseAlignment(const string &alignments)
 {
  stringstream ret("");

-  vector<string> alignToks = tokenize(alignments.c_str());
+  const vector<string> alignToks = util::tokenize(alignments);

  for (size_t i = 0; i < alignToks.size(); ++i) {
-    string &alignPair = alignToks[i];
+    const string &alignPair = alignToks[i];
    vector<string> alignPoints;
    Tokenize(alignPoints, alignPair, "-");
    assert(alignPoints.size() == 2);
--- a/phrase-extract/extract-ghkm/XmlTreeParser.cpp
+++ b/phrase-extract/extract-ghkm/XmlTreeParser.cpp
@ -23,6 +23,7 @@
 #include "tables-core.h"
 #include "XmlException.h"
 #include "XmlTree.h"
+#include "util/tokenize.hh"

 #include <cassert>
 #include <vector>
@ -56,7 +57,7 @@ std::auto_ptr<ParseTree> XmlTreeParser::Parse(const std::string &line)
  m_tree.ConnectNodes();
  SyntaxNode *root = m_tree.GetTop();
  assert(root);
-  m_words = tokenize(m_line.c_str());
+  m_words = util::tokenize(m_line);
  return ConvertTree(*root, m_words);
 }

--- a/phrase-extract/pcfg-common/xml_tree_parser.cc
+++ b/phrase-extract/pcfg-common/xml_tree_parser.cc
@ -25,6 +25,7 @@
 #include "tables-core.h"
 #include "XmlException.h"
 #include "XmlTree.h"
+#include "util/tokenize.hh"

 #include "syntax-common/exception.h"

@ -51,7 +52,7 @@ std::auto_ptr<PcfgTree> XmlTreeParser::Parse(const std::string &line) {
    // There is no XML tree.
    return std::auto_ptr<PcfgTree>();
  }
-  m_words = tokenize(m_line.c_str());
+  m_words = util::tokenize(m_line);
  return ConvertTree(*root, m_words);
 }

--- a/phrase-extract/relax-parse-main.cpp
+++ b/phrase-extract/relax-parse-main.cpp
@ -21,6 +21,7 @@

 #include "relax-parse.h"
 #include "tables-core.h"
+#include "util/tokenize.hh"

 using namespace std;
 using namespace MosesTraining;
@ -44,7 +45,7 @@ int main(int argc, char* argv[])
    map< string, int > topLabelCollection; // count of top labels, not used
    SyntaxTree tree;
    ProcessAndStripXMLTags( inBufferString, tree, labelCollection, topLabelCollection, false );
-    vector< string > inWords = tokenize( inBufferString.c_str() );
+    const vector< string > inWords = util::tokenize( inBufferString );

    // output tree
    // cerr << "BEFORE:" << endl << tree;
@ -104,7 +105,7 @@ void init(int argc, char* argv[])
  }
 }

-void store( SyntaxTree &tree, vector< string > &words )
+void store( SyntaxTree &tree, const vector< string > &words )
 {
  // output words
  for( size_t i=0; i<words.size(); i++ ) {
--- a/phrase-extract/relax-parse.h
+++ b/phrase-extract/relax-parse.h
@ -39,7 +39,7 @@ char SAMTLevel = 0;

 // functions
 void init(int argc, char* argv[]);
-void store( MosesTraining::SyntaxTree &tree, std::vector<std::string> &words );
+void store( MosesTraining::SyntaxTree &tree, const std::vector<std::string> &words );
 void LeftBinarize( MosesTraining::SyntaxTree &tree, MosesTraining::ParentNodes &parents );
 void RightBinarize( MosesTraining::SyntaxTree &tree, MosesTraining::ParentNodes &parents );
 void SAMT( MosesTraining::SyntaxTree &tree, MosesTraining::ParentNodes &parents );
--- a/phrase-extract/statistics-main.cpp
+++ b/phrase-extract/statistics-main.cpp
@ -14,6 +14,7 @@
 #include "AlignmentPhrase.h"
 #include "tables-core.h"
 #include "InputFileStream.h"
+#include "util/tokenize.hh"

 using namespace std;
 using namespace MosesTraining;
@ -237,7 +238,7 @@ void processPhrasePairs( vector< PhraseAlignment > &phrasePair )

 bool PhraseAlignment::create(const char line[], int lineID )
 {
-  vector< string > token = tokenize( line );
+  const vector< string > token = util::tokenize( line );
  int item = 1;
  PHRASE phraseF, phraseE;
  for (size_t j=0; j<token.size(); j++) {
@ -321,7 +322,7 @@ void LexicalTable::load( const string &filePath )
    i++;
    if (i%100000 == 0) cerr << "." << flush;

-    vector<string> token = tokenize( line.c_str() );
+    const vector<string> token = util::tokenize( line );
    if (token.size() != 3) {
      cerr << "line " << i << " in " << filePath << " has wrong number of tokens, skipping:\n" <<
           token.size() << " " << token[0] << " " << line << endl;
--- a/phrase-extract/syntax-common/xml_tree_parser.cc
+++ b/phrase-extract/syntax-common/xml_tree_parser.cc
@ -3,6 +3,7 @@
 #include "tables-core.h"
 #include "XmlException.h"
 #include "XmlTree.h"
+#include "util/tokenize.hh"

 #include <cassert>
 #include <vector>
@ -24,7 +25,7 @@ StringTree *XmlTreeParser::Parse(const std::string &line) {
  tree_.ConnectNodes();
  SyntaxNode *root = tree_.GetTop();
  assert(root);
-  words_ = tokenize(line_.c_str());
+  words_ = util::tokenize(line_);
  return ConvertTree(*root, words_);
 }

--- a/phrase-extract/tables-core.cpp
+++ b/phrase-extract/tables-core.cpp
@ -1,5 +1,6 @@
 // $Id$
 //#include "beammain.h"
+#include "util/tokenize.hh"
 #include "tables-core.h"

 #define TABLE_LINE_MAX_LENGTH 1000
@ -7,37 +8,9 @@

 using namespace std;

-// as in beamdecoder/tables.cpp
-vector<string> tokenize( const char* input )
-{
-  vector< string > token;
-  bool betweenWords = true;
-  int start=0;
-  int i=0;
-  for(; input[i] != '\0'; i++) {
-    bool isSpace = (input[i] == ' ' || input[i] == '\t');
-
-    if (!isSpace && betweenWords) {
-      start = i;
-      betweenWords = false;
-    } else if (isSpace && !betweenWords) {
-      token.push_back( string( input+start, i-start ) );
-      betweenWords = true;
-    }
-  }
-  if (!betweenWords)
-    token.push_back( string( input+start, i-start ) );
-  return token;
-}
-
 namespace MosesTraining
 {

-bool isNonTerminal( const WORD &symbol )
-{
-  return symbol.substr(0, 1) == "[" && symbol.substr(symbol.size()-1, 1) == "]";
-}
-
 WORD_ID Vocabulary::storeIfNew( const WORD& word )
 {
  map<WORD, WORD_ID>::iterator i = lookup.find( word );
@ -107,7 +80,7 @@ void DTable::load( const string& fileName )
      abort();
    }

-    vector<string> token = tokenize(line.c_str());
+    const vector<string> token = util::tokenize(line);
    if (token.size() < 2) {
      cerr << "line " << i << " in " << fileName << " too short, skipping\n";
      continue;
--- a/phrase-extract/tables-core.h
+++ b/phrase-extract/tables-core.h
@ -12,8 +12,6 @@
 #include <map>
 #include <cmath>

-extern std::vector<std::string> tokenize( const char*);
-
 namespace MosesTraining
 {

--- a/scripts/OSM/OSM-Train.perl
+++ b/scripts/OSM/OSM-Train.perl
@ -1,5 +1,6 @@
 #!/usr/bin/env perl 

+use warnings;
 use strict;
 use Getopt::Long "GetOptions";
 use FindBin qw($RealBin);
--- a/scripts/OSM/extract-singletons.perl
+++ b/scripts/OSM/extract-singletons.perl
@ -1,5 +1,6 @@
 #!/usr/bin/env perl 

+use warnings;
 use Getopt::Std;
 getopts('q');

--- a/scripts/OSM/flipAlignment.perl
+++ b/scripts/OSM/flipAlignment.perl
@ -1,4 +1,6 @@
 #!/usr/bin/env perl 
+
+use warnings;
 use strict;

  my $file = shift(@ARGV);
--- a/scripts/Transliteration/clean.pl
+++ b/scripts/Transliteration/clean.pl
@ -1,6 +1,7 @@
 #!/usr/bin/env perl 

 #input hindi word urdu word, delete all those entries that have number on any side
+use warnings;
 use utf8;

 use Getopt::Std;
--- a/scripts/Transliteration/corpusCreator.pl
+++ b/scripts/Transliteration/corpusCreator.pl
@ -1,5 +1,6 @@
 #!/usr/bin/env perl 

+use warnings;
 use strict;

 use utf8;
--- a/scripts/Transliteration/in-decoding-transliteration.pl
+++ b/scripts/Transliteration/in-decoding-transliteration.pl
@ -1,5 +1,6 @@
 #!/usr/bin/env perl 

+use warnings;
 use strict;

 use utf8;
--- a/scripts/Transliteration/post-decoding-transliteration.pl
+++ b/scripts/Transliteration/post-decoding-transliteration.pl
@ -1,5 +1,6 @@
 #!/usr/bin/env perl 

+use warnings;
 use strict;

 use utf8;
--- a/scripts/Transliteration/prepare-transliteration-phrase-table.pl
+++ b/scripts/Transliteration/prepare-transliteration-phrase-table.pl
@ -1,5 +1,6 @@
 #!/usr/bin/env perl 

+use warnings;
 use strict;

 use utf8;
--- a/scripts/Transliteration/threshold.pl
+++ b/scripts/Transliteration/threshold.pl
@ -1,5 +1,6 @@
 #!/usr/bin/env perl 

+use warnings;
 use utf8;
 require Encode;
 use IO::Handle;
--- a/scripts/Transliteration/train-transliteration-module.pl
+++ b/scripts/Transliteration/train-transliteration-module.pl
@ -1,5 +1,6 @@
 #!/usr/bin/env perl 

+use warnings;
 use utf8;
 use strict;
 use Getopt::Long "GetOptions";
--- a/scripts/analysis/bootstrap-hypothesis-difference-significance.pl
+++ b/scripts/analysis/bootstrap-hypothesis-difference-significance.pl
@ -14,6 +14,7 @@ use utf8;
 # 23.01.2010: added NIST p-value and interval computation
 ###############################################

+use warnings;
 use strict;

 #constants
--- a/scripts/analysis/sentence-by-sentence.pl
+++ b/scripts/analysis/sentence-by-sentence.pl
@ -4,6 +4,7 @@
 #sentence-by-sentence: take in a system output, with any number of factors, and a reference translation, also maybe with factors, and show each sentence and its errors
 #usage: sentence-by-sentence SYSOUT [REFERENCE]+ > sentences.html

+use warnings;
 use strict;
 use Getopt::Long;

--- a/scripts/analysis/sg2dot.perl
+++ b/scripts/analysis/sg2dot.perl
@ -4,6 +4,7 @@
 # Script to convert MOSES searchgraph to DOT format
 #

+use warnings;
 use strict;
 use File::Path;
 use File::Basename;
--- a/scripts/analysis/show-phrases-used.pl
+++ b/scripts/analysis/show-phrases-used.pl
@ -5,7 +5,9 @@
 #usage: show-phrases-used DECODER_OUTFILE > output.html
 #  where DECODER_OUTFILE is the output of moses with the -T (show alignments) option

+use warnings;
 use strict;
+
 BEGIN
 {
    my $wd= `pawd 2>/dev/null`;
--- a/scripts/analysis/smtgui/filter-phrase-table.pl
+++ b/scripts/analysis/smtgui/filter-phrase-table.pl
@ -9,6 +9,7 @@
 #similar function to filter-model-given-input.pl, but only operates
 #on the phrase table and doesn't require that any subdirectories exist

+use warnings;
 use strict;

 my $MAX_LENGTH = 10;
--- a/scripts/ems/experiment.meta
+++ b/scripts/ems/experiment.meta
@ -7,8 +7,15 @@ get-corpus
 	default-name: corpus/txt
 	rerun-on-change: input-extension output-extension
 	template: IN OUT $input-extension $output-extension
-tokenize
+pre-tok-clean
        in: raw-stem
+        out: pre-tok-cleaned
+        default-name: corpus/pre-tok-cleaned
+        pass-unless: pre-tok-clean
+        template: $pre-tok-clean IN $input-extension $output-extension OUT OUT.lines-retained
+        parallelizable: yes
+tokenize
+	in: pre-tok-cleaned
 	out: tokenized-stem
 	default-name: corpus/tok
 	pass-unless: input-tokenizer output-tokenizer
@ -158,11 +165,18 @@ get-corpus
 	pass-unless: get-corpus-script
 	default-name: lm/txt
 	template: $get-corpus-script > OUT
+use-parallel-corpus
+  in: parallel-corpus-stem
+  out: tokenized-corpus
+	default-name: lm/tok
+	ignore-unless: parallel-corpus-stem
+	template: ln -s IN.$output-extension  OUT	
 tokenize
 	in: raw-corpus
 	out: tokenized-corpus
 	default-name: lm/tok
 	pass-unless: output-tokenizer
+	ignore-if: parallel-corpus-stem
 	template: $output-tokenizer < IN > OUT
 	parallelizable: yes
 mock-parse
@ -204,8 +218,14 @@ split
 	default-name: lm/split
 	pass-unless: output-splitter
 	template: $output-splitter -model IN1.$output-extension < IN > OUT
-train
+strip
        in: split-corpus
+        out: stripped-corpus
+        default-name: lm/stripped
+        pass-unless: mock-output-parser-lm
+        template: $moses-script-dir/training/strip-xml.perl < IN > OUT
+train
+	in: stripped-corpus
 	out: lm
 	default-name: lm/lm
 	ignore-if: rlm-training
@ -220,7 +240,7 @@ randomize
 	pass-unless: lm-randomizer
 	ignore-if: rlm-training
 train-randomized
-	in: split-corpus
+	in: stripped-corpus
 	out: rlm
 	default-name: lm/rlm
 	ignore-unless: rlm-training
@ -953,21 +973,21 @@ split-reference-devtest
 	ignore-unless: use-mira
 	multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
 	template: $output-splitter -model IN1.$output-extension < IN > OUT
-reduce-reference
+strip-reference
 	in: split-ref
 	out: reference
-	default-name: tuning/reference.reduced
+	default-name: tuning/reference.stripped
 	pass-unless: mock-output-parser-references
 	multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
-	template: $moses-script-dir/training/reduce-factors.perl --factor 0 --xml 1 --corpus IN --reduced-corpus OUT && $moses-script-dir/training/wrappers/mosesxml2brackets.py < IN > OUT.trees
-reduce-reference-devtest
+	template: $moses-script-dir/training/strip-xml.perl < IN > OUT && $moses-script-dir/training/wrappers/mosesxml2brackets.py < IN > OUT.trees
+strip-reference-devtest
 	in: split-ref-devtest
 	out: reference
-	default-name: tuning/reference.devtest.reduced
+	default-name: tuning/reference.devtest.stripped
 	pass-unless: mock-output-parser-references
 	ignore-unless: use-mira
 	multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
-	template: $moses-script-dir/training/reduce-factors.perl --factor 0 --xml 1 --corpus IN --reduced-corpus OUT && $moses-script-dir/training/wrappers/mosesxml2brackets.py < IN > OUT.trees
+	template: $moses-script-dir/training/strip-xml.perl < IN > OUT && $moses-script-dir/training/wrappers/mosesxml2brackets.py < IN > OUT.trees
 filter
 	in: input TRAINING:sigtest-filter-phrase-translation-table TRAINING:sigtest-filter-reordering-table TRAINING:corpus-mml-prefilter=OR=TRAINING:corpus-mml-postfilter=OR=TRAINING:domains TRAINING:transliteration-table
 	out: filtered-dir
@ -1224,13 +1244,13 @@ lowercase-reference
 	pass-if: recaser
 	multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
 	template: $output-lowercaser < IN > OUT	
-reduce-reference
+strip-reference
 	in: lowercased-reference
 	out: reference
 	default-name: evaluation/reference
 	pass-unless: mock-output-parser-references
 	multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
-	template: $moses-script-dir/training/reduce-factors.perl --factor 0 --xml 1 --corpus IN --reduced-corpus OUT && $moses-script-dir/training/wrappers/mosesxml2brackets.py < IN > OUT.trees
+	template: $moses-script-dir/training/strip-xml.perl < IN > OUT && $moses-script-dir/training/wrappers/mosesxml2brackets.py < IN > OUT.trees
 wade
 	in: filtered-dir truecased-input tokenized-reference alignment system-output
 	out: wade-analysis
--- a/scripts/ems/experiment.perl
+++ b/scripts/ems/experiment.perl
@ -3,6 +3,7 @@
 # Experiment Management System
 # Documentation at http://www.statmt.org/moses/?n=FactoredTraining.EMS

+use warnings;
 use strict;
 use Getopt::Long "GetOptions";
 use FindBin qw($RealBin);
--- a/scripts/ems/fix-info.perl
+++ b/scripts/ems/fix-info.perl
@ -1,5 +1,6 @@
 #!/usr/bin/env perl 

+use warnings;
 use strict;

 my ($file,$step) = @ARGV;
--- a/scripts/ems/support/analysis.perl
+++ b/scripts/ems/support/analysis.perl
@ -1,5 +1,6 @@
 #!/usr/bin/env perl 

+use warnings;
 use strict;
 use Getopt::Long "GetOptions";

--- a/scripts/ems/support/build-domain-file-from-subcorpora.perl
+++ b/scripts/ems/support/build-domain-file-from-subcorpora.perl
@ -1,5 +1,6 @@
 #!/usr/bin/env perl 

+use warnings;
 use strict;

 # Create domain file from corpora
--- a/scripts/ems/support/build-sparse-features.perl
+++ b/scripts/ems/support/build-sparse-features.perl
@ -1,5 +1,6 @@
 #!/usr/bin/env perl 

+use warnings;
 use strict;

 # Build necessary files for sparse lexical features
--- a/scripts/ems/support/consolidate-training-data.perl
+++ b/scripts/ems/support/consolidate-training-data.perl
@ -2,6 +2,7 @@

 # $Id: consolidate-training-data.perl 928 2009-09-02 02:58:01Z philipp $

+use warnings;
 use strict;

 my ($in,$out,$consolidated,@PART) = @ARGV;
--- a/scripts/ems/support/generic-multicore-parallelizer.perl
+++ b/scripts/ems/support/generic-multicore-parallelizer.perl
@ -1,5 +1,6 @@
 #!/usr/bin/env perl 

+use warnings;
 use strict;

 my $cores = 8;
--- a/scripts/ems/support/generic-parallelizer.perl
+++ b/scripts/ems/support/generic-parallelizer.perl
@ -1,5 +1,6 @@
 #!/usr/bin/env perl 

+use warnings;
 use strict;

 my $jobs = 20;
--- a/scripts/ems/support/input-from-sgm.perl
+++ b/scripts/ems/support/input-from-sgm.perl
@ -1,5 +1,6 @@
 #!/usr/bin/env perl 

+use warnings;
 use strict;

 die("ERROR syntax: input-from-sgm.perl < in.sgm > in.txt") 
--- a/scripts/ems/support/interpolate-lm.perl
+++ b/scripts/ems/support/interpolate-lm.perl
@ -1,5 +1,6 @@
 #!/usr/bin/env perl 

+use warnings;
 use strict;
 use IPC::Open3;
 use File::Temp qw/tempdir/;
--- a/scripts/ems/support/lmplz-wrapper.perl
+++ b/scripts/ems/support/lmplz-wrapper.perl
@ -1,10 +1,13 @@
 #!/usr/bin/env perl 

+use warnings;
 use strict;
 use Getopt::Long "GetOptions";

+Getopt::Long::config("no_auto_abbrev");
 Getopt::Long::config("pass_through");

+
 my ($TEXT,$ORDER,$BIN,$LM);

 &GetOptions('text=s' => \$TEXT,
@ -15,8 +18,9 @@ my ($TEXT,$ORDER,$BIN,$LM);
 die("ERROR: specify at least --bin BIN --text CORPUS --lm LM and --order N!")
  unless defined($BIN) && defined($TEXT) && defined($LM) && defined($ORDER);

-my $cmd = "$BIN --text $TEXT --order $ORDER --arpa $LM";
-$cmd .= " " . join(' ', @ARGV) if scalar(@ARGV);  # Pass remaining args through.
+my $settings = join(' ', @ARGV);
+#print STDERR "settngs=$settings \n";

+my $cmd = "$BIN --text $TEXT --order $ORDER --arpa $LM $settings";
 print "exec: $cmd\n";
 `$cmd`;
--- a/Show More
+++ b/Show More