This commit is contained in:
alvations 2015-04-26 20:23:39 +02:00
commit dda3ddd80b
495 changed files with 8669 additions and 5118 deletions

65
Jamroot
View File

@ -72,48 +72,42 @@
#--clean to clean
#--debug-build to build with Og. Only available with gcc 4.8+
import os ;
import option ;
import modules ;
import path ;
path-constant TOP : . ;
include $(TOP)/jam-files/sanity.jam ;
include $(TOP)/jam-files/server.jam ;
# exit : 0 ;
if [ build_server ] != no
{
xmlrpc-c-prefix = [ shell_or_die "$(xmlrpc-c-config-cmd) c++2 abyss-server --prefix" ] ;
echo "XMLRPC-C: BUILDING MOSES WITH XMLRPC_C LIBRARY VERSION $(xmlrpc-c-version) FROM $(xmlrpc-c-prefix)" ;
xmlrpc-cxxflags = [ shell_or_die "$(xmlrpc-c-config-cmd) c++2 abyss-server --cflags" ] ;
requirements += <define>HAVE_XMLRPC_C ;
requirements += <cxxflags>$(xmlrpc-cxxflags) ;
xmlrpc-linkflags = [ shell_or_die "$(xmlrpc-c-config-cmd) c++2 abyss-server --libs" ] ;
for local i in [ SPLIT_BY_CHARACTERS $(xmlrpc-linkflags) : " " ]
{
local libname = [ MATCH "-l(xmlrpc.*)" : $(i) ] ;
if $(libname)
{
external-lib $(libname)
: : <runtime-link>static:<link>static <runtime-link>shared:<link>shared ;
requirements += <library>$(libname) ;
}
local pathname = [ MATCH "-L(.*)" : $(i) ] ;
if $(pathname)
{
requirements += <library-path>$(pathname) ;
}
}
home = [ os.environ "HOME" ] ;
if [ path.exists $(home)/moses-environment.jam ]
{
# for those of us who don't like typing in command line bjam options all day long
include $(home)/moses-environment.jam ;
}
# echo $(requirements) ;
# exit 0 ;
include $(TOP)/jam-files/check-environment.jam ; # get resource locations
# from environment variables
include $(TOP)/jam-files/xmlrpc-c.jam ; # xmlrpc-c stuff for the server
include $(TOP)/jam-files/curlpp.jam ; # curlpp stuff for bias lookup (MMT only)
# exit "done" : 0 ;
max-order = [ option.get "max-kenlm-order" : 6 : 6 ] ;
if ! [ option.get "max-kenlm-order" ]
{
# some classes in Moses pull in header files from KenLM, so this needs to be
# defined here, not in moses/lm/Jamfile
option.set "max-kenlm-order" : 6 ;
requirements += <define>KENLM_MAX_ORDER=$(max-order) ;
}
# exit "all done" : 0 ;
boost 104400 ;
external-lib z ;
lib dl : : <runtime-link>static:<link>static <runtime-link>shared:<link>shared ;
requirements += <library>dl ;
#lib dl : : <runtime-link>static:<link>static <runtime-link>shared:<link>shared ;
#requirements += <library>dl ;
if ! [ option.get "without-tcmalloc" : : "yes" ] && [ test_library "tcmalloc_minimal" ] {
@ -139,6 +133,7 @@ if [ option.get "filter-warnings" : : "yes" ] {
requirements += <cxxflags>-Wno-unused-but-set-variable ;
requirements += <cxxflags>-Wno-unused-result ;
requirements += <cxxflags>-Wno-unused-variable ;
requirements += <cxxflags>-Wcomment ;
}
if [ option.get "debug-build" : : "yes" ] {
@ -228,10 +223,11 @@ build-projects lm util phrase-extract phrase-extract/syntax-common search moses
if [ option.get "with-mm" : : "yes" ]
{
alias mm :
moses/TranslationModel/UG//bitext-find
moses/TranslationModel/UG//ptable-describe-features
moses/TranslationModel/UG//count-ptable-features
moses/TranslationModel/UG//ptable-lookup
moses/TranslationModel/UG//spe-check-coverage
# moses/TranslationModel/UG//spe-check-coverage
moses/TranslationModel/UG/mm//mtt-demo1
moses/TranslationModel/UG/mm//mtt-build
moses/TranslationModel/UG/mm//mtt-dump
@ -307,6 +303,3 @@ if [ path.exists $(TOP)/dist ] && $(prefix) != dist {
local temp = [ _shell "mkdir -p $(TOP)/bin" ] ;
local temp = [ _shell "rm $(TOP)/bin/moses_chart" ] ;
local temp = [ _shell "cd $(TOP)/bin && ln -s moses moses_chart" ] ;

View File

@ -51,7 +51,7 @@ void OnDiskWrapper::BeginLoad(const std::string &filePath)
if (!m_vocab.Load(*this))
UTIL_THROW(util::FileOpenException, "Couldn't load vocab");
UINT64 rootFilePos = GetMisc("RootNodeOffset");
uint64_t rootFilePos = GetMisc("RootNodeOffset");
m_rootSourceNode = new PhraseNode(rootFilePos, *this);
}
@ -102,7 +102,7 @@ bool OnDiskWrapper::LoadMisc()
const string &key = tokens[0];
m_miscInfo[key] = Moses::Scan<UINT64>(tokens[1]);
m_miscInfo[key] = Moses::Scan<uint64_t>(tokens[1]);
}
return true;
@ -199,17 +199,17 @@ void OnDiskWrapper::SaveMisc()
size_t OnDiskWrapper::GetSourceWordSize() const
{
return sizeof(UINT64) + sizeof(char);
return sizeof(uint64_t) + sizeof(char);
}
size_t OnDiskWrapper::GetTargetWordSize() const
{
return sizeof(UINT64) + sizeof(char);
return sizeof(uint64_t) + sizeof(char);
}
UINT64 OnDiskWrapper::GetMisc(const std::string &key) const
uint64_t OnDiskWrapper::GetMisc(const std::string &key) const
{
std::map<std::string, UINT64>::const_iterator iter;
std::map<std::string, uint64_t>::const_iterator iter;
iter = m_miscInfo.find(key);
UTIL_THROW_IF2(iter == m_miscInfo.end()
, "Couldn't find value for key " << key
@ -243,7 +243,7 @@ Word *OnDiskWrapper::ConvertFromMoses(const std::vector<Moses::FactorType> &fact
} // for (size_t factorType
bool found;
UINT64 vocabId = m_vocab.GetVocabId(strme.str(), found);
uint64_t vocabId = m_vocab.GetVocabId(strme.str(), found);
if (!found) {
// factor not in phrase table -> phrase definitely not in. exit
delete newWord;

View File

@ -43,7 +43,7 @@ protected:
size_t m_defaultNodeSize;
PhraseNode *m_rootSourceNode;
std::map<std::string, UINT64> m_miscInfo;
std::map<std::string, uint64_t> m_miscInfo;
void SaveMisc();
bool OpenForLoad(const std::string &filePath);
@ -105,7 +105,7 @@ public:
return *m_rootSourceNode;
}
UINT64 GetMisc(const std::string &key) const;
uint64_t GetMisc(const std::string &key) const;
Word *ConvertFromMoses(const std::vector<Moses::FactorType> &factorsVec
, const Moses::Word &origWord) const;

View File

@ -31,8 +31,8 @@ namespace OnDiskPt
size_t PhraseNode::GetNodeSize(size_t numChildren, size_t wordSize, size_t countSize)
{
size_t ret = sizeof(UINT64) * 2 // num children, value
+ (wordSize + sizeof(UINT64)) * numChildren // word + ptr to next source node
size_t ret = sizeof(uint64_t) * 2 // num children, value
+ (wordSize + sizeof(uint64_t)) * numChildren // word + ptr to next source node
+ sizeof(float) * countSize; // count info
return ret;
}
@ -45,7 +45,7 @@ PhraseNode::PhraseNode()
{
}
PhraseNode::PhraseNode(UINT64 filePos, OnDiskWrapper &onDiskWrapper)
PhraseNode::PhraseNode(uint64_t filePos, OnDiskWrapper &onDiskWrapper)
:m_counts(onDiskWrapper.GetNumCounts())
{
// load saved node
@ -55,26 +55,26 @@ PhraseNode::PhraseNode(UINT64 filePos, OnDiskWrapper &onDiskWrapper)
std::fstream &file = onDiskWrapper.GetFileSource();
file.seekg(filePos);
assert(filePos == (UINT64)file.tellg());
assert(filePos == (uint64_t)file.tellg());
file.read((char*) &m_numChildrenLoad, sizeof(UINT64));
file.read((char*) &m_numChildrenLoad, sizeof(uint64_t));
size_t memAlloc = GetNodeSize(m_numChildrenLoad, onDiskWrapper.GetSourceWordSize(), countSize);
m_memLoad = (char*) malloc(memAlloc);
// go to start of node again
file.seekg(filePos);
assert(filePos == (UINT64)file.tellg());
assert(filePos == (uint64_t)file.tellg());
// read everything into memory
file.read(m_memLoad, memAlloc);
assert(filePos + memAlloc == (UINT64)file.tellg());
assert(filePos + memAlloc == (uint64_t)file.tellg());
// get value
m_value = ((UINT64*)m_memLoad)[1];
m_value = ((uint64_t*)m_memLoad)[1];
// get counts
float *memFloat = (float*) (m_memLoad + sizeof(UINT64) * 2);
float *memFloat = (float*) (m_memLoad + sizeof(uint64_t) * 2);
assert(countSize == 1);
m_counts[0] = memFloat[0];
@ -108,10 +108,10 @@ void PhraseNode::Save(OnDiskWrapper &onDiskWrapper, size_t pos, size_t tableLimi
//memset(mem, 0xfe, memAlloc);
size_t memUsed = 0;
UINT64 *memArray = (UINT64*) mem;
uint64_t *memArray = (uint64_t*) mem;
memArray[0] = GetSize(); // num of children
memArray[1] = m_value; // file pos of corresponding target phrases
memUsed += 2 * sizeof(UINT64);
memUsed += 2 * sizeof(uint64_t);
// count info
float *memFloat = (float*) (mem + memUsed);
@ -133,9 +133,9 @@ void PhraseNode::Save(OnDiskWrapper &onDiskWrapper, size_t pos, size_t tableLimi
size_t wordMemUsed = childWord.WriteToMemory(currMem);
memUsed += wordMemUsed;
UINT64 *memArray = (UINT64*) (mem + memUsed);
uint64_t *memArray = (uint64_t*) (mem + memUsed);
memArray[0] = childNode.GetFilePos();
memUsed += sizeof(UINT64);
memUsed += sizeof(uint64_t);
}
@ -148,7 +148,7 @@ void PhraseNode::Save(OnDiskWrapper &onDiskWrapper, size_t pos, size_t tableLimi
file.seekp(0, ios::end);
file.write(mem, memUsed);
UINT64 endPos = file.tellp();
uint64_t endPos = file.tellp();
assert(m_filePos + memUsed == endPos);
free(mem);
@ -206,7 +206,7 @@ const PhraseNode *PhraseNode::GetChild(const Word &wordSought, OnDiskWrapper &on
x = (l + r) / 2;
Word wordFound;
UINT64 childFilePos;
uint64_t childFilePos;
GetChild(wordFound, childFilePos, x, onDiskWrapper);
if (wordSought == wordFound) {
@ -222,14 +222,14 @@ const PhraseNode *PhraseNode::GetChild(const Word &wordSought, OnDiskWrapper &on
return ret;
}
void PhraseNode::GetChild(Word &wordFound, UINT64 &childFilePos, size_t ind, OnDiskWrapper &onDiskWrapper) const
void PhraseNode::GetChild(Word &wordFound, uint64_t &childFilePos, size_t ind, OnDiskWrapper &onDiskWrapper) const
{
size_t wordSize = onDiskWrapper.GetSourceWordSize();
size_t childSize = wordSize + sizeof(UINT64);
size_t childSize = wordSize + sizeof(uint64_t);
char *currMem = m_memLoad
+ sizeof(UINT64) * 2 // size & file pos of target phrase coll
+ sizeof(uint64_t) * 2 // size & file pos of target phrase coll
+ sizeof(float) * onDiskWrapper.GetNumCounts() // count info
+ childSize * ind;
@ -237,15 +237,15 @@ void PhraseNode::GetChild(Word &wordFound, UINT64 &childFilePos, size_t ind, OnD
assert(memRead == childSize);
}
size_t PhraseNode::ReadChild(Word &wordFound, UINT64 &childFilePos, const char *mem) const
size_t PhraseNode::ReadChild(Word &wordFound, uint64_t &childFilePos, const char *mem) const
{
size_t memRead = wordFound.ReadFromMemory(mem);
const char *currMem = mem + memRead;
UINT64 *memArray = (UINT64*) (currMem);
uint64_t *memArray = (uint64_t*) (currMem);
childFilePos = memArray[0];
memRead += sizeof(UINT64);
memRead += sizeof(uint64_t);
return memRead;
}

View File

@ -36,7 +36,7 @@ class PhraseNode
{
friend std::ostream& operator<<(std::ostream&, const PhraseNode&);
protected:
UINT64 m_filePos, m_value;
uint64_t m_filePos, m_value;
typedef std::map<Word, PhraseNode> ChildColl;
ChildColl m_children;
@ -48,35 +48,35 @@ protected:
TargetPhraseCollection m_targetPhraseColl;
char *m_memLoad, *m_memLoadLast;
UINT64 m_numChildrenLoad;
uint64_t m_numChildrenLoad;
void AddTargetPhrase(size_t pos, const SourcePhrase &sourcePhrase
, TargetPhrase *targetPhrase, OnDiskWrapper &onDiskWrapper
, size_t tableLimit, const std::vector<float> &counts, OnDiskPt::PhrasePtr spShort);
size_t ReadChild(Word &wordFound, UINT64 &childFilePos, const char *mem) const;
void GetChild(Word &wordFound, UINT64 &childFilePos, size_t ind, OnDiskWrapper &onDiskWrapper) const;
size_t ReadChild(Word &wordFound, uint64_t &childFilePos, const char *mem) const;
void GetChild(Word &wordFound, uint64_t &childFilePos, size_t ind, OnDiskWrapper &onDiskWrapper) const;
public:
static size_t GetNodeSize(size_t numChildren, size_t wordSize, size_t countSize);
PhraseNode(); // unsaved node
PhraseNode(UINT64 filePos, OnDiskWrapper &onDiskWrapper); // load saved node
PhraseNode(uint64_t filePos, OnDiskWrapper &onDiskWrapper); // load saved node
~PhraseNode();
void Add(const Word &word, UINT64 nextFilePos, size_t wordSize);
void Add(const Word &word, uint64_t nextFilePos, size_t wordSize);
void Save(OnDiskWrapper &onDiskWrapper, size_t pos, size_t tableLimit);
void AddTargetPhrase(const SourcePhrase &sourcePhrase, TargetPhrase *targetPhrase
, OnDiskWrapper &onDiskWrapper, size_t tableLimit
, const std::vector<float> &counts, OnDiskPt::PhrasePtr spShort);
UINT64 GetFilePos() const {
uint64_t GetFilePos() const {
return m_filePos;
}
UINT64 GetValue() const {
uint64_t GetValue() const {
return m_value;
}
void SetValue(UINT64 value) {
void SetValue(uint64_t value) {
m_value = value;
}
size_t GetSize() const {

View File

@ -103,17 +103,17 @@ char *TargetPhrase::WriteToMemory(OnDiskWrapper &onDiskWrapper, size_t &memUsed)
size_t spSize = sp->GetSize();
size_t sourceWordSize = onDiskWrapper.GetSourceWordSize();
size_t memNeeded = sizeof(UINT64) // num of words
size_t memNeeded = sizeof(uint64_t) // num of words
+ targetWordSize * phraseSize // actual words. lhs as last words
+ sizeof(UINT64) // num source words
+ sizeof(uint64_t) // num source words
+ sourceWordSize * spSize; // actual source words
memUsed = 0;
UINT64 *mem = (UINT64*) malloc(memNeeded);
uint64_t *mem = (uint64_t*) malloc(memNeeded);
// write size
mem[0] = phraseSize;
memUsed += sizeof(UINT64);
memUsed += sizeof(uint64_t);
// write each word
for (size_t pos = 0; pos < phraseSize; ++pos) {
@ -124,9 +124,9 @@ char *TargetPhrase::WriteToMemory(OnDiskWrapper &onDiskWrapper, size_t &memUsed)
// write size of source phrase and all source words
char *currPtr = (char*)mem + memUsed;
UINT64 *memTmp = (UINT64*) currPtr;
uint64_t *memTmp = (uint64_t*) currPtr;
memTmp[0] = spSize;
memUsed += sizeof(UINT64);
memUsed += sizeof(uint64_t);
for (size_t pos = 0; pos < spSize; ++pos) {
const Word &word = sp->GetWord(pos);
char *currPtr = (char*)mem + memUsed;
@ -145,13 +145,13 @@ void TargetPhrase::Save(OnDiskWrapper &onDiskWrapper)
std::fstream &file = onDiskWrapper.GetFileTargetInd();
UINT64 startPos = file.tellp();
uint64_t startPos = file.tellp();
file.seekp(0, ios::end);
file.write(mem, memUsed);
#ifndef NDEBUG
UINT64 endPos = file.tellp();
uint64_t endPos = file.tellp();
assert(startPos + memUsed == endPos);
#endif
@ -167,11 +167,11 @@ char *TargetPhrase::WriteOtherInfoToMemory(OnDiskWrapper &onDiskWrapper, size_t
size_t sparseFeatureSize = m_sparseFeatures.size();
size_t propSize = m_property.size();
size_t memNeeded = sizeof(UINT64) // file pos (phrase id)
+ sizeof(UINT64) + 2 * sizeof(UINT64) * numAlign // align
size_t memNeeded = sizeof(uint64_t) // file pos (phrase id)
+ sizeof(uint64_t) + 2 * sizeof(uint64_t) * numAlign // align
+ sizeof(float) * numScores // scores
+ sizeof(UINT64) + sparseFeatureSize // sparse features string
+ sizeof(UINT64) + propSize; // property string
+ sizeof(uint64_t) + sparseFeatureSize // sparse features string
+ sizeof(uint64_t) + propSize; // property string
char *mem = (char*) malloc(memNeeded);
//memset(mem, 0, memNeeded);
@ -179,8 +179,8 @@ char *TargetPhrase::WriteOtherInfoToMemory(OnDiskWrapper &onDiskWrapper, size_t
memUsed = 0;
// phrase id
memcpy(mem, &m_filePos, sizeof(UINT64));
memUsed += sizeof(UINT64);
memcpy(mem, &m_filePos, sizeof(uint64_t));
memUsed += sizeof(uint64_t);
// align
size_t tmp = WriteAlignToMemory(mem + memUsed);
@ -203,11 +203,11 @@ char *TargetPhrase::WriteOtherInfoToMemory(OnDiskWrapper &onDiskWrapper, size_t
size_t TargetPhrase::WriteStringToMemory(char *mem, const std::string &str) const
{
size_t memUsed = 0;
UINT64 *memTmp = (UINT64*) mem;
uint64_t *memTmp = (uint64_t*) mem;
size_t strSize = str.size();
memTmp[0] = strSize;
memUsed += sizeof(UINT64);
memUsed += sizeof(uint64_t);
const char *charStr = str.c_str();
memcpy(mem + memUsed, charStr, strSize);
@ -221,7 +221,7 @@ size_t TargetPhrase::WriteAlignToMemory(char *mem) const
size_t memUsed = 0;
// num of alignments
UINT64 numAlign = m_align.size();
uint64_t numAlign = m_align.size();
memcpy(mem, &numAlign, sizeof(numAlign));
memUsed += sizeof(numAlign);
@ -319,20 +319,20 @@ Moses::TargetPhrase *TargetPhrase::ConvertToMoses(const std::vector<Moses::Facto
return ret;
}
UINT64 TargetPhrase::ReadOtherInfoFromFile(UINT64 filePos, std::fstream &fileTPColl)
uint64_t TargetPhrase::ReadOtherInfoFromFile(uint64_t filePos, std::fstream &fileTPColl)
{
assert(filePos == (UINT64)fileTPColl.tellg());
assert(filePos == (uint64_t)fileTPColl.tellg());
UINT64 memUsed = 0;
fileTPColl.read((char*) &m_filePos, sizeof(UINT64));
memUsed += sizeof(UINT64);
uint64_t memUsed = 0;
fileTPColl.read((char*) &m_filePos, sizeof(uint64_t));
memUsed += sizeof(uint64_t);
assert(m_filePos != 0);
memUsed += ReadAlignFromFile(fileTPColl);
assert((memUsed + filePos) == (UINT64)fileTPColl.tellg());
assert((memUsed + filePos) == (uint64_t)fileTPColl.tellg());
memUsed += ReadScoresFromFile(fileTPColl);
assert((memUsed + filePos) == (UINT64)fileTPColl.tellg());
assert((memUsed + filePos) == (uint64_t)fileTPColl.tellg());
// sparse features
memUsed += ReadStringFromFile(fileTPColl, m_sparseFeatures);
@ -343,13 +343,13 @@ UINT64 TargetPhrase::ReadOtherInfoFromFile(UINT64 filePos, std::fstream &fileTPC
return memUsed;
}
UINT64 TargetPhrase::ReadStringFromFile(std::fstream &fileTPColl, std::string &outStr)
uint64_t TargetPhrase::ReadStringFromFile(std::fstream &fileTPColl, std::string &outStr)
{
UINT64 bytesRead = 0;
uint64_t bytesRead = 0;
UINT64 strSize;
fileTPColl.read((char*) &strSize, sizeof(UINT64));
bytesRead += sizeof(UINT64);
uint64_t strSize;
fileTPColl.read((char*) &strSize, sizeof(uint64_t));
bytesRead += sizeof(uint64_t);
if (strSize) {
char *mem = (char*) malloc(strSize + 1);
@ -364,15 +364,15 @@ UINT64 TargetPhrase::ReadStringFromFile(std::fstream &fileTPColl, std::string &o
return bytesRead;
}
UINT64 TargetPhrase::ReadFromFile(std::fstream &fileTP)
uint64_t TargetPhrase::ReadFromFile(std::fstream &fileTP)
{
UINT64 bytesRead = 0;
uint64_t bytesRead = 0;
fileTP.seekg(m_filePos);
UINT64 numWords;
fileTP.read((char*) &numWords, sizeof(UINT64));
bytesRead += sizeof(UINT64);
uint64_t numWords;
fileTP.read((char*) &numWords, sizeof(uint64_t));
bytesRead += sizeof(uint64_t);
for (size_t ind = 0; ind < numWords; ++ind) {
WordPtr word(new Word());
@ -381,9 +381,9 @@ UINT64 TargetPhrase::ReadFromFile(std::fstream &fileTP)
}
// read source words
UINT64 numSourceWords;
fileTP.read((char*) &numSourceWords, sizeof(UINT64));
bytesRead += sizeof(UINT64);
uint64_t numSourceWords;
fileTP.read((char*) &numSourceWords, sizeof(uint64_t));
bytesRead += sizeof(uint64_t);
PhrasePtr sp(new SourcePhrase());
for (size_t ind = 0; ind < numSourceWords; ++ind) {
@ -396,31 +396,31 @@ UINT64 TargetPhrase::ReadFromFile(std::fstream &fileTP)
return bytesRead;
}
UINT64 TargetPhrase::ReadAlignFromFile(std::fstream &fileTPColl)
uint64_t TargetPhrase::ReadAlignFromFile(std::fstream &fileTPColl)
{
UINT64 bytesRead = 0;
uint64_t bytesRead = 0;
UINT64 numAlign;
fileTPColl.read((char*) &numAlign, sizeof(UINT64));
bytesRead += sizeof(UINT64);
uint64_t numAlign;
fileTPColl.read((char*) &numAlign, sizeof(uint64_t));
bytesRead += sizeof(uint64_t);
for (size_t ind = 0; ind < numAlign; ++ind) {
AlignPair alignPair;
fileTPColl.read((char*) &alignPair.first, sizeof(UINT64));
fileTPColl.read((char*) &alignPair.second, sizeof(UINT64));
fileTPColl.read((char*) &alignPair.first, sizeof(uint64_t));
fileTPColl.read((char*) &alignPair.second, sizeof(uint64_t));
m_align.push_back(alignPair);
bytesRead += sizeof(UINT64) * 2;
bytesRead += sizeof(uint64_t) * 2;
}
return bytesRead;
}
UINT64 TargetPhrase::ReadScoresFromFile(std::fstream &fileTPColl)
uint64_t TargetPhrase::ReadScoresFromFile(std::fstream &fileTPColl)
{
UTIL_THROW_IF2(m_scores.size() == 0, "Translation rules must must have some scores");
UINT64 bytesRead = 0;
uint64_t bytesRead = 0;
for (size_t ind = 0; ind < m_scores.size(); ++ind) {
fileTPColl.read((char*) &m_scores[ind], sizeof(float));

View File

@ -36,7 +36,7 @@ class Phrase;
namespace OnDiskPt
{
typedef std::pair<UINT64, UINT64> AlignPair;
typedef std::pair<uint64_t, uint64_t> AlignPair;
typedef std::vector<AlignPair> AlignType;
class Vocab;
@ -53,15 +53,15 @@ protected:
std::string m_sparseFeatures, m_property;
std::vector<float> m_scores;
UINT64 m_filePos;
uint64_t m_filePos;
size_t WriteAlignToMemory(char *mem) const;
size_t WriteScoresToMemory(char *mem) const;
size_t WriteStringToMemory(char *mem, const std::string &str) const;
UINT64 ReadAlignFromFile(std::fstream &fileTPColl);
UINT64 ReadScoresFromFile(std::fstream &fileTPColl);
UINT64 ReadStringFromFile(std::fstream &fileTPColl, std::string &outStr);
uint64_t ReadAlignFromFile(std::fstream &fileTPColl);
uint64_t ReadScoresFromFile(std::fstream &fileTPColl);
uint64_t ReadStringFromFile(std::fstream &fileTPColl, std::string &outStr);
public:
TargetPhrase() {
@ -95,7 +95,7 @@ public:
char *WriteOtherInfoToMemory(OnDiskWrapper &onDiskWrapper, size_t &memUsed) const;
void Save(OnDiskWrapper &onDiskWrapper);
UINT64 GetFilePos() const {
uint64_t GetFilePos() const {
return m_filePos;
}
float GetScore(size_t ind) const {
@ -108,8 +108,8 @@ public:
, const Moses::PhraseDictionary &phraseDict
, const std::vector<float> &weightT
, bool isSyntax) const;
UINT64 ReadOtherInfoFromFile(UINT64 filePos, std::fstream &fileTPColl);
UINT64 ReadFromFile(std::fstream &fileTP);
uint64_t ReadOtherInfoFromFile(uint64_t filePos, std::fstream &fileTPColl);
uint64_t ReadFromFile(std::fstream &fileTP);
virtual void DebugPrint(std::ostream &out, const Vocab &vocab) const;

View File

@ -71,12 +71,12 @@ void TargetPhraseCollection::Save(OnDiskWrapper &onDiskWrapper)
{
std::fstream &file = onDiskWrapper.GetFileTargetColl();
size_t memUsed = sizeof(UINT64);
size_t memUsed = sizeof(uint64_t);
char *mem = (char*) malloc(memUsed);
// size of coll
UINT64 numPhrases = GetSize();
((UINT64*)mem)[0] = numPhrases;
uint64_t numPhrases = GetSize();
((uint64_t*)mem)[0] = numPhrases;
// MAIN LOOP
CollType::iterator iter;
@ -98,16 +98,16 @@ void TargetPhraseCollection::Save(OnDiskWrapper &onDiskWrapper)
}
// total number of bytes
//((UINT64*)mem)[0] = (UINT64) memUsed;
//((uint64_t*)mem)[0] = (uint64_t) memUsed;
UINT64 startPos = file.tellp();
uint64_t startPos = file.tellp();
file.seekp(0, ios::end);
file.write((char*) mem, memUsed);
free(mem);
#ifndef NDEBUG
UINT64 endPos = file.tellp();
uint64_t endPos = file.tellp();
assert(startPos + memUsed == endPos);
#endif
m_filePos = startPos;
@ -148,7 +148,7 @@ Moses::TargetPhraseCollection *TargetPhraseCollection::ConvertToMoses(const std:
}
void TargetPhraseCollection::ReadFromFile(size_t tableLimit, UINT64 filePos, OnDiskWrapper &onDiskWrapper)
void TargetPhraseCollection::ReadFromFile(size_t tableLimit, uint64_t filePos, OnDiskWrapper &onDiskWrapper)
{
fstream &fileTPColl = onDiskWrapper.GetFileTargetColl();
fstream &fileTP = onDiskWrapper.GetFileTargetInd();
@ -156,23 +156,23 @@ void TargetPhraseCollection::ReadFromFile(size_t tableLimit, UINT64 filePos, OnD
size_t numScores = onDiskWrapper.GetNumScores();
UINT64 numPhrases;
uint64_t numPhrases;
UINT64 currFilePos = filePos;
uint64_t currFilePos = filePos;
fileTPColl.seekg(filePos);
fileTPColl.read((char*) &numPhrases, sizeof(UINT64));
fileTPColl.read((char*) &numPhrases, sizeof(uint64_t));
// table limit
if (tableLimit) {
numPhrases = std::min(numPhrases, (UINT64) tableLimit);
numPhrases = std::min(numPhrases, (uint64_t) tableLimit);
}
currFilePos += sizeof(UINT64);
currFilePos += sizeof(uint64_t);
for (size_t ind = 0; ind < numPhrases; ++ind) {
TargetPhrase *tp = new TargetPhrase(numScores);
UINT64 sizeOtherInfo = tp->ReadOtherInfoFromFile(currFilePos, fileTPColl);
uint64_t sizeOtherInfo = tp->ReadOtherInfoFromFile(currFilePos, fileTPColl);
tp->ReadFromFile(fileTP);
currFilePos += sizeOtherInfo;
@ -181,7 +181,7 @@ void TargetPhraseCollection::ReadFromFile(size_t tableLimit, UINT64 filePos, OnD
}
}
UINT64 TargetPhraseCollection::GetFilePos() const
uint64_t TargetPhraseCollection::GetFilePos() const
{
return m_filePos;
}

View File

@ -46,7 +46,7 @@ class TargetPhraseCollection
protected:
typedef std::vector<TargetPhrase*> CollType;
CollType m_coll;
UINT64 m_filePos;
uint64_t m_filePos;
std::string m_debugStr;
public:
@ -67,7 +67,7 @@ public:
const TargetPhrase &GetTargetPhrase(size_t ind) const;
UINT64 GetFilePos() const;
uint64_t GetFilePos() const;
Moses::TargetPhraseCollection *ConvertToMoses(const std::vector<Moses::FactorType> &inputFactors
, const std::vector<Moses::FactorType> &outputFactors
@ -75,7 +75,7 @@ public:
, const std::vector<float> &weightT
, Vocab &vocab
, bool isSyntax) const;
void ReadFromFile(size_t tableLimit, UINT64 filePos, OnDiskWrapper &onDiskWrapper);
void ReadFromFile(size_t tableLimit, uint64_t filePos, OnDiskWrapper &onDiskWrapper);
const std::string GetDebugStr() const;
void SetDebugStr(const std::string &str);

View File

@ -38,7 +38,7 @@ bool Vocab::Load(OnDiskWrapper &onDiskWrapper)
Moses::Tokenize(tokens, line);
UTIL_THROW_IF2(tokens.size() != 2, "Vocab file corrupted");
const string &key = tokens[0];
m_vocabColl[key] = Moses::Scan<UINT64>(tokens[1]);
m_vocabColl[key] = Moses::Scan<uint64_t>(tokens[1]);
}
// create lookup
@ -48,7 +48,7 @@ bool Vocab::Load(OnDiskWrapper &onDiskWrapper)
CollType::const_iterator iter;
for (iter = m_vocabColl.begin(); iter != m_vocabColl.end(); ++iter) {
UINT32 vocabId = iter->second;
uint32_t vocabId = iter->second;
const std::string &word = iter->first;
m_lookup[vocabId] = word;
@ -63,13 +63,13 @@ void Vocab::Save(OnDiskWrapper &onDiskWrapper)
CollType::const_iterator iterVocab;
for (iterVocab = m_vocabColl.begin(); iterVocab != m_vocabColl.end(); ++iterVocab) {
const string &word = iterVocab->first;
UINT32 vocabId = iterVocab->second;
uint32_t vocabId = iterVocab->second;
file << word << " " << vocabId << endl;
}
}
UINT64 Vocab::AddVocabId(const std::string &str)
uint64_t Vocab::AddVocabId(const std::string &str)
{
// find string id
CollType::const_iterator iter = m_vocabColl.find(str);
@ -83,7 +83,7 @@ UINT64 Vocab::AddVocabId(const std::string &str)
}
}
UINT64 Vocab::GetVocabId(const std::string &str, bool &found) const
uint64_t Vocab::GetVocabId(const std::string &str, bool &found) const
{
// find string id
CollType::const_iterator iter = m_vocabColl.find(str);

View File

@ -34,19 +34,19 @@ class OnDiskWrapper;
class Vocab
{
protected:
typedef std::map<std::string, UINT64> CollType;
typedef std::map<std::string, uint64_t> CollType;
CollType m_vocabColl;
std::vector<std::string> m_lookup; // opposite of m_vocabColl
UINT64 m_nextId; // starts @ 1
uint64_t m_nextId; // starts @ 1
public:
Vocab()
:m_nextId(1) {
}
UINT64 AddVocabId(const std::string &str);
UINT64 GetVocabId(const std::string &str, bool &found) const;
const std::string &GetString(UINT64 vocabId) const {
uint64_t AddVocabId(const std::string &str);
uint64_t GetVocabId(const std::string &str, bool &found) const;
const std::string &GetString(uint64_t vocabId) const {
return m_lookup[vocabId];
}

View File

@ -57,10 +57,10 @@ void Word::CreateFromString(const std::string &inString, Vocab &vocab)
size_t Word::WriteToMemory(char *mem) const
{
UINT64 *vocabMem = (UINT64*) mem;
uint64_t *vocabMem = (uint64_t*) mem;
vocabMem[0] = m_vocabId;
size_t size = sizeof(UINT64);
size_t size = sizeof(uint64_t);
// is non-term
char bNonTerm = (char) m_isNonTerminal;
@ -72,10 +72,10 @@ size_t Word::WriteToMemory(char *mem) const
size_t Word::ReadFromMemory(const char *mem)
{
UINT64 *vocabMem = (UINT64*) mem;
uint64_t *vocabMem = (uint64_t*) mem;
m_vocabId = vocabMem[0];
size_t memUsed = sizeof(UINT64);
size_t memUsed = sizeof(uint64_t);
// is non-term
char bNonTerm;
@ -88,8 +88,8 @@ size_t Word::ReadFromMemory(const char *mem)
size_t Word::ReadFromFile(std::fstream &file)
{
const size_t memAlloc = sizeof(UINT64) + sizeof(char);
char mem[sizeof(UINT64) + sizeof(char)];
const size_t memAlloc = sizeof(uint64_t) + sizeof(char);
char mem[sizeof(uint64_t) + sizeof(char)];
file.read(mem, memAlloc);
size_t memUsed = ReadFromMemory(mem);

View File

@ -43,7 +43,7 @@ class Word
private:
bool m_isNonTerminal;
UINT64 m_vocabId;
uint64_t m_vocabId;
public:
explicit Word() {
@ -67,7 +67,7 @@ public:
size_t ReadFromMemory(const char *mem);
size_t ReadFromFile(std::fstream &file);
void SetVocabId(UINT32 vocabId) {
void SetVocabId(uint32_t vocabId) {
m_vocabId = vocabId;
}

View File

@ -2,7 +2,7 @@
#include <fstream>
#include <string>
#include <stdlib.h>
#include <cstdlib>
#include <cstring>
namespace

View File

@ -4,7 +4,7 @@
#include <iostream>
#include <cstring>
#include <string>
#include <stdlib.h>
#include <cstdlib>
#include "SuffixArray.h"
#include "TargetCorpus.h"

View File

@ -1,6 +1,6 @@
#include "PhrasePairCollection.h"
#include <stdlib.h>
#include <cstdlib>
#include <cstring>
#include <algorithm>

View File

@ -2,7 +2,7 @@
#include <fstream>
#include <string>
#include <stdlib.h>
#include <cstdlib>
#include <cstring>
namespace

View File

@ -2,7 +2,7 @@
#include <fstream>
#include <string>
#include <stdlib.h>
#include <cstdlib>
#include <cstring>
namespace

View File

@ -109,14 +109,17 @@ size_t lookup( string query )
return suffixArray.Count( queryString );
}
vector<string> tokenize( const char input[] )
// Duplicate of definition in util/tokenize.hh.
// TODO: Can we de-duplicate this? At the time of writing biconcor does not
// use util at all.
vector<string> tokenize(const char input[])
{
vector< string > token;
bool betweenWords = true;
int start=0;
int i=0;
for(; input[i] != '\0'; i++) {
bool isSpace = (input[i] == ' ' || input[i] == '\t');
int i;
for(i = 0; input[i] != '\0'; i++) {
const bool isSpace = (input[i] == ' ' || input[i] == '\t');
if (!isSpace && betweenWords) {
start = i;

View File

@ -5,11 +5,14 @@ namespace TOKENIZER_NAMESPACE {
#endif
Parameters::Parameters()
: cfg_path(0)
: nthreads(0)
, chunksize(2000)
, cfg_path(0)
, verbose_p(false)
, detag_p(false)
, alltag_p(false)
, escape_p(true)
, entities_p(false)
, escape_p(false)
, aggro_p(false)
, supersub_p(false)
, url_p(true)
@ -23,6 +26,10 @@ Parameters::Parameters()
, refined_p(false)
, unescape_p(false)
, drop_bad_p(false)
, split_p(false)
, notokenization_p(false)
, para_marks_p(false)
, split_breaks_p(false)
{
}

View File

@ -12,10 +12,13 @@ struct Parameters
std::string lang_iso;
std::vector<std::string> args;
std::string out_path;
int nthreads;
int chunksize;
const char *cfg_path;
bool verbose_p;
bool detag_p;
bool alltag_p;
bool entities_p;
bool escape_p;
bool aggro_p;
bool supersub_p;
@ -30,6 +33,10 @@ struct Parameters
bool refined_p;
bool unescape_p;
bool drop_bad_p;
bool split_p;
bool notokenization_p;
bool para_marks_p;
bool split_breaks_p;
Parameters();

File diff suppressed because it is too large Load Diff

View File

@ -26,12 +26,37 @@ class Tokenizer {
private:
static std::string cfg_dir;
typedef enum {
empty = 0,
blank,
upper, // upper case
letta, // extended word class (includes number, hyphen)
numba,
hyphn,
stops, // blank to stops are "extended word class" variants
quote, // init & fini = {',"}
pinit, // init (includes INVERT_*)
pfini, // fini
pfpct, // fini + pct
marks,
limit
} charclass_t;
std::size_t nthreads;
std::size_t chunksize;
std::string cfg_dir;
// non-breaking prefixes (numeric) utf8
std::set<std::string> nbpre_num_set;
// non-breaking prefixes (other) utf8
std::set<std::string> nbpre_gen_set;
// non-breaking prefixes (numeric) ucs4
std::set<std::wstring> nbpre_num_ucs4;
// non-breaking prefixes (other) ucs4
std::set<std::wstring> nbpre_gen_ucs4;
// compiled protected patterns
std::vector<re2::RE2 *> prot_pat_vec;
protected:
@ -42,6 +67,7 @@ protected:
bool latin_p; // is lang_iso "fr" or "it"
bool skip_xml_p;
bool skip_alltags_p;
bool entities_p;
bool escape_p;
bool unescape_p;
bool aggressive_hyphen_p;
@ -54,20 +80,44 @@ protected:
bool narrow_kana_p;
bool refined_p;
bool drop_bad_p;
bool splits_p;
bool verbose_p;
bool para_marks_p;
bool split_breaks_p;
// return counts of general and numeric prefixes loaded
std::pair<int,int> load_prefixes(std::ifstream& ifs); // used by init(), parameterized by lang_iso
// escapes specials into entities from the set &|"'[] (after tokenization, when enabled)
bool escape(std::string& inplace);
// in-place 1 line tokenizer, replaces input string, depends on wrapper to set-up invariants
void protected_tokenize(std::string& inplace);
public:
// used for boost::thread
struct VectorTokenizerCallable {
Tokenizer *tokenizer;
std::vector<std::string>& in;
std::vector<std::string>& out;
VectorTokenizerCallable(Tokenizer *_tokenizer,
std::vector<std::string>& _in,
std::vector<std::string>& _out)
: tokenizer(_tokenizer)
, in(_in)
, out(_out) {
};
// cfg_dir is assumed shared by all languages
static void set_config_dir(const std::string& _cfg_dir);
void operator()() {
out.resize(in.size());
for (std::size_t ii = 0; ii < in.size(); ++ii)
if (in[ii].empty())
out[ii] = in[ii];
else if (tokenizer->penn_p)
out[ii] = tokenizer->penn_tokenize(in[ii]);
else
out[ii] = tokenizer->quik_tokenize(in[ii]);
};
};
public:
Tokenizer(); // UNIMPL
@ -78,21 +128,46 @@ public:
~Tokenizer();
// required before other methods, may throw
void init();
void init(const char *cfg_dir_path = 0);
// streaming tokenizer reads from is, writes to os, preserving line breaks
void set_config_dir(const std::string& _cfg_dir);
// required after processing a contiguous sequence of lines when sentence splitting is on
void reset();
// simultaneous sentence splitting not yet implemented
bool splitting() const { return splits_p; }
// escapes chars the set &|"'<> after tokenization (moses special characters)
bool escape(std::string& inplace);
// used in detokenizer, converts entities into characters
// if escape_p is set, does not unescape moses special tokens, thus
// escape_p and unescape_p can be used together usefully
bool unescape(std::string& inplace);
// streaming select-tokenizer reads from is, writes to os, preserving line breaks (unless splitting)
std::size_t tokenize(std::istream& is, std::ostream& os);
// tokenize padded line buffer to return string
std::string tokenize(const std::string& buf);
// quik-tokenize padded line buffer to return string
std::string quik_tokenize(const std::string& buf);
// penn-tokenize padded line buffer to return string // untested
std::string penn_tokenize(const std::string& buf);
// select-tokenize padded line buffer to return string
std::string tokenize(const std::string& buf) {
return penn_p ? penn_tokenize(buf) : quik_tokenize(buf);
}
// tokenize with output argument
void tokenize(const std::string& buf, std::string& outs) {
outs = tokenize(buf);
}
// tokenize to a vector
std::vector<std::string> tokens(const std::string& in) {
std::istringstream tokss(tokenize(in));
std::istringstream tokss(penn_p ? penn_tokenize(in) : tokenize(in));
std::vector<std::string> outv;
std::copy(std::istream_iterator<std::string>(tokss),
std::istream_iterator<std::string>(),
@ -117,6 +192,12 @@ public:
return detokenize(oss.str());
}
// split a string on sentence boundaries (approximately)
std::vector<std::string> splitter(const std::string &istr,bool *continuation_p = 0);
// split sentences from input stream and write one per line on output stream
std::pair<std::size_t,std::size_t> splitter(std::istream& is, std::ostream& os);
}; // end class Tokenizer
#ifdef TOKENIZER_NAMESPACE

View File

@ -16,10 +16,12 @@ usage(const char *path)
std::cerr << "Usage: " << path << "[-{v|x|p|a|e|s|u|n|N]* [LL] [-{c|o} PATH]* INFILE*" << std::endl;
std::cerr << " -a -- aggressive hyphenization" << std::endl;
std::cerr << " -b -- drop bad bytes" << std::endl;
std::cerr << " -B -- splitter will split on linebreak" << std::endl;
std::cerr << " -c DIR -- config (pattern) file directory" << std::endl;
std::cerr << " -d -- downcase" << std::endl;
std::cerr << " -D -- detokenize" << std::endl;
std::cerr << " -e -- do not escape entities during tokenization" << std::endl;
std::cerr << " -E -- preserve entities during tokenization" << std::endl;
std::cerr << " -k -- narrow kana" << std::endl;
std::cerr << " -n -- narrow latin" << std::endl;
std::cerr << " -N -- normalize" << std::endl;
@ -27,12 +29,16 @@ usage(const char *path)
std::cerr << " -p -- penn treebank style" << std::endl;
std::cerr << " -r -- refined contraction and quantity conjoining" << std::endl;
std::cerr << " -s -- super- and sub-script conjoining" << std::endl;
std::cerr << " -S -- buffer and sentence-split lines" << std::endl;
std::cerr << " -T -- do not tokenize, just split, no <P> marks" << std::endl;
std::cerr << " -t N[,C] -- use N threads (1), chunksize C lines" << std::endl;
std::cerr << " -u -- disable url handling" << std::endl;
std::cerr << " -U -- unescape entities before tokenization, after detokenization" << std::endl;
std::cerr << " -v -- verbose" << std::endl;
std::cerr << " -w -- word filter" << std::endl;
std::cerr << " -x -- skip xml tag lines" << std::endl;
std::cerr << " -y -- skip all xml tags" << std::endl;
std::cerr << " -X -- split only, with <P> marks" << std::endl;
std::cerr << "Default is -c ., stdin, stdout." << std::endl;
std::cerr << "LL in en,fr,it affect contraction. LL selects nonbreaking prefix file" << std::endl;
std::cerr << "nonbreaking_prefix.LL is sought in getenv('TOKENIZER_SHARED_DIR')." << std::endl;
@ -83,15 +89,35 @@ copy_words(Tokenizer& tize, std::istream& ifs, std::ostream& ofs) {
int nlines = 0;
std::string line;
while (ifs.good() && std::getline(ifs,line)) {
if (line.empty()) continue;
if (line.empty())
continue;
std::vector<std::string> tokens(tize.tokens(line));
int count = 0;
bool was_break = false;
for (auto& token: tokens) {
if (token.empty()) {
if (count || was_break) {
ofs << std::endl;
count = 0;
nlines++;
was_break = true;
continue;
}
}
was_break = false;
std::string word(token_word(token));
if (word.empty()) continue;
ofs << word << ' ';
count++;
if (word.empty()) {
continue;
}
if (count++) {
ofs << ' ';
}
ofs << word;
}
if (count) {
ofs << std::endl;
nlines++;
@ -104,13 +130,16 @@ copy_words(Tokenizer& tize, std::istream& ifs, std::ostream& ofs) {
int main(int ac, char **av)
{
int rc = 0;
Parameters params;
Parameters params;
const char *prog = av[0];
bool next_cfg_p = false;
bool next_output_p = false;
bool next_threads_p = false;
bool detokenize_p = std::strstr(av[0],"detokenize") != 0;
if (!detokenize_p)
params.split_p = std::strstr(av[0],"splitter") != 0;
while (++av,--ac) {
if (**av == '-') {
switch (av[0][1]) {
@ -120,6 +149,9 @@ int main(int ac, char **av)
case 'b':
params.drop_bad_p = true;
break;
case 'B':
params.split_breaks_p = true;
break;
case 'c':
next_cfg_p = true;
break;
@ -127,10 +159,13 @@ int main(int ac, char **av)
params.downcase_p = true;
break;
case 'D':
detokenize_p = true;
detokenize_p = !detokenize_p;
break;
case 'e':
params.escape_p = false;
params.escape_p = !params.escape_p;
break;
case 'E':
params.entities_p = true;
break;
case 'h':
usage(prog);
@ -156,6 +191,16 @@ int main(int ac, char **av)
case 's':
params.supersub_p = true;
break;
case 'S':
params.split_p = !params.split_p;
break;
case 'T':
params.notokenization_p = true;
params.para_marks_p = false;
break;
case 't':
next_threads_p = true;
break;
case 'U':
params.unescape_p = true;
break;
@ -171,6 +216,10 @@ int main(int ac, char **av)
case 'x':
params.detag_p = true;
break;
case 'X':
params.notokenization_p = true;
params.para_marks_p = true;
break;
case 'y':
params.alltag_p = true;
break;
@ -181,7 +230,7 @@ int main(int ac, char **av)
std::cerr << "Unknown option: " << *av << std::endl;
::exit(1);
}
} else if (params.lang_iso.empty() && strlen(*av) == 2) {
} else if (params.lang_iso.empty() && strlen(*av) == 2 && !isdigit(**av)) {
params.lang_iso = *av;
} else if (next_output_p) {
next_output_p = false;
@ -189,6 +238,14 @@ int main(int ac, char **av)
} else if (next_cfg_p) {
next_cfg_p = false;
params.cfg_path = *av;
} else if (next_threads_p) {
next_threads_p = false;
char *comma = strchr(*av,',');
if (comma) {
*comma++ = 0;
params.chunksize = std::strtoul(comma,0,0);
}
params.nthreads = std::strtoul(*av,0,0);
} else {
params.args.push_back(std::string(*av));
}
@ -230,7 +287,6 @@ int main(int ac, char **av)
if (params.verbose_p) {
std::cerr << "config path: " << params.cfg_path << std::endl;
}
Tokenizer::set_config_dir(std::string(params.cfg_path));
}
std::unique_ptr<std::ofstream> pofs = 0;
@ -244,16 +300,16 @@ int main(int ac, char **av)
Tokenizer tize(params);
tize.init();
size_t nlines = 0;
std::pair<std::size_t,std::size_t> plines = { 0, 0 };
if (params.words_p) {
if (params.args.empty()) {
nlines += copy_words(tize,std::cin,ofs);
plines.first += copy_words(tize,std::cin,ofs);
} else {
for (std::string& arg : params.args) {
try {
std::ifstream ifs(arg.c_str());
nlines += copy_words(tize,ifs,ofs);
plines.first += copy_words(tize,ifs,ofs);
} catch (...) {
std::cerr << "Exception extracting words from path " << arg << std::endl;
}
@ -261,18 +317,22 @@ int main(int ac, char **av)
}
} else if (params.args.empty()) {
if (detokenize_p) {
nlines = tize.detokenize(std::cin,ofs);
plines.first = tize.detokenize(std::cin,ofs);
} else if (params.notokenization_p) {
plines = tize.splitter(std::cin,ofs);
} else {
nlines = tize.tokenize(std::cin,ofs);
plines.first = tize.tokenize(std::cin,ofs);
}
} else {
for (std::string& arg : params.args) {
try {
std::ifstream ifs(arg.c_str());
if (detokenize_p) {
nlines = tize.detokenize(ifs,ofs);
plines.first = tize.detokenize(ifs,ofs);
} else if (params.notokenization_p) {
plines = tize.splitter(ifs,ofs);
} else {
nlines = tize.tokenize(ifs,ofs);
plines.first = tize.tokenize(ifs,ofs);
}
} catch (...) {
std::cerr << "Exception tokenizing from path " << arg << std::endl;
@ -280,9 +340,12 @@ int main(int ac, char **av)
}
}
if (params.verbose_p)
std::cerr << "%%% " << nlines << " lines." << std::endl;
if (params.verbose_p) {
std::cerr << "%%% " << plines.first << " lines." << std::endl;
if (plines.second) {
std::cerr << "%%% " << plines.second << " sentences." << std::endl;
}
}
return rc;
}

View File

@ -13,8 +13,8 @@
#ifndef LOSSYCOUNTER_H
#define LOSSYCOUNTER_H
#include <stddef.h>
#include <math.h>
#include <cstddef>
#include <cmath>
#ifdef USE_UNORDERED_MAP
#include <tr1/unordered_map>
#else

View File

@ -13,7 +13,7 @@
#include <string>
#include <iostream>
#include <fstream>
#include <stdlib.h>
#include <cstdlib>
#include <iomanip>
#include "phrase-extract.h"

View File

@ -5,13 +5,13 @@
#include <sstream>
#include <string>
#include <iostream>
#include <stdio.h>
#include <cstdio>
#include <unistd.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <netinet/in.h>
#include <netdb.h>
#include <string.h>
#include <cstring>
#include <map>
struct Cache {
@ -45,8 +45,8 @@ struct LMClient {
exit(1);
}
bzero((char *)&server, sizeof(server));
bcopy(hp->h_addr, (char *)&server.sin_addr, hp->h_length);
memset(&server, '\0', sizeof(server));
memcpy((char *)&server.sin_addr, hp->h_addr, hp->h_length);
server.sin_family = hp->h_addrtype;
server.sin_port = htons(port);

46
contrib/mada/qsub-madamira.perl Executable file
View File

@ -0,0 +1,46 @@
#!/usr/bin/env perl
use warnings;
use strict;
use File::Slurp;
use File::Basename;
use Cwd 'abs_path';
my $splitDir = $ARGV[0];
$splitDir = abs_path($splitDir);
my @files = read_dir $splitDir;
my $qsubDir=dirname($splitDir) ."/qsub";
print STDERR "qsubDir=$qsubDir\n";
`mkdir -p $qsubDir`;
my $out2Dir=dirname($splitDir) ."/out2";
print STDERR "out2Dir=$out2Dir\n";
`mkdir -p $out2Dir`;
for my $file ( @files ) {
print STDERR "$file ";
my $qsubFile = "$qsubDir/$file.sh";
open(RUN_FILE, ">$qsubFile");
print RUN_FILE "#!/usr/bin/env bash\n"
."#PBS -d/scratch/hh65/workspace/experiment/ar-en \n"
."#PBS -l mem=5gb \n\n"
."export PATH=\"/scratch/statmt/bin:/share/apps/NYUAD/perl/gcc_4.9.1/5.20.1/bin:/share/apps/NYUAD/jdk/1.8.0_31/bin:/share/apps/NYUAD/zlib/gcc_4.9.1/1.2.8/bin:/share/apps/NYUAD/cmake/gcc_4.9.1/3.1.0-rc3/bin:/share/apps/NYUAD/boost/gcc_4.9.1/openmpi_1.8.3/1.57.0/bin:/share/apps/NYUAD/openmpi/gcc_4.9.1/1.8.3/bin:/share/apps/NYUAD/python/gcc_4.9.1/2.7.9/bin:/share/apps/NYUAD/gcc/binutils/2.21/el6/bin:/share/apps/NYUAD/gcc/gcc/4.9.1/el6/bin:/usr/lib64/qt-3.3/bin:/usr/local/bin:/bin:/usr/bin:/usr/local/sbin:/usr/sbin:/sbin:/opt/bio/ncbi/bin:/opt/bio/mpiblast/bin:/opt/bio/EMBOSS/bin:/opt/bio/clustalw/bin:/opt/bio/tcoffee/bin:/opt/bio/hmmer/bin:/opt/bio/phylip/exe:/opt/bio/mrbayes:/opt/bio/fasta:/opt/bio/glimmer/bin:/opt/bio/glimmer/scripts:/opt/bio/gromacs/bin:/opt/bio/gmap/bin:/opt/bio/tigr/bin:/opt/bio/autodocksuite/bin:/opt/bio/wgs/bin:/opt/ganglia/bin:/opt/ganglia/sbin:/opt/bin:/usr/java/latest/bin:/opt/pdsh/bin:/opt/rocks/bin:/opt/rocks/sbin:/opt/torque/bin:/opt/torque/sbin:/home/hh65/bin:/home/hh65/bin\" \n"
."module load NYUAD/2.0 \n"
."module load gcc python/2.7.9 openmpi/1.8.3 boost cmake zlib jdk perl expat \n"
."cd /scratch/statmt/MADAMIRA-release-20140709-1.0 \n";
print RUN_FILE "java -Xmx2500m -Xms2500m -XX:NewRatio=3 -jar /scratch/statmt/MADAMIRA-release-20140709-1.0/MADAMIRA.jar "
."-rawinput $splitDir/$file -rawoutdir $out2Dir -rawconfig /scratch/statmt/MADAMIRA-release-20140709-1.0/samples/sampleConfigFile.xml \n";
close(RUN_FILE);
my $cmd = "qsub $qsubFile";
`$cmd`;
}

View File

@ -46,6 +46,7 @@ namespace mpi = boost::mpi;
#include "moses/FF/PhrasePairFeature.h"
#include "moses/FF/WordPenaltyProducer.h"
#include "moses/LM/Base.h"
#include "util/random.hh"
using namespace Mira;
using namespace std;
@ -54,6 +55,7 @@ namespace po = boost::program_options;
int main(int argc, char** argv)
{
util::rand_init();
size_t rank = 0;
size_t size = 1;
#ifdef MPI_ENABLE

View File

@ -25,6 +25,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "moses/Word.h"
#include "moses/FF/FeatureFunction.h"
#include "Decoder.h"
#include "util/random.hh"
typedef std::map<const Moses::FeatureFunction*, std::vector< float > > ProducerWeightMap;
typedef std::pair<const Moses::FeatureFunction*, std::vector< float > > ProducerWeightPair;
@ -37,8 +38,7 @@ template <class T> bool from_string(T& t, const std::string& s, std::ios_base& (
struct RandomIndex {
ptrdiff_t operator()(ptrdiff_t max) {
srand(time(0)); // Initialize random number generator with current time.
return static_cast<ptrdiff_t> (rand() % max);
return util::rand_excl(max);
}
};

View File

@ -7,8 +7,8 @@
<Project Name="lm" Path="lm/lm.project" Active="No"/>
<Project Name="OnDiskPt" Path="OnDiskPt/OnDiskPt.project" Active="No"/>
<Project Name="search" Path="search/search.project" Active="No"/>
<Project Name="moses" Path="moses/moses.project" Active="No"/>
<Project Name="moses-cmd" Path="moses-cmd/moses-cmd.project" Active="Yes"/>
<Project Name="moses" Path="moses/moses.project" Active="Yes"/>
<Project Name="moses-cmd" Path="moses-cmd/moses-cmd.project" Active="No"/>
<Project Name="score" Path="score/score.project" Active="No"/>
<Project Name="consolidate" Path="consolidate/consolidate.project" Active="No"/>
<BuildMatrix>

View File

@ -4,11 +4,12 @@
* Created on: 28 Feb 2014
* Author: hieu
*/
#include <stdlib.h>
#include <stdio.h>
#include <cstdlib>
#include <cstdio>
#include <algorithm>
#include <fstream>
#include <boost/algorithm/string/predicate.hpp>
#include <boost/filesystem.hpp>
#include "EnOpenNLPChunker.h"
#include "moses/Util.h"
@ -28,10 +29,11 @@ EnOpenNLPChunker::~EnOpenNLPChunker() {
void EnOpenNLPChunker::Process(std::istream &in, std::ostream &out, const vector<string> &filterList)
{
const boost::filesystem::path
inPath = boost::filesystem::unique_path(),
outPath = boost::filesystem::unique_path();
// read all input to a temp file
char *ptr = tmpnam(NULL);
string inStr(ptr);
ofstream inFile(ptr);
ofstream inFile(inPath.c_str());
string line;
while (getline(in, line)) {
@ -40,21 +42,18 @@ void EnOpenNLPChunker::Process(std::istream &in, std::ostream &out, const vector
}
inFile.close();
ptr = tmpnam(NULL);
string outStr(ptr);
// execute chunker
string cmd = "cat " + inStr + " | "
string cmd = "cat " + inPath.native() + " | "
+ m_openNLPPath + "/bin/opennlp POSTagger "
+ m_openNLPPath + "/models/en-pos-maxent.bin | "
+ m_openNLPPath + "/bin/opennlp ChunkerME "
+ m_openNLPPath + "/models/en-chunker.bin > "
+ outStr;
+ outPath.native();
//g << "Executing:" << cmd << endl;
int ret = system(cmd.c_str());
// read result of chunker and output as Moses xml trees
ifstream outFile(outStr.c_str());
ifstream outFile(outPath.c_str());
size_t lineNum = 0;
while (getline(outFile, line)) {
@ -66,8 +65,8 @@ void EnOpenNLPChunker::Process(std::istream &in, std::ostream &out, const vector
outFile.close();
// clean up temporary files
remove(inStr.c_str());
remove(outStr.c_str());
remove(inPath.c_str());
remove(outPath.c_str());
}
void EnOpenNLPChunker::MosesReformat(const string &line, std::ostream &out, const vector<string> &filterList)

View File

@ -1,5 +1,22 @@
<?xml version="1.0" encoding="UTF-8"?>
<CodeLite_Project Name="manual-label" InternalType="Console">
<Plugins>
<Plugin Name="CMakePlugin">
<![CDATA[[{
"name": "Debug",
"enabled": false,
"buildDirectory": "build",
"sourceDirectory": "$(ProjectPath)",
"generator": "",
"buildType": "",
"arguments": [],
"parentProject": ""
}]]]>
</Plugin>
<Plugin Name="qmake">
<![CDATA[00010001N0005Debug000000000000]]>
</Plugin>
</Plugins>
<Description/>
<Dependencies/>
<VirtualDirectory Name="manual-label">
@ -14,6 +31,8 @@
<File Name="Main.cpp"/>
<File Name="Main.h"/>
</VirtualDirectory>
<Dependencies Name="Debug"/>
<Dependencies Name="Release"/>
<Settings Type="Executable">
<GlobalSettings>
<Compiler Options="" C_Options="" Assembler="">
@ -33,6 +52,8 @@
<Linker Options="" Required="yes">
<LibraryPath Value="/Users/hieu/workspace/github/mosesdecoder/boost/lib64"/>
<Library Value="boost_program_options"/>
<Library Value="boost_filesystem"/>
<Library Value="boost_system"/>
</Linker>
<ResourceCompiler Options="" Required="no"/>
<General OutputFile="$(IntermediateDirectory)/$(ProjectName)" IntermediateDirectory="./Debug" Command="./$(ProjectName)" CommandArguments="" UseSeparateDebugArgs="no" DebugArguments="" WorkingDirectory="$(IntermediateDirectory)" PauseExecWhenProcTerminates="yes" IsGUIProgram="no" IsEnabled="yes"/>
@ -107,6 +128,4 @@
</Completion>
</Configuration>
</Settings>
<Dependencies Name="Debug"/>
<Dependencies Name="Release"/>
</CodeLite_Project>

View File

@ -1,5 +1,22 @@
<?xml version="1.0" encoding="UTF-8"?>
<CodeLite_Project Name="moses-cmd" InternalType="Console">
<Plugins>
<Plugin Name="CMakePlugin">
<![CDATA[[{
"name": "Debug",
"enabled": false,
"buildDirectory": "build",
"sourceDirectory": "$(ProjectPath)",
"generator": "",
"buildType": "",
"arguments": [],
"parentProject": ""
}]]]>
</Plugin>
<Plugin Name="qmake">
<![CDATA[00010001N0005Debug000000000000]]>
</Plugin>
</Plugins>
<Description/>
<Dependencies/>
<VirtualDirectory Name="src"/>
@ -9,6 +26,14 @@
<File Name="../../../moses-cmd/MainVW.cpp" ExcludeProjConfig="Debug"/>
<File Name="../../../moses-cmd/MainVW.h" ExcludeProjConfig="Debug"/>
</VirtualDirectory>
<Dependencies Name="Release"/>
<Dependencies Name="Debug">
<Project Name="OnDiskPt"/>
<Project Name="lm"/>
<Project Name="moses"/>
<Project Name="search"/>
<Project Name="util"/>
</Dependencies>
<Settings Type="Executable">
<GlobalSettings>
<Compiler Options="" C_Options="" Assembler="">
@ -53,7 +78,7 @@
<Library Value="rt"/>
</Linker>
<ResourceCompiler Options="" Required="no"/>
<General OutputFile="$(IntermediateDirectory)/$(ProjectName)" IntermediateDirectory="./Debug" Command="./$(ProjectName)" CommandArguments="" UseSeparateDebugArgs="no" DebugArguments="" WorkingDirectory="$(IntermediateDirectory)" PauseExecWhenProcTerminates="yes" IsGUIProgram="no" IsEnabled="yes"/>
<General OutputFile="$(IntermediateDirectory)/$(ProjectName)" IntermediateDirectory="./Debug" Command="./$(ProjectName)" CommandArguments="-f /var/folders/c4/2p48fcwx611dmkdqq44mbblm0000gn/T/ZVd8xvuJAR.ini -i /Users/hieu/workspace/github/moses-regression-tests/tests/phrase.basic-surface-binptable.oldformat/to-translate.txt" UseSeparateDebugArgs="yes" DebugArguments="" WorkingDirectory="$(IntermediateDirectory)" PauseExecWhenProcTerminates="yes" IsGUIProgram="no" IsEnabled="yes"/>
<Environment EnvVarSetName="&lt;Use Defaults&gt;" DbgSetName="&lt;Use Defaults&gt;">
<![CDATA[]]>
</Environment>
@ -125,12 +150,4 @@
</Completion>
</Configuration>
</Settings>
<Dependencies Name="Release"/>
<Dependencies Name="Debug">
<Project Name="OnDiskPt"/>
<Project Name="lm"/>
<Project Name="moses"/>
<Project Name="search"/>
<Project Name="util"/>
</Dependencies>
</CodeLite_Project>

View File

@ -474,8 +474,6 @@
<File Name="../../../moses/FF/DistortionScoreProducer.h"/>
<File Name="../../../moses/FF/DynamicCacheBasedLanguageModel.cpp"/>
<File Name="../../../moses/FF/DynamicCacheBasedLanguageModel.h"/>
<File Name="../../../moses/FF/ExternalFeature.cpp"/>
<File Name="../../../moses/FF/ExternalFeature.h"/>
<File Name="../../../moses/FF/Factory.cpp"/>
<File Name="../../../moses/FF/Factory.h"/>
<File Name="../../../moses/FF/FeatureFunction.cpp"/>

View File

@ -20,7 +20,7 @@
#error Cython requires Python 2.4+.
#else
#define CYTHON_ABI "0_20_1post0"
#include <stddef.h> /* For offsetof */
#include <cstddef> /* For offsetof */
#ifndef offsetof
#define offsetof(type, member) ( (size_t) & ((type*)0) -> member )
#endif
@ -343,7 +343,7 @@ void __Pyx_call_destructor(T* x) {
#if defined(WIN32) || defined(MS_WINDOWS)
#define _USE_MATH_DEFINES
#endif
#include <math.h>
#include <cmath>
#define __PYX_HAVE__moses__dictree
#define __PYX_HAVE_API__moses__dictree
#include "string.h"
@ -1131,7 +1131,7 @@ bad:
static CYTHON_INLINE int __Pyx_PyObject_Append(PyObject* L, PyObject* x); /*proto*/
#include <string.h>
#include <cstring>
static int __Pyx_SetVtable(PyObject *dict, void *vtable); /*proto*/

View File

@ -31,14 +31,14 @@
///////////////////////////////////////////////////////////////////////////////
// if you are not using precompiled headers then include these lines:
//#include <windows.h>
//#include <stdio.h>
//#include <cstdio>
//#include <tchar.h>
///////////////////////////////////////////////////////////////////////////////
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <cstdio>
#include <cstring>
#include <cmath>
#include "WIN32_functions.h"
@ -228,4 +228,4 @@ double lgamma(int x)
sum += coefs[j]/++y;
}
return -tmp+log(2.5066282746310005*sum/(double)x);
}
}

View File

@ -42,6 +42,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "RelativeEntropyCalc.h"
#include "LexicalReordering.h"
#include "LexicalReorderingState.h"
#include "util/random.hh"
#ifdef HAVE_PROTOBUF
#include "hypergraph.pb.h"
@ -205,7 +206,7 @@ int main(int argc, char** argv)
//initialise random numbers
srand(time(NULL));
rand_init();
// set up read/writing class
IOWrapper* ioWrapper = GetIOWrapper(staticData);

View File

@ -4,10 +4,10 @@
#include <iomanip>
#include <vector>
#include <map>
#include <stdlib.h>
#include <math.h>
#include <cstdlib>
#include <cmath>
#include <algorithm>
#include <stdio.h>
#include <cstdio>
#include "TrellisPathList.h"
#include "TrellisPath.h"
#include "StaticData.h"

View File

@ -11,7 +11,7 @@ else
{
with-xmlrpc-c = [ option.get "with-xmlrpc-c" ] ;
if $(with-xmlrpc-c) {
echo Bulding mosesserver. ;
echo While building mosesserver ... ;
echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" ;
echo "!!! You are linking the XMLRPC-C library; Do NOT use v.1.25.29 !!!" ;
echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" ;

View File

@ -37,6 +37,7 @@ int main(int argc, char** argv)
#include "moses/Manager.h"
#include "moses/StaticData.h"
#include "moses/ThreadPool.h"
#include "moses/TranslationTask.h"
#include "moses/TranslationModel/PhraseDictionaryDynSuffixArray.h"
#include "moses/TranslationModel/PhraseDictionaryMultiModelCounts.h"
#if PT_UG
@ -232,8 +233,8 @@ public:
/**
* Required so that translations can be sent to a thread pool.
**/
class TranslationTask : public virtual Moses::Task {
public:
class TranslationTask : public virtual Moses::TranslationTask {
protected:
TranslationTask(xmlrpc_c::paramList const& paramList,
boost::condition_variable& cond, boost::mutex& mut)
: m_paramList(paramList),
@ -242,23 +243,33 @@ public:
m_done(false)
{}
public:
static boost::shared_ptr<TranslationTask>
create(xmlrpc_c::paramList const& paramList,
boost::condition_variable& cond, boost::mutex& mut)
{
boost::shared_ptr<TranslationTask> ret(new TranslationTask(paramList, cond, mut));
ret->m_self = ret;
return ret;
}
virtual bool DeleteAfterExecution() {return false;}
bool IsDone() const {return m_done;}
const map<string, xmlrpc_c::value>& GetRetData() { return m_retData;}
virtual void Run() {
virtual void
Run()
{
using namespace xmlrpc_c;
const params_t params = m_paramList.getStruct(0);
m_paramList.verifyEnd(1);
params_t::const_iterator si = params.find("text");
if (si == params.end()) {
throw xmlrpc_c::fault(
"Missing source text",
xmlrpc_c::fault::CODE_PARSE);
throw fault("Missing source text", fault::CODE_PARSE);
}
const string source((xmlrpc_c::value_string(si->second)));
const string source = value_string(si->second);
XVERBOSE(1,"Input: " << source << endl);
si = params.find("align");
@ -272,7 +283,7 @@ public:
si = params.find("report-all-factors");
bool reportAllFactors = (si != params.end());
si = params.find("nbest");
int nbest_size = (si == params.end()) ? 0 : int(xmlrpc_c::value_int(si->second));
int nbest_size = (si == params.end()) ? 0 : int(value_int(si->second));
si = params.find("nbest-distinct");
bool nbest_distinct = (si != params.end());
@ -281,21 +292,25 @@ public:
vector<float> multiModelWeights;
si = params.find("lambda");
if (si != params.end()) {
xmlrpc_c::value_array multiModelArray = xmlrpc_c::value_array(si->second);
vector<xmlrpc_c::value> multiModelValueVector(multiModelArray.vectorValueValue());
for (size_t i=0;i < multiModelValueVector.size();i++) {
multiModelWeights.push_back(xmlrpc_c::value_double(multiModelValueVector[i]));
}
}
if (si != params.end())
{
value_array multiModelArray = value_array(si->second);
vector<value> multiModelValueVector(multiModelArray.vectorValueValue());
for (size_t i=0;i < multiModelValueVector.size();i++)
{
multiModelWeights.push_back(value_double(multiModelValueVector[i]));
}
}
si = params.find("model_name");
if (si != params.end() && multiModelWeights.size() > 0) {
const string model_name = xmlrpc_c::value_string(si->second);
PhraseDictionaryMultiModel* pdmm = (PhraseDictionaryMultiModel*) FindPhraseDictionary(model_name);
if (si != params.end() && multiModelWeights.size() > 0)
{
const string model_name = value_string(si->second);
PhraseDictionaryMultiModel* pdmm
= (PhraseDictionaryMultiModel*) FindPhraseDictionary(model_name);
pdmm->SetTemporaryMultiModelWeightsVector(multiModelWeights);
}
}
const StaticData &staticData = StaticData::Instance();
//Make sure alternative paths are retained, if necessary
@ -306,13 +321,14 @@ public:
stringstream out, graphInfo, transCollOpts;
if (staticData.IsSyntax()) {
TreeInput tinput;
const vector<FactorType>&
inputFactorOrder = staticData.GetInputFactorOrder();
stringstream in(source + "\n");
tinput.Read(in,inputFactorOrder);
ChartManager manager(tinput);
if (staticData.IsSyntax())
{
boost::shared_ptr<TreeInput> tinput(new TreeInput);
const vector<FactorType>& IFO = staticData.GetInputFactorOrder();
istringstream in(source + "\n");
tinput->Read(in,IFO);
ttasksptr task = Moses::TranslationTask::create(tinput);
ChartManager manager(task);
manager.Decode();
const ChartHypothesis *hypo = manager.GetBestHypothesis();
outputChartHypo(out,hypo);
@ -320,57 +336,50 @@ public:
// const size_t translationId = tinput.GetTranslationId();
std::ostringstream sgstream;
manager.OutputSearchGraphMoses(sgstream);
m_retData.insert(pair<string, xmlrpc_c::value>("sg", xmlrpc_c::value_string(sgstream.str())));
m_retData["sg"] = value_string(sgstream.str());
}
} else {
size_t lineNumber = 0; // TODO: Include sentence request number here?
Sentence sentence;
sentence.SetTranslationId(lineNumber);
const vector<FactorType> &
inputFactorOrder = staticData.GetInputFactorOrder();
stringstream in(source + "\n");
sentence.Read(in,inputFactorOrder);
Manager manager(sentence);
manager.Decode();
}
else
{
// size_t lineNumber = 0; // TODO: Include sentence request number here?
boost::shared_ptr<Sentence> sentence(new Sentence(0,source));
ttasksptr task = Moses::TranslationTask::create(sentence);
Manager manager(task);
manager.Decode();
const Hypothesis* hypo = manager.GetBestHypothesis();
vector<xmlrpc_c::value> alignInfo;
outputHypo(out,hypo,addAlignInfo,alignInfo,reportAllFactors);
if (addAlignInfo) {
m_retData.insert(pair<string, xmlrpc_c::value>("align", xmlrpc_c::value_array(alignInfo)));
}
if (addWordAlignInfo) {
stringstream wordAlignment;
hypo->OutputAlignment(wordAlignment);
vector<xmlrpc_c::value> alignments;
string alignmentPair;
while (wordAlignment >> alignmentPair) {
if (addAlignInfo) m_retData["align"] = value_array(alignInfo);
if (addWordAlignInfo)
{
stringstream wordAlignment;
hypo->OutputAlignment(wordAlignment);
vector<xmlrpc_c::value> alignments;
string alignmentPair;
while (wordAlignment >> alignmentPair)
{
int pos = alignmentPair.find('-');
map<string, xmlrpc_c::value> wordAlignInfo;
wordAlignInfo["source-word"] = xmlrpc_c::value_int(atoi(alignmentPair.substr(0, pos).c_str()));
wordAlignInfo["target-word"] = xmlrpc_c::value_int(atoi(alignmentPair.substr(pos + 1).c_str()));
alignments.push_back(xmlrpc_c::value_struct(wordAlignInfo));
}
m_retData.insert(pair<string, xmlrpc_c::value_array>("word-align", alignments));
}
if (addGraphInfo) {
insertGraphInfo(manager,m_retData);
}
if (addTopts) {
insertTranslationOptions(manager,m_retData);
}
if (nbest_size>0) {
outputNBest(manager, m_retData, nbest_size, nbest_distinct,
reportAllFactors, addAlignInfo, addScoreBreakdown);
}
wordAlignInfo["source-word"]
= value_int(atoi(alignmentPair.substr(0, pos).c_str()));
wordAlignInfo["target-word"]
= value_int(atoi(alignmentPair.substr(pos + 1).c_str()));
alignments.push_back(value_struct(wordAlignInfo));
}
m_retData["word-align"] = value_array(alignments);
}
if (addGraphInfo) insertGraphInfo(manager,m_retData);
if (addTopts) insertTranslationOptions(manager,m_retData);
if (nbest_size > 0)
{
outputNBest(manager, m_retData, nbest_size, nbest_distinct,
reportAllFactors, addAlignInfo, addScoreBreakdown);
}
(const_cast<StaticData&>(staticData)).SetOutputSearchGraph(false);
}
pair<string, xmlrpc_c::value>
text("text", xmlrpc_c::value_string(out.str()));
m_retData.insert(text);
}
m_retData["text"] = value_string(out.str());
XVERBOSE(1,"Output: " << out.str() << endl);
{
boost::lock_guard<boost::mutex> lock(m_mut);
@ -380,9 +389,12 @@ public:
}
void outputHypo(ostream& out, const Hypothesis* hypo, bool addAlignmentInfo, vector<xmlrpc_c::value>& alignInfo, bool reportAllFactors = false) {
void outputHypo(ostream& out, const Hypothesis* hypo,
bool addAlignmentInfo, vector<xmlrpc_c::value>& alignInfo,
bool reportAllFactors = false) {
if (hypo->GetPrevHypo() != NULL) {
outputHypo(out,hypo->GetPrevHypo(),addAlignmentInfo, alignInfo, reportAllFactors);
outputHypo(out,hypo->GetPrevHypo(),addAlignmentInfo,
alignInfo, reportAllFactors);
Phrase p = hypo->GetCurrTargetPhrase();
if(reportAllFactors) {
out << p << " ";
@ -524,7 +536,7 @@ public:
{
// should the score breakdown be reported in a more structured manner?
ostringstream buf;
path.GetScoreBreakdown().OutputAllFeatureScores(buf);
path.GetScoreBreakdown()->OutputAllFeatureScores(buf);
nBestXMLItem["fvals"] = xmlrpc_c::value_string(buf.str());
}
@ -595,7 +607,7 @@ public:
boost::condition_variable cond;
boost::mutex mut;
typedef ::TranslationTask TTask;
boost::shared_ptr<TTask> task(new TTask(paramList,cond,mut));
boost::shared_ptr<TTask> task = TTask::create(paramList,cond,mut);
m_threadPool.Submit(task);
boost::unique_lock<boost::mutex> lock(mut);
while (!task->IsDone()) {

View File

@ -31,14 +31,14 @@
///////////////////////////////////////////////////////////////////////////////
// if you are not using precompiled headers then include these lines:
//#include <windows.h>
//#include <stdio.h>
//#include <cstdio>
//#include <tchar.h>
///////////////////////////////////////////////////////////////////////////////
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <cstdio>
#include <cstring>
#include <cmath>
#include "WIN32_functions.h"
@ -228,4 +228,4 @@ double lgamma(int x)
sum += coefs[j]/++y;
}
return -tmp+log(2.5066282746310005*sum/(double)x);
}
}

View File

@ -24,7 +24,7 @@
#ifndef _NL_CPT__
#define _NL_CPT__
//#include <stdlib.h>
//#include <cstdlib>
//#include <vector>
//#include <string>
//#include <cassert>

View File

@ -24,7 +24,7 @@
#ifndef _NL_LIST_ //////////////////////////////////////////////////////////////
#define _NL_LIST_ //////////////////////////////////////////////////////////////
#include <stdlib.h>
#include <cstdlib>
#define Listed(x) ListedObject<x>

View File

@ -26,10 +26,10 @@
#include "nl-array.h"
#include <stdio.h>
#include <cstdio>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <string>
using namespace std;

View File

@ -1,6 +1,6 @@
#include "ExternalFeature.h"
#include <dlfcn.h>
#include <stdlib.h>
#include <cstdlib>
#include <iostream>
#include "util/exception.hh"

View File

@ -0,0 +1,42 @@
# get stuff from environment variables if not set on the command line
# unless blocked explicitly
for local what in cmph irstlm
{
if ! [ option.get "with-$(what)" ] && ! [ option.get "no-$(what)" : : no ]
{
local where = [ os.environ "$(what:U)_ROOT" ] ;
if $(where)
{
echo "setting option with-$(what) from environment variable "
"$(what:U)_ROOT ." ;
option.set "with-$(what)" : $(where) ;
}
}
local where = [ option.get "with-$(what)" ] ;
}
# if --with-moses-regtest is specified without a directory
local regtest = [ option.get "with-moses-regtest" : no : yes ] ;
if $(regtest) = yes
{ # regtests requested but no path given
echo "Regression tests requested but no path given." ;
local $(where) = [ os.environ "MOSES_REGTEST_ROOT" ] ;
if ! $(where)
{
local HOME = [ os.environ "HOME" ] ;
if [ path.exists $(HOME)/moses-regression-tests ]
{
echo "Using ~/moses-regression-tests as the default." ;
option.set "with-moses-regtest" : "~/moses-regression-tests" ;
}
}
else
{
if [ path.exists $(where) ]
{
echo "Using $(where) from environment variable MOSES_REGTEST_ROOT."
option.set "with-regtest" : $(where) ;
}
}
}

123
jam-files/curlpp.jam Normal file
View File

@ -0,0 +1,123 @@
# -*- jam -*-
# configuration for curlpp
# I haven't been able to wrap my mind around bjam yet, so chances are
# there's a much better way to do things.
module curlppvars { } # this stores the variables we want to keep
if [ option.get "no-curlpp" : : yes ]
{
rule curlpp ( what ? ) { } # never return anything
}
else
{
local version ;
local prefix ;
# check if a non-standard location for curl is given
local curlpp = [ option.get "with-curlpp" ] ;
if ! $(curlpp) # maybe via environment variable CURLPP_ROOT ?
{
local where = [ os.environ "CURLPP_ROOT" ] ;
if $(where)
{
option.set "with-curlpp" : $(where) ;
local msg = "CURLPP: setting --with-curlpp=$(where) via environment" ;
echo "$(msg) variable CURLPP_ROOT" ;
}
curlpp = [ option.get "with-curlpp" ] ;
}
local config ;
if $(curlpp)
{
config = $(curlpp)/bin/curlpp-config ;
}
else # is curlpp-config in the path ?
{
local curlpp-check = [ _shell "curlpp-config 2>/dev/null" : exit-status ] ;
if $(curlpp-check[2]) = 0 { config = curlpp-config ; }
}
if $(config)
{
prefix = [ shell_or_die "$(config) --prefix" ] ;
version = [ shell_or_die "$(config) --version" ] ;
version = [ SPLIT_BY_CHARACTERS $(version) : " " ] ;
version = [ trim-nl $(version[2]) ] ;
modules.poke curlppvars : prefix : $(prefix) ;
modules.poke curlppvars : version : $(version) ;
requirements += <define>HAVE_CURLPP ;
local cpp-cflags = [ shell_or_die "$(config) --cflags" ] ;
for local i in [ SPLIT_BY_CHARACTERS $(cpp-cflags) : " " ]
{
local incpath = [ MATCH "-I(.*)" : $(i) ] ;
if $(incpath)
{
# echo "CURLPP: $(i)" ;
requirements += <cxxflags>"-isystem $(incpath)" ;
# requirements += <include>$(incpath) ;
}
}
local cpp-libs = [ shell_or_die "$(config) --libs" ] ;
local cpp-prefix = [ shell_or_die "$(config) --prefix" ] ;
for local i in [ SPLIT_BY_CHARACTERS $(cpp-libs) : " " ]
{
local libpath = [ MATCH "^-L(.*)" : $(i) ] ;
if $(libpath) { requirements += <library-path>$(libpath) ; }
local libname = [ MATCH "^-l(.*)" : $(i) ] ;
if $(libname)
{
# local curl = [ MATCH "^-l(.*pp)" : $(i) ] ;
# if [ path.exists $(cpp-prefix)/lib/lib$(libname).a ]
# {
# echo "CURLPP: STATIC LINKING FOR LIBRARY: $(libname)" ;
# lib $(libname) : : <link>static ;
# }
# else
# {
external-lib $(libname) : $(cpp-prefix)/lib ;
# }
requirements += <library>$(libname)/<link>shared ;
# requirements += <library>$(libname) ;
}
else
{
requirements += <linkflags>$(i) ;
}
# requirements += <library-path>/usr/lib/x86_64-linux-gnu ;
# for local xtra in idn rtmp ssl crypto ssl crypto ldap rt
# {
# external-lib $(xtra) : /usr/lib/x86_64-linux-gnu ;
# requirements += <library>$(xtra) ;
# }
}
# for local e in idn rtmp ssl crypto ldap rt
# {
# external-lib $(e) ; # : /usr/lib/x86_64-linux-gnu /usr/lib32 ;
# requirements += <library>$(e) ;
# }
# the rule curlpp provides access to all the variables defined in this file
# if none argument is given, it returns $(version), which should only be
# defined if curl is available
rule curlpp ( what ? )
{
if $(what)
{
retval = [ modules.peek curlppvars : $(what) ] ;
if $(retval) { return $(retval) ; }
}
else { return "yes" ; }
}
}
else { rule curlpp { } }
}
if [ curlpp ]
{
local prefix = [ curlpp prefix ] ;
local version = [ curlpp version ] ;
echo "CULRPP: USING VERSION $(version) FROM $(prefix)" ;
}

View File

@ -134,10 +134,15 @@ void file_dirscan_( file_info_t * const d, scanback func, void * closure )
int file_mkdir( char const * const path )
{
#if defined(__MINGW32__)
/* MinGW's mkdir() takes only one argument: the path. */
mkdir(path);
#else
/* Explicit cast to remove const modifiers and avoid related compiler
* warnings displayed when using the intel compiler.
*/
return mkdir( (char *)path, 0777 );
#endif
}

View File

@ -74,7 +74,7 @@
* Windows MingW32
*/
#ifdef MINGW
#ifdef __MINGW32__
#include <fcntl.h>
#include <stdlib.h>

View File

@ -22,6 +22,14 @@ rule shell_or_fail ( cmd ) {
}
}
rule shell_or_die ( cmd ) {
local ret = [ SHELL $(cmd) : exit-status ] ;
if $(ret[2]) != 0 {
exit $(cmd) failed : 1 ;
}
return [ trim-nl $(ret[1]) ] ;
}
cxxflags = [ os.environ "CXXFLAGS" ] ;
cflags = [ os.environ "CFLAGS" ] ;
ldflags = [ os.environ "LDFLAGS" ] ;

View File

@ -1,86 +0,0 @@
# import path ;
import option ;
# Is the XMLRPC-C server available?
rule shell_or_die ( cmd ) {
local ret = [ _shell $(cmd) : exit-status ] ;
if $(ret[2]) != 0 {
exit "Failed to run $(cmd)" : 1 ;
}
return $(ret[1]) ;
}
build-server = [ option.get "no-xmlrpc-c" : "yes" : "no" ] ;
if $(build-server) = yes
{
# by default, we try to build server capabilities into the server
xmlrpc-c-path = [ option.get "with-xmlrpc-c" ] ;
if $(xmlrpc-c-path) = ""
{
xmlrpc-c-config-cmd = "xmlrpc-c-config" ;
}
else
{
xmlrpc-c-config-cmd = "$(xmlrpc-c-path)/bin/xmlrpc-c-config" ;
}
# check if xmlrpc-config is available
xmlrpc-check = [ _shell "$(xmlrpc-c-config-cmd) --features 2>/dev/null" : exit-status ] ;
if $(xmlrpc-check[2]) = 0
{
# xmlrpc-c-config was found. Now check if abyss server is available
if [ MATCH "(abyss-server)" : $(xmlrpc-check[1]) ]
{
# Yes, abyss server is available. Is it the right xmlrpc-c version
# Version 1.25.29 does not work.
xmlrpc-check = [ _shell "$(xmlrpc-c-config-cmd) --version 2>/dev/null" : exit-status ] ;
xmlrpc-c-version = $(xmlrpc-check[1]) ;
if [ MATCH "(1.25.29)" : $(xmlrpc-c-version) ]
{
echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" ;
echo "XMLRPC-C: Moses is not compatible with xmlrpc-c version $(xmlrpc-c-version). " ;
echo "XMLRPC-C: Use another one or compile without server functionality (--no-xmlrpc-c)." ;
echo "XMLRPC-C: Build aborted." ;
exit : 1 ;
}
else
{
# echo "XMLRPC-C: Found abyss server." ;
}
}
else
{
echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" ;
echo "XMLRPC-C: Found xmlrpc-c but it does not provide the abyss server." ;
echo "XMLRPC-C: Use another xmlrpc-c installation that provides one " ;
echo "XMLRPC-C: or compile without server functionality (--no-xmlrpc-c)." ;
exit : 1 ;
}
}
else if [ option.get "with-xmlrpc-c" ]
{
echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" ;
echo "XMLRPC-C: Could not find $(xmlrpc-c-config-cmd). Build aborted. " ;
exit : 1 ;
}
else
{
build-server = no ;
rule build_server { return no ; }
}
}
if $(build-server) = yes
{
xmlrpc-path = [ _shell "$(xmlrpc-c-config-cmd) --prefix 2>/dev/null" : exit-status ] ;
rule build_server { return $(xmlrpc-c-config-cmd) ; }
rule xmlrpc_path { return $(xmlrpc-path[1]) ; }
}
else
{
rule build_server { return no ; }
}

100
jam-files/xmlrpc-c.jam Normal file
View File

@ -0,0 +1,100 @@
# This module handles the use (or non-use) of the externall
# xmlrpc-c library (including the abyss server) that is needed for
# moses server functionality
if [ option.get "no-xmlrpc-c" ]
{
rule xmlrpc ( what ? ) { } # never return anything
}
else
{
local xmlrpc = [ option.get "with-xmlrpc-c" ] ;
if ! $(xmlrpc) # check for environment variable
{
local where = [ os.environ "XMLRPC_C_ROOT" ] ;
if $(where)
{
option.set "with-xmlrpc-c" : $(where) ;
local msg = "setting --with-xmlrpc-c=$(where) via environment " ;
echo "$(msg) variable XMLRPC_C_ROOT" ;
}
xmlrpc = [ option.get "with-xmlrpc-c" ] ;
}
local config ;
if ! $(xmlrpc) { config = "xmlrpc-c-config" ; }
else { config = "$(xmlrpc)/bin/xmlrpc-c-config" ; }
# check if xmlrpc-config can be executed
xmlrpc-check = [ _shell "$(config) --features 2>/dev/null" : exit-status ] ;
if $(xmlrpc-check[2]) = 0 # yes it can
{
# is the abyss server is available ?
if [ MATCH "(abyss-server)" : $(xmlrpc-check[1]) ]
{
# Yes, abyss server is available. Is it the right xmlrpc-c version ?
# Note: Version 1.25.29 does not work.
xmlrpc-check = [ _shell "$(config) --version 2>/dev/null" : exit-status ] ;
xmlrpc-c-version = $(xmlrpc-check[1]) ;
if [ MATCH "(1.25.29)" : $(xmlrpc-c-version) ]
{
echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" ;
echo "XMLRPC-C: Moses is not compatible with xmlrpc-c version $(xmlrpc-c-version). " ;
echo "XMLRPC-C: Use another one or compile without server functionality (--no-xmlrpc-c)." ;
echo "XMLRPC-C: Build aborted." ;
exit : 1 ;
}
}
else
{
echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" ;
echo "XMLRPC-C: Found xmlrpc-c but it does not provide the abyss server." ;
echo "XMLRPC-C: Use another xmlrpc-c installation that provides one " ;
echo "XMLRPC-C: or compile without server functionality (--no-xmlrpc-c)." ;
exit : 1 ;
}
local prefix = [ shell_or_die "$(config) --prefix" ] ;
local version = [ shell_or_die "$(config) --version" ] ;
echo "XMLRPC-C: USING VERSION $(version) FROM $(prefix)" ;
# now add stuff to the requirements
local xmlrpc-cxxflags = [ shell_or_die "$(config) c++2 abyss-server --cflags" ] ;
requirements += <define>HAVE_XMLRPC_C ;
requirements += <cxxflags>$(xmlrpc-cxxflags) ;
local libs = [ shell_or_die "$(config) c++2 abyss-server --libs" ] ;
for local i in [ SPLIT_BY_CHARACTERS $(libs) : " " ]
{
local libname = [ MATCH "-l(xmlrpc.*)" : $(i) ] ;
if $(libname)
{
external-lib $(libname) : $(prefix)/lib ;
# : : <runtime-link>static:<link>static <runtime-link>shared:<link>shared ;
requirements += <library>$(libname) ;
}
local pathname = [ MATCH "-L(.*)" : $(i) ] ;
if $(pathname)
{
requirements += <library-path>$(pathname) ;
}
}
rule xmlrpc { return yes ; }
}
else if [ option.get "with-xmlrpc-c" ]
{
echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" ;
echo "XMLRPC-C: Could not find $(config). " ;
echo "Build aborted. " ;
echo "Use --no-xmlrpc-c to compile moses without server functionality. " ;
exit : 1 ;
}
else
{
rule xmlrpc ( what ? ) { } # never return anything
}
}

View File

@ -14,12 +14,12 @@ update-if-changed $(ORDER-LOG) $(max-order) ;
max-order += <dependency>$(ORDER-LOG) ;
wrappers = ;
local with-nplm = [ option.get "with-nplm-0.1" ] ;
local with-nplm = [ option.get "with-nplm" ] ;
if $(with-nplm) {
lib neuralLM : : <search>$(with-nplm)/src ;
lib nplm : : <search>$(with-nplm)/src ;
obj nplm.o : wrappers/nplm.cc : <include>.. <include>$(with-nplm)/src <cxxflags>-fopenmp ;
alias nplm : nplm.o neuralLM ..//boost_thread : : : <cxxflags>-fopenmp <linkflags>-fopenmp <define>WITH_NPLM <library>..//boost_thread ;
wrappers += nplm ;
alias nplm-all : nplm.o nplm ..//boost_thread : : : <cxxflags>-fopenmp <linkflags>-fopenmp <define>WITH_NPLM <library>..//boost_thread ;
wrappers += nplm-all ;
}
fakelib kenlm : $(wrappers) [ glob *.cc : *main.cc *test.cc ] ../util//kenutil : <include>.. $(max-order) : : <include>.. $(max-order) ;

View File

@ -19,9 +19,8 @@
#include "util/sorted_uniform.hh"
#include <algorithm>
#include <stdint.h>
#include <assert.h>
#include <cassert>
namespace lm {
namespace ngram {

View File

@ -2,9 +2,8 @@
#define LM_BLANK_H
#include <limits>
#include <stdint.h>
#include <math.h>
#include <cmath>
namespace lm {
namespace ngram {

View File

@ -9,9 +9,8 @@
#include <iostream>
#include <iomanip>
#include <limits>
#include <math.h>
#include <stdlib.h>
#include <cmath>
#include <cstdlib>
#ifdef WIN32
#include "util/getopt.hh"

View File

@ -48,7 +48,8 @@ class StatCollector {
// TODO: Specialize error message for j == 3, meaning 3+
UTIL_THROW_IF(s.n[j] == 0, BadDiscountException, "Could not calculate Kneser-Ney discounts for "
<< (i+1) << "-grams with adjusted count " << (j+1) << " because we didn't observe any "
<< (i+1) << "-grams with adjusted count " << j << "; Is this small or artificial data?");
<< (i+1) << "-grams with adjusted count " << j << "; Is this small or artificial data?\n"
<< "Try deduplicating the input. To override this error for e.g. a class-based model, rerun with --discount_fallback\n");
}
// See equation (26) in Chen and Goodman.
@ -63,7 +64,7 @@ class StatCollector {
case THROW_UP:
throw;
case COMPLAIN:
std::cerr << e.what() << " Substituting fallback discounts D1=" << config.fallback.amount[1] << " D2=" << config.fallback.amount[2] << " D3+=" << config.fallback.amount[3] << std::endl;
std::cerr << "Substituting fallback discounts for order " << i << ": D1=" << config.fallback.amount[1] << " D2=" << config.fallback.amount[2] << " D3+=" << config.fallback.amount[3] << std::endl;
case SILENT:
break;
}

View File

@ -78,7 +78,7 @@ BOOST_AUTO_TEST_CASE(Simple) {
DiscountConfig discount_config;
discount_config.fallback = Discount();
discount_config.bad_action = THROW_UP;
BOOST_CHECK_THROW(AdjustCounts(prune_thresholds, counts, counts_pruned, discount_config, discount).Run(for_adjust), BadDiscountException);
BOOST_CHECK_THROW(AdjustCounts(prune_thresholds, counts, counts_pruned, std::vector<bool>(), discount_config, discount).Run(for_adjust), BadDiscountException);
}
BOOST_REQUIRE_EQUAL(4UL, counts.size());
BOOST_CHECK_EQUAL(4UL, counts[0]);

View File

@ -45,7 +45,8 @@ BOOST_AUTO_TEST_CASE(Short) {
NGramStream stream;
uint64_t token_count;
WordIndex type_count = 10;
CorpusCount counter(input_piece, vocab.get(), token_count, type_count, chain.BlockSize() / chain.EntrySize(), SILENT);
std::vector<bool> prune_words;
CorpusCount counter(input_piece, vocab.get(), token_count, type_count, prune_words, "", chain.BlockSize() / chain.EntrySize(), SILENT);
chain >> boost::ref(counter) >> stream >> util::stream::kRecycle;
const char *v[] = {"<unk>", "<s>", "</s>", "looking", "on", "a", "little", "more", "loin", "foo", "bar"};

View File

@ -8,8 +8,8 @@
#include "util/fixed_array.hh"
#include "util/murmur_hash.hh"
#include <assert.h>
#include <math.h>
#include <cassert>
#include <cmath>
namespace lm { namespace builder {
namespace {

View File

@ -9,7 +9,7 @@
#include <iostream>
#endif
#include <string.h>
#include <cstring>
namespace lm { namespace builder {

View File

@ -202,6 +202,7 @@ int main(int argc, char *argv[]) {
initial.adder_out.block_count = 2;
pipeline.read_backoffs = initial.adder_out;
// Read from stdin, write to stdout by default
util::scoped_fd in(0), out(1);
if (vm.count("text")) {
in.reset(util::OpenReadOrThrow(text.c_str()));
@ -210,7 +211,6 @@ int main(int argc, char *argv[]) {
out.reset(util::CreateOrThrow(arpa.c_str()));
}
// Read from stdin
try {
lm::builder::Output output;
output.Add(new lm::builder::PrintARPA(out.release(), verbose_header));

View File

@ -5,10 +5,9 @@
#include "lm/word_index.hh"
#include <cstddef>
#include <assert.h>
#include <cassert>
#include <stdint.h>
#include <string.h>
#include <cstring>
namespace lm {
namespace builder {

View File

@ -7,8 +7,7 @@
#include "util/stream/timer.hh"
#include <sstream>
#include <string.h>
#include <cstring>
namespace lm { namespace builder {

View File

@ -10,8 +10,7 @@
#include "util/string_piece.hh"
#include <ostream>
#include <assert.h>
#include <cassert>
// Warning: print routines read all unigrams before all bigrams before all
// trigrams etc. So if other parts of the chain move jointly, you'll have to

View File

@ -30,9 +30,10 @@ struct Config {
return show_progress ? messages : 0;
}
// This will be called with every string in the vocabulary. See
// enumerate_vocab.hh for more detail. Config does not take ownership; you
// are still responsible for deleting it (or stack allocating).
// This will be called with every string in the vocabulary by the
// constructor; it need only exist for the lifetime of the constructor.
// See enumerate_vocab.hh for more detail. Config does not take ownership;
// just delete/let it go out of scope after the constructor exits.
EnumerateVocab *enumerate_vocab;

View File

@ -6,9 +6,9 @@
#include <string>
#include <vector>
#include <ctype.h>
#include <errno.h>
#include <string.h>
#include <cctype>
#include <cerrno>
#include <cstring>
namespace lm {

View File

@ -14,7 +14,7 @@
#include <string>
#include <vector>
#include <string.h>
#include <cstring>
#include <stdint.h>
namespace util { class FilePiece; }

View File

@ -9,7 +9,7 @@
#include <string>
#include <vector>
#include <ctype.h>
#include <cctype>
namespace lm {
namespace phrase {

View File

@ -3,7 +3,7 @@
#include <istream>
#include <iostream>
#include <ctype.h>
#include <cctype>
namespace lm {
namespace vocab {

View File

@ -1,7 +1,7 @@
#include "lm/lm_exception.hh"
#include<errno.h>
#include<stdio.h>
#include <cerrno>
#include <cstdio>
namespace lm {

View File

@ -17,8 +17,7 @@
#include <algorithm>
#include <vector>
#include <string.h>
#include <cstring>
namespace util { class FilePiece; }

View File

@ -1,7 +1,7 @@
#include "lm/model.hh"
#include <stdlib.h>
#include <string.h>
#include <cstdlib>
#include <cstring>
#define BOOST_TEST_MODULE ModelTest
#include <boost/test/unit_test.hpp>

View File

@ -11,8 +11,7 @@
#include <ostream>
#include <istream>
#include <string>
#include <math.h>
#include <cmath>
namespace lm {
namespace ngram {

View File

@ -5,8 +5,7 @@
#include "lm/state.hh"
#include <algorithm>
#include <assert.h>
#include <cassert>
namespace lm {
namespace ngram {

View File

@ -9,8 +9,8 @@
#include <sstream>
#include <vector>
#include <ctype.h>
#include <string.h>
#include <cctype>
#include <cstring>
#include <stdint.h>
#ifdef WIN32

View File

@ -12,8 +12,7 @@
#include <vector>
#include <cstdlib>
#include <assert.h>
#include <cassert>
namespace lm {
namespace ngram {

View File

@ -5,7 +5,7 @@
#include "lm/word_index.hh"
#include "util/murmur_hash.hh"
#include <string.h>
#include <cstring>
namespace lm {
namespace ngram {

View File

@ -5,7 +5,7 @@
#include "util/exception.hh"
#include "util/sorted_uniform.hh"
#include <assert.h>
#include <cassert>
namespace lm {
namespace ngram {

View File

@ -6,7 +6,7 @@
#include "util/string_piece.hh"
#include <string>
#include <string.h>
#include <cstring>
namespace lm {
namespace base {

View File

@ -12,8 +12,7 @@
#include "util/probing_hash_table.hh"
#include <string>
#include <string.h>
#include <cstring>
namespace lm {
namespace ngram {

View File

@ -2,7 +2,7 @@
#ifndef LM_WORD_INDEX_H
#define LM_WORD_INDEX_H
#include <limits.h>
#include <climits>
namespace lm {
typedef unsigned int WordIndex;

View File

@ -3,8 +3,7 @@
#include "util/file.hh"
#include <algorithm>
#include <string.h>
#include <cstring>
#include "neuralLM.h"
@ -21,6 +20,26 @@ WordIndex Vocabulary::Index(const std::string &str) const {
return vocab_.lookup_word(str);
}
class Backend {
public:
Backend(const nplm::neuralLM &from, const std::size_t cache_size) : lm_(from), ngram_(from.get_order()) {
lm_.set_cache(cache_size);
}
nplm::neuralLM &LM() { return lm_; }
const nplm::neuralLM &LM() const { return lm_; }
Eigen::Matrix<int,Eigen::Dynamic,1> &staging_ngram() { return ngram_; }
double lookup_from_staging() { return lm_.lookup_ngram(ngram_); }
int order() const { return lm_.get_order(); }
private:
nplm::neuralLM lm_;
Eigen::Matrix<int,Eigen::Dynamic,1> ngram_;
};
bool Model::Recognize(const std::string &name) {
try {
util::scoped_fd file(util::OpenReadOrThrow(name.c_str()));
@ -31,10 +50,18 @@ bool Model::Recognize(const std::string &name) {
} catch (const util::Exception &) {
return false;
}
}
}
namespace {
nplm::neuralLM *LoadNPLM(const std::string &file) {
util::scoped_ptr<nplm::neuralLM> ret(new nplm::neuralLM());
ret->read(file);
return ret.release();
}
} // namespace
Model::Model(const std::string &file, std::size_t cache)
: base_instance_(new nplm::neuralLM(file)), vocab_(base_instance_->get_vocabulary()), cache_size_(cache) {
: base_instance_(LoadNPLM(file)), vocab_(base_instance_->get_vocabulary()), cache_size_(cache) {
UTIL_THROW_IF(base_instance_->get_order() > NPLM_MAX_ORDER, util::Exception, "This NPLM has order " << (unsigned int)base_instance_->get_order() << " but the KenLM wrapper was compiled with " << NPLM_MAX_ORDER << ". Change the defintion of NPLM_MAX_ORDER and recompile.");
// log10 compatible with backoff models.
base_instance_->set_log_base(10.0);
@ -49,26 +76,25 @@ Model::Model(const std::string &file, std::size_t cache)
Model::~Model() {}
FullScoreReturn Model::FullScore(const State &from, const WordIndex new_word, State &out_state) const {
nplm::neuralLM *lm = backend_.get();
if (!lm) {
lm = new nplm::neuralLM(*base_instance_);
backend_.reset(lm);
lm->set_cache(cache_size_);
Backend *backend = backend_.get();
if (!backend) {
backend = new Backend(*base_instance_, cache_size_);
backend_.reset(backend);
}
// State is in natural word order.
FullScoreReturn ret;
for (int i = 0; i < lm->get_order() - 1; ++i) {
lm->staging_ngram()(i) = from.words[i];
for (int i = 0; i < backend->order() - 1; ++i) {
backend->staging_ngram()(i) = from.words[i];
}
lm->staging_ngram()(lm->get_order() - 1) = new_word;
ret.prob = lm->lookup_from_staging();
backend->staging_ngram()(backend->order() - 1) = new_word;
ret.prob = backend->lookup_from_staging();
// Always say full order.
ret.ngram_length = lm->get_order();
ret.ngram_length = backend->order();
// Shift everything down by one.
memcpy(out_state.words, from.words + 1, sizeof(WordIndex) * (lm->get_order() - 2));
out_state.words[lm->get_order() - 2] = new_word;
memcpy(out_state.words, from.words + 1, sizeof(WordIndex) * (backend->order() - 2));
out_state.words[backend->order() - 2] = new_word;
// Fill in trailing words with zeros so state comparison works.
memset(out_state.words + lm->get_order() - 1, 0, sizeof(WordIndex) * (NPLM_MAX_ORDER - lm->get_order()));
memset(out_state.words + backend->order() - 1, 0, sizeof(WordIndex) * (NPLM_MAX_ORDER - backend->order()));
return ret;
}

View File

@ -49,6 +49,8 @@ struct State {
WordIndex words[NPLM_MAX_ORDER - 1];
};
class Backend;
class Model : public lm::base::ModelFacade<Model, State, Vocabulary> {
private:
typedef lm::base::ModelFacade<Model, State, Vocabulary> P;
@ -68,7 +70,7 @@ class Model : public lm::base::ModelFacade<Model, State, Vocabulary> {
private:
boost::scoped_ptr<nplm::neuralLM> base_instance_;
mutable boost::thread_specific_ptr<nplm::neuralLM> backend_;
mutable boost::thread_specific_ptr<Backend> backend_;
Vocabulary vocab_;

View File

@ -191,7 +191,7 @@ statscore_t BleuScorer::calculateScore(const vector<ScoreStatsType>& comps) cons
UTIL_THROW_IF(comps.size() != kBleuNgramOrder * 2 + 1, util::Exception, "Error");
float logbleu = 0.0;
for (int i = 0; i < kBleuNgramOrder; ++i) {
for (std::size_t i = 0; i < kBleuNgramOrder; ++i) {
if (comps[2*i] == 0) {
return 0.0;
}
@ -249,7 +249,7 @@ float smoothedSentenceBleu
UTIL_THROW_IF(stats.size() != kBleuNgramOrder * 2 + 1, util::Exception, "Error");
float logbleu = 0.0;
for (int j = 0; j < kBleuNgramOrder; j++) {
for (std::size_t j = 0; j < kBleuNgramOrder; j++) {
logbleu += log(stats[2 * j] + smoothing) - log(stats[2 * j + 1] + smoothing);
}
logbleu /= kBleuNgramOrder;
@ -275,7 +275,7 @@ float sentenceLevelBackgroundBleu(const std::vector<float>& sent, const std::vec
// Calculate BLEU
float logbleu = 0.0;
for (int j = 0; j < kBleuNgramOrder; j++) {
for (std::size_t j = 0; j < kBleuNgramOrder; j++) {
logbleu += log(stats[2 * j]) - log(stats[2 * j + 1]);
}
logbleu /= kBleuNgramOrder;

View File

@ -17,6 +17,7 @@
#include "util/exception.hh"
#include "util/file_piece.hh"
#include "util/random.hh"
#include "util/tokenize_piece.hh"
#include "util/string_piece.hh"
#include "FeatureDataIterator.h"
@ -286,7 +287,7 @@ void Data::createShards(size_t shard_count, float shard_size, const string& scor
} else {
//create shards by randomly sampling
for (size_t i = 0; i < floor(shard_size+0.5); ++i) {
shard_contents.push_back(rand() % data_size);
shard_contents.push_back(util::rand_excl(data_size));
}
}

View File

@ -13,6 +13,8 @@
#include <iostream>
#include <string>
#include "util/unistd.hh"
#if defined(__GLIBCXX__) || defined(__GLIBCPP__)
#include <ext/stdio_filebuf.h>

View File

@ -40,28 +40,3 @@ inputfilestream::~inputfilestream()
void inputfilestream::close()
{
}
outputfilestream::outputfilestream(const std::string &filePath)
: std::ostream(0), m_streambuf(0), m_is_good(false)
{
// check if file is readable
std::filebuf* fb = new std::filebuf();
m_is_good = (fb->open(filePath.c_str(), std::ios::out) != NULL);
if (IsGzipFile(filePath)) {
throw runtime_error("Output to a zipped file not supported!");
} else {
m_streambuf = fb;
}
this->init(m_streambuf);
}
outputfilestream::~outputfilestream()
{
delete m_streambuf;
m_streambuf = 0;
}
void outputfilestream::close()
{
}

View File

@ -22,20 +22,4 @@ public:
void close();
};
class outputfilestream : public std::ostream
{
protected:
std::streambuf *m_streambuf;
bool m_is_good;
public:
explicit outputfilestream(const std::string &filePath);
virtual ~outputfilestream();
bool good() const {
return m_is_good;
}
void close();
};
#endif // MERT_FILE_STREAM_H_

View File

@ -1,6 +1,9 @@
#include <iostream>
#include "util/tokenize_piece.hh"
#include "ForestRescore.h"
#include "MiraFeatureVector.h"
#define BOOST_TEST_MODULE MertForestRescore
#include <boost/test/unit_test.hpp>
@ -10,8 +13,7 @@
using namespace std;
using namespace MosesTuning;
BOOST_AUTO_TEST_CASE(viterbi_simple_lattice)
{
BOOST_AUTO_TEST_CASE(viterbi_simple_lattice) {
Vocab vocab;
WordVec words;
string wordStrings[] =
@ -242,5 +244,101 @@ BOOST_AUTO_TEST_CASE(viterbi_3branch_lattice)
BOOST_CHECK_EQUAL(6, hopeHypo.bleuStats[8]);
}
BOOST_AUTO_TEST_CASE(viterbi_full_hypergraph) {
Vocab vocab;
//References
ReferenceSet references;
references.AddLine(0,"in addition to EU support for businesses , also the administration of national business support will be concentrated in four Centres for Economic Development , Transport and Environment ( ELY Centres ) , starting from mid @-@ September .",vocab);
//Load the hypergraph
Graph graph(vocab);
util::scoped_fd fd(util::OpenReadOrThrow("mert/hgtest/0.gz"));
util::FilePiece file(fd.release());
ReadGraph(file,graph);
//prune
SparseVector weights;
weights.set("OpSequenceModel0_1",0.011187);
weights.set("OpSequenceModel0_2",-0.002797);
weights.set("OpSequenceModel0_3",0.002797);
weights.set("OpSequenceModel0_4",-0.000140);
weights.set("OpSequenceModel0_5",0.004195);
weights.set("Distortion0",0.041952);
weights.set("PhrasePenalty0",0.027968);
weights.set("WordPenalty0",-0.139841);
weights.set("UnknownWordPenalty0",1.000000);
weights.set("LM0",0.069920);
weights.set("LexicalReordering0_1",0.041952);
weights.set("LexicalReordering0_2",0.041952);
weights.set("LexicalReordering0_3",0.041952);
weights.set("LexicalReordering0_4",0.041952);
weights.set("LexicalReordering0_5",0.041952);
weights.set("LexicalReordering0_6",0.041952);
weights.set("LexicalReordering0_7",0.041952);
weights.set("LexicalReordering0_8",0.041952);
weights.set("TranslationModel0_1",0.027968);
weights.set("TranslationModel0_2",0.027968);
weights.set("TranslationModel0_3",0.027968);
weights.set("TranslationModel0_4",0.027968);
weights.set("TranslationModel0_5",0.027968);
weights.set("TranslationModel0_6",0.027968);
weights.set("TranslationModel0_7",0.027968);
weights.set("TranslationModel0_8",0.027968);
weights.set("TranslationModel0_9",0.027968);
weights.set("TranslationModel0_10",0.027968);
weights.set("TranslationModel0_11",0.027968);
weights.set("TranslationModel0_12",0.027968);
weights.set("TranslationModel0_13",0.027968);
size_t edgeCount = 500;
boost::shared_ptr<Graph> prunedGraph;
prunedGraph.reset(new Graph(vocab));
graph.Prune(prunedGraph.get(), weights, edgeCount);
vector<ValType> bg(9);
HgHypothesis bestHypo;
//best hypothesis
Viterbi(*prunedGraph, weights, 0, references, 0, bg, &bestHypo);
//check output as expected
string expectedStr = "<s> the EU matters , but also the national matters management focus since mid @-@ September four ely @-@ centre . </s>";
util::TokenIter<util::SingleCharacter, true> expected(expectedStr, util::SingleCharacter(' '));
for (size_t i = 0; i < bestHypo.text.size(); ++i) {
//cerr << bestHypo.text[i]->first << " ";
BOOST_CHECK_EQUAL(*expected,bestHypo.text[i]->first);
++expected;
}
BOOST_CHECK(!expected);
//cerr << endl;
//check scores
BOOST_CHECK_CLOSE(-80.062,bestHypo.featureVector.get("OpSequenceModel0_1"), 0.001);
BOOST_CHECK_CLOSE(2,bestHypo.featureVector.get("OpSequenceModel0_2"), 0.001);
BOOST_CHECK_CLOSE(2,bestHypo.featureVector.get("OpSequenceModel0_3"), 0.001);
BOOST_CHECK_CLOSE(3,bestHypo.featureVector.get("OpSequenceModel0_4"), 0.001);
BOOST_CHECK_CLOSE(0,bestHypo.featureVector.get("OpSequenceModel0_5"), 0.001);
BOOST_CHECK_CLOSE(-6,bestHypo.featureVector.get("Distortion0"), 0.001);
BOOST_CHECK_CLOSE(14,bestHypo.featureVector.get("PhrasePenalty0"), 0.001);
BOOST_CHECK_CLOSE(-20,bestHypo.featureVector.get("WordPenalty0"), 0.001);
BOOST_CHECK_CLOSE(-100,bestHypo.featureVector.get("UnknownWordPenalty0"), 0.001);
BOOST_CHECK_CLOSE(-126.616,bestHypo.featureVector.get("LM0"), 0.001);
BOOST_CHECK_CLOSE(-5.2238,bestHypo.featureVector.get("LexicalReordering0_1"), 0.001);
BOOST_CHECK_CLOSE(-0.29515,bestHypo.featureVector.get("LexicalReordering0_2"), 0.001);
BOOST_CHECK_CLOSE(0,bestHypo.featureVector.get("LexicalReordering0_3"), 0.001);
BOOST_CHECK_CLOSE(-0.470004,bestHypo.featureVector.get("LexicalReordering0_4"), 0.001);
BOOST_CHECK_CLOSE(-9.28267,bestHypo.featureVector.get("LexicalReordering0_5"), 0.001);
BOOST_CHECK_CLOSE(-0.470004,bestHypo.featureVector.get("LexicalReordering0_6"), 0.001);
BOOST_CHECK_CLOSE(0,bestHypo.featureVector.get("LexicalReordering0_7"), 0.001);
BOOST_CHECK_CLOSE(-0.402678,bestHypo.featureVector.get("LexicalReordering0_8"), 0.001);
BOOST_CHECK_CLOSE(-54.3119,bestHypo.featureVector.get("TranslationModel0_1"), 0.001);
BOOST_CHECK_CLOSE(-62.2619,bestHypo.featureVector.get("TranslationModel0_2"), 0.001);
BOOST_CHECK_CLOSE(-23.8782,bestHypo.featureVector.get("TranslationModel0_3"), 0.001);
BOOST_CHECK_CLOSE(-25.1626,bestHypo.featureVector.get("TranslationModel0_4"), 0.001);
BOOST_CHECK_CLOSE(12.9986,bestHypo.featureVector.get("TranslationModel0_5"), 0.001);
BOOST_CHECK_CLOSE(3.99959,bestHypo.featureVector.get("TranslationModel0_6"), 0.001);
BOOST_CHECK_CLOSE(1.99979,bestHypo.featureVector.get("TranslationModel0_7"), 0.001);
BOOST_CHECK_CLOSE(1.99979,bestHypo.featureVector.get("TranslationModel0_8"), 0.001);
BOOST_CHECK_CLOSE(0,bestHypo.featureVector.get("TranslationModel0_9"), 0.001);
BOOST_CHECK_CLOSE(0,bestHypo.featureVector.get("TranslationModel0_10"), 0.001);
BOOST_CHECK_CLOSE(0,bestHypo.featureVector.get("TranslationModel0_11"), 0.001);
BOOST_CHECK_CLOSE(0.999896,bestHypo.featureVector.get("TranslationModel0_12"), 0.001);
BOOST_CHECK_CLOSE(7.99917,bestHypo.featureVector.get("TranslationModel0_13"), 0.001);
}

View File

@ -180,7 +180,7 @@ HypergraphHopeFearDecoder::HypergraphHopeFearDecoder
references_.Load(referenceFiles, vocab_);
SparseVector weights;
wv.ToSparse(&weights);
wv.ToSparse(&weights,num_dense_);
scorer_ = scorer;
static const string kWeights = "weights";
@ -243,7 +243,7 @@ void HypergraphHopeFearDecoder::HopeFear(
{
size_t sentenceId = *sentenceIdIter_;
SparseVector weights;
wv.ToSparse(&weights);
wv.ToSparse(&weights, num_dense_);
const Graph& graph = *(graphs_[sentenceId]);
// ValType hope_scale = 1.0;
@ -338,7 +338,7 @@ void HypergraphHopeFearDecoder::MaxModel(const AvgWeightVector& wv, vector<ValTy
HgHypothesis bestHypo;
size_t sentenceId = *sentenceIdIter_;
SparseVector weights;
wv.ToSparse(&weights);
wv.ToSparse(&weights, num_dense_);
vector<ValType> bg(scorer_->NumberOfScores());
//cerr << "Calculating bleu on " << sentenceId << endl;
Viterbi(*(graphs_[sentenceId]), weights, 0, references_, sentenceId, bg, &bestHypo);

View File

@ -12,7 +12,7 @@
#include <string>
#include <vector>
#include <utility>
#include <stddef.h>
#include <cstddef>
#include "FeatureDataIterator.h"
#include "ScoreDataIterator.h"

View File

@ -77,6 +77,7 @@ unit-test feature_data_test : FeatureDataTest.cpp mert_lib ..//boost_unit_test_f
unit-test data_test : DataTest.cpp mert_lib ..//boost_unit_test_framework ;
unit-test forest_rescore_test : ForestRescoreTest.cpp mert_lib ..//boost_unit_test_framework ;
unit-test hypergraph_test : HypergraphTest.cpp mert_lib ..//boost_unit_test_framework ;
unit-test mira_feature_vector_test : MiraFeatureVectorTest.cpp mert_lib ..//boost_unit_test_framework ;
unit-test ngram_test : NgramTest.cpp mert_lib ..//boost_unit_test_framework ;
unit-test optimizer_factory_test : OptimizerFactoryTest.cpp mert_lib ..//boost_unit_test_framework ;
unit-test point_test : PointTest.cpp mert_lib ..//boost_unit_test_framework ;

View File

@ -6,7 +6,7 @@
#include <iterator>
#include <sstream>
#include <stdexcept>
#include <stdio.h>
#include <cstdio>
#include <string>
#include <vector>
@ -18,6 +18,7 @@
#include "ScoreStats.h"
#include "Util.h"
#include "util/unistd.hh"
using namespace std;
@ -25,7 +26,7 @@ namespace MosesTuning
{
// Meteor supported
#if defined(__GLIBCXX__) || defined(__GLIBCPP__)
#if (defined(__GLIBCXX__) || defined(__GLIBCPP__)) && !defined(_WIN32)
// for clarity
#define CHILD_STDIN_READ pipefds_input[0]

View File

@ -0,0 +1,49 @@
#include "MiraFeatureVector.h"
#include "MiraWeightVector.h"
#define BOOST_TEST_MODULE MiraFeatureVector
#include <boost/test/unit_test.hpp>
using namespace MosesTuning;
/* Note that the conversion to and from SparseVector needs to know
how many of the features are really "dense". This is because in hg mira
all features (sparse and dense) are to get rolled in to SparseVector
*/
BOOST_AUTO_TEST_CASE(from_sparse) {
  // Build a SparseVector with two dense features followed by three
  // sparse ones; the names only matter for the final lookups.
  SparseVector input;
  input.set("dense0", 0.2);
  input.set("dense1", 0.3);
  input.set("sparse0", 0.7);
  input.set("sparse1", 0.9);
  input.set("sparse2", 0.1);

  // Converting with denseSize == 2: dense ids are kept verbatim, sparse
  // ids are shifted up past the dense range (hence 4,5,6 rather than 2,3,4).
  MiraFeatureVector converted(input,2);
  BOOST_CHECK_EQUAL(converted.size(),5);
  BOOST_CHECK_EQUAL(converted.feat(0),0);
  BOOST_CHECK_EQUAL(converted.feat(1),1);
  BOOST_CHECK_EQUAL(converted.feat(2),4);
  BOOST_CHECK_EQUAL(converted.feat(3),5);
  BOOST_CHECK_EQUAL(converted.feat(4),6);
  BOOST_CHECK_CLOSE(converted.val(0), 0.2,1e-5);
  BOOST_CHECK_CLOSE(converted.val(1), 0.3,1e-5);
  BOOST_CHECK_CLOSE(converted.val(2), 0.7,1e-5);
  BOOST_CHECK_CLOSE(converted.val(3), 0.9,1e-5);
  BOOST_CHECK_CLOSE(converted.val(4), 0.1,1e-5);

  // Round-trip: push the feature vector into a weight vector, convert
  // back with the same dense count, and expect the original values.
  MiraWeightVector wv;
  wv.update(converted,1.0);
  SparseVector roundTrip;
  wv.ToSparse(&roundTrip,2);
  BOOST_CHECK_CLOSE(roundTrip.get("dense0"), 0.2,1e-5);
  BOOST_CHECK_CLOSE(roundTrip.get("dense1"), 0.3,1e-5);
  BOOST_CHECK_CLOSE(roundTrip.get("sparse0"), 0.7,1e-5);
  BOOST_CHECK_CLOSE(roundTrip.get("sparse1"), 0.9,1e-5);
  BOOST_CHECK_CLOSE(roundTrip.get("sparse2"), 0.1,1e-5);
}

View File

@ -93,11 +93,17 @@ void MiraWeightVector::update(size_t index, ValType delta)
m_lastUpdated[index] = m_numUpdates;
}
void MiraWeightVector::ToSparse(SparseVector* sparse) const
void MiraWeightVector::ToSparse(SparseVector* sparse, size_t denseSize) const
{
  // Export every non-negligible weight into the SparseVector.
  for (size_t i = 0; i < m_weights.size(); ++i) {
    if (abs(m_weights[i]) <= 1e-8) {
      continue;   // skip effectively-zero weights
    }
    // Dense features keep their id; sparse-feature ids inside
    // MiraFeatureVector/MiraWeightVector are offset by denseSize and
    // must be shifted back when converting to SparseVector.
    const size_t id = (i < denseSize) ? i : i - denseSize;
    sparse->set(id, m_weights[i]);
  }
}
@ -172,12 +178,18 @@ size_t AvgWeightVector::size() const
return m_wv.m_weights.size();
}
void AvgWeightVector::ToSparse(SparseVector* sparse) const
void AvgWeightVector::ToSparse(SparseVector* sparse, size_t denseSize) const
{
  // Export every non-negligible averaged weight into the SparseVector.
  for (size_t i = 0; i < size(); ++i) {
    ValType w = weight(i);
    if (abs(w) <= 1e-8) {
      continue;   // skip effectively-zero weights
    }
    // Dense features keep their id; sparse-feature ids inside
    // MiraFeatureVector/MiraWeightVector are offset by denseSize and
    // must be shifted back when converting to SparseVector.
    const size_t id = (i < denseSize) ? i : i - denseSize;
    sparse->set(id, w);
  }
}

View File

@ -64,9 +64,9 @@ public:
AvgWeightVector avg();
/**
* Convert to sparse vector, interpreting all features as sparse.
* Convert to sparse vector, interpreting all features as sparse. Only used by hgmira.
**/
void ToSparse(SparseVector* sparse) const;
void ToSparse(SparseVector* sparse, size_t denseSize) const;
friend class AvgWeightVector;
@ -104,7 +104,7 @@ public:
ValType score(const MiraFeatureVector& fv) const;
ValType weight(std::size_t index) const;
std::size_t size() const;
void ToSparse(SparseVector* sparse) const;
void ToSparse(SparseVector* sparse, size_t num_dense) const;
private:
const MiraWeightVector& m_wv;
};

View File

@ -8,7 +8,7 @@
#include <fstream>
#include <sstream>
#include <math.h>
#include <cmath>
#include "Permutation.h"
#include "Util.h"

Some files were not shown because too many files have changed in this diff Show More