Merge branch 'master' into dynamic-models

This commit is contained in:
Nicola Bertoldi 2014-04-30 08:32:46 +02:00
commit fe1ed42f81
43 changed files with 1907 additions and 261 deletions

View File

@ -55,7 +55,7 @@
# information also known as -g
# --notrace compiles without TRACE macros
#
# --enable-boost-pool uses Boost pools for the memory SCFG table
# --enable-boost-pool uses Boost pools for the memory SCFG tabgle
#
# --enable-mpi switch on mpi
# --without-libsegfault does not link with libSegFault
@ -148,9 +148,13 @@ if [ option.get "with-mm" : : "yes" ]
moses/TranslationModel/UG/mm//mtt-build
moses/TranslationModel/UG/mm//mtt-dump
moses/TranslationModel/UG/mm//symal2mam
moses/TranslationModel/UG/mm//mam2symal
moses/TranslationModel/UG/mm//mam_verify
moses/TranslationModel/UG/mm//custom-pt
moses/TranslationModel/UG/mm//mmlex-build
moses/TranslationModel/UG/mm//mmlex-lookup
moses/TranslationModel/UG/mm//mtt-count-words
moses/TranslationModel/UG/mm//calc-coverage
moses/TranslationModel/UG//try-align
;
}

View File

@ -35,7 +35,7 @@ if $(build-moses-server) = true
xmlrpc-linkflags = [ shell_or_die "$(xmlrpc-command) c++2 abyss-server --libs" ] ;
xmlrpc-cxxflags = [ shell_or_die "$(xmlrpc-command) c++2 abyss-server --cflags" ] ;
exe mosesserver : mosesserver.cpp ../../moses//moses ../../moses-cmd/IOWrapper.cpp ../../OnDiskPt//OnDiskPt : <linkflags>$(xmlrpc-linkflags) <cxxflags>$(xmlrpc-cxxflags) ;
exe mosesserver : mosesserver.cpp ../../moses//moses ../../OnDiskPt//OnDiskPt ../../moses-cmd/IOWrapper.cpp : <linkflags>$(xmlrpc-linkflags) <cxxflags>$(xmlrpc-cxxflags) ;
} else {
alias mosesserver ;
}

View File

@ -10,6 +10,9 @@
#include "moses/StaticData.h"
#include "moses/TranslationModel/PhraseDictionaryDynSuffixArray.h"
#include "moses/TranslationModel/PhraseDictionaryMultiModelCounts.h"
#if PT_UG
#include "moses/TranslationModel/UG/mmsapt.h"
#endif
#include "moses/TreeInput.h"
#include "moses/LM/ORLM.h"
#include "moses-cmd/IOWrapper.h"
@ -43,10 +46,16 @@ public:
xmlrpc_c::value * const retvalP) {
const params_t params = paramList.getStruct(0);
breakOutParams(params);
#if PT_UG
Mmsapt* pdsa = reinterpret_cast<Mmsapt*>(PhraseDictionary::GetColl()[0]);
pdsa->add(source_,target_,alignment_);
#else
const PhraseDictionary* pdf = PhraseDictionary::GetColl()[0];
PhraseDictionaryDynSuffixArray* pdsa = (PhraseDictionaryDynSuffixArray*) pdf;
PhraseDictionaryDynSuffixArray*
pdsa = (PhraseDictionaryDynSuffixArray*) pdf;
cerr << "Inserting into address " << pdsa << endl;
pdsa->insertSnt(source_, target_, alignment_);
#endif
if(add2ORLM_) {
//updateORLM();
}
@ -54,7 +63,9 @@ public:
//PhraseDictionary* pdsa = (PhraseDictionary*) pdf->GetDictionary(*dummy);
map<string, xmlrpc_c::value> retData;
//*retvalP = xmlrpc_c::value_struct(retData);
#ifndef PT_UG
pdf = 0;
#endif
pdsa = 0;
*retvalP = xmlrpc_c::value_string("Phrase table updated");
}
@ -211,8 +222,7 @@ public:
"Missing source text",
xmlrpc_c::fault::CODE_PARSE);
}
const string source(
(xmlrpc_c::value_string(si->second)));
const string source((xmlrpc_c::value_string(si->second)));
cerr << "Input: " << source << endl;
si = params.find("align");
@ -230,6 +240,9 @@ public:
si = params.find("nbest-distinct");
bool nbest_distinct = (si != params.end());
si = params.find("add-score-breakdown");
bool addScoreBreakdown = (si != params.end());
vector<float> multiModelWeights;
si = params.find("lambda");
if (si != params.end()) {
@ -258,8 +271,8 @@ public:
if (staticData.IsChart()) {
TreeInput tinput;
const vector<FactorType> &inputFactorOrder =
staticData.GetInputFactorOrder();
const vector<FactorType>&
inputFactorOrder = staticData.GetInputFactorOrder();
stringstream in(source + "\n");
tinput.Read(in,inputFactorOrder);
ChartManager manager(tinput);
@ -305,7 +318,8 @@ public:
insertTranslationOptions(manager,retData);
}
if (nbest_size>0) {
outputNBest(manager, retData, nbest_size, nbest_distinct, reportAllFactors, addAlignInfo);
outputNBest(manager, retData, nbest_size, nbest_distinct,
reportAllFactors, addAlignInfo, addScoreBreakdown);
}
}
pair<string, xmlrpc_c::value>
@ -330,8 +344,9 @@ public:
if (addAlignmentInfo) {
/**
* Add the alignment info to the array. This is in target order and consists of
* (tgt-start, src-start, src-end) triples.
* Add the alignment info to the array. This is in target
* order and consists of (tgt-start, src-start, src-end)
* triples.
**/
map<string, xmlrpc_c::value> phraseAlignInfo;
phraseAlignInfo["tgt-start"] = xmlrpc_c::value_int(hypo->GetCurrTargetWordsRange().GetStartPos());
@ -396,7 +411,8 @@ public:
const int n=100,
const bool distinct=false,
const bool reportAllFactors=false,
const bool addAlignmentInfo=false)
const bool addAlignmentInfo=false,
const bool addScoreBreakdown=false)
{
TrellisPathList nBestList;
manager.CalcNBest(n, nBestList, distinct);
@ -452,6 +468,14 @@ public:
}
}
if (addScoreBreakdown)
{
// should the score breakdown be reported in a more structured manner?
ostringstream buf;
MosesCmd::OutputAllFeatureScores(path.GetScoreBreakdown(),buf);
nBestXMLItem["fvals"] = xmlrpc_c::value_string(buf.str());
}
// weighted score
nBestXMLItem["totalScore"] = xmlrpc_c::value_double(path.GetTotalScore());
nBestXml.push_back(xmlrpc_c::value_struct(nBestXMLItem));
@ -490,11 +514,55 @@ public:
}
retData.insert(pair<string, xmlrpc_c::value>("topt", xmlrpc_c::value_array(toptsXml)));
}
};
// Write one feature function's weights to /out/ as a single line of the
// form "<description>= w1 w2 ...", taking the current weight vector from
// the global StaticData instance.
static
void
PrintFeatureWeight(ostream& out, const FeatureFunction* ff)
{
  out << ff->GetScoreProducerDescription() << "=";
  const size_t nComps = ff->GetNumScoreComponents();
  const vector<float> w
    = StaticData::Instance().GetAllWeights().GetScoresForProducer(ff);
  for (size_t k = 0; k < nComps; ++k)
    out << " " << w[k];
  out << endl;
}
// Dump the weights of all registered (stateful then stateless) feature
// functions to /out/, one line per feature; untuneable features are listed
// with the marker "UNTUNEABLE" instead of their weights.
static
void
ShowWeights(ostream& out)
{
// adapted from moses-cmd/Main.cpp
// Force fixed-point notation with 6 digits, remembering the previous
// stream state so it can be restored before returning.
std::ios::fmtflags old_flags = out.setf(std::ios::fixed);
size_t old_precision = out.precision(6);
const vector<const StatelessFeatureFunction*>&
slf = StatelessFeatureFunction::GetStatelessFeatureFunctions();
const vector<const StatefulFeatureFunction*>&
sff = StatefulFeatureFunction::GetStatefulFeatureFunctions();
// Stateful features first.
for (size_t i = 0; i < sff.size(); ++i) {
const StatefulFeatureFunction *ff = sff[i];
if (ff->IsTuneable()) {
PrintFeatureWeight(out,ff);
}
else {
out << ff->GetScoreProducerDescription() << " UNTUNEABLE" << endl;
}
}
// Then stateless features (same logic as above).
for (size_t i = 0; i < slf.size(); ++i) {
const StatelessFeatureFunction *ff = slf[i];
if (ff->IsTuneable()) {
PrintFeatureWeight(out,ff);
}
else {
out << ff->GetScoreProducerDescription() << " UNTUNEABLE" << endl;
}
}
// Restore the caller's stream formatting state.
if (! (old_flags & std::ios::fixed))
out.unsetf(std::ios::fixed);
out.precision(old_precision);
}
int main(int argc, char** argv)
{
@ -542,11 +610,16 @@ int main(int argc, char** argv)
exit(1);
}
if (params->isParamSpecified("show-weights")) {
ShowWeights(cout);
exit(0);
}
//512 MB data limit (512KB is not enough for optimization)
xmlrpc_limit_set(XMLRPC_XML_SIZE_LIMIT_ID, 512*1024*1024);
xmlrpc_c::registry myRegistry;
xmlrpc_c::methodPtr const translator(new Translator);
xmlrpc_c::methodPtr const updater(new Updater);
xmlrpc_c::methodPtr const optimizer(new Optimizer);

View File

@ -253,14 +253,17 @@ public:
if ( appendSuffix ) {
fileName << "." << compression;
}
boost::iostreams::filtering_ostream *file = new boost::iostreams::filtering_ostream;
boost::iostreams::filtering_ostream *file
= new boost::iostreams::filtering_ostream;
if ( compression == "gz" ) {
file->push( boost::iostreams::gzip_compressor() );
} else if ( compression == "bz2" ) {
file->push( boost::iostreams::bzip2_compressor() );
} else if ( compression != "txt" ) {
TRACE_ERR("Unrecognized hypergraph compression format (" << compression << ") - using uncompressed plain txt" << std::endl);
TRACE_ERR("Unrecognized hypergraph compression format ("
<< compression
<< ") - using uncompressed plain txt" << std::endl);
compression = "txt";
}
@ -271,7 +274,10 @@ public:
manager.OutputSearchGraphAsHypergraph(m_lineNumber, *file);
file -> flush();
} else {
TRACE_ERR("Cannot output hypergraph for line " << m_lineNumber << " because the output file " << fileName.str() << " is not open or not ready for writing" << std::endl);
TRACE_ERR("Cannot output hypergraph for line " << m_lineNumber
<< " because the output file " << fileName.str()
<< " is not open or not ready for writing"
<< std::endl);
}
file -> pop();
delete file;

View File

@ -95,7 +95,7 @@ namespace Moses
ConfusionNet::
ReadF(std::istream& in, const std::vector<FactorType>& factorOrder, int format)
{
VERBOSE(1, "read confusion net with format "<<format<<"\n");
VERBOSE(2, "read confusion net with format "<<format<<"\n");
switch(format) {
case 0:
return ReadFormat0(in,factorOrder);
@ -120,7 +120,9 @@ namespace Moses
return rv;
}
#if 0
// Deprecated due to code duplication;
// use Word::CreateFromString() instead
void
ConfusionNet::
String2Word(const std::string& s,Word& w,
@ -132,6 +134,7 @@ namespace Moses
FactorCollection::Instance().AddFactor
(Input,factorOrder[i], factorStrVector[i]));
}
#endif
bool
ConfusionNet::
@ -155,7 +158,8 @@ namespace Moses
Column col;
while(is>>word) {
Word w;
String2Word(word,w,factorOrder);
// String2Word(word,w,factorOrder);
w.CreateFromString(Input,factorOrder,StringPiece(word),false,false);
std::vector<float> probs(totalCount, 0.0);
for(size_t i=0; i < numInputScores; i++) {
double prob;
@ -216,7 +220,9 @@ namespace Moses
VERBOSE(1, "WARN: neg costs: "<<data[i][j].second.denseScores[0]<<" -> set to 0\n");
data[i][j].second.denseScores[0]=0.0;
}
String2Word(word,data[i][j].first,factorOrder);
// String2Word(word,data[i][j].first,factorOrder);
Word& w = data[i][j].first;
w.CreateFromString(Input,factorOrder,StringPiece(word),false,false);
} else return 0;
}
return !data.empty();

View File

@ -14,11 +14,11 @@ InputFeature *InputFeature::s_instance = NULL;
InputFeature::InputFeature(const std::string &line)
: StatelessFeatureFunction(line)
, m_numInputScores(0)
, m_numRealWordCount(0)
{
m_numInputScores = this->m_numScoreComponents;
ReadParameters();
UTIL_THROW_IF2(s_instance, "Can only have 1 input feature");
s_instance = this;
}

View File

@ -5,7 +5,7 @@
#include "TypeDef.h"
#include "AlignmentInfo.h"
#include "util/exception.hh"
#include "TranslationModel/PhraseDictionary.h"
using namespace std;
namespace Moses
@ -18,9 +18,9 @@ InputPath(const Phrase &phrase, const NonTerminalSet &sourceNonTerms,
,m_phrase(phrase)
,m_range(range)
,m_inputScore(inputScore)
,m_nextNode(1)
,m_sourceNonTerms(sourceNonTerms)
,m_sourceNonTermArray(FactorCollection::Instance().GetNumNonTerminals(), false)
,m_nextNode(1)
{
for (NonTerminalSet::const_iterator iter = sourceNonTerms.begin(); iter != sourceNonTerms.end(); ++iter) {
size_t idx = (*iter)[0]->GetId();
@ -33,6 +33,14 @@ InputPath(const Phrase &phrase, const NonTerminalSet &sourceNonTerms,
InputPath::~InputPath()
{
// Since there is no way for the Phrase Dictionaries to tell in
// which (sentence) context phrases were looked up, we tell them
// now that the phrase isn't needed any more by this inputPath
typedef std::pair<const TargetPhraseCollection*, const void* > entry;
std::map<const PhraseDictionary*, entry>::const_iterator iter;
for (iter = m_targetPhrases.begin(); iter != m_targetPhrases.end(); ++iter)
iter->first->Release(iter->second.first);
delete m_inputScore;
}

View File

@ -40,7 +40,18 @@ current ?= "" ;
path-constant LM-LOG : bin/lm.log ;
update-if-changed $(LM-LOG) $(current) ;
obj FF_Factory.o : FF/Factory.cpp LM//macros headers ../lm//kenlm : <dependency>$(LM-LOG) ;
obj FF_Factory.o : FF/Factory.cpp LM//macros headers ../lm//kenlm mmlib : <dependency>$(LM-LOG) ;
if [ option.get "with-mm" : no : yes ] = yes
{
alias mmlib :
$(TOP)/moses/TranslationModel/UG//mmsapt
$(TOP)/moses/TranslationModel/UG/generic//generic
$(TOP)/moses/TranslationModel/UG/mm//mm
;
} else {
alias mmlib ;
}
lib moses :
[ glob
@ -62,12 +73,11 @@ lib moses :
]
headers FF_Factory.o LM//LM TranslationModel/CompactPT//CompactPT synlm ThreadPool
..//search ../util/double-conversion//double-conversion ..//z ../OnDiskPt//OnDiskPt
$(TOP)//boost_iostreams
$(TOP)//boost_iostreams mmlib
:
<threading>single:<source>../util//rt
;
#generic//generic mm//mm
alias headers-to-install : [ glob-tree *.h ] ;

View File

@ -182,7 +182,11 @@ void Manager::printDivergentHypothesis(long translationId, const Hypothesis* hyp
}
void Manager::printThisHypothesis(long translationId, const Hypothesis* hypo, const vector <const TargetPhrase*> & remainingPhrases, float remainingScore, ostream& outputStream) const
void
Manager::
printThisHypothesis(long translationId, const Hypothesis* hypo,
const vector <const TargetPhrase*> & remainingPhrases,
float remainingScore, ostream& outputStream) const
{
outputStream << translationId << " ||| ";

View File

@ -50,6 +50,13 @@ PhraseDictionary::PhraseDictionary(const std::string &line)
s_staticColl.push_back(this);
}
// Default implementation: this phrase table does not support prefix
// checks (PrefixExists); subclasses that do must override this to return
// true.
bool
PhraseDictionary::
ProvidesPrefixCheck() const
{
return false;
}
const TargetPhraseCollection *PhraseDictionary::GetTargetPhraseCollectionLEGACY(const Phrase& src) const
{
const TargetPhraseCollection *ret;
@ -129,6 +136,23 @@ SetFeaturesToApply()
}
}
// tell the Phrase Dictionary that the TargetPhraseCollection is not needed any more
// Default implementation is a no-op; phrase tables that hand out
// collections they must reclaim (e.g. dynamic/suffix-array-backed tables)
// override this.
void
PhraseDictionary::
Release(TargetPhraseCollection const* tpc) const
{
// do nothing by default
return;
}
// Default implementation: optimistically report that entries starting
// with /phrase/ may exist. NOTE(review): ProvidesPrefixCheck() defaults to
// false, so callers are expected to consult it before trusting this
// answer — returning true here is the safe "don't prune" default.
bool
PhraseDictionary::
PrefixExists(Phrase const& phrase) const
{
return true;
}
void
PhraseDictionary::
GetTargetPhraseCollectionBatch(const InputPathList &inputPathQueue) const

View File

@ -71,6 +71,8 @@ public:
class PhraseDictionary : public DecodeFeature
{
public:
virtual bool ProvidesPrefixCheck() const;
static const std::vector<PhraseDictionary*>& GetColl() {
return s_staticColl;
}
@ -85,6 +87,16 @@ public:
return m_tableLimit;
}
virtual
void
Release(TargetPhraseCollection const* tpc) const;
/// return true if phrase table entries starting with /phrase/
// exist in the table.
virtual
bool
PrefixExists(Phrase const& phrase) const;
// LEGACY!
// The preferred method is to override GetTargetPhraseCollectionBatch().
// See class PhraseDictionaryMemory or PhraseDictionaryOnDisk for details

View File

@ -0,0 +1,48 @@
#include "moses/TranslationModel/UG/generic/threading/ug_thread_safe_counter.h"
namespace Moses
{
// Initialize the counter to zero.
ThreadSafeCounter::
ThreadSafeCounter()
: ctr(0)
{ }
// Pre-increment: atomically (w.r.t. this mutex) bump and return the new value.
size_t
ThreadSafeCounter::
operator++()
{
boost::lock_guard<boost::mutex> guard(this->lock);
return ++ctr;
}
// Post-increment: bump under the lock, return the value before the bump.
size_t
ThreadSafeCounter::
operator++(int foo)
{
boost::lock_guard<boost::mutex> guard(this->lock);
return ctr++;
}
// Conversion to size_t. NOTE(review): this reads ctr WITHOUT taking the
// lock (the mutex cannot be locked in a const member here), so the value
// may be slightly stale under concurrent updates — confirm this is
// acceptable for its uses (it appears to be diagnostics only).
ThreadSafeCounter::
operator size_t() const
{
return ctr;
}
// Pre-decrement: decrement under the lock, return the new value.
size_t
ThreadSafeCounter::
operator--()
{
boost::lock_guard<boost::mutex> guard(this->lock);
return --ctr;
}
// Post-decrement: decrement under the lock, return the value before it.
size_t
ThreadSafeCounter::
operator--(int foo)
{
boost::lock_guard<boost::mutex> guard(this->lock);
return ctr--;
}
}

View File

@ -0,0 +1,21 @@
#pragma once
#include <boost/thread.hpp>
namespace Moses
{
// A size_t counter whose increment/decrement operators are serialized by
// an internal mutex, so the counter can be shared between threads.
// All four ++/-- forms return the usual pre/post values.
class ThreadSafeCounter
{
size_t ctr;          // current count, guarded by /lock/ for modification
boost::mutex lock;   // serializes the ++/-- operators
public:
ThreadSafeCounter(); // starts at 0
size_t operator++();
size_t operator++(int);
size_t operator--();
size_t operator--(int);
// Read access; see .cpp — the read is not performed under the lock.
operator size_t() const;
};
}

View File

@ -1,3 +1,5 @@
external-lib bz2 ;
exe mmlex-build :
mmlex-build.cc
$(TOP)/moses/TranslationModel/UG/generic//generic
@ -7,6 +9,15 @@ $(TOP)/moses/TranslationModel/UG/mm//mm
$(TOP)/util//kenutil
;
exe mmlex-lookup :
mmlex-lookup.cc
$(TOP)/moses/TranslationModel/UG/generic//generic
$(TOP)//boost_iostreams
$(TOP)//boost_program_options
$(TOP)/moses/TranslationModel/UG/mm//mm
$(TOP)/util//kenutil
;
exe mtt-count-words :
mtt-count-words.cc
$(TOP)/moses/TranslationModel/UG/generic//generic
@ -34,6 +45,15 @@ $(TOP)/moses/TranslationModel/UG/mm//mm
$(TOP)/util//kenutil
;
exe mam2symal :
mam2symal.cc
$(TOP)/moses/TranslationModel/UG/generic//generic
$(TOP)//boost_iostreams
$(TOP)//boost_program_options
$(TOP)/moses/TranslationModel/UG/mm//mm
$(TOP)/util//kenutil
;
exe symal2mam :
symal2mam.cc
$(TOP)/moses/TranslationModel/UG/generic//generic
@ -43,17 +63,47 @@ $(TOP)/moses/TranslationModel/UG/mm//mm
$(TOP)/util//kenutil
;
exe custom-pt :
custom-pt.cc
#$(TOP)/moses/generic//generic
exe mam_verify :
mam_verify.cc
$(TOP)/moses/TranslationModel/UG/generic//generic
$(TOP)//boost_iostreams
$(TOP)//boost_program_options
$(TOP)/moses/TranslationModel/UG/mm//mm
$(TOP)/util//kenutil
;
exe custom-pt :
custom-pt.cc
$(TOP)/moses//moses
$(TOP)//boost_iostreams
$(TOP)//boost_program_options
$(TOP)/moses/TranslationModel/UG/mm//mm
$(TOP)/moses/TranslationModel/UG/generic//generic
$(TOP)/util//kenutil
;
install $(PREFIX)/bin : mtt-build mtt-dump mtt-count-words symal2mam custom-pt mmlex-build ;
exe calc-coverage :
calc-coverage.cc
$(TOP)/moses/TranslationModel/UG/generic//generic
$(TOP)//boost_iostreams
$(TOP)//boost_program_options
$(TOP)/moses/TranslationModel/UG/mm//mm
$(TOP)/util//kenutil
;
install $(PREFIX)/bin :
mtt-build
mtt-dump
mtt-count-words
symal2mam
mam2symal
custom-pt
mmlex-build
mmlex-lookup
mam_verify
calc-coverage
;
fakelib mm : [ glob ug_*.cc tpt_*.cc ] ;

View File

@ -76,7 +76,7 @@ endef
testprogs = test-dynamic-im-tsa
programs = mtt-build mtt-dump symal2mam custom-pt mmlex-build ${testprogs}
programs += mtt-count-words
programs += mtt-count-words calc-coverage
all: $(addprefix ${BINDIR}/${BINPREF}, $(programs))
@echo $^

View File

@ -0,0 +1,56 @@
#include "moses/TranslationModel/UG/mm/ug_mm_ttrack.h"
#include "moses/TranslationModel/UG/mm/ug_mm_tsa.h"
#include "moses/TranslationModel/UG/mm/tpt_tokenindex.h"
#include "moses/TranslationModel/UG/mm/ug_corpus_token.h"
#include "moses/TranslationModel/UG/mm/ug_typedefs.h"
#include "moses/TranslationModel/UG/mm/tpt_pickler.h"
#include "moses/TranslationModel/UG/mm/ug_bitext.h"
#include "moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer2.h"
#include "moses/TranslationModel/UG/generic/file_io/ug_stream.h"
// using namespace Moses;
using namespace ugdiss;
typedef L2R_Token<SimpleWordId> Token;
TokenIndex V;
sptr<vector<vector<Token> > > C(new vector<vector<Token> >());
// Read /fname/ (transparently decompressed by open_input_stream), map each
// line to a token sequence via the global vocabulary V, and append the
// sequence as a new sentence to the global corpus C.
void
add_file(string fname)
{
  filtering_istream in;
  open_input_stream(fname,in);
  for (string line; getline(in,line); )
    {
      C->push_back(vector<Token>());
      fill_token_seq(V,line,C->back());
    }
}
// Interactive coverage checker: builds an in-memory suffix array over the
// text file given as argv[1], then for each line read from stdin prints,
// for every start position i, the longest prefixes found in the corpus
// together with their approximate occurrence counts.
int
main(int argc, char* argv[])
{
  // Guard against a missing file argument; the original dereferenced
  // argv[1] unconditionally and crashed when run without arguments.
  if (argc < 2)
    {
      cout << "usage: " << argv[0] << " <text file>" << endl;
      return 1;
    }
  V.setDynamic(true); // allow the vocabulary to grow as new words appear
  add_file(argv[1]);
  sptr<imTtrack<Token> > T(new imTtrack<Token>(C));
  imTSA<Token> I(T,NULL,NULL); // suffix array over the in-memory track
  string line;
  while (getline(cin,line))
    {
      vector<Token> seq; fill_token_seq<Token>(V,line,seq);
      // For each start position, extend the match token by token for as
      // long as the suffix array still contains the growing prefix.
      for (size_t i = 0; i < seq.size(); ++i)
        {
          TSA<Token>::tree_iterator m(&I);
          cout << V[seq[i].id()];
          for (size_t k = i; k < seq.size() && m.extend(seq[k]); ++k)
            {
              cout << " ";
              if (k > i) cout << V[seq[k].id()] << " ";
              cout << "[" << m.approxOccurrenceCount() << "]";
            }
          cout << endl;
        }
    }
}

View File

@ -53,7 +53,7 @@ nbest_phrasepairs(uint64_t const pid1,
pstats const& ps,
vector<PhrasePair> & nbest)
{
boost::unordered_map<uint64_t,jstats>::const_iterator m;
pstats::trg_map_t::const_iterator m;
vector<size_t> idx(nbest.size());
size_t i=0;
for (m = ps.trg.begin();

View File

@ -0,0 +1,98 @@
// -*- c++ -*-
// (c) 2008-2010 Ulrich Germann
#include <boost/program_options.hpp>
#include <iomanip>
#include "tpt_typedefs.h"
#include "ug_mm_ttrack.h"
#include "tpt_tokenindex.h"
#include "ug_deptree.h"
#include "ug_corpus_token.h"
#include "tpt_pickler.h"
using namespace std;
using namespace ugdiss;
namespace po = boost::program_options;
string mamfile;
vector<string> range;
typedef L2R_Token<Conll_Sform> Token;
mmTtrack<char> MAM;
bool with_sids;
// Parse the command line: one positional mam file, optional sentence
// ranges, and -n/--numbers to prefix each printed sentence with its id.
// Prints usage and exits when --help is given or no mam file is provided.
void
interpret_args(int ac, char* av[])
{
po::variables_map vm;
po::options_description o("Options");
o.add_options()
("help,h", "print this message")
("numbers,n", po::bool_switch(&with_sids), "print sentence ids as first token")
;
// Positional arguments are declared as hidden options so they do not
// clutter the --help output.
po::options_description h("Hidden Options");
h.add_options()
("mamfile", po::value<string>(&mamfile), "mamfile")
("range", po::value<vector<string> >(&range), "range")
;
po::positional_options_description a;
a.add("mamfile",1);
a.add("range",-1);
po::store(po::command_line_parser(ac,av)
.options(h.add(o))
.positional(a)
.run(),vm);
po::notify(vm); // IMPORTANT
if (vm.count("help") || mamfile.empty())
{
cout << "usage:\n\t"
<< av[0] << " track name [<range>]\n"
<< endl;
cout << o << endl;
exit(0);
}
}
// Print the alignments of sentences [start,stop) from the memory-mapped
// alignment track MAM, one sentence per line, as "s-t" pairs (source and
// target word positions decoded from the binary-packed representation).
void
printRangeMAM(size_t start, size_t stop)
{
for (;start < stop; start++)
{
// size_t i = 0;
char const* p = MAM.sntStart(start);
char const* q = MAM.sntEnd(start);
if (with_sids) cout << start << " ";  // optionally prefix the sentence id
ushort s,t;
// Each alignment point is stored as two binary-encoded shorts.
while (p < q)
{
p = binread(p,s);
p = binread(p,t);
cout << s << "-" << t << " ";
}
cout << endl;
}
}
// Open the mam file named on the command line and print either all
// sentences or only the requested ranges (given as "first" or
// "first-last").
int
main(int argc, char*argv[])
{
interpret_args(argc,argv);
MAM.open(mamfile);
if (!range.size()) printRangeMAM(0, MAM.size());
else
{
for (size_t i = 0; i < range.size(); i++)
{
// Parse "first" or "first-last"; a single number means a 1-sentence range.
istringstream buf(range[i]);
size_t first,last; uchar c;
buf>>first;
if (buf.peek() == '-') buf>>c>>last;
else last = first;
// Ranges extending past the end of the track are silently skipped.
if (last < MAM.size())
printRangeMAM(first,last+1);
}
}
}

View File

@ -0,0 +1,120 @@
// -*- c++ -*-
// (c) 2008-2010 Ulrich Germann
#include <boost/program_options.hpp>
#include <iomanip>
#include "tpt_typedefs.h"
#include "ug_mm_ttrack.h"
#include "tpt_tokenindex.h"
#include "ug_deptree.h"
#include "ug_corpus_token.h"
#include "tpt_pickler.h"
using namespace std;
using namespace ugdiss;
namespace po = boost::program_options;
typedef L2R_Token<Conll_Sform> Token;
string bname,L1,L2;
mmTtrack<char> MAM;
mmTtrack<Token> T1,T2;
bool inv;
vector<string> range;
// Parse the command line: positional base name, L1, L2, and optional
// sentence ranges. Prints usage and exits when --help is given or the
// required L2 argument is missing.
// NOTE(review): the --inv/-i switch is parsed into /inv/ but the flag is
// not consulted anywhere in this file's visible code — confirm intent.
void
interpret_args(int ac, char* av[])
{
po::variables_map vm;
po::options_description o("Options");
o.add_options()
("help,h", "print this message")
("inv,i", po::bool_switch(&inv), "inverse")
;
// Positionals declared as hidden options so --help output stays clean.
po::options_description h("Hidden Options");
h.add_options()
("bname", po::value<string>(&bname), "base name")
("L1", po::value<string>(&L1), "L1")
("L2", po::value<string>(&L2), "L2")
("range", po::value<vector<string> >(&range), "range")
;
po::positional_options_description a;
a.add("bname",1);
a.add("L1",1);
a.add("L2",1);
a.add("range",-1);
po::store(po::command_line_parser(ac,av)
.options(h.add(o))
.positional(a)
.run(),vm);
po::notify(vm); // IMPORTANT
if (vm.count("help") || L2.empty())
{
cout << "usage:\n\t"
<< av[0] << " <base name> <L1> <L2> \n"
<< endl;
cout << o << endl;
exit(0);
}
}
// Validate alignments of sentences [start,stop): report any alignment
// point whose source or target position is outside the corresponding
// sentence length in tracks T1/T2. Returns the number of sentence pairs
// that have no alignment points at all.
size_t
check_range(size_t start, size_t stop)
{
size_t noAln = 0;
for (size_t sid = start; sid < stop; ++sid)
{
char const* p = MAM.sntStart(sid);
char const* q = MAM.sntEnd(sid);
size_t slen = T1.sntLen(sid);
size_t tlen = T2.sntLen(sid);
if (p == q) ++noAln;  // empty alignment record for this sentence pair
ushort s,t;
// Decode binary-packed (s,t) alignment pairs and bounds-check each one.
while (p < q)
{
p = binread(p,s);
p = binread(p,t);
if (s >= slen || t >= tlen)
{
cout << "alignment out of bounds in sentence " << sid << ": "
<< s << "-" << t << " in " << slen << ":" << tlen << "."
<< endl;
break;  // one report per sentence is enough; move on
}
}
}
return noAln;
}
// Open the alignment track and both token tracks for the given base
// name/language pair, verify that their sizes agree, then check the
// requested sentence ranges (default: everything) and report how many
// sentence pairs carry no alignment.
int
main(int argc, char*argv[])
{
interpret_args(argc,argv);
MAM.open(bname+L1+"-"+L2+".mam");
T1.open(bname+L1+".mct");
T2.open(bname+L2+".mct");
if (T1.size() != T2.size() || T1.size() != MAM.size())
{
cout << "Track sizes don't match!" << endl;
exit(1);
}
size_t noAln;
if (!range.size())
noAln = check_range(0, MAM.size());
else
{
noAln = 0;
for (size_t i = 0; i < range.size(); i++)
{
// Parse "first" or "first-last"; a bare number is a 1-sentence range.
istringstream buf(range[i]);
size_t first,last; uchar c;
buf>>first;
if (buf.peek() == '-') buf>>c>>last;
else last = first;
// Out-of-range requests are silently ignored.
if (last < MAM.size())
noAln += check_range(first,last+1);
}
}
cout << noAln << " sentence pairs without alignment" << endl;
}

View File

@ -0,0 +1,149 @@
// -*- c++ -*-
// Program to extract word cooccurrence counts from a memory-mapped
// word-aligned bitext stores the counts lexicon in the format for
// mm2dTable<uint32_t> (ug_mm_2d_table.h)
//
// (c) 2010-2012 Ulrich Germann
// to do: multi-threading
#include <queue>
#include <iomanip>
#include <vector>
#include <iterator>
#include <sstream>
#include <algorithm>
#include <boost/program_options.hpp>
#include <boost/dynamic_bitset.hpp>
#include <boost/shared_ptr.hpp>
#include <boost/foreach.hpp>
#include <boost/thread.hpp>
#include <boost/math/distributions/binomial.hpp>
#include <boost/unordered_map.hpp>
#include <boost/unordered_set.hpp>
#include "moses/TranslationModel/UG/generic/program_options/ug_get_options.h"
#include "ug_mm_2d_table.h"
#include "ug_mm_ttrack.h"
#include "ug_corpus_token.h"
using namespace std;
using namespace ugdiss;
using namespace boost::math;
typedef mm2dTable<id_type,id_type,uint32_t,uint32_t> LEX_t;
typedef SimpleWordId Token;
// DECLARATIONS
void interpret_args(int ac, char* av[]);
string swrd,twrd,L1,L2,bname;
TokenIndex V1,V2;
LEX_t LEX;
// Print the full lexicon row for source-word id /r/: the word and its
// marginal count, followed by all cooccurring target words sorted by
// descending joint count, each with p(t|s), p(s|t), and raw counts.
void
lookup_source(ostream& out, id_type r)
{
// Copy the row's cells so they can be sorted without touching the mmap.
vector<LEX_t::Cell> foo(LEX[r].start,LEX[r].stop);
sort(foo.begin(),foo.end(),LEX_t::Cell::SortDescendingByValue());
out << V1[r] << " " << LEX.m1(r) << endl;
BOOST_FOREACH(LEX_t::Cell const& c, foo)
{
// joint/src-marginal, joint/trg-marginal, target word, joint/trg-marginal
out << setw(10) << float(c.val)/LEX.m1(r) << " "
<< setw(10) << float(c.val)/LEX.m2(c.id) << " "
<< V2[c.id] << " " << c.val << "/" << LEX.m2(c.id) << endl;
}
}
// Print the lexicon column for target-word id /c/: the word and its
// marginal count, followed by all cooccurring source words sorted by
// descending joint count. Because the table is stored row-major, this
// scans every row (O(numRows)) to collect the column's nonzero cells.
void
lookup_target(ostream& out, id_type c)
{
vector<LEX_t::Cell> foo;
LEX_t::Cell cell;
for (size_t r = 0; r < LEX.numRows; ++r)
{
size_t j = LEX[r][c];
if (j)  // keep only source words that actually cooccur with /c/
{
cell.id = r;
cell.val = j;
foo.push_back(cell);
}
}
sort(foo.begin(),foo.end(),LEX_t::Cell::SortDescendingByValue());
out << V2[c] << " " << LEX.m2(c) << endl;
BOOST_FOREACH(LEX_t::Cell const& r, foo)
{
// joint/trg-marginal, joint/src-marginal, source word, joint/src-marginal
out << setw(10) << float(r.val)/LEX.m2(c) << " "
<< setw(10) << float(r.val)/LEX.m1(r.id) << " "
<< V1[r.id] << " " << r.val << "/" << LEX.m1(r.id) << endl;
}
}
// Dump the entire lexicon: one lookup_source() listing per source-word
// row, followed by a trailing blank line.
void
dump(ostream& out)
{
  size_t row = 0;
  while (row < LEX.numRows)
    lookup_source(out, row++);
  out << endl;
}
// Look up lexical cooccurrence counts. Depending on which of -s/-t were
// given: both -> print the single cell; only source -> print its row;
// only target -> print its column; neither -> dump the whole table.
int
main(int argc, char* argv[])
{
interpret_args(argc,argv);
// Append a '.' to the base name unless it already ends in a path or
// extension separator.
// NOTE(review): *bname.rbegin() is undefined if bname is empty — confirm
// interpret_args/get_options guarantees a non-empty base name.
char c = *bname.rbegin();
if (c != '/' && c != '.') bname += '.';
V1.open(bname+L1+".tdx");
V2.open(bname+L2+".tdx");
LEX.open(bname+L1+"-"+L2+".lex");
cout.precision(2);
id_type swid = V1[swrd];
id_type twid = V2[twrd];
// NOTE(review): id 1 appears to be the unknown-word id here, i.e.
// "!= 1" means "the word was found in the vocabulary" — confirm against
// TokenIndex's unkId convention.
if (swid != 1 && twid != 1)
{
cout << swrd << " " << twrd << " "
<< LEX.m1(swid) << " / "
<< LEX[swid][twid] << " / "
<< LEX.m2(twid) << endl;
}
else if (swid != 1)
lookup_source(cout,swid);
else if (twid != 1)
lookup_target(cout,twid);
else
dump(cout);
}
void
interpret_args(int ac, char* av[])
{
namespace po=boost::program_options;
po::variables_map vm;
po::options_description o("Options");
po::options_description h("Hidden Options");
po::positional_options_description a;
o.add_options()
("help,h", "print this message")
("source,s",po::value<string>(&swrd),"source word")
("target,t",po::value<string>(&swrd),"target word")
;
h.add_options()
("bname", po::value<string>(&bname), "base name")
("L1", po::value<string>(&L1),"L1 tag")
("L2", po::value<string>(&L2),"L2 tag")
;
a.add("bname",1);
a.add("L1",1);
a.add("L2",1);
get_options(ac,av,h.add(o),a,vm,"cfg");
}

View File

@ -24,7 +24,7 @@ mmTtrack<SimpleWordId> MCT;
bool sform;
bool have_mtt, have_mct;
bool with_sids;
bool with_positions;
void
interpret_args(int ac, char* av[])
{
@ -34,6 +34,7 @@ interpret_args(int ac, char* av[])
("help,h", "print this message")
("numbers,n", po::bool_switch(&with_sids), "print sentence ids as first token")
("sform,s", po::bool_switch(&sform), "sform only")
("with-positions,p", po::bool_switch(&with_positions), "show word positions")
;
po::options_description h("Hidden Options");
@ -68,10 +69,10 @@ printRangeMTT(size_t start, size_t stop)
for (;start < stop; start++)
{
size_t i = 0;
Token const* t = MTT.sntStart(start);
Token const* s = MTT.sntStart(start);
Token const* e = MTT.sntEnd(start);
if (with_sids) cout << start << " ";
for (;t < e; ++t)
for (Token const* t = s; t < e; ++t)
{
#if 0
uchar const* x = reinterpret_cast<uchar const*>(t);
@ -91,7 +92,11 @@ printRangeMTT(size_t start, size_t stop)
cout << i+t->parent << " ";
cout << DT[t->dtype] << endl;
}
else cout << SF[t->id()] << " ";
else
{
if (with_positions) cout << t-s << ":";
cout << SF[t->id()] << " ";
}
}
cout << endl;
}
@ -102,10 +107,15 @@ printRangeMCT(size_t start, size_t stop)
{
for (;start < stop; start++)
{
SimpleWordId const* t = MCT.sntStart(start);
SimpleWordId const* s = MCT.sntStart(start);
SimpleWordId const* t = s;
SimpleWordId const* e = MCT.sntEnd(start);
if (with_sids) cout << start << " ";
while (t < e) cout << SF[(t++)->id()] << " ";
while (t < e)
{
if (with_positions) cout << t-s << ":";
cout << SF[(t++)->id()] << " ";
}
cout << endl;
}
}

View File

@ -21,8 +21,8 @@
#include <boost/program_options.hpp>
#include <boost/scoped_ptr.hpp>
#include "headers-base/util/exception.hh"
#include "headers-base/util/check.hh"
#include "util/exception.hh"
// #include "headers-base/util/check.hh"
// NOTE TO SELF:
/* Program to filter out sentences that GIZA will skip or truncate,

View File

@ -44,16 +44,14 @@ namespace ugdiss
file.open(fname);
if (!file.is_open())
{
cerr << "Error opening file " << fname << endl;
assert(0);
ostringstream msg;
msg << "TokenIndex::open: Error opening file '" << fname << "'.";
throw std::runtime_error(msg.str().c_str());
}
// cout << "file is open" << endl;
this->numTokens = *(reinterpret_cast<uint32_t const*>(file.data()));
unkId = *(reinterpret_cast<id_type const*>(file.data()+4));
// cout << "tokenindex.open: unkId=" << unkId << endl;
startIdx = reinterpret_cast<Entry const*>(file.data()+4+sizeof(id_type));
endIdx = startIdx + numTokens;
comp.base = reinterpret_cast<char const*>(endIdx);
@ -143,13 +141,10 @@ namespace ugdiss
TokenIndex::
operator[](id_type id) const
{
if (!ridx.size())
if (!ridx.size())
{
cerr << "FATAL ERROR: You need to call iniReverseIndex() "
<< "on the TokenIndex class before using operator[](id_type id)."
<< endl;
assert(0);
exit(1);
boost::lock_guard<boost::mutex> lk(*this->lock);
if (!ridx.size()) ridx = reverseIndex();
}
if (id < ridx.size())
return ridx[id];
@ -163,7 +158,11 @@ namespace ugdiss
TokenIndex::
iniReverseIndex()
{
if (!ridx.size()) ridx = reverseIndex();
if (!ridx.size())
{
boost::lock_guard<boost::mutex> lk(*this->lock);
if (!ridx.size()) ridx = reverseIndex();
}
}
@ -171,7 +170,11 @@ namespace ugdiss
TokenIndex::
operator[](id_type id)
{
if (!ridx.size()) ridx = reverseIndex();
if (!ridx.size())
{
boost::lock_guard<boost::mutex> lk(*this->lock);
if (!ridx.size()) ridx = reverseIndex();
}
if (id < ridx.size())
return ridx[id];
boost::lock_guard<boost::mutex> lk(*this->lock);
@ -184,7 +187,11 @@ namespace ugdiss
TokenIndex::
toString(vector<id_type> const& v)
{
if (!ridx.size()) ridx = reverseIndex();
if (!ridx.size())
{
boost::lock_guard<boost::mutex> lk(*this->lock);
if (!ridx.size()) ridx = reverseIndex();
}
ostringstream buf;
for (size_t i = 0; i < v.size(); i++)
buf << (i ? " " : "") << (*this)[v[i]];
@ -195,7 +202,11 @@ namespace ugdiss
TokenIndex::
toString(vector<id_type> const& v) const
{
assert (ridx.size());
if (!ridx.size())
{
boost::lock_guard<boost::mutex> lk(*this->lock);
if (!ridx.size()) ridx = reverseIndex();
}
ostringstream buf;
for (size_t i = 0; i < v.size(); i++)
buf << (i ? " " : "") << (*this)[v[i]];
@ -206,7 +217,11 @@ namespace ugdiss
TokenIndex::
toString(id_type const* start, id_type const* const stop)
{
if (!ridx.size()) ridx = reverseIndex();
if (!ridx.size())
{
boost::lock_guard<boost::mutex> lk(*this->lock);
if (!ridx.size()) ridx = reverseIndex();
}
ostringstream buf;
if (start < stop)
buf << (*this)[*start];
@ -219,7 +234,11 @@ namespace ugdiss
TokenIndex::
toString(id_type const* start, id_type const* const stop) const
{
assert (ridx.size());
if (!ridx.size())
{
boost::lock_guard<boost::mutex> lk(*this->lock);
if (!ridx.size()) ridx = reverseIndex();
}
ostringstream buf;
if (start < stop)
buf << (*this)[*start];

View File

@ -28,7 +28,7 @@ namespace ugdiss
class TokenIndex
{
/** Reverse index: maps from ID to char const* */
vector<char const*> ridx;
mutable vector<char const*> ridx;
/** Label for the UNK token */
string unkLabel;
id_type unkId,numTokens;
@ -164,5 +164,12 @@ namespace ugdiss
write_tokenindex_to_disk(tok,ofile,unkToken);
}
template<typename Token>
void
fill_token_seq(TokenIndex& V, string const& line, vector<Token>& dest)
{
istringstream buf(line); string w;
while (buf>>w) dest.push_back(Token(V[w]));
}
}
#endif

View File

@ -10,6 +10,9 @@ namespace Moses
{
namespace bitext
{
ThreadSafeCounter pstats::active;
pstats::
pstats()
: raw_cnt (0)
@ -20,6 +23,14 @@ namespace Moses
{
ofwd[0] = ofwd[1] = ofwd[2] = ofwd[3] = ofwd[4] = ofwd[5] = ofwd[6] = 0;
obwd[0] = obwd[1] = obwd[2] = obwd[3] = obwd[4] = obwd[5] = obwd[6] = 0;
// if (++active%5 == 0)
// cerr << size_t(active) << " active pstats at " << __FILE__ << ":" << __LINE__ << endl;
}
pstats::
~pstats()
{
--active;
}
void
@ -49,16 +60,13 @@ namespace Moses
uint32_t fwd_o,
uint32_t bwd_o)
{
this->lock.lock();
boost::lock_guard<boost::mutex> guard(this->lock);
jstats& entry = this->trg[pid];
this->lock.unlock();
entry.add(w,a,cnt2,fwd_o,bwd_o);
if (this->good < entry.rcnt())
{
this->lock.lock();
return false;
// UTIL_THROW(util::Exception, "more joint counts than good counts!"
// << entry.rcnt() << "/" << this->good);
UTIL_THROW(util::Exception, "more joint counts than good counts:"
<< entry.rcnt() << "/" << this->good << "!");
}
return true;
}
@ -338,6 +346,10 @@ namespace Moses
typedef L2R_Token<SimpleWordId> TKN;
assert(s1.size() == s2.size() && s1.size() == aln.size());
#ifndef NDEBUG
size_t first_new_snt = this->T1 ? this->T1->size() : 0;
#endif
sptr<imBitext<TKN> > ret;
{
lock_guard<mutex> guard(this->lock);
@ -346,30 +358,58 @@ namespace Moses
// we add the sentences in separate threads (so it's faster)
boost::thread thread1(snt_adder<TKN>(s1,*ret->V1,ret->myT1,ret->myI1));
thread1.join(); // for debugging
// thread1.join(); // for debugging
boost::thread thread2(snt_adder<TKN>(s2,*ret->V2,ret->myT2,ret->myI2));
BOOST_FOREACH(string const& a, aln)
{
istringstream ibuf(a);
ostringstream obuf;
uint32_t row,col; char c;
while (ibuf>>row>>c>>col)
while (ibuf >> row >> c >> col)
{
assert(c == '-');
binwrite(obuf,row);
binwrite(obuf,col);
}
char const* x = obuf.str().c_str();
vector<char> v(x,x+obuf.str().size());
// important: DO NOT replace the two lines below this comment by
// char const* x = obuf.str().c_str(), as the memory x is pointing
// to is freed immediately upon deconstruction of the string object.
string foo = obuf.str();
char const* x = foo.c_str();
vector<char> v(x,x+foo.size());
ret->myTx = append(ret->myTx, v);
}
thread1.join();
thread2.join();
ret->Tx = ret->myTx;
ret->T1 = ret->myT1;
ret->T2 = ret->myT2;
ret->I1 = ret->myI1;
ret->I2 = ret->myI2;
#ifndef NDEBUG
// sanity check
for (size_t i = first_new_snt; i < ret->T1->size(); ++i)
{
size_t slen1 = ret->T1->sntLen(i);
size_t slen2 = ret->T2->sntLen(i);
char const* p = ret->Tx->sntStart(i);
char const* q = ret->Tx->sntEnd(i);
size_t k;
while (p < q)
{
p = binread(p,k);
assert(p);
assert(p < q);
assert(k < slen1);
p = binread(p,k);
assert(p);
assert(k < slen2);
}
}
#endif
return ret;
}

View File

@ -29,10 +29,12 @@
#include "moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h"
#include "moses/TranslationModel/UG/generic/sampling/Sampling.h"
#include "moses/TranslationModel/UG/generic/file_io/ug_stream.h"
#include "moses/TranslationModel/UG/generic/threading/ug_thread_safe_counter.h"
#include "moses/Util.h"
#include "moses/StaticData.h"
#include "headers-base/util/exception.hh"
#include "headers-base/util/check.hh"
#include "util/exception.hh"
// #include "util/check.hh"
#include "ug_typedefs.h"
#include "ug_mm_ttrack.h"
@ -44,10 +46,12 @@
#include "tpt_pickler.h"
#include "ug_lexical_phrase_scorer2.h"
#define PSTATS_CACHE_THRESHOLD 50
using namespace ugdiss;
using namespace std;
namespace Moses {
class Mmsapt;
namespace bitext
{
using namespace ugdiss;
@ -122,6 +126,7 @@ namespace Moses {
struct
pstats
{
static ThreadSafeCounter active;
boost::mutex lock; // for parallel gathering of stats
boost::condition_variable ready; // consumers can wait for this data structure to be ready.
@ -133,8 +138,11 @@ namespace Moses {
uint32_t ofwd[po_other+1], obwd[po_other+1];
typename boost::unordered_map<uint64_t, jstats> trg;
pstats();
// typedef typename boost::unordered_map<uint64_t, jstats> trg_map_t;
typedef typename std::map<uint64_t, jstats> trg_map_t;
trg_map_t trg;
pstats();
~pstats();
void release();
void register_worker();
size_t count_workers() { return in_progress; }
@ -192,8 +200,8 @@ namespace Moses {
int index;
int num_feats;
public:
virtual
virtual
void
operator()(Bitext<Token> const& pt, PhrasePair& pp, vector<float> * dest)
const = 0;
@ -212,6 +220,7 @@ namespace Moses {
PScorePfwd : public PhraseScorer<Token>
{
float conf;
int denom;
public:
PScorePfwd()
{
@ -219,9 +228,10 @@ namespace Moses {
}
int
init(int const i, float const c)
init(int const i, float const c, int d=0)
{
conf = c;
conf = c;
denom = d;
this->index = i;
return i + this->num_feats;
}
@ -234,10 +244,20 @@ namespace Moses {
if (!dest) dest = &pp.fvals;
if (pp.joint > pp.good1)
{
cerr << bt.toString(pp.p1,0) << " ::: " << bt.toString(pp.p2,1) << endl;
cerr << pp.joint << "/" << pp.good1 << "/" << pp.raw2 << endl;
cerr<<bt.toString(pp.p1,0)<<" ::: "<<bt.toString(pp.p2,1)<<endl;
cerr<<pp.joint<<"/"<<pp.good1<<"/"<<pp.raw2<<endl;
}
switch (denom)
{
case 0:
(*dest)[this->index] = log(lbop(pp.good1, pp.joint, conf));
break;
case 1:
(*dest)[this->index] = log(lbop(pp.sample1, pp.joint, conf));
break;
case 2:
(*dest)[this->index] = log(lbop(pp.raw1, pp.joint, conf));
}
(*dest)[this->index] = log(lbop(pp.good1, pp.joint, conf));
}
};
@ -294,6 +314,7 @@ namespace Moses {
parse_pid(pp.p2, sid2, off2, len2);
#if 0
cout << len1 << " " << len2 << endl;
Token const* t1 = bt.T1->sntStart(sid1);
for (size_t i = off1; i < off1 + len1; ++i)
cout << (*bt.V1)[t1[i].id()] << " ";
@ -307,6 +328,7 @@ namespace Moses {
BOOST_FOREACH (int a, pp.aln)
cout << a << " " ;
cout << __FILE__ << ":" << __LINE__ << "\n" << endl;
#endif
scorer.score(bt.T1->sntStart(sid1)+off1,0,len1,
bt.T2->sntStart(sid2)+off2,0,len2,
@ -371,8 +393,10 @@ namespace Moses {
template<typename TKN>
class Bitext
{
friend class Moses::Mmsapt;
protected:
mutable boost::mutex lock;
mutable boost::mutex cache_lock;
public:
typedef TKN Token;
typedef typename TSA<Token>::tree_iterator iter;
@ -409,14 +433,22 @@ namespace Moses {
bitvector* full_alignment,
bool const flip) const;
mutable boost::unordered_map<uint64_t,sptr<pstats> > cache1,cache2;
#if 1
typedef boost::unordered_map<uint64_t,sptr<pstats> > pcache_t;
#else
typedef map<uint64_t,sptr<pstats> > pcache_t;
#endif
mutable pcache_t cache1,cache2;
protected:
size_t default_sample_size;
size_t num_workers;
size_t m_pstats_cache_threshold;
private:
sptr<pstats>
prep2(iter const& phrase, size_t const max_sample) const;
public:
Bitext(size_t const max_sample=5000);
Bitext(size_t const max_sample =1000,
size_t const xnum_workers =16);
Bitext(Ttrack<Token>* const t1,
Ttrack<Token>* const t2,
@ -425,7 +457,8 @@ namespace Moses {
TokenIndex* const v2,
TSA<Token>* const i1,
TSA<Token>* const i2,
size_t const max_sample=5000);
size_t const max_sample=1000,
size_t const xnum_workers=16);
virtual void open(string const base, string const L1, string const L2) = 0;
@ -433,10 +466,13 @@ namespace Moses {
sptr<pstats> lookup(iter const& phrase) const;
sptr<pstats> lookup(iter const& phrase, size_t const max_sample) const;
void prep(iter const& phrase) const;
void setDefaultSampleSize(size_t const max_samples);
void setDefaultSampleSize(size_t const max_samples);
size_t getDefaultSampleSize() const;
string toString(uint64_t pid, int isL2) const;
virtual size_t revision() const { return 0; }
};
template<typename Token>
@ -471,6 +507,7 @@ namespace Moses {
Bitext<Token>::
setDefaultSampleSize(size_t const max_samples)
{
boost::lock_guard<boost::mutex> guard(this->lock);
if (max_samples != default_sample_size)
{
cache1.clear();
@ -481,8 +518,10 @@ namespace Moses {
template<typename Token>
Bitext<Token>::
Bitext(size_t const max_sample)
Bitext(size_t const max_sample, size_t const xnum_workers)
: default_sample_size(max_sample)
, num_workers(xnum_workers)
, m_pstats_cache_threshold(PSTATS_CACHE_THRESHOLD)
{ }
template<typename Token>
@ -494,9 +533,12 @@ namespace Moses {
TokenIndex* const v2,
TSA<Token>* const i1,
TSA<Token>* const i2,
size_t const max_sample)
size_t const max_sample,
size_t const xnum_workers)
: Tx(tx), T1(t1), T2(t2), V1(v1), V2(v2), I1(i1), I2(i2)
, default_sample_size(max_sample)
, num_workers(xnum_workers)
, m_pstats_cache_threshold(PSTATS_CACHE_THRESHOLD)
{ }
// agenda is a pool of jobs
@ -508,6 +550,7 @@ namespace Moses {
boost::mutex lock;
class job
{
static ThreadSafeCounter active;
boost::mutex lock;
friend class agenda;
public:
@ -525,8 +568,9 @@ namespace Moses {
bool done() const;
job(typename TSA<Token>::tree_iterator const& m,
sptr<TSA<Token> > const& r, size_t maxsmpl, bool isfwd);
~job();
};
public:
class
worker
{
@ -535,7 +579,7 @@ namespace Moses {
worker(agenda& a) : ag(a) {}
void operator()();
};
private:
list<sptr<job> > joblist;
vector<sptr<boost::thread> > workers;
bool shutdown;
@ -639,7 +683,7 @@ namespace Moses {
while (j->step(sid,offset))
{
aln.clear();
int po_fwd=5,po_bwd=5;
int po_fwd=po_other,po_bwd=po_other;
if (j->fwd)
{
if (!ag.bt.find_trg_phr_bounds
@ -669,20 +713,25 @@ namespace Moses {
// assert(b);
for (size_t i = e1; i <= e2; ++i)
{
if (!j->stats->add(b->getPid(),sample_weight,aln,
b->approxOccurrenceCount(),
po_fwd,po_bwd))
if (! j->stats->add(b->getPid(),sample_weight,aln,
b->approxOccurrenceCount(),
po_fwd,po_bwd))
{
cerr << "FATAL ERROR AT " << __FILE__
<< ":" << __LINE__ << endl;
assert(0);
ostringstream msg;
for (size_t z = 0; z < j->len; ++z)
{
id_type tid = ag.bt.T1->sntStart(sid)[offset+z].id();
cout << (*ag.bt.V1)[tid] << " ";
cerr << (*ag.bt.V1)[tid] << " ";
}
cout << endl;
cerr << endl;
for (size_t z = s; z <= i; ++z)
cout << (*ag.bt.V2)[(o+z)->id()] << " ";
cout << endl;
exit(1);
cerr << (*ag.bt.V2)[(o+z)->id()] << " ";
cerr << endl;
assert(0);
UTIL_THROW(util::Exception,"Error in sampling.");
}
if (i < e2)
{
@ -705,6 +754,16 @@ namespace Moses {
}
}
template<typename Token>
Bitext<Token>::
agenda::
job::
~job()
{
if (stats) stats.reset();
--active;
}
template<typename Token>
Bitext<Token>::
agenda::
@ -722,6 +781,9 @@ namespace Moses {
{
stats.reset(new pstats());
stats->raw_cnt = m.approxOccurrenceCount();
// if (++active%5 == 0)
++active;
// cerr << size_t(active) << " active jobs at " << __FILE__ << ":" << __LINE__ << endl;
}
template<typename Token>
@ -731,12 +793,12 @@ namespace Moses {
add_job(typename TSA<Token>::tree_iterator const& phrase,
size_t const max_samples)
{
boost::unique_lock<boost::mutex> lk(this->lock);
static boost::posix_time::time_duration nodelay(0,0,0,0);
bool fwd = phrase.root == bt.I1.get();
sptr<job> j(new job(phrase, fwd ? bt.I2 : bt.I1, max_samples, fwd));
sptr<job> j(new job(phrase, fwd ? bt.I1 : bt.I2, max_samples, fwd));
j->stats->register_worker();
boost::unique_lock<boost::mutex> lk(this->lock);
joblist.push_back(j);
if (joblist.size() == 1)
{
@ -770,7 +832,6 @@ namespace Moses {
// cerr << workers.size() << " workers on record" << endl;
sptr<job> ret;
if (this->shutdown) return ret;
// add_workers(0);
boost::unique_lock<boost::mutex> lock(this->lock);
if (this->doomed)
{
@ -840,7 +901,8 @@ namespace Moses {
i2.open(base+L2+".sfa", this->T2);
assert(this->T1->size() == this->T2->size());
}
template<typename TKN>
class imBitext : public Bitext<TKN>
{
@ -849,7 +911,9 @@ namespace Moses {
sptr<imTtrack<TKN> > myT2;
sptr<imTSA<TKN> > myI1;
sptr<imTSA<TKN> > myI2;
static ThreadSafeCounter my_revision;
public:
size_t revision() const { return my_revision; }
void open(string const base, string const L1, string L2);
imBitext(sptr<TokenIndex> const& V1,
sptr<TokenIndex> const& V2,
@ -867,6 +931,10 @@ namespace Moses {
};
template<typename TKN>
ThreadSafeCounter
imBitext<TKN>::my_revision;
template<typename TKN>
imBitext<TKN>::
imBitext(size_t max_sample)
@ -876,6 +944,7 @@ namespace Moses {
this->V2.reset(new TokenIndex());
this->V1->setDynamic(true);
this->V2->setDynamic(true);
++my_revision;
}
template<typename TKN>
@ -889,6 +958,7 @@ namespace Moses {
this->V2 = v2;
this->V1->setDynamic(true);
this->V2->setDynamic(true);
++my_revision;
}
@ -909,6 +979,8 @@ namespace Moses {
this->V1 = other.V1;
this->V2 = other.V2;
this->default_sample_size = other.default_sample_size;
this->num_workers = other.num_workers;
++my_revision;
}
template<typename TKN> class snt_adder;
@ -1050,7 +1122,6 @@ namespace Moses {
t1.open(base+L1+".mct");
t2.open(base+L2+".mct");
tx.open(base+L1+"-"+L2+".mam");
cerr << "DADA" << endl;
this->V1->open(base+L1+".tdx"); this->V1->iniReverseIndex();
this->V2->open(base+L2+".tdx"); this->V2->iniReverseIndex();
mmTSA<TKN>& i1 = *reinterpret_cast<mmTSA<TKN>*>(this->I1.get());
@ -1063,19 +1134,27 @@ namespace Moses {
template<typename Token>
bool
Bitext<Token>::
find_trg_phr_bounds(size_t const sid, size_t const start, size_t const stop,
size_t & s1, size_t & s2, size_t & e1, size_t & e2,
int & po_fwd, int & po_bwd,
vector<uchar>* core_alignment,
bitvector* full_alignment,
bool const flip) const
find_trg_phr_bounds
(size_t const sid,
size_t const start, size_t const stop,
size_t & s1, size_t & s2, size_t & e1, size_t & e2,
int & po_fwd, int & po_bwd,
vector<uchar>* core_alignment, bitvector* full_alignment,
bool const flip) const
{
// if (core_alignment) cout << "HAVE CORE ALIGNMENT" << endl;
// a word on the core_alignment:
// since fringe words ([s1,...,s2),[e1,..,e2) if s1 < s2, or e1 < e2, respectively)
// are be definition unaligned, we store only the core alignment in *core_alignment
// it is up to the calling function to shift alignment points over for start positions
// of extracted phrases that start with a fringe word
//
// since fringe words ([s1,...,s2),[e1,..,e2) if s1 < s2, or e1
// < e2, respectively) are be definition unaligned, we store
// only the core alignment in *core_alignment it is up to the
// calling function to shift alignment points over for start
// positions of extracted phrases that start with a fringe word
assert(T1);
assert(T2);
assert(Tx);
bitvector forbidden((flip ? T1 : T2)->sntLen(sid));
size_t slen1 = (*T1).sntLen(sid);
size_t slen2 = (*T2).sntLen(sid);
@ -1092,12 +1171,22 @@ namespace Moses {
char const* p = Tx->sntStart(sid);
char const* x = Tx->sntEnd(sid);
// cerr << "flip = " << flip << " " << __FILE__ << ":" << __LINE__ << endl;
while (p < x)
{
if (flip) { p = binread(p,trg); assert(p<x); p = binread(p,src); }
else { p = binread(p,src); assert(p<x); p = binread(p,trg); }
// cerr << sid << " " << src << "/" << slen1 << " " << trg << "/"
// << slen2 << endl;
if (src >= slen1 || trg >= slen2)
{
ostringstream buf;
buf << "Alignment range error at sentence " << sid << "!" << endl
<< src << "/" << slen1 << " " << trg << "/" << slen2 << endl;
cerr << buf.str() << endl;
UTIL_THROW(util::Exception, buf.str().c_str());
}
if (src < start || src >= stop)
forbidden.set(trg);
else
@ -1214,29 +1303,44 @@ namespace Moses {
Bitext<Token>::
prep2(iter const& phrase, size_t const max_sample) const
{
// boost::lock_guard<boost::mutex>(this->lock);
boost::lock_guard<boost::mutex> guard(this->lock);
if (!ag)
{
ag.reset(new agenda(*this));
// ag->add_workers(1);
ag->add_workers(20);
if (this->num_workers > 1)
ag->add_workers(this->num_workers);
}
typedef boost::unordered_map<uint64_t,sptr<pstats> > pcache_t;
sptr<pstats> ret;
if (max_sample == this->default_sample_size)
#if 1
// use pcache only for plain sentence input
if (StaticData::Instance().GetInputType() == SentenceInput &&
max_sample == this->default_sample_size &&
phrase.approxOccurrenceCount() > m_pstats_cache_threshold)
{
uint64_t pid = phrase.getPid();
pcache_t & cache(phrase.root == &(*this->I1) ? cache1 : cache2);
pcache_t::value_type entry(pid,sptr<pstats>());
// need to test what a good caching threshold is
// is caching here the cause of the apparent memory leak in
// confusion network decoding ????
uint64_t pid = phrase.getPid();
pcache_t & cache(phrase.root == &(*this->I1) ? cache1 : cache2);
pcache_t::value_type entry(pid,sptr<pstats>());
pair<pcache_t::iterator,bool> foo;
{
// boost::lock_guard<boost::mutex>(this->lock);
foo = cache.emplace(entry);
}
if (foo.second) foo.first->second = ag->add_job(phrase, max_sample);
foo = cache.insert(entry);
if (foo.second)
{
// cerr << "NEW FREQUENT PHRASE: "
// << phrase.str(V1.get()) << " " << phrase.approxOccurrenceCount()
// << " at " << __FILE__ << ":" << __LINE__ << endl;
foo.first->second = ag->add_job(phrase, max_sample);
assert(foo.first->second);
}
assert(foo.first->second);
ret = foo.first->second;
}
else ret = ag->add_job(phrase, max_sample);
assert(ret);
}
else
#endif
ret = ag->add_job(phrase, max_sample);
assert(ret);
return ret;
}
@ -1245,13 +1349,17 @@ namespace Moses {
Bitext<Token>::
lookup(iter const& phrase) const
{
boost::lock_guard<boost::mutex>(this->lock);
sptr<pstats> ret;
ret = prep2(phrase, this->default_sample_size);
sptr<pstats> ret = prep2(phrase, this->default_sample_size);
assert(ret);
boost::unique_lock<boost::mutex> lock(ret->lock);
while (ret->in_progress)
ret->ready.wait(lock);
boost::lock_guard<boost::mutex> guard(this->lock);
if (this->num_workers <= 1)
typename agenda::worker(*this->ag)();
else
{
boost::unique_lock<boost::mutex> lock(ret->lock);
while (ret->in_progress)
ret->ready.wait(lock);
}
return ret;
}
@ -1260,11 +1368,16 @@ namespace Moses {
Bitext<Token>::
lookup(iter const& phrase, size_t const max_sample) const
{
boost::lock_guard<boost::mutex>(this->lock);
sptr<pstats> ret = prep2(phrase, max_sample);
boost::unique_lock<boost::mutex> lock(ret->lock);
while (ret->in_progress)
ret->ready.wait(lock);
boost::lock_guard<boost::mutex> guard(this->lock);
if (this->num_workers <= 1)
typename agenda::worker(*this->ag)();
else
{
boost::unique_lock<boost::mutex> lock(ret->lock);
while (ret->in_progress)
ret->ready.wait(lock);
}
return ret;
}
@ -1297,6 +1410,12 @@ namespace Moses {
return (max_samples && stats->good >= max_samples) || next == stop;
}
template<typename TKN>
ThreadSafeCounter
Bitext<TKN>::
agenda::
job::active;
} // end of namespace bitext
} // end of namespace moses
#endif

View File

@ -151,6 +151,7 @@ namespace ugdiss
filter2.set();
filter = &filter2;
}
assert(filter);
// In the first iteration over the corpus, we obtain word counts.
// They allows us to
// a. allocate the exact amount of memory we need
@ -235,9 +236,10 @@ namespace ugdiss
imTSA<TOKEN>::
getLowerBound(id_type id) const
{
if (id >= this->index.size())
if (id >= this->index.size())
return NULL;
return reinterpret_cast<char const*>(&(this->sufa[index[id]]));
assert(index[id] <= this->sufa.size());
return reinterpret_cast<char const*>(&(this->sufa.front()) + index[id]);
}
template<typename TOKEN>
@ -245,9 +247,10 @@ namespace ugdiss
imTSA<TOKEN>::
getUpperBound(id_type id) const
{
if (id+1 >= this->index.size())
if (++id >= this->index.size())
return NULL;
return reinterpret_cast<char const*>(&(this->sufa[index[id+1]]));
assert(index[id] <= this->sufa.size());
return reinterpret_cast<char const*>(&(this->sufa.front()) + index[id]);
}
template<typename TOKEN>
@ -255,6 +258,8 @@ namespace ugdiss
imTSA<TOKEN>::
readSid(char const* p, char const* q, id_type& sid) const
{
assert(reinterpret_cast<cpos const*>(p) >= &(this->sufa.front()));
assert(reinterpret_cast<cpos const*>(p) <= &(this->sufa.back()));
sid = reinterpret_cast<cpos const*>(p)->sid;
return p;
}
@ -264,6 +269,8 @@ namespace ugdiss
imTSA<TOKEN>::
readSid(char const* p, char const* q, uint64_t& sid) const
{
assert(reinterpret_cast<cpos const*>(p) >= &(this->sufa.front()));
assert(reinterpret_cast<cpos const*>(p) <= &(this->sufa.back()));
sid = reinterpret_cast<cpos const*>(p)->sid;
return p;
}
@ -273,6 +280,8 @@ namespace ugdiss
imTSA<TOKEN>::
readOffset(char const* p, char const* q, uint16_t& offset) const
{
assert(reinterpret_cast<cpos const*>(p) >= &(this->sufa.front()));
assert(reinterpret_cast<cpos const*>(p) <= &(this->sufa.back()));
offset = reinterpret_cast<cpos const*>(p)->offset;
return p+sizeof(cpos);
}
@ -282,6 +291,8 @@ namespace ugdiss
imTSA<TOKEN>::
readOffset(char const* p, char const* q, uint64_t& offset) const
{
assert(reinterpret_cast<cpos const*>(p) >= &(this->sufa.front()));
assert(reinterpret_cast<cpos const*>(p) <= &(this->sufa.back()));
offset = reinterpret_cast<cpos const*>(p)->offset;
return p+sizeof(cpos);
}
@ -363,6 +374,7 @@ namespace ugdiss
size_t n = 0;
BOOST_FOREACH(id_type sid, newsids)
{
assert(sid < crp->size());
for (size_t o = 0; o < (*crp)[sid].size(); ++o, ++n)
{ nidx[n].offset = o; nidx[n].sid = sid; }
}
@ -379,20 +391,22 @@ namespace ugdiss
size_t i = 0;
typename vector<cpos>::iterator k = this->sufa.begin();
this->index[0] = 0;
// cerr << newToks << " new items at "
// << __FILE__ << ":" << __LINE__ << endl;
for (size_t n = 0; n < nidx.size();)
{
id_type nid = crp->getToken(nidx[n])->id();
assert(nid >= i);
while (i < nid)
{
this->index[i] = k - this->sufa.begin();
if (++i < prior.index.size() && prior.index[i-1] < prior.index[i])
{
k = copy(prior.sufa.begin() + prior.index[i-1],
prior.sufa.begin() + prior.index[i], k);
}
this->index[i] = k - prior.sufa.begin();
}
this->index[i] = k - this->sufa.begin();
if (++i < prior.index.size() && prior.index[i] > prior.index[i-1])
{
size_t j = prior.index[i-1];
@ -418,6 +432,7 @@ namespace ugdiss
}
this->index[i] = k - this->sufa.begin();
}
this->index[i] = k - this->sufa.begin();
while (++i < this->index.size())
{
if (i < prior.index.size() && prior.index[i-1] < prior.index[i])
@ -425,6 +440,25 @@ namespace ugdiss
prior.sufa.begin() + prior.index[i], k);
this->index[i] = k - this->sufa.begin();
}
#if 0
// sanity checks
assert(this->sufa.size() == this->index.back());
BOOST_FOREACH(cpos const& x, this->sufa)
{
assert(x.sid < this->corpusSize);
assert(x.offset < this->corpus->sntLen(x.sid));
}
for (size_t i = 1; i < index.size(); ++i)
{
assert(index[i-1] <= index[i]);
assert(index[i] <= sufa.size());
for (size_t k = index[i-1]; k < index[i]; ++k)
assert(this->corpus->getToken(sufa[k])->id() == i-1);
}
assert(index[0] == 0);
assert(this->startArray == reinterpret_cast<char const*>(&(*this->sufa.begin())));
assert(this->endArray == reinterpret_cast<char const*>(&(*this->sufa.end())));
#endif
}
}

View File

@ -145,9 +145,9 @@ namespace ugdiss
imTtrack(boost::shared_ptr<vector<vector<Token> > > const& d)
{
myData = d;
numTokens = 0;
BOOST_FOREACH(vector<Token> const& v, d)
numTokens += v.size();
numToks = 0;
BOOST_FOREACH(vector<Token> const& v, *d)
numToks += v.size();
}
template<typename Token>

View File

@ -9,6 +9,7 @@
#include "tpt_typedefs.h"
#include "tpt_pickler.h"
#include "ug_typedefs.h"
#include "util/exception.hh"
namespace bio=boost::iostreams;
namespace ugdiss
{
@ -113,16 +114,21 @@ namespace ugdiss
// cout << "opening " << fname << " at " << __FILE__ << ":" << __LINE__ << endl;
if (access(fname.c_str(),R_OK))
{
cerr << "[" << __FILE__ << ":" << __LINE__ <<"] FATAL ERROR: "
<< "file '" << fname << " is not accessible." << endl;
exit(1);
ostringstream msg;
msg << "[" << __FILE__ << ":" << __LINE__ <<"] FATAL ERROR: "
<< "file '" << fname << " is not accessible." << endl;
string foo = msg.str();
UTIL_THROW(util::Exception,foo.c_str());
}
file.reset(new bio::mapped_file());
file->open(fname,ios::in|ios::out);
if (!file->is_open())
{
cerr << "Error opening file " << fname << endl;
assert(0);
ostringstream msg;
msg << "[" << __FILE__ << ":" << __LINE__ <<"] FATAL ERROR: "
<< "Opening file '" << fname << "' failed." << endl;
string foo = msg.str();
UTIL_THROW(util::Exception,foo.c_str());
}
char* p = file->data();
filepos_type offset = *reinterpret_cast<filepos_type*>(p);

View File

@ -59,6 +59,7 @@ namespace ugdiss
// TSA_tree_iterator(TSA_tree_iterator const& other);
TSA_tree_iterator(TSA<Token> const* s);
TSA_tree_iterator(TSA<Token> const* r, id_type const* s, size_t const len);
// TSA_tree_iterator(TSA<Token> const* s, Token const& t);
TSA_tree_iterator(TSA<Token> const* s,
Token const* kstart,
@ -312,6 +313,17 @@ namespace ugdiss
: root(s)
{};
template<typename Token>
TSA_tree_iterator<Token>::
TSA_tree_iterator
(TSA<Token> const* r,
id_type const* s,
size_t const len)
: root(r)
{
for (id_type const* e = s + len; s < e && extend(*s); ++s);
};
// ---------------------------------------------------------------------------
#if 0

View File

@ -1,12 +1,26 @@
#include "mmsapt.h"
#include <boost/foreach.hpp>
#include <boost/tokenizer.hpp>
#include <algorithm>
namespace Moses
{
using namespace bitext;
using namespace std;
using namespace boost;
void
fillIdSeq(Phrase const& mophrase, size_t const ifactor,
TokenIndex const& V, vector<id_type>& dest)
{
dest.resize(mophrase.GetSize());
for (size_t i = 0; i < mophrase.GetSize(); ++i)
{
Factor const* f = mophrase.GetFactor(i,ifactor);
dest[i] = V[f->ToString()];
}
}
void
parseLine(string const& line, map<string,string> & params)
@ -23,6 +37,7 @@ namespace Moses
params[t.substr(i,j)] = t.substr(k);
}
}
#if 0
Mmsapt::
Mmsapt(string const& description, string const& line)
@ -35,7 +50,7 @@ namespace Moses
Mmsapt::
Mmsapt(string const& line)
// : PhraseDictionary("Mmsapt",line), ofactor(1,0)
: PhraseDictionary(line), ofactor(1,0)
: PhraseDictionary(line), ofactor(1,0), m_tpc_ctr(0)
{
this->init(line);
}
@ -53,36 +68,88 @@ namespace Moses
assert(L1.size());
assert(L2.size());
map<string,string>::const_iterator m;
m = param.find("pfwd_denom");
m_pfwd_denom = m != param.end() ? m->second[0] : 's';
m = param.find("smooth");
lbop_parameter = m != param.end() ? atof(m->second.c_str()) : .05;
m_lbop_parameter = m != param.end() ? atof(m->second.c_str()) : .05;
m = param.find("max-samples");
default_sample_size = m != param.end() ? atoi(m->second.c_str()) : 1000;
m_default_sample_size = m != param.end() ? atoi(m->second.c_str()) : 1000;
m = param.find("workers");
m_workers = m != param.end() ? atoi(m->second.c_str()) : 8;
m_workers = min(m_workers,24UL);
m = param.find("cache-size");
m_history.reserve(m != param.end()
? max(1000,atoi(m->second.c_str()))
: 10000);
this->m_numScoreComponents = atoi(param["num-features"].c_str());
// num_features = 0;
m = param.find("ifactor");
input_factor = m != param.end() ? atoi(m->second.c_str()) : 0;
poolCounts = true;
m = param.find("extra");
if (m != param.end())
{
extra_data = m->second;
// cerr << "have extra data" << endl;
}
// keeps track of the most frequently used target phrase collections
// (to keep them cached even when not actively in use)
}
void
Mmsapt::
load_extra_data(string bname)
{
// TO DO: ADD CHECKS FOR ROBUSTNESS
// - file existence?
// - same number of lines?
// - sane word alignment?
vector<string> text1,text2,symal;
string line;
filtering_istream in1,in2,ina;
open_input_stream(bname+L1+".txt.gz",in1);
open_input_stream(bname+L2+".txt.gz",in2);
open_input_stream(bname+L1+"-"+L2+".symal.gz",ina);
while(getline(in1,line)) text1.push_back(line);
while(getline(in2,line)) text2.push_back(line);
while(getline(ina,line)) symal.push_back(line);
lock_guard<mutex> guard(this->lock);
btdyn = btdyn->add(text1,text2,symal);
assert(btdyn);
// cerr << "Loaded " << btdyn->T1->size() << " sentence pairs" << endl;
}
void
Mmsapt::
Load()
{
btfix.num_workers = this->m_workers;
btfix.open(bname, L1, L2);
btfix.setDefaultSampleSize(m_default_sample_size);
size_t num_feats;
// TO DO: should we use different lbop parameters
// for the relative-frequency based features?
num_feats = calc_pfwd_fix.init(0,lbop_parameter);
num_feats = calc_pbwd_fix.init(num_feats,lbop_parameter);
num_feats = calc_pfwd_fix.init(0,m_lbop_parameter);
num_feats = calc_pbwd_fix.init(num_feats,m_lbop_parameter);
num_feats = calc_lex.init(num_feats, bname + L1 + "-" + L2 + ".lex");
num_feats = apply_pp.init(num_feats);
if (num_feats < this->m_numScoreComponents)
{
poolCounts = false;
num_feats = calc_pfwd_dyn.init(num_feats,lbop_parameter);
num_feats = calc_pbwd_dyn.init(num_feats,lbop_parameter);
num_feats = calc_pfwd_dyn.init(num_feats,m_lbop_parameter);
num_feats = calc_pbwd_dyn.init(num_feats,m_lbop_parameter);
}
btdyn.reset(new imBitext<Token>(btfix.V1, btfix.V2));
if (num_feats != this->m_numScoreComponents)
{
ostringstream buf;
@ -94,6 +161,11 @@ namespace Moses
// cerr << "MMSAPT provides " << num_feats << " features at "
// << __FILE__ << ":" << __LINE__ << endl;
btdyn.reset(new imBitext<Token>(btfix.V1, btfix.V2,m_default_sample_size));
btdyn->num_workers = this->m_workers;
if (extra_data.size()) load_extra_data(extra_data);
// currently not used
LexicalPhraseScorer2<Token>::table_t & COOC = calc_lex.scorer.COOC;
typedef LexicalPhraseScorer2<Token>::table_t::Cell cell_t;
wlex21.resize(COOC.numCols);
@ -128,7 +200,9 @@ namespace Moses
Token const* x = bt.T2->sntStart(sid) + off;
for (uint32_t k = 0; k < len; ++k)
{
// cerr << (*bt.V2)[x[k].id()] << " at " << __FILE__ << ":" << __LINE__ << endl;
StringPiece wrd = (*bt.V2)[x[k].id()];
assert(off+len <= bt.T2->sntLen(sid));
w.CreateFromString(Output,ofactor,wrd,false);
tp->AddWord(w);
}
@ -151,7 +225,7 @@ namespace Moses
PhrasePair pp;
pp.init(pid1, stats, this->m_numScoreComponents);
apply_pp(bt,pp);
boost::unordered_map<uint64_t,jstats>::const_iterator t;
pstats::trg_map_t::const_iterator t;
for (t = stats.trg.begin(); t != stats.trg.end(); ++t)
{
pp.update(t->first,t->second);
@ -178,14 +252,14 @@ namespace Moses
if (statsa && statsb)
pp.init(pid1b, *statsa, *statsb, this->m_numScoreComponents);
else if (statsa)
pp.init(pid1b, *statsa, this->m_numScoreComponents);
pp.init(pid1a, *statsa, this->m_numScoreComponents);
else if (statsb)
pp.init(pid1b, *statsb, this->m_numScoreComponents);
else return false; // throw "no stats for pooling available!";
apply_pp(bta,pp);
boost::unordered_map<uint64_t,jstats>::const_iterator b;
boost::unordered_map<uint64_t,jstats>::iterator a;
pstats::trg_map_t::const_iterator b;
pstats::trg_map_t::iterator a;
if (statsb)
{
for (b = statsb->trg.begin(); b != statsb->trg.end(); ++b)
@ -222,7 +296,7 @@ namespace Moses
parse_pid(a->first, sid, off, len);
if (btb.T2)
{
Token const* x = btb.T2->sntStart(sid) + off;
Token const* x = bta.T2->sntStart(sid) + off;
TSA<Token>::tree_iterator m(btb.I2.get(), x, x+len);
if (m.size() == len)
pp.update(a->first,m.approxOccurrenceCount(),a->second);
@ -258,8 +332,8 @@ namespace Moses
Word w;
if (statsa) ppfix.init(pid1a,*statsa,this->m_numScoreComponents);
if (statsb) ppdyn.init(pid1b,*statsb,this->m_numScoreComponents);
boost::unordered_map<uint64_t,jstats>::const_iterator b;
boost::unordered_map<uint64_t,jstats>::iterator a;
pstats::trg_map_t::const_iterator b;
pstats::trg_map_t::iterator a;
if (statsb)
{
pool.init(pid1b,*statsb,0);
@ -411,13 +485,35 @@ namespace Moses
// }
// }
Mmsapt::
TargetPhraseCollectionWrapper::
TargetPhraseCollectionWrapper(size_t r, uint64_t k)
: revision(r), key(k), refCount(0), idx(-1)
{ }
Mmsapt::
TargetPhraseCollectionWrapper::
~TargetPhraseCollectionWrapper()
{
assert(this->refCount == 0);
}
// This is not the most efficient way of phrase lookup!
TargetPhraseCollection const*
Mmsapt::
GetTargetPhraseCollectionLEGACY(const Phrase& src) const
{
TargetPhraseCollection* ret = new TargetPhraseCollection();
// map from Moses Phrase to internal id sequence
vector<id_type> sphrase;
fillIdSeq(src,input_factor,*btfix.V1,sphrase);
if (sphrase.size() == 0) return NULL;
// lookup in static bitext
TSA<Token>::tree_iterator mfix(btfix.I1.get(),&sphrase[0],sphrase.size());
// lookup in dynamic bitext
// Reserve a local copy of the dynamic bitext in its current form. /btdyn/
// is set to a new copy of the dynamic bitext every time a sentence pair
// is added. /dyn/ keeps the old bitext around as long as we need it.
@ -426,57 +522,77 @@ namespace Moses
boost::lock_guard<boost::mutex> guard(this->lock);
dyn = btdyn;
}
vector<id_type> sphrase(src.GetSize());
for (size_t i = 0; i < src.GetSize(); ++i)
{
Factor const* f = src.GetFactor(i,input_factor);
id_type wid = (*btfix.V1)[f->ToString()];
sphrase[i] = wid;
}
TSA<Token>::tree_iterator mfix(btfix.I1.get()), mdyn(dyn->I1.get());
for (size_t i = 0; mfix.size() == i && i < sphrase.size(); ++i)
mfix.extend(sphrase[i]);
assert(dyn);
TSA<Token>::tree_iterator mdyn(dyn->I1.get());
if (dyn->I1.get())
{
for (size_t i = 0; mdyn.size() == i && i < sphrase.size(); ++i)
mdyn.extend(sphrase[i]);
}
// phrase not found in either
if (mdyn.size() != sphrase.size() &&
mfix.size() != sphrase.size())
return NULL; // not found
// cache lookup:
uint64_t phrasekey;
if (mfix.size() == sphrase.size())
phrasekey = (mfix.getPid()<<1);
else
phrasekey = (mdyn.getPid()<<1)+1;
size_t revision = dyn->revision();
{
boost::lock_guard<boost::mutex> guard(this->lock);
tpc_cache_t::iterator c = m_cache.find(phrasekey);
if (c != m_cache.end() && c->second->revision == revision)
return encache(c->second);
}
// not found or not up to date
sptr<pstats> sfix,sdyn;
if (mfix.size() == sphrase.size())
{
// do we need this lock here?
// Is it used here to control the total number of running threads???
boost::lock_guard<boost::mutex> guard(this->lock);
sfix = btfix.lookup(mfix);
}
sfix = btfix.lookup(mfix);
if (mdyn.size() == sphrase.size())
sdyn = dyn->lookup(mdyn);
if (poolCounts)
TargetPhraseCollectionWrapper*
ret = new TargetPhraseCollectionWrapper(revision,phrasekey);
if ((poolCounts &&
pool_pstats(src, mfix.getPid(),sfix.get(),btfix,
mdyn.getPid(),sdyn.get(),*dyn,ret))
|| combine_pstats(src, mfix.getPid(),sfix.get(),btfix,
mdyn.getPid(),sdyn.get(),*dyn,ret))
{
if (!pool_pstats(src, mfix.getPid(),sfix.get(),btfix,
mdyn.getPid(),sdyn.get(),*dyn,ret))
return NULL;
}
else if (!combine_pstats(src, mfix.getPid(),sfix.get(),btfix,
mdyn.getPid(),sdyn.get(),*dyn,ret))
return NULL;
ret->NthElement(m_tableLimit);
ret->NthElement(m_tableLimit);
#if 0
sort(ret->begin(), ret->end(), CompareTargetPhrase());
cout << "SOURCE PHRASE: " << src << endl;
size_t i = 0;
for (TargetPhraseCollection::iterator r = ret->begin(); r != ret->end(); ++r)
{
cout << ++i << " " << **r << endl;
}
sort(ret->begin(), ret->end(), CompareTargetPhrase());
cout << "SOURCE PHRASE: " << src << endl;
size_t i = 0;
for (TargetPhraseCollection::iterator r = ret->begin(); r != ret->end(); ++r)
{
cout << ++i << " " << **r << endl;
FVector fv = (*r)->GetScoreBreakdown().CreateFVector();
typedef pair<Moses::FName,float> item_t;
BOOST_FOREACH(item_t f, fv)
cout << f.first << ":" << f.second << " ";
cout << endl;
}
#endif
return ret;
}
boost::lock_guard<boost::mutex> guard(this->lock);
m_cache[phrasekey] = ret;
return encache(ret);
}
// Intentionally a no-op: this phrase table keeps no per-sentence state
// that needs explicit cleanup here.
void
Mmsapt::
CleanUpAfterSentenceProcessing(const InputType& source)
{ }
ChartRuleLookupManager*
Mmsapt::
CreateRuleLookupManager(const ChartParser &, const ChartCellCollectionBase &)
@ -484,13 +600,177 @@ namespace Moses
throw "CreateRuleLookupManager is currently not supported in Mmsapt!";
}
template<typename Token>
void
fill_token_seq(TokenIndex& V, string const& line, vector<Token>& dest)
ChartRuleLookupManager*
Mmsapt::
CreateRuleLookupManager(const ChartParser &, const ChartCellCollectionBase &,
size_t UnclearWhatThisVariableIsSupposedToAccomplishBecauseNobodyBotheredToDocumentItInPhraseTableDotHButIllTakeThisAsAnOpportunityToComplyWithTheMosesConventionOfRidiculouslyLongVariableAndClassNames)
{
istringstream buf(line); string w;
while (buf>>w) dest.push_back(Token(V[w]));
throw "CreateRuleLookupManager is currently not supported in Mmsapt!";
}
// Currently a no-op: no per-input setup is performed.
void
Mmsapt::
InitializeForInput(InputType const& source)
{
  // assert(0);
}
// Lexicographic ordering on (tv_sec, tv_nsec): /a/ sorts before /b/
// iff it denotes an earlier point in time.
bool operator<(timespec const& a, timespec const& b)
{
  if (a.tv_sec == b.tv_sec)
    return a.tv_nsec < b.tv_nsec;
  return a.tv_sec < b.tv_sec;
}
// Complement of operator<: /a/ is not earlier than /b/.
bool operator>=(timespec const& a, timespec const& b)
{
  if (a.tv_sec == b.tv_sec)
    return a.tv_nsec >= b.tv_nsec;
  return a.tv_sec > b.tv_sec;
}
// Restore the min-heap property (keyed on tstamp, oldest entry at the
// root) along the path from position /k/ towards the root, after v[k]
// was inserted or its key decreased. v[x]->idx mirrors each entry's
// position in /v/ and is kept in sync with every swap.
void
bubble_up(vector<Mmsapt::TargetPhraseCollectionWrapper*>& v, size_t k)
{
  if (k >= v.size()) return;
  // Children of node i live at 2*i+1 and 2*i+2 (cf. bubble_down and
  // Mmsapt::encache), so the parent of node k is (k-1)/2.
  // Bug fix: this used k/2 as the parent index, which is wrong for even
  // k (e.g. the parent of node 2 is 0, but k/2 yields 1) and could
  // leave the history heap in an inconsistent state.
  while (k)
  {
    size_t const parent = (k - 1) / 2;
    if (!(v[k]->tstamp < v[parent]->tstamp)) break;
    std::swap(v[k],v[parent]);
    std::swap(v[k]->idx,v[parent]->idx);
    k = parent;
  }
}
// Move the entry at position /k/ of the timestamp min-heap /v/ down
// towards the leaves until the heap property is restored. Children of
// node i are stored at 2*i+1 and 2*i+2; v[x]->idx mirrors each entry's
// position in /v/ and is updated on every swap.
void
bubble_down(vector<Mmsapt::TargetPhraseCollectionWrapper*>& v, size_t k)
{
  // /j/ starts at the right child (2*k+2); the first test below
  // switches to the left child (j-1) when the right child does not
  // exist (j == v.size()) or the left one has the older timestamp.
  for (size_t j = 2*(k+1); j <= v.size(); j = 2*((k=j)+1))
  {
    if (j == v.size() || (v[j-1]->tstamp < v[j]->tstamp)) --j;
    if (v[j]->tstamp >= v[k]->tstamp) break;
    std::swap(v[k],v[j]);
    v[k]->idx = k;
    v[j]->idx = j;
  }
}
// Destroy a cached phrase collection once it is neither referenced by
// any client (refCount == 0) nor tracked in the LRU history heap
// (idx >= 0 would mean it still is).
// NOTE(review): callers appear to rely on this->lock being held
// (encache/Release lock before calling) -- confirm before adding new
// call sites.
void
Mmsapt::
decache(TargetPhraseCollectionWrapper* ptr) const
{
  if (ptr->refCount || ptr->idx >= 0) return;
  // the clock readings below feed only the disabled diagnostic block
  timespec t; clock_gettime(CLOCK_MONOTONIC,&t);
  timespec r; clock_getres(CLOCK_MONOTONIC,&r);
  // if (t.tv_nsec < v[0]->tstamp.tv_nsec)
#if 0
  float delta = t.tv_sec - ptr->tstamp.tv_sec;
  cerr << "deleting old cache entry after "
       << delta << " seconds."
       << " clock resolution is " << r.tv_sec << ":" << r.tv_nsec
       << " at " << __FILE__ << ":" << __LINE__ << endl;
#endif
  // drop the cache entry only if it still maps to /ptr/ -- a newer
  // wrapper may have been cached under the same phrase key since
  tpc_cache_t::iterator m = m_cache.find(ptr->key);
  if (m != m_cache.end())
    if (m->second == ptr)
      m_cache.erase(m);
  delete ptr;
  --m_tpc_ctr;
}
// Register /ptr/ with the cache bookkeeping: bump its reference count,
// refresh its timestamp, and (re-)position it in the LRU history heap
// /m_history/, evicting the oldest unreferenced entry when the heap is
// full. Returns /ptr/ for caller convenience (may be NULL).
Mmsapt::
TargetPhraseCollectionWrapper*
Mmsapt::
encache(TargetPhraseCollectionWrapper* ptr) const
{
  // Calling process must lock for thread safety!!
  if (!ptr) return NULL;
  ++ptr->refCount;
  ++m_tpc_ctr;
  clock_gettime(CLOCK_MONOTONIC, &ptr->tstamp);
  // update history
  if (m_history.capacity() > 1)
  {
    vector<TargetPhraseCollectionWrapper*>& v = m_history;
    if (ptr->idx >= 0) // ptr is already in history
    {
      assert(ptr == v[ptr->idx]);
      // ptr's timestamp just increased, so it may need to move towards
      // the leaves; re-examine both children (at 2*idx+1 and 2*idx+2)
      size_t k = 2 * (ptr->idx + 1);
      if (k < v.size()) bubble_up(v,k--);
      if (k < v.size()) bubble_up(v,k);
    }
    else if (v.size() < v.capacity())
    {
      // room left in the heap: append and sift the new entry upwards
      size_t k = ptr->idx = v.size();
      v.push_back(ptr);
      bubble_up(v,k);
    }
    else
    {
      // heap full: evict the oldest entry (the root) and take its place
      v[0]->idx = -1;
      decache(v[0]);
      v[0] = ptr;
      bubble_down(v,0);
    }
  }
  return ptr;
}
// Return true iff /phrase/ occurs as a (phrase) prefix in the static
// or the dynamic bitext; used by the decoder to prune hopeless phrase
// expansions early.
bool
Mmsapt::
PrefixExists(Moses::Phrase const& phrase) const
{
  if (phrase.GetSize() == 0) return false;

  // map the Moses phrase onto the internal vocabulary
  vector<id_type> ids;
  fillIdSeq(phrase,input_factor,*btfix.V1,ids);

  // first try the static (memory-mapped) bitext
  TSA<Token>::tree_iterator fixedIter(btfix.I1.get(),&ids[0],ids.size());
  if (fixedIter.size() == ids.size())
    return true;

  // take a stable snapshot of the dynamic bitext; /btdyn/ is replaced
  // wholesale whenever a sentence pair is added
  sptr<imBitext<Token> > snapshot;
  { // braces are needed for scoping mutex lock guard!
    boost::lock_guard<boost::mutex> guard(this->lock);
    snapshot = btdyn;
  }
  assert(snapshot);

  TSA<Token>::tree_iterator dynIter(snapshot->I1.get());
  if (snapshot->I1.get())
  {
    size_t i = 0;
    while (i < ids.size() && dynIter.size() == i)
    {
      dynIter.extend(ids[i]);
      ++i;
    }
  }
  return dynIter.size() == ids.size();
}
// Drop one client reference to /tpc/; once nobody references the
// collection and it is no longer in the LRU history, destroy it.
void
Mmsapt::
Release(TargetPhraseCollection const* tpc) const
{
  if (tpc == NULL) return;
  boost::lock_guard<boost::mutex> guard(this->lock);
  // Collections handed out by this table are always
  // TargetPhraseCollectionWrapper instances, so the cast is safe here.
  TargetPhraseCollectionWrapper* w
  = reinterpret_cast<TargetPhraseCollectionWrapper*>
    (const_cast<TargetPhraseCollection*>(tpc));
  if (--w->refCount == 0 && w->idx < 0)
    decache(w);
}
// Advertise that this phrase table implements PrefixExists(), so the
// translation option collection can prune phrase expansions whose
// prefix cannot match any table entry.
bool
Mmsapt::
ProvidesPrefixCheck() const
{
  return true;
}
}

View File

@ -3,7 +3,9 @@
// Design and code by Ulrich Germann.
#pragma once
#include <time.h>
#include <boost/thread.hpp>
#include <boost/scoped_ptr.hpp>
#include "moses/TypeDef.h"
#include "moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h"
@ -52,11 +54,16 @@ namespace Moses
private:
mmbitext btfix;
sptr<imbitext> btdyn;
string bname;
string bname,extra_data;
string L1;
string L2;
float lbop_parameter;
size_t default_sample_size;
float m_lbop_parameter;
size_t m_default_sample_size;
size_t m_workers; // number of worker threads for sampling the bitexts
char m_pfwd_denom; // denominator for computation of fwd phrase score:
// 'r' - divide by raw count
// 's' - divide by sample count
// 'g' - devide by number of "good" (i.e. coherent) samples
// size_t num_features;
size_t input_factor;
size_t output_factor; // we can actually return entire Tokens!
@ -70,6 +77,33 @@ namespace Moses
bool poolCounts;
vector<FactorType> ofactor;
public:
// typedef boost::unordered_map<uint64_t, sptr<TargetPhraseCollection> > tpcoll_cache_t;
// TargetPhraseCollection augmented with the bookkeeping Mmsapt needs
// for caching and LRU eviction of phrase lookups.
class TargetPhraseCollectionWrapper
: public TargetPhraseCollection
{
public:
size_t const revision; // time stamp from dynamic bitext
uint64_t const key; // phrase key
uint32_t refCount; // reference count
timespec tstamp; // last use
int idx; // position in history heap (-1: not in history)
TargetPhraseCollectionWrapper(size_t r, uint64_t const k);
~TargetPhraseCollectionWrapper();
};
private:
TargetPhraseCollectionWrapper*
encache(TargetPhraseCollectionWrapper* const ptr) const;
void
decache(TargetPhraseCollectionWrapper* ptr) const;
typedef map<uint64_t, TargetPhraseCollectionWrapper*> tpc_cache_t;
mutable tpc_cache_t m_cache;
mutable vector<TargetPhraseCollectionWrapper*> m_history;
// phrase table feature weights for alignment:
vector<float> feature_weights;
@ -118,6 +152,10 @@ namespace Moses
TargetPhraseCollection* tpcoll
) const;
void
load_extra_data(string bname);
mutable size_t m_tpc_ctr;
public:
// Mmsapt(string const& description, string const& line);
Mmsapt(string const& line);
@ -130,6 +168,10 @@ namespace Moses
//! Create a sentence-specific manager for SCFG rule lookup.
ChartRuleLookupManager*
CreateRuleLookupManager(const ChartParser &, const ChartCellCollectionBase &);
ChartRuleLookupManager*
CreateRuleLookupManager
(const ChartParser &, const ChartCellCollectionBase &, std::size_t);
#endif
void add(string const& s1, string const& s2, string const& a);
@ -139,6 +181,23 @@ namespace Moses
align(string const& src, string const& trg) const;
void setWeights(vector<float> const& w);
void
CleanUpAfterSentenceProcessing(const InputType& source);
void
InitializeForInput(InputType const& source);
void
Release(TargetPhraseCollection const* tpc) const;
bool
ProvidesPrefixCheck() const;
/// return true if prefix /phrase/ exists
bool
PrefixExists(Phrase const& phrase) const;
private:
};
} // end namespace

View File

@ -98,7 +98,7 @@ namespace Moses
typedef pair<uint32_t, uint32_t> span;
typedef vector<vector<uint64_t> > pidmap_t; // span -> phrase ID
typedef boost::unordered_map<uint64_t,vector<span> > pid2span_t;
typedef boost::unordered_map<uint64_t,jstats> jStatsTable;
typedef pstats::trg_map_t jStatsTable;
Mmsapt const& PT;
vector<id_type> s,t;

View File

@ -1,6 +1,7 @@
// $Id$
#include <list>
#include <vector>
#include "TranslationOptionCollectionConfusionNet.h"
#include "ConfusionNet.h"
#include "DecodeGraph.h"
@ -10,6 +11,7 @@
#include "FF/InputFeature.h"
#include "TranslationModel/PhraseDictionaryTreeAdaptor.h"
#include "util/exception.hh"
#include <boost/foreach.hpp>
using namespace std;
@ -17,11 +19,21 @@ namespace Moses
{
/** constructor; just initialize the base class */
TranslationOptionCollectionConfusionNet::TranslationOptionCollectionConfusionNet(
const ConfusionNet &input
, size_t maxNoTransOptPerCoverage, float translationOptionThreshold)
: TranslationOptionCollection(input, maxNoTransOptPerCoverage, translationOptionThreshold)
TranslationOptionCollectionConfusionNet::
TranslationOptionCollectionConfusionNet(const ConfusionNet &input,
size_t maxNoTransOptPerCoverage,
float translationOptionThreshold)
: TranslationOptionCollection(input, maxNoTransOptPerCoverage,
translationOptionThreshold)
{
// Prefix checkers are phrase dictionaries that provide a prefix check
// to indicate that a phrase table entry with a given prefix exists.
// If no entry with the given prefix exists, there is no point in
// expanding it further.
vector<PhraseDictionary*> prefixCheckers;
BOOST_FOREACH(PhraseDictionary* pd, PhraseDictionary::GetColl())
if (pd->ProvidesPrefixCheck()) prefixCheckers.push_back(pd);
const InputFeature &inputFeature = InputFeature::Instance();
UTIL_THROW_IF2(&inputFeature == NULL, "Input feature must be specified");
@ -91,6 +103,11 @@ TranslationOptionCollectionConfusionNet::TranslationOptionCollectionConfusionNet
Phrase subphrase(prevPhrase);
subphrase.AddWord(word);
bool OK = prefixCheckers.size() == 0;
for (size_t k = 0; !OK && k < prefixCheckers.size(); ++k)
OK = prefixCheckers[k]->PrefixExists(subphrase);
if (!OK) continue;
const ScorePair &scores = col[i].second;
ScorePair *inputScore = new ScorePair(*prevInputScore);
inputScore->PlusEquals(scores);
@ -105,6 +122,9 @@ TranslationOptionCollectionConfusionNet::TranslationOptionCollectionConfusionNet
} // for (iterPath = prevPaths.begin(); iterPath != prevPaths.end(); ++iterPath) {
}
}
// cerr << "HAVE " << m_inputPathQueue.size()
// << " input paths of max. length "
// << maxSizePhrase << "." << endl;
}
InputPathList &TranslationOptionCollectionConfusionNet::GetInputPathList(size_t startPos, size_t endPos)
@ -229,7 +249,9 @@ void TranslationOptionCollectionConfusionNet::CreateTranslationOptionsForRangeLE
// go thru each intermediate trans opt just created
const vector<TranslationOption*>& partTransOptList = oldPtoc->GetList();
vector<TranslationOption*>::const_iterator iterPartialTranslOpt;
for (iterPartialTranslOpt = partTransOptList.begin() ; iterPartialTranslOpt != partTransOptList.end() ; ++iterPartialTranslOpt) {
for (iterPartialTranslOpt = partTransOptList.begin();
iterPartialTranslOpt != partTransOptList.end();
++iterPartialTranslOpt) {
TranslationOption &inputPartialTranslOpt = **iterPartialTranslOpt;
if (transStep) {

View File

@ -37,14 +37,17 @@ TranslationOptionCollectionLattice::TranslationOptionCollectionLattice(
const std::vector<size_t> &nextNodes = input.GetNextNodes(startPos);
WordsRange range(startPos, startPos);
const NonTerminalSet &labels = input.GetLabelSet(startPos, startPos);
const ConfusionNet::Column &col = input.GetColumn(startPos);
for (size_t i = 0; i < col.size(); ++i) {
const Word &word = col[i].first;
UTIL_THROW_IF2(word.IsEpsilon(), "Epsilon not supported");
size_t nextNode = nextNodes[i];
size_t endPos = startPos + nextNode - 1;
WordsRange range(startPos, endPos);
const NonTerminalSet &labels = input.GetLabelSet(startPos, endPos);
Phrase subphrase;
subphrase.AddWord(word);
@ -53,9 +56,7 @@ TranslationOptionCollectionLattice::TranslationOptionCollectionLattice(
InputPath *path = new InputPath(subphrase, labels, range, NULL, inputScore);
size_t nextNode = nextNodes[i];
path->SetNextNode(nextNode);
m_inputPathQueue.push_back(path);
}
}
@ -135,7 +136,7 @@ void TranslationOptionCollectionLattice::CreateTranslationOptions()
const WordsRange &range = path.GetWordsRange();
if (tpColl && tpColl->GetSize()) {
TargetPhraseCollection::const_iterator iter;
TargetPhraseCollection::const_iterator iter;
for (iter = tpColl->begin(); iter != tpColl->end(); ++iter) {
const TargetPhrase &tp = **iter;
TranslationOption *transOpt = new TranslationOption(range, tp);

View File

@ -98,19 +98,45 @@ StringPiece Word::GetString(FactorType factorType) const
class StrayFactorException : public util::Exception {};
void Word::CreateFromString(FactorDirection direction
, const std::vector<FactorType> &factorOrder
, const StringPiece &str
, bool isNonTerminal)
void
Word::
CreateFromString(FactorDirection direction
, const std::vector<FactorType> &factorOrder
, const StringPiece &str
, bool isNonTerminal
, bool strict)
{
FactorCollection &factorCollection = FactorCollection::Instance();
util::TokenIter<util::MultiCharacter> fit(str, StaticData::Instance().GetFactorDelimiter());
for (size_t ind = 0; ind < factorOrder.size() && fit; ++ind, ++fit) {
m_factorArray[factorOrder[ind]] = factorCollection.AddFactor(*fit, isNonTerminal);
}
UTIL_THROW_IF(fit, StrayFactorException, "You have configured " << factorOrder.size() << " factors but the word " << str << " contains factor delimiter " << StaticData::Instance().GetFactorDelimiter() << " too many times.");
vector<StringPiece> bits(MAX_NUM_FACTORS);
util::TokenIter<util::MultiCharacter>
fit(str, StaticData::Instance().GetFactorDelimiter());
size_t i = 0;
for (; i < MAX_NUM_FACTORS && fit; ++i,++fit)
bits[i] = *fit;
if (i == MAX_NUM_FACTORS)
UTIL_THROW_IF(fit, StrayFactorException,
"The hard limit for factors is " << MAX_NUM_FACTORS
<< ". The word " << str << " contains factor delimiter "
<< StaticData::Instance().GetFactorDelimiter()
<< " too many times.");
if (strict)
UTIL_THROW_IF(fit, StrayFactorException,
"You have configured " << factorOrder.size()
<< " factors but the word " << str
<< " contains factor delimiter "
<< StaticData::Instance().GetFactorDelimiter()
<< " too many times.");
UTIL_THROW_IF(i < factorOrder.size(),util::Exception,
"Too few factors in string '" << str << "'.");
for (size_t k = 0; k < factorOrder.size(); ++k)
{
UTIL_THROW_IF(factorOrder[k] >= MAX_NUM_FACTORS, util::Exception,
"Factor order out of bounds.");
m_factorArray[factorOrder[k]] = factorCollection.AddFactor(bits[k], isNonTerminal);
}
// assume term/non-term same for all factors
m_isNonTerminal = isNonTerminal;
}

View File

@ -151,7 +151,8 @@ public:
void CreateFromString(FactorDirection direction
, const std::vector<FactorType> &factorOrder
, const StringPiece &str
, bool isNonTerminal);
, bool isNonTerminal
, bool strict = true);
void CreateUnknownWord(const Word &sourceWord);

View File

@ -49,7 +49,12 @@ void WordLattice::Print(std::ostream& out) const
out<<"\n\n";
}
int WordLattice::InitializeFromPCNDataType(const PCN::CN& cn, const std::vector<FactorType>& factorOrder, const std::string& debug_line)
int
WordLattice::
InitializeFromPCNDataType
(const PCN::CN& cn,
const std::vector<FactorType>& factorOrder,
const std::string& debug_line)
{
const StaticData &staticData = StaticData::Instance();
const InputFeature &inputFeature = InputFeature::Instance();
@ -73,14 +78,20 @@ int WordLattice::InitializeFromPCNDataType(const PCN::CN& cn, const std::vector<
//check for correct number of link parameters
if (alt.m_denseFeatures.size() != numInputScores) {
TRACE_ERR("ERROR: need " << numInputScores << " link parameters, found " << alt.m_denseFeatures.size() << " while reading column " << i << " from " << debug_line << "\n");
TRACE_ERR("ERROR: need " << numInputScores
<< " link parameters, found "
<< alt.m_denseFeatures.size()
<< " while reading column " << i
<< " from " << debug_line << "\n");
return false;
}
//check each element for bounds
std::vector<float>::const_iterator probsIterator;
data[i][j].second = std::vector<float>(0);
for(probsIterator = alt.m_denseFeatures.begin(); probsIterator < alt.m_denseFeatures.end(); probsIterator++) {
for(probsIterator = alt.m_denseFeatures.begin();
probsIterator < alt.m_denseFeatures.end();
probsIterator++) {
IFVERBOSE(1) {
if (*probsIterator < 0.0f) {
TRACE_ERR("WARN: neg probability: " << *probsIterator << "\n");
@ -102,7 +113,9 @@ int WordLattice::InitializeFromPCNDataType(const PCN::CN& cn, const std::vector<
float value = (alt.m_word=="" || alt.m_word==EPSILON) ? 0.0f : -1.0f;
data[i][j].second.denseScores.push_back(value);
}
String2Word(alt.m_word, data[i][j]. first, factorOrder);
Word& w = data[i][j].first;
w.CreateFromString(Input,factorOrder,StringPiece(alt.m_word),false);
// String2Word(alt.m_word, data[i][j]. first, factorOrder);
next_nodes[i][j] = alt.m_next;
if(next_nodes[i][j] > maxSizePhrase) {

View File

@ -119,14 +119,21 @@ sub exec_moses {
my ($decoder, $conf, $input, $results) = @_;
my $start_time = time;
my ($o, $ec, $sig);
my $cmd;
if ($NBEST > 0){
print STDERR "Nbest output file is $results/run.nbest\n";
print STDERR "Nbest size is $NBEST\n";
($o, $ec, $sig) = run_command("$decoder -f $conf -i $input -n-best-list $results/run.nbest $NBEST 1> $results/run.stdout 2> $results/run.stderr");
$cmd = "$decoder -f $conf -i $input -n-best-list $results/run.nbest $NBEST 1> $results/run.stdout 2> $results/run.stderr";
}
else{
($o, $ec, $sig) = run_command("$decoder -f $conf -i $input 1> $results/run.stdout 2> $results/run.stderr");
$cmd = "$decoder -f $conf -i $input 1> $results/run.stdout 2> $results/run.stderr";
}
open CMD, ">$results/cmd_line";
print CMD "$cmd\n";
close CMD;
($o, $ec, $sig) = run_command($cmd);
my $elapsed = time - $start_time;
return ($o, $elapsed, $ec, $sig);
}

View File

@ -21,11 +21,14 @@ while (@ARGV) {
/^-l$/ && ($language = shift, next);
/^-q$/ && ($QUIET = 1, next);
/^-h$/ && ($HELP = 1, next);
/^-b$/ && ($|++, next); # no output buffering
}
if ($HELP) {
print "Usage ./split-sentences.perl (-l [en|de|...]) < textfile > splitfile\n";
exit;
print "Usage ./split-sentences.perl (-l [en|de|...]) [-q] [-b] < textfile > splitfile\n";
print "-q: quiet mode\n";
print "-b: no output buffering (for use in bidirectional pipes)\n";
exit;
}
if (!$QUIET) {
print STDERR "Sentence Splitter v3\n";

View File

@ -38,9 +38,17 @@ while(<STDIN>) {
print " " if $i;
print $$MARKUP[$i];
$$WORD[$i] =~ /^([^\|]+)(.*)/;
my $word = $1;
my $otherfactors = $2;
my ($word,$otherfactors);
if ($$WORD[$i] =~ /^([^\|]+)(.*)/)
{
$word = $1;
$otherfactors = $2;
}
else
{
$word = $$WORD[$i];
$otherfactors = "";
}
if ($sentence_start && defined($BEST{lc($word)})) {
print $BEST{lc($word)}; # truecase sentence start

291
scripts/server/sim-pe.py Executable file
View File

@ -0,0 +1,291 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Written by Ulrich Germann on the basis of contrib/server/client.py.
# This script simulates post-editing of MT output and incrementally
# updates the dynamic phrase tables in the moses server.
import xmlrpclib,datetime,argparse,sys,os,time
from subprocess import *
# We must perform some custom argument processing, as moses parameter
# specifications do not comply with the standards used in standard
# argument parsing packages; an isolated double dash separates script
# arguments from moses arguments
MosesProcess = None
NBestFile = None
def shutdown():
    """Terminate the moses server child process, if this script spawned one."""
    if MosesProcess:
        if args.debug:
            print >>sys.stderr,"shutting down moses server"
            pass
        MosesProcess.terminate()
        pass
    return
def find_free_port(p):
    """
    Find a free port, starting at /p/ and scanning upward (at most 20
    candidates, checked against netstat's list of listening TCP ports).
    Return the free port, or False if none found.
    """
    ret = p
    while ret - p < 20:
        devnull = open(os.devnull,"w")
        n = Popen(["netstat","-tnp"],stdout=PIPE,stderr=devnull)
        if n.communicate()[0].find(":%d "%ret) < 0:
            # nothing is listening on port /ret/
            # bug fix: this used to "return p", discarding the scan and
            # reporting the starting port even when it was busy
            return ret
        ret += 1
        pass
    return False
def launch_moses(mo_args):
    """
    Spawn a moses server process. Return URL of said process.
    Exits the script on failure.
    """
    global MosesProcess
    # ensure a --server-port argument is present, defaulting to 7777
    try:
        port_index = mo_args.index("--server-port") + 1
    except:
        mo_args.extend(["--server-port","7777"])
        port_index = len(mo_args) - 1
        pass
    # the requested port may be taken; scan for a free one
    port = find_free_port(int(mo_args[port_index]))
    if not port:
        print >>sys.stderr, "FATAL ERROR: No available port for moses server!"
        sys.exit(1)
        pass
    # NOTE(review): /port/ is not written back into mo_args[port_index],
    # so if the original port was busy the server is still started on the
    # busy port -- confirm whether this is intended.
    if args.debug:
        MosesProcess = Popen([args.servercmd] + mo_args)
    else:
        # suppress server output unless debugging
        devnull = open(os.devnull,"w")
        MosesProcess = Popen([args.servercmd] + mo_args,
                             stderr=devnull, stdout=devnull)
    if MosesProcess.poll():
        print >>sys.stderr, "FATAL ERROR: Could not launch moses server!"
        sys.exit(1)
        pass
    if args.debug:
        print >>sys.stderr,"MOSES port is %d."%port
        print >>sys.stderr,"Moses poll status is", MosesProcess.poll()
        pass
    return "http://localhost:%d"%port
def split_args(all_args):
    """
    Split argument list all_args into script-specific
    and moses-specific arguments (separated by an isolated '--'),
    translating some moses options into their script equivalents.
    Returns the pair (my_args, mo_args).
    """
    my_args = []
    mo_args = []
    try:
        i = all_args.index("--")
        my_args = all_args[:i]
        mo_args = all_args[i+1:]
    except:
        # no separator: everything belongs to moses
        my_args = []
        mo_args = all_args[:]
        pass

    # IMPORTANT: the code below must be coordinated with
    # - the evolution of moses command line arguments
    # - mert-moses.pl
    i = 0
    while i < len(mo_args):
        if mo_args[i] == "-i" or mo_args[i] == "-input-file":
            # bug fix: this used to read "m_args[i+1]" (a NameError) and
            # mapped to "--src", an option interpret_args() does not
            # define; the script's source-file option is "--input".
            my_args.extend(["--input",mo_args[i+1]])
            mo_args[i:i+2] = []
        elif mo_args[i] == "-inputtype":
            if mo_args[i+1] != "0":
                # not yet supported! Therefore:
                errmsg = "FATAL ERROR: "
                errmsg += "%s only supports plain text input at this point."
                print >>sys.stderr,errmsg%sys.argv[0]
                sys.exit(1)
                pass
            # NOTE(review): interpret_args() defines no "--input-type"
            # option, so this mapping still fails downstream; kept as-is
            # for visibility.
            my_args.extend(["--input-type",mo_args[i+1]])
            mo_args[i:i+2] = []
        elif mo_args[i] == "-lattice-samples":
            my_args.extend(["--lattice-sample",mo_args[i+2]])
            my_args.extend(["--lattice-sample-file",mo_args[i+1]])
            mo_args[i:i+3] = []
            # not yet supported! Therefore:
            errmsg = "FATAL ERROR: "
            errmsg += "%s does not yet support lattice sampling."
            print >>sys.stderr,errmsg%sys.argv[0]
            sys.exit(1)
        elif mo_args[i] == "-n-best-list":
            my_args.extend(["--nbest",mo_args[i+2]])
            my_args.extend(["--nbest-file",mo_args[i+1]])
            mo_args[i:i+3] = []
        elif mo_args[i] == "-n-best-distinct":
            my_args.extend(["-U"])
            mo_args[i:i+1] = []
        else:
            i += 1
            pass
        pass
    return my_args,mo_args
def interpret_args(my_args):
    """
    Parse script-specific argument list.
    """
    ap = argparse.ArgumentParser()

    # interfacing with moses
    # ap.add_argument("-m","--moses-cmd",default="moses",dest="mosescmd",
    #                 help="path to standard moses command")
    ap.add_argument("-s","--server-cmd",default="mosesserver",
                    dest="servercmd", help="path to moses server command")
    ap.add_argument("-u","--url",help="URL of external moses server.")

    # input / output
    ap.add_argument("-i","--input",help="source file",default="-")
    ap.add_argument("-r","--ref",help="reference translation",default=None)
    ap.add_argument("-a","--aln",help="alignment",default=None)
    ap.add_argument("-o","--output",default="-",help="output file")
    ap.add_argument("-d","--debug",action="store_true",help="debug mode")

    # moses reporting options
    # NOTE(review): type=bool below does not parse "True"/"False" -- any
    # non-empty argument is truthy; kept as-is for compatibility.
    ap.add_argument("-A","--with-alignment", dest="A",
                    help="include alignment in output", action="store_true")
    ap.add_argument("-G","--with-graph",type=bool, default=False, dest="G",
                    help="include search graph info in output")
    ap.add_argument("-T","--with-transopt",type=bool, default=False, dest="T",
                    help="include translation options info in output")
    ap.add_argument("-F","--report-all-factors", action="store_true", dest="F",
                    help="report all factors")
    ap.add_argument("-n","--nbest",type=int, dest="nbest", default=0,
                    help="size of nbest list")
    ap.add_argument("-N","--nbest-file", dest="nbestFile", default=0,
                    help="output file for nbest list")
    ap.add_argument("-U","--nbest-distinct",type=bool, dest="U", default=False,
                    help="report all factors")
    return ap.parse_args(my_args)
def translate(proxy,args,s):
    """
    Ask the moses server behind /proxy/ to translate sentence /s/, with
    reporting options taken from /args/. Returns the server's response
    dict, or None if the RPC call failed.
    """
    param = {'text': s.strip()}
    for enabled, key in ((args.A, 'align'),
                         (args.T, 'topt'),
                         (args.F, 'report-all-factors'),
                         (args.U, 'nbest-distinct')):
        if enabled:
            param[key] = True
    if args.nbest:
        param['nbest'] = int(args.nbest)
        param['add-score-breakdown'] = True
    try:
        return proxy.translate(param)
    except:
        # best-effort: treat any RPC failure as "no result"
        return None
def read_data(fname):
    """
    Read and return data (source, target or alignment) from file
    /fname/ as a list of stripped lines. Files ending in .gz are
    transparently decompressed via zcat.
    """
    if fname.endswith(".gz"):
        lines = Popen(["zcat",fname],stdout=PIPE)\
                .communicate()[0]\
                .strip().split('\n')
    else:
        lines = [line.strip() for line in open(fname)]
    return lines
def repack_result(id,result):
    """
    Print the server response /result/ for sentence number /id/ to
    stdout (and its n-best entries to NBestFile, when requested).
    """
    global args
    if args.nbest:
        if not NBestFile:
            # nbest output requested but no destination file was opened
            shutdown()
            assert NBestFile
            sys.exit(1)
        for h in result['nbest']:
            fields = (id,h['hyp'],h['fvals'],h['totalScore'])
            print >>NBestFile,"%d ||| %s ||| %s ||| %f"%fields
            pass
        pass
    if 'align' in result:
        # interleave target text with source span markers |src-start src-end|
        t = result['text'].split()
        span = ''
        i = 0
        k = 0
        for a in result['align']:
            k = a['tgt-start']
            if k: print " ".join(t[i:k]),span,
            i = k
            span = "|%d %d|"%(a['src-start'],a['src-end'])
            pass
        print " ".join(t[k:]),span
        pass
    else:
        print result['text']
        pass
    return
if __name__ == "__main__":
my_args, mo_args = split_args(sys.argv[1:])
global args
args = interpret_args(my_args)
if "-show-weights" in mo_args:
devnull = open(os.devnull,"w")
mo = Popen([args.servercmd] + mo_args,stdout=PIPE,stderr=devnull)
print mo.communicate()[0].strip()
sys.exit(0)
pass
if args.nbest:
if args.nbestFile:
NBestFile = open(args.nbestFile,"w")
else:
NBestFile = sys.stdout
pass
pass
if "url" not in args or not args.url:
url = launch_moses(mo_args)
else:
url = args.url
pass
if url[:4] != "http": url = "http://%s"%url
if url[-5:] != "/RPC2": url += "/RPC2"
proxy = xmlrpclib.ServerProxy(url)
ret = None
aln = None
if args.ref: ref = read_data(args.ref)
if args.aln: aln = read_data(args.aln)
if (args.input == "-"):
line = sys.stdin.readline()
id = 0
while line:
result = translate(proxy,args,line)
repack_result(id,result)
line = sys.stdin.readline()
id += 1
pass
pass
else:
src = read_data(args.src)
for i in xrange(len(src)):
if ref and aln:
result = proxy.updater({'source' : src[i],
'target' : ref[i],
'alignment' : aln[i]})
repack_result(i,result)
pass
pass
pass
pass
shutdown()

View File

@ -1,4 +1,4 @@
#!/usr/bin/perl -W
#!/usr/bin/perl -W
# script for preprocessing language data prior to tokenization
# Start by Ulrich Germann, after noticing systematic preprocessing errors
# in some of the English Europarl data.
@ -12,21 +12,21 @@ binmode(STDOUT, ":utf8");
sub usage
{
print "Script for preprocessing of raw language data prior to tokenization\n";
print "Usage: $0 -l <language tag>\n";
print "Usage: $0 -l <language tag> [-b]\n";
print " -b: no buffering\n";
}
my %args;
getopt('l=s h',\%args);
getopt('l=s h b',\%args);
usage() && exit(0) if $args{'h'};
$|++ if $args{'b'};
if ($args{'l'} eq "en")
{
while (<>)
while (<>)
{
s/([[:alpha:]]\') s\b/$1s/g;
print;
s/([[:alpha:]]\') s\b/$1s/g;
print;
}
}
elsif ($args{'l'} eq "fr")
{
@ -38,6 +38,5 @@ elsif ($args{'l'} eq "fr")
}
else
{
print while <>;
}