Merge branch 'dynamic-phrase-tables'

Ulrich Germann 2014-03-11 14:17:42 +00:00
commit f1449cd7fe
39 changed files with 3053 additions and 679 deletions

Jamroot

@@ -108,6 +108,8 @@ if [ option.get "enable-mpi" : : "yes" ] {
requirements += [ option.get "notrace" : <define>TRACE_ENABLE=1 ] ;
requirements += [ option.get "enable-boost-pool" : : <define>USE_BOOST_POOL ] ;
requirements += [ option.get "with-mm" : : <define>PT_UG ] ;
requirements += [ option.get "with-mm" : : <define>MAX_NUM_FACTORS=4 ] ;
if [ option.get "with-cmph" ] {
requirements += <define>HAVE_CMPH ;
@@ -137,6 +139,23 @@ project : requirements
#Add directories here if you want their incidental targets too (i.e. tests).
build-projects lm util phrase-extract search moses moses/LM mert moses-cmd moses-chart-cmd mira scripts regression-testing ;
if [ option.get "with-mm" : : "yes" ]
{
alias mm :
moses/TranslationModel/UG/mm//mtt-build
moses/TranslationModel/UG/mm//mtt-dump
moses/TranslationModel/UG/mm//symal2mam
moses/TranslationModel/UG/mm//custom-pt
moses/TranslationModel/UG/mm//mmlex-build
moses/TranslationModel/UG/mm//mtt-count-words
moses/TranslationModel/UG//try-align
;
}
else
{
alias mm ;
}
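# 'mm' bundles the memory-mapped suffix-array tools when built with
# --with-mm; otherwise it is an empty alias, so the 'programs' list
# below resolves either way.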
alias programs :
lm//programs
moses-chart-cmd//moses_chart
@@ -154,12 +173,10 @@ phrase-extract//pcfg-score
biconcor
mira//mira
contrib/server//mosesserver
#moses/mm//mtt-build
#moses/mm//mtt-dump
#moses/mm//symal2mam
#moses/mm//custom-pt
mm
;
install-bin-libs programs ;
install-headers headers-base : [ path.glob-tree biconcor contrib lm mert misc moses-chart-cmd moses-cmd OnDiskPt phrase-extract symal util : *.hh *.h ] : . ;
install-headers headers-moses : moses//headers-to-install : moses ;


@@ -182,7 +182,9 @@ void IOWrapper::Initialization(const std::vector<FactorType> &/*inputFactorOrde
}
InputType*IOWrapper::GetInput(InputType* inputType)
InputType*
IOWrapper::
GetInput(InputType* inputType)
{
if(inputType->Read(*m_inputStream, m_inputFactorOrder)) {
if (long x = inputType->GetTranslationId()) {
@@ -605,7 +607,7 @@ void IOWrapper::OutputLatticeMBRNBestList(const vector<LatticeMBRSolution>& solu
bool ReadInput(IOWrapper &ioWrapper, InputTypeEnum inputType, InputType*& source)
{
delete source;
if (source) delete source;
switch(inputType) {
case SentenceInput:
source = ioWrapper.GetInput(new Sentence);
@@ -618,6 +620,7 @@ bool ReadInput(IOWrapper &ioWrapper, InputTypeEnum inputType, InputType*& source
break;
default:
TRACE_ERR("Unknown input type: " << inputType << "\n");
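// null out the pointer (it was deleted above) so the return below reports failure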
source = NULL;
}
return (source ? true : false);
}


@@ -14,268 +14,293 @@
namespace Moses
{
struct CNStats {
size_t created,destr,read,colls,words;
CNStats() : created(0),destr(0),read(0),colls(0),words(0) {}
~CNStats() {
print(std::cerr);
}
void createOne() {
++created;
}
void destroyOne() {
++destr;
}
void collect(const ConfusionNet& cn) {
++read;
colls+=cn.GetSize();
for(size_t i=0; i<cn.GetSize(); ++i)
words+=cn[i].size();
}
void print(std::ostream& out) const {
if(created>0) {
out<<"confusion net statistics:\n"
" created:\t"<<created<<"\n"
" destroyed:\t"<<destr<<"\n"
" succ. read:\t"<<read<<"\n"
" columns:\t"<<colls<<"\n"
" words:\t"<<words<<"\n"
" avg. word/column:\t"<<words/(1.0*colls)<<"\n"
" avg. cols/sent:\t"<<colls/(1.0*read)<<"\n"
"\n\n";
struct CNStats {
size_t created,destr,read,colls,words;
CNStats() : created(0),destr(0),read(0),colls(0),words(0) {}
~CNStats() {
print(std::cerr);
}
}
};
CNStats stats;
size_t ConfusionNet::GetColumnIncrement(size_t i, size_t j) const
{
(void) i;
(void) j;
return 1;
}
ConfusionNet::ConfusionNet()
: InputType()
{
stats.createOne();
const StaticData& staticData = StaticData::Instance();
if (staticData.IsChart()) {
m_defaultLabelSet.insert(StaticData::Instance().GetInputDefaultNonTerminal());
}
UTIL_THROW_IF2(&InputFeature::Instance() == NULL, "Input feature must be specified");
}
ConfusionNet::~ConfusionNet()
{
stats.destroyOne();
}
ConfusionNet::ConfusionNet(Sentence const& s)
{
data.resize(s.GetSize());
for(size_t i=0; i<s.GetSize(); ++i) {
ScorePair scorePair;
std::pair<Word, ScorePair > temp = std::make_pair(s.GetWord(i), scorePair);
data[i].push_back(temp);
}
}
bool ConfusionNet::ReadF(std::istream& in,
const std::vector<FactorType>& factorOrder,
int format)
{
VERBOSE(1, "read confusion net with format "<<format<<"\n");
switch(format) {
case 0:
return ReadFormat0(in,factorOrder);
case 1:
return ReadFormat1(in,factorOrder);
default:
std::stringstream strme;
strme << "ERROR: unknown format '"<<format
<<"' in ConfusionNet::Read";
UserMessage::Add(strme.str());
}
return false;
}
int ConfusionNet::Read(std::istream& in,
const std::vector<FactorType>& factorOrder)
{
int rv=ReadF(in,factorOrder,0);
if(rv) stats.collect(*this);
return rv;
}
void ConfusionNet::String2Word(const std::string& s,Word& w,
const std::vector<FactorType>& factorOrder)
{
std::vector<std::string> factorStrVector = Tokenize(s, "|");
for(size_t i=0; i<factorOrder.size(); ++i)
w.SetFactor(factorOrder[i],
FactorCollection::Instance().AddFactor(Input,factorOrder[i],
factorStrVector[i]));
}
bool ConfusionNet::ReadFormat0(std::istream& in,
const std::vector<FactorType>& factorOrder)
{
Clear();
const StaticData &staticData = StaticData::Instance();
const InputFeature &inputFeature = InputFeature::Instance();
size_t numInputScores = inputFeature.GetNumInputScores();
size_t numRealWordCount = inputFeature.GetNumRealWordsInInput();
size_t totalCount = numInputScores + numRealWordCount;
bool addRealWordCount = (numRealWordCount > 0);
std::string line;
while(getline(in,line)) {
std::istringstream is(line);
std::string word;
Column col;
while(is>>word) {
Word w;
String2Word(word,w,factorOrder);
std::vector<float> probs(totalCount, 0.0);
for(size_t i=0; i < numInputScores; i++) {
double prob;
if (!(is>>prob)) {
TRACE_ERR("ERROR: unable to parse CN input - bad link probability, or wrong number of scores\n");
return false;
}
if(prob<0.0) {
VERBOSE(1, "WARN: negative prob: "<<prob<<" ->set to 0.0\n");
prob=0.0;
} else if (prob>1.0) {
VERBOSE(1, "WARN: prob > 1.0 : "<<prob<<" -> set to 1.0\n");
prob=1.0;
}
probs[i] = (std::max(static_cast<float>(log(prob)),LOWEST_SCORE));
void createOne() {
++created;
}
void destroyOne() {
++destr;
}
void collect(const ConfusionNet& cn) {
++read;
colls+=cn.GetSize();
for(size_t i=0; i<cn.GetSize(); ++i)
words+=cn[i].size();
}
void print(std::ostream& out) const {
if(created>0) {
out<<"confusion net statistics:\n"
" created:\t"<<created<<"\n"
" destroyed:\t"<<destr<<"\n"
" succ. read:\t"<<read<<"\n"
" columns:\t"<<colls<<"\n"
" words:\t"<<words<<"\n"
" avg. word/column:\t"<<words/(1.0*colls)<<"\n"
" avg. cols/sent:\t"<<colls/(1.0*read)<<"\n"
"\n\n";
}
//store the 'real' word count in the last feature if we have one more weight than arc scores and the word is not epsilon
if (addRealWordCount && word!=EPSILON && word!="")
probs.back() = -1.0;
ScorePair scorePair(probs);
col.push_back(std::make_pair(w,scorePair));
}
if(col.size()) {
data.push_back(col);
ShrinkToFit(data.back());
} else break;
};
CNStats stats;
size_t
ConfusionNet::
GetColumnIncrement(size_t i, size_t j) const
{
(void) i;
(void) j;
return 1;
}
return !data.empty();
}
bool ConfusionNet::ReadFormat1(std::istream& in,
const std::vector<FactorType>& factorOrder)
{
Clear();
std::string line;
if(!getline(in,line)) return 0;
size_t s;
if(getline(in,line)) s=atoi(line.c_str());
else return 0;
data.resize(s);
for(size_t i=0; i<data.size(); ++i) {
ConfusionNet::
ConfusionNet()
: InputType()
{
stats.createOne();
const StaticData& staticData = StaticData::Instance();
if (staticData.IsChart()) {
m_defaultLabelSet.insert(StaticData::Instance().GetInputDefaultNonTerminal());
}
UTIL_THROW_IF2(&InputFeature::Instance() == NULL, "Input feature must be specified");
}
ConfusionNet::
~ConfusionNet()
{
stats.destroyOne();
}
ConfusionNet::
ConfusionNet(Sentence const& s)
{
data.resize(s.GetSize());
for(size_t i=0; i<s.GetSize(); ++i) {
ScorePair scorePair;
std::pair<Word, ScorePair > temp = std::make_pair(s.GetWord(i), scorePair);
data[i].push_back(temp);
}
}
bool
ConfusionNet::
ReadF(std::istream& in, const std::vector<FactorType>& factorOrder, int format)
{
VERBOSE(1, "read confusion net with format "<<format<<"\n");
switch(format) {
case 0:
return ReadFormat0(in,factorOrder);
case 1:
return ReadFormat1(in,factorOrder);
default:
std::stringstream strme;
strme << "ERROR: unknown format '"<<format
<<"' in ConfusionNet::Read";
UserMessage::Add(strme.str());
}
return false;
}
int
ConfusionNet::
Read(std::istream& in,
const std::vector<FactorType>& factorOrder)
{
int rv=ReadF(in,factorOrder,0);
if(rv) stats.collect(*this);
return rv;
}
void
ConfusionNet::
String2Word(const std::string& s,Word& w,
const std::vector<FactorType>& factorOrder)
{
std::vector<std::string> factorStrVector = Tokenize(s, "|");
for(size_t i=0; i<factorOrder.size(); ++i)
w.SetFactor(factorOrder[i],
FactorCollection::Instance().AddFactor
(Input,factorOrder[i], factorStrVector[i]));
}
bool
ConfusionNet::
ReadFormat0(std::istream& in, const std::vector<FactorType>& factorOrder)
{
Clear();
const StaticData &staticData = StaticData::Instance();
const InputFeature &inputFeature = InputFeature::Instance();
size_t numInputScores = inputFeature.GetNumInputScores();
size_t numRealWordCount = inputFeature.GetNumRealWordsInInput();
size_t totalCount = numInputScores + numRealWordCount;
bool addRealWordCount = (numRealWordCount > 0);
std::string line;
while(getline(in,line)) {
std::istringstream is(line);
std::string word;
Column col;
while(is>>word) {
Word w;
String2Word(word,w,factorOrder);
std::vector<float> probs(totalCount, 0.0);
for(size_t i=0; i < numInputScores; i++) {
double prob;
if (!(is>>prob)) {
TRACE_ERR("ERROR: unable to parse CN input - bad link probability, or wrong number of scores\n");
return false;
}
if(prob<0.0) {
VERBOSE(1, "WARN: negative prob: "<<prob<<" ->set to 0.0\n");
prob=0.0;
} else if (prob>1.0) {
VERBOSE(1, "WARN: prob > 1.0 : "<<prob<<" -> set to 1.0\n");
prob=1.0;
}
probs[i] = (std::max(static_cast<float>(log(prob)),LOWEST_SCORE));
}
//store the 'real' word count in the last feature if we have one more weight than arc scores and the word is not epsilon
if (addRealWordCount && word!=EPSILON && word!="")
probs.back() = -1.0;
ScorePair scorePair(probs);
col.push_back(std::make_pair(w,scorePair));
}
if(col.size()) {
data.push_back(col);
ShrinkToFit(data.back());
} else break;
}
return !data.empty();
}
bool
ConfusionNet::
ReadFormat1(std::istream& in, const std::vector<FactorType>& factorOrder)
{
Clear();
std::string line;
if(!getline(in,line)) return 0;
std::istringstream is(line);
if(!(is>>s)) return 0;
std::string word;
double prob;
data[i].resize(s);
for(size_t j=0; j<s; ++j)
if(is>>word>>prob) {
//TODO: we are only reading one prob from this input format, should read many... but this function is unused anyway. -JS
data[i][j].second.denseScores = std::vector<float> (1);
data[i][j].second.denseScores.push_back((float) log(prob));
if(data[i][j].second.denseScores[0]<0) {
VERBOSE(1, "WARN: neg costs: "<<data[i][j].second.denseScores[0]<<" -> set to 0\n");
data[i][j].second.denseScores[0]=0.0;
}
String2Word(word,data[i][j].first,factorOrder);
} else return 0;
}
return !data.empty();
}
void ConfusionNet::Print(std::ostream& out) const
{
out<<"conf net: "<<data.size()<<"\n";
for(size_t i=0; i<data.size(); ++i) {
out<<i<<" -- ";
for(size_t j=0; j<data[i].size(); ++j) {
out<<"("<<data[i][j].first.ToString()<<", ";
// dense
std::vector<float>::const_iterator iterDense;
for(iterDense = data[i][j].second.denseScores.begin(); iterDense < data[i][j].second.denseScores.end(); ++iterDense) {
out<<", "<<*iterDense;
}
// sparse
std::map<StringPiece, float>::const_iterator iterSparse;
for(iterSparse = data[i][j].second.sparseScores.begin(); iterSparse != data[i][j].second.sparseScores.end(); ++iterSparse) {
out << ", " << iterSparse->first << "=" << iterSparse->second;
}
out<<") ";
size_t s;
if(getline(in,line)) s=atoi(line.c_str());
else return 0;
data.resize(s);
for(size_t i=0; i<data.size(); ++i) {
if(!getline(in,line)) return 0;
std::istringstream is(line);
if(!(is>>s)) return 0;
std::string word;
double prob;
data[i].resize(s);
for(size_t j=0; j<s; ++j)
if(is>>word>>prob) {
//TODO: we are only reading one prob from this input format, should read many... but this function is unused anyway. -JS
data[i][j].second.denseScores = std::vector<float> (1);
data[i][j].second.denseScores.push_back((float) log(prob));
if(data[i][j].second.denseScores[0]<0) {
VERBOSE(1, "WARN: neg costs: "<<data[i][j].second.denseScores[0]<<" -> set to 0\n");
data[i][j].second.denseScores[0]=0.0;
}
String2Word(word,data[i][j].first,factorOrder);
} else return 0;
}
out<<"\n";
return !data.empty();
}
void ConfusionNet::Print(std::ostream& out) const
{
out<<"conf net: "<<data.size()<<"\n";
for(size_t i=0; i<data.size(); ++i) {
out<<i<<" -- ";
for(size_t j=0; j<data[i].size(); ++j) {
out<<"("<<data[i][j].first.ToString()<<", ";
// dense
std::vector<float>::const_iterator iterDense;
for(iterDense = data[i][j].second.denseScores.begin();
iterDense < data[i][j].second.denseScores.end();
++iterDense) {
out<<", "<<*iterDense;
}
// sparse
std::map<StringPiece, float>::const_iterator iterSparse;
for(iterSparse = data[i][j].second.sparseScores.begin();
iterSparse != data[i][j].second.sparseScores.end();
++iterSparse) {
out << ", " << iterSparse->first << "=" << iterSparse->second;
}
out<<") ";
}
out<<"\n";
}
out<<"\n\n";
}
out<<"\n\n";
}
#ifdef _WIN32
#pragma warning(disable:4716)
#endif
Phrase ConfusionNet::GetSubString(const WordsRange&) const
{
UTIL_THROW2("ERROR: call to ConfusionNet::GetSubString\n");
//return Phrase(Input);
}
Phrase
ConfusionNet::
GetSubString(const WordsRange&) const
{
UTIL_THROW2("ERROR: call to ConfusionNet::GetSubString\n");
//return Phrase(Input);
}
std::string ConfusionNet::GetStringRep(const std::vector<FactorType> /* factorsToPrint */) const //not well defined yet
{
TRACE_ERR("ERROR: call to ConfusionNet::GeStringRep\n");
return "";
}
std::string
ConfusionNet::
GetStringRep(const std::vector<FactorType> /* factorsToPrint */) const //not well defined yet
{
TRACE_ERR("ERROR: call to ConfusionNet::GeStringRep\n");
return "";
}
#ifdef _WIN32
#pragma warning(disable:4716)
#endif
const Word& ConfusionNet::GetWord(size_t) const
{
UTIL_THROW2("ERROR: call to ConfusionNet::GetFactorArray\n");
}
const Word& ConfusionNet::GetWord(size_t) const
{
UTIL_THROW2("ERROR: call to ConfusionNet::GetFactorArray\n");
}
#ifdef _WIN32
#pragma warning(default:4716)
#endif
std::ostream& operator<<(std::ostream& out,const ConfusionNet& cn)
{
cn.Print(out);
return out;
}
std::ostream& operator<<(std::ostream& out,const ConfusionNet& cn)
{
cn.Print(out);
return out;
}
TranslationOptionCollection*
ConfusionNet::CreateTranslationOptionCollection() const
{
size_t maxNoTransOptPerCoverage = StaticData::Instance().GetMaxNoTransOptPerCoverage();
float translationOptionThreshold = StaticData::Instance().GetTranslationOptionThreshold();
TranslationOptionCollection *rv= new TranslationOptionCollectionConfusionNet(*this, maxNoTransOptPerCoverage, translationOptionThreshold);
assert(rv);
return rv;
}
TranslationOptionCollection*
ConfusionNet::
CreateTranslationOptionCollection() const
{
size_t maxNoTransOptPerCoverage
= StaticData::Instance().GetMaxNoTransOptPerCoverage();
float translationOptionThreshold
= StaticData::Instance().GetTranslationOptionThreshold();
TranslationOptionCollection *rv
= new TranslationOptionCollectionConfusionNet
(*this, maxNoTransOptPerCoverage, translationOptionThreshold);
assert(rv);
return rv;
}
}


@@ -47,7 +47,7 @@
#include "moses/TranslationModel/CompactPT/PhraseDictionaryCompact.h"
#endif
#ifdef PT_UG
#include "moses/TranslationModel/mmsapt.h"
#include "moses/TranslationModel/UG/mmsapt.h"
#endif
#include "moses/LM/Ken.h"


@@ -13,7 +13,9 @@ namespace Moses
InputFeature *InputFeature::s_instance = NULL;
InputFeature::InputFeature(const std::string &line)
:StatelessFeatureFunction(line)
: StatelessFeatureFunction(line)
, m_numInputScores(0)
, m_numRealWordCount(0)
{
ReadParameters();
@@ -23,6 +25,7 @@ InputFeature::InputFeature(const std::string &line)
void InputFeature::Load()
{
const PhraseDictionary *pt = PhraseDictionary::GetColl()[0];
const PhraseDictionaryTreeAdaptor *ptBin = dynamic_cast<const PhraseDictionaryTreeAdaptor*>(pt);


@@ -54,6 +54,7 @@ lib moses :
[ glob
*.cpp
TranslationModel/*.cpp
TranslationModel/UG/*.cpp
TranslationModel/fuzzy-match/*.cpp
TranslationModel/DynSAInclude/*.cpp
TranslationModel/RuleTable/*.cpp
@@ -70,11 +71,11 @@ lib moses :
]
headers FF_Factory.o LM//LM TranslationModel/CompactPT//CompactPT synlm ThreadPool rt
..//search ../util/double-conversion//double-conversion ..//z ../OnDiskPt//OnDiskPt
TranslationModel/UG/generic//generic TranslationModel/UG/mm//mm
$(TOP)//boost_iostreams ;
#generic//generic mm//mm
alias headers-to-install : [ glob-tree *.h ] ;
alias headers-to-install : [ glob-tree [^.]*.h ] ;
import testing ;


@@ -0,0 +1,14 @@
exe try-align :
try-align.cc
$(TOP)/moses//moses
$(TOP)/moses/TranslationModel/UG/generic//generic
$(TOP)//boost_iostreams
$(TOP)//boost_program_options
$(TOP)/moses/TranslationModel/UG/mm//mm
$(TOP)/moses/TranslationModel/UG//mmsapt
$(TOP)/util//kenutil
;
install $(PREFIX)/bin : try-align ;
fakelib mmsapt : [ glob *.cpp mmsapt*.cc ] ;


@@ -0,0 +1,116 @@
# Some systems apparently distinguish between shell
# variables and environment variables. The latter are
# visible to the make utility, the former apparently not,
# so we need to set them if they are not defined yet
# ===============================================================================
# COMPILATION PREFERENCES
# ===============================================================================
# CCACHE: if set to ccache, use ccache to speed up compilation
# OPTI: optimization level
# PROF: profiler switches
CCACHE = ccache
OPTI = 3
EXE_TAG = exe
PROF =
# PROF = -g -pg
# ===============================================================================
SHELL = bash
MAKEFLAGS += --warn-undefined-variables
.DEFAULT_GOAL = all
.SUFFIXES:
# ===============================================================================
# COMPILATION 'LOCALIZATION'
HOST ?= $(shell hostname)
HOSTTYPE ?= $(shell uname -m)
KERNEL = $(shell uname -r)
MOSES_ROOT = ${HOME}/code/mosesdecoder
WDIR = build/${HOSTTYPE}/${KERNEL}/${OPTI}
VPATH = ${HOME}/code/mosesdecoder/
CXXFLAGS = ${PROF} -ggdb -Wall -O${OPTI} ${INCLUDES}
CXXFLAGS += -DMAX_NUM_FACTORS=4
CXXFLAGS += -DKENLM_MAX_ORDER=5
modirs := $(addprefix -I,$(shell find ${MOSES_ROOT}/moses ${MOSES_ROOT}/contrib -type d))
CXXFLAGS += -I${MOSES_ROOT}
INCLUDES =
BZLIB =
BOOSTLIBTAG =
lzma = lzma
#lzma =
REQLIBS = m z pthread dl ${lzma} ${BZLIB} \
boost_thread${BOOSTLIBTAG} \
boost_program_options${BOOSTLIBTAG} \
boost_system${BOOSTLIBTAG} \
boost_filesystem${BOOSTLIBTAG} \
boost_iostreams${BOOSTLIBTAG} z bz2
# icuuc icuio icui18n \
LIBS = $(addprefix -l, moses ${REQLIBS})
LIBS = $(addprefix -l, ${REQLIBS})
LIBDIRS = -L${HOME}/code/mosesdecoder/lib
LIBDIRS += -L${HOME}/lib
PREFIX ?= .
BINDIR ?= ${PREFIX}/bin
ifeq "$(OPTI)" "0"
BINPREF = debug.
else
BINPREF =
endif
OBJ2 :=
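# The 'compile' template below emits one explicit object rule per source
# file under ${WDIR} and registers the generated .d file in DEP;
# -MD -MP makes the compiler write those dependency files as a side effect.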
define compile
DEP += ${WDIR}/$(basename $(notdir $1)).d
${WDIR}/$(basename $(notdir $1)).o : $1 $(wildcard $(basename $1).h)
@echo -e "COMPILING $1"
@mkdir -p $$(@D)
${CXX} ${CXXFLAGS} -MD -MP -c $$(abspath $$<) -o $$@
endef
testprogs = test-dynamic-im-tsa try-align
programs = mtt-build mtt-dump symal2mam custom-pt mmlex-build ${testprogs}
programs += mtt-count-words
all: $(addprefix ${BINDIR}/${BINPREF}, $(programs))
@echo $^
clean:
rm -f ${WDIR}/*.o ${WDIR}/*.d
custom-pt: ${BINDIR}/${BINPREF}custom-pt
echo $^
INMOGEN = $(wildcard ${MOSES_ROOT}/moses/TranslationModel/UG/generic/*/*.cpp)
#INMOMM = $(wildcard ${MOSES_ROOT}/moses/TranslationModel/UG/mm/*.cc)
#INMOMM += $(wildcard ${MOSES_ROOT}/moses/TranslationModel/UG/mm/*.cpp)
OBJ = $(patsubst %.cc,%.o,$(wildcard $(patsubst %.h,%.cc,$(wildcard *.h))))
OBJ += $(patsubst %.cpp,%.o,${INMOGEN})
#OBJ += $(patsubst %.cpp,%.o,${INMOMM})
#OBJ += $(patsubst %.cc,%.o,${INMOMM})
EXE = $(patsubst %.cc,%.o,$(filter-out $(patsubst %.h,%.cc,$(wildcard *.h)),$(wildcard *.cc)))
$(foreach cpp,${INMOGEN},$(eval $(call compile,${cpp})))
$(foreach cpp,$(wildcard *.cc),$(eval $(call compile,${cpp})))
$(addprefix ${BINDIR}/${BINPREF}, $(programs)): $(addprefix ${WDIR}/,$(notdir ${OBJ}))
$(addprefix ${BINDIR}/${BINPREF}, $(programs)): ${MOSES_ROOT}/lib/libmoses.a
${BINDIR}/${BINPREF}%: ${WDIR}/%.o ${WDIR}/mmsapt_align.o
@mkdir -p ${BINDIR}
echo PREREQS: $^
$(CXX) $(CXXFLAGS) -o $@ $^ ${LIBDIRS} ${LIBS}
#try-align: ${WDIR}/try-align.o ${WDIR}/tpt_tokenindex.o
# $(CXX) $(CXXFLAGS) -o $@ $^ ${LIBDIRS}
.SECONDARY:
-include $(DEP)


@@ -1,41 +1,59 @@
exe mtt-build :
mtt-build.cc
$(TOP)/moses/generic//generic
exe mmlex-build :
mmlex-build.cc
$(TOP)/moses/TranslationModel/UG/generic//generic
$(TOP)//boost_iostreams
$(TOP)//boost_program_options
$(TOP)/moses/mm//mm
$(TOP)/moses/TranslationModel/UG/mm//mm
$(TOP)/util//kenutil
;
exe mtt-count-words :
mtt-count-words.cc
$(TOP)/moses/TranslationModel/UG/generic//generic
$(TOP)//boost_iostreams
$(TOP)//boost_program_options
$(TOP)/moses/TranslationModel/UG/mm//mm
$(TOP)/util//kenutil
;
exe mtt-build :
mtt-build.cc
$(TOP)/moses/TranslationModel/UG/generic//generic
$(TOP)//boost_iostreams
$(TOP)//boost_program_options
$(TOP)/moses/TranslationModel/UG/mm//mm
$(TOP)/util//kenutil
;
exe mtt-dump :
mtt-dump.cc
$(TOP)/moses/generic//generic
$(TOP)/moses/TranslationModel/UG/generic//generic
$(TOP)//boost_iostreams
$(TOP)//boost_program_options
$(TOP)/moses/mm//mm
$(TOP)/moses/TranslationModel/UG/mm//mm
$(TOP)/util//kenutil
;
exe symal2mam :
symal2mam.cc
$(TOP)/moses/generic//generic
$(TOP)/moses/TranslationModel/UG/generic//generic
$(TOP)//boost_iostreams
$(TOP)//boost_program_options
$(TOP)/moses/mm//mm
$(TOP)/moses/TranslationModel/UG/mm//mm
$(TOP)/util//kenutil
;
exe custom-pt :
custom-pt.cc
$(TOP)/moses/generic//generic
#$(TOP)/moses/generic//generic
$(TOP)//boost_iostreams
$(TOP)//boost_program_options
$(TOP)/moses/mm//mm
$(TOP)/moses/TranslationModel/UG/mm//mm
$(TOP)/util//kenutil
;
install $(PREFIX)/bin : mtt-build mtt-dump symal2mam custom-pt ;
install $(PREFIX)/bin : mtt-build mtt-dump mtt-count-words symal2mam custom-pt mmlex-build ;
fakelib mm : [ glob ug_*.cc tpt_*.cc ] ;


@@ -27,10 +27,11 @@ MAKEFLAGS += --warn-undefined-variables
# COMPILATION 'LOCALIZATION'
HOST ?= $(shell hostname)
HOSTTYPE ?= $(shell uname -m)
KERNEL = $(shell uname -r)
MOSES_ROOT = ${HOME}/code/moses/master/mosesdecoder
WDIR = build/${HOSTTYPE}/${OPTI}
VPATH = ${HOME}/code/moses/master/mosesdecoder/
MOSES_ROOT = ${HOME}/code/mosesdecoder
WDIR = build/${HOSTTYPE}/${KERNEL}/${OPTI}
VPATH = ${HOME}/code/mosesdecoder/
CXXFLAGS = ${PROF} -ggdb -Wall -O${OPTI} ${INCLUDES}
CXXFLAGS += -DMAX_NUM_FACTORS=4
CXXFLAGS += -DKENLM_MAX_ORDER=5
@@ -50,8 +51,10 @@ REQLIBS = m z pthread lzma ${BZLIB} \
# icuuc icuio icui18n \
LIBS = $(addprefix -l, ${REQLIBS} moses)
LIBDIRS = -L${HOME}/code/moses/master/mosesdecoder/lib
BINDIR = bin
LIBDIRS = -L${HOME}/code/mosesdecoder/lib
LIBDIRS += -L${HOME}/lib
PREFIX ?= .
BINDIR ?= ${PREFIX}/bin
ifeq "$(OPTI)" "0"
BINPREF = debug.
else
@@ -71,7 +74,9 @@ ${WDIR}/$(basename $(notdir $1)).o : $1 $(wildcard $(basename $1).h)
endef
programs = mtt-build mtt-dump symam2mam custom-pt mmlex-build
testprogs = test-dynamic-im-tsa
programs = mtt-build mtt-dump symal2mam custom-pt mmlex-build ${testprogs}
programs += mtt-count-words
all: $(addprefix ${BINDIR}/${BINPREF}, $(programs))
@echo $^
@@ -81,7 +86,7 @@ clean:
custom-pt: ${BINDIR}/${BINPREF}custom-pt
echo $^
INMOGEN = $(wildcard ${MOSES_ROOT}/moses/generic/*/*.cpp)
INMOGEN = $(wildcard ${MOSES_ROOT}/moses/TranslationModel/UG/generic/*/*.cpp)
OBJ = $(patsubst %.cc,%.o,$(wildcard $(patsubst %.h,%.cc,$(wildcard *.h))))
OBJ += $(patsubst %.cpp,%.o,${INMOGEN})
EXE = $(patsubst %.cc,%.o,$(filter-out $(patsubst %.h,%.cc,$(wildcard *.h)),$(wildcard *.cc)))


@@ -8,9 +8,9 @@
#include <iomanip>
#include <algorithm>
#include "moses/generic/sorting/VectorIndexSorter.h"
#include "moses/generic/sampling/Sampling.h"
#include "moses/generic/file_io/ug_stream.h"
#include "moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h"
#include "moses/TranslationModel/UG/generic/sampling/Sampling.h"
#include "moses/TranslationModel/UG/generic/file_io/ug_stream.h"
#include <boost/math/distributions/binomial.hpp>
#include <boost/unordered_map.hpp>


@@ -1,27 +1,32 @@
// -*- c++ -*-
// Program to extract word cooccurrence counts from a memory-mapped word-aligned bitext
// stores the counts lexicon in the format for mm2dTable<uint32_t> (ug_mm_2d_table.h)
// Program to extract word cooccurrence counts from a memory-mapped
// word-aligned bitext; stores the counts lexicon in the format for
// mm2dTable<uint32_t> (ug_mm_2d_table.h)
//
// (c) 2010-2012 Ulrich Germann
// to do: multi-threading
#include <queue>
#include <iomanip>
#include <vector>
#include <iterator>
#include <sstream>
#include <algorithm>
#include <boost/program_options.hpp>
#include <boost/dynamic_bitset.hpp>
#include <boost/shared_ptr.hpp>
#include <boost/foreach.hpp>
#include <boost/thread.hpp>
#include <boost/math/distributions/binomial.hpp>
#include <boost/unordered_map.hpp>
#include <boost/unordered_set.hpp>
#include "moses/generic/program_options/ug_get_options.h"
// #include "ug_translation_finder.h"
// #include "ug_sorters.h"
// #include "ug_corpus_sampling.h"
#include "moses/TranslationModel/UG/generic/program_options/ug_get_options.h"
#include "ug_mm_2d_table.h"
#include "ug_mm_ttrack.h"
#include "ug_corpus_token.h"
#include "ug_corpus_token.h"
using namespace std;
using namespace ugdiss;
@@ -30,116 +35,296 @@ using namespace boost::math;
typedef mm2dTable<id_type,id_type,uint32_t,uint32_t> LEX_t;
typedef SimpleWordId Token;
vector<uint32_t> m1; // marginals L1
vector<uint32_t> m2; // marginals L2
id_type first_rare_id=500;
vector<vector<uint32_t> > JFREQ; // joint count table for frequent L1 words
vector<map<id_type,uint32_t> > JRARE; // joint count table for rare L1 words
// DECLARATIONS
void interpret_args(int ac, char* av[]);
mmTtrack<Token> T1,T2;
mmTtrack<char> Tx;
TokenIndex V1,V2;
string bname,cfgFile,L1,L2,oname;
typedef pair<id_type,id_type> wpair;
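// Two counts per word pair: 'a' counts actual alignment links; 'c' adds,
// once per distinct pair and sentence, the product of the two words'
// within-sentence frequencies, i.e. raw co-occurrences.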
struct Count
{
uint32_t a;
uint32_t c;
Count() : a(0), c(0) {};
Count(uint32_t ax, uint32_t cx) : a(ax), c(cx) {}
};
// DECLARATIONS
void interpret_args(int ac, char* av[]);
bool
operator<(pair<id_type,Count> const& a,
pair<id_type,Count> const& b)
{
return a.first < b.first;
}
typedef boost::unordered_map<wpair,Count> countmap_t;
typedef vector<vector<pair<id_type,Count> > > countlist_t;
vector<countlist_t> XLEX;
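// Each Counter worker processes sentences offset, offset+skip, ... and
// finally flushes its hash map CNT into LEX: one sorted vector of
// (L2 id, Count) entries per L1 word id.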
class Counter
{
public:
countmap_t CNT;
countlist_t & LEX;
size_t offset;
size_t skip;
Counter(countlist_t& lex, size_t o, size_t s)
: LEX(lex), offset(o), skip(s) {}
void processSentence(id_type sid);
void operator()();
};
string bname,cfgFile,L1,L2,oname,cooc;
int verbose;
size_t truncat;
size_t num_threads;
void
Counter::
operator()()
{
for (size_t sid = offset; sid < min(truncat,T1.size()); sid += skip)
processSentence(sid);
LEX.resize(V1.ksize());
for (countmap_t::const_iterator c = CNT.begin(); c != CNT.end(); ++c)
{
pair<id_type,Count> foo(c->first.second,c->second);
LEX.at(c->first.first).push_back(foo);
}
typedef vector<pair<id_type,Count> > v_t;
BOOST_FOREACH(v_t& v, LEX)
sort(v.begin(),v.end());
}
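// Heap comparator for merging the per-thread count lists: an entry
// (thread, position) is ranked by the L2 word id it points to, with the
// comparison inverted so pop_heap yields ids in ascending order.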
struct lexsorter
{
vector<countlist_t> const& v;
id_type wid;
lexsorter(vector<countlist_t> const& vx, id_type widx)
: v(vx),wid(widx) {}
bool operator()(pair<uint32_t,uint32_t> const& a,
pair<uint32_t,uint32_t> const& b) const
{
return (v.at(a.first).at(wid).at(a.second).first >
v.at(b.first).at(wid).at(b.second).first);
}
};
void
writeTableHeader(ostream& out)
{
filepos_type idxOffset=0;
numwrite(out,idxOffset); // blank for the time being
numwrite(out,id_type(V1.ksize()));
numwrite(out,id_type(V2.ksize()));
}
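// k-way merge of the per-thread count lists, one L1 word at a time,
// summing counts for identical (L1,L2) pairs; writes cells first, then
// the row index and marginals, and finally backpatches the index offset
// into the file header.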
void writeTable(ostream* aln_out, ostream* coc_out)
{
vector<uint32_t> m1a(V1.ksize(),0); // marginals L1
vector<uint32_t> m2a(V2.ksize(),0); // marginals L2
vector<uint32_t> m1c(V1.ksize(),0); // marginals L1
vector<uint32_t> m2c(V2.ksize(),0); // marginals L2
vector<id_type> idxa(V1.ksize()+1,0);
vector<id_type> idxc(V1.ksize()+1,0);
if (aln_out) writeTableHeader(*aln_out);
if (coc_out) writeTableHeader(*coc_out);
size_t CellCountA=0,CellCountC=0;
for (size_t id1 = 0; id1 < V1.ksize(); ++id1)
{
idxa[id1] = CellCountA;
idxc[id1] = CellCountC;
lexsorter sorter(XLEX,id1);
vector<pair<uint32_t,uint32_t> > H; H.reserve(num_threads);
for (size_t i = 0; i < num_threads; ++i)
{
if (id1 < XLEX.at(i).size() && XLEX[i][id1].size())
H.push_back(pair<uint32_t,uint32_t>(i,0));
}
if (!H.size()) continue;
make_heap(H.begin(),H.end(),sorter);
while (H.size())
{
id_type id2 = XLEX[H[0].first][id1][H[0].second].first;
uint32_t aln = XLEX[H[0].first][id1][H[0].second].second.a;
uint32_t coc = XLEX[H[0].first][id1][H[0].second].second.c;
pop_heap(H.begin(),H.end(),sorter);
++H.back().second;
if (H.back().second == XLEX[H.back().first][id1].size())
H.pop_back();
else
push_heap(H.begin(),H.end(),sorter);
while (H.size() &&
XLEX[H[0].first][id1].at(H[0].second).first == id2)
{
aln += XLEX[H[0].first][id1][H[0].second].second.a;
coc += XLEX[H[0].first][id1][H[0].second].second.c;
pop_heap(H.begin(),H.end(),sorter);
++H.back().second;
if (H.back().second == XLEX[H.back().first][id1].size())
H.pop_back();
else
push_heap(H.begin(),H.end(),sorter);
}
if (aln_out)
{
++CellCountA;
numwrite(*aln_out,id2);
numwrite(*aln_out,aln);
m1a[id1] += aln;
m2a[id2] += aln;
}
if (coc_out && coc)
{
++CellCountC;
numwrite(*coc_out,id2);
numwrite(*coc_out,coc);
m1c[id1] += coc;
m2c[id2] += coc;
}
}
}
idxa.back() = CellCountA;
idxc.back() = CellCountC;
if (aln_out)
{
filepos_type idxOffsetA = aln_out->tellp();
BOOST_FOREACH(id_type foo, idxa)
numwrite(*aln_out,foo);
aln_out->write(reinterpret_cast<char const*>(&m1a[0]),m1a.size()*4);
aln_out->write(reinterpret_cast<char const*>(&m2a[0]),m2a.size()*4);
aln_out->seekp(0);
numwrite(*aln_out,idxOffsetA);
}
if (coc_out)
{
filepos_type idxOffsetC = coc_out->tellp();
BOOST_FOREACH(id_type foo, idxc)
numwrite(*coc_out,foo);
coc_out->write(reinterpret_cast<char const*>(&m1c[0]),m1c.size()*4);
coc_out->write(reinterpret_cast<char const*>(&m2c[0]),m2c.size()*4);
coc_out->seekp(0);
numwrite(*coc_out,idxOffsetC);
}
}
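// Per sentence pair: count alignment links (and, once per distinct word
// pair, co-occurrences); words still unaligned afterwards are paired
// with the reserved id 0 on the other side.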
void
Counter::
processSentence(id_type sid)
{
Token const* s1 = T1.sntStart(sid);
Token const* e1 = T1.sntEnd(sid);
Token const* s2 = T2.sntStart(sid);
char const* p = Tx.sntStart(sid);
char const* q = Tx.sntEnd(sid);
ushort r,c;
bitvector check1(T1.sntLen(sid)), check2(T2.sntLen(sid));
check1.set();
check2.set();
Token const* e2 = T2.sntEnd(sid);
vector<ushort> cnt1(V1.ksize(),0);
vector<ushort> cnt2(V2.ksize(),0);
for (Token const* x = s1; x < e1; ++x)
++cnt1.at(x->id());
for (Token const* x = s2; x < e2; ++x)
++cnt2.at(x->id());
boost::unordered_set<wpair> seen;
bitvector check1(T1.sntLen(sid)); check1.set();
bitvector check2(T2.sntLen(sid)); check2.set();
// count links
char const* p = Tx.sntStart(sid);
char const* q = Tx.sntEnd(sid);
ushort r,c;
// cout << sid << " " << q-p << endl;
while (p < q)
{
p = binread(p,r);
p = binread(p,c);
// cout << sid << " " << r << "-" << c << endl;
assert(r < check1.size());
assert(c < check2.size());
assert(s1+r < e1);
assert(s2+c < e2);
check1.reset(r);
check2.reset(c);
id_type id1 = (s1+r)->id();
if (id1 < first_rare_id) JFREQ[id1][(s2+c)->id()]++;
else JRARE[id1][(s2+c)->id()]++;
id_type id2 = (s2+c)->id();
wpair k(id1,id2);
Count& cnt = CNT[k];
cnt.a++;
if (seen.insert(k).second)
cnt.c += cnt1[id1] * cnt2[id2];
}
// count unaligned words
for (size_t i = check1.find_first(); i < check1.size(); i = check1.find_next(i))
{
id_type id1 = (s1+i)->id();
if (id1 < first_rare_id) JFREQ[id1][0]++;
else JRARE[id1][0]++;
}
for (size_t i = check2.find_first(); i < check2.size(); i = check2.find_next(i))
JFREQ[0][(s2+i)->id()]++;
for (size_t i = check1.find_first();
i < check1.size();
i = check1.find_next(i))
CNT[wpair((s1+i)->id(),0)].a++;
for (size_t i = check2.find_first();
i < check2.size();
i = check2.find_next(i))
CNT[wpair(0,(s2+i)->id())].a++;
}
void
makeTable(string ofname)
{
ofstream out(ofname.c_str());
filepos_type idxOffset=0;
m1.resize(max(first_rare_id,V1.getNumTokens()),0);
m2.resize(V2.getNumTokens(),0);
JFREQ.resize(first_rare_id,vector<uint32_t>(m2.size(),0));
JRARE.resize(m1.size());
for (size_t sid = 0; sid < T1.size(); ++sid)
processSentence(sid);
// void
// writeTable(string ofname,
// vector<vector<uint32_t> >& FREQ,
// vector<map<id_type,uint32_t> >& RARE)
// {
// ofstream out(ofname.c_str());
// filepos_type idxOffset=0;
vector<id_type> index(V1.getNumTokens()+1,0);
numwrite(out,idxOffset); // blank for the time being
numwrite(out,id_type(m1.size()));
numwrite(out,id_type(m2.size()));
// vector<uint32_t> m1; // marginals L1
// vector<uint32_t> m2; // marginals L2
// m1.resize(max(first_rare_id,V1.getNumTokens()),0);
// m2.resize(V2.getNumTokens(),0);
// vector<id_type> index(V1.getNumTokens()+1,0);
// numwrite(out,idxOffset); // blank for the time being
// numwrite(out,id_type(m1.size()));
// numwrite(out,id_type(m2.size()));
id_type cellCount=0;
id_type stop = min(first_rare_id,id_type(m1.size()));
for (id_type id1 = 0; id1 < stop; ++id1)
{
index[id1] = cellCount;
vector<uint32_t> const& v = JFREQ[id1];
for (id_type id2 = 0; id2 < id_type(v.size()); ++id2)
{
if (!v[id2]) continue;
cellCount++;
numwrite(out,id2);
out.write(reinterpret_cast<char const*>(&v[id2]),sizeof(uint32_t));
m1[id1] += v[id2];
m2[id2] += v[id2];
}
}
for (id_type id1 = stop; id1 < id_type(m1.size()); ++id1)
{
index[id1] = cellCount;
map<id_type,uint32_t> const& M = JRARE[id1];
for (map<id_type,uint32_t>::const_iterator m = M.begin(); m != M.end(); ++m)
{
if (m->second == 0) continue;
cellCount++;
numwrite(out,m->first);
out.write(reinterpret_cast<char const*>(&m->second),sizeof(float));
m1[id1] += m->second;
m2[m->first] += m->second;
}
}
index[m1.size()] = cellCount;
idxOffset = out.tellp();
for (size_t i = 0; i < index.size(); ++i)
numwrite(out,index[i]);
out.write(reinterpret_cast<char const*>(&m1[0]),m1.size()*sizeof(float));
out.write(reinterpret_cast<char const*>(&m2[0]),m2.size()*sizeof(float));
// id_type cellCount=0;
// id_type stop = min(first_rare_id,id_type(m1.size()));
// for (id_type id1 = 0; id1 < stop; ++id1)
// {
// index[id1] = cellCount;
// vector<uint32_t> const& v = FREQ[id1];
// for (id_type id2 = 0; id2 < id_type(v.size()); ++id2)
// {
// if (!v[id2]) continue;
// cellCount++;
// numwrite(out,id2);
// out.write(reinterpret_cast<char const*>(&v[id2]),sizeof(uint32_t));
// m1[id1] += v[id2];
// m2[id2] += v[id2];
// }
// }
// for (id_type id1 = stop; id1 < id_type(m1.size()); ++id1)
// {
// index[id1] = cellCount;
// map<id_type,uint32_t> const& M = RARE[id1];
// for (map<id_type,uint32_t>::const_iterator m = M.begin(); m != M.end(); ++m)
// {
// if (m->second == 0) continue;
// cellCount++;
// numwrite(out,m->first);
// out.write(reinterpret_cast<char const*>(&m->second),sizeof(float));
// m1[id1] += m->second;
// m2[m->first] += m->second;
// }
// }
// index[m1.size()] = cellCount;
// idxOffset = out.tellp();
// for (size_t i = 0; i < index.size(); ++i)
// numwrite(out,index[i]);
// out.write(reinterpret_cast<char const*>(&m1[0]),m1.size()*sizeof(float));
// out.write(reinterpret_cast<char const*>(&m2[0]),m2.size()*sizeof(float));
// re-write the file header
out.seekp(0);
numwrite(out,idxOffset);
out.close();
}
// // re-write the file header
// out.seekp(0);
// numwrite(out,idxOffset);
// out.close();
// }
int
main(int argc, char* argv[])
@@ -152,8 +337,21 @@ main(int argc, char* argv[])
Tx.open(bname+L1+"-"+L2+".mam");
V1.open(bname+L1+".tdx");
V2.open(bname+L2+".tdx");
makeTable(oname);
exit(0);
if (!truncat) truncat = T1.size();
XLEX.resize(num_threads);
vector<boost::shared_ptr<boost::thread> > workers(num_threads);
for (size_t i = 0; i < num_threads; ++i)
workers[i].reset(new boost::thread(Counter(XLEX[i],i,num_threads)));
for (size_t i = 0; i < workers.size(); ++i)
workers[i]->join();
// cerr << "done counting" << endl;
ofstream aln_out,coc_out;
if (oname.size()) aln_out.open(oname.c_str());
if (cooc.size()) coc_out.open(cooc.c_str());
writeTable(oname.size() ? &aln_out : NULL,
cooc.size() ? &coc_out : NULL);
if (oname.size()) aln_out.close();
if (cooc.size()) coc_out.close();
}
void
@@ -169,6 +367,14 @@ interpret_args(int ac, char* av[])
("help,h", "print this message")
("cfg,f", po::value<string>(&cfgFile),"config file")
("oname,o", po::value<string>(&oname),"output file name")
("cooc,c", po::value<string>(&cooc),
"file name for raw co-occurrence counts")
("verbose,v", po::value<int>(&verbose)->default_value(0)->implicit_value(1),
"verbosity level")
("threads,t", po::value<size_t>(&num_threads)->default_value(4),
"count in <N> parallel threads")
("truncate,n", po::value<size_t>(&truncat)->default_value(0),
"truncate corpus to <N> sentences (for debugging)")
;
h.add_options()
@@ -181,12 +387,14 @@ interpret_args(int ac, char* av[])
a.add("L2",1);
get_options(ac,av,h.add(o),a,vm,"cfg");
if (vm.count("help") || bname.empty() || oname.empty())
if (vm.count("help") || bname.empty() || (oname.empty() && cooc.empty()))
{
cout << "usage:\n\t" << av[0] << " <basename> <L1 tag> <L2 tag> -o <output file>\n" << endl;
cout << "usage:\n\t" << av[0] << " <basename> <L1 tag> <L2 tag> [-o <output file>] [-c <output file>]\n" << endl;
cout << "at least one of -o / -c must be specified." << endl;
cout << o << endl;
exit(0);
}
num_threads = min(num_threads,24UL);
}


@@ -25,12 +25,13 @@
#include "ug_mm_ttrack.h"
#include "tpt_pickler.h"
#include "ug_deptree.h"
#include "moses/generic/sorting/VectorIndexSorter.h"
#include "ug_im_tsa.h"
#include "moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h"
#include "moses/TranslationModel/UG/mm/ug_im_tsa.h"
using namespace std;
using namespace ugdiss;
using namespace Moses;
using namespace boost;
namespace po=boost::program_options;
int with_pfas;
@@ -360,10 +361,10 @@ build_mmTSA(string infile, string outfile)
{
size_t mypid = fork();
if(mypid) return mypid;
mmTtrack<Token> T(infile);
shared_ptr<mmTtrack<Token> > T(new mmTtrack<Token>(infile));
bdBitset filter;
filter.resize(T.size(),true);
imTSA<Token> S(&T,filter,(quiet?NULL:&cerr));
filter.resize(T->size(),true);
imTSA<Token> S(T,&filter,(quiet?NULL:&cerr));
S.save_as_mm_tsa(outfile);
exit(0);
}


@@ -0,0 +1,69 @@
// count words in a memory-mapped corpus
#include "ug_mm_ttrack.h"
#include "tpt_tokenindex.h"
#include "ug_corpus_token.h"
#include <string>
#include <vector>
#include <cassert>
#include <boost/unordered_map.hpp>
#include <boost/foreach.hpp>
#include <iomanip>
#include "ug_typedefs.h"
#include "tpt_pickler.h"
// #include "moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h"
// #include "moses/TranslationModel/UG/generic/sampling/Sampling.h"
// #include "moses/TranslationModel/UG/generic/file_io/ug_stream.h"
#include <algorithm>
#include "moses/TranslationModel/UG/generic/program_options/ug_get_options.h"
using namespace std;
using namespace ugdiss;
using namespace Moses;
typedef L2R_Token<SimpleWordId> Token;
// typedef mmTSA<Token>::tree_iterator iter;
typedef boost::unordered_map<pair<size_t,size_t>,size_t> phrase_counter_t;
#define CACHING_THRESHOLD 1000
mmTtrack<Token> T; // token tracks
TokenIndex V; // vocabs
// mmTSA<Token> I; // suffix arrays
void interpret_args(int ac, char* av[]);
string bname;
bool echo;
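// Scan the memory-mapped token track once, tally token frequencies, and
// print one "<word> <count>" line per vocabulary entry (ids 0 and 1,
// presumably reserved, are skipped).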
int main(int argc, char* argv[])
{
interpret_args(argc,argv);
T.open(bname+".mct");
V.open(bname+".tdx");
vector<size_t> cnt(V.ksize(),0);
for (size_t sid = 0; sid < T.size(); ++sid)
{
Token const* stop = T.sntEnd(sid);
for (Token const* t = T.sntStart(sid); t < stop; ++cnt[(t++)->id()]);
}
for (size_t wid = 2; wid < V.ksize(); ++wid)
cout << V[wid] << " " << cnt[wid] << endl;
exit(0);
}
void
interpret_args(int ac, char* av[])
{
namespace po=boost::program_options;
po::variables_map vm;
po::options_description o("Options");
po::options_description h("Hidden Options");
po::positional_options_description a;
o.add_options()
("help,h", "print this message")
;
h.add_options()
("bname", po::value<string>(&bname), "base name")
;
a.add("bname",1);
get_options(ac,av,h.add(o),a,vm);
}


@@ -11,11 +11,11 @@
#include <iomanip>
#include "ug_typedefs.h"
#include "tpt_pickler.h"
#include "moses/generic/sorting/VectorIndexSorter.h"
#include "moses/generic/sampling/Sampling.h"
#include "moses/generic/file_io/ug_stream.h"
#include "moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h"
#include "moses/TranslationModel/UG/generic/sampling/Sampling.h"
#include "moses/TranslationModel/UG/generic/file_io/ug_stream.h"
#include <algorithm>
#include "moses/generic/program_options/ug_get_options.h"
#include "moses/TranslationModel/UG/generic/program_options/ug_get_options.h"
using namespace std;
using namespace ugdiss;


@@ -11,8 +11,8 @@
#include "ug_deptree.h"
#include "tpt_tokenindex.h"
#include "tpt_pickler.h"
#include "moses/generic/program_options/ug_get_options.h"
#include "moses/generic/file_io/ug_stream.h"
#include "moses/TranslationModel/UG/generic/program_options/ug_get_options.h"
#include "moses/TranslationModel/UG/generic/file_io/ug_stream.h"
#include <iostream>
#include <string>
@@ -21,8 +21,8 @@
#include <boost/program_options.hpp>
#include <boost/scoped_ptr.hpp>
#include "util/exception.hh"
#include "util/check.hh"
#include "headers-base/util/exception.hh"
#include "headers-base/util/check.hh"
// NOTE TO SELF:
/* Program to filter out sentences that GIZA will skip or truncate,


@@ -0,0 +1,64 @@
// -*- c++ -*-
// test program for dynamic tsas
#include <boost/program_options.hpp>
#include <boost/program_options/options_description.hpp>
#include <boost/program_options/parsers.hpp>
#include <boost/program_options/variables_map.hpp>
#include <boost/iostreams/device/mapped_file.hpp>
#include <iostream>
#include <fstream>
#include <sstream>
#include <iomanip>
#include <vector>
#include <string>
#include <sys/types.h>
#include <sys/wait.h>
#include "ug_conll_record.h"
#include "tpt_tokenindex.h"
#include "ug_mm_ttrack.h"
#include "tpt_pickler.h"
#include "ug_deptree.h"
#include "moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h"
#include "ug_im_ttrack.h"
#include "ug_bitext.h"
using namespace std;
using namespace ugdiss;
using namespace Moses;
using namespace boost;
using namespace Moses::bitext;
namespace po=boost::program_options;
typedef L2R_Token<SimpleWordId> L2R;
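// Smoke test: read interleaved source/target/alignment lines from stdin,
// add them to an in-memory bitext, then walk the target-side suffix
// array and print every suffix it indexes.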
int main()
{
sptr<imBitext<L2R> > bt(new imBitext<L2R>());
string s1,s2,aln;
vector<string> S1,S2,ALN;
while (getline(cin,s1) && getline(cin,s2) && getline(cin,aln))
{
S1.push_back(s1);
S2.push_back(s2);
ALN.push_back(aln);
}
bt = bt->add(S1,S2,ALN);
TSA<L2R>::tree_iterator m(bt->I2.get());
m.down();
do {
char const* p = m.lower_bound(-1);
tsa::ArrayEntry I(p);
do {
m.root->readEntry(I.next,I);
L2R const* stop = m.root->getCorpus()->sntEnd(I.sid);
for (L2R const* t = m.root->getCorpus()->getToken(I); t < stop; ++t)
cout << (*bt->V2)[t->id()] << " ";
cout << endl;
} while (I.next < m.upper_bound(-1));
} while (m.over());
}


@@ -138,7 +138,7 @@ namespace ugdiss
void
mkTokenIndex(string ofile,MYMAP const& M,string unkToken)
{
typedef pair<uint32_t,id_type> IndexEntry; // offset and id
// typedef pair<uint32_t,id_type> IndexEntry; // offset and id
typedef pair<string,uint32_t> Token; // token and id


@@ -17,7 +17,10 @@ namespace Moses
, good (0)
, sum_pairs (0)
, in_progress (0)
{}
{
ofwd[0] = ofwd[1] = ofwd[2] = ofwd[3] = ofwd[4] = ofwd[5] = ofwd[6] = 0;
obwd[0] = obwd[1] = obwd[2] = obwd[3] = obwd[4] = obwd[5] = obwd[6] = 0;
}
void
pstats::
@@ -38,28 +41,34 @@ namespace Moses
this->lock.unlock();
}
void
bool
pstats::
add(uint64_t pid, float const w,
vector<uchar> const& a,
uint32_t const cnt2)
uint32_t const cnt2,
uint32_t fwd_o,
uint32_t bwd_o)
{
this->lock.lock();
jstats& entry = this->trg[pid];
this->lock.unlock();
entry.add(w,a,cnt2);
entry.add(w,a,cnt2,fwd_o,bwd_o);
if (this->good < entry.rcnt())
{
this->lock.lock();
UTIL_THROW(util::Exception, "more joint counts than good counts!"
<< entry.rcnt() << "/" << this->good);
return false;
// UTIL_THROW(util::Exception, "more joint counts than good counts!"
// << entry.rcnt() << "/" << this->good);
}
return true;
}
jstats::
jstats()
: my_rcnt(0), my_wcnt(0), my_cnt2(0)
{
ofwd[0] = ofwd[1] = ofwd[2] = ofwd[3] = ofwd[4] = ofwd[5] = ofwd[6] = 0;
obwd[0] = obwd[1] = obwd[2] = obwd[3] = obwd[4] = obwd[5] = obwd[6] = 0;
my_aln.reserve(1);
}
@@ -69,11 +78,33 @@ namespace Moses
my_rcnt = other.rcnt();
my_wcnt = other.wcnt();
my_aln = other.aln();
for (int i = po_first; i <= po_other; i++)
{
ofwd[i] = other.ofwd[i];
obwd[i] = other.obwd[i];
}
}
uint32_t
jstats::
dcnt_fwd(PhraseOrientation const idx) const
{
assert(idx <= po_other);
return ofwd[idx];
}
uint32_t
jstats::
dcnt_bwd(PhraseOrientation const idx) const
{
assert(idx <= po_other);
return obwd[idx];
}
void
jstats::
add(float w, vector<uchar> const& a, uint32_t const cnt2)
add(float w, vector<uchar> const& a, uint32_t const cnt2,
uint32_t fwd_orient, uint32_t bwd_orient)
{
boost::lock_guard<boost::mutex> lk(this->lock);
my_rcnt += 1;
@@ -90,6 +121,8 @@ namespace Moses
if (my_aln[i].first > my_aln[i/2].first)
push_heap(my_aln.begin(),my_aln.begin()+i+1);
}
++ofwd[fwd_orient];
++obwd[bwd_orient];
}
uint32_t
@@ -112,6 +145,34 @@ namespace Moses
aln() const
{ return my_aln; }
void
jstats::
invalidate()
{
my_rcnt = 0;
}
bool
jstats::
valid()
{
return my_rcnt != 0;
}
bool
PhrasePair::
operator<=(PhrasePair const& other) const
{
return this->score <= other.score;
}
bool
PhrasePair::
operator>=(PhrasePair const& other) const
{
return this->score >= other.score;
}
bool
PhrasePair::
operator<(PhrasePair const& other) const
@@ -126,7 +187,30 @@ namespace Moses
return this->score > other.score;
}
PhrasePair::PhrasePair() {}
PhrasePair::
PhrasePair() {}
PhrasePair::
PhrasePair(PhrasePair const& o)
: p1(o.p1),
p2(o.p2),
raw1(o.raw1),
raw2(o.raw2),
sample1(o.sample1),
sample2(o.sample2),
good1(o.good1),
good2(o.good2),
joint(o.joint),
fvals(o.fvals),
aln(o.aln),
score(o.score)
{
for (size_t i = 0; i <= po_other; ++i)
{
dfwd[i] = o.dfwd[i];
dbwd[i] = o.dbwd[i];
}
}
void
PhrasePair::
@@ -140,6 +224,22 @@ namespace Moses
good2 = 0;
fvals.resize(numfeats);
}
void
PhrasePair::
init(uint64_t const pid1,
pstats const& ps1,
pstats const& ps2,
size_t const numfeats)
{
p1 = pid1;
raw1 = ps1.raw_cnt + ps2.raw_cnt;
sample1 = ps1.sample_cnt + ps2.sample_cnt;
sample2 = 0;
good1 = ps1.good + ps2.good;
good2 = 0;
fvals.resize(numfeats);
}
float
lbop(size_t const tries, size_t const succ, float const confidence)
@@ -149,7 +249,7 @@ namespace Moses
find_lower_bound_on_p(tries, succ, confidence);
}
void
PhrasePair const&
PhrasePair::
update(uint64_t const pid2, jstats const& js)
{
@@ -159,8 +259,64 @@ namespace Moses
assert(js.aln().size());
if (js.aln().size())
aln = js.aln()[0].second;
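// Turn raw orientation counts into add-one-smoothed distributions over
// the orientation classes (hence the +1 on every count).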
float total_fwd = 0, total_bwd = 0;
for (int i = po_first; i <= po_other; i++)
{
PhraseOrientation po = static_cast<PhraseOrientation>(i);
total_fwd += js.dcnt_fwd(po)+1;
total_bwd += js.dcnt_bwd(po)+1;
}
for (int i = po_first; i <= po_other; i++)
{
PhraseOrientation po = static_cast<PhraseOrientation>(i);
dfwd[i] = float(js.dcnt_fwd(po)+1)/total_fwd;
dbwd[i] = float(js.dcnt_bwd(po)+1)/total_bwd;
}
return *this;
}
PhrasePair const&
PhrasePair::
update(uint64_t const pid2, jstats const& js1, jstats const& js2)
{
p2 = pid2;
raw2 = js1.cnt2() + js2.cnt2();
joint = js1.rcnt() + js2.rcnt();
assert(js1.aln().size() || js2.aln().size());
if (js1.aln().size())
aln = js1.aln()[0].second;
else if (js2.aln().size())
aln = js2.aln()[0].second;
for (int i = po_first; i < po_other; i++)
{
PhraseOrientation po = static_cast<PhraseOrientation>(i);
dfwd[i] = float(js1.dcnt_fwd(po) + js2.dcnt_fwd(po) + 1)/(sample1+po_other);
dbwd[i] = float(js1.dcnt_bwd(po) + js2.dcnt_bwd(po) + 1)/(sample1+po_other);
}
return *this;
}
PhrasePair const&
PhrasePair::
update(uint64_t const pid2,
size_t const raw2extra,
jstats const& js)
{
p2 = pid2;
raw2 = js.cnt2() + raw2extra;
joint = js.rcnt();
assert(js.aln().size());
if (js.aln().size())
aln = js.aln()[0].second;
for (int i = po_first; i <= po_other; i++)
{
PhraseOrientation po = static_cast<PhraseOrientation>(i);
dfwd[i] = float(js.dcnt_fwd(po)+1)/(sample1+po_other);
dbwd[i] = float(js.dcnt_bwd(po)+1)/(sample1+po_other);
}
return *this;
}
float
PhrasePair::
eval(vector<float> const& w)
@@ -172,5 +328,331 @@ namespace Moses
return this->score;
}
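// Extend the in-memory bitext: copy the current snapshot under lock,
// append the new sentences on both sides (each side on its own thread),
// and binary-encode the "row-col" alignment pairs into the alignment track.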
template<>
sptr<imBitext<L2R_Token<SimpleWordId> > >
imBitext<L2R_Token<SimpleWordId> >::
add(vector<string> const& s1,
vector<string> const& s2,
vector<string> const& aln) const
{
typedef L2R_Token<SimpleWordId> TKN;
assert(s1.size() == s2.size() && s1.size() == aln.size());
sptr<imBitext<TKN> > ret;
{
lock_guard<mutex> guard(this->lock);
ret.reset(new imBitext<TKN>(*this));
}
// we add the sentences in separate threads (so it's faster)
boost::thread thread1(snt_adder<TKN>(s1,*ret->V1,ret->myT1,ret->myI1));
thread1.join(); // for debugging
boost::thread thread2(snt_adder<TKN>(s2,*ret->V2,ret->myT2,ret->myI2));
BOOST_FOREACH(string const& a, aln)
{
istringstream ibuf(a);
ostringstream obuf;
uint32_t row,col; char c;
while (ibuf>>row>>c>>col)
{
assert(c == '-');
binwrite(obuf,row);
binwrite(obuf,col);
}
char const* x = obuf.str().c_str();
vector<char> v(x,x+obuf.str().size());
ret->myTx = append(ret->myTx, v);
}
thread1.join();
thread2.join();
ret->Tx = ret->myTx;
ret->T1 = ret->myT1;
ret->T2 = ret->myT2;
ret->I1 = ret->myI1;
ret->I2 = ret->myI2;
return ret;
}
// template<>
void
snt_adder<L2R_Token<SimpleWordId> >::
operator()()
{
vector<id_type> sids;
sids.reserve(snt.size());
BOOST_FOREACH(string const& foo, snt)
{
sids.push_back(track ? track->size() : 0);
istringstream buf(foo);
string w;
vector<L2R_Token<SimpleWordId > > s;
s.reserve(100);
while (buf >> w)
s.push_back(L2R_Token<SimpleWordId>(V[w]));
track = append(track,s);
}
if (index)
index.reset(new imTSA<L2R_Token<SimpleWordId> >(*index,track,sids,V.tsize()));
else
index.reset(new imTSA<L2R_Token<SimpleWordId> >(track,NULL,NULL));
}
snt_adder<L2R_Token<SimpleWordId> >::
snt_adder(vector<string> const& s, TokenIndex& v,
sptr<imTtrack<L2R_Token<SimpleWordId> > >& t,
sptr<imTSA<L2R_Token<SimpleWordId> > >& i)
: snt(s), V(v), track(t), index(i)
{ }
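// Starting from target word s2, grow the smallest consistent phrase pair
// by chasing alignment links with a worklist; fails if the expansion
// reaches back before s2 or overlaps the previous source span [L1,R1).
// On success, [s1,e1) x [s2,e2) is that minimal pair.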
bool
expand_phrase_pair
(vector<vector<ushort> >& a1,
vector<vector<ushort> >& a2,
ushort const s2, // next word on in target side
ushort const L1, ushort const R1, // limits of previous phrase
ushort & s1, ushort & e1, ushort& e2) // start/end src; end trg
{
if (a2[s2].size() == 0)
{
cout << __FILE__ << ":" << __LINE__ << endl;
return false;
}
bitvector done1(a1.size());
bitvector done2(a2.size());
vector <pair<ushort,ushort> > agenda;
// x.first: side (1 or 2)
// x.second: word position
agenda.reserve(a1.size() + a2.size());
agenda.push_back(pair<ushort,ushort>(2,s2));
e2 = s2;
s1 = e1 = a2[s2].front();
if (s1 >= L1 && s1 < R1)
{
cout << __FILE__ << ":" << __LINE__ << endl;
return false;
}
agenda.push_back(pair<ushort,ushort>(2,s2));
while (agenda.size())
{
ushort side = agenda.back().first;
ushort p = agenda.back().second;
agenda.pop_back();
if (side == 1)
{
done1.set(p);
BOOST_FOREACH(ushort i, a1[p])
{
if (i < s2)
{
// cout << __FILE__ << ":" << __LINE__ << endl;
return false;
}
if (done2[i]) continue;
for (;e2 <= i;++e2)
if (!done2[e2])
agenda.push_back(pair<ushort,ushort>(2,e2));
}
}
else
{
done2.set(p);
BOOST_FOREACH(ushort i, a2[p])
{
if ((e1 < L1 && i >= L1) || (s1 >= R1 && i < R1) || (i >= L1 && i < R1))
{
// cout << __FILE__ << ":" << __LINE__ << " "
// << L1 << "-" << R1 << " " << i << " "
// << s1 << "-" << e1<< endl;
return false;
}
if (e1 < i)
{
for (; e1 <= i; ++e1)
if (!done1[e1])
agenda.push_back(pair<ushort,ushort>(1,e1));
}
else if (s1 > i)
{
for (; i <= s1; ++i)
if (!done1[i])
agenda.push_back(pair<ushort,ushort>(1,i));
}
}
}
}
++e1;
++e2;
return true;
}
// s1 = seed;
// e1 = seed;
// s2 = e2 = a1[seed].front();
// BOOST_FOREACH(ushort k, a1[seed])
// {
// if (s2 < k) s2 = k;
// if (e2 > k) e2 = k;
// }
// for (ushort j = s2; j <= e2; ++j)
// {
// if (a2[j].size() == 0) continue;
// done2.set(j);
// agenda.push_back(pair<ushort,ushort>(j,1));
// }
// while (agenda.size())
// {
// ushort side = agenda[0].second;
// ushort i = agenda[0].first;
// agenda.pop_back();
// if (side)
// {
// BOOST_FOREACH(ushort k, a2[i])
// {
// if (k < L1 || k > R1)
// return false;
// if (done1[k])
// continue;
// while (s1 > k)
// {
// --s1;
// if (done1[s1] || !a1[s1].size())
// continue;
// done1.set(s1);
// agenda.push_back(pair<ushort,ushort>(s1,0));
// }
// while (e1 < k)
// {
// ++e1;
// if (done1[e1] || !a1[e1].size())
// continue;
// done1.set(e1);
// agenda.push_back(pair<ushort,ushort>(e1,0));
// }
// }
// }
// else
// {
// BOOST_FOREACH(ushort k, a1[i])
// {
// if (k < L2 || k > R2)
// return false;
// if (done2[k])
// continue;
// while (s2 > k)
// {
// --s2;
// if (done2[s2] || !a2[s2].size())
// continue;
// done1.set(s2);
// agenda.push_back(pair<ushort,ushort>(s2,1));
// }
// while (e2 < k)
// {
// ++e2;
// if (done1[e2] || !a1[e2].size())
// continue;
// done2.set(e2);
// agenda.push_back(pair<ushort,ushort>(e2,1));
// }
// }
// }
// }
// ++e1;
// ++e2;
// return true;
// }
void
print_amatrix(vector<vector<ushort> > a1, uint32_t len2,
ushort b1, ushort e1, ushort b2, ushort e2)
{
vector<bitvector> M(a1.size(),bitvector(len2));
for (ushort j = 0; j < a1.size(); ++j)
{
BOOST_FOREACH(ushort k, a1[j])
M[j].set(k);
}
cout << b1 << "-" << e1 << " " << b2 << "-" << e2 << endl;
cout << " ";
for (size_t c = 0; c < len2;++c)
cout << c%10;
cout << endl;
for (size_t r = 0; r < M.size(); ++r)
{
cout << setw(3) << r << " ";
for (size_t c = 0; c < M[r].size(); ++c)
{
if ((b1 <= r) && (r < e1) && b2 <= c && c < e2)
cout << (M[r][c] ? 'x' : '-');
else cout << (M[r][c] ? 'o' : '.');
}
cout << endl;
}
cout << string(90,'-') << endl;
}
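// Classify the forward orientation of a phrase ending at (e1,e2): expand
// a phrase pair around the next aligned target word and report mono /
// swap / jump-forward / jump-backward relative to the source span;
// po_last if nothing aligned follows, po_other if no consistent pair exists.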
PhraseOrientation
find_po_fwd(vector<vector<ushort> >& a1,
vector<vector<ushort> >& a2,
size_t b1, size_t e1,
size_t b2, size_t e2)
{
size_t n2 = e2;
while (n2 < a2.size() && a2[n2].size() == 0) ++n2;
if (n2 == a2.size())
return po_last;
ushort ns1,ne1,ne2;
if (!expand_phrase_pair(a1,a2,n2,b1,e1,ns1,ne1,ne2))
{
return po_other;
}
if (ns1 >= e1)
{
for (ushort j = e1; j < ns1; ++j)
if (a1[j].size()) return po_jfwd;
return po_mono;
}
else
{
for (ushort j = ne1; j < b1; ++j)
if (a1[j].size()) return po_jbwd;
return po_swap;
}
}
PhraseOrientation
find_po_bwd(vector<vector<ushort> >& a1,
vector<vector<ushort> >& a2,
size_t b1, size_t e1,
size_t b2, size_t e2)
{
int p2 = b2-1;
while (p2 >= 0 && !a2[p2].size()) --p2;
if (p2 < 0) return po_first;
ushort ps1,pe1,pe2;
if (!expand_phrase_pair(a1,a2,p2,b1,e1,ps1,pe1,pe2))
return po_other;
if (pe1 < b1)
{
for (ushort j = pe1; j < b1; ++j)
if (a1[j].size()) return po_jfwd;
return po_mono;
}
else
{
for (ushort j = e1; j < ps1; ++j)
if (a1[j].size()) return po_jbwd;
return po_swap;
}
}
}
}


@@ -26,13 +26,13 @@
#include <boost/foreach.hpp>
#include <boost/thread.hpp>
#include "moses/generic/sorting/VectorIndexSorter.h"
#include "moses/generic/sampling/Sampling.h"
#include "moses/generic/file_io/ug_stream.h"
#include "moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h"
#include "moses/TranslationModel/UG/generic/sampling/Sampling.h"
#include "moses/TranslationModel/UG/generic/file_io/ug_stream.h"
#include "moses/Util.h"
#include "util/exception.hh"
#include "util/check.hh"
#include "headers-base/util/exception.hh"
#include "headers-base/util/check.hh"
#include "ug_typedefs.h"
#include "ug_mm_ttrack.h"
@@ -54,6 +54,29 @@ namespace Moses {
template<typename TKN> class Bitext;
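// Orientation classes for the distortion counts: po_first / po_last mark
// phrases with no aligned neighbor on the respective side; po_mono,
// po_swap, po_jfwd and po_jbwd are the four hierarchical reordering
// cases; po_other covers everything that cannot be classified.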
enum PhraseOrientation
{
po_first,
po_mono,
po_jfwd,
po_swap,
po_jbwd,
po_last,
po_other
};
PhraseOrientation
find_po_fwd(vector<vector<ushort> >& a1,
vector<vector<ushort> >& a2,
size_t b1, size_t e1,
size_t b2, size_t e2);
PhraseOrientation
find_po_bwd(vector<vector<ushort> >& a1,
vector<vector<ushort> >& a2,
size_t b1, size_t e1,
size_t b2, size_t e2);
template<typename sid_t, typename off_t, typename len_t>
void
parse_pid(uint64_t const pid, sid_t & sid,
@@ -79,6 +102,7 @@ namespace Moses {
float my_wcnt; // weighted count
uint32_t my_cnt2;
vector<pair<size_t, vector<uchar> > > my_aln;
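// distortion counts by orientation class (indexed po_first .. po_other)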
uint32_t ofwd[7], obwd[7];
public:
jstats();
jstats(jstats const& other);
@@ -87,7 +111,12 @@ namespace Moses {
float wcnt() const;
vector<pair<size_t, vector<uchar> > > const & aln() const;
void add(float w, vector<uchar> const& a, uint32_t const cnt2);
void add(float w, vector<uchar> const& a, uint32_t const cnt2,
uint32_t fwd_orient, uint32_t bwd_orient);
void invalidate();
bool valid();
uint32_t dcnt_fwd(PhraseOrientation const idx) const;
uint32_t dcnt_bwd(PhraseOrientation const idx) const;
};
struct
@ -101,14 +130,21 @@ namespace Moses {
size_t good; // number of selected instances with valid word alignments
size_t sum_pairs;
size_t in_progress; // keeps track of how many threads are currently working on this
uint32_t ofwd[po_other+1], obwd[po_other+1];
typename boost::unordered_map<uint64_t, jstats> trg;
pstats();
void release();
void register_worker();
size_t count_workers() { return in_progress; }
void add(uint64_t const pid, float const w,
vector<uchar> const& a, uint32_t const cnt2);
bool
add(uint64_t const pid,
float const w,
vector<uchar> const& a,
uint32_t const cnt2,
uint32_t fwd_o, uint32_t bwd_o);
};
class
@ -117,19 +153,34 @@ namespace Moses {
public:
uint64_t p1, p2;
uint32_t raw1,raw2,sample1,sample2,good1,good2,joint;
uint32_t mono,swap,left,right;
vector<float> fvals;
float dfwd[po_other+1];
float dbwd[po_other+1];
vector<uchar> aln;
// float avlex12,avlex21; // average lexical probs (Moses std)
// float znlex1,znlex2; // zens-ney lexical smoothing
// float colex1,colex2; // based on raw lexical occurrences
float score;
PhrasePair();
PhrasePair(PhrasePair const& o);
bool operator<(PhrasePair const& other) const;
bool operator>(PhrasePair const& other) const;
void init(uint64_t const pid1, pstats const& ps,
bool operator<=(PhrasePair const& other) const;
bool operator>=(PhrasePair const& other) const;
void init(uint64_t const pid1, pstats const& ps, size_t const numfeats);
void init(uint64_t const pid1, pstats const& ps1, pstats const& ps2,
size_t const numfeats);
void update(uint64_t const pid2, jstats const& js);
PhrasePair const&
update(uint64_t const pid2, jstats const& js);
PhrasePair const&
update(uint64_t const pid2, jstats const& js1, jstats const& js2);
PhrasePair const&
update(uint64_t const pid2, size_t const raw2extra, jstats const& js);
float eval(vector<float> const& w);
};
@ -144,10 +195,16 @@ namespace Moses {
virtual
void
operator()(Bitext<Token> const& pt, PhrasePair& pp) const = 0;
operator()(Bitext<Token> const& pt, PhrasePair& pp, vector<float> * dest)
const = 0;
int
fcnt() const { return num_feats; }
fcnt() const
{ return num_feats; }
int
getIndex() const
{ return index; }
};
template<typename Token>
@ -170,14 +227,17 @@ namespace Moses {
}
void
operator()(Bitext<Token> const& bt, PhrasePair& pp) const
operator()(Bitext<Token> const& bt,
PhrasePair & pp,
vector<float> * dest = NULL) const
{
if (!dest) dest = &pp.fvals;
if (pp.joint > pp.good1)
{
cerr << bt.toString(pp.p1,0) << " ::: " << bt.toString(pp.p2,1) << endl;
cerr << pp.joint << "/" << pp.good1 << "/" << pp.raw2 << endl;
}
pp.fvals[this->index] = log(lbop(pp.good1, pp.joint, conf));
(*dest)[this->index] = log(lbop(pp.good1, pp.joint, conf));
}
};
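// (lbop(total, successes, conf), defined outside this diff, apparently
// computes a confidence-based lower bound on the success probability, so
// phrase pairs with few samples are scored conservatively)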
@ -201,9 +261,10 @@ namespace Moses {
}
void
operator()(Bitext<Token> const& pt, PhrasePair& pp) const
operator()(Bitext<Token> const& bt, PhrasePair& pp, vector<float> * dest = NULL) const
{
pp.fvals[this->index] = log(lbop(max(pp.raw2,pp.joint), pp.joint, conf));
if (!dest) dest = &pp.fvals;
(*dest)[this->index] = log(lbop(max(pp.raw2,pp.joint), pp.joint, conf));
}
};
@ -211,8 +272,8 @@ namespace Moses {
class
PScoreLex : public PhraseScorer<Token>
{
LexicalPhraseScorer2<Token> scorer;
public:
LexicalPhraseScorer2<Token> scorer;
PScoreLex() { this->num_feats = 2; }
@ -225,8 +286,9 @@ namespace Moses {
}
void
operator()(Bitext<Token> const& bt, PhrasePair& pp) const
operator()(Bitext<Token> const& bt, PhrasePair& pp, vector<float> * dest = NULL) const
{
if (!dest) dest = &pp.fvals;
uint32_t sid1=0,sid2=0,off1=0,off2=0,len1=0,len2=0;
parse_pid(pp.p1, sid1, off1, len1);
parse_pid(pp.p2, sid2, off2, len2);
@ -248,8 +310,8 @@ namespace Moses {
#endif
scorer.score(bt.T1->sntStart(sid1)+off1,0,len1,
bt.T2->sntStart(sid2)+off2,0,len2,
pp.aln, pp.fvals[this->index],
pp.fvals[this->index+1]);
pp.aln, (*dest)[this->index],
(*dest)[this->index+1]);
}
};
@ -271,11 +333,12 @@ namespace Moses {
}
void
operator()(Bitext<Token> const& bt, PhrasePair& pp) const
operator()(Bitext<Token> const& bt, PhrasePair& pp, vector<float> * dest = NULL) const
{
if (!dest) dest = &pp.fvals;
uint32_t sid2=0,off2=0,len2=0;
parse_pid(pp.p2, sid2, off2, len2);
pp.fvals[this->index] = len2;
(*dest)[this->index] = len2;
}
};
@ -297,9 +360,10 @@ namespace Moses {
}
void
operator()(Bitext<Token> const& bt, PhrasePair& pp) const
operator()(Bitext<Token> const& bt, PhrasePair& pp, vector<float> * dest = NULL) const
{
pp.fvals[this->index] = 1;
if (!dest) dest = &pp.fvals;
(*dest)[this->index] = 1;
}
};
@ -307,6 +371,7 @@ namespace Moses {
template<typename TKN>
class Bitext
{
protected:
mutable boost::mutex lock;
public:
typedef TKN Token;
@ -322,13 +387,13 @@ namespace Moses {
// each other's way.
mutable sptr<agenda> ag;
sptr<Ttrack<char> > const Tx; // word alignments
sptr<Ttrack<Token> > const T1; // token track
sptr<Ttrack<Token> > const T2; // token track
sptr<TokenIndex> const V1; // vocab
sptr<TokenIndex> const V2; // vocab
sptr<TSA<Token> > const I1; // indices
sptr<TSA<Token> > const I2; // indices
sptr<Ttrack<char> > Tx; // word alignments
sptr<Ttrack<Token> > T1; // token track
sptr<Ttrack<Token> > T2; // token track
sptr<TokenIndex> V1; // vocab
sptr<TokenIndex> V2; // vocab
sptr<TSA<Token> > I1; // indices
sptr<TSA<Token> > I2; // indices
/// given the source phrase sid[start:stop]
// find the possible start (s1 .. s2) and end (e1 .. e2)
@ -339,14 +404,20 @@ namespace Moses {
find_trg_phr_bounds
(size_t const sid, size_t const start, size_t const stop,
size_t & s1, size_t & s2, size_t & e1, size_t & e2,
vector<uchar> * core_alignment, bool const flip) const;
int& po_fwd, int& po_bwd,
vector<uchar> * core_alignment,
bitvector* full_alignment,
bool const flip) const;
mutable boost::unordered_map<uint64_t,sptr<pstats> > cache1,cache2;
private:
protected:
size_t default_sample_size;
private:
sptr<pstats>
prep2(iter const& phrase, size_t const max_sample) const;
public:
Bitext(size_t const max_sample=5000);
Bitext(Ttrack<Token>* const t1,
Ttrack<Token>* const t2,
Ttrack<char>* const tx,
@ -358,6 +429,7 @@ namespace Moses {
virtual void open(string const base, string const L1, string const L2) = 0;
// sptr<pstats> lookup(Phrase const& phrase, size_t factor) const;
sptr<pstats> lookup(iter const& phrase) const;
sptr<pstats> lookup(iter const& phrase, size_t const max_sample) const;
void prep(iter const& phrase) const;
@ -407,6 +479,12 @@ namespace Moses {
}
}
template<typename Token>
Bitext<Token>::
Bitext(size_t const max_sample)
: default_sample_size(max_sample)
{ }
template<typename Token>
Bitext<Token>::
Bitext(Ttrack<Token>* const t1,
@ -557,16 +635,27 @@ namespace Moses {
{
j->stats->register_worker();
vector<uchar> aln;
bitvector full_alignment(100*100);
while (j->step(sid,offset))
{
aln.clear();
if (!ag.bt.find_trg_phr_bounds
(sid, offset, offset + j->len, s1, s2, e1, e2,
j->fwd?&aln:NULL, !j->fwd))
int po_fwd=5,po_bwd=5;
if (j->fwd)
{
if (!ag.bt.find_trg_phr_bounds
(sid,offset,offset+j->len,s1,s2,e1,e2,po_fwd,po_bwd,
&aln,&full_alignment,false))
continue;
}
else if (!ag.bt.find_trg_phr_bounds
(sid,offset,offset+j->len,s1,s2,e1,e2,po_fwd,po_bwd,
NULL,NULL,true))
continue;
j->stats->lock.lock();
j->stats->good += 1;
j->stats->sum_pairs += (s2-s1+1)*(e2-e1+1);
++j->stats->ofwd[po_fwd];
++j->stats->obwd[po_bwd];
j->stats->lock.unlock();
for (size_t k = j->fwd ? 1 : 0; k < aln.size(); k += 2)
aln[k] += s2 - s1;
@ -580,8 +669,21 @@ namespace Moses {
// assert(b);
for (size_t i = e1; i <= e2; ++i)
{
j->stats->add(b->getPid(),sample_weight,aln,b->approxOccurrenceCount());
if (!j->stats->add(b->getPid(),sample_weight,aln,
b->approxOccurrenceCount(),
po_fwd,po_bwd))
{
for (size_t z = 0; z < j->len; ++z)
{
id_type tid = ag.bt.T1->sntStart(sid)[offset+z].id();
cout << (*ag.bt.V1)[tid] << " ";
}
cout << endl;
for (size_t z = s; z <= i; ++z)
cout << (*ag.bt.V2)[(o+z)->id()] << " ";
cout << endl;
exit(1);
}
if (i < e2)
{
#ifndef NDEBUG
@ -734,59 +836,239 @@ namespace Moses {
this->V2->open(base+L2+".tdx"); this->V2->iniReverseIndex();
mmTSA<TKN>& i1 = *reinterpret_cast<mmTSA<TKN>*>(this->I1.get());
mmTSA<TKN>& i2 = *reinterpret_cast<mmTSA<TKN>*>(this->I2.get());
i1.open(base+L1+".sfa", this->T1.get());
i2.open(base+L2+".sfa", this->T2.get());
i1.open(base+L1+".sfa", this->T1);
i2.open(base+L2+".sfa", this->T2);
assert(this->T1->size() == this->T2->size());
}
template<typename TKN>
class imBitext : public Bitext<TKN>
{
sptr<imTtrack<char> > myTx;
sptr<imTtrack<TKN> > myT1;
sptr<imTtrack<TKN> > myT2;
sptr<imTSA<TKN> > myI1;
sptr<imTSA<TKN> > myI2;
public:
void open(string const base, string const L1, string L2);
imBitext();
imBitext(sptr<TokenIndex> const& V1,
sptr<TokenIndex> const& V2,
size_t max_sample = 5000);
imBitext(size_t max_sample = 5000);
imBitext(imBitext const& other);
// sptr<imBitext<TKN> >
// add(vector<TKN> const& s1, vector<TKN> const& s2, vector<ushort> & a);
sptr<imBitext<TKN> >
add(vector<string> const& s1,
vector<string> const& s2,
vector<string> const& a) const;
};
template<typename TKN>
imBitext<TKN>::
imBitext()
: Bitext<TKN>(new imTtrack<TKN>(),
new imTtrack<TKN>(),
new imTtrack<char>(),
new TokenIndex(),
new TokenIndex(),
new imTSA<TKN>(),
new imTSA<TKN>())
{}
imBitext(size_t max_sample)
{
this->default_sample_size = max_sample;
this->V1.reset(new TokenIndex());
this->V2.reset(new TokenIndex());
this->V1->setDynamic(true);
this->V2->setDynamic(true);
}
template<typename TKN>
imBitext<TKN>::
imBitext(sptr<TokenIndex> const& v1,
sptr<TokenIndex> const& v2,
size_t max_sample)
{
this->default_sample_size = max_sample;
this->V1 = v1;
this->V2 = v2;
this->V1->setDynamic(true);
this->V2->setDynamic(true);
}
template<typename TKN>
imBitext<TKN>::
imBitext(imBitext<TKN> const& other)
{
this->myTx = other.myTx;
this->myT1 = other.myT1;
this->myT2 = other.myT2;
this->myI1 = other.myI1;
this->myI2 = other.myI2;
this->Tx = this->myTx;
this->T1 = this->myT1;
this->T2 = this->myT2;
this->I1 = this->myI1;
this->I2 = this->myI2;
this->V1 = other.V1;
this->V2 = other.V2;
this->default_sample_size = other.default_sample_size;
}
template<typename TKN> class snt_adder;
template<> class snt_adder<L2R_Token<SimpleWordId> >;
template<>
class snt_adder<L2R_Token<SimpleWordId> >
{
typedef L2R_Token<SimpleWordId> TKN;
vector<string> const & snt;
TokenIndex & V;
sptr<imTtrack<TKN> > & track;
sptr<imTSA<TKN > > & index;
public:
snt_adder(vector<string> const& s, TokenIndex& v,
sptr<imTtrack<TKN> >& t, sptr<imTSA<TKN> >& i);
void operator()();
};
// template<typename TKN>
// void
// imBitext<TKN>::
// open(string const base, string const L1, string L2)
// class snt_adder
// {
// mmTtrack<TKN>& t1 = *reinterpret_cast<mmTtrack<TKN>*>(this->T1.get());
// mmTtrack<TKN>& t2 = *reinterpret_cast<mmTtrack<TKN>*>(this->T2.get());
// mmTtrack<char>& tx = *reinterpret_cast<mmTtrack<char>*>(this->Tx.get());
// t1.open(base+L1+".mct");
// t2.open(base+L2+".mct");
// tx.open(base+L1+"-"+L2+".mam");
// cerr << "DADA" << endl;
// this->V1->open(base+L1+".tdx"); this->V1->iniReverseIndex();
// this->V2->open(base+L2+".tdx"); this->V2->iniReverseIndex();
// mmTSA<TKN>& i1 = *reinterpret_cast<mmTSA<TKN>*>(this->I1.get());
// mmTSA<TKN>& i2 = *reinterpret_cast<mmTSA<TKN>*>(this->I2.get());
// i1.open(base+L1+".sfa", this->T1.get());
// i2.open(base+L2+".sfa", this->T2.get());
// assert(this->T1->size() == this->T2->size());
// vector<string> const & snt;
// TokenIndex & V;
// sptr<imTtrack<TKN> > & track;
// sptr<imTSA<TKN > > & index;
// public:
// snt_adder(vector<string> const& s, TokenIndex& v,
// sptr<imTtrack<TKN> >& t, sptr<imTSA<TKN> >& i);
// template<typename T>
// void operator()();
// };
// // template<>
// void
// snt_adder<L2R_Token<SimpleWordId> >::
// operator()();
// template<>
// void
// snt_adder<char>::
// operator()()
// {
// vector<id_type> sids;
// sids.reserve(snt.size());
// BOOST_FOREACH(string const& s, snt)
// {
// sids.push_back(track ? track->size() : 0);
// istringstream buf(s);
// string w;
// vector<char> s;
// s.reserve(100);
// while (buf >> w)
// s.push_back(vector<char>(V[w]));
// track = append(track,s);
// }
// index.reset(new imTSA<char>(*index,track,sids,V.tsize()));
// }
// template<typename TKN>
// snt_adder<TKN>::
// snt_adder(vector<string> const& s, TokenIndex& v,
// sptr<imTtrack<TKN> >& t, sptr<imTSA<TKN> >& i)
// : snt(s), V(v), track(t), index(i)
// {
// throw "Not implemented yet.";
// }
template<>
sptr<imBitext<L2R_Token<SimpleWordId> > >
imBitext<L2R_Token<SimpleWordId> >::
add(vector<string> const& s1,
vector<string> const& s2,
vector<string> const& aln) const;
template<typename TKN>
sptr<imBitext<TKN> >
imBitext<TKN>::
add(vector<string> const& s1,
vector<string> const& s2,
vector<string> const& aln) const
{
throw "Not yet implemented";
}
// template<typename TKN>
// sptr<imBitext<TKN> >
// imBitext<TKN>::
// add(vector<TKN> const& s1, vector<TKN> const& s2, vector<ushort> & a)
// {
// boost::lock_guard<boost::mutex> guard(this->lock);
// sptr<imBitext<TKN> > ret(new imBitext<TKN>());
// vector<id_type> sids(1,this->myT1.size()-1);
// ret->myT1 = add(this->myT1,s1);
// ret->myT2 = add(this->myT2,s2);
// size_t v1size = this->V1.tsize();
// size_t v2size = this->V2.tsize();
// BOOST_FOREACH(TKN const& t, s1) { if (t->id() >= v1size) v1size = t->id() + 1; }
// BOOST_FOREACH(TKN const& t, s2) { if (t->id() >= v2size) v2size = t->id() + 1; }
// ret->myI1.reset(new imTSA<TKN>(*this->I1,ret->myT1,sids,v1size));
// ret->myI2.reset(new imTSA<TKN>(*this->I2,ret->myT2,sids,v2size));
// ostringstream abuf;
// BOOST_FOREACH(ushort x, a) binwrite(abuf,x);
// vector<char> foo(abuf.str().begin(),abuf.str().end());
// ret->myTx = add(this->myTx,foo);
// ret->T1 = ret->myT1;
// ret->T2 = ret->myT2;
// ret->Tx = ret->myTx;
// ret->I1 = ret->myI1;
// ret->I2 = ret->myI2;
// ret->V1 = this->V1;
// ret->V2 = this->V2;
// return ret;
// }
// template<typename TKN>
// imBitext<TKN>::
// imBitext()
// : Bitext<TKN>(new imTtrack<TKN>(),
// new imTtrack<TKN>(),
// new imTtrack<char>(),
// new TokenIndex(),
// new TokenIndex(),
// new imTSA<TKN>(),
// new imTSA<TKN>())
// {}
template<typename TKN>
void
imBitext<TKN>::
open(string const base, string const L1, string L2)
{
mmTtrack<TKN>& t1 = *reinterpret_cast<mmTtrack<TKN>*>(this->T1.get());
mmTtrack<TKN>& t2 = *reinterpret_cast<mmTtrack<TKN>*>(this->T2.get());
mmTtrack<char>& tx = *reinterpret_cast<mmTtrack<char>*>(this->Tx.get());
t1.open(base+L1+".mct");
t2.open(base+L2+".mct");
tx.open(base+L1+"-"+L2+".mam");
cerr << "DADA" << endl;
this->V1->open(base+L1+".tdx"); this->V1->iniReverseIndex();
this->V2->open(base+L2+".tdx"); this->V2->iniReverseIndex();
mmTSA<TKN>& i1 = *reinterpret_cast<mmTSA<TKN>*>(this->I1.get());
mmTSA<TKN>& i2 = *reinterpret_cast<mmTSA<TKN>*>(this->I2.get());
i1.open(base+L1+".sfa", this->T1);
i2.open(base+L2+".sfa", this->T2);
assert(this->T1->size() == this->T2->size());
}
template<typename Token>
bool
Bitext<Token>::
find_trg_phr_bounds(size_t const sid, size_t const start, size_t const stop,
size_t & s1, size_t & s2, size_t & e1, size_t & e2,
vector<uchar>* core_alignment, bool const flip) const
int & po_fwd, int & po_bwd,
vector<uchar>* core_alignment,
bitvector* full_alignment,
bool const flip) const
{
// if (core_alignment) cout << "HAVE CORE ALIGNMENT" << endl;
// a word on the core_alignment:
@ -795,10 +1077,18 @@ namespace Moses {
// it is up to the calling function to shift alignment points over for start positions
// of extracted phrases that start with a fringe word
bitvector forbidden((flip ? T1 : T2)->sntLen(sid));
size_t slen1 = (*T1).sntLen(sid);
size_t slen2 = (*T2).sntLen(sid);
if (full_alignment)
{
if (slen1*slen2 > full_alignment->size())
full_alignment->resize(slen1*slen2*2);
full_alignment->reset();
}
size_t src,trg;
size_t lft = forbidden.size();
size_t rgt = 0;
vector<vector<ushort> > aln((*T1).sntLen(sid));
vector<vector<ushort> > aln1(slen1),aln2(slen2);
char const* p = Tx->sntStart(sid);
char const* x = Tx->sntEnd(sid);
@ -814,11 +1104,24 @@ namespace Moses {
{
lft = min(lft,trg);
rgt = max(rgt,trg);
if (core_alignment)
}
if (core_alignment)
{
if (flip)
{
if (flip) aln[trg].push_back(src);
else aln[src].push_back(trg);
aln1[trg].push_back(src);
aln2[src].push_back(trg);
}
else
{
aln1[src].push_back(trg);
aln2[trg].push_back(src);
}
}
if (full_alignment)
{
if (flip) full_alignment->set(trg*slen2 + src);
else full_alignment->set(src*slen2 + trg);
}
}
@ -837,8 +1140,8 @@ namespace Moses {
{
for (size_t i = lft; i <= rgt; ++i)
{
sort(aln[i].begin(),aln[i].end());
BOOST_FOREACH(ushort x, aln[i])
sort(aln1[i].begin(),aln1[i].end());
BOOST_FOREACH(ushort x, aln1[i])
{
core_alignment->push_back(i-lft);
core_alignment->push_back(x-start);
@ -849,14 +1152,25 @@ namespace Moses {
{
for (size_t i = start; i < stop; ++i)
{
BOOST_FOREACH(ushort x, aln[i])
BOOST_FOREACH(ushort x, aln1[i])
{
core_alignment->push_back(i-start);
core_alignment->push_back(x-lft);
}
}
}
// now determine fwd and bwd phrase orientation
if (flip)
{
po_fwd = find_po_fwd(aln2,aln1,start,stop,s1,e2);
po_bwd = find_po_bwd(aln2,aln1,start,stop,s1,e2);
}
else
{
po_fwd = find_po_fwd(aln1,aln2,start,stop,s1,e2);
po_bwd = find_po_bwd(aln1,aln2,start,stop,s1,e2);
}
#if 0
// if (e1 - s1 > 3)
{
@ -898,17 +1212,14 @@ namespace Moses {
template<typename Token>
sptr<pstats>
Bitext<Token>::
prep2(iter const& phrase, size_t const max_sample) const
prep2(iter const& phrase, size_t const max_sample) const
{
// boost::lock_guard<boost::mutex>(this->lock);
if (!ag)
{
// boost::lock_guard<boost::mutex>(this->lock);
if (!ag)
{
ag.reset(new agenda(*this));
ag->add_workers(20);
}
ag.reset(new agenda(*this));
// ag->add_workers(1);
ag->add_workers(20);
}
typedef boost::unordered_map<uint64_t,sptr<pstats> > pcache_t;
sptr<pstats> ret;
@ -928,7 +1239,7 @@ namespace Moses {
else ret = ag->add_job(phrase, max_sample);
return ret;
}
template<typename Token>
sptr<pstats>
Bitext<Token>::

View File

@ -11,6 +11,7 @@
#include <boost/iostreams/device/mapped_file.hpp>
#include <boost/shared_ptr.hpp>
#include <boost/dynamic_bitset.hpp>
#include <boost/foreach.hpp>
#include "tpt_tightindex.h"
#include "tpt_tokenindex.h"
@ -20,13 +21,17 @@
namespace ugdiss
{
using namespace std;
using namespace boost;
namespace bio=boost::iostreams;
//-----------------------------------------------------------------------
// template<typename TOKEN> class imBitext<TOKEN>;
//-----------------------------------------------------------------------
template<typename TOKEN>
class imTSA : public TSA<TOKEN>
{
typedef typename Ttrack<TOKEN>::Position cpos;
// friend class imBitext<TOKEN>;
public:
class tree_iterator;
friend class tree_iterator;
@ -35,7 +40,6 @@ namespace ugdiss
vector<cpos> sufa; // stores the actual array
vector<filepos_type> index; /* top-level index into regions in sufa
* (for faster access) */
private:
char const*
index_jump(char const* a, char const* z, float ratio) const;
@ -48,8 +52,14 @@ namespace ugdiss
public:
imTSA();
imTSA(Ttrack<TOKEN> const* c, bdBitset const& filt, ostream* log = NULL);
imTSA(shared_ptr<Ttrack<TOKEN> const> c,
bdBitset const* filt,
ostream* log = NULL);
imTSA(imTSA<TOKEN> const& prior,
shared_ptr<imTtrack<TOKEN> const> const& crp,
vector<id_type> const& newsids, size_t const vsize);
count_type
sntCnt(char const* p, char const * const q) const;
@ -78,6 +88,9 @@ namespace ugdiss
void
save_as_mm_tsa(string fname) const;
/// add a sentence to the database
// shared_ptr<imTSA<TOKEN> > add(vector<TOKEN> const& snt) const;
};
template<typename TOKEN>
@ -115,12 +128,11 @@ namespace ugdiss
imTSA<TOKEN>::
imTSA()
{
this->corpus = NULL;
this->indexSize = 0;
this->data = NULL;
this->indexSize = 0;
// this->data = NULL;
this->startArray = NULL;
this->endArray = NULL;
this->corpusSize=0;
this->endArray = NULL;
this->corpusSize = 0;
this->BitSetCachingThreshold=4096;
};
@ -128,11 +140,17 @@ namespace ugdiss
// specified in filter
template<typename TOKEN>
imTSA<TOKEN>::
imTSA(Ttrack<TOKEN> const* c, bdBitset const& filter, ostream* log)
imTSA(shared_ptr<Ttrack<TOKEN> const> c, bdBitset const* filter, ostream* log)
{
assert(c);
this->corpus = c;
bdBitset filter2;
if (!filter)
{
filter2.resize(c->size());
filter2.set();
filter = &filter2;
}
// In the first iteration over the corpus, we obtain word counts.
// They allow us to
// a. allocate the exact amount of memory we need
@ -160,9 +178,9 @@ namespace ugdiss
// Now dump all token positions into the right place in sufa
this->corpusSize = 0;
for (id_type sid = filter.find_first();
sid < filter.size();
sid = filter.find_next(sid))
for (id_type sid = filter->find_first();
sid < filter->size();
sid = filter->find_next(sid))
{
TOKEN const* k = c->sntStart(sid);
TOKEN const* const stop = c->sntEnd(sid);
@ -181,7 +199,7 @@ namespace ugdiss
// Now sort the array
if (log) *log << "sorting ...." << endl;
index.resize(wcnt.size()+1,0);
typename ttrack::Position::LESS<Ttrack<TOKEN> > sorter(c);
typename ttrack::Position::LESS<Ttrack<TOKEN> > sorter(c.get());
for (size_t i = 0; i < wcnt.size(); i++)
{
if (log && wcnt[i] > 5000)
@ -284,7 +302,7 @@ namespace ugdiss
getCounts(char const* p, char const* const q,
count_type& sids, count_type& raw) const
{
id_type sid; uint16_t off;
id_type sid; // uint16_t off;
bdBitset check(this->corpus->size());
cpos const* xp = reinterpret_cast<cpos const*>(p);
cpos const* xq = reinterpret_cast<cpos const*>(q);
@ -292,7 +310,7 @@ namespace ugdiss
for (;xp < xq;xp++)
{
sid = xp->sid;
off = xp->offset;
// off = xp->offset;
check.set(sid);
}
sids = check.count();
@ -323,8 +341,92 @@ namespace ugdiss
for (size_t i = 0; i < mmIndex.size(); i++)
numwrite(out,mmIndex[i]-mmIndex[0]);
out.seekp(0);
numwrite(out,idxStart);
numwrite(out,idxStart);
out.close();
}
template<typename TOKEN>
imTSA<TOKEN>::
imTSA(imTSA<TOKEN> const& prior,
shared_ptr<imTtrack<TOKEN> const> const& crp,
vector<id_type> const& newsids, size_t const vsize)
{
typename ttrack::Position::LESS<Ttrack<TOKEN> > sorter(crp.get());
// count how many tokens will be added to the TSA
// and index the new additions to the corpus
size_t newToks = 0;
BOOST_FOREACH(id_type sid, newsids)
newToks += crp->sntLen(sid);
vector<cpos> nidx(newToks); // new array entries
size_t n = 0;
BOOST_FOREACH(id_type sid, newsids)
{
for (size_t o = 0; o < (*crp)[sid].size(); ++o, ++n)
{ nidx[n].offset = o; nidx[n].sid = sid; }
}
sort(nidx.begin(),nidx.end(),sorter);
// create the new suffix array
this->numTokens = newToks + prior.sufa.size();
this->sufa.resize(this->numTokens);
this->startArray = reinterpret_cast<char const*>(&(*this->sufa.begin()));
this->endArray = reinterpret_cast<char const*>(&(*this->sufa.end()));
this->corpusSize = crp->size();
this->corpus = crp;
this->index.resize(vsize+1);
size_t i = 0;
typename vector<cpos>::iterator k = this->sufa.begin();
this->index[0] = 0;
for (size_t n = 0; n < nidx.size();)
{
id_type nid = crp->getToken(nidx[n])->id();
assert(nid >= i);
while (i < nid)
{
if (++i < prior.index.size() && prior.index[i-1] < prior.index[i])
{
k = copy(prior.sufa.begin() + prior.index[i-1],
prior.sufa.begin() + prior.index[i], k);
}
this->index[i] = k - this->sufa.begin();
}
if (++i < prior.index.size() && prior.index[i] > prior.index[i-1])
{
size_t j = prior.index[i-1];
while (j < prior.index[i] && n < nidx.size()
&& crp->getToken(nidx[n])->id() < i)
{
assert(k < this->sufa.end());
if (sorter(prior.sufa[j],nidx[n]))
*k++ = prior.sufa[j++];
else
*k++ = nidx[n++];
}
while (j < prior.index[i])
{
assert(k < this->sufa.end());
*k++ = prior.sufa[j++];
}
}
while (n < nidx.size() && this->corpus->getToken(nidx[n])->id() < i)
{
assert(k < this->sufa.end());
*k++ = nidx[n++];
}
this->index[i] = k - this->sufa.begin();
}
while (++i < this->index.size())
{
if (i < prior.index.size() && prior.index[i-1] < prior.index[i])
k = copy(prior.sufa.begin() + prior.index[i-1],
prior.sufa.begin() + prior.index[i], k);
this->index[i] = k - this->sufa.begin();
}
}
}
#endif

View File

@ -1,4 +1,4 @@
// -*- c++-mode -*-
// -*- c++ -*-
// In-memory corpus track
// (c) 2006-2012 Ulrich Germann.
@ -10,6 +10,7 @@
#include <boost/shared_ptr.hpp>
#include <boost/unordered_map.hpp>
#include <boost/foreach.hpp>
#include "tpt_typedefs.h"
#include "tpt_tokenindex.h"
@ -17,24 +18,44 @@
#include "tpt_tokenindex.h"
// #include "ug_vocab.h"
// define the corpus buffer size (in sentences) and the increment size
// for adding additional sentences:
#define IMTTRACK_INCREMENT_SIZE 100000
#define IMTSA_INCREMENT_SIZE 1000000
namespace ugdiss
{
using namespace std;
using namespace boost;
namespace bio=boost::iostreams;
template<typename Token=id_type>
template<typename Token> class imTSA;
template<typename Token> class imTtrack;
template<typename TOKEN>
typename boost::shared_ptr<imTtrack<TOKEN> >
append(typename boost::shared_ptr<imTtrack<TOKEN> > const & crp, vector<TOKEN> const & snt);
template<typename Token>
class imTtrack : public Ttrack<Token>
{
private:
size_t numToks;
boost::shared_ptr<vector<vector<Token> > > myData; // pointer to corpus data
friend class imTSA<Token>;
friend
typename boost::shared_ptr<imTtrack<Token> >
append<Token>(typename boost::shared_ptr<imTtrack<Token> > const & crp, vector<Token> const & snt);
public:
imTtrack(boost::shared_ptr<vector<vector<Token> > > const& d);
imTtrack(istream& in, TokenIndex const& V, ostream* log);
imTtrack();
imTtrack(size_t reserve = 0);
// imTtrack(istream& in, Vocab& V);
/** return pointer to beginning of sentence */
Token const* sntStart(size_t sid) const;
@ -65,7 +86,7 @@ namespace ugdiss
{
assert(sid < size());
if ((*myData)[sid].size() == 0) return NULL;
return &(*myData)[sid].back();
return &(*myData)[sid].back()+1;
}
template<typename Token>
@ -76,7 +97,7 @@ namespace ugdiss
// we assume that myIndex has pointers to both the beginning of the
// first sentence and the end point of the last, so there's one more
// offset in the myIndex than there are sentences
return myData.size();
return myData->size();
}
template<typename Token>
@ -113,9 +134,10 @@ namespace ugdiss
template<typename Token>
imTtrack<Token>::
imTtrack()
imTtrack(size_t reserve)
{
myData.reset(new vector<vector<Token> >());
if (reserve) myData->reserve(reserve);
}
template<typename Token>
@ -123,8 +145,11 @@ namespace ugdiss
imTtrack(boost::shared_ptr<vector<vector<Token> > > const& d)
{
myData = d;
numToks = 0;
BOOST_FOREACH(vector<Token> const& v, *d)
numToks += v.size();
}
template<typename Token>
id_type
imTtrack<Token>::
@ -141,5 +166,27 @@ namespace ugdiss
return i;
}
/// add a sentence to the database
template<typename TOKEN>
shared_ptr<imTtrack<TOKEN> >
append(shared_ptr<imTtrack<TOKEN> > const& crp, vector<TOKEN> const & snt)
{
shared_ptr<imTtrack<TOKEN> > ret;
if (!crp)
{
ret.reset(new imTtrack<TOKEN>());
ret->myData->reserve(IMTTRACK_INCREMENT_SIZE);
}
else if (crp->myData->capacity() == crp->size())
{
ret.reset(new imTtrack<TOKEN>());
ret->myData->reserve(crp->size() + IMTTRACK_INCREMENT_SIZE);
ret->myData->insert(ret->myData->end(),
crp->myData->begin(),crp->myData->end());
}
else ret = crp;
ret->myData->push_back(snt);
return ret;
}
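// Usage sketch (hypothetical caller, not part of this commit): append()
// returns a fresh track whenever the current buffer is full, so callers
// must always continue with the returned pointer.
template<typename TOKEN>
shared_ptr<imTtrack<TOKEN> >
add_sentences(shared_ptr<imTtrack<TOKEN> > trk,
vector<vector<TOKEN> > const& sents)
{
BOOST_FOREACH(vector<TOKEN> const& snt, sents)
trk = append(trk, snt); // reassign: trk may now be a new track
return trk;
}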
}
#endif

View File

@ -5,7 +5,7 @@
#ifndef __ug_lexical_phrase_scorer_h
#define __ug_lexical_phrase_scorer_h
#include "moses/generic/file_io/ug_stream.h"
#include "moses/TranslationModel/UG/generic/file_io/ug_stream.h"
#include "tpt_tokenindex.h"
#include <string>
#include <boost/unordered_map.hpp>
@ -19,9 +19,9 @@ namespace ugdiss
class
LexicalPhraseScorer2
{
public:
typedef mm2dTable<id_type,id_type,uint32_t,uint32_t> table_t;
table_t COOC;
public:
void open(string const& fname);
template<typename someint>
@ -96,8 +96,8 @@ namespace ugdiss
{
if (COOC.m1(s) == 0 || COOC.m2(t) == 0) return 1.0;
// if (!COOC[s][t]) cout << s << " " << t << endl;
assert(COOC[s][t]);
return float(COOC[s][t])/COOC.m1(s);
// assert(COOC[s][t]);
return float(COOC[s][t]+1)/(COOC.m1(s)+1);
}
template<typename TKN>
@ -106,8 +106,8 @@ namespace ugdiss
plup_bwd(id_type const s, id_type const t) const
{
if (COOC.m1(s) == 0 || COOC.m2(t) == 0) return 1.0;
assert(COOC[s][t]);
return float(COOC[s][t])/COOC.m2(t);
// assert(COOC[s][t]);
return float(COOC[s][t]+1)/(COOC.m2(t)+1);
}
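// (both lookups now apply add-one smoothing, (count+1)/(marginal+1), so an
// unseen word pair gets a small non-zero probability instead of tripping
// the former assert)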
template<typename TKN>

View File

@ -44,7 +44,7 @@ namespace ugdiss
public:
mmTSA();
mmTSA(string fname, Ttrack<TOKEN> const* c);
void open(string fname, Ttrack<TOKEN> const* c);
void open(string fname, typename boost::shared_ptr<Ttrack<TOKEN> const> c);
count_type
sntCnt(char const* p, char const * const q) const;
@ -100,7 +100,6 @@ namespace ugdiss
mmTSA<TOKEN>::
mmTSA()
{
this->corpus = NULL;
this->startArray = NULL;
this->endArray = NULL;
this->BitSetCachingThreshold=4096;
@ -120,7 +119,7 @@ namespace ugdiss
template<typename TOKEN>
void
mmTSA<TOKEN>::
open(string fname, Ttrack<TOKEN> const* c)
open(string fname, typename boost::shared_ptr<Ttrack<TOKEN> const> c)
{
this->bsc.reset(new BitSetCache<TSA<TOKEN> >(this));
if (access(fname.c_str(),F_OK))

View File

@ -34,6 +34,8 @@ namespace ugdiss
typedef TKN Token;
private:
id_type numSent;
id_type numWords;
bio::mapped_file_source file;
Token const* data; // pointer to first word of first sentence
id_type const* index; /* pointer to index (change data type for corpora

View File

@ -20,9 +20,9 @@
#include <boost/foreach.hpp>
#include <boost/thread.hpp>
#include "moses/generic/sorting/VectorIndexSorter.h"
#include "moses/generic/sampling/Sampling.h"
#include "moses/generic/file_io/ug_stream.h"
#include "moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h"
#include "moses/TranslationModel/UG/generic/sampling/Sampling.h"
#include "moses/TranslationModel/UG/generic/file_io/ug_stream.h"
#include "ug_typedefs.h"
#include "ug_mm_ttrack.h"

View File

@ -1,6 +1,6 @@
#include "ug_tsa_array_entry.h"
#include "ug_ttrack_position.h"
#include "moses/generic/sampling/Sampling.h"
#include "moses/TranslationModel/UG/generic/sampling/Sampling.h"
// (c) 2007-2010 Ulrich Germann

View File

@ -8,9 +8,10 @@
#include <string>
#include <boost/iostreams/device/mapped_file.hpp>
#include <boost/shared_ptr.hpp>
#include "tpt_tokenindex.h"
#include "ug_ttrack_base.h"
#include "ug_im_ttrack.h"
#include "ug_corpus_token.h"
#include "ug_tsa_tree_iterator.h"
#include "ug_tsa_array_entry.h"
@ -44,7 +45,6 @@ namespace ugdiss
template<typename TKN>
class TSA
{
public:
virtual ~TSA() {};
typedef TSA_tree_iterator<TKN> tree_iterator;
@ -62,9 +62,9 @@ namespace ugdiss
friend class TSA_tree_iterator<TKN>;
protected:
Ttrack<TKN> const* corpus; // pointer to the underlying corpus
char const* startArray; // beginning ...
char const* endArray; // ... and end ...
shared_ptr<Ttrack<TKN> const> corpus; // pointer to the underlying corpus
char const* startArray; // beginning ...
char const* endArray; // ... and end ...
// of memory block storing the actual TSA
size_t corpusSize;
@ -737,7 +737,7 @@ namespace ugdiss
TSA<TKN>::
getCorpus() const
{
return corpus;
return corpus.get();
}
//---------------------------------------------------------------------------

View File

@ -19,21 +19,25 @@ namespace ugdiss
template<typename T>
void display(T const* x, string label)
{
cout << label << ":"; for (;x;x=next(x)) cout << " " << x->lemma; cout << endl;
cout << label << ":";
for (;x;x=next(x)) cout << " " << x->lemma;
cout << endl;
}
#endif
template<typename T> class TSA;
// CLASS DEFINITION
// The TSA_tree_iterator allows traversal of a Token Sequence Array as if it was a trie.
// The TSA_tree_iterator allows traversal of a Token Sequence Array
// as if it was a trie.
//
// down(): go to first child
// over(): go to next sibling
// up(): go to parent
// extend(id): go to a specific child node
// all four functions return true if successful, false otherwise
// lower_bound() and upper_bound() give the range of entries in the array covered by the
// "virtual trie node".
// lower_bound() and upper_bound() give the range of entries in the
// array covered by the "virtual trie node".
template<typename TKN>
class
TSA_tree_iterator
@ -50,12 +54,16 @@ namespace ugdiss
virtual ~TSA_tree_iterator() {};
TSA<Token> const* root;
// TO BE DONE: make the pointer private and add a const function to return the pointer
// TO BE DONE: make the pointer private and add a const function
// to return the pointer
// TSA_tree_iterator(TSA_tree_iterator const& other);
TSA_tree_iterator(TSA<Token> const* s);
// TSA_tree_iterator(TSA<Token> const* s, Token const& t);
// TSA_tree_iterator(TSA<Token> const* s, Token const* kstart, Token const* kend);
TSA_tree_iterator(TSA<Token> const* s,
Token const* kstart,
Token const* kend,
bool full_match_only=true);
// TSA_tree_iterator(TSA<Token> const* s,
// TokenIndex const& V,
// string const& key);
@ -354,21 +362,24 @@ namespace ugdiss
// ---------------------------------------------------------------------------
#endif
template<typename Token>
TSA_tree_iterator<Token>::
TSA_tree_iterator(TSA<Token> const* s, Token const* kstart, Token const* kend)
TSA_tree_iterator(TSA<Token> const* s, Token const* kstart,
Token const* kend, bool full_match_only)
: root(s)
{
for (;kstart != kend; kstart = kstart->next())
if (!extend(*kstart))
break;
if (kstart != kend)
if (full_match_only && kstart != kend)
{
lower.clear();
upper.clear();
}
};
#endif
// ---------------------------------------------------------------------------
// EXTEND
// ---------------------------------------------------------------------------
@ -449,6 +460,7 @@ namespace ugdiss
TSA_tree_iterator<Token>::
getPid(int p) const
{
if (this->size() == 0) return 0;
if (p < 0) p += upper.size();
char const* lb = lower_bound(p);
char const* ub = upper_bound(p);
@ -845,8 +857,9 @@ namespace ugdiss
size_t m=0; // number of samples selected so far
typename Token::ArrayEntry I(lower.at(level));
char const* stop = upper.at(level);
while (m < N && I.next < stop)
while (m < N && (I.next) < stop)
{
root->readEntry(I.next,I);
@ -860,9 +873,9 @@ namespace ugdiss
}
}
ret->resize(m);
return ret;
}
} // end of namespace ugdiss
#endif

View File

@ -28,10 +28,6 @@ namespace ugdiss
template<typename TKN=id_type>
class Ttrack
{
protected:
id_type numSent;
id_type numWords;
public:
virtual ~Ttrack() {};
@ -92,13 +88,15 @@ namespace ugdiss
* Currently only defined for Ttrack<id_type> */
string str(id_type sid, TokenIndex const& T) const;
string pid2str(TokenIndex const* V, uint64_t pid) const;
// /** @return string representation of sentence /sid/
// * Currently only defined for Ttrack<id_type> */
// string str(id_type sid, Vocab const& V) const;
/** counts the tokens in the corpus; used for example in the construction of
* token sequence arrays */
count_type count_tokens(vector<count_type>& cnt, bdBitset const& filter,
count_type count_tokens(vector<count_type>& cnt, bdBitset const* filter,
int lengthCutoff=0, ostream* log=NULL) const;
// static id_type toID(TKN const& t);
@ -145,16 +143,27 @@ namespace ugdiss
template<typename TKN>
count_type
Ttrack<TKN>::
count_tokens(vector<count_type>& cnt, bdBitset const& filter,
int lengthCutoff, ostream* log) const
count_tokens(vector<count_type>& cnt, bdBitset const* filter,
int lengthCutoff, ostream* log) const
{
bdBitset filter2;
if (!filter)
{
filter2.resize(this->size());
filter2.set();
filter = &filter2;
}
cnt.clear();
cnt.reserve(500000);
count_type totalCount=0;
int64_t expectedTotal=numTokens();
for (size_t sid = filter.find_first();
sid < filter.size();
sid = filter.find_next(sid))
int64_t expectedTotal=0;
for (size_t sid = 0; sid < this->size(); ++sid)
expectedTotal += this->sntLen(sid);
for (size_t sid = filter->find_first();
sid < filter->size();
sid = filter->find_next(sid))
{
TKN const* k = sntStart(sid);
TKN const* const stop = sntEnd(sid);
@ -177,7 +186,7 @@ namespace ugdiss
}
}
}
if (this->size() == filter.count())
if (this->size() == filter->count())
{
if (totalCount != expectedTotal)
cerr << "OOPS: expected " << expectedTotal
@ -344,5 +353,36 @@ namespace ugdiss
return Position(this->size(),0);
}
template<typename TKN>
string
Ttrack<TKN>::
pid2str(TokenIndex const* V, uint64_t pid) const
{
uint32_t len = pid % (1<<16);
pid >>= 16;
uint32_t off = pid % (1<<16);
uint32_t sid = pid>>16;
ostringstream buf;
TKN const* t = sntStart(sid) + off;
TKN const* stop = t + len;
if (V)
{
while (t < stop)
{
buf << (*V)[t->id()];
if ((t = t->next()) != stop) buf << " ";
}
}
else
{
while (t < stop)
{
buf << t->id();
if ((t = t->next()) != stop) buf << " ";
}
}
return buf.str();
}
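// The decoding above implies a packed 64-bit layout: sid in the high 32
// bits, offset in the next 16, length in the low 16. A minimal sketch of
// the matching encoder (hypothetical helper; the commit's own encoder is
// not shown in this diff):
inline uint64_t
make_pid(uint64_t sid, uint64_t off, uint64_t len)
{
return (sid << 32) | ((off & 0xffff) << 16) | (len & 0xffff);
}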
}
#endif

View File

@ -23,17 +23,19 @@ namespace Moses
params[t.substr(i,j)] = t.substr(k);
}
}
#if 0
Mmsapt::
Mmsapt(string const& description, string const& line)
: PhraseDictionary(description,line)
: PhraseDictionary(description,line), ofactor(1,0)
{
this->init(line);
}
#endif
Mmsapt::
Mmsapt(string const& line)
: PhraseDictionary("Mmsapt",line)
// : PhraseDictionary("Mmsapt",line), ofactor(1,0)
: PhraseDictionary(line), ofactor(1,0)
{
this->init(line);
}
@ -59,80 +61,409 @@ namespace Moses
// num_features = 0;
m = param.find("ifactor");
input_factor = m != param.end() ? atoi(m->second.c_str()) : 0;
poolCounts = true;
}
void
Mmsapt::
Load()
{
bt.open(bname, L1, L2);
btfix.open(bname, L1, L2);
size_t num_feats;
num_feats = calc_pfwd.init(0,lbop_parameter);
num_feats = calc_pbwd.init(num_feats,lbop_parameter);
num_feats = calc_lex.init(num_feats, bname + L1 + "-" + L2 + ".lex");
num_feats = apply_pp.init(num_feats);
assert (num_feats == this->m_numScoreComponents);
// TO DO: should we use different lbop parameters
// for the relative-frequency based features?
num_feats = calc_pfwd_fix.init(0,lbop_parameter);
num_feats = calc_pbwd_fix.init(num_feats,lbop_parameter);
num_feats = calc_lex.init(num_feats, bname + L1 + "-" + L2 + ".lex");
num_feats = apply_pp.init(num_feats);
if (num_feats < this->m_numScoreComponents)
{
poolCounts = false;
num_feats = calc_pfwd_dyn.init(num_feats,lbop_parameter);
num_feats = calc_pbwd_dyn.init(num_feats,lbop_parameter);
}
btdyn.reset(new imBitext<Token>(btfix.V1, btfix.V2));
if (num_feats != this->m_numScoreComponents)
{
ostringstream buf;
buf << "At " << __FILE__ << ":" << __LINE__
<< ": number of feature values provided by Phrase table"
<< " does not match number specified in Moses config file!";
throw buf.str();
}
// cerr << "MMSAPT provides " << num_feats << " features at "
// << __FILE__ << ":" << __LINE__ << endl;
LexicalPhraseScorer2<Token>::table_t & COOC = calc_lex.scorer.COOC;
typedef LexicalPhraseScorer2<Token>::table_t::Cell cell_t;
wlex21.resize(COOC.numCols);
for (size_t r = 0; r < COOC.numRows; ++r)
for (cell_t const* c = COOC[r].start; c < COOC[r].stop; ++c)
wlex21[c->id].push_back(r);
COOCraw.open(bname + L1 + "-" + L2 + ".coc");
}
void
Mmsapt::
add(string const& s1, string const& s2, string const& a)
{
vector<string> S1(1,s1);
vector<string> S2(1,s2);
vector<string> ALN(1,a);
boost::lock_guard<boost::mutex> guard(this->lock);
btdyn = btdyn->add(S1,S2,ALN);
}
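// Usage sketch (hypothetical; the alignment string is assumed to use the
// symal-style "srcpos-trgpos" format consumed elsewhere in this code):
//
//   Mmsapt pt(line); // line = the Moses config line for this feature
//   pt.Load();
//   pt.add("ein kleines haus", "a small house", "0-0 1-1 2-2");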
TargetPhrase*
Mmsapt::
createTargetPhrase(Phrase const& src,
Bitext<Token> const& bt,
PhrasePair const& pp) const
{
Word w; uint32_t sid,off,len;
TargetPhrase* tp = new TargetPhrase();
parse_pid(pp.p2, sid, off, len);
Token const* x = bt.T2->sntStart(sid) + off;
for (uint32_t k = 0; k < len; ++k)
{
StringPiece wrd = (*bt.V2)[x[k].id()];
w.CreateFromString(Output,ofactor,wrd,false);
tp->AddWord(w);
}
tp->GetScoreBreakdown().Assign(this, pp.fvals);
tp->Evaluate(src);
return tp;
}
// process phrase stats from a single parallel corpus
void
Mmsapt::
process_pstats
(Phrase const& src,
uint64_t const pid1,
pstats const& stats,
Bitext<Token> const & bt,
TargetPhraseCollection* tpcoll
) const
{
PhrasePair pp;
pp.init(pid1, stats, this->m_numScoreComponents);
apply_pp(bt,pp);
boost::unordered_map<uint64_t,jstats>::const_iterator t;
for (t = stats.trg.begin(); t != stats.trg.end(); ++t)
{
pp.update(t->first,t->second);
calc_lex(bt,pp);
calc_pfwd_fix(bt,pp);
calc_pbwd_fix(bt,pp);
tpcoll->Add(createTargetPhrase(src,bt,pp));
}
}
// process phrase stats from a single parallel corpus
bool
Mmsapt::
pool_pstats(Phrase const& src,
uint64_t const pid1a,
pstats * statsa,
Bitext<Token> const & bta,
uint64_t const pid1b,
pstats const* statsb,
Bitext<Token> const & btb,
TargetPhraseCollection* tpcoll) const
{
PhrasePair pp;
if (statsa && statsb)
pp.init(pid1b, *statsa, *statsb, this->m_numScoreComponents);
else if (statsa)
pp.init(pid1b, *statsa, this->m_numScoreComponents);
else if (statsb)
pp.init(pid1b, *statsb, this->m_numScoreComponents);
else return false; // throw "no stats for pooling available!";
apply_pp(bta,pp);
boost::unordered_map<uint64_t,jstats>::const_iterator b;
boost::unordered_map<uint64_t,jstats>::iterator a;
if (statsb)
{
for (b = statsb->trg.begin(); b != statsb->trg.end(); ++b)
{
uint32_t sid,off,len;
parse_pid(b->first, sid, off, len);
Token const* x = bta.T2->sntStart(sid) + off;
TSA<Token>::tree_iterator m(bta.I2.get(),x,x+len);
if (m.size() == len)
{
if (statsa && ((a = statsa->trg.find(m.getPid()))
!= statsa->trg.end()))
{
pp.update(b->first,a->second,b->second);
a->second.invalidate();
}
else
pp.update(b->first,m.approxOccurrenceCount(),
b->second);
}
else pp.update(b->first,b->second);
calc_lex(btb,pp);
calc_pfwd_fix(btb,pp);
calc_pbwd_fix(btb,pp);
tpcoll->Add(createTargetPhrase(src,btb,pp));
}
}
if (!statsa) return statsb != NULL;
for (a = statsa->trg.begin(); a != statsa->trg.end(); ++a)
{
uint32_t sid,off,len;
if (!a->second.valid()) continue;
parse_pid(a->first, sid, off, len);
if (btb.T2)
{
Token const* x = btb.T2->sntStart(sid) + off;
TSA<Token>::tree_iterator m(btb.I2.get(), x, x+len);
if (m.size() == len)
pp.update(a->first,m.approxOccurrenceCount(),a->second);
else
pp.update(a->first,a->second);
}
else
pp.update(a->first,a->second);
calc_lex(bta,pp);
calc_pfwd_fix(bta,pp);
calc_pbwd_fix(bta,pp);
tpcoll->Add(createTargetPhrase(src,bta,pp));
}
return true;
}
// this is not the most efficient way of phrase lookup!
// process phrase stats from a single parallel corpus
bool
Mmsapt::
combine_pstats
(Phrase const& src,
uint64_t const pid1a,
pstats * statsa,
Bitext<Token> const & bta,
uint64_t const pid1b,
pstats const* statsb,
Bitext<Token> const & btb,
TargetPhraseCollection* tpcoll
) const
{
PhrasePair ppfix,ppdyn,pool;
Word w;
if (statsa) ppfix.init(pid1a,*statsa,this->m_numScoreComponents);
if (statsb) ppdyn.init(pid1b,*statsb,this->m_numScoreComponents);
boost::unordered_map<uint64_t,jstats>::const_iterator b;
boost::unordered_map<uint64_t,jstats>::iterator a;
if (statsb)
{
pool.init(pid1b,*statsb,0);
apply_pp(btb,ppdyn);
for (b = statsb->trg.begin(); b != statsb->trg.end(); ++b)
{
ppdyn.update(b->first,b->second);
calc_pfwd_dyn(btb,ppdyn);
calc_pbwd_dyn(btb,ppdyn);
calc_lex(btb,ppdyn);
uint32_t sid,off,len;
parse_pid(b->first, sid, off, len);
Token const* x = bta.T2->sntStart(sid) + off;
TSA<Token>::tree_iterator m(bta.I2.get(),x,x+len);
if (m.size() && statsa &&
((a = statsa->trg.find(m.getPid()))
!= statsa->trg.end()))
{
ppfix.update(a->first,a->second);
calc_pfwd_fix(bta,ppfix,&ppdyn.fvals);
calc_pbwd_fix(btb,ppfix,&ppdyn.fvals);
a->second.invalidate();
}
else
{
if (m.size())
pool.update(b->first,m.approxOccurrenceCount(),
b->second);
else
pool.update(b->first,b->second);
calc_pfwd_fix(btb,pool,&ppdyn.fvals);
calc_pbwd_fix(btb,pool,&ppdyn.fvals);
}
tpcoll->Add(createTargetPhrase(src,btb,ppdyn));
}
}
if (statsa)
{
pool.init(pid1a,*statsa,0);
apply_pp(bta,ppfix);
for (a = statsa->trg.begin(); a != statsa->trg.end(); ++a)
{
if (!a->second.valid()) continue; // done above
ppfix.update(a->first,a->second);
calc_pfwd_fix(bta,ppfix);
calc_pbwd_fix(bta,ppfix);
calc_lex(bta,ppfix);
uint32_t sid,off,len;
parse_pid(a->first, sid, off, len);
Token const* x = btb.T2->sntStart(sid) + off;
TSA<Token>::tree_iterator m(btb.I2.get(),x,x+len);
if (m.size())
pool.update(a->first,m.approxOccurrenceCount(),a->second);
else
pool.update(a->first,a->second);
calc_pfwd_dyn(bta,pool,&ppfix.fvals);
calc_pbwd_dyn(bta,pool,&ppfix.fvals);
}
tpcoll->Add(createTargetPhrase(src,bta,ppfix));
}
return (statsa || statsb);
}
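// In short: pool_pstats() merges the counts of both bitexts into a single
// set of relative-frequency features (calc_*_fix only), while
// combine_pstats() keeps separate fixed and dynamic estimates, writing the
// counterpart's scores into the same feature vector via the dest argument.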
// // phrase statistics combination treating the two knowledge
// // sources separately with backoff to pooling when only one
// // of the two knowledge sources contains the phrase pair in
// // question
// void
// Mmsapt::
// process_pstats(uint64_t const mypid1,
// uint64_t const otpid1,
// pstats const& mystats, // my phrase stats
// pstats const* otstats, // other phrase stats
// Bitext<Token> const & mybt, // my bitext
// Bitext<Token> const * otbt, // other bitext
// PhraseScorer<Token> const& mypfwd,
// PhraseScorer<Token> const& mypbwd,
// PhraseScorer<Token> const* otpfwd,
// PhraseScorer<Token> const* otpbwd,
// TargetPhraseCollection* tpcoll)
// {
// boost::unordered_map<uint64_t,jstats>::const_iterator t;
// vector<FactorType> ofact(1,0);
// PhrasePair mypp,otpp,combo;
// mypp.init(mypid1, mystats, this->m_numScoreComponents);
// if (otstats)
// {
// otpp.init(otpid1, *otstats, 0);
// combo.init(otpid1, mystats, *otstats, 0);
// }
// else combo = mypp;
// for (t = mystats.trg.begin(); t != mystats.trg.end(); ++t)
// {
// if (!t->second.valid()) continue;
// // we dealt with this phrase pair already;
// // see j->second.invalidate() below;
// uint32_t sid,off,len; parse_pid(t->first,sid,off,len);
// mypp.update(t->first,t->second);
// apply_pp(mybt,mypp);
// calc_lex (mybt,mypp);
// mypfwd(mybt,mypp);
// mypbwd(mybt,mypp);
// if (otbt) // it's a dynamic phrase table
// {
// assert(otpfwd);
// assert(otpbwd);
// boost::unordered_map<uint64_t,jstats>::iterator j;
// // look up the current target phrase in the other bitext
// Token const* x = mybt.T2->sntStart(sid) + off;
// TSA<TOKEN>::tree_iterator m(otbt->I2.get(),x,x+len);
// if (otstats // source phrase exists in other bitext
// && m.size() // target phrase exists in other bitext
// && ((j = otstats->trg.find(m.getPid()))
// != otstats->trg.end())) // phrase pair found in other bitext
// {
// otpp.update(j->first,j->second);
// j->second.invalidate(); // mark the phrase pair as seen
// otpfwd(*otbt,otpp,&mypp.fvals);
// otpbwd(*otbt,otpp,&mypp.fvals);
// }
// else
// {
// if (m.size()) // target phrase seen in other bitext, but not the phrase pair
// combo.update(t->first,m.approxOccurrenceCount(),t->second);
// else
// combo.update(t->first,t->second);
// (*otpfwd)(mybt,combo,&mypp.fvals);
// (*otpbwd)(mybt,combo,&mypp.fvals);
// }
// }
// // now add the phrase pair to the TargetPhraseCollection:
// TargetPhrase* tp = new TargetPhrase();
// for (size_t k = off; k < stop; ++k)
// {
// StringPiece wrd = (*mybt.V2)[x[k].id()];
// Word w; w.CreateFromString(Output,ofact,wrd,false);
// tp->AddWord(w);
// }
// tp->GetScoreBreakdown().Assign(this,mypp.fvals);
// tp->Evaluate(src);
// tpcoll->Add(tp);
// }
// }
// This is not the most efficient way of phrase lookup!
TargetPhraseCollection const*
Mmsapt::
GetTargetPhraseCollectionLEGACY(const Phrase& src) const
{
TSA<Token>::tree_iterator m(bt.I1.get());
TargetPhraseCollection* ret = new TargetPhraseCollection();
// Reserve a local copy of the dynamic bitext in its current form. /btdyn/
// is set to a new copy of the dynamic bitext every time a sentence pair
// is added. /dyn/ keeps the old bitext around as long as we need it.
sptr<imBitext<Token> > dyn;
{ // braces are needed for scoping mutex lock guard!
boost::lock_guard<boost::mutex> guard(this->lock);
dyn = btdyn;
}
vector<id_type> sphrase(src.GetSize());
for (size_t i = 0; i < src.GetSize(); ++i)
{
Factor const* f = src.GetFactor(i,input_factor);
id_type wid = (*bt.V1)[f->ToString()];
// cout << (*bt.V1)[wid] << " ";
if (!m.extend(wid)) break;
id_type wid = (*btfix.V1)[f->ToString()];
sphrase[i] = wid;
}
#if 0
cout << endl;
Token const* sphrase = m.getToken(0);
for (size_t i = 0; i < m.size(); ++i)
cout << (*bt.V1)[sphrase[i].id()] << " ";
cout << endl;
#endif
sptr<pstats> s;
if (m.size() < src.GetSize()) return NULL;
{
boost::lock_guard<boost::mutex> guard(this->lock);
s = bt.lookup(m);
}
PhrasePair pp; pp.init(m.getPid(), *s, this->m_numScoreComponents);
TargetPhraseCollection* ret = new TargetPhraseCollection();
vector<FactorType> ofact(1,0);
boost::unordered_map<uint64_t,jstats>::const_iterator t;
for (t = s->trg.begin(); t != s->trg.end(); ++t)
TSA<Token>::tree_iterator mfix(btfix.I1.get()), mdyn(dyn->I1.get());
for (size_t i = 0; mfix.size() == i && i < sphrase.size(); ++i)
mfix.extend(sphrase[i]);
if (dyn->I1.get())
{
pp.update(t->first,t->second);
calc_pfwd(bt,pp);
calc_pbwd(bt,pp);
calc_lex (bt,pp);
apply_pp (bt,pp);
uint32_t sid,off,len;
parse_pid(t->first,sid,off,len);
size_t stop = off + len;
Token const* x = bt.T2->sntStart(sid);
TargetPhrase* tp = new TargetPhrase();
for (size_t k = off; k < stop; ++k)
{
StringPiece wrd = (*bt.V2)[x[k].id()];
Word w; w.CreateFromString(Output,ofact,wrd,false);
tp->AddWord(w);
}
tp->GetScoreBreakdown().Assign(this,pp.fvals);
tp->Evaluate(src);
ret->Add(tp);
for (size_t i = 0; mdyn.size() == i && i < sphrase.size(); ++i)
mdyn.extend(sphrase[i]);
}
sptr<pstats> sfix,sdyn;
if (mfix.size() == sphrase.size())
{
// do we need this lock here?
// Is it used here to control the total number of running threads???
boost::lock_guard<boost::mutex> guard(this->lock);
sfix = btfix.lookup(mfix);
}
if (mdyn.size() == sphrase.size())
sdyn = dyn->lookup(mdyn);
if (poolCounts)
{
if (!pool_pstats(src, mfix.getPid(),sfix.get(),btfix,
mdyn.getPid(),sdyn.get(),*dyn,ret))
return NULL;
}
else if (!combine_pstats(src, mfix.getPid(),sfix.get(),btfix,
mdyn.getPid(),sdyn.get(),*dyn,ret))
return NULL;
ret->NthElement(m_tableLimit);
#if 0
sort(ret->begin(), ret->end(), CompareTargetPhrase());
@ -150,7 +481,16 @@ namespace Moses
Mmsapt::
CreateRuleLookupManager(const ChartParser &, const ChartCellCollectionBase &)
{
throw "CreateRuleLookupManager is currently not supported in Moses!";
throw "CreateRuleLookupManager is currently not supported in Mmsapt!";
}
template<typename Token>
void
fill_token_seq(TokenIndex& V, string const& line, vector<Token>& dest)
{
istringstream buf(line); string w;
while (buf>>w) dest.push_back(Token(V[w]));
}
}

View File

@ -5,18 +5,19 @@
#include <boost/thread.hpp>
#include "moses/generic/sorting/VectorIndexSorter.h"
#include "moses/generic/sampling/Sampling.h"
#include "moses/generic/file_io/ug_stream.h"
#include "moses/TypeDef.h"
#include "moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h"
#include "moses/TranslationModel/UG/generic/sampling/Sampling.h"
#include "moses/TranslationModel/UG/generic/file_io/ug_stream.h"
#include "moses/mm/ug_mm_ttrack.h"
#include "moses/mm/ug_mm_tsa.h"
#include "moses/mm/tpt_tokenindex.h"
#include "moses/mm/ug_corpus_token.h"
#include "moses/mm/ug_typedefs.h"
#include "moses/mm/tpt_pickler.h"
#include "moses/mm/ug_bitext.h"
#include "moses/mm/ug_lexical_phrase_scorer2.h"
#include "moses/TranslationModel/UG/mm/ug_mm_ttrack.h"
#include "moses/TranslationModel/UG/mm/ug_mm_tsa.h"
#include "moses/TranslationModel/UG/mm/tpt_tokenindex.h"
#include "moses/TranslationModel/UG/mm/ug_corpus_token.h"
#include "moses/TranslationModel/UG/mm/ug_typedefs.h"
#include "moses/TranslationModel/UG/mm/tpt_pickler.h"
#include "moses/TranslationModel/UG/mm/ug_bitext.h"
#include "moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer2.h"
#include "moses/InputFileStream.h"
#include "moses/FactorTypeSet.h"
@ -25,20 +26,32 @@
#include "moses/TargetPhraseCollection.h"
#include <map>
#include "PhraseDictionary.h"
#include "moses/TranslationModel/PhraseDictionary.h"
// TO DO:
// - make lexical phrase scorer take additions to the "dynamic overlay" into account
// - switch to pool of sapts, where each sapt has its own provenance feature
// RESEARCH QUESTION: is this more effective than having multiple phrase tables,
// each with its own set of features?
using namespace std;
namespace Moses
{
using namespace bitext;
class Mmsapt : public PhraseDictionary
class Mmsapt
#ifndef NO_MOSES
: public PhraseDictionary
#endif
{
friend class Alignment;
public:
typedef L2R_Token<SimpleWordId> Token;
typedef mmBitext<Token> mmbitext;
mmbitext bt;
// string description;
typedef imBitext<Token> imbitext;
typedef TSA<Token> tsa;
private:
mmbitext btfix;
sptr<imbitext> btdyn;
string bname;
string L1;
string L2;
@ -48,25 +61,84 @@ namespace Moses
size_t input_factor;
size_t output_factor; // we can actually return entire Tokens!
// built-in feature functions
PScorePfwd<Token> calc_pfwd;
PScorePbwd<Token> calc_pbwd;
PScorePfwd<Token> calc_pfwd_fix, calc_pfwd_dyn;
PScorePbwd<Token> calc_pbwd_fix, calc_pbwd_dyn;
PScoreLex<Token> calc_lex; // this one I'd like to see as an external ff eventually
PScorePP<Token> apply_pp; // apply phrase penalty
void init(string const& line);
mutable boost::mutex lock;
bool poolCounts;
vector<FactorType> ofactor;
// phrase table feature weights for alignment:
vector<float> feature_weights;
vector<vector<id_type> > wlex21;
// word translation lexicon (without counts, get these from calc_lex.COOC)
typedef mm2dTable<id_type,id_type,uint32_t,uint32_t> mm2dtable_t;
mm2dtable_t COOCraw;
TargetPhrase*
createTargetPhrase
(Phrase const& src,
Bitext<Token> const& bt,
bitext::PhrasePair const& pp
) const;
void
process_pstats
(Phrase const& src,
uint64_t const pid1,
pstats const& stats,
Bitext<Token> const & bt,
TargetPhraseCollection* tpcoll
) const;
bool
pool_pstats
(Phrase const& src,
uint64_t const pid1a,
pstats * statsa,
Bitext<Token> const & bta,
uint64_t const pid1b,
pstats const* statsb,
Bitext<Token> const & btb,
TargetPhraseCollection* tpcoll
) const;
bool
combine_pstats
(Phrase const& src,
uint64_t const pid1a,
pstats * statsa,
Bitext<Token> const & bta,
uint64_t const pid1b,
pstats const* statsb,
Bitext<Token> const & btb,
TargetPhraseCollection* tpcoll
) const;
public:
Mmsapt(string const& description, string const& line);
// Mmsapt(string const& description, string const& line);
Mmsapt(string const& line);
void
Load();
#ifndef NO_MOSES
TargetPhraseCollection const*
GetTargetPhraseCollectionLEGACY(const Phrase& src) const;
//! Create a sentence-specific manager for SCFG rule lookup.
ChartRuleLookupManager*
CreateRuleLookupManager(const ChartParser &, const ChartCellCollectionBase &);
#endif
void add(string const& s1, string const& s2, string const& a);
// align two new sentences
sptr<vector<int> >
align(string const& src, string const& trg) const;
void setWeights(vector<float> const& w);
private:
};
} // end namespace

View File

@ -0,0 +1,334 @@
#include "mmsapt.h"
namespace Moses
{
using namespace bitext;
using namespace std;
using namespace boost;
struct PPgreater
{
bool operator()(PhrasePair const& a, PhrasePair const& b)
{
return a.score > b.score;
}
};
void
Mmsapt::
setWeights(vector<float> const & w)
{
assert(w.size() == this->m_numScoreComponents);
this->feature_weights = w;
}
struct PhraseAlnHyp
{
PhrasePair pp;
ushort s1,e1,s2,e2; // start and end positions
int prev; // preceding alignment hypothesis
float score;
bitvector scov; // source coverage
PhraseAlnHyp(PhrasePair const& ppx, int slen,
pair<uint32_t,uint32_t> const& sspan,
pair<uint32_t,uint32_t> const& tspan)
: pp(ppx), prev(-1), score(ppx.score), scov(slen)
{
s1 = sspan.first; e1 = sspan.second;
s2 = tspan.first; e2 = tspan.second;
for (size_t i = s1; i < e1; ++i)
scov.set(i);
}
bool operator<(PhraseAlnHyp const& other) const
{
return this->score < other.score;
}
bool operator>(PhraseAlnHyp const& other) const
{
return this->score > other.score;
}
PhraseOrientation
po_bwd(PhraseAlnHyp const* prev) const
{
if (s2 == 0) return po_first;
assert(prev);
assert(prev->e2 <= s2);
if (prev->e2 < s2) return po_other;
if (prev->e1 == s1) return po_mono;
if (prev->e1 < s1) return po_jfwd;
if (prev->s1 == e1) return po_swap;
if (prev->s1 > e1) return po_jbwd;
return po_other;
}
PhraseOrientation
po_fwd(PhraseAlnHyp const* next) const
{
if (!next) return po_last;
assert(next->s2 >= e2);
if (next->s2 < e2) return po_other;
if (next->e1 == s1) return po_swap;
if (next->e1 < s1) return po_jbwd;
if (next->s1 == e1) return po_mono;
if (next->s1 > e1) return po_jfwd;
return po_other;
}
float
dprob_fwd(PhraseAlnHyp const& next)
{
return pp.dfwd[po_fwd(&next)];
}
float
dprob_bwd(PhraseAlnHyp const& prev)
{
return pp.dbwd[po_bwd(&prev)];
}
};
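// (po_fwd/po_bwd above mirror the corpus-level find_po_* classifiers, but
// operate on the spans of two adjacent alignment hypotheses; dprob_fwd and
// dprob_bwd then look the resulting orientation up in the phrase pair's
// distortion distributions dfwd/dbwd)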
class Alignment
{
typedef L2R_Token<SimpleWordId> Token;
typedef TSA<Token> tsa;
typedef pair<uint32_t, uint32_t> span;
typedef vector<vector<uint64_t> > pidmap_t; // span -> phrase ID
typedef boost::unordered_map<uint64_t,vector<span> > pid2span_t;
typedef boost::unordered_map<uint64_t,jstats> jStatsTable;
Mmsapt const& PT;
vector<id_type> s,t;
pidmap_t sspan2pid, tspan2pid; // span -> phrase ID
pid2span_t spid2span,tpid2span;
vector<vector<sptr<pstats> > > spstats;
vector<PhrasePair> PP;
// position-independent phrase pair info
public:
vector<PhraseAlnHyp> PAH;
vector<vector<int> > tpos2ahyp;
// maps from target start positions to PhraseAlnHyps starting at
// that position
sptr<pstats> getPstats(span const& sspan);
void fill_tspan_maps();
void fill_sspan_maps();
public:
Alignment(Mmsapt const& pt, string const& src, string const& trg);
void show(ostream& out);
void show(ostream& out, PhraseAlnHyp const& ah);
};
void
Alignment::
show(ostream& out, PhraseAlnHyp const& ah)
{
LexicalPhraseScorer2<Token>::table_t const&
COOCjnt = PT.calc_lex.scorer.COOC;
out << setw(10) << exp(ah.score) << " "
<< PT.btfix.T2->pid2str(PT.btfix.V2.get(), ah.pp.p2)
<< " <=> "
<< PT.btfix.T1->pid2str(PT.btfix.V1.get(), ah.pp.p1);
vector<uchar> const& a = ah.pp.aln;
// BOOST_FOREACH(int x,a) cout << "[" << x << "] ";
for (size_t u = 0; u+1 < a.size(); u += 2)
out << " " << int(a[u+1]) << "-" << int(a[u]);
if (ah.e2-ah.s2 == 1 and ah.e1-ah.s1 == 1)
out << " " << COOCjnt[s[ah.s1]][t[ah.s2]]
<< "/" << PT.COOCraw[s[ah.s1]][t[ah.s2]]
<< "=" << float(COOCjnt[s[ah.s1]][t[ah.s2]])/PT.COOCraw[s[ah.s1]][t[ah.s2]];
out << endl;
// float const* ofwdj = ah.pp.dfwd;
// float const* obwdj = ah.pp.dbwd;
// uint32_t const* ofwdm = spstats[ah.s1][ah.e1-ah.s1-1]->ofwd;
// uint32_t const* obwdm = spstats[ah.s1][ah.e1-ah.s1-1]->obwd;
// out << " [first: " << ofwdj[po_first]<<"/"<<ofwdm[po_first]
// << " last: " << ofwdj[po_last]<<"/"<<ofwdm[po_last]
// << " mono: " << ofwdj[po_mono]<<"/"<<ofwdm[po_mono]
// << " jfwd: " << ofwdj[po_jfwd]<<"/"<<ofwdm[po_jfwd]
// << " swap: " << ofwdj[po_swap]<<"/"<<ofwdm[po_swap]
// << " jbwd: " << ofwdj[po_jbwd]<<"/"<<ofwdm[po_jbwd]
// << " other: " << ofwdj[po_other]<<"/"<<ofwdm[po_other]
// << "]" << endl
// << " [first: " << obwdj[po_first]<<"/"<<obwdm[po_first]
// << " last: " << obwdj[po_last]<<"/"<<obwdm[po_last]
// << " mono: " << obwdj[po_mono]<<"/"<<obwdm[po_mono]
// << " jfwd: " << obwdj[po_jfwd]<<"/"<<obwdm[po_jfwd]
// << " swap: " << obwdj[po_swap]<<"/"<<obwdm[po_swap]
// << " jbwd: " << obwdj[po_jbwd]<<"/"<<obwdm[po_jbwd]
// << " other: " << obwdj[po_other]<<"/"<<obwdm[po_other]
// << "]" << endl;
}
void
Alignment::
show(ostream& out)
{
// show what we have so far ...
for (size_t s2 = 0; s2 < t.size(); ++s2)
{
VectorIndexSorter<PhraseAlnHyp> foo(PAH);
sort(tpos2ahyp[s2].begin(), tpos2ahyp[s2].end(), foo);
for (size_t h = 0; h < tpos2ahyp[s2].size(); ++h)
show(out,PAH[tpos2ahyp[s2][h]]);
}
}
sptr<pstats>
Alignment::
getPstats(span const& sspan)
{
size_t k = sspan.second - sspan.first - 1;
if (k < spstats[sspan.first].size())
return spstats[sspan.first][k];
else return sptr<pstats>();
}
void
Alignment::
fill_tspan_maps()
{
tspan2pid.assign(t.size(),vector<uint64_t>(t.size(),0));
for (size_t i = 0; i < t.size(); ++i)
{
tsa::tree_iterator m(PT.btfix.I2.get());
for (size_t k = i; k < t.size() && m.extend(t[k]); ++k)
{
uint64_t pid = m.getPid();
tpid2span[pid].push_back(pair<uint32_t,uint32_t>(i,k+1));
tspan2pid[i][k] = pid;
}
}
}
void
Alignment::
fill_sspan_maps()
{
sspan2pid.assign(s.size(),vector<uint64_t>(s.size(),0));
spstats.resize(s.size());
for (size_t i = 0; i < s.size(); ++i)
{
tsa::tree_iterator m(PT.btfix.I1.get());
for (size_t k = i; k < s.size() && m.extend(s[k]); ++k)
{
uint64_t pid = m.getPid();
sspan2pid[i][k] = pid;
pid2span_t::iterator p = spid2span.find(pid);
if (p != spid2span.end())
{
int x = p->second[0].first;
int y = p->second[0].second-1;
spstats[i].push_back(spstats[x][y-x]);
}
else
{
spstats[i].push_back(PT.btfix.lookup(m));
cout << PT.btfix.T1->pid2str(PT.btfix.V1.get(),pid) << " "
<< spstats[i].back()->good << "/" << spstats[i].back()->sample_cnt
<< endl;
}
spid2span[pid].push_back(pair<uint32_t,uint32_t>(i,k+1));
}
}
}
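Note the caching in the loop above: when the same phrase id recurs at a later start position, the previously created pstats pointer is reused, so each distinct source phrase is sampled at most once. Below is a minimal stand-alone sketch of that memoization pattern, with expensive_lookup standing in for Bitext::lookup; all names here are illustrative, not Moses API.

#include <boost/shared_ptr.hpp>
#include <boost/unordered_map.hpp>
#include <iostream>
#include <stdint.h>

struct Stats { int good; };

// placeholder for the expensive sampling step (Bitext::lookup in the code above)
boost::shared_ptr<Stats> expensive_lookup(uint64_t pid)
{ boost::shared_ptr<Stats> s(new Stats); s->good = int(pid); return s; }

int main()
{
  boost::unordered_map<uint64_t, boost::shared_ptr<Stats> > cache;
  uint64_t pids[] = { 42, 7, 42 };
  for (int i = 0; i < 3; ++i) {
    boost::shared_ptr<Stats>& s = cache[pids[i]];
    if (!s) s = expensive_lookup(pids[i]); // sample only on first sight
    std::cout << pids[i] << " -> " << s->good << "\n";
  }
}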
Alignment::
Alignment(Mmsapt const& pt, string const& src, string const& trg)
: PT(pt)
{
PT.btfix.V1->fillIdSeq(src,s);
PT.btfix.V2->fillIdSeq(trg,t);
// LexicalPhraseScorer2<Token>::table_t const& COOC = PT.calc_lex.scorer.COOC;
// BOOST_FOREACH(id_type i, t)
// {
// cout << (*PT.btfix.V2)[i];
// if (i < PT.wlex21.size())
// {
// BOOST_FOREACH(id_type k, PT.wlex21[i])
// {
// size_t j = COOC[k][i];
// size_t m1 = COOC.m1(k);
// size_t m2 = COOC.m2(i);
// if (j*1000 > m1 && j*1000 > m2)
// cout << " " << (*PT.btfix.V1)[k];
// }
// }
// cout << endl;
// }
fill_tspan_maps();
fill_sspan_maps();
tpos2ahyp.resize(t.size());
// now fill the association score table
PAH.reserve(1000000);
typedef pid2span_t::iterator psiter;
for (psiter L = spid2span.begin(); L != spid2span.end(); ++L)
{
if (!L->second.size()) continue; // should never happen anyway
int i = L->second[0].first;
int k = L->second[0].second - i -1;
sptr<pstats> ps = spstats[i][k];
PhrasePair pp; pp.init(L->first,*ps, PT.m_numScoreComponents);
jStatsTable & J = ps->trg;
for (jStatsTable::iterator y = J.begin(); y != J.end(); ++y)
{
psiter R = tpid2span.find(y->first);
if (R == tpid2span.end()) continue;
pp.update(y->first, y->second);
PT.calc_lex(PT.btfix,pp);
PT.calc_pfwd_fix(PT.btfix,pp);
PT.calc_pbwd_fix(PT.btfix,pp);
pp.eval(PT.feature_weights);
PP.push_back(pp);
BOOST_FOREACH(span const& sspan, L->second)
{
BOOST_FOREACH(span const& tspan, R->second)
{
tpos2ahyp[tspan.first].push_back(PAH.size());
PAH.push_back(PhraseAlnHyp(PP.back(),s.size(),sspan,tspan));
}
}
}
}
}
int
extend(vector<PhraseAlnHyp> & PAH, int edge, int next)
{
if ((PAH[edge].scov & PAH[next].scov).count())
return -1;
int ret = PAH.size();
PAH.push_back(PAH[next]);
PhraseAlnHyp & h = PAH.back();
h.prev = edge;
h.scov |= PAH[edge].scov;
h.score += log(PAH[edge].dprob_fwd(PAH[next]));
h.score += log(PAH[next].dprob_bwd(PAH[edge]));
return ret;
}
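extend() is the basic combination step of the alignment search: two hypotheses are compatible iff their source coverage vectors are disjoint, and the combined hypothesis adds the forward and backward distortion log-probabilities. It is not yet called from align() below. Here is a self-contained sketch of the same step, with boost::dynamic_bitset standing in for bitvector; the names are illustrative, not Moses API.

#include <boost/dynamic_bitset.hpp>
#include <cmath>
#include <iostream>
#include <vector>

struct Hyp {
  boost::dynamic_bitset<> scov; // source positions covered
  int prev;                     // back-pointer to the preceding hypothesis
  float score;                  // log score of this phrase pair
};

// Combine 'edge' and 'next' iff their source coverage is disjoint. As in the
// code above, the predecessor's score stays behind the back-pointer; only the
// distortion log-probs are added here. Returns the new index, or -1.
int extend(std::vector<Hyp>& H, int edge, int next, float dfwd, float dbwd)
{
  if ((H[edge].scov & H[next].scov).any()) return -1; // overlap: incompatible
  Hyp h = H[next];
  h.prev  = edge;
  h.scov |= H[edge].scov;
  h.score += std::log(dfwd) + std::log(dbwd);
  H.push_back(h);
  return int(H.size()) - 1;
}

int main()
{
  std::vector<Hyp> H(2);
  H[0].scov.resize(4); H[0].scov.set(0); H[0].prev = -1; H[0].score = -0.2f;
  H[1].scov.resize(4); H[1].scov.set(1); H[1].prev = -1; H[1].score = -0.5f;
  int k = extend(H, 0, 1, 0.8f, 0.7f); // hypothetical distortion probabilities
  std::cout << (k >= 0 ? "combined\n" : "overlap\n");
}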
sptr<vector<int> >
Mmsapt::
align(string const& src, string const& trg) const
{
// For the time being, we consult only the fixed bitext.
// TODO: also consult the dynamic bitext.
Alignment A(*this,src,trg);
VectorIndexSorter<PhraseAlnHyp> foo(A.PAH);
vector<size_t> o; foo.GetOrder(o);
BOOST_FOREACH(int i, o) A.show(cout,A.PAH[i]);
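// NOTE: the hypotheses are only ranked and printed so far; no alignment
// is assembled yet, so an empty pointer is returned below.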
sptr<vector<int> > aln;
return aln;
}
}

View File

@ -0,0 +1,33 @@
#include "mmsapt.h"
using namespace std;
using namespace Moses;
Mmsapt* PT;
int main(int argc, char* argv[])
{
string base = argv[1];
string L1 = argv[2];
string L2 = argv[3];
ostringstream buf;
buf << "Mmsapt name=PT0 output-factor=0 num-features=5 base="
<< base << " L1=" << L1 << " L2=" << L2;
string configline = buf.str();
PT = new Mmsapt(configline);
PT->Load();
float w[] = { 0.0582634, 0.0518865, 0.0229819, 0.00640856, 0.647506 };
vector<float> weights(w,w+5);
PT->setWeights(weights);
// these values are taken from a moses.ini file;
// is there a convenient way of accessing them from within mmsapt ???
string eline,fline;
// TokenIndex V; V.open("crp/trn/mm/de.tdx");
while (getline(cin,eline) && getline(cin,fline))
{
cout << eline << endl;
cout << fline << endl;
PT->align(eline,fline);
}
delete PT;
}
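For reference, the driver above expects three positional arguments (model base path, L1, L2) and reads alternating source/target sentence lines from stdin. A hypothetical invocation, assuming the binary is built under the name try-align: try-align /path/to/model/base de en < sentence-pairs.txt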

View File

@ -35,7 +35,10 @@ ModelScore* ModelScore::createModelScore(const string& modeltype)
} else if (modeltype.compare("leftright") == 0) {
return new ModelScoreLR();
} else {
cerr << "Illegal model type given for lexical reordering model scoring: " << modeltype << ". The allowed types are: mslr, msd, monotonicity, leftright" << endl;
cerr << "Illegal model type given for lexical reordering model scoring: "
<< modeltype
<< ". The allowed types are: mslr, msd, monotonicity, leftright"
<< endl;
exit(1);
}
}

View File

@ -555,51 +555,6 @@ void outputPhrasePair(const ExtractionPhrasePair &phrasePair,
phraseTableFile << " ||| ";
}
// alignment
if ( hierarchicalFlag ) {
// always output alignment if hiero style
assert(phraseTarget->size() == bestAlignmentT2S->size()+1);
std::vector<std::string> alignment;
for ( size_t j = 0; j < phraseTarget->size() - 1; ++j ) {
if ( isNonTerminal(vcbT.getWord( phraseTarget->at(j) ))) {
if ( bestAlignmentT2S->at(j).size() != 1 ) {
std::cerr << "Error: unequal numbers of non-terminals. Make sure the text does not contain words in square brackets (like [xxx])." << std::endl;
phraseTableFile.flush();
assert(bestAlignmentT2S->at(j).size() == 1);
}
size_t sourcePos = *(bestAlignmentT2S->at(j).begin());
//phraseTableFile << sourcePos << "-" << j << " ";
std::stringstream point;
point << sourcePos << "-" << j;
alignment.push_back(point.str());
} else {
for ( std::set<size_t>::iterator setIter = (bestAlignmentT2S->at(j)).begin();
setIter != (bestAlignmentT2S->at(j)).end(); ++setIter ) {
size_t sourcePos = *setIter;
std::stringstream point;
point << sourcePos << "-" << j;
alignment.push_back(point.str());
}
}
}
// now print all alignments, sorted by source index
sort(alignment.begin(), alignment.end());
for (size_t i = 0; i < alignment.size(); ++i) {
phraseTableFile << alignment[i] << " ";
}
} else if ( !inverseFlag && wordAlignmentFlag) {
// alignment info in pb model
for (size_t j = 0; j < bestAlignmentT2S->size(); ++j) {
for ( std::set<size_t>::iterator setIter = (bestAlignmentT2S->at(j)).begin();
setIter != (bestAlignmentT2S->at(j)).end(); ++setIter ) {
size_t sourcePos = *setIter;
phraseTableFile << sourcePos << "-" << j << " ";
}
}
}
phraseTableFile << " ||| ";
// lexical translation probability
if (lexFlag) {
double lexScore = computeLexicalTranslation( phraseSource, phraseTarget, bestAlignmentT2S );
@ -641,6 +596,53 @@ void outputPhrasePair(const ExtractionPhrasePair &phrasePair,
phraseTableFile << " " << i->first << " " << i->second;
}
phraseTableFile << " ||| ";
// output alignment info
if ( !inverseFlag ) {
if ( hierarchicalFlag ) {
// always output alignment if hiero style
assert(phraseTarget->size() == bestAlignmentT2S->size()+1);
std::vector<std::string> alignment;
for ( size_t j = 0; j < phraseTarget->size() - 1; ++j ) {
if ( isNonTerminal(vcbT.getWord( phraseTarget->at(j) ))) {
if ( bestAlignmentT2S->at(j).size() != 1 ) {
std::cerr << "Error: unequal numbers of non-terminals. Make sure the text does not contain words in square brackets (like [xxx])." << std::endl;
phraseTableFile.flush();
assert(bestAlignmentT2S->at(j).size() == 1);
}
size_t sourcePos = *(bestAlignmentT2S->at(j).begin());
//phraseTableFile << sourcePos << "-" << j << " ";
std::stringstream point;
point << sourcePos << "-" << j;
alignment.push_back(point.str());
} else {
for ( std::set<size_t>::iterator setIter = (bestAlignmentT2S->at(j)).begin();
setIter != (bestAlignmentT2S->at(j)).end(); ++setIter ) {
size_t sourcePos = *setIter;
std::stringstream point;
point << sourcePos << "-" << j;
alignment.push_back(point.str());
}
}
}
// now print all alignments, sorted by source index
sort(alignment.begin(), alignment.end());
for (size_t i = 0; i < alignment.size(); ++i) {
phraseTableFile << alignment[i] << " ";
}
} else if (wordAlignmentFlag) {
// alignment info in pb model
for (size_t j = 0; j < bestAlignmentT2S->size(); ++j) {
for ( std::set<size_t>::iterator setIter = (bestAlignmentT2S->at(j)).begin();
setIter != (bestAlignmentT2S->at(j)).end(); ++setIter ) {
size_t sourcePos = *setIter;
phraseTableFile << sourcePos << "-" << j << " ";
}
}
}
}
// counts
phraseTableFile << " ||| " << totalCount << " " << count;
if (kneserNeyFlag)

View File

@ -0,0 +1,43 @@
#!/usr/bin/perl -W
# script for preprocessing language data prior to tokenization
# Start by Ulrich Germann, after noticing systematic preprocessing errors
# in some of the English Europarl data.
use strict;
use Getopt::Std;
binmode(STDIN, ":utf8");
binmode(STDOUT, ":utf8");
sub usage
{
print "Script for preprocessing of raw language data prior to tokenization\n";
print "Usage: $0 -l <language tag>\n";
}
my %args;
getopts('l:h',\%args);
usage() && exit(0) if $args{'h'};
if (defined $args{'l'} && $args{'l'} eq "en")
{
while (<>)
{
s/([[:alpha:]]\') s\b/$1s/g;
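# e.g. "the country' s budget" -> "the country's budget"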
print;
}
}
elsif (defined $args{'l'} && $args{'l'} eq "fr")
{
while (<>)
{
s/\b([[:alpha:]]\')\s+(?=[[:alpha:]])/$1/g;
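# e.g. "l' homme" -> "l'homme"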
print;
}
}
else
{
print while <>;
}

View File

@ -33,7 +33,7 @@ my $TIMING = 0;
my $NUM_THREADS = 1;
my $NUM_SENTENCES_PER_THREAD = 2000;
my $PENN = 0;
my $NO_ESCAPING = 0;
while (@ARGV)
{
$_ = shift;
@ -49,6 +49,7 @@ while (@ARGV)
/^-threads$/ && ($NUM_THREADS = int(shift), next);
/^-lines$/ && ($NUM_SENTENCES_PER_THREAD = int(shift), next);
/^-penn$/ && ($PENN = 1, next);
/^-no-escape/ && ($NO_ESCAPING = 1, next);
}
# for time calculation
@ -69,6 +70,7 @@ if ($HELP)
print " -time ... enable processing time calculation.\n";
print " -penn ... use Penn treebank-like tokenization.\n";
print "  -protected FILE  ... specify file with patterns to be protected in tokenization.\n";
print "  -no-escape  ... don't perform HTML escaping on apostrophes, quotes, etc.\n";
exit;
}
@ -246,7 +248,7 @@ sub tokenize
# aggressive hyphen splitting
if ($AGGRESSIVE)
{
$text =~ s/([\p{IsAlnum}])\-([\p{IsAlnum}])/$1 \@-\@ $2/g;
$text =~ s/([\p{IsAlnum}])\-(?=[\p{IsAlnum}])/$1 \@-\@ /g;
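# the lookahead leaves the right-hand character unconsumed, so chains with
# single-letter parts (e.g. "one-of-a-kind") now split at every hyphen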
}
#multi-dots stay together
@ -345,14 +347,17 @@ sub tokenize
$text =~ s/DOTMULTI/./g;
#escape special chars
$text =~ s/\&/\&amp;/g; # escape escape
$text =~ s/\|/\&#124;/g; # factor separator
$text =~ s/\</\&lt;/g; # xml
$text =~ s/\>/\&gt;/g; # xml
$text =~ s/\'/\&apos;/g; # xml
$text =~ s/\"/\&quot;/g; # xml
$text =~ s/\[/\&#91;/g; # syntax non-terminal
$text =~ s/\]/\&#93;/g; # syntax non-terminal
if (!$NO_ESCAPING)
{
$text =~ s/\&/\&amp;/g; # escape escape
$text =~ s/\|/\&#124;/g; # factor separator
$text =~ s/\</\&lt;/g; # xml
$text =~ s/\>/\&gt;/g; # xml
$text =~ s/\'/\&apos;/g; # xml
$text =~ s/\"/\&quot;/g; # xml
$text =~ s/\[/\&#91;/g; # syntax non-terminal
$text =~ s/\]/\&#93;/g; # syntax non-terminal
}
#ensure final line break
$text .= "\n" unless $text =~ /\n$/;

View File

@ -315,7 +315,9 @@ print STDERR "Using SCRIPTS_ROOTDIR: $SCRIPTS_ROOTDIR\n";
# path of script for filtering phrase tables and running the decoder
$filtercmd = File::Spec->catfile($SCRIPTS_ROOTDIR, "training", "filter-model-given-input.pl") if !defined $filtercmd;
if ( ! -x $filtercmd && ! $___FILTER_PHRASE_TABLE) {
# WHY ... ! ___FILTER_PHRASE_TABLE ??? This doesn't make sense! [UG]
# if ( ! -x $filtercmd && ! $___FILTER_PHRASE_TABLE) {
if ( ! -x $filtercmd && $___FILTER_PHRASE_TABLE) {
warn "Filtering command not found: $filtercmd.";
warn "Use --filtercmd=PATH to specify a valid one or --no-filter-phrase-table";
exit 1;
@ -409,7 +411,7 @@ if ($___ACTIVATE_FEATURES) {
}
my ($just_cmd_filtercmd, $x) = split(/ /, $filtercmd);
die "Not executable: $just_cmd_filtercmd" if ! -x $just_cmd_filtercmd;
die "Not executable: $just_cmd_filtercmd" if $___FILTER_PHRASE_TABLE && ! -x $just_cmd_filtercmd;
die "Not executable: $moses_parallel_cmd" if defined $___JOBS && ! -x $moses_parallel_cmd;
die "Not executable: $qsubwrapper" if defined $___JOBS && ! -x $qsubwrapper;
die "Not executable: $___DECODER" if ! -x $___DECODER;