Alpha version of phrase tables based on dynamic suffix arrays.

This commit is contained in:
Ulrich Germann 2013-06-05 10:46:42 +01:00
parent e3a0bfa330
commit 7ecfb88a29
13 changed files with 1500 additions and 805 deletions

View File

@ -275,13 +275,15 @@ bool Parameter::LoadParam(int argc, char* argv[])
}
// overwrite parameters with values from switches
for(PARAM_STRING::const_iterator iterParam = m_description.begin(); iterParam != m_description.end(); iterParam++) {
for(PARAM_STRING::const_iterator iterParam = m_description.begin();
iterParam != m_description.end(); iterParam++) {
const string paramName = iterParam->first;
OverwriteParam("-" + paramName, paramName, argc, argv);
}
// ... also shortcuts
for(PARAM_STRING::const_iterator iterParam = m_abbreviation.begin(); iterParam != m_abbreviation.end(); iterParam++) {
for(PARAM_STRING::const_iterator iterParam = m_abbreviation.begin();
iterParam != m_abbreviation.end(); iterParam++) {
const string paramName = iterParam->first;
const string paramShortName = iterParam->second;
OverwriteParam("-" + paramShortName, paramName, argc, argv);
@ -294,7 +296,8 @@ bool Parameter::LoadParam(int argc, char* argv[])
verbose = Scan<int>(m_setting["verbose"][0]);
if (verbose >= 1) { // only if verbose
TRACE_ERR( "Defined parameters (per moses.ini or switch):" << endl);
for(PARAM_MAP::const_iterator iterParam = m_setting.begin() ; iterParam != m_setting.end(); iterParam++) {
for(PARAM_MAP::const_iterator iterParam = m_setting.begin() ;
iterParam != m_setting.end(); iterParam++) {
TRACE_ERR( "\t" << iterParam->first << ": ");
for ( size_t i = 0; i < iterParam->second.size(); i++ )
TRACE_ERR( iterParam->second[i] << " ");
@ -303,7 +306,8 @@ bool Parameter::LoadParam(int argc, char* argv[])
}
// convert old weights args to new format
if (!isParamSpecified("feature"))
// WHAT IS GOING ON HERE??? - UG
if (!isParamSpecified("feature")) // UG
ConvertWeightArgs();
CreateWeightsMap();
WeightOverwrite();
@ -329,11 +333,11 @@ std::vector<float> &Parameter::GetWeights(const std::string &name)
{
std::vector<float> &ret = m_weights[name];
cerr << "WEIGHT " << name << "=";
for (size_t i = 0; i < ret.size(); ++i) {
cerr << ret[i] << ",";
}
cerr << endl;
// cerr << "WEIGHT " << name << "=";
// for (size_t i = 0; i < ret.size(); ++i) {
// cerr << ret[i] << ",";
// }
// cerr << endl;
return ret;
}
@ -355,7 +359,10 @@ void Parameter::SetWeight(const std::string &name, size_t ind, const vector<floa
newWeights.push_back(line);
}
void Parameter::AddWeight(const std::string &name, size_t ind, const std::vector<float> &weights)
void
Parameter::
AddWeight(const std::string &name, size_t ind,
const std::vector<float> &weights)
{
PARAM_VEC &newWeights = m_setting["weight"];
@ -480,6 +487,12 @@ void Parameter::ConvertWeightArgsPhraseModel(const string &oldWeightName)
case Compact:
ptType = "PhraseDictionaryCompact";
break;
case SuffixArray:
ptType = "PhraseDictionarySuffixArray";
break;
case DSuffixArray:
ptType = "PhraseDictionaryDynSuffixArray";
break;
default:
break;
}
@ -505,6 +518,9 @@ void Parameter::ConvertWeightArgsPhraseModel(const string &oldWeightName)
++currOldInd;
}
// cerr << weights.size() << " PHRASE TABLE WEIGHTS "
// << __FILE__ << ":" << __LINE__ << endl;
AddWeight(ptType, ptInd, weights);
// actual pt
@ -531,7 +547,7 @@ void Parameter::ConvertWeightArgsPhraseModel(const string &oldWeightName)
ptLine << "num-input-features=" << (currDict==0 ? numInputScores + numRealWordsInInput : 0) << " ";
ptLine << "table-limit=" << maxTargetPhrase[currDict] << " ";
if (implementation == SuffixArray) {
if (implementation == SuffixArray || implementation == DSuffixArray) {
ptLine << "target-path=" << token[5] << " ";
ptLine << "alignment-path=" << token[6] << " ";
}

View File

@ -26,6 +26,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.h"
#include "moses/TranslationModel/RuleTable/PhraseDictionarySCFG.h"
#include "moses/TranslationModel/CompactPT/PhraseDictionaryCompact.h"
#include "moses/TranslationModel/PhraseDictionaryDynSuffixArray.h"
#include "moses/TranslationModel/PhraseDictionaryMultiModel.h"
#include "moses/TranslationModel/PhraseDictionaryMultiModelCounts.h"
#include "DecodeStepTranslation.h"
@ -56,9 +57,9 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "ScoreComponentCollection.h"
#include "LM/Ken.h"
//#ifdef LM_IRST
#ifdef LM_IRST
#include "LM/IRST.h"
//#endif
#endif
#ifdef HAVE_SYNLM
#include "SyntacticLanguageModel.h"
@ -108,16 +109,16 @@ int GetFeatureIndex(std::map<string, int> &map, const string &featureName)
StaticData StaticData::s_instance;
StaticData::StaticData()
:m_sourceStartPosMattersForRecombination(false)
:m_numRealWordsInInput(0)
,m_sourceStartPosMattersForRecombination(false)
,m_inputType(SentenceInput)
,m_numInputScores(0)
,m_detailedTranslationReportingFilePath()
,m_onlyDistinctNBest(false)
,m_needAlignmentInfo(false)
,m_factorDelimiter("|") // default delimiter between factors
,m_lmEnableOOVFeature(false)
,m_isAlwaysCreateDirectTranslationOption(false)
,m_needAlignmentInfo(false)
,m_numInputScores(0)
,m_numRealWordsInInput(0)
{
m_xmlBrackets.first="<";
m_xmlBrackets.second=">";
@ -534,11 +535,16 @@ bool StaticData::LoadData(Parameter *parameter)
}
// use of xml in input
if (m_parameter->GetParam("xml-input").size() == 0) m_xmlInputType = XmlPassThrough;
else if (m_parameter->GetParam("xml-input")[0]=="exclusive") m_xmlInputType = XmlExclusive;
else if (m_parameter->GetParam("xml-input")[0]=="inclusive") m_xmlInputType = XmlInclusive;
else if (m_parameter->GetParam("xml-input")[0]=="ignore") m_xmlInputType = XmlIgnore;
else if (m_parameter->GetParam("xml-input")[0]=="pass-through") m_xmlInputType = XmlPassThrough;
if (m_parameter->GetParam("xml-input").size() == 0)
m_xmlInputType = XmlPassThrough;
else if (m_parameter->GetParam("xml-input")[0]=="exclusive")
m_xmlInputType = XmlExclusive;
else if (m_parameter->GetParam("xml-input")[0]=="inclusive")
m_xmlInputType = XmlInclusive;
else if (m_parameter->GetParam("xml-input")[0]=="ignore")
m_xmlInputType = XmlIgnore;
else if (m_parameter->GetParam("xml-input")[0]=="pass-through")
m_xmlInputType = XmlPassThrough;
else {
UserMessage::Add("invalid xml-input value, must be pass-through, exclusive, inclusive, or ignore");
return false;
@ -569,7 +575,8 @@ bool StaticData::LoadData(Parameter *parameter)
vector<string> toks = Tokenize(line);
const string &feature = toks[0];
int featureIndex = GetFeatureIndex(featureIndexMap, feature);
// not used:
// int featureIndex = GetFeatureIndex(featureIndexMap, feature);
if (feature == "GlobalLexicalModel") {
GlobalLexicalModel *model = new GlobalLexicalModel(line);
@ -631,13 +638,13 @@ bool StaticData::LoadData(Parameter *parameter)
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
SetWeights(model, weights);
}
//#ifdef LM_IRST
#ifdef LM_IRST
else if (feature == "IRSTLM") {
LanguageModelIRST *model = new LanguageModelIRST(line);
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
SetWeights(model, weights);
}
//#endif
#endif
else if (feature == "Generation") {
GenerationDictionary *model = new GenerationDictionary(line);
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
@ -700,10 +707,22 @@ bool StaticData::LoadData(Parameter *parameter)
else if (feature == "PhraseDictionaryMultiModelCounts") {
PhraseDictionaryMultiModelCounts* model = new PhraseDictionaryMultiModelCounts(line);
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
SetWeights(model, weights);
m_phraseDictionary.push_back(model);
}
else if (feature == "PhraseDictionaryDynSuffixArray") {
PhraseDictionaryDynSuffixArray* model;
model = new PhraseDictionaryDynSuffixArray(line);
string const& mdesc = model->GetScoreProducerDescription();
vector<float> weights = m_parameter->GetWeights(mdesc);
cerr << "PhraseDictionaryDynSuffixArray with "
<< weights.size() << " weights. ("
<< model->GetScoreProducerDescription() << ")"
<< endl;
SetWeights(model, weights);
m_phraseDictionary.push_back(model);
}
#ifdef HAVE_SYNLM
else if (feature == "SyntacticLanguageModel") {
SyntacticLanguageModel *model = new SyntacticLanguageModel(line);

File diff suppressed because it is too large Load Diff

View File

@ -5,152 +5,176 @@
#include "moses/TranslationModel/DynSAInclude/vocab.h"
#include "moses/TranslationModel/DynSAInclude/types.h"
#include "moses/TranslationModel/DynSAInclude/utils.h"
#include "moses/TranslationModel/WordCoocTable.h"
#include "moses/InputFileStream.h"
#include "moses/FactorTypeSet.h"
#include "moses/TargetPhrase.h"
#include <boost/dynamic_bitset.hpp>
#include "moses/TargetPhraseCollection.h"
#include <map>
namespace Moses {
/** @todo ask Abbey Levenberg
*/
class SAPhrase
using namespace std;
namespace Moses
{
public:
std::vector<wordID_t> words;
class PhraseDictionaryDynSuffixArray;
SAPhrase(size_t phraseSize)
:words(phraseSize)
{}
/** @todo ask Abbey Levenberg
*/
class SAPhrase
{
public:
vector<wordID_t> words;
void SetId(size_t pos, wordID_t id)
{
CHECK(pos < words.size());
words[pos] = id;
}
bool operator<(const SAPhrase& phr2) const
{ return words < phr2.words; }
};
SAPhrase(size_t phraseSize)
:words(phraseSize)
{}
/** @todo ask Abbey Levenberg
*/
class PhrasePair
{
public:
int m_startTarget, m_endTarget, m_startSource, m_endSource, m_sntIndex;
PhrasePair(int startTarget, int endTarget, int startSource, int endSource, int sntIndex)
: m_startTarget(startTarget)
, m_endTarget(endTarget)
, m_startSource(startSource)
, m_endSource(endSource)
, m_sntIndex(sntIndex)
{}
size_t GetTargetSize() const
{ return m_endTarget - m_startTarget + 1; }
};
/** @todo ask Abbey Levenberg
*/
class SentenceAlignment
{
public:
SentenceAlignment(int sntIndex, int sourceSize, int targetSize);
int m_sntIndex;
std::vector<wordID_t>* trgSnt;
std::vector<wordID_t>* srcSnt;
std::vector<int> numberAligned;
std::vector< std::vector<int> > alignedList;
bool Extract(int maxPhraseLength, std::vector<PhrasePair*> &ret, int startSource, int endSource) const;
};
class ScoresComp {
public:
ScoresComp(const std::vector<float>& weights): m_weights(weights) {}
bool operator()(const Scores& s1, const Scores& s2) const {
return s1[0] < s2[0]; // just p(e|f) as approximation
/*float score1(0), score2(0);
int idx1(0), idx2(0);
for (Scores::const_iterator itr = s1.begin();
itr != s1.end(); ++itr) {
score1 += log(*itr * m_weights.at(idx1++));
void SetId(size_t pos, wordID_t id)
{
CHECK(pos < words.size());
words[pos] = id;
}
for (Scores::const_iterator itr = s2.begin();
itr != s2.end(); ++itr) {
score2 += log(*itr * m_weights.at(idx2++));
bool operator<(const SAPhrase& phr2) const
{ return words < phr2.words; }
};
/** @todo ask Abbey Levenberg
*/
class PhrasePair
{
public:
int m_startTarget, m_endTarget, m_startSource, m_endSource, m_sntIndex;
PhrasePair(int startTarget, int endTarget, int startSource, int endSource, int sntIndex)
: m_startTarget(startTarget)
, m_endTarget(endTarget)
, m_startSource(startSource)
, m_endSource(endSource)
, m_sntIndex(sntIndex)
{}
size_t GetTargetSize() const
{ return m_endTarget - m_startTarget + 1; }
size_t GetSourceSize() const
{ return m_endSource - m_startSource + 1; }
};
/** @todo ask Abbey Levenberg
*/
class SentenceAlignment
{
public:
SentenceAlignment(int sntIndex, int sourceSize, int targetSize);
int m_sntIndex;
vector<wordID_t>* trgSnt;
vector<wordID_t>* srcSnt;
vector<int> numberAligned;
vector< vector<int> > alignedList;
bool Extract(int maxPhraseLength, vector<PhrasePair*> &ret,
int startSource, int endSource) const;
};
class ScoresComp {
public:
ScoresComp(const vector<float>& weights): m_weights(weights) {}
bool operator()(const Scores& s1, const Scores& s2) const {
return s1[0] < s2[0]; // just p(e|f) as approximation
// float score1(0), score2(0);
// int idx1(0), idx2(0);
// for (Scores::const_iterator itr = s1.begin();
// itr != s1.end(); ++itr) {
// score1 += log(*itr * m_weights.at(idx1++));
// }
// for (Scores::const_iterator itr = s2.begin();
// itr != s2.end(); ++itr) {
// score2 += log(*itr * m_weights.at(idx2++));
// }
// return score1 < score2;
}
return score1 < score2;*/
}
private:
const std::vector<float>& m_weights;
};
private:
const vector<float>& m_weights;
};
/** @todo ask Abbey Levenberg
*/
class BilingualDynSuffixArray {
public:
BilingualDynSuffixArray();
~BilingualDynSuffixArray();
bool Load( const std::vector<FactorType>& inputFactors,
const std::vector<FactorType>& outputTactors,
std::string source, std::string target, std::string alignments,
const std::vector<float> &weight);
bool LoadTM( const std::vector<FactorType>& inputFactors,
const std::vector<FactorType>& outputTactors,
std::string source, std::string target, std::string alignments,
const std::vector<float> &weight);
void GetTargetPhrasesByLexicalWeight(const Phrase& src, std::vector< std::pair<Scores, TargetPhrase*> >& target) const;
void addSntPair(string& source, string& target, string& alignment);
private:
DynSuffixArray* m_srcSA;
DynSuffixArray* m_trgSA;
std::vector<wordID_t>* m_srcCorpus;
std::vector<wordID_t>* m_trgCorpus;
std::vector<FactorType> m_inputFactors;
std::vector<FactorType> m_outputFactors;
struct BetterPhrase
{
ScoresComp const& cmp;
BetterPhrase(ScoresComp const& sc);
// bool operator()(pair<Scores, TargetPhrase const*> const& a,
// pair<Scores, TargetPhrase const*> const& b) const;
bool operator()(pair<Scores, SAPhrase const*> const& a,
pair<Scores, SAPhrase const*> const& b) const;
};
std::vector<unsigned> m_srcSntBreaks, m_trgSntBreaks;
/** @todo ask Abbey Levenberg
*/
class BilingualDynSuffixArray {
public:
BilingualDynSuffixArray();
~BilingualDynSuffixArray();
bool Load( const vector<FactorType>& inputFactors,
const vector<FactorType>& outputTactors,
string source, string target, string alignments,
const vector<float> &weight);
// bool LoadTM( const vector<FactorType>& inputFactors,
// const vector<FactorType>& outputTactors,
// string source, string target, string alignments,
// const vector<float> &weight);
void GetTargetPhrasesByLexicalWeight(const Phrase& src, vector< pair<Scores, TargetPhrase*> >& target) const;
Vocab* m_srcVocab, *m_trgVocab;
ScoresComp* m_scoreCmp;
void CleanUp(const InputType& source);
void addSntPair(string& source, string& target, string& alignment);
pair<float,float>
GatherCands(Phrase const& src, map<SAPhrase, vector<float> >& pstats) const;
std::vector<SentenceAlignment> m_alignments;
std::vector<std::vector<short> > m_rawAlignments;
TargetPhrase*
GetMosesFactorIDs(const SAPhrase&, const Phrase& sourcePhrase) const;
mutable std::map<std::pair<wordID_t, wordID_t>, std::pair<float, float> > m_wordPairCache;
mutable std::set<wordID_t> m_freqWordsCached;
const size_t m_maxPhraseLength, m_maxSampleSize;
private:
int LoadCorpus(FactorDirection direction, InputFileStream&, const std::vector<FactorType>& factors,
std::vector<wordID_t>&, std::vector<wordID_t>&,
Vocab*);
int LoadAlignments(InputFileStream& aligs);
int LoadRawAlignments(InputFileStream& aligs);
int LoadRawAlignments(string& aligs);
bool ExtractPhrases(const int&, const int&, const int&, std::vector<PhrasePair*>&, bool=false) const;
SentenceAlignment GetSentenceAlignment(const int, bool=false) const;
int SampleSelection(std::vector<unsigned>&, int = 300) const;
mutable WordCoocTable m_wrd_cooc;
DynSuffixArray * m_srcSA;
DynSuffixArray * m_trgSA;
vector<wordID_t>* m_srcCorpus;
vector<wordID_t>* m_trgCorpus;
vector<FactorType> m_inputFactors;
vector<FactorType> m_outputFactors;
std::vector<int> GetSntIndexes(std::vector<unsigned>&, int, const std::vector<unsigned>&) const;
TargetPhrase* GetMosesFactorIDs(const SAPhrase&, const Phrase& sourcePhrase) const;
SAPhrase TrgPhraseFromSntIdx(const PhrasePair&) const;
bool GetLocalVocabIDs(const Phrase&, SAPhrase &) const;
void CacheWordProbs(wordID_t) const;
void CacheFreqWords() const;
void ClearWordInCache(wordID_t);
std::pair<float, float> GetLexicalWeight(const PhrasePair&) const;
vector<unsigned> m_srcSntBreaks, m_trgSntBreaks;
int GetSourceSentenceSize(size_t sentenceId) const
{
return (sentenceId==m_srcSntBreaks.size()-1) ?
m_srcCorpus->size() - m_srcSntBreaks.at(sentenceId) :
m_srcSntBreaks.at(sentenceId+1) - m_srcSntBreaks.at(sentenceId);
}
int GetTargetSentenceSize(size_t sentenceId) const
{
return (sentenceId==m_trgSntBreaks.size()-1) ?
m_trgCorpus->size() - m_trgSntBreaks.at(sentenceId) :
m_trgSntBreaks.at(sentenceId+1) - m_trgSntBreaks.at(sentenceId);
}
};
Vocab* m_srcVocab, *m_trgVocab;
ScoresComp* m_scoreCmp;
vector<SentenceAlignment> m_alignments;
vector<vector<short> > m_rawAlignments;
mutable map<pair<wordID_t, wordID_t>, pair<float, float> > m_wordPairCache;
mutable set<wordID_t> m_freqWordsCached;
const size_t m_maxPhraseLength, m_maxSampleSize;
const size_t m_maxPTEntries;
int LoadCorpus(FactorDirection direction,
InputFileStream&, const vector<FactorType>& factors,
vector<wordID_t>&, vector<wordID_t>&,
Vocab*);
int LoadAlignments(InputFileStream& aligs);
int LoadRawAlignments(InputFileStream& aligs);
int LoadRawAlignments(string& aligs);
bool ExtractPhrases(const int&, const int&, const int&, vector<PhrasePair*>&, bool=false) const;
SentenceAlignment GetSentenceAlignment(const int, bool=false) const;
int SampleSelection(vector<unsigned>&, int = 300) const;
vector<int> GetSntIndexes(vector<unsigned>&, int, const vector<unsigned>&) const;
SAPhrase TrgPhraseFromSntIdx(const PhrasePair&) const;
bool GetLocalVocabIDs(const Phrase&, SAPhrase &) const;
void CacheWordProbs(wordID_t) const;
void CacheFreqWords() const;
void ClearWordInCache(wordID_t);
pair<float, float> GetLexicalWeight(const PhrasePair&) const;
int GetSourceSentenceSize(size_t sentenceId) const;
int GetTargetSentenceSize(size_t sentenceId) const;
};
} // end namespace
#endif

View File

@ -11,8 +11,24 @@
namespace Moses
{
using namespace std;
typedef std::vector<unsigned> vuint_t;
/// compare position /i/ in the suffix array /m_sfa/ into corpus /m_crp/
/// against reference phrase /phrase/
// added by Ulrich Germann
class ComparePosition
{
vuint_t const& m_crp;
vuint_t const& m_sfa;
public:
ComparePosition(vuint_t const& crp, vuint_t const& sfa);
bool operator()(unsigned const& i, vector<wordID_t> const& phrase) const;
bool operator()(vector<wordID_t> const& phrase, unsigned const& i) const;
};
typedef std::vector<unsigned> vuint_t;
/** @todo ask Abbey Levenberg
*/
@ -30,6 +46,8 @@ public:
void Delete(unsigned, unsigned);
void Substitute(vuint_t*, unsigned);
size_t GetCount(vuint_t const& phrase) const;
private:
vuint_t* m_SA;
vuint_t* m_ISA;
@ -46,10 +64,10 @@ private:
void PrintAuxArrays() {
std::cerr << "SA\tISA\tF\tL\n";
for(size_t i=0; i < m_SA->size(); ++i)
std::cerr << m_SA->at(i) << "\t" << m_ISA->at(i) << "\t" << m_F->at(i) << "\t" << m_L->at(i) << std::endl;
std::cerr << m_SA->at(i) << "\t" << m_ISA->at(i) << "\t"
<< m_F->at(i) << "\t" << m_L->at(i) << std::endl;
}
};
} //end namespace
#endif

View File

@ -0,0 +1,4 @@
Specifying Dynamic Suffix Array-based Phrase Tables in moses.ini
[ttable-file]
14 0 0 5 <source language text file> <target language text file> <file with alignment info in symal format>

View File

@ -3,76 +3,92 @@
#include "moses/StaticData.h"
#include "moses/TargetPhrase.h"
#include <iomanip>
#include <boost/foreach.hpp>
using namespace std;
namespace Moses
{
PhraseDictionaryDynSuffixArray::PhraseDictionaryDynSuffixArray(const std::string &line)
:PhraseDictionary("PhraseDictionaryDynSuffixArray", line)
PhraseDictionaryDynSuffixArray::
PhraseDictionaryDynSuffixArray(const std::string &line)
: PhraseDictionary("PhraseDictionaryDynSuffixArray", line)
{
m_biSA = new BilingualDynSuffixArray();
}
PhraseDictionaryDynSuffixArray::~PhraseDictionaryDynSuffixArray()
bool
PhraseDictionaryDynSuffixArray::
InitDictionary()
{
delete m_biSA;
}
m_weightWP = StaticData::Instance().GetWeightWordPenalty();
vector<float> weight = StaticData::Instance().GetWeights(this);
bool PhraseDictionaryDynSuffixArray::Load(const std::vector<FactorType>& input,
const std::vector<FactorType>& output,
string source, string target, string alignments,
const std::vector<float> &weight,
size_t tableLimit,
const LMList &languageModels,
float weightWP)
{
m_tableLimit = tableLimit;
m_languageModels = &languageModels;
m_weight = weight;
m_weightWP = weightWP;
m_biSA->Load( input, output, source, target, alignments, weight);
m_biSA->Load(m_input, m_output, m_filePath, m_targetFile,
m_alignmentsFile, weight);
return true;
}
const TargetPhraseCollection *PhraseDictionaryDynSuffixArray::GetTargetPhraseCollection(const Phrase& src) const
PhraseDictionaryDynSuffixArray::
~PhraseDictionaryDynSuffixArray()
{
delete m_biSA;
}
const TargetPhraseCollection*
PhraseDictionaryDynSuffixArray::
GetTargetPhraseCollection(const Phrase& src) const
{
typedef map<SAPhrase, vector<float> >::value_type pstat_entry;
map<SAPhrase, vector<float> > pstats; // phrase (pair) statistics
m_biSA->GatherCands(src,pstats);
TargetPhraseCollection *ret = new TargetPhraseCollection();
std::vector< std::pair< Scores, TargetPhrase*> > trg;
// extract target phrases and their scores from suffix array
m_biSA->GetTargetPhrasesByLexicalWeight( src, trg);
std::vector< std::pair< Scores, TargetPhrase*> >::iterator itr;
for(itr = trg.begin(); itr != trg.end(); ++itr) {
Scores scoreVector = itr->first;
TargetPhrase *targetPhrase = itr->second;
//std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),NegateScore);
std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),FloorScore);
targetPhrase->GetScoreBreakdown().Assign(this, scoreVector);
targetPhrase->Evaluate();
//cout << *targetPhrase << "\t" << std::setprecision(8) << scoreVector[2] << endl;
ret->Add(targetPhrase);
}
ret->NthElement(m_tableLimit); // sort the phrases for the dcoder
BOOST_FOREACH(pstat_entry & e, pstats)
{
TargetPhrase* tp = m_biSA->GetMosesFactorIDs(e.first, src);
tp->GetScoreBreakdown().Assign(this,e.second);
ret->Add(tp);
}
// return ret;
// TargetPhraseCollection *ret = new TargetPhraseCollection();
// std::vector< std::pair< Scores, TargetPhrase*> > trg;
//
// // extract target phrases and their scores from suffix array
// m_biSA->GetTargetPhrasesByLexicalWeight(src, trg);
//
// std::vector< std::pair< Scores, TargetPhrase*> >::iterator itr;
// for(itr = trg.begin(); itr != trg.end(); ++itr) {
// Scores scoreVector = itr->first;
// TargetPhrase *targetPhrase = itr->second;
// std::transform(scoreVector.begin(),scoreVector.end(),
// scoreVector.begin(),FloorScore);
// targetPhrase->GetScoreBreakdown().Assign(this, scoreVector);
// targetPhrase->Evaluate();
// ret->Add(targetPhrase);
// }
ret->NthElement(m_tableLimit); // sort the phrases for the decoder
return ret;
}
void PhraseDictionaryDynSuffixArray::insertSnt(string& source, string& target, string& alignment)
void
PhraseDictionaryDynSuffixArray::
insertSnt(string& source, string& target, string& alignment)
{
m_biSA->addSntPair(source, target, alignment); // insert sentence pair into suffix arrays
//StaticData::Instance().ClearTransOptionCache(); // clear translation option cache
}
void PhraseDictionaryDynSuffixArray::deleteSnt(unsigned /* idx */, unsigned /* num2Del */)
void
PhraseDictionaryDynSuffixArray::
deleteSnt(unsigned /* idx */, unsigned /* num2Del */)
{
// need to implement --
}
ChartRuleLookupManager *PhraseDictionaryDynSuffixArray::CreateRuleLookupManager(const InputType&, const ChartCellCollectionBase&)
ChartRuleLookupManager*
PhraseDictionaryDynSuffixArray::
CreateRuleLookupManager(const InputType&, const ChartCellCollectionBase&)
{
CHECK(false);
return 0;

View File

@ -17,15 +17,18 @@ class PhraseDictionaryDynSuffixArray: public PhraseDictionary
public:
PhraseDictionaryDynSuffixArray(const std::string &line);
~PhraseDictionaryDynSuffixArray();
bool Load( const std::vector<FactorType>& m_input
, const std::vector<FactorType>& m_output
, std::string m_source
, std::string m_target
, std::string m_alignments
, const std::vector<float> &m_weight
, size_t m_tableLimit
, const LMList &languageModels
, float weightWP);
bool InitDictionary();
bool Load(
// const std::vector<FactorType>& m_input
// , const std::vector<FactorType>& m_output
// , std::string m_source
// , std::string m_target
// , std::string m_alignments
// , const std::vector<float> &m_weight
// , size_t m_tableLimit
// , const LMList &languageModels
// , float weightWP
);
// functions below required by base class
const TargetPhraseCollection* GetTargetPhraseCollection(const Phrase& src) const;
void insertSnt(string&, string&, string&);

View File

@ -0,0 +1,72 @@
#include "moses/TranslationModel/WordCoocTable.h"
using namespace std;
namespace Moses {
// Default constructor: start with empty tables but pre-allocate room
// for a large vocabulary to limit reallocation churn as Count() grows
// the tables on demand.
WordCoocTable::
WordCoocTable()
{
  size_t const initial_capacity = 1000000;
  m_cooc.reserve(initial_capacity);
  m_marg1.reserve(initial_capacity);
  m_marg2.reserve(initial_capacity);
}
// Pre-size all tables for known vocabulary sizes; all counts start at 0.
WordCoocTable::
WordCoocTable(wordID_t const VocabSize1, wordID_t const VocabSize2)
  : m_cooc(VocabSize1), m_marg1(VocabSize1,0), m_marg2(VocabSize2, 0)
{}
// Record one cooccurrence of word ids /a/ (side 1) and /b/ (side 2),
// growing the tables on demand so ids beyond the initial vocabulary
// size can be counted. m_cooc and m_marg1 are kept the same length.
void
WordCoocTable::
Count(size_t const a, size_t const b)
{
  if (a >= m_marg1.size())
    {
      m_cooc.resize(a + 1);      // new rows are empty maps
      m_marg1.resize(a + 1, 0);  // new marginals start at zero
    }
  if (b >= m_marg2.size())
    m_marg2.resize(b + 1, 0);
  ++m_marg1[a];
  ++m_marg2[b];
  ++m_cooc[a][b];
}
// Joint count of the word pair (a,b); ids outside the tables have
// never been counted together, so they yield 0.
uint32_t
WordCoocTable::
GetJoint(size_t const a, size_t const b) const
{
  if (a >= m_marg1.size() || b >= m_marg2.size())
    return 0;
  my_map_t const& row = m_cooc.at(a);
  my_map_t::const_iterator it = row.find(b);
  return it == row.end() ? 0 : it->second;
}
// Marginal count of id /x/ on side 1; ids never counted return 0.
uint32_t
WordCoocTable::
GetMarg1(size_t const x) const
{
  if (x < m_marg1.size())
    return m_marg1[x];
  return 0;
}
// Marginal count of id /x/ on side 2; ids never counted return 0.
uint32_t
WordCoocTable::
GetMarg2(size_t const x) const
{
  if (x < m_marg2.size())
    return m_marg2[x];
  return 0;
}
// "Forward" conditional relative frequency: joint(a,b) / marg1(a).
// Returns 0 when /a/ has never been observed; the unguarded division
// previously produced NaN (0/0) or inf for unseen ids.
float
WordCoocTable::
pfwd(size_t const a, size_t const b) const
{
  uint32_t const marg = GetMarg1(a);
  if (marg == 0) return 0;
  return float(GetJoint(a,b)) / marg;
}
// "Backward" conditional relative frequency: joint(a,b) / marg2(b).
// Returns 0 when /b/ has never been observed; the unguarded division
// previously produced NaN (0/0) or inf for unseen ids.
float
WordCoocTable::
pbwd(size_t const a, size_t const b) const
{
  uint32_t const marg = GetMarg2(b);
  if (marg == 0) return 0;
  return float(GetJoint(a,b)) / marg;
}
}

View File

@ -0,0 +1,72 @@
#ifndef moses_WordCoocTable_h
#define moses_WordCoocTable_h
#include "moses/TranslationModel/DynSAInclude/vocab.h"
#include "moses/TranslationModel/DynSAInclude/types.h"
#include "moses/TranslationModel/DynSAInclude/utils.h"
#include "moses/InputFileStream.h"
#include "moses/FactorTypeSet.h"
#include "moses/TargetPhrase.h"
#include <boost/dynamic_bitset.hpp>
#include <map>
namespace Moses {
using namespace std;
#ifndef bitvector
typedef boost::dynamic_bitset<uint64_t> bitvector;
#endif
/**
* Stores word cooccurrence counts
* @todo ask Uli Germann
*/
class WordCoocTable
{
  // one sparse row per word id of side 1:
  // m_cooc[a][b] = number of times a and b were counted together
  typedef map<wordID_t,uint32_t> my_map_t;
  vector<my_map_t> m_cooc;
  vector<uint32_t> m_marg1; // marginal counts per word id on side 1
  vector<uint32_t> m_marg2; // marginal counts per word id on side 2
public:
  WordCoocTable();
  // pre-size the tables for known vocabulary sizes
  WordCoocTable(wordID_t const VocabSize1, wordID_t const VocabSize2);
  // joint count of the pair (a,b); 0 if either id was never counted
  uint32_t GetJoint(size_t const a, size_t const b) const;
  // marginal count of id x on side 1; 0 if out of range
  uint32_t GetMarg1(size_t const x) const;
  // marginal count of id x on side 2; 0 if out of range
  uint32_t GetMarg2(size_t const x) const;
  // "forward" conditional relative frequency: joint(a,b) / marg1(a)
  float pfwd(size_t const a, size_t const b) const;
  // "backward" conditional relative frequency: joint(a,b) / marg2(b)
  float pbwd(size_t const a, size_t const b) const;
  // count a single word pair, growing the tables as needed
  void
  Count(size_t const a, size_t const b);
  // count all word pairs of an aligned sentence pair /s1/,/s2/;
  // /aln/ is a flat list of (pos-in-s1, pos-in-s2) index pairs;
  // unaligned words are counted against NULL1/NULL2
  template<typename idvec, typename alnvec>
  void
  Count(idvec const& s1, idvec const& s2, alnvec const& aln,
        wordID_t const NULL1, wordID_t const NULL2);
};
// Count all word pairs of the aligned sentence pair /s1/,/s2/.
// /aln/ is a flat sequence of (pos-in-s1, pos-in-s2) index pairs.
// Positions left uncovered by /aln/ are counted as cooccurring with
// the respective NULL word id (NULL2 for s1 words, NULL1 for s2 words).
template<typename idvec, typename alnvec>
void
WordCoocTable::
Count(idvec const& s1, idvec const& s2, alnvec const& aln,
      wordID_t const NULL1, wordID_t const NULL2)
{
  // bit i set <=> position i is (still) unaligned
  boost::dynamic_bitset<uint64_t> check1(s1.size()), check2(s2.size());
  check1.set();
  check2.set();
  // guard i+1 < size(): an odd-length alignment vector would otherwise
  // read aln[i+1] one element past the end
  for (size_t i = 0; i + 1 < aln.size(); i += 2)
    {
      Count(s1[aln[i]], s2[aln[i+1]]);
      check1.reset(aln[i]);
      check2.reset(aln[i+1]);
    }
  for (size_t i = check1.find_first(); i < check1.size(); i = check1.find_next(i))
    Count(s1[i], NULL2);
  for (size_t i = check2.find_first(); i < check2.size(); i = check2.find_next(i))
    Count(NULL1, s2[i]);
}
}
#endif

View File

@ -0,0 +1,55 @@
#ifndef __sampling_h
#define __sampling_h
// Utility functions for proper sub-sampling.
// (c) 2007-2012 Ulrich Germann
namespace Moses
{
// Draw a pseudo-random integer uniformly from [0,N) using rand().
inline
size_t
randInt(size_t N)
{
  double const u = rand() / (RAND_MAX + 1.);  // u in [0,1)
  return size_t(N * u);
}
// select a random sample of size /s/ without replacement from the range
// of integers [0,N);
// Select a random sample of /s/ distinct integers (sampling without
// replacement) from the range [0,N), storing them in /v/.
template<typename idx_t>
void
randomSample(vector<idx_t>& v, size_t s, size_t N)
{
  // see also Knuth: Art of Computer Programming Vol. 2, p. 142
  s = min(s,N);
  v.resize(s);
  // the first option tries to be a bit more efficient than O(N) in picking
  // the samples. The threshold is an ad-hoc, off-the-cuff guess. I still
  // need to figure out the optimal break-even point between a linear sweep
  // and repeatedly picking random numbers with the risk of hitting the same
  // number many times.
  if (s*10<N)
    {
      // rejection sampling: re-draw on collision (cheap while s << N)
      boost::dynamic_bitset<uint64_t> check(N,0);
      for (size_t i = 0; i < v.size(); i++)
        {
          size_t x = randInt(N);
          while (check[x]) x = randInt(N);
          check[x]=true;
          v[i] = x;
        }
    }
  else
    {
      // linear sweep; each t is selected with probability (s-m)/(N-t).
      // Stop as soon as the sample is full (m == s); the previous
      // condition (m <= s) kept scanning the remainder of the range
      // doing nothing useful.
      size_t m=0;
      for (size_t t = 0; m < s && t < N; t++)
        if (s==N || randInt(N-t) < s-m) v[m++] = t;
    }
}
};
#endif

View File

@ -0,0 +1,89 @@
#ifndef __n_best_list_h
#define __n_best_list_h
#include <algorithm>
#include "moses/generic/sorting/VectorIndexSorter.h"
// NBest List; (c) 2007-2012 Ulrich Germann
//
// The 'trick' used in this implementation is to maintain a heap of size <= N
// such that the lowest-scoring item is on top of the heap. For each incoming
// item we can then determine easily if it is in the top N.
namespace Moses
{
using namespace std;
template<typename THINGY, typename CMP>
class
NBestList
{
  // indices into m_list, kept as a heap with the lowest-ranked item on top
  vector<uint32_t> m_heap;
  // the stored items themselves
  vector<THINGY> m_list;
  // ranks indices by the items they refer to (see VectorIndexSorter)
  VectorIndexSorter<THINGY, CMP, uint32_t> m_better;
  // lazily computed best-first ordering of the indices in m_heap
  mutable vector<uint32_t> m_order;
  // true iff m_order is stale w.r.t. m_heap/m_list
  mutable bool m_changed;
public:
  NBestList(size_t const max_size, CMP const& cmp);
  NBestList(size_t const max_size);
  // add /item/ if it ranks among the top max_size items seen so far;
  // returns true iff it was added
  bool add(THINGY const& item);
  // i-th best item; a negative i counts from the end of the list
  THINGY const& operator[](int i) const;
  size_t size() const { return m_heap.size(); }
};
// Construct an n-best list holding at most /max_size/ items, ranked
// with the externally supplied comparator /cmp/.
template<typename THINGY, typename CMP>
NBestList<THINGY,CMP>::
NBestList(size_t const max_size, CMP const& cmp)
  : m_better(m_list, cmp), m_changed(false)
{
  // reserve() fixes the capacity, which add() uses as the size limit
  m_heap.reserve(max_size);
}
// Construct an n-best list holding at most /max_size/ items, ranked
// with a default-constructed comparator.
template<typename THINGY, typename CMP>
NBestList<THINGY,CMP>::
NBestList(size_t const max_size)
  : m_better(m_list), m_changed(false)
    // was m_better(m_heap): the sorter must rank indices by the items
    // in m_list (a vector<THINGY>), not by the index vector itself;
    // m_heap only type-checks when THINGY happens to be uint32_t, and
    // even then it would compare index values instead of items.
{
  // reserve() fixes the capacity, which add() uses as the size limit
  m_heap.reserve(max_size);
}
template<typename THINGY, typename CMP>
bool
NBestList<THINGY,CMP>::
add(THINGY const& item)
{
  // The heap's capacity (fixed by reserve() in the constructors)
  // doubles as the maximum list size N.
  if (m_heap.size() == m_heap.capacity())
    {
      // List is full: accept /item/ only if it beats the current worst
      // entry, which sits on top of the heap (index m_heap[0]).
      if (m_better.Compare(item, m_list[m_heap.at(0)]))
        {
          // move the worst index to the back and overwrite its slot
          pop_heap(m_heap.begin(),m_heap.end(),m_better);
          m_list[m_heap.back()] = item;
        }
      else return false;
    }
  else
    {
      // still room: append the item and its index
      m_list.push_back(item);
      m_heap.push_back(m_heap.size());
    }
  // restore the heap property and mark the cached ordering stale
  push_heap(m_heap.begin(),m_heap.end(),m_better);
  return m_changed = true;
}
// Return the i-th best item (0 = best); a negative /i/ counts from the
// end of the list. The sorted view over the heap is computed lazily and
// cached until the next add().
template<typename THINGY, typename CMP>
THINGY const&
NBestList<THINGY,CMP>::
operator[](int i) const
{
  if (m_changed)
    {
      m_order.assign(m_heap.begin(),m_heap.end());
      // Heap-sort the index vector with the same comparator that
      // maintains m_heap. Without passing m_better, pop_heap fell back
      // to operator< on the uint32_t indices and produced an ordering
      // by index value rather than by item rank.
      for (size_t k = m_heap.size(); k != 0; --k)
        pop_heap(m_order.begin(), m_order.begin()+k, m_better);
      m_changed = false;
    }
  if (i < 0) i += m_order.size();
  return m_list[m_order.at(i)];
}
}
#endif

View File

@ -0,0 +1,70 @@
#ifndef __vector_index_sorter_h
#define __vector_index_sorter_h
// VectorIndexSorter; (c) 2007-2012 Ulrich Germann
// A VectorIndexSorter is a function object for sorting indices into a vector
// of objects (instead of sorting the vector itself).
//
// typical use:
// vector<thingy> my_vector;
// VectorIndexSorter<thingy,less<thingy>,int> sorter(my_vector);
// vector<int> order;
// sorter.get_order(order);
namespace Moses
{
// Function object that orders indices into a vector by comparing the
// elements they refer to, using index order as a tie-breaker.
template<typename VAL, typename COMP = greater<VAL>, typename IDX_T=size_t>
class
VectorIndexSorter : public binary_function<IDX_T const&, IDX_T const&, bool>
{
  vector<VAL> const& m_vecref;     // the vector being indexed
  boost::shared_ptr<COMP> m_comp;  // owns the comparator iff we created it
public:
  COMP const& Compare;             // comparator actually used

  // use an externally owned comparator
  VectorIndexSorter(vector<VAL> const& v, COMP const& comp)
    : m_vecref(v), Compare(comp)
  { }

  // default-construct (and own) the comparator
  VectorIndexSorter(vector<VAL> const& v)
    : m_vecref(v), m_comp(new COMP()), Compare(*m_comp)
  { }

  bool operator()(IDX_T const & a, IDX_T const & b) const
  {
    bool const a_first = Compare(m_vecref.at(a), m_vecref.at(b));
    bool const b_first = Compare(m_vecref[b], m_vecref[a]);
    if (a_first != b_first)
      return a_first;
    // tie: fall back to index order so the ordering stays strict-weak
    return a < b;
  }

  boost::shared_ptr<vector<IDX_T> >
  GetOrder() const;

  void
  GetOrder(vector<IDX_T> & order) const;
};
// Return a freshly allocated index vector ordering the underlying
// vector according to operator().
template<typename VAL, typename COMP, typename IDX_T>
boost::shared_ptr<vector<IDX_T> >
VectorIndexSorter<VAL,COMP,IDX_T>::
GetOrder() const
{
  boost::shared_ptr<vector<IDX_T> > ret(new vector<IDX_T>(m_vecref.size()));
  // was get_order(*ret): no such member exists (case typo for GetOrder),
  // so this function failed to compile when instantiated
  GetOrder(*ret);
  return ret;
}
// Fill /order/ with [0, 1, ..., n-1] and sort the indices by the
// values they refer to (see operator()).
template<typename VAL, typename COMP, typename IDX_T>
void
VectorIndexSorter<VAL,COMP,IDX_T>::
GetOrder(vector<IDX_T> & order) const
{
  IDX_T const n = IDX_T(m_vecref.size());
  order.resize(n);
  for (IDX_T k = 0; k < n; ++k)
    order[k] = k;
  sort(order.begin(), order.end(), *this);
}
}
#endif