Merge branch 'nadir_osm' of github.com:moses-smt/mosesdecoder into nadir_osm

2024-09-20 15:48:05 +03:00 · 2013-07-01 11:07:21 +01:00 · 2013-07-01 11:07:21 +01:00 · ba72c70c6e
commit ba72c70c6e
parent 6a915253e1 d0464455db
25 changed files with 1146 additions and 435 deletions
--- a/contrib/other-builds/moses/.project
+++ b/contrib/other-builds/moses/.project
@ -1571,6 +1571,16 @@
 			<type>2</type>
 			<locationURI>virtual:/virtual</locationURI>
 		</link>
+		<link>
+			<name>TranslationModel/WordCoocTable.cpp</name>
+			<type>1</type>
+			<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/WordCoocTable.cpp</locationURI>
+		</link>
+		<link>
+			<name>TranslationModel/WordCoocTable.h</name>
+			<type>1</type>
+			<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/WordCoocTable.h</locationURI>
+		</link>
 		<link>
 			<name>TranslationModel/fuzzy-match</name>
 			<type>2</type>
--- a/moses/FF/DistortionScoreProducer.cpp
+++ b/moses/FF/DistortionScoreProducer.cpp
@ -22,7 +22,8 @@ struct DistortionState_traditional : public FFState {
 };

 DistortionScoreProducer::DistortionScoreProducer(const std::string &line)
-  : StatefulFeatureFunction("Distortion", 1, line) {
+  : StatefulFeatureFunction("Distortion", 1, line)
+{
  ReadParameters();
 }

--- a/moses/FF/FeatureFunction.cpp
+++ b/moses/FF/FeatureFunction.cpp
@ -102,8 +102,8 @@ void FeatureFunction::SetParameter(const std::string& key, const std::string& va
 void FeatureFunction::ReadParameters()
 {
  while (!m_args.empty()) {
-	const vector<string> &args = m_args[0];
-	SetParameter(args[0], args[1]);
+    const vector<string> &args = m_args[0];
+    SetParameter(args[0], args[1]);

    m_args.erase(m_args.begin());
  }
--- a/moses/FF/PhrasePenalty.cpp
+++ b/moses/FF/PhrasePenalty.cpp
@ -5,14 +5,15 @@
 namespace Moses
 {
 PhrasePenalty::PhrasePenalty(const std::string &line)
-: StatelessFeatureFunction("PhrasePenalty",1, line) {
+  : StatelessFeatureFunction("PhrasePenalty",1, line)
+{
  ReadParameters();
 }

 void PhrasePenalty::Evaluate(const Phrase &source
-						, const TargetPhrase &targetPhrase
-						, ScoreComponentCollection &scoreBreakdown
-						, ScoreComponentCollection &estimatedFutureScore) const
+                             , const TargetPhrase &targetPhrase
+                             , ScoreComponentCollection &scoreBreakdown
+                             , ScoreComponentCollection &estimatedFutureScore) const
 {
  scoreBreakdown.Assign(this, 1.0f);
 }
--- a/moses/FF/PhrasePenalty.h
+++ b/moses/FF/PhrasePenalty.h
@ -11,13 +11,13 @@ public:
  PhrasePenalty(const std::string &line);

  bool IsUseable(const FactorMask &mask) const {
-	return true;
+    return true;
  }

  virtual void Evaluate(const Phrase &source
-						, const TargetPhrase &targetPhrase
-						, ScoreComponentCollection &scoreBreakdown
-						, ScoreComponentCollection &estimatedFutureScore) const;
+                        , const TargetPhrase &targetPhrase
+                        , ScoreComponentCollection &scoreBreakdown
+                        , ScoreComponentCollection &estimatedFutureScore) const;
 };

 } //namespace
--- a/moses/FF/UnknownWordPenaltyProducer.cpp
+++ b/moses/FF/UnknownWordPenaltyProducer.cpp
@ -7,7 +7,8 @@ using namespace std;
 namespace Moses
 {
 UnknownWordPenaltyProducer::UnknownWordPenaltyProducer(const std::string &line)
-  : StatelessFeatureFunction("UnknownWordPenalty",1, line) {
+  : StatelessFeatureFunction("UnknownWordPenalty",1, line)
+{
  m_tuneable = false;
  ReadParameters();
 }
--- a/moses/FF/WordPenaltyProducer.cpp
+++ b/moses/FF/WordPenaltyProducer.cpp
@ -7,7 +7,8 @@ using namespace std;
 namespace Moses
 {
 WordPenaltyProducer::WordPenaltyProducer(const std::string &line)
-  : StatelessFeatureFunction("WordPenalty",1, line) {
+  : StatelessFeatureFunction("WordPenalty",1, line)
+{
  ReadParameters();
 }

--- a/moses/Parameter.cpp
+++ b/moses/Parameter.cpp
@ -275,13 +275,15 @@ bool Parameter::LoadParam(int argc, char* argv[])
  }

  // overwrite parameters with values from switches
-  for(PARAM_STRING::const_iterator iterParam = m_description.begin(); iterParam != m_description.end(); iterParam++) {
+  for(PARAM_STRING::const_iterator iterParam = m_description.begin();
+      iterParam != m_description.end(); iterParam++) {
    const string paramName = iterParam->first;
    OverwriteParam("-" + paramName, paramName, argc, argv);
  }

  // ... also shortcuts
-  for(PARAM_STRING::const_iterator iterParam = m_abbreviation.begin(); iterParam != m_abbreviation.end(); iterParam++) {
+  for(PARAM_STRING::const_iterator iterParam = m_abbreviation.begin();
+      iterParam != m_abbreviation.end(); iterParam++) {
    const string paramName = iterParam->first;
    const string paramShortName = iterParam->second;
    OverwriteParam("-" + paramShortName, paramName, argc, argv);
@ -294,7 +296,8 @@ bool Parameter::LoadParam(int argc, char* argv[])
    verbose = Scan<int>(m_setting["verbose"][0]);
  if (verbose >= 1) { // only if verbose
    TRACE_ERR( "Defined parameters (per moses.ini or switch):" << endl);
-    for(PARAM_MAP::const_iterator iterParam = m_setting.begin() ; iterParam != m_setting.end(); iterParam++) {
+    for(PARAM_MAP::const_iterator iterParam = m_setting.begin() ;
+        iterParam != m_setting.end(); iterParam++) {
      TRACE_ERR( "\t" << iterParam->first << ": ");
      for ( size_t i = 0; i < iterParam->second.size(); i++ )
        TRACE_ERR( iterParam->second[i] << " ");
@ -303,7 +306,8 @@ bool Parameter::LoadParam(int argc, char* argv[])
  }

  // convert old weights args to new format
-  if (!isParamSpecified("feature"))
+  // WHAT IS GOING ON HERE??? - UG
+  if (!isParamSpecified("feature")) // UG
    ConvertWeightArgs();
  CreateWeightsMap();
  WeightOverwrite();
@ -331,11 +335,11 @@ std::vector<float> &Parameter::GetWeights(const std::string &name)
 {
  std::vector<float> &ret = m_weights[name];

-  cerr << "WEIGHT " << name << "=";
-  for (size_t i = 0; i < ret.size(); ++i) {
-    cerr << ret[i] << ",";
-  }
-  cerr << endl;
+  // cerr << "WEIGHT " << name << "=";
+  // for (size_t i = 0; i < ret.size(); ++i) {
+  //   cerr << ret[i] << ",";
+  // }
+  // cerr << endl;
  return ret;
 }

@ -357,7 +361,10 @@ void Parameter::SetWeight(const std::string &name, size_t ind, const vector<floa
  newWeights.push_back(line);
 }

-void Parameter::AddWeight(const std::string &name, size_t ind, const std::vector<float> &weights)
+void
+Parameter::
+AddWeight(const std::string &name, size_t ind,
+          const std::vector<float> &weights)
 {
  PARAM_VEC &newWeights = m_setting["weight"];

@ -478,6 +485,12 @@ void Parameter::ConvertWeightArgsPhraseModel(const string &oldWeightName)
      case Compact:
        ptType = "PhraseDictionaryCompact";
        break;
+      case SuffixArray:
+        ptType = "PhraseDictionarySuffixArray";
+        break;
+      case DSuffixArray:
+        ptType = "PhraseDictionaryDynSuffixArray";
+        break;
      default:
        break;
      }
@ -502,6 +515,9 @@ void Parameter::ConvertWeightArgsPhraseModel(const string &oldWeightName)

        ++currOldInd;
      }
+
+      // cerr << weights.size() << " PHRASE TABLE WEIGHTS "
+      // << __FILE__ << ":" << __LINE__ << endl;
      AddWeight(ptType, ptInd, weights);

      // actual pt
@ -527,7 +543,7 @@ void Parameter::ConvertWeightArgsPhraseModel(const string &oldWeightName)
      ptLine << "num-features=" << numScoreComponent << " ";
      ptLine << "table-limit=" << maxTargetPhrase[currDict] << " ";

-      if (implementation == SuffixArray) {
+      if (implementation == SuffixArray || implementation == DSuffixArray) {
        ptLine << "target-path=" << token[5] << " ";
        ptLine << "alignment-path=" << token[6] << " ";
      }
--- a/moses/StaticData.cpp
+++ b/moses/StaticData.cpp
@ -63,7 +63,9 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 #include "moses/FF/InputFeature.h"
 #include "moses/FF/PhrasePenalty.h"

+#ifdef LM_SRI
 #include "moses/FF/OSM-Feature/OpSequenceModel.h"
+#endif

 #include "LM/Ken.h"
 #ifdef LM_IRST
@ -695,13 +697,17 @@ bool StaticData::LoadData(Parameter *parameter)
      vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
      SetWeights(model, weights);
    } else if (feature == "OpSequenceModel") {
+#ifdef LM_SRI // same macro that guards the OpSequenceModel.h include at the top of this file
 	  OpSequenceModel* model = new OpSequenceModel(line);
 	  vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
 	  SetWeights(model, weights);
+#else
+      UTIL_THROW(util::Exception, "TODO(nadir): Fix OSM to work without SRILM");
+#endif
    } else if (feature == "PhrasePenalty") {
      PhrasePenalty* model = new PhrasePenalty(line);
-  	  vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
-	  SetWeights(model, weights);
+      vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
+      SetWeights(model, weights);
    }

 #ifdef HAVE_SYNLM
@ -1177,7 +1183,6 @@ void StaticData::LoadFeatureFunctions()
    }
  }

-  // load phrase table
  for (size_t i = 0; i < m_phraseDictionary.size(); ++i) {
    PhraseDictionary *pt = m_phraseDictionary[i];
    pt->Load();
--- a/moses/TargetPhraseCollection.cpp
+++ b/moses/TargetPhraseCollection.cpp
@ -35,11 +35,11 @@ struct CompareTargetPhrase {

 void TargetPhraseCollection::NthElement(size_t tableLimit)
 {
-  vector<TargetPhrase*>::iterator
-  iterMiddle = (tableLimit == 0 || m_collection.size() < tableLimit) ?m_collection.end() : m_collection.begin() + tableLimit;
-
-  //std::sort(m_collection.begin(), m_collection.end(), CompareTargetPhrase());
-  std::nth_element(m_collection.begin(), iterMiddle, m_collection.end(), CompareTargetPhrase());
+  vector<TargetPhrase*>::iterator nth;
+  nth = (tableLimit && tableLimit <= m_collection.size()
+         ? m_collection.begin() + tableLimit
+         : m_collection.end());
+  std::nth_element(m_collection.begin(), nth, m_collection.end(), CompareTargetPhrase());
 }

 void TargetPhraseCollection::Prune(bool adhereTableLimit, size_t tableLimit)
--- a/moses/TranslationModel/BilingualDynSuffixArray.cpp
+++ b/moses/TranslationModel/BilingualDynSuffixArray.cpp
--- a/moses/TranslationModel/BilingualDynSuffixArray.h
+++ b/moses/TranslationModel/BilingualDynSuffixArray.h
@ -5,23 +5,29 @@
 #include "moses/TranslationModel/DynSAInclude/vocab.h"
 #include "moses/TranslationModel/DynSAInclude/types.h"
 #include "moses/TranslationModel/DynSAInclude/utils.h"
+#include "moses/TranslationModel/WordCoocTable.h"
 #include "moses/InputFileStream.h"
 #include "moses/FactorTypeSet.h"
 #include "moses/TargetPhrase.h"
+#include <boost/dynamic_bitset.hpp>
+#include "moses/TargetPhraseCollection.h"
+#include <map>

+using namespace std;
 namespace Moses
 {
+class PhraseDictionaryDynSuffixArray;

 /** @todo ask Abbey Levenberg
 */
 class SAPhrase
 {
 public:
-  std::vector<wordID_t> words;
+  vector<wordID_t> words;

  SAPhrase(size_t phraseSize)
-    :words(phraseSize) {
-  }
+    :words(phraseSize)
+  {}

  void SetId(size_t pos, wordID_t id) {
    CHECK(pos < words.size());
@ -43,12 +49,16 @@ public:
    , m_endTarget(endTarget)
    , m_startSource(startSource)
    , m_endSource(endSource)
-    , m_sntIndex(sntIndex) {
-  }
+    , m_sntIndex(sntIndex)
+  {}

  size_t GetTargetSize() const {
    return m_endTarget - m_startTarget + 1;
  }
+
+  size_t GetSourceSize() const {
+    return m_endSource - m_startSource + 1;
+  }
 };

 /** @todo ask Abbey Levenberg
@ -58,32 +68,43 @@ class SentenceAlignment
 public:
  SentenceAlignment(int sntIndex, int sourceSize, int targetSize);
  int m_sntIndex;
-  std::vector<wordID_t>* trgSnt;
-  std::vector<wordID_t>* srcSnt;
-  std::vector<int> numberAligned;
-  std::vector< std::vector<int> > alignedList;
-  bool Extract(int maxPhraseLength, std::vector<PhrasePair*> &ret, int startSource, int endSource) const;
+  vector<wordID_t>* trgSnt;
+  vector<wordID_t>* srcSnt;
+  vector<int> numberAligned;
+  vector< vector<int> > alignedList;
+  bool Extract(int maxPhraseLength, vector<PhrasePair*> &ret,
+               int startSource, int endSource) const;
 };
+
 class ScoresComp
 {
 public:
-  ScoresComp(const std::vector<float>& weights): m_weights(weights) {}
+  ScoresComp(const vector<float>& weights): m_weights(weights) {}
  bool operator()(const Scores& s1, const Scores& s2) const {
    return s1[0] < s2[0]; // just p(e|f) as approximation
-    /*float score1(0), score2(0);
-    int idx1(0), idx2(0);
-    for (Scores::const_iterator itr = s1.begin();
-            itr != s1.end(); ++itr) {
-        score1 += log(*itr * m_weights.at(idx1++));
-    }
-    for (Scores::const_iterator itr = s2.begin();
-        itr != s2.end(); ++itr) {
-        score2 += log(*itr * m_weights.at(idx2++));
-    }
-    return score1 < score2;*/
+    // float score1(0), score2(0);
+    // int idx1(0), idx2(0);
+    // for (Scores::const_iterator itr = s1.begin();
+    // 	 itr != s1.end(); ++itr) {
+    //   score1 += log(*itr * m_weights.at(idx1++));
+    // }
+    // for (Scores::const_iterator itr = s2.begin();
+    // 	 itr != s2.end(); ++itr) {
+    //   score2 += log(*itr * m_weights.at(idx2++));
+    // }
+    // return score1 < score2;
  }
 private:
-  const std::vector<float>& m_weights;
+  const vector<float>& m_weights;
+};
+
+struct BetterPhrase {
+  ScoresComp const& cmp;
+  BetterPhrase(ScoresComp const& sc);
+  // bool operator()(pair<Scores, TargetPhrase const*> const& a,
+  // pair<Scores, TargetPhrase const*> const& b) const;
+  bool operator()(pair<Scores, SAPhrase const*> const& a,
+                  pair<Scores, SAPhrase const*> const& b) const;
 };

 /** @todo ask Abbey Levenberg
@ -93,66 +114,70 @@ class BilingualDynSuffixArray
 public:
  BilingualDynSuffixArray();
  ~BilingualDynSuffixArray();
-  bool Load( const std::vector<FactorType>& inputFactors,
-             const std::vector<FactorType>& outputTactors,
-             std::string source, std::string target, std::string alignments,
-             const std::vector<float> &weight);
-  bool LoadTM( const std::vector<FactorType>& inputFactors,
-               const std::vector<FactorType>& outputTactors,
-               std::string source, std::string target, std::string alignments,
-               const std::vector<float> &weight);
-  void GetTargetPhrasesByLexicalWeight(const Phrase& src, std::vector< std::pair<Scores, TargetPhrase*> >& target) const;
-  void addSntPair(string& source, string& target, string& alignment);
-private:
-  DynSuffixArray* m_srcSA;
-  DynSuffixArray* m_trgSA;
-  std::vector<wordID_t>* m_srcCorpus;
-  std::vector<wordID_t>* m_trgCorpus;
-  std::vector<FactorType> m_inputFactors;
-  std::vector<FactorType> m_outputFactors;
+  bool Load( const vector<FactorType>& inputFactors,
+             const vector<FactorType>& outputTactors,
+             string source, string target, string alignments,
+             const vector<float> &weight);
+  // bool LoadTM( const vector<FactorType>& inputFactors,
+  // 	     const vector<FactorType>& outputTactors,
+  // 	     string source, string target, string alignments,
+  // 	     const vector<float> &weight);
+  void GetTargetPhrasesByLexicalWeight(const Phrase& src, vector< pair<Scores, TargetPhrase*> >& target) const;

-  std::vector<unsigned> m_srcSntBreaks, m_trgSntBreaks;
+  void CleanUp(const InputType& source);
+  void addSntPair(string& source, string& target, string& alignment);
+  pair<float,float>
+  GatherCands(Phrase const& src, map<SAPhrase, vector<float> >& pstats) const;
+
+  TargetPhrase*
+  GetMosesFactorIDs(const SAPhrase&, const Phrase& sourcePhrase) const;
+
+private:
+
+
+  mutable WordCoocTable m_wrd_cooc;
+  DynSuffixArray * m_srcSA;
+  DynSuffixArray * m_trgSA;
+  vector<wordID_t>* m_srcCorpus;
+  vector<wordID_t>* m_trgCorpus;
+  vector<FactorType> m_inputFactors;
+  vector<FactorType> m_outputFactors;
+
+  vector<unsigned> m_srcSntBreaks, m_trgSntBreaks;

  Vocab* m_srcVocab, *m_trgVocab;
  ScoresComp* m_scoreCmp;

-  std::vector<SentenceAlignment> m_alignments;
-  std::vector<std::vector<short> > m_rawAlignments;
+  vector<SentenceAlignment> m_alignments;
+  vector<vector<short> > m_rawAlignments;

-  mutable std::map<std::pair<wordID_t, wordID_t>, std::pair<float, float> > m_wordPairCache;
-  mutable std::set<wordID_t> m_freqWordsCached;
+  mutable map<pair<wordID_t, wordID_t>, pair<float, float> > m_wordPairCache;
+  mutable set<wordID_t> m_freqWordsCached;
  const size_t m_maxPhraseLength, m_maxSampleSize;
-
-  int LoadCorpus(FactorDirection direction, InputFileStream&, const std::vector<FactorType>& factors,
-                 std::vector<wordID_t>&, std::vector<wordID_t>&,
+  const size_t m_maxPTEntries;
+  int LoadCorpus(FactorDirection direction,
+                 InputFileStream&, const vector<FactorType>& factors,
+                 vector<wordID_t>&, vector<wordID_t>&,
                 Vocab*);
  int LoadAlignments(InputFileStream& aligs);
  int LoadRawAlignments(InputFileStream& aligs);
  int LoadRawAlignments(string& aligs);

-  bool ExtractPhrases(const int&, const int&, const int&, std::vector<PhrasePair*>&, bool=false) const;
+  bool ExtractPhrases(const int&, const int&, const int&, vector<PhrasePair*>&, bool=false) const;
  SentenceAlignment GetSentenceAlignment(const int, bool=false) const;
-  int SampleSelection(std::vector<unsigned>&, int = 300) const;
+  int SampleSelection(vector<unsigned>&, int = 300) const;

-  std::vector<int> GetSntIndexes(std::vector<unsigned>&, int, const std::vector<unsigned>&) const;
-  TargetPhrase* GetMosesFactorIDs(const SAPhrase&, const Phrase& sourcePhrase) const;
+  vector<int> GetSntIndexes(vector<unsigned>&, int, const vector<unsigned>&) const;
  SAPhrase TrgPhraseFromSntIdx(const PhrasePair&) const;
  bool GetLocalVocabIDs(const Phrase&, SAPhrase &) const;
  void CacheWordProbs(wordID_t) const;
  void CacheFreqWords() const;
  void ClearWordInCache(wordID_t);
-  std::pair<float, float> GetLexicalWeight(const PhrasePair&) const;
+  pair<float, float> GetLexicalWeight(const PhrasePair&) const;
+
+  int GetSourceSentenceSize(size_t sentenceId) const;
+  int GetTargetSentenceSize(size_t sentenceId) const;

-  int GetSourceSentenceSize(size_t sentenceId) const {
-    return (sentenceId==m_srcSntBreaks.size()-1) ?
-           m_srcCorpus->size() - m_srcSntBreaks.at(sentenceId) :
-           m_srcSntBreaks.at(sentenceId+1) - m_srcSntBreaks.at(sentenceId);
-  }
-  int GetTargetSentenceSize(size_t sentenceId) const {
-    return (sentenceId==m_trgSntBreaks.size()-1) ?
-           m_trgCorpus->size() - m_trgSntBreaks.at(sentenceId) :
-           m_trgSntBreaks.at(sentenceId+1) - m_trgSntBreaks.at(sentenceId);
-  }
 };
 } // end namespace
 #endif
--- a/moses/TranslationModel/DynSuffixArray.cpp
+++ b/moses/TranslationModel/DynSuffixArray.cpp
@ -1,5 +1,6 @@
 #include "DynSuffixArray.h"
 #include <iostream>
+#include <boost/foreach.hpp>

 using namespace std;

@ -215,8 +216,37 @@ void DynSuffixArray::Substitute(vuint_t* /* newSents */, unsigned /* newIndex */
  return;
 }

+// Binds the comparator to a corpus /crp/ and its suffix array /sfa/.
+// Both are held by reference only.
+ComparePosition::
+ComparePosition(vuint_t const& crp, vuint_t const& sfa)
+  : m_crp(crp), m_sfa(sfa) { }
+
+// Returns true iff the corpus suffix starting at position /i/ sorts strictly
+// before /phrase/: either at the first differing token, or because the
+// corpus ends before the phrase does (the suffix is a proper prefix of it).
+// Throws std::out_of_range (from at()) if /i/ is not a valid corpus position.
+bool
+ComparePosition::
+operator()(unsigned const& i, vector<wordID_t> const& phrase) const
+{
+  unsigned const* x = &m_crp.at(i);
+  // one past the last token, so that the final corpus token is compared too
+  unsigned const* e = &m_crp.back() + 1;
+  size_t k = 0;
+  for (; k < phrase.size() && x < e; ++k, ++x)
+    if (*x != phrase[k]) return *x < phrase[k];
+  return (x == e && k < phrase.size());
+}
+
+// Returns true iff /phrase/ sorts strictly before the corpus suffix starting
+// at position /i/. A suffix that merely starts with /phrase/ counts as
+// equivalent (returns false), which is what makes lower_bound/upper_bound
+// delimit all occurrences of the phrase.
+bool
+ComparePosition::
+operator()(vector<wordID_t> const& phrase, unsigned const& i) const
+{
+  unsigned const* x = &m_crp.at(i);
+  // one past the last token, so that the final corpus token is compared too
+  unsigned const* e = &m_crp.back() + 1;
+  size_t k = 0;
+  for (; k < phrase.size() && x < e; ++k, ++x)
+    if (*x != phrase[k]) return phrase[k] < *x;
+  return false; // (k == phrase.size() && x < e);
+}
+
 bool DynSuffixArray::GetCorpusIndex(const vuint_t* phrase, vuint_t* indices)
 {
+  // DOES THIS EVEN WORK WHEN A DynSuffixArray has been saved and reloaded????
  pair<vuint_t::iterator,vuint_t::iterator> bounds;
  indices->clear();
  size_t phrasesize = phrase->size();
@ -251,6 +281,16 @@ bool DynSuffixArray::GetCorpusIndex(const vuint_t* phrase, vuint_t* indices)
  return (indices->size() > 0);
 }

+size_t
+DynSuffixArray::
+GetCount(vuint_t const& phrase) const // number of suffix-array entries that compare equivalent to /phrase/ under ComparePosition
+{
+  ComparePosition cmp(*m_corpus, *m_SA);
+  vuint_t::const_iterator lb = lower_bound(m_SA->begin(), m_SA->end(), phrase, cmp); // first entry not ordered before phrase
+  vuint_t::const_iterator ub = upper_bound(m_SA->begin(), m_SA->end(), phrase, cmp); // first entry ordered after phrase
+  return ub-lb; // width of the equivalent range
+}
+
 void DynSuffixArray::Save(FILE* fout)
 {
  fWriteVector(fout, *m_SA);
--- a/moses/TranslationModel/DynSuffixArray.h
+++ b/moses/TranslationModel/DynSuffixArray.h
@ -11,9 +11,25 @@

 namespace Moses
 {
-
+using namespace std;
 typedef std::vector<unsigned> vuint_t;

+
+/// compare the suffix of corpus /m_crp/ that starts at position /i/ (the kind
+/// of value stored in the suffix array /m_sfa/) against reference phrase /phrase/
+// added by Ulrich Germann
+class ComparePosition
+{
+  vuint_t const& m_crp; // corpus token ids; held by reference
+  vuint_t const& m_sfa; // suffix array; held by reference
+
+public:
+  ComparePosition(vuint_t const& crp, vuint_t const& sfa);
+  bool operator()(unsigned const& i, vector<wordID_t> const& phrase) const; // "suffix at i < phrase", the form lower_bound needs
+  bool operator()(vector<wordID_t> const& phrase, unsigned const& i) const; // "phrase < suffix at i", the form upper_bound needs
+};
+
+
 /** @todo ask Abbey Levenberg
 */
 class DynSuffixArray
@ -30,6 +46,8 @@ public:
  void Delete(unsigned, unsigned);
  void Substitute(vuint_t*, unsigned);

+  size_t GetCount(vuint_t const& phrase) const;
+
 private:
  vuint_t* m_SA;
  vuint_t* m_ISA;
@ -46,10 +64,10 @@ private:
  void PrintAuxArrays() {
    std::cerr << "SA\tISA\tF\tL\n";
    for(size_t i=0; i < m_SA->size(); ++i)
-      std::cerr << m_SA->at(i) << "\t" << m_ISA->at(i) << "\t" << m_F->at(i) << "\t" << m_L->at(i) << std::endl;
+      std::cerr << m_SA->at(i) << "\t" << m_ISA->at(i) << "\t"
+                << m_F->at(i) << "\t" << m_L->at(i) << std::endl;
  }
 };
-
 } //end namespace

 #endif
--- a/moses/TranslationModel/PhraseDictionaryDynSuffixArray.README
+++ b/moses/TranslationModel/PhraseDictionaryDynSuffixArray.README
@ -0,0 +1,4 @@
+Specifying Dynamic Suffix Array-based Phrase Tables in moses.ini
+
+[ttable-file]
+14 0 0 5 <source language text file> <target language text file> <file with alignment info in symal format>
--- a/moses/TranslationModel/PhraseDictionaryDynSuffixArray.cpp
+++ b/moses/TranslationModel/PhraseDictionaryDynSuffixArray.cpp
@ -3,70 +3,32 @@
 #include "moses/StaticData.h"
 #include "moses/TargetPhrase.h"
 #include <iomanip>
-
+#include <boost/foreach.hpp>
 using namespace std;

 namespace Moses
 {
-PhraseDictionaryDynSuffixArray::PhraseDictionaryDynSuffixArray(const std::string &line)
-  :PhraseDictionary("PhraseDictionaryDynSuffixArray", line)
+PhraseDictionaryDynSuffixArray::
+PhraseDictionaryDynSuffixArray(const std::string &line)
+  : PhraseDictionary("PhraseDictionaryDynSuffixArray", line)
  ,m_biSA(new BilingualDynSuffixArray())
 {
  ReadParameters();
 }

-PhraseDictionaryDynSuffixArray::~PhraseDictionaryDynSuffixArray()
-{
-  delete m_biSA;
-}

 void PhraseDictionaryDynSuffixArray::Load()
 {
  SetFeaturesToApply();

-  const StaticData &staticData = StaticData::Instance();
-  vector<float> weight = staticData.GetWeights(this);
-
-  m_biSA->Load( m_input, m_output, m_source, m_target, m_alignments, weight);
+  vector<float> weight = StaticData::Instance().GetWeights(this);
+  m_biSA->Load(m_input, m_output, m_source, m_target, m_alignments, weight);
 }

-const TargetPhraseCollection *PhraseDictionaryDynSuffixArray::GetTargetPhraseCollection(const Phrase& src) const
+PhraseDictionaryDynSuffixArray::
+~PhraseDictionaryDynSuffixArray()
 {
-  TargetPhraseCollection *ret = new TargetPhraseCollection();
-  std::vector< std::pair< Scores, TargetPhrase*> > trg;
-  // extract target phrases and their scores from suffix array
-  m_biSA->GetTargetPhrasesByLexicalWeight( src, trg);
-
-  std::vector< std::pair< Scores, TargetPhrase*> >::iterator itr;
-  for(itr = trg.begin(); itr != trg.end(); ++itr) {
-    Scores scoreVector = itr->first;
-    TargetPhrase *targetPhrase = itr->second;
-    //std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),NegateScore);
-    std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),FloorScore);
-
-    targetPhrase->GetScoreBreakdown().Assign(this, scoreVector);
-    targetPhrase->Evaluate(src);
-
-    //cout << *targetPhrase << "\t" << std::setprecision(8) << scoreVector[2] << endl;
-    ret->Add(targetPhrase);
-  }
-  ret->NthElement(m_tableLimit); // sort the phrases for the dcoder
-  return ret;
-}
-
-void PhraseDictionaryDynSuffixArray::insertSnt(string& source, string& target, string& alignment)
-{
-  m_biSA->addSntPair(source, target, alignment); // insert sentence pair into suffix arrays
-  //StaticData::Instance().ClearTransOptionCache(); // clear translation option cache
-}
-void PhraseDictionaryDynSuffixArray::deleteSnt(unsigned /* idx */, unsigned /* num2Del */)
-{
-  // need to implement --
-}
-
-ChartRuleLookupManager *PhraseDictionaryDynSuffixArray::CreateRuleLookupManager(const InputType&, const ChartCellCollectionBase&)
-{
-  throw "Chart decoding not supported by PhraseDictionaryDynSuffixArray";
+  delete m_biSA;
 }

 void PhraseDictionaryDynSuffixArray::SetParameter(const std::string& key, const std::string& value)
@ -82,4 +44,62 @@ void PhraseDictionaryDynSuffixArray::SetParameter(const std::string& key, const
  }
 }

+const TargetPhraseCollection*
+PhraseDictionaryDynSuffixArray::
+GetTargetPhraseCollection(const Phrase& src) const // builds a new collection of scored target phrases for /src/ from the suffix array
+{
+  typedef map<SAPhrase, vector<float> >::value_type pstat_entry;
+  map<SAPhrase, vector<float> > pstats; // phrase (pair) statistics
+  m_biSA->GatherCands(src,pstats); // fills pstats; the pair<float,float> it returns is not used here
+
+  TargetPhraseCollection *ret = new TargetPhraseCollection(); // heap-allocated and handed back as a raw pointer
+  BOOST_FOREACH(pstat_entry & e, pstats) {
+    TargetPhrase* tp = m_biSA->GetMosesFactorIDs(e.first, src);
+    tp->GetScoreBreakdown().Assign(this,e.second); // NOTE(review): unlike the commented-out version below, no FloorScore transform and no Evaluate() call - confirm this is intended
+    ret->Add(tp);
+  }
+  // return ret;
+  // TargetPhraseCollection *ret = new TargetPhraseCollection();
+  // std::vector< std::pair< Scores, TargetPhrase*> > trg;
+  //
+  // // extract target phrases and their scores from suffix array
+  // m_biSA->GetTargetPhrasesByLexicalWeight(src, trg);
+  //
+  // std::vector< std::pair< Scores, TargetPhrase*> >::iterator itr;
+  // for(itr = trg.begin(); itr != trg.end(); ++itr) {
+  //   Scores scoreVector = itr->first;
+  //   TargetPhrase *targetPhrase = itr->second;
+  //   std::transform(scoreVector.begin(),scoreVector.end(),
+  // 		   scoreVector.begin(),FloorScore);
+  //   targetPhrase->GetScoreBreakdown().Assign(this, scoreVector);
+  //   targetPhrase->Evaluate();
+  //   ret->Add(targetPhrase);
+  // }
+  ret->NthElement(m_tableLimit); // sort the phrases for the decoder
+  return ret;
+}
+
+void
+PhraseDictionaryDynSuffixArray::
+insertSnt(string& source, string& target, string& alignment) // forwards one new sentence pair and its alignment to the bilingual suffix array
+{
+  m_biSA->addSntPair(source, target, alignment); // insert sentence pair into suffix arrays
+  //StaticData::Instance().ClearTransOptionCache(); // clear translation option cache
+}
+
+void
+PhraseDictionaryDynSuffixArray::
+deleteSnt(unsigned /* idx */, unsigned /* num2Del */) // placeholder: currently does nothing and ignores both arguments
+{
+  // need to implement --
+}
+
+ChartRuleLookupManager*
+PhraseDictionaryDynSuffixArray::
+CreateRuleLookupManager(const InputType&, const ChartCellCollectionBase&) // no rule lookup manager is ever created by this phrase table
+{
+  CHECK(false); // unconditional check failure
+  return 0; // null result; presumably only reached if CHECK does not abort
+}
+
 }// end namespace
--- a/moses/TranslationModel/PhraseDictionaryDynSuffixArray.h
+++ b/moses/TranslationModel/PhraseDictionaryDynSuffixArray.h
@ -17,21 +17,19 @@ class PhraseDictionaryDynSuffixArray: public PhraseDictionary
 public:
  PhraseDictionaryDynSuffixArray(const std::string &line);
  ~PhraseDictionaryDynSuffixArray();
-
+  bool InitDictionary();
  void Load();
-
  // functions below required by base class
  const TargetPhraseCollection* GetTargetPhraseCollection(const Phrase& src) const;
  void insertSnt(string&, string&, string&);
  void deleteSnt(unsigned, unsigned);
  ChartRuleLookupManager *CreateRuleLookupManager(const InputType&, const ChartCellCollectionBase&);
-
  void SetParameter(const std::string& key, const std::string& value);
-
 private:
  BilingualDynSuffixArray *m_biSA;
  std::string m_source, m_target, m_alignments;

+  std::vector<float> m_weight;
 };

 } // end namespace
--- a/moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.cpp
+++ b/moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.cpp
@ -31,7 +31,8 @@ using namespace std;
 namespace Moses
 {
 PhraseDictionaryOnDisk::PhraseDictionaryOnDisk(const std::string &line)
-  : MyBase("PhraseDictionaryOnDisk", line) {
+  : MyBase("PhraseDictionaryOnDisk", line)
+{
  ReadParameters();
 }

--- a/moses/TranslationModel/RuleTable/Trie.h
+++ b/moses/TranslationModel/RuleTable/Trie.h
@ -48,12 +48,6 @@ public:

  void Load();

-  // Required by PhraseDictionary.
-  virtual const TargetPhraseCollection *GetTargetPhraseCollection(const Phrase &) const {
-    CHECK(false);
-    return NULL;
-  }
-
 private:
  friend class RuleTableLoader;

--- a/moses/TranslationModel/WordCoocTable.cpp
+++ b/moses/TranslationModel/WordCoocTable.cpp
@ -0,0 +1,72 @@
+#include "moses/TranslationModel/WordCoocTable.h"
+using namespace std;
+namespace Moses
+{
+
+WordCoocTable::
+WordCoocTable() // empty table: only capacity is reserved, all three vectors keep size 0
+{
+  m_cooc.reserve(1000000);
+  m_marg1.reserve(1000000);
+  m_marg2.reserve(1000000);
+}
+
+WordCoocTable::
+WordCoocTable(wordID_t const VocabSize1, wordID_t const VocabSize2) // table pre-sized for the given vocabulary sizes
+  : m_cooc(VocabSize1), m_marg1(VocabSize1,0), m_marg2(VocabSize2, 0) // one empty map and a zero marginal per side-1 id; a zero marginal per side-2 id
+{}
+
+void
+WordCoocTable::
+Count(size_t const a, size_t const b) // records one cooccurrence of id /a/ (side 1) with id /b/ (side 2)
+{
+  while (a >= m_marg1.size()) { // grow side-1 storage on demand; m_cooc and m_marg1 grow together
+    m_cooc.push_back(my_map_t());
+    m_marg1.push_back(0);
+  }
+  while (b >= m_marg2.size()) // grow side-2 marginals on demand
+    m_marg2.push_back(0);
+  ++m_marg1[a];
+  ++m_marg2[b];
+  ++m_cooc[a][b]; // map operator[] creates a zero entry the first time a pair is seen
+}
+
+uint32_t
+WordCoocTable::
+GetJoint(size_t const a, size_t const b) const // joint count of the pair (a,b); 0 if it was never counted
+{
+  if (a >= m_marg1.size() || b >= m_marg2.size()) return 0; // id beyond anything counted so far
+  my_map_t::const_iterator m = m_cooc.at(a).find(b);
+  if (m == m_cooc[a].end()) return 0; // a is known, but never together with b
+  return m->second;
+}
+
+uint32_t
+WordCoocTable::
+GetMarg1(size_t const x) const // marginal count of side-1 id /x/; 0 if x is beyond the stored range
+{
+  return x >= m_marg1.size() ? 0 : m_marg1[x];
+}
+
+uint32_t
+WordCoocTable::
+GetMarg2(size_t const x) const // marginal count of side-2 id /x/; 0 if x is beyond the stored range
+{
+  return x >= m_marg2.size() ? 0 : m_marg2[x];
+}
+
+// Relative frequency joint(a,b) / marg1(a), i.e. the share of /a/'s counts
+// that fall on /b/. Returns 0 when /a/ has no counts at all, instead of
+// evaluating 0/0 (which would yield NaN).
+float
+WordCoocTable::
+pfwd(size_t const a, size_t const b) const
+{
+  uint32_t const marg = GetMarg1(a);
+  return marg ? float(GetJoint(a,b))/marg : 0.f;
+}
+
+// Relative frequency joint(a,b) / marg2(b), i.e. the share of /b/'s counts
+// that fall on /a/. Returns 0 when /b/ has no counts at all, instead of
+// evaluating 0/0 (which would yield NaN).
+float
+WordCoocTable::
+pbwd(size_t const a, size_t const b) const
+{
+  // cerr << "at " << __FILE__ << ":" << __LINE__ << endl;
+  uint32_t const marg = GetMarg2(b);
+  return marg ? float(GetJoint(a,b))/marg : 0.f;
+}
+}
--- a/moses/TranslationModel/WordCoocTable.h
+++ b/moses/TranslationModel/WordCoocTable.h
@ -0,0 +1,72 @@
+#ifndef moses_WordCoocTable_h
+#define moses_WordCoocTable_h
+
+#include "moses/TranslationModel/DynSAInclude/vocab.h"
+#include "moses/TranslationModel/DynSAInclude/types.h"
+#include "moses/TranslationModel/DynSAInclude/utils.h"
+#include "moses/InputFileStream.h"
+#include "moses/FactorTypeSet.h"
+#include "moses/TargetPhrase.h"
+#include <boost/dynamic_bitset.hpp>
+#include <map>
+
+namespace Moses
+{
+
+using namespace std;
+
+#ifndef bitvector
+typedef boost::dynamic_bitset<uint64_t> bitvector;
+#endif
+
+
+/**
+ *  Stores word cooccurrence counts: a sparse joint count table plus one marginal count vector per side
+ *  @todo ask Uli Germann
+ */
+class WordCoocTable
+{
+  typedef map<wordID_t,uint32_t> my_map_t; // side-2 id -> joint count
+  vector<my_map_t> m_cooc; // indexed by side-1 id
+  vector<uint32_t> m_marg1; // marginal counts, indexed by side-1 id
+  vector<uint32_t> m_marg2; // marginal counts, indexed by side-2 id
+public:
+  WordCoocTable(); // empty table
+  WordCoocTable(wordID_t const VocabSize1, wordID_t const VocabSize2); // table pre-sized for known vocabulary sizes
+  uint32_t GetJoint(size_t const a, size_t const b) const; // joint count of (a,b), 0 if unseen
+  uint32_t GetMarg1(size_t const x) const; // marginal count of side-1 id x, 0 if unseen
+  uint32_t GetMarg2(size_t const x) const; // marginal count of side-2 id x, 0 if unseen
+  float pfwd(size_t const a, size_t const b) const; // joint(a,b) / marg1(a)
+  float pbwd(size_t const a, size_t const b) const; // joint(a,b) / marg2(b)
+  void
+  Count(size_t const a, size_t const b); // record one cooccurrence of a with b
+
+  template<typename idvec, typename alnvec>
+  void
+  Count(idvec const& s1, idvec const& s2, alnvec const& aln, // aln is read as consecutive (index into s1, index into s2) pairs
+        wordID_t const NULL1, wordID_t const NULL2); // ids that unlinked words of the other side are counted against
+
+};
+
+// Counts every link of one aligned sentence pair. /aln/ is read as a flat
+// sequence of (index into s1, index into s2) pairs. Each word that ends up
+// without any link is counted once against the NULL id of the other side
+// (words of s1 against NULL2, words of s2 against NULL1).
+template<typename idvec, typename alnvec>
+void
+WordCoocTable::
+Count(idvec const& s1, idvec const& s2, alnvec const& aln,
+      wordID_t const NULL1, wordID_t const NULL2)
+{
+  // a set bit marks a position that has not been linked (yet)
+  boost::dynamic_bitset<uint64_t> check1(s1.size()), check2(s2.size());
+  check1.set();
+  check2.set();
+  // i+1 < size: a dangling last entry of an odd-length /aln/ is skipped
+  // rather than read past the end
+  for (size_t i = 0; i + 1 < aln.size(); i += 2) {
+    Count(s1[aln[i]], s2[aln[i+1]]);
+    check1.reset(aln[i]);
+    check2.reset(aln[i+1]);
+  }
+  // find_first/find_next return npos (> size) when no further bit is set
+  for (size_t i = check1.find_first(); i < check1.size(); i = check1.find_next(i))
+    Count(s1[i], NULL2);
+  for (size_t i = check2.find_first(); i < check2.size(); i = check2.find_next(i))
+    Count(NULL1, s2[i]);
+}
+
+}
+#endif
--- a/moses/TypeDef.h
+++ b/moses/TypeDef.h
@ -121,6 +121,7 @@ enum PhraseTableImplementation {
  ,FuzzyMatch    = 11
  ,Compact      = 12
  ,Interpolated = 13
+  ,DSuffixArray = 14
 };

 enum InputTypeEnum {
--- a/moses/generic/sampling/Sampling.h
+++ b/moses/generic/sampling/Sampling.h
@ -0,0 +1,51 @@
+#ifndef __sampling_h
+#define __sampling_h
+
+// Utility functions for proper sub-sampling.
+// (c) 2007-2012 Ulrich Germann
+
+
+namespace Moses
+{
+
+// Draws one number with rand(), scales it from [0,RAND_MAX] to a fraction in
+// [0,1), multiplies that fraction by N and returns the product converted to
+// size_t.
+inline
+size_t
+randInt(size_t N)
+{
+  double const fraction = rand() / (RAND_MAX + 1.);
+  return N * fraction;
+}
+
+// Select a random sample of size /s/ without replacement from the range of
+// integers [0,N) and store it in /v/ (previous contents of /v/ are lost).
+// If /s/ exceeds /N/, the sample size is capped at /N/.
+// Small samples (s*10 < N) are returned in the order in which they were
+// drawn; all other samples are returned in ascending order.
+template<typename idx_t>
+void
+randomSample(std::vector<idx_t>& v, size_t s, size_t N)
+{
+  // see also Knuth: Art of Computer Programming Vol. 2, p. 142
+
+  s = std::min(s,N);
+  v.resize(s);
+
+  // the first option tries to be a bit more efficient than O(N) in picking
+  // the samples. The threshold is an ad-hoc, off-the-cuff guess. I still
+  // need to figure out the optimal break-even point between a linear sweep
+  // and repeatedly picking random numbers with the risk of hitting the same
+  // number many times.
+  if (s*10<N) {
+    // rejection sampling: redraw until we hit a number not picked before
+    boost::dynamic_bitset<uint64_t> check(N,0);
+    for (size_t i = 0; i < v.size(); i++) {
+      size_t x = randInt(N);
+      while (check[x]) x = randInt(N);
+      check[x]=true;
+      v[i] = x;
+    }
+  } else {
+    // linear sweep: t is the current candidate, m the number of items picked
+    // so far; t is picked with probability (s-m)/(N-t). Stop as soon as the
+    // sample is complete (m == s) instead of sweeping on to N and drawing
+    // random numbers that can no longer select anything.
+    size_t m=0;
+    for (size_t t = 0; m < s && t < N; t++)
+      if (s==N || randInt(N-t) < s-m) v[m++] = t;
+  }
+}
+
+};
+
+#endif
--- a/moses/generic/sorting/NBestList.h
+++ b/moses/generic/sorting/NBestList.h
@ -0,0 +1,85 @@
+#ifndef __n_best_list_h
+#define __n_best_list_h
+#include <algorithm>
+#include "moses/generic/sorting/VectorIndexSorter.h"
+
+// NBest List; (c) 2007-2012 Ulrich Germann
+//
+// The 'trick' used in this implementation is to maintain a heap of size <= N
+// such that the lowest-scoring item is on top of the heap. For each incoming
+// item we can then determine easily if it is in the top N.
+
+namespace Moses
+{
+using namespace std;
+
+// Collects items via add() and keeps at most as many of them as were
+// requested at construction time. THINGY is the item type, CMP the
+// comparison object used to rank items.
+// NOTE(review): m_better holds a reference to a vector member of this
+// object, so a copy of an NBestList would keep referring to the vector of
+// the object it was copied from.
+template<typename THINGY, typename CMP>
+class NBestList
+{
+private:
+  vector<uint32_t> m_heap;          // indices into m_list, kept as a heap by add()
+  vector<THINGY>   m_list;          // the stored items
+  VectorIndexSorter<THINGY, CMP, uint32_t> m_better; // index comparator used for the heap
+  mutable vector<uint32_t> m_order; // ranking cache, rebuilt by operator[]
+  mutable bool m_changed;           // set by add(), cleared by operator[]
+
+public:
+  NBestList(size_t const max_size, CMP const& cmp);
+  NBestList(size_t const max_size);
+
+  // Offer an item; the return value tells whether it was stored.
+  bool add(THINGY const& item);
+
+  // Access by rank; negative values of i count from the end.
+  THINGY const& operator[](int i) const;
+
+  // Number of items currently held.
+  size_t size() const { return m_heap.size(); }
+};
+
+// Set up an empty list that ranks its items with the given comparison
+// object. The storage reserved for m_heap is what add() later compares the
+// heap size against to decide whether the list is full.
+template<typename THINGY, typename CMP>
+NBestList<THINGY,CMP>::NBestList(size_t const max_size, CMP const& cmp)
+  : m_better(m_list, cmp)
+  , m_changed(false)
+{
+  m_heap.reserve(max_size);
+}
+
+// Set up an empty list that ranks its items with a default-constructed CMP.
+// The index comparator has to look at the stored items (m_list); m_heap only
+// holds indices into that list and, in general, is not even a vector of
+// THINGY. The storage reserved for m_heap is what add() later compares the
+// heap size against to decide whether the list is full.
+template<typename THINGY, typename CMP>
+NBestList<THINGY,CMP>::
+NBestList(size_t const max_size)
+  : m_better(m_list), m_changed(false)
+{
+  m_heap.reserve(max_size);
+}
+
+// Offer an item to the list.
+// While there is room, the item is simply stored. Once the list is full, the
+// item replaces the entry on top of the heap if Compare(item, top entry)
+// holds; otherwise it is rejected.
+// Returns true if the item was stored, false if it was rejected.
+// NOTE(review): "full" is detected via m_heap.capacity(); std::vector only
+// guarantees capacity() >= the reserved amount, so the effective limit may
+// exceed the max_size given to the constructor.
+template<typename THINGY, typename CMP>
+bool
+NBestList<THINGY,CMP>::
+add(THINGY const& item)
+{
+  if (m_heap.size() == m_heap.capacity()) {
+    // A list without any room (max_size 0) has no top entry to compare
+    // against; reject the item instead of letting at(0) throw.
+    if (m_heap.empty()) return false;
+    if (!m_better.Compare(item, m_list[m_heap.at(0)])) return false;
+    // pop_heap moves the index of the top entry to the back of m_heap; its
+    // slot in m_list is then reused for the new item
+    pop_heap(m_heap.begin(),m_heap.end(),m_better);
+    m_list[m_heap.back()] = item;
+  } else {
+    m_list.push_back(item);
+    m_heap.push_back(m_heap.size());
+  }
+  // restore the heap property for the index at the back of m_heap
+  push_heap(m_heap.begin(),m_heap.end(),m_better);
+  return m_changed = true;
+}
+
+// Return the item of rank i. Rank 0 is the item that the index comparator
+// orders first; a negative i counts from the end, so -1 is the last item.
+// The ranking is cached in m_order and only recomputed after add() has
+// stored something. An out-of-range rank makes vector::at throw
+// std::out_of_range.
+template<typename THINGY, typename CMP>
+THINGY const&
+NBestList<THINGY,CMP>::
+operator[](int i) const
+{
+  if (m_changed) {
+    m_order.assign(m_heap.begin(),m_heap.end());
+    // m_heap is a heap with respect to m_better, so the copy has to be
+    // unwound with that same comparator (the default operator< would compare
+    // the raw index values instead of the items they refer to).
+    std::sort_heap(m_order.begin(), m_order.end(), m_better);
+    m_changed = false;
+  }
+  if (i < 0) i += m_order.size();
+  return m_list[m_order.at(i)];
+}
+
+}
+#endif
--- a/moses/generic/sorting/VectorIndexSorter.h
+++ b/moses/generic/sorting/VectorIndexSorter.h
@ -0,0 +1,69 @@
+#ifndef __vector_index_sorter_h
+#define __vector_index_sorter_h
+
+// VectorIndexSorter; (c) 2007-2012 Ulrich Germann
+
+// A VectorIndexSorter is a function object for sorting indices into a vector
+// of objects (instead of sorting the vector itself).
+//
+// typical use:
+// vector<thingy> my_vector;
+// VectorIndexSorter<thingy,less<thingy>,int> sorter(my_vector);
+// vector<int> order;
+// sorter.GetOrder(order);
+
+namespace Moses
+{
+// Function object that orders indices into a vector by comparing the
+// elements they refer to.
+//   VAL   element type of the referenced vector
+//   COMP  comparison on VAL (needs to be copy-constructible)
+//   IDX_T index type
+// The vector itself is only referenced, not copied; it must outlive the
+// sorter and every copy made of it.
+template<typename VAL, typename COMP = std::greater<VAL>,  typename IDX_T=size_t>
+class
+  VectorIndexSorter : public std::binary_function<IDX_T const&, IDX_T const&, bool>
+{
+  std::vector<VAL> const& m_vecref; // the vector that the indices refer to
+  boost::shared_ptr<COMP> m_comp;   // owns the comparison object behind Compare
+public:
+
+  // The element comparison in use; always refers to the object owned by
+  // m_comp.
+  COMP const& Compare;
+
+  // Sort indices into v with a copy of comp. Keeping a private copy (rather
+  // than a reference to the argument) keeps Compare valid when the caller
+  // passes a temporary.
+  VectorIndexSorter(std::vector<VAL> const& v, COMP const& comp)
+    : m_vecref(v), m_comp(new COMP(comp)), Compare(*m_comp)
+  { }
+
+  // Sort indices into v with a default-constructed COMP.
+  VectorIndexSorter(std::vector<VAL> const& v)
+    : m_vecref(v), m_comp(new COMP()), Compare(*m_comp)
+  { }
+
+  // True if index a is to be ordered before index b. When Compare does not
+  // separate the two elements (it holds in both or in neither direction),
+  // the smaller index goes first. The at() calls range-check both indices
+  // (std::out_of_range), which makes the unchecked accesses that follow safe.
+  bool operator()(IDX_T const & a, IDX_T const & b) const {
+    bool fwd = Compare(m_vecref.at(a) ,m_vecref.at(b));
+    bool bwd = Compare(m_vecref[b],    m_vecref[a]);
+    return (fwd == bwd ? a < b : fwd);
+  }
+
+  // Return the sorted indices in a newly allocated vector.
+  boost::shared_ptr<std::vector<IDX_T> >
+  GetOrder() const;
+
+  // Store the sorted indices in order (previous contents are overwritten).
+  void
+  GetOrder(std::vector<IDX_T> & order) const;
+
+};
+
+// Return a newly allocated vector that holds the indices of the referenced
+// vector, sorted with this object as the ordering predicate. The work is
+// done by the GetOrder overload that takes the output vector as argument.
+template<typename VAL, typename COMP, typename IDX_T>
+boost::shared_ptr<std::vector<IDX_T> >
+VectorIndexSorter<VAL,COMP,IDX_T>::
+GetOrder() const
+{
+  boost::shared_ptr<std::vector<IDX_T> > ret(new std::vector<IDX_T>(m_vecref.size()));
+  GetOrder(*ret);
+  return ret;
+}
+
+// Fill order with the indices 0 .. size-1 of the referenced vector and sort
+// them with this object as the ordering predicate. Whatever order contained
+// before is overwritten.
+template<typename VAL, typename COMP, typename IDX_T>
+void
+VectorIndexSorter<VAL,COMP,IDX_T>::
+GetOrder(std::vector<IDX_T> & order) const
+{
+  order.resize(m_vecref.size());
+  IDX_T const count = IDX_T(m_vecref.size());
+  for (IDX_T pos = 0; pos < count; ++pos) {
+    order[pos] = pos;
+  }
+  std::sort(order.begin(), order.end(), *this);
+}
+
+}
+#endif