mosesdecoder/moses/src/PhraseDictionaryDynSuffixArray.h

#ifndef MOSES_DICTSFXARY_H
#define MOSES_DICTSFXARY_H

#include "PhraseDictionary.h"
#include "DynSuffixArray.h" 
#include "DynSAInclude/vocab.h"
#include "DynSAInclude/types.h"
#include "DynSAInclude/utils.h"
#include "InputFileStream.h"
namespace Moses {

class SAPhrase
{
public:
	vector<wordID_t> words;		
	
	SAPhrase(size_t phraseSize)
	:words(phraseSize)
	{}
	
	void SetId(size_t pos, wordID_t id)
	{
    assert(pos < words.size());
		words[pos] = id;
	}
	bool operator<(const SAPhrase& phr2) const
  { return words < phr2.words; }
};

/**
 * Plain record describing one phrase pair: a start/end position on the
 * target side, a start/end position on the source side, and the index of
 * the sentence it belongs to.
 */
class PhrasePair
{
public:
	int m_startTarget;
	int m_endTarget;
	int m_startSource;
	int m_endSource;
	int m_sntIndex;

	// Copies every argument into the member of the same name.
	PhrasePair(int startTarget, int endTarget, int startSource, int endSource, int sntIndex)
	: m_startTarget(startTarget), m_endTarget(endTarget)
	, m_startSource(startSource), m_endSource(endSource)
	, m_sntIndex(sntIndex)
	{}

	// Number of target positions covered; both end points are counted.
	size_t GetTargetSize() const
	{
		const int targetSpan = m_endTarget - m_startTarget + 1;
		return targetSpan;
	}
};
	
// Alignment record for a single sentence pair: the sentence index, pointers
// to the source and target word-ID sequences, and two alignment vectors.
// Only declarations are visible in this header; the constructor and Extract()
// are defined elsewhere.
class SentenceAlignment 
{
public:
  // Takes the sentence index plus the source and target sizes.
  // NOTE(review): presumably sizes the alignment vectors below -- confirm
  // against the implementation.
  SentenceAlignment(int sntIndex, int sourceSize, int targetSize);
	// Index of the sentence this record describes.
	int m_sntIndex;
	// Pointers to the target / source word-ID sequences. Ownership is not
	// visible from this header.
	vector<wordID_t>* trgSnt;
  vector<wordID_t>* srcSnt;
  // NOTE(review): presumably a per-word count of alignment points -- confirm.
  vector<int> numberAligned; 
  // NOTE(review): presumably, for each word, the positions it is aligned
  // to -- confirm.
  vector< vector<int> > alignedList; 
	// Takes a maximum phrase length, an output vector of PhrasePair pointers and
	// a source start/end position; returns a bool.
	// NOTE(review): presumably fills ret with the phrase pairs found for that
	// source span; ownership of the pointers in ret is not visible here.
	bool Extract(int maxPhraseLength, vector<PhrasePair*> &ret, int startSource, int endSource) const;
};
/**
 * Comparison functor over Scores: each side is reduced to a weighted total
 * and the totals are compared with '<'.
 *
 * Only a reference to the weight vector is stored, so that vector must
 * outlive the comparator.
 */
class ScoresComp {
public:
  ScoresComp(const vector<float>& weights): m_weights(weights) {}

  // True when the weighted total of s1 is smaller than that of s2.
  bool operator()(const Scores& s1, const Scores& s2) const {
    const float lhsTotal = weightedTotal(s1);
    const float rhsTotal = weightedTotal(s2);
    return lhsTotal < rhsTotal;
  }

private:
  // Starts from 1 and adds element * weight for every element of scores, in
  // order. m_weights.at() throws std::out_of_range if scores has more entries
  // than there are weights.
  float weightedTotal(const Scores& scores) const {
    float total(1);
    int weightIdx(0);
    iterate(scores, itr) total += (*itr * m_weights.at(weightIdx++));
    return total;
  }

  const vector<float>& m_weights;
};
	
class PhraseDictionaryDynSuffixArray: public PhraseDictionary {
public: 
  PhraseDictionaryDynSuffixArray(size_t numScoreComponent, PhraseDictionaryFeature* feature);
  ~PhraseDictionaryDynSuffixArray();
	bool Load(string source, string target, string alignments
						, const vector<float> &weight
						, size_t tableLimit
						, const LMList &languageModels
						, float weightWP);
	void LoadVocabLookup();
  void save(string);
  void load(string);
  // functions below required by base class
  void SetWeightTransModel(const vector<float, std::allocator<float> >&);
  const TargetPhraseCollection* GetTargetPhraseCollection(const Phrase& src) const;
  void InitializeForInput(const InputType& i);
  void AddEquivPhrase(const Phrase &source, const TargetPhrase &targetPhrase){}
  void CleanUp();
private:
  DynSuffixArray* srcSA_;
  DynSuffixArray* trgSA_;
  vector<wordID_t>* srcCrp_;
  vector<wordID_t>* trgCrp_;
  vector<unsigned> srcSntBreaks_, trgSntBreaks_;
  Vocab* vocab_;
  ScoresComp* scoreCmp_;
  vector<SentenceAlignment> alignments_;
  vector<vector<short> > rawAlignments_;
	vector<float> m_weight;
	size_t m_tableLimit;
	const LMList *m_languageModels;
	float m_weightWP;
	std::map<const Factor *, wordID_t> vocabLookup_;
	std::map<wordID_t, const Factor *> vocabLookupRev_;	
  mutable std::map<pair<wordID_t, wordID_t>, pair<float, float> > wordPairCache_; 
  const int maxPhraseLength_, maxSampleSize_;
  int loadCorpus(InputFileStream&, vector<wordID_t>&, vector<wordID_t>&);
  int loadAlignments(InputFileStream& aligs);
  int loadRawAlignments(InputFileStream& aligs);
  bool extractPhrases(const int&, const int&, const int&, vector<PhrasePair*>&, bool=false) const;
  SentenceAlignment getSentenceAlignment(const int, bool=false) const; 
  vector<unsigned> sampleSelection(vector<unsigned>) const;
  vector<int> getSntIndexes(vector<unsigned>&, const int) const; 	
  TargetPhrase* getMosesFactorIDs(const SAPhrase&) const;
  SAPhrase trgPhraseFromSntIdx(const PhrasePair&) const;
  bool getLocalVocabIDs(const Phrase&, SAPhrase &) const;
  void cacheWordProbs(wordID_t) const;
  pair<float, float> getLexicalWeight(const PhrasePair&) const;
	int GetSourceSentenceSize(size_t sentenceId) const
	{ return (sentenceId==srcSntBreaks_.size()-1) ? srcCrp_->size() - srcSntBreaks_.at(sentenceId) : srcSntBreaks_.at(sentenceId+1) - srcSntBreaks_.at(sentenceId); }
	int GetTargetSentenceSize(size_t sentenceId) const
	{ return (sentenceId==trgSntBreaks_.size()-1) ? trgCrp_->size() - trgSntBreaks_.at(sentenceId) : trgSntBreaks_.at(sentenceId+1) - trgSntBreaks_.at(sentenceId); }
};
} // end namespace
#endif
Added suffix array phrase dictionary git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@2890 1f5c12ca-751b-0410-a591-d2e778427230 2010-02-12 14:05:43 +03:00			`#ifndef MOSES_DICTSFXARY_H`
			`#define MOSES_DICTSFXARY_H`

			`#include "PhraseDictionary.h"`
			`#include "DynSuffixArray.h"`
			`#include "DynSAInclude/vocab.h"`
			`#include "DynSAInclude/types.h"`
			`#include "DynSAInclude/utils.h"`
			`#include "InputFileStream.h"`
			`namespace Moses {`

			`class SAPhrase`
			`{`
			`public:`
			`vector<wordID_t> words;`

			`SAPhrase(size_t phraseSize)`
			`:words(phraseSize)`
			`{}`

			`void SetId(size_t pos, wordID_t id)`
			`{`
			`assert(pos < words.size());`
			`words[pos] = id;`
			`}`
			`bool operator<(const SAPhrase& phr2) const`
			`{ return words < phr2.words; }`
			`};`

			`class PhrasePair`
			`{`
			`public:`
			`int m_startTarget, m_endTarget, m_startSource, m_endSource, m_sntIndex;`
			`PhrasePair(int startTarget, int endTarget, int startSource, int endSource, int sntIndex)`
			`: m_startTarget(startTarget)`
			`, m_endTarget(endTarget)`
			`, m_startSource(startSource)`
			`, m_endSource(endSource)`
			`, m_sntIndex(sntIndex)`
			`{}`

			`size_t GetTargetSize() const`
			`{ return m_endTarget - m_startTarget + 1; }`
			`};`

			`class SentenceAlignment`
			`{`
			`public:`
			`SentenceAlignment(int sntIndex, int sourceSize, int targetSize);`
			`int m_sntIndex;`
			`vector<wordID_t>* trgSnt;`
			`vector<wordID_t>* srcSnt;`
added ability to go src->trg and trg->src phrase extraction in suffix array phrase implementation git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@2895 1f5c12ca-751b-0410-a591-d2e778427230 2010-02-12 20:29:15 +03:00			`vector<int> numberAligned;`
			`vector< vector<int> > alignedList;`
Added suffix array phrase dictionary git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@2890 1f5c12ca-751b-0410-a591-d2e778427230 2010-02-12 14:05:43 +03:00			`bool Extract(int maxPhraseLength, vector<PhrasePair*> &ret, int startSource, int endSource) const;`
			`};`
			`class ScoresComp {`
			`public:`
			`ScoresComp(const vector<float>& weights): m_weights(weights) {}`
			`bool operator()(const Scores& s1, const Scores& s2) const {`
			`float score1(1), score2(1);`
			`int idx1(0), idx2(0);`
			`iterate(s1, itr) score1 += (*itr * m_weights.at(idx1++));`
			`iterate(s2, itr) score2 += (*itr * m_weights.at(idx2++));`
			`return score1 < score2;`
			`}`
			`private:`
			`const vector<float>& m_weights;`
			`};`

			`class PhraseDictionaryDynSuffixArray: public PhraseDictionary {`
			`public:`
			`PhraseDictionaryDynSuffixArray(size_t numScoreComponent, PhraseDictionaryFeature* feature);`
			`~PhraseDictionaryDynSuffixArray();`
			`bool Load(string source, string target, string alignments`
			`, const vector<float> &weight`
			`, size_t tableLimit`
			`, const LMList &languageModels`
			`, float weightWP);`
			`void LoadVocabLookup();`
			`void save(string);`
			`void load(string);`
			`// functions below required by base class`
			`void SetWeightTransModel(const vector<float, std::allocator<float> >&);`
			`const TargetPhraseCollection* GetTargetPhraseCollection(const Phrase& src) const;`
			`void InitializeForInput(const InputType& i);`
			`void AddEquivPhrase(const Phrase &source, const TargetPhrase &targetPhrase){}`
			`void CleanUp();`
			`private:`
			`DynSuffixArray* srcSA_;`
			`DynSuffixArray* trgSA_;`
			`vector<wordID_t>* srcCrp_;`
			`vector<wordID_t>* trgCrp_;`
			`vector<unsigned> srcSntBreaks_, trgSntBreaks_;`
			`Vocab* vocab_;`
			`ScoresComp* scoreCmp_;`
			`vector<SentenceAlignment> alignments_;`
added ability to go src->trg and trg->src phrase extraction in suffix array phrase implementation git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@2895 1f5c12ca-751b-0410-a591-d2e778427230 2010-02-12 20:29:15 +03:00			`vector<vector<short> > rawAlignments_;`
Added suffix array phrase dictionary git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@2890 1f5c12ca-751b-0410-a591-d2e778427230 2010-02-12 14:05:43 +03:00			`vector<float> m_weight;`
			`size_t m_tableLimit;`
			`const LMList *m_languageModels;`
			`float m_weightWP;`
			`std::map<const Factor *, wordID_t> vocabLookup_;`
			`std::map<wordID_t, const Factor *> vocabLookupRev_;`
			`mutable std::map<pair<wordID_t, wordID_t>, pair<float, float> > wordPairCache_;`
added ability to go src->trg and trg->src phrase extraction in suffix array phrase implementation git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@2895 1f5c12ca-751b-0410-a591-d2e778427230 2010-02-12 20:29:15 +03:00			`const int maxPhraseLength_, maxSampleSize_;`
			`int loadCorpus(InputFileStream&, vector<wordID_t>&, vector<wordID_t>&);`
Added suffix array phrase dictionary git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@2890 1f5c12ca-751b-0410-a591-d2e778427230 2010-02-12 14:05:43 +03:00			`int loadAlignments(InputFileStream& aligs);`
added ability to go src->trg and trg->src phrase extraction in suffix array phrase implementation git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@2895 1f5c12ca-751b-0410-a591-d2e778427230 2010-02-12 20:29:15 +03:00			`int loadRawAlignments(InputFileStream& aligs);`
			`bool extractPhrases(const int&, const int&, const int&, vector<PhrasePair*>&, bool=false) const;`
			`SentenceAlignment getSentenceAlignment(const int, bool=false) const;`
Added suffix array phrase dictionary git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@2890 1f5c12ca-751b-0410-a591-d2e778427230 2010-02-12 14:05:43 +03:00			`vector<unsigned> sampleSelection(vector<unsigned>) const;`
			`vector<int> getSntIndexes(vector<unsigned>&, const int) const;`
			`TargetPhrase* getMosesFactorIDs(const SAPhrase&) const;`
			`SAPhrase trgPhraseFromSntIdx(const PhrasePair&) const;`
added ability to go src->trg and trg->src phrase extraction in suffix array phrase implementation git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@2895 1f5c12ca-751b-0410-a591-d2e778427230 2010-02-12 20:29:15 +03:00			`bool getLocalVocabIDs(const Phrase&, SAPhrase &) const;`
Added suffix array phrase dictionary git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@2890 1f5c12ca-751b-0410-a591-d2e778427230 2010-02-12 14:05:43 +03:00			`void cacheWordProbs(wordID_t) const;`
			`pair<float, float> getLexicalWeight(const PhrasePair&) const;`
			`int GetSourceSentenceSize(size_t sentenceId) const`
			`{ return (sentenceId==srcSntBreaks_.size()-1) ? srcCrp_->size() - srcSntBreaks_.at(sentenceId) : srcSntBreaks_.at(sentenceId+1) - srcSntBreaks_.at(sentenceId); }`
			`int GetTargetSentenceSize(size_t sentenceId) const`
			`{ return (sentenceId==trgSntBreaks_.size()-1) ? trgCrp_->size() - trgSntBreaks_.at(sentenceId) : trgSntBreaks_.at(sentenceId+1) - trgSntBreaks_.at(sentenceId); }`
			`};`
			`} // end namespace`
			`#endif`