#ifndef moses_BilingualDynSuffixArray_h #define moses_BilingualDynSuffixArray_h #include "DynSuffixArray.h" #include "moses/TranslationModel/DynSAInclude/vocab.h" #include "moses/TranslationModel/DynSAInclude/types.h" #include "moses/TranslationModel/DynSAInclude/utils.h" #include "moses/TranslationModel/WordCoocTable.h" #include "moses/InputFileStream.h" #include "moses/FactorTypeSet.h" #include "moses/TargetPhrase.h" #include #include "moses/TargetPhraseCollection.h" #include using namespace std; namespace Moses { class PhraseDictionaryDynSuffixArray; /** @todo ask Abbey Levenberg */ class SAPhrase { public: vector words; SAPhrase(size_t phraseSize) :words(phraseSize) { } void SetId(size_t pos, wordID_t id) { CHECK(pos < words.size()); words[pos] = id; } bool operator<(const SAPhrase& phr2) const { return words < phr2.words; } }; /** @todo ask Abbey Levenberg */ class PhrasePair { public: int m_startTarget, m_endTarget, m_startSource, m_endSource, m_sntIndex; PhrasePair(int startTarget, int endTarget, int startSource, int endSource, int sntIndex) : m_startTarget(startTarget) , m_endTarget(endTarget) , m_startSource(startSource) , m_endSource(endSource) , m_sntIndex(sntIndex) { } size_t GetTargetSize() const { return m_endTarget - m_startTarget + 1; } size_t GetSourceSize() const { return m_endSource - m_startSource + 1; } }; /** @todo ask Abbey Levenberg */ class SentenceAlignment { public: SentenceAlignment(int sntIndex, int sourceSize, int targetSize); int m_sntIndex; vector* trgSnt; vector* srcSnt; vector numberAligned; vector< vector > alignedList; bool Extract(int maxPhraseLength, vector &ret, int startSource, int endSource) const; }; class ScoresComp { public: ScoresComp(const vector& weights): m_weights(weights) {} bool operator()(const Scores& s1, const Scores& s2) const { return s1[0] < s2[0]; // just p(e|f) as approximation // float score1(0), score2(0); // int idx1(0), idx2(0); // for (Scores::const_iterator itr = s1.begin(); // itr != s1.end(); ++itr) { // score1 += log(*itr * m_weights.at(idx1++)); // } // for (Scores::const_iterator itr = s2.begin(); // itr != s2.end(); ++itr) { // score2 += log(*itr * m_weights.at(idx2++)); // } // return score1 < score2; } private: const vector& m_weights; }; struct BetterPhrase { ScoresComp const& cmp; BetterPhrase(ScoresComp const& sc); // bool operator()(pair const& a, // pair const& b) const; bool operator()(pair const& a, pair const& b) const; }; /** @todo ask Abbey Levenberg */ class BilingualDynSuffixArray { public: BilingualDynSuffixArray(); ~BilingualDynSuffixArray(); bool Load( const vector& inputFactors, const vector& outputTactors, string source, string target, string alignments, const vector &weight); // bool LoadTM( const vector& inputFactors, // const vector& outputTactors, // string source, string target, string alignments, // const vector &weight); void GetTargetPhrasesByLexicalWeight(const Phrase& src, vector< pair >& target) const; void CleanUp(const InputType& source); void addSntPair(string& source, string& target, string& alignment); pair GatherCands(Phrase const& src, map >& pstats) const; TargetPhrase* GetMosesFactorIDs(const SAPhrase&, const Phrase& sourcePhrase) const; private: mutable WordCoocTable m_wrd_cooc; DynSuffixArray * m_srcSA; DynSuffixArray * m_trgSA; vector* m_srcCorpus; vector* m_trgCorpus; vector m_inputFactors; vector m_outputFactors; vector m_srcSntBreaks, m_trgSntBreaks; Vocab* m_srcVocab, *m_trgVocab; ScoresComp* m_scoreCmp; vector m_alignments; vector > m_rawAlignments; mutable map, pair > m_wordPairCache; mutable set m_freqWordsCached; const size_t m_maxPhraseLength, m_maxSampleSize; const size_t m_maxPTEntries; int LoadCorpus(FactorDirection direction, InputFileStream&, const vector& factors, vector&, vector&, Vocab*); int LoadAlignments(InputFileStream& aligs); int LoadRawAlignments(InputFileStream& aligs); int LoadRawAlignments(string& aligs); bool ExtractPhrases(const int&, const int&, const int&, vector&, bool=false) const; SentenceAlignment GetSentenceAlignment(const int, bool=false) const; int SampleSelection(vector&, int = 300) const; vector GetSntIndexes(vector&, int, const vector&) const; SAPhrase TrgPhraseFromSntIdx(const PhrasePair&) const; bool GetLocalVocabIDs(const Phrase&, SAPhrase &) const; void CacheWordProbs(wordID_t) const; void CacheFreqWords() const; void ClearWordInCache(wordID_t); pair GetLexicalWeight(const PhrasePair&) const; int GetSourceSentenceSize(size_t sentenceId) const; int GetTargetSentenceSize(size_t sentenceId) const; }; } // end namespace #endif