diff --git a/OnDiskPt/Main.cpp b/OnDiskPt/Main.cpp
index 5d4e0be8d..c3cda2a75 100644
--- a/OnDiskPt/Main.cpp
+++ b/OnDiskPt/Main.cpp
@@ -50,14 +50,14 @@ int main (int argc, char * const argv[])
   }
 
   int numSourceFactors = Moses::Scan<int>(argv[1])
-      , numTargetFactors = Moses::Scan<int>(argv[2])
-      , numScores = Moses::Scan<int>(argv[3])
-      , tableLimit = Moses::Scan<int>(argv[4]);
+                         , numTargetFactors = Moses::Scan<int>(argv[2])
+                         , numScores = Moses::Scan<int>(argv[3])
+                         , tableLimit = Moses::Scan<int>(argv[4]);
   TargetPhraseCollection::s_sortScoreInd = Moses::Scan<int>(argv[5]);
   assert(TargetPhraseCollection::s_sortScoreInd < numScores);
-
+
   const string filePath = argv[6]
-      ,destPath = argv[7];
+                          ,destPath = argv[7];
 
   Moses::InputFileStream inStream(filePath);
 
@@ -128,10 +128,10 @@ OnDiskPt::PhrasePtr Tokenize(SourcePhrase &sourcePhrase, TargetPhrase &targetPhr
     } else {
       switch (stage) {
       case 0: {
-        WordPtr w = Tokenize(sourcePhrase, tok, true, true, onDiskWrapper);
-        if (w != NULL)
-          out->AddWord(w);
-
+        WordPtr w = Tokenize(sourcePhrase, tok, true, true, onDiskWrapper);
+        if (w != NULL)
+          out->AddWord(w);
+
         break;
       }
       case 1: {
@@ -146,19 +146,19 @@ OnDiskPt::PhrasePtr Tokenize(SourcePhrase &sourcePhrase, TargetPhrase &targetPhr
       }
       case 3: {
         //targetPhrase.Create1AlignFromString(tok);
-        targetPhrase.CreateAlignFromString(tok);
+        targetPhrase.CreateAlignFromString(tok);
         break;
       }
       case 4:
         ++stage;
         break;
-      /* case 5: {
-         // count info. Only store the 2nd one
-         float val = Moses::Scan<float>(tok);
-         misc[0] = val;
-         ++stage;
-         break;
-       }*/
+      /* case 5: {
+         // count info. Only store the 2nd one
+         float val = Moses::Scan<float>(tok);
+         misc[0] = val;
+         ++stage;
+         break;
+       }*/
       case 5: {
         // count info. Only store the 2nd one
         //float val = Moses::Scan<float>(tok);
@@ -167,12 +167,12 @@ OnDiskPt::PhrasePtr Tokenize(SourcePhrase &sourcePhrase, TargetPhrase &targetPhr
         break;
       }
       case 6: {
-        // store only the 3rd one (rule count)
+        // store only the 3rd one (rule count)
         float val = Moses::Scan<float>(tok);
         misc[0] = val;
         ++stage;
         break;
-      }
+      }
       default:
         cerr << "ERROR in line " << line << endl;
         assert(false);
@@ -189,8 +189,8 @@ OnDiskPt::PhrasePtr Tokenize(SourcePhrase &sourcePhrase, TargetPhrase &targetPhr
 } // Tokenize()
 
 OnDiskPt::WordPtr Tokenize(OnDiskPt::Phrase &phrase
-                           , const std::string &token, bool addSourceNonTerm, bool addTargetNonTerm
-                           , OnDiskPt::OnDiskWrapper &onDiskWrapper)
+                           , const std::string &token, bool addSourceNonTerm, bool addTargetNonTerm
+                           , OnDiskPt::OnDiskWrapper &onDiskWrapper)
 {
   bool nonTerm = false;
@@ -218,7 +218,7 @@ OnDiskPt::WordPtr Tokenize(OnDiskPt::Phrase &phrase
     if (addSourceNonTerm) {
       WordPtr word(new Word());
       word->CreateFromString(wordStr, onDiskWrapper.GetVocab());
-      phrase.AddWord(word);
+      phrase.AddWord(word);
     }
 
     wordStr = token.substr(splitPos, tokSize - splitPos);
@@ -237,7 +237,7 @@ OnDiskPt::WordPtr Tokenize(OnDiskPt::Phrase &phrase
     phrase.AddWord(word);
     out = word;
   }
-
+
   return out;
 }
 
diff --git a/OnDiskPt/Main.h b/OnDiskPt/Main.h
index b79827589..5c7efa43c 100644
--- a/OnDiskPt/Main.h
+++ b/OnDiskPt/Main.h
@@ -26,12 +26,12 @@
 typedef std::pair<size_t, size_t> AlignPair;
 typedef std::vector<AlignPair> AlignType;
 
 OnDiskPt::WordPtr Tokenize(OnDiskPt::Phrase &phrase
-                           , const std::string &token, bool addSourceNonTerm, bool addTargetNonTerm
-                           , OnDiskPt::OnDiskWrapper &onDiskWrapper);
+                           , const std::string &token, bool addSourceNonTerm, bool addTargetNonTerm
+                           , OnDiskPt::OnDiskWrapper &onDiskWrapper);
 
 OnDiskPt::PhrasePtr Tokenize(OnDiskPt::SourcePhrase &sourcePhrase, OnDiskPt::TargetPhrase &targetPhrase
-                             , char *line, OnDiskPt::OnDiskWrapper &onDiskWrapper
-                             , int numScores
-                             , std::vector<float> &misc);
+                             , char *line, OnDiskPt::OnDiskWrapper &onDiskWrapper
+                             , int numScores
+                             , std::vector<float> &misc);
 
 void InsertTargetNonTerminals(std::vector<std::string> &sourceToks, const std::vector<std::string> &targetToks, const AlignType &alignments);
 void SortAlign(AlignType &alignments);
 
diff --git a/OnDiskPt/OnDiskQuery.cpp b/OnDiskPt/OnDiskQuery.cpp
index c39697d04..2cc7380db 100644
--- a/OnDiskPt/OnDiskQuery.cpp
+++ b/OnDiskPt/OnDiskQuery.cpp
@@ -3,10 +3,10 @@
 namespace OnDiskPt
 {
 
-void OnDiskQuery::Tokenize(Phrase &phrase,
-                           const std::string &token,
-                           bool addSourceNonTerm,
-                           bool addTargetNonTerm)
+void OnDiskQuery::Tokenize(Phrase &phrase,
+                           const std::string &token,
+                           bool addSourceNonTerm,
+                           bool addTargetNonTerm)
 {
   bool nonTerm = false;
   size_t tokSize = token.size();
@@ -50,13 +50,13 @@ void OnDiskQuery::Tokenize(Phrase &phrase,
     phrase.AddWord(word);
   }
 }
-
+
 SourcePhrase OnDiskQuery::Tokenize(const std::vector<std::string>& tokens)
 {
   SourcePhrase sourcePhrase;
-  if (tokens.size() > 0){
+  if (tokens.size() > 0) {
     std::vector<std::string>::const_iterator token = tokens.begin();
-    for (; token + 1 != tokens.end(); ++token){
+    for (; token + 1 != tokens.end(); ++token) {
       Tokenize(sourcePhrase, *token, true, true);
     }
     // last position. LHS non-term
@@ -64,22 +64,20 @@ SourcePhrase OnDiskQuery::Tokenize(const std::vector<std::string>& tokens)
   }
   return sourcePhrase;
 }
-
+
 const PhraseNode* OnDiskQuery::Query(const SourcePhrase& sourcePhrase)
 {
-  const PhraseNode *node = &m_wrapper.GetRootSourceNode();
-  assert(node);
-
-  for (size_t pos = 0; pos < sourcePhrase.GetSize(); ++pos)
-  {
-    const Word &word = sourcePhrase.GetWord(pos);
-    node = node->GetChild(word, m_wrapper);
-    if (node == NULL)
-    {
-      break;
-    }
+  const PhraseNode *node = &m_wrapper.GetRootSourceNode();
+  assert(node);
+
+  for (size_t pos = 0; pos < sourcePhrase.GetSize(); ++pos) {
+    const Word &word = sourcePhrase.GetWord(pos);
+    node = node->GetChild(word, m_wrapper);
+    if (node == NULL) {
+      break;
     }
-    return node;
+  }
+  return node;
 }
 
 }
diff --git a/OnDiskPt/OnDiskQuery.h b/OnDiskPt/OnDiskQuery.h
index 679f545fa..233603c6c 100644
--- a/OnDiskPt/OnDiskQuery.h
+++ b/OnDiskPt/OnDiskQuery.h
@@ -18,22 +18,21 @@ private:
 
 public:
 
-  OnDiskQuery(OnDiskWrapper &wrapper):m_wrapper(wrapper){}
+  OnDiskQuery(OnDiskWrapper &wrapper):m_wrapper(wrapper) {}
+
+  void Tokenize(Phrase &phrase,
+                const std::string &token,
+                bool addSourceNonTerm,
+                bool addTargetNonTerm);
 
-  void Tokenize(Phrase &phrase,
-                const std::string &token,
-                bool addSourceNonTerm,
-                bool addTargetNonTerm);
-
   SourcePhrase Tokenize(const std::vector<std::string>& tokens);
 
   const PhraseNode *Query(const SourcePhrase& sourcePhrase);
 
-  inline const PhraseNode *Query(const std::vector<std::string>& tokens)
-  {
+  inline const PhraseNode *Query(const std::vector<std::string>& tokens) {
     return Query(Tokenize(tokens));
   }
-
+
 };
diff --git a/OnDiskPt/OnDiskWrapper.cpp b/OnDiskPt/OnDiskWrapper.cpp
index 3a1773c0a..8f90862be 100644
--- a/OnDiskPt/OnDiskWrapper.cpp
+++ b/OnDiskPt/OnDiskWrapper.cpp
@@ -204,16 +204,16 @@ Word *OnDiskWrapper::ConvertFromMoses(Moses::FactorDirection /* direction */
   Word *newWord = new Word(isNonTerminal);
   stringstream strme;
 
-  size_t factorType = factorsVec[0];
+  size_t factorType = factorsVec[0];
   const Moses::Factor *factor = origWord.GetFactor(factorType);
-  CHECK(factor);
+  CHECK(factor);
   strme << factor->GetString();
 
   for (size_t ind = 1 ; ind < factorsVec.size() ; ++ind) {
     size_t factorType = factorsVec[ind];
     const Moses::Factor *factor = origWord.GetFactor(factorType);
-    if (factor == NULL)
-    { // can have less factors than factorType.size()
+    if (factor == NULL) {
+      // can have less factors than factorType.size()
       break;
     }
     CHECK(factor);
diff --git a/OnDiskPt/OnDiskWrapper.h b/OnDiskPt/OnDiskWrapper.h
index f763194c1..8b786d346 100644
--- a/OnDiskPt/OnDiskWrapper.h
+++ b/OnDiskPt/OnDiskWrapper.h
@@ -28,7 +28,7 @@ namespace OnDiskPt
 {
 const float DEFAULT_COUNT = 66666;
 
-/** Global class with misc information needed to create and use the on-disk rule table.
+/** Global class with misc information needed to create and use the on-disk rule table.
  * 1 object of this class should be instantiated per rule table.
  * Currently only hierarchical/syntax models use this, but can & should be used with pb models too
  */
diff --git a/OnDiskPt/PhraseNode.cpp b/OnDiskPt/PhraseNode.cpp
index c3f2ebdc4..c259aa077 100644
--- a/OnDiskPt/PhraseNode.cpp
+++ b/OnDiskPt/PhraseNode.cpp
@@ -38,7 +38,7 @@ size_t PhraseNode::GetNodeSize(size_t numChildren, size_t wordSize, size_t count
 }
 
 PhraseNode::PhraseNode()
-  : m_value(0)
+  : m_value(0)
   ,m_currChild(NULL)
   ,m_saved(false)
   ,m_memLoad(NULL)
@@ -58,7 +58,7 @@ PhraseNode::PhraseNode(UINT64 filePos, OnDiskWrapper &onDiskWrapper)
   CHECK(filePos == (UINT64)file.tellg());
 
   file.read((char*) &m_numChildrenLoad, sizeof(UINT64));
-
+
   size_t memAlloc = GetNodeSize(m_numChildrenLoad, onDiskWrapper.GetSourceWordSize(), countSize);
   m_memLoad = (char*) malloc(memAlloc);
 
@@ -168,7 +168,7 @@ void PhraseNode::AddTargetPhrase(const SourcePhrase &sourcePhrase, TargetPhrase
 void PhraseNode::AddTargetPhrase(size_t pos, const SourcePhrase &sourcePhrase
                                  , TargetPhrase *targetPhrase, OnDiskWrapper &onDiskWrapper
                                  , size_t tableLimit, const std::vector<float> &counts, OnDiskPt::PhrasePtr spShort)
-{
+{
   size_t phraseSize = sourcePhrase.GetSize();
   if (pos < phraseSize) {
     const Word &word = sourcePhrase.GetWord(pos);
@@ -185,7 +185,7 @@ void PhraseNode::AddTargetPhrase(size_t pos, const SourcePhrase &sourcePhrase
       m_currChild = &node;
     }
 
-    // keep searching for target phrase node..
+    // keep searching for target phrase node..
     node.AddTargetPhrase(pos + 1, sourcePhrase, targetPhrase, onDiskWrapper, tableLimit, counts, spShort);
   } else {
     // drilled down to the right node
diff --git a/OnDiskPt/PhraseNode.h b/OnDiskPt/PhraseNode.h
index fbd20ce36..6b629a401 100644
--- a/OnDiskPt/PhraseNode.h
+++ b/OnDiskPt/PhraseNode.h
@@ -53,7 +53,7 @@ protected:
   void AddTargetPhrase(size_t pos, const SourcePhrase &sourcePhrase
                        , TargetPhrase *targetPhrase, OnDiskWrapper &onDiskWrapper
                        , size_t tableLimit, const std::vector<float> &counts, OnDiskPt::PhrasePtr spShort);
-  size_t ReadChild(Word &wordFound, UINT64 &childFilePos, const char *mem) const;
+  size_t ReadChild(Word &wordFound, UINT64 &childFilePos, const char *mem) const;
   void GetChild(Word &wordFound, UINT64 &childFilePos, size_t ind, OnDiskWrapper &onDiskWrapper) const;
 
 public:
diff --git a/OnDiskPt/TargetPhrase.cpp b/OnDiskPt/TargetPhrase.cpp
index 6ca2ef5f9..2e3e3511b 100644
--- a/OnDiskPt/TargetPhrase.cpp
+++ b/OnDiskPt/TargetPhrase.cpp
@@ -64,13 +64,13 @@ void TargetPhrase::Create1AlignFromString(const std::string &align1Str)
 
 void TargetPhrase::CreateAlignFromString(const std::string &alignStr)
 {
-  vector<string> alignPairs;
-  boost::split(alignPairs, alignStr, boost::is_any_of("\t "));
-  for (size_t i = 0; i < alignPairs.size(); ++i) {
-    vector<size_t> alignPoints;
-    Moses::Tokenize<size_t>(alignPoints, alignPairs[i], "-");
-    m_align.push_back(pair<size_t, size_t>(alignPoints[0], alignPoints[1]) );
-  }
+  vector<string> alignPairs;
+  boost::split(alignPairs, alignStr, boost::is_any_of("\t "));
+  for (size_t i = 0; i < alignPairs.size(); ++i) {
+    vector<size_t> alignPoints;
+    Moses::Tokenize<size_t>(alignPoints, alignPairs[i], "-");
+    m_align.push_back(pair<size_t, size_t>(alignPoints[0], alignPoints[1]) );
+  }
 }
 
@@ -97,16 +97,16 @@ char *TargetPhrase::WriteToMemory(OnDiskWrapper &onDiskWrapper, size_t &memUsed)
 {
   size_t phraseSize = GetSize();
   size_t targetWordSize = onDiskWrapper.GetTargetWordSize();
-
+
   const PhrasePtr sp = GetSourcePhrase();
   size_t spSize = sp->GetSize();
   size_t sourceWordSize = onDiskWrapper.GetSourceWordSize();
-
+
   size_t memNeeded = sizeof(UINT64) // num of words
                      + targetWordSize * phraseSize // actual words. lhs as last words
-                     + sizeof(UINT64) // num source words
-                     + sourceWordSize * spSize; // actual source words
-
+                     + sizeof(UINT64) // num source words
+                     + sourceWordSize * spSize; // actual source words
+
   memUsed = 0;
   UINT64 *mem = (UINT64*) malloc(memNeeded);
 
@@ -125,13 +125,13 @@ char *TargetPhrase::WriteToMemory(OnDiskWrapper &onDiskWrapper, size_t &memUsed)
   char *currPtr = (char*)mem + memUsed;
   UINT64 *memTmp = (UINT64*) currPtr;
   memTmp[0] = spSize;
-  memUsed += sizeof(UINT64);
+  memUsed += sizeof(UINT64);
   for (size_t pos = 0; pos < spSize; ++pos) {
     const Word &word = sp->GetWord(pos);
     char *currPtr = (char*)mem + memUsed;
     memUsed += word.WriteToMemory((char*) currPtr);
   }
-
+
   CHECK(memUsed == memNeeded);
   return (char *) mem;
 }
@@ -174,7 +174,7 @@ char *TargetPhrase::WriteOtherInfoToMemory(OnDiskWrapper &onDiskWrapper, size_t
   // phrase id
   memcpy(mem, &m_filePos, sizeof(UINT64));
   memUsed += sizeof(UINT64);
-
+
   // align
   size_t tmp = WriteAlignToMemory(mem + memUsed);
   memUsed += tmp;
@@ -223,7 +223,7 @@ size_t TargetPhrase::WriteScoresToMemory(char *mem) const
 }
 
 
-Moses::TargetPhrase *TargetPhrase::ConvertToMoses(const std::vector<Moses::FactorType> & inputFactors
+Moses::TargetPhrase *TargetPhrase::ConvertToMoses(const std::vector<Moses::FactorType> & inputFactors
     , const std::vector<Moses::FactorType> &outputFactors
     , const Vocab &vocab
     , const Moses::PhraseDictionary &phraseDict
@@ -244,7 +244,7 @@ Moses::TargetPhrase *TargetPhrase::ConvertToMoses(const std::vector<Moses::FactorType> & inputFactors
   std::set<std::pair<size_t, size_t> > alignmentInfo;
-  const PhrasePtr sp = GetSourcePhrase();
+  const PhrasePtr sp = GetSourcePhrase();
   for (size_t ind = 0; ind < m_align.size(); ++ind) {
     const std::pair<size_t, size_t> &entry = m_align[ind];
     alignmentInfo.insert(entry);
@@ -252,11 +252,10 @@ Moses::TargetPhrase *TargetPhrase::ConvertToMoses(const std::vector<Moses::FactorType> & inputFactors
-      alignNonTerm.insert(std::pair<size_t, size_t>(sourcePos, targetPos));
+      alignNonTerm.insert(std::pair<size_t, size_t>(sourcePos, targetPos));
+    } else {
+      alignTerm.insert(std::pair<size_t, size_t>(sourcePos, targetPos));
     }
-    else {
-      alignTerm.insert(std::pair<size_t, size_t>(sourcePos, targetPos));
-    }
   }
 
   ret->SetAlignTerm(alignTerm);
@@ -313,7 +312,7 @@ UINT64 TargetPhrase::ReadFromFile(std::fstream &fileTP)
     bytesRead += word->ReadFromFile(fileTP);
     AddWord(word);
   }
-
+
   // read source words
   UINT64 numSourceWords;
   fileTP.read((char*) &numSourceWords, sizeof(UINT64));
@@ -371,7 +370,7 @@ UINT64 TargetPhrase::ReadScoresFromFile(std::fstream &fileTPColl)
 void TargetPhrase::DebugPrint(ostream &out, const Vocab &vocab) const
 {
   Phrase::DebugPrint(out, vocab);
-
+
   for (size_t ind = 0; ind < m_align.size(); ++ind) {
     const AlignPair &alignPair = m_align[ind];
     out << alignPair.first << "-" << alignPair.second << " ";
diff --git a/OnDiskPt/TargetPhrase.h b/OnDiskPt/TargetPhrase.h
index 5510ddd11..c4bb40454 100644
--- a/OnDiskPt/TargetPhrase.h
+++ b/OnDiskPt/TargetPhrase.h
@@ -49,7 +49,7 @@ class TargetPhrase: public Phrase
   friend std::ostream& operator<<(std::ostream&, const TargetPhrase&);
 protected:
   AlignType m_align;
-  PhrasePtr m_sourcePhrase;
+  PhrasePtr m_sourcePhrase;
 
   std::vector<float> m_scores;
   UINT64 m_filePos;
@@ -73,10 +73,10 @@ public:
   const PhrasePtr GetSourcePhrase() const {
     return m_sourcePhrase;
   }
-  const std::vector<float> &GetScores() const{
+  const std::vector<float> &GetScores() const {
     return m_scores;
   }
-
+
   void SetLHS(WordPtr lhs);
 
   void Create1AlignFromString(const std::string &align1Str);
@@ -107,7 +107,7 @@ public:
   UINT64 ReadOtherInfoFromFile(UINT64 filePos, std::fstream &fileTPColl);
   UINT64 ReadFromFile(std::fstream &fileTP);
 
-  virtual void DebugPrint(std::ostream &out, const Vocab &vocab) const;
+  virtual void DebugPrint(std::ostream &out, const Vocab &vocab) const;
 };
 
diff --git a/OnDiskPt/TargetPhraseCollection.cpp b/OnDiskPt/TargetPhraseCollection.cpp
index f29bea9cf..c865c2df7 100644
--- a/OnDiskPt/TargetPhraseCollection.cpp
+++ b/OnDiskPt/TargetPhraseCollection.cpp
@@ -82,7 +82,7 @@ void TargetPhraseCollection::Save(OnDiskWrapper &onDiskWrapper)
   CollType::iterator iter;
   for (iter = m_coll.begin(); iter != m_coll.end(); ++iter) {
     // save phrase
-    TargetPhrase &targetPhrase = **iter;
+    TargetPhrase &targetPhrase = **iter;
     targetPhrase.Save(onDiskWrapper);
 
     // save coll
@@ -150,9 +150,9 @@ void TargetPhraseCollection::ReadFromFile(size_t tableLimit, UINT64 filePos, OnD
 {
   fstream &fileTPColl = onDiskWrapper.GetFileTargetColl();
   fstream &fileTP = onDiskWrapper.GetFileTargetInd();
-
+
   size_t numScores = onDiskWrapper.GetNumScores();
-
+
 
   UINT64 numPhrases;
 
@@ -164,9 +164,9 @@ void TargetPhraseCollection::ReadFromFile(size_t tableLimit, UINT64 filePos, OnD
   numPhrases = std::min(numPhrases, (UINT64) tableLimit);
 
   currFilePos += sizeof(UINT64);
-
+
   for (size_t ind = 0; ind < numPhrases; ++ind) {
-    TargetPhrase *tp = new TargetPhrase(numScores);
+    TargetPhrase *tp = new TargetPhrase(numScores);
 
     UINT64 sizeOtherInfo = tp->ReadOtherInfoFromFile(currFilePos, fileTPColl);
     tp->ReadFromFile(fileTP);
@@ -197,7 +197,7 @@ const TargetPhrase &TargetPhraseCollection::GetTargetPhrase(size_t ind) const
   assert(ind < GetSize());
   return *m_coll[ind];
 }
-
+
 }
 
diff --git a/OnDiskPt/TargetPhraseCollection.h b/OnDiskPt/TargetPhraseCollection.h
index e0d5d1599..d6086850f 100644
--- a/OnDiskPt/TargetPhraseCollection.h
+++ b/OnDiskPt/TargetPhraseCollection.h
@@ -64,9 +64,9 @@ public:
   size_t GetSize() const {
     return m_coll.size();
   }
-
+
   const TargetPhrase &GetTargetPhrase(size_t ind) const;
-
+
   UINT64 GetFilePos() const;
 
   Moses::TargetPhraseCollection *ConvertToMoses(const std::vector<Moses::FactorType> &inputFactors
diff --git a/OnDiskPt/Vocab.cpp b/OnDiskPt/Vocab.cpp
index 5de620b75..03af2d886 100644
--- a/OnDiskPt/Vocab.cpp
+++ b/OnDiskPt/Vocab.cpp
@@ -44,7 +44,7 @@ bool Vocab::Load(OnDiskWrapper &onDiskWrapper)
   // assume contiguous vocab id
   m_lookup.resize(m_vocabColl.size() + 1);
   m_nextId = m_lookup.size();
-
+
   CollType::const_iterator iter;
   for (iter = m_vocabColl.begin(); iter != m_vocabColl.end(); ++iter) {
     UINT32 vocabId = iter->second;
diff --git a/OnDiskPt/Word.cpp b/OnDiskPt/Word.cpp
index 13c77f739..1664571c5 100644
--- a/OnDiskPt/Word.cpp
+++ b/OnDiskPt/Word.cpp
@@ -97,13 +97,14 @@ size_t Word::ReadFromFile(std::fstream &file)
 }
 
 void Word::ConvertToMoses(
-  const std::vector<Moses::FactorType> &outputFactorsVec,
-  const Vocab &vocab,
-  Moses::Word &overwrite) const {
+  const std::vector<Moses::FactorType> &outputFactorsVec,
+  const Vocab &vocab,
+  Moses::Word &overwrite) const
+{
   Moses::FactorCollection &factorColl = Moses::FactorCollection::Instance();
   overwrite = Moses::Word(m_isNonTerminal);
 
-  // TODO: this conversion should have been done at load time.
+  // TODO: this conversion should have been done at load time.
   util::TokenIter<util::SingleCharacter> tok(vocab.GetString(m_vocabId), '|');
   for (std::vector<Moses::FactorType>::const_iterator t = outputFactorsVec.begin(); t != outputFactorsVec.end(); ++t, ++tok) {
@@ -144,14 +145,14 @@ bool Word::operator==(const Word &compare) const
 
 void Word::DebugPrint(ostream &out, const Vocab &vocab) const
 {
-  const string &str = vocab.GetString(m_vocabId);
+  const string &str = vocab.GetString(m_vocabId);
   out << str;
 }
 
 std::ostream& operator<<(std::ostream &out, const Word &word)
 {
   out << "(";
-  out << word.m_vocabId;
+  out << word.m_vocabId;
   out << (word.m_isNonTerminal ?
"n" : "t"); out << ")"; diff --git a/OnDiskPt/Word.h b/OnDiskPt/Word.h index 64be6f148..254959737 100644 --- a/OnDiskPt/Word.h +++ b/OnDiskPt/Word.h @@ -50,8 +50,8 @@ public: {} explicit Word(bool isNonTerminal) - :m_isNonTerminal(isNonTerminal) - ,m_vocabId(0) + :m_isNonTerminal(isNonTerminal) + ,m_vocabId(0) {} Word(const Word ©); @@ -77,8 +77,7 @@ public: Moses::Word &overwrite) const; void DebugPrint(std::ostream &out, const Vocab &vocab) const; - inline const std::string &GetString(const Vocab &vocab) const - { + inline const std::string &GetString(const Vocab &vocab) const { return vocab.GetString(m_vocabId); } diff --git a/OnDiskPt/queryOnDiskPt.cpp b/OnDiskPt/queryOnDiskPt.cpp index 8126a2b75..776dd8a2c 100644 --- a/OnDiskPt/queryOnDiskPt.cpp +++ b/OnDiskPt/queryOnDiskPt.cpp @@ -33,8 +33,7 @@ int main(int argc, char **argv) if(i + 1 == argc) usage(); ttable = argv[++i]; - } - else + } else usage(); } @@ -55,30 +54,27 @@ int main(int argc, char **argv) cerr << "line: " << line << endl; const PhraseNode* node = onDiskQuery.Query(tokens); - - if (node) - { // source phrase points to a bunch of rules + + if (node) { + // source phrase points to a bunch of rules const TargetPhraseCollection *coll = node->GetTargetPhraseCollection(tableLimit, onDiskWrapper); string str = coll->GetDebugStr(); cout << "Found " << coll->GetSize() << endl; - - for (size_t ind = 0; ind < coll->GetSize(); ++ind) - { + + for (size_t ind = 0; ind < coll->GetSize(); ++ind) { const TargetPhrase &targetPhrase = coll->GetTargetPhrase(ind); cerr << " "; targetPhrase.DebugPrint(cerr, onDiskWrapper.GetVocab()); cerr << endl; } - } - else - { + } else { cout << "Not found" << endl; } - + std::cout << '\n'; std::cout.flush(); } - + cerr << "Finished." << endl; } diff --git a/biconcor/Alignment.cpp b/biconcor/Alignment.cpp index e73e18840..814802531 100644 --- a/biconcor/Alignment.cpp +++ b/biconcor/Alignment.cpp @@ -5,7 +5,8 @@ #include #include -namespace { +namespace +{ const int LINE_MAX_LENGTH = 10000; @@ -84,10 +85,10 @@ void Alignment::Create(const string& fileName) } Alignment::Alignment() - : m_array(NULL), - m_sentenceEnd(NULL), - m_size(0), - m_sentenceCount(0) {} + : m_array(NULL), + m_sentenceEnd(NULL), + m_size(0), + m_sentenceCount(0) {} Alignment::~Alignment() { diff --git a/biconcor/Mismatch.cpp b/biconcor/Mismatch.cpp index 31140b200..c3afec781 100644 --- a/biconcor/Mismatch.cpp +++ b/biconcor/Mismatch.cpp @@ -23,16 +23,16 @@ enum { }; Mismatch::Mismatch( SuffixArray *sa, TargetCorpus *tc, Alignment *a, INDEX sentence_id, INDEX position, int source_length, int target_length, int source_start, int source_end ) - :m_suffixArray(sa) - ,m_targetCorpus(tc) - ,m_alignment(a) - ,m_sentence_id(sentence_id) - ,m_source_length(source_length) - ,m_target_length(target_length) - ,m_source_position(position) - ,m_source_start(source_start) - ,m_source_end(source_end) - ,m_unaligned(true) + :m_suffixArray(sa) + ,m_targetCorpus(tc) + ,m_alignment(a) + ,m_sentence_id(sentence_id) + ,m_source_length(source_length) + ,m_target_length(target_length) + ,m_source_position(position) + ,m_source_start(source_start) + ,m_source_end(source_end) + ,m_unaligned(true) { // initialize unaligned indexes for (int i = 0; i < m_source_length; i++) { @@ -42,7 +42,7 @@ Mismatch::Mismatch( SuffixArray *sa, TargetCorpus *tc, Alignment *a, INDEX sente m_target_unaligned[i] = true; } m_num_alignment_points = - m_alignment->GetNumberOfAlignmentPoints( sentence_id ); + m_alignment->GetNumberOfAlignmentPoints( sentence_id ); for(INDEX ap=0; 
apGetSourceWord( sentence_id, ap ) ] = false; m_target_unaligned[ (int)m_alignment->GetTargetWord( sentence_id, ap ) ] = false; @@ -58,234 +58,235 @@ Mismatch::~Mismatch () {} void Mismatch::PrintClippedHTML( ostream* out, int width ) { - int source_annotation[256], target_annotation[256]; - vector< string > label_class; - label_class.push_back( "" ); - label_class.push_back( "mismatch_pre_aligned" ); - label_class.push_back( "mismatch_post_aligned" ); - label_class.push_back( "null_aligned" ); - label_class.push_back( "mismatch_misaligned" ); - label_class.push_back( "mismatch_aligned" ); + int source_annotation[256], target_annotation[256]; + vector< string > label_class; + label_class.push_back( "" ); + label_class.push_back( "mismatch_pre_aligned" ); + label_class.push_back( "mismatch_post_aligned" ); + label_class.push_back( "null_aligned" ); + label_class.push_back( "mismatch_misaligned" ); + label_class.push_back( "mismatch_aligned" ); - for(int i=0; i= 0) { - int word_id = m_source_start-i; - source_annotation[ word_id ] = UNALIGNED; - if (!m_source_unaligned[ word_id ]) { - found_aligned = true; - LabelSourceMatches( source_annotation, target_annotation, word_id, PRE_ALIGNED ); - } - } + for(int i=0; i= 0) { + int word_id = m_source_start-i; + source_annotation[ word_id ] = UNALIGNED; + if (!m_source_unaligned[ word_id ]) { + found_aligned = true; + LabelSourceMatches( source_annotation, target_annotation, word_id, PRE_ALIGNED ); + } + } - // find first and last - int target_start = -1; - int target_end; - for(int i=0; iGetTargetWord( m_sentence_id, ap ) == i) { - int source_word = m_alignment->GetSourceWord( m_sentence_id, ap ); - // if not part of the source phrase -> also misaligned - if (source_word < m_source_start || source_word > m_source_end) - source_annotation[ source_word ] = MISALIGNED; - } - } - } - } - // closure - bool change = true; - while(change) { - change = false; - for(INDEX ap=0; apGetSourceWord( m_sentence_id, ap ); - int target_word = m_alignment->GetTargetWord( m_sentence_id, ap ); - if (source_annotation[source_word] != UNANNOTATED && - target_annotation[target_word] == UNANNOTATED) { - target_annotation[target_word] = MISALIGNED; - change = true; - } - if (source_annotation[source_word] == UNANNOTATED && - target_annotation[target_word] != UNANNOTATED) { - source_annotation[source_word] = MISALIGNED; - change = true; - } - } - } - } - - // print source - // shorten source context if too long + if (m_source_end+i < m_source_length) { + int word_id = m_source_end+i; + source_annotation[ word_id ] = UNALIGNED; + if (!m_source_unaligned[ word_id ]) { + found_aligned = true; + LabelSourceMatches( source_annotation, target_annotation, word_id, POST_ALIGNED ); + } + } + } + + } + // misalignment + else { + // label aligned output words + for(int i=m_source_start; i<=m_source_end; i++) + LabelSourceMatches( source_annotation, target_annotation, i, ALIGNED ); + + // find first and last + int target_start = -1; + int target_end; + for(int i=0; iGetTargetWord( m_sentence_id, ap ) == i) { + int source_word = m_alignment->GetSourceWord( m_sentence_id, ap ); + // if not part of the source phrase -> also misaligned + if (source_word < m_source_start || source_word > m_source_end) + source_annotation[ source_word ] = MISALIGNED; + } + } + } + } + // closure + bool change = true; + while(change) { + change = false; + for(INDEX ap=0; apGetSourceWord( m_sentence_id, ap ); + int target_word = m_alignment->GetTargetWord( m_sentence_id, ap ); + if 
(source_annotation[source_word] != UNANNOTATED && + target_annotation[target_word] == UNANNOTATED) { + target_annotation[target_word] = MISALIGNED; + change = true; + } + if (source_annotation[source_word] == UNANNOTATED && + target_annotation[target_word] != UNANNOTATED) { + source_annotation[source_word] = MISALIGNED; + change = true; + } + } + } + } + + // print source + // shorten source context if too long int sentence_start = m_source_position - m_source_start; - int context_space = width/2; - for(int i=m_source_start;i<=m_source_end;i++) - context_space -= m_suffixArray->GetWord( sentence_start + i ).size() + 1; - context_space /= 2; + int context_space = width/2; + for(int i=m_source_start; i<=m_source_end; i++) + context_space -= m_suffixArray->GetWord( sentence_start + i ).size() + 1; + context_space /= 2; - int remaining = context_space; - int start_word = m_source_start; - for(;start_word>0 && remaining>0; start_word--) - remaining -= m_suffixArray->GetWord( sentence_start + start_word-1 ).size() + 1; - if (remaining<0 || start_word == -1) start_word++; + int remaining = context_space; + int start_word = m_source_start; + for(; start_word>0 && remaining>0; start_word--) + remaining -= m_suffixArray->GetWord( sentence_start + start_word-1 ).size() + 1; + if (remaining<0 || start_word == -1) start_word++; - remaining = context_space; - int end_word = m_source_end; - for(;end_word0; end_word++) - remaining -= m_suffixArray->GetWord( sentence_start + end_word ).size() + 1; - end_word--; + remaining = context_space; + int end_word = m_source_end; + for(; end_word0; end_word++) + remaining -= m_suffixArray->GetWord( sentence_start + end_word ).size() + 1; + end_word--; - // output with markup - *out << ""; - char current_label = UNANNOTATED; - if (start_word>0) { - current_label = source_annotation[start_word-1]; - *out << "... "; - } - for(int i=start_word; i<=end_word; i++) { - // change to phrase block - if (i == m_source_start) { - if (current_label != UNANNOTATED && i!=start_word) - *out << ""; - *out << ""; - current_label = UNANNOTATED; - } + // output with markup + *out << ""; + char current_label = UNANNOTATED; + if (start_word>0) { + current_label = source_annotation[start_word-1]; + *out << "... 
"; + } + for(int i=start_word; i<=end_word; i++) { + // change to phrase block + if (i == m_source_start) { + if (current_label != UNANNOTATED && i!=start_word) + *out << ""; + *out << ""; + current_label = UNANNOTATED; + } - // change to labeled word - else if (source_annotation[i] != current_label && - source_annotation[i] != ALIGNED) { - if (current_label != UNANNOTATED && i!=start_word) - *out << ""; - if (source_annotation[i] != UNANNOTATED) - *out << ""; - current_label = source_annotation[i]; - } + // change to labeled word + else if (source_annotation[i] != current_label && + source_annotation[i] != ALIGNED) { + if (current_label != UNANNOTATED && i!=start_word) + *out << ""; + if (source_annotation[i] != UNANNOTATED) + *out << ""; + current_label = source_annotation[i]; + } - // output word - *out << m_suffixArray->GetWord( sentence_start + i ) << " "; + // output word + *out << m_suffixArray->GetWord( sentence_start + i ) << " "; - // change to right context block - if (i == m_source_end) { - *out << ""; - current_label = UNANNOTATED; - } - } + // change to right context block + if (i == m_source_end) { + *out << ""; + current_label = UNANNOTATED; + } + } - if (current_label != UNANNOTATED && end_word>m_source_end) - *out << ""; - if (end_wordm_source_end) + *out << ""; + if (end_wordGetWord( m_sentence_id, i ).size() + 1; - while (context_space < 0) { // shorten matched part, if too long - context_space += - m_targetCorpus->GetWord( m_sentence_id, target_start ).size() + - m_targetCorpus->GetWord( m_sentence_id, target_end ).size() + 2; - target_start++; - target_end--; - } - context_space /= 2; + context_space = width/2; + for(int i=target_start; i<=target_end; i++) + context_space -= m_targetCorpus->GetWord( m_sentence_id, i ).size() + 1; + while (context_space < 0) { // shorten matched part, if too long + context_space += + m_targetCorpus->GetWord( m_sentence_id, target_start ).size() + + m_targetCorpus->GetWord( m_sentence_id, target_end ).size() + 2; + target_start++; + target_end--; + } + context_space /= 2; - remaining = context_space; - start_word = target_start; - for(;start_word>0 && remaining>0; start_word--) { - //cerr << "remaining: " << remaining << ", start_word: " << start_word << endl; - remaining -= m_targetCorpus->GetWord( m_sentence_id, start_word-1 ).size() + 1; - } - if (remaining<0 || start_word == -1) start_word++; + remaining = context_space; + start_word = target_start; + for(; start_word>0 && remaining>0; start_word--) { + //cerr << "remaining: " << remaining << ", start_word: " << start_word << endl; + remaining -= m_targetCorpus->GetWord( m_sentence_id, start_word-1 ).size() + 1; + } + if (remaining<0 || start_word == -1) start_word++; - remaining = context_space; - end_word = target_end; - for(;end_word0; end_word++) { - //cerr << "remaining: " << remaining << ", end_word: " << end_word << endl; - remaining -= m_targetCorpus->GetWord( m_sentence_id, end_word ).size() + 1; - } - end_word--; + remaining = context_space; + end_word = target_end; + for(; end_word0; end_word++) { + //cerr << "remaining: " << remaining << ", end_word: " << end_word << endl; + remaining -= m_targetCorpus->GetWord( m_sentence_id, end_word ).size() + 1; + } + end_word--; - // output with markup - *out << ""; - current_label = UNANNOTATED; - if (start_word>0) { - current_label = target_annotation[start_word-1]; - *out << "... 
"; - } - for(int i=start_word; i<=end_word; i++) { - if (target_annotation[i] != current_label) { - if (current_label != UNANNOTATED && i!=start_word) - *out << ""; - if (target_annotation[i] != UNANNOTATED) - *out << ""; - current_label = target_annotation[i]; - } + // output with markup + *out << ""; + current_label = UNANNOTATED; + if (start_word>0) { + current_label = target_annotation[start_word-1]; + *out << "... "; + } + for(int i=start_word; i<=end_word; i++) { + if (target_annotation[i] != current_label) { + if (current_label != UNANNOTATED && i!=start_word) + *out << ""; + if (target_annotation[i] != UNANNOTATED) + *out << ""; + current_label = target_annotation[i]; + } - // output word - *out << m_targetCorpus->GetWord( m_sentence_id, i ) << " "; - } + // output word + *out << m_targetCorpus->GetWord( m_sentence_id, i ) << " "; + } - if (current_label != UNANNOTATED && end_word>target_end) - *out << ""; - if (end_word"; + if (current_label != UNANNOTATED && end_word>target_end) + *out << ""; + if (end_word"; } -void Mismatch::LabelSourceMatches(int *source_annotation, int *target_annotation, int source_id, int label ) { - for(INDEX ap=0; apGetSourceWord( m_sentence_id, ap ) == source_id) { - source_annotation[ source_id ] = label; - target_annotation[ m_alignment->GetTargetWord( m_sentence_id, ap ) ] = label; - } - } +void Mismatch::LabelSourceMatches(int *source_annotation, int *target_annotation, int source_id, int label ) +{ + for(INDEX ap=0; apGetSourceWord( m_sentence_id, ap ) == source_id) { + source_annotation[ source_id ] = label; + target_annotation[ m_alignment->GetTargetWord( m_sentence_id, ap ) ] = label; + } + } } diff --git a/biconcor/Mismatch.h b/biconcor/Mismatch.h index c0063d049..1277ed95a 100644 --- a/biconcor/Mismatch.h +++ b/biconcor/Mismatch.h @@ -34,7 +34,9 @@ public: Mismatch( SuffixArray *sa, TargetCorpus *tc, Alignment *a, INDEX sentence_id, INDEX position, int source_length, int target_length, int source_start, int source_end ); ~Mismatch(); - bool Unaligned() const { return m_unaligned; } + bool Unaligned() const { + return m_unaligned; + } void PrintClippedHTML(std::ostream* out, int width ); void LabelSourceMatches(int *source_annotation, int *target_annotation, int source_id, int label ); }; diff --git a/biconcor/PhrasePair.cpp b/biconcor/PhrasePair.cpp index 038fa3a31..b6409258b 100644 --- a/biconcor/PhrasePair.cpp +++ b/biconcor/PhrasePair.cpp @@ -37,7 +37,7 @@ void PhrasePair::Print( ostream* out ) const INDEX ap_points = m_alignment->GetNumberOfAlignmentPoints( m_sentence_id ); for( INDEX i=0; iGetSourceWord( m_sentence_id, i ) - << "-" << m_alignment->GetTargetWord( m_sentence_id, i ); + << "-" << m_alignment->GetTargetWord( m_sentence_id, i ); } *out << endl; @@ -185,27 +185,27 @@ void PhrasePair::PrintClippedHTML( ostream* out, int width ) const size_t source_pre_width = (source_width-source.size())/2; size_t source_post_width = (source_width-source.size()+1)/2; - // if phrase is too long, don't show any context + // if phrase is too long, don't show any context if (source.size() > (size_t)width) { source_pre_width = 0; source_post_width = 0; } - // too long -> truncate and add "..." + // too long -> truncate and add "..." 
if (source_pre.size() > source_pre_width) { - // first skip up to a space - while(source_pre_width>0 && - source_pre.substr(source_pre.size()-source_pre_width,1) != " ") { - source_pre_width--; - } + // first skip up to a space + while(source_pre_width>0 && + source_pre.substr(source_pre.size()-source_pre_width,1) != " ") { + source_pre_width--; + } source_pre = "..." + source_pre.substr( source_pre.size()-source_pre_width, source_pre_width ); - } + } if (source_post.size() > source_post_width) { - while(source_post_width>0 && - source_post.substr(source_post_width-1,1) != " ") { - source_post_width--; - } + while(source_post_width>0 && + source_post.substr(source_post_width-1,1) != " ") { + source_post_width--; + } source_post = source_post.substr( 0, source_post_width ) + "..."; - } + } *out << "" << source_pre @@ -220,13 +220,13 @@ void PhrasePair::PrintClippedHTML( ostream* out, int width ) const string target_pre = ""; string target = ""; string target_post = ""; - size_t target_pre_null_width = 0; - size_t target_post_null_width = 0; + size_t target_pre_null_width = 0; + size_t target_post_null_width = 0; for( char i=0; iGetWord( m_sentence_id, i); + WORD word = m_targetCorpus->GetWord( m_sentence_id, i); target_pre += " " + word; - if (i >= m_target_start-m_pre_null) - target_pre_null_width += word.size() + 1; + if (i >= m_target_start-m_pre_null) + target_pre_null_width += word.size() + 1; } for( char i=m_target_start; i<=m_target_end; i++ ) { if (i>m_target_start) target += " "; @@ -234,11 +234,11 @@ void PhrasePair::PrintClippedHTML( ostream* out, int width ) const } for( char i=m_target_end+1; im_target_end+1) target_post += " "; - WORD word = m_targetCorpus->GetWord( m_sentence_id, i); + WORD word = m_targetCorpus->GetWord( m_sentence_id, i); target_post += word; - if (i-(m_target_end+1) < m_post_null) { - target_post_null_width += word.size() + 1; - } + if (i-(m_target_end+1) < m_post_null) { + target_post_null_width += word.size() + 1; + } } size_t target_pre_width = (target_width-target.size())/2; @@ -249,46 +249,45 @@ void PhrasePair::PrintClippedHTML( ostream* out, int width ) const target_post_width = 0; } - if (target_pre.size() < target_pre_width) - target_pre_width = target_pre.size(); - else { - while(target_pre_width>0 && - target_pre.substr(target_pre.size()-target_pre_width,1) != " ") { - target_pre_width--; - } + if (target_pre.size() < target_pre_width) + target_pre_width = target_pre.size(); + else { + while(target_pre_width>0 && + target_pre.substr(target_pre.size()-target_pre_width,1) != " ") { + target_pre_width--; + } target_pre = "..." 
+ target_pre.substr( target_pre.size()-target_pre_width, target_pre_width ); - } + } - if (target_post.size() < target_post_width) { - target_post_width = target_post.size(); - } - else { - while(target_post_width>0 && - target_post.substr(target_post_width-1,1) != " ") { - target_post_width--; - } - target_post = target_post.substr( 0, target_post_width ) + "..."; - } + if (target_post.size() < target_post_width) { + target_post_width = target_post.size(); + } else { + while(target_post_width>0 && + target_post.substr(target_post_width-1,1) != " ") { + target_post_width--; + } + target_post = target_post.substr( 0, target_post_width ) + "..."; + } - if (m_pre_null) { - //cerr << endl << "target_pre_width=" << target_pre_width << ", target_pre_null_width=" << target_pre_null_width << ", target_pre.size()=" << target_pre.size() << endl; - if (target_pre_width < target_pre.size()) - target_pre_null_width -= target_pre.size()-target_pre_width; - target_pre = target_pre.substr(0,target_pre_width-target_pre_null_width) - + "" - + target_pre.substr(target_pre_width-target_pre_null_width) - + ""; - } - if (m_post_null) { - //cerr << endl << "target_post_width=" << target_post_width << ", target_post_null_width=" << target_post_null_width << ", target_post.size()=" << target_post.size() << endl; - if (target_post_null_width > target_post.size()) { - target_post_null_width = target_post.size(); - } - target_post = "" - + target_post.substr(0,target_post_null_width) - + "" - + target_post.substr(target_post_null_width); - } + if (m_pre_null) { + //cerr << endl << "target_pre_width=" << target_pre_width << ", target_pre_null_width=" << target_pre_null_width << ", target_pre.size()=" << target_pre.size() << endl; + if (target_pre_width < target_pre.size()) + target_pre_null_width -= target_pre.size()-target_pre_width; + target_pre = target_pre.substr(0,target_pre_width-target_pre_null_width) + + "" + + target_pre.substr(target_pre_width-target_pre_null_width) + + ""; + } + if (m_post_null) { + //cerr << endl << "target_post_width=" << target_post_width << ", target_post_null_width=" << target_post_null_width << ", target_post.size()=" << target_post.size() << endl; + if (target_post_null_width > target_post.size()) { + target_post_null_width = target_post.size(); + } + target_post = "" + + target_post.substr(0,target_post_null_width) + + "" + + target_post.substr(target_post_null_width); + } *out << "" << target_pre diff --git a/biconcor/PhrasePairCollection.cpp b/biconcor/PhrasePairCollection.cpp index 7497b2af8..dd21faad3 100644 --- a/biconcor/PhrasePairCollection.cpp +++ b/biconcor/PhrasePairCollection.cpp @@ -47,15 +47,15 @@ int PhrasePairCollection::GetCollection( const vector< string >& sourceString ) int sentence_length = m_suffixArray->GetSentenceLength( sentence_id ); int target_length = m_targetCorpus->GetSentenceLength( sentence_id ); //cerr << "match " << (i-first_match) - //<< " in sentence " << sentence_id - //<< ", starting at word " << source_start - //<< " of " << sentence_length - //<< ". target sentence has " << target_length << " words."; + //<< " in sentence " << sentence_id + //<< ", starting at word " << source_start + //<< " of " << sentence_length + //<< ". 
target sentence has " << target_length << " words."; int target_start, target_end, pre_null, post_null; if (m_alignment->PhraseAlignment( sentence_id, target_length, source_start, source_end, target_start, target_end, pre_null, post_null)) { //cerr << " aligned to [" << (int)target_start << "," << (int)target_end << "]"; //cerr << " +(" << (int)pre_null << "," << (int)post_null << ")"; - bool null_boundary_words = false; + bool null_boundary_words = false; for (int pre = 0; pre <= pre_null && (pre == 0 || null_boundary_words); pre++ ) { for (int post = 0; post <= post_null && (post == 0 || null_boundary_words); post++ ) { vector< WORD_ID > targetString; @@ -75,19 +75,18 @@ int PhrasePairCollection::GetCollection( const vector< string >& sourceString ) m_size++; } } + } else { + //cerr << "mismatch " << (i-first_match) + // << " in sentence " << sentence_id + // << ", starting at word " << source_start + // << " of " << sentence_length + // << ". target sentence has " << target_length << " words."; + Mismatch *mismatch = new Mismatch( m_suffixArray, m_targetCorpus, m_alignment, sentence_id, position, sentence_length, target_length, source_start, source_end ); + if (mismatch->Unaligned()) + m_unaligned.push_back( mismatch ); + else + m_mismatch.push_back( mismatch ); } - else { - //cerr << "mismatch " << (i-first_match) - // << " in sentence " << sentence_id - // << ", starting at word " << source_start - // << " of " << sentence_length - // << ". target sentence has " << target_length << " words."; - Mismatch *mismatch = new Mismatch( m_suffixArray, m_targetCorpus, m_alignment, sentence_id, position, sentence_length, target_length, source_start, source_end ); - if (mismatch->Unaligned()) - m_unaligned.push_back( mismatch ); - else - m_mismatch.push_back( mismatch ); - } //cerr << endl; if (found > (INDEX)m_max_lookup) { @@ -111,8 +110,7 @@ void PhrasePairCollection::Print(bool pretty) const for(int j=0; jsize() && jPrintPretty( &cout, 100 ); - } - else { + } else { (*p)->Print( &cout ); } if (ppWithSameTarget->size() > m_max_example) { @@ -125,33 +123,32 @@ void PhrasePairCollection::Print(bool pretty) const void PhrasePairCollection::PrintHTML() const { int pp_target = 0; - bool singleton = false; - // loop over all translations + bool singleton = false; + // loop over all translations vector< vector >::const_iterator ppWithSameTarget; for( ppWithSameTarget = m_collection.begin(); ppWithSameTarget != m_collection.end() && pp_targetsize(); - if (!singleton) { - if (count == 1) { - singleton = true; - cout << "

singleton" - << (m_collection.end() - ppWithSameTarget==1?"":"s") << " (" - << (m_collection.end() - ppWithSameTarget) - << "/" << m_size << ")

"; - } - else { - cout << "

"; - (*(ppWithSameTarget->begin()))->PrintTarget( &cout ); - cout << " (" << count << "/" << m_size << ")" << endl; - cout << "

"; - } - cout << ""; - } + int count = ppWithSameTarget->size(); + if (!singleton) { + if (count == 1) { + singleton = true; + cout << "

singleton" + << (m_collection.end() - ppWithSameTarget==1?"":"s") << " (" + << (m_collection.end() - ppWithSameTarget) + << "/" << m_size << ")

"; + } else { + cout << "

"; + (*(ppWithSameTarget->begin()))->PrintTarget( &cout ); + cout << " (" << count << "/" << m_size << ")" << endl; + cout << "

"; + } + cout << "
"; + } vector< PhrasePair* >::const_iterator p; - // loop over all sentences where translation occurs + // loop over all sentences where translation occurs int pp=0; - int i=0; + int i=0; for(p = ppWithSameTarget->begin(); i<10 && ppend(); p++, pp++, i++ ) { (*p)->PrintClippedHTML( &cout, 160 ); if (count > m_max_example) { @@ -159,54 +156,54 @@ void PhrasePairCollection::PrintHTML() const pp += count/m_max_example-1; } } - if (i == 10 && pp < count) { - // extended table - cout << "
(more)
"; - cout << "
"; - cout << ""; - for(i=0, pp=0, p = ppWithSameTarget->begin(); iend(); p++, pp++, i++ ) { - (*p)->PrintClippedHTML( &cout, 160 ); - if (count > m_max_example) { - p += count/m_max_example-1; - pp += count/m_max_example-1; - } - } - } - if (!singleton) cout << "
\n"; - - if (!singleton && pp_target == 9) { - cout << "
"; - cout << "

(more)

"; - cout << "
"; - } + if (i == 10 && pp < count) { + // extended table + cout << "(more)
"; + cout << "
"; + cout << ""; + for(i=0, pp=0, p = ppWithSameTarget->begin(); iend(); p++, pp++, i++ ) { + (*p)->PrintClippedHTML( &cout, 160 ); + if (count > m_max_example) { + p += count/m_max_example-1; + pp += count/m_max_example-1; + } + } + } + if (!singleton) cout << "
\n"; + + if (!singleton && pp_target == 9) { + cout << "
"; + cout << "

(more)

"; + cout << "
"; + } } - if (singleton) cout << "
\n"; - else if (pp_target > 9) cout << ""; + if (singleton) cout << "\n"; + else if (pp_target > 9) cout << ""; - size_t max_mismatch = m_max_example/3; - // unaligned phrases - if (m_unaligned.size() > 0) { - cout << "

unaligned" - << " (" << (m_unaligned.size()) << ")

"; - cout << ""; - int step_size = 1; - if (m_unaligned.size() > max_mismatch) - step_size = (m_unaligned.size()+max_mismatch-1) / max_mismatch; - for(size_t i=0;iPrintClippedHTML( &cout, 160 ); - cout << "
"; - } + size_t max_mismatch = m_max_example/3; + // unaligned phrases + if (m_unaligned.size() > 0) { + cout << "

unaligned" + << " (" << (m_unaligned.size()) << ")

"; + cout << ""; + int step_size = 1; + if (m_unaligned.size() > max_mismatch) + step_size = (m_unaligned.size()+max_mismatch-1) / max_mismatch; + for(size_t i=0; iPrintClippedHTML( &cout, 160 ); + cout << "
"; + } - // mismatched phrases - if (m_mismatch.size() > 0) { - cout << "

mismatched" - << " (" << (m_mismatch.size()) << ")

"; - cout << ""; - int step_size = 1; - if (m_mismatch.size() > max_mismatch) - step_size = (m_mismatch.size()+max_mismatch-1) / max_mismatch; - for(size_t i=0;iPrintClippedHTML( &cout, 160 ); - cout << "
"; - } + // mismatched phrases + if (m_mismatch.size() > 0) { + cout << "

mismatched" + << " (" << (m_mismatch.size()) << ")

"; + cout << ""; + int step_size = 1; + if (m_mismatch.size() > max_mismatch) + step_size = (m_mismatch.size()+max_mismatch-1) / max_mismatch; + for(size_t i=0; iPrintClippedHTML( &cout, 160 ); + cout << "
"; + } } diff --git a/biconcor/SuffixArray.cpp b/biconcor/SuffixArray.cpp index 15e6b47b0..f4122a2d8 100644 --- a/biconcor/SuffixArray.cpp +++ b/biconcor/SuffixArray.cpp @@ -5,7 +5,8 @@ #include #include -namespace { +namespace +{ const int LINE_MAX_LENGTH = 10000; @@ -14,15 +15,15 @@ const int LINE_MAX_LENGTH = 10000; using namespace std; SuffixArray::SuffixArray() - : m_array(NULL), - m_index(NULL), - m_buffer(NULL), - m_wordInSentence(NULL), - m_sentence(NULL), - m_sentenceLength(NULL), - m_vcb(), - m_size(0), - m_sentenceCount(0) { } + : m_array(NULL), + m_index(NULL), + m_buffer(NULL), + m_wordInSentence(NULL), + m_sentence(NULL), + m_sentenceLength(NULL), + m_vcb(), + m_size(0), + m_sentenceCount(0) { } SuffixArray::~SuffixArray() { diff --git a/biconcor/TargetCorpus.cpp b/biconcor/TargetCorpus.cpp index d331a548a..06468007f 100644 --- a/biconcor/TargetCorpus.cpp +++ b/biconcor/TargetCorpus.cpp @@ -5,7 +5,8 @@ #include #include -namespace { +namespace +{ const int LINE_MAX_LENGTH = 10000; @@ -14,11 +15,11 @@ const int LINE_MAX_LENGTH = 10000; using namespace std; TargetCorpus::TargetCorpus() - : m_array(NULL), - m_sentenceEnd(NULL), - m_vcb(), - m_size(0), - m_sentenceCount(0) {} + : m_array(NULL), + m_sentenceEnd(NULL), + m_vcb(), + m_size(0), + m_sentenceCount(0) {} TargetCorpus::~TargetCorpus() { diff --git a/biconcor/Vocabulary.cpp b/biconcor/Vocabulary.cpp index 9c35b3feb..9d52ee44e 100644 --- a/biconcor/Vocabulary.cpp +++ b/biconcor/Vocabulary.cpp @@ -2,7 +2,8 @@ #include "Vocabulary.h" #include -namespace { +namespace +{ const int MAX_LENGTH = 10000; diff --git a/biconcor/base64.cpp b/biconcor/base64.cpp index 2a863d161..8032399b5 100644 --- a/biconcor/base64.cpp +++ b/biconcor/base64.cpp @@ -1,4 +1,4 @@ -/* +/* base64.cpp and base64.h Copyright (C) 2004-2008 RenĂ© Nyffenegger @@ -28,17 +28,19 @@ #include "base64.h" #include -static const std::string base64_chars = - "ABCDEFGHIJKLMNOPQRSTUVWXYZ" - "abcdefghijklmnopqrstuvwxyz" - "0123456789+/"; +static const std::string base64_chars = + "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "abcdefghijklmnopqrstuvwxyz" + "0123456789+/"; -static inline bool is_base64(unsigned char c) { +static inline bool is_base64(unsigned char c) +{ return (isalnum(c) || (c == '+') || (c == '/')); } -std::string base64_encode(unsigned char const* bytes_to_encode, unsigned int in_len) { +std::string base64_encode(unsigned char const* bytes_to_encode, unsigned int in_len) +{ std::string ret; int i = 0; int j = 0; @@ -59,8 +61,7 @@ std::string base64_encode(unsigned char const* bytes_to_encode, unsigned int in_ } } - if (i) - { + if (i) { for(j = i; j < 3; j++) char_array_3[j] = '\0'; @@ -81,7 +82,8 @@ std::string base64_encode(unsigned char const* bytes_to_encode, unsigned int in_ } -std::string base64_decode(std::string const& encoded_string) { +std::string base64_decode(std::string const& encoded_string) +{ int in_len = encoded_string.size(); int i = 0; int j = 0; @@ -90,7 +92,8 @@ std::string base64_decode(std::string const& encoded_string) { std::string ret; while (in_len-- && ( encoded_string[in_] != '=') && is_base64(encoded_string[in_])) { - char_array_4[i++] = encoded_string[in_]; in_++; + char_array_4[i++] = encoded_string[in_]; + in_++; if (i ==4) { for (i = 0; i <4; i++) char_array_4[i] = base64_chars.find(char_array_4[i]); diff --git a/biconcor/biconcor.cpp b/biconcor/biconcor.cpp index f4e7c03fb..cb63e855d 100644 --- a/biconcor/biconcor.cpp +++ b/biconcor/biconcor.cpp @@ -150,22 +150,19 @@ int main(int argc, char* argv[]) cout << "TOTAL: " << total << 
endl;
 
   if (htmlFlag) {
     ppCollection.PrintHTML();
-  }
-  else {
-    ppCollection.Print(prettyFlag);
+  } else {
+    ppCollection.Print(prettyFlag);
   }
   cout << "-|||- BICONCOR END -|||-" << endl << flush;
 }
-  }
-  else if (queryFlag) {
+  } else if (queryFlag) {
   cerr << "query is " << query << endl;
   vector< string > queryString = alignment.Tokenize( query.c_str() );
   PhrasePairCollection ppCollection( &suffixArray, &targetCorpus, &alignment, max_translation, max_example );
   ppCollection.GetCollection( queryString );
   if (htmlFlag) {
     ppCollection.PrintHTML();
-  }
-  else {
+  } else {
     ppCollection.Print(prettyFlag);
   }
 }
diff --git a/defer/PhraseDictionaryInterpolated.cpp b/defer/PhraseDictionaryInterpolated.cpp
index 764927081..93c74d956 100644
--- a/defer/PhraseDictionaryInterpolated.cpp
+++ b/defer/PhraseDictionaryInterpolated.cpp
@@ -29,155 +29,158 @@ using namespace std;
 namespace Moses
 {
 
-  PhraseDictionaryInterpolated::PhraseDictionaryInterpolated
-  (size_t numScoreComponent,size_t numInputScores,const PhraseDictionaryFeature* feature):
-    PhraseDictionary(numScoreComponent,feature),
-    m_targetPhrases(NULL),
-    m_languageModels(NULL) {}
+PhraseDictionaryInterpolated::PhraseDictionaryInterpolated
+(size_t numScoreComponent,size_t numInputScores,const PhraseDictionaryFeature* feature):
+  PhraseDictionary(numScoreComponent,feature),
+  m_targetPhrases(NULL),
+  m_languageModels(NULL) {}
 
-  bool PhraseDictionaryInterpolated::Load(
-    const std::vector<FactorType> &input
-    , const std::vector<FactorType> &output
-    , const std::vector<std::string>& config
-    , const std::vector<float> &weightT
-    , size_t tableLimit
-    , const LMList &languageModels
-    , float weightWP) {
+bool PhraseDictionaryInterpolated::Load(
+  const std::vector<FactorType> &input
+  , const std::vector<FactorType> &output
+  , const std::vector<std::string>& config
+  , const std::vector<float> &weightT
+  , size_t tableLimit
+  , const LMList &languageModels
+  , float weightWP)
+{
 
-    m_languageModels = &languageModels;
-    m_weightT = weightT;
-    m_tableLimit = tableLimit;
-    m_weightWP = weightWP;
+  m_languageModels = &languageModels;
+  m_weightT = weightT;
+  m_tableLimit = tableLimit;
+  m_weightWP = weightWP;
 
-    //The config should be as follows:
-    //0-3: type factor factor num-components (as usual)
-    //4: combination mode (e.g. naive)
-    //5-(length-2): List of phrase-table files
-    //length-1: Weight string, in the same format as used for tmcombine
+  //The config should be as follows:
+  //0-3: type factor factor num-components (as usual)
+  //4: combination mode (e.g.
naive) + //5-(length-2): List of phrase-table files + //length-1: Weight string, in the same format as used for tmcombine - UTIL_THROW_IF(config.size() < 7, util::Exception, "Missing fields from phrase table configuration: expected at least 7"); - UTIL_THROW_IF(config[4] != "naive", util::Exception, "Unsupported combination mode: '" << config[4] << "'"); - - // Create the dictionaries - for (size_t i = 5; i < config.size()-1; ++i) { - m_dictionaries.push_back(DictionaryHandle(new PhraseDictionaryTreeAdaptor( - GetFeature()->GetNumScoreComponents(), - GetFeature()->GetNumInputScores(), - GetFeature()))); - bool ret = m_dictionaries.back()->Load( - input, - output, - config[i], - weightT, - 0, - languageModels, - weightWP); - if (!ret) return ret; - } + UTIL_THROW_IF(config.size() < 7, util::Exception, "Missing fields from phrase table configuration: expected at least 7"); + UTIL_THROW_IF(config[4] != "naive", util::Exception, "Unsupported combination mode: '" << config[4] << "'"); - //Parse the weight strings - for (util::TokenIter featureWeights(config.back(), util::SingleCharacter(';')); featureWeights; ++featureWeights) { - m_weights.push_back(vector()); - float sum = 0; - for (util::TokenIter tableWeights(*featureWeights, util::SingleCharacter(',')); tableWeights; ++tableWeights) { - const float weight = boost::lexical_cast(*tableWeights); - m_weights.back().push_back(weight); - sum += weight; - } - UTIL_THROW_IF(m_weights.back().size() != m_dictionaries.size(), util::Exception, - "Number of weights (" << m_weights.back().size() << - ") does not match number of dictionaries to combine (" << m_dictionaries.size() << ")"); - UTIL_THROW_IF(abs(sum - 1) > 0.01, util::Exception, "Weights not normalised"); - - } - - //check number of weight sets. Make sure there is a weight for every score component - //except for the last - which is assumed to be the phrase penalty. 
-  UTIL_THROW_IF(m_weights.size() != 1 && m_weights.size() != GetFeature()->GetNumScoreComponents()-1, util::Exception, "Unexpected number of weight sets");
-  //if 1 weight set, then repeat
-  if (m_weights.size() == 1) {
-    while(m_weights.size() < GetFeature()->GetNumScoreComponents()-1) {
-      m_weights.push_back(m_weights[0]);
-    }
-  }
-
-  return true;
+  // Create the dictionaries
+  for (size_t i = 5; i < config.size()-1; ++i) {
+    m_dictionaries.push_back(DictionaryHandle(new PhraseDictionaryTreeAdaptor(
+                               GetFeature()->GetNumScoreComponents(),
+                               GetFeature()->GetNumInputScores(),
+                               GetFeature())));
+    bool ret = m_dictionaries.back()->Load(
+                 input,
+                 output,
+                 config[i],
+                 weightT,
+                 0,
+                 languageModels,
+                 weightWP);
+    if (!ret) return ret;
   }
-  void PhraseDictionaryInterpolated::InitializeForInput(InputType const& source) {
-    for (size_t i = 0; i < m_dictionaries.size(); ++i) {
-      m_dictionaries[i]->InitializeForInput(source);
+  //Parse the weight strings
+  for (util::TokenIter featureWeights(config.back(), util::SingleCharacter(';')); featureWeights; ++featureWeights) {
+    m_weights.push_back(vector());
+    float sum = 0;
+    for (util::TokenIter tableWeights(*featureWeights, util::SingleCharacter(',')); tableWeights; ++tableWeights) {
+      const float weight = boost::lexical_cast(*tableWeights);
+      m_weights.back().push_back(weight);
+      sum += weight;
+    }
+    UTIL_THROW_IF(m_weights.back().size() != m_dictionaries.size(), util::Exception,
+                  "Number of weights (" << m_weights.back().size() <<
+                  ") does not match number of dictionaries to combine (" << m_dictionaries.size() << ")");
+    UTIL_THROW_IF(abs(sum - 1) > 0.01, util::Exception, "Weights not normalised");
+
+  }
+
+  //check number of weight sets. Make sure there is a weight for every score component
+  //except for the last - which is assumed to be the phrase penalty.
+  UTIL_THROW_IF(m_weights.size() != 1 && m_weights.size() != GetFeature()->GetNumScoreComponents()-1, util::Exception, "Unexpected number of weight sets");
+  //if 1 weight set, then repeat
+  if (m_weights.size() == 1) {
+    while(m_weights.size() < GetFeature()->GetNumScoreComponents()-1) {
+      m_weights.push_back(m_weights[0]);
     }
   }
-  typedef
-  boost::unordered_set PhraseSet;
-
+  return true;
+}
-  const TargetPhraseCollection*
-  PhraseDictionaryInterpolated::GetTargetPhraseCollection(const Phrase& src) const {
+void PhraseDictionaryInterpolated::InitializeForInput(InputType const& source)
+{
+  for (size_t i = 0; i < m_dictionaries.size(); ++i) {
+    m_dictionaries[i]->InitializeForInput(source);
+  }
+}
-    delete m_targetPhrases;
-    m_targetPhrases = new TargetPhraseCollection();
-    PhraseSet allPhrases;
-    vector phrasesByTable(m_dictionaries.size());
-    for (size_t i = 0; i < m_dictionaries.size(); ++i) {
-      const TargetPhraseCollection* phrases = m_dictionaries[i]->GetTargetPhraseCollection(src);
-      if (phrases) {
-        for (TargetPhraseCollection::const_iterator j = phrases->begin();
-             j != phrases->end(); ++j) {
-          allPhrases.insert(*j);
-          phrasesByTable[i].insert(*j);
+typedef
+boost::unordered_set PhraseSet;
+
+
+const TargetPhraseCollection*
+PhraseDictionaryInterpolated::GetTargetPhraseCollection(const Phrase& src) const
+{
+
+  delete m_targetPhrases;
+  m_targetPhrases = new TargetPhraseCollection();
+  PhraseSet allPhrases;
+  vector phrasesByTable(m_dictionaries.size());
+  for (size_t i = 0; i < m_dictionaries.size(); ++i) {
+    const TargetPhraseCollection* phrases = m_dictionaries[i]->GetTargetPhraseCollection(src);
+    if (phrases) {
+      for (TargetPhraseCollection::const_iterator j = phrases->begin();
+           j != phrases->end(); ++j) {
+        allPhrases.insert(*j);
+        phrasesByTable[i].insert(*j);
+      }
+    }
+  }
+  ScoreComponentCollection sparseVector;
+  for (PhraseSet::const_iterator i = allPhrases.begin(); i != allPhrases.end(); ++i) {
+    TargetPhrase* combinedPhrase = new TargetPhrase((Phrase)**i);
+    //combinedPhrase->ResetScore();
+    //cerr << *combinedPhrase << " " << combinedPhrase->GetScoreBreakdown() << endl;
+    combinedPhrase->SetSourcePhrase((*i)->GetSourcePhrase());
+    combinedPhrase->SetAlignTerm(&((*i)->GetAlignTerm()));
+    combinedPhrase->SetAlignNonTerm(&((*i)->GetAlignTerm()));
+    Scores combinedScores(GetFeature()->GetNumScoreComponents());
+    for (size_t j = 0; j < phrasesByTable.size(); ++j) {
+      PhraseSet::const_iterator tablePhrase = phrasesByTable[j].find(combinedPhrase);
+      if (tablePhrase != phrasesByTable[j].end()) {
+        Scores tableScores = (*tablePhrase)->GetScoreBreakdown()
+                             .GetScoresForProducer(GetFeature());
+        //cerr << "Scores from " << j << " table: ";
+        for (size_t k = 0; k < tableScores.size()-1; ++k) {
+          //cerr << tableScores[k] << "(" << exp(tableScores[k]) << ") ";
+          combinedScores[k] += m_weights[k][j] * exp(tableScores[k]);
+          //cerr << m_weights[k][j] * exp(tableScores[k]) << " ";
         }
+        //cerr << endl;
       }
     }
-    ScoreComponentCollection sparseVector;
-    for (PhraseSet::const_iterator i = allPhrases.begin(); i != allPhrases.end(); ++i) {
-      TargetPhrase* combinedPhrase = new TargetPhrase((Phrase)**i);
-      //combinedPhrase->ResetScore();
-      //cerr << *combinedPhrase << " " << combinedPhrase->GetScoreBreakdown() << endl;
-      combinedPhrase->SetSourcePhrase((*i)->GetSourcePhrase());
-      combinedPhrase->SetAlignTerm(&((*i)->GetAlignTerm()));
-      combinedPhrase->SetAlignNonTerm(&((*i)->GetAlignTerm()));
-      Scores combinedScores(GetFeature()->GetNumScoreComponents());
-      for (size_t j = 0; j < phrasesByTable.size(); ++j) {
-        PhraseSet::const_iterator tablePhrase = phrasesByTable[j].find(combinedPhrase);
-        if (tablePhrase != phrasesByTable[j].end()) {
-          Scores tableScores = (*tablePhrase)->GetScoreBreakdown()
-            .GetScoresForProducer(GetFeature());
-          //cerr << "Scores from " << j << " table: ";
-          for (size_t k = 0; k < tableScores.size()-1; ++k) {
-            //cerr << tableScores[k] << "(" << exp(tableScores[k]) << ") ";
-            combinedScores[k] += m_weights[k][j] * exp(tableScores[k]);
-            //cerr << m_weights[k][j] * exp(tableScores[k]) << " ";
-          }
-          //cerr << endl;
-        }
-      }
-      //map back to log space
-      //cerr << "Combined ";
-      for (size_t k = 0; k < combinedScores.size()-1; ++k) {
-        //cerr << combinedScores[k] << " ";
-        combinedScores[k] = log(combinedScores[k]);
-        //cerr << combinedScores[k] << " ";
-      }
-      //cerr << endl;
-      combinedScores.back() = 1; //assume last is penalty
-      combinedPhrase->SetScore(
-        GetFeature(),
-        combinedScores,
-        sparseVector,
-        m_weightT,
-        m_weightWP,
-        *m_languageModels);
-      //cerr << *combinedPhrase << " " << combinedPhrase->GetScoreBreakdown() << endl;
-      m_targetPhrases->Add(combinedPhrase);
+    //map back to log space
+    //cerr << "Combined ";
+    for (size_t k = 0; k < combinedScores.size()-1; ++k) {
+      //cerr << combinedScores[k] << " ";
+      combinedScores[k] = log(combinedScores[k]);
+      //cerr << combinedScores[k] << " ";
     }
-
-    m_targetPhrases->Prune(true,m_tableLimit);
-
-
-    return m_targetPhrases;
+    //cerr << endl;
+    combinedScores.back() = 1; //assume last is penalty
+    combinedPhrase->SetScore(
+      GetFeature(),
+      combinedScores,
+      sparseVector,
+      m_weightT,
+      m_weightWP,
+      *m_languageModels);
+    //cerr << *combinedPhrase << " " << combinedPhrase->GetScoreBreakdown() << endl;
+    m_targetPhrases->Add(combinedPhrase);
   }
+  m_targetPhrases->Prune(true,m_tableLimit);
+
+
+  return m_targetPhrases;
+}
+
 }
diff --git a/defer/PhraseDictionaryInterpolated.h b/defer/PhraseDictionaryInterpolated.h
index 74add1833..9bb4dcc3c 100644
--- a/defer/PhraseDictionaryInterpolated.h
+++ b/defer/PhraseDictionaryInterpolated.h
@@ -34,12 +34,14 @@ namespace Moses
  **/
 class PhraseDictionaryInterpolated : public PhraseDictionary
 {
- public:
+public:
   PhraseDictionaryInterpolated
-    (size_t numScoreComponent,size_t numInputScores,const PhraseDictionaryFeature* feature);
+  (size_t numScoreComponent,size_t numInputScores,const PhraseDictionaryFeature* feature);
-  virtual ~PhraseDictionaryInterpolated() {delete m_targetPhrases;}
+  virtual ~PhraseDictionaryInterpolated() {
+    delete m_targetPhrases;
+  }
   // initialize ...
   bool Load(const std::vector &input
@@ -58,7 +60,7 @@ class PhraseDictionaryInterpolated : public PhraseDictionary
     throw std::logic_error("PhraseDictionaryInterpolated.CreateRuleLookupManager() Not implemented");
   }
- private:
+private:
   typedef boost::shared_ptr DictionaryHandle;
   std::vector m_dictionaries;
diff --git a/defer/PhraseLengthFeatureTest.cpp b/defer/PhraseLengthFeatureTest.cpp
index 42026e805..6fb15e71e 100644
--- a/defer/PhraseLengthFeatureTest.cpp
+++ b/defer/PhraseLengthFeatureTest.cpp
@@ -31,7 +31,8 @@ BOOST_AUTO_TEST_SUITE(phrase_length_feature)
 //TODO: Factor out setup code so that it can be reused
-static Word MakeWord(string text) {
+static Word MakeWord(string text)
+{
   FactorCollection &factorCollection = FactorCollection::Instance();
   const Factor* f = factorCollection.AddFactor(Input,0,text);
   Word w;
@@ -40,7 +41,8 @@ static Word MakeWord(string text) {
 }
-BOOST_AUTO_TEST_CASE(evaluate) {
+BOOST_AUTO_TEST_CASE(evaluate)
+{
   Word w1 = MakeWord("w1");
   Word w2 = MakeWord("y2");
   Word w3 = MakeWord("x3");
@@ -78,7 +80,7 @@ BOOST_AUTO_TEST_CASE(evaluate) {
   PhraseBasedFeatureContext context1(topt1,sentence);
   PhraseBasedFeatureContext context2(topt2,sentence);
   PhraseBasedFeatureContext context3(topt3,sentence);
-
+
   PhraseLengthFeature plf;
   ScoreComponentCollection acc1,acc2,acc3;
diff --git a/defer/TargetBigramFeatureTest.cpp b/defer/TargetBigramFeatureTest.cpp
index 4b8d00800..c651c8ed9 100644
--- a/defer/TargetBigramFeatureTest.cpp
+++ b/defer/TargetBigramFeatureTest.cpp
@@ -34,12 +34,13 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 using namespace std;
 using namespace Moses;
-namespace MosesTest
+namespace MosesTest
 {
 BOOST_AUTO_TEST_SUITE(target_bigram)
-static Word MakeWord(string text) {
+static Word MakeWord(string text)
+{
   FactorCollection &factorCollection = FactorCollection::Instance();
   const Factor* f = factorCollection.AddFactor(Input,0,text);
   Word w;
@@ -47,34 +48,32 @@ static Word MakeWord(string text) {
   return w;
 }
-class VocabFileFixture {
-  public:
-    template
-    VocabFileFixture(I begin, I end)
-    {
-      char name[] = "TargetBigramXXXXXX";
-      int fd = mkstemp(name);
-      BOOST_CHECK(fd != -1);
-      BOOST_CHECK(!close(fd));
-      filename = name;
-      ofstream out(name);
-      for (I i = begin; i != end; ++i)
-      {
-        out << *i << endl;
-      }
-      out.close();
+class VocabFileFixture
+{
+public:
+  template
+  VocabFileFixture(I begin, I end) {
+    char name[] = "TargetBigramXXXXXX";
+    int fd = mkstemp(name);
+    BOOST_CHECK(fd != -1);
+    BOOST_CHECK(!close(fd));
+    filename = name;
+    ofstream out(name);
+    for (I i = begin; i != end; ++i) {
+      out << *i << endl;
     }
+    out.close();
+  }
-    ~VocabFileFixture()
-    {
-      BOOST_CHECK(!remove(filename.c_str()));
-    }
+  ~VocabFileFixture() {
+    BOOST_CHECK(!remove(filename.c_str()));
+  }
-    string filename;
+  string filename;
 };
 /*
-BOOST_AUTO_TEST_CASE(Test2)
+BOOST_AUTO_TEST_CASE(Test2)
 {
   HypothesisFixture hypos;
   cerr << hypos.empty() << ", " << *hypos.empty() << endl;
@@ -113,7 +112,7 @@ BOOST_AUTO_TEST_CASE(score_components)
                     ScoreProducer::unlimited);
 }
-BOOST_AUTO_TEST_CASE(empty_hypo)
+BOOST_AUTO_TEST_CASE(empty_hypo)
 {
   Sentence s;
   TargetBigramFeature tbf;
@@ -124,7 +123,7 @@ BOOST_AUTO_TEST_CASE(empty_hypo)
 }
 //Test of evaluate() where a vocab is specified
-BOOST_AUTO_TEST_CASE(evaluate_vocab)
+BOOST_AUTO_TEST_CASE(evaluate_vocab)
 {
   string vocab[] = {"i", "do"};
   VocabFileFixture vocabFile(vocab,vocab+2);
@@ -156,7 +155,7 @@ BOOST_AUTO_TEST_CASE(evaluate_all)
   BOOST_CHECK_EQUAL(scc.GetScoreForProducer(&tbf, "do:not"),1);
   BOOST_CHECK_EQUAL(scc.GetScoreForProducer(&tbf, "not:"),0);
   BOOST_CHECK(! currState->Compare(TargetBigramState(MakeWord("not"))));
-
+
 }
 BOOST_AUTO_TEST_CASE(evaluate_empty)
@@ -171,7 +170,7 @@ BOOST_AUTO_TEST_CASE(evaluate_empty)
   BOOST_CHECK(! currState->Compare(*prevState));
 }
-BOOST_AUTO_TEST_CASE(evaluate_eos)
+BOOST_AUTO_TEST_CASE(evaluate_eos)
 {
   HypothesisFixture hypos;
   TargetBigramFeature tbf;
diff --git a/mert/BleuScorer.cpp b/mert/BleuScorer.cpp
index 26723d36b..1d5caa394 100644
--- a/mert/BleuScorer.cpp
+++ b/mert/BleuScorer.cpp
@@ -18,7 +18,8 @@
 using namespace std;
-namespace {
+namespace
+{
 // configure regularisation
 const char KEY_REFLEN[] = "reflen";
@@ -33,8 +34,9 @@ namespace MosesTuning
 BleuScorer::BleuScorer(const string& config)
-    : StatisticsBasedScorer("BLEU", config),
-      m_ref_length_type(CLOSEST) {
+  : StatisticsBasedScorer("BLEU", config),
+    m_ref_length_type(CLOSEST)
+{
   const string reflen = getConfig(KEY_REFLEN, REFLEN_CLOSEST);
   if (reflen == REFLEN_AVERAGE) {
     m_ref_length_type = AVERAGE;
@@ -101,7 +103,8 @@ void BleuScorer::setReferenceFiles(const vector& referenceFiles)
   }
 }
-bool BleuScorer::OpenReference(const char* filename, size_t file_id) {
+bool BleuScorer::OpenReference(const char* filename, size_t file_id)
+{
   ifstream ifs(filename);
   if (!ifs) {
     cerr << "Cannot open " << filename << endl;
@@ -110,7 +113,8 @@ bool BleuScorer::OpenReference(const char* filename, size_t file_id) {
   return OpenReferenceStream(&ifs, file_id);
 }
-bool BleuScorer::OpenReferenceStream(istream* is, size_t file_id) {
+bool BleuScorer::OpenReferenceStream(istream* is, size_t file_id)
+{
   if (is == NULL) return false;
   string line;
@@ -203,25 +207,27 @@ statscore_t BleuScorer::calculateScore(const vector& comps) const
   return exp(logbleu);
 }
-int BleuScorer::CalcReferenceLength(size_t sentence_id, size_t length) {
+int BleuScorer::CalcReferenceLength(size_t sentence_id, size_t length)
+{
   switch (m_ref_length_type) {
-    case AVERAGE:
-      return m_references[sentence_id]->CalcAverage();
-      break;
-    case CLOSEST:
-      return m_references[sentence_id]->CalcClosest(length);
-      break;
-    case SHORTEST:
-      return m_references[sentence_id]->CalcShortest();
-      break;
-    default:
-      cerr << "unknown reference types." << endl;
-      exit(1);
+  case AVERAGE:
+    return m_references[sentence_id]->CalcAverage();
+    break;
+  case CLOSEST:
+    return m_references[sentence_id]->CalcClosest(length);
+    break;
+  case SHORTEST:
+    return m_references[sentence_id]->CalcShortest();
+    break;
+  default:
+    cerr << "unknown reference types." << endl;
+    exit(1);
   }
 }
 void BleuScorer::DumpCounts(ostream* os,
-                            const NgramCounts& counts) const {
+                            const NgramCounts& counts) const
+{
   for (NgramCounts::const_iterator it = counts.begin();
        it != counts.end(); ++it) {
     *os << "(";
@@ -238,7 +244,8 @@ void BleuScorer::DumpCounts(ostream* os,
 }
 float smoothedSentenceBleu
-  (const std::vector& stats, float smoothing, bool smoothBP) {
+(const std::vector& stats, float smoothing, bool smoothBP)
+{
   CHECK(stats.size() == kBleuNgramOrder * 2 + 1);
@@ -247,8 +254,8 @@ float smoothedSentenceBleu
     logbleu += log(stats[2 * j] + smoothing) - log(stats[2 * j + 1] + smoothing);
   }
   logbleu /= kBleuNgramOrder;
-  const float reflength = stats[(kBleuNgramOrder * 2)] +
-    (smoothBP ? smoothing : 0.0f);
+  const float reflength = stats[(kBleuNgramOrder * 2)] +
+                          (smoothBP ? smoothing : 0.0f);
   const float brevity = 1.0 - reflength / stats[1];
   if (brevity < 0.0) {
@@ -263,7 +270,7 @@ float sentenceLevelBackgroundBleu(const std::vector& sent, const std::vec
   std::vector stats;
   CHECK(sent.size()==bg.size());
   CHECK(sent.size()==kBleuNgramOrder*2+1);
-  for(size_t i=0;i& sent, const std::vec
   return exp(logbleu) * stats[kBleuNgramOrder*2];
 }
-float unsmoothedBleu(const std::vector& stats) {
+float unsmoothedBleu(const std::vector& stats)
+{
   CHECK(stats.size() == kBleuNgramOrder * 2 + 1);
   float logbleu = 0.0;
@@ -298,50 +306,51 @@ float unsmoothedBleu(const std::vector& stats) {
   return exp(logbleu);
 }
-vector BleuScorer::ScoreNbestList(const string& scoreFile, const string& featureFile) {
-  vector scoreFiles;
-  vector featureFiles;
-  scoreFiles.push_back(scoreFile);
-  featureFiles.push_back(featureFile);
+vector BleuScorer::ScoreNbestList(const string& scoreFile, const string& featureFile)
+{
+  vector scoreFiles;
+  vector featureFiles;
+  scoreFiles.push_back(scoreFile);
+  featureFiles.push_back(featureFile);
-  vector featureDataIters;
-  vector scoreDataIters;
-  for (size_t i = 0; i < featureFiles.size(); ++i) {
-    featureDataIters.push_back(FeatureDataIterator(featureFiles[i]));
-    scoreDataIters.push_back(ScoreDataIterator(scoreFiles[i]));
-  }
+  vector featureDataIters;
+  vector scoreDataIters;
+  for (size_t i = 0; i < featureFiles.size(); ++i) {
+    featureDataIters.push_back(FeatureDataIterator(featureFiles[i]));
+    scoreDataIters.push_back(ScoreDataIterator(scoreFiles[i]));
+  }
-  vector > hypotheses;
-  if (featureDataIters[0] == FeatureDataIterator::end()) {
-    cerr << "Error: at the end of feature data iterator" << endl;
-    exit(1);
-  }
-  for (size_t i = 0; i < featureFiles.size(); ++i) {
-    if (featureDataIters[i] == FeatureDataIterator::end()) {
-      cerr << "Error: Feature file " << i << " ended prematurely" << endl;
-      exit(1);
-    }
-    if (scoreDataIters[i] == ScoreDataIterator::end()) {
-      cerr << "Error: Score file " << i << " ended prematurely" << endl;
-      exit(1);
-    }
-    if (featureDataIters[i]->size() != scoreDataIters[i]->size()) {
-      cerr << "Error: features and scores have different size" << endl;
-      exit(1);
-    }
-    for (size_t j = 0; j < featureDataIters[i]->size(); ++j) {
-      hypotheses.push_back(pair(i,j));
-    }
-  }
+  vector > hypotheses;
+  if (featureDataIters[0] == FeatureDataIterator::end()) {
+    cerr << "Error: at the end of feature data iterator" << endl;
+    exit(1);
+  }
+  for (size_t i = 0; i < featureFiles.size(); ++i) {
+    if (featureDataIters[i] == FeatureDataIterator::end()) {
+      cerr << "Error: Feature file " << i << " ended prematurely" << endl;
+      exit(1);
+    }
+    if (scoreDataIters[i] == ScoreDataIterator::end()) {
+      cerr << "Error: Score file " << i << " ended prematurely" << endl;
+      exit(1);
+    }
+    if (featureDataIters[i]->size() != scoreDataIters[i]->size()) {
+      cerr << "Error: features and scores have different size" << endl;
+      exit(1);
+    }
+    for (size_t j = 0; j < featureDataIters[i]->size(); ++j) {
+      hypotheses.push_back(pair(i,j));
+    }
+  }
-  // score the nbest list
-  vector bleuScores;
-  for (size_t i=0; i < hypotheses.size(); ++i) {
-    pair translation = hypotheses[i];
-    float bleu = smoothedSentenceBleu(scoreDataIters[translation.first]->operator[](translation.second));
-    bleuScores.push_back(bleu);
-  }
-  return bleuScores;
+  // score the nbest list
+  vector bleuScores;
+  for (size_t i=0; i < hypotheses.size(); ++i) {
+    pair translation = hypotheses[i];
+    float bleu = smoothedSentenceBleu(scoreDataIters[translation.first]->operator[](translation.second));
+    bleuScores.push_back(bleu);
+  }
+  return bleuScores;
 }
diff --git a/mert/BleuScorer.h b/mert/BleuScorer.h
index af889b13e..248b3e1d1 100644
--- a/mert/BleuScorer.h
+++ b/mert/BleuScorer.h
@@ -38,14 +38,22 @@ public:
   virtual void setReferenceFiles(const std::vector& referenceFiles);
   virtual void prepareStats(std::size_t sid, const std::string& text, ScoreStats& entry);
   virtual statscore_t calculateScore(const std::vector& comps) const;
-  virtual std::size_t NumberOfScores() const { return 2 * kBleuNgramOrder + 1; }
+  virtual std::size_t NumberOfScores() const {
+    return 2 * kBleuNgramOrder + 1;
+  }
   int CalcReferenceLength(std::size_t sentence_id, std::size_t length);
-  ReferenceLengthType GetReferenceLengthType() const { return m_ref_length_type; }
-  void SetReferenceLengthType(ReferenceLengthType type) { m_ref_length_type = type; }
+  ReferenceLengthType GetReferenceLengthType() const {
+    return m_ref_length_type;
+  }
+  void SetReferenceLengthType(ReferenceLengthType type) {
+    m_ref_length_type = type;
+  }
-  const std::vector& GetReferences() const { return m_references.get(); }
+  const std::vector& GetReferences() const {
+    return m_references.get();
+  }
   /**
    * Count the ngrams of each type, up to the given length in the input line.
@@ -74,7 +82,7 @@ private:
  * This function is used in PRO.
  */
 float smoothedSentenceBleu
-  (const std::vector& stats, float smoothing=1.0, bool smoothBP=false);
+(const std::vector& stats, float smoothing=1.0, bool smoothBP=false);
 /** Computes sentence-level BLEU score given a background corpus.
  *  This function is used in batch MIRA.
diff --git a/mert/BleuScorerTest.cpp b/mert/BleuScorerTest.cpp
index 136f134eb..a63196a3b 100644
--- a/mert/BleuScorerTest.cpp
+++ b/mert/BleuScorerTest.cpp
@@ -10,16 +10,19 @@
 using namespace MosesTuning;
-namespace {
+namespace
+{
 NgramCounts* g_counts = NULL;
-NgramCounts* GetNgramCounts() {
+NgramCounts* GetNgramCounts()
+{
   assert(g_counts);
   return g_counts;
 }
-void SetNgramCounts(NgramCounts* counts) {
+void SetNgramCounts(NgramCounts* counts)
+{
   g_counts = counts;
 }
@@ -58,33 +61,38 @@ struct Fourgram {
   NgramCounts::Key instance;
 };
-bool CheckUnigram(const std::string& str) {
+bool CheckUnigram(const std::string& str)
+{
   Unigram unigram(str);
   NgramCounts::Value v;
   return GetNgramCounts()->Lookup(unigram.instance, &v);
 }
-bool CheckBigram(const std::string& a, const std::string& b) {
+bool CheckBigram(const std::string& a, const std::string& b)
+{
   Bigram bigram(a, b);
   NgramCounts::Value v;
   return GetNgramCounts()->Lookup(bigram.instance, &v);
 }
 bool CheckTrigram(const std::string& a, const std::string& b,
-                  const std::string& c) {
+                  const std::string& c)
+{
   Trigram trigram(a, b, c);
   NgramCounts::Value v;
   return GetNgramCounts()->Lookup(trigram.instance, &v);
 }
 bool CheckFourgram(const std::string& a, const std::string& b,
-                   const std::string& c, const std::string& d) {
+                   const std::string& c, const std::string& d)
+{
   Fourgram fourgram(a, b, c, d);
   NgramCounts::Value v;
   return GetNgramCounts()->Lookup(fourgram.instance, &v);
 }
-void SetUpReferences(BleuScorer& scorer) {
+void SetUpReferences(BleuScorer& scorer)
+{
   // The following examples are taken from Koehn, "Statistical Machine Translation",
   // Cambridge University Press, 2010.
   {
@@ -115,7 +123,8 @@ void SetUpReferences(BleuScorer& scorer) {
 } // namespace
-BOOST_AUTO_TEST_CASE(bleu_reference_type) {
+BOOST_AUTO_TEST_CASE(bleu_reference_type)
+{
   BleuScorer scorer;
   // BleuScorer will use "closest" by default.
   BOOST_CHECK_EQUAL(scorer.GetReferenceLengthType(), BleuScorer::CLOSEST);
@@ -127,7 +136,8 @@ BOOST_AUTO_TEST_CASE(bleu_reference_type) {
   BOOST_CHECK_EQUAL(scorer.GetReferenceLengthType(), BleuScorer::SHORTEST);
 }
-BOOST_AUTO_TEST_CASE(bleu_reference_type_with_config) {
+BOOST_AUTO_TEST_CASE(bleu_reference_type_with_config)
+{
   {
     BleuScorer scorer("reflen:average");
     BOOST_CHECK_EQUAL(scorer.GetReferenceLengthType(), BleuScorer::AVERAGE);
@@ -139,7 +149,8 @@ BOOST_AUTO_TEST_CASE(bleu_reference_type_with_config) {
   }
 }
-BOOST_AUTO_TEST_CASE(bleu_count_ngrams) {
+BOOST_AUTO_TEST_CASE(bleu_count_ngrams)
+{
   BleuScorer scorer;
   std::string line = "I saw a girl with a telescope .";
@@ -198,7 +209,8 @@ BOOST_AUTO_TEST_CASE(bleu_count_ngrams) {
   BOOST_CHECK(CheckFourgram("with", "a", "telescope", "."));
 }
-BOOST_AUTO_TEST_CASE(bleu_clipped_counts) {
+BOOST_AUTO_TEST_CASE(bleu_clipped_counts)
+{
   BleuScorer scorer;
   SetUpReferences(scorer);
   std::string line("israeli officials responsibility of airport safety");
@@ -220,7 +232,8 @@ BOOST_AUTO_TEST_CASE(bleu_clipped_counts) {
   BOOST_CHECK_EQUAL(entry.get(7), 3); // fourgram
 }
-BOOST_AUTO_TEST_CASE(calculate_actual_score) {
+BOOST_AUTO_TEST_CASE(calculate_actual_score)
+{
   BOOST_REQUIRE(4 == kBleuNgramOrder);
   std::vector stats(2 * kBleuNgramOrder + 1);
   BleuScorer scorer;
@@ -247,7 +260,8 @@ BOOST_AUTO_TEST_CASE(calculate_actual_score) {
   BOOST_CHECK_CLOSE(0.5115f, scorer.calculateScore(stats), 0.01);
 }
-BOOST_AUTO_TEST_CASE(sentence_level_bleu) {
+BOOST_AUTO_TEST_CASE(sentence_level_bleu)
+{
   BOOST_REQUIRE(4 == kBleuNgramOrder);
   std::vector stats(2 * kBleuNgramOrder + 1);
diff --git a/mert/CderScorer.cpp b/mert/CderScorer.cpp
index cece29034..21a80ad52 100644
--- a/mert/CderScorer.cpp
+++ b/mert/CderScorer.cpp
@@ -6,9 +6,11 @@
 using namespace std;
-namespace {
+namespace
+{
-inline int CalcDistance(int word1, int word2) {
+inline int CalcDistance(int word1, int word2)
+{
   return word1 == word2 ? 0 : 1;
 }
@@ -16,11 +18,11 @@ inline int CalcDistance(int word1, int word2) {
 namespace MosesTuning
 {
-
+
 CderScorer::CderScorer(const string& config, bool allowed_long_jumps)
-    : StatisticsBasedScorer(allowed_long_jumps ? "CDER" : "WER", config),
-      m_allowed_long_jumps(allowed_long_jumps) {}
+  : StatisticsBasedScorer(allowed_long_jumps ? "CDER" : "WER", config),
+    m_allowed_long_jumps(allowed_long_jumps) {}
 CderScorer::~CderScorer() {}
@@ -82,7 +84,8 @@ float CderScorer::calculateScore(const vector& comps) const
 }
 void CderScorer::computeCD(const sent_t& cand, const sent_t& ref,
-                           vector& stats) const {
+                           vector& stats) const
+{
   int I = cand.size() + 1; // Number of inter-words positions in candidate sentence
   int L = ref.size() + 1; // Number of inter-words positions in reference sentence
@@ -95,11 +98,9 @@ void CderScorer::computeCD(const sent_t& cand, const sent_t& ref,
   for (int i = 1; i < I; ++i) (*row)[i] = 1;
   // Calculating costs for next row using costs from the previous row.
-  while (++l < L)
-  {
+  while (++l < L) {
     vector* nextRow = new vector(I);
-    for (int i = 0; i < I; ++i)
-    {
+    for (int i = 0; i < I; ++i) {
       vector possibleCosts;
       if (i > 0) {
         possibleCosts.push_back((*nextRow)[i-1] + 1); // Deletion
diff --git a/mert/CderScorer.h b/mert/CderScorer.h
index 60b6ad125..bd43ec0d8 100644
--- a/mert/CderScorer.h
+++ b/mert/CderScorer.h
@@ -8,13 +8,14 @@
 namespace MosesTuning
 {
-
+
 /**
  * CderScorer class can compute both CDER and WER metric.
  */
-class CderScorer: public StatisticsBasedScorer {
- public:
+class CderScorer: public StatisticsBasedScorer
+{
+public:
   explicit CderScorer(const std::string& config, bool allowed_long_jumps = true);
   ~CderScorer();
@@ -24,11 +25,13 @@ class CderScorer: public StatisticsBasedScorer {
   virtual void prepareStatsVector(std::size_t sid, const std::string& text, std::vector& stats);
-  virtual std::size_t NumberOfScores() const { return 2; }
+  virtual std::size_t NumberOfScores() const {
+    return 2;
+  }
   virtual float calculateScore(const std::vector& comps) const;
- private:
+private:
   bool m_allowed_long_jumps;
   typedef std::vector sent_t;
diff --git a/mert/Data.cpp b/mert/Data.cpp
index 1efa080a2..613ce419b 100644
--- a/mert/Data.cpp
+++ b/mert/Data.cpp
@@ -27,11 +27,11 @@ namespace MosesTuning
 Data::Data(Scorer* scorer, const string& sparse_weights_file)
-    : m_scorer(scorer),
-      m_score_type(m_scorer->getName()),
-      m_num_scores(0),
-      m_score_data(new ScoreData(m_scorer)),
-      m_feature_data(new FeatureData)
+  : m_scorer(scorer),
+    m_score_type(m_scorer->getName()),
+    m_num_scores(0),
+    m_score_data(new ScoreData(m_scorer)),
+    m_feature_data(new FeatureData)
 {
   TRACE_ERR("Data::m_score_type " << m_score_type << endl);
   TRACE_ERR("Data::Scorer type from Scorer: " << m_scorer->getName() << endl);
@@ -48,7 +48,8 @@ Data::Data(Scorer* scorer, const string& sparse_weights_file)
 //ADDED BY TS
 // TODO: This is too long; consider creating additional functions to
 // reduce the lines of this function.
-void Data::removeDuplicates() {
+void Data::removeDuplicates()
+{
   size_t nSentences = m_feature_data->size();
   assert(m_score_data->size() == nSentences);
@@ -128,7 +129,8 @@ void Data::removeDuplicates() {
 }
 //END_ADDED
-void Data::load(const std::string &featfile, const std::string &scorefile) {
+void Data::load(const std::string &featfile, const std::string &scorefile)
+{
   m_feature_data->load(featfile, m_sparse_weights);
   m_score_data->load(scorefile);
 }
@@ -192,7 +194,8 @@ void Data::loadNBest(const string &file)
   }
 }
-void Data::save(const std::string &featfile, const std::string &scorefile, bool bin) {
+void Data::save(const std::string &featfile, const std::string &scorefile, bool bin)
+{
   if (bin)
     cerr << "Binary write mode is selected" << endl;
   else
@@ -202,7 +205,8 @@ void Data::save(const std::string &featfile, const std::string &scorefile, bool
   m_score_data->save(scorefile, bin);
 }
-void Data::InitFeatureMap(const string& str) {
+void Data::InitFeatureMap(const string& str)
+{
   string buf = str;
   string substr;
   string features = "";
@@ -231,7 +235,8 @@ void Data::InitFeatureMap(const string& str) {
 }
 void Data::AddFeatures(const string& str,
-                       int sentence_index) {
+                       int sentence_index)
+{
   string buf = str;
   string substr;
   FeatureStats feature_entry;
diff --git a/mert/Data.h b/mert/Data.h
index e17ac0239..cd090bad3 100644
--- a/mert/Data.h
+++ b/mert/Data.h
@@ -44,18 +44,28 @@ public:
     m_feature_data->clear();
   }
-  ScoreDataHandle getScoreData() { return m_score_data; }
+  ScoreDataHandle getScoreData() {
+    return m_score_data;
+  }
-  FeatureDataHandle getFeatureData() { return m_feature_data; }
+  FeatureDataHandle getFeatureData() {
+    return m_feature_data;
+  }
-  Scorer* getScorer() { return m_scorer; }
+  Scorer* getScorer() {
+    return m_scorer;
+  }
   std::size_t NumberOfFeatures() const {
     return m_feature_data->NumberOfFeatures();
   }
-  std::string Features() const { return m_feature_data->Features(); }
-  void Features(const std::string &f) { m_feature_data->Features(f); }
+  std::string Features() const {
+    return m_feature_data->Features();
+  }
+  void Features(const std::string &f) {
+    m_feature_data->Features(f);
+  }
   void loadNBest(const std::string &file);
diff --git a/mert/DataTest.cpp b/mert/DataTest.cpp
index 189d8ccda..911171e0b 100644
--- a/mert/DataTest.cpp
+++ b/mert/DataTest.cpp
@@ -10,7 +10,8 @@ using namespace MosesTuning;
 //very basic test of sharding
-BOOST_AUTO_TEST_CASE(shard_basic) {
+BOOST_AUTO_TEST_CASE(shard_basic)
+{
   boost::scoped_ptr scorer(ScorerFactory::getScorer("BLEU", ""));
   Data data(scorer.get());
   FeatureArray fa1, fa2, fa3, fa4;
@@ -39,7 +40,8 @@ BOOST_AUTO_TEST_CASE(shard_basic) {
   BOOST_CHECK_EQUAL(shards[1].getFeatureData()->size(),(std::size_t)2);
 }
-BOOST_AUTO_TEST_CASE(init_feature_map_test) {
+BOOST_AUTO_TEST_CASE(init_feature_map_test)
+{
   boost::scoped_ptr scorer(ScorerFactory::getScorer("BLEU", ""));
   Data data(scorer.get());
@@ -49,7 +51,8 @@ BOOST_AUTO_TEST_CASE(init_feature_map_test) {
   BOOST_CHECK_EQUAL(expected, data.Features());
 }
-BOOST_AUTO_TEST_CASE(add_features_test) {
+BOOST_AUTO_TEST_CASE(add_features_test)
+{
   boost::scoped_ptr scorer(ScorerFactory::getScorer("BLEU", ""));
   Data data(scorer.get());
diff --git a/mert/Fdstream.h b/mert/Fdstream.h
index 6dbdb40a6..c59052f02 100644
--- a/mert/Fdstream.h
+++ b/mert/Fdstream.h
@@ -13,27 +13,27 @@
 #define BUFFER_SIZE (32768)
-namespace MosesTuning
+namespace MosesTuning
 {
 class _fdstream
 {
 protected:
   _fdstream() :
-    _file_descriptor(-1), _filebuf(NULL)
+    _file_descriptor(-1), _filebuf(NULL)
   { }
   _fdstream(int file_descriptor, std::ios_base::openmode openmode) :
-    _file_descriptor(file_descriptor), _openmode(openmode)
-  {
+    _file_descriptor(file_descriptor), _openmode(openmode) {
     _filebuf = NULL;
     open(file_descriptor, openmode);
   }
-  std::ios_base::openmode openmode() const { return _openmode; }
+  std::ios_base::openmode openmode() const {
+    return _openmode;
+  }
-  void open(int file_descriptor, std::ios_base::openmode openmode)
-  {
+  void open(int file_descriptor, std::ios_base::openmode openmode) {
     if (!_filebuf)
       // We create a C++ stream from a file descriptor
      // stdio_filebuf is not synced with stdio.
@@ -41,11 +41,10 @@ protected:
       // You can also create the filebuf from a FILE* with
       // FILE* f = fdopen(file_descriptor, mode);
       _filebuf = new __gnu_cxx::stdio_filebuf (file_descriptor,
-          openmode);
+        openmode);
   }
-  virtual ~_fdstream()
-  {
+  virtual ~_fdstream() {
     close(_file_descriptor);
     delete _filebuf;
     _filebuf = NULL;
@@ -60,59 +59,51 @@ class ifdstream : public _fdstream
 {
 public:
   ifdstream() :
-    _fdstream(), _stream(NULL)
+    _fdstream(), _stream(NULL)
   { }
   ifdstream(int file_descriptor) :
-    _fdstream(file_descriptor, std::ios_base::in)
-  {
+    _fdstream(file_descriptor, std::ios_base::in) {
     _stream = new std::istream(_filebuf);
   }
-  void open(int file_descriptor)
-  {
-    if (!_stream)
-    {
-      _fdstream::open(file_descriptor, std::ios_base::in);
-      _stream = new std::istream(_filebuf);
-    }
+  void open(int file_descriptor) {
+    if (!_stream) {
+      _fdstream::open(file_descriptor, std::ios_base::in);
+      _stream = new std::istream(_filebuf);
+    }
   }
-  ifdstream& operator>> (std::string& str)
-  {
+  ifdstream& operator>> (std::string& str) {
     (*_stream) >> str;
     return *this;
   }
-  std::size_t getline(std::string& str)
-  {
+  std::size_t getline(std::string& str) {
     char tmp[BUFFER_SIZE];
    std::size_t ret = getline(tmp, BUFFER_SIZE);
     str = tmp;
     return ret;
   }
-  std::size_t getline(char* s, std::streamsize n)
-  {
+  std::size_t getline(char* s, std::streamsize n) {
     return (getline(s, n, '\n'));
   }
-  std::size_t getline(char* s, std::streamsize n, char delim)
-  {
+  std::size_t getline(char* s, std::streamsize n, char delim) {
     int i = 0;
-    do{
+    do {
       s[i] = _stream->get();
       i++;
-    }while(i < n-1 && s[i-1] != delim && s[i-1] != '\0');
+    } while(i < n-1 && s[i-1] != delim && s[i-1] != '\0');
     s[i-1] = '\0'; // overwrite the delimiter given with string end
     return i-1;
   }
-  ~ifdstream()
-  {
+  ~ifdstream() {
     //this->~_fdstream();
     delete _stream;
   }
@@ -125,27 +116,23 @@ class ofdstream : public _fdstream
 {
 public:
   ofdstream() :
-    _fdstream(), _stream(NULL)
+    _fdstream(), _stream(NULL)
   { }
   ofdstream(int file_descriptor) :
-    _fdstream(file_descriptor, std::ios_base::out)
-  {
+    _fdstream(file_descriptor, std::ios_base::out) {
     _stream = new std::ostream(_filebuf);
   }
-  void open(int file_descriptor)
-  {
-    if (!_stream)
-    {
+  void open(int file_descriptor) {
+    if (!_stream) {
       _fdstream::open(file_descriptor, std::ios_base::out);
       _stream = new std::ostream(_filebuf);
     }
   }
-  ofdstream& operator<< (const std::string& str)
-  {
+  ofdstream& operator<< (const std::string& str) {
     if (_stream->good())
       (*_stream) << str;
@@ -153,8 +140,7 @@ public:
     return *this;
   }
-  ~ofdstream()
-  {
+  ~ofdstream() {
     //this->~_fdstream();
     delete _stream;
   }
diff --git a/mert/FeatureArray.cpp b/mert/FeatureArray.cpp
index fd5fea200..d49b53b96 100644
--- a/mert/FeatureArray.cpp
+++ b/mert/FeatureArray.cpp
@@ -19,14 +19,14 @@ namespace MosesTuning
 FeatureArray::FeatureArray()
-    : m_index(0), m_num_features(0){}
+  : m_index(0), m_num_features(0) {}
 FeatureArray::~FeatureArray() {}
 void FeatureArray::savetxt(ostream* os)
 {
   *os << FEATURES_TXT_BEGIN << " " << m_index << " " << m_array.size()
-      << " " << m_num_features << " " << m_features << endl;
+      << " " << m_num_features << " " << m_features << endl;
   for (featarray_t::iterator i = m_array.begin(); i != m_array.end(); ++i) {
     i->savetxt(os);
     *os << endl;
@@ -37,7 +37,7 @@ void FeatureArray::savetxt(ostream* os)
 void FeatureArray::savebin(ostream* os)
 {
   *os << FEATURES_BIN_BEGIN << " " << m_index << " " << m_array.size()
-      << " " << m_num_features << " " << m_features << endl;
+      << " " << m_num_features << " " << m_features << endl;
   for (featarray_t::iterator i = m_array.begin(); i != m_array.end(); ++i)
     i->savebin(os);
diff --git a/mert/FeatureArray.h b/mert/FeatureArray.h
index 03fe6b40c..f5fc489a2 100644
--- a/mert/FeatureArray.h
+++ b/mert/FeatureArray.h
@@ -36,16 +36,28 @@ public:
   FeatureArray();
   ~FeatureArray();
-  void clear() { m_array.clear(); }
+  void clear() {
+    m_array.clear();
+  }
-  int getIndex() const { return m_index; }
-  void setIndex(const int value) { m_index = value; }
+  int getIndex() const {
+    return m_index;
+  }
+  void setIndex(const int value) {
+    m_index = value;
+  }
-  FeatureStats& get(std::size_t i) { return m_array.at(i); }
-  const FeatureStats& get(std::size_t i) const { return m_array.at(i); }
+  FeatureStats& get(std::size_t i) {
+    return m_array.at(i);
+  }
+  const FeatureStats& get(std::size_t i) const {
+    return m_array.at(i);
+  }
-  void add(FeatureStats& e) { m_array.push_back(e); }
+  void add(FeatureStats& e) {
+    m_array.push_back(e);
+  }
   //ADDED BY TS
   void swap(std::size_t i, std::size_t j) {
@@ -59,13 +71,23 @@ public:
   void merge(FeatureArray& e);
-  std::size_t size() const { return m_array.size(); }
+  std::size_t size() const {
+    return m_array.size();
+  }
-  std::size_t NumberOfFeatures() const { return m_num_features; }
-  void NumberOfFeatures(std::size_t v) { m_num_features = v; }
+  std::size_t NumberOfFeatures() const {
+    return m_num_features;
+  }
+  void NumberOfFeatures(std::size_t v) {
+    m_num_features = v;
+  }
-  std::string Features() const { return m_features; }
-  void Features(const std::string& f) { m_features = f; }
+  std::string Features() const {
+    return m_features;
+  }
+  void Features(const std::string& f) {
+    m_features = f;
+  }
   void savetxt(std::ostream* os);
   void savebin(std::ostream* os);
diff --git a/mert/FeatureData.cpp b/mert/FeatureData.cpp
index 75888ef6d..13b9b3a96 100644
--- a/mert/FeatureData.cpp
+++ b/mert/FeatureData.cpp
@@ -20,7 +20,7 @@ namespace MosesTuning
 FeatureData::FeatureData()
-    : m_num_features(0) {}
+  : m_num_features(0) {}
 void FeatureData::save(ostream* os, bool bin)
 {
@@ -38,7 +38,8 @@ void FeatureData::save(const string &file, bool bin)
   ofs.close();
 }
-void FeatureData::save(bool bin) {
+void FeatureData::save(bool bin)
+{
   save(&cout, bin);
 }
@@ -145,7 +146,8 @@ void FeatureData::setFeatureMap(const string& feat)
   }
 }
-string FeatureData::ToString() const {
+string FeatureData::ToString() const
+{
   string res;
   {
diff --git a/mert/FeatureData.h b/mert/FeatureData.h
index 79e52b330..2510b3aee 100644
--- a/mert/FeatureData.h
+++ b/mert/FeatureData.h
@@ -33,7 +33,9 @@ public:
   FeatureData();
   ~FeatureData() {}
-  void clear() { m_array.clear(); }
+  void clear() {
+    m_array.clear();
+  }
   FeatureArray& get(size_t idx) {
     return m_array.at(idx);
@@ -61,13 +63,23 @@ public:
   void add(FeatureArray& e);
   void add(FeatureStats& e, int sent_idx);
-  std::size_t size() const { return m_array.size(); }
+  std::size_t size() const {
+    return m_array.size();
+  }
-  std::size_t NumberOfFeatures() const { return m_num_features; }
-  void NumberOfFeatures(std::size_t v) { m_num_features = v; }
+  std::size_t NumberOfFeatures() const {
+    return m_num_features;
+  }
+  void NumberOfFeatures(std::size_t v) {
+    m_num_features = v;
+  }
-  std::string Features() const { return m_features; }
-  void Features(const std::string& f) { m_features = f; }
+  std::string Features() const {
+    return m_features;
+  }
+  void Features(const std::string& f) {
+    m_features = f;
+  }
   void save(const std::string &file, bool bin=false);
   void save(std::ostream* os, bool bin=false);
diff --git a/mert/FeatureDataIterator.cpp b/mert/FeatureDataIterator.cpp
index 471da07ee..9deb0ac50 100644
--- a/mert/FeatureDataIterator.cpp
+++ b/mert/FeatureDataIterator.cpp
@@ -32,9 +32,10 @@ using namespace util;
 namespace MosesTuning
 {
-
+
-int ParseInt(const StringPiece& str ) {
+
+int ParseInt(const StringPiece& str )
+{
   char* errIndex;
   //could wrap?
   int value = static_cast(strtol(str.data(), &errIndex,10));
@@ -44,7 +45,8 @@ int ParseInt(const StringPiece& str ) {
   return value;
 }
-float ParseFloat(const StringPiece& str) {
+float ParseFloat(const StringPiece& str)
+{
   char* errIndex;
   float value = static_cast(strtod(str.data(), &errIndex));
   if (errIndex == str.data()) {
@@ -53,11 +55,13 @@ float ParseFloat(const StringPiece& str) {
   return value;
 }
-bool operator==(FeatureDataItem const& item1, FeatureDataItem const& item2) {
+bool operator==(FeatureDataItem const& item1, FeatureDataItem const& item2)
+{
   return item1.dense==item1.dense && item1.sparse==item1.sparse;
 }
-size_t hash_value(FeatureDataItem const& item) {
+size_t hash_value(FeatureDataItem const& item)
+{
   size_t seed = 0;
   boost::hash_combine(seed,item.dense);
   boost::hash_combine(seed,item.sparse);
@@ -67,14 +71,16 @@ size_t hash_value(FeatureDataItem const& item) {
 FeatureDataIterator::FeatureDataIterator() {}
-FeatureDataIterator::FeatureDataIterator(const string& filename) {
+FeatureDataIterator::FeatureDataIterator(const string& filename)
+{
   m_in.reset(new FilePiece(filename.c_str()));
   readNext();
 }
 FeatureDataIterator::~FeatureDataIterator() {}
-void FeatureDataIterator::readNext() {
+void FeatureDataIterator::readNext()
+{
   m_next.clear();
   try {
     StringPiece marker = m_in->ReadDelimited();
@@ -101,7 +107,7 @@ void FeatureDataIterator::readNext() {
           //sparse feature
           StringPiece second = *value;
           float floatValue = ParseFloat(second);
-          m_next.back().sparse.set(first.as_string(),floatValue);
+          m_next.back().sparse.set(first.as_string(),floatValue);
         }
       }
       if (length != m_next.back().dense.size()) {
@@ -117,11 +123,13 @@ void FeatureDataIterator::readNext() {
   }
 }
-void FeatureDataIterator::increment() {
+void FeatureDataIterator::increment()
+{
   readNext();
 }
-bool FeatureDataIterator::equal(const FeatureDataIterator& rhs) const {
+bool FeatureDataIterator::equal(const FeatureDataIterator& rhs) const
+{
   if (!m_in && !rhs.m_in) {
     return true;
   } else if (!m_in) {
@@ -129,12 +137,13 @@ bool FeatureDataIterator::equal(const FeatureDataIterator& rhs) const {
   } else if (!rhs.m_in) {
     return false;
   } else {
-    return m_in->FileName() == rhs.m_in->FileName() &&
-      m_in->Offset() == rhs.m_in->Offset();
+    return m_in->FileName() == rhs.m_in->FileName() &&
+           m_in->Offset() == rhs.m_in->Offset();
   }
 }
-const vector& FeatureDataIterator::dereference() const {
+const vector& FeatureDataIterator::dereference() const
+{
   return m_next;
 }
diff --git a/mert/FeatureDataIterator.h b/mert/FeatureDataIterator.h
index 8bbb8d497..15a654182 100644
--- a/mert/FeatureDataIterator.h
+++ b/mert/FeatureDataIterator.h
@@ -37,18 +37,21 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 #include "FeatureStats.h"
-namespace util { class FilePiece; }
+namespace util
+{
+class FilePiece;
+}
 namespace MosesTuning
 {
-
+
-class FileFormatException : public util::Exception
+class FileFormatException : public util::Exception
 {
- public:
-  explicit FileFormatException(const std::string& filename, const std::string& line) {
-    *this << "Error in line \"" << line << "\" of " << filename;
-  }
+public:
+  explicit FileFormatException(const std::string& filename, const std::string& line) {
+    *this << "Error in line \"" << line << "\" of " << filename;
+  }
 };
@@ -56,45 +59,45 @@ class FileFormatException : public util::Exception
 int ParseInt(const StringPiece& str );
 /** Assumes a delimiter, so only apply to tokens */
-float ParseFloat(const StringPiece& str);
+float ParseFloat(const StringPiece& str);
-class FeatureDataItem
+class FeatureDataItem
 {
-  public:
-    std::vector dense;
-    SparseVector sparse;
+public:
+  std::vector dense;
+  SparseVector sparse;
 };
 bool operator==(FeatureDataItem const& item1, FeatureDataItem const& item2);
 std::size_t hash_value(FeatureDataItem const& item);
-class FeatureDataIterator :
+class FeatureDataIterator :
   public boost::iterator_facade,
-                              boost::forward_traversal_tag>
+  const std::vector,
+  boost::forward_traversal_tag>
 {
-  public:
-    FeatureDataIterator();
-    explicit FeatureDataIterator(const std::string& filename);
-    ~FeatureDataIterator();
+public:
+  FeatureDataIterator();
+  explicit FeatureDataIterator(const std::string& filename);
+  ~FeatureDataIterator();
-    static FeatureDataIterator end() {
-      return FeatureDataIterator();
-    }
+  static FeatureDataIterator end() {
+    return FeatureDataIterator();
+  }
-  private:
-    friend class boost::iterator_core_access;
+private:
+  friend class boost::iterator_core_access;
-    void increment();
-    bool equal(const FeatureDataIterator& rhs) const;
-    const std::vector& dereference() const;
+  void increment();
+  bool equal(const FeatureDataIterator& rhs) const;
+  const std::vector& dereference() const;
-    void readNext();
+  void readNext();
-    boost::shared_ptr m_in;
-    std::vector m_next;
+  boost::shared_ptr m_in;
+  std::vector m_next;
 };
 }
diff --git a/mert/FeatureDataTest.cpp b/mert/FeatureDataTest.cpp
index 0f3d6a596..916203592 100644
--- a/mert/FeatureDataTest.cpp
+++ b/mert/FeatureDataTest.cpp
@@ -7,10 +7,12 @@
 using namespace MosesTuning;
-namespace {
+namespace
+{
 void CheckFeatureMap(const FeatureData* feature_data,
-                     const char* str, int num_feature, int* cnt) {
+                     const char* str, int num_feature, int* cnt)
+{
   for (int i = 0; i < num_feature; ++i) {
     std::stringstream ss;
     ss << str << "_" << i;
@@ -23,7 +25,8 @@ void CheckFeatureMap(const FeatureData* feature_data,
 } // namespace
-BOOST_AUTO_TEST_CASE(set_feature_map) {
+BOOST_AUTO_TEST_CASE(set_feature_map)
+{
   std::string str("d_0 d_1 d_2 d_3 d_4 d_5 d_6 lm_0 lm_1 tm_0 tm_1 tm_2 tm_3 tm_4 w_0 ");
   FeatureData feature_data;
diff --git a/mert/FeatureStats.cpp b/mert/FeatureStats.cpp
index 242d3fbd0..aa32e1fef 100644
--- a/mert/FeatureStats.cpp
+++ b/mert/FeatureStats.cpp
@@ -18,31 +18,35 @@
 using namespace std;
-namespace {
+namespace
+{
 const int kAvailableSize = 8;
 } // namespace
 namespace MosesTuning
 {
-
+
 SparseVector::name2id_t SparseVector::m_name_to_id;
 SparseVector::id2name_t SparseVector::m_id_to_name;
-FeatureStatsType SparseVector::get(const string& name) const {
+FeatureStatsType SparseVector::get(const string& name) const
+{
   name2id_t::const_iterator name2id_iter = m_name_to_id.find(name);
   if (name2id_iter == m_name_to_id.end()) return 0;
   size_t id = name2id_iter->second;
   return get(id);
 }
-FeatureStatsType SparseVector::get(size_t id) const {
+FeatureStatsType SparseVector::get(size_t id) const
+{
   fvector_t::const_iterator fvector_iter = m_fvector.find(id);
   if (fvector_iter == m_fvector.end()) return 0;
   return fvector_iter->second;
 }
-void SparseVector::set(const string& name, FeatureStatsType value) {
+void SparseVector::set(const string& name, FeatureStatsType value)
+{
   name2id_t::const_iterator name2id_iter = m_name_to_id.find(name);
   size_t id = 0;
   if (name2id_iter == m_name_to_id.end()) {
@@ -55,7 +59,8 @@ void SparseVector::set(const string& name, FeatureStatsType value) {
   m_fvector[id] = value;
 }
-void SparseVector::write(ostream& out, const string& sep) const {
+void SparseVector::write(ostream& out, const string& sep) const
+{
   for (fvector_t::const_iterator i = m_fvector.begin(); i != m_fvector.end(); ++i) {
     if (abs(i->second) < 0.00001) continue;
     string name = m_id_to_name[i->first];
@@ -63,11 +68,13 @@ void SparseVector::write(ostream& out, const string& sep) const {
   }
 }
-void SparseVector::clear() {
+void SparseVector::clear()
+{
   m_fvector.clear();
 }
-void SparseVector::load(const string& file) {
+void SparseVector::load(const string& file)
+{
   ifstream in(file.c_str());
   if (!in) {
     throw runtime_error("Failed to open sparse weights file: " + file);
@@ -84,39 +91,44 @@ void SparseVector::load(const string& file) {
   }
 }
-SparseVector& SparseVector::operator-=(const SparseVector& rhs) {
+SparseVector& SparseVector::operator-=(const SparseVector& rhs)
+{
   for (fvector_t::const_iterator i = rhs.m_fvector.begin();
-      i != rhs.m_fvector.end(); ++i) {
+       i != rhs.m_fvector.end(); ++i) {
     m_fvector[i->first] = get(i->first) - (i->second);
   }
   return *this;
 }
-FeatureStatsType SparseVector::inner_product(const SparseVector& rhs) const {
+FeatureStatsType SparseVector::inner_product(const SparseVector& rhs) const
+{
   FeatureStatsType product = 0.0;
   for (fvector_t::const_iterator i = m_fvector.begin();
-      i != m_fvector.end(); ++i) {
+       i != m_fvector.end(); ++i) {
     product += ((i->second) * (rhs.get(i->first)));
   }
   return product;
 }
-SparseVector operator-(const SparseVector& lhs, const SparseVector& rhs) {
+SparseVector operator-(const SparseVector& lhs, const SparseVector& rhs)
+{
   SparseVector res(lhs);
   res -= rhs;
   return res;
 }
-FeatureStatsType inner_product(const SparseVector& lhs, const SparseVector& rhs) {
-  if (lhs.size() >= rhs.size()) {
-    return rhs.inner_product(lhs);
-  } else {
-    return lhs.inner_product(rhs);
-  }
+FeatureStatsType inner_product(const SparseVector& lhs, const SparseVector& rhs)
+{
+  if (lhs.size() >= rhs.size()) {
+    return rhs.inner_product(lhs);
+  } else {
+    return lhs.inner_product(rhs);
+  }
 }
-std::vector SparseVector::feats() const {
+std::vector SparseVector::feats() const
+{
   std::vector toRet;
   for(fvector_t::const_iterator iter = m_fvector.begin();
       iter!=m_fvector.end();
@@ -126,7 +138,8 @@ std::vector SparseVector::feats() const {
   return toRet;
 }
-std::size_t SparseVector::encode(const std::string& name) {
+std::size_t SparseVector::encode(const std::string& name)
+{
   name2id_t::const_iterator name2id_iter = m_name_to_id.find(name);
   size_t id = 0;
   if (name2id_iter == m_name_to_id.end()) {
@@ -139,26 +152,29 @@ std::size_t SparseVector::encode(const std::string& name) {
   return id;
 }
-std::string SparseVector::decode(std::size_t id) {
+std::string SparseVector::decode(std::size_t id)
+{
   return m_id_to_name[id];
 }
-bool operator==(SparseVector const& item1, SparseVector const& item2) {
+bool operator==(SparseVector const& item1, SparseVector const& item2)
+{
   return item1.m_fvector==item2.m_fvector;
 }
-std::size_t hash_value(SparseVector const& item) {
+std::size_t hash_value(SparseVector const& item)
+{
  boost::hash hasher;
  return hasher(item.m_fvector);
 }
 FeatureStats::FeatureStats()
-    : m_available_size(kAvailableSize), m_entries(0),
-      m_array(new FeatureStatsType[m_available_size]) {}
+  : m_available_size(kAvailableSize), m_entries(0),
+    m_array(new FeatureStatsType[m_available_size]) {}
 FeatureStats::FeatureStats(const size_t size)
-    : m_available_size(size), m_entries(size),
-      m_array(new FeatureStatsType[m_available_size])
+  : m_available_size(size), m_entries(size),
+    m_array(new FeatureStatsType[m_available_size])
 {
   memset(m_array, 0, GetArraySizeWithBytes());
 }
@@ -276,7 +292,8 @@ void FeatureStats::savetxt(ostream* os)
   *os << *this;
 }
-void FeatureStats::savetxt() {
+void FeatureStats::savetxt()
+{
   savetxt(&cout);
 }
@@ -298,7 +315,8 @@ ostream& operator<<(ostream& o, const FeatureStats& e)
   return o;
 }
-bool operator==(const FeatureStats& f1, const FeatureStats& f2) {
+bool operator==(const FeatureStats& f1, const FeatureStats& f2)
+{
   size_t size = f1.size();
   if (size != f2.size())
diff --git a/mert/FeatureStats.h b/mert/FeatureStats.h
index 883a89b97..a882e7358 100644
--- a/mert/FeatureStats.h
+++ b/mert/FeatureStats.h
@@ -18,10 +18,11 @@
 namespace MosesTuning
 {
-
+
 // Minimal sparse vector
-class SparseVector {
+class SparseVector
+{
 public:
   typedef std::map fvector_t;
   typedef std::map name2id_t;
@@ -32,8 +33,10 @@ public:
   void set(const std::string& name, FeatureStatsType value);
   void clear();
   void load(const std::string& file);
-  std::size_t size() const { return m_fvector.size(); }
-
+  std::size_t size() const {
+    return m_fvector.size();
+  }
+
   void write(std::ostream& out, const std::string& sep = " ") const;
   SparseVector& operator-=(const SparseVector& rhs);
@@ -78,7 +81,9 @@ public:
   void Copy(const FeatureStats &stats);
-  bool isfull() const { return (m_entries < m_available_size) ? 0 : 1; }
+  bool isfull() const {
+    return (m_entries < m_available_size) ? 0 : 1;
+  }
   void expand();
   void add(FeatureStatsType v);
   void addSparse(const std::string& name, FeatureStatsType v);
@@ -93,23 +98,37 @@ public:
     clear();
   }
-  FeatureStatsType get(std::size_t i) { return m_array[i]; }
-  FeatureStatsType get(std::size_t i)const { return m_array[i]; }
-  featstats_t getArray() const { return m_array; }
+  FeatureStatsType get(std::size_t i) {
+    return m_array[i];
+  }
+  FeatureStatsType get(std::size_t i)const {
+    return m_array[i];
+  }
+  featstats_t getArray() const {
+    return m_array;
+  }
-  const SparseVector& getSparse() const { return m_map; }
+  const SparseVector& getSparse() const {
+    return m_map;
+  }
   void set(std::string &theString, const SparseVector& sparseWeights);
-  inline std::size_t bytes() const { return GetArraySizeWithBytes(); }
+  inline std::size_t bytes() const {
+    return GetArraySizeWithBytes();
+  }
   std::size_t GetArraySizeWithBytes() const {
     return m_entries * sizeof(FeatureStatsType);
   }
-  std::size_t size() const { return m_entries; }
+  std::size_t size() const {
+    return m_entries;
+  }
-  std::size_t available() const { return m_available_size; }
+  std::size_t available() const {
+    return m_available_size;
+  }
   void savetxt(const std::string &file);
   void savetxt(std::ostream* os);
diff --git a/mert/FileStream.cpp b/mert/FileStream.cpp
index 1a52e53fa..800ce1bfe 100644
--- a/mert/FileStream.cpp
+++ b/mert/FileStream.cpp
@@ -5,15 +5,17 @@
 using namespace std;
-namespace {
-bool IsGzipFile(const std::string &filename) {
+namespace
+{
+bool IsGzipFile(const std::string &filename)
+{
   return filename.size() > 3 &&
-    filename.substr(filename.size() - 3, 3) == ".gz";
+         filename.substr(filename.size() - 3, 3) == ".gz";
 }
 } // namespace
 inputfilestream::inputfilestream(const std::string &filePath)
-    : std::istream(0), m_streambuf(0), m_is_good(false)
+  : std::istream(0), m_streambuf(0), m_is_good(false)
 {
   // check if file is readable
   std::filebuf* fb = new std::filebuf();
@@ -40,7 +42,7 @@ void inputfilestream::close()
 }
 outputfilestream::outputfilestream(const std::string &filePath)
-    : std::ostream(0), m_streambuf(0), m_is_good(false)
+  : std::ostream(0), m_streambuf(0), m_is_good(false)
 {
   // check if file is readable
   std::filebuf* fb = new std::filebuf();
diff --git a/mert/FileStream.h b/mert/FileStream.h
index 3fd489cd7..582cbcb59 100644
--- a/mert/FileStream.h
+++ b/mert/FileStream.h
@@ -16,7 +16,9 @@ public:
   explicit inputfilestream(const std::string &filePath);
   virtual ~inputfilestream();
-  bool good() const { return m_is_good; }
+  bool good() const {
+    return m_is_good;
+  }
   void close();
 };
@@ -30,7 +32,9 @@ public:
   explicit outputfilestream(const std::string &filePath);
   virtual ~outputfilestream();
-  bool good() const { return m_is_good; }
+  bool good() const {
+    return m_is_good;
+  }
   void close();
 };
diff --git a/mert/GzFileBuf.cpp b/mert/GzFileBuf.cpp
index 9d3ccb588..d61a22525 100644
--- a/mert/GzFileBuf.cpp
+++ b/mert/GzFileBuf.cpp
@@ -5,7 +5,8 @@
 #include
 #include
-GzFileBuf::GzFileBuf(const char* filename) {
+GzFileBuf::GzFileBuf(const char* filename)
+{
   m_gz_file = gzopen(filename, "rb");
   if (m_gz_file == NULL) {
     std::cerr << "ERROR: Failed to open " << filename << std::endl;
@@ -16,16 +17,19 @@ GzFileBuf::GzFileBuf(const char* filename) {
             m_buf + sizeof(int)); // end position
 }
-GzFileBuf::~GzFileBuf() {
+GzFileBuf::~GzFileBuf()
+{
   gzclose(m_gz_file);
 }
-int GzFileBuf::overflow(int_type c) {
+int GzFileBuf::overflow(int_type c)
+{
   throw;
 }
 // read one character
-int GzFileBuf::underflow() {
+int GzFileBuf::underflow()
+{
   // is read position before end of m_buf?
   if (gptr() < egptr()) {
     return traits_type::to_int_type(*gptr());
@@ -64,17 +68,20 @@ int GzFileBuf::underflow() {
 }
 std::streampos GzFileBuf::seekpos(
-    std::streampos sp,
-    std::ios_base::openmode which) {
+  std::streampos sp,
+  std::ios_base::openmode which)
+{
   throw;
 }
 std::streamsize GzFileBuf::xsgetn(char* s,
-                                  std::streamsize num) {
+                                  std::streamsize num)
+{
   return static_cast(gzread(m_gz_file,s,num));
 }
 std::streamsize GzFileBuf::xsputn(const char* s,
-                                  std::streamsize num) {
+                                  std::streamsize num)
+{
   throw;
 }
diff --git a/mert/GzFileBuf.h b/mert/GzFileBuf.h
index 729523e0e..fb57fcfe7 100644
--- a/mert/GzFileBuf.h
+++ b/mert/GzFileBuf.h
@@ -17,8 +17,8 @@ protected:
   virtual int_type underflow();
   virtual std::streampos seekpos(
-      std::streampos sp,
-      std::ios_base::openmode which = std::ios_base::in | std::ios_base::out);
+    std::streampos sp,
+    std::ios_base::openmode which = std::ios_base::in | std::ios_base::out);
   virtual std::streamsize xsgetn(char* s,
                                  std::streamsize num);
diff --git a/mert/HypPackEnumerator.cpp b/mert/HypPackEnumerator.cpp
index 776c02857..1cdd1cb7d 100644
--- a/mert/HypPackEnumerator.cpp
+++ b/mert/HypPackEnumerator.cpp
@@ -8,13 +8,13 @@
 using namespace std;
 namespace MosesTuning
 {
-
+
 StreamingHypPackEnumerator::StreamingHypPackEnumerator
 (
-  vector const& featureFiles,
-  vector const& scoreFiles
-  )
+  vector const& featureFiles,
+  vector const& scoreFiles
+)
   : m_featureFiles(featureFiles),
     m_scoreFiles(scoreFiles)
 {
@@ -22,19 +22,20 @@ StreamingHypPackEnumerator::StreamingHypPackEnumerator
     cerr << "No data to process" << endl;
     exit(0);
   }
-
+
   if (featureFiles.size() != scoreFiles.size()) {
     cerr << "Error: Number of feature files (" << featureFiles.size() <<
-      ") does not match number of score files (" << scoreFiles.size() << ")" << endl;
+         ") does not match number of score files (" << scoreFiles.size() << ")" << endl;
     exit(1);
   }
-
+
   m_num_lists = scoreFiles.size();
   m_primed = false;
   m_iNumDense = -1;
 }
-size_t StreamingHypPackEnumerator::num_dense() const {
+size_t StreamingHypPackEnumerator::num_dense() const
+{
   if(m_iNumDense<0) {
     cerr << "Error: Requested num_dense() for an unprimed StreamingHypPackEnumerator" << endl;
     exit(1);
@@ -42,12 +43,13 @@ size_t StreamingHypPackEnumerator::num_dense() const {
   return (size_t) m_iNumDense;
 }
-void StreamingHypPackEnumerator::prime(){
+void StreamingHypPackEnumerator::prime()
+{
   m_current_indexes.clear();
   m_current_featureVectors.clear();
   boost::unordered_set seen;
   m_primed = true;
-
+
   for (size_t i = 0; i < m_num_lists; ++i) {
     if (m_featureDataIters[i] == FeatureDataIterator::end()) {
       cerr << "Error: Feature file " << i << " ended prematurely" << endl;
@@ -78,13 +80,14 @@ void StreamingHypPackEnumerator::prime(){
       }
       // Store item for retrieval
       m_current_indexes.push_back(pair(i,j));
-      m_current_featureVectors.push_back(MiraFeatureVector(item));
+      m_current_featureVectors.push_back(MiraFeatureVector(item));
     }
   }
 }
 }
-void StreamingHypPackEnumerator::reset(){
+void StreamingHypPackEnumerator::reset()
+{
   m_featureDataIters.clear();
   m_scoreDataIters.clear();
   for (size_t i = 0; i < m_num_lists; ++i) {
@@ -95,11 +98,13 @@ void StreamingHypPackEnumerator::reset(){
   prime();
 }
-bool StreamingHypPackEnumerator::finished(){
+bool StreamingHypPackEnumerator::finished()
+{
   return m_featureDataIters[0]==FeatureDataIterator::end();
 }
-void StreamingHypPackEnumerator::next(){
+void StreamingHypPackEnumerator::next()
+{
   if(!m_primed) {
     cerr << "Enumerating an unprimed HypPackEnumerator" << endl;
     exit(1);
@@ -113,7 +118,8 @@ void StreamingHypPackEnumerator::next(){
   if(!finished()) prime();
 }
-size_t StreamingHypPackEnumerator::cur_size(){
+size_t StreamingHypPackEnumerator::cur_size()
+{
   if(!m_primed) {
     cerr << "Querying size from an unprimed HypPackEnumerator" << endl;
     exit(1);
@@ -121,7 +127,8 @@ size_t StreamingHypPackEnumerator::cur_size(){
   return m_current_indexes.size();
 }
-const MiraFeatureVector& StreamingHypPackEnumerator::featuresAt(size_t index){
+const MiraFeatureVector& StreamingHypPackEnumerator::featuresAt(size_t index)
+{
  if(!m_primed) {
    cerr << "Querying features from an unprimed HypPackEnumerator" << endl;
    exit(1);
@@ -129,7 +136,8 @@ const MiraFeatureVector& StreamingHypPackEnumerator::featuresAt(size_t index){
   return m_current_featureVectors[index];
 }
-const ScoreDataItem& StreamingHypPackEnumerator::scoresAt(size_t index) {
+const ScoreDataItem& StreamingHypPackEnumerator::scoresAt(size_t index)
+{
   if(!m_primed) {
     cerr << "Querying scores from an unprimed HypPackEnumerator" << endl;
     exit(1);
@@ -138,22 +146,23 @@ const ScoreDataItem& StreamingHypPackEnumerator::scoresAt(size_t index) {
   return m_scoreDataIters[pij.first]->operator[](pij.second);
 }
-size_t StreamingHypPackEnumerator::cur_id() {
+size_t StreamingHypPackEnumerator::cur_id()
+{
   return m_sentenceId;
 }
 /* --------- RandomAccessHypPackEnumerator ------------- */
 RandomAccessHypPackEnumerator::RandomAccessHypPackEnumerator(vector const& featureFiles,
-                                                             vector const& scoreFiles,
-                                                             bool no_shuffle)
+    vector const& scoreFiles,
+    bool no_shuffle)
 {
   StreamingHypPackEnumerator train(featureFiles,scoreFiles);
   size_t index=0;
   for(train.reset(); !train.finished(); train.next()) {
     m_features.push_back(vector());
     m_scores.push_back(vector());
-    for(size_t j=0;j cons
   m_num_dense = train.num_dense();
 }
-size_t RandomAccessHypPackEnumerator::num_dense() const
+{
   return m_num_dense;
 }
-
-void RandomAccessHypPackEnumerator::reset() {
+
+void RandomAccessHypPackEnumerator::reset()
+{
   m_cur_index = 0;
   if(!m_no_shuffle) random_shuffle(m_indexes.begin(),m_indexes.end());
 }
-bool RandomAccessHypPackEnumerator::finished() {
+bool RandomAccessHypPackEnumerator::finished()
+{
   return m_cur_index >= m_indexes.size();
 }
-void RandomAccessHypPackEnumerator::next() {
+void RandomAccessHypPackEnumerator::next()
+{
   m_cur_index++;
 }
-size_t RandomAccessHypPackEnumerator::cur_size() {
+size_t RandomAccessHypPackEnumerator::cur_size()
+{
   assert(m_features[m_indexes[m_cur_index]].size()==m_scores[m_indexes[m_cur_index]].size());
   return m_features[m_indexes[m_cur_index]].size();
 }
-const MiraFeatureVector& RandomAccessHypPackEnumerator::featuresAt(size_t i) {
+const MiraFeatureVector& RandomAccessHypPackEnumerator::featuresAt(size_t i)
+{
   return m_features[m_indexes[m_cur_index]][i];
 }
-const ScoreDataItem& RandomAccessHypPackEnumerator::scoresAt(size_t i) {
+const ScoreDataItem& RandomAccessHypPackEnumerator::scoresAt(size_t i)
+{
   return m_scores[m_indexes[m_cur_index]][i];
 }
-size_t RandomAccessHypPackEnumerator::cur_id() {
+size_t RandomAccessHypPackEnumerator::cur_id()
+{
   return m_indexes[m_cur_index];
-}
+}
 // --Emacs trickery--
 // Local Variables:
 // mode:c++
diff --git a/mert/HypPackEnumerator.h b/mert/HypPackEnumerator.h
index 690e53103..957c6d408 100644
--- a/mert/HypPackEnumerator.h
+++ b/mert/HypPackEnumerator.h
@@ -20,11 +20,12 @@
 namespace MosesTuning
 {
-
+
 // Start with these abstract classes
-class HypPackEnumerator {
+class HypPackEnumerator
+{
 public:
   virtual ~HypPackEnumerator() {}
@@ -41,7 +42,8 @@ public:
 // Instantiation that streams from disk
 // Low-memory, low-speed, sequential access
-class StreamingHypPackEnumerator : public HypPackEnumerator {
+class StreamingHypPackEnumerator : public HypPackEnumerator
+{
 public:
   StreamingHypPackEnumerator(std::vector const& featureFiles,
                              std::vector const& scoreFiles);
@@ -75,7 +77,8 @@ private:
 // Instantiation that reads into memory
 // High-memory, high-speed, random access
 // (Actually randomizes with each call to reset)
-class RandomAccessHypPackEnumerator : public HypPackEnumerator {
+class RandomAccessHypPackEnumerator : public HypPackEnumerator
+{
 public:
   RandomAccessHypPackEnumerator(std::vector const& featureFiles,
                                 std::vector const& scoreFiles,
diff --git a/mert/InterpolatedScorer.cpp b/mert/InterpolatedScorer.cpp
index af3f26bf2..87cec9211 100644
--- a/mert/InterpolatedScorer.cpp
+++ b/mert/InterpolatedScorer.cpp
@@ -11,7 +11,7 @@ namespace MosesTuning
 // TODO: This is too long. Consider creating a function for
 // initialization such as Init().
 InterpolatedScorer::InterpolatedScorer(const string& name, const string& config)
-  : Scorer(name,config)
+  : Scorer(name,config)
 {
   // name would be: HAMMING,BLEU or similar
   string scorers = name;
@@ -66,7 +66,8 @@ InterpolatedScorer::InterpolatedScorer(const string& name, const string& config)
   cerr << … << endl;
 }
 
-bool InterpolatedScorer::useAlignment() const {
+bool InterpolatedScorer::useAlignment() const
+{
   for (ScopedVector<Scorer>::const_iterator itsc = m_scorers.begin(); itsc < m_scorers.end(); itsc++) {
     if ((*itsc)->useAlignment()) {
@@ -176,8 +177,7 @@ void InterpolatedScorer::prepareStats(size_t sid, const string& text, ScoreStats
       ScoreStats tempEntry;
       if ((*itsc)->useAlignment()) {
         (*itsc)->prepareStats(sid, text, tempEntry);
-      }
-      else {
+      } else {
         (*itsc)->prepareStats(sid, sentence, tempEntry);
       }
       if (i > 0) buff << " ";
@@ -206,17 +206,17 @@ void InterpolatedScorer::setFactors(const string& factors)
 
 void InterpolatedScorer::setFilter(const string& filterCommand)
 {
-    if (filterCommand.empty()) return;
+  if (filterCommand.empty()) return;
 
-    vector<string> csplit;
-    split(filterCommand, ',', csplit);
+  vector<string> csplit;
+  split(filterCommand, ',', csplit);
 
-    if (csplit.size() != m_scorers.size())
-      throw runtime_error("Number of command specifications does not equal number of interpolated scorers.");
+  if (csplit.size() != m_scorers.size())
+    throw runtime_error("Number of command specifications does not equal number of interpolated scorers.");
 
-    for (size_t i = 0; i < m_scorers.size(); ++i) {
-      m_scorers[i]->setFilter(csplit[i]);
-    }
+  for (size_t i = 0; i < m_scorers.size(); ++i) {
+    m_scorers[i]->setFilter(csplit[i]);
+  }
 }
 
 }
diff --git a/mert/InterpolatedScorer.h b/mert/InterpolatedScorer.h
index 49c065d27..d1078e9e1 100644
--- a/mert/InterpolatedScorer.h
+++ b/mert/InterpolatedScorer.h
@@ -10,7 +10,7 @@
 namespace MosesTuning
 {
-  
+
 
 /**
  * Class that includes other scorers eg.
diff --git a/mert/MiraFeatureVector.cpp b/mert/MiraFeatureVector.cpp
index 95805c295..dea9b9b83 100644
--- a/mert/MiraFeatureVector.cpp
+++ b/mert/MiraFeatureVector.cpp
@@ -7,7 +7,7 @@ using namespace std;
 
 namespace MosesTuning
 {
-  
+
 
 MiraFeatureVector::MiraFeatureVector(const FeatureDataItem& vec)
   : m_dense(vec.dense)
@@ -17,8 +17,7 @@ MiraFeatureVector::MiraFeatureVector(const FeatureDataItem& vec)
   size_t lastFeat = 0;
   m_sparseFeats.reserve(sparseFeats.size());
   m_sparseVals.reserve(sparseFeats.size());
-  for(size_t i=0;i<sparseFeats.size();i++) {
+  for(size_t i=0; i<sparseFeats.size(); i++) {
     size_t feat = m_dense.size() + sparseFeats[i];
     if(lastFeat>=feat) {
       cerr << "Error: Feature indeces must be strictly ascending coming out of SparseVector" << endl;
       exit(1);
@@ -61,29 +59,33 @@ MiraFeatureVector::MiraFeatureVector(const vector<ValType>& dense,
   }
 }
 
-ValType MiraFeatureVector::val(size_t index) const {
+ValType MiraFeatureVector::val(size_t index) const
+{
   if(index < m_dense.size())
     return m_dense[index];
   else
     return m_sparseVals[index-m_dense.size()];
 }
 
-size_t MiraFeatureVector::feat(size_t index) const {
+size_t MiraFeatureVector::feat(size_t index) const
+{
   if(index < m_dense.size())
     return index;
   else
     return m_sparseFeats[index-m_dense.size()];
 }
 
-size_t MiraFeatureVector::size() const {
+size_t MiraFeatureVector::size() const
+{
   return m_dense.size() + m_sparseVals.size();
 }
 
-ValType MiraFeatureVector::sqrNorm() const {
+ValType MiraFeatureVector::sqrNorm() const
+{
   ValType toRet = 0.0;
-  for(size_t i=0;i<…
…
 ostream& operator<<(ostream& o, const MiraFeatureVector& e)
 {
   for(size_t i=0; i<e.size(); i++) {
     if(i>0) o << " ";
     o << e.feat(i) << ":" << e.val(i);
   }
diff --git a/mert/MiraFeatureVector.h b/mert/MiraFeatureVector.h
index 60e765605..cb2b1c87d 100644
--- a/mert/MiraFeatureVector.h
+++ b/mert/MiraFeatureVector.h
@@ -19,11 +19,12 @@
 namespace MosesTuning
 {
-  
+
 
 typedef FeatureStatsType ValType;
 
-class MiraFeatureVector {
+class MiraFeatureVector
+{
 public:
   MiraFeatureVector(const FeatureDataItem& vec);
   MiraFeatureVector(const MiraFeatureVector& other);
diff --git a/mert/MiraWeightVector.cpp b/mert/MiraWeightVector.cpp
index c6f0261dc..e23804cbf 100644
--- a/mert/MiraWeightVector.cpp
+++ b/mert/MiraWeightVector.cpp
@@ -6,7 +6,7 @@ using namespace std;
 
 namespace MosesTuning
 {
-  
+
 
 /**
  * Constructor, initializes to the zero vector
@@ -36,9 +36,10 @@ MiraWeightVector::MiraWeightVector(const vector<ValType>& init)
  * \param fv Feature vector to be added to the weights
  * \param tau FV will be scaled by this value before update
  */
-void MiraWeightVector::update(const MiraFeatureVector& fv, float tau) {
+void MiraWeightVector::update(const MiraFeatureVector& fv, float tau)
+{
   m_numUpdates++;
-  for(size_t i=0;i<…
…
   this->fixTotals();
   return AvgWeightVector(*this);
 }
@@ -73,7 +77,8 @@ AvgWeightVector MiraWeightVector::avg() {
 /**
  * Updates a weight and lazily updates its total
  */
-void MiraWeightVector::update(size_t index, ValType delta) {
+void MiraWeightVector::update(size_t index, ValType delta)
+{
   // Handle previously unseen weights
   while(index>=m_weights.size()) {
@@ -91,25 +96,27 @@ void MiraWeightVector::update(size_t index, ValType delta) {
 /**
  * Make sure everyone's total is up-to-date
  */
-void MiraWeightVector::fixTotals() {
+void MiraWeightVector::fixTotals()
+{
   for(size_t i=0; i<…
…
   if(…>1e-8) {
     if(i>0) o << " ";
     cerr << i << ":" << e.m_weights[i];
   }
@@ -136,14 +143,14 @@ ValType AvgWeightVector::weight(size_t index) const
   else {
     if(index < m_wv.m_totals.size()) {
       return m_wv.m_totals[index] / m_wv.m_numUpdates;
-    }
-    else {
+    } else {
       return 0;
     }
   }
 }
 
-ValType AvgWeightVector::score(const MiraFeatureVector& fv) const {
+ValType AvgWeightVector::score(const MiraFeatureVector& fv) const
+{
   ValType toRet = 0.0;
   for(size_t i=0; i<…
…
diff --git a/mert/NgramCounts.h b/mert/NgramCounts.h
…
   bool operator()(const std::vector<int>& a, const std::vector<int>& b) const {
@@ -45,7 +46,9 @@ class NgramCounts {
   /**
    * If the specified "ngram" is found, we add counts.
    * If not, we insert the default count in the container.
    */
-  inline void Add(const Key& ngram) { m_counts[ngram]++; }
+  inline void Add(const Key& ngram) {
+    m_counts[ngram]++;
+  }
 
   /**
    * Return true iff the specified "ngram" is found in the container.
@@ -60,34 +63,58 @@
   /**
    * Clear all elements in the container.
    */
-  void clear() { m_counts.clear(); }
+  void clear() {
+    m_counts.clear();
+  }
 
   /**
    * Return true iff the container is empty.
    */
-  bool empty() const { return m_counts.empty(); }
+  bool empty() const {
+    return m_counts.empty();
+  }
 
   /**
    * Return the number of elements in the container.
    */
-  std::size_t size() const { return m_counts.size(); }
+  std::size_t size() const {
+    return m_counts.size();
+  }
 
-  std::size_t max_size() const { return m_counts.max_size(); }
+  std::size_t max_size() const {
+    return m_counts.max_size();
+  }
 
   // Note: This is mainly used by unit tests.
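// Aside: a self-contained toy version (not mert code) of the lazy-averaging
// bookkeeping that MiraWeightVector::update()/fixTotals()/avg() implement
// above. Each weight remembers the step at which it last changed, so the
// running total needed for the averaged weights can be settled in arrears
// instead of touching every weight on every update. All names here are
// invented for the sketch.

#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
  std::vector<double> w(2, 0.0), total(2, 0.0);
  std::vector<int> lastUpdated(2, 0);
  int numUpdates = 0;

  // Two updates, both touching only weight 0.
  for (int t = 0; t < 2; ++t) {
    ++numUpdates;
    const double delta = 0.5;
    // Settle what w[0] accrued since its last change, then count this step.
    total[0] += w[0] * (numUpdates - lastUpdated[0]) + delta;
    lastUpdated[0] = numUpdates;
    w[0] += delta;
  }

  // fixTotals(): credit every weight for the steps since its last change.
  for (std::size_t i = 0; i < w.size(); ++i) {
    total[i] += w[i] * (numUpdates - lastUpdated[i]);
    lastUpdated[i] = numUpdates;
  }

  // Averaged weight = total / numUpdates: (0.5 + 1.0) / 2 = 0.75.
  std::printf("avg w0 = %g\n", total[0] / numUpdates);
  return 0;
}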
- int get_default_count() const { return kDefaultCount; } + int get_default_count() const { + return kDefaultCount; + } - iterator find(const Key& ngram) { return m_counts.find(ngram); } - const_iterator find(const Key& ngram) const { return m_counts.find(ngram); } + iterator find(const Key& ngram) { + return m_counts.find(ngram); + } + const_iterator find(const Key& ngram) const { + return m_counts.find(ngram); + } - Value& operator[](const Key& ngram) { return m_counts[ngram]; } + Value& operator[](const Key& ngram) { + return m_counts[ngram]; + } - iterator begin() { return m_counts.begin(); } - const_iterator begin() const { return m_counts.begin(); } - iterator end() { return m_counts.end(); } - const_iterator end() const { return m_counts.end(); } + iterator begin() { + return m_counts.begin(); + } + const_iterator begin() const { + return m_counts.begin(); + } + iterator end() { + return m_counts.end(); + } + const_iterator end() const { + return m_counts.end(); + } - private: +private: const int kDefaultCount; boost::unordered_map m_counts; }; diff --git a/mert/NgramTest.cpp b/mert/NgramTest.cpp index e6218206f..87f36860b 100644 --- a/mert/NgramTest.cpp +++ b/mert/NgramTest.cpp @@ -5,7 +5,8 @@ using namespace MosesTuning; -BOOST_AUTO_TEST_CASE(ngram_basic) { +BOOST_AUTO_TEST_CASE(ngram_basic) +{ NgramCounts counts; NgramCounts::Key key; key.push_back(1); @@ -25,7 +26,8 @@ BOOST_AUTO_TEST_CASE(ngram_basic) { BOOST_CHECK_EQUAL(it->second, 1); } -BOOST_AUTO_TEST_CASE(ngram_Add) { +BOOST_AUTO_TEST_CASE(ngram_Add) +{ NgramCounts counts; NgramCounts::Key key; key.push_back(1); @@ -49,7 +51,8 @@ BOOST_AUTO_TEST_CASE(ngram_Add) { BOOST_CHECK_EQUAL(counts[key3], counts.get_default_count()); } -BOOST_AUTO_TEST_CASE(ngram_lookup) { +BOOST_AUTO_TEST_CASE(ngram_lookup) +{ NgramCounts counts; NgramCounts::Key key; key.push_back(1); diff --git a/mert/Optimizer.cpp b/mert/Optimizer.cpp index e5f5854b2..6afbd6241 100644 --- a/mert/Optimizer.cpp +++ b/mert/Optimizer.cpp @@ -17,7 +17,8 @@ using namespace std; static const float MIN_FLOAT = -1.0 * numeric_limits::max(); static const float MAX_FLOAT = numeric_limits::max(); -namespace { +namespace +{ /** * Compute the intersection of 2 lines. @@ -35,7 +36,7 @@ inline float intersect(float m1, float b1, float m2, float b2) namespace MosesTuning { - + Optimizer::Optimizer(unsigned Pd, const vector& i2O, const vector& pos, const vector& start, unsigned int nrandom) : m_scorer(NULL), m_feature_data(), m_num_random_directions(nrandom), m_positive(pos) @@ -198,7 +199,7 @@ statscore_t Optimizer::LineOptimize(const Point& origin, const Point& direction, thresholdmap.erase(previnserted); // erase old previnsert previnserted = thresholdmap.find(leftmostx); // point previnsert to the new threshold previnserted->second.back()=newd; // We update the diff for sentence S - // Threshold already exists but is not the previous one. + // Threshold already exists but is not the previous one. } else { // We append the diffs in previnsert to tit before destroying previnsert. 
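// A sketch of the idea behind the threshold bookkeeping above (types assumed,
// not verbatim mert code): LineOptimize() walks a line P = origin + x * direction
// through weight space. For each sentence, the 1-best hypothesis can only change
// at a finite set of x values, so the search collects
//
//   std::map<float, std::vector<std::pair<std::size_t, std::size_t> > > thresholdmap;
//   // threshold x  ->  (sentence id, hypothesis that becomes 1-best at x)
//
// and then sweeps the thresholds left to right, applying each batch of diffs and
// rescoring incrementally; the best x seen gives the optimum along the line. The
// previnserted/tit handling above only merges duplicate thresholds so that every
// x carries all of its diffs exactly once.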
tit->second.insert(tit->second.end(),previnserted->second.begin(),previnserted->second.end()); @@ -405,8 +406,7 @@ statscore_t SimpleOptimizer::TrueRun(Point& P) const for (unsigned int i = 0; i < Point::getdim(); i++) direction[i]=0.0; direction[d]=1.0; - } - else { // random direction update + } else { // random direction update direction.Randomize(); } statscore_t curscore = LineOptimize(P, direction, linebest);//find the minimum on the line @@ -443,8 +443,7 @@ statscore_t RandomDirectionOptimizer::TrueRun(Point& P) const // do specified number of random direction optimizations unsigned int nrun = 0; unsigned int nrun_no_change = 0; - for (; nrun_no_change < m_num_random_directions; nrun++, nrun_no_change++) - { + for (; nrun_no_change < m_num_random_directions; nrun++, nrun_no_change++) { // choose a random direction in which to optimize Point direction; direction.Randomize(); diff --git a/mert/Optimizer.h b/mert/Optimizer.h index f81d59d96..80d6d10cc 100644 --- a/mert/Optimizer.h +++ b/mert/Optimizer.h @@ -12,7 +12,7 @@ static const float kMaxFloat = std::numeric_limits::max(); namespace MosesTuning { - + class Point; @@ -31,8 +31,12 @@ protected: public: Optimizer(unsigned Pd, const std::vector& i2O, const std::vector& positive, const std::vector& start, unsigned int nrandom); - void SetScorer(Scorer *scorer) { m_scorer = scorer; } - void SetFeatureData(FeatureDataHandle feature_data) { m_feature_data = feature_data; } + void SetScorer(Scorer *scorer) { + m_scorer = scorer; + } + void SetFeatureData(FeatureDataHandle feature_data) { + m_feature_data = feature_data; + } virtual ~Optimizer(); unsigned size() const { @@ -97,7 +101,7 @@ private: public: RandomDirectionOptimizer(unsigned dim, const std::vector& i2O, const std::vector& positive, const std::vector& start, unsigned int nrandom) - : Optimizer(dim, i2O, positive, start, nrandom), kEPS(0.0001f) {} + : Optimizer(dim, i2O, positive, start, nrandom), kEPS(0.0001f) {} virtual statscore_t TrueRun(Point&) const; }; @@ -109,7 +113,7 @@ class RandomOptimizer : public Optimizer public: RandomOptimizer(unsigned dim, const std::vector& i2O, const std::vector& positive, const std::vector& start, unsigned int nrandom) - : Optimizer(dim, i2O, positive, start, nrandom) {} + : Optimizer(dim, i2O, positive, start, nrandom) {} virtual statscore_t TrueRun(Point&) const; }; diff --git a/mert/OptimizerFactory.cpp b/mert/OptimizerFactory.cpp index b33194f33..97288f9a8 100644 --- a/mert/OptimizerFactory.cpp +++ b/mert/OptimizerFactory.cpp @@ -5,7 +5,7 @@ using namespace std; namespace MosesTuning { - + vector OptimizerFactory::m_type_names; @@ -38,11 +38,11 @@ OptimizerFactory::OptimizerType OptimizerFactory::GetOptimizerType(const string& } Optimizer* OptimizerFactory::BuildOptimizer(unsigned dim, - const vector& i2o, - const std::vector& positive, - const vector& start, - const string& type, - unsigned int nrandom) + const vector& i2o, + const std::vector& positive, + const vector& start, + const string& type, + unsigned int nrandom) { OptimizerType opt_type = GetOptimizerType(type); if (opt_type == NOPTIMIZER) { @@ -55,18 +55,18 @@ Optimizer* OptimizerFactory::BuildOptimizer(unsigned dim, } switch (opt_type) { - case POWELL: - return new SimpleOptimizer(dim, i2o, positive, start, nrandom); - break; - case RANDOM_DIRECTION: - return new RandomDirectionOptimizer(dim, i2o, positive, start, nrandom); - break; - case RANDOM: - return new RandomOptimizer(dim, i2o, positive, start, nrandom); - break; - default: - cerr << "Error: unknown optimizer" << 
type << endl; - return NULL; + case POWELL: + return new SimpleOptimizer(dim, i2o, positive, start, nrandom); + break; + case RANDOM_DIRECTION: + return new RandomDirectionOptimizer(dim, i2o, positive, start, nrandom); + break; + case RANDOM: + return new RandomOptimizer(dim, i2o, positive, start, nrandom); + break; + default: + cerr << "Error: unknown optimizer" << type << endl; + return NULL; } } diff --git a/mert/OptimizerFactory.h b/mert/OptimizerFactory.h index ae34bcb00..fc0fea65a 100644 --- a/mert/OptimizerFactory.h +++ b/mert/OptimizerFactory.h @@ -6,13 +6,13 @@ namespace MosesTuning { - + class Optimizer; class OptimizerFactory { - public: +public: // NOTE: Add new optimizer here BEFORE NOPTIMZER enum OptimizerType { POWELL = 0, @@ -36,7 +36,7 @@ class OptimizerFactory const std::string& type, unsigned int nrandom); - private: +private: OptimizerFactory() {} ~OptimizerFactory() {} diff --git a/mert/OptimizerFactoryTest.cpp b/mert/OptimizerFactoryTest.cpp index 4d259c68d..56f894904 100644 --- a/mert/OptimizerFactoryTest.cpp +++ b/mert/OptimizerFactoryTest.cpp @@ -7,21 +7,24 @@ using namespace MosesTuning; -namespace { +namespace +{ inline bool CheckBuildOptimizer(unsigned dim, const std::vector& to_optimize, const std::vector& positive, const std::vector& start, const std::string& type, - unsigned int num_random) { + unsigned int num_random) +{ boost::scoped_ptr optimizer(OptimizerFactory::BuildOptimizer(dim, to_optimize, positive, start, type, num_random)); return optimizer.get() != NULL; } } // namespace -BOOST_AUTO_TEST_CASE(optimizer_type) { +BOOST_AUTO_TEST_CASE(optimizer_type) +{ BOOST_CHECK_EQUAL(OptimizerFactory::GetOptimizerType("powell"), OptimizerFactory::POWELL); BOOST_CHECK_EQUAL(OptimizerFactory::GetOptimizerType("random"), @@ -30,7 +33,8 @@ BOOST_AUTO_TEST_CASE(optimizer_type) { OptimizerFactory::RANDOM_DIRECTION); } -BOOST_AUTO_TEST_CASE(optimizer_build) { +BOOST_AUTO_TEST_CASE(optimizer_build) +{ const unsigned dim = 3; std::vector to_optimize; to_optimize.push_back(1); diff --git a/mert/PerScorer.cpp b/mert/PerScorer.cpp index 06b53436f..3e157a55e 100644 --- a/mert/PerScorer.cpp +++ b/mert/PerScorer.cpp @@ -10,7 +10,7 @@ using namespace std; namespace MosesTuning { - + PerScorer::PerScorer(const string& config) : StatisticsBasedScorer("PER",config) {} diff --git a/mert/PerScorer.h b/mert/PerScorer.h index 76ea9bfd7..ffb869942 100644 --- a/mert/PerScorer.h +++ b/mert/PerScorer.h @@ -9,7 +9,7 @@ namespace MosesTuning { - + class ScoreStats; @@ -27,7 +27,9 @@ public: virtual void setReferenceFiles(const std::vector& referenceFiles); virtual void prepareStats(std::size_t sid, const std::string& text, ScoreStats& entry); - virtual std::size_t NumberOfScores() const { return 3; } + virtual std::size_t NumberOfScores() const { + return 3; + } virtual float calculateScore(const std::vector& comps) const; private: diff --git a/mert/Permutation.cpp b/mert/Permutation.cpp index 5f3102f26..a4c74b0d2 100644 --- a/mert/Permutation.cpp +++ b/mert/Permutation.cpp @@ -16,7 +16,7 @@ using namespace std; namespace MosesTuning { - + Permutation::Permutation(const string &alignment, const int sourceLength, const int targetLength ) { @@ -86,7 +86,7 @@ void Permutation::set(const string & alignment,const int sourceLength) //cout << "SP:" << sourcePos << " TP:" << targetPos << endl; if (sourcePos > sourceLength) { cerr << "Source sentence length:" << sourceLength << " is smaller than alignment source position:" << sourcePos << endl; - cerr << "******** Permutation::set :" << alignment << 
": len : " << sourceLength << endl;
…
   //bool debug= (verboselevel()>3); // TODO: fix verboselevel()
-  bool debug=false;
+  bool debug=false;
   if (debug) {
     cout << "*****Permutation::distance" << endl;
…
diff --git a/mert/PermutationScorer.cpp b/mert/PermutationScorer.cpp
…
   //bool debug= (verboselevel()>3); // TODO: fix verboselevel()
-  bool debug=false;
+  bool debug=false;
   if (debug) {
     cout << "*******prepareStats" ;
     cout << text << endl;
diff --git a/mert/PermutationScorer.h b/mert/PermutationScorer.h
index 4d5c144ce..c3d0cc960 100644
--- a/mert/PermutationScorer.h
+++ b/mert/PermutationScorer.h
@@ -19,7 +19,7 @@
 namespace MosesTuning
 {
-  
+
 
 /**
  * Permutation
 **/
diff --git a/mert/Point.cpp b/mert/Point.cpp
index 5c446aa8b..2219749bd 100644
--- a/mert/Point.cpp
+++ b/mert/Point.cpp
@@ -29,7 +29,7 @@ Point::Point() : vector<parameter_t>(m_dim), m_score(0.0) {}
 
 Point::Point(const vector<parameter_t>& init, const vector<parameter_t>& min, const vector<parameter_t>& max)
-    : vector<parameter_t>(Point::m_dim), m_score(0.0)
+  : vector<parameter_t>(Point::m_dim), m_score(0.0)
 {
   m_min.resize(Point::m_dim);
   m_max.resize(Point::m_dim);
diff --git a/mert/Point.h b/mert/Point.h
index 92cb832dd..f53f5f982 100644
--- a/mert/Point.h
+++ b/mert/Point.h
@@ -8,7 +8,7 @@
 namespace MosesTuning
 {
-  
+
 
 class FeatureStats;
 class Optimizer;
@@ -53,11 +53,19 @@ private:
   statscore_t m_score;
 
 public:
-  static unsigned int getdim() { return m_dim; }
-  static void setdim(std::size_t d) { m_dim = d; }
+  static unsigned int getdim() {
+    return m_dim;
+  }
+  static void setdim(std::size_t d) {
+    m_dim = d;
+  }
 
-  static unsigned int getpdim() { return m_pdim; }
-  static void setpdim(std::size_t pd) { m_pdim = pd; }
+  static unsigned int getpdim() {
+    return m_pdim;
+  }
+  static void setpdim(std::size_t pd) {
+    m_pdim = pd;
+  }
 
   static void set_optindices(const std::vector& indices) {
     m_opt_indices = indices;
@@ -90,7 +98,9 @@ public:
   */
   friend std::ostream& operator<<(std::ostream& o,const Point& P);
 
-  void Normalize() { NormalizeL2(); }
+  void Normalize() {
+    NormalizeL2();
+  }
   void NormalizeL2();
   void NormalizeL1();
@@ -100,8 +110,12 @@
   */
   void GetAllWeights(std::vector<parameter_t>& w) const;
 
-  statscore_t GetScore() const { return m_score; }
-  void SetScore(statscore_t score) { m_score = score; }
+  statscore_t GetScore() const {
+    return m_score;
+  }
+  void SetScore(statscore_t score) {
+    m_score = score;
+  }
 };
 
 }
diff --git a/mert/PointTest.cpp b/mert/PointTest.cpp
index df270dec9..f9e8e8bb2 100644
--- a/mert/PointTest.cpp
+++ b/mert/PointTest.cpp
@@ -9,7 +9,8 @@ using namespace std;
 
 using namespace MosesTuning;
 
-BOOST_AUTO_TEST_CASE(point_operators) {
+BOOST_AUTO_TEST_CASE(point_operators)
+{
   const unsigned int dim = 5;
   vector<float> init(dim);
   init[0] = 1.0f;
diff --git a/mert/PreProcessFilter.cpp b/mert/PreProcessFilter.cpp
index da26177f7..4fbcc0c89 100644
--- a/mert/PreProcessFilter.cpp
+++ b/mert/PreProcessFilter.cpp
@@ -18,7 +18,7 @@ using namespace std;
 
 namespace MosesTuning
 {
-  
+
 
 // Child exec error signal
 void exec_failed (int sig)
@@ -28,116 +28,108 @@ void exec_failed (int sig)
 }
 
 PreProcessFilter::PreProcessFilter(const string& filterCommand)
-    : m_toFilter(NULL),
-      m_fromFilter(NULL)
+  : m_toFilter(NULL),
+    m_fromFilter(NULL)
 {
-    // Child error signal install
-    // sigaction is the replacement for the traditional signal() method
-    struct sigaction action;
-    action.sa_handler = exec_failed;
-    sigemptyset(&action.sa_mask);
-    action.sa_flags = 0;
-    if (sigaction(SIGUSR1, &action, NULL) < 0)
-    {
-        perror("SIGUSR1 install error");
-        exit(EXIT_FAILURE);
-    }
+  // Child error signal install
+  // sigaction is the replacement for the traditional signal() method
+  struct sigaction action;
+  action.sa_handler = exec_failed;
+  sigemptyset(&action.sa_mask);
+  action.sa_flags = 0;
+  if (sigaction(SIGUSR1, &action, NULL) < 0) {
+    perror("SIGUSR1 install error");
+    exit(EXIT_FAILURE);
+  }
 
-    int pipe_status;
-    int pipefds_input[2];
-    int pipefds_output[2];
-    // int pipefds_error[2];
+  int pipe_status;
+  int pipefds_input[2];
+  int pipefds_output[2];
+  // int pipefds_error[2];
 
-    // Create the pipes
-    // We do this before the fork so both processes will know about
-    // the same pipe and they can communicate.
+  // Create the pipes
+  // We do this before the fork so both processes will know about
+  // the same pipe and they can communicate.
 
-    pipe_status = pipe(pipefds_input);
-    if (pipe_status == -1)
-    {
-        perror("Error creating the pipe");
-        exit(EXIT_FAILURE);
-    }
+  pipe_status = pipe(pipefds_input);
+  if (pipe_status == -1) {
+    perror("Error creating the pipe");
+    exit(EXIT_FAILURE);
+  }
 
-    pipe_status = pipe(pipefds_output);
-    if (pipe_status == -1)
-    {
-        perror("Error creating the pipe");
-        exit(EXIT_FAILURE);
-    }
+  pipe_status = pipe(pipefds_output);
+  if (pipe_status == -1) {
+    perror("Error creating the pipe");
+    exit(EXIT_FAILURE);
+  }
 
-    /*
-    pipe_status = pipe(pipefds_error);
-    if (pipe_status == -1)
-    {
-        perror("Error creating the pipe");
-        exit(EXIT_FAILURE);
-    }
-    */
+  /*
+  pipe_status = pipe(pipefds_error);
+  if (pipe_status == -1)
+  {
+    perror("Error creating the pipe");
+    exit(EXIT_FAILURE);
+  }
+  */
 
-    pid_t pid;
-    // Create child process; both processes continue from here
-    pid = fork();
+  pid_t pid;
+  // Create child process; both processes continue from here
+  pid = fork();
 
-    if (pid == pid_t(0))
-    {
-        // Child process
+  if (pid == pid_t(0)) {
+    // Child process
 
-        // When the child process finishes, it sends a SIGCHLD signal
-        // to the parent
+    // When the child process finishes, it sends a SIGCHLD signal
+    // to the parent
 
-        // Tie the standard input, output and error streams to the
-        // appropriate pipe ends
-        // The file descriptor 0 is the standard input
-        // We tie it to the read end of the pipe as we will use
-        // this end of the pipe to read from it
-        dup2 (CHILD_STDIN_READ,0);
-        dup2 (CHILD_STDOUT_WRITE,1);
-        // dup2 (CHILD_STDERR_WRITE,2);
-        // Close in the child the unused ends of the pipes
-        close(CHILD_STDIN_WRITE);
-        close(CHILD_STDOUT_READ);
-        //close(CHILD_STDERR_READ);
+    // Tie the standard input, output and error streams to the
+    // appropriate pipe ends
+    // The file descriptor 0 is the standard input
+    // We tie it to the read end of the pipe as we will use
+    // this end of the pipe to read from it
+    dup2 (CHILD_STDIN_READ,0);
+    dup2 (CHILD_STDOUT_WRITE,1);
+    // dup2 (CHILD_STDERR_WRITE,2);
+    // Close in the child the unused ends of the pipes
+    close(CHILD_STDIN_WRITE);
+    close(CHILD_STDOUT_READ);
+    //close(CHILD_STDERR_READ);
 
-        // Execute the program
-        execl("/bin/bash", "bash", "-c", filterCommand.c_str() , (char*)NULL);
+    // Execute the program
+    execl("/bin/bash", "bash", "-c", filterCommand.c_str() , (char*)NULL);
 
-        // We should never reach this point
-        // Tell the parent the exec failed
-        kill(getppid(), SIGUSR1);
-        exit(EXIT_FAILURE);
-    }
-    else if (pid > pid_t(0))
-    {
-        // Parent
+    // We should never reach this point
+    // Tell the parent the exec failed
+    kill(getppid(), SIGUSR1);
+    exit(EXIT_FAILURE);
+  } else if (pid > pid_t(0)) {
+    // Parent
 
-        // Close in the parent the unused ends of the pipes
-        close(CHILD_STDIN_READ);
-        close(CHILD_STDOUT_WRITE);
-        // close(CHILD_STDERR_WRITE);
+    // Close in the parent the unused ends of the pipes
+    close(CHILD_STDIN_READ);
+    close(CHILD_STDOUT_WRITE);
+    // close(CHILD_STDERR_WRITE);
 
-        m_toFilter = new ofdstream(CHILD_STDIN_WRITE);
-        m_fromFilter = new ifdstream(CHILD_STDOUT_READ);
-    }
-    else
-    {
-        perror("Error: fork failed");
-        exit(EXIT_FAILURE);
-    }
+    m_toFilter = new ofdstream(CHILD_STDIN_WRITE);
+    m_fromFilter = new ifdstream(CHILD_STDOUT_READ);
+  } else {
+    perror("Error: fork failed");
+    exit(EXIT_FAILURE);
+  }
 }
 
 string PreProcessFilter::ProcessSentence(const string& sentence)
 {
-    *m_toFilter << sentence << "\n";
-    string processedSentence;
-    m_fromFilter->getline(processedSentence);
-    return processedSentence;
+  *m_toFilter << sentence << "\n";
+  string processedSentence;
+  m_fromFilter->getline(processedSentence);
+  return processedSentence;
 }
 
 PreProcessFilter::~PreProcessFilter()
 {
-    delete m_toFilter;
-    delete m_fromFilter;
+  delete m_toFilter;
+  delete m_fromFilter;
 }
 
 }
diff --git a/mert/PreProcessFilter.h b/mert/PreProcessFilter.h
index 25e627f6d..9946ddccb 100644
--- a/mert/PreProcessFilter.h
+++ b/mert/PreProcessFilter.h
@@ -5,7 +5,7 @@
 namespace MosesTuning
 {
-  
+
 
 class ofdstream;
 class ifdstream;
@@ -22,8 +22,8 @@ public:
   ~PreProcessFilter();
 
 private:
-    ofdstream* m_toFilter;
-    ifdstream* m_fromFilter;
+  ofdstream* m_toFilter;
+  ifdstream* m_fromFilter;
 };
 
 }
diff --git a/mert/Reference.h b/mert/Reference.h
index 1d6869a12..2c12f2ed7 100644
--- a/mert/Reference.h
+++ b/mert/Reference.h
@@ -9,38 +9,57 @@
 namespace MosesTuning
 {
-  
+
 
 /**
 * Reference class represents reference translations for an output
 * translation used in calculating BLEU score.
 */
-class Reference {
- public:
+class Reference
+{
+public:
  // for m_length
  typedef std::vector<std::size_t>::iterator iterator;
  typedef std::vector<std::size_t>::const_iterator const_iterator;
 
  Reference() : m_counts(new NgramCounts) { }
-  ~Reference() { delete m_counts; }
+  ~Reference() {
+    delete m_counts;
+  }
 
-  NgramCounts* get_counts() { return m_counts; }
-  const NgramCounts* get_counts() const { return m_counts; }
+  NgramCounts* get_counts() {
+    return m_counts;
+  }
+  const NgramCounts* get_counts() const {
+    return m_counts;
+  }
 
-  iterator begin() { return m_length.begin(); }
-  const_iterator begin() const { return m_length.begin(); }
-  iterator end() { return m_length.end(); }
-  const_iterator end() const { return m_length.end(); }
+  iterator begin() {
+    return m_length.begin();
+  }
+  const_iterator begin() const {
+    return m_length.begin();
+  }
+  iterator end() {
+    return m_length.end();
+  }
+  const_iterator end() const {
+    return m_length.end();
+  }
 
-  void push_back(std::size_t len) { m_length.push_back(len); }
+  void push_back(std::size_t len) {
+    m_length.push_back(len);
+  }
 
-  std::size_t num_references() const { return m_length.size(); }
+  std::size_t num_references() const {
+    return m_length.size();
+  }
 
  int CalcAverage() const;
  int CalcClosest(std::size_t length) const;
  int CalcShortest() const;
 
- private:
+private:
  NgramCounts* m_counts;
 
  // multiple reference lengths
@@ -49,16 +68,18 @@ class Reference {
 
 // TODO(tetsuok): fix this function and related stuff.
 // "average" reference length should not be calculated at sentence-level unlike "closest".
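// Worked example of the three strategies implemented below (illustration only):
// with reference lengths m_length = {4, 2, 8} and a candidate of length 5,
//   CalcAverage()  -> int(14 / 3.0) = 4   (truncated, hence the TODO above)
//   CalcClosest(5) -> 4                   (|5-4| = 1 beats |5-2| = |5-8| = 3)
//   CalcShortest() -> 2
// The chosen reference length then feeds BLEU's brevity penalty.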
-inline int Reference::CalcAverage() const {
+inline int Reference::CalcAverage() const
+{
   int total = 0;
   for (std::size_t i = 0; i < m_length.size(); ++i) {
     total += m_length[i];
   }
   return static_cast<int>(
-      static_cast<float>(total) / m_length.size());
+           static_cast<float>(total) / m_length.size());
 }
 
-inline int Reference::CalcClosest(std::size_t length) const {
+inline int Reference::CalcClosest(std::size_t length) const
+{
   int min_diff = INT_MAX;
   int closest_ref_id = 0; // an index of the closest reference translation
   for (std::size_t i = 0; i < m_length.size(); ++i) {
@@ -79,7 +100,8 @@ inline int Reference::CalcClosest(std::size_t length) const {
   return static_cast<int>(m_length[closest_ref_id]);
 }
 
-inline int Reference::CalcShortest() const {
+inline int Reference::CalcShortest() const
+{
   return *std::min_element(m_length.begin(), m_length.end());
 }
diff --git a/mert/ReferenceTest.cpp b/mert/ReferenceTest.cpp
index ad76de1f7..c33321227 100644
--- a/mert/ReferenceTest.cpp
+++ b/mert/ReferenceTest.cpp
@@ -5,12 +5,14 @@
 
 using namespace MosesTuning;
 
-BOOST_AUTO_TEST_CASE(refernece_count) {
+BOOST_AUTO_TEST_CASE(refernece_count)
+{
   Reference ref;
   BOOST_CHECK(ref.get_counts() != NULL);
 }
 
-BOOST_AUTO_TEST_CASE(refernece_length_iterator) {
+BOOST_AUTO_TEST_CASE(refernece_length_iterator)
+{
   Reference ref;
   ref.push_back(4);
   ref.push_back(2);
@@ -24,7 +26,8 @@ BOOST_AUTO_TEST_CASE(refernece_length_iterator) {
   BOOST_CHECK(it == ref.end());
 }
 
-BOOST_AUTO_TEST_CASE(refernece_length_average) {
+BOOST_AUTO_TEST_CASE(refernece_length_average)
+{
   {
     Reference ref;
     ref.push_back(4);
@@ -49,7 +52,8 @@ BOOST_AUTO_TEST_CASE(refernece_length_average) {
   }
 }
 
-BOOST_AUTO_TEST_CASE(refernece_length_closest) {
+BOOST_AUTO_TEST_CASE(refernece_length_closest)
+{
   {
     Reference ref;
     ref.push_back(4);
@@ -92,7 +96,8 @@ BOOST_AUTO_TEST_CASE(refernece_length_closest) {
   }
 }
 
-BOOST_AUTO_TEST_CASE(refernece_length_shortest) {
+BOOST_AUTO_TEST_CASE(refernece_length_shortest)
+{
   {
     Reference ref;
     ref.push_back(4);
diff --git a/mert/ScopedVector.h b/mert/ScopedVector.h
index c87f07071..bd9251a7c 100644
--- a/mert/ScopedVector.h
+++ b/mert/ScopedVector.h
@@ -5,19 +5,26 @@
 namespace MosesTuning
 {
-  
+
 
 template <class T>
-class ScopedVector {
- public:
+class ScopedVector
+{
+public:
   typedef typename std::vector<T*>::iterator iterator;
   typedef typename std::vector<T*>::const_iterator const_iterator;
 
   ScopedVector() {}
-  virtual ~ScopedVector() { reset(); }
+  virtual ~ScopedVector() {
+    reset();
+  }
 
-  bool empty() const { return m_vec.empty(); }
+  bool empty() const {
+    return m_vec.empty();
+  }
 
-  void push_back(T *e) { m_vec.push_back(e); }
+  void push_back(T *e) {
+    m_vec.push_back(e);
+  }
 
   void reset() {
     for (iterator it = m_vec.begin(); it != m_vec.end(); ++it) {
@@ -26,27 +33,53 @@ class ScopedVector {
     m_vec.clear();
   }
 
-  void reserve(std::size_t capacity) { m_vec.reserve(capacity); }
-  void resize(std::size_t size) { m_vec.resize(size); }
+  void reserve(std::size_t capacity) {
+    m_vec.reserve(capacity);
+  }
+  void resize(std::size_t size) {
+    m_vec.resize(size);
+  }
 
-  std::size_t size() const {return m_vec.size(); }
+  std::size_t size() const {
+    return m_vec.size();
+  }
 
-  iterator begin() { return m_vec.begin(); }
-  const_iterator begin() const { return m_vec.begin(); }
+  iterator begin() {
+    return m_vec.begin();
+  }
+  const_iterator begin() const {
+    return m_vec.begin();
+  }
 
-  iterator end() { return m_vec.end(); }
-  const_iterator end() const { return m_vec.end(); }
+  iterator end() {
+    return m_vec.end();
+  }
+  const_iterator end() const {
+    return m_vec.end();
+ } - std::vector& get() { return m_vec; } - const std::vector& get() const { return m_vec; } + std::vector& get() { + return m_vec; + } + const std::vector& get() const { + return m_vec; + } - std::vector* operator->() { return &m_vec; } - const std::vector* operator->() const { return &m_vec; } + std::vector* operator->() { + return &m_vec; + } + const std::vector* operator->() const { + return &m_vec; + } - T*& operator[](std::size_t i) { return m_vec[i]; } - const T* operator[](std::size_t i) const { return m_vec[i]; } + T*& operator[](std::size_t i) { + return m_vec[i]; + } + const T* operator[](std::size_t i) const { + return m_vec[i]; + } - private: +private: std::vector m_vec; // no copying allowed. diff --git a/mert/ScoreArray.cpp b/mert/ScoreArray.cpp index dcd0f7680..dd9aa5b07 100644 --- a/mert/ScoreArray.cpp +++ b/mert/ScoreArray.cpp @@ -17,12 +17,12 @@ namespace MosesTuning ScoreArray::ScoreArray() - : m_num_scores(0), m_index(0) {} + : m_num_scores(0), m_index(0) {} void ScoreArray::savetxt(ostream* os, const string& sctype) { *os << SCORES_TXT_BEGIN << " " << m_index << " " << m_array.size() - << " " << m_num_scores << " " << sctype << endl; + << " " << m_num_scores << " " << sctype << endl; for (scorearray_t::iterator i = m_array.begin(); i !=m_array.end(); i++) { i->savetxt(os); *os << endl; @@ -33,7 +33,7 @@ void ScoreArray::savetxt(ostream* os, const string& sctype) void ScoreArray::savebin(ostream* os, const string& score_type) { *os << SCORES_BIN_BEGIN << " " << m_index << " " << m_array.size() - << " " << m_num_scores << " " << score_type << endl; + << " " << m_num_scores << " " << score_type << endl; for (scorearray_t::iterator i = m_array.begin(); i != m_array.end(); i++) { i->savebin(os); @@ -63,7 +63,8 @@ void ScoreArray::save(const string &file, const string& score_type, bool bin) ofs.close(); } -void ScoreArray::save(const string& score_type, bool bin) { +void ScoreArray::save(const string& score_type, bool bin) +{ save(&cout, score_type, bin); } diff --git a/mert/ScoreArray.h b/mert/ScoreArray.h index 5b6c748cb..438b57e3f 100644 --- a/mert/ScoreArray.h +++ b/mert/ScoreArray.h @@ -25,7 +25,7 @@ const char SCORES_BIN_END[] = "SCORES_BIN_END_0"; class ScoreArray { - private: +private: scorearray_t m_array; std::string m_score_type; std::size_t m_num_scores; @@ -38,17 +38,29 @@ public: ScoreArray(); ~ScoreArray() {} - void clear() { m_array.clear(); } + void clear() { + m_array.clear(); + } - int getIndex() const { return m_index; } + int getIndex() const { + return m_index; + } - void setIndex(int value) { m_index = value; } + void setIndex(int value) { + m_index = value; + } - ScoreStats& get(std::size_t i) { return m_array.at(i); } + ScoreStats& get(std::size_t i) { + return m_array.at(i); + } - const ScoreStats& get(std::size_t i) const { return m_array.at(i); } + const ScoreStats& get(std::size_t i) const { + return m_array.at(i); + } - void add(const ScoreStats& e) { m_array.push_back(e); } + void add(const ScoreStats& e) { + m_array.push_back(e); + } //ADDED BY TS void swap(std::size_t i, std::size_t j) { @@ -62,15 +74,25 @@ public: void merge(ScoreArray& e); - std::string name() const { return m_score_type; } + std::string name() const { + return m_score_type; + } - void name(std::string &score_type) { m_score_type = score_type; } + void name(std::string &score_type) { + m_score_type = score_type; + } - std::size_t size() const { return m_array.size(); } + std::size_t size() const { + return m_array.size(); + } - std::size_t NumberOfScores() const { return 
m_num_scores; }
+  std::size_t NumberOfScores() const {
+    return m_num_scores;
+  }
 
-  void NumberOfScores(std::size_t v) { m_num_scores = v; }
+  void NumberOfScores(std::size_t v) {
+    m_num_scores = v;
+  }
 
   void savetxt(std::ostream* os, const std::string& score_type);
   void savebin(std::ostream* os, const std::string& score_type);
diff --git a/mert/ScoreData.cpp b/mert/ScoreData.cpp
index d02a4d700..0906b3459 100644
--- a/mert/ScoreData.cpp
+++ b/mert/ScoreData.cpp
@@ -50,7 +50,8 @@ void ScoreData::save(const string &file, bool bin)
   ofs.close();
 }
 
-void ScoreData::save(bool bin) {
+void ScoreData::save(bool bin)
+{
   save(&cout, bin);
 }
diff --git a/mert/ScoreData.h b/mert/ScoreData.h
index ac3a6faa6..9159e029f 100644
--- a/mert/ScoreData.h
+++ b/mert/ScoreData.h
@@ -40,7 +40,9 @@ public:
   ScoreData(Scorer* scorer);
   ~ScoreData() {}
 
-  void clear() { m_array.clear(); }
+  void clear() {
+    m_array.clear();
+  }
 
   inline ScoreArray& get(std::size_t idx) {
     return m_array.at(idx);
@@ -66,7 +68,9 @@
     return m_array.at(i).get(j);
   }
 
-  std::string name() const { return m_score_type; }
+  std::string name() const {
+    return m_score_type;
+  }
 
   std::string name(const std::string &score_type) {
     return m_score_type = score_type;
@@ -75,8 +79,12 @@
   void add(ScoreArray& e);
   void add(const ScoreStats& e, int sent_idx);
 
-  std::size_t NumberOfScores() const { return m_num_scores; }
-  std::size_t size() const { return m_array.size(); }
+  std::size_t NumberOfScores() const {
+    return m_num_scores;
+  }
+  std::size_t size() const {
+    return m_array.size();
+  }
 
   void save(const std::string &file, bool bin=false);
   void save(std::ostream* os, bool bin=false);
diff --git a/mert/ScoreDataIterator.cpp b/mert/ScoreDataIterator.cpp
index 80568b810..71e05ab0b 100644
--- a/mert/ScoreDataIterator.cpp
+++ b/mert/ScoreDataIterator.cpp
@@ -29,18 +29,20 @@ using namespace util;
 
 namespace MosesTuning
 {
-  
+
 
 ScoreDataIterator::ScoreDataIterator() {}
 
-ScoreDataIterator::ScoreDataIterator(const string& filename) {
+ScoreDataIterator::ScoreDataIterator(const string& filename)
+{
   m_in.reset(new FilePiece(filename.c_str()));
   readNext();
 }
 
 ScoreDataIterator::~ScoreDataIterator() {}
 
-void ScoreDataIterator::readNext() {
+void ScoreDataIterator::readNext()
+{
   m_next.clear();
   try {
     StringPiece marker = m_in->ReadDelimited();
@@ -71,12 +73,14 @@ void ScoreDataIterator::readNext() {
   }
 }
 
-void ScoreDataIterator::increment() {
+void ScoreDataIterator::increment()
+{
   readNext();
 }
 
-bool ScoreDataIterator::equal(const ScoreDataIterator& rhs) const {
+bool ScoreDataIterator::equal(const ScoreDataIterator& rhs) const
+{
   if (!m_in && !rhs.m_in) {
     return true;
   } else if (!m_in) {
@@ -84,13 +88,14 @@ bool ScoreDataIterator::equal(const ScoreDataIterator& rhs) const {
   } else if (!rhs.m_in) {
     return false;
   } else {
-    return m_in->FileName() == rhs.m_in->FileName() &&
-      m_in->Offset() == rhs.m_in->Offset();
+    return m_in->FileName() == rhs.m_in->FileName() &&
+           m_in->Offset() == rhs.m_in->Offset();
   }
 }
 
-const vector<ScoreDataItem>& ScoreDataIterator::dereference() const {
+const vector<ScoreDataItem>& ScoreDataIterator::dereference() const
+{
   return m_next;
 }
diff --git a/mert/ScoreDataIterator.h b/mert/ScoreDataIterator.h
index 50640c158..e5968a8f7 100644
--- a/mert/ScoreDataIterator.h
+++ b/mert/ScoreDataIterator.h
@@ -33,40 +33,43 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 
 #include "FeatureDataIterator.h"
 
-namespace util { class FilePiece; }
+namespace util
+{
+class FilePiece;
+}
 
 namespace MosesTuning
 {
-  
+
 
 typedef std::vector<float> ScoreDataItem;
 
 class ScoreDataIterator : public boost::iterator_facade<ScoreDataIterator,
-  const std::vector<ScoreDataItem>,
-  boost::forward_traversal_tag>
+  const std::vector<ScoreDataItem>,
+  boost::forward_traversal_tag>
 {
- public:
-  ScoreDataIterator();
-  explicit ScoreDataIterator(const std::string& filename);
+public:
+  ScoreDataIterator();
+  explicit ScoreDataIterator(const std::string& filename);
 
-  ~ScoreDataIterator();
+  ~ScoreDataIterator();
 
-  static ScoreDataIterator end() {
-    return ScoreDataIterator();
-  }
+  static ScoreDataIterator end() {
+    return ScoreDataIterator();
+  }
 
- private:
-  friend class boost::iterator_core_access;
+private:
+  friend class boost::iterator_core_access;
 
-  void increment();
-  bool equal(const ScoreDataIterator& rhs) const;
-  const std::vector<ScoreDataItem>& dereference() const;
+  void increment();
+  bool equal(const ScoreDataIterator& rhs) const;
+  const std::vector<ScoreDataItem>& dereference() const;
 
-  void readNext();
+  void readNext();
 
-  boost::shared_ptr<util::FilePiece> m_in;
-  std::vector<ScoreDataItem> m_next;
+  boost::shared_ptr<util::FilePiece> m_in;
+  std::vector<ScoreDataItem> m_next;
 };
 
 }
diff --git a/mert/ScoreStats.cpp b/mert/ScoreStats.cpp
index 20e707005..1c66cdb5f 100644
--- a/mert/ScoreStats.cpp
+++ b/mert/ScoreStats.cpp
@@ -13,21 +13,22 @@
 
 using namespace std;
 
-namespace {
+namespace
+{
 const int kAvailableSize = 8;
 } // namespace
 
 namespace MosesTuning
 {
-  
+
 
 ScoreStats::ScoreStats()
-    : m_available_size(kAvailableSize), m_entries(0),
-      m_array(new ScoreStatsType[m_available_size]) {}
+  : m_available_size(kAvailableSize), m_entries(0),
+    m_array(new ScoreStatsType[m_available_size]) {}
 
 ScoreStats::ScoreStats(const size_t size)
-    : m_available_size(size), m_entries(size),
-      m_array(new ScoreStatsType[m_available_size])
+  : m_available_size(size), m_entries(size),
+    m_array(new ScoreStatsType[m_available_size])
 {
   memset(m_array, 0, GetArraySizeWithBytes());
 }
@@ -123,7 +124,8 @@ void ScoreStats::savetxt(ostream* os)
   *os << *this;
 }
 
-void ScoreStats::savetxt() {
+void ScoreStats::savetxt()
+{
   savetxt(&cout);
 }
@@ -140,7 +142,8 @@ ostream& operator<<(ostream& o, const ScoreStats& e)
   return o;
 }
 
-bool operator==(const ScoreStats& s1, const ScoreStats& s2) {
+bool operator==(const ScoreStats& s1, const ScoreStats& s2)
+{
   size_t size = s1.size();
 
   if (size != s2.size())
diff --git a/mert/ScoreStats.h b/mert/ScoreStats.h
index 4088b655e..930b03cea 100644
--- a/mert/ScoreStats.h
+++ b/mert/ScoreStats.h
@@ -18,7 +18,7 @@
 namespace MosesTuning
 {
-  
+
 
 class ScoreStats
 {
@@ -41,7 +41,9 @@ public:
 
   void Copy(const ScoreStats &stats);
 
-  bool isfull() const { return (m_entries < m_available_size) ?
0 : 1; + } void expand(); void add(ScoreStatsType v); @@ -55,9 +57,15 @@ public: clear(); } - ScoreStatsType get(std::size_t i) { return m_array[i]; } - ScoreStatsType get(std::size_t i) const { return m_array[i]; } - scorestats_t getArray() const { return m_array; } + ScoreStatsType get(std::size_t i) { + return m_array[i]; + } + ScoreStatsType get(std::size_t i) const { + return m_array[i]; + } + scorestats_t getArray() const { + return m_array; + } void set(const std::string& str); @@ -69,15 +77,21 @@ public: } } - std::size_t bytes() const { return GetArraySizeWithBytes(); } + std::size_t bytes() const { + return GetArraySizeWithBytes(); + } std::size_t GetArraySizeWithBytes() const { return m_entries * sizeof(ScoreStatsType); } - std::size_t size() const { return m_entries; } + std::size_t size() const { + return m_entries; + } - std::size_t available() const { return m_available_size; } + std::size_t available() const { + return m_available_size; + } void savetxt(const std::string &file); void savetxt(std::ostream* os); diff --git a/mert/Scorer.cpp b/mert/Scorer.cpp index 1a7136dc0..e3cc3d7e6 100644 --- a/mert/Scorer.cpp +++ b/mert/Scorer.cpp @@ -12,27 +12,31 @@ using namespace std; namespace MosesTuning { -namespace { +namespace +{ // For tokenizing a hypothesis translation, we may encounter unknown tokens which // do not exist in the corresponding reference translations. const int kUnknownToken = -1; } // namespace Scorer::Scorer(const string& name, const string& config) - : m_name(name), - m_vocab(mert::VocabularyFactory::GetVocabulary()), - m_filter(NULL), - m_score_data(NULL), - m_enable_preserve_case(true) { + : m_name(name), + m_vocab(mert::VocabularyFactory::GetVocabulary()), + m_filter(NULL), + m_score_data(NULL), + m_enable_preserve_case(true) +{ InitConfig(config); } -Scorer::~Scorer() { +Scorer::~Scorer() +{ Singleton::Delete(); delete m_filter; } -void Scorer::InitConfig(const string& config) { +void Scorer::InitConfig(const string& config) +{ // cerr << "Scorer config string: " << config << endl; size_t start = 0; while (start < config.size()) { @@ -53,7 +57,8 @@ void Scorer::InitConfig(const string& config) { } } -void Scorer::TokenizeAndEncode(const string& line, vector& encoded) { +void Scorer::TokenizeAndEncode(const string& line, vector& encoded) +{ for (util::TokenIter it(line, util::AnyCharacter(" ")); it; ++it) { if (!m_enable_preserve_case) { @@ -69,7 +74,8 @@ void Scorer::TokenizeAndEncode(const string& line, vector& encoded) { } } -void Scorer::TokenizeAndEncodeTesting(const string& line, vector& encoded) { +void Scorer::TokenizeAndEncodeTesting(const string& line, vector& encoded) +{ for (util::TokenIter it(line, util::AnyCharacter(" ")); it; ++it) { if (!m_enable_preserve_case) { @@ -103,8 +109,7 @@ void Scorer::setFactors(const string& factors) if (factors.empty()) return; vector factors_vec; split(factors, '|', factors_vec); - for(vector::iterator it = factors_vec.begin(); it != factors_vec.end(); ++it) - { + for(vector::iterator it = factors_vec.begin(); it != factors_vec.end(); ++it) { int factor = atoi(it->c_str()); m_factors.push_back(factor); } @@ -115,8 +120,8 @@ void Scorer::setFactors(const string& factors) */ void Scorer::setFilter(const string& filterCommand) { - if (filterCommand.empty()) return; - m_filter = new PreProcessFilter(filterCommand); + if (filterCommand.empty()) return; + m_filter = new PreProcessFilter(filterCommand); } /** @@ -130,8 +135,7 @@ string Scorer::applyFactors(const string& sentence) const split(sentence, ' ', tokens); 
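// Illustration (not part of the patch): with m_factors = {1}, a factored
// sentence such as "the|DT cat|NN" comes out of this function as "DT NN";
// each token is split on '|' below and only the configured factor indices
// are kept, in the order given.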
stringstream sstream; - for (size_t i = 0; i < tokens.size(); ++i) - { + for (size_t i = 0; i < tokens.size(); ++i) { if (tokens[i] == "") continue; vector factors; @@ -141,8 +145,7 @@ string Scorer::applyFactors(const string& sentence) const if (i > 0) sstream << " "; - for (size_t j = 0; j < m_factors.size(); ++j) - { + for (size_t j = 0; j < m_factors.size(); ++j) { int findex = m_factors[j]; if (findex < 0 || findex >= fsize) throw runtime_error("Factor index is out of range."); @@ -158,17 +161,15 @@ string Scorer::applyFactors(const string& sentence) const */ string Scorer::applyFilter(const string& sentence) const { - if (m_filter) - { + if (m_filter) { return m_filter->ProcessSentence(sentence); - } - else - { + } else { return sentence; } } -float Scorer::score(const candidates_t& candidates) const { +float Scorer::score(const candidates_t& candidates) const +{ diffs_t diffs; statscores_t scores; score(candidates, diffs, scores); diff --git a/mert/Scorer.h b/mert/Scorer.h index 0a090d1c4..4a1f4476a 100644 --- a/mert/Scorer.h +++ b/mert/Scorer.h @@ -10,7 +10,8 @@ #include "Types.h" #include "ScoreData.h" -namespace mert { +namespace mert +{ class Vocabulary; @@ -32,7 +33,7 @@ enum ScorerRegularisationStrategy {REG_NONE, REG_AVERAGE, REG_MINIMUM}; */ class Scorer { - public: +public: Scorer(const std::string& name, const std::string& config); virtual ~Scorer(); @@ -117,14 +118,16 @@ class Scorer */ virtual void setFactors(const std::string& factors); - mert::Vocabulary* GetVocab() const { return m_vocab; } + mert::Vocabulary* GetVocab() const { + return m_vocab; + } /** * Set unix filter, which will be used to preprocess the sentences */ virtual void setFilter(const std::string& filterCommand); - private: +private: void InitConfig(const std::string& config); /** @@ -143,7 +146,7 @@ class Scorer std::vector m_factors; PreProcessFilter* m_filter; - protected: +protected: ScoreData* m_score_data; bool m_enable_preserve_case; @@ -173,40 +176,40 @@ class Scorer /** * Every inherited scorer should call this function for each sentence */ - std::string preprocessSentence(const std::string& sentence) const - { + std::string preprocessSentence(const std::string& sentence) const { return applyFactors(applyFilter(sentence)); } }; -namespace { +namespace +{ - //regularisation strategies - inline float score_min(const statscores_t& scores, size_t start, size_t end) - { - float min = std::numeric_limits::max(); - for (size_t i = start; i < end; ++i) { - if (scores[i] < min) { - min = scores[i]; - } +//regularisation strategies +inline float score_min(const statscores_t& scores, size_t start, size_t end) +{ + float min = std::numeric_limits::max(); + for (size_t i = start; i < end; ++i) { + if (scores[i] < min) { + min = scores[i]; } - return min; + } + return min; +} + +inline float score_average(const statscores_t& scores, size_t start, size_t end) +{ + if ((end - start) < 1) { + // this shouldn't happen + return 0; + } + float total = 0; + for (size_t j = start; j < end; ++j) { + total += scores[j]; } - inline float score_average(const statscores_t& scores, size_t start, size_t end) - { - if ((end - start) < 1) { - // this shouldn't happen - return 0; - } - float total = 0; - for (size_t j = start; j < end; ++j) { - total += scores[j]; - } - - return total / (end - start); - } + return total / (end - start); +} } // namespace diff --git a/mert/ScorerFactory.cpp b/mert/ScorerFactory.cpp index 126218b65..02000c1bc 100644 --- a/mert/ScorerFactory.cpp +++ b/mert/ScorerFactory.cpp @@ -14,9 +14,10 @@ 
using namespace std; namespace MosesTuning { - -vector ScorerFactory::getTypes() { + +vector ScorerFactory::getTypes() +{ vector types; types.push_back(string("BLEU")); types.push_back(string("PER")); @@ -29,7 +30,8 @@ vector ScorerFactory::getTypes() { return types; } -Scorer* ScorerFactory::getScorer(const string& type, const string& config) { +Scorer* ScorerFactory::getScorer(const string& type, const string& config) +{ if (type == "BLEU") { return new BleuScorer(config); } else if (type == "PER") { @@ -48,8 +50,7 @@ Scorer* ScorerFactory::getScorer(const string& type, const string& config) { } else { if (type.find(',') != string::npos) { return new InterpolatedScorer(type, config); - } - else { + } else { throw runtime_error("Unknown scorer type: " + type); } } diff --git a/mert/ScorerFactory.h b/mert/ScorerFactory.h index e8b33d87c..b93db3024 100644 --- a/mert/ScorerFactory.h +++ b/mert/ScorerFactory.h @@ -6,7 +6,7 @@ namespace MosesTuning { - + class Scorer; diff --git a/mert/SemposOverlapping.cpp b/mert/SemposOverlapping.cpp index ffcabaab2..718bc7f26 100644 --- a/mert/SemposOverlapping.cpp +++ b/mert/SemposOverlapping.cpp @@ -6,7 +6,8 @@ using namespace std; -namespace { +namespace +{ MosesTuning::SemposOverlapping* g_overlapping = NULL; @@ -14,9 +15,10 @@ MosesTuning::SemposOverlapping* g_overlapping = NULL; namespace MosesTuning { - -SemposOverlapping* SemposOverlappingFactory::GetOverlapping(const string& str, const SemposScorer* sempos) { + +SemposOverlapping* SemposOverlappingFactory::GetOverlapping(const string& str, const SemposScorer* sempos) +{ if (str == "cap-micro") { return new CapMicroOverlapping(sempos); } else if (str == "cap-macro") { @@ -26,7 +28,8 @@ SemposOverlapping* SemposOverlappingFactory::GetOverlapping(const string& str, c } } -void SemposOverlappingFactory::SetOverlapping(SemposOverlapping* ovr) { +void SemposOverlappingFactory::SetOverlapping(SemposOverlapping* ovr) +{ g_overlapping = ovr; } @@ -41,15 +44,13 @@ vector CapMicroOverlapping::prepareStats(const sentence_t& cand, const sent int multCoeff = 1000; float interSum = 0; - for (sentence_t::iterator it = intersection.begin(); it != intersection.end(); it++) - { + for (sentence_t::iterator it = intersection.begin(); it != intersection.end(); it++) { interSum += semposScorer->weight(it->first); } float refSum = 0; - for (sentence_t::iterator it = ref.begin(); it != ref.end(); it++) - { - refSum += semposScorer->weight(it->first); + for (sentence_t::iterator it = ref.begin(); it != ref.end(); it++) { + refSum += semposScorer->weight(it->first); } stats[0] = (int)(multCoeff * interSum); diff --git a/mert/SemposOverlapping.h b/mert/SemposOverlapping.h index 3b5a99f7f..5eddbaef3 100644 --- a/mert/SemposOverlapping.h +++ b/mert/SemposOverlapping.h @@ -9,7 +9,7 @@ namespace MosesTuning { - + class SemposScorer; @@ -36,14 +36,15 @@ public: virtual std::size_t NumberOfScores() const = 0; }; -class SemposOverlappingFactory { - public: +class SemposOverlappingFactory +{ +public: static SemposOverlapping* GetOverlapping(const std::string& str, const SemposScorer* sempos); // dependency injection for unit testing. 
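// Sketch of the cap-micro arithmetic above (the pairing with calculateScore is
// an assumption, since that method is not shown in this hunk):
//   stats[0] = (int)(1000 * sum of weight(t) for t in candidate/reference intersection)
//   stats[1] = (int)(1000 * sum of weight(t) for t in reference)
//   score    = stats[0] / stats[1]    // weighted micro-averaged overlap
// multCoeff = 1000 just preserves three decimal places across the int truncation.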
static void SetOverlapping(SemposOverlapping* ovr); - private: +private: SemposOverlappingFactory() {} ~SemposOverlappingFactory() {} }; @@ -62,9 +63,11 @@ public: virtual std::vector prepareStats(const sentence_t& cand, const sentence_t& ref); virtual float calculateScore(const std::vector& stats) const; - virtual std::size_t NumberOfScores() const { return 2; } + virtual std::size_t NumberOfScores() const { + return 2; + } - private: +private: // no copying allowed. CapMicroOverlapping(const CapMicroOverlapping&); CapMicroOverlapping& operator=(const CapMicroOverlapping&); @@ -82,9 +85,11 @@ public: virtual std::vector prepareStats(const sentence_t& cand, const sentence_t& ref); virtual float calculateScore(const std::vector& stats) const; - virtual std::size_t NumberOfScores() const { return kMaxNOC * 2; } + virtual std::size_t NumberOfScores() const { + return kMaxNOC * 2; + } - private: +private: // no copying allowed. CapMacroOverlapping(const CapMacroOverlapping&); CapMacroOverlapping& operator=(const CapMacroOverlapping&); diff --git a/mert/SemposScorer.cpp b/mert/SemposScorer.cpp index 8dd1fc8ee..235f73fbf 100644 --- a/mert/SemposScorer.cpp +++ b/mert/SemposScorer.cpp @@ -12,7 +12,7 @@ using namespace std; namespace MosesTuning { - + SemposScorer::SemposScorer(const string& config) : StatisticsBasedScorer("SEMPOS", config), @@ -25,8 +25,7 @@ SemposScorer::SemposScorer(const string& config) m_semposMap.clear(); string weightsfile = getConfig("weightsfile", ""); - if (weightsfile != "") - { + if (weightsfile != "") { loadWeights(weightsfile); } } @@ -144,42 +143,35 @@ int SemposScorer::encodeSempos(const string& sempos) float SemposScorer::weight(int item) const { - std::map::const_iterator it = weightsMap.find(item); - if (it == weightsMap.end()) - { - return 1.0f; - } - else - { - return it->second; - } + std::map::const_iterator it = weightsMap.find(item); + if (it == weightsMap.end()) { + return 1.0f; + } else { + return it->second; + } } void SemposScorer::loadWeights(const string& weightsfile) { - string line; - ifstream myfile; - myfile.open(weightsfile.c_str(), ifstream::in); - if (myfile.is_open()) - { - while ( myfile.good() ) - { - getline (myfile,line); - vector fields; - if (line == "") continue; - split(line, '\t', fields); - if (fields.size() != 2) throw std::runtime_error("Bad format of a row in weights file."); - int encoded = encodeString(fields[0]); - float weight = atof(fields[1].c_str()); - weightsMap[encoded] = weight; - } - myfile.close(); - } - else - { - cerr << "Unable to open file "<< weightsfile << endl; - exit(1); + string line; + ifstream myfile; + myfile.open(weightsfile.c_str(), ifstream::in); + if (myfile.is_open()) { + while ( myfile.good() ) { + getline (myfile,line); + vector fields; + if (line == "") continue; + split(line, '\t', fields); + if (fields.size() != 2) throw std::runtime_error("Bad format of a row in weights file."); + int encoded = encodeString(fields[0]); + float weight = atof(fields[1].c_str()); + weightsMap[encoded] = weight; } + myfile.close(); + } else { + cerr << "Unable to open file "<< weightsfile << endl; + exit(1); + } } diff --git a/mert/SemposScorer.h b/mert/SemposScorer.h index bde064349..b6c735bbe 100644 --- a/mert/SemposScorer.h +++ b/mert/SemposScorer.h @@ -19,7 +19,7 @@ namespace MosesTuning { - + /** * This class represents sempos based metrics. 
@@ -32,12 +32,16 @@ public: virtual void setReferenceFiles(const std::vector& referenceFiles); virtual void prepareStats(std::size_t sindex, const std::string& text, ScoreStats& entry); - virtual std::size_t NumberOfScores() const { return m_ovr->NumberOfScores(); } + virtual std::size_t NumberOfScores() const { + return m_ovr->NumberOfScores(); + } virtual float calculateScore(const std::vector& comps) const { return m_ovr->calculateScore(comps); } - bool EnableDebug() const { return m_enable_debug; } + bool EnableDebug() const { + return m_enable_debug; + } float weight(int item) const; diff --git a/mert/SentenceLevelScorer.cpp b/mert/SentenceLevelScorer.cpp index 0b159f0b7..0d1c15140 100644 --- a/mert/SentenceLevelScorer.cpp +++ b/mert/SentenceLevelScorer.cpp @@ -17,48 +17,50 @@ namespace MosesTuning { SentenceLevelScorer::SentenceLevelScorer(const string& name, const string& config) - : Scorer(name, config), - m_regularisationStrategy(REG_NONE), - m_regularisationWindow(0) { + : Scorer(name, config), + m_regularisationStrategy(REG_NONE), + m_regularisationWindow(0) +{ Init(); } SentenceLevelScorer::~SentenceLevelScorer() {} -void SentenceLevelScorer::Init() { - // Configure regularisation. - static string KEY_TYPE = "regtype"; - static string KEY_WINDOW = "regwin"; - static string KEY_CASE = "case"; - static string TYPE_NONE = "none"; - static string TYPE_AVERAGE = "average"; - static string TYPE_MINIMUM = "min"; - static string TRUE = "true"; - static string FALSE = "false"; +void SentenceLevelScorer::Init() +{ + // Configure regularisation. + static string KEY_TYPE = "regtype"; + static string KEY_WINDOW = "regwin"; + static string KEY_CASE = "case"; + static string TYPE_NONE = "none"; + static string TYPE_AVERAGE = "average"; + static string TYPE_MINIMUM = "min"; + static string TRUE = "true"; + static string FALSE = "false"; - const string type = getConfig(KEY_TYPE, TYPE_NONE); - if (type == TYPE_NONE) { - m_regularisationStrategy = REG_NONE; - } else if (type == TYPE_AVERAGE) { - m_regularisationStrategy = REG_AVERAGE; - } else if (type == TYPE_MINIMUM) { - m_regularisationStrategy = REG_MINIMUM; - } else { - throw boost::lexer::runtime_error("Unknown scorer regularisation strategy: " + type); - } - cerr << "Using scorer regularisation strategy: " << type << endl; + const string type = getConfig(KEY_TYPE, TYPE_NONE); + if (type == TYPE_NONE) { + m_regularisationStrategy = REG_NONE; + } else if (type == TYPE_AVERAGE) { + m_regularisationStrategy = REG_AVERAGE; + } else if (type == TYPE_MINIMUM) { + m_regularisationStrategy = REG_MINIMUM; + } else { + throw boost::lexer::runtime_error("Unknown scorer regularisation strategy: " + type); + } + cerr << "Using scorer regularisation strategy: " << type << endl; - const string window = getConfig(KEY_WINDOW, "0"); - m_regularisationWindow = atoi(window.c_str()); - cerr << "Using scorer regularisation window: " << m_regularisationWindow << endl; + const string window = getConfig(KEY_WINDOW, "0"); + m_regularisationWindow = atoi(window.c_str()); + cerr << "Using scorer regularisation window: " << m_regularisationWindow << endl; - const string preservecase = getConfig(KEY_CASE, TRUE); - if (preservecase == TRUE) { - m_enable_preserve_case = true; - } else if (preservecase == FALSE) { - m_enable_preserve_case = false; - } - cerr << "Using case preservation: " << m_enable_preserve_case << endl; + const string preservecase = getConfig(KEY_CASE, TRUE); + if (preservecase == TRUE) { + m_enable_preserve_case = true; + } else if (preservecase == 
FALSE) { + m_enable_preserve_case = false; + } + cerr << "Using case preservation: " << m_enable_preserve_case << endl; } void SentenceLevelScorer::score(const candidates_t& candidates, const diffs_t& diffs, @@ -83,8 +85,8 @@ void SentenceLevelScorer::score(const candidates_t& candidates, const diffs_t& if (stats.size() != totals.size()) { stringstream msg; msg << "Statistics for (" << "," << candidates[i] << ") have incorrect " - << "number of fields. Found: " << stats.size() << " Expected: " - << totals.size(); + << "number of fields. Found: " << stats.size() << " Expected: " + << totals.size(); throw runtime_error(msg.str()); } //Add up scores for all sentences, would normally be just one score diff --git a/mert/Singleton.h b/mert/Singleton.h index 473517170..f50925fa4 100644 --- a/mert/Singleton.h +++ b/mert/Singleton.h @@ -5,13 +5,14 @@ namespace MosesTuning { - + // thread *un*safe singleton. // TODO: replace this with thread-safe singleton. template -class Singleton { - public: +class Singleton +{ +public: static T* GetInstance() { if (m_instance == NULL) { m_instance = new T; @@ -26,7 +27,7 @@ class Singleton { } } - private: +private: Singleton(); static T* m_instance; }; diff --git a/mert/SingletonTest.cpp b/mert/SingletonTest.cpp index a74ce7c6b..36acbeec2 100644 --- a/mert/SingletonTest.cpp +++ b/mert/SingletonTest.cpp @@ -5,19 +5,24 @@ using namespace MosesTuning; -namespace { +namespace +{ static int g_count = 0; -class Instance { - public: - Instance() { ++g_count; } +class Instance +{ +public: + Instance() { + ++g_count; + } ~Instance() {} }; } // namespace -BOOST_AUTO_TEST_CASE(singleton_basic) { +BOOST_AUTO_TEST_CASE(singleton_basic) +{ Instance* instance1 = Singleton::GetInstance(); Instance* instance2 = Singleton::GetInstance(); Instance* instance3 = Singleton::GetInstance(); diff --git a/mert/StatisticsBasedScorer.cpp b/mert/StatisticsBasedScorer.cpp index 05dd95939..869e2f55a 100644 --- a/mert/StatisticsBasedScorer.cpp +++ b/mert/StatisticsBasedScorer.cpp @@ -13,10 +13,11 @@ using namespace std; namespace MosesTuning { - + StatisticsBasedScorer::StatisticsBasedScorer(const string& name, const string& config) -: Scorer(name,config) { + : Scorer(name,config) +{ //configure regularisation static string KEY_TYPE = "regtype"; static string KEY_WINDOW = "regwin"; @@ -26,7 +27,7 @@ StatisticsBasedScorer::StatisticsBasedScorer(const string& name, const string& c static string TYPE_MINIMUM = "min"; static string TRUE = "true"; static string FALSE = "false"; - + string type = getConfig(KEY_TYPE,TYPE_NONE); if (type == TYPE_NONE) { m_regularization_type = NONE; @@ -38,11 +39,11 @@ StatisticsBasedScorer::StatisticsBasedScorer(const string& name, const string& c throw runtime_error("Unknown scorer regularisation strategy: " + type); } // cerr << "Using scorer regularisation strategy: " << type << endl; - + const string& window = getConfig(KEY_WINDOW, "0"); m_regularization_window = atoi(window.c_str()); // cerr << "Using scorer regularisation window: " << m_regularization_window << endl; - + const string& preserve_case = getConfig(KEY_CASE,TRUE); if (preserve_case == TRUE) { m_enable_preserve_case = true; @@ -72,8 +73,8 @@ void StatisticsBasedScorer::score(const candidates_t& candidates, const diffs_t if (stats.size() != totals.size()) { stringstream msg; msg << "Statistics for (" << "," << candidates[i] << ") have incorrect " - << "number of fields. Found: " << stats.size() << " Expected: " - << totals.size(); + << "number of fields. 
Found: " << stats.size() << " Expected: " + << totals.size(); throw runtime_error(msg.str()); } for (size_t k = 0; k < totals.size(); ++k) { @@ -81,7 +82,7 @@ void StatisticsBasedScorer::score(const candidates_t& candidates, const diffs_t } } scores.push_back(calculateScore(totals)); - + candidates_t last_candidates(candidates); // apply each of the diffs, and get new scores for (size_t i = 0; i < diffs.size(); ++i) { @@ -91,21 +92,21 @@ void StatisticsBasedScorer::score(const candidates_t& candidates, const diffs_t size_t last_nid = last_candidates[sid]; for (size_t k = 0; k < totals.size(); ++k) { int diff = m_score_data->get(sid,nid).get(k) - - m_score_data->get(sid,last_nid).get(k); + - m_score_data->get(sid,last_nid).get(k); totals[k] += diff; } last_candidates[sid] = nid; } scores.push_back(calculateScore(totals)); } - + // Regularisation. This can either be none, or the min or average as described in // Cer, Jurafsky and Manning at WMT08. if (m_regularization_type == NONE || m_regularization_window <= 0) { // no regularisation return; } - + // window size specifies the +/- in each direction statscores_t raw_scores(scores); // copy scores for (size_t i = 0; i < scores.size(); ++i) { diff --git a/mert/StatisticsBasedScorer.h b/mert/StatisticsBasedScorer.h index ca32535ad..644873b60 100644 --- a/mert/StatisticsBasedScorer.h +++ b/mert/StatisticsBasedScorer.h @@ -13,7 +13,7 @@ namespace MosesTuning { - + /** * Abstract base class for Scorers that work by adding statistics across all @@ -26,20 +26,20 @@ public: virtual ~StatisticsBasedScorer() {} virtual void score(const candidates_t& candidates, const diffs_t& diffs, statscores_t& scores) const; - + protected: - + enum RegularisationType { NONE, AVERAGE, MINIMUM }; - + /** * Calculate the actual score. 
*/ virtual statscore_t calculateScore(const std::vector& totals) const = 0; - + // regularisation RegularisationType m_regularization_type; std::size_t m_regularization_window; diff --git a/mert/TerScorer.cpp b/mert/TerScorer.cpp index cc7cf1630..7c11ea66b 100644 --- a/mert/TerScorer.cpp +++ b/mert/TerScorer.cpp @@ -14,10 +14,10 @@ using namespace TERCpp; namespace MosesTuning { - + TerScorer::TerScorer(const string& config) - : StatisticsBasedScorer("TER",config), kLENGTH(2) {} + : StatisticsBasedScorer("TER",config), kLENGTH(2) {} TerScorer::~TerScorer() {} diff --git a/mert/TerScorer.h b/mert/TerScorer.h index 0229f5e8c..5e9fed46f 100644 --- a/mert/TerScorer.h +++ b/mert/TerScorer.h @@ -10,7 +10,7 @@ namespace MosesTuning { - + class ScoreStats; diff --git a/mert/Timer.cpp b/mert/Timer.cpp index 088be93a5..47fa5c750 100644 --- a/mert/Timer.cpp +++ b/mert/Timer.cpp @@ -6,14 +6,17 @@ #include #endif -namespace { +namespace +{ #if !defined(_WIN32) && !defined(_WIN64) -uint64_t GetMicroSeconds(const struct timeval& tv) { +uint64_t GetMicroSeconds(const struct timeval& tv) +{ return static_cast(tv.tv_sec) * 1000000 + tv.tv_usec; } -uint64_t GetTimeOfDayMicroSeconds() { +uint64_t GetTimeOfDayMicroSeconds() +{ struct timeval tv; gettimeofday(&tv, NULL); return static_cast(tv.tv_sec) * 1000000 + tv.tv_usec; @@ -24,9 +27,10 @@ uint64_t GetTimeOfDayMicroSeconds() { namespace MosesTuning { - -void Timer::GetCPUTimeMicroSeconds(Timer::CPUTime* cpu_time) const { + +void Timer::GetCPUTimeMicroSeconds(Timer::CPUTime* cpu_time) const +{ #if !defined(_WIN32) && !defined(_WIN64) struct rusage usage; if (getrusage(RUSAGE_SELF, &usage)) { @@ -41,22 +45,26 @@ void Timer::GetCPUTimeMicroSeconds(Timer::CPUTime* cpu_time) const { #endif } -double Timer::get_elapsed_cpu_time() const { +double Timer::get_elapsed_cpu_time() const +{ return static_cast(get_elapsed_cpu_time_microseconds()) * 1e-6; } -uint64_t Timer::get_elapsed_cpu_time_microseconds() const { +uint64_t Timer::get_elapsed_cpu_time_microseconds() const +{ CPUTime e; GetCPUTimeMicroSeconds(&e); return (e.user_time - m_start_time.user_time) + - (e.sys_time - m_start_time.sys_time); + (e.sys_time - m_start_time.sys_time); } -double Timer::get_elapsed_wall_time() const { +double Timer::get_elapsed_wall_time() const +{ return static_cast(get_elapsed_wall_time_microseconds()) * 1e-6; } -uint64_t Timer::get_elapsed_wall_time_microseconds() const { +uint64_t Timer::get_elapsed_wall_time_microseconds() const +{ return GetTimeOfDayMicroSeconds() - m_wall; } @@ -92,7 +100,8 @@ void Timer::check(const char* msg) } } -std::string Timer::ToString() const { +std::string Timer::ToString() const +{ std::string res; const double wall = get_elapsed_wall_time(); CPUTime e; diff --git a/mert/Timer.h b/mert/Timer.h index bae4ab6b3..2adb86412 100644 --- a/mert/Timer.h +++ b/mert/Timer.h @@ -7,11 +7,11 @@ namespace MosesTuning { - + class Timer { - private: +private: // Time values are stored in microseconds. struct CPUTime { uint64_t user_time; // user CPU time @@ -30,15 +30,15 @@ class Timer Timer(const Timer&); void operator=(const Timer&); - public: +public: /** * 'm_is_running' is initially false. A timer needs to be explicitly started * using 'start'. 
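 *
 * A minimal usage sketch (illustrative only; 'start' is assumed to be
 * callable with no arguments):
 *
 *   Timer timer;
 *   timer.start();
 *   // ... timed work ...
 *   std::cerr << timer << std::endl;  // formatted via operator<< at the end of this header
 *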
*/ Timer() - : m_is_running(false), - m_wall(0), - m_start_time() {} + : m_is_running(false), + m_wall(0), + m_start_time() {} ~Timer() {} @@ -61,7 +61,9 @@ class Timer /** */ - bool is_running() const { return m_is_running; } + bool is_running() const { + return m_is_running; + } /** * Return the total time in seconds that the timer has been in the @@ -97,7 +99,8 @@ class Timer * for an ostream 'os' and a timer 't'. For example, "cout << t" will * print out the total amount of time 't' has been "running". */ -inline std::ostream& operator<<(std::ostream& os, const Timer& t) { +inline std::ostream& operator<<(std::ostream& os, const Timer& t) +{ if (t.is_running()) { os << t.ToString(); } else { diff --git a/mert/TimerTest.cpp b/mert/TimerTest.cpp index 3bf0e5573..d72b1c312 100644 --- a/mert/TimerTest.cpp +++ b/mert/TimerTest.cpp @@ -8,7 +8,8 @@ using namespace MosesTuning; -BOOST_AUTO_TEST_CASE(timer_basic_test) { +BOOST_AUTO_TEST_CASE(timer_basic_test) +{ Timer timer; const int sleep_time_microsec = 40; // ad-hoc microseconds to pass unit tests. diff --git a/mert/Util.cpp b/mert/Util.cpp index ac7d1803b..67448292f 100644 --- a/mert/Util.cpp +++ b/mert/Util.cpp @@ -11,7 +11,8 @@ using namespace std; -namespace { +namespace +{ MosesTuning::Timer g_timer; int g_verbose = 0; @@ -56,7 +57,8 @@ size_t getNextPound(std::string &str, std::string &substr, return pos; } -void split(const std::string &s, char delim, std::vector &elems) { +void split(const std::string &s, char delim, std::vector &elems) +{ std::stringstream ss(s); std::string item; while(std::getline(ss, item, delim)) { @@ -65,7 +67,8 @@ void split(const std::string &s, char delim, std::vector &elems) { } void Tokenize(const char *str, const char delim, - std::vector *res) { + std::vector *res) +{ while (1) { const char *begin = str; while (*str != delim && *str) str++; diff --git a/mert/Util.h b/mert/Util.h index e2071bf1f..5c9c635ab 100644 --- a/mert/Util.h +++ b/mert/Util.h @@ -40,7 +40,8 @@ int setverboselevel(int v); const float kEPS = 0.0001f; template -bool IsAlmostEqual(T expected, T actual, float round=kEPS) { +bool IsAlmostEqual(T expected, T actual, float round=kEPS) +{ if (std::abs(expected - actual) < round) { return true; } else { @@ -86,7 +87,8 @@ inline T Scan(const std::string &input) * Returns true iff "str" ends with "suffix". * e.g., Given str = "abc:" and suffix = ":", this function returns true. 
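 * Note: the implementation uses std::string::find_last_of, which matches the
 * last occurrence of *any* character of "suffix", so this is a reliable
 * suffix test only for single-character suffixes such as ":".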
*/ -inline bool EndsWith(const std::string& str, const char* suffix) { +inline bool EndsWith(const std::string& str, const char* suffix) +{ return str.find_last_of(suffix) == str.size() - 1; } diff --git a/mert/UtilTest.cpp b/mert/UtilTest.cpp index f3ca6ca80..6f86d5144 100644 --- a/mert/UtilTest.cpp +++ b/mert/UtilTest.cpp @@ -5,7 +5,8 @@ using namespace MosesTuning; -BOOST_AUTO_TEST_CASE(util_get_next_pound_test) { +BOOST_AUTO_TEST_CASE(util_get_next_pound_test) +{ { std::string str("9 9 7 "); std::string substr; @@ -38,7 +39,8 @@ BOOST_AUTO_TEST_CASE(util_get_next_pound_test) { } } -BOOST_AUTO_TEST_CASE(util_tokenize_test) { +BOOST_AUTO_TEST_CASE(util_tokenize_test) +{ { std::vector res; Tokenize("9 9 7", ' ', &res); @@ -66,7 +68,8 @@ BOOST_AUTO_TEST_CASE(util_tokenize_test) { } } -BOOST_AUTO_TEST_CASE(util_ends_with_test) { +BOOST_AUTO_TEST_CASE(util_ends_with_test) +{ BOOST_CHECK(EndsWith("abc:", ":")); BOOST_CHECK(EndsWith("a b c:", ":")); BOOST_CHECK(!EndsWith("a", ":")); diff --git a/mert/Vocabulary.cpp b/mert/Vocabulary.cpp index 458024ce1..5a17c2c6e 100644 --- a/mert/Vocabulary.cpp +++ b/mert/Vocabulary.cpp @@ -1,34 +1,39 @@ #include "Vocabulary.h" #include "Singleton.h" -namespace mert { -namespace { +namespace mert +{ +namespace +{ Vocabulary* g_vocab = NULL; } // namespace -int Vocabulary::Encode(const std::string& token) { - iterator it = m_vocab.find(token); - int encoded_token; - if (it == m_vocab.end()) { - // Add an new entry to the vocaburary. - encoded_token = static_cast(m_vocab.size()); +int Vocabulary::Encode(const std::string& token) +{ + iterator it = m_vocab.find(token); + int encoded_token; + if (it == m_vocab.end()) { + // Add a new entry to the vocabulary. + encoded_token = static_cast(m_vocab.size()); - m_vocab[token] = encoded_token; - } else { - encoded_token = it->second; - } - return encoded_token; + m_vocab[token] = encoded_token; + } else { + encoded_token = it->second; + } + return encoded_token; } -bool Vocabulary::Lookup(const std::string&str , int* v) const { +bool Vocabulary::Lookup(const std::string&str , int* v) const +{ - const_iterator it = m_vocab.find(str); - if (it == m_vocab.end()) return false; - *v = it->second; - return true; + const_iterator it = m_vocab.find(str); + if (it == m_vocab.end()) return false; + *v = it->second; + return true; } -Vocabulary* VocabularyFactory::GetVocabulary() { +Vocabulary* VocabularyFactory::GetVocabulary() +{ if (g_vocab == NULL) { return MosesTuning::Singleton::GetInstance(); } else { @@ -36,7 +41,8 @@ Vocabulary* VocabularyFactory::GetVocabulary() { } } -void VocabularyFactory::SetVocabulary(Vocabulary* vocab) { +void VocabularyFactory::SetVocabulary(Vocabulary* vocab) +{ g_vocab = vocab; } diff --git a/mert/Vocabulary.h b/mert/Vocabulary.h index 3ad42f46f..16c8698c6 100644 --- a/mert/Vocabulary.h +++ b/mert/Vocabulary.h @@ -4,7 +4,8 @@ #include #include -namespace mert { +namespace mert +{ /** * A map to handle vocabularies to calculate @@ -12,8 +13,9 @@ namespace mert { * * TODO: replace this with more efficient data structure.
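 *
 * Typical usage (illustrative sketch only):
 *
 *   mert::Vocabulary vocab;
 *   const int id = vocab.Encode("hello");  // inserts "hello" on first use
 *   int found = -1;
 *   if (vocab.Lookup("hello", &found)) {
 *     // found == id
 *   }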
*/ -class Vocabulary { - public: +class Vocabulary +{ +public: typedef boost::unordered_map::iterator iterator; typedef boost::unordered_map::const_iterator const_iterator; @@ -28,32 +30,53 @@ class Vocabulary { */ bool Lookup(const std::string&str , int* v) const; - void clear() { m_vocab.clear(); } + void clear() { + m_vocab.clear(); + } - bool empty() const { return m_vocab.empty(); } + bool empty() const { + return m_vocab.empty(); + } - std::size_t size() const { return m_vocab.size(); } + std::size_t size() const { + return m_vocab.size(); + } - iterator find(const std::string& str) { return m_vocab.find(str); } - const_iterator find(const std::string& str) const { return m_vocab.find(str); } + iterator find(const std::string& str) { + return m_vocab.find(str); + } + const_iterator find(const std::string& str) const { + return m_vocab.find(str); + } - int& operator[](const std::string& str) { return m_vocab[str]; } + int& operator[](const std::string& str) { + return m_vocab[str]; + } - iterator begin() { return m_vocab.begin(); } - const_iterator begin() const { return m_vocab.begin(); } - iterator end() { return m_vocab.end(); } - const_iterator end() const { return m_vocab.end(); } + iterator begin() { + return m_vocab.begin(); + } + const_iterator begin() const { + return m_vocab.begin(); + } + iterator end() { + return m_vocab.end(); + } + const_iterator end() const { + return m_vocab.end(); + } - private: +private: boost::unordered_map m_vocab; }; -class VocabularyFactory { - public: +class VocabularyFactory +{ +public: static Vocabulary* GetVocabulary(); static void SetVocabulary(Vocabulary* vocab); - private: +private: VocabularyFactory() {} virtual ~VocabularyFactory() {} }; diff --git a/mert/VocabularyTest.cpp b/mert/VocabularyTest.cpp index 5b453fcda..002b6a64f 100644 --- a/mert/VocabularyTest.cpp +++ b/mert/VocabularyTest.cpp @@ -6,16 +6,20 @@ using namespace MosesTuning; -namespace mert { -namespace { +namespace mert +{ +namespace +{ -void TearDown() { +void TearDown() +{ Singleton::Delete(); } } // namespace -BOOST_AUTO_TEST_CASE(vocab_basic) { +BOOST_AUTO_TEST_CASE(vocab_basic) +{ Vocabulary vocab; BOOST_REQUIRE(vocab.empty()); vocab.clear(); @@ -39,7 +43,8 @@ BOOST_AUTO_TEST_CASE(vocab_basic) { BOOST_CHECK(!vocab.Lookup("world", &v)); } -BOOST_AUTO_TEST_CASE(vocab_factory_test) { +BOOST_AUTO_TEST_CASE(vocab_factory_test) +{ Vocabulary* vocab1 = VocabularyFactory::GetVocabulary(); Vocabulary* vocab2 = VocabularyFactory::GetVocabulary(); Vocabulary* vocab3 = VocabularyFactory::GetVocabulary(); diff --git a/mert/evaluator.cpp b/mert/evaluator.cpp index 58a66b12d..f976f39b3 100644 --- a/mert/evaluator.cpp +++ b/mert/evaluator.cpp @@ -14,7 +14,8 @@ using namespace std; using namespace MosesTuning; -namespace { +namespace +{ Scorer* g_scorer = NULL; bool g_has_more_files = false; @@ -22,13 +23,14 @@ bool g_has_more_scorers = false; const float g_alpha = 0.05; -class EvaluatorUtil { - public: +class EvaluatorUtil +{ +public: static void evaluate(const string& candFile, int bootstrap); static float average(const vector& list); static string int2string(int n); - private: +private: EvaluatorUtil() {} ~EvaluatorUtil() {} }; @@ -43,22 +45,18 @@ void EvaluatorUtil::evaluate(const string& candFile, int bootstrap) // Loading sentences and preparing statistics ScoreStats scoreentry; string line; - while (getline(cand, line)) - { + while (getline(cand, line)) { g_scorer->prepareStats(entries.size(), line, scoreentry); entries.push_back(scoreentry); } int n = entries.size(); - if 
(bootstrap) - { + if (bootstrap) { vector scores; - for (int i = 0; i < bootstrap; ++i) - { + for (int i = 0; i < bootstrap; ++i) { // TODO: Use smart pointer for exceptional-safety. ScoreData* scoredata = new ScoreData(g_scorer); - for (int j = 0; j < n; ++j) - { + for (int j = 0; j < n; ++j) { int randomIndex = random() % n; scoredata->add(entries[randomIndex], j); } @@ -85,13 +83,10 @@ void EvaluatorUtil::evaluate(const string& candFile, int bootstrap) cout.setf(ios::fixed, ios::floatfield); cout.precision(4); cout << avg << "\t[" << lb << "," << rb << "]" << endl; - } - else - { + } else { // TODO: Use smart pointer for exceptional-safety. ScoreData* scoredata = new ScoreData(g_scorer); - for (int sid = 0; sid < n; ++sid) - { + for (int sid = 0; sid < n; ++sid) { scoredata->add(entries[sid], sid); } g_scorer->setScoreData(scoredata); @@ -184,56 +179,56 @@ struct ProgramOption { bool has_seed; ProgramOption() - : reference(""), - candidate(""), - bootstrap(0), - seed(0), - has_seed(false) { } + : reference(""), + candidate(""), + bootstrap(0), + seed(0), + has_seed(false) { } }; -void ParseCommandOptions(int argc, char** argv, ProgramOption* opt) { +void ParseCommandOptions(int argc, char** argv, ProgramOption* opt) +{ int c; int option_index; int last_scorer_index = -1; while ((c = getopt_long(argc, argv, "s:c:R:C:b:r:f:l:h", long_options, &option_index)) != -1) { switch(c) { - case 's': - opt->scorer_types.push_back(string(optarg)); - opt->scorer_configs.push_back(string("")); - opt->scorer_factors.push_back(string("")); - opt->scorer_filter.push_back(string("")); - last_scorer_index++; - break; - case 'c': - opt->scorer_configs[last_scorer_index] = string(optarg); - break; - case 'R': - opt->reference = string(optarg); - break; - case 'C': - opt->candidate = string(optarg); - break; - case 'b': - opt->bootstrap = atoi(optarg); - break; - case 'r': - opt->seed = strtol(optarg, NULL, 10); - opt->has_seed = true; - break; - case 'f': - opt->scorer_factors[last_scorer_index] = string(optarg); - break; - case 'l': - opt->scorer_filter[last_scorer_index] = string(optarg); - break; - default: - usage(); + case 's': + opt->scorer_types.push_back(string(optarg)); + opt->scorer_configs.push_back(string("")); + opt->scorer_factors.push_back(string("")); + opt->scorer_filter.push_back(string("")); + last_scorer_index++; + break; + case 'c': + opt->scorer_configs[last_scorer_index] = string(optarg); + break; + case 'R': + opt->reference = string(optarg); + break; + case 'C': + opt->candidate = string(optarg); + break; + case 'b': + opt->bootstrap = atoi(optarg); + break; + case 'r': + opt->seed = strtol(optarg, NULL, 10); + opt->has_seed = true; + break; + case 'f': + opt->scorer_factors[last_scorer_index] = string(optarg); + break; + case 'l': + opt->scorer_filter[last_scorer_index] = string(optarg); + break; + default: + usage(); } } // Add default scorer if no scorer provided - if (opt->scorer_types.size() == 0) - { + if (opt->scorer_types.size() == 0) { opt->scorer_types.push_back(string("BLEU")); opt->scorer_configs.push_back(string("")); opt->scorer_factors.push_back(string("")); @@ -241,7 +236,8 @@ void ParseCommandOptions(int argc, char** argv, ProgramOption* opt) { } } -void InitSeed(const ProgramOption *opt) { +void InitSeed(const ProgramOption *opt) +{ if (opt->has_seed) { cerr << "Seeding random numbers with " << opt->seed << endl; srandom(opt->seed); @@ -260,8 +256,7 @@ int main(int argc, char** argv) ProgramOption option; ParseCommandOptions(argc, argv, &option); - if 
(option.bootstrap) - { + if (option.bootstrap) { InitSeed(&option); } @@ -278,17 +273,15 @@ int main(int argc, char** argv) if (candFiles.size() > 1) g_has_more_files = true; if (option.scorer_types.size() > 1) g_has_more_scorers = true; - for (vector::const_iterator fileIt = candFiles.begin(); fileIt != candFiles.end(); ++fileIt) - { - for (size_t i = 0; i < option.scorer_types.size(); i++) - { - g_scorer = ScorerFactory::getScorer(option.scorer_types[i], option.scorer_configs[i]); - g_scorer->setFactors(option.scorer_factors[i]); - g_scorer->setFilter(option.scorer_filter[i]); - g_scorer->setReferenceFiles(refFiles); - EvaluatorUtil::evaluate(*fileIt, option.bootstrap); - delete g_scorer; - } + for (vector::const_iterator fileIt = candFiles.begin(); fileIt != candFiles.end(); ++fileIt) { + for (size_t i = 0; i < option.scorer_types.size(); i++) { + g_scorer = ScorerFactory::getScorer(option.scorer_types[i], option.scorer_configs[i]); + g_scorer->setFactors(option.scorer_factors[i]); + g_scorer->setFilter(option.scorer_filter[i]); + g_scorer->setReferenceFiles(refFiles); + EvaluatorUtil::evaluate(*fileIt, option.bootstrap); + delete g_scorer; + } } return EXIT_SUCCESS; } catch (const exception& e) { diff --git a/mert/extractor.cpp b/mert/extractor.cpp index 077d9b94c..38652296e 100644 --- a/mert/extractor.cpp +++ b/mert/extractor.cpp @@ -20,7 +20,8 @@ using namespace std; using namespace MosesTuning; -namespace { +namespace +{ void usage() { @@ -78,68 +79,69 @@ struct ProgramOption { int verbosity; ProgramOption() - : scorerType("BLEU"), - scorerConfig(""), - scorerFactors(""), - scorerFilter(""), - referenceFile(""), - nbestFile(""), - scoreDataFile("statscore.data"), - featureDataFile("features.data"), - prevScoreDataFile(""), - prevFeatureDataFile(""), - binmode(false), - allowDuplicates(false), - verbosity(0) { } + : scorerType("BLEU"), + scorerConfig(""), + scorerFactors(""), + scorerFilter(""), + referenceFile(""), + nbestFile(""), + scoreDataFile("statscore.data"), + featureDataFile("features.data"), + prevScoreDataFile(""), + prevFeatureDataFile(""), + binmode(false), + allowDuplicates(false), + verbosity(0) { } }; -void ParseCommandOptions(int argc, char** argv, ProgramOption* opt) { +void ParseCommandOptions(int argc, char** argv, ProgramOption* opt) +{ int c; int option_index; while ((c = getopt_long(argc, argv, "s:r:f:l:n:S:F:R:E:v:hbd", long_options, &option_index)) != -1) { switch (c) { - case 's': - opt->scorerType = string(optarg); - break; - case 'c': - opt->scorerConfig = string(optarg); - break; - case 'f': - opt->scorerFactors = string(optarg); - break; - case 'l': - opt->scorerFilter = string(optarg); - break; - case 'r': - opt->referenceFile = string(optarg); - break; - case 'b': - opt->binmode = true; - break; - case 'n': - opt->nbestFile = string(optarg); - break; - case 'S': - opt->scoreDataFile = string(optarg); - break; - case 'F': - opt->featureDataFile = string(optarg); - break; - case 'E': - opt->prevFeatureDataFile = string(optarg); - break; - case 'R': - opt->prevScoreDataFile = string(optarg); - break; - case 'v': - opt->verbosity = atoi(optarg); - break; - case 'd': - opt->allowDuplicates = true; - break; - default: - usage(); + case 's': + opt->scorerType = string(optarg); + break; + case 'c': + opt->scorerConfig = string(optarg); + break; + case 'f': + opt->scorerFactors = string(optarg); + break; + case 'l': + opt->scorerFilter = string(optarg); + break; + case 'r': + opt->referenceFile = string(optarg); + break; + case 'b': + opt->binmode = true; + 
break; + case 'n': + opt->nbestFile = string(optarg); + break; + case 'S': + opt->scoreDataFile = string(optarg); + break; + case 'F': + opt->featureDataFile = string(optarg); + break; + case 'E': + opt->prevFeatureDataFile = string(optarg); + break; + case 'R': + opt->prevScoreDataFile = string(optarg); + break; + case 'v': + opt->verbosity = atoi(optarg); + break; + case 'd': + opt->allowDuplicates = true; + break; + default: + usage(); } } } @@ -202,7 +204,7 @@ int main(int argc, char** argv) TRACE_ERR("Scorer type: " << option.scorerType << endl); boost::scoped_ptr scorer( - ScorerFactory::getScorer(option.scorerType, option.scorerConfig)); + ScorerFactory::getScorer(option.scorerType, option.scorerConfig)); // set Factors and Filter used to preprocess the sentences scorer->setFactors(option.scorerFactors); diff --git a/mert/kbmira.cpp b/mert/kbmira.cpp index f0d1624e6..a2665ac13 100644 --- a/mert/kbmira.cpp +++ b/mert/kbmira.cpp @@ -2,7 +2,7 @@ // vim:tabstop=2 /*********************************************************************** K-best Batch MIRA for Moses -Copyright (C) 2012, National Research Council Canada / Conseil national +Copyright (C) 2012, National Research Council Canada / Conseil national de recherches du Canada ***********************************************************************/ @@ -49,13 +49,14 @@ using namespace MosesTuning; namespace po = boost::program_options; -ValType evaluate(HypPackEnumerator* train, const AvgWeightVector& wv) { +ValType evaluate(HypPackEnumerator* train, const AvgWeightVector& wv) +{ vector stats(kBleuNgramOrder*2+1,0); for(train->reset(); !train->finished(); train->next()) { // Find max model size_t max_index=0; ValType max_score=0; - for(size_t i=0;icur_size();i++) { + for(size_t i=0; icur_size(); i++) { MiraFeatureVector vec(train->featuresAt(i)); ValType score = wv.score(vec); if(i==0 || score > max_score) { @@ -64,8 +65,8 @@ ValType evaluate(HypPackEnumerator* train, const AvgWeightVector& wv) { } } // Update stats - const vector& sent = train->scoresAt(max_index); - for(size_t i=0;i& sent = train->scoresAt(max_index); + for(size_t i=0; izero_tokens()->default_value(false), "Print this help message and exit") - ("scfile,S", po::value >(&scoreFiles), "Scorer data files") - ("ffile,F", po::value > (&featureFiles), "Feature data files") - ("random-seed,r", po::value(&seed), "Seed for random number generation") - ("output-file,o", po::value(&outputFile), "Output file") - ("cparam,C", po::value(&c), "MIRA C-parameter, lower for more regularization (default 0.01)") - ("decay,D", po::value(&decay), "BLEU background corpus decay rate (default 0.999)") - ("iters,J", po::value(&n_iters), "Number of MIRA iterations to run (default 60)") - ("dense-init,d", po::value(&denseInitFile), "Weight file for dense features") - ("sparse-init,s", po::value(&sparseInitFile), "Weight file for sparse features") - ("streaming", po::value(&streaming)->zero_tokens()->default_value(false), "Stream n-best lists to save memory, implies --no-shuffle") - ("no-shuffle", po::value(&no_shuffle)->zero_tokens()->default_value(false), "Don't shuffle hypotheses before each epoch") - ("model-bg", po::value(&model_bg)->zero_tokens()->default_value(false), "Use model instead of hope for BLEU background") - ("verbose", po::value(&verbose)->zero_tokens()->default_value(false), "Verbose updates") - ("safe-hope", po::value(&safe_hope)->zero_tokens()->default_value(false), "Mode score's influence on hope decoding is limited") - ; + ("help,h", 
po::value(&help)->zero_tokens()->default_value(false), "Print this help message and exit") + ("scfile,S", po::value >(&scoreFiles), "Scorer data files") + ("ffile,F", po::value > (&featureFiles), "Feature data files") + ("random-seed,r", po::value(&seed), "Seed for random number generation") + ("output-file,o", po::value(&outputFile), "Output file") + ("cparam,C", po::value(&c), "MIRA C-parameter, lower for more regularization (default 0.01)") + ("decay,D", po::value(&decay), "BLEU background corpus decay rate (default 0.999)") + ("iters,J", po::value(&n_iters), "Number of MIRA iterations to run (default 60)") + ("dense-init,d", po::value(&denseInitFile), "Weight file for dense features") + ("sparse-init,s", po::value(&sparseInitFile), "Weight file for sparse features") + ("streaming", po::value(&streaming)->zero_tokens()->default_value(false), "Stream n-best lists to save memory, implies --no-shuffle") + ("no-shuffle", po::value(&no_shuffle)->zero_tokens()->default_value(false), "Don't shuffle hypotheses before each epoch") + ("model-bg", po::value(&model_bg)->zero_tokens()->default_value(false), "Use model instead of hope for BLEU background") + ("verbose", po::value(&verbose)->zero_tokens()->default_value(false), "Verbose updates") + ("safe-hope", po::value(&safe_hope)->zero_tokens()->default_value(false), "Mode score's influence on hope decoding is limited") + ; po::options_description cmdline_options; cmdline_options.add(desc); @@ -118,9 +119,9 @@ int main(int argc, char** argv) options(cmdline_options).run(), vm); po::notify(vm); if (help) { - cout << "Usage: " + string(argv[0]) + " [options]" << endl; - cout << desc << endl; - exit(0); + cout << "Usage: " + string(argv[0]) + " [options]" << endl; + cout << desc << endl; + exit(0); } cerr << "kbmira with c=" << c << " decay=" << decay << " no_shuffle=" << no_shuffle << endl; @@ -165,7 +166,8 @@ int main(int argc, char** argv) exit(3); } int sparseCount=0; - parameter_t val; std::string name; + parameter_t val; + std::string name; while(opt >> name >> val) { size_t id = SparseVector::encode(name) + initDenseSize; while(initParams.size()<=id) initParams.push_back(0.0); @@ -175,17 +177,17 @@ int main(int argc, char** argv) cerr << "Found " << sparseCount << " initial sparse features" << endl; opt.close(); } - + MiraWeightVector wv(initParams); // Initialize background corpus vector bg; - for(int j=0;j train; if(streaming) @@ -194,8 +196,7 @@ int main(int argc, char** argv) train.reset(new RandomAccessHypPackEnumerator(featureFiles, scoreFiles, no_shuffle)); cerr << "Initial BLEU = " << evaluate(train.get(), wv.avg()) << endl; ValType bestBleu = 0; - for(int j=0;j& model_stats = train->scoresAt(model_index); - for(size_t k=0;k1e-8) + if(abs(avg.weight(i))>1e-8) *out << SparseVector::decode(i-num_dense) << " " << avg.weight(i) << endl; } } diff --git a/mert/mert.cpp b/mert/mert.cpp index e53c86be2..b73c536d1 100644 --- a/mert/mert.cpp +++ b/mert/mert.cpp @@ -30,7 +30,8 @@ using namespace std; using namespace MosesTuning; -namespace { +namespace +{ const char kDefaultOptimizer[] = "powell"; const char kDefaultScorer[] = "BLEU"; @@ -46,10 +47,11 @@ const char kOutputFile[] = "weights.txt"; /** * Runs an optimisation, or a random restart. 
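 * The task holds the Optimizer to run and the Point to start from; the
 * resulting point and its score are kept in the m_point and m_score members
 * (the point is exposed through getPoint() below).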
*/ -class OptimizationTask : public Moses::Task { - public: +class OptimizationTask : public Moses::Task +{ +public: OptimizationTask(Optimizer* optimizer, const Point& point) - : m_optimizer(optimizer), m_point(point) {} + : m_optimizer(optimizer), m_point(point) {} ~OptimizationTask() {} @@ -76,7 +78,7 @@ class OptimizationTask : public Moses::Task { return m_point; } - private: +private: // Do not allow the user to instanciate without arguments. OptimizationTask() {} @@ -85,7 +87,8 @@ class OptimizationTask : public Moses::Task { statscore_t m_score; }; -bool WriteFinalWeights(const char* filename, const Point& point) { +bool WriteFinalWeights(const char* filename, const Point& point) +{ ofstream ofs(filename); if (!ofs) { cerr << "Cannot open " << filename << endl; @@ -165,91 +168,92 @@ struct ProgramOption { size_t shard_count; ProgramOption() - : to_optimize_str(""), - pdim(-1), - ntry(1), - nrandom(0), - seed(0), - has_seed(false), - optimize_type(kDefaultOptimizer), - scorer_type(kDefaultScorer), - scorer_config(""), - scorer_file(kDefaultScorerFile), - feature_file(kDefaultFeatureFile), - init_file(kDefaultInitFile), - positive_string(kDefaultPositiveString), - sparse_weights_file(kDefaultSparseWeightsFile), - num_threads(1), - shard_size(0), - shard_count(0) { } + : to_optimize_str(""), + pdim(-1), + ntry(1), + nrandom(0), + seed(0), + has_seed(false), + optimize_type(kDefaultOptimizer), + scorer_type(kDefaultScorer), + scorer_config(""), + scorer_file(kDefaultScorerFile), + feature_file(kDefaultFeatureFile), + init_file(kDefaultInitFile), + positive_string(kDefaultPositiveString), + sparse_weights_file(kDefaultSparseWeightsFile), + num_threads(1), + shard_size(0), + shard_count(0) { } }; -void ParseCommandOptions(int argc, char** argv, ProgramOption* opt) { +void ParseCommandOptions(int argc, char** argv, ProgramOption* opt) +{ int c; int option_index; while ((c = getopt_long(argc, argv, "o:r:d:n:m:t:s:S:F:v:p:P:", long_options, &option_index)) != -1) { switch (c) { - case 'o': - opt->to_optimize_str = string(optarg); - break; - case 'd': - opt->pdim = strtol(optarg, NULL, 10); - break; - case 'n': - opt->ntry = strtol(optarg, NULL, 10); - break; - case 'm': - opt->nrandom = strtol(optarg, NULL, 10); - break; - case 'r': - opt->seed = strtol(optarg, NULL, 10); - opt->has_seed = true; - break; - case 't': - opt->optimize_type = string(optarg); - break; - case's': - opt->scorer_type = string(optarg); - break; - case 'c': - opt->scorer_config = string(optarg); - break; - case 'S': - opt->scorer_file = string(optarg); - break; - case 'F': - opt->feature_file = string(optarg); - break; - case 'i': - opt->init_file = string(optarg); - break; - case 'p': - opt->sparse_weights_file=string(optarg); - break; - case 'v': - setverboselevel(strtol(optarg, NULL, 10)); - break; + case 'o': + opt->to_optimize_str = string(optarg); + break; + case 'd': + opt->pdim = strtol(optarg, NULL, 10); + break; + case 'n': + opt->ntry = strtol(optarg, NULL, 10); + break; + case 'm': + opt->nrandom = strtol(optarg, NULL, 10); + break; + case 'r': + opt->seed = strtol(optarg, NULL, 10); + opt->has_seed = true; + break; + case 't': + opt->optimize_type = string(optarg); + break; + case's': + opt->scorer_type = string(optarg); + break; + case 'c': + opt->scorer_config = string(optarg); + break; + case 'S': + opt->scorer_file = string(optarg); + break; + case 'F': + opt->feature_file = string(optarg); + break; + case 'i': + opt->init_file = string(optarg); + break; + case 'p': + 
opt->sparse_weights_file=string(optarg); + break; + case 'v': + setverboselevel(strtol(optarg, NULL, 10)); + break; #ifdef WITH_THREADS - case 'T': - opt->num_threads = strtol(optarg, NULL, 10); - if (opt->num_threads < 1) opt->num_threads = 1; - break; + case 'T': + opt->num_threads = strtol(optarg, NULL, 10); + if (opt->num_threads < 1) opt->num_threads = 1; + break; #endif - case 'a': - opt->shard_count = strtof(optarg, NULL); - break; - case 'b': - opt->shard_size = strtof(optarg, NULL); - break; - case 'h': - usage(0); - break; - case 'P': - opt->positive_string = string(optarg); - break; - default: - usage(1); + case 'a': + opt->shard_count = strtof(optarg, NULL); + break; + case 'b': + opt->shard_size = strtof(optarg, NULL); + break; + case 'h': + usage(0); + break; + case 'P': + opt->positive_string = string(optarg); + break; + default: + usage(1); } } } @@ -353,7 +357,7 @@ int main(int argc, char **argv) // it make sense to know what parameter set were used to generate the nbest boost::scoped_ptr scorer( - ScorerFactory::getScorer(option.scorer_type, option.scorer_config)); + ScorerFactory::getScorer(option.scorer_type, option.scorer_config)); //load data Data data(scorer.get(), option.sparse_weights_file); diff --git a/mert/pro.cpp b/mert/pro.cpp index 3777d0470..b8cf81ca3 100644 --- a/mert/pro.cpp +++ b/mert/pro.cpp @@ -51,7 +51,8 @@ namespace po = boost::program_options; namespace MosesTuning { -class SampledPair { +class SampledPair +{ private: pair m_translation1; pair m_translation2; @@ -70,12 +71,19 @@ public: } } - float getDiff() const { return m_score_diff; } - const pair& getTranslation1() const { return m_translation1; } - const pair& getTranslation2() const { return m_translation2; } + float getDiff() const { + return m_score_diff; + } + const pair& getTranslation1() const { + return m_translation1; + } + const pair& getTranslation2() const { + return m_translation2; + } }; -static void outputSample(ostream& out, const FeatureDataItem& f1, const FeatureDataItem& f2) { +static void outputSample(ostream& out, const FeatureDataItem& f1, const FeatureDataItem& f2) +{ // difference in score in regular features for(unsigned int j=0; j 0.00001) @@ -110,13 +118,13 @@ int main(int argc, char** argv) po::options_description desc("Allowed options"); desc.add_options() - ("help,h", po::value(&help)->zero_tokens()->default_value(false), "Print this help message and exit") - ("scfile,S", po::value >(&scoreFiles), "Scorer data files") - ("ffile,F", po::value > (&featureFiles), "Feature data files") - ("random-seed,r", po::value(&seed), "Seed for random number generation") - ("output-file,o", po::value(&outputFile), "Output file") - ("smooth-brevity-penalty,b", po::value(&smoothBP)->zero_tokens()->default_value(false), "Smooth the brevity penalty, as in Nakov et al. (Coling 2012)") - ; + ("help,h", po::value(&help)->zero_tokens()->default_value(false), "Print this help message and exit") + ("scfile,S", po::value >(&scoreFiles), "Scorer data files") + ("ffile,F", po::value > (&featureFiles), "Feature data files") + ("random-seed,r", po::value(&seed), "Seed for random number generation") + ("output-file,o", po::value(&outputFile), "Output file") + ("smooth-brevity-penalty,b", po::value(&smoothBP)->zero_tokens()->default_value(false), "Smooth the brevity penalty, as in Nakov et al. 
(Coling 2012)") + ; po::options_description cmdline_options; cmdline_options.add(desc); @@ -125,9 +133,9 @@ int main(int argc, char** argv) options(cmdline_options).run(), vm); po::notify(vm); if (help) { - cout << "Usage: " + string(argv[0]) + " [options]" << endl; - cout << desc << endl; - exit(0); + cout << "Usage: " + string(argv[0]) + " [options]" << endl; + cout << desc << endl; + exit(0); } if (vm.count("random-seed")) { @@ -145,7 +153,7 @@ int main(int argc, char** argv) if (featureFiles.size() != scoreFiles.size()) { cerr << "Error: Number of feature files (" << featureFiles.size() << - ") does not match number of score files (" << scoreFiles.size() << ")" << endl; + ") does not match number of score files (" << scoreFiles.size() << ")" << endl; exit(1); } @@ -238,11 +246,11 @@ int main(int argc, char** argv) size_t hypo_id2 = samples[i].getTranslation2().second; *out << "1"; outputSample(*out, featureDataIters[file_id1]->operator[](hypo_id1), - featureDataIters[file_id2]->operator[](hypo_id2)); + featureDataIters[file_id2]->operator[](hypo_id2)); *out << endl; *out << "0"; outputSample(*out, featureDataIters[file_id2]->operator[](hypo_id2), - featureDataIters[file_id1]->operator[](hypo_id1)); + featureDataIters[file_id1]->operator[](hypo_id1)); *out << endl; } //advance all iterators diff --git a/mert/sentence-bleu.cpp b/mert/sentence-bleu.cpp index 17a9737f2..5269d37cd 100644 --- a/mert/sentence-bleu.cpp +++ b/mert/sentence-bleu.cpp @@ -18,7 +18,7 @@ int main(int argc, char **argv) // TODO all of these are empty for now string config; string factors; - string filter; + string filter; BleuScorer scorer(config); scorer.setFactors(factors); diff --git a/mira/Decoder.cpp b/mira/Decoder.cpp index dea6699f6..3dea97f24 100644 --- a/mira/Decoder.cpp +++ b/mira/Decoder.cpp @@ -31,357 +31,371 @@ using namespace std; using namespace Moses; -namespace Mira { +namespace Mira +{ - /** - * Allocates a char* and copies string into it. - **/ - static char* strToChar(const string& s) { - char* c = new char[s.size()+1]; - strcpy(c,s.c_str()); - return c; +/** + * Allocates a char* and copies string into it. 
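+ * The returned buffer is owned by the caller and must be released with
+ * delete[]; the MosesDecoder constructor below does so for the BASE_ARGC
+ * entries it allocates this way.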
+**/ +static char* strToChar(const string& s) +{ + char* c = new char[s.size()+1]; + strcpy(c,s.c_str()); + return c; +} + +MosesDecoder::MosesDecoder(const string& inifile, int debuglevel, int argc, vector decoder_params) + : m_manager(NULL) +{ + static int BASE_ARGC = 8; + Parameter* params = new Parameter(); + char ** mosesargv = new char*[BASE_ARGC + argc]; + mosesargv[0] = strToChar("-f"); + mosesargv[1] = strToChar(inifile); + mosesargv[2] = strToChar("-v"); + stringstream dbgin; + dbgin << debuglevel; + mosesargv[3] = strToChar(dbgin.str()); + mosesargv[4] = strToChar("-use-persistent-cache"); + mosesargv[5] = strToChar("0"); + mosesargv[6] = strToChar("-persistent-cache-size"); + mosesargv[7] = strToChar("0"); + + for (int i = 0; i < argc; ++i) { + char *cstr = &(decoder_params[i])[0]; + mosesargv[BASE_ARGC + i] = cstr; } - MosesDecoder::MosesDecoder(const string& inifile, int debuglevel, int argc, vector decoder_params) - : m_manager(NULL) { - static int BASE_ARGC = 8; - Parameter* params = new Parameter(); - char ** mosesargv = new char*[BASE_ARGC + argc]; - mosesargv[0] = strToChar("-f"); - mosesargv[1] = strToChar(inifile); - mosesargv[2] = strToChar("-v"); - stringstream dbgin; - dbgin << debuglevel; - mosesargv[3] = strToChar(dbgin.str()); - mosesargv[4] = strToChar("-use-persistent-cache"); - mosesargv[5] = strToChar("0"); - mosesargv[6] = strToChar("-persistent-cache-size"); - mosesargv[7] = strToChar("0"); - - for (int i = 0; i < argc; ++i) { - char *cstr = &(decoder_params[i])[0]; - mosesargv[BASE_ARGC + i] = cstr; - } - - if (!params->LoadParam(BASE_ARGC + argc,mosesargv)) { - cerr << "Loading static data failed, exit." << endl; - exit(1); - } - StaticData::LoadDataStatic(params, "mira"); - for (int i = 0; i < BASE_ARGC; ++i) { - delete[] mosesargv[i]; - } - delete[] mosesargv; - - //m_bleuScoreFeature = staticData.GetBleuScoreFeature(); TODO - assert(false); + if (!params->LoadParam(BASE_ARGC + argc,mosesargv)) { + cerr << "Loading static data failed, exit." 
<< endl; + exit(1); } - - void MosesDecoder::cleanup(bool chartDecoding) { - delete m_manager; - if (chartDecoding) - delete m_chartManager; - else - delete m_sentence; + StaticData::LoadDataStatic(params, "mira"); + for (int i = 0; i < BASE_ARGC; ++i) { + delete[] mosesargv[i]; + } + delete[] mosesargv; + + //m_bleuScoreFeature = staticData.GetBleuScoreFeature(); TODO + assert(false); +} + +void MosesDecoder::cleanup(bool chartDecoding) +{ + delete m_manager; + if (chartDecoding) + delete m_chartManager; + else + delete m_sentence; +} + +vector< vector > MosesDecoder::getNBest(const std::string& source, + size_t sentenceid, + size_t nBestSize, + float bleuObjectiveWeight, + float bleuScoreWeight, + vector< ScoreComponentCollection>& featureValues, + vector< float>& bleuScores, + vector< float>& modelScores, + size_t numReturnedTranslations, + bool realBleu, + bool distinct, + bool avgRefLength, + size_t rank, + size_t epoch, + string filename) +{ + StaticData &staticData = StaticData::InstanceNonConst(); + bool chartDecoding = (staticData.GetSearchAlgorithm() == ChartDecoding); + initialize(staticData, source, sentenceid, bleuObjectiveWeight, bleuScoreWeight, avgRefLength, chartDecoding); + + // run the decoder + if (chartDecoding) { + return runChartDecoder(source, sentenceid, nBestSize, bleuObjectiveWeight, bleuScoreWeight, + featureValues, bleuScores, modelScores, numReturnedTranslations, realBleu, distinct, rank, epoch); + } else { + SearchAlgorithm search = staticData.GetSearchAlgorithm(); + return runDecoder(source, sentenceid, nBestSize, bleuObjectiveWeight, bleuScoreWeight, + featureValues, bleuScores, modelScores, numReturnedTranslations, realBleu, distinct, rank, epoch, + search, filename); + } +} + +vector< vector > MosesDecoder::runDecoder(const std::string& source, + size_t sentenceid, + size_t nBestSize, + float bleuObjectiveWeight, + float bleuScoreWeight, + vector< ScoreComponentCollection>& featureValues, + vector< float>& bleuScores, + vector< float>& modelScores, + size_t numReturnedTranslations, + bool realBleu, + bool distinct, + size_t rank, + size_t epoch, + SearchAlgorithm& search, + string filename) +{ + // run the decoder + m_manager = new Moses::Manager(0,*m_sentence, search); + m_manager->ProcessSentence(); + TrellisPathList nBestList; + m_manager->CalcNBest(nBestSize, nBestList, distinct); + + // optionally print nbest to file (to extract scores and features.. 
currently just for sentence bleu scoring) + if (filename != "") { + ofstream out(filename.c_str()); + if (!out) { + ostringstream msg; + msg << "Unable to open " << filename; + throw runtime_error(msg.str()); + } + // TODO: handle sentence id (for now always 0) + //OutputNBest(out, nBestList, StaticData::Instance().GetOutputFactorOrder(), 0, false); + out.close(); } - vector< vector > MosesDecoder::getNBest(const std::string& source, - size_t sentenceid, - size_t nBestSize, - float bleuObjectiveWeight, - float bleuScoreWeight, - vector< ScoreComponentCollection>& featureValues, - vector< float>& bleuScores, - vector< float>& modelScores, - size_t numReturnedTranslations, - bool realBleu, - bool distinct, - bool avgRefLength, - size_t rank, - size_t epoch, - string filename) - { - StaticData &staticData = StaticData::InstanceNonConst(); - bool chartDecoding = (staticData.GetSearchAlgorithm() == ChartDecoding); - initialize(staticData, source, sentenceid, bleuObjectiveWeight, bleuScoreWeight, avgRefLength, chartDecoding); + // read off the feature values and bleu scores for each sentence in the nbest list + Moses::TrellisPathList::const_iterator iter; + for (iter = nBestList.begin() ; iter != nBestList.end() ; ++iter) { + const Moses::TrellisPath &path = **iter; + featureValues.push_back(path.GetScoreBreakdown()); + float bleuScore, dynBleuScore, realBleuScore; + if (realBleu) realBleuScore = m_bleuScoreFeature->CalculateBleu(path.GetTargetPhrase()); + else dynBleuScore = getBleuScore(featureValues.back()); + bleuScore = realBleu ? realBleuScore : dynBleuScore; + bleuScores.push_back(bleuScore); - // run the decoder - if (chartDecoding) { - return runChartDecoder(source, sentenceid, nBestSize, bleuObjectiveWeight, bleuScoreWeight, - featureValues, bleuScores, modelScores, numReturnedTranslations, realBleu, distinct, rank, epoch); - } - else { - SearchAlgorithm search = staticData.GetSearchAlgorithm(); - return runDecoder(source, sentenceid, nBestSize, bleuObjectiveWeight, bleuScoreWeight, - featureValues, bleuScores, modelScores, numReturnedTranslations, realBleu, distinct, rank, epoch, - search, filename); - } + //std::cout << "Score breakdown: " << path.GetScoreBreakdown() << endl; + float scoreWithoutBleu = path.GetTotalScore() - (bleuObjectiveWeight * bleuScoreWeight * bleuScore); + modelScores.push_back(scoreWithoutBleu); + + if (iter != nBestList.begin()) + cerr << endl; + cerr << "Rank " << rank << ", epoch " << epoch << ", \"" << path.GetTargetPhrase() << "\", score: " + << scoreWithoutBleu << ", Bleu: " << bleuScore << ", total: " << path.GetTotalScore(); + if (m_bleuScoreFeature->Enabled() && realBleu) + cerr << " (d-bleu: " << dynBleuScore << ", r-bleu: " << realBleuScore << ") "; + + // set bleu score to zero in the feature vector since we do not want to optimise its weight + setBleuScore(featureValues.back(), 0); } - vector< vector > MosesDecoder::runDecoder(const std::string& source, - size_t sentenceid, - size_t nBestSize, - float bleuObjectiveWeight, - float bleuScoreWeight, - vector< ScoreComponentCollection>& featureValues, - vector< float>& bleuScores, - vector< float>& modelScores, - size_t numReturnedTranslations, - bool realBleu, - bool distinct, - size_t rank, - size_t epoch, - SearchAlgorithm& search, - string filename) { - // run the decoder - m_manager = new Moses::Manager(0,*m_sentence, search); - m_manager->ProcessSentence(); - TrellisPathList nBestList; - m_manager->CalcNBest(nBestSize, nBestList, distinct); - - // optionally print nbest to file (to extract 
scores and features.. currently just for sentence bleu scoring) - if (filename != "") { - ofstream out(filename.c_str()); - if (!out) { - ostringstream msg; - msg << "Unable to open " << filename; - throw runtime_error(msg.str()); - } - // TODO: handle sentence id (for now always 0) - //OutputNBest(out, nBestList, StaticData::Instance().GetOutputFactorOrder(), 0, false); - out.close(); + // prepare translations to return + vector< vector > translations; + for (size_t i=0; i < numReturnedTranslations && i < nBestList.GetSize(); ++i) { + const TrellisPath &path = nBestList.at(i); + Phrase phrase = path.GetTargetPhrase(); + + vector translation; + for (size_t pos = 0; pos < phrase.GetSize(); ++pos) { + const Word &word = phrase.GetWord(pos); + Word *newWord = new Word(word); + translation.push_back(newWord); } - - // read off the feature values and bleu scores for each sentence in the nbest list - Moses::TrellisPathList::const_iterator iter; - for (iter = nBestList.begin() ; iter != nBestList.end() ; ++iter) { - const Moses::TrellisPath &path = **iter; - featureValues.push_back(path.GetScoreBreakdown()); - float bleuScore, dynBleuScore, realBleuScore; - if (realBleu) realBleuScore = m_bleuScoreFeature->CalculateBleu(path.GetTargetPhrase()); - else dynBleuScore = getBleuScore(featureValues.back()); - bleuScore = realBleu ? realBleuScore : dynBleuScore; - bleuScores.push_back(bleuScore); - - //std::cout << "Score breakdown: " << path.GetScoreBreakdown() << endl; - float scoreWithoutBleu = path.GetTotalScore() - (bleuObjectiveWeight * bleuScoreWeight * bleuScore); - modelScores.push_back(scoreWithoutBleu); - - if (iter != nBestList.begin()) - cerr << endl; - cerr << "Rank " << rank << ", epoch " << epoch << ", \"" << path.GetTargetPhrase() << "\", score: " - << scoreWithoutBleu << ", Bleu: " << bleuScore << ", total: " << path.GetTotalScore(); - if (m_bleuScoreFeature->Enabled() && realBleu) - cerr << " (d-bleu: " << dynBleuScore << ", r-bleu: " << realBleuScore << ") "; - - // set bleu score to zero in the feature vector since we do not want to optimise its weight - setBleuScore(featureValues.back(), 0); - } - - // prepare translations to return - vector< vector > translations; - for (size_t i=0; i < numReturnedTranslations && i < nBestList.GetSize(); ++i) { - const TrellisPath &path = nBestList.at(i); - Phrase phrase = path.GetTargetPhrase(); - - vector translation; - for (size_t pos = 0; pos < phrase.GetSize(); ++pos) { - const Word &word = phrase.GetWord(pos); - Word *newWord = new Word(word); - translation.push_back(newWord); - } - translations.push_back(translation); - } - - return translations; + translations.push_back(translation); } - vector< vector > MosesDecoder::runChartDecoder(const std::string& source, - size_t sentenceid, - size_t nBestSize, - float bleuObjectiveWeight, - float bleuScoreWeight, - vector< ScoreComponentCollection>& featureValues, - vector< float>& bleuScores, - vector< float>& modelScores, - size_t numReturnedTranslations, - bool realBleu, - bool distinct, - size_t rank, - size_t epoch) { - // run the decoder + return translations; +} + +vector< vector > MosesDecoder::runChartDecoder(const std::string& source, + size_t sentenceid, + size_t nBestSize, + float bleuObjectiveWeight, + float bleuScoreWeight, + vector< ScoreComponentCollection>& featureValues, + vector< float>& bleuScores, + vector< float>& modelScores, + size_t numReturnedTranslations, + bool realBleu, + bool distinct, + size_t rank, + size_t epoch) +{ + // run the decoder + m_chartManager = new 
ChartManager(*m_sentence); + m_chartManager->ProcessSentence(); + ChartTrellisPathList nBestList; + m_chartManager->CalcNBest(nBestSize, nBestList, distinct); + + // read off the feature values and bleu scores for each sentence in the nbest list + ChartTrellisPathList::const_iterator iter; + for (iter = nBestList.begin() ; iter != nBestList.end() ; ++iter) { + const Moses::ChartTrellisPath &path = **iter; + featureValues.push_back(path.GetScoreBreakdown()); + float bleuScore, dynBleuScore, realBleuScore; + dynBleuScore = getBleuScore(featureValues.back()); + realBleuScore = m_bleuScoreFeature->CalculateBleu(path.GetOutputPhrase()); + bleuScore = realBleu ? realBleuScore : dynBleuScore; + bleuScores.push_back(bleuScore); + + //std::cout << "Score breakdown: " << path.GetScoreBreakdown() << endl; + float scoreWithoutBleu = path.GetTotalScore() - (bleuObjectiveWeight * bleuScoreWeight * bleuScore); + modelScores.push_back(scoreWithoutBleu); + + if (iter != nBestList.begin()) + cerr << endl; + cerr << "Rank " << rank << ", epoch " << epoch << ", \"" << path.GetOutputPhrase() << "\", score: " + << scoreWithoutBleu << ", Bleu: " << bleuScore << ", total: " << path.GetTotalScore(); + if (m_bleuScoreFeature->Enabled() && realBleu) + cerr << " (d-bleu: " << dynBleuScore << ", r-bleu: " << realBleuScore << ") "; + + // set bleu score to zero in the feature vector since we do not want to optimise its weight + setBleuScore(featureValues.back(), 0); + } + + // prepare translations to return + vector< vector > translations; + for (iter = nBestList.begin() ; iter != nBestList.end() ; ++iter) { + const ChartTrellisPath &path = **iter; + Phrase phrase = path.GetOutputPhrase(); + + vector translation; + for (size_t pos = 0; pos < phrase.GetSize(); ++pos) { + const Word &word = phrase.GetWord(pos); + Word *newWord = new Word(word); + translation.push_back(newWord); + } + translations.push_back(translation); + } + + return translations; +} + +void MosesDecoder::outputNBestList(const std::string& source, size_t sentenceid, + size_t nBestSize, float bleuObjectiveWeight, float bleuScoreWeight, + bool distinctNbest, bool avgRefLength, string filename, ofstream& streamOut) +{ + StaticData &staticData = StaticData::InstanceNonConst(); + bool chartDecoding = (staticData.GetSearchAlgorithm() == ChartDecoding); + initialize(staticData, source, sentenceid, bleuObjectiveWeight, bleuScoreWeight, avgRefLength, chartDecoding); + + if (chartDecoding) { m_chartManager = new ChartManager(*m_sentence); m_chartManager->ProcessSentence(); ChartTrellisPathList nBestList; - m_chartManager->CalcNBest(nBestSize, nBestList, distinct); + m_chartManager->CalcNBest(nBestSize, nBestList, distinctNbest); - // read off the feature values and bleu scores for each sentence in the nbest list - ChartTrellisPathList::const_iterator iter; - for (iter = nBestList.begin() ; iter != nBestList.end() ; ++iter) { - const Moses::ChartTrellisPath &path = **iter; - featureValues.push_back(path.GetScoreBreakdown()); - float bleuScore, dynBleuScore, realBleuScore; - dynBleuScore = getBleuScore(featureValues.back()); - realBleuScore = m_bleuScoreFeature->CalculateBleu(path.GetOutputPhrase()); - bleuScore = realBleu ? 
realBleuScore : dynBleuScore; - bleuScores.push_back(bleuScore); - - //std::cout << "Score breakdown: " << path.GetScoreBreakdown() << endl; - float scoreWithoutBleu = path.GetTotalScore() - (bleuObjectiveWeight * bleuScoreWeight * bleuScore); - modelScores.push_back(scoreWithoutBleu); - - if (iter != nBestList.begin()) - cerr << endl; - cerr << "Rank " << rank << ", epoch " << epoch << ", \"" << path.GetOutputPhrase() << "\", score: " - << scoreWithoutBleu << ", Bleu: " << bleuScore << ", total: " << path.GetTotalScore(); - if (m_bleuScoreFeature->Enabled() && realBleu) - cerr << " (d-bleu: " << dynBleuScore << ", r-bleu: " << realBleuScore << ") "; - - // set bleu score to zero in the feature vector since we do not want to optimise its weight - setBleuScore(featureValues.back(), 0); - } - - // prepare translations to return - vector< vector > translations; - for (iter = nBestList.begin() ; iter != nBestList.end() ; ++iter) { - const ChartTrellisPath &path = **iter; - Phrase phrase = path.GetOutputPhrase(); - - vector translation; - for (size_t pos = 0; pos < phrase.GetSize(); ++pos) { - const Word &word = phrase.GetWord(pos); - Word *newWord = new Word(word); - translation.push_back(newWord); - } - translations.push_back(translation); - } - - return translations; - } - - void MosesDecoder::outputNBestList(const std::string& source, size_t sentenceid, - size_t nBestSize, float bleuObjectiveWeight, float bleuScoreWeight, - bool distinctNbest, bool avgRefLength, string filename, ofstream& streamOut) { - StaticData &staticData = StaticData::InstanceNonConst(); - bool chartDecoding = (staticData.GetSearchAlgorithm() == ChartDecoding); - initialize(staticData, source, sentenceid, bleuObjectiveWeight, bleuScoreWeight, avgRefLength, chartDecoding); - - if (chartDecoding) { - m_chartManager = new ChartManager(*m_sentence); - m_chartManager->ProcessSentence(); - ChartTrellisPathList nBestList; - m_chartManager->CalcNBest(nBestSize, nBestList, distinctNbest); - - cerr << "generate nbest list " << filename << endl; - cerr << "not implemented.." << endl; - exit(1); - if (filename != "") { - ofstream out(filename.c_str()); - if (!out) { - ostringstream msg; - msg << "Unable to open " << filename; - throw runtime_error(msg.str()); - } - // TODO: handle sentence id (for now always 0) + cerr << "generate nbest list " << filename << endl; + cerr << "not implemented.." 
<< endl; + exit(1); + if (filename != "") { + ofstream out(filename.c_str()); + if (!out) { + ostringstream msg; + msg << "Unable to open " << filename; + throw runtime_error(msg.str()); + } + // TODO: handle sentence id (for now always 0) // OutputNBestList(const ChartTrellisPathList &nBestList, const ChartHypothesis *bestHypo, const TranslationSystem* system, long translationId, false) // OutputNBest(out, nBestList, StaticData::Instance().GetOutputFactorOrder(),m_manager->GetTranslationSystem(), 0, false); - out.close(); - } - else { + out.close(); + } else { // OutputNBest(streamOut, nBestList, StaticData::Instance().GetOutputFactorOrder(),m_manager->GetTranslationSystem(), sentenceid, false); - } } - else { - // run the decoder - m_manager = new Moses::Manager(0,*m_sentence, staticData.GetSearchAlgorithm()); - m_manager->ProcessSentence(); - TrellisPathList nBestList; - m_manager->CalcNBest(nBestSize, nBestList, distinctNbest); + } else { + // run the decoder + m_manager = new Moses::Manager(0,*m_sentence, staticData.GetSearchAlgorithm()); + m_manager->ProcessSentence(); + TrellisPathList nBestList; + m_manager->CalcNBest(nBestSize, nBestList, distinctNbest); - if (filename != "") { - ofstream out(filename.c_str()); - if (!out) { - ostringstream msg; - msg << "Unable to open " << filename; - throw runtime_error(msg.str()); - } - // TODO: handle sentence id (for now always 0) - //OutputNBest(out, nBestList, StaticData::Instance().GetOutputFactorOrder(),m_manager->GetTranslationSystem(), 0, false); - out.close(); - } - else { - //OutputNBest(streamOut, nBestList, StaticData::Instance().GetOutputFactorOrder(),m_manager->GetTranslationSystem(), sentenceid, false); - streamOut.flush(); + if (filename != "") { + ofstream out(filename.c_str()); + if (!out) { + ostringstream msg; + msg << "Unable to open " << filename; + throw runtime_error(msg.str()); } + // TODO: handle sentence id (for now always 0) + //OutputNBest(out, nBestList, StaticData::Instance().GetOutputFactorOrder(),m_manager->GetTranslationSystem(), 0, false); + out.close(); + } else { + //OutputNBest(streamOut, nBestList, StaticData::Instance().GetOutputFactorOrder(),m_manager->GetTranslationSystem(), sentenceid, false); + streamOut.flush(); } } +} - void MosesDecoder::initialize(StaticData& staticData, const std::string& source, size_t sentenceid, - float bleuObjectiveWeight, float bleuScoreWeight, bool avgRefLength, bool chartDecoding) { - m_sentence = new Sentence(); - stringstream in(source + "\n"); - const std::vector &inputFactorOrder = staticData.GetInputFactorOrder(); - m_sentence->Read(in,inputFactorOrder); +void MosesDecoder::initialize(StaticData& staticData, const std::string& source, size_t sentenceid, + float bleuObjectiveWeight, float bleuScoreWeight, bool avgRefLength, bool chartDecoding) +{ + m_sentence = new Sentence(); + stringstream in(source + "\n"); + const std::vector &inputFactorOrder = staticData.GetInputFactorOrder(); + m_sentence->Read(in,inputFactorOrder); - // set weight of BleuScoreFeature - //cerr << "Reload Bleu feature weight: " << bleuObjectiveWeight*bleuScoreWeight << " (" << bleuObjectiveWeight << "*" << bleuScoreWeight << ")" << endl; - staticData.ReLoadBleuScoreFeatureParameter(bleuObjectiveWeight*bleuScoreWeight); + // set weight of BleuScoreFeature + //cerr << "Reload Bleu feature weight: " << bleuObjectiveWeight*bleuScoreWeight << " (" << bleuObjectiveWeight << "*" << bleuScoreWeight << ")" << endl; + staticData.ReLoadBleuScoreFeatureParameter(bleuObjectiveWeight*bleuScoreWeight); - 
-  void MosesDecoder::initialize(StaticData& staticData, const std::string& source, size_t sentenceid,
-      float bleuObjectiveWeight, float bleuScoreWeight, bool avgRefLength, bool chartDecoding) {
-    m_sentence = new Sentence();
-    stringstream in(source + "\n");
-    const std::vector<FactorType> &inputFactorOrder = staticData.GetInputFactorOrder();
-    m_sentence->Read(in,inputFactorOrder);
+void MosesDecoder::initialize(StaticData& staticData, const std::string& source, size_t sentenceid,
+                              float bleuObjectiveWeight, float bleuScoreWeight, bool avgRefLength, bool chartDecoding)
+{
+  m_sentence = new Sentence();
+  stringstream in(source + "\n");
+  const std::vector<FactorType> &inputFactorOrder = staticData.GetInputFactorOrder();
+  m_sentence->Read(in,inputFactorOrder);

-    // set weight of BleuScoreFeature
-    //cerr << "Reload Bleu feature weight: " << bleuObjectiveWeight*bleuScoreWeight << " (" << bleuObjectiveWeight << "*" << bleuScoreWeight << ")" << endl;
-    staticData.ReLoadBleuScoreFeatureParameter(bleuObjectiveWeight*bleuScoreWeight);
+  // set weight of BleuScoreFeature
+  //cerr << "Reload Bleu feature weight: " << bleuObjectiveWeight*bleuScoreWeight << " (" << bleuObjectiveWeight << "*" << bleuScoreWeight << ")" << endl;
+  staticData.ReLoadBleuScoreFeatureParameter(bleuObjectiveWeight*bleuScoreWeight);

-    m_bleuScoreFeature->SetCurrSourceLength((*m_sentence).GetSize());
-    if (chartDecoding)
-      m_bleuScoreFeature->SetCurrNormSourceLength((*m_sentence).GetSize()-2);
-    else
-      m_bleuScoreFeature->SetCurrNormSourceLength((*m_sentence).GetSize());
-
-    if (avgRefLength)
-      m_bleuScoreFeature->SetCurrAvgRefLength(sentenceid);
-    else
-      m_bleuScoreFeature->SetCurrShortestRefLength(sentenceid);
-    m_bleuScoreFeature->SetCurrReferenceNgrams(sentenceid);
-  }
+  m_bleuScoreFeature->SetCurrSourceLength((*m_sentence).GetSize());
+  if (chartDecoding)
+    m_bleuScoreFeature->SetCurrNormSourceLength((*m_sentence).GetSize()-2);
+  else
+    m_bleuScoreFeature->SetCurrNormSourceLength((*m_sentence).GetSize());

-  float MosesDecoder::getBleuScore(const ScoreComponentCollection& scores) {
-    return scores.GetScoreForProducer(m_bleuScoreFeature);
-  }
+  if (avgRefLength)
+    m_bleuScoreFeature->SetCurrAvgRefLength(sentenceid);
+  else
+    m_bleuScoreFeature->SetCurrShortestRefLength(sentenceid);
+  m_bleuScoreFeature->SetCurrReferenceNgrams(sentenceid);
+}

-  void MosesDecoder::setBleuScore(ScoreComponentCollection& scores, float bleu) {
-    scores.Assign(m_bleuScoreFeature, bleu);
-  }
+float MosesDecoder::getBleuScore(const ScoreComponentCollection& scores)
+{
+  return scores.GetScoreForProducer(m_bleuScoreFeature);
+}

-  ScoreComponentCollection MosesDecoder::getWeights() {
-    return StaticData::Instance().GetAllWeights();
-  }
+void MosesDecoder::setBleuScore(ScoreComponentCollection& scores, float bleu)
+{
+  scores.Assign(m_bleuScoreFeature, bleu);
+}

-  void MosesDecoder::setWeights(const ScoreComponentCollection& weights) {
-    StaticData::InstanceNonConst().SetAllWeights(weights);
-  }
+ScoreComponentCollection MosesDecoder::getWeights()
+{
+  return StaticData::Instance().GetAllWeights();
+}

-  void MosesDecoder::updateHistory(const vector<const Word*>& words) {
-    m_bleuScoreFeature->UpdateHistory(words);
-  }
+void MosesDecoder::setWeights(const ScoreComponentCollection& weights)
+{
+  StaticData::InstanceNonConst().SetAllWeights(weights);
+}

-  void MosesDecoder::updateHistory(const vector< vector< const Word*> >& words, vector<size_t>& sourceLengths, vector<size_t>& ref_ids, size_t rank, size_t epoch) {
-    m_bleuScoreFeature->UpdateHistory(words, sourceLengths, ref_ids, rank, epoch);
-  }
+void MosesDecoder::updateHistory(const vector<const Word*>& words)
+{
+  m_bleuScoreFeature->UpdateHistory(words);
+}

-  void MosesDecoder::printBleuFeatureHistory(std::ostream& out) {
-    m_bleuScoreFeature->PrintHistory(out);
-  }
+void MosesDecoder::updateHistory(const vector< vector< const Word*> >& words, vector<size_t>& sourceLengths, vector<size_t>& ref_ids, size_t rank, size_t epoch)
+{
+  m_bleuScoreFeature->UpdateHistory(words, sourceLengths, ref_ids, rank, epoch);
+}

-  size_t MosesDecoder::getClosestReferenceLength(size_t ref_id, int hypoLength) {
-    return m_bleuScoreFeature->GetClosestRefLength(ref_id, hypoLength);
-  }
+void MosesDecoder::printBleuFeatureHistory(std::ostream& out)
+{
+  m_bleuScoreFeature->PrintHistory(out);
+}

-  size_t MosesDecoder::getShortestReferenceIndex(size_t ref_id) {
-    return m_bleuScoreFeature->GetShortestRefIndex(ref_id);
-  }
+size_t MosesDecoder::getClosestReferenceLength(size_t ref_id, int hypoLength)
+{
+  return m_bleuScoreFeature->GetClosestRefLength(ref_id, hypoLength);
+}

-  void MosesDecoder::setBleuParameters(bool disable, bool sentenceBleu, bool scaleByInputLength, bool scaleByAvgInputLength,
-                                       bool scaleByInverseLength, bool scaleByAvgInverseLength,
-                                       float scaleByX, float historySmoothing, size_t scheme, bool simpleHistoryBleu) {
-    m_bleuScoreFeature->SetBleuParameters(disable, sentenceBleu, scaleByInputLength, scaleByAvgInputLength,
-                                          scaleByInverseLength, scaleByAvgInverseLength,
-                                          scaleByX, historySmoothing, scheme, simpleHistoryBleu);
-  }
-}
+size_t MosesDecoder::getShortestReferenceIndex(size_t ref_id)
+{
+  return m_bleuScoreFeature->GetShortestRefIndex(ref_id);
+}
+
+void MosesDecoder::setBleuParameters(bool disable, bool sentenceBleu, bool scaleByInputLength, bool scaleByAvgInputLength,
+                                     bool scaleByInverseLength, bool scaleByAvgInverseLength,
+                                     float scaleByX, float historySmoothing, size_t scheme, bool simpleHistoryBleu)
+{
+  m_bleuScoreFeature->SetBleuParameters(disable, sentenceBleu, scaleByInputLength, scaleByAvgInputLength,
+                                        scaleByInverseLength, scaleByAvgInverseLength,
+                                        scaleByX, historySmoothing, scheme, simpleHistoryBleu);
+}
+}
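[Editor's note, not part of the patch: the updateHistory() overloads above feed 1-best translations into BleuScoreFeature's running pseudo-document, which is what makes sentence-level BLEU usable as a training signal. The real bookkeeping lives in BleuScoreFeature::UpdateHistory(); the sketch below only illustrates the general exponential-decay idea suggested by the historySmoothing parameter, and is an assumption, not the actual implementation:]

    // Hypothetical illustration of smoothed BLEU history (NOT Moses code):
    // running n-gram statistics are decayed by historySmoothing each time a
    // new 1-best translation is folded in.
    struct PseudoDocBleuHistory {
      float matches[4];  // clipped n-gram matches, n = 1..4
      float counts[4];   // total n-gram counts, n = 1..4

      void fold_in(const float newMatches[4], const float newCounts[4],
                   float historySmoothing) {
        for (int n = 0; n < 4; ++n) {
          matches[n] = historySmoothing * (matches[n] + newMatches[n]);
          counts[n]  = historySmoothing * (counts[n] + newCounts[n]);
        }
      }
    };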
diff --git a/mira/Decoder.h b/mira/Decoder.h
index 49a33d4d0..ac8acc26b 100644
--- a/mira/Decoder.h
+++ b/mira/Decoder.h
@@ -36,100 +36,110 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 // Wrapper functions and objects for the decoder.
 //
-namespace Mira {
-
+namespace Mira
+{
+
 /**
  * Wraps moses decoder.
 **/
-class MosesDecoder {
-  public:
-    /**
-     * Initialise moses (including StaticData) using the given ini file and debuglevel, passing through any
-     * other command line arguments.
-     **/
-    MosesDecoder(const std::string& inifile, int debuglevel, int argc, std::vector<std::string> decoder_params);
-
-    //returns the best sentence
-    std::vector< std::vector<const Moses::Word*> > getNBest(const std::string& source,
-        size_t sentenceid,
-        size_t nbestSize,
-        float bleuObjectiveweight, //weight of bleu in objective
-        float bleuScoreWeight, //weight of bleu in score
-        std::vector< Moses::ScoreComponentCollection>& featureValues,
-        std::vector< float>& bleuScores,
-        std::vector< float>& modelScores,
-        size_t numReturnedTranslations,
-        bool realBleu,
-        bool distinct,
-        bool avgRefLength,
-        size_t rank,
-        size_t epoch,
-        std::string filename);
-    std::vector< std::vector<const Moses::Word*> > runDecoder(const std::string& source,
-        size_t sentenceid,
-        size_t nbestSize,
-        float bleuObjectiveweight, //weight of bleu in objective
-        float bleuScoreWeight, //weight of bleu in score
-        std::vector< Moses::ScoreComponentCollection>& featureValues,
-        std::vector< float>& bleuScores,
-        std::vector< float>& modelScores,
-        size_t numReturnedTranslations,
-        bool realBleu,
-        bool distinct,
-        size_t rank,
-        size_t epoch,
-        Moses::SearchAlgorithm& seach,
-        std::string filename);
-    std::vector< std::vector<const Moses::Word*> > runChartDecoder(const std::string& source,
-        size_t sentenceid,
-        size_t nbestSize,
-        float bleuObjectiveweight, //weight of bleu in objective
-        float bleuScoreWeight, //weight of bleu in score
-        std::vector< Moses::ScoreComponentCollection>& featureValues,
-        std::vector< float>& bleuScores,
-        std::vector< float>& modelScores,
-        size_t numReturnedTranslations,
-        bool realBleu,
-        bool distinct,
-        size_t rank,
-        size_t epoch);
-    void outputNBestList(const std::string& source,
-        size_t sentenceid,
-        size_t nBestSize,
-        float bleuObjectiveWeight,
-        float bleuScoreWeight,
-        bool distinctNbest,
-        bool avgRefLength,
-        std::string filename,
-        std::ofstream& streamOut);
-    void initialize(Moses::StaticData& staticData, const std::string& source, size_t sentenceid,
-        float bleuObjectiveWeight, float bleuScoreWeight, bool avgRefLength, bool chartDecoding);
-    void updateHistory(const std::vector<const Moses::Word*>& words);
-    void updateHistory(const std::vector< std::vector< const Moses::Word*> >& words, std::vector<size_t>& sourceLengths, std::vector<size_t>& ref_ids, size_t rank, size_t epoch);
-    void printBleuFeatureHistory(std::ostream& out);
-    void printReferenceLength(const std::vector<size_t>& ref_ids);
-    size_t getReferenceLength(size_t ref_id);
-    size_t getClosestReferenceLength(size_t ref_id, int hypoLength);
-    size_t getShortestReferenceIndex(size_t ref_id);
-    void setBleuParameters(bool disable, bool sentenceBleu, bool scaleByInputLength, bool scaleByAvgInputLength,
-        bool scaleByInverseLength, bool scaleByAvgInverseLength,
-        float scaleByX, float historySmoothing, size_t scheme, bool simpleHistoryBleu);
-    void setAvgInputLength (float l) { m_bleuScoreFeature->SetAvgInputLength(l); }
-    Moses::ScoreComponentCollection getWeights();
-    void setWeights(const Moses::ScoreComponentCollection& weights);
-    void cleanup(bool chartDecoding);
-
-    float getSourceLengthHistory() { return m_bleuScoreFeature->GetSourceLengthHistory(); }
-    float getTargetLengthHistory() { return m_bleuScoreFeature->GetTargetLengthHistory(); }
-    float getAverageInputLength() { return m_bleuScoreFeature->GetAverageInputLength(); }
+class MosesDecoder
+{
+public:
+  /**
+   * Initialise moses (including StaticData) using the given ini file and debuglevel, passing through any
+   * other command line arguments.
+   **/
+  MosesDecoder(const std::string& inifile, int debuglevel, int argc, std::vector<std::string> decoder_params);

-  private:
-    float getBleuScore(const Moses::ScoreComponentCollection& scores);
-    void setBleuScore(Moses::ScoreComponentCollection& scores, float bleu);
-    Moses::Manager *m_manager;
-    Moses::ChartManager *m_chartManager;
-    Moses::Sentence *m_sentence;
-    Moses::BleuScoreFeature *m_bleuScoreFeature;
+  //returns the best sentence
+  std::vector< std::vector<const Moses::Word*> > getNBest(const std::string& source,
+      size_t sentenceid,
+      size_t nbestSize,
+      float bleuObjectiveweight, //weight of bleu in objective
+      float bleuScoreWeight, //weight of bleu in score
+      std::vector< Moses::ScoreComponentCollection>& featureValues,
+      std::vector< float>& bleuScores,
+      std::vector< float>& modelScores,
+      size_t numReturnedTranslations,
+      bool realBleu,
+      bool distinct,
+      bool avgRefLength,
+      size_t rank,
+      size_t epoch,
+      std::string filename);
+  std::vector< std::vector<const Moses::Word*> > runDecoder(const std::string& source,
+      size_t sentenceid,
+      size_t nbestSize,
+      float bleuObjectiveweight, //weight of bleu in objective
+      float bleuScoreWeight, //weight of bleu in score
+      std::vector< Moses::ScoreComponentCollection>& featureValues,
+      std::vector< float>& bleuScores,
+      std::vector< float>& modelScores,
+      size_t numReturnedTranslations,
+      bool realBleu,
+      bool distinct,
+      size_t rank,
+      size_t epoch,
+      Moses::SearchAlgorithm& search,
+      std::string filename);
+  std::vector< std::vector<const Moses::Word*> > runChartDecoder(const std::string& source,
+      size_t sentenceid,
+      size_t nbestSize,
+      float bleuObjectiveweight, //weight of bleu in objective
+      float bleuScoreWeight, //weight of bleu in score
+      std::vector< Moses::ScoreComponentCollection>& featureValues,
+      std::vector< float>& bleuScores,
+      std::vector< float>& modelScores,
+      size_t numReturnedTranslations,
+      bool realBleu,
+      bool distinct,
+      size_t rank,
+      size_t epoch);
+  void outputNBestList(const std::string& source,
+      size_t sentenceid,
+      size_t nBestSize,
+      float bleuObjectiveWeight,
+      float bleuScoreWeight,
+      bool distinctNbest,
+      bool avgRefLength,
+      std::string filename,
+      std::ofstream& streamOut);
+  void initialize(Moses::StaticData& staticData, const std::string& source, size_t sentenceid,
+                  float bleuObjectiveWeight, float bleuScoreWeight, bool avgRefLength, bool chartDecoding);
+  void updateHistory(const std::vector<const Moses::Word*>& words);
+  void updateHistory(const std::vector< std::vector< const Moses::Word*> >& words, std::vector<size_t>& sourceLengths, std::vector<size_t>& ref_ids, size_t rank, size_t epoch);
+  void printBleuFeatureHistory(std::ostream& out);
+  void printReferenceLength(const std::vector<size_t>& ref_ids);
+  size_t getReferenceLength(size_t ref_id);
+  size_t getClosestReferenceLength(size_t ref_id, int hypoLength);
+  size_t getShortestReferenceIndex(size_t ref_id);
+  void setBleuParameters(bool disable, bool sentenceBleu, bool scaleByInputLength, bool scaleByAvgInputLength,
+                         bool scaleByInverseLength, bool scaleByAvgInverseLength,
+                         float scaleByX, float historySmoothing, size_t scheme, bool simpleHistoryBleu);
+  void setAvgInputLength (float l) {
+    m_bleuScoreFeature->SetAvgInputLength(l);
+  }
+  Moses::ScoreComponentCollection getWeights();
+  void setWeights(const Moses::ScoreComponentCollection& weights);
+  void cleanup(bool chartDecoding);
+
+  float getSourceLengthHistory() {
+    return m_bleuScoreFeature->GetSourceLengthHistory();
+  }
+  float getTargetLengthHistory() {
+    return m_bleuScoreFeature->GetTargetLengthHistory();
+  }
+  float getAverageInputLength() {
+    return m_bleuScoreFeature->GetAverageInputLength();
+  }
+
+private:
+  float getBleuScore(const Moses::ScoreComponentCollection& scores);
+  void setBleuScore(Moses::ScoreComponentCollection& scores, float bleu);
+  Moses::Manager *m_manager;
+  Moses::ChartManager *m_chartManager;
+  Moses::Sentence *m_sentence;
+  Moses::BleuScoreFeature *m_bleuScoreFeature;
 };
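[Editor's note, not part of the patch: a hedged sketch of how the interface declared above is typically driven from a MIRA training loop. Only the MosesDecoder signatures are taken from this header; every other name and every constant below is illustrative.]

    // Illustrative call pattern: the three output vectors are filled in
    // parallel, one entry per n-best hypothesis, so callers index them together.
    std::vector<Moses::ScoreComponentCollection> featureValues;
    std::vector<float> bleuScores, modelScores;
    std::vector< std::vector<const Moses::Word*> > nbest =
        decoder.getNBest(source, sentenceId,
                         30,          // n-best list size (illustrative)
                         1.0f, 5.0f,  // bleuObjectiveweight, bleuScoreWeight
                         featureValues, bleuScores, modelScores,
                         30,          // numReturnedTranslations
                         false,       // realBleu: use dynamic (history) BLEU
                         true,        // distinct n-best entries
                         false,       // avgRefLength
                         rank, epoch,
                         "");         // no n-best dump file
    // featureValues[i], bleuScores[i] and modelScores[i] all describe nbest[i].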
diff --git a/mira/Hildreth.cpp b/mira/Hildreth.cpp
index 53d1e0881..03076e767 100644
--- a/mira/Hildreth.cpp
+++ b/mira/Hildreth.cpp
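[Editor's note, not part of the patch: both optimise() overloads in the hunk below implement Hildreth's algorithm, i.e. coordinate ascent on the dual of the MIRA quadratic program. With constraint rows a_i (feature-value differences) and margins b_i (loss minus model-score difference), the code maximises]

    \max_{\alpha}\; \sum_i \alpha_i b_i \;-\; \tfrac{1}{2} \sum_{i,j} \alpha_i \alpha_j \,(a_i \cdot a_j)
    \qquad \text{s.t. } \alpha_i \ge 0
    \quad (\text{and } \alpha_i \le C \text{ in the slack-bounded overload}).

[Each iteration picks the index with the largest KKT violation (max_kkt_i), takes the unconstrained step F_i / A_{ii}, and clips it to the feasible interval; the caller then applies w := w + sum_i alpha_i a_i, which is exactly what the tests further below do.]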
@@ -3,186 +3,173 @@
 using namespace Moses;
 using namespace std;
-namespace Mira {
+namespace Mira
+{
-  vector<float> Hildreth::optimise (const vector<ScoreComponentCollection>& a, const vector<float>& b) {
+vector<float> Hildreth::optimise (const vector<ScoreComponentCollection>& a, const vector<float>& b)
+{
-    size_t i;
-    int max_iter = 10000;
-    float eps = 0.00000001;
-    float zero = 0.000000000001;
+  size_t i;
+  int max_iter = 10000;
+  float eps = 0.00000001;
+  float zero = 0.000000000001;
-    vector<float> alpha ( b.size() );
-    vector<float> F ( b.size() );
-    vector<float> kkt ( b.size() );
+  vector<float> alpha ( b.size() );
+  vector<float> F ( b.size() );
+  vector<float> kkt ( b.size() );
-    float max_kkt = -1e100;
+  float max_kkt = -1e100;
-    size_t K = b.size();
+  size_t K = b.size();
-    float A[K][K];
-    bool is_computed[K];
-    for ( i = 0; i < K; i++ )
-    {
-      A[i][i] = a[i].InnerProduct(a[i]);
-      is_computed[i] = false;
+  float A[K][K];
+  bool is_computed[K];
+  for ( i = 0; i < K; i++ ) {
+    A[i][i] = a[i].InnerProduct(a[i]);
+    is_computed[i] = false;
+  }
+
+  int max_kkt_i = -1;
+
+
+  for ( i = 0; i < b.size(); i++ ) {
+    F[i] = b[i];
+    kkt[i] = F[i];
+    if ( kkt[i] > max_kkt ) {
+      max_kkt = kkt[i];
+      max_kkt_i = i;
+    }
+  }
+
+  int iter = 0;
+  float diff_alpha;
+  float try_alpha;
+  float add_alpha;
+
+  while ( max_kkt >= eps && iter < max_iter ) {
+
+    diff_alpha = A[max_kkt_i][max_kkt_i] <= zero ? 0.0 : F[max_kkt_i]/A[max_kkt_i][max_kkt_i];
+    try_alpha = alpha[max_kkt_i] + diff_alpha;
+    add_alpha = 0.0;
+
+    if ( try_alpha < 0.0 )
+      add_alpha = -1.0 * alpha[max_kkt_i];
+    else
+      add_alpha = diff_alpha;
+
+    alpha[max_kkt_i] = alpha[max_kkt_i] + add_alpha;
+
+    if ( !is_computed[max_kkt_i] ) {
+      for ( i = 0; i < K; i++ ) {
+        A[i][max_kkt_i] = a[i].InnerProduct(a[max_kkt_i] ); // for version 1
+        //A[i][max_kkt_i] = 0; // for version 1
+        is_computed[max_kkt_i] = true;
+      }
     }

-    int max_kkt_i = -1;
-
-
-    for ( i = 0; i < b.size(); i++ )
-    {
-      F[i] = b[i];
+    for ( i = 0; i < F.size(); i++ ) {
+      F[i] -= add_alpha * A[i][max_kkt_i];
       kkt[i] = F[i];
-      if ( kkt[i] > max_kkt )
-      {
+      if ( alpha[i] > zero )
+        kkt[i] = abs ( F[i] );
+    }
+    max_kkt = -1e100;
+    max_kkt_i = -1;
+    for ( i = 0; i < F.size(); i++ )
+      if ( kkt[i] > max_kkt ) {
        max_kkt = kkt[i];
        max_kkt_i = i;
      }
-    }

-    int iter = 0;
-    float diff_alpha;
-    float try_alpha;
-    float add_alpha;
-
-    while ( max_kkt >= eps && iter < max_iter )
-    {
-
-      diff_alpha = A[max_kkt_i][max_kkt_i] <= zero ? 0.0 : F[max_kkt_i]/A[max_kkt_i][max_kkt_i];
-      try_alpha = alpha[max_kkt_i] + diff_alpha;
-      add_alpha = 0.0;
-
-      if ( try_alpha < 0.0 )
-        add_alpha = -1.0 * alpha[max_kkt_i];
-      else
-        add_alpha = diff_alpha;
-
-      alpha[max_kkt_i] = alpha[max_kkt_i] + add_alpha;
-
-      if ( !is_computed[max_kkt_i] )
-      {
-        for ( i = 0; i < K; i++ )
-        {
-          A[i][max_kkt_i] = a[i].InnerProduct(a[max_kkt_i] ); // for version 1
-          //A[i][max_kkt_i] = 0; // for version 1
-          is_computed[max_kkt_i] = true;
-        }
-      }
-
-      for ( i = 0; i < F.size(); i++ )
-      {
-        F[i] -= add_alpha * A[i][max_kkt_i];
-        kkt[i] = F[i];
-        if ( alpha[i] > zero )
-          kkt[i] = abs ( F[i] );
-      }
-      max_kkt = -1e100;
-      max_kkt_i = -1;
-      for ( i = 0; i < F.size(); i++ )
-        if ( kkt[i] > max_kkt )
-        {
-          max_kkt = kkt[i];
-          max_kkt_i = i;
-        }
-
-      iter++;
-    }
-
-    return alpha;
+    iter++;
   }

-  vector<float> Hildreth::optimise (const vector<ScoreComponentCollection>& a, const vector<float>& b, float C) {
-
-    size_t i;
-    int max_iter = 10000;
-    float eps = 0.00000001;
-    float zero = 0.000000000001;
-
-    vector<float> alpha ( b.size() );
-    vector<float> F ( b.size() );
-    vector<float> kkt ( b.size() );
-
-    float max_kkt = -1e100;
-
-    size_t K = b.size();
-
-    float A[K][K];
-    bool is_computed[K];
-    for ( i = 0; i < K; i++ )
-    {
-      A[i][i] = a[i].InnerProduct(a[i]);
-      is_computed[i] = false;
-    }
-
-    int max_kkt_i = -1;
-
-
-    for ( i = 0; i < b.size(); i++ )
-    {
-      F[i] = b[i];
-      kkt[i] = F[i];
-      if ( kkt[i] > max_kkt )
-      {
-        max_kkt = kkt[i];
-        max_kkt_i = i;
-      }
-    }
-
-    int iter = 0;
-    float diff_alpha;
-    float try_alpha;
-    float add_alpha;
-
-    while ( max_kkt >= eps && iter < max_iter )
-    {
-
-      diff_alpha = A[max_kkt_i][max_kkt_i] <= zero ? 0.0 : F[max_kkt_i]/A[max_kkt_i][max_kkt_i];
-      try_alpha = alpha[max_kkt_i] + diff_alpha;
-      add_alpha = 0.0;
-
-      if ( try_alpha < 0.0 )
-        add_alpha = -1.0 * alpha[max_kkt_i];
-      else if (try_alpha > C)
-        add_alpha = C - alpha[max_kkt_i];
-      else
-        add_alpha = diff_alpha;
-
-      alpha[max_kkt_i] = alpha[max_kkt_i] + add_alpha;
-
-      if ( !is_computed[max_kkt_i] )
-      {
-        for ( i = 0; i < K; i++ )
-        {
-          A[i][max_kkt_i] = a[i].InnerProduct(a[max_kkt_i] ); // for version 1
-          //A[i][max_kkt_i] = 0; // for version 1
-          is_computed[max_kkt_i] = true;
-        }
-      }
-
-      for ( i = 0; i < F.size(); i++ )
-      {
-        F[i] -= add_alpha * A[i][max_kkt_i];
-        kkt[i] = F[i];
-        if (alpha[i] > C - zero)
-          kkt[i]=-kkt[i];
-        else if (alpha[i] > zero)
-          kkt[i] = abs(F[i]);
-
-      }
-      max_kkt = -1e100;
-      max_kkt_i = -1;
-      for ( i = 0; i < F.size(); i++ )
-        if ( kkt[i] > max_kkt )
-        {
-          max_kkt = kkt[i];
-          max_kkt_i = i;
-        }
-
-      iter++;
-    }
-
-    return alpha;
-  }
+  return alpha;
+}
+
+vector<float> Hildreth::optimise (const vector<ScoreComponentCollection>& a, const vector<float>& b, float C)
+{
+
+  size_t i;
+  int max_iter = 10000;
+  float eps = 0.00000001;
+  float zero = 0.000000000001;
+
+  vector<float> alpha ( b.size() );
+  vector<float> F ( b.size() );
+  vector<float> kkt ( b.size() );
+
+  float max_kkt = -1e100;
+
+  size_t K = b.size();
+
+  float A[K][K];
+  bool is_computed[K];
+  for ( i = 0; i < K; i++ ) {
+    A[i][i] = a[i].InnerProduct(a[i]);
+    is_computed[i] = false;
+  }
+
+  int max_kkt_i = -1;
+
+
+  for ( i = 0; i < b.size(); i++ ) {
+    F[i] = b[i];
+    kkt[i] = F[i];
+    if ( kkt[i] > max_kkt ) {
+      max_kkt = kkt[i];
+      max_kkt_i = i;
+    }
+  }
+
+  int iter = 0;
+  float diff_alpha;
+  float try_alpha;
+  float add_alpha;
+
+  while ( max_kkt >= eps && iter < max_iter ) {
+
+    diff_alpha = A[max_kkt_i][max_kkt_i] <= zero ? 0.0 : F[max_kkt_i]/A[max_kkt_i][max_kkt_i];
+    try_alpha = alpha[max_kkt_i] + diff_alpha;
+    add_alpha = 0.0;
+
+    if ( try_alpha < 0.0 )
+      add_alpha = -1.0 * alpha[max_kkt_i];
+    else if (try_alpha > C)
+      add_alpha = C - alpha[max_kkt_i];
+    else
+      add_alpha = diff_alpha;
+
+    alpha[max_kkt_i] = alpha[max_kkt_i] + add_alpha;
+
+    if ( !is_computed[max_kkt_i] ) {
+      for ( i = 0; i < K; i++ ) {
+        A[i][max_kkt_i] = a[i].InnerProduct(a[max_kkt_i] ); // for version 1
+        //A[i][max_kkt_i] = 0; // for version 1
+        is_computed[max_kkt_i] = true;
+      }
+    }
+
+    for ( i = 0; i < F.size(); i++ ) {
+      F[i] -= add_alpha * A[i][max_kkt_i];
+      kkt[i] = F[i];
+      if (alpha[i] > C - zero)
+        kkt[i]=-kkt[i];
+      else if (alpha[i] > zero)
+        kkt[i] = abs(F[i]);
+
+    }
+    max_kkt = -1e100;
+    max_kkt_i = -1;
+    for ( i = 0; i < F.size(); i++ )
+      if ( kkt[i] > max_kkt ) {
+        max_kkt = kkt[i];
+        max_kkt_i = i;
+      }
+
+    iter++;
+  }
+
+  return alpha;
+}
 }
diff --git a/mira/Hildreth.h b/mira/Hildreth.h
index da52995aa..373f2ac43 100644
--- a/mira/Hildreth.h
+++ b/mira/Hildreth.h
@@ -1,11 +1,13 @@
 #include "moses/FeatureVector.h"
 #include "moses/ScoreComponentCollection.h"
-namespace Mira {
+namespace Mira
+{
-  class Hildreth {
-    public :
-      static std::vector<float> optimise (const std::vector<Moses::ScoreComponentCollection>& a, const std::vector<float>& b );
-      static std::vector<float> optimise (const std::vector<Moses::ScoreComponentCollection>& a, const std::vector<float>& b, float C);
-  };
+class Hildreth
+{
+public :
+  static std::vector<float> optimise (const std::vector<Moses::ScoreComponentCollection>& a, const std::vector<float>& b );
+  static std::vector<float> optimise (const std::vector<Moses::ScoreComponentCollection>& a, const std::vector<float>& b, float C);
+};
 }
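[Editor's note, not part of the patch: the tests in HildrethTest.cpp below all follow one calling convention, extracted here as a minimal sketch so the pattern is visible before the long diff. The Hildreth and ScoreComponentCollection calls are exactly those used in the tests; the wrapper function name and slack value are illustrative.]

    #include <vector>
    #include "Hildreth.h"
    #include "moses/ScoreComponentCollection.h"

    // One MIRA-style update: solve the dual, then apply the alpha-weighted
    // feature-value differences to the current weights.
    void applyHildrethUpdate(std::vector<Moses::ScoreComponentCollection> featureValueDiffs,
                             const std::vector<float>& lossMinusModelScoreDiff,
                             Moses::ScoreComponentCollection& weights,
                             float slack /* e.g. 0.01 */)
    {
      std::vector<float> alphas =
          Mira::Hildreth::optimise(featureValueDiffs, lossMinusModelScoreDiff, slack);

      Moses::FVector totalUpdate = Moses::ScoreComponentCollection::CreateFVector();
      for (size_t k = 0; k < featureValueDiffs.size(); ++k) {
        featureValueDiffs[k].MultiplyEquals(alphas[k]);       // scale constraint k by its dual
        totalUpdate += featureValueDiffs[k].GetScoresVector();
      }
      weights.PlusEquals(totalUpdate);                        // w += sum_k alpha_k * diff_k
    }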
diff --git a/mira/HildrethTest.cpp b/mira/HildrethTest.cpp
index a32dcd1d3..43e4403e4 100644
--- a/mira/HildrethTest.cpp
+++ b/mira/HildrethTest.cpp
@@ -34,22 +34,31 @@ using namespace Mira;
 namespace MosesTest
 {
-class MockSingleFeature : public StatelessFeatureFunction {
-  public:
-    MockSingleFeature(): StatelessFeatureFunction("MockSingle",1) {}
-    std::string GetScoreProducerWeightShortName(unsigned) const {return "sf";}
+class MockSingleFeature : public StatelessFeatureFunction
+{
+public:
+  MockSingleFeature(): StatelessFeatureFunction("MockSingle",1) {}
+  std::string GetScoreProducerWeightShortName(unsigned) const {
+    return "sf";
+  }
 };
-class MockMultiFeature : public StatelessFeatureFunction {
-  public:
-    MockMultiFeature(): StatelessFeatureFunction("MockMulti",5) {}
-    std::string GetScoreProducerWeightShortName(unsigned) const {return "mf";}
+class MockMultiFeature : public StatelessFeatureFunction
+{
+public:
+  MockMultiFeature(): StatelessFeatureFunction("MockMulti",5) {}
+  std::string GetScoreProducerWeightShortName(unsigned) const {
+    return "mf";
+  }
 };
-class MockSparseFeature : public StatelessFeatureFunction {
-  public:
-    MockSparseFeature(): StatelessFeatureFunction("MockSparse", ScoreProducer::unlimited) {}
-    std::string GetScoreProducerWeightShortName(unsigned) const {return "sf";}
+class MockSparseFeature : public StatelessFeatureFunction
+{
+public:
+  MockSparseFeature(): StatelessFeatureFunction("MockSparse", ScoreProducer::unlimited) {}
+  std::string GetScoreProducerWeightShortName(unsigned) const {
+    return "sf";
+  }
 };
 struct MockProducers {
@@ -66,716 +75,716 @@ BOOST_AUTO_TEST_SUITE(hildreth_test)
 BOOST_FIXTURE_TEST_CASE(test_hildreth_1, MockProducers)
 {
-    // Feasible example with 2 constraints
-    cerr << "\n>>>>>Hildreth test, without slack and with 0.01 slack" << endl << endl;
-    vector< ScoreComponentCollection> featureValueDiffs;
-    vector< float> lossMinusModelScoreDiff;
+  // Feasible example with 2 constraints
+  cerr << "\n>>>>>Hildreth test, without slack and with 0.01 slack" << endl << endl;
+  vector< ScoreComponentCollection> featureValueDiffs;
+  vector< float> lossMinusModelScoreDiff;

-    // initial weights
-    float w[] = { 1, 1, 1, 1, 0 };
-    vector<float> vec(w,w+5);
-    ScoreComponentCollection weights;
-    weights.PlusEquals(&multi, vec);
+  // initial weights
+  float w[] = { 1, 1, 1, 1, 0 };
+  vector<float> vec(w,w+5);
+  ScoreComponentCollection weights;
+  weights.PlusEquals(&multi, vec);

-    // feature values (second is oracle)
-    //float arr1[] = {0, -5, -27.0908, -1.83258, 0 };
-    //float arr2[] = {0, -5, -29.158, -1.83258, 0 };
-    //float arr3[] = {0, -5, -27.0908, -1.83258, 0 };
+  // feature values (second is oracle)
+  //float arr1[] = {0, -5, -27.0908, -1.83258, 0 };
+  //float arr2[] = {0, -5, -29.158, -1.83258, 0 };
+  //float arr3[] = {0, -5, -27.0908, -1.83258, 0 };

-    // feature value differences (to oracle)
-    ScoreComponentCollection s1, s2, s3;
-    float arr1[] = { 0, 0, -2.0672, 0, 0 };
-    float arr2[] = { 0, 0, 0, 0, 0 };
-    float arr3[] = { 0, 0, -2.0672, 0, 0 };
+  // feature value differences (to oracle)
+  ScoreComponentCollection s1, s2, s3;
+  float arr1[] = { 0, 0, -2.0672, 0, 0 };
+  float arr2[] = { 0, 0, 0, 0, 0 };
+  float arr3[] = { 0, 0, -2.0672, 0, 0 };

-    float loss1 = 2.34085;
-    float loss2 = 0;
-    float loss3 = 2.34085;
+  float loss1 = 2.34085;
+  float loss2 = 0;
+  float loss3 = 2.34085;

-    vector<float> vec1(arr1,arr1+5);
-    vector<float> vec2(arr2,arr2+5);
-    vector<float> vec3(arr3,arr3+5);
+  vector<float> vec1(arr1,arr1+5);
+  vector<float> vec2(arr2,arr2+5);
+  vector<float> vec3(arr3,arr3+5);

-    s1.PlusEquals(&multi,vec1);
-    s2.PlusEquals(&multi,vec2);
-    s3.PlusEquals(&multi,vec3);
+  s1.PlusEquals(&multi,vec1);
+  s2.PlusEquals(&multi,vec2);
+  s3.PlusEquals(&multi,vec3);

-    featureValueDiffs.push_back(s1);
-    featureValueDiffs.push_back(s2);
-    featureValueDiffs.push_back(s3);
+  featureValueDiffs.push_back(s1);
+  featureValueDiffs.push_back(s2);
+  featureValueDiffs.push_back(s3);

-    cerr << "feature value diff: " << featureValueDiffs[0] << endl;
-    cerr << "feature value diff: " << featureValueDiffs[1] << endl;
-    cerr << "feature value diff: " << featureValueDiffs[2] << endl << endl;
+  cerr << "feature value diff: " << featureValueDiffs[0] << endl;
+  cerr << "feature value diff: " << featureValueDiffs[1] << endl;
+  cerr << "feature value diff: " << featureValueDiffs[2] << endl << endl;

-    float oldModelScoreDiff1 = featureValueDiffs[0].InnerProduct(weights);
-    float oldModelScoreDiff2 = featureValueDiffs[1].InnerProduct(weights);
-    float oldModelScoreDiff3 = featureValueDiffs[2].InnerProduct(weights);
+  float oldModelScoreDiff1 = featureValueDiffs[0].InnerProduct(weights);
+  float oldModelScoreDiff2 = featureValueDiffs[1].InnerProduct(weights);
+  float oldModelScoreDiff3 = featureValueDiffs[2].InnerProduct(weights);

-    cerr << "model score diff: " << oldModelScoreDiff1 << ", loss: " << loss1 << endl;
-    cerr << "model score diff: " << oldModelScoreDiff2 << ", loss: " << loss2 << endl;
-    cerr << "model score diff: " << oldModelScoreDiff3 << ", loss: " << loss3 << endl << endl;
+  cerr << "model score diff: " << oldModelScoreDiff1 << ", loss: " << loss1 << endl;
+  cerr << "model score diff: " << oldModelScoreDiff2 << ", loss: " << loss2 << endl;
+  cerr << "model score diff: " << oldModelScoreDiff3 << ", loss: " << loss3 << endl << endl;

-    lossMinusModelScoreDiff.push_back(loss1 - oldModelScoreDiff1);
-    lossMinusModelScoreDiff.push_back(loss2 - oldModelScoreDiff2);
-    lossMinusModelScoreDiff.push_back(loss3 - oldModelScoreDiff3);
+  lossMinusModelScoreDiff.push_back(loss1 - oldModelScoreDiff1);
+  lossMinusModelScoreDiff.push_back(loss2 - oldModelScoreDiff2);
+  lossMinusModelScoreDiff.push_back(loss3 - oldModelScoreDiff3);

-    vector< float> alphas1 = Hildreth::optimise(featureValueDiffs, lossMinusModelScoreDiff);
-    vector< float> alphas2 = Hildreth::optimise(featureValueDiffs, lossMinusModelScoreDiff, 0.01);
+  vector< float> alphas1 = Hildreth::optimise(featureValueDiffs, lossMinusModelScoreDiff);
+  vector< float> alphas2 = Hildreth::optimise(featureValueDiffs, lossMinusModelScoreDiff, 0.01);

-    cerr << "\nalphas without slack:" << endl;
-    for (size_t i = 0; i < alphas1.size(); ++i) {
-      cerr << "alpha " << i << ": " << alphas1[i] << endl;
-    }
-    cerr << endl;
+  cerr << "\nalphas without slack:" << endl;
+  for (size_t i = 0; i < alphas1.size(); ++i) {
+    cerr << "alpha " << i << ": " << alphas1[i] << endl;
+  }
+  cerr << endl;

-    cerr << "partial updates:" << endl;
-    vector< ScoreComponentCollection> featureValueDiffs1(featureValueDiffs);
-    FVector totalUpdate1 = ScoreComponentCollection::CreateFVector();
-    for (size_t k = 0; k < featureValueDiffs1.size(); ++k) {
-      featureValueDiffs1[k].MultiplyEquals(alphas1[k]);
-      cerr << k << ": " << featureValueDiffs1[k].GetScoresVector() << endl;
-      FVector update = featureValueDiffs1[k].GetScoresVector();
-      totalUpdate1 += update;
-    }
-    cerr << endl;
-    cerr << "total update: " << totalUpdate1 << endl << endl;
+  cerr << "partial updates:" << endl;
+  vector< ScoreComponentCollection> featureValueDiffs1(featureValueDiffs);
+  FVector totalUpdate1 = ScoreComponentCollection::CreateFVector();
+  for (size_t k = 0; k < featureValueDiffs1.size(); ++k) {
+    featureValueDiffs1[k].MultiplyEquals(alphas1[k]);
+    cerr << k << ": " << featureValueDiffs1[k].GetScoresVector() << endl;
+    FVector update = featureValueDiffs1[k].GetScoresVector();
+    totalUpdate1 += update;
+  }
+  cerr << endl;
+  cerr << "total update: " << totalUpdate1 << endl << endl;

-    ScoreComponentCollection weightsUpdate1(weights);
-    weightsUpdate1.PlusEquals(totalUpdate1);
-    cerr << "new weights: " << weightsUpdate1 << endl << endl;
+  ScoreComponentCollection weightsUpdate1(weights);
+  weightsUpdate1.PlusEquals(totalUpdate1);
+  cerr << "new weights: " << weightsUpdate1 << endl << endl;

-    float newModelScoreDiff1 = featureValueDiffs[0].InnerProduct(weightsUpdate1);
-    float newModelScoreDiff2 = featureValueDiffs[1].InnerProduct(weightsUpdate1);
-    float newModelScoreDiff3 = featureValueDiffs[2].InnerProduct(weightsUpdate1);
+  float newModelScoreDiff1 = featureValueDiffs[0].InnerProduct(weightsUpdate1);
+  float newModelScoreDiff2 = featureValueDiffs[1].InnerProduct(weightsUpdate1);
+  float newModelScoreDiff3 = featureValueDiffs[2].InnerProduct(weightsUpdate1);

-    cerr << "new model score diff: " << newModelScoreDiff1 << ", loss: " << loss1 << endl;
-    cerr << "new model score diff: " << newModelScoreDiff2 << ", loss: " << loss2 << endl;
-    cerr << "new model score diff: " << newModelScoreDiff3 << ", loss: " << loss3 << endl;
+  cerr << "new model score diff: " << newModelScoreDiff1 << ", loss: " << loss1 << endl;
+  cerr << "new model score diff: " << newModelScoreDiff2 << ", loss: " << loss2 << endl;
+  cerr << "new model score diff: " << newModelScoreDiff3 << ", loss: " << loss3 << endl;

-    cerr << "\n\nalphas with slack 0.01:" << endl;
-    for (size_t i = 0; i < alphas2.size(); ++i) {
-      cerr << "alpha " << i << ": " << alphas2[i] << endl;
-    }
-    cerr << endl;
+  cerr << "\n\nalphas with slack 0.01:" << endl;
+  for (size_t i = 0; i < alphas2.size(); ++i) {
+    cerr << "alpha " << i << ": " << alphas2[i] << endl;
+  }
+  cerr << endl;

-    cerr << "partial updates:" << endl;
-    vector< ScoreComponentCollection> featureValueDiffs2(featureValueDiffs);
-    FVector totalUpdate2 = ScoreComponentCollection::CreateFVector();
-    for (size_t k = 0; k < featureValueDiffs2.size(); ++k) {
-      featureValueDiffs2[k].MultiplyEquals(alphas2[k]);
-      cerr << k << ": " << featureValueDiffs2[k].GetScoresVector() << endl;
-      FVector update = featureValueDiffs2[k].GetScoresVector();
-      totalUpdate2 += update;
-    }
-    cerr << endl;
-    cerr << "total update: " << totalUpdate2 << endl << endl;
+  cerr << "partial updates:" << endl;
+  vector< ScoreComponentCollection> featureValueDiffs2(featureValueDiffs);
+  FVector totalUpdate2 = ScoreComponentCollection::CreateFVector();
+  for (size_t k = 0; k < featureValueDiffs2.size(); ++k) {
+    featureValueDiffs2[k].MultiplyEquals(alphas2[k]);
+    cerr << k << ": " << featureValueDiffs2[k].GetScoresVector() << endl;
+    FVector update = featureValueDiffs2[k].GetScoresVector();
+    totalUpdate2 += update;
+  }
+  cerr << endl;
+  cerr << "total update: " << totalUpdate2 << endl << endl;

-    ScoreComponentCollection weightsUpdate2(weights);
-    weightsUpdate2.PlusEquals(totalUpdate2);
-    cerr << "new weights: " << weightsUpdate2 << endl << endl;
+  ScoreComponentCollection weightsUpdate2(weights);
+  weightsUpdate2.PlusEquals(totalUpdate2);
+  cerr << "new weights: " << weightsUpdate2 << endl << endl;

-    float newModelScoreDiff4 = featureValueDiffs[0].InnerProduct(weightsUpdate2);
-    float newModelScoreDiff5 = featureValueDiffs[1].InnerProduct(weightsUpdate2);
-    float newModelScoreDiff6 = featureValueDiffs[2].InnerProduct(weightsUpdate2);
+  float newModelScoreDiff4 = featureValueDiffs[0].InnerProduct(weightsUpdate2);
+  float newModelScoreDiff5 = featureValueDiffs[1].InnerProduct(weightsUpdate2);
+  float newModelScoreDiff6 = featureValueDiffs[2].InnerProduct(weightsUpdate2);

-    cerr << "new model score diff: " << newModelScoreDiff4 << ", loss: " << loss1 << endl;
-    cerr << "new model score diff: " << newModelScoreDiff5 << ", loss: " << loss2 << endl;
-    cerr << "new model score diff: " << newModelScoreDiff6 << ", loss: " << loss3 << endl;
+  cerr << "new model score diff: " << newModelScoreDiff4 << ", loss: " << loss1 << endl;
+  cerr << "new model score diff: " << newModelScoreDiff5 << ", loss: " << loss2 << endl;
+  cerr << "new model score diff: " << newModelScoreDiff6 << ", loss: " << loss3 << endl;
 }

 BOOST_FIXTURE_TEST_CASE(test_hildreth_3, MockProducers)
 {
-    // Unfeasible example with 21 constraints
-    cerr << "\n>>>>>Hildreth test, without slack and with 0.01 slack" << endl << endl;
-    vector< ScoreComponentCollection> featureValueDiffs;
-    vector< float> lossMinusModelScoreDiff;
+  // Unfeasible example with 21 constraints
+  cerr << "\n>>>>>Hildreth test, without slack and with 0.01 slack" << endl << endl;
+  vector< ScoreComponentCollection> featureValueDiffs;
+  vector< float> lossMinusModelScoreDiff;

-    // initial weights
-    float w[] = { 1, 1, 0.638672, 1, 0 };
-    vector<float> vec(w,w+5);
-    ScoreComponentCollection weights;
-    weights.PlusEquals(&multi, vec);
+  // initial weights
+  float w[] = { 1, 1, 0.638672, 1, 0 };
+  vector<float> vec(w,w+5);
+  ScoreComponentCollection weights;
+  weights.PlusEquals(&multi, vec);

-    int numberOfConstraints = 21;
+  int numberOfConstraints = 21;

-    // feature value differences (to oracle)
-    // NOTE: these feature values are only approximations
-    ScoreComponentCollection s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15, s16, s17, s18, s19, s20, s21;
-    float arr1[] = { 0, 0, -2.0672, 0, 0 };
-    float arr2[] = { 0, 0, 0, 0, 0 };
-    float arr3[] = { 0, 0, -2.08436, 1.38629, 0 };
-    float arr4[] = { 0, 0, -0.0171661, 1.38629, 0 };
-    float arr5[] = { 0, 0, 4.4283, 0, 0 };
-    float arr6[] = { 0, 0, 3.84829, 1.38629, 0 };
-    float arr7[] = { 0, 0, 6.83689, 0, 0 };
-    float arr8[] = { 0, 0, 0, 0, 0 };
-    float arr9[] = { 0, 0, -2.0672, 0, 0 };
-    float arr10[] = { 0, 0, -0.0171661, 1.38629, 0 };
-    float arr11[] = { 0, 0, -2.08436, 1.38629, 0 };
-    float arr12[] = { 0, 0, 4.4283, 0, 0 };
-    float arr13[] = { 3, 0, 2.41089, 0, 0 };
-    float arr14[] = { 3, 0, 2.32709, 0, 0 };
-    float arr15[] = { 0, 0, -2.0672, 0, 0 };
-    float arr16[] = { 0, 0, -2.08436, 1.38629, 0 };
-    float arr17[] = { 0, 0, 4.4283, 0, 0 };
-    float arr18[] = { 0, 0, 3.84829, 1.38629, 0 };
-    float arr19[] = { 0, 0, -0.0171661, 1.38629, 0 };
-    float arr20[] = { 0, 0, 0, 0, 0 };
-    float arr21[] = { 0, 0, 6.83689, 0, 0 };
+  // feature value differences (to oracle)
+  // NOTE: these feature values are only approximations
+  ScoreComponentCollection s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15, s16, s17, s18, s19, s20, s21;
+  float arr1[] = { 0, 0, -2.0672, 0, 0 };
+  float arr2[] = { 0, 0, 0, 0, 0 };
+  float arr3[] = { 0, 0, -2.08436, 1.38629, 0 };
+  float arr4[] = { 0, 0, -0.0171661, 1.38629, 0 };
+  float arr5[] = { 0, 0, 4.4283, 0, 0 };
+  float arr6[] = { 0, 0, 3.84829, 1.38629, 0 };
+  float arr7[] = { 0, 0, 6.83689, 0, 0 };
+  float arr8[] = { 0, 0, 0, 0, 0 };
+  float arr9[] = { 0, 0, -2.0672, 0, 0 };
+  float arr10[] = { 0, 0, -0.0171661, 1.38629, 0 };
+  float arr11[] = { 0, 0, -2.08436, 1.38629, 0 };
+  float arr12[] = { 0, 0, 4.4283, 0, 0 };
+  float arr13[] = { 3, 0, 2.41089, 0, 0 };
+  float arr14[] = { 3, 0, 2.32709, 0, 0 };
+  float arr15[] = { 0, 0, -2.0672, 0, 0 };
+  float arr16[] = { 0, 0, -2.08436, 1.38629, 0 };
+  float arr17[] = { 0, 0, 4.4283, 0, 0 };
+  float arr18[] = { 0, 0, 3.84829, 1.38629, 0 };
+  float arr19[] = { 0, 0, -0.0171661, 1.38629, 0 };
+  float arr20[] = { 0, 0, 0, 0, 0 };
+  float arr21[] = { 0, 0, 6.83689, 0, 0 };

-    vector<float> losses;
-    losses.push_back(2.73485);
-    losses.push_back(0);
-    losses.push_back(3.64118);
-    losses.push_back(1.47347);
-    losses.push_back(3.64118);
-    losses.push_back(4.16278);
-    losses.push_back(3.13952);
-    losses.push_back(0);
-    losses.push_back(2.73485);
-    losses.push_back(1.47347);
-    losses.push_back(3.64118);
-    losses.push_back(3.64118);
-    losses.push_back(2.51662);
-    losses.push_back(2.73485);
-    losses.push_back(2.73485);
-    losses.push_back(3.64118);
-    losses.push_back(3.64118);
-    losses.push_back(4.16278);
-    losses.push_back(1.47347);
-    losses.push_back(0);
-    losses.push_back(3.13952);
+  vector<float> losses;
+  losses.push_back(2.73485);
+  losses.push_back(0);
+  losses.push_back(3.64118);
+  losses.push_back(1.47347);
+  losses.push_back(3.64118);
+  losses.push_back(4.16278);
+  losses.push_back(3.13952);
+  losses.push_back(0);
+  losses.push_back(2.73485);
+  losses.push_back(1.47347);
+  losses.push_back(3.64118);
+  losses.push_back(3.64118);
+  losses.push_back(2.51662);
+  losses.push_back(2.73485);
+  losses.push_back(2.73485);
+  losses.push_back(3.64118);
+  losses.push_back(3.64118);
+  losses.push_back(4.16278);
+  losses.push_back(1.47347);
+  losses.push_back(0);
+  losses.push_back(3.13952);

-    vector<float> vec1(arr1,arr1+5);
-    vector<float> vec2(arr2,arr2+5);
-    vector<float> vec3(arr3,arr3+5);
-    vector<float> vec4(arr4,arr4+5);
-    vector<float> vec5(arr5,arr5+5);
-    vector<float> vec6(arr6,arr6+5);
-    vector<float> vec7(arr7,arr7+5);
-    vector<float> vec8(arr8,arr8+5);
-    vector<float> vec9(arr9,arr9+5);
-    vector<float> vec10(arr10,arr10+5);
-    vector<float> vec11(arr11,arr11+5);
-    vector<float> vec12(arr12,arr12+5);
-    vector<float> vec13(arr13,arr13+5);
-    vector<float> vec14(arr14,arr14+5);
-    vector<float> vec15(arr15,arr15+5);
-    vector<float> vec16(arr16,arr16+5);
-    vector<float> vec17(arr17,arr17+5);
-    vector<float> vec18(arr18,arr18+5);
-    vector<float> vec19(arr19,arr19+5);
-    vector<float> vec20(arr20,arr20+5);
-    vector<float> vec21(arr21,arr21+5);
+  vector<float> vec1(arr1,arr1+5);
+  vector<float> vec2(arr2,arr2+5);
+  vector<float> vec3(arr3,arr3+5);
+  vector<float> vec4(arr4,arr4+5);
+  vector<float> vec5(arr5,arr5+5);
+  vector<float> vec6(arr6,arr6+5);
+  vector<float> vec7(arr7,arr7+5);
+  vector<float> vec8(arr8,arr8+5);
+  vector<float> vec9(arr9,arr9+5);
+  vector<float> vec10(arr10,arr10+5);
+  vector<float> vec11(arr11,arr11+5);
+  vector<float> vec12(arr12,arr12+5);
+  vector<float> vec13(arr13,arr13+5);
+  vector<float> vec14(arr14,arr14+5);
+  vector<float> vec15(arr15,arr15+5);
+  vector<float> vec16(arr16,arr16+5);
+  vector<float> vec17(arr17,arr17+5);
+  vector<float> vec18(arr18,arr18+5);
+  vector<float> vec19(arr19,arr19+5);
+  vector<float> vec20(arr20,arr20+5);
+  vector<float> vec21(arr21,arr21+5);

-    s1.PlusEquals(&multi,vec1);
-    s2.PlusEquals(&multi,vec2);
-    s3.PlusEquals(&multi,vec3);
-    s4.PlusEquals(&multi,vec4);
-    s5.PlusEquals(&multi,vec5);
-    s6.PlusEquals(&multi,vec6);
-    s7.PlusEquals(&multi,vec7);
-    s8.PlusEquals(&multi,vec8);
-    s9.PlusEquals(&multi,vec9);
-    s10.PlusEquals(&multi,vec10);
-    s11.PlusEquals(&multi,vec11);
-    s12.PlusEquals(&multi,vec12);
-    s13.PlusEquals(&multi,vec13);
-    s14.PlusEquals(&multi,vec14);
-    s15.PlusEquals(&multi,vec15);
-    s16.PlusEquals(&multi,vec16);
-    s17.PlusEquals(&multi,vec17);
-    s18.PlusEquals(&multi,vec18);
-    s19.PlusEquals(&multi,vec19);
-    s20.PlusEquals(&multi,vec20);
-    s21.PlusEquals(&multi,vec21);
+  s1.PlusEquals(&multi,vec1);
+  s2.PlusEquals(&multi,vec2);
+  s3.PlusEquals(&multi,vec3);
+  s4.PlusEquals(&multi,vec4);
+  s5.PlusEquals(&multi,vec5);
+  s6.PlusEquals(&multi,vec6);
+  s7.PlusEquals(&multi,vec7);
+  s8.PlusEquals(&multi,vec8);
+  s9.PlusEquals(&multi,vec9);
+  s10.PlusEquals(&multi,vec10);
+  s11.PlusEquals(&multi,vec11);
+  s12.PlusEquals(&multi,vec12);
+  s13.PlusEquals(&multi,vec13);
+  s14.PlusEquals(&multi,vec14);
+  s15.PlusEquals(&multi,vec15);
+  s16.PlusEquals(&multi,vec16);
+  s17.PlusEquals(&multi,vec17);
+  s18.PlusEquals(&multi,vec18);
+  s19.PlusEquals(&multi,vec19);
+  s20.PlusEquals(&multi,vec20);
+  s21.PlusEquals(&multi,vec21);

-    featureValueDiffs.push_back(s1);
-    featureValueDiffs.push_back(s2);
-    featureValueDiffs.push_back(s3);
-    featureValueDiffs.push_back(s4);
-    featureValueDiffs.push_back(s5);
-    featureValueDiffs.push_back(s6);
-    featureValueDiffs.push_back(s7);
-    featureValueDiffs.push_back(s8);
-    featureValueDiffs.push_back(s9);
-    featureValueDiffs.push_back(s10);
-    featureValueDiffs.push_back(s11);
-    featureValueDiffs.push_back(s12);
-    featureValueDiffs.push_back(s13);
-    featureValueDiffs.push_back(s14);
-    featureValueDiffs.push_back(s15);
-    featureValueDiffs.push_back(s16);
-    featureValueDiffs.push_back(s17);
-    featureValueDiffs.push_back(s18);
-    featureValueDiffs.push_back(s19);
-    featureValueDiffs.push_back(s20);
-    featureValueDiffs.push_back(s21);
+  featureValueDiffs.push_back(s1);
+  featureValueDiffs.push_back(s2);
+  featureValueDiffs.push_back(s3);
+  featureValueDiffs.push_back(s4);
+  featureValueDiffs.push_back(s5);
+  featureValueDiffs.push_back(s6);
+  featureValueDiffs.push_back(s7);
+  featureValueDiffs.push_back(s8);
+  featureValueDiffs.push_back(s9);
+  featureValueDiffs.push_back(s10);
+  featureValueDiffs.push_back(s11);
+  featureValueDiffs.push_back(s12);
+  featureValueDiffs.push_back(s13);
+  featureValueDiffs.push_back(s14);
+  featureValueDiffs.push_back(s15);
+  featureValueDiffs.push_back(s16);
+  featureValueDiffs.push_back(s17);
+  featureValueDiffs.push_back(s18);
+  featureValueDiffs.push_back(s19);
+  featureValueDiffs.push_back(s20);
+  featureValueDiffs.push_back(s21);

-    vector<float> oldModelScoreDiff;
-    for (int i = 0; i < numberOfConstraints; ++i) {
-      oldModelScoreDiff.push_back(featureValueDiffs[i].InnerProduct(weights));
-    }
+  vector<float> oldModelScoreDiff;
+  for (int i = 0; i < numberOfConstraints; ++i) {
+    oldModelScoreDiff.push_back(featureValueDiffs[i].InnerProduct(weights));
+  }

-    for (int i = 0; i < numberOfConstraints; ++i) {
-      cerr << "old model score diff: " << oldModelScoreDiff[i] << ", loss: " << losses[i] << "\t" << (oldModelScoreDiff[i] >= losses[i] ? 1 : 0) << endl;
-    }
+  for (int i = 0; i < numberOfConstraints; ++i) {
+    cerr << "old model score diff: " << oldModelScoreDiff[i] << ", loss: " << losses[i] << "\t" << (oldModelScoreDiff[i] >= losses[i] ? 1 : 0) << endl;
+  }

-    for (int i = 0; i < numberOfConstraints; ++i) {
-      lossMinusModelScoreDiff.push_back(losses[i] - oldModelScoreDiff[i]);
-    }
+  for (int i = 0; i < numberOfConstraints; ++i) {
+    lossMinusModelScoreDiff.push_back(losses[i] - oldModelScoreDiff[i]);
+  }

-    for (int i = 0; i < numberOfConstraints; ++i) {
-      cerr << "A: " << featureValueDiffs[i] << ", b: " << lossMinusModelScoreDiff[i] << endl;
-    }
+  for (int i = 0; i < numberOfConstraints; ++i) {
+    cerr << "A: " << featureValueDiffs[i] << ", b: " << lossMinusModelScoreDiff[i] << endl;
+  }

-    vector< float> alphas1 = Hildreth::optimise(featureValueDiffs, lossMinusModelScoreDiff);
-    vector< float> alphas2 = Hildreth::optimise(featureValueDiffs, lossMinusModelScoreDiff, 0.01);
+  vector< float> alphas1 = Hildreth::optimise(featureValueDiffs, lossMinusModelScoreDiff);
+  vector< float> alphas2 = Hildreth::optimise(featureValueDiffs, lossMinusModelScoreDiff, 0.01);

-    cerr << "\nalphas without slack:" << endl;
-    for (size_t i = 0; i < alphas1.size(); ++i) {
-      cerr << "alpha " << i << ": " << alphas1[i] << endl;
-    }
-    cerr << endl;
+  cerr << "\nalphas without slack:" << endl;
+  for (size_t i = 0; i < alphas1.size(); ++i) {
+    cerr << "alpha " << i << ": " << alphas1[i] << endl;
+  }
+  cerr << endl;

-    cerr << "partial updates:" << endl;
-    vector< ScoreComponentCollection> featureValueDiffs1(featureValueDiffs);
-    FVector totalUpdate1 = ScoreComponentCollection::CreateFVector();
-    for (size_t k = 0; k < featureValueDiffs1.size(); ++k) {
-      featureValueDiffs1[k].MultiplyEquals(alphas1[k]);
-      cerr << k << ": " << featureValueDiffs1[k].GetScoresVector() << endl;
-      FVector update = featureValueDiffs1[k].GetScoresVector();
-      totalUpdate1 += update;
-    }
-    cerr << endl;
-    cerr << "total update: " << totalUpdate1 << endl << endl;
+  cerr << "partial updates:" << endl;
+  vector< ScoreComponentCollection> featureValueDiffs1(featureValueDiffs);
+  FVector totalUpdate1 = ScoreComponentCollection::CreateFVector();
+  for (size_t k = 0; k < featureValueDiffs1.size(); ++k) {
+    featureValueDiffs1[k].MultiplyEquals(alphas1[k]);
+    cerr << k << ": " << featureValueDiffs1[k].GetScoresVector() << endl;
+    FVector update = featureValueDiffs1[k].GetScoresVector();
+    totalUpdate1 += update;
+  }
+  cerr << endl;
+  cerr << "total update: " << totalUpdate1 << endl << endl;

-    ScoreComponentCollection weightsUpdate1(weights);
-    weightsUpdate1.PlusEquals(totalUpdate1);
-    cerr << "old weights: " << weights << endl;
-    cerr << "new weights: " << weightsUpdate1 << endl << endl;
+  ScoreComponentCollection weightsUpdate1(weights);
+  weightsUpdate1.PlusEquals(totalUpdate1);
+  cerr << "old weights: " << weights << endl;
+  cerr << "new weights: " << weightsUpdate1 << endl << endl;

-    vector<float> newModelScoreDiff;
-    for (int i = 0; i < numberOfConstraints; ++i) {
-      newModelScoreDiff.push_back(featureValueDiffs[i].InnerProduct(weightsUpdate1));
-    }
+  vector<float> newModelScoreDiff;
+  for (int i = 0; i < numberOfConstraints; ++i) {
+    newModelScoreDiff.push_back(featureValueDiffs[i].InnerProduct(weightsUpdate1));
+  }

-    for (int i = 0; i < numberOfConstraints; ++i) {
-      cerr << "new model score diff: " << newModelScoreDiff[i] << ", loss: " << losses[i] << "\t" << (newModelScoreDiff[i] >= losses[i] ? 1 : 0) << endl;
-    }
+  for (int i = 0; i < numberOfConstraints; ++i) {
+    cerr << "new model score diff: " << newModelScoreDiff[i] << ", loss: " << losses[i] << "\t" << (newModelScoreDiff[i] >= losses[i] ? 1 : 0) << endl;
+  }

-    cerr << "\n\nalphas with slack 0.01:" << endl;
-    for (size_t i = 0; i < alphas2.size(); ++i) {
-      cerr << "alpha " << i << ": " << alphas2[i] << endl;
-    }
-    cerr << endl;
+  cerr << "\n\nalphas with slack 0.01:" << endl;
+  for (size_t i = 0; i < alphas2.size(); ++i) {
+    cerr << "alpha " << i << ": " << alphas2[i] << endl;
+  }
+  cerr << endl;

-    cerr << "partial updates:" << endl;
-    vector< ScoreComponentCollection> featureValueDiffs2(featureValueDiffs);
-    FVector totalUpdate2 = ScoreComponentCollection::CreateFVector();
-    for (size_t k = 0; k < featureValueDiffs2.size(); ++k) {
-      featureValueDiffs2[k].MultiplyEquals(alphas2[k]);
-      cerr << k << ": " << featureValueDiffs2[k].GetScoresVector() << endl;
-      FVector update = featureValueDiffs2[k].GetScoresVector();
-      totalUpdate2 += update;
-    }
-    cerr << endl;
-    cerr << "total update: " << totalUpdate2 << endl << endl;
+  cerr << "partial updates:" << endl;
+  vector< ScoreComponentCollection> featureValueDiffs2(featureValueDiffs);
+  FVector totalUpdate2 = ScoreComponentCollection::CreateFVector();
+  for (size_t k = 0; k < featureValueDiffs2.size(); ++k) {
+    featureValueDiffs2[k].MultiplyEquals(alphas2[k]);
+    cerr << k << ": " << featureValueDiffs2[k].GetScoresVector() << endl;
+    FVector update = featureValueDiffs2[k].GetScoresVector();
+    totalUpdate2 += update;
+  }
+  cerr << endl;
+  cerr << "total update: " << totalUpdate2 << endl << endl;

-    ScoreComponentCollection weightsUpdate2(weights);
-    weightsUpdate2.PlusEquals(totalUpdate2);
-    cerr << "old weights: " << weights << endl;
-    cerr << "new weights: " << weightsUpdate2 << endl << endl;
+  ScoreComponentCollection weightsUpdate2(weights);
+  weightsUpdate2.PlusEquals(totalUpdate2);
+  cerr << "old weights: " << weights << endl;
+  cerr << "new weights: " << weightsUpdate2 << endl << endl;

-    newModelScoreDiff.clear();
-    for (int i = 0; i < numberOfConstraints; ++i) {
-      newModelScoreDiff.push_back(featureValueDiffs[i].InnerProduct(weightsUpdate2));
-    }
+  newModelScoreDiff.clear();
+  for (int i = 0; i < numberOfConstraints; ++i) {
+    newModelScoreDiff.push_back(featureValueDiffs[i].InnerProduct(weightsUpdate2));
+  }

-    for (int i = 0; i < numberOfConstraints; ++i) {
-      cerr << "new model score diff: " << newModelScoreDiff[i] << ", loss: " << losses[i] << endl;
-    }
+  for (int i = 0; i < numberOfConstraints; ++i) {
+    cerr << "new model score diff: " << newModelScoreDiff[i] << ", loss: " << losses[i] << endl;
+  }
 }

 BOOST_FIXTURE_TEST_CASE(test_hildreth_4, MockProducers)
 {
-    // Feasible example with 8 constraints
-    cerr << "\n>>>>>Hildreth test, without slack and with 0.01 slack" << endl << endl;
-    vector< ScoreComponentCollection> featureValueDiffs;
-    vector< float> lossMinusModelScoreDiff;
+  // Feasible example with 8 constraints
+  cerr << "\n>>>>>Hildreth test, without slack and with 0.01 slack" << endl << endl;
+  vector< ScoreComponentCollection> featureValueDiffs;
+  vector< float> lossMinusModelScoreDiff;

-    // initial weights
-    float w[] = { 1, 1, 0.638672, 1, 0 };
-    vector<float> vec(w,w+5);
-    ScoreComponentCollection weights;
-    weights.PlusEquals(&multi, vec);
+  // initial weights
+  float w[] = { 1, 1, 0.638672, 1, 0 };
+  vector<float> vec(w,w+5);
+  ScoreComponentCollection weights;
+  weights.PlusEquals(&multi, vec);

-    int numberOfConstraints = 8;
+  int numberOfConstraints = 8;

-    // feature value differences (to oracle)
-    // NOTE: these feature values are only approximations
-    ScoreComponentCollection s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15, s16, s17, s18, s19, s20, s21;
-    float arr1[] = { 0, 0, -2.0672, 0, 0 };
-    float arr2[] = { 0, 0, 0, 0, 0 };
-    float arr3[] = { 0, 0, -2.08436, 1.38629, 0 };
-    float arr4[] = { 0, 0, -0.0171661, 1.38629, 0 };
+  // feature value differences (to oracle)
+  // NOTE: these feature values are only approximations
+  ScoreComponentCollection s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15, s16, s17, s18, s19, s20, s21;
+  float arr1[] = { 0, 0, -2.0672, 0, 0 };
+  float arr2[] = { 0, 0, 0, 0, 0 };
+  float arr3[] = { 0, 0, -2.08436, 1.38629, 0 };
+  float arr4[] = { 0, 0, -0.0171661, 1.38629, 0 };
 //  float arr5[] = { 0, 0, 4.4283, 0, 0 };
 //  float arr6[] = { 0, 0, 3.84829, 1.38629, 0 };
 //  float arr7[] = { 0, 0, 6.83689, 0, 0 };
-    float arr8[] = { 0, 0, 0, 0, 0 };
-    float arr9[] = { 0, 0, -2.0672, 0, 0 };
+  float arr8[] = { 0, 0, 0, 0, 0 };
+  float arr9[] = { 0, 0, -2.0672, 0, 0 };
 //  float arr10[] = { 0, 0, -0.0171661, 1.38629, 0 };
 //  float arr11[] = { 0, 0, -2.08436, 1.38629, 0 };
 //  float arr12[] = { 0, 0, 4.4283, 0, 0 };
 //  float arr13[] = { 3, 0, 2.41089, 0, 0 };
 //  float arr14[] = { 3, 0, 2.32709, 0, 0 };
-    float arr15[] = { 0, 0, -2.0672, 0, 0 };
-    float arr16[] = { 0, 0, -2.08436, 1.38629, 0 };
+  float arr15[] = { 0, 0, -2.0672, 0, 0 };
+  float arr16[] = { 0, 0, -2.08436, 1.38629, 0 };
 //  float arr17[] = { 0, 0, 4.4283, 0, 0 };
 //  float arr18[] = { 0, 0, 3.84829, 1.38629, 0 };
 //  float arr19[] = { 0, 0, -0.0171661, 1.38629, 0 };
 //  float arr20[] = { 0, 0, 0, 0, 0 };
 //  float arr21[] = { 0, 0, 6.83689, 0, 0 };

-    vector<float> losses;
-    losses.push_back(2.73485);
-    losses.push_back(0);
-    losses.push_back(3.64118);
-    losses.push_back(1.47347);
+  vector<float> losses;
+  losses.push_back(2.73485);
+  losses.push_back(0);
+  losses.push_back(3.64118);
+  losses.push_back(1.47347);
 //  losses.push_back(3.64118);
 //  losses.push_back(4.16278);
 //  losses.push_back(3.13952);
-    losses.push_back(0);
-    losses.push_back(2.73485);
+  losses.push_back(0);
+  losses.push_back(2.73485);
 //  losses.push_back(1.47347);
 //  losses.push_back(3.64118);
 //  losses.push_back(3.64118);
 //  losses.push_back(2.51662);
 //  losses.push_back(2.73485);
-    losses.push_back(2.73485);
-    losses.push_back(3.64118);
+  losses.push_back(2.73485);
+  losses.push_back(3.64118);
 //  losses.push_back(3.64118);
 //  losses.push_back(4.16278);
 //  losses.push_back(1.47347);
 //  losses.push_back(0);
 //  losses.push_back(3.13952);

-    vector<float> vec1(arr1,arr1+5);
-    vector<float> vec2(arr2,arr2+5);
-    vector<float> vec3(arr3,arr3+5);
-    vector<float> vec4(arr4,arr4+5);
+  vector<float> vec1(arr1,arr1+5);
+  vector<float> vec2(arr2,arr2+5);
+  vector<float> vec3(arr3,arr3+5);
+  vector<float> vec4(arr4,arr4+5);
 //  vector<float> vec5(arr5,arr5+5);
 //  vector<float> vec6(arr6,arr6+5);
 //  vector<float> vec7(arr7,arr7+5);
-    vector<float> vec8(arr8,arr8+5);
-    vector<float> vec9(arr9,arr9+5);
+  vector<float> vec8(arr8,arr8+5);
+  vector<float> vec9(arr9,arr9+5);
 //  vector<float> vec10(arr10,arr10+5);
 //  vector<float> vec11(arr11,arr11+5);
 //  vector<float> vec12(arr12,arr12+5);
 //  vector<float> vec13(arr13,arr13+5);
 //  vector<float> vec14(arr14,arr14+5);
-    vector<float> vec15(arr15,arr15+5);
-    vector<float> vec16(arr16,arr16+5);
+  vector<float> vec15(arr15,arr15+5);
+  vector<float> vec16(arr16,arr16+5);
 //  vector<float> vec17(arr17,arr17+5);
 //  vector<float> vec18(arr18,arr18+5);
 //  vector<float> vec19(arr19,arr19+5);
 //  vector<float> vec20(arr20,arr20+5);
 //  vector<float> vec21(arr21,arr21+5);

-    s1.PlusEquals(&multi,vec1);
-    s2.PlusEquals(&multi,vec2);
-    s3.PlusEquals(&multi,vec3);
-    s4.PlusEquals(&multi,vec4);
+  s1.PlusEquals(&multi,vec1);
+  s2.PlusEquals(&multi,vec2);
+  s3.PlusEquals(&multi,vec3);
+  s4.PlusEquals(&multi,vec4);
 //  s5.PlusEquals(&multi,vec5);
 //  s6.PlusEquals(&multi,vec6);
 //  s7.PlusEquals(&multi,vec7);
-    s8.PlusEquals(&multi,vec8);
-    s9.PlusEquals(&multi,vec9);
+  s8.PlusEquals(&multi,vec8);
+  s9.PlusEquals(&multi,vec9);
 //  s10.PlusEquals(&multi,vec10);
 //  s11.PlusEquals(&multi,vec11);
 //  s12.PlusEquals(&multi,vec12);
 //  s13.PlusEquals(&multi,vec13);
 //  s14.PlusEquals(&multi,vec14);
-    s15.PlusEquals(&multi,vec15);
-    s16.PlusEquals(&multi,vec16);
+  s15.PlusEquals(&multi,vec15);
+  s16.PlusEquals(&multi,vec16);
 //  s17.PlusEquals(&multi,vec17);
 //  s18.PlusEquals(&multi,vec18);
 //  s19.PlusEquals(&multi,vec19);
 //  s20.PlusEquals(&multi,vec20);
 //  s21.PlusEquals(&multi,vec21);

-    featureValueDiffs.push_back(s1);
-    featureValueDiffs.push_back(s2);
-    featureValueDiffs.push_back(s3);
-    featureValueDiffs.push_back(s4);
+  featureValueDiffs.push_back(s1);
+  featureValueDiffs.push_back(s2);
+  featureValueDiffs.push_back(s3);
+  featureValueDiffs.push_back(s4);
 //  featureValueDiffs.push_back(s5);
 //  featureValueDiffs.push_back(s6);
 //  featureValueDiffs.push_back(s7);
-    featureValueDiffs.push_back(s8);
-    featureValueDiffs.push_back(s9);
+  featureValueDiffs.push_back(s8);
+  featureValueDiffs.push_back(s9);
 //  featureValueDiffs.push_back(s10);
 //  featureValueDiffs.push_back(s11);
 //  featureValueDiffs.push_back(s12);
 //  featureValueDiffs.push_back(s13);
 //  featureValueDiffs.push_back(s14);
-    featureValueDiffs.push_back(s15);
-    featureValueDiffs.push_back(s16);
+  featureValueDiffs.push_back(s15);
+  featureValueDiffs.push_back(s16);
 //  featureValueDiffs.push_back(s17);
 //  featureValueDiffs.push_back(s18);
 //  featureValueDiffs.push_back(s19);
 //  featureValueDiffs.push_back(s20);
 //  featureValueDiffs.push_back(s21);

-    vector<float> oldModelScoreDiff;
-    for (int i = 0; i < numberOfConstraints; ++i) {
-      oldModelScoreDiff.push_back(featureValueDiffs[i].InnerProduct(weights));
-    }
+  vector<float> oldModelScoreDiff;
+  for (int i = 0; i < numberOfConstraints; ++i) {
+    oldModelScoreDiff.push_back(featureValueDiffs[i].InnerProduct(weights));
+  }

-    for (int i = 0; i < numberOfConstraints; ++i) {
-      cerr << "old model score diff: " << oldModelScoreDiff[i] << ", loss: " << losses[i] << "\t" << (oldModelScoreDiff[i] >= losses[i] ? 1 : 0) << endl;
-    }
+  for (int i = 0; i < numberOfConstraints; ++i) {
+    cerr << "old model score diff: " << oldModelScoreDiff[i] << ", loss: " << losses[i] << "\t" << (oldModelScoreDiff[i] >= losses[i] ? 1 : 0) << endl;
+  }

-    for (int i = 0; i < numberOfConstraints; ++i) {
-      lossMinusModelScoreDiff.push_back(losses[i] - oldModelScoreDiff[i]);
-    }
+  for (int i = 0; i < numberOfConstraints; ++i) {
+    lossMinusModelScoreDiff.push_back(losses[i] - oldModelScoreDiff[i]);
+  }

-    for (int i = 0; i < numberOfConstraints; ++i) {
-      cerr << "A: " << featureValueDiffs[i] << ", b: " << lossMinusModelScoreDiff[i] << endl;
-    }
+  for (int i = 0; i < numberOfConstraints; ++i) {
+    cerr << "A: " << featureValueDiffs[i] << ", b: " << lossMinusModelScoreDiff[i] << endl;
+  }

-    vector< float> alphas1 = Hildreth::optimise(featureValueDiffs, lossMinusModelScoreDiff);
-    vector< float> alphas2 = Hildreth::optimise(featureValueDiffs, lossMinusModelScoreDiff, 0.01);
+  vector< float> alphas1 = Hildreth::optimise(featureValueDiffs, lossMinusModelScoreDiff);
+  vector< float> alphas2 = Hildreth::optimise(featureValueDiffs, lossMinusModelScoreDiff, 0.01);

-    cerr << "\nalphas without slack:" << endl;
-    for (size_t i = 0; i < alphas1.size(); ++i) {
-      cerr << "alpha " << i << ": " << alphas1[i] << endl;
-    }
-    cerr << endl;
+  cerr << "\nalphas without slack:" << endl;
+  for (size_t i = 0; i < alphas1.size(); ++i) {
+    cerr << "alpha " << i << ": " << alphas1[i] << endl;
+  }
+  cerr << endl;

-    cerr << "partial updates:" << endl;
-    vector< ScoreComponentCollection> featureValueDiffs1(featureValueDiffs);
-    FVector totalUpdate1 = ScoreComponentCollection::CreateFVector();
-    for (size_t k = 0; k < featureValueDiffs1.size(); ++k) {
-      featureValueDiffs1[k].MultiplyEquals(alphas1[k]);
-      cerr << k << ": " << featureValueDiffs1[k].GetScoresVector() << endl;
-      FVector update = featureValueDiffs1[k].GetScoresVector();
-      totalUpdate1 += update;
-    }
-    cerr << endl;
-    cerr << "total update: " << totalUpdate1 << endl << endl;
+  cerr << "partial updates:" << endl;
+  vector< ScoreComponentCollection> featureValueDiffs1(featureValueDiffs);
+  FVector totalUpdate1 = ScoreComponentCollection::CreateFVector();
+  for (size_t k = 0; k < featureValueDiffs1.size(); ++k) {
+    featureValueDiffs1[k].MultiplyEquals(alphas1[k]);
+    cerr << k << ": " << featureValueDiffs1[k].GetScoresVector() << endl;
+    FVector update = featureValueDiffs1[k].GetScoresVector();
+    totalUpdate1 += update;
+  }
+  cerr << endl;
+  cerr << "total update: " << totalUpdate1 << endl << endl;

-    ScoreComponentCollection weightsUpdate1(weights);
-    weightsUpdate1.PlusEquals(totalUpdate1);
-    cerr << "old weights: " << weights << endl;
-    cerr << "new weights: " << weightsUpdate1 << endl << endl;
+  ScoreComponentCollection weightsUpdate1(weights);
+  weightsUpdate1.PlusEquals(totalUpdate1);
+  cerr << "old weights: " << weights << endl;
+  cerr << "new weights: " << weightsUpdate1 << endl << endl;

-    vector<float> newModelScoreDiff;
-    for (int i = 0; i < numberOfConstraints; ++i) {
-      newModelScoreDiff.push_back(featureValueDiffs[i].InnerProduct(weightsUpdate1));
-    }
+  vector<float> newModelScoreDiff;
+  for (int i = 0; i < numberOfConstraints; ++i) {
+    newModelScoreDiff.push_back(featureValueDiffs[i].InnerProduct(weightsUpdate1));
+  }

-    for (int i = 0; i < numberOfConstraints; ++i) {
-      cerr << "new model score diff: " << newModelScoreDiff[i] << ", loss: " << losses[i] << "\t" << (newModelScoreDiff[i] >= losses[i] ? 1 : 0) << endl;
-    }
+  for (int i = 0; i < numberOfConstraints; ++i) {
+    cerr << "new model score diff: " << newModelScoreDiff[i] << ", loss: " << losses[i] << "\t" << (newModelScoreDiff[i] >= losses[i] ? 1 : 0) << endl;
+  }

-    cerr << "\n\nalphas with slack 0.01:" << endl;
-    for (size_t i = 0; i < alphas2.size(); ++i) {
-      cerr << "alpha " << i << ": " << alphas2[i] << endl;
-    }
-    cerr << endl;
+  cerr << "\n\nalphas with slack 0.01:" << endl;
+  for (size_t i = 0; i < alphas2.size(); ++i) {
+    cerr << "alpha " << i << ": " << alphas2[i] << endl;
+  }
+  cerr << endl;

-    cerr << "partial updates:" << endl;
-    vector< ScoreComponentCollection> featureValueDiffs2(featureValueDiffs);
-    FVector totalUpdate2 = ScoreComponentCollection::CreateFVector();
-    for (size_t k = 0; k < featureValueDiffs2.size(); ++k) {
-      featureValueDiffs2[k].MultiplyEquals(alphas2[k]);
-      cerr << k << ": " << featureValueDiffs2[k].GetScoresVector() << endl;
-      FVector update = featureValueDiffs2[k].GetScoresVector();
-      totalUpdate2 += update;
-    }
-    cerr << endl;
-    cerr << "total update: " << totalUpdate2 << endl << endl;
+  cerr << "partial updates:" << endl;
+  vector< ScoreComponentCollection> featureValueDiffs2(featureValueDiffs);
+  FVector totalUpdate2 = ScoreComponentCollection::CreateFVector();
+  for (size_t k = 0; k < featureValueDiffs2.size(); ++k) {
+    featureValueDiffs2[k].MultiplyEquals(alphas2[k]);
+    cerr << k << ": " << featureValueDiffs2[k].GetScoresVector() << endl;
+    FVector update = featureValueDiffs2[k].GetScoresVector();
+    totalUpdate2 += update;
+  }
+  cerr << endl;
+  cerr << "total update: " << totalUpdate2 << endl << endl;

-    ScoreComponentCollection weightsUpdate2(weights);
-    weightsUpdate2.PlusEquals(totalUpdate2);
-    cerr << "old weights: " << weights << endl;
-    cerr << "new weights: " << weightsUpdate2 << endl << endl;
+  ScoreComponentCollection weightsUpdate2(weights);
+  weightsUpdate2.PlusEquals(totalUpdate2);
+  cerr << "old weights: " << weights << endl;
+  cerr << "new weights: " << weightsUpdate2 << endl << endl;

-    newModelScoreDiff.clear();
-    for (int i = 0; i < numberOfConstraints; ++i) {
-      newModelScoreDiff.push_back(featureValueDiffs[i].InnerProduct(weightsUpdate2));
-    }
+  newModelScoreDiff.clear();
+  for (int i = 0; i < numberOfConstraints; ++i) {
+    newModelScoreDiff.push_back(featureValueDiffs[i].InnerProduct(weightsUpdate2));
+  }

-    for (int i = 0; i < numberOfConstraints; ++i) {
-      cerr << "new model score diff: " << newModelScoreDiff[i] << ", loss: " << losses[i] << endl;
-    }
+  for (int i = 0; i < numberOfConstraints; ++i) {
+    cerr << "new model score diff: " << newModelScoreDiff[i] << ", loss: " << losses[i] << endl;
+  }
 }

 BOOST_FIXTURE_TEST_CASE(test_hildreth_5, MockProducers)
 {
-    // Unfeasible example with 2 constraints
-    cerr << "\n>>>>>Hildreth test, without slack and with 0.01 slack" << endl << endl;
-    vector< ScoreComponentCollection> featureValueDiffs;
-    vector< float> lossMinusModelScoreDiff;
+  // Unfeasible example with 2 constraints
+  cerr << "\n>>>>>Hildreth test, without slack and with 0.01 slack" << endl << endl;
+  vector< ScoreComponentCollection> featureValueDiffs;
+  vector< float> lossMinusModelScoreDiff;

-    // initial weights
-    float w[] = { 1, 1, 0.638672, 1, 0 };
-    vector<float> vec(w,w+5);
-    ScoreComponentCollection weights;
-    weights.PlusEquals(&multi, vec);
+  // initial weights
+  float w[] = { 1, 1, 0.638672, 1, 0 };
+  vector<float> vec(w,w+5);
+  ScoreComponentCollection weights;
+  weights.PlusEquals(&multi, vec);

-    int numberOfConstraints = 2;
+  int numberOfConstraints = 2;

-    // feature value differences (to oracle)
-    // NOTE: these feature values are only approximations
-    ScoreComponentCollection s1, s17;
-    float arr1[] = { 0, 0, -2.0672, 0, 0 };
-    float arr17[] = { 0, 0, 4.4283, 0, 0 };
-    vector<float> losses;
-    losses.push_back(2.73485);
-    losses.push_back(3.64118);
+  // feature value differences (to oracle)
+  // NOTE: these feature values are only approximations
+  ScoreComponentCollection s1, s17;
+  float arr1[] = { 0, 0, -2.0672, 0, 0 };
+  float arr17[] = { 0, 0, 4.4283, 0, 0 };
+  vector<float> losses;
+  losses.push_back(2.73485);
+  losses.push_back(3.64118);

-    vector<float> vec1(arr1,arr1+5);
-    vector<float> vec17(arr17,arr17+5);
+  vector<float> vec1(arr1,arr1+5);
+  vector<float> vec17(arr17,arr17+5);

-    s1.PlusEquals(&multi,vec1);
-    s17.PlusEquals(&multi,vec17);
+  s1.PlusEquals(&multi,vec1);
+  s17.PlusEquals(&multi,vec17);

-    featureValueDiffs.push_back(s1);
-    featureValueDiffs.push_back(s17);
+  featureValueDiffs.push_back(s1);
+  featureValueDiffs.push_back(s17);

-    vector<float> oldModelScoreDiff;
-    for (int i = 0; i < numberOfConstraints; ++i) {
-      oldModelScoreDiff.push_back(featureValueDiffs[i].InnerProduct(weights));
-    }
+  vector<float> oldModelScoreDiff;
+  for (int i = 0; i < numberOfConstraints; ++i) {
+    oldModelScoreDiff.push_back(featureValueDiffs[i].InnerProduct(weights));
+  }

-    float sumOfOldError = 0;
-    for (int i = 0; i < numberOfConstraints; ++i) {
-      cerr << "old model score diff: " << oldModelScoreDiff[i] << ", loss: " << losses[i] << "\t" << (oldModelScoreDiff[i] >= losses[i] ? 1 : 0) << endl;
-      sumOfOldError += (losses[i] - oldModelScoreDiff[i]);
-    }
-    cerr << "sum of old error: " << sumOfOldError << endl;
+  float sumOfOldError = 0;
+  for (int i = 0; i < numberOfConstraints; ++i) {
+    cerr << "old model score diff: " << oldModelScoreDiff[i] << ", loss: " << losses[i] << "\t" << (oldModelScoreDiff[i] >= losses[i] ? 1 : 0) << endl;
+    sumOfOldError += (losses[i] - oldModelScoreDiff[i]);
+  }
+  cerr << "sum of old error: " << sumOfOldError << endl;

-    for (int i = 0; i < numberOfConstraints; ++i) {
-      lossMinusModelScoreDiff.push_back(losses[i] - oldModelScoreDiff[i]);
-    }
+  for (int i = 0; i < numberOfConstraints; ++i) {
+    lossMinusModelScoreDiff.push_back(losses[i] - oldModelScoreDiff[i]);
+  }

-    for (int i = 0; i < numberOfConstraints; ++i) {
-      cerr << "A: " << featureValueDiffs[i] << ", b: " << lossMinusModelScoreDiff[i] << endl;
-    }
+  for (int i = 0; i < numberOfConstraints; ++i) {
+    cerr << "A: " << featureValueDiffs[i] << ", b: " << lossMinusModelScoreDiff[i] << endl;
+  }

-    vector< float> alphas1 = Hildreth::optimise(featureValueDiffs, lossMinusModelScoreDiff);
-    vector< float> alphas2 = Hildreth::optimise(featureValueDiffs, lossMinusModelScoreDiff, 0.01);
-    vector< float> alphas3 = Hildreth::optimise(featureValueDiffs, lossMinusModelScoreDiff, 0.1);
+  vector< float> alphas1 = Hildreth::optimise(featureValueDiffs, lossMinusModelScoreDiff);
+  vector< float> alphas2 = Hildreth::optimise(featureValueDiffs, lossMinusModelScoreDiff, 0.01);
+  vector< float> alphas3 = Hildreth::optimise(featureValueDiffs, lossMinusModelScoreDiff, 0.1);

-    cerr << "\nalphas without slack:" << endl;
-    for (size_t i = 0; i < alphas1.size(); ++i) {
-      cerr << "alpha " << i << ": " << alphas1[i] << endl;
-    }
-    cerr << endl;
+  cerr << "\nalphas without slack:" << endl;
+  for (size_t i = 0; i < alphas1.size(); ++i) {
+    cerr << "alpha " << i << ": " << alphas1[i] << endl;
+  }
+  cerr << endl;

-    cerr << "partial updates:" << endl;
-    vector< ScoreComponentCollection> featureValueDiffs1(featureValueDiffs);
-    FVector totalUpdate1 = ScoreComponentCollection::CreateFVector();
-    for (size_t k = 0; k < featureValueDiffs1.size(); ++k) {
-      featureValueDiffs1[k].MultiplyEquals(alphas1[k]);
-      cerr << k << ": " << featureValueDiffs1[k].GetScoresVector() << endl;
-      FVector update = featureValueDiffs1[k].GetScoresVector();
-      totalUpdate1 += update;
-    }
-    cerr << endl;
-    cerr << "total update: " << totalUpdate1 << endl << endl;
+  cerr << "partial updates:" << endl;
+  vector< ScoreComponentCollection> featureValueDiffs1(featureValueDiffs);
+  FVector totalUpdate1 = ScoreComponentCollection::CreateFVector();
+  for (size_t k = 0; k < featureValueDiffs1.size(); ++k) {
+    featureValueDiffs1[k].MultiplyEquals(alphas1[k]);
+    cerr << k << ": " << featureValueDiffs1[k].GetScoresVector() << endl;
+    FVector update = featureValueDiffs1[k].GetScoresVector();
+    totalUpdate1 += update;
+  }
+  cerr << endl;
+  cerr << "total update: " << totalUpdate1 << endl << endl;

-    ScoreComponentCollection weightsUpdate1(weights);
-    weightsUpdate1.PlusEquals(totalUpdate1);
-    cerr << "old weights: " << weights << endl;
-    cerr << "new weights: " << weightsUpdate1 << endl << endl;
+  ScoreComponentCollection weightsUpdate1(weights);
+  weightsUpdate1.PlusEquals(totalUpdate1);
+  cerr << "old weights: " << weights << endl;
+  cerr << "new weights: " << weightsUpdate1 << endl << endl;

-    vector<float> newModelScoreDiff;
-    for (int i = 0; i < numberOfConstraints; ++i) {
-      newModelScoreDiff.push_back(featureValueDiffs[i].InnerProduct(weightsUpdate1));
-    }
+  vector<float> newModelScoreDiff;
+  for (int i = 0; i < numberOfConstraints; ++i) {
+    newModelScoreDiff.push_back(featureValueDiffs[i].InnerProduct(weightsUpdate1));
+  }

-    float sumOfNewError = 0;
-    for (int i = 0; i < numberOfConstraints; ++i) {
-      cerr << "new model score diff: " << newModelScoreDiff[i] << ", loss: " << losses[i] << "\t" << (newModelScoreDiff[i] >= losses[i] ? 1 : 0) << endl;
-      sumOfNewError += (losses[i] - newModelScoreDiff[i]);
-    }
-    cerr << "sum of new error: " << sumOfNewError << endl;
+  float sumOfNewError = 0;
+  for (int i = 0; i < numberOfConstraints; ++i) {
+    cerr << "new model score diff: " << newModelScoreDiff[i] << ", loss: " << losses[i] << "\t" << (newModelScoreDiff[i] >= losses[i] ?
1 : 0) << endl; + sumOfNewError += (losses[i] - newModelScoreDiff[i]); + } + cerr << "sum of new error: " << sumOfNewError << endl; - cerr << "\n\nalphas with slack 0.01:" << endl; - for (size_t i = 0; i < alphas2.size(); ++i) { - cerr << "alpha " << i << ": " << alphas2[i] << endl; - } - cerr << endl; + cerr << "\n\nalphas with slack 0.01:" << endl; + for (size_t i = 0; i < alphas2.size(); ++i) { + cerr << "alpha " << i << ": " << alphas2[i] << endl; + } + cerr << endl; - cerr << "partial updates:" << endl; - vector< ScoreComponentCollection> featureValueDiffs2(featureValueDiffs); - FVector totalUpdate2 = ScoreComponentCollection::CreateFVector(); - for (size_t k = 0; k < featureValueDiffs2.size(); ++k) { - featureValueDiffs2[k].MultiplyEquals(alphas2[k]); - cerr << k << ": " << featureValueDiffs2[k].GetScoresVector() << endl; - FVector update = featureValueDiffs2[k].GetScoresVector(); - totalUpdate2 += update; - } - cerr << endl; - cerr << "total update: " << totalUpdate2 << endl << endl; + cerr << "partial updates:" << endl; + vector< ScoreComponentCollection> featureValueDiffs2(featureValueDiffs); + FVector totalUpdate2 = ScoreComponentCollection::CreateFVector(); + for (size_t k = 0; k < featureValueDiffs2.size(); ++k) { + featureValueDiffs2[k].MultiplyEquals(alphas2[k]); + cerr << k << ": " << featureValueDiffs2[k].GetScoresVector() << endl; + FVector update = featureValueDiffs2[k].GetScoresVector(); + totalUpdate2 += update; + } + cerr << endl; + cerr << "total update: " << totalUpdate2 << endl << endl; - ScoreComponentCollection weightsUpdate2(weights); - weightsUpdate2.PlusEquals(totalUpdate2); - cerr << "old weights: " << weights << endl; - cerr << "new weights: " << weightsUpdate2 << endl << endl; + ScoreComponentCollection weightsUpdate2(weights); + weightsUpdate2.PlusEquals(totalUpdate2); + cerr << "old weights: " << weights << endl; + cerr << "new weights: " << weightsUpdate2 << endl << endl; - newModelScoreDiff.clear(); - for (int i = 0; i < numberOfConstraints; ++i) { - newModelScoreDiff.push_back(featureValueDiffs[i].InnerProduct(weightsUpdate2)); - } + newModelScoreDiff.clear(); + for (int i = 0; i < numberOfConstraints; ++i) { + newModelScoreDiff.push_back(featureValueDiffs[i].InnerProduct(weightsUpdate2)); + } - sumOfNewError = 0; - for (int i = 0; i < numberOfConstraints; ++i) { - cerr << "new model score diff: " << newModelScoreDiff[i] << ", loss: " << losses[i] << "\t" << (newModelScoreDiff[i] >= losses[i] ? 1 : 0) << endl; - sumOfNewError += (losses[i] - newModelScoreDiff[i]); - } - cerr << "sum of new error: " << sumOfNewError << endl; + sumOfNewError = 0; + for (int i = 0; i < numberOfConstraints; ++i) { + cerr << "new model score diff: " << newModelScoreDiff[i] << ", loss: " << losses[i] << "\t" << (newModelScoreDiff[i] >= losses[i] ? 
1 : 0) << endl; + sumOfNewError += (losses[i] - newModelScoreDiff[i]); + } + cerr << "sum of new error: " << sumOfNewError << endl; - cerr << "\n\nalphas with slack 0.1:" << endl; - for (size_t i = 0; i < alphas3.size(); ++i) { - cerr << "alpha " << i << ": " << alphas3[i] << endl; - } - cerr << endl; + cerr << "\n\nalphas with slack 0.1:" << endl; + for (size_t i = 0; i < alphas3.size(); ++i) { + cerr << "alpha " << i << ": " << alphas3[i] << endl; + } + cerr << endl; - cerr << "partial updates:" << endl; - vector< ScoreComponentCollection> featureValueDiffs3(featureValueDiffs); - FVector totalUpdate3 = ScoreComponentCollection::CreateFVector(); - for (size_t k = 0; k < featureValueDiffs3.size(); ++k) { - featureValueDiffs3[k].MultiplyEquals(alphas3[k]); - cerr << k << ": " << featureValueDiffs3[k].GetScoresVector() << endl; - FVector update = featureValueDiffs3[k].GetScoresVector(); - totalUpdate3 += update; - } - cerr << endl; - cerr << "total update: " << totalUpdate3 << endl << endl; + cerr << "partial updates:" << endl; + vector< ScoreComponentCollection> featureValueDiffs3(featureValueDiffs); + FVector totalUpdate3 = ScoreComponentCollection::CreateFVector(); + for (size_t k = 0; k < featureValueDiffs3.size(); ++k) { + featureValueDiffs3[k].MultiplyEquals(alphas3[k]); + cerr << k << ": " << featureValueDiffs3[k].GetScoresVector() << endl; + FVector update = featureValueDiffs3[k].GetScoresVector(); + totalUpdate3 += update; + } + cerr << endl; + cerr << "total update: " << totalUpdate3 << endl << endl; - ScoreComponentCollection weightsUpdate3(weights); - weightsUpdate3.PlusEquals(totalUpdate3); - cerr << "old weights: " << weights << endl; - cerr << "new weights: " << weightsUpdate3 << endl << endl; + ScoreComponentCollection weightsUpdate3(weights); + weightsUpdate3.PlusEquals(totalUpdate3); + cerr << "old weights: " << weights << endl; + cerr << "new weights: " << weightsUpdate3 << endl << endl; - newModelScoreDiff.clear(); - for (int i = 0; i < numberOfConstraints; ++i) { - newModelScoreDiff.push_back(featureValueDiffs[i].InnerProduct(weightsUpdate3)); - } + newModelScoreDiff.clear(); + for (int i = 0; i < numberOfConstraints; ++i) { + newModelScoreDiff.push_back(featureValueDiffs[i].InnerProduct(weightsUpdate3)); + } - sumOfNewError = 0; - for (int i = 0; i < numberOfConstraints; ++i) { - cerr << "new model score diff: " << newModelScoreDiff[i] << ", loss: " << losses[i] << "\t" << (newModelScoreDiff[i] >= losses[i] ? 1 : 0) << endl; - sumOfNewError += (losses[i] - newModelScoreDiff[i]); - } - cerr << "sum of new error: " << sumOfNewError << endl; + sumOfNewError = 0; + for (int i = 0; i < numberOfConstraints; ++i) { + cerr << "new model score diff: " << newModelScoreDiff[i] << ", loss: " << losses[i] << "\t" << (newModelScoreDiff[i] >= losses[i] ? 
1 : 0) << endl; + sumOfNewError += (losses[i] - newModelScoreDiff[i]); + } + cerr << "sum of new error: " << sumOfNewError << endl; } BOOST_AUTO_TEST_SUITE_END() diff --git a/mira/HypothesisQueue.cpp b/mira/HypothesisQueue.cpp index 43e082b92..8c8daa4da 100644 --- a/mira/HypothesisQueue.cpp +++ b/mira/HypothesisQueue.cpp @@ -1,17 +1,17 @@ /*********************************************************************** Moses - statistical machine translation system Copyright (C) 2006-2011 University of Edinburgh - + This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. - + This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. - + You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA @@ -22,13 +22,16 @@ using namespace std; -namespace Moses { +namespace Moses +{ -HypothesisQueue::~HypothesisQueue() { +HypothesisQueue::~HypothesisQueue() +{ m_queue.clear(); } -void HypothesisQueue::Push(BleuIndexPair hypo) { +void HypothesisQueue::Push(BleuIndexPair hypo) +{ //pair::iterator,bool> ret; if (m_capacity == 0 || m_queue.size() < m_capacity) { @@ -52,7 +55,8 @@ void HypothesisQueue::Push(BleuIndexPair hypo) { } } -BleuIndexPair HypothesisQueue::Pop() { +BleuIndexPair HypothesisQueue::Pop() +{ HypoQueueType::iterator p = m_queue.begin(); BleuIndexPair top = *p; m_queue.erase(p); diff --git a/mira/HypothesisQueue.h b/mira/HypothesisQueue.h index a926a40da..63cabbd0f 100644 --- a/mira/HypothesisQueue.h +++ b/mira/HypothesisQueue.h @@ -1,17 +1,17 @@ /*********************************************************************** Moses - statistical machine translation system Copyright (C) 2006-2011 University of Edinburgh - + This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. - + This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. - + You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA @@ -21,7 +21,8 @@ #include -namespace Moses { +namespace Moses +{ // pair of Bleu score and index typedef std::pair BleuIndexPair; @@ -30,14 +31,17 @@ typedef std::pair BleuIndexPair; // the best scoring hypothesis. The queue assumes ownership of pushed items and // relinquishes ownership when they are popped. Any remaining items at the // time of the queue's destruction are deleted. -class HypothesisQueue { +class HypothesisQueue +{ - public: +public: // Create empty queue with fixed capacity of c. Capacity 0 means unbounded. 
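// A minimal usage sketch of the bounded-queue contract described above --
// not part of this patch. BleuIndexPair's template arguments are elided
// throughout this diff; std::pair<float, size_t> (Bleu score, index) is
// assumed here, matching the "pair of Bleu score and index" comment, and
// the include path below is hypothetical.
#include "HypothesisQueue.h"

void QueueSketch()
{
  Moses::HypothesisQueue queue(2);            // capacity 2; 0 would mean unbounded
  queue.Push(Moses::BleuIndexPair(0.31f, 0)); // below capacity: kept
  queue.Push(Moses::BleuIndexPair(0.45f, 1)); // below capacity: kept
  queue.Push(Moses::BleuIndexPair(0.10f, 2)); // queue full and score no better
                                              // than the worst entry: dropped
  while (!queue.Empty()) {
    Moses::BleuIndexPair best = queue.Pop();  // best Bleu first: (0.45, 1),
    (void)best;                               // then (0.31, 0)
  }
}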
HypothesisQueue(size_t c) : m_capacity(c) {} ~HypothesisQueue(); - bool Empty() { return m_queue.empty(); } + bool Empty() { + return m_queue.empty(); + } // Add the hypo to the queue or delete it if the queue is full and the // score is no better than the queue's worst score. @@ -47,17 +51,17 @@ class HypothesisQueue { // caller is responsible for deleting the object. BleuIndexPair Pop(); - private: +private: struct HypothesisOrderer { bool operator()(BleuIndexPair a, - BleuIndexPair b) { + BleuIndexPair b) { return (a.first > b.first); } }; typedef std::multiset HypoQueueType; //typedef std::set HypoQueueType; - + HypoQueueType m_queue; const size_t m_capacity; }; diff --git a/mira/Main.cpp b/mira/Main.cpp index 2c62256d9..0dbc9be43 100644 --- a/mira/Main.cpp +++ b/mira/Main.cpp @@ -54,7 +54,8 @@ using namespace std; using namespace Moses; namespace po = boost::program_options; -int main(int argc, char** argv) { +int main(int argc, char** argv) +{ size_t rank = 0; size_t size = 1; #ifdef MPI_ENABLE @@ -141,113 +142,113 @@ int main(int argc, char** argv) { bool modelPlusBleu, simpleHistoryBleu; po::options_description desc("Allowed options"); desc.add_options() - ("continue-epoch", po::value(&continue_epoch)->default_value(0), "Continue an interrupted experiment from this epoch on") - ("freq-reg", po::value(®_on_every_mix)->default_value(false), "Regularize after every weight mixing") - ("l1sparse", po::value(&l1_reg_sparse)->default_value(true), "L1-regularization for sparse weights only") - ("l2sparse", po::value(&l2_reg_sparse)->default_value(true), "L2-regularization for sparse weights only") - ("mv-reg", po::value(&most_violated_reg)->default_value(false), "Regularize most violated constraint") - ("dbg", po::value(&debug)->default_value(true), "More debug output") - ("make-pairs", po::value(&makePairs)->default_value(true), "Make pairs of hypotheses for 1slack") - ("debug", po::value(&debug)->default_value(true), "More debug output") - ("rescale-slack", po::value(&rescaleSlack)->default_value(false), "Rescale slack in 1-slack formulation") - ("disable-bleu-feature", po::value(&disableBleuFeature)->default_value(false), "Disable the Bleu feature") - ("real-bleu", po::value(&realBleu)->default_value(false), "Compute real sentence Bleu on complete translations") - ("add2lm", po::value(&add2lm)->default_value(0.0), "Add the specified amount to all LM weights") - ("hildreth", po::value(&hildreth)->default_value(false), "Prefer Hildreth over analytical update") - ("selective", po::value(&selective)->default_value(false), "Build constraints for every feature") - ("summed", po::value(&summed)->default_value(false), "Sum up all constraints") - ("model-plus-bleu", po::value(&modelPlusBleu)->default_value(false), "Use the sum of model score and +/- bleu to select hope and fear translations") - ("simple-history-bleu", po::value(&simpleHistoryBleu)->default_value(false), "Simple history Bleu") + ("continue-epoch", po::value(&continue_epoch)->default_value(0), "Continue an interrupted experiment from this epoch on") + ("freq-reg", po::value(®_on_every_mix)->default_value(false), "Regularize after every weight mixing") + ("l1sparse", po::value(&l1_reg_sparse)->default_value(true), "L1-regularization for sparse weights only") + ("l2sparse", po::value(&l2_reg_sparse)->default_value(true), "L2-regularization for sparse weights only") + ("mv-reg", po::value(&most_violated_reg)->default_value(false), "Regularize most violated constraint") + ("dbg", po::value(&debug)->default_value(true), "More debug 
output") + ("make-pairs", po::value(&makePairs)->default_value(true), "Make pairs of hypotheses for 1slack") + ("debug", po::value(&debug)->default_value(true), "More debug output") + ("rescale-slack", po::value(&rescaleSlack)->default_value(false), "Rescale slack in 1-slack formulation") + ("disable-bleu-feature", po::value(&disableBleuFeature)->default_value(false), "Disable the Bleu feature") + ("real-bleu", po::value(&realBleu)->default_value(false), "Compute real sentence Bleu on complete translations") + ("add2lm", po::value(&add2lm)->default_value(0.0), "Add the specified amount to all LM weights") + ("hildreth", po::value(&hildreth)->default_value(false), "Prefer Hildreth over analytical update") + ("selective", po::value(&selective)->default_value(false), "Build constraints for every feature") + ("summed", po::value(&summed)->default_value(false), "Sum up all constraints") + ("model-plus-bleu", po::value(&modelPlusBleu)->default_value(false), "Use the sum of model score and +/- bleu to select hope and fear translations") + ("simple-history-bleu", po::value(&simpleHistoryBleu)->default_value(false), "Simple history Bleu") - ("bleu-weight", po::value(&bleuWeight)->default_value(1.0), "Bleu weight used in decoder objective") - ("bw-hope", po::value(&bleuWeight_hope)->default_value(-1.0), "Bleu weight used in decoder objective for hope") - ("bw-fear", po::value(&bleuWeight_fear)->default_value(-1.0), "Bleu weight used in decoder objective for fear") + ("bleu-weight", po::value(&bleuWeight)->default_value(1.0), "Bleu weight used in decoder objective") + ("bw-hope", po::value(&bleuWeight_hope)->default_value(-1.0), "Bleu weight used in decoder objective for hope") + ("bw-fear", po::value(&bleuWeight_fear)->default_value(-1.0), "Bleu weight used in decoder objective for fear") - ("core-r0", po::value(&core_r0)->default_value(1.0), "Start learning rate for core features") - ("sparse-r0", po::value(&sparse_r0)->default_value(1.0), "Start learning rate for sparse features") + ("core-r0", po::value(&core_r0)->default_value(1.0), "Start learning rate for core features") + ("sparse-r0", po::value(&sparse_r0)->default_value(1.0), "Start learning rate for sparse features") - ("tie-bw-to-lm", po::value(&bleu_weight_lm)->default_value(false), "Make bleu weight depend on lm weight") - ("adjust-bw", po::value(&bleu_weight_lm_adjust)->default_value(false), "Adjust bleu weight when lm weight changes") - ("bw-lm-factor", po::value(&bleu_weight_lm_factor)->default_value(2.0), "Make bleu weight depend on lm weight by this factor") - ("bw-factor-fear", po::value(&bleu_weight_fear_factor)->default_value(1.0), "Multiply fear weight by this factor") - ("accumulate-weights", po::value(&accumulateWeights)->default_value(false), "Accumulate and average weights over all epochs") - ("average-weights", po::value(&averageWeights)->default_value(false), "Set decoder weights to average weights after each update") - ("avg-ref-length", po::value(&avgRefLength)->default_value(false), "Use average reference length instead of shortest for BLEU score feature") - ("batch-equals-shard", po::value(&batchEqualsShard)->default_value(false), "Batch size is equal to shard size (purely batch)") - ("batch-size,b", po::value(&batchSize)->default_value(1), "Size of batch that is send to optimiser for weight adjustments") - ("bleu-smoothing-scheme", po::value(&bleu_smoothing_scheme)->default_value(1), "Set a smoothing scheme for sentence-Bleu: +1 (1), +0.1 (2), papineni (3) (default:1)") - ("boost", 
po::value(&boost)->default_value(false), "Apply boosting factor to updates on misranked candidates") - ("config,f", po::value(&mosesConfigFile), "Moses ini-file") - ("configs-folds", po::value >(&mosesConfigFilesFolds), "Moses ini-files, one for each fold") - ("debug-model", po::value(&debug_model)->default_value(false), "Get best model translation for debugging purposes") - ("decode-hope", po::value(&decode_hope)->default_value(false), "Decode dev input set according to hope objective") - ("decode-fear", po::value(&decode_fear)->default_value(false), "Decode dev input set according to fear objective") - ("decode-model", po::value(&decode_model)->default_value(false), "Decode dev input set according to normal objective") - ("decode-filename", po::value(&decode_filename), "Filename for Bleu objective translations") - ("decoder-settings", po::value(&decoder_settings)->default_value(""), "Decoder settings for tuning runs") - ("distinct-nbest", po::value(&distinctNbest)->default_value(true), "Use n-best list with distinct translations in inference step") - ("dump-mixed-weights", po::value(&dumpMixedWeights)->default_value(false), "Dump mixed weights instead of averaged weights") - ("epochs,e", po::value(&epochs)->default_value(10), "Number of epochs") - ("feature-cutoff", po::value(&featureCutoff)->default_value(-1), "Feature cutoff as additional regularization for sparse features") - ("fear-n", po::value(&fear_n)->default_value(1), "Number of fear translations used") - ("help", po::value(&help)->zero_tokens()->default_value(false), "Print this help message and exit") - ("history-bleu", po::value(&historyBleu)->default_value(false), "Use 1best translations to update the history") - ("history-smoothing", po::value(&historySmoothing)->default_value(0.9), "Adjust the factor for history smoothing") - ("hope-fear", po::value(&hope_fear)->default_value(true), "Use only hope and fear translations for optimisation (not model)") - ("hope-n", po::value(&hope_n)->default_value(2), "Number of hope translations used") - ("input-file,i", po::value(&inputFile), "Input file containing tokenised source") - ("input-files-folds", po::value >(&inputFilesFolds), "Input files containing tokenised source, one for each fold") - ("learner,l", po::value(&learner)->default_value("mira"), "Learning algorithm") - ("l1-lambda", po::value(&l1_lambda)->default_value(0.0001), "Lambda for l1-regularization (w_i +/- lambda)") - ("l2-lambda", po::value(&l2_lambda)->default_value(0.01), "Lambda for l2-regularization (w_i * (1 - lambda))") - ("l1-reg", po::value(&l1_regularize)->default_value(false), "L1-regularization") - ("l2-reg", po::value(&l2_regularize)->default_value(false), "L2-regularization") - ("min-bleu-ratio", po::value(&minBleuRatio)->default_value(-1), "Set a minimum BLEU ratio between hope and fear") - ("max-bleu-ratio", po::value(&maxBleuRatio)->default_value(-1), "Set a maximum BLEU ratio between hope and fear") - ("max-bleu-diff", po::value(&max_bleu_diff)->default_value(true), "Select hope/fear with maximum Bleu difference") - ("min-oracle-bleu", po::value(&min_oracle_bleu)->default_value(0), "Set a minimum oracle BLEU score") - ("min-weight-change", po::value(&min_weight_change)->default_value(0.0001), "Set minimum weight change for stopping criterion") - ("mira-learning-rate", po::value(&mira_learning_rate)->default_value(1), "Learning rate for MIRA (fixed or flexible)") - ("mixing-frequency", po::value(&mixingFrequency)->default_value(1), "How often per epoch to mix weights, when using mpi") - 
("model-hope-fear", po::value(&model_hope_fear)->default_value(false), "Use model, hope and fear translations for optimisation") - ("moses-src", po::value(&moses_src)->default_value(""), "Moses source directory") - ("nbest,n", po::value(&n)->default_value(1), "Number of translations in n-best list") - ("normalise-weights", po::value(&normaliseWeights)->default_value(false), "Whether to normalise the updated weights before passing them to the decoder") - ("normalise-margin", po::value(&normaliseMargin)->default_value(false), "Normalise the margin: squash between 0 and 1") - ("perceptron-learning-rate", po::value(&perceptron_learning_rate)->default_value(0.01), "Perceptron learning rate") - ("print-feature-values", po::value(&print_feature_values)->default_value(false), "Print out feature values") - ("print-feature-counts", po::value(&printFeatureCounts)->default_value(false), "Print out feature values, print feature list with hope counts after 1st epoch") - ("print-nbest-with-features", po::value(&printNbestWithFeatures)->default_value(false), "Print out feature values, print feature list with hope counts after 1st epoch") - ("print-weights", po::value(&print_weights)->default_value(false), "Print out current weights") - ("print-core-weights", po::value(&print_core_weights)->default_value(true), "Print out current core weights") - ("prune-zero-weights", po::value(&pruneZeroWeights)->default_value(false), "Prune zero-valued sparse feature weights") - ("reference-files,r", po::value >(&referenceFiles), "Reference translation files for training") - ("reference-files-folds", po::value >(&referenceFilesFolds), "Reference translation files for training, one for each fold") - ("kbest", po::value(&kbest)->default_value(false), "Select hope/fear pairs from a list of nbest translations") - - ("scale-by-inverse-length", po::value(&scaleByInverseLength)->default_value(false), "Scale BLEU by (history of) inverse input length") - ("scale-by-input-length", po::value(&scaleByInputLength)->default_value(false), "Scale BLEU by (history of) input length") - ("scale-by-avg-input-length", po::value(&scaleByAvgInputLength)->default_value(false), "Scale BLEU by average input length") - ("scale-by-avg-inverse-length", po::value(&scaleByAvgInverseLength)->default_value(false), "Scale BLEU by average inverse input length") - ("scale-by-x", po::value(&scaleByX)->default_value(1), "Scale the BLEU score by value x") - ("scale-lm", po::value(&scale_lm)->default_value(false), "Scale the language model feature") - ("scale-factor-lm", po::value(&scale_lm_factor)->default_value(2), "Scale the language model feature by this factor") - ("scale-wp", po::value(&scale_wp)->default_value(false), "Scale the word penalty feature") - ("scale-factor-wp", po::value(&scale_wp_factor)->default_value(2), "Scale the word penalty feature by this factor") - ("scale-margin", po::value(&scale_margin)->default_value(0), "Scale the margin by the Bleu score of the oracle translation") - ("scale-margin-precision", po::value(&scale_margin_precision)->default_value(0), "Scale margin by precision of oracle") - ("scale-update", po::value(&scale_update)->default_value(0), "Scale update by Bleu score of oracle") - ("scale-update-precision", po::value(&scale_update_precision)->default_value(0), "Scale update by precision of oracle") - ("sentence-level-bleu", po::value(&sentenceBleu)->default_value(true), "Use a sentences level Bleu scoring function") - ("shuffle", po::value(&shuffle)->default_value(false), "Shuffle input sentences before processing") - 
("sigmoid-param", po::value(&sigmoidParam)->default_value(1), "y=sigmoidParam is the axis that this sigmoid approaches") - ("slack", po::value(&slack)->default_value(0.01), "Use slack in optimiser") - ("sparse-average", po::value(&sparseAverage)->default_value(false), "Average weights by the number of processes") - ("sparse-no-average", po::value(&sparseNoAverage)->default_value(false), "Don't average sparse weights, just sum") - ("stop-weights", po::value(&weightConvergence)->default_value(true), "Stop when weights converge") - ("verbosity,v", po::value(&verbosity)->default_value(0), "Verbosity level") - ("weight-dump-frequency", po::value(&weightDumpFrequency)->default_value(1), "How often per epoch to dump weights (mpi)") - ("weight-dump-stem", po::value(&weightDumpStem)->default_value("weights"), "Stem of filename to use for dumping weights"); + ("tie-bw-to-lm", po::value(&bleu_weight_lm)->default_value(false), "Make bleu weight depend on lm weight") + ("adjust-bw", po::value(&bleu_weight_lm_adjust)->default_value(false), "Adjust bleu weight when lm weight changes") + ("bw-lm-factor", po::value(&bleu_weight_lm_factor)->default_value(2.0), "Make bleu weight depend on lm weight by this factor") + ("bw-factor-fear", po::value(&bleu_weight_fear_factor)->default_value(1.0), "Multiply fear weight by this factor") + ("accumulate-weights", po::value(&accumulateWeights)->default_value(false), "Accumulate and average weights over all epochs") + ("average-weights", po::value(&averageWeights)->default_value(false), "Set decoder weights to average weights after each update") + ("avg-ref-length", po::value(&avgRefLength)->default_value(false), "Use average reference length instead of shortest for BLEU score feature") + ("batch-equals-shard", po::value(&batchEqualsShard)->default_value(false), "Batch size is equal to shard size (purely batch)") + ("batch-size,b", po::value(&batchSize)->default_value(1), "Size of batch that is send to optimiser for weight adjustments") + ("bleu-smoothing-scheme", po::value(&bleu_smoothing_scheme)->default_value(1), "Set a smoothing scheme for sentence-Bleu: +1 (1), +0.1 (2), papineni (3) (default:1)") + ("boost", po::value(&boost)->default_value(false), "Apply boosting factor to updates on misranked candidates") + ("config,f", po::value(&mosesConfigFile), "Moses ini-file") + ("configs-folds", po::value >(&mosesConfigFilesFolds), "Moses ini-files, one for each fold") + ("debug-model", po::value(&debug_model)->default_value(false), "Get best model translation for debugging purposes") + ("decode-hope", po::value(&decode_hope)->default_value(false), "Decode dev input set according to hope objective") + ("decode-fear", po::value(&decode_fear)->default_value(false), "Decode dev input set according to fear objective") + ("decode-model", po::value(&decode_model)->default_value(false), "Decode dev input set according to normal objective") + ("decode-filename", po::value(&decode_filename), "Filename for Bleu objective translations") + ("decoder-settings", po::value(&decoder_settings)->default_value(""), "Decoder settings for tuning runs") + ("distinct-nbest", po::value(&distinctNbest)->default_value(true), "Use n-best list with distinct translations in inference step") + ("dump-mixed-weights", po::value(&dumpMixedWeights)->default_value(false), "Dump mixed weights instead of averaged weights") + ("epochs,e", po::value(&epochs)->default_value(10), "Number of epochs") + ("feature-cutoff", po::value(&featureCutoff)->default_value(-1), "Feature cutoff as additional regularization 
for sparse features") + ("fear-n", po::value(&fear_n)->default_value(1), "Number of fear translations used") + ("help", po::value(&help)->zero_tokens()->default_value(false), "Print this help message and exit") + ("history-bleu", po::value(&historyBleu)->default_value(false), "Use 1best translations to update the history") + ("history-smoothing", po::value(&historySmoothing)->default_value(0.9), "Adjust the factor for history smoothing") + ("hope-fear", po::value(&hope_fear)->default_value(true), "Use only hope and fear translations for optimisation (not model)") + ("hope-n", po::value(&hope_n)->default_value(2), "Number of hope translations used") + ("input-file,i", po::value(&inputFile), "Input file containing tokenised source") + ("input-files-folds", po::value >(&inputFilesFolds), "Input files containing tokenised source, one for each fold") + ("learner,l", po::value(&learner)->default_value("mira"), "Learning algorithm") + ("l1-lambda", po::value(&l1_lambda)->default_value(0.0001), "Lambda for l1-regularization (w_i +/- lambda)") + ("l2-lambda", po::value(&l2_lambda)->default_value(0.01), "Lambda for l2-regularization (w_i * (1 - lambda))") + ("l1-reg", po::value(&l1_regularize)->default_value(false), "L1-regularization") + ("l2-reg", po::value(&l2_regularize)->default_value(false), "L2-regularization") + ("min-bleu-ratio", po::value(&minBleuRatio)->default_value(-1), "Set a minimum BLEU ratio between hope and fear") + ("max-bleu-ratio", po::value(&maxBleuRatio)->default_value(-1), "Set a maximum BLEU ratio between hope and fear") + ("max-bleu-diff", po::value(&max_bleu_diff)->default_value(true), "Select hope/fear with maximum Bleu difference") + ("min-oracle-bleu", po::value(&min_oracle_bleu)->default_value(0), "Set a minimum oracle BLEU score") + ("min-weight-change", po::value(&min_weight_change)->default_value(0.0001), "Set minimum weight change for stopping criterion") + ("mira-learning-rate", po::value(&mira_learning_rate)->default_value(1), "Learning rate for MIRA (fixed or flexible)") + ("mixing-frequency", po::value(&mixingFrequency)->default_value(1), "How often per epoch to mix weights, when using mpi") + ("model-hope-fear", po::value(&model_hope_fear)->default_value(false), "Use model, hope and fear translations for optimisation") + ("moses-src", po::value(&moses_src)->default_value(""), "Moses source directory") + ("nbest,n", po::value(&n)->default_value(1), "Number of translations in n-best list") + ("normalise-weights", po::value(&normaliseWeights)->default_value(false), "Whether to normalise the updated weights before passing them to the decoder") + ("normalise-margin", po::value(&normaliseMargin)->default_value(false), "Normalise the margin: squash between 0 and 1") + ("perceptron-learning-rate", po::value(&perceptron_learning_rate)->default_value(0.01), "Perceptron learning rate") + ("print-feature-values", po::value(&print_feature_values)->default_value(false), "Print out feature values") + ("print-feature-counts", po::value(&printFeatureCounts)->default_value(false), "Print out feature values, print feature list with hope counts after 1st epoch") + ("print-nbest-with-features", po::value(&printNbestWithFeatures)->default_value(false), "Print out feature values, print feature list with hope counts after 1st epoch") + ("print-weights", po::value(&print_weights)->default_value(false), "Print out current weights") + ("print-core-weights", po::value(&print_core_weights)->default_value(true), "Print out current core weights") + ("prune-zero-weights", 
po::value(&pruneZeroWeights)->default_value(false), "Prune zero-valued sparse feature weights") + ("reference-files,r", po::value >(&referenceFiles), "Reference translation files for training") + ("reference-files-folds", po::value >(&referenceFilesFolds), "Reference translation files for training, one for each fold") + ("kbest", po::value(&kbest)->default_value(false), "Select hope/fear pairs from a list of nbest translations") + + ("scale-by-inverse-length", po::value(&scaleByInverseLength)->default_value(false), "Scale BLEU by (history of) inverse input length") + ("scale-by-input-length", po::value(&scaleByInputLength)->default_value(false), "Scale BLEU by (history of) input length") + ("scale-by-avg-input-length", po::value(&scaleByAvgInputLength)->default_value(false), "Scale BLEU by average input length") + ("scale-by-avg-inverse-length", po::value(&scaleByAvgInverseLength)->default_value(false), "Scale BLEU by average inverse input length") + ("scale-by-x", po::value(&scaleByX)->default_value(1), "Scale the BLEU score by value x") + ("scale-lm", po::value(&scale_lm)->default_value(false), "Scale the language model feature") + ("scale-factor-lm", po::value(&scale_lm_factor)->default_value(2), "Scale the language model feature by this factor") + ("scale-wp", po::value(&scale_wp)->default_value(false), "Scale the word penalty feature") + ("scale-factor-wp", po::value(&scale_wp_factor)->default_value(2), "Scale the word penalty feature by this factor") + ("scale-margin", po::value(&scale_margin)->default_value(0), "Scale the margin by the Bleu score of the oracle translation") + ("scale-margin-precision", po::value(&scale_margin_precision)->default_value(0), "Scale margin by precision of oracle") + ("scale-update", po::value(&scale_update)->default_value(0), "Scale update by Bleu score of oracle") + ("scale-update-precision", po::value(&scale_update_precision)->default_value(0), "Scale update by precision of oracle") + ("sentence-level-bleu", po::value(&sentenceBleu)->default_value(true), "Use a sentence-level Bleu scoring function") + ("shuffle", po::value(&shuffle)->default_value(false), "Shuffle input sentences before processing") + ("sigmoid-param", po::value(&sigmoidParam)->default_value(1), "y=sigmoidParam is the axis that this sigmoid approaches") + ("slack", po::value(&slack)->default_value(0.01), "Use slack in optimiser") + ("sparse-average", po::value(&sparseAverage)->default_value(false), "Average weights by the number of processes") + ("sparse-no-average", po::value(&sparseNoAverage)->default_value(false), "Don't average sparse weights, just sum") + ("stop-weights", po::value(&weightConvergence)->default_value(true), "Stop when weights converge") + ("verbosity,v", po::value(&verbosity)->default_value(0), "Verbosity level") + ("weight-dump-frequency", po::value(&weightDumpFrequency)->default_value(1), "How often per epoch to dump weights (mpi)") + ("weight-dump-stem", po::value(&weightDumpStem)->default_value("weights"), "Stem of filename to use for dumping weights"); po::options_description cmdline_options; cmdline_options.add(desc); @@ -257,7 +258,7 @@ int main(int argc, char** argv) { if (help) { std::cout << "Usage: " + string(argv[0]) - + " -f mosesini-file -i input-file -r reference-file(s) [options]" << std::endl; + + " -f mosesini-file -i input-file -r reference-file(s) [options]" << std::endl; std::cout << desc << std::endl; return 0; } @@ -296,8 +297,7 @@ int main(int argc, char** argv) { cerr << "Error: No reference files specified for training with folds" << 
endl; exit(1); } - } - else { + } else { if (mosesConfigFile.empty()) { cerr << "Error: No moses ini file specified" << endl; return 1; @@ -354,12 +354,11 @@ int main(int argc, char** argv) { } if (referenceSentences[myFold].size() != inputSentencesFolds[myFold].size()) { cerr << "Error: Input file length (" << inputSentencesFolds[myFold].size() << ") != (" - << referenceSentences[myFold].size() << ") reference file length (rank " << rank << ")" << endl; + << referenceSentences[myFold].size() << ") reference file length (rank " << rank << ")" << endl; exit(1); } VERBOSE(1, "Rank " << rank << " reading references from " << referenceFilesFolds[myFold] << endl); - } - else { + } else { if (!loadSentences(inputFile, inputSentences)) { cerr << "Error: Failed to load input sentences from " << inputFile << endl; return 1; @@ -367,15 +366,15 @@ int main(int argc, char** argv) { for (size_t i = 0; i < referenceFiles.size(); ++i) { if (!loadSentences(referenceFiles[i], referenceSentences[i])) { - cerr << "Error: Failed to load reference sentences from " - << referenceFiles[i] << endl; - return 1; + cerr << "Error: Failed to load reference sentences from " + << referenceFiles[i] << endl; + return 1; } if (referenceSentences[i].size() != inputSentences.size()) { - cerr << "Error: Input file length (" << inputSentences.size() << ") != (" - << referenceSentences[i].size() << ") length of reference file " << i - << endl; - return 1; + cerr << "Error: Input file length (" << inputSentences.size() << ") != (" + << referenceSentences[i].size() << ") length of reference file " << i + << endl; + return 1; } } } @@ -401,8 +400,7 @@ int main(int argc, char** argv) { if (trainWithMultipleFolds) { decoder_settings += " "; decoder_settings += referenceFilesFolds[myFold]; - } - else { + } else { for (size_t i=0; i < referenceFiles.size(); ++i) { decoder_settings += " "; decoder_settings += referenceFiles[i]; @@ -416,8 +414,8 @@ int main(int argc, char** argv) { VERBOSE(1, "Rank " << rank << " reading config file from " << configFile << endl); MosesDecoder* decoder = new MosesDecoder(configFile, verbosity, decoder_params.size(), decoder_params); decoder->setBleuParameters(disableBleuFeature, sentenceBleu, scaleByInputLength, scaleByAvgInputLength, - scaleByInverseLength, scaleByAvgInverseLength, - scaleByX, historySmoothing, bleu_smoothing_scheme, simpleHistoryBleu); + scaleByInverseLength, scaleByAvgInverseLength, + scaleByX, historySmoothing, bleu_smoothing_scheme, simpleHistoryBleu); SearchAlgorithm searchAlgorithm = staticData.GetSearchAlgorithm(); bool chartDecoding = (searchAlgorithm == ChartDecoding); @@ -427,11 +425,10 @@ int main(int argc, char** argv) { for (size_t i = 0; i < inputSentencesFolds[myFold].size(); ++i) { order.push_back(i); } - } - else { + } else { if (rank == 0) { for (size_t i = 0; i < inputSentences.size(); ++i) { - order.push_back(i); + order.push_back(i); } } } @@ -444,10 +441,10 @@ int main(int argc, char** argv) { cerr << "slack: " << slack << ", learning rate: " << mira_learning_rate << endl; cerr << "selective: " << selective << endl; if (normaliseMargin) - cerr << "sigmoid parameter: " << sigmoidParam << endl; + cerr << "sigmoid parameter: " << sigmoidParam << endl; } optimiser = new MiraOptimiser(slack, scale_margin, scale_margin_precision, - scale_update, scale_update_precision, boost, normaliseMargin, sigmoidParam); + scale_update, scale_update_precision, boost, normaliseMargin, sigmoidParam); learning_rate = mira_learning_rate; perceptron_update = false; } else if (learner == 
"perceptron") { @@ -466,30 +463,30 @@ int main(int argc, char** argv) { cerr << "Error: Unknown optimiser: " << learner << endl; return 1; } - + // resolve parameter dependencies if (batchSize > 1 && perceptron_update) { batchSize = 1; cerr << "Info: Setting batch size to 1 for perceptron update" << endl; } - + if (hope_n == -1) hope_n = n; if (fear_n == -1) fear_n = n; - + if (model_hope_fear || kbest) hope_fear = false; // is true by default if (learner == "mira" && !(hope_fear || model_hope_fear || kbest)) { cerr << "Error: Need to select one of parameters --hope-fear/--model-hope-fear/--kbest for mira update." << endl; return 1; } - + #ifdef MPI_ENABLE if (!trainWithMultipleFolds) mpi::broadcast(world, order, 0); #endif - + // Create shards according to the number of processes used vector shard; if (trainWithMultipleFolds) { @@ -505,8 +502,7 @@ int main(int argc, char** argv) { shard.resize(shardSize); copy(order.begin() + shardStart, order.begin() + shardEnd, shard.begin()); batchSize = 1; - } - else { + } else { size_t shardSize = order.size() / size; size_t shardStart = (size_t) (shardSize * rank); size_t shardEnd = (size_t) (shardSize * (rank + 1)); @@ -521,49 +517,49 @@ int main(int argc, char** argv) { if (batchEqualsShard) batchSize = shardSize; } - + // get reference to feature functions const vector &featureFunctions = FeatureFunction::GetFeatureFunctions(); ScoreComponentCollection initialWeights = decoder->getWeights(); - - if (add2lm != 0) { - const std::vector &statefulFFs = StatefulFeatureFunction::GetStatefulFeatureFunctions(); - for (size_t i = 0; i < statefulFFs.size(); ++i) { - const StatefulFeatureFunction *ff = statefulFFs[i]; - const LanguageModel *lm = dynamic_cast(ff); - if (lm) { - float lmWeight = initialWeights.GetScoreForProducer(lm) + add2lm; - initialWeights.Assign(lm, lmWeight); - cerr << "Rank " << rank << ", add " << add2lm << " to lm weight." << endl; - } - } + if (add2lm != 0) { + const std::vector &statefulFFs = StatefulFeatureFunction::GetStatefulFeatureFunctions(); + for (size_t i = 0; i < statefulFFs.size(); ++i) { + const StatefulFeatureFunction *ff = statefulFFs[i]; + const LanguageModel *lm = dynamic_cast(ff); + + if (lm) { + float lmWeight = initialWeights.GetScoreForProducer(lm) + add2lm; + initialWeights.Assign(lm, lmWeight); + cerr << "Rank " << rank << ", add " << add2lm << " to lm weight." 
<< endl; + } + } } - + if (normaliseWeights) { initialWeights.L1Normalise(); cerr << "Rank " << rank << ", normalised initial weights: " << initialWeights << endl; } decoder->setWeights(initialWeights); - + // set bleu weight to twice the size of the language model weight(s) if (bleu_weight_lm) { float lmSum = 0; - const std::vector &statefulFFs = StatefulFeatureFunction::GetStatefulFeatureFunctions(); - for (size_t i = 0; i < statefulFFs.size(); ++i) { - const StatefulFeatureFunction *ff = statefulFFs[i]; - const LanguageModel *lm = dynamic_cast(ff); + const std::vector &statefulFFs = StatefulFeatureFunction::GetStatefulFeatureFunctions(); + for (size_t i = 0; i < statefulFFs.size(); ++i) { + const StatefulFeatureFunction *ff = statefulFFs[i]; + const LanguageModel *lm = dynamic_cast(ff); - if (lm) { + if (lm) { lmSum += abs(initialWeights.GetScoreForProducer(lm)); - } - } + } + } bleuWeight = lmSum * bleu_weight_lm_factor; cerr << "Set bleu weight to lm weight * " << bleu_weight_lm_factor << endl; } - + if (bleuWeight_hope == -1) { bleuWeight_hope = bleuWeight; } @@ -573,35 +569,35 @@ int main(int argc, char** argv) { bleuWeight_fear *= bleu_weight_fear_factor; cerr << "Bleu weight: " << bleuWeight << endl; cerr << "Bleu weight fear: " << bleuWeight_fear << endl; - + if (decode_hope || decode_fear || decode_model) { size_t decode = 1; if (decode_fear) decode = 2; if (decode_model) decode = 3; decodeHopeOrFear(rank, size, decode, decode_filename, inputSentences, decoder, n, bleuWeight); } - + //Main loop: ScoreComponentCollection cumulativeWeights; // collect weights per epoch to produce an average ScoreComponentCollection cumulativeWeightsBinary; size_t numberOfUpdates = 0; size_t numberOfUpdatesThisEpoch = 0; - + time_t now; time(&now); cerr << "Rank " << rank << ", " << ctime(&now); - + float avgInputLength = 0; float sumOfInputs = 0; size_t numberOfInputs = 0; - + ScoreComponentCollection mixedWeights; ScoreComponentCollection mixedWeightsPrevious; ScoreComponentCollection mixedWeightsBeforePrevious; ScoreComponentCollection mixedAverageWeights; ScoreComponentCollection mixedAverageWeightsPrevious; ScoreComponentCollection mixedAverageWeightsBeforePrevious; - + bool stop = false; // int sumStillViolatedConstraints; float epsilon = 0.0001; @@ -610,66 +606,65 @@ int main(int argc, char** argv) { ScoreComponentCollection confidenceCounts, mixedConfidenceCounts, featureLearningRates; featureLearningRates.UpdateLearningRates(decay_core, decay_sparse, confidenceCounts, core_r0, sparse_r0); //initialise core learning rates cerr << "Initial learning rates, core: " << core_r0 << ", sparse: " << sparse_r0 << endl; - + for (size_t epoch = continue_epoch; epoch < epochs && !stop; ++epoch) { if (shuffle) { if (trainWithMultipleFolds || rank == 0) { - cerr << "Rank " << rank << ", epoch " << epoch << ", shuffling input sentences.." << endl; - RandomIndex rindex; - random_shuffle(order.begin(), order.end(), rindex); + cerr << "Rank " << rank << ", epoch " << epoch << ", shuffling input sentences.." 
<< endl; + RandomIndex rindex; + random_shuffle(order.begin(), order.end(), rindex); } - + #ifdef MPI_ENABLE if (!trainWithMultipleFolds) - mpi::broadcast(world, order, 0); + mpi::broadcast(world, order, 0); #endif - + // redo shards if (trainWithMultipleFolds) { - size_t shardSize = order.size()/coresPerFold; - size_t shardStart = (size_t) (shardSize * (rank % coresPerFold)); - size_t shardEnd = shardStart + shardSize; - if (rank % coresPerFold == coresPerFold - 1) { // last rank of each fold - shardEnd = order.size(); - shardSize = shardEnd - shardStart; - } - VERBOSE(1, "Rank: " << rank << ", shard size: " << shardSize << endl); - VERBOSE(1, "Rank: " << rank << ", shard start: " << shardStart << " shard end: " << shardEnd << endl); - shard.resize(shardSize); - copy(order.begin() + shardStart, order.begin() + shardEnd, shard.begin()); - batchSize = 1; - } - else { - size_t shardSize = order.size()/size; - size_t shardStart = (size_t) (shardSize * rank); - size_t shardEnd = (size_t) (shardSize * (rank + 1)); - if (rank == size - 1) { - shardEnd = order.size(); - shardSize = shardEnd - shardStart; - } - VERBOSE(1, "Shard size: " << shardSize << endl); - VERBOSE(1, "Rank: " << rank << " Shard start: " << shardStart << " Shard end: " << shardEnd << endl); - shard.resize(shardSize); - copy(order.begin() + shardStart, order.begin() + shardEnd, shard.begin()); - if (batchEqualsShard) - batchSize = shardSize; + size_t shardSize = order.size()/coresPerFold; + size_t shardStart = (size_t) (shardSize * (rank % coresPerFold)); + size_t shardEnd = shardStart + shardSize; + if (rank % coresPerFold == coresPerFold - 1) { // last rank of each fold + shardEnd = order.size(); + shardSize = shardEnd - shardStart; + } + VERBOSE(1, "Rank: " << rank << ", shard size: " << shardSize << endl); + VERBOSE(1, "Rank: " << rank << ", shard start: " << shardStart << " shard end: " << shardEnd << endl); + shard.resize(shardSize); + copy(order.begin() + shardStart, order.begin() + shardEnd, shard.begin()); + batchSize = 1; + } else { + size_t shardSize = order.size()/size; + size_t shardStart = (size_t) (shardSize * rank); + size_t shardEnd = (size_t) (shardSize * (rank + 1)); + if (rank == size - 1) { + shardEnd = order.size(); + shardSize = shardEnd - shardStart; + } + VERBOSE(1, "Shard size: " << shardSize << endl); + VERBOSE(1, "Rank: " << rank << " Shard start: " << shardStart << " Shard end: " << shardEnd << endl); + shard.resize(shardSize); + copy(order.begin() + shardStart, order.begin() + shardEnd, shard.begin()); + if (batchEqualsShard) + batchSize = shardSize; } } - + // sum of violated constraints in an epoch // sumStillViolatedConstraints = 0; - + numberOfUpdatesThisEpoch = 0; // Sum up weights over one epoch, final average uses weights from last epoch if (!accumulateWeights) { cumulativeWeights.ZeroAll(); cumulativeWeightsBinary.ZeroAll(); } - + // number of weight dumps this epoch size_t weightMixingThisEpoch = 0; size_t weightEpochDump = 0; - + size_t shardPosition = 0; vector::const_iterator sid = shard.begin(); while (sid != shard.end()) { @@ -677,7 +672,7 @@ int main(int argc, char** argv) { vector > featureValues; vector > bleuScores; vector > modelScores; - + // variables for hope-fear/perceptron setting vector > featureValuesHope; vector > featureValuesFear; @@ -685,15 +680,15 @@ int main(int argc, char** argv) { vector > bleuScoresFear; vector > modelScoresHope; vector > modelScoresFear; - + // get moses weights ScoreComponentCollection mosesWeights = decoder->getWeights(); VERBOSE(1, "\nRank " 
<< rank << ", epoch " << epoch << ", weights: " << mosesWeights << endl); - + if (historyBleu || simpleHistoryBleu) { - decoder->printBleuFeatureHistory(cerr); + decoder->printBleuFeatureHistory(cerr); } - + // BATCHING: produce nbest lists for all input sentences in batch vector oracleBleuScores; vector oracleModelScores; @@ -702,926 +697,903 @@ int main(int argc, char** argv) { vector inputLengths; vector ref_ids; size_t actualBatchSize = 0; - + vector::const_iterator current_sid_start = sid; size_t examples_in_batch = 0; bool skip_example = false; for (size_t batchPosition = 0; batchPosition < batchSize && sid - != shard.end(); ++batchPosition) { - string input; - if (trainWithMultipleFolds) - input = inputSentencesFolds[myFold][*sid]; - else - input = inputSentences[*sid]; - - Moses::Sentence *sentence = new Sentence(); - stringstream in(input + "\n"); - const vector inputFactorOrder = staticData.GetInputFactorOrder(); - sentence->Read(in,inputFactorOrder); - cerr << "\nRank " << rank << ", epoch " << epoch << ", input sentence " << *sid << ": \""; - sentence->Print(cerr); - cerr << "\"" << " (batch pos " << batchPosition << ")" << endl; - size_t current_input_length = (*sentence).GetSize(); - - if (epoch == 0 && (scaleByAvgInputLength || scaleByAvgInverseLength)) { - sumOfInputs += current_input_length; - ++numberOfInputs; - avgInputLength = sumOfInputs/numberOfInputs; - decoder->setAvgInputLength(avgInputLength); - cerr << "Rank " << rank << ", epoch 0, average input length: " << avgInputLength << endl; - } - - vector newFeatureValues; - vector newScores; - if (model_hope_fear) { - featureValues.push_back(newFeatureValues); - bleuScores.push_back(newScores); - modelScores.push_back(newScores); - } - if (hope_fear || perceptron_update) { - featureValuesHope.push_back(newFeatureValues); - featureValuesFear.push_back(newFeatureValues); - bleuScoresHope.push_back(newScores); - bleuScoresFear.push_back(newScores); - modelScoresHope.push_back(newScores); - modelScoresFear.push_back(newScores); - if (historyBleu || simpleHistoryBleu || debug_model) { - featureValues.push_back(newFeatureValues); - bleuScores.push_back(newScores); - modelScores.push_back(newScores); - } - } - if (kbest) { - // for decoding - featureValues.push_back(newFeatureValues); - bleuScores.push_back(newScores); - modelScores.push_back(newScores); - - // for storing selected examples - featureValuesHope.push_back(newFeatureValues); - featureValuesFear.push_back(newFeatureValues); - bleuScoresHope.push_back(newScores); - bleuScoresFear.push_back(newScores); - modelScoresHope.push_back(newScores); - modelScoresFear.push_back(newScores); - } - - size_t ref_length; - float avg_ref_length; - - if (print_weights) - cerr << "Rank " << rank << ", epoch " << epoch << ", current weights: " << mosesWeights << endl; - if (print_core_weights) { - cerr << "Rank " << rank << ", epoch " << epoch << ", current weights: "; - mosesWeights.PrintCoreFeatures(); - cerr << endl; - } - - // check LM weight - const std::vector &statefulFFs = StatefulFeatureFunction::GetStatefulFeatureFunctions(); - for (size_t i = 0; i < statefulFFs.size(); ++i) { - const StatefulFeatureFunction *ff = statefulFFs[i]; - const LanguageModel *lm = dynamic_cast(ff); + != shard.end(); ++batchPosition) { + string input; + if (trainWithMultipleFolds) + input = inputSentencesFolds[myFold][*sid]; + else + input = inputSentences[*sid]; - if (lm) { - float lmWeight = mosesWeights.GetScoreForProducer(lm); - cerr << "Rank " << rank << ", epoch " << epoch << ", lm weight: " 
<< lmWeight << endl; - if (lmWeight <= 0) { - cerr << "Rank " << rank << ", epoch " << epoch << ", ERROR: language model weight should never be <= 0." << endl; - mosesWeights.Assign(lm, 0.1); - cerr << "Rank " << rank << ", epoch " << epoch << ", assign lm weights of 0.1" << endl; - } - } - } - - // select inference scheme - cerr << "Rank " << rank << ", epoch " << epoch << ", real Bleu? " << realBleu << endl; - if (hope_fear || perceptron_update) { - // HOPE - cerr << "Rank " << rank << ", epoch " << epoch << ", " << hope_n << - "best hope translations" << endl; - vector< vector > outputHope = decoder->getNBest(input, *sid, hope_n, 1.0, bleuWeight_hope, - featureValuesHope[batchPosition], bleuScoresHope[batchPosition], modelScoresHope[batchPosition], - 1, realBleu, distinctNbest, avgRefLength, rank, epoch, ""); - vector oracle = outputHope[0]; - decoder->cleanup(chartDecoding); - ref_length = decoder->getClosestReferenceLength(*sid, oracle.size()); - avg_ref_length = ref_length; - float hope_length_ratio = (float)oracle.size()/ref_length; - int oracleSize = (int)oracle.size(); - cerr << endl; - - // count sparse features occurring in hope translation - featureValuesHope[batchPosition][0].IncrementSparseHopeFeatures(); - - float precision = bleuScoresHope[batchPosition][0]; - if (historyBleu || simpleHistoryBleu) { - precision /= decoder->getTargetLengthHistory(); - } - else { - if (scaleByAvgInputLength) precision /= decoder->getAverageInputLength(); - else if (scaleByAvgInverseLength) precision /= (100/decoder->getAverageInputLength()); - precision /= scaleByX; - } - if (scale_margin_precision || scale_update_precision) { - if (historyBleu || simpleHistoryBleu || scaleByAvgInputLength || scaleByAvgInverseLength) { - cerr << "Rank " << rank << ", epoch " << epoch << ", set hope precision: " << precision << endl; - ((MiraOptimiser*) optimiser)->setPrecision(precision); - } - } - - vector bestModel; - if (debug_model || historyBleu || simpleHistoryBleu) { - // MODEL (for updating the history only, using dummy vectors) - cerr << "Rank " << rank << ", epoch " << epoch << ", 1best wrt model score (debug or history)" << endl; - vector< vector > outputModel = decoder->getNBest(input, *sid, n, 0.0, bleuWeight, - featureValues[batchPosition], bleuScores[batchPosition], modelScores[batchPosition], - 1, realBleu, distinctNbest, avgRefLength, rank, epoch, ""); - bestModel = outputModel[0]; - decoder->cleanup(chartDecoding); - cerr << endl; - ref_length = decoder->getClosestReferenceLength(*sid, bestModel.size()); - } - - // FEAR - float fear_length_ratio = 0; - float bleuRatioHopeFear = 0; - int fearSize = 0; - cerr << "Rank " << rank << ", epoch " << epoch << ", " << fear_n << "best fear translations" << endl; - vector< vector > outputFear = decoder->getNBest(input, *sid, fear_n, -1.0, bleuWeight_fear, - featureValuesFear[batchPosition], bleuScoresFear[batchPosition], modelScoresFear[batchPosition], - 1, realBleu, distinctNbest, avgRefLength, rank, epoch, ""); - vector fear = outputFear[0]; - decoder->cleanup(chartDecoding); - ref_length = decoder->getClosestReferenceLength(*sid, fear.size()); - avg_ref_length += ref_length; - avg_ref_length /= 2; - fear_length_ratio = (float)fear.size()/ref_length; - fearSize = (int)fear.size(); - cerr << endl; - for (size_t i = 0; i < fear.size(); ++i) - delete fear[i]; - - // count sparse features occurring in fear translation - featureValuesFear[batchPosition][0].IncrementSparseFearFeatures(); - - // Bleu-related example selection - bool skip = false; - 
bleuRatioHopeFear = bleuScoresHope[batchPosition][0] / bleuScoresFear[batchPosition][0]; - if (minBleuRatio != -1 && bleuRatioHopeFear < minBleuRatio) - skip = true; - if(maxBleuRatio != -1 && bleuRatioHopeFear > maxBleuRatio) - skip = true; - - // sanity check - if (historyBleu || simpleHistoryBleu) { - if (bleuScores[batchPosition][0] > bleuScoresHope[batchPosition][0] && - modelScores[batchPosition][0] > modelScoresHope[batchPosition][0]) { - if (abs(bleuScores[batchPosition][0] - bleuScoresHope[batchPosition][0]) > epsilon && - abs(modelScores[batchPosition][0] - modelScoresHope[batchPosition][0]) > epsilon) { - cerr << "Rank " << rank << ", epoch " << epoch << ", ERROR: MODEL translation better than HOPE translation." << endl; - skip = true; - } - } - if (bleuScoresFear[batchPosition][0] > bleuScores[batchPosition][0] && - modelScoresFear[batchPosition][0] > modelScores[batchPosition][0]) { - if (abs(bleuScoresFear[batchPosition][0] - bleuScores[batchPosition][0]) > epsilon && - abs(modelScoresFear[batchPosition][0] - modelScores[batchPosition][0]) > epsilon) { - cerr << "Rank " << rank << ", epoch " << epoch << ", ERROR: FEAR translation better than MODEL translation." << endl; - skip = true; - } - } - } - if (bleuScoresFear[batchPosition][0] > bleuScoresHope[batchPosition][0]) { - if (abs(bleuScoresFear[batchPosition][0] - bleuScoresHope[batchPosition][0]) > epsilon) { - // check if it's an error or a warning - skip = true; - if (modelScoresFear[batchPosition][0] > modelScoresHope[batchPosition][0] && abs(modelScoresFear[batchPosition][0] - modelScoresHope[batchPosition][0]) > epsilon) { - cerr << "Rank " << rank << ", epoch " << epoch << ", ERROR: FEAR translation better than HOPE translation. (abs-diff: " << abs(bleuScoresFear[batchPosition][0] - bleuScoresHope[batchPosition][0]) << ")" <getNBest(input, *sid, n, 1.0, bleuWeight_hope, - featureValues[batchPosition], bleuScores[batchPosition], modelScores[batchPosition], - 0, realBleu, distinctNbest, avgRefLength, rank, epoch, ""); - //vector oracle = outputHope[0]; - // needed for history - inputLengths.push_back(current_input_length); - ref_ids.push_back(*sid); - decoder->cleanup(chartDecoding); - //ref_length = decoder->getClosestReferenceLength(*sid, oracle.size()); - //float hope_length_ratio = (float)oracle.size()/ref_length; - cerr << endl; - - oracleFeatureValues.push_back(featureValues[batchPosition][oraclePos]); - oracleBleuScores.push_back(bleuScores[batchPosition][oraclePos]); - oracleModelScores.push_back(modelScores[batchPosition][oraclePos]); - - // MODEL - cerr << "Rank " << rank << ", epoch " << epoch << ", " << n << "best wrt model score" << endl; - if (historyBleu || simpleHistoryBleu) { - vector< vector > outputModel = decoder->getNBest(input, *sid, n, 0.0, - bleuWeight, featureValues[batchPosition], bleuScores[batchPosition], - modelScores[batchPosition], 1, realBleu, distinctNbest, avgRefLength, rank, epoch, ""); - vector bestModel = outputModel[0]; - oneBests.push_back(bestModel); - inputLengths.push_back(current_input_length); - ref_ids.push_back(*sid); - } - else { - decoder->getNBest(input, *sid, n, 0.0, bleuWeight, - featureValues[batchPosition], bleuScores[batchPosition], modelScores[batchPosition], - 0, realBleu, distinctNbest, avgRefLength, rank, epoch, ""); - } - decoder->cleanup(chartDecoding); - //ref_length = decoder->getClosestReferenceLength(*sid, bestModel.size()); - //float model_length_ratio = (float)bestModel.size()/ref_length; - cerr << endl; - - // FEAR - cerr << "Rank " << rank << ", epoch 
" << epoch << ", " << n << "best fear translations" << endl; - decoder->getNBest(input, *sid, n, -1.0, bleuWeight_fear, - featureValues[batchPosition], bleuScores[batchPosition], modelScores[batchPosition], - 0, realBleu, distinctNbest, avgRefLength, rank, epoch, ""); - decoder->cleanup(chartDecoding); - //ref_length = decoder->getClosestReferenceLength(*sid, fear.size()); - //float fear_length_ratio = (float)fear.size()/ref_length; - - examples_in_batch++; - } - if (kbest) { - // MODEL - cerr << "Rank " << rank << ", epoch " << epoch << ", " << n << "best wrt model score" << endl; - if (historyBleu || simpleHistoryBleu) { - vector< vector > outputModel = decoder->getNBest(input, *sid, n, 0.0, - bleuWeight, featureValues[batchPosition], bleuScores[batchPosition], - modelScores[batchPosition], 1, realBleu, distinctNbest, avgRefLength, rank, epoch, ""); - vector bestModel = outputModel[0]; - oneBests.push_back(bestModel); - inputLengths.push_back(current_input_length); - ref_ids.push_back(*sid); - } - else { - decoder->getNBest(input, *sid, n, 0.0, bleuWeight, - featureValues[batchPosition], bleuScores[batchPosition], - modelScores[batchPosition], 0, realBleu, distinctNbest, avgRefLength, rank, epoch, ""); - } - decoder->cleanup(chartDecoding); - //ref_length = decoder->getClosestReferenceLength(*sid, bestModel.size()); - //float model_length_ratio = (float)bestModel.size()/ref_length; - cerr << endl; + Moses::Sentence *sentence = new Sentence(); + stringstream in(input + "\n"); + const vector inputFactorOrder = staticData.GetInputFactorOrder(); + sentence->Read(in,inputFactorOrder); + cerr << "\nRank " << rank << ", epoch " << epoch << ", input sentence " << *sid << ": \""; + sentence->Print(cerr); + cerr << "\"" << " (batch pos " << batchPosition << ")" << endl; + size_t current_input_length = (*sentence).GetSize(); - examples_in_batch++; - - HypothesisQueue queueHope(hope_n); - HypothesisQueue queueFear(fear_n); - cerr << endl; - if (most_violated || all_violated || one_against_all) { - float bleuHope = -1000; - float bleuFear = 1000; - size_t indexHope = -1; - size_t indexFear = -1; - - vector bleuHopeList; - vector bleuFearList; - vector indexHopeList; - vector indexFearList; - - if (most_violated) - cerr << "Rank " << rank << ", epoch " << epoch << ", pick pair with most violated constraint" << endl; - else if (all_violated) - cerr << "Rank " << rank << ", epoch " << epoch << ", pick all pairs with violated constraints"; - else - cerr << "Rank " << rank << ", epoch " << epoch << ", pick all pairs with hope"; - - // find best hope, then find fear that violates our constraint most - for (size_t i=0; i modelScores[batchPosition][indexHope]) { - if (abs(modelScores[batchPosition][i] - modelScores[batchPosition][indexHope]) > epsilon) { - // better model score - bleuHope = bleuScores[batchPosition][i]; - indexHope = i; - } - } - } - else if (bleuScores[batchPosition][i] > bleuHope) { // better than current best - bleuHope = bleuScores[batchPosition][i]; - indexHope = i; - } - } - - float currentViolation = 0; - float minimum_bleu_diff = 0.01; - for (size_t i=0; i epsilon) { - if (one_against_all && bleuDiff > minimum_bleu_diff) { - cerr << ".. adding pair"; - bleuHopeList.push_back(bleuHope); - bleuFearList.push_back(bleuScores[batchPosition][i]); - indexHopeList.push_back(indexHope); - indexFearList.push_back(i); - } - else if (modelDiff < bleuDiff) { - float diff = bleuDiff - modelDiff; - if (diff > epsilon) { - if (all_violated) { - cerr << ".. 
adding pair"; - bleuHopeList.push_back(bleuHope); - bleuFearList.push_back(bleuScores[batchPosition][i]); - indexHopeList.push_back(indexHope); - indexFearList.push_back(i); - } - else if (most_violated && diff > currentViolation) { - currentViolation = diff; - bleuFear = bleuScores[batchPosition][i]; - indexFear = i; - cerr << "Rank " << rank << ", epoch " << epoch << ", current violation: " << currentViolation << " (" << modelDiff << " >= " << bleuDiff << ")" << endl; - } - } - } - } - } - - if (most_violated) { - if (currentViolation > 0) { - cerr << "Rank " << rank << ", epoch " << epoch << ", adding pair with violation " << currentViolation << endl; - cerr << "Rank " << rank << ", epoch " << epoch << ", hope: " << bleuHope << " (" << indexHope << "), fear: " << bleuFear << " (" << indexFear << ")" << endl; - bleuScoresHope[batchPosition].push_back(bleuHope); - bleuScoresFear[batchPosition].push_back(bleuFear); - featureValuesHope[batchPosition].push_back(featureValues[batchPosition][indexHope]); - featureValuesFear[batchPosition].push_back(featureValues[batchPosition][indexFear]); - float modelScoreHope = modelScores[batchPosition][indexHope]; - float modelScoreFear = modelScores[batchPosition][indexFear]; - if (most_violated_reg) { - // reduce model score difference by factor ~0.5 - float reg = currentViolation/4; - modelScoreHope += abs(reg); - modelScoreFear -= abs(reg); - float newViolation = (bleuHope - bleuFear) - (modelScoreHope - modelScoreFear); - cerr << "Rank " << rank << ", epoch " << epoch << ", regularized violation: " << newViolation << endl; - } - modelScoresHope[batchPosition].push_back(modelScoreHope); - modelScoresFear[batchPosition].push_back(modelScoreFear); - - featureValues[batchPosition][indexHope].IncrementSparseHopeFeatures(); - featureValues[batchPosition][indexFear].IncrementSparseFearFeatures(); - } - else { - cerr << "Rank " << rank << ", epoch " << epoch << ", no violated constraint found." 
<< endl; - skip_example = 1; - } - } - else cerr << endl; - } - if (max_bleu_diff) { - cerr << "Rank " << rank << ", epoch " << epoch << ", pick pair with max Bleu diff from list: " << bleuScores[batchPosition].size() << endl; - for (size_t i=0; isetAvgInputLength(avgInputLength); + cerr << "Rank " << rank << ", epoch 0, average input length: " << avgInputLength << endl; + } - vector hopeList, fearList; - for (size_t i=0; i newFeatureValues; + vector newScores; + if (model_hope_fear) { + featureValues.push_back(newFeatureValues); + bleuScores.push_back(newScores); + modelScores.push_back(newScores); + } + if (hope_fear || perceptron_update) { + featureValuesHope.push_back(newFeatureValues); + featureValuesFear.push_back(newFeatureValues); + bleuScoresHope.push_back(newScores); + bleuScoresFear.push_back(newScores); + modelScoresHope.push_back(newScores); + modelScoresFear.push_back(newScores); + if (historyBleu || simpleHistoryBleu || debug_model) { + featureValues.push_back(newFeatureValues); + bleuScores.push_back(newScores); + modelScores.push_back(newScores); + } + } + if (kbest) { + // for decoding + featureValues.push_back(newFeatureValues); + bleuScores.push_back(newScores); + modelScores.push_back(newScores); + + // for storing selected examples + featureValuesHope.push_back(newFeatureValues); + featureValuesFear.push_back(newFeatureValues); + bleuScoresHope.push_back(newScores); + bleuScoresFear.push_back(newScores); + modelScoresHope.push_back(newScores); + modelScoresFear.push_back(newScores); + } + + size_t ref_length; + float avg_ref_length; + + if (print_weights) + cerr << "Rank " << rank << ", epoch " << epoch << ", current weights: " << mosesWeights << endl; + if (print_core_weights) { + cerr << "Rank " << rank << ", epoch " << epoch << ", current weights: "; + mosesWeights.PrintCoreFeatures(); + cerr << endl; + } + + // check LM weight + const std::vector &statefulFFs = StatefulFeatureFunction::GetStatefulFeatureFunctions(); + for (size_t i = 0; i < statefulFFs.size(); ++i) { + const StatefulFeatureFunction *ff = statefulFFs[i]; + const LanguageModel *lm = dynamic_cast(ff); + + if (lm) { + float lmWeight = mosesWeights.GetScoreForProducer(lm); + cerr << "Rank " << rank << ", epoch " << epoch << ", lm weight: " << lmWeight << endl; + if (lmWeight <= 0) { + cerr << "Rank " << rank << ", epoch " << epoch << ", ERROR: language model weight should never be <= 0." << endl; + mosesWeights.Assign(lm, 0.1); + cerr << "Rank " << rank << ", epoch " << epoch << ", assign lm weights of 0.1" << endl; + } + } + } + + // select inference scheme + cerr << "Rank " << rank << ", epoch " << epoch << ", real Bleu? 
" << realBleu << endl; + if (hope_fear || perceptron_update) { + // HOPE + cerr << "Rank " << rank << ", epoch " << epoch << ", " << hope_n << + "best hope translations" << endl; + vector< vector > outputHope = decoder->getNBest(input, *sid, hope_n, 1.0, bleuWeight_hope, + featureValuesHope[batchPosition], bleuScoresHope[batchPosition], modelScoresHope[batchPosition], + 1, realBleu, distinctNbest, avgRefLength, rank, epoch, ""); + vector oracle = outputHope[0]; + decoder->cleanup(chartDecoding); + ref_length = decoder->getClosestReferenceLength(*sid, oracle.size()); + avg_ref_length = ref_length; + float hope_length_ratio = (float)oracle.size()/ref_length; + int oracleSize = (int)oracle.size(); + cerr << endl; + + // count sparse features occurring in hope translation + featureValuesHope[batchPosition][0].IncrementSparseHopeFeatures(); + + float precision = bleuScoresHope[batchPosition][0]; + if (historyBleu || simpleHistoryBleu) { + precision /= decoder->getTargetLengthHistory(); + } else { + if (scaleByAvgInputLength) precision /= decoder->getAverageInputLength(); + else if (scaleByAvgInverseLength) precision /= (100/decoder->getAverageInputLength()); + precision /= scaleByX; + } + if (scale_margin_precision || scale_update_precision) { + if (historyBleu || simpleHistoryBleu || scaleByAvgInputLength || scaleByAvgInverseLength) { + cerr << "Rank " << rank << ", epoch " << epoch << ", set hope precision: " << precision << endl; + ((MiraOptimiser*) optimiser)->setPrecision(precision); + } + } + + vector bestModel; + if (debug_model || historyBleu || simpleHistoryBleu) { + // MODEL (for updating the history only, using dummy vectors) + cerr << "Rank " << rank << ", epoch " << epoch << ", 1best wrt model score (debug or history)" << endl; + vector< vector > outputModel = decoder->getNBest(input, *sid, n, 0.0, bleuWeight, + featureValues[batchPosition], bleuScores[batchPosition], modelScores[batchPosition], + 1, realBleu, distinctNbest, avgRefLength, rank, epoch, ""); + bestModel = outputModel[0]; + decoder->cleanup(chartDecoding); + cerr << endl; + ref_length = decoder->getClosestReferenceLength(*sid, bestModel.size()); + } + + // FEAR + float fear_length_ratio = 0; + float bleuRatioHopeFear = 0; + int fearSize = 0; + cerr << "Rank " << rank << ", epoch " << epoch << ", " << fear_n << "best fear translations" << endl; + vector< vector > outputFear = decoder->getNBest(input, *sid, fear_n, -1.0, bleuWeight_fear, + featureValuesFear[batchPosition], bleuScoresFear[batchPosition], modelScoresFear[batchPosition], + 1, realBleu, distinctNbest, avgRefLength, rank, epoch, ""); + vector fear = outputFear[0]; + decoder->cleanup(chartDecoding); + ref_length = decoder->getClosestReferenceLength(*sid, fear.size()); + avg_ref_length += ref_length; + avg_ref_length /= 2; + fear_length_ratio = (float)fear.size()/ref_length; + fearSize = (int)fear.size(); + cerr << endl; + for (size_t i = 0; i < fear.size(); ++i) + delete fear[i]; + + // count sparse features occurring in fear translation + featureValuesFear[batchPosition][0].IncrementSparseFearFeatures(); + + // Bleu-related example selection + bool skip = false; + bleuRatioHopeFear = bleuScoresHope[batchPosition][0] / bleuScoresFear[batchPosition][0]; + if (minBleuRatio != -1 && bleuRatioHopeFear < minBleuRatio) + skip = true; + if(maxBleuRatio != -1 && bleuRatioHopeFear > maxBleuRatio) + skip = true; + + // sanity check + if (historyBleu || simpleHistoryBleu) { + if (bleuScores[batchPosition][0] > bleuScoresHope[batchPosition][0] && + 
modelScores[batchPosition][0] > modelScoresHope[batchPosition][0]) { + if (abs(bleuScores[batchPosition][0] - bleuScoresHope[batchPosition][0]) > epsilon && + abs(modelScores[batchPosition][0] - modelScoresHope[batchPosition][0]) > epsilon) { + cerr << "Rank " << rank << ", epoch " << epoch << ", ERROR: MODEL translation better than HOPE translation." << endl; + skip = true; + } + } + if (bleuScoresFear[batchPosition][0] > bleuScores[batchPosition][0] && + modelScoresFear[batchPosition][0] > modelScores[batchPosition][0]) { + if (abs(bleuScoresFear[batchPosition][0] - bleuScores[batchPosition][0]) > epsilon && + abs(modelScoresFear[batchPosition][0] - modelScores[batchPosition][0]) > epsilon) { + cerr << "Rank " << rank << ", epoch " << epoch << ", ERROR: FEAR translation better than MODEL translation." << endl; + skip = true; + } + } + } + if (bleuScoresFear[batchPosition][0] > bleuScoresHope[batchPosition][0]) { + if (abs(bleuScoresFear[batchPosition][0] - bleuScoresHope[batchPosition][0]) > epsilon) { + // check if it's an error or a warning + skip = true; + if (modelScoresFear[batchPosition][0] > modelScoresHope[batchPosition][0] && abs(modelScoresFear[batchPosition][0] - modelScoresHope[batchPosition][0]) > epsilon) { + cerr << "Rank " << rank << ", epoch " << epoch << ", ERROR: FEAR translation better than HOPE translation. (abs-diff: " << abs(bleuScoresFear[batchPosition][0] - bleuScoresHope[batchPosition][0]) << ")" <getNBest(input, *sid, n, 1.0, bleuWeight_hope, + featureValues[batchPosition], bleuScores[batchPosition], modelScores[batchPosition], + 0, realBleu, distinctNbest, avgRefLength, rank, epoch, ""); + //vector oracle = outputHope[0]; + // needed for history + inputLengths.push_back(current_input_length); + ref_ids.push_back(*sid); + decoder->cleanup(chartDecoding); + //ref_length = decoder->getClosestReferenceLength(*sid, oracle.size()); + //float hope_length_ratio = (float)oracle.size()/ref_length; + cerr << endl; + + oracleFeatureValues.push_back(featureValues[batchPosition][oraclePos]); + oracleBleuScores.push_back(bleuScores[batchPosition][oraclePos]); + oracleModelScores.push_back(modelScores[batchPosition][oraclePos]); + + // MODEL + cerr << "Rank " << rank << ", epoch " << epoch << ", " << n << "best wrt model score" << endl; + if (historyBleu || simpleHistoryBleu) { + vector< vector > outputModel = decoder->getNBest(input, *sid, n, 0.0, + bleuWeight, featureValues[batchPosition], bleuScores[batchPosition], + modelScores[batchPosition], 1, realBleu, distinctNbest, avgRefLength, rank, epoch, ""); + vector bestModel = outputModel[0]; + oneBests.push_back(bestModel); + inputLengths.push_back(current_input_length); + ref_ids.push_back(*sid); + } else { + decoder->getNBest(input, *sid, n, 0.0, bleuWeight, + featureValues[batchPosition], bleuScores[batchPosition], modelScores[batchPosition], + 0, realBleu, distinctNbest, avgRefLength, rank, epoch, ""); + } + decoder->cleanup(chartDecoding); + //ref_length = decoder->getClosestReferenceLength(*sid, bestModel.size()); + //float model_length_ratio = (float)bestModel.size()/ref_length; + cerr << endl; + + // FEAR + cerr << "Rank " << rank << ", epoch " << epoch << ", " << n << "best fear translations" << endl; + decoder->getNBest(input, *sid, n, -1.0, bleuWeight_fear, + featureValues[batchPosition], bleuScores[batchPosition], modelScores[batchPosition], + 0, realBleu, distinctNbest, avgRefLength, rank, epoch, ""); + decoder->cleanup(chartDecoding); + //ref_length = decoder->getClosestReferenceLength(*sid, fear.size()); + 
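The kbest branch that follows selects training pairs from a single n-best list by constraint violation: hope is the entry with the best sentence BLEU (ties broken by model score), and fear is the candidate for which the margin constraint modelDiff >= bleuDiff is broken the most. A condensed sketch of that most_violated rule, assuming a hypothetical Entry record in place of the parallel bleuScores/modelScores vectors:

#include <cstddef>
#include <vector>

// Hypothetical flattened n-best entry standing in for
// bleuScores[batchPosition][i] / modelScores[batchPosition][i].
struct Entry {
  float model;
  float bleu;
};

// Returns the index of the fear hypothesis maximising the violation
// (bleuDiff - modelDiff), or -1 when no constraint is violated; in that
// case the example is skipped, as with skip_example in the code below.
int MostViolated(const std::vector<Entry> &nbest, std::size_t hope, float epsilon)
{
  float currentViolation = 0;
  int indexFear = -1;
  for (std::size_t i = 0; i < nbest.size(); ++i) {
    if (i == hope) continue;
    float modelDiff = nbest[hope].model - nbest[i].model;
    float bleuDiff = nbest[hope].bleu - nbest[i].bleu;
    if (bleuDiff <= epsilon) continue;        // require a real BLEU margin
    float violation = bleuDiff - modelDiff;   // > 0: margin constraint broken
    if (violation > epsilon && violation > currentViolation) {
      currentViolation = violation;
      indexFear = (int)i;
    }
  }
  return indexFear;
}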
//float fear_length_ratio = (float)fear.size()/ref_length; + + examples_in_batch++; + } + if (kbest) { + // MODEL + cerr << "Rank " << rank << ", epoch " << epoch << ", " << n << "best wrt model score" << endl; + if (historyBleu || simpleHistoryBleu) { + vector< vector > outputModel = decoder->getNBest(input, *sid, n, 0.0, + bleuWeight, featureValues[batchPosition], bleuScores[batchPosition], + modelScores[batchPosition], 1, realBleu, distinctNbest, avgRefLength, rank, epoch, ""); + vector bestModel = outputModel[0]; + oneBests.push_back(bestModel); + inputLengths.push_back(current_input_length); + ref_ids.push_back(*sid); + } else { + decoder->getNBest(input, *sid, n, 0.0, bleuWeight, + featureValues[batchPosition], bleuScores[batchPosition], + modelScores[batchPosition], 0, realBleu, distinctNbest, avgRefLength, rank, epoch, ""); + } + decoder->cleanup(chartDecoding); + //ref_length = decoder->getClosestReferenceLength(*sid, bestModel.size()); + //float model_length_ratio = (float)bestModel.size()/ref_length; + cerr << endl; + + examples_in_batch++; + + HypothesisQueue queueHope(hope_n); + HypothesisQueue queueFear(fear_n); + cerr << endl; + if (most_violated || all_violated || one_against_all) { + float bleuHope = -1000; + float bleuFear = 1000; + size_t indexHope = -1; + size_t indexFear = -1; + + vector bleuHopeList; + vector bleuFearList; + vector indexHopeList; + vector indexFearList; + + if (most_violated) + cerr << "Rank " << rank << ", epoch " << epoch << ", pick pair with most violated constraint" << endl; + else if (all_violated) + cerr << "Rank " << rank << ", epoch " << epoch << ", pick all pairs with violated constraints"; + else + cerr << "Rank " << rank << ", epoch " << epoch << ", pick all pairs with hope"; + + // find best hope, then find fear that violates our constraint most + for (size_t i=0; i modelScores[batchPosition][indexHope]) { + if (abs(modelScores[batchPosition][i] - modelScores[batchPosition][indexHope]) > epsilon) { + // better model score + bleuHope = bleuScores[batchPosition][i]; + indexHope = i; + } + } + } else if (bleuScores[batchPosition][i] > bleuHope) { // better than current best + bleuHope = bleuScores[batchPosition][i]; + indexHope = i; + } + } + + float currentViolation = 0; + float minimum_bleu_diff = 0.01; + for (size_t i=0; i epsilon) { + if (one_against_all && bleuDiff > minimum_bleu_diff) { + cerr << ".. adding pair"; + bleuHopeList.push_back(bleuHope); + bleuFearList.push_back(bleuScores[batchPosition][i]); + indexHopeList.push_back(indexHope); + indexFearList.push_back(i); + } else if (modelDiff < bleuDiff) { + float diff = bleuDiff - modelDiff; + if (diff > epsilon) { + if (all_violated) { + cerr << ".. 
adding pair"; + bleuHopeList.push_back(bleuHope); + bleuFearList.push_back(bleuScores[batchPosition][i]); + indexHopeList.push_back(indexHope); + indexFearList.push_back(i); + } else if (most_violated && diff > currentViolation) { + currentViolation = diff; + bleuFear = bleuScores[batchPosition][i]; + indexFear = i; + cerr << "Rank " << rank << ", epoch " << epoch << ", current violation: " << currentViolation << " (" << modelDiff << " >= " << bleuDiff << ")" << endl; + } + } + } + } + } + + if (most_violated) { + if (currentViolation > 0) { + cerr << "Rank " << rank << ", epoch " << epoch << ", adding pair with violation " << currentViolation << endl; + cerr << "Rank " << rank << ", epoch " << epoch << ", hope: " << bleuHope << " (" << indexHope << "), fear: " << bleuFear << " (" << indexFear << ")" << endl; + bleuScoresHope[batchPosition].push_back(bleuHope); + bleuScoresFear[batchPosition].push_back(bleuFear); + featureValuesHope[batchPosition].push_back(featureValues[batchPosition][indexHope]); + featureValuesFear[batchPosition].push_back(featureValues[batchPosition][indexFear]); + float modelScoreHope = modelScores[batchPosition][indexHope]; + float modelScoreFear = modelScores[batchPosition][indexFear]; + if (most_violated_reg) { + // reduce model score difference by factor ~0.5 + float reg = currentViolation/4; + modelScoreHope += abs(reg); + modelScoreFear -= abs(reg); + float newViolation = (bleuHope - bleuFear) - (modelScoreHope - modelScoreFear); + cerr << "Rank " << rank << ", epoch " << epoch << ", regularized violation: " << newViolation << endl; + } + modelScoresHope[batchPosition].push_back(modelScoreHope); + modelScoresFear[batchPosition].push_back(modelScoreFear); + + featureValues[batchPosition][indexHope].IncrementSparseHopeFeatures(); + featureValues[batchPosition][indexFear].IncrementSparseFearFeatures(); + } else { + cerr << "Rank " << rank << ", epoch " << epoch << ", no violated constraint found." 
<< endl; + skip_example = 1; + } + } else cerr << endl; + } + if (max_bleu_diff) { + cerr << "Rank " << rank << ", epoch " << epoch << ", pick pair with max Bleu diff from list: " << bleuScores[batchPosition].size() << endl; + for (size_t i=0; i hopeList, fearList; + for (size_t i=0; i > losses(actualBatchSize); - if (model_hope_fear) { - // Set loss for each sentence as BLEU(oracle) - BLEU(hypothesis) - for (size_t batchPosition = 0; batchPosition < actualBatchSize; ++batchPosition) { - for (size_t j = 0; j < bleuScores[batchPosition].size(); ++j) { - losses[batchPosition].push_back(oracleBleuScores[batchPosition] - bleuScores[batchPosition][j]); - } - } - } - - // set weight for bleu feature to 0 before optimizing - vector::const_iterator iter; - const vector &featureFunctions2 = FeatureFunction::GetFeatureFunctions(); - for (iter = featureFunctions2.begin(); iter != featureFunctions2.end(); ++iter) { - if ((*iter)->GetScoreProducerDescription() == "BleuScoreFeature") { - mosesWeights.Assign(*iter, 0); - break; - } - } - - // scale LM feature (to avoid rapid changes) - if (scale_lm) { - cerr << "scale lm" << endl; - const std::vector &statefulFFs = StatefulFeatureFunction::GetStatefulFeatureFunctions(); - for (size_t i = 0; i < statefulFFs.size(); ++i) { - const StatefulFeatureFunction *ff = statefulFFs[i]; - const LanguageModel *lm = dynamic_cast(ff); - if (lm) { - // scale down score - if (model_hope_fear) { - scaleFeatureScore(lm, scale_lm_factor, featureValues, rank, epoch); - } - else { - scaleFeatureScore(lm, scale_lm_factor, featureValuesHope, rank, epoch); - scaleFeatureScore(lm, scale_lm_factor, featureValuesFear, rank, epoch); - } - } - } - } - - // scale WP - if (scale_wp) { - // scale up weight - WordPenaltyProducer *wp = StaticData::InstanceNonConst().GetWordPenaltyProducer(); - - // scale down score - if (model_hope_fear) { - scaleFeatureScore(wp, scale_wp_factor, featureValues, rank, epoch); - } - else { - scaleFeatureScore(wp, scale_wp_factor, featureValuesHope, rank, epoch); - scaleFeatureScore(wp, scale_wp_factor, featureValuesFear, rank, epoch); - } - } - - // print out the feature values - if (print_feature_values) { - cerr << "\nRank " << rank << ", epoch " << epoch << ", feature values: " << endl; - if (model_hope_fear) printFeatureValues(featureValues); - else { - cerr << "hope: " << endl; - printFeatureValues(featureValuesHope); - cerr << "fear: " << endl; - printFeatureValues(featureValuesFear); - } - } - - // apply learning rates to feature vectors before optimization - if (feature_confidence) { - cerr << "Rank " << rank << ", epoch " << epoch << ", apply feature learning rates with decays " << decay_core << "/" << decay_sparse << ": " << featureLearningRates << endl; - if (model_hope_fear) { - applyPerFeatureLearningRates(featureValues, featureLearningRates, sparse_r0); - } - else { - applyPerFeatureLearningRates(featureValuesHope, featureLearningRates, sparse_r0); - applyPerFeatureLearningRates(featureValuesFear, featureLearningRates, sparse_r0); - } - } - else { - // apply fixed learning rates - cerr << "Rank " << rank << ", epoch " << epoch << ", apply fixed learning rates, core: " << core_r0 << ", sparse: " << sparse_r0 << endl; - if (core_r0 != 1.0 || sparse_r0 != 1.0) { - if (model_hope_fear) { - applyLearningRates(featureValues, core_r0, sparse_r0); - } - else { - applyLearningRates(featureValuesHope, core_r0, sparse_r0); - applyLearningRates(featureValuesFear, core_r0, sparse_r0); - } - } - } - - // Run optimiser on batch: - VERBOSE(1, "\nRank " << rank 
<< ", epoch " << epoch << ", run optimiser:" << endl); - size_t update_status = 1; - ScoreComponentCollection weightUpdate; - if (perceptron_update) { - vector > dummy1; - update_status = optimiser->updateWeightsHopeFear( weightUpdate, featureValuesHope, - featureValuesFear, dummy1, dummy1, dummy1, dummy1, learning_rate, rank, epoch); - } - else if (hope_fear) { - if (bleuScoresHope[0][0] >= min_oracle_bleu) { - if (hope_n == 1 && fear_n ==1 && batchSize == 1 && !hildreth) { - update_status = ((MiraOptimiser*) optimiser)->updateWeightsAnalytically(weightUpdate, - featureValuesHope[0][0], featureValuesFear[0][0], bleuScoresHope[0][0], - bleuScoresFear[0][0], modelScoresHope[0][0], modelScoresFear[0][0], learning_rate, rank, epoch); - } - else - update_status = optimiser->updateWeightsHopeFear(weightUpdate, featureValuesHope, - featureValuesFear, bleuScoresHope, bleuScoresFear, modelScoresHope, - modelScoresFear, learning_rate, rank, epoch); - } - else - update_status = 1; - } - else if (kbest) { - if (selective) - update_status = ((MiraOptimiser*)optimiser)->updateWeightsHopeFearSelective( - weightUpdate, featureValuesHope, featureValuesFear, bleuScoresHope, bleuScoresFear, - modelScoresHope, modelScoresFear, learning_rate, rank, epoch); - else if (summed) - update_status = ((MiraOptimiser*)optimiser)->updateWeightsHopeFearSummed( - weightUpdate, featureValuesHope, featureValuesFear, bleuScoresHope, bleuScoresFear, - modelScoresHope, modelScoresFear, learning_rate, rank, epoch, rescaleSlack, makePairs); - else { - if (batchSize == 1 && featureValuesHope[0].size() == 1 && !hildreth) { - cerr << "Rank " << rank << ", epoch " << epoch << ", model score hope: " << modelScoresHope[0][0] << endl; - cerr << "Rank " << rank << ", epoch " << epoch << ", model score fear: " << modelScoresFear[0][0] << endl; - update_status = ((MiraOptimiser*) optimiser)->updateWeightsAnalytically( - weightUpdate, featureValuesHope[0][0], featureValuesFear[0][0], - bleuScoresHope[0][0], bleuScoresFear[0][0], modelScoresHope[0][0], - modelScoresFear[0][0], learning_rate, rank, epoch); - } - else { - cerr << "Rank " << rank << ", epoch " << epoch << ", model score hope: " << modelScoresHope[0][0] << endl; - cerr << "Rank " << rank << ", epoch " << epoch << ", model score fear: " << modelScoresFear[0][0] << endl; - update_status = optimiser->updateWeightsHopeFear(weightUpdate, featureValuesHope, - featureValuesFear, bleuScoresHope, bleuScoresFear, modelScoresHope, - modelScoresFear, learning_rate, rank, epoch); - } - } - } - else { - // model_hope_fear - update_status = ((MiraOptimiser*) optimiser)->updateWeights(weightUpdate, - featureValues, losses, bleuScores, modelScores, oracleFeatureValues, - oracleBleuScores, oracleModelScores, learning_rate, rank, epoch); - } - - // sumStillViolatedConstraints += update_status; - - if (update_status == 0) { // if weights were updated - // apply weight update - if (debug) - cerr << "Rank " << rank << ", epoch " << epoch << ", update: " << weightUpdate << endl; - - if (feature_confidence) { - // update confidence counts based on weight update - confidenceCounts.UpdateConfidenceCounts(weightUpdate, signed_counts); - - // update feature learning rates - featureLearningRates.UpdateLearningRates(decay_core, decay_sparse, confidenceCounts, core_r0, sparse_r0); - } - - // apply weight update to Moses weights - mosesWeights.PlusEquals(weightUpdate); - - if (normaliseWeights) - mosesWeights.L1Normalise(); - - cumulativeWeights.PlusEquals(mosesWeights); - if (sparseAverage) { - 
ScoreComponentCollection binary; - binary.SetToBinaryOf(mosesWeights); - cumulativeWeightsBinary.PlusEquals(binary); - } - - ++numberOfUpdates; - ++numberOfUpdatesThisEpoch; - if (averageWeights) { - ScoreComponentCollection averageWeights(cumulativeWeights); - if (accumulateWeights) { - averageWeights.DivideEquals(numberOfUpdates); - } else { - averageWeights.DivideEquals(numberOfUpdatesThisEpoch); - } - - mosesWeights = averageWeights; - } - - // set new Moses weights - decoder->setWeights(mosesWeights); - //cerr << "Rank " << rank << ", epoch " << epoch << ", new weights: " << mosesWeights << endl; - } - - // update history (for approximate document Bleu) - if (historyBleu || simpleHistoryBleu) { - for (size_t i = 0; i < oneBests.size(); ++i) - cerr << "Rank " << rank << ", epoch " << epoch << ", update history with 1best length: " << oneBests[i].size() << " "; - decoder->updateHistory(oneBests, inputLengths, ref_ids, rank, epoch); - deleteTranslations(oneBests); - } + if (examples_in_batch == 0 || (kbest && skip_example)) { + cerr << "Rank " << rank << ", epoch " << epoch << ", batch is empty." << endl; + } else { + vector > losses(actualBatchSize); + if (model_hope_fear) { + // Set loss for each sentence as BLEU(oracle) - BLEU(hypothesis) + for (size_t batchPosition = 0; batchPosition < actualBatchSize; ++batchPosition) { + for (size_t j = 0; j < bleuScores[batchPosition].size(); ++j) { + losses[batchPosition].push_back(oracleBleuScores[batchPosition] - bleuScores[batchPosition][j]); + } + } + } + + // set weight for bleu feature to 0 before optimizing + vector::const_iterator iter; + const vector &featureFunctions2 = FeatureFunction::GetFeatureFunctions(); + for (iter = featureFunctions2.begin(); iter != featureFunctions2.end(); ++iter) { + if ((*iter)->GetScoreProducerDescription() == "BleuScoreFeature") { + mosesWeights.Assign(*iter, 0); + break; + } + } + + // scale LM feature (to avoid rapid changes) + if (scale_lm) { + cerr << "scale lm" << endl; + const std::vector &statefulFFs = StatefulFeatureFunction::GetStatefulFeatureFunctions(); + for (size_t i = 0; i < statefulFFs.size(); ++i) { + const StatefulFeatureFunction *ff = statefulFFs[i]; + const LanguageModel *lm = dynamic_cast(ff); + + if (lm) { + // scale down score + if (model_hope_fear) { + scaleFeatureScore(lm, scale_lm_factor, featureValues, rank, epoch); + } else { + scaleFeatureScore(lm, scale_lm_factor, featureValuesHope, rank, epoch); + scaleFeatureScore(lm, scale_lm_factor, featureValuesFear, rank, epoch); + } + } + } + } + + // scale WP + if (scale_wp) { + // scale up weight + WordPenaltyProducer *wp = StaticData::InstanceNonConst().GetWordPenaltyProducer(); + + // scale down score + if (model_hope_fear) { + scaleFeatureScore(wp, scale_wp_factor, featureValues, rank, epoch); + } else { + scaleFeatureScore(wp, scale_wp_factor, featureValuesHope, rank, epoch); + scaleFeatureScore(wp, scale_wp_factor, featureValuesFear, rank, epoch); + } + } + + // print out the feature values + if (print_feature_values) { + cerr << "\nRank " << rank << ", epoch " << epoch << ", feature values: " << endl; + if (model_hope_fear) printFeatureValues(featureValues); + else { + cerr << "hope: " << endl; + printFeatureValues(featureValuesHope); + cerr << "fear: " << endl; + printFeatureValues(featureValuesFear); + } + } + + // apply learning rates to feature vectors before optimization + if (feature_confidence) { + cerr << "Rank " << rank << ", epoch " << epoch << ", apply feature learning rates with decays " << decay_core << "/" << 
decay_sparse << ": " << featureLearningRates << endl; + if (model_hope_fear) { + applyPerFeatureLearningRates(featureValues, featureLearningRates, sparse_r0); + } else { + applyPerFeatureLearningRates(featureValuesHope, featureLearningRates, sparse_r0); + applyPerFeatureLearningRates(featureValuesFear, featureLearningRates, sparse_r0); + } + } else { + // apply fixed learning rates + cerr << "Rank " << rank << ", epoch " << epoch << ", apply fixed learning rates, core: " << core_r0 << ", sparse: " << sparse_r0 << endl; + if (core_r0 != 1.0 || sparse_r0 != 1.0) { + if (model_hope_fear) { + applyLearningRates(featureValues, core_r0, sparse_r0); + } else { + applyLearningRates(featureValuesHope, core_r0, sparse_r0); + applyLearningRates(featureValuesFear, core_r0, sparse_r0); + } + } + } + + // Run optimiser on batch: + VERBOSE(1, "\nRank " << rank << ", epoch " << epoch << ", run optimiser:" << endl); + size_t update_status = 1; + ScoreComponentCollection weightUpdate; + if (perceptron_update) { + vector > dummy1; + update_status = optimiser->updateWeightsHopeFear( weightUpdate, featureValuesHope, + featureValuesFear, dummy1, dummy1, dummy1, dummy1, learning_rate, rank, epoch); + } else if (hope_fear) { + if (bleuScoresHope[0][0] >= min_oracle_bleu) { + if (hope_n == 1 && fear_n ==1 && batchSize == 1 && !hildreth) { + update_status = ((MiraOptimiser*) optimiser)->updateWeightsAnalytically(weightUpdate, + featureValuesHope[0][0], featureValuesFear[0][0], bleuScoresHope[0][0], + bleuScoresFear[0][0], modelScoresHope[0][0], modelScoresFear[0][0], learning_rate, rank, epoch); + } else + update_status = optimiser->updateWeightsHopeFear(weightUpdate, featureValuesHope, + featureValuesFear, bleuScoresHope, bleuScoresFear, modelScoresHope, + modelScoresFear, learning_rate, rank, epoch); + } else + update_status = 1; + } else if (kbest) { + if (selective) + update_status = ((MiraOptimiser*)optimiser)->updateWeightsHopeFearSelective( + weightUpdate, featureValuesHope, featureValuesFear, bleuScoresHope, bleuScoresFear, + modelScoresHope, modelScoresFear, learning_rate, rank, epoch); + else if (summed) + update_status = ((MiraOptimiser*)optimiser)->updateWeightsHopeFearSummed( + weightUpdate, featureValuesHope, featureValuesFear, bleuScoresHope, bleuScoresFear, + modelScoresHope, modelScoresFear, learning_rate, rank, epoch, rescaleSlack, makePairs); + else { + if (batchSize == 1 && featureValuesHope[0].size() == 1 && !hildreth) { + cerr << "Rank " << rank << ", epoch " << epoch << ", model score hope: " << modelScoresHope[0][0] << endl; + cerr << "Rank " << rank << ", epoch " << epoch << ", model score fear: " << modelScoresFear[0][0] << endl; + update_status = ((MiraOptimiser*) optimiser)->updateWeightsAnalytically( + weightUpdate, featureValuesHope[0][0], featureValuesFear[0][0], + bleuScoresHope[0][0], bleuScoresFear[0][0], modelScoresHope[0][0], + modelScoresFear[0][0], learning_rate, rank, epoch); + } else { + cerr << "Rank " << rank << ", epoch " << epoch << ", model score hope: " << modelScoresHope[0][0] << endl; + cerr << "Rank " << rank << ", epoch " << epoch << ", model score fear: " << modelScoresFear[0][0] << endl; + update_status = optimiser->updateWeightsHopeFear(weightUpdate, featureValuesHope, + featureValuesFear, bleuScoresHope, bleuScoresFear, modelScoresHope, + modelScoresFear, learning_rate, rank, epoch); + } + } + } else { + // model_hope_fear + update_status = ((MiraOptimiser*) optimiser)->updateWeights(weightUpdate, + featureValues, losses, bleuScores, modelScores, 
oracleFeatureValues, + oracleBleuScores, oracleModelScores, learning_rate, rank, epoch); + } + + // sumStillViolatedConstraints += update_status; + + if (update_status == 0) { // if weights were updated + // apply weight update + if (debug) + cerr << "Rank " << rank << ", epoch " << epoch << ", update: " << weightUpdate << endl; + + if (feature_confidence) { + // update confidence counts based on weight update + confidenceCounts.UpdateConfidenceCounts(weightUpdate, signed_counts); + + // update feature learning rates + featureLearningRates.UpdateLearningRates(decay_core, decay_sparse, confidenceCounts, core_r0, sparse_r0); + } + + // apply weight update to Moses weights + mosesWeights.PlusEquals(weightUpdate); + + if (normaliseWeights) + mosesWeights.L1Normalise(); + + cumulativeWeights.PlusEquals(mosesWeights); + if (sparseAverage) { + ScoreComponentCollection binary; + binary.SetToBinaryOf(mosesWeights); + cumulativeWeightsBinary.PlusEquals(binary); + } + + ++numberOfUpdates; + ++numberOfUpdatesThisEpoch; + if (averageWeights) { + ScoreComponentCollection averageWeights(cumulativeWeights); + if (accumulateWeights) { + averageWeights.DivideEquals(numberOfUpdates); + } else { + averageWeights.DivideEquals(numberOfUpdatesThisEpoch); + } + + mosesWeights = averageWeights; + } + + // set new Moses weights + decoder->setWeights(mosesWeights); + //cerr << "Rank " << rank << ", epoch " << epoch << ", new weights: " << mosesWeights << endl; + } + + // update history (for approximate document Bleu) + if (historyBleu || simpleHistoryBleu) { + for (size_t i = 0; i < oneBests.size(); ++i) + cerr << "Rank " << rank << ", epoch " << epoch << ", update history with 1best length: " << oneBests[i].size() << " "; + decoder->updateHistory(oneBests, inputLengths, ref_ids, rank, epoch); + deleteTranslations(oneBests); + } } // END TRANSLATE AND UPDATE BATCH - + // size of all shards except for the last one size_t generalShardSize; if (trainWithMultipleFolds) - generalShardSize = order.size()/coresPerFold; + generalShardSize = order.size()/coresPerFold; else - generalShardSize = order.size()/size; - + generalShardSize = order.size()/size; + size_t mixing_base = mixingFrequency == 0 ? 0 : generalShardSize / mixingFrequency; size_t dumping_base = weightDumpFrequency == 0 ? 0 : generalShardSize / weightDumpFrequency; bool mix = evaluateModulo(shardPosition, mixing_base, actualBatchSize); - + // mix weights? if (mix) { #ifdef MPI_ENABLE - cerr << "Rank " << rank << ", epoch " << epoch << ", mixing weights.. " << endl; - // collect all weights in mixedWeights and divide by number of processes - mpi::reduce(world, mosesWeights, mixedWeights, SCCPlus(), 0); - - // mix confidence counts - //mpi::reduce(world, confidenceCounts, mixedConfidenceCounts, SCCPlus(), 0); - ScoreComponentCollection totalBinary; - if (sparseAverage) { - ScoreComponentCollection binary; - binary.SetToBinaryOf(mosesWeights); - mpi::reduce(world, binary, totalBinary, SCCPlus(), 0); - } - if (rank == 0) { - // divide by number of processes - if (sparseNoAverage) - mixedWeights.CoreDivideEquals(size); // average only core weights - else if (sparseAverage) - mixedWeights.DivideEquals(totalBinary); - else - mixedWeights.DivideEquals(size); - - // divide confidence counts - //mixedConfidenceCounts.DivideEquals(size); - - // normalise weights after averaging - if (normaliseWeights) { - mixedWeights.L1Normalise(); - } - - ++weightMixingThisEpoch; + cerr << "Rank " << rank << ", epoch " << epoch << ", mixing weights.. 
" << endl; + // collect all weights in mixedWeights and divide by number of processes + mpi::reduce(world, mosesWeights, mixedWeights, SCCPlus(), 0); - if (pruneZeroWeights) { - size_t pruned = mixedWeights.PruneZeroWeightFeatures(); - cerr << "Rank " << rank << ", epoch " << epoch << ", " - << pruned << " zero-weighted features pruned from mixedWeights." << endl; - - pruned = cumulativeWeights.PruneZeroWeightFeatures(); - cerr << "Rank " << rank << ", epoch " << epoch << ", " - << pruned << " zero-weighted features pruned from cumulativeWeights." << endl; - } + // mix confidence counts + //mpi::reduce(world, confidenceCounts, mixedConfidenceCounts, SCCPlus(), 0); + ScoreComponentCollection totalBinary; + if (sparseAverage) { + ScoreComponentCollection binary; + binary.SetToBinaryOf(mosesWeights); + mpi::reduce(world, binary, totalBinary, SCCPlus(), 0); + } + if (rank == 0) { + // divide by number of processes + if (sparseNoAverage) + mixedWeights.CoreDivideEquals(size); // average only core weights + else if (sparseAverage) + mixedWeights.DivideEquals(totalBinary); + else + mixedWeights.DivideEquals(size); - if (featureCutoff != -1 && weightMixingThisEpoch == mixingFrequency) { - size_t pruned = mixedWeights.PruneSparseFeatures(featureCutoff); - cerr << "Rank " << rank << ", epoch " << epoch << ", " - << pruned << " features pruned from mixedWeights." << endl; - - pruned = cumulativeWeights.PruneSparseFeatures(featureCutoff); - cerr << "Rank " << rank << ", epoch " << epoch << ", " - << pruned << " features pruned from cumulativeWeights." << endl; - } - - if (weightMixingThisEpoch == mixingFrequency || reg_on_every_mix) { - if (l1_regularize) { - size_t pruned; - if (l1_reg_sparse) - pruned = mixedWeights.SparseL1Regularize(l1_lambda); - else - pruned = mixedWeights.L1Regularize(l1_lambda); - cerr << "Rank " << rank << ", epoch " << epoch << ", " - << "l1-reg. on mixedWeights with lambda=" << l1_lambda << ", pruned: " << pruned << endl; - } - if (l2_regularize) { - if (l2_reg_sparse) - mixedWeights.SparseL2Regularize(l2_lambda); - else - mixedWeights.L2Regularize(l2_lambda); - cerr << "Rank " << rank << ", epoch " << epoch << ", " - << "l2-reg. on mixedWeights with lambda=" << l2_lambda << endl; - } - } - } - - // broadcast average weights from process 0 - mpi::broadcast(world, mixedWeights, 0); - decoder->setWeights(mixedWeights); - mosesWeights = mixedWeights; - - // broadcast summed confidence counts - //mpi::broadcast(world, mixedConfidenceCounts, 0); - //confidenceCounts = mixedConfidenceCounts; + // divide confidence counts + //mixedConfidenceCounts.DivideEquals(size); + + // normalise weights after averaging + if (normaliseWeights) { + mixedWeights.L1Normalise(); + } + + ++weightMixingThisEpoch; + + if (pruneZeroWeights) { + size_t pruned = mixedWeights.PruneZeroWeightFeatures(); + cerr << "Rank " << rank << ", epoch " << epoch << ", " + << pruned << " zero-weighted features pruned from mixedWeights." << endl; + + pruned = cumulativeWeights.PruneZeroWeightFeatures(); + cerr << "Rank " << rank << ", epoch " << epoch << ", " + << pruned << " zero-weighted features pruned from cumulativeWeights." << endl; + } + + if (featureCutoff != -1 && weightMixingThisEpoch == mixingFrequency) { + size_t pruned = mixedWeights.PruneSparseFeatures(featureCutoff); + cerr << "Rank " << rank << ", epoch " << epoch << ", " + << pruned << " features pruned from mixedWeights." 
<< endl; + + pruned = cumulativeWeights.PruneSparseFeatures(featureCutoff); + cerr << "Rank " << rank << ", epoch " << epoch << ", " + << pruned << " features pruned from cumulativeWeights." << endl; + } + + if (weightMixingThisEpoch == mixingFrequency || reg_on_every_mix) { + if (l1_regularize) { + size_t pruned; + if (l1_reg_sparse) + pruned = mixedWeights.SparseL1Regularize(l1_lambda); + else + pruned = mixedWeights.L1Regularize(l1_lambda); + cerr << "Rank " << rank << ", epoch " << epoch << ", " + << "l1-reg. on mixedWeights with lambda=" << l1_lambda << ", pruned: " << pruned << endl; + } + if (l2_regularize) { + if (l2_reg_sparse) + mixedWeights.SparseL2Regularize(l2_lambda); + else + mixedWeights.L2Regularize(l2_lambda); + cerr << "Rank " << rank << ", epoch " << epoch << ", " + << "l2-reg. on mixedWeights with lambda=" << l2_lambda << endl; + } + } + } + + // broadcast average weights from process 0 + mpi::broadcast(world, mixedWeights, 0); + decoder->setWeights(mixedWeights); + mosesWeights = mixedWeights; + + // broadcast summed confidence counts + //mpi::broadcast(world, mixedConfidenceCounts, 0); + //confidenceCounts = mixedConfidenceCounts; #endif #ifndef MPI_ENABLE - //cerr << "\nRank " << rank << ", no mixing, weights: " << mosesWeights << endl; - mixedWeights = mosesWeights; + //cerr << "\nRank " << rank << ", no mixing, weights: " << mosesWeights << endl; + mixedWeights = mosesWeights; #endif } // end mixing - + // Dump weights? if (trainWithMultipleFolds || weightEpochDump == weightDumpFrequency) { - // dump mixed weights at end of every epoch to enable continuing a crashed experiment - // (for jackknife every time the weights are mixed) - ostringstream filename; - if (epoch < 10) - filename << weightDumpStem << "_mixed_0" << epoch; - else - filename << weightDumpStem << "_mixed_" << epoch; - - if (weightDumpFrequency > 1) - filename << "_" << weightEpochDump; - - mixedWeights.Save(filename.str()); - cerr << "Dumping mixed weights during epoch " << epoch << " to " << filename.str() << endl << endl; + // dump mixed weights at end of every epoch to enable continuing a crashed experiment + // (for jackknife every time the weights are mixed) + ostringstream filename; + if (epoch < 10) + filename << weightDumpStem << "_mixed_0" << epoch; + else + filename << weightDumpStem << "_mixed_" << epoch; + + if (weightDumpFrequency > 1) + filename << "_" << weightEpochDump; + + mixedWeights.Save(filename.str()); + cerr << "Dumping mixed weights during epoch " << epoch << " to " << filename.str() << endl << endl; } if (dumpMixedWeights) { - if (mix && rank == 0 && !weightDumpStem.empty()) { - // dump mixed weights instead of average weights - ostringstream filename; - if (epoch < 10) - filename << weightDumpStem << "_0" << epoch; - else - filename << weightDumpStem << "_" << epoch; - - if (weightDumpFrequency > 1) - filename << "_" << weightEpochDump; - - cerr << "Dumping mixed weights during epoch " << epoch << " to " << filename.str() << endl << endl; - mixedWeights.Save(filename.str()); - ++weightEpochDump; - } - } - else { - if (evaluateModulo(shardPosition, dumping_base, actualBatchSize)) { - cerr << "Rank " << rank << ", epoch " << epoch << ", dump weights.. 
(pos: " << shardPosition << ", base: " << dumping_base << ")" << endl; - ScoreComponentCollection tmpAverageWeights(cumulativeWeights); - bool proceed = false; - if (accumulateWeights) { - if (numberOfUpdates > 0) { - tmpAverageWeights.DivideEquals(numberOfUpdates); - proceed = true; - } - } else { - if (numberOfUpdatesThisEpoch > 0) { - if (sparseNoAverage) // average only core weights - tmpAverageWeights.CoreDivideEquals(numberOfUpdatesThisEpoch); - else if (sparseAverage) - tmpAverageWeights.DivideEquals(cumulativeWeightsBinary); - else - tmpAverageWeights.DivideEquals(numberOfUpdatesThisEpoch); - proceed = true; - } - } - - if (proceed) { + if (mix && rank == 0 && !weightDumpStem.empty()) { + // dump mixed weights instead of average weights + ostringstream filename; + if (epoch < 10) + filename << weightDumpStem << "_0" << epoch; + else + filename << weightDumpStem << "_" << epoch; + + if (weightDumpFrequency > 1) + filename << "_" << weightEpochDump; + + cerr << "Dumping mixed weights during epoch " << epoch << " to " << filename.str() << endl << endl; + mixedWeights.Save(filename.str()); + ++weightEpochDump; + } + } else { + if (evaluateModulo(shardPosition, dumping_base, actualBatchSize)) { + cerr << "Rank " << rank << ", epoch " << epoch << ", dump weights.. (pos: " << shardPosition << ", base: " << dumping_base << ")" << endl; + ScoreComponentCollection tmpAverageWeights(cumulativeWeights); + bool proceed = false; + if (accumulateWeights) { + if (numberOfUpdates > 0) { + tmpAverageWeights.DivideEquals(numberOfUpdates); + proceed = true; + } + } else { + if (numberOfUpdatesThisEpoch > 0) { + if (sparseNoAverage) // average only core weights + tmpAverageWeights.CoreDivideEquals(numberOfUpdatesThisEpoch); + else if (sparseAverage) + tmpAverageWeights.DivideEquals(cumulativeWeightsBinary); + else + tmpAverageWeights.DivideEquals(numberOfUpdatesThisEpoch); + proceed = true; + } + } + + if (proceed) { #ifdef MPI_ENABLE - // average across processes - mpi::reduce(world, tmpAverageWeights, mixedAverageWeights, SCCPlus(), 0); - ScoreComponentCollection totalBinary; - if (sparseAverage) { - ScoreComponentCollection binary; - binary.SetToBinaryOf(mosesWeights); - mpi::reduce(world, binary, totalBinary, SCCPlus(), 0); - } + // average across processes + mpi::reduce(world, tmpAverageWeights, mixedAverageWeights, SCCPlus(), 0); + ScoreComponentCollection totalBinary; + if (sparseAverage) { + ScoreComponentCollection binary; + binary.SetToBinaryOf(mosesWeights); + mpi::reduce(world, binary, totalBinary, SCCPlus(), 0); + } #endif #ifndef MPI_ENABLE - mixedAverageWeights = tmpAverageWeights; - //FIXME: What do to for non-mpi version - ScoreComponentCollection totalBinary; + mixedAverageWeights = tmpAverageWeights; + //FIXME: What do to for non-mpi version + ScoreComponentCollection totalBinary; #endif - if (rank == 0 && !weightDumpStem.empty()) { - // divide by number of processes - if (sparseNoAverage) - mixedAverageWeights.CoreDivideEquals(size); // average only core weights - else if (sparseAverage) - mixedAverageWeights.DivideEquals(totalBinary); - else - mixedAverageWeights.DivideEquals(size); - - // normalise weights after averaging - if (normaliseWeights) { - mixedAverageWeights.L1Normalise(); - } - - // dump final average weights - ostringstream filename; - if (epoch < 10) { - filename << weightDumpStem << "_0" << epoch; - } else { - filename << weightDumpStem << "_" << epoch; - } - - if (weightDumpFrequency > 1) { - filename << "_" << weightEpochDump; - } - - /*if (accumulateWeights) { - 
cerr << "\nMixed average weights (cumulative) during epoch " << epoch << ": " << mixedAverageWeights << endl; - } else { - cerr << "\nMixed average weights during epoch " << epoch << ": " << mixedAverageWeights << endl; - }*/ - - cerr << "Dumping mixed average weights during epoch " << epoch << " to " << filename.str() << endl << endl; - mixedAverageWeights.Save(filename.str()); - ++weightEpochDump; - - if (weightEpochDump == weightDumpFrequency) { - if (l1_regularize) { - size_t pruned = mixedAverageWeights.SparseL1Regularize(l1_lambda); - cerr << "Rank " << rank << ", epoch " << epoch << ", " - << "l1-reg. on mixedAverageWeights with lambda=" << l1_lambda << ", pruned: " << pruned << endl; + if (rank == 0 && !weightDumpStem.empty()) { + // divide by number of processes + if (sparseNoAverage) + mixedAverageWeights.CoreDivideEquals(size); // average only core weights + else if (sparseAverage) + mixedAverageWeights.DivideEquals(totalBinary); + else + mixedAverageWeights.DivideEquals(size); - } - if (l2_regularize) { - mixedAverageWeights.SparseL2Regularize(l2_lambda); - cerr << "Rank " << rank << ", epoch " << epoch << ", " - << "l2-reg. on mixedAverageWeights with lambda=" << l2_lambda << endl; - } - - if (l1_regularize || l2_regularize) { - filename << "_reg"; - cerr << "Dumping regularized mixed average weights during epoch " << epoch << " to " << filename.str() << endl << endl; - mixedAverageWeights.Save(filename.str()); - } - } - - if (weightEpochDump == weightDumpFrequency && printFeatureCounts) { - // print out all features with counts - stringstream s1, s2; - s1 << "sparse_feature_hope_counts" << "_" << epoch; - s2 << "sparse_feature_fear_counts" << "_" << epoch; - ofstream sparseFeatureCountsHope(s1.str().c_str()); - ofstream sparseFeatureCountsFear(s2.str().c_str()); - - mixedAverageWeights.PrintSparseHopeFeatureCounts(sparseFeatureCountsHope); - mixedAverageWeights.PrintSparseFearFeatureCounts(sparseFeatureCountsFear); - sparseFeatureCountsHope.close(); - sparseFeatureCountsFear.close(); - } - } - } - }// end dumping + // normalise weights after averaging + if (normaliseWeights) { + mixedAverageWeights.L1Normalise(); + } + + // dump final average weights + ostringstream filename; + if (epoch < 10) { + filename << weightDumpStem << "_0" << epoch; + } else { + filename << weightDumpStem << "_" << epoch; + } + + if (weightDumpFrequency > 1) { + filename << "_" << weightEpochDump; + } + + /*if (accumulateWeights) { + cerr << "\nMixed average weights (cumulative) during epoch " << epoch << ": " << mixedAverageWeights << endl; + } else { + cerr << "\nMixed average weights during epoch " << epoch << ": " << mixedAverageWeights << endl; + }*/ + + cerr << "Dumping mixed average weights during epoch " << epoch << " to " << filename.str() << endl << endl; + mixedAverageWeights.Save(filename.str()); + ++weightEpochDump; + + if (weightEpochDump == weightDumpFrequency) { + if (l1_regularize) { + size_t pruned = mixedAverageWeights.SparseL1Regularize(l1_lambda); + cerr << "Rank " << rank << ", epoch " << epoch << ", " + << "l1-reg. on mixedAverageWeights with lambda=" << l1_lambda << ", pruned: " << pruned << endl; + + } + if (l2_regularize) { + mixedAverageWeights.SparseL2Regularize(l2_lambda); + cerr << "Rank " << rank << ", epoch " << epoch << ", " + << "l2-reg. 
on mixedAverageWeights with lambda=" << l2_lambda << endl; + } + + if (l1_regularize || l2_regularize) { + filename << "_reg"; + cerr << "Dumping regularized mixed average weights during epoch " << epoch << " to " << filename.str() << endl << endl; + mixedAverageWeights.Save(filename.str()); + } + } + + if (weightEpochDump == weightDumpFrequency && printFeatureCounts) { + // print out all features with counts + stringstream s1, s2; + s1 << "sparse_feature_hope_counts" << "_" << epoch; + s2 << "sparse_feature_fear_counts" << "_" << epoch; + ofstream sparseFeatureCountsHope(s1.str().c_str()); + ofstream sparseFeatureCountsFear(s2.str().c_str()); + + mixedAverageWeights.PrintSparseHopeFeatureCounts(sparseFeatureCountsHope); + mixedAverageWeights.PrintSparseFearFeatureCounts(sparseFeatureCountsFear); + sparseFeatureCountsHope.close(); + sparseFeatureCountsFear.close(); + } + } + } + }// end dumping } // end if dump } // end of shard loop, end of this epoch cerr << "Rank " << rank << ", epoch " << epoch << ", end of epoch.." << endl; - + if (historyBleu || simpleHistoryBleu) { cerr << "Bleu feature history after epoch " << epoch << endl; decoder->printBleuFeatureHistory(cerr); } // cerr << "Rank " << rank << ", epoch " << epoch << ", sum of violated constraints: " << sumStillViolatedConstraints << endl; - + // Check whether there were any weight updates during this epoch size_t sumUpdates; size_t *sendbuf_uint, *recvbuf_uint; @@ -1643,63 +1615,62 @@ int main(int argc, char** argv) { mpi::broadcast(world, stop, 0); #endif } - + if (!stop) { // Test if weights have converged if (weightConvergence) { - bool reached = true; - if (rank == 0 && (epoch >= 2)) { - ScoreComponentCollection firstDiff, secondDiff; - if (dumpMixedWeights) { - firstDiff = mixedWeights; - firstDiff.MinusEquals(mixedWeightsPrevious); - secondDiff = mixedWeights; - secondDiff.MinusEquals(mixedWeightsBeforePrevious); - } - else { - firstDiff = mixedAverageWeights; - firstDiff.MinusEquals(mixedAverageWeightsPrevious); - secondDiff = mixedAverageWeights; - secondDiff.MinusEquals(mixedAverageWeightsBeforePrevious); - } - VERBOSE(1, "Average weight changes since previous epoch: " << firstDiff << " (max: " << firstDiff.GetLInfNorm() << ")" << endl); - VERBOSE(1, "Average weight changes since before previous epoch: " << secondDiff << " (max: " << secondDiff.GetLInfNorm() << ")" << endl << endl); - - // check whether stopping criterion has been reached - // (both difference vectors must have all weight changes smaller than min_weight_change) - if (firstDiff.GetLInfNorm() >= min_weight_change) - reached = false; - if (secondDiff.GetLInfNorm() >= min_weight_change) - reached = false; - if (reached) { - // stop MIRA - stop = true; - cerr << "\nWeights have converged after epoch " << epoch << ".. stopping MIRA." 
<< endl; - ScoreComponentCollection dummy; - ostringstream endfilename; - endfilename << "stopping"; - dummy.Save(endfilename.str()); - } - } - - mixedWeightsBeforePrevious = mixedWeightsPrevious; - mixedWeightsPrevious = mixedWeights; - mixedAverageWeightsBeforePrevious = mixedAverageWeightsPrevious; - mixedAverageWeightsPrevious = mixedAverageWeights; + bool reached = true; + if (rank == 0 && (epoch >= 2)) { + ScoreComponentCollection firstDiff, secondDiff; + if (dumpMixedWeights) { + firstDiff = mixedWeights; + firstDiff.MinusEquals(mixedWeightsPrevious); + secondDiff = mixedWeights; + secondDiff.MinusEquals(mixedWeightsBeforePrevious); + } else { + firstDiff = mixedAverageWeights; + firstDiff.MinusEquals(mixedAverageWeightsPrevious); + secondDiff = mixedAverageWeights; + secondDiff.MinusEquals(mixedAverageWeightsBeforePrevious); + } + VERBOSE(1, "Average weight changes since previous epoch: " << firstDiff << " (max: " << firstDiff.GetLInfNorm() << ")" << endl); + VERBOSE(1, "Average weight changes since before previous epoch: " << secondDiff << " (max: " << secondDiff.GetLInfNorm() << ")" << endl << endl); + + // check whether stopping criterion has been reached + // (both difference vectors must have all weight changes smaller than min_weight_change) + if (firstDiff.GetLInfNorm() >= min_weight_change) + reached = false; + if (secondDiff.GetLInfNorm() >= min_weight_change) + reached = false; + if (reached) { + // stop MIRA + stop = true; + cerr << "\nWeights have converged after epoch " << epoch << ".. stopping MIRA." << endl; + ScoreComponentCollection dummy; + ostringstream endfilename; + endfilename << "stopping"; + dummy.Save(endfilename.str()); + } + } + + mixedWeightsBeforePrevious = mixedWeightsPrevious; + mixedWeightsPrevious = mixedWeights; + mixedAverageWeightsBeforePrevious = mixedAverageWeightsPrevious; + mixedAverageWeightsPrevious = mixedAverageWeights; #ifdef MPI_ENABLE - mpi::broadcast(world, stop, 0); + mpi::broadcast(world, stop, 0); #endif } //end if (weightConvergence) } } // end of epoch loop - + #ifdef MPI_ENABLE MPI_Finalize(); #endif - + time(&now); cerr << "Rank " << rank << ", " << ctime(&now); - + if (rank == 0) { ScoreComponentCollection dummy; ostringstream endfilename; @@ -1711,7 +1682,8 @@ int main(int argc, char** argv) { exit(0); } -bool loadSentences(const string& filename, vector& sentences) { +bool loadSentences(const string& filename, vector& sentences) +{ ifstream in(filename.c_str()); if (!in) return false; @@ -1721,27 +1693,28 @@ bool loadSentences(const string& filename, vector& sentences) { return true; } -bool evaluateModulo(size_t shard_position, size_t mix_or_dump_base, size_t actual_batch_size) { +bool evaluateModulo(size_t shard_position, size_t mix_or_dump_base, size_t actual_batch_size) +{ if (mix_or_dump_base == 0) return 0; if (actual_batch_size > 1) { bool mix_or_dump = false; size_t numberSubtracts = actual_batch_size; do { if (shard_position % mix_or_dump_base == 0) { - mix_or_dump = true; - break; + mix_or_dump = true; + break; } --shard_position; --numberSubtracts; } while (numberSubtracts > 0); return mix_or_dump; - } - else { + } else { return ((shard_position % mix_or_dump_base) == 0); } } -void printFeatureValues(vector > &featureValues) { +void printFeatureValues(vector > &featureValues) +{ for (size_t i = 0; i < featureValues.size(); ++i) { for (size_t j = 0; j < featureValues[i].size(); ++j) { cerr << featureValues[i][j] << endl; @@ -1750,7 +1723,8 @@ void printFeatureValues(vector > &featureValues cerr << endl; } -void 
deleteTranslations(vector > &translations) { +void deleteTranslations(vector > &translations) +{ for (size_t i = 0; i < translations.size(); ++i) { for (size_t j = 0; j < translations[i].size(); ++j) { delete translations[i][j]; @@ -1758,19 +1732,20 @@ void deleteTranslations(vector > &translations) { } } -void decodeHopeOrFear(size_t rank, size_t size, size_t decode, string filename, vector &inputSentences, MosesDecoder* decoder, size_t n, float bleuWeight) { +void decodeHopeOrFear(size_t rank, size_t size, size_t decode, string filename, vector &inputSentences, MosesDecoder* decoder, size_t n, float bleuWeight) +{ if (decode == 1) cerr << "Rank " << rank << ", decoding dev input set according to hope objective.. " << endl; else if (decode == 2) cerr << "Rank " << rank << ", decoding dev input set according to fear objective.. " << endl; else cerr << "Rank " << rank << ", decoding dev input set according to normal objective.. " << endl; - + // Create shards according to the number of processes used vector order; for (size_t i = 0; i < inputSentences.size(); ++i) order.push_back(i); - + vector shard; float shardSize = (float) (order.size()) / size; size_t shardStart = (size_t) (shardSize * rank); @@ -1783,7 +1758,7 @@ void decodeHopeOrFear(size_t rank, size_t size, size_t decode, string filename, VERBOSE(1, "Rank " << rank << ", shard size: " << shardSize << endl); shard.resize(shardSize); copy(order.begin() + shardStart, order.begin() + shardEnd, shard.begin()); - + // open files for writing stringstream fname; fname << filename << ".rank" << rank; @@ -1802,76 +1777,79 @@ void decodeHopeOrFear(size_t rank, size_t size, size_t decode, string filename, msg << "Unable to open " << filename_nbest; throw runtime_error(msg.str()); } - + for (size_t i = 0; i < shard.size(); ++i) { size_t sid = shard[i]; string& input = inputSentences[sid]; - + vector > dummyFeatureValues; vector > dummyBleuScores; vector > dummyModelScores; - + vector newFeatureValues; vector newScores; dummyFeatureValues.push_back(newFeatureValues); dummyBleuScores.push_back(newScores); dummyModelScores.push_back(newScores); - + float factor = 0.0; if (decode == 1) factor = 1.0; if (decode == 2) factor = -1.0; cerr << "Rank " << rank << ", translating sentence " << sid << endl; bool realBleu = false; vector< vector > nbestOutput = decoder->getNBest(input, sid, n, factor, bleuWeight, dummyFeatureValues[0], - dummyBleuScores[0], dummyModelScores[0], n, realBleu, true, false, rank, 0, ""); + dummyBleuScores[0], dummyModelScores[0], n, realBleu, true, false, rank, 0, ""); cerr << endl; decoder->cleanup(StaticData::Instance().GetSearchAlgorithm() == ChartDecoding); - + for (size_t i = 0; i < nbestOutput.size(); ++i) { vector output = nbestOutput[i]; stringstream translation; for (size_t k = 0; k < output.size(); ++k) { - Word* w = const_cast(output[k]); - translation << w->GetString(0); - translation << " "; + Word* w = const_cast(output[k]); + translation << w->GetString(0); + translation << " "; } - + if (i == 0) - out << translation.str() << endl; + out << translation.str() << endl; nbest_out << sid << " ||| " << translation.str() << " ||| " << dummyFeatureValues[0][i] << - " ||| " << dummyModelScores[0][i] << " ||| sBleu=" << dummyBleuScores[0][i] << endl; + " ||| " << dummyModelScores[0][i] << " ||| sBleu=" << dummyBleuScores[0][i] << endl; } } - + out.close(); nbest_out.close(); cerr << "Closing files " << filename << " and " << filename_nbest.str() << endl; - + #ifdef MPI_ENABLE MPI_Finalize(); #endif - + time_t now; 
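+  // log the wall-clock time at which this rank finished decoding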
time(&now); cerr << "Rank " << rank << ", " << ctime(&now); - + delete decoder; exit(0); } -void applyLearningRates(vector > &featureValues, float core_r0, float sparse_r0) { +void applyLearningRates(vector > &featureValues, float core_r0, float sparse_r0) +{ for (size_t i=0; i > &featureValues, ScoreComponentCollection featureLearningRates, float sparse_r0) { +void applyPerFeatureLearningRates(vector > &featureValues, ScoreComponentCollection featureLearningRates, float sparse_r0) +{ for (size_t i=0; i > &featureValues, size_t rank, size_t epoch) { +void scaleFeatureScore(const FeatureFunction *sp, float scaling_factor, vector > &featureValues, size_t rank, size_t epoch) +{ string name = sp->GetScoreProducerDescription(); // scale down score @@ -1885,7 +1863,8 @@ void scaleFeatureScore(const FeatureFunction *sp, float scaling_factor, vector > &featureValues, size_t rank, size_t epoch) { +void scaleFeatureScores(const FeatureFunction *sp, float scaling_factor, vector > &featureValues, size_t rank, size_t epoch) +{ string name = sp->GetScoreProducerDescription(); // scale down score @@ -1893,7 +1872,7 @@ void scaleFeatureScores(const FeatureFunction *sp, float scaling_factor, vector< for (size_t j=0; j featureScores = featureValues[i][j].GetScoresForProducer(sp); for (size_t k=0; k bool from_string(T& t, const std::string& s, std::ios_base& ( } struct RandomIndex { - ptrdiff_t operator()(ptrdiff_t max) { - srand(time(0)); // Initialize random number generator with current time. - return static_cast (rand() % max); - } + ptrdiff_t operator()(ptrdiff_t max) { + srand(time(0)); // Initialize random number generator with current time. + return static_cast (rand() % max); + } }; //void OutputNBestList(const MosesChart::TrellisPathList &nBestList, const TranslationSystem* system, long translationId); @@ -50,7 +50,7 @@ void ignoreCoreFeatures(std::vector void takeLogs(std::vector > &featureValues, size_t base); void deleteTranslations(std::vector > &translations); void decodeHopeOrFear(size_t rank, size_t size, size_t decode, std::string decode_filename, std::vector &inputSentences, Mira::MosesDecoder* decoder, size_t n, float bleuWeight); -void applyLearningRates(std::vector > &featureValues, float core_r0, float sparse_r0); +void applyLearningRates(std::vector > &featureValues, float core_r0, float sparse_r0); void applyPerFeatureLearningRates(std::vector > &featureValues, Moses::ScoreComponentCollection featureLearningRates, float sparse_r0); void scaleFeatureScore(const Moses::FeatureFunction *sp, float scaling_factor, std::vector > &featureValues, size_t rank, size_t epoch); void scaleFeatureScores(const Moses::FeatureFunction *sp, float scaling_factor, std::vector > &featureValues, size_t rank, size_t epoch); diff --git a/mira/MiraOptimiser.cpp b/mira/MiraOptimiser.cpp index 4cc7f3fc3..82e9d85fb 100644 --- a/mira/MiraOptimiser.cpp +++ b/mira/MiraOptimiser.cpp @@ -5,234 +5,85 @@ using namespace Moses; using namespace std; -namespace Mira { +namespace Mira +{ size_t MiraOptimiser::updateWeights( - ScoreComponentCollection& weightUpdate, - const vector >& featureValues, - const vector >& losses, - const vector >& bleuScores, - const vector >& modelScores, - const vector& oracleFeatureValues, - const vector oracleBleuScores, - const vector oracleModelScores, - float learning_rate, - size_t rank, - size_t epoch) { - - // vector of feature values differences for all created constraints - vector featureValueDiffs; - vector lossMinusModelScoreDiffs; - vector all_losses; - - // most violated 
constraint in batch - ScoreComponentCollection max_batch_featureValueDiff; - - // Make constraints for new hypothesis translations - float epsilon = 0.0001; - int violatedConstraintsBefore = 0; - float oldDistanceFromOptimum = 0; - // iterate over input sentences (1 (online) or more (batch)) - for (size_t i = 0; i < featureValues.size(); ++i) { - //size_t sentenceId = sentenceIds[i]; - // iterate over hypothesis translations for one input sentence - for (size_t j = 0; j < featureValues[i].size(); ++j) { - ScoreComponentCollection featureValueDiff = oracleFeatureValues[i]; - featureValueDiff.MinusEquals(featureValues[i][j]); - - // cerr << "Rank " << rank << ", epoch " << epoch << ", feature value diff: " << featureValueDiff << endl; - if (featureValueDiff.GetL1Norm() == 0) { - cerr << "Rank " << rank << ", epoch " << epoch << ", features equal --> skip" << endl; - continue; - } - - float loss = losses[i][j]; - - // check if constraint is violated - bool violated = false; -// float modelScoreDiff = featureValueDiff.InnerProduct(currWeights); - float modelScoreDiff = oracleModelScores[i] - modelScores[i][j]; - float diff = 0; - - if (loss > modelScoreDiff) - diff = loss - modelScoreDiff; - cerr << "Rank " << rank << ", epoch " << epoch << ", constraint: " << modelScoreDiff << " >= " << loss << " (current violation: " << diff << ")" << endl; - if (diff > epsilon) - violated = true; - - if (m_normaliseMargin) { - modelScoreDiff = (2*m_sigmoidParam/(1 + exp(-modelScoreDiff))) - m_sigmoidParam; - loss = (2*m_sigmoidParam/(1 + exp(-loss))) - m_sigmoidParam; - diff = 0; - if (loss > modelScoreDiff) { - diff = loss - modelScoreDiff; - } - cerr << "Rank " << rank << ", epoch " << epoch << ", normalised constraint: " << modelScoreDiff << " >= " << loss << " (current violation: " << diff << ")" << endl; - } - - if (m_scale_margin) { - diff *= oracleBleuScores[i]; - cerr << "Rank " << rank << ", epoch " << epoch << ", scaling margin with oracle bleu score " << oracleBleuScores[i] << endl; - } - - featureValueDiffs.push_back(featureValueDiff); - lossMinusModelScoreDiffs.push_back(diff); - all_losses.push_back(loss); - if (violated) { - ++violatedConstraintsBefore; - oldDistanceFromOptimum += diff; - } - } - } - - // run optimisation: compute alphas for all given constraints - vector alphas; - ScoreComponentCollection summedUpdate; - if (violatedConstraintsBefore > 0) { - cerr << "Rank " << rank << ", epoch " << epoch << ", number of constraints passed to optimizer: " << - featureValueDiffs.size() << " (of which violated: " << violatedConstraintsBefore << ")" << endl; - if (m_slack != 0) { - alphas = Hildreth::optimise(featureValueDiffs, lossMinusModelScoreDiffs, m_slack); - } else { - alphas = Hildreth::optimise(featureValueDiffs, lossMinusModelScoreDiffs); - } - - // Update the weight vector according to the alphas and the feature value differences - // * w' = w' + SUM alpha_i * (h_i(oracle) - h_i(hypothesis)) - for (size_t k = 0; k < featureValueDiffs.size(); ++k) { - float alpha = alphas[k]; - cerr << "Rank " << rank << ", epoch " << epoch << ", alpha: " << alpha << endl; - ScoreComponentCollection update(featureValueDiffs[k]); - update.MultiplyEquals(alpha); - - // sum updates - summedUpdate.PlusEquals(update); - } - } - else { - cerr << "Rank " << rank << ", epoch " << epoch << ", no constraint violated for this batch" << endl; -// return 0; - return 1; - } - - // apply learning rate - if (learning_rate != 1) { - cerr << "Rank " << rank << ", epoch " << epoch << ", apply learning rate " << 
learning_rate << " to update." << endl; - summedUpdate.MultiplyEquals(learning_rate); - } - - // scale update by BLEU of oracle (for batch size 1 only) - if (oracleBleuScores.size() == 1) { - if (m_scale_update) { - cerr << "Rank " << rank << ", epoch " << epoch << ", scaling summed update with oracle bleu score " << oracleBleuScores[0] << endl; - summedUpdate.MultiplyEquals(oracleBleuScores[0]); - } - } - - // cerr << "Rank " << rank << ", epoch " << epoch << ", update: " << summedUpdate << endl; - weightUpdate.PlusEquals(summedUpdate); - - // Sanity check: are there still violated constraints after optimisation? -/* int violatedConstraintsAfter = 0; - float newDistanceFromOptimum = 0; - for (size_t i = 0; i < featureValueDiffs.size(); ++i) { - float modelScoreDiff = featureValueDiffs[i].InnerProduct(currWeights); - float loss = all_losses[i]; - float diff = loss - modelScoreDiff; - if (diff > epsilon) { - ++violatedConstraintsAfter; - newDistanceFromOptimum += diff; - } - } - VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", violated constraint before: " << violatedConstraintsBefore << ", after: " << violatedConstraintsAfter << ", change: " << violatedConstraintsBefore - violatedConstraintsAfter << endl); - VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", error before: " << oldDistanceFromOptimum << ", after: " << newDistanceFromOptimum << ", change: " << oldDistanceFromOptimum - newDistanceFromOptimum << endl);*/ -// return violatedConstraintsAfter; - return 0; -} - -size_t MiraOptimiser::updateWeightsHopeFear( - Moses::ScoreComponentCollection& weightUpdate, - const std::vector< std::vector >& featureValuesHope, - const std::vector< std::vector >& featureValuesFear, - const std::vector >& bleuScoresHope, - const std::vector >& bleuScoresFear, - const std::vector >& modelScoresHope, - const std::vector >& modelScoresFear, - float learning_rate, - size_t rank, - size_t epoch, - int updatePosition) { + ScoreComponentCollection& weightUpdate, + const vector >& featureValues, + const vector >& losses, + const vector >& bleuScores, + const vector >& modelScores, + const vector& oracleFeatureValues, + const vector oracleBleuScores, + const vector oracleModelScores, + float learning_rate, + size_t rank, + size_t epoch) +{ // vector of feature values differences for all created constraints vector featureValueDiffs; vector lossMinusModelScoreDiffs; - vector modelScoreDiffs; vector all_losses; - + // most violated constraint in batch ScoreComponentCollection max_batch_featureValueDiff; - + // Make constraints for new hypothesis translations float epsilon = 0.0001; int violatedConstraintsBefore = 0; float oldDistanceFromOptimum = 0; - // iterate over input sentences (1 (online) or more (batch)) - for (size_t i = 0; i < featureValuesHope.size(); ++i) { - if (updatePosition != -1) { - if (i < updatePosition) - continue; - else if (i > updatePosition) - break; - } - - // Pick all pairs[j,j] of hope and fear translations for one input sentence - for (size_t j = 0; j < featureValuesHope[i].size(); ++j) { - ScoreComponentCollection featureValueDiff = featureValuesHope[i][j]; - featureValueDiff.MinusEquals(featureValuesFear[i][j]); - //cerr << "Rank " << rank << ", epoch " << epoch << ", feature value diff: " << featureValueDiff << endl; + for (size_t i = 0; i < featureValues.size(); ++i) { + //size_t sentenceId = sentenceIds[i]; + // iterate over hypothesis translations for one input sentence + for (size_t j = 0; j < featureValues[i].size(); ++j) { + ScoreComponentCollection 
featureValueDiff = oracleFeatureValues[i]; + featureValueDiff.MinusEquals(featureValues[i][j]); + + // cerr << "Rank " << rank << ", epoch " << epoch << ", feature value diff: " << featureValueDiff << endl; if (featureValueDiff.GetL1Norm() == 0) { - cerr << "Rank " << rank << ", epoch " << epoch << ", features equal --> skip" << endl; - continue; + cerr << "Rank " << rank << ", epoch " << epoch << ", features equal --> skip" << endl; + continue; } - - float loss = bleuScoresHope[i][j] - bleuScoresFear[i][j]; - + + float loss = losses[i][j]; + // check if constraint is violated bool violated = false; - //float modelScoreDiff = featureValueDiff.InnerProduct(currWeights); - float modelScoreDiff = modelScoresHope[i][j] - modelScoresFear[i][j]; +// float modelScoreDiff = featureValueDiff.InnerProduct(currWeights); + float modelScoreDiff = oracleModelScores[i] - modelScores[i][j]; float diff = 0; - if (loss > modelScoreDiff) - diff = loss - modelScoreDiff; - cerr << "Rank " << rank << ", epoch " << epoch << ", constraint: " << modelScoreDiff << " >= " << loss << " (current violation: " << diff << ")" << endl; - - if (diff > epsilon) - violated = true; - + + if (loss > modelScoreDiff) + diff = loss - modelScoreDiff; + cerr << "Rank " << rank << ", epoch " << epoch << ", constraint: " << modelScoreDiff << " >= " << loss << " (current violation: " << diff << ")" << endl; + if (diff > epsilon) + violated = true; + if (m_normaliseMargin) { - modelScoreDiff = (2*m_sigmoidParam/(1 + exp(-modelScoreDiff))) - m_sigmoidParam; - loss = (2*m_sigmoidParam/(1 + exp(-loss))) - m_sigmoidParam; - diff = 0; - if (loss > modelScoreDiff) { - diff = loss - modelScoreDiff; - } - cerr << "Rank " << rank << ", epoch " << epoch << ", normalised constraint: " << modelScoreDiff << " >= " << loss << " (current violation: " << diff << ")" << endl; + modelScoreDiff = (2*m_sigmoidParam/(1 + exp(-modelScoreDiff))) - m_sigmoidParam; + loss = (2*m_sigmoidParam/(1 + exp(-loss))) - m_sigmoidParam; + diff = 0; + if (loss > modelScoreDiff) { + diff = loss - modelScoreDiff; + } + cerr << "Rank " << rank << ", epoch " << epoch << ", normalised constraint: " << modelScoreDiff << " >= " << loss << " (current violation: " << diff << ")" << endl; } - + if (m_scale_margin) { - diff *= bleuScoresHope[i][j]; - cerr << "Rank " << rank << ", epoch " << epoch << ", scaling margin with oracle bleu score " << bleuScoresHope[i][j] << endl; + diff *= oracleBleuScores[i]; + cerr << "Rank " << rank << ", epoch " << epoch << ", scaling margin with oracle bleu score " << oracleBleuScores[i] << endl; } - + featureValueDiffs.push_back(featureValueDiff); lossMinusModelScoreDiffs.push_back(diff); - modelScoreDiffs.push_back(modelScoreDiff); all_losses.push_back(loss); if (violated) { - ++violatedConstraintsBefore; - oldDistanceFromOptimum += diff; - } + ++violatedConstraintsBefore; + oldDistanceFromOptimum += diff; + } } } @@ -241,48 +92,198 @@ size_t MiraOptimiser::updateWeightsHopeFear( ScoreComponentCollection summedUpdate; if (violatedConstraintsBefore > 0) { cerr << "Rank " << rank << ", epoch " << epoch << ", number of constraints passed to optimizer: " << - featureValueDiffs.size() << " (of which violated: " << violatedConstraintsBefore << ")" << endl; + featureValueDiffs.size() << " (of which violated: " << violatedConstraintsBefore << ")" << endl; if (m_slack != 0) { alphas = Hildreth::optimise(featureValueDiffs, lossMinusModelScoreDiffs, m_slack); } else { alphas = Hildreth::optimise(featureValueDiffs, lossMinusModelScoreDiffs); } - + + // Update 
the weight vector according to the alphas and the feature value differences + // * w' = w' + SUM alpha_i * (h_i(oracle) - h_i(hypothesis)) + for (size_t k = 0; k < featureValueDiffs.size(); ++k) { + float alpha = alphas[k]; + cerr << "Rank " << rank << ", epoch " << epoch << ", alpha: " << alpha << endl; + ScoreComponentCollection update(featureValueDiffs[k]); + update.MultiplyEquals(alpha); + + // sum updates + summedUpdate.PlusEquals(update); + } + } else { + cerr << "Rank " << rank << ", epoch " << epoch << ", no constraint violated for this batch" << endl; +// return 0; + return 1; + } + + // apply learning rate + if (learning_rate != 1) { + cerr << "Rank " << rank << ", epoch " << epoch << ", apply learning rate " << learning_rate << " to update." << endl; + summedUpdate.MultiplyEquals(learning_rate); + } + + // scale update by BLEU of oracle (for batch size 1 only) + if (oracleBleuScores.size() == 1) { + if (m_scale_update) { + cerr << "Rank " << rank << ", epoch " << epoch << ", scaling summed update with oracle bleu score " << oracleBleuScores[0] << endl; + summedUpdate.MultiplyEquals(oracleBleuScores[0]); + } + } + + // cerr << "Rank " << rank << ", epoch " << epoch << ", update: " << summedUpdate << endl; + weightUpdate.PlusEquals(summedUpdate); + + // Sanity check: are there still violated constraints after optimisation? + /* int violatedConstraintsAfter = 0; + float newDistanceFromOptimum = 0; + for (size_t i = 0; i < featureValueDiffs.size(); ++i) { + float modelScoreDiff = featureValueDiffs[i].InnerProduct(currWeights); + float loss = all_losses[i]; + float diff = loss - modelScoreDiff; + if (diff > epsilon) { + ++violatedConstraintsAfter; + newDistanceFromOptimum += diff; + } + } + VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", violated constraint before: " << violatedConstraintsBefore << ", after: " << violatedConstraintsAfter << ", change: " << violatedConstraintsBefore - violatedConstraintsAfter << endl); + VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", error before: " << oldDistanceFromOptimum << ", after: " << newDistanceFromOptimum << ", change: " << oldDistanceFromOptimum - newDistanceFromOptimum << endl);*/ +// return violatedConstraintsAfter; + return 0; +} + +size_t MiraOptimiser::updateWeightsHopeFear( + Moses::ScoreComponentCollection& weightUpdate, + const std::vector< std::vector >& featureValuesHope, + const std::vector< std::vector >& featureValuesFear, + const std::vector >& bleuScoresHope, + const std::vector >& bleuScoresFear, + const std::vector >& modelScoresHope, + const std::vector >& modelScoresFear, + float learning_rate, + size_t rank, + size_t epoch, + int updatePosition) +{ + + // vector of feature values differences for all created constraints + vector featureValueDiffs; + vector lossMinusModelScoreDiffs; + vector modelScoreDiffs; + vector all_losses; + + // most violated constraint in batch + ScoreComponentCollection max_batch_featureValueDiff; + + // Make constraints for new hypothesis translations + float epsilon = 0.0001; + int violatedConstraintsBefore = 0; + float oldDistanceFromOptimum = 0; + + // iterate over input sentences (1 (online) or more (batch)) + for (size_t i = 0; i < featureValuesHope.size(); ++i) { + if (updatePosition != -1) { + if (i < updatePosition) + continue; + else if (i > updatePosition) + break; + } + + // Pick all pairs[j,j] of hope and fear translations for one input sentence + for (size_t j = 0; j < featureValuesHope[i].size(); ++j) { + ScoreComponentCollection featureValueDiff = 
featureValuesHope[i][j]; + featureValueDiff.MinusEquals(featureValuesFear[i][j]); + //cerr << "Rank " << rank << ", epoch " << epoch << ", feature value diff: " << featureValueDiff << endl; + if (featureValueDiff.GetL1Norm() == 0) { + cerr << "Rank " << rank << ", epoch " << epoch << ", features equal --> skip" << endl; + continue; + } + + float loss = bleuScoresHope[i][j] - bleuScoresFear[i][j]; + + // check if constraint is violated + bool violated = false; + //float modelScoreDiff = featureValueDiff.InnerProduct(currWeights); + float modelScoreDiff = modelScoresHope[i][j] - modelScoresFear[i][j]; + float diff = 0; + if (loss > modelScoreDiff) + diff = loss - modelScoreDiff; + cerr << "Rank " << rank << ", epoch " << epoch << ", constraint: " << modelScoreDiff << " >= " << loss << " (current violation: " << diff << ")" << endl; + + if (diff > epsilon) + violated = true; + + if (m_normaliseMargin) { + modelScoreDiff = (2*m_sigmoidParam/(1 + exp(-modelScoreDiff))) - m_sigmoidParam; + loss = (2*m_sigmoidParam/(1 + exp(-loss))) - m_sigmoidParam; + diff = 0; + if (loss > modelScoreDiff) { + diff = loss - modelScoreDiff; + } + cerr << "Rank " << rank << ", epoch " << epoch << ", normalised constraint: " << modelScoreDiff << " >= " << loss << " (current violation: " << diff << ")" << endl; + } + + if (m_scale_margin) { + diff *= bleuScoresHope[i][j]; + cerr << "Rank " << rank << ", epoch " << epoch << ", scaling margin with oracle bleu score " << bleuScoresHope[i][j] << endl; + } + + featureValueDiffs.push_back(featureValueDiff); + lossMinusModelScoreDiffs.push_back(diff); + modelScoreDiffs.push_back(modelScoreDiff); + all_losses.push_back(loss); + if (violated) { + ++violatedConstraintsBefore; + oldDistanceFromOptimum += diff; + } + } + } + + // run optimisation: compute alphas for all given constraints + vector alphas; + ScoreComponentCollection summedUpdate; + if (violatedConstraintsBefore > 0) { + cerr << "Rank " << rank << ", epoch " << epoch << ", number of constraints passed to optimizer: " << + featureValueDiffs.size() << " (of which violated: " << violatedConstraintsBefore << ")" << endl; + if (m_slack != 0) { + alphas = Hildreth::optimise(featureValueDiffs, lossMinusModelScoreDiffs, m_slack); + } else { + alphas = Hildreth::optimise(featureValueDiffs, lossMinusModelScoreDiffs); + } + // Update the weight vector according to the alphas and the feature value differences // * w' = w' + SUM alpha_i * (h_i(oracle) - h_i(hypothesis)) for (size_t k = 0; k < featureValueDiffs.size(); ++k) { float alpha = alphas[k]; cerr << "Rank " << rank << ", epoch " << epoch << ", alpha: " << alpha << endl; if (alpha != 0) { - // apply boosting factor - if (m_boost && modelScoreDiffs[k] <= 0) { - // factor between 1.5 and 3 (for Bleu scores between 5 and 20, the factor is within the boundaries) - float factor = min(1.5, log2(bleuScoresHope[0][0])); // TODO: make independent of number of oracles!! - factor = min(3.0f, factor); - alpha = alpha * factor; - cerr << "Rank " << rank << ", epoch " << epoch << ", apply boosting factor " << factor << " to update." << endl; - } - - ScoreComponentCollection update(featureValueDiffs[k]); - update.MultiplyEquals(alpha); - - // sum updates - summedUpdate.PlusEquals(update); + // apply boosting factor + if (m_boost && modelScoreDiffs[k] <= 0) { + // factor between 1.5 and 3 (for Bleu scores between 5 and 20, the factor is within the boundaries) + float factor = min(1.5, log2(bleuScoresHope[0][0])); // TODO: make independent of number of oracles!! 
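+          // note: min(1.5, log2(bleu)) already caps the factor at 1.5, so the
+          // min(3.0f, ...) on the next line never binds; the "between 1.5 and 3"
+          // range described in the comment above would require max() for the first clamp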
+ factor = min(3.0f, factor); + alpha = alpha * factor; + cerr << "Rank " << rank << ", epoch " << epoch << ", apply boosting factor " << factor << " to update." << endl; + } + + ScoreComponentCollection update(featureValueDiffs[k]); + update.MultiplyEquals(alpha); + + // sum updates + summedUpdate.PlusEquals(update); } } - } - else { + } else { cerr << "Rank " << rank << ", epoch " << epoch << ", no constraint violated for this batch" << endl; // return 0; return 1; } - + // apply learning rate if (learning_rate != 1) { cerr << "Rank " << rank << ", epoch " << epoch << ", apply learning rate " << learning_rate << " to update." << endl; summedUpdate.MultiplyEquals(learning_rate); } - + // scale update by BLEU of oracle (for batch size 1 only) if (featureValuesHope.size() == 1) { if (m_scale_update) { @@ -290,46 +291,47 @@ size_t MiraOptimiser::updateWeightsHopeFear( summedUpdate.MultiplyEquals(bleuScoresHope[0][0]); } } - + //cerr << "Rank " << rank << ", epoch " << epoch << ", update: " << summedUpdate << endl; weightUpdate.PlusEquals(summedUpdate); - + // Sanity check: are there still violated constraints after optimisation? -/* int violatedConstraintsAfter = 0; - float newDistanceFromOptimum = 0; - for (size_t i = 0; i < featureValueDiffs.size(); ++i) { - float modelScoreDiff = featureValueDiffs[i].InnerProduct(currWeights); - float loss = all_losses[i]; - float diff = loss - modelScoreDiff; - if (diff > epsilon) { - ++violatedConstraintsAfter; - newDistanceFromOptimum += diff; - } - } - VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", violated constraint before: " << violatedConstraintsBefore << ", after: " << violatedConstraintsAfter << ", change: " << violatedConstraintsBefore - violatedConstraintsAfter << endl); - VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", error before: " << oldDistanceFromOptimum << ", after: " << newDistanceFromOptimum << ", change: " << oldDistanceFromOptimum - newDistanceFromOptimum << endl);*/ + /* int violatedConstraintsAfter = 0; + float newDistanceFromOptimum = 0; + for (size_t i = 0; i < featureValueDiffs.size(); ++i) { + float modelScoreDiff = featureValueDiffs[i].InnerProduct(currWeights); + float loss = all_losses[i]; + float diff = loss - modelScoreDiff; + if (diff > epsilon) { + ++violatedConstraintsAfter; + newDistanceFromOptimum += diff; + } + } + VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", violated constraint before: " << violatedConstraintsBefore << ", after: " << violatedConstraintsAfter << ", change: " << violatedConstraintsBefore - violatedConstraintsAfter << endl); + VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", error before: " << oldDistanceFromOptimum << ", after: " << newDistanceFromOptimum << ", change: " << oldDistanceFromOptimum - newDistanceFromOptimum << endl);*/ // return violatedConstraintsAfter; - return 0; + return 0; } size_t MiraOptimiser::updateWeightsAnalytically( - ScoreComponentCollection& weightUpdate, - ScoreComponentCollection& featureValuesHope, - ScoreComponentCollection& featureValuesFear, - float bleuScoreHope, - float bleuScoreFear, - float modelScoreHope, - float modelScoreFear, - float learning_rate, - size_t rank, - size_t epoch) { + ScoreComponentCollection& weightUpdate, + ScoreComponentCollection& featureValuesHope, + ScoreComponentCollection& featureValuesFear, + float bleuScoreHope, + float bleuScoreFear, + float modelScoreHope, + float modelScoreFear, + float learning_rate, + size_t rank, + size_t epoch) +{ float epsilon = 0.0001; float oldDistanceFromOptimum = 0; bool 
constraintViolatedBefore = false; - // cerr << "Rank " << rank << ", epoch " << epoch << ", hope: " << featureValuesHope << endl; - // cerr << "Rank " << rank << ", epoch " << epoch << ", fear: " << featureValuesFear << endl; +// cerr << "Rank " << rank << ", epoch " << epoch << ", hope: " << featureValuesHope << endl; +// cerr << "Rank " << rank << ", epoch " << epoch << ", fear: " << featureValuesFear << endl; ScoreComponentCollection featureValueDiff = featureValuesHope; featureValueDiff.MinusEquals(featureValuesFear); if (featureValueDiff.GetL1Norm() == 0) { @@ -342,35 +344,35 @@ size_t MiraOptimiser::updateWeightsAnalytically( float modelScoreDiff = modelScoreHope - modelScoreFear; float loss = bleuScoreHope - bleuScoreFear; float diff = 0; - if (loss > modelScoreDiff) - diff = loss - modelScoreDiff; + if (loss > modelScoreDiff) + diff = loss - modelScoreDiff; cerr << "Rank " << rank << ", epoch " << epoch << ", constraint: " << modelScoreDiff << " >= " << loss << " (current violation: " << diff << ")" << endl; if (m_normaliseMargin) { modelScoreDiff = (2*m_sigmoidParam/(1 + exp(-modelScoreDiff))) - m_sigmoidParam; loss = (2*m_sigmoidParam/(1 + exp(-loss))) - m_sigmoidParam; - if (loss > modelScoreDiff) + if (loss > modelScoreDiff) diff = loss - modelScoreDiff; cerr << "Rank " << rank << ", epoch " << epoch << ", normalised constraint: " << modelScoreDiff << " >= " << loss << " (current violation: " << diff << ")" << endl; } - + if (m_scale_margin) { - diff *= bleuScoreHope; - cerr << "Rank " << rank << ", epoch " << epoch << ", scaling margin with oracle bleu score " << bleuScoreHope << endl; + diff *= bleuScoreHope; + cerr << "Rank " << rank << ", epoch " << epoch << ", scaling margin with oracle bleu score " << bleuScoreHope << endl; } if (m_scale_margin_precision) { - diff *= (1+m_precision); - cerr << "Rank " << rank << ", epoch " << epoch << ", scaling margin with 1+precision: " << (1+m_precision) << endl; + diff *= (1+m_precision); + cerr << "Rank " << rank << ", epoch " << epoch << ", scaling margin with 1+precision: " << (1+m_precision) << endl; } if (diff > epsilon) { - // squash it between 0 and 1 - //diff = tanh(diff); - //diff = (2/(1 + pow(2,-diff))) - 1; + // squash it between 0 and 1 + //diff = tanh(diff); + //diff = (2/(1 + pow(2,-diff))) - 1; /* if (m_normaliseMargin) { - diff = (2/(1 + exp(-diff))) - 1; - cerr << "Rank " << rank << ", epoch " << epoch << ", new margin: " << diff << endl; - }*/ + diff = (2/(1 + exp(-diff))) - 1; + cerr << "Rank " << rank << ", epoch " << epoch << ", new margin: " << diff << endl; + }*/ // constraint violated oldDistanceFromOptimum += diff; @@ -384,134 +386,134 @@ size_t MiraOptimiser::updateWeightsAnalytically( float alpha = diff / squaredNorm; cerr << "Rank " << rank << ", epoch " << epoch << ", unclipped alpha: " << alpha << endl; if (m_slack > 0 ) { - if (alpha > m_slack) { - alpha = m_slack; - } - else if (alpha < m_slack*(-1)) { - alpha = m_slack*(-1); - } + if (alpha > m_slack) { + alpha = m_slack; + } else if (alpha < m_slack*(-1)) { + alpha = m_slack*(-1); + } } // apply learning rate if (learning_rate != 1) - alpha = alpha * learning_rate; - + alpha = alpha * learning_rate; + if (m_scale_update) { - cerr << "Rank " << rank << ", epoch " << epoch << ", scaling update with oracle bleu score " << bleuScoreHope << endl; - alpha *= bleuScoreHope; + cerr << "Rank " << rank << ", epoch " << epoch << ", scaling update with oracle bleu score " << bleuScoreHope << endl; + alpha *= bleuScoreHope; } if (m_scale_update_precision) { - 
cerr << "Rank " << rank << ", epoch " << epoch << ", scaling update with 1+precision: " << (1+m_precision) << endl; - alpha *= (1+m_precision); + cerr << "Rank " << rank << ", epoch " << epoch << ", scaling update with 1+precision: " << (1+m_precision) << endl; + alpha *= (1+m_precision); } - + cerr << "Rank " << rank << ", epoch " << epoch << ", clipped/scaled alpha: " << alpha << endl; // apply boosting factor if (m_boost && modelScoreDiff <= 0) { - // factor between 1.5 and 3 (for Bleu scores between 5 and 20, the factor is within the boundaries) - float factor = min(1.5, log2(bleuScoreHope)); - factor = min(3.0f, factor); - alpha = alpha * factor; - cerr << "Rank " << rank << ", epoch " << epoch << ", boosted alpha: " << alpha << endl; + // factor between 1.5 and 3 (for Bleu scores between 5 and 20, the factor is within the boundaries) + float factor = min(1.5, log2(bleuScoreHope)); + factor = min(3.0f, factor); + alpha = alpha * factor; + cerr << "Rank " << rank << ", epoch " << epoch << ", boosted alpha: " << alpha << endl; } featureValueDiff.MultiplyEquals(alpha); weightUpdate.PlusEquals(featureValueDiff); // cerr << "Rank " << rank << ", epoch " << epoch << ", update: " << weightUpdate << endl; } - + if (!constraintViolatedBefore) { // constraint satisfied, nothing to do - cerr << "Rank " << rank << ", epoch " << epoch << ", constraint already satisfied" << endl; + cerr << "Rank " << rank << ", epoch " << epoch << ", constraint already satisfied" << endl; return 1; } // sanity check: constraint still violated after optimisation? -/* ScoreComponentCollection newWeights(currWeights); - newWeights.PlusEquals(weightUpdate); - bool constraintViolatedAfter = false; - float newDistanceFromOptimum = 0; - featureValueDiff = featureValuesHope; - featureValueDiff.MinusEquals(featureValuesFear); - modelScoreDiff = featureValueDiff.InnerProduct(newWeights); - diff = loss - modelScoreDiff; - // approximate comparison between floats! - if (diff > epsilon) { - constraintViolatedAfter = true; - newDistanceFromOptimum += (loss - modelScoreDiff); - } + /* ScoreComponentCollection newWeights(currWeights); + newWeights.PlusEquals(weightUpdate); + bool constraintViolatedAfter = false; + float newDistanceFromOptimum = 0; + featureValueDiff = featureValuesHope; + featureValueDiff.MinusEquals(featureValuesFear); + modelScoreDiff = featureValueDiff.InnerProduct(newWeights); + diff = loss - modelScoreDiff; + // approximate comparison between floats! + if (diff > epsilon) { + constraintViolatedAfter = true; + newDistanceFromOptimum += (loss - modelScoreDiff); + } - float hopeScore = featureValuesHope.InnerProduct(newWeights); - float fearScore = featureValuesFear.InnerProduct(newWeights); - cerr << "New hope score: " << hopeScore << endl; - cerr << "New fear score: " << fearScore << endl; + float hopeScore = featureValuesHope.InnerProduct(newWeights); + float fearScore = featureValuesFear.InnerProduct(newWeights); + cerr << "New hope score: " << hopeScore << endl; + cerr << "New fear score: " << fearScore << endl; - VERBOSE(0, "Rank " << rank << ", epoch " << epoch << ", check, constraint violated before? " << constraintViolatedBefore << ", after? " << constraintViolatedAfter << endl); - VERBOSE(0, "Rank " << rank << ", epoch " << epoch << ", check, error before: " << oldDistanceFromOptimum << ", after: " << newDistanceFromOptimum << ", change: " << oldDistanceFromOptimum - newDistanceFromOptimum << endl); -*/ + VERBOSE(0, "Rank " << rank << ", epoch " << epoch << ", check, constraint violated before? 
" << constraintViolatedBefore << ", after? " << constraintViolatedAfter << endl); + VERBOSE(0, "Rank " << rank << ", epoch " << epoch << ", check, error before: " << oldDistanceFromOptimum << ", after: " << newDistanceFromOptimum << ", change: " << oldDistanceFromOptimum - newDistanceFromOptimum << endl); + */ return 0; } size_t MiraOptimiser::updateWeightsHopeFearSelective( - Moses::ScoreComponentCollection& weightUpdate, - const std::vector< std::vector >& featureValuesHope, - const std::vector< std::vector >& featureValuesFear, - const std::vector >& bleuScoresHope, - const std::vector >& bleuScoresFear, - const std::vector >& modelScoresHope, - const std::vector >& modelScoresFear, - float learning_rate, - size_t rank, - size_t epoch, - int updatePosition) { + Moses::ScoreComponentCollection& weightUpdate, + const std::vector< std::vector >& featureValuesHope, + const std::vector< std::vector >& featureValuesFear, + const std::vector >& bleuScoresHope, + const std::vector >& bleuScoresFear, + const std::vector >& modelScoresHope, + const std::vector >& modelScoresFear, + float learning_rate, + size_t rank, + size_t epoch, + int updatePosition) +{ // vector of feature values differences for all created constraints vector nonZeroFeatures; vector lossMinusModelScoreDiffs; - + // Make constraints for new hypothesis translations float epsilon = 0.0001; int violatedConstraintsBefore = 0; - + // iterate over input sentences (1 (online) or more (batch)) for (size_t i = 0; i < featureValuesHope.size(); ++i) { if (updatePosition != -1) { if (i < updatePosition) - continue; + continue; else if (i > updatePosition) - break; + break; } - + // Pick all pairs[j,j] of hope and fear translations for one input sentence for (size_t j = 0; j < featureValuesHope[i].size(); ++j) { ScoreComponentCollection featureValueDiff = featureValuesHope[i][j]; featureValueDiff.MinusEquals(featureValuesFear[i][j]); if (featureValueDiff.GetL1Norm() == 0) { - cerr << "Rank " << rank << ", epoch " << epoch << ", features equal --> skip" << endl; - continue; + cerr << "Rank " << rank << ", epoch " << epoch << ", features equal --> skip" << endl; + continue; } - + // check if constraint is violated float loss = bleuScoresHope[i][j] - bleuScoresFear[i][j]; float modelScoreDiff = modelScoresHope[i][j] - modelScoresFear[i][j]; float diff = 0; - if (loss > modelScoreDiff) - diff = loss - modelScoreDiff; - if (diff > epsilon) - ++violatedConstraintsBefore; - cerr << "Rank " << rank << ", epoch " << epoch << ", constraint: " << modelScoreDiff << " >= " << loss << " (current violation: " << diff << ")" << endl; - + if (loss > modelScoreDiff) + diff = loss - modelScoreDiff; + if (diff > epsilon) + ++violatedConstraintsBefore; + cerr << "Rank " << rank << ", epoch " << epoch << ", constraint: " << modelScoreDiff << " >= " << loss << " (current violation: " << diff << ")" << endl; + // iterate over difference vector and add a constraint for every non-zero feature FVector features = featureValueDiff.GetScoresVector(); size_t n_core = 0, n_sparse = 0, n_sparse_hope = 0, n_sparse_fear = 0; for (size_t i=0; i nonZeroFeaturesHope; @@ -522,27 +524,26 @@ size_t MiraOptimiser::updateWeightsHopeFearSelective( f.Assign((i->first).name(), i->second); cerr << "Rank " << rank << ", epoch " << epoch << ", f: " << f << endl; - if (i->second > 0.0) { - ++n_sparse_hope; - nonZeroFeaturesHope.push_back(f); - } - else { - ++n_sparse_fear; - nonZeroFeaturesFear.push_back(f); - } + if (i->second > 0.0) { + ++n_sparse_hope; + 
nonZeroFeaturesHope.push_back(f); + } else { + ++n_sparse_fear; + nonZeroFeaturesFear.push_back(f); + } } } float n = n_core + n_sparse_hope + n_sparse_fear; for (size_t i=0; i 0) { cerr << "Rank " << rank << ", epoch " << epoch << ", number of constraints passed to optimizer: " << nonZeroFeatures.size() << endl; alphas = Hildreth::optimise(nonZeroFeatures, lossMinusModelScoreDiffs, m_slack); - + // Update the weight vector according to the alphas and the feature value differences // * w' = w' + SUM alpha_i * (h_i(oracle) - h_i(hypothesis)) for (size_t k = 0; k < nonZeroFeatures.size(); ++k) { float alpha = alphas[k]; cerr << "Rank " << rank << ", epoch " << epoch << ", alpha: " << alpha << endl; if (alpha != 0) { - ScoreComponentCollection update(nonZeroFeatures[k]); - update.MultiplyEquals(alpha); - - // sum updates - summedUpdate.PlusEquals(update); + ScoreComponentCollection update(nonZeroFeatures[k]); + update.MultiplyEquals(alpha); + + // sum updates + summedUpdate.PlusEquals(update); } } - } - else { + } else { cerr << "Rank " << rank << ", epoch " << epoch << ", no constraint violated for this batch" << endl; // return 0; return 1; } - + // apply learning rate if (learning_rate != 1) { cerr << "Rank " << rank << ", epoch " << epoch << ", apply learning rate " << learning_rate << " to update." << endl; summedUpdate.MultiplyEquals(learning_rate); } - + // scale update by BLEU of oracle (for batch size 1 only) if (featureValuesHope.size() == 1) { if (m_scale_update) { @@ -592,56 +592,57 @@ size_t MiraOptimiser::updateWeightsHopeFearSelective( summedUpdate.MultiplyEquals(bleuScoresHope[0][0]); } } - + //cerr << "Rank " << rank << ", epoch " << epoch << ", update: " << summedUpdate << endl; weightUpdate.PlusEquals(summedUpdate); return 0; } size_t MiraOptimiser::updateWeightsHopeFearSummed( - Moses::ScoreComponentCollection& weightUpdate, - const std::vector< std::vector >& featureValuesHope, - const std::vector< std::vector >& featureValuesFear, - const std::vector >& bleuScoresHope, - const std::vector >& bleuScoresFear, - const std::vector >& modelScoresHope, - const std::vector >& modelScoresFear, - float learning_rate, - size_t rank, - size_t epoch, - bool rescaleSlack, - bool makePairs) { + Moses::ScoreComponentCollection& weightUpdate, + const std::vector< std::vector >& featureValuesHope, + const std::vector< std::vector >& featureValuesFear, + const std::vector >& bleuScoresHope, + const std::vector >& bleuScoresFear, + const std::vector >& modelScoresHope, + const std::vector >& modelScoresFear, + float learning_rate, + size_t rank, + size_t epoch, + bool rescaleSlack, + bool makePairs) +{ // vector of feature values differences for all created constraints ScoreComponentCollection averagedFeatureDiffs; float averagedViolations = 0; - + // Make constraints for new hypothesis translations float epsilon = 0.0001; int violatedConstraintsBefore = 0; - + if (!makePairs) { ScoreComponentCollection featureValueDiff; float lossHope = 0, lossFear = 0, modelScoreHope = 0, modelScoreFear = 0, hopeCount = 0, fearCount = 0; // add all hope vectors for (size_t i = 0; i < featureValuesHope.size(); ++i) { for (size_t j = 0; j < featureValuesHope[i].size(); ++j) { - featureValueDiff.PlusEquals(featureValuesHope[i][j]); - lossHope += bleuScoresHope[i][j]; - modelScoreHope += modelScoresHope[i][j]; - ++hopeCount; + featureValueDiff.PlusEquals(featureValuesHope[i][j]); + lossHope += bleuScoresHope[i][j]; + modelScoreHope += modelScoresHope[i][j]; + ++hopeCount; } } lossHope /= hopeCount; 
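+    // likewise average the summed hope model score before subtracting the fear side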
modelScoreHope /= hopeCount; - + // subtract all fear vectors for (size_t i = 0; i < featureValuesFear.size(); ++i) { for (size_t j = 0; j < featureValuesFear[i].size(); ++j) { featureValueDiff.MinusEquals(featureValuesFear[i][j]); - lossFear += bleuScoresFear[i][j]; + lossFear += bleuScoresFear[i][j]; modelScoreFear += modelScoresFear[i][j]; - ++fearCount; + ++fearCount; } } lossFear /= fearCount; @@ -653,7 +654,7 @@ size_t MiraOptimiser::updateWeightsHopeFearSummed( return 1; } - // check if constraint is violated + // check if constraint is violated float lossDiff = lossHope - lossFear; float modelScoreDiff = modelScoreHope - modelScoreFear; float diff = 0; @@ -662,54 +663,52 @@ size_t MiraOptimiser::updateWeightsHopeFearSummed( if (diff > epsilon) ++violatedConstraintsBefore; cerr << "Rank " << rank << ", epoch " << epoch << ", constraint: " << modelScoreDiff << " >= " << lossDiff << " (current violation: " <<\ - diff << ")" << endl; + diff << ")" << endl; - // add constraint + // add constraint averagedFeatureDiffs = featureValueDiff; averagedViolations = diff; - } - else { - // iterate over input sentences (1 (online) or more (batch)) - for (size_t i = 0; i < featureValuesHope.size(); ++i) { - // Pick all pairs[j,j] of hope and fear translations for one input sentence and add them up - for (size_t j = 0; j < featureValuesHope[i].size(); ++j) { - ScoreComponentCollection featureValueDiff = featureValuesHope[i][j]; - featureValueDiff.MinusEquals(featureValuesFear[i][j]); - if (featureValueDiff.GetL1Norm() == 0) { - cerr << "Rank " << rank << ", epoch " << epoch << ", features equal --> skip" << endl; - continue; - } - - // check if constraint is violated - float lossDiff = bleuScoresHope[i][j] - bleuScoresFear[i][j]; - float modelScoreDiff = modelScoresHope[i][j] - modelScoresFear[i][j]; - if (rescaleSlack) { - cerr << "Rank " << rank << ", epoch " << epoch << ", modelScoreDiff scaled by lossDiff: " << modelScoreDiff << " --> " << modelScoreDiff*lossDiff << endl; - modelScoreDiff *= lossDiff; - } - float diff = 0; - if (lossDiff > modelScoreDiff) - diff = lossDiff - modelScoreDiff; - if (diff > epsilon) - ++violatedConstraintsBefore; - cerr << "Rank " << rank << ", epoch " << epoch << ", constraint: " << modelScoreDiff << " >= " << lossDiff << " (current violation: " << diff << ")" << endl; + } else { + // iterate over input sentences (1 (online) or more (batch)) + for (size_t i = 0; i < featureValuesHope.size(); ++i) { + // Pick all pairs[j,j] of hope and fear translations for one input sentence and add them up + for (size_t j = 0; j < featureValuesHope[i].size(); ++j) { + ScoreComponentCollection featureValueDiff = featureValuesHope[i][j]; + featureValueDiff.MinusEquals(featureValuesFear[i][j]); + if (featureValueDiff.GetL1Norm() == 0) { + cerr << "Rank " << rank << ", epoch " << epoch << ", features equal --> skip" << endl; + continue; + } - // add constraint - if (rescaleSlack) { - averagedFeatureDiffs.MultiplyEquals(lossDiff); - cerr << "Rank " << rank << ", epoch " << epoch << ", featureValueDiff scaled by lossDiff." 
<< endl; + // check if constraint is violated + float lossDiff = bleuScoresHope[i][j] - bleuScoresFear[i][j]; + float modelScoreDiff = modelScoresHope[i][j] - modelScoresFear[i][j]; + if (rescaleSlack) { + cerr << "Rank " << rank << ", epoch " << epoch << ", modelScoreDiff scaled by lossDiff: " << modelScoreDiff << " --> " << modelScoreDiff*lossDiff << endl; + modelScoreDiff *= lossDiff; + } + float diff = 0; + if (lossDiff > modelScoreDiff) + diff = lossDiff - modelScoreDiff; + if (diff > epsilon) + ++violatedConstraintsBefore; + cerr << "Rank " << rank << ", epoch " << epoch << ", constraint: " << modelScoreDiff << " >= " << lossDiff << " (current violation: " << diff << ")" << endl; + + // add constraint + if (rescaleSlack) { + averagedFeatureDiffs.MultiplyEquals(lossDiff); + cerr << "Rank " << rank << ", epoch " << epoch << ", featureValueDiff scaled by lossDiff." << endl; + } + averagedFeatureDiffs.PlusEquals(featureValueDiff); + averagedViolations += diff; } - averagedFeatureDiffs.PlusEquals(featureValueDiff); - averagedViolations += diff; - } - } + } } // divide by number of constraints (1/n) if (!makePairs) { averagedFeatureDiffs.DivideEquals(featureValuesHope[0].size()); - } - else { + } else { averagedFeatureDiffs.DivideEquals(featureValuesHope[0].size()); averagedViolations /= featureValuesHope[0].size(); } @@ -717,29 +716,27 @@ size_t MiraOptimiser::updateWeightsHopeFearSummed( cerr << "Rank " << rank << ", epoch " << epoch << ", averaged violations: " << averagedViolations << endl; if (violatedConstraintsBefore > 0) { - // compute alpha for given constraint: (loss diff - model score diff) / || feature value diff ||^2 - // featureValueDiff.GetL2Norm() * featureValueDiff.GetL2Norm() == featureValueDiff.InnerProduct(featureValueDiff) - // from Crammer&Singer 2006: alpha = min {C , l_t/ ||x||^2} + // compute alpha for given constraint: (loss diff - model score diff) / || feature value diff ||^2 + // featureValueDiff.GetL2Norm() * featureValueDiff.GetL2Norm() == featureValueDiff.InnerProduct(featureValueDiff) + // from Crammer&Singer 2006: alpha = min {C , l_t/ ||x||^2} // adjusted for 1 slack according to Joachims 2009, OP4 (margin rescaling), OP5 (slack rescaling) float squaredNorm = averagedFeatureDiffs.GetL2Norm() * averagedFeatureDiffs.GetL2Norm(); float alpha = averagedViolations / squaredNorm; cerr << "Rank " << rank << ", epoch " << epoch << ", unclipped alpha: " << alpha << endl; if (m_slack > 0 ) { if (alpha > m_slack) { - alpha = m_slack; - } - else if (alpha < m_slack*(-1)) { - alpha = m_slack*(-1); + alpha = m_slack; + } else if (alpha < m_slack*(-1)) { + alpha = m_slack*(-1); } } cerr << "Rank " << rank << ", epoch " << epoch << ", clipped alpha: " << alpha << endl; - + // compute update averagedFeatureDiffs.MultiplyEquals(alpha); weightUpdate.PlusEquals(averagedFeatureDiffs); return 0; - } - else { + } else { cerr << "Rank " << rank << ", epoch " << epoch << ", no constraint violated for this batch" << endl; return 1; } diff --git a/mira/Optimiser.h b/mira/Optimiser.h index a610268f3..d8afb8a0a 100644 --- a/mira/Optimiser.h +++ b/mira/Optimiser.h @@ -24,151 +24,155 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #include "moses/ScoreComponentCollection.h" -namespace Mira { - - class Optimiser { - public: - Optimiser() {} - - virtual size_t updateWeightsHopeFear( - Moses::ScoreComponentCollection& weightUpdate, - const std::vector >& featureValuesHope, - const std::vector >& featureValuesFear, - const std::vector >& bleuScoresHope, - 
const std::vector >& bleuScoresFear, - const std::vector >& modelScoresHope, - const std::vector >& modelScoresFear, - float learning_rate, - size_t rank, - size_t epoch, - int updatePosition = -1) = 0; - }; - - class Perceptron : public Optimiser { - public: - virtual size_t updateWeightsHopeFear( - Moses::ScoreComponentCollection& weightUpdate, - const std::vector >& featureValuesHope, - const std::vector >& featureValuesFear, - const std::vector >& bleuScoresHope, - const std::vector >& bleuScoresFear, - const std::vector >& modelScoresHope, - const std::vector >& modelScoresFear, - float learning_rate, - size_t rank, - size_t epoch, - int updatePosition = -1); - }; +namespace Mira +{ - class MiraOptimiser : public Optimiser { - public: +class Optimiser +{ +public: + Optimiser() {} + + virtual size_t updateWeightsHopeFear( + Moses::ScoreComponentCollection& weightUpdate, + const std::vector >& featureValuesHope, + const std::vector >& featureValuesFear, + const std::vector >& bleuScoresHope, + const std::vector >& bleuScoresFear, + const std::vector >& modelScoresHope, + const std::vector >& modelScoresFear, + float learning_rate, + size_t rank, + size_t epoch, + int updatePosition = -1) = 0; +}; + +class Perceptron : public Optimiser +{ +public: + virtual size_t updateWeightsHopeFear( + Moses::ScoreComponentCollection& weightUpdate, + const std::vector >& featureValuesHope, + const std::vector >& featureValuesFear, + const std::vector >& bleuScoresHope, + const std::vector >& bleuScoresFear, + const std::vector >& modelScoresHope, + const std::vector >& modelScoresFear, + float learning_rate, + size_t rank, + size_t epoch, + int updatePosition = -1); +}; + +class MiraOptimiser : public Optimiser +{ +public: MiraOptimiser() : Optimiser() { } - - MiraOptimiser( - float slack, bool scale_margin, bool scale_margin_precision, - bool scale_update, bool scale_update_precision, bool boost, bool normaliseMargin, float sigmoidParam) : - Optimiser(), - m_slack(slack), - m_scale_margin(scale_margin), - m_scale_margin_precision(scale_margin_precision), - m_scale_update(scale_update), - m_scale_update_precision(scale_update_precision), - m_precision(1), - m_boost(boost), - m_normaliseMargin(normaliseMargin), - m_sigmoidParam(sigmoidParam) { } - - size_t updateWeights( - Moses::ScoreComponentCollection& weightUpdate, - const std::vector >& featureValues, - const std::vector >& losses, - const std::vector >& bleuScores, - const std::vector >& modelScores, - const std::vector< Moses::ScoreComponentCollection>& oracleFeatureValues, - const std::vector< float> oracleBleuScores, - const std::vector< float> oracleModelScores, - float learning_rate, - size_t rank, - size_t epoch); - virtual size_t updateWeightsHopeFear( - Moses::ScoreComponentCollection& weightUpdate, - const std::vector >& featureValuesHope, - const std::vector >& featureValuesFear, - const std::vector >& bleuScoresHope, - const std::vector >& bleuScoresFear, - const std::vector >& modelScoresHope, - const std::vector >& modelScoresFear, - float learning_rate, - size_t rank, - size_t epoch, - int updatePosition = -1); - size_t updateWeightsHopeFearSelective( - Moses::ScoreComponentCollection& weightUpdate, - const std::vector >& featureValuesHope, - const std::vector >& featureValuesFear, - const std::vector >& bleuScoresHope, - const std::vector >& bleuScoresFear, - const std::vector >& modelScoresHope, - const std::vector >& modelScoresFear, - float learning_rate, - size_t rank, - size_t epoch, - int updatePosition = -1); - size_t 
updateWeightsHopeFearSummed( - Moses::ScoreComponentCollection& weightUpdate, - const std::vector >& featureValuesHope, - const std::vector >& featureValuesFear, - const std::vector >& bleuScoresHope, - const std::vector >& bleuScoresFear, - const std::vector >& modelScoresHope, - const std::vector >& modelScoresFear, - float learning_rate, - size_t rank, - size_t epoch, - bool rescaleSlack, - bool makePairs); - size_t updateWeightsAnalytically( - Moses::ScoreComponentCollection& weightUpdate, - Moses::ScoreComponentCollection& featureValuesHope, - Moses::ScoreComponentCollection& featureValuesFear, - float bleuScoreHope, - float bleuScoreFear, - float modelScoreHope, - float modelScoreFear, - float learning_rate, - size_t rank, - size_t epoch); - void setSlack(float slack) { - m_slack = slack; - } - - void setPrecision(float precision) { - m_precision = precision; - } - - private: - // regularise Hildreth updates - float m_slack; - - // scale margin with BLEU score or precision - bool m_scale_margin, m_scale_margin_precision; - - // scale update with oracle BLEU score or precision - bool m_scale_update, m_scale_update_precision; - - float m_precision; - - // boosting of updates on misranked candidates - bool m_boost; - - // squash margin between 0 and 1 (or depending on m_sigmoidParam) - bool m_normaliseMargin; - - // y=sigmoidParam is the axis that this sigmoid approaches - float m_sigmoidParam ; - }; + MiraOptimiser( + float slack, bool scale_margin, bool scale_margin_precision, + bool scale_update, bool scale_update_precision, bool boost, bool normaliseMargin, float sigmoidParam) : + Optimiser(), + m_slack(slack), + m_scale_margin(scale_margin), + m_scale_margin_precision(scale_margin_precision), + m_scale_update(scale_update), + m_scale_update_precision(scale_update_precision), + m_precision(1), + m_boost(boost), + m_normaliseMargin(normaliseMargin), + m_sigmoidParam(sigmoidParam) { } + + size_t updateWeights( + Moses::ScoreComponentCollection& weightUpdate, + const std::vector >& featureValues, + const std::vector >& losses, + const std::vector >& bleuScores, + const std::vector >& modelScores, + const std::vector< Moses::ScoreComponentCollection>& oracleFeatureValues, + const std::vector< float> oracleBleuScores, + const std::vector< float> oracleModelScores, + float learning_rate, + size_t rank, + size_t epoch); + virtual size_t updateWeightsHopeFear( + Moses::ScoreComponentCollection& weightUpdate, + const std::vector >& featureValuesHope, + const std::vector >& featureValuesFear, + const std::vector >& bleuScoresHope, + const std::vector >& bleuScoresFear, + const std::vector >& modelScoresHope, + const std::vector >& modelScoresFear, + float learning_rate, + size_t rank, + size_t epoch, + int updatePosition = -1); + size_t updateWeightsHopeFearSelective( + Moses::ScoreComponentCollection& weightUpdate, + const std::vector >& featureValuesHope, + const std::vector >& featureValuesFear, + const std::vector >& bleuScoresHope, + const std::vector >& bleuScoresFear, + const std::vector >& modelScoresHope, + const std::vector >& modelScoresFear, + float learning_rate, + size_t rank, + size_t epoch, + int updatePosition = -1); + size_t updateWeightsHopeFearSummed( + Moses::ScoreComponentCollection& weightUpdate, + const std::vector >& featureValuesHope, + const std::vector >& featureValuesFear, + const std::vector >& bleuScoresHope, + const std::vector >& bleuScoresFear, + const std::vector >& modelScoresHope, + const std::vector >& modelScoresFear, + float learning_rate, + size_t 
rank, + size_t epoch, + bool rescaleSlack, + bool makePairs); + size_t updateWeightsAnalytically( + Moses::ScoreComponentCollection& weightUpdate, + Moses::ScoreComponentCollection& featureValuesHope, + Moses::ScoreComponentCollection& featureValuesFear, + float bleuScoreHope, + float bleuScoreFear, + float modelScoreHope, + float modelScoreFear, + float learning_rate, + size_t rank, + size_t epoch); + + void setSlack(float slack) { + m_slack = slack; + } + + void setPrecision(float precision) { + m_precision = precision; + } + +private: + // regularise Hildreth updates + float m_slack; + + // scale margin with BLEU score or precision + bool m_scale_margin, m_scale_margin_precision; + + // scale update with oracle BLEU score or precision + bool m_scale_update, m_scale_update_precision; + + float m_precision; + + // boosting of updates on misranked candidates + bool m_boost; + + // squash margin between 0 and 1 (or depending on m_sigmoidParam) + bool m_normaliseMargin; + + // y=sigmoidParam is the asymptote that this sigmoid approaches + float m_sigmoidParam; +}; } #endif diff --git a/mira/Perceptron.cpp b/mira/Perceptron.cpp index 569a83216..af61c28a9 100644 --- a/mira/Perceptron.cpp +++ b/mira/Perceptron.cpp @@ -22,30 +22,31 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA using namespace Moses; using namespace std; -namespace Mira { +namespace Mira +{ size_t Perceptron::updateWeightsHopeFear( - ScoreComponentCollection& weightUpdate, - const vector< vector >& featureValuesHope, - const vector< vector >& featureValuesFear, - const vector< vector >& dummy1, - const vector< vector >& dummy2, - const vector< vector >& dummy3, - const vector< vector >& dummy4, - float perceptron_learning_rate, - size_t rank, - size_t epoch, - int updatePosition) + ScoreComponentCollection& weightUpdate, + const vector< vector >& featureValuesHope, + const vector< vector >& featureValuesFear, + const vector< vector >& dummy1, + const vector< vector >& dummy2, + const vector< vector >& dummy3, + const vector< vector >& dummy4, + float perceptron_learning_rate, + size_t rank, + size_t epoch, + int updatePosition) { - cerr << "Rank " << rank << ", epoch " << epoch << ", hope: " << featureValuesHope[0][0] << endl; - cerr << "Rank " << rank << ", epoch " << epoch << ", fear: " << featureValuesFear[0][0] << endl; - ScoreComponentCollection featureValueDiff = featureValuesHope[0][0]; - featureValueDiff.MinusEquals(featureValuesFear[0][0]); - cerr << "Rank " << rank << ", epoch " << epoch << ", hope - fear: " << featureValueDiff << endl; - featureValueDiff.MultiplyEquals(perceptron_learning_rate); - weightUpdate.PlusEquals(featureValueDiff); - cerr << "Rank " << rank << ", epoch " << epoch << ", update: " << featureValueDiff << endl; - return 0; + cerr << "Rank " << rank << ", epoch " << epoch << ", hope: " << featureValuesHope[0][0] << endl; + cerr << "Rank " << rank << ", epoch " << epoch << ", fear: " << featureValuesFear[0][0] << endl; + ScoreComponentCollection featureValueDiff = featureValuesHope[0][0]; + featureValueDiff.MinusEquals(featureValuesFear[0][0]); + cerr << "Rank " << rank << ", epoch " << epoch << ", hope - fear: " << featureValueDiff << endl; + featureValueDiff.MultiplyEquals(perceptron_learning_rate); + weightUpdate.PlusEquals(featureValueDiff); + cerr << "Rank " << rank << ", epoch " << epoch << ", update: " << featureValueDiff << endl; + return 0; } }
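The hope/fear perceptron above is the simplest of the Mira optimisers: ignoring the four dummy score vectors, the update is just the learning rate times the difference between the hope and the fear feature vectors, w += eta * (phi(hope) - phi(fear)). A minimal stand-alone sketch of the same rule, using a plain std::vector<float> in place of Moses::ScoreComponentCollection (the function name and types here are illustrative, not part of the Moses API):

#include <cstddef>
#include <vector>

// Structured-perceptron step on dense feature vectors: a toy stand-in for
// ScoreComponentCollection's MinusEquals / MultiplyEquals / PlusEquals.
// Assumes all three vectors share the same dimensionality.
void perceptronUpdate(std::vector<float> &weights,
                      const std::vector<float> &hopeFeatures,
                      const std::vector<float> &fearFeatures,
                      float learningRate)
{
  // weights += learningRate * (hope - fear); a no-op when hope == fear.
  for (std::size_t i = 0; i < weights.size(); ++i)
    weights[i] += learningRate * (hopeFeatures[i] - fearFeatures[i]);
}

The MiraOptimiser variants declared above replace this fixed-size step with one chosen by the Hildreth QP, regularised by m_slack (cf. the "regularise Hildreth updates" comment).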
diff --git a/misc/processLexicalTableMin.cpp b/misc/processLexicalTableMin.cpp index 3f2887e55..04f6590b1 100644 --- a/misc/processLexicalTableMin.cpp +++ b/misc/processLexicalTableMin.cpp @@ -20,7 +20,7 @@ void printHelp(char **argv) "\t-T string -- path to temporary directory (uses /tmp by default)\n" #ifdef WITH_THREADS "\t-threads int|all -- number of threads used for conversion\n" -#endif +#endif "\n advanced:\n" "\t-landmark int -- use landmark phrase every 2^n phrases\n" "\t-fingerprint int -- number of bits used for phrase fingerprints\n" @@ -44,11 +44,11 @@ void printHelp(char **argv) int main(int argc, char** argv) { - + std::string inFilePath; std::string outFilePath("out"); std::string tempfilePath; - + size_t orderBits = 10; size_t fingerPrintBits = 16; bool multipleScoreTrees = true; @@ -56,52 +56,36 @@ int main(int argc, char** argv) #ifdef WITH_THREADS size_t threads = 1; -#endif +#endif - if(1 >= argc) - { + if(1 >= argc) { printHelp(argv); return 1; } - for(int i = 1; i < argc; ++i) - { + for(int i = 1; i < argc; ++i) { std::string arg(argv[i]); - if("-in" == arg && i+1 < argc) - { + if("-in" == arg && i+1 < argc) { ++i; inFilePath = argv[i]; - } - else if("-out" == arg && i+1 < argc) - { + } else if("-out" == arg && i+1 < argc) { ++i; outFilePath = argv[i]; - } - else if("-T" == arg && i+1 < argc) { + } else if("-T" == arg && i+1 < argc) { ++i; tempfilePath = argv[i]; util::NormalizeTempPrefix(tempfilePath); - } - else if("-landmark" == arg && i+1 < argc) - { + } else if("-landmark" == arg && i+1 < argc) { ++i; orderBits = atoi(argv[i]); - } - else if("-fingerprint" == arg && i+1 < argc) - { + } else if("-fingerprint" == arg && i+1 < argc) { ++i; fingerPrintBits = atoi(argv[i]); - } - else if("-join-scores" == arg) - { + } else if("-join-scores" == arg) { multipleScoreTrees = false; - } - else if("-quantize" == arg && i+1 < argc) - { + } else if("-quantize" == arg && i+1 < argc) { ++i; quantize = atoi(argv[i]); - } - else if("-threads" == arg && i+1 < argc) - { + } else if("-threads" == arg && i+1 < argc) { #ifdef WITH_THREADS ++i; if(std::string(argv[i]) == "all") { @@ -109,23 +93,20 @@ int main(int argc, char** argv) if(!threads) { std::cerr << "Could not determine number of hardware threads, setting to 1" << std::endl; threads = 1; - } - } - else + } + } else threads = atoi(argv[i]); #else std::cerr << "Thread support not compiled in" << std::endl; exit(1); #endif - } - else - { + } else { //something's wrong... 
print help printHelp(argv); return 1; } } - + if(outFilePath.rfind(".minlexr") != outFilePath.size() - 8) outFilePath += ".minlexr"; @@ -135,6 +116,6 @@ int main(int argc, char** argv) multipleScoreTrees, quantize #ifdef WITH_THREADS , threads -#endif +#endif ); } diff --git a/misc/processPhraseTableMin.cpp b/misc/processPhraseTableMin.cpp index 5c25262b8..1ea42252c 100644 --- a/misc/processPhraseTableMin.cpp +++ b/misc/processPhraseTableMin.cpp @@ -2,7 +2,7 @@ #ifdef WITH_THREADS #include -#endif +#endif #include "moses/TypeDef.h" #include "moses/TranslationModel/CompactPT/PhraseTableCreator.h" @@ -11,7 +11,8 @@ using namespace Moses; -void printHelp(char **argv) { +void printHelp(char **argv) +{ std::cerr << "Usage " << argv[0] << ":\n" " options: \n" "\t-in string -- input table file name\n" @@ -21,7 +22,7 @@ void printHelp(char **argv) { "\t-no-alignment-info -- do not include alignment info in the binary phrase table\n" #ifdef WITH_THREADS "\t-threads int|all -- number of threads used for conversion\n" -#endif +#endif "\n advanced:\n" "\t-encoding string -- encoding type: PREnc REnc None (default PREnc)\n" "\t-rankscore int -- score index of P(t|s) (default 2)\n" @@ -48,14 +49,15 @@ void printHelp(char **argv) { } -int main(int argc, char **argv) { - +int main(int argc, char **argv) +{ + std::string inFilePath; std::string outFilePath("out"); std::string tempfilePath; PhraseTableCreator::Coding coding = PhraseTableCreator::PREnc; - - size_t numScoreComponent = 5; + + size_t numScoreComponent = 5; size_t orderBits = 10; size_t fingerprintBits = 16; bool useAlignmentInfo = true; @@ -63,10 +65,10 @@ int main(int argc, char **argv) { size_t quantize = 0; size_t maxRank = 100; bool sortScoreIndexSet = false; - size_t sortScoreIndex = 2; + size_t sortScoreIndex = 2; bool warnMe = true; size_t threads = 1; - + if(1 >= argc) { printHelp(argv); return 1; @@ -76,64 +78,49 @@ int main(int argc, char **argv) { if("-in" == arg && i+1 < argc) { ++i; inFilePath = argv[i]; - } - else if("-out" == arg && i+1 < argc) { + } else if("-out" == arg && i+1 < argc) { ++i; outFilePath = argv[i]; - } - else if("-T" == arg && i+1 < argc) { + } else if("-T" == arg && i+1 < argc) { ++i; tempfilePath = argv[i]; util::NormalizeTempPrefix(tempfilePath); - } - else if("-encoding" == arg && i+1 < argc) { + } else if("-encoding" == arg && i+1 < argc) { ++i; std::string val(argv[i]); if(val == "None" || val == "none") { coding = PhraseTableCreator::None; - } - else if(val == "REnc" || val == "renc") { + } else if(val == "REnc" || val == "renc") { coding = PhraseTableCreator::REnc; - } - else if(val == "PREnc" || val == "prenc") { + } else if(val == "PREnc" || val == "prenc") { coding = PhraseTableCreator::PREnc; } - } - else if("-maxrank" == arg && i+1 < argc) { + } else if("-maxrank" == arg && i+1 < argc) { ++i; maxRank = atoi(argv[i]); - } - else if("-nscores" == arg && i+1 < argc) { + } else if("-nscores" == arg && i+1 < argc) { ++i; numScoreComponent = atoi(argv[i]); - } - else if("-rankscore" == arg && i+1 < argc) { + } else if("-rankscore" == arg && i+1 < argc) { ++i; sortScoreIndex = atoi(argv[i]); sortScoreIndexSet = true; - } - else if("-no-alignment-info" == arg) { + } else if("-no-alignment-info" == arg) { useAlignmentInfo = false; - } - else if("-landmark" == arg && i+1 < argc) { + } else if("-landmark" == arg && i+1 < argc) { ++i; orderBits = atoi(argv[i]); - } - else if("-fingerprint" == arg && i+1 < argc) { + } else if("-fingerprint" == arg && i+1 < argc) { ++i; fingerprintBits = atoi(argv[i]); - } - else 
if("-join-scores" == arg) { + } else if("-join-scores" == arg) { multipleScoreTrees = false; - } - else if("-quantize" == arg && i+1 < argc) { + } else if("-quantize" == arg && i+1 < argc) { ++i; quantize = atoi(argv[i]); - } - else if("-no-warnings" == arg) { + } else if("-no-warnings" == arg) { warnMe = false; - } - else if("-threads" == arg && i+1 < argc) { + } else if("-threads" == arg && i+1 < argc) { #ifdef WITH_THREADS ++i; if(std::string(argv[i]) == "all") { @@ -141,40 +128,36 @@ int main(int argc, char **argv) { if(!threads) { std::cerr << "Could not determine number of hardware threads, setting to 1" << std::endl; threads = 1; - } - } - else + } + } else threads = atoi(argv[i]); #else std::cerr << "Thread support not compiled in" << std::endl; exit(1); #endif - } - else { + } else { //something's wrong... print help printHelp(argv); return 1; } } - - if(!sortScoreIndexSet && numScoreComponent != 5 && coding == PhraseTableCreator::PREnc) - { + + if(!sortScoreIndexSet && numScoreComponent != 5 && coding == PhraseTableCreator::PREnc) { std::cerr << "WARNING: You are using a nonstandard number of scores (" << numScoreComponent << ") with PREnc. Set the index of P(t|s) " "with -rankscore int if it is not " << sortScoreIndex << "." << std::endl; } - - if(sortScoreIndex >= numScoreComponent) - { + + if(sortScoreIndex >= numScoreComponent) { std::cerr << "ERROR: -rankscore " << sortScoreIndex << " is out of range (0 ... " << (numScoreComponent-1) << ")" << std::endl; abort(); } - + if(outFilePath.rfind(".minphr") != outFilePath.size() - 7) outFilePath += ".minphr"; - + PhraseTableCreator(inFilePath, outFilePath, tempfilePath, numScoreComponent, sortScoreIndex, coding, orderBits, fingerprintBits, @@ -182,6 +165,6 @@ int main(int argc, char **argv) { quantize, maxRank, warnMe #ifdef WITH_THREADS , threads -#endif - ); +#endif + ); } diff --git a/misc/queryPhraseTable.cpp b/misc/queryPhraseTable.cpp index d8103f371..5e4f7755a 100644 --- a/misc/queryPhraseTable.cpp +++ b/misc/queryPhraseTable.cpp @@ -33,8 +33,7 @@ int main(int argc, char **argv) needAlignments = true; } else if (!strcmp(argv[i], "-c")) { reportCounts = true; - } - else + } else usage(); } diff --git a/misc/queryPhraseTableMin.cpp b/misc/queryPhraseTableMin.cpp index f4dca8b6b..6b6f9beaf 100644 --- a/misc/queryPhraseTableMin.cpp +++ b/misc/queryPhraseTableMin.cpp @@ -36,8 +36,7 @@ int main(int argc, char **argv) useAlignments = true; } else if (!strcmp(argv[i], "-c")) { reportCounts = true; - } - else + } else usage(); } @@ -47,28 +46,28 @@ int main(int argc, char **argv) std::vector input(1, 0); std::vector output(1, 0); std::vector weight(nscores, 0); - + Parameter *parameter = new Parameter(); const_cast&>(parameter->GetParam("factor-delimiter")).resize(1, "||dummy_string||"); const_cast&>(parameter->GetParam("input-factors")).resize(1, "0"); const_cast&>(parameter->GetParam("verbose")).resize(1, "0"); const_cast&>(parameter->GetParam("weight-w")).resize(1, "0"); const_cast&>(parameter->GetParam("weight-d")).resize(1, "0"); - + StaticData::InstanceNonConst().LoadData(parameter); PhraseDictionaryCompact pdc("input-factor=0 output-factor=0 num-features=5 path=" + ttable); - bool ret = pdc.InitDictionary(); + bool ret = pdc.InitDictionary(); assert(ret); - + std::string line; while(getline(std::cin, line)) { Phrase sourcePhrase; sourcePhrase.CreateFromString(Input, input, line, "||dummy_string||", NULL); - + TargetPhraseVectorPtr decodedPhraseColl - = pdc.GetTargetPhraseCollectionRaw(sourcePhrase); - + = 
pdc.GetTargetPhraseCollectionRaw(sourcePhrase); + if(decodedPhraseColl != NULL) { if(reportCounts) std::cout << sourcePhrase << decodedPhraseColl->size() << std::endl; @@ -77,19 +76,18 @@ int main(int argc, char **argv) TargetPhrase &tp = *it; std::cout << sourcePhrase << "||| "; std::cout << static_cast(tp) << "|||"; - + if(useAlignments) - std::cout << " " << tp.GetAlignTerm() << "|||"; - + std::cout << " " << tp.GetAlignTerm() << "|||"; + std::vector scores = tp.GetScoreBreakdown().GetScoresForProducer(&pdc); for(size_t i = 0; i < scores.size(); i++) std::cout << " " << exp(scores[i]); std::cout << std::endl; } - } - else if(reportCounts) + } else if(reportCounts) std::cout << sourcePhrase << 0 << std::endl; - + std::cout.flush(); } } diff --git a/moses-chart-cmd/IOWrapper.cpp b/moses-chart-cmd/IOWrapper.cpp index 92f68a84d..b130943a1 100644 --- a/moses-chart-cmd/IOWrapper.cpp +++ b/moses-chart-cmd/IOWrapper.cpp @@ -138,7 +138,8 @@ IOWrapper::~IOWrapper() delete m_alignmentInfoCollector; } -void IOWrapper::ResetTranslationId() { +void IOWrapper::ResetTranslationId() +{ m_translationId = StaticData::Instance().GetStartTranslationId(); } @@ -174,7 +175,7 @@ void OutputSurface(std::ostream &out, const Phrase &phrase, const std::vector &prevHypos = hypo.GetPrevHypos(); @@ -276,7 +277,7 @@ void IOWrapper::ReconstructApplicationContext(const ChartHypothesis &hypo, // but there are scripts and tools that expect the output of -T to look like // that. void IOWrapper::WriteApplicationContext(std::ostream &out, - const ApplicationContext &context) + const ApplicationContext &context) { assert(!context.empty()); ApplicationContext::const_reverse_iterator p = context.rbegin(); @@ -327,7 +328,7 @@ void IOWrapper::OutputDetailedTranslationReport( CHECK(m_detailOutputCollector); m_detailOutputCollector->Write(translationId, out.str()); } - + void IOWrapper::OutputBestHypo(const ChartHypothesis *hypo, long translationId) { @@ -344,18 +345,18 @@ void IOWrapper::OutputBestHypo(const ChartHypothesis *hypo, long translationId) if (StaticData::Instance().GetOutputHypoScore()) { out << hypo->GetTotalScore() << " "; } - + if (StaticData::Instance().IsPathRecoveryEnabled()) { out << "||| "; } Phrase outPhrase(ARRAY_SIZE_INCR); hypo->CreateOutputPhrase(outPhrase); - + // delete 1st & last CHECK(outPhrase.GetSize() >= 2); outPhrase.RemoveWord(0); outPhrase.RemoveWord(outPhrase.GetSize() - 1); - + const std::vector outputFactorOrder = StaticData::Instance().GetOutputFactorOrder(); string output = outPhrase.GetStringRep(outputFactorOrder); out << output << endl; @@ -371,7 +372,8 @@ void IOWrapper::OutputBestHypo(const ChartHypothesis *hypo, long translationId) m_singleBestOutputCollector->Write(translationId, out.str()); } -void IOWrapper::OutputBestHypo(search::Applied applied, long translationId) { +void IOWrapper::OutputBestHypo(search::Applied applied, long translationId) +{ if (!m_singleBestOutputCollector) return; std::ostringstream out; IOWrapper::FixPrecision(out); @@ -389,7 +391,8 @@ void IOWrapper::OutputBestHypo(search::Applied applied, long translationId) { m_singleBestOutputCollector->Write(translationId, out.str()); } -void IOWrapper::OutputBestNone(long translationId) { +void IOWrapper::OutputBestNone(long translationId) +{ if (!m_singleBestOutputCollector) return; if (StaticData::Instance().GetOutputHypoScore()) { m_singleBestOutputCollector->Write(translationId, "0 \n"); @@ -443,7 +446,8 @@ void IOWrapper::OutputFeatureScores( std::ostream& out, const ScoreComponentColl } } -void 
IOWrapper::OutputNBestList(const ChartTrellisPathList &nBestList, long translationId) { +void IOWrapper::OutputNBestList(const ChartTrellisPathList &nBestList, long translationId) +{ std::ostringstream out; // Check if we're writing to std::cout. @@ -452,7 +456,7 @@ void IOWrapper::OutputNBestList(const ChartTrellisPathList &nBestList, long tran // preserve existing behaviour, but should probably be done either way. IOWrapper::FixPrecision(out); - // Used to check StaticData's GetOutputHypoScore(), but it makes no sense with nbest output. + // Used to check StaticData's GetOutputHypoScore(), but it makes no sense with nbest output. } //bool includeAlignment = StaticData::Instance().NBestIncludesAlignment(); @@ -528,7 +532,8 @@ void IOWrapper::OutputNBestList(const ChartTrellisPathList &nBestList, long tran m_nBestOutputCollector->Write(translationId, out.str()); } -void IOWrapper::OutputNBestList(const std::vector &nbest, long translationId) { +void IOWrapper::OutputNBestList(const std::vector &nbest, long translationId) +{ std::ostringstream out; // wtf? copied from the original OutputNBestList if (m_nBestOutputCollector->OutputIsCout()) { @@ -565,12 +570,11 @@ void ShiftOffsets(vector &offsets, T shift) T currPos = shift; for (size_t i = 0; i < offsets.size(); ++i) { if (offsets[i] == 0) { - offsets[i] = currPos; - ++currPos; - } - else { - currPos += offsets[i]; - } + offsets[i] = currPos; + ++currPos; + } else { + currPos += offsets[i]; + } } } @@ -630,8 +634,7 @@ size_t IOWrapper::OutputAlignmentNBest(Alignments &retAlign, const Moses::ChartT totalTargetSize += targetSize; ++targetInd; - } - else { + } else { ++totalTargetSize; } } @@ -666,15 +669,15 @@ void IOWrapper::OutputAlignment(size_t translationId , const Moses::ChartHypothe ostringstream out; if (hypo) { - Alignments retAlign; - OutputAlignment(retAlign, hypo, 0); + Alignments retAlign; + OutputAlignment(retAlign, hypo, 0); - // output alignments - Alignments::const_iterator iter; - for (iter = retAlign.begin(); iter != retAlign.end(); ++iter) { - const pair &alignPoint = *iter; - out << alignPoint.first << "-" << alignPoint.second << " "; - } + // output alignments + Alignments::const_iterator iter; + for (iter = retAlign.begin(); iter != retAlign.end(); ++iter) { + const pair &alignPoint = *iter; + out << alignPoint.first << "-" << alignPoint.second << " "; + } } out << endl; @@ -724,8 +727,7 @@ size_t IOWrapper::OutputAlignment(Alignments &retAlign, const Moses::ChartHypoth totalTargetSize += targetSize; ++targetInd; - } - else { + } else { ++totalTargetSize; } } diff --git a/moses-chart-cmd/IOWrapper.h b/moses-chart-cmd/IOWrapper.h index 3178b6507..aba73e9a6 100644 --- a/moses-chart-cmd/IOWrapper.h +++ b/moses-chart-cmd/IOWrapper.h @@ -91,11 +91,11 @@ protected: const ApplicationContext &context); void OutputAllFeatureScores(const Moses::ScoreComponentCollection &features - , std::ostream &out); + , std::ostream &out); void OutputFeatureScores( std::ostream& out - , const Moses::ScoreComponentCollection &features - , const Moses::FeatureFunction *ff - , std::string &lastName ); + , const Moses::ScoreComponentCollection &features + , const Moses::FeatureFunction *ff + , std::string &lastName ); public: IOWrapper(const std::vector &inputFactorOrder diff --git a/moses-chart-cmd/Main.cpp b/moses-chart-cmd/Main.cpp index a7568f5fb..61b8b9f5e 100644 --- a/moses-chart-cmd/Main.cpp +++ b/moses-chart-cmd/Main.cpp @@ -190,7 +190,7 @@ static void PrintFeatureWeight(const FeatureFunction* ff) size_t numScoreComps = 
ff->GetNumScoreComponents(); vector values = StaticData::Instance().GetAllWeights().GetScoresForProducer(ff); for (size_t i = 0; i < numScoreComps; ++i) { - cout << " " << values[i]; + cout << " " << values[i]; } cout << endl; @@ -244,12 +244,12 @@ int main(int argc, char* argv[]) ShowWeights(); exit(0); } - + CHECK(staticData.IsChart()); - + // set up read/writing class IOWrapper *ioWrapper = GetIOWrapper(staticData); - + // check on weights const ScoreComponentCollection& weights = staticData.GetAllWeights(); IFVERBOSE(2) { @@ -264,7 +264,7 @@ int main(int argc, char* argv[]) #ifdef WITH_THREADS ThreadPool pool(staticData.ThreadCount()); #endif - + // read each sentence & decode InputType *source=0; while(ReadInput(*ioWrapper,staticData.GetInputType(),source)) { @@ -279,16 +279,16 @@ int main(int argc, char* argv[]) delete task; #endif } - + #ifdef WITH_THREADS pool.Stop(true); // flush remaining jobs #endif - + delete ioWrapper; - + IFVERBOSE(1) PrintUserTime("End."); - + } catch (const std::exception &e) { std::cerr << "Exception: " << e.what() << std::endl; return EXIT_FAILURE; diff --git a/moses-chart-cmd/Main.h b/moses-chart-cmd/Main.h index 4f2765695..319e3889c 100644 --- a/moses-chart-cmd/Main.h +++ b/moses-chart-cmd/Main.h @@ -36,8 +36,9 @@ POSSIBILITY OF SUCH DAMAGE. #include "moses/StaticData.h" -namespace MosesChartCmd { - class IOWrapper; +namespace MosesChartCmd +{ +class IOWrapper; } int main(int argc, char* argv[]); diff --git a/moses-cmd/IOWrapper.cpp b/moses-cmd/IOWrapper.cpp index e5346852f..44e60ddf3 100644 --- a/moses-cmd/IOWrapper.cpp +++ b/moses-cmd/IOWrapper.cpp @@ -53,47 +53,47 @@ namespace MosesCmd { IOWrapper::IOWrapper( - const vector &inputFactorOrder - , const vector &outputFactorOrder - , const FactorMask &inputFactorUsed - , size_t nBestSize - , const string &nBestFilePath) -:m_inputFactorOrder(inputFactorOrder) -,m_outputFactorOrder(outputFactorOrder) -,m_inputFactorUsed(inputFactorUsed) -,m_inputFile(NULL) -,m_inputStream(&std::cin) -,m_nBestStream(NULL) -,m_outputWordGraphStream(NULL) -,m_outputSearchGraphStream(NULL) -,m_detailedTranslationReportingStream(NULL) -,m_alignmentOutputStream(NULL) + const vector &inputFactorOrder + , const vector &outputFactorOrder + , const FactorMask &inputFactorUsed + , size_t nBestSize + , const string &nBestFilePath) + :m_inputFactorOrder(inputFactorOrder) + ,m_outputFactorOrder(outputFactorOrder) + ,m_inputFactorUsed(inputFactorUsed) + ,m_inputFile(NULL) + ,m_inputStream(&std::cin) + ,m_nBestStream(NULL) + ,m_outputWordGraphStream(NULL) + ,m_outputSearchGraphStream(NULL) + ,m_detailedTranslationReportingStream(NULL) + ,m_alignmentOutputStream(NULL) { Initialization(inputFactorOrder, outputFactorOrder - , inputFactorUsed - , nBestSize, nBestFilePath); + , inputFactorUsed + , nBestSize, nBestFilePath); } IOWrapper::IOWrapper(const std::vector &inputFactorOrder - , const std::vector &outputFactorOrder - , const FactorMask &inputFactorUsed - , size_t nBestSize - , const std::string &nBestFilePath - , const std::string &inputFilePath) -:m_inputFactorOrder(inputFactorOrder) -,m_outputFactorOrder(outputFactorOrder) -,m_inputFactorUsed(inputFactorUsed) -,m_inputFilePath(inputFilePath) -,m_inputFile(new InputFileStream(inputFilePath)) -,m_nBestStream(NULL) -,m_outputWordGraphStream(NULL) -,m_outputSearchGraphStream(NULL) -,m_detailedTranslationReportingStream(NULL) -,m_alignmentOutputStream(NULL) + , const std::vector &outputFactorOrder + , const FactorMask &inputFactorUsed + , size_t nBestSize + , const std::string 
&nBestFilePath + , const std::string &inputFilePath) + :m_inputFactorOrder(inputFactorOrder) + ,m_outputFactorOrder(outputFactorOrder) + ,m_inputFactorUsed(inputFactorUsed) + ,m_inputFilePath(inputFilePath) + ,m_inputFile(new InputFileStream(inputFilePath)) + ,m_nBestStream(NULL) + ,m_outputWordGraphStream(NULL) + ,m_outputSearchGraphStream(NULL) + ,m_detailedTranslationReportingStream(NULL) + ,m_alignmentOutputStream(NULL) { Initialization(inputFactorOrder, outputFactorOrder - , inputFactorUsed - , nBestSize, nBestFilePath); + , inputFactorUsed + , nBestSize, nBestFilePath); m_inputStream = m_inputFile; } @@ -117,10 +117,10 @@ IOWrapper::~IOWrapper() } void IOWrapper::Initialization(const std::vector &/*inputFactorOrder*/ - , const std::vector &/*outputFactorOrder*/ - , const FactorMask &/*inputFactorUsed*/ - , size_t nBestSize - , const std::string &nBestFilePath) + , const std::vector &/*outputFactorOrder*/ + , const FactorMask &/*inputFactorUsed*/ + , size_t nBestSize + , const std::string &nBestFilePath) { const StaticData &staticData = StaticData::Instance(); @@ -192,7 +192,7 @@ InputType*IOWrapper::GetInput(InputType* inputType) * print surface factor only for the given phrase */ void OutputSurface(std::ostream &out, const Hypothesis &edge, const std::vector &outputFactorOrder, - bool reportSegmentation, bool reportAllFactors) + bool reportSegmentation, bool reportAllFactors) { CHECK(outputFactorOrder.size() > 0); const Phrase& phrase = edge.GetCurrTargetPhrase(); @@ -218,12 +218,12 @@ void OutputSurface(std::ostream &out, const Hypothesis &edge, const std::vector< // trace option "-t" if (reportSegmentation == true && phrase.GetSize() > 0) { out << "|" << edge.GetCurrSourceWordsRange().GetStartPos() - << "-" << edge.GetCurrSourceWordsRange().GetEndPos() << "| "; + << "-" << edge.GetCurrSourceWordsRange().GetEndPos() << "| "; } } void OutputBestSurface(std::ostream &out, const Hypothesis *hypo, const std::vector &outputFactorOrder, - bool reportSegmentation, bool reportAllFactors) + bool reportSegmentation, bool reportAllFactors) { if (hypo != NULL) { // recursively retrace this best path through the lattice, starting from the end of the hypothesis sentence @@ -377,10 +377,10 @@ void IOWrapper::OutputBestHypo(const Hypothesis *hypo, long /*translationId*/, b } void OutputNBest(std::ostream& out - , const Moses::TrellisPathList &nBestList - , const std::vector& outputFactorOrder - , long translationId - , bool reportSegmentation) + , const Moses::TrellisPathList &nBestList + , const std::vector& outputFactorOrder + , long translationId + , bool reportSegmentation) { const StaticData &staticData = StaticData::Instance(); bool labeledOutput = staticData.IsLabeledNBestList(); @@ -473,9 +473,9 @@ void OutputAllFeatureScores(const Moses::ScoreComponentCollection &features } void OutputFeatureScores( std::ostream& out - , const ScoreComponentCollection &features - , const FeatureFunction *ff - , std::string &lastName ) + , const ScoreComponentCollection &features + , const FeatureFunction *ff + , std::string &lastName ) { const StaticData &staticData = StaticData::Instance(); bool labeledOutput = staticData.IsLabeledNBestList(); @@ -556,7 +556,7 @@ IOWrapper *GetIOWrapper(const StaticData &staticData) { IOWrapper *ioWrapper; const std::vector &inputFactorOrder = staticData.GetInputFactorOrder() - ,&outputFactorOrder = staticData.GetOutputFactorOrder(); + ,&outputFactorOrder = staticData.GetOutputFactorOrder(); FactorMask inputFactorUsed(inputFactorOrder); // io @@ -565,14 +565,14 @@ 
IOWrapper *GetIOWrapper(const StaticData &staticData) string filePath = staticData.GetParam("input-file")[0]; ioWrapper = new IOWrapper(inputFactorOrder, outputFactorOrder, inputFactorUsed - , staticData.GetNBestSize() - , staticData.GetNBestFilePath() - , filePath); + , staticData.GetNBestSize() + , staticData.GetNBestFilePath() + , filePath); } else { VERBOSE(1,"IO from STDOUT/STDIN" << endl); ioWrapper = new IOWrapper(inputFactorOrder, outputFactorOrder, inputFactorUsed - , staticData.GetNBestSize() - , staticData.GetNBestFilePath()); + , staticData.GetNBestSize() + , staticData.GetNBestFilePath()); } ioWrapper->ResetTranslationId(); diff --git a/moses-cmd/IOWrapper.h b/moses-cmd/IOWrapper.h index fac9ca307..66b9377dc 100644 --- a/moses-cmd/IOWrapper.h +++ b/moses-cmd/IOWrapper.h @@ -142,12 +142,12 @@ void OutputAlignment(Moses::OutputCollector* collector, size_t lineNo, const Mo void OutputAlignment(std::ostream &out, const Moses::Hypothesis *hypo); void OutputNBest(std::ostream& out - , const Moses::TrellisPathList &nBestList - , const std::vector& outputFactorOrder - , long translationId - , bool reportSegmentation); + , const Moses::TrellisPathList &nBestList + , const std::vector& outputFactorOrder + , long translationId + , bool reportSegmentation); void OutputAllFeatureScores(const Moses::ScoreComponentCollection &features - , std::ostream &out); + , std::ostream &out); void OutputFeatureScores( std::ostream& out , const Moses::ScoreComponentCollection &features , const Moses::FeatureFunction *ff diff --git a/moses-cmd/Main.cpp b/moses-cmd/Main.cpp index 3a6f2856e..d70b64536 100644 --- a/moses-cmd/Main.cpp +++ b/moses-cmd/Main.cpp @@ -92,7 +92,7 @@ public: OutputCollector* alignmentInfoCollector, OutputCollector* unknownsCollector, bool outputSearchGraphSLF, - bool outputSearchGraphHypergraph) : + bool outputSearchGraphHypergraph) : m_source(source), m_lineNumber(lineNumber), m_outputCollector(outputCollector), m_nbestCollector(nbestCollector), m_latticeSamplesCollector(latticeSamplesCollector), @@ -103,7 +103,7 @@ public: m_outputSearchGraphSLF(outputSearchGraphSLF), m_outputSearchGraphHypergraph(outputSearchGraphHypergraph) {} - /** Translate one sentence + /** Translate one sentence * gets called by main function implemented at end of this source file */ void Run() { @@ -150,7 +150,7 @@ public: manager.SerializeSearchGraphPB(m_lineNumber, output); } #endif - } + } // Output search graph in HTK standard lattice format (SLF) if (m_outputSearchGraphSLF) { @@ -159,13 +159,13 @@ public: std::ofstream *file = new std::ofstream; file->open(fileName.str().c_str()); if (file->is_open() && file->good()) { - ostringstream out; - fix(out,PRECISION); - manager.OutputSearchGraphAsSLF(m_lineNumber, out); - *file << out.str(); - file -> flush(); + ostringstream out; + fix(out,PRECISION); + manager.OutputSearchGraphAsSLF(m_lineNumber, out); + *file << out.str(); + file -> flush(); } else { - TRACE_ERR("Cannot output HTK standard lattice for line " << m_lineNumber << " because the output file is not open or not ready for writing" << std::endl); + TRACE_ERR("Cannot output HTK standard lattice for line " << m_lineNumber << " because the output file is not open or not ready for writing" << std::endl); } } @@ -176,91 +176,91 @@ public: bool appendSuffix; if (hypergraphParameters.size() > 0 && hypergraphParameters[0] == "true") { - appendSuffix = true; + appendSuffix = true; } else { - appendSuffix = false; + appendSuffix = false; } string compression; if (hypergraphParameters.size() > 1) { - 
compression = hypergraphParameters[1]; + compression = hypergraphParameters[1]; } else { - compression = "txt"; + compression = "txt"; } string hypergraphDir; if ( hypergraphParameters.size() > 2 ) { hypergraphDir = hypergraphParameters[2]; } else { - string nbestFile = staticData.GetNBestFilePath(); - if ( ! nbestFile.empty() && nbestFile!="-" && !boost::starts_with(nbestFile,"/dev/stdout") ) { - boost::filesystem::path nbestPath(nbestFile); + string nbestFile = staticData.GetNBestFilePath(); + if ( ! nbestFile.empty() && nbestFile!="-" && !boost::starts_with(nbestFile,"/dev/stdout") ) { + boost::filesystem::path nbestPath(nbestFile); - // In the Boost filesystem API version 2, - // which was the default prior to Boost 1.46, - // the filename() method returned a string. - // - // In the Boost filesystem API version 3, - // which is the default starting with Boost 1.46, - // the filename() method returns a path object. - // - // To get a string from the path object, - // the native() method must be called. - // hypergraphDir = nbestPath.parent_path().filename() - //#if BOOST_VERSION >= 104600 - // .native() - //#endif - //; + // In the Boost filesystem API version 2, + // which was the default prior to Boost 1.46, + // the filename() method returned a string. + // + // In the Boost filesystem API version 3, + // which is the default starting with Boost 1.46, + // the filename() method returns a path object. + // + // To get a string from the path object, + // the native() method must be called. + // hypergraphDir = nbestPath.parent_path().filename() + //#if BOOST_VERSION >= 104600 + // .native() + //#endif + //; - // Hopefully the following compiles under all versions of Boost. - // - // If this line gives you compile errors, - // contact Lane Schwartz on the Moses mailing list - hypergraphDir = nbestPath.parent_path().string(); + // Hopefully the following compiles under all versions of Boost. + // + // If this line gives you compile errors, + // contact Lane Schwartz on the Moses mailing list + hypergraphDir = nbestPath.parent_path().string(); - } else { - stringstream hypergraphDirName; - hypergraphDirName << boost::filesystem::current_path() << "/hypergraph"; - hypergraphDir = hypergraphDirName.str(); - } + } else { + stringstream hypergraphDirName; + hypergraphDirName << boost::filesystem::current_path() << "/hypergraph"; + hypergraphDir = hypergraphDirName.str(); + } } if ( ! boost::filesystem::exists(hypergraphDir) ) { - boost::filesystem::create_directory(hypergraphDir); - } + boost::filesystem::create_directory(hypergraphDir); + } if ( ! boost::filesystem::exists(hypergraphDir) ) { - TRACE_ERR("Cannot output hypergraphs to " << hypergraphDir << " because the directory does not exist" << std::endl); + TRACE_ERR("Cannot output hypergraphs to " << hypergraphDir << " because the directory does not exist" << std::endl); } else if ( ! boost::filesystem::is_directory(hypergraphDir) ) { - TRACE_ERR("Cannot output hypergraphs to " << hypergraphDir << " because that path exists, but is not a directory" << std::endl); + TRACE_ERR("Cannot output hypergraphs to " << hypergraphDir << " because that path exists, but is not a directory" << std::endl); } else { - stringstream fileName; - fileName << hypergraphDir << "/" << m_lineNumber; - if ( appendSuffix ) { - fileName << "." 
<< compression; - } - boost::iostreams::filtering_ostream *file = new boost::iostreams::filtering_ostream; + stringstream fileName; + fileName << hypergraphDir << "/" << m_lineNumber; + if ( appendSuffix ) { + fileName << "." << compression; + } + boost::iostreams::filtering_ostream *file = new boost::iostreams::filtering_ostream; - if ( compression == "gz" ) { - file->push( boost::iostreams::gzip_compressor() ); - } else if ( compression == "bz2" ) { - file->push( boost::iostreams::bzip2_compressor() ); - } else if ( compression != "txt" ) { - TRACE_ERR("Unrecognized hypergraph compression format (" << compression << ") - using uncompressed plain txt" << std::endl); - compression = "txt"; - } + if ( compression == "gz" ) { + file->push( boost::iostreams::gzip_compressor() ); + } else if ( compression == "bz2" ) { + file->push( boost::iostreams::bzip2_compressor() ); + } else if ( compression != "txt" ) { + TRACE_ERR("Unrecognized hypergraph compression format (" << compression << ") - using uncompressed plain txt" << std::endl); + compression = "txt"; + } - file->push( boost::iostreams::file_sink(fileName.str(), ios_base::out) ); + file->push( boost::iostreams::file_sink(fileName.str(), ios_base::out) ); - if (file->is_complete() && file->good()) { - fix(*file,PRECISION); - manager.OutputSearchGraphAsHypergraph(m_lineNumber, *file); - file -> flush(); - } else { - TRACE_ERR("Cannot output hypergraph for line " << m_lineNumber << " because the output file " << fileName.str() << " is not open or not ready for writing" << std::endl); - } - file -> pop(); - delete file; + if (file->is_complete() && file->good()) { + fix(*file,PRECISION); + manager.OutputSearchGraphAsHypergraph(m_lineNumber, *file); + file -> flush(); + } else { + TRACE_ERR("Cannot output hypergraph for line " << m_lineNumber << " because the output file " << fileName.str() << " is not open or not ready for writing" << std::endl); + } + file -> pop(); + delete file; } } @@ -277,8 +277,7 @@ public: // MAP decoding: best hypothesis const Hypothesis* bestHypo = NULL; - if (!staticData.UseMBR()) - { + if (!staticData.UseMBR()) { bestHypo = manager.GetBestHypothesis(); if (bestHypo) { if (staticData.IsPathRecoveryEnabled()) { @@ -296,7 +295,7 @@ public: staticData.GetReportSegmentation(), staticData.GetReportAllFactors()); if (staticData.PrintAlignmentInfo()) { - out << "||| "; + out << "||| "; OutputAlignment(out, bestHypo); } @@ -306,11 +305,10 @@ public: } } out << endl; - } + } // MBR decoding (n-best MBR, lattice MBR, consensus) - else - { + else { // we first need the n-best translations size_t nBestSize = staticData.GetMBRSize(); if (nBestSize <= 0) { @@ -346,7 +344,7 @@ public: } // consensus decoding - else if (staticData.UseConsensusDecoding()) { + else if (staticData.UseConsensusDecoding()) { const TrellisPath &conBestHypo = doConsensusDecoding(manager,nBestList); OutputBestHypo(conBestHypo, m_lineNumber, staticData.GetReportSegmentation(), @@ -355,8 +353,8 @@ public: IFVERBOSE(2) { PrintUserTime("finished Consensus decoding"); } - } - + } + // n-best MBR decoding else { const Moses::TrellisPath &mbrBestHypo = doMBR(nBestList); @@ -380,7 +378,7 @@ public: ostringstream out; manager.CalcNBest(staticData.GetNBestSize(), nBestList,staticData.GetDistinctNBest()); OutputNBest(out, nBestList, staticData.GetOutputFactorOrder(), m_lineNumber, - staticData.GetReportSegmentation()); + staticData.GetReportSegmentation()); m_nbestCollector->Write(m_lineNumber, out.str()); } @@ -390,7 +388,7 @@ public: ostringstream out; 
manager.CalcLatticeSamples(staticData.GetLatticeSamplesSize(), latticeSamples); OutputNBest(out,latticeSamples, staticData.GetOutputFactorOrder(), m_lineNumber, - staticData.GetReportSegmentation()); + staticData.GetReportSegmentation()); m_latticeSamplesCollector->Write(m_lineNumber, out.str()); } @@ -450,7 +448,7 @@ static void PrintFeatureWeight(const FeatureFunction* ff) size_t numScoreComps = ff->GetNumScoreComponents(); vector values = StaticData::Instance().GetAllWeights().GetScoresForProducer(ff); for (size_t i = 0; i < numScoreComps; ++i) { - cout << " " << values[i]; + cout << " " << values[i]; } cout << endl; } @@ -484,13 +482,13 @@ size_t OutputFeatureWeightsForHypergraph(size_t index, const FeatureFunction* ff vector values = StaticData::Instance().GetAllWeights().GetScoresForProducer(ff); if (numScoreComps > 1) { for (size_t i = 0; i < numScoreComps; ++i) { - outputSearchGraphStream << ff->GetScoreProducerDescription() - << i - << "=" << values[i] << endl; + outputSearchGraphStream << ff->GetScoreProducerDescription() + << i + << "=" << values[i] << endl; } } else { - outputSearchGraphStream << ff->GetScoreProducerDescription() - << "=" << values[0] << endl; + outputSearchGraphStream << ff->GetScoreProducerDescription() + << "=" << values[0] << endl; } return index+numScoreComps; } else { @@ -541,7 +539,7 @@ void OutputFeatureWeightsForHypergraph(std::ostream &outputSearchGraphStream) int main(int argc, char** argv) { try { - + #ifdef HAVE_PROTOBUF GOOGLE_PROTOBUF_VERIFY_VERSION; #endif @@ -601,20 +599,20 @@ int main(int argc, char** argv) if (staticData.GetOutputSearchGraphHypergraph()) { ofstream* weightsOut = new std::ofstream; stringstream weightsFilename; - if (staticData.GetParam("output-search-graph-hypergraph").size() > 3) { - weightsFilename << staticData.GetParam("output-search-graph-hypergraph")[3]; + if (staticData.GetParam("output-search-graph-hypergraph").size() > 3) { + weightsFilename << staticData.GetParam("output-search-graph-hypergraph")[3]; } else { - string nbestFile = staticData.GetNBestFilePath(); - if ( ! nbestFile.empty() && nbestFile!="-" && !boost::starts_with(nbestFile,"/dev/stdout") ) { - boost::filesystem::path nbestPath(nbestFile); - weightsFilename << nbestPath.parent_path().filename() << "/weights"; - } else { - weightsFilename << boost::filesystem::current_path() << "/hypergraph/weights"; - } + string nbestFile = staticData.GetNBestFilePath(); + if ( ! nbestFile.empty() && nbestFile!="-" && !boost::starts_with(nbestFile,"/dev/stdout") ) { + boost::filesystem::path nbestPath(nbestFile); + weightsFilename << nbestPath.parent_path().filename() << "/weights"; + } else { + weightsFilename << boost::filesystem::current_path() << "/hypergraph/weights"; + } } boost::filesystem::path weightsFilePath(weightsFilename.str()); if ( ! 
boost::filesystem::exists(weightsFilePath.parent_path()) ) { - boost::filesystem::create_directory(weightsFilePath.parent_path()); + boost::filesystem::create_directory(weightsFilePath.parent_path()); } TRACE_ERR("The weights file is " << weightsFilename.str() << "\n"); weightsOut->open(weightsFilename.str().c_str()); @@ -669,26 +667,26 @@ int main(int argc, char** argv) if (output1best) { outputCollector.reset(new OutputCollector()); } - + // initialize stream for word graph (aka: output lattice) auto_ptr<OutputCollector> wordGraphCollector; if (staticData.GetOutputWordGraph()) { wordGraphCollector.reset(new OutputCollector(&(ioWrapper->GetOutputWordGraphStream()))); } - + // initialize stream for search graph // note: this is essentially the same as above, but in a different format auto_ptr<OutputCollector> searchGraphCollector; if (staticData.GetOutputSearchGraph()) { searchGraphCollector.reset(new OutputCollector(&(ioWrapper->GetOutputSearchGraphStream()))); } - + // initialize stream for details about the decoder run auto_ptr<OutputCollector> detailedTranslationCollector; if (staticData.IsDetailedTranslationReportingEnabled()) { detailedTranslationCollector.reset(new OutputCollector(&(ioWrapper->GetDetailedTranslationReportingStream()))); } - + // initialize stream for word alignment between input and output auto_ptr<OutputCollector> alignmentInfoCollector; if (!staticData.GetAlignmentOutputFile().empty()) { @@ -706,11 +704,11 @@ int main(int argc, char** argv) } unknownsCollector.reset(new OutputCollector(unknownsStream.get())); } - + #ifdef WITH_THREADS ThreadPool pool(staticData.ThreadCount()); #endif - + // main loop over set of input sentences InputType* source = NULL; size_t lineCount = staticData.GetStartTranslationId(); @@ -728,21 +726,21 @@ int main(int argc, char** argv) detailedTranslationCollector.get(), alignmentInfoCollector.get(), unknownsCollector.get(), - staticData.GetOutputSearchGraphSLF(), - staticData.GetOutputSearchGraphHypergraph()); + staticData.GetOutputSearchGraphSLF(), + staticData.GetOutputSearchGraphHypergraph()); // execute task #ifdef WITH_THREADS - pool.Submit(task); + pool.Submit(task); #else task->Run(); delete task; #endif - + source = NULL; //make sure it doesn't get deleted ++lineCount; } - - // we are done, finishing up + + // we are done, finishing up #ifdef WITH_THREADS pool.Stop(true); //flush remaining jobs #endif
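The block above wires one OutputCollector per output stream and then pushes TranslationTasks onto a ThreadPool, so worker threads finish in arbitrary order while each stream must still come out in input order. The collector achieves this by buffering results that arrive early. A condensed, illustrative re-implementation of that pattern (not the actual moses/OutputCollector.h, which also supports a paired debug stream):

#include <iostream>
#include <map>
#include <string>
#include <boost/thread/mutex.hpp>

// Ordered, thread-safe output: Write() may be called from any worker in any
// order, but lines are emitted strictly in ascending id order, with early
// arrivals held back until their turn.
class OrderedWriter
{
public:
  OrderedWriter() : m_nextId(0) {}

  void Write(std::size_t id, const std::string &text) {
    boost::mutex::scoped_lock lock(m_mutex);
    if (id == m_nextId) {
      std::cout << text;            // in order: emit immediately,
      ++m_nextId;                   // then flush anything it unblocked
      std::map<std::size_t, std::string>::iterator i;
      while ((i = m_buffer.find(m_nextId)) != m_buffer.end()) {
        std::cout << i->second;
        m_buffer.erase(i);
        ++m_nextId;
      }
    } else {
      m_buffer[id] = text;          // early: buffer until its turn
    }
  }

private:
  std::size_t m_nextId;
  std::map<std::size_t, std::string> m_buffer;
  boost::mutex m_mutex;
};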
diff --git a/moses-cmd/TranslationAnalysis.cpp b/moses-cmd/TranslationAnalysis.cpp index 4231001e9..bd7113096 100644 --- a/moses-cmd/TranslationAnalysis.cpp +++ b/moses-cmd/TranslationAnalysis.cpp @@ -57,7 +57,7 @@ void PrintTranslationAnalysis(std::ostream &os, const Hypothesis* hypo) } } } - + bool epsilon = false; if (target == "") { target="<EPSILON>"; @@ -101,21 +101,21 @@ void PrintTranslationAnalysis(std::ostream &os, const Hypothesis* hypo) } os << std::endl << std::endl; if (doLMStats && lmCalls > 0) { - std::vector::iterator acc = lmAcc.begin(); + std::vector::iterator acc = lmAcc.begin(); - const std::vector<const StatefulFeatureFunction*> &statefulFFs = StatefulFeatureFunction::GetStatefulFeatureFunctions(); - for (size_t i = 0; i < statefulFFs.size(); ++i) { - const StatefulFeatureFunction *ff = statefulFFs[i]; - const LanguageModel *lm = dynamic_cast<const LanguageModel*>(ff); + const std::vector<const StatefulFeatureFunction*> &statefulFFs = StatefulFeatureFunction::GetStatefulFeatureFunctions(); + for (size_t i = 0; i < statefulFFs.size(); ++i) { + const StatefulFeatureFunction *ff = statefulFFs[i]; + const LanguageModel *lm = dynamic_cast<const LanguageModel*>(ff); - if (lm) { - char buf[256]; - sprintf(buf, "%.4f", (float)(*acc)/(float)lmCalls); - os << lm->GetScoreProducerDescription() << ", AVG N-GRAM LENGTH: " << buf << std::endl; + if (lm) { + char buf[256]; + sprintf(buf, "%.4f", (float)(*acc)/(float)lmCalls); + os << lm->GetScoreProducerDescription() << ", AVG N-GRAM LENGTH: " << buf << std::endl; - ++acc; - } - } + ++acc; + } + } } if (droppedWords.size() > 0) { @@ -125,10 +125,10 @@ void PrintTranslationAnalysis(std::ostream &os, const Hypothesis* hypo) os << "\tdropped=" << *dwi << std::endl; } } - os << std::endl << "SCORES (UNWEIGHTED/WEIGHTED): "; + os << std::endl << "SCORES (UNWEIGHTED/WEIGHTED): "; os << translationPath.back()->GetScoreBreakdown(); os << " weighted(TODO)"; - os << std::endl; + os << std::endl; } } diff --git a/moses/AlignmentInfo.cpp b/moses/AlignmentInfo.cpp index 97eff59b5..178f3438a 100644 --- a/moses/AlignmentInfo.cpp +++ b/moses/AlignmentInfo.cpp @@ -1,17 +1,17 @@ /*********************************************************************** Moses - statistical machine translation system Copyright (C) 2006-2011 University of Edinburgh - + This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. - + This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. - + You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA @@ -46,17 +46,18 @@ void AlignmentInfo::BuildNonTermIndexMap() m_nonTermIndexMap.resize(maxIndex+1, NOT_FOUND); size_t i = 0; for (p = begin(); p != end(); ++p) { - if (m_nonTermIndexMap[p->second] != NOT_FOUND) { - // 1-to-many. Definitely a set of terminals. 
Don't bother storing 1-to-1 index map + m_nonTermIndexMap.clear(); + return; + } m_nonTermIndexMap[p->second] = i++; } - + } -bool compare_target(const std::pair *a, const std::pair *b) { +bool compare_target(const std::pair *a, const std::pair *b) +{ if(a->second < b->second) return true; if(a->second == b->second) return (a->first < b->first); return false; @@ -66,32 +67,30 @@ bool compare_target(const std::pair *a, const std::pair* > AlignmentInfo::GetSortedAlignments() const { std::vector< const std::pair* > ret; - + CollType::const_iterator iter; - for (iter = m_collection.begin(); iter != m_collection.end(); ++iter) - { + for (iter = m_collection.begin(); iter != m_collection.end(); ++iter) { const std::pair &alignPair = *iter; ret.push_back(&alignPair); } - + const StaticData &staticData = StaticData::Instance(); WordAlignmentSort wordAlignmentSort = staticData.GetWordAlignmentSort(); - - switch (wordAlignmentSort) - { - case NoSort: - break; - - case TargetOrder: - std::sort(ret.begin(), ret.end(), compare_target); - break; - - default: - CHECK(false); + + switch (wordAlignmentSort) { + case NoSort: + break; + + case TargetOrder: + std::sort(ret.begin(), ret.end(), compare_target); + break; + + default: + CHECK(false); } - + return ret; - + } std::vector AlignmentInfo::GetSourceIndex2PosMap() const diff --git a/moses/AlignmentInfo.h b/moses/AlignmentInfo.h index db92791aa..76d4d918a 100644 --- a/moses/AlignmentInfo.h +++ b/moses/AlignmentInfo.h @@ -1,17 +1,17 @@ /*********************************************************************** Moses - statistical machine translation system Copyright (C) 2006-2011 University of Edinburgh - + This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. - + This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. - + You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA @@ -42,16 +42,20 @@ class AlignmentInfo friend struct AlignmentInfoHasher; friend class AlignmentInfoCollection; - public: +public: typedef std::set > CollType; typedef std::vector NonTermIndexMap; typedef CollType::const_iterator const_iterator; - const_iterator begin() const { return m_collection.begin(); } - const_iterator end() const { return m_collection.end(); } + const_iterator begin() const { + return m_collection.begin(); + } + const_iterator end() const { + return m_collection.end(); + } void Add(size_t sourcePos, size_t targetPos) { - m_collection.insert(std::pair(sourcePos, targetPos)); + m_collection.insert(std::pair(sourcePos, targetPos)); } /** Provides a map from target-side to source-side non-terminal indices. * The target-side index should be the rule symbol index (COUNTING terminals). 
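For the NonTermIndexMap just documented: BuildNonTermIndexMap (in the AlignmentInfo.cpp hunk above) walks the alignment set in source order and records, at each target position, the index of the source non-terminal aligned there, clearing the whole map as soon as a target position is hit twice, since 1-to-many links can only come from terminal alignments. A toy reconstruction of that loop for a rule with two swapped non-terminals, alignments 0-1 and 1-0 (illustrative only, plain STL types in place of the Moses classes):

#include <cassert>
#include <set>
#include <utility>
#include <vector>

int main()
{
  // Non-terminal alignments: source 0 -> target 1, source 1 -> target 0.
  std::set<std::pair<std::size_t, std::size_t> > aligns;
  aligns.insert(std::make_pair(0, 1));
  aligns.insert(std::make_pair(1, 0));

  const std::size_t NOT_FOUND = static_cast<std::size_t>(-1);
  std::vector<std::size_t> nonTermIndexMap(2, NOT_FOUND);

  // Same loop shape as BuildNonTermIndexMap: the set iterates in source
  // order, so the map stores each source non-terminal's rank at the
  // target position it is aligned to.
  std::size_t i = 0;
  for (std::set<std::pair<std::size_t, std::size_t> >::const_iterator p = aligns.begin();
       p != aligns.end(); ++p) {
    nonTermIndexMap[p->second] = i++;
  }

  assert(nonTermIndexMap[1] == 0); // target pos 1 holds source non-term 0
  assert(nonTermIndexMap[0] == 1); // target pos 0 holds source non-term 1
  return 0;
}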
@@ -64,20 +68,21 @@ class AlignmentInfo const CollType &GetAlignments() const { return m_collection; } - - size_t GetSize() const { return m_collection.size(); } + + size_t GetSize() const { + return m_collection.size(); + } std::vector< const std::pair* > GetSortedAlignments() const; std::vector GetSourceIndex2PosMap() const; - bool operator==(const AlignmentInfo& rhs) const - { + bool operator==(const AlignmentInfo& rhs) const { return m_collection == rhs.m_collection && m_nonTermIndexMap == rhs.m_nonTermIndexMap; } - - private: + +private: //! AlignmentInfo objects should only be created by an AlignmentInfoCollection explicit AlignmentInfo(const std::set > &pairs); @@ -90,25 +95,21 @@ class AlignmentInfo /** Define an arbitrary strict weak ordering between AlignmentInfo objects * for use by AlignmentInfoCollection. */ -struct AlignmentInfoOrderer -{ +struct AlignmentInfoOrderer { bool operator()(const AlignmentInfo &a, const AlignmentInfo &b) const { - if (a.m_collection == b.m_collection) { - return a.m_nonTermIndexMap < b.m_nonTermIndexMap; - } - else { - return a.m_collection < b.m_collection; - } + if (a.m_collection == b.m_collection) { + return a.m_nonTermIndexMap < b.m_nonTermIndexMap; + } else { + return a.m_collection < b.m_collection; + } } }; -/** +/** * Hashing functoid **/ -struct AlignmentInfoHasher -{ - size_t operator()(const AlignmentInfo& a) const - { +struct AlignmentInfoHasher { + size_t operator()(const AlignmentInfo& a) const { size_t seed = 0; boost::hash_combine(seed,a.m_collection); boost::hash_combine(seed,a.m_nonTermIndexMap); @@ -117,7 +118,8 @@ struct AlignmentInfoHasher }; -inline size_t hash_value(const AlignmentInfo& a) { +inline size_t hash_value(const AlignmentInfo& a) +{ static AlignmentInfoHasher hasher; return hasher(a); } diff --git a/moses/AlignmentInfoCollection.cpp b/moses/AlignmentInfoCollection.cpp index 53b83d8cd..ef6e62eb3 100644 --- a/moses/AlignmentInfoCollection.cpp +++ b/moses/AlignmentInfoCollection.cpp @@ -1,17 +1,17 @@ /*********************************************************************** Moses - statistical machine translation system Copyright (C) 2006-2011 University of Edinburgh - + This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. - + This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
- + You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA @@ -39,7 +39,7 @@ const AlignmentInfo &AlignmentInfoCollection::GetEmptyAlignmentInfo() const } const AlignmentInfo *AlignmentInfoCollection::Add( - const std::set > &pairs) + const std::set > &pairs) { AlignmentInfo pairsAlignmentInfo(pairs); #ifdef WITH_THREADS diff --git a/moses/AlignmentInfoCollection.h b/moses/AlignmentInfoCollection.h index 6185b32a9..37d717b0f 100644 --- a/moses/AlignmentInfoCollection.h +++ b/moses/AlignmentInfoCollection.h @@ -1,17 +1,17 @@ /*********************************************************************** Moses - statistical machine translation system Copyright (C) 2006-2011 University of Edinburgh - + This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. - + This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. - + You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA @@ -36,8 +36,10 @@ namespace Moses */ class AlignmentInfoCollection { - public: - static AlignmentInfoCollection &Instance() { return s_instance; } +public: + static AlignmentInfoCollection &Instance() { + return s_instance; + } /** Returns a pointer to an AlignmentInfo object with the same source-target * alignment pairs as given in the argument. If the collection already @@ -49,7 +51,7 @@ class AlignmentInfoCollection //! Returns a pointer to an empty AlignmentInfo object. const AlignmentInfo &GetEmptyAlignmentInfo() const; - private: +private: typedef std::set AlignmentInfoSet; //! Only a single static variable should be created. @@ -62,7 +64,7 @@ class AlignmentInfoCollection //reader-writer lock mutable boost::shared_mutex m_accessLock; #endif - + AlignmentInfoSet m_collection; const AlignmentInfo *m_emptyAlignmentInfo; }; diff --git a/moses/AlignmentInfoTest.cpp b/moses/AlignmentInfoTest.cpp index 48c88db65..26127f3cf 100644 --- a/moses/AlignmentInfoTest.cpp +++ b/moses/AlignmentInfoTest.cpp @@ -35,8 +35,7 @@ struct AlignmentInfoFixture { const AlignmentInfo* ai2; const AlignmentInfo* ai3; - AlignmentInfoFixture() - { + AlignmentInfoFixture() { AlignmentInfoCollection& collection = AlignmentInfoCollection::Instance(); IndexSet aligns1,aligns2,aligns3; aligns1.insert(IndexPair(1,1)); diff --git a/moses/BitmapContainer.cpp b/moses/BitmapContainer.cpp index 7e8d470ee..64dd9081b 100644 --- a/moses/BitmapContainer.cpp +++ b/moses/BitmapContainer.cpp @@ -275,11 +275,11 @@ BitmapContainer::~BitmapContainer() // As we have created the square position objects we clean up now. while (!m_queue.empty()) { - HypothesisQueueItem *item = m_queue.top(); - m_queue.pop(); + HypothesisQueueItem *item = m_queue.top(); + m_queue.pop(); - FREEHYPO( item->GetHypothesis() ); - delete item; + FREEHYPO( item->GetHypothesis() ); + delete item; } // Delete all edges. 
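AlignmentInfoCollection, whose Add() signature is reformatted a few hunks above, is a flyweight cache: each distinct set of alignment points is stored exactly once in a process-wide set, phrases share const pointers into it, and the WITH_THREADS build protects access with the boost::shared_mutex reader-writer lock declared in the header. A condensed sketch of that lookup-or-insert pattern (a simplified generic wrapper, not Moses code; the real Add() may lock differently):

#include <set>
#include <boost/thread/locks.hpp>
#include <boost/thread/shared_mutex.hpp>

// Flyweight cache: return a stable pointer to the canonical copy of a value,
// inserting it on first sight. Concurrent readers proceed in parallel;
// writers take the lock exclusively. std::set nodes never move, so the
// returned addresses stay valid across later inserts.
template <typename T>
class FlyweightSet
{
public:
  const T *Add(const T &value) {
    {
      boost::shared_lock<boost::shared_mutex> read(m_lock);
      typename std::set<T>::const_iterator i = m_set.find(value);
      if (i != m_set.end()) return &*i;
    }
    boost::unique_lock<boost::shared_mutex> write(m_lock);
    return &*m_set.insert(value).first; // re-checks under the write lock
  }

private:
  std::set<T> m_set;
  mutable boost::shared_mutex m_lock;
};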
diff --git a/moses/ChartCell.cpp b/moses/ChartCell.cpp index fd163450e..b57a4ab36 100644 --- a/moses/ChartCell.cpp +++ b/moses/ChartCell.cpp @@ -45,17 +45,18 @@ ChartCellBase::~ChartCellBase() {} /** Constructor * \param startPos endPos range of this cell - * \param manager pointer back to the manager + * \param manager pointer back to the manager */ ChartCell::ChartCell(size_t startPos, size_t endPos, ChartManager &manager) : - ChartCellBase(startPos, endPos), m_manager(manager) { + ChartCellBase(startPos, endPos), m_manager(manager) +{ const StaticData &staticData = StaticData::Instance(); m_nBestIsEnabled = staticData.IsNBestEnabled(); } ChartCell::~ChartCell() {} -/** Add the given hypothesis to the cell. +/** Add the given hypothesis to the cell. * Returns true if added, false if not. Maybe it already exists in the collection, or its score falls below the threshold, etc. * This function just calls the corresponding AddHypothesis() in ChartHypothesisCollection * \param hypo Hypothesis to be added @@ -98,8 +99,7 @@ void ChartCell::ProcessSentence(const ChartTranslationOptionList &transOptList // pluck things out of queue and add to hypo collection const size_t popLimit = staticData.GetCubePruningPopLimit(); - for (size_t numPops = 0; numPops < popLimit && !queue.IsEmpty(); ++numPops) - { + for (size_t numPops = 0; numPops < popLimit && !queue.IsEmpty(); ++numPops) { ChartHypothesis *hypo = queue.Pop(); AddHypothesis(hypo); } @@ -179,15 +179,15 @@ size_t ChartCell::GetSize() const const HypoList *ChartCell::GetAllSortedHypotheses() const { - HypoList *ret = new HypoList(); + HypoList *ret = new HypoList(); - MapType::const_iterator iter; - for (iter = m_hypoColl.begin(); iter != m_hypoColl.end(); ++iter) { - const ChartHypothesisCollection &coll = iter->second; - const HypoList &list = coll.GetSortedHypotheses(); + MapType::const_iterator iter; + for (iter = m_hypoColl.begin(); iter != m_hypoColl.end(); ++iter) { + const ChartHypothesisCollection &coll = iter->second; + const HypoList &list = coll.GetSortedHypotheses(); std::copy(list.begin(), list.end(), std::inserter(*ret, ret->end())); - } - return ret; + } + return ret; } //! call GetSearchGraph() for each hypo collection diff --git a/moses/ChartCell.h b/moses/ChartCell.h index 14ac8e3b4..1fed695ac 100644 --- a/moses/ChartCell.h +++ b/moses/ChartCell.h @@ -44,35 +44,43 @@ class ChartTranslationOptionList; class ChartCellCollection; class ChartManager; -class ChartCellBase { - public: - ChartCellBase(size_t startPos, size_t endPos); +class ChartCellBase +{ +public: + ChartCellBase(size_t startPos, size_t endPos); - virtual ~ChartCellBase(); + virtual ~ChartCellBase(); - const ChartCellLabelSet &GetTargetLabelSet() const { return m_targetLabelSet; } + const ChartCellLabelSet &GetTargetLabelSet() const { + return m_targetLabelSet; + } - ChartCellLabelSet &MutableTargetLabelSet() { return m_targetLabelSet; } + ChartCellLabelSet &MutableTargetLabelSet() { + return m_targetLabelSet; + } - const WordsRange &GetCoverage() const { return m_coverage; } + const WordsRange &GetCoverage() const { + return m_coverage; + } - protected: - const WordsRange m_coverage; - ChartCellLabelSet m_targetLabelSet; +protected: + const WordsRange m_coverage; + ChartCellLabelSet m_targetLabelSet; }; /** 1 cell in chart decoder. * Doesn't directly hold hypotheses. 
Each cell contain a map of ChartHypothesisCollection that have different constituent labels */ -class ChartCell : public ChartCellBase { +class ChartCell : public ChartCellBase +{ friend std::ostream& operator<<(std::ostream&, const ChartCell&); public: #if defined(BOOST_VERSION) && (BOOST_VERSION >= 104200) typedef boost::unordered_map MapType; + ChartHypothesisCollection, + NonTerminalHasher, + NonTerminalEqualityPred + > MapType; #else typedef std::map MapType; #endif @@ -91,8 +99,7 @@ public: ,const ChartCellCollection &allChartCells); //! Get all hypotheses in the cell that have the specified constituent label - const HypoList *GetSortedHypotheses(const Word &constituentLabel) const - { + const HypoList *GetSortedHypotheses(const Word &constituentLabel) const { MapType::const_iterator p = m_hypoColl.find(constituentLabel); return (p == m_hypoColl.end()) ? NULL : &(p->second.GetSortedHypotheses()); } diff --git a/moses/ChartCellCollection.cpp b/moses/ChartCellCollection.cpp index a34687f59..46392261d 100644 --- a/moses/ChartCellCollection.cpp +++ b/moses/ChartCellCollection.cpp @@ -23,24 +23,27 @@ #include "InputType.h" #include "WordsRange.h" -namespace Moses { +namespace Moses +{ -ChartCellCollectionBase::~ChartCellCollectionBase() { +ChartCellCollectionBase::~ChartCellCollectionBase() +{ m_source.clear(); - for (std::vector >::iterator i = m_cells.begin(); i != m_cells.end(); ++i) + for (std::vector >::iterator i = m_cells.begin(); i != m_cells.end(); ++i) RemoveAllInColl(*i); } -class CubeCellFactory { - public: - explicit CubeCellFactory(ChartManager &manager) : m_manager(manager) {} +class CubeCellFactory +{ +public: + explicit CubeCellFactory(ChartManager &manager) : m_manager(manager) {} - ChartCell *operator()(size_t start, size_t end) const { - return new ChartCell(start, end, m_manager); - } + ChartCell *operator()(size_t start, size_t end) const { + return new ChartCell(start, end, m_manager); + } - private: - ChartManager &m_manager; +private: + ChartManager &m_manager; }; /** Costructor diff --git a/moses/ChartCellCollection.h b/moses/ChartCellCollection.h index 7532503d7..d0423b0b2 100644 --- a/moses/ChartCellCollection.h +++ b/moses/ChartCellCollection.h @@ -31,57 +31,59 @@ namespace Moses class InputType; class ChartManager; -class ChartCellCollectionBase { - public: - template ChartCellCollectionBase(const InputType &input, const Factory &factory) : - m_cells(input.GetSize()) { - size_t size = input.GetSize(); - for (size_t startPos = 0; startPos < size; ++startPos) { - std::vector &inner = m_cells[startPos]; - inner.reserve(size - startPos); - for (size_t endPos = startPos; endPos < size; ++endPos) { - inner.push_back(factory(startPos, endPos)); - } - /* Hack: ChartCellLabel shouldn't need to know its span, but the parser - * gets it from there :-(. The span is actually stored as a reference, - * which needs to point somewhere, so I have it refer to the ChartCell. 
- */ - m_source.push_back(new ChartCellLabel(inner[0]->GetCoverage(), input.GetWord(startPos))); +class ChartCellCollectionBase +{ +public: + template ChartCellCollectionBase(const InputType &input, const Factory &factory) : + m_cells(input.GetSize()) { + size_t size = input.GetSize(); + for (size_t startPos = 0; startPos < size; ++startPos) { + std::vector &inner = m_cells[startPos]; + inner.reserve(size - startPos); + for (size_t endPos = startPos; endPos < size; ++endPos) { + inner.push_back(factory(startPos, endPos)); } + /* Hack: ChartCellLabel shouldn't need to know its span, but the parser + * gets it from there :-(. The span is actually stored as a reference, + * which needs to point somewhere, so I have it refer to the ChartCell. + */ + m_source.push_back(new ChartCellLabel(inner[0]->GetCoverage(), input.GetWord(startPos))); } + } - virtual ~ChartCellCollectionBase(); + virtual ~ChartCellCollectionBase(); - const ChartCellBase &GetBase(const WordsRange &coverage) const { - return *m_cells[coverage.GetStartPos()][coverage.GetEndPos() - coverage.GetStartPos()]; - } + const ChartCellBase &GetBase(const WordsRange &coverage) const { + return *m_cells[coverage.GetStartPos()][coverage.GetEndPos() - coverage.GetStartPos()]; + } - ChartCellBase &MutableBase(const WordsRange &coverage) { - return *m_cells[coverage.GetStartPos()][coverage.GetEndPos() - coverage.GetStartPos()]; - } + ChartCellBase &MutableBase(const WordsRange &coverage) { + return *m_cells[coverage.GetStartPos()][coverage.GetEndPos() - coverage.GetStartPos()]; + } - const ChartCellLabel &GetSourceWordLabel(size_t at) const { - return m_source[at]; - } + const ChartCellLabel &GetSourceWordLabel(size_t at) const { + return m_source[at]; + } - private: - std::vector > m_cells; +private: + std::vector > m_cells; - boost::ptr_vector m_source; + boost::ptr_vector m_source; }; /** Hold all the chart cells for 1 input sentence. A variable of this type is held by the ChartManager */ -class ChartCellCollection : public ChartCellCollectionBase { - public: - ChartCellCollection(const InputType &input, ChartManager &manager); +class ChartCellCollection : public ChartCellCollectionBase +{ +public: + ChartCellCollection(const InputType &input, ChartManager &manager); //! get a chart cell for a particular range ChartCell &Get(const WordsRange &coverage) { return static_cast(MutableBase(coverage)); } - + //! get a chart cell for a particular range const ChartCell &Get(const WordsRange &coverage) const { return static_cast(GetBase(coverage)); diff --git a/moses/ChartCellLabel.h b/moses/ChartCellLabel.h index 218a512c0..ad6e3565d 100644 --- a/moses/ChartCellLabel.h +++ b/moses/ChartCellLabel.h @@ -23,7 +23,10 @@ #include "Word.h" #include "WordsRange.h" -namespace search { class Vertex; } +namespace search +{ +class Vertex; +} namespace Moses { @@ -31,17 +34,17 @@ namespace Moses class Word; /** Contains a range, word (non-terms?) and a vector of hypotheses. - * @todo This is probably incompatible with lattice decoding when the word that spans + * @todo This is probably incompatible with lattice decoding when the word that spans * a position (or positions) can vary. * @todo is this to hold sorted hypotheses that are in the queue for creating the next hypos? */ class ChartCellLabel { - public: +public: union Stack { const HypoList *cube; // cube pruning - search::Vertex *incr; // incremental search after filling. - void *incr_generator; // incremental search during filling. + search::Vertex *incr; // incremental search after filling. 
+ void *incr_generator; // incremental search during filling. }; @@ -52,13 +55,20 @@ class ChartCellLabel , m_stack(stack) {} - const WordsRange &GetCoverage() const { return m_coverage; } - const Word &GetLabel() const { return m_label; } - Stack GetStack() const { return m_stack; } - Stack &MutableStack() { return m_stack; } + const WordsRange &GetCoverage() const { + return m_coverage; + } + const Word &GetLabel() const { + return m_label; + } + Stack GetStack() const { + return m_stack; + } + Stack &MutableStack() { + return m_stack; + } - bool operator<(const ChartCellLabel &other) const - { + bool operator<(const ChartCellLabel &other) const { // m_coverage and m_label uniquely identify a ChartCellLabel, so don't // need to compare m_stack. if (m_coverage == other.m_coverage) { @@ -67,7 +77,7 @@ class ChartCellLabel return m_coverage < other.m_coverage; } - private: +private: const WordsRange &m_coverage; const Word &m_label; Stack m_stack; diff --git a/moses/ChartCellLabelSet.h b/moses/ChartCellLabelSet.h index 5ea192e51..68c8b4263 100644 --- a/moses/ChartCellLabelSet.h +++ b/moses/ChartCellLabelSet.h @@ -35,46 +35,55 @@ class ChartHypothesisCollection; */ class ChartCellLabelSet { - private: +private: #if defined(BOOST_VERSION) && (BOOST_VERSION >= 104200) typedef boost::unordered_map MapType; + NonTerminalHasher, NonTerminalEqualityPred + > MapType; #else typedef std::map MapType; #endif - public: +public: typedef MapType::const_iterator const_iterator; typedef MapType::iterator iterator; ChartCellLabelSet(const WordsRange &coverage) : m_coverage(coverage) {} - const_iterator begin() const { return m_map.begin(); } - const_iterator end() const { return m_map.end(); } - - iterator mutable_begin() { return m_map.begin(); } - iterator mutable_end() { return m_map.end(); } + const_iterator begin() const { + return m_map.begin(); + } + const_iterator end() const { + return m_map.end(); + } - void AddWord(const Word &w) - { + iterator mutable_begin() { + return m_map.begin(); + } + iterator mutable_end() { + return m_map.end(); + } + + void AddWord(const Word &w) { m_map.insert(std::make_pair(w, ChartCellLabel(m_coverage, w))); } - // Stack is a HypoList or whatever the search algorithm uses. - void AddConstituent(const Word &w, const HypoList *stack) - { + // Stack is a HypoList or whatever the search algorithm uses. + void AddConstituent(const Word &w, const HypoList *stack) { ChartCellLabel::Stack s; s.cube = stack; m_map.insert(std::make_pair(w, ChartCellLabel(m_coverage, w, s))); } - bool Empty() const { return m_map.empty(); } + bool Empty() const { + return m_map.empty(); + } - size_t GetSize() const { return m_map.size(); } + size_t GetSize() const { + return m_map.size(); + } - const ChartCellLabel *Find(const Word &w) const - { + const ChartCellLabel *Find(const Word &w) const { MapType::const_iterator p = m_map.find(w); return p == m_map.end() ? 
0 : &(p->second); } @@ -83,7 +92,7 @@ class ChartCellLabelSet return m_map.insert(std::make_pair(w, ChartCellLabel(m_coverage, w))).first->second.MutableStack(); } - private: +private: const WordsRange &m_coverage; MapType m_map; }; diff --git a/moses/ChartHypothesis.cpp b/moses/ChartHypothesis.cpp index c7c1047f1..ce5a318ac 100644 --- a/moses/ChartHypothesis.cpp +++ b/moses/ChartHypothesis.cpp @@ -39,7 +39,7 @@ namespace Moses ObjectPool ChartHypothesis::s_objectPool("ChartHypothesis", 300000); #endif -/** Create a hypothesis from a rule +/** Create a hypothesis from a rule * \param transOpt wrapper around the rule * \param item @todo dunno * \param manager reference back to manager @@ -59,15 +59,14 @@ ChartHypothesis::ChartHypothesis(const ChartTranslationOptions &transOpt, const std::vector &childEntries = item.GetHypothesisDimensions(); m_prevHypos.reserve(childEntries.size()); std::vector::const_iterator iter; - for (iter = childEntries.begin(); iter != childEntries.end(); ++iter) - { + for (iter = childEntries.begin(); iter != childEntries.end(); ++iter) { m_prevHypos.push_back(iter->GetHypothesis()); } } ChartHypothesis::~ChartHypothesis() { - // delete feature function states + // delete feature function states for (unsigned i = 0; i < m_ffStates.size(); ++i) { delete m_ffStates[i]; } @@ -98,8 +97,7 @@ void ChartHypothesis::CreateOutputPhrase(Phrase &outPhrase) const size_t nonTermInd = GetCurrTargetPhrase().GetAlignNonTerm().GetNonTermIndexMap()[pos]; const ChartHypothesis *prevHypo = m_prevHypos[nonTermInd]; prevHypo->CreateOutputPhrase(outPhrase); - } - else { + } else { outPhrase.AddWord(word); } } @@ -124,17 +122,16 @@ Phrase ChartHypothesis::GetOutputPhrase() const */ int ChartHypothesis::RecombineCompare(const ChartHypothesis &compare) const { - int comp = 0; + int comp = 0; - for (unsigned i = 0; i < m_ffStates.size(); ++i) - { - if (m_ffStates[i] == NULL || compare.m_ffStates[i] == NULL) + for (unsigned i = 0; i < m_ffStates.size(); ++i) { + if (m_ffStates[i] == NULL || compare.m_ffStates[i] == NULL) comp = m_ffStates[i] - compare.m_ffStates[i]; - else + else comp = m_ffStates[i]->Compare(*compare.m_ffStates[i]); - if (comp != 0) - return comp; + if (comp != 0) + return comp; } return 0; @@ -161,16 +158,16 @@ void ChartHypothesis::CalcScore() //Add pre-computed features m_manager.InsertPreCalculatedScores(GetCurrTargetPhrase(), &m_scoreBreakdown); - // compute values of stateless feature functions that were not + // compute values of stateless feature functions that were not // cached in the translation option-- there is no principled distinction const std::vector& sfs = - StatelessFeatureFunction::GetStatelessFeatureFunctions(); + StatelessFeatureFunction::GetStatelessFeatureFunctions(); for (unsigned i = 0; i < sfs.size(); ++i) { sfs[i]->EvaluateChart(ChartBasedFeatureContext(this),&m_scoreBreakdown); } const std::vector& ffs = - StatefulFeatureFunction::GetStatefulFeatureFunctions(); + StatefulFeatureFunction::GetStatefulFeatureFunctions(); for (unsigned i = 0; i < ffs.size(); ++i) m_ffStates[i] = ffs[i]->EvaluateChart(*this,i,&m_scoreBreakdown); @@ -262,13 +259,12 @@ std::ostream& operator<<(std::ostream& out, const ChartHypothesis& hypo) { out << hypo.GetId(); - - // recombination - if (hypo.GetWinningHypothesis() != NULL && - hypo.GetWinningHypothesis() != &hypo) - { - out << "->" << hypo.GetWinningHypothesis()->GetId(); - } + + // recombination + if (hypo.GetWinningHypothesis() != NULL && + hypo.GetWinningHypothesis() != &hypo) { + out << "->" << 
hypo.GetWinningHypothesis()->GetId(); + } if (StaticData::Instance().GetIncludeLHSInSearchGraph()) { out << " " << hypo.GetTargetLHS() << "=>"; diff --git a/moses/ChartHypothesis.h b/moses/ChartHypothesis.h index 9dc1cba92..61c2faae1 100644 --- a/moses/ChartHypothesis.h +++ b/moses/ChartHypothesis.h @@ -52,7 +52,7 @@ protected: const TargetPhrase &m_targetPhrase; WordsRange m_currSourceWordsRange; - std::vector m_ffStates; /*! stateful feature function states */ + std::vector m_ffStates; /*! stateful feature function states */ ScoreComponentCollection m_scoreBreakdown /*! detailed score break-down by components (for instance language model, word penalty, etc) */ ,m_lmNGram ,m_lmPrefix; @@ -68,8 +68,8 @@ protected: unsigned m_id; /* pkoehn wants to log the order in which hypotheses were generated */ //! not implemented - ChartHypothesis(); - + ChartHypothesis(); + //! not implemented ChartHypothesis(const ChartHypothesis ©); @@ -96,35 +96,39 @@ public: ~ChartHypothesis(); - unsigned GetId() const { return m_id; } + unsigned GetId() const { + return m_id; + } //! Get the rule that created this hypothesis const TargetPhrase &GetCurrTargetPhrase()const { return m_targetPhrase; } - + //! the source range that this hypothesis spans const WordsRange &GetCurrSourceRange()const { return m_currSourceWordsRange; } - + //! the arc list when creating n-best lists inline const ChartArcList* GetArcList() const { return m_arcList; } - + //! the feature function states for a particular feature \param featureID - inline const FFState* GetFFState( size_t featureID ) const { - return m_ffStates[ featureID ]; - } - + inline const FFState* GetFFState( size_t featureID ) const { + return m_ffStates[ featureID ]; + } + //! reference back to the manager - inline const ChartManager& GetManager() const { return m_manager; } + inline const ChartManager& GetManager() const { + return m_manager; + } void CreateOutputPhrase(Phrase &outPhrase) const; Phrase GetOutputPhrase() const; - int RecombineCompare(const ChartHypothesis &compare) const; + int RecombineCompare(const ChartHypothesis &compare) const; void CalcScore(); @@ -133,30 +137,34 @@ public: void SetWinningHypo(const ChartHypothesis *hypo); //! get the unweighted score for each feature function - const ScoreComponentCollection &GetScoreBreakdown() const - { return m_scoreBreakdown; } - - //! Get the weighted total score - float GetTotalScore() const - { return m_totalScore; } + const ScoreComponentCollection &GetScoreBreakdown() const { + return m_scoreBreakdown; + } - //! vector of previous hypotheses this hypo is built on - const std::vector &GetPrevHypos() const - { return m_prevHypos; } + //! Get the weighted total score + float GetTotalScore() const { + return m_totalScore; + } + + //! vector of previous hypotheses this hypo is built on + const std::vector &GetPrevHypos() const { + return m_prevHypos; + } //! get a particular previous hypos - const ChartHypothesis* GetPrevHypo(size_t pos) const { - return m_prevHypos[pos]; - } - + const ChartHypothesis* GetPrevHypo(size_t pos) const { + return m_prevHypos[pos]; + } + //! get the constituency label that covers this hypo const Word &GetTargetLHS() const { return GetCurrTargetPhrase().GetTargetLHS(); } //! get the best hypo in the arc list when doing n-best list creation. 
It's either this hypothesis, or the best hypo is this hypo is in the arc list - const ChartHypothesis* GetWinningHypothesis() const - { return m_winningHypo; } + const ChartHypothesis* GetWinningHypothesis() const { + return m_winningHypo; + } TO_STRING(); diff --git a/moses/ChartHypothesisCollection.cpp b/moses/ChartHypothesisCollection.cpp index 752bb7f6c..3b80f68dc 100644 --- a/moses/ChartHypothesisCollection.cpp +++ b/moses/ChartHypothesisCollection.cpp @@ -51,7 +51,7 @@ ChartHypothesisCollection::~ChartHypothesisCollection() //RemoveAllInColl(m_hypos); } -/** public function to add hypothesis to this collection. +/** public function to add hypothesis to this collection. * Returns false if equiv hypo exists in collection, otherwise returns true. * Takes care of update arc list for n-best list creation. * Will delete hypo is it exist - once this function is call don't delete hypothesis. @@ -108,8 +108,7 @@ bool ChartHypothesisCollection::AddHypothesis(ChartHypothesis *hypo, ChartManage VERBOSE(3,"worse than matching hyp " << hypoExisting->GetId() << ", recombining" << std::endl) if (m_nBestIsEnabled) { hypoExisting->AddArc(hypo); - } - else { + } else { ChartHypothesis::Delete(hypo); } return false; @@ -146,7 +145,7 @@ pair ChartHypothesisCollectio return ret; } -/** Remove hypothesis pointed to by iterator but DOES NOT delete the object. +/** Remove hypothesis pointed to by iterator but DOES NOT delete the object. * \param iter iterator to delete */ void ChartHypothesisCollection::Detach(const HCType::iterator &iter) diff --git a/moses/ChartHypothesisCollection.h b/moses/ChartHypothesisCollection.h index f88cb8302..fa707b46d 100644 --- a/moses/ChartHypothesisCollection.h +++ b/moses/ChartHypothesisCollection.h @@ -46,7 +46,7 @@ public: bool operator()(const ChartHypothesis* hypoA, const ChartHypothesis* hypoB) const { // assert in same cell const WordsRange &rangeA = hypoA->GetCurrSourceRange() - , &rangeB = hypoB->GetCurrSourceRange(); + , &rangeB = hypoB->GetCurrSourceRange(); CHECK(rangeA == rangeB); // shouldn't be mixing hypos with different lhs @@ -115,7 +115,9 @@ public: } //! return the best total score of all hypos in this collection - float GetBestScore() const { return m_bestScore; } + float GetBestScore() const { + return m_bestScore; + } void GetSearchGraph(long translationId, std::ostream &outputSearchGraphStream, const std::map &reachable) const; diff --git a/moses/ChartManager.cpp b/moses/ChartManager.cpp index fc4865af7..98f0e17f3 100644 --- a/moses/ChartManager.cpp +++ b/moses/ChartManager.cpp @@ -124,10 +124,13 @@ void ChartManager::ProcessSentence() * Doesn't seem to do anything about walls and zones. * @todo check walls & zones. Check that the implementation doesn't leak, xml options sometimes does if you're not careful */ -void ChartManager::AddXmlChartOptions() { +void ChartManager::AddXmlChartOptions() +{ const StaticData &staticData = StaticData::Instance(); const std::vector xmlChartOptionsList = m_source.GetXmlChartTranslationOptions(); - IFVERBOSE(2) { cerr << "AddXmlChartOptions " << xmlChartOptionsList.size() << endl; } + IFVERBOSE(2) { + cerr << "AddXmlChartOptions " << xmlChartOptionsList.size() << endl; + } if (xmlChartOptionsList.size() == 0) return; for(std::vector::const_iterator i = xmlChartOptionsList.begin(); @@ -160,12 +163,12 @@ const ChartHypothesis *ChartManager::GetBestHypothesis() const } } - /** Calculate the n-best paths through the output hypergraph. 
- * Return the list of paths with the variable ret - * \param count how may paths to return - * \param ret return argument - * \param onlyDistinct whether to check for distinct output sentence or not (default - don't check, just return top n-paths) - */ +/** Calculate the n-best paths through the output hypergraph. + * Return the list of paths with the variable ret + * \param count how may paths to return + * \param ret return argument + * \param onlyDistinct whether to check for distinct output sentence or not (default - don't check, just return top n-paths) + */ void ChartManager::CalcNBest(size_t count, ChartTrellisPathList &ret,bool onlyDistinct) const { size_t size = m_source.GetSize(); @@ -184,7 +187,7 @@ void ChartManager::CalcNBest(size_t count, ChartTrellisPathList &ret,bool onlyDi // Add it to the n-best list. if (count == 1) { - ret.Add(basePath); + ret.Add(basePath); return; } @@ -210,21 +213,21 @@ void ChartManager::CalcNBest(size_t count, ChartTrellisPathList &ret,bool onlyDi // Get all complete translations const HypoList *topHypos = lastCell.GetAllSortedHypotheses(); - + // Create a ChartTrellisDetour for each complete translation and add it to the queue HypoList::const_iterator iter; for (iter = topHypos->begin(); iter != topHypos->end(); ++iter) { - const ChartHypothesis &hypo = **iter; - boost::shared_ptr basePath(new ChartTrellisPath(hypo)); - ChartTrellisDetour *detour = new ChartTrellisDetour(basePath, basePath->GetFinalNode(), hypo); - contenders.Push(detour); + const ChartHypothesis &hypo = **iter; + boost::shared_ptr basePath(new ChartTrellisPath(hypo)); + ChartTrellisDetour *detour = new ChartTrellisDetour(basePath, basePath->GetFinalNode(), hypo); + contenders.Push(detour); } - + delete topHypos; // Record the output phrase if distinct translations are required. set distinctHyps; - + // MAIN loop for (size_t i = 0; ret.GetSize() < count && !contenders.Empty() && i < popLimit; ++i) { // Get the best detour from the queue. @@ -234,7 +237,7 @@ void ChartManager::CalcNBest(size_t count, ChartTrellisPathList &ret,bool onlyDi // Create a full base path from the chosen detour. //basePath.reset(new ChartTrellisPath(*detour)); boost::shared_ptr path(new ChartTrellisPath(*detour)); - + // Generate new detours from this base path and add them to the queue of // contenders. The new detours deviate from the base path by a single // replacement along the previous detour sub-path. @@ -259,17 +262,17 @@ void ChartManager::GetSearchGraph(long translationId, std::ostream &outputSearch { size_t size = m_source.GetSize(); - // which hypotheses are reachable? - std::map reachable; - WordsRange fullRange(0, size-1); - const ChartCell &lastCell = m_hypoStackColl.Get(fullRange); + // which hypotheses are reachable? 
+ std::map reachable; + WordsRange fullRange(0, size-1); + const ChartCell &lastCell = m_hypoStackColl.Get(fullRange); const ChartHypothesis *hypo = lastCell.GetBestHypothesis(); if (hypo == NULL) { // no hypothesis return; } - FindReachableHypotheses( hypo, reachable); + FindReachableHypotheses( hypo, reachable); for (size_t width = 1; width <= size; ++width) { for (size_t startPos = 0; startPos <= size-width; ++startPos) { @@ -285,42 +288,40 @@ void ChartManager::GetSearchGraph(long translationId, std::ostream &outputSearch void ChartManager::FindReachableHypotheses( const ChartHypothesis *hypo, std::map &reachable ) const { - // do not recurse, if already visited - if (reachable.find(hypo->GetId()) != reachable.end()) - { - return; - } + // do not recurse, if already visited + if (reachable.find(hypo->GetId()) != reachable.end()) { + return; + } - // recurse - reachable[ hypo->GetId() ] = true; - const std::vector &previous = hypo->GetPrevHypos(); - for(std::vector::const_iterator i = previous.begin(); i != previous.end(); ++i) - { - FindReachableHypotheses( *i, reachable ); - } + // recurse + reachable[ hypo->GetId() ] = true; + const std::vector &previous = hypo->GetPrevHypos(); + for(std::vector::const_iterator i = previous.begin(); i != previous.end(); ++i) { + FindReachableHypotheses( *i, reachable ); + } - // also loop over recombined hypotheses (arcs) - const ChartArcList *arcList = hypo->GetArcList(); - if (arcList) { - ChartArcList::const_iterator iterArc; - for (iterArc = arcList->begin(); iterArc != arcList->end(); ++iterArc) { - const ChartHypothesis &arc = **iterArc; - FindReachableHypotheses( &arc, reachable ); - } - } + // also loop over recombined hypotheses (arcs) + const ChartArcList *arcList = hypo->GetArcList(); + if (arcList) { + ChartArcList::const_iterator iterArc; + for (iterArc = arcList->begin(); iterArc != arcList->end(); ++iterArc) { + const ChartHypothesis &arc = **iterArc; + FindReachableHypotheses( &arc, reachable ); + } + } } void ChartManager::CreateDeviantPaths( - boost::shared_ptr basePath, - ChartTrellisDetourQueue &q) + boost::shared_ptr basePath, + ChartTrellisDetourQueue &q) { CreateDeviantPaths(basePath, basePath->GetFinalNode(), q); } void ChartManager::CreateDeviantPaths( - boost::shared_ptr basePath, - const ChartTrellisNode &substitutedNode, - ChartTrellisDetourQueue &queue) + boost::shared_ptr basePath, + const ChartTrellisNode &substitutedNode, + ChartTrellisDetourQueue &queue) { const ChartArcList *arcList = substitutedNode.GetHypothesis().GetArcList(); if (arcList) { @@ -340,18 +341,18 @@ void ChartManager::CreateDeviantPaths( } } - -void ChartManager::PreCalculateScores() + +void ChartManager::PreCalculateScores() { for (size_t i = 0; i < m_translationOptionList.GetSize(); ++i) { const ChartTranslationOptions& cto = m_translationOptionList.Get(i); for (TargetPhraseCollection::const_iterator j = cto.GetTargetPhraseCollection().begin(); - j != cto.GetTargetPhraseCollection().end(); ++j) { + j != cto.GetTargetPhraseCollection().end(); ++j) { const TargetPhrase* targetPhrase = *j; if (m_precalculatedScores.find(*targetPhrase) == m_precalculatedScores.end()) { ChartBasedFeatureContext context(*targetPhrase,m_source); const vector& sfs = - StatelessFeatureFunction::GetStatelessFeatureFunctions(); + StatelessFeatureFunction::GetStatelessFeatureFunctions(); ScoreComponentCollection& breakdown = m_precalculatedScores[*targetPhrase]; for (size_t k = 0; k < sfs.size(); ++k) { sfs[k]->EvaluateChart(context,&breakdown); @@ -362,18 +363,18 @@ void 
ChartManager::PreCalculateScores() } void ChartManager::InsertPreCalculatedScores( - const TargetPhrase& targetPhrase, ScoreComponentCollection* scoreBreakdown) const + const TargetPhrase& targetPhrase, ScoreComponentCollection* scoreBreakdown) const { - boost::unordered_map::const_iterator scoreIter = + boost::unordered_map::const_iterator scoreIter = m_precalculatedScores.find(targetPhrase); if (scoreIter != m_precalculatedScores.end()) { scoreBreakdown->PlusEquals(scoreIter->second); } else { TRACE_ERR("ERROR: " << targetPhrase << " missing from precalculation cache" << endl); - assert(0); + assert(0); } } - + } // namespace Moses diff --git a/moses/ChartManager.h b/moses/ChartManager.h index 7f3f24a0b..736986e05 100644 --- a/moses/ChartManager.h +++ b/moses/ChartManager.h @@ -79,35 +79,37 @@ public: void CalcNBest(size_t count, ChartTrellisPathList &ret, bool onlyDistinct=0) const; void GetSearchGraph(long translationId, std::ostream &outputSearchGraphStream) const; - void FindReachableHypotheses( const ChartHypothesis *hypo, std::map &reachable ) const; /* auxilliary function for GetSearchGraph */ + void FindReachableHypotheses( const ChartHypothesis *hypo, std::map &reachable ) const; /* auxilliary function for GetSearchGraph */ //! the input sentence being decoded const InputType& GetSource() const { return m_source; } - + //! debug data collected when decoding sentence SentenceStats& GetSentenceStats() const { return *m_sentenceStats; } - + /*** * to be called after processing a sentence (which may consist of more than just calling ProcessSentence() ) * currently an empty function */ void CalcDecoderStatistics() const { } - + void ResetSentenceStats(const InputType& source) { m_sentenceStats = std::auto_ptr(new SentenceStats(source)); } //! contigious hypo id for each input sentence. For debugging purposes - unsigned GetNextHypoId() { return m_hypothesisId++; } + unsigned GetNextHypoId() { + return m_hypothesisId++; + } //! Access the pre-calculated values void InsertPreCalculatedScores(const TargetPhrase& targetPhrase, - ScoreComponentCollection* scoreBreakdown) const; + ScoreComponentCollection* scoreBreakdown) const; }; diff --git a/moses/ChartParser.cpp b/moses/ChartParser.cpp index 805bec7ab..0dba600e1 100644 --- a/moses/ChartParser.cpp +++ b/moses/ChartParser.cpp @@ -35,16 +35,18 @@ extern bool g_debug; ChartParserUnknown::ChartParserUnknown() {} -ChartParserUnknown::~ChartParserUnknown() { +ChartParserUnknown::~ChartParserUnknown() +{ RemoveAllInColl(m_unksrcs); RemoveAllInColl(m_cacheTargetPhraseCollection); } -void ChartParserUnknown::Process(const Word &sourceWord, const WordsRange &range, ChartParserCallback &to) { +void ChartParserUnknown::Process(const Word &sourceWord, const WordsRange &range, ChartParserCallback &to) +{ // unknown word, add as trans opt const StaticData &staticData = StaticData::Instance(); const UnknownWordPenaltyProducer *unknownWordPenaltyProducer = staticData.GetUnknownWordPenaltyProducer(); - + size_t isDigit = 0; if (staticData.GetDropUnknown()) { const Factor *f = sourceWord[0]; // TODO hack. shouldn't know which factor is surface @@ -56,11 +58,11 @@ void ChartParserUnknown::Process(const Word &sourceWord, const WordsRange &range isDigit = 1; // modify the starting bitmap } - + Phrase* unksrc = new Phrase(1); unksrc->AddWord() = sourceWord; m_unksrcs.push_back(unksrc); - + //TranslationOption *transOpt; if (! 
staticData.GetDropUnknown() || isDigit) { // loop @@ -69,19 +71,19 @@ void ChartParserUnknown::Process(const Word &sourceWord, const WordsRange &range for (iterLHS = lhsList.begin(); iterLHS != lhsList.end(); ++iterLHS) { const string &targetLHSStr = iterLHS->first; float prob = iterLHS->second; - + // lhs //const Word &sourceLHS = staticData.GetInputDefaultNonTerminal(); Word *targetLHS = new Word(true); - + targetLHS->CreateFromString(Output, staticData.GetOutputFactorOrder(), targetLHSStr, true); CHECK(targetLHS->GetFactor(0) != NULL); - + // add to dictionary TargetPhrase *targetPhrase = new TargetPhrase(); Word &targetWord = targetPhrase->AddWord(); targetWord.CreateUnknownWord(sourceWord); - + // scores float unknownScore = FloorScore(TransformScore(prob)); @@ -98,7 +100,7 @@ void ChartParserUnknown::Process(const Word &sourceWord, const WordsRange &range } else { // drop source word. create blank trans opt float unknownScore = FloorScore(-numeric_limits::infinity()); - + TargetPhrase *targetPhrase = new TargetPhrase(); // loop const UnknownLHSList &lhsList = staticData.GetUnknownLHS(); @@ -106,11 +108,11 @@ void ChartParserUnknown::Process(const Word &sourceWord, const WordsRange &range for (iterLHS = lhsList.begin(); iterLHS != lhsList.end(); ++iterLHS) { const string &targetLHSStr = iterLHS->first; //float prob = iterLHS->second; - + Word *targetLHS = new Word(true); targetLHS->CreateFromString(Output, staticData.GetOutputFactorOrder(), targetLHSStr, true); CHECK(targetLHS->GetFactor(0) != NULL); - + targetPhrase->GetScoreBreakdown().Assign(unknownWordPenaltyProducer, unknownScore); targetPhrase->Evaluate(*unksrc); @@ -125,7 +127,8 @@ void ChartParserUnknown::Process(const Word &sourceWord, const WordsRange &range ChartParser::ChartParser(InputType const &source, ChartCellCollectionBase &cells) : m_decodeGraphList(StaticData::Instance().GetDecodeGraphs()), - m_source(source) { + m_source(source) +{ const StaticData &staticData = StaticData::Instance(); staticData.InitializeForInput(source); @@ -139,14 +142,16 @@ ChartParser::ChartParser(InputType const &source, ChartCellCollectionBase &cells } } -ChartParser::~ChartParser() { +ChartParser::~ChartParser() +{ RemoveAllInColl(m_ruleLookupManagers); StaticData::Instance().CleanUpAfterSentenceProcessing(m_source); } -void ChartParser::Create(const WordsRange &wordsRange, ChartParserCallback &to) { +void ChartParser::Create(const WordsRange &wordsRange, ChartParserCallback &to) +{ assert(m_decodeGraphList.size() == m_ruleLookupManagers.size()); - + std::vector ::const_iterator iterDecodeGraph; std::vector ::const_iterator iterRuleLookupManagers = m_ruleLookupManagers.begin(); for (iterDecodeGraph = m_decodeGraphList.begin(); iterDecodeGraph != m_decodeGraphList.end(); ++iterDecodeGraph, ++iterRuleLookupManagers) { @@ -158,7 +163,7 @@ void ChartParser::Create(const WordsRange &wordsRange, ChartParserCallback &to) ruleLookupManager.GetChartRuleCollection(wordsRange, to); } } - + if (wordsRange.GetNumWordsCovered() == 1 && wordsRange.GetStartPos() != 0 && wordsRange.GetStartPos() != m_source.GetSize()-1) { bool alwaysCreateDirectTranslationOption = StaticData::Instance().IsAlwaysCreateDirectTranslationOption(); if (to.Empty() || alwaysCreateDirectTranslationOption) { @@ -166,7 +171,7 @@ void ChartParser::Create(const WordsRange &wordsRange, ChartParserCallback &to) const Word &sourceWord = m_source.GetWord(wordsRange.GetStartPos()); m_unknown.Process(sourceWord, wordsRange, to); } - } + } } - + } // namespace Moses diff --git 
a/moses/ChartParser.h b/moses/ChartParser.h index 9d8baa649..1ff99480d 100644 --- a/moses/ChartParser.h +++ b/moses/ChartParser.h @@ -39,31 +39,33 @@ class Phrase; class TargetPhraseCollection; class DecodeGraph; -class ChartParserUnknown { - public: - ChartParserUnknown(); - ~ChartParserUnknown(); +class ChartParserUnknown +{ +public: + ChartParserUnknown(); + ~ChartParserUnknown(); - void Process(const Word &sourceWord, const WordsRange &range, ChartParserCallback &to); + void Process(const Word &sourceWord, const WordsRange &range, ChartParserCallback &to); - private: - std::vector m_unksrcs; - std::list m_cacheTargetPhraseCollection; - StackVec m_emptyStackVec; +private: + std::vector m_unksrcs; + std::list m_cacheTargetPhraseCollection; + StackVec m_emptyStackVec; }; -class ChartParser { - public: - ChartParser(const InputType &source, ChartCellCollectionBase &cells); - ~ChartParser(); +class ChartParser +{ +public: + ChartParser(const InputType &source, ChartCellCollectionBase &cells); + ~ChartParser(); - void Create(const WordsRange &range, ChartParserCallback &to); + void Create(const WordsRange &range, ChartParserCallback &to); - private: - ChartParserUnknown m_unknown; - std::vector m_decodeGraphList; - std::vector m_ruleLookupManagers; - InputType const& m_source; /**< source sentence to be translated */ +private: + ChartParserUnknown m_unknown; + std::vector m_decodeGraphList; + std::vector m_ruleLookupManagers; + InputType const& m_source; /**< source sentence to be translated */ }; } diff --git a/moses/ChartParserCallback.h b/moses/ChartParserCallback.h index 797a57156..84ddb8b75 100644 --- a/moses/ChartParserCallback.h +++ b/moses/ChartParserCallback.h @@ -4,21 +4,23 @@ #include -namespace Moses { +namespace Moses +{ class TargetPhraseCollection; class WordsRange; class TargetPhrase; -class ChartParserCallback { - public: - virtual ~ChartParserCallback() {} +class ChartParserCallback +{ +public: + virtual ~ChartParserCallback() {} - virtual void Add(const TargetPhraseCollection &, const StackVec &, const WordsRange &) = 0; + virtual void Add(const TargetPhraseCollection &, const StackVec &, const WordsRange &) = 0; - virtual bool Empty() const = 0; + virtual bool Empty() const = 0; - virtual void AddPhraseOOV(TargetPhrase &phrase, std::list &waste_memory, const WordsRange &range) = 0; + virtual void AddPhraseOOV(TargetPhrase &phrase, std::list &waste_memory, const WordsRange &range) = 0; }; } // namespace Moses diff --git a/moses/ChartRuleLookupManager.h b/moses/ChartRuleLookupManager.h index da8c98cb4..ad936ff9c 100644 --- a/moses/ChartRuleLookupManager.h +++ b/moses/ChartRuleLookupManager.h @@ -50,7 +50,7 @@ public: const InputType &GetSentence() const { return m_sentence; } - + const ChartCellLabelSet &GetTargetLabelSet(size_t begin, size_t end) const { return m_cellCollection.GetBase(WordsRange(begin, end)).GetTargetLabelSet(); } diff --git a/moses/ChartTranslationOptionList.cpp b/moses/ChartTranslationOptionList.cpp index 8f4422e23..5b72ea7a3 100644 --- a/moses/ChartTranslationOptionList.cpp +++ b/moses/ChartTranslationOptionList.cpp @@ -74,11 +74,11 @@ void ChartTranslationOptionList::Add(const TargetPhraseCollection &tpc, if (m_size == m_collection.size()) { // m_collection has reached capacity: create a new object. m_collection.push_back(new ChartTranslationOptions(tpc, stackVec, - range, score)); + range, score)); } else { // Overwrite an unused object. 
*(m_collection[m_size]) = ChartTranslationOptions(tpc, stackVec, - range, score); + range, score); } ++m_size; @@ -98,7 +98,8 @@ void ChartTranslationOptionList::Add(const TargetPhraseCollection &tpc, } } -void ChartTranslationOptionList::AddPhraseOOV(TargetPhrase &phrase, std::list &waste_memory, const WordsRange &range) { +void ChartTranslationOptionList::AddPhraseOOV(TargetPhrase &phrase, std::list &waste_memory, const WordsRange &range) +{ TargetPhraseCollection *tpc = new TargetPhraseCollection(); tpc->Add(&phrase); waste_memory.push_back(tpc); @@ -106,7 +107,8 @@ void ChartTranslationOptionList::AddPhraseOOV(TargetPhrase &phrase, std::list m_ruleLimit) { // Something's gone wrong if the list has grown to m_ruleLimit * 2 // without being pruned. @@ -134,8 +136,8 @@ void ChartTranslationOptionList::ApplyThreshold() { scoreThreshold += StaticData::Instance().GetTranslationOptionThreshold(); CollType::iterator bound = std::partition(m_collection.begin(), - m_collection.begin()+m_size, - ScoreThresholdPred(scoreThreshold)); + m_collection.begin()+m_size, + ScoreThresholdPred(scoreThreshold)); m_size = std::distance(m_collection.begin(), bound); } diff --git a/moses/ChartTranslationOptionList.h b/moses/ChartTranslationOptionList.h index 0b56b1f61..a2979fcbc 100644 --- a/moses/ChartTranslationOptionList.h +++ b/moses/ChartTranslationOptionList.h @@ -32,27 +32,34 @@ class TargetPhraseCollection; class WordsRange; //! a vector of translations options for a specific range, in a specific sentence -class ChartTranslationOptionList : public ChartParserCallback { - public: +class ChartTranslationOptionList : public ChartParserCallback +{ +public: ChartTranslationOptionList(size_t); ~ChartTranslationOptionList(); - const ChartTranslationOptions &Get(size_t i) const { return *m_collection[i]; } + const ChartTranslationOptions &Get(size_t i) const { + return *m_collection[i]; + } //! 
number of translation options - size_t GetSize() const { return m_size; } + size_t GetSize() const { + return m_size; + } void Add(const TargetPhraseCollection &, const StackVec &, const WordsRange &); void AddPhraseOOV(TargetPhrase &phrase, std::list &waste_memory, const WordsRange &range); - bool Empty() const { return m_size == 0; } + bool Empty() const { + return m_size == 0; + } void Clear(); void ApplyThreshold(); - private: +private: typedef std::vector CollType; struct ScoreThresholdPred { diff --git a/moses/ChartTranslationOptions.cpp b/moses/ChartTranslationOptions.cpp index c55948a82..5ba88a0db 100644 --- a/moses/ChartTranslationOptions.cpp +++ b/moses/ChartTranslationOptions.cpp @@ -27,8 +27,8 @@ namespace Moses { float ChartTranslationOptions::CalcEstimateOfBestScore( - const TargetPhraseCollection &tpc, - const StackVec &stackVec) + const TargetPhraseCollection &tpc, + const StackVec &stackVec) { const TargetPhrase &targetPhrase = **(tpc.begin()); float estimateOfBestScore = targetPhrase.GetFutureScore(); diff --git a/moses/ChartTranslationOptions.h b/moses/ChartTranslationOptions.h index 4910723f7..459c91659 100644 --- a/moses/ChartTranslationOptions.h +++ b/moses/ChartTranslationOptions.h @@ -35,7 +35,7 @@ namespace Moses */ class ChartTranslationOptions { - public: +public: /** Constructor \param targetPhraseColl @todo dunno \param stackVec @todo dunno @@ -43,13 +43,13 @@ class ChartTranslationOptions \param score @todo dunno */ ChartTranslationOptions(const TargetPhraseCollection &targetPhraseColl, - const StackVec &stackVec, - const WordsRange &wordsRange, - float score) - : m_stackVec(stackVec) - , m_targetPhraseCollection(&targetPhraseColl) - , m_wordsRange(&wordsRange) - , m_estimateOfBestScore(score) + const StackVec &stackVec, + const WordsRange &wordsRange, + float score) + : m_stackVec(stackVec) + , m_targetPhraseCollection(&targetPhraseColl) + , m_wordsRange(&wordsRange) + , m_estimateOfBestScore(score) {} ~ChartTranslationOptions() {} @@ -58,10 +58,12 @@ class ChartTranslationOptions const StackVec &); //! @todo dunno - const StackVec &GetStackVec() const { return m_stackVec; } + const StackVec &GetStackVec() const { + return m_stackVec; + } //! @todo isn't the translation suppose to just contain 1 target phrase, not a whole collection of them? - const TargetPhraseCollection &GetTargetPhraseCollection() const { + const TargetPhraseCollection &GetTargetPhraseCollection() const { return *m_targetPhraseCollection; } @@ -74,9 +76,11 @@ class ChartTranslationOptions * the estimate is the sum of the top target phrase's estimated score plus the * scores of the best child hypotheses. */ - inline float GetEstimateOfBestScore() const { return m_estimateOfBestScore; } + inline float GetEstimateOfBestScore() const { + return m_estimateOfBestScore; + } - private: +private: StackVec m_stackVec; //! vector of hypothesis list! 
const TargetPhraseCollection *m_targetPhraseCollection; diff --git a/moses/ChartTrellisDetour.cpp b/moses/ChartTrellisDetour.cpp index 550a44a2c..1a187396c 100644 --- a/moses/ChartTrellisDetour.cpp +++ b/moses/ChartTrellisDetour.cpp @@ -1,17 +1,17 @@ /*********************************************************************** Moses - statistical machine translation system Copyright (C) 2006-2011 University of Edinburgh - + This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. - + This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. - + You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA @@ -27,15 +27,15 @@ namespace Moses { ChartTrellisDetour::ChartTrellisDetour( - boost::shared_ptr basePath, - const ChartTrellisNode &substitutedNode, - const ChartHypothesis &replacementHypo) + boost::shared_ptr basePath, + const ChartTrellisNode &substitutedNode, + const ChartHypothesis &replacementHypo) : m_basePath(basePath) , m_substitutedNode(substitutedNode) , m_replacementHypo(replacementHypo) { float diff = replacementHypo.GetTotalScore() - - substitutedNode.GetHypothesis().GetTotalScore(); + - substitutedNode.GetHypothesis().GetTotalScore(); m_totalScore = basePath->GetTotalScore() + diff; } diff --git a/moses/ChartTrellisDetour.h b/moses/ChartTrellisDetour.h index 977ccb67d..26c98bef8 100644 --- a/moses/ChartTrellisDetour.h +++ b/moses/ChartTrellisDetour.h @@ -1,17 +1,17 @@ /*********************************************************************** Moses - statistical machine translation system Copyright (C) 2006-2011 University of Edinburgh - + This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. - + This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
- + You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA @@ -31,20 +31,24 @@ class ChartTrellisPath; */ class ChartTrellisDetour { - public: +public: ChartTrellisDetour(boost::shared_ptr, const ChartTrellisNode &, const ChartHypothesis &); - const ChartTrellisPath &GetBasePath() const { return *m_basePath; } + const ChartTrellisPath &GetBasePath() const { + return *m_basePath; + } const ChartTrellisNode &GetSubstitutedNode() const { return m_substitutedNode; } const ChartHypothesis &GetReplacementHypo() const { return m_replacementHypo; } - float GetTotalScore() const { return m_totalScore; } + float GetTotalScore() const { + return m_totalScore; + } - private: +private: boost::shared_ptr m_basePath; const ChartTrellisNode &m_substitutedNode; const ChartHypothesis &m_replacementHypo; diff --git a/moses/ChartTrellisDetourQueue.cpp b/moses/ChartTrellisDetourQueue.cpp index 9b359ca43..4bb81d20b 100644 --- a/moses/ChartTrellisDetourQueue.cpp +++ b/moses/ChartTrellisDetourQueue.cpp @@ -1,17 +1,17 @@ /*********************************************************************** Moses - statistical machine translation system Copyright (C) 2006-2011 University of Edinburgh - + This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. - + This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. - + You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA @@ -21,13 +21,16 @@ #include "Util.h" -namespace Moses { +namespace Moses +{ -ChartTrellisDetourQueue::~ChartTrellisDetourQueue() { +ChartTrellisDetourQueue::~ChartTrellisDetourQueue() +{ RemoveAllInColl(m_queue); } -void ChartTrellisDetourQueue::Push(const ChartTrellisDetour *detour) { +void ChartTrellisDetourQueue::Push(const ChartTrellisDetour *detour) +{ if (m_capacity == 0 || m_queue.size() < m_capacity) { m_queue.insert(detour); } else if (detour->GetTotalScore() > (*m_queue.rbegin())->GetTotalScore()) { @@ -43,7 +46,8 @@ void ChartTrellisDetourQueue::Push(const ChartTrellisDetour *detour) { } } -const ChartTrellisDetour *ChartTrellisDetourQueue::Pop() { +const ChartTrellisDetour *ChartTrellisDetourQueue::Pop() +{ QueueType::iterator p = m_queue.begin(); const ChartTrellisDetour *top = *p; m_queue.erase(p); diff --git a/moses/ChartTrellisDetourQueue.h b/moses/ChartTrellisDetourQueue.h index d6505d8a2..2406a69f5 100644 --- a/moses/ChartTrellisDetourQueue.h +++ b/moses/ChartTrellisDetourQueue.h @@ -1,17 +1,17 @@ /*********************************************************************** Moses - statistical machine translation system Copyright (C) 2006-2011 University of Edinburgh - + This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. 
- + This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. - + You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA @@ -23,20 +23,24 @@ #include -namespace Moses { +namespace Moses +{ /** A bounded priority queue of ChartTrellisDetour pointers. The top item is * the best scoring detour. The queue assumes ownership of pushed items and * relinquishes ownership when they are popped. Any remaining items at the * time of the queue's destruction are deleted. */ -class ChartTrellisDetourQueue { - public: +class ChartTrellisDetourQueue +{ +public: // Create empty queue with fixed capacity of c. Capacity 0 means unbounded. ChartTrellisDetourQueue(size_t c) : m_capacity(c) {} ~ChartTrellisDetourQueue(); - bool Empty() const { return m_queue.empty(); } + bool Empty() const { + return m_queue.empty(); + } // Add the detour to the queue or delete it if the queue is full and the // score is no better than the queue's worst score. @@ -46,7 +50,7 @@ class ChartTrellisDetourQueue { // caller is responsible for deleting the object. const ChartTrellisDetour *Pop(); - private: +private: struct DetourOrderer { bool operator()(const ChartTrellisDetour* a, const ChartTrellisDetour* b) const { diff --git a/moses/ChartTrellisNode.cpp b/moses/ChartTrellisNode.cpp index e55d4b1ab..73651f507 100644 --- a/moses/ChartTrellisNode.cpp +++ b/moses/ChartTrellisNode.cpp @@ -29,16 +29,16 @@ namespace Moses { ChartTrellisNode::ChartTrellisNode(const ChartHypothesis &hypo) - : m_hypo(hypo) + : m_hypo(hypo) { CreateChildren(); } ChartTrellisNode::ChartTrellisNode(const ChartTrellisDetour &detour, ChartTrellisNode *&deviationPoint) - : m_hypo((&detour.GetBasePath().GetFinalNode() == &detour.GetSubstitutedNode()) - ? detour.GetReplacementHypo() - : detour.GetBasePath().GetFinalNode().GetHypothesis()) + : m_hypo((&detour.GetBasePath().GetFinalNode() == &detour.GetSubstitutedNode()) + ? detour.GetReplacementHypo() + : detour.GetBasePath().GetFinalNode().GetHypothesis()) { if (&m_hypo == &detour.GetReplacementHypo()) { deviationPoint = this; @@ -54,9 +54,9 @@ ChartTrellisNode::ChartTrellisNode(const ChartTrellisNode &root, const ChartTrellisNode &substitutedNode, const ChartHypothesis &replacementHypo, ChartTrellisNode *&deviationPoint) - : m_hypo((&root == &substitutedNode) - ? replacementHypo - : root.GetHypothesis()) + : m_hypo((&root == &substitutedNode) + ? 
replacementHypo + : root.GetHypothesis()) { if (&root == &substitutedNode) { deviationPoint = this; @@ -118,8 +118,8 @@ void ChartTrellisNode::CreateChildren(const ChartTrellisNode &rootNode, for (size_t ind = 0; ind < children.size(); ++ind) { const ChartTrellisNode *origChild = children[ind]; ChartTrellisNode *child = new ChartTrellisNode(*origChild, substitutedNode, - replacementHypo, - deviationPoint); + replacementHypo, + deviationPoint); m_children.push_back(child); } } diff --git a/moses/ChartTrellisNode.h b/moses/ChartTrellisNode.h index 58203677e..643809728 100644 --- a/moses/ChartTrellisNode.h +++ b/moses/ChartTrellisNode.h @@ -34,7 +34,7 @@ class ChartTrellisDetour; */ class ChartTrellisNode { - public: +public: typedef std::vector NodeChildren; ChartTrellisNode(const ChartHypothesis &hypo); @@ -42,15 +42,21 @@ class ChartTrellisNode ~ChartTrellisNode(); - const ChartHypothesis &GetHypothesis() const { return m_hypo; } + const ChartHypothesis &GetHypothesis() const { + return m_hypo; + } - const NodeChildren &GetChildren() const { return m_children; } + const NodeChildren &GetChildren() const { + return m_children; + } - const ChartTrellisNode &GetChild(size_t i) const { return *m_children[i]; } + const ChartTrellisNode &GetChild(size_t i) const { + return *m_children[i]; + } Phrase GetOutputPhrase() const; - private: +private: ChartTrellisNode(const ChartTrellisNode &); // Not implemented ChartTrellisNode& operator=(const ChartTrellisNode &); // Not implemented diff --git a/moses/ChartTrellisPath.cpp b/moses/ChartTrellisPath.cpp index 231d4237a..c53e636e9 100644 --- a/moses/ChartTrellisPath.cpp +++ b/moses/ChartTrellisPath.cpp @@ -30,17 +30,17 @@ namespace Moses { ChartTrellisPath::ChartTrellisPath(const ChartHypothesis &hypo) - : m_finalNode(new ChartTrellisNode(hypo)) - , m_deviationPoint(NULL) - , m_scoreBreakdown(hypo.GetScoreBreakdown()) - , m_totalScore(hypo.GetTotalScore()) + : m_finalNode(new ChartTrellisNode(hypo)) + , m_deviationPoint(NULL) + , m_scoreBreakdown(hypo.GetScoreBreakdown()) + , m_totalScore(hypo.GetTotalScore()) { } ChartTrellisPath::ChartTrellisPath(const ChartTrellisDetour &detour) - : m_finalNode(new ChartTrellisNode(detour, m_deviationPoint)) - , m_scoreBreakdown(detour.GetBasePath().m_scoreBreakdown) - , m_totalScore(0) + : m_finalNode(new ChartTrellisNode(detour, m_deviationPoint)) + , m_scoreBreakdown(detour.GetBasePath().m_scoreBreakdown) + , m_totalScore(0) { CHECK(m_deviationPoint); ScoreComponentCollection scoreChange; diff --git a/moses/ChartTrellisPath.h b/moses/ChartTrellisPath.h index 6e5d50e0c..1023ad7b4 100644 --- a/moses/ChartTrellisPath.h +++ b/moses/ChartTrellisPath.h @@ -41,18 +41,24 @@ class ChartTrellisNode; */ class ChartTrellisPath { - public: +public: ChartTrellisPath(const ChartHypothesis &hypo); ChartTrellisPath(const ChartTrellisDetour &detour); ~ChartTrellisPath(); - const ChartTrellisNode &GetFinalNode() const { return *m_finalNode; } + const ChartTrellisNode &GetFinalNode() const { + return *m_finalNode; + } - const ChartTrellisNode *GetDeviationPoint() const { return m_deviationPoint; } + const ChartTrellisNode *GetDeviationPoint() const { + return m_deviationPoint; + } //! 
get score for this path throught trellis - float GetTotalScore() const { return m_totalScore; } + float GetTotalScore() const { + return m_totalScore; + } Phrase GetOutputPhrase() const; @@ -61,7 +67,7 @@ class ChartTrellisPath return m_scoreBreakdown; } - private: +private: ChartTrellisPath(const ChartTrellisPath &); // Not implemented ChartTrellisPath &operator=(const ChartTrellisPath &); // Not implemented diff --git a/moses/ConfusionNet.h b/moses/ConfusionNet.h index 55fa0c8bf..c9c83e154 100644 --- a/moses/ConfusionNet.h +++ b/moses/ConfusionNet.h @@ -15,7 +15,7 @@ class FactorCollection; class TranslationOptionCollection; class Sentence; -/** An input to the decoder where each position can be 1 of a number of words, +/** An input to the decoder where each position can be 1 of a number of words, * each with an associated probability. Compared with a sentence, where each position is a word */ class ConfusionNet : public InputType diff --git a/moses/DecodeFeature.cpp b/moses/DecodeFeature.cpp index ebec7a7e3..57137170e 100644 --- a/moses/DecodeFeature.cpp +++ b/moses/DecodeFeature.cpp @@ -30,8 +30,8 @@ using namespace std; namespace Moses { DecodeFeature::DecodeFeature( const std::string& description - , const std::string &line) -: StatelessFeatureFunction(description, line) + , const std::string &line) + : StatelessFeatureFunction(description, line) { VERBOSE(2,"DecodeFeature:" << std::endl); for (size_t i = 0; i < m_args.size(); ++i) { @@ -40,8 +40,7 @@ DecodeFeature::DecodeFeature( const std::string& description if (args[0] == "input-factor") { m_input =Tokenize(args[1], ","); m_inputFactors = FactorMask(m_input); - } - else if (args[0] == "output-factor") { + } else if (args[0] == "output-factor") { m_output =Tokenize(args[1], ","); m_outputFactors = FactorMask(m_output); } @@ -50,20 +49,20 @@ DecodeFeature::DecodeFeature( const std::string& description } DecodeFeature::DecodeFeature( const std::string& description - , size_t numScoreComponents - , const std::string &line) -: StatelessFeatureFunction(description,numScoreComponents, line) + , size_t numScoreComponents + , const std::string &line) + : StatelessFeatureFunction(description,numScoreComponents, line) { VERBOSE(2,"DecodeFeature: no factors yet" << std::endl); } DecodeFeature::DecodeFeature(const std::string& description - , size_t numScoreComponents - , const std::vector &input - , const std::vector &output - , const std::string &line) -: StatelessFeatureFunction(description,numScoreComponents, line) -, m_input(input), m_output(output) + , size_t numScoreComponents + , const std::vector &input + , const std::vector &output + , const std::string &line) + : StatelessFeatureFunction(description,numScoreComponents, line) + , m_input(input), m_output(output) { m_inputFactors = FactorMask(input); m_outputFactors = FactorMask(output); diff --git a/moses/DecodeFeature.h b/moses/DecodeFeature.h index b6352b181..d6cf3a323 100644 --- a/moses/DecodeFeature.h +++ b/moses/DecodeFeature.h @@ -34,9 +34,10 @@ namespace Moses /** * Baseclass for phrase-table or generation table feature function **/ -class DecodeFeature : public StatelessFeatureFunction { +class DecodeFeature : public StatelessFeatureFunction +{ - public: +public: DecodeFeature( const std::string& description , const std::string &line); @@ -45,28 +46,29 @@ class DecodeFeature : public StatelessFeatureFunction { , const std::string &line); DecodeFeature( const std::string& description - , size_t numScoreComponents - , const std::vector &input - , const std::vector &output - , 
const std::string &line); - - //! returns output factor types as specified by the ini file - const FactorMask& GetOutputFactorMask() const; - - //! returns input factor types as specified by the ini file - const FactorMask& GetInputFactorMask() const; - - const std::vector& GetInput() const; - const std::vector& GetOutput() const; + , size_t numScoreComponents + , const std::vector &input + , const std::vector &output + , const std::string &line); - bool IsDecodeFeature() const - { return true; } + //! returns output factor types as specified by the ini file + const FactorMask& GetOutputFactorMask() const; - protected: - std::vector m_input; - std::vector m_output; - FactorMask m_inputFactors; - FactorMask m_outputFactors; + //! returns input factor types as specified by the ini file + const FactorMask& GetInputFactorMask() const; + + const std::vector& GetInput() const; + const std::vector& GetOutput() const; + + bool IsDecodeFeature() const { + return true; + } + +protected: + std::vector m_input; + std::vector m_output; + FactorMask m_inputFactors; + FactorMask m_outputFactors; }; } diff --git a/moses/DecodeStepTranslation.cpp b/moses/DecodeStepTranslation.cpp index 0acd3479f..e4dbb673d 100644 --- a/moses/DecodeStepTranslation.cpp +++ b/moses/DecodeStepTranslation.cpp @@ -94,9 +94,9 @@ void DecodeStepTranslation::Process(const TranslationOption &inputPartialTranslO void DecodeStepTranslation::ProcessInitialTranslation( - const InputType &source - ,PartialTranslOptColl &outputPartialTranslOptColl - , size_t startPos, size_t endPos, bool adhereTableLimit) const + const InputType &source + ,PartialTranslOptColl &outputPartialTranslOptColl + , size_t startPos, size_t endPos, bool adhereTableLimit) const { const PhraseDictionary* phraseDictionary = GetPhraseDictionaryFeature(); const size_t tableLimit = phraseDictionary->GetTableLimit(); diff --git a/moses/FF/BleuScoreFeature.cpp b/moses/FF/BleuScoreFeature.cpp index 091035b0f..7808d6012 100644 --- a/moses/FF/BleuScoreFeature.cpp +++ b/moses/FF/BleuScoreFeature.cpp @@ -5,90 +5,94 @@ using namespace std; -namespace Moses { +namespace Moses +{ size_t BleuScoreState::bleu_order = 4; BleuScoreState::BleuScoreState(): m_words(1), - m_source_length(0), - m_target_length(0), - m_scaled_ref_length(0), - m_ngram_counts(bleu_order), - m_ngram_matches(bleu_order) + m_source_length(0), + m_target_length(0), + m_scaled_ref_length(0), + m_ngram_counts(bleu_order), + m_ngram_matches(bleu_order) { } int BleuScoreState::Compare(const FFState& o) const { - if (&o == this) - return 0; - - const StaticData &staticData = StaticData::Instance(); - SearchAlgorithm searchAlgorithm = staticData.GetSearchAlgorithm(); - bool chartDecoding = (searchAlgorithm == ChartDecoding); - if (chartDecoding) - return 0; - - const BleuScoreState& other = dynamic_cast(o); - int c = m_words.Compare(other.m_words); - if (c != 0) - return c; - - /*for(size_t i = 0; i < m_ngram_counts.size(); i++) { - if (m_ngram_counts[i] < other.m_ngram_counts[i]) - return -1; - if (m_ngram_counts[i] > other.m_ngram_counts[i]) - return 1; - if (m_ngram_matches[i] < other.m_ngram_matches[i]) - return -1; - if (m_ngram_matches[i] > other.m_ngram_matches[i]) - return 1; - }*/ - + if (&o == this) return 0; + + const StaticData &staticData = StaticData::Instance(); + SearchAlgorithm searchAlgorithm = staticData.GetSearchAlgorithm(); + bool chartDecoding = (searchAlgorithm == ChartDecoding); + if (chartDecoding) + return 0; + + const BleuScoreState& other = dynamic_cast(o); + int c = 
m_words.Compare(other.m_words); + if (c != 0) + return c; + + /*for(size_t i = 0; i < m_ngram_counts.size(); i++) { + if (m_ngram_counts[i] < other.m_ngram_counts[i]) + return -1; + if (m_ngram_counts[i] > other.m_ngram_counts[i]) + return 1; + if (m_ngram_matches[i] < other.m_ngram_matches[i]) + return -1; + if (m_ngram_matches[i] > other.m_ngram_matches[i]) + return 1; + }*/ + + return 0; } -std::ostream& operator<<(std::ostream& out, const BleuScoreState& state) { +std::ostream& operator<<(std::ostream& out, const BleuScoreState& state) +{ state.print(out); return out; } -void BleuScoreState::print(std::ostream& out) const { +void BleuScoreState::print(std::ostream& out) const +{ out << "ref=" << m_scaled_ref_length - << ";source=" << m_source_length - << ";target=" << m_target_length << ";counts="; + << ";source=" << m_source_length + << ";target=" << m_target_length << ";counts="; for (size_t i = 0; i < bleu_order; ++i) { out << m_ngram_matches[i] << "/" << m_ngram_counts[i] << ","; } out << "ctxt=" << m_words; - + } void BleuScoreState::AddNgramCountAndMatches(std::vector< size_t >& counts, - std::vector< size_t >& matches) { - for (size_t order = 0; order < BleuScoreState::bleu_order; ++order) { - m_ngram_counts[order] += counts[order]; - m_ngram_matches[order] += matches[order]; - } + std::vector< size_t >& matches) +{ + for (size_t order = 0; order < BleuScoreState::bleu_order; ++order) { + m_ngram_counts[order] += counts[order]; + m_ngram_matches[order] += matches[order]; + } } BleuScoreFeature::BleuScoreFeature(const std::string &line) -:StatefulFeatureFunction("BleuScoreFeature",1, line), -m_enabled(true), -m_sentence_bleu(true), -m_simple_history_bleu(false), -m_count_history(BleuScoreState::bleu_order), -m_match_history(BleuScoreState::bleu_order), -m_source_length_history(0), -m_target_length_history(0), -m_ref_length_history(0), -m_scale_by_input_length(true), -m_scale_by_avg_input_length(false), -m_scale_by_inverse_length(false), -m_scale_by_avg_inverse_length(false), -m_scale_by_x(1), -m_historySmoothing(0.9), -m_smoothing_scheme(PLUS_POINT_ONE) + :StatefulFeatureFunction("BleuScoreFeature",1, line), + m_enabled(true), + m_sentence_bleu(true), + m_simple_history_bleu(false), + m_count_history(BleuScoreState::bleu_order), + m_match_history(BleuScoreState::bleu_order), + m_source_length_history(0), + m_target_length_history(0), + m_ref_length_history(0), + m_scale_by_input_length(true), + m_scale_by_avg_input_length(false), + m_scale_by_inverse_length(false), + m_scale_by_avg_inverse_length(false), + m_scale_by_x(1), + m_historySmoothing(0.9), + m_smoothing_scheme(PLUS_POINT_ONE) { for (size_t i = 0; i < m_args.size(); ++i) { const vector &args = m_args[i]; @@ -131,10 +135,11 @@ m_smoothing_scheme(PLUS_POINT_ONE) } // for (size_t i = 0; i < toks.size(); ++i) { } -void BleuScoreFeature::PrintHistory(std::ostream& out) const { - out << "source length history=" << m_source_length_history << endl; - out << "target length history=" << m_target_length_history << endl; - out << "ref length history=" << m_ref_length_history << endl; +void BleuScoreFeature::PrintHistory(std::ostream& out) const +{ + out << "source length history=" << m_source_length_history << endl; + out << "target length history=" << m_target_length_history << endl; + out << "ref length history=" << m_ref_length_history << endl; for (size_t i = 0; i < BleuScoreState::bleu_order; ++i) { out << "match history/count history (" << i << "):" << m_match_history[i] << "/" << m_count_history[i] << endl; @@ -142,48 +147,49 
@@ void BleuScoreFeature::PrintHistory(std::ostream& out) const { } void BleuScoreFeature::SetBleuParameters(bool disable, bool sentenceBleu, bool scaleByInputLength, bool scaleByAvgInputLength, - bool scaleByInverseLength, bool scaleByAvgInverseLength, - float scaleByX, float historySmoothing, size_t scheme, bool simpleHistoryBleu) { - m_enabled = !disable; - m_sentence_bleu = sentenceBleu; - m_simple_history_bleu = simpleHistoryBleu; - m_scale_by_input_length = scaleByInputLength; - m_scale_by_avg_input_length = scaleByAvgInputLength; - m_scale_by_inverse_length = scaleByInverseLength; - m_scale_by_avg_inverse_length = scaleByAvgInverseLength; - m_scale_by_x = scaleByX; - m_historySmoothing = historySmoothing; - m_smoothing_scheme = (SmoothingScheme)scheme; + bool scaleByInverseLength, bool scaleByAvgInverseLength, + float scaleByX, float historySmoothing, size_t scheme, bool simpleHistoryBleu) +{ + m_enabled = !disable; + m_sentence_bleu = sentenceBleu; + m_simple_history_bleu = simpleHistoryBleu; + m_scale_by_input_length = scaleByInputLength; + m_scale_by_avg_input_length = scaleByAvgInputLength; + m_scale_by_inverse_length = scaleByInverseLength; + m_scale_by_avg_inverse_length = scaleByAvgInverseLength; + m_scale_by_x = scaleByX; + m_historySmoothing = historySmoothing; + m_smoothing_scheme = (SmoothingScheme)scheme; } // Incoming references (refs) are stored as refs[file_id][[sent_id][reference]] // This data structure: m_refs[sent_id][[vector][ngrams]] void BleuScoreFeature::LoadReferences(const std::vector< std::vector< std::string > >& refs) { - m_refs.clear(); - FactorCollection& fc = FactorCollection::Instance(); - for (size_t file_id = 0; file_id < refs.size(); file_id++) { - for (size_t sent_id = 0; sent_id < refs[file_id].size(); sent_id++) { - const string& ref = refs[file_id][sent_id]; - vector refTokens = Tokenize(ref); - if (file_id == 0) - m_refs[sent_id] = RefValue(); - pair,NGrams>& ref_pair = m_refs[sent_id]; - (ref_pair.first).push_back(refTokens.size()); - for (size_t order = 1; order <= BleuScoreState::bleu_order; order++) { - for (size_t end_idx = order; end_idx <= refTokens.size(); end_idx++) { - Phrase ngram(1); - for (size_t s_idx = end_idx - order; s_idx < end_idx; s_idx++) { - const Factor* f = fc.AddFactor(Output, 0, refTokens[s_idx]); - Word w; - w.SetFactor(0, f); - ngram.AddWord(w); - } - ref_pair.second[ngram] += 1; - } - } - } - } + m_refs.clear(); + FactorCollection& fc = FactorCollection::Instance(); + for (size_t file_id = 0; file_id < refs.size(); file_id++) { + for (size_t sent_id = 0; sent_id < refs[file_id].size(); sent_id++) { + const string& ref = refs[file_id][sent_id]; + vector refTokens = Tokenize(ref); + if (file_id == 0) + m_refs[sent_id] = RefValue(); + pair,NGrams>& ref_pair = m_refs[sent_id]; + (ref_pair.first).push_back(refTokens.size()); + for (size_t order = 1; order <= BleuScoreState::bleu_order; order++) { + for (size_t end_idx = order; end_idx <= refTokens.size(); end_idx++) { + Phrase ngram(1); + for (size_t s_idx = end_idx - order; s_idx < end_idx; s_idx++) { + const Factor* f = fc.AddFactor(Output, 0, refTokens[s_idx]); + Word w; + w.SetFactor(0, f); + ngram.AddWord(w); + } + ref_pair.second[ngram] += 1; + } + } + } + } // cerr << "Number of ref files: " << refs.size() << endl; // for (size_t i = 0; i < m_refs.size(); ++i) { @@ -191,51 +197,57 @@ void BleuScoreFeature::LoadReferences(const std::vector< std::vector< std::strin // } } -void BleuScoreFeature::SetCurrSourceLength(size_t source_length) { - m_cur_source_length = 
source_length; +void BleuScoreFeature::SetCurrSourceLength(size_t source_length) +{ + m_cur_source_length = source_length; } -void BleuScoreFeature::SetCurrNormSourceLength(size_t source_length) { - m_cur_norm_source_length = source_length; +void BleuScoreFeature::SetCurrNormSourceLength(size_t source_length) +{ + m_cur_norm_source_length = source_length; } // m_refs[sent_id][[vector][ngrams]] -void BleuScoreFeature::SetCurrShortestRefLength(size_t sent_id) { - // look for shortest reference - int shortestRef = -1; - for (size_t i = 0; i < (m_refs[sent_id].first).size(); ++i) { - if (shortestRef == -1 || (m_refs[sent_id].first)[i] < shortestRef) - shortestRef = (m_refs[sent_id].first)[i]; - } - m_cur_ref_length = shortestRef; +void BleuScoreFeature::SetCurrShortestRefLength(size_t sent_id) +{ + // look for shortest reference + int shortestRef = -1; + for (size_t i = 0; i < (m_refs[sent_id].first).size(); ++i) { + if (shortestRef == -1 || (m_refs[sent_id].first)[i] < shortestRef) + shortestRef = (m_refs[sent_id].first)[i]; + } + m_cur_ref_length = shortestRef; // cerr << "Set shortest cur_ref_length: " << m_cur_ref_length << endl; } -void BleuScoreFeature::SetCurrAvgRefLength(size_t sent_id) { - // compute average reference length - size_t sum = 0; - size_t numberRefs = (m_refs[sent_id].first).size(); - for (size_t i = 0; i < numberRefs; ++i) { - sum += (m_refs[sent_id].first)[i]; - } - m_cur_ref_length = (float)sum/numberRefs; +void BleuScoreFeature::SetCurrAvgRefLength(size_t sent_id) +{ + // compute average reference length + size_t sum = 0; + size_t numberRefs = (m_refs[sent_id].first).size(); + for (size_t i = 0; i < numberRefs; ++i) { + sum += (m_refs[sent_id].first)[i]; + } + m_cur_ref_length = (float)sum/numberRefs; // cerr << "Set average cur_ref_length: " << m_cur_ref_length << endl; } -void BleuScoreFeature::SetCurrReferenceNgrams(size_t sent_id) { - m_cur_ref_ngrams = m_refs[sent_id].second; +void BleuScoreFeature::SetCurrReferenceNgrams(size_t sent_id) +{ + m_cur_ref_ngrams = m_refs[sent_id].second; } -size_t BleuScoreFeature::GetShortestRefIndex(size_t ref_id) { - // look for shortest reference - int shortestRef = -1; - size_t shortestRefIndex = 0; - for (size_t i = 0; i < (m_refs[ref_id].first).size(); ++i) { - if (shortestRef == -1 || (m_refs[ref_id].first)[i] < shortestRef) { - shortestRef = (m_refs[ref_id].first)[i]; - shortestRefIndex = i; - } - } - return shortestRefIndex; +size_t BleuScoreFeature::GetShortestRefIndex(size_t ref_id) +{ + // look for shortest reference + int shortestRef = -1; + size_t shortestRefIndex = 0; + for (size_t i = 0; i < (m_refs[ref_id].first).size(); ++i) { + if (shortestRef == -1 || (m_refs[ref_id].first)[i] < shortestRef) { + shortestRef = (m_refs[ref_id].first)[i]; + shortestRefIndex = i; + } + } + return shortestRefIndex; } /* @@ -244,73 +256,75 @@ size_t BleuScoreFeature::GetShortestRefIndex(size_t ref_id) { * O = m_historySmoothing * (O + c(e_oracle)) * O_f = m_historySmoothing * (O_f + |f|) input length of pseudo-document */ -void BleuScoreFeature::UpdateHistory(const vector< const Word* >& hypo) { - Phrase phrase(hypo); - std::vector< size_t > ngram_counts(BleuScoreState::bleu_order); - std::vector< size_t > ngram_matches(BleuScoreState::bleu_order); +void BleuScoreFeature::UpdateHistory(const vector< const Word* >& hypo) +{ + Phrase phrase(hypo); + std::vector< size_t > ngram_counts(BleuScoreState::bleu_order); + std::vector< size_t > ngram_matches(BleuScoreState::bleu_order); - // compute vector c(e;{r_k}): - // vector of effective 
reference length, number of ngrams in e, number of ngram matches between e and r_k - GetNgramMatchCounts(phrase, m_cur_ref_ngrams, ngram_counts, ngram_matches, 0); + // compute vector c(e;{r_k}): + // vector of effective reference length, number of ngrams in e, number of ngram matches between e and r_k + GetNgramMatchCounts(phrase, m_cur_ref_ngrams, ngram_counts, ngram_matches, 0); - // update counts and matches for every ngram length with counts from hypo - for (size_t i = 0; i < BleuScoreState::bleu_order; i++) { - m_count_history[i] = m_historySmoothing * (m_count_history[i] + ngram_counts[i]); - m_match_history[i] = m_historySmoothing * (m_match_history[i] + ngram_matches[i]); - } + // update counts and matches for every ngram length with counts from hypo + for (size_t i = 0; i < BleuScoreState::bleu_order; i++) { + m_count_history[i] = m_historySmoothing * (m_count_history[i] + ngram_counts[i]); + m_match_history[i] = m_historySmoothing * (m_match_history[i] + ngram_matches[i]); + } - // update counts for reference and target length - m_source_length_history = m_historySmoothing * (m_source_length_history + m_cur_source_length); - m_target_length_history = m_historySmoothing * (m_target_length_history + hypo.size()); - m_ref_length_history = m_historySmoothing * (m_ref_length_history + m_cur_ref_length); + // update counts for reference and target length + m_source_length_history = m_historySmoothing * (m_source_length_history + m_cur_source_length); + m_target_length_history = m_historySmoothing * (m_target_length_history + hypo.size()); + m_ref_length_history = m_historySmoothing * (m_ref_length_history + m_cur_ref_length); } /* * Update history with a batch of translations */ -void BleuScoreFeature::UpdateHistory(const vector< vector< const Word* > >& hypos, vector& sourceLengths, vector& ref_ids, size_t rank, size_t epoch) { - for (size_t ref_id = 0; ref_id < hypos.size(); ++ref_id){ - Phrase phrase(hypos[ref_id]); - std::vector< size_t > ngram_counts(BleuScoreState::bleu_order); - std::vector< size_t > ngram_matches(BleuScoreState::bleu_order); +void BleuScoreFeature::UpdateHistory(const vector< vector< const Word* > >& hypos, vector& sourceLengths, vector& ref_ids, size_t rank, size_t epoch) +{ + for (size_t ref_id = 0; ref_id < hypos.size(); ++ref_id) { + Phrase phrase(hypos[ref_id]); + std::vector< size_t > ngram_counts(BleuScoreState::bleu_order); + std::vector< size_t > ngram_matches(BleuScoreState::bleu_order); - // set current source and reference information for each oracle in the batch - size_t cur_source_length = sourceLengths[ref_id]; - size_t hypo_length = hypos[ref_id].size(); - size_t cur_ref_length = GetClosestRefLength(ref_ids[ref_id], hypo_length); - NGrams cur_ref_ngrams = m_refs[ref_ids[ref_id]].second; - cerr << "reference length: " << cur_ref_length << endl; + // set current source and reference information for each oracle in the batch + size_t cur_source_length = sourceLengths[ref_id]; + size_t hypo_length = hypos[ref_id].size(); + size_t cur_ref_length = GetClosestRefLength(ref_ids[ref_id], hypo_length); + NGrams cur_ref_ngrams = m_refs[ref_ids[ref_id]].second; + cerr << "reference length: " << cur_ref_length << endl; - // compute vector c(e;{r_k}): - // vector of effective reference length, number of ngrams in e, number of ngram matches between e and r_k - GetNgramMatchCounts(phrase, cur_ref_ngrams, ngram_counts, ngram_matches, 0); + // compute vector c(e;{r_k}): + // vector of effective reference length, number of ngrams in e, number of ngram matches 
between e and r_k + GetNgramMatchCounts(phrase, cur_ref_ngrams, ngram_counts, ngram_matches, 0); - // update counts and matches for every ngram length with counts from hypo - for (size_t i = 0; i < BleuScoreState::bleu_order; i++) { - m_count_history[i] += ngram_counts[i]; - m_match_history[i] += ngram_matches[i]; + // update counts and matches for every ngram length with counts from hypo + for (size_t i = 0; i < BleuScoreState::bleu_order; i++) { + m_count_history[i] += ngram_counts[i]; + m_match_history[i] += ngram_matches[i]; - // do this for last position in batch - if (ref_id == hypos.size() - 1) { - m_count_history[i] *= m_historySmoothing; - m_match_history[i] *= m_historySmoothing; - } - } + // do this for last position in batch + if (ref_id == hypos.size() - 1) { + m_count_history[i] *= m_historySmoothing; + m_match_history[i] *= m_historySmoothing; + } + } - // update counts for reference and target length - m_source_length_history += cur_source_length; - m_target_length_history += hypos[ref_id].size(); - m_ref_length_history += cur_ref_length; + // update counts for reference and target length + m_source_length_history += cur_source_length; + m_target_length_history += hypos[ref_id].size(); + m_ref_length_history += cur_ref_length; - // do this for last position in batch - if (ref_id == hypos.size() - 1) { - cerr << "Rank " << rank << ", epoch " << epoch << " ,source length history: " << m_source_length_history << " --> " << m_source_length_history * m_historySmoothing << endl; - cerr << "Rank " << rank << ", epoch " << epoch << " ,target length history: " << m_target_length_history << " --> " << m_target_length_history * m_historySmoothing << endl; - m_source_length_history *= m_historySmoothing; - m_target_length_history *= m_historySmoothing; - m_ref_length_history *= m_historySmoothing; - } - } + // do this for last position in batch + if (ref_id == hypos.size() - 1) { + cerr << "Rank " << rank << ", epoch " << epoch << " ,source length history: " << m_source_length_history << " --> " << m_source_length_history * m_historySmoothing << endl; + cerr << "Rank " << rank << ", epoch " << epoch << " ,target length history: " << m_target_length_history << " --> " << m_target_length_history * m_historySmoothing << endl; + m_source_length_history *= m_historySmoothing; + m_target_length_history *= m_historySmoothing; + m_ref_length_history *= m_historySmoothing; + } + } } /* @@ -323,17 +337,18 @@ void BleuScoreFeature::UpdateHistory(const vector< vector< const Word* > >& hypo } }*/ -size_t BleuScoreFeature::GetClosestRefLength(size_t ref_id, int hypoLength) { - // look for closest reference - int currentDist = -1; - int closestRefLength = -1; - for (size_t i = 0; i < (m_refs[ref_id].first).size(); ++i) { - if (closestRefLength == -1 || abs(hypoLength - (int)(m_refs[ref_id].first)[i]) < currentDist) { - closestRefLength = (m_refs[ref_id].first)[i]; - currentDist = abs(hypoLength - (int)(m_refs[ref_id].first)[i]); - } - } - return (size_t)closestRefLength; +size_t BleuScoreFeature::GetClosestRefLength(size_t ref_id, int hypoLength) +{ + // look for closest reference + int currentDist = -1; + int closestRefLength = -1; + for (size_t i = 0; i < (m_refs[ref_id].first).size(); ++i) { + if (closestRefLength == -1 || abs(hypoLength - (int)(m_refs[ref_id].first)[i]) < currentDist) { + closestRefLength = (m_refs[ref_id].first)[i]; + currentDist = abs(hypoLength - (int)(m_refs[ref_id].first)[i]); + } + } + return (size_t)closestRefLength; } /* @@ -341,206 +356,206 @@ size_t 
BleuScoreFeature::GetClosestRefLength(size_t ref_id, int hypoLength) { * its ngram matches against the ngrams in the reference translation */ void BleuScoreFeature::GetNgramMatchCounts(Phrase& phrase, - const NGrams& ref_ngram_counts, - std::vector< size_t >& ret_counts, - std::vector< size_t >& ret_matches, - size_t skip_first) const + const NGrams& ref_ngram_counts, + std::vector< size_t >& ret_counts, + std::vector< size_t >& ret_matches, + size_t skip_first) const { - NGrams::const_iterator ref_ngram_counts_iter; - size_t ngram_start_idx, ngram_end_idx; + NGrams::const_iterator ref_ngram_counts_iter; + size_t ngram_start_idx, ngram_end_idx; - // Chiang et al (2008) use unclipped counts of ngram matches - for (size_t end_idx = skip_first; end_idx < phrase.GetSize(); end_idx++) { - for (size_t order = 0; order < BleuScoreState::bleu_order; order++) { - if (order > end_idx) break; + // Chiang et al (2008) use unclipped counts of ngram matches + for (size_t end_idx = skip_first; end_idx < phrase.GetSize(); end_idx++) { + for (size_t order = 0; order < BleuScoreState::bleu_order; order++) { + if (order > end_idx) break; - ngram_end_idx = end_idx; - ngram_start_idx = end_idx - order; + ngram_end_idx = end_idx; + ngram_start_idx = end_idx - order; - Phrase ngram = phrase.GetSubString(WordsRange(ngram_start_idx, ngram_end_idx), 0); - ret_counts[order]++; + Phrase ngram = phrase.GetSubString(WordsRange(ngram_start_idx, ngram_end_idx), 0); + ret_counts[order]++; - ref_ngram_counts_iter = ref_ngram_counts.find(ngram); - if (ref_ngram_counts_iter != ref_ngram_counts.end()) - ret_matches[order]++; - } + ref_ngram_counts_iter = ref_ngram_counts.find(ngram); + if (ref_ngram_counts_iter != ref_ngram_counts.end()) + ret_matches[order]++; } + } } // score ngrams of words that have been added before the previous word span void BleuScoreFeature::GetNgramMatchCounts_prefix(Phrase& phrase, - const NGrams& ref_ngram_counts, - std::vector< size_t >& ret_counts, - std::vector< size_t >& ret_matches, - size_t new_start_indices, - size_t last_end_index) const + const NGrams& ref_ngram_counts, + std::vector< size_t >& ret_counts, + std::vector< size_t >& ret_matches, + size_t new_start_indices, + size_t last_end_index) const { - NGrams::const_iterator ref_ngram_counts_iter; - size_t ngram_start_idx, ngram_end_idx; + NGrams::const_iterator ref_ngram_counts_iter; + size_t ngram_start_idx, ngram_end_idx; - // Chiang et al (2008) use unclipped counts of ngram matches - for (size_t start_idx = 0; start_idx < new_start_indices; start_idx++) { - for (size_t order = 0; order < BleuScoreState::bleu_order; order++) { - ngram_start_idx = start_idx; - ngram_end_idx = start_idx + order; - if (order > ngram_end_idx) break; - if (ngram_end_idx > last_end_index) break; + // Chiang et al (2008) use unclipped counts of ngram matches + for (size_t start_idx = 0; start_idx < new_start_indices; start_idx++) { + for (size_t order = 0; order < BleuScoreState::bleu_order; order++) { + ngram_start_idx = start_idx; + ngram_end_idx = start_idx + order; + if (order > ngram_end_idx) break; + if (ngram_end_idx > last_end_index) break; - Phrase ngram = phrase.GetSubString(WordsRange(ngram_start_idx, ngram_end_idx), 0); - ret_counts[order]++; + Phrase ngram = phrase.GetSubString(WordsRange(ngram_start_idx, ngram_end_idx), 0); + ret_counts[order]++; - ref_ngram_counts_iter = ref_ngram_counts.find(ngram); - if (ref_ngram_counts_iter != ref_ngram_counts.end()) - ret_matches[order]++; - } + ref_ngram_counts_iter = ref_ngram_counts.find(ngram); + if 
(ref_ngram_counts_iter != ref_ngram_counts.end()) + ret_matches[order]++; } + } } // score ngrams around the overlap of two previously scored phrases void BleuScoreFeature::GetNgramMatchCounts_overlap(Phrase& phrase, - const NGrams& ref_ngram_counts, - std::vector< size_t >& ret_counts, - std::vector< size_t >& ret_matches, - size_t overlap_index) const + const NGrams& ref_ngram_counts, + std::vector< size_t >& ret_counts, + std::vector< size_t >& ret_matches, + size_t overlap_index) const { - NGrams::const_iterator ref_ngram_counts_iter; - size_t ngram_start_idx, ngram_end_idx; + NGrams::const_iterator ref_ngram_counts_iter; + size_t ngram_start_idx, ngram_end_idx; - // Chiang et al (2008) use unclipped counts of ngram matches - for (size_t end_idx = overlap_index; end_idx < phrase.GetSize(); end_idx++) { - if (end_idx >= (overlap_index+BleuScoreState::bleu_order-1)) break; - for (size_t order = 0; order < BleuScoreState::bleu_order; order++) { - if (order > end_idx) break; + // Chiang et al (2008) use unclipped counts of ngram matches + for (size_t end_idx = overlap_index; end_idx < phrase.GetSize(); end_idx++) { + if (end_idx >= (overlap_index+BleuScoreState::bleu_order-1)) break; + for (size_t order = 0; order < BleuScoreState::bleu_order; order++) { + if (order > end_idx) break; - ngram_end_idx = end_idx; - ngram_start_idx = end_idx - order; - if (ngram_start_idx >= overlap_index) continue; // only score ngrams that span the overlap point + ngram_end_idx = end_idx; + ngram_start_idx = end_idx - order; + if (ngram_start_idx >= overlap_index) continue; // only score ngrams that span the overlap point - Phrase ngram = phrase.GetSubString(WordsRange(ngram_start_idx, ngram_end_idx), 0); - ret_counts[order]++; + Phrase ngram = phrase.GetSubString(WordsRange(ngram_start_idx, ngram_end_idx), 0); + ret_counts[order]++; - ref_ngram_counts_iter = ref_ngram_counts.find(ngram); - if (ref_ngram_counts_iter != ref_ngram_counts.end()) - ret_matches[order]++; - } + ref_ngram_counts_iter = ref_ngram_counts.find(ngram); + if (ref_ngram_counts_iter != ref_ngram_counts.end()) + ret_matches[order]++; } + } } void BleuScoreFeature::GetClippedNgramMatchesAndCounts(Phrase& phrase, - const NGrams& ref_ngram_counts, - std::vector< size_t >& ret_counts, - std::vector< size_t >& ret_matches, - size_t skip_first) const + const NGrams& ref_ngram_counts, + std::vector< size_t >& ret_counts, + std::vector< size_t >& ret_matches, + size_t skip_first) const { - NGrams::const_iterator ref_ngram_counts_iter; - size_t ngram_start_idx, ngram_end_idx; + NGrams::const_iterator ref_ngram_counts_iter; + size_t ngram_start_idx, ngram_end_idx; - Matches ngram_matches; - for (size_t end_idx = skip_first; end_idx < phrase.GetSize(); end_idx++) { - for (size_t order = 0; order < BleuScoreState::bleu_order; order++) { - if (order > end_idx) break; + Matches ngram_matches; + for (size_t end_idx = skip_first; end_idx < phrase.GetSize(); end_idx++) { + for (size_t order = 0; order < BleuScoreState::bleu_order; order++) { + if (order > end_idx) break; - ngram_end_idx = end_idx; - ngram_start_idx = end_idx - order; + ngram_end_idx = end_idx; + ngram_start_idx = end_idx - order; - Phrase ngram = phrase.GetSubString(WordsRange(ngram_start_idx, ngram_end_idx), 0); - ret_counts[order]++; + Phrase ngram = phrase.GetSubString(WordsRange(ngram_start_idx, ngram_end_idx), 0); + ret_counts[order]++; - ref_ngram_counts_iter = ref_ngram_counts.find(ngram); - if (ref_ngram_counts_iter != ref_ngram_counts.end()) { - ngram_matches[order][ngram]++; - } 
- } - } - - // clip ngram matches - for (size_t order = 0; order < BleuScoreState::bleu_order; order++) { - NGrams::const_iterator iter; - - // iterate over ngram counts for every ngram order - for (iter=ngram_matches[order].begin(); iter != ngram_matches[order].end(); ++iter) { - ref_ngram_counts_iter = ref_ngram_counts.find(iter->first); - if (iter->second > ref_ngram_counts_iter->second) { - ret_matches[order] += ref_ngram_counts_iter->second; - } - else { - ret_matches[order] += iter->second; - } + ref_ngram_counts_iter = ref_ngram_counts.find(ngram); + if (ref_ngram_counts_iter != ref_ngram_counts.end()) { + ngram_matches[order][ngram]++; + } } - } + } + + // clip ngram matches + for (size_t order = 0; order < BleuScoreState::bleu_order; order++) { + NGrams::const_iterator iter; + + // iterate over ngram counts for every ngram order + for (iter=ngram_matches[order].begin(); iter != ngram_matches[order].end(); ++iter) { + ref_ngram_counts_iter = ref_ngram_counts.find(iter->first); + if (iter->second > ref_ngram_counts_iter->second) { + ret_matches[order] += ref_ngram_counts_iter->second; + } else { + ret_matches[order] += iter->second; + } + } + } } /* * Given a previous state, compute Bleu score for the updated state with an additional target * phrase translated. */ -FFState* BleuScoreFeature::Evaluate(const Hypothesis& cur_hypo, - const FFState* prev_state, +FFState* BleuScoreFeature::Evaluate(const Hypothesis& cur_hypo, + const FFState* prev_state, ScoreComponentCollection* accumulator) const { - if (!m_enabled) return new BleuScoreState(); - - NGrams::const_iterator reference_ngrams_iter; - const BleuScoreState& ps = dynamic_cast(*prev_state); - BleuScoreState* new_state = new BleuScoreState(ps); - - float old_bleu, new_bleu; - size_t num_new_words, ctx_start_idx, ctx_end_idx; + if (!m_enabled) return new BleuScoreState(); - // Calculate old bleu; - old_bleu = CalculateBleu(new_state); + NGrams::const_iterator reference_ngrams_iter; + const BleuScoreState& ps = dynamic_cast(*prev_state); + BleuScoreState* new_state = new BleuScoreState(ps); - // Get context and append new words. - num_new_words = cur_hypo.GetCurrTargetLength(); - if (num_new_words == 0) { - return new_state; - } - - Phrase new_words = ps.m_words; - new_words.Append(cur_hypo.GetCurrTargetPhrase()); - //cerr << "NW: " << new_words << endl; + float old_bleu, new_bleu; + size_t num_new_words, ctx_start_idx, ctx_end_idx; - // get ngram matches for new words - GetNgramMatchCounts(new_words, - m_cur_ref_ngrams, - new_state->m_ngram_counts, - new_state->m_ngram_matches, - new_state->m_words.GetSize()); // number of words in previous states + // Calculate old bleu; + old_bleu = CalculateBleu(new_state); - // Update state variables - ctx_end_idx = new_words.GetSize()-1; - size_t bleu_context_length = BleuScoreState::bleu_order -1; - if (ctx_end_idx > bleu_context_length) { - ctx_start_idx = ctx_end_idx - bleu_context_length; - } else { - ctx_start_idx = 0; - } - - WordsBitmap coverageVector = cur_hypo.GetWordsBitmap(); - new_state->m_source_length = coverageVector.GetNumWordsCovered(); - - new_state->m_words = new_words.GetSubString(WordsRange(ctx_start_idx, - ctx_end_idx)); - new_state->m_target_length += cur_hypo.GetCurrTargetLength(); - - // we need a scaled reference length to compare the current target phrase to the corresponding reference phrase - new_state->m_scaled_ref_length = m_cur_ref_length * - ((float)coverageVector.GetNumWordsCovered()/coverageVector.GetSize()); - - // Calculate new bleu. 
- new_bleu = CalculateBleu(new_state); - - // Set score to new Bleu score - accumulator->PlusEquals(this, new_bleu - old_bleu); + // Get context and append new words. + num_new_words = cur_hypo.GetCurrTargetLength(); + if (num_new_words == 0) { return new_state; + } + + Phrase new_words = ps.m_words; + new_words.Append(cur_hypo.GetCurrTargetPhrase()); + //cerr << "NW: " << new_words << endl; + + // get ngram matches for new words + GetNgramMatchCounts(new_words, + m_cur_ref_ngrams, + new_state->m_ngram_counts, + new_state->m_ngram_matches, + new_state->m_words.GetSize()); // number of words in previous states + + // Update state variables + ctx_end_idx = new_words.GetSize()-1; + size_t bleu_context_length = BleuScoreState::bleu_order -1; + if (ctx_end_idx > bleu_context_length) { + ctx_start_idx = ctx_end_idx - bleu_context_length; + } else { + ctx_start_idx = 0; + } + + WordsBitmap coverageVector = cur_hypo.GetWordsBitmap(); + new_state->m_source_length = coverageVector.GetNumWordsCovered(); + + new_state->m_words = new_words.GetSubString(WordsRange(ctx_start_idx, + ctx_end_idx)); + new_state->m_target_length += cur_hypo.GetCurrTargetLength(); + + // we need a scaled reference length to compare the current target phrase to the corresponding reference phrase + new_state->m_scaled_ref_length = m_cur_ref_length * + ((float)coverageVector.GetNumWordsCovered()/coverageVector.GetSize()); + + // Calculate new bleu. + new_bleu = CalculateBleu(new_state); + + // Set score to new Bleu score + accumulator->PlusEquals(this, new_bleu - old_bleu); + return new_state; } FFState* BleuScoreFeature::EvaluateChart(const ChartHypothesis& cur_hypo, int featureID, - ScoreComponentCollection* accumulator ) const { + ScoreComponentCollection* accumulator ) const +{ if (!m_enabled) return new BleuScoreState(); - + NGrams::const_iterator reference_ngrams_iter; - + const Phrase& curr_target_phrase = static_cast(cur_hypo.GetCurrTargetPhrase()); // cerr << "\nCur target phrase: " << cur_hypo.GetTargetLHS() << " --> " << curr_target_phrase << endl; @@ -553,35 +568,35 @@ FFState* BleuScoreFeature::EvaluateChart(const ChartHypothesis& cur_hypo, int fe assert(cur_hypo.GetPrevHypos().size() <= 2); BleuScoreState* new_state; if (cur_hypo.GetPrevHypos().size() == 0) - new_state = new BleuScoreState(); + new_state = new BleuScoreState(); else { - const FFState* prev_state_zero = cur_hypo.GetPrevHypo(0)->GetFFState(featureID); - const BleuScoreState& ps_zero = dynamic_cast(*prev_state_zero); - new_state = new BleuScoreState(ps_zero); - num_words_first_prev = ps_zero.m_target_length; + const FFState* prev_state_zero = cur_hypo.GetPrevHypo(0)->GetFFState(featureID); + const BleuScoreState& ps_zero = dynamic_cast(*prev_state_zero); + new_state = new BleuScoreState(ps_zero); + num_words_first_prev = ps_zero.m_target_length; - for (size_t i = 0; i < cur_hypo.GetPrevHypos().size(); ++i) { - const FFState* prev_state = cur_hypo.GetPrevHypo(i)->GetFFState(featureID); - const BleuScoreState* ps = dynamic_cast(prev_state); - BleuScoreState* ps_nonConst = const_cast(ps); + for (size_t i = 0; i < cur_hypo.GetPrevHypos().size(); ++i) { + const FFState* prev_state = cur_hypo.GetPrevHypo(i)->GetFFState(featureID); + const BleuScoreState* ps = dynamic_cast(prev_state); + BleuScoreState* ps_nonConst = const_cast(ps); // cerr << "prev phrase: " << cur_hypo.GetPrevHypo(i)->GetOutputPhrase() // << " ( " << cur_hypo.GetPrevHypo(i)->GetTargetLHS() << ")" << endl; - old_bleu += CalculateBleu(ps_nonConst); - num_old_words += ps->m_target_length; + 
old_bleu += CalculateBleu(ps_nonConst); + num_old_words += ps->m_target_length; - if (i > 0) - // add ngram matches from other previous states - new_state->AddNgramCountAndMatches(ps_nonConst->m_ngram_counts, ps_nonConst->m_ngram_matches); - } + if (i > 0) + // add ngram matches from other previous states + new_state->AddNgramCountAndMatches(ps_nonConst->m_ngram_counts, ps_nonConst->m_ngram_matches); + } } - + // check if we are already done (don't add <s> and </s>) size_t numWordsCovered = cur_hypo.GetCurrSourceRange().GetNumWordsCovered(); if (numWordsCovered == m_cur_source_length) { - // Bleu score stays the same, do not need to add anything - //accumulator->PlusEquals(this, 0); - return new_state; + // Bleu score stays the same, do not need to add anything + //accumulator->PlusEquals(this, 0); + return new_state; } // set new context @@ -592,55 +607,52 @@ FFState* BleuScoreFeature::EvaluateChart(const ChartHypothesis& cur_hypo, int fe // get ngram matches for new words if (num_old_words == 0) { // cerr << "compute right ngram context" << endl; - GetNgramMatchCounts(new_words, - m_cur_ref_ngrams, - new_state->m_ngram_counts, - new_state->m_ngram_matches, - 0); - } - else if (new_words.GetSize() == num_old_words) { - // two hypotheses were glued together, compute new ngrams on the basis of first hypothesis - num_words_added_right = num_curr_words - num_words_first_prev; - // score around overlap point + GetNgramMatchCounts(new_words, + m_cur_ref_ngrams, + new_state->m_ngram_counts, + new_state->m_ngram_matches, + 0); + } else if (new_words.GetSize() == num_old_words) { + // two hypotheses were glued together, compute new ngrams on the basis of first hypothesis + num_words_added_right = num_curr_words - num_words_first_prev; + // score around overlap point // cerr << "compute overlap ngram context (" << (num_words_first_prev) << ")" << endl; - GetNgramMatchCounts_overlap(new_words, - m_cur_ref_ngrams, - new_state->m_ngram_counts, - new_state->m_ngram_matches, - num_words_first_prev); - } - else if (num_old_words + curr_target_phrase.GetNumTerminals() == num_curr_words) { - assert(curr_target_phrase.GetSize() == curr_target_phrase.GetNumTerminals()+1); - // previous hypothesis + rule with 1 non-terminal were combined (NT substituted by Ts) - for (size_t i = 0; i < curr_target_phrase.GetSize(); ++i) - if (curr_target_phrase.GetWord(i).IsNonTerminal()) { - num_words_added_left = i; - num_words_added_right = curr_target_phrase.GetSize() - (i+1); - break; - } + GetNgramMatchCounts_overlap(new_words, + m_cur_ref_ngrams, + new_state->m_ngram_counts, + new_state->m_ngram_matches, + num_words_first_prev); + } else if (num_old_words + curr_target_phrase.GetNumTerminals() == num_curr_words) { + assert(curr_target_phrase.GetSize() == curr_target_phrase.GetNumTerminals()+1); + // previous hypothesis + rule with 1 non-terminal were combined (NT substituted by Ts) + for (size_t i = 0; i < curr_target_phrase.GetSize(); ++i) + if (curr_target_phrase.GetWord(i).IsNonTerminal()) { + num_words_added_left = i; + num_words_added_right = curr_target_phrase.GetSize() - (i+1); + break; + } - // left context + // left context // cerr << "compute left ngram context" << endl; - if (num_words_added_left > 0) - GetNgramMatchCounts_prefix(new_words, - m_cur_ref_ngrams, - new_state->m_ngram_counts, -
new_state->m_ngram_matches, + num_words_added_left, + num_curr_words - num_words_added_right - 1); - // right context + // right context // cerr << "compute right ngram context" << endl; - if (num_words_added_right > 0) - GetNgramMatchCounts(new_words, - m_cur_ref_ngrams, - new_state->m_ngram_counts, - new_state->m_ngram_matches, - num_words_added_left + num_old_words); - } - else { - cerr << "undefined state.. " << endl; - exit(1); + if (num_words_added_right > 0) + GetNgramMatchCounts(new_words, + m_cur_ref_ngrams, + new_state->m_ngram_counts, + new_state->m_ngram_matches, + num_words_added_left + num_old_words); + } else { + cerr << "undefined state.. " << endl; + exit(1); } // Update state variables @@ -659,7 +671,7 @@ FFState* BleuScoreFeature::EvaluateChart(const ChartHypothesis& cur_hypo, int fe // reference phrase size_t cur_source_length = m_cur_source_length; new_state->m_scaled_ref_length = m_cur_ref_length * (float(new_state->m_source_length)/cur_source_length); - + // Calculate new bleu. new_bleu = CalculateBleu(new_state); @@ -675,28 +687,28 @@ float BleuScoreFeature::CalculateBleu(Phrase translation) const { if (translation.GetSize() == 0) return 0.0; - + Phrase normTranslation = translation; // remove start and end symbol for chart decoding if (m_cur_source_length != m_cur_norm_source_length) { WordsRange* range = new WordsRange(1, translation.GetSize()-2); normTranslation = translation.GetSubString(*range); } - + // get ngram matches for translation BleuScoreState* state = new BleuScoreState(); GetClippedNgramMatchesAndCounts(normTranslation, - m_cur_ref_ngrams, - state->m_ngram_counts, - state->m_ngram_matches, - 0); // number of words in previous states + m_cur_ref_ngrams, + state->m_ngram_counts, + state->m_ngram_matches, + 0); // number of words in previous states // set state variables state->m_words = normTranslation; state->m_source_length = m_cur_norm_source_length; state->m_target_length = normTranslation.GetSize(); state->m_scaled_ref_length = m_cur_ref_length; - + // Calculate bleu. return CalculateBleu(state); } @@ -704,52 +716,53 @@ float BleuScoreFeature::CalculateBleu(Phrase translation) const /* * Calculate Bleu score for a partial hypothesis given as state. 
*/ -float BleuScoreFeature::CalculateBleu(BleuScoreState* state) const { +float BleuScoreFeature::CalculateBleu(BleuScoreState* state) const +{ if (!state->m_ngram_counts[0]) return 0; if (!state->m_ngram_matches[0]) return 0; // if we have no unigram matches, score should be 0 - + float precision = 1.0; float smooth = 1; float smoothed_count, smoothed_matches; - + if (m_sentence_bleu || m_simple_history_bleu) { // Calculate geometric mean of modified ngram precisions // BLEU = BP * exp(SUM_1_4 1/4 * log p_n) // = BP * 4th root(PRODUCT_1_4 p_n) for (size_t i = 0; i < BleuScoreState::bleu_order; i++) { if (state->m_ngram_counts[i]) { - smoothed_matches = state->m_ngram_matches[i]; - smoothed_count = state->m_ngram_counts[i]; + smoothed_matches = state->m_ngram_matches[i]; + smoothed_count = state->m_ngram_counts[i]; - switch (m_smoothing_scheme) { - case PLUS_ONE: - default: - if (i > 0) { - // smoothing for all n > 1 - smoothed_matches += 1; - smoothed_count += 1; - } - break; - case PLUS_POINT_ONE: - if (i > 0) { - // smoothing for all n > 1 - smoothed_matches += 0.1; - smoothed_count += 0.1; - } - break; - case PAPINENI: - if (state->m_ngram_matches[i] == 0) { - smooth *= 0.5; - smoothed_matches += smooth; - smoothed_count += smooth; - } - break; - } + switch (m_smoothing_scheme) { + case PLUS_ONE: + default: + if (i > 0) { + // smoothing for all n > 1 + smoothed_matches += 1; + smoothed_count += 1; + } + break; + case PLUS_POINT_ONE: + if (i > 0) { + // smoothing for all n > 1 + smoothed_matches += 0.1; + smoothed_count += 0.1; + } + break; + case PAPINENI: + if (state->m_ngram_matches[i] == 0) { + smooth *= 0.5; + smoothed_matches += smooth; + smoothed_count += smooth; + } + break; + } - if (m_simple_history_bleu) { - smoothed_matches += m_match_history[i]; - smoothed_count += m_count_history[i]; - } + if (m_simple_history_bleu) { + smoothed_matches += m_match_history[i]; + smoothed_count += m_count_history[i]; + } precision *= smoothed_matches/smoothed_count; } @@ -766,40 +779,35 @@ float BleuScoreFeature::CalculateBleu(BleuScoreState* state) const { // r: effective reference length (sum of best match lengths for each candidate sentence) if (m_simple_history_bleu) { if ((m_target_length_history + state->m_target_length) < (m_ref_length_history + state->m_scaled_ref_length)) { - float smoothed_target_length = m_target_length_history + state->m_target_length; - float smoothed_ref_length = m_ref_length_history + state->m_scaled_ref_length; - precision *= exp(1 - (smoothed_ref_length/smoothed_target_length)); + float smoothed_target_length = m_target_length_history + state->m_target_length; + float smoothed_ref_length = m_ref_length_history + state->m_scaled_ref_length; + precision *= exp(1 - (smoothed_ref_length/smoothed_target_length)); } - } - else { + } else { if (state->m_target_length < state->m_scaled_ref_length) { - float target_length = state->m_target_length; - float ref_length = state->m_scaled_ref_length; - precision *= exp(1 - (ref_length/target_length)); + float target_length = state->m_target_length; + float ref_length = state->m_scaled_ref_length; + precision *= exp(1 - (ref_length/target_length)); } } - + //cerr << "precision: " << precision << endl; - + // Approximate bleu score as of Chiang/Resnik is scaled by the size of the input: // B(e;f,{r_k}) = (O_f + |f|) * BLEU(O + c(e;{r_k})) // where c(e;) is a vector of reference length, ngram counts and ngram matches if (m_scale_by_input_length) { precision *= m_cur_norm_source_length; - } - else if 
(m_scale_by_avg_input_length) { + } else if (m_scale_by_avg_input_length) { precision *= m_avg_input_length; - } - else if (m_scale_by_inverse_length) { + } else if (m_scale_by_inverse_length) { precision *= (100/m_cur_norm_source_length); - } - else if (m_scale_by_avg_inverse_length) { + } else if (m_scale_by_avg_inverse_length) { precision *= (100/m_avg_input_length); } - + return precision * m_scale_by_x; - } - else { + } else { // Revised history BLEU: compute Bleu in the context of the pseudo-document // B(b) = size_of_oracle_doc * (Bleu(B_hist + b) - Bleu(B_hist)) // Calculate geometric mean of modified ngram precisions @@ -807,12 +815,12 @@ float BleuScoreFeature::CalculateBleu(BleuScoreState* state) const { // = BP * 4th root(PRODUCT_1_4 p_n) for (size_t i = 0; i < BleuScoreState::bleu_order; i++) { if (state->m_ngram_counts[i]) { - smoothed_matches = m_match_history[i] + state->m_ngram_matches[i] + 0.1; - smoothed_count = m_count_history[i] + state->m_ngram_counts[i] + 0.1; - precision *= smoothed_matches/smoothed_count; + smoothed_matches = m_match_history[i] + state->m_ngram_matches[i] + 0.1; + smoothed_count = m_count_history[i] + state->m_ngram_counts[i] + 0.1; + precision *= smoothed_matches/smoothed_count; } } - + // take geometric mean precision = pow(precision, (float)1/4); @@ -826,25 +834,24 @@ float BleuScoreFeature::CalculateBleu(BleuScoreState* state) const { float precision_pd = 1.0; if (m_target_length_history > 0) { for (size_t i = 0; i < BleuScoreState::bleu_order; i++) - if (m_count_history[i] != 0) - precision_pd *= (m_match_history[i] + 0.1)/(m_count_history[i] + 0.1); - + if (m_count_history[i] != 0) + precision_pd *= (m_match_history[i] + 0.1)/(m_count_history[i] + 0.1); + // take geometric mean precision_pd = pow(precision_pd, (float)1/4); // Apply brevity penalty if applicable. if (m_target_length_history < m_ref_length_history) - precision_pd *= exp(1 - (m_ref_length_history/m_target_length_history)); - } - else + precision_pd *= exp(1 - (m_ref_length_history/m_target_length_history)); + } else precision_pd = 0; // **end BLEU of pseudo-document** cerr << "precision pd: " << precision_pd << endl; float sentence_impact; - if (m_target_length_history > 0) - sentence_impact = m_target_length_history * (precision - precision_pd); + if (m_target_length_history > 0) + sentence_impact = m_target_length_history * (precision - precision_pd); else sentence_impact = precision; @@ -855,7 +862,7 @@ float BleuScoreFeature::CalculateBleu(BleuScoreState* state) const { const FFState* BleuScoreFeature::EmptyHypothesisState(const InputType& input) const { - return new BleuScoreState(); + return new BleuScoreState(); } } // namespace. 
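For readers tracing the reformatted BleuScoreFeature.cpp above, the sketch below condenses what GetClippedNgramMatchesAndCounts and CalculateBleu compute for a complete sentence under the PLUS_POINT_ONE scheme, using only standard-library types. It is a minimal, hedged illustration rather than Moses code: CountNgrams, SmoothedBleu and the strings in main are invented stand-ins for the Phrase/NGrams machinery, and only the smoothing and brevity-penalty branches shown above are reproduced.

// Standalone sketch of clipped, smoothed sentence-level BLEU in the style
// of BleuScoreFeature::CalculateBleu. Names are illustrative, not Moses API.
#include <algorithm>
#include <cmath>
#include <iostream>
#include <map>
#include <string>
#include <vector>

typedef std::map<std::vector<std::string>, size_t> NgramCounts;

// Count all n-grams up to maxOrder (bleu_order = 4 above).
NgramCounts CountNgrams(const std::vector<std::string> &words, size_t maxOrder)
{
  NgramCounts counts;
  for (size_t start = 0; start < words.size(); ++start)
    for (size_t order = 1; order <= maxOrder && start + order <= words.size(); ++order)
      counts[std::vector<std::string>(words.begin() + start,
                                      words.begin() + start + order)]++;
  return counts;
}

float SmoothedBleu(const std::vector<std::string> &hyp,
                   const std::vector<std::string> &ref,
                   size_t maxOrder = 4)
{
  NgramCounts hypCounts = CountNgrams(hyp, maxOrder);
  NgramCounts refCounts = CountNgrams(ref, maxOrder);

  std::vector<float> matches(maxOrder, 0.0f), totals(maxOrder, 0.0f);
  for (NgramCounts::const_iterator it = hypCounts.begin(); it != hypCounts.end(); ++it) {
    size_t order = it->first.size();
    totals[order - 1] += it->second;
    NgramCounts::const_iterator r = refCounts.find(it->first);
    if (r != refCounts.end())  // clip each match to the reference count,
      matches[order - 1] += std::min(it->second, r->second);  // as in GetClippedNgramMatchesAndCounts
  }

  if (matches[0] == 0.0f) return 0.0f;  // no unigram matches => score 0, as above

  // Geometric mean of modified precisions; +0.1 smoothing for n > 1
  // (the PLUS_POINT_ONE scheme).
  float precision = 1.0f;
  for (size_t i = 0; i < maxOrder; ++i) {
    float add = (i > 0) ? 0.1f : 0.0f;
    precision *= (matches[i] + add) / (totals[i] + add);
  }
  precision = std::pow(precision, 1.0f / maxOrder);

  // Brevity penalty: exp(1 - r/c) when the hypothesis is shorter than the reference.
  if (hyp.size() < ref.size())
    precision *= std::exp(1.0f - float(ref.size()) / float(hyp.size()));
  return precision;
}

int main()
{
  const char *h[] = {"the", "cat", "sat", "on", "the", "mat"};
  const char *r[] = {"the", "cat", "was", "sitting", "on", "the", "mat"};
  std::vector<std::string> hyp(h, h + 6), ref(r, r + 7);
  std::cout << "smoothed sentence BLEU: " << SmoothedBleu(hyp, ref) << std::endl;
  return 0;
}

When the Chiang/Resnik approximation is selected, Moses additionally scales this value by the (possibly averaged or inverted) input length, as in the m_scale_by_* branches shown above.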
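The two UpdateHistory overloads above maintain the pseudo-document of Chiang et al.: every n-gram count, match count and length statistic decays by O = m_historySmoothing * (O + c(e_oracle)) after each oracle translation, so the history stays bounded. Below is a tiny standalone illustration of just that recurrence; the counts are invented and only the update rule and the default smoothing of 0.9 come from the code above.

// Sketch of the exponentially smoothed history update used above:
// history = gamma * (history + new_counts) after each oracle sentence.
// All numbers are made up; only the recurrence matches the code.
#include <cstdio>
#include <vector>

int main()
{
  const float gamma = 0.9f;              // m_historySmoothing default above
  std::vector<float> history(4, 0.0f);   // one cell per n-gram order

  const float oracle[3][4] = { {20, 19, 18, 17},    // counts from sentence 1
                               {25, 24, 23, 22},    // counts from sentence 2
                               {15, 14, 13, 12} };  // counts from sentence 3

  for (int s = 0; s < 3; ++s) {
    for (int i = 0; i < 4; ++i)
      history[i] = gamma * (history[i] + oracle[s][i]);
    std::printf("after sentence %d: unigram history = %.2f\n", s + 1, history[0]);
  }
  // Prints 18.00, 38.70, 48.33: each earlier sentence's contribution
  // decays geometrically, which keeps the pseudo-document bounded.
  return 0;
}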
diff --git a/moses/FF/BleuScoreFeature.h b/moses/FF/BleuScoreFeature.h index dc4495506..96e273672 100644 --- a/moses/FF/BleuScoreFeature.h +++ b/moses/FF/BleuScoreFeature.h @@ -13,31 +13,33 @@ #include "moses/Phrase.h" #include "moses/ChartHypothesis.h" -namespace Moses { +namespace Moses +{ class BleuScoreFeature; -class BleuScoreState : public FFState { +class BleuScoreState : public FFState +{ public: - friend class BleuScoreFeature; - static size_t bleu_order; + friend class BleuScoreFeature; + static size_t bleu_order; - BleuScoreState(); - virtual int Compare(const FFState& other) const; - void print(std::ostream& out) const; + BleuScoreState(); + virtual int Compare(const FFState& other) const; + void print(std::ostream& out) const; private: - Phrase m_words; - size_t m_source_length; - size_t m_target_length; + Phrase m_words; + size_t m_source_length; + size_t m_target_length; - // scaled reference length is needed for scoring incomplete hypotheses against reference translation - float m_scaled_ref_length; + // scaled reference length is needed for scoring incomplete hypotheses against reference translation + float m_scaled_ref_length; - std::vector< size_t > m_ngram_counts; - std::vector< size_t > m_ngram_matches; + std::vector< size_t > m_ngram_counts; + std::vector< size_t > m_ngram_matches; - void AddNgramCountAndMatches(std::vector< size_t >& counts, std::vector< size_t >& matches); + void AddNgramCountAndMatches(std::vector< size_t >& counts, std::vector< size_t >& matches); }; @@ -56,7 +58,8 @@ public: }; -class BleuScoreFeature : public StatefulFeatureFunction { +class BleuScoreFeature : public StatefulFeatureFunction +{ public: typedef boost::unordered_map RefCounts; @@ -64,95 +67,105 @@ public: BleuScoreFeature(const std::string &line); - void PrintHistory(std::ostream& out) const; - void LoadReferences(const std::vector< std::vector< std::string > > &); - void SetCurrSourceLength(size_t); - void SetCurrNormSourceLength(size_t); - void SetCurrShortestRefLength(size_t); - void SetCurrAvgRefLength(size_t sent_id); - void SetAvgInputLength (float l) { m_avg_input_length = l; } - void SetCurrReferenceNgrams(size_t sent_id); - size_t GetShortestRefIndex(size_t ref_id); - size_t GetClosestRefLength(size_t ref_id, int hypoLength); - void UpdateHistory(const std::vector< const Word* >&); - void UpdateHistory(const std::vector< std::vector< const Word* > >& hypos, std::vector& sourceLengths, std::vector& ref_ids, size_t rank, size_t epoch); - void PrintRefLength(const std::vector& ref_ids); - void SetBleuParameters(bool disable, bool sentenceBleu, bool scaleByInputLength, bool scaleByAvgInputLength, - bool scaleByInverseLength, bool scaleByAvgInverseLength, - float scaleByX, float historySmoothing, size_t scheme, bool simpleHistoryBleu); + void PrintHistory(std::ostream& out) const; + void LoadReferences(const std::vector< std::vector< std::string > > &); + void SetCurrSourceLength(size_t); + void SetCurrNormSourceLength(size_t); + void SetCurrShortestRefLength(size_t); + void SetCurrAvgRefLength(size_t sent_id); + void SetAvgInputLength (float l) { + m_avg_input_length = l; + } + void SetCurrReferenceNgrams(size_t sent_id); + size_t GetShortestRefIndex(size_t ref_id); + size_t GetClosestRefLength(size_t ref_id, int hypoLength); + void UpdateHistory(const std::vector< const Word* >&); + void UpdateHistory(const std::vector< std::vector< const Word* > >& hypos, std::vector& sourceLengths, std::vector& ref_ids, size_t rank, size_t epoch); + void PrintRefLength(const std::vector& 
ref_ids); + void SetBleuParameters(bool disable, bool sentenceBleu, bool scaleByInputLength, bool scaleByAvgInputLength, + bool scaleByInverseLength, bool scaleByAvgInverseLength, + float scaleByX, float historySmoothing, size_t scheme, bool simpleHistoryBleu); - void GetNgramMatchCounts(Phrase&, - const NGrams&, - std::vector< size_t >&, - std::vector< size_t >&, - size_t skip = 0) const; - void GetNgramMatchCounts_prefix(Phrase&, - const NGrams&, - std::vector< size_t >&, - std::vector< size_t >&, - size_t new_start_indices, - size_t last_end_index) const; - void GetNgramMatchCounts_overlap(Phrase& phrase, - const NGrams& ref_ngram_counts, - std::vector< size_t >& ret_counts, - std::vector< size_t >& ret_matches, - size_t overlap_index) const; - void GetClippedNgramMatchesAndCounts(Phrase&, - const NGrams&, - std::vector< size_t >&, - std::vector< size_t >&, - size_t skip = 0) const; + void GetNgramMatchCounts(Phrase&, + const NGrams&, + std::vector< size_t >&, + std::vector< size_t >&, + size_t skip = 0) const; + void GetNgramMatchCounts_prefix(Phrase&, + const NGrams&, + std::vector< size_t >&, + std::vector< size_t >&, + size_t new_start_indices, + size_t last_end_index) const; + void GetNgramMatchCounts_overlap(Phrase& phrase, + const NGrams& ref_ngram_counts, + std::vector< size_t >& ret_counts, + std::vector< size_t >& ret_matches, + size_t overlap_index) const; + void GetClippedNgramMatchesAndCounts(Phrase&, + const NGrams&, + std::vector< size_t >&, + std::vector< size_t >&, + size_t skip = 0) const; - FFState* Evaluate( const Hypothesis& cur_hypo, - const FFState* prev_state, - ScoreComponentCollection* accumulator) const; - FFState* EvaluateChart(const ChartHypothesis& cur_hypo, - int featureID, - ScoreComponentCollection* accumulator) const; - bool Enabled() const { return m_enabled; } - float CalculateBleu(BleuScoreState*) const; - float CalculateBleu(Phrase translation) const; - const FFState* EmptyHypothesisState(const InputType&) const; + FFState* Evaluate( const Hypothesis& cur_hypo, + const FFState* prev_state, + ScoreComponentCollection* accumulator) const; + FFState* EvaluateChart(const ChartHypothesis& cur_hypo, + int featureID, + ScoreComponentCollection* accumulator) const; + bool Enabled() const { + return m_enabled; + } + float CalculateBleu(BleuScoreState*) const; + float CalculateBleu(Phrase translation) const; + const FFState* EmptyHypothesisState(const InputType&) const; - float GetSourceLengthHistory() { return m_source_length_history; } - float GetTargetLengthHistory() { return m_target_length_history; } - float GetAverageInputLength() { return m_avg_input_length; } + float GetSourceLengthHistory() { + return m_source_length_history; + } + float GetTargetLengthHistory() { + return m_target_length_history; + } + float GetAverageInputLength() { + return m_avg_input_length; + } private: - bool m_enabled; - bool m_sentence_bleu; - bool m_simple_history_bleu; + bool m_enabled; + bool m_sentence_bleu; + bool m_simple_history_bleu; - // counts for pseudo-document - std::vector< float > m_count_history; - std::vector< float > m_match_history; - float m_source_length_history; - float m_target_length_history; - float m_ref_length_history; + // counts for pseudo-document + std::vector< float > m_count_history; + std::vector< float > m_match_history; + float m_source_length_history; + float m_target_length_history; + float m_ref_length_history; - size_t m_cur_source_length; - size_t m_cur_norm_source_length; // length without , - RefCounts m_refs; - NGrams 
m_cur_ref_ngrams; - float m_cur_ref_length; + size_t m_cur_source_length; + size_t m_cur_norm_source_length; // length without <s>, </s> + RefCounts m_refs; + NGrams m_cur_ref_ngrams; + float m_cur_ref_length; - // scale BLEU score by history of input length - bool m_scale_by_input_length; - bool m_scale_by_avg_input_length; + // scale BLEU score by history of input length + bool m_scale_by_input_length; + bool m_scale_by_avg_input_length; - // scale by the inverse of the input length * 100 - bool m_scale_by_inverse_length; - bool m_scale_by_avg_inverse_length; + // scale by the inverse of the input length * 100 + bool m_scale_by_inverse_length; + bool m_scale_by_avg_inverse_length; - float m_avg_input_length; + float m_avg_input_length; - float m_scale_by_x; + float m_scale_by_x; - // smoothing factor for history counts - float m_historySmoothing; + // smoothing factor for history counts + float m_historySmoothing; - enum SmoothingScheme { PLUS_ONE = 1, PLUS_POINT_ONE = 2, PAPINENI = 3 }; - SmoothingScheme m_smoothing_scheme; + enum SmoothingScheme { PLUS_ONE = 1, PLUS_POINT_ONE = 2, PAPINENI = 3 }; + SmoothingScheme m_smoothing_scheme; }; } // Namespace. diff --git a/moses/FF/ChartBasedFeatureContext.cpp b/moses/FF/ChartBasedFeatureContext.cpp index 803f81deb..a74cce50c 100644 --- a/moses/FF/ChartBasedFeatureContext.cpp +++ b/moses/FF/ChartBasedFeatureContext.cpp @@ -5,15 +5,15 @@ namespace Moses { ChartBasedFeatureContext::ChartBasedFeatureContext - (const ChartHypothesis* hypothesis): +(const ChartHypothesis* hypothesis): m_hypothesis(hypothesis), m_targetPhrase(hypothesis->GetCurrTargetPhrase()), m_source(hypothesis->GetManager().GetSource()) {} ChartBasedFeatureContext::ChartBasedFeatureContext( - const TargetPhrase& targetPhrase, - const InputType& source): + const TargetPhrase& targetPhrase, + const InputType& source): m_hypothesis(NULL), m_targetPhrase(targetPhrase), m_source(source) diff --git a/moses/FF/ChartBasedFeatureContext.h b/moses/FF/ChartBasedFeatureContext.h index 7649effde..a204f7c77 100644 --- a/moses/FF/ChartBasedFeatureContext.h +++ b/moses/FF/ChartBasedFeatureContext.h @@ -11,7 +11,7 @@ class TargetPhrase; **/ class ChartBasedFeatureContext { - //The context either has a hypothesis (during search) or a + //The context either has a hypothesis (during search) or a //TargetPhrase and source sentence (during pre-calculation) //TODO: should the context also include some info on where the TargetPhrase //is anchored (assuming it's lexicalised), which is available at pre-calc?
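The comment above describes the two lives of a ChartBasedFeatureContext: during search it wraps a chart hypothesis, while during pre-calculation only a target phrase and the source sentence exist. The sketch below shows that either/or pattern in isolation; Hypothesis and FeatureContext here are stand-in types for illustration, not the Moses classes.

// Sketch of the two-mode context pattern described above: one constructor
// for search time (a live hypothesis), one for pre-calculation time (only
// a target phrase and the source). All types are stand-ins, not Moses'.
#include <cassert>
#include <string>

struct Hypothesis {
  std::string targetPhrase;
};

class FeatureContext
{
  const Hypothesis *m_hypothesis;   // NULL during pre-calculation
  std::string m_targetPhrase;
  std::string m_source;

public:
  // Search time: everything is reachable through the (non-NULL) hypothesis.
  FeatureContext(const Hypothesis *hypo, const std::string &source)
    : m_hypothesis(hypo), m_targetPhrase(hypo->targetPhrase), m_source(source) {}

  // Pre-calculation time: no hypothesis exists yet.
  FeatureContext(const std::string &targetPhrase, const std::string &source)
    : m_hypothesis(0), m_targetPhrase(targetPhrase), m_source(source) {}

  const std::string &GetTargetPhrase() const { return m_targetPhrase; }
  const std::string &GetSource() const { return m_source; }

  // Only meaningful during search; asserts otherwise.
  const Hypothesis &GetHypothesis() const {
    assert(m_hypothesis);
    return *m_hypothesis;
  }
};

Keeping the hypothesis pointer nullable lets the same feature-evaluation code serve both phases, with the assertion guarding accesses that only make sense during search.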
@@ -24,11 +24,13 @@ public: ChartBasedFeatureContext(const TargetPhrase& targetPhrase, const InputType& source); - const InputType& GetSource() const - { return m_source; } + const InputType& GetSource() const { + return m_source; + } - const TargetPhrase& GetTargetPhrase() const - { return m_targetPhrase; } + const TargetPhrase& GetTargetPhrase() const { + return m_targetPhrase; + } }; diff --git a/moses/FF/DistortionScoreProducer.cpp b/moses/FF/DistortionScoreProducer.cpp index 413679779..328c833c8 100644 --- a/moses/FF/DistortionScoreProducer.cpp +++ b/moses/FF/DistortionScoreProducer.cpp @@ -39,8 +39,7 @@ float DistortionScoreProducer::CalculateDistortionScore(const Hypothesis& hypo, { if(!StaticData::Instance().UseEarlyDistortionCost()) { return - (float) hypo.GetInput().ComputeDistortionDistance(prev, curr); - } - else { + } else { /* Pay distortion score as soon as possible, from Moore and Quirk MT Summit 2007 Definitions: S : current source range @@ -50,23 +49,23 @@ float DistortionScoreProducer::CalculateDistortionScore(const Hypothesis& hypo, int prefixEndPos = (int)FirstGap-1; if((int)FirstGap==-1) - prefixEndPos = -1; + prefixEndPos = -1; // case1: S is adjacent to S'' => return 0 if ((int) curr.GetStartPos() == prefixEndPos+1) { - IFVERBOSE(4) std::cerr<< "MQ07disto:case1" << std::endl; + IFVERBOSE(4) std::cerr<< "MQ07disto:case1" << std::endl; return 0; } // case2: S is to the left of S' => return 2(length(S)) if ((int) curr.GetEndPos() < (int) prev.GetEndPos()) { - IFVERBOSE(4) std::cerr<< "MQ07disto:case2" << std::endl; + IFVERBOSE(4) std::cerr<< "MQ07disto:case2" << std::endl; return (float) -2*(int)curr.GetNumWordsCovered(); } // case3: S' is a subsequence of S'' => return 2(nbWordBetween(S,S'')+length(S)) if ((int) prev.GetEndPos() <= prefixEndPos) { - IFVERBOSE(4) std::cerr<< "MQ07disto:case3" << std::endl; + IFVERBOSE(4) std::cerr<< "MQ07disto:case3" << std::endl; int z = (int)curr.GetStartPos()-prefixEndPos - 1; return (float) -2*(z + (int)curr.GetNumWordsCovered()); } diff --git a/moses/FF/DistortionScoreProducer.h b/moses/FF/DistortionScoreProducer.h index 394e7f2e1..2601e6398 100644 --- a/moses/FF/DistortionScoreProducer.h +++ b/moses/FF/DistortionScoreProducer.h @@ -17,12 +17,12 @@ class WordsRange; class DistortionScoreProducer : public StatefulFeatureFunction { public: - DistortionScoreProducer(const std::string &line) - : StatefulFeatureFunction("Distortion", 1, line) - {} + DistortionScoreProducer(const std::string &line) + : StatefulFeatureFunction("Distortion", 1, line) + {} static float CalculateDistortionScore(const Hypothesis& hypo, - const WordsRange &prev, const WordsRange &curr, const int FirstGapPosition); + const WordsRange &prev, const WordsRange &curr, const int FirstGapPosition); virtual const FFState* EmptyHypothesisState(const InputType &input) const; @@ -35,8 +35,8 @@ public: const ChartHypothesis& /* cur_hypo */, int /* featureID - used to index the state in the previous hypotheses */, ScoreComponentCollection*) const { - throw std::logic_error("DistortionScoreProducer not supported in chart decoder, yet"); - } + throw std::logic_error("DistortionScoreProducer not supported in chart decoder, yet"); + } }; } diff --git a/moses/FF/FFState.h b/moses/FF/FFState.h index 49b0e55a8..bb3a119ef 100644 --- a/moses/FF/FFState.h +++ b/moses/FF/FFState.h @@ -15,11 +15,12 @@ public: virtual int Compare(const FFState& other) const = 0; }; -class DummyState : public FFState { +class DummyState : public FFState +{ public: DummyState() {} int Compare(const 
FFState& other) const { - return 0; + return 0; } }; diff --git a/moses/FF/FeatureFunction.cpp b/moses/FF/FeatureFunction.cpp index d1a73e1a9..ea4441522 100644 --- a/moses/FF/FeatureFunction.cpp +++ b/moses/FF/FeatureFunction.cpp @@ -19,7 +19,7 @@ std::vector StatelessFeatureFunction::m_statele std::vector StatefulFeatureFunction::m_statefulFFs; FeatureFunction::FeatureFunction(const std::string& description, const std::string &line) -: m_tuneable(true) + : m_tuneable(true) { ParseLine(description, line); @@ -35,13 +35,13 @@ FeatureFunction::FeatureFunction(const std::string& description, const std::stri m_description = dstream.str(); } - ScoreComponentCollection::RegisterScoreProducer(this); + ScoreComponentCollection::RegisterScoreProducer(this); m_producers.push_back(this); } FeatureFunction::FeatureFunction(const std::string& description, size_t numScoreComponents, const std::string &line) -: m_numScoreComponents(numScoreComponents) -, m_tuneable(true) + : m_numScoreComponents(numScoreComponents) + , m_tuneable(true) { ParseLine(description, line); @@ -75,14 +75,11 @@ void FeatureFunction::ParseLine(const std::string& description, const std::strin if (args[0] == "num-features") { m_numScoreComponents = Scan(args[1]); - } - else if (args[0] == "name") { + } else if (args[0] == "name") { m_description = args[1]; - } - else if (args[0] == "tuneable") { + } else if (args[0] == "tuneable") { m_tuneable = Scan(args[1]); - } - else { + } else { m_args.push_back(args); } } diff --git a/moses/FF/FeatureFunction.h b/moses/FF/FeatureFunction.h index 6e1fa67a8..97e7d754d 100644 --- a/moses/FF/FeatureFunction.h +++ b/moses/FF/FeatureFunction.h @@ -42,26 +42,33 @@ protected: void ParseLine(const std::string& description, const std::string &line); public: - static const std::vector& GetFeatureFunctions() { return m_producers; } + static const std::vector& GetFeatureFunctions() { + return m_producers; + } FeatureFunction(const std::string& description, const std::string &line); FeatureFunction(const std::string& description, size_t numScoreComponents, const std::string &line); - virtual bool IsStateless() const = 0; + virtual bool IsStateless() const = 0; virtual ~FeatureFunction(); - + static void ResetDescriptionCounts() { description_counts.clear(); } //! returns the number of scores that a subclass produces. //! For example, a language model conventionally produces 1, a translation table some arbitrary number, etc - size_t GetNumScoreComponents() const {return m_numScoreComponents;} + size_t GetNumScoreComponents() const { + return m_numScoreComponents; + } //! returns a string description of this producer - const std::string& GetScoreProducerDescription() const - { return m_description; } + const std::string& GetScoreProducerDescription() const { + return m_description; + } - virtual bool IsTuneable() const { return m_tuneable; } + virtual bool IsTuneable() const { + return m_tuneable; + } //! 
virtual void InitializeForInput(InputType const& source) @@ -71,17 +78,18 @@ public: virtual void CleanUpAfterSentenceProcessing(const InputType& source) {} - const std::string &GetArgLine() const - { return m_argLine; } + const std::string &GetArgLine() const { + return m_argLine; + } virtual void Evaluate(const Phrase &source - , const TargetPhrase &targetPhrase - , ScoreComponentCollection &scoreBreakdown - , ScoreComponentCollection &estimatedFutureScore) const + , const TargetPhrase &targetPhrase + , ScoreComponentCollection &scoreBreakdown + , ScoreComponentCollection &estimatedFutureScore) const {} virtual void Evaluate(const InputType &source - , ScoreComponentCollection &scoreBreakdown) const + , ScoreComponentCollection &scoreBreakdown) const {} }; diff --git a/moses/FF/GlobalLexicalModel.cpp b/moses/FF/GlobalLexicalModel.cpp index 5724f6598..cbc6811ee 100644 --- a/moses/FF/GlobalLexicalModel.cpp +++ b/moses/FF/GlobalLexicalModel.cpp @@ -10,7 +10,7 @@ using namespace std; namespace Moses { GlobalLexicalModel::GlobalLexicalModel(const std::string &line) -: StatelessFeatureFunction("GlobalLexicalModel",1, line) + : StatelessFeatureFunction("GlobalLexicalModel",1, line) { std::cerr << "Creating global lexical model...\n"; @@ -23,14 +23,11 @@ GlobalLexicalModel::GlobalLexicalModel(const std::string &line) if (args[0] == "file") { CHECK(args.size() == 2); filePath = args[1]; - } - else if (args[0] == "inputFactors") { + } else if (args[0] == "inputFactors") { inputFactors = Tokenize(args[1],","); - } - else if (args[0] == "outputFactors") { + } else if (args[0] == "outputFactors") { outputFactors = Tokenize(args[1],","); - } - else { + } else { throw "Unknown argument " + args[0]; } } @@ -179,11 +176,11 @@ float GlobalLexicalModel::GetFromCacheOrScorePhrase( const TargetPhrase& targetP } void GlobalLexicalModel::Evaluate - (const PhraseBasedFeatureContext& context, - ScoreComponentCollection* accumulator) const +(const PhraseBasedFeatureContext& context, + ScoreComponentCollection* accumulator) const { - accumulator->PlusEquals( this, - GetFromCacheOrScorePhrase(context.GetTargetPhrase()) ); + accumulator->PlusEquals( this, + GetFromCacheOrScorePhrase(context.GetTargetPhrase()) ); } } diff --git a/moses/FF/GlobalLexicalModel.h b/moses/FF/GlobalLexicalModel.h index 03659b7f2..b3bf79b53 100644 --- a/moses/FF/GlobalLexicalModel.h +++ b/moses/FF/GlobalLexicalModel.h @@ -37,8 +37,7 @@ class GlobalLexicalModel : public StatelessFeatureFunction typedef std::map< const Word*, float, WordComparer > SingleHash; typedef std::map< const TargetPhrase*, float > LexiconCache; - struct ThreadLocalStorage - { + struct ThreadLocalStorage { LexiconCache cache; const Sentence *input; }; @@ -64,18 +63,17 @@ private: public: GlobalLexicalModel(const std::string &line); - virtual ~GlobalLexicalModel(); + virtual ~GlobalLexicalModel(); void InitializeForInput( Sentence const& in ); void Evaluate(const PhraseBasedFeatureContext& context, - ScoreComponentCollection* accumulator) const; + ScoreComponentCollection* accumulator) const; void EvaluateChart( const ChartBasedFeatureContext& context, - ScoreComponentCollection* accumulator) const - { + ScoreComponentCollection* accumulator) const { throw std::logic_error("GlobalLexicalModel not supported in chart decoder, yet"); } diff --git a/moses/FF/GlobalLexicalModelUnlimited.cpp b/moses/FF/GlobalLexicalModelUnlimited.cpp index d4b1aeb37..5c096e43f 100644 --- a/moses/FF/GlobalLexicalModelUnlimited.cpp +++ b/moses/FF/GlobalLexicalModelUnlimited.cpp @@ -10,7 +10,7 
@@ using namespace std; namespace Moses { GlobalLexicalModelUnlimited::GlobalLexicalModelUnlimited(const std::string &line) -:StatelessFeatureFunction("GlobalLexicalModelUnlimited", 0, line) + :StatelessFeatureFunction("GlobalLexicalModelUnlimited", 0, line) { const vector modelSpec = Tokenize(line); @@ -25,7 +25,7 @@ GlobalLexicalModelUnlimited::GlobalLexicalModelUnlimited(const std::string &line if (spec.size() > 0) { if (spec.size() != 2 && spec.size() != 3 && spec.size() != 4 && spec.size() != 6) { UserMessage::Add("Format of glm feature is - [ignore-punct] [use-bias] " - "[context-type] [filename-src filename-tgt]"); + "[context-type] [filename-src filename-tgt]"); //return false; } @@ -41,8 +41,7 @@ GlobalLexicalModelUnlimited::GlobalLexicalModelUnlimited(const std::string &line filenameTarget = spec[5]; restricted = true; } - } - else + } else factors = Tokenize(modelSpec[i],"-"); if ( factors.size() != 2 ) { @@ -66,14 +65,13 @@ GlobalLexicalModelUnlimited::GlobalLexicalModelUnlimited(const std::string &line } bool GlobalLexicalModelUnlimited::Load(const std::string &filePathSource, - const std::string &filePathTarget) + const std::string &filePathTarget) { // restricted source word vocabulary ifstream inFileSource(filePathSource.c_str()); - if (!inFileSource) - { - cerr << "could not open file " << filePathSource << endl; - return false; + if (!inFileSource) { + cerr << "could not open file " << filePathSource << endl; + return false; } std::string line; @@ -85,10 +83,9 @@ bool GlobalLexicalModelUnlimited::Load(const std::string &filePathSource, // restricted target word vocabulary ifstream inFileTarget(filePathTarget.c_str()); - if (!inFileTarget) - { - cerr << "could not open file " << filePathTarget << endl; - return false; + if (!inFileTarget) { + cerr << "could not open file " << filePathTarget << endl; + return false; } while (getline(inFileTarget, line)) { @@ -109,228 +106,222 @@ void GlobalLexicalModelUnlimited::InitializeForInput( Sentence const& in ) void GlobalLexicalModelUnlimited::Evaluate(const Hypothesis& cur_hypo, ScoreComponentCollection* accumulator) const { - const Sentence& input = *(m_local->input); - const TargetPhrase& targetPhrase = cur_hypo.GetCurrTargetPhrase(); + const Sentence& input = *(m_local->input); + const TargetPhrase& targetPhrase = cur_hypo.GetCurrTargetPhrase(); - for(int targetIndex = 0; targetIndex < targetPhrase.GetSize(); targetIndex++ ) { - StringPiece targetString = targetPhrase.GetWord(targetIndex).GetString(0); // TODO: change for other factors + for(int targetIndex = 0; targetIndex < targetPhrase.GetSize(); targetIndex++ ) { + StringPiece targetString = targetPhrase.GetWord(targetIndex).GetString(0); // TODO: change for other factors - if (m_ignorePunctuation) { - // check if first char is punctuation - char firstChar = targetString[0]; - CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar ); - if(charIterator != m_punctuationHash.end()) - continue; - } + if (m_ignorePunctuation) { + // check if first char is punctuation + char firstChar = targetString[0]; + CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar ); + if(charIterator != m_punctuationHash.end()) + continue; + } - if (m_biasFeature) { - stringstream feature; - feature << "glm_"; - feature << targetString; - feature << "~"; - feature << "**BIAS**"; - accumulator->SparsePlusEquals(feature.str(), 1); - } + if (m_biasFeature) { + stringstream feature; + feature << "glm_"; + feature << targetString; + feature << "~"; + feature << 
"**BIAS**"; + accumulator->SparsePlusEquals(feature.str(), 1); + } - boost::unordered_set alreadyScored; - for(int sourceIndex = 0; sourceIndex < input.GetSize(); sourceIndex++ ) { - const StringPiece sourceString = input.GetWord(sourceIndex).GetString(0); // TODO: change for other factors + boost::unordered_set alreadyScored; + for(int sourceIndex = 0; sourceIndex < input.GetSize(); sourceIndex++ ) { + const StringPiece sourceString = input.GetWord(sourceIndex).GetString(0); // TODO: change for other factors - if (m_ignorePunctuation) { - // check if first char is punctuation - char firstChar = sourceString[0]; - CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar ); - if(charIterator != m_punctuationHash.end()) - continue; - } - const uint64_t sourceHash = util::MurmurHashNative(sourceString.data(), sourceString.size()); + if (m_ignorePunctuation) { + // check if first char is punctuation + char firstChar = sourceString[0]; + CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar ); + if(charIterator != m_punctuationHash.end()) + continue; + } + const uint64_t sourceHash = util::MurmurHashNative(sourceString.data(), sourceString.size()); - if ( alreadyScored.find(sourceHash) == alreadyScored.end()) { - bool sourceExists, targetExists; - if (!m_unrestricted) { - sourceExists = FindStringPiece(m_vocabSource, sourceString ) != m_vocabSource.end(); - targetExists = FindStringPiece(m_vocabTarget, targetString) != m_vocabTarget.end(); - } + if ( alreadyScored.find(sourceHash) == alreadyScored.end()) { + bool sourceExists, targetExists; + if (!m_unrestricted) { + sourceExists = FindStringPiece(m_vocabSource, sourceString ) != m_vocabSource.end(); + targetExists = FindStringPiece(m_vocabTarget, targetString) != m_vocabTarget.end(); + } - // no feature if vocab is in use and both words are not in restricted vocabularies - if (m_unrestricted || (sourceExists && targetExists)) { - if (m_sourceContext) { - if (sourceIndex == 0) { - // add trigger feature for source - stringstream feature; - feature << "glm_"; - feature << targetString; - feature << "~"; - feature << ","; - feature << sourceString; - accumulator->SparsePlusEquals(feature.str(), 1); - alreadyScored.insert(sourceHash); - } + // no feature if vocab is in use and both words are not in restricted vocabularies + if (m_unrestricted || (sourceExists && targetExists)) { + if (m_sourceContext) { + if (sourceIndex == 0) { + // add trigger feature for source + stringstream feature; + feature << "glm_"; + feature << targetString; + feature << "~"; + feature << ","; + feature << sourceString; + accumulator->SparsePlusEquals(feature.str(), 1); + alreadyScored.insert(sourceHash); + } - // add source words to the right of current source word as context - for(int contextIndex = sourceIndex+1; contextIndex < input.GetSize(); contextIndex++ ) { - StringPiece contextString = input.GetWord(contextIndex).GetString(0); // TODO: change for other factors - bool contextExists; - if (!m_unrestricted) - contextExists = FindStringPiece(m_vocabSource, contextString ) != m_vocabSource.end(); + // add source words to the right of current source word as context + for(int contextIndex = sourceIndex+1; contextIndex < input.GetSize(); contextIndex++ ) { + StringPiece contextString = input.GetWord(contextIndex).GetString(0); // TODO: change for other factors + bool contextExists; + if (!m_unrestricted) + contextExists = FindStringPiece(m_vocabSource, contextString ) != m_vocabSource.end(); - if (m_unrestricted || contextExists) { 
- stringstream feature; - feature << "glm_"; - feature << targetString; - feature << "~"; - feature << sourceString; - feature << ","; - feature << contextString; - accumulator->SparsePlusEquals(feature.str(), 1); - alreadyScored.insert(sourceHash); - } - } - } - else if (m_biphrase) { - // --> look backwards for constructing context - int globalTargetIndex = cur_hypo.GetSize() - targetPhrase.GetSize() + targetIndex; + if (m_unrestricted || contextExists) { + stringstream feature; + feature << "glm_"; + feature << targetString; + feature << "~"; + feature << sourceString; + feature << ","; + feature << contextString; + accumulator->SparsePlusEquals(feature.str(), 1); + alreadyScored.insert(sourceHash); + } + } + } else if (m_biphrase) { + // --> look backwards for constructing context + int globalTargetIndex = cur_hypo.GetSize() - targetPhrase.GetSize() + targetIndex; - // 1) source-target pair, trigger source word (can be discont.) and adjacent target word (bigram) - StringPiece targetContext; - if (globalTargetIndex > 0) - targetContext = cur_hypo.GetWord(globalTargetIndex-1).GetString(0); // TODO: change for other factors - else - targetContext = ""; + // 1) source-target pair, trigger source word (can be discont.) and adjacent target word (bigram) + StringPiece targetContext; + if (globalTargetIndex > 0) + targetContext = cur_hypo.GetWord(globalTargetIndex-1).GetString(0); // TODO: change for other factors + else + targetContext = ""; - if (sourceIndex == 0) { - StringPiece sourceTrigger = ""; - AddFeature(accumulator, sourceTrigger, sourceString, - targetContext, targetString); - } - else - for(int contextIndex = sourceIndex-1; contextIndex >= 0; contextIndex-- ) { - StringPiece sourceTrigger = input.GetWord(contextIndex).GetString(0); // TODO: change for other factors - bool sourceTriggerExists = false; - if (!m_unrestricted) - sourceTriggerExists = FindStringPiece(m_vocabSource, sourceTrigger ) != m_vocabSource.end(); + if (sourceIndex == 0) { + StringPiece sourceTrigger = ""; + AddFeature(accumulator, sourceTrigger, sourceString, + targetContext, targetString); + } else + for(int contextIndex = sourceIndex-1; contextIndex >= 0; contextIndex-- ) { + StringPiece sourceTrigger = input.GetWord(contextIndex).GetString(0); // TODO: change for other factors + bool sourceTriggerExists = false; + if (!m_unrestricted) + sourceTriggerExists = FindStringPiece(m_vocabSource, sourceTrigger ) != m_vocabSource.end(); - if (m_unrestricted || sourceTriggerExists) - AddFeature(accumulator, sourceTrigger, sourceString, - targetContext, targetString); - } + if (m_unrestricted || sourceTriggerExists) + AddFeature(accumulator, sourceTrigger, sourceString, + targetContext, targetString); + } - // 2) source-target pair, adjacent source word (bigram) and trigger target word (can be discont.) - StringPiece sourceContext; - if (sourceIndex-1 >= 0) - sourceContext = input.GetWord(sourceIndex-1).GetString(0); // TODO: change for other factors - else - sourceContext = ""; + // 2) source-target pair, adjacent source word (bigram) and trigger target word (can be discont.) 
+ StringPiece sourceContext; + if (sourceIndex-1 >= 0) + sourceContext = input.GetWord(sourceIndex-1).GetString(0); // TODO: change for other factors + else + sourceContext = ""; - if (globalTargetIndex == 0) { - string targetTrigger = ""; - AddFeature(accumulator, sourceContext, sourceString, - targetTrigger, targetString); - } - else - for(int globalContextIndex = globalTargetIndex-1; globalContextIndex >= 0; globalContextIndex-- ) { - StringPiece targetTrigger = cur_hypo.GetWord(globalContextIndex).GetString(0); // TODO: change for other factors - bool targetTriggerExists = false; - if (!m_unrestricted) - targetTriggerExists = FindStringPiece(m_vocabTarget, targetTrigger ) != m_vocabTarget.end(); + if (globalTargetIndex == 0) { + string targetTrigger = ""; + AddFeature(accumulator, sourceContext, sourceString, + targetTrigger, targetString); + } else + for(int globalContextIndex = globalTargetIndex-1; globalContextIndex >= 0; globalContextIndex-- ) { + StringPiece targetTrigger = cur_hypo.GetWord(globalContextIndex).GetString(0); // TODO: change for other factors + bool targetTriggerExists = false; + if (!m_unrestricted) + targetTriggerExists = FindStringPiece(m_vocabTarget, targetTrigger ) != m_vocabTarget.end(); - if (m_unrestricted || targetTriggerExists) - AddFeature(accumulator, sourceContext, sourceString, - targetTrigger, targetString); - } - } - else if (m_bitrigger) { - // allow additional discont. triggers on both sides - int globalTargetIndex = cur_hypo.GetSize() - targetPhrase.GetSize() + targetIndex; + if (m_unrestricted || targetTriggerExists) + AddFeature(accumulator, sourceContext, sourceString, + targetTrigger, targetString); + } + } else if (m_bitrigger) { + // allow additional discont. triggers on both sides + int globalTargetIndex = cur_hypo.GetSize() - targetPhrase.GetSize() + targetIndex; - if (sourceIndex == 0) { - StringPiece sourceTrigger = ""; - bool sourceTriggerExists = true; + if (sourceIndex == 0) { + StringPiece sourceTrigger = ""; + bool sourceTriggerExists = true; - if (globalTargetIndex == 0) { - string targetTrigger = ""; - bool targetTriggerExists = true; + if (globalTargetIndex == 0) { + string targetTrigger = ""; + bool targetTriggerExists = true; - if (m_unrestricted || (sourceTriggerExists && targetTriggerExists)) - AddFeature(accumulator, sourceTrigger, sourceString, - targetTrigger, targetString); - } - else { - // iterate backwards over target - for(int globalContextIndex = globalTargetIndex-1; globalContextIndex >= 0; globalContextIndex-- ) { - StringPiece targetTrigger = cur_hypo.GetWord(globalContextIndex).GetString(0); // TODO: change for other factors - bool targetTriggerExists = false; - if (!m_unrestricted) - targetTriggerExists = FindStringPiece(m_vocabTarget, targetTrigger ) != m_vocabTarget.end(); + if (m_unrestricted || (sourceTriggerExists && targetTriggerExists)) + AddFeature(accumulator, sourceTrigger, sourceString, + targetTrigger, targetString); + } else { + // iterate backwards over target + for(int globalContextIndex = globalTargetIndex-1; globalContextIndex >= 0; globalContextIndex-- ) { + StringPiece targetTrigger = cur_hypo.GetWord(globalContextIndex).GetString(0); // TODO: change for other factors + bool targetTriggerExists = false; + if (!m_unrestricted) + targetTriggerExists = FindStringPiece(m_vocabTarget, targetTrigger ) != m_vocabTarget.end(); - if (m_unrestricted || (sourceTriggerExists && targetTriggerExists)) - AddFeature(accumulator, sourceTrigger, sourceString, - targetTrigger, targetString); - } - } - } - // 
iterate over both source and target - else { - // iterate backwards over source - for(int contextIndex = sourceIndex-1; contextIndex >= 0; contextIndex-- ) { - StringPiece sourceTrigger = input.GetWord(contextIndex).GetString(0); // TODO: change for other factors - bool sourceTriggerExists = false; - if (!m_unrestricted) - sourceTriggerExists = FindStringPiece(m_vocabSource, sourceTrigger ) != m_vocabSource.end(); + if (m_unrestricted || (sourceTriggerExists && targetTriggerExists)) + AddFeature(accumulator, sourceTrigger, sourceString, + targetTrigger, targetString); + } + } + } + // iterate over both source and target + else { + // iterate backwards over source + for(int contextIndex = sourceIndex-1; contextIndex >= 0; contextIndex-- ) { + StringPiece sourceTrigger = input.GetWord(contextIndex).GetString(0); // TODO: change for other factors + bool sourceTriggerExists = false; + if (!m_unrestricted) + sourceTriggerExists = FindStringPiece(m_vocabSource, sourceTrigger ) != m_vocabSource.end(); - if (globalTargetIndex == 0) { - string targetTrigger = ""; - bool targetTriggerExists = true; + if (globalTargetIndex == 0) { + string targetTrigger = ""; + bool targetTriggerExists = true; - if (m_unrestricted || (sourceTriggerExists && targetTriggerExists)) - AddFeature(accumulator, sourceTrigger, sourceString, - targetTrigger, targetString); - } - else { - // iterate backwards over target - for(int globalContextIndex = globalTargetIndex-1; globalContextIndex >= 0; globalContextIndex-- ) { - StringPiece targetTrigger = cur_hypo.GetWord(globalContextIndex).GetString(0); // TODO: change for other factors - bool targetTriggerExists = false; - if (!m_unrestricted) - targetTriggerExists = FindStringPiece(m_vocabTarget, targetTrigger ) != m_vocabTarget.end(); + if (m_unrestricted || (sourceTriggerExists && targetTriggerExists)) + AddFeature(accumulator, sourceTrigger, sourceString, + targetTrigger, targetString); + } else { + // iterate backwards over target + for(int globalContextIndex = globalTargetIndex-1; globalContextIndex >= 0; globalContextIndex-- ) { + StringPiece targetTrigger = cur_hypo.GetWord(globalContextIndex).GetString(0); // TODO: change for other factors + bool targetTriggerExists = false; + if (!m_unrestricted) + targetTriggerExists = FindStringPiece(m_vocabTarget, targetTrigger ) != m_vocabTarget.end(); - if (m_unrestricted || (sourceTriggerExists && targetTriggerExists)) - AddFeature(accumulator, sourceTrigger, sourceString, - targetTrigger, targetString); - } - } - } - } - } - else { - stringstream feature; - feature << "glm_"; - feature << targetString; - feature << "~"; - feature << sourceString; - accumulator->SparsePlusEquals(feature.str(), 1); - alreadyScored.insert(sourceHash); + if (m_unrestricted || (sourceTriggerExists && targetTriggerExists)) + AddFeature(accumulator, sourceTrigger, sourceString, + targetTrigger, targetString); + } + } + } + } + } else { + stringstream feature; + feature << "glm_"; + feature << targetString; + feature << "~"; + feature << sourceString; + accumulator->SparsePlusEquals(feature.str(), 1); + alreadyScored.insert(sourceHash); - } - } - } - } + } + } + } + } } } void GlobalLexicalModelUnlimited::AddFeature(ScoreComponentCollection* accumulator, - StringPiece sourceTrigger, StringPiece sourceWord, - StringPiece targetTrigger, StringPiece targetWord) const { - stringstream feature; - feature << "glm_"; - feature << targetTrigger; - feature << ","; - feature << targetWord; - feature << "~"; - feature << sourceTrigger; - feature << ","; - feature 
<< sourceWord; - accumulator->SparsePlusEquals(feature.str(), 1); + StringPiece sourceTrigger, StringPiece sourceWord, + StringPiece targetTrigger, StringPiece targetWord) const +{ + stringstream feature; + feature << "glm_"; + feature << targetTrigger; + feature << ","; + feature << targetWord; + feature << "~"; + feature << sourceTrigger; + feature << ","; + feature << sourceWord; + accumulator->SparsePlusEquals(feature.str(), 1); } diff --git a/moses/FF/GlobalLexicalModelUnlimited.h b/moses/FF/GlobalLexicalModelUnlimited.h index 42b7abae9..28579f55c 100644 --- a/moses/FF/GlobalLexicalModelUnlimited.h +++ b/moses/FF/GlobalLexicalModelUnlimited.h @@ -38,11 +38,10 @@ class InputType; class GlobalLexicalModelUnlimited : public StatelessFeatureFunction { - typedef std::map< char, short > CharHash; - typedef std::map< std::string, short > StringHash; + typedef std::map< char, short > CharHash; + typedef std::map< std::string, short > StringHash; - struct ThreadLocalStorage - { + struct ThreadLocalStorage { const Sentence *input; }; @@ -77,23 +76,23 @@ public: void InitializeForInput( Sentence const& in ); const FFState* EmptyHypothesisState(const InputType &) const { - return new DummyState(); + return new DummyState(); } //TODO: This implements the old interface, but cannot be updated because //it appears to be stateful void Evaluate(const Hypothesis& cur_hypo, - ScoreComponentCollection* accumulator) const; + ScoreComponentCollection* accumulator) const; void EvaluateChart(const ChartHypothesis& /* cur_hypo */, - int /* featureID */, - ScoreComponentCollection* ) const { + int /* featureID */, + ScoreComponentCollection* ) const { throw std::logic_error("GlobalLexicalModelUnlimited not supported in chart decoder, yet"); } - void AddFeature(ScoreComponentCollection* accumulator, - StringPiece sourceTrigger, StringPiece sourceWord, StringPiece targetTrigger, - StringPiece targetWord) const; + void AddFeature(ScoreComponentCollection* accumulator, + StringPiece sourceTrigger, StringPiece sourceWord, StringPiece targetTrigger, + StringPiece targetWord) const; }; diff --git a/moses/FF/InputFeature.cpp b/moses/FF/InputFeature.cpp index 6dc60f94a..1ef394f9f 100644 --- a/moses/FF/InputFeature.cpp +++ b/moses/FF/InputFeature.cpp @@ -7,7 +7,7 @@ using namespace std; namespace Moses { InputFeature::InputFeature(const std::string &line) -:StatelessFeatureFunction("InputFeature", line) + :StatelessFeatureFunction("InputFeature", line) { } @@ -17,19 +17,19 @@ const InputFeature &InputFeature::GetInputFeature() static const InputFeature *staticObj = NULL; if (staticObj) { - return *staticObj; + return *staticObj; } // 1st time looking up the feature const std::vector<const StatelessFeatureFunction*> &statefulFFs = StatelessFeatureFunction::GetStatelessFeatureFunctions(); for (size_t i = 0; i < statefulFFs.size(); ++i) { - const StatelessFeatureFunction *ff = statefulFFs[i]; - const InputFeature *lm = dynamic_cast<const InputFeature*>(ff); + const StatelessFeatureFunction *ff = statefulFFs[i]; + const InputFeature *lm = dynamic_cast<const InputFeature*>(ff); - if (lm) { - staticObj = lm; - return *staticObj; - } + if (lm) { + staticObj = lm; + return *staticObj; + } } throw std::logic_error("No input feature."); diff --git a/moses/FF/PhraseBasedFeatureContext.cpp b/moses/FF/PhraseBasedFeatureContext.cpp index 46e754801..4127a587c 100644 --- a/moses/FF/PhraseBasedFeatureContext.cpp +++ b/moses/FF/PhraseBasedFeatureContext.cpp @@ -11,7 +11,7 @@ PhraseBasedFeatureContext::PhraseBasedFeatureContext(const Hypothesis* hypothesi m_source(m_hypothesis->GetManager().GetSource()) {}
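GetInputFeature above is a lazy singleton lookup: the first call scans the globally registered stateless feature functions for the single InputFeature instance (registration happens in the StatelessFeatureFunction constructor) and caches it in a static pointer. The same pattern generalises to any concrete feature type; a sketch under the assumption that the registration list behaves as shown in this commit, where the template helper itself is hypothetical and not Moses API:

#include <cstddef>
#include <stdexcept>
#include <vector>
#include "moses/FF/StatelessFeatureFunction.h"

// Hypothetical generalisation of InputFeature::GetInputFeature(): return the
// first registered stateless feature of type T, or throw if none exists.
template <class T>
const T &FindStatelessFF()
{
  const std::vector<const Moses::StatelessFeatureFunction*> &ffs =
    Moses::StatelessFeatureFunction::GetStatelessFeatureFunctions();
  for (std::size_t i = 0; i < ffs.size(); ++i) {
    // same dynamic_cast test the lookup above applies to each candidate
    if (const T *ff = dynamic_cast<const T*>(ffs[i]))
      return *ff;
  }
  throw std::logic_error("feature not registered");
}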
PhraseBasedFeatureContext::PhraseBasedFeatureContext - (const TranslationOption& translationOption, const InputType& source) : +(const TranslationOption& translationOption, const InputType& source) : m_hypothesis(NULL), m_translationOption(translationOption), m_source(source) diff --git a/moses/FF/PhraseBasedFeatureContext.h b/moses/FF/PhraseBasedFeatureContext.h index b2c7052f6..0c41712ca 100644 --- a/moses/FF/PhraseBasedFeatureContext.h +++ b/moses/FF/PhraseBasedFeatureContext.h @@ -17,7 +17,7 @@ class WordsBitmap; **/ class PhraseBasedFeatureContext { - // The context either has a hypothesis (during search), or a TranslationOption and + // The context either has a hypothesis (during search), or a TranslationOption and // source sentence (during pre-calculation). const Hypothesis* m_hypothesis; const TranslationOption& m_translationOption; @@ -28,10 +28,12 @@ public: PhraseBasedFeatureContext(const TranslationOption& translationOption, const InputType& source); - const TranslationOption& GetTranslationOption() const - { return m_translationOption; } - const InputType& GetSource() const - { return m_source; } + const TranslationOption& GetTranslationOption() const { + return m_translationOption; + } + const InputType& GetSource() const { + return m_source; + } const TargetPhrase& GetTargetPhrase() const; //convenience method const WordsBitmap& GetWordsBitmap() const; diff --git a/moses/FF/PhraseBoundaryFeature.cpp b/moses/FF/PhraseBoundaryFeature.cpp index 671cc903e..ff73c760e 100644 --- a/moses/FF/PhraseBoundaryFeature.cpp +++ b/moses/FF/PhraseBoundaryFeature.cpp @@ -4,9 +4,10 @@ using namespace std; -namespace Moses { +namespace Moses +{ -int PhraseBoundaryState::Compare(const FFState& other) const +int PhraseBoundaryState::Compare(const FFState& other) const { const PhraseBoundaryState& rhs = dynamic_cast(other); int tgt = Word::Compare(*m_targetWord,*(rhs.m_targetWord)); @@ -15,7 +16,7 @@ int PhraseBoundaryState::Compare(const FFState& other) const } PhraseBoundaryFeature::PhraseBoundaryFeature(const std::string &line) -: StatefulFeatureFunction("PhraseBoundaryFeature", 0, line) + : StatefulFeatureFunction("PhraseBoundaryFeature", 0, line) { std::cerr << "Initializing source word deletion feature.." 
<< std::endl; @@ -24,17 +25,15 @@ PhraseBoundaryFeature::PhraseBoundaryFeature(const std::string &line) if (args[0] == "source") { m_sourceFactors = Tokenize(args[1], ","); - } - else if (args[0] == "target") { + } else if (args[0] == "target") { m_targetFactors = Tokenize(args[1], ","); - } - else { + } else { throw "Unknown argument " + args[0]; } } } -const FFState* PhraseBoundaryFeature::EmptyHypothesisState(const InputType &) const +const FFState* PhraseBoundaryFeature::EmptyHypothesisState(const InputType &) const { return new PhraseBoundaryState(NULL,NULL); } @@ -42,31 +41,32 @@ const FFState* PhraseBoundaryFeature::EmptyHypothesisState(const InputType &) co void PhraseBoundaryFeature::AddFeatures( const Word* leftWord, const Word* rightWord, const FactorList& factors, const string& side, - ScoreComponentCollection* scores) const { - for (size_t i = 0; i < factors.size(); ++i) { - ostringstream name; - name << side << ":"; - name << factors[i]; - name << ":"; - if (leftWord) { - name << leftWord->GetFactor(factors[i])->GetString(); - } else { - name << BOS_; - } - name << ":"; - if (rightWord) { - name << rightWord->GetFactor(factors[i])->GetString(); - } else { - name << EOS_; - } - scores->PlusEquals(this,name.str(),1); + ScoreComponentCollection* scores) const +{ + for (size_t i = 0; i < factors.size(); ++i) { + ostringstream name; + name << side << ":"; + name << factors[i]; + name << ":"; + if (leftWord) { + name << leftWord->GetFactor(factors[i])->GetString(); + } else { + name << BOS_; } + name << ":"; + if (rightWord) { + name << rightWord->GetFactor(factors[i])->GetString(); + } else { + name << EOS_; + } + scores->PlusEquals(this,name.str(),1); + } } FFState* PhraseBoundaryFeature::Evaluate - (const Hypothesis& cur_hypo, const FFState* prev_state, - ScoreComponentCollection* scores) const +(const Hypothesis& cur_hypo, const FFState* prev_state, + ScoreComponentCollection* scores) const { const PhraseBoundaryState* pbState = dynamic_cast(prev_state); const Phrase& targetPhrase = cur_hypo.GetCurrTargetPhrase(); diff --git a/moses/FF/PhraseBoundaryFeature.h b/moses/FF/PhraseBoundaryFeature.h index 34b12abf6..b06e66eea 100644 --- a/moses/FF/PhraseBoundaryFeature.h +++ b/moses/FF/PhraseBoundaryFeature.h @@ -12,12 +12,17 @@ namespace Moses { -class PhraseBoundaryState : public FFState { +class PhraseBoundaryState : public FFState +{ public: PhraseBoundaryState(const Word* sourceWord, const Word* targetWord) : - m_sourceWord(sourceWord), m_targetWord(targetWord) {} - const Word* GetSourceWord() const {return m_sourceWord;} - const Word* GetTargetWord() const {return m_targetWord;} + m_sourceWord(sourceWord), m_targetWord(targetWord) {} + const Word* GetSourceWord() const { + return m_sourceWord; + } + const Word* GetTargetWord() const { + return m_targetWord; + } virtual int Compare(const FFState& other) const; @@ -30,7 +35,8 @@ private: /** * Concatenations of factors on boundaries of phrases. 
**/ -class PhraseBoundaryFeature : public StatefulFeatureFunction { +class PhraseBoundaryFeature : public StatefulFeatureFunction +{ public: PhraseBoundaryFeature(const std::string &line); @@ -39,7 +45,7 @@ public: virtual const FFState* EmptyHypothesisState(const InputType &) const; virtual FFState* Evaluate(const Hypothesis& cur_hypo, const FFState* prev_state, - ScoreComponentCollection* accumulator) const; + ScoreComponentCollection* accumulator) const; virtual FFState* EvaluateChart( const ChartHypothesis& /* cur_hypo */, int /* featureID */, @@ -49,7 +55,7 @@ public: private: void AddFeatures( - const Word* leftWord, const Word* rightWord, const FactorList& factors, + const Word* leftWord, const Word* rightWord, const FactorList& factors, const std::string& side, ScoreComponentCollection* scores) const ; FactorList m_sourceFactors; FactorList m_targetFactors; diff --git a/moses/FF/PhraseLengthFeature.cpp b/moses/FF/PhraseLengthFeature.cpp index b9e8e9e1d..2efeb07d2 100644 --- a/moses/FF/PhraseLengthFeature.cpp +++ b/moses/FF/PhraseLengthFeature.cpp @@ -4,20 +4,21 @@ #include "moses/ScoreComponentCollection.h" #include "moses/TranslationOption.h" -namespace Moses { +namespace Moses +{ using namespace std; PhraseLengthFeature::PhraseLengthFeature(const std::string &line) -:StatelessFeatureFunction("PhraseLengthFeature", 0, line) + :StatelessFeatureFunction("PhraseLengthFeature", 0, line) { } void PhraseLengthFeature::Evaluate(const Phrase &source - , const TargetPhrase &targetPhrase - , ScoreComponentCollection &scoreBreakdown - , ScoreComponentCollection &estimatedFutureScore) const + , const TargetPhrase &targetPhrase + , ScoreComponentCollection &scoreBreakdown + , ScoreComponentCollection &estimatedFutureScore) const { // get length of source and target phrase size_t targetLength = targetPhrase.GetSize(); diff --git a/moses/FF/PhraseLengthFeature.h b/moses/FF/PhraseLengthFeature.h index 327865558..23c168417 100644 --- a/moses/FF/PhraseLengthFeature.h +++ b/moses/FF/PhraseLengthFeature.h @@ -15,7 +15,8 @@ namespace Moses /** Sets the features for length of source phrase, target phrase, both. */ -class PhraseLengthFeature : public StatelessFeatureFunction { +class PhraseLengthFeature : public StatelessFeatureFunction +{ public: PhraseLengthFeature(const std::string &line); @@ -25,9 +26,9 @@ public: } virtual void Evaluate(const Phrase &source - , const TargetPhrase &targetPhrase - , ScoreComponentCollection &scoreBreakdown - , ScoreComponentCollection &estimatedFutureScore) const; + , const TargetPhrase &targetPhrase + , ScoreComponentCollection &scoreBreakdown + , ScoreComponentCollection &estimatedFutureScore) const; }; diff --git a/moses/FF/PhrasePairFeature.cpp b/moses/FF/PhrasePairFeature.cpp index 58f71271f..9fce7ff4e 100644 --- a/moses/FF/PhrasePairFeature.cpp +++ b/moses/FF/PhrasePairFeature.cpp @@ -9,10 +9,11 @@ using namespace std; -namespace Moses { +namespace Moses +{ PhrasePairFeature::PhrasePairFeature(const std::string &line) -:StatelessFeatureFunction("PhrasePairFeature", 0, line) + :StatelessFeatureFunction("PhrasePairFeature", 0, line) { std::cerr << "Initializing PhrasePairFeature.." 
<< std::endl; @@ -44,47 +45,44 @@ PhrasePairFeature::PhrasePairFeature(const std::string &line) Load(filePathSource); } -bool PhrasePairFeature::Load(const std::string &filePathSource/*, const std::string &filePathTarget*/) +bool PhrasePairFeature::Load(const std::string &filePathSource/*, const std::string &filePathTarget*/) { if (m_domainTrigger) { // domain trigger terms for each input document ifstream inFileSource(filePathSource.c_str()); - if (!inFileSource) - { - cerr << "could not open file " << filePathSource << endl; - return false; - } - + if (!inFileSource) { + cerr << "could not open file " << filePathSource << endl; + return false; + } + std::string line; while (getline(inFileSource, line)) { std::set terms; vector termVector; boost::split(termVector, line, boost::is_any_of("\t ")); - for (size_t i=0; i < termVector.size(); ++i) + for (size_t i=0; i < termVector.size(); ++i) terms.insert(termVector[i]); - + // add term set for current document m_vocabDomain.push_back(terms); } - + inFileSource.close(); - } - else { + } else { // restricted source word vocabulary ifstream inFileSource(filePathSource.c_str()); - if (!inFileSource) - { - cerr << "could not open file " << filePathSource << endl; - return false; - } - + if (!inFileSource) { + cerr << "could not open file " << filePathSource << endl; + return false; + } + std::string line; while (getline(inFileSource, line)) { m_vocabSource.insert(line); } - + inFileSource.close(); - + /* // restricted target word vocabulary ifstream inFileTarget(filePathTarget.c_str()); if (!inFileTarget) @@ -105,11 +103,11 @@ bool PhrasePairFeature::Load(const std::string &filePathSource/*, const std::str } void PhrasePairFeature::Evaluate( - const PhraseBasedFeatureContext& context, - ScoreComponentCollection* accumulator) const + const PhraseBasedFeatureContext& context, + ScoreComponentCollection* accumulator) const { const TargetPhrase& target = context.GetTargetPhrase(); - const Phrase& source = *(context.GetTranslationOption().GetSourcePhrase()); + const Phrase& source = *(context.GetTranslationOption().GetSourcePhrase()); if (m_simple) { ostringstream namestr; namestr << "pp_"; @@ -126,11 +124,11 @@ void PhrasePairFeature::Evaluate( namestr << ","; namestr << targetFactor->GetString(); } - + accumulator->SparsePlusEquals(namestr.str(),1); } if (m_domainTrigger) { - const Sentence& input = static_cast(context.GetSource()); + const Sentence& input = static_cast(context.GetSource()); const bool use_topicid = input.GetUseTopicId(); const bool use_topicid_prob = input.GetUseTopicIdAndProb(); @@ -149,95 +147,92 @@ void PhrasePairFeature::Evaluate( pair << ","; pair << targetFactor->GetString(); } - + if (use_topicid || use_topicid_prob) { if(use_topicid) { - // use topicid as trigger - const long topicid = input.GetTopicId(); - stringstream feature; - feature << "pp_"; - if (topicid == -1) - feature << "unk"; - else - feature << topicid; - - feature << "_"; - feature << pair.str(); - accumulator->SparsePlusEquals(feature.str(), 1); + // use topicid as trigger + const long topicid = input.GetTopicId(); + stringstream feature; + feature << "pp_"; + if (topicid == -1) + feature << "unk"; + else + feature << topicid; + + feature << "_"; + feature << pair.str(); + accumulator->SparsePlusEquals(feature.str(), 1); + } else { + // use topic probabilities + const vector &topicid_prob = *(input.GetTopicIdAndProb()); + if (atol(topicid_prob[0].c_str()) == -1) { + stringstream feature; + feature << "pp_unk_"; + feature << pair.str(); + 
accumulator->SparsePlusEquals(feature.str(), 1); + } else { + for (size_t i=0; i+1 < topicid_prob.size(); i+=2) { + stringstream feature; + feature << "pp_"; + feature << topicid_prob[i]; + feature << "_"; + feature << pair.str(); + accumulator->SparsePlusEquals(feature.str(), atof((topicid_prob[i+1]).c_str())); + } + } } - else { - // use topic probabilities - const vector &topicid_prob = *(input.GetTopicIdAndProb()); - if (atol(topicid_prob[0].c_str()) == -1) { - stringstream feature; - feature << "pp_unk_"; - feature << pair.str(); - accumulator->SparsePlusEquals(feature.str(), 1); - } - else { - for (size_t i=0; i+1 < topicid_prob.size(); i+=2) { - stringstream feature; - feature << "pp_"; - feature << topicid_prob[i]; - feature << "_"; - feature << pair.str(); - accumulator->SparsePlusEquals(feature.str(), atof((topicid_prob[i+1]).c_str())); - } - } - } - } - else { + } else { // range over domain trigger words const long docid = input.GetDocumentId(); for (set::const_iterator p = m_vocabDomain[docid].begin(); p != m_vocabDomain[docid].end(); ++p) { - string sourceTrigger = *p; - ostringstream namestr; - namestr << "pp_"; - namestr << sourceTrigger; - namestr << "_"; - namestr << pair.str(); - accumulator->SparsePlusEquals(namestr.str(),1); + string sourceTrigger = *p; + ostringstream namestr; + namestr << "pp_"; + namestr << sourceTrigger; + namestr << "_"; + namestr << pair.str(); + accumulator->SparsePlusEquals(namestr.str(),1); } } } if (m_sourceContext) { const Sentence& input = static_cast(context.GetSource()); - + // range over source words to get context for(size_t contextIndex = 0; contextIndex < input.GetSize(); contextIndex++ ) { StringPiece sourceTrigger = input.GetWord(contextIndex).GetFactor(m_sourceFactorId)->GetString(); if (m_ignorePunctuation) { - // check if trigger is punctuation - char firstChar = sourceTrigger[0]; - CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar ); - if(charIterator != m_punctuationHash.end()) - continue; + // check if trigger is punctuation + char firstChar = sourceTrigger[0]; + CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar ); + if(charIterator != m_punctuationHash.end()) + continue; } - + bool sourceTriggerExists = false; if (!m_unrestricted) - sourceTriggerExists = FindStringPiece(m_vocabSource, sourceTrigger ) != m_vocabSource.end(); - + sourceTriggerExists = FindStringPiece(m_vocabSource, sourceTrigger ) != m_vocabSource.end(); + if (m_unrestricted || sourceTriggerExists) { - ostringstream namestr; - namestr << "pp_"; - namestr << sourceTrigger; - namestr << "~"; - namestr << source.GetWord(0).GetFactor(m_sourceFactorId)->GetString(); - for (size_t i = 1; i < source.GetSize(); ++i) { - const Factor* sourceFactor = source.GetWord(i).GetFactor(m_sourceFactorId); - namestr << ","; - namestr << sourceFactor->GetString(); - } - namestr << "~"; - namestr << target.GetWord(0).GetFactor(m_targetFactorId)->GetString(); - for (size_t i = 1; i < target.GetSize(); ++i) { - const Factor* targetFactor = target.GetWord(i).GetFactor(m_targetFactorId); - namestr << ","; - namestr << targetFactor->GetString(); - } - - accumulator->SparsePlusEquals(namestr.str(),1); + ostringstream namestr; + namestr << "pp_"; + namestr << sourceTrigger; + namestr << "~"; + namestr << source.GetWord(0).GetFactor(m_sourceFactorId)->GetString(); + for (size_t i = 1; i < source.GetSize(); ++i) { + const Factor* sourceFactor = source.GetWord(i).GetFactor(m_sourceFactorId); + namestr << ","; + namestr << 
sourceFactor->GetString(); + } + namestr << "~"; + namestr << target.GetWord(0).GetFactor(m_targetFactorId)->GetString(); + for (size_t i = 1; i < target.GetSize(); ++i) { + const Factor* targetFactor = target.GetWord(i).GetFactor(m_targetFactorId); + namestr << ","; + namestr << targetFactor->GetString(); + } + + accumulator->SparsePlusEquals(namestr.str(),1); } } } diff --git a/moses/FF/PhrasePairFeature.h b/moses/FF/PhrasePairFeature.h index e895110f8..d7aa80be7 100644 --- a/moses/FF/PhrasePairFeature.h +++ b/moses/FF/PhrasePairFeature.h @@ -8,39 +8,41 @@ #include "moses/Factor.h" #include "moses/Sentence.h" -namespace Moses { +namespace Moses +{ /** * Phrase pair feature: complete source/target phrase pair **/ -class PhrasePairFeature: public StatelessFeatureFunction { - - typedef std::map< char, short > CharHash; - typedef std::vector< std::set > DocumentVector; - - boost::unordered_set m_vocabSource; - DocumentVector m_vocabDomain; - FactorType m_sourceFactorId; - FactorType m_targetFactorId; - bool m_unrestricted; - bool m_simple; - bool m_sourceContext; - bool m_domainTrigger; - bool m_ignorePunctuation; - CharHash m_punctuationHash; - - public: - PhrasePairFeature(const std::string &line); +class PhrasePairFeature: public StatelessFeatureFunction +{ - void Evaluate(const PhraseBasedFeatureContext& context, - ScoreComponentCollection* accumulator) const; - - void EvaluateChart(const ChartBasedFeatureContext& context, - ScoreComponentCollection*) const { - throw std::logic_error("PhrasePairFeature not valid in chart decoder"); - } + typedef std::map< char, short > CharHash; + typedef std::vector< std::set > DocumentVector; - bool Load(const std::string &filePathSource/*, const std::string &filePathTarget*/); + boost::unordered_set m_vocabSource; + DocumentVector m_vocabDomain; + FactorType m_sourceFactorId; + FactorType m_targetFactorId; + bool m_unrestricted; + bool m_simple; + bool m_sourceContext; + bool m_domainTrigger; + bool m_ignorePunctuation; + CharHash m_punctuationHash; + +public: + PhrasePairFeature(const std::string &line); + + void Evaluate(const PhraseBasedFeatureContext& context, + ScoreComponentCollection* accumulator) const; + + void EvaluateChart(const ChartBasedFeatureContext& context, + ScoreComponentCollection*) const { + throw std::logic_error("PhrasePairFeature not valid in chart decoder"); + } + + bool Load(const std::string &filePathSource/*, const std::string &filePathTarget*/); }; diff --git a/moses/FF/SourceWordDeletionFeature.cpp b/moses/FF/SourceWordDeletionFeature.cpp index 085dbbeea..693812105 100644 --- a/moses/FF/SourceWordDeletionFeature.cpp +++ b/moses/FF/SourceWordDeletionFeature.cpp @@ -11,13 +11,14 @@ #include "util/string_piece_hash.hh" -namespace Moses { +namespace Moses +{ using namespace std; SourceWordDeletionFeature::SourceWordDeletionFeature(const std::string &line) -:StatelessFeatureFunction("SourceWordDeletionFeature", 0, line), -m_unrestricted(true) + :StatelessFeatureFunction("SourceWordDeletionFeature", 0, line), + m_unrestricted(true) { std::cerr << "Initializing source word deletion feature.." 
<< std::endl; @@ -27,11 +28,9 @@ m_unrestricted(true) if (args[0] == "factor") { m_factorType = Scan<FactorType>(args[1]); - } - else if (args[0] == "path") { + } else if (args[0] == "path") { filename = args[1]; - } - else { + } else { throw "Unknown argument " + args[0]; } } @@ -40,19 +39,18 @@ m_unrestricted(true) if (filename != "") { cerr << "loading source word deletion word list from " << filename << endl; if (!Load(filename)) { - UserMessage::Add("Unable to load word list for source word deletion feature from file " + filename); - //return false; + UserMessage::Add("Unable to load word list for source word deletion feature from file " + filename); + //return false; } } } -bool SourceWordDeletionFeature::Load(const std::string &filePath) +bool SourceWordDeletionFeature::Load(const std::string &filePath) { ifstream inFile(filePath.c_str()); - if (!inFile) - { - cerr << "could not open file " << filePath << endl; - return false; + if (!inFile) { + cerr << "could not open file " << filePath << endl; + return false; } std::string line; @@ -67,23 +65,23 @@ bool SourceWordDeletionFeature::Load(const std::string &filePath) } void SourceWordDeletionFeature::Evaluate(const Phrase &source - , const TargetPhrase &targetPhrase - , ScoreComponentCollection &scoreBreakdown - , ScoreComponentCollection &estimatedFutureScore) const + , const TargetPhrase &targetPhrase + , ScoreComponentCollection &scoreBreakdown + , ScoreComponentCollection &estimatedFutureScore) const { const AlignmentInfo &alignmentInfo = targetPhrase.GetAlignTerm(); ComputeFeatures(source, targetPhrase, &scoreBreakdown, alignmentInfo); } void SourceWordDeletionFeature::ComputeFeatures(const Phrase &source, - const TargetPhrase& targetPhrase, - ScoreComponentCollection* accumulator, - const AlignmentInfo &alignmentInfo) const + const TargetPhrase& targetPhrase, + ScoreComponentCollection* accumulator, + const AlignmentInfo &alignmentInfo) const { // handle special case: unknown words (they have no word alignment) - size_t targetLength = targetPhrase.GetSize(); - size_t sourceLength = source.GetSize(); - if (targetLength == 1 && sourceLength == 1 && !alignmentInfo.GetSize()) return; + size_t targetLength = targetPhrase.GetSize(); + size_t sourceLength = source.GetSize(); + if (targetLength == 1 && sourceLength == 1 && !alignmentInfo.GetSize()) return; // flag aligned words bool aligned[16]; @@ -92,22 +90,21 @@ void SourceWordDeletionFeature::ComputeFeatures(const Phrase &source, aligned[i] = false; for (AlignmentInfo::const_iterator alignmentPoint = alignmentInfo.begin(); alignmentPoint != alignmentInfo.end(); alignmentPoint++) aligned[ alignmentPoint->first ] = true; - + // process unaligned source words for(size_t i=0; i<sourceLength; i++) { if (!aligned[i]) { - const Word &w = source.GetWord(i); - if (!w.IsNonTerminal()) { - const StringPiece word = w.GetFactor(m_factorType)->GetString(); - if (word != "<s>" && word != "</s>") { - if (!m_unrestricted && FindStringPiece(m_vocab, word ) == m_vocab.end()) { - accumulator->PlusEquals(this, StringPiece("OTHER"),1); - } - else { - accumulator->PlusEquals(this,word,1); - } - } - } + const Word &w = source.GetWord(i); + if (!w.IsNonTerminal()) { + const StringPiece word = w.GetFactor(m_factorType)->GetString(); + if (word != "<s>" && word != "</s>") { + if (!m_unrestricted && FindStringPiece(m_vocab, word ) == m_vocab.end()) { + accumulator->PlusEquals(this, StringPiece("OTHER"),1); + } else { + accumulator->PlusEquals(this,word,1); + } + } + } } } } diff --git a/moses/FF/SourceWordDeletionFeature.h b/moses/FF/SourceWordDeletionFeature.h index 1bf6323be..7a25ee6e1 100644 --- a/moses/FF/SourceWordDeletionFeature.h +++ b/moses/FF/SourceWordDeletionFeature.h @@ -13,7 
+13,8 @@ namespace Moses /** Sets the features for source word deletion */ -class SourceWordDeletionFeature : public StatelessFeatureFunction { +class SourceWordDeletionFeature : public StatelessFeatureFunction +{ private: boost::unordered_set m_vocab; FactorType m_factorType; @@ -21,18 +22,18 @@ private: public: SourceWordDeletionFeature(const std::string &line); - + bool Load(const std::string &filePath); virtual void Evaluate(const Phrase &source - , const TargetPhrase &targetPhrase - , ScoreComponentCollection &scoreBreakdown - , ScoreComponentCollection &estimatedFutureScore) const; + , const TargetPhrase &targetPhrase + , ScoreComponentCollection &scoreBreakdown + , ScoreComponentCollection &estimatedFutureScore) const; void ComputeFeatures(const Phrase &source, - const TargetPhrase& targetPhrase, - ScoreComponentCollection* accumulator, - const AlignmentInfo &alignmentInfo) const; + const TargetPhrase& targetPhrase, + ScoreComponentCollection* accumulator, + const AlignmentInfo &alignmentInfo) const; }; } diff --git a/moses/FF/StatefulFeatureFunction.cpp b/moses/FF/StatefulFeatureFunction.cpp index a97846311..0aeeed62c 100644 --- a/moses/FF/StatefulFeatureFunction.cpp +++ b/moses/FF/StatefulFeatureFunction.cpp @@ -4,13 +4,13 @@ namespace Moses { StatefulFeatureFunction::StatefulFeatureFunction(const std::string& description, const std::string &line) -: FeatureFunction(description, line) + : FeatureFunction(description, line) { m_statefulFFs.push_back(this); } StatefulFeatureFunction::StatefulFeatureFunction(const std::string& description, size_t numScoreComponents, const std::string &line) -: FeatureFunction(description,numScoreComponents, line) + : FeatureFunction(description,numScoreComponents, line) { m_statefulFFs.push_back(this); } diff --git a/moses/FF/StatefulFeatureFunction.h b/moses/FF/StatefulFeatureFunction.h index d2721d4ae..fc5cd4faf 100644 --- a/moses/FF/StatefulFeatureFunction.h +++ b/moses/FF/StatefulFeatureFunction.h @@ -6,7 +6,7 @@ namespace Moses { /** base class for all stateful feature functions. - * eg. LM, distortion penalty + * eg. LM, distortion penalty */ class StatefulFeatureFunction: public FeatureFunction { @@ -14,7 +14,9 @@ class StatefulFeatureFunction: public FeatureFunction static std::vector m_statefulFFs; public: - static const std::vector& GetStatefulFeatureFunctions() {return m_statefulFFs;} + static const std::vector& GetStatefulFeatureFunctions() { + return m_statefulFFs; + } StatefulFeatureFunction(const std::string& description, const std::string &line); StatefulFeatureFunction(const std::string& description, size_t numScoreComponents, const std::string &line); @@ -39,8 +41,9 @@ public: //! 
return the state associated with the empty hypothesis for a given sentence virtual const FFState* EmptyHypothesisState(const InputType &input) const = 0; - bool IsStateless() const - { return false; } + bool IsStateless() const { + return false; + } }; diff --git a/moses/FF/StatelessFeatureFunction.cpp b/moses/FF/StatelessFeatureFunction.cpp index 1c5e604de..278a90c54 100644 --- a/moses/FF/StatelessFeatureFunction.cpp +++ b/moses/FF/StatelessFeatureFunction.cpp @@ -4,13 +4,13 @@ namespace Moses { StatelessFeatureFunction::StatelessFeatureFunction(const std::string& description, const std::string &line) -:FeatureFunction(description, line) + :FeatureFunction(description, line) { m_statelessFFs.push_back(this); } StatelessFeatureFunction::StatelessFeatureFunction(const std::string& description, size_t numScoreComponents, const std::string &line) -:FeatureFunction(description, numScoreComponents, line) + :FeatureFunction(description, numScoreComponents, line) { m_statelessFFs.push_back(this); } diff --git a/moses/FF/StatelessFeatureFunction.h b/moses/FF/StatelessFeatureFunction.h index d8db7f514..3f120a1de 100644 --- a/moses/FF/StatelessFeatureFunction.h +++ b/moses/FF/StatelessFeatureFunction.h @@ -14,7 +14,9 @@ class StatelessFeatureFunction: public FeatureFunction static std::vector<const StatelessFeatureFunction*> m_statelessFFs; public: - static const std::vector<const StatelessFeatureFunction*>& GetStatelessFeatureFunctions() {return m_statelessFFs;} + static const std::vector<const StatelessFeatureFunction*>& GetStatelessFeatureFunctions() { + return m_statelessFFs; + } StatelessFeatureFunction(const std::string& description, const std::string &line); StatelessFeatureFunction(const std::string& description, size_t numScoreComponents, const std::string &line); @@ -22,7 +24,7 @@ public: * This should be implemented for features that apply to phrase-based models. **/ virtual void Evaluate(const PhraseBasedFeatureContext& context, - ScoreComponentCollection* accumulator) const + ScoreComponentCollection* accumulator) const {} /** @@ -32,8 +34,9 @@ public: ScoreComponentCollection* accumulator) const {} - virtual bool IsStateless() const - { return true; } + virtual bool IsStateless() const { + return true; + } }; diff --git a/moses/FF/TargetBigramFeature.cpp b/moses/FF/TargetBigramFeature.cpp index 441cf9e15..fc30a737f 100644 --- a/moses/FF/TargetBigramFeature.cpp +++ b/moses/FF/TargetBigramFeature.cpp @@ -7,15 +7,17 @@ using namespace std; -namespace Moses { +namespace Moses +{ -int TargetBigramState::Compare(const FFState& other) const { +int TargetBigramState::Compare(const FFState& other) const +{ const TargetBigramState& rhs = dynamic_cast<const TargetBigramState&>(other); return Word::Compare(m_word,rhs.m_word); } TargetBigramFeature::TargetBigramFeature(const std::string &line) -:StatefulFeatureFunction("TargetBigramFeature", 0, line) + :StatefulFeatureFunction("TargetBigramFeature", 0, line) { std::cerr << "Initializing target bigram feature.."
<< std::endl; @@ -27,7 +29,7 @@ TargetBigramFeature::TargetBigramFeature(const std::string &line) FactorCollection& factorCollection = FactorCollection::Instance(); const Factor* bosFactor = - factorCollection.AddFactor(Output,m_factorType,BOS_); + factorCollection.AddFactor(Output,m_factorType,BOS_); m_bos.SetFactor(m_factorType,bosFactor); const string &filePath = tokens[2]; @@ -35,13 +37,12 @@ TargetBigramFeature::TargetBigramFeature(const std::string &line) } -bool TargetBigramFeature::Load(const std::string &filePath) +bool TargetBigramFeature::Load(const std::string &filePath) { if (filePath == "*") return true; //allow all ifstream inFile(filePath.c_str()); - if (!inFile) - { - return false; + if (!inFile) { + return false; } std::string line; @@ -87,7 +88,7 @@ FFState* TargetBigramFeature::Evaluate(const Hypothesis& cur_hypo, const StringPiece w2 = f2->GetString(); // skip bigrams if they don't belong to a given restricted vocabulary - if (m_vocab.size() && + if (m_vocab.size() && (FindStringPiece(m_vocab, w1) == m_vocab.end() || FindStringPiece(m_vocab, w2) == m_vocab.end())) { continue; } diff --git a/moses/FF/TargetBigramFeature.h b/moses/FF/TargetBigramFeature.h index f514f2405..e29eace14 100644 --- a/moses/FF/TargetBigramFeature.h +++ b/moses/FF/TargetBigramFeature.h @@ -13,35 +13,38 @@ namespace Moses { -class TargetBigramState : public FFState { - public: - TargetBigramState(const Word& word): m_word(word) {} - const Word& GetWord() const {return m_word;} - virtual int Compare(const FFState& other) const; +class TargetBigramState : public FFState +{ +public: + TargetBigramState(const Word& word): m_word(word) {} + const Word& GetWord() const { + return m_word; + } + virtual int Compare(const FFState& other) const; - private: - Word m_word; +private: + Word m_word; }; /** Sets the features of observed bigrams. 
*/ -class TargetBigramFeature : public StatefulFeatureFunction { +class TargetBigramFeature : public StatefulFeatureFunction +{ public: - TargetBigramFeature(const std::string &line); + TargetBigramFeature(const std::string &line); - bool Load(const std::string &filePath); + bool Load(const std::string &filePath); - virtual const FFState* EmptyHypothesisState(const InputType &input) const; + virtual const FFState* EmptyHypothesisState(const InputType &input) const; - virtual FFState* Evaluate(const Hypothesis& cur_hypo, const FFState* prev_state, - ScoreComponentCollection* accumulator) const; + virtual FFState* Evaluate(const Hypothesis& cur_hypo, const FFState* prev_state, + ScoreComponentCollection* accumulator) const; virtual FFState* EvaluateChart( const ChartHypothesis& /* cur_hypo */, int /* featureID */, - ScoreComponentCollection* ) const - { - abort(); - } + ScoreComponentCollection* ) const { + abort(); + } private: FactorType m_factorType; diff --git a/moses/FF/TargetNgramFeature.cpp b/moses/FF/TargetNgramFeature.cpp index 174fcfa1a..3c36aef0e 100644 --- a/moses/FF/TargetNgramFeature.cpp +++ b/moses/FF/TargetNgramFeature.cpp @@ -7,38 +7,38 @@ #include "util/string_piece_hash.hh" -namespace Moses { +namespace Moses +{ using namespace std; -int TargetNgramState::Compare(const FFState& other) const { +int TargetNgramState::Compare(const FFState& other) const +{ const TargetNgramState& rhs = dynamic_cast<const TargetNgramState&>(other); int result; if (m_words.size() == rhs.m_words.size()) { - for (size_t i = 0; i < m_words.size(); ++i) { - result = Word::Compare(m_words[i],rhs.m_words[i]); - if (result != 0) return result; - } + for (size_t i = 0; i < m_words.size(); ++i) { + result = Word::Compare(m_words[i],rhs.m_words[i]); + if (result != 0) return result; + } return 0; - } - else if (m_words.size() < rhs.m_words.size()) { - for (size_t i = 0; i < m_words.size(); ++i) { - result = Word::Compare(m_words[i],rhs.m_words[i]); - if (result != 0) return result; - } - return -1; - } - else { - for (size_t i = 0; i < rhs.m_words.size(); ++i) { - result = Word::Compare(m_words[i],rhs.m_words[i]); - if (result != 0) return result; - } - return 1; + } else if (m_words.size() < rhs.m_words.size()) { + for (size_t i = 0; i < m_words.size(); ++i) { + result = Word::Compare(m_words[i],rhs.m_words[i]); + if (result != 0) return result; + } + return -1; + } else { + for (size_t i = 0; i < rhs.m_words.size(); ++i) { + result = Word::Compare(m_words[i],rhs.m_words[i]); + if (result != 0) return result; + } + return 1; } } TargetNgramFeature::TargetNgramFeature(const std::string &line) -:StatefulFeatureFunction("TargetNgramFeature", 0, line) + :StatefulFeatureFunction("TargetNgramFeature", 0, line) { std::cerr << "Initializing target ngram feature.."
<< std::endl; @@ -56,9 +56,8 @@ bool TargetNgramFeature::Load(const std::string &filePath) { if (filePath == "*") return true; //allow all ifstream inFile(filePath.c_str()); - if (!inFile) - { - return false; + if (!inFile) { + return false; } std::string line; @@ -74,13 +73,13 @@ bool TargetNgramFeature::Load(const std::string &filePath) const FFState* TargetNgramFeature::EmptyHypothesisState(const InputType &/*input*/) const { - vector bos(1,m_bos); + vector bos(1,m_bos); return new TargetNgramState(bos); } FFState* TargetNgramFeature::Evaluate(const Hypothesis& cur_hypo, - const FFState* prev_state, - ScoreComponentCollection* accumulator) const + const FFState* prev_state, + ScoreComponentCollection* accumulator) const { const TargetNgramState* tnState = static_cast(prev_state); assert(tnState); @@ -99,92 +98,92 @@ FFState* TargetNgramFeature::Evaluate(const Hypothesis& cur_hypo, if (m_lower_ngrams) smallest_n = 1; for (size_t n = m_n; n >= smallest_n; --n) { // iterate over ngram size - for (size_t i = 0; i < targetPhrase.GetSize(); ++i) { + for (size_t i = 0; i < targetPhrase.GetSize(); ++i) { // const string& curr_w = targetPhrase.GetWord(i).GetFactor(m_factorType)->GetString(); - const StringPiece& curr_w = targetPhrase.GetWord(i).GetString(m_factorType); + const StringPiece& curr_w = targetPhrase.GetWord(i).GetString(m_factorType); - if (m_vocab.size() && (FindStringPiece(m_vocab, curr_w) == m_vocab.end())) continue; // skip ngrams + if (m_vocab.size() && (FindStringPiece(m_vocab, curr_w) == m_vocab.end())) continue; // skip ngrams - if (n > 1) { - // can we build an ngram at this position? (" this" --> cannot build 3gram at this position) - size_t pos_in_translation = cur_hypo.GetSize() - targetPhrase.GetSize() + i; - if (pos_in_translation < n - 2) continue; // need at least m_n - 1 words + if (n > 1) { + // can we build an ngram at this position? (" this" --> cannot build 3gram at this position) + size_t pos_in_translation = cur_hypo.GetSize() - targetPhrase.GetSize() + i; + if (pos_in_translation < n - 2) continue; // need at least m_n - 1 words - // how many words needed from previous state? - int from_prev_state = n - (i+1); - skip = false; - if (from_prev_state > 0) { - if (prev_words.size() < from_prev_state) { - // context is too short, make new state from previous state and target phrase - vector new_prev_words; - for (size_t i = 0; i < prev_words.size(); ++i) - new_prev_words.push_back(prev_words[i]); - for (size_t i = 0; i < targetPhrase.GetSize(); ++i) - new_prev_words.push_back(targetPhrase.GetWord(i)); - return new TargetNgramState(new_prev_words); - } + // how many words needed from previous state? 
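
The arithmetic just below (from_prev_state = n - (i+1)) decides how many words must be borrowed from the previous hypothesis' stored context to complete an n-gram ending at position i of the current target phrase. A minimal standalone sketch of that stitching logic, using hypothetical names (StitchNgram, prevContext) rather than the Moses API:

#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Builds the n-gram ending at position `i` of `phrase`, borrowing
// n - (i+1) words from the previous hypothesis' context when the
// current phrase alone is too short. (Sketch, not the Moses code.)
std::string StitchNgram(const std::vector<std::string>& prevContext,
                        const std::vector<std::string>& phrase,
                        size_t i, size_t n)
{
  std::ostringstream ngram;
  int fromPrev = static_cast<int>(n) - static_cast<int>(i + 1);
  if (fromPrev > 0) {
    if (static_cast<size_t>(fromPrev) > prevContext.size())
      return "";  // context too short; Moses instead returns a new state early
    // take the last `fromPrev` words of the previous state
    for (size_t j = prevContext.size() - fromPrev; j < prevContext.size(); ++j)
      ngram << prevContext[j] << ":";
  }
  size_t start = (i + 1 >= n) ? i + 1 - n : 0;  // first phrase-internal word
  for (size_t j = start; j <= i; ++j)
    ngram << phrase[j] << (j < i ? ":" : "");
  return ngram.str();
}

int main()
{
  std::vector<std::string> prev = {"<s>", "he"};
  std::vector<std::string> phrase = {"goes", "home"};
  // a trigram ending at phrase position 0 needs two words from the state
  std::cout << StitchNgram(prev, phrase, 0, 3) << std::endl;  // <s>:he:goes
}

The ":"-separated form mirrors appendNgram further down, which emits each context word followed by a colon.
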
+ int from_prev_state = n - (i+1); + skip = false; + if (from_prev_state > 0) { + if (prev_words.size() < from_prev_state) { + // context is too short, make new state from previous state and target phrase + vector new_prev_words; + for (size_t i = 0; i < prev_words.size(); ++i) + new_prev_words.push_back(prev_words[i]); + for (size_t i = 0; i < targetPhrase.GetSize(); ++i) + new_prev_words.push_back(targetPhrase.GetWord(i)); + return new TargetNgramState(new_prev_words); + } - // add words from previous state - for (size_t j = prev_words.size()-from_prev_state; j < prev_words.size() && !skip; ++j) - appendNgram(prev_words[j], skip, curr_ngram); + // add words from previous state + for (size_t j = prev_words.size()-from_prev_state; j < prev_words.size() && !skip; ++j) + appendNgram(prev_words[j], skip, curr_ngram); } - // add words from current target phrase - int start = i - n + 1; // add m_n-1 previous words - if (start < 0) start = 0; // or less - for (size_t j = start; j < i && !skip; ++j) - appendNgram(targetPhrase.GetWord(j), skip, curr_ngram); + // add words from current target phrase + int start = i - n + 1; // add m_n-1 previous words + if (start < 0) start = 0; // or less + for (size_t j = start; j < i && !skip; ++j) + appendNgram(targetPhrase.GetWord(j), skip, curr_ngram); } - if (!skip) { - curr_ngram << curr_w; - accumulator->PlusEquals(this,curr_ngram.str(),1); + if (!skip) { + curr_ngram << curr_w; + accumulator->PlusEquals(this,curr_ngram.str(),1); } - curr_ngram.str(""); - } + curr_ngram.str(""); + } } if (cur_hypo.GetWordsBitmap().IsComplete()) { - for (size_t n = m_n; n >= smallest_n; --n) { - stringstream last_ngram; - skip = false; - for (size_t i = cur_hypo.GetSize() - n + 1; i < cur_hypo.GetSize() && !skip; ++i) - appendNgram(cur_hypo.GetWord(i), skip, last_ngram); + for (size_t n = m_n; n >= smallest_n; --n) { + stringstream last_ngram; + skip = false; + for (size_t i = cur_hypo.GetSize() - n + 1; i < cur_hypo.GetSize() && !skip; ++i) + appendNgram(cur_hypo.GetWord(i), skip, last_ngram); - if (n > 1 && !skip) { - last_ngram << EOS_; - accumulator->PlusEquals(this, last_ngram.str(), 1); - } - } - return NULL; + if (n > 1 && !skip) { + last_ngram << EOS_; + accumulator->PlusEquals(this, last_ngram.str(), 1); + } + } + return NULL; } // prepare new state vector new_prev_words; if (targetPhrase.GetSize() >= m_n-1) { - // take subset of target words - for (size_t i = targetPhrase.GetSize() - m_n + 1; i < targetPhrase.GetSize(); ++i) - new_prev_words.push_back(targetPhrase.GetWord(i)); - } - else { - // take words from previous state and from target phrase - int from_prev_state = m_n - 1 - targetPhrase.GetSize(); - for (size_t i = prev_words.size()-from_prev_state; i < prev_words.size(); ++i) - new_prev_words.push_back(prev_words[i]); - for (size_t i = 0; i < targetPhrase.GetSize(); ++i) - new_prev_words.push_back(targetPhrase.GetWord(i)); + // take subset of target words + for (size_t i = targetPhrase.GetSize() - m_n + 1; i < targetPhrase.GetSize(); ++i) + new_prev_words.push_back(targetPhrase.GetWord(i)); + } else { + // take words from previous state and from target phrase + int from_prev_state = m_n - 1 - targetPhrase.GetSize(); + for (size_t i = prev_words.size()-from_prev_state; i < prev_words.size(); ++i) + new_prev_words.push_back(prev_words[i]); + for (size_t i = 0; i < targetPhrase.GetSize(); ++i) + new_prev_words.push_back(targetPhrase.GetWord(i)); } return new TargetNgramState(new_prev_words); } -void TargetNgramFeature::appendNgram(const Word& word, bool& 
skip, stringstream &ngram) const { +void TargetNgramFeature::appendNgram(const Word& word, bool& skip, stringstream &ngram) const +{ // const string& w = word.GetFactor(m_factorType)->GetString(); - const StringPiece& w = word.GetString(m_factorType); - if (m_vocab.size() && (FindStringPiece(m_vocab, w) == m_vocab.end())) skip = true; - else { - ngram << w; - ngram << ":"; - } + const StringPiece& w = word.GetString(m_factorType); + if (m_vocab.size() && (FindStringPiece(m_vocab, w) == m_vocab.end())) skip = true; + else { + ngram << w; + ngram << ":"; + } } FFState* TargetNgramFeature::EvaluateChart(const ChartHypothesis& cur_hypo, int featureId, ScoreComponentCollection* accumulator) const @@ -205,159 +204,149 @@ FFState* TargetNgramFeature::EvaluateChart(const ChartHypothesis& cur_hypo, int bool onlyTerminals = true; bool prev_is_NT = false; size_t prev_subPhraseLength = 0; - for (size_t phrasePos = 0; phrasePos < cur_hypo.GetCurrTargetPhrase().GetSize(); phrasePos++) - { + for (size_t phrasePos = 0; phrasePos < cur_hypo.GetCurrTargetPhrase().GetSize(); phrasePos++) { // consult rule for either word or non-terminal const Word &word = cur_hypo.GetCurrTargetPhrase().GetWord(phrasePos); // cerr << "word: " << word << endl; // regular word if (!word.IsNonTerminal()) { - contextFactor.push_back(&word); - prev_is_NT = false; + contextFactor.push_back(&word); + prev_is_NT = false; if (phrasePos==0) - makePrefix = true; + makePrefix = true; if (phrasePos==cur_hypo.GetCurrTargetPhrase().GetSize()-1 || prev_is_NT) - makeSuffix = true; - + makeSuffix = true; + // beginning/end of sentence symbol ,? StringPiece factorZero = word.GetString(0); if (factorZero.compare("") == 0) - prefixTerminals++; + prefixTerminals++; // end of sentence symbol ? else if (factorZero.compare("") == 0) - suffixTerminals++; + suffixTerminals++; // everything else else { - stringstream ngram; - ngram << m_baseName; - if (m_factorType == 0) - ngram << factorZero; - else - ngram << word.GetString(m_factorType); - accumulator->SparsePlusEquals(ngram.str(), 1); + stringstream ngram; + ngram << m_baseName; + if (m_factorType == 0) + ngram << factorZero; + else + ngram << word.GetString(m_factorType); + accumulator->SparsePlusEquals(ngram.str(), 1); - if (collectForPrefix) - prefixTerminals++; - else - suffixTerminals++; + if (collectForPrefix) + prefixTerminals++; + else + suffixTerminals++; } } // non-terminal, add phrase from underlying hypothesis - else if (m_n > 1) - { + else if (m_n > 1) { // look up underlying hypothesis size_t nonTermIndex = nonTermIndexMap[phrasePos]; const ChartHypothesis *prevHypo = cur_hypo.GetPrevHypo(nonTermIndex); const TargetNgramChartState* prevState = - static_cast(prevHypo->GetFFState(featureId)); + static_cast(prevHypo->GetFFState(featureId)); size_t subPhraseLength = prevState->GetNumTargetTerminals(); // special case: rule starts with non-terminal if (phrasePos == 0) { - if (subPhraseLength == 1) { - makePrefix = true; - ++prefixTerminals; + if (subPhraseLength == 1) { + makePrefix = true; + ++prefixTerminals; - const Word &word = prevState->GetSuffix().GetWord(0); + const Word &word = prevState->GetSuffix().GetWord(0); // cerr << "NT0 --> : " << word << endl; - contextFactor.push_back(&word); - } - else { - onlyTerminals = false; - collectForPrefix = false; - int suffixPos = prevState->GetSuffix().GetSize() - (m_n-1); - if (suffixPos < 0) suffixPos = 0; // push all words if less than order - for(;(size_t)suffixPos < prevState->GetSuffix().GetSize(); suffixPos++) - { - const Word &word = 
prevState->GetSuffix().GetWord(suffixPos); + contextFactor.push_back(&word); + } else { + onlyTerminals = false; + collectForPrefix = false; + int suffixPos = prevState->GetSuffix().GetSize() - (m_n-1); + if (suffixPos < 0) suffixPos = 0; // push all words if less than order + for(; (size_t)suffixPos < prevState->GetSuffix().GetSize(); suffixPos++) { + const Word &word = prevState->GetSuffix().GetWord(suffixPos); // cerr << "NT0 --> : " << word << endl; - contextFactor.push_back(&word); - } - } + contextFactor.push_back(&word); + } + } } // internal non-terminal - else - { - // push its prefix - for(size_t prefixPos = 0; prefixPos < m_n-1 - && prefixPos < subPhraseLength; prefixPos++) - { + else { + // push its prefix + for(size_t prefixPos = 0; prefixPos < m_n-1 + && prefixPos < subPhraseLength; prefixPos++) { const Word &word = prevState->GetPrefix().GetWord(prefixPos); // cerr << "NT --> " << word << endl; contextFactor.push_back(&word); } - if (subPhraseLength==1) { - if (collectForPrefix) - ++prefixTerminals; - else - ++suffixTerminals; + if (subPhraseLength==1) { + if (collectForPrefix) + ++prefixTerminals; + else + ++suffixTerminals; - if (phrasePos == cur_hypo.GetCurrTargetPhrase().GetSize()-1) - makeSuffix = true; - } - else { - onlyTerminals = false; - collectForPrefix = true; + if (phrasePos == cur_hypo.GetCurrTargetPhrase().GetSize()-1) + makeSuffix = true; + } else { + onlyTerminals = false; + collectForPrefix = true; - // check if something follows this NT - bool wordFollowing = (phrasePos < cur_hypo.GetCurrTargetPhrase().GetSize() - 1)? true : false; + // check if something follows this NT + bool wordFollowing = (phrasePos < cur_hypo.GetCurrTargetPhrase().GetSize() - 1)? true : false; - // check if we are dealing with a large sub-phrase - if (wordFollowing && subPhraseLength > m_n - 1) - { - // clear up pending ngrams - MakePrefixNgrams(contextFactor, accumulator, prefixTerminals); - contextFactor.clear(); - makePrefix = false; - makeSuffix = true; - collectForPrefix = false; - prefixTerminals = 0; - suffixTerminals = 0; + // check if we are dealing with a large sub-phrase + if (wordFollowing && subPhraseLength > m_n - 1) { + // clear up pending ngrams + MakePrefixNgrams(contextFactor, accumulator, prefixTerminals); + contextFactor.clear(); + makePrefix = false; + makeSuffix = true; + collectForPrefix = false; + prefixTerminals = 0; + suffixTerminals = 0; - // push its suffix - size_t remainingWords = (remainingWords > m_n-1) ? m_n-1 : subPhraseLength - (m_n-1); - for(size_t suffixPos = 0; suffixPos < prevState->GetSuffix().GetSize(); suffixPos++) { - const Word &word = prevState->GetSuffix().GetWord(suffixPos); + // push its suffix + size_t remainingWords = (remainingWords > m_n-1) ? 
m_n-1 : subPhraseLength - (m_n-1); + for(size_t suffixPos = 0; suffixPos < prevState->GetSuffix().GetSize(); suffixPos++) { + const Word &word = prevState->GetSuffix().GetWord(suffixPos); // cerr << "NT --> : " << word << endl; - contextFactor.push_back(&word); - } - } - // subphrase can be used as suffix and as prefix for the next part - else if (wordFollowing && subPhraseLength == m_n - 1) - { - // clear up pending ngrams - MakePrefixNgrams(contextFactor, accumulator, prefixTerminals); - makePrefix = false; - makeSuffix = true; - collectForPrefix = false; - prefixTerminals = 0; - suffixTerminals = 0; - } - else if (prev_is_NT && prev_subPhraseLength > 1 && subPhraseLength > 1) { - // two NTs in a row: make transition - MakePrefixNgrams(contextFactor, accumulator, 1, m_n-2); - MakeSuffixNgrams(contextFactor, accumulator, 1, m_n-2); - makePrefix = false; - makeSuffix = false; - collectForPrefix = false; - prefixTerminals = 0; - suffixTerminals = 0; - - // remove duplicates - stringstream curr_ngram; - curr_ngram << m_baseName; - curr_ngram << (*contextFactor[m_n-2]).GetString(m_factorType); - curr_ngram << ":"; - curr_ngram << (*contextFactor[m_n-1]).GetString(m_factorType); - accumulator->SparseMinusEquals(curr_ngram.str(),1); - } - } + contextFactor.push_back(&word); + } + } + // subphrase can be used as suffix and as prefix for the next part + else if (wordFollowing && subPhraseLength == m_n - 1) { + // clear up pending ngrams + MakePrefixNgrams(contextFactor, accumulator, prefixTerminals); + makePrefix = false; + makeSuffix = true; + collectForPrefix = false; + prefixTerminals = 0; + suffixTerminals = 0; + } else if (prev_is_NT && prev_subPhraseLength > 1 && subPhraseLength > 1) { + // two NTs in a row: make transition + MakePrefixNgrams(contextFactor, accumulator, 1, m_n-2); + MakeSuffixNgrams(contextFactor, accumulator, 1, m_n-2); + makePrefix = false; + makeSuffix = false; + collectForPrefix = false; + prefixTerminals = 0; + suffixTerminals = 0; + + // remove duplicates + stringstream curr_ngram; + curr_ngram << m_baseName; + curr_ngram << (*contextFactor[m_n-2]).GetString(m_factorType); + curr_ngram << ":"; + curr_ngram << (*contextFactor[m_n-1]).GetString(m_factorType); + accumulator->SparseMinusEquals(curr_ngram.str(),1); + } + } } prev_is_NT = true; prev_subPhraseLength = subPhraseLength; @@ -366,25 +355,24 @@ FFState* TargetNgramFeature::EvaluateChart(const ChartHypothesis& cur_hypo, int if (m_n > 1) { if (onlyTerminals) { - MakePrefixNgrams(contextFactor, accumulator, prefixTerminals-1); - } - else { + MakePrefixNgrams(contextFactor, accumulator, prefixTerminals-1); + } else { if (makePrefix) - MakePrefixNgrams(contextFactor, accumulator, prefixTerminals); + MakePrefixNgrams(contextFactor, accumulator, prefixTerminals); if (makeSuffix) - MakeSuffixNgrams(contextFactor, accumulator, suffixTerminals); + MakeSuffixNgrams(contextFactor, accumulator, suffixTerminals); // remove duplicates size_t size = contextFactor.size(); if (makePrefix && makeSuffix && (size <= m_n)) { - stringstream curr_ngram; - curr_ngram << m_baseName; - for (size_t i = 0; i < size; ++i) { - curr_ngram << (*contextFactor[i]).GetString(m_factorType); - if (i < size-1) - curr_ngram << ":"; - } - accumulator->SparseMinusEquals(curr_ngram.str(), 1); + stringstream curr_ngram; + curr_ngram << m_baseName; + for (size_t i = 0; i < size; ++i) { + curr_ngram << (*contextFactor[i]).GetString(m_factorType); + if (i < size-1) + curr_ngram << ":"; + } + accumulator->SparseMinusEquals(curr_ngram.str(), 1); } } } @@ 
-393,22 +381,23 @@ FFState* TargetNgramFeature::EvaluateChart(const ChartHypothesis& cur_hypo, int return new TargetNgramChartState(cur_hypo, featureId, m_n); } -void TargetNgramFeature::MakePrefixNgrams(std::vector &contextFactor, ScoreComponentCollection* accumulator, size_t numberOfStartPos, size_t offset) const { - stringstream ngram; - size_t size = contextFactor.size(); +void TargetNgramFeature::MakePrefixNgrams(std::vector &contextFactor, ScoreComponentCollection* accumulator, size_t numberOfStartPos, size_t offset) const +{ + stringstream ngram; + size_t size = contextFactor.size(); for (size_t k = 0; k < numberOfStartPos; ++k) { size_t max_end = (size < m_n+k+offset)? size: m_n+k+offset; for (size_t end_pos = 1+k+offset; end_pos < max_end; ++end_pos) { ngram << m_baseName; - for (size_t i=k+offset; i <= end_pos; ++i) { - if (i > k+offset) - ngram << ":"; + for (size_t i=k+offset; i <= end_pos; ++i) { + if (i > k+offset) + ngram << ":"; StringPiece factorZero = (*contextFactor[i]).GetString(0); if (m_factorType == 0 || factorZero.compare("") == 0 || factorZero.compare("") == 0) - ngram << factorZero; - else - ngram << (*contextFactor[i]).GetString(m_factorType); - const Word w = *contextFactor[i]; + ngram << factorZero; + else + ngram << (*contextFactor[i]).GetString(m_factorType); + const Word w = *contextFactor[i]; } // cerr << "p-ngram: " << ngram.str() << endl; accumulator->SparsePlusEquals(ngram.str(), 1); @@ -417,21 +406,22 @@ void TargetNgramFeature::MakePrefixNgrams(std::vector &contextFacto } } -void TargetNgramFeature::MakeSuffixNgrams(std::vector &contextFactor, ScoreComponentCollection* accumulator, size_t numberOfEndPos, size_t offset) const { - stringstream ngram; +void TargetNgramFeature::MakeSuffixNgrams(std::vector &contextFactor, ScoreComponentCollection* accumulator, size_t numberOfEndPos, size_t offset) const +{ + stringstream ngram; for (size_t k = 0; k < numberOfEndPos; ++k) { size_t end_pos = contextFactor.size()-1-k-offset; for (int start_pos=end_pos-1; (start_pos >= 0) && (end_pos-start_pos < m_n); --start_pos) { - ngram << m_baseName; - for (size_t j=start_pos; j <= end_pos; ++j){ - StringPiece factorZero = (*contextFactor[j]).GetString(0); - if (m_factorType == 0 || factorZero.compare("") == 0 || factorZero.compare("") == 0) - ngram << factorZero; - else - ngram << (*contextFactor[j]).GetString(m_factorType); - if (j < end_pos) - ngram << ":"; - } + ngram << m_baseName; + for (size_t j=start_pos; j <= end_pos; ++j) { + StringPiece factorZero = (*contextFactor[j]).GetString(0); + if (m_factorType == 0 || factorZero.compare("") == 0 || factorZero.compare("") == 0) + ngram << factorZero; + else + ngram << (*contextFactor[j]).GetString(m_factorType); + if (j < end_pos) + ngram << ":"; + } // cerr << "s-ngram: " << ngram.str() << endl; accumulator->SparsePlusEquals(ngram.str(), 1); ngram.str(""); diff --git a/moses/FF/TargetNgramFeature.h b/moses/FF/TargetNgramFeature.h index b50391d43..8001f2f87 100644 --- a/moses/FF/TargetNgramFeature.h +++ b/moses/FF/TargetNgramFeature.h @@ -16,14 +16,17 @@ namespace Moses { -class TargetNgramState : public FFState { - public: - TargetNgramState(std::vector &words): m_words(words) {} - const std::vector GetWords() const {return m_words;} - virtual int Compare(const FFState& other) const; +class TargetNgramState : public FFState +{ +public: + TargetNgramState(std::vector &words): m_words(words) {} + const std::vector GetWords() const { + return m_words; + } + virtual int Compare(const FFState& other) const; - private: - 
std::vector m_words; +private: + std::vector m_words; }; class TargetNgramChartState : public FFState @@ -39,8 +42,7 @@ private: * \param ret prefix string * \param size maximum size (typically max lm context window) */ - size_t CalcPrefix(const ChartHypothesis &hypo, const int featureId, Phrase &ret, size_t size) const - { + size_t CalcPrefix(const ChartHypothesis &hypo, const int featureId, Phrase &ret, size_t size) const { const TargetPhrase &target = hypo.GetCurrTargetPhrase(); const AlignmentInfo::NonTermIndexMap &nonTermIndexMap = target.GetAlignNonTerm().GetNonTermIndexMap(); @@ -76,9 +78,8 @@ private: * \param ret suffix phrase * \param size maximum size of suffix */ - size_t CalcSuffix(const ChartHypothesis &hypo, int featureId, Phrase &ret, size_t size) const - { - size_t prefixSize = m_contextPrefix.GetSize(); + size_t CalcSuffix(const ChartHypothesis &hypo, int featureId, Phrase &ret, size_t size) const { + size_t prefixSize = m_contextPrefix.GetSize(); assert(prefixSize <= m_numTargetTerminals); // special handling for small hypotheses @@ -98,9 +99,9 @@ private: } // construct suffix analogous to prefix else { - const TargetPhrase targetPhrase = hypo.GetCurrTargetPhrase(); + const TargetPhrase targetPhrase = hypo.GetCurrTargetPhrase(); const AlignmentInfo::NonTermIndexMap &nonTermIndexMap = - targetPhrase.GetAlignTerm().GetNonTermIndexMap(); + targetPhrase.GetAlignTerm().GetNonTermIndexMap(); for (int pos = (int) targetPhrase.GetSize() - 1; pos >= 0 ; --pos) { const Word &word = targetPhrase.GetWord(pos); @@ -108,8 +109,7 @@ private: size_t nonTermInd = nonTermIndexMap[pos]; const ChartHypothesis *prevHypo = hypo.GetPrevHypo(nonTermInd); size = static_cast(prevHypo->GetFFState(featureId))->CalcSuffix(*prevHypo, featureId, ret, size); - } - else { + } else { ret.PrependWord(word); size--; } @@ -124,9 +124,8 @@ private: public: TargetNgramChartState(const ChartHypothesis &hypo, int featureId, size_t order) - :m_contextPrefix(order - 1), - m_contextSuffix(order - 1) - { + :m_contextPrefix(order - 1), + m_contextSuffix(order - 1) { m_numTargetTerminals = hypo.GetCurrTargetPhrase().GetNumTerminals(); const WordsRange range = hypo.GetCurrSourceRange(); m_startPos = range.GetStartPos(); @@ -159,15 +158,13 @@ public: static_cast( o ); // prefix - if (m_startPos > 0) // not for " ..." - { + if (m_startPos > 0) { // not for " ..." int ret = GetPrefix().Compare(other.GetPrefix()); if (ret != 0) return ret; } - if (m_endPos < m_inputSize - 1)// not for "... " - { + if (m_endPos < m_inputSize - 1) { // not for "... " int ret = GetSuffix().Compare(other.GetSuffix()); if (ret != 0) return ret; @@ -178,34 +175,35 @@ public: /** Sets the features of observed ngrams. 
*/ -class TargetNgramFeature : public StatefulFeatureFunction { +class TargetNgramFeature : public StatefulFeatureFunction +{ public: TargetNgramFeature(const std::string &line); - bool Load(const std::string &filePath); + bool Load(const std::string &filePath); - virtual const FFState* EmptyHypothesisState(const InputType &input) const; + virtual const FFState* EmptyHypothesisState(const InputType &input) const; - virtual FFState* Evaluate(const Hypothesis& cur_hypo, const FFState* prev_state, - ScoreComponentCollection* accumulator) const; + virtual FFState* Evaluate(const Hypothesis& cur_hypo, const FFState* prev_state, + ScoreComponentCollection* accumulator) const; virtual FFState* EvaluateChart(const ChartHypothesis& cur_hypo, int featureId, - ScoreComponentCollection* accumulator) const; + ScoreComponentCollection* accumulator) const; private: FactorType m_factorType; Word m_bos; boost::unordered_set m_vocab; - size_t m_n; - bool m_lower_ngrams; + size_t m_n; + bool m_lower_ngrams; - std::string m_baseName; + std::string m_baseName; - void appendNgram(const Word& word, bool& skip, std::stringstream& ngram) const; - void MakePrefixNgrams(std::vector &contextFactor, ScoreComponentCollection* accumulator, - size_t numberOfStartPos = 1, size_t offset = 0) const; - void MakeSuffixNgrams(std::vector &contextFactor, ScoreComponentCollection* accumulator, - size_t numberOfEndPos = 1, size_t offset = 0) const; + void appendNgram(const Word& word, bool& skip, std::stringstream& ngram) const; + void MakePrefixNgrams(std::vector &contextFactor, ScoreComponentCollection* accumulator, + size_t numberOfStartPos = 1, size_t offset = 0) const; + void MakeSuffixNgrams(std::vector &contextFactor, ScoreComponentCollection* accumulator, + size_t numberOfEndPos = 1, size_t offset = 0) const; }; } diff --git a/moses/FF/TargetWordInsertionFeature.cpp b/moses/FF/TargetWordInsertionFeature.cpp index 386e943be..f20a652e4 100644 --- a/moses/FF/TargetWordInsertionFeature.cpp +++ b/moses/FF/TargetWordInsertionFeature.cpp @@ -9,13 +9,14 @@ #include "moses/UserMessage.h" #include "util/string_piece_hash.hh" -namespace Moses { +namespace Moses +{ using namespace std; TargetWordInsertionFeature::TargetWordInsertionFeature(const std::string &line) -:StatelessFeatureFunction("TargetWordInsertionFeature", 0, line), -m_unrestricted(true) + :StatelessFeatureFunction("TargetWordInsertionFeature", 0, line), + m_unrestricted(true) { std::cerr << "Initializing target word insertion feature.." 
<< std::endl; @@ -26,11 +27,9 @@ m_unrestricted(true) if (args[0] == "factor") { m_factorType = Scan(args[1]); - } - else if (args[0] == "path") { + } else if (args[0] == "path") { filename = args[1]; - } - else { + } else { throw "Unknown argument " + args[0]; } } @@ -46,13 +45,12 @@ m_unrestricted(true) } -bool TargetWordInsertionFeature::Load(const std::string &filePath) +bool TargetWordInsertionFeature::Load(const std::string &filePath) { ifstream inFile(filePath.c_str()); - if (!inFile) - { - cerr << "could not open file " << filePath << endl; - return false; + if (!inFile) { + cerr << "could not open file " << filePath << endl; + return false; } std::string line; @@ -67,18 +65,18 @@ bool TargetWordInsertionFeature::Load(const std::string &filePath) } void TargetWordInsertionFeature::Evaluate(const Phrase &source - , const TargetPhrase &targetPhrase - , ScoreComponentCollection &scoreBreakdown - , ScoreComponentCollection &estimatedFutureScore) const + , const TargetPhrase &targetPhrase + , ScoreComponentCollection &scoreBreakdown + , ScoreComponentCollection &estimatedFutureScore) const { const AlignmentInfo &alignmentInfo = targetPhrase.GetAlignTerm(); ComputeFeatures(source, targetPhrase, &scoreBreakdown, alignmentInfo); } void TargetWordInsertionFeature::ComputeFeatures(const Phrase &source, - const TargetPhrase& targetPhrase, - ScoreComponentCollection* accumulator, - const AlignmentInfo &alignmentInfo) const + const TargetPhrase& targetPhrase, + ScoreComponentCollection* accumulator, + const AlignmentInfo &alignmentInfo) const { // handle special case: unknown words (they have no word alignment) size_t targetLength = targetPhrase.GetSize(); @@ -100,15 +98,14 @@ void TargetWordInsertionFeature::ComputeFeatures(const Phrase &source, if (!aligned[i]) { Word w = targetPhrase.GetWord(i); if (!w.IsNonTerminal()) { - const StringPiece word = w.GetFactor(m_factorType)->GetString(); - if (word != "" && word != "") { - if (!m_unrestricted && FindStringPiece(m_vocab, word ) == m_vocab.end()) { - accumulator->PlusEquals(this,StringPiece("OTHER"),1); - } - else { - accumulator->PlusEquals(this,word,1); - } - } + const StringPiece word = w.GetFactor(m_factorType)->GetString(); + if (word != "" && word != "") { + if (!m_unrestricted && FindStringPiece(m_vocab, word ) == m_vocab.end()) { + accumulator->PlusEquals(this,StringPiece("OTHER"),1); + } else { + accumulator->PlusEquals(this,word,1); + } + } } } } diff --git a/moses/FF/TargetWordInsertionFeature.h b/moses/FF/TargetWordInsertionFeature.h index aabc4cffc..50f7e5f88 100644 --- a/moses/FF/TargetWordInsertionFeature.h +++ b/moses/FF/TargetWordInsertionFeature.h @@ -13,7 +13,8 @@ namespace Moses /** Sets the features for length of source phrase, target phrase, both. 
*/ -class TargetWordInsertionFeature : public StatelessFeatureFunction { +class TargetWordInsertionFeature : public StatelessFeatureFunction +{ private: boost::unordered_set m_vocab; FactorType m_factorType; @@ -21,18 +22,18 @@ private: public: TargetWordInsertionFeature(const std::string &line); - + bool Load(const std::string &filePath); virtual void Evaluate(const Phrase &source - , const TargetPhrase &targetPhrase - , ScoreComponentCollection &scoreBreakdown - , ScoreComponentCollection &estimatedFutureScore) const; + , const TargetPhrase &targetPhrase + , ScoreComponentCollection &scoreBreakdown + , ScoreComponentCollection &estimatedFutureScore) const; void ComputeFeatures(const Phrase &source, - const TargetPhrase& targetPhrase, - ScoreComponentCollection* accumulator, - const AlignmentInfo &alignmentInfo) const; + const TargetPhrase& targetPhrase, + ScoreComponentCollection* accumulator, + const AlignmentInfo &alignmentInfo) const; }; diff --git a/moses/FF/UnknownWordPenaltyProducer.h b/moses/FF/UnknownWordPenaltyProducer.h index b60967746..200033cfc 100644 --- a/moses/FF/UnknownWordPenaltyProducer.h +++ b/moses/FF/UnknownWordPenaltyProducer.h @@ -14,10 +14,9 @@ class WordsRange; class UnknownWordPenaltyProducer : public StatelessFeatureFunction { public: - UnknownWordPenaltyProducer(const std::string &line) - : StatelessFeatureFunction("UnknownWordPenalty",1, line) - { - m_tuneable = false; + UnknownWordPenaltyProducer(const std::string &line) + : StatelessFeatureFunction("UnknownWordPenalty",1, line) { + m_tuneable = false; } }; diff --git a/moses/FF/WordPenaltyProducer.cpp b/moses/FF/WordPenaltyProducer.cpp index ba97852e4..1dc425742 100644 --- a/moses/FF/WordPenaltyProducer.cpp +++ b/moses/FF/WordPenaltyProducer.cpp @@ -5,9 +5,9 @@ namespace Moses { void WordPenaltyProducer::Evaluate(const Phrase &source - , const TargetPhrase &targetPhrase - , ScoreComponentCollection &scoreBreakdown - , ScoreComponentCollection &estimatedFutureScore) const + , const TargetPhrase &targetPhrase + , ScoreComponentCollection &scoreBreakdown + , ScoreComponentCollection &estimatedFutureScore) const { float score = - (float) targetPhrase.GetNumTerminals(); scoreBreakdown.Assign(this, score); diff --git a/moses/FF/WordPenaltyProducer.h b/moses/FF/WordPenaltyProducer.h index fc824dd84..1892c459c 100644 --- a/moses/FF/WordPenaltyProducer.h +++ b/moses/FF/WordPenaltyProducer.h @@ -14,12 +14,12 @@ class ScoreComponentCollection; class WordPenaltyProducer : public StatelessFeatureFunction { public: - WordPenaltyProducer(const std::string &line) : StatelessFeatureFunction("WordPenalty",1, line) {} + WordPenaltyProducer(const std::string &line) : StatelessFeatureFunction("WordPenalty",1, line) {} virtual void Evaluate(const Phrase &source - , const TargetPhrase &targetPhrase - , ScoreComponentCollection &scoreBreakdown - , ScoreComponentCollection &estimatedFutureScore) const; + , const TargetPhrase &targetPhrase + , ScoreComponentCollection &scoreBreakdown + , ScoreComponentCollection &estimatedFutureScore) const; }; diff --git a/moses/FF/WordTranslationFeature.cpp b/moses/FF/WordTranslationFeature.cpp index 2648ac9f1..3f282609f 100644 --- a/moses/FF/WordTranslationFeature.cpp +++ b/moses/FF/WordTranslationFeature.cpp @@ -10,18 +10,19 @@ #include "moses/UserMessage.h" #include "util/string_piece_hash.hh" -namespace Moses { +namespace Moses +{ using namespace std; WordTranslationFeature::WordTranslationFeature(const std::string &line) -:StatelessFeatureFunction("WordTranslationFeature", 0, line) 
-,m_unrestricted(true) -,m_simple(true) -,m_sourceContext(false) -,m_targetContext(false) -,m_ignorePunctuation(false) -,m_domainTrigger(false) + :StatelessFeatureFunction("WordTranslationFeature", 0, line) + ,m_unrestricted(true) + ,m_simple(true) + ,m_sourceContext(false) + ,m_targetContext(false) + ,m_ignorePunctuation(false) + ,m_domainTrigger(false) { std::cerr << "Initializing word translation feature.. " << endl; @@ -34,35 +35,25 @@ WordTranslationFeature::WordTranslationFeature(const std::string &line) if (args[0] == "input-factor") { m_factorTypeSource = Scan(args[1]); - } - else if (args[0] == "output-factor") { + } else if (args[0] == "output-factor") { m_factorTypeTarget = Scan(args[1]); - } - else if (args[0] == "simple") { + } else if (args[0] == "simple") { m_simple = Scan(args[1]); - } - else if (args[0] == "source-context") { + } else if (args[0] == "source-context") { m_sourceContext = Scan(args[1]); - } - else if (args[0] == "target-context") { + } else if (args[0] == "target-context") { m_targetContext = Scan(args[1]); - } - else if (args[0] == "ignore-punctuation") { + } else if (args[0] == "ignore-punctuation") { m_ignorePunctuation = Scan(args[1]); - } - else if (args[0] == "domain-trigger") { + } else if (args[0] == "domain-trigger") { m_domainTrigger = Scan(args[1]); - } - else if (args[0] == "texttype") { + } else if (args[0] == "texttype") { texttype = args[1]; - } - else if (args[0] == "source-path") { + } else if (args[0] == "source-path") { filenameSource = args[1]; - } - else if (args[0] == "target-path") { + } else if (args[0] == "target-path") { filenameTarget = args[1]; - } - else { + } else { throw "Unknown argument " + args[0]; } } @@ -108,65 +99,62 @@ WordTranslationFeature::WordTranslationFeature(const std::string &line) } -bool WordTranslationFeature::Load(const std::string &filePathSource, const std::string &filePathTarget) +bool WordTranslationFeature::Load(const std::string &filePathSource, const std::string &filePathTarget) { if (m_domainTrigger) { // domain trigger terms for each input document ifstream inFileSource(filePathSource.c_str()); - if (!inFileSource){ + if (!inFileSource) { cerr << "could not open file " << filePathSource << endl; return false; } - + std::string line; while (getline(inFileSource, line)) { - m_vocabDomain.resize(m_vocabDomain.size() + 1); - vector termVector; - boost::split(termVector, line, boost::is_any_of("\t ")); - for (size_t i=0; i < termVector.size(); ++i) - m_vocabDomain.back().insert(termVector[i]); + m_vocabDomain.resize(m_vocabDomain.size() + 1); + vector termVector; + boost::split(termVector, line, boost::is_any_of("\t ")); + for (size_t i=0; i < termVector.size(); ++i) + m_vocabDomain.back().insert(termVector[i]); } - + inFileSource.close(); - } - else { + } else { // restricted source word vocabulary ifstream inFileSource(filePathSource.c_str()); - if (!inFileSource) - { - cerr << "could not open file " << filePathSource << endl; - return false; - } - + if (!inFileSource) { + cerr << "could not open file " << filePathSource << endl; + return false; + } + std::string line; while (getline(inFileSource, line)) { m_vocabSource.insert(line); } - + inFileSource.close(); - + // restricted target word vocabulary ifstream inFileTarget(filePathTarget.c_str()); - if (!inFileTarget) - { - cerr << "could not open file " << filePathTarget << endl; - return false; - } - + if (!inFileTarget) { + cerr << "could not open file " << filePathTarget << endl; + return false; + } + while (getline(inFileTarget, line)) { 
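
For reference while reading Load(): domain-trigger files carry one input document per line, each listing whitespace-separated trigger terms, while the restricted source/target vocabularies carry one surface form per line. A minimal sketch of both readers under those assumptions (hypothetical helper names, not the Moses API):

#include <boost/algorithm/string.hpp>
#include <boost/unordered_set.hpp>
#include <fstream>
#include <string>
#include <vector>

typedef boost::unordered_set<std::string> Vocab;

// One surface form per line, e.g. a restricted source/target vocabulary.
bool LoadWordList(const std::string &path, Vocab &vocab)
{
  std::ifstream in(path.c_str());
  if (!in) return false;       // caller reports the unreadable file
  std::string line;
  while (getline(in, line))
    vocab.insert(line);
  return true;
}

// One document per line; each line lists that document's trigger terms.
bool LoadDomainTriggers(const std::string &path, std::vector<Vocab> &perDoc)
{
  std::ifstream in(path.c_str());
  if (!in) return false;
  std::string line;
  while (getline(in, line)) {
    perDoc.push_back(Vocab());
    std::vector<std::string> terms;
    boost::split(terms, line, boost::is_any_of("\t "));
    for (size_t i = 0; i < terms.size(); ++i)
      perDoc.back().insert(terms[i]);
  }
  return true;
}
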
m_vocabTarget.insert(line); } - + inFileTarget.close(); - + m_unrestricted = false; } return true; } void WordTranslationFeature::Evaluate - (const PhraseBasedFeatureContext& context, - ScoreComponentCollection* accumulator) const +(const PhraseBasedFeatureContext& context, + ScoreComponentCollection* accumulator) const { const Sentence& input = static_cast(context.GetSource()); const TargetPhrase& targetPhrase = context.GetTargetPhrase(); @@ -188,7 +176,7 @@ void WordTranslationFeature::Evaluate char firstChar = sourceWord[0]; CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar ); if(charIterator != m_punctuationHash.end()) - continue; + continue; firstChar = targetWord[0]; charIterator = m_punctuationHash.find( firstChar ); if(charIterator != m_punctuationHash.end()) @@ -197,9 +185,9 @@ void WordTranslationFeature::Evaluate if (!m_unrestricted) { if (FindStringPiece(m_vocabSource, sourceWord) == m_vocabSource.end()) - sourceWord = "OTHER"; + sourceWord = "OTHER"; if (FindStringPiece(m_vocabTarget, targetWord) == m_vocabTarget.end()) - targetWord = "OTHER"; + targetWord = "OTHER"; } if (m_simple) { @@ -215,174 +203,169 @@ void WordTranslationFeature::Evaluate const bool use_topicid = input.GetUseTopicId(); const bool use_topicid_prob = input.GetUseTopicIdAndProb(); if (use_topicid || use_topicid_prob) { - if(use_topicid) { - // use topicid as trigger - const long topicid = input.GetTopicId(); - stringstream feature; - feature << m_description << "_"; - if (topicid == -1) - feature << "unk"; - else - feature << topicid; + if(use_topicid) { + // use topicid as trigger + const long topicid = input.GetTopicId(); + stringstream feature; + feature << m_description << "_"; + if (topicid == -1) + feature << "unk"; + else + feature << topicid; - feature << "_"; - feature << sourceWord; - feature << "~"; - feature << targetWord; - accumulator->SparsePlusEquals(feature.str(), 1); - } - else { - // use topic probabilities - const vector &topicid_prob = *(input.GetTopicIdAndProb()); - if (atol(topicid_prob[0].c_str()) == -1) { - stringstream feature; - feature << m_description << "_unk_"; - feature << sourceWord; - feature << "~"; - feature << targetWord; - accumulator->SparsePlusEquals(feature.str(), 1); - } - else { - for (size_t i=0; i+1 < topicid_prob.size(); i+=2) { - stringstream feature; - feature << m_description << "_"; - feature << topicid_prob[i]; - feature << "_"; - feature << sourceWord; - feature << "~"; - feature << targetWord; - accumulator->SparsePlusEquals(feature.str(), atof((topicid_prob[i+1]).c_str())); - } - } - } - } - else { - // range over domain trigger words (keywords) - const long docid = input.GetDocumentId(); - for (boost::unordered_set::const_iterator p = m_vocabDomain[docid].begin(); p != m_vocabDomain[docid].end(); ++p) { - string sourceTrigger = *p; - stringstream feature; - feature << m_description << "_"; - feature << sourceTrigger; - feature << "_"; - feature << sourceWord; - feature << "~"; - feature << targetWord; - accumulator->SparsePlusEquals(feature.str(), 1); - } + feature << "_"; + feature << sourceWord; + feature << "~"; + feature << targetWord; + accumulator->SparsePlusEquals(feature.str(), 1); + } else { + // use topic probabilities + const vector &topicid_prob = *(input.GetTopicIdAndProb()); + if (atol(topicid_prob[0].c_str()) == -1) { + stringstream feature; + feature << m_description << "_unk_"; + feature << sourceWord; + feature << "~"; + feature << targetWord; + accumulator->SparsePlusEquals(feature.str(), 1); + } else { + for 
(size_t i=0; i+1 < topicid_prob.size(); i+=2) { + stringstream feature; + feature << m_description << "_"; + feature << topicid_prob[i]; + feature << "_"; + feature << sourceWord; + feature << "~"; + feature << targetWord; + accumulator->SparsePlusEquals(feature.str(), atof((topicid_prob[i+1]).c_str())); + } + } + } + } else { + // range over domain trigger words (keywords) + const long docid = input.GetDocumentId(); + for (boost::unordered_set::const_iterator p = m_vocabDomain[docid].begin(); p != m_vocabDomain[docid].end(); ++p) { + string sourceTrigger = *p; + stringstream feature; + feature << m_description << "_"; + feature << sourceTrigger; + feature << "_"; + feature << sourceWord; + feature << "~"; + feature << targetWord; + accumulator->SparsePlusEquals(feature.str(), 1); + } } } if (m_sourceContext) { size_t globalSourceIndex = context.GetTranslationOption().GetStartPos() + sourceIndex; if (!m_domainTrigger && globalSourceIndex == 0) { - // add trigger feature for source - stringstream feature; - feature << m_description << "_"; - feature << ","; - feature << sourceWord; - feature << "~"; - feature << targetWord; - accumulator->SparsePlusEquals(feature.str(), 1); + // add trigger feature for source + stringstream feature; + feature << m_description << "_"; + feature << ","; + feature << sourceWord; + feature << "~"; + feature << targetWord; + accumulator->SparsePlusEquals(feature.str(), 1); } // range over source words to get context for(size_t contextIndex = 0; contextIndex < input.GetSize(); contextIndex++ ) { - if (contextIndex == globalSourceIndex) continue; - StringPiece sourceTrigger = input.GetWord(contextIndex).GetFactor(m_factorTypeSource)->GetString(); - if (m_ignorePunctuation) { - // check if trigger is punctuation - char firstChar = sourceTrigger[0]; - CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar ); - if(charIterator != m_punctuationHash.end()) - continue; - } + if (contextIndex == globalSourceIndex) continue; + StringPiece sourceTrigger = input.GetWord(contextIndex).GetFactor(m_factorTypeSource)->GetString(); + if (m_ignorePunctuation) { + // check if trigger is punctuation + char firstChar = sourceTrigger[0]; + CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar ); + if(charIterator != m_punctuationHash.end()) + continue; + } - const long docid = input.GetDocumentId(); - bool sourceTriggerExists = false; - if (m_domainTrigger) - sourceTriggerExists = FindStringPiece(m_vocabDomain[docid], sourceTrigger ) != m_vocabDomain[docid].end(); - else if (!m_unrestricted) - sourceTriggerExists = FindStringPiece(m_vocabSource, sourceTrigger ) != m_vocabSource.end(); + const long docid = input.GetDocumentId(); + bool sourceTriggerExists = false; + if (m_domainTrigger) + sourceTriggerExists = FindStringPiece(m_vocabDomain[docid], sourceTrigger ) != m_vocabDomain[docid].end(); + else if (!m_unrestricted) + sourceTriggerExists = FindStringPiece(m_vocabSource, sourceTrigger ) != m_vocabSource.end(); - if (m_domainTrigger) { - if (sourceTriggerExists) { - stringstream feature; - feature << m_description << "_"; - feature << sourceTrigger; - feature << "_"; - feature << sourceWord; - feature << "~"; - feature << targetWord; - accumulator->SparsePlusEquals(feature.str(), 1); - } - } - else if (m_unrestricted || sourceTriggerExists) { - stringstream feature; - feature << m_description << "_"; - if (contextIndex < globalSourceIndex) { - feature << sourceTrigger; - feature << ","; - feature << sourceWord; - } - else { - feature << 
sourceWord; - feature << ","; - feature << sourceTrigger; - } - feature << "~"; - feature << targetWord; - accumulator->SparsePlusEquals(feature.str(), 1); - } + if (m_domainTrigger) { + if (sourceTriggerExists) { + stringstream feature; + feature << m_description << "_"; + feature << sourceTrigger; + feature << "_"; + feature << sourceWord; + feature << "~"; + feature << targetWord; + accumulator->SparsePlusEquals(feature.str(), 1); + } + } else if (m_unrestricted || sourceTriggerExists) { + stringstream feature; + feature << m_description << "_"; + if (contextIndex < globalSourceIndex) { + feature << sourceTrigger; + feature << ","; + feature << sourceWord; + } else { + feature << sourceWord; + feature << ","; + feature << sourceTrigger; + } + feature << "~"; + feature << targetWord; + accumulator->SparsePlusEquals(feature.str(), 1); + } } } if (m_targetContext) { throw runtime_error("Can't use target words outside current translation option in a stateless feature"); /* - size_t globalTargetIndex = cur_hypo.GetCurrTargetWordsRange().GetStartPos() + targetIndex; - if (globalTargetIndex == 0) { - // add trigger feature for source - stringstream feature; - feature << "wt_"; - feature << sourceWord; - feature << "~"; - feature << ","; - feature << targetWord; - accumulator->SparsePlusEquals(feature.str(), 1); - } + size_t globalTargetIndex = cur_hypo.GetCurrTargetWordsRange().GetStartPos() + targetIndex; + if (globalTargetIndex == 0) { + // add trigger feature for source + stringstream feature; + feature << "wt_"; + feature << sourceWord; + feature << "~"; + feature << ","; + feature << targetWord; + accumulator->SparsePlusEquals(feature.str(), 1); + } - // range over target words (up to current position) to get context - for(size_t contextIndex = 0; contextIndex < globalTargetIndex; contextIndex++ ) { - string targetTrigger = cur_hypo.GetWord(contextIndex).GetFactor(m_factorTypeTarget)->GetString(); - if (m_ignorePunctuation) { - // check if trigger is punctuation - char firstChar = targetTrigger.at(0); - CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar ); - if(charIterator != m_punctuationHash.end()) - continue; - } + // range over target words (up to current position) to get context + for(size_t contextIndex = 0; contextIndex < globalTargetIndex; contextIndex++ ) { + string targetTrigger = cur_hypo.GetWord(contextIndex).GetFactor(m_factorTypeTarget)->GetString(); + if (m_ignorePunctuation) { + // check if trigger is punctuation + char firstChar = targetTrigger.at(0); + CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar ); + if(charIterator != m_punctuationHash.end()) + continue; + } - bool targetTriggerExists = false; - if (!m_unrestricted) - targetTriggerExists = m_vocabTarget.find( targetTrigger ) != m_vocabTarget.end(); + bool targetTriggerExists = false; + if (!m_unrestricted) + targetTriggerExists = m_vocabTarget.find( targetTrigger ) != m_vocabTarget.end(); - if (m_unrestricted || targetTriggerExists) { - stringstream feature; - feature << "wt_"; - feature << sourceWord; - feature << "~"; - feature << targetTrigger; - feature << ","; - feature << targetWord; - accumulator->SparsePlusEquals(feature.str(), 1); - } - }*/ + if (m_unrestricted || targetTriggerExists) { + stringstream feature; + feature << "wt_"; + feature << sourceWord; + feature << "~"; + feature << targetTrigger; + feature << ","; + feature << targetWord; + accumulator->SparsePlusEquals(feature.str(), 1); + } + }*/ } } } void WordTranslationFeature::EvaluateChart( - const 
ChartBasedFeatureContext& context, - ScoreComponentCollection* accumulator) const + const ChartBasedFeatureContext& context, + ScoreComponentCollection* accumulator) const { const TargetPhrase& targetPhrase = context.GetTargetPhrase(); const AlignmentInfo &alignmentInfo = targetPhrase.GetAlignTerm(); @@ -403,7 +386,7 @@ void WordTranslationFeature::EvaluateChart( char firstChar = sourceWord[0]; CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar ); if(charIterator != m_punctuationHash.end()) - continue; + continue; firstChar = targetWord[0]; charIterator = m_punctuationHash.find( firstChar ); if(charIterator != m_punctuationHash.end()) @@ -411,118 +394,118 @@ void WordTranslationFeature::EvaluateChart( } if (!m_unrestricted) { - if (FindStringPiece(m_vocabSource, sourceWord) == m_vocabSource.end()) - sourceWord = "OTHER"; - if (FindStringPiece(m_vocabTarget, targetWord) == m_vocabTarget.end()) - targetWord = "OTHER"; + if (FindStringPiece(m_vocabSource, sourceWord) == m_vocabSource.end()) + sourceWord = "OTHER"; + if (FindStringPiece(m_vocabTarget, targetWord) == m_vocabTarget.end()) + targetWord = "OTHER"; } - + if (m_simple) { - // construct feature name - stringstream featureName; - featureName << m_description << "_"; - //featureName << ((sourceExists||m_unrestricted) ? sourceWord : "OTHER"); - featureName << sourceWord; - featureName << "~"; - //featureName << ((targetExists||m_unrestricted) ? targetWord : "OTHER"); - featureName << targetWord; - accumulator->SparsePlusEquals(featureName.str(), 1); + // construct feature name + stringstream featureName; + featureName << m_description << "_"; + //featureName << ((sourceExists||m_unrestricted) ? sourceWord : "OTHER"); + featureName << sourceWord; + featureName << "~"; + //featureName << ((targetExists||m_unrestricted) ? 
targetWord : "OTHER"); + featureName << targetWord; + accumulator->SparsePlusEquals(featureName.str(), 1); } - /* if (m_sourceContext) { - size_t globalSourceIndex = cur_hypo.GetCurrSourceRange().GetStartPos() + sourceIndex; - if (globalSourceIndex == 0) { - // add trigger feature for source - stringstream feature; - feature << "wt_"; - feature << ","; - feature << sourceWord; - feature << "~"; - feature << targetWord; - accumulator->SparsePlusEquals(feature.str(), 1); - cerr << feature.str() << endl; - } + /* if (m_sourceContext) { + size_t globalSourceIndex = cur_hypo.GetCurrSourceRange().GetStartPos() + sourceIndex; + if (globalSourceIndex == 0) { + // add trigger feature for source + stringstream feature; + feature << "wt_"; + feature << ","; + feature << sourceWord; + feature << "~"; + feature << targetWord; + accumulator->SparsePlusEquals(feature.str(), 1); + cerr << feature.str() << endl; + } - // range over source words to get context - for(size_t contextIndex = 0; contextIndex < input.GetSize(); contextIndex++ ) { - if (contextIndex == globalSourceIndex) continue; - string sourceTrigger = input.GetWord(contextIndex).GetFactor(m_factorTypeSource)->GetString(); - if (m_ignorePunctuation) { - // check if trigger is punctuation - char firstChar = sourceTrigger.at(0); - CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar ); - if(charIterator != m_punctuationHash.end()) - continue; - } + // range over source words to get context + for(size_t contextIndex = 0; contextIndex < input.GetSize(); contextIndex++ ) { + if (contextIndex == globalSourceIndex) continue; + string sourceTrigger = input.GetWord(contextIndex).GetFactor(m_factorTypeSource)->GetString(); + if (m_ignorePunctuation) { + // check if trigger is punctuation + char firstChar = sourceTrigger.at(0); + CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar ); + if(charIterator != m_punctuationHash.end()) + continue; + } - bool sourceTriggerExists = false; - if (!m_unrestricted) - sourceTriggerExists = m_vocabSource.find( sourceTrigger ) != m_vocabSource.end(); + bool sourceTriggerExists = false; + if (!m_unrestricted) + sourceTriggerExists = m_vocabSource.find( sourceTrigger ) != m_vocabSource.end(); - if (m_unrestricted || sourceTriggerExists) { - stringstream feature; - feature << "wt_"; - if (contextIndex < globalSourceIndex) { - feature << sourceTrigger; - feature << ","; - feature << sourceWord; - } - else { - feature << sourceWord; - feature << ","; - feature << sourceTrigger; - } - feature << "~"; - feature << targetWord; - accumulator->SparsePlusEquals(feature.str(), 1); - cerr << feature.str() << endl; - } - } - }*/ -/* if (m_targetContext) { - size_t globalTargetIndex = 0; // TODO -// size_t globalTargetIndex = cur_hypo.GetCurrTargetWordsRange().GetStartPos() + targetIndex; - if (globalTargetIndex == 0) { - // add trigger feature for source - stringstream feature; - feature << "wt_"; - feature << sourceWord; - feature << "~"; - feature << ","; - feature << targetWord; - accumulator->SparsePlusEquals(feature.str(), 1); - cerr << feature.str() << endl; - } - - // range over target words (up to current position) to get context - for(size_t contextIndex = 0; contextIndex < globalTargetIndex; contextIndex++ ) { - Phrase outputPhrase = cur_hypo.GetOutputPhrase(); - string targetTrigger = outputPhrase.GetWord(contextIndex).GetFactor(m_factorTypeTarget)->GetString(); - //string targetTrigger = cur_hypo.GetWord(contextIndex).GetFactor(m_factorTypeTarget)->GetString(); - if 
(m_ignorePunctuation) { - // check if trigger is punctuation - char firstChar = targetTrigger.at(0); - CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar ); - if(charIterator != m_punctuationHash.end()) - continue; - } - - bool targetTriggerExists = false; - if (!m_unrestricted) - targetTriggerExists = m_vocabTarget.find( targetTrigger ) != m_vocabTarget.end(); - - if (m_unrestricted || targetTriggerExists) { - stringstream feature; - feature << "wt_"; - feature << sourceWord; - feature << "~"; - feature << targetTrigger; - feature << ","; - feature << targetWord; - accumulator->SparsePlusEquals(feature.str(), 1); - cerr << feature.str() << endl; - } - } + if (m_unrestricted || sourceTriggerExists) { + stringstream feature; + feature << "wt_"; + if (contextIndex < globalSourceIndex) { + feature << sourceTrigger; + feature << ","; + feature << sourceWord; + } + else { + feature << sourceWord; + feature << ","; + feature << sourceTrigger; + } + feature << "~"; + feature << targetWord; + accumulator->SparsePlusEquals(feature.str(), 1); + cerr << feature.str() << endl; + } + } }*/ + /* if (m_targetContext) { + size_t globalTargetIndex = 0; // TODO + // size_t globalTargetIndex = cur_hypo.GetCurrTargetWordsRange().GetStartPos() + targetIndex; + if (globalTargetIndex == 0) { + // add trigger feature for source + stringstream feature; + feature << "wt_"; + feature << sourceWord; + feature << "~"; + feature << ","; + feature << targetWord; + accumulator->SparsePlusEquals(feature.str(), 1); + cerr << feature.str() << endl; + } + + // range over target words (up to current position) to get context + for(size_t contextIndex = 0; contextIndex < globalTargetIndex; contextIndex++ ) { + Phrase outputPhrase = cur_hypo.GetOutputPhrase(); + string targetTrigger = outputPhrase.GetWord(contextIndex).GetFactor(m_factorTypeTarget)->GetString(); + //string targetTrigger = cur_hypo.GetWord(contextIndex).GetFactor(m_factorTypeTarget)->GetString(); + if (m_ignorePunctuation) { + // check if trigger is punctuation + char firstChar = targetTrigger.at(0); + CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar ); + if(charIterator != m_punctuationHash.end()) + continue; + } + + bool targetTriggerExists = false; + if (!m_unrestricted) + targetTriggerExists = m_vocabTarget.find( targetTrigger ) != m_vocabTarget.end(); + + if (m_unrestricted || targetTriggerExists) { + stringstream feature; + feature << "wt_"; + feature << sourceWord; + feature << "~"; + feature << targetTrigger; + feature << ","; + feature << targetWord; + accumulator->SparsePlusEquals(feature.str(), 1); + cerr << feature.str() << endl; + } + } + }*/ } } diff --git a/moses/FF/WordTranslationFeature.h b/moses/FF/WordTranslationFeature.h index 3379e8c84..b3a434325 100644 --- a/moses/FF/WordTranslationFeature.h +++ b/moses/FF/WordTranslationFeature.h @@ -14,11 +14,12 @@ namespace Moses /** Sets the features for word translation */ -class WordTranslationFeature : public StatelessFeatureFunction { +class WordTranslationFeature : public StatelessFeatureFunction +{ typedef std::map< char, short > CharHash; typedef std::vector< boost::unordered_set > DocumentVector; - + private: boost::unordered_set m_vocabSource; boost::unordered_set m_vocabTarget; @@ -32,18 +33,18 @@ private: bool m_domainTrigger; bool m_ignorePunctuation; CharHash m_punctuationHash; - + public: WordTranslationFeature(const std::string &line); - + bool Load(const std::string &filePathSource, const std::string &filePathTarget); - + const FFState* 
EmptyHypothesisState(const InputType &) const { return new DummyState(); } - - void Evaluate(const PhraseBasedFeatureContext& context, - ScoreComponentCollection* accumulator) const; + + void Evaluate(const PhraseBasedFeatureContext& context, + ScoreComponentCollection* accumulator) const; void EvaluateChart(const ChartBasedFeatureContext& context, ScoreComponentCollection* accumulator) const; diff --git a/moses/Factor.h b/moses/Factor.h index 87e8f8028..f4bb2074d 100644 --- a/moses/Factor.h +++ b/moses/Factor.h @@ -34,8 +34,8 @@ namespace Moses struct FactorFriend; class FactorCollection; -/** Represents a factor (word, POS, etc). - * A Factor has a contiguous identifier and string value. +/** Represents a factor (word, POS, etc). + * A Factor has a contiguous identifier and string value. */ class Factor { @@ -53,10 +53,10 @@ class Factor //! protected constructor. only friend class, FactorCollection, is allowed to create Factor objects Factor() {} - // Needed for STL containers. They'll delegate through FactorFriend, which is never exposed publicly. + // Needed for STL containers. They'll delegate through FactorFriend, which is never exposed publicly. Factor(const Factor &factor) : m_string(factor.m_string), m_id(factor.m_id) {} - // Not implemented. Shouldn't be called. + // Not implemented. Shouldn't be called. Factor &operator=(const Factor &factor); public: diff --git a/moses/FactorCollection.cpp b/moses/FactorCollection.cpp index 969bb39d1..5d6eb1c53 100644 --- a/moses/FactorCollection.cpp +++ b/moses/FactorCollection.cpp @@ -38,11 +38,12 @@ FactorCollection FactorCollection::s_instance; const Factor *FactorCollection::AddFactor(const StringPiece &factorString) { FactorFriend to_ins; - to_ins.in.m_string = factorString; + to_ins.in.m_string = factorString; to_ins.in.m_id = m_factorId; // If we're threaded, hope a read-only lock is sufficient. #ifdef WITH_THREADS - { // read=lock scope + { + // read=lock scope boost::shared_lock read_lock(m_accessLock); Set::const_iterator i = m_set.find(to_ins); if (i != m_set.end()) return &i->in; @@ -52,8 +53,8 @@ const Factor *FactorCollection::AddFactor(const StringPiece &factorString) std::pair ret(m_set.insert(to_ins)); if (ret.second) { ret.first->in.m_string.set( - memcpy(m_string_backing.Allocate(factorString.size()), factorString.data(), factorString.size()), - factorString.size()); + memcpy(m_string_backing.Allocate(factorString.size()), factorString.data(), factorString.size()), + factorString.size()); m_factorId++; } return &ret.first->in; diff --git a/moses/FactorCollection.h b/moses/FactorCollection.h index e7749244f..8c3db5da9 100644 --- a/moses/FactorCollection.h +++ b/moses/FactorCollection.h @@ -44,7 +44,7 @@ namespace Moses * private and friended to FactorFriend. The STL containers can delegate * copying, so friending the container isn't sufficient. STL containers see * FactorFriend's public copy constructor and everybody else sees Factor's - * private copy constructor. + * private copy constructor. 
 */
struct FactorFriend {
  Factor in;
diff --git a/moses/FeatureVector.cpp b/moses/FeatureVector.cpp
index f58bb5cab..96dd9a0ce 100644
--- a/moses/FeatureVector.cpp
+++ b/moses/FeatureVector.cpp
@@ -1,22 +1,22 @@
 /*
 Moses - factored phrase-based language decoder
 Copyright (C) 2010 University of Edinburgh
-
-
+
+
 This program is free software; you can redistribute it and/or
 modify it under the terms of the GNU General Public License
 as published by the Free Software Foundation; either version 2
 of the License, or (at your option) any later version.
-
+
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
-
+
 You should have received a copy of the GNU General Public License
 along with this program; if not, write to the Free Software
 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
-
+
 */

 #include
@@ -31,744 +31,815 @@

 using namespace std;

-namespace Moses {
-
-  const string FName::SEP = "_";
-  FName::Name2Id FName::name2id;
-  vector<string> FName::id2name;
-  FName::Id2Count FName::id2hopeCount;
-  FName::Id2Count FName::id2fearCount;
+namespace Moses
+{
+
+const string FName::SEP = "_";
+FName::Name2Id FName::name2id;
+vector<string> FName::id2name;
+FName::Id2Count FName::id2hopeCount;
+FName::Id2Count FName::id2fearCount;
#ifdef WITH_THREADS
-  boost::shared_mutex FName::m_idLock;
+boost::shared_mutex FName::m_idLock;
#endif
-
-  void FName::init(const StringPiece &name) {
+
+void FName::init(const StringPiece &name)
+{
#ifdef WITH_THREADS
-    //reader lock
-    boost::shared_lock<boost::shared_mutex> lock(m_idLock);
+  //reader lock
+  boost::shared_lock<boost::shared_mutex> lock(m_idLock);
#endif
-    Name2Id::iterator i = FindStringPiece(name2id, name);
-    if (i != name2id.end()) {
-      m_id = i->second;
-    } else {
+  Name2Id::iterator i = FindStringPiece(name2id, name);
+  if (i != name2id.end()) {
+    m_id = i->second;
+  } else {
#ifdef WITH_THREADS
-      //release the reader lock, and upgrade to writer lock
-      lock.unlock();
-      boost::unique_lock<boost::shared_mutex> write_lock(m_idLock);
+    //release the reader lock, and upgrade to writer lock
+    lock.unlock();
+    boost::unique_lock<boost::shared_mutex> write_lock(m_idLock);
#endif
-      std::pair<std::string, size_t> to_ins;
-      to_ins.first.assign(name.data(), name.size());
-      to_ins.second = name2id.size();
-      std::pair<Name2Id::iterator, bool> res(name2id.insert(to_ins));
-      if (res.second) {
-        // TODO this should be string pointers backed by the hash table.
-        id2name.push_back(to_ins.first);
-      }
-      m_id = res.first->second;
-    }
-  }
-
-  size_t FName::getId(const string& name) {
-    Name2Id::iterator i = name2id.find(name);
-    assert (i != name2id.end());
-    return i->second;
-  }
-
-  size_t FName::getHopeIdCount(const string& name) {
-    Name2Id::iterator i = name2id.find(name);
-    if (i != name2id.end()) {
-      float id = i->second;
-      return id2hopeCount[id];
-    }
-    return 0;
-  }
-
-  size_t FName::getFearIdCount(const string& name) {
-    Name2Id::iterator i = name2id.find(name);
-    if (i != name2id.end()) {
-      float id = i->second;
-      return id2fearCount[id];
-    }
-    return 0;
-  }
-
-  void FName::incrementHopeId(const string& name) {
-    Name2Id::iterator i = name2id.find(name);
-    assert(i != name2id.end());
-#ifdef WITH_THREADS
-    // get upgradable lock and upgrade to writer lock
-    boost::upgrade_lock<boost::shared_mutex> upgradeLock(m_idLock);
-    boost::upgrade_to_unique_lock<boost::shared_mutex> uniqueLock(upgradeLock);
-#endif
-    id2hopeCount[i->second] += 1;
-  }
-
-  void FName::incrementFearId(const string& name) {
-    Name2Id::iterator i = name2id.find(name);
-    assert(i != name2id.end());
-#ifdef WITH_THREADS
-    // get upgradable lock and upgrade to writer lock
-    boost::upgrade_lock<boost::shared_mutex> upgradeLock(m_idLock);
-    boost::upgrade_to_unique_lock<boost::shared_mutex> uniqueLock(upgradeLock);
-#endif
-    id2fearCount[i->second] += 1;
-  }
-
-  void FName::eraseId(size_t id) {
-#ifdef WITH_THREADS
-    // get upgradable lock and upgrade to writer lock
-    boost::upgrade_lock<boost::shared_mutex> upgradeLock(m_idLock);
-    boost::upgrade_to_unique_lock<boost::shared_mutex> uniqueLock(upgradeLock);
-#endif
-    id2hopeCount.erase(id);
-    id2fearCount.erase(id);
-  }
-
-  std::ostream& operator<<( std::ostream& out, const FName& name) {
-    out << name.name();
-    return out;
-  }
-
-  size_t FName::hash() const {
-    return boost::hash_value(m_id);
-  }
-
-  const std::string& FName::name() const {
-    return id2name[m_id];
-  }
-
-
-  bool FName::operator==(const FName& rhs) const {
-    return m_id == rhs.m_id;
-  }
-
-  bool FName::operator!=(const FName& rhs) const {
-    return ! (*this == rhs);
-  }
-
-  FVector::FVector(size_t coreFeatures) : m_coreFeatures(coreFeatures) {}
-
-  void FVector::resize(size_t newsize) {
-    valarray<FValue> oldValues(m_coreFeatures);
-    m_coreFeatures.resize(newsize);
-    for (size_t i = 0; i < min(m_coreFeatures.size(), oldValues.size()); ++i) {
-      m_coreFeatures[i] = oldValues[i];
-    }
-  }
-
-  void FVector::clear() {
-    m_coreFeatures.resize(0);
-    m_features.clear();
-  }
-
-  bool FVector::load(const std::string& filename) {
-    clear();
-    ifstream in (filename.c_str());
-    if (!in) {
-      return false;
-    }
-    string line;
-    while(getline(in,line)) {
-      if (line[0] == '#') continue;
-      istringstream linestream(line);
-      string namestring;
-      FValue value;
-      linestream >> namestring;
-      linestream >> value;
-      FName fname(namestring);
-      //cerr << "Setting sparse weight " << fname << " to value " << value << "." << endl;
-      set(fname,value);
-    }
-    return true;
-  }
-
-  void FVector::save(const string& filename) const {
-    ofstream out(filename.c_str());
-    if (!out) {
-      ostringstream msg;
-      msg << "Unable to open " << filename;
-      throw runtime_error(msg.str());
-    }
-    write(out);
-    out.close();
-  }
-
-  void FVector::write(ostream& out) const {
-    for (const_iterator i = cbegin(); i != cend(); ++i) {
-      out << i->first << " " << i->second << endl;
-    }
-  }
-
-  static bool equalsTolerance(FValue lhs, FValue rhs) {
-    if (lhs == rhs) return true;
-    static const FValue TOLERANCE = 1e-4;
-    FValue diff = abs(lhs-rhs);
-    FValue mean = (abs(lhs)+abs(rhs))/2;
-    //cerr << "ET " << lhs << " " << rhs << " " << diff << " " << mean << " " << endl;
-    return diff/mean < TOLERANCE ;
-  }
-
-  bool FVector::operator== (const FVector& rhs) const {
-    if (this == &rhs) {
-      return true;
-    }
-    if (m_coreFeatures.size() != rhs.m_coreFeatures.size()) {
-      return false;
-    }
-    for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
-      if (!equalsTolerance(m_coreFeatures[i], rhs.m_coreFeatures[i])) return false;
-    }
-    for (const_iterator i = cbegin(); i != cend(); ++i) {
-      if (!equalsTolerance(i->second,rhs.get(i->first))) return false;
-    }
-    for (const_iterator i = rhs.cbegin(); i != rhs.cend(); ++i) {
-      if (!equalsTolerance(i->second, get(i->first))) return false;
-    }
-    return true;
-  }
-
-  bool FVector::operator!= (const FVector& rhs) const {
-    return ! (*this == rhs);
-  }
-
-  ProxyFVector FVector::operator[](const FName& name) {
-    // At this point, we don't know whether operator[] was called, so we return
-    // a proxy object and defer the decision until later
-    return ProxyFVector(this, name);
-  }
-
-  /** Equivalent for core features. */
-  FValue& FVector::operator[](size_t index) {
-    return m_coreFeatures[index];
-  }
-
-
-  FValue FVector::operator[](const FName& name) const {
-    return get(name);
-  }
-
-  FValue FVector::operator[](size_t index) const {
-    return m_coreFeatures[index];
-  }
-
-  ostream& FVector::print(ostream& out) const {
-    out << "core=(";
-    for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
-      out << m_coreFeatures[i];
-      if (i + 1 < m_coreFeatures.size()) {
-        out << ",";
-      }
-    }
-    out << ") ";
-    for (const_iterator i = cbegin(); i != cend(); ++i) {
-      if (i != cbegin())
-        out << " ";
-      out << i->first << "=" << i->second;
-    }
-    return out;
-  }
-
-  ostream& operator<<(ostream& out, const FVector& fv) {
-    return fv.print(out);
-  }
-
-  const FValue& FVector::get(const FName& name) const {
-    static const FValue DEFAULT = 0;
-    const_iterator fi = m_features.find(name);
-    if (fi == m_features.end()) {
-      return DEFAULT;
-    } else {
-      return fi->second;
-    }
-  }
-
-  FValue FVector::getBackoff(const FName& name, float backoff) const {
-    const_iterator fi = m_features.find(name);
-    if (fi == m_features.end()) {
-      return backoff;
-    } else {
-      return fi->second;
-    }
-  }
-
-  void FVector::thresholdScale(FValue maxValue ) {
-    FValue factor = 1.0;
-    for (const_iterator i = cbegin(); i != cend(); ++i) {
-      FValue value = i->second;
-      if (abs(value)*factor > maxValue) {
-        factor = abs(value) / maxValue;
-      }
-    }
-    operator*=(factor);
-  }
-
-  void FVector::capMax(FValue maxValue) {
-    for (const_iterator i = cbegin(); i != cend(); ++i)
-      if (i->second > maxValue)
-        set(i->first, maxValue);
-  }
-
-  void FVector::capMin(FValue minValue) {
-    for (const_iterator i = cbegin(); i != cend(); ++i)
-      if (i->second < minValue)
-        set(i->first, minValue);
-  }
-
-  void FVector::set(const FName& name, const FValue& value) {
-    m_features[name] = value;
-  }
-
-  void FVector::printCoreFeatures() {
-    cerr << "core=(";
-    for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
-      cerr << m_coreFeatures[i];
-      if (i + 1 < m_coreFeatures.size()) {
-        cerr << ",";
-      }
-    }
-    cerr << ") ";
-  }
-
-  FVector& FVector::operator+= (const FVector& rhs) {
-    if (rhs.m_coreFeatures.size() > m_coreFeatures.size())
-      resize(rhs.m_coreFeatures.size());
-    for (const_iterator i = rhs.cbegin(); i != rhs.cend(); ++i)
-      set(i->first, get(i->first) + i->second);
-    for (size_t i = 0; i < rhs.m_coreFeatures.size(); ++i)
-      m_coreFeatures[i] += rhs.m_coreFeatures[i];
-    return *this;
-  }
-
-  // add only sparse features
-  void FVector::sparsePlusEquals(const FVector& rhs) {
-    for (const_iterator i = rhs.cbegin(); i != rhs.cend(); ++i)
-      set(i->first, get(i->first) + i->second);
-  }
-
-  // assign only core features
-  void FVector::coreAssign(const FVector& rhs) {
-    for (size_t i = 0; i < rhs.m_coreFeatures.size(); ++i)
-      m_coreFeatures[i] = rhs.m_coreFeatures[i];
-  }
-
-  void FVector::incrementSparseHopeFeatures() {
-    for (const_iterator i = cbegin(); i != cend(); ++i)
-      FName::incrementHopeId((i->first).name());
-  }
-
-  void FVector::incrementSparseFearFeatures() {
-    for (const_iterator i = cbegin(); i != cend(); ++i)
-      FName::incrementFearId((i->first).name());
-  }
-
-  void FVector::printSparseHopeFeatureCounts(std::ofstream& out) {
-    for (const_iterator i = cbegin(); i != cend(); ++i)
-      out << (i->first).name() << ": " << FName::getHopeIdCount((i->first).name()) << std::endl;
-  }
-
-  void FVector::printSparseFearFeatureCounts(std::ofstream& out) {
-    for (const_iterator i = cbegin(); i != cend(); ++i)
-      out << (i->first).name() << ": " << FName::getFearIdCount((i->first).name()) << std::endl;
-  }
-
-  void FVector::printSparseHopeFeatureCounts() {
-    for (const_iterator i = cbegin(); i != cend(); ++i)
-      std::cerr << (i->first).name() << ": " << FName::getHopeIdCount((i->first).name()) << std::endl;
-  }
-
-  void FVector::printSparseFearFeatureCounts() {
-    for (const_iterator i = cbegin(); i != cend(); ++i)
-      std::cerr << (i->first).name() << ": " << FName::getFearIdCount((i->first).name()) << std::endl;
-  }
-
-  size_t FVector::pruneSparseFeatures(size_t threshold) {
-    size_t count = 0;
-    vector<FName> toErase;
-    for (const_iterator i = cbegin(); i != cend(); ++i) {
-      const std::string& fname = (i->first).name();
-      if (FName::getHopeIdCount(fname) < threshold && FName::getFearIdCount(fname) < threshold) {
-        toErase.push_back(i->first);
-        std::cerr << "pruning: " << fname << " (" << FName::getHopeIdCount(fname) << ", " << FName::getFearIdCount(fname) << ")" << std::endl;
-        FName::eraseId(FName::getId(fname));
-        ++count;
-      }
-    }
-
-    for (size_t i = 0; i < toErase.size(); ++i)
-      m_features.erase(toErase[i]);
-
-    return count;
-  }
-
-  size_t FVector::pruneZeroWeightFeatures() {
-    size_t count = 0;
-    vector<FName> toErase;
-    for (const_iterator i = cbegin(); i != cend(); ++i) {
-      const std::string& fname = (i->first).name();
-      if (i->second == 0) {
-        toErase.push_back(i->first);
-        //std::cerr << "prune: " << fname << std::endl;
-        FName::eraseId(FName::getId(fname));
-        ++count;
-      }
-    }
-
-    for (size_t i = 0; i < toErase.size(); ++i)
-      m_features.erase(toErase[i]);
-
-    return count;
-  }
-
-  void FVector::updateConfidenceCounts(const FVector& weightUpdate, bool signedCounts) {
-    for (size_t i = 0; i < weightUpdate.m_coreFeatures.size(); ++i) {
-      if (signedCounts) {
-        //int sign = weightUpdate.m_coreFeatures[i] >= 0 ? 1 : -1;
-        //m_coreFeatures[i] += (weightUpdate.m_coreFeatures[i] * weightUpdate.m_coreFeatures[i]) * sign;
-        m_coreFeatures[i] += weightUpdate.m_coreFeatures[i];
-      }
-      else
-        //m_coreFeatures[i] += (weightUpdate.m_coreFeatures[i] * weightUpdate.m_coreFeatures[i]);
-        m_coreFeatures[i] += abs(weightUpdate.m_coreFeatures[i]);
-    }
-
-    for (const_iterator i = weightUpdate.cbegin(); i != weightUpdate.cend(); ++i) {
-      if (weightUpdate[i->first] == 0)
-        continue;
-      float value = get(i->first);
-      if (signedCounts) {
-        //int sign = weightUpdate[i->first] >= 0 ? 1 : -1;
-        //value += (weightUpdate[i->first] * weightUpdate[i->first]) * sign;
-        value += weightUpdate[i->first];
-      }
-      else
-        //value += (weightUpdate[i->first] * weightUpdate[i->first]);
-        value += abs(weightUpdate[i->first]);
-      set(i->first, value);
-    }
-  }
-
-  void FVector::updateLearningRates(float decay_core, float decay_sparse, const FVector &confidenceCounts, float core_r0, float sparse_r0) {
-    for (size_t i = 0; i < confidenceCounts.m_coreFeatures.size(); ++i) {
-      m_coreFeatures[i] = 1.0/(1.0/core_r0 + decay_core * abs(confidenceCounts.m_coreFeatures[i]));
-    }
-
-    for (const_iterator i = confidenceCounts.cbegin(); i != confidenceCounts.cend(); ++i) {
-      float value = 1.0/(1.0/sparse_r0 + decay_sparse * abs(i->second));
-      set(i->first, value);
-    }
-  }
-
-  // count non-zero occurrences for all sparse features
-  void FVector::setToBinaryOf(const FVector& rhs) {
-    for (const_iterator i = rhs.cbegin(); i != rhs.cend(); ++i)
-      if (rhs.get(i->first) != 0)
-        set(i->first, 1);
-    for (size_t i = 0; i < rhs.m_coreFeatures.size(); ++i)
-      m_coreFeatures[i] = 1;
-  }
-
-  // divide only core features by scalar
-  FVector& FVector::coreDivideEquals(float scalar) {
-    for (size_t i = 0; i < m_coreFeatures.size(); ++i)
-      m_coreFeatures[i] /= scalar;
-    return *this;
-  }
-
-  // lhs vector is a sum of vectors, rhs vector holds number of non-zero summands
-  FVector& FVector::divideEquals(const FVector& rhs) {
-    assert(m_coreFeatures.size() == rhs.m_coreFeatures.size());
-    for (const_iterator i = rhs.cbegin(); i != rhs.cend(); ++i)
-      set(i->first, get(i->first)/rhs.get(i->first)); // divide by number of summands
-    for (size_t i = 0; i < rhs.m_coreFeatures.size(); ++i)
-      m_coreFeatures[i] /= rhs.m_coreFeatures[i]; // divide by number of summands
-    return *this;
-  }
-
-  FVector& FVector::operator-= (const FVector& rhs) {
-    if (rhs.m_coreFeatures.size() > m_coreFeatures.size())
-      resize(rhs.m_coreFeatures.size());
-    for (const_iterator i = rhs.cbegin(); i != rhs.cend(); ++i)
-      set(i->first, get(i->first) -(i->second));
-    for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
-      if (i < rhs.m_coreFeatures.size()) {
-        m_coreFeatures[i] -= rhs.m_coreFeatures[i];
-      }
-    }
-    return *this;
-  }
-
-  FVector& FVector::operator*= (const FVector& rhs) {
-    if (rhs.m_coreFeatures.size() > m_coreFeatures.size()) {
-      resize(rhs.m_coreFeatures.size());
-    }
-    for (iterator i = begin(); i != end(); ++i) {
-      FValue lhsValue = i->second;
-      FValue rhsValue = rhs.get(i->first);
-      set(i->first,lhsValue*rhsValue);
-    }
-    for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
-      if (i < rhs.m_coreFeatures.size()) {
-        m_coreFeatures[i] *= rhs.m_coreFeatures[i];
-      } else {
-        m_coreFeatures[i] = 0;
-      }
-    }
-    return *this;
-  }
-
-  FVector& FVector::operator/= (const FVector& rhs) {
-    if (rhs.m_coreFeatures.size() > m_coreFeatures.size()) {
-      resize(rhs.m_coreFeatures.size());
-    }
-    for (iterator i = begin(); i != end(); ++i) {
-      FValue lhsValue = i->second;
-      FValue rhsValue = rhs.get(i->first);
-      set(i->first, lhsValue / rhsValue) ;
-    }
-    for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
-      if (i < rhs.m_coreFeatures.size()) {
-        m_coreFeatures[i] /= rhs.m_coreFeatures[i];
-      } else {
-        if (m_coreFeatures[i] < 0) {
-          m_coreFeatures[i] = -numeric_limits<float>::infinity();
-        } else if (m_coreFeatures[i] > 0) {
-          m_coreFeatures[i] = numeric_limits<float>::infinity();
-        }
-      }
-    }
-    return *this;
-  }
-
-  FVector& FVector::operator*= (const FValue& rhs) {
-    //NB Could do this with boost::bind ?
-    for (iterator i = begin(); i != end(); ++i) {
-      i->second *= rhs;
-    }
-    m_coreFeatures *= rhs;
-    return *this;
-  }
-
-  FVector& FVector::operator/= (const FValue& rhs) {
-    for (iterator i = begin(); i != end(); ++i) {
-      i->second /= rhs;
-    }
-    m_coreFeatures /= rhs;
-    return *this;
-  }
-
-  FVector& FVector::multiplyEqualsBackoff(const FVector& rhs, float backoff) {
-    if (rhs.m_coreFeatures.size() > m_coreFeatures.size()) {
-      resize(rhs.m_coreFeatures.size());
-    }
-    for (iterator i = begin(); i != end(); ++i) {
-      FValue lhsValue = i->second;
-      FValue rhsValue = rhs.getBackoff(i->first, backoff);
-      set(i->first,lhsValue*rhsValue);
-    }
-    for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
-      if (i < rhs.m_coreFeatures.size()) {
-        m_coreFeatures[i] *= rhs.m_coreFeatures[i];
-      } else {
-        m_coreFeatures[i] = 0;
-      }
-    }
-    return *this;
-  }
-
-  FVector& FVector::multiplyEquals(float core_r0, float sparse_r0) {
-    for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
-      m_coreFeatures[i] *= core_r0;
-    }
-    for (iterator i = begin(); i != end(); ++i)
-      set(i->first,(i->second)*sparse_r0);
-    return *this;
-  }
-
-  FValue FVector::l1norm() const {
-    FValue norm = 0;
-    for (const_iterator i = cbegin(); i != cend(); ++i) {
-      norm += abs(i->second);
-    }
-    for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
-      norm += abs(m_coreFeatures[i]);
-    }
-    return norm;
-  }
-
-  FValue FVector::l1norm_coreFeatures() const {
-    FValue norm = 0;
-    // ignore Bleu score feature (last feature)
-    for (size_t i = 0; i < m_coreFeatures.size()-1; ++i)
-      norm += abs(m_coreFeatures[i]);
-    return norm;
-  }
-
-  FValue FVector::l2norm() const {
-    return sqrt(inner_product(*this));
-  }
-
-  FValue FVector::linfnorm() const {
-    FValue norm = 0;
-    for (const_iterator i = cbegin(); i != cend(); ++i) {
-      float absValue = abs(i->second);
-      if (absValue > norm)
-        norm = absValue;
-    }
-    for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
-      float absValue = abs(m_coreFeatures[i]);
-      if (absValue > norm)
-        norm = absValue;
-    }
-    return norm;
-  }
-
-  size_t FVector::l1regularize(float lambda) {
-    for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
-      float value = m_coreFeatures[i];
-      if (value > 0) {
-        m_coreFeatures[i] = max(0.0f, value - lambda);
-      }
-      else {
-        m_coreFeatures[i] = min(0.0f, value + lambda);
-      }
-    }
-
-    size_t numberPruned = size();
-    vector<FName> toErase;
-    for (iterator i = begin(); i != end(); ++i) {
-      float value = i->second;
-      if (value != 0.0f) {
-        if (value > 0)
-          value = max(0.0f, value - lambda);
-        else
-          value = min(0.0f, value + lambda);
-
-        if (value != 0.0f)
-          i->second = value;
-        else {
-          toErase.push_back(i->first);
-          const std::string& fname = (i->first).name();
-          FName::eraseId(FName::getId(fname));
-        }
-      }
-    }
-
-    // erase features that have become zero
-    for (size_t i = 0; i < toErase.size(); ++i)
-      m_features.erase(toErase[i]);
-    numberPruned -= size();
-    return numberPruned;
-  }
-
-  void FVector::l2regularize(float lambda) {
-    for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
-      m_coreFeatures[i] *= (1 - lambda);
-    }
-
-    for (iterator i = begin(); i != end(); ++i) {
-      i->second *= (1 - lambda);
-    }
-  }
-
-  size_t FVector::sparseL1regularize(float lambda) {
-    /*for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
-      float value = m_coreFeatures[i];
-      if (value > 0) {
-        m_coreFeatures[i] = max(0.0f, value - lambda);
-      }
-      else {
-        m_coreFeatures[i] = min(0.0f, value + lambda);
-      }
-      }*/
-
-    size_t numberPruned = size();
-    vector<FName> toErase;
-    for (iterator i = begin(); i != end(); ++i) {
-      float value = i->second;
-      if (value != 0.0f) {
-        if (value > 0)
-          value = max(0.0f, value - lambda);
-        else
-          value = min(0.0f, value + lambda);
-
-        if (value != 0.0f)
-          i->second = value;
-        else {
-          toErase.push_back(i->first);
-          const std::string& fname = (i->first).name();
-          FName::eraseId(FName::getId(fname));
-        }
-      }
-    }
-
-    // erase features that have become zero
-    for (size_t i = 0; i < toErase.size(); ++i)
-      m_features.erase(toErase[i]);
-    numberPruned -= size();
-    return numberPruned;
-  }
-
-  void FVector::sparseL2regularize(float lambda) {
-    /*for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
-      m_coreFeatures[i] *= (1 - lambda);
-      }*/
-
-    for (iterator i = begin(); i != end(); ++i) {
-      i->second *= (1 - lambda);
-    }
-  }
-
-  FValue FVector::sum() const {
-    FValue sum = 0;
-    for (const_iterator i = cbegin(); i != cend(); ++i) {
-      sum += i->second;
-    }
-    sum += m_coreFeatures.sum();
-    return sum;
-  }
-
-  FValue FVector::inner_product(const FVector& rhs) const {
-    CHECK(m_coreFeatures.size() == rhs.m_coreFeatures.size());
-    FValue product = 0.0;
-    for (const_iterator i = cbegin(); i != cend(); ++i) {
-      product += ((i->second)*(rhs.get(i->first)));
-    }
-    for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
-      product += m_coreFeatures[i]*rhs.m_coreFeatures[i];
-    }
-    return product;
-  }
-
-  const FVector operator+(const FVector& lhs, const FVector& rhs) {
-    return FVector(lhs) += rhs;
-  }
-
-  const FVector operator-(const FVector& lhs, const FVector& rhs) {
-    return FVector(lhs) -= rhs;
-  }
-
-  const FVector operator*(const FVector& lhs, const FVector& rhs) {
-    return FVector(lhs) *= rhs;
-  }
-
-  const FVector operator/(const FVector& lhs, const FVector& rhs) {
-    return FVector(lhs) /= rhs;
-  }
-
-
-  const FVector operator*(const FVector& lhs, const FValue& rhs) {
-    return FVector(lhs) *= rhs;
-  }
-
-  const FVector operator/(const FVector& lhs, const FValue& rhs) {
-    return FVector(lhs) /= rhs;
-  }
-
-  FValue inner_product(const FVector& lhs, const FVector& rhs) {
-    if (lhs.size() >= rhs.size()) {
-      return rhs.inner_product(lhs);
-    } else {
-      return lhs.inner_product(rhs);
+    std::pair<std::string, size_t> to_ins;
+    to_ins.first.assign(name.data(), name.size());
+    to_ins.second = name2id.size();
+    std::pair<Name2Id::iterator, bool> res(name2id.insert(to_ins));
+    if (res.second) {
+      // TODO this should be string pointers backed by the hash table.
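+      // (name2id already owns one copy of each name and id2name stores another;
+      // sharing that storage would roughly halve the per-feature string memory.)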
+      id2name.push_back(to_ins.first);
    }
+    m_id = res.first->second;
  }
}
+
+size_t FName::getId(const string& name)
+{
+  Name2Id::iterator i = name2id.find(name);
+  assert (i != name2id.end());
+  return i->second;
+}
+
+size_t FName::getHopeIdCount(const string& name)
+{
+  Name2Id::iterator i = name2id.find(name);
+  if (i != name2id.end()) {
+    float id = i->second;
+    return id2hopeCount[id];
+  }
+  return 0;
+}
+
+size_t FName::getFearIdCount(const string& name)
+{
+  Name2Id::iterator i = name2id.find(name);
+  if (i != name2id.end()) {
+    float id = i->second;
+    return id2fearCount[id];
+  }
+  return 0;
+}
+
+void FName::incrementHopeId(const string& name)
+{
+  Name2Id::iterator i = name2id.find(name);
+  assert(i != name2id.end());
+#ifdef WITH_THREADS
+  // get upgradable lock and upgrade to writer lock
+  boost::upgrade_lock<boost::shared_mutex> upgradeLock(m_idLock);
+  boost::upgrade_to_unique_lock<boost::shared_mutex> uniqueLock(upgradeLock);
+#endif
+  id2hopeCount[i->second] += 1;
+}
+
+void FName::incrementFearId(const string& name)
+{
+  Name2Id::iterator i = name2id.find(name);
+  assert(i != name2id.end());
+#ifdef WITH_THREADS
+  // get upgradable lock and upgrade to writer lock
+  boost::upgrade_lock<boost::shared_mutex> upgradeLock(m_idLock);
+  boost::upgrade_to_unique_lock<boost::shared_mutex> uniqueLock(upgradeLock);
+#endif
+  id2fearCount[i->second] += 1;
+}
+
+void FName::eraseId(size_t id)
+{
+#ifdef WITH_THREADS
+  // get upgradable lock and upgrade to writer lock
+  boost::upgrade_lock<boost::shared_mutex> upgradeLock(m_idLock);
+  boost::upgrade_to_unique_lock<boost::shared_mutex> uniqueLock(upgradeLock);
+#endif
+  id2hopeCount.erase(id);
+  id2fearCount.erase(id);
+}
+
+std::ostream& operator<<( std::ostream& out, const FName& name)
+{
+  out << name.name();
+  return out;
+}
+
+size_t FName::hash() const
+{
+  return boost::hash_value(m_id);
+}
+
+const std::string& FName::name() const
+{
+  return id2name[m_id];
+}
+
+
+bool FName::operator==(const FName& rhs) const
+{
+  return m_id == rhs.m_id;
+}
+
+bool FName::operator!=(const FName& rhs) const
+{
+  return ! (*this == rhs);
+}
+
+FVector::FVector(size_t coreFeatures) : m_coreFeatures(coreFeatures) {}
+
+void FVector::resize(size_t newsize)
+{
+  valarray<FValue> oldValues(m_coreFeatures);
+  m_coreFeatures.resize(newsize);
+  for (size_t i = 0; i < min(m_coreFeatures.size(), oldValues.size()); ++i) {
+    m_coreFeatures[i] = oldValues[i];
+  }
+}
+
+void FVector::clear()
+{
+  m_coreFeatures.resize(0);
+  m_features.clear();
+}
+
+bool FVector::load(const std::string& filename)
+{
+  clear();
+  ifstream in (filename.c_str());
+  if (!in) {
+    return false;
+  }
+  string line;
+  while(getline(in,line)) {
+    if (line[0] == '#') continue;
+    istringstream linestream(line);
+    string namestring;
+    FValue value;
+    linestream >> namestring;
+    linestream >> value;
+    FName fname(namestring);
+    //cerr << "Setting sparse weight " << fname << " to value " << value << "." << endl;
+    set(fname,value);
+  }
+  return true;
+}
+
+void FVector::save(const string& filename) const
+{
+  ofstream out(filename.c_str());
+  if (!out) {
+    ostringstream msg;
+    msg << "Unable to open " << filename;
+    throw runtime_error(msg.str());
+  }
+  write(out);
+  out.close();
+}
+
+void FVector::write(ostream& out) const
+{
+  for (const_iterator i = cbegin(); i != cend(); ++i) {
+    out << i->first << " " << i->second << endl;
+  }
+}
+
+static bool equalsTolerance(FValue lhs, FValue rhs)
+{
+  if (lhs == rhs) return true;
+  static const FValue TOLERANCE = 1e-4;
+  FValue diff = abs(lhs-rhs);
+  FValue mean = (abs(lhs)+abs(rhs))/2;
+  //cerr << "ET " << lhs << " " << rhs << " " << diff << " " << mean << " " << endl;
+  return diff/mean < TOLERANCE ;
+}
+
+bool FVector::operator== (const FVector& rhs) const
+{
+  if (this == &rhs) {
+    return true;
+  }
+  if (m_coreFeatures.size() != rhs.m_coreFeatures.size()) {
+    return false;
+  }
+  for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
+    if (!equalsTolerance(m_coreFeatures[i], rhs.m_coreFeatures[i])) return false;
+  }
+  for (const_iterator i = cbegin(); i != cend(); ++i) {
+    if (!equalsTolerance(i->second,rhs.get(i->first))) return false;
+  }
+  for (const_iterator i = rhs.cbegin(); i != rhs.cend(); ++i) {
+    if (!equalsTolerance(i->second, get(i->first))) return false;
+  }
+  return true;
+}
+
+bool FVector::operator!= (const FVector& rhs) const
+{
+  return ! (*this == rhs);
+}
+
+ProxyFVector FVector::operator[](const FName& name)
+{
+  // At this point, we don't know whether operator[] was called, so we return
+  // a proxy object and defer the decision until later
+  return ProxyFVector(this, name);
+}
+
+/** Equivalent for core features. */
+FValue& FVector::operator[](size_t index)
+{
+  return m_coreFeatures[index];
+}
+
+
+FValue FVector::operator[](const FName& name) const
+{
+  return get(name);
+}
+
+FValue FVector::operator[](size_t index) const
+{
+  return m_coreFeatures[index];
+}
+
+ostream& FVector::print(ostream& out) const
+{
+  out << "core=(";
+  for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
+    out << m_coreFeatures[i];
+    if (i + 1 < m_coreFeatures.size()) {
+      out << ",";
+    }
+  }
+  out << ") ";
+  for (const_iterator i = cbegin(); i != cend(); ++i) {
+    if (i != cbegin())
+      out << " ";
+    out << i->first << "=" << i->second;
+  }
+  return out;
+}
+
+ostream& operator<<(ostream& out, const FVector& fv)
+{
+  return fv.print(out);
+}
+
+const FValue& FVector::get(const FName& name) const
+{
+  static const FValue DEFAULT = 0;
+  const_iterator fi = m_features.find(name);
+  if (fi == m_features.end()) {
+    return DEFAULT;
+  } else {
+    return fi->second;
+  }
+}
+
+FValue FVector::getBackoff(const FName& name, float backoff) const
+{
+  const_iterator fi = m_features.find(name);
+  if (fi == m_features.end()) {
+    return backoff;
+  } else {
+    return fi->second;
+  }
+}
+
+void FVector::thresholdScale(FValue maxValue )
+{
+  FValue factor = 1.0;
+  for (const_iterator i = cbegin(); i != cend(); ++i) {
+    FValue value = i->second;
+    if (abs(value)*factor > maxValue) {
+      factor = abs(value) / maxValue;
+    }
+  }
+  operator*=(factor);
+}
+
+void FVector::capMax(FValue maxValue)
+{
+  for (const_iterator i = cbegin(); i != cend(); ++i)
+    if (i->second > maxValue)
+      set(i->first, maxValue);
+}
+
+void FVector::capMin(FValue minValue)
+{
+  for (const_iterator i = cbegin(); i != cend(); ++i)
+    if (i->second < minValue)
+      set(i->first, minValue);
+}
+
+void FVector::set(const FName& name, const FValue& value)
+{
+  m_features[name] = value;
+}
+
+void FVector::printCoreFeatures()
+{
+  cerr << "core=(";
+  for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
+    cerr << m_coreFeatures[i];
+    if (i + 1 < m_coreFeatures.size()) {
+      cerr << ",";
+    }
+  }
+  cerr << ") ";
+}
+
+FVector& FVector::operator+= (const FVector& rhs)
+{
+  if (rhs.m_coreFeatures.size() > m_coreFeatures.size())
+    resize(rhs.m_coreFeatures.size());
+  for (const_iterator i = rhs.cbegin(); i != rhs.cend(); ++i)
+    set(i->first, get(i->first) + i->second);
+  for (size_t i = 0; i < rhs.m_coreFeatures.size(); ++i)
+    m_coreFeatures[i] += rhs.m_coreFeatures[i];
+  return *this;
+}
+
+// add only sparse features
+void FVector::sparsePlusEquals(const FVector& rhs)
+{
+  for (const_iterator i = rhs.cbegin(); i != rhs.cend(); ++i)
+    set(i->first, get(i->first) + i->second);
+}
+
+// assign only core features
+void FVector::coreAssign(const FVector& rhs)
+{
+  for (size_t i = 0; i < rhs.m_coreFeatures.size(); ++i)
+    m_coreFeatures[i] = rhs.m_coreFeatures[i];
+}
+
+void FVector::incrementSparseHopeFeatures()
+{
+  for (const_iterator i = cbegin(); i != cend(); ++i)
+    FName::incrementHopeId((i->first).name());
+}
+
+void FVector::incrementSparseFearFeatures()
+{
+  for (const_iterator i = cbegin(); i != cend(); ++i)
+    FName::incrementFearId((i->first).name());
+}
+
+void FVector::printSparseHopeFeatureCounts(std::ofstream& out)
+{
+  for (const_iterator i = cbegin(); i != cend(); ++i)
+    out << (i->first).name() << ": " << FName::getHopeIdCount((i->first).name()) << std::endl;
+}
+
+void FVector::printSparseFearFeatureCounts(std::ofstream& out)
+{
+  for (const_iterator i = cbegin(); i != cend(); ++i)
+    out << (i->first).name() << ": " << FName::getFearIdCount((i->first).name()) << std::endl;
+}
+
+void FVector::printSparseHopeFeatureCounts()
+{
+  for (const_iterator i = cbegin(); i != cend(); ++i)
+    std::cerr << (i->first).name() << ": " << FName::getHopeIdCount((i->first).name()) << std::endl;
+}
+
+void FVector::printSparseFearFeatureCounts()
+{
+  for (const_iterator i = cbegin(); i != cend(); ++i)
+    std::cerr << (i->first).name() << ": " << FName::getFearIdCount((i->first).name()) << std::endl;
+}
+
+size_t FVector::pruneSparseFeatures(size_t threshold)
+{
+  size_t count = 0;
+  vector<FName> toErase;
+  for (const_iterator i = cbegin(); i != cend(); ++i) {
+    const std::string& fname = (i->first).name();
+    if (FName::getHopeIdCount(fname) < threshold && FName::getFearIdCount(fname) < threshold) {
+      toErase.push_back(i->first);
+      std::cerr << "pruning: " << fname << " (" << FName::getHopeIdCount(fname) << ", " << FName::getFearIdCount(fname) << ")" << std::endl;
+      FName::eraseId(FName::getId(fname));
+      ++count;
+    }
+  }
+
+  for (size_t i = 0; i < toErase.size(); ++i)
+    m_features.erase(toErase[i]);
+
+  return count;
+}
+
+size_t FVector::pruneZeroWeightFeatures()
+{
+  size_t count = 0;
+  vector<FName> toErase;
+  for (const_iterator i = cbegin(); i != cend(); ++i) {
+    const std::string& fname = (i->first).name();
+    if (i->second == 0) {
+      toErase.push_back(i->first);
+      //std::cerr << "prune: " << fname << std::endl;
+      FName::eraseId(FName::getId(fname));
+      ++count;
+    }
+  }
+
+  for (size_t i = 0; i < toErase.size(); ++i)
+    m_features.erase(toErase[i]);
+
+  return count;
+}
+
+void FVector::updateConfidenceCounts(const FVector& weightUpdate, bool signedCounts)
+{
+  for (size_t i = 0; i < weightUpdate.m_coreFeatures.size(); ++i) {
+    if (signedCounts) {
+      //int sign = weightUpdate.m_coreFeatures[i] >= 0 ? 1 : -1;
+      //m_coreFeatures[i] += (weightUpdate.m_coreFeatures[i] * weightUpdate.m_coreFeatures[i]) * sign;
+      m_coreFeatures[i] += weightUpdate.m_coreFeatures[i];
+    } else
+      //m_coreFeatures[i] += (weightUpdate.m_coreFeatures[i] * weightUpdate.m_coreFeatures[i]);
+      m_coreFeatures[i] += abs(weightUpdate.m_coreFeatures[i]);
+  }
+
+  for (const_iterator i = weightUpdate.cbegin(); i != weightUpdate.cend(); ++i) {
+    if (weightUpdate[i->first] == 0)
+      continue;
+    float value = get(i->first);
+    if (signedCounts) {
+      //int sign = weightUpdate[i->first] >= 0 ? 1 : -1;
+      //value += (weightUpdate[i->first] * weightUpdate[i->first]) * sign;
+      value += weightUpdate[i->first];
+    } else
+      //value += (weightUpdate[i->first] * weightUpdate[i->first]);
+      value += abs(weightUpdate[i->first]);
+    set(i->first, value);
+  }
+}
+
+void FVector::updateLearningRates(float decay_core, float decay_sparse, const FVector &confidenceCounts, float core_r0, float sparse_r0)
+{
+  for (size_t i = 0; i < confidenceCounts.m_coreFeatures.size(); ++i) {
+    m_coreFeatures[i] = 1.0/(1.0/core_r0 + decay_core * abs(confidenceCounts.m_coreFeatures[i]));
+  }
+
+  for (const_iterator i = confidenceCounts.cbegin(); i != confidenceCounts.cend(); ++i) {
+    float value = 1.0/(1.0/sparse_r0 + decay_sparse * abs(i->second));
+    set(i->first, value);
+  }
+}
+
+// count non-zero occurrences for all sparse features
+void FVector::setToBinaryOf(const FVector& rhs)
+{
+  for (const_iterator i = rhs.cbegin(); i != rhs.cend(); ++i)
+    if (rhs.get(i->first) != 0)
+      set(i->first, 1);
+  for (size_t i = 0; i < rhs.m_coreFeatures.size(); ++i)
+    m_coreFeatures[i] = 1;
+}
+
+// divide only core features by scalar
+FVector& FVector::coreDivideEquals(float scalar)
+{
+  for (size_t i = 0; i < m_coreFeatures.size(); ++i)
+    m_coreFeatures[i] /= scalar;
+  return *this;
+}
+
+// lhs vector is a sum of vectors, rhs vector holds number of non-zero summands
+FVector& FVector::divideEquals(const FVector& rhs)
+{
+  assert(m_coreFeatures.size() == rhs.m_coreFeatures.size());
+  for (const_iterator i = rhs.cbegin(); i != rhs.cend(); ++i)
+    set(i->first, get(i->first)/rhs.get(i->first)); // divide by number of summands
+  for (size_t i = 0; i < rhs.m_coreFeatures.size(); ++i)
+    m_coreFeatures[i] /= rhs.m_coreFeatures[i]; // divide by number of summands
+  return *this;
+}
+
+FVector& FVector::operator-= (const FVector& rhs)
+{
+  if (rhs.m_coreFeatures.size() > m_coreFeatures.size())
+    resize(rhs.m_coreFeatures.size());
+  for (const_iterator i = rhs.cbegin(); i != rhs.cend(); ++i)
+    set(i->first, get(i->first) -(i->second));
+  for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
+    if (i < rhs.m_coreFeatures.size()) {
+      m_coreFeatures[i] -= rhs.m_coreFeatures[i];
+    }
+  }
+  return *this;
+}
+
+FVector& FVector::operator*= (const FVector& rhs)
+{
+  if (rhs.m_coreFeatures.size() > m_coreFeatures.size()) {
+    resize(rhs.m_coreFeatures.size());
+  }
+  for (iterator i = begin(); i != end(); ++i) {
+    FValue lhsValue = i->second;
+    FValue rhsValue = rhs.get(i->first);
+    set(i->first,lhsValue*rhsValue);
+  }
+  for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
+    if (i < rhs.m_coreFeatures.size()) {
+      m_coreFeatures[i] *= rhs.m_coreFeatures[i];
+    } else {
+      m_coreFeatures[i] = 0;
+    }
+  }
+  return *this;
+}
+
+FVector& FVector::operator/= (const FVector& rhs)
+{
+  if (rhs.m_coreFeatures.size() > m_coreFeatures.size()) {
+    resize(rhs.m_coreFeatures.size());
+  }
+  for (iterator i = begin(); i != end(); ++i) {
+    FValue lhsValue = i->second;
+    FValue rhsValue = rhs.get(i->first);
+    set(i->first, lhsValue / rhsValue) ;
+  }
+  for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
+    if (i < rhs.m_coreFeatures.size()) {
+      m_coreFeatures[i] /= rhs.m_coreFeatures[i];
+    } else {
+      if (m_coreFeatures[i] < 0) {
+        m_coreFeatures[i] = -numeric_limits<float>::infinity();
+      } else if (m_coreFeatures[i] > 0) {
+        m_coreFeatures[i] = numeric_limits<float>::infinity();
+      }
+    }
+  }
+  return *this;
+}
+
+FVector& FVector::operator*= (const FValue& rhs)
+{
+  //NB Could do this with boost::bind ?
+  for (iterator i = begin(); i != end(); ++i) {
+    i->second *= rhs;
+  }
+  m_coreFeatures *= rhs;
+  return *this;
+}
+
+FVector& FVector::operator/= (const FValue& rhs)
+{
+  for (iterator i = begin(); i != end(); ++i) {
+    i->second /= rhs;
+  }
+  m_coreFeatures /= rhs;
+  return *this;
+}
+
+FVector& FVector::multiplyEqualsBackoff(const FVector& rhs, float backoff)
+{
+  if (rhs.m_coreFeatures.size() > m_coreFeatures.size()) {
+    resize(rhs.m_coreFeatures.size());
+  }
+  for (iterator i = begin(); i != end(); ++i) {
+    FValue lhsValue = i->second;
+    FValue rhsValue = rhs.getBackoff(i->first, backoff);
+    set(i->first,lhsValue*rhsValue);
+  }
+  for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
+    if (i < rhs.m_coreFeatures.size()) {
+      m_coreFeatures[i] *= rhs.m_coreFeatures[i];
+    } else {
+      m_coreFeatures[i] = 0;
+    }
+  }
+  return *this;
+}
+
+FVector& FVector::multiplyEquals(float core_r0, float sparse_r0)
+{
+  for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
+    m_coreFeatures[i] *= core_r0;
+  }
+  for (iterator i = begin(); i != end(); ++i)
+    set(i->first,(i->second)*sparse_r0);
+  return *this;
+}
+
+FValue FVector::l1norm() const
+{
+  FValue norm = 0;
+  for (const_iterator i = cbegin(); i != cend(); ++i) {
+    norm += abs(i->second);
+  }
+  for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
+    norm += abs(m_coreFeatures[i]);
+  }
+  return norm;
+}
+
+FValue FVector::l1norm_coreFeatures() const
+{
+  FValue norm = 0;
+  // ignore Bleu score feature (last feature)
+  for (size_t i = 0; i < m_coreFeatures.size()-1; ++i)
+    norm += abs(m_coreFeatures[i]);
+  return norm;
+}
+
+FValue FVector::l2norm() const
+{
+  return sqrt(inner_product(*this));
+}
+
+FValue FVector::linfnorm() const
+{
+  FValue norm = 0;
+  for (const_iterator i = cbegin(); i != cend(); ++i) {
+    float absValue = abs(i->second);
+    if (absValue > norm)
+      norm = absValue;
+  }
+  for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
+    float absValue = abs(m_coreFeatures[i]);
+    if (absValue > norm)
+      norm = absValue;
+  }
+  return norm;
+}
+
+size_t FVector::l1regularize(float lambda)
+{
+  for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
+    float value = m_coreFeatures[i];
+    if (value > 0) {
+      m_coreFeatures[i] = max(0.0f, value - lambda);
+    } else {
+      m_coreFeatures[i] = min(0.0f, value + lambda);
+    }
+  }
+
+  size_t numberPruned = size();
+  vector<FName> toErase;
+  for (iterator i = begin(); i != end(); ++i) {
+    float value = i->second;
+    if (value != 0.0f) {
+      if (value > 0)
+        value = max(0.0f, value - lambda);
+      else
+        value = min(0.0f, value + lambda);
+
+      if (value != 0.0f)
+        i->second = value;
+      else {
+        toErase.push_back(i->first);
+        const std::string& fname = (i->first).name();
+        FName::eraseId(FName::getId(fname));
+      }
+    }
+  }
+
+  // erase features that have become zero
+  for (size_t i = 0; i < toErase.size(); ++i)
+    m_features.erase(toErase[i]);
+  numberPruned -= size();
+  return numberPruned;
+}
+
+void FVector::l2regularize(float lambda)
+{
+  for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
+    m_coreFeatures[i] *= (1 - lambda);
+  }
+
+  for (iterator i = begin(); i != end(); ++i) {
+    i->second *= (1 - lambda);
+  }
+}
+
+size_t FVector::sparseL1regularize(float lambda)
+{
+  /*for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
+    float value = m_coreFeatures[i];
+    if (value > 0) {
+      m_coreFeatures[i] = max(0.0f, value - lambda);
+    }
+    else {
+      m_coreFeatures[i] = min(0.0f, value + lambda);
+    }
+    }*/
+
+  size_t numberPruned = size();
+  vector<FName> toErase;
+  for (iterator i = begin(); i != end(); ++i) {
+    float value = i->second;
+    if (value != 0.0f) {
+      if (value > 0)
+        value = max(0.0f, value - lambda);
+      else
+        value = min(0.0f, value + lambda);
+
+      if (value != 0.0f)
+        i->second = value;
+      else {
+        toErase.push_back(i->first);
+        const std::string& fname = (i->first).name();
+        FName::eraseId(FName::getId(fname));
+      }
+    }
+  }
+
+  // erase features that have become zero
+  for (size_t i = 0; i < toErase.size(); ++i)
+    m_features.erase(toErase[i]);
+  numberPruned -= size();
+  return numberPruned;
+}
+
+void FVector::sparseL2regularize(float lambda)
+{
+  /*for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
+    m_coreFeatures[i] *= (1 - lambda);
+    }*/
+
+  for (iterator i = begin(); i != end(); ++i) {
+    i->second *= (1 - lambda);
+  }
+}
+
+FValue FVector::sum() const
+{
+  FValue sum = 0;
+  for (const_iterator i = cbegin(); i != cend(); ++i) {
+    sum += i->second;
+  }
+  sum += m_coreFeatures.sum();
+  return sum;
+}
+
+FValue FVector::inner_product(const FVector& rhs) const
+{
+  CHECK(m_coreFeatures.size() == rhs.m_coreFeatures.size());
+  FValue product = 0.0;
+  for (const_iterator i = cbegin(); i != cend(); ++i) {
+    product += ((i->second)*(rhs.get(i->first)));
+  }
+  for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
+    product += m_coreFeatures[i]*rhs.m_coreFeatures[i];
+  }
+  return product;
+}
+
+const FVector operator+(const FVector& lhs, const FVector& rhs)
+{
+  return FVector(lhs) += rhs;
+}
+
+const FVector operator-(const FVector& lhs, const FVector& rhs)
+{
+  return FVector(lhs) -= rhs;
+}
+
+const FVector operator*(const FVector& lhs, const FVector& rhs)
+{
+  return FVector(lhs) *= rhs;
+}
+
+const FVector operator/(const FVector& lhs, const FVector& rhs)
+{
+  return FVector(lhs) /= rhs;
+}
+
+
+const FVector operator*(const FVector& lhs, const FValue& rhs)
+{
+  return FVector(lhs) *= rhs;
+}
+
+const FVector operator/(const FVector& lhs, const FValue& rhs)
+{
+  return FVector(lhs) /= rhs;
+}
+
+FValue inner_product(const FVector& lhs, const FVector& rhs)
+{
+  if (lhs.size() >= rhs.size()) {
+    return rhs.inner_product(lhs);
+  } else {
+    return lhs.inner_product(rhs);
+  }
+}
+}
diff --git a/moses/FeatureVector.h b/moses/FeatureVector.h
index 9c15ba4f7..f4261b520 100644
--- a/moses/FeatureVector.h
+++ b/moses/FeatureVector.h
@@ -1,21 +1,21 @@
 /*
 Moses - factored phrase-based language decoder
 Copyright (C) 2010 University of Edinburgh
-
+
 This program is free software; you can redistribute it and/or
 modify it under the terms of the GNU General Public License
 as published by the Free Software Foundation; either version 2
 of the License, or (at your option) any later version.
-
+
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
-
+
 You should have received a copy of the GNU General Public License
 along with this program; if not, write to the Free Software
 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
- + */ #pragma once @@ -47,325 +47,336 @@ #include "util/check.hh" #include "util/string_piece.hh" -namespace Moses { - - typedef float FValue; - - /** - * Feature name - **/ - struct FName { - - static const std::string SEP; - - typedef boost::unordered_map Name2Id; - typedef boost::unordered_map Id2Count; - //typedef std::map Name2Id; - static Name2Id name2id; - static std::vector id2name; - static Id2Count id2hopeCount; - static Id2Count id2fearCount; - - //A feature name can either be initialised as a pair of strings, - //which will be concatenated with a SEP between them, or as - //a single string, which will be used as-is. - FName(const StringPiece &root, const StringPiece &name) { - std::string assembled(root.data(), root.size()); - assembled += SEP; - assembled.append(name.data(), name.size()); - init(assembled); - } - explicit FName(const StringPiece &name) - {init(name);} - - const std::string& name() const; - //const std::string& root() const {return m_root;} - - size_t hash() const; +namespace Moses +{ - bool operator==(const FName& rhs) const ; - bool operator!=(const FName& rhs) const ; - - static size_t getId(const std::string& name); - static size_t getHopeIdCount(const std::string& name); - static size_t getFearIdCount(const std::string& name); - static void incrementHopeId(const std::string& name); - static void incrementFearId(const std::string& name); - static void eraseId(size_t id); - - private: - void init(const StringPiece& name); - size_t m_id; +typedef float FValue; + +/** + * Feature name + **/ +struct FName { + + static const std::string SEP; + + typedef boost::unordered_map Name2Id; + typedef boost::unordered_map Id2Count; + //typedef std::map Name2Id; + static Name2Id name2id; + static std::vector id2name; + static Id2Count id2hopeCount; + static Id2Count id2fearCount; + + //A feature name can either be initialised as a pair of strings, + //which will be concatenated with a SEP between them, or as + //a single string, which will be used as-is. + FName(const StringPiece &root, const StringPiece &name) { + std::string assembled(root.data(), root.size()); + assembled += SEP; + assembled.append(name.data(), name.size()); + init(assembled); + } + explicit FName(const StringPiece &name) { + init(name); + } + + const std::string& name() const; + //const std::string& root() const {return m_root;} + + size_t hash() const; + + bool operator==(const FName& rhs) const ; + bool operator!=(const FName& rhs) const ; + + static size_t getId(const std::string& name); + static size_t getHopeIdCount(const std::string& name); + static size_t getFearIdCount(const std::string& name); + static void incrementHopeId(const std::string& name); + static void incrementFearId(const std::string& name); + static void eraseId(size_t id); + +private: + void init(const StringPiece& name); + size_t m_id; #ifdef WITH_THREADS - //reader-writer lock - static boost::shared_mutex m_idLock; + //reader-writer lock + static boost::shared_mutex m_idLock; #endif - }; - - std::ostream& operator<<(std::ostream& out,const FName& name); - - struct FNameEquals { - inline bool operator() (const FName& lhs, const FName& rhs) const { - return (lhs == rhs); - } - }; - - struct FNameHash - : std::unary_function - { - std::size_t operator()(const FName& x) const - { - return x.hash(); - } - }; - - class ProxyFVector; - - /** - * A sparse feature (or weight) vector. 
- **/ - class FVector - { - public: - /** Empty feature vector */ - FVector(size_t coreFeatures = 0); +}; - FVector& operator=( const FVector& rhs ) { - m_features = rhs.m_features; - m_coreFeatures = rhs.m_coreFeatures; - return *this; - } +std::ostream& operator<<(std::ostream& out,const FName& name); - /* - * Change the number of core features - **/ - void resize(size_t newsize); +struct FNameEquals { + inline bool operator() (const FName& lhs, const FName& rhs) const { + return (lhs == rhs); + } +}; - typedef boost::unordered_map FNVmap; - /** Iterators */ - typedef FNVmap::iterator iterator; - typedef FNVmap::const_iterator const_iterator; - iterator begin() {return m_features.begin();} - iterator end() {return m_features.end();} - const_iterator cbegin() const {return m_features.cbegin();} - const_iterator cend() const {return m_features.cend();} - - bool hasNonDefaultValue(FName name) const { return m_features.find(name) != m_features.end();} - void clear(); - - - /** Load from file - each line should be 'root[_name] value' */ - bool load(const std::string& filename); - void save(const std::string& filename) const; - void write(std::ostream& out) const ; - - /** Element access */ - ProxyFVector operator[](const FName& name); - FValue& operator[](size_t index); - FValue operator[](const FName& name) const; - FValue operator[](size_t index) const; +struct FNameHash + : std::unary_function { + std::size_t operator()(const FName& x) const { + return x.hash(); + } +}; - /** Size */ - size_t size() const { - return m_features.size() + m_coreFeatures.size(); - } +class ProxyFVector; - size_t coreSize() const { - return m_coreFeatures.size(); - } - - const std::valarray &getCoreFeatures() const { - return m_coreFeatures; - } - - /** Equality */ - bool operator== (const FVector& rhs) const; - bool operator!= (const FVector& rhs) const; +/** + * A sparse feature (or weight) vector. + **/ +class FVector +{ +public: + /** Empty feature vector */ + FVector(size_t coreFeatures = 0); - FValue inner_product(const FVector& rhs) const; - - friend class ProxyFVector; - - /**arithmetic */ - //Element-wise - //If one side has fewer core features, take the missing ones to be 0. 
- FVector& operator+= (const FVector& rhs); - FVector& operator-= (const FVector& rhs); - FVector& operator*= (const FVector& rhs); - FVector& operator/= (const FVector& rhs); - //Scalar - FVector& operator*= (const FValue& rhs); - FVector& operator/= (const FValue& rhs); - - FVector& multiplyEqualsBackoff(const FVector& rhs, float backoff); - FVector& multiplyEquals(float core_r0, float sparse_r0); + FVector& operator=( const FVector& rhs ) { + m_features = rhs.m_features; + m_coreFeatures = rhs.m_coreFeatures; + return *this; + } - FVector& max_equals(const FVector& rhs); + /* + * Change the number of core features + **/ + void resize(size_t newsize); - /** norms and sums */ - FValue l1norm() const; - FValue l1norm_coreFeatures() const; - FValue l2norm() const; - FValue linfnorm() const; - size_t l1regularize(float lambda); - void l2regularize(float lambda); - size_t sparseL1regularize(float lambda); - void sparseL2regularize(float lambda); - FValue sum() const; - - /** pretty printing */ - std::ostream& print(std::ostream& out) const; + typedef boost::unordered_map FNVmap; + /** Iterators */ + typedef FNVmap::iterator iterator; + typedef FNVmap::const_iterator const_iterator; + iterator begin() { + return m_features.begin(); + } + iterator end() { + return m_features.end(); + } + const_iterator cbegin() const { + return m_features.cbegin(); + } + const_iterator cend() const { + return m_features.cend(); + } - /** additional */ - void printCoreFeatures(); - //scale so that abs. value is less than maxvalue - void thresholdScale(float maxValue ); + bool hasNonDefaultValue(FName name) const { + return m_features.find(name) != m_features.end(); + } + void clear(); - void capMax(FValue maxValue); - void capMin(FValue minValue); - void sparsePlusEquals(const FVector& rhs); - void coreAssign(const FVector& rhs); - - void incrementSparseHopeFeatures(); - void incrementSparseFearFeatures(); - void printSparseHopeFeatureCounts(std::ofstream& out); - void printSparseFearFeatureCounts(std::ofstream& out); - void printSparseHopeFeatureCounts(); - void printSparseFearFeatureCounts(); - size_t pruneSparseFeatures(size_t threshold); - size_t pruneZeroWeightFeatures(); - void updateConfidenceCounts(const FVector& weightUpdate, bool signedCounts); - void updateLearningRates(float decay_core, float decay_sparse, const FVector& confidence_counts, float core_r0, float sparse_r0); - - // vector which, for each element of the original vector, reflects whether an element is zero or non-zero - void setToBinaryOf(const FVector& rhs); + /** Load from file - each line should be 'root[_name] value' */ + bool load(const std::string& filename); + void save(const std::string& filename) const; + void write(std::ostream& out) const ; - // divide only core features by scalar - FVector& coreDivideEquals(float scalar); + /** Element access */ + ProxyFVector operator[](const FName& name); + FValue& operator[](size_t index); + FValue operator[](const FName& name) const; + FValue operator[](size_t index) const; - // divide each element by the number given in the rhs vector - FVector& divideEquals(const FVector& rhs); + /** Size */ + size_t size() const { + return m_features.size() + m_coreFeatures.size(); + } + + size_t coreSize() const { + return m_coreFeatures.size(); + } + + const std::valarray &getCoreFeatures() const { + return m_coreFeatures; + } + + /** Equality */ + bool operator== (const FVector& rhs) const; + bool operator!= (const FVector& rhs) const; + + FValue inner_product(const FVector& rhs) const; + + friend 
class ProxyFVector; + + /**arithmetic */ + //Element-wise + //If one side has fewer core features, take the missing ones to be 0. + FVector& operator+= (const FVector& rhs); + FVector& operator-= (const FVector& rhs); + FVector& operator*= (const FVector& rhs); + FVector& operator/= (const FVector& rhs); + //Scalar + FVector& operator*= (const FValue& rhs); + FVector& operator/= (const FValue& rhs); + + FVector& multiplyEqualsBackoff(const FVector& rhs, float backoff); + FVector& multiplyEquals(float core_r0, float sparse_r0); + + FVector& max_equals(const FVector& rhs); + + /** norms and sums */ + FValue l1norm() const; + FValue l1norm_coreFeatures() const; + FValue l2norm() const; + FValue linfnorm() const; + size_t l1regularize(float lambda); + void l2regularize(float lambda); + size_t sparseL1regularize(float lambda); + void sparseL2regularize(float lambda); + FValue sum() const; + + /** pretty printing */ + std::ostream& print(std::ostream& out) const; + + /** additional */ + void printCoreFeatures(); + //scale so that abs. value is less than maxvalue + void thresholdScale(float maxValue ); + + void capMax(FValue maxValue); + void capMin(FValue minValue); + + void sparsePlusEquals(const FVector& rhs); + void coreAssign(const FVector& rhs); + + void incrementSparseHopeFeatures(); + void incrementSparseFearFeatures(); + void printSparseHopeFeatureCounts(std::ofstream& out); + void printSparseFearFeatureCounts(std::ofstream& out); + void printSparseHopeFeatureCounts(); + void printSparseFearFeatureCounts(); + size_t pruneSparseFeatures(size_t threshold); + size_t pruneZeroWeightFeatures(); + void updateConfidenceCounts(const FVector& weightUpdate, bool signedCounts); + void updateLearningRates(float decay_core, float decay_sparse, const FVector& confidence_counts, float core_r0, float sparse_r0); + + // vector which, for each element of the original vector, reflects whether an element is zero or non-zero + void setToBinaryOf(const FVector& rhs); + + // divide only core features by scalar + FVector& coreDivideEquals(float scalar); + + // divide each element by the number given in the rhs vector + FVector& divideEquals(const FVector& rhs); #ifdef MPI_ENABLE - friend class boost::serialization::access; -#endif - - private: - - /** Internal get and set. 
*/ - const FValue& get(const FName& name) const; - FValue getBackoff(const FName& name, float backoff) const; - void set(const FName& name, const FValue& value); - - FNVmap m_features; - std::valarray m_coreFeatures; - -#ifdef MPI_ENABLE - //serialization - template - void save(Archive &ar, const unsigned int version) const { - std::vector names; - std::vector values; - for (const_iterator i = cbegin(); i != cend(); ++i) { - std::ostringstream ostr; - ostr << i->first; - names.push_back(ostr.str()); - values.push_back(i->second); - } - ar << names; - ar << values; - ar << m_coreFeatures; - } - - template - void load(Archive &ar, const unsigned int version) { - clear(); - std::vector names; - std::vector values; - ar >> names; - ar >> values; - ar >> m_coreFeatures; - CHECK(names.size() == values.size()); - for (size_t i = 0; i < names.size(); ++i) { - set(FName(names[i]), values[i]); - } - } - - BOOST_SERIALIZATION_SPLIT_MEMBER() - + friend class boost::serialization::access; #endif - - }; - - std::ostream& operator<<( std::ostream& out, const FVector& fv); - //Element-wise operations - const FVector operator+(const FVector& lhs, const FVector& rhs); - const FVector operator-(const FVector& lhs, const FVector& rhs); - const FVector operator*(const FVector& lhs, const FVector& rhs); - const FVector operator/(const FVector& lhs, const FVector& rhs); - - //Scalar operations - const FVector operator*(const FVector& lhs, const FValue& rhs); - const FVector operator/(const FVector& lhs, const FValue& rhs); - - const FVector fvmax(const FVector& lhs, const FVector& rhs); - - FValue inner_product(const FVector& lhs, const FVector& rhs); - - struct FVectorPlus { - FVector operator()(const FVector& lhs, const FVector& rhs) const { - return lhs + rhs; - } - }; - - /** - * Used to help with subscript operator overloading. - * See http://stackoverflow.com/questions/1386075/overloading-operator-for-a-sparse-vector - **/ - class ProxyFVector { - public: - ProxyFVector(FVector *fv, const FName& name ) : m_fv(fv), m_name(name) {} - ProxyFVector &operator=(const FValue& value) { - // If we get here, we know that operator[] was called to perform a write access, - // so we can insert an item in the vector if needed - //std::cerr << "Inserting " << value << " into " << m_name << std::endl; - m_fv->set(m_name,value); - return *this; - - } - - operator FValue() { - // If we get here, we know that operator[] was called to perform a read access, - // so we can simply return the value from the vector - return m_fv->get(m_name); - } - - /*operator FValue&() { - return m_fv->m_features[m_name]; - }*/ - - FValue operator++() { - return ++m_fv->m_features[m_name]; - } - - FValue operator +=(FValue lhs) { - return (m_fv->m_features[m_name] += lhs); - } - - FValue operator -=(FValue lhs) { - return (m_fv->m_features[m_name] -= lhs); - } - private: - FValue m_tmp; - - private: - FVector* m_fv; - const FName& m_name; - - }; - +private: + + /** Internal get and set. 
*/ + const FValue& get(const FName& name) const; + FValue getBackoff(const FName& name, float backoff) const; + void set(const FName& name, const FValue& value); + + FNVmap m_features; + std::valarray m_coreFeatures; + +#ifdef MPI_ENABLE + //serialization + template + void save(Archive &ar, const unsigned int version) const { + std::vector names; + std::vector values; + for (const_iterator i = cbegin(); i != cend(); ++i) { + std::ostringstream ostr; + ostr << i->first; + names.push_back(ostr.str()); + values.push_back(i->second); + } + ar << names; + ar << values; + ar << m_coreFeatures; + } + + template + void load(Archive &ar, const unsigned int version) { + clear(); + std::vector names; + std::vector values; + ar >> names; + ar >> values; + ar >> m_coreFeatures; + CHECK(names.size() == values.size()); + for (size_t i = 0; i < names.size(); ++i) { + set(FName(names[i]), values[i]); + } + } + + BOOST_SERIALIZATION_SPLIT_MEMBER() + +#endif + +}; + +std::ostream& operator<<( std::ostream& out, const FVector& fv); +//Element-wise operations +const FVector operator+(const FVector& lhs, const FVector& rhs); +const FVector operator-(const FVector& lhs, const FVector& rhs); +const FVector operator*(const FVector& lhs, const FVector& rhs); +const FVector operator/(const FVector& lhs, const FVector& rhs); + +//Scalar operations +const FVector operator*(const FVector& lhs, const FValue& rhs); +const FVector operator/(const FVector& lhs, const FValue& rhs); + +const FVector fvmax(const FVector& lhs, const FVector& rhs); + +FValue inner_product(const FVector& lhs, const FVector& rhs); + +struct FVectorPlus { + FVector operator()(const FVector& lhs, const FVector& rhs) const { + return lhs + rhs; + } +}; + +/** + * Used to help with subscript operator overloading. 
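The save/load pair above is the standard Boost split-member idiom: BOOST_SERIALIZATION_SPLIT_MEMBER() generates a serialize() that dispatches to save() for output archives and load() for input archives. A self-contained sketch of the same idiom; Toy is a hypothetical type, and unlike FVector it does no FName rebuilding on load:

    #include <boost/serialization/access.hpp>
    #include <boost/serialization/split_member.hpp>
    #include <boost/serialization/string.hpp>
    #include <boost/serialization/vector.hpp>
    #include <string>
    #include <vector>

    class Toy {
      friend class boost::serialization::access;

      std::vector<std::string> m_names;
      std::vector<float> m_values;

      // Called for output archives.
      template <class Archive>
      void save(Archive& ar, const unsigned int /*version*/) const {
        ar << m_names;
        ar << m_values;
      }
      // Called for input archives; free to rebuild derived state here.
      template <class Archive>
      void load(Archive& ar, const unsigned int /*version*/) {
        ar >> m_names;
        ar >> m_values;
      }
      BOOST_SERIALIZATION_SPLIT_MEMBER()
    };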
+ * See http://stackoverflow.com/questions/1386075/overloading-operator-for-a-sparse-vector + **/ +class ProxyFVector +{ +public: + ProxyFVector(FVector *fv, const FName& name ) : m_fv(fv), m_name(name) {} + ProxyFVector &operator=(const FValue& value) { + // If we get here, we know that operator[] was called to perform a write access, + // so we can insert an item in the vector if needed + //std::cerr << "Inserting " << value << " into " << m_name << std::endl; + m_fv->set(m_name,value); + return *this; + + } + + operator FValue() { + // If we get here, we know that operator[] was called to perform a read access, + // so we can simply return the value from the vector + return m_fv->get(m_name); + } + + /*operator FValue&() { + return m_fv->m_features[m_name]; + }*/ + + FValue operator++() { + return ++m_fv->m_features[m_name]; + } + + FValue operator +=(FValue lhs) { + return (m_fv->m_features[m_name] += lhs); + } + + FValue operator -=(FValue lhs) { + return (m_fv->m_features[m_name] -= lhs); + } + +private: + FValue m_tmp; + +private: + FVector* m_fv; + const FName& m_name; + +}; + } #endif diff --git a/moses/FeatureVectorTest.cpp b/moses/FeatureVectorTest.cpp index af1829e62..2e00b276e 100644 --- a/moses/FeatureVectorTest.cpp +++ b/moses/FeatureVectorTest.cpp @@ -28,41 +28,49 @@ static const float TOL = 0.00001; BOOST_AUTO_TEST_SUITE(fv) -BOOST_AUTO_TEST_CASE(vector_sum_diff) +BOOST_AUTO_TEST_CASE(vector_sum_diff) { FVector f1,f2,f3; FName n1("a"); FName n2("b"); FName n3("c"); FName n4("d"); - f1[n1] = 1.2; f1[n2] = 1.4; f1[n3] = -0.1; - f2[n1] = 0.01; f2[n3] = 5.6; f2[n4] = 0.6; + f1[n1] = 1.2; + f1[n2] = 1.4; + f1[n3] = -0.1; + f2[n1] = 0.01; + f2[n3] = 5.6; + f2[n4] = 0.6; f3[n1] =1.2; FVector sum = f1 + f2; FVector diff = f1 - f2; - BOOST_CHECK_CLOSE((FValue)sum[n1], 1.21, TOL); - BOOST_CHECK_CLOSE((FValue)sum[n2], 1.4, TOL); - BOOST_CHECK_CLOSE((FValue)sum[n3], 5.5, TOL); - BOOST_CHECK_CLOSE((FValue)sum[n4], 0.6, TOL); - BOOST_CHECK_CLOSE((FValue)diff[n1], 1.19, TOL); - BOOST_CHECK_CLOSE((FValue)diff[n2], 1.4, TOL); - BOOST_CHECK_CLOSE((FValue)diff[n3], -5.7, TOL); - BOOST_CHECK_CLOSE((FValue)diff[n4], -0.6, TOL); + BOOST_CHECK_CLOSE((FValue)sum[n1], 1.21, TOL); + BOOST_CHECK_CLOSE((FValue)sum[n2], 1.4, TOL); + BOOST_CHECK_CLOSE((FValue)sum[n3], 5.5, TOL); + BOOST_CHECK_CLOSE((FValue)sum[n4], 0.6, TOL); + BOOST_CHECK_CLOSE((FValue)diff[n1], 1.19, TOL); + BOOST_CHECK_CLOSE((FValue)diff[n2], 1.4, TOL); + BOOST_CHECK_CLOSE((FValue)diff[n3], -5.7, TOL); + BOOST_CHECK_CLOSE((FValue)diff[n4], -0.6, TOL); f1 -= f3; cerr << f1 << endl << f3 << endl ; BOOST_CHECK_CLOSE((FValue)f1[n1],0,TOL); } -BOOST_AUTO_TEST_CASE(scalar) +BOOST_AUTO_TEST_CASE(scalar) { FVector f1,f2; FName n1("a"); FName n2("b"); FName n3("c"); FName n4("d"); - f1[n1] = 0.2; f1[n2] = 9.178; f1[n3] = -0.1; - f2[n1] = 0.01; f2[n3] = 5.6; f2[n4] = 0.6; + f1[n1] = 0.2; + f1[n2] = 9.178; + f1[n3] = -0.1; + f2[n1] = 0.01; + f2[n3] = 5.6; + f2[n4] = 0.6; FVector prod1 = f1 * 2; FVector prod2 = f1 * -0.1; FVector quot = f2 / 2; @@ -80,12 +88,13 @@ BOOST_AUTO_TEST_CASE(scalar) BOOST_CHECK_CLOSE((FValue)quot[n4], 0.3, TOL); } -BOOST_AUTO_TEST_CASE(inc) +BOOST_AUTO_TEST_CASE(inc) { FVector f1; FName n1("a"); FName n2("b"); - f1[n1] = 2.3; f1[n2] = -0.4; + f1[n1] = 2.3; + f1[n2] = -0.4; f1[n1]+=2; BOOST_CHECK_CLOSE((FValue)f1[n1], 4.3, TOL); BOOST_CHECK_CLOSE((FValue)f1[n2], -0.4, TOL); @@ -103,8 +112,13 @@ BOOST_AUTO_TEST_CASE(vector_mult) FName n2("b"); FName n3("c"); FName n4("d"); - f1[n1] = 0.2; f1[n2] = 9.178; f1[n3] = -0.1; 
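ProxyFVector above is this codebase's instance of the proxy trick from the linked Stack Overflow question: operator[] returns a proxy, so writes can be routed through set() while reads go through get() without inserting a default entry. A minimal standalone sketch of the pattern (SparseVec and Proxy are hypothetical names, not Moses types):

    #include <iostream>
    #include <map>
    #include <string>

    class SparseVec {
    public:
      class Proxy {
      public:
        Proxy(SparseVec* v, const std::string& k) : m_v(v), m_k(k) {}
        // Write access: operator[] appeared on the left of an assignment.
        Proxy& operator=(float value) {
          m_v->m_features[m_k] = value;
          return *this;
        }
        // Read access: the proxy converts to a value; nothing is inserted.
        operator float() const {
          std::map<std::string, float>::const_iterator i = m_v->m_features.find(m_k);
          return i == m_v->m_features.end() ? 0.0f : i->second;
        }
      private:
        SparseVec* m_v;
        std::string m_k;
      };
      friend class Proxy;
      Proxy operator[](const std::string& k) { return Proxy(this, k); }
    private:
      std::map<std::string, float> m_features;
    };

    int main() {
      SparseVec v;
      v["lm"] = 0.5f;                       // write path: inserts an entry
      std::cout << float(v["tm"]) << "\n";  // read path: prints 0, inserts nothing
    }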
- f2[n1] = 0.01; f2[n2] = 5.6; f2[n3] = 1; f2[n4] = 0.6; + f1[n1] = 0.2; + f1[n2] = 9.178; + f1[n3] = -0.1; + f2[n1] = 0.01; + f2[n2] = 5.6; + f2[n3] = 1; + f2[n4] = 0.6; FVector prod = f1 * f2; FVector quot = f1/f2; BOOST_CHECK_CLOSE((FValue)prod[n1], 0.002, TOL); @@ -118,7 +132,7 @@ BOOST_AUTO_TEST_CASE(vector_mult) BOOST_CHECK_CLOSE((FValue)quot[n4], 0, TOL); } -BOOST_AUTO_TEST_CASE(core) +BOOST_AUTO_TEST_CASE(core) { FVector f1(2); f1[0] = 1.3; @@ -127,7 +141,7 @@ BOOST_AUTO_TEST_CASE(core) BOOST_CHECK_CLOSE(f1[1],-1.9,TOL); f1[1] = 0.1; BOOST_CHECK_CLOSE(f1[1],0.1,TOL); - + BOOST_CHECK_EQUAL(f1.size(),2); f1[FName("a")] = 1.2; @@ -140,8 +154,13 @@ BOOST_AUTO_TEST_CASE(core_arith) FVector f2(2); FName n1("a"); FName n2("b"); - f1[0] = 1.1; f1[1] = 0.25; f1[n1] = 3.6; f1[n2] = -1.5; - f2[0] = 0.5; f2[1] = -0.1; f2[n1] = 1; + f1[0] = 1.1; + f1[1] = 0.25; + f1[n1] = 3.6; + f1[n2] = -1.5; + f2[0] = 0.5; + f2[1] = -0.1; + f2[n1] = 1; //vector ops FVector sum = f1+f2; @@ -172,9 +191,10 @@ BOOST_AUTO_TEST_CASE(core_arith) //with different length vectors FVector f3(2); FVector f4(1); - f3[0] = 2; f3[1] = -1; + f3[0] = 2; + f3[1] = -1; f4[0] = 5; - + FVector sum1 = f3 + f4; FVector sum2 = f4 + f3; BOOST_CHECK_EQUAL(sum1,sum2); @@ -200,14 +220,17 @@ BOOST_AUTO_TEST_CASE(core_arith) BOOST_CHECK_EQUAL(quot1[1], -numeric_limits::infinity()); BOOST_CHECK_CLOSE(quot2[0], 2.5, TOL); BOOST_CHECK_CLOSE(quot2[1], 0, TOL); - + } BOOST_AUTO_TEST_CASE(core_scalar) { FVector f1(3); FName n1("a"); - f1[0] = 1.5; f1[1] = 2.1; f1[2] = 4; f1[n1] = -0.5; + f1[0] = 1.5; + f1[1] = 2.1; + f1[2] = 4; + f1[n1] = -0.5; FVector prod = f1*2; FVector quot = f1/5; @@ -224,31 +247,41 @@ BOOST_AUTO_TEST_CASE(core_scalar) } -BOOST_AUTO_TEST_CASE(l1norm) +BOOST_AUTO_TEST_CASE(l1norm) { FVector f1(3); FName n1("a"); - f1[0] = 1.5; f1[1] = 2.1; f1[2] = 4; f1[n1] = -0.5; + f1[0] = 1.5; + f1[1] = 2.1; + f1[2] = 4; + f1[n1] = -0.5; FValue n = f1.l1norm(); BOOST_CHECK_CLOSE((FValue)n, abs(1.5)+abs(2.1)+abs(4)+abs(-0.5), TOL); } -BOOST_AUTO_TEST_CASE(sum) +BOOST_AUTO_TEST_CASE(sum) { FVector f1(3); FName n1("a"); FName n2("b"); - f1[0] = 1.5; f1[1] = 2.1; f1[2] = 4; f1[n1] = -0.5; f1[n2] = 2.7; + f1[0] = 1.5; + f1[1] = 2.1; + f1[2] = 4; + f1[n1] = -0.5; + f1[n2] = 2.7; FValue n = f1.sum(); BOOST_CHECK_CLOSE((FValue)n, 1.5+2.1+4-0.5+2.7, TOL); } -BOOST_AUTO_TEST_CASE(l2norm) +BOOST_AUTO_TEST_CASE(l2norm) { FVector f1(3); FName n1("a"); - f1[0] = 1.5; f1[1] = 2.1; f1[2] = 4; f1[n1] = -0.5; + f1[0] = 1.5; + f1[1] = 2.1; + f1[2] = 4; + f1[n1] = -0.5; FValue n = f1.l2norm(); BOOST_CHECK_CLOSE((FValue)n, sqrt((1.5*1.5)+(2.1*2.1)+(4*4)+(-0.5*-0.5)), TOL); } @@ -260,8 +293,14 @@ BOOST_AUTO_TEST_CASE(ip) FName n1("a"); FName n2("b"); FName n3("c"); - f1[0] = 1.1; f1[1] = -0.1; ; f1[n2] = -1.5; f1[n3] = 2.2; - f2[0] = 0.5; f2[1] = 0.25; f2[n1] = 1; f2[n3] = 2.4; + f1[0] = 1.1; + f1[1] = -0.1; ; + f1[n2] = -1.5; + f1[n3] = 2.2; + f2[0] = 0.5; + f2[1] = 0.25; + f2[n1] = 1; + f2[n3] = 2.4; FValue p1 = inner_product(f1,f2); FValue p2 = inner_product(f2,f1); BOOST_CHECK_CLOSE(p1,p2,TOL); diff --git a/moses/GenerationDictionary.cpp b/moses/GenerationDictionary.cpp index f9f418197..dbc0eedb3 100644 --- a/moses/GenerationDictionary.cpp +++ b/moses/GenerationDictionary.cpp @@ -35,7 +35,7 @@ namespace Moses { GenerationDictionary::GenerationDictionary(const std::string &line) -: DecodeFeature("Generation", line) + : DecodeFeature("Generation", line) { string filePath; @@ -44,8 +44,7 @@ GenerationDictionary::GenerationDictionary(const std::string &line) 
if (args[0] == "path") { filePath = args[1]; - } - else { + } else { //UserMessage::Add("Unknown argument " + args[0]); //abort(); } diff --git a/moses/GenerationDictionary.h b/moses/GenerationDictionary.h index 6a1e4de9a..b2aeb0d96 100644 --- a/moses/GenerationDictionary.h +++ b/moses/GenerationDictionary.h @@ -53,22 +53,21 @@ protected: public: GenerationDictionary(const std::string &line); - virtual ~GenerationDictionary(); - - //! load data file - bool Load(const std::string &filePath, FactorDirection direction); + virtual ~GenerationDictionary(); - /** number of unique input entries in the generation table. - * NOT the number of lines in the generation table - */ - size_t GetSize() const - { - return m_collection.size(); - } - /** returns a bag of output words, OutputWordCollection, for a particular input word. - * Or NULL if the input word isn't found. The search function used is the WordComparer functor - */ - const OutputWordCollection *FindWord(const Word &word) const; + //! load data file + bool Load(const std::string &filePath, FactorDirection direction); + + /** number of unique input entries in the generation table. + * NOT the number of lines in the generation table + */ + size_t GetSize() const { + return m_collection.size(); + } + /** returns a bag of output words, OutputWordCollection, for a particular input word. + * Or NULL if the input word isn't found. The search function used is the WordComparer functor + */ + const OutputWordCollection *FindWord(const Word &word) const; }; diff --git a/moses/Hypothesis.cpp b/moses/Hypothesis.cpp index 7bc3e6a75..50443904c 100644 --- a/moses/Hypothesis.cpp +++ b/moses/Hypothesis.cpp @@ -60,8 +60,8 @@ Hypothesis::Hypothesis(Manager& manager, InputType const& source, const TargetPh , m_arcList(NULL) , m_transOpt(NULL) , m_manager(manager) -, m_totalScore(0.0f) -, m_futureScore(0.0f) + , m_totalScore(0.0f) + , m_futureScore(0.0f) , m_id(m_manager.GetNextHypoId()) { @@ -248,20 +248,22 @@ int Hypothesis::RecombineCompare(const Hypothesis &compare) const } if (comp != 0) return comp; } - + return 0; } void Hypothesis::EvaluateWith(const StatefulFeatureFunction &sfff, - int state_idx) { + int state_idx) +{ m_ffStates[state_idx] = sfff.Evaluate( - *this, - m_prevHypo ? m_prevHypo->m_ffStates[state_idx] : NULL, - &m_scoreBreakdown); - + *this, + m_prevHypo ? 
m_prevHypo->m_ffStates[state_idx] : NULL, + &m_scoreBreakdown); + } -void Hypothesis::EvaluateWith(const StatelessFeatureFunction& slff) { +void Hypothesis::EvaluateWith(const StatelessFeatureFunction& slff) +{ slff.Evaluate(PhraseBasedFeatureContext(this), &m_scoreBreakdown); } @@ -280,14 +282,14 @@ void Hypothesis::CalcScore(const SquareMatrix &futureScore) // compute values of stateless feature functions that were not // cached in the translation option const vector& sfs = - StatelessFeatureFunction::GetStatelessFeatureFunctions(); + StatelessFeatureFunction::GetStatelessFeatureFunctions(); for (unsigned i = 0; i < sfs.size(); ++i) { - const StatelessFeatureFunction &ff = *sfs[i]; + const StatelessFeatureFunction &ff = *sfs[i]; EvaluateWith(ff); } const vector& ffs = - StatefulFeatureFunction::GetStatefulFeatureFunctions(); + StatefulFeatureFunction::GetStatefulFeatureFunctions(); for (unsigned i = 0; i < ffs.size(); ++i) { const StatefulFeatureFunction &ff = *ffs[i]; m_ffStates[i] = ff.Evaluate( diff --git a/moses/HypothesisStack.h b/moses/HypothesisStack.h index 26e6ed21b..0c3d4198f 100644 --- a/moses/HypothesisStack.h +++ b/moses/HypothesisStack.h @@ -11,7 +11,7 @@ namespace Moses class Manager; -/** abstract unique set of hypotheses that cover a certain number of words, +/** abstract unique set of hypotheses that cover a certain number of words, * ie. a stack in phrase-based decoding */ class HypothesisStack diff --git a/moses/Incremental.cpp b/moses/Incremental.cpp index 3eb66fb0e..e4159063c 100644 --- a/moses/Incremental.cpp +++ b/moses/Incremental.cpp @@ -19,90 +19,98 @@ #include -namespace Moses { -namespace Incremental { -namespace { +namespace Moses +{ +namespace Incremental +{ +namespace +{ // This is called by EdgeGenerator. Route hypotheses to separate vertices for -// each left hand side label, populating ChartCellLabelSet out. -template class HypothesisCallback { - private: - typedef search::VertexGenerator Gen; - public: - HypothesisCallback(search::ContextBase &context, Best &best, ChartCellLabelSet &out, boost::object_pool &vertex_pool) - : context_(context), best_(best), out_(out), vertex_pool_(vertex_pool) {} +// each left hand side label, populating ChartCellLabelSet out. +template class HypothesisCallback +{ +private: + typedef search::VertexGenerator Gen; +public: + HypothesisCallback(search::ContextBase &context, Best &best, ChartCellLabelSet &out, boost::object_pool &vertex_pool) + : context_(context), best_(best), out_(out), vertex_pool_(vertex_pool) {} - void NewHypothesis(search::PartialEdge partial) { - // Get the LHS, look it up in the output ChartCellLabel, and upcast it. - // It's not part of the union because it would have been ugly to expose template types in ChartCellLabel. - ChartCellLabel::Stack &stack = out_.FindOrInsert(static_cast(partial.GetNote().vp)->GetTargetLHS()); - Gen *entry = static_cast(stack.incr_generator); - if (!entry) { - entry = generator_pool_.construct(context_, *vertex_pool_.construct(), best_); - stack.incr_generator = entry; - } - entry->NewHypothesis(partial); + void NewHypothesis(search::PartialEdge partial) { + // Get the LHS, look it up in the output ChartCellLabel, and upcast it. + // It's not part of the union because it would have been ugly to expose template types in ChartCellLabel. 
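The lookup that follows is the heart of this callback: one generator per left-hand-side label, created lazily and owned by an object pool. In isolation the pattern looks like the sketch below; Router and Generator are hypothetical stand-ins, not the real VertexGenerator/ChartCellLabelSet types:

    #include <map>
    #include <string>

    #include <boost/pool/object_pool.hpp>

    struct Generator {
      void NewHypothesis(int /*partial*/) { /* accumulate the hypothesis */ }
    };

    class Router {
    public:
      void Route(const std::string& lhs, int partial) {
        Generator*& entry = m_generators[lhs];   // NULL the first time a label is seen
        if (!entry) entry = m_pool.construct();  // lazily build the per-label generator
        entry->NewHypothesis(partial);
      }
    private:
      std::map<std::string, Generator*> m_generators;
      boost::object_pool<Generator> m_pool;      // destroys all generators at teardown
    };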
+ ChartCellLabel::Stack &stack = out_.FindOrInsert(static_cast(partial.GetNote().vp)->GetTargetLHS()); + Gen *entry = static_cast(stack.incr_generator); + if (!entry) { + entry = generator_pool_.construct(context_, *vertex_pool_.construct(), best_); + stack.incr_generator = entry; } + entry->NewHypothesis(partial); + } - void FinishedSearch() { - for (ChartCellLabelSet::iterator i(out_.mutable_begin()); i != out_.mutable_end(); ++i) { - ChartCellLabel::Stack &stack = i->second.MutableStack(); - Gen *gen = static_cast(stack.incr_generator); - gen->FinishedSearch(); - stack.incr = &gen->Generating(); - } + void FinishedSearch() { + for (ChartCellLabelSet::iterator i(out_.mutable_begin()); i != out_.mutable_end(); ++i) { + ChartCellLabel::Stack &stack = i->second.MutableStack(); + Gen *gen = static_cast(stack.incr_generator); + gen->FinishedSearch(); + stack.incr = &gen->Generating(); } + } - private: - search::ContextBase &context_; +private: + search::ContextBase &context_; - Best &best_; + Best &best_; - ChartCellLabelSet &out_; + ChartCellLabelSet &out_; - boost::object_pool &vertex_pool_; - boost::object_pool generator_pool_; + boost::object_pool &vertex_pool_; + boost::object_pool generator_pool_; }; // This is called by the moses parser to collect hypotheses. It converts to my -// edges (search::PartialEdge). -template class Fill : public ChartParserCallback { - public: - Fill(search::Context &context, const std::vector &vocab_mapping, search::Score oov_weight) - : context_(context), vocab_mapping_(vocab_mapping), oov_weight_(oov_weight) {} +// edges (search::PartialEdge). +template class Fill : public ChartParserCallback +{ +public: + Fill(search::Context &context, const std::vector &vocab_mapping, search::Score oov_weight) + : context_(context), vocab_mapping_(vocab_mapping), oov_weight_(oov_weight) {} - void Add(const TargetPhraseCollection &targets, const StackVec &nts, const WordsRange &ignored); + void Add(const TargetPhraseCollection &targets, const StackVec &nts, const WordsRange &ignored); - void AddPhraseOOV(TargetPhrase &phrase, std::list &waste_memory, const WordsRange &range); + void AddPhraseOOV(TargetPhrase &phrase, std::list &waste_memory, const WordsRange &range); - bool Empty() const { return edges_.Empty(); } + bool Empty() const { + return edges_.Empty(); + } - template void Search(Best &best, ChartCellLabelSet &out, boost::object_pool &vertex_pool) { - HypothesisCallback callback(context_, best, out, vertex_pool); - edges_.Search(context_, callback); - } + template void Search(Best &best, ChartCellLabelSet &out, boost::object_pool &vertex_pool) { + HypothesisCallback callback(context_, best, out, vertex_pool); + edges_.Search(context_, callback); + } - // Root: everything into one vertex. - template search::History RootSearch(Best &best) { - search::Vertex vertex; - search::RootVertexGenerator gen(vertex, best); - edges_.Search(context_, gen); - return vertex.BestChild(); - } + // Root: everything into one vertex. 
+ template search::History RootSearch(Best &best) { + search::Vertex vertex; + search::RootVertexGenerator gen(vertex, best); + edges_.Search(context_, gen); + return vertex.BestChild(); + } - private: - lm::WordIndex Convert(const Word &word) const; +private: + lm::WordIndex Convert(const Word &word) const; - search::Context &context_; + search::Context &context_; - const std::vector &vocab_mapping_; + const std::vector &vocab_mapping_; - search::EdgeGenerator edges_; + search::EdgeGenerator edges_; - const search::Score oov_weight_; + const search::Score oov_weight_; }; -template void Fill::Add(const TargetPhraseCollection &targets, const StackVec &nts, const WordsRange &) { +template void Fill::Add(const TargetPhraseCollection &targets, const StackVec &nts, const WordsRange &) +{ std::vector vertices; vertices.reserve(nts.size()); float below_score = 0.0; @@ -131,7 +139,7 @@ template void Fill::Add(const TargetPhraseCollection &targe } edge.SetScore(phrase.GetFutureScore() + below_score); - // prob and oov were already accounted for. + // prob and oov were already accounted for. search::ScoreRule(context_.LanguageModel(), words, edge.Between()); search::Note note; @@ -142,14 +150,15 @@ template void Fill::Add(const TargetPhraseCollection &targe } } -template void Fill::AddPhraseOOV(TargetPhrase &phrase, std::list &, const WordsRange &) { +template void Fill::AddPhraseOOV(TargetPhrase &phrase, std::list &, const WordsRange &) +{ std::vector words; CHECK(phrase.GetSize() <= 1); if (phrase.GetSize()) words.push_back(Convert(phrase.GetWord(0))); search::PartialEdge edge(edges_.AllocateEdge(0)); - // Appears to be a bug that FutureScore does not already include language model. + // Appears to be a bug that FutureScore does not already include language model. search::ScoreRuleRet scored(search::ScoreRule(context_.LanguageModel(), words, edge.Between())); edge.SetScore(phrase.GetFutureScore() + scored.prob * context_.LMWeight() + static_cast(scored.oov) * oov_weight_); @@ -160,8 +169,9 @@ template void Fill::AddPhraseOOV(TargetPhrase &phrase, std: edges_.AddEdge(edge); } -// TODO: factors (but chart doesn't seem to support factors anyway). -template lm::WordIndex Fill::Convert(const Word &word) const { +// TODO: factors (but chart doesn't seem to support factors anyway). +template lm::WordIndex Fill::Convert(const Word &word) const +{ std::size_t factor = word.GetFactor(0)->GetId(); return (factor >= vocab_mapping_.size() ? 0 : vocab_mapping_[factor]); } @@ -180,10 +190,12 @@ Manager::Manager(const InputType &source) : parser_(source, cells_), n_best_(search::NBestConfig(StaticData::Instance().GetNBestSize())) {} -Manager::~Manager() { +Manager::~Manager() +{ } -template search::History Manager::PopulateBest(const Model &model, const std::vector &words, Best &out) { +template search::History Manager::PopulateBest(const Model &model, const std::vector &words, Best &out) +{ const LanguageModel &abstract = LanguageModel::GetFirstLM(); const float oov_weight = abstract.OOVFeatureEnabled() ? 
abstract.GetOOVWeight() : 0.0; const StaticData &data = StaticData::Instance(); @@ -192,7 +204,7 @@ template search::History Manager::PopulateBest(const M size_t size = source_.GetSize(); boost::object_pool vertex_pool(std::max(size * size / 2, 32)); - + for (size_t width = 1; width < size; ++width) { for (size_t startPos = 0; startPos <= size-width; ++startPos) { WordsRange range(startPos, startPos + width - 1); @@ -208,7 +220,8 @@ template search::History Manager::PopulateBest(const M return filler.RootSearch(out); } -template void Manager::LMCallback(const Model &model, const std::vector &words) { +template void Manager::LMCallback(const Model &model, const std::vector &words) +{ std::size_t nbest = StaticData::Instance().GetNBestSize(); if (nbest <= 1) { search::History ret = PopulateBest(model, words, single_best_); @@ -237,12 +250,14 @@ template void Manager::LMCallback(const lm::ngram::Qu template void Manager::LMCallback(const lm::ngram::ArrayTrieModel &model, const std::vector &words); template void Manager::LMCallback(const lm::ngram::QuantArrayTrieModel &model, const std::vector &words); -const std::vector &Manager::ProcessSentence() { +const std::vector &Manager::ProcessSentence() +{ LanguageModel::GetFirstLM().IncrementalCallback(*this); return *completed_nbest_; } -namespace { +namespace +{ struct NoOp { void operator()(const TargetPhrase &) const {} @@ -254,7 +269,8 @@ struct AccumScore { } ScoreComponentCollection *out_; }; -template void AppendToPhrase(const search::Applied final, Phrase &out, Action action) { +template void AppendToPhrase(const search::Applied final, Phrase &out, Action action) +{ assert(final.Valid()); const TargetPhrase &phrase = *static_cast(final.GetNote().vp); action(phrase); @@ -271,23 +287,25 @@ template void AppendToPhrase(const search::Applied final, Phrase } // namespace -void ToPhrase(const search::Applied final, Phrase &out) { +void ToPhrase(const search::Applied final, Phrase &out) +{ out.Clear(); AppendToPhrase(final, out, NoOp()); } -void PhraseAndFeatures(const search::Applied final, Phrase &phrase, ScoreComponentCollection &features) { +void PhraseAndFeatures(const search::Applied final, Phrase &phrase, ScoreComponentCollection &features) +{ phrase.Clear(); features.ZeroAll(); AppendToPhrase(final, phrase, AccumScore(features)); - // If we made it this far, there is only one language model. + // If we made it this far, there is only one language model. float full, ignored_ngram; std::size_t ignored_oov; const LanguageModel &model = LanguageModel::GetFirstLM(); model.CalcScore(phrase, full, ignored_ngram, ignored_oov); - // CalcScore transforms, but EvaluateChart doesn't. + // CalcScore transforms, but EvaluateChart doesn't. features.Assign(&model, full); } diff --git a/moses/Incremental.h b/moses/Incremental.h index 30f7c588c..20040bf45 100644 --- a/moses/Incremental.h +++ b/moses/Incremental.h @@ -10,49 +10,52 @@ #include #include -namespace Moses { +namespace Moses +{ class ScoreComponentCollection; class InputType; class LanguageModel; -namespace Incremental { +namespace Incremental +{ -class Manager { - public: - Manager(const InputType &source); +class Manager +{ +public: + Manager(const InputType &source); - ~Manager(); + ~Manager(); - template void LMCallback(const Model &model, const std::vector &words); - - const std::vector &ProcessSentence(); + template void LMCallback(const Model &model, const std::vector &words); - // Call to get the same value as ProcessSentence returned. 
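PopulateBest above fills the chart strictly bottom-up: every span of width w is searched before any span of width w + 1, and the full-sentence span is left to RootSearch. A sketch of just that enumeration order, for a hypothetical sentence of length 4:

    #include <cstddef>
    #include <iostream>

    int main() {
      const std::size_t size = 4;  // sentence length (assumed)
      // Widths stop short of the whole sentence; the root span is searched
      // last, by other means, once all of its subspans are complete.
      for (std::size_t width = 1; width < size; ++width)
        for (std::size_t startPos = 0; startPos <= size - width; ++startPos)
          std::cout << "[" << startPos << "," << startPos + width - 1 << "]\n";
    }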
- const std::vector &Completed() const { - return *completed_nbest_; - } + const std::vector &ProcessSentence(); - private: - template search::History PopulateBest(const Model &model, const std::vector &words, Best &out); + // Call to get the same value as ProcessSentence returned. + const std::vector &Completed() const { + return *completed_nbest_; + } - const InputType &source_; - ChartCellCollectionBase cells_; - ChartParser parser_; +private: + template search::History PopulateBest(const Model &model, const std::vector &words, Best &out); - // Only one of single_best_ or n_best_ will be used, but it was easier to do this than a template. - search::SingleBest single_best_; - // ProcessSentence returns a reference to a vector. ProcessSentence - // doesn't have one, so this is populated and returned. - std::vector backing_for_single_; + const InputType &source_; + ChartCellCollectionBase cells_; + ChartParser parser_; - search::NBest n_best_; - - const std::vector *completed_nbest_; + // Only one of single_best_ or n_best_ will be used, but it was easier to do this than a template. + search::SingleBest single_best_; + // ProcessSentence returns a reference to a vector. ProcessSentence + // doesn't have one, so this is populated and returned. + std::vector backing_for_single_; + + search::NBest n_best_; + + const std::vector *completed_nbest_; }; // Just get the phrase. void ToPhrase(const search::Applied final, Phrase &out); -// Get the phrase and the features. +// Get the phrase and the features. void PhraseAndFeatures(const search::Applied final, Phrase &phrase, ScoreComponentCollection &features); diff --git a/moses/InputType.cpp b/moses/InputType.cpp index 64dc9a7fd..1ca3da63b 100644 --- a/moses/InputType.cpp +++ b/moses/InputType.cpp @@ -67,7 +67,7 @@ std::vector InputType::GetXmlChartTranslationOptions( std::vector ret; return ret; } - + } diff --git a/moses/InputType.h b/moses/InputType.h index a065c0bf0..d0106e5ca 100644 --- a/moses/InputType.h +++ b/moses/InputType.h @@ -38,9 +38,9 @@ class Factor; class PhraseDictionary; class TranslationOptionCollection; class ChartTranslationOptions; - + /** base class for all types of inputs to the decoder, - * eg. sentences, confusion networks, lattices and tree + * eg. sentences, confusion networks, lattices and tree */ class InputType { @@ -81,7 +81,7 @@ public: } void SetDocumentId(long documentId) { m_documentId = documentId; - } + } long GetTopicId() const { return m_topicId; } @@ -111,7 +111,7 @@ public: } void SetTextType(std::string type) { m_textType = type; - } + } std::string GetPassthroughInformation() const { return m_passthrough; } diff --git a/moses/LM/Backward.cpp b/moses/LM/Backward.cpp index a9fca1c75..263c90fec 100644 --- a/moses/LM/Backward.cpp +++ b/moses/LM/Backward.cpp @@ -35,281 +35,288 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA //#include "moses/StaticData.h" //#include -namespace Moses { +namespace Moses +{ - /** Constructs a new backward language model. */ - template BackwardLanguageModel::BackwardLanguageModel(const std::string &file, FactorType factorType, bool lazy) : LanguageModelKen(file,factorType,lazy) { - // - // This space intentionally left blank - // - } +/** Constructs a new backward language model. 
*/
+template <class Model> BackwardLanguageModel<Model>::BackwardLanguageModel(const std::string &file, FactorType factorType, bool lazy) : LanguageModelKen<Model>(file,factorType,lazy)
+{
+  //
+  // This space intentionally left blank
+  //
+}
+
+/**
+ * Constructs an empty backward language model state.
+ *
+ * This state will correspond with a translation hypothesis
+ * where no source words have been translated.
+ *
+ * In a forward language model, the language model state of an empty hypothesis
+ * would store the beginning of sentence marker <s>.
+ *
+ * Because this is a backward language model, the language model state returned by this method
+ * instead stores the end of sentence marker </s>.
+ */
+template <class Model> const FFState *BackwardLanguageModel<Model>::EmptyHypothesisState(const InputType &/*input*/) const
+{
+  BackwardLMState *ret = new BackwardLMState();
+  lm::ngram::RuleScore<Model> ruleScore(*m_ngram, ret->state);
+  ruleScore.Terminal(m_ngram->GetVocabulary().EndSentence());
+  // float score =
+  ruleScore.Finish();
+  // VERBOSE(1, "BackwardLM EmptyHypothesisState has score " << score);
+  return ret;
+}
+/*
+template <class Model> double BackwardLanguageModel<Model>::Score(FFState *ffState) {
+  BackwardLMState *lmState = static_cast< BackwardLMState* >(ffState);
+  lm::ngram::ChartState &state = lmState->state;
+  lm::ngram::RuleScore<Model> ruleScore(*m_ngram, lmState);
+  return ruleScore.Finish();
+}
+*/
-  /**
-   * Pre-calculate the n-gram probabilities for the words in the specified phrase.
-   *
-   * Note that when this method is called, we do not have access to the context
-   * in which this phrase will eventually be applied.
-   *
-   * In other words, we know what words are in this phrase,
-   * but we do not know what words will come before or after this phrase.
-   *
-   * The parameters fullScore, ngramScore, and oovCount are all output parameters.
-   *
-   * The value stored in oovCount is the number of words in the phrase
-   * that are not in the language model's vocabulary.
-   *
-   * The sum of the ngram scores for all words in this phrase are stored in fullScore.
-   *
-   * The value stored in ngramScore is similar, but only full-order ngram scores are included.
-   *
-   * This is best shown by example:
-   *
-   * Assume a trigram backward language model and a phrase "a b c d e f g"
-   *
-   * fullScore would represent the sum of the logprob scores for the following values:
-   *
-   * p(g)
-   * p(f | g)
-   * p(e | g f)
-   * p(d | f e)
-   * p(c | e d)
-   * p(b | d c)
-   * p(a | c b)
-   *
-   * ngramScore would represent the sum of the logprob scores for the following values:
-   *
-   * p(g)
-   * p(f | g)
-   * p(e | g f)
-   * p(d | f e)
-   * p(c | e d)
-   * p(b | d c)
-   * p(a | c b)
-   */
-  template <class Model> void BackwardLanguageModel<Model>::CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const {
-    fullScore = 0;
-    ngramScore = 0;
-    oovCount = 0;
-
-    if (!phrase.GetSize()) return;
+/**
+ * Pre-calculate the n-gram probabilities for the words in the specified phrase.
+ *
+ * Note that when this method is called, we do not have access to the context
+ * in which this phrase will eventually be applied.
+ *
+ * In other words, we know what words are in this phrase,
+ * but we do not know what words will come before or after this phrase.
+ *
+ * The parameters fullScore, ngramScore, and oovCount are all output parameters.
+ *
+ * The value stored in oovCount is the number of words in the phrase
+ * that are not in the language model's vocabulary.
+ *
+ * The sum of the ngram scores for all words in this phrase is stored in fullScore.
+ *
+ * The value stored in ngramScore is similar, but only full-order ngram scores are included.
+ *
+ * This is best shown by example:
+ *
+ * Assume a trigram backward language model and a phrase "a b c d e f g"
+ *
+ * fullScore would represent the sum of the logprob scores for the following values:
+ *
+ * p(g)
+ * p(f | g)
+ * p(e | g f)
+ * p(d | f e)
+ * p(c | e d)
+ * p(b | d c)
+ * p(a | c b)
+ *
+ * ngramScore would represent the sum of the logprob scores for the following values:
+ *
+ * p(e | g f)
+ * p(d | f e)
+ * p(c | e d)
+ * p(b | d c)
+ * p(a | c b)
+ */
+template <class Model> void BackwardLanguageModel<Model>::CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const
+{
+  fullScore = 0;
+  ngramScore = 0;
+  oovCount = 0;
-    lm::ngram::ChartState discarded_sadly;
-    lm::ngram::RuleScore<Model> scorer(*m_ngram, discarded_sadly);
-
+  if (!phrase.GetSize()) return;
+
+  lm::ngram::ChartState discarded_sadly;
+  lm::ngram::RuleScore<Model> scorer(*m_ngram, discarded_sadly);
+
+  UTIL_THROW_IF(
+    (m_beginSentenceFactor == phrase.GetWord(0).GetFactor(m_factorType)),
+    util::Exception,
+    "BackwardLanguageModel does not currently support rules that include <s>"
+  );
+
+  float before_boundary = 0.0f;
+
+  int lastWord = phrase.GetSize() - 1;
+  int ngramBoundary = m_ngram->Order() - 1;
+  int boundary = ( lastWord < ngramBoundary ) ? 0 : ngramBoundary;
+
+  int position;
+  for (position = lastWord; position >= 0; position-=1) {
+    const Word &word = phrase.GetWord(position);
    UTIL_THROW_IF(
-      (m_beginSentenceFactor == phrase.GetWord(0).GetFactor(m_factorType)),
-      util::Exception,
-      "BackwardLanguageModel does not currently support rules that include <s>"
-    );
-
-    float before_boundary = 0.0f;
-
-    int lastWord = phrase.GetSize() - 1;
-    int ngramBoundary = m_ngram->Order() - 1;
-    int boundary = ( lastWord < ngramBoundary ) ?
0 : ngramBoundary; + (word.IsNonTerminal()), + util::Exception, + "BackwardLanguageModel does not currently support rules that include non-terminals " + ); - int position; - for (position = lastWord; position >= 0; position-=1) { - const Word &word = phrase.GetWord(position); - UTIL_THROW_IF( - (word.IsNonTerminal()), - util::Exception, - "BackwardLanguageModel does not currently support rules that include non-terminals " - ); - - lm::WordIndex index = TranslateID(word); - scorer.Terminal(index); - if (!index) ++oovCount; - - if (position==boundary) { - before_boundary = scorer.Finish(); - } + lm::WordIndex index = TranslateID(word); + scorer.Terminal(index); + if (!index) ++oovCount; + if (position==boundary) { + before_boundary = scorer.Finish(); } - fullScore = scorer.Finish(); - - ngramScore = TransformLMScore(fullScore - before_boundary); - fullScore = TransformLMScore(fullScore); - } - /** - * Calculate the ngram probabilities for the words at the beginning - * (and under some circumstances, also at the end) - * of the phrase represented by the provided hypothesis. - * - * Additionally, calculate a new language model state. - * - * This is best shown by example: - * - * Assume a trigram language model. - * - * Assume the previous phrase was "a b c d e f g", - * which means the previous language model state is "g f". - * - * When the phrase corresponding to "a b c d e f g" was previously processed by CalcScore - * the following full-order ngrams would have been calculated: - * - * p(a | c b) - * p(b | d c) - * p(c | e d) - * p(d | f e) - * p(e | g f) - * - * The following less-than-full-order ngrams would also have been calculated by CalcScore: - * - * p(f | g) - * p(g) - * - * In this method, we now have access to additional context which may allow - * us to compute the full-order ngrams for f and g. - * - * Assume the new provided hypothesis contains the new phrase "h i j k" - * - * Given these assumptions, this method is responsible - * for calculating the scores for the following: - * - * p(f | h g) - * p(g | i h) - * - * This method must also calculate and return a new language model state. - * - * In this example, the returned language model state would be "k j" - * - * If the provided hypothesis represents the end of a completed translation - * (all source words have been translated) - * then this method is additionally responsible for calculating the following: - * - * p(j | k) - * p(k | ) - * - */ - template FFState *BackwardLanguageModel::Evaluate(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out) const { + fullScore = scorer.Finish(); - // If the current hypothesis contains zero target words - if (!hypo.GetCurrTargetLength()) { + ngramScore = TransformLMScore(fullScore - before_boundary); + fullScore = TransformLMScore(fullScore); - // reuse and return the previous state - std::auto_ptr ret(new BackwardLMState()); - ret->state = static_cast(*ps).state; - return ret.release(); +} - } else { +/** + * Calculate the ngram probabilities for the words at the beginning + * (and under some circumstances, also at the end) + * of the phrase represented by the provided hypothesis. + * + * Additionally, calculate a new language model state. + * + * This is best shown by example: + * + * Assume a trigram language model. + * + * Assume the previous phrase was "a b c d e f g", + * which means the previous language model state is "g f". 
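The fullScore/ngramScore split becomes concrete with the constants that BackwardTest.cpp, later in this patch, reads from lm/backward.arpa: for the three-word phrase "the licenses for" the tests expect exactly p(for), the word scored without full context, to be excluded from ngramScore. A sketch of that arithmetic (log10 values, which Moses afterwards rescales with TransformLMScore):

    #include <iostream>

    int main() {
      // Constants as named in BackwardTest.cpp:
      const double p_for          = -1.661813;   // unigram for "for"
      const double p_licenses_for = -1.661557;   // bigram score for "licenses for"
      const double p_the_licenses = -0.9625873;  // bigram score for "the licenses"

      const double fullScore  = p_for + p_licenses_for + p_the_licenses;
      const double ngramScore = p_licenses_for + p_the_licenses;  // p(for) dropped
      std::cout << fullScore << " " << ngramScore << "\n";  // approx. -4.28596 -2.62414
    }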
+ * + * When the phrase corresponding to "a b c d e f g" was previously processed by CalcScore + * the following full-order ngrams would have been calculated: + * + * p(a | c b) + * p(b | d c) + * p(c | e d) + * p(d | f e) + * p(e | g f) + * + * The following less-than-full-order ngrams would also have been calculated by CalcScore: + * + * p(f | g) + * p(g) + * + * In this method, we now have access to additional context which may allow + * us to compute the full-order ngrams for f and g. + * + * Assume the new provided hypothesis contains the new phrase "h i j k" + * + * Given these assumptions, this method is responsible + * for calculating the scores for the following: + * + * p(f | h g) + * p(g | i h) + * + * This method must also calculate and return a new language model state. + * + * In this example, the returned language model state would be "k j" + * + * If the provided hypothesis represents the end of a completed translation + * (all source words have been translated) + * then this method is additionally responsible for calculating the following: + * + * p(j | k) + * p(k | ) + * + */ +template FFState *BackwardLanguageModel::Evaluate(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out) const +{ - float returnedScore; - - FFState *returnedState = this->Evaluate(hypo.GetCurrTargetPhrase(), ps, returnedScore); - - out->PlusEquals(this, returnedScore); - - return returnedState; - - } - } - - - template FFState *BackwardLanguageModel::Evaluate(const Phrase &phrase, const FFState *ps, float &returnedScore) const { - - returnedScore = 0.0f; - - const lm::ngram::ChartState &previous = static_cast(*ps).state; + // If the current hypothesis contains zero target words + if (!hypo.GetCurrTargetLength()) { + // reuse and return the previous state std::auto_ptr ret(new BackwardLMState()); - - lm::ngram::RuleScore scorer(*m_ngram, ret->state); - - int ngramBoundary = m_ngram->Order() - 1; - int lastWord = phrase.GetSize() - 1; - - // Get scores for words at the end of the previous phrase - // that are now adjacent to words at the the beginning of this phrase - for (int position=std::min( lastWord, ngramBoundary - 1); position >= 0; position-=1) { - const Word &word = phrase.GetWord(position); - UTIL_THROW_IF( - (word.IsNonTerminal()), - util::Exception, - "BackwardLanguageModel does not currently support rules that include non-terminals " - ); - - lm::WordIndex index = TranslateID(word); - scorer.Terminal(index); - } - scorer.NonTerminal(previous); - returnedScore = scorer.Finish(); - /* - out->PlusEquals(this, score); - - - UTIL_THROW_IF( - (1==1), - util::Exception, - "This method (BackwardLanguageModel::Evaluate) is not yet fully implemented" - ); - */ + ret->state = static_cast(*ps).state; return ret.release(); - + } else { + + float returnedScore; + + FFState *returnedState = this->Evaluate(hypo.GetCurrTargetPhrase(), ps, returnedScore); + + out->PlusEquals(this, returnedScore); + + return returnedState; } +} - LanguageModel *ConstructBackwardLM(const std::string &file, FactorType factorType, bool lazy) { - try { - lm::ngram::ModelType model_type; - if (lm::ngram::RecognizeBinary(file.c_str(), model_type)) { - switch(model_type) { - case lm::ngram::PROBING: - return new BackwardLanguageModel(file, factorType, lazy); - case lm::ngram::REST_PROBING: - return new BackwardLanguageModel(file, factorType, lazy); - case lm::ngram::TRIE: - return new BackwardLanguageModel(file, factorType, lazy); - case lm::ngram::QUANT_TRIE: - return new BackwardLanguageModel(file, 
factorType, lazy);
-      case lm::ngram::ARRAY_TRIE:
-        return new BackwardLanguageModel<lm::ngram::ArrayTrieModel>(file, factorType, lazy);
-      case lm::ngram::QUANT_ARRAY_TRIE:
-        return new BackwardLanguageModel<lm::ngram::QuantArrayTrieModel>(file, factorType, lazy);
-      default:
-        std::cerr << "Unrecognized kenlm model type " << model_type << std::endl;
-        abort();
-      }
-    } else {
-      return new BackwardLanguageModel<lm::ngram::ProbingModel>(file, factorType, lazy);
+
+template <class Model> FFState *BackwardLanguageModel<Model>::Evaluate(const Phrase &phrase, const FFState *ps, float &returnedScore) const
+{
+
+  returnedScore = 0.0f;
+
+  const lm::ngram::ChartState &previous = static_cast<const BackwardLMState&>(*ps).state;
+
+  std::auto_ptr<BackwardLMState> ret(new BackwardLMState());
+
+  lm::ngram::RuleScore<Model> scorer(*m_ngram, ret->state);
+
+  int ngramBoundary = m_ngram->Order() - 1;
+  int lastWord = phrase.GetSize() - 1;
+
+  // Get scores for words at the end of the previous phrase
+  // that are now adjacent to words at the beginning of this phrase
+  for (int position=std::min( lastWord, ngramBoundary - 1); position >= 0; position-=1) {
+    const Word &word = phrase.GetWord(position);
+    UTIL_THROW_IF(
+      (word.IsNonTerminal()),
+      util::Exception,
+      "BackwardLanguageModel does not currently support rules that include non-terminals "
+    );
+
+    lm::WordIndex index = TranslateID(word);
+    scorer.Terminal(index);
+  }
+  scorer.NonTerminal(previous);
+  returnedScore = scorer.Finish();
+  /*
+  out->PlusEquals(this, score);
+
+
+  UTIL_THROW_IF(
+    (1==1),
+    util::Exception,
+    "This method (BackwardLanguageModel::Evaluate) is not yet fully implemented"
+  );
+  */
+  return ret.release();
+
+
+
+}
+
+LanguageModel *ConstructBackwardLM(const std::string &file, FactorType factorType, bool lazy)
+{
+  try {
+    lm::ngram::ModelType model_type;
+    if (lm::ngram::RecognizeBinary(file.c_str(), model_type)) {
+      switch(model_type) {
+      case lm::ngram::PROBING:
+        return new BackwardLanguageModel<lm::ngram::ProbingModel>(file, factorType, lazy);
+      case lm::ngram::REST_PROBING:
+        return new BackwardLanguageModel<lm::ngram::RestProbingModel>(file, factorType, lazy);
+      case lm::ngram::TRIE:
+        return new BackwardLanguageModel<lm::ngram::TrieModel>(file, factorType, lazy);
+      case lm::ngram::QUANT_TRIE:
+        return new BackwardLanguageModel<lm::ngram::QuantTrieModel>(file, factorType, lazy);
+      case lm::ngram::ARRAY_TRIE:
+        return new BackwardLanguageModel<lm::ngram::ArrayTrieModel>(file, factorType, lazy);
+      case lm::ngram::QUANT_ARRAY_TRIE:
+        return new BackwardLanguageModel<lm::ngram::QuantArrayTrieModel>(file, factorType, lazy);
+      default:
+        std::cerr << "Unrecognized kenlm model type " << model_type << std::endl;
+        abort();
      }
+    } else {
+      return new BackwardLanguageModel<lm::ngram::ProbingModel>(file, factorType, lazy);
    }
+  } catch (std::exception &e) {
+    std::cerr << e.what() << std::endl;
+    abort();
  }
+}
 
 } // namespace Moses
diff --git a/moses/LM/Backward.h b/moses/LM/Backward.h
index 1bf6b560c..c81c0633d 100644
--- a/moses/LM/Backward.h
+++ b/moses/LM/Backward.h
@@ -29,53 +29,55 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 #include "lm/state.hh"
 
-namespace Moses {
+namespace Moses
+{
 
 //! This will also load. Returns a templated backward LM.
 LanguageModel *ConstructBackwardLM(const std::string &file, FactorType factorType, bool lazy);
 
-  class FFState;
-  // template <class Model> class BackwardLanguageModelTest;
-  class BackwardLanguageModelTest;
+class FFState;
+// template <class Model> class BackwardLanguageModelTest;
+class BackwardLanguageModelTest;
 
 /*
  * An implementation of single factor backward LM using Kenneth's code.
*/ -template class BackwardLanguageModel : public LanguageModelKen { - public: - BackwardLanguageModel(const std::string &file, FactorType factorType, bool lazy); +template class BackwardLanguageModel : public LanguageModelKen +{ +public: + BackwardLanguageModel(const std::string &file, FactorType factorType, bool lazy); - virtual const FFState *EmptyHypothesisState(const InputType &/*input*/) const; + virtual const FFState *EmptyHypothesisState(const InputType &/*input*/) const; - virtual void CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const; + virtual void CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const; - virtual FFState *Evaluate(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out) const; + virtual FFState *Evaluate(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out) const; - FFState *Evaluate(const Phrase &phrase, const FFState *ps, float &returnedScore) const; + FFState *Evaluate(const Phrase &phrase, const FFState *ps, float &returnedScore) const; - private: +private: - // These lines are required to make the parent class's protected members visible to this class - using LanguageModelKen::m_ngram; - using LanguageModelKen::m_beginSentenceFactor; - using LanguageModelKen::m_factorType; - using LanguageModelKen::TranslateID; + // These lines are required to make the parent class's protected members visible to this class + using LanguageModelKen::m_ngram; + using LanguageModelKen::m_beginSentenceFactor; + using LanguageModelKen::m_factorType; + using LanguageModelKen::TranslateID; - // friend class Moses::BackwardLanguageModelTest; - friend class Moses::BackwardLanguageModelTest; - /* - lm::ngram::ChartState* GetState(FFState *ffState) { - return NULL; - } - */ - /* - double Score(FFState *ffState) { - BackwardLMState *lmState = static_cast< BackwardLMState* >(ffState); - lm::ngram::ChartState &state = lmState->state; - lm::ngram::RuleScore ruleScore(*m_ngram, lmState); - return ruleScore.Finish(); + // friend class Moses::BackwardLanguageModelTest; + friend class Moses::BackwardLanguageModelTest; + /* + lm::ngram::ChartState* GetState(FFState *ffState) { + return NULL; } - */ + */ + /* + double Score(FFState *ffState) { + BackwardLMState *lmState = static_cast< BackwardLMState* >(ffState); + lm::ngram::ChartState &state = lmState->state; + lm::ngram::RuleScore ruleScore(*m_ngram, lmState); + return ruleScore.Finish(); + } + */ }; } // namespace Moses @@ -83,7 +85,7 @@ template class BackwardLanguageModel : public LanguageModelKen lm/backward.arpa diff --git a/moses/LM/BackwardLMState.cpp b/moses/LM/BackwardLMState.cpp index 37a3ab7da..466c4b655 100644 --- a/moses/LM/BackwardLMState.cpp +++ b/moses/LM/BackwardLMState.cpp @@ -22,11 +22,13 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #include "moses/LM/BackwardLMState.h" #include "lm/state.hh" -namespace Moses { +namespace Moses +{ - int BackwardLMState::Compare(const FFState &o) const { - const BackwardLMState &other = static_cast(o); - return state.left.Compare(other.state.left); - } +int BackwardLMState::Compare(const FFState &o) const +{ + const BackwardLMState &other = static_cast(o); + return state.left.Compare(other.state.left); +} } diff --git a/moses/LM/BackwardLMState.h b/moses/LM/BackwardLMState.h index 7c6ebff62..e6d1f325a 100644 --- a/moses/LM/BackwardLMState.h +++ b/moses/LM/BackwardLMState.h @@ -36,14 +36,16 @@ namespace lm { //#include "lm/state.hh" 
-namespace Moses { +namespace Moses +{ - //template +//template class BackwardLanguageModelTest; -class BackwardLMState : public FFState { +class BackwardLMState : public FFState +{ - public: +public: /* int Compare(const FFState &o) const { @@ -53,14 +55,14 @@ class BackwardLMState : public FFState { */ int Compare(const FFState &o) const; - // Allow BackwardLanguageModel to access the private members of this class - template friend class BackwardLanguageModel; + // Allow BackwardLanguageModel to access the private members of this class + template friend class BackwardLanguageModel; // template friend class Moses::BackwardLanguageModelTest; - friend class Moses::BackwardLanguageModelTest; + friend class Moses::BackwardLanguageModelTest; - private: - lm::ngram::ChartState state; +private: + lm::ngram::ChartState state; }; diff --git a/moses/LM/BackwardTest.cpp b/moses/LM/BackwardTest.cpp index 5f58c9f32..dc5de32bd 100644 --- a/moses/LM/BackwardTest.cpp +++ b/moses/LM/BackwardTest.cpp @@ -47,7 +47,7 @@ template void Foo() { Moses::BackwardLanguageModel *backwardLM; // = new Moses::BackwardLanguageModel( filename, factorType, lazy ); - + } template void Everything() { @@ -55,159 +55,160 @@ template void Everything() { } */ -namespace Moses { +namespace Moses +{ -// Apparently some Boost versions use templates and are pretty strict about types matching. +// Apparently some Boost versions use templates and are pretty strict about types matching. #define SLOPPY_CHECK_CLOSE(ref, value, tol) BOOST_CHECK_CLOSE(static_cast(ref), static_cast(value), static_cast(tol)); -class BackwardLanguageModelTest { +class BackwardLanguageModelTest +{ - public: - BackwardLanguageModelTest() : - dummyInput(new Sentence()), - backwardLM( - static_cast< BackwardLanguageModel * >( - ConstructBackwardLM( - boost::unit_test::framework::master_test_suite().argv[1], - 0, - false) - ) - ) +public: + BackwardLanguageModelTest() : + dummyInput(new Sentence()), + backwardLM( + static_cast< BackwardLanguageModel * >( + ConstructBackwardLM( + boost::unit_test::framework::master_test_suite().argv[1], + 0, + false) + ) + ) { + // This space intentionally left blank + } + + ~BackwardLanguageModelTest() { + delete dummyInput; + delete backwardLM; + } + + void testEmptyHypothesis() { + FFState *ffState = const_cast< FFState * >(backwardLM->EmptyHypothesisState( *dummyInput )); + + BOOST_CHECK( ffState != NULL ); + + delete ffState; + } + + void testCalcScore() { + + double p_the = -1.383059; + double p_licenses = -2.360783; + double p_for = -1.661813; + double p_most = -2.360783; + // double p_software = -1.62042; + + double p_the_licenses = -0.9625873; + double p_licenses_for = -1.661557; + double p_for_most = -0.4526253; + // double p_most_software = -1.70295; + + double p_the_licenses_for = p_the_licenses + p_licenses_for; + // double p_licenses_for_most = p_licenses_for + p_for_most; + + // the { - // This space intentionally left blank + Phrase phrase; + BOOST_CHECK( phrase.GetSize() == 0 ); + + std::vector outputFactorOrder; + outputFactorOrder.push_back(0); + + phrase.CreateFromString( + outputFactorOrder, + "the", + StaticData::Instance().GetFactorDelimiter()); + + BOOST_CHECK( phrase.GetSize() == 1 ); + + float fullScore; + float ngramScore; + size_t oovCount; + backwardLM->CalcScore(phrase, fullScore, ngramScore, oovCount); + + BOOST_CHECK( oovCount == 0 ); + SLOPPY_CHECK_CLOSE( TransformLMScore(p_the), fullScore, 0.01); + SLOPPY_CHECK_CLOSE( TransformLMScore( 0.0 ), ngramScore, 0.01); } - ~BackwardLanguageModelTest() { - 
delete dummyInput; - delete backwardLM; + // the licenses + { + Phrase phrase; + BOOST_CHECK( phrase.GetSize() == 0 ); + + std::vector outputFactorOrder; + outputFactorOrder.push_back(0); + + phrase.CreateFromString( + outputFactorOrder, + "the licenses", + StaticData::Instance().GetFactorDelimiter()); + + BOOST_CHECK( phrase.GetSize() == 2 ); + + float fullScore; + float ngramScore; + size_t oovCount; + backwardLM->CalcScore(phrase, fullScore, ngramScore, oovCount); + + BOOST_CHECK( oovCount == 0 ); + SLOPPY_CHECK_CLOSE( TransformLMScore(p_licenses + p_the_licenses), fullScore, 0.01); + SLOPPY_CHECK_CLOSE( TransformLMScore( 0.0 ), ngramScore, 0.01); } - void testEmptyHypothesis() { - FFState *ffState = const_cast< FFState * >(backwardLM->EmptyHypothesisState( *dummyInput )); + // the licenses for + { + Phrase phrase; + BOOST_CHECK( phrase.GetSize() == 0 ); - BOOST_CHECK( ffState != NULL ); + std::vector outputFactorOrder; + outputFactorOrder.push_back(0); - delete ffState; + phrase.CreateFromString( + outputFactorOrder, + "the licenses for", + StaticData::Instance().GetFactorDelimiter()); + + BOOST_CHECK( phrase.GetSize() == 3 ); + + float fullScore; + float ngramScore; + size_t oovCount; + backwardLM->CalcScore(phrase, fullScore, ngramScore, oovCount); + + BOOST_CHECK( oovCount == 0 ); + SLOPPY_CHECK_CLOSE( TransformLMScore( p_the_licenses_for ), ngramScore, 0.01); + SLOPPY_CHECK_CLOSE( TransformLMScore(p_for + p_licenses_for + p_the_licenses), fullScore, 0.01); } - void testCalcScore() { + // the licenses for most + { + Phrase phrase; + BOOST_CHECK( phrase.GetSize() == 0 ); - double p_the = -1.383059; - double p_licenses = -2.360783; - double p_for = -1.661813; - double p_most = -2.360783; - // double p_software = -1.62042; + std::vector outputFactorOrder; + outputFactorOrder.push_back(0); - double p_the_licenses = -0.9625873; - double p_licenses_for = -1.661557; - double p_for_most = -0.4526253; - // double p_most_software = -1.70295; + phrase.CreateFromString( + outputFactorOrder, + "the licenses for most", + StaticData::Instance().GetFactorDelimiter()); - double p_the_licenses_for = p_the_licenses + p_licenses_for; - // double p_licenses_for_most = p_licenses_for + p_for_most; - - // the - { - Phrase phrase; - BOOST_CHECK( phrase.GetSize() == 0 ); + BOOST_CHECK( phrase.GetSize() == 4 ); - std::vector outputFactorOrder; - outputFactorOrder.push_back(0); + float fullScore; + float ngramScore; + size_t oovCount; + backwardLM->CalcScore(phrase, fullScore, ngramScore, oovCount); - phrase.CreateFromString( - outputFactorOrder, - "the", - StaticData::Instance().GetFactorDelimiter()); - - BOOST_CHECK( phrase.GetSize() == 1 ); - - float fullScore; - float ngramScore; - size_t oovCount; - backwardLM->CalcScore(phrase, fullScore, ngramScore, oovCount); - - BOOST_CHECK( oovCount == 0 ); - SLOPPY_CHECK_CLOSE( TransformLMScore(p_the), fullScore, 0.01); - SLOPPY_CHECK_CLOSE( TransformLMScore( 0.0 ), ngramScore, 0.01); - } - - // the licenses - { - Phrase phrase; - BOOST_CHECK( phrase.GetSize() == 0 ); - - std::vector outputFactorOrder; - outputFactorOrder.push_back(0); - - phrase.CreateFromString( - outputFactorOrder, - "the licenses", - StaticData::Instance().GetFactorDelimiter()); - - BOOST_CHECK( phrase.GetSize() == 2 ); - - float fullScore; - float ngramScore; - size_t oovCount; - backwardLM->CalcScore(phrase, fullScore, ngramScore, oovCount); - - BOOST_CHECK( oovCount == 0 ); - SLOPPY_CHECK_CLOSE( TransformLMScore(p_licenses + p_the_licenses), fullScore, 0.01); - SLOPPY_CHECK_CLOSE( 
TransformLMScore( 0.0 ), ngramScore, 0.01); - } - - // the licenses for - { - Phrase phrase; - BOOST_CHECK( phrase.GetSize() == 0 ); - - std::vector outputFactorOrder; - outputFactorOrder.push_back(0); - - phrase.CreateFromString( - outputFactorOrder, - "the licenses for", - StaticData::Instance().GetFactorDelimiter()); - - BOOST_CHECK( phrase.GetSize() == 3 ); - - float fullScore; - float ngramScore; - size_t oovCount; - backwardLM->CalcScore(phrase, fullScore, ngramScore, oovCount); - - BOOST_CHECK( oovCount == 0 ); - SLOPPY_CHECK_CLOSE( TransformLMScore( p_the_licenses_for ), ngramScore, 0.01); - SLOPPY_CHECK_CLOSE( TransformLMScore(p_for + p_licenses_for + p_the_licenses), fullScore, 0.01); - } - - // the licenses for most - { - Phrase phrase; - BOOST_CHECK( phrase.GetSize() == 0 ); - - std::vector outputFactorOrder; - outputFactorOrder.push_back(0); - - phrase.CreateFromString( - outputFactorOrder, - "the licenses for most", - StaticData::Instance().GetFactorDelimiter()); - - BOOST_CHECK( phrase.GetSize() == 4 ); - - float fullScore; - float ngramScore; - size_t oovCount; - backwardLM->CalcScore(phrase, fullScore, ngramScore, oovCount); - - BOOST_CHECK( oovCount == 0 ); - SLOPPY_CHECK_CLOSE( TransformLMScore( p_the_licenses + p_licenses_for ), ngramScore, 0.01); - SLOPPY_CHECK_CLOSE( TransformLMScore(p_most + p_for_most + p_licenses_for + p_the_licenses), fullScore, 0.01); - } - + BOOST_CHECK( oovCount == 0 ); + SLOPPY_CHECK_CLOSE( TransformLMScore( p_the_licenses + p_licenses_for ), ngramScore, 0.01); + SLOPPY_CHECK_CLOSE( TransformLMScore(p_most + p_for_most + p_licenses_for + p_the_licenses), fullScore, 0.01); } - + + } + void testEvaluate() { FFState *nextState; @@ -223,132 +224,134 @@ class BackwardLanguageModelTest { double p_for_licenses = -1.661557; double p_licenses_the = -0.9625873; double p_the_eos = -1.940311; - - // the - { - Phrase phrase; - BOOST_CHECK( phrase.GetSize() == 0 ); - std::vector outputFactorOrder; - outputFactorOrder.push_back(0); + // the + { + Phrase phrase; + BOOST_CHECK( phrase.GetSize() == 0 ); - phrase.CreateFromString( - outputFactorOrder, - "the", - StaticData::Instance().GetFactorDelimiter()); + std::vector outputFactorOrder; + outputFactorOrder.push_back(0); - BOOST_CHECK( phrase.GetSize() == 1 ); - - float score; - nextState = backwardLM->Evaluate(phrase, prevState, score); + phrase.CreateFromString( + outputFactorOrder, + "the", + StaticData::Instance().GetFactorDelimiter()); - // p(the) * p( | the) / p() - SLOPPY_CHECK_CLOSE( (p_the + p_the_eos - p_eos), score, 0.01); - - delete prevState; - prevState = nextState; + BOOST_CHECK( phrase.GetSize() == 1 ); - } + float score; + nextState = backwardLM->Evaluate(phrase, prevState, score); - // the licenses - { - Phrase phrase; - BOOST_CHECK( phrase.GetSize() == 0 ); - - std::vector outputFactorOrder; - outputFactorOrder.push_back(0); - - phrase.CreateFromString( - outputFactorOrder, - "licenses", - StaticData::Instance().GetFactorDelimiter()); - - BOOST_CHECK( phrase.GetSize() == 1 ); - - float score; - nextState = backwardLM->Evaluate(phrase, prevState, score); - - // p(licenses) * p(licenses | the) / p(the) - SLOPPY_CHECK_CLOSE( (p_licenses + p_licenses_the - p_the), score, 0.01); - - delete prevState; - prevState = nextState; - - } - - // the licenses for - { - Phrase phrase; - BOOST_CHECK( phrase.GetSize() == 0 ); - - std::vector outputFactorOrder; - outputFactorOrder.push_back(0); - - phrase.CreateFromString( - outputFactorOrder, - "for", - StaticData::Instance().GetFactorDelimiter()); - - 
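Each step of testEvaluate checks the same update rule, written in the test comments as, for example, p(licenses) * p(licenses | the) / p(the): two additions and one subtraction in log space, the subtraction retracting the provisional unigram credit previously given to the neighbouring word. The "the" to "the licenses" step with the test's own constants:

    #include <iostream>

    int main() {
      const double p_licenses     = -2.360783;   // unigram for "licenses"
      const double p_licenses_the = -0.9625873;  // bigram score over "the licenses"
      const double p_the          = -1.383059;   // unigram for "the", now retracted
      // p(licenses) * p(licenses | the) / p(the), in log10 space:
      std::cout << p_licenses + p_licenses_the - p_the << "\n";  // approx. -1.94031
    }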
BOOST_CHECK( phrase.GetSize() == 1 ); - - float score; - nextState = backwardLM->Evaluate(phrase, prevState, score); - - // p(for) * p(for | licenses) / p(licenses) - SLOPPY_CHECK_CLOSE( (p_for + p_for_licenses - p_licenses), score, 0.01); - - delete prevState; - prevState = nextState; - - } - - // the licenses for most - { - Phrase phrase; - BOOST_CHECK( phrase.GetSize() == 0 ); - - std::vector outputFactorOrder; - outputFactorOrder.push_back(0); - - phrase.CreateFromString( - outputFactorOrder, - "most", - StaticData::Instance().GetFactorDelimiter()); - - BOOST_CHECK( phrase.GetSize() == 1 ); - - float score; - nextState = backwardLM->Evaluate(phrase, prevState, score); - - // p(most) * p(most | for) / p(for) - SLOPPY_CHECK_CLOSE( (p_most + p_most_for - p_for), score, 0.01); - - delete prevState; - prevState = nextState; - - } + // p(the) * p( | the) / p() + SLOPPY_CHECK_CLOSE( (p_the + p_the_eos - p_eos), score, 0.01); delete prevState; + prevState = nextState; + + } + + // the licenses + { + Phrase phrase; + BOOST_CHECK( phrase.GetSize() == 0 ); + + std::vector outputFactorOrder; + outputFactorOrder.push_back(0); + + phrase.CreateFromString( + outputFactorOrder, + "licenses", + StaticData::Instance().GetFactorDelimiter()); + + BOOST_CHECK( phrase.GetSize() == 1 ); + + float score; + nextState = backwardLM->Evaluate(phrase, prevState, score); + + // p(licenses) * p(licenses | the) / p(the) + SLOPPY_CHECK_CLOSE( (p_licenses + p_licenses_the - p_the), score, 0.01); + + delete prevState; + prevState = nextState; + + } + + // the licenses for + { + Phrase phrase; + BOOST_CHECK( phrase.GetSize() == 0 ); + + std::vector outputFactorOrder; + outputFactorOrder.push_back(0); + + phrase.CreateFromString( + outputFactorOrder, + "for", + StaticData::Instance().GetFactorDelimiter()); + + BOOST_CHECK( phrase.GetSize() == 1 ); + + float score; + nextState = backwardLM->Evaluate(phrase, prevState, score); + + // p(for) * p(for | licenses) / p(licenses) + SLOPPY_CHECK_CLOSE( (p_for + p_for_licenses - p_licenses), score, 0.01); + + delete prevState; + prevState = nextState; + + } + + // the licenses for most + { + Phrase phrase; + BOOST_CHECK( phrase.GetSize() == 0 ); + + std::vector outputFactorOrder; + outputFactorOrder.push_back(0); + + phrase.CreateFromString( + outputFactorOrder, + "most", + StaticData::Instance().GetFactorDelimiter()); + + BOOST_CHECK( phrase.GetSize() == 1 ); + + float score; + nextState = backwardLM->Evaluate(phrase, prevState, score); + + // p(most) * p(most | for) / p(for) + SLOPPY_CHECK_CLOSE( (p_most + p_most_for - p_for), score, 0.01); + + delete prevState; + prevState = nextState; + + } + + delete prevState; } - - private: - const Sentence *dummyInput; - BackwardLanguageModel *backwardLM; + +private: + const Sentence *dummyInput; + BackwardLanguageModel *backwardLM; }; } -const char *FileLocation() { +const char *FileLocation() +{ if (boost::unit_test::framework::master_test_suite().argc < 2) { BOOST_FAIL("Jamfile must specify arpa file for this test, but did not"); } return boost::unit_test::framework::master_test_suite().argv[1]; } -BOOST_AUTO_TEST_CASE(ProbingAll) { +BOOST_AUTO_TEST_CASE(ProbingAll) +{ BackwardLanguageModelTest test; test.testEmptyHypothesis(); diff --git a/moses/LM/Base.cpp b/moses/LM/Base.cpp index fe35604b0..37dc704de 100644 --- a/moses/LM/Base.cpp +++ b/moses/LM/Base.cpp @@ -31,63 +31,67 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA using namespace std; -namespace Moses { +namespace Moses +{ 
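(Reviewer note on the testEvaluate() expectations above: a backward LM scores the target string right to left, so each Evaluate() call contributes the log-probability of the newly completed suffix divided by the suffix already paid for by the previous state. A minimal standalone sketch of that bookkeeping, reusing the log10 constants from the test fixture; the main() harness is illustrative and not part of Moses:)

    #include <cassert>
    #include <cmath>
    #include <iostream>

    int main() {
      // log10 values copied from the test fixture above.
      const double p_the          = -1.383059;   // p(the)
      const double p_licenses     = -2.360783;   // p(licenses)
      const double p_licenses_the = -0.9625873;  // p(licenses | the), backward bigram

      // Extending the partial translation "the ..." with "licenses":
      // increment = log p(licenses) + log p(licenses | the) - log p(the),
      // i.e. the new joint suffix score minus the old one, which is exactly
      // what SLOPPY_CHECK_CLOSE verifies in testEvaluate().
      const double increment = p_licenses + p_licenses_the - p_the;
      std::cout << "log10 increment: " << increment << '\n';  // ~ -1.94031
      assert(std::fabs(increment - (-1.9403113)) < 1e-6);
      return 0;
    }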
LanguageModel::LanguageModel(const std::string& description, const std::string &line) : StatefulFeatureFunction(description, StaticData::Instance().GetLMEnableOOVFeature() ? 2 : 1, line ) { - m_enableOOVFeature = StaticData::Instance().GetLMEnableOOVFeature(); + m_enableOOVFeature = StaticData::Instance().GetLMEnableOOVFeature(); } LanguageModel::~LanguageModel() {} -float LanguageModel::GetWeight() const { +float LanguageModel::GetWeight() const +{ //return StaticData::Instance().GetAllWeights().GetScoresForProducer(this)[0]; return StaticData::Instance().GetWeights(this)[0]; } -float LanguageModel::GetOOVWeight() const { +float LanguageModel::GetOOVWeight() const +{ if (m_enableOOVFeature) { //return StaticData::Instance().GetAllWeights().GetScoresForProducer(this)[1]; - return StaticData::Instance().GetWeights(this)[1]; + return StaticData::Instance().GetWeights(this)[1]; } else { return 0; } } -void LanguageModel::IncrementalCallback(Incremental::Manager &manager) const { +void LanguageModel::IncrementalCallback(Incremental::Manager &manager) const +{ UTIL_THROW(util::Exception, "Incremental search is only supported by KenLM."); } void LanguageModel::Evaluate(const Phrase &source - , const TargetPhrase &targetPhrase - , ScoreComponentCollection &scoreBreakdown - , ScoreComponentCollection &estimatedFutureScore) const + , const TargetPhrase &targetPhrase + , ScoreComponentCollection &scoreBreakdown + , ScoreComponentCollection &estimatedFutureScore) const { - if (Useable(targetPhrase)) { - // contains factors used by this LM - float fullScore, nGramScore; - size_t oovCount; + if (Useable(targetPhrase)) { + // contains factors used by this LM + float fullScore, nGramScore; + size_t oovCount; - CalcScore(targetPhrase, fullScore, nGramScore, oovCount); - float estimateScore = fullScore - nGramScore; + CalcScore(targetPhrase, fullScore, nGramScore, oovCount); + float estimateScore = fullScore - nGramScore; - if (StaticData::Instance().GetLMEnableOOVFeature()) { - vector scores(2), estimateScores(2); - scores[0] = nGramScore; - scores[1] = oovCount; - scoreBreakdown.Assign(this, scores); + if (StaticData::Instance().GetLMEnableOOVFeature()) { + vector scores(2), estimateScores(2); + scores[0] = nGramScore; + scores[1] = oovCount; + scoreBreakdown.Assign(this, scores); - estimateScores[0] = estimateScore; - estimateScores[1] = 0; - estimatedFutureScore.Assign(this, estimateScores); - } else { - scoreBreakdown.Assign(this, nGramScore); - estimatedFutureScore.Assign(this, estimateScore); - } + estimateScores[0] = estimateScore; + estimateScores[1] = 0; + estimatedFutureScore.Assign(this, estimateScores); + } else { + scoreBreakdown.Assign(this, nGramScore); + estimatedFutureScore.Assign(this, estimateScore); + } - } + } } const LanguageModel &LanguageModel::GetFirstLM() diff --git a/moses/LM/Base.h b/moses/LM/Base.h index 961fead5f..1f976ee53 100644 --- a/moses/LM/Base.h +++ b/moses/LM/Base.h @@ -30,21 +30,25 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA namespace Moses { -namespace Incremental { class Manager; } +namespace Incremental +{ +class Manager; +} class FactorCollection; class Factor; class Phrase; //! 
Abstract base class which represent a language model on a contiguous phrase -class LanguageModel : public StatefulFeatureFunction { +class LanguageModel : public StatefulFeatureFunction +{ protected: LanguageModel(const std::string& description, const std::string &line); // This can't be in the constructor for virual function dispatch reasons bool m_enableOOVFeature; - + public: static const LanguageModel &GetFirstLM(); @@ -89,9 +93,9 @@ public: virtual void IncrementalCallback(Incremental::Manager &manager) const; virtual void Evaluate(const Phrase &source - , const TargetPhrase &targetPhrase - , ScoreComponentCollection &scoreBreakdown - , ScoreComponentCollection &estimatedFutureScore) const; + , const TargetPhrase &targetPhrase + , ScoreComponentCollection &scoreBreakdown + , ScoreComponentCollection &estimatedFutureScore) const; }; diff --git a/moses/LM/ChartState.h b/moses/LM/ChartState.h index b6bdd8f7c..186694927 100644 --- a/moses/LM/ChartState.h +++ b/moses/LM/ChartState.h @@ -19,16 +19,15 @@ private: const ChartHypothesis &m_hypo; - /** Construct the prefix string of up to specified size + /** Construct the prefix string of up to specified size * \param ret prefix string * \param size maximum size (typically max lm context window) */ - size_t CalcPrefix(const ChartHypothesis &hypo, int featureID, Phrase &ret, size_t size) const - { + size_t CalcPrefix(const ChartHypothesis &hypo, int featureID, Phrase &ret, size_t size) const { const TargetPhrase &target = hypo.GetCurrTargetPhrase(); const AlignmentInfo::NonTermIndexMap &nonTermIndexMap = - target.GetAlignNonTerm().GetNonTermIndexMap(); - + target.GetAlignNonTerm().GetNonTermIndexMap(); + // loop over the rule that is being applied for (size_t pos = 0; pos < target.GetSize(); ++pos) { const Word &word = target.GetWord(pos); @@ -53,13 +52,12 @@ private: return size; } - /** Construct the suffix phrase of up to specified size + /** Construct the suffix phrase of up to specified size * will always be called after the construction of prefix phrase * \param ret suffix phrase * \param size maximum size of suffix */ - size_t CalcSuffix(const ChartHypothesis &hypo, int featureID, Phrase &ret, size_t size) const - { + size_t CalcSuffix(const ChartHypothesis &hypo, int featureID, Phrase &ret, size_t size) const { CHECK(m_contextPrefix.GetSize() <= m_numTargetTerminals); // special handling for small hypotheses @@ -81,7 +79,7 @@ private: else { const TargetPhrase& target = hypo.GetCurrTargetPhrase(); const AlignmentInfo::NonTermIndexMap &nonTermIndexMap = - target.GetAlignNonTerm().GetNonTermIndexMap(); + target.GetAlignNonTerm().GetNonTermIndexMap(); for (int pos = (int) target.GetSize() - 1; pos >= 0 ; --pos) { const Word &word = target.GetWord(pos); @@ -89,8 +87,7 @@ private: size_t nonTermInd = nonTermIndexMap[pos]; const ChartHypothesis *prevHypo = hypo.GetPrevHypo(nonTermInd); size = static_cast(prevHypo->GetFFState(featureID))->CalcSuffix(*prevHypo, featureID, ret, size); - } - else { + } else { ret.PrependWord(hypo.GetCurrTargetPhrase().GetWord(pos)); size--; } @@ -106,11 +103,10 @@ private: public: LanguageModelChartState(const ChartHypothesis &hypo, int featureID, size_t order) - :m_lmRightContext(NULL) - ,m_contextPrefix(order - 1) - ,m_contextSuffix( order - 1) - ,m_hypo(hypo) - { + :m_lmRightContext(NULL) + ,m_contextPrefix(order - 1) + ,m_contextSuffix( order - 1) + ,m_hypo(hypo) { m_numTargetTerminals = hypo.GetCurrTargetPhrase().GetNumTerminals(); for (std::vector::const_iterator i = hypo.GetPrevHypos().begin(); i != 
hypo.GetPrevHypos().end(); ++i) { @@ -131,8 +127,12 @@ public: m_lmRightContext = rightState; } - float GetPrefixScore() const { return m_prefixScore; } - FFState* GetRightContext() const { return m_lmRightContext; } + float GetPrefixScore() const { + return m_prefixScore; + } + FFState* GetRightContext() const { + return m_lmRightContext; + } size_t GetNumTargetTerminals() const { return m_numTargetTerminals; @@ -150,8 +150,7 @@ public: dynamic_cast( o ); // prefix - if (m_hypo.GetCurrSourceRange().GetStartPos() > 0) // not for " ..." - { + if (m_hypo.GetCurrSourceRange().GetStartPos() > 0) { // not for " ..." int ret = GetPrefix().Compare(other.GetPrefix()); if (ret != 0) return ret; @@ -159,8 +158,7 @@ public: // suffix size_t inputSize = m_hypo.GetManager().GetSource().GetSize(); - if (m_hypo.GetCurrSourceRange().GetEndPos() < inputSize - 1)// not for "... " - { + if (m_hypo.GetCurrSourceRange().GetEndPos() < inputSize - 1) { // not for "... " int ret = other.GetRightContext()->Compare(*m_lmRightContext); if (ret != 0) return ret; diff --git a/moses/LM/IRST.cpp b/moses/LM/IRST.cpp index 2d58bd310..ae1bb677d 100644 --- a/moses/LM/IRST.cpp +++ b/moses/LM/IRST.cpp @@ -40,25 +40,22 @@ using namespace std; namespace Moses { LanguageModelIRST::LanguageModelIRST(const std::string &line) -:LanguageModelSingleFactor("IRSTLM", line) + :LanguageModelSingleFactor("IRSTLM", line) { FactorType factorType; size_t nGramOrder; string filePath; for (size_t i = 0; i < m_args.size(); ++i) { - const vector &args = m_args[i]; + const vector &args = m_args[i]; if (args[0] == "factor") { factorType = Scan(args[1]); - } - else if (args[0] == "order") { + } else if (args[0] == "order") { nGramOrder = Scan(args[1]); - } - else if (args[0] == "path") { + } else if (args[0] == "path") { filePath = args[1]; - } - else { + } else { throw "Unknown argument " + args[0]; } } @@ -86,8 +83,7 @@ bool LanguageModelIRST::Load(const std::string &filePath, const StaticData &staticData = StaticData::Instance(); int threadCount = staticData.ThreadCount(); - if (threadCount != 1) - { + if (threadCount != 1) { UserMessage::Add(threadCount + " number of threads specified but IRST LM is not threadsafe."); return false; } @@ -99,7 +95,7 @@ bool LanguageModelIRST::Load(const std::string &filePath, m_filePath = filePath; - m_lmtb = m_lmtb->CreateLanguageModel(m_filePath); + m_lmtb = m_lmtb->CreateLanguageModel(m_filePath); m_lmtb->setMaxLoadedLevel(1000); m_lmtb->load(m_filePath); d=m_lmtb->getDict(); @@ -170,7 +166,7 @@ int LanguageModelIRST::GetLmID( const std::string &str ) const } int LanguageModelIRST::GetLmID( const Factor *factor ) const -{ +{ size_t factorId = factor->GetId(); if ((factorId >= m_lmIdLookup.size()) || (m_lmIdLookup[factorId] == m_empty)) { @@ -180,12 +176,12 @@ int LanguageModelIRST::GetLmID( const Factor *factor ) const ////////// ///poiche' non c'e' distinzione tra i factorIDs delle parole sorgenti - ///e delle parole target in Moses, puo' accadere che una parola target + ///e delle parole target in Moses, puo' accadere che una parola target ///di cui non sia stato ancora calcolato il suo codice target abbia ///comunque un factorID noto (e quindi minore di m_lmIdLookup.size()) ///E' necessario dunque identificare questi casi di indeterminatezza ///del codice target. 
Attualamente, questo controllo e' stato implementato - ///impostando a m_empty tutti i termini che non hanno ancora + ///impostando a m_empty tutti i termini che non hanno ancora //ricevuto un codice target effettivo /////////// @@ -197,7 +193,7 @@ int LanguageModelIRST::GetLmID( const Factor *factor ) const /// IN POSIZIONE (factorID-1) invece che in posizione factrID dove dopo andiamo a leggerlo (vedi caso C /// Cosi' funziona .... /// ho un dubbio su cosa c'e' nelle prime posizioni di m_lmIdLookup -/// quindi +/// quindi /// e scopro che rimane vuota una entry ogni due /// perche' factorID cresce di due in due (perche' codifica sia source che target) "vuota" la posizione (factorID-1) /// non da problemi di correttezza, ma solo di "spreco" di memoria @@ -207,10 +203,10 @@ int LanguageModelIRST::GetLmID( const Factor *factor ) const //////////////// - if (factorId >= m_lmIdLookup.size()){ - //resize and fill with m_empty - //increment the array more than needed to avoid too many resizing operation. - m_lmIdLookup.resize(factorId+10, m_empty); + if (factorId >= m_lmIdLookup.size()) { + //resize and fill with m_empty + //increment the array more than needed to avoid too many resizing operation. + m_lmIdLookup.resize(factorId+10, m_empty); } //insert new code diff --git a/moses/LM/Implementation.cpp b/moses/LM/Implementation.cpp index 798a12775..e9c651089 100644 --- a/moses/LM/Implementation.cpp +++ b/moses/LM/Implementation.cpp @@ -69,8 +69,9 @@ void LanguageModelImplementation::GetState( GetValueForgotState(contextFactor, state); } -// Calculate score of a phrase. -void LanguageModelImplementation::CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const { +// Calculate score of a phrase. +void LanguageModelImplementation::CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const +{ fullScore = 0; ngramScore = 0; @@ -82,7 +83,7 @@ void LanguageModelImplementation::CalcScore(const Phrase &phrase, float &fullSco vector contextFactor; contextFactor.reserve(GetNGramOrder()); std::auto_ptr state(NewState((phrase.GetWord(0) == GetSentenceStartWord()) ? - GetBeginSentenceState() : GetNullContextState())); + GetBeginSentenceState() : GetNullContextState())); size_t currPos = 0; while (currPos < phraseSize) { const Word &word = phrase.GetWord(currPos); @@ -109,7 +110,7 @@ void LanguageModelImplementation::CalcScore(const Phrase &phrase, float &fullSco fullScore += result.score; if (contextFactor.size() == GetNGramOrder()) ngramScore += result.score; - if (result.unknown) ++oovCount; + if (result.unknown) ++oovCount; } } @@ -117,7 +118,8 @@ void LanguageModelImplementation::CalcScore(const Phrase &phrase, float &fullSco } } -FFState *LanguageModelImplementation::Evaluate(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out) const { +FFState *LanguageModelImplementation::Evaluate(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out) const +{ // In this function, we only compute the LM scores of n-grams that overlap a // phrase boundary. Phrase-internal scores are taken directly from the // translation option. 
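(Reviewer note: the comment above states the invariant that makes stateful LM scoring cheap: CalcScore() already charges every n-gram that fits wholly inside a phrase, so Evaluate() only rescores the n-grams straddling the join with the previous hypothesis, i.e. those ending in the first order-1 words of the new phrase. A minimal sketch of that boundary loop under a toy scorer; ScoreBoundary, LogProb and the plain-string word type are illustrative, not Moses API:)

    #include <algorithm>
    #include <cstddef>
    #include <functional>
    #include <string>
    #include <vector>

    // score(ngram) returns the log-prob of the last word given the preceding ones.
    using LogProb = std::function<double(const std::vector<std::string>&)>;

    // Charge only the n-grams that cross the boundary between 'history'
    // (suffix of the previous hypothesis) and 'phrase' (words being appended);
    // everything further inside 'phrase' is assumed pre-paid by CalcScore().
    double ScoreBoundary(const std::vector<std::string>& history,
                         const std::vector<std::string>& phrase,
                         std::size_t order, const LogProb& score) {
      double total = 0.0;
      const std::size_t overlap = std::min<std::size_t>(order - 1, phrase.size());
      for (std::size_t i = 0; i < overlap; ++i) {
        std::vector<std::string> ngram;
        const std::size_t need = order - 1 - i;  // context words taken from history
        const std::size_t start = history.size() > need ? history.size() - need : 0;
        ngram.assign(history.begin() + start, history.end());
        ngram.insert(ngram.end(), phrase.begin(), phrase.begin() + i + 1);
        total += score(ngram);                   // n-gram ending at phrase[i]
      }
      return total;
    }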
@@ -179,9 +181,7 @@ FFState *LanguageModelImplementation::Evaluate(const Hypothesis &hypo, const FFS contextFactor[i] = &hypo.GetWord((size_t)currPos); } lmScore += GetValueForgotState(contextFactor, *res).score; - } - else - { + } else { if (endPos < currEndPos) { //need to get the LM state (otherwise the last LM state is fine) for (size_t currPos = endPos+1; currPos <= currEndPos; currPos++) { @@ -208,7 +208,8 @@ FFState *LanguageModelImplementation::Evaluate(const Hypothesis &hypo, const FFS return res; } -FFState* LanguageModelImplementation::EvaluateChart(const ChartHypothesis& hypo, int featureID, ScoreComponentCollection* out) const { +FFState* LanguageModelImplementation::EvaluateChart(const ChartHypothesis& hypo, int featureID, ScoreComponentCollection* out) const +{ LanguageModelChartState *ret = new LanguageModelChartState(hypo, featureID, GetNGramOrder()); // data structure for factored context phrase (history and predicted word) vector contextFactor; @@ -223,38 +224,33 @@ FFState* LanguageModelImplementation::EvaluateChart(const ChartHypothesis& hypo, // get index map for underlying hypotheses const AlignmentInfo::NonTermIndexMap &nonTermIndexMap = - hypo.GetCurrTargetPhrase().GetAlignNonTerm().GetNonTermIndexMap(); + hypo.GetCurrTargetPhrase().GetAlignNonTerm().GetNonTermIndexMap(); // loop over rule for (size_t phrasePos = 0, wordPos = 0; phrasePos < hypo.GetCurrTargetPhrase().GetSize(); - phrasePos++) - { + phrasePos++) { // consult rule for either word or non-terminal const Word &word = hypo.GetCurrTargetPhrase().GetWord(phrasePos); // regular word - if (!word.IsNonTerminal()) - { + if (!word.IsNonTerminal()) { ShiftOrPush(contextFactor, word); // beginning of sentence symbol ? -> just update state - if (word == GetSentenceStartWord()) - { + if (word == GetSentenceStartWord()) { CHECK(phrasePos == 0); delete lmState; lmState = NewState( GetBeginSentenceState() ); } // score a regular word added by the rule - else - { + else { updateChartScore( &prefixScore, &finalizedScore, GetValueGivenState(contextFactor, *lmState).score, ++wordPos ); } } // non-terminal, add phrase from underlying hypothesis - else - { + else { // look up underlying hypothesis size_t nonTermIndex = nonTermIndexMap[phrasePos]; const ChartHypothesis *prevHypo = hypo.GetPrevHypo(nonTermIndex); @@ -278,8 +274,7 @@ FFState* LanguageModelImplementation::EvaluateChart(const ChartHypothesis& hypo, // push suffix int suffixPos = prevState->GetSuffix().GetSize() - (GetNGramOrder()-1); if (suffixPos < 0) suffixPos = 0; // push all words if less than order - for(;(size_t)suffixPos < prevState->GetSuffix().GetSize(); suffixPos++) - { + for(; (size_t)suffixPos < prevState->GetSuffix().GetSize(); suffixPos++) { const Word &word = prevState->GetSuffix().GetWord(suffixPos); ShiftOrPush(contextFactor, word); wordPos++; @@ -287,22 +282,19 @@ FFState* LanguageModelImplementation::EvaluateChart(const ChartHypothesis& hypo, } // internal non-terminal - else - { + else { // score its prefix for(size_t prefixPos = 0; prefixPos < GetNGramOrder()-1 // up to LM order window - && prefixPos < subPhraseLength; // up to length - prefixPos++) - { + && prefixPos < subPhraseLength; // up to length + prefixPos++) { const Word &word = prevState->GetPrefix().GetWord(prefixPos); ShiftOrPush(contextFactor, word); updateChartScore( &prefixScore, &finalizedScore, GetValueGivenState(contextFactor, *lmState).score, ++wordPos ); } // check if we are dealing with a large sub-phrase - if (subPhraseLength > GetNGramOrder() - 1) - { + if 
(subPhraseLength > GetNGramOrder() - 1) { // add its finalized language model score finalizedScore += prevHypo->GetScoreBreakdown().GetScoresForProducer(this)[0] // full score @@ -337,11 +329,11 @@ FFState* LanguageModelImplementation::EvaluateChart(const ChartHypothesis& hypo, return ret; } -void LanguageModelImplementation::updateChartScore(float *prefixScore, float *finalizedScore, float score, size_t wordPos) const { +void LanguageModelImplementation::updateChartScore(float *prefixScore, float *finalizedScore, float score, size_t wordPos) const +{ if (wordPos < GetNGramOrder()) { *prefixScore += score; - } - else { + } else { *finalizedScore += score; } } diff --git a/moses/LM/Implementation.h b/moses/LM/Implementation.h index d3f83dfe1..fa6619208 100644 --- a/moses/LM/Implementation.h +++ b/moses/LM/Implementation.h @@ -44,7 +44,7 @@ class Phrase; struct LMResult { // log probability float score; - // Is the word unknown? + // Is the word unknown? bool unknown; }; @@ -62,7 +62,7 @@ protected: //! Usually and LanguageModelImplementation(const std::string& description, const std::string &line) - :LanguageModel(description, line) + :LanguageModel(description, line) {} public: @@ -108,8 +108,7 @@ public: return m_sentenceEndWord; } - const FFState* EmptyHypothesisState(const InputType &/*input*/) const - { + const FFState* EmptyHypothesisState(const InputType &/*input*/) const { return NewState(GetBeginSentenceState()); } diff --git a/moses/LM/Joint.h b/moses/LM/Joint.h index 5bc52e2da..3a675cbd6 100644 --- a/moses/LM/Joint.h +++ b/moses/LM/Joint.h @@ -50,8 +50,7 @@ protected: size_t m_implFactor; public: LanguageModelJoint(const std::string &line, LanguageModelSingleFactor *lmImpl) - :LanguageModelMultiFactor("JointLM", line) - { + :LanguageModelMultiFactor("JointLM", line) { m_lmImpl = lmImpl; } diff --git a/moses/LM/Ken.cpp b/moses/LM/Ken.cpp index e251661c3..af24ad858 100644 --- a/moses/LM/Ken.cpp +++ b/moses/LM/Ken.cpp @@ -45,8 +45,10 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA using namespace std; -namespace Moses { -namespace { +namespace Moses +{ +namespace +{ struct KenLMState : public FFState { lm::ngram::State state; @@ -61,63 +63,65 @@ struct KenLMState : public FFState { /* * An implementation of single factor LM using Ken's code. 
*/ -template class LanguageModelKen : public LanguageModel { - public: - LanguageModelKen(const std::string &description, const std::string &line, const std::string &file, FactorType factorType, bool lazy); +template class LanguageModelKen : public LanguageModel +{ +public: + LanguageModelKen(const std::string &description, const std::string &line, const std::string &file, FactorType factorType, bool lazy); - bool Useable(const Phrase &phrase) const { - return (phrase.GetSize()>0 && phrase.GetFactor(0, m_factorType) != NULL); - } + bool Useable(const Phrase &phrase) const { + return (phrase.GetSize()>0 && phrase.GetFactor(0, m_factorType) != NULL); + } - const FFState *EmptyHypothesisState(const InputType &/*input*/) const { - KenLMState *ret = new KenLMState(); - ret->state = m_ngram->BeginSentenceState(); - return ret; - } + const FFState *EmptyHypothesisState(const InputType &/*input*/) const { + KenLMState *ret = new KenLMState(); + ret->state = m_ngram->BeginSentenceState(); + return ret; + } - void CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const; + void CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const; - FFState *Evaluate(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out) const; + FFState *Evaluate(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out) const; - FFState *EvaluateChart(const ChartHypothesis& cur_hypo, int featureID, ScoreComponentCollection *accumulator) const; + FFState *EvaluateChart(const ChartHypothesis& cur_hypo, int featureID, ScoreComponentCollection *accumulator) const; - void IncrementalCallback(Incremental::Manager &manager) const { - manager.LMCallback(*m_ngram, m_lmIdLookup); - } + void IncrementalCallback(Incremental::Manager &manager) const { + manager.LMCallback(*m_ngram, m_lmIdLookup); + } - private: - LanguageModelKen(const LanguageModelKen ©_from); +private: + LanguageModelKen(const LanguageModelKen ©_from); - lm::WordIndex TranslateID(const Word &word) const { - std::size_t factor = word.GetFactor(m_factorType)->GetId(); - return (factor >= m_lmIdLookup.size() ? 0 : m_lmIdLookup[factor]); - } + lm::WordIndex TranslateID(const Word &word) const { + std::size_t factor = word.GetFactor(m_factorType)->GetId(); + return (factor >= m_lmIdLookup.size() ? 0 : m_lmIdLookup[factor]); + } - // Convert last words of hypothesis into vocab ids, returning an end pointer. - lm::WordIndex *LastIDs(const Hypothesis &hypo, lm::WordIndex *indices) const { - lm::WordIndex *index = indices; - lm::WordIndex *end = indices + m_ngram->Order() - 1; - int position = hypo.GetCurrTargetWordsRange().GetEndPos(); - for (; ; ++index, --position) { - if (index == end) return index; - if (position == -1) { - *index = m_ngram->GetVocabulary().BeginSentence(); - return index + 1; - } - *index = TranslateID(hypo.GetWord(position)); + // Convert last words of hypothesis into vocab ids, returning an end pointer. 
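+  // (Reviewer note, illustrative: LastIDs() walks the hypothesis right to
+  //  left, most recent word first. With order 5 and target words "a b c",
+  //  it fills indices = { id(c), id(b), id(a), BeginSentence() } and
+  //  returns a pointer one past the last slot written.)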
+ lm::WordIndex *LastIDs(const Hypothesis &hypo, lm::WordIndex *indices) const { + lm::WordIndex *index = indices; + lm::WordIndex *end = indices + m_ngram->Order() - 1; + int position = hypo.GetCurrTargetWordsRange().GetEndPos(); + for (; ; ++index, --position) { + if (index == end) return index; + if (position == -1) { + *index = m_ngram->GetVocabulary().BeginSentence(); + return index + 1; } + *index = TranslateID(hypo.GetWord(position)); } + } - boost::shared_ptr m_ngram; - - std::vector m_lmIdLookup; + boost::shared_ptr m_ngram; - FactorType m_factorType; + std::vector m_lmIdLookup; - const Factor *m_beginSentenceFactor; + FactorType m_factorType; + + const Factor *m_beginSentenceFactor; }; -class MappingBuilder : public lm::EnumerateVocab { +class MappingBuilder : public lm::EnumerateVocab +{ public: MappingBuilder(FactorCollection &factorCollection, std::vector &mapping) : m_factorCollection(factorCollection), m_mapping(mapping) {} @@ -137,13 +141,14 @@ private: }; template LanguageModelKen::LanguageModelKen(const std::string &description, const std::string &line, const std::string &file, FactorType factorType, bool lazy) -:LanguageModel(description, line) -,m_factorType(factorType) + :LanguageModel(description, line) + ,m_factorType(factorType) { lm::ngram::Config config; IFVERBOSE(1) { config.messages = &std::cerr; - } else { + } + else { config.messages = NULL; } FactorCollection &collection = FactorCollection::Instance(); @@ -157,15 +162,17 @@ template LanguageModelKen::LanguageModelKen(const std::stri } template LanguageModelKen::LanguageModelKen(const LanguageModelKen ©_from) -:LanguageModel(copy_from.GetScoreProducerDescription(), copy_from.GetArgLine()), -m_ngram(copy_from.m_ngram), + :LanguageModel(copy_from.GetScoreProducerDescription(), copy_from.GetArgLine()), + m_ngram(copy_from.m_ngram), // TODO: don't copy this. 
-m_lmIdLookup(copy_from.m_lmIdLookup), -m_factorType(copy_from.m_factorType), -m_beginSentenceFactor(copy_from.m_beginSentenceFactor) { + m_lmIdLookup(copy_from.m_lmIdLookup), + m_factorType(copy_from.m_factorType), + m_beginSentenceFactor(copy_from.m_beginSentenceFactor) +{ } -template void LanguageModelKen::CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const { +template void LanguageModelKen::CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const +{ fullScore = 0; ngramScore = 0; oovCount = 0; @@ -174,7 +181,7 @@ template void LanguageModelKen::CalcScore(const Phrase &phr lm::ngram::ChartState discarded_sadly; lm::ngram::RuleScore scorer(*m_ngram, discarded_sadly); - + size_t position; if (m_beginSentenceFactor == phrase.GetWord(0).GetFactor(m_factorType)) { scorer.BeginSentence(); @@ -182,7 +189,7 @@ template void LanguageModelKen::CalcScore(const Phrase &phr } else { position = 0; } - + size_t ngramBoundary = m_ngram->Order() - 1; size_t end_loop = std::min(ngramBoundary, phrase.GetSize()); @@ -199,7 +206,7 @@ template void LanguageModelKen::CalcScore(const Phrase &phr } float before_boundary = fullScore + scorer.Finish(); for (; position < phrase.GetSize(); ++position) { - const Word &word = phrase.GetWord(position); + const Word &word = phrase.GetWord(position); if (word.IsNonTerminal()) { fullScore += scorer.Finish(); scorer.Reset(); @@ -207,7 +214,7 @@ template void LanguageModelKen::CalcScore(const Phrase &phr lm::WordIndex index = TranslateID(word); scorer.Terminal(index); if (!index) ++oovCount; - } + } } fullScore += scorer.Finish(); @@ -215,11 +222,12 @@ template void LanguageModelKen::CalcScore(const Phrase &phr fullScore = TransformLMScore(fullScore); } -template FFState *LanguageModelKen::Evaluate(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out) const { +template FFState *LanguageModelKen::Evaluate(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out) const +{ const lm::ngram::State &in_state = static_cast(*ps).state; std::auto_ptr ret(new KenLMState()); - + if (!hypo.GetCurrTargetLength()) { ret->state = in_state; return ret.release(); @@ -242,17 +250,17 @@ template FFState *LanguageModelKen::Evaluate(const Hypothes } if (hypo.IsSourceCompleted()) { - // Score end of sentence. + // Score end of sentence. std::vector indices(m_ngram->Order() - 1); const lm::WordIndex *last = LastIDs(hypo, &indices.front()); score += m_ngram->FullScoreForgotState(&indices.front(), last, m_ngram->GetVocabulary().EndSentence(), ret->state).prob; } else if (adjust_end < end) { - // Get state after adding a long phrase. + // Get state after adding a long phrase. std::vector indices(m_ngram->Order() - 1); const lm::WordIndex *last = LastIDs(hypo, &indices.front()); m_ngram->GetState(&indices.front(), last, ret->state); } else if (state0 != &ret->state) { - // Short enough phrase that we can just reuse the state. + // Short enough phrase that we can just reuse the state. 
ret->state = *state0; } @@ -270,34 +278,39 @@ template FFState *LanguageModelKen::Evaluate(const Hypothes return ret.release(); } -class LanguageModelChartStateKenLM : public FFState { - public: - LanguageModelChartStateKenLM() {} +class LanguageModelChartStateKenLM : public FFState +{ +public: + LanguageModelChartStateKenLM() {} - const lm::ngram::ChartState &GetChartState() const { return m_state; } - lm::ngram::ChartState &GetChartState() { return m_state; } + const lm::ngram::ChartState &GetChartState() const { + return m_state; + } + lm::ngram::ChartState &GetChartState() { + return m_state; + } - int Compare(const FFState& o) const - { - const LanguageModelChartStateKenLM &other = static_cast(o); - int ret = m_state.Compare(other.m_state); - return ret; - } + int Compare(const FFState& o) const { + const LanguageModelChartStateKenLM &other = static_cast(o); + int ret = m_state.Compare(other.m_state); + return ret; + } - private: - lm::ngram::ChartState m_state; +private: + lm::ngram::ChartState m_state; }; -template FFState *LanguageModelKen::EvaluateChart(const ChartHypothesis& hypo, int featureID, ScoreComponentCollection *accumulator) const { +template FFState *LanguageModelKen::EvaluateChart(const ChartHypothesis& hypo, int featureID, ScoreComponentCollection *accumulator) const +{ LanguageModelChartStateKenLM *newState = new LanguageModelChartStateKenLM(); lm::ngram::RuleScore ruleScore(*m_ngram, newState->GetChartState()); const TargetPhrase &target = hypo.GetCurrTargetPhrase(); const AlignmentInfo::NonTermIndexMap &nonTermIndexMap = - target.GetAlignNonTerm().GetNonTermIndexMap(); + target.GetAlignNonTerm().GetNonTermIndexMap(); const size_t size = hypo.GetCurrTargetPhrase().GetSize(); size_t phrasePos = 0; - // Special cases for first word. + // Special cases for first word. if (size) { const Word &word = hypo.GetCurrTargetPhrase().GetWord(0); if (word.GetFactor(m_factorType) == m_beginSentenceFactor) { @@ -305,7 +318,7 @@ template FFState *LanguageModelKen::EvaluateChart(const Cha ruleScore.BeginSentence(); phrasePos++; } else if (word.IsNonTerminal()) { - // Non-terminal is first so we can copy instead of rescoring. + // Non-terminal is first so we can copy instead of rescoring. const ChartHypothesis *prevHypo = hypo.GetPrevHypo(nonTermIndexMap[phrasePos]); const lm::ngram::ChartState &prevState = static_cast(prevHypo->GetFFState(featureID))->GetChartState(); float prob = UntransformLMScore(prevHypo->GetScoreBreakdown().GetScoresForProducer(this)[0]); @@ -347,20 +360,15 @@ LanguageModel *ConstructKenLM(const std::string &description, const std::string if (args[0] == "factor") { factorType = Scan(args[1]); - } - else if (args[0] == "order") { + } else if (args[0] == "order") { //nGramOrder = Scan(args[1]); - } - else if (args[0] == "path") { + } else if (args[0] == "path") { filePath = args[1]; - } - else if (args[0] == "lazyken") { + } else if (args[0] == "lazyken") { lazy = Scan(args[1]); - } - else if (args[0] == "name") { + } else if (args[0] == "name") { // that's ok. 
do nothing, passes onto LM constructor - } - else { + } else { throw "Unknown argument " + args[0]; } } @@ -368,26 +376,27 @@ LanguageModel *ConstructKenLM(const std::string &description, const std::string return ConstructKenLM(description, line, filePath, factorType, lazy); } -LanguageModel *ConstructKenLM(const std::string &description, const std::string &line, const std::string &file, FactorType factorType, bool lazy) { +LanguageModel *ConstructKenLM(const std::string &description, const std::string &line, const std::string &file, FactorType factorType, bool lazy) +{ try { lm::ngram::ModelType model_type; if (lm::ngram::RecognizeBinary(file.c_str(), model_type)) { switch(model_type) { - case lm::ngram::PROBING: - return new LanguageModelKen(description, line, file, factorType, lazy); - case lm::ngram::REST_PROBING: - return new LanguageModelKen(description, line, file, factorType, lazy); - case lm::ngram::TRIE: - return new LanguageModelKen(description, line, file, factorType, lazy); - case lm::ngram::QUANT_TRIE: - return new LanguageModelKen(description, line, file, factorType, lazy); - case lm::ngram::ARRAY_TRIE: - return new LanguageModelKen(description, line, file, factorType, lazy); - case lm::ngram::QUANT_ARRAY_TRIE: - return new LanguageModelKen(description, line, file, factorType, lazy); - default: - std::cerr << "Unrecognized kenlm model type " << model_type << std::endl; - abort(); + case lm::ngram::PROBING: + return new LanguageModelKen(description, line, file, factorType, lazy); + case lm::ngram::REST_PROBING: + return new LanguageModelKen(description, line, file, factorType, lazy); + case lm::ngram::TRIE: + return new LanguageModelKen(description, line, file, factorType, lazy); + case lm::ngram::QUANT_TRIE: + return new LanguageModelKen(description, line, file, factorType, lazy); + case lm::ngram::ARRAY_TRIE: + return new LanguageModelKen(description, line, file, factorType, lazy); + case lm::ngram::QUANT_ARRAY_TRIE: + return new LanguageModelKen(description, line, file, factorType, lazy); + default: + std::cerr << "Unrecognized kenlm model type " << model_type << std::endl; + abort(); } } else { return new LanguageModelKen(description, line, file, factorType, lazy); diff --git a/moses/LM/Ken.h b/moses/LM/Ken.h index 3c2ceb774..360ac7be8 100644 --- a/moses/LM/Ken.h +++ b/moses/LM/Ken.h @@ -26,7 +26,8 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #include "moses/TypeDef.h" -namespace Moses { +namespace Moses +{ class LanguageModel; diff --git a/moses/LM/LDHT.cpp b/moses/LM/LDHT.cpp index 5f52b2514..1b4e70661 100644 --- a/moses/LM/LDHT.cpp +++ b/moses/LM/LDHT.cpp @@ -16,7 +16,8 @@ #include -namespace Moses { +namespace Moses +{ struct LDHTLMState : public FFState { LDHT::NewNgram gram_fingerprints; @@ -60,338 +61,358 @@ struct LDHTLMState : public FFState { } }; -class LanguageModelLDHT : public LanguageModel { +class LanguageModelLDHT : public LanguageModel +{ public: - LanguageModelLDHT(); - LanguageModelLDHT(const std::string& path, - ScoreIndexManager& manager, - FactorType factorType); - LanguageModelLDHT(ScoreIndexManager& manager, - LanguageModelLDHT& copyFrom); + LanguageModelLDHT(); + LanguageModelLDHT(const std::string& path, + ScoreIndexManager& manager, + FactorType factorType); + LanguageModelLDHT(ScoreIndexManager& manager, + LanguageModelLDHT& copyFrom); - LDHT::Client* getClientUnsafe() const; - LDHT::Client* getClientSafe(); - LDHT::Client* initTSSClient(); - virtual ~LanguageModelLDHT(); - virtual void 
InitializeForInput(InputType const& source); - virtual void CleanUpAfterSentenceProcessing(const InputType &source); - virtual const FFState* EmptyHypothesisState(const InputType& input) const; - virtual bool Useable(const Phrase& phrase) const; - virtual void CalcScore(const Phrase& phrase, - float& fullScore, - float& ngramScore, - std::size_t& oovCount) const; - virtual void CalcScoreFromCache(const Phrase& phrase, - float& fullScore, - float& ngramScore, - std::size_t& oovCount) const; - FFState* Evaluate(const Hypothesis& hypo, - const FFState* input_state, - ScoreComponentCollection* score_output) const; - FFState* EvaluateChart(const ChartHypothesis& hypo, - int featureID, - ScoreComponentCollection* accumulator) const; + LDHT::Client* getClientUnsafe() const; + LDHT::Client* getClientSafe(); + LDHT::Client* initTSSClient(); + virtual ~LanguageModelLDHT(); + virtual void InitializeForInput(InputType const& source); + virtual void CleanUpAfterSentenceProcessing(const InputType &source); + virtual const FFState* EmptyHypothesisState(const InputType& input) const; + virtual bool Useable(const Phrase& phrase) const; + virtual void CalcScore(const Phrase& phrase, + float& fullScore, + float& ngramScore, + std::size_t& oovCount) const; + virtual void CalcScoreFromCache(const Phrase& phrase, + float& fullScore, + float& ngramScore, + std::size_t& oovCount) const; + FFState* Evaluate(const Hypothesis& hypo, + const FFState* input_state, + ScoreComponentCollection* score_output) const; + FFState* EvaluateChart(const ChartHypothesis& hypo, + int featureID, + ScoreComponentCollection* accumulator) const; - virtual void IssueRequestsFor(Hypothesis& hypo, - const FFState* input_state); - float calcScoreFromState(LDHTLMState* hypo) const; - void sync(); - void SetFFStateIdx(int state_idx); + virtual void IssueRequestsFor(Hypothesis& hypo, + const FFState* input_state); + float calcScoreFromState(LDHTLMState* hypo) const; + void sync(); + void SetFFStateIdx(int state_idx); protected: - boost::thread_specific_ptr m_client; - std::string m_configPath; - FactorType m_factorType; - int m_state_idx; - int m_calc_score_count; - uint64_t m_start_tick; + boost::thread_specific_ptr m_client; + std::string m_configPath; + FactorType m_factorType; + int m_state_idx; + int m_calc_score_count; + uint64_t m_start_tick; }; LanguageModel* ConstructLDHTLM(const std::string& path, ScoreIndexManager& manager, - FactorType factorType) { - return new LanguageModelLDHT(path, manager, factorType); + FactorType factorType) +{ + return new LanguageModelLDHT(path, manager, factorType); } -LanguageModelLDHT::LanguageModelLDHT() : LanguageModel(), m_client(NULL) { - m_enableOOVFeature = false; +LanguageModelLDHT::LanguageModelLDHT() : LanguageModel(), m_client(NULL) +{ + m_enableOOVFeature = false; } LanguageModelLDHT::LanguageModelLDHT(ScoreIndexManager& manager, - LanguageModelLDHT& copyFrom) { - m_calc_score_count = 0; - //m_client = copyFrom.m_client; - m_factorType = copyFrom.m_factorType; - m_configPath = copyFrom.m_configPath; - Init(manager); + LanguageModelLDHT& copyFrom) +{ + m_calc_score_count = 0; + //m_client = copyFrom.m_client; + m_factorType = copyFrom.m_factorType; + m_configPath = copyFrom.m_configPath; + Init(manager); } LanguageModelLDHT::LanguageModelLDHT(const std::string& path, ScoreIndexManager& manager, FactorType factorType) - : m_factorType(factorType) { - m_configPath = path; - Init(manager); + : m_factorType(factorType) +{ + m_configPath = path; + Init(manager); } 
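(Reviewer note on the LDHT flow being reformatted below: the decoder batches n-gram probability requests while expanding hypotheses (IssueRequestsFor), blocks once on awaitResponses() via sync(), and only afterwards reads the scores back by tag in Evaluate(). A minimal sketch of that request/sync/read pattern; FakeClient is a stand-in for LDHT::Client, whose real interface is only partly visible in this diff:)

    #include <cstddef>
    #include <string>
    #include <vector>

    // Stand-in for LDHT::Client, shaped after the calls visible in this file:
    // requestNgram() enqueues work and returns a tag, awaitResponses() blocks
    // until all pending scores have arrived, getNgramScore() reads one by tag.
    struct FakeClient {
      std::vector<double> scores;
      std::size_t requestNgram(const std::vector<std::string>& /*ngram*/) {
        scores.push_back(-1.0);       // pretend every n-gram scores log10 -1
        return scores.size() - 1;     // tag = index of the pending result
      }
      void awaitResponses() {}        // network round-trip elided
      double getNgramScore(std::size_t tag) const { return scores[tag]; }
    };

    double ScorePhrase(FakeClient& client,
                       const std::vector<std::vector<std::string> >& ngrams) {
      std::vector<std::size_t> tags;
      tags.reserve(ngrams.size());
      for (std::size_t i = 0; i < ngrams.size(); ++i)
        tags.push_back(client.requestNgram(ngrams[i]));  // 1) batch requests
      client.awaitResponses();                           // 2) one sync point
      double total = 0.0;
      for (std::size_t i = 0; i < tags.size(); ++i)
        total += client.getNgramScore(tags[i]);          // 3) read back by tag
      return total;
    }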
-LanguageModelLDHT::~LanguageModelLDHT() { - // TODO(wilson): should cleanup for each individual thread. - //delete getClientSafe(); +LanguageModelLDHT::~LanguageModelLDHT() +{ + // TODO(wilson): should cleanup for each individual thread. + //delete getClientSafe(); } // Check that there is a TSS Client instance, and instantiate one if // there isn't. -LDHT::Client* LanguageModelLDHT::getClientSafe() { - if (m_client.get() == NULL) - m_client.reset(initTSSClient()); - return m_client.get(); +LDHT::Client* LanguageModelLDHT::getClientSafe() +{ + if (m_client.get() == NULL) + m_client.reset(initTSSClient()); + return m_client.get(); } // Do not check that there is a TSS Client instance. -LDHT::Client* LanguageModelLDHT::getClientUnsafe() const { - return m_client.get(); +LDHT::Client* LanguageModelLDHT::getClientUnsafe() const +{ + return m_client.get(); } -LDHT::Client* LanguageModelLDHT::initTSSClient() { - std::ifstream config_file(m_configPath.c_str()); - std::string ldht_config_path; - getline(config_file, ldht_config_path); - std::string ldhtlm_config_path; - getline(config_file, ldhtlm_config_path); +LDHT::Client* LanguageModelLDHT::initTSSClient() +{ + std::ifstream config_file(m_configPath.c_str()); + std::string ldht_config_path; + getline(config_file, ldht_config_path); + std::string ldhtlm_config_path; + getline(config_file, ldhtlm_config_path); - LDHT::FactoryCollection* factory_collection = - LDHT::FactoryCollection::createDefaultFactoryCollection(); + LDHT::FactoryCollection* factory_collection = + LDHT::FactoryCollection::createDefaultFactoryCollection(); - LDHT::Client* client; - //client = new LDHT::ClientLocal(); - client = new LDHT::Client(); - client->fromXmlFiles(*factory_collection, - ldht_config_path, - ldhtlm_config_path); - return client; + LDHT::Client* client; + //client = new LDHT::ClientLocal(); + client = new LDHT::Client(); + client->fromXmlFiles(*factory_collection, + ldht_config_path, + ldhtlm_config_path); + return client; } -void LanguageModelLDHT::InitializeForInput(InputType const& source) { - getClientSafe()->clearCache(); - m_start_tick = LDHT::Util::rdtsc(); +void LanguageModelLDHT::InitializeForInput(InputType const& source) +{ + getClientSafe()->clearCache(); + m_start_tick = LDHT::Util::rdtsc(); } -void LanguageModelLDHT::CleanUpAfterSentenceProcessing(const InputType &source) { - LDHT::Client* client = getClientSafe(); +void LanguageModelLDHT::CleanUpAfterSentenceProcessing(const InputType &source) +{ + LDHT::Client* client = getClientSafe(); - std::cerr << "LDHT sentence stats:" << std::endl; - std::cerr << " ngrams submitted: " << client->getNumNgramsSubmitted() << std::endl - << " ngrams requested: " << client->getNumNgramsRequested() << std::endl - << " ngrams not found: " << client->getKeyNotFoundCount() << std::endl - << " cache hits: " << client->getCacheHitCount() << std::endl - << " inferences: " << client->getInferenceCount() << std::endl - << " pcnt latency: " << (float)client->getLatencyTicks() / (float)(LDHT::Util::rdtsc() - m_start_tick) * 100.0 << std::endl; - m_start_tick = 0; - client->resetLatencyTicks(); - client->resetNumNgramsSubmitted(); - client->resetNumNgramsRequested(); - client->resetInferenceCount(); - client->resetCacheHitCount(); - client->resetKeyNotFoundCount(); + std::cerr << "LDHT sentence stats:" << std::endl; + std::cerr << " ngrams submitted: " << client->getNumNgramsSubmitted() << std::endl + << " ngrams requested: " << client->getNumNgramsRequested() << std::endl + << " ngrams not found: " << 
client->getKeyNotFoundCount() << std::endl + << " cache hits: " << client->getCacheHitCount() << std::endl + << " inferences: " << client->getInferenceCount() << std::endl + << " pcnt latency: " << (float)client->getLatencyTicks() / (float)(LDHT::Util::rdtsc() - m_start_tick) * 100.0 << std::endl; + m_start_tick = 0; + client->resetLatencyTicks(); + client->resetNumNgramsSubmitted(); + client->resetNumNgramsRequested(); + client->resetInferenceCount(); + client->resetCacheHitCount(); + client->resetKeyNotFoundCount(); } const FFState* LanguageModelLDHT::EmptyHypothesisState( - const InputType& input) const { - return NULL; + const InputType& input) const +{ + return NULL; } -bool LanguageModelLDHT::Useable(const Phrase& phrase) const { - return (phrase.GetSize() > 0 && phrase.GetFactor(0, m_factorType) != NULL); +bool LanguageModelLDHT::Useable(const Phrase& phrase) const +{ + return (phrase.GetSize() > 0 && phrase.GetFactor(0, m_factorType) != NULL); } void LanguageModelLDHT::CalcScore(const Phrase& phrase, float& fullScore, float& ngramScore, - std::size_t& oovCount) const { - const_cast(this)->m_calc_score_count++; - if (m_calc_score_count > 10000) { - const_cast(this)->m_calc_score_count = 0; - const_cast(this)->sync(); - } + std::size_t& oovCount) const +{ + const_cast(this)->m_calc_score_count++; + if (m_calc_score_count > 10000) { + const_cast(this)->m_calc_score_count = 0; + const_cast(this)->sync(); + } - // TODO(wilson): handle nonterminal words. - LDHT::Client* client = getClientUnsafe(); - // Score the first order - 1 words of the phrase. - int order = LDHT::NewNgram::k_max_order; - int prefix_start = 0; - int prefix_end = std::min(phrase.GetSize(), static_cast(order - 1)); - LDHT::NewNgram ngram; - for (int word_idx = prefix_start; word_idx < prefix_end; ++word_idx) { - ngram.appendGram(phrase.GetWord(word_idx) - .GetFactor(m_factorType)->GetString().c_str()); - client->requestNgram(ngram); - } - // Now score all subsequent ngrams to end of phrase. - int internal_start = prefix_end; - int internal_end = phrase.GetSize(); - for (int word_idx = internal_start; word_idx < internal_end; ++word_idx) { - ngram.appendGram(phrase.GetWord(word_idx) - .GetFactor(m_factorType)->GetString().c_str()); - client->requestNgram(ngram); - } + // TODO(wilson): handle nonterminal words. + LDHT::Client* client = getClientUnsafe(); + // Score the first order - 1 words of the phrase. + int order = LDHT::NewNgram::k_max_order; + int prefix_start = 0; + int prefix_end = std::min(phrase.GetSize(), static_cast(order - 1)); + LDHT::NewNgram ngram; + for (int word_idx = prefix_start; word_idx < prefix_end; ++word_idx) { + ngram.appendGram(phrase.GetWord(word_idx) + .GetFactor(m_factorType)->GetString().c_str()); + client->requestNgram(ngram); + } + // Now score all subsequent ngrams to end of phrase. + int internal_start = prefix_end; + int internal_end = phrase.GetSize(); + for (int word_idx = internal_start; word_idx < internal_end; ++word_idx) { + ngram.appendGram(phrase.GetWord(word_idx) + .GetFactor(m_factorType)->GetString().c_str()); + client->requestNgram(ngram); + } - fullScore = 0; - ngramScore = 0; - oovCount = 0; + fullScore = 0; + ngramScore = 0; + oovCount = 0; } void LanguageModelLDHT::CalcScoreFromCache(const Phrase& phrase, - float& fullScore, - float& ngramScore, - std::size_t& oovCount) const { - // Issue requests for phrase internal ngrams. - // Sync if necessary. (or autosync). 
- const_cast(this)->sync(); + float& fullScore, + float& ngramScore, + std::size_t& oovCount) const +{ + // Issue requests for phrase internal ngrams. + // Sync if necessary. (or autosync). + const_cast(this)->sync(); - // TODO(wilson): handle nonterminal words. - LDHT::Client* client = getClientUnsafe(); - // Score the first order - 1 words of the phrase. - int order = LDHT::NewNgram::k_max_order; - int prefix_start = 0; - int prefix_end = std::min(phrase.GetSize(), static_cast(order - 1)); - LDHT::NewNgram ngram; - std::deque full_score_tags; - for (int word_idx = prefix_start; word_idx < prefix_end; ++word_idx) { - ngram.appendGram(phrase.GetWord(word_idx) - .GetFactor(m_factorType)->GetString().c_str()); - full_score_tags.push_back(client->requestNgram(ngram)); - } - // Now score all subsequent ngrams to end of phrase. - int internal_start = prefix_end; - int internal_end = phrase.GetSize(); - std::deque internal_score_tags; - for (int word_idx = internal_start; word_idx < internal_end; ++word_idx) { - ngram.appendGram(phrase.GetWord(word_idx) - .GetFactor(m_factorType)->GetString().c_str()); - internal_score_tags.push_back(client->requestNgram(ngram)); - } + // TODO(wilson): handle nonterminal words. + LDHT::Client* client = getClientUnsafe(); + // Score the first order - 1 words of the phrase. + int order = LDHT::NewNgram::k_max_order; + int prefix_start = 0; + int prefix_end = std::min(phrase.GetSize(), static_cast(order - 1)); + LDHT::NewNgram ngram; + std::deque full_score_tags; + for (int word_idx = prefix_start; word_idx < prefix_end; ++word_idx) { + ngram.appendGram(phrase.GetWord(word_idx) + .GetFactor(m_factorType)->GetString().c_str()); + full_score_tags.push_back(client->requestNgram(ngram)); + } + // Now score all subsequent ngrams to end of phrase. + int internal_start = prefix_end; + int internal_end = phrase.GetSize(); + std::deque internal_score_tags; + for (int word_idx = internal_start; word_idx < internal_end; ++word_idx) { + ngram.appendGram(phrase.GetWord(word_idx) + .GetFactor(m_factorType)->GetString().c_str()); + internal_score_tags.push_back(client->requestNgram(ngram)); + } - // Wait for resposes from the servers. - //client->awaitResponses(); + // Wait for resposes from the servers. + //client->awaitResponses(); - // Calculate the full phrase score, and the internal score. - fullScore = 0.0; - while (!full_score_tags.empty()) { - fullScore += client->getNgramScore(full_score_tags.front()); - full_score_tags.pop_front(); - } - ngramScore = 0.0; - while (!internal_score_tags.empty()) { - float score = client->getNgramScore(internal_score_tags.front()); - internal_score_tags.pop_front(); - fullScore += score; - ngramScore += score; - } - fullScore = TransformLMScore(fullScore); - ngramScore = TransformLMScore(ngramScore); - oovCount = 0; + // Calculate the full phrase score, and the internal score. + fullScore = 0.0; + while (!full_score_tags.empty()) { + fullScore += client->getNgramScore(full_score_tags.front()); + full_score_tags.pop_front(); + } + ngramScore = 0.0; + while (!internal_score_tags.empty()) { + float score = client->getNgramScore(internal_score_tags.front()); + internal_score_tags.pop_front(); + fullScore += score; + ngramScore += score; + } + fullScore = TransformLMScore(fullScore); + ngramScore = TransformLMScore(ngramScore); + oovCount = 0; } void LanguageModelLDHT::IssueRequestsFor(Hypothesis& hypo, - const FFState* input_state) { - // TODO(wilson): handle nonterminal words. 
- LDHT::Client* client = getClientUnsafe(); + const FFState* input_state) +{ + // TODO(wilson): handle nonterminal words. + LDHT::Client* client = getClientUnsafe(); - // Create a new state and copy the contents of the input_state if - // supplied. - LDHTLMState* new_state = new LDHTLMState(); - if (input_state == NULL) { - if (hypo.GetCurrTargetWordsRange().GetStartPos() != 0) { - V("got a null state but not at start of sentence"); - abort(); - } - new_state->gram_fingerprints.appendGram(BOS_); + // Create a new state and copy the contents of the input_state if + // supplied. + LDHTLMState* new_state = new LDHTLMState(); + if (input_state == NULL) { + if (hypo.GetCurrTargetWordsRange().GetStartPos() != 0) { + V("got a null state but not at start of sentence"); + abort(); } - else { - if (hypo.GetCurrTargetWordsRange().GetStartPos() == 0) { - V("got a non null state but at start of sentence"); - abort(); - } - new_state->copyFrom(static_cast(*input_state)); + new_state->gram_fingerprints.appendGram(BOS_); + } else { + if (hypo.GetCurrTargetWordsRange().GetStartPos() == 0) { + V("got a non null state but at start of sentence"); + abort(); } + new_state->copyFrom(static_cast(*input_state)); + } - // Score ngrams that overlap with the previous phrase. - int order = LDHT::NewNgram::k_max_order; - int phrase_start = hypo.GetCurrTargetWordsRange().GetStartPos(); - int phrase_end = hypo.GetCurrTargetWordsRange().GetEndPos() + 1; - int overlap_start = phrase_start; - int overlap_end = std::min(phrase_end, phrase_start + order - 1); - int word_idx = overlap_start; - LDHT::NewNgram& ngram = new_state->gram_fingerprints; - for (; word_idx < overlap_end; ++word_idx) { - ngram.appendGram( - hypo.GetFactor(word_idx, m_factorType)->GetString().c_str()); - new_state->appendRequestTag(client->requestNgram(ngram)); - } - // No need to score phrase internal ngrams, but keep track of them - // in the state (which in this case is the NewNgram containing the - // hashes of the individual grams). - for (; word_idx < phrase_end; ++word_idx) { - ngram.appendGram( - hypo.GetFactor(word_idx, m_factorType)->GetString().c_str()); - } - // If this is the last phrase in the sentence, score the last ngram - // with the end of sentence marker on it. - if (hypo.IsSourceCompleted()) { - ngram.appendGram(EOS_); - //request_tags.push_back(client->requestNgram(ngram)); - new_state->appendRequestTag(client->requestNgram(ngram)); - } - hypo.SetFFState(m_state_idx, new_state); + // Score ngrams that overlap with the previous phrase. + int order = LDHT::NewNgram::k_max_order; + int phrase_start = hypo.GetCurrTargetWordsRange().GetStartPos(); + int phrase_end = hypo.GetCurrTargetWordsRange().GetEndPos() + 1; + int overlap_start = phrase_start; + int overlap_end = std::min(phrase_end, phrase_start + order - 1); + int word_idx = overlap_start; + LDHT::NewNgram& ngram = new_state->gram_fingerprints; + for (; word_idx < overlap_end; ++word_idx) { + ngram.appendGram( + hypo.GetFactor(word_idx, m_factorType)->GetString().c_str()); + new_state->appendRequestTag(client->requestNgram(ngram)); + } + // No need to score phrase internal ngrams, but keep track of them + // in the state (which in this case is the NewNgram containing the + // hashes of the individual grams). + for (; word_idx < phrase_end; ++word_idx) { + ngram.appendGram( + hypo.GetFactor(word_idx, m_factorType)->GetString().c_str()); + } + // If this is the last phrase in the sentence, score the last ngram + // with the end of sentence marker on it. 
+ if (hypo.IsSourceCompleted()) { + ngram.appendGram(EOS_); + //request_tags.push_back(client->requestNgram(ngram)); + new_state->appendRequestTag(client->requestNgram(ngram)); + } + hypo.SetFFState(m_state_idx, new_state); } -void LanguageModelLDHT::sync() { - m_calc_score_count = 0; - getClientUnsafe()->awaitResponses(); +void LanguageModelLDHT::sync() +{ + m_calc_score_count = 0; + getClientUnsafe()->awaitResponses(); } -void LanguageModelLDHT::SetFFStateIdx(int state_idx) { - m_state_idx = state_idx; +void LanguageModelLDHT::SetFFStateIdx(int state_idx) +{ + m_state_idx = state_idx; } FFState* LanguageModelLDHT::Evaluate( - const Hypothesis& hypo, - const FFState* input_state_ignored, - ScoreComponentCollection* score_output) const { - // Input state is the state from the previous hypothesis, which - // we are not interested in. The requests for this hypo should - // already have been issued via IssueRequestsFor() and the LM then - // synced and all responses processed, and the tags placed in our - // FFState of hypo. - LDHTLMState* state = const_cast(static_cast(hypo.GetFFState(m_state_idx))); + const Hypothesis& hypo, + const FFState* input_state_ignored, + ScoreComponentCollection* score_output) const +{ + // Input state is the state from the previous hypothesis, which + // we are not interested in. The requests for this hypo should + // already have been issued via IssueRequestsFor() and the LM then + // synced and all responses processed, and the tags placed in our + // FFState of hypo. + LDHTLMState* state = const_cast(static_cast(hypo.GetFFState(m_state_idx))); - float score = calcScoreFromState(state); - score = FloorScore(TransformLMScore(score)); - score_output->PlusEquals(this, score); + float score = calcScoreFromState(state); + score = FloorScore(TransformLMScore(score)); + score_output->PlusEquals(this, score); - return state; + return state; } FFState* LanguageModelLDHT::EvaluateChart( - const ChartHypothesis& hypo, - int featureID, - ScoreComponentCollection* accumulator) const { - return NULL; + const ChartHypothesis& hypo, + int featureID, + ScoreComponentCollection* accumulator) const +{ + return NULL; } -float LanguageModelLDHT::calcScoreFromState(LDHTLMState* state) const { - float score = 0.0; - std::vector::iterator tag_iter; - LDHT::Client* client = getClientUnsafe(); - for (tag_iter = state->requestTagsBegin(); - tag_iter != state->requestTagsEnd(); - ++tag_iter) { - score += client->getNgramScore(*tag_iter); - } - state->clearRequestTags(); - state->setFinalised(); - return score; +float LanguageModelLDHT::calcScoreFromState(LDHTLMState* state) const +{ + float score = 0.0; + std::vector::iterator tag_iter; + LDHT::Client* client = getClientUnsafe(); + for (tag_iter = state->requestTagsBegin(); + tag_iter != state->requestTagsEnd(); + ++tag_iter) { + score += client->getNgramScore(*tag_iter); + } + state->clearRequestTags(); + state->setFinalised(); + return score; } } // namespace Moses. diff --git a/moses/LM/LDHT.h b/moses/LM/LDHT.h index a8489c0e3..8c5c3c36b 100644 --- a/moses/LM/LDHT.h +++ b/moses/LM/LDHT.h @@ -7,7 +7,8 @@ #include "moses/TypeDef.h" -namespace Moses { +namespace Moses +{ class ScoreIndexManager; class LanguageModel; diff --git a/moses/LM/MultiFactor.h b/moses/LM/MultiFactor.h index 491da4abe..21a9d493b 100644 --- a/moses/LM/MultiFactor.h +++ b/moses/LM/MultiFactor.h @@ -33,7 +33,7 @@ namespace Moses class Phrase; -/* Abstract class for for multi factor LM. Only inherited by the JointLM at the moment. 
diff --git a/moses/LM/LDHT.h b/moses/LM/LDHT.h
index a8489c0e3..8c5c3c36b 100644
--- a/moses/LM/LDHT.h
+++ b/moses/LM/LDHT.h
@@ -7,7 +7,8 @@
 #include "moses/TypeDef.h"

-namespace Moses {
+namespace Moses
+{

 class ScoreIndexManager;
 class LanguageModel;
diff --git a/moses/LM/MultiFactor.h b/moses/LM/MultiFactor.h
index 491da4abe..21a9d493b 100644
--- a/moses/LM/MultiFactor.h
+++ b/moses/LM/MultiFactor.h
@@ -33,7 +33,7 @@ namespace Moses

 class Phrase;

-/* Abstract class for for multi factor LM. Only inherited by the JointLM at the moment.
+/* Abstract class for multi factor LM. Only inherited by the JointLM at the moment.
  * Could use this when factored LM are implemented
  */
 class LanguageModelMultiFactor : public LanguageModelImplementation
@@ -41,16 +41,16 @@ class LanguageModelMultiFactor : public LanguageModelImplementation
 protected:
   FactorMask m_factorTypes;

-  LanguageModelMultiFactor(const std::string& description, const std::string &line)
-  :LanguageModelImplementation(description, line)
+  LanguageModelMultiFactor(const std::string& description, const std::string &line)
+    :LanguageModelImplementation(description, line)
   {}
-
+
 public:
   virtual bool Load(const std::string &filePath
                     , const std::vector<FactorType> &factorTypes
                     , size_t nGramOrder) = 0;

-  bool Useable(const Phrase &phrase) const;
+  bool Useable(const Phrase &phrase) const;
 };

 }
diff --git a/moses/LM/ORLM.cpp b/moses/LM/ORLM.cpp
index 226267ee2..44fd64efb 100644
--- a/moses/LM/ORLM.cpp
+++ b/moses/LM/ORLM.cpp
@@ -9,10 +9,11 @@
 #include "ORLM.h"

 using std::map;
-namespace Moses
+namespace Moses
+{
+bool LanguageModelORLM::Load(const std::string &filePath, FactorType factorType,
+                             size_t nGramOrder)
 {
-bool LanguageModelORLM::Load(const std::string &filePath, FactorType factorType,
-  size_t nGramOrder) {
   cerr << "Loading LanguageModelORLM..." << endl;
   m_filePath = filePath;
   m_factorType = factorType;
@@ -26,13 +27,14 @@ bool LanguageModelORLM::Load(const std::string &filePath, FactorType factorType,
   CreateFactors();
   return true;
 }
-void LanguageModelORLM::CreateFactors() {
+void LanguageModelORLM::CreateFactors()
+{
   FactorCollection &factorCollection = FactorCollection::Instance();
   size_t maxFactorId = 0; // to create lookup vector later on
   std::map<size_t, wordID_t> m_lmids_map; // map from factor id -> word id
   for(std::map<Word, wordID_t>::const_iterator vIter = m_lm->vocab_->VocabStart();
-      vIter != m_lm->vocab_->VocabEnd(); vIter++){
+      vIter != m_lm->vocab_->VocabEnd(); vIter++) {
     // get word from ORLM vocab and associate with (new) factor id
     size_t factorId = factorCollection.AddFactor(Output,m_factorType,vIter->first.ToString())->GetId();
     m_lmids_map[factorId] = vIter->second;
@@ -50,7 +52,7 @@ void LanguageModelORLM::CreateFactors() {
   maxFactorId = (factorId > maxFactorId) ? factorId : maxFactorId;
   m_sentenceEndWord[m_factorType] = m_sentenceEnd;
   // add to lookup vector in object
-  lm_ids_vec_.resize(maxFactorId+1);
+  lm_ids_vec_.resize(maxFactorId+1);
   // fill with OOV code
   fill(lm_ids_vec_.begin(), lm_ids_vec_.end(), m_oov_id);
@@ -58,15 +60,18 @@ void LanguageModelORLM::CreateFactors() {
       iter != m_lmids_map.end() ; ++iter)
     lm_ids_vec_[iter->first] = iter->second;
 }
-wordID_t LanguageModelORLM::GetLmID(const std::string& str) const {
+wordID_t LanguageModelORLM::GetLmID(const std::string& str) const
+{
   return m_lm->vocab_->GetWordID(str);
 }
-wordID_t LanguageModelORLM::GetLmID(const Factor* factor) const {
+wordID_t LanguageModelORLM::GetLmID(const Factor* factor) const
+{
   size_t factorId = factor->GetId();
   return (factorId >= lm_ids_vec_.size()) ?
          m_oov_id : lm_ids_vec_[factorId];
 }
-LMResult LanguageModelORLM::GetValue(const std::vector<const Word*> &contextFactor,
-  State* finalState) const {
+LMResult LanguageModelORLM::GetValue(const std::vector<const Word*> &contextFactor,
+                                     State* finalState) const
+{
   FactorType factorType = GetFactorType();
   // set up context
   //std::vector factor(1,0);
@@ -88,13 +93,14 @@ LMResult LanguageModelORLM::GetValue(const std::vector<const Word*> &contextFact
   */
   return ret;
 }
-bool LanguageModelORLM::UpdateORLM(const std::vector<std::string>& ngram, const int value) {
+bool LanguageModelORLM::UpdateORLM(const std::vector<std::string>& ngram, const int value)
+{
   /*cerr << "Inserting into ORLM: \"";
   iterate(ngram, nit) cerr << *nit << " ";
   cerr << "\"\t" << value << endl; */
   m_lm->vocab_->MakeOpen();
-  bool res = m_lm->update(ngram, value);
+  bool res = m_lm->update(ngram, value);
   m_lm->vocab_->MakeClosed();
   return res;
 }
diff --git a/moses/LM/ORLM.h b/moses/LM/ORLM.h
index 48909191e..d7a8b5d35 100644
--- a/moses/LM/ORLM.h
+++ b/moses/LM/ORLM.h
@@ -17,7 +17,8 @@ class Phrase;

 /** @todo ask ollie
  */
-class LanguageModelORLM : public LanguageModelSingleFactor {
+class LanguageModelORLM : public LanguageModelSingleFactor
+{
 public:
   typedef count_t T;  // type for ORLM filter
   LanguageModelORLM(const std::string &line)
@@ -34,10 +35,12 @@ public:
     fout.close();
     delete m_lm;
   }
-  void CleanUpAfterSentenceProcessing() {m_lm->clearCache();} // clear caches
+  void CleanUpAfterSentenceProcessing() {
+    m_lm->clearCache(); // clear caches
+  }
   bool UpdateORLM(const std::vector<std::string>& ngram, const int value);
- protected:
+protected:
   OnlineRLM<T>* m_lm;
   //MultiOnlineRLM<T>* m_lm;
   wordID_t m_oov_id;
diff --git a/moses/LM/ParallelBackoff.cpp b/moses/LM/ParallelBackoff.cpp
index cf8c1509b..0b996de2b 100644
--- a/moses/LM/ParallelBackoff.cpp
+++ b/moses/LM/ParallelBackoff.cpp
@@ -70,7 +70,7 @@ private:
 public:
   LanguageModelParallelBackoff(const std::string &line)
-  :LanguageModelMultiFactor("ParallelBackoffLM", line)
+    :LanguageModelMultiFactor("ParallelBackoffLM", line)
   {}

   ~LanguageModelParallelBackoff();
diff --git a/moses/LM/Rand.cpp b/moses/LM/Rand.cpp
index 8e3e37a1f..5e31029d5 100644
--- a/moses/LM/Rand.cpp
+++ b/moses/LM/Rand.cpp
@@ -37,7 +37,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 namespace Moses
 {
-namespace 
+namespace
 {

 using namespace std;
@@ -45,8 +45,8 @@ class LanguageModelRandLM : public LanguageModelSingleFactor {
 public:
   LanguageModelRandLM(const std::string &line)
-  :LanguageModelSingleFactor("RandLM", line)
-  , m_lm(0)
+    :LanguageModelSingleFactor("RandLM", line)
+    , m_lm(0)
   {}
   bool Load(const std::string &filePath, FactorType factorType, size_t nGramOrder);
   virtual LMResult GetValue(const std::vector<const Word*> &contextFactor, State* finalState = NULL) const;
@@ -133,7 +133,7 @@ randlm::WordID LanguageModelRandLM::GetLmID( const std::string &str ) const
 }

 LMResult LanguageModelRandLM::GetValue(const vector<const Word*> &contextFactor,
-    State* finalState) const
+                                       State* finalState) const
 {
   FactorType factorType = GetFactorType();
   // set up context
diff --git a/moses/LM/SRI.cpp b/moses/LM/SRI.cpp
index b6281512c..54e6f93b9 100644
--- a/moses/LM/SRI.cpp
+++ b/moses/LM/SRI.cpp
@@ -39,29 +39,26 @@ using namespace std;
 namespace Moses
 {
 LanguageModelSRI::LanguageModelSRI(const std::string &line)
-:LanguageModelSingleFactor("SRILM", line)
-,m_srilmVocab(0)
-,m_srilmModel(0)
+  :LanguageModelSingleFactor("SRILM", line)
+  ,m_srilmVocab(0)
+  ,m_srilmModel(0)
 {
   FactorType factorType;
   size_t nGramOrder;
   string filePath;

   for (size_t i = 0; i < m_args.size(); ++i) {
-    const vector<string> &args = m_args[i];
+    const vector<string> &args = m_args[i];

-    if (args[0] == "factor") {
-      factorType = Scan<FactorType>(args[1]);
-    }
-    else if (args[0] == "order") {
-      nGramOrder = Scan<size_t>(args[1]);
-    }
-    else if (args[0] == "path") {
-      filePath = args[1];
-    }
-    else {
-      throw "Unknown argument " + args[0];
-    }
+    if (args[0] == "factor") {
+      factorType = Scan<FactorType>(args[1]);
+    } else if (args[0] == "order") {
+      nGramOrder = Scan<size_t>(args[1]);
+    } else if (args[0] == "path") {
+      filePath = args[1];
+    } else {
+      throw "Unknown argument " + args[0];
+    }
   }

   Load(filePath, factorType, nGramOrder);
diff --git a/moses/LM/SingleFactor.cpp b/moses/LM/SingleFactor.cpp
index 031fa38ac..abd8aca51 100644
--- a/moses/LM/SingleFactor.cpp
+++ b/moses/LM/SingleFactor.cpp
@@ -38,7 +38,7 @@ namespace Moses
 {
 LanguageModelSingleFactor::LanguageModelSingleFactor(const std::string& description, const std::string &line)
-:LanguageModelImplementation(description, line)
+  :LanguageModelImplementation(description, line)
 {
   m_nullContextState = new PointerState(NULL);
   m_beginSentenceState = new PointerState(NULL);
diff --git a/moses/LM/SingleFactor.h b/moses/LM/SingleFactor.h
index cb51808ac..9a1f30216 100644
--- a/moses/LM/SingleFactor.h
+++ b/moses/LM/SingleFactor.h
@@ -43,31 +43,27 @@ protected:
   FFState *m_nullContextState;
   FFState *m_beginSentenceState;

-  LanguageModelSingleFactor(const std::string& description, const std::string &line);
+  LanguageModelSingleFactor(const std::string& description, const std::string &line);

 public:
-  virtual ~LanguageModelSingleFactor();
-  virtual bool Load(const std::string &filePath
-  , FactorType factorType
-  , size_t nGramOrder) = 0;
+  virtual ~LanguageModelSingleFactor();
+  virtual bool Load(const std::string &filePath
+                    , FactorType factorType
+                    , size_t nGramOrder) = 0;

-  bool Useable(const Phrase &phrase) const
-  {
-    return (phrase.GetSize()>0 && phrase.GetFactor(0, m_factorType) != NULL);
-  }
-
-  const Factor *GetSentenceStart() const
-  {
-    return m_sentenceStart;
-  }
-  const Factor *GetSentenceEnd() const
-  {
-    return m_sentenceEnd;
-  }
-  FactorType GetFactorType() const
-  {
-    return m_factorType;
-  }
+  bool Useable(const Phrase &phrase) const {
+    return (phrase.GetSize()>0 && phrase.GetFactor(0, m_factorType) != NULL);
+  }
+
+  const Factor *GetSentenceStart() const {
+    return m_sentenceStart;
+  }
+  const Factor *GetSentenceEnd() const {
+    return m_sentenceEnd;
+  }
+  FactorType GetFactorType() const {
+    return m_factorType;
+  }

   virtual const FFState *GetNullContextState() const;
   virtual const FFState *GetBeginSentenceState() const;
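Note: the LexicalReordering constructor in the next hunk relies on deliberate switch fall-through, so the reindented cases are worth reading carefully: FE and E share one label and set the target-side factor mask, only the pure-E condition breaks, and FE then falls into the F case to set the source-side mask as well. The same decision written out flat (a sketch, not the Moses enum):

    enum Condition { F, E, FE };

    struct FactorNeeds {
      bool source;
      bool target;
    };

    // FE needs both masks, E only the target side, F only the source side,
    // which is exactly what the fall-through in the switch computes.
    FactorNeeds NeededMasks(Condition c) {
      FactorNeeds n;
      n.source = (c == F || c == FE);
      n.target = (c == E || c == FE);
      return n;
    }
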
diff --git a/moses/LexicalReordering.cpp b/moses/LexicalReordering.cpp
index 71c8fb2b8..98dca7b5f 100644
--- a/moses/LexicalReordering.cpp
+++ b/moses/LexicalReordering.cpp
@@ -10,7 +10,7 @@ using namespace std;
 namespace Moses
 {
 LexicalReordering::LexicalReordering(const std::string &line)
-: StatefulFeatureFunction("LexicalReordering", line)
+  : StatefulFeatureFunction("LexicalReordering", line)
 {
   std::cerr << "Initializing LexicalReordering.." << std::endl;
@@ -24,41 +24,37 @@ LexicalReordering::LexicalReordering(const std::string &line)
       m_configuration = new LexicalReorderingConfiguration(args[1]);
       m_configuration->SetScoreProducer(this);
       m_modelTypeString = m_configuration->GetModelString();
-    }
-    else if (args[0] == "input-factor") {
+    } else if (args[0] == "input-factor") {
       f_factors =Tokenize<FactorType>(args[1]);
-    }
-    else if (args[0] == "output-factor") {
+    } else if (args[0] == "output-factor") {
       e_factors =Tokenize<FactorType>(args[1]);
-    }
-    else if (args[0] == "path") {
+    } else if (args[0] == "path") {
       filePath = args[1];
-    }
-    else {
+    } else {
       throw "Unknown argument " + args[0];
     }
   }

   switch(m_configuration->GetCondition()) {
-    case LexicalReorderingConfiguration::FE:
-    case LexicalReorderingConfiguration::E:
-      m_factorsE = e_factors;
-      if(m_factorsE.empty()) {
-        UserMessage::Add("TL factor mask for lexical reordering is unexpectedly empty");
-        exit(1);
-      }
-      if(m_configuration->GetCondition() == LexicalReorderingConfiguration::E)
-        break; // else fall through
-    case LexicalReorderingConfiguration::F:
-      m_factorsF = f_factors;
-      if(m_factorsF.empty()) {
-        UserMessage::Add("SL factor mask for lexical reordering is unexpectedly empty");
-        exit(1);
-      }
-      break;
-    default:
-      UserMessage::Add("Unknown conditioning option!");
+  case LexicalReorderingConfiguration::FE:
+  case LexicalReorderingConfiguration::E:
+    m_factorsE = e_factors;
+    if(m_factorsE.empty()) {
+      UserMessage::Add("TL factor mask for lexical reordering is unexpectedly empty");
       exit(1);
+    }
+    if(m_configuration->GetCondition() == LexicalReorderingConfiguration::E)
+      break; // else fall through
+  case LexicalReorderingConfiguration::F:
+    m_factorsF = f_factors;
+    if(m_factorsF.empty()) {
+      UserMessage::Add("SL factor mask for lexical reordering is unexpectedly empty");
+      exit(1);
+    }
+    break;
+  default:
+    UserMessage::Add("Unknown conditioning option!");
+    exit(1);
   }

   m_table = LexicalReorderingTable::LoadAvailable(filePath, m_factorsF, m_factorsE, std::vector<FactorType>());
diff --git a/moses/LexicalReordering.h b/moses/LexicalReordering.h
index 51cf797f0..abaa31c25 100644
--- a/moses/LexicalReordering.h
+++ b/moses/LexicalReordering.h
@@ -24,15 +24,16 @@ class InputType;

 /** implementation of lexical reordering (Tilman ...)
for phrase-based decoding */ -class LexicalReordering : public StatefulFeatureFunction { -public: +class LexicalReordering : public StatefulFeatureFunction +{ +public: LexicalReordering(const std::string &line); virtual ~LexicalReordering(); virtual const FFState* EmptyHypothesisState(const InputType &input) const; - void InitializeForInput(const InputType& i){ - m_table->InitializeForInput(i); + void InitializeForInput(const InputType& i) { + m_table->InitializeForInput(i); } Scores GetProb(const Phrase& f, const Phrase& e) const; @@ -43,25 +44,25 @@ public: virtual FFState* EvaluateChart(const ChartHypothesis&, int /* featureID */, - ScoreComponentCollection*) const { - CHECK(0); // not valid for chart decoder - return NULL; - } + ScoreComponentCollection*) const { + CHECK(0); // not valid for chart decoder + return NULL; + } private: - bool DecodeCondition(std::string s); - bool DecodeDirection(std::string s); - bool DecodeNumFeatureFunctions(std::string s); + bool DecodeCondition(std::string s); + bool DecodeDirection(std::string s); + bool DecodeNumFeatureFunctions(std::string s); - LexicalReorderingConfiguration *m_configuration; - std::string m_modelTypeString; - std::vector m_modelType; - LexicalReorderingTable* m_table; - //std::vector m_direction; - std::vector m_condition; - //std::vector m_scoreOffset; - //bool m_oneScorePerDirection; - std::vector m_factorsE, m_factorsF; + LexicalReorderingConfiguration *m_configuration; + std::string m_modelTypeString; + std::vector m_modelType; + LexicalReorderingTable* m_table; + //std::vector m_direction; + std::vector m_condition; + //std::vector m_scoreOffset; + //bool m_oneScorePerDirection; + std::vector m_factorsE, m_factorsF; }; } diff --git a/moses/LexicalReorderingState.cpp b/moses/LexicalReorderingState.cpp index ddb089055..3165e447f 100644 --- a/moses/LexicalReorderingState.cpp +++ b/moses/LexicalReorderingState.cpp @@ -212,7 +212,7 @@ LexicalReorderingState* PhraseBasedReorderingState::Expand(const TranslationOpti if (m_direction == LexicalReorderingConfiguration::Forward && m_first) { ClearScores(scores); } else { - if (!m_first || m_useFirstBackwardScore){ + if (!m_first || m_useFirstBackwardScore) { if (modelType == LexicalReorderingConfiguration::MSD) { reoType = GetOrientationTypeMSD(currWordsRange); } else if (modelType == LexicalReorderingConfiguration::MSLR) { diff --git a/moses/LexicalReorderingTable.cpp b/moses/LexicalReorderingTable.cpp index c0da31402..65ba66047 100644 --- a/moses/LexicalReorderingTable.cpp +++ b/moses/LexicalReorderingTable.cpp @@ -9,7 +9,7 @@ #include "TargetPhraseCollection.h" #ifndef WIN32 -#include "TranslationModel/CompactPT/LexicalReorderingTableCompact.h" +#include "TranslationModel/CompactPT/LexicalReorderingTableCompact.h" #endif namespace Moses diff --git a/moses/Manager.cpp b/moses/Manager.cpp index b8e958d04..76809f224 100644 --- a/moses/Manager.cpp +++ b/moses/Manager.cpp @@ -80,7 +80,7 @@ void Manager::ProcessSentence() { // reset statistics ResetSentenceStats(m_source); - + Timer getOptionsTime; getOptionsTime.start(); m_transOptColl->CreateTranslationOptions(); @@ -262,8 +262,9 @@ struct SGNReverseCompare { /** * Implements lattice sampling, as in Chatterjee & Cancedda, emnlp 2010 **/ -void Manager::CalcLatticeSamples(size_t count, TrellisPathList &ret) const { - +void Manager::CalcLatticeSamples(size_t count, TrellisPathList &ret) const +{ + vector searchGraph; GetSearchGraph(searchGraph); @@ -277,15 +278,15 @@ void Manager::CalcLatticeSamples(size_t count, TrellisPathList &ret) 
const { map idToHyp; map fscores; - //Iterating through the hypos in reverse order of id gives a reverse - //topological order. We rely on the fact that hypo ids are given out + //Iterating through the hypos in reverse order of id gives a reverse + //topological order. We rely on the fact that hypo ids are given out //sequentially, as the search proceeds. - //NB: Could just sort by stack. + //NB: Could just sort by stack. sort(searchGraph.begin(), searchGraph.end(), SGNReverseCompare()); //first task is to fill in the outgoing hypos and edge scores. for (vector::const_iterator i = searchGraph.begin(); - i != searchGraph.end(); ++i) { + i != searchGraph.end(); ++i) { const Hypothesis* hypo = i->hypo; idToHyp[hypo->GetId()] = hypo; fscores[hypo->GetId()] = i->fscore; @@ -293,7 +294,7 @@ void Manager::CalcLatticeSamples(size_t count, TrellisPathList &ret) const { //back to current const Hypothesis* prevHypo = i->hypo->GetPrevHypo(); outgoingHyps[prevHypo].insert(hypo); - edgeScores[Edge(prevHypo->GetId(),hypo->GetId())] = + edgeScores[Edge(prevHypo->GetId(),hypo->GetId())] = hypo->GetScore() - prevHypo->GetScore(); } //forward from current @@ -304,7 +305,7 @@ void Manager::CalcLatticeSamples(size_t count, TrellisPathList &ret) const { outgoingHyps[hypo].insert(nextHypo); map::const_iterator fscoreIter = fscores.find(nextHypo->GetId()); CHECK(fscoreIter != fscores.end()); - edgeScores[Edge(hypo->GetId(),nextHypo->GetId())] = + edgeScores[Edge(hypo->GetId(),nextHypo->GetId())] = i->fscore - fscoreIter->second; } } @@ -312,26 +313,26 @@ void Manager::CalcLatticeSamples(size_t count, TrellisPathList &ret) const { //then run through again to calculate sigmas for (vector::const_iterator i = searchGraph.begin(); - i != searchGraph.end(); ++i) { + i != searchGraph.end(); ++i) { if (i->forward == -1) { sigmas[i->hypo] = 0; } else { - map >::const_iterator outIter = + map >::const_iterator outIter = outgoingHyps.find(i->hypo); - + CHECK(outIter != outgoingHyps.end()); float sigma = 0; for (set::const_iterator j = outIter->second.begin(); - j != outIter->second.end(); ++j) { + j != outIter->second.end(); ++j) { map::const_iterator succIter = sigmas.find(*j); CHECK(succIter != sigmas.end()); - map::const_iterator edgeScoreIter = + map::const_iterator edgeScoreIter = edgeScores.find(Edge(i->hypo->GetId(),(*j)->GetId())); CHECK(edgeScoreIter != edgeScores.end()); float term = edgeScoreIter->second + succIter->second; // Add sigma(*j) if (sigma == 0) { - sigma = term; + sigma = term; } else { sigma = log_sum(sigma,term); } @@ -347,7 +348,7 @@ void Manager::CalcLatticeSamples(size_t count, TrellisPathList &ret) const { vector path; path.push_back(startHypo); while(1) { - map >::const_iterator outIter = + map >::const_iterator outIter = outgoingHyps.find(path.back()); if (outIter == outgoingHyps.end() || !outIter->second.size()) { //end of the path @@ -358,7 +359,7 @@ void Manager::CalcLatticeSamples(size_t count, TrellisPathList &ret) const { vector candidateScores; float scoreTotal = 0; for (set::const_iterator j = outIter->second.begin(); - j != outIter->second.end(); ++j) { + j != outIter->second.end(); ++j) { candidates.push_back(*j); CHECK(sigmas.find(*j) != sigmas.end()); Edge edge(path.back()->GetId(),(*j)->GetId()); @@ -385,18 +386,18 @@ void Manager::CalcLatticeSamples(size_t count, TrellisPathList &ret) const { } //cerr << "Random: " << random << " Chose " << position-1 << endl; const Hypothesis* chosen = candidates[position-1]; - path.push_back(chosen); + path.push_back(chosen); } //cerr << "Path: 
" << endl; //for (size_t j = 0; j < path.size(); ++j) { - // cerr << path[j]->GetId() << " " << path[j]->GetScoreBreakdown() << endl; + // cerr << path[j]->GetId() << " " << path[j]->GetScoreBreakdown() << endl; //} //cerr << endl; //Convert the hypos to TrellisPath ret.Add(new TrellisPath(path)); //cerr << ret.at(ret.GetSize()-1).GetScoreBreakdown() << endl; - } + } } @@ -680,7 +681,7 @@ void Manager::OutputFeatureValuesForSLF(const Hypothesis* hypo, bool zeros, std: // outputSearchGraphStream << endl; // outputSearchGraphStream << (*hypo) << endl; - // const ScoreComponentCollection& scoreCollection = hypo->GetScoreBreakdown(); + // const ScoreComponentCollection& scoreCollection = hypo->GetScoreBreakdown(); // outputSearchGraphStream << scoreCollection << endl; const StaticData& staticData = StaticData::Instance(); @@ -753,10 +754,10 @@ size_t Manager::OutputFeatureWeightsForSLF(size_t index, const FeatureFunction* if (numScoreComps != 0) { vector values = StaticData::Instance().GetAllWeights().GetScoresForProducer(ff); for (size_t i = 0; i < numScoreComps; ++i) { - outputSearchGraphStream << "# " << ff->GetScoreProducerDescription() - << " " << ff->GetScoreProducerDescription() - << " " << (i+1) << " of " << numScoreComps << endl - << "x" << (index+i) << "scale=" << values[i] << endl; + outputSearchGraphStream << "# " << ff->GetScoreProducerDescription() + << " " << ff->GetScoreProducerDescription() + << " " << (i+1) << " of " << numScoreComps << endl + << "x" << (index+i) << "scale=" << values[i] << endl; } return index+numScoreComps; } else { @@ -779,28 +780,28 @@ size_t Manager::OutputFeatureValuesForSLF(size_t index, bool zeros, const Hypoth // // cout << "m_scores.coreSize()==" << m_scores.coreSize() << endl; // // cout << "m_scores.cbegin() ?= m_scores.cend()\t" << (m_scores.cbegin() == m_scores.cend()) << endl; - + // // for(FVector::FNVmap::const_iterator i = m_scores.cbegin(); i != m_scores.cend(); i++) { // // std::cout<first) << "\t" << (i->second) << std::endl; // // } // for(int i=0, n=v.size(); iGetScoreBreakdown(); + const ScoreComponentCollection& scoreCollection = hypo->GetScoreBreakdown(); vector featureValues = scoreCollection.GetScoresForProducer(ff); size_t numScoreComps = featureValues.size();//featureValues.coreSize(); // if (numScoreComps != ScoreProducer::unlimited) { - // vector values = StaticData::Instance().GetAllWeights().GetScoresForProducer(ff); + // vector values = StaticData::Instance().GetAllWeights().GetScoresForProducer(ff); for (size_t i = 0; i < numScoreComps; ++i) { outputSearchGraphStream << "x" << (index+i) << "=" << ((zeros) ? 
0.0 : featureValues[i]) << " "; - } - return index+numScoreComps; + } + return index+numScoreComps; // } else { // cerr << "Sparse features are not supported when outputting HTK standard lattice format" << endl; // assert(false); @@ -810,7 +811,7 @@ size_t Manager::OutputFeatureValuesForSLF(size_t index, bool zeros, const Hypoth size_t Manager::OutputFeatureValuesForHypergraph(size_t index, const Hypothesis* hypo, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const { - ScoreComponentCollection scoreCollection = hypo->GetScoreBreakdown(); + ScoreComponentCollection scoreCollection = hypo->GetScoreBreakdown(); const Hypothesis *prevHypo = hypo->GetPrevHypo(); if (prevHypo) { scoreCollection.MinusEquals( prevHypo->GetScoreBreakdown() ); @@ -851,60 +852,60 @@ void Manager::OutputSearchGraphAsHypergraph(long translationId, std::ostream &ou { long hypergraphHypothesisID = 0; for (size_t arcNumber = 0, size=searchGraph.size(); arcNumber < size; ++arcNumber) { - + // Get an id number for the previous hypothesis const Hypothesis *prevHypo = searchGraph[arcNumber].hypo->GetPrevHypo(); if (prevHypo!=NULL) { - int mosesPrevHypothesisID = prevHypo->GetId(); - if (mosesIDToHypergraphID.count(mosesPrevHypothesisID) == 0) { - mosesIDToHypergraphID[mosesPrevHypothesisID] = hypergraphHypothesisID; - // hypergraphIDToMosesID[hypergraphHypothesisID] = mosesPrevHypothesisID; - hypergraphHypothesisID += 1; - } + int mosesPrevHypothesisID = prevHypo->GetId(); + if (mosesIDToHypergraphID.count(mosesPrevHypothesisID) == 0) { + mosesIDToHypergraphID[mosesPrevHypothesisID] = hypergraphHypothesisID; + // hypergraphIDToMosesID[hypergraphHypothesisID] = mosesPrevHypothesisID; + hypergraphHypothesisID += 1; + } } // Get an id number for this hypothesis int mosesHypothesisID; if (searchGraph[arcNumber].recombinationHypo) { - mosesHypothesisID = searchGraph[arcNumber].recombinationHypo->GetId(); + mosesHypothesisID = searchGraph[arcNumber].recombinationHypo->GetId(); } else { - mosesHypothesisID = searchGraph[arcNumber].hypo->GetId(); + mosesHypothesisID = searchGraph[arcNumber].hypo->GetId(); } if (mosesIDToHypergraphID.count(mosesHypothesisID) == 0) { - - mosesIDToHypergraphID[mosesHypothesisID] = hypergraphHypothesisID; - // hypergraphIDToMosesID[hypergraphHypothesisID] = mosesHypothesisID; - bool terminalNode = (searchGraph[arcNumber].forward == -1); - if (terminalNode) { - // Final arc to end node, representing the end of the sentence - terminalNodes.insert(hypergraphHypothesisID); - } + mosesIDToHypergraphID[mosesHypothesisID] = hypergraphHypothesisID; + // hypergraphIDToMosesID[hypergraphHypothesisID] = mosesHypothesisID; - hypergraphHypothesisID += 1; + bool terminalNode = (searchGraph[arcNumber].forward == -1); + if (terminalNode) { + // Final arc to end node, representing the end of the sentence + terminalNodes.insert(hypergraphHypothesisID); + } + + hypergraphHypothesisID += 1; } // Record that this arc ends at this node hypergraphIDToArcs.insert(pair(mosesIDToHypergraphID[mosesHypothesisID],arcNumber)); } - + // Unique end node endNode = hypergraphHypothesisID; // mosesIDToHypergraphID[hypergraphHypothesisID] = hypergraphHypothesisID; numNodes = endNode + 1; } - + long numArcs = searchGraph.size() + terminalNodes.size(); // Print number of nodes and arcs outputSearchGraphStream << numNodes << " " << numArcs << endl; - VERBOSE(2,"Search graph to output as hypergraph for sentence " << translationId - << " contains " << numArcs << " arcs and " << numNodes << " nodes" << std::endl) + 
VERBOSE(2,"Search graph to output as hypergraph for sentence " << translationId + << " contains " << numArcs << " arcs and " << numNodes << " nodes" << std::endl) VERBOSE(2,"Outputting search graph to output as hypergraph for sentence " << translationId << std::endl) @@ -920,51 +921,51 @@ void Manager::OutputSearchGraphAsHypergraph(long translationId, std::ostream &ou outputSearchGraphStream << count << "\n"; pair::iterator, multimap::iterator> range = - hypergraphIDToArcs.equal_range(hypergraphHypothesisID); + hypergraphIDToArcs.equal_range(hypergraphHypothesisID); for (multimap::iterator it=range.first; it!=range.second; ++it) { - int lineNumber = (*it).second; - const Hypothesis *thisHypo = searchGraph[lineNumber].hypo; - int mosesHypothesisID;// = thisHypo->GetId(); - if (searchGraph[lineNumber].recombinationHypo) { - mosesHypothesisID = searchGraph[lineNumber].recombinationHypo->GetId(); - } else { - mosesHypothesisID = searchGraph[lineNumber].hypo->GetId(); - } - // int actualHypergraphHypothesisID = mosesIDToHypergraphID[mosesHypothesisID]; - UTIL_THROW_IF( - (hypergraphHypothesisID != mosesIDToHypergraphID[mosesHypothesisID]), - util::Exception, - "Error while writing search lattice as hypergraph for sentence " << translationId << ". " << - "Moses node " << mosesHypothesisID << " was expected to have hypergraph id " << hypergraphHypothesisID << - ", but actually had hypergraph id " << mosesIDToHypergraphID[mosesHypothesisID] << - ". There are " << numNodes << " nodes in the search lattice." - ); + int lineNumber = (*it).second; + const Hypothesis *thisHypo = searchGraph[lineNumber].hypo; + int mosesHypothesisID;// = thisHypo->GetId(); + if (searchGraph[lineNumber].recombinationHypo) { + mosesHypothesisID = searchGraph[lineNumber].recombinationHypo->GetId(); + } else { + mosesHypothesisID = searchGraph[lineNumber].hypo->GetId(); + } + // int actualHypergraphHypothesisID = mosesIDToHypergraphID[mosesHypothesisID]; + UTIL_THROW_IF( + (hypergraphHypothesisID != mosesIDToHypergraphID[mosesHypothesisID]), + util::Exception, + "Error while writing search lattice as hypergraph for sentence " << translationId << ". " << + "Moses node " << mosesHypothesisID << " was expected to have hypergraph id " << hypergraphHypothesisID << + ", but actually had hypergraph id " << mosesIDToHypergraphID[mosesHypothesisID] << + ". There are " << numNodes << " nodes in the search lattice." + ); - const Hypothesis *prevHypo = thisHypo->GetPrevHypo(); - if (prevHypo==NULL) { - // VERBOSE(2,"Hypergraph node " << hypergraphHypothesisID << " start of sentence" << std::endl) - outputSearchGraphStream << " ||| \n"; - } else { - int startNode = mosesIDToHypergraphID[prevHypo->GetId()]; - // VERBOSE(2,"Hypergraph node " << hypergraphHypothesisID << " has parent node " << startNode << std::endl) - UTIL_THROW_IF( - (startNode >= hypergraphHypothesisID), - util::Exception, - "Error while writing search lattice as hypergraph for sentence" << translationId << ". " << - "The nodes must be output in topological order. The code attempted to violate this restriction." 
- ); + const Hypothesis *prevHypo = thisHypo->GetPrevHypo(); + if (prevHypo==NULL) { + // VERBOSE(2,"Hypergraph node " << hypergraphHypothesisID << " start of sentence" << std::endl) + outputSearchGraphStream << " ||| \n"; + } else { + int startNode = mosesIDToHypergraphID[prevHypo->GetId()]; + // VERBOSE(2,"Hypergraph node " << hypergraphHypothesisID << " has parent node " << startNode << std::endl) + UTIL_THROW_IF( + (startNode >= hypergraphHypothesisID), + util::Exception, + "Error while writing search lattice as hypergraph for sentence" << translationId << ". " << + "The nodes must be output in topological order. The code attempted to violate this restriction." + ); - const TargetPhrase &targetPhrase = thisHypo->GetCurrTargetPhrase(); - int targetWordCount = targetPhrase.GetSize(); + const TargetPhrase &targetPhrase = thisHypo->GetCurrTargetPhrase(); + int targetWordCount = targetPhrase.GetSize(); - outputSearchGraphStream << "[" << startNode << "]"; - for (int targetWordIndex=0; targetWordIndexGetId(); if (nodes.count(hypothesisID) == 0) { - + numNodes += targetWordCount; nodes[hypothesisID] = numNodes; //numNodes += 1; bool terminalNode = (searchGraph[arcNumber].forward == -1); if (terminalNode) { - numArcs += 1; + numArcs += 1; } } @@ -1038,35 +1039,35 @@ void Manager::OutputSearchGraphAsSLF(long translationId, std::ostream &outputSea int targetWordCount = targetPhrase.GetSize(); for (int targetWordIndex=0; targetWordIndexGetCurrSourceWordsRange().GetEndPos(); + << " covered=" << searchNode.hypo->GetCurrSourceWordsRange().GetStartPos() + << "-" << searchNode.hypo->GetCurrSourceWordsRange().GetEndPos(); // Modified so that -osgx is a superset of -osg (GST Oct 2011) ScoreComponentCollection scoreBreakdown = searchNode.hypo->GetScoreBreakdown(); scoreBreakdown.MinusEquals( prevHypo->GetScoreBreakdown() ); //outputSearchGraphStream << " scores = [ " << StaticData::Instance().GetAllWeights(); - outputSearchGraphStream << " scores=\"" << scoreBreakdown << "\""; + outputSearchGraphStream << " scores=\"" << scoreBreakdown << "\""; outputSearchGraphStream << " out=\"" << searchNode.hypo->GetSourcePhraseStringRep() << "|" << - searchNode.hypo->GetCurrTargetPhrase().GetStringRep(outputFactorOrder) << "\"" << endl; + searchNode.hypo->GetCurrTargetPhrase().GetStringRep(outputFactorOrder) << "\"" << endl; // outputSearchGraphStream << " out=" << searchNode.hypo->GetCurrTargetPhrase().GetStringRep(outputFactorOrder) << endl; } diff --git a/moses/Manager.h b/moses/Manager.h index 687d8dbeb..fd329c309 100644 --- a/moses/Manager.h +++ b/moses/Manager.h @@ -56,9 +56,9 @@ struct SearchGraphNode { hypo(theHypo), recombinationHypo(theRecombinationHypo), forward(theForward), fscore(theFscore) {} - bool operator<(const SearchGraphNode& sgn) const { - return this->hypo->GetId() < sgn.hypo->GetId(); - } + bool operator<(const SearchGraphNode& sgn) const { + return this->hypo->GetId() < sgn.hypo->GetId(); + } }; diff --git a/moses/MockHypothesis.cpp b/moses/MockHypothesis.cpp index e98794cb7..826104565 100644 --- a/moses/MockHypothesis.cpp +++ b/moses/MockHypothesis.cpp @@ -19,7 +19,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -#include "MockHypothesis.h" +#include "MockHypothesis.h" #include @@ -28,19 +28,20 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA using namespace Moses; using namespace std; -namespace MosesTest { +namespace MosesTest +{ MockHypothesisGuard::MockHypothesisGuard( - const string& sourceSentence, - const vector& 
alignments, - const vector& targetSegments) -: m_emptyTarget(), - m_sentence(), - m_wp("WordPenalty"), - m_uwp("UnknownWordPenalty"), - m_dist("Distortion"), - m_manager(0,m_sentence,Normal) + const string& sourceSentence, + const vector& alignments, + const vector& targetSegments) + : m_emptyTarget(), + m_sentence(), + m_wp("WordPenalty"), + m_uwp("UnknownWordPenalty"), + m_dist("Distortion"), + m_manager(0,m_sentence,Normal) { BOOST_CHECK_EQUAL(alignments.size(), targetSegments.size()); @@ -49,7 +50,7 @@ MockHypothesisGuard::MockHypothesisGuard( stringstream in(sourceSentence + "\n"); m_sentence.Read(in,factors); - + //Initial empty hypothesis m_manager.ResetSentenceStats(m_sentence); @@ -58,21 +59,20 @@ MockHypothesisGuard::MockHypothesisGuard( //create the chain vector::const_iterator ai = alignments.begin(); vector::const_iterator ti = targetSegments.begin(); - for (; ti != targetSegments.end() && ai != alignments.end(); ++ti,++ai) - { + for (; ti != targetSegments.end() && ai != alignments.end(); ++ti,++ai) { Hypothesis* prevHypo = m_hypothesis; WordsRange wordsRange(ai->first,ai->second); m_targetPhrases.push_back(TargetPhrase()); m_targetPhrases.back().CreateFromString(Input, factors, *ti, "|", NULL); m_toptions.push_back(new TranslationOption - (wordsRange,m_targetPhrases.back())); - m_hypothesis = Hypothesis::Create(*prevHypo,*m_toptions.back(),NULL); + (wordsRange,m_targetPhrases.back())); + m_hypothesis = Hypothesis::Create(*prevHypo,*m_toptions.back(),NULL); } } -MockHypothesisGuard::~MockHypothesisGuard() +MockHypothesisGuard::~MockHypothesisGuard() { RemoveAllInColl(m_toptions); while (m_hypothesis) { diff --git a/moses/MockHypothesis.h b/moses/MockHypothesis.h index 2490dd5a6..67182ad56 100644 --- a/moses/MockHypothesis.h +++ b/moses/MockHypothesis.h @@ -29,7 +29,8 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #include "Hypothesis.h" #include "Manager.h" -namespace MosesTest { +namespace MosesTest +{ // // Construct a hypothesis with arbitrary source and target phrase @@ -38,42 +39,52 @@ namespace MosesTest { typedef std::pair Alignment; //(first,last) in source -class MockHypothesisGuard { - public: - /** Creates a phrase-based hypothesis. - */ - MockHypothesisGuard( - const std::string& sourceSentence, - const std::vector& alignments, - const std::vector& targetSegments); - Moses::Hypothesis* operator*() const {return m_hypothesis;} +class MockHypothesisGuard +{ +public: + /** Creates a phrase-based hypothesis. 
+ */ + MockHypothesisGuard( + const std::string& sourceSentence, + const std::vector& alignments, + const std::vector& targetSegments); + Moses::Hypothesis* operator*() const { + return m_hypothesis; + } - /** Destroy the hypothesis chain */ - ~MockHypothesisGuard(); + /** Destroy the hypothesis chain */ + ~MockHypothesisGuard(); - private: - Moses::TargetPhrase m_emptyTarget; - Moses::Sentence m_sentence; - Moses::WordPenaltyProducer m_wp; - Moses::UnknownWordPenaltyProducer m_uwp; - Moses::DistortionScoreProducer m_dist; - Moses::Manager m_manager; - Moses::Hypothesis* m_hypothesis; - std::vector m_targetPhrases; - std::vector m_toptions; +private: + Moses::TargetPhrase m_emptyTarget; + Moses::Sentence m_sentence; + Moses::WordPenaltyProducer m_wp; + Moses::UnknownWordPenaltyProducer m_uwp; + Moses::DistortionScoreProducer m_dist; + Moses::Manager m_manager; + Moses::Hypothesis* m_hypothesis; + std::vector m_targetPhrases; + std::vector m_toptions; }; -class HypothesisFixture { - public: - HypothesisFixture(); - const Moses::Hypothesis* empty() {return **m_empty;} - const Moses::Hypothesis* partial() {return **m_partial;} - const Moses::Hypothesis* full() {return **m_full;} - - private: - std::auto_ptr m_empty; - std::auto_ptr m_partial; - std::auto_ptr m_full; +class HypothesisFixture +{ +public: + HypothesisFixture(); + const Moses::Hypothesis* empty() { + return **m_empty; + } + const Moses::Hypothesis* partial() { + return **m_partial; + } + const Moses::Hypothesis* full() { + return **m_full; + } + +private: + std::auto_ptr m_empty; + std::auto_ptr m_partial; + std::auto_ptr m_full; }; diff --git a/moses/OutputCollector.h b/moses/OutputCollector.h index 96353934e..5f72433d8 100644 --- a/moses/OutputCollector.h +++ b/moses/OutputCollector.h @@ -45,27 +45,23 @@ public: OutputCollector(std::ostream* outStream= &std::cout, std::ostream* debugStream=&std::cerr) : m_nextOutput(0),m_outStream(outStream),m_debugStream(debugStream), m_isHoldingOutputStream(false), m_isHoldingDebugStream(false) {} - - ~OutputCollector() - { + + ~OutputCollector() { if (m_isHoldingOutputStream) delete m_outStream; if (m_isHoldingDebugStream) delete m_debugStream; } - - void HoldOutputStream() - { + + void HoldOutputStream() { m_isHoldingOutputStream = true; } - - void HoldDebugStream() - { + + void HoldDebugStream() { m_isHoldingDebugStream = true; } - - bool OutputIsCout() const - { + + bool OutputIsCout() const { return (m_outStream == std::cout); } @@ -87,7 +83,7 @@ public: *m_outStream << iter->second << std::flush; ++m_nextOutput; std::map::iterator debugIter = m_debugs.find(iter->first); - m_outputs.erase(iter); + m_outputs.erase(iter); if (debugIter != m_debugs.end()) { *m_debugStream << debugIter->second << std::flush; m_debugs.erase(debugIter); diff --git a/moses/PCNTools.h b/moses/PCNTools.h index 8a31e99ad..ea43df838 100644 --- a/moses/PCNTools.h +++ b/moses/PCNTools.h @@ -36,7 +36,7 @@ namespace PCN typedef std::pair >, size_t> CNAlt; typedef std::vector CNCol; typedef std::vector CN; - + /** Given a string ((('foo',0.1,1),('bar',0.9,2)),...) 
representation of a * word lattice in PCN format, return a CN object representing the lattice */ diff --git a/moses/PDTAimp.h b/moses/PDTAimp.h index 835cd6895..735179217 100644 --- a/moses/PDTAimp.h +++ b/moses/PDTAimp.h @@ -38,7 +38,7 @@ protected: : m_dict(0), m_obj(p),useCache(1),m_numInputScores(nis),totalE(0),distinctE(0) {} - public: +public: std::vector m_input,m_output; PhraseDictionaryTree *m_dict; typedef std::vector vTPC; @@ -185,7 +185,7 @@ protected: void Create(const std::vector &input , const std::vector &output , const std::string &filePath - , const std::vector &weight + , const std::vector &weight ) { // set my members @@ -267,10 +267,10 @@ protected: StringTgtCand::Tokens const& factorStrings, Scores const& scoreVector, const ScoreComponentCollection& sparseFeatures, - std::vector &weights, - float weightWP, + std::vector &weights, + float weightWP, Phrase const* srcPtr) const { - FactorCollection &factorCollection = FactorCollection::Instance(); + FactorCollection &factorCollection = FactorCollection::Instance(); for(size_t k=0; k word(*factorStrings[k], StaticData::Instance().GetFactorDelimiter()); @@ -438,8 +438,8 @@ protected: //put in phrase table scores, logging as we insert std::transform(tcands[i].scores.begin(),tcands[i].scores.end(),nscores.begin() + m_numInputScores,TransformScore); - - CHECK(nscores.size()==weightT.size()); + + CHECK(nscores.size()==weightT.size()); //tally up float score=std::inner_product(nscores.begin(), nscores.end(), weightT.begin(), 0.0f); diff --git a/moses/Parameter.cpp b/moses/Parameter.cpp index 4264010cd..e16b6d08f 100644 --- a/moses/Parameter.cpp +++ b/moses/Parameter.cpp @@ -69,30 +69,30 @@ Parameter::Parameter() AddParam("report-all-factors", "report all factors in output, not just first"); AddParam("report-all-factors-in-n-best", "Report all factors in n-best-lists. 
Default is false"); #ifdef HAVE_SYNLM - AddParam("slmodel-file", "location of the syntactic language model file(s)"); - AddParam("slmodel-factor", "factor to use with syntactic language model"); - AddParam("slmodel-beam", "beam width to use with syntactic language model's parser"); + AddParam("slmodel-file", "location of the syntactic language model file(s)"); + AddParam("slmodel-factor", "factor to use with syntactic language model"); + AddParam("slmodel-beam", "beam width to use with syntactic language model's parser"); #endif AddParam("stack", "s", "maximum stack size for histogram pruning"); AddParam("stack-diversity", "sd", "minimum number of hypothesis of each coverage in stack (default 0)"); AddParam("threads","th", "number of threads to use in decoding (defaults to single-threaded)"); - AddParam("translation-details", "T", "for each best hypothesis, report translation details to the given file"); - AddParam("ttable-file", "location and properties of the translation tables"); - AddParam("ttable-limit", "ttl", "maximum number of translation table entries per input phrase"); - AddParam("translation-option-threshold", "tot", "threshold for translation options relative to best for input phrase"); - AddParam("early-discarding-threshold", "edt", "threshold for constructing hypotheses based on estimate cost"); - AddParam("verbose", "v", "verbosity level of the logging"); + AddParam("translation-details", "T", "for each best hypothesis, report translation details to the given file"); + AddParam("ttable-file", "location and properties of the translation tables"); + AddParam("ttable-limit", "ttl", "maximum number of translation table entries per input phrase"); + AddParam("translation-option-threshold", "tot", "threshold for translation options relative to best for input phrase"); + AddParam("early-discarding-threshold", "edt", "threshold for constructing hypotheses based on estimate cost"); + AddParam("verbose", "v", "verbosity level of the logging"); AddParam("references", "Reference file(s) - used for bleu score feature"); - AddParam("output-factors", "list if factors in the output"); - AddParam("cache-path", "?"); - AddParam("distortion-limit", "dl", "distortion (reordering) limit in maximum number of words (0 = monotone, -1 = unlimited)"); - AddParam("monotone-at-punctuation", "mp", "do not reorder over punctuation"); - AddParam("distortion-file", "source factors (0 if table independent of source), target factors, location of the factorized/lexicalized reordering tables"); - AddParam("distortion", "configurations for each factorized/lexicalized reordering model."); - AddParam("early-distortion-cost", "edc", "include estimate of distortion cost yet to be incurred in the score [Moore & Quirk 2007]. Default is no"); - AddParam("xml-input", "xi", "allows markup of input with desired translations and probabilities. 
@@ -119,17 +119,17 @@ Parameter::Parameter()
 #ifdef HAVE_PROTOBUF
   AddParam("output-search-graph-pb", "pb", "Write phrase lattice to protocol buffer objects in the specified path.");
 #endif
-  AddParam("cube-pruning-pop-limit", "cbp", "How many hypotheses should be popped for each stack. (default = 1000)");
-  AddParam("cube-pruning-diversity", "cbd", "How many hypotheses should be created for each coverage. (default = 0)");
-  AddParam("search-algorithm", "Which search algorithm to use. 0=normal stack, 1=cube pruning, 2=cube growing. (default = 0)");
-  AddParam("constraint", "Location of the file with target sentences to produce constraining the search");
-  AddParam("description", "Source language, target language, description");
-  AddParam("max-chart-span", "maximum num. of source word chart rules can consume (default 10)");
-  AddParam("non-terminals", "list of non-term symbols, space separated");
-  AddParam("rule-limit", "a little like table limit. But for chart decoding rules. Default is DEFAULT_MAX_TRANS_OPT_SIZE");
-  AddParam("source-label-overlap", "What happens if a span already has a label. 0=add more. 1=replace. 2=discard. Default is 0");
-  AddParam("output-hypo-score", "Output the hypo score to stdout with the output string. For search error analysis. Default is false");
-  AddParam("unknown-lhs", "file containing target lhs of unknown words. 1 per line: LHS prob");
+  AddParam("cube-pruning-pop-limit", "cbp", "How many hypotheses should be popped for each stack. (default = 1000)");
+  AddParam("cube-pruning-diversity", "cbd", "How many hypotheses should be created for each coverage. (default = 0)");
+  AddParam("search-algorithm", "Which search algorithm to use. 0=normal stack, 1=cube pruning, 2=cube growing. 
(default = 0)"); + AddParam("constraint", "Location of the file with target sentences to produce constraining the search"); + AddParam("description", "Source language, target language, description"); + AddParam("max-chart-span", "maximum num. of source word chart rules can consume (default 10)"); + AddParam("non-terminals", "list of non-term symbols, space separated"); + AddParam("rule-limit", "a little like table limit. But for chart decoding rules. Default is DEFAULT_MAX_TRANS_OPT_SIZE"); + AddParam("source-label-overlap", "What happens if a span already has a label. 0=add more. 1=replace. 2=discard. Default is 0"); + AddParam("output-hypo-score", "Output the hypo score to stdout with the output string. For search error analysis. Default is false"); + AddParam("unknown-lhs", "file containing target lhs of unknown words. 1 per line: LHS prob"); AddParam("phrase-pair-feature", "Source and target factors for phrase pair feature"); AddParam("phrase-boundary-source-feature", "Source factors for phrase boundary feature"); AddParam("phrase-boundary-target-feature", "Target factors for phrase boundary feature"); @@ -153,9 +153,9 @@ Parameter::Parameter() AddParam("show-weights", "print feature weights and exit"); AddParam("start-translation-id", "Id of 1st input. Default = 0"); AddParam("output-unknowns", "Output the unknown (OOV) words to the given file, one line per sentence"); - - // Compact phrase table and reordering table. - AddParam("minlexr-memory", "Load lexical reordering table in minlexr format into memory"); + + // Compact phrase table and reordering table. + AddParam("minlexr-memory", "Load lexical reordering table in minlexr format into memory"); AddParam("minphr-memory", "Load phrase table in minphr format into memory"); AddParam("print-alignment-info", "Output word-to-word alignment to standard out, separated from translation by |||. Word-to-word alignments are takne from the phrase table if any. Default is false"); @@ -214,7 +214,7 @@ void Parameter::AddParam(const string ¶mName, const string &abbrevName, cons m_valid[paramName] = true; m_valid[abbrevName] = true; m_abbreviation[paramName] = abbrevName; - m_fullname[abbrevName] = paramName; + m_fullname[abbrevName] = paramName; m_description[paramName] = description; } @@ -263,7 +263,7 @@ bool Parameter::LoadParam(int argc, char* argv[]) PrintCredit(); Explain(); - cerr << endl; + cerr << endl; UserMessage::Add("No configuration file was specified. 
Use -config or -f"); cerr << endl; return false; @@ -381,11 +381,9 @@ void Parameter::ConvertWeightArgsSingleWeight(const string &oldWeightName, const PARAM_MAP::iterator iterMap; iterMap = m_setting.find(oldWeightName); - if (iterMap != m_setting.end()) - { + if (iterMap != m_setting.end()) { const PARAM_VEC &weights = iterMap->second; - for (size_t i = 0; i < weights.size(); ++i) - { + for (size_t i = 0; i < weights.size(); ++i) { SetWeight(newWeightName, ind, Scan(weights[i])); } @@ -403,8 +401,7 @@ void Parameter::ConvertWeightArgsPhraseModel(const string &oldWeightName) CHECK(numInputScores.size() == 0); numInputScores.push_back("1"); numInputScores.push_back("0"); - } - else if (inputWeights.size() == 2) { + } else if (inputWeights.size() == 2) { CHECK(numInputScores.size() == 0); numInputScores.push_back("1"); numInputScores.push_back("1"); @@ -463,8 +460,7 @@ void Parameter::ConvertWeightArgsPhraseModel(const string &oldWeightName) PhraseTableImplementation implementation = (PhraseTableImplementation) Scan(token[0]); string ptType; - switch (implementation) - { + switch (implementation) { case Memory: ptType = "PhraseDictionaryMemory"; break; @@ -488,8 +484,7 @@ void Parameter::ConvertWeightArgsPhraseModel(const string &oldWeightName) if (ptIndices.find(ptType) == ptIndices.end()) { ptIndices[ptType] = 0; ptInd = 0; - } - else { + } else { ptInd = ++ptIndices[ptType]; } @@ -516,7 +511,7 @@ void Parameter::ConvertWeightArgsPhraseModel(const string &oldWeightName) //characteristics of the phrase table vector input = Tokenize(token[1], ",") - ,output = Tokenize(token[2], ","); + ,output = Tokenize(token[2], ","); size_t numScoreComponent = Scan(token[3]); string filePath= token[4]; @@ -561,14 +556,13 @@ void Parameter::ConvertWeightArgsDistortion() // distortion / lex distortion const PARAM_VEC &oldWeights = GetParam(oldWeightName); - if (oldWeights.size() > 0) - { + if (oldWeights.size() > 0) { if (!isParamSpecified("search-algorithm") || - (GetParam("search-algorithm").size() > 0 - && (Trim(GetParam("search-algorithm")[0]) == "0" + (GetParam("search-algorithm").size() > 0 + && (Trim(GetParam("search-algorithm")[0]) == "0" ||Trim(GetParam("search-algorithm")[0]) == "1" - ) - ) + ) + ) ) { // phrase-based. 
Add distance distortion to list of features AddFeature("Distortion"); @@ -587,8 +581,7 @@ void Parameter::ConvertWeightArgsDistortion() size_t numFF = Scan(toks[2]); vector weights(numFF); - for (size_t currFF = 0; currFF < numFF; ++currFF) - { + for (size_t currFF = 0; currFF < numFF; ++currFF) { CHECK(currOldInd < oldWeights.size()); float weight = Scan(oldWeights[currOldInd]); weights[currFF] = weight; @@ -625,12 +618,12 @@ void Parameter::ConvertWeightArgsLM() bool isChartDecoding = true; if (!isParamSpecified("search-algorithm") || - (GetParam("search-algorithm").size() > 0 - && (Trim(GetParam("search-algorithm")[0]) == "0" - ||Trim(GetParam("search-algorithm")[0]) == "1" - ) - ) - ) { + (GetParam("search-algorithm").size() > 0 + && (Trim(GetParam("search-algorithm")[0]) == "0" + ||Trim(GetParam("search-algorithm")[0]) == "1" + ) + ) + ) { isChartDecoding = false; } @@ -643,8 +636,7 @@ void Parameter::ConvertWeightArgsLM() PARAM_MAP::iterator iterMap; iterMap = m_setting.find(oldWeightName); - if (iterMap != m_setting.end()) - { + if (iterMap != m_setting.end()) { size_t currOldInd = 0; const PARAM_VEC &weights = iterMap->second; @@ -656,8 +648,7 @@ void Parameter::ConvertWeightArgsLM() int lmType = Scan(modelToks[0]); string newFeatureName; - switch (lmType) - { + switch (lmType) { case 0: newFeatureName = "SRILM"; break; @@ -677,12 +668,11 @@ void Parameter::ConvertWeightArgsLM() numFF += oovWeights[lmIndex]; vector weightsLM(numFF); - for (size_t currFF = 0; currFF < numFF; ++currFF) - { + for (size_t currFF = 0; currFF < numFF; ++currFF) { CHECK(currOldInd < weights.size()); weightsLM[currFF] = Scan(weights[currOldInd]); if (isChartDecoding) { - weightsLM[currFF] = UntransformLMScore(weightsLM[currFF]); + weightsLM[currFF] = UntransformLMScore(weightsLM[currFF]); } ++currOldInd; @@ -691,12 +681,11 @@ void Parameter::ConvertWeightArgsLM() SetWeight(newFeatureName, ind, weightsLM); string featureLine = newFeatureName + " " - + "factor=" + modelToks[1] + " " // factor - + "order=" + modelToks[2] + " "; // order + + "factor=" + modelToks[1] + " " // factor + + "order=" + modelToks[2] + " "; // order if (lmType == 9) { featureLine += "lazyken=1 "; - } - else if (lmType == 8) { + } else if (lmType == 8) { featureLine += "lazyken=0 "; } @@ -718,8 +707,7 @@ void Parameter::ConvertWeightArgsGeneration(const std::string &oldWeightName, co // distortion / lex distortion PARAM_VEC &oldWeights = m_setting[oldWeightName]; - if (oldWeights.size() > 0) - { + if (oldWeights.size() > 0) { size_t currOldInd = 0; PARAM_VEC &models = m_setting[oldFeatureName]; @@ -730,8 +718,7 @@ void Parameter::ConvertWeightArgsGeneration(const std::string &oldWeightName, co size_t numFF = Scan(modelToks[2]); vector weights(numFF); - for (size_t currFF = 0; currFF < numFF; ++currFF) - { + for (size_t currFF = 0; currFF < numFF; ++currFF) { CHECK(currOldInd < oldWeights.size()); float weight = Scan(oldWeights[currOldInd]); weights[currFF] = weight; @@ -742,10 +729,10 @@ void Parameter::ConvertWeightArgsGeneration(const std::string &oldWeightName, co stringstream strme; strme << "Generation " - << "input-factor=" << modelToks[0] << " " - << "output-factor=" << modelToks[1] << " " - << "num-features=" << modelToks[2] << " " - << "path=" << modelToks[3]; + << "input-factor=" << modelToks[0] << " " + << "output-factor=" << modelToks[1] << " " + << "num-features=" << modelToks[2] << " " + << "path=" << modelToks[3]; AddFeature(strme.str()); } } @@ -761,23 +748,21 @@ void Parameter::ConvertWeightArgsWordPenalty() bool 
isChartDecoding = true; if (!isParamSpecified("search-algorithm") || - (GetParam("search-algorithm").size() > 0 - && (Trim(GetParam("search-algorithm")[0]) == "0" - ||Trim(GetParam("search-algorithm")[0]) == "1" - ) - ) - ) { + (GetParam("search-algorithm").size() > 0 + && (Trim(GetParam("search-algorithm")[0]) == "0" + ||Trim(GetParam("search-algorithm")[0]) == "1" + ) + ) + ) { isChartDecoding = false; } PARAM_MAP::iterator iterMap; iterMap = m_setting.find(oldWeightName); - if (iterMap != m_setting.end()) - { + if (iterMap != m_setting.end()) { const PARAM_VEC &weights = iterMap->second; - for (size_t i = 0; i < weights.size(); ++i) - { + for (size_t i = 0; i < weights.size(); ++i) { float weight = Scan(weights[i]); if (isChartDecoding) { weight *= 0.434294482; @@ -800,8 +785,7 @@ void Parameter::ConvertWeightArgs() (m_setting.count("weight-i") || m_setting.count("weight-t") || m_setting.count("weight-w") || m_setting.count("weight-l") || m_setting.count("weight-u") || m_setting.count("weight-lex") || m_setting.count("weight-generation") || m_setting.count("weight-lr") || m_setting.count("weight-d") - )) - { + )) { cerr << "Do not mix old and new format for specify weights"; } @@ -833,8 +817,7 @@ void Parameter::ConvertWeightArgs() void Parameter::CreateWeightsMap() { PARAM_VEC &vec = m_setting["weight"]; - for (size_t i = 0; i < vec.size(); ++i) - { + for (size_t i = 0; i < vec.size(); ++i) { const string &line = vec[i]; vector toks = Tokenize(line); CHECK(toks.size() >= 2); @@ -865,8 +848,7 @@ void Parameter::WeightOverwrite() string name(""); vector weights; vector toks = Tokenize(vec[0]); - for (size_t i = 0; i < toks.size(); ++i) - { + for (size_t i = 0; i < toks.size(); ++i) { const string &tok = toks[i]; if (tok.substr(tok.size() - 1, 1) == "=") { @@ -879,8 +861,7 @@ void Parameter::WeightOverwrite() } name = tok.substr(0, tok.size() - 1); - } - else { + } else { // a weight for curr ff float weight = Scan(toks[i]); weights.push_back(weight); @@ -899,14 +880,13 @@ bool Parameter::Validate() PARAM_MAP::const_iterator iterParams; for (iterParams = m_setting.begin(); iterParams != m_setting.end(); ++iterParams) { const std::string &key = iterParams->first; - - if (m_valid.find(key) == m_valid.end()) - { + + if (m_valid.find(key) == m_valid.end()) { UserMessage::Add("Unknown parameter " + key); noErrorFlag = false; } } - + if (m_setting["lmodel-dub"].size() > 0) { if (m_setting["lmodel-file"].size() != m_setting["lmodel-dub"].size()) { stringstream errorMsg(""); @@ -1082,8 +1062,7 @@ bool Parameter::ReadConfigFile(const string &filePath ) if (line.size() == 0) { // blank line. do nothing. 
- } - else if (line[0]=='[') { + } else if (line[0]=='[') { // new parameter for (size_t currPos = 0 ; currPos < line.size() ; currPos++) { if (line[currPos] == ']') { @@ -1227,23 +1206,23 @@ void Parameter::PrintCredit() * \param values inew values for paramName */ void Parameter::OverwriteParam(const string ¶mName, PARAM_VEC values) { - VERBOSE(2,"Overwriting parameter " << paramName); - - m_setting[paramName]; // defines the parameter, important for boolean switches - if (m_setting[paramName].size() > 1){ - VERBOSE(2," (the parameter had " << m_setting[paramName].size() << " previous values)"); - CHECK(m_setting[paramName].size() == values.size()); - }else{ - VERBOSE(2," (the parameter does not have previous values)"); - m_setting[paramName].resize(values.size()); - } - VERBOSE(2," with the following values:"); - int i=0; - for (PARAM_VEC::iterator iter = values.begin(); iter != values.end() ; iter++, i++){ - m_setting[paramName][i] = *iter; - VERBOSE(2, " " << *iter); - } - VERBOSE(2, std::endl); + VERBOSE(2,"Overwriting parameter " << paramName); + + m_setting[paramName]; // defines the parameter, important for boolean switches + if (m_setting[paramName].size() > 1) { + VERBOSE(2," (the parameter had " << m_setting[paramName].size() << " previous values)"); + CHECK(m_setting[paramName].size() == values.size()); + } else { + VERBOSE(2," (the parameter does not have previous values)"); + m_setting[paramName].resize(values.size()); + } + VERBOSE(2," with the following values:"); + int i=0; + for (PARAM_VEC::iterator iter = values.begin(); iter != values.end() ; iter++, i++) { + m_setting[paramName][i] = *iter; + VERBOSE(2, " " << *iter); + } + VERBOSE(2, std::endl); } std::set Parameter::GetWeightNames() const @@ -1256,7 +1235,7 @@ std::set Parameter::GetWeightNames() const } return ret; } - + } diff --git a/moses/Parameter.h b/moses/Parameter.h index a78314692..a0c372a53 100644 --- a/moses/Parameter.h +++ b/moses/Parameter.h @@ -38,16 +38,16 @@ typedef std::map PARAM_STRING; /** Handles parameter values set in config file or on command line. * Process raw parameter data (names and values as strings) for StaticData - * to parse; to get useful values, see StaticData. + * to parse; to get useful values, see StaticData. 
*/ class Parameter { protected: - PARAM_MAP m_setting; - PARAM_BOOL m_valid; - PARAM_STRING m_abbreviation; - PARAM_STRING m_description; - PARAM_STRING m_fullname; + PARAM_MAP m_setting; + PARAM_BOOL m_valid; + PARAM_STRING m_abbreviation; + PARAM_STRING m_description; + PARAM_STRING m_fullname; std::map > m_weights; @@ -93,32 +93,30 @@ public: bool isParamSpecified(const std::string &paramName) { return m_setting.find( paramName ) != m_setting.end(); } - - const std::string GetFullName(std::string abbr) - { - return m_fullname[abbr]; - } - - const std::string GetAbbreviation(std::string full) - { - return m_abbreviation[full]; - } - const PARAM_VEC &GetParamShortName(const std::string &paramName) - { - return GetParam(GetFullName(paramName)); - } - - void OverwriteParam(const std::string &paramName, PARAM_VEC values); - void OverwriteParamShortName(const std::string &paramShortName, PARAM_VEC values){ - OverwriteParam(GetFullName(paramShortName),values); - } - + const std::string GetFullName(std::string abbr) { + return m_fullname[abbr]; + } + + const std::string GetAbbreviation(std::string full) { + return m_abbreviation[full]; + } + const PARAM_VEC &GetParamShortName(const std::string &paramName) { + return GetParam(GetFullName(paramName)); + } + + void OverwriteParam(const std::string &paramName, PARAM_VEC values); + + void OverwriteParamShortName(const std::string &paramShortName, PARAM_VEC values) { + OverwriteParam(GetFullName(paramShortName),values); + } + std::vector &GetWeights(const std::string &name); std::set GetWeightNames() const; - const PARAM_MAP &GetParams() const - { return m_setting; } + const PARAM_MAP &GetParams() const { + return m_setting; + } }; diff --git a/moses/PartialTranslOptColl.h b/moses/PartialTranslOptColl.h index f4f40d413..5a4e816de 100644 --- a/moses/PartialTranslOptColl.h +++ b/moses/PartialTranslOptColl.h @@ -39,7 +39,7 @@ namespace Moses * The expansion process itself may still explode, so efficient handling * of partial translation options during expansion is required. * This class assists in this task by implementing pruning. - * This implementation is similar to the one in HypothesisStack. + * This implementation is similar to the one in HypothesisStack.
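 *
 * The pruning contract in outline (a hedged sketch of intended use, not
 * necessarily this class's exact API):
 *
 *   coll.Add(partialTransOpt);      // insert; tracks the worst score kept
 *   if (coll.GetSize() > maxSize)   // beam limit exceeded
 *     coll.Prune();                 // discard options below the threshold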
*/ class PartialTranslOptColl { diff --git a/moses/Phrase.cpp b/moses/Phrase.cpp index ef5d09b23..3fa607fb4 100644 --- a/moses/Phrase.cpp +++ b/moses/Phrase.cpp @@ -103,16 +103,15 @@ Phrase Phrase::GetSubString(const WordsRange &wordsRange) const Phrase Phrase::GetSubString(const WordsRange &wordsRange, FactorType factorType) const { - Phrase retPhrase(wordsRange.GetNumWordsCovered()); + Phrase retPhrase(wordsRange.GetNumWordsCovered()); - for (size_t currPos = wordsRange.GetStartPos() ; currPos <= wordsRange.GetEndPos() ; currPos++) - { - const Factor* f = GetFactor(currPos, factorType); - Word &word = retPhrase.AddWord(); - word.SetFactor(factorType, f); - } + for (size_t currPos = wordsRange.GetStartPos() ; currPos <= wordsRange.GetEndPos() ; currPos++) { + const Factor* f = GetFactor(currPos, factorType); + Word &word = retPhrase.AddWord(); + word.SetFactor(factorType, f); + } - return retPhrase; + return retPhrase; } std::string Phrase::GetStringRep(const vector factorsToPrint) const @@ -153,10 +152,10 @@ void Phrase::PrependWord(const Word &newWord) } void Phrase::CreateFromString(FactorDirection direction - ,const std::vector &factorOrder - ,const StringPiece &phraseString - ,const StringPiece &factorDelimiter - ,Word **lhs) + ,const std::vector &factorOrder + ,const StringPiece &phraseString + ,const StringPiece &factorDelimiter + ,Word **lhs) { // parse vector annotatedWordVector; @@ -165,9 +164,9 @@ void Phrase::CreateFromString(FactorDirection direction } if (annotatedWordVector.size() == 0) { - if (lhs) { - (*lhs) = NULL; - } + if (lhs) { + (*lhs) = NULL; + } return; } @@ -188,8 +187,7 @@ void Phrase::CreateFromString(FactorDirection direction (*lhs) = new Word(true); (*lhs)->CreateFromString(direction, factorOrder, annotatedWord.substr(1, annotatedWord.size() - 2), true); assert((*lhs)->IsNonTerminal()); - } - else { + } else { numWords = annotatedWordVector.size(); //CHECK(lhs == NULL); if (lhs) { diff --git a/moses/Phrase.h b/moses/Phrase.h index 196e403ac..209f92f9e 100644 --- a/moses/Phrase.h +++ b/moses/Phrase.h @@ -69,13 +69,13 @@ public: /** Fills phrase with words from format string, typically from phrase table or sentence input * \param factorOrder factor types of each element in 2D string vector * \param phraseString formatted input string to parse - * \param factorDelimiter delimiter between factors. + * \param factorDelimiter delimiter between factors. */ void CreateFromString(FactorDirection direction , const std::vector &factorOrder - , const StringPiece &phraseString - , const StringPiece &factorDelimiter - , Word **lhs); + , const StringPiece &phraseString + , const StringPiece &factorDelimiter + , Word **lhs); /** copy factors from the other phrase to this phrase. IsCompatible() must be run beforehand to ensure incompatible factors aren't overwritten @@ -127,52 +127,49 @@ public: void AddWord(const Word &newWord) { AddWord() = newWord; } - - /** appends a phrase at the end of current phrase **/ - void Append(const Phrase &endPhrase); - void PrependWord(const Word &newWord); - - void Clear() - { - m_words.clear(); - } - - void RemoveWord(size_t pos) - { - CHECK(pos < m_words.size()); - m_words.erase(m_words.begin() + pos); - } - - //! 
create new phrase class that is a substring of this phrase - Phrase GetSubString(const WordsRange &wordsRange) const; + + /** appends a phrase at the end of current phrase **/ + void Append(const Phrase &endPhrase); + void PrependWord(const Word &newWord); + + void Clear() { + m_words.clear(); + } + + void RemoveWord(size_t pos) { + CHECK(pos < m_words.size()); + m_words.erase(m_words.begin() + pos); + } + + //! create new phrase class that is a substring of this phrase + Phrase GetSubString(const WordsRange &wordsRange) const; Phrase GetSubString(const WordsRange &wordsRange, FactorType factorType) const; - - //! return a string rep of the phrase. Each factor is separated by the factor delimiter as specified in StaticData class - std::string GetStringRep(const std::vector factorsToPrint) const; - - TO_STRING(); - - int Compare(const Phrase &other) const; - - /** transitive comparison between 2 phrases - * used to insert & find phrase in dictionary - */ - bool operator< (const Phrase &compare) const - { - return Compare(compare) < 0; - } - - bool operator== (const Phrase &compare) const - { - return Compare(compare) == 0; - } + //! return a string rep of the phrase. Each factor is separated by the factor delimiter as specified in StaticData class + std::string GetStringRep(const std::vector factorsToPrint) const; - void OnlyTheseFactors(const FactorMask &factors); + TO_STRING(); + + + int Compare(const Phrase &other) const; + + /** transitive comparison between 2 phrases + * used to insert & find phrase in dictionary + */ + bool operator< (const Phrase &compare) const { + return Compare(compare) < 0; + } + + bool operator== (const Phrase &compare) const { + return Compare(compare) == 0; + } + + void OnlyTheseFactors(const FactorMask &factors); }; -inline size_t hash_value(const Phrase& phrase) { +inline size_t hash_value(const Phrase& phrase) +{ size_t seed = 0; for (size_t i = 0; i < phrase.GetSize(); ++i) { boost::hash_combine(seed, phrase.GetWord(i)); diff --git a/moses/PrefixTree.h b/moses/PrefixTree.h index 9cf1360e6..5b81ea175 100644 --- a/moses/PrefixTree.h +++ b/moses/PrefixTree.h @@ -63,7 +63,7 @@ public: keys.insert(i,*b); data.insert(data.begin()+pos,def); - Self *self = NULL; + Self *self = NULL; ptr.insert(ptr.begin()+pos, self); } if(++b!=e) { diff --git a/moses/PrefixTreeMap.h b/moses/PrefixTreeMap.h index fae875bd4..06066878d 100644 --- a/moses/PrefixTreeMap.h +++ b/moses/PrefixTreeMap.h @@ -59,7 +59,7 @@ private: ScoreList m_ScoreList; }; - + /** @todo How is this used in the pb binary phrase table? 
*/ struct PPimp { diff --git a/moses/RuleCube.h b/moses/RuleCube.h index 05f9f1a24..d0c6ea66a 100644 --- a/moses/RuleCube.h +++ b/moses/RuleCube.h @@ -44,7 +44,7 @@ class ChartTranslationOptions; */ class RuleCubeItemScoreOrderer { - public: +public: bool operator()(const RuleCubeItem *p, const RuleCubeItem *q) const { return p->GetScore() < q->GetScore(); } @@ -56,7 +56,7 @@ class RuleCubeItemScoreOrderer */ class RuleCubeItemPositionOrderer { - public: +public: bool operator()(const RuleCubeItem *p, const RuleCubeItem *q) const { return *p < *q; } @@ -66,7 +66,7 @@ class RuleCubeItemPositionOrderer */ class RuleCubeItemHasher { - public: +public: size_t operator()(const RuleCubeItem *p) const { size_t seed = 0; boost::hash_combine(seed, p->GetHypothesisDimensions()); @@ -79,7 +79,7 @@ class RuleCubeItemHasher */ class RuleCubeItemEqualityPred { - public: +public: bool operator()(const RuleCubeItem *p, const RuleCubeItem *q) const { return p->GetHypothesisDimensions() == q->GetHypothesisDimensions() && p->GetTranslationDimension() == q->GetTranslationDimension(); @@ -90,7 +90,7 @@ class RuleCubeItemEqualityPred */ class RuleCube { - public: +public: RuleCube(const ChartTranslationOptions &, const ChartCellCollection &, ChartManager &); @@ -104,26 +104,28 @@ class RuleCube RuleCubeItem *Pop(ChartManager &); - bool IsEmpty() const { return m_queue.empty(); } + bool IsEmpty() const { + return m_queue.empty(); + } const ChartTranslationOptions &GetTranslationOption() const { return m_transOpt; } - private: +private: #if defined(BOOST_VERSION) && (BOOST_VERSION >= 104200) typedef boost::unordered_set ItemSet; + RuleCubeItemHasher, + RuleCubeItemEqualityPred + > ItemSet; #else typedef std::set ItemSet; #endif typedef std::priority_queue, - RuleCubeItemScoreOrderer - > Queue; + std::vector, + RuleCubeItemScoreOrderer + > Queue; RuleCube(const RuleCube &); // Not implemented RuleCube &operator=(const RuleCube &); // Not implemented diff --git a/moses/RuleCubeItem.h b/moses/RuleCubeItem.h index 612079172..75669598b 100644 --- a/moses/RuleCubeItem.h +++ b/moses/RuleCubeItem.h @@ -39,14 +39,16 @@ typedef std::vector HypoList; */ class TranslationDimension { - public: +public: TranslationDimension(std::size_t pos, const std::vector &orderedTargetPhrases) : m_pos(pos) , m_orderedTargetPhrases(&orderedTargetPhrases) {} - std::size_t IncrementPos() { return m_pos++; } + std::size_t IncrementPos() { + return m_pos++; + } bool HasMoreTranslations() const { return m_pos+1 < m_orderedTargetPhrases->size(); @@ -64,7 +66,7 @@ class TranslationDimension return GetTargetPhrase() == compare.GetTargetPhrase(); } - private: +private: std::size_t m_pos; const std::vector *m_orderedTargetPhrases; }; @@ -81,7 +83,9 @@ public: , m_orderedHypos(&orderedHypos) {} - std::size_t IncrementPos() { return m_pos++; } + std::size_t IncrementPos() { + return m_pos++; + } bool HasMoreHypo() const { return m_pos+1 < m_orderedHypos->size(); @@ -109,7 +113,7 @@ std::size_t hash_value(const HypothesisDimension &); /** @todo How is this used. 
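 * A RuleCubeItem is one cell of the cube: a TranslationDimension (which
 * target phrase to use) plus one HypothesisDimension per non-terminal
 * (which antecedent hypothesis to plug in). A hedged sketch of how the cube
 * is assumed to be walked (PopBest is a hypothetical name; the
 * copy-constructor-plus-index idiom is the one declared below):
 *
 *   RuleCubeItem *best = PopBest(queue);   // highest GetScore() so far
 *   RuleCubeItem *neighbour = new RuleCubeItem(*best, dimensionIndex);
 *   // the int argument picks the one dimension to advance (IncrementPos)
 *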
Split out into separate source file */ class RuleCubeItem { - public: +public: RuleCubeItem(const ChartTranslationOptions &, const ChartCellCollection &); RuleCubeItem(const RuleCubeItem &, int); ~RuleCubeItem(); @@ -122,7 +126,9 @@ class RuleCubeItem return m_hypothesisDimensions; } - float GetScore() const { return m_score; } + float GetScore() const { + return m_score; + } void EstimateScore(); @@ -132,7 +138,7 @@ class RuleCubeItem bool operator<(const RuleCubeItem &) const; - private: +private: RuleCubeItem(const RuleCubeItem &); // Not implemented RuleCubeItem &operator=(const RuleCubeItem &); // Not implemented diff --git a/moses/RuleCubeQueue.h b/moses/RuleCubeQueue.h index 9763b3877..ae4d20be0 100644 --- a/moses/RuleCubeQueue.h +++ b/moses/RuleCubeQueue.h @@ -36,7 +36,7 @@ class ChartManager; */ class RuleCubeOrderer { - public: +public: bool operator()(const RuleCube *p, const RuleCube *q) const { return p->GetTopScore() < q->GetTopScore(); } @@ -45,17 +45,19 @@ class RuleCubeOrderer /** @todo how is this used */ class RuleCubeQueue { - public: +public: RuleCubeQueue(ChartManager &manager) : m_manager(manager) {} ~RuleCubeQueue(); void Add(RuleCube *); ChartHypothesis *Pop(); - bool IsEmpty() const { return m_queue.empty(); } + bool IsEmpty() const { + return m_queue.empty(); + } - private: +private: typedef std::priority_queue, - RuleCubeOrderer > Queue; + RuleCubeOrderer > Queue; Queue m_queue; ChartManager &m_manager; diff --git a/moses/ScoreComponentCollection.cpp b/moses/ScoreComponentCollection.cpp index c836ea5b3..44f08b316 100644 --- a/moses/ScoreComponentCollection.cpp +++ b/moses/ScoreComponentCollection.cpp @@ -17,7 +17,7 @@ ScoreComponentCollection::ScoreComponentCollection() : m_scores(s_denseVectorSiz void ScoreComponentCollection::RegisterScoreProducer - (const FeatureFunction* scoreProducer) +(const FeatureFunction* scoreProducer) { size_t start = s_denseVectorSize; size_t end = start + scoreProducer->GetNumScoreComponents(); @@ -29,56 +29,58 @@ void ScoreComponentCollection::RegisterScoreProducer float ScoreComponentCollection::GetWeightedScore() const { - return m_scores.inner_product(StaticData::Instance().GetAllWeights().m_scores); + return m_scores.inner_product(StaticData::Instance().GetAllWeights().m_scores); } void ScoreComponentCollection::MultiplyEquals(float scalar) { - m_scores *= scalar; + m_scores *= scalar; } // Multiply all weights of this sparse producer by a given scalar -void ScoreComponentCollection::MultiplyEquals(const FeatureFunction* sp, float scalar) { +void ScoreComponentCollection::MultiplyEquals(const FeatureFunction* sp, float scalar) +{ std::string prefix = sp->GetScoreProducerDescription() + FName::SEP; for(FVector::FNVmap::const_iterator i = m_scores.cbegin(); i != m_scores.cend(); i++) { std::stringstream name; name << i->first; if (name.str().substr( 0, prefix.length() ).compare( prefix ) == 0) - m_scores[i->first] = i->second * scalar; + m_scores[i->first] = i->second * scalar; } } // Count weights belonging to this sparse producer -size_t ScoreComponentCollection::GetNumberWeights(const FeatureFunction* sp) { +size_t ScoreComponentCollection::GetNumberWeights(const FeatureFunction* sp) +{ std::string prefix = sp->GetScoreProducerDescription() + FName::SEP; size_t weights = 0; for(FVector::FNVmap::const_iterator i = m_scores.cbegin(); i != m_scores.cend(); i++) { std::stringstream name; name << i->first; if (name.str().substr( 0, prefix.length() ).compare( prefix ) == 0) - weights++; + weights++; } return weights; } void 
ScoreComponentCollection::DivideEquals(float scalar) { - m_scores /= scalar; + m_scores /= scalar; } void ScoreComponentCollection::CoreDivideEquals(float scalar) { - m_scores.coreDivideEquals(scalar); + m_scores.coreDivideEquals(scalar); } void ScoreComponentCollection::DivideEquals(const ScoreComponentCollection& rhs) { - m_scores.divideEquals(rhs.m_scores); + m_scores.divideEquals(rhs.m_scores); } void ScoreComponentCollection::MultiplyEquals(const ScoreComponentCollection& rhs) { - m_scores *= rhs.m_scores; + m_scores *= rhs.m_scores; } void ScoreComponentCollection::MultiplyEqualsBackoff(const ScoreComponentCollection& rhs, float backoff) @@ -93,42 +95,51 @@ void ScoreComponentCollection::MultiplyEquals(float core_r0, float sparse_r0) std::ostream& operator<<(std::ostream& os, const ScoreComponentCollection& rhs) { - os << rhs.m_scores; - return os; + os << rhs.m_scores; + return os; } -void ScoreComponentCollection::L1Normalise() { +void ScoreComponentCollection::L1Normalise() +{ m_scores /= m_scores.l1norm_coreFeatures(); } -float ScoreComponentCollection::GetL1Norm() const { +float ScoreComponentCollection::GetL1Norm() const +{ return m_scores.l1norm(); } -float ScoreComponentCollection::GetL2Norm() const { +float ScoreComponentCollection::GetL2Norm() const +{ return m_scores.l2norm(); } -float ScoreComponentCollection::GetLInfNorm() const { +float ScoreComponentCollection::GetLInfNorm() const +{ return m_scores.linfnorm(); } -size_t ScoreComponentCollection::L1Regularize(float lambda) { +size_t ScoreComponentCollection::L1Regularize(float lambda) +{ return m_scores.l1regularize(lambda); } -void ScoreComponentCollection::L2Regularize(float lambda) { +void ScoreComponentCollection::L2Regularize(float lambda) +{ m_scores.l2regularize(lambda); } -size_t ScoreComponentCollection::SparseL1Regularize(float lambda) { +size_t ScoreComponentCollection::SparseL1Regularize(float lambda) +{ return m_scores.sparseL1regularize(lambda); } -void ScoreComponentCollection::SparseL2Regularize(float lambda) { +void ScoreComponentCollection::SparseL2Regularize(float lambda) +{ m_scores.sparseL2regularize(lambda); } -void ScoreComponentCollection::Save(ostream& out) const { +void ScoreComponentCollection::Save(ostream& out) const +{ ScoreIndexMap::const_iterator iter = s_scoreIndexes.begin(); for (; iter != s_scoreIndexes.end(); ++iter ) { string name = iter->first->GetScoreProducerDescription(); @@ -148,7 +159,8 @@ void ScoreComponentCollection::Save(ostream& out) const { m_scores.write(out); } -void ScoreComponentCollection::Save(const string& filename) const { +void ScoreComponentCollection::Save(const string& filename) const +{ ofstream out(filename.c_str()); if (!out) { ostringstream msg; @@ -159,7 +171,8 @@ void ScoreComponentCollection::Save(const string& filename) const { out.close(); } -void ScoreComponentCollection::Assign(const FeatureFunction* sp, const string line) { +void ScoreComponentCollection::Assign(const FeatureFunction* sp, const string line) +{ istringstream istr(line); while(istr) { string namestring; diff --git a/moses/ScoreComponentCollection.h b/moses/ScoreComponentCollection.h index e76c9d06b..70c2a05f1 100644 --- a/moses/ScoreComponentCollection.h +++ b/moses/ScoreComponentCollection.h @@ -64,19 +64,18 @@ class ScoreComponentCollection { friend std::ostream& operator<<(std::ostream& os, const ScoreComponentCollection& rhs); private: - FVector m_scores; + FVector m_scores; typedef std::pair IndexPair; typedef std::map ScoreIndexMap; static ScoreIndexMap s_scoreIndexes; static 
size_t s_denseVectorSize; - static IndexPair GetIndexes(const FeatureFunction* sp) - { + static IndexPair GetIndexes(const FeatureFunction* sp) { ScoreIndexMap::const_iterator indexIter = s_scoreIndexes.find(sp); if (indexIter == s_scoreIndexes.end()) { std::cerr << "ERROR: FeatureFunction: " << sp->GetScoreProducerDescription() << - " not registered with ScoreIndexMap" << std::endl; + " not registered with ScoreIndexMap" << std::endl; std::cerr << "You must call ScoreComponentCollection.RegisterScoreProducer() " << - " for every FeatureFunction" << std::endl; + " for every FeatureFunction" << std::endl; abort(); } return indexIter->second; @@ -91,9 +90,9 @@ public: ScoreComponentCollection(); //! Clone a score collection - ScoreComponentCollection(const ScoreComponentCollection& rhs) - : m_scores(rhs.m_scores) - {} + ScoreComponentCollection(const ScoreComponentCollection& rhs) + : m_scores(rhs.m_scores) + {} ScoreComponentCollection& operator=( const ScoreComponentCollection& rhs ) { m_scores = rhs.m_scores; @@ -101,124 +100,108 @@ public: /** - * Register a ScoreProducer with a fixed number of scores, so that it can + * Register a ScoreProducer with a fixed number of scores, so that it can * be allocated space in the dense part of the feature vector. **/ static void RegisterScoreProducer(const FeatureFunction* scoreProducer); /** Load from file */ - bool Load(const std::string& filename) - { - return m_scores.load(filename); + bool Load(const std::string& filename) { + return m_scores.load(filename); } - const FVector& GetScoresVector() const - { - return m_scores; + const FVector& GetScoresVector() const { + return m_scores; } const std::valarray &getCoreFeatures() const { return m_scores.getCoreFeatures(); } - size_t Size() const - { - return m_scores.size(); + size_t Size() const { + return m_scores.size(); } - void Resize() - { + void Resize() { if (m_scores.coreSize() != s_denseVectorSize) { m_scores.resize(s_denseVectorSize); } } /** Create an FVector with the right number of core features */ - static FVector CreateFVector() - { + static FVector CreateFVector() { return FVector(s_denseVectorSize); } - void SetToBinaryOf(const ScoreComponentCollection& rhs) - { - m_scores.setToBinaryOf(rhs.m_scores); + void SetToBinaryOf(const ScoreComponentCollection& rhs) { + m_scores.setToBinaryOf(rhs.m_scores); } //!
Set all values to 0.0 - void ZeroAll() - { - m_scores.clear(); - } + void ZeroAll() { + m_scores.clear(); + } - void MultiplyEquals(float scalar); - void DivideEquals(float scalar); - void CoreDivideEquals(float scalar); - void DivideEquals(const ScoreComponentCollection& rhs); - void MultiplyEquals(const ScoreComponentCollection& rhs); - void MultiplyEqualsBackoff(const ScoreComponentCollection& rhs, float backoff); - void MultiplyEquals(float core_r0, float sparse_r0); - void MultiplyEquals(const FeatureFunction* sp, float scalar); + void MultiplyEquals(float scalar); + void DivideEquals(float scalar); + void CoreDivideEquals(float scalar); + void DivideEquals(const ScoreComponentCollection& rhs); + void MultiplyEquals(const ScoreComponentCollection& rhs); + void MultiplyEqualsBackoff(const ScoreComponentCollection& rhs, float backoff); + void MultiplyEquals(float core_r0, float sparse_r0); + void MultiplyEquals(const FeatureFunction* sp, float scalar); - size_t GetNumberWeights(const FeatureFunction* sp); + size_t GetNumberWeights(const FeatureFunction* sp); - void CoreAssign(const ScoreComponentCollection& rhs) - { - m_scores.coreAssign(rhs.m_scores); - } + void CoreAssign(const ScoreComponentCollection& rhs) { + m_scores.coreAssign(rhs.m_scores); + } - //! add the score in rhs - void PlusEquals(const ScoreComponentCollection& rhs) - { - m_scores += rhs.m_scores; - } + //! add the score in rhs + void PlusEquals(const ScoreComponentCollection& rhs) { + m_scores += rhs.m_scores; + } - // add only sparse features - void SparsePlusEquals(const ScoreComponentCollection& rhs) - { - m_scores.sparsePlusEquals(rhs.m_scores); - } + // add only sparse features + void SparsePlusEquals(const ScoreComponentCollection& rhs) { + m_scores.sparsePlusEquals(rhs.m_scores); + } - void PlusEquals(const FVector& scores) - { - m_scores += scores; - } + void PlusEquals(const FVector& scores) { + m_scores += scores; + } - //! subtract the score in rhs - void MinusEquals(const ScoreComponentCollection& rhs) - { - m_scores -= rhs.m_scores; - } + //! subtract the score in rhs + void MinusEquals(const ScoreComponentCollection& rhs) { + m_scores -= rhs.m_scores; + } //For features which have an unbounded number of components - void MinusEquals(const FeatureFunction*sp, const std::string& name, float score) - { + void MinusEquals(const FeatureFunction*sp, const std::string& name, float score) { FName fname(sp->GetScoreProducerDescription(),name); m_scores[fname] -= score; } //For features which have an unbounded number of components - void SparseMinusEquals(const std::string& full_name, float score) - { + void SparseMinusEquals(const std::string& full_name, float score) { FName fname(full_name); m_scores[fname] -= score; } - //! Add scores from a single ScoreProducer only - //! The length of scores must be equal to the number of score components - //! produced by sp - void PlusEquals(const FeatureFunction* sp, const ScoreComponentCollection& scores) - { + //! Add scores from a single ScoreProducer only + //! The length of scores must be equal to the number of score components + //! produced by sp + void PlusEquals(const FeatureFunction* sp, const ScoreComponentCollection& scores) { IndexPair indexes = GetIndexes(sp); for (size_t i = indexes.first; i < indexes.second; ++i) { m_scores[i] += scores.m_scores[i]; } - } + } - //! Add scores from a single FeatureFunction only - //! The length of scores must be equal to the number of score components - //! 
produced by sp - void PlusEquals(const FeatureFunction* sp, const std::vector& scores) - { + //! Add scores from a single FeatureFunction only + //! The length of scores must be equal to the number of score components + //! produced by sp + void PlusEquals(const FeatureFunction* sp, const std::vector& scores) { IndexPair indexes = GetIndexes(sp); CHECK(scores.size() == indexes.second - indexes.first); for (size_t i = 0; i < scores.size(); ++i) { @@ -226,56 +209,50 @@ public: } } - //! Special version PlusEquals(ScoreProducer, vector) - //! to add the score from a single ScoreProducer that produces - //! a single value - void PlusEquals(const FeatureFunction* sp, float score) - { + //! Special version PlusEquals(ScoreProducer, vector) + //! to add the score from a single ScoreProducer that produces + //! a single value + void PlusEquals(const FeatureFunction* sp, float score) { IndexPair indexes = GetIndexes(sp); CHECK(1 == indexes.second - indexes.first); m_scores[indexes.first] += score; - } + } //For features which have an unbounded number of components - void PlusEquals(const FeatureFunction*sp, const StringPiece& name, float score) - { + void PlusEquals(const FeatureFunction*sp, const StringPiece& name, float score) { FName fname(sp->GetScoreProducerDescription(),name); m_scores[fname] += score; } //For features which have an unbounded number of components - void SparsePlusEquals(const std::string& full_name, float score) - { - FName fname(full_name); + void SparsePlusEquals(const std::string& full_name, float score) { + FName fname(full_name); m_scores[fname] += score; } - void Assign(const FeatureFunction* sp, const std::vector& scores) - { + void Assign(const FeatureFunction* sp, const std::vector& scores) { IndexPair indexes = GetIndexes(sp); CHECK(scores.size() == indexes.second - indexes.first); for (size_t i = 0; i < scores.size(); ++i) { m_scores[i + indexes.first] = scores[i]; } } - + //! Special version Assign(ScoreProducer, vector) //! to add the score from a single ScoreProducer that produces //! a single value - void Assign(const FeatureFunction* sp, float score) - { + void Assign(const FeatureFunction* sp, float score) { IndexPair indexes = GetIndexes(sp); CHECK(1 == indexes.second - indexes.first); m_scores[indexes.first] = score; } - + // Assign core weight by index void Assign(size_t index, float score) { m_scores[index] = score; } - void Assign(const FeatureFunction*sp, const StringPiece &name, float score) - { + void Assign(const FeatureFunction*sp, const StringPiece &name, float score) { FName fname(sp->GetScoreProducerDescription(),name); m_scores[fname] = score; } @@ -285,27 +262,23 @@ public: void Assign(const FeatureFunction* sp, const std::string line); // shortcut: setting the value directly using the feature name - void Assign(const std::string name, float score) - { - FName fname(name); - m_scores[fname] = score; + void Assign(const std::string name, float score) { + FName fname(name); + m_scores[fname] = score; } - float InnerProduct(const ScoreComponentCollection& rhs) const - { - return m_scores.inner_product(rhs.m_scores); - } - - float PartialInnerProduct(const FeatureFunction* sp, const std::vector& rhs) const - { - std::vector lhs = GetScoresForProducer(sp); - CHECK(lhs.size() == rhs.size()); - return std::inner_product(lhs.begin(), lhs.end(), rhs.begin(), 0.0f); - } + float InnerProduct(const ScoreComponentCollection& rhs) const { + return m_scores.inner_product(rhs.m_scores); + } - //! 
return a vector of all the scores associated with a certain FeatureFunction - std::vector GetScoresForProducer(const FeatureFunction* sp) const - { + float PartialInnerProduct(const FeatureFunction* sp, const std::vector& rhs) const { + std::vector lhs = GetScoresForProducer(sp); + CHECK(lhs.size() == rhs.size()); + return std::inner_product(lhs.begin(), lhs.end(), rhs.begin(), 0.0f); + } + + //! return a vector of all the scores associated with a certain FeatureFunction + std::vector GetScoresForProducer(const FeatureFunction* sp) const { size_t components = sp->GetNumScoreComponents(); std::vector res(components); @@ -314,58 +287,52 @@ public: res[i] = m_scores[i + indexes.first]; } return res; - } + } //! get subset of scores that belong to a certain sparse ScoreProducer FVector GetVectorForProducer(const FeatureFunction* sp) const; - float GetSparseWeight(const FName& featureName) const - { + float GetSparseWeight(const FName& featureName) const { return m_scores[featureName]; } - + void PrintCoreFeatures() { m_scores.printCoreFeatures(); } - void ThresholdScaling(float maxValue) - { - // find (smallest) factor for which all weights are <= maxValue - // 0.1 / 0.14 = 0.714285714 - // 0.1 / 0.17 = 0.588235294 + void ThresholdScaling(float maxValue) { + // find (smallest) factor for which all weights are <= maxValue + // 0.1 / 0.14 = 0.714285714 + // 0.1 / 0.17 = 0.588235294 m_scores.thresholdScale(maxValue); - } + } - void CapMax(float maxValue) - { - // cap all sparse features to maxValue - m_scores.capMax(maxValue); - } + void CapMax(float maxValue) { + // cap all sparse features to maxValue + m_scores.capMax(maxValue); + } - void CapMin(float minValue) - { - // cap all sparse features to minValue - m_scores.capMin(minValue); - } + void CapMin(float minValue) { + // cap all sparse features to minValue + m_scores.capMin(minValue); + } - //! if a FeatureFunction produces a single score (for example, a language model score) - //! this will return it. If not, this method will throw - float GetScoreForProducer(const FeatureFunction* sp) const - { + //! if a FeatureFunction produces a single score (for example, a language model score) + //! this will return it. 
If not, this method will throw + float GetScoreForProducer(const FeatureFunction* sp) const { IndexPair indexes = GetIndexes(sp); CHECK(indexes.second - indexes.first == 1); return m_scores[indexes.first]; - } + } //For features which have an unbounded number of components float GetScoreForProducer - (const FeatureFunction* sp, const std::string& name) const - { + (const FeatureFunction* sp, const std::string& name) const { FName fname(sp->GetScoreProducerDescription(),name); return m_scores[fname]; } - float GetWeightedScore() const; + float GetWeightedScore() const; void ZeroDenseFeatures(const FeatureFunction* sp); void L1Normalise(); @@ -378,45 +345,65 @@ public: void SparseL2Regularize(float lambda); void Save(const std::string& filename) const; void Save(std::ostream&) const; - - void IncrementSparseHopeFeatures() { m_scores.incrementSparseHopeFeatures(); } - void IncrementSparseFearFeatures() { m_scores.incrementSparseFearFeatures(); } - void PrintSparseHopeFeatureCounts(std::ofstream& out) { m_scores.printSparseHopeFeatureCounts(out); } - void PrintSparseFearFeatureCounts(std::ofstream& out) { m_scores.printSparseFearFeatureCounts(out); } - void PrintSparseHopeFeatureCounts() { m_scores.printSparseHopeFeatureCounts(); } - void PrintSparseFearFeatureCounts() { m_scores.printSparseFearFeatureCounts(); } - size_t PruneSparseFeatures(size_t threshold) { return m_scores.pruneSparseFeatures(threshold); } - size_t PruneZeroWeightFeatures() { return m_scores.pruneZeroWeightFeatures(); } - void UpdateConfidenceCounts(ScoreComponentCollection &weightUpdate, bool signedCounts) { m_scores.updateConfidenceCounts(weightUpdate.m_scores, signedCounts); } - void UpdateLearningRates(float decay_core, float decay_sparse, ScoreComponentCollection &confidenceCounts, float core_r0, float sparse_r0) { m_scores.updateLearningRates(decay_core, decay_sparse, confidenceCounts.m_scores, core_r0, sparse_r0); } + + void IncrementSparseHopeFeatures() { + m_scores.incrementSparseHopeFeatures(); + } + void IncrementSparseFearFeatures() { + m_scores.incrementSparseFearFeatures(); + } + void PrintSparseHopeFeatureCounts(std::ofstream& out) { + m_scores.printSparseHopeFeatureCounts(out); + } + void PrintSparseFearFeatureCounts(std::ofstream& out) { + m_scores.printSparseFearFeatureCounts(out); + } + void PrintSparseHopeFeatureCounts() { + m_scores.printSparseHopeFeatureCounts(); + } + void PrintSparseFearFeatureCounts() { + m_scores.printSparseFearFeatureCounts(); + } + size_t PruneSparseFeatures(size_t threshold) { + return m_scores.pruneSparseFeatures(threshold); + } + size_t PruneZeroWeightFeatures() { + return m_scores.pruneZeroWeightFeatures(); + } + void UpdateConfidenceCounts(ScoreComponentCollection &weightUpdate, bool signedCounts) { + m_scores.updateConfidenceCounts(weightUpdate.m_scores, signedCounts); + } + void UpdateLearningRates(float decay_core, float decay_sparse, ScoreComponentCollection &confidenceCounts, float core_r0, float sparse_r0) { + m_scores.updateLearningRates(decay_core, decay_sparse, confidenceCounts.m_scores, core_r0, sparse_r0); + } #ifdef MPI_ENABLE - public: - friend class boost::serialization::access; - - private: - //serialization - template - void save(Archive &ar, const unsigned int version) const { - ar << m_scores; - } - - template - void load(Archive &ar, const unsigned int version) { - ar >> m_scores; +public: + friend class boost::serialization::access; - } +private: + //serialization + template + void save(Archive &ar, const unsigned int version) const { + ar << m_scores; + 
} + + template + void load(Archive &ar, const unsigned int version) { + ar >> m_scores; + + } + + BOOST_SERIALIZATION_SPLIT_MEMBER() - BOOST_SERIALIZATION_SPLIT_MEMBER() - #endif }; struct SCCPlus { ScoreComponentCollection operator() - (const ScoreComponentCollection& lhs, - const ScoreComponentCollection& rhs) { + (const ScoreComponentCollection& lhs, + const ScoreComponentCollection& rhs) { ScoreComponentCollection sum(lhs); sum.PlusEquals(rhs); return sum; diff --git a/moses/ScoreComponentCollectionTest.cpp b/moses/ScoreComponentCollectionTest.cpp index f0813f4e8..41fa6562f 100644 --- a/moses/ScoreComponentCollectionTest.cpp +++ b/moses/ScoreComponentCollectionTest.cpp @@ -29,31 +29,35 @@ using namespace std; BOOST_AUTO_TEST_SUITE(scc) -class MockStatelessFeatureFunction : public StatelessFeatureFunction { - public: - MockStatelessFeatureFunction(const string& desc, size_t n, const string &line) : - StatelessFeatureFunction(desc,n, line) {} - virtual void Evaluate(const PhraseBasedFeatureContext&, ScoreComponentCollection*) const {} - virtual void EvaluateChart(const ChartBasedFeatureContext&, ScoreComponentCollection*) const {} - virtual void Evaluate(const TargetPhrase &targetPhrase +class MockStatelessFeatureFunction : public StatelessFeatureFunction +{ +public: + MockStatelessFeatureFunction(const string& desc, size_t n, const string &line) : + StatelessFeatureFunction(desc,n, line) {} + virtual void Evaluate(const PhraseBasedFeatureContext&, ScoreComponentCollection*) const {} + virtual void EvaluateChart(const ChartBasedFeatureContext&, ScoreComponentCollection*) const {} + virtual void Evaluate(const TargetPhrase &targetPhrase , ScoreComponentCollection &scoreBreakdown , ScoreComponentCollection &estimatedFutureScore) const - { } + { } }; -class MockSingleFeature : public MockStatelessFeatureFunction { - public: - MockSingleFeature(): MockStatelessFeatureFunction("MockSingle",1, "MockSingle") {} +class MockSingleFeature : public MockStatelessFeatureFunction +{ +public: + MockSingleFeature(): MockStatelessFeatureFunction("MockSingle",1, "MockSingle") {} }; -class MockMultiFeature : public MockStatelessFeatureFunction { - public: - MockMultiFeature(): MockStatelessFeatureFunction("MockMulti", 5, "MockMulti") {} +class MockMultiFeature : public MockStatelessFeatureFunction +{ +public: + MockMultiFeature(): MockStatelessFeatureFunction("MockMulti", 5, "MockMulti") {} }; -class MockSparseFeature : public MockStatelessFeatureFunction { - public: - MockSparseFeature(): MockStatelessFeatureFunction("MockSparse", 0, "MockSparse") {} +class MockSparseFeature : public MockStatelessFeatureFunction +{ +public: + MockSparseFeature(): MockStatelessFeatureFunction("MockSparse", 0, "MockSparse") {} }; @@ -66,7 +70,7 @@ struct MockProducers { MockSparseFeature sparse; }; -BOOST_FIXTURE_TEST_CASE(ctor, MockProducers) +BOOST_FIXTURE_TEST_CASE(ctor, MockProducers) { ScoreComponentCollection scc; BOOST_CHECK_EQUAL(scc.GetScoreForProducer(&single),0); @@ -88,11 +92,11 @@ BOOST_FIXTURE_TEST_CASE(plusequals, MockProducers) scc.PlusEquals(&multi,vec1); std::vector actual = scc.GetScoresForProducer(&multi); BOOST_CHECK_EQUAL_COLLECTIONS(vec1.begin(),vec1.end() - ,actual.begin(), actual.end()); + ,actual.begin(), actual.end()); scc.PlusEquals(&multi,vec1); actual = scc.GetScoresForProducer(&multi); BOOST_CHECK_EQUAL_COLLECTIONS(vec2.begin(),vec2.end(), - actual.begin(), actual.end()); + actual.begin(), actual.end()); BOOST_CHECK_EQUAL(scc.GetScoreForProducer(&single), 3.4f); } diff --git 
a/moses/SearchNormalBatch.cpp b/moses/SearchNormalBatch.cpp index ca72b3973..aa3aeb0a3 100644 --- a/moses/SearchNormalBatch.cpp +++ b/moses/SearchNormalBatch.cpp @@ -18,21 +18,21 @@ SearchNormalBatch::SearchNormalBatch(Manager& manager, const InputType &source, // Split the feature functions into sets of stateless, stateful // distributed lm, and stateful non-distributed. const vector& ffs = - StatefulFeatureFunction::GetStatefulFeatureFunctions(); + StatefulFeatureFunction::GetStatefulFeatureFunctions(); for (unsigned i = 0; i < ffs.size(); ++i) { - if (ffs[i]->GetScoreProducerDescription() == "DLM_5gram") { // TODO WFT - m_dlm_ffs[i] = const_cast(static_cast(ffs[i])); - m_dlm_ffs[i]->SetFFStateIdx(i); - } - else { - m_stateful_ffs[i] = const_cast(ffs[i]); - } + if (ffs[i]->GetScoreProducerDescription() == "DLM_5gram") { // TODO WFT + m_dlm_ffs[i] = const_cast(static_cast(ffs[i])); + m_dlm_ffs[i]->SetFFStateIdx(i); + } else { + m_stateful_ffs[i] = const_cast(ffs[i]); + } } m_stateless_ffs = StatelessFeatureFunction::GetStatelessFeatureFunctions(); - + } -SearchNormalBatch::~SearchNormalBatch() { +SearchNormalBatch::~SearchNormalBatch() +{ } /** @@ -138,79 +138,79 @@ void SearchNormalBatch::ExpandHypothesis(const Hypothesis &hypothesis, const Tra for (dlm_iter = m_dlm_ffs.begin(); dlm_iter != m_dlm_ffs.end(); ++dlm_iter) { - const FFState* input_state = newHypo->GetPrevHypo() ? newHypo->GetPrevHypo()->GetFFState((*dlm_iter).first) : NULL; - (*dlm_iter).second->IssueRequestsFor(*newHypo, input_state); + const FFState* input_state = newHypo->GetPrevHypo() ? newHypo->GetPrevHypo()->GetFFState((*dlm_iter).first) : NULL; + (*dlm_iter).second->IssueRequestsFor(*newHypo, input_state); } m_partial_hypos.push_back(newHypo); - } - else { + } else { std::cerr << "can't use early discarding with batch decoding!" << std::endl; abort(); } } -void SearchNormalBatch::EvalAndMergePartialHypos() { - std::vector::iterator partial_hypo_iter; - for (partial_hypo_iter = m_partial_hypos.begin(); - partial_hypo_iter != m_partial_hypos.end(); - ++partial_hypo_iter) { - Hypothesis* hypo = *partial_hypo_iter; +void SearchNormalBatch::EvalAndMergePartialHypos() +{ + std::vector::iterator partial_hypo_iter; + for (partial_hypo_iter = m_partial_hypos.begin(); + partial_hypo_iter != m_partial_hypos.end(); + ++partial_hypo_iter) { + Hypothesis* hypo = *partial_hypo_iter; - // Evaluate with other ffs. - std::map::iterator sfff_iter; - for (sfff_iter = m_stateful_ffs.begin(); - sfff_iter != m_stateful_ffs.end(); - ++sfff_iter) { - const StatefulFeatureFunction &ff = *(sfff_iter->second); - int state_idx = sfff_iter->first; - hypo->EvaluateWith(ff, state_idx); - } - std::vector::iterator slff_iter; - for (slff_iter = m_stateless_ffs.begin(); - slff_iter != m_stateless_ffs.end(); - ++slff_iter) { - hypo->EvaluateWith(**slff_iter); - } + // Evaluate with other ffs. + std::map::iterator sfff_iter; + for (sfff_iter = m_stateful_ffs.begin(); + sfff_iter != m_stateful_ffs.end(); + ++sfff_iter) { + const StatefulFeatureFunction &ff = *(sfff_iter->second); + int state_idx = sfff_iter->first; + hypo->EvaluateWith(ff, state_idx); } + std::vector::iterator slff_iter; + for (slff_iter = m_stateless_ffs.begin(); + slff_iter != m_stateless_ffs.end(); + ++slff_iter) { + hypo->EvaluateWith(**slff_iter); + } + } - // Wait for all requests from the distributed LM to come back. + // Wait for all requests from the distributed LM to come back. 
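+ // (Assumed flow of the batched round-trip: IssueRequestsFor(), called from
+ // ExpandHypothesis() above, only queues the n-gram requests of each partial
+ // hypothesis; sync() below blocks until the distributed LM has answered the
+ // whole batch, after which EvaluateWith() can read the scores locally.)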
+ std::map::iterator dlm_iter; + for (dlm_iter = m_dlm_ffs.begin(); + dlm_iter != m_dlm_ffs.end(); + ++dlm_iter) { + (*dlm_iter).second->sync(); + } + + // Incorporate the DLM scores into all hypotheses and put into their + // stacks. + for (partial_hypo_iter = m_partial_hypos.begin(); + partial_hypo_iter != m_partial_hypos.end(); + ++partial_hypo_iter) { + Hypothesis* hypo = *partial_hypo_iter; + + // Calculate DLM scores. std::map::iterator dlm_iter; for (dlm_iter = m_dlm_ffs.begin(); dlm_iter != m_dlm_ffs.end(); ++dlm_iter) { - (*dlm_iter).second->sync(); + LanguageModel &lm = *(dlm_iter->second); + hypo->EvaluateWith(lm, (*dlm_iter).first); } - // Incorporate the DLM scores into all hypotheses and put into their - // stacks. - for (partial_hypo_iter = m_partial_hypos.begin(); - partial_hypo_iter != m_partial_hypos.end(); - ++partial_hypo_iter) { - Hypothesis* hypo = *partial_hypo_iter; + // Put completed hypothesis onto its stack. + size_t wordsTranslated = hypo->GetWordsBitmap().GetNumWordsCovered(); + m_hypoStackColl[wordsTranslated]->AddPrune(hypo); + } + m_partial_hypos.clear(); - // Calculate DLM scores. - std::map::iterator dlm_iter; - for (dlm_iter = m_dlm_ffs.begin(); - dlm_iter != m_dlm_ffs.end(); - ++dlm_iter) { - LanguageModel &lm = *(dlm_iter->second); - hypo->EvaluateWith(lm, (*dlm_iter).first); - } - - // Put completed hypothesis onto its stack. - size_t wordsTranslated = hypo->GetWordsBitmap().GetNumWordsCovered(); - m_hypoStackColl[wordsTranslated]->AddPrune(hypo); - } - m_partial_hypos.clear(); - - std::vector < HypothesisStack* >::iterator stack_iter; - HypothesisStackNormal* stack; - for (stack_iter = m_hypoStackColl.begin(); - stack_iter != m_hypoStackColl.end(); - ++stack_iter) { - stack = static_cast(*stack_iter); - stack->PruneToSize(m_max_stack_size); - } + std::vector < HypothesisStack* >::iterator stack_iter; + HypothesisStackNormal* stack; + for (stack_iter = m_hypoStackColl.begin(); + stack_iter != m_hypoStackColl.end(); + ++stack_iter) { + stack = static_cast(*stack_iter); + stack->PruneToSize(m_max_stack_size); + } } } diff --git a/moses/SearchNormalBatch.h b/moses/SearchNormalBatch.h index fcfda7054..7f6764635 100644 --- a/moses/SearchNormalBatch.h +++ b/moses/SearchNormalBatch.h @@ -13,7 +13,7 @@ class TranslationOptionCollection; /** Implements the phrase-based stack decoding algorithm (no cube pruning) with a twist... * Language model requests are batched together, duplicate requests are removed, and requests are sent together. * Useful for distributed LM where network latency is an issue. - */ + */ class SearchNormalBatch: public SearchNormal { protected: @@ -21,7 +21,7 @@ protected: // Added for asynclm decoding. 
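// (Bookkeeping for batched scoring, split up in the constructor in
// SearchNormalBatch.cpp: m_dlm_ffs holds the distributed LMs whose scores
// arrive asynchronously, m_stateful_ffs and m_stateless_ffs are evaluated
// immediately, and m_partial_hypos buffers hypotheses awaiting LM scores.)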
std::vector m_stateless_ffs; std::map m_dlm_ffs; - std::map m_stateful_ffs; + std::map m_stateful_ffs; std::vector m_partial_hypos; int m_batch_size; int m_max_stack_size; diff --git a/moses/Sentence.cpp b/moses/Sentence.cpp index b2e5a6633..8e76b0f03 100644 --- a/moses/Sentence.cpp +++ b/moses/Sentence.cpp @@ -104,8 +104,7 @@ int Sentence::Read(std::istream& in,const std::vector& factorOrder) this->SetTopicId(atol(topic_params[0].c_str())); this->SetUseTopicId(true); this->SetUseTopicIdAndProb(false); - } - else { + } else { this->SetTopicIdAndProb(topic_params); this->SetUseTopicId(false); this->SetUseTopicIdAndProb(true); diff --git a/moses/StaticData.cpp b/moses/StaticData.cpp index df4c14cde..f822e4e13 100644 --- a/moses/StaticData.cpp +++ b/moses/StaticData.cpp @@ -109,15 +109,15 @@ StaticData::~StaticData() typedef std::map, std::pair > Coll; Coll::iterator iter; for (iter = m_transOptCache.begin(); iter != m_transOptCache.end(); ++iter) { - std::pair &valuePair =iter->second; - TranslationOptionList *transOptList = valuePair.first; - delete transOptList; + std::pair &valuePair =iter->second; + TranslationOptionList *transOptList = valuePair.first; + delete transOptList; } /* const std::vector &producers = FeatureFunction::GetFeatureFunctions(); for(size_t i=0;iGetParam("parsing-algorithm").size() > 0) ? - (ParsingAlgorithm) Scan(m_parameter->GetParam("parsing-algorithm")[0]) : ParseCYKPlus; + (ParsingAlgorithm) Scan(m_parameter->GetParam("parsing-algorithm")[0]) : ParseCYKPlus; // to cube or not to cube m_searchAlgorithm = (m_parameter->GetParam("search-algorithm").size() > 0) ? @@ -217,7 +218,7 @@ bool StaticData::LoadData(Parameter *parameter) } else { m_nBestFactor = 20; } - + //lattice samples if (m_parameter->GetParam("lattice-samples").size() ==2 ) { m_latticeSamplesFilePath = m_parameter->GetParam("lattice-samples")[0]; @@ -276,11 +277,11 @@ bool StaticData::LoadData(Parameter *parameter) #endif SetBooleanParameter( &m_unprunedSearchGraph, "unpruned-search-graph", false ); SetBooleanParameter( &m_includeLHSInSearchGraph, "include-lhs-in-search-graph", false ); - + if (m_parameter->isParamSpecified("output-unknowns")) { if (m_parameter->GetParam("output-unknowns").size() == 1) { - m_outputUnknownsFile =Scan(m_parameter->GetParam("output-unknowns")[0]); + m_outputUnknownsFile =Scan(m_parameter->GetParam("output-unknowns")[0]); } else { UserMessage::Add(string("need to specify exactly one file name for unknowns")); return false; @@ -422,7 +423,7 @@ bool StaticData::LoadData(Parameter *parameter) cerr << "Error: Cannot use both n-best mbr and lattice mbr together" << endl; exit(1); } - + //mira training SetBooleanParameter( &m_mira, "mira", false ); @@ -446,7 +447,7 @@ bool StaticData::LoadData(Parameter *parameter) exit(1); } if (m_useConsensusDecoding) m_mbr=true; - + // Compact phrase table and reordering model SetBooleanParameter( &m_minphrMemory, "minphr-memory", false ); SetBooleanParameter( &m_minlexrMemory, "minlexr-memory", false ); @@ -489,7 +490,7 @@ bool StaticData::LoadData(Parameter *parameter) } m_startTranslationId = (m_parameter->GetParam("start-translation-id").size() > 0) ?
- Scan(m_parameter->GetParam("start-translation-id")[0]) : 0; + Scan(m_parameter->GetParam("start-translation-id")[0]) : 0; // Read in constraint decoding file, if provided if(m_parameter->GetParam("constraint").size()) { @@ -503,7 +504,7 @@ bool StaticData::LoadData(Parameter *parameter) InputFileStream constraintFile(m_constraintFileName); std::string line; - + long sentenceID = GetStartTranslationId() - 1; while (getline(constraintFile, line)) { vector vecStr = Tokenize(line, "\t"); @@ -546,14 +547,14 @@ bool StaticData::LoadData(Parameter *parameter) // specify XML tags opening and closing brackets for XML option if (m_parameter->GetParam("xml-brackets").size() > 0) { - std::vector brackets = Tokenize(m_parameter->GetParam("xml-brackets")[0]); - if(brackets.size()!=2) { - cerr << "invalid xml-brackets value, must specify exactly 2 blank-delimited strings for XML tags opening and closing brackets" << endl; - exit(1); - } - m_xmlBrackets.first= brackets[0]; - m_xmlBrackets.second=brackets[1]; - cerr << "XML tags opening and closing brackets for XML input are: " << m_xmlBrackets.first << " and " << m_xmlBrackets.second << endl; + std::vector brackets = Tokenize(m_parameter->GetParam("xml-brackets")[0]); + if(brackets.size()!=2) { + cerr << "invalid xml-brackets value, must specify exactly 2 blank-delimited strings for XML tags opening and closing brackets" << endl; + exit(1); + } + m_xmlBrackets.first= brackets[0]; + m_xmlBrackets.second=brackets[1]; + cerr << "XML tags opening and closing brackets for XML input are: " << m_xmlBrackets.first << " and " << m_xmlBrackets.second << endl; } // all features @@ -574,58 +575,47 @@ bool StaticData::LoadData(Parameter *parameter) GlobalLexicalModel *model = new GlobalLexicalModel(line); vector weights = m_parameter->GetWeights(model->GetScoreProducerDescription()); SetWeights(model, weights); - } - else if (feature == "GlobalLexicalModelUnlimited") { + } else if (feature == "GlobalLexicalModelUnlimited") { GlobalLexicalModelUnlimited *model = NULL; //new GlobalLexicalModelUnlimited(line); vector weights = m_parameter->GetWeights(model->GetScoreProducerDescription()); SetWeights(model, weights); - } - else if (feature == "SourceWordDeletionFeature") { + } else if (feature == "SourceWordDeletionFeature") { SourceWordDeletionFeature *model = new SourceWordDeletionFeature(line); vector weights = m_parameter->GetWeights(model->GetScoreProducerDescription()); //SetWeights(model, weights); - } - else if (feature == "TargetWordInsertionFeature") { + } else if (feature == "TargetWordInsertionFeature") { TargetWordInsertionFeature *model = new TargetWordInsertionFeature(line); vector weights = m_parameter->GetWeights(model->GetScoreProducerDescription()); //SetWeights(model, weights); - } - else if (feature == "PhraseBoundaryFeature") { + } else if (feature == "PhraseBoundaryFeature") { PhraseBoundaryFeature *model = new PhraseBoundaryFeature(line); vector weights = m_parameter->GetWeights(model->GetScoreProducerDescription()); //SetWeights(model, weights); - } - else if (feature == "PhraseLengthFeature") { + } else if (feature == "PhraseLengthFeature") { PhraseLengthFeature *model = new PhraseLengthFeature(line); vector weights = m_parameter->GetWeights(model->GetScoreProducerDescription()); //SetWeights(model, weights); - } - else if (feature == "WordTranslationFeature") { + } else if (feature == "WordTranslationFeature") { WordTranslationFeature *model = new WordTranslationFeature(line); vector weights = 
m_parameter->GetWeights(model->GetScoreProducerDescription()); //SetWeights(model, weights); - } - else if (feature == "TargetBigramFeature") { + } else if (feature == "TargetBigramFeature") { TargetBigramFeature *model = new TargetBigramFeature(line); vector weights = m_parameter->GetWeights(model->GetScoreProducerDescription()); //SetWeights(model, weights); - } - else if (feature == "TargetNgramFeature") { + } else if (feature == "TargetNgramFeature") { TargetNgramFeature *model = new TargetNgramFeature(line); vector weights = m_parameter->GetWeights(model->GetScoreProducerDescription()); //SetWeights(model, weights); - } - else if (feature == "PhrasePairFeature") { + } else if (feature == "PhrasePairFeature") { PhrasePairFeature *model = new PhrasePairFeature(line); vector weights = m_parameter->GetWeights(model->GetScoreProducerDescription()); //SetWeights(model, weights); - } - else if (feature == "LexicalReordering") { + } else if (feature == "LexicalReordering") { LexicalReordering *model = new LexicalReordering(line); vector weights = m_parameter->GetWeights(model->GetScoreProducerDescription()); SetWeights(model, weights); - } - else if (feature == "KENLM") { + } else if (feature == "KENLM") { LanguageModel *model = ConstructKenLM(feature, line); vector weights = m_parameter->GetWeights(model->GetScoreProducerDescription()); SetWeights(model, weights); @@ -648,69 +638,58 @@ bool StaticData::LoadData(Parameter *parameter) GenerationDictionary *model = new GenerationDictionary(line); vector weights = m_parameter->GetWeights(model->GetScoreProducerDescription()); SetWeights(model, weights); - } - else if (feature == "BleuScoreFeature") { + } else if (feature == "BleuScoreFeature") { BleuScoreFeature *model = new BleuScoreFeature(line); vector weights = m_parameter->GetWeights(model->GetScoreProducerDescription()); SetWeights(model, weights); - } - else if (feature == "Distortion") { + } else if (feature == "Distortion") { DistortionScoreProducer *model = new DistortionScoreProducer(line); vector weights = m_parameter->GetWeights(model->GetScoreProducerDescription()); SetWeights(model, weights); - } - else if (feature == "WordPenalty") { + } else if (feature == "WordPenalty") { WordPenaltyProducer *model = new WordPenaltyProducer(line); vector weights = m_parameter->GetWeights(model->GetScoreProducerDescription()); SetWeights(model, weights); m_wpProducer = model; - } - else if (feature == "UnknownWordPenalty") { + } else if (feature == "UnknownWordPenalty") { UnknownWordPenaltyProducer *model = new UnknownWordPenaltyProducer(line); vector weights = m_parameter->GetWeights(model->GetScoreProducerDescription()); if (weights.size() == 0) weights.push_back(1.0f); SetWeights(model, weights); m_unknownWordPenaltyProducer = model; - } - else if (feature == "PhraseDictionaryBinary") { + } else if (feature == "PhraseDictionaryBinary") { PhraseDictionaryTreeAdaptor* model = new PhraseDictionaryTreeAdaptor(line); vector weights = m_parameter->GetWeights(model->GetScoreProducerDescription()); SetWeights(model, weights); m_phraseDictionary.push_back(model); - } - else if (feature == "PhraseDictionaryOnDisk") { + } else if (feature == "PhraseDictionaryOnDisk") { PhraseDictionaryOnDisk* model = new PhraseDictionaryOnDisk(line); vector weights = m_parameter->GetWeights(model->GetScoreProducerDescription()); SetWeights(model, weights); m_phraseDictionary.push_back(model); - } - else if (feature == "PhraseDictionaryMemory") { + } else if (feature == "PhraseDictionaryMemory") { PhraseDictionaryMemory* 
model = new PhraseDictionaryMemory(line); vector weights = m_parameter->GetWeights(model->GetScoreProducerDescription()); SetWeights(model, weights); m_phraseDictionary.push_back(model); - } - else if (feature == "PhraseDictionaryCompact") { + } else if (feature == "PhraseDictionaryCompact") { PhraseDictionaryCompact* model = new PhraseDictionaryCompact(line); vector weights = m_parameter->GetWeights(model->GetScoreProducerDescription()); SetWeights(model, weights); m_phraseDictionary.push_back(model); - } - else if (feature == "PhraseDictionaryMultiModel") { + } else if (feature == "PhraseDictionaryMultiModel") { PhraseDictionaryMultiModel* model = new PhraseDictionaryMultiModel(line); vector weights = m_parameter->GetWeights(model->GetScoreProducerDescription()); SetWeights(model, weights); m_phraseDictionary.push_back(model); - } - else if (feature == "PhraseDictionaryMultiModelCounts") { + } else if (feature == "PhraseDictionaryMultiModelCounts") { PhraseDictionaryMultiModelCounts* model = new PhraseDictionaryMultiModelCounts(line); vector weights = m_parameter->GetWeights(model->GetScoreProducerDescription()); SetWeights(model, weights); m_phraseDictionary.push_back(model); - } - else if (feature == "PhraseDictionaryALSuffixArray") { - PhraseDictionaryALSuffixArray* model = new PhraseDictionaryALSuffixArray(line); + } else if (feature == "PhraseDictionaryALSuffixArray") { + PhraseDictionaryALSuffixArray* model = new PhraseDictionaryALSuffixArray(line); vector weights = m_parameter->GetWeights(model->GetScoreProducerDescription()); SetWeights(model, weights); m_phraseDictionary.push_back(model); @@ -912,7 +891,7 @@ bool StaticData::LoadDecodeGraphs() DecodeGraph *decodeGraph; if (IsChart()) { size_t maxChartSpan = (decodeGraphInd < maxChartSpans.size()) ? 
maxChartSpans[decodeGraphInd] : DEFAULT_MAX_CHART_SPAN; - cerr << "max-chart-span: " << maxChartSpans[decodeGraphInd] << endl; + cerr << "max-chart-span: " << maxChartSpans[decodeGraphInd] << endl; decodeGraph = new DecodeGraph(m_decodeGraphs.size(), maxChartSpan); } else { decodeGraph = new DecodeGraph(m_decodeGraphs.size()); @@ -947,7 +926,7 @@ const TranslationOptionList* StaticData::FindTransOptListInCache(const DecodeGra boost::mutex::scoped_lock lock(m_transOptCacheMutex); #endif std::map, std::pair >::iterator iter - = m_transOptCache.find(key); + = m_transOptCache.find(key); if (iter == m_transOptCache.end()) return NULL; iter->second.second = clock(); // update last used time @@ -994,7 +973,8 @@ void StaticData::AddTransOptListToCache(const DecodeGraph &decodeGraph, const Ph m_transOptCache[key] = make_pair( storedTransOptList, clock() ); ReduceTransOptCache(); } -void StaticData::ClearTransOptionCache() const { +void StaticData::ClearTransOptionCache() const +{ map, std::pair< TranslationOptionList*, clock_t > >::iterator iterCache; for (iterCache = m_transOptCache.begin() ; iterCache != m_transOptCache.end() ; ++iterCache) { TranslationOptionList *transOptList = iterCache->second.first; @@ -1091,20 +1071,19 @@ void StaticData::SetExecPath(const std::string &path) { /* namespace fs = boost::filesystem; - + fs::path full_path( fs::initial_path() ); - + full_path = fs::system_complete( fs::path( path ) ); - + //Without file name m_binPath = full_path.parent_path().string(); */ - + // NOT TESTED size_t pos = path.rfind("/"); - if (pos != string::npos) - { - m_binPath = path.substr(0, pos); + if (pos != string::npos) { + m_binPath = path.substr(0, pos); } cerr << m_binPath << endl; } @@ -1114,27 +1093,31 @@ const string &StaticData::GetBinDirectory() const return m_binPath; } -float StaticData::GetWeightWordPenalty() const { +float StaticData::GetWeightWordPenalty() const +{ float weightWP = GetWeight(m_wpProducer); //VERBOSE(1, "Read weightWP from translation system: " << weightWP << std::endl); return weightWP; } -float StaticData::GetWeightUnknownWordPenalty() const { +float StaticData::GetWeightUnknownWordPenalty() const +{ return GetWeight(m_unknownWordPenaltyProducer); } -void StaticData::InitializeForInput(const InputType& source) const { +void StaticData::InitializeForInput(const InputType& source) const +{ const std::vector &producers = FeatureFunction::GetFeatureFunctions(); - for(size_t i=0;i &producers = FeatureFunction::GetFeatureFunctions(); - for(size_t i=0;i::iterator iter = weightNames.find(descr); if (iter == weightNames.end()) { cerr << "Can't find weights for feature function " << descr << endl; - } - else { + } else { weightNames.erase(iter); } } diff --git a/moses/StaticData.h b/moses/StaticData.h index 01a0a19df..5a1cec213 100644 --- a/moses/StaticData.h +++ b/moses/StaticData.h @@ -87,7 +87,7 @@ protected: m_translationOptionThreshold, m_wordDeletionWeight; - + // PhraseTrans, Generation & LanguageModelScore have multiple weights.
int m_maxDistortion; // do it differently from old pharaoh @@ -206,7 +206,7 @@ protected: int m_threadCount; long m_startTranslationId; - + StaticData(); @@ -223,7 +223,7 @@ protected: bool m_continuePartialTranslation; std::string m_binPath; - + public: bool IsAlwaysCreateDirectTranslationOption() const { @@ -363,15 +363,15 @@ public: bool IsLabeledNBestList() const { return m_labeledNBestList; } - + bool UseMinphrInMemory() const { - return m_minphrMemory; + return m_minphrMemory; } bool UseMinlexrInMemory() const { - return m_minlexrMemory; + return m_minlexrMemory; } - + size_t GetNumRealWordsInInput() const { return m_numRealWordsInInput; } @@ -421,13 +421,16 @@ public: bool IsChart() const { return m_searchAlgorithm == ChartDecoding || m_searchAlgorithm == ChartIncremental; } - const WordPenaltyProducer *GetWordPenaltyProducer() const - { return m_wpProducer; } - WordPenaltyProducer *GetWordPenaltyProducer() // for mira - { return m_wpProducer; } + const WordPenaltyProducer *GetWordPenaltyProducer() const { + return m_wpProducer; + } + WordPenaltyProducer *GetWordPenaltyProducer() { // for mira + return m_wpProducer; + } - const UnknownWordPenaltyProducer *GetUnknownWordPenaltyProducer() const - { return m_unknownWordPenaltyProducer; } + const UnknownWordPenaltyProducer *GetUnknownWordPenaltyProducer() const { + return m_unknownWordPenaltyProducer; + } size_t GetNumInputScores() const { return m_numInputScores; @@ -458,7 +461,7 @@ public: float GetSparseWeight(const FName& featureName) const { return m_allWeights.GetSparseWeight(featureName); } - + //Weights for feature with fixed number of values void SetWeights(const FeatureFunction* sp, const std::vector& weights); @@ -627,15 +630,17 @@ public: int ThreadCount() const { return m_threadCount; } - - long GetStartTranslationId() const - { return m_startTranslationId; } - + + long GetStartTranslationId() const { + return m_startTranslationId; + } + void SetExecPath(const std::string &path); const std::string &GetBinDirectory() const; bool NeedAlignmentInfo() const { - return m_needAlignmentInfo; } + return m_needAlignmentInfo; + } const std::string &GetAlignmentOutputFile() const { return m_alignmentOutputFile; } @@ -656,19 +661,26 @@ public: float GetWeightWordPenalty() const; float GetWeightUnknownWordPenalty() const; - const std::vector& GetPhraseDictionaries() const - { return m_phraseDictionary;} - const std::vector& GetGenerationDictionaries() const - { return m_generationDictionary;} - const PhraseDictionary*GetTranslationScoreProducer(size_t index) const - { return GetPhraseDictionaries().at(index); } + const std::vector& GetPhraseDictionaries() const { + return m_phraseDictionary; + } + const std::vector& GetGenerationDictionaries() const { + return m_generationDictionary; + } + const PhraseDictionary*GetTranslationScoreProducer(size_t index) const { + return GetPhraseDictionaries().at(index); + } std::vector GetTranslationWeights(size_t index) const { std::vector weights = GetWeights(GetTranslationScoreProducer(index)); return weights; } - const std::vector& GetDecodeGraphs() const {return m_decodeGraphs;} - const std::vector& GetDecodeGraphBackoff() const {return m_decodeGraphBackoff;} + const std::vector& GetDecodeGraphs() const { + return m_decodeGraphs; + } + const std::vector& GetDecodeGraphBackoff() const { + return m_decodeGraphBackoff; + } //sentence (and thread) specific initialisation and cleanup void InitializeForInput(const InputType& source) const; @@ -697,8 +709,7 @@ public: #ifdef WITH_THREADS if
(m_multimodelweights_tmp.find(boost::this_thread::get_id()) != m_multimodelweights_tmp.end()) { return &m_multimodelweights_tmp.find(boost::this_thread::get_id())->second; - } - else { + } else { return NULL; } #else diff --git a/moses/SyntacticLanguageModel.cpp b/moses/SyntacticLanguageModel.cpp index 4a3b26ff1..cde041fe7 100644 --- a/moses/SyntacticLanguageModel.cpp +++ b/moses/SyntacticLanguageModel.cpp @@ -10,154 +10,159 @@ namespace Moses { - SyntacticLanguageModel::SyntacticLanguageModel(const std::string &line) - // Initialize member variables - /* - : m_NumScoreComponents(weights.size()) - , m_files(new SyntacticLanguageModelFiles(filePath)) - , m_factorType(factorType) - , m_beamWidth(beamWidth) { - */ - { - /* taken from StaticData::LoadSyntacticLanguageModel() - cerr << "Loading syntactic language models..." << std::endl; +SyntacticLanguageModel::SyntacticLanguageModel(const std::string &line) +// Initialize member variables +/* +: m_NumScoreComponents(weights.size()) +, m_files(new SyntacticLanguageModelFiles(filePath)) +, m_factorType(factorType) +, m_beamWidth(beamWidth) { +*/ +{ + /* taken from StaticData::LoadSyntacticLanguageModel() + cerr << "Loading syntactic language models..." << std::endl; - const vector weights = Scan(m_parameter->GetParam("weight-slm")); - const vector files = m_parameter->GetParam("slmodel-file"); + const vector weights = Scan(m_parameter->GetParam("weight-slm")); + const vector files = m_parameter->GetParam("slmodel-file"); - const FactorType factorType = (m_parameter->GetParam("slmodel-factor").size() > 0) ? - TransformScore(Scan(m_parameter->GetParam("slmodel-factor")[0])) - : 0; + const FactorType factorType = (m_parameter->GetParam("slmodel-factor").size() > 0) ? + TransformScore(Scan(m_parameter->GetParam("slmodel-factor")[0])) + : 0; - const size_t beamWidth = (m_parameter->GetParam("slmodel-beam").size() > 0) ? - TransformScore(Scan(m_parameter->GetParam("slmodel-beam")[0])) - : 500; + const size_t beamWidth = (m_parameter->GetParam("slmodel-beam").size() > 0) ? + TransformScore(Scan(m_parameter->GetParam("slmodel-beam")[0])) + : 500; - if (files.size() < 1) { - cerr << "No syntactic language model files specified!" << std::endl; + if (files.size() < 1) { + cerr << "No syntactic language model files specified!" 
<< std::endl; + return false; + } + + // check if feature is used + if (weights.size() >= 1) { + + //cout.setf(ios::scientific,ios::floatfield); + //cerr.setf(ios::scientific,ios::floatfield); + + // create the feature + m_syntacticLanguageModel = new SyntacticLanguageModel(files,weights,factorType,beamWidth); + + + ///////////////////////////////////////// + // BEGIN LANE's UNSTABLE EXPERIMENT :) + // + + //double ppl = m_syntacticLanguageModel->perplexity(); + //cerr << "Probability is " << ppl << endl; + + + // + // END LANE's UNSTABLE EXPERIMENT + ///////////////////////////////////////// + + + + if (m_syntacticLanguageModel==NULL) { return false; } - // check if feature is used - if (weights.size() >= 1) { - - //cout.setf(ios::scientific,ios::floatfield); - //cerr.setf(ios::scientific,ios::floatfield); - - // create the feature - m_syntacticLanguageModel = new SyntacticLanguageModel(files,weights,factorType,beamWidth); - - - ///////////////////////////////////////// - // BEGIN LANE's UNSTABLE EXPERIMENT :) - // - - //double ppl = m_syntacticLanguageModel->perplexity(); - //cerr << "Probability is " << ppl << endl; - - - // - // END LANE's UNSTABLE EXPERIMENT - ///////////////////////////////////////// - - - - if (m_syntacticLanguageModel==NULL) { - return false; - } - - } - - return true; - - */ } - SyntacticLanguageModel::~SyntacticLanguageModel() { - VERBOSE(3,"Destructing SyntacticLanguageModel" << std::endl); - delete m_files; - } + return true; - size_t SyntacticLanguageModel::GetNumScoreComponents() const { - return m_NumScoreComponents; - } + */ +} - std::string SyntacticLanguageModel::GetScoreProducerDescription() const { - return "SyntacticLM"; - } +SyntacticLanguageModel::~SyntacticLanguageModel() +{ + VERBOSE(3,"Destructing SyntacticLanguageModel" << std::endl); + delete m_files; +} - const FFState* SyntacticLanguageModel::EmptyHypothesisState(const InputType &input) const { +size_t SyntacticLanguageModel::GetNumScoreComponents() const +{ + return m_NumScoreComponents; +} - return new SyntacticLanguageModelState(m_files,m_beamWidth); +std::string SyntacticLanguageModel::GetScoreProducerDescription() const +{ + return "SyntacticLM"; +} - } +const FFState* SyntacticLanguageModel::EmptyHypothesisState(const InputType &input) const +{ - /* - double SyntacticLanguageModel::perplexity() { - - SyntacticLanguageModelState *prev = - new SyntacticLanguageModelState(m_files,m_beamWidth); - - std::cerr << "Initial prob:" << "\t" << prev->getProb() < words(3); - words[0] = "no"; - words[1] = ","; - words[2] = "zxvth"; - - - for (std::vector::iterator i=words.begin(); - i != words.end(); - i++) { - - prev = new SyntacticLanguageModelState(prev, *i); - std::cerr << *i << "\t" << prev->getProb() <getProb(); - - } - */ - FFState* SyntacticLanguageModel::Evaluate(const Hypothesis& cur_hypo, - const FFState* prev_state, - ScoreComponentCollection* accumulator) const { - - VERBOSE(3,"Evaluating SyntacticLanguageModel for a hypothesis" << endl); - - SyntacticLanguageModelState* tmpState = NULL; - SyntacticLanguageModelState* nextState = NULL; - - - const TargetPhrase& targetPhrase = cur_hypo.GetCurrTargetPhrase(); - - for (size_t i=0, n=targetPhrase.GetSize(); iGetString(); - - if (i==0) { - nextState = new SyntacticLanguageModelState((const SyntacticLanguageModelState*)prev_state, string); - } else { - tmpState = nextState; - nextState = new SyntacticLanguageModelState(tmpState, string); - delete tmpState; - } - - double score = nextState->getScore(); - VERBOSE(3,"SynLM evaluated a score of " 
<< score << endl); - accumulator->Assign( this, score ); - } - - - - return nextState; - - } + return new SyntacticLanguageModelState(m_files,m_beamWidth); + +} + +/* +double SyntacticLanguageModel::perplexity() { + + SyntacticLanguageModelState *prev = + new SyntacticLanguageModelState(m_files,m_beamWidth); + + std::cerr << "Initial prob:" << "\t" << prev->getProb() < words(3); + words[0] = "no"; + words[1] = ","; + words[2] = "zxvth"; + + + for (std::vector::iterator i=words.begin(); + i != words.end(); + i++) { + + prev = new SyntacticLanguageModelState(prev, *i); + std::cerr << *i << "\t" << prev->getProb() <getProb(); + +} +*/ +FFState* SyntacticLanguageModel::Evaluate(const Hypothesis& cur_hypo, + const FFState* prev_state, + ScoreComponentCollection* accumulator) const +{ + + VERBOSE(3,"Evaluating SyntacticLanguageModel for a hypothesis" << endl); + + SyntacticLanguageModelState* tmpState = NULL; + SyntacticLanguageModelState* nextState = NULL; + + + const TargetPhrase& targetPhrase = cur_hypo.GetCurrTargetPhrase(); + + for (size_t i=0, n=targetPhrase.GetSize(); iGetString(); + + if (i==0) { + nextState = new SyntacticLanguageModelState((const SyntacticLanguageModelState*)prev_state, string); + } else { + tmpState = nextState; + nextState = new SyntacticLanguageModelState(tmpState, string); + delete tmpState; + } + + double score = nextState->getScore(); + VERBOSE(3,"SynLM evaluated a score of " << score << endl); + accumulator->Assign( this, score ); + } + + + + return nextState; + +} } diff --git a/moses/SyntacticLanguageModel.h b/moses/SyntacticLanguageModel.h index 3cd4c58e9..6e88d85c1 100644 --- a/moses/SyntacticLanguageModel.h +++ b/moses/SyntacticLanguageModel.h @@ -12,40 +12,41 @@ class XModel; // observed model namespace Moses { - template class SyntacticLanguageModelFiles; - - class SyntacticLanguageModel : public StatefulFeatureFunction { +template class SyntacticLanguageModelFiles; - public: - SyntacticLanguageModel(const std::string &line); +class SyntacticLanguageModel : public StatefulFeatureFunction +{ - ~SyntacticLanguageModel(); +public: + SyntacticLanguageModel(const std::string &line); - size_t GetNumScoreComponents() const; + ~SyntacticLanguageModel(); - const FFState* EmptyHypothesisState(const InputType &input) const; + size_t GetNumScoreComponents() const; - FFState* Evaluate(const Hypothesis& cur_hypo, - const FFState* prev_state, - ScoreComponentCollection* accumulator) const; + const FFState* EmptyHypothesisState(const InputType &input) const; - FFState* EvaluateChart(const ChartHypothesis& cur_hypo, - int featureID, - ScoreComponentCollection* accumulator) const { - throw std::runtime_error("Syntactic LM can only be used with phrase-based decoder."); - } + FFState* Evaluate(const Hypothesis& cur_hypo, + const FFState* prev_state, + ScoreComponentCollection* accumulator) const; + + FFState* EvaluateChart(const ChartHypothesis& cur_hypo, + int featureID, + ScoreComponentCollection* accumulator) const { + throw std::runtime_error("Syntactic LM can only be used with phrase-based decoder."); + } - // double perplexity(); + // double perplexity(); - private: +private: - const size_t m_NumScoreComponents; - SyntacticLanguageModelFiles* m_files; - const FactorType m_factorType; - const size_t m_beamWidth; + const size_t m_NumScoreComponents; + SyntacticLanguageModelFiles* m_files; + const FactorType m_factorType; + const size_t m_beamWidth; - }; +}; } diff --git a/moses/SyntacticLanguageModelFiles.h b/moses/SyntacticLanguageModelFiles.h index 2e12e88c6..b91c0abfe 
100644 --- a/moses/SyntacticLanguageModelFiles.h +++ b/moses/SyntacticLanguageModelFiles.h @@ -9,50 +9,55 @@ namespace Moses { -template -class SyntacticLanguageModelFiles { +template +class SyntacticLanguageModelFiles +{ - public: +public: SyntacticLanguageModelFiles(const std::vector& filePaths); ~SyntacticLanguageModelFiles(); - + MH* getHiddenModel(); MO* getObservedModel(); - private: +private: MH* hiddenModel; MO* observedModel; - + }; template - SyntacticLanguageModelFiles::SyntacticLanguageModelFiles(const std::vector& filePaths) { +SyntacticLanguageModelFiles::SyntacticLanguageModelFiles(const std::vector& filePaths) +{ this->hiddenModel = new MH(); this->observedModel = new MO(); - + //// I. LOAD MODELS... std::cerr << "Reading syntactic language model files...\n"; // For each model file... for ( int a=0, n=filePaths.size(); a>*(this->hiddenModel)>>"\0"!=NULL - || si>>*(this->observedModel)>>"\0"!=NULL - )) - std::cerr<<"\nERROR: can't parse \'"<>*(this->hiddenModel)>>"\0"!=NULL + || si>>*(this->observedModel)>>"\0"!=NULL + )) + std::cerr<<"\nERROR: can't parse \'"< template - SyntacticLanguageModelFiles::~SyntacticLanguageModelFiles() { +SyntacticLanguageModelFiles::~SyntacticLanguageModelFiles() +{ VERBOSE(3,"Destructing syntactic language model files" << std::endl); delete hiddenModel; @@ -76,15 +82,17 @@ template template - MH* SyntacticLanguageModelFiles::getHiddenModel() { - +MH* SyntacticLanguageModelFiles::getHiddenModel() +{ + return this->hiddenModel; } template - MO* SyntacticLanguageModelFiles::getObservedModel() { - +MO* SyntacticLanguageModelFiles::getObservedModel() +{ + return this->observedModel; } diff --git a/moses/SyntacticLanguageModelState.h b/moses/SyntacticLanguageModelState.h index 15828eedc..bf35616d9 100644 --- a/moses/SyntacticLanguageModelState.h +++ b/moses/SyntacticLanguageModelState.h @@ -15,8 +15,9 @@ namespace Moses { template > - class SyntacticLanguageModelState : public FFState { - public: +class SyntacticLanguageModelState : public FFState +{ +public: // Initialize an empty LM state SyntacticLanguageModelState( SyntacticLanguageModelFiles* modelData, int beamSize ); @@ -25,52 +26,53 @@ template ,pair >* randomVariableStore; - double prob; - double score; - int beamSize; - SyntacticLanguageModelFiles* modelData; - bool sentenceStart; + SafeArray1D,pair >* randomVariableStore; + double prob; + double score; + int beamSize; + SyntacticLanguageModelFiles* modelData; + bool sentenceStart; }; //////////////////////////////////////////////////////////////////////////////// - - template - void SyntacticLanguageModelState::printRV() { - cerr << "*********** BEGIN printRV() ******************" << endl; - int size=randomVariableStore->getSize(); - cerr << "randomVariableStore->getSize() == " << size << endl; +template +void SyntacticLanguageModelState::printRV() +{ - for (int depth=0; depthgetSize(); + cerr << "randomVariableStore->getSize() == " << size << endl; - - const pair *data = &(randomVariableStore->get(depth)); - std::cerr << "randomVariableStore[" << depth << "]\t" << data->first << "\tprob = " << data->second.toProb() << "\tlogProb = " << double(data->second.toInt())/100 << std::endl; + for (int depth=0; depth *data = &(randomVariableStore->get(depth)); + std::cerr << "randomVariableStore[" << depth << "]\t" << data->first << "\tprob = " << data->second.toProb() << "\tlogProb = " << double(data->second.toInt())/100 << std::endl; + + } + cerr << "*********** END printRV() ******************" << endl; + +} // Initialize an empty LM state from 
grammar files // @@ -78,7 +80,8 @@ template - SyntacticLanguageModelState::SyntacticLanguageModelState( SyntacticLanguageModelFiles* modelData, int beamSize ) { +SyntacticLanguageModelState::SyntacticLanguageModelState( SyntacticLanguageModelFiles* modelData, int beamSize ) +{ this->randomVariableStore = new SafeArray1D,pair >(); this->modelData = modelData; @@ -89,7 +92,7 @@ template StringInput(String(BEG_STATE).c_array())>>xBEG>>"\0"; cerr< //score = l.toDouble(); setScore(l.toDouble()); // MY::F_ROOT_OBS = true; - // this->modelData->getHiddenModel()->setRootObs(true); - - +// this->modelData->getHiddenModel()->setRootObs(true); + + } template - int SyntacticLanguageModelState::Compare(const FFState& other) const { +int SyntacticLanguageModelState::Compare(const FFState& other) const +{ /* - const SyntacticLanguageModelState& o = + const SyntacticLanguageModelState& o = static_cast&>(other); if (o.score > score) return 1; @@ -124,13 +128,14 @@ template else return 0; */ return 0; - } +} template - SyntacticLanguageModelState::SyntacticLanguageModelState( const SyntacticLanguageModelState* prev, std::string word ) { +SyntacticLanguageModelState::SyntacticLanguageModelState( const SyntacticLanguageModelState* prev, std::string word ) +{ - // Initialize member variables + // Initialize member variables this->randomVariableStore = new SafeArray1D,pair >(); this->modelData = prev->modelData; this->beamSize = prev->beamSize; @@ -143,13 +148,13 @@ template // Get HHMM model files MY& mH = *(modelData->getHiddenModel()); MX& mO = *(modelData->getObservedModel()); - + // Initialize HHMM - HMM hmm(mH,mO); + HMM hmm(mH,mO); int MAX_WORDS = 2; hmm.init(MAX_WORDS,this->beamSize,prev->randomVariableStore); - typename MX::RandVarType x(word.c_str()); - // cout << "Examining HHMM just after hmm.init" << endl; + typename MX::RandVarType x(word.c_str()); + // cout << "Examining HHMM just after hmm.init" << endl; // hmm.debugPrint(); @@ -158,21 +163,21 @@ template hmm.writeCurr(cout,1); cerr << "*********** END writeCurr() ******************" << endl; */ -/* - { - - int wnum=1; - list > lys = hmm.getMLSnodes(ysEND); // get mls list - for ( typename list >::iterator i=lys.begin(); i!=lys.end(); i++, wnum++ ) { // for each frame - cout << "HYPOTH " << wnum - << " " << i->getBackData() - << " " << x - << " " << i->getId() - << " (" << i->getLogProb() << ")" - << endl; // print RV val - } - } - */ + /* + { + + int wnum=1; + list > lys = hmm.getMLSnodes(ysEND); // get mls list + for ( typename list >::iterator i=lys.begin(); i!=lys.end(); i++, wnum++ ) { // for each frame + cout << "HYPOTH " << wnum + << " " << i->getBackData() + << " " << x + << " " << i->getId() + << " (" << i->getLogProb() << ")" + << endl; // print RV val + } + } + */ /* @@ -189,7 +194,7 @@ template // typename MX::RandVarType ov; // ov.set(word.c_str(),mO); // MY::WORD = ov.getW(); - //bool endOfSentence = prev->sentenceStart;//true; + //bool endOfSentence = prev->sentenceStart;//true; // std::cerr << "About to give HHMM a word of input:\t" << word << std::endl; @@ -197,27 +202,27 @@ template // cout << "Examining HHMM just after hmm.updateRanked(" << x << "," << prev->sentenceStart << ")" << endl; // hmm.debugPrint(); -/* - cerr << "*********** BEGIN writeCurr() ******************" << endl; - hmm.writeCurr(cout,0); - hmm.writeCurr(cout,1); - cerr << "*********** END writeCurr() ******************" << endl; - */ -/* -{ + /* + cerr << "*********** BEGIN writeCurr() ******************" << endl; + hmm.writeCurr(cout,0); + 
hmm.writeCurr(cout,1); + cerr << "*********** END writeCurr() ******************" << endl; + */ + /* + { - int wnum=1; - list > lys = hmm.getMLSnodes(ysEND); // get mls list - for ( typename list >::iterator i=lys.begin(); i!=lys.end(); i++, wnum++ ) { // for each frame - cout << "HYPOTH " << wnum - << " " << i->getBackData() - << " " << x - << " " << i->getId() - << " (" << i->getLogProb() << ")" - << endl; // print RV val - } - } - */ + int wnum=1; + list > lys = hmm.getMLSnodes(ysEND); // get mls list + for ( typename list >::iterator i=lys.begin(); i!=lys.end(); i++, wnum++ ) { // for each frame + cout << "HYPOTH " << wnum + << " " << i->getBackData() + << " " << x + << " " << i->getId() + << " (" << i->getLogProb() << ")" + << endl; // print RV val + } + } + */ // X ov(word.c_str()); //mH.setWord(ov); // MY::WORD = ov;//ov.getW(); @@ -226,17 +231,17 @@ template //hmm.updateRanked(ov); //mH.setRootObs(true); //MY::F_ROOT_OBS = false; - + // Get the current score - double currSum = hmm.getCurrSum(); - //VERBOSE(3,"Setting score using currSum for " << scientific << x << " = " << currSum << endl); + double currSum = hmm.getCurrSum(); + //VERBOSE(3,"Setting score using currSum for " << scientific << x << " = " << currSum << endl); setScore(currSum); - // cout << "Examining RV store just before RV init via gatherElementsInBeam" << endl; + // cout << "Examining RV store just before RV init via gatherElementsInBeam" << endl; // printRV(); // Get new hidden random variable store from HHMM hmm.gatherElementsInBeam(randomVariableStore); - // cout << "Examining RV store just after RV init via gatherElementsInBeam" << endl; + // cout << "Examining RV store just after RV init via gatherElementsInBeam" << endl; // printRV(); /* cerr << "Writing hmm.writeCurr..." 
<< endl; @@ -248,22 +253,25 @@ template template -double SyntacticLanguageModelState::getProb() const { - +double SyntacticLanguageModelState::getProb() const +{ + return prob; } template -double SyntacticLanguageModelState::getScore() const { - +double SyntacticLanguageModelState::getScore() const +{ + return score; } template - void SyntacticLanguageModelState::setScore(double score) { +void SyntacticLanguageModelState::setScore(double score) +{ + - this->prob = score; diff --git a/moses/TargetPhrase.cpp b/moses/TargetPhrase.cpp index 81b7adf44..f3cf9d1e1 100644 --- a/moses/TargetPhrase.cpp +++ b/moses/TargetPhrase.cpp @@ -38,10 +38,10 @@ using namespace std; namespace Moses { TargetPhrase::TargetPhrase( std::string out_string) -:Phrase(0), m_fullScore(0.0), m_sourcePhrase(0) -, m_alignTerm(&AlignmentInfoCollection::Instance().GetEmptyAlignmentInfo()) -, m_alignNonTerm(&AlignmentInfoCollection::Instance().GetEmptyAlignmentInfo()) -, m_lhsTarget(NULL) + :Phrase(0), m_fullScore(0.0), m_sourcePhrase(0) + , m_alignTerm(&AlignmentInfoCollection::Instance().GetEmptyAlignmentInfo()) + , m_alignNonTerm(&AlignmentInfoCollection::Instance().GetEmptyAlignmentInfo()) + , m_lhsTarget(NULL) { //ACAT @@ -50,37 +50,36 @@ TargetPhrase::TargetPhrase( std::string out_string) } TargetPhrase::TargetPhrase() -:Phrase() -, m_fullScore(0.0) -,m_sourcePhrase() -, m_alignTerm(&AlignmentInfoCollection::Instance().GetEmptyAlignmentInfo()) -, m_alignNonTerm(&AlignmentInfoCollection::Instance().GetEmptyAlignmentInfo()) -, m_lhsTarget(NULL) + :Phrase() + , m_fullScore(0.0) + ,m_sourcePhrase() + , m_alignTerm(&AlignmentInfoCollection::Instance().GetEmptyAlignmentInfo()) + , m_alignNonTerm(&AlignmentInfoCollection::Instance().GetEmptyAlignmentInfo()) + , m_lhsTarget(NULL) { } TargetPhrase::TargetPhrase(const Phrase &phrase) -: Phrase(phrase) -, m_fullScore(0.0) -, m_sourcePhrase() -, m_alignTerm(&AlignmentInfoCollection::Instance().GetEmptyAlignmentInfo()) -, m_alignNonTerm(&AlignmentInfoCollection::Instance().GetEmptyAlignmentInfo()) -, m_lhsTarget(NULL) + : Phrase(phrase) + , m_fullScore(0.0) + , m_sourcePhrase() + , m_alignTerm(&AlignmentInfoCollection::Instance().GetEmptyAlignmentInfo()) + , m_alignNonTerm(&AlignmentInfoCollection::Instance().GetEmptyAlignmentInfo()) + , m_lhsTarget(NULL) { } TargetPhrase::TargetPhrase(const TargetPhrase ©) -: Phrase(copy) -, m_fullScore(copy.m_fullScore) -, m_sourcePhrase(copy.m_sourcePhrase) -, m_alignTerm(copy.m_alignTerm) -, m_alignNonTerm(copy.m_alignNonTerm) -, m_scoreBreakdown(copy.m_scoreBreakdown) + : Phrase(copy) + , m_fullScore(copy.m_fullScore) + , m_sourcePhrase(copy.m_sourcePhrase) + , m_alignTerm(copy.m_alignTerm) + , m_alignNonTerm(copy.m_alignNonTerm) + , m_scoreBreakdown(copy.m_scoreBreakdown) { if (copy.m_lhsTarget) { m_lhsTarget = new Word(copy.m_lhsTarget); - } - else { + } else { m_lhsTarget = NULL; } @@ -125,8 +124,8 @@ void TargetPhrase::Evaluate(const InputType &input) const std::vector &ffs = FeatureFunction::GetFeatureFunctions(); for (size_t i = 0; i < ffs.size(); ++i) { - const FeatureFunction &ff = *ffs[i]; - ff.Evaluate(input, m_scoreBreakdown); + const FeatureFunction &ff = *ffs[i]; + ff.Evaluate(input, m_scoreBreakdown); } } @@ -180,7 +179,7 @@ TargetPhrase *TargetPhrase::MergeNext(const TargetPhrase &inputPhrase) const void TargetPhrase::SetAlignmentInfo(const StringPiece &alignString) { - AlignmentInfo::CollType alignTerm, alignNonTerm; + AlignmentInfo::CollType alignTerm, alignNonTerm; for (util::TokenIter token(alignString, util::AnyCharacter(" 
\t")); token; ++token) { util::TokenIter dash(*token, util::SingleCharacter('-')); @@ -194,11 +193,10 @@ void TargetPhrase::SetAlignmentInfo(const StringPiece &alignString) if (GetWord(targetPos).IsNonTerminal()) { - alignNonTerm.insert(std::pair(sourcePos, targetPos)); + alignNonTerm.insert(std::pair(sourcePos, targetPos)); + } else { + alignTerm.insert(std::pair(sourcePos, targetPos)); } - else { - alignTerm.insert(std::pair(sourcePos, targetPos)); - } } SetAlignTerm(alignTerm); SetAlignNonTerm(alignNonTerm); @@ -207,15 +205,15 @@ void TargetPhrase::SetAlignmentInfo(const StringPiece &alignString) void TargetPhrase::SetAlignTerm(const AlignmentInfo::CollType &coll) { - const AlignmentInfo *alignmentInfo = AlignmentInfoCollection::Instance().Add(coll); - m_alignTerm = alignmentInfo; + const AlignmentInfo *alignmentInfo = AlignmentInfoCollection::Instance().Add(coll); + m_alignTerm = alignmentInfo; } void TargetPhrase::SetAlignNonTerm(const AlignmentInfo::CollType &coll) { - const AlignmentInfo *alignmentInfo = AlignmentInfoCollection::Instance().Add(coll); - m_alignNonTerm = alignmentInfo; + const AlignmentInfo *alignmentInfo = AlignmentInfoCollection::Instance().Add(coll); + m_alignNonTerm = alignmentInfo; } void TargetPhrase::SetSparseScore(const FeatureFunction* translationScoreProducer, const StringPiece &sparseString) diff --git a/moses/TargetPhrase.h b/moses/TargetPhrase.h index df876a00a..bb1c7c5a7 100644 --- a/moses/TargetPhrase.h +++ b/moses/TargetPhrase.h @@ -48,10 +48,10 @@ protected: float m_fullScore; ScoreComponentCollection m_scoreBreakdown; - // in case of confusion net, ptr to source phrase - Phrase m_sourcePhrase; - const AlignmentInfo* m_alignTerm, *m_alignNonTerm; - const Word *m_lhsTarget; + // in case of confusion net, ptr to source phrase + Phrase m_sourcePhrase; + const AlignmentInfo* m_alignTerm, *m_alignNonTerm; + const Word *m_lhsTarget; public: TargetPhrase(); @@ -86,26 +86,28 @@ public: return m_fullScore; } - inline const ScoreComponentCollection &GetScoreBreakdown() const - { return m_scoreBreakdown; } - inline ScoreComponentCollection &GetScoreBreakdown() - { return m_scoreBreakdown; } + inline const ScoreComponentCollection &GetScoreBreakdown() const { + return m_scoreBreakdown; + } + inline ScoreComponentCollection &GetScoreBreakdown() { + return m_scoreBreakdown; + } //TODO: Probably shouldn't copy this, but otherwise ownership is unclear - void SetSourcePhrase(const Phrase& p) - { - m_sourcePhrase=p; - } - const Phrase& GetSourcePhrase() const - { - return m_sourcePhrase; - } - - void SetTargetLHS(const Word *lhs) - { m_lhsTarget = lhs; } - const Word &GetTargetLHS() const - { return *m_lhsTarget; } - + void SetSourcePhrase(const Phrase& p) { + m_sourcePhrase=p; + } + const Phrase& GetSourcePhrase() const { + return m_sourcePhrase; + } + + void SetTargetLHS(const Word *lhs) { + m_lhsTarget = lhs; + } + const Word &GetTargetLHS() const { + return *m_lhsTarget; + } + void SetAlignmentInfo(const StringPiece &alignString); void SetAlignTerm(const AlignmentInfo *alignTerm) { m_alignTerm = alignTerm; @@ -117,11 +119,13 @@ public: void SetAlignTerm(const AlignmentInfo::CollType &coll); void SetAlignNonTerm(const AlignmentInfo::CollType &coll); - const AlignmentInfo &GetAlignTerm() const - { return *m_alignTerm; } - const AlignmentInfo &GetAlignNonTerm() const - { return *m_alignNonTerm; } - + const AlignmentInfo &GetAlignTerm() const { + return *m_alignTerm; + } + const AlignmentInfo &GetAlignNonTerm() const { + return *m_alignNonTerm; + } + TO_STRING(); }; @@ 
-131,10 +135,8 @@ std::ostream& operator<<(std::ostream&, const TargetPhrase&); /** * Hasher that looks at source and target phrase. **/ -struct TargetPhraseHasher -{ - inline size_t operator()(const TargetPhrase& targetPhrase) const - { +struct TargetPhraseHasher { + inline size_t operator()(const TargetPhrase& targetPhrase) const { size_t seed = 0; boost::hash_combine(seed, targetPhrase); boost::hash_combine(seed, targetPhrase.GetSourcePhrase()); @@ -145,14 +147,12 @@ struct TargetPhraseHasher } }; -struct TargetPhraseComparator -{ - inline bool operator()(const TargetPhrase& lhs, const TargetPhrase& rhs) const - { +struct TargetPhraseComparator { + inline bool operator()(const TargetPhrase& lhs, const TargetPhrase& rhs) const { return lhs.Compare(rhs) == 0 && - lhs.GetSourcePhrase().Compare(rhs.GetSourcePhrase()) == 0 && - lhs.GetAlignTerm() == rhs.GetAlignTerm() && - lhs.GetAlignNonTerm() == rhs.GetAlignNonTerm(); + lhs.GetSourcePhrase().Compare(rhs.GetSourcePhrase()) == 0 && + lhs.GetAlignTerm() == rhs.GetAlignTerm() && + lhs.GetAlignNonTerm() == rhs.GetAlignNonTerm(); } }; diff --git a/moses/TargetPhraseCollection.cpp b/moses/TargetPhraseCollection.cpp index 78b63d852..88ce28eb6 100644 --- a/moses/TargetPhraseCollection.cpp +++ b/moses/TargetPhraseCollection.cpp @@ -59,8 +59,8 @@ void TargetPhraseCollection::Sort(bool adhereTableLimit, size_t tableLimit) { std::vector::iterator iterMiddle; iterMiddle = (tableLimit == 0 || m_collection.size() < tableLimit) - ? m_collection.end() - : m_collection.begin()+tableLimit; + ? m_collection.end() + : m_collection.begin()+tableLimit; std::partial_sort(m_collection.begin(), iterMiddle, m_collection.end(), CompareTargetPhrase()); diff --git a/moses/TargetPhraseCollection.h b/moses/TargetPhraseCollection.h index 4efb911fb..4207bccef 100644 --- a/moses/TargetPhraseCollection.h +++ b/moses/TargetPhraseCollection.h @@ -60,7 +60,9 @@ public: RemoveAllInColl(m_collection); } - const std::vector &GetCollection() const { return m_collection; } + const std::vector &GetCollection() const { + return m_collection; + } //! divide collection into 2 buckets using std::nth_element, the top & bottom according to table limit void NthElement(size_t tableLimit); diff --git a/moses/Terminal.h b/moses/Terminal.h index 6247d0b6c..e7d18676e 100644 --- a/moses/Terminal.h +++ b/moses/Terminal.h @@ -1,17 +1,17 @@ /*********************************************************************** Moses - statistical machine translation system Copyright (C) 2006-2012 University of Edinburgh - + This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. - + This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. - + You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA @@ -29,7 +29,7 @@ namespace Moses class TerminalHasher { - public: +public: // Generate a hash value for a word representing a terminal. It's // assumed that the same subset of factors will be active for all words // that are hashed. 
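// A minimal stand-alone sketch of the tableLimit + std::partial_sort idiom
// from the TargetPhraseCollection::Sort hunk above: only the top tableLimit
// entries are put in order and the tail is left unsorted. Plain floats stand
// in for scored TargetPhrase pointers; the values are made up for
// illustration.
#include <algorithm>
#include <functional>
#include <iostream>
#include <vector>

int main()
{
  std::vector<float> scores;
  scores.push_back(0.2f);
  scores.push_back(0.9f);
  scores.push_back(0.5f);
  scores.push_back(0.7f);

  const size_t tableLimit = 2;
  std::vector<float>::iterator iterMiddle =
    (tableLimit == 0 || scores.size() < tableLimit)
    ? scores.end()
    : scores.begin() + tableLimit;

  // sort only the first tableLimit entries, best first; cheaper than a
  // full sort when the collection is much larger than the limit
  std::partial_sort(scores.begin(), iterMiddle, scores.end(),
                    std::greater<float>());

  for (size_t i = 0; i < tableLimit; ++i)
    std::cout << scores[i] << std::endl; // 0.9 then 0.7
  return 0;
}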
@@ -47,7 +47,7 @@ class TerminalHasher class TerminalEqualityPred { - public: +public: // Equality predicate for comparing words representing terminals. As // with the hasher, it's assumed that all words will have the same // subset of active factors. diff --git a/moses/ThreadPool.h b/moses/ThreadPool.h index fad236a98..bf981a2da 100644 --- a/moses/ThreadPool.h +++ b/moses/ThreadPool.h @@ -42,7 +42,8 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA /** * Classes to implement a ThreadPool. **/ -namespace Moses { +namespace Moses +{ /** A task to be executed by the ThreadPool */ @@ -50,7 +51,9 @@ class Task { public: virtual void Run() = 0; - virtual bool DeleteAfterExecution() { return true; } + virtual bool DeleteAfterExecution() { + return true; + } virtual ~Task() {} }; @@ -58,7 +61,7 @@ public: class ThreadPool { - public: +public: /** * Construct a thread pool of a fixed size. **/ @@ -82,7 +85,9 @@ class ThreadPool /** * Set maximum number of queued threads (otherwise Submit blocks) **/ - void SetQueueLimit( size_t limit ) { m_queueLimit = limit; } + void SetQueueLimit( size_t limit ) { + m_queueLimit = limit; + } private: /** @@ -109,7 +114,7 @@ public: #ifdef BOOST_HAS_PTHREADS pthread_t tid = pthread_self(); #else - typedef void * pthread_t; + typedef void * pthread_t; pthread_t tid = 0; #endif std::cerr << "Executing " << m_id << " in thread id " << tid << std::endl; diff --git a/moses/Timer.h b/moses/Timer.h index deefa4a71..a6bd0e91a 100644 --- a/moses/Timer.h +++ b/moses/Timer.h @@ -33,7 +33,7 @@ public: * using 'start' or 'restart' */ Timer() : running(false) { - start_time = 0; + start_time = 0; } void start(const char* msg = 0); diff --git a/moses/TranslationModel/BilingualDynSuffixArray.cpp b/moses/TranslationModel/BilingualDynSuffixArray.cpp index 824529b91..a47473de5 100644 --- a/moses/TranslationModel/BilingualDynSuffixArray.cpp +++ b/moses/TranslationModel/BilingualDynSuffixArray.cpp @@ -7,495 +7,494 @@ using namespace std; -namespace Moses { +namespace Moses +{ BilingualDynSuffixArray::BilingualDynSuffixArray(): - m_maxPhraseLength(StaticData::Instance().GetMaxPhraseLength()), - m_maxSampleSize(20) -{ - m_srcSA = 0; - m_trgSA = 0; - m_srcCorpus = new std::vector(); - m_trgCorpus = new std::vector(); - m_srcVocab = new Vocab(false); - m_trgVocab = new Vocab(false); - m_scoreCmp = 0; + m_maxPhraseLength(StaticData::Instance().GetMaxPhraseLength()), + m_maxSampleSize(20) +{ + m_srcSA = 0; + m_trgSA = 0; + m_srcCorpus = new std::vector(); + m_trgCorpus = new std::vector(); + m_srcVocab = new Vocab(false); + m_trgVocab = new Vocab(false); + m_scoreCmp = 0; } -BilingualDynSuffixArray::~BilingualDynSuffixArray() +BilingualDynSuffixArray::~BilingualDynSuffixArray() { - if(m_srcSA) delete m_srcSA; - if(m_trgSA) delete m_trgSA; - if(m_srcVocab) delete m_srcVocab; - if(m_trgVocab) delete m_trgVocab; - if(m_srcCorpus) delete m_srcCorpus; - if(m_trgCorpus) delete m_trgCorpus; - if(m_scoreCmp) delete m_scoreCmp; + if(m_srcSA) delete m_srcSA; + if(m_trgSA) delete m_trgSA; + if(m_srcVocab) delete m_srcVocab; + if(m_trgVocab) delete m_trgVocab; + if(m_srcCorpus) delete m_srcCorpus; + if(m_trgCorpus) delete m_trgCorpus; + if(m_scoreCmp) delete m_scoreCmp; } bool BilingualDynSuffixArray::Load( - const std::vector& inputFactors, - const std::vector& outputFactors, - std::string source, std::string target, std::string alignments, - const std::vector &weight) + const std::vector& inputFactors, + const std::vector& outputFactors, + std::string source, 
std::string target, std::string alignments, + const std::vector &weight) { m_inputFactors = inputFactors; m_outputFactors = outputFactors; - m_scoreCmp = new ScoresComp(weight); - InputFileStream sourceStrme(source); - InputFileStream targetStrme(target); - cerr << "Loading source corpus...\n"; - LoadCorpus(Input, sourceStrme, m_inputFactors, *m_srcCorpus, m_srcSntBreaks, m_srcVocab); - cerr << "Loading target corpus...\n"; - LoadCorpus(Output, targetStrme, m_outputFactors,*m_trgCorpus, m_trgSntBreaks, m_trgVocab); - CHECK(m_srcSntBreaks.size() == m_trgSntBreaks.size()); - - // build suffix arrays and auxilliary arrays - cerr << "Building Source Suffix Array...\n"; - m_srcSA = new DynSuffixArray(m_srcCorpus); - if(!m_srcSA) return false; - cerr << "Building Target Suffix Array...\n"; - //m_trgSA = new DynSuffixArray(m_trgCorpus); - //if(!m_trgSA) return false; - cerr << "\t(Skipped. Not used)\n"; - - InputFileStream alignStrme(alignments); - cerr << "Loading Alignment File...\n"; - LoadRawAlignments(alignStrme); - //LoadAlignments(alignStrme); - cerr << "Building frequent word cache...\n"; - CacheFreqWords(); - return true; -} - -bool BilingualDynSuffixArray::LoadTM( - const std::vector& inputFactors, - const std::vector& outputFactors, - std::string source, std::string target, std::string alignments, - const std::vector &weight) -{ - m_inputFactors = inputFactors; - m_outputFactors = outputFactors; - m_scoreCmp = new ScoresComp(weight); InputFileStream sourceStrme(source); InputFileStream targetStrme(target); - - cerr << "Loading target corpus...\n"; - LoadCorpus(Output, targetStrme, m_outputFactors,*m_trgCorpus, m_trgSntBreaks, m_trgVocab); - - cerr << "Loading source corpus...\n"; + cerr << "Loading source corpus...\n"; LoadCorpus(Input, sourceStrme, m_inputFactors, *m_srcCorpus, m_srcSntBreaks, m_srcVocab); - + cerr << "Loading target corpus...\n"; + LoadCorpus(Output, targetStrme, m_outputFactors,*m_trgCorpus, m_trgSntBreaks, m_trgVocab); CHECK(m_srcSntBreaks.size() == m_trgSntBreaks.size()); - + // build suffix arrays and auxiliary arrays - cerr << "Building Source Suffix Array...\n"; - m_srcSA = new DynSuffixArray(m_srcCorpus); + cerr << "Building Source Suffix Array...\n"; + m_srcSA = new DynSuffixArray(m_srcCorpus); if(!m_srcSA) return false; - cerr << "Building Target Suffix Array...\n"; - //m_trgSA = new DynSuffixArray(m_trgCorpus); + cerr << "Building Target Suffix Array...\n"; + //m_trgSA = new DynSuffixArray(m_trgCorpus); //if(!m_trgSA) return false; cerr << "\t(Skipped.
Not used)\n"; - + InputFileStream alignStrme(alignments); - cerr << "Loading Alignment File...\n"; + cerr << "Loading Alignment File...\n"; LoadRawAlignments(alignStrme); //LoadAlignments(alignStrme); cerr << "Building frequent word cache...\n"; CacheFreqWords(); return true; - } -int BilingualDynSuffixArray::LoadRawAlignments(InputFileStream& align) +bool BilingualDynSuffixArray::LoadTM( + const std::vector& inputFactors, + const std::vector& outputFactors, + std::string source, std::string target, std::string alignments, + const std::vector &weight) { - // stores the alignments in the raw file format - std::string line; - std::vector vtmp; + m_inputFactors = inputFactors; + m_outputFactors = outputFactors; + + m_scoreCmp = new ScoresComp(weight); + InputFileStream sourceStrme(source); + InputFileStream targetStrme(target); + + cerr << "Loading target corpus...\n"; + LoadCorpus(Output, targetStrme, m_outputFactors,*m_trgCorpus, m_trgSntBreaks, m_trgVocab); + + cerr << "Loading source corpus...\n"; + LoadCorpus(Input, sourceStrme, m_inputFactors, *m_srcCorpus, m_srcSntBreaks, m_srcVocab); + + CHECK(m_srcSntBreaks.size() == m_trgSntBreaks.size()); + + // build suffix arrays and auxilliary arrays + cerr << "Building Source Suffix Array...\n"; + m_srcSA = new DynSuffixArray(m_srcCorpus); + if(!m_srcSA) return false; + cerr << "Building Target Suffix Array...\n"; + //m_trgSA = new DynSuffixArray(m_trgCorpus); + //if(!m_trgSA) return false; + cerr << "\t(Skipped. Not used)\n"; + + InputFileStream alignStrme(alignments); + cerr << "Loading Alignment File...\n"; + LoadRawAlignments(alignStrme); + //LoadAlignments(alignStrme); + cerr << "Building frequent word cache...\n"; + CacheFreqWords(); + return true; + +} + +int BilingualDynSuffixArray::LoadRawAlignments(InputFileStream& align) +{ + // stores the alignments in the raw file format + std::string line; + std::vector vtmp; int lineNum = 1; - while(getline(align, line)) { + while(getline(align, line)) { if (lineNum % 10000 == 0) cerr << lineNum; - Utils::splitToInt(line, vtmp, "- "); - CHECK(vtmp.size() % 2 == 0); - std::vector vAlgn; // store as short ints for memory - for (std::vector::const_iterator itr = vtmp.begin(); - itr != vtmp.end(); ++itr) { - vAlgn.push_back(short(*itr)); - } - m_rawAlignments.push_back(vAlgn); + Utils::splitToInt(line, vtmp, "- "); + CHECK(vtmp.size() % 2 == 0); + std::vector vAlgn; // store as short ints for memory + for (std::vector::const_iterator itr = vtmp.begin(); + itr != vtmp.end(); ++itr) { + vAlgn.push_back(short(*itr)); + } + m_rawAlignments.push_back(vAlgn); ++lineNum; - } - return m_rawAlignments.size(); + } + return m_rawAlignments.size(); } -int BilingualDynSuffixArray::LoadRawAlignments(string& align) { - // stores the alignments in the raw file format +int BilingualDynSuffixArray::LoadRawAlignments(string& align) +{ + // stores the alignments in the raw file format vector vtmp; Utils::splitToInt(align, vtmp, "- "); CHECK(vtmp.size() % 2 == 0); vector vAlgn; // store as short ints for memory for (std::vector::const_iterator itr = vtmp.begin(); - itr != vtmp.end(); ++itr) { - vAlgn.push_back(short(*itr)); + itr != vtmp.end(); ++itr) { + vAlgn.push_back(short(*itr)); } m_rawAlignments.push_back(vAlgn); return m_rawAlignments.size(); } -int BilingualDynSuffixArray::LoadAlignments(InputFileStream& align) +int BilingualDynSuffixArray::LoadAlignments(InputFileStream& align) { - std::string line; - std::vector vtmp; - int sntIndex(0); - - while(getline(align, line)) { - Utils::splitToInt(line, vtmp, "- "); - 
CHECK(vtmp.size() % 2 == 0); - - int sourceSize = GetSourceSentenceSize(sntIndex); - int targetSize = GetTargetSentenceSize(sntIndex); + std::string line; + std::vector vtmp; + int sntIndex(0); - SentenceAlignment curSnt(sntIndex, sourceSize, targetSize); // initialize empty sentence - for(int i=0; i < (int)vtmp.size(); i+=2) { - int sourcePos = vtmp[i]; - int targetPos = vtmp[i+1]; - CHECK(sourcePos < sourceSize); - CHECK(targetPos < targetSize); - - curSnt.alignedList[sourcePos].push_back(targetPos); // list of target nodes for each source word - curSnt.numberAligned[targetPos]++; // cnt of how many source words connect to this target word - } - curSnt.srcSnt = m_srcCorpus + sntIndex; // point source and target sentence - curSnt.trgSnt = m_trgCorpus + sntIndex; - m_alignments.push_back(curSnt); - - sntIndex++; - } - return m_alignments.size(); + while(getline(align, line)) { + Utils::splitToInt(line, vtmp, "- "); + CHECK(vtmp.size() % 2 == 0); + + int sourceSize = GetSourceSentenceSize(sntIndex); + int targetSize = GetTargetSentenceSize(sntIndex); + + SentenceAlignment curSnt(sntIndex, sourceSize, targetSize); // initialize empty sentence + for(int i=0; i < (int)vtmp.size(); i+=2) { + int sourcePos = vtmp[i]; + int targetPos = vtmp[i+1]; + CHECK(sourcePos < sourceSize); + CHECK(targetPos < targetSize); + + curSnt.alignedList[sourcePos].push_back(targetPos); // list of target nodes for each source word + curSnt.numberAligned[targetPos]++; // cnt of how many source words connect to this target word + } + curSnt.srcSnt = m_srcCorpus + sntIndex; // point source and target sentence + curSnt.trgSnt = m_trgCorpus + sntIndex; + m_alignments.push_back(curSnt); + + sntIndex++; + } + return m_alignments.size(); } -SentenceAlignment BilingualDynSuffixArray::GetSentenceAlignment(const int sntIndex, bool trg2Src) const +SentenceAlignment BilingualDynSuffixArray::GetSentenceAlignment(const int sntIndex, bool trg2Src) const { - // retrieves the alignments in the format used by SentenceAlignment.Extract() - int sntGiven = trg2Src ? GetTargetSentenceSize(sntIndex) : GetSourceSentenceSize(sntIndex); - int sntExtract = trg2Src ? GetSourceSentenceSize(sntIndex) : GetTargetSentenceSize(sntIndex); - std::vector alignment = m_rawAlignments.at(sntIndex); - SentenceAlignment curSnt(sntIndex, sntGiven, sntExtract); // initialize empty sentence - for(size_t i=0; i < alignment.size(); i+=2) { - int sourcePos = alignment[i]; - int targetPos = alignment[i+1]; - if(trg2Src) { - curSnt.alignedList[targetPos].push_back(sourcePos); // list of target nodes for each source word - curSnt.numberAligned[sourcePos]++; // cnt of how many source words connect to this target word - } - else { - curSnt.alignedList[sourcePos].push_back(targetPos); // list of target nodes for each source word - curSnt.numberAligned[targetPos]++; // cnt of how many source words connect to this target word - } - } - curSnt.srcSnt = m_srcCorpus + sntIndex; // point source and target sentence - curSnt.trgSnt = m_trgCorpus + sntIndex; - - return curSnt; + // retrieves the alignments in the format used by SentenceAlignment.Extract() + int sntGiven = trg2Src ? GetTargetSentenceSize(sntIndex) : GetSourceSentenceSize(sntIndex); + int sntExtract = trg2Src ? 
GetSourceSentenceSize(sntIndex) : GetTargetSentenceSize(sntIndex); + std::vector alignment = m_rawAlignments.at(sntIndex); + SentenceAlignment curSnt(sntIndex, sntGiven, sntExtract); // initialize empty sentence + for(size_t i=0; i < alignment.size(); i+=2) { + int sourcePos = alignment[i]; + int targetPos = alignment[i+1]; + if(trg2Src) { + curSnt.alignedList[targetPos].push_back(sourcePos); // list of target nodes for each source word + curSnt.numberAligned[sourcePos]++; // cnt of how many source words connect to this target word + } else { + curSnt.alignedList[sourcePos].push_back(targetPos); // list of target nodes for each source word + curSnt.numberAligned[targetPos]++; // cnt of how many source words connect to this target word + } + } + curSnt.srcSnt = m_srcCorpus + sntIndex; // point source and target sentence + curSnt.trgSnt = m_trgCorpus + sntIndex; + + return curSnt; } -bool BilingualDynSuffixArray::ExtractPhrases(const int& sntIndex, const int& wordIndex, - const int& sourceSize, std::vector& phrasePairs, bool trg2Src) const +bool BilingualDynSuffixArray::ExtractPhrases(const int& sntIndex, const int& wordIndex, + const int& sourceSize, std::vector& phrasePairs, bool trg2Src) const { - /* ExtractPhrases() can extract the matching phrases for both directions by using the trg2Src - * parameter */ - SentenceAlignment curSnt = GetSentenceAlignment(sntIndex, trg2Src); - // get span of phrase in source sentence - int beginSentence = m_srcSntBreaks[sntIndex]; - int rightIdx = wordIndex - beginSentence - ,leftIdx = rightIdx - sourceSize + 1; - return curSnt.Extract(m_maxPhraseLength, phrasePairs, leftIdx, rightIdx); // extract all phrase Alignments in sentence + /* ExtractPhrases() can extract the matching phrases for both directions by using the trg2Src + * parameter */ + SentenceAlignment curSnt = GetSentenceAlignment(sntIndex, trg2Src); + // get span of phrase in source sentence + int beginSentence = m_srcSntBreaks[sntIndex]; + int rightIdx = wordIndex - beginSentence + ,leftIdx = rightIdx - sourceSize + 1; + return curSnt.Extract(m_maxPhraseLength, phrasePairs, leftIdx, rightIdx); // extract all phrase Alignments in sentence } int BilingualDynSuffixArray::LoadCorpus(FactorDirection direction, InputFileStream& corpus, const FactorList& factors, - std::vector& cArray, std::vector& sntArray, - Vocab* vocab) + std::vector& cArray, std::vector& sntArray, + Vocab* vocab) { - std::string line, word; - int sntIdx(0); + std::string line, word; + int sntIdx(0); // corpus.seekg(0); Seems needless -> commented out to allow loading of gzipped corpora (gzfilebuf doesn't support seeking). 
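// A minimal stand-alone sketch of the corpus layout built by the surrounding
// LoadCorpus: the whole corpus becomes one flat array of vocabulary ids, and
// sntBreaks records the offset where each sentence starts, so sentence k
// occupies [sntBreaks[k], sntBreaks[k+1]). The toy std::map vocabulary and
// all names here are illustrative stand-ins, not the Moses Vocab class.
#include <iostream>
#include <map>
#include <sstream>
#include <string>
#include <vector>

int main()
{
  const char *lines[] = { "the green house", "the house" };
  std::map<std::string, int> vocab;
  std::vector<int> corpus, sntBreaks;

  for (int snt = 0; snt < 2; ++snt) {
    sntBreaks.push_back(static_cast<int>(corpus.size())); // sentence offset
    std::istringstream words(lines[snt]);
    std::string word;
    while (words >> word) {
      std::map<std::string, int>::iterator it = vocab.find(word);
      int id;
      if (it == vocab.end()) {
        id = static_cast<int>(vocab.size()); // next free id
        vocab[word] = id;
      } else {
        id = it->second;
      }
      corpus.push_back(id); // store the word as its vocabulary id
    }
  }

  for (size_t i = 0; i < corpus.size(); ++i)
    std::cout << corpus[i] << " ";           // 0 1 2 0 2
  std::cout << "| breaks:";
  for (size_t i = 0; i < sntBreaks.size(); ++i)
    std::cout << " " << sntBreaks[i];        // 0 3
  std::cout << std::endl;
  return 0;
}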
- const std::string& factorDelimiter = StaticData::Instance().GetFactorDelimiter(); - while(getline(corpus, line)) { - sntArray.push_back(sntIdx); - Phrase phrase(ARRAY_SIZE_INCR); - // parse phrase - phrase.CreateFromString(direction, factors, line, factorDelimiter, NULL); - // store words in vocabulary and corpus - for( size_t i = 0; i < phrase.GetSize(); ++i) { - cArray.push_back( vocab->GetWordID(phrase.GetWord(i)) ); - } - sntIdx += phrase.GetSize(); - } - //cArray.push_back(vocab->GetkOOVWordID); // signify end of corpus + const std::string& factorDelimiter = StaticData::Instance().GetFactorDelimiter(); + while(getline(corpus, line)) { + sntArray.push_back(sntIdx); + Phrase phrase(ARRAY_SIZE_INCR); + // parse phrase + phrase.CreateFromString(direction, factors, line, factorDelimiter, NULL); + // store words in vocabulary and corpus + for( size_t i = 0; i < phrase.GetSize(); ++i) { + cArray.push_back( vocab->GetWordID(phrase.GetWord(i)) ); + } + sntIdx += phrase.GetSize(); + } + //cArray.push_back(vocab->GetkOOVWordID); // signify end of corpus vocab->MakeClosed(); // avoid adding words - return cArray.size(); + return cArray.size(); } -bool BilingualDynSuffixArray::GetLocalVocabIDs(const Phrase& src, SAPhrase &output) const +bool BilingualDynSuffixArray::GetLocalVocabIDs(const Phrase& src, SAPhrase &output) const { - // looks up the SA vocab ids for the current src phrase - size_t phraseSize = src.GetSize(); - for (size_t pos = 0; pos < phraseSize; ++pos) { - const Word &word = src.GetWord(pos); - wordID_t arrayId = m_srcVocab->GetWordID(word); - if (arrayId == m_srcVocab->GetkOOVWordID()) - { // oov - return false; - } - else - { - output.SetId(pos, arrayId); - //cerr << arrayId << " "; - } - } - return true; + // looks up the SA vocab ids for the current src phrase + size_t phraseSize = src.GetSize(); + for (size_t pos = 0; pos < phraseSize; ++pos) { + const Word &word = src.GetWord(pos); + wordID_t arrayId = m_srcVocab->GetWordID(word); + if (arrayId == m_srcVocab->GetkOOVWordID()) { + // oov + return false; + } else { + output.SetId(pos, arrayId); + //cerr << arrayId << " "; + } + } + return true; } -pair BilingualDynSuffixArray::GetLexicalWeight(const PhrasePair& phrasepair) const +pair BilingualDynSuffixArray::GetLexicalWeight(const PhrasePair& phrasepair) const { - //return pair(1, 1); - float srcLexWeight(1.0), trgLexWeight(1.0); - std::map, float> targetProbs; // collect sum of target probs given source words - //const SentenceAlignment& alignment = m_alignments[phrasepair.m_sntIndex]; - const SentenceAlignment& alignment = GetSentenceAlignment(phrasepair.m_sntIndex); - std::map, pair >::const_iterator itrCache; - // for each source word - for(int srcIdx = phrasepair.m_startSource; srcIdx <= phrasepair.m_endSource; ++srcIdx) { - float srcSumPairProbs(0); - wordID_t srcWord = m_srcCorpus->at(srcIdx + m_srcSntBreaks[phrasepair.m_sntIndex]); // localIDs - const std::vector& srcWordAlignments = alignment.alignedList.at(srcIdx); + //return pair(1, 1); + float srcLexWeight(1.0), trgLexWeight(1.0); + std::map, float> targetProbs; // collect sum of target probs given source words + //const SentenceAlignment& alignment = m_alignments[phrasepair.m_sntIndex]; + const SentenceAlignment& alignment = GetSentenceAlignment(phrasepair.m_sntIndex); + std::map, pair >::const_iterator itrCache; + // for each source word + for(int srcIdx = phrasepair.m_startSource; srcIdx <= phrasepair.m_endSource; ++srcIdx) { + float srcSumPairProbs(0); + wordID_t srcWord = m_srcCorpus->at(srcIdx + 
m_srcSntBreaks[phrasepair.m_sntIndex]); // localIDs + const std::vector& srcWordAlignments = alignment.alignedList.at(srcIdx); // for each target word aligned to this source word in this alignment - if(srcWordAlignments.size() == 0) { // get p(NULL|src) - pair wordpair = make_pair(srcWord, m_srcVocab->GetkOOVWordID()); - itrCache = m_wordPairCache.find(wordpair); - if(itrCache == m_wordPairCache.end()) { // if not in cache - CacheWordProbs(srcWord); - itrCache = m_wordPairCache.find(wordpair); // search cache again - } - CHECK(itrCache != m_wordPairCache.end()); - srcSumPairProbs += itrCache->second.first; - targetProbs[wordpair] = itrCache->second.second; - } - else { // extract p(trg|src) - for(size_t i = 0; i < srcWordAlignments.size(); ++i) { // for each aligned word - int trgIdx = srcWordAlignments[i]; - wordID_t trgWord = m_trgCorpus->at(trgIdx + m_trgSntBreaks[phrasepair.m_sntIndex]); - // get probability of this source->target word pair - pair wordpair = make_pair(srcWord, trgWord); - itrCache = m_wordPairCache.find(wordpair); - if(itrCache == m_wordPairCache.end()) { // if not in cache + if(srcWordAlignments.size() == 0) { // get p(NULL|src) + pair wordpair = make_pair(srcWord, m_srcVocab->GetkOOVWordID()); + itrCache = m_wordPairCache.find(wordpair); + if(itrCache == m_wordPairCache.end()) { // if not in cache + CacheWordProbs(srcWord); + itrCache = m_wordPairCache.find(wordpair); // search cache again + } + CHECK(itrCache != m_wordPairCache.end()); + srcSumPairProbs += itrCache->second.first; + targetProbs[wordpair] = itrCache->second.second; + } else { // extract p(trg|src) + for(size_t i = 0; i < srcWordAlignments.size(); ++i) { // for each aligned word + int trgIdx = srcWordAlignments[i]; + wordID_t trgWord = m_trgCorpus->at(trgIdx + m_trgSntBreaks[phrasepair.m_sntIndex]); + // get probability of this source->target word pair + pair wordpair = make_pair(srcWord, trgWord); + itrCache = m_wordPairCache.find(wordpair); + if(itrCache == m_wordPairCache.end()) { // if not in cache CacheWordProbs(srcWord); - itrCache = m_wordPairCache.find(wordpair); // search cache again - } - CHECK(itrCache != m_wordPairCache.end()); - srcSumPairProbs += itrCache->second.first; - targetProbs[wordpair] = itrCache->second.second; - } - } - float srcNormalizer = srcWordAlignments.size() < 2 ? 1.0 : 1.0 / float(srcWordAlignments.size()); - srcLexWeight *= (srcNormalizer * srcSumPairProbs); - } // end for each source word - for(int trgIdx = phrasepair.m_startTarget; trgIdx <= phrasepair.m_endTarget; ++trgIdx) { - float trgSumPairProbs(0); - wordID_t trgWord = m_trgCorpus->at(trgIdx + m_trgSntBreaks[phrasepair.m_sntIndex]); - for (std::map, float>::const_iterator trgItr - = targetProbs.begin(); trgItr != targetProbs.end(); ++trgItr) { - if(trgItr->first.second == trgWord) - trgSumPairProbs += trgItr->second; + itrCache = m_wordPairCache.find(wordpair); // search cache again } - if(trgSumPairProbs == 0) continue; // currently don't store target-side SA - int noAligned = alignment.numberAligned.at(trgIdx); - float trgNormalizer = noAligned < 2 ? 1.0 : 1.0 / float(noAligned); - trgLexWeight *= (trgNormalizer * trgSumPairProbs); - } - // TODO::Need to get p(NULL|trg) - return pair(srcLexWeight, trgLexWeight); + CHECK(itrCache != m_wordPairCache.end()); + srcSumPairProbs += itrCache->second.first; + targetProbs[wordpair] = itrCache->second.second; + } + } + float srcNormalizer = srcWordAlignments.size() < 2 ? 
1.0 : 1.0 / float(srcWordAlignments.size()); + srcLexWeight *= (srcNormalizer * srcSumPairProbs); + } // end for each source word + for(int trgIdx = phrasepair.m_startTarget; trgIdx <= phrasepair.m_endTarget; ++trgIdx) { + float trgSumPairProbs(0); + wordID_t trgWord = m_trgCorpus->at(trgIdx + m_trgSntBreaks[phrasepair.m_sntIndex]); + for (std::map, float>::const_iterator trgItr + = targetProbs.begin(); trgItr != targetProbs.end(); ++trgItr) { + if(trgItr->first.second == trgWord) + trgSumPairProbs += trgItr->second; + } + if(trgSumPairProbs == 0) continue; // currently don't store target-side SA + int noAligned = alignment.numberAligned.at(trgIdx); + float trgNormalizer = noAligned < 2 ? 1.0 : 1.0 / float(noAligned); + trgLexWeight *= (trgNormalizer * trgSumPairProbs); + } + // TODO::Need to get p(NULL|trg) + return pair(srcLexWeight, trgLexWeight); } -void BilingualDynSuffixArray::CacheFreqWords() const { +void BilingualDynSuffixArray::CacheFreqWords() const +{ std::multimap wordCnts; // for each source word in vocab - Vocab::Word2Id::const_iterator it; + Vocab::Word2Id::const_iterator it; for(it = m_srcVocab->VocabStart(); it != m_srcVocab->VocabEnd(); ++it) { // get its frequency wordID_t srcWord = it->second; std::vector sword(1, srcWord), wrdIndices; m_srcSA->GetCorpusIndex(&sword, &wrdIndices); - if(wrdIndices.size() >= 1000) { // min count + if(wrdIndices.size() >= 1000) { // min count wordCnts.insert(make_pair(wrdIndices.size(), srcWord)); } } int numSoFar(0); - std::multimap::reverse_iterator ritr; - for(ritr = wordCnts.rbegin(); ritr != wordCnts.rend(); ++ritr) { + std::multimap::reverse_iterator ritr; + for(ritr = wordCnts.rbegin(); ritr != wordCnts.rend(); ++ritr) { m_freqWordsCached.insert(ritr->second); CacheWordProbs(ritr->second); if(++numSoFar == 50) break; // get top counts } cerr << "\tCached " << m_freqWordsCached.size() << " source words\n"; } -void BilingualDynSuffixArray::CacheWordProbs(wordID_t srcWord) const +void BilingualDynSuffixArray::CacheWordProbs(wordID_t srcWord) const { - std::map counts; - std::vector sword(1, srcWord), wrdIndices; - bool ret = m_srcSA->GetCorpusIndex(&sword, &wrdIndices); - CHECK(ret); - std::vector sntIndexes = GetSntIndexes(wrdIndices, 1, m_srcSntBreaks); - float denom(0); - // for each occurrence of this word - for(size_t snt = 0; snt < sntIndexes.size(); ++snt) { - int sntIdx = sntIndexes.at(snt); // get corpus index for sentence - CHECK(sntIdx != -1); - int srcWrdSntIdx = wrdIndices.at(snt) - m_srcSntBreaks.at(sntIdx); // get word index in sentence - const std::vector srcAlg = GetSentenceAlignment(sntIdx).alignedList.at(srcWrdSntIdx); // list of target words for this source word - if(srcAlg.size() == 0) { - ++counts[m_srcVocab->GetkOOVWordID()]; // if not alligned then align to NULL word - ++denom; - } - else { //get target words aligned to srcword in this sentence - for(size_t i=0; i < srcAlg.size(); ++i) { - wordID_t trgWord = m_trgCorpus->at(srcAlg[i] + m_trgSntBreaks[sntIdx]); - ++counts[trgWord]; - ++denom; - } - } - } - // now we've gotten counts of all target words aligned to this source word - // get probs and cache all pairs - for(std::map::const_iterator itrCnt = counts.begin(); - itrCnt != counts.end(); ++itrCnt) { - pair wordPair = make_pair(srcWord, itrCnt->first); - float srcTrgPrb = float(itrCnt->second) / float(denom); // gives p(src->trg) - float trgSrcPrb = float(itrCnt->second) / float(counts.size()); // gives p(trg->src) - m_wordPairCache[wordPair] = pair(srcTrgPrb, trgSrcPrb); - } + std::map counts; + 
std::vector<wordID_t> sword(1, srcWord), wrdIndices;
+  bool ret = m_srcSA->GetCorpusIndex(&sword, &wrdIndices);
+  CHECK(ret);
+  std::vector<int> sntIndexes = GetSntIndexes(wrdIndices, 1, m_srcSntBreaks);
+  float denom(0);
+  // for each occurrence of this word
+  for(size_t snt = 0; snt < sntIndexes.size(); ++snt) {
+    int sntIdx = sntIndexes.at(snt); // get corpus index for sentence
+    CHECK(sntIdx != -1);
+    int srcWrdSntIdx = wrdIndices.at(snt) - m_srcSntBreaks.at(sntIdx); // get word index in sentence
+    const std::vector<int> srcAlg = GetSentenceAlignment(sntIdx).alignedList.at(srcWrdSntIdx); // list of target words for this source word
+    if(srcAlg.size() == 0) {
+      ++counts[m_srcVocab->GetkOOVWordID()]; // if not aligned then align to NULL word
+      ++denom;
+    } else { //get target words aligned to srcword in this sentence
+      for(size_t i=0; i < srcAlg.size(); ++i) {
+        wordID_t trgWord = m_trgCorpus->at(srcAlg[i] + m_trgSntBreaks[sntIdx]);
+        ++counts[trgWord];
+        ++denom;
+      }
+    }
+  }
+  // now we've gotten counts of all target words aligned to this source word
+  // get probs and cache all pairs
+  for(std::map<wordID_t, int>::const_iterator itrCnt = counts.begin();
+      itrCnt != counts.end(); ++itrCnt) {
+    pair<wordID_t, wordID_t> wordPair = make_pair(srcWord, itrCnt->first);
+    float srcTrgPrb = float(itrCnt->second) / float(denom); // gives p(src->trg)
+    float trgSrcPrb = float(itrCnt->second) / float(counts.size()); // gives p(trg->src)
+    m_wordPairCache[wordPair] = pair<float, float>(srcTrgPrb, trgSrcPrb);
+  }
 }
-SAPhrase BilingualDynSuffixArray::TrgPhraseFromSntIdx(const PhrasePair& phrasepair) const
+SAPhrase BilingualDynSuffixArray::TrgPhraseFromSntIdx(const PhrasePair& phrasepair) const
 {
-  // takes sentence indexes and looks up vocab IDs
-  SAPhrase phraseIds(phrasepair.GetTargetSize());
-  int sntIndex = phrasepair.m_sntIndex;
-  int id(-1), pos(0);
-  for(int i=phrasepair.m_startTarget; i <= phrasepair.m_endTarget; ++i) { // look up trg words
-    id = m_trgCorpus->at(m_trgSntBreaks[sntIndex] + i);
-    phraseIds.SetId(pos++, id);
-  }
-  return phraseIds;
+  // takes sentence indexes and looks up vocab IDs
+  SAPhrase phraseIds(phrasepair.GetTargetSize());
+  int sntIndex = phrasepair.m_sntIndex;
+  int id(-1), pos(0);
+  for(int i=phrasepair.m_startTarget; i <= phrasepair.m_endTarget; ++i) { // look up trg words
+    id = m_trgCorpus->at(m_trgSntBreaks[sntIndex] + i);
+    phraseIds.SetId(pos++, id);
+  }
+  return phraseIds;
 }
-
+
 TargetPhrase* BilingualDynSuffixArray::GetMosesFactorIDs(const SAPhrase& phrase, const Phrase& sourcePhrase) const
 {
-  TargetPhrase* targetPhrase = new TargetPhrase();
-  for(size_t i=0; i < phrase.words.size(); ++i) { // look up trg words
-    Word& word = m_trgVocab->GetWord( phrase.words[i]);
-    CHECK(word != m_trgVocab->GetkOOVWord());
-    targetPhrase->AddWord(word);
-  }
-  targetPhrase->SetSourcePhrase(sourcePhrase);
-  // scoring
-  return targetPhrase;
+  TargetPhrase* targetPhrase = new TargetPhrase();
+  for(size_t i=0; i < phrase.words.size(); ++i) { // look up trg words
+    Word& word = m_trgVocab->GetWord( phrase.words[i]);
+    CHECK(word != m_trgVocab->GetkOOVWord());
+    targetPhrase->AddWord(word);
+  }
+  targetPhrase->SetSourcePhrase(sourcePhrase);
+  // scoring
+  return targetPhrase;
 }
-void BilingualDynSuffixArray::GetTargetPhrasesByLexicalWeight(const Phrase& src, std::vector< std::pair<Scores, TargetPhrase*> > & target) const
+void BilingualDynSuffixArray::GetTargetPhrasesByLexicalWeight(const Phrase& src, std::vector< std::pair<Scores, TargetPhrase*> > & target) const
 {
   //cerr << "phrase is \"" << src << endl;
-  size_t sourceSize = src.GetSize();
-  SAPhrase localIDs(sourceSize);
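// The cache built above feeds the lexical weighting in GetLexicalWeight():
// a minimal standalone sketch (simplified types, not the Moses members) of
// the source-side product it computes,
//   lex(f|e) = prod_i ( 1/|a(i)| * sum_{j in a(i)} p(e_j | f_i) ),
// backing off to p(NULL | f_i) for unaligned source words.
#include <vector>

typedef float (*PairProbFn)(int srcWord, int trgWord); // assumed cache lookup

float SrcLexWeight(const std::vector<int>& srcWords,
                   const std::vector<int>& trgWords,
                   const std::vector<std::vector<int> >& alignedList, // trg positions per src word
                   int nullWordId, PairProbFn p)
{
  float weight = 1.0f;
  for (size_t i = 0; i < srcWords.size(); ++i) {
    const std::vector<int>& links = alignedList[i];
    float sum = 0.0f;
    if (links.empty()) {
      sum = p(srcWords[i], nullWordId);      // unaligned: use p(NULL|src)
    } else {
      for (size_t k = 0; k < links.size(); ++k)
        sum += p(srcWords[i], trgWords[links[k]]);
      sum /= float(links.size());            // normalise by number of links
    }
    weight *= sum;
  }
  return weight;
}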
if(!GetLocalVocabIDs(src, localIDs)) return; - float totalTrgPhrases(0); - std::map phraseCounts; - //std::map phraseColl; // (one of) the word indexes this phrase was taken from - std::map > lexicalWeights; - std::map >::iterator itrLexW; - std::vector wrdIndices; - // extract sentence IDs from SA and return rightmost index of phrases - if(!m_srcSA->GetCorpusIndex(&(localIDs.words), &wrdIndices)) return; + size_t sourceSize = src.GetSize(); + SAPhrase localIDs(sourceSize); + if(!GetLocalVocabIDs(src, localIDs)) return; + float totalTrgPhrases(0); + std::map phraseCounts; + //std::map phraseColl; // (one of) the word indexes this phrase was taken from + std::map > lexicalWeights; + std::map >::iterator itrLexW; + std::vector wrdIndices; + // extract sentence IDs from SA and return rightmost index of phrases + if(!m_srcSA->GetCorpusIndex(&(localIDs.words), &wrdIndices)) return; SampleSelection(wrdIndices); - std::vector sntIndexes = GetSntIndexes(wrdIndices, sourceSize, m_srcSntBreaks); - // for each sentence with this phrase - for(size_t snt = 0; snt < sntIndexes.size(); ++snt) { - std::vector phrasePairs; // to store all phrases possible from current sentence - int sntIndex = sntIndexes.at(snt); // get corpus index for sentence - if(sntIndex == -1) continue; // bad flag set by GetSntIndexes() - ExtractPhrases(sntIndex, wrdIndices[snt], sourceSize, phrasePairs); - //cerr << "extracted " << phrasePairs.size() << endl; - totalTrgPhrases += phrasePairs.size(); // keep track of count of each extracted phrase pair - std::vector::iterator iterPhrasePair; - for (iterPhrasePair = phrasePairs.begin(); iterPhrasePair != phrasePairs.end(); ++iterPhrasePair) { - SAPhrase phrase = TrgPhraseFromSntIdx(**iterPhrasePair); - phraseCounts[phrase]++; // count each unique phrase - // NOTE::Correct but slow to extract lexical weight here. could do + std::vector sntIndexes = GetSntIndexes(wrdIndices, sourceSize, m_srcSntBreaks); + // for each sentence with this phrase + for(size_t snt = 0; snt < sntIndexes.size(); ++snt) { + std::vector phrasePairs; // to store all phrases possible from current sentence + int sntIndex = sntIndexes.at(snt); // get corpus index for sentence + if(sntIndex == -1) continue; // bad flag set by GetSntIndexes() + ExtractPhrases(sntIndex, wrdIndices[snt], sourceSize, phrasePairs); + //cerr << "extracted " << phrasePairs.size() << endl; + totalTrgPhrases += phrasePairs.size(); // keep track of count of each extracted phrase pair + std::vector::iterator iterPhrasePair; + for (iterPhrasePair = phrasePairs.begin(); iterPhrasePair != phrasePairs.end(); ++iterPhrasePair) { + SAPhrase phrase = TrgPhraseFromSntIdx(**iterPhrasePair); + phraseCounts[phrase]++; // count each unique phrase + // NOTE::Correct but slow to extract lexical weight here. could do // it later for only the top phrases chosen by phrase prob p(e|f) - pair lexWeight = GetLexicalWeight(**iterPhrasePair); // get lexical weighting for this phrase pair - itrLexW = lexicalWeights.find(phrase); // check if phrase already has lexical weight attached - if((itrLexW != lexicalWeights.end()) && (itrLexW->second.first < lexWeight.first)) - itrLexW->second = lexWeight; // if this lex weight is greater save it - else lexicalWeights[phrase] = lexWeight; // else save - } - // done with sentence. 
delete SA phrase pairs - RemoveAllInColl(phrasePairs); - } // done with all sentences - // convert to moses phrase pairs - std::map::const_iterator iterPhrases; - std::multimap phraseScores (*m_scoreCmp); - // get scores of all phrases - for(iterPhrases = phraseCounts.begin(); iterPhrases != phraseCounts.end(); ++iterPhrases) { - float trg2SrcMLE = float(iterPhrases->second) / totalTrgPhrases; - itrLexW = lexicalWeights.find(iterPhrases->first); - CHECK(itrLexW != lexicalWeights.end()); - Scores scoreVector(3); - scoreVector[0] = trg2SrcMLE; - scoreVector[1] = itrLexW->second.first; - scoreVector[2] = 2.718; // exp(1); - phraseScores.insert(make_pair(scoreVector, &iterPhrases->first)); - } - // return top scoring phrases - std::multimap::reverse_iterator ritr; - for(ritr = phraseScores.rbegin(); ritr != phraseScores.rend(); ++ritr) { - Scores scoreVector = ritr->first; - TargetPhrase *targetPhrase = GetMosesFactorIDs(*ritr->second, src); - target.push_back(make_pair( scoreVector, targetPhrase)); - if(target.size() == m_maxSampleSize) break; - } + pair lexWeight = GetLexicalWeight(**iterPhrasePair); // get lexical weighting for this phrase pair + itrLexW = lexicalWeights.find(phrase); // check if phrase already has lexical weight attached + if((itrLexW != lexicalWeights.end()) && (itrLexW->second.first < lexWeight.first)) + itrLexW->second = lexWeight; // if this lex weight is greater save it + else lexicalWeights[phrase] = lexWeight; // else save + } + // done with sentence. delete SA phrase pairs + RemoveAllInColl(phrasePairs); + } // done with all sentences + // convert to moses phrase pairs + std::map::const_iterator iterPhrases; + std::multimap phraseScores (*m_scoreCmp); + // get scores of all phrases + for(iterPhrases = phraseCounts.begin(); iterPhrases != phraseCounts.end(); ++iterPhrases) { + float trg2SrcMLE = float(iterPhrases->second) / totalTrgPhrases; + itrLexW = lexicalWeights.find(iterPhrases->first); + CHECK(itrLexW != lexicalWeights.end()); + Scores scoreVector(3); + scoreVector[0] = trg2SrcMLE; + scoreVector[1] = itrLexW->second.first; + scoreVector[2] = 2.718; // exp(1); + phraseScores.insert(make_pair(scoreVector, &iterPhrases->first)); + } + // return top scoring phrases + std::multimap::reverse_iterator ritr; + for(ritr = phraseScores.rbegin(); ritr != phraseScores.rend(); ++ritr) { + Scores scoreVector = ritr->first; + TargetPhrase *targetPhrase = GetMosesFactorIDs(*ritr->second, src); + target.push_back(make_pair( scoreVector, targetPhrase)); + if(target.size() == m_maxSampleSize) break; + } } -std::vector BilingualDynSuffixArray::GetSntIndexes(std::vector& wrdIndices, - const int sourceSize, const std::vector& sntBreaks) const +std::vector BilingualDynSuffixArray::GetSntIndexes(std::vector& wrdIndices, + const int sourceSize, const std::vector& sntBreaks) const { - std::vector::const_iterator vit; - std::vector sntIndexes; - for(size_t i=0; i < wrdIndices.size(); ++i) { - vit = std::upper_bound(sntBreaks.begin(), sntBreaks.end(), wrdIndices[i]); - int index = int(vit - sntBreaks.begin()) - 1; - // check for phrases that cross sentence boundaries - if(wrdIndices[i] - sourceSize + 1 < sntBreaks.at(index)) - sntIndexes.push_back(-1); // set bad flag - else - sntIndexes.push_back(index); // store the index of the sentence in the corpus - } - return sntIndexes; + std::vector::const_iterator vit; + std::vector sntIndexes; + for(size_t i=0; i < wrdIndices.size(); ++i) { + vit = std::upper_bound(sntBreaks.begin(), sntBreaks.end(), wrdIndices[i]); + int index = int(vit - 
sntBreaks.begin()) - 1; + // check for phrases that cross sentence boundaries + if(wrdIndices[i] - sourceSize + 1 < sntBreaks.at(index)) + sntIndexes.push_back(-1); // set bad flag + else + sntIndexes.push_back(index); // store the index of the sentence in the corpus + } + return sntIndexes; } int BilingualDynSuffixArray::SampleSelection(std::vector& sample, - int sampleSize) const + int sampleSize) const { // only use top 'sampleSize' number of samples - if(sample.size() > (size_t)sampleSize) - sample.erase(sample.begin()+sampleSize, sample.end()); - return sample.size(); + if(sample.size() > (size_t)sampleSize) + sample.erase(sample.begin()+sampleSize, sample.end()); + return sample.size(); } -void BilingualDynSuffixArray::addSntPair(string& source, string& target, string& alignment) { +void BilingualDynSuffixArray::addSntPair(string& source, string& target, string& alignment) +{ vuint_t srcFactor, trgFactor; cerr << "source, target, alignment = " << source << ", " << target << ", " << alignment << endl; - const std::string& factorDelimiter = StaticData::Instance().GetFactorDelimiter(); + const std::string& factorDelimiter = StaticData::Instance().GetFactorDelimiter(); const unsigned oldSrcCrpSize = m_srcCorpus->size(), oldTrgCrpSize = m_trgCorpus->size(); cerr << "old source corpus size = " << oldSrcCrpSize << "\told target size = " << oldTrgCrpSize << endl; Phrase sphrase(ARRAY_SIZE_INCR); @@ -511,7 +510,7 @@ void BilingualDynSuffixArray::addSntPair(string& source, string& target, string& cerr << "srcFactor[" << (srcFactor.size() - 1) << "] = " << srcFactor.back() << endl; m_srcCorpus->push_back(srcFactor.back()); // add word to corpus } - m_srcSntBreaks.push_back(oldSrcCrpSize); // former end of corpus is index of new sentence + m_srcSntBreaks.push_back(oldSrcCrpSize); // former end of corpus is index of new sentence m_srcVocab->MakeClosed(); Phrase tphrase(ARRAY_SIZE_INCR); tphrase.CreateFromString(Output, m_outputFactors, target, factorDelimiter, NULL); @@ -534,16 +533,17 @@ void BilingualDynSuffixArray::addSntPair(string& source, string& target, string& LoadRawAlignments(alignment); m_trgVocab->MakeClosed(); //for(size_t i=0; i < sphrase.GetSize(); ++i) - //ClearWordInCache(sIDs[i]); - + //ClearWordInCache(sIDs[i]); + } -void BilingualDynSuffixArray::ClearWordInCache(wordID_t srcWord) { +void BilingualDynSuffixArray::ClearWordInCache(wordID_t srcWord) +{ if(m_freqWordsCached.find(srcWord) != m_freqWordsCached.end()) return; - std::map, std::pair >::iterator it, - first, last; + std::map, std::pair >::iterator it, + first, last; for(it = m_wordPairCache.begin(); it != m_wordPairCache.end(); ++it) { - if(it->first.first == srcWord) { // all source words grouped + if(it->first.first == srcWord) { // all source words grouped first = it; // copy first entry of srcWord last = it++; while(it != m_wordPairCache.end() && (it->first.first == srcWord)) { @@ -553,80 +553,77 @@ void BilingualDynSuffixArray::ClearWordInCache(wordID_t srcWord) { m_wordPairCache.erase(first, last); } } -SentenceAlignment::SentenceAlignment(int sntIndex, int sourceSize, int targetSize) - :m_sntIndex(sntIndex) - ,numberAligned(targetSize, 0) - ,alignedList(sourceSize) +SentenceAlignment::SentenceAlignment(int sntIndex, int sourceSize, int targetSize) + :m_sntIndex(sntIndex) + ,numberAligned(targetSize, 0) + ,alignedList(sourceSize) { - for(int i=0; i < sourceSize; ++i) { - std::vector trgWrd; - alignedList[i] = trgWrd; - } + for(int i=0; i < sourceSize; ++i) { + std::vector trgWrd; + alignedList[i] = trgWrd; + } } 
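Extract(), which follows, is the standard phrase-pair extraction consistency test: project the source span onto the target side, then reject the candidate if any target word inside that projection still has alignment links to source words outside the span (the production code tracks this with per-target link counters in usedTarget). A condensed sketch of the test with explicit target-to-source links (simplified types, not the Moses classes):

#include <vector>

// trgToSrc[t] lists the source positions aligned to target position t
bool IsConsistent(const std::vector<std::vector<int> >& trgToSrc,
                  int startSource, int endSource,
                  int minTarget, int maxTarget)
{
  if (maxTarget < 0)
    return false;                          // source span has no alignment points
  for (int t = minTarget; t <= maxTarget; ++t)
    for (size_t k = 0; k < trgToSrc[t].size(); ++k)
      if (trgToSrc[t][k] < startSource || trgToSrc[t][k] > endSource)
        return false;                      // a link leaves the box
  return true;
}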
bool SentenceAlignment::Extract(int maxPhraseLength, std::vector<PhrasePair*> &ret, int startSource, int endSource) const
{
-  // foreign = target, F=T
-  // english = source, E=S
-  int countTarget = numberAligned.size();
-
-  int minTarget = 9999;
-  int maxTarget = -1;
-  std::vector< int > usedTarget = numberAligned;
-  for(int sourcePos = startSource; sourcePos <= endSource; sourcePos++)
-  {
-    for(int ind=0; ind < (int)alignedList[sourcePos].size();ind++)
-    {
-      int targetPos = alignedList[sourcePos][ind];
-      // cout << "point (" << targetPos << ", " << sourcePos << ")\n";
-      if (targetPos<minTarget) { minTarget = targetPos; }
-      if (targetPos>maxTarget) { maxTarget = targetPos; }
-      usedTarget[ targetPos ]--;
-    } // for(int ind=0;ind<(int)alignedList[sourcePos].size();ind++)
-  }
-
-  if (maxTarget >= 0 && // aligned to any foreign words at all
-      maxTarget-minTarget < maxPhraseLength)
-  { // foreign phrase within limits
-
-    // check if foreign words are aligned to out of bound english words
-    bool out_of_bounds = false;
-    for(int targetPos=minTarget; targetPos <= maxTarget && !out_of_bounds; targetPos++)
-    {
-      if (usedTarget[targetPos]>0)
-      {
-        // cout << "out of bounds: " << targetPos << "\n";
-        out_of_bounds = true;
-      }
-    }
-
-    // cout << "doing if for ( " << minTarget << "-" << maxTarget << ", " << startSource << "," << endSource << ")\n";
-    if (!out_of_bounds)
-    {
-      // start point of foreign phrase may retreat over unaligned
-      for(int startTarget = minTarget;
-          (startTarget >= 0 &&
-           startTarget > maxTarget-maxPhraseLength && // within length limit
-           (startTarget==minTarget || numberAligned[startTarget]==0)); // unaligned
-          startTarget--)
-      {
-        // end point of foreign phrase may advance over unaligned
-        for (int endTarget=maxTarget;
-             (endTarget<countTarget &&
-              endTarget<startTarget+maxPhraseLength && // within length limit
-              (endTarget==maxTarget || numberAligned[endTarget]==0)); // unaligned
-             endTarget++)
-        {
-          PhrasePair *phrasePair = new PhrasePair(startTarget, endTarget, startSource, endSource, m_sntIndex);
-          ret.push_back(phrasePair);
-        }
-      }
-    }
-  }
-  return (ret.size() > 0);
-
+  // foreign = target, F=T
+  // english = source, E=S
+  int countTarget = numberAligned.size();
+
+  int minTarget = 9999;
+  int maxTarget = -1;
+  std::vector< int > usedTarget = numberAligned;
+  for(int sourcePos = startSource; sourcePos <= endSource; sourcePos++) {
+    for(int ind=0; ind < (int)alignedList[sourcePos].size(); ind++) {
+      int targetPos = alignedList[sourcePos][ind];
+      // cout << "point (" << targetPos << ", " << sourcePos << ")\n";
+      if (targetPos<minTarget) {
+        minTarget = targetPos;
+      }
+      if (targetPos>maxTarget) {
+        maxTarget = targetPos;
+      }
+      usedTarget[ targetPos ]--;
+    } // for(int ind=0;ind<(int)alignedList[sourcePos].size();ind++)
+  }
+
+  if (maxTarget >= 0 && // aligned to any foreign words at all
+      maxTarget-minTarget < maxPhraseLength) {
+    // foreign phrase within limits
+
+    // check if foreign words are aligned to out of bound english words
+    bool out_of_bounds = false;
+    for(int targetPos=minTarget; targetPos <= maxTarget && !out_of_bounds; targetPos++) {
+      if (usedTarget[targetPos]>0) {
+        // cout << "out of bounds: " << targetPos << "\n";
+        out_of_bounds = true;
+      }
+    }
+
+    // cout << "doing if for ( " << minTarget << "-" << maxTarget << ", " << startSource << "," << endSource << ")\n";
+    if (!out_of_bounds) {
+      // start point of foreign phrase may retreat over unaligned
+      for(int startTarget = minTarget;
+          (startTarget >= 0 &&
+           startTarget > maxTarget-maxPhraseLength && // within length limit
+           (startTarget==minTarget || numberAligned[startTarget]==0)); // unaligned
+          startTarget--) {
+        // end point of foreign phrase may advance over unaligned
+        for (int endTarget=maxTarget;
+             (endTarget<countTarget &&
+              endTarget<startTarget+maxPhraseLength && // within length limit
+              (endTarget==maxTarget || numberAligned[endTarget]==0)); // unaligned
+             endTarget++) {
+          PhrasePair *phrasePair = new PhrasePair(startTarget, endTarget, startSource, endSource, m_sntIndex);
+          ret.push_back(phrasePair);
+        }
+      }
+    }
+  }
+  return (ret.size() > 0);
+
 }
}// end namespace
diff --git a/moses/TranslationModel/BilingualDynSuffixArray.h b/moses/TranslationModel/BilingualDynSuffixArray.h
index 5dda1e274..08637d095 100644
--- a/moses/TranslationModel/BilingualDynSuffixArray.h
+++ b/moses/TranslationModel/BilingualDynSuffixArray.h
@@ -1,7 +1,7 @@
 #ifndef
moses_BilingualDynSuffixArray_h #define moses_BilingualDynSuffixArray_h -#include "DynSuffixArray.h" +#include "DynSuffixArray.h" #include "moses/TranslationModel/DynSAInclude/vocab.h" #include "moses/TranslationModel/DynSAInclude/types.h" #include "moses/TranslationModel/DynSAInclude/utils.h" @@ -9,26 +9,27 @@ #include "moses/FactorTypeSet.h" #include "moses/TargetPhrase.h" -namespace Moses { +namespace Moses +{ /** @todo ask Abbey Levenberg */ class SAPhrase { public: - std::vector words; - - SAPhrase(size_t phraseSize) - :words(phraseSize) - {} - - void SetId(size_t pos, wordID_t id) - { + std::vector words; + + SAPhrase(size_t phraseSize) + :words(phraseSize) + {} + + void SetId(size_t pos, wordID_t id) { CHECK(pos < words.size()); - words[pos] = id; - } - bool operator<(const SAPhrase& phr2) const - { return words < phr2.words; } + words[pos] = id; + } + bool operator<(const SAPhrase& phr2) const { + return words < phr2.words; + } }; /** @todo ask Abbey Levenberg @@ -36,42 +37,44 @@ public: class PhrasePair { public: - int m_startTarget, m_endTarget, m_startSource, m_endSource, m_sntIndex; - PhrasePair(int startTarget, int endTarget, int startSource, int endSource, int sntIndex) - : m_startTarget(startTarget) - , m_endTarget(endTarget) - , m_startSource(startSource) - , m_endSource(endSource) - , m_sntIndex(sntIndex) - {} + int m_startTarget, m_endTarget, m_startSource, m_endSource, m_sntIndex; + PhrasePair(int startTarget, int endTarget, int startSource, int endSource, int sntIndex) + : m_startTarget(startTarget) + , m_endTarget(endTarget) + , m_startSource(startSource) + , m_endSource(endSource) + , m_sntIndex(sntIndex) + {} - size_t GetTargetSize() const - { return m_endTarget - m_startTarget + 1; } + size_t GetTargetSize() const { + return m_endTarget - m_startTarget + 1; + } }; - + /** @todo ask Abbey Levenberg */ -class SentenceAlignment +class SentenceAlignment { public: - SentenceAlignment(int sntIndex, int sourceSize, int targetSize); - int m_sntIndex; - std::vector* trgSnt; - std::vector* srcSnt; - std::vector numberAligned; - std::vector< std::vector > alignedList; - bool Extract(int maxPhraseLength, std::vector &ret, int startSource, int endSource) const; + SentenceAlignment(int sntIndex, int sourceSize, int targetSize); + int m_sntIndex; + std::vector* trgSnt; + std::vector* srcSnt; + std::vector numberAligned; + std::vector< std::vector > alignedList; + bool Extract(int maxPhraseLength, std::vector &ret, int startSource, int endSource) const; }; -class ScoresComp { -public: +class ScoresComp +{ +public: ScoresComp(const std::vector& weights): m_weights(weights) {} - bool operator()(const Scores& s1, const Scores& s2) const { + bool operator()(const Scores& s1, const Scores& s2) const { return s1[0] < s2[0]; // just p(e|f) as approximation /*float score1(0), score2(0); int idx1(0), idx2(0); - for (Scores::const_iterator itr = s1.begin(); + for (Scores::const_iterator itr = s1.begin(); itr != s1.end(); ++itr) { - score1 += log(*itr * m_weights.at(idx1++)); + score1 += log(*itr * m_weights.at(idx1++)); } for (Scores::const_iterator itr = s2.begin(); itr != s2.end(); ++itr) { @@ -79,78 +82,77 @@ public: } return score1 < score2;*/ } -private: +private: const std::vector& m_weights; }; - + /** @todo ask Abbey Levenberg */ -class BilingualDynSuffixArray { -public: - BilingualDynSuffixArray(); - ~BilingualDynSuffixArray(); - bool Load( const std::vector& inputFactors, - const std::vector& outputTactors, - std::string source, std::string target, std::string alignments, - const 
std::vector &weight); - bool LoadTM( const std::vector& inputFactors, - const std::vector& outputTactors, - std::string source, std::string target, std::string alignments, - const std::vector &weight); - void GetTargetPhrasesByLexicalWeight(const Phrase& src, std::vector< std::pair >& target) const; +class BilingualDynSuffixArray +{ +public: + BilingualDynSuffixArray(); + ~BilingualDynSuffixArray(); + bool Load( const std::vector& inputFactors, + const std::vector& outputTactors, + std::string source, std::string target, std::string alignments, + const std::vector &weight); + bool LoadTM( const std::vector& inputFactors, + const std::vector& outputTactors, + std::string source, std::string target, std::string alignments, + const std::vector &weight); + void GetTargetPhrasesByLexicalWeight(const Phrase& src, std::vector< std::pair >& target) const; void addSntPair(string& source, string& target, string& alignment); private: - DynSuffixArray* m_srcSA; - DynSuffixArray* m_trgSA; - std::vector* m_srcCorpus; - std::vector* m_trgCorpus; + DynSuffixArray* m_srcSA; + DynSuffixArray* m_trgSA; + std::vector* m_srcCorpus; + std::vector* m_trgCorpus; std::vector m_inputFactors; std::vector m_outputFactors; - std::vector m_srcSntBreaks, m_trgSntBreaks; + std::vector m_srcSntBreaks, m_trgSntBreaks; - Vocab* m_srcVocab, *m_trgVocab; - ScoresComp* m_scoreCmp; + Vocab* m_srcVocab, *m_trgVocab; + ScoresComp* m_scoreCmp; - std::vector m_alignments; - std::vector > m_rawAlignments; + std::vector m_alignments; + std::vector > m_rawAlignments; - mutable std::map, std::pair > m_wordPairCache; + mutable std::map, std::pair > m_wordPairCache; mutable std::set m_freqWordsCached; - const size_t m_maxPhraseLength, m_maxSampleSize; + const size_t m_maxPhraseLength, m_maxSampleSize; - int LoadCorpus(FactorDirection direction, InputFileStream&, const std::vector& factors, - std::vector&, std::vector&, - Vocab*); - int LoadAlignments(InputFileStream& aligs); - int LoadRawAlignments(InputFileStream& aligs); - int LoadRawAlignments(string& aligs); + int LoadCorpus(FactorDirection direction, InputFileStream&, const std::vector& factors, + std::vector&, std::vector&, + Vocab*); + int LoadAlignments(InputFileStream& aligs); + int LoadRawAlignments(InputFileStream& aligs); + int LoadRawAlignments(string& aligs); - bool ExtractPhrases(const int&, const int&, const int&, std::vector&, bool=false) const; - SentenceAlignment GetSentenceAlignment(const int, bool=false) const; - int SampleSelection(std::vector&, int = 300) const; + bool ExtractPhrases(const int&, const int&, const int&, std::vector&, bool=false) const; + SentenceAlignment GetSentenceAlignment(const int, bool=false) const; + int SampleSelection(std::vector&, int = 300) const; - std::vector GetSntIndexes(std::vector&, int, const std::vector&) const; - TargetPhrase* GetMosesFactorIDs(const SAPhrase&, const Phrase& sourcePhrase) const; - SAPhrase TrgPhraseFromSntIdx(const PhrasePair&) const; - bool GetLocalVocabIDs(const Phrase&, SAPhrase &) const; - void CacheWordProbs(wordID_t) const; + std::vector GetSntIndexes(std::vector&, int, const std::vector&) const; + TargetPhrase* GetMosesFactorIDs(const SAPhrase&, const Phrase& sourcePhrase) const; + SAPhrase TrgPhraseFromSntIdx(const PhrasePair&) const; + bool GetLocalVocabIDs(const Phrase&, SAPhrase &) const; + void CacheWordProbs(wordID_t) const; void CacheFreqWords() const; void ClearWordInCache(wordID_t); - std::pair GetLexicalWeight(const PhrasePair&) const; + std::pair GetLexicalWeight(const PhrasePair&) const; - int 
GetSourceSentenceSize(size_t sentenceId) const - { - return (sentenceId==m_srcSntBreaks.size()-1) ? - m_srcCorpus->size() - m_srcSntBreaks.at(sentenceId) : - m_srcSntBreaks.at(sentenceId+1) - m_srcSntBreaks.at(sentenceId); - } - int GetTargetSentenceSize(size_t sentenceId) const - { - return (sentenceId==m_trgSntBreaks.size()-1) ? - m_trgCorpus->size() - m_trgSntBreaks.at(sentenceId) : - m_trgSntBreaks.at(sentenceId+1) - m_trgSntBreaks.at(sentenceId); - } + int GetSourceSentenceSize(size_t sentenceId) const { + return (sentenceId==m_srcSntBreaks.size()-1) ? + m_srcCorpus->size() - m_srcSntBreaks.at(sentenceId) : + m_srcSntBreaks.at(sentenceId+1) - m_srcSntBreaks.at(sentenceId); + } + int GetTargetSentenceSize(size_t sentenceId) const { + return (sentenceId==m_trgSntBreaks.size()-1) ? + m_trgCorpus->size() - m_trgSntBreaks.at(sentenceId) : + m_trgSntBreaks.at(sentenceId+1) - m_trgSntBreaks.at(sentenceId); + } }; } // end namespace #endif diff --git a/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerCYKPlus.cpp b/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerCYKPlus.cpp index 4f0f6c2cd..e8d2f734a 100644 --- a/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerCYKPlus.cpp +++ b/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerCYKPlus.cpp @@ -1,17 +1,17 @@ /*********************************************************************** Moses - statistical machine translation system Copyright (C) 2006-2012 University of Edinburgh - + This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. - + This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. - + You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA diff --git a/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerCYKPlus.h b/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerCYKPlus.h index 61982299f..3c3f468c2 100644 --- a/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerCYKPlus.h +++ b/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerCYKPlus.h @@ -1,17 +1,17 @@ /*********************************************************************** Moses - statistical machine translation system Copyright (C) 2006-2012 University of Edinburgh - + This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. - + This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
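The GetSourceSentenceSize/GetTargetSentenceSize helpers above and GetSntIndexes() earlier rest on the same representation: m_srcSntBreaks / m_trgSntBreaks hold, per sentence, the corpus offset of its first word. Locating the sentence containing a corpus position is then a binary search, and a sentence's length is the distance to the next break. A self-contained sketch (plain unsigned offsets, not the Moses members):

#include <algorithm>
#include <vector>

// index of the sentence containing corpus position w
int SentenceOf(const std::vector<unsigned>& breaks, unsigned w)
{
  return int(std::upper_bound(breaks.begin(), breaks.end(), w) - breaks.begin()) - 1;
}

// number of words in sentence s, given the total corpus size
int SentenceSize(const std::vector<unsigned>& breaks, size_t s, size_t corpusSize)
{
  return (s == breaks.size() - 1) ? int(corpusSize - breaks[s])
                                  : int(breaks[s + 1] - breaks[s]);
}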
- + You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA @@ -35,12 +35,12 @@ class WordsRange; */ class ChartRuleLookupManagerCYKPlus : public ChartRuleLookupManager { - public: +public: ChartRuleLookupManagerCYKPlus(const InputType &sentence, const ChartCellCollectionBase &cellColl) : ChartRuleLookupManager(sentence, cellColl) {} - protected: +protected: void AddCompletedRule( const DottedRule &dottedRule, const TargetPhraseCollection &tpc, diff --git a/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemory.cpp b/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemory.cpp index ce6a1d30d..c0c1986f4 100644 --- a/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemory.cpp +++ b/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemory.cpp @@ -75,19 +75,19 @@ void ChartRuleLookupManagerMemory::GetChartRuleCollection( // get list of all rules that apply to spans at same starting position DottedRuleColl &dottedRuleCol = *m_dottedRuleColls[range.GetStartPos()]; const DottedRuleList &expandableDottedRuleList = dottedRuleCol.GetExpandableDottedRuleList(); - + const ChartCellLabel &sourceWordLabel = GetSourceAt(absEndPos); // loop through the rules - // (note that expandableDottedRuleList can be expanded as the loop runs + // (note that expandableDottedRuleList can be expanded as the loop runs // through calls to ExtendPartialRuleApplication()) for (size_t ind = 0; ind < expandableDottedRuleList.size(); ++ind) { // rule we are about to extend const DottedRuleInMemory &prevDottedRule = *expandableDottedRuleList[ind]; // we will now try to extend it, starting after where it ended size_t startPos = prevDottedRule.IsRoot() - ? range.GetStartPos() - : prevDottedRule.GetWordsRange().GetEndPos() + 1; + ? range.GetStartPos() + : prevDottedRule.GetWordsRange().GetEndPos() + 1; // search for terminal symbol // (if only one more word position needs to be covered) @@ -100,15 +100,15 @@ void ChartRuleLookupManagerMemory::GetChartRuleCollection( // if we found a new rule -> create it and add it to the list if (node != NULL) { - // create the rule + // create the rule #ifdef USE_BOOST_POOL DottedRuleInMemory *dottedRule = m_dottedRulePool.malloc(); new (dottedRule) DottedRuleInMemory(*node, sourceWordLabel, prevDottedRule); #else DottedRuleInMemory *dottedRule = new DottedRuleInMemory(*node, - sourceWordLabel, - prevDottedRule); + sourceWordLabel, + prevDottedRule); #endif dottedRuleCol.Add(relEndPos+1, dottedRule); } @@ -134,9 +134,7 @@ void ChartRuleLookupManagerMemory::GetChartRuleCollection( // word. 
endPos = absEndPos - 1; stackInd = relEndPos; - } - else - { + } else { endPos = absEndPos; stackInd = relEndPos + 1; } @@ -208,7 +206,7 @@ void ChartRuleLookupManagerMemory::ExtendPartialRuleApplication( // We'll do whichever minimises the number of lookups: if (numCombinations <= numChildren*2) { - // loop over possible source non-terminal labels (as found in input tree) + // loop over possible source non-terminal labels (as found in input tree) NonTerminalSet::const_iterator p = sourceNonTerms.begin(); NonTerminalSet::const_iterator sEnd = sourceNonTerms.end(); for (; p != sEnd; ++p) { @@ -235,14 +233,12 @@ void ChartRuleLookupManagerMemory::ExtendPartialRuleApplication( new (rule) DottedRuleInMemory(*child, cellLabel, prevDottedRule); #else DottedRuleInMemory *rule = new DottedRuleInMemory(*child, cellLabel, - prevDottedRule); + prevDottedRule); #endif dottedRuleColl.Add(stackInd, rule); } } - } - else - { + } else { // loop over possible expansions of the rule PhraseDictionaryNodeMemory::NonTerminalMap::const_iterator p; PhraseDictionaryNodeMemory::NonTerminalMap::const_iterator end = @@ -267,7 +263,7 @@ void ChartRuleLookupManagerMemory::ExtendPartialRuleApplication( new (rule) DottedRuleInMemory(child, *cellLabel, prevDottedRule); #else DottedRuleInMemory *rule = new DottedRuleInMemory(child, *cellLabel, - prevDottedRule); + prevDottedRule); #endif dottedRuleColl.Add(stackInd, rule); } diff --git a/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemory.h b/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemory.h index 784e1c70d..74bc7d253 100644 --- a/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemory.h +++ b/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemory.h @@ -44,7 +44,7 @@ class WordsRange; //! Implementation of ChartRuleLookupManager for in-memory rule tables. 
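The numCombinations <= numChildren*2 branch in ExtendPartialRuleApplication() above picks the cheaper side of a join: either enumerate the non-terminal labels present in the input span and probe the rule trie for each, or enumerate the trie node's children and probe the span's label set. An illustrative sketch (simplified containers, not the Moses classes):

#include <map>
#include <set>
#include <string>
#include <vector>

std::vector<std::string> MatchLabels(const std::set<std::string>& sourceNonTerms,
                                     const std::map<std::string, int>& trieChildren,
                                     size_t targetLabelCount)
{
  std::vector<std::string> matched;
  size_t numCombinations = sourceNonTerms.size() * targetLabelCount;
  if (numCombinations <= trieChildren.size() * 2) {
    // few label combinations: enumerate input labels, probe the trie
    for (std::set<std::string>::const_iterator p = sourceNonTerms.begin();
         p != sourceNonTerms.end(); ++p)
      if (trieChildren.count(*p))
        matched.push_back(*p);
  } else {
    // few trie children: enumerate them, probe the input label set instead
    for (std::map<std::string, int>::const_iterator p = trieChildren.begin();
         p != trieChildren.end(); ++p)
      if (sourceNonTerms.count(p->first))
        matched.push_back(p->first);
  }
  return matched;
}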
class ChartRuleLookupManagerMemory : public ChartRuleLookupManagerCYKPlus { - public: +public: ChartRuleLookupManagerMemory(const InputType &sentence, const ChartCellCollectionBase &cellColl, const PhraseDictionaryMemory &ruleTable); @@ -55,7 +55,7 @@ class ChartRuleLookupManagerMemory : public ChartRuleLookupManagerCYKPlus const WordsRange &range, ChartParserCallback &outColl); - private: +private: void ExtendPartialRuleApplication( const DottedRuleInMemory &prevDottedRule, size_t startPos, diff --git a/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemoryPerSentence.cpp b/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemoryPerSentence.cpp index 4ad60eb43..412840782 100644 --- a/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemoryPerSentence.cpp +++ b/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemoryPerSentence.cpp @@ -75,17 +75,17 @@ void ChartRuleLookupManagerMemoryPerSentence::GetChartRuleCollection( // get list of all rules that apply to spans at same starting position DottedRuleColl &dottedRuleCol = *m_dottedRuleColls[range.GetStartPos()]; const DottedRuleList &expandableDottedRuleList = dottedRuleCol.GetExpandableDottedRuleList(); - + // loop through the rules - // (note that expandableDottedRuleList can be expanded as the loop runs + // (note that expandableDottedRuleList can be expanded as the loop runs // through calls to ExtendPartialRuleApplication()) for (size_t ind = 0; ind < expandableDottedRuleList.size(); ++ind) { // rule we are about to extend const DottedRuleInMemory &prevDottedRule = *expandableDottedRuleList[ind]; // we will now try to extend it, starting after where it ended size_t startPos = prevDottedRule.IsRoot() - ? range.GetStartPos() - : prevDottedRule.GetWordsRange().GetEndPos() + 1; + ? range.GetStartPos() + : prevDottedRule.GetWordsRange().GetEndPos() + 1; // search for terminal symbol // (if only one more word position needs to be covered) @@ -99,15 +99,15 @@ void ChartRuleLookupManagerMemoryPerSentence::GetChartRuleCollection( // if we found a new rule -> create it and add it to the list if (node != NULL) { - // create the rule + // create the rule #ifdef USE_BOOST_POOL DottedRuleInMemory *dottedRule = m_dottedRulePool.malloc(); new (dottedRule) DottedRuleInMemory(*node, sourceWordLabel, prevDottedRule); #else DottedRuleInMemory *dottedRule = new DottedRuleInMemory(*node, - sourceWordLabel, - prevDottedRule); + sourceWordLabel, + prevDottedRule); #endif dottedRuleCol.Add(relEndPos+1, dottedRule); } @@ -133,9 +133,7 @@ void ChartRuleLookupManagerMemoryPerSentence::GetChartRuleCollection( // word. 
endPos = absEndPos - 1; stackInd = relEndPos; - } - else - { + } else { endPos = absEndPos; stackInd = relEndPos + 1; } @@ -207,7 +205,7 @@ void ChartRuleLookupManagerMemoryPerSentence::ExtendPartialRuleApplication( // We'll do whichever minimises the number of lookups: if (numCombinations <= numChildren*2) { - // loop over possible source non-terminal labels (as found in input tree) + // loop over possible source non-terminal labels (as found in input tree) NonTerminalSet::const_iterator p = sourceNonTerms.begin(); NonTerminalSet::const_iterator sEnd = sourceNonTerms.end(); for (; p != sEnd; ++p) { @@ -234,14 +232,12 @@ void ChartRuleLookupManagerMemoryPerSentence::ExtendPartialRuleApplication( new (rule) DottedRuleInMemory(*child, cellLabel, prevDottedRule); #else DottedRuleInMemory *rule = new DottedRuleInMemory(*child, cellLabel, - prevDottedRule); + prevDottedRule); #endif dottedRuleColl.Add(stackInd, rule); } } - } - else - { + } else { // loop over possible expansions of the rule PhraseDictionaryNodeMemory::NonTerminalMap::const_iterator p; PhraseDictionaryNodeMemory::NonTerminalMap::const_iterator end = @@ -266,7 +262,7 @@ void ChartRuleLookupManagerMemoryPerSentence::ExtendPartialRuleApplication( new (rule) DottedRuleInMemory(child, *cellLabel, prevDottedRule); #else DottedRuleInMemory *rule = new DottedRuleInMemory(child, *cellLabel, - prevDottedRule); + prevDottedRule); #endif dottedRuleColl.Add(stackInd, rule); } diff --git a/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemoryPerSentence.h b/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemoryPerSentence.h index 6f2b209a7..ebb8cdd7c 100644 --- a/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemoryPerSentence.h +++ b/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemoryPerSentence.h @@ -44,10 +44,10 @@ class WordsRange; //! Implementation of ChartRuleLookupManager for in-memory rule tables. class ChartRuleLookupManagerMemoryPerSentence : public ChartRuleLookupManagerCYKPlus { - public: +public: ChartRuleLookupManagerMemoryPerSentence(const InputType &sentence, - const ChartCellCollectionBase &cellColl, - const PhraseDictionaryFuzzyMatch &ruleTable); + const ChartCellCollectionBase &cellColl, + const PhraseDictionaryFuzzyMatch &ruleTable); ~ChartRuleLookupManagerMemoryPerSentence(); @@ -55,7 +55,7 @@ class ChartRuleLookupManagerMemoryPerSentence : public ChartRuleLookupManagerCYK const WordsRange &range, ChartParserCallback &outColl); - private: +private: void ExtendPartialRuleApplication( const DottedRuleInMemory &prevDottedRule, size_t startPos, diff --git a/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerOnDisk.h b/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerOnDisk.h index 23f83623d..24d06270b 100644 --- a/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerOnDisk.h +++ b/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerOnDisk.h @@ -35,7 +35,7 @@ namespace Moses //! Implementation of ChartRuleLookupManager for on-disk rule tables. 
class ChartRuleLookupManagerOnDisk : public ChartRuleLookupManagerCYKPlus { - public: +public: ChartRuleLookupManagerOnDisk(const InputType &sentence, const ChartCellCollectionBase &cellColl, const PhraseDictionaryOnDisk &dictionary, @@ -49,7 +49,7 @@ class ChartRuleLookupManagerOnDisk : public ChartRuleLookupManagerCYKPlus virtual void GetChartRuleCollection(const WordsRange &range, ChartParserCallback &outColl); - private: +private: const PhraseDictionaryOnDisk &m_dictionary; OnDiskPt::OnDiskWrapper &m_dbWrapper; const std::vector &m_inputFactorsVec; diff --git a/moses/TranslationModel/CYKPlusParser/DotChart.h b/moses/TranslationModel/CYKPlusParser/DotChart.h index 9dd34593f..946f36ff2 100644 --- a/moses/TranslationModel/CYKPlusParser/DotChart.h +++ b/moses/TranslationModel/CYKPlusParser/DotChart.h @@ -28,26 +28,38 @@ namespace Moses */ class DottedRule { - public: +public: // used only to init dot stack. DottedRule() - : m_cellLabel(NULL) - , m_prev(NULL) {} + : m_cellLabel(NULL) + , m_prev(NULL) {} DottedRule(const ChartCellLabel &ccl, const DottedRule &prev) - : m_cellLabel(&ccl) - , m_prev(&prev) {} + : m_cellLabel(&ccl) + , m_prev(&prev) {} - const WordsRange &GetWordsRange() const { return m_cellLabel->GetCoverage(); } - const Word &GetSourceWord() const { return m_cellLabel->GetLabel(); } - bool IsNonTerminal() const { return m_cellLabel->GetLabel().IsNonTerminal(); } - const DottedRule *GetPrev() const { return m_prev; } - bool IsRoot() const { return m_prev == NULL; } - const ChartCellLabel &GetChartCellLabel() const { return *m_cellLabel; } + const WordsRange &GetWordsRange() const { + return m_cellLabel->GetCoverage(); + } + const Word &GetSourceWord() const { + return m_cellLabel->GetLabel(); + } + bool IsNonTerminal() const { + return m_cellLabel->GetLabel().IsNonTerminal(); + } + const DottedRule *GetPrev() const { + return m_prev; + } + bool IsRoot() const { + return m_prev == NULL; + } + const ChartCellLabel &GetChartCellLabel() const { + return *m_cellLabel; + } - private: +private: const ChartCellLabel *m_cellLabel; // usually contains something, unless - // it's the init processed rule + // it's the init processed rule const DottedRule *m_prev; }; diff --git a/moses/TranslationModel/CYKPlusParser/DotChartInMemory.cpp b/moses/TranslationModel/CYKPlusParser/DotChartInMemory.cpp index a28387027..616a2907c 100644 --- a/moses/TranslationModel/CYKPlusParser/DotChartInMemory.cpp +++ b/moses/TranslationModel/CYKPlusParser/DotChartInMemory.cpp @@ -1,17 +1,17 @@ /*********************************************************************** Moses - statistical machine translation system Copyright (C) 2006-2011 University of Edinburgh - + This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. - + This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
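The DottedRule nodes above (DotChart.h) form immutable backward-linked lists: each extension allocates a new node pointing at its predecessor, so a completed rule application is recovered by following the prev pointers back to the root. A sketch of that traversal with a stand-in struct (not the Moses class):

#include <vector>

struct Dotted {                 // stand-in for DottedRule
  int label;                    // stand-in for the chart cell label
  const Dotted* prev;           // NULL at the root, as in DottedRule::IsRoot()
};

// collect the labels of a dotted rule, left to right
std::vector<int> ReadOff(const Dotted* rule)
{
  std::vector<int> labels;
  for (; rule != 0 && rule->prev != 0; rule = rule->prev)
    labels.push_back(rule->label);            // the root node carries no label
  return std::vector<int>(labels.rbegin(), labels.rend());
}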
- + You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA diff --git a/moses/TranslationModel/CYKPlusParser/DotChartInMemory.h b/moses/TranslationModel/CYKPlusParser/DotChartInMemory.h index f0753a8f1..cfd986d7a 100644 --- a/moses/TranslationModel/CYKPlusParser/DotChartInMemory.h +++ b/moses/TranslationModel/CYKPlusParser/DotChartInMemory.h @@ -32,21 +32,23 @@ namespace Moses */ class DottedRuleInMemory : public DottedRule { - public: +public: // used only to init dot stack. explicit DottedRuleInMemory(const PhraseDictionaryNodeMemory &node) - : DottedRule() - , m_node(node) {} + : DottedRule() + , m_node(node) {} DottedRuleInMemory(const PhraseDictionaryNodeMemory &node, const ChartCellLabel &cellLabel, const DottedRuleInMemory &prev) - : DottedRule(cellLabel, prev) - , m_node(node) {} - - const PhraseDictionaryNodeMemory &GetLastNode() const { return m_node; } + : DottedRule(cellLabel, prev) + , m_node(node) {} - private: + const PhraseDictionaryNodeMemory &GetLastNode() const { + return m_node; + } + +private: const PhraseDictionaryNodeMemory &m_node; }; diff --git a/moses/TranslationModel/CYKPlusParser/DotChartOnDisk.h b/moses/TranslationModel/CYKPlusParser/DotChartOnDisk.h index 5b756ba8d..edd9f3a62 100644 --- a/moses/TranslationModel/CYKPlusParser/DotChartOnDisk.h +++ b/moses/TranslationModel/CYKPlusParser/DotChartOnDisk.h @@ -36,26 +36,32 @@ namespace Moses */ class DottedRuleOnDisk : public DottedRule { - public: +public: // used only to init dot stack. explicit DottedRuleOnDisk(const OnDiskPt::PhraseNode &lastNode) - : DottedRule() - , m_lastNode(lastNode) - , m_done(false) {} + : DottedRule() + , m_lastNode(lastNode) + , m_done(false) {} DottedRuleOnDisk(const OnDiskPt::PhraseNode &lastNode, const ChartCellLabel &cellLabel, const DottedRuleOnDisk &prev) - : DottedRule(cellLabel, prev) - , m_lastNode(lastNode) - , m_done(false) {} + : DottedRule(cellLabel, prev) + , m_lastNode(lastNode) + , m_done(false) {} - const OnDiskPt::PhraseNode &GetLastNode() const { return m_lastNode; } + const OnDiskPt::PhraseNode &GetLastNode() const { + return m_lastNode; + } - bool Done() const { return m_done; } - void Done(bool value) const { m_done = value; } + bool Done() const { + return m_done; + } + void Done(bool value) const { + m_done = value; + } - private: +private: const OnDiskPt::PhraseNode &m_lastNode; mutable bool m_done; }; diff --git a/moses/TranslationModel/CompactPT/BlockHashIndex.cpp b/moses/TranslationModel/CompactPT/BlockHashIndex.cpp index 705493ab7..9afe474f7 100644 --- a/moses/TranslationModel/CompactPT/BlockHashIndex.cpp +++ b/moses/TranslationModel/CompactPT/BlockHashIndex.cpp @@ -1,23 +1,23 @@ -// $Id$ -// vim:tabstop=2 -/*********************************************************************** -Moses - factored phrase-based language decoder -Copyright (C) 2006 University of Edinburgh - -This library is free software; you can redistribute it and/or -modify it under the terms of the GNU Lesser General Public -License as published by the Free Software Foundation; either -version 2.1 of the License, or (at your option) any later version. - -This library is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -Lesser General Public License for more details. 
- -You should have received a copy of the GNU Lesser General Public -License along with this library; if not, write to the Free Software -Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -***********************************************************************/ +// $Id$ +// vim:tabstop=2 +/*********************************************************************** +Moses - factored phrase-based language decoder +Copyright (C) 2006 University of Edinburgh + +This library is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License as published by the Free Software Foundation; either +version 2.1 of the License, or (at your option) any later version. + +This library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Lesser General Public License for more details. + +You should have received a copy of the GNU Lesser General Public +License along with this library; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ #include "ThrowingFwrite.h" #include "BlockHashIndex.h" @@ -32,25 +32,27 @@ namespace Moses #ifdef WITH_THREADS BlockHashIndex::BlockHashIndex(size_t orderBits, size_t fingerPrintBits, size_t threadsNum) -: m_orderBits(orderBits), m_fingerPrintBits(fingerPrintBits), - m_fileHandle(0), m_fileHandleStart(0), m_size(0), - m_lastSaved(-1), m_lastDropped(-1), m_numLoadedRanges(0), - m_threadPool(threadsNum) { + : m_orderBits(orderBits), m_fingerPrintBits(fingerPrintBits), + m_fileHandle(0), m_fileHandleStart(0), m_size(0), + m_lastSaved(-1), m_lastDropped(-1), m_numLoadedRanges(0), + m_threadPool(threadsNum) +{ #ifndef HAVE_CMPH - std::cerr << "minphr: CMPH support not compiled in." << std::endl; - exit(1); -#endif - } + std::cerr << "minphr: CMPH support not compiled in." << std::endl; + exit(1); +#endif +} #else BlockHashIndex::BlockHashIndex(size_t orderBits, size_t fingerPrintBits) -: m_orderBits(orderBits), m_fingerPrintBits(fingerPrintBits), - m_fileHandle(0), m_fileHandleStart(0), m_size(0), - m_lastSaved(-1), m_lastDropped(-1), m_numLoadedRanges(0) { + : m_orderBits(orderBits), m_fingerPrintBits(fingerPrintBits), + m_fileHandle(0), m_fileHandleStart(0), m_size(0), + m_lastSaved(-1), m_lastDropped(-1), m_numLoadedRanges(0) +{ #ifndef HAVE_CMPH - std::cerr << "minphr: CMPH support not compiled in." << std::endl; - exit(1); -#endif - } + std::cerr << "minphr: CMPH support not compiled in." 
<< std::endl; + exit(1); +#endif +} #endif BlockHashIndex::~BlockHashIndex() @@ -60,7 +62,7 @@ BlockHashIndex::~BlockHashIndex() it != m_hashes.end(); it++) if(*it != 0) cmph_destroy((cmph_t*)*it); - + for(std::vector*>::iterator it = m_arrays.begin(); it != m_arrays.end(); it++) if(*it != 0) @@ -72,15 +74,15 @@ size_t BlockHashIndex::GetHash(const char* key) { std::string keyStr(key); size_t i = std::distance(m_landmarks.begin(), - std::upper_bound(m_landmarks.begin(), - m_landmarks.end(), keyStr)) - 1; - + std::upper_bound(m_landmarks.begin(), + m_landmarks.end(), keyStr)) - 1; + if(i == 0ul-1) return GetSize(); - + size_t pos = GetHash(i, key); if(pos != GetSize()) - return (1ul << m_orderBits) * i + pos; + return (1ul << m_orderBits) * i + pos; else return GetSize(); } @@ -100,7 +102,7 @@ size_t BlockHashIndex::GetHash(size_t i, const char* key) #endif if(m_hashes[i] == 0) LoadRange(i); -#ifdef HAVE_CMPH +#ifdef HAVE_CMPH size_t idx = cmph_search((cmph_t*)m_hashes[i], key, (cmph_uint32) strlen(key)); #else assert(0); @@ -109,11 +111,11 @@ size_t BlockHashIndex::GetHash(size_t i, const char* key) std::pair orderPrint = m_arrays[i]->Get(idx, m_orderBits, m_fingerPrintBits); m_clocks[i] = clock(); - + if(GetFprint(key) == orderPrint.second) - return orderPrint.first; + return orderPrint.first; else - return GetSize(); + return GetSize(); } size_t BlockHashIndex::GetHash(std::string key) @@ -144,11 +146,11 @@ void BlockHashIndex::BeginSave(std::FILE * mphf) m_fileHandle = mphf; ThrowingFwrite(&m_orderBits, sizeof(size_t), 1, m_fileHandle); ThrowingFwrite(&m_fingerPrintBits, sizeof(size_t), 1, m_fileHandle); - + m_fileHandleStart = std::ftell(m_fileHandle); - + size_t relIndexPos = 0; - ThrowingFwrite(&relIndexPos, sizeof(size_t), 1, m_fileHandle); + ThrowingFwrite(&relIndexPos, sizeof(size_t), 1, m_fileHandle); } void BlockHashIndex::SaveRange(size_t i) @@ -168,25 +170,22 @@ void BlockHashIndex::SaveLastRange() boost::mutex::scoped_lock lock(m_mutex); #endif - while(!m_queue.empty() && m_lastSaved + 1 == -m_queue.top()) - { + while(!m_queue.empty() && m_lastSaved + 1 == -m_queue.top()) { size_t current = -m_queue.top(); m_queue.pop(); SaveRange(current); m_lastSaved = current; - } + } } void BlockHashIndex::DropRange(size_t i) { #ifdef HAVE_CMPH - if(m_hashes[i] != 0) - { + if(m_hashes[i] != 0) { cmph_destroy((cmph_t*)m_hashes[i]); m_hashes[i] = 0; } - if(m_arrays[i] != 0) - { + if(m_arrays[i] != 0) { delete m_arrays[i]; m_arrays[i] = 0; m_clocks[i] = 0; @@ -201,7 +200,7 @@ void BlockHashIndex::DropLastRange() boost::mutex::scoped_lock lock(m_mutex); #endif - while(m_lastDropped != m_lastSaved) + while(m_lastDropped != m_lastSaved) DropRange(++m_lastDropped); } @@ -219,24 +218,24 @@ size_t BlockHashIndex::FinalizeSave() #endif SaveLastRange(); - + size_t relIndexPos = std::ftell(m_fileHandle) - m_fileHandleStart; - + std::fseek(m_fileHandle, m_fileHandleStart, SEEK_SET); ThrowingFwrite(&relIndexPos, sizeof(size_t), 1, m_fileHandle); - + std::fseek(m_fileHandle, m_fileHandleStart + relIndexPos, SEEK_SET); m_landmarks.save(m_fileHandle); - + size_t seekIndexSize = m_seekIndex.size(); ThrowingFwrite(&seekIndexSize, sizeof(size_t), 1, m_fileHandle); ThrowingFwrite(&m_seekIndex[0], sizeof(size_t), seekIndexSize, m_fileHandle); - + ThrowingFwrite(&m_size, sizeof(size_t), 1, m_fileHandle); - + size_t fileHandleStop = std::ftell(m_fileHandle); return fileHandleStop - m_fileHandleStart + sizeof(m_orderBits) - + sizeof(m_fingerPrintBits); + + sizeof(m_fingerPrintBits); } size_t 
BlockHashIndex::Save(std::FILE * mphf) @@ -251,14 +250,14 @@ size_t BlockHashIndex::Save(std::FILE * mphf) size_t BlockHashIndex::LoadIndex(std::FILE* mphf) { m_fileHandle = mphf; - + size_t beginning = std::ftell(mphf); size_t read = 0; read += std::fread(&m_orderBits, sizeof(size_t), 1, mphf); read += std::fread(&m_fingerPrintBits, sizeof(size_t), 1, mphf); m_fileHandleStart = std::ftell(m_fileHandle); - + size_t relIndexPos; read += std::fread(&relIndexPos, sizeof(size_t), 1, mphf); std::fseek(m_fileHandle, m_fileHandleStart + relIndexPos, SEEK_SET); @@ -272,12 +271,12 @@ size_t BlockHashIndex::LoadIndex(std::FILE* mphf) m_hashes.resize(seekIndexSize, 0); m_clocks.resize(seekIndexSize, 0); m_arrays.resize(seekIndexSize, 0); - + read += std::fread(&m_size, sizeof(size_t), 1, m_fileHandle); size_t end = std::ftell(mphf); - return end - beginning; + return end - beginning; } void BlockHashIndex::LoadRange(size_t i) @@ -288,10 +287,10 @@ void BlockHashIndex::LoadRange(size_t i) m_arrays[i] = new PairedPackedArray<>(0, m_orderBits, m_fingerPrintBits); m_arrays[i]->Load(m_fileHandle); - + m_hashes[i] = (void*)hash; m_clocks[i] = clock(); - + m_numLoadedRanges++; #endif } @@ -308,9 +307,9 @@ size_t BlockHashIndex::Load(std::FILE * mphf) { size_t byteSize = LoadIndex(mphf); size_t end = std::ftell(mphf); - + for(size_t i = 0; i < m_seekIndex.size(); i++) - LoadRange(i); + LoadRange(i); std::fseek(m_fileHandle, end, SEEK_SET); return byteSize; } @@ -327,14 +326,13 @@ void BlockHashIndex::KeepNLastRanges(float ratio, float tolerance) #endif size_t n = m_hashes.size() * ratio; size_t max = n * (1 + tolerance); - if(m_numLoadedRanges > max) - { + if(m_numLoadedRanges > max) { typedef std::vector > LastLoaded; LastLoaded lastLoaded; for(size_t i = 0; i < m_hashes.size(); i++) if(m_hashes[i] != 0) lastLoaded.push_back(std::make_pair(m_clocks[i], i)); - + std::sort(lastLoaded.begin(), lastLoaded.end()); for(LastLoaded::reverse_iterator it = lastLoaded.rbegin() + size_t(n * (1 - tolerance)); it != lastLoaded.rend(); it++) @@ -348,24 +346,23 @@ void BlockHashIndex::CalcHash(size_t current, void* source_void) cmph_io_adapter_t* source = (cmph_io_adapter_t*) source_void; cmph_config_t *config = cmph_config_new(source); cmph_config_set_algo(config, CMPH_CHD); - + cmph_t* hash = cmph_new(config); PairedPackedArray<> *pv = new PairedPackedArray<>(source->nkeys, m_orderBits, m_fingerPrintBits); size_t i = 0; - + source->rewind(source->data); - + std::string lastKey = ""; - while(i < source->nkeys) - { + while(i < source->nkeys) { unsigned keylen; char* key; source->read(source->data, &key, &keylen); std::string temp(key, keylen); source->dispose(source->data, key, keylen); - + if(lastKey > temp) { if(source->nkeys != 2 || temp != "###DUMMY_KEY###") { std::cerr << "ERROR: Input file does not appear to be sorted with LC_ALL=C sort" << std::endl; @@ -375,41 +372,40 @@ void BlockHashIndex::CalcHash(size_t current, void* source_void) } } lastKey = temp; - + size_t fprint = GetFprint(temp.c_str()); size_t idx = cmph_search(hash, temp.c_str(), (cmph_uint32) temp.size()); - + pv->Set(idx, i, fprint, m_orderBits, m_fingerPrintBits); i++; } - + cmph_config_destroy(config); - + #ifdef WITH_THREADS boost::mutex::scoped_lock lock(m_mutex); #endif - if(m_hashes.size() <= current) - { - m_hashes.resize(current + 1, 0); + if(m_hashes.size() <= current) { + m_hashes.resize(current + 1, 0); m_arrays.resize(current + 1, 0); m_clocks.resize(current + 1, 0); } - + m_hashes[current] = (void*)hash; m_arrays[current] = pv; 
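// A condensed sketch (stand-in types; the real code packs slots with CMPH
// and PairedPackedArray) of the lookup scheme built in this file: a
// per-block minimal perfect hash maps every in-block key to a unique slot
// but maps unknown keys to arbitrary slots, so each slot stores a
// fingerprint of its key to reject false hits; landmark strings pick the
// block by binary search, mirroring GetHash() above.
#include <algorithm>
#include <string>
#include <utility>
#include <vector>

struct Block {
  std::vector<std::pair<size_t, size_t> > slots; // (order in block, fingerprint)
  size_t Slot(const std::string& key) const {    // toy stand-in for the perfect hash
    return key.size() % slots.size();
  }
};

size_t Lookup(const std::vector<std::string>& landmarks, // first key of each block
              const std::vector<Block>& blocks,
              const std::string& key, size_t fprint, size_t notFound)
{
  size_t i = std::upper_bound(landmarks.begin(), landmarks.end(), key)
             - landmarks.begin();
  if (i == 0) return notFound;                   // sorts before every landmark
  const Block& b = blocks[i - 1];
  if (b.slots.empty()) return notFound;
  const std::pair<size_t, size_t>& hit = b.slots[b.Slot(key)];
  return hit.second == fprint ? hit.first : notFound; // fingerprint must match
}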
m_clocks[current] = clock(); - m_queue.push(-current); + m_queue.push(-current); #endif } -#ifdef HAVE_CMPH +#ifdef HAVE_CMPH void* BlockHashIndex::vectorAdapter(std::vector& v) { return (void*)CmphVectorAdapter(v); } - + void* BlockHashIndex::vectorAdapter(StringVector& sv) { return (void*)CmphStringVectorAdapter(sv); diff --git a/moses/TranslationModel/CompactPT/BlockHashIndex.h b/moses/TranslationModel/CompactPT/BlockHashIndex.h index 8541a2a19..c245d2d66 100644 --- a/moses/TranslationModel/CompactPT/BlockHashIndex.h +++ b/moses/TranslationModel/CompactPT/BlockHashIndex.h @@ -1,23 +1,23 @@ -// $Id$ -// vim:tabstop=2 -/*********************************************************************** -Moses - factored phrase-based language decoder -Copyright (C) 2006 University of Edinburgh - -This library is free software; you can redistribute it and/or -modify it under the terms of the GNU Lesser General Public -License as published by the Free Software Foundation; either -version 2.1 of the License, or (at your option) any later version. - -This library is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -Lesser General Public License for more details. - -You should have received a copy of the GNU Lesser General Public -License along with this library; if not, write to the Free Software -Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -***********************************************************************/ +// $Id$ +// vim:tabstop=2 +/*********************************************************************** +Moses - factored phrase-based language decoder +Copyright (C) 2006 University of Edinburgh + +This library is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License as published by the Free Software Foundation; either +version 2.1 of the License, or (at your option) any later version. + +This library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Lesser General Public License for more details. 
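AddRange() in the header below relies on the caller streaming key ranges in strict global sort order (hence the LC_ALL=C error message): the first key of each range becomes the block's landmark, which is exactly what the binary search in GetHash() consults. A sketch of the invariant it enforces (simplified types, not the Moses members):

#include <string>
#include <vector>

bool CanAppendRange(const std::vector<std::string>& landmarks,
                    const std::vector<std::string>& keys)
{
  if (keys.empty()) return false;
  // the new range must sort strictly after the previous block's first key
  return landmarks.empty() || landmarks.back() < keys.front();
}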
+ +You should have received a copy of the GNU Lesser General Public +License along with this library; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ #ifndef moses_BlockHashIndex_h #define moses_BlockHashIndex_h @@ -25,7 +25,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #include #include #include -#include +#include #include #include @@ -42,144 +42,139 @@ namespace Moses class BlockHashIndex { - private: - std::priority_queue m_queue; - - size_t m_orderBits; - size_t m_fingerPrintBits; +private: + std::priority_queue m_queue; + + size_t m_orderBits; + size_t m_fingerPrintBits; + + std::FILE* m_fileHandle; + size_t m_fileHandleStart; + + StringVector m_landmarks; + + std::vector m_hashes; + std::vector m_clocks; + std::vector*> m_arrays; + + std::vector m_seekIndex; + + size_t m_size; + int m_lastSaved; + int m_lastDropped; + size_t m_numLoadedRanges; - std::FILE* m_fileHandle; - size_t m_fileHandleStart; - - StringVector m_landmarks; - - std::vector m_hashes; - std::vector m_clocks; - std::vector*> m_arrays; - - std::vector m_seekIndex; - - size_t m_size; - int m_lastSaved; - int m_lastDropped; - size_t m_numLoadedRanges; - #ifdef WITH_THREADS - ThreadPool m_threadPool; - boost::mutex m_mutex; - - template - class HashTask : public Task - { - public: - HashTask(int id, BlockHashIndex& hash, Keys& keys) - : m_id(id), m_hash(hash), m_keys(new Keys(keys)) {} - - virtual void Run() - { - m_hash.CalcHash(m_id, *m_keys); - } - - virtual ~HashTask() - { - delete m_keys; - } - - private: - int m_id; - BlockHashIndex& m_hash; - Keys* m_keys; - }; -#endif - - size_t GetFprint(const char* key) const; - size_t GetHash(size_t i, const char* key); - + ThreadPool m_threadPool; + boost::mutex m_mutex; + + template + class HashTask : public Task + { public: -#ifdef WITH_THREADS - BlockHashIndex(size_t orderBits, size_t fingerPrintBits, - size_t threadsNum = 2); -#else - BlockHashIndex(size_t orderBits, size_t fingerPrintBits); -#endif + HashTask(int id, BlockHashIndex& hash, Keys& keys) + : m_id(id), m_hash(hash), m_keys(new Keys(keys)) {} - ~BlockHashIndex(); - - size_t GetHash(const char* key); - size_t GetHash(std::string key); - - size_t operator[](std::string key); - size_t operator[](char* key); - - void BeginSave(std::FILE* mphf); - void SaveRange(size_t i); - void SaveLastRange(); - size_t FinalizeSave(); - -#ifdef WITH_THREADS - void WaitAll(); -#endif - - void DropRange(size_t i); - void DropLastRange(); - - size_t LoadIndex(std::FILE* mphf); - void LoadRange(size_t i); - - size_t Save(std::string filename); - size_t Save(std::FILE * mphf); - - size_t Load(std::string filename); - size_t Load(std::FILE * mphf); - - size_t GetSize() const; - - void KeepNLastRanges(float ratio = 0.1, float tolerance = 0.1); - - template - void AddRange(Keys &keys) - { - size_t current = m_landmarks.size(); - - if(m_landmarks.size() && m_landmarks.back().str() >= keys[0]) - { - std::cerr << "ERROR: Input file does not appear to be sorted with LC_ALL=C sort" << std::endl; - std::cerr << "1: " << m_landmarks.back().str() << std::endl; - std::cerr << "2: " << keys[0] << std::endl; - abort(); - } - - m_landmarks.push_back(keys[0]); - m_size += keys.size(); - - if(keys.size() == 1) { - // add dummy key to avoid null hash - keys.push_back("###DUMMY_KEY###"); - } - -#ifdef WITH_THREADS - HashTask* ht = new HashTask(current, *this, keys); - 
m_threadPool.Submit(ht); -#else - CalcHash(current, keys); -#endif - } - - template - void CalcHash(size_t current, Keys &keys) - { -#ifdef HAVE_CMPH - void* source = vectorAdapter(keys); - CalcHash(current, source); -#endif + virtual void Run() { + m_hash.CalcHash(m_id, *m_keys); } - void CalcHash(size_t current, void* source); - -#ifdef HAVE_CMPH - void* vectorAdapter(std::vector& v); - void* vectorAdapter(StringVector& sv); - void* vectorAdapter(StringVector& sv); + virtual ~HashTask() { + delete m_keys; + } + + private: + int m_id; + BlockHashIndex& m_hash; + Keys* m_keys; + }; +#endif + + size_t GetFprint(const char* key) const; + size_t GetHash(size_t i, const char* key); + +public: +#ifdef WITH_THREADS + BlockHashIndex(size_t orderBits, size_t fingerPrintBits, + size_t threadsNum = 2); +#else + BlockHashIndex(size_t orderBits, size_t fingerPrintBits); +#endif + + ~BlockHashIndex(); + + size_t GetHash(const char* key); + size_t GetHash(std::string key); + + size_t operator[](std::string key); + size_t operator[](char* key); + + void BeginSave(std::FILE* mphf); + void SaveRange(size_t i); + void SaveLastRange(); + size_t FinalizeSave(); + +#ifdef WITH_THREADS + void WaitAll(); +#endif + + void DropRange(size_t i); + void DropLastRange(); + + size_t LoadIndex(std::FILE* mphf); + void LoadRange(size_t i); + + size_t Save(std::string filename); + size_t Save(std::FILE * mphf); + + size_t Load(std::string filename); + size_t Load(std::FILE * mphf); + + size_t GetSize() const; + + void KeepNLastRanges(float ratio = 0.1, float tolerance = 0.1); + + template + void AddRange(Keys &keys) { + size_t current = m_landmarks.size(); + + if(m_landmarks.size() && m_landmarks.back().str() >= keys[0]) { + std::cerr << "ERROR: Input file does not appear to be sorted with LC_ALL=C sort" << std::endl; + std::cerr << "1: " << m_landmarks.back().str() << std::endl; + std::cerr << "2: " << keys[0] << std::endl; + abort(); + } + + m_landmarks.push_back(keys[0]); + m_size += keys.size(); + + if(keys.size() == 1) { + // add dummy key to avoid null hash + keys.push_back("###DUMMY_KEY###"); + } + +#ifdef WITH_THREADS + HashTask* ht = new HashTask(current, *this, keys); + m_threadPool.Submit(ht); +#else + CalcHash(current, keys); +#endif + } + + template + void CalcHash(size_t current, Keys &keys) { +#ifdef HAVE_CMPH + void* source = vectorAdapter(keys); + CalcHash(current, source); +#endif + } + + void CalcHash(size_t current, void* source); + +#ifdef HAVE_CMPH + void* vectorAdapter(std::vector& v); + void* vectorAdapter(StringVector& sv); + void* vectorAdapter(StringVector& sv); #endif }; diff --git a/moses/TranslationModel/CompactPT/CanonicalHuffman.h b/moses/TranslationModel/CompactPT/CanonicalHuffman.h index faf7ce411..8d6e1cbb1 100644 --- a/moses/TranslationModel/CompactPT/CanonicalHuffman.h +++ b/moses/TranslationModel/CompactPT/CanonicalHuffman.h @@ -1,23 +1,23 @@ -// $Id$ -// vim:tabstop=2 -/*********************************************************************** -Moses - factored phrase-based language decoder -Copyright (C) 2006 University of Edinburgh +// $Id$ +// vim:tabstop=2 +/*********************************************************************** +Moses - factored phrase-based language decoder +Copyright (C) 2006 University of Edinburgh -This library is free software; you can redistribute it and/or -modify it under the terms of the GNU Lesser General Public -License as published by the Free Software Foundation; either -version 2.1 of the License, or (at your option) any later version. 
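For reference, the AddRange() template reflowed above requires the whole key stream to arrive in strict byte order (the landmark check aborts with the LC_ALL=C hint otherwise), one block of keys per landmark. A hypothetical feeding loop is sketched below; it mirrors the AddRange/SaveLastRange/DropLastRange pattern that LexicalReorderingTableCreator::FlushEncodedQueue() uses later in this patch. The function name and parameters are invented, and it assumes BeginSave() was already called on the output file.

#include <cstddef>
#include <string>
#include <vector>
#include "moses/TranslationModel/CompactPT/BlockHashIndex.h"

// Hypothetical driver: keys must already be sorted as by `LC_ALL=C sort`.
// Each full block becomes one landmark plus one perfect-hash range that is
// hashed, written out, and dropped from memory straight away.
void AddAllKeys(Moses::BlockHashIndex& index,
                const std::vector<std::string>& sortedKeys,
                std::size_t blockSize)          // typically 1ul << orderBits
{
  std::vector<std::string> block;
  for (std::size_t i = 0; i < sortedKeys.size(); ++i) {
    block.push_back(sortedKeys[i]);
    if (block.size() == blockSize) {
      index.AddRange(block);      // aborts if block[0] <= previous landmark
      index.SaveLastRange();
      index.DropLastRange();
      block.clear();
    }
  }
  if (!block.empty())
    index.AddRange(block);        // final partial range
}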
+This library is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License as published by the Free Software Foundation; either +version 2.1 of the License, or (at your option) any later version. -This library is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -Lesser General Public License for more details. +This library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Lesser General Public License for more details. -You should have received a copy of the GNU Lesser General Public -License along with this library; if not, write to the Free Software -Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -***********************************************************************/ +You should have received a copy of the GNU Lesser General Public +License along with this library; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ #ifndef moses_CanonicalHuffman_h #define moses_CanonicalHuffman_h @@ -29,320 +29,293 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #include "ThrowingFwrite.h" -namespace Moses { +namespace Moses +{ template class CanonicalHuffman { - private: - std::vector m_symbols; - std::vector m_firstCodes; - std::vector m_lengthIndex; - - typedef boost::unordered_map > EncodeMap; - EncodeMap m_encodeMap; - - struct MinHeapSorter { - std::vector& m_vec; - - MinHeapSorter(std::vector& vec) : m_vec(vec) { } - - bool operator()(size_t a, size_t b) - { - return m_vec[a] > m_vec[b]; - } - }; - - template - void CalcLengths(Iterator begin, Iterator end, std::vector& lengths) - { - size_t n = std::distance(begin, end); - std::vector A(2 * n, 0); - - m_symbols.resize(n); - size_t i = 0; - for(Iterator it = begin; it != end; it++) - { - m_symbols[i] = it->first; - - A[i] = n + i; - A[n + i] = it->second; - i++; - } - - if(n == 1) - { - lengths.push_back(1); - return; - } - - MinHeapSorter hs(A); - std::make_heap(A.begin(), A.begin() + n, hs); - - size_t h = n; - size_t m1, m2; - while(h > 1) - { - m1 = A[0]; - std::pop_heap(A.begin(), A.begin() + h, hs); - - h--; - - m2 = A[0]; - std::pop_heap(A.begin(), A.begin() + h, hs); - - A[h] = A[m1] + A[m2]; - A[h-1] = h; - A[m1] = A[m2] = h; - - std::push_heap(A.begin(), A.begin() + h, hs); - } - - A[1] = 0; - for(size_t i = 2; i < 2*n; i++) - A[i] = A[A[i]] + 1; - - lengths.resize(n); - for(size_t i = 0; i < n; i++) - lengths[i] = A[i + n]; +private: + std::vector m_symbols; + std::vector m_firstCodes; + std::vector m_lengthIndex; + + typedef boost::unordered_map > EncodeMap; + EncodeMap m_encodeMap; + + struct MinHeapSorter { + std::vector& m_vec; + + MinHeapSorter(std::vector& vec) : m_vec(vec) { } + + bool operator()(size_t a, size_t b) { + return m_vec[a] > m_vec[b]; + } + }; + + template + void CalcLengths(Iterator begin, Iterator end, std::vector& lengths) { + size_t n = std::distance(begin, end); + std::vector A(2 * n, 0); + + m_symbols.resize(n); + size_t i = 0; + for(Iterator it = begin; it != end; it++) { + m_symbols[i] = it->first; + + A[i] = n + i; + A[n + i] = it->second; + i++; } - void CalcCodes(std::vector& lengths) - { - 
std::vector numLength; - for(std::vector::iterator it = lengths.begin(); - it != lengths.end(); it++) { - size_t length = *it; - if(numLength.size() <= length) - numLength.resize(length + 1, 0); - numLength[length]++; - } - - m_lengthIndex.resize(numLength.size()); - m_lengthIndex[0] = 0; - for(size_t l = 1; l < numLength.size(); l++) - m_lengthIndex[l] = m_lengthIndex[l - 1] + numLength[l - 1]; - - size_t maxLength = numLength.size() - 1; - - m_firstCodes.resize(maxLength + 1, 0); - for(size_t l = maxLength - 1; l > 0; l--) - m_firstCodes[l] = (m_firstCodes[l + 1] + numLength[l + 1]) / 2; - - std::vector t_symbols; - t_symbols.resize(lengths.size()); - - std::vector nextCode = m_firstCodes; - for(size_t i = 0; i < lengths.size(); i++) - { - Data data = m_symbols[i]; - size_t length = lengths[i]; - - size_t pos = m_lengthIndex[length] - + (nextCode[length] - m_firstCodes[length]); - t_symbols[pos] = data; - - nextCode[length] = nextCode[length] + 1; - } - - m_symbols.swap(t_symbols); + if(n == 1) { + lengths.push_back(1); + return; } - - void CreateCodeMap() - { - for(size_t l = 1; l < m_lengthIndex.size(); l++) - { - size_t intCode = m_firstCodes[l]; - size_t num = ((l+1 < m_lengthIndex.size()) ? m_lengthIndex[l+1] - : m_symbols.size()) - m_lengthIndex[l]; - - for(size_t i = 0; i < num; i++) - { - Data data = m_symbols[m_lengthIndex[l] + i]; - boost::dynamic_bitset<> bitCode(l, intCode); - m_encodeMap[data] = bitCode; - intCode++; - } + + MinHeapSorter hs(A); + std::make_heap(A.begin(), A.begin() + n, hs); + + size_t h = n; + size_t m1, m2; + while(h > 1) { + m1 = A[0]; + std::pop_heap(A.begin(), A.begin() + h, hs); + + h--; + + m2 = A[0]; + std::pop_heap(A.begin(), A.begin() + h, hs); + + A[h] = A[m1] + A[m2]; + A[h-1] = h; + A[m1] = A[m2] = h; + + std::push_heap(A.begin(), A.begin() + h, hs); + } + + A[1] = 0; + for(size_t i = 2; i < 2*n; i++) + A[i] = A[A[i]] + 1; + + lengths.resize(n); + for(size_t i = 0; i < n; i++) + lengths[i] = A[i + n]; + } + + void CalcCodes(std::vector& lengths) { + std::vector numLength; + for(std::vector::iterator it = lengths.begin(); + it != lengths.end(); it++) { + size_t length = *it; + if(numLength.size() <= length) + numLength.resize(length + 1, 0); + numLength[length]++; + } + + m_lengthIndex.resize(numLength.size()); + m_lengthIndex[0] = 0; + for(size_t l = 1; l < numLength.size(); l++) + m_lengthIndex[l] = m_lengthIndex[l - 1] + numLength[l - 1]; + + size_t maxLength = numLength.size() - 1; + + m_firstCodes.resize(maxLength + 1, 0); + for(size_t l = maxLength - 1; l > 0; l--) + m_firstCodes[l] = (m_firstCodes[l + 1] + numLength[l + 1]) / 2; + + std::vector t_symbols; + t_symbols.resize(lengths.size()); + + std::vector nextCode = m_firstCodes; + for(size_t i = 0; i < lengths.size(); i++) { + Data data = m_symbols[i]; + size_t length = lengths[i]; + + size_t pos = m_lengthIndex[length] + + (nextCode[length] - m_firstCodes[length]); + t_symbols[pos] = data; + + nextCode[length] = nextCode[length] + 1; + } + + m_symbols.swap(t_symbols); + } + + void CreateCodeMap() { + for(size_t l = 1; l < m_lengthIndex.size(); l++) { + size_t intCode = m_firstCodes[l]; + size_t num = ((l+1 < m_lengthIndex.size()) ? 
m_lengthIndex[l+1] + : m_symbols.size()) - m_lengthIndex[l]; + + for(size_t i = 0; i < num; i++) { + Data data = m_symbols[m_lengthIndex[l] + i]; + boost::dynamic_bitset<> bitCode(l, intCode); + m_encodeMap[data] = bitCode; + intCode++; } } - - boost::dynamic_bitset<>& Encode(Data data) - { - return m_encodeMap[data]; - } - - template - void PutCode(BitWrapper& bitWrapper, boost::dynamic_bitset<>& code) - { - for(int j = code.size()-1; j >= 0; j--) - bitWrapper.Put(code[j]); - } - - public: + } - template - CanonicalHuffman(Iterator begin, Iterator end, bool forEncoding = true) - { - std::vector lengths; - CalcLengths(begin, end, lengths); - CalcCodes(lengths); + boost::dynamic_bitset<>& Encode(Data data) { + return m_encodeMap[data]; + } - if(forEncoding) - CreateCodeMap(); - } - - CanonicalHuffman(std::FILE* pFile, bool forEncoding = false) - { - Load(pFile); - - if(forEncoding) - CreateCodeMap(); - } - - template - void Put(BitWrapper& bitWrapper, Data data) - { - PutCode(bitWrapper, Encode(data)); - } - - template - Data Read(BitWrapper& bitWrapper) - { - if(bitWrapper.TellFromEnd()) - { - size_t intCode = bitWrapper.Read(); - size_t len = 1; - while(intCode < m_firstCodes[len]) { - intCode = 2 * intCode + bitWrapper.Read(); - len++; - } - return m_symbols[m_lengthIndex[len] + (intCode - m_firstCodes[len])]; - } - return Data(); - } - - size_t Load(std::FILE* pFile) - { - size_t start = std::ftell(pFile); - size_t read = 0; - - size_t size; - read += std::fread(&size, sizeof(size_t), 1, pFile); - m_symbols.resize(size); - read += std::fread(&m_symbols[0], sizeof(Data), size, pFile); - - read += std::fread(&size, sizeof(size_t), 1, pFile); - m_firstCodes.resize(size); - read += std::fread(&m_firstCodes[0], sizeof(size_t), size, pFile); - - read += std::fread(&size, sizeof(size_t), 1, pFile); - m_lengthIndex.resize(size); - read += std::fread(&m_lengthIndex[0], sizeof(size_t), size, pFile); - - return std::ftell(pFile) - start; - } - - size_t Save(std::FILE* pFile) - { - size_t start = std::ftell(pFile); - - size_t size = m_symbols.size(); - ThrowingFwrite(&size, sizeof(size_t), 1, pFile); - ThrowingFwrite(&m_symbols[0], sizeof(Data), size, pFile); - - size = m_firstCodes.size(); - ThrowingFwrite(&size, sizeof(size_t), 1, pFile); - ThrowingFwrite(&m_firstCodes[0], sizeof(size_t), size, pFile); - - size = m_lengthIndex.size(); - ThrowingFwrite(&size, sizeof(size_t), 1, pFile); - ThrowingFwrite(&m_lengthIndex[0], sizeof(size_t), size, pFile); - - return std::ftell(pFile) - start; + template + void PutCode(BitWrapper& bitWrapper, boost::dynamic_bitset<>& code) { + for(int j = code.size()-1; j >= 0; j--) + bitWrapper.Put(code[j]); + } + +public: + + template + CanonicalHuffman(Iterator begin, Iterator end, bool forEncoding = true) { + std::vector lengths; + CalcLengths(begin, end, lengths); + CalcCodes(lengths); + + if(forEncoding) + CreateCodeMap(); + } + + CanonicalHuffman(std::FILE* pFile, bool forEncoding = false) { + Load(pFile); + + if(forEncoding) + CreateCodeMap(); + } + + template + void Put(BitWrapper& bitWrapper, Data data) { + PutCode(bitWrapper, Encode(data)); + } + + template + Data Read(BitWrapper& bitWrapper) { + if(bitWrapper.TellFromEnd()) { + size_t intCode = bitWrapper.Read(); + size_t len = 1; + while(intCode < m_firstCodes[len]) { + intCode = 2 * intCode + bitWrapper.Read(); + len++; + } + return m_symbols[m_lengthIndex[len] + (intCode - m_firstCodes[len])]; } + return Data(); + } + + size_t Load(std::FILE* pFile) { + size_t start = std::ftell(pFile); + size_t read = 0; 
+ + size_t size; + read += std::fread(&size, sizeof(size_t), 1, pFile); + m_symbols.resize(size); + read += std::fread(&m_symbols[0], sizeof(Data), size, pFile); + + read += std::fread(&size, sizeof(size_t), 1, pFile); + m_firstCodes.resize(size); + read += std::fread(&m_firstCodes[0], sizeof(size_t), size, pFile); + + read += std::fread(&size, sizeof(size_t), 1, pFile); + m_lengthIndex.resize(size); + read += std::fread(&m_lengthIndex[0], sizeof(size_t), size, pFile); + + return std::ftell(pFile) - start; + } + + size_t Save(std::FILE* pFile) { + size_t start = std::ftell(pFile); + + size_t size = m_symbols.size(); + ThrowingFwrite(&size, sizeof(size_t), 1, pFile); + ThrowingFwrite(&m_symbols[0], sizeof(Data), size, pFile); + + size = m_firstCodes.size(); + ThrowingFwrite(&size, sizeof(size_t), 1, pFile); + ThrowingFwrite(&m_firstCodes[0], sizeof(size_t), size, pFile); + + size = m_lengthIndex.size(); + ThrowingFwrite(&size, sizeof(size_t), 1, pFile); + ThrowingFwrite(&m_lengthIndex[0], sizeof(size_t), size, pFile); + + return std::ftell(pFile) - start; + } }; template class BitWrapper { - private: - Container& m_data; - - typename Container::iterator m_iterator; - typename Container::value_type m_currentValue; - - size_t m_valueBits; - typename Container::value_type m_mask; - size_t m_bitPos; - - public: - - BitWrapper(Container &data) +private: + Container& m_data; + + typename Container::iterator m_iterator; + typename Container::value_type m_currentValue; + + size_t m_valueBits; + typename Container::value_type m_mask; + size_t m_bitPos; + +public: + + BitWrapper(Container &data) : m_data(data), m_iterator(m_data.begin()), m_currentValue(0), m_valueBits(sizeof(typename Container::value_type) * 8), m_mask(1), m_bitPos(0) { } - - bool Read() - { - if(m_bitPos % m_valueBits == 0) - { - if(m_iterator != m_data.end()) - m_currentValue = *m_iterator++; - } - else - m_currentValue = m_currentValue >> 1; - - m_bitPos++; - return (m_currentValue & m_mask); - } - - void Put(bool bit) { - if(m_bitPos % m_valueBits == 0) - m_data.push_back(0); - - if(bit) - m_data[m_data.size()-1] |= m_mask << (m_bitPos % m_valueBits); - - m_bitPos++; - } - - size_t Tell() - { - return m_bitPos; - } - - size_t TellFromEnd() - { - if(m_data.size() * m_valueBits < m_bitPos) - return 0; - return m_data.size() * m_valueBits - m_bitPos; - } - - void Seek(size_t bitPos) - { - m_bitPos = bitPos; - m_iterator = m_data.begin() + int((m_bitPos-1)/m_valueBits); - m_currentValue = (*m_iterator) >> ((m_bitPos-1) % m_valueBits); - m_iterator++; - } - - void SeekFromEnd(size_t bitPosFromEnd) - { - size_t bitPos = m_data.size() * m_valueBits - bitPosFromEnd; - Seek(bitPos); - } - - void Reset() - { - m_iterator = m_data.begin(); - m_currentValue = 0; - m_bitPos = 0; - } - - Container& GetContainer() - { - return m_data; - } + + bool Read() { + if(m_bitPos % m_valueBits == 0) { + if(m_iterator != m_data.end()) + m_currentValue = *m_iterator++; + } else + m_currentValue = m_currentValue >> 1; + + m_bitPos++; + return (m_currentValue & m_mask); + } + + void Put(bool bit) { + if(m_bitPos % m_valueBits == 0) + m_data.push_back(0); + + if(bit) + m_data[m_data.size()-1] |= m_mask << (m_bitPos % m_valueBits); + + m_bitPos++; + } + + size_t Tell() { + return m_bitPos; + } + + size_t TellFromEnd() { + if(m_data.size() * m_valueBits < m_bitPos) + return 0; + return m_data.size() * m_valueBits - m_bitPos; + } + + void Seek(size_t bitPos) { + m_bitPos = bitPos; + m_iterator = m_data.begin() + int((m_bitPos-1)/m_valueBits); + m_currentValue = 
(*m_iterator) >> ((m_bitPos-1) % m_valueBits); + m_iterator++; + } + + void SeekFromEnd(size_t bitPosFromEnd) { + size_t bitPos = m_data.size() * m_valueBits - bitPosFromEnd; + Seek(bitPos); + } + + void Reset() { + m_iterator = m_data.begin(); + m_currentValue = 0; + m_bitPos = 0; + } + + Container& GetContainer() { + return m_data; + } }; } diff --git a/moses/TranslationModel/CompactPT/CmphStringVectorAdapter.cpp b/moses/TranslationModel/CompactPT/CmphStringVectorAdapter.cpp index 40fff6690..8e4d1641f 100644 --- a/moses/TranslationModel/CompactPT/CmphStringVectorAdapter.cpp +++ b/moses/TranslationModel/CompactPT/CmphStringVectorAdapter.cpp @@ -1,23 +1,23 @@ -// $Id$ -// vim:tabstop=2 -/*********************************************************************** -Moses - factored phrase-based language decoder -Copyright (C) 2006 University of Edinburgh - -This library is free software; you can redistribute it and/or -modify it under the terms of the GNU Lesser General Public -License as published by the Free Software Foundation; either -version 2.1 of the License, or (at your option) any later version. - -This library is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -Lesser General Public License for more details. - -You should have received a copy of the GNU Lesser General Public -License along with this library; if not, write to the Free Software -Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -***********************************************************************/ +// $Id$ +// vim:tabstop=2 +/*********************************************************************** +Moses - factored phrase-based language decoder +Copyright (C) 2006 University of Edinburgh + +This library is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License as published by the Free Software Foundation; either +version 2.1 of the License, or (at your option) any later version. + +This library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Lesser General Public License for more details. 
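The CanonicalHuffman/BitWrapper pair reflowed above is self-contained, so a round trip is easy to sketch. The toy symbol counts below are invented; the decode loop mirrors how LexicalReorderingTableCompact::GetScore() later in this patch reads a fixed number of symbols from a fresh BitWrapper over the stored byte string.

#include <cstddef>
#include <iostream>
#include <map>
#include <string>
#include "moses/TranslationModel/CompactPT/CanonicalHuffman.h"

int main()
{
  // Build canonical codes from (symbol, count) pairs; 'a' is most frequent
  // and therefore receives the shortest code.
  std::map<char, std::size_t> counts;
  counts['a'] = 5; counts['b'] = 2; counts['c'] = 1;
  Moses::CanonicalHuffman<char> huff(counts.begin(), counts.end());

  // Encode into a string-backed bit stream (bits packed LSB-first per byte).
  std::string buffer;
  Moses::BitWrapper<> writer(buffer);
  const std::string msg = "abacab";
  for (std::size_t i = 0; i < msg.size(); ++i)
    huff.Put(writer, msg[i]);

  // Decode a known number of symbols from a fresh reader over the buffer;
  // Read() walks code lengths via m_firstCodes until a code is complete.
  Moses::BitWrapper<> reader(buffer);
  std::string decoded;
  for (std::size_t i = 0; i < msg.size(); ++i)
    decoded += huff.Read(reader);

  std::cout << decoded << std::endl;   // prints "abacab"
  return 0;
}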
+ +You should have received a copy of the GNU Lesser General Public +License along with this library; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ #ifdef HAVE_CMPH @@ -25,70 +25,70 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA namespace Moses { - - void CmphStringVectorAdapterDispose(void *data, char *key, cmph_uint32 keylen) - { - delete[] key; - } - void CmphStringVectorAdapterRewind(void *data) - { - cmph_vector_t *cmph_vector = (cmph_vector_t *)data; - cmph_vector->position = 0; - } - - //************************************************************************// - - cmph_io_adapter_t *CmphVectorAdapterNew(std::vector& v) - { - cmph_io_adapter_t * key_source = (cmph_io_adapter_t *)malloc(sizeof(cmph_io_adapter_t)); - cmph_vector_t * cmph_vector = (cmph_vector_t *)malloc(sizeof(cmph_vector_t)); - assert(key_source); - assert(cmph_vector); - - cmph_vector->vector = (void *)&v; - cmph_vector->position = 0; - key_source->data = (void *)cmph_vector; - key_source->nkeys = v.size(); - - return key_source; - } +void CmphStringVectorAdapterDispose(void *data, char *key, cmph_uint32 keylen) +{ + delete[] key; +} - int CmphVectorAdapterRead(void *data, char **key, cmph_uint32 *keylen) - { - cmph_vector_t *cmph_vector = (cmph_vector_t *)data; - std::vector* v = (std::vector*)cmph_vector->vector; - size_t size; - *keylen = (*v)[cmph_vector->position].size(); - size = *keylen; - *key = new char[size + 1]; - std::string temp = (*v)[cmph_vector->position]; - strcpy(*key, temp.c_str()); - cmph_vector->position = cmph_vector->position + 1; - return (int)(*keylen); - } - - void CmphVectorAdapterDispose(void *data, char *key, cmph_uint32 keylen) - { - delete[] key; - } +void CmphStringVectorAdapterRewind(void *data) +{ + cmph_vector_t *cmph_vector = (cmph_vector_t *)data; + cmph_vector->position = 0; +} - void CmphVectorAdapterRewind(void *data) - { - cmph_vector_t *cmph_vector = (cmph_vector_t *)data; - cmph_vector->position = 0; - } +//************************************************************************// + +cmph_io_adapter_t *CmphVectorAdapterNew(std::vector& v) +{ + cmph_io_adapter_t * key_source = (cmph_io_adapter_t *)malloc(sizeof(cmph_io_adapter_t)); + cmph_vector_t * cmph_vector = (cmph_vector_t *)malloc(sizeof(cmph_vector_t)); + assert(key_source); + assert(cmph_vector); + + cmph_vector->vector = (void *)&v; + cmph_vector->position = 0; + key_source->data = (void *)cmph_vector; + key_source->nkeys = v.size(); + + return key_source; +} + +int CmphVectorAdapterRead(void *data, char **key, cmph_uint32 *keylen) +{ + cmph_vector_t *cmph_vector = (cmph_vector_t *)data; + std::vector* v = (std::vector*)cmph_vector->vector; + size_t size; + *keylen = (*v)[cmph_vector->position].size(); + size = *keylen; + *key = new char[size + 1]; + std::string temp = (*v)[cmph_vector->position]; + strcpy(*key, temp.c_str()); + cmph_vector->position = cmph_vector->position + 1; + return (int)(*keylen); +} + +void CmphVectorAdapterDispose(void *data, char *key, cmph_uint32 keylen) +{ + delete[] key; +} + +void CmphVectorAdapterRewind(void *data) +{ + cmph_vector_t *cmph_vector = (cmph_vector_t *)data; + cmph_vector->position = 0; +} + +cmph_io_adapter_t* CmphVectorAdapter(std::vector& v) +{ + cmph_io_adapter_t * key_source = CmphVectorAdapterNew(v); + + key_source->read = CmphVectorAdapterRead; + key_source->dispose = CmphVectorAdapterDispose; + 
key_source->rewind = CmphVectorAdapterRewind; + return key_source; +} - cmph_io_adapter_t* CmphVectorAdapter(std::vector& v) - { - cmph_io_adapter_t * key_source = CmphVectorAdapterNew(v); - - key_source->read = CmphVectorAdapterRead; - key_source->dispose = CmphVectorAdapterDispose; - key_source->rewind = CmphVectorAdapterRewind; - return key_source; - } - } #endif diff --git a/moses/TranslationModel/CompactPT/CmphStringVectorAdapter.h b/moses/TranslationModel/CompactPT/CmphStringVectorAdapter.h index 5516d4f4d..4a532c289 100644 --- a/moses/TranslationModel/CompactPT/CmphStringVectorAdapter.h +++ b/moses/TranslationModel/CompactPT/CmphStringVectorAdapter.h @@ -1,23 +1,23 @@ -// $Id$ -// vim:tabstop=2 -/*********************************************************************** -Moses - factored phrase-based language decoder -Copyright (C) 2006 University of Edinburgh - -This library is free software; you can redistribute it and/or -modify it under the terms of the GNU Lesser General Public -License as published by the Free Software Foundation; either -version 2.1 of the License, or (at your option) any later version. - -This library is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -Lesser General Public License for more details. - -You should have received a copy of the GNU Lesser General Public -License along with this library; if not, write to the Free Software -Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -***********************************************************************/ +// $Id$ +// vim:tabstop=2 +/*********************************************************************** +Moses - factored phrase-based language decoder +Copyright (C) 2006 University of Edinburgh + +This library is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License as published by the Free Software Foundation; either +version 2.1 of the License, or (at your option) any later version. + +This library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Lesser General Public License for more details. 
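The adapters above exist so that cmph can iterate a Moses container in place instead of reading keys from a file. Assuming the standard cmph C API (cmph_config_new and friends) and a build with HAVE_CMPH, building and probing a minimal perfect hash over an in-memory vector looks like this; the algorithm choice is illustrative, and error handling plus freeing of the malloc'd adapter are omitted for brevity.

#include <string>
#include <vector>
#include "cmph.h"
#include "moses/TranslationModel/CompactPT/CmphStringVectorAdapter.h"

// Sketch: build a minimal perfect hash over in-memory keys and query it.
unsigned PositionOf(std::vector<std::string>& keys, const std::string& probe)
{
  cmph_io_adapter_t* source = Moses::CmphVectorAdapter(keys);

  cmph_config_t* config = cmph_config_new(source);
  cmph_config_set_algo(config, CMPH_CHD);   // algorithm choice illustrative
  cmph_t* hash = cmph_new(config);
  cmph_config_destroy(config);

  unsigned id = cmph_search(hash, probe.c_str(), (cmph_uint32)probe.size());
  cmph_destroy(hash);
  return id;   // meaningful only if 'probe' was one of 'keys'
}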
+ +You should have received a copy of the GNU Lesser General Public +License along with this library; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ #ifndef moses_CmphStringVectorAdapterNew_h #define moses_CmphStringVectorAdapterNew_h @@ -33,72 +33,71 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA namespace Moses { - typedef struct - { - void *vector; - cmph_uint32 position; - } - cmph_vector_t; - - - template class Allocator> - cmph_io_adapter_t *CmphStringVectorAdapterNew(StringVector& sv) - { - cmph_io_adapter_t * key_source = (cmph_io_adapter_t *)malloc(sizeof(cmph_io_adapter_t)); - cmph_vector_t * cmph_vector = (cmph_vector_t *)malloc(sizeof(cmph_vector_t)); - assert(key_source); - assert(cmph_vector); - - cmph_vector->vector = (void *)&sv; - cmph_vector->position = 0; - key_source->data = (void *)cmph_vector; - key_source->nkeys = sv.size(); - - return key_source; - } +typedef struct { + void *vector; + cmph_uint32 position; +} +cmph_vector_t; - template class Allocator> - int CmphStringVectorAdapterRead(void *data, char **key, cmph_uint32 *keylen) - { - cmph_vector_t *cmph_vector = (cmph_vector_t *)data; - StringVector* sv = (StringVector*)cmph_vector->vector; - size_t size; - *keylen = (*sv)[cmph_vector->position].size(); - size = *keylen; - *key = new char[size + 1]; - std::string temp = (*sv)[cmph_vector->position]; - std::strcpy(*key, temp.c_str()); - cmph_vector->position = cmph_vector->position + 1; - return (int)(*keylen); - } - - void CmphStringVectorAdapterDispose(void *data, char *key, cmph_uint32 keylen); - void CmphStringVectorAdapterRewind(void *data); +template class Allocator> +cmph_io_adapter_t *CmphStringVectorAdapterNew(StringVector& sv) +{ + cmph_io_adapter_t * key_source = (cmph_io_adapter_t *)malloc(sizeof(cmph_io_adapter_t)); + cmph_vector_t * cmph_vector = (cmph_vector_t *)malloc(sizeof(cmph_vector_t)); + assert(key_source); + assert(cmph_vector); - template class Allocator> - cmph_io_adapter_t* CmphStringVectorAdapter(StringVector& sv) - { - cmph_io_adapter_t * key_source = CmphStringVectorAdapterNew(sv); - - key_source->read = CmphStringVectorAdapterRead; - key_source->dispose = CmphStringVectorAdapterDispose; - key_source->rewind = CmphStringVectorAdapterRewind; - return key_source; - } - - //************************************************************************// - - cmph_io_adapter_t *CmphVectorAdapterNew(std::vector& v); + cmph_vector->vector = (void *)&sv; + cmph_vector->position = 0; + key_source->data = (void *)cmph_vector; + key_source->nkeys = sv.size(); - int CmphVectorAdapterRead(void *data, char **key, cmph_uint32 *keylen); - - void CmphVectorAdapterDispose(void *data, char *key, cmph_uint32 keylen); + return key_source; +} - void CmphVectorAdapterRewind(void *data); +template class Allocator> +int CmphStringVectorAdapterRead(void *data, char **key, cmph_uint32 *keylen) +{ + cmph_vector_t *cmph_vector = (cmph_vector_t *)data; + StringVector* sv = (StringVector*)cmph_vector->vector; + size_t size; + *keylen = (*sv)[cmph_vector->position].size(); + size = *keylen; + *key = new char[size + 1]; + std::string temp = (*sv)[cmph_vector->position]; + std::strcpy(*key, temp.c_str()); + cmph_vector->position = cmph_vector->position + 1; + return (int)(*keylen); +} + +void CmphStringVectorAdapterDispose(void *data, char *key, cmph_uint32 keylen); + +void 
CmphStringVectorAdapterRewind(void *data); + +template class Allocator> +cmph_io_adapter_t* CmphStringVectorAdapter(StringVector& sv) +{ + cmph_io_adapter_t * key_source = CmphStringVectorAdapterNew(sv); + + key_source->read = CmphStringVectorAdapterRead; + key_source->dispose = CmphStringVectorAdapterDispose; + key_source->rewind = CmphStringVectorAdapterRewind; + return key_source; +} + +//************************************************************************// + +cmph_io_adapter_t *CmphVectorAdapterNew(std::vector& v); + +int CmphVectorAdapterRead(void *data, char **key, cmph_uint32 *keylen); + +void CmphVectorAdapterDispose(void *data, char *key, cmph_uint32 keylen); + +void CmphVectorAdapterRewind(void *data); + +cmph_io_adapter_t* CmphVectorAdapter(std::vector& v); - cmph_io_adapter_t* CmphVectorAdapter(std::vector& v); - } #endif diff --git a/moses/TranslationModel/CompactPT/ConsistentPhrases.h b/moses/TranslationModel/CompactPT/ConsistentPhrases.h index 0ec86e1ac..c7b7c733b 100644 --- a/moses/TranslationModel/CompactPT/ConsistentPhrases.h +++ b/moses/TranslationModel/CompactPT/ConsistentPhrases.h @@ -1,23 +1,23 @@ -// $Id$ -// vim:tabstop=2 -/*********************************************************************** -Moses - factored phrase-based language decoder -Copyright (C) 2006 University of Edinburgh - -This library is free software; you can redistribute it and/or -modify it under the terms of the GNU Lesser General Public -License as published by the Free Software Foundation; either -version 2.1 of the License, or (at your option) any later version. - -This library is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -Lesser General Public License for more details. - -You should have received a copy of the GNU Lesser General Public -License along with this library; if not, write to the Free Software -Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -***********************************************************************/ +// $Id$ +// vim:tabstop=2 +/*********************************************************************** +Moses - factored phrase-based language decoder +Copyright (C) 2006 University of Edinburgh + +This library is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License as published by the Free Software Foundation; either +version 2.1 of the License, or (at your option) any later version. + +This library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Lesser General Public License for more details. 
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
 
 #ifndef moses_ConsistentPhrases_h
 #define moses_ConsistentPhrases_h
@@ -29,97 +29,82 @@ namespace Moses
 
 class ConsistentPhrases
 {
-  public:
-    struct Phrase
-    {
-      int i, j, m, n;
-      Phrase(int i_, int m_, int j_, int n_) : i(i_), j(j_), m(m_), n(n_) { }
-    };
-
-    struct PhraseSorter
-    {
-      bool operator()(Phrase a, Phrase b)
-      {
-        if(a.n > b.n)
-          return true;
-        if(a.n == b.n && a.j < b.j)
-          return true;
-        if(a.n == b.n && a.j == b.j && a.m > b.m)
-          return true;
-        if(a.n == b.n && a.j == b.j && a.m == b.m && a.i < b.i)
-          return true;
-        return false;
-      }
-    };
-
-  private:
-    typedef std::set<Phrase, PhraseSorter> PhraseQueue;
-    PhraseQueue m_phraseQueue;
-
-    typedef std::pair<size_t, size_t> AlignPoint;
-    typedef std::set<AlignPoint> Alignment;
-
-  public:
-
-    ConsistentPhrases(int mmax, int nmax, Alignment& a)
-    {
-      for(int i = 0; i < mmax; i++)
-      {
-        for(int m = 1; m <= mmax-i; m++)
-        {
-          for(int j = 0; j < nmax; j++)
-          {
-            for(int n = 1; n <= nmax-j; n++)
-            {
-              bool consistant = true;
-              for(Alignment::iterator it = a.begin(); it != a.end(); it++)
-              {
-                int ip = it->first;
-                int jp = it->second;
-                if((i <= ip && ip < i+m) != (j <= jp && jp < j+n))
-                {
-                  consistant = false;
-                  break;
-                }
+public:
+  struct Phrase {
+    int i, j, m, n;
+    Phrase(int i_, int m_, int j_, int n_) : i(i_), j(j_), m(m_), n(n_) { }
+  };
+
+  struct PhraseSorter {
+    bool operator()(Phrase a, Phrase b) {
+      if(a.n > b.n)
+        return true;
+      if(a.n == b.n && a.j < b.j)
+        return true;
+      if(a.n == b.n && a.j == b.j && a.m > b.m)
+        return true;
+      if(a.n == b.n && a.j == b.j && a.m == b.m && a.i < b.i)
+        return true;
+      return false;
+    }
+  };
+
+private:
+  typedef std::set<Phrase, PhraseSorter> PhraseQueue;
+  PhraseQueue m_phraseQueue;
+
+  typedef std::pair<size_t, size_t> AlignPoint;
+  typedef std::set<AlignPoint> Alignment;
+
+public:
+
+  ConsistentPhrases(int mmax, int nmax, Alignment& a) {
+    for(int i = 0; i < mmax; i++) {
+      for(int m = 1; m <= mmax-i; m++) {
+        for(int j = 0; j < nmax; j++) {
+          for(int n = 1; n <= nmax-j; n++) {
+            bool consistent = true;
+            for(Alignment::iterator it = a.begin(); it != a.end(); it++) {
+              int ip = it->first;
+              int jp = it->second;
+              if((i <= ip && ip < i+m) != (j <= jp && jp < j+n)) {
+                consistent = false;
+                break;
              }
-            if(consistant)
-              m_phraseQueue.insert(Phrase(i, m, j, n));
-          }
-        }
-      }
+            }
+            if(consistent)
+              m_phraseQueue.insert(Phrase(i, m, j, n));
+          }
+        }
       }
-      m_phraseQueue.erase(Phrase(0, mmax, 0, nmax));
     }
-
-    size_t Empty()
-    {
-      return !m_phraseQueue.size();
+    m_phraseQueue.erase(Phrase(0, mmax, 0, nmax));
+  }
+
+  size_t Empty() {
+    return !m_phraseQueue.size();
+  }
+
+  Phrase Pop() {
+    if(m_phraseQueue.size()) {
+      Phrase p = *m_phraseQueue.begin();
+      m_phraseQueue.erase(m_phraseQueue.begin());
+      return p;
     }
-
-    Phrase Pop()
-    {
-      if(m_phraseQueue.size())
-      {
-        Phrase p = *m_phraseQueue.begin();
-        m_phraseQueue.erase(m_phraseQueue.begin());
-        return p;
-      }
-      return Phrase(0,0,0,0);
+    return Phrase(0,0,0,0);
+  }
+
+  void RemoveOverlap(Phrase p) {
+    PhraseQueue ok;
+    for(PhraseQueue::iterator it = m_phraseQueue.begin(); it != m_phraseQueue.end(); it++) {
+      Phrase pp = *it;
+      if(!((p.i <= pp.i && pp.i < p.i + p.m) || (pp.i <= p.i && p.i < pp.i + pp.m) ||
+           (p.j <= pp.j && pp.j < p.j + p.n) || (pp.j <= p.j && p.j < pp.j + pp.n)))
+        ok.insert(pp);
     }
-
-    void RemoveOverlap(Phrase p)
-    {
-      PhraseQueue ok;
-      for(PhraseQueue::iterator it = m_phraseQueue.begin(); it != m_phraseQueue.end(); it++)
-      {
-        Phrase pp = *it;
-        if(!((p.i <= pp.i && pp.i < p.i + p.m) || (pp.i <= p.i && p.i < pp.i + pp.m) ||
-             (p.j <= pp.j && pp.j < p.j + p.n) || (pp.j <= p.j && p.j < pp.j + pp.n)))
-          ok.insert(pp);
-      }
-      m_phraseQueue = ok;
-    }
-
+    m_phraseQueue = ok;
+  }
+
 };
 
}
diff --git a/moses/TranslationModel/CompactPT/LexicalReorderingTableCompact.cpp b/moses/TranslationModel/CompactPT/LexicalReorderingTableCompact.cpp
index ff1c663c9..ad7591a7b 100644
--- a/moses/TranslationModel/CompactPT/LexicalReorderingTableCompact.cpp
+++ b/moses/TranslationModel/CompactPT/LexicalReorderingTableCompact.cpp
@@ -1,27 +1,28 @@
-// $Id$
-// vim:tabstop=2
-/***********************************************************************
-Moses - factored phrase-based language decoder
-Copyright (C) 2006 University of Edinburgh
-
-This library is free software; you can redistribute it and/or
-modify it under the terms of the GNU Lesser General Public
-License as published by the Free Software Foundation; either
-version 2.1 of the License, or (at your option) any later version.
-
-This library is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-Lesser General Public License for more details.
-
-You should have received a copy of the GNU Lesser General Public
-License along with this library; if not, write to the Free Software
-Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-***********************************************************************/
+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
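ConsistentPhrases, reflowed above, enumerates every alignment-consistent box up to (mmax, nmax), widest target span first, and supports greedy extraction by alternating Pop() and RemoveOverlap(). A toy driver follows; the alignment points are invented, and it assumes the set-of-index-pairs alignment type reconstructed above.

#include <cstddef>
#include <iostream>
#include <set>
#include <utility>
#include "moses/TranslationModel/CompactPT/ConsistentPhrases.h"

int main()
{
  // A 2x2 sentence pair aligned on the diagonal: (0,0) and (1,1).
  std::set<std::pair<size_t, size_t> > align;
  align.insert(std::make_pair(0, 0));
  align.insert(std::make_pair(1, 1));

  Moses::ConsistentPhrases cp(2, 2, align);
  while (!cp.Empty()) {
    // Largest remaining consistent box; the full box was erased up front.
    Moses::ConsistentPhrases::Phrase p = cp.Pop();
    std::cout << "src [" << p.i << "," << p.i + p.m << ") "
              << "trg [" << p.j << "," << p.j + p.n << ")" << std::endl;
    cp.RemoveOverlap(p);   // greedy: discard every overlapping box
  }
  return 0;
}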
+ +You should have received a copy of the GNU Lesser General Public +License along with this library; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ #include "LexicalReorderingTableCompact.h" -namespace Moses { +namespace Moses +{ LexicalReorderingTableCompact::LexicalReorderingTableCompact( const std::string& filePath, @@ -29,9 +30,9 @@ LexicalReorderingTableCompact::LexicalReorderingTableCompact( const std::vector& e_factors, const std::vector& c_factors) : LexicalReorderingTable(f_factors, e_factors, c_factors), - m_inMemory(StaticData::Instance().UseMinlexrInMemory()), - m_numScoreComponent(6), m_multipleScoreTrees(true), - m_hash(10, 16), m_scoreTrees(1) + m_inMemory(StaticData::Instance().UseMinlexrInMemory()), + m_numScoreComponent(6), m_multipleScoreTrees(true), + m_hash(10, 16), m_scoreTrees(1) { Load(filePath); } @@ -41,12 +42,13 @@ LexicalReorderingTableCompact::LexicalReorderingTableCompact( const std::vector& e_factors, const std::vector& c_factors) : LexicalReorderingTable(f_factors, e_factors, c_factors), - m_inMemory(StaticData::Instance().UseMinlexrInMemory()), - m_numScoreComponent(6), m_multipleScoreTrees(true), - m_hash(10, 16), m_scoreTrees(1) + m_inMemory(StaticData::Instance().UseMinlexrInMemory()), + m_numScoreComponent(6), m_multipleScoreTrees(true), + m_hash(10, 16), m_scoreTrees(1) { } -LexicalReorderingTableCompact::~LexicalReorderingTableCompact() { +LexicalReorderingTableCompact::~LexicalReorderingTableCompact() +{ for(size_t i = 0; i < m_scoreTrees.size(); i++) delete m_scoreTrees[i]; } @@ -57,25 +59,23 @@ std::vector LexicalReorderingTableCompact::GetScore(const Phrase& f, { std::string key; Scores scores; - + if(0 == c.GetSize()) key = MakeKey(f, e, c); else - for(size_t i = 0; i <= c.GetSize(); ++i) - { + for(size_t i = 0; i <= c.GetSize(); ++i) { Phrase sub_c(c.GetSubString(WordsRange(i,c.GetSize()-1))); key = MakeKey(f,e,sub_c); } - + size_t index = m_hash[key]; - if(m_hash.GetSize() != index) - { + if(m_hash.GetSize() != index) { std::string scoresString; if(m_inMemory) scoresString = m_scoresMemory[index]; else scoresString = m_scoresMapped[index]; - + BitWrapper<> bitStream(scoresString); for(size_t i = 0; i < m_numScoreComponent; i++) scores.push_back(m_scoreTrees[m_multipleScoreTrees ? 
i : 0]->Read(bitStream)); @@ -100,22 +100,17 @@ std::string LexicalReorderingTableCompact::MakeKey(const std::string& f, const std::string& c) const { std::string key; - if(!f.empty()) - { + if(!f.empty()) { key += f; } - if(!m_FactorsE.empty()) - { - if(!key.empty()) - { + if(!m_FactorsE.empty()) { + if(!key.empty()) { key += " ||| "; } key += e; } - if(!m_FactorsC.empty()) - { - if(!key.empty()) - { + if(!m_FactorsC.empty()) { + if(!key.empty()) { key += " ||| "; } key += c; @@ -133,48 +128,43 @@ LexicalReorderingTable* LexicalReorderingTableCompact::CheckAndLoad( #ifdef HAVE_CMPH std::string minlexr = ".minlexr"; // file name is specified without suffix - if(FileExists(filePath + minlexr)) - { + if(FileExists(filePath + minlexr)) { //there exists a compact binary version use that - VERBOSE(2,"Using compact lexical reordering table" << std::endl); - return new LexicalReorderingTableCompact(filePath + minlexr, f_factors, e_factors, c_factors); + VERBOSE(2,"Using compact lexical reordering table" << std::endl); + return new LexicalReorderingTableCompact(filePath + minlexr, f_factors, e_factors, c_factors); } // file name is specified with suffix if(filePath.substr(filePath.length() - minlexr.length(), minlexr.length()) == minlexr - && FileExists(filePath)) - { + && FileExists(filePath)) { //there exists a compact binary version use that - VERBOSE(2,"Using compact lexical reordering table" << std::endl); - return new LexicalReorderingTableCompact(filePath, f_factors, e_factors, c_factors); + VERBOSE(2,"Using compact lexical reordering table" << std::endl); + return new LexicalReorderingTableCompact(filePath, f_factors, e_factors, c_factors); } #endif return 0; } void LexicalReorderingTableCompact::Load(std::string filePath) -{ +{ std::FILE* pFile = std::fopen(filePath.c_str(), "r"); if(m_inMemory) m_hash.Load(pFile); else m_hash.LoadIndex(pFile); - + size_t read = 0; read += std::fread(&m_numScoreComponent, sizeof(m_numScoreComponent), 1, pFile); read += std::fread(&m_multipleScoreTrees, sizeof(m_multipleScoreTrees), 1, pFile); - - if(m_multipleScoreTrees) - { + + if(m_multipleScoreTrees) { m_scoreTrees.resize(m_numScoreComponent); for(size_t i = 0; i < m_numScoreComponent; i++) m_scoreTrees[i] = new CanonicalHuffman(pFile); - } - else - { + } else { m_scoreTrees.resize(1); m_scoreTrees[0] = new CanonicalHuffman(pFile); } - + if(m_inMemory) m_scoresMemory.load(pFile, false); else diff --git a/moses/TranslationModel/CompactPT/LexicalReorderingTableCompact.h b/moses/TranslationModel/CompactPT/LexicalReorderingTableCompact.h index 849c61c08..46f2228c9 100644 --- a/moses/TranslationModel/CompactPT/LexicalReorderingTableCompact.h +++ b/moses/TranslationModel/CompactPT/LexicalReorderingTableCompact.h @@ -1,23 +1,23 @@ -// $Id$ -// vim:tabstop=2 -/*********************************************************************** -Moses - factored phrase-based language decoder -Copyright (C) 2006 University of Edinburgh - -This library is free software; you can redistribute it and/or -modify it under the terms of the GNU Lesser General Public -License as published by the Free Software Foundation; either -version 2.1 of the License, or (at your option) any later version. - -This library is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -Lesser General Public License for more details. 
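GetScore() above reduces a lookup to one flat string key, one perfect-hash probe, and m_numScoreComponent Huffman reads. The key layout is the usual " ||| "-separated Moses triple; a stand-alone equivalent of MakeKey() for the plain source/target case is sketched below (the function name is invented).

#include <string>

// Hypothetical stand-alone version of MakeKey() above for the common
// source/target case: non-empty fields joined by the " ||| " separator.
std::string MakeLexROKey(const std::string& f, const std::string& e)
{
  std::string key;
  if (!f.empty())
    key += f;
  if (!e.empty()) {
    if (!key.empty())
      key += " ||| ";
    key += e;
  }
  return key;
}

// MakeLexROKey("das Haus", "the house") yields "das Haus ||| the house",
// the form of string that BlockHashIndex hashes in GetScore().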
- -You should have received a copy of the GNU Lesser General Public -License along with this library; if not, write to the Free Software -Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -***********************************************************************/ +// $Id$ +// vim:tabstop=2 +/*********************************************************************** +Moses - factored phrase-based language decoder +Copyright (C) 2006 University of Edinburgh + +This library is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License as published by the Free Software Foundation; either +version 2.1 of the License, or (at your option) any later version. + +This library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Lesser General Public License for more details. + +You should have received a copy of the GNU Lesser General Public +License along with this library; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ #ifndef moses_LexicalReorderingTableCompact_h #define moses_LexicalReorderingTableCompact_h @@ -33,50 +33,51 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #include "CanonicalHuffman.h" #include "StringVector.h" -namespace Moses { +namespace Moses +{ class LexicalReorderingTableCompact: public LexicalReorderingTable { - private: - bool m_inMemory; - - size_t m_numScoreComponent; - bool m_multipleScoreTrees; - - BlockHashIndex m_hash; - - typedef CanonicalHuffman ScoreTree; - std::vector m_scoreTrees; - - StringVector m_scoresMapped; - StringVector m_scoresMemory; +private: + bool m_inMemory; - std::string MakeKey(const Phrase& f, const Phrase& e, const Phrase& c) const; - std::string MakeKey(const std::string& f, const std::string& e, const std::string& c) const; - - public: - LexicalReorderingTableCompact( - const std::string& filePath, - const std::vector& f_factors, - const std::vector& e_factors, - const std::vector& c_factors); - - LexicalReorderingTableCompact( - const std::vector& f_factors, - const std::vector& e_factors, - const std::vector& c_factors); - - virtual ~LexicalReorderingTableCompact(); + size_t m_numScoreComponent; + bool m_multipleScoreTrees; - virtual std::vector GetScore(const Phrase& f, const Phrase& e, const Phrase& c); - - static LexicalReorderingTable* CheckAndLoad( - const std::string& filePath, - const std::vector& f_factors, - const std::vector& e_factors, - const std::vector& c_factors); - - void Load(std::string filePath); + BlockHashIndex m_hash; + + typedef CanonicalHuffman ScoreTree; + std::vector m_scoreTrees; + + StringVector m_scoresMapped; + StringVector m_scoresMemory; + + std::string MakeKey(const Phrase& f, const Phrase& e, const Phrase& c) const; + std::string MakeKey(const std::string& f, const std::string& e, const std::string& c) const; + +public: + LexicalReorderingTableCompact( + const std::string& filePath, + const std::vector& f_factors, + const std::vector& e_factors, + const std::vector& c_factors); + + LexicalReorderingTableCompact( + const std::vector& f_factors, + const std::vector& e_factors, + const std::vector& c_factors); + + virtual ~LexicalReorderingTableCompact(); + + virtual std::vector GetScore(const Phrase& f, const Phrase& e, const Phrase& c); 
+ + static LexicalReorderingTable* CheckAndLoad( + const std::string& filePath, + const std::vector& f_factors, + const std::vector& e_factors, + const std::vector& c_factors); + + void Load(std::string filePath); }; } diff --git a/moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.cpp b/moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.cpp index a3eee1694..655ed01ca 100644 --- a/moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.cpp +++ b/moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.cpp @@ -1,23 +1,23 @@ -// $Id$ -// vim:tabstop=2 -/*********************************************************************** -Moses - factored phrase-based language decoder -Copyright (C) 2006 University of Edinburgh - -This library is free software; you can redistribute it and/or -modify it under the terms of the GNU Lesser General Public -License as published by the Free Software Foundation; either -version 2.1 of the License, or (at your option) any later version. - -This library is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -Lesser General Public License for more details. - -You should have received a copy of the GNU Lesser General Public -License along with this library; if not, write to the Free Software -Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -***********************************************************************/ +// $Id$ +// vim:tabstop=2 +/*********************************************************************** +Moses - factored phrase-based language decoder +Copyright (C) 2006 University of Edinburgh + +This library is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License as published by the Free Software Foundation; either +version 2.1 of the License, or (at your option) any later version. + +This library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Lesser General Public License for more details. 
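The creator whose reflowed implementation follows does all of its work in the constructor: pass 1 builds the perfect-hash index and counts score values, an intermezzo computes the Huffman code sets, and pass 2 compresses the scores. A minimal build driver under that assumption is sketched below; the file names are invented and the remaining parameters are left at their declared defaults, much as the processLexicalTableMin tool does.

#include "moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.h"

int main()
{
  // Converts a text reordering table into the compact .minlexr format.
  // An empty tempfilePath keeps the intermediate score vectors in memory.
  Moses::LexicalReorderingTableCreator creator(
    "reordering-table",           // input text table (hypothetical path)
    "reordering-table.minlexr",   // output compact binary
    "");                          // no tempfile: work in memory
  return 0;                       // all passes ran inside the constructor
}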
+ +You should have received a copy of the GNU Lesser General Public +License along with this library; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ #include "LexicalReorderingTableCreator.h" #include "ThrowingFwrite.h" @@ -25,7 +25,8 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #include "util/file.hh" -namespace Moses { +namespace Moses +{ LexicalReorderingTableCreator::LexicalReorderingTableCreator( std::string inPath, std::string outPath, std::string tempfilePath, @@ -34,49 +35,47 @@ LexicalReorderingTableCreator::LexicalReorderingTableCreator( #ifdef WITH_THREADS , size_t threads #endif - ) +) : m_inPath(inPath), m_outPath(outPath), m_tempfilePath(tempfilePath), - m_orderBits(orderBits), m_fingerPrintBits(fingerPrintBits), - m_numScoreComponent(0), m_multipleScoreTrees(multipleScoreTrees), - m_quantize(quantize), m_separator(" ||| "), - m_hash(m_orderBits, m_fingerPrintBits), m_lastFlushedLine(-1) -#ifdef WITH_THREADS - , m_threads(threads) + m_orderBits(orderBits), m_fingerPrintBits(fingerPrintBits), + m_numScoreComponent(0), m_multipleScoreTrees(multipleScoreTrees), + m_quantize(quantize), m_separator(" ||| "), + m_hash(m_orderBits, m_fingerPrintBits), m_lastFlushedLine(-1) +#ifdef WITH_THREADS + , m_threads(threads) #endif -{ +{ PrintInfo(); - + m_outFile = std::fopen(m_outPath.c_str(), "w"); - + std::cerr << "Pass 1/2: Creating phrase index + Counting scores" << std::endl; - m_hash.BeginSave(m_outFile); + m_hash.BeginSave(m_outFile); if(tempfilePath.size()) { MmapAllocator allocEncoded(util::FMakeTemp(tempfilePath)); m_encodedScores = new StringVector(allocEncoded); - } - else { + } else { m_encodedScores = new StringVector(); } - + EncodeScores(); - + std::cerr << "Intermezzo: Calculating Huffman code sets" << std::endl; CalcHuffmanCodes(); - + std::cerr << "Pass 2/2: Compressing scores" << std::endl; - - - if(tempfilePath.size()) { + + + if(tempfilePath.size()) { MmapAllocator allocCompressed(util::FMakeTemp(tempfilePath)); m_compressedScores = new StringVector(allocCompressed); - } - else { + } else { m_compressedScores = new StringVector(); } CompressScores(); - + std::cerr << "Saving to " << m_outPath << std::endl; Save(); std::cerr << "Done" << std::endl; @@ -84,20 +83,20 @@ LexicalReorderingTableCreator::LexicalReorderingTableCreator( } void LexicalReorderingTableCreator::PrintInfo() -{ +{ std::cerr << "Used options:" << std::endl; std::cerr << "\tText reordering table will be read from: " << m_inPath << std::endl; std::cerr << "\tOutput reordering table will be written to: " << m_outPath << std::endl; std::cerr << "\tStep size for source landmark phrases: 2^" << m_orderBits << "=" << (1ul << m_orderBits) << std::endl; std::cerr << "\tPhrase fingerprint size: " << m_fingerPrintBits << " bits / P(fp)=" << (float(1)/(1ul << m_fingerPrintBits)) << std::endl; - std::cerr << "\tSingle Huffman code set for score components: " << (m_multipleScoreTrees ? "no" : "yes") << std::endl; + std::cerr << "\tSingle Huffman code set for score components: " << (m_multipleScoreTrees ? 
"no" : "yes") << std::endl; std::cerr << "\tUsing score quantization: "; if(m_quantize) std::cerr << m_quantize << " best" << std::endl; else std::cerr << "no" << std::endl; - -#ifdef WITH_THREADS + +#ifdef WITH_THREADS std::cerr << "\tRunning with " << m_threads << " threads" << std::endl; #endif std::cerr << std::endl; @@ -109,7 +108,7 @@ LexicalReorderingTableCreator::~LexicalReorderingTableCreator() delete m_scoreTrees[i]; delete m_scoreCounters[i]; } - + delete m_encodedScores; delete m_compressedScores; } @@ -121,9 +120,8 @@ void LexicalReorderingTableCreator::EncodeScores() #ifdef WITH_THREADS boost::thread_group threads; - for (size_t i = 0; i < m_threads; ++i) - { - EncodingTaskReordering* et = new EncodingTaskReordering(inFile, *this); + for (size_t i = 0; i < m_threads; ++i) { + EncodingTaskReordering* et = new EncodingTaskReordering(inFile, *this); threads.create_thread(*et); } threads.join_all(); @@ -136,17 +134,16 @@ void LexicalReorderingTableCreator::EncodeScores() } void LexicalReorderingTableCreator::CalcHuffmanCodes() -{ +{ std::vector::iterator treeIt = m_scoreTrees.begin(); for(std::vector::iterator it = m_scoreCounters.begin(); - it != m_scoreCounters.end(); it++) - { + it != m_scoreCounters.end(); it++) { if(m_quantize) - (*it)->Quantize(m_quantize); - + (*it)->Quantize(m_quantize); + std::cerr << "\tCreating Huffman codes for " << (*it)->Size() - << " scores" << std::endl; - + << " scores" << std::endl; + *treeIt = new ScoreTree((*it)->Begin(), (*it)->End()); treeIt++; } @@ -158,7 +155,7 @@ void LexicalReorderingTableCreator::CompressScores() #ifdef WITH_THREADS boost::thread_group threads; for (size_t i = 0; i < m_threads; ++i) { - CompressionTaskReordering* ct = new CompressionTaskReordering(*m_encodedScores, *this); + CompressionTaskReordering* ct = new CompressionTaskReordering(*m_encodedScores, *this); threads.create_thread(*ct); } threads.join_all(); @@ -171,12 +168,12 @@ void LexicalReorderingTableCreator::CompressScores() } void LexicalReorderingTableCreator::Save() -{ +{ ThrowingFwrite(&m_numScoreComponent, sizeof(m_numScoreComponent), 1, m_outFile); ThrowingFwrite(&m_multipleScoreTrees, sizeof(m_multipleScoreTrees), 1, m_outFile); for(size_t i = 0; i < m_scoreTrees.size(); i++) m_scoreTrees[i]->Save(m_outFile); - + m_compressedScores->save(m_outFile); } @@ -192,38 +189,37 @@ std::string LexicalReorderingTableCreator::EncodeLine(std::vector& { std::string scoresString = tokens.back(); std::stringstream scoresStream; - + std::vector scores; Tokenize(scores, scoresString); - + if(!m_numScoreComponent) { m_numScoreComponent = scores.size(); m_scoreCounters.resize(m_multipleScoreTrees ? m_numScoreComponent : 1); for(std::vector::iterator it = m_scoreCounters.begin(); it != m_scoreCounters.end(); it++) - *it = new ScoreCounter(); + *it = new ScoreCounter(); m_scoreTrees.resize(m_multipleScoreTrees ? m_numScoreComponent : 1); } - + if(m_numScoreComponent != scores.size()) { std::cerr << "Error: Wrong number of scores detected (" - << scores.size() << " != " << m_numScoreComponent << ") :" << std::endl; + << scores.size() << " != " << m_numScoreComponent << ") :" << std::endl; std::cerr << "Line: " << tokens[0] << " ||| ... ||| " << scoresString << std::endl; - abort(); + abort(); } - + size_t c = 0; float score; - while(c < m_numScoreComponent) - { + while(c < m_numScoreComponent) { score = scores[c]; score = FloorScore(TransformScore(score)); scoresStream.write((char*)&score, sizeof(score)); - + m_scoreCounters[m_multipleScoreTrees ? 
c : 0]->Increase(score); c++; } - + return scoresStream.str(); } @@ -232,25 +228,23 @@ void LexicalReorderingTableCreator::AddEncodedLine(PackedItem& pi) m_queue.push(pi); } -void LexicalReorderingTableCreator::FlushEncodedQueue(bool force) { - if(force || m_queue.size() > 10000) - { - while(!m_queue.empty() && m_lastFlushedLine + 1 == m_queue.top().GetLine()) - { +void LexicalReorderingTableCreator::FlushEncodedQueue(bool force) +{ + if(force || m_queue.size() > 10000) { + while(!m_queue.empty() && m_lastFlushedLine + 1 == m_queue.top().GetLine()) { PackedItem pi = m_queue.top(); m_queue.pop(); m_lastFlushedLine++; - - m_lastRange.push_back(pi.GetSrc()); + + m_lastRange.push_back(pi.GetSrc()); m_encodedScores->push_back(pi.GetTrg()); - + if((pi.GetLine()+1) % 100000 == 0) - std::cerr << "."; + std::cerr << "."; if((pi.GetLine()+1) % 5000000 == 0) - std::cerr << "[" << (pi.GetLine()+1) << "]" << std::endl; - - if(m_lastRange.size() == (1ul << m_orderBits)) - { + std::cerr << "[" << (pi.GetLine()+1) << "]" << std::endl; + + if(m_lastRange.size() == (1ul << m_orderBits)) { m_hash.AddRange(m_lastRange); m_hash.SaveLastRange(); m_hash.DropLastRange(); @@ -258,14 +252,13 @@ void LexicalReorderingTableCreator::FlushEncodedQueue(bool force) { } } } - - if(force) - { + + if(force) { m_lastFlushedLine = -1; m_hash.AddRange(m_lastRange); m_lastRange.clear(); - + #ifdef WITH_THREADS m_hash.WaitAll(); #endif @@ -278,56 +271,55 @@ void LexicalReorderingTableCreator::FlushEncodedQueue(bool force) { } } -std::string LexicalReorderingTableCreator::CompressEncodedScores(std::string &encodedScores) { +std::string LexicalReorderingTableCreator::CompressEncodedScores(std::string &encodedScores) +{ std::stringstream encodedScoresStream(encodedScores); encodedScoresStream.unsetf(std::ios::skipws); - + std::string compressedScores; BitWrapper<> compressedScoresStream(compressedScores); - + size_t currScore = 0; float score; encodedScoresStream.read((char*) &score, sizeof(score)); - + while(encodedScoresStream) { size_t index = currScore % m_scoreTrees.size(); - + if(m_quantize) score = m_scoreCounters[index]->LowerBound(score); - + m_scoreTrees[index]->Put(compressedScoresStream, score); encodedScoresStream.read((char*) &score, sizeof(score)); currScore++; } - + return compressedScores; } -void LexicalReorderingTableCreator::AddCompressedScores(PackedItem& pi) { - m_queue.push(pi); +void LexicalReorderingTableCreator::AddCompressedScores(PackedItem& pi) +{ + m_queue.push(pi); } void LexicalReorderingTableCreator::FlushCompressedQueue(bool force) -{ - if(force || m_queue.size() > 10000) - { - while(!m_queue.empty() && m_lastFlushedLine + 1 == m_queue.top().GetLine()) - { +{ + if(force || m_queue.size() > 10000) { + while(!m_queue.empty() && m_lastFlushedLine + 1 == m_queue.top().GetLine()) { PackedItem pi = m_queue.top(); m_queue.pop(); m_lastFlushedLine++; - + m_compressedScores->push_back(pi.GetTrg()); - + if((pi.GetLine()+1) % 100000 == 0) - std::cerr << "."; + std::cerr << "."; if((pi.GetLine()+1) % 5000000 == 0) - std::cerr << "[" << (pi.GetLine()+1) << "]" << std::endl; + std::cerr << "[" << (pi.GetLine()+1) << "]" << std::endl; } } - - if(force) - { + + if(force) { m_lastFlushedLine = -1; std::cerr << std::endl << std::endl; } @@ -343,63 +335,61 @@ boost::mutex EncodingTaskReordering::m_fileMutex; EncodingTaskReordering::EncodingTaskReordering(InputFileStream& inFile, LexicalReorderingTableCreator& creator) : m_inFile(inFile), m_creator(creator) {} - + void EncodingTaskReordering::operator()() { size_t 
lineNum = 0; - + std::vector<std::string> lines; size_t max_lines = 1000; lines.reserve(max_lines); - + { #ifdef WITH_THREADS boost::mutex::scoped_lock lock(m_fileMutex); #endif std::string line; while(lines.size() < max_lines && std::getline(m_inFile, line)) - lines.push_back(line); + lines.push_back(line); lineNum = m_lineNum; m_lineNum += lines.size(); } - + std::vector<PackedItem> result; result.reserve(max_lines); - - while(lines.size()) - { - for(size_t i = 0; i < lines.size(); i++) - { + + while(lines.size()) { + for(size_t i = 0; i < lines.size(); i++) { std::vector<std::string> tokens; Moses::TokenizeMultiCharSeparator(tokens, lines[i], m_creator.m_separator); - + std::string encodedLine = m_creator.EncodeLine(tokens); - + std::string f = tokens[0]; - + std::string e; if(tokens.size() > 2) e = tokens[1]; - + PackedItem packedItem(lineNum + i, m_creator.MakeSourceTargetKey(f, e), encodedLine, i); result.push_back(packedItem); } lines.clear(); - + { #ifdef WITH_THREADS boost::mutex::scoped_lock lock(m_mutex); #endif - for(size_t i = 0; i < result.size(); i++) + for(size_t i = 0; i < result.size(); i++) m_creator.AddEncodedLine(result[i]); - m_creator.FlushEncodedQueue(); + m_creator.FlushEncodedQueue(); } - + result.clear(); lines.reserve(max_lines); result.reserve(max_lines); - + #ifdef WITH_THREADS boost::mutex::scoped_lock lock(m_fileMutex); #endif @@ -419,11 +409,11 @@ boost::mutex CompressionTaskReordering::m_mutex; #endif CompressionTaskReordering::CompressionTaskReordering(StringVector<unsigned char, unsigned long, - MmapAllocator>& encodedScores, - LexicalReorderingTableCreator& creator) + MmapAllocator>& encodedScores, + LexicalReorderingTableCreator& creator) : m_encodedScores(encodedScores), m_creator(creator) { } - + void CompressionTaskReordering::operator()() { size_t scoresNum; @@ -434,12 +424,11 @@ void CompressionTaskReordering::operator()() scoresNum = m_scoresNum; m_scoresNum++; } - - while(scoresNum < m_encodedScores.size()) - { + + while(scoresNum < m_encodedScores.size()) { std::string scores = m_encodedScores[scoresNum]; std::string compressedScores - = m_creator.CompressEncodedScores(scores); + = m_creator.CompressEncodedScores(scores); std::string dummy; PackedItem packedItem(scoresNum, dummy, compressedScores, 0); @@ -449,9 +438,9 @@ void CompressionTaskReordering::operator()() #endif m_creator.AddCompressedScores(packedItem); m_creator.FlushCompressedQueue(); - - scoresNum = m_scoresNum; - m_scoresNum++; + + scoresNum = m_scoresNum; + m_scoresNum++; } } diff --git a/moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.h b/moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.h index 2e202ce9b..1bf8444fe 100644 --- a/moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.h +++ b/moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.h @@ -1,139 +1,141 @@ -// $Id$ -// vim:tabstop=2 -/*********************************************************************** -Moses - factored phrase-based language decoder -Copyright (C) 2006 University of Edinburgh - -This library is free software; you can redistribute it and/or -modify it under the terms of the GNU Lesser General Public -License as published by the Free Software Foundation; either -version 2.1 of the License, or (at your option) any later version. - -This library is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -Lesser General Public License for more details. 
- -You should have received a copy of the GNU Lesser General Public -License along with this library; if not, write to the Free Software -Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -***********************************************************************/ +// $Id$ +// vim:tabstop=2 +/*********************************************************************** +Moses - factored phrase-based language decoder +Copyright (C) 2006 University of Edinburgh + +This library is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License as published by the Free Software Foundation; either +version 2.1 of the License, or (at your option) any later version. + +This library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Lesser General Public License for more details. + +You should have received a copy of the GNU Lesser General Public +License along with this library; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ #ifndef moses_LexicalReorderingTableCreator_h #define moses_LexicalReorderingTableCreator_h #include "PhraseTableCreator.h" -namespace Moses { +namespace Moses +{ + +class LexicalReorderingTableCreator +{ +private: + std::string m_inPath; + std::string m_outPath; + std::string m_tempfilePath; + + std::FILE* m_outFile; + + size_t m_orderBits; + size_t m_fingerPrintBits; + + size_t m_numScoreComponent; + + bool m_multipleScoreTrees; + bool m_quantize; + + std::string m_separator; + + BlockHashIndex m_hash; + + typedef Counter<float> ScoreCounter; + typedef CanonicalHuffman<float> ScoreTree; + + std::vector<ScoreCounter*> m_scoreCounters; + std::vector<ScoreTree*> m_scoreTrees; + + StringVector<unsigned char, unsigned long, MmapAllocator>* m_encodedScores; + StringVector<unsigned char, unsigned long, MmapAllocator>* m_compressedScores; + + std::priority_queue<PackedItem> m_queue; + long m_lastFlushedLine; + long m_lastFlushedSourceNum; + std::string m_lastFlushedSourcePhrase; + std::vector<std::string> m_lastRange; -class LexicalReorderingTableCreator { - private: - std::string m_inPath; - std::string m_outPath; - std::string m_tempfilePath; - - std::FILE* m_outFile; - - size_t m_orderBits; - size_t m_fingerPrintBits; - - size_t m_numScoreComponent; - - bool m_multipleScoreTrees; - bool m_quantize; - - std::string m_separator; - - BlockHashIndex m_hash; - - typedef Counter<float> ScoreCounter; - typedef CanonicalHuffman<float> ScoreTree; - - std::vector<ScoreCounter*> m_scoreCounters; - std::vector<ScoreTree*> m_scoreTrees; - - StringVector<unsigned char, unsigned long, MmapAllocator>* m_encodedScores; - StringVector<unsigned char, unsigned long, MmapAllocator>* m_compressedScores; - - std::priority_queue<PackedItem> m_queue; - long m_lastFlushedLine; - long m_lastFlushedSourceNum; - std::string m_lastFlushedSourcePhrase; - std::vector<std::string> m_lastRange; - -#ifdef WITH_THREADS - size_t m_threads; -#endif - - void PrintInfo(); - - void EncodeScores(); - void CalcHuffmanCodes(); - void CompressScores(); - void Save(); - - std::string MakeSourceTargetKey(std::string&, std::string&); - - std::string EncodeLine(std::vector<std::string>& tokens); - void AddEncodedLine(PackedItem& pi); - void FlushEncodedQueue(bool force = false); - - std::string CompressEncodedScores(std::string &encodedScores); - void AddCompressedScores(PackedItem& pi); - void FlushCompressedQueue(bool force = false); - - public: - LexicalReorderingTableCreator(std::string inPath, - std::string outPath, - std::string tempfilePath, - size_t orderBits = 10, - size_t fingerPrintBits = 16, - bool 
multipleScoreTrees = true, - size_t quantize = 0 #ifdef WITH_THREADS - , size_t threads = 2 -#endif - ); - - ~LexicalReorderingTableCreator(); - + size_t m_threads; +#endif + + void PrintInfo(); + + void EncodeScores(); + void CalcHuffmanCodes(); + void CompressScores(); + void Save(); + + std::string MakeSourceTargetKey(std::string&, std::string&); + + std::string EncodeLine(std::vector<std::string>& tokens); + void AddEncodedLine(PackedItem& pi); + void FlushEncodedQueue(bool force = false); + + std::string CompressEncodedScores(std::string &encodedScores); + void AddCompressedScores(PackedItem& pi); + void FlushCompressedQueue(bool force = false); + +public: + LexicalReorderingTableCreator(std::string inPath, + std::string outPath, + std::string tempfilePath, + size_t orderBits = 10, + size_t fingerPrintBits = 16, + bool multipleScoreTrees = true, + size_t quantize = 0 +#ifdef WITH_THREADS + , size_t threads = 2 +#endif + ); + + ~LexicalReorderingTableCreator(); + friend class EncodingTaskReordering; friend class CompressionTaskReordering; }; class EncodingTaskReordering { - private: +private: #ifdef WITH_THREADS - static boost::mutex m_mutex; - static boost::mutex m_fileMutex; + static boost::mutex m_mutex; + static boost::mutex m_fileMutex; #endif - static size_t m_lineNum; - static size_t m_sourcePhraseNum; - static std::string m_lastSourcePhrase; - - InputFileStream& m_inFile; - LexicalReorderingTableCreator& m_creator; - - public: - EncodingTaskReordering(InputFileStream& inFile, LexicalReorderingTableCreator& creator); - void operator()(); + static size_t m_lineNum; + static size_t m_sourcePhraseNum; + static std::string m_lastSourcePhrase; + + InputFileStream& m_inFile; + LexicalReorderingTableCreator& m_creator; + +public: + EncodingTaskReordering(InputFileStream& inFile, LexicalReorderingTableCreator& creator); + void operator()(); }; class CompressionTaskReordering { - private: +private: #ifdef WITH_THREADS - static boost::mutex m_mutex; + static boost::mutex m_mutex; #endif - static size_t m_scoresNum; - StringVector<unsigned char, unsigned long, MmapAllocator> &m_encodedScores; - LexicalReorderingTableCreator &m_creator; - - public: - CompressionTaskReordering(StringVector<unsigned char, unsigned long, MmapAllocator>& - m_encodedScores, LexicalReorderingTableCreator& creator); - void operator()(); + static size_t m_scoresNum; + StringVector<unsigned char, unsigned long, MmapAllocator> &m_encodedScores; + LexicalReorderingTableCreator &m_creator; + +public: + CompressionTaskReordering(StringVector<unsigned char, unsigned long, MmapAllocator>& + m_encodedScores, LexicalReorderingTableCreator& creator); + void operator()(); }; } diff --git a/moses/TranslationModel/CompactPT/ListCoders.h b/moses/TranslationModel/CompactPT/ListCoders.h index 329e1297a..b41e183ce 100644 --- a/moses/TranslationModel/CompactPT/ListCoders.h +++ b/moses/TranslationModel/CompactPT/ListCoders.h @@ -1,23 +1,23 @@ -// $Id$ -// vim:tabstop=2 -/*********************************************************************** -Moses - factored phrase-based language decoder -Copyright (C) 2006 University of Edinburgh - -This library is free software; you can redistribute it and/or -modify it under the terms of the GNU Lesser General Public -License as published by the Free Software Foundation; either -version 2.1 of the License, or (at your option) any later version. - -This library is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -Lesser General Public License for more details. 
- -You should have received a copy of the GNU Lesser General Public -License along with this library; if not, write to the Free Software -Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -***********************************************************************/ +// $Id$ +// vim:tabstop=2 +/*********************************************************************** +Moses - factored phrase-based language decoder +Copyright (C) 2006 University of Edinburgh + +This library is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License as published by the Free Software Foundation; either +version 2.1 of the License, or (at your option) any later version. + +This library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Lesser General Public License for more details. + +You should have received a copy of the GNU Lesser General Public +License along with this library; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ #ifndef moses_ListCoders_h #define moses_ListCoders_h @@ -31,93 +31,85 @@ namespace Moses template class VarIntType { - private: - template - static void EncodeSymbol(IntType input, OutIt output) - { - if(input == 0) - { - *output = 0; - output++; - return; - } - - T msb = 1 << (sizeof(T)*8-1); - IntType mask = ~msb; - IntType shift = (sizeof(T)*8-1); - - while(input) - { - T res = input & mask; - input >>= shift; - if(input) - res |= msb; - *output = res; - output++; - } - }; - - template - static void DecodeSymbol(InIt &it, InIt end, IntType &output) - { - T msb = 1 << (sizeof(T)*8-1); - IntType shift = (sizeof(T)*8-1); - - output = 0; - size_t i = 0; - while(it != end && *it & msb) { - IntType temp = *it & ~msb; - temp <<= shift*i; - output |= temp; - it++; i++; - } - assert(it != end); - - IntType temp = *it; +private: + template + static void EncodeSymbol(IntType input, OutIt output) { + if(input == 0) { + *output = 0; + output++; + return; + } + + T msb = 1 << (sizeof(T)*8-1); + IntType mask = ~msb; + IntType shift = (sizeof(T)*8-1); + + while(input) { + T res = input & mask; + input >>= shift; + if(input) + res |= msb; + *output = res; + output++; + } + }; + + template + static void DecodeSymbol(InIt &it, InIt end, IntType &output) { + T msb = 1 << (sizeof(T)*8-1); + IntType shift = (sizeof(T)*8-1); + + output = 0; + size_t i = 0; + while(it != end && *it & msb) { + IntType temp = *it & ~msb; temp <<= shift*i; output |= temp; it++; + i++; + } + assert(it != end); + + IntType temp = *it; + temp <<= shift*i; + output |= temp; + it++; + } + +public: + + template + static void Encode(InIt it, InIt end, OutIt outIt) { + while(it != end) { + EncodeSymbol(*it, outIt); + it++; + } + } + + template + static void Decode(InIt &it, InIt end, OutIt outIt) { + while(it != end) { + size_t output; + DecodeSymbol(it, end, output); + *outIt = output; + outIt++; + } + } + + template + static size_t DecodeAndSum(InIt &it, InIt end, size_t num) { + size_t sum = 0; + size_t curr = 0; + + while(it != end && curr < num) { + size_t output; + DecodeSymbol(it, end, output); + sum += output; + curr++; } - public: - - template - static void Encode(InIt it, InIt end, OutIt outIt) - { - while(it != end) - { - EncodeSymbol(*it, outIt); - it++; - } - } - - template - 
static void Decode(InIt &it, InIt end, OutIt outIt) - { - while(it != end) - { - size_t output; - DecodeSymbol(it, end, output); - *outIt = output; - outIt++; - } - } - - template - static size_t DecodeAndSum(InIt &it, InIt end, size_t num) - { - size_t sum = 0; - size_t curr = 0; - - while(it != end && curr < num) - { - size_t output; - DecodeSymbol(it, end, output); - sum += output; curr++; - } - - return sum; - } + return sum; + } }; @@ -129,179 +121,262 @@ typedef VarIntType VarInt32; class Simple9 { - private: - typedef unsigned int uint; - - template - inline static void EncodeSymbol(uint &output, InIt it, InIt end) - { - uint length = end - it; - - uint type = 0; - uint bitlength = 0; - - switch(length) - { - case 1: type = 1; bitlength = 28; break; - case 2: type = 2; bitlength = 14; break; - case 3: type = 3; bitlength = 9; break; - case 4: type = 4; bitlength = 7; break; - case 5: type = 5; bitlength = 5; break; - case 7: type = 6; bitlength = 4; break; - case 9: type = 7; bitlength = 3; break; - case 14: type = 8; bitlength = 2; break; - case 28: type = 9; bitlength = 1; break; - } - - output = 0; - output |= (type << 28); +private: + typedef unsigned int uint; - uint i = 0; - while(it != end) - { - uint l = bitlength * (length-i-1); - output |= *it << l; - it++; - i++; - } - } - - template - static inline void DecodeSymbol(uint input, OutIt outIt) - { - uint type = (input >> 28); - - uint bitlen = 0; - uint shift = 0; - uint mask = 0; - - switch(type) - { - case 1: bitlen = 28; shift = 0; mask = 268435455; break; - case 2: bitlen = 14; shift = 14; mask = 16383; break; - case 3: bitlen = 9; shift = 18; mask = 511; break; - case 4: bitlen = 7; shift = 21; mask = 127; break; - case 5: bitlen = 5; shift = 20; mask = 31; break; - case 6: bitlen = 4; shift = 24; mask = 15; break; - case 7: bitlen = 3; shift = 24; mask = 7; break; - case 8: bitlen = 2; shift = 26; mask = 3; break; - case 9: bitlen = 1; shift = 27; mask = 1; break; - } - - while(shift > 0) - { - *outIt = (input >> shift) & mask; - shift -= bitlen; - outIt++; - } - *outIt = input & mask; - outIt++; - } - - static inline size_t DecodeAndSumSymbol(uint input, size_t num, size_t &curr) - { - uint type = (input >> 28); - - uint bitlen = 0; - uint shift = 0; - uint mask = 0; - - switch(type) - { - case 1: bitlen = 28; shift = 0; mask = 268435455; break; - case 2: bitlen = 14; shift = 14; mask = 16383; break; - case 3: bitlen = 9; shift = 18; mask = 511; break; - case 4: bitlen = 7; shift = 21; mask = 127; break; - case 5: bitlen = 5; shift = 20; mask = 31; break; - case 6: bitlen = 4; shift = 24; mask = 15; break; - case 7: bitlen = 3; shift = 24; mask = 7; break; - case 8: bitlen = 2; shift = 26; mask = 3; break; - case 9: bitlen = 1; shift = 27; mask = 1; break; - } + template + inline static void EncodeSymbol(uint &output, InIt it, InIt end) { + uint length = end - it; - size_t sum = 0; - while(shift > 0) - { - sum += (input >> shift) & mask; - shift -= bitlen; - if(++curr == num) - return sum; - } - sum += input & mask; - curr++; - return sum; - } - - public: - template - static void Encode(InIt it, InIt end, OutIt outIt) - { - uint parts[] = { 1, 2, 3, 4, 5, 7, 9, 14, 28 }; + uint type = 0; + uint bitlength = 0; - uint buffer[28]; - for(InIt i = it; i < end; i++) - { - uint lastbit = 1; - uint lastpos = 0; - uint lastyes = 0; - uint j = 0; - - double log2 = log(2); - while(j < 9 && lastpos < 28 && (i+lastpos) < end) - { - if(lastpos >= parts[j]) - j++; - - buffer[lastpos] = *(i + lastpos); - - uint reqbit = 
ceil(log(buffer[lastpos]+1)/log2); - assert(reqbit <= 28); - - uint bit = 28/floor(28/reqbit); - if(lastbit < bit) - lastbit = bit; - - if(parts[j] > 28/lastbit) - break; - else if(lastpos == parts[j]-1) - lastyes = lastpos; - - lastpos++; - } - i += lastyes; - - uint length = lastyes + 1; - uint output; - EncodeSymbol(output, buffer, buffer + length); - - *outIt = output; - outIt++; - } + switch(length) { + case 1: + type = 1; + bitlength = 28; + break; + case 2: + type = 2; + bitlength = 14; + break; + case 3: + type = 3; + bitlength = 9; + break; + case 4: + type = 4; + bitlength = 7; + break; + case 5: + type = 5; + bitlength = 5; + break; + case 7: + type = 6; + bitlength = 4; + break; + case 9: + type = 7; + bitlength = 3; + break; + case 14: + type = 8; + bitlength = 2; + break; + case 28: + type = 9; + bitlength = 1; + break; } - - template - static void Decode(InIt &it, InIt end, OutIt outIt) - { - while(it != end) - { - DecodeSymbol(*it, outIt); - it++; - } + + output = 0; + output |= (type << 28); + + uint i = 0; + while(it != end) { + uint l = bitlength * (length-i-1); + output |= *it << l; + it++; + i++; } - - template - static size_t DecodeAndSum(InIt &it, InIt end, size_t num) - { - size_t sum = 0; - size_t curr = 0; - while(it != end && curr < num) - { - sum += DecodeAndSumSymbol(*it, num, curr); - it++; - } - assert(curr == num); - return sum; + } + + template + static inline void DecodeSymbol(uint input, OutIt outIt) { + uint type = (input >> 28); + + uint bitlen = 0; + uint shift = 0; + uint mask = 0; + + switch(type) { + case 1: + bitlen = 28; + shift = 0; + mask = 268435455; + break; + case 2: + bitlen = 14; + shift = 14; + mask = 16383; + break; + case 3: + bitlen = 9; + shift = 18; + mask = 511; + break; + case 4: + bitlen = 7; + shift = 21; + mask = 127; + break; + case 5: + bitlen = 5; + shift = 20; + mask = 31; + break; + case 6: + bitlen = 4; + shift = 24; + mask = 15; + break; + case 7: + bitlen = 3; + shift = 24; + mask = 7; + break; + case 8: + bitlen = 2; + shift = 26; + mask = 3; + break; + case 9: + bitlen = 1; + shift = 27; + mask = 1; + break; } + + while(shift > 0) { + *outIt = (input >> shift) & mask; + shift -= bitlen; + outIt++; + } + *outIt = input & mask; + outIt++; + } + + static inline size_t DecodeAndSumSymbol(uint input, size_t num, size_t &curr) { + uint type = (input >> 28); + + uint bitlen = 0; + uint shift = 0; + uint mask = 0; + + switch(type) { + case 1: + bitlen = 28; + shift = 0; + mask = 268435455; + break; + case 2: + bitlen = 14; + shift = 14; + mask = 16383; + break; + case 3: + bitlen = 9; + shift = 18; + mask = 511; + break; + case 4: + bitlen = 7; + shift = 21; + mask = 127; + break; + case 5: + bitlen = 5; + shift = 20; + mask = 31; + break; + case 6: + bitlen = 4; + shift = 24; + mask = 15; + break; + case 7: + bitlen = 3; + shift = 24; + mask = 7; + break; + case 8: + bitlen = 2; + shift = 26; + mask = 3; + break; + case 9: + bitlen = 1; + shift = 27; + mask = 1; + break; + } + + size_t sum = 0; + while(shift > 0) { + sum += (input >> shift) & mask; + shift -= bitlen; + if(++curr == num) + return sum; + } + sum += input & mask; + curr++; + return sum; + } + +public: + template + static void Encode(InIt it, InIt end, OutIt outIt) { + uint parts[] = { 1, 2, 3, 4, 5, 7, 9, 14, 28 }; + + uint buffer[28]; + for(InIt i = it; i < end; i++) { + uint lastbit = 1; + uint lastpos = 0; + uint lastyes = 0; + uint j = 0; + + double log2 = log(2); + while(j < 9 && lastpos < 28 && (i+lastpos) < end) { + if(lastpos >= parts[j]) + j++; + + 
buffer[lastpos] = *(i + lastpos); + + uint reqbit = ceil(log(buffer[lastpos]+1)/log2); + assert(reqbit <= 28); + + uint bit = 28/floor(28/reqbit); + if(lastbit < bit) + lastbit = bit; + + if(parts[j] > 28/lastbit) + break; + else if(lastpos == parts[j]-1) + lastyes = lastpos; + + lastpos++; + } + i += lastyes; + + uint length = lastyes + 1; + uint output; + EncodeSymbol(output, buffer, buffer + length); + + *outIt = output; + outIt++; + } + } + + template + static void Decode(InIt &it, InIt end, OutIt outIt) { + while(it != end) { + DecodeSymbol(*it, outIt); + it++; + } + } + + template + static size_t DecodeAndSum(InIt &it, InIt end, size_t num) { + size_t sum = 0; + size_t curr = 0; + while(it != end && curr < num) { + sum += DecodeAndSumSymbol(*it, num, curr); + it++; + } + assert(curr == num); + return sum; + } }; } diff --git a/moses/TranslationModel/CompactPT/MmapAllocator.h b/moses/TranslationModel/CompactPT/MmapAllocator.h index 049c0149d..7cd6dd49e 100644 --- a/moses/TranslationModel/CompactPT/MmapAllocator.h +++ b/moses/TranslationModel/CompactPT/MmapAllocator.h @@ -1,23 +1,23 @@ -// $Id$ -// vim:tabstop=2 -/*********************************************************************** -Moses - factored phrase-based language decoder -Copyright (C) 2006 University of Edinburgh - -This library is free software; you can redistribute it and/or -modify it under the terms of the GNU Lesser General Public -License as published by the Free Software Foundation; either -version 2.1 of the License, or (at your option) any later version. - -This library is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -Lesser General Public License for more details. - -You should have received a copy of the GNU Lesser General Public -License along with this library; if not, write to the Free Software -Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -***********************************************************************/ +// $Id$ +// vim:tabstop=2 +/*********************************************************************** +Moses - factored phrase-based language decoder +Copyright (C) 2006 University of Edinburgh + +This library is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License as published by the Free Software Foundation; either +version 2.1 of the License, or (at your option) any later version. + +This library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Lesser General Public License for more details. 
+ +You should have received a copy of the GNU Lesser General Public +License along with this library; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ #ifndef moses_MmapAllocator_h #define moses_MmapAllocator_h @@ -30,174 +30,160 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA namespace Moses { - template - class MmapAllocator - { - protected: - std::FILE* m_file_ptr; - size_t m_file_desc; - - size_t m_page_size; - size_t m_map_size; - - char* m_data_ptr; - size_t m_data_offset; - bool m_fixed; - size_t* m_count; - - public: - typedef T value_type; - typedef T* pointer; - typedef const T* const_pointer; - typedef T& reference; - typedef const T& const_reference; - typedef std::size_t size_type; - typedef std::ptrdiff_t difference_type; - - MmapAllocator() throw() - : m_file_ptr(std::tmpfile()), m_file_desc(fileno(m_file_ptr)), - m_page_size(sysconf(_SC_PAGE_SIZE)), m_map_size(0), m_data_ptr(0), - m_data_offset(0), m_fixed(false), m_count(new size_t(0)) - { } - - MmapAllocator(std::FILE* f_ptr) throw() - : m_file_ptr(f_ptr), m_file_desc(fileno(m_file_ptr)), - m_page_size(sysconf(_SC_PAGE_SIZE)), m_map_size(0), m_data_ptr(0), - m_data_offset(0), m_fixed(false), m_count(new size_t(0)) - { } - - MmapAllocator(std::FILE* f_ptr, size_t data_offset) throw() - : m_file_ptr(f_ptr), m_file_desc(fileno(m_file_ptr)), - m_page_size(sysconf(_SC_PAGE_SIZE)), m_map_size(0), m_data_ptr(0), - m_data_offset(data_offset), m_fixed(true), m_count(new size_t(0)) - { } - - MmapAllocator(std::string fileName) throw() - : m_file_ptr(std::fopen(fileName.c_str(), "wb+")), m_file_desc(fileno(m_file_ptr)), - m_page_size(sysconf(_SC_PAGE_SIZE)), m_map_size(0), m_data_ptr(0), - m_data_offset(0), m_fixed(false), m_count(new size_t(0)) - { } - - MmapAllocator(const MmapAllocator& c) throw() - : m_file_ptr(c.m_file_ptr), m_file_desc(c.m_file_desc), - m_page_size(c.m_page_size), m_map_size(c.m_map_size), - m_data_ptr(c.m_data_ptr), m_data_offset(c.m_data_offset), - m_fixed(c.m_fixed), m_count(c.m_count) - { - (*m_count)++; - } - - ~MmapAllocator() throw() - { - if(m_data_ptr && *m_count == 0) - { - munmap(m_data_ptr, m_map_size); - if(!m_fixed && std::ftell(m_file_ptr) != -1) - std::fclose(m_file_ptr); - } - (*m_count)--; - } - - template - struct rebind { - typedef MmapAllocator other; - }; - - pointer address (reference value) const - { - return &value; - } - - const_pointer address (const_reference value) const - { - return &value; - } - - size_type max_size () const throw() - { - return std::numeric_limits::max() / sizeof(value_type); - } - - pointer allocate (size_type num, const void* = 0) - { - m_map_size = num * sizeof(T); - - if(!m_fixed) - { - size_t read = 0; - read += ftruncate(m_file_desc, m_map_size); - m_data_ptr = (char*)mmap(0, m_map_size, PROT_READ|PROT_WRITE, MAP_SHARED, - m_file_desc, 0); - if(m_data_ptr == MAP_FAILED) - std::cerr << "Error: mmapping" << std::endl; - return (pointer)m_data_ptr; - } - else - { - size_t map_offset = (m_data_offset / m_page_size) * m_page_size; - size_t relative_offset = m_data_offset - map_offset; - - size_t map_size = m_map_size + relative_offset; - - m_data_ptr = (char*)mmap(0, map_size, PROT_READ, MAP_SHARED, - m_file_desc, map_offset); +template +class MmapAllocator +{ +protected: + std::FILE* m_file_ptr; + size_t m_file_desc; - return (pointer)(m_data_ptr + relative_offset); - } - } - - void deallocate 
(pointer p, size_type num) - { - if(!m_fixed) { - munmap(p, num * sizeof(T)); - } - else { - size_t map_offset = (m_data_offset / m_page_size) * m_page_size; - size_t relative_offset = m_data_offset - map_offset; - munmap((pointer)((char*)p - relative_offset), num * sizeof(T)); - } - - } - - void construct (pointer p, const T& value) - { - if(!m_fixed) - new(p) value_type(value); - } - void destroy (pointer p) - { - if(!m_fixed) - p->~T(); - } - - template - friend bool operator== (const MmapAllocator&, const MmapAllocator&) throw(); - - template - friend bool operator!= (const MmapAllocator&, const MmapAllocator&) throw(); - }; - - template - bool operator== (const MmapAllocator& a1, - const MmapAllocator& a2) throw() - { - bool equal = true; - equal &= a1.m_file_ptr == a2.m_file_ptr; - equal &= a1.m_file_desc == a2.m_file_desc; - equal &= a1.m_page_size == a2.m_page_size; - equal &= a1.m_map_size == a2.m_map_size; - equal &= a1.m_data_ptr == a2.m_data_ptr; - equal &= a1.m_data_offset == a2.m_data_offset; - equal &= a1.m_fixed == a2.m_fixed; - return equal; + size_t m_page_size; + size_t m_map_size; + + char* m_data_ptr; + size_t m_data_offset; + bool m_fixed; + size_t* m_count; + +public: + typedef T value_type; + typedef T* pointer; + typedef const T* const_pointer; + typedef T& reference; + typedef const T& const_reference; + typedef std::size_t size_type; + typedef std::ptrdiff_t difference_type; + + MmapAllocator() throw() + : m_file_ptr(std::tmpfile()), m_file_desc(fileno(m_file_ptr)), + m_page_size(sysconf(_SC_PAGE_SIZE)), m_map_size(0), m_data_ptr(0), + m_data_offset(0), m_fixed(false), m_count(new size_t(0)) + { } + + MmapAllocator(std::FILE* f_ptr) throw() + : m_file_ptr(f_ptr), m_file_desc(fileno(m_file_ptr)), + m_page_size(sysconf(_SC_PAGE_SIZE)), m_map_size(0), m_data_ptr(0), + m_data_offset(0), m_fixed(false), m_count(new size_t(0)) + { } + + MmapAllocator(std::FILE* f_ptr, size_t data_offset) throw() + : m_file_ptr(f_ptr), m_file_desc(fileno(m_file_ptr)), + m_page_size(sysconf(_SC_PAGE_SIZE)), m_map_size(0), m_data_ptr(0), + m_data_offset(data_offset), m_fixed(true), m_count(new size_t(0)) + { } + + MmapAllocator(std::string fileName) throw() + : m_file_ptr(std::fopen(fileName.c_str(), "wb+")), m_file_desc(fileno(m_file_ptr)), + m_page_size(sysconf(_SC_PAGE_SIZE)), m_map_size(0), m_data_ptr(0), + m_data_offset(0), m_fixed(false), m_count(new size_t(0)) + { } + + MmapAllocator(const MmapAllocator& c) throw() + : m_file_ptr(c.m_file_ptr), m_file_desc(c.m_file_desc), + m_page_size(c.m_page_size), m_map_size(c.m_map_size), + m_data_ptr(c.m_data_ptr), m_data_offset(c.m_data_offset), + m_fixed(c.m_fixed), m_count(c.m_count) { + (*m_count)++; + } + + ~MmapAllocator() throw() { + if(m_data_ptr && *m_count == 0) { + munmap(m_data_ptr, m_map_size); + if(!m_fixed && std::ftell(m_file_ptr) != -1) + std::fclose(m_file_ptr); } - - template - bool operator!=(const MmapAllocator& a1, - const MmapAllocator& a2) throw() - { - return !(a1 == a2); + (*m_count)--; + } + + template + struct rebind { + typedef MmapAllocator other; + }; + + pointer address (reference value) const { + return &value; + } + + const_pointer address (const_reference value) const { + return &value; + } + + size_type max_size () const throw() { + return std::numeric_limits::max() / sizeof(value_type); + } + + pointer allocate (size_type num, const void* = 0) { + m_map_size = num * sizeof(T); + + if(!m_fixed) { + size_t read = 0; + read += ftruncate(m_file_desc, m_map_size); + m_data_ptr = (char*)mmap(0, m_map_size, 
PROT_READ|PROT_WRITE, MAP_SHARED, + m_file_desc, 0); + if(m_data_ptr == MAP_FAILED) + std::cerr << "Error: mmapping" << std::endl; + return (pointer)m_data_ptr; + } else { + size_t map_offset = (m_data_offset / m_page_size) * m_page_size; + size_t relative_offset = m_data_offset - map_offset; + + size_t map_size = m_map_size + relative_offset; + + m_data_ptr = (char*)mmap(0, map_size, PROT_READ, MAP_SHARED, + m_file_desc, map_offset); + + return (pointer)(m_data_ptr + relative_offset); } + } + + void deallocate (pointer p, size_type num) { + if(!m_fixed) { + munmap(p, num * sizeof(T)); + } else { + size_t map_offset = (m_data_offset / m_page_size) * m_page_size; + size_t relative_offset = m_data_offset - map_offset; + munmap((pointer)((char*)p - relative_offset), num * sizeof(T)); + } + + } + + void construct (pointer p, const T& value) { + if(!m_fixed) + new(p) value_type(value); + } + void destroy (pointer p) { + if(!m_fixed) + p->~T(); + } + + template + friend bool operator== (const MmapAllocator&, const MmapAllocator&) throw(); + + template + friend bool operator!= (const MmapAllocator&, const MmapAllocator&) throw(); +}; + +template +bool operator== (const MmapAllocator& a1, + const MmapAllocator& a2) throw() +{ + bool equal = true; + equal &= a1.m_file_ptr == a2.m_file_ptr; + equal &= a1.m_file_desc == a2.m_file_desc; + equal &= a1.m_page_size == a2.m_page_size; + equal &= a1.m_map_size == a2.m_map_size; + equal &= a1.m_data_ptr == a2.m_data_ptr; + equal &= a1.m_data_offset == a2.m_data_offset; + equal &= a1.m_fixed == a2.m_fixed; + return equal; +} + +template +bool operator!=(const MmapAllocator& a1, + const MmapAllocator& a2) throw() +{ + return !(a1 == a2); +} } diff --git a/moses/TranslationModel/CompactPT/MonotonicVector.h b/moses/TranslationModel/CompactPT/MonotonicVector.h index a4423c369..5e965d3e5 100644 --- a/moses/TranslationModel/CompactPT/MonotonicVector.h +++ b/moses/TranslationModel/CompactPT/MonotonicVector.h @@ -1,23 +1,23 @@ -// $Id$ -// vim:tabstop=2 -/*********************************************************************** -Moses - factored phrase-based language decoder -Copyright (C) 2006 University of Edinburgh - -This library is free software; you can redistribute it and/or -modify it under the terms of the GNU Lesser General Public -License as published by the Free Software Foundation; either -version 2.1 of the License, or (at your option) any later version. - -This library is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -Lesser General Public License for more details. - -You should have received a copy of the GNU Lesser General Public -License along with this library; if not, write to the Free Software -Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -***********************************************************************/ +// $Id$ +// vim:tabstop=2 +/*********************************************************************** +Moses - factored phrase-based language decoder +Copyright (C) 2006 University of Edinburgh + +This library is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License as published by the Free Software Foundation; either +version 2.1 of the License, or (at your option) any later version. 
+ +This library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Lesser General Public License for more details. + +You should have received a copy of the GNU Lesser General Public +License along with this library; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ #ifndef moses_MonotonicVector_h #define moses_MonotonicVector_h @@ -43,206 +43,187 @@ namespace Moses { template class Allocator = std::allocator> + template class Allocator = std::allocator> class MonotonicVector { - private: - typedef std::vector > Anchors; - typedef std::vector > Diffs; - - Anchors m_anchors; - Diffs m_diffs; - std::vector m_tempDiffs; - - size_t m_size; - PosT m_last; - bool m_final; - - public: - typedef PosT value_type; - - MonotonicVector() : m_size(0), m_last(0), m_final(false) {} - - size_t size() const - { - return m_size + m_tempDiffs.size(); - } - - PosT at(size_t i) const - { - PosT s = stepSize; - PosT j = m_anchors[i / s]; - PosT r = i % s; - - typename Diffs::const_iterator it = m_diffs.begin() + j; - - PosT k = 0; - k += VarInt32::DecodeAndSum(it, m_diffs.end(), 1); - if(i < m_size) - k += Simple9::DecodeAndSum(it, m_diffs.end(), r); - else if(i < m_size + m_tempDiffs.size()) - for(size_t l = 0; l < r; l++) - k += m_tempDiffs[l]; - - return k; - } - - PosT operator[](PosT i) const - { - return at(i); - } - - PosT back() const - { - return at(size()-1); - } - - void push_back(PosT i) - { - assert(m_final != true); - - if(m_anchors.size() == 0 && m_tempDiffs.size() == 0) - { - m_anchors.push_back(0); - VarInt32::Encode(&i, &i+1, std::back_inserter(m_diffs)); - m_last = i; - m_size++; - - return; - } - - if(m_tempDiffs.size() == stepSize-1) - { - Simple9::Encode(m_tempDiffs.begin(), m_tempDiffs.end(), - std::back_inserter(m_diffs)); - m_anchors.push_back(m_diffs.size()); - VarInt32::Encode(&i, &i+1, std::back_inserter(m_diffs)); - - m_size += m_tempDiffs.size() + 1; - m_tempDiffs.clear(); - } - else - { - PosT last = m_last; - PosT diff = i - last; - m_tempDiffs.push_back(diff); - } +private: + typedef std::vector > Anchors; + typedef std::vector > Diffs; + + Anchors m_anchors; + Diffs m_diffs; + std::vector m_tempDiffs; + + size_t m_size; + PosT m_last; + bool m_final; + +public: + typedef PosT value_type; + + MonotonicVector() : m_size(0), m_last(0), m_final(false) {} + + size_t size() const { + return m_size + m_tempDiffs.size(); + } + + PosT at(size_t i) const { + PosT s = stepSize; + PosT j = m_anchors[i / s]; + PosT r = i % s; + + typename Diffs::const_iterator it = m_diffs.begin() + j; + + PosT k = 0; + k += VarInt32::DecodeAndSum(it, m_diffs.end(), 1); + if(i < m_size) + k += Simple9::DecodeAndSum(it, m_diffs.end(), r); + else if(i < m_size + m_tempDiffs.size()) + for(size_t l = 0; l < r; l++) + k += m_tempDiffs[l]; + + return k; + } + + PosT operator[](PosT i) const { + return at(i); + } + + PosT back() const { + return at(size()-1); + } + + void push_back(PosT i) { + assert(m_final != true); + + if(m_anchors.size() == 0 && m_tempDiffs.size() == 0) { + m_anchors.push_back(0); + VarInt32::Encode(&i, &i+1, std::back_inserter(m_diffs)); m_last = i; + m_size++; + + return; } - - void commit() - { - assert(m_final != true); + + if(m_tempDiffs.size() == stepSize-1) { Simple9::Encode(m_tempDiffs.begin(), m_tempDiffs.end(), 
std::back_inserter(m_diffs)); - m_size += m_tempDiffs.size(); + m_anchors.push_back(m_diffs.size()); + VarInt32::Encode(&i, &i+1, std::back_inserter(m_diffs)); + + m_size += m_tempDiffs.size() + 1; m_tempDiffs.clear(); - m_final = true; + } else { + PosT last = m_last; + PosT diff = i - last; + m_tempDiffs.push_back(diff); } - - size_t usage() - { - return m_diffs.size() * sizeof(unsigned int) - + m_anchors.size() * sizeof(NumT); - } - - size_t load(std::FILE* in, bool map = false) - { - size_t byteSize = 0; - - byteSize += fread(&m_final, sizeof(bool), 1, in) * sizeof(bool); - byteSize += fread(&m_size, sizeof(size_t), 1, in) * sizeof(size_t); - byteSize += fread(&m_last, sizeof(PosT), 1, in) * sizeof(PosT); - - byteSize += loadVector(m_diffs, in, map); - byteSize += loadVector(m_anchors, in, map); - - return byteSize; - } - - template - size_t loadVector(std::vector >& v, - std::FILE* in, bool map = false) - { - // Can only be read into memory. Mapping not possible with std:allocator. - assert(map == false); - - size_t byteSize = 0; - - size_t valSize; - byteSize += std::fread(&valSize, sizeof(size_t), 1, in) * sizeof(size_t); - + m_last = i; + } + + void commit() { + assert(m_final != true); + Simple9::Encode(m_tempDiffs.begin(), m_tempDiffs.end(), + std::back_inserter(m_diffs)); + m_size += m_tempDiffs.size(); + m_tempDiffs.clear(); + m_final = true; + } + + size_t usage() { + return m_diffs.size() * sizeof(unsigned int) + + m_anchors.size() * sizeof(NumT); + } + + size_t load(std::FILE* in, bool map = false) { + size_t byteSize = 0; + + byteSize += fread(&m_final, sizeof(bool), 1, in) * sizeof(bool); + byteSize += fread(&m_size, sizeof(size_t), 1, in) * sizeof(size_t); + byteSize += fread(&m_last, sizeof(PosT), 1, in) * sizeof(PosT); + + byteSize += loadVector(m_diffs, in, map); + byteSize += loadVector(m_anchors, in, map); + + return byteSize; + } + + template + size_t loadVector(std::vector >& v, + std::FILE* in, bool map = false) { + // Can only be read into memory. Mapping not possible with std:allocator. + assert(map == false); + + size_t byteSize = 0; + + size_t valSize; + byteSize += std::fread(&valSize, sizeof(size_t), 1, in) * sizeof(size_t); + + v.resize(valSize, 0); + byteSize += std::fread(&v[0], sizeof(ValueT), valSize, in) * sizeof(ValueT); + + return byteSize; + } + + template + size_t loadVector(std::vector >& v, + std::FILE* in, bool map = false) { + size_t byteSize = 0; + + size_t valSize; + byteSize += std::fread(&valSize, sizeof(size_t), 1, in) * sizeof(size_t); + + if(map == false) { + // Read data into temporary file (default constructor of MmapAllocator) + // and map memory onto temporary file. Can be resized. + v.resize(valSize, 0); byteSize += std::fread(&v[0], sizeof(ValueT), valSize, in) * sizeof(ValueT); - - return byteSize; - } - - template - size_t loadVector(std::vector >& v, - std::FILE* in, bool map = false) - { - size_t byteSize = 0; + } else { + // Map it directly on specified region of file "in" starting at valPos + // with length valSize * sizeof(ValueT). Mapped region cannot be resized. - size_t valSize; - byteSize += std::fread(&valSize, sizeof(size_t), 1, in) * sizeof(size_t); + size_t valPos = std::ftell(in); - if(map == false) - { - // Read data into temporary file (default constructor of MmapAllocator) - // and map memory onto temporary file. Can be resized. 
- - v.resize(valSize, 0); - byteSize += std::fread(&v[0], sizeof(ValueT), valSize, in) * sizeof(ValueT); - } - else - { - // Map it directly on specified region of file "in" starting at valPos - // with length valSize * sizeof(ValueT). Mapped region cannot be resized. - - size_t valPos = std::ftell(in); - - Allocator alloc(in, valPos); - std::vector > vTemp(alloc); - vTemp.resize(valSize); - v.swap(vTemp); - - std::fseek(in, valSize * sizeof(ValueT), SEEK_CUR); - byteSize += valSize * sizeof(ValueT); - } - - return byteSize; - } - - size_t save(std::FILE* out) - { - if(!m_final) - commit(); - - bool byteSize = 0; - byteSize += ThrowingFwrite(&m_final, sizeof(bool), 1, out) * sizeof(bool); - byteSize += ThrowingFwrite(&m_size, sizeof(size_t), 1, out) * sizeof(size_t); - byteSize += ThrowingFwrite(&m_last, sizeof(PosT), 1, out) * sizeof(PosT); - - size_t size = m_diffs.size(); - byteSize += ThrowingFwrite(&size, sizeof(size_t), 1, out) * sizeof(size_t); - byteSize += ThrowingFwrite(&m_diffs[0], sizeof(unsigned int), size, out) * sizeof(unsigned int); - - size = m_anchors.size(); - byteSize += ThrowingFwrite(&size, sizeof(size_t), 1, out) * sizeof(size_t); - byteSize += ThrowingFwrite(&m_anchors[0], sizeof(NumT), size, out) * sizeof(NumT); - - return byteSize; - } - - void swap(MonotonicVector &mv) - { - if(!m_final) - commit(); - - m_diffs.swap(mv.m_diffs); - m_anchors.swap(mv.m_anchors); + Allocator alloc(in, valPos); + std::vector > vTemp(alloc); + vTemp.resize(valSize); + v.swap(vTemp); + + std::fseek(in, valSize * sizeof(ValueT), SEEK_CUR); + byteSize += valSize * sizeof(ValueT); } + + return byteSize; + } + + size_t save(std::FILE* out) { + if(!m_final) + commit(); + + bool byteSize = 0; + byteSize += ThrowingFwrite(&m_final, sizeof(bool), 1, out) * sizeof(bool); + byteSize += ThrowingFwrite(&m_size, sizeof(size_t), 1, out) * sizeof(size_t); + byteSize += ThrowingFwrite(&m_last, sizeof(PosT), 1, out) * sizeof(PosT); + + size_t size = m_diffs.size(); + byteSize += ThrowingFwrite(&size, sizeof(size_t), 1, out) * sizeof(size_t); + byteSize += ThrowingFwrite(&m_diffs[0], sizeof(unsigned int), size, out) * sizeof(unsigned int); + + size = m_anchors.size(); + byteSize += ThrowingFwrite(&size, sizeof(size_t), 1, out) * sizeof(size_t); + byteSize += ThrowingFwrite(&m_anchors[0], sizeof(NumT), size, out) * sizeof(NumT); + + return byteSize; + } + + void swap(MonotonicVector &mv) { + if(!m_final) + commit(); + + m_diffs.swap(mv.m_diffs); + m_anchors.swap(mv.m_anchors); + } }; } diff --git a/moses/TranslationModel/CompactPT/MurmurHash3.cpp b/moses/TranslationModel/CompactPT/MurmurHash3.cpp index 0bf738662..d16cd9502 100644 --- a/moses/TranslationModel/CompactPT/MurmurHash3.cpp +++ b/moses/TranslationModel/CompactPT/MurmurHash3.cpp @@ -107,16 +107,15 @@ void MurmurHash3_x86_32 ( const void * key, int len, const uint32_t * blocks = (const uint32_t *)(data + nblocks*4); - for(int i = -nblocks; i; i++) - { + for(int i = -nblocks; i; i++) { uint32_t k1 = getblock(blocks,i); k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; - + h1 ^= k1; - h1 = ROTL32(h1,13); + h1 = ROTL32(h1,13); h1 = h1*5+0xe6546b64; } @@ -127,12 +126,17 @@ void MurmurHash3_x86_32 ( const void * key, int len, uint32_t k1 = 0; - switch(len & 3) - { - case 3: k1 ^= tail[2] << 16; - case 2: k1 ^= tail[1] << 8; - case 1: k1 ^= tail[0]; - k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; + switch(len & 3) { + case 3: + k1 ^= tail[2] << 16; + case 2: + k1 ^= tail[1] << 8; + case 1: + k1 ^= tail[0]; + k1 *= c1; + k1 = ROTL32(k1,15); + k1 *= c2; + h1 
^= k1; }; //---------- @@ -143,7 +147,7 @@ void MurmurHash3_x86_32 ( const void * key, int len, h1 = fmix(h1); *(uint32_t*)out = h1; -} +} //----------------------------------------------------------------------------- @@ -158,9 +162,9 @@ void MurmurHash3_x86_128 ( const void * key, const int len, uint32_t h3 = seed; uint32_t h4 = seed; - uint32_t c1 = 0x239b961b; + uint32_t c1 = 0x239b961b; uint32_t c2 = 0xab0e9789; - uint32_t c3 = 0x38b34ae5; + uint32_t c3 = 0x38b34ae5; uint32_t c4 = 0xa1e38b93; //---------- @@ -168,28 +172,47 @@ void MurmurHash3_x86_128 ( const void * key, const int len, const uint32_t * blocks = (const uint32_t *)(data + nblocks*16); - for(int i = -nblocks; i; i++) - { + for(int i = -nblocks; i; i++) { uint32_t k1 = getblock(blocks,i*4+0); uint32_t k2 = getblock(blocks,i*4+1); uint32_t k3 = getblock(blocks,i*4+2); uint32_t k4 = getblock(blocks,i*4+3); - k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; + k1 *= c1; + k1 = ROTL32(k1,15); + k1 *= c2; + h1 ^= k1; - h1 = ROTL32(h1,19); h1 += h2; h1 = h1*5+0x561ccd1b; + h1 = ROTL32(h1,19); + h1 += h2; + h1 = h1*5+0x561ccd1b; - k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2; + k2 *= c2; + k2 = ROTL32(k2,16); + k2 *= c3; + h2 ^= k2; - h2 = ROTL32(h2,17); h2 += h3; h2 = h2*5+0x0bcaa747; + h2 = ROTL32(h2,17); + h2 += h3; + h2 = h2*5+0x0bcaa747; - k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3; + k3 *= c3; + k3 = ROTL32(k3,17); + k3 *= c4; + h3 ^= k3; - h3 = ROTL32(h3,15); h3 += h4; h3 = h3*5+0x96cd1c35; + h3 = ROTL32(h3,15); + h3 += h4; + h3 = h3*5+0x96cd1c35; - k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4; + k4 *= c4; + k4 = ROTL32(k4,18); + k4 *= c1; + h4 ^= k4; - h4 = ROTL32(h4,13); h4 += h1; h4 = h4*5+0x32ac3b17; + h4 = ROTL32(h4,13); + h4 += h1; + h4 = h4*5+0x32ac3b17; } //---------- @@ -202,47 +225,84 @@ void MurmurHash3_x86_128 ( const void * key, const int len, uint32_t k3 = 0; uint32_t k4 = 0; - switch(len & 15) - { - case 15: k4 ^= tail[14] << 16; - case 14: k4 ^= tail[13] << 8; - case 13: k4 ^= tail[12] << 0; - k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4; + switch(len & 15) { + case 15: + k4 ^= tail[14] << 16; + case 14: + k4 ^= tail[13] << 8; + case 13: + k4 ^= tail[12] << 0; + k4 *= c4; + k4 = ROTL32(k4,18); + k4 *= c1; + h4 ^= k4; - case 12: k3 ^= tail[11] << 24; - case 11: k3 ^= tail[10] << 16; - case 10: k3 ^= tail[ 9] << 8; - case 9: k3 ^= tail[ 8] << 0; - k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3; + case 12: + k3 ^= tail[11] << 24; + case 11: + k3 ^= tail[10] << 16; + case 10: + k3 ^= tail[ 9] << 8; + case 9: + k3 ^= tail[ 8] << 0; + k3 *= c3; + k3 = ROTL32(k3,17); + k3 *= c4; + h3 ^= k3; - case 8: k2 ^= tail[ 7] << 24; - case 7: k2 ^= tail[ 6] << 16; - case 6: k2 ^= tail[ 5] << 8; - case 5: k2 ^= tail[ 4] << 0; - k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2; + case 8: + k2 ^= tail[ 7] << 24; + case 7: + k2 ^= tail[ 6] << 16; + case 6: + k2 ^= tail[ 5] << 8; + case 5: + k2 ^= tail[ 4] << 0; + k2 *= c2; + k2 = ROTL32(k2,16); + k2 *= c3; + h2 ^= k2; - case 4: k1 ^= tail[ 3] << 24; - case 3: k1 ^= tail[ 2] << 16; - case 2: k1 ^= tail[ 1] << 8; - case 1: k1 ^= tail[ 0] << 0; - k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; + case 4: + k1 ^= tail[ 3] << 24; + case 3: + k1 ^= tail[ 2] << 16; + case 2: + k1 ^= tail[ 1] << 8; + case 1: + k1 ^= tail[ 0] << 0; + k1 *= c1; + k1 = ROTL32(k1,15); + k1 *= c2; + h1 ^= k1; }; //---------- // finalization - h1 ^= len; h2 ^= len; h3 ^= len; h4 ^= len; + h1 ^= len; + h2 ^= len; + h3 ^= len; + h4 ^= len; - h1 += h2; h1 += h3; h1 += h4; - h2 += h1; h3 += 
h1; h4 += h1; + h1 += h2; + h1 += h3; + h1 += h4; + h2 += h1; + h3 += h1; + h4 += h1; h1 = fmix(h1); h2 = fmix(h2); h3 = fmix(h3); h4 = fmix(h4); - h1 += h2; h1 += h3; h1 += h4; - h2 += h1; h3 += h1; h4 += h1; + h1 += h2; + h1 += h3; + h1 += h4; + h2 += h1; + h3 += h1; + h4 += h1; ((uint32_t*)out)[0] = h1; ((uint32_t*)out)[1] = h2; @@ -269,18 +329,27 @@ void MurmurHash3_x64_128 ( const void * key, const int len, const uint64_t * blocks = (const uint64_t *)(data); - for(int i = 0; i < nblocks; i++) - { + for(int i = 0; i < nblocks; i++) { uint64_t k1 = getblock(blocks,i*2+0); uint64_t k2 = getblock(blocks,i*2+1); - k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1; + k1 *= c1; + k1 = ROTL64(k1,31); + k1 *= c2; + h1 ^= k1; - h1 = ROTL64(h1,27); h1 += h2; h1 = h1*5+0x52dce729; + h1 = ROTL64(h1,27); + h1 += h2; + h1 = h1*5+0x52dce729; - k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2; + k2 *= c2; + k2 = ROTL64(k2,33); + k2 *= c1; + h2 ^= k2; - h2 = ROTL64(h2,31); h2 += h1; h2 = h2*5+0x38495ab5; + h2 = ROTL64(h2,31); + h2 += h1; + h2 = h2*5+0x38495ab5; } //---------- @@ -291,32 +360,53 @@ void MurmurHash3_x64_128 ( const void * key, const int len, uint64_t k1 = 0; uint64_t k2 = 0; - switch(len & 15) - { - case 15: k2 ^= uint64_t(tail[14]) << 48; - case 14: k2 ^= uint64_t(tail[13]) << 40; - case 13: k2 ^= uint64_t(tail[12]) << 32; - case 12: k2 ^= uint64_t(tail[11]) << 24; - case 11: k2 ^= uint64_t(tail[10]) << 16; - case 10: k2 ^= uint64_t(tail[ 9]) << 8; - case 9: k2 ^= uint64_t(tail[ 8]) << 0; - k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2; + switch(len & 15) { + case 15: + k2 ^= uint64_t(tail[14]) << 48; + case 14: + k2 ^= uint64_t(tail[13]) << 40; + case 13: + k2 ^= uint64_t(tail[12]) << 32; + case 12: + k2 ^= uint64_t(tail[11]) << 24; + case 11: + k2 ^= uint64_t(tail[10]) << 16; + case 10: + k2 ^= uint64_t(tail[ 9]) << 8; + case 9: + k2 ^= uint64_t(tail[ 8]) << 0; + k2 *= c2; + k2 = ROTL64(k2,33); + k2 *= c1; + h2 ^= k2; - case 8: k1 ^= uint64_t(tail[ 7]) << 56; - case 7: k1 ^= uint64_t(tail[ 6]) << 48; - case 6: k1 ^= uint64_t(tail[ 5]) << 40; - case 5: k1 ^= uint64_t(tail[ 4]) << 32; - case 4: k1 ^= uint64_t(tail[ 3]) << 24; - case 3: k1 ^= uint64_t(tail[ 2]) << 16; - case 2: k1 ^= uint64_t(tail[ 1]) << 8; - case 1: k1 ^= uint64_t(tail[ 0]) << 0; - k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1; + case 8: + k1 ^= uint64_t(tail[ 7]) << 56; + case 7: + k1 ^= uint64_t(tail[ 6]) << 48; + case 6: + k1 ^= uint64_t(tail[ 5]) << 40; + case 5: + k1 ^= uint64_t(tail[ 4]) << 32; + case 4: + k1 ^= uint64_t(tail[ 3]) << 24; + case 3: + k1 ^= uint64_t(tail[ 2]) << 16; + case 2: + k1 ^= uint64_t(tail[ 1]) << 8; + case 1: + k1 ^= uint64_t(tail[ 0]) << 0; + k1 *= c1; + k1 = ROTL64(k1,31); + k1 *= c2; + h1 ^= k1; }; //---------- // finalization - h1 ^= len; h2 ^= len; + h1 ^= len; + h2 ^= len; h1 += h2; h2 += h1; diff --git a/moses/TranslationModel/CompactPT/PackedArray.h b/moses/TranslationModel/CompactPT/PackedArray.h index ad4596546..479c2cc79 100644 --- a/moses/TranslationModel/CompactPT/PackedArray.h +++ b/moses/TranslationModel/CompactPT/PackedArray.h @@ -1,23 +1,23 @@ -// $Id$ -// vim:tabstop=2 -/*********************************************************************** -Moses - factored phrase-based language decoder -Copyright (C) 2006 University of Edinburgh - -This library is free software; you can redistribute it and/or -modify it under the terms of the GNU Lesser General Public -License as published by the Free Software Foundation; either -version 2.1 of the License, or (at your option) any 
later version. - -This library is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -Lesser General Public License for more details. - -You should have received a copy of the GNU Lesser General Public -License along with this library; if not, write to the Free Software -Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -***********************************************************************/ +// $Id$ +// vim:tabstop=2 +/*********************************************************************** +Moses - factored phrase-based language decoder +Copyright (C) 2006 University of Edinburgh + +This library is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License as published by the Free Software Foundation; either +version 2.1 of the License, or (at your option) any later version. + +This library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Lesser General Public License for more details. + +You should have received a copy of the GNU Lesser General Public +License along with this library; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ #ifndef moses_PackedArray_h #define moses_PackedArray_h @@ -35,128 +35,117 @@ namespace Moses template class PackedArray { - protected: - static size_t m_dataBits; - - size_t m_size; - size_t m_storageSize; - D* m_storage; - - public: - PackedArray() - { - m_size = 0; - m_storageSize = 0; - m_storage = new D[0]; - } - - PackedArray(size_t size, size_t bits) : m_size(size) - { - m_storageSize = ceil(float(bits * size) / float(m_dataBits)); - m_storage = new D[m_storageSize]; - } - - PackedArray(const PackedArray &c) - { - m_size = c.m_size; - - m_storageSize = c.m_storageSize; - m_storage = new D[m_storageSize]; - - std::memcpy(m_storage, c.m_storage, m_storageSize * sizeof(D)); - } - - virtual ~PackedArray() - { - delete [] m_storage; - m_size = 0; - m_storageSize = 0; - m_storage = 0; - } - - T Get(size_t i, size_t bits) const - { - T out = 0; - - size_t bitstart = (i * bits); - size_t bitpos = bitstart; - - size_t zero = ((1ul << (bits)) - 1); - - while(bitpos - bitstart < bits) { - size_t pos = bitpos / m_dataBits; - size_t off = bitpos % m_dataBits; - - out |= (T(m_storage[pos]) << (bitpos - bitstart)) >> off; +protected: + static size_t m_dataBits; - bitpos += (m_dataBits - off); - } - - out &= zero; - return out; + size_t m_size; + size_t m_storageSize; + D* m_storage; + +public: + PackedArray() { + m_size = 0; + m_storageSize = 0; + m_storage = new D[0]; + } + + PackedArray(size_t size, size_t bits) : m_size(size) { + m_storageSize = ceil(float(bits * size) / float(m_dataBits)); + m_storage = new D[m_storageSize]; + } + + PackedArray(const PackedArray &c) { + m_size = c.m_size; + + m_storageSize = c.m_storageSize; + m_storage = new D[m_storageSize]; + + std::memcpy(m_storage, c.m_storage, m_storageSize * sizeof(D)); + } + + virtual ~PackedArray() { + delete [] m_storage; + m_size = 0; + m_storageSize = 0; + m_storage = 0; + } + + T Get(size_t i, size_t bits) const { + T out = 0; + + size_t bitstart = (i * bits); + size_t bitpos = bitstart; + + size_t zero = ((1ul << (bits)) - 1); 
+
+    while(bitpos - bitstart < bits) {
+      size_t pos = bitpos / m_dataBits;
+      size_t off = bitpos % m_dataBits;
+
+      out |= (T(m_storage[pos]) << (bitpos - bitstart)) >> off;
+
+      bitpos += (m_dataBits - off);
    }
-
-    void Set(size_t i, T v, size_t bits)
-    {
-      size_t bitstart = (i * bits);
-      size_t bitpos = bitstart;
-
-      while(bitpos - bitstart < bits) {
-        size_t pos = bitpos / m_dataBits;
-        size_t off = bitpos % m_dataBits;
-
-        size_t rest = bits - (bitpos - bitstart);
-        D zero = ~((1ul << (rest + off)) - 1) | ((1ul << off) - 1);
-
-        m_storage[pos] &= zero;
-        m_storage[pos] |= v << off;
-        v = v >> (m_dataBits - off);
-        bitpos += (m_dataBits - off);
-      }
+
+    out &= zero;
+    return out;
+  }
+
+  void Set(size_t i, T v, size_t bits) {
+    size_t bitstart = (i * bits);
+    size_t bitpos = bitstart;
+
+    while(bitpos - bitstart < bits) {
+      size_t pos = bitpos / m_dataBits;
+      size_t off = bitpos % m_dataBits;
+
+      size_t rest = bits - (bitpos - bitstart);
+      D zero = ~((1ul << (rest + off)) - 1) | ((1ul << off) - 1);
+
+      m_storage[pos] &= zero;
+      m_storage[pos] |= v << off;
+      v = v >> (m_dataBits - off);
+      bitpos += (m_dataBits - off);
    }
-
-    virtual D*& GetStorage()
-    {
-      return m_storage;
-    }
-
-    virtual size_t GetStorageSize() const
-    {
-      return m_storageSize;
-    }
-
-    virtual size_t Size() const
-    {
-      return m_size;
-    }
-
-    virtual size_t Load(std::FILE* in)
-    {
-      size_t a1 = std::ftell(in);
-
-      size_t read = 0;
-      read += std::fread(&m_size, sizeof(m_size), 1, in);
-      read += std::fread(&m_storageSize, sizeof(m_storageSize), 1, in);
-      delete [] m_storage;
-      m_storage = new D[m_storageSize];
-      read += std::fread(m_storage, sizeof(D), m_storageSize, in);
-
-      size_t a2 = std::ftell(in);
-      return a2 - a1;
-    }
-
-    virtual size_t Save(std::FILE* out)
-    {
-      size_t a1 = std::ftell(out);
-
-      ThrowingFwrite(&m_size, sizeof(m_size), 1, out);
-      ThrowingFwrite(&m_storageSize, sizeof(m_storageSize), 1, out);
-      ThrowingFwrite(m_storage, sizeof(D), m_storageSize, out);
-
-      size_t a2 = std::ftell(out);
-      return a2 - a1;
-    }
-
+  }
+
+  virtual D*& GetStorage() {
+    return m_storage;
+  }
+
+  virtual size_t GetStorageSize() const {
+    return m_storageSize;
+  }
+
+  virtual size_t Size() const {
+    return m_size;
+  }
+
+  virtual size_t Load(std::FILE* in) {
+    size_t a1 = std::ftell(in);
+
+    size_t read = 0;
+    read += std::fread(&m_size, sizeof(m_size), 1, in);
+    read += std::fread(&m_storageSize, sizeof(m_storageSize), 1, in);
+    delete [] m_storage;
+    m_storage = new D[m_storageSize];
+    read += std::fread(m_storage, sizeof(D), m_storageSize, in);
+
+    size_t a2 = std::ftell(in);
+    return a2 - a1;
+  }
+
+  virtual size_t Save(std::FILE* out) {
+    size_t a1 = std::ftell(out);
+
+    ThrowingFwrite(&m_size, sizeof(m_size), 1, out);
+    ThrowingFwrite(&m_storageSize, sizeof(m_storageSize), 1, out);
+    ThrowingFwrite(m_storage, sizeof(D), m_storageSize, out);
+
+    size_t a2 = std::ftell(out);
+    return a2 - a1;
+  }
+
 };
 
 template <typename T, typename D>
@@ -166,34 +155,31 @@ size_t PackedArray<T, D>::m_dataBits = sizeof(D)*8;
 
 template <typename T = size_t, typename D = unsigned char>
 class PairedPackedArray : public PackedArray<T, D>
-{
-  public:
-    PairedPackedArray() : PackedArray<T, D>() {}
-
-    PairedPackedArray(size_t size, size_t bits1, size_t bits2)
+{
+public:
+  PairedPackedArray() : PackedArray<T, D>() {}
+
+  PairedPackedArray(size_t size, size_t bits1, size_t bits2)
     : PackedArray<T, D>(size, bits1 + bits2) { }
-
-    void Set(size_t i, T a, T b, size_t bits1, size_t bits2)
-    {
-      T c = 0;
-      c = a | (b << bits1);
-      PackedArray<T, D>::Set(i, c, bits1 + bits2);
-    }
-
-    void Set(size_t i, std::pair<T, T> p, size_t bits1, size_t bits2)
-    {
-      T c = 0;
-      c = p.second | (p.first << bits1);
-      PackedArray<T, D>::Set(i, c);
-    }
-
-    std::pair<T, T> Get(size_t i, size_t bits1, size_t bits2)
-    {
-      T v = PackedArray<T, D>::Get(i, bits1 + bits2);
-      T a = v & ((1 << bits1) - 1);
-      T b = v >> bits1;
-      return std::pair<T, T>(a, b);
-    }
+
+  void Set(size_t i, T a, T b, size_t bits1, size_t bits2) {
+    T c = 0;
+    c = a | (b << bits1);
+    PackedArray<T, D>::Set(i, c, bits1 + bits2);
+  }
+
+  void Set(size_t i, std::pair<T, T> p, size_t bits1, size_t bits2) {
+    T c = 0;
+    c = p.second | (p.first << bits1);
+    PackedArray<T, D>::Set(i, c, bits1 + bits2); // the combined width must be passed through, as in the two-value Set() above
+  }
+
+  std::pair<T, T> Get(size_t i, size_t bits1, size_t bits2) {
+    T v = PackedArray<T, D>::Get(i, bits1 + bits2);
+    T a = v & ((1 << bits1) - 1);
+    T b = v >> bits1;
+    return std::pair<T, T>(a, b);
+  }
 };
 
 }
diff --git a/moses/TranslationModel/CompactPT/PhraseDecoder.cpp b/moses/TranslationModel/CompactPT/PhraseDecoder.cpp
index 03b3f6825..085a7337c 100644
--- a/moses/TranslationModel/CompactPT/PhraseDecoder.cpp
+++ b/moses/TranslationModel/CompactPT/PhraseDecoder.cpp
@@ -1,23 +1,23 @@
-// $Id$
-// vim:tabstop=2
-/***********************************************************************
-Moses - factored phrase-based language decoder
-Copyright (C) 2006 University of Edinburgh
-
-This library is free software; you can redistribute it and/or
-modify it under the terms of the GNU Lesser General Public
-License as published by the Free Software Foundation; either
-version 2.1 of the License, or (at your option) any later version.
-
-This library is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-Lesser General Public License for more details.
-
-You should have received a copy of the GNU Lesser General Public
-License along with this library; if not, write to the Free Software
-Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-***********************************************************************/
+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+ +You should have received a copy of the GNU Lesser General Public +License along with this library; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ #include @@ -37,23 +37,23 @@ PhraseDecoder::PhraseDecoder( const std::vector* weight ) : m_coding(None), m_numScoreComponent(numScoreComponent), - m_containsAlignmentInfo(true), m_maxRank(0), - m_symbolTree(0), m_multipleScoreTrees(false), - m_scoreTrees(1), m_alignTree(0), - m_phraseDictionary(phraseDictionary), m_input(input), m_output(output), - m_weight(weight), - m_separator(" ||| ") + m_containsAlignmentInfo(true), m_maxRank(0), + m_symbolTree(0), m_multipleScoreTrees(false), + m_scoreTrees(1), m_alignTree(0), + m_phraseDictionary(phraseDictionary), m_input(input), m_output(output), + m_weight(weight), + m_separator(" ||| ") { } PhraseDecoder::~PhraseDecoder() { if(m_symbolTree) delete m_symbolTree; - + for(size_t i = 0; i < m_scoreTrees.size(); i++) if(m_scoreTrees[i]) delete m_scoreTrees[i]; - + if(m_alignTree) delete m_alignTree; } @@ -61,10 +61,10 @@ PhraseDecoder::~PhraseDecoder() inline unsigned PhraseDecoder::GetSourceSymbolId(std::string& symbol) { boost::unordered_map::iterator it - = m_sourceSymbolsMap.find(symbol); + = m_sourceSymbolsMap.find(symbol); if(it != m_sourceSymbolsMap.end()) return it->second; - + size_t idx = m_sourceSymbols.find(symbol); m_sourceSymbolsMap[symbol] = idx; return idx; @@ -144,76 +144,70 @@ size_t PhraseDecoder::Load(std::FILE* in) { size_t start = std::ftell(in); size_t read = 0; - + read += std::fread(&m_coding, sizeof(m_coding), 1, in); read += std::fread(&m_numScoreComponent, sizeof(m_numScoreComponent), 1, in); read += std::fread(&m_containsAlignmentInfo, sizeof(m_containsAlignmentInfo), 1, in); read += std::fread(&m_maxRank, sizeof(m_maxRank), 1, in); read += std::fread(&m_maxPhraseLength, sizeof(m_maxPhraseLength), 1, in); - - if(m_coding == REnc) - { + + if(m_coding == REnc) { m_sourceSymbols.load(in); - + size_t size; read += std::fread(&size, sizeof(size_t), 1, in); m_lexicalTableIndex.resize(size); read += std::fread(&m_lexicalTableIndex[0], sizeof(size_t), size, in); - + read += std::fread(&size, sizeof(size_t), 1, in); m_lexicalTable.resize(size); read += std::fread(&m_lexicalTable[0], sizeof(SrcTrg), size, in); } - + m_targetSymbols.load(in); - + m_symbolTree = new CanonicalHuffman(in); - + read += std::fread(&m_multipleScoreTrees, sizeof(m_multipleScoreTrees), 1, in); - if(m_multipleScoreTrees) - { + if(m_multipleScoreTrees) { m_scoreTrees.resize(m_numScoreComponent); for(size_t i = 0; i < m_numScoreComponent; i++) m_scoreTrees[i] = new CanonicalHuffman(in); - } - else - { + } else { m_scoreTrees.resize(1); m_scoreTrees[0] = new CanonicalHuffman(in); } - + if(m_containsAlignmentInfo) m_alignTree = new CanonicalHuffman(in); - + size_t end = std::ftell(in); return end - start; } - + std::string PhraseDecoder::MakeSourceKey(std::string &source) { - return source + m_separator; + return source + m_separator; } - + TargetPhraseVectorPtr PhraseDecoder::CreateTargetPhraseCollection(const Phrase &sourcePhrase, bool topLevel) { - + // Not using TargetPhraseCollection avoiding "new" operator // which can introduce heavy locking with multiple threads TargetPhraseVectorPtr tpv(new TargetPhraseVector()); size_t bitsLeft = 0; - - if(m_coding == PREnc) - { + + if(m_coding == PREnc) { std::pair cachedPhraseColl - = m_decodingCache.Retrieve(sourcePhrase); - + = 
m_decodingCache.Retrieve(sourcePhrase); + // Has been cached and is complete or does not need to be completed if(cachedPhraseColl.first != NULL && (!topLevel || cachedPhraseColl.second == 0)) return cachedPhraseColl.first; - + // Has been cached, but is incomplete - else if(cachedPhraseColl.first != NULL) - { + else if(cachedPhraseColl.first != NULL) { bitsLeft = cachedPhraseColl.second; tpv->resize(cachedPhraseColl.first->size()); std::copy(cachedPhraseColl.first->begin(), @@ -221,220 +215,187 @@ TargetPhraseVectorPtr PhraseDecoder::CreateTargetPhraseCollection(const Phrase & tpv->begin()); } } - + // Retrieve source phrase identifier std::string sourcePhraseString = sourcePhrase.GetStringRep(*m_input); size_t sourcePhraseId = m_phraseDictionary.m_hash[MakeSourceKey(sourcePhraseString)]; - - if(sourcePhraseId != m_phraseDictionary.m_hash.GetSize()) - { - // Retrieve compressed and encoded target phrase collection + + if(sourcePhraseId != m_phraseDictionary.m_hash.GetSize()) { + // Retrieve compressed and encoded target phrase collection std::string encodedPhraseCollection; if(m_phraseDictionary.m_inMemory) encodedPhraseCollection = m_phraseDictionary.m_targetPhrasesMemory[sourcePhraseId]; else encodedPhraseCollection = m_phraseDictionary.m_targetPhrasesMapped[sourcePhraseId]; - + BitWrapper<> encodedBitStream(encodedPhraseCollection); if(m_coding == PREnc && bitsLeft) encodedBitStream.SeekFromEnd(bitsLeft); - + // Decompress and decode target phrase collection TargetPhraseVectorPtr decodedPhraseColl = DecodeCollection(tpv, encodedBitStream, sourcePhrase, topLevel); - + return decodedPhraseColl; - } - else - return TargetPhraseVectorPtr(); + } else + return TargetPhraseVectorPtr(); } - + TargetPhraseVectorPtr PhraseDecoder::DecodeCollection( TargetPhraseVectorPtr tpv, BitWrapper<> &encodedBitStream, const Phrase &sourcePhrase, bool topLevel) { - + bool extending = tpv->size(); size_t bitsLeft = encodedBitStream.TellFromEnd(); - + typedef std::pair AlignPointSizeT; - + std::vector sourceWords; - if(m_coding == REnc) - { - for(size_t i = 0; i < sourcePhrase.GetSize(); i++) - { + if(m_coding == REnc) { + for(size_t i = 0; i < sourcePhrase.GetSize(); i++) { std::string sourceWord - = sourcePhrase.GetWord(i).GetString(*m_input, false); + = sourcePhrase.GetWord(i).GetString(*m_input, false); unsigned idx = GetSourceSymbolId(sourceWord); sourceWords.push_back(idx); } } - + unsigned phraseStopSymbol = 0; AlignPoint alignStopSymbol(-1, -1); - + std::vector scores; std::set alignment; - + enum DecodeState { New, Symbol, Score, Alignment, Add } state = New; - + size_t srcSize = sourcePhrase.GetSize(); - + TargetPhrase* targetPhrase = NULL; - while(encodedBitStream.TellFromEnd()) - { - - if(state == New) - { + while(encodedBitStream.TellFromEnd()) { + + if(state == New) { // Creating new TargetPhrase on the heap tpv->push_back(TargetPhrase()); targetPhrase = &tpv->back(); - + targetPhrase->SetSourcePhrase(sourcePhrase); alignment.clear(); scores.clear(); - + state = Symbol; } - - if(state == Symbol) - { - unsigned symbol = m_symbolTree->Read(encodedBitStream); - if(symbol == phraseStopSymbol) - { + + if(state == Symbol) { + unsigned symbol = m_symbolTree->Read(encodedBitStream); + if(symbol == phraseStopSymbol) { state = Score; - } - else - { - if(m_coding == REnc) - { + } else { + if(m_coding == REnc) { std::string wordString; size_t type = GetREncType(symbol); - - if(type == 1) - { + + if(type == 1) { unsigned decodedSymbol = DecodeREncSymbol1(symbol); wordString = GetTargetSymbol(decodedSymbol); - 
} - else if (type == 2) - { + } else if (type == 2) { size_t rank = DecodeREncSymbol2Rank(symbol); size_t srcPos = DecodeREncSymbol2Position(symbol); - + if(srcPos >= sourceWords.size()) - return TargetPhraseVectorPtr(); - + return TargetPhraseVectorPtr(); + wordString = GetTargetSymbol(GetTranslation(sourceWords[srcPos], rank)); - if(m_phraseDictionary.m_useAlignmentInfo) - { + if(m_phraseDictionary.m_useAlignmentInfo) { size_t trgPos = targetPhrase->GetSize(); alignment.insert(AlignPoint(srcPos, trgPos)); } - } - else if(type == 3) - { + } else if(type == 3) { size_t rank = DecodeREncSymbol3(symbol); size_t srcPos = targetPhrase->GetSize(); - + if(srcPos >= sourceWords.size()) - return TargetPhraseVectorPtr(); - - wordString = GetTargetSymbol(GetTranslation(sourceWords[srcPos], rank)); - if(m_phraseDictionary.m_useAlignmentInfo) - { + return TargetPhraseVectorPtr(); + + wordString = GetTargetSymbol(GetTranslation(sourceWords[srcPos], rank)); + if(m_phraseDictionary.m_useAlignmentInfo) { size_t trgPos = srcPos; alignment.insert(AlignPoint(srcPos, trgPos)); } } - + Word word; word.CreateFromString(Output, *m_output, wordString, false); targetPhrase->AddWord(word); - } - else if(m_coding == PREnc) - { + } else if(m_coding == PREnc) { // if the symbol is just a word - if(GetPREncType(symbol) == 1) - { + if(GetPREncType(symbol) == 1) { unsigned decodedSymbol = DecodePREncSymbol1(symbol); - + Word word; word.CreateFromString(Output, *m_output, GetTargetSymbol(decodedSymbol), false); targetPhrase->AddWord(word); } // if the symbol is a subphrase pointer - else - { + else { int left = DecodePREncSymbol2Left(symbol); int right = DecodePREncSymbol2Right(symbol); unsigned rank = DecodePREncSymbol2Rank(symbol); - + int srcStart = left + targetPhrase->GetSize(); int srcEnd = srcSize - right - 1; - + // false positive consistency check if(0 > srcStart || srcStart > srcEnd || unsigned(srcEnd) >= srcSize) return TargetPhraseVectorPtr(); - + // false positive consistency check if(m_maxRank && rank > m_maxRank) - return TargetPhraseVectorPtr(); - + return TargetPhraseVectorPtr(); + // set subphrase by default to itself TargetPhraseVectorPtr subTpv = tpv; - + // if range smaller than source phrase retrieve subphrase - if(unsigned(srcEnd - srcStart + 1) != srcSize) - { + if(unsigned(srcEnd - srcStart + 1) != srcSize) { Phrase subPhrase = sourcePhrase.GetSubString(WordsRange(srcStart, srcEnd)); subTpv = CreateTargetPhraseCollection(subPhrase, false); - } - else { + } else { // false positive consistency check if(rank >= tpv->size()-1) return TargetPhraseVectorPtr(); } - + // false positive consistency check - if(subTpv != NULL && rank < subTpv->size()) - { + if(subTpv != NULL && rank < subTpv->size()) { // insert the subphrase into the main target phrase TargetPhrase& subTp = subTpv->at(rank); - if(m_phraseDictionary.m_useAlignmentInfo) - { + if(m_phraseDictionary.m_useAlignmentInfo) { // reconstruct the alignment data based on the alignment of the subphrase for(AlignmentInfo::const_iterator it = subTp.GetAlignTerm().begin(); - it != subTp.GetAlignTerm().end(); it++) - { + it != subTp.GetAlignTerm().end(); it++) { alignment.insert(AlignPointSizeT(srcStart + it->first, targetPhrase->GetSize() + it->second)); } } targetPhrase->Append(subTp); - } - else + } else return TargetPhraseVectorPtr(); } - } - else - { - Word word; - word.CreateFromString(Output, *m_output, - GetTargetSymbol(symbol), false); - targetPhrase->AddWord(word); + } else { + Word word; + word.CreateFromString(Output, *m_output, + 
GetTargetSymbol(symbol), false); + targetPhrase->AddWord(word); } } - } - else if(state == Score) - { + } else if(state == Score) { size_t idx = m_multipleScoreTrees ? scores.size() : 0; float score = m_scoreTrees[idx]->Read(encodedBitStream); scores.push_back(score); - - if(scores.size() == m_numScoreComponent) - { + + if(scores.size() == m_numScoreComponent) { targetPhrase->GetScoreBreakdown().Assign(&m_phraseDictionary, scores); targetPhrase->Evaluate(sourcePhrase); @@ -443,49 +404,41 @@ TargetPhraseVectorPtr PhraseDecoder::DecodeCollection( else state = Add; } - } - else if(state == Alignment) - { + } else if(state == Alignment) { AlignPoint alignPoint = m_alignTree->Read(encodedBitStream); - if(alignPoint == alignStopSymbol) - { + if(alignPoint == alignStopSymbol) { state = Add; - } - else - { - if(m_phraseDictionary.m_useAlignmentInfo) + } else { + if(m_phraseDictionary.m_useAlignmentInfo) alignment.insert(AlignPointSizeT(alignPoint)); } } - - if(state == Add) - { + + if(state == Add) { if(m_phraseDictionary.m_useAlignmentInfo) { targetPhrase->SetAlignTerm(alignment); } - - if(m_coding == PREnc) - { + + if(m_coding == PREnc) { if(!m_maxRank || tpv->size() <= m_maxRank) bitsLeft = encodedBitStream.TellFromEnd(); - + if(!topLevel && m_maxRank && tpv->size() >= m_maxRank) break; } - + if(encodedBitStream.TellFromEnd() <= 8) break; - + state = New; - } + } } - - if(m_coding == PREnc && !extending) - { + + if(m_coding == PREnc && !extending) { bitsLeft = bitsLeft > 8 ? bitsLeft : 0; m_decodingCache.Cache(sourcePhrase, tpv, bitsLeft, m_maxRank); } - + return tpv; } diff --git a/moses/TranslationModel/CompactPT/PhraseDecoder.h b/moses/TranslationModel/CompactPT/PhraseDecoder.h index 13c8af300..85e9334da 100644 --- a/moses/TranslationModel/CompactPT/PhraseDecoder.h +++ b/moses/TranslationModel/CompactPT/PhraseDecoder.h @@ -1,23 +1,23 @@ -// $Id$ -// vim:tabstop=2 -/*********************************************************************** -Moses - factored phrase-based language decoder -Copyright (C) 2006 University of Edinburgh - -This library is free software; you can redistribute it and/or -modify it under the terms of the GNU Lesser General Public -License as published by the Free Software Foundation; either -version 2.1 of the License, or (at your option) any later version. - -This library is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -Lesser General Public License for more details. - -You should have received a copy of the GNU Lesser General Public -License along with this library; if not, write to the Free Software -Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -***********************************************************************/ +// $Id$ +// vim:tabstop=2 +/*********************************************************************** +Moses - factored phrase-based language decoder +Copyright (C) 2006 University of Edinburgh + +This library is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License as published by the Free Software Foundation; either +version 2.1 of the License, or (at your option) any later version. + +This library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Lesser General Public License for more details. 
+ +You should have received a copy of the GNU Lesser General Public +License along with this library; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ #ifndef moses_PhraseDecoder_h #define moses_PhraseDecoder_h @@ -52,93 +52,93 @@ class PhraseDictionaryCompact; class PhraseDecoder { - protected: - - friend class PhraseDictionaryCompact; - - typedef std::pair AlignPoint; - typedef std::pair SrcTrg; - - enum Coding { None, REnc, PREnc } m_coding; - - size_t m_numScoreComponent; - bool m_containsAlignmentInfo; - size_t m_maxRank; - size_t m_maxPhraseLength; - - boost::unordered_map m_sourceSymbolsMap; - StringVector m_sourceSymbols; - StringVector m_targetSymbols; - - std::vector m_lexicalTableIndex; - std::vector m_lexicalTable; - - CanonicalHuffman* m_symbolTree; - - bool m_multipleScoreTrees; - std::vector*> m_scoreTrees; - - CanonicalHuffman* m_alignTree; - - TargetPhraseCollectionCache m_decodingCache; - - PhraseDictionaryCompact& m_phraseDictionary; - - // *********************************************** - - const std::vector* m_input; - const std::vector* m_output; - const std::vector* m_weight; - - std::string m_separator; - - // *********************************************** - - unsigned GetSourceSymbolId(std::string& s); - std::string GetTargetSymbol(unsigned id) const; - - size_t GetREncType(unsigned encodedSymbol); - size_t GetPREncType(unsigned encodedSymbol); - - unsigned GetTranslation(unsigned srcIdx, size_t rank); - - size_t GetMaxSourcePhraseLength(); - - unsigned DecodeREncSymbol1(unsigned encodedSymbol); - unsigned DecodeREncSymbol2Rank(unsigned encodedSymbol); - unsigned DecodeREncSymbol2Position(unsigned encodedSymbol); - unsigned DecodeREncSymbol3(unsigned encodedSymbol); - - unsigned DecodePREncSymbol1(unsigned encodedSymbol); - int DecodePREncSymbol2Left(unsigned encodedSymbol); - int DecodePREncSymbol2Right(unsigned encodedSymbol); - unsigned DecodePREncSymbol2Rank(unsigned encodedSymbol); - - std::string MakeSourceKey(std::string &); - - public: - - PhraseDecoder( - PhraseDictionaryCompact &phraseDictionary, - const std::vector* input, - const std::vector* output, - size_t numScoreComponent, - const std::vector* weight - ); - - ~PhraseDecoder(); - - size_t Load(std::FILE* in); - - TargetPhraseVectorPtr CreateTargetPhraseCollection(const Phrase &sourcePhrase, - bool topLevel = false); - - TargetPhraseVectorPtr DecodeCollection(TargetPhraseVectorPtr tpv, - BitWrapper<> &encodedBitStream, - const Phrase &sourcePhrase, - bool topLevel); - - void PruneCache(); +protected: + + friend class PhraseDictionaryCompact; + + typedef std::pair AlignPoint; + typedef std::pair SrcTrg; + + enum Coding { None, REnc, PREnc } m_coding; + + size_t m_numScoreComponent; + bool m_containsAlignmentInfo; + size_t m_maxRank; + size_t m_maxPhraseLength; + + boost::unordered_map m_sourceSymbolsMap; + StringVector m_sourceSymbols; + StringVector m_targetSymbols; + + std::vector m_lexicalTableIndex; + std::vector m_lexicalTable; + + CanonicalHuffman* m_symbolTree; + + bool m_multipleScoreTrees; + std::vector*> m_scoreTrees; + + CanonicalHuffman* m_alignTree; + + TargetPhraseCollectionCache m_decodingCache; + + PhraseDictionaryCompact& m_phraseDictionary; + + // *********************************************** + + const std::vector* m_input; + const std::vector* m_output; + const std::vector* m_weight; + + std::string m_separator; + + // 
*********************************************** + + unsigned GetSourceSymbolId(std::string& s); + std::string GetTargetSymbol(unsigned id) const; + + size_t GetREncType(unsigned encodedSymbol); + size_t GetPREncType(unsigned encodedSymbol); + + unsigned GetTranslation(unsigned srcIdx, size_t rank); + + size_t GetMaxSourcePhraseLength(); + + unsigned DecodeREncSymbol1(unsigned encodedSymbol); + unsigned DecodeREncSymbol2Rank(unsigned encodedSymbol); + unsigned DecodeREncSymbol2Position(unsigned encodedSymbol); + unsigned DecodeREncSymbol3(unsigned encodedSymbol); + + unsigned DecodePREncSymbol1(unsigned encodedSymbol); + int DecodePREncSymbol2Left(unsigned encodedSymbol); + int DecodePREncSymbol2Right(unsigned encodedSymbol); + unsigned DecodePREncSymbol2Rank(unsigned encodedSymbol); + + std::string MakeSourceKey(std::string &); + +public: + + PhraseDecoder( + PhraseDictionaryCompact &phraseDictionary, + const std::vector* input, + const std::vector* output, + size_t numScoreComponent, + const std::vector* weight + ); + + ~PhraseDecoder(); + + size_t Load(std::FILE* in); + + TargetPhraseVectorPtr CreateTargetPhraseCollection(const Phrase &sourcePhrase, + bool topLevel = false); + + TargetPhraseVectorPtr DecodeCollection(TargetPhraseVectorPtr tpv, + BitWrapper<> &encodedBitStream, + const Phrase &sourcePhrase, + bool topLevel); + + void PruneCache(); }; } diff --git a/moses/TranslationModel/CompactPT/PhraseDictionaryCompact.cpp b/moses/TranslationModel/CompactPT/PhraseDictionaryCompact.cpp index e863eb812..ff33f10a7 100644 --- a/moses/TranslationModel/CompactPT/PhraseDictionaryCompact.cpp +++ b/moses/TranslationModel/CompactPT/PhraseDictionaryCompact.cpp @@ -1,23 +1,23 @@ -// $Id$ -// vim:tabstop=2 -/*********************************************************************** -Moses - factored phrase-based language decoder -Copyright (C) 2006 University of Edinburgh - -This library is free software; you can redistribute it and/or -modify it under the terms of the GNU Lesser General Public -License as published by the Free Software Foundation; either -version 2.1 of the License, or (at your option) any later version. - -This library is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -Lesser General Public License for more details. - -You should have received a copy of the GNU Lesser General Public -License along with this library; if not, write to the Free Software -Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -***********************************************************************/ +// $Id$ +// vim:tabstop=2 +/*********************************************************************** +Moses - factored phrase-based language decoder +Copyright (C) 2006 University of Edinburgh + +This library is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License as published by the Free Software Foundation; either +version 2.1 of the License, or (at your option) any later version. + +This library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Lesser General Public License for more details. 
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
 
 #include 
 #include 
 
@@ -40,42 +40,35 @@ using namespace std;
 
 namespace Moses
 {
-
+
 bool PhraseDictionaryCompact::InitDictionary()
 {
   const StaticData &staticData = StaticData::Instance();
   m_weight = staticData.GetWeights(this);
-
+
   std::string tFilePath = m_filePath;
-
+
   std::string suffix = ".minphr";
-  if(tFilePath.substr(tFilePath.length() - suffix.length(), suffix.length()) == suffix)
-  {
-    if(!FileExists(tFilePath))
-    {
+  if(tFilePath.substr(tFilePath.length() - suffix.length(), suffix.length()) == suffix) {
+    if(!FileExists(tFilePath)) {
       std::cerr << "Error: File " << tFilePath << " does not exist." << std::endl;
       exit(1);
     }
-  }
-  else
-  {
-    if(FileExists(tFilePath + suffix))
-    {
+  } else {
+    if(FileExists(tFilePath + suffix)) {
       tFilePath += suffix;
-    }
-    else
-    {
-      std::cerr << "Error: File " << tFilePath << ".minphr does not exit." << std::endl;
-      exit(1);
+    } else {
+      std::cerr << "Error: File " << tFilePath << ".minphr does not exist." << std::endl;
+      exit(1);
     }
   }
 
   m_phraseDecoder = new PhraseDecoder(*this, &m_input, &m_output,
-                                      m_numScoreComponents, &m_weight);
+                                      m_numScoreComponents, &m_weight);
 
   std::FILE* pFile = std::fopen(tFilePath.c_str() , "r");
-
+
   size_t indexSize;
   if(m_inMemory)
     // Load source phrase index into memory
@@ -85,7 +78,7 @@ bool PhraseDictionaryCompact::InitDictionary()
     indexSize = m_hash.LoadIndex(pFile);
 
   size_t coderSize = m_phraseDecoder->Load(pFile);
-
+
   size_t phraseSize;
   if(m_inMemory)
     // Load target phrase collections into memory
@@ -93,8 +86,8 @@ bool PhraseDictionaryCompact::InitDictionary()
   else
     // Keep target phrase collections on disk
    phraseSize = m_targetPhrasesMapped.load(pFile, true);
-
-  return indexSize && coderSize && phraseSize;
+
+  return indexSize && coderSize && phraseSize;
 }
 
 struct CompareTargetPhrase {
@@ -104,21 +97,22 @@ struct CompareTargetPhrase {
 };
 
 const TargetPhraseCollection*
-PhraseDictionaryCompact::GetTargetPhraseCollection(const Phrase &sourcePhrase) const {
-
+PhraseDictionaryCompact::GetTargetPhraseCollection(const Phrase &sourcePhrase) const
+{
+
   // There is no such source phrase if source phrase is longer than longest
-  // observed source phrase during compilation
+  // observed source phrase during compilation
   if(sourcePhrase.GetSize() > m_phraseDecoder->GetMaxSourcePhraseLength())
     return NULL;
 
   // Retrieve target phrase collection from phrase table
   TargetPhraseVectorPtr decodedPhraseColl
-    = m_phraseDecoder->CreateTargetPhraseCollection(sourcePhrase, true);
-
+    = m_phraseDecoder->CreateTargetPhraseCollection(sourcePhrase, true);
+
   if(decodedPhraseColl != NULL && decodedPhraseColl->size()) {
     TargetPhraseVectorPtr tpv(new TargetPhraseVector(*decodedPhraseColl));
     TargetPhraseCollection* phraseColl = new TargetPhraseCollection();
-
+
     // Score phrases and if possible apply ttable_limit
     TargetPhraseVector::iterator nth =
       (m_tableLimit == 0 || tpv->size() < m_tableLimit) ?
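
The hunk above ends inside GetTargetPhraseCollection(), which applies the ttable-limit by partially sorting the decoded phrase vector before copying it into a TargetPhraseCollection. A minimal standalone sketch of that std::nth_element idiom follows; the names ScoredPhrase, CompareScoredPhrase and ApplyTableLimit are hypothetical stand-ins for illustration, not part of this patch:

#include <algorithm>
#include <vector>

struct ScoredPhrase {
  float futureScore;
};

// Orders candidates best-first, analogous in spirit to the CompareTargetPhrase
// functor whose body this diff elides.
struct CompareScoredPhrase {
  bool operator()(const ScoredPhrase &a, const ScoredPhrase &b) const {
    return a.futureScore > b.futureScore;
  }
};

// Keep only the `limit` best entries; limit == 0 means "no limit".
void ApplyTableLimit(std::vector<ScoredPhrase> &phrases, size_t limit)
{
  std::vector<ScoredPhrase>::iterator nth =
    (limit == 0 || phrases.size() < limit) ?
    phrases.end() : phrases.begin() + limit;
  // nth_element is O(n): everything before `nth` scores at least as well
  // as everything after it, without a full sort.
  std::nth_element(phrases.begin(), nth, phrases.end(), CompareScoredPhrase());
  phrases.erase(nth, phrases.end()); // drop everything past the cut-off
}
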
@@ -129,21 +123,21 @@ PhraseDictionaryCompact::GetTargetPhraseCollection(const Phrase &sourcePhrase) c
       cerr << *tp << endl;
       phraseColl->Add(tp);
     }
-
+
     // Cache phrase pair for clean-up or retrieval with PREnc
     const_cast<PhraseDictionaryCompact*>(this)->CacheForCleanup(phraseColl);
-
+
     return phraseColl;
-  }
-  else
+  } else
     return NULL;
 }
 
 TargetPhraseVectorPtr
-PhraseDictionaryCompact::GetTargetPhraseCollectionRaw(const Phrase &sourcePhrase) const {
+PhraseDictionaryCompact::GetTargetPhraseCollectionRaw(const Phrase &sourcePhrase) const
+{
 
   // There is no such source phrase if source phrase is longer than longest
-  // observed source phrase during compilation
+  // observed source phrase during compilation
   if(sourcePhrase.GetSize() > m_phraseDecoder->GetMaxSourcePhraseLength())
     return TargetPhraseVectorPtr();
 
@@ -151,42 +145,45 @@ PhraseDictionaryCompact::GetTargetPhraseCollectionRaw(const Phrase &sourcePhrase
   return m_phraseDecoder->CreateTargetPhraseCollection(sourcePhrase, true);
 }
 
-PhraseDictionaryCompact::~PhraseDictionaryCompact() {
+PhraseDictionaryCompact::~PhraseDictionaryCompact()
+{
   if(m_phraseDecoder)
     delete m_phraseDecoder;
 }
 
 //TO_STRING_BODY(PhraseDictionaryCompact)
 
-void PhraseDictionaryCompact::CacheForCleanup(TargetPhraseCollection* tpc) {
+void PhraseDictionaryCompact::CacheForCleanup(TargetPhraseCollection* tpc)
+{
 #ifdef WITH_THREADS
   boost::mutex::scoped_lock lock(m_sentenceMutex);
-  PhraseCache &ref = m_sentenceCache[boost::this_thread::get_id()];
+  PhraseCache &ref = m_sentenceCache[boost::this_thread::get_id()];
 #else
-  PhraseCache &ref = m_sentenceCache;
+  PhraseCache &ref = m_sentenceCache;
 #endif
   ref.push_back(tpc);
 }
 
 void PhraseDictionaryCompact::AddEquivPhrase(const Phrase &source,
-                                             const TargetPhrase &targetPhrase) { }
+    const TargetPhrase &targetPhrase) { }
 
-void PhraseDictionaryCompact::CleanUpAfterSentenceProcessing(const InputType &source) {
+void PhraseDictionaryCompact::CleanUpAfterSentenceProcessing(const InputType &source)
+{
   if(!m_inMemory)
     m_hash.KeepNLastRanges(0.01, 0.2);
-
+
   m_phraseDecoder->PruneCache();
-
+
 #ifdef WITH_THREADS
   boost::mutex::scoped_lock lock(m_sentenceMutex);
-  PhraseCache &ref = m_sentenceCache[boost::this_thread::get_id()];
+  PhraseCache &ref = m_sentenceCache[boost::this_thread::get_id()];
 #else
-  PhraseCache &ref = m_sentenceCache;
+  PhraseCache &ref = m_sentenceCache;
 #endif
-
-  for(PhraseCache::iterator it = ref.begin(); it != ref.end(); it++)
-    delete *it;
-
+
+  for(PhraseCache::iterator it = ref.begin(); it != ref.end(); it++)
+    delete *it;
+
   PhraseCache temp;
   temp.swap(ref);
 }
 
diff --git a/moses/TranslationModel/CompactPT/PhraseDictionaryCompact.h b/moses/TranslationModel/CompactPT/PhraseDictionaryCompact.h
index 1eab58894..60969665a 100644
--- a/moses/TranslationModel/CompactPT/PhraseDictionaryCompact.h
+++ b/moses/TranslationModel/CompactPT/PhraseDictionaryCompact.h
@@ -1,23 +1,23 @@
-// $Id$
-// vim:tabstop=2
-/***********************************************************************
-Moses - factored phrase-based language decoder
-Copyright (C) 2006 University of Edinburgh
-
-This library is free software; you can redistribute it and/or
-modify it under the terms of the GNU Lesser General Public
-License as published by the Free Software Foundation; either
-version 2.1 of the License, or (at your option) any later version.
-
-This library is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU -Lesser General Public License for more details. - -You should have received a copy of the GNU Lesser General Public -License along with this library; if not, write to the Free Software -Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -***********************************************************************/ +// $Id$ +// vim:tabstop=2 +/*********************************************************************** +Moses - factored phrase-based language decoder +Copyright (C) 2006 University of Edinburgh + +This library is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License as published by the Free Software Foundation; either +version 2.1 of the License, or (at your option) any later version. + +This library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Lesser General Public License for more details. + +You should have received a copy of the GNU Lesser General Public +License along with this library; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ #ifndef moses_PhraseDictionaryCompact_h #define moses_PhraseDictionaryCompact_h @@ -50,7 +50,7 @@ protected: bool m_inMemory; bool m_useAlignmentInfo; - + typedef std::vector PhraseCache; #ifdef WITH_THREADS boost::mutex m_sentenceMutex; @@ -59,23 +59,22 @@ protected: typedef PhraseCache SentenceCache; #endif SentenceCache m_sentenceCache; - + BlockHashIndex m_hash; PhraseDecoder* m_phraseDecoder; - + StringVector m_targetPhrasesMapped; StringVector m_targetPhrasesMemory; std::vector m_weight; public: PhraseDictionaryCompact(const std::string &line) - :PhraseDictionary("PhraseDictionaryCompact", line) - ,m_inMemory(true) - ,m_useAlignmentInfo(true) - ,m_hash(10, 16) - ,m_phraseDecoder(0) - ,m_weight(0) - { + :PhraseDictionary("PhraseDictionaryCompact", line) + ,m_inMemory(true) + ,m_useAlignmentInfo(true) + ,m_hash(10, 16) + ,m_phraseDecoder(0) + ,m_weight(0) { } ~PhraseDictionaryCompact(); @@ -84,16 +83,15 @@ public: const TargetPhraseCollection* GetTargetPhraseCollection(const Phrase &source) const; TargetPhraseVectorPtr GetTargetPhraseCollectionRaw(const Phrase &source) const; - + void AddEquivPhrase(const Phrase &source, const TargetPhrase &targetPhrase); - + void CacheForCleanup(TargetPhraseCollection* tpc); void CleanUpAfterSentenceProcessing(const InputType &source); virtual ChartRuleLookupManager *CreateRuleLookupManager( const InputType &, - const ChartCellCollectionBase &) - { + const ChartCellCollectionBase &) { assert(false); return 0; } diff --git a/moses/TranslationModel/CompactPT/PhraseTableCreator.cpp b/moses/TranslationModel/CompactPT/PhraseTableCreator.cpp index c7bd81019..fc3b056c6 100644 --- a/moses/TranslationModel/CompactPT/PhraseTableCreator.cpp +++ b/moses/TranslationModel/CompactPT/PhraseTableCreator.cpp @@ -1,23 +1,23 @@ -// $Id$ -// vim:tabstop=2 -/*********************************************************************** -Moses - factored phrase-based language decoder -Copyright (C) 2006 University of Edinburgh - -This library is free software; you can redistribute it and/or -modify it under the terms of the GNU Lesser General Public -License as published by the Free Software Foundation; either -version 2.1 of the License, or (at your option) any later 
version. - -This library is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -Lesser General Public License for more details. - -You should have received a copy of the GNU Lesser General Public -License along with this library; if not, write to the Free Software -Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -***********************************************************************/ +// $Id$ +// vim:tabstop=2 +/*********************************************************************** +Moses - factored phrase-based language decoder +Copyright (C) 2006 University of Edinburgh + +This library is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License as published by the Free Software Foundation; either +version 2.1 of the License, or (at your option) any later version. + +This library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Lesser General Public License for more details. + +You should have received a copy of the GNU Lesser General Public +License along with this library; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ #include @@ -29,17 +29,17 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA namespace Moses { - + bool operator<(const PackedItem &pi1, const PackedItem &pi2) { if(pi1.GetLine() < pi2.GetLine()) - return false; + return false; return true; } - + std::string PhraseTableCreator::m_phraseStopSymbol = "__SPECIAL_STOP_SYMBOL__"; std::string PhraseTableCreator::m_separator = " ||| "; - + PhraseTableCreator::PhraseTableCreator(std::string inPath, std::string outPath, std::string tempfilePath, @@ -56,7 +56,7 @@ PhraseTableCreator::PhraseTableCreator(std::string inPath, #ifdef WITH_THREADS , size_t threads #endif - ) + ) : m_inPath(inPath), m_outPath(outPath), m_tempfilePath(tempfilePath), m_outFile(std::fopen(m_outPath.c_str(), "w")), m_numScoreComponent(numScoreComponent), m_sortScoreIndex(sortScoreIndex), m_warnMe(warnMe), @@ -64,81 +64,76 @@ PhraseTableCreator::PhraseTableCreator(std::string inPath, m_useAlignmentInfo(useAlignmentInfo), m_multipleScoreTrees(multipleScoreTrees), m_quantize(quantize), m_maxRank(maxRank), - #ifdef WITH_THREADS +#ifdef WITH_THREADS m_threads(threads), m_srcHash(m_orderBits, m_fingerPrintBits, 1), m_rnkHash(10, 24, m_threads), - #else +#else m_srcHash(m_orderBits, m_fingerPrintBits), m_rnkHash(m_orderBits, m_fingerPrintBits), - #endif +#endif m_maxPhraseLength(0), m_lastFlushedLine(-1), m_lastFlushedSourceNum(0), m_lastFlushedSourcePhrase("") { PrintInfo(); - + AddTargetSymbolId(m_phraseStopSymbol); - + size_t cur_pass = 1; size_t all_passes = 2; if(m_coding == PREnc) all_passes = 3; - + m_scoreCounters.resize(m_multipleScoreTrees ? m_numScoreComponent : 1); for(std::vector::iterator it = m_scoreCounters.begin(); - it != m_scoreCounters.end(); it++) + it != m_scoreCounters.end(); it++) *it = new ScoreCounter(); m_scoreTrees.resize(m_multipleScoreTrees ? 
m_numScoreComponent : 1); - + // 0th pass - if(m_coding == REnc) - { + if(m_coding == REnc) { size_t found = inPath.find_last_of("/\\"); std::string path; - if(found != std::string::npos) + if(found != std::string::npos) path = inPath.substr(0, found); else path = "."; LoadLexicalTable(path + "/lex.f2e"); - } - else if(m_coding == PREnc) - { + } else if(m_coding == PREnc) { std::cerr << "Pass " << cur_pass << "/" << all_passes << ": Creating hash function for rank assignment" << std::endl; cur_pass++; CreateRankHash(); } - + // 1st pass std::cerr << "Pass " << cur_pass << "/" << all_passes << ": Creating source phrase index + Encoding target phrases" << std::endl; m_srcHash.BeginSave(m_outFile); - + if(tempfilePath.size()) { MmapAllocator allocEncoded(util::FMakeTemp(tempfilePath)); m_encodedTargetPhrases = new StringVector(allocEncoded); - } - else { - m_encodedTargetPhrases = new StringVector(); + } else { + m_encodedTargetPhrases = new StringVector(); } EncodeTargetPhrases(); - + cur_pass++; - + std::cerr << "Intermezzo: Calculating Huffman code sets" << std::endl; CalcHuffmanCodes(); - + // 2nd pass std::cerr << "Pass " << cur_pass << "/" << all_passes << ": Compressing target phrases" << std::endl; - + if(tempfilePath.size()) { MmapAllocator allocCompressed(util::FMakeTemp(tempfilePath)); m_compressedTargetPhrases = new StringVector(allocCompressed); - } - else { + } else { m_compressedTargetPhrases = new StringVector(); } CompressTargetPhrases(); - + std::cerr << "Saving to " << m_outPath << std::endl; Save(); std::cerr << "Done" << std::endl; @@ -149,44 +144,43 @@ PhraseTableCreator::~PhraseTableCreator() { delete m_symbolTree; if(m_useAlignmentInfo) - delete m_alignTree; + delete m_alignTree; for(size_t i = 0; i < m_scoreTrees.size(); i++) { delete m_scoreTrees[i]; delete m_scoreCounters[i]; } - + delete m_encodedTargetPhrases; - delete m_compressedTargetPhrases; + delete m_compressedTargetPhrases; } void PhraseTableCreator::PrintInfo() { std::string encodings[3] = {"Huffman", "Huffman + REnc", "Huffman + PREnc"}; - + std::cerr << "Used options:" << std::endl; std::cerr << "\tText phrase table will be read from: " << m_inPath << std::endl; std::cerr << "\tOutput phrase table will be written to: " << m_outPath << std::endl; std::cerr << "\tStep size for source landmark phrases: 2^" << m_orderBits << "=" << (1ul << m_orderBits) << std::endl; std::cerr << "\tSource phrase fingerprint size: " << m_fingerPrintBits << " bits / P(fp)=" << (float(1)/(1ul << m_fingerPrintBits)) << std::endl; std::cerr << "\tSelected target phrase encoding: " << encodings[m_coding] << std::endl; - if(m_coding == PREnc) - { + if(m_coding == PREnc) { std::cerr << "\tMaxiumum allowed rank for PREnc: "; if(!m_maxRank) std::cerr << "unlimited" << std::endl; else - std::cerr << m_maxRank << std::endl; + std::cerr << m_maxRank << std::endl; } - std::cerr << "\tNumber of score components in phrase table: " << m_numScoreComponent << std::endl; - std::cerr << "\tSingle Huffman code set for score components: " << (m_multipleScoreTrees ? "no" : "yes") << std::endl; + std::cerr << "\tNumber of score components in phrase table: " << m_numScoreComponent << std::endl; + std::cerr << "\tSingle Huffman code set for score components: " << (m_multipleScoreTrees ? "no" : "yes") << std::endl; std::cerr << "\tUsing score quantization: "; if(m_quantize) std::cerr << m_quantize << " best" << std::endl; else std::cerr << "no" << std::endl; - std::cerr << "\tExplicitly included alignment information: " << (m_useAlignmentInfo ? 
"yes" : "no") << std::endl; - -#ifdef WITH_THREADS + std::cerr << "\tExplicitly included alignment information: " << (m_useAlignmentInfo ? "yes" : "no") << std::endl; + +#ifdef WITH_THREADS std::cerr << "\tRunning with " << m_threads << " threads" << std::endl; #endif std::cerr << std::endl; @@ -200,22 +194,21 @@ void PhraseTableCreator::Save() ThrowingFwrite(&m_useAlignmentInfo, sizeof(m_useAlignmentInfo), 1, m_outFile); ThrowingFwrite(&m_maxRank, sizeof(m_maxRank), 1, m_outFile); ThrowingFwrite(&m_maxPhraseLength, sizeof(m_maxPhraseLength), 1, m_outFile); - - if(m_coding == REnc) - { + + if(m_coding == REnc) { // Save source language symbols for REnc std::vector temp1; temp1.resize(m_sourceSymbolsMap.size()); for(boost::unordered_map::iterator it = m_sourceSymbolsMap.begin(); it != m_sourceSymbolsMap.end(); it++) - temp1[it->second] = it->first; + temp1[it->second] = it->first; std::sort(temp1.begin(), temp1.end()); StringVector sourceSymbols; for(std::vector::iterator it = temp1.begin(); it != temp1.end(); it++) - sourceSymbols.push_back(*it); + sourceSymbols.push_back(*it); sourceSymbols.save(m_outFile); - + // Save lexical translation table for REnc size_t size = m_lexicalTableIndex.size(); ThrowingFwrite(&size, sizeof(size_t), 1, m_outFile); @@ -224,95 +217,92 @@ void PhraseTableCreator::Save() ThrowingFwrite(&size, sizeof(size_t), 1, m_outFile); ThrowingFwrite(&m_lexicalTable[0], sizeof(SrcTrg), size, m_outFile); } - + // Save target language symbols std::vector temp2; temp2.resize(m_targetSymbolsMap.size()); for(boost::unordered_map::iterator it - = m_targetSymbolsMap.begin(); it != m_targetSymbolsMap.end(); it++) + = m_targetSymbolsMap.begin(); it != m_targetSymbolsMap.end(); it++) temp2[it->second] = it->first; StringVector targetSymbols; for(std::vector::iterator it = temp2.begin(); - it != temp2.end(); it++) + it != temp2.end(); it++) targetSymbols.push_back(*it); targetSymbols.save(m_outFile); - + // Save Huffman codes for target language symbols m_symbolTree->Save(m_outFile); - + // Save number of Huffman code sets for scores and // save Huffman code sets ThrowingFwrite(&m_multipleScoreTrees, sizeof(m_multipleScoreTrees), 1, m_outFile); size_t numScoreTrees = m_scoreTrees.size(); for(size_t i = 0; i < numScoreTrees; i++) m_scoreTrees[i]->Save(m_outFile); - + // Save Huffman codes for alignments if(m_useAlignmentInfo) m_alignTree->Save(m_outFile); - - // Save compressed target phrase collections + + // Save compressed target phrase collections m_compressedTargetPhrases->save(m_outFile); } - + void PhraseTableCreator::LoadLexicalTable(std::string filePath) { std::vector t_lexTable; - + std::cerr << "Reading in lexical table for Rank Encoding" << std::endl; std::ifstream lexIn(filePath.c_str(), std::ifstream::in); std::string src, trg; float prob; - + // Reading in the translation probability lexicon - + std::cerr << "\tLoading from " << filePath << std::endl; - while(lexIn >> trg >> src >> prob) - { + while(lexIn >> trg >> src >> prob) { t_lexTable.push_back(SrcTrgProb(SrcTrgString(src, trg), prob)); AddSourceSymbolId(src); AddTargetSymbolId(trg); } - + // Sorting lexicon by source words by lexicographical order, corresponding // target words by decreasing probability. 
- + std::cerr << "\tSorting according to translation rank" << std::endl; std::sort(t_lexTable.begin(), t_lexTable.end(), SrcTrgProbSorter()); - + // Re-assigning source word ids in lexicographical order - + std::vector temp1; temp1.resize(m_sourceSymbolsMap.size()); for(boost::unordered_map::iterator it - = m_sourceSymbolsMap.begin(); it != m_sourceSymbolsMap.end(); it++) + = m_sourceSymbolsMap.begin(); it != m_sourceSymbolsMap.end(); it++) temp1[it->second] = it->first; - + std::sort(temp1.begin(), temp1.end()); - + for(size_t i = 0; i < temp1.size(); i++) m_sourceSymbolsMap[temp1[i]] = i; - + // Building the lexicon based on source and target word ids - + std::string srcWord = ""; size_t srcIdx = 0; for(std::vector::iterator it = t_lexTable.begin(); - it != t_lexTable.end(); it++) - { + it != t_lexTable.end(); it++) { // If we encounter a new source word - if(it->first.first != srcWord) - { + if(it->first.first != srcWord) { srcIdx = GetSourceSymbolId(it->first.first); - + // Store position of first translation if(srcIdx >= m_lexicalTableIndex.size()) m_lexicalTableIndex.resize(srcIdx + 1); m_lexicalTableIndex[srcIdx] = m_lexicalTable.size(); } - + // Store pair of source word and target word - size_t trgIdx = GetTargetSymbolId(it->first.second); + size_t trgIdx = GetTargetSymbolId(it->first.second); m_lexicalTable.push_back(SrcTrg(srcIdx, trgIdx)); srcWord = it->first.first; @@ -322,14 +312,13 @@ void PhraseTableCreator::LoadLexicalTable(std::string filePath) } void PhraseTableCreator::CreateRankHash() -{ +{ InputFileStream inFile(m_inPath); #ifdef WITH_THREADS boost::thread_group threads; - for (size_t i = 0; i < m_threads; ++i) - { - RankingTask* rt = new RankingTask(inFile, *this); + for (size_t i = 0; i < m_threads; ++i) { + RankingTask* rt = new RankingTask(inFile, *this); threads.create_thread(*rt); } threads.join_all(); @@ -343,7 +332,7 @@ void PhraseTableCreator::CreateRankHash() inline std::string PhraseTableCreator::MakeSourceKey(std::string &source) { - return source + m_separator; + return source + m_separator; } inline std::string PhraseTableCreator::MakeSourceTargetKey(std::string &source, std::string &target) @@ -357,9 +346,8 @@ void PhraseTableCreator::EncodeTargetPhrases() #ifdef WITH_THREADS boost::thread_group threads; - for (size_t i = 0; i < m_threads; ++i) - { - EncodingTask* et = new EncodingTask(inFile, *this); + for (size_t i = 0; i < m_threads; ++i) { + EncodingTask* et = new EncodingTask(inFile, *this); threads.create_thread(*et); } threads.join_all(); @@ -368,17 +356,17 @@ void PhraseTableCreator::EncodeTargetPhrases() (*et)(); delete et; #endif - FlushEncodedQueue(true); + FlushEncodedQueue(true); } void PhraseTableCreator::CompressTargetPhrases() -{ +{ #ifdef WITH_THREADS boost::thread_group threads; for (size_t i = 0; i < m_threads; ++i) { - CompressionTask* ct = new CompressionTask(*m_encodedTargetPhrases, *this); - threads.create_thread(*ct); + CompressionTask* ct = new CompressionTask(*m_encodedTargetPhrases, *this); + threads.create_thread(*ct); } threads.join_all(); #else @@ -392,29 +380,27 @@ void PhraseTableCreator::CompressTargetPhrases() void PhraseTableCreator::CalcHuffmanCodes() { std::cerr << "\tCreating Huffman codes for " << m_symbolCounter.Size() - << " target phrase symbols" << std::endl; - + << " target phrase symbols" << std::endl; + m_symbolTree = new SymbolTree(m_symbolCounter.Begin(), - m_symbolCounter.End()); - + m_symbolCounter.End()); + std::vector::iterator treeIt = m_scoreTrees.begin(); for(std::vector::iterator it = 
m_scoreCounters.begin(); - it != m_scoreCounters.end(); it++) - { + it != m_scoreCounters.end(); it++) { if(m_quantize) - (*it)->Quantize(m_quantize); - + (*it)->Quantize(m_quantize); + std::cerr << "\tCreating Huffman codes for " << (*it)->Size() - << " scores" << std::endl; - + << " scores" << std::endl; + *treeIt = new ScoreTree((*it)->Begin(), (*it)->End()); treeIt++; } - - if(m_useAlignmentInfo) - { + + if(m_useAlignmentInfo) { std::cerr << "\tCreating Huffman codes for " << m_alignCounter.Size() - << " alignment points" << std::endl; + << " alignment points" << std::endl; m_alignTree = new AlignTree(m_alignCounter.Begin(), m_alignCounter.End()); } std::cerr << std::endl; @@ -440,9 +426,9 @@ void PhraseTableCreator::AddTargetSymbolId(std::string& symbol) unsigned PhraseTableCreator::GetSourceSymbolId(std::string& symbol) { boost::unordered_map::iterator it - = m_sourceSymbolsMap.find(symbol); - - if(it != m_sourceSymbolsMap.end()) + = m_sourceSymbolsMap.find(symbol); + + if(it != m_sourceSymbolsMap.end()) return it->second; else return m_sourceSymbolsMap.size(); @@ -451,9 +437,9 @@ unsigned PhraseTableCreator::GetSourceSymbolId(std::string& symbol) unsigned PhraseTableCreator::GetTargetSymbolId(std::string& symbol) { boost::unordered_map::iterator it - = m_targetSymbolsMap.find(symbol); - - if(it != m_targetSymbolsMap.end()) + = m_targetSymbolsMap.find(symbol); + + if(it != m_targetSymbolsMap.end()) return it->second; else return m_targetSymbolsMap.size(); @@ -465,12 +451,11 @@ unsigned PhraseTableCreator::GetOrAddTargetSymbolId(std::string& symbol) boost::mutex::scoped_lock lock(m_mutex); #endif boost::unordered_map::iterator it - = m_targetSymbolsMap.find(symbol); - - if(it != m_targetSymbolsMap.end()) + = m_targetSymbolsMap.find(symbol); + + if(it != m_targetSymbolsMap.end()) return it->second; - else - { + else { unsigned value = m_targetSymbolsMap.size(); m_targetSymbolsMap[symbol] = value; return value; @@ -481,12 +466,12 @@ unsigned PhraseTableCreator::GetRank(unsigned srcIdx, unsigned trgIdx) { size_t srcTrgIdx = m_lexicalTableIndex[srcIdx]; while(srcTrgIdx < m_lexicalTable.size() - && srcIdx == m_lexicalTable[srcTrgIdx].first - && m_lexicalTable[srcTrgIdx].second != trgIdx) + && srcIdx == m_lexicalTable[srcTrgIdx].first + && m_lexicalTable[srcTrgIdx].second != trgIdx) srcTrgIdx++; - + if(srcTrgIdx < m_lexicalTable.size() - && m_lexicalTable[srcTrgIdx].second == trgIdx) + && m_lexicalTable[srcTrgIdx].second == trgIdx) return srcTrgIdx - m_lexicalTableIndex[srcIdx]; else return m_lexicalTable.size(); @@ -522,14 +507,14 @@ unsigned PhraseTableCreator::EncodePREncSymbol1(unsigned trgIdx) unsigned PhraseTableCreator::EncodePREncSymbol2(int left, int right, unsigned rank) { // "left" and "right" must be smaller than 2^5 - // "rank" must be smaller than 2^19 + // "rank" must be smaller than 2^19 left = left + 32; right = right + 32; - + assert(64 > left); assert(64 > right); assert(524288 > rank); - + unsigned symbol = 0; symbol |= 1 << 31; symbol |= left << 25; @@ -539,151 +524,135 @@ unsigned PhraseTableCreator::EncodePREncSymbol2(int left, int right, unsigned ra } void PhraseTableCreator::EncodeTargetPhraseNone(std::vector& t, - std::ostream& os) + std::ostream& os) { std::stringstream encodedTargetPhrase; size_t j = 0; - while(j < t.size()) - { + while(j < t.size()) { unsigned targetSymbolId = GetOrAddTargetSymbolId(t[j]); - + m_symbolCounter.Increase(targetSymbolId); os.write((char*)&targetSymbolId, sizeof(targetSymbolId)); j++; } - + unsigned stopSymbolId = 
GetTargetSymbolId(m_phraseStopSymbol); os.write((char*)&stopSymbolId, sizeof(stopSymbolId)); m_symbolCounter.Increase(stopSymbolId); } void PhraseTableCreator::EncodeTargetPhraseREnc(std::vector& s, - std::vector& t, - std::set& a, - std::ostream& os) -{ + std::vector& t, + std::set& a, + std::ostream& os) +{ std::stringstream encodedTargetPhrase; std::vector > a2(t.size()); for(std::set::iterator it = a.begin(); it != a.end(); it++) a2[it->second].push_back(it->first); - for(size_t i = 0; i < t.size(); i++) - { + for(size_t i = 0; i < t.size(); i++) { unsigned idxTarget = GetOrAddTargetSymbolId(t[i]); unsigned encodedSymbol = -1; - + unsigned bestSrcPos = s.size(); unsigned bestDiff = s.size(); unsigned bestRank = m_lexicalTable.size(); unsigned badRank = m_lexicalTable.size(); - - for(std::vector::iterator it = a2[i].begin(); it != a2[i].end(); it++) - { + + for(std::vector::iterator it = a2[i].begin(); it != a2[i].end(); it++) { unsigned idxSource = GetSourceSymbolId(s[*it]); size_t r = GetRank(idxSource, idxTarget); - if(r != badRank) - { - if(r < bestRank) - { + if(r != badRank) { + if(r < bestRank) { bestRank = r; bestSrcPos = *it; bestDiff = abs(*it-i); - } - else if(r == bestRank && unsigned(abs(*it-i)) < bestDiff) - { + } else if(r == bestRank && unsigned(abs(*it-i)) < bestDiff) { bestSrcPos = *it; bestDiff = abs(*it-i); } } } - - if(bestRank != badRank && bestSrcPos < s.size()) - { + + if(bestRank != badRank && bestSrcPos < s.size()) { if(bestSrcPos == i) encodedSymbol = EncodeREncSymbol3(bestRank); else - encodedSymbol = EncodeREncSymbol2(bestSrcPos, bestRank); + encodedSymbol = EncodeREncSymbol2(bestSrcPos, bestRank); a.erase(AlignPoint(bestSrcPos, i)); - } - else - { + } else { encodedSymbol = EncodeREncSymbol1(idxTarget); } - + os.write((char*)&encodedSymbol, sizeof(encodedSymbol)); m_symbolCounter.Increase(encodedSymbol); } - + unsigned stopSymbolId = GetTargetSymbolId(m_phraseStopSymbol); unsigned encodedSymbol = EncodeREncSymbol1(stopSymbolId); os.write((char*)&encodedSymbol, sizeof(encodedSymbol)); - m_symbolCounter.Increase(encodedSymbol); + m_symbolCounter.Increase(encodedSymbol); } void PhraseTableCreator::EncodeTargetPhrasePREnc(std::vector& s, - std::vector& t, - std::set& a, - size_t ownRank, - std::ostream& os) + std::vector& t, + std::set& a, + size_t ownRank, + std::ostream& os) { std::vector encodedSymbols(t.size()); std::vector encodedSymbolsLengths(t.size(), 0); - + ConsistentPhrases cp(s.size(), t.size(), a); while(!cp.Empty()) { ConsistentPhrases::Phrase p = cp.Pop(); - + std::stringstream key1; key1 << s[p.i]; for(int i = p.i+1; i < p.i+p.m; i++) key1 << " " << s[i]; - + std::stringstream key2; key2 << t[p.j]; for(int i = p.j+1; i < p.j+p.n; i++) key2 << " " << t[i]; - + int rank = -1; std::string key1Str = key1.str(), key2Str = key2.str(); size_t idx = m_rnkHash[MakeSourceTargetKey(key1Str, key2Str)]; if(idx != m_rnkHash.GetSize()) - rank = m_ranks[idx]; - - if(rank >= 0 && (m_maxRank == 0 || unsigned(rank) < m_maxRank)) - { - if(unsigned(p.m) != s.size() || unsigned(rank) < ownRank) - { + rank = m_ranks[idx]; + + if(rank >= 0 && (m_maxRank == 0 || unsigned(rank) < m_maxRank)) { + if(unsigned(p.m) != s.size() || unsigned(rank) < ownRank) { std::stringstream encodedSymbol; encodedSymbols[p.j] = EncodePREncSymbol2(p.i-p.j, s.size()-(p.i+p.m), rank); encodedSymbolsLengths[p.j] = p.n; - + std::set tAlignment; for(std::set::iterator it = a.begin(); - it != a.end(); it++) + it != a.end(); it++) if(it->first < p.i || it->first >= p.i + p.m - || it->second < p.j 
|| it->second >= p.j + p.n) - tAlignment.insert(*it); + || it->second < p.j || it->second >= p.j + p.n) + tAlignment.insert(*it); a = tAlignment; - cp.RemoveOverlap(p); + cp.RemoveOverlap(p); } } } - + std::stringstream encodedTargetPhrase; - + size_t j = 0; - while(j < t.size()) - { - if(encodedSymbolsLengths[j] > 0) - { + while(j < t.size()) { + if(encodedSymbolsLengths[j] > 0) { unsigned encodedSymbol = encodedSymbols[j]; m_symbolCounter.Increase(encodedSymbol); os.write((char*)&encodedSymbol, sizeof(encodedSymbol)); j += encodedSymbolsLengths[j]; - } - else - { + } else { unsigned targetSymbolId = GetOrAddTargetSymbolId(t[j]); unsigned encodedSymbol = EncodePREncSymbol1(targetSymbolId); m_symbolCounter.Increase(encodedSymbol); @@ -691,7 +660,7 @@ void PhraseTableCreator::EncodeTargetPhrasePREnc(std::vector& s, j++; } } - + unsigned stopSymbolId = GetTargetSymbolId(m_phraseStopSymbol); unsigned encodedSymbol = EncodePREncSymbol1(stopSymbolId); os.write((char*)&encodedSymbol, sizeof(encodedSymbol)); @@ -702,9 +671,8 @@ void PhraseTableCreator::EncodeScores(std::vector& scores, std::ostream& { size_t c = 0; float score; - - while(c < scores.size()) - { + + while(c < scores.size()) { score = scores[c]; score = FloorScore(TransformScore(score)); os.write((char*)&score, sizeof(score)); @@ -714,11 +682,10 @@ void PhraseTableCreator::EncodeScores(std::vector& scores, std::ostream& } void PhraseTableCreator::EncodeAlignment(std::set& alignment, - std::ostream& os) + std::ostream& os) { for(std::set::iterator it = alignment.begin(); - it != alignment.end(); it++) - { + it != alignment.end(); it++) { os.write((char*)&(*it), sizeof(AlignPoint)); m_alignCounter.Increase(*it); } @@ -728,83 +695,77 @@ void PhraseTableCreator::EncodeAlignment(std::set& alignment, } std::string PhraseTableCreator::EncodeLine(std::vector& tokens, size_t ownRank) -{ +{ std::string sourcePhraseStr = tokens[0]; std::string targetPhraseStr = tokens[1]; std::string scoresStr = tokens[2]; - + std::string alignmentStr = ""; if(tokens.size() > 3) alignmentStr = tokens[3]; - + std::vector s = Tokenize(sourcePhraseStr); - + size_t phraseLength = s.size(); if(m_maxPhraseLength < phraseLength) m_maxPhraseLength = phraseLength; - + std::vector t = Tokenize(targetPhraseStr); std::vector scores = Tokenize(scoresStr); - + if(scores.size() != m_numScoreComponent) { std::cerr << "Error: Wrong number of scores detected (" - << scores.size() << " != " << m_numScoreComponent << ") :" << std::endl; + << scores.size() << " != " << m_numScoreComponent << ") :" << std::endl; std::cerr << "Line: " << tokens[0] << " ||| " << tokens[1] << " ||| " << tokens[3] << " ..." 
<< std::endl; - abort(); + abort(); } - + std::set a; - if(m_coding != None || m_useAlignmentInfo) - { + if(m_coding != None || m_useAlignmentInfo) { std::vector positions = Tokenize(alignmentStr, " \t-"); - for(size_t i = 0; i < positions.size(); i += 2) - { + for(size_t i = 0; i < positions.size(); i += 2) { a.insert(AlignPoint(positions[i], positions[i+1])); } } - + std::stringstream encodedTargetPhrase; - - if(m_coding == PREnc) - { + + if(m_coding == PREnc) { EncodeTargetPhrasePREnc(s, t, a, ownRank, encodedTargetPhrase); + } else if(m_coding == REnc) { + EncodeTargetPhraseREnc(s, t, a, encodedTargetPhrase); + } else { + EncodeTargetPhraseNone(t, encodedTargetPhrase); } - else if(m_coding == REnc) - { - EncodeTargetPhraseREnc(s, t, a, encodedTargetPhrase); - } - else - { - EncodeTargetPhraseNone(t, encodedTargetPhrase); - } - + EncodeScores(scores, encodedTargetPhrase); - + if(m_useAlignmentInfo) EncodeAlignment(a, encodedTargetPhrase); - + return encodedTargetPhrase.str(); } std::string PhraseTableCreator::CompressEncodedCollection(std::string encodedCollection) -{ +{ enum EncodeState { - ReadSymbol, ReadScore, ReadAlignment, - EncodeSymbol, EncodeScore, EncodeAlignment }; + ReadSymbol, ReadScore, ReadAlignment, + EncodeSymbol, EncodeScore, EncodeAlignment + }; EncodeState state = ReadSymbol; unsigned phraseStopSymbolId; if(m_coding == REnc) phraseStopSymbolId = EncodeREncSymbol1(GetTargetSymbolId(m_phraseStopSymbol)); else if(m_coding == PREnc) - phraseStopSymbolId = EncodePREncSymbol1(GetTargetSymbolId(m_phraseStopSymbol)); + phraseStopSymbolId = EncodePREncSymbol1(GetTargetSymbolId(m_phraseStopSymbol)); else phraseStopSymbolId = GetTargetSymbolId(m_phraseStopSymbol); AlignPoint alignStopSymbol(-1, -1); std::stringstream encodedStream(encodedCollection); encodedStream.unsetf(std::ios::skipws); - + std::string compressedEncodedCollection; BitWrapper<> bitStream(compressedEncodedCollection); @@ -812,56 +773,50 @@ std::string PhraseTableCreator::CompressEncodedCollection(std::string encodedCol float score; size_t currScore = 0; AlignPoint alignPoint; - - while(encodedStream) - { - switch(state) - { - case ReadSymbol: - encodedStream.read((char*) &symbol, sizeof(unsigned)); - state = EncodeSymbol; - break; - case ReadScore: - if(currScore == m_numScoreComponent) - { - currScore = 0; - if(m_useAlignmentInfo) - state = ReadAlignment; - else - state = ReadSymbol; - } + + while(encodedStream) { + switch(state) { + case ReadSymbol: + encodedStream.read((char*) &symbol, sizeof(unsigned)); + state = EncodeSymbol; + break; + case ReadScore: + if(currScore == m_numScoreComponent) { + currScore = 0; + if(m_useAlignmentInfo) + state = ReadAlignment; else - { - encodedStream.read((char*) &score, sizeof(float)); - currScore++; - state = EncodeScore; - } - break; - case ReadAlignment: - encodedStream.read((char*) &alignPoint, sizeof(AlignPoint)); - state = EncodeAlignment; - break; - - case EncodeSymbol: - state = (symbol == phraseStopSymbolId) ? ReadScore : ReadSymbol; - m_symbolTree->Put(bitStream, symbol); - break; - case EncodeScore: - { - state = ReadScore; - size_t idx = m_multipleScoreTrees ? currScore-1 : 0; - if(m_quantize) - score = m_scoreCounters[idx]->LowerBound(score); - m_scoreTrees[idx]->Put(bitStream, score); - } - break; - case EncodeAlignment: - state = (alignPoint == alignStopSymbol) ? 
ReadSymbol : ReadAlignment; - m_alignTree->Put(bitStream, alignPoint); - break; + state = ReadSymbol; + } else { + encodedStream.read((char*) &score, sizeof(float)); + currScore++; + state = EncodeScore; + } + break; + case ReadAlignment: + encodedStream.read((char*) &alignPoint, sizeof(AlignPoint)); + state = EncodeAlignment; + break; + + case EncodeSymbol: + state = (symbol == phraseStopSymbolId) ? ReadScore : ReadSymbol; + m_symbolTree->Put(bitStream, symbol); + break; + case EncodeScore: { + state = ReadScore; + size_t idx = m_multipleScoreTrees ? currScore-1 : 0; + if(m_quantize) + score = m_scoreCounters[idx]->LowerBound(score); + m_scoreTrees[idx]->Put(bitStream, score); + } + break; + case EncodeAlignment: + state = (alignPoint == alignStopSymbol) ? ReadSymbol : ReadAlignment; + m_alignTree->Put(bitStream, alignPoint); + break; } } - + return compressedEncodedCollection; } @@ -873,32 +828,28 @@ void PhraseTableCreator::AddRankedLine(PackedItem& pi) void PhraseTableCreator::FlushRankedQueue(bool force) { size_t step = 1ul << 10; - - while(!m_queue.empty() && m_lastFlushedLine + 1 == m_queue.top().GetLine()) - { + + while(!m_queue.empty() && m_lastFlushedLine + 1 == m_queue.top().GetLine()) { m_lastFlushedLine++; PackedItem pi = m_queue.top(); m_queue.pop(); - - if(m_lastSourceRange.size() == step) - { + + if(m_lastSourceRange.size() == step) { m_rnkHash.AddRange(m_lastSourceRange); m_lastSourceRange.clear(); } - - if(m_lastFlushedSourcePhrase != pi.GetSrc()) - { - if(m_rankQueue.size()) { + + if(m_lastFlushedSourcePhrase != pi.GetSrc()) { + if(m_rankQueue.size()) { m_lastFlushedSourceNum++; if(m_lastFlushedSourceNum % 100000 == 0) { std::cerr << "."; } - if(m_lastFlushedSourceNum % 5000000 == 0) - { + if(m_lastFlushedSourceNum % 5000000 == 0) { std::cerr << "[" << m_lastFlushedSourceNum << "]" << std::endl; } - + m_ranks.resize(m_lastFlushedLine + 1); int r = 0; while(!m_rankQueue.empty()) { @@ -907,33 +858,31 @@ void PhraseTableCreator::FlushRankedQueue(bool force) } } } - + m_lastSourceRange.push_back(pi.GetTrg()); - + m_rankQueue.push(std::make_pair(pi.GetScore(), pi.GetLine())); m_lastFlushedSourcePhrase = pi.GetSrc(); } - - if(force) - { + + if(force) { m_rnkHash.AddRange(m_lastSourceRange); m_lastSourceRange.clear(); #ifdef WITH_THREADS m_rnkHash.WaitAll(); #endif - + m_ranks.resize(m_lastFlushedLine + 1); int r = 0; - while(!m_rankQueue.empty()) - { + while(!m_rankQueue.empty()) { m_ranks[m_rankQueue.top().second] = r++; m_rankQueue.pop(); } m_lastFlushedLine = -1; m_lastFlushedSourceNum = 0; - + std::cerr << std::endl << std::endl; } } @@ -946,74 +895,65 @@ void PhraseTableCreator::AddEncodedLine(PackedItem& pi) void PhraseTableCreator::FlushEncodedQueue(bool force) { - while(!m_queue.empty() && m_lastFlushedLine + 1 == m_queue.top().GetLine()) - { + while(!m_queue.empty() && m_lastFlushedLine + 1 == m_queue.top().GetLine()) { PackedItem pi = m_queue.top(); m_queue.pop(); m_lastFlushedLine++; - - if(m_lastFlushedSourcePhrase != pi.GetSrc()) - { - if(m_lastCollection.size()) - { + + if(m_lastFlushedSourcePhrase != pi.GetSrc()) { + if(m_lastCollection.size()) { std::stringstream targetPhraseCollection; for(std::vector::iterator it = - m_lastCollection.begin(); it != m_lastCollection.end(); it++) + m_lastCollection.begin(); it != m_lastCollection.end(); it++) targetPhraseCollection << *it; - - m_lastSourceRange.push_back(MakeSourceKey(m_lastFlushedSourcePhrase)); + + m_lastSourceRange.push_back(MakeSourceKey(m_lastFlushedSourcePhrase)); 
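A note on the pattern shared by FlushRankedQueue above and FlushEncodedQueue/FlushCompressedQueue nearby: worker threads hand back results out of order, so each result is parked in a priority queue keyed on its input line number and released only while the smallest queued line is exactly m_lastFlushedLine + 1; a gap simply stalls the flush until the missing line arrives. A minimal self-contained sketch of that reorder buffer, with illustrative names that are not part of the Moses API:

#include <iostream>
#include <queue>
#include <string>
#include <utility>
#include <vector>

typedef std::pair<long, std::string> Item;   // (input line number, payload)

// Order the heap so that the smallest line number is on top.
struct ByLine {
  bool operator()(const Item& a, const Item& b) const {
    return a.first > b.first;
  }
};

int main()
{
  std::priority_queue<Item, std::vector<Item>, ByLine> parked;
  long lastFlushed = -1;

  // Results arrive out of order, as they would from parallel worker tasks.
  parked.push(std::make_pair(2L, std::string("third")));
  parked.push(std::make_pair(0L, std::string("first")));
  parked.push(std::make_pair(1L, std::string("second")));

  // Flush only while the next expected line is on top; a gap stalls output.
  while(!parked.empty() && lastFlushed + 1 == parked.top().first) {
    std::cout << parked.top().first << ": " << parked.top().second << std::endl;
    lastFlushed = parked.top().first;
    parked.pop();
  }
  return 0;
}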
m_encodedTargetPhrases->push_back(targetPhraseCollection.str()); - + m_lastFlushedSourceNum++; if(m_lastFlushedSourceNum % 100000 == 0) std::cerr << "."; if(m_lastFlushedSourceNum % 5000000 == 0) std::cerr << "[" << m_lastFlushedSourceNum << "]" << std::endl; - + m_lastCollection.clear(); } } - - if(m_lastSourceRange.size() == (1ul << m_orderBits)) - { + + if(m_lastSourceRange.size() == (1ul << m_orderBits)) { m_srcHash.AddRange(m_lastSourceRange); m_srcHash.SaveLastRange(); m_srcHash.DropLastRange(); m_lastSourceRange.clear(); } - + m_lastFlushedSourcePhrase = pi.GetSrc(); - if(m_coding == PREnc) - { + if(m_coding == PREnc) { if(m_lastCollection.size() <= pi.GetRank()) m_lastCollection.resize(pi.GetRank() + 1); m_lastCollection[pi.GetRank()] = pi.GetTrg(); - } - else - { + } else { m_lastCollection.push_back(pi.GetTrg()); } } - - if(force) - { + + if(force) { if(!m_lastSourceRange.size() || m_lastSourceRange.back() != m_lastFlushedSourcePhrase) m_lastSourceRange.push_back(MakeSourceKey(m_lastFlushedSourcePhrase)); - - if(m_lastCollection.size()) - { + + if(m_lastCollection.size()) { std::stringstream targetPhraseCollection; for(std::vector::iterator it = - m_lastCollection.begin(); it != m_lastCollection.end(); it++) + m_lastCollection.begin(); it != m_lastCollection.end(); it++) targetPhraseCollection << *it; - + m_encodedTargetPhrases->push_back(targetPhraseCollection.str()); m_lastCollection.clear(); } - + m_srcHash.AddRange(m_lastSourceRange); m_lastSourceRange.clear(); - + #ifdef WITH_THREADS m_srcHash.WaitAll(); #endif @@ -1021,7 +961,7 @@ void PhraseTableCreator::FlushEncodedQueue(bool force) m_srcHash.SaveLastRange(); m_srcHash.DropLastRange(); m_srcHash.FinalizeSave(); - + m_lastFlushedLine = -1; m_lastFlushedSourceNum = 0; @@ -1031,30 +971,27 @@ void PhraseTableCreator::FlushEncodedQueue(bool force) void PhraseTableCreator::AddCompressedCollection(PackedItem& pi) { - m_queue.push(pi); + m_queue.push(pi); } void PhraseTableCreator::FlushCompressedQueue(bool force) { - if(force || m_queue.size() > 10000) - { - while(!m_queue.empty() && m_lastFlushedLine + 1 == m_queue.top().GetLine()) - { + if(force || m_queue.size() > 10000) { + while(!m_queue.empty() && m_lastFlushedLine + 1 == m_queue.top().GetLine()) { PackedItem pi = m_queue.top(); m_queue.pop(); m_lastFlushedLine++; - + m_compressedTargetPhrases->push_back(pi.GetTrg()); - + if((pi.GetLine()+1) % 100000 == 0) std::cerr << "."; if((pi.GetLine()+1) % 5000000 == 0) std::cerr << "[" << (pi.GetLine()+1) << "]" << std::endl; } } - - if(force) - { + + if(force) { m_lastFlushedLine = -1; std::cerr << std::endl << std::endl; } @@ -1070,38 +1007,35 @@ boost::mutex RankingTask::m_fileMutex; RankingTask::RankingTask(InputFileStream& inFile, PhraseTableCreator& creator) : m_inFile(inFile), m_creator(creator) {} - + void RankingTask::operator()() { size_t lineNum = 0; - + std::vector lines; size_t max_lines = 1000; lines.reserve(max_lines); - + { #ifdef WITH_THREADS - boost::mutex::scoped_lock lock(m_fileMutex); + boost::mutex::scoped_lock lock(m_fileMutex); #endif - std::string line; - while(lines.size() < max_lines && std::getline(m_inFile, line)) - lines.push_back(line); - lineNum = m_lineNum; - m_lineNum += lines.size(); + std::string line; + while(lines.size() < max_lines && std::getline(m_inFile, line)) + lines.push_back(line); + lineNum = m_lineNum; + m_lineNum += lines.size(); } - + std::vector result; result.reserve(max_lines); - - while(lines.size()) - { - for(size_t i = 0; i < lines.size(); i++) - { + + while(lines.size()) { + 
for(size_t i = 0; i < lines.size(); i++) { std::vector tokens; Moses::TokenizeMultiCharSeparator(tokens, lines[i], m_creator.m_separator); - - if(tokens.size() < 3) - { + + if(tokens.size() < 3) { std::cerr << "Error: It seems the following line has a wrong format:" << std::endl; std::cerr << "Line " << i << ": " << lines[i] << std::endl; abort(); @@ -1112,38 +1046,38 @@ void RankingTask::operator()() std::cerr << "Better use -encoding None or disable this warning with -no-warnings ." << std::endl; std::cerr << "Line " << i << ": " << lines[i] << std::endl; } - + std::vector scores = Tokenize(tokens[2]); if(scores.size() != m_creator.m_numScoreComponent) { std::cerr << "Error: It seems the following line has a wrong number of scores (" - << scores.size() << " != " << m_creator.m_numScoreComponent << ") :" << std::endl; + << scores.size() << " != " << m_creator.m_numScoreComponent << ") :" << std::endl; std::cerr << "Line " << i << ": " << lines[i] << std::endl; - abort(); + abort(); } - + float sortScore = scores[m_creator.m_sortScoreIndex]; - + std::string key1 = m_creator.MakeSourceKey(tokens[0]); std::string key2 = m_creator.MakeSourceTargetKey(tokens[0], tokens[1]); - + PackedItem packedItem(lineNum + i, key1, key2, 0, sortScore); result.push_back(packedItem); } lines.clear(); - + { #ifdef WITH_THREADS boost::mutex::scoped_lock lock(m_mutex); #endif - for(size_t i = 0; i < result.size(); i++) + for(size_t i = 0; i < result.size(); i++) m_creator.AddRankedLine(result[i]); - m_creator.FlushRankedQueue(); + m_creator.FlushRankedQueue(); } - + result.clear(); lines.reserve(max_lines); result.reserve(max_lines); - + #ifdef WITH_THREADS boost::mutex::scoped_lock lock(m_fileMutex); #endif @@ -1163,15 +1097,15 @@ boost::mutex EncodingTask::m_fileMutex; EncodingTask::EncodingTask(InputFileStream& inFile, PhraseTableCreator& creator) : m_inFile(inFile), m_creator(creator) {} - + void EncodingTask::operator()() { size_t lineNum = 0; - + std::vector lines; size_t max_lines = 1000; lines.reserve(max_lines); - + { #ifdef WITH_THREADS boost::mutex::scoped_lock lock(m_fileMutex); @@ -1182,19 +1116,16 @@ void EncodingTask::operator()() lineNum = m_lineNum; m_lineNum += lines.size(); } - + std::vector result; result.reserve(max_lines); - - while(lines.size()) - { - for(size_t i = 0; i < lines.size(); i++) - { + + while(lines.size()) { + for(size_t i = 0; i < lines.size(); i++) { std::vector tokens; Moses::TokenizeMultiCharSeparator(tokens, lines[i], m_creator.m_separator); - - if(tokens.size() < 3) - { + + if(tokens.size() < 3) { std::cerr << "Error: It seems the following line has a wrong format:" << std::endl; std::cerr << "Line " << i << ": " << lines[i] << std::endl; abort(); @@ -1207,31 +1138,31 @@ void EncodingTask::operator()() std::cerr << "Better use -encoding None or disable this warning with -no-warnings." 
<< std::endl; std::cerr << "Line " << i << ": " << lines[i] << std::endl; } - + size_t ownRank = 0; if(m_creator.m_coding == PhraseTableCreator::PREnc) ownRank = m_creator.m_ranks[lineNum + i]; - + std::string encodedLine = m_creator.EncodeLine(tokens, ownRank); - + PackedItem packedItem(lineNum + i, tokens[0], encodedLine, ownRank); result.push_back(packedItem); } lines.clear(); - + { #ifdef WITH_THREADS boost::mutex::scoped_lock lock(m_mutex); #endif - for(size_t i = 0; i < result.size(); i++) + for(size_t i = 0; i < result.size(); i++) m_creator.AddEncodedLine(result[i]); - m_creator.FlushEncodedQueue(); + m_creator.FlushEncodedQueue(); } - + result.clear(); lines.reserve(max_lines); result.reserve(max_lines); - + #ifdef WITH_THREADS boost::mutex::scoped_lock lock(m_fileMutex); #endif @@ -1251,10 +1182,10 @@ boost::mutex CompressionTask::m_mutex; #endif CompressionTask::CompressionTask(StringVector& encodedCollections, - PhraseTableCreator& creator) + MmapAllocator>& encodedCollections, + PhraseTableCreator& creator) : m_encodedCollections(encodedCollections), m_creator(creator) {} - + void CompressionTask::operator()() { size_t collectionNum; @@ -1265,12 +1196,11 @@ void CompressionTask::operator()() collectionNum = m_collectionNum; m_collectionNum++; } - - while(collectionNum < m_encodedCollections.size()) - { + + while(collectionNum < m_encodedCollections.size()) { std::string collection = m_encodedCollections[collectionNum]; std::string compressedCollection - = m_creator.CompressEncodedCollection(collection); + = m_creator.CompressEncodedCollection(collection); std::string dummy; PackedItem packedItem(collectionNum, dummy, compressedCollection, 0); @@ -1280,29 +1210,44 @@ void CompressionTask::operator()() #endif m_creator.AddCompressedCollection(packedItem); m_creator.FlushCompressedQueue(); - - collectionNum = m_collectionNum; - m_collectionNum++; + + collectionNum = m_collectionNum; + m_collectionNum++; } } //****************************************************************************// PackedItem::PackedItem(long line, std::string sourcePhrase, - std::string packedTargetPhrase, size_t rank, - float score) + std::string packedTargetPhrase, size_t rank, + float score) : m_line(line), m_sourcePhrase(sourcePhrase), m_packedTargetPhrase(packedTargetPhrase), m_rank(rank), m_score(score) {} -long PackedItem::GetLine() const { return m_line; } +long PackedItem::GetLine() const +{ + return m_line; +} -const std::string& PackedItem::GetSrc() const { return m_sourcePhrase; } +const std::string& PackedItem::GetSrc() const +{ + return m_sourcePhrase; +} -const std::string& PackedItem::GetTrg() const { return m_packedTargetPhrase; } +const std::string& PackedItem::GetTrg() const +{ + return m_packedTargetPhrase; +} -size_t PackedItem::GetRank() const { return m_rank; } +size_t PackedItem::GetRank() const +{ + return m_rank; +} -float PackedItem::GetScore() const { return m_score; } +float PackedItem::GetScore() const +{ + return m_score; +} } diff --git a/moses/TranslationModel/CompactPT/PhraseTableCreator.h b/moses/TranslationModel/CompactPT/PhraseTableCreator.h index ded3a84eb..fd5fc1581 100644 --- a/moses/TranslationModel/CompactPT/PhraseTableCreator.h +++ b/moses/TranslationModel/CompactPT/PhraseTableCreator.h @@ -1,23 +1,23 @@ -// $Id$ -// vim:tabstop=2 -/*********************************************************************** -Moses - factored phrase-based language decoder -Copyright (C) 2006 University of Edinburgh - -This library is free software; you can redistribute it and/or -modify 
it under the terms of the GNU Lesser General Public -License as published by the Free Software Foundation; either -version 2.1 of the License, or (at your option) any later version. - -This library is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -Lesser General Public License for more details. - -You should have received a copy of the GNU Lesser General Public -License along with this library; if not, write to the Free Software -Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -***********************************************************************/ +// $Id$ +// vim:tabstop=2 +/*********************************************************************** +Moses - factored phrase-based language decoder +Copyright (C) 2006 University of Edinburgh + +This library is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License as published by the Free Software Foundation; either +version 2.1 of the License, or (at your option) any later version. + +This library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Lesser General Public License for more details. + +You should have received a copy of the GNU Lesser General Public +License along with this library; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ #ifndef moses_PhraseTableCreator_h #define moses_PhraseTableCreator_h @@ -40,386 +40,371 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA namespace Moses { - + typedef std::pair AlignPoint; -template +template class Counter { - public: - typedef boost::unordered_map FreqMap; - typedef typename FreqMap::iterator iterator; - typedef typename FreqMap::mapped_type mapped_type; - typedef typename FreqMap::value_type value_type; +public: + typedef boost::unordered_map FreqMap; + typedef typename FreqMap::iterator iterator; + typedef typename FreqMap::mapped_type mapped_type; + typedef typename FreqMap::value_type value_type; - private: -#ifdef WITH_THREADS - boost::mutex m_mutex; -#endif - FreqMap m_freqMap; - size_t m_maxSize; - std::vector m_bestVec; - - struct FreqSorter - { - bool operator()(const value_type& a, const value_type& b) const - { - if(a.second > b.second) - return true; - // Check impact on translation quality! - if(a.second == b.second && a.first > b.first) - return true; - return false; - } - }; - - public: - Counter() : m_maxSize(0) {} - - iterator Begin() - { - return m_freqMap.begin(); - } - - iterator End() - { - return m_freqMap.end(); - } - - void Increase(DataType data) - { +private: #ifdef WITH_THREADS - boost::mutex::scoped_lock lock(m_mutex); + boost::mutex m_mutex; #endif - m_freqMap[data]++; + FreqMap m_freqMap; + size_t m_maxSize; + std::vector m_bestVec; + + struct FreqSorter { + bool operator()(const value_type& a, const value_type& b) const { + if(a.second > b.second) + return true; + // Check impact on translation quality! 
+ if(a.second == b.second && a.first > b.first) + return true; + return false; } - - void IncreaseBy(DataType data, size_t num) - { + }; + +public: + Counter() : m_maxSize(0) {} + + iterator Begin() { + return m_freqMap.begin(); + } + + iterator End() { + return m_freqMap.end(); + } + + void Increase(DataType data) { #ifdef WITH_THREADS - boost::mutex::scoped_lock lock(m_mutex); + boost::mutex::scoped_lock lock(m_mutex); #endif - m_freqMap[data] += num; - } - - mapped_type& operator[](DataType data) - { - return m_freqMap[data]; - } - - size_t Size() - { + m_freqMap[data]++; + } + + void IncreaseBy(DataType data, size_t num) { #ifdef WITH_THREADS - boost::mutex::scoped_lock lock(m_mutex); + boost::mutex::scoped_lock lock(m_mutex); #endif - return m_freqMap.size(); - } - - void Quantize(size_t maxSize) - { + m_freqMap[data] += num; + } + + mapped_type& operator[](DataType data) { + return m_freqMap[data]; + } + + size_t Size() { #ifdef WITH_THREADS - boost::mutex::scoped_lock lock(m_mutex); + boost::mutex::scoped_lock lock(m_mutex); #endif - m_maxSize = maxSize; - std::vector > freqVec; - freqVec.insert(freqVec.begin(), m_freqMap.begin(), m_freqMap.end()); - std::sort(freqVec.begin(), freqVec.end(), FreqSorter()); - - for(size_t i = 0; i < freqVec.size() && i < m_maxSize; i++) - m_bestVec.push_back(freqVec[i].first); - - std::sort(m_bestVec.begin(), m_bestVec.end()); - - FreqMap t_freqMap; - for(typename std::vector >::iterator it - = freqVec.begin(); it != freqVec.end(); it++) - { - DataType closest = LowerBound(it->first); - t_freqMap[closest] += it->second; - } - - m_freqMap.swap(t_freqMap); - } - - void Clear() - { + return m_freqMap.size(); + } + + void Quantize(size_t maxSize) { #ifdef WITH_THREADS - boost::mutex::scoped_lock lock(m_mutex); + boost::mutex::scoped_lock lock(m_mutex); #endif - m_freqMap.clear(); + m_maxSize = maxSize; + std::vector > freqVec; + freqVec.insert(freqVec.begin(), m_freqMap.begin(), m_freqMap.end()); + std::sort(freqVec.begin(), freqVec.end(), FreqSorter()); + + for(size_t i = 0; i < freqVec.size() && i < m_maxSize; i++) + m_bestVec.push_back(freqVec[i].first); + + std::sort(m_bestVec.begin(), m_bestVec.end()); + + FreqMap t_freqMap; + for(typename std::vector >::iterator it + = freqVec.begin(); it != freqVec.end(); it++) { + DataType closest = LowerBound(it->first); + t_freqMap[closest] += it->second; } - - DataType LowerBound(DataType data) - { - if(m_maxSize == 0 || m_bestVec.size() == 0) - return data; + + m_freqMap.swap(t_freqMap); + } + + void Clear() { +#ifdef WITH_THREADS + boost::mutex::scoped_lock lock(m_mutex); +#endif + m_freqMap.clear(); + } + + DataType LowerBound(DataType data) { + if(m_maxSize == 0 || m_bestVec.size() == 0) + return data; + else { + typename std::vector::iterator it + = std::lower_bound(m_bestVec.begin(), m_bestVec.end(), data); + if(it != m_bestVec.end()) + return *it; else - { - typename std::vector::iterator it - = std::lower_bound(m_bestVec.begin(), m_bestVec.end(), data); - if(it != m_bestVec.end()) - return *it; - else - return m_bestVec.back(); - } + return m_bestVec.back(); } + } }; - + class PackedItem { - private: - long m_line; - std::string m_sourcePhrase; - std::string m_packedTargetPhrase; - size_t m_rank; - float m_score; - - public: - PackedItem(long line, std::string sourcePhrase, - std::string packedTargetPhrase, size_t rank, - float m_score = 0); - - long GetLine() const; - const std::string& GetSrc() const; - const std::string& GetTrg() const; - size_t GetRank() const; - float GetScore() const; +private: + 
long m_line; + std::string m_sourcePhrase; + std::string m_packedTargetPhrase; + size_t m_rank; + float m_score; + +public: + PackedItem(long line, std::string sourcePhrase, + std::string packedTargetPhrase, size_t rank, + float m_score = 0); + + long GetLine() const; + const std::string& GetSrc() const; + const std::string& GetTrg() const; + size_t GetRank() const; + float GetScore() const; }; bool operator<(const PackedItem &pi1, const PackedItem &pi2); class PhraseTableCreator { - public: - enum Coding { None, REnc, PREnc }; - - private: - std::string m_inPath; - std::string m_outPath; - std::string m_tempfilePath; - - std::FILE* m_outFile; - - size_t m_numScoreComponent; - size_t m_sortScoreIndex; - size_t m_warnMe; - - Coding m_coding; - size_t m_orderBits; - size_t m_fingerPrintBits; - bool m_useAlignmentInfo; - bool m_multipleScoreTrees; - size_t m_quantize; - size_t m_maxRank; - - static std::string m_phraseStopSymbol; - static std::string m_separator; - -#ifdef WITH_THREADS - size_t m_threads; - boost::mutex m_mutex; -#endif - - BlockHashIndex m_srcHash; - BlockHashIndex m_rnkHash; - - size_t m_maxPhraseLength; - - std::vector m_ranks; - - typedef std::pair SrcTrg; - typedef std::pair SrcTrgString; - typedef std::pair SrcTrgProb; - - struct SrcTrgProbSorter - { - bool operator()(const SrcTrgProb& a, const SrcTrgProb& b) const - { - if(a.first.first < b.first.first) - return true; +public: + enum Coding { None, REnc, PREnc }; + +private: + std::string m_inPath; + std::string m_outPath; + std::string m_tempfilePath; + + std::FILE* m_outFile; + + size_t m_numScoreComponent; + size_t m_sortScoreIndex; + size_t m_warnMe; + + Coding m_coding; + size_t m_orderBits; + size_t m_fingerPrintBits; + bool m_useAlignmentInfo; + bool m_multipleScoreTrees; + size_t m_quantize; + size_t m_maxRank; + + static std::string m_phraseStopSymbol; + static std::string m_separator; - if(a.first.first == b.first.first && a.second > b.second) - return true; - - if(a.first.first == b.first.first - && a.second == b.second - && a.first.second < b.first.second) - return true; - - return false; - } - }; - - std::vector m_lexicalTableIndex; - std::vector m_lexicalTable; - - StringVector* - m_encodedTargetPhrases; - - StringVector* - m_compressedTargetPhrases; - - boost::unordered_map m_targetSymbolsMap; - boost::unordered_map m_sourceSymbolsMap; - - typedef Counter SymbolCounter; - typedef Counter ScoreCounter; - typedef Counter AlignCounter; - - typedef CanonicalHuffman SymbolTree; - typedef CanonicalHuffman ScoreTree; - typedef CanonicalHuffman AlignTree; - - SymbolCounter m_symbolCounter; - SymbolTree* m_symbolTree; - - AlignCounter m_alignCounter; - AlignTree* m_alignTree; - - std::vector m_scoreCounters; - std::vector m_scoreTrees; - - std::priority_queue m_queue; - long m_lastFlushedLine; - long m_lastFlushedSourceNum; - std::string m_lastFlushedSourcePhrase; - std::vector m_lastSourceRange; - std::priority_queue > m_rankQueue; - std::vector m_lastCollection; - - void Save(); - void PrintInfo(); - - void AddSourceSymbolId(std::string& symbol); - unsigned GetSourceSymbolId(std::string& symbol); - - void AddTargetSymbolId(std::string& symbol); - unsigned GetTargetSymbolId(std::string& symbol); - unsigned GetOrAddTargetSymbolId(std::string& symbol); - - unsigned GetRank(unsigned srcIdx, unsigned trgIdx); - - unsigned EncodeREncSymbol1(unsigned symbol); - unsigned EncodeREncSymbol2(unsigned position, unsigned rank); - unsigned EncodeREncSymbol3(unsigned rank); - - unsigned EncodePREncSymbol1(unsigned symbol); - 
unsigned EncodePREncSymbol2(int lOff, int rOff, unsigned rank); - - void EncodeTargetPhraseNone(std::vector& t, - std::ostream& os); - - void EncodeTargetPhraseREnc(std::vector& s, - std::vector& t, - std::set& a, - std::ostream& os); - - void EncodeTargetPhrasePREnc(std::vector& s, - std::vector& t, - std::set& a, size_t ownRank, - std::ostream& os); - - void EncodeScores(std::vector& scores, std::ostream& os); - void EncodeAlignment(std::set& alignment, std::ostream& os); - - std::string MakeSourceKey(std::string&); - std::string MakeSourceTargetKey(std::string&, std::string&); - - void LoadLexicalTable(std::string filePath); - - void CreateRankHash(); - void EncodeTargetPhrases(); - void CalcHuffmanCodes(); - void CompressTargetPhrases(); - - void AddRankedLine(PackedItem& pi); - void FlushRankedQueue(bool force = false); - - std::string EncodeLine(std::vector& tokens, size_t ownRank); - void AddEncodedLine(PackedItem& pi); - void FlushEncodedQueue(bool force = false); - - std::string CompressEncodedCollection(std::string encodedCollection); - void AddCompressedCollection(PackedItem& pi); - void FlushCompressedQueue(bool force = false); - - public: - - PhraseTableCreator(std::string inPath, - std::string outPath, - std::string tempfilePath, - size_t numScoreComponent = 5, - size_t sortScoreIndex = 2, - Coding coding = PREnc, - size_t orderBits = 10, - size_t fingerPrintBits = 16, - bool useAlignmentInfo = false, - bool multipleScoreTrees = true, - size_t quantize = 0, - size_t maxRank = 100, - bool warnMe = true #ifdef WITH_THREADS - , size_t threads = 2 + size_t m_threads; + boost::mutex m_mutex; #endif - ); - - ~PhraseTableCreator(); - - friend class RankingTask; - friend class EncodingTask; - friend class CompressionTask; + + BlockHashIndex m_srcHash; + BlockHashIndex m_rnkHash; + + size_t m_maxPhraseLength; + + std::vector m_ranks; + + typedef std::pair SrcTrg; + typedef std::pair SrcTrgString; + typedef std::pair SrcTrgProb; + + struct SrcTrgProbSorter { + bool operator()(const SrcTrgProb& a, const SrcTrgProb& b) const { + if(a.first.first < b.first.first) + return true; + + if(a.first.first == b.first.first && a.second > b.second) + return true; + + if(a.first.first == b.first.first + && a.second == b.second + && a.first.second < b.first.second) + return true; + + return false; + } + }; + + std::vector m_lexicalTableIndex; + std::vector m_lexicalTable; + + StringVector* + m_encodedTargetPhrases; + + StringVector* + m_compressedTargetPhrases; + + boost::unordered_map m_targetSymbolsMap; + boost::unordered_map m_sourceSymbolsMap; + + typedef Counter SymbolCounter; + typedef Counter ScoreCounter; + typedef Counter AlignCounter; + + typedef CanonicalHuffman SymbolTree; + typedef CanonicalHuffman ScoreTree; + typedef CanonicalHuffman AlignTree; + + SymbolCounter m_symbolCounter; + SymbolTree* m_symbolTree; + + AlignCounter m_alignCounter; + AlignTree* m_alignTree; + + std::vector m_scoreCounters; + std::vector m_scoreTrees; + + std::priority_queue m_queue; + long m_lastFlushedLine; + long m_lastFlushedSourceNum; + std::string m_lastFlushedSourcePhrase; + std::vector m_lastSourceRange; + std::priority_queue > m_rankQueue; + std::vector m_lastCollection; + + void Save(); + void PrintInfo(); + + void AddSourceSymbolId(std::string& symbol); + unsigned GetSourceSymbolId(std::string& symbol); + + void AddTargetSymbolId(std::string& symbol); + unsigned GetTargetSymbolId(std::string& symbol); + unsigned GetOrAddTargetSymbolId(std::string& symbol); + + unsigned GetRank(unsigned srcIdx, unsigned 
trgIdx); + + unsigned EncodeREncSymbol1(unsigned symbol); + unsigned EncodeREncSymbol2(unsigned position, unsigned rank); + unsigned EncodeREncSymbol3(unsigned rank); + + unsigned EncodePREncSymbol1(unsigned symbol); + unsigned EncodePREncSymbol2(int lOff, int rOff, unsigned rank); + + void EncodeTargetPhraseNone(std::vector& t, + std::ostream& os); + + void EncodeTargetPhraseREnc(std::vector& s, + std::vector& t, + std::set& a, + std::ostream& os); + + void EncodeTargetPhrasePREnc(std::vector& s, + std::vector& t, + std::set& a, size_t ownRank, + std::ostream& os); + + void EncodeScores(std::vector& scores, std::ostream& os); + void EncodeAlignment(std::set& alignment, std::ostream& os); + + std::string MakeSourceKey(std::string&); + std::string MakeSourceTargetKey(std::string&, std::string&); + + void LoadLexicalTable(std::string filePath); + + void CreateRankHash(); + void EncodeTargetPhrases(); + void CalcHuffmanCodes(); + void CompressTargetPhrases(); + + void AddRankedLine(PackedItem& pi); + void FlushRankedQueue(bool force = false); + + std::string EncodeLine(std::vector& tokens, size_t ownRank); + void AddEncodedLine(PackedItem& pi); + void FlushEncodedQueue(bool force = false); + + std::string CompressEncodedCollection(std::string encodedCollection); + void AddCompressedCollection(PackedItem& pi); + void FlushCompressedQueue(bool force = false); + +public: + + PhraseTableCreator(std::string inPath, + std::string outPath, + std::string tempfilePath, + size_t numScoreComponent = 5, + size_t sortScoreIndex = 2, + Coding coding = PREnc, + size_t orderBits = 10, + size_t fingerPrintBits = 16, + bool useAlignmentInfo = false, + bool multipleScoreTrees = true, + size_t quantize = 0, + size_t maxRank = 100, + bool warnMe = true +#ifdef WITH_THREADS + , size_t threads = 2 +#endif + ); + + ~PhraseTableCreator(); + + friend class RankingTask; + friend class EncodingTask; + friend class CompressionTask; }; class RankingTask { - private: +private: #ifdef WITH_THREADS - static boost::mutex m_mutex; - static boost::mutex m_fileMutex; + static boost::mutex m_mutex; + static boost::mutex m_fileMutex; #endif - static size_t m_lineNum; - InputFileStream& m_inFile; - PhraseTableCreator& m_creator; - - public: - RankingTask(InputFileStream& inFile, PhraseTableCreator& creator); - void operator()(); + static size_t m_lineNum; + InputFileStream& m_inFile; + PhraseTableCreator& m_creator; + +public: + RankingTask(InputFileStream& inFile, PhraseTableCreator& creator); + void operator()(); }; class EncodingTask { - private: +private: #ifdef WITH_THREADS - static boost::mutex m_mutex; - static boost::mutex m_fileMutex; + static boost::mutex m_mutex; + static boost::mutex m_fileMutex; #endif - static size_t m_lineNum; - static size_t m_sourcePhraseNum; - static std::string m_lastSourcePhrase; - - InputFileStream& m_inFile; - PhraseTableCreator& m_creator; - - public: - EncodingTask(InputFileStream& inFile, PhraseTableCreator& creator); - void operator()(); + static size_t m_lineNum; + static size_t m_sourcePhraseNum; + static std::string m_lastSourcePhrase; + + InputFileStream& m_inFile; + PhraseTableCreator& m_creator; + +public: + EncodingTask(InputFileStream& inFile, PhraseTableCreator& creator); + void operator()(); }; class CompressionTask { - private: +private: #ifdef WITH_THREADS - static boost::mutex m_mutex; + static boost::mutex m_mutex; #endif - static size_t m_collectionNum; - StringVector& - m_encodedCollections; - PhraseTableCreator& m_creator; - - public: - CompressionTask(StringVector& - 
encodedCollections, PhraseTableCreator& creator); - void operator()(); + static size_t m_collectionNum; + StringVector& + m_encodedCollections; + PhraseTableCreator& m_creator; + +public: + CompressionTask(StringVector& + encodedCollections, PhraseTableCreator& creator); + void operator()(); }; } diff --git a/moses/TranslationModel/CompactPT/StringVector.h b/moses/TranslationModel/CompactPT/StringVector.h index fcc545a19..4545d61c6 100644 --- a/moses/TranslationModel/CompactPT/StringVector.h +++ b/moses/TranslationModel/CompactPT/StringVector.h @@ -1,23 +1,23 @@ -// $Id$ -// vim:tabstop=2 -/*********************************************************************** -Moses - factored phrase-based language decoder -Copyright (C) 2006 University of Edinburgh - -This library is free software; you can redistribute it and/or -modify it under the terms of the GNU Lesser General Public -License as published by the Free Software Foundation; either -version 2.1 of the License, or (at your option) any later version. - -This library is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -Lesser General Public License for more details. - -You should have received a copy of the GNU Lesser General Public -License along with this library; if not, write to the Free Software -Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -***********************************************************************/ +// $Id$ +// vim:tabstop=2 +/*********************************************************************** +Moses - factored phrase-based language decoder +Copyright (C) 2006 University of Edinburgh + +This library is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License as published by the Free Software Foundation; either +version 2.1 of the License, or (at your option) any later version. + +This library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Lesser General Public License for more details. 
+ +You should have received a copy of the GNU Lesser General Public +License along with this library; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ #ifndef moses_StringVector_h #define moses_StringVector_h @@ -43,255 +43,241 @@ namespace Moses template class ValueIteratorRange { - private: - ValueIteratorT m_begin; - ValueIteratorT m_end; - - public: - ValueIteratorRange(ValueIteratorT begin, ValueIteratorT end); - - const ValueIteratorT& begin() const; - const ValueIteratorT& end() const; - const std::string str() const; - operator const std::string() - { - return str(); - } - - size_t size() - { - return std::distance(m_begin, m_end); - } - - template - bool operator==(const StringT& o) const; - bool operator==(const char* c) const; +private: + ValueIteratorT m_begin; + ValueIteratorT m_end; - template - bool operator<(const StringT& o) const; - bool operator<(const char* c) const; +public: + ValueIteratorRange(ValueIteratorT begin, ValueIteratorT end); + + const ValueIteratorT& begin() const; + const ValueIteratorT& end() const; + const std::string str() const; + operator const std::string() { + return str(); + } + + size_t size() { + return std::distance(m_begin, m_end); + } + + template + bool operator==(const StringT& o) const; + bool operator==(const char* c) const; + + template + bool operator<(const StringT& o) const; + bool operator<(const char* c) const; }; // ********** StringVector ********** template class Allocator = std::allocator> + template class Allocator = std::allocator> class StringVector -{ - protected: - bool m_sorted; - bool m_memoryMapped; - - std::vector >* m_charArray; - MonotonicVector m_positions; - - virtual const ValueT* value_ptr(PosT i) const; - - public: - typedef ValueIteratorRange >::const_iterator> range; - - // ********** RangeIterator ********** - - class RangeIterator : public boost::iterator_facade - { - - private: - PosT m_index; - StringVector* m_container; - - public: - RangeIterator(); - RangeIterator(StringVector &sv, PosT index=0); - - PosT get_index(); - - private: - friend class boost::iterator_core_access; - - range dereference() const; - bool equal(RangeIterator const& other) const; - void increment(); - void decrement(); - void advance(PosT n); - - PosT distance_to(RangeIterator const& other) const; - }; - - // ********** StringIterator ********** - - class StringIterator : public boost::iterator_facade - { - - private: - PosT m_index; - StringVector* m_container; - - public: - StringIterator(); - StringIterator(StringVector &sv, PosT index=0); - - PosT get_index(); - - private: - friend class boost::iterator_core_access; - - const std::string dereference() const; - bool equal(StringIterator const& other) const; - void increment(); - void decrement(); - void advance(PosT n); - PosT distance_to(StringIterator const& other) const; - }; +{ +protected: + bool m_sorted; + bool m_memoryMapped; + + std::vector >* m_charArray; + MonotonicVector m_positions; + + virtual const ValueT* value_ptr(PosT i) const; + +public: + typedef ValueIteratorRange >::const_iterator> range; + + // ********** RangeIterator ********** + + class RangeIterator : public boost::iterator_facade + { + + private: + PosT m_index; + StringVector* m_container; + + public: + RangeIterator(); + RangeIterator(StringVector &sv, PosT index=0); + + PosT get_index(); + + private: + friend class boost::iterator_core_access; + + range 
dereference() const; + bool equal(RangeIterator const& other) const; + void increment(); + void decrement(); + void advance(PosT n); + + PosT distance_to(RangeIterator const& other) const; + }; + + // ********** StringIterator ********** + + class StringIterator : public boost::iterator_facade + { + + private: + PosT m_index; + StringVector* m_container; + + public: + StringIterator(); + StringIterator(StringVector &sv, PosT index=0); + + PosT get_index(); + + private: + friend class boost::iterator_core_access; + + const std::string dereference() const; + bool equal(StringIterator const& other) const; + void increment(); + void decrement(); + void advance(PosT n); + PosT distance_to(StringIterator const& other) const; + }; + + typedef RangeIterator iterator; + typedef StringIterator string_iterator; + + StringVector(); + StringVector(Allocator alloc); + + virtual ~StringVector() { + delete m_charArray; + } + + void swap(StringVector &c) { + m_positions.commit(); + m_positions.swap(c.m_positions); + m_charArray->swap(*c.m_charArray); + + bool temp = m_sorted; + m_sorted = c.m_sorted; + c.m_sorted = temp; + } + + bool is_sorted() const; + PosT size() const; + virtual PosT size2() const; + + template Iterator begin() const; + template Iterator end() const; + + iterator begin() const; + iterator end() const; + + PosT length(PosT i) const; + typename std::vector >::const_iterator begin(PosT i) const; + typename std::vector >::const_iterator end(PosT i) const; + + void clear() { + m_charArray->clear(); + m_sorted = true; + m_positions = MonotonicVector(); + } + + range at(PosT i) const; + range operator[](PosT i) const; + range back() const; + + template + void push_back(StringT s); + void push_back(const char* c); + + template + PosT find(StringT &s) const; + PosT find(const char* c) const; + + virtual size_t load(std::FILE* in, bool memoryMapped = false) { + size_t size = 0; + m_memoryMapped = memoryMapped; + + size += std::fread(&m_sorted, sizeof(bool), 1, in) * sizeof(bool); + size += m_positions.load(in, m_memoryMapped); + + size += loadCharArray(*m_charArray, in, m_memoryMapped); + return size; + } + + size_t loadCharArray(std::vector >& c, + std::FILE* in, bool map = false) { + // Can only be read into memory. Mapping not possible with std:allocator. + assert(map == false); + + size_t byteSize = 0; + + size_t valSize; + byteSize += std::fread(&valSize, sizeof(size_t), 1, in) * sizeof(size_t); + + c.resize(valSize, 0); + byteSize += std::fread(&c[0], sizeof(ValueT), valSize, in) * sizeof(ValueT); + + return byteSize; + } + + size_t loadCharArray(std::vector >& c, + std::FILE* in, bool map = false) { + size_t byteSize = 0; + + size_t valSize; + byteSize += std::fread(&valSize, sizeof(size_t), 1, in) * sizeof(size_t); + + if(map == false) { + // Read data into temporary file (default constructor of MmapAllocator) + // and map memory onto temporary file. Can be resized. 
- typedef RangeIterator iterator; - typedef StringIterator string_iterator; - - StringVector(); - StringVector(Allocator alloc); - - virtual ~StringVector() - { - delete m_charArray; - } - - void swap(StringVector &c) - { - m_positions.commit(); - m_positions.swap(c.m_positions); - m_charArray->swap(*c.m_charArray); - - bool temp = m_sorted; - m_sorted = c.m_sorted; - c.m_sorted = temp; - } - - bool is_sorted() const; - PosT size() const; - virtual PosT size2() const; - - template Iterator begin() const; - template Iterator end() const; - - iterator begin() const; - iterator end() const; - - PosT length(PosT i) const; - typename std::vector >::const_iterator begin(PosT i) const; - typename std::vector >::const_iterator end(PosT i) const; - - void clear() - { - m_charArray->clear(); - m_sorted = true; - m_positions = MonotonicVector(); - } - - range at(PosT i) const; - range operator[](PosT i) const; - range back() const; - - template - void push_back(StringT s); - void push_back(const char* c); - - template - PosT find(StringT &s) const; - PosT find(const char* c) const; - - virtual size_t load(std::FILE* in, bool memoryMapped = false) - { - size_t size = 0; - m_memoryMapped = memoryMapped; - - size += std::fread(&m_sorted, sizeof(bool), 1, in) * sizeof(bool); - size += m_positions.load(in, m_memoryMapped); - - size += loadCharArray(*m_charArray, in, m_memoryMapped); - return size; - } - - size_t loadCharArray(std::vector >& c, - std::FILE* in, bool map = false) - { - // Can only be read into memory. Mapping not possible with std:allocator. - assert(map == false); - - size_t byteSize = 0; - - size_t valSize; - byteSize += std::fread(&valSize, sizeof(size_t), 1, in) * sizeof(size_t); - c.resize(valSize, 0); byteSize += std::fread(&c[0], sizeof(ValueT), valSize, in) * sizeof(ValueT); - - return byteSize; - } - - size_t loadCharArray(std::vector >& c, - std::FILE* in, bool map = false) - { - size_t byteSize = 0; + } else { + // Map it directly on specified region of file "in" starting at valPos + // with length valSize * sizeof(ValueT). Mapped region cannot be resized. - size_t valSize; - byteSize += std::fread(&valSize, sizeof(size_t), 1, in) * sizeof(size_t); + size_t valPos = std::ftell(in); + Allocator alloc(in, valPos); + std::vector > charArrayTemp(alloc); + charArrayTemp.resize(valSize); + c.swap(charArrayTemp); - if(map == false) - { - // Read data into temporary file (default constructor of MmapAllocator) - // and map memory onto temporary file. Can be resized. - - c.resize(valSize, 0); - byteSize += std::fread(&c[0], sizeof(ValueT), valSize, in) * sizeof(ValueT); - } - else - { - // Map it directly on specified region of file "in" starting at valPos - // with length valSize * sizeof(ValueT). Mapped region cannot be resized. 
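The comments above describe the heart of the two load paths: with std::allocator the character array must be fread into heap memory, while with MmapAllocator the bytes stay in the file and the region starting at the current offset is mapped in place. A rough POSIX sketch of what that mapping amounts to, assuming an open FILE* positioned at the data; MapRegion is an illustrative helper, not Moses code, and the real MmapAllocator additionally handles writable and temporary-file mappings behind the std::allocator interface:

#include <cassert>
#include <cstdio>
#include <sys/mman.h>
#include <unistd.h>

const char* MapRegion(std::FILE* in, size_t bytes)
{
  long offset = std::ftell(in);            // valPos in the code above
  long pageSize = sysconf(_SC_PAGESIZE);
  long alignedOffset = (offset / pageSize) * pageSize;
  size_t slack = offset - alignedOffset;   // mmap offsets must be page-aligned
  void* p = mmap(0, bytes + slack, PROT_READ, MAP_SHARED,
                 fileno(in), alignedOffset);
  assert(p != MAP_FAILED);
  return static_cast<const char*>(p) + slack;
}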
- - size_t valPos = std::ftell(in); - Allocator alloc(in, valPos); - std::vector > charArrayTemp(alloc); - charArrayTemp.resize(valSize); - c.swap(charArrayTemp); - - byteSize += valSize * sizeof(ValueT); - } - - return byteSize; - } - - size_t load(std::string filename, bool memoryMapped = false) - { - std::FILE* pFile = fopen(filename.c_str(), "r"); - size_t byteSize = load(pFile, memoryMapped); - fclose(pFile); - return byteSize; + byteSize += valSize * sizeof(ValueT); } - size_t save(std::FILE* out) - { - size_t byteSize = 0; - byteSize += ThrowingFwrite(&m_sorted, sizeof(bool), 1, out) * sizeof(bool); - - byteSize += m_positions.save(out); - - size_t valSize = size2(); - byteSize += ThrowingFwrite(&valSize, sizeof(size_t), 1, out) * sizeof(size_t); - byteSize += ThrowingFwrite(&(*m_charArray)[0], sizeof(ValueT), valSize, out) * sizeof(ValueT); + return byteSize; + } + + size_t load(std::string filename, bool memoryMapped = false) { + std::FILE* pFile = fopen(filename.c_str(), "r"); + size_t byteSize = load(pFile, memoryMapped); + fclose(pFile); + return byteSize; + } + + size_t save(std::FILE* out) { + size_t byteSize = 0; + byteSize += ThrowingFwrite(&m_sorted, sizeof(bool), 1, out) * sizeof(bool); + + byteSize += m_positions.save(out); + + size_t valSize = size2(); + byteSize += ThrowingFwrite(&valSize, sizeof(size_t), 1, out) * sizeof(size_t); + byteSize += ThrowingFwrite(&(*m_charArray)[0], sizeof(ValueT), valSize, out) * sizeof(ValueT); + + return byteSize; + } + + size_t save(std::string filename) { + std::FILE* pFile = fopen(filename.c_str(), "w"); + size_t byteSize = save(pFile); + fclose(pFile); + return byteSize; + } - return byteSize; - } - - size_t save(std::string filename) - { - std::FILE* pFile = fopen(filename.c_str(), "w"); - size_t byteSize = save(pFile); - fclose(pFile); - return byteSize; - } - }; // ********** Implementation ********** @@ -300,214 +286,214 @@ class StringVector template ValueIteratorRange::ValueIteratorRange(ValueIteratorT begin, - ValueIteratorT end) : m_begin(begin), m_end(end) { } - + ValueIteratorT end) : m_begin(begin), m_end(end) { } + template const ValueIteratorT& ValueIteratorRange::begin() const { - return m_begin; + return m_begin; } template const ValueIteratorT& ValueIteratorRange::end() const { - return m_end; + return m_end; } template const std::string ValueIteratorRange::str() const { - std::string dummy; - for(ValueIteratorT it = m_begin; it != m_end; it++) - dummy.push_back(*it); - return dummy; + std::string dummy; + for(ValueIteratorT it = m_begin; it != m_end; it++) + dummy.push_back(*it); + return dummy; } template template bool ValueIteratorRange::operator==(const StringT& o) const { - if(std::distance(m_begin, m_end) == std::distance(o.begin(), o.end())) - return std::equal(m_begin, m_end, o.begin()); - else - return false; + if(std::distance(m_begin, m_end) == std::distance(o.begin(), o.end())) + return std::equal(m_begin, m_end, o.begin()); + else + return false; } - + template bool ValueIteratorRange::operator==(const char* c) const { - return *this == std::string(c); + return *this == std::string(c); } template template bool ValueIteratorRange::operator<(const StringT &s2) const { - return std::lexicographical_compare(m_begin, m_end, s2.begin(), s2.end(), - std::less::value_type>()); + return std::lexicographical_compare(m_begin, m_end, s2.begin(), s2.end(), + std::less::value_type>()); } template bool ValueIteratorRange::operator<(const char* c) const { - return *this < std::string(c); + return *this < std::string(c); 
} template bool operator<(const StringT &s1, const ValueIteratorRange &s2) { - return std::lexicographical_compare(s1.begin(), s1.end(), s2.begin(), s2.end(), - std::less::value_type>()); + return std::lexicographical_compare(s1.begin(), s1.end(), s2.begin(), s2.end(), + std::less::value_type>()); } template bool operator<(const char* c, const ValueIteratorRange &s2) { - size_t len = std::char_traits::length(c); - return std::lexicographical_compare(c, c + len, s2.begin(), s2.end(), - std::less::value_type>()); + size_t len = std::char_traits::length(c); + return std::lexicographical_compare(c, c + len, s2.begin(), s2.end(), + std::less::value_type>()); } template OStream& operator<<(OStream &os, ValueIteratorRange cr) { ValueIteratorT it = cr.begin(); - while(it != cr.end()) - os << *(it++); - return os; + while(it != cr.end()) + os << *(it++); + return os; } // StringVector template class Allocator> StringVector::StringVector() - : m_sorted(true), m_memoryMapped(false), m_charArray(new std::vector >()) { } + : m_sorted(true), m_memoryMapped(false), m_charArray(new std::vector >()) { } template class Allocator> StringVector::StringVector(Allocator alloc) - : m_sorted(true), m_memoryMapped(false), m_charArray(new std::vector >(alloc)) { } + : m_sorted(true), m_memoryMapped(false), m_charArray(new std::vector >(alloc)) { } -template class Allocator> +template class Allocator> template void StringVector::push_back(StringT s) { - if(is_sorted() && size() && !(back() < s)) - m_sorted = false; + if(is_sorted() && size() && !(back() < s)) + m_sorted = false; - m_positions.push_back(size2()); - std::copy(s.begin(), s.end(), std::back_inserter(*m_charArray)); + m_positions.push_back(size2()); + std::copy(s.begin(), s.end(), std::back_inserter(*m_charArray)); } -template class Allocator> +template class Allocator> void StringVector::push_back(const char* c) { - std::string dummy(c); - push_back(dummy); + std::string dummy(c); + push_back(dummy); } -template class Allocator> +template class Allocator> template Iterator StringVector::begin() const { - return Iterator(const_cast&>(*this), 0); + return Iterator(const_cast&>(*this), 0); } - -template class Allocator> + +template class Allocator> template Iterator StringVector::end() const { - return Iterator(const_cast&>(*this), size()); + return Iterator(const_cast&>(*this), size()); } template class Allocator> typename StringVector::iterator StringVector::begin() const { - return begin(); -}; + return begin(); +}; template class Allocator> typename StringVector::iterator StringVector::end() const { - return end(); -}; + return end(); +}; template class Allocator> bool StringVector::is_sorted() const { - return m_sorted; + return m_sorted; } template class Allocator> PosT StringVector::size() const { - return m_positions.size(); + return m_positions.size(); } template class Allocator> PosT StringVector::size2() const { - return m_charArray->size(); + return m_charArray->size(); } - + template class Allocator> typename StringVector::range StringVector::at(PosT i) const { - return range(begin(i), end(i)); + return range(begin(i), end(i)); } - + template class Allocator> typename StringVector::range StringVector::operator[](PosT i) const { - return at(i); + return at(i); } template class Allocator> typename StringVector::range StringVector::back() const { - return at(size()-1); + return at(size()-1); } template class Allocator> PosT StringVector::length(PosT i) const { - if(i+1 < size()) - return m_positions[i+1] - m_positions[i]; - else - return size2() 
- m_positions[i]; + if(i+1 < size()) + return m_positions[i+1] - m_positions[i]; + else + return size2() - m_positions[i]; } template class Allocator> const ValueT* StringVector::value_ptr(PosT i) const { - return &(*m_charArray)[m_positions[i]]; + return &(*m_charArray)[m_positions[i]]; } template class Allocator> typename std::vector >::const_iterator StringVector::begin(PosT i) const { - return typename std::vector >::const_iterator(value_ptr(i)); -} + return typename std::vector >::const_iterator(value_ptr(i)); +} template class Allocator> typename std::vector >::const_iterator StringVector::end(PosT i) const { - return typename std::vector >::const_iterator(value_ptr(i) + length(i)); -} + return typename std::vector >::const_iterator(value_ptr(i) + length(i)); +} template class Allocator> template PosT StringVector::find(StringT &s) const { - if(m_sorted) - return std::distance(begin(), std::lower_bound(begin(), end(), s)); - return std::distance(begin(), std::find(begin(), end(), s)); + if(m_sorted) + return std::distance(begin(), std::lower_bound(begin(), end(), s)); + return std::distance(begin(), std::find(begin(), end(), s)); } template class Allocator> PosT StringVector::find(const char* c) const { - std::string s(c); - return find(s); + std::string s(c); + return find(s); } // RangeIterator @@ -518,21 +504,21 @@ StringVector::RangeIterator::RangeIterator() : m_index( template class Allocator> StringVector::RangeIterator::RangeIterator(StringVector &sv, PosT index) : m_index(index), m_container(&sv) { } - + template class Allocator> PosT StringVector::RangeIterator::get_index() { return m_index; } - + template class Allocator> typename StringVector::range - StringVector::RangeIterator::dereference() const +StringVector::RangeIterator::dereference() const { return typename StringVector::range( - m_container->begin(m_index), - m_container->end(m_index) - ); + m_container->begin(m_index), + m_container->end(m_index) + ); } template class Allocator> @@ -577,18 +563,18 @@ template class Allocator> StringVector::StringIterator::StringIterator( StringVector &sv, PosT index) : m_index(index), m_container(&sv) { } - + template class Allocator> PosT StringVector::StringIterator::get_index() { return m_index; } - + template class Allocator> const std::string StringVector::StringIterator::dereference() const { return StringVector::range(m_container->begin(m_index), - m_container->end(m_index)).str(); + m_container->end(m_index)).str(); } template class Allocator> @@ -620,7 +606,7 @@ template class Allocator> PosT StringVector::StringIterator::distance_to( StringVector::StringIterator const& other) const { - return other.m_index - m_index; + return other.m_index - m_index; } // ********** Some typedefs ********** diff --git a/moses/TranslationModel/CompactPT/TargetPhraseCollectionCache.h b/moses/TranslationModel/CompactPT/TargetPhraseCollectionCache.h index 7687d1498..3eac0226a 100644 --- a/moses/TranslationModel/CompactPT/TargetPhraseCollectionCache.h +++ b/moses/TranslationModel/CompactPT/TargetPhraseCollectionCache.h @@ -1,23 +1,23 @@ -// $Id$ -// vim:tabstop=2 -/*********************************************************************** -Moses - factored phrase-based language decoder -Copyright (C) 2006 University of Edinburgh - -This library is free software; you can redistribute it and/or -modify it under the terms of the GNU Lesser General Public -License as published by the Free Software Foundation; either -version 2.1 of the License, or (at your option) any later version. 
- -This library is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -Lesser General Public License for more details. - -You should have received a copy of the GNU Lesser General Public -License along with this library; if not, write to the Free Software -Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -***********************************************************************/ +// $Id$ +// vim:tabstop=2 +/*********************************************************************** +Moses - factored phrase-based language decoder +Copyright (C) 2006 University of Edinburgh + +This library is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License as published by the Free Software Foundation; either +version 2.1 of the License, or (at your option) any later version. + +This library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Lesser General Public License for more details. + +You should have received a copy of the GNU Lesser General Public +License along with this library; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ #ifndef moses_TargetPhraseCollectionCache_h #define moses_TargetPhraseCollectionCache_h @@ -46,135 +46,119 @@ typedef boost::shared_ptr TargetPhraseVectorPtr; class TargetPhraseCollectionCache { - private: - size_t m_max; - float m_tolerance; - - struct LastUsed { - clock_t m_clock; - TargetPhraseVectorPtr m_tpv; - size_t m_bitsLeft; - - LastUsed() : m_clock(0), m_bitsLeft(0) {} - - LastUsed(clock_t clock, TargetPhraseVectorPtr tpv, size_t bitsLeft = 0) +private: + size_t m_max; + float m_tolerance; + + struct LastUsed { + clock_t m_clock; + TargetPhraseVectorPtr m_tpv; + size_t m_bitsLeft; + + LastUsed() : m_clock(0), m_bitsLeft(0) {} + + LastUsed(clock_t clock, TargetPhraseVectorPtr tpv, size_t bitsLeft = 0) : m_clock(clock), m_tpv(tpv), m_bitsLeft(bitsLeft) {} - }; - - typedef std::map CacheMap; - - CacheMap m_phraseCache; - + }; + + typedef std::map CacheMap; + + CacheMap m_phraseCache; + #ifdef WITH_THREADS - boost::mutex m_mutex; + boost::mutex m_mutex; #endif - public: - - typedef CacheMap::iterator iterator; - typedef CacheMap::const_iterator const_iterator; - - TargetPhraseCollectionCache(size_t max = 5000, float tolerance = 0.2) +public: + + typedef CacheMap::iterator iterator; + typedef CacheMap::const_iterator const_iterator; + + TargetPhraseCollectionCache(size_t max = 5000, float tolerance = 0.2) : m_max(max), m_tolerance(tolerance) - {} - - iterator Begin() - { - return m_phraseCache.begin(); - } - - const_iterator Begin() const - { - return m_phraseCache.begin(); - } - - iterator End() - { - return m_phraseCache.end(); - } - - const_iterator End() const - { - return m_phraseCache.end(); - } - - void Cache(const Phrase &sourcePhrase, TargetPhraseVectorPtr tpv, - size_t bitsLeft = 0, size_t maxRank = 0) - { + {} + + iterator Begin() { + return m_phraseCache.begin(); + } + + const_iterator Begin() const { + return m_phraseCache.begin(); + } + + iterator End() { + return m_phraseCache.end(); + } + + const_iterator End() const { + return m_phraseCache.end(); + } + + void Cache(const Phrase 
&sourcePhrase, TargetPhraseVectorPtr tpv, + size_t bitsLeft = 0, size_t maxRank = 0) { #ifdef WITH_THREADS - boost::mutex::scoped_lock lock(m_mutex); + boost::mutex::scoped_lock lock(m_mutex); #endif - iterator it = m_phraseCache.find(sourcePhrase); - if(it != m_phraseCache.end()) - it->second.m_clock = clock(); - else - { - if(maxRank && tpv->size() > maxRank) - { - TargetPhraseVectorPtr tpv_temp(new TargetPhraseVector()); - tpv_temp->resize(maxRank); - std::copy(tpv->begin(), tpv->begin() + maxRank, tpv_temp->begin()); - m_phraseCache[sourcePhrase] = LastUsed(clock(), tpv_temp, bitsLeft); - } - else - m_phraseCache[sourcePhrase] = LastUsed(clock(), tpv, bitsLeft); - } + iterator it = m_phraseCache.find(sourcePhrase); + if(it != m_phraseCache.end()) + it->second.m_clock = clock(); + else { + if(maxRank && tpv->size() > maxRank) { + TargetPhraseVectorPtr tpv_temp(new TargetPhraseVector()); + tpv_temp->resize(maxRank); + std::copy(tpv->begin(), tpv->begin() + maxRank, tpv_temp->begin()); + m_phraseCache[sourcePhrase] = LastUsed(clock(), tpv_temp, bitsLeft); + } else + m_phraseCache[sourcePhrase] = LastUsed(clock(), tpv, bitsLeft); } + } - std::pair Retrieve(const Phrase &sourcePhrase) - { + std::pair Retrieve(const Phrase &sourcePhrase) { #ifdef WITH_THREADS - boost::mutex::scoped_lock lock(m_mutex); + boost::mutex::scoped_lock lock(m_mutex); #endif - iterator it = m_phraseCache.find(sourcePhrase); - if(it != m_phraseCache.end()) - { + iterator it = m_phraseCache.find(sourcePhrase); + if(it != m_phraseCache.end()) { + LastUsed &lu = it->second; + lu.m_clock = clock(); + return std::make_pair(lu.m_tpv, lu.m_bitsLeft); + } else + return std::make_pair(TargetPhraseVectorPtr(), 0); + } + + void Prune() { +#ifdef WITH_THREADS + boost::mutex::scoped_lock lock(m_mutex); +#endif + + if(m_phraseCache.size() > m_max * (1 + m_tolerance)) { + typedef std::set > Cands; + Cands cands; + for(CacheMap::iterator it = m_phraseCache.begin(); + it != m_phraseCache.end(); it++) { LastUsed &lu = it->second; - lu.m_clock = clock(); - return std::make_pair(lu.m_tpv, lu.m_bitsLeft); + cands.insert(std::make_pair(lu.m_clock, it->first)); } - else - return std::make_pair(TargetPhraseVectorPtr(), 0); - } - void Prune() - { -#ifdef WITH_THREADS - boost::mutex::scoped_lock lock(m_mutex); -#endif + for(Cands::iterator it = cands.begin(); it != cands.end(); it++) { + const Phrase& p = it->second; + m_phraseCache.erase(p); - if(m_phraseCache.size() > m_max * (1 + m_tolerance)) - { - typedef std::set > Cands; - Cands cands; - for(CacheMap::iterator it = m_phraseCache.begin(); - it != m_phraseCache.end(); it++) - { - LastUsed &lu = it->second; - cands.insert(std::make_pair(lu.m_clock, it->first)); - } - - for(Cands::iterator it = cands.begin(); it != cands.end(); it++) - { - const Phrase& p = it->second; - m_phraseCache.erase(p); - - if(m_phraseCache.size() < (m_max * (1 - m_tolerance))) - break; - } + if(m_phraseCache.size() < (m_max * (1 - m_tolerance))) + break; } } + } - void CleanUp() - { + void CleanUp() { #ifdef WITH_THREADS - boost::mutex::scoped_lock lock(m_mutex); + boost::mutex::scoped_lock lock(m_mutex); #endif - m_phraseCache.clear(); - } - + m_phraseCache.clear(); + } + }; } diff --git a/moses/TranslationModel/CompactPT/ThrowingFwrite.cpp b/moses/TranslationModel/CompactPT/ThrowingFwrite.cpp index 35e8e3122..b231836f5 100644 --- a/moses/TranslationModel/CompactPT/ThrowingFwrite.cpp +++ b/moses/TranslationModel/CompactPT/ThrowingFwrite.cpp @@ -1,27 +1,28 @@ -// $Id$ -// vim:tabstop=2 
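A note on the policy reformatted above: TargetPhraseCollectionCache is a clock-stamped cache with watermark pruning. Cache() refreshes the stamp on a hit, and Prune() does nothing until the map overshoots m_max * (1 + m_tolerance), then evicts oldest-first until the size falls below m_max * (1 - m_tolerance). Below is a minimal sketch of just that policy; std::string keys and int payloads are stand-ins for Moses::Phrase and TargetPhraseVectorPtr, and the maxRank trimming and WITH_THREADS mutex are deliberately left out.

#include <ctime>
#include <map>
#include <set>
#include <string>
#include <utility>

// Minimal sketch of the watermark-based pruning used above; names are
// placeholders, not the Moses class itself.
class WatermarkCache {
public:
  WatermarkCache(size_t max = 5000, float tolerance = 0.2f)
    : m_max(max), m_tolerance(tolerance) {}

  void Cache(const std::string &key, int payload) {
    std::map<std::string, Entry>::iterator it = m_cache.find(key);
    if (it != m_cache.end())
      it->second.stamp = std::clock();       // refresh the stamp on a hit
    else {
      Entry e;
      e.stamp = std::clock();
      e.payload = payload;
      m_cache[key] = e;
    }
  }

  void Prune() {
    // high watermark: do nothing until the cache has overshot by m_tolerance
    if (m_cache.size() <= size_t(m_max * (1 + m_tolerance))) return;
    // order eviction candidates oldest-first by their clock stamp
    std::set<std::pair<std::clock_t, std::string> > cands;
    for (std::map<std::string, Entry>::iterator it = m_cache.begin();
         it != m_cache.end(); ++it)
      cands.insert(std::make_pair(it->second.stamp, it->first));
    // evict until we undershoot the low watermark
    for (std::set<std::pair<std::clock_t, std::string> >::iterator it = cands.begin();
         it != cands.end(); ++it) {
      m_cache.erase(it->second);
      if (m_cache.size() < size_t(m_max * (1 - m_tolerance))) break;
    }
  }

private:
  struct Entry {
    std::clock_t stamp;
    int payload;
  };
  size_t m_max;
  float m_tolerance;
  std::map<std::string, Entry> m_cache;
};

The two watermarks give Prune() hysteresis: eviction happens in batches rather than on every insertion once the cache sits at its limit.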
-/*********************************************************************** -Moses - factored phrase-based language decoder -Copyright (C) 2006 University of Edinburgh - -This library is free software; you can redistribute it and/or -modify it under the terms of the GNU Lesser General Public -License as published by the Free Software Foundation; either -version 2.1 of the License, or (at your option) any later version. - -This library is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -Lesser General Public License for more details. - -You should have received a copy of the GNU Lesser General Public -License along with this library; if not, write to the Free Software -Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -***********************************************************************/ +// $Id$ +// vim:tabstop=2 +/*********************************************************************** +Moses - factored phrase-based language decoder +Copyright (C) 2006 University of Edinburgh -#include "ThrowingFwrite.h" +This library is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License as published by the Free Software Foundation; either +version 2.1 of the License, or (at your option) any later version. -size_t ThrowingFwrite(const void *ptr, size_t size, size_t count, FILE* stream) { +This library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Lesser General Public License for more details. + +You should have received a copy of the GNU Lesser General Public +License along with this library; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#include "ThrowingFwrite.h" + +size_t ThrowingFwrite(const void *ptr, size_t size, size_t count, FILE* stream) +{ assert(size); size_t returnValue = std::fwrite(ptr, size, count, stream); UTIL_THROW_IF(count != returnValue, util::ErrnoException, "Short fwrite; requested size " << size); diff --git a/moses/TranslationModel/CompactPT/ThrowingFwrite.h b/moses/TranslationModel/CompactPT/ThrowingFwrite.h index 4f45ae8f5..466d3973b 100644 --- a/moses/TranslationModel/CompactPT/ThrowingFwrite.h +++ b/moses/TranslationModel/CompactPT/ThrowingFwrite.h @@ -1,30 +1,30 @@ -// $Id$ -// vim:tabstop=2 -/*********************************************************************** -Moses - factored phrase-based language decoder -Copyright (C) 2006 University of Edinburgh - -This library is free software; you can redistribute it and/or -modify it under the terms of the GNU Lesser General Public -License as published by the Free Software Foundation; either -version 2.1 of the License, or (at your option) any later version. - -This library is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -Lesser General Public License for more details. 
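ThrowingFwrite() above is a checked std::fwrite: when fewer items are written than requested it raises util::ErrnoException via UTIL_THROW_IF (from the bundled KenLM util library) instead of returning a short count for the caller to forget to test. A dependency-free sketch of the same idea, with std::runtime_error standing in for the util exception machinery (CheckedFwrite is an invented name for the sketch):

#include <cerrno>
#include <cstdio>
#include <cstring>
#include <stdexcept>
#include <string>

// Sketch of a throwing fwrite wrapper; the real code uses UTIL_THROW_IF and
// util::ErrnoException, this stand-in only needs the standard library.
inline size_t CheckedFwrite(const void *ptr, size_t size, size_t count, FILE *stream) {
  size_t written = std::fwrite(ptr, size, count, stream);
  if (written != count)
    throw std::runtime_error(std::string("short fwrite: ") + std::strerror(errno));
  return written;
}

Callers can then write unconditionally and let a single try/catch at the top level report truncated output files.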
- -You should have received a copy of the GNU Lesser General Public -License along with this library; if not, write to the Free Software -Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -***********************************************************************/ +// $Id$ +// vim:tabstop=2 +/*********************************************************************** +Moses - factored phrase-based language decoder +Copyright (C) 2006 University of Edinburgh + +This library is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License as published by the Free Software Foundation; either +version 2.1 of the License, or (at your option) any later version. + +This library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Lesser General Public License for more details. + +You should have received a copy of the GNU Lesser General Public +License along with this library; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ #ifndef moses_ThrowingFwrite_h #define moses_ThrowingFwrite_h #include #include -#include "util/exception.hh" +#include "util/exception.hh" size_t ThrowingFwrite(const void *ptr, size_t size, size_t count, FILE* stream); diff --git a/moses/TranslationModel/DynSAInclude/RandLMCache.h b/moses/TranslationModel/DynSAInclude/RandLMCache.h index b92a2a164..06ce240a1 100644 --- a/moses/TranslationModel/DynSAInclude/RandLMCache.h +++ b/moses/TranslationModel/DynSAInclude/RandLMCache.h @@ -22,177 +22,180 @@ #include #include -namespace randlm { - - //! @todo ask abby2 - template - class CacheNode { - public: - typedef std::map* > childMap; - // initialise value to 'unknown' (i.e. not yet queried or cached). - CacheNode(T unknown_value) : value_(unknown_value) {} - childMap childs_; // child pointers - T value_; // value stored - const void* state_; // state pointer - }; - - template - class Cache { - public: - typedef typename std::map* >::iterator childPtr; - // unknown_value is used to indicate the ngram was not queried (yet) - // null_value_ indicates it was queried but not found in model - // space usage is handled by client. - Cache(T unknown_value, T null_value) : - cur_nodes_(0), unknown_value_(unknown_value), null_value_(null_value) { - root_ = newNode(); - } - ~Cache() { - if(clear()) { - delete root_; - root_ = NULL; - } else { - std::cerr << "Error freeing cache memory.\n"; - } - } - bool setCacheNgram(const wordID_t* ngram, int len, T value, const void* state) { - // inserts full ngram into cache - CacheNode* node = root_; - for (int i = len - 1; i > -1; --i) { - childPtr child = node->childs_.find(ngram[i]); - if( child != node->childs_.end() ) { - // current node is already prefix. Go to child node - node = node->childs_[ngram[i]]; - } else { - // no child for prefix. 
set new child link in current node - CacheNode * newChild = newNode(node); - node->childs_[ngram[i]] = newChild; - // go to new node - node = newChild; - } - } - node->value_ = value; - node->state_ = state; - return true; - } - bool checkCacheNgram(const wordID_t* ngram, int len, T* value, const void** state) { - // finds value for this full ngram only (returns false if full ngram not in cache) - CacheNode * node = root_; - for(int i = len - 1; i > -1; --i) { - // go to deepest level node of ngram in cache - childPtr child = node->childs_.find(ngram[i]); - if( child != node->childs_.end() ) { - // switch to child node - node = node->childs_[ngram[i]]; - } else { - // not cached - return false; - } - } - *value = node->value_; - if(state) *state = node->state_; - return *value != null_value_ && *value != unknown_value_; - } - int getCache2(const wordID_t* ngram, int len, T** values, int* found) { - // set values array to point to cache value nodes - CacheNode * node = root_; - *found = 0; - //values[0] = &node->value_; // pointer to root node's value - bool all_found = true; - for(int i = len - 1; i > -1; --i) { - // go to deepest level node of ngram in cache - childPtr child = node->childs_.find(ngram[i]); - if( child != node->childs_.end() ) { - // switch to child node - node = node->childs_[ngram[i]]; - // get pointer to value (index by length - 1) - values[i] = &node->value_; - // if null_value then assume all extensions impossible - if (node->value_ == null_value_) { - return len - 1 - i; // max length posible - } - all_found = all_found && (node->value_ != unknown_value_); - if (all_found) - ++(*found); - } else { - // initialise uncached values - CacheNode * newChild = newNode(node); - node->childs_[ngram[i]] = newChild; - // go to new node - node = newChild; - values[i] = &node->value_; - } - } - return len; // all possible - } - int getCache(const wordID_t* ngram, int len, T** values, int* found) { - // get pointers to values for ngram and constituents. - // returns upper bound on longest subngram in model. - // 'found' stores longest non-null and known value found. 
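Note how every traversal in this Cache runs for(int i = len - 1; i > -1; --i): the trie is keyed on the reversed n-gram, so the first edge out of the root is the last word, and all n-grams that end in the same words share a path. That is what lets getCache() collect values for every suffix in a single descent. A stripped-down sketch of the structure, assuming uint32_t word ids, double values, and a -1.0 "unknown" sentinel in place of the template parameters:

#include <cstdint>
#include <map>

typedef uint32_t WordId;

// One node per reversed n-gram prefix; -1.0 plays the role of unknown_value_.
struct Node {
  std::map<WordId, Node*> kids;
  double value;
  Node() : value(-1.0) {}
};

// Mirrors setCacheNgram(): walk right to left, creating nodes as needed.
void setNgram(Node *root, const WordId *ngram, int len, double v) {
  Node *n = root;
  for (int i = len - 1; i >= 0; --i) {
    Node *&child = n->kids[ngram[i]];   // operator[] inserts a null link if absent
    if (child == NULL) child = new Node();
    n = child;
  }
  n->value = v;
}

// Mirrors checkCacheNgram(): true only if the full n-gram was cached.
bool checkNgram(const Node *root, const WordId *ngram, int len, double *out) {
  const Node *n = root;
  for (int i = len - 1; i >= 0; --i) {
    std::map<WordId, Node*>::const_iterator it = n->kids.find(ngram[i]);
    if (it == n->kids.end()) return false;   // some suffix was never cached
    n = it->second;
  }
  *out = n->value;
  return n->value != -1.0;
}

Nodes are deliberately leaked here to keep the sketch short; the real class counts them in cur_nodes_ and frees them recursively through clearNodes().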
- CacheNode * node = root_; - *found = 0; - values[0] = &node->value_; // pointer to root node's value - bool all_found = true; - for(int i = len - 1; i > -1; --i) { - // go to deepest level node of ngram in cache - childPtr child = node->childs_.find(ngram[i]); - if( child != node->childs_.end() ) { - // switch to child node - node = node->childs_[ngram[i]]; - // get pointer to value (index by length - 1) - values[len - i] = &node->value_; - // if null_value then assume all extensions impossible - if (node->value_ == null_value_) - return len - 1 - i; // max length posible - all_found = all_found && (node->value_ != unknown_value_); - if (all_found) - ++(*found); - } else { - // initialise uncached values - CacheNode * newChild = newNode(node); - node->childs_[ngram[i]] = newChild; - // go to new node - node = newChild; - values[len - i] = &node->value_; - } - } - return len; // all possible - } - bool clear() { - std::cerr << "Clearing cache with " << static_cast(cur_nodes_ * nodeSize()) - / static_cast(1ull << 20) << "MB" << std::endl; - return clearNodes(root_); - } - int nodes() { - // returns number of nodes - return cur_nodes_; - } - int nodeSize() { - return sizeof(CacheNode) + sizeof(root_->childs_); - } - private: - CacheNode * root_; - count_t cur_nodes_; - T unknown_value_; // Used to initialise data at each node - T null_value_; // Indicates cached something not in model - CacheNode* newNode(CacheNode * node = 0) { - ++cur_nodes_; - return new CacheNode(unknown_value_); - } - bool clearNodes(CacheNode * node) { - //delete children from this node - if(!node->childs_.empty()) { - iterate(node->childs_, itr) { - if(!clearNodes(itr->second)) - std::cerr << "Error emptying cache\n"; - delete itr->second; - --cur_nodes_; - } - node->childs_.clear(); - } - return true; - } +namespace randlm +{ - }; +//! @todo ask abby2 +template +class CacheNode +{ +public: + typedef std::map* > childMap; + // initialise value to 'unknown' (i.e. not yet queried or cached). + CacheNode(T unknown_value) : value_(unknown_value) {} + childMap childs_; // child pointers + T value_; // value stored + const void* state_; // state pointer +}; + +template +class Cache +{ +public: + typedef typename std::map* >::iterator childPtr; + // unknown_value is used to indicate the ngram was not queried (yet) + // null_value_ indicates it was queried but not found in model + // space usage is handled by client. + Cache(T unknown_value, T null_value) : + cur_nodes_(0), unknown_value_(unknown_value), null_value_(null_value) { + root_ = newNode(); + } + ~Cache() { + if(clear()) { + delete root_; + root_ = NULL; + } else { + std::cerr << "Error freeing cache memory.\n"; + } + } + bool setCacheNgram(const wordID_t* ngram, int len, T value, const void* state) { + // inserts full ngram into cache + CacheNode* node = root_; + for (int i = len - 1; i > -1; --i) { + childPtr child = node->childs_.find(ngram[i]); + if( child != node->childs_.end() ) { + // current node is already prefix. Go to child node + node = node->childs_[ngram[i]]; + } else { + // no child for prefix. 
set new child link in current node
+        CacheNode * newChild = newNode(node);
+        node->childs_[ngram[i]] = newChild;
+        // go to new node
+        node = newChild;
+      }
+    }
+    node->value_ = value;
+    node->state_ = state;
+    return true;
+  }
+  bool checkCacheNgram(const wordID_t* ngram, int len, T* value, const void** state) {
+    // finds value for this full ngram only (returns false if full ngram not in cache)
+    CacheNode * node = root_;
+    for(int i = len - 1; i > -1; --i) {
+      // go to deepest level node of ngram in cache
+      childPtr child = node->childs_.find(ngram[i]);
+      if( child != node->childs_.end() ) {
+        // switch to child node
+        node = node->childs_[ngram[i]];
+      } else {
+        // not cached
+        return false;
+      }
+    }
+    *value = node->value_;
+    if(state) *state = node->state_;
+    return *value != null_value_ && *value != unknown_value_;
+  }
+  int getCache2(const wordID_t* ngram, int len, T** values, int* found) {
+    // set values array to point to cache value nodes
+    CacheNode * node = root_;
+    *found = 0;
+    //values[0] = &node->value_; // pointer to root node's value
+    bool all_found = true;
+    for(int i = len - 1; i > -1; --i) {
+      // go to deepest level node of ngram in cache
+      childPtr child = node->childs_.find(ngram[i]);
+      if( child != node->childs_.end() ) {
+        // switch to child node
+        node = node->childs_[ngram[i]];
+        // get pointer to value (index by length - 1)
+        values[i] = &node->value_;
+        // if null_value then assume all extensions impossible
+        if (node->value_ == null_value_) {
+          return len - 1 - i; // max length possible
+        }
+        all_found = all_found && (node->value_ != unknown_value_);
+        if (all_found)
+          ++(*found);
+      } else {
+        // initialise uncached values
+        CacheNode * newChild = newNode(node);
+        node->childs_[ngram[i]] = newChild;
+        // go to new node
+        node = newChild;
+        values[i] = &node->value_;
+      }
+    }
+    return len; // all possible
+  }
+  int getCache(const wordID_t* ngram, int len, T** values, int* found) {
+    // get pointers to values for ngram and constituents.
+    // returns upper bound on longest subngram in model.
+    // 'found' stores longest non-null and known value found.
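The indexing convention of the two lookup variants is easy to misread: getCache2() stores the value pointer for the suffix starting at position i in values[i], while getCache() stores the pointer for the k-word suffix in values[k], reserving values[0] for the root (so its values array needs len + 1 slots). A throwaway snippet that just prints getCache()'s mapping for a 3-gram, under that reading of the loop:

#include <cstdio>

int main() {
  const char *w[] = {"the", "cat", "sat"};
  const int len = 3;
  // After the loop consumes w[i], the current trie node stands for the
  // suffix w[i..len-1], and its value pointer goes to values[len - i].
  std::printf("values[0] -> <root>\n");
  for (int i = len - 1; i >= 0; --i) {
    std::printf("values[%d] ->", len - i);
    for (int j = i; j < len; ++j) std::printf(" %s", w[j]);
    std::printf("\n");
  }
  return 0;
}

So values[1] is "sat", values[2] is "cat sat", and values[3] is "the cat sat"; the early return of len - 1 - i likewise reports the longest suffix length that could still be in the model once a null_value_ node is hit.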
+    CacheNode * node = root_;
+    *found = 0;
+    values[0] = &node->value_; // pointer to root node's value
+    bool all_found = true;
+    for(int i = len - 1; i > -1; --i) {
+      // go to deepest level node of ngram in cache
+      childPtr child = node->childs_.find(ngram[i]);
+      if( child != node->childs_.end() ) {
+        // switch to child node
+        node = node->childs_[ngram[i]];
+        // get pointer to value (index by length - 1)
+        values[len - i] = &node->value_;
+        // if null_value then assume all extensions impossible
+        if (node->value_ == null_value_)
+          return len - 1 - i; // max length possible
+        all_found = all_found && (node->value_ != unknown_value_);
+        if (all_found)
+          ++(*found);
+      } else {
+        // initialise uncached values
+        CacheNode * newChild = newNode(node);
+        node->childs_[ngram[i]] = newChild;
+        // go to new node
+        node = newChild;
+        values[len - i] = &node->value_;
+      }
+    }
+    return len; // all possible
+  }
+  bool clear() {
+    std::cerr << "Clearing cache with " << static_cast(cur_nodes_ * nodeSize())
+              / static_cast(1ull << 20) << "MB" << std::endl;
+    return clearNodes(root_);
+  }
+  int nodes() {
+    // returns number of nodes
+    return cur_nodes_;
+  }
+  int nodeSize() {
+    return sizeof(CacheNode) + sizeof(root_->childs_);
+  }
+private:
+  CacheNode * root_;
+  count_t cur_nodes_;
+  T unknown_value_; // Used to initialise data at each node
+  T null_value_; // Indicates cached something not in model
+  CacheNode* newNode(CacheNode * node = 0) {
+    ++cur_nodes_;
+    return new CacheNode(unknown_value_);
+  }
+  bool clearNodes(CacheNode * node) {
+    //delete children from this node
+    if(!node->childs_.empty()) {
+      iterate(node->childs_, itr) {
+        if(!clearNodes(itr->second))
+          std::cerr << "Error emptying cache\n";
+        delete itr->second;
+        --cur_nodes_;
+      }
+      node->childs_.clear();
+    }
+    return true;
+  }
+
+};
 } //end namespace
 #endif //INC_RANDLM_CACHE_H
diff --git a/moses/TranslationModel/DynSAInclude/RandLMFilter.h b/moses/TranslationModel/DynSAInclude/RandLMFilter.h
index 298464693..0923f52af 100644
--- a/moses/TranslationModel/DynSAInclude/RandLMFilter.h
+++ b/moses/TranslationModel/DynSAInclude/RandLMFilter.h
@@ -24,296 +24,307 @@
 #define log2(X) (log((double)X)/log((double)2))
 #endif
 
-namespace randlm {
-
-  /* Class Filter wraps a contiguous array of data. Filter and its subclasses
-   * implement read/write/increment functionality on arrays with arbitrary sized addresses
-   * (i.e. an address may not use a full number of bytes). When converting to byte-based
-   * representation we assume "unused" bits are to left.
-   * E.g. if the underlying data is stored in units T = uint16 and the 'width' = 11
-   * to read 'address' = 3 we extract bits at indices [33,42] (i.e. [11*3, 11*4 - 1])
-   * and store in a uint16 in positions 0000 0001 111111 where the first 7 bits have
-   * been masked out.
-   */
-  template
-  class Filter {
-  public:
-    Filter(uint64_t addresses, int width) : addresses_(addresses), width_(width), data_(NULL) {
-      // number of bits in T
-      cell_width_ = sizeof(T) << 3;
-      // current implementation has following constraints
-      CHECK(cell_width_ > 0 && cell_width_ <= 64 && cell_width_ >= width);
-      // used for >> division
-      log_cell_width_ = static_cast(floor(log((double)cell_width_)/log((double)2) + 0.000001));
-      // size of underlying data in Ts
-      cells_ = ((addresses * width) + cell_width_ - 1) >> log_cell_width_;
-      // instantiate underlying data
-      data_ = new T[cells_];
-      CHECK(data_ != NULL);
-      CHECK(reset());
-      // 'first_bit' marks the first bit used by 'address' (left padded with zeros).
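To make the packing convention in this class concrete: with T = uint16 and width = 11, the constructor computes first_bit_ = 16 - (11 % 16) = 5 and address_mask_ = 0x07FF, and address 3 spans global bits 33..43, so read() takes its offset < 0 branch and returns (data_[2] >> 4) & 0x07FF. The following self-contained sketch fixes that configuration and mirrors the three branches of read() and write(); Read11 and Write11 are invented names for the sketch, not part of randlm.

#include <cassert>
#include <cstdint>
#include <cstdio>

// 11-bit values packed MSB-first into 16-bit cells, as in randlm::Filter.
static const int kCellWidth = 16, kWidth = 11;
static const int kFirstBit = kCellWidth - (kWidth % kCellWidth);   // 5
static const uint16_t kMask = uint16_t(0xFFFFu >> kFirstBit);      // 0x07FF

uint16_t Read11(const uint16_t *data, uint64_t address) {
  uint64_t bit = address * kWidth;                 // global index of the first bit
  uint64_t cell = bit >> 4;                        // bit / 16
  int offset = int(bit % kCellWidth) - kFirstBit;
  if (offset == 0) return data[cell] & kMask;      // already right-aligned
  if (offset < 0)  return uint16_t(data[cell] >> -offset) & kMask;  // sits further left
  // value straddles two cells: tail of this cell plus head of the next
  return uint16_t((data[cell] << offset) | (data[cell + 1] >> (kCellWidth - offset))) & kMask;
}

void Write11(uint16_t *data, uint64_t address, uint16_t value) {
  assert((value & ~kMask) == 0);
  uint64_t bit = address * kWidth;
  uint64_t cell = bit >> 4;
  int offset = int(bit % kCellWidth) - kFirstBit;
  if (offset == 0) {
    data[cell] = value | (data[cell] & ~kMask);
  } else if (offset < 0) {
    data[cell] = uint16_t(value << -offset) | (data[cell] & ~uint16_t(kMask << -offset));
  } else {
    data[cell]     = uint16_t(value >> offset) | (data[cell] & ~uint16_t(kMask >> offset));
    data[cell + 1] = uint16_t(value << (kCellWidth - offset))
                   | (data[cell + 1] & uint16_t(0xFFFFu >> offset));
  }
}

int main() {
  uint16_t data[8] = {0};                          // 128 bits -> 11 full addresses
  for (uint64_t a = 0; a < 11; ++a) Write11(data, a, uint16_t((37 * a + 5) & kMask));
  for (uint64_t a = 0; a < 11; ++a) assert(Read11(data, a) == uint16_t((37 * a + 5) & kMask));
  std::printf("round trip ok\n");
  return 0;
}

The fingerprinted variants further down differ only in XORing the fingerprint into the same three paths.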
- first_bit_ = (width % cell_width_ == 0) ? 0 : cell_width_ - (width % cell_width_); - // mask for full cell - full_mask_ = static_cast(0xffffffffffffffffull); - // mask for bits that make up the address - address_mask_ = full_mask_ >> first_bit_; - } - Filter(FileHandler* fin, bool loaddata = true) : data_(NULL) { - CHECK(loadHeader(fin)); - if (loaddata) - CHECK(loadData(fin)); - } - virtual ~Filter() { - delete[] data_; - } - bool reset() { - for (uint64_t i = 0; i < cells_; ++i) - data_[i] = 0; - return true; - } - count_t size() { - // return approx size of filter in MBs - return cells_ * sizeof(T) >> 20; - } - // read / write functions - inline bool read(uint64_t address, T* value) { - CHECK(address <= addresses_); - // copy address to 'value' - uint64_t data_bit = address * width_; - uint32_t data_cell = (data_bit >> log_cell_width_); // % cells_; - // 'offset' shows how address in 'data' and 'value' align - int offset = (data_bit % cell_width_) - first_bit_; - // they align so just copy across masking unneeded leading bits - if (offset == 0) { - *value = data_[data_cell] & address_mask_; - return true; - } - // data address starts to left so shift it right - if (offset < 0) { - *value = (data_[data_cell] >> -offset) & address_mask_; - return true; - } - // data address is to right so shift it left and look at one more cell to right - *value = ((data_[data_cell] << offset) - | (data_[data_cell + 1] >> (cell_width_ - offset))) & address_mask_ ; - return true; - } - inline T read(uint64_t address) { - CHECK(address <= addresses_); - // return value at address - T value = 0; - uint64_t data_bit = address * width_; - uint32_t data_cell = (data_bit >> log_cell_width_); // % cells_; - // 'offset' shows how address in 'data' and 'value' align - int offset = (data_bit % cell_width_) - first_bit_; - // they align so just copy across masking unneeded leading bits - if (offset == 0) { - value = data_[data_cell] & address_mask_; - } - // data address starts to left so shift it right - else if (offset < 0) { - value = (data_[data_cell] >> -offset) & address_mask_; - } - // data address is to right so shift it left and look at one more cell to right - else - value = ((data_[data_cell] << offset) - | (data_[data_cell + 1] >> (cell_width_ - offset))) & address_mask_ ; - return value; - } - inline bool write(uint64_t address, T value) { - CHECK(address <= addresses_); - CHECK(log2(value) <= width_); - // write 'value' to address - uint64_t data_bit = address * width_; - uint32_t data_cell = (data_bit >> log_cell_width_); // % cells_; - // 'offset' shows how address in 'data' and 'value' align - int offset = (data_bit % cell_width_) - first_bit_; - // they align so just copy across masking unneeded leading zeros of value - if (offset == 0) { - data_[data_cell] = value | (data_[data_cell] & ~address_mask_); - return true; - } - // address in data is to left so shift value left by -offset - if (offset < 0) { - data_[data_cell] = (value << -offset) - | (data_[data_cell] & ~(address_mask_ << -offset)); - return true; - } - // address in data is to right so shift value right by offset - data_[data_cell] = (value >> offset) | - (data_[data_cell] & ~(address_mask_ >> offset)); - data_[data_cell + 1] = (value << (cell_width_ - offset)) | - (data_[data_cell + 1] & (full_mask_ >> offset)); - return true; - } - inline bool readWithFingerprint(uint64_t address, T finger, T* value) { - // copy 'address' ^ 'finger' to 'value' - uint64_t data_bit = address * width_; - uint32_t data_cell = (data_bit >> 
log_cell_width_); // % cells_; - // 'offset' shows how address in 'data' and 'value' align - int offset = (data_bit % cell_width_) - first_bit_; - // they align so just copy across masking unneeded leading bits - if (offset == 0) { - *value = (finger ^ data_[data_cell]) & address_mask_; - return true; - } - // data address starts to left so shift it right - if (offset < 0) { - *value = ((data_[data_cell] >> -offset) ^ finger) & address_mask_; - return true; - } - // data address is to right so shift it left and look at one more cell to right - *value = (((data_[data_cell] << offset) - | (data_[data_cell + 1] >> (cell_width_ - offset))) ^ finger) - & address_mask_ ; - return true; - } - inline bool writeWithFingerprint(uint64_t address, T finger, T value) { - // write 'value' ^ 'finger' to address - finger &= address_mask_; // make sure fingerprint is correct size - uint64_t data_bit = address * width_; - uint32_t data_cell = (data_bit >> log_cell_width_); // % cells_; - // 'offset' shows how address in 'data' and 'value' align - int offset = (data_bit % cell_width_) - first_bit_; - // they align so just copy across masking unneeded leading zeros of value - if (offset == 0) { - data_[data_cell] = (finger ^ value) | (data_[data_cell] & ~address_mask_); - return true; - } - // address in data is to left so shift value left by -offset - if (offset < 0) { - data_[data_cell] = ((finger ^ value) << -offset) - | (data_[data_cell] & ~(address_mask_ << -offset)); - return true; - } - // address in data is to right so shift value right by offset - data_[data_cell] = ((finger ^ value) >> offset) | - (data_[data_cell] & ~(address_mask_ >> offset)); - data_[data_cell + 1] = ((finger ^ value) << (cell_width_ - offset)) | - (data_[data_cell + 1] & (full_mask_ >> offset)); - return true; - } - // debugging - void printFilter(const std::string & prefix = "", uint32_t truncate = 64){ - std::cout << prefix; - for (uint32_t i = 0; i < cells_ && i < truncate; ++i) { - for (int j = cell_width_ - 1; j >= 0; --j) - if (data_[i] & (1ull << j)) - std::cout << 1; - else - std::cout << 0; - std::cout << "\n"; - } - std::cout << std::endl; - } - // i/o - uint64_t getAddresses() { return addresses_; } - int getWidth() { return width_; } - int getCellWidth() { return cell_width_; } - uint32_t getCells() { return cells_; } - virtual bool save(FileHandler* out) { - CHECK(out != NULL); - CHECK(out->write((char*)&cells_, sizeof(cells_))); - CHECK(out->write((char*)&cell_width_, sizeof(cell_width_))); - CHECK(out->write((char*)&log_cell_width_, sizeof(log_cell_width_))); - CHECK(out->write((char*)&addresses_, sizeof(addresses_))); - CHECK(out->write((char*)&width_, sizeof(width_))); - CHECK(out->write((char*)&first_bit_, sizeof(first_bit_))); - CHECK(out->write((char*)&full_mask_, sizeof(full_mask_))); - CHECK(out->write((char*)&address_mask_, sizeof(address_mask_))); - //CHECK(out->write((char*)data_, cells_ * sizeof(T))); - const uint64_t jump = 524288032ul; //(uint64_t)pow(2, 29); - if((width_ == 1) || cells_ < jump) - CHECK(out->write((char*)data_, cells_ * sizeof(T))); - else { - uint64_t idx(0); - while(idx + jump < cells_) { - CHECK(out->write((char*)&data_[idx], jump * sizeof(T))); - idx += jump; - } - CHECK(out->write((char*)&data_[idx], (cells_ - idx) * sizeof(T))); - } - return true; - } - protected: - bool loadHeader(FileHandler* fin) { - CHECK(fin != NULL); - CHECK(fin->read((char*)&cells_, sizeof(cells_))); - CHECK(fin->read((char*)&cell_width_, sizeof(cell_width_))); - CHECK(cell_width_ == sizeof(T) << 3); // 
make sure correct underlying data type - CHECK(fin->read((char*)&log_cell_width_, sizeof(log_cell_width_))); - CHECK(fin->read((char*)&addresses_, sizeof(addresses_))); - CHECK(fin->read((char*)&width_, sizeof(width_))); - CHECK(fin->read((char*)&first_bit_, sizeof(first_bit_))); - CHECK(fin->read((char*)&full_mask_, sizeof(full_mask_))); - CHECK(fin->read((char*)&address_mask_, sizeof(address_mask_))); - return true; - } - bool loadData(FileHandler* fin) { - // instantiate underlying array - data_ = new T[cells_]; - CHECK(data_ != NULL); - CHECK(fin->read((char*)data_, cells_ * sizeof(T))); - //CHECK(fin->read((char*)&data_[0], ceil(float(cells_) / 2.0) * sizeof(T))); - //CHECK(fin->read((char*)&data_[cells_ / 2], (cells_ / 2) * sizeof(T))); - return true; - } - uint64_t cells_; // number T making up 'data_' - int cell_width_; // bits per cell (i.e. sizeof(T) << 3) - int log_cell_width_; // log of bits used for >> division - uint64_t addresses_; // number of addresses in the filter - int width_; // width in bits of each address - int first_bit_; // position of first bit in initial byte - T full_mask_; // all 1s - T address_mask_; // 1s in those positions that are part of address - T* data_; // the raw data as bytes - }; +namespace randlm +{ - // Extension with bit test/setter methods added - class BitFilter : public Filter { - public: - BitFilter(uint64_t bits) : Filter(bits, 1) {} - BitFilter(FileHandler* fin, bool loaddata = true) - : Filter(fin, loaddata) { - if (loaddata) - CHECK(load(fin)); - } - // TODO: overload operator[] - virtual bool testBit(uint64_t location) { - // test bit referenced by location - return data_[(location % addresses_) >> 3] & 1 << ((location % addresses_) % 8); - } - virtual bool setBit(uint64_t location) { - // set bit referenced by location - data_[(location % addresses_) >> 3] |= 1 << ((location % addresses_) % 8); +/* Class Filter wraps a contiguous array of data. Filter and its subclasses + * implement read/write/increment functionality on arrays with arbitrary sized addresses + * (i.e. an address may not use a full number of bytes). When converting to byte-based + * representation we assume "unused" bits are to left. + * E.g. if the underlying data is stored in units T = uint16 and the 'width' = 11 + * to read 'address' = 3 we extract bits at indices [33,42] (i.e. [11*3, 11*4 - 1]) + * and store in a uint16 in positions 0000 0001 111111 where the first 7 bits have + * been masked out. + */ +template +class Filter +{ +public: + Filter(uint64_t addresses, int width) : addresses_(addresses), width_(width), data_(NULL) { + // number of bits in T + cell_width_ = sizeof(T) << 3; + // current implementation has following constraints + CHECK(cell_width_ > 0 && cell_width_ <= 64 && cell_width_ >= width); + // used for >> division + log_cell_width_ = static_cast(floor(log((double)cell_width_)/log((double)2) + 0.000001)); + // size of underlying data in Ts + cells_ = ((addresses * width) + cell_width_ - 1) >> log_cell_width_; + // instantiate underlying data + data_ = new T[cells_]; + CHECK(data_ != NULL); + CHECK(reset()); + // 'first_bit' marks the first bit used by 'address' (left padded with zeros). + first_bit_ = (width % cell_width_ == 0) ? 
0 : cell_width_ - (width % cell_width_); + // mask for full cell + full_mask_ = static_cast(0xffffffffffffffffull); + // mask for bits that make up the address + address_mask_ = full_mask_ >> first_bit_; + } + Filter(FileHandler* fin, bool loaddata = true) : data_(NULL) { + CHECK(loadHeader(fin)); + if (loaddata) + CHECK(loadData(fin)); + } + virtual ~Filter() { + delete[] data_; + } + bool reset() { + for (uint64_t i = 0; i < cells_; ++i) + data_[i] = 0; + return true; + } + count_t size() { + // return approx size of filter in MBs + return cells_ * sizeof(T) >> 20; + } + // read / write functions + inline bool read(uint64_t address, T* value) { + CHECK(address <= addresses_); + // copy address to 'value' + uint64_t data_bit = address * width_; + uint32_t data_cell = (data_bit >> log_cell_width_); // % cells_; + // 'offset' shows how address in 'data' and 'value' align + int offset = (data_bit % cell_width_) - first_bit_; + // they align so just copy across masking unneeded leading bits + if (offset == 0) { + *value = data_[data_cell] & address_mask_; return true; } - virtual bool clearBit(uint64_t location) { - // set bit referenced by location - data_[(location % addresses_) >> 3] &= 0 << ((location % addresses_) % 8); + // data address starts to left so shift it right + if (offset < 0) { + *value = (data_[data_cell] >> -offset) & address_mask_; return true; } - bool save(FileHandler* fout) { - CHECK(Filter::save(fout)); - std::cerr << "Saved BitFilter. Rho = " << rho() << "." << std::endl;; + // data address is to right so shift it left and look at one more cell to right + *value = ((data_[data_cell] << offset) + | (data_[data_cell + 1] >> (cell_width_ - offset))) & address_mask_ ; + return true; + } + inline T read(uint64_t address) { + CHECK(address <= addresses_); + // return value at address + T value = 0; + uint64_t data_bit = address * width_; + uint32_t data_cell = (data_bit >> log_cell_width_); // % cells_; + // 'offset' shows how address in 'data' and 'value' align + int offset = (data_bit % cell_width_) - first_bit_; + // they align so just copy across masking unneeded leading bits + if (offset == 0) { + value = data_[data_cell] & address_mask_; + } + // data address starts to left so shift it right + else if (offset < 0) { + value = (data_[data_cell] >> -offset) & address_mask_; + } + // data address is to right so shift it left and look at one more cell to right + else + value = ((data_[data_cell] << offset) + | (data_[data_cell + 1] >> (cell_width_ - offset))) & address_mask_ ; + return value; + } + inline bool write(uint64_t address, T value) { + CHECK(address <= addresses_); + CHECK(log2(value) <= width_); + // write 'value' to address + uint64_t data_bit = address * width_; + uint32_t data_cell = (data_bit >> log_cell_width_); // % cells_; + // 'offset' shows how address in 'data' and 'value' align + int offset = (data_bit % cell_width_) - first_bit_; + // they align so just copy across masking unneeded leading zeros of value + if (offset == 0) { + data_[data_cell] = value | (data_[data_cell] & ~address_mask_); return true; } - float rho(uint64_t limit = 0) { - uint64_t ones = 0; - uint64_t range = limit > 0 ? std::min(limit,cells_) : cells_; - for (uint64_t i = 0; i < range; ++i) - for (int j = 0; j < 8; ++j) - if (data_[i] & (1 << j)) - ++ones; - return static_cast((range << 3) - ones)/static_cast(range << 3); - } - protected: - bool load(FileHandler* fin) { - std::cerr << "Loaded BitFilter. Rho = " << rho() << "." 
<< std::endl;; + // address in data is to left so shift value left by -offset + if (offset < 0) { + data_[data_cell] = (value << -offset) + | (data_[data_cell] & ~(address_mask_ << -offset)); return true; } - }; -/* + // address in data is to right so shift value right by offset + data_[data_cell] = (value >> offset) | + (data_[data_cell] & ~(address_mask_ >> offset)); + data_[data_cell + 1] = (value << (cell_width_ - offset)) | + (data_[data_cell + 1] & (full_mask_ >> offset)); + return true; + } + inline bool readWithFingerprint(uint64_t address, T finger, T* value) { + // copy 'address' ^ 'finger' to 'value' + uint64_t data_bit = address * width_; + uint32_t data_cell = (data_bit >> log_cell_width_); // % cells_; + // 'offset' shows how address in 'data' and 'value' align + int offset = (data_bit % cell_width_) - first_bit_; + // they align so just copy across masking unneeded leading bits + if (offset == 0) { + *value = (finger ^ data_[data_cell]) & address_mask_; + return true; + } + // data address starts to left so shift it right + if (offset < 0) { + *value = ((data_[data_cell] >> -offset) ^ finger) & address_mask_; + return true; + } + // data address is to right so shift it left and look at one more cell to right + *value = (((data_[data_cell] << offset) + | (data_[data_cell + 1] >> (cell_width_ - offset))) ^ finger) + & address_mask_ ; + return true; + } + inline bool writeWithFingerprint(uint64_t address, T finger, T value) { + // write 'value' ^ 'finger' to address + finger &= address_mask_; // make sure fingerprint is correct size + uint64_t data_bit = address * width_; + uint32_t data_cell = (data_bit >> log_cell_width_); // % cells_; + // 'offset' shows how address in 'data' and 'value' align + int offset = (data_bit % cell_width_) - first_bit_; + // they align so just copy across masking unneeded leading zeros of value + if (offset == 0) { + data_[data_cell] = (finger ^ value) | (data_[data_cell] & ~address_mask_); + return true; + } + // address in data is to left so shift value left by -offset + if (offset < 0) { + data_[data_cell] = ((finger ^ value) << -offset) + | (data_[data_cell] & ~(address_mask_ << -offset)); + return true; + } + // address in data is to right so shift value right by offset + data_[data_cell] = ((finger ^ value) >> offset) | + (data_[data_cell] & ~(address_mask_ >> offset)); + data_[data_cell + 1] = ((finger ^ value) << (cell_width_ - offset)) | + (data_[data_cell + 1] & (full_mask_ >> offset)); + return true; + } + // debugging + void printFilter(const std::string & prefix = "", uint32_t truncate = 64) { + std::cout << prefix; + for (uint32_t i = 0; i < cells_ && i < truncate; ++i) { + for (int j = cell_width_ - 1; j >= 0; --j) + if (data_[i] & (1ull << j)) + std::cout << 1; + else + std::cout << 0; + std::cout << "\n"; + } + std::cout << std::endl; + } + // i/o + uint64_t getAddresses() { + return addresses_; + } + int getWidth() { + return width_; + } + int getCellWidth() { + return cell_width_; + } + uint32_t getCells() { + return cells_; + } + virtual bool save(FileHandler* out) { + CHECK(out != NULL); + CHECK(out->write((char*)&cells_, sizeof(cells_))); + CHECK(out->write((char*)&cell_width_, sizeof(cell_width_))); + CHECK(out->write((char*)&log_cell_width_, sizeof(log_cell_width_))); + CHECK(out->write((char*)&addresses_, sizeof(addresses_))); + CHECK(out->write((char*)&width_, sizeof(width_))); + CHECK(out->write((char*)&first_bit_, sizeof(first_bit_))); + CHECK(out->write((char*)&full_mask_, sizeof(full_mask_))); + 
CHECK(out->write((char*)&address_mask_, sizeof(address_mask_))); + //CHECK(out->write((char*)data_, cells_ * sizeof(T))); + const uint64_t jump = 524288032ul; //(uint64_t)pow(2, 29); + if((width_ == 1) || cells_ < jump) + CHECK(out->write((char*)data_, cells_ * sizeof(T))); + else { + uint64_t idx(0); + while(idx + jump < cells_) { + CHECK(out->write((char*)&data_[idx], jump * sizeof(T))); + idx += jump; + } + CHECK(out->write((char*)&data_[idx], (cells_ - idx) * sizeof(T))); + } + return true; + } +protected: + bool loadHeader(FileHandler* fin) { + CHECK(fin != NULL); + CHECK(fin->read((char*)&cells_, sizeof(cells_))); + CHECK(fin->read((char*)&cell_width_, sizeof(cell_width_))); + CHECK(cell_width_ == sizeof(T) << 3); // make sure correct underlying data type + CHECK(fin->read((char*)&log_cell_width_, sizeof(log_cell_width_))); + CHECK(fin->read((char*)&addresses_, sizeof(addresses_))); + CHECK(fin->read((char*)&width_, sizeof(width_))); + CHECK(fin->read((char*)&first_bit_, sizeof(first_bit_))); + CHECK(fin->read((char*)&full_mask_, sizeof(full_mask_))); + CHECK(fin->read((char*)&address_mask_, sizeof(address_mask_))); + return true; + } + bool loadData(FileHandler* fin) { + // instantiate underlying array + data_ = new T[cells_]; + CHECK(data_ != NULL); + CHECK(fin->read((char*)data_, cells_ * sizeof(T))); + //CHECK(fin->read((char*)&data_[0], ceil(float(cells_) / 2.0) * sizeof(T))); + //CHECK(fin->read((char*)&data_[cells_ / 2], (cells_ / 2) * sizeof(T))); + return true; + } + uint64_t cells_; // number T making up 'data_' + int cell_width_; // bits per cell (i.e. sizeof(T) << 3) + int log_cell_width_; // log of bits used for >> division + uint64_t addresses_; // number of addresses in the filter + int width_; // width in bits of each address + int first_bit_; // position of first bit in initial byte + T full_mask_; // all 1s + T address_mask_; // 1s in those positions that are part of address + T* data_; // the raw data as bytes +}; + +// Extension with bit test/setter methods added +class BitFilter : public Filter +{ +public: + BitFilter(uint64_t bits) : Filter(bits, 1) {} + BitFilter(FileHandler* fin, bool loaddata = true) + : Filter(fin, loaddata) { + if (loaddata) + CHECK(load(fin)); + } + // TODO: overload operator[] + virtual bool testBit(uint64_t location) { + // test bit referenced by location + return data_[(location % addresses_) >> 3] & 1 << ((location % addresses_) % 8); + } + virtual bool setBit(uint64_t location) { + // set bit referenced by location + data_[(location % addresses_) >> 3] |= 1 << ((location % addresses_) % 8); + return true; + } + virtual bool clearBit(uint64_t location) { + // set bit referenced by location + data_[(location % addresses_) >> 3] &= 0 << ((location % addresses_) % 8); + return true; + } + bool save(FileHandler* fout) { + CHECK(Filter::save(fout)); + std::cerr << "Saved BitFilter. Rho = " << rho() << "." << std::endl;; + return true; + } + float rho(uint64_t limit = 0) { + uint64_t ones = 0; + uint64_t range = limit > 0 ? std::min(limit,cells_) : cells_; + for (uint64_t i = 0; i < range; ++i) + for (int j = 0; j < 8; ++j) + if (data_[i] & (1 << j)) + ++ones; + return static_cast((range << 3) - ones)/static_cast(range << 3); + } +protected: + bool load(FileHandler* fin) { + std::cerr << "Loaded BitFilter. Rho = " << rho() << "." 
<< std::endl;; + return true; + } +}; +/* // ResizedBitFilter deals with resizing to save memory // whereas other filters should expect locations to be within range // this filter will need to resize (and possibly rehash) locations @@ -385,9 +396,9 @@ namespace randlm { carry = incrementSubCell(data_bit, this->width_, &this->data_[data_cell]); } // last update must not have carried - if (!carry) + if (!carry) return true; - // wrapped round so check whether need to reset to max count + // wrapped round so check whether need to reset to max count if (!wrap_around_) CHECK(this->write(address, this->address_mask_)); return false; // false to indicate that overflowed @@ -402,7 +413,7 @@ namespace randlm { } inline bool incrementSubCell(int bit, int len, T* cell) { // increment counter consisting of bits [startbit, startbit + len - 1] rest stays unchanged - *cell = ((((*cell >> (this->cell_width_ - bit - len)) + 1) + *cell = ((((*cell >> (this->cell_width_ - bit - len)) + 1) & (this->full_mask_ >> (this->cell_width_ - len))) << (this->cell_width_ - bit - len)) | (*cell & ~(((this->full_mask_ >> (this->cell_width_ - len)) << (this->cell_width_ - bit - len)))); // indicate overflow as true diff --git a/moses/TranslationModel/DynSAInclude/hash.h b/moses/TranslationModel/DynSAInclude/hash.h index 03669845e..9e6cfe62a 100644 --- a/moses/TranslationModel/DynSAInclude/hash.h +++ b/moses/TranslationModel/DynSAInclude/hash.h @@ -11,60 +11,68 @@ typedef uint64_t P; // largest input range is 2^64 //! @todo ask abby2 template -class HashBase { - protected: - T m_; // range of hash output - count_t H_; // number of hash functions to instantiate - virtual void initSeeds()=0; - virtual void freeSeeds()=0; - public: - HashBase(float m, count_t H=1):m_((T)m), H_(H) { - //cerr << "range = (0..." << m_ << "]" << endl; - } - HashBase(FileHandler* fin) { - load(fin); - } - virtual ~HashBase(){} - virtual T hash(const char*s, count_t h)=0; // string hashing - virtual T hash(const wordID_t* id, const int len, count_t h)=0; // vocab mapped hashing - count_t size() { return H_;} - virtual void save(FileHandler* fout) { - CHECK(fout != 0); - fout->write((char*)&m_, sizeof(m_)); - fout->write((char*)&H_, sizeof(H_)); - } - virtual void load(FileHandler* fin) { - CHECK(fin != 0); - fin->read((char*)&m_, sizeof(m_)); - fin->read((char*)&H_, sizeof(H_)); - } +class HashBase +{ +protected: + T m_; // range of hash output + count_t H_; // number of hash functions to instantiate + virtual void initSeeds()=0; + virtual void freeSeeds()=0; +public: + HashBase(float m, count_t H=1):m_((T)m), H_(H) { + //cerr << "range = (0..." << m_ << "]" << endl; + } + HashBase(FileHandler* fin) { + load(fin); + } + virtual ~HashBase() {} + virtual T hash(const char*s, count_t h)=0; // string hashing + virtual T hash(const wordID_t* id, const int len, count_t h)=0; // vocab mapped hashing + count_t size() { + return H_; + } + virtual void save(FileHandler* fout) { + CHECK(fout != 0); + fout->write((char*)&m_, sizeof(m_)); + fout->write((char*)&H_, sizeof(H_)); + } + virtual void load(FileHandler* fin) { + CHECK(fin != 0); + fin->read((char*)&m_, sizeof(m_)); + fin->read((char*)&H_, sizeof(H_)); + } }; //! 
@todo ask abby2 template -class UnivHash_linear: public HashBase { - public: - UnivHash_linear(float m, count_t H, P pr): - HashBase(m, H), pr_(pr) { - //CHECK(isPrime(pr_)); - initSeeds(); - } - UnivHash_linear(FileHandler* fin): - HashBase(fin) { - load(fin); - } - ~UnivHash_linear() {freeSeeds();} - T hash(const char* s, count_t h){return 0;} //not implemented - T hash(const wordID_t* id, const int len, count_t h); - T hash(const wordID_t id, const count_t pos, - const T prevValue, count_t h); - void save(FileHandler* fout); - void load(FileHandler* fin); - private: - T** a_, **b_; - P pr_; - void initSeeds(); - void freeSeeds(); +class UnivHash_linear: public HashBase +{ +public: + UnivHash_linear(float m, count_t H, P pr): + HashBase(m, H), pr_(pr) { + //CHECK(isPrime(pr_)); + initSeeds(); + } + UnivHash_linear(FileHandler* fin): + HashBase(fin) { + load(fin); + } + ~UnivHash_linear() { + freeSeeds(); + } + T hash(const char* s, count_t h) { + return 0; //not implemented + } + T hash(const wordID_t* id, const int len, count_t h); + T hash(const wordID_t id, const count_t pos, + const T prevValue, count_t h); + void save(FileHandler* fout); + void load(FileHandler* fin); +private: + T** a_, **b_; + P pr_; + void initSeeds(); + void freeSeeds(); }; /** UnivHash_noPrimes: @@ -74,76 +82,91 @@ class UnivHash_linear: public HashBase { * # of hash function = 2^(l-1) */ template -class UnivHash_noPrimes: public HashBase { - public: - UnivHash_noPrimes(float k, float l): - HashBase(k, 100), d_(count_t((l-k))) { - if(((int)l >> 3) == sizeof(P)) p_ = (P) pow(2,l) - 1; - else p_ = (P) pow(2,l); - initSeeds(); - } - UnivHash_noPrimes(FileHandler* fin): - HashBase(fin) { - load(fin); - } - ~UnivHash_noPrimes() {freeSeeds();} - T hash(const char* s, count_t h); - T hash(const wordID_t* id, const int len, count_t h); - T hash(const P x, count_t h); - void save(FileHandler* fout); - void load(FileHandler* fin); - private: - count_t d_; // l-k - P p_, *a_; // real-valued input range, storage - void initSeeds(); - void freeSeeds() {delete[] a_;} +class UnivHash_noPrimes: public HashBase +{ +public: + UnivHash_noPrimes(float k, float l): + HashBase(k, 100), d_(count_t((l-k))) { + if(((int)l >> 3) == sizeof(P)) p_ = (P) pow(2,l) - 1; + else p_ = (P) pow(2,l); + initSeeds(); + } + UnivHash_noPrimes(FileHandler* fin): + HashBase(fin) { + load(fin); + } + ~UnivHash_noPrimes() { + freeSeeds(); + } + T hash(const char* s, count_t h); + T hash(const wordID_t* id, const int len, count_t h); + T hash(const P x, count_t h); + void save(FileHandler* fout); + void load(FileHandler* fin); +private: + count_t d_; // l-k + P p_, *a_; // real-valued input range, storage + void initSeeds(); + void freeSeeds() { + delete[] a_; + } }; //! 
@todo ask abby2 template -class Hash_shiftAddXOR: public HashBase { - public: - Hash_shiftAddXOR(float m, count_t H=5): HashBase(m,H), - l_(5), r_(2) { - initSeeds(); - } - ~Hash_shiftAddXOR() {freeSeeds();} - T hash(const char* s, count_t h); - T hash(const wordID_t* id, const int len, count_t h) {} // empty - private: - T* v_; // random seed storage - const unsigned short l_, r_; // left-shift bits, right-shift bits - void initSeeds(); - void freeSeeds() {delete[] v_;} +class Hash_shiftAddXOR: public HashBase +{ +public: + Hash_shiftAddXOR(float m, count_t H=5): HashBase(m,H), + l_(5), r_(2) { + initSeeds(); + } + ~Hash_shiftAddXOR() { + freeSeeds(); + } + T hash(const char* s, count_t h); + T hash(const wordID_t* id, const int len, count_t h) {} // empty +private: + T* v_; // random seed storage + const unsigned short l_, r_; // left-shift bits, right-shift bits + void initSeeds(); + void freeSeeds() { + delete[] v_; + } }; //! @todo ask abby2 template -class UnivHash_tableXOR: public HashBase { - public: - UnivHash_tableXOR(float m, count_t H=5): HashBase(m, H), - table_(NULL), tblLen_(255*MAX_STR_LEN) { - initSeeds(); - } - ~UnivHash_tableXOR() {freeSeeds();} - T hash(const char* s, count_t h); - T hash(const wordID_t* id, const int len, count_t h) {} - private: - T** table_; // storage for random numbers - count_t tblLen_; // length of table - void initSeeds(); - void freeSeeds(); +class UnivHash_tableXOR: public HashBase +{ +public: + UnivHash_tableXOR(float m, count_t H=5): HashBase(m, H), + table_(NULL), tblLen_(255*MAX_STR_LEN) { + initSeeds(); + } + ~UnivHash_tableXOR() { + freeSeeds(); + } + T hash(const char* s, count_t h); + T hash(const wordID_t* id, const int len, count_t h) {} +private: + T** table_; // storage for random numbers + count_t tblLen_; // length of table + void initSeeds(); + void freeSeeds(); }; // ShiftAddXor template -void Hash_shiftAddXOR::initSeeds() { +void Hash_shiftAddXOR::initSeeds() +{ v_ = new T[this->H_]; for(count_t i=0; i < this->H_; i++) - v_[i] = Utils::rand() + 1; + v_[i] = Utils::rand() + 1; } template -T Hash_shiftAddXOR::hash(const char* s, count_t h) { +T Hash_shiftAddXOR::hash(const char* s, count_t h) +{ T value = v_[h]; int pos(0); unsigned char c; @@ -155,40 +178,44 @@ T Hash_shiftAddXOR::hash(const char* s, count_t h) { // UnivHash_tableXOR template -void UnivHash_tableXOR::initSeeds() { +void UnivHash_tableXOR::initSeeds() +{ // delete any values in table - if(table_) freeSeeds(); + if(table_) freeSeeds(); // instance of new table table_ = new T* [this->H_]; // fill with random values for(count_t j=0; j < this->H_; j++) { table_[j] = new T[tblLen_]; - for(count_t i=0; i < tblLen_; i++) { - table_[j][i] = Utils::rand(this->m_-1); + for(count_t i=0; i < tblLen_; i++) { + table_[j][i] = Utils::rand(this->m_-1); } } } template -void UnivHash_tableXOR::freeSeeds() { +void UnivHash_tableXOR::freeSeeds() +{ for(count_t j = 0; j < this->H_; j++) delete[] table_[j]; delete[] table_; table_ = NULL; } template -T UnivHash_tableXOR::hash(const char* s, count_t h) { +T UnivHash_tableXOR::hash(const char* s, count_t h) +{ T value = 0; count_t pos = 0, idx = 0; unsigned char c; while((c = *s++) && (++pos < MAX_STR_LEN)) value ^= table_[h][idx += c]; - CHECK(value < this->m_); + CHECK(value < this->m_); return value; } // UnivHash_noPrimes template -void UnivHash_noPrimes::initSeeds() { +void UnivHash_noPrimes::initSeeds() +{ a_ = new P[this->H_]; for(T i=0; i < this->H_; i++) { a_[i] = Utils::rand

(); @@ -196,14 +223,16 @@ void UnivHash_noPrimes::initSeeds() { } } template -T UnivHash_noPrimes::hash(const P x, count_t h) { +T UnivHash_noPrimes::hash(const P x, count_t h) +{ // h_a(x) = (ax mod 2^l) div 2^(l-k) T value = ((a_[h] * x) % p_) >> d_; return value % this->m_; } template -T UnivHash_noPrimes::hash(const wordID_t* id, const int len, - count_t h) { +T UnivHash_noPrimes::hash(const wordID_t* id, const int len, + count_t h) +{ T value = 0; int pos(0); while(pos < len) { @@ -213,39 +242,42 @@ T UnivHash_noPrimes::hash(const wordID_t* id, const int len, return value % this->m_; } template -T UnivHash_noPrimes::hash(const char* s, count_t h) { +T UnivHash_noPrimes::hash(const char* s, count_t h) +{ T value = 0; int pos(0); unsigned char c; while((c = *s++) && (++pos < MAX_STR_LEN)) { - value ^= hash((P)c, h); + value ^= hash((P)c, h); } return value % this->m_; } template -void UnivHash_noPrimes::save(FileHandler* fout) { +void UnivHash_noPrimes::save(FileHandler* fout) +{ HashBase::save(fout); fout->write((char*)&p_, sizeof(p_)); fout->write((char*)&d_, sizeof(d_)); - for(T i=0; i < this->H_; i++) { + for(T i=0; i < this->H_; i++) { fout->write((char*)&a_[i], sizeof(a_[i])); } } template -void UnivHash_noPrimes::load(FileHandler* fin) { +void UnivHash_noPrimes::load(FileHandler* fin) +{ a_ = new P[this->H_]; // HashBase::load(fin) already done in constructor fin->read((char*)&p_, sizeof(p_)); fin->read((char*)&d_, sizeof(d_)); - for(T i=0; i < this->H_; i++) - { + for(T i=0; i < this->H_; i++) { fin->read((char*)&a_[i], sizeof(a_[i])); } } //UnivHash_linear template -void UnivHash_linear::initSeeds() { +void UnivHash_linear::initSeeds() +{ a_ = new T*[this->H_]; b_ = new T*[this->H_]; for(count_t i=0; i < this->H_; i++) { @@ -258,7 +290,8 @@ void UnivHash_linear::initSeeds() { } } template -void UnivHash_linear::freeSeeds() { +void UnivHash_linear::freeSeeds() +{ for(count_t i=0; i < this->H_; i++) { delete[] a_[i]; delete[] b_[i]; @@ -268,8 +301,9 @@ void UnivHash_linear::freeSeeds() { a_ = b_ = NULL; } template -inline T UnivHash_linear::hash(const wordID_t* id, const int len, - count_t h) { +inline T UnivHash_linear::hash(const wordID_t* id, const int len, + count_t h) +{ CHECK(h < this->H_); T value = 0; int pos(0); @@ -281,19 +315,21 @@ inline T UnivHash_linear::hash(const wordID_t* id, const int len, } template inline T UnivHash_linear::hash(const wordID_t id, const count_t pos, - const T prevValue, count_t h) { + const T prevValue, count_t h) +{ CHECK(h < this->H_); T value = prevValue + ((a_[h][pos] * id) + b_[h][pos]); // % pr_; return value % this->m_; } template -void UnivHash_linear::save(FileHandler* fout) { +void UnivHash_linear::save(FileHandler* fout) +{ // int bytes = sizeof(a_[0][0]); HashBase::save(fout); fout->write((char*)&pr_, sizeof(pr_)); for(count_t i=0; i < this->H_; i++) { for(count_t j=0; j < MAX_NGRAM_ORDER; j++) { - fout->write((char*)&a_[i][j], sizeof(a_[i][j])); + fout->write((char*)&a_[i][j], sizeof(a_[i][j])); fout->write((char*)&b_[i][j], sizeof(b_[i][j])); //cout << "a[" << i << "][" << j << "]=" << a_[i][j] << endl; //cout << "b[" << i << "][" << j << "]=" << b_[i][j] << endl; @@ -301,7 +337,8 @@ void UnivHash_linear::save(FileHandler* fout) { } } template -void UnivHash_linear::load(FileHandler* fin) { +void UnivHash_linear::load(FileHandler* fin) +{ // HashBase::load(fin) already done in constructor fin->read((char*)&pr_, sizeof(pr_)); a_ = new T*[this->H_]; @@ -310,8 +347,8 @@ void UnivHash_linear::load(FileHandler* fin) { a_[i] = new 
T[MAX_NGRAM_ORDER];
     b_[i] = new T[MAX_NGRAM_ORDER];
     for(count_t j=0; j < MAX_NGRAM_ORDER; j++) {
-      fin->read((char*)&a_[i][j], sizeof(a_[i][j]));
-      fin->read((char*)&b_[i][j], sizeof(b_[i][j]));
+      fin->read((char*)&a_[i][j], sizeof(a_[i][j]));
+      fin->read((char*)&b_[i][j], sizeof(b_[i][j]));
       //cout << "a[" << i << "][" << j << "]=" << a_[i][j] << endl;
       //cout << "b[" << i << "][" << j << "]=" << b_[i][j] << endl;
     }
diff --git a/moses/TranslationModel/DynSAInclude/onlineRLM.h b/moses/TranslationModel/DynSAInclude/onlineRLM.h
index b47cfdd0e..527f1e5d9 100644
--- a/moses/TranslationModel/DynSAInclude/onlineRLM.h
+++ b/moses/TranslationModel/DynSAInclude/onlineRLM.h
@@ -18,27 +18,28 @@ const bool strict_checks_ = false;
 
 //! @todo ask abby2
 template
-class OnlineRLM: public PerfectHash {
+class OnlineRLM: public PerfectHash
+{
 public:
-  OnlineRLM(uint16_t MBs, int width, int bucketRange, count_t order,
-    Moses::Vocab* v, float qBase = 8): PerfectHash(MBs, width, bucketRange, qBase),
+  OnlineRLM(uint16_t MBs, int width, int bucketRange, count_t order,
+            Moses::Vocab* v, float qBase = 8): PerfectHash(MBs, width, bucketRange, qBase),
     vocab_(v), bAdapting_(false), order_(order), corpusSize_(0), alpha_(0) {
     CHECK(vocab_ != 0); //instantiate quantizer class here
     cache_ = new Cache(8888.8888, 9999.9999); // unknown_value, null_value
     alpha_ = new float[order_ + 1];
-    for(count_t i = 0; i <= order_; ++i)
+    for(count_t i = 0; i <= order_; ++i)
       alpha_[i] = i * log10(0.4);
     cerr << "Initializing auxiliary bit filters...\n";
     bPrefix_ = new BitFilter(this->cells_);
     bHit_ = new BitFilter(this->cells_);
   }
-  OnlineRLM(FileHandler* fin, count_t order):
+  OnlineRLM(FileHandler* fin, count_t order):
     PerfectHash(fin), bAdapting_(true), order_(order), corpusSize_(0) {
     load(fin);
     cache_ = new Cache(8888.8888, 9999.9999); // unknown_value, null_value
     alpha_ = new float[order_ + 1];
-    for(count_t i = 0; i <= order_; ++i)
+    for(count_t i = 0; i <= order_; ++i)
       alpha_[i] = i * log10(0.4);
   }
   ~OnlineRLM() {
@@ -54,14 +55,18 @@ public:
   bool insert(const std::vector& ngram, const int value);
   bool update(const std::vector& ngram, const int value);
   int query(const wordID_t* IDs, const int len);
-  int sbsqQuery(const std::vector& ngram, int* len,
-    bool bStrict = false);
-  int sbsqQuery(const wordID_t* IDs, const int len, int* codes,
-    bool bStrict = false);
+  int sbsqQuery(const std::vector& ngram, int* len,
+                bool bStrict = false);
+  int sbsqQuery(const wordID_t* IDs, const int len, int* codes,
+                bool bStrict = false);
   void remove(const std::vector& ngram);
   count_t heurDelete(count_t num2del, count_t order = 5);
-  uint64_t corpusSize() {return corpusSize_;}
-  void corpusSize(uint64_t c) {corpusSize_ = c;}
+  uint64_t corpusSize() {
+    return corpusSize_;
+  }
+  void corpusSize(uint64_t c) {
+    corpusSize_ = c;
+  }
   void clearCache() {
     if(cache_) cache_->clear();
   }
@@ -79,7 +84,7 @@ protected:
   void markQueried(hpdEntry_t& value);
   bool markPrefix(const wordID_t* IDs, const int len, bool bSet);
 private:
-  const void* getContext(const wordID_t* ngram, int len);
+  const void* getContext(const wordID_t* ngram, int len);
   const bool bAdapting_; // used to signal adaptation of model
   const count_t order_; // LM order
   uint64_t corpusSize_; // total training corpus size
@@ -90,48 +95,50 @@ private:
 };
 
 template
-bool OnlineRLM::insert(const std::vector& ngram, const int value) {
+bool OnlineRLM::insert(const std::vector& ngram, const int value)
+{
   int len = ngram.size();
   wordID_t wrdIDs[len];
   uint64_t index(this->cells_ + 1);
-  for(int i = 0; i < len; ++i)
< len; ++i) + for(int i = 0; i < len; ++i) wrdIDs[i] = vocab_->GetWordID(ngram[i]); index = PerfectHash::insert(wrdIDs, len, value); if(value > 1 && len < order_) markPrefix(wrdIDs, ngram.size(), true); // mark context // keep track of total items from training data minus "" - if(ngram.size() == 1 && (!bAdapting_)) // hack to not change corpusSize when adapting + if(ngram.size() == 1 && (!bAdapting_)) // hack to not change corpusSize when adapting corpusSize_ += (wrdIDs[0] != vocab_->GetBOSWordID()) ? value : 0; - if(bAdapting_ && (index < this->cells_)) // mark to keep while adapting + if(bAdapting_ && (index < this->cells_)) // mark to keep while adapting markQueried(index); return true; } template -bool OnlineRLM::update(const std::vector& ngram, const int value) { +bool OnlineRLM::update(const std::vector& ngram, const int value) +{ int len = ngram.size(); std::vector wrdIDs(len); uint64_t index(this->cells_ + 1); hpdEntry_t hpdItr; vocab_->MakeOpen(); - for(int i = 0; i < len; ++i) + for(int i = 0; i < len; ++i) wrdIDs[i] = vocab_->GetWordID(ngram[i]); - // if updating, minimize false positives by pre-checking if context already in model - bool bIncluded(true); + // if updating, minimize false positives by pre-checking if context already in model + bool bIncluded(true); if(value > 1 && len < (int)order_) bIncluded = markPrefix(&wrdIDs[0], ngram.size(), true); // mark context - if(bIncluded) { // if context found + if(bIncluded) { // if context found bIncluded = PerfectHash::update2(&wrdIDs[0], len, value, hpdItr, index); if(index < this->cells_) { markQueried(index); - } - else if(hpdItr != this->dict_.end()) markQueried(hpdItr); + } else if(hpdItr != this->dict_.end()) markQueried(hpdItr); } return bIncluded; } template -int OnlineRLM::query(const wordID_t* IDs, int len) { +int OnlineRLM::query(const wordID_t* IDs, int len) +{ uint64_t filterIdx = 0; hpdEntry_t hpdItr; int value(0); @@ -140,8 +147,7 @@ int OnlineRLM::query(const wordID_t* IDs, int len) { if(hpdItr != this->dict_.end()) { //markQueried(hpdItr); // mark this event as "hit" value -= ((value & this->hitMask_) != 0) ? this->hitMask_ : 0; // check for previous hit marks - } - else { + } else { CHECK(filterIdx < this->cells_); //markQueried(filterIdx); } @@ -150,15 +156,16 @@ } template -bool OnlineRLM::markPrefix(const wordID_t* IDs, const int len, bool bSet) { - if(len <= 1) return true; // only do this for for ngrams with context - static Cache pfCache(-1, -1); // local prefix cache +bool OnlineRLM::markPrefix(const wordID_t* IDs, const int len, bool bSet) +{ + if(len <= 1) return true; // only do this for ngrams with context + static Cache pfCache(-1, -1); // local prefix cache int code(0); - if(!pfCache.checkCacheNgram(IDs, len - 1, &code, NULL)) { - hpdEntry_t hpdItr; + if(!pfCache.checkCacheNgram(IDs, len - 1, &code, NULL)) { + hpdEntry_t hpdItr; uint64_t filterIndex(0); code = PerfectHash::query(IDs, len - 1, hpdItr, filterIndex); // hash IDs[0..len-1] - if(code == -1) { // encountered false positive in pipeline + if(code == -1) { // encountered false positive in pipeline cerr << "WARNING: markPrefix(). 
The O-RLM is *not* well-formed.\n"; // add all prefixes or return false; return false; @@ -167,10 +174,9 @@ bool OnlineRLM::markPrefix(const wordID_t* IDs, const int len, bool bSet) { CHECK(hpdItr == this->dict_.end()); if(bSet) bPrefix_->setBit(filterIndex); // mark index else bPrefix_->clearBit(filterIndex); // unset index - } - else { + } else { CHECK(filterIndex == this->cells_ + 1); - //how to handle hpd prefixes? + //how to handle hpd prefixes? } if(pfCache.nodes() > 10000) pfCache.clear(); pfCache.setCacheNgram(IDs, len - 1, code, NULL); @@ -179,39 +185,43 @@ bool OnlineRLM::markPrefix(const wordID_t* IDs, const int len, bool bSet) { } template -void OnlineRLM::markQueried(const uint64_t& index) { +void OnlineRLM::markQueried(const uint64_t& index) +{ bHit_->setBit(index); //cerr << "filter[" << index << "] = " << this->filter_->read(index) << endl; } template -void OnlineRLM::markQueried(hpdEntry_t& value) { - // set high bit of counter to indicate "hit" status +void OnlineRLM::markQueried(hpdEntry_t& value) +{ + // set high bit of counter to indicate "hit" status value->second |= this->hitMask_; } template -void OnlineRLM::remove(const std::vector& ngram) { +void OnlineRLM::remove(const std::vector& ngram) +{ wordID_t IDs[ngram.size()]; - for(count_t i = 0; i < ngram.size(); ++i) + for(count_t i = 0; i < ngram.size(); ++i) IDs[i] = vocab_->GetWordID(ngram[i]); PerfectHash::remove(IDs, ngram.size()); } template -count_t OnlineRLM::heurDelete(count_t num2del, count_t order) { +count_t OnlineRLM::heurDelete(count_t num2del, count_t order) +{ count_t deleted = 0; cout << "Deleting " << num2del << " of order "<< order << endl; // delete from filter first - int full = *std::max_element(this->idxTracker_, this->idxTracker_ - + this->totBuckets_); + int full = *std::max_element(this->idxTracker_, this->idxTracker_ + + this->totBuckets_); for(; full > 0; --full) // delete from fullest buckets first - for(int bk = 0; bk < this->totBuckets_; ++bk) { + for(int bk = 0; bk < this->totBuckets_; ++bk) { if(deleted >= num2del) break; if(this->idxTracker_[bk] == full) { // if full uint64_t first = bk * this->bucketRange_, - last = first + this->bucketRange_; - for(uint64_t row = first; row < last; ++row) { // check each row + last = first + this->bucketRange_; + for(uint64_t row = first; row < last; ++row) { // check each row if(!(bHit_->testBit(row) || bPrefix_->testBit(row) )) { if(this->filter_->read(row) != 0) { PerfectHash::remove(row); // remove from filter @@ -231,16 +241,18 @@ count_t OnlineRLM::heurDelete(count_t num2del, count_t order) { template int OnlineRLM::sbsqQuery(const std::vector& ngram, int* codes, - bool bStrict) { + bool bStrict) +{ wordID_t IDs[ngram.size()]; - for(count_t i = 0; i < ngram.size(); ++i) + for(count_t i = 0; i < ngram.size(); ++i) IDs[i] = vocab_->GetWordID(ngram[i]); return sbsqQuery(IDs, ngram.size(), codes, bStrict); } template -int OnlineRLM::sbsqQuery(const wordID_t* IDs, const int len, int* codes, - bool bStrict) { +int OnlineRLM::sbsqQuery(const wordID_t* IDs, const int len, int* codes, + bool bStrict) +{ uint64_t filterIdx = 0; int val(0), fnd(0); hpdEntry_t hpdItr; @@ -252,14 +264,13 @@ int OnlineRLM::sbsqQuery(const wordID_t* IDs, const int len, int* codes, if(hpdItr != this->dict_.end()) { val -= ((val & this->hitMask_) != 0) ? this->hitMask_ : 0; // account for previous hit marks } - } - else if(bStrict) { - break; + } else if(bStrict) { + break; } // add to value array codes[i] = val > 0 ? 
val : 0; } - while(bStrict && (fnd > 1)) { // do checks the other way + while(bStrict && (fnd > 1)) { // do checks the other way val = PerfectHash::query(&IDs[len - fnd], fnd - 1, hpdItr, filterIdx); if(val != -1) break; // if anything found else --fnd; // else decrement found @@ -269,8 +280,9 @@ int OnlineRLM::sbsqQuery(const wordID_t* IDs, const int len, int* codes, } template -float OnlineRLM::getProb(const wordID_t* ngram, int len, - const void** state) { +float OnlineRLM::getProb(const wordID_t* ngram, int len, + const void** state) +{ static const float oovprob = log10(1.0 / (static_cast(vocab_->Size()) - 1)); float logprob(0); const void* context = (state) ? *state : 0; @@ -278,66 +290,66 @@ float OnlineRLM::getProb(const wordID_t* ngram, int len, if(!cache_->checkCacheNgram(ngram, len, &logprob, &context)) { // get full prob and put in cache int num_fnd(0), den_val(0); - int *in = new int[len]; // in[] keeps counts of increasing order numerator + int *in = new int[len]; // in[] keeps counts of increasing order numerator for(int i = 0; i < len; ++i) in[i] = 0; for(int i = len - 1; i >= 0; --i) { if(ngram[i] == vocab_->GetkOOVWordID()) break; // no need to query if OOV in[i] = query(&ngram[i], len - i); if(in[i] > 0) { num_fnd = len - i; - } - else if(strict_checks_) break; + } else if(strict_checks_) break; } while(num_fnd > 1) { // get lower order count - //get sub-context of size one less than length found (exluding target) + //get sub-context of size one less than length found (excluding target) if(((den_val = query(&ngram[len - num_fnd], num_fnd - 1)) > 0) && (den_val >= in[len - num_fnd]) && (in[len - num_fnd] > 0)) { break; - } - else --num_fnd; // else backoff to lower ngram order + } else --num_fnd; // else backoff to lower ngram order } - if(num_fnd == 1 && (in[len - 1] < 1)) // sanity check for unigrams + if(num_fnd == 1 && (in[len - 1] < 1)) // sanity check for unigrams num_fnd = 0; switch(num_fnd) { // find prob (need to refactor into precomputation) - case 0: // OOV - logprob = alpha_[len] + oovprob; - break; - case 1: // unigram found only - CHECK(in[len - 1] > 0); - logprob = alpha_[len - 1] + (corpusSize_ > 0 ? - log10(static_cast(in[len - 1]) / static_cast(corpusSize_)) : 0); - //logprob = alpha_[len - 1] + - //log10(static_cast(in[len - 1]) / static_cast(corpusSize_)); - break; - default: - CHECK(den_val > 0); - //if(subgram == in[len - found]) ++subgram; // avoid returning zero probs???? 
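The switch being reformatted here assigns the final stupid-backoff estimate: a relative frequency at the longest n-gram order found, plus a fixed penalty of log10(0.4) per level of backoff (matching alpha_[i] = i * log10(0.4) in the constructor). A minimal self-contained sketch of that scheme follows; the count table and countOf() helper are illustrative stand-ins for the quantized filter/dictionary queries, not Moses code.

#include <cmath>
#include <cstdint>
#include <map>
#include <vector>

typedef uint32_t wid_t;
typedef std::vector<wid_t> Ngram;

// Toy count store; the real model answers these lookups from the
// Bloomier filter plus the high-performance dictionary, and may
// occasionally return a false positive.
static std::map<Ngram, int> g_counts;

static int countOf(const wid_t* ids, int len) {
  std::map<Ngram, int>::const_iterator it = g_counts.find(Ngram(ids, ids + len));
  return it == g_counts.end() ? 0 : it->second;
}

// Stupid backoff as in getProb(): back off one order at a time, paying
// log10(0.4) per step, until the n-gram and its context are both found.
double stupidBackoffLog10(const wid_t* ngram, int len, uint64_t corpusSize) {
  for (int order = len; order > 1; --order) {
    const wid_t* sub = ngram + (len - order);
    int num = countOf(sub, order);      // count of the sub-ngram itself
    int den = countOf(sub, order - 1);  // count of its context
    if (num > 0 && den >= num)
      return (len - order) * std::log10(0.4) + std::log10(double(num) / den);
  }
  int uni = countOf(ngram + (len - 1), 1);
  if (uni > 0 && corpusSize > 0)        // unigram relative frequency
    return (len - 1) * std::log10(0.4) + std::log10(double(uni) / corpusSize);
  return len * std::log10(0.4) - 6.0;   // OOV floor; getProb() uses log10(1/(V-1))
}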
+ logprob = alpha_[len - num_fnd] + + log10(static_cast(in[len - num_fnd]) / static_cast(den_val)); + break; } // need unique context context = getContext(&ngram[len - num_fnd], num_fnd); // put whatever was found in cache cache_->setCacheNgram(ngram, len, logprob, context); } // end checkCache - return logprob; + return logprob; } template -const void* OnlineRLM::getContext(const wordID_t* ngram, int len) { +const void* OnlineRLM::getContext(const wordID_t* ngram, int len) +{ int dummy(0); float* *addresses = new float*[len]; // only interested in addresses of cache CHECK(cache_->getCache2(ngram, len, &addresses[0], &dummy) == len); // return address of cache node - + float *addr0 = addresses[0]; free( addresses ); return (const void*)addr0; } template -void OnlineRLM::randDelete(int num2del) { +void OnlineRLM::randDelete(int num2del) +{ int deleted = 0; for(uint64_t i = 0; i < this->cells_; i++) { if(this->filter_->read(i) != 0) { @@ -349,19 +361,21 @@ void OnlineRLM::randDelete(int num2del) { } template -int OnlineRLM::countHits() { +int OnlineRLM::countHits() +{ int hit(0); for(uint64_t i = 0; i < this->cells_; ++i) if(bHit_->testBit(i)) ++hit; iterate(this->dict_, itr) - if((itr->second & this->hitMask_) != 0) - ++hit; + if((itr->second & this->hitMask_) != 0) + ++hit; cerr << "Hit count = " << hit << endl; return hit; } template -int OnlineRLM::countPrefixes() { +int OnlineRLM::countPrefixes() +{ int pfx(0); for(uint64_t i = 0; i < this->cells_; ++i) if(bPrefix_->testBit(i)) ++pfx; @@ -371,23 +385,25 @@ int OnlineRLM::countPrefixes() { } template -int OnlineRLM::cleanUpHPD() { +int OnlineRLM::cleanUpHPD() +{ cerr << "HPD size before = " << this->dict_.size() << endl; std::vector vDel, vtmp; iterate(this->dict_, itr) { if(((itr->second & this->hitMask_) == 0) && // if not hit during testing - (Utils::splitToStr(itr->first, vtmp, "¬") >= 3)) { // and higher order ngram + (Utils::splitToStr(itr->first, vtmp, "¬") >= 3)) { // and higher order ngram vDel.push_back(itr->first); } } - iterate(vDel, vitr) - this->dict_.erase(*vitr); + iterate(vDel, vitr) + this->dict_.erase(*vitr); cerr << "HPD size after = " << this->dict_.size() << endl; return vDel.size(); } template -void OnlineRLM::clearMarkings() { +void OnlineRLM::clearMarkings() +{ cerr << "clearing all event hits\n"; bHit_->reset(); count_t* value(0); @@ -398,7 +414,8 @@ void OnlineRLM::clearMarkings() { } template -void OnlineRLM::save(FileHandler* fout) { +void OnlineRLM::save(FileHandler* fout) +{ cerr << "Saving ORLM...\n"; // save vocab vocab_->Save(fout); @@ -412,7 +429,8 @@ void OnlineRLM::save(FileHandler* fout) { } template -void OnlineRLM::load(FileHandler* fin) { +void OnlineRLM::load(FileHandler* fin) +{ cerr << "Loading ORLM...\n"; // load vocab first vocab_ = new Moses::Vocab(fin); @@ -428,12 +446,13 @@ void OnlineRLM::load(FileHandler* fin) { } template -void OnlineRLM::removeNonMarked() { +void OnlineRLM::removeNonMarked() +{ cerr << "deleting all unused events\n"; int deleted(0); for(uint64_t i = 0; i < this->cells_; ++i) { - if(!(bHit_->testBit(i) || bPrefix_->testBit(i)) - && (this->filter_->read(i) != 0)) { + if(!(bHit_->testBit(i) || bPrefix_->testBit(i)) + && (this->filter_->read(i) != 0)) { PerfectHash::remove(i); ++deleted; } @@ -456,36 +475,36 @@ float OnlineRLM::getProb2(const wordID_t* ngram, int len, const void** state) // constrain cache queries using model assumptions int denom_len = cache_->getCache(ngram, len - 1, &denom_codes[0], &denom_found); cerr << "denom_len = " << denom_len << endl; - int 
num_len = cache_->getCache(&ngram[len - denom_len - 1], denom_len + 1, + int num_len = cache_->getCache(&ngram[len - denom_len - 1], denom_len + 1, &num_codes[0], &found); cerr << "num_len= " << num_len << endl; // keep reducing ngram size until both denominator and numerator are found // allowed to leave kUnknownCode in cache because we check for this. found = num_len; // guaranteed to be <= denom_len + 1 // still check for OOV - for (int i = len - found; i < len; ++i) - if (ngram[i] == Vocab::kOOVWordID) { + for (int i = len - found; i < len; ++i) + if (ngram[i] == Vocab::kOOVWordID) { found = len - i - 1; } // check for relative estimator while(found > 1) { - if(*denom_codes[found-1] == cache_unk_ && - ((*denom_codes[found-1] = query(&ngram[len-found], found-1)) == 0)) { + if(*denom_codes[found-1] == cache_unk_ && + ((*denom_codes[found-1] = query(&ngram[len-found], found-1)) == 0)) { //!struct_->query(&ngram[len-*found], *found-1, kMainEventIdx, denom_codes[*found-1])) { *num_codes[found] = cache_unk_; } else { if(*num_codes[found] != cache_unk_ || ((*num_codes[found] = query(&ngram[len-found], found)) <= *denom_codes[found-1])) - // struct_->query(&ngram[len-*found], *found, kMainEventIdx, + // struct_->query(&ngram[len-*found], *found, kMainEventIdx, // num_codes[*found], *denom_codes[*found-1])) break; - } + } --found; } - // didn't find bigram numerator or unigram denominator + // didn't find bigram numerator or unigram denominator if (found == 1) - found = *num_codes[1] != cache_unk_ - || ((*num_codes[1] = query(&ngram[len - 1], 1)) != 0); + found = *num_codes[1] != cache_unk_ + || ((*num_codes[1] = query(&ngram[len - 1], 1)) != 0); //struct_->query(&ngram[len - 1], 1, kMainEventIdx, num_codes[1]); // .... // return estimate applying correct backoff score (precomputed) @@ -496,20 +515,20 @@ float OnlineRLM::getProb2(const wordID_t* ngram, int len, const void** state) //log_prob = stupid_backoff_log10_[len] + uniform_log10prob_; break; case 1: // unigram over whole corpus - log_prob = alpha_[len - 1] + + log_prob = alpha_[len - 1] + log10(static_cast(*num_codes[1]) / static_cast(corpusSize_)); - //log_prob = log_quantiser_->getLog10Value(*num_codes[1]) - corpus_size_log10_ + //log_prob = log_quantiser_->getLog10Value(*num_codes[1]) - corpus_size_log10_ // + stupid_backoff_log10_[len - 1]; // precomputed break; default: // otherwise use both statistics and (possibly zero) backoff weight - log_prob = alpha_[len - found] + + log_prob = alpha_[len - found] + log10(static_cast(*num_codes[found]) / static_cast(*denom_codes[found-1])); - //log_prob = log_quantiser_->getLog10Value(*num_codes[*found ]) - // - log_quantiser_->getLog10Value(*denom_codes[*found - 1]) + //log_prob = log_quantiser_->getLog10Value(*num_codes[*found ]) + // - log_quantiser_->getLog10Value(*denom_codes[*found - 1]) // + stupid_backoff_log10_[len - *found]; } context_state = (const void*)num_codes[found == len ? 
found - 1 : found];; - //probCache_->store(len, log_prob, context_state); + //probCache_->store(len, log_prob, context_state); if (state) *state = context_state; return log_prob; diff --git a/moses/TranslationModel/DynSAInclude/params.cpp b/moses/TranslationModel/DynSAInclude/params.cpp index 4be3a1676..a4d51d5b2 100644 --- a/moses/TranslationModel/DynSAInclude/params.cpp +++ b/moses/TranslationModel/DynSAInclude/params.cpp @@ -1,10 +1,11 @@ #include "params.h" -namespace Moses { +namespace Moses +{ // parameter constants const std::string Parameters::kNotSetValue = "__NOT_SET__"; -const int Parameters::kBoolValue = 0; +const int Parameters::kBoolValue = 0; const int Parameters::kIntValue = 1; const int Parameters::kFloatValue = 2; const int Parameters::kStringValue = 3; @@ -13,26 +14,30 @@ const int Parameters::kUndefinedValue = -1; const std::string Parameters::kTrueValue = "1"; const std::string Parameters::kFalseValue = "0"; -Parameters::Parameters(const ParamDefs * paramdefs, const count_t paramNum) { +Parameters::Parameters(const ParamDefs * paramdefs, const count_t paramNum) +{ initialize(paramdefs, paramNum); } -Parameters::Parameters(int argc, char ** argv, const ParamDefs * paramdefs, - const count_t paramNum) { +Parameters::Parameters(int argc, char ** argv, const ParamDefs * paramdefs, + const count_t paramNum) +{ initialize(paramdefs, paramNum); loadParams(argc, argv); } -void Parameters::initialize(const ParamDefs * paramdefs, const count_t paramNum) { +void Parameters::initialize(const ParamDefs * paramdefs, const count_t paramNum) +{ for( count_t i = 0; i < paramNum; i++ ) { params_[paramdefs[i].name] = paramdefs[i]; // assign name } cerr << "Default parameter values:\n"; - iterate(params_, itr) - cerr << "\t" << itr->first << " --> " << itr->second.value << endl; + iterate(params_, itr) + cerr << "\t" << itr->first << " --> " << itr->second.value << endl; } -bool Parameters::loadParams(int argc, char ** argv) { +bool Parameters::loadParams(int argc, char ** argv) +{ // load params from commandline args //if( argc < 3 ) { // fprintf(stderr, "ERROR: No parameters. Use \"-config\" or \"-f\" to specify configuration file.\n"); @@ -66,7 +71,7 @@ bool Parameters::loadParams(int argc, char ** argv) { std::string val = argv[i+1]; Utils::trim(val); if( param == "config" ) - load_from_file = true; + load_from_file = true; if(!setParamValue(param, val)) { std::cerr << "Invalid Param name->value " << param << "->" << val << std::endl; return false; @@ -80,35 +85,40 @@ bool Parameters::loadParams(int argc, char ** argv) { return success; } -std::string Parameters::normaliseParamName(const std::string & name) { +std::string Parameters::normaliseParamName(const std::string & name) +{ // Map valid abbreviations to long names. Retain other names. 
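For context on the abbreviation lookup below: a tool declares its options as a plain array of ParamDefs (name, default value, abbreviation, type, description) and hands it to the Parameters constructor; normaliseParamName() then lets users type either the long or the short form. A hypothetical setup, with invented option names:

#include <string>
#include "params.h"

// Invented parameter table for illustration; field order follows ParamDefs.
static const Moses::ParamDefs kDefs[] = {
  {"config", Moses::Parameters::kNotSetValue, "f", Moses::Parameters::kStringValue, "configuration file"},
  {"order", "5", "n", Moses::Parameters::kIntValue, "n-gram order"}
};

int main(int argc, char** argv) {
  Moses::Parameters params(argc, argv, kDefs, NumOfParams(kDefs));
  // normaliseParamName() maps "n" back to "order", so the user may pass
  // the abbreviation while the code reads the canonical name:
  std::string order = params.getParamValue("order");
  return order == Moses::Parameters::kNotSetValue ? 1 : 0;
}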
if( params_.find(name) == params_.end() ) - iterate(params_, i) - if( i->second.abbrev == name ) - return i->first; + iterate(params_, i) + if( i->second.abbrev == name ) + return i->first; return name; } -int Parameters::getValueType(const std::string& name) { +int Parameters::getValueType(const std::string& name) +{ if(params_.find(name) != params_.end()) return params_[name].type; return Parameters::kUndefinedValue; } -bool Parameters::isValidParamName(const std::string & name) { - return params_.find(name) != params_.end(); +bool Parameters::isValidParamName(const std::string & name) +{ + return params_.find(name) != params_.end(); } -bool Parameters::setParamValue(const std::string& name, const std::string& val) { - // TODO: Add basic type checking w verifyValueType() - bool set = isValidParamName(name); - if(set) { - params_[name].value = val; +bool Parameters::setParamValue(const std::string& name, const std::string& val) +{ + // TODO: Add basic type checking w verifyValueType() + bool set = isValidParamName(name); + if(set) { + params_[name].value = val; std::cerr << "PARAM SET: "<< name << "=" << val << std::endl; } return( set ); } -std::string Parameters::getParamValue(const std::string& name) { +std::string Parameters::getParamValue(const std::string& name) +{ std::string value = Parameters::kNotSetValue; if(isValidParamName(name)) if(params_.find(name) != params_.end()) @@ -117,43 +127,46 @@ std::string Parameters::getParamValue(const std::string& name) { value = kFalseValue; return value; } -std::string Parameters::getParam(const std::string& name) { +std::string Parameters::getParam(const std::string& name) +{ return getParamValue(name); -/*void* Parameters::getParam(const std::string& name) { - void* paramVal = 0; - int type = getValueType(name); - const char* sval = getParamValue(name).c_str(); - switch(type) { - case kIntValue: { - int ival = atoi(sval); - paramVal = (void*)&ival; - break; + /*void* Parameters::getParam(const std::string& name) { + void* paramVal = 0; + int type = getValueType(name); + const char* sval = getParamValue(name).c_str(); + switch(type) { + case kIntValue: { + int ival = atoi(sval); + paramVal = (void*)&ival; + break; + } + case kFloatValue: { + float fval = atof(sval); + paramVal = (void*)&fval; + break; + } + case kStringValue: { + paramVal = (void*)sval; + break; + } + case kBoolValue: { + bool bval = sval == Parameters::kTrueValue ? true : false; + paramVal = (void*)&bval; + break; + } + default: // --> Parameters::kUndefinedValue + paramVal = (void*)sval; // will set to Parameters::kNotSetValue } - case kFloatValue: { - float fval = atof(sval); - paramVal = (void*)&fval; - break; - } - case kStringValue: { - paramVal = (void*)sval; - break; - } - case kBoolValue: { - bool bval = sval == Parameters::kTrueValue ? 
true : false; - paramVal = (void*)&bval; - break; - } - default: // --> Parameters::kUndefinedValue - paramVal = (void*)sval; // will set to Parameters::kNotSetValue - } - return paramVal;*/ + return paramVal;*/ } -bool Parameters::verifyValueType(const std::string& name, const std::string& val) { +bool Parameters::verifyValueType(const std::string& name, const std::string& val) +{ // Implement basic type checking return true; } -int Parameters::getParamCount() const { +int Parameters::getParamCount() const +{ return params_.size(); } @@ -161,7 +174,8 @@ int Parameters::getParamCount() const { * HAVE TO CHANGE loadParams() from file to not overwrite command lines but * override default if different*/ bool Parameters::loadParams(const std::string & file_path, - std::set& setParams) { + std::set& setParams) +{ // parameters loaded from file don't override cmd line paramters /*std::set::iterator end = setParams.end(); FileHandler file(file_path.c_str(), std::ios::in); diff --git a/moses/TranslationModel/DynSAInclude/params.h b/moses/TranslationModel/DynSAInclude/params.h index d5af6331d..efc0a6ba3 100644 --- a/moses/TranslationModel/DynSAInclude/params.h +++ b/moses/TranslationModel/DynSAInclude/params.h @@ -10,21 +10,23 @@ #include "utils.h" #include "types.h" -#define NumOfParams(paramArray) (sizeof(paramArray)/sizeof((paramArray)[0])) +#define NumOfParams(paramArray) (sizeof(paramArray)/sizeof((paramArray)[0])) -namespace Moses { +namespace Moses +{ typedef struct ParamDefs { std::string name; - std::string value; + std::string value; std::string abbrev; int type; std::string description; } ParamDefs; - //! @todo ask abby2 -class Parameters { +//! @todo ask abby2 +class Parameters +{ public: - static const std::string kNotSetValue; + static const std::string kNotSetValue; static const int kBoolValue; static const int kIntValue; static const int kFloatValue; @@ -32,15 +34,15 @@ public: static const int kUndefinedValue; static const std::string kFalseValue; static const std::string kTrueValue; - + Parameters(const ParamDefs * paramdefs, const count_t paramNum); Parameters(int argc, char** argv, const ParamDefs * paramdefs, const count_t paramNum); ~Parameters() {} bool loadParams(int argc, char ** argv); bool loadParams(const std::string& param_file, std::set&); int getValueType(const std::string & name); - bool setParamValue(const std::string& name, const std::string& value); - bool verifyValueType(const std::string& name, const std::string& value); + bool setParamValue(const std::string& name, const std::string& value); + bool verifyValueType(const std::string& name, const std::string& value); bool isValidParamName(const std::string & name); std::string getParamValue(const std::string& name); //void* getParam(const std::string& name); diff --git a/moses/TranslationModel/DynSAInclude/perfectHash.h b/moses/TranslationModel/DynSAInclude/perfectHash.h index f445e063a..8ea20fa06 100644 --- a/moses/TranslationModel/DynSAInclude/perfectHash.h +++ b/moses/TranslationModel/DynSAInclude/perfectHash.h @@ -9,18 +9,19 @@ #include "quantizer.h" /** - * PerfectHash handles setting up hash functions and storage - * for LM data. - */ + * PerfectHash handles setting up hash functions and storage + * for LM data. + */ using randlm::Filter; using randlm::BitFilter; typedef std::map hpDict_t; typedef hpDict_t::iterator hpdEntry_t; static count_t collisions_ = 0; -/* Based on Mortenson et. al. 2006 */ +/* Based on Mortenson et. al. 
2006 */ template -class PerfectHash { +class PerfectHash +{ public: PerfectHash(uint16_t MBs, int width, int bucketRange, float qBase); PerfectHash(FileHandler* fin) { @@ -41,11 +42,11 @@ protected: uint8_t* idxTracker_; uint64_t insert(const wordID_t* IDs, const int len, const count_t value); bool update(const wordID_t* IDs, const int len, const count_t value, - hpdEntry_t& hpdAddr, uint64_t& filterIdx); + hpdEntry_t& hpdAddr, uint64_t& filterIdx); bool update2(const wordID_t* IDs, const int len, const count_t value, - hpdEntry_t& hpdAddr, uint64_t& filterIdx); - int query(const wordID_t* IDs, const int len, - hpdEntry_t& hpdAddr, uint64_t& filterIdx); + hpdEntry_t& hpdAddr, uint64_t& filterIdx); + int query(const wordID_t* IDs, const int len, + hpdEntry_t& hpdAddr, uint64_t& filterIdx); virtual void remove(const wordID_t* IDs, const int len); void remove(uint64_t index); void save(FileHandler* fout); @@ -54,33 +55,34 @@ protected: //pointer to a specific entry in a hpDict_t virtual void markQueried(hpdEntry_t&)=0; private: - T nonZeroSignature(const wordID_t* IDs, const int len, count_t bucket); + T nonZeroSignature(const wordID_t* IDs, const int len, count_t bucket); string hpDictKeyValue(const wordID_t* IDs, const int len); uint64_t memBound_; // total memory bound in bytes uint16_t cellWidth_; // in bits - UnivHash_linear* bucketHash_; + UnivHash_linear* bucketHash_; UnivHash_linear* fingerHash_; LogQtizer* qtizer_; }; template -PerfectHash::PerfectHash(uint16_t MBs, int width, int bucketRange, - float qBase): hitMask_(1 << 31), memBound_(MBs * (1ULL << 20)), - cellWidth_(width) { +PerfectHash::PerfectHash(uint16_t MBs, int width, int bucketRange, + float qBase): hitMask_(1 << 31), memBound_(MBs * (1ULL << 20)), + cellWidth_(width) +{ bucketRange_ = static_cast(bucketRange); if(bucketRange > 255) { - cerr << "ERROR: Max bucket range is > 2^8\n"; + cerr << "ERROR: Max bucket range is > 2^8\n"; exit(1); } qtizer_ = new LogQtizer(qBase); int valBits = (int)ceil(log2((float)qtizer_->maxcode())); cerr << "BITS FOR VALUES ARRAY = " << valBits << endl; uint64_t totalBits = memBound_ << 3; - cells_ = (uint64_t) ceil((float)totalBits / (float)(cellWidth_ + valBits)); // upper bound on cells + cells_ = (uint64_t) ceil((float)totalBits / (float)(cellWidth_ + valBits)); // upper bound on cells cells_ += (cells_ % bucketRange_); // make cells multiple of bucket range totBuckets_ = (cells_ / bucketRange_) - 1; // minus 1 so totBuckets * bucksize + bucksize = cells filter_ = new Filter(cells_, cellWidth_); - values_ = new Filter(cells_, valBits); + values_ = new Filter(cells_, valBits); idxTracker_ = new uint8_t[totBuckets_]; for(int i=0; i < totBuckets_; ++i) idxTracker_[i] = 0; // initialize ranges for each hash function @@ -89,7 +91,8 @@ PerfectHash::PerfectHash(uint16_t MBs, int width, int bucketRange, } template -PerfectHash::~PerfectHash() { +PerfectHash::~PerfectHash() +{ delete[] idxTracker_; delete filter_; filter_ = NULL; @@ -99,22 +102,22 @@ PerfectHash::~PerfectHash() { delete values_; } -template -uint64_t PerfectHash::insert(const wordID_t* IDs, const int len, - const count_t value) { +template +uint64_t PerfectHash::insert(const wordID_t* IDs, const int len, + const count_t value) +{ count_t bucket = (bucketHash_->size() > 1 ? 
bucketHash_->hash(IDs, len, len) : bucketHash_->hash(IDs, len, 0)); - if(idxTracker_[bucket] < (int)bucketRange_) { // if empty rows + if(idxTracker_[bucket] < (int)bucketRange_) { // if empty rows // restriction on fprint value is non-zero T fp = nonZeroSignature(IDs, len, (bucket % MAX_HASH_FUNCS)); uint64_t emptyidx = cells_ + 1; uint64_t index = bucket * bucketRange_, // starting bucket row - lastrow = index + bucketRange_; // ending row - while(index < lastrow) { // unique so check each row for "matching" signature + lastrow = index + bucketRange_; // ending row + while(index < lastrow) { // unique so check each row for "matching" signature T filterVal = filter_->read(index); - if((filterVal == 0) && (emptyidx == cells_ + 1)) { // record first empty row + if((filterVal == 0) && (emptyidx == cells_ + 1)) { // record first empty row emptyidx = index; - } - else if(filterVal == fp) { + } else if(filterVal == fp) { ++collisions_; dict_[hpDictKeyValue(IDs, len)] = value; // store exact in hpd return cells_ + 1; // finished @@ -127,21 +130,21 @@ uint64_t PerfectHash::insert(const wordID_t* IDs, const int len, values_->write(emptyidx, code); ++idxTracker_[bucket]; // keep track of bucket size return emptyidx; - } - else { // bucket is full + } else { // bucket is full dict_[hpDictKeyValue(IDs, len)] = value; // add to hpd return cells_ + 1; } } -template -bool PerfectHash::update(const wordID_t* IDs, const int len, - const count_t value, hpdEntry_t& hpdAddr, uint64_t& filterIdx) { +template +bool PerfectHash::update(const wordID_t* IDs, const int len, + const count_t value, hpdEntry_t& hpdAddr, uint64_t& filterIdx) +{ // check if key is in high perf. dictionary filterIdx = cells_ + 1; string skey = hpDictKeyValue(IDs, len); if((hpdAddr = dict_.find(skey)) != dict_.end()) { - hpdAddr->second = value; + hpdAddr->second = value; return true; } // else hash ngram @@ -150,45 +153,45 @@ bool PerfectHash::update(const wordID_t* IDs, const int len, // restriction on fprint value is non-zero T fp = nonZeroSignature(IDs, len, (bucket % MAX_HASH_FUNCS)); uint64_t index = bucket * bucketRange_, // starting bucket row - lastrow = index + bucketRange_; + lastrow = index + bucketRange_; while(index < lastrow) { // must check each row for matching fp event T filterVal = filter_->read(index); if(filterVal == fp) { // found event w.h.p. - values_->write(index, (T)qtizer_->code(value)); + values_->write(index, (T)qtizer_->code(value)); filterIdx = index; return true; } ++index; } - // could add if it gets here. + // could add if it gets here. return false; } -template -int PerfectHash::query(const wordID_t* IDs, const int len, - hpdEntry_t& hpdAddr, uint64_t& filterIdx) { +template +int PerfectHash::query(const wordID_t* IDs, const int len, + hpdEntry_t& hpdAddr, uint64_t& filterIdx) +{ // check if key is in high perf. dictionary string skey = hpDictKeyValue(IDs, len); if((hpdAddr = dict_.find(skey)) != dict_.end()) { filterIdx = cells_ + 1; return(hpdAddr->second); // returns copy of value - } - else { // check if key is in filter - // get bucket + } else { // check if key is in filter + // get bucket //count_t bucket = bucketHash_->hash(IDs, len); count_t bucket = (bucketHash_->size() > 1 ? 
bucketHash_->hash(IDs, len, len) : bucketHash_->hash(IDs, len, 0)); // restriction on fprint value is non-zero T fp = nonZeroSignature(IDs, len, (bucket % MAX_HASH_FUNCS)); // return value if ngram is in filter uint64_t index = bucket * bucketRange_, - lastrow = index + bucketRange_; + lastrow = index + bucketRange_; for(; index < lastrow; ++index) { if(filter_->read(index) == fp) { - //cout << "fp = " << fp << "\tbucket = " << bucket << "\tfilter =" << - //filter_->read(index) << "\tcode = " << code << endl; + //cout << "fp = " << fp << "\tbucket = " << bucket << "\tfilter =" << + //filter_->read(index) << "\tcode = " << code << endl; filterIdx = index; hpdAddr = dict_.end(); - return (int)qtizer_->value(values_->read(index)); + return (int)qtizer_->value(values_->read(index)); } } } @@ -196,22 +199,23 @@ int PerfectHash::query(const wordID_t* IDs, const int len, } template -void PerfectHash::remove(const wordID_t* IDs, const int len) { +void PerfectHash::remove(const wordID_t* IDs, const int len) +{ // delete key if in high perf. dictionary string skey = hpDictKeyValue(IDs, len); if(dict_.find(skey) != dict_.end()) dict_.erase(skey); else { // check if key is in filter - // get small representation for ngrams + // get small representation for ngrams //count_t bucket = bucketHash_->hash(IDs, len); count_t bucket = (bucketHash_->size() > 1? bucketHash_->hash(IDs, len, len) : bucketHash_->hash(IDs, len, 0)); // retrieve non zero fingerprint for ngram - T fp = nonZeroSignature(IDs, len, (bucket % MAX_HASH_FUNCS)); + T fp = nonZeroSignature(IDs, len, (bucket % MAX_HASH_FUNCS)); // return value if ngram is in filter uint64_t index = bucket * bucketRange_, - lastrow = index + bucketRange_; + lastrow = index + bucketRange_; for(; index < lastrow; ++index) { - if(filter_->read(index) == fp) { + if(filter_->read(index) == fp) { filter_->write(index, 0); values_->write(index, 0); --idxTracker_[bucket]; // track bucket size reduction @@ -222,7 +226,8 @@ void PerfectHash::remove(const wordID_t* IDs, const int len) { } template // clear filter index -void PerfectHash::remove(uint64_t index) { +void PerfectHash::remove(uint64_t index) +{ CHECK(index < cells_); CHECK(filter_->read(index) != 0); // slow filter_->write(index, 0); @@ -234,20 +239,22 @@ void PerfectHash::remove(uint64_t index) { template T PerfectHash::nonZeroSignature(const wordID_t* IDs, const int len, - count_t bucket) { + count_t bucket) +{ count_t h = bucket; T fingerprint(0); do { fingerprint = fingerHash_->hash(IDs, len, h); - h += (h < fingerHash_->size() - 1 ? 1 : -h); // wrap around + h += (h < fingerHash_->size() - 1 ? 
1 : -h); // wrap around } while((fingerprint == 0) && (h != bucket)); - if(fingerprint == 0) + if(fingerprint == 0) cerr << "WARNING: Unable to find non-zero signature for ngram\n" << endl; return fingerprint; } template -string PerfectHash::hpDictKeyValue(const wordID_t* IDs, const int len) { +string PerfectHash::hpDictKeyValue(const wordID_t* IDs, const int len) +{ string skey(" "); for(int i = 0; i < len; ++i) skey += Utils::IntToStr(IDs[i]) + "¬"; @@ -256,19 +263,22 @@ string PerfectHash::hpDictKeyValue(const wordID_t* IDs, const int len) { } template -count_t PerfectHash::hpDictMemUse() { +count_t PerfectHash::hpDictMemUse() +{ // return hpDict memory usage in MBs return (count_t) sizeof(hpDict_t::value_type)* dict_.size() >> 20; } template -count_t PerfectHash::bucketsMemUse() { +count_t PerfectHash::bucketsMemUse() +{ // return bucket memory usage in MBs - return (count_t) (filter_->size() + values_->size()); + return (count_t) (filter_->size() + values_->size()); } template -void PerfectHash::save(FileHandler* fout) { +void PerfectHash::save(FileHandler* fout) +{ CHECK(fout != 0); cerr << "\tSaving perfect hash parameters...\n"; fout->write((char*)&hitMask_, sizeof(hitMask_)); @@ -289,12 +299,13 @@ void PerfectHash::save(FileHandler* fout) { count_t size = dict_.size(); fout->write((char*)&size, sizeof(count_t)); *fout << endl; - iterate(dict_, t) - *fout << t->first << "\t" << t->second << "\n"; + iterate(dict_, t) + *fout << t->first << "\t" << t->second << "\n"; } template -void PerfectHash::load(FileHandler* fin) { +void PerfectHash::load(FileHandler* fin) +{ CHECK(fin != 0); cerr << "\tLoading perfect hash parameters...\n"; fin->read((char*)&hitMask_, sizeof(hitMask_)); @@ -331,12 +342,13 @@ void PerfectHash::load(FileHandler* fin) { } template -void PerfectHash::analyze() { +void PerfectHash::analyze() +{ cerr << "Analyzing Dynamic Bloomier Filter...\n"; // see how many items in each bucket uint8_t* bucketCnt = new uint8_t[totBuckets_]; - unsigned largestBucket = 0, totalCellsSet = 0, - smallestBucket = bucketRange_, totalZeroes = 0; + unsigned largestBucket = 0, totalCellsSet = 0, + smallestBucket = bucketRange_, totalZeroes = 0; int curBucket = -1, fullBuckets(0); for(int i = 0; i < totBuckets_; ++i) bucketCnt[i] = 0; for(uint64_t i =0; i < cells_; ++i) { @@ -344,16 +356,14 @@ void PerfectHash::analyze() { if(filter_->read(i) != 0) { ++bucketCnt[curBucket]; ++totalCellsSet; - } - else ++totalZeroes; + } else ++totalZeroes; } count_t bi = 0, si = 0; for(int i = 0; i < totBuckets_; ++i) { if(bucketCnt[i] > largestBucket) { largestBucket = bucketCnt[i]; bi = i; - } - else if(bucketCnt[i] < smallestBucket) { + } else if(bucketCnt[i] < smallestBucket) { smallestBucket = bucketCnt[i]; si = i; } @@ -366,8 +376,8 @@ void PerfectHash::analyze() { } for(int i = 0; i < totBuckets_; ++i) { if(bucketCnt[i] != idxTracker_[i]) - cerr << "bucketCnt[" << i << "] = " << (int)bucketCnt[i] << - "\tidxTracker_[" << i << "] = " << (int)idxTracker_[i] << endl; + cerr << "bucketCnt[" << i << "] = " << (int)bucketCnt[i] << + "\tidxTracker_[" << i << "] = " << (int)idxTracker_[i] << endl; } cerr << "total cells= " << cells_ << endl; cerr << "total buckets= " << totBuckets_ << endl; @@ -380,7 +390,7 @@ void PerfectHash::analyze() { cerr << "largest bucket (" << bi << ") size= " << largestBucket << endl; cerr << "smallest bucket (" << si << ") size= " << smallestBucket << endl; cerr << "last bucket size= " << (int)bucketCnt[totBuckets_ - 1] << - " (idxTracker last bucket size = " << 
(int)idxTracker_[totBuckets_ - 1] << ")" << endl; + " (idxTracker last bucket size = " << (int)idxTracker_[totBuckets_ - 1] << ")" << endl; cerr << "total buckets full = " << fullBuckets << endl; cerr << "total collision errors= " << collisions_ << endl; cerr << "high performance dictionary size= " << dict_.size() << endl; @@ -390,14 +400,15 @@ void PerfectHash::analyze() { delete[] bucketCnt; } -template -bool PerfectHash::update2(const wordID_t* IDs, const int len, - const count_t value, hpdEntry_t& hpdAddr, uint64_t& filterIdx) { +template +bool PerfectHash::update2(const wordID_t* IDs, const int len, + const count_t value, hpdEntry_t& hpdAddr, uint64_t& filterIdx) +{ // check if key is in high perf. dictionary filterIdx = cells_ + 1; string skey = hpDictKeyValue(IDs, len); if((hpdAddr = dict_.find(skey)) != dict_.end()) { - hpdAddr->second += value; + hpdAddr->second += value; return true; } // else hash ngram @@ -406,18 +417,18 @@ bool PerfectHash::update2(const wordID_t* IDs, const int len, // restriction on fprint value is non-zero T fp = nonZeroSignature(IDs, len, (bucket % MAX_HASH_FUNCS)); uint64_t index = bucket * bucketRange_, // starting bucket row - lastrow = index + bucketRange_; + lastrow = index + bucketRange_; while(index < lastrow) { // must check each row for matching fp event T filterVal = filter_->read(index); if(filterVal == fp) { // found event w.h.p. - int oldval = (int)qtizer_->value(values_->read(index)); - values_->write(index, (T)qtizer_->code(oldval + value)); + int oldval = (int)qtizer_->value(values_->read(index)); + values_->write(index, (T)qtizer_->code(oldval + value)); filterIdx = index; return true; } ++index; } - // add if it gets here. + // add if it gets here. insert(IDs, len, value); return false; } diff --git a/moses/TranslationModel/DynSAInclude/quantizer.h b/moses/TranslationModel/DynSAInclude/quantizer.h index 6c6850fa6..68d6a55a3 100644 --- a/moses/TranslationModel/DynSAInclude/quantizer.h +++ b/moses/TranslationModel/DynSAInclude/quantizer.h @@ -14,7 +14,8 @@ static const float kFloatErr = 0.00001f; #endif //! @todo ask abby2 -class LogQtizer { +class LogQtizer +{ public: LogQtizer(float i): base_(pow(2, 1 / i)) { CHECK(base_ > 1); @@ -22,8 +23,8 @@ public: float value = 1; // code = 1 -> value = 1 for any base std::vector code_to_value_vec; while (log2(value) < 30) { // assume 2^30 is largest count - code_to_value_vec.push_back(value); - value = pow(base_, ++max_code_); + code_to_value_vec.push_back(value); + value = pow(base_, ++max_code_); } code_to_value_vec.push_back(value); // store max_code_ so in total [0, max_code_] // get valid range @@ -46,22 +47,22 @@ public: int code(float value) { // should just be: return log_b(value) CHECK(!(value < min_value_ || value > max_value_)); - // but binary search removes errors due to floor operator above - int code = static_cast(std::lower_bound(code_to_value_, code_to_value_+ max_code_, - value) - code_to_value_); - // make sure not overestimating + // but binary search removes errors due to floor operator above + int code = static_cast(std::lower_bound(code_to_value_, code_to_value_+ max_code_, + value) - code_to_value_); + // make sure not overestimating code = code_to_value_[code] > value ? 
code - 1 : code; return code; } inline float value(int code) { - // table look up for values + // table look up for values return code_to_value_[code]; } inline int maxcode() { return max_code_; } inline float logValue(int code) { - // table look up for log of values + // table look up for log of values return code_to_log_value_[code]; } ~LogQtizer() { @@ -75,15 +76,15 @@ public: fout->write((char*)&min_value_, sizeof(min_value_)); for (int j = 0; j <= max_code_; ++j) fout->write((char*)&code_to_value_[j], sizeof(code_to_value_[j])); - for (int j = 0; j <= max_code_; ++j) + for (int j = 0; j <= max_code_; ++j) fout->write((char*)&code_to_log_value_[j], sizeof(code_to_log_value_[j])); std::cerr << "Saved log codebook with " << max_code_ + 1 << " codes." <size(); ++i) { if(m_F->at(i) == word) { - return i; + return i; } } return -1;*/ - //NOTE: lower_bound is faster than linear search above but may cause issues - // if ordering of vocab is not consecutive (ie..after deletions) + //NOTE: lower_bound is faster than linear search above but may cause issues + // if ordering of vocab is not consecutive (ie..after deletions) int low = std::lower_bound(m_F->begin(), m_F->end(), word) - m_F->begin(); //cerr << "in F_firstIdx with word = " << word << " and low = " << low << " and F->size() =" << m_F->size() << endl; if((size_t)low >= m_F->size()) @@ -146,8 +146,8 @@ void DynSuffixArray::Reorder(unsigned j, unsigned jprime) { set > seen; while(j != jprime) { - // this 'seenit' check added for data with many loops. will remove after double - // checking. + // this 'seenit' check added for data with many loops. will remove after double + // checking. bool seenit = seen.insert(std::make_pair(j, jprime)).second; if(seenit) { for(size_t i=1; i < m_SA->size(); ++i) { @@ -163,9 +163,9 @@ void DynSuffixArray::Reorder(unsigned j, unsigned jprime) int new_j = LastFirstFunc(j); CHECK(j <= jprime); // for SA and L, the element at pos j is moved to pos j' - m_L->insert(m_L->begin() + jprime + 1, m_L->at(j)); + m_L->insert(m_L->begin() + jprime + 1, m_L->at(j)); m_L->erase(m_L->begin() + j); - m_SA->insert(m_SA->begin() + jprime + 1, m_SA->at(j)); + m_SA->insert(m_SA->begin() + jprime + 1, m_SA->at(j)); m_SA->erase(m_SA->begin() + j); // all ISA values between (j...j'] decremented for(size_t i = 0; i < m_ISA->size(); ++i) { diff --git a/moses/TranslationModel/PhraseDictionary.cpp b/moses/TranslationModel/PhraseDictionary.cpp index 808f7ce81..a0c94ccdc 100644 --- a/moses/TranslationModel/PhraseDictionary.cpp +++ b/moses/TranslationModel/PhraseDictionary.cpp @@ -31,7 +31,7 @@ namespace Moses { PhraseDictionary::PhraseDictionary(const std::string &description, const std::string &line) -:DecodeFeature(description, line) + :DecodeFeature(description, line) { m_tableLimit= 20; // TODO default? 
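The next hunk tidies the constructor's argument handling: a feature is configured by a single line of space-separated key=value pairs following the feature name. A simplified standalone sketch of that convention (not the actual Moses tokenizer, which splits the line into m_args up front and converts values with Scan<>):

#include <iostream>
#include <sstream>
#include <string>

// Parse a feature line such as:
//   PhraseDictionaryMemory path=phrase-table.gz table-limit=20
int main() {
  std::istringstream line("PhraseDictionaryMemory path=phrase-table.gz table-limit=20");
  std::string feature, tok;
  line >> feature;                          // feature name comes first
  std::string path;
  size_t tableLimit = 20;                   // default, as in the constructor above
  while (line >> tok) {
    size_t eq = tok.find('=');
    if (eq == std::string::npos) continue;  // skip malformed tokens
    std::string key = tok.substr(0, eq), val = tok.substr(eq + 1);
    if (key == "path") {
      path = val;
    } else if (key == "table-limit") {
      std::istringstream conv(val);
      conv >> tableLimit;                   // Scan<size_t>(val) in Moses
    }
    // unknown keys fall through silently, mirroring the commented-out throw
  }
  std::cout << feature << " " << path << " " << tableLimit << std::endl;
  return 0;
}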
@@ -40,20 +40,15 @@ PhraseDictionary::PhraseDictionary(const std::string &description, const std::st if (args[0] == "num-input-features") { m_numInputScores = Scan(args[1]); - } - else if (args[0] == "path") { + } else if (args[0] == "path") { m_filePath = args[1]; - } - else if (args[0] == "table-limit") { + } else if (args[0] == "table-limit") { m_tableLimit = Scan(args[1]); - } - else if (args[0] == "target-path") { + } else if (args[0] == "target-path") { m_targetFile = args[1]; - } - else if (args[0] == "alignment-path") { + } else if (args[0] == "alignment-path") { m_alignmentsFile = args[1]; - } - else { + } else { //throw "Unknown argument " + args[0]; } } // for (size_t i = 0; i < toks.size(); ++i) { diff --git a/moses/TranslationModel/PhraseDictionary.h b/moses/TranslationModel/PhraseDictionary.h index 4c10c2e6b..1b1197eb1 100644 --- a/moses/TranslationModel/PhraseDictionary.h +++ b/moses/TranslationModel/PhraseDictionary.h @@ -88,7 +88,9 @@ public: const PhraseDictionary* GetDictionary() const; PhraseDictionary* GetDictionary(); - const std::string &GetFilePath() const { return m_filePath; } + const std::string &GetFilePath() const { + return m_filePath; + } protected: size_t m_tableLimit; diff --git a/moses/TranslationModel/PhraseDictionaryDynSuffixArray.cpp b/moses/TranslationModel/PhraseDictionaryDynSuffixArray.cpp index 126dd3365..afa1c4abc 100644 --- a/moses/TranslationModel/PhraseDictionaryDynSuffixArray.cpp +++ b/moses/TranslationModel/PhraseDictionaryDynSuffixArray.cpp @@ -9,7 +9,7 @@ using namespace std; namespace Moses { PhraseDictionaryDynSuffixArray::PhraseDictionaryDynSuffixArray(const std::string &line) -:PhraseDictionary("PhraseDictionaryDynSuffixArray", line) + :PhraseDictionary("PhraseDictionaryDynSuffixArray", line) { m_biSA = new BilingualDynSuffixArray(); } @@ -63,7 +63,7 @@ const TargetPhraseCollection *PhraseDictionaryDynSuffixArray::GetTargetPhraseCol void PhraseDictionaryDynSuffixArray::insertSnt(string& source, string& target, string& alignment) { m_biSA->addSntPair(source, target, alignment); // insert sentence pair into suffix arrays - //StaticData::Instance().ClearTransOptionCache(); // clear translation option cache + //StaticData::Instance().ClearTransOptionCache(); // clear translation option cache } void PhraseDictionaryDynSuffixArray::deleteSnt(unsigned /* idx */, unsigned /* num2Del */) { diff --git a/moses/TranslationModel/PhraseDictionaryMemory.cpp b/moses/TranslationModel/PhraseDictionaryMemory.cpp index 27cac9f5f..c43b919a4 100644 --- a/moses/TranslationModel/PhraseDictionaryMemory.cpp +++ b/moses/TranslationModel/PhraseDictionaryMemory.cpp @@ -41,9 +41,9 @@ namespace Moses { TargetPhraseCollection &PhraseDictionaryMemory::GetOrCreateTargetPhraseCollection( - const Phrase &source - , const TargetPhrase &target - , const Word *sourceLHS) + const Phrase &source + , const TargetPhrase &target + , const Word *sourceLHS) { PhraseDictionaryNodeMemory &currNode = GetOrCreateNode(source, target, sourceLHS); return currNode.GetOrCreateTargetPhraseCollection(); @@ -73,8 +73,8 @@ const TargetPhraseCollection *PhraseDictionaryMemory::GetTargetPhraseCollection( } PhraseDictionaryNodeMemory &PhraseDictionaryMemory::GetOrCreateNode(const Phrase &source - , const TargetPhrase &target - , const Word *sourceLHS) + , const TargetPhrase &target + , const Word *sourceLHS) { const size_t size = source.GetSize(); @@ -102,12 +102,12 @@ PhraseDictionaryNodeMemory &PhraseDictionaryMemory::GetOrCreateNode(const Phrase CHECK(currNode != NULL); } - + // finally, the 
source LHS //currNode = currNode->GetOrCreateChild(sourceLHS); //CHECK(currNode != NULL); - + return *currNode; } @@ -120,8 +120,7 @@ ChartRuleLookupManager *PhraseDictionaryMemory::CreateRuleLookupManager( void PhraseDictionaryMemory::SortAndPrune() { - if (GetTableLimit()) - { + if (GetTableLimit()) { m_collection.Sort(GetTableLimit()); } } diff --git a/moses/TranslationModel/PhraseDictionaryMemory.h b/moses/TranslationModel/PhraseDictionaryMemory.h index dad8b3bbd..d2a8d0ad3 100644 --- a/moses/TranslationModel/PhraseDictionaryMemory.h +++ b/moses/TranslationModel/PhraseDictionaryMemory.h @@ -1,17 +1,17 @@ /*********************************************************************** Moses - statistical machine translation system Copyright (C) 2006-2011 University of Edinburgh - + This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. - + This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. - + You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA @@ -38,15 +38,17 @@ class PhraseDictionaryMemory : public RuleTableTrie protected: PhraseDictionaryMemory(const std::string &description, const std::string &line) - : RuleTableTrie(description, line) + : RuleTableTrie(description, line) {} public: PhraseDictionaryMemory(const std::string &line) - : RuleTableTrie("PhraseDictionaryMemory", line) + : RuleTableTrie("PhraseDictionaryMemory", line) {} - const PhraseDictionaryNodeMemory &GetRootNode() const { return m_collection; } + const PhraseDictionaryNodeMemory &GetRootNode() const { + return m_collection; + } ChartRuleLookupManager *CreateRuleLookupManager( const InputType &, @@ -54,14 +56,14 @@ public: TO_STRING(); - protected: +protected: TargetPhraseCollection &GetOrCreateTargetPhraseCollection( - const Phrase &source, const TargetPhrase &target, const Word *sourceLHS); + const Phrase &source, const TargetPhrase &target, const Word *sourceLHS); const TargetPhraseCollection *GetTargetPhraseCollection(const Phrase& source) const; PhraseDictionaryNodeMemory &GetOrCreateNode(const Phrase &source - , const TargetPhrase &target - , const Word *sourceLHS); + , const TargetPhrase &target + , const Word *sourceLHS); void SortAndPrune(); diff --git a/moses/TranslationModel/PhraseDictionaryMultiModel.cpp b/moses/TranslationModel/PhraseDictionaryMultiModel.cpp index bf3f01a1e..e395cb5a3 100644 --- a/moses/TranslationModel/PhraseDictionaryMultiModel.cpp +++ b/moses/TranslationModel/PhraseDictionaryMultiModel.cpp @@ -26,7 +26,7 @@ namespace Moses { PhraseDictionaryMultiModel::PhraseDictionaryMultiModel(const std::string &line) -:PhraseDictionary("PhraseDictionaryMultiModel", line) + :PhraseDictionary("PhraseDictionaryMultiModel", line) { for (size_t i = 0; i < m_args.size(); ++i) { const vector &args = m_args[i]; @@ -37,12 +37,10 @@ PhraseDictionaryMultiModel::PhraseDictionaryMultiModel(const std::string &line) msg << "combination mode unknown: " << m_mode; throw runtime_error(msg.str()); } - } - else if (args[0] == "components") { + } else if (args[0] == "components") { m_pdStr = 
Tokenize(args[1], ","); m_numModels = m_pdStr.size(); - } - else if (args[0] == "lambda") { + } else if (args[0] == "lambda") { m_multimodelweights = Tokenize(args[1], ","); } } // for @@ -55,15 +53,14 @@ PhraseDictionaryMultiModel::PhraseDictionaryMultiModel(const std::string &line) } PhraseDictionaryMultiModel::PhraseDictionaryMultiModel(const std::string &description, const std::string &line) -:PhraseDictionary(description, line) + :PhraseDictionary(description, line) { for (size_t i = 0; i < m_args.size(); ++i) { const vector &args = m_args[i]; if (args[0] == "components") { m_pdStr = Tokenize(args[1], ","); m_numModels = m_pdStr.size(); - } - else if (args[0] == "lambda") { + } else if (args[0] == "lambda") { m_multimodelweights = Tokenize(args[1], ","); } } // for @@ -83,7 +80,7 @@ bool PhraseDictionaryMultiModel::InitDictionary() // one could choose a higher value than tableLimit (or 0) here for maximal precision, at a cost of speed. - for(size_t i = 0; i < m_numModels; ++i){ + for(size_t i = 0; i < m_numModels; ++i) { const string &ptName = m_pdStr[i]; PhraseDictionary *pt = FindPhraseDictionary(ptName); @@ -144,7 +141,7 @@ const TargetPhraseCollection *PhraseDictionaryMultiModel::GetTargetPhraseCollect void PhraseDictionaryMultiModel::CollectSufficientStatistics(const Phrase& src, std::map* allStats) const { - for(size_t i = 0; i < m_numModels; ++i){ + for(size_t i = 0; i < m_numModels; ++i) { const PhraseDictionary &pd = *m_pd[i]; TargetPhraseCollection *ret_raw = (TargetPhraseCollection*) pd.GetTargetPhraseCollection( src); @@ -152,10 +149,9 @@ void PhraseDictionaryMultiModel::CollectSufficientStatistics(const Phrase& src, TargetPhraseCollection::iterator iterTargetPhrase, iterLast; if (m_tableLimit != 0 && ret_raw->GetSize() > m_tableLimit) { - iterLast = ret_raw->begin() + m_tableLimit; - } - else { - iterLast = ret_raw->end(); + iterLast = ret_raw->begin() + m_tableLimit; + } else { + iterLast = ret_raw->end(); } for (iterTargetPhrase = ret_raw->begin(); iterTargetPhrase != iterLast; ++iterTargetPhrase) { @@ -173,9 +169,9 @@ void PhraseDictionaryMultiModel::CollectSufficientStatistics(const Phrase& src, Scores scoreVector(m_numScoreComponents); statistics->p.resize(m_numScoreComponents); - for(size_t j = 0; j < m_numScoreComponents; ++j){ - statistics->p[j].resize(m_numModels); - scoreVector[j] = -raw_scores[j]; + for(size_t j = 0; j < m_numScoreComponents; ++j) { + statistics->p[j].resize(m_numModels); + scoreVector[j] = -raw_scores[j]; } statistics->targetPhrase->GetScoreBreakdown().Assign(this, scoreVector); // set scores to 0 @@ -186,8 +182,8 @@ void PhraseDictionaryMultiModel::CollectSufficientStatistics(const Phrase& src, } multiModelStatistics * statistics = (*allStats)[targetString]; - for(size_t j = 0; j < m_numScoreComponents; ++j){ - statistics->p[j][i] = UntransformScore(raw_scores[j]); + for(size_t j = 0; j < m_numScoreComponents; ++j) { + statistics->p[j][i] = UntransformScore(raw_scores[j]); } (*allStats)[targetString] = statistics; @@ -199,26 +195,26 @@ void PhraseDictionaryMultiModel::CollectSufficientStatistics(const Phrase& src, TargetPhraseCollection* PhraseDictionaryMultiModel::CreateTargetPhraseCollectionLinearInterpolation(const Phrase& src, std::map* allStats, std::vector > &multimodelweights) const { - TargetPhraseCollection *ret = new TargetPhraseCollection(); - for ( std::map< std::string, multiModelStatistics*>::const_iterator iter = allStats->begin(); iter != allStats->end(); ++iter ) { + TargetPhraseCollection *ret = new TargetPhraseCollection(); 
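In the interpolation loop that follows, each feature column is combined across the component models as a weighted arithmetic mean in probability space and then mapped back to log space. A compact sketch of that step, with plain vectors in place of multiModelStatistics and assuming TransformScore is the natural log used for Moses scores:

#include <cassert>
#include <cmath>
#include <numeric>
#include <vector>

// One feature across m component models: p[k] is model k's probability for
// this phrase pair, w[k] its weight (normalizeWeights() makes them sum to 1).
double interpolateFeature(const std::vector<double>& p,
                          const std::vector<double>& w) {
  assert(p.size() == w.size());
  double mix = std::inner_product(p.begin(), p.end(), w.begin(), 0.0);
  return std::log(mix);  // plays the role of TransformScore(...)
}

The last feature is deliberately skipped and pinned to 1.0: it is assumed to be the phrase penalty, so interpolating it would only rescale a constant (hence the fixed value 2.718, i.e. e, mentioned in the perplexity-minimization comment further down).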
+ for ( std::map< std::string, multiModelStatistics*>::const_iterator iter = allStats->begin(); iter != allStats->end(); ++iter ) { - multiModelStatistics * statistics = iter->second; + multiModelStatistics * statistics = iter->second; - Scores scoreVector(m_numScoreComponents); + Scores scoreVector(m_numScoreComponents); - for(size_t i = 0; i < m_numScoreComponents-1; ++i){ - scoreVector[i] = TransformScore(std::inner_product(statistics->p[i].begin(), statistics->p[i].end(), multimodelweights[i].begin(), 0.0)); - } - - //assuming that last value is phrase penalty - scoreVector[m_numScoreComponents-1] = 1.0; - - statistics->targetPhrase->GetScoreBreakdown().Assign(this, scoreVector); - statistics->targetPhrase->Evaluate(src); - - ret->Add(new TargetPhrase(*statistics->targetPhrase)); + for(size_t i = 0; i < m_numScoreComponents-1; ++i) { + scoreVector[i] = TransformScore(std::inner_product(statistics->p[i].begin(), statistics->p[i].end(), multimodelweights[i].begin(), 0.0)); } - return ret; + + //assuming that last value is phrase penalty + scoreVector[m_numScoreComponents-1] = 1.0; + + statistics->targetPhrase->GetScoreBreakdown().Assign(this, scoreVector); + statistics->targetPhrase->Evaluate(src); + + ret->Add(new TargetPhrase(*statistics->targetPhrase)); + } + return ret; } @@ -235,8 +231,7 @@ std::vector > PhraseDictionaryMultiModel::getWeights(size_t n //checking weights passed to mosesserver; only valid for this sentence; *don't* raise exception if client weights are malformed if (weights_ptr == NULL || weights_ptr->size() == 0) { weights_ptr = &m_multimodelweights; //fall back to weights defined in config - } - else if(weights_ptr->size() != m_numModels && weights_ptr->size() != m_numModels * numWeights) { + } else if(weights_ptr->size() != m_numModels && weights_ptr->size() != m_numModels * numWeights) { //TODO: can we pass error message to client if weights are malformed? std::stringstream strme; strme << "Must have either one multimodel weight per model (" << m_numModels << "), or one per weighted feature and model (" << numWeights << "*" << m_numModels << "). You have " << weights_ptr->size() << ". Reverting to weights in config"; @@ -246,34 +241,30 @@ std::vector > PhraseDictionaryMultiModel::getWeights(size_t n //checking weights defined in config; only valid for this sentence; raise exception if config weights are malformed if (weights_ptr == NULL || weights_ptr->size() == 0) { - for (size_t i=0;i < m_numModels;i++) { + for (size_t i=0; i < m_numModels; i++) { raw_weights.push_back(1.0/m_numModels); //uniform weights created online } - } - else if(weights_ptr->size() != m_numModels && weights_ptr->size() != m_numModels * numWeights) { + } else if(weights_ptr->size() != m_numModels && weights_ptr->size() != m_numModels * numWeights) { std::stringstream strme; strme << "Must have either one multimodel weight per model (" << m_numModels << "), or one per weighted feature and model (" << numWeights << "*" << m_numModels << "). 
You have " << weights_ptr->size() << "."; UTIL_THROW(util::Exception, strme.str()); - } - else { - raw_weights = *weights_ptr; + } else { + raw_weights = *weights_ptr; } std::vector > multimodelweights (numWeights); - for (size_t i=0;i < numWeights;i++) { + for (size_t i=0; i < numWeights; i++) { std::vector weights_onefeature (m_numModels); if(raw_weights.size() == m_numModels) { - weights_onefeature = raw_weights; - } - else { - copy ( raw_weights.begin()+i*m_numModels, raw_weights.begin()+(i+1)*m_numModels, weights_onefeature.begin() ); + weights_onefeature = raw_weights; + } else { + copy ( raw_weights.begin()+i*m_numModels, raw_weights.begin()+(i+1)*m_numModels, weights_onefeature.begin() ); } if(normalize) { - multimodelweights[i] = normalizeWeights(weights_onefeature); - } - else { - multimodelweights[i] = weights_onefeature; + multimodelweights[i] = normalizeWeights(weights_onefeature); + } else { + multimodelweights[i] = weights_onefeature; } } @@ -282,12 +273,12 @@ std::vector > PhraseDictionaryMultiModel::getWeights(size_t n std::vector PhraseDictionaryMultiModel::normalizeWeights(std::vector &weights) const { - std::vector ret (m_numModels); - float total = std::accumulate(weights.begin(),weights.end(),0.0); - for (size_t i=0;i < weights.size();i++) { - ret[i] = weights[i]/total; - } - return ret; + std::vector ret (m_numModels); + float total = std::accumulate(weights.begin(),weights.end(),0.0); + for (size_t i=0; i < weights.size(); i++) { + ret[i] = weights[i]/total; + } + return ret; } @@ -298,7 +289,8 @@ ChartRuleLookupManager *PhraseDictionaryMultiModel::CreateRuleLookupManager(cons //copied from PhraseDictionaryCompact; free memory allocated to TargetPhraseCollection (and each TargetPhrase) at end of sentence -void PhraseDictionaryMultiModel::CacheForCleanup(TargetPhraseCollection* tpc) { +void PhraseDictionaryMultiModel::CacheForCleanup(TargetPhraseCollection* tpc) +{ #ifdef WITH_THREADS boost::mutex::scoped_lock lock(m_sentenceMutex); PhraseCache &ref = m_sentenceCache[boost::this_thread::get_id()]; @@ -309,7 +301,8 @@ void PhraseDictionaryMultiModel::CacheForCleanup(TargetPhraseCollection* tpc) { } -void PhraseDictionaryMultiModel::CleanUpAfterSentenceProcessing(const InputType &source) { +void PhraseDictionaryMultiModel::CleanUpAfterSentenceProcessing(const InputType &source) +{ #ifdef WITH_THREADS boost::mutex::scoped_lock lock(m_sentenceMutex); PhraseCache &ref = m_sentenceCache[boost::this_thread::get_id()]; @@ -317,7 +310,7 @@ void PhraseDictionaryMultiModel::CleanUpAfterSentenceProcessing(const InputType PhraseCache &ref = m_sentenceCache; #endif for(PhraseCache::iterator it = ref.begin(); it != ref.end(); it++) { - delete *it; + delete *it; } PhraseCache temp; @@ -331,149 +324,150 @@ void PhraseDictionaryMultiModel::CleanUpAfterSentenceProcessing(const InputType } -void PhraseDictionaryMultiModel::CleanUpComponentModels(const InputType &source) { - for(size_t i = 0; i < m_numModels; ++i){ +void PhraseDictionaryMultiModel::CleanUpComponentModels(const InputType &source) +{ + for(size_t i = 0; i < m_numModels; ++i) { m_pd[i]->CleanUpAfterSentenceProcessing(source); } } #ifdef WITH_DLIB -vector PhraseDictionaryMultiModel::MinimizePerplexity(vector > &phrase_pair_vector) { +vector PhraseDictionaryMultiModel::MinimizePerplexity(vector > &phrase_pair_vector) +{ - const StaticData &staticData = StaticData::Instance(); - const string& factorDelimiter = staticData.GetFactorDelimiter(); + const StaticData &staticData = StaticData::Instance(); + const string& 
factorDelimiter = staticData.GetFactorDelimiter(); - map, size_t> phrase_pair_map; + map, size_t> phrase_pair_map; - for ( vector >::const_iterator iter = phrase_pair_vector.begin(); iter != phrase_pair_vector.end(); ++iter ) { - phrase_pair_map[*iter] += 1; + for ( vector >::const_iterator iter = phrase_pair_vector.begin(); iter != phrase_pair_vector.end(); ++iter ) { + phrase_pair_map[*iter] += 1; + } + + vector optimizerStats; + + for ( map, size_t>::iterator iter = phrase_pair_map.begin(); iter != phrase_pair_map.end(); ++iter ) { + + pair phrase_pair = iter->first; + string source_string = phrase_pair.first; + string target_string = phrase_pair.second; + + vector fs(m_numModels); + map* allStats = new(map); + + Phrase sourcePhrase(0); + sourcePhrase.CreateFromString(Input, m_input, source_string, factorDelimiter, NULL); + + CollectSufficientStatistics(sourcePhrase, allStats); //optimization potential: only call this once per source phrase + + //phrase pair not found; leave cache empty + if (allStats->find(target_string) == allStats->end()) { + RemoveAllInMap(*allStats); + delete allStats; + continue; } - vector optimizerStats; + multiModelStatisticsOptimization* targetStatistics = new multiModelStatisticsOptimization(); + targetStatistics->targetPhrase = new TargetPhrase(*(*allStats)[target_string]->targetPhrase); + targetStatistics->p = (*allStats)[target_string]->p; + targetStatistics->f = iter->second; + optimizerStats.push_back(targetStatistics); - for ( map, size_t>::iterator iter = phrase_pair_map.begin(); iter != phrase_pair_map.end(); ++iter ) { + RemoveAllInMap(*allStats); + delete allStats; + } - pair phrase_pair = iter->first; - string source_string = phrase_pair.first; - string target_string = phrase_pair.second; + Sentence sentence; + CleanUpAfterSentenceProcessing(sentence); // free memory used by compact phrase tables - vector fs(m_numModels); - map* allStats = new(map); + size_t numWeights = m_numScoreComponents; + if (m_mode == "interpolate") { + //interpolation of phrase penalty is skipped, and fixed-value (2.718) is used instead. results will be screwed up if phrase penalty is not last feature + numWeights = m_numScoreComponents-1; + } - Phrase sourcePhrase(0); - sourcePhrase.CreateFromString(Input, m_input, source_string, factorDelimiter, NULL); + vector ret (m_numModels*numWeights); + for (size_t iFeature=0; iFeature < numWeights; iFeature++) { - CollectSufficientStatistics(sourcePhrase, allStats); //optimization potential: only call this once per source phrase + CrossEntropy * ObjectiveFunction = new CrossEntropy(optimizerStats, this, iFeature); - //phrase pair not found; leave cache empty - if (allStats->find(target_string) == allStats->end()) { - RemoveAllInMap(*allStats); - delete allStats; - continue; - } + vector weight_vector = Optimize(ObjectiveFunction, m_numModels); - multiModelStatisticsOptimization* targetStatistics = new multiModelStatisticsOptimization(); - targetStatistics->targetPhrase = new TargetPhrase(*(*allStats)[target_string]->targetPhrase); - targetStatistics->p = (*allStats)[target_string]->p; - targetStatistics->f = iter->second; - optimizerStats.push_back(targetStatistics); - - RemoveAllInMap(*allStats); - delete allStats; - } - - Sentence sentence; - CleanUpAfterSentenceProcessing(sentence); // free memory used by compact phrase tables - - size_t numWeights = m_numScoreComponents; if (m_mode == "interpolate") { - //interpolation of phrase penalty is skipped, and fixed-value (2.718) is used instead. 
results will be screwed up if phrase penalty is not last feature - numWeights = m_numScoreComponents-1; + weight_vector = normalizeWeights(weight_vector); } - vector ret (m_numModels*numWeights); - for (size_t iFeature=0; iFeature < numWeights; iFeature++) { - - CrossEntropy * ObjectiveFunction = new CrossEntropy(optimizerStats, this, iFeature); - - vector weight_vector = Optimize(ObjectiveFunction, m_numModels); - - if (m_mode == "interpolate") { - weight_vector = normalizeWeights(weight_vector); - } - - cerr << "Weight vector for feature " << iFeature << ": "; - for (size_t i=0; i < m_numModels; i++) { - ret[(iFeature*m_numModels)+i] = weight_vector[i]; - cerr << weight_vector[i] << " "; - } - cerr << endl; - delete ObjectiveFunction; + cerr << "Weight vector for feature " << iFeature << ": "; + for (size_t i=0; i < m_numModels; i++) { + ret[(iFeature*m_numModels)+i] = weight_vector[i]; + cerr << weight_vector[i] << " "; } + cerr << endl; + delete ObjectiveFunction; + } - RemoveAllInColl(optimizerStats); - return ret; + RemoveAllInColl(optimizerStats); + return ret; } -vector PhraseDictionaryMultiModel::Optimize(OptimizationObjective *ObjectiveFunction, size_t numModels) { +vector PhraseDictionaryMultiModel::Optimize(OptimizationObjective *ObjectiveFunction, size_t numModels) +{ - dlib::matrix starting_point; - starting_point.set_size(numModels); - starting_point = 1.0; + dlib::matrix starting_point; + starting_point.set_size(numModels); + starting_point = 1.0; - try { - dlib::find_min_bobyqa(*ObjectiveFunction, - starting_point, - 2*numModels+1, // number of interpolation points - dlib::uniform_matrix(numModels,1, 1e-09), // lower bound constraint - dlib::uniform_matrix(numModels,1, 1e100), // upper bound constraint - 1.0, // initial trust region radius - 1e-5, // stopping trust region radius - 10000 // max number of objective function evaluations - ); - } - catch (dlib::bobyqa_failure& e) - { - cerr << e.what() << endl; - } + try { + dlib::find_min_bobyqa(*ObjectiveFunction, + starting_point, + 2*numModels+1, // number of interpolation points + dlib::uniform_matrix(numModels,1, 1e-09), // lower bound constraint + dlib::uniform_matrix(numModels,1, 1e100), // upper bound constraint + 1.0, // initial trust region radius + 1e-5, // stopping trust region radius + 10000 // max number of objective function evaluations + ); + } catch (dlib::bobyqa_failure& e) { + cerr << e.what() << endl; + } - vector weight_vector (numModels); + vector weight_vector (numModels); - for (int i=0; i < starting_point.nr(); i++) { - weight_vector[i] = starting_point(i); - } + for (int i=0; i < starting_point.nr(); i++) { + weight_vector[i] = starting_point(i); + } - cerr << "Cross-entropy: " << (*ObjectiveFunction)(starting_point) << endl; - return weight_vector; + cerr << "Cross-entropy: " << (*ObjectiveFunction)(starting_point) << endl; + return weight_vector; } double CrossEntropy::operator() ( const dlib::matrix& arg) const { - double total = 0.0; - double n = 0.0; - std::vector weight_vector (m_model->m_numModels); + double total = 0.0; + double n = 0.0; + std::vector weight_vector (m_model->m_numModels); - for (int i=0; i < arg.nr(); i++) { - weight_vector[i] = arg(i); - } - if (m_model->m_mode == "interpolate") { - weight_vector = m_model->normalizeWeights(weight_vector); - } + for (int i=0; i < arg.nr(); i++) { + weight_vector[i] = arg(i); + } + if (m_model->m_mode == "interpolate") { + weight_vector = m_model->normalizeWeights(weight_vector); + } - for ( std::vector::const_iterator iter = 
m_optimizerStats.begin(); iter != m_optimizerStats.end(); ++iter ) { - multiModelStatisticsOptimization* statistics = *iter; - size_t f = statistics->f; + for ( std::vector::const_iterator iter = m_optimizerStats.begin(); iter != m_optimizerStats.end(); ++iter ) { + multiModelStatisticsOptimization* statistics = *iter; + size_t f = statistics->f; - double score; - score = std::inner_product(statistics->p[m_iFeature].begin(), statistics->p[m_iFeature].end(), weight_vector.begin(), 0.0); + double score; + score = std::inner_product(statistics->p[m_iFeature].begin(), statistics->p[m_iFeature].end(), weight_vector.begin(), 0.0); - total -= (FloorScore(TransformScore(score))/TransformScore(2))*f; - n += f; - } - return total/n; + total -= (FloorScore(TransformScore(score))/TransformScore(2))*f; + n += f; + } + return total/n; } #endif diff --git a/moses/TranslationModel/PhraseDictionaryMultiModel.h b/moses/TranslationModel/PhraseDictionaryMultiModel.h index 467333b0a..5feb4f373 100644 --- a/moses/TranslationModel/PhraseDictionaryMultiModel.h +++ b/moses/TranslationModel/PhraseDictionaryMultiModel.h @@ -36,15 +36,17 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA namespace Moses { - struct multiModelStatistics { - TargetPhrase *targetPhrase; - std::vector > p; - ~multiModelStatistics() {delete targetPhrase;}; +struct multiModelStatistics { + TargetPhrase *targetPhrase; + std::vector > p; + ~multiModelStatistics() { + delete targetPhrase; }; +}; - struct multiModelStatisticsOptimization: multiModelStatistics { - size_t f; - }; +struct multiModelStatisticsOptimization: multiModelStatistics { + size_t f; +}; class OptimizationObjective; @@ -53,7 +55,7 @@ class OptimizationObjective; class PhraseDictionaryMultiModel: public PhraseDictionary { #ifdef WITH_DLIB -friend class CrossEntropy; + friend class CrossEntropy; #endif public: @@ -100,34 +102,33 @@ protected: }; #ifdef WITH_DLIB -class OptimizationObjective +class OptimizationObjective { public: - virtual double operator() ( const dlib::matrix& arg) const = 0; + virtual double operator() ( const dlib::matrix& arg) const = 0; }; class CrossEntropy: public OptimizationObjective { public: - CrossEntropy ( - std::vector &optimizerStats, - PhraseDictionaryMultiModel * model, - size_t iFeature - ) - { - m_optimizerStats = optimizerStats; - m_model = model; - m_iFeature = iFeature; - } + CrossEntropy ( + std::vector &optimizerStats, + PhraseDictionaryMultiModel * model, + size_t iFeature + ) { + m_optimizerStats = optimizerStats; + m_model = model; + m_iFeature = iFeature; + } - double operator() ( const dlib::matrix& arg) const; + double operator() ( const dlib::matrix& arg) const; protected: - std::vector m_optimizerStats; - PhraseDictionaryMultiModel * m_model; - size_t m_iFeature; + std::vector m_optimizerStats; + PhraseDictionaryMultiModel * m_model; + size_t m_iFeature; }; #endif diff --git a/moses/TranslationModel/PhraseDictionaryMultiModelCounts.cpp b/moses/TranslationModel/PhraseDictionaryMultiModelCounts.cpp index 4c61fba91..298e23a9b 100644 --- a/moses/TranslationModel/PhraseDictionaryMultiModelCounts.cpp +++ b/moses/TranslationModel/PhraseDictionaryMultiModelCounts.cpp @@ -61,59 +61,56 @@ namespace Moses { PhraseDictionaryMultiModelCounts::PhraseDictionaryMultiModelCounts(const std::string &line) -:PhraseDictionaryMultiModel("PhraseDictionaryMultiModelCounts", line) + :PhraseDictionaryMultiModel("PhraseDictionaryMultiModelCounts", line) { - m_mode = "instance_weighting"; //TODO: set this in config; use m_mode 
to switch between interpolation and instance weighting - m_combineFunction = InstanceWeighting; - //m_mode = "interpolate"; - //m_combineFunction = LinearInterpolationFromCounts; + m_mode = "instance_weighting"; //TODO: set this in config; use m_mode to switch between interpolation and instance weighting + m_combineFunction = InstanceWeighting; + //m_mode = "interpolate"; + //m_combineFunction = LinearInterpolationFromCounts; - for (size_t i = 0; i < m_args.size(); ++i) { - const vector &args = m_args[i]; - if (args[0] == "mode") { - m_mode = args[1]; - if (m_mode == "instance_weighting") - m_combineFunction = InstanceWeighting; - else if (m_mode == "interpolate") { - m_combineFunction = LinearInterpolationFromCounts; - } - else { - ostringstream msg; - msg << "combination mode unknown: " << m_mode; - throw runtime_error(msg.str()); - } - - } - else if (args[0] == "lex-e2f") { - m_lexE2FStr = Tokenize(args[1], ","); - CHECK(m_lexE2FStr.size() == m_pdStr.size()); - } - else if (args[0] == "lex-f2e") { - m_lexF2EStr = Tokenize(args[1], ","); - CHECK(m_lexF2EStr.size() == m_pdStr.size()); + for (size_t i = 0; i < m_args.size(); ++i) { + const vector &args = m_args[i]; + if (args[0] == "mode") { + m_mode = args[1]; + if (m_mode == "instance_weighting") + m_combineFunction = InstanceWeighting; + else if (m_mode == "interpolate") { + m_combineFunction = LinearInterpolationFromCounts; + } else { + ostringstream msg; + msg << "combination mode unknown: " << m_mode; + throw runtime_error(msg.str()); } - else if (args[0] == "target-table") { - m_targetTable = Tokenize(args[1], ","); - CHECK(m_targetTable.size() == m_pdStr.size()); - } + } else if (args[0] == "lex-e2f") { + m_lexE2FStr = Tokenize(args[1], ","); + CHECK(m_lexE2FStr.size() == m_pdStr.size()); + } else if (args[0] == "lex-f2e") { + m_lexF2EStr = Tokenize(args[1], ","); + CHECK(m_lexF2EStr.size() == m_pdStr.size()); + } + + else if (args[0] == "target-table") { + m_targetTable = Tokenize(args[1], ","); + CHECK(m_targetTable.size() == m_pdStr.size()); + } - } // for + } // for } PhraseDictionaryMultiModelCounts::~PhraseDictionaryMultiModelCounts() { - RemoveAllInColl(m_lexTable_e2f); - RemoveAllInColl(m_lexTable_f2e); + RemoveAllInColl(m_lexTable_e2f); + RemoveAllInColl(m_lexTable_f2e); } bool PhraseDictionaryMultiModelCounts::InitDictionary() { - for(size_t i = 0; i < m_numModels; ++i){ + for(size_t i = 0; i < m_numModels; ++i) { // phrase table const string &ptName = m_pdStr[i]; @@ -189,8 +186,8 @@ bool PhraseDictionaryMultiModelCounts::InitDictionary() pdta_inverse->SetNumScoreComponentMultiModel(m_numScoreComponent); pdta_inverse->Load(input, output, target_table, m_weight, m_componentTableLimit, languageModels, m_weightWP); m_inverse_pd.push_back(pdta_inverse); - } else if (implementation == Compact) { -#ifndef WIN32 + } else if (implementation == Compact) { + #ifndef WIN32 PhraseDictionaryCompact* pdc = new PhraseDictionaryCompact(m_numScoreComponent, implementation, m_feature_load); pdc->SetNumScoreComponentMultiModel(m_numScoreComponent); //for compact models, we need to pass number of log-linear components to correctly resize the score vector pdc->Load( input, output, main_table, m_weight, componentTableLimit, languageModels, m_weightWP); @@ -200,9 +197,9 @@ bool PhraseDictionaryMultiModelCounts::InitDictionary() pdc_inverse->SetNumScoreComponentMultiModel(m_numScoreComponent); pdc_inverse->Load( input, output, target_table, m_weight, componentTableLimit, languageModels, m_weightWP); m_inverse_pd.push_back(pdc_inverse); -#else - 
UTIL_THROW(util::Exception, "Compact phrase table not supported in windows"); -#endif + #else + UTIL_THROW(util::Exception, "Compact phrase table not supported in windows"); + #endif } else { UTIL_THROW(util::Exception,"PhraseDictionaryMultiModel does not support phrase table type " << implementation); @@ -218,7 +215,7 @@ bool PhraseDictionaryMultiModelCounts::InitDictionary() } -*/ + */ return true; } @@ -250,7 +247,7 @@ const TargetPhraseCollection *PhraseDictionaryMultiModelCounts::GetTargetPhraseC void PhraseDictionaryMultiModelCounts::CollectSufficientStatistics(const Phrase& src, vector &fs, map* allStats) const //fill fs and allStats with statistics from models { - for(size_t i = 0; i < m_numModels; ++i){ + for(size_t i = 0; i < m_numModels; ++i) { const PhraseDictionary &pd = *m_pd[i]; TargetPhraseCollection *ret_raw = (TargetPhraseCollection*) pd.GetTargetPhraseCollection( src); @@ -298,9 +295,9 @@ void PhraseDictionaryMultiModelCounts::CollectSufficientStatistics(const Phrase& multiModelCountsStatistics * statistics = iter->second; for (size_t i = 0; i < m_numModels; ++i) { - if (!statistics->ft[i]) { - statistics->ft[i] = GetTargetCount(static_cast(*statistics->targetPhrase), i); - } + if (!statistics->ft[i]) { + statistics->ft[i] = GetTargetCount(static_cast(*statistics->targetPhrase), i); + } } } } @@ -313,28 +310,27 @@ TargetPhraseCollection* PhraseDictionaryMultiModelCounts::CreateTargetPhraseColl multiModelCountsStatistics * statistics = iter->second; if (statistics->targetPhrase->GetAlignTerm().GetSize() == 0) { - UTIL_THROW(util::Exception, " alignment information empty\ncount-tables need to include alignment information for computation of lexical weights.\nUse --phrase-word-alignment during training; for on-disk tables, also set -alignment-info when creating on-disk tables."); + UTIL_THROW(util::Exception, " alignment information empty\ncount-tables need to include alignment information for computation of lexical weights.\nUse --phrase-word-alignment during training; for on-disk tables, also set -alignment-info when creating on-disk tables."); } try { - pair >, vector< set > > alignment = GetAlignmentsForLexWeights(src, static_cast(*statistics->targetPhrase), statistics->targetPhrase->GetAlignTerm()); - vector< set > alignedToT = alignment.first; - vector< set > alignedToS = alignment.second; - double lexst = ComputeWeightedLexicalTranslation(static_cast(*statistics->targetPhrase), src, alignedToS, m_lexTable_e2f, multimodelweights[1], m_output, m_input ); - double lexts = ComputeWeightedLexicalTranslation(src, static_cast(*statistics->targetPhrase), alignedToT, m_lexTable_f2e, multimodelweights[3], m_input, m_output ); + pair >, vector< set > > alignment = GetAlignmentsForLexWeights(src, static_cast(*statistics->targetPhrase), statistics->targetPhrase->GetAlignTerm()); + vector< set > alignedToT = alignment.first; + vector< set > alignedToS = alignment.second; + double lexst = ComputeWeightedLexicalTranslation(static_cast(*statistics->targetPhrase), src, alignedToS, m_lexTable_e2f, multimodelweights[1], m_output, m_input ); + double lexts = ComputeWeightedLexicalTranslation(src, static_cast(*statistics->targetPhrase), alignedToT, m_lexTable_f2e, multimodelweights[3], m_input, m_output ); - Scores scoreVector(5); - scoreVector[0] = FloorScore(TransformScore(m_combineFunction(statistics->fst, statistics->ft, multimodelweights[0]))); - scoreVector[1] = FloorScore(TransformScore(lexst)); - scoreVector[2] = FloorScore(TransformScore(m_combineFunction(statistics->fst, fs, 
multimodelweights[2]))); - scoreVector[3] = FloorScore(TransformScore(lexts)); - scoreVector[4] = FloorScore(TransformScore(2.718)); + Scores scoreVector(5); + scoreVector[0] = FloorScore(TransformScore(m_combineFunction(statistics->fst, statistics->ft, multimodelweights[0]))); + scoreVector[1] = FloorScore(TransformScore(lexst)); + scoreVector[2] = FloorScore(TransformScore(m_combineFunction(statistics->fst, fs, multimodelweights[2]))); + scoreVector[3] = FloorScore(TransformScore(lexts)); + scoreVector[4] = FloorScore(TransformScore(2.718)); - statistics->targetPhrase->GetScoreBreakdown().Assign(this, scoreVector); - statistics->targetPhrase->Evaluate(src); - } - catch (AlignmentException& e) { - continue; + statistics->targetPhrase->GetScoreBreakdown().Assign(this, scoreVector); + statistics->targetPhrase->Evaluate(src); + } catch (AlignmentException& e) { + continue; } ret->Add(new TargetPhrase(*statistics->targetPhrase)); @@ -346,47 +342,50 @@ TargetPhraseCollection* PhraseDictionaryMultiModelCounts::CreateTargetPhraseColl } -float PhraseDictionaryMultiModelCounts::GetTargetCount(const Phrase &target, size_t modelIndex) const { +float PhraseDictionaryMultiModelCounts::GetTargetCount(const Phrase &target, size_t modelIndex) const +{ - const PhraseDictionary &pd = *m_inverse_pd[modelIndex]; - TargetPhraseCollection *ret_raw = (TargetPhraseCollection*) pd.GetTargetPhraseCollection(target); + const PhraseDictionary &pd = *m_inverse_pd[modelIndex]; + TargetPhraseCollection *ret_raw = (TargetPhraseCollection*) pd.GetTargetPhraseCollection(target); - // in inverse mode, we want the first score of the first phrase pair (note: if we were to work with truly symmetric models, it would be the third score) - if (ret_raw != NULL) { - TargetPhrase * targetPhrase = *(ret_raw->begin()); - return UntransformScore(targetPhrase->GetScoreBreakdown().GetScoresForProducer(&pd)[0]); - } + // in inverse mode, we want the first score of the first phrase pair (note: if we were to work with truly symmetric models, it would be the third score) + if (ret_raw != NULL) { + TargetPhrase * targetPhrase = *(ret_raw->begin()); + return UntransformScore(targetPhrase->GetScoreBreakdown().GetScoresForProducer(&pd)[0]); + } - // target phrase unknown - else return 0; + // target phrase unknown + else return 0; } -pair PhraseDictionaryMultiModelCounts::GetAlignmentsForLexWeights(const Phrase &phraseS, const Phrase &phraseT, const AlignmentInfo &alignment) const { +pair PhraseDictionaryMultiModelCounts::GetAlignmentsForLexWeights(const Phrase &phraseS, const Phrase &phraseT, const AlignmentInfo &alignment) const +{ - size_t tsize = phraseT.GetSize(); - size_t ssize = phraseS.GetSize(); - AlignVector alignedToT (tsize); - AlignVector alignedToS (ssize); - AlignmentInfo::const_iterator iter; + size_t tsize = phraseT.GetSize(); + size_t ssize = phraseS.GetSize(); + AlignVector alignedToT (tsize); + AlignVector alignedToS (ssize); + AlignmentInfo::const_iterator iter; - for (iter = alignment.begin(); iter != alignment.end(); ++iter) { + for (iter = alignment.begin(); iter != alignment.end(); ++iter) { const pair &alignPair = *iter; - size_t s = alignPair.first; - size_t t = alignPair.second; - if (s >= ssize || t >= tsize) { - cerr << "Error: inconsistent alignment for phrase pair: " << phraseS << " - " << phraseT << endl; - cerr << "phrase pair will be discarded" << endl; - throw AlignmentException(); - } - alignedToT[t].insert( s ); - alignedToS[s].insert( t ); + size_t s = alignPair.first; + size_t t = alignPair.second; + if 
(s >= ssize || t >= tsize) { + cerr << "Error: inconsistent alignment for phrase pair: " << phraseS << " - " << phraseT << endl; + cerr << "phrase pair will be discarded" << endl; + throw AlignmentException(); + } + alignedToT[t].insert( s ); + alignedToS[s].insert( t ); } return make_pair(alignedToT,alignedToS); } -double PhraseDictionaryMultiModelCounts::ComputeWeightedLexicalTranslation( const Phrase &phraseS, const Phrase &phraseT, AlignVector &alignment, const vector &tables, vector &multimodelweights, const vector &input_factors, const vector &output_factors ) const { +double PhraseDictionaryMultiModelCounts::ComputeWeightedLexicalTranslation( const Phrase &phraseS, const Phrase &phraseT, AlignVector &alignment, const vector &tables, vector &multimodelweights, const vector &input_factors, const vector &output_factors ) const +{ // lexical translation probability double lexScore = 1.0; @@ -414,7 +413,8 @@ double PhraseDictionaryMultiModelCounts::ComputeWeightedLexicalTranslation( cons } -lexicalCache PhraseDictionaryMultiModelCounts::CacheLexicalStatistics( const Phrase &phraseS, const Phrase &phraseT, AlignVector &alignment, const vector &tables, const vector &input_factors, const vector &output_factors ) { +lexicalCache PhraseDictionaryMultiModelCounts::CacheLexicalStatistics( const Phrase &phraseS, const Phrase &phraseT, AlignVector &alignment, const vector &tables, const vector &input_factors, const vector &output_factors ) +{ //do all the necessary lexical table lookups and get counts, but don't apply weights yet string null = "NULL"; @@ -455,60 +455,65 @@ lexicalCache PhraseDictionaryMultiModelCounts::CacheLexicalStatistics( const Phr } -double PhraseDictionaryMultiModelCounts::ComputeWeightedLexicalTranslationFromCache( lexicalCache &cache, vector &weights ) const { +double PhraseDictionaryMultiModelCounts::ComputeWeightedLexicalTranslationFromCache( lexicalCache &cache, vector &weights ) const +{ // lexical translation probability double lexScore = 1.0; for (lexicalCache::const_iterator iter = cache.begin(); iter != cache.end(); ++iter) { - vector t_vector = *iter; - double thisWordScore = 0; - for ( vector::const_iterator iter2 = t_vector.begin(); iter2 != t_vector.end(); ++iter2) { - vector joint_count = iter2->first; - vector marginal = iter2->second; - thisWordScore += m_combineFunction(joint_count, marginal, weights); - } - lexScore *= thisWordScore / t_vector.size(); + vector t_vector = *iter; + double thisWordScore = 0; + for ( vector::const_iterator iter2 = t_vector.begin(); iter2 != t_vector.end(); ++iter2) { + vector joint_count = iter2->first; + vector marginal = iter2->second; + thisWordScore += m_combineFunction(joint_count, marginal, weights); + } + lexScore *= thisWordScore / t_vector.size(); } return lexScore; } // get lexical probability for single word alignment pair -double PhraseDictionaryMultiModelCounts::GetLexicalProbability( string &wordS, string &wordT, const vector &tables, vector &multimodelweights ) const { - vector joint_count (m_numModels); - vector marginals (m_numModels); +double PhraseDictionaryMultiModelCounts::GetLexicalProbability( string &wordS, string &wordT, const vector &tables, vector &multimodelweights ) const +{ + vector joint_count (m_numModels); + vector marginals (m_numModels); - FillLexicalCountsJoint(wordS, wordT, joint_count, tables); - FillLexicalCountsMarginal(wordS, marginals, tables); + FillLexicalCountsJoint(wordS, wordT, joint_count, tables); + FillLexicalCountsMarginal(wordS, marginals, tables); - double lexProb = 
m_combineFunction(joint_count, marginals, multimodelweights); + double lexProb = m_combineFunction(joint_count, marginals, multimodelweights); return lexProb; } -void PhraseDictionaryMultiModelCounts::FillLexicalCountsJoint(string &wordS, string &wordT, vector &count, const vector &tables) const { - for (size_t i=0;i < m_numModels;i++) { - lexicalMapJoint::iterator joint_s = tables[i]->joint.find( wordS ); - if (joint_s == tables[i]->joint.end()) count[i] = 0.0; - else { - lexicalMap::iterator joint_t = joint_s->second.find( wordT ); - if (joint_t == joint_s->second.end()) count[i] = 0.0; - else count[i] = joint_t->second; - } +void PhraseDictionaryMultiModelCounts::FillLexicalCountsJoint(string &wordS, string &wordT, vector &count, const vector &tables) const +{ + for (size_t i=0; i < m_numModels; i++) { + lexicalMapJoint::iterator joint_s = tables[i]->joint.find( wordS ); + if (joint_s == tables[i]->joint.end()) count[i] = 0.0; + else { + lexicalMap::iterator joint_t = joint_s->second.find( wordT ); + if (joint_t == joint_s->second.end()) count[i] = 0.0; + else count[i] = joint_t->second; } + } } -void PhraseDictionaryMultiModelCounts::FillLexicalCountsMarginal(string &wordS, vector &count, const vector &tables) const { - for (size_t i=0;i < m_numModels;i++) { - lexicalMap::iterator marginal_s = tables[i]->marginal.find( wordS ); - if (marginal_s == tables[i]->marginal.end()) count[i] = 0.0; - else count[i] = marginal_s->second; - } +void PhraseDictionaryMultiModelCounts::FillLexicalCountsMarginal(string &wordS, vector &count, const vector &tables) const +{ + for (size_t i=0; i < m_numModels; i++) { + lexicalMap::iterator marginal_s = tables[i]->marginal.find( wordS ); + if (marginal_s == tables[i]->marginal.end()) count[i] = 0.0; + else count[i] = marginal_s->second; + } } -void PhraseDictionaryMultiModelCounts::LoadLexicalTable( string &fileName, lexicalTable* ltable) { +void PhraseDictionaryMultiModelCounts::LoadLexicalTable( string &fileName, lexicalTable* ltable) +{ cerr << "Loading lexical translation table from " << fileName; ifstream inFile; @@ -549,165 +554,161 @@ void PhraseDictionaryMultiModelCounts::LoadLexicalTable( string &fileName, lexic #ifdef WITH_DLIB -vector PhraseDictionaryMultiModelCounts::MinimizePerplexity(vector > &phrase_pair_vector) { +vector PhraseDictionaryMultiModelCounts::MinimizePerplexity(vector > &phrase_pair_vector) +{ - const StaticData &staticData = StaticData::Instance(); - const string& factorDelimiter = staticData.GetFactorDelimiter(); + const StaticData &staticData = StaticData::Instance(); + const string& factorDelimiter = staticData.GetFactorDelimiter(); - map, size_t> phrase_pair_map; + map, size_t> phrase_pair_map; - for ( vector >::const_iterator iter = phrase_pair_vector.begin(); iter != phrase_pair_vector.end(); ++iter ) { - phrase_pair_map[*iter] += 1; + for ( vector >::const_iterator iter = phrase_pair_vector.begin(); iter != phrase_pair_vector.end(); ++iter ) { + phrase_pair_map[*iter] += 1; + } + + vector optimizerStats; + + for ( map, size_t>::iterator iter = phrase_pair_map.begin(); iter != phrase_pair_map.end(); ++iter ) { + + pair phrase_pair = iter->first; + string source_string = phrase_pair.first; + string target_string = phrase_pair.second; + + vector fs(m_numModels); + map* allStats = new(map); + + Phrase sourcePhrase(0); + sourcePhrase.CreateFromString(Input, m_input, source_string, factorDelimiter, NULL); + + CollectSufficientStatistics(sourcePhrase, fs, allStats); //optimization potential: only call this once per source 
phrase + + //phrase pair not found; leave cache empty + if (allStats->find(target_string) == allStats->end()) { + RemoveAllInMap(*allStats); + delete allStats; + continue; } - vector optimizerStats; + multiModelCountsStatisticsOptimization * targetStatistics = new multiModelCountsStatisticsOptimization(); + targetStatistics->targetPhrase = new TargetPhrase(*(*allStats)[target_string]->targetPhrase); + targetStatistics->fs = fs; + targetStatistics->fst = (*allStats)[target_string]->fst; + targetStatistics->ft = (*allStats)[target_string]->ft; + targetStatistics->f = iter->second; - for ( map, size_t>::iterator iter = phrase_pair_map.begin(); iter != phrase_pair_map.end(); ++iter ) { + try { + pair >, vector< set > > alignment = GetAlignmentsForLexWeights(sourcePhrase, static_cast(*targetStatistics->targetPhrase), targetStatistics->targetPhrase->GetAlignTerm()); + targetStatistics->lexCachee2f = CacheLexicalStatistics(static_cast(*targetStatistics->targetPhrase), sourcePhrase, alignment.second, m_lexTable_e2f, m_output, m_input ); + targetStatistics->lexCachef2e = CacheLexicalStatistics(sourcePhrase, static_cast(*targetStatistics->targetPhrase), alignment.first, m_lexTable_f2e, m_input, m_output ); - pair phrase_pair = iter->first; - string source_string = phrase_pair.first; - string target_string = phrase_pair.second; + optimizerStats.push_back(targetStatistics); + } catch (AlignmentException& e) {} - vector fs(m_numModels); - map* allStats = new(map); + RemoveAllInMap(*allStats); + delete allStats; + } - Phrase sourcePhrase(0); - sourcePhrase.CreateFromString(Input, m_input, source_string, factorDelimiter, NULL); + Sentence sentence; + CleanUpAfterSentenceProcessing(sentence); // free memory used by compact phrase tables - CollectSufficientStatistics(sourcePhrase, fs, allStats); //optimization potential: only call this once per source phrase + vector ret (m_numModels*4); + for (size_t iFeature=0; iFeature < 4; iFeature++) { - //phrase pair not found; leave cache empty - if (allStats->find(target_string) == allStats->end()) { - RemoveAllInMap(*allStats); - delete allStats; - continue; - } + CrossEntropyCounts * ObjectiveFunction = new CrossEntropyCounts(optimizerStats, this, iFeature); - multiModelCountsStatisticsOptimization * targetStatistics = new multiModelCountsStatisticsOptimization(); - targetStatistics->targetPhrase = new TargetPhrase(*(*allStats)[target_string]->targetPhrase); - targetStatistics->fs = fs; - targetStatistics->fst = (*allStats)[target_string]->fst; - targetStatistics->ft = (*allStats)[target_string]->ft; - targetStatistics->f = iter->second; + vector weight_vector = Optimize(ObjectiveFunction, m_numModels); - try { - pair >, vector< set > > alignment = GetAlignmentsForLexWeights(sourcePhrase, static_cast(*targetStatistics->targetPhrase), targetStatistics->targetPhrase->GetAlignTerm()); - targetStatistics->lexCachee2f = CacheLexicalStatistics(static_cast(*targetStatistics->targetPhrase), sourcePhrase, alignment.second, m_lexTable_e2f, m_output, m_input ); - targetStatistics->lexCachef2e = CacheLexicalStatistics(sourcePhrase, static_cast(*targetStatistics->targetPhrase), alignment.first, m_lexTable_f2e, m_input, m_output ); - - optimizerStats.push_back(targetStatistics); - } - catch (AlignmentException& e) {} - - RemoveAllInMap(*allStats); - delete allStats; + if (m_mode == "interpolate") { + weight_vector = normalizeWeights(weight_vector); + } else if (m_mode == "instance_weighting") { + float first_value = weight_vector[0]; + for (size_t i=0; i < m_numModels; i++) { 
+ weight_vector[i] = weight_vector[i]/first_value; + } } - - Sentence sentence; - CleanUpAfterSentenceProcessing(sentence); // free memory used by compact phrase tables - - vector ret (m_numModels*4); - for (size_t iFeature=0; iFeature < 4; iFeature++) { - - CrossEntropyCounts * ObjectiveFunction = new CrossEntropyCounts(optimizerStats, this, iFeature); - - vector weight_vector = Optimize(ObjectiveFunction, m_numModels); - - if (m_mode == "interpolate") { - weight_vector = normalizeWeights(weight_vector); - } - else if (m_mode == "instance_weighting") { - float first_value = weight_vector[0]; - for (size_t i=0; i < m_numModels; i++) { - weight_vector[i] = weight_vector[i]/first_value; - } - } - cerr << "Weight vector for feature " << iFeature << ": "; - for (size_t i=0; i < m_numModels; i++) { - ret[(iFeature*m_numModels)+i] = weight_vector[i]; - cerr << weight_vector[i] << " "; - } - cerr << endl; - delete ObjectiveFunction; + cerr << "Weight vector for feature " << iFeature << ": "; + for (size_t i=0; i < m_numModels; i++) { + ret[(iFeature*m_numModels)+i] = weight_vector[i]; + cerr << weight_vector[i] << " "; } + cerr << endl; + delete ObjectiveFunction; + } - RemoveAllInColl(optimizerStats); - return ret; + RemoveAllInColl(optimizerStats); + return ret; } double CrossEntropyCounts::operator() ( const dlib::matrix& arg) const { - double total = 0.0; - double n = 0.0; - std::vector weight_vector (m_model->m_numModels); + double total = 0.0; + double n = 0.0; + std::vector weight_vector (m_model->m_numModels); - for (int i=0; i < arg.nr(); i++) { - weight_vector[i] = arg(i); - } - if (m_model->m_mode == "interpolate") { - weight_vector = m_model->normalizeWeights(weight_vector); - } + for (int i=0; i < arg.nr(); i++) { + weight_vector[i] = arg(i); + } + if (m_model->m_mode == "interpolate") { + weight_vector = m_model->normalizeWeights(weight_vector); + } - for ( std::vector::const_iterator iter = m_optimizerStats.begin(); iter != m_optimizerStats.end(); ++iter ) { - multiModelCountsStatisticsOptimization* statistics = *iter; - size_t f = statistics->f; + for ( std::vector::const_iterator iter = m_optimizerStats.begin(); iter != m_optimizerStats.end(); ++iter ) { + multiModelCountsStatisticsOptimization* statistics = *iter; + size_t f = statistics->f; - double score; - if (m_iFeature == 0) { - score = m_model->m_combineFunction(statistics->fst, statistics->ft, weight_vector); - } - else if (m_iFeature == 1) { - score = m_model->ComputeWeightedLexicalTranslationFromCache(statistics->lexCachee2f, weight_vector); - } - else if (m_iFeature == 2) { - score = m_model->m_combineFunction(statistics->fst, statistics->fs, weight_vector); - } - else if (m_iFeature == 3) { - score = m_model->ComputeWeightedLexicalTranslationFromCache(statistics->lexCachef2e, weight_vector); - } - else { - score = 0; - UTIL_THROW(util::Exception, "Trying to optimize feature that I don't know. 
Aborting");
- }
- total -= (FloorScore(TransformScore(score))/TransformScore(2))*f;
- n += f;
+ double score;
+ if (m_iFeature == 0) {
+ score = m_model->m_combineFunction(statistics->fst, statistics->ft, weight_vector);
+ } else if (m_iFeature == 1) {
+ score = m_model->ComputeWeightedLexicalTranslationFromCache(statistics->lexCachee2f, weight_vector);
+ } else if (m_iFeature == 2) {
+ score = m_model->m_combineFunction(statistics->fst, statistics->fs, weight_vector);
+ } else if (m_iFeature == 3) {
+ score = m_model->ComputeWeightedLexicalTranslationFromCache(statistics->lexCachef2e, weight_vector);
+ } else {
+ score = 0;
+ UTIL_THROW(util::Exception, "Trying to optimize feature that I don't know. Aborting");
}
+ total -= (FloorScore(TransformScore(score))/TransformScore(2))*f;
+ n += f;
+ }
+ return total/n;
}
#endif
// calculate weighted probability based on instance weighting of joint counts and marginal counts
-double InstanceWeighting(vector<float> &joint_counts, vector<float> &marginals, vector<float> &multimodelweights) {
+double InstanceWeighting(vector<float> &joint_counts, vector<float> &marginals, vector<float> &multimodelweights)
+{
- double joint_counts_weighted = inner_product(joint_counts.begin(), joint_counts.end(), multimodelweights.begin(), 0.0);
- double marginals_weighted = inner_product(marginals.begin(), marginals.end(), multimodelweights.begin(), 0.0);
+ double joint_counts_weighted = inner_product(joint_counts.begin(), joint_counts.end(), multimodelweights.begin(), 0.0);
+ double marginals_weighted = inner_product(marginals.begin(), marginals.end(), multimodelweights.begin(), 0.0);
- if (marginals_weighted == 0) {
- return 0;
- }
- else {
- return joint_counts_weighted/marginals_weighted;
- }
+ if (marginals_weighted == 0) {
+ return 0;
+ } else {
+ return joint_counts_weighted/marginals_weighted;
+ }
}
// calculate linear interpolation of relative frequency estimates based on joint count and marginal counts
//unused for now; enable in config?
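Editor's aside, not part of the commit: InstanceWeighting above and LinearInterpolationFromCounts in the next hunk differ only in where the division happens. Instance weighting pools the weighted joint and marginal counts and divides once; linear interpolation divides inside each model first and then mixes the resulting relative frequencies. A minimal standalone sketch of the two estimators, with signatures simplified to const references and toy counts invented for illustration:

#include <cstddef>
#include <iostream>
#include <numeric>
#include <vector>

// Pool weighted counts, divide once: p(t|s) = sum_i w_i*joint_i / sum_i w_i*marginal_i
double InstanceWeighting(const std::vector<float> &joint,
                         const std::vector<float> &marginal,
                         const std::vector<float> &w)
{
  double j = std::inner_product(joint.begin(), joint.end(), w.begin(), 0.0);
  double m = std::inner_product(marginal.begin(), marginal.end(), w.begin(), 0.0);
  return m == 0 ? 0 : j / m;
}

// Divide inside each model, then mix: p(t|s) = sum_i w_i * (joint_i / marginal_i)
double LinearInterpolationFromCounts(const std::vector<float> &joint,
                                     const std::vector<float> &marginal,
                                     const std::vector<float> &w)
{
  std::vector<double> p(marginal.size(), 0.0);
  for (std::size_t i = 0; i < marginal.size(); ++i)
    if (marginal[i] != 0) p[i] = joint[i] / marginal[i];
  return std::inner_product(p.begin(), p.end(), w.begin(), 0.0);
}

int main()
{
  // Model 0 saw the pair 3 times out of 4; model 1 saw it once out of 100.
  std::vector<float> joint(2), marginal(2), w(2, 0.5f);
  joint[0] = 3; joint[1] = 1;
  marginal[0] = 4; marginal[1] = 100;
  std::cout << InstanceWeighting(joint, marginal, w) << std::endl;             // ~0.0385
  std::cout << LinearInterpolationFromCounts(joint, marginal, w) << std::endl; // 0.38
  return 0;
}

The toy counts make the practical difference visible: instance weighting lets the model with more evidence dominate the estimate, while interpolation treats each model's relative frequency as equally trustworthy per unit of weight.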
-double LinearInterpolationFromCounts(vector<float> &joint_counts, vector<float> &marginals, vector<float> &multimodelweights) {
+double LinearInterpolationFromCounts(vector<float> &joint_counts, vector<float> &marginals, vector<float> &multimodelweights)
+{
- vector<float> p(marginals.size());
+ vector<float> p(marginals.size());
- for (size_t i=0;i < marginals.size();i++) {
- if (marginals[i] != 0) {
- p[i] = joint_counts[i]/marginals[i];
- }
+ for (size_t i=0; i < marginals.size(); i++) {
+ if (marginals[i] != 0) {
+ p[i] = joint_counts[i]/marginals[i];
}
+ }
- double p_weighted = inner_product(p.begin(), p.end(), multimodelweights.begin(), 0.0);
+ double p_weighted = inner_product(p.begin(), p.end(), multimodelweights.begin(), 0.0);
- return p_weighted;
+ return p_weighted;
}
diff --git a/moses/TranslationModel/PhraseDictionaryMultiModelCounts.h b/moses/TranslationModel/PhraseDictionaryMultiModelCounts.h
index ef89272c3..04be77dd6 100644
--- a/moses/TranslationModel/PhraseDictionaryMultiModelCounts.h
+++ b/moses/TranslationModel/PhraseDictionaryMultiModelCounts.h
@@ -35,32 +35,33 @@ extern std::vector<std::string> tokenize( const char*);
namespace Moses
{
- typedef boost::unordered_map<std::string,double> lexicalMap;
- typedef boost::unordered_map<std::string,lexicalMap> lexicalMapJoint;
- typedef std::pair<std::vector<float>, std::vector<float> > lexicalPair;
- typedef std::vector<std::vector<lexicalPair> > lexicalCache;
+typedef boost::unordered_map<std::string,double> lexicalMap;
+typedef boost::unordered_map<std::string,lexicalMap> lexicalMapJoint;
+typedef std::pair<std::vector<float>, std::vector<float> > lexicalPair;
+typedef std::vector<std::vector<lexicalPair> > lexicalCache;
- struct multiModelCountsStatistics : multiModelStatistics {
- std::vector<float> fst, ft;
- };
+struct multiModelCountsStatistics : multiModelStatistics {
+ std::vector<float> fst, ft;
+};
- struct multiModelCountsStatisticsOptimization: multiModelCountsStatistics {
- std::vector<float> fs;
- lexicalCache lexCachee2f, lexCachef2e;
- size_t f;
- };
+struct multiModelCountsStatisticsOptimization: multiModelCountsStatistics {
+ std::vector<float> fs;
+ lexicalCache lexCachee2f, lexCachef2e;
+ size_t f;
+};
- struct lexicalTable {
- lexicalMapJoint joint;
- lexicalMap marginal;
- };
+struct lexicalTable {
+ lexicalMapJoint joint;
+ lexicalMap marginal;
+};
- double InstanceWeighting(std::vector<float> &joint_counts, std::vector<float> &marginals, std::vector<float> &multimodelweights);
- double LinearInterpolationFromCounts(std::vector<float> &joint_counts, std::vector<float> &marginals, std::vector<float> &multimodelweights);
+double InstanceWeighting(std::vector<float> &joint_counts, std::vector<float> &marginals, std::vector<float> &multimodelweights);
+double LinearInterpolationFromCounts(std::vector<float> &joint_counts, std::vector<float> &marginals, std::vector<float> &multimodelweights);
//thrown if alignment information does not match phrase pair (out-of-bound alignment points)
-class AlignmentException : public std::runtime_error {
+class AlignmentException : public std::runtime_error
+{
public:
AlignmentException() : std::runtime_error("AlignmentException") { }
};
@@ -72,10 +73,10 @@ class PhraseDictionaryMultiModelCounts: public PhraseDictionaryMultiModel
{
#ifdef WITH_DLIB
-friend class CrossEntropyCounts;
+ friend class CrossEntropyCounts;
#endif
-typedef std::vector< std::set<size_t> > AlignVector;
+ typedef std::vector< std::set<size_t> > AlignVector;
public:
@@ -116,23 +117,22 @@ class CrossEntropyCounts: public OptimizationObjective
{
public:
- CrossEntropyCounts (
- std::vector<multiModelCountsStatisticsOptimization*> &optimizerStats,
- PhraseDictionaryMultiModelCounts * model,
- size_t iFeature
- )
- {
- m_optimizerStats = optimizerStats;
- m_model = model;
- m_iFeature = iFeature;
- }
+ CrossEntropyCounts (
+ std::vector<multiModelCountsStatisticsOptimization*> &optimizerStats,
+ PhraseDictionaryMultiModelCounts * model,
+ size_t iFeature
+ ) {
+ m_optimizerStats = optimizerStats;
+ m_model = model;
+ m_iFeature = iFeature;
+ }
- double operator() ( const dlib::matrix<double,0,1>& arg) const;
+ double operator() ( const dlib::matrix<double,0,1>& arg) const;
private:
- std::vector<multiModelCountsStatisticsOptimization*> m_optimizerStats;
- PhraseDictionaryMultiModelCounts * m_model;
- size_t m_iFeature;
+ std::vector<multiModelCountsStatisticsOptimization*> m_optimizerStats;
+ PhraseDictionaryMultiModelCounts * m_model;
+ size_t m_iFeature;
};
#endif
diff --git a/moses/TranslationModel/PhraseDictionaryNodeMemory.cpp b/moses/TranslationModel/PhraseDictionaryNodeMemory.cpp
index 465c076d5..389c74394 100644
--- a/moses/TranslationModel/PhraseDictionaryNodeMemory.cpp
+++ b/moses/TranslationModel/PhraseDictionaryNodeMemory.cpp
@@ -31,8 +31,8 @@ namespace Moses
PhraseDictionaryNodeMemory::~PhraseDictionaryNodeMemory()
{
for (TerminalMap::iterator iter = m_sourceTermMap.begin(); iter != m_sourceTermMap.end(); ++iter) {
- const PhraseDictionaryNodeMemory *node = iter->second;
- delete node;
+ const PhraseDictionaryNodeMemory *node = iter->second;
+ delete node;
}
for (NonTerminalMap::iterator iter = m_nonTermMap.begin(); iter != m_nonTermMap.end(); ++iter) {
const PhraseDictionaryNodeMemory *node = iter->second;
@@ -41,7 +41,8 @@ PhraseDictionaryNodeMemory::~PhraseDictionaryNodeMemory()
delete m_targetPhraseCollection;
}
-TargetPhraseCollection &PhraseDictionaryNodeMemory::GetOrCreateTargetPhraseCollection() {
+TargetPhraseCollection &PhraseDictionaryNodeMemory::GetOrCreateTargetPhraseCollection()
+{
if (m_targetPhraseCollection == NULL)
m_targetPhraseCollection = new TargetPhraseCollection();
return *m_targetPhraseCollection;
@@ -138,9 +139,9 @@ void PhraseDictionaryNodeMemory::Clear()
m_sourceTermMap.clear();
m_nonTermMap.clear();
delete m_targetPhraseCollection;
-
+
}
-
+
std::ostream& operator<<(std::ostream &out, const PhraseDictionaryNodeMemory &node)
{
out << node.GetTargetPhraseCollection();
diff --git a/moses/TranslationModel/PhraseDictionaryNodeMemory.h b/moses/TranslationModel/PhraseDictionaryNodeMemory.h
index 672196ba2..136e10c0a 100644
--- a/moses/TranslationModel/PhraseDictionaryNodeMemory.h
+++ b/moses/TranslationModel/PhraseDictionaryNodeMemory.h
@@ -39,8 +39,8 @@ namespace Moses
class PhraseDictionaryMemory;
class PhraseDictionaryFuzzyMatch;
-
- //! @todo why?
+
+//! @todo why?
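// Editor's note on the @todo above; this is an inference, not something stated
// in the commit: the node's non-terminal map is keyed on a (source non-terminal,
// target non-terminal) Word pair, and no ready-made hash exists for Moses::Word,
// so the map needs the hand-written hasher declared below (plus its companion
// equality predicate) to combine the two words' hashes.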
class NonTerminalMapKeyHasher { public: @@ -152,7 +152,7 @@ public: } void Clear(); - + TO_STRING(); }; diff --git a/moses/TranslationModel/PhraseDictionaryTree.cpp b/moses/TranslationModel/PhraseDictionaryTree.cpp index 321924dfe..c5eefc290 100644 --- a/moses/TranslationModel/PhraseDictionaryTree.cpp +++ b/moses/TranslationModel/PhraseDictionaryTree.cpp @@ -157,7 +157,8 @@ PhraseDictionaryTree::PrefixPtr::operator bool() const typedef LVoc WordVoc; -class PDTimp { +class PDTimp +{ public: typedef PrefixTreeF PTF; typedef FilePtr CPT; @@ -481,7 +482,7 @@ int PhraseDictionaryTree::Create(std::istream& inFile,const std::string& out) std::vector vo; size_t lnc=0; size_t numElement = NOT_FOUND; // 3=old format, 5=async format which include word alignment info - size_t missingAlignmentCount = 0; + size_t missingAlignmentCount = 0; while(getline(inFile, line)) { ++lnc; @@ -553,9 +554,9 @@ int PhraseDictionaryTree::Create(std::istream& inFile,const std::string& out) if (!sparseFeatureString.empty()) { std::vector sparseTokens = Tokenize(sparseFeatureString); if (sparseTokens.size() % 2 != 0) { - TRACE_ERR("ERROR: incorrectly formatted sparse feature string: " << - sparseFeatureString << std::endl); - abort(); + TRACE_ERR("ERROR: incorrectly formatted sparse feature string: " << + sparseFeatureString << std::endl); + abort(); } for (size_t i = 0; i < sparseTokens.size(); i+=2) { fnames.push_back(imp->tv.add(sparseTokens[i])); @@ -624,7 +625,7 @@ int PhraseDictionaryTree::Create(std::istream& inFile,const std::string& out) if ( PrintWordAlignment()) { TRACE_ERR("Count of lines with missing alignments: " << - missingAlignmentCount << "/" << lnc << "\n"); + missingAlignmentCount << "/" << lnc << "\n"); } fClose(os); diff --git a/moses/TranslationModel/PhraseDictionaryTree.h b/moses/TranslationModel/PhraseDictionaryTree.h index 1b88637c3..6214c8194 100644 --- a/moses/TranslationModel/PhraseDictionaryTree.h +++ b/moses/TranslationModel/PhraseDictionaryTree.h @@ -31,8 +31,7 @@ class PDTimp; typedef PrefixTreeF PTF; //typedef std::pair,Scores > StringTgtCand; -struct StringTgtCand -{ +struct StringTgtCand { typedef std::vector Tokens; Tokens tokens; Scores scores; @@ -86,7 +85,7 @@ public: // get the target candidates for a given phrase void GetTargetCandidates(const std::vector& src, std::vector& rv) const; - + // get the target candidates for a given phrase void GetTargetCandidates(const std::vector& src, diff --git a/moses/TranslationModel/RuleTable/Loader.h b/moses/TranslationModel/RuleTable/Loader.h index 4d3e03351..48390e37e 100644 --- a/moses/TranslationModel/RuleTable/Loader.h +++ b/moses/TranslationModel/RuleTable/Loader.h @@ -1,17 +1,17 @@ /*********************************************************************** Moses - statistical machine translation system Copyright (C) 2006-2011 University of Edinburgh - + This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. - + This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
- + You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA @@ -32,7 +32,7 @@ namespace Moses */ class RuleTableLoader { - public: +public: virtual ~RuleTableLoader() {} virtual bool Load(const std::vector &input, @@ -41,7 +41,7 @@ class RuleTableLoader size_t tableLimit, RuleTableTrie &) = 0; - protected: +protected: // Provide access to RuleTableTrie's private SortAndPrune function. void SortAndPrune(RuleTableTrie &ruleTable) { ruleTable.SortAndPrune(); @@ -50,10 +50,10 @@ class RuleTableLoader // Provide access to RuleTableTrie's private // GetOrCreateTargetPhraseCollection function. TargetPhraseCollection &GetOrCreateTargetPhraseCollection( - RuleTableTrie &ruleTable - , const Phrase &source - , const TargetPhrase &target - , const Word *sourceLHS) { + RuleTableTrie &ruleTable + , const Phrase &source + , const TargetPhrase &target + , const Word *sourceLHS) { return ruleTable.GetOrCreateTargetPhraseCollection(source, target, sourceLHS); } }; diff --git a/moses/TranslationModel/RuleTable/LoaderCompact.cpp b/moses/TranslationModel/RuleTable/LoaderCompact.cpp index f235b3e79..2b4a6003a 100644 --- a/moses/TranslationModel/RuleTable/LoaderCompact.cpp +++ b/moses/TranslationModel/RuleTable/LoaderCompact.cpp @@ -1,17 +1,17 @@ /*********************************************************************** Moses - statistical machine translation system Copyright (C) 2006-2011 University of Edinburgh - + This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. - + This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. - + You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA @@ -84,9 +84,9 @@ bool RuleTableLoaderCompact::Load(const std::vector &input, } void RuleTableLoaderCompact::LoadVocabularySection( - LineReader &reader, - const std::vector &factorTypes, - std::vector &vocabulary) + LineReader &reader, + const std::vector &factorTypes, + std::vector &vocabulary) { // Read symbol count. reader.ReadLine(); @@ -106,10 +106,10 @@ void RuleTableLoaderCompact::LoadVocabularySection( } void RuleTableLoaderCompact::LoadPhraseSection( - LineReader &reader, - const std::vector &vocab, - std::vector &rhsPhrases, - std::vector &lhsIds) + LineReader &reader, + const std::vector &vocab, + std::vector &rhsPhrases, + std::vector &lhsIds) { // Read phrase count. reader.ReadLine(); @@ -132,7 +132,7 @@ void RuleTableLoaderCompact::LoadPhraseSection( } void RuleTableLoaderCompact::LoadAlignmentSection( - LineReader &reader, std::vector &alignmentSets, std::vector &sourcePhrases) + LineReader &reader, std::vector &alignmentSets, std::vector &sourcePhrases) { // Read alignment set count. reader.ReadLine(); @@ -144,8 +144,8 @@ void RuleTableLoaderCompact::LoadAlignmentSection( std::vector points; for (size_t i = 0; i < alignmentSetCount; ++i) { // Read alignment set, lookup in collection, and store pointer. 
- alignTerm.clear(); - alignNonTerm.clear(); + alignTerm.clear(); + alignNonTerm.clear(); tokens.clear(); reader.ReadLine(); @@ -157,11 +157,10 @@ void RuleTableLoaderCompact::LoadAlignmentSection( std::pair alignmentPair(points[0], points[1]); if (sourcePhrases[i].GetWord(alignmentPair.first).IsNonTerminal()) { - alignNonTerm.insert(alignmentPair); + alignNonTerm.insert(alignmentPair); + } else { + alignTerm.insert(alignmentPair); } - else { - alignTerm.insert(alignmentPair); - } } alignmentSets[i*2] = AlignmentInfoCollection::Instance().Add(alignNonTerm); @@ -170,13 +169,13 @@ void RuleTableLoaderCompact::LoadAlignmentSection( } bool RuleTableLoaderCompact::LoadRuleSection( - LineReader &reader, - const std::vector &vocab, - const std::vector &sourcePhrases, - const std::vector &targetPhrases, - const std::vector &targetLhsIds, - const std::vector &alignmentSets, - RuleTableTrie &ruleTable) + LineReader &reader, + const std::vector &vocab, + const std::vector &sourcePhrases, + const std::vector &targetPhrases, + const std::vector &targetLhsIds, + const std::vector &alignmentSets, + RuleTableTrie &ruleTable) { // Read rule count. reader.ReadLine(); @@ -232,7 +231,7 @@ bool RuleTableLoaderCompact::LoadRuleSection( // Insert rule into table. TargetPhraseCollection &coll = GetOrCreateTargetPhraseCollection( - ruleTable, sourcePhrase, *targetPhrase, &sourceLHS); + ruleTable, sourcePhrase, *targetPhrase, &sourceLHS); coll.Add(targetPhrase); } diff --git a/moses/TranslationModel/RuleTable/LoaderCompact.h b/moses/TranslationModel/RuleTable/LoaderCompact.h index 314cfca57..26e19fce6 100644 --- a/moses/TranslationModel/RuleTable/LoaderCompact.h +++ b/moses/TranslationModel/RuleTable/LoaderCompact.h @@ -1,17 +1,17 @@ /*********************************************************************** Moses - statistical machine translation system Copyright (C) 2006-2011 University of Edinburgh - + This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. - + This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. - + You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA @@ -35,14 +35,14 @@ class RuleTableTrie; //! @todo ask phil williams class RuleTableLoaderCompact : public RuleTableLoader { - public: +public: bool Load(const std::vector &input, const std::vector &output, const std::string &inFile, size_t tableLimit, RuleTableTrie &); - private: +private: struct LineReader { LineReader(std::istream &input) : m_input(input), m_lineNum(0) {} void ReadLine() { @@ -78,8 +78,7 @@ class RuleTableLoaderCompact : public RuleTableLoader // Like Tokenize() but records starting positions of tokens (instead of // copying substrings) and assumes delimiter is ASCII space character. - void FindTokens(std::vector &output, const std::string &str) const - { + void FindTokens(std::vector &output, const std::string &str) const { // Skip delimiters at beginning. size_t lastPos = str.find_first_not_of(' ', 0); // Find first "non-delimiter". 
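Editor's aside, not part of the commit: the LoaderCompact hunk above ends at FindTokens, which records the starting offset of each space-separated token rather than copying a substring per token, a worthwhile saving when parsing millions of rule-table lines. A standalone sketch of the same pattern (modelled on the code shown above, not the Moses member function itself):

#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

// Record the start offset of every space-separated token in str.
// Nothing is copied; the offsets stay valid for as long as str is alive.
void FindTokens(std::vector<std::size_t> &output, const std::string &str)
{
  std::size_t lastPos = str.find_first_not_of(' ', 0); // skip leading spaces
  std::size_t pos = str.find_first_of(' ', lastPos);   // end of the first token
  while (lastPos != std::string::npos) {
    output.push_back(lastPos);                  // a token starts here
    lastPos = str.find_first_not_of(' ', pos);  // skip the run of spaces
    pos = str.find_first_of(' ', lastPos);      // end of the next token
  }
}

int main()
{
  const std::string line = "3 [X] 7 |||";
  std::vector<std::size_t> starts;
  FindTokens(starts, line);
  for (std::size_t i = 0; i < starts.size(); ++i)
    std::cout << "token at offset " << starts[i] << std::endl; // 0, 2, 6, 8
  return 0;
}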
diff --git a/moses/TranslationModel/RuleTable/LoaderFactory.cpp b/moses/TranslationModel/RuleTable/LoaderFactory.cpp index b3bd00555..cdbfc965a 100644 --- a/moses/TranslationModel/RuleTable/LoaderFactory.cpp +++ b/moses/TranslationModel/RuleTable/LoaderFactory.cpp @@ -1,17 +1,17 @@ /*********************************************************************** Moses - statistical machine translation system Copyright (C) 2006-2011 University of Edinburgh - + This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. - + This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. - + You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA @@ -37,7 +37,7 @@ namespace Moses // Determines the rule table type by peeking inside the file then creates // a suitable RuleTableLoader object. std::auto_ptr RuleTableLoaderFactory::Create( - const std::string &path) + const std::string &path) { InputFileStream input(path); std::string line; @@ -54,17 +54,15 @@ std::auto_ptr RuleTableLoaderFactory::Create( msg << "Unsupported compact rule table format: " << tokens[0]; UserMessage::Add(msg.str()); return std::auto_ptr(); + } else if (tokens[0] == "[X]" && tokens[1] == "|||") { + return std::auto_ptr(new + RuleTableLoaderHiero()); + } - else if (tokens[0] == "[X]" && tokens[1] == "|||") { - return std::auto_ptr(new - RuleTableLoaderHiero()); - - } - + return std::auto_ptr(new RuleTableLoaderStandard()); - } - else - { // empty phrase table + } else { + // empty phrase table return std::auto_ptr(new RuleTableLoaderStandard()); } } diff --git a/moses/TranslationModel/RuleTable/LoaderFactory.h b/moses/TranslationModel/RuleTable/LoaderFactory.h index 01c168680..c695738e4 100644 --- a/moses/TranslationModel/RuleTable/LoaderFactory.h +++ b/moses/TranslationModel/RuleTable/LoaderFactory.h @@ -1,17 +1,17 @@ /*********************************************************************** Moses - statistical machine translation system Copyright (C) 2006-2011 University of Edinburgh - + This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. - + This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. - + You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA @@ -30,7 +30,7 @@ class RuleTableLoader; //! Creates a RuleTableLoader object suitable for loading the specified file. 
class RuleTableLoaderFactory { - public: +public: static std::auto_ptr Create(const std::string &); }; diff --git a/moses/TranslationModel/RuleTable/LoaderHiero.cpp b/moses/TranslationModel/RuleTable/LoaderHiero.cpp index c0526be02..81289d9b2 100644 --- a/moses/TranslationModel/RuleTable/LoaderHiero.cpp +++ b/moses/TranslationModel/RuleTable/LoaderHiero.cpp @@ -11,19 +11,20 @@ using namespace std; -namespace Moses { - +namespace Moses +{ + bool RuleTableLoaderHiero::Load(const std::vector &input, - const std::vector &output, - const std::string &inFile, - size_t tableLimit, - RuleTableTrie &ruleTable) + const std::vector &output, + const std::string &inFile, + size_t tableLimit, + RuleTableTrie &ruleTable) { bool ret = RuleTableLoaderStandard::Load(HieroFormat - ,input, output - ,inFile - ,tableLimit - ,ruleTable); + ,input, output + ,inFile + ,tableLimit + ,ruleTable); return ret; } diff --git a/moses/TranslationModel/RuleTable/LoaderHiero.h b/moses/TranslationModel/RuleTable/LoaderHiero.h index 1f6b66725..099787281 100644 --- a/moses/TranslationModel/RuleTable/LoaderHiero.h +++ b/moses/TranslationModel/RuleTable/LoaderHiero.h @@ -11,7 +11,8 @@ #include "LoaderStandard.h" -namespace Moses { +namespace Moses +{ //! specific implementation of SCFG loader to load rule tables formatted in Hiero-style format class RuleTableLoaderHiero : public RuleTableLoaderStandard diff --git a/moses/TranslationModel/RuleTable/LoaderStandard.cpp b/moses/TranslationModel/RuleTable/LoaderStandard.cpp index 566684775..fb5052c40 100644 --- a/moses/TranslationModel/RuleTable/LoaderStandard.cpp +++ b/moses/TranslationModel/RuleTable/LoaderStandard.cpp @@ -1,17 +1,17 @@ /*********************************************************************** Moses - statistical machine translation system Copyright (C) 2006-2011 University of Edinburgh - + This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. - + This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
- + You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA @@ -59,38 +59,34 @@ bool RuleTableLoaderStandard::Load(const std::vector &input return ret; } - + void ReformatHieroRule(int sourceTarget, string &phrase, map > &ntAlign) { vector toks; Tokenize(toks, phrase, " "); - for (size_t i = 0; i < toks.size(); ++i) - { + for (size_t i = 0; i < toks.size(); ++i) { string &tok = toks[i]; size_t tokLen = tok.size(); - if (tok.substr(0, 1) == "[" && tok.substr(tokLen - 1, 1) == "]") - { // no-term + if (tok.substr(0, 1) == "[" && tok.substr(tokLen - 1, 1) == "]") { + // non-term vector split = Tokenize(tok, ","); CHECK(split.size() == 2); - + tok = "[X]" + split[0] + "]"; size_t coIndex = Scan(split[1]); - + pair &alignPoint = ntAlign[coIndex]; - if (sourceTarget == 0) - { + if (sourceTarget == 0) { alignPoint.first = i; - } - else - { + } else { alignPoint.second = i; } } } - + phrase = Join(" ", toks) + " [X]"; - + } void ReformateHieroScore(string &scoreString) @@ -98,8 +94,7 @@ void ReformateHieroScore(string &scoreString) vector toks; Tokenize(toks, scoreString, " "); - for (size_t i = 0; i < toks.size(); ++i) - { + for (size_t i = 0; i < toks.size(); ++i) { string &tok = toks[i]; vector nameValue = Tokenize(tok, "="); CHECK(nameValue.size() == 2); @@ -108,49 +103,48 @@ void ReformateHieroScore(string &scoreString) score = exp(-score); tok = SPrint(score); } - + scoreString = Join(" ", toks); } - + void ReformatHieroRule(const string &lineOrig, string &out) -{ +{ vector tokens; vector scoreVector; - + TokenizeMultiCharSeparator(tokens, lineOrig, "|||" ); string &sourcePhraseString = tokens[1] - , &targetPhraseString = tokens[2] - , &scoreString = tokens[3]; + , &targetPhraseString = tokens[2] + , &scoreString = tokens[3]; map > ntAlign; ReformatHieroRule(0, sourcePhraseString, ntAlign); ReformatHieroRule(1, targetPhraseString, ntAlign); ReformateHieroScore(scoreString); - + stringstream align; map >::const_iterator iterAlign; - for (iterAlign = ntAlign.begin(); iterAlign != ntAlign.end(); ++iterAlign) - { + for (iterAlign = ntAlign.begin(); iterAlign != ntAlign.end(); ++iterAlign) { const pair &alignPoint = iterAlign->second; align << alignPoint.first << "-" << alignPoint.second << " "; } - + stringstream ret; ret << sourcePhraseString << " ||| " - << targetPhraseString << " ||| " + << targetPhraseString << " ||| " << scoreString << " ||| " << align.str(); - + out = ret.str(); } - + bool RuleTableLoaderStandard::Load(FormatType format - , const std::vector &input - , const std::vector &output - , const std::string &inFile - , size_t /* tableLimit */ - , RuleTableTrie &ruleTable) + , const std::vector &input + , const std::vector &output + , const std::string &inFile + , size_t /* tableLimit */ + , RuleTableTrie &ruleTable) { PrintUserTime(string("Start loading text SCFG phrase table. 
") + (format==MosesFormat?"Moses ":"Hiero ") + " format"); @@ -174,7 +168,9 @@ bool RuleTableLoaderStandard::Load(FormatType format while(true) { try { line = in.ReadLine(); - } catch (const util::EndOfFileException &e) { break; } + } catch (const util::EndOfFileException &e) { + break; + } if (format == HieroFormat) { // inefficiently reformat line hiero_before.assign(line.data(), line.size()); @@ -186,7 +182,7 @@ bool RuleTableLoaderStandard::Load(FormatType format StringPiece sourcePhraseString(*pipes); StringPiece targetPhraseString(*++pipes); StringPiece scoreString(*++pipes); - + StringPiece alignString; if (++pipes) { StringPiece temp(*pipes); @@ -237,9 +233,9 @@ bool RuleTableLoaderStandard::Load(FormatType format // rest of target phrase targetPhrase->SetAlignmentInfo(alignString); targetPhrase->SetTargetLHS(targetLHS); - + //targetPhrase->SetDebugOutput(string("New Format pt ") + line); - + if (++pipes) { StringPiece sparseString(*pipes); targetPhrase->SetSparseScore(&ruleTable, sparseString); diff --git a/moses/TranslationModel/RuleTable/LoaderStandard.h b/moses/TranslationModel/RuleTable/LoaderStandard.h index 4beefea39..b47f7c00b 100644 --- a/moses/TranslationModel/RuleTable/LoaderStandard.h +++ b/moses/TranslationModel/RuleTable/LoaderStandard.h @@ -1,17 +1,17 @@ /*********************************************************************** Moses - statistical machine translation system Copyright (C) 2006-2011 University of Edinburgh - + This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. - + This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. - + You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA @@ -35,7 +35,7 @@ protected: const std::string &inFile, size_t tableLimit, RuleTableTrie &); - public: +public: bool Load(const std::vector &input, const std::vector &output, const std::string &inFile, diff --git a/moses/TranslationModel/RuleTable/PhraseDictionaryALSuffixArray.cpp b/moses/TranslationModel/RuleTable/PhraseDictionaryALSuffixArray.cpp index 8f736af60..1f8ebab15 100644 --- a/moses/TranslationModel/RuleTable/PhraseDictionaryALSuffixArray.cpp +++ b/moses/TranslationModel/RuleTable/PhraseDictionaryALSuffixArray.cpp @@ -18,14 +18,14 @@ using namespace std; -namespace Moses +namespace Moses { PhraseDictionaryALSuffixArray::PhraseDictionaryALSuffixArray(const std::string &line) -: PhraseDictionaryMemory("PhraseDictionaryALSuffixArray", line) + : PhraseDictionaryMemory("PhraseDictionaryALSuffixArray", line) { const StaticData &staticData = StaticData::Instance(); if (staticData.ThreadCount() > 1) { - throw runtime_error("Suffix array implementation is not threadsafe"); + throw runtime_error("Suffix array implementation is not threadsafe"); } } @@ -33,14 +33,14 @@ void PhraseDictionaryALSuffixArray::InitializeForInput(InputType const& source) { // populate with rules for this sentence long translationId = source.GetTranslationId(); - + string grammarFile = GetFilePath() + "/grammar." 
+ SPrint(translationId) + ".gz"; - + std::auto_ptr loader = - RuleTableLoaderFactory::Create(grammarFile); + RuleTableLoaderFactory::Create(grammarFile); bool ret = loader->Load(m_input, m_output, grammarFile, m_tableLimit, *this); - + CHECK(ret); } diff --git a/moses/TranslationModel/RuleTable/PhraseDictionaryALSuffixArray.h b/moses/TranslationModel/RuleTable/PhraseDictionaryALSuffixArray.h index 81e1e02cf..aa4c15258 100644 --- a/moses/TranslationModel/RuleTable/PhraseDictionaryALSuffixArray.h +++ b/moses/TranslationModel/RuleTable/PhraseDictionaryALSuffixArray.h @@ -11,26 +11,28 @@ #include "moses/TranslationModel/PhraseDictionaryMemory.h" -namespace Moses { - +namespace Moses +{ + /** Implementation of in-memory phrase table for use with Adam Lopez's suffix array. * Does 2 things that the normal in-memory pt doesn't do: * 1. Loads grammar for a sentence to be decoded only when the sentence is being decoded. Unload afterwards 2. Format of the pt file follows Hiero, rather than Moses - */ + */ class PhraseDictionaryALSuffixArray : public PhraseDictionaryMemory { public: PhraseDictionaryALSuffixArray(const std::string &line); - bool InitDictionary() - { return true; } + bool InitDictionary() { + return true; + } void InitializeForInput(InputType const& source); void CleanUpAfterSentenceProcessing(const InputType& source); protected: - + }; - + } diff --git a/moses/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.cpp b/moses/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.cpp index be6996399..669e7306b 100644 --- a/moses/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.cpp +++ b/moses/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.cpp @@ -3,17 +3,17 @@ /*********************************************************************** Moses - factored phrase-based language decoder Copyright (C) 2006 University of Edinburgh - + This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. - + This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
- + You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA @@ -49,316 +49,312 @@ using namespace std; namespace Moses { - PhraseDictionaryFuzzyMatch::PhraseDictionaryFuzzyMatch(const std::string &line) +PhraseDictionaryFuzzyMatch::PhraseDictionaryFuzzyMatch(const std::string &line) : PhraseDictionary("PhraseDictionaryFuzzyMatch", line) - {} +{} - bool PhraseDictionaryFuzzyMatch::Load(const std::vector &input - , const std::vector &output - , const std::string &initStr - , size_t tableLimit) - { - m_tableLimit = tableLimit; - m_input = &input; - m_output = &output; - - - cerr << "initStr=" << initStr << endl; - m_config = Tokenize(initStr, ";"); - assert(m_config.size() == 3); +bool PhraseDictionaryFuzzyMatch::Load(const std::vector &input + , const std::vector &output + , const std::string &initStr + , size_t tableLimit) +{ + m_tableLimit = tableLimit; + m_input = &input; + m_output = &output; - m_FuzzyMatchWrapper = new tmmt::FuzzyMatchWrapper(m_config[0], m_config[1], m_config[2]); - - return true; - } - - ChartRuleLookupManager *PhraseDictionaryFuzzyMatch::CreateRuleLookupManager( - const InputType &sentence, - const ChartCellCollectionBase &cellCollection) - { - return new ChartRuleLookupManagerMemoryPerSentence(sentence, cellCollection, *this); - } - - - int removedirectoryrecursively(const char *dirname) - { - DIR *dir; - struct dirent *entry; - char path[PATH_MAX]; - - if (path == NULL) { - fprintf(stderr, "Out of memory error\n"); - return 0; - } - dir = opendir(dirname); - if (dir == NULL) { - perror("Error opendir()"); - return 0; - } - - while ((entry = readdir(dir)) != NULL) { - if (strcmp(entry->d_name, ".") && strcmp(entry->d_name, "..")) { - snprintf(path, (size_t) PATH_MAX, "%s/%s", dirname, entry->d_name); - if (entry->d_type == DT_DIR) { - removedirectoryrecursively(path); - } - - remove(path); - /* - * Here, the actual deletion must be done. Beacuse this is - * quite a dangerous thing to do, and this program is not very - * well tested, we are just printing as if we are deleting. - */ - //printf("(not really) Deleting: %s\n", path); - /* - * When you are finished testing this and feel you are ready to do the real - * deleting, use this: remove*STUB*(path); - * (see "man 3 remove") - * Please note that I DONT TAKE RESPONSIBILITY for data you delete with this! - */ - } - - } - closedir(dir); - - rmdir(dirname); - /* - * Now the directory is emtpy, finally delete the directory itself. 
(Just - * printing here, see above) - */ - //printf("(not really) Deleting: %s\n", dirname); - - return 1; - } - void PhraseDictionaryFuzzyMatch::InitializeForInput(InputType const& inputSentence) - { - char dirName[] = "/tmp/moses.XXXXXX"; - char *temp = mkdtemp(dirName); - CHECK(temp); - string dirNameStr(dirName); - - string inFileName(dirNameStr + "/in"); - - ofstream inFile(inFileName.c_str()); - - for (size_t i = 1; i < inputSentence.GetSize() - 1; ++i) - { - inFile << inputSentence.GetWord(i); - } - inFile << endl; - inFile.close(); - - long translationId = inputSentence.GetTranslationId(); - string ptFileName = m_FuzzyMatchWrapper->Extract(translationId, dirNameStr); + cerr << "initStr=" << initStr << endl; + m_config = Tokenize(initStr, ";"); + assert(m_config.size() == 3); - // populate with rules for this sentence - PhraseDictionaryNodeMemory &rootNode = m_collection[translationId]; - FormatType format = MosesFormat; - - // data from file - InputFileStream inStream(ptFileName); - - // copied from class LoaderStandard - PrintUserTime("Start loading fuzzy-match phrase model"); - - const StaticData &staticData = StaticData::Instance(); - const std::string& factorDelimiter = staticData.GetFactorDelimiter(); - - - string lineOrig; - size_t count = 0; - - while(getline(inStream, lineOrig)) { - const string *line; - if (format == HieroFormat) { // reformat line - assert(false); - //line = ReformatHieroRule(lineOrig); - } - else - { // do nothing to format of line - line = &lineOrig; - } - - vector tokens; - vector scoreVector; - - TokenizeMultiCharSeparator(tokens, *line , "|||" ); - - if (tokens.size() != 4 && tokens.size() != 5) { - stringstream strme; - strme << "Syntax error at " << ptFileName << ":" << count; - UserMessage::Add(strme.str()); - abort(); - } - - const string &sourcePhraseString = tokens[0] - , &targetPhraseString = tokens[1] - , &scoreString = tokens[2] - , &alignString = tokens[3]; - - bool isLHSEmpty = (sourcePhraseString.find_first_not_of(" \t", 0) == string::npos); - if (isLHSEmpty && !staticData.IsWordDeletionEnabled()) { - TRACE_ERR( ptFileName << ":" << count << ": pt entry contains empty target, skipping\n"); - continue; - } - - Tokenize(scoreVector, scoreString); - const size_t numScoreComponents = GetNumScoreComponents(); - if (scoreVector.size() != numScoreComponents) { - stringstream strme; - strme << "Size of scoreVector != number (" << scoreVector.size() << "!=" - << numScoreComponents << ") of score components on line " << count; - UserMessage::Add(strme.str()); - abort(); - } - CHECK(scoreVector.size() == numScoreComponents); - - // parse source & find pt node - - // constituent labels - Word *sourceLHS; - Word *targetLHS; - - // source - Phrase sourcePhrase( 0); - sourcePhrase.CreateFromString(Input, *m_input, sourcePhraseString, factorDelimiter, &sourceLHS); - - // create target phrase obj - TargetPhrase *targetPhrase = new TargetPhrase(); - targetPhrase->CreateFromString(Output, *m_output, targetPhraseString, factorDelimiter, &targetLHS); - - // rest of target phrase - targetPhrase->SetAlignmentInfo(alignString); - targetPhrase->SetTargetLHS(targetLHS); - //targetPhrase->SetDebugOutput(string("New Format pt ") + line); - - // component score, for n-best output - std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),TransformScore); - std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),FloorScore); - - targetPhrase->GetScoreBreakdown().Assign(this, scoreVector); - targetPhrase->Evaluate(sourcePhrase); - - 
TargetPhraseCollection &phraseColl = GetOrCreateTargetPhraseCollection(rootNode, sourcePhrase, *targetPhrase, sourceLHS); - phraseColl.Add(targetPhrase); - - count++; - - if (format == HieroFormat) { // reformat line - delete line; - } - else - { // do nothing - } - - } - - // sort and prune each target phrase collection - SortAndPrune(rootNode); - - //removedirectoryrecursively(dirName); - } - - TargetPhraseCollection &PhraseDictionaryFuzzyMatch::GetOrCreateTargetPhraseCollection(PhraseDictionaryNodeMemory &rootNode - , const Phrase &source - , const TargetPhrase &target - , const Word *sourceLHS) - { - PhraseDictionaryNodeMemory &currNode = GetOrCreateNode(rootNode, source, target, sourceLHS); - return currNode.GetOrCreateTargetPhraseCollection(); - } + m_FuzzyMatchWrapper = new tmmt::FuzzyMatchWrapper(m_config[0], m_config[1], m_config[2]); + + return true; +} + +ChartRuleLookupManager *PhraseDictionaryFuzzyMatch::CreateRuleLookupManager( + const InputType &sentence, + const ChartCellCollectionBase &cellCollection) +{ + return new ChartRuleLookupManagerMemoryPerSentence(sentence, cellCollection, *this); +} + + +int removedirectoryrecursively(const char *dirname) +{ + DIR *dir; + struct dirent *entry; + char path[PATH_MAX]; + + if (path == NULL) { + fprintf(stderr, "Out of memory error\n"); + return 0; + } + dir = opendir(dirname); + if (dir == NULL) { + perror("Error opendir()"); + return 0; + } + + while ((entry = readdir(dir)) != NULL) { + if (strcmp(entry->d_name, ".") && strcmp(entry->d_name, "..")) { + snprintf(path, (size_t) PATH_MAX, "%s/%s", dirname, entry->d_name); + if (entry->d_type == DT_DIR) { + removedirectoryrecursively(path); + } + + remove(path); + /* + * Here, the actual deletion must be done. Because this is + * quite a dangerous thing to do, and this program is not very + * well tested, we are just printing as if we are deleting. + */ + //printf("(not really) Deleting: %s\n", path); + /* + * When you are finished testing this and feel you are ready to do the real + * deleting, use this: remove*STUB*(path); + * (see "man 3 remove") + * Please note that I DON'T TAKE RESPONSIBILITY for data you delete with this! + */ + } + + } + closedir(dir); + + rmdir(dirname); + /* + * Now the directory is empty, finally delete the directory itself. 
(Just + * printing here, see above) + */ + //printf("(not really) Deleting: %s\n", dirname); + + return 1; +} + +void PhraseDictionaryFuzzyMatch::InitializeForInput(InputType const& inputSentence) +{ + char dirName[] = "/tmp/moses.XXXXXX"; + char *temp = mkdtemp(dirName); + CHECK(temp); + string dirNameStr(dirName); + + string inFileName(dirNameStr + "/in"); + + ofstream inFile(inFileName.c_str()); + + for (size_t i = 1; i < inputSentence.GetSize() - 1; ++i) { + inFile << inputSentence.GetWord(i); + } + inFile << endl; + inFile.close(); + + long translationId = inputSentence.GetTranslationId(); + string ptFileName = m_FuzzyMatchWrapper->Extract(translationId, dirNameStr); + + // populate with rules for this sentence + PhraseDictionaryNodeMemory &rootNode = m_collection[translationId]; + FormatType format = MosesFormat; + + // data from file + InputFileStream inStream(ptFileName); + + // copied from class LoaderStandard + PrintUserTime("Start loading fuzzy-match phrase model"); + + const StaticData &staticData = StaticData::Instance(); + const std::string& factorDelimiter = staticData.GetFactorDelimiter(); + + + string lineOrig; + size_t count = 0; + + while(getline(inStream, lineOrig)) { + const string *line; + if (format == HieroFormat) { // reformat line + assert(false); + //line = ReformatHieroRule(lineOrig); + } else { + // do nothing to format of line + line = &lineOrig; + } + + vector tokens; + vector scoreVector; + + TokenizeMultiCharSeparator(tokens, *line , "|||" ); + + if (tokens.size() != 4 && tokens.size() != 5) { + stringstream strme; + strme << "Syntax error at " << ptFileName << ":" << count; + UserMessage::Add(strme.str()); + abort(); + } + + const string &sourcePhraseString = tokens[0] + , &targetPhraseString = tokens[1] + , &scoreString = tokens[2] + , &alignString = tokens[3]; + + bool isLHSEmpty = (sourcePhraseString.find_first_not_of(" \t", 0) == string::npos); + if (isLHSEmpty && !staticData.IsWordDeletionEnabled()) { + TRACE_ERR( ptFileName << ":" << count << ": pt entry contains empty target, skipping\n"); + continue; + } + + Tokenize(scoreVector, scoreString); + const size_t numScoreComponents = GetNumScoreComponents(); + if (scoreVector.size() != numScoreComponents) { + stringstream strme; + strme << "Size of scoreVector != number (" << scoreVector.size() << "!=" + << numScoreComponents << ") of score components on line " << count; + UserMessage::Add(strme.str()); + abort(); + } + CHECK(scoreVector.size() == numScoreComponents); + + // parse source & find pt node + + // constituent labels + Word *sourceLHS; + Word *targetLHS; + + // source + Phrase sourcePhrase( 0); + sourcePhrase.CreateFromString(Input, *m_input, sourcePhraseString, factorDelimiter, &sourceLHS); + + // create target phrase obj + TargetPhrase *targetPhrase = new TargetPhrase(); + targetPhrase->CreateFromString(Output, *m_output, targetPhraseString, factorDelimiter, &targetLHS); + + // rest of target phrase + targetPhrase->SetAlignmentInfo(alignString); + targetPhrase->SetTargetLHS(targetLHS); + //targetPhrase->SetDebugOutput(string("New Format pt ") + line); + + // component score, for n-best output + std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),TransformScore); + std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),FloorScore); + + targetPhrase->GetScoreBreakdown().Assign(this, scoreVector); + targetPhrase->Evaluate(sourcePhrase); + + TargetPhraseCollection &phraseColl = GetOrCreateTargetPhraseCollection(rootNode, sourcePhrase, *targetPhrase, 
sourceLHS); + phraseColl.Add(targetPhrase); + + count++; + + if (format == HieroFormat) { // reformat line + delete line; + } else { + // do nothing + } + + } + + // sort and prune each target phrase collection + SortAndPrune(rootNode); + + //removedirectoryrecursively(dirName); +} + +TargetPhraseCollection &PhraseDictionaryFuzzyMatch::GetOrCreateTargetPhraseCollection(PhraseDictionaryNodeMemory &rootNode + , const Phrase &source + , const TargetPhrase &target + , const Word *sourceLHS) +{ + PhraseDictionaryNodeMemory &currNode = GetOrCreateNode(rootNode, source, target, sourceLHS); + return currNode.GetOrCreateTargetPhraseCollection(); +} + +PhraseDictionaryNodeMemory &PhraseDictionaryFuzzyMatch::GetOrCreateNode(PhraseDictionaryNodeMemory &rootNode + , const Phrase &source + , const TargetPhrase &target + , const Word *sourceLHS) +{ + cerr << source << endl << target << endl; + const size_t size = source.GetSize(); + + const AlignmentInfo &alignmentInfo = target.GetAlignNonTerm(); + AlignmentInfo::const_iterator iterAlign = alignmentInfo.begin(); + + PhraseDictionaryNodeMemory *currNode = &rootNode; + for (size_t pos = 0 ; pos < size ; ++pos) { + const Word& word = source.GetWord(pos); + + if (word.IsNonTerminal()) { + // indexed by source label 1st + const Word &sourceNonTerm = word; + + CHECK(iterAlign != alignmentInfo.end()); + CHECK(iterAlign->first == pos); + size_t targetNonTermInd = iterAlign->second; + ++iterAlign; + const Word &targetNonTerm = target.GetWord(targetNonTermInd); + + currNode = currNode->GetOrCreateChild(sourceNonTerm, targetNonTerm); + } else { + currNode = currNode->GetOrCreateChild(word); + } + + CHECK(currNode != NULL); + } + + // finally, the source LHS + //currNode = currNode->GetOrCreateChild(sourceLHS); + //CHECK(currNode != NULL); + + + return *currNode; +} + +void PhraseDictionaryFuzzyMatch::SortAndPrune(PhraseDictionaryNodeMemory &rootNode) +{ + if (GetTableLimit()) { + rootNode.Sort(GetTableLimit()); + } +} + +void PhraseDictionaryFuzzyMatch::CleanUpAfterSentenceProcessing(const InputType &source) +{ + m_collection.erase(source.GetTranslationId()); +} + +const PhraseDictionaryNodeMemory &PhraseDictionaryFuzzyMatch::GetRootNode(const InputType &source) const +{ + long transId = source.GetTranslationId(); + std::map::const_iterator iter = m_collection.find(transId); + CHECK(iter != m_collection.end()); + return iter->second; +} +PhraseDictionaryNodeMemory &PhraseDictionaryFuzzyMatch::GetRootNode(const InputType &source) +{ + long transId = source.GetTranslationId(); + std::map::iterator iter = m_collection.find(transId); + CHECK(iter != m_collection.end()); + return iter->second; +} + +TO_STRING_BODY(PhraseDictionaryFuzzyMatch); + +// friend +ostream& operator<<(ostream& out, const PhraseDictionaryFuzzyMatch& phraseDict) +{ + typedef PhraseDictionaryNodeMemory::TerminalMap TermMap; + typedef PhraseDictionaryNodeMemory::NonTerminalMap NonTermMap; + + /* + const PhraseDictionaryNodeMemory &coll = phraseDict.m_collection; + for (NonTermMap::const_iterator p = coll.m_nonTermMap.begin(); p != coll.m_nonTermMap.end(); ++p) { + const Word &sourceNonTerm = p->first.first; + out << sourceNonTerm; + } + for (TermMap::const_iterator p = coll.m_sourceTermMap.begin(); p != coll.m_sourceTermMap.end(); ++p) { + const Word &sourceTerm = p->first; + out << sourceTerm; + } + */ + + return out; +} - PhraseDictionaryNodeMemory &PhraseDictionaryFuzzyMatch::GetOrCreateNode(PhraseDictionaryNodeMemory &rootNode - , const Phrase &source - , const TargetPhrase &target - , const 
Word *sourceLHS) - { - cerr << source << endl << target << endl; - const size_t size = source.GetSize(); - - const AlignmentInfo &alignmentInfo = target.GetAlignNonTerm(); - AlignmentInfo::const_iterator iterAlign = alignmentInfo.begin(); - - PhraseDictionaryNodeMemory *currNode = &rootNode; - for (size_t pos = 0 ; pos < size ; ++pos) { - const Word& word = source.GetWord(pos); - - if (word.IsNonTerminal()) { - // indexed by source label 1st - const Word &sourceNonTerm = word; - - CHECK(iterAlign != alignmentInfo.end()); - CHECK(iterAlign->first == pos); - size_t targetNonTermInd = iterAlign->second; - ++iterAlign; - const Word &targetNonTerm = target.GetWord(targetNonTermInd); - - currNode = currNode->GetOrCreateChild(sourceNonTerm, targetNonTerm); - } else { - currNode = currNode->GetOrCreateChild(word); - } - - CHECK(currNode != NULL); - } - - // finally, the source LHS - //currNode = currNode->GetOrCreateChild(sourceLHS); - //CHECK(currNode != NULL); - - - return *currNode; - } - - void PhraseDictionaryFuzzyMatch::SortAndPrune(PhraseDictionaryNodeMemory &rootNode) - { - if (GetTableLimit()) - { - rootNode.Sort(GetTableLimit()); - } - } - - void PhraseDictionaryFuzzyMatch::CleanUpAfterSentenceProcessing(const InputType &source) - { - m_collection.erase(source.GetTranslationId()); - } - - const PhraseDictionaryNodeMemory &PhraseDictionaryFuzzyMatch::GetRootNode(const InputType &source) const - { - long transId = source.GetTranslationId(); - std::map::const_iterator iter = m_collection.find(transId); - CHECK(iter != m_collection.end()); - return iter->second; - } - PhraseDictionaryNodeMemory &PhraseDictionaryFuzzyMatch::GetRootNode(const InputType &source) - { - long transId = source.GetTranslationId(); - std::map::iterator iter = m_collection.find(transId); - CHECK(iter != m_collection.end()); - return iter->second; - } - - TO_STRING_BODY(PhraseDictionaryFuzzyMatch); - - // friend - ostream& operator<<(ostream& out, const PhraseDictionaryFuzzyMatch& phraseDict) - { - typedef PhraseDictionaryNodeMemory::TerminalMap TermMap; - typedef PhraseDictionaryNodeMemory::NonTerminalMap NonTermMap; - - /* - const PhraseDictionaryNodeMemory &coll = phraseDict.m_collection; - for (NonTermMap::const_iterator p = coll.m_nonTermMap.begin(); p != coll.m_nonTermMap.end(); ++p) { - const Word &sourceNonTerm = p->first.first; - out << sourceNonTerm; - } - for (TermMap::const_iterator p = coll.m_sourceTermMap.begin(); p != coll.m_sourceTermMap.end(); ++p) { - const Word &sourceTerm = p->first; - out << sourceTerm; - } - */ - - return out; - } - } diff --git a/moses/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.h b/moses/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.h index 8e4d20423..94966b175 100644 --- a/moses/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.h +++ b/moses/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.h @@ -1,17 +1,17 @@ /*********************************************************************** Moses - statistical machine translation system Copyright (C) 2006-2011 University of Edinburgh - + This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. - + This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU Lesser General Public License for more details. - + You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA @@ -29,61 +29,60 @@ namespace Moses { - class PhraseDictionaryNodeMemory; - - /** Implementation of a SCFG rule table in a trie. Looking up a rule of - * length n symbols requires n look-ups to find the TargetPhraseCollection. - */ - class PhraseDictionaryFuzzyMatch : public PhraseDictionary - { - friend std::ostream& operator<<(std::ostream&, const PhraseDictionaryFuzzyMatch&); - friend class RuleTableLoader; - - public: - PhraseDictionaryFuzzyMatch(const std::string &line); - bool Load(const std::vector &input - , const std::vector &output - , const std::string &initStr - , size_t tableLimit); - - const PhraseDictionaryNodeMemory &GetRootNode(const InputType &source) const; - - ChartRuleLookupManager *CreateRuleLookupManager( - const InputType &, - const ChartCellCollectionBase &); - void InitializeForInput(InputType const& inputSentence); - void CleanUpAfterSentenceProcessing(const InputType& source); - - virtual const TargetPhraseCollection *GetTargetPhraseCollection(const Phrase& src) const - { - assert(false); - return NULL; - } - - TO_STRING(); - - protected: - TargetPhraseCollection &GetOrCreateTargetPhraseCollection(PhraseDictionaryNodeMemory &rootNode - , const Phrase &source - , const TargetPhrase &target - , const Word *sourceLHS); - - PhraseDictionaryNodeMemory &GetOrCreateNode(PhraseDictionaryNodeMemory &rootNode - , const Phrase &source - , const TargetPhrase &target - , const Word *sourceLHS); - - void SortAndPrune(PhraseDictionaryNodeMemory &rootNode); - PhraseDictionaryNodeMemory &GetRootNode(const InputType &source); +class PhraseDictionaryNodeMemory; - std::map m_collection; - std::vector m_config; - - const std::vector *m_input, *m_output; - const std::vector *m_weight; - - tmmt::FuzzyMatchWrapper *m_FuzzyMatchWrapper; +/** Implementation of a SCFG rule table in a trie. Looking up a rule of + * length n symbols requires n look-ups to find the TargetPhraseCollection. 
+ */ +class PhraseDictionaryFuzzyMatch : public PhraseDictionary +{ + friend std::ostream& operator<<(std::ostream&, const PhraseDictionaryFuzzyMatch&); + friend class RuleTableLoader; + +public: + PhraseDictionaryFuzzyMatch(const std::string &line); + bool Load(const std::vector &input + , const std::vector &output + , const std::string &initStr + , size_t tableLimit); + + const PhraseDictionaryNodeMemory &GetRootNode(const InputType &source) const; + + ChartRuleLookupManager *CreateRuleLookupManager( + const InputType &, + const ChartCellCollectionBase &); + void InitializeForInput(InputType const& inputSentence); + void CleanUpAfterSentenceProcessing(const InputType& source); + + virtual const TargetPhraseCollection *GetTargetPhraseCollection(const Phrase& src) const { + assert(false); + return NULL; + } + + TO_STRING(); + +protected: + TargetPhraseCollection &GetOrCreateTargetPhraseCollection(PhraseDictionaryNodeMemory &rootNode + , const Phrase &source + , const TargetPhrase &target + , const Word *sourceLHS); + + PhraseDictionaryNodeMemory &GetOrCreateNode(PhraseDictionaryNodeMemory &rootNode + , const Phrase &source + , const TargetPhrase &target + , const Word *sourceLHS); + + void SortAndPrune(PhraseDictionaryNodeMemory &rootNode); + PhraseDictionaryNodeMemory &GetRootNode(const InputType &source); + + std::map m_collection; + std::vector m_config; + + const std::vector *m_input, *m_output; + const std::vector *m_weight; + + tmmt::FuzzyMatchWrapper *m_FuzzyMatchWrapper; + +}; - }; - } // namespace Moses diff --git a/moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.cpp b/moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.cpp index cd509f544..38cf247af 100644 --- a/moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.cpp +++ b/moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.cpp @@ -1,4 +1,4 @@ - // vim:tabstop=2 +// vim:tabstop=2 /*********************************************************************** Moses - factored phrase-based language decoder Copyright (C) 2010 Hieu Hoang diff --git a/moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.h b/moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.h index 9b186def9..874478cdc 100644 --- a/moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.h +++ b/moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.h @@ -35,7 +35,7 @@ class TargetPhraseCollection; class DottedRuleStackOnDisk; /** Implementation of on-disk phrase table for hierarchical/syntax model. - */ + */ class PhraseDictionaryOnDisk : public PhraseDictionary { typedef PhraseDictionary MyBase; diff --git a/moses/TranslationModel/RuleTable/Trie.cpp b/moses/TranslationModel/RuleTable/Trie.cpp index c3590074d..950271d29 100644 --- a/moses/TranslationModel/RuleTable/Trie.cpp +++ b/moses/TranslationModel/RuleTable/Trie.cpp @@ -1,17 +1,17 @@ /*********************************************************************** Moses - statistical machine translation system Copyright (C) 2006-2012 University of Edinburgh - + This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. - + This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
- + You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA @@ -38,7 +38,7 @@ bool RuleTableTrie::InitDictionary() { std::auto_ptr loader = - Moses::RuleTableLoaderFactory::Create(m_filePath); + Moses::RuleTableLoaderFactory::Create(m_filePath); if (!loader.get()) { return false; } diff --git a/moses/TranslationModel/RuleTable/Trie.h b/moses/TranslationModel/RuleTable/Trie.h index 822ef8b92..c2f757ab8 100644 --- a/moses/TranslationModel/RuleTable/Trie.h +++ b/moses/TranslationModel/RuleTable/Trie.h @@ -1,17 +1,17 @@ /*********************************************************************** Moses - statistical machine translation system Copyright (C) 2006-2012 University of Edinburgh - + This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. - + This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. - + You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA @@ -39,28 +39,27 @@ class Word; */ class RuleTableTrie : public PhraseDictionary { - public: +public: RuleTableTrie(const std::string &description, const std::string &line) - : PhraseDictionary(description, line) - {} + : PhraseDictionary(description, line) + {} virtual ~RuleTableTrie(); bool InitDictionary(); // Required by PhraseDictionary. 
- virtual const TargetPhraseCollection *GetTargetPhraseCollection(const Phrase &) const - { + virtual const TargetPhraseCollection *GetTargetPhraseCollection(const Phrase &) const { CHECK(false); return NULL; } - private: +private: friend class RuleTableLoader; virtual TargetPhraseCollection &GetOrCreateTargetPhraseCollection( - const Phrase &source, const TargetPhrase &target, - const Word *sourceLHS) = 0; + const Phrase &source, const TargetPhrase &target, + const Word *sourceLHS) = 0; virtual void SortAndPrune() = 0; diff --git a/moses/TranslationModel/RuleTable/UTrie.cpp b/moses/TranslationModel/RuleTable/UTrie.cpp index bcfc0d538..17f457f22 100644 --- a/moses/TranslationModel/RuleTable/UTrie.cpp +++ b/moses/TranslationModel/RuleTable/UTrie.cpp @@ -39,15 +39,15 @@ namespace Moses { TargetPhraseCollection &RuleTableUTrie::GetOrCreateTargetPhraseCollection( - const Phrase &source, const TargetPhrase &target, const Word *sourceLHS) + const Phrase &source, const TargetPhrase &target, const Word *sourceLHS) { UTrieNode &currNode = GetOrCreateNode(source, target, sourceLHS); return currNode.GetOrCreateTargetPhraseCollection(target); } UTrieNode &RuleTableUTrie::GetOrCreateNode(const Phrase &source, - const TargetPhrase &target, - const Word */*sourceLHS*/) + const TargetPhrase &target, + const Word */*sourceLHS*/) { const size_t size = source.GetSize(); diff --git a/moses/TranslationModel/RuleTable/UTrie.h b/moses/TranslationModel/RuleTable/UTrie.h index d31e22cc7..a8f218158 100644 --- a/moses/TranslationModel/RuleTable/UTrie.h +++ b/moses/TranslationModel/RuleTable/UTrie.h @@ -1,17 +1,17 @@ /*********************************************************************** Moses - statistical machine translation system Copyright (C) 2006-2012 University of Edinburgh - + This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. - + This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
- + You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA @@ -43,21 +43,23 @@ class Word; */ class RuleTableUTrie : public RuleTableTrie { - public: +public: RuleTableUTrie(const std::string &line) - : RuleTableTrie("RuleTableUTrie", line) + : RuleTableTrie("RuleTableUTrie", line) {} - const UTrieNode &GetRootNode() const { return m_root; } + const UTrieNode &GetRootNode() const { + return m_root; + } ChartRuleLookupManager *CreateRuleLookupManager(const InputType &, - const ChartCellCollectionBase &); + const ChartCellCollectionBase &); - private: +private: const TargetPhraseCollection *GetTargetPhraseCollection(const Phrase &) const; TargetPhraseCollection &GetOrCreateTargetPhraseCollection( - const Phrase &source, const TargetPhrase &target, const Word *sourceLHS); + const Phrase &source, const TargetPhrase &target, const Word *sourceLHS); UTrieNode &GetOrCreateNode(const Phrase &source, const TargetPhrase &target, const Word *sourceLHS); diff --git a/moses/TranslationModel/RuleTable/UTrieNode.cpp b/moses/TranslationModel/RuleTable/UTrieNode.cpp index d2275422e..725f02c97 100644 --- a/moses/TranslationModel/RuleTable/UTrieNode.cpp +++ b/moses/TranslationModel/RuleTable/UTrieNode.cpp @@ -1,17 +1,17 @@ /*********************************************************************** Moses - statistical machine translation system Copyright (C) 2006-2012 University of Edinburgh - + This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. - + This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. - + You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA @@ -90,7 +90,7 @@ UTrieNode *UTrieNode::GetOrCreateNonTerminalChild(const Word &targetNonTerm) } TargetPhraseCollection &UTrieNode::GetOrCreateTargetPhraseCollection( - const TargetPhrase &target) + const TargetPhrase &target) { const AlignmentInfo &alignmentInfo = target.GetAlignNonTerm(); const size_t rank = alignmentInfo.GetSize(); diff --git a/moses/TranslationModel/RuleTable/UTrieNode.h b/moses/TranslationModel/RuleTable/UTrieNode.h index b3d82cddc..436bcbea1 100644 --- a/moses/TranslationModel/RuleTable/UTrieNode.h +++ b/moses/TranslationModel/RuleTable/UTrieNode.h @@ -1,17 +1,17 @@ /*********************************************************************** Moses - statistical machine translation system Copyright (C) 2006-2012 University of Edinburgh - + This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. - + This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
- + You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA @@ -42,49 +42,62 @@ class RuleTableUTrie; //! @todo ask phil williams - whats the diff between this and phrasedictionaryNode class UTrieNode { - public: +public: typedef std::vector > LabelTable; #if defined(BOOST_VERSION) && (BOOST_VERSION >= 104200) typedef boost::unordered_map TerminalMap; + UTrieNode, + TerminalHasher, + TerminalEqualityPred> TerminalMap; typedef boost::unordered_map, - TargetPhraseCollection> LabelMap; + TargetPhraseCollection> LabelMap; #else typedef std::map TerminalMap; typedef std::map, TargetPhraseCollection> LabelMap; #endif - ~UTrieNode() { delete m_gapNode; } + ~UTrieNode() { + delete m_gapNode; + } - const LabelTable &GetLabelTable() const { return m_labelTable; } - const LabelMap &GetLabelMap() const { return m_labelMap; } - const TerminalMap &GetTerminalMap() const { return m_terminalMap; } + const LabelTable &GetLabelTable() const { + return m_labelTable; + } + const LabelMap &GetLabelMap() const { + return m_labelMap; + } + const TerminalMap &GetTerminalMap() const { + return m_terminalMap; + } - const UTrieNode *GetNonTerminalChild() const { return m_gapNode; } + const UTrieNode *GetNonTerminalChild() const { + return m_gapNode; + } UTrieNode *GetOrCreateTerminalChild(const Word &sourceTerm); UTrieNode *GetOrCreateNonTerminalChild(const Word &targetNonTerm); TargetPhraseCollection &GetOrCreateTargetPhraseCollection( - const TargetPhrase &); + const TargetPhrase &); - bool IsLeaf() const { return m_terminalMap.empty() && m_gapNode == NULL; } + bool IsLeaf() const { + return m_terminalMap.empty() && m_gapNode == NULL; + } - bool HasRules() const { return !m_labelMap.empty(); } + bool HasRules() const { + return !m_labelMap.empty(); + } void Prune(size_t tableLimit); void Sort(size_t tableLimit); - private: +private: friend class RuleTableUTrie; UTrieNode() : m_gapNode(NULL) {} - int InsertLabel(int i, const Word &w) - { + int InsertLabel(int i, const Word &w) { std::vector &inner = m_labelTable[i]; for (size_t j = 0; j < inner.size(); ++j) { if (inner[j] == w) { diff --git a/moses/TranslationModel/Scope3Parser/ApplicableRuleTrie.cpp b/moses/TranslationModel/Scope3Parser/ApplicableRuleTrie.cpp index ec5c1d8f1..b635dc050 100644 --- a/moses/TranslationModel/Scope3Parser/ApplicableRuleTrie.cpp +++ b/moses/TranslationModel/Scope3Parser/ApplicableRuleTrie.cpp @@ -1,17 +1,17 @@ /*********************************************************************** Moses - statistical machine translation system Copyright (C) 2006-2012 University of Edinburgh - + This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. - + This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
- + You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA @@ -39,7 +39,7 @@ void ApplicableRuleTrie::Extend(const UTrieNode &root, int minPos, size_t index = *r; if (index == (size_t)minPos || (followsGap && index > (size_t)minPos) || minPos == -1) { ApplicableRuleTrie *subTrie = new ApplicableRuleTrie(index, index, - child); + child); subTrie->Extend(child, index+1, sentMap, false); m_children.push_back(subTrie); } diff --git a/moses/TranslationModel/Scope3Parser/ApplicableRuleTrie.h b/moses/TranslationModel/Scope3Parser/ApplicableRuleTrie.h index 35243adde..9d2f2cda9 100644 --- a/moses/TranslationModel/Scope3Parser/ApplicableRuleTrie.h +++ b/moses/TranslationModel/Scope3Parser/ApplicableRuleTrie.h @@ -33,14 +33,13 @@ struct VarSpanNode; /** @todo what is this? */ -struct ApplicableRuleTrie -{ - public: +struct ApplicableRuleTrie { +public: ApplicableRuleTrie(int start, int end, const UTrieNode &node) - : m_start(start) - , m_end(end) - , m_node(&node) - , m_vstNode(NULL) {} + : m_start(start) + , m_end(end) + , m_node(&node) + , m_vstNode(NULL) {} ~ApplicableRuleTrie() { RemoveAllInColl(m_children); diff --git a/moses/TranslationModel/Scope3Parser/IntermediateVarSpanNode.h b/moses/TranslationModel/Scope3Parser/IntermediateVarSpanNode.h index 353fabf22..499085127 100644 --- a/moses/TranslationModel/Scope3Parser/IntermediateVarSpanNode.h +++ b/moses/TranslationModel/Scope3Parser/IntermediateVarSpanNode.h @@ -26,9 +26,8 @@ namespace Moses /** @todo what is this? */ -struct IntermediateVarSpanNode -{ - public: +struct IntermediateVarSpanNode { +public: typedef std::pair Range; IntermediateVarSpanNode() @@ -41,8 +40,12 @@ struct IntermediateVarSpanNode , m_end(end) , m_numSplitPoints(0) {} - bool isOpen() { return m_end.second == -1; } - bool isClosed() { return !isOpen(); } + bool isOpen() { + return m_end.second == -1; + } + bool isClosed() { + return !isOpen(); + } Range m_start; Range m_end; diff --git a/moses/TranslationModel/Scope3Parser/Parser.cpp b/moses/TranslationModel/Scope3Parser/Parser.cpp index bfcacb1ed..81e156b3d 100644 --- a/moses/TranslationModel/Scope3Parser/Parser.cpp +++ b/moses/TranslationModel/Scope3Parser/Parser.cpp @@ -1,17 +1,17 @@ /*********************************************************************** Moses - statistical machine translation system Copyright (C) 2006-2012 University of Edinburgh - + This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. - + This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
- + You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA @@ -38,8 +38,8 @@ namespace Moses { void Scope3Parser::GetChartRuleCollection( - const WordsRange &range, - ChartParserCallback &outColl) + const WordsRange &range, + ChartParserCallback &outColl) { const size_t start = range.GetStartPos(); const size_t end = range.GetEndPos(); @@ -122,7 +122,7 @@ void Scope3Parser::InitRuleApplicationVector() } void Scope3Parser::FillSentenceMap( - const Sentence &sent, SentenceMap &sentMap) + const Sentence &sent, SentenceMap &sentMap) { for (size_t i = 0; i < sent.GetSize(); ++i) { sentMap[sent.GetWord(i)].push_back(i); @@ -130,10 +130,10 @@ void Scope3Parser::FillSentenceMap( } void Scope3Parser::AddRulesToCells( - const ApplicableRuleTrie &node, - std::pair start, - int maxPos, - int depth) + const ApplicableRuleTrie &node, + std::pair start, + int maxPos, + int depth) { if (depth > 0) { // Determine the start range for this path if not already known. @@ -183,7 +183,7 @@ void Scope3Parser::AddRulesToCells( break; } m_ruleApplications[i][span].push_back(std::make_pair(node.m_node, - node.m_vstNode)); + node.m_vstNode)); } } } diff --git a/moses/TranslationModel/Scope3Parser/Parser.h b/moses/TranslationModel/Scope3Parser/Parser.h index 0b5e63d95..2a46de9a8 100644 --- a/moses/TranslationModel/Scope3Parser/Parser.h +++ b/moses/TranslationModel/Scope3Parser/Parser.h @@ -1,17 +1,17 @@ /*********************************************************************** Moses - statistical machine translation system Copyright (C) 2006-2012 University of Edinburgh - + This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. - + This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. - + You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA @@ -46,15 +46,14 @@ class WordsRange; */ class Scope3Parser : public ChartRuleLookupManager { - public: +public: Scope3Parser(const InputType &sentence, const ChartCellCollectionBase &cellColl, const RuleTableUTrie &ruleTable, size_t maxChartSpan) - : ChartRuleLookupManager(sentence, cellColl) - , m_ruleTable(ruleTable) - , m_maxChartSpan(maxChartSpan) - { + : ChartRuleLookupManager(sentence, cellColl) + , m_ruleTable(ruleTable) + , m_maxChartSpan(maxChartSpan) { Init(); } @@ -62,23 +61,21 @@ class Scope3Parser : public ChartRuleLookupManager const WordsRange &range, ChartParserCallback &outColl); - private: +private: // Define a callback type for use by StackLatticeSearcher. 
- struct MatchCallback - { - public: - MatchCallback(const WordsRange &range, - ChartParserCallback &out) - : m_range(range) - , m_out(out) - , m_tpc(NULL) {} - void operator()(const StackVec &stackVec) - { - m_out.Add(*m_tpc, stackVec, m_range); - } - const WordsRange &m_range; - ChartParserCallback &m_out; - const TargetPhraseCollection *m_tpc; + struct MatchCallback { + public: + MatchCallback(const WordsRange &range, + ChartParserCallback &out) + : m_range(range) + , m_out(out) + , m_tpc(NULL) {} + void operator()(const StackVec &stackVec) { + m_out.Add(*m_tpc, stackVec, m_range); + } + const WordsRange &m_range; + ChartParserCallback &m_out; + const TargetPhraseCollection *m_tpc; }; void Init(); @@ -89,7 +86,7 @@ class Scope3Parser : public ChartRuleLookupManager const RuleTableUTrie &m_ruleTable; std::vector > > > m_ruleApplications; + std::pair > > > m_ruleApplications; std::auto_ptr m_varSpanTrie; StackVec m_emptyStackVec; const size_t m_maxChartSpan; diff --git a/moses/TranslationModel/Scope3Parser/SentenceMap.h b/moses/TranslationModel/Scope3Parser/SentenceMap.h index 9bc46db93..a7a1fdad9 100644 --- a/moses/TranslationModel/Scope3Parser/SentenceMap.h +++ b/moses/TranslationModel/Scope3Parser/SentenceMap.h @@ -29,7 +29,7 @@ namespace Moses { typedef boost::unordered_map, - TerminalHasher, - TerminalEqualityPred> SentenceMap; + std::vector, + TerminalHasher, + TerminalEqualityPred> SentenceMap; } diff --git a/moses/TranslationModel/Scope3Parser/StackLatticeBuilder.cpp b/moses/TranslationModel/Scope3Parser/StackLatticeBuilder.cpp index bb553a116..26e4e6aca 100644 --- a/moses/TranslationModel/Scope3Parser/StackLatticeBuilder.cpp +++ b/moses/TranslationModel/Scope3Parser/StackLatticeBuilder.cpp @@ -28,14 +28,14 @@ namespace Moses { void StackLatticeBuilder::Build( - int start, - int end, - const UTrieNode &ruleNode, - const VarSpanNode &varSpanNode, - const std::vector &ranges, - const ChartRuleLookupManager &manager, - StackLattice &lattice, - std::vector > &checkTable) + int start, + int end, + const UTrieNode &ruleNode, + const VarSpanNode &varSpanNode, + const std::vector &ranges, + const ChartRuleLookupManager &manager, + StackLattice &lattice, + std::vector > &checkTable) { // Extend the lattice if necessary. Do not shrink it. const size_t span = end - start + 1; diff --git a/moses/TranslationModel/Scope3Parser/StackLatticeBuilder.h b/moses/TranslationModel/Scope3Parser/StackLatticeBuilder.h index 7091e8f18..551655e30 100644 --- a/moses/TranslationModel/Scope3Parser/StackLatticeBuilder.h +++ b/moses/TranslationModel/Scope3Parser/StackLatticeBuilder.h @@ -32,7 +32,7 @@ class ChartCellCollection; */ class StackLatticeBuilder { - public: +public: StackLatticeBuilder() {} void Build(int, int, const UTrieNode &, const VarSpanNode &, diff --git a/moses/TranslationModel/Scope3Parser/StackLatticeSearcher.h b/moses/TranslationModel/Scope3Parser/StackLatticeSearcher.h index 749a3a2c1..4deac31f8 100644 --- a/moses/TranslationModel/Scope3Parser/StackLatticeSearcher.h +++ b/moses/TranslationModel/Scope3Parser/StackLatticeSearcher.h @@ -1,17 +1,17 @@ /*********************************************************************** Moses - statistical machine translation system Copyright (C) 2006-2012 University of Edinburgh - + This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. 
- + This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. - + You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA @@ -33,22 +33,20 @@ class ChartHypothesisCollection; template class StackLatticeSearcher { - public: +public: StackLatticeSearcher(const StackLattice &lattice, const std::vector &ranges) - : m_lattice(lattice) - , m_ranges(ranges) {} + : m_lattice(lattice) + , m_ranges(ranges) {} - void Search(const std::vector &labels, MatchCallBackType &callback) - { + void Search(const std::vector &labels, MatchCallBackType &callback) { m_labels = &labels; m_matchCB = &callback; SearchInner(0, 0); } - private: - void SearchInner(int start, size_t index) - { +private: + void SearchInner(int start, size_t index) { assert(m_stackVec.size() == index); const VarSpanNode::NonTermRange &range = m_ranges[index]; diff --git a/moses/TranslationModel/Scope3Parser/VarSpanNode.h b/moses/TranslationModel/Scope3Parser/VarSpanNode.h index 52dc32382..0dda6a787 100644 --- a/moses/TranslationModel/Scope3Parser/VarSpanNode.h +++ b/moses/TranslationModel/Scope3Parser/VarSpanNode.h @@ -33,9 +33,8 @@ namespace Moses /** @todo what is this? */ -struct VarSpanNode -{ - public: +struct VarSpanNode { +public: struct NonTermRange { size_t s1; size_t s2; @@ -48,8 +47,7 @@ struct VarSpanNode VarSpanNode() : m_parent(0), m_label(0), m_rank(0) {} - VarSpanNode &Insert(const NodeVec &vec) - { + VarSpanNode &Insert(const NodeVec &vec) { if (vec.empty()) { return *this; } @@ -59,8 +57,7 @@ struct VarSpanNode // Given a span, determine the ranges of possible start and end offsets // for each non-terminal. 
void CalculateRanges(int start, int end, - std::vector &ranges) const - { + std::vector &ranges) const { ranges.resize(m_rank); const VarSpanNode *n = this; size_t firstIndex = m_rank; @@ -103,10 +100,9 @@ struct VarSpanNode size_t m_rank; MapType m_children; - private: +private: VarSpanNode &Insert(NodeVec::const_iterator first, - NodeVec::const_iterator last) - { + NodeVec::const_iterator last) { assert(first != last); KeyType key; @@ -117,7 +113,7 @@ struct VarSpanNode key[4] = first->m_numSplitPoints; std::pair result = m_children.insert( - std::make_pair(key, VarSpanNode())); + std::make_pair(key, VarSpanNode())); VarSpanNode &child = result.first->second; if (result.second) { child.m_parent = this; diff --git a/moses/TranslationModel/Scope3Parser/VarSpanTrieBuilder.cpp b/moses/TranslationModel/Scope3Parser/VarSpanTrieBuilder.cpp index 16b180ea5..35e66978b 100644 --- a/moses/TranslationModel/Scope3Parser/VarSpanTrieBuilder.cpp +++ b/moses/TranslationModel/Scope3Parser/VarSpanTrieBuilder.cpp @@ -30,7 +30,7 @@ namespace Moses { std::auto_ptr VarSpanTrieBuilder::Build( - ApplicableRuleTrie &root) + ApplicableRuleTrie &root) { std::auto_ptr vstRoot(new VarSpanNode()); NodeVec vec; diff --git a/moses/TranslationModel/Scope3Parser/VarSpanTrieBuilder.h b/moses/TranslationModel/Scope3Parser/VarSpanTrieBuilder.h index 13c701b4f..2513a2878 100644 --- a/moses/TranslationModel/Scope3Parser/VarSpanTrieBuilder.h +++ b/moses/TranslationModel/Scope3Parser/VarSpanTrieBuilder.h @@ -34,13 +34,12 @@ struct VarSpanNode; */ class VarSpanTrieBuilder { - public: +public: std::auto_ptr Build(ApplicableRuleTrie &); - private: +private: typedef std::vector NodeVec; - struct NodeVecState - { + struct NodeVecState { std::size_t m_size; IntermediateVarSpanNode m_lastNode; }; diff --git a/moses/TranslationModel/fuzzy-match/Alignments.cpp b/moses/TranslationModel/fuzzy-match/Alignments.cpp index f15d82a5e..142aff251 100644 --- a/moses/TranslationModel/fuzzy-match/Alignments.cpp +++ b/moses/TranslationModel/fuzzy-match/Alignments.cpp @@ -8,12 +8,11 @@ using namespace std; using namespace Moses; Alignments::Alignments(const std::string &str, size_t sourceSize, size_t targetSize) -:m_alignS2T(sourceSize) -,m_alignT2S(targetSize) + :m_alignS2T(sourceSize) + ,m_alignT2S(targetSize) { vector toks = Tokenize(str, " "); - for (size_t i = 0; i < toks.size(); ++i) - { + for (size_t i = 0; i < toks.size(); ++i) { string &tok = toks[i]; vector point = Tokenize(tok, "-"); @@ -25,20 +24,18 @@ Alignments::Alignments(const std::string &str, size_t sourceSize, size_t targetS std::map &targets = m_alignS2T[ point[0] ]; iter = targets.find(point[1]); if (iter == targets .end()) { - targets[ point[1] ] = 0; - } - else { - ++(iter->second); + targets[ point[1] ] = 0; + } else { + ++(iter->second); } // m_alignedToS std::map &sources = m_alignT2S[ point[1] ]; iter = sources.find(point[0]); if (iter == targets .end()) { - sources[ point[0] ] = 0; - } - else { - ++(iter->second); + sources[ point[0] ] = 0; + } else { + ++(iter->second); } } diff --git a/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp b/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp index 065368ca7..a4264f6a4 100644 --- a/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp +++ b/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp @@ -17,10 +17,10 @@ using namespace std; -namespace tmmt +namespace tmmt { - FuzzyMatchWrapper::FuzzyMatchWrapper(const std::string &sourcePath, const std::string &targetPath, const std::string &alignmentPath) 
+FuzzyMatchWrapper::FuzzyMatchWrapper(const std::string &sourcePath, const std::string &targetPath, const std::string &alignmentPath) :basic_flag(false) ,lsed_flag(true) ,refined_flag(true) @@ -30,790 +30,735 @@ namespace tmmt ,multiple_flag(true) ,multiple_slack(0) ,multiple_max(100) - { - cerr << "creating suffix array" << endl; - suffixArray = new tmmt::SuffixArray( sourcePath ); +{ + cerr << "creating suffix array" << endl; + suffixArray = new tmmt::SuffixArray( sourcePath ); - //cerr << "loading source data" << endl; - //load_corpus(sourcePath, source); + //cerr << "loading source data" << endl; + //load_corpus(sourcePath, source); - cerr << "loading target data" << endl; - load_target(targetPath, targetAndAlignment); + cerr << "loading target data" << endl; + load_target(targetPath, targetAndAlignment); - cerr << "loading alignment" << endl; - load_alignment(alignmentPath, targetAndAlignment); + cerr << "loading alignment" << endl; + load_alignment(alignmentPath, targetAndAlignment); - // create suffix array - //load_corpus(m_config[0], input); - - cerr << "loading completed" << endl; - } + // create suffix array + //load_corpus(m_config[0], input); - string FuzzyMatchWrapper::Extract(long translationId, const string &dirNameStr) - { - const Moses::StaticData &staticData = Moses::StaticData::Instance(); + cerr << "loading completed" << endl; +} - WordIndex wordIndex; +string FuzzyMatchWrapper::Extract(long translationId, const string &dirNameStr) +{ + const Moses::StaticData &staticData = Moses::StaticData::Instance(); - string fuzzyMatchFile = ExtractTM(wordIndex, translationId, dirNameStr); - - // create extrac files - create_xml(fuzzyMatchFile); + WordIndex wordIndex; - // create phrase table with usual Moses scoring and consolidate programs - string cmd; - cmd = "LC_ALL=C sort " + fuzzyMatchFile + ".extract | gzip -c > " - + fuzzyMatchFile + ".extract.sorted.gz"; - system(cmd.c_str()); - cmd = "LC_ALL=C sort " + fuzzyMatchFile + ".extract.inv | gzip -c > " - + fuzzyMatchFile + ".extract.inv.sorted.gz"; - system(cmd.c_str()); + string fuzzyMatchFile = ExtractTM(wordIndex, translationId, dirNameStr); + + // create extrac files + create_xml(fuzzyMatchFile); + + // create phrase table with usual Moses scoring and consolidate programs + string cmd; + cmd = "LC_ALL=C sort " + fuzzyMatchFile + ".extract | gzip -c > " + + fuzzyMatchFile + ".extract.sorted.gz"; + system(cmd.c_str()); + cmd = "LC_ALL=C sort " + fuzzyMatchFile + ".extract.inv | gzip -c > " + + fuzzyMatchFile + ".extract.inv.sorted.gz"; + system(cmd.c_str()); #ifdef IS_XCODE - cmd = "/Users/hieuhoang/unison/workspace/github/moses-smt/bin"; + cmd = "/Users/hieuhoang/unison/workspace/github/moses-smt/bin"; #elif IS_ECLIPSE - cmd = "/home/hieu/workspace/github/moses-smt/bin"; + cmd = "/home/hieu/workspace/github/moses-smt/bin"; #else - cmd = staticData.GetBinDirectory(); + cmd = staticData.GetBinDirectory(); #endif - cmd += string("/../scripts/training/train-model.perl -dont-zip -first-step 6 -last-step 6 -f en -e fr -hierarchical ") - + " -extract-file " + fuzzyMatchFile + ".extract -lexical-file - -score-options \"--NoLex\" " - + " -phrase-translation-table " + fuzzyMatchFile + ".pt"; - system(cmd.c_str()); + cmd += string("/../scripts/training/train-model.perl -dont-zip -first-step 6 -last-step 6 -f en -e fr -hierarchical ") + + " -extract-file " + fuzzyMatchFile + ".extract -lexical-file - -score-options \"--NoLex\" " + + " -phrase-translation-table " + fuzzyMatchFile + ".pt"; + system(cmd.c_str()); - return fuzzyMatchFile 
+ ".pt.gz"; - } - - string FuzzyMatchWrapper::ExtractTM(WordIndex &wordIndex, long translationId, const string &dirNameStr) - { - const std::vector< std::vector< WORD_ID > > &source = suffixArray->GetCorpus(); + return fuzzyMatchFile + ".pt.gz"; +} - string inputPath = dirNameStr + "/in"; - string fuzzyMatchFile = dirNameStr + "/fuzzyMatchFile"; - ofstream fuzzyMatchStream(fuzzyMatchFile.c_str()); - - vector< vector< WORD_ID > > input; - load_corpus(inputPath, input); - - assert(input.size() == 1); - size_t sentenceInd = 0; - - clock_t start_clock = clock(); - // if (i % 10 == 0) cerr << "."; - - // establish some basic statistics - - // int input_length = compute_length( input[i] ); - int input_length = input[sentenceInd].size(); - int best_cost = input_length * (100-min_match) / 100 + 1; - - int match_count = 0; // how many substring matches to be considered - //cerr << endl << "sentence " << i << ", length " << input_length << ", best_cost " << best_cost << endl; - - // find match ranges in suffix array - vector< vector< pair< SuffixArray::INDEX, SuffixArray::INDEX > > > match_range; - for(size_t start=0;startGetSize()-1; - vector< string > substring; - bool stillMatched = true; - vector< pair< SuffixArray::INDEX, SuffixArray::INDEX > > matchedAtThisStart; - //cerr << "start: " << start; - for(int word=start; stillMatched && wordFindMatches( substring, first_match, last_match, prior_first_match, prior_last_match ) ) - { - stillMatched = true; - matchedAtThisStart.push_back( make_pair( first_match, last_match ) ); - //cerr << " (" << first_match << "," << last_match << ")"; - //cerr << " " << ( last_match - first_match + 1 ); - prior_first_match = first_match; - prior_last_match = last_match; - } - //} - } - //cerr << endl; - match_range.push_back( matchedAtThisStart ); - } - - clock_t clock_range = clock(); - - map< int, vector< Match > > sentence_match; - map< int, int > sentence_match_word_count; - - // go through all matches, longest first - for(int length = input[sentenceInd].size(); length >= 1; length--) - { - // do not create matches, if these are handled by the short match function - if (length <= short_match_max_length( input_length ) ) - { - continue; - } - - unsigned int count = 0; - for(int start = 0; start <= input[sentenceInd].size() - length; start++) - { - if (match_range[start].size() >= length) - { - pair< SuffixArray::INDEX, SuffixArray::INDEX > &range = match_range[start][length-1]; - // cerr << " (" << range.first << "," << range.second << ")"; - count += range.second - range.first + 1; - - for(SuffixArray::INDEX i=range.first; i<=range.second; i++) - { - int position = suffixArray->GetPosition( i ); - - // sentence length mismatch - size_t sentence_id = suffixArray->GetSentence( position ); - int sentence_length = suffixArray->GetSentenceLength( sentence_id ); - int diff = abs( (int)sentence_length - (int)input_length ); - // cerr << endl << i << "\tsentence " << sentence_id << ", length " << sentence_length; - //if (length <= 2 && input_length>=5 && - // sentence_match.find( sentence_id ) == sentence_match.end()) - // continue; - - if (diff > best_cost) - continue; - - // compute minimal cost - int start_pos = suffixArray->GetWordInSentence( position ); - int end_pos = start_pos + length-1; - // cerr << endl << "\t" << start_pos << "-" << end_pos << " (" << sentence_length << ") vs. 
" - // << start << "-" << (start+length-1) << " (" << input_length << ")"; - // different number of prior words -> cost is at least diff - int min_cost = abs( start - start_pos ); - - // same number of words, but not sent. start -> cost is at least 1 - if (start == start_pos && start>0) - min_cost++; - - // different number of remaining words -> cost is at least diff - min_cost += abs( ( sentence_length-1 - end_pos ) - - ( input_length-1 - (start+length-1) ) ); - - // same number of words, but not sent. end -> cost is at least 1 - if ( sentence_length-1 - end_pos == - input_length-1 - (start+length-1) - && end_pos != sentence_length-1 ) - min_cost++; - - // cerr << " -> min_cost " << min_cost; - if (min_cost > best_cost) - continue; - - // valid match - match_count++; - - // compute maximal cost - int max_cost = max( start, start_pos ) - + max( sentence_length-1 - end_pos, - input_length-1 - (start+length-1) ); - // cerr << ", max_cost " << max_cost; - - Match m = Match( start, start+length-1, - start_pos, start_pos+length-1, - min_cost, max_cost, 0); - sentence_match[ sentence_id ].push_back( m ); - sentence_match_word_count[ sentence_id ] += length; - - if (max_cost < best_cost) - { - best_cost = max_cost; - if (best_cost == 0) break; - } - //if (match_count >= MAX_MATCH_COUNT) break; - } - } - // cerr << endl; - if (best_cost == 0) break; - //if (match_count >= MAX_MATCH_COUNT) break; - } - // cerr << count << " matches at length " << length << " in " << sentence_match.size() << " tm." << endl; - - if (best_cost == 0) break; - //if (match_count >= MAX_MATCH_COUNT) break; - } - cerr << match_count << " matches in " << sentence_match.size() << " sentences." << endl; - - clock_t clock_matches = clock(); - - // consider each sentence for which we have matches - int old_best_cost = best_cost; - int tm_count_word_match = 0; - int tm_count_word_match2 = 0; - int pruned_match_count = 0; - if (short_match_max_length( input_length )) - { - init_short_matches(wordIndex, translationId, input[sentenceInd] ); - } - vector< int > best_tm; - typedef map< int, vector< Match > >::iterator I; - - clock_t clock_validation_sum = 0; - - for(I tm=sentence_match.begin(); tm!=sentence_match.end(); tm++) - { - int tmID = tm->first; - int tm_length = suffixArray->GetSentenceLength(tmID); - vector< Match > &match = tm->second; - add_short_matches(wordIndex, translationId, match, source[tmID], input_length, best_cost ); - - //cerr << "match in sentence " << tmID << ": " << match.size() << " [" << tm_length << "]" << endl; - - // quick look: how many words are matched - int words_matched = 0; - for(int m=0;m best_cost) - { - if (length_filter_flag) continue; - } - tm_count_word_match++; - - // prune, check again how many words are matched - vector< Match > pruned = prune_matches( match, best_cost ); - words_matched = 0; - for(int p=0;p best_cost) - { - if (length_filter_flag) continue; - } - tm_count_word_match2++; - - pruned_match_count += pruned.size(); - int prior_best_cost = best_cost; - int cost; - - clock_t clock_validation_start = clock(); - if (! 
parse_flag || - pruned.size()>=10) // to prevent worst cases - { - string path; - cost = sed( input[sentenceInd], source[tmID], path, false ); - if (cost < best_cost) - { - best_cost = cost; - } - } - - else - { - cost = parse_matches( pruned, input_length, tm_length, best_cost ); - if (prior_best_cost != best_cost) - { - best_tm.clear(); - } - } - clock_validation_sum += clock() - clock_validation_start; - if (cost == best_cost) - { - best_tm.push_back( tmID ); - } - } - cerr << "reduced best cost from " << old_best_cost << " to " << best_cost << endl; - cerr << "tm considered: " << sentence_match.size() - << " word-matched: " << tm_count_word_match - << " word-matched2: " << tm_count_word_match2 - << " best: " << best_tm.size() << endl; - - cerr << "pruned matches: " << ((float)pruned_match_count/(float)tm_count_word_match2) << endl; - - // create xml and extract files - string inputStr, sourceStr; - for (size_t pos = 0; pos < input_length; ++pos) { - inputStr += GetVocabulary().GetWord(input[sentenceInd][pos]) + " "; +string FuzzyMatchWrapper::ExtractTM(WordIndex &wordIndex, long translationId, const string &dirNameStr) +{ + const std::vector< std::vector< WORD_ID > > &source = suffixArray->GetCorpus(); + + string inputPath = dirNameStr + "/in"; + string fuzzyMatchFile = dirNameStr + "/fuzzyMatchFile"; + ofstream fuzzyMatchStream(fuzzyMatchFile.c_str()); + + vector< vector< WORD_ID > > input; + load_corpus(inputPath, input); + + assert(input.size() == 1); + size_t sentenceInd = 0; + + clock_t start_clock = clock(); + // if (i % 10 == 0) cerr << "."; + + // establish some basic statistics + + // int input_length = compute_length( input[i] ); + int input_length = input[sentenceInd].size(); + int best_cost = input_length * (100-min_match) / 100 + 1; + + int match_count = 0; // how many substring matches to be considered + //cerr << endl << "sentence " << i << ", length " << input_length << ", best_cost " << best_cost << endl; + + // find match ranges in suffix array + vector< vector< pair< SuffixArray::INDEX, SuffixArray::INDEX > > > match_range; + for(size_t start=0; startGetSize()-1; + vector< string > substring; + bool stillMatched = true; + vector< pair< SuffixArray::INDEX, SuffixArray::INDEX > > matchedAtThisStart; + //cerr << "start: " << start; + for(int word=start; stillMatched && wordFindMatches( substring, first_match, last_match, prior_first_match, prior_last_match ) ) { + stillMatched = true; + matchedAtThisStart.push_back( make_pair( first_match, last_match ) ); + //cerr << " (" << first_match << "," << last_match << ")"; + //cerr << " " << ( last_match - first_match + 1 ); + prior_first_match = first_match; + prior_last_match = last_match; + } + //} } - - // do not try to find the best ... 
report multiple matches - if (multiple_flag) { - int input_letter_length = compute_length( input[sentenceInd] ); - for(int si=0; si &sourceSentence = source[s]; - vector &targets = targetAndAlignment[s]; - create_extract(sentenceInd, best_cost, sourceSentence, targets, inputStr, path, fuzzyMatchStream); - - } - } // if (multiple_flag) - else { - - // find the best matches according to letter sed - string best_path = ""; - int best_match = -1; - int best_letter_cost; - if (lsed_flag) { - best_letter_cost = compute_length( input[sentenceInd] ) * min_match / 100 + 1; - for(int si=0; si > sentence_match; + map< int, int > sentence_match_word_count; + + // go through all matches, longest first + for(int length = input[sentenceInd].size(); length >= 1; length--) { + // do not create matches, if these are handled by the short match function + if (length <= short_match_max_length( input_length ) ) { + continue; + } + + unsigned int count = 0; + for(int start = 0; start <= input[sentenceInd].size() - length; start++) { + if (match_range[start].size() >= length) { + pair< SuffixArray::INDEX, SuffixArray::INDEX > &range = match_range[start][length-1]; + // cerr << " (" << range.first << "," << range.second << ")"; + count += range.second - range.first + 1; + + for(SuffixArray::INDEX i=range.first; i<=range.second; i++) { + int position = suffixArray->GetPosition( i ); + + // sentence length mismatch + size_t sentence_id = suffixArray->GetSentence( position ); + int sentence_length = suffixArray->GetSentenceLength( sentence_id ); + int diff = abs( (int)sentence_length - (int)input_length ); + // cerr << endl << i << "\tsentence " << sentence_id << ", length " << sentence_length; + //if (length <= 2 && input_length>=5 && + // sentence_match.find( sentence_id ) == sentence_match.end()) + // continue; + + if (diff > best_cost) + continue; + + // compute minimal cost + int start_pos = suffixArray->GetWordInSentence( position ); + int end_pos = start_pos + length-1; + // cerr << endl << "\t" << start_pos << "-" << end_pos << " (" << sentence_length << ") vs. " + // << start << "-" << (start+length-1) << " (" << input_length << ")"; + // different number of prior words -> cost is at least diff + int min_cost = abs( start - start_pos ); + + // same number of words, but not sent. start -> cost is at least 1 + if (start == start_pos && start>0) + min_cost++; + + // different number of remaining words -> cost is at least diff + min_cost += abs( ( sentence_length-1 - end_pos ) - + ( input_length-1 - (start+length-1) ) ); + + // same number of words, but not sent. 
end -> cost is at least 1 + if ( sentence_length-1 - end_pos == + input_length-1 - (start+length-1) + && end_pos != sentence_length-1 ) + min_cost++; + + // cerr << " -> min_cost " << min_cost; + if (min_cost > best_cost) + continue; + + // valid match + match_count++; + + // compute maximal cost + int max_cost = max( start, start_pos ) + + max( sentence_length-1 - end_pos, + input_length-1 - (start+length-1) ); + // cerr << ", max_cost " << max_cost; + + Match m = Match( start, start+length-1, + start_pos, start_pos+length-1, + min_cost, max_cost, 0); + sentence_match[ sentence_id ].push_back( m ); + sentence_match_word_count[ sentence_id ] += length; + + if (max_cost < best_cost) { + best_cost = max_cost; + if (best_cost == 0) break; } + //if (match_count >= MAX_MATCH_COUNT) break; } } - // if letter sed turned off, just compute path for first match - else { - if (best_tm.size() > 0) { - string path; - sed( input[sentenceInd], source[best_tm[0]], path, false ); + // cerr << endl; + if (best_cost == 0) break; + //if (match_count >= MAX_MATCH_COUNT) break; + } + // cerr << count << " matches at length " << length << " in " << sentence_match.size() << " tm." << endl; + + if (best_cost == 0) break; + //if (match_count >= MAX_MATCH_COUNT) break; + } + cerr << match_count << " matches in " << sentence_match.size() << " sentences." << endl; + + clock_t clock_matches = clock(); + + // consider each sentence for which we have matches + int old_best_cost = best_cost; + int tm_count_word_match = 0; + int tm_count_word_match2 = 0; + int pruned_match_count = 0; + if (short_match_max_length( input_length )) { + init_short_matches(wordIndex, translationId, input[sentenceInd] ); + } + vector< int > best_tm; + typedef map< int, vector< Match > >::iterator I; + + clock_t clock_validation_sum = 0; + + for(I tm=sentence_match.begin(); tm!=sentence_match.end(); tm++) { + int tmID = tm->first; + int tm_length = suffixArray->GetSentenceLength(tmID); + vector< Match > &match = tm->second; + add_short_matches(wordIndex, translationId, match, source[tmID], input_length, best_cost ); + + //cerr << "match in sentence " << tmID << ": " << match.size() << " [" << tm_length << "]" << endl; + + // quick look: how many words are matched + int words_matched = 0; + for(int m=0; m best_cost) { + if (length_filter_flag) continue; + } + tm_count_word_match++; + + // prune, check again how many words are matched + vector< Match > pruned = prune_matches( match, best_cost ); + words_matched = 0; + for(int p=0; p best_cost) { + if (length_filter_flag) continue; + } + tm_count_word_match2++; + + pruned_match_count += pruned.size(); + int prior_best_cost = best_cost; + int cost; + + clock_t clock_validation_start = clock(); + if (! 
parse_flag || + pruned.size()>=10) { // to prevent worst cases + string path; + cost = sed( input[sentenceInd], source[tmID], path, false ); + if (cost < best_cost) { + best_cost = cost; + } + } + + else { + cost = parse_matches( pruned, input_length, tm_length, best_cost ); + if (prior_best_cost != best_cost) { + best_tm.clear(); + } + } + clock_validation_sum += clock() - clock_validation_start; + if (cost == best_cost) { + best_tm.push_back( tmID ); + } + } + cerr << "reduced best cost from " << old_best_cost << " to " << best_cost << endl; + cerr << "tm considered: " << sentence_match.size() + << " word-matched: " << tm_count_word_match + << " word-matched2: " << tm_count_word_match2 + << " best: " << best_tm.size() << endl; + + cerr << "pruned matches: " << ((float)pruned_match_count/(float)tm_count_word_match2) << endl; + + // create xml and extract files + string inputStr, sourceStr; + for (size_t pos = 0; pos < input_length; ++pos) { + inputStr += GetVocabulary().GetWord(input[sentenceInd][pos]) + " "; + } + + // do not try to find the best ... report multiple matches + if (multiple_flag) { + int input_letter_length = compute_length( input[sentenceInd] ); + for(int si=0; si &sourceSentence = source[s]; + vector &targets = targetAndAlignment[s]; + create_extract(sentenceInd, best_cost, sourceSentence, targets, inputStr, path, fuzzyMatchStream); + + } + } // if (multiple_flag) + else { + + // find the best matches according to letter sed + string best_path = ""; + int best_match = -1; + int best_letter_cost; + if (lsed_flag) { + best_letter_cost = compute_length( input[sentenceInd] ) * min_match / 100 + 1; + for(int si=0; si 0) { + string path; + sed( input[sentenceInd], source[best_tm[0]], path, false ); + best_path = path; + best_match = best_tm[0]; } - //cout << best_cost <<"/" << input_length; - if (lsed_flag) { - //cout << ")"; + } + cerr << "elapsed: " << (1000 * (clock()-start_clock) / CLOCKS_PER_SEC) + << " ( range: " << (1000 * (clock_range-start_clock) / CLOCKS_PER_SEC) + << " match: " << (1000 * (clock_matches-clock_range) / CLOCKS_PER_SEC) + << " tm: " << (1000 * (clock()-clock_matches) / CLOCKS_PER_SEC) + << " (validation: " << (1000 * (clock_validation_sum) / CLOCKS_PER_SEC) << ")" + << " )" << endl; + if (lsed_flag) { + //cout << best_letter_cost << "/" << compute_length( input[sentenceInd] ) << " ("; + } + //cout << best_cost <<"/" << input_length; + if (lsed_flag) { + //cout << ")"; + } + //cout << " ||| " << best_match << " ||| " << best_path << endl; + + if (best_match == -1) { + CHECK(source.size()); + best_match = 0; + } + + // creat xml & extracts + const vector &sourceSentence = source[best_match]; + vector &targets = targetAndAlignment[best_match]; + create_extract(sentenceInd, best_cost, sourceSentence, targets, inputStr, best_path, fuzzyMatchStream); + + } // else if (multiple_flag) + + fuzzyMatchStream.close(); + + return fuzzyMatchFile; +} + +void FuzzyMatchWrapper::load_corpus( const std::string &fileName, vector< vector< WORD_ID > > &corpus ) +{ + // source + ifstream fileStream; + fileStream.open(fileName.c_str()); + if (!fileStream) { + cerr << "file not found: " << fileName << endl; + exit(1); + } + cerr << "loading " << fileName << endl; + + istream *fileStreamP = &fileStream; + + char line[LINE_MAX_LENGTH]; + while(true) { + SAFE_GETLINE((*fileStreamP), line, LINE_MAX_LENGTH, '\n'); + if (fileStreamP->eof()) break; + corpus.push_back( GetVocabulary().Tokenize( line ) ); + } +} + +void FuzzyMatchWrapper::load_target(const std::string &fileName, 
vector< vector< SentenceAlignment > > &corpus) +{ + ifstream fileStream; + fileStream.open(fileName.c_str()); + if (!fileStream) { + cerr << "file not found: " << fileName << endl; + exit(1); + } + cerr << "loading " << fileName << endl; + + istream *fileStreamP = &fileStream; + + WORD_ID delimiter = GetVocabulary().StoreIfNew("|||"); + + int lineNum = 0; + char line[LINE_MAX_LENGTH]; + while(true) { + SAFE_GETLINE((*fileStreamP), line, LINE_MAX_LENGTH, '\n'); + if (fileStreamP->eof()) break; + + vector toks = GetVocabulary().Tokenize( line ); + + corpus.push_back(vector< SentenceAlignment >()); + vector< SentenceAlignment > &vec = corpus.back(); + + vec.push_back(SentenceAlignment()); + SentenceAlignment *sentence = &vec.back(); + + const WORD &countStr = GetVocabulary().GetWord(toks[0]); + sentence->count = atoi(countStr.c_str()); + + for (size_t i = 1; i < toks.size(); ++i) { + WORD_ID wordId = toks[i]; + + if (wordId == delimiter) { + // target and alignments can have multiple sentences. + vec.push_back(SentenceAlignment()); + sentence = &vec.back(); + + // count + ++i; + + const WORD &countStr = GetVocabulary().GetWord(toks[i]); + sentence->count = atoi(countStr.c_str()); + } else { + // just a normal word, add + sentence->target.push_back(wordId); } - //cout << " ||| " << best_match << " ||| " << best_path << endl; - - if (best_match == -1) { - CHECK(source.size()); - best_match = 0; - } - - // creat xml & extracts - const vector &sourceSentence = source[best_match]; - vector &targets = targetAndAlignment[best_match]; - create_extract(sentenceInd, best_cost, sourceSentence, targets, inputStr, best_path, fuzzyMatchStream); - - } // else if (multiple_flag) - - fuzzyMatchStream.close(); - - return fuzzyMatchFile; + } + + ++lineNum; + } - void FuzzyMatchWrapper::load_corpus( const std::string &fileName, vector< vector< WORD_ID > > &corpus ) - { // source - ifstream fileStream; - fileStream.open(fileName.c_str()); - if (!fileStream) { - cerr << "file not found: " << fileName << endl; - exit(1); - } - cerr << "loading " << fileName << endl; - - istream *fileStreamP = &fileStream; - - char line[LINE_MAX_LENGTH]; - while(true) - { - SAFE_GETLINE((*fileStreamP), line, LINE_MAX_LENGTH, '\n'); - if (fileStreamP->eof()) break; - corpus.push_back( GetVocabulary().Tokenize( line ) ); - } +} + + +void FuzzyMatchWrapper::load_alignment(const std::string &fileName, vector< vector< SentenceAlignment > > &corpus ) +{ + ifstream fileStream; + fileStream.open(fileName.c_str()); + if (!fileStream) { + cerr << "file not found: " << fileName << endl; + exit(1); } - - void FuzzyMatchWrapper::load_target(const std::string &fileName, vector< vector< SentenceAlignment > > &corpus) - { - ifstream fileStream; - fileStream.open(fileName.c_str()); - if (!fileStream) { - cerr << "file not found: " << fileName << endl; - exit(1); - } - cerr << "loading " << fileName << endl; - - istream *fileStreamP = &fileStream; - - WORD_ID delimiter = GetVocabulary().StoreIfNew("|||"); - - int lineNum = 0; - char line[LINE_MAX_LENGTH]; - while(true) - { - SAFE_GETLINE((*fileStreamP), line, LINE_MAX_LENGTH, '\n'); - if (fileStreamP->eof()) break; - - vector toks = GetVocabulary().Tokenize( line ); - - corpus.push_back(vector< SentenceAlignment >()); - vector< SentenceAlignment > &vec = corpus.back(); - - vec.push_back(SentenceAlignment()); - SentenceAlignment *sentence = &vec.back(); - - const WORD &countStr = GetVocabulary().GetWord(toks[0]); - sentence->count = atoi(countStr.c_str()); - - for (size_t i = 1; i < toks.size(); ++i) 
{ - WORD_ID wordId = toks[i]; - - if (wordId == delimiter) { - // target and alignments can have multiple sentences. - vec.push_back(SentenceAlignment()); - sentence = &vec.back(); - - // count - ++i; - - const WORD &countStr = GetVocabulary().GetWord(toks[i]); - sentence->count = atoi(countStr.c_str()); - } - else { - // just a normal word, add - sentence->target.push_back(wordId); - } + cerr << "loading " << fileName << endl; + + istream *fileStreamP = &fileStream; + + string delimiter = "|||"; + + int lineNum = 0; + char line[LINE_MAX_LENGTH]; + while(true) { + SAFE_GETLINE((*fileStreamP), line, LINE_MAX_LENGTH, '\n'); + if (fileStreamP->eof()) break; + + vector< SentenceAlignment > &vec = corpus[lineNum]; + size_t targetInd = 0; + SentenceAlignment *sentence = &vec[targetInd]; + + vector toks = Moses::Tokenize(line); + + for (size_t i = 0; i < toks.size(); ++i) { + string &tok = toks[i]; + + if (tok == delimiter) { + // target and alignments can have multiple sentences. + ++targetInd; + sentence = &vec[targetInd]; + + ++i; + } else { + // just a normal alignment, add + vector alignPoint = Moses::Tokenize(tok, "-"); + assert(alignPoint.size() == 2); + sentence->alignment.push_back(pair(alignPoint[0], alignPoint[1])); } - - ++lineNum; - } - + + ++lineNum; + } - - - void FuzzyMatchWrapper::load_alignment(const std::string &fileName, vector< vector< SentenceAlignment > > &corpus ) - { - ifstream fileStream; - fileStream.open(fileName.c_str()); - if (!fileStream) { - cerr << "file not found: " << fileName << endl; - exit(1); - } - cerr << "loading " << fileName << endl; - - istream *fileStreamP = &fileStream; - - string delimiter = "|||"; - - int lineNum = 0; - char line[LINE_MAX_LENGTH]; - while(true) - { - SAFE_GETLINE((*fileStreamP), line, LINE_MAX_LENGTH, '\n'); - if (fileStreamP->eof()) break; - - vector< SentenceAlignment > &vec = corpus[lineNum]; - size_t targetInd = 0; - SentenceAlignment *sentence = &vec[targetInd]; - - vector toks = Moses::Tokenize(line); - - for (size_t i = 0; i < toks.size(); ++i) { - string &tok = toks[i]; - - if (tok == delimiter) { - // target and alignments can have multiple sentences. 
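        // (Editorial note, not part of the patch: each corpus line packs
        // several candidate translations separated by "|||". load_target
        // reads them as "<count> w1 w2 ... ||| <count> w1 w2 ..."; this
        // alignment file holds the matching "s-t" point lists, e.g. the
        // hypothetical line "0-0 1-1 2-3 ||| 1 0-0 1-2", where the token
        // right after each "|||" is skipped by the "++i" below, presumably
        // the count field mirroring load_target.)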
- ++targetInd; - sentence = &vec[targetInd]; - - ++i; - } - else { - // just a normal alignment, add - vector alignPoint = Moses::Tokenize(tok, "-"); - assert(alignPoint.size() == 2); - sentence->alignment.push_back(pair(alignPoint[0], alignPoint[1])); - } - } - - ++lineNum; - - } - } - - bool FuzzyMatchWrapper::GetLSEDCache(const std::pair< WORD_ID, WORD_ID > &key, unsigned int &value) const - { +} + +bool FuzzyMatchWrapper::GetLSEDCache(const std::pair< WORD_ID, WORD_ID > &key, unsigned int &value) const +{ #ifdef WITH_THREADS - boost::shared_lock read_lock(m_accessLock); + boost::shared_lock read_lock(m_accessLock); #endif - map< pair< WORD_ID, WORD_ID >, unsigned int >::const_iterator lookup = m_lsed.find( key ); - if (lookup != m_lsed.end()) { - value = lookup->second; - return true; - } - - return false; + map< pair< WORD_ID, WORD_ID >, unsigned int >::const_iterator lookup = m_lsed.find( key ); + if (lookup != m_lsed.end()) { + value = lookup->second; + return true; } - void FuzzyMatchWrapper::SetLSEDCache(const std::pair< WORD_ID, WORD_ID > &key, const unsigned int &value) - { + return false; +} + +void FuzzyMatchWrapper::SetLSEDCache(const std::pair< WORD_ID, WORD_ID > &key, const unsigned int &value) +{ #ifdef WITH_THREADS - boost::unique_lock lock(m_accessLock); + boost::unique_lock lock(m_accessLock); #endif - m_lsed[ key ] = value; - } + m_lsed[ key ] = value; +} /* Letter string edit distance, e.g. sub 'their' to 'there' costs 2 */ unsigned int FuzzyMatchWrapper::letter_sed( WORD_ID aIdx, WORD_ID bIdx ) { - // check if already computed -> lookup in cache - pair< WORD_ID, WORD_ID > pIdx = make_pair( aIdx, bIdx ); - unsigned int value; - bool ret = GetLSEDCache(pIdx, value); - if (ret) { - return value; - } - - // get surface strings for word indices - const string &a = GetVocabulary().GetWord( aIdx ); - const string &b = GetVocabulary().GetWord( bIdx ); - - // initialize cost matrix - unsigned int **cost = (unsigned int**) calloc( sizeof( unsigned int* ), a.size()+1 ); - for( unsigned int i=0; i<=a.size(); i++ ) { - cost[i] = (unsigned int*) calloc( sizeof(unsigned int), b.size()+1 ); - cost[i][0] = i; - } - for( unsigned int j=0; j<=b.size(); j++ ) { - cost[0][j] = j; - } - - // core string edit distance loop - for( unsigned int i=1; i<=a.size(); i++ ) { - for( unsigned int j=1; j<=b.size(); j++ ) { - - unsigned int ins = cost[i-1][j] + 1; - unsigned int del = cost[i][j-1] + 1; - bool match = (a.substr(i-1,1).compare( b.substr(j-1,1) ) == 0); - unsigned int diag = cost[i-1][j-1] + (match ? 0 : 1); - - unsigned int min = (ins < del) ? ins : del; - min = (diag < min) ? 
diag : min; - - cost[i][j] = min; - } - } - - // clear out memory - unsigned int final = cost[a.size()][b.size()]; - for( unsigned int i=0; i<=a.size(); i++ ) { - free( cost[i] ); - } - free( cost ); - - // cache and return result - SetLSEDCache(pIdx, final); - return final; -} - - /* string edit distance implementation */ - - unsigned int FuzzyMatchWrapper::sed( const vector< WORD_ID > &a, const vector< WORD_ID > &b, string &best_path, bool use_letter_sed ) { - - // initialize cost and path matrices - unsigned int **cost = (unsigned int**) calloc( sizeof( unsigned int* ), a.size()+1 ); - char **path = (char**) calloc( sizeof( char* ), a.size()+1 ); - - for( unsigned int i=0; i<=a.size(); i++ ) { - cost[i] = (unsigned int*) calloc( sizeof(unsigned int), b.size()+1 ); - path[i] = (char*) calloc( sizeof(char), b.size()+1 ); - if (i>0) - { - cost[i][0] = cost[i-1][0]; - if (use_letter_sed) - { - cost[i][0] += GetVocabulary().GetWord( a[i-1] ).size(); - } - else - { - cost[i][0]++; - } - } - else - { - cost[i][0] = 0; - } - path[i][0] = 'I'; - } - - for( unsigned int j=0; j<=b.size(); j++ ) { - if (j>0) - { - cost[0][j] = cost[0][j-1]; - if (use_letter_sed) - { - cost[0][j] += GetVocabulary().GetWord( b[j-1] ).size(); - } - else - { - cost[0][j]++; - } - } - else - { - cost[0][j] = 0; - } - path[0][j] = 'D'; - } - - // core string edit distance algorithm - for( unsigned int i=1; i<=a.size(); i++ ) { - for( unsigned int j=1; j<=b.size(); j++ ) { - unsigned int ins = cost[i-1][j]; - unsigned int del = cost[i][j-1]; - unsigned int match; - if (use_letter_sed) - { - ins += GetVocabulary().GetWord( a[i-1] ).size(); - del += GetVocabulary().GetWord( b[j-1] ).size(); - match = letter_sed( a[i-1], b[j-1] ); - } - else - { - ins++; - del++; - match = ( a[i-1] == b[j-1] ) ? 0 : 1; - } - unsigned int diag = cost[i-1][j-1] + match; - - char action = (ins < del) ? 'I' : 'D'; - unsigned int min = (ins < del) ? ins : del; - if (diag < min) - { - action = (match>0) ? 
'S' : 'M'; - min = diag; - } - - cost[i][j] = min; - path[i][j] = action; - } - } - - // construct string for best path - unsigned int i = a.size(); - unsigned int j = b.size(); - best_path = ""; - while( i>0 || j>0 ) - { - best_path = path[i][j] + best_path; - if (path[i][j] == 'I') - { - i--; - } - else if (path[i][j] == 'D') - { - j--; - } - else - { - i--; - j--; - } - } - - - // clear out memory - unsigned int final = cost[a.size()][b.size()]; - - for( unsigned int i=0; i<=a.size(); i++ ) { - free( cost[i] ); - free( path[i] ); - } - free( cost ); - free( path ); - - // return result - return final; + // check if already computed -> lookup in cache + pair< WORD_ID, WORD_ID > pIdx = make_pair( aIdx, bIdx ); + unsigned int value; + bool ret = GetLSEDCache(pIdx, value); + if (ret) { + return value; } -/* utlility function: compute length of sentence in characters + // get surface strings for word indices + const string &a = GetVocabulary().GetWord( aIdx ); + const string &b = GetVocabulary().GetWord( bIdx ); + + // initialize cost matrix + unsigned int **cost = (unsigned int**) calloc( sizeof( unsigned int* ), a.size()+1 ); + for( unsigned int i=0; i<=a.size(); i++ ) { + cost[i] = (unsigned int*) calloc( sizeof(unsigned int), b.size()+1 ); + cost[i][0] = i; + } + for( unsigned int j=0; j<=b.size(); j++ ) { + cost[0][j] = j; + } + + // core string edit distance loop + for( unsigned int i=1; i<=a.size(); i++ ) { + for( unsigned int j=1; j<=b.size(); j++ ) { + + unsigned int ins = cost[i-1][j] + 1; + unsigned int del = cost[i][j-1] + 1; + bool match = (a.substr(i-1,1).compare( b.substr(j-1,1) ) == 0); + unsigned int diag = cost[i-1][j-1] + (match ? 0 : 1); + + unsigned int min = (ins < del) ? ins : del; + min = (diag < min) ? diag : min; + + cost[i][j] = min; + } + } + + // clear out memory + unsigned int final = cost[a.size()][b.size()]; + for( unsigned int i=0; i<=a.size(); i++ ) { + free( cost[i] ); + } + free( cost ); + + // cache and return result + SetLSEDCache(pIdx, final); + return final; +} + +/* string edit distance implementation */ + +unsigned int FuzzyMatchWrapper::sed( const vector< WORD_ID > &a, const vector< WORD_ID > &b, string &best_path, bool use_letter_sed ) +{ + + // initialize cost and path matrices + unsigned int **cost = (unsigned int**) calloc( sizeof( unsigned int* ), a.size()+1 ); + char **path = (char**) calloc( sizeof( char* ), a.size()+1 ); + + for( unsigned int i=0; i<=a.size(); i++ ) { + cost[i] = (unsigned int*) calloc( sizeof(unsigned int), b.size()+1 ); + path[i] = (char*) calloc( sizeof(char), b.size()+1 ); + if (i>0) { + cost[i][0] = cost[i-1][0]; + if (use_letter_sed) { + cost[i][0] += GetVocabulary().GetWord( a[i-1] ).size(); + } else { + cost[i][0]++; + } + } else { + cost[i][0] = 0; + } + path[i][0] = 'I'; + } + + for( unsigned int j=0; j<=b.size(); j++ ) { + if (j>0) { + cost[0][j] = cost[0][j-1]; + if (use_letter_sed) { + cost[0][j] += GetVocabulary().GetWord( b[j-1] ).size(); + } else { + cost[0][j]++; + } + } else { + cost[0][j] = 0; + } + path[0][j] = 'D'; + } + + // core string edit distance algorithm + for( unsigned int i=1; i<=a.size(); i++ ) { + for( unsigned int j=1; j<=b.size(); j++ ) { + unsigned int ins = cost[i-1][j]; + unsigned int del = cost[i][j-1]; + unsigned int match; + if (use_letter_sed) { + ins += GetVocabulary().GetWord( a[i-1] ).size(); + del += GetVocabulary().GetWord( b[j-1] ).size(); + match = letter_sed( a[i-1], b[j-1] ); + } else { + ins++; + del++; + match = ( a[i-1] == b[j-1] ) ? 
0 : 1; + } + unsigned int diag = cost[i-1][j-1] + match; + + char action = (ins < del) ? 'I' : 'D'; + unsigned int min = (ins < del) ? ins : del; + if (diag < min) { + action = (match>0) ? 'S' : 'M'; + min = diag; + } + + cost[i][j] = min; + path[i][j] = action; + } + } + + // construct string for best path + unsigned int i = a.size(); + unsigned int j = b.size(); + best_path = ""; + while( i>0 || j>0 ) { + best_path = path[i][j] + best_path; + if (path[i][j] == 'I') { + i--; + } else if (path[i][j] == 'D') { + j--; + } else { + i--; + j--; + } + } + + + // clear out memory + unsigned int final = cost[a.size()][b.size()]; + + for( unsigned int i=0; i<=a.size(); i++ ) { + free( cost[i] ); + free( path[i] ); + } + free( cost ); + free( path ); + + // return result + return final; +} + +/* utlility function: compute length of sentence in characters (spaces do not count) */ unsigned int FuzzyMatchWrapper::compute_length( const vector< WORD_ID > &sentence ) { - unsigned int length = 0; for( unsigned int i=0; i > source, - vector< vector< WORD_ID > > input ) +int FuzzyMatchWrapper::basic_fuzzy_match( vector< vector< WORD_ID > > source, + vector< vector< WORD_ID > > input ) { - // go through input set... - for(unsigned int i=0;i= best_cost)) - { - continue; - } - - // compute string edit distance - string path; - unsigned int cost = sed( input[i], source[s], path, use_letter_sed ); - - // update if new best - if (cost < best_cost) - { - best_cost = cost; - best_path = path; - best_match = s; - } - } - //cout << best_cost << " ||| " << best_match << " ||| " << best_path << endl; - } + // go through input set... + for(unsigned int i=0; i= best_cost)) { + continue; + } + + // compute string edit distance + string path; + unsigned int cost = sed( input[i], source[s], path, use_letter_sed ); + + // update if new best + if (cost < best_cost) { + best_cost = cost; + best_path = path; + best_match = s; + } + } + //cout << best_cost << " ||| " << best_match << " ||| " << best_path << endl; + } } /* definition of short matches @@ -823,274 +768,250 @@ unsigned int FuzzyMatchWrapper::compute_length( const vector< WORD_ID > &sentenc int FuzzyMatchWrapper::short_match_max_length( int input_length ) { - if ( ! refined_flag ) + if ( ! refined_flag ) return 0; if ( input_length >= 5 ) return 1; - return 0; + return 0; } /* if we have non-short matches in a sentence, we need to - take a closer look at it. + take a closer look at it. 
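   (worked example, added for illustration and not in the original: the
   constructor sets refined_flag to true, so short_match_max_length(4)
   returns 0 while short_match_max_length(5) and short_match_max_length(20)
   both return 1; in other words, only single-word matches are ever treated
   as "short" by the functions below)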
this function creates a hash map for all input words and their positions - (to be used by the next function) + (to be used by the next function) (done here, because this has be done only once for an input sentence) */ void FuzzyMatchWrapper::init_short_matches(WordIndex &wordIndex, long translationId, const vector< WORD_ID > &input ) { - int max_length = short_match_max_length( input.size() ); - if (max_length == 0) - return; - - wordIndex.clear(); - - // store input words and their positions in hash map - for(int i=0; i position_vector; - wordIndex[ input[i] ] = position_vector; - } - wordIndex[ input[i] ].push_back( i ); - } + int max_length = short_match_max_length( input.size() ); + if (max_length == 0) + return; + + wordIndex.clear(); + + // store input words and their positions in hash map + for(int i=0; i position_vector; + wordIndex[ input[i] ] = position_vector; + } + wordIndex[ input[i] ].push_back( i ); + } } /* add all short matches to list of matches for a sentence */ void FuzzyMatchWrapper::add_short_matches(WordIndex &wordIndex, long translationId, vector< Match > &match, const vector< WORD_ID > &tm, int input_length, int best_cost ) -{ - int max_length = short_match_max_length( input_length ); - if (max_length == 0) - return; - - int tm_length = tm.size(); - map< WORD_ID,vector< int > >::iterator input_word_hit; - for(int t_pos=0; t_pos &position_vector = input_word_hit->second; - for(int j=0; j0 && i_pos == t_pos ) - min_cost++; - - // after match - max_cost += max( (input_length-i_pos) , (tm_length-t_pos)); - min_cost += abs( (input_length-i_pos) - (tm_length-t_pos)); - if ( i_pos != input_length-1 && (input_length-i_pos) == (tm_length-t_pos)) - min_cost++; - - if (min_cost <= best_cost) - { - Match new_match( i_pos,i_pos, t_pos,t_pos, min_cost,max_cost,0 ); - match.push_back( new_match ); - } - } - } - } +{ + int max_length = short_match_max_length( input_length ); + if (max_length == 0) + return; + + int tm_length = tm.size(); + map< WORD_ID,vector< int > >::iterator input_word_hit; + for(int t_pos=0; t_pos &position_vector = input_word_hit->second; + for(int j=0; j0 && i_pos == t_pos ) + min_cost++; + + // after match + max_cost += max( (input_length-i_pos) , (tm_length-t_pos)); + min_cost += abs( (input_length-i_pos) - (tm_length-t_pos)); + if ( i_pos != input_length-1 && (input_length-i_pos) == (tm_length-t_pos)) + min_cost++; + + if (min_cost <= best_cost) { + Match new_match( i_pos,i_pos, t_pos,t_pos, min_cost,max_cost,0 ); + match.push_back( new_match ); + } + } + } + } } /* remove matches that are subsumed by a larger match */ vector< Match > FuzzyMatchWrapper::prune_matches( const vector< Match > &match, int best_cost ) { - //cerr << "\tpruning"; - vector< Match > pruned; - for(int i=match.size()-1; i>=0; i--) - { - //cerr << " (" << match[i].input_start << "," << match[i].input_end - // << " ; " << match[i].tm_start << "," << match[i].tm_end - // << " * " << match[i].min_cost << ")"; - - //if (match[i].min_cost > best_cost) - // continue; - - bool subsumed = false; - for(int j=match.size()-1; j>=0; j--) - { - if (i!=j // do not compare match with itself - && ( match[i].input_end - match[i].input_start <= - match[j].input_end - match[j].input_start ) // i shorter than j - && ((match[i].input_start == match[j].input_start && - match[i].tm_start == match[j].tm_start ) || - (match[i].input_end == match[j].input_end && - match[i].tm_end == match[j].tm_end) ) ) - { - subsumed = true; - } - } - if (! 
subsumed && match[i].min_cost <= best_cost) - { - //cerr << "*"; - pruned.push_back( match[i] ); - } - } - //cerr << endl; - return pruned; + //cerr << "\tpruning"; + vector< Match > pruned; + for(int i=match.size()-1; i>=0; i--) { + //cerr << " (" << match[i].input_start << "," << match[i].input_end + // << " ; " << match[i].tm_start << "," << match[i].tm_end + // << " * " << match[i].min_cost << ")"; + + //if (match[i].min_cost > best_cost) + // continue; + + bool subsumed = false; + for(int j=match.size()-1; j>=0; j--) { + if (i!=j // do not compare match with itself + && ( match[i].input_end - match[i].input_start <= + match[j].input_end - match[j].input_start ) // i shorter than j + && ((match[i].input_start == match[j].input_start && + match[i].tm_start == match[j].tm_start ) || + (match[i].input_end == match[j].input_end && + match[i].tm_end == match[j].tm_end) ) ) { + subsumed = true; + } + } + if (! subsumed && match[i].min_cost <= best_cost) { + //cerr << "*"; + pruned.push_back( match[i] ); + } + } + //cerr << endl; + return pruned; } /* A* parsing method to compute string edit distance */ int FuzzyMatchWrapper::parse_matches( vector< Match > &match, int input_length, int tm_length, int &best_cost ) -{ - // cerr << "sentence has " << match.size() << " matches, best cost: " << best_cost << ", lengths input: " << input_length << " tm: " << tm_length << endl; - - if (match.size() == 1) - return match[0].max_cost; - if (match.size() == 0) - return input_length+tm_length; - - int this_best_cost = input_length + tm_length; - for(int i=0;i > multi_match; - multi_match.push_back( match ); - - int match_level = 1; - while(multi_match[ match_level-1 ].size()>0) - { - // init vector - vector< Match > empty; - multi_match.push_back( empty ); - - for(int first_level = 0; first_level <= (match_level-1)/2; first_level++) - { - int second_level = match_level - first_level -1; - //cerr << "\tcombining level " << first_level << " and " << second_level << endl; - - vector< Match > &first_match = multi_match[ first_level ]; - vector< Match > &second_match = multi_match[ second_level ]; - - for(int i1 = 0; i1 < first_match.size(); i1++) { - for(int i2 = 0; i2 < second_match.size(); i2++) { - - // do not combine the same pair twice - if (first_level == second_level && i2 <= i1) - { - continue; - } - - // get sorted matches (first is before second) - Match *first, *second; - if (first_match[i1].input_start < second_match[i2].input_start ) - { - first = &first_match[i1]; - second = &second_match[i2]; - } - else - { - second = &first_match[i1]; - first = &second_match[i2]; - } - - //cerr << "\tcombining " - // << "(" << first->input_start << "," << first->input_end << "), " - // << first->tm_start << " [" << first->internal_cost << "]" - // << " with " - // << "(" << second->input_start << "," << second->input_end << "), " - // << second->tm_start<< " [" << second->internal_cost << "]" - // << endl; - - // do not process overlapping matches - if (first->input_end >= second->input_start) - { - continue; - } - - // no overlap / mismatch in tm - if (first->tm_end >= second->tm_start) - { - continue; - } - - // compute cost - int min_cost = 0; - int max_cost = 0; - - // initial - min_cost += abs( first->input_start - first->tm_start ); - max_cost += max( first->input_start, first->tm_start ); - - // same number of words, but not sent. 
start -> cost is at least 1 - if (first->input_start == first->tm_start && first->input_start > 0) - { - min_cost++; - } - - // in-between - int skipped_words = second->input_start - first->input_end -1; - int skipped_words_tm = second->tm_start - first->tm_end -1; - int internal_cost = max( skipped_words, skipped_words_tm ); - internal_cost += first->internal_cost + second->internal_cost; - min_cost += internal_cost; - max_cost += internal_cost; - - // final - min_cost += abs( (tm_length-1 - second->tm_end) - - (input_length-1 - second->input_end) ); - max_cost += max( (tm_length-1 - second->tm_end), - (input_length-1 - second->input_end) ); - - // same number of words, but not sent. end -> cost is at least 1 - if ( ( input_length-1 - second->input_end - == tm_length-1 - second->tm_end ) - && input_length-1 != second->input_end ) - { - min_cost++; - } - - // cerr << "\tcost: " << min_cost << "-" << max_cost << endl; - - // if worst than best cost, forget it - if (min_cost > best_cost) - { - continue; - } - - // add match - Match new_match( first->input_start, - second->input_end, - first->tm_start, - second->tm_end, - min_cost, - max_cost, - internal_cost); - multi_match[ match_level ].push_back( new_match ); - // cerr << "\tstored\n"; - - // possibly updating this_best_cost - if (max_cost < this_best_cost) - { - // cerr << "\tupdating this best cost to " << max_cost << "\n"; - this_best_cost = max_cost; - - // possibly updating best_cost - if (max_cost < best_cost) - { - // cerr << "\tupdating best cost to " << max_cost << "\n"; - best_cost = max_cost; - } - } - } - } - } - match_level++; - } - return this_best_cost; +{ + // cerr << "sentence has " << match.size() << " matches, best cost: " << best_cost << ", lengths input: " << input_length << " tm: " << tm_length << endl; + + if (match.size() == 1) + return match[0].max_cost; + if (match.size() == 0) + return input_length+tm_length; + + int this_best_cost = input_length + tm_length; + for(int i=0; i > multi_match; + multi_match.push_back( match ); + + int match_level = 1; + while(multi_match[ match_level-1 ].size()>0) { + // init vector + vector< Match > empty; + multi_match.push_back( empty ); + + for(int first_level = 0; first_level <= (match_level-1)/2; first_level++) { + int second_level = match_level - first_level -1; + //cerr << "\tcombining level " << first_level << " and " << second_level << endl; + + vector< Match > &first_match = multi_match[ first_level ]; + vector< Match > &second_match = multi_match[ second_level ]; + + for(int i1 = 0; i1 < first_match.size(); i1++) { + for(int i2 = 0; i2 < second_match.size(); i2++) { + + // do not combine the same pair twice + if (first_level == second_level && i2 <= i1) { + continue; + } + + // get sorted matches (first is before second) + Match *first, *second; + if (first_match[i1].input_start < second_match[i2].input_start ) { + first = &first_match[i1]; + second = &second_match[i2]; + } else { + second = &first_match[i1]; + first = &second_match[i2]; + } + + //cerr << "\tcombining " + // << "(" << first->input_start << "," << first->input_end << "), " + // << first->tm_start << " [" << first->internal_cost << "]" + // << " with " + // << "(" << second->input_start << "," << second->input_end << "), " + // << second->tm_start<< " [" << second->internal_cost << "]" + // << endl; + + // do not process overlapping matches + if (first->input_end >= second->input_start) { + continue; + } + + // no overlap / mismatch in tm + if (first->tm_end >= second->tm_start) { + continue; + } + + // 
compute cost + int min_cost = 0; + int max_cost = 0; + + // initial + min_cost += abs( first->input_start - first->tm_start ); + max_cost += max( first->input_start, first->tm_start ); + + // same number of words, but not sent. start -> cost is at least 1 + if (first->input_start == first->tm_start && first->input_start > 0) { + min_cost++; + } + + // in-between + int skipped_words = second->input_start - first->input_end -1; + int skipped_words_tm = second->tm_start - first->tm_end -1; + int internal_cost = max( skipped_words, skipped_words_tm ); + internal_cost += first->internal_cost + second->internal_cost; + min_cost += internal_cost; + max_cost += internal_cost; + + // final + min_cost += abs( (tm_length-1 - second->tm_end) - + (input_length-1 - second->input_end) ); + max_cost += max( (tm_length-1 - second->tm_end), + (input_length-1 - second->input_end) ); + + // same number of words, but not sent. end -> cost is at least 1 + if ( ( input_length-1 - second->input_end + == tm_length-1 - second->tm_end ) + && input_length-1 != second->input_end ) { + min_cost++; + } + + // cerr << "\tcost: " << min_cost << "-" << max_cost << endl; + + // if worst than best cost, forget it + if (min_cost > best_cost) { + continue; + } + + // add match + Match new_match( first->input_start, + second->input_end, + first->tm_start, + second->tm_end, + min_cost, + max_cost, + internal_cost); + multi_match[ match_level ].push_back( new_match ); + // cerr << "\tstored\n"; + + // possibly updating this_best_cost + if (max_cost < this_best_cost) { + // cerr << "\tupdating this best cost to " << max_cost << "\n"; + this_best_cost = max_cost; + + // possibly updating best_cost + if (max_cost < best_cost) { + // cerr << "\tupdating best cost to " << max_cost << "\n"; + best_cost = max_cost; + } + } + } + } + } + match_level++; + } + return this_best_cost; } @@ -1101,22 +1022,22 @@ void FuzzyMatchWrapper::create_extract(int sentenceInd, int cost, const vector< WORD_ID wordId = sourceSentence[pos]; sourceStr += GetVocabulary().GetWord(wordId) + " "; } - + for (size_t targetInd = 0; targetInd < targets.size(); ++targetInd) { - const SentenceAlignment &sentenceAlignment = targets[targetInd]; + const SentenceAlignment &sentenceAlignment = targets[targetInd]; string targetStr = sentenceAlignment.getTargetString(GetVocabulary()); string alignStr = sentenceAlignment.getAlignmentString(); - + outputFile - << sentenceInd << endl - << cost << endl - << sourceStr << endl - << inputStr << endl - << targetStr << endl - << alignStr << endl - << path << endl - << sentenceAlignment.count << endl; - + << sentenceInd << endl + << cost << endl + << sourceStr << endl + << inputStr << endl + << targetStr << endl + << alignStr << endl + << path << endl + << sentenceAlignment.count << endl; + } } diff --git a/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.h b/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.h index a6f772fb9..d8813a65c 100644 --- a/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.h +++ b/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.h @@ -20,18 +20,18 @@ #include "Match.h" #include "moses/InputType.h" -namespace tmmt +namespace tmmt { class Match; class SentenceAlignment; - + class FuzzyMatchWrapper { public: FuzzyMatchWrapper(const std::string &source, const std::string &target, const std::string &alignment); std::string Extract(long translationId, const std::string &dirNameStr); - + protected: // tm-mt std::vector< std::vector< tmmt::SentenceAlignment > > targetAndAlignment; @@ -58,13 +58,13 @@ 
protected:
   void load_corpus( const std::string &fileName, std::vector< std::vector< tmmt::WORD_ID > > &corpus );
   void load_target( const std::string &fileName, std::vector< std::vector< tmmt::SentenceAlignment > > &corpus);
   void load_alignment( const std::string &fileName, std::vector< std::vector< tmmt::SentenceAlignment > > &corpus );
-  
+
   /** brute force method: compare input to all corpus sentences */
-  int basic_fuzzy_match( std::vector< std::vector< tmmt::WORD_ID > > source,
-                         std::vector< std::vector< tmmt::WORD_ID > > input ) ;
-  
-  /** utlility function: compute length of sentence in characters
-     (spaces do not count) */
+  int basic_fuzzy_match( std::vector< std::vector< tmmt::WORD_ID > > source,
+                         std::vector< std::vector< tmmt::WORD_ID > > input ) ;
+
+  /** utility function: compute length of sentence in characters
+      (spaces do not count) */
   unsigned int compute_length( const std::vector< tmmt::WORD_ID > &sentence );
   unsigned int letter_sed( WORD_ID aIdx, WORD_ID bIdx );
   unsigned int sed( const std::vector< WORD_ID > &a, const std::vector< WORD_ID > &b, std::string &best_path, bool use_letter_sed );
@@ -77,8 +77,9 @@ protected:
   void create_extract(int sentenceInd, int cost, const std::vector< WORD_ID > &sourceSentence, const std::vector<SentenceAlignment> &targets, const std::string &inputStr, const std::string &path, std::ofstream &outputFile);
   std::string ExtractTM(WordIndex &wordIndex, long translationId, const std::string &inputPath);
-  Vocabulary &GetVocabulary()
-  { return suffixArray->GetVocabulary(); }
+  Vocabulary &GetVocabulary() {
+    return suffixArray->GetVocabulary();
+  }
   bool GetLSEDCache(const std::pair< WORD_ID, WORD_ID > &key, unsigned int &value) const;
   void SetLSEDCache(const std::pair< WORD_ID, WORD_ID > &key, const unsigned int &value);
diff --git a/moses/TranslationModel/fuzzy-match/Match.h b/moses/TranslationModel/fuzzy-match/Match.h
index 7feb25769..f2ba2c150 100644
--- a/moses/TranslationModel/fuzzy-match/Match.h
+++ b/moses/TranslationModel/fuzzy-match/Match.h
@@ -14,17 +14,18 @@ namespace tmmt
 /* data structure for n-gram match between input and corpus */
-class Match {
+class Match
+{
 public:
-  int input_start;
-  int input_end;
-  int tm_start;
-  int tm_end;
-  int min_cost;
-  int max_cost;
-  int internal_cost;
-  Match( int is, int ie, int ts, int te, int min, int max, int i )
-    :input_start(is), input_end(ie), tm_start(ts), tm_end(te), min_cost(min), max_cost(max), internal_cost(i)
+  int input_start;
+  int input_end;
+  int tm_start;
+  int tm_end;
+  int min_cost;
+  int max_cost;
+  int internal_cost;
+  Match( int is, int ie, int ts, int te, int min, int max, int i )
+    :input_start(is), input_end(ie), tm_start(ts), tm_end(te), min_cost(min), max_cost(max), internal_cost(i)
   {}
 };
diff --git a/moses/TranslationModel/fuzzy-match/SentenceAlignment.h b/moses/TranslationModel/fuzzy-match/SentenceAlignment.h
index 30c887fc1..466baa149 100644
--- a/moses/TranslationModel/fuzzy-match/SentenceAlignment.h
+++ b/moses/TranslationModel/fuzzy-match/SentenceAlignment.h
@@ -15,20 +15,18 @@ namespace tmmt
 {
-
-struct SentenceAlignment
-{
+
+struct SentenceAlignment {
   int count;
   std::vector< WORD_ID > target;
   std::vector< std::pair<int,int> > alignment;
-
+
   SentenceAlignment() {}
-
+
   std::string getTargetString(const Vocabulary &vocab) const;
-
-  std::string getAlignmentString() const
-  {
+
+  std::string getAlignmentString() const {
     std::stringstream strme;
     for (size_t i = 0; i < alignment.size(); ++i) {
       const std::pair<int,int> &alignPair = alignment[i];
@@ -36,7 +34,7 @@ struct SentenceAlignment
     }
     return
strme.str(); } - + }; } diff --git a/moses/TranslationModel/fuzzy-match/SuffixArray.cpp b/moses/TranslationModel/fuzzy-match/SuffixArray.cpp index 8a67fd954..5f49952ce 100644 --- a/moses/TranslationModel/fuzzy-match/SuffixArray.cpp +++ b/moses/TranslationModel/fuzzy-match/SuffixArray.cpp @@ -8,247 +8,235 @@ using namespace std; namespace tmmt { -SuffixArray::SuffixArray( string fileName ) +SuffixArray::SuffixArray( string fileName ) { - m_vcb.StoreIfNew( "" ); - m_endOfSentence = m_vcb.StoreIfNew( "" ); + m_vcb.StoreIfNew( "" ); + m_endOfSentence = m_vcb.StoreIfNew( "" ); - ifstream extractFile; - char line[LINE_MAX_LENGTH]; + ifstream extractFile; + char line[LINE_MAX_LENGTH]; - // count the number of words first; - extractFile.open(fileName.c_str()); - istream *fileP = &extractFile; - m_size = 0; - size_t sentenceCount = 0; - while(!fileP->eof()) { - SAFE_GETLINE((*fileP), line, LINE_MAX_LENGTH, '\n'); - if (fileP->eof()) break; - vector< WORD_ID > words = m_vcb.Tokenize( line ); - m_size += words.size() + 1; - sentenceCount++; - } - extractFile.close(); - cerr << m_size << " words (incl. sentence boundaries)" << endl; + // count the number of words first; + extractFile.open(fileName.c_str()); + istream *fileP = &extractFile; + m_size = 0; + size_t sentenceCount = 0; + while(!fileP->eof()) { + SAFE_GETLINE((*fileP), line, LINE_MAX_LENGTH, '\n'); + if (fileP->eof()) break; + vector< WORD_ID > words = m_vcb.Tokenize( line ); + m_size += words.size() + 1; + sentenceCount++; + } + extractFile.close(); + cerr << m_size << " words (incl. sentence boundaries)" << endl; - // allocate memory - m_array = (WORD_ID*) calloc( sizeof( WORD_ID ), m_size ); - m_index = (INDEX*) calloc( sizeof( INDEX ), m_size ); - m_wordInSentence = (char*) calloc( sizeof( char ), m_size ); - m_sentence = (size_t*) calloc( sizeof( size_t ), m_size ); - m_sentenceLength = (char*) calloc( sizeof( char ), sentenceCount ); + // allocate memory + m_array = (WORD_ID*) calloc( sizeof( WORD_ID ), m_size ); + m_index = (INDEX*) calloc( sizeof( INDEX ), m_size ); + m_wordInSentence = (char*) calloc( sizeof( char ), m_size ); + m_sentence = (size_t*) calloc( sizeof( size_t ), m_size ); + m_sentenceLength = (char*) calloc( sizeof( char ), sentenceCount ); - // fill the array - int wordIndex = 0; - int sentenceId = 0; - extractFile.open(fileName.c_str()); - fileP = &extractFile; - while(!fileP->eof()) { - SAFE_GETLINE((*fileP), line, LINE_MAX_LENGTH, '\n'); - if (fileP->eof()) break; - vector< WORD_ID > words = m_vcb.Tokenize( line ); + // fill the array + int wordIndex = 0; + int sentenceId = 0; + extractFile.open(fileName.c_str()); + fileP = &extractFile; + while(!fileP->eof()) { + SAFE_GETLINE((*fileP), line, LINE_MAX_LENGTH, '\n'); + if (fileP->eof()) break; + vector< WORD_ID > words = m_vcb.Tokenize( line ); - // add to corpus vector - corpus.push_back(words); + // add to corpus vector + corpus.push_back(words); - // create SA - - vector< WORD_ID >::const_iterator i; - for( i=words.begin(); i!=words.end(); i++) - { - m_index[ wordIndex ] = wordIndex; - m_sentence[ wordIndex ] = sentenceId; - m_wordInSentence[ wordIndex ] = i-words.begin(); - m_array[ wordIndex++ ] = *i; - } - m_index[ wordIndex ] = wordIndex; - m_array[ wordIndex++ ] = m_endOfSentence; - m_sentenceLength[ sentenceId++ ] = words.size(); - } - extractFile.close(); - cerr << "done reading " << wordIndex << " words, " << sentenceId << " sentences." 
<< endl; - // List(0,9); + // create SA - // sort - m_buffer = (INDEX*) calloc( sizeof( INDEX ), m_size ); - Sort( 0, m_size-1 ); - free( m_buffer ); - cerr << "done sorting" << endl; + vector< WORD_ID >::const_iterator i; + for( i=words.begin(); i!=words.end(); i++) { + m_index[ wordIndex ] = wordIndex; + m_sentence[ wordIndex ] = sentenceId; + m_wordInSentence[ wordIndex ] = i-words.begin(); + m_array[ wordIndex++ ] = *i; + } + m_index[ wordIndex ] = wordIndex; + m_array[ wordIndex++ ] = m_endOfSentence; + m_sentenceLength[ sentenceId++ ] = words.size(); + } + extractFile.close(); + cerr << "done reading " << wordIndex << " words, " << sentenceId << " sentences." << endl; + // List(0,9); + + // sort + m_buffer = (INDEX*) calloc( sizeof( INDEX ), m_size ); + Sort( 0, m_size-1 ); + free( m_buffer ); + cerr << "done sorting" << endl; } // good ol' quick sort -void SuffixArray::Sort(INDEX start, INDEX end) { - if (start == end) return; - INDEX mid = (start+end+1)/2; - Sort( start, mid-1 ); - Sort( mid, end ); +void SuffixArray::Sort(INDEX start, INDEX end) +{ + if (start == end) return; + INDEX mid = (start+end+1)/2; + Sort( start, mid-1 ); + Sort( mid, end ); - // merge - int i = start; - int j = mid; - int k = 0; - int length = end-start+1; - while( k<length ) - { - if (i == mid) - { - m_buffer[ k++ ] = m_index[ j++ ]; - } - else if (j > end ) - { - m_buffer[ k++ ] = m_index[ i++ ]; - } - else { - if (CompareIndex( m_index[i], m_index[j] ) < 0) - { - m_buffer[ k++ ] = m_index[ i++ ]; - } - else - { - m_buffer[ k++ ] = m_index[ j++ ]; - } - } - } - - memcpy( ((char*)m_index) + sizeof( INDEX ) * start, - ((char*)m_buffer), sizeof( INDEX ) * (end-start+1) ); + // merge + int i = start; + int j = mid; + int k = 0; + int length = end-start+1; + while( k<length ) { + if (i == mid) { + m_buffer[ k++ ] = m_index[ j++ ]; + } else if (j > end ) { + m_buffer[ k++ ] = m_index[ i++ ]; + } else { + if (CompareIndex( m_index[i], m_index[j] ) < 0) { + m_buffer[ k++ ] = m_index[ i++ ]; + } else { + m_buffer[ k++ ] = m_index[ j++ ]; + } + } + } + + memcpy( ((char*)m_index) + sizeof( INDEX ) * start, + ((char*)m_buffer), sizeof( INDEX ) * (end-start+1) ); } SuffixArray::~SuffixArray() -{ - free(m_index); - free(m_array); +{ + free(m_index); + free(m_array); } int SuffixArray::CompareIndex( INDEX a, INDEX b ) const { - // skip over identical words - INDEX offset = 0; - while( a+offset < m_size && - b+offset < m_size && - m_array[ a+offset ] == m_array[ b+offset ] ) - { offset++; } - - if( a+offset == m_size ) return -1; - if( b+offset == m_size ) return 1; - return CompareWord( m_array[ a+offset ], m_array[ b+offset ] ); + // skip over identical words + INDEX offset = 0; + while( a+offset < m_size && + b+offset < m_size && + m_array[ a+offset ] == m_array[ b+offset ] ) { + offset++; + } + + if( a+offset == m_size ) return -1; + if( b+offset == m_size ) return 1; + return CompareWord( m_array[ a+offset ], m_array[ b+offset ] ); } inline int SuffixArray::CompareWord( WORD_ID a, WORD_ID b ) const { - // cerr << "c(" << m_vcb.GetWord(a) << ":" << m_vcb.GetWord(b) << ")=" << m_vcb.GetWord(a).compare( m_vcb.GetWord(b) ) << endl; - return m_vcb.GetWord(a).compare( m_vcb.GetWord(b) ); + // cerr << "c(" << m_vcb.GetWord(a) << ":" << m_vcb.GetWord(b) << ")=" << m_vcb.GetWord(a).compare( m_vcb.GetWord(b) ) << endl; + return m_vcb.GetWord(a).compare( m_vcb.GetWord(b) ); } int SuffixArray::Count( const vector< WORD > &phrase ) { - INDEX dummy; - return LimitedCount( phrase, m_size, dummy, dummy, 0, m_size-1 ); + INDEX dummy; + return LimitedCount( phrase, m_size, dummy, dummy, 0, m_size-1 ); } bool SuffixArray::MinCount( const vector< WORD > &phrase, INDEX min ) { - INDEX dummy; - return
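// Illustrative sketch (not part of the patch): despite the "good ol' quick
// sort" comment, SuffixArray::Sort above is a top-down merge sort over
// m_index with an external scratch buffer. This standalone version applies
// the same recursion and merge loop to plain ints; the names are hypothetical
// and the real code orders indexes with CompareIndex instead of operator<.
#include <cstring>
#include <iostream>

static int g_buffer[16]; // scratch space, sized for this demo only

static void MergeSort(int *a, int start, int end) {
  if (start == end) return;
  int mid = (start + end + 1) / 2; // first element of the right half
  MergeSort(a, start, mid - 1);
  MergeSort(a, mid, end);

  // merge the two sorted halves into the scratch buffer
  int i = start, j = mid, k = 0, length = end - start + 1;
  while (k < length) {
    if (i == mid)         g_buffer[k++] = a[j++]; // left half exhausted
    else if (j > end)     g_buffer[k++] = a[i++]; // right half exhausted
    else if (a[i] < a[j]) g_buffer[k++] = a[i++];
    else                  g_buffer[k++] = a[j++];
  }
  std::memcpy(a + start, g_buffer, sizeof(int) * length);
}

int main() {
  int a[] = {5, 1, 4, 2, 3};
  MergeSort(a, 0, 4);
  for (int v : a) std::cout << v << ' '; // prints 1 2 3 4 5
  std::cout << '\n';
}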
LimitedCount( phrase, min, dummy, dummy, 0, m_size-1 ) >= min; + INDEX dummy; + return LimitedCount( phrase, min, dummy, dummy, 0, m_size-1 ) >= min; } bool SuffixArray::Exists( const vector< WORD > &phrase ) { - INDEX dummy; - return LimitedCount( phrase, 1, dummy, dummy, 0, m_size-1 ) == 1; + INDEX dummy; + return LimitedCount( phrase, 1, dummy, dummy, 0, m_size-1 ) == 1; } int SuffixArray::FindMatches( const vector< WORD > &phrase, INDEX &firstMatch, INDEX &lastMatch, INDEX search_start, INDEX search_end ) { - return LimitedCount( phrase, m_size, firstMatch, lastMatch, search_start, search_end ); + return LimitedCount( phrase, m_size, firstMatch, lastMatch, search_start, search_end ); } int SuffixArray::LimitedCount( const vector< WORD > &phrase, INDEX min, INDEX &firstMatch, INDEX &lastMatch, INDEX search_start, INDEX search_end ) { - // cerr << "FindFirst\n"; - INDEX start = search_start; - INDEX end = (search_end == -1) ? (m_size-1) : search_end; - INDEX mid = FindFirst( phrase, start, end ); - // cerr << "done\n"; - if (mid == m_size) return 0; // no matches - if (min == 1) return 1; // only existance check + // cerr << "FindFirst\n"; + INDEX start = search_start; + INDEX end = (search_end == -1) ? (m_size-1) : search_end; + INDEX mid = FindFirst( phrase, start, end ); + // cerr << "done\n"; + if (mid == m_size) return 0; // no matches + if (min == 1) return 1; // only existence check - int matchCount = 1; + int matchCount = 1; - //cerr << "before...\n"; - firstMatch = FindLast( phrase, mid, start, -1 ); - matchCount += mid - firstMatch; + //cerr << "before...\n"; + firstMatch = FindLast( phrase, mid, start, -1 ); + matchCount += mid - firstMatch; - //cerr << "after...\n"; - lastMatch = FindLast( phrase, mid, end, 1 ); - matchCount += lastMatch - mid; + //cerr << "after...\n"; + lastMatch = FindLast( phrase, mid, end, 1 ); + matchCount += lastMatch - mid; - return matchCount; + return matchCount; } SuffixArray::INDEX SuffixArray::FindLast( const vector< WORD > &phrase, INDEX start, INDEX end, int direction ) { - end += direction; - while(true) - { - INDEX mid = ( start + end + (direction>0 ?
0 : 1) )/2; - if (match == 0) // mid point is a match - start = mid; - else - end = mid; - } + int match = Match( phrase, mid ); + int matchNext = Match( phrase, mid+direction ); + //cerr << "\t" << start << ";" << mid << ";" << end << " -> " << match << "," << matchNext << endl; + + if (match == 0 && matchNext != 0) return mid; + + if (match == 0) // mid point is a match + start = mid; + else + end = mid; + } } SuffixArray::INDEX SuffixArray::FindFirst( const vector< WORD > &phrase, INDEX &start, INDEX &end ) { - while(true) - { - INDEX mid = ( start + end + 1 )/2; - //cerr << "FindFirst(" << start << ";" << mid << ";" << end << ")\n"; - int match = Match( phrase, mid ); - - if (match == 0) return mid; - if (start >= end && match != 0 ) return m_size; - - if (match > 0) - start = mid+1; - else - end = mid-1; - } + while(true) { + INDEX mid = ( start + end + 1 )/2; + //cerr << "FindFirst(" << start << ";" << mid << ";" << end << ")\n"; + int match = Match( phrase, mid ); + + if (match == 0) return mid; + if (start >= end && match != 0 ) return m_size; + + if (match > 0) + start = mid+1; + else + end = mid-1; + } } int SuffixArray::Match( const vector< WORD > &phrase, INDEX index ) { - INDEX pos = m_index[ index ]; - for(INDEX i=0; i > corpus; WORD_ID *m_array; - INDEX *m_index; - INDEX *m_buffer; - char *m_wordInSentence; - size_t *m_sentence; - char *m_sentenceLength; - WORD_ID m_endOfSentence; - Vocabulary m_vcb; - INDEX m_size; + INDEX *m_index; + INDEX *m_buffer; + char *m_wordInSentence; + size_t *m_sentence; + char *m_sentenceLength; + WORD_ID m_endOfSentence; + Vocabulary m_vcb; + INDEX m_size; public: - SuffixArray( std::string fileName ); - ~SuffixArray(); + SuffixArray( std::string fileName ); + ~SuffixArray(); - void Sort(INDEX start, INDEX end); - int CompareIndex( INDEX a, INDEX b ) const; - inline int CompareWord( WORD_ID a, WORD_ID b ) const; - int Count( const std::vector< WORD > &phrase ); - bool MinCount( const std::vector< WORD > &phrase, INDEX min ); - bool Exists( const std::vector< WORD > &phrase ); - int FindMatches( const std::vector< WORD > &phrase, INDEX &firstMatch, INDEX &lastMatch, INDEX search_start = 0, INDEX search_end = -1 ); - int LimitedCount( const std::vector< WORD > &phrase, INDEX min, INDEX &firstMatch, INDEX &lastMatch, INDEX search_start = -1, INDEX search_end = 0 ); - INDEX FindFirst( const std::vector< WORD > &phrase, INDEX &start, INDEX &end ); - INDEX FindLast( const std::vector< WORD > &phrase, INDEX start, INDEX end, int direction ); - int Match( const std::vector< WORD > &phrase, INDEX index ); - void List( INDEX start, INDEX end ); - inline INDEX GetPosition( INDEX index ) { return m_index[ index ]; } - inline size_t GetSentence( INDEX position ) { return m_sentence[position]; } - inline char GetWordInSentence( INDEX position ) { return m_wordInSentence[position]; } - inline char GetSentenceLength( size_t sentenceId ) { return m_sentenceLength[sentenceId]; } - inline INDEX GetSize() { return m_size; } + void Sort(INDEX start, INDEX end); + int CompareIndex( INDEX a, INDEX b ) const; + inline int CompareWord( WORD_ID a, WORD_ID b ) const; + int Count( const std::vector< WORD > &phrase ); + bool MinCount( const std::vector< WORD > &phrase, INDEX min ); + bool Exists( const std::vector< WORD > &phrase ); + int FindMatches( const std::vector< WORD > &phrase, INDEX &firstMatch, INDEX &lastMatch, INDEX search_start = 0, INDEX search_end = -1 ); + int LimitedCount( const std::vector< WORD > &phrase, INDEX min, INDEX &firstMatch, INDEX &lastMatch, 
INDEX search_start = -1, INDEX search_end = 0 ); + INDEX FindFirst( const std::vector< WORD > &phrase, INDEX &start, INDEX &end ); + INDEX FindLast( const std::vector< WORD > &phrase, INDEX start, INDEX end, int direction ); + int Match( const std::vector< WORD > &phrase, INDEX index ); + void List( INDEX start, INDEX end ); + inline INDEX GetPosition( INDEX index ) { + return m_index[ index ]; + } + inline size_t GetSentence( INDEX position ) { + return m_sentence[position]; + } + inline char GetWordInSentence( INDEX position ) { + return m_wordInSentence[position]; + } + inline char GetSentenceLength( size_t sentenceId ) { + return m_sentenceLength[sentenceId]; + } + inline INDEX GetSize() { + return m_size; + } - Vocabulary &GetVocabulary() - { return m_vcb; } - const std::vector< std::vector< WORD_ID > > &GetCorpus() const - { return corpus; } + Vocabulary &GetVocabulary() { + return m_vcb; + } + const std::vector< std::vector< WORD_ID > > &GetCorpus() const { + return corpus; + } }; } diff --git a/moses/TranslationModel/fuzzy-match/Vocabulary.cpp b/moses/TranslationModel/fuzzy-match/Vocabulary.cpp index 0c833ff78..ab1439a29 100644 --- a/moses/TranslationModel/fuzzy-match/Vocabulary.cpp +++ b/moses/TranslationModel/fuzzy-match/Vocabulary.cpp @@ -10,7 +10,8 @@ namespace tmmt { // as in beamdecoder/tables.cpp -vector Vocabulary::Tokenize( const char input[] ) { +vector Vocabulary::Tokenize( const char input[] ) +{ vector< WORD_ID > token; bool betweenWords = true; int start=0; @@ -21,8 +22,7 @@ vector Vocabulary::Tokenize( const char input[] ) { if (!isSpace && betweenWords) { start = i; betweenWords = false; - } - else if (isSpace && !betweenWords) { + } else if (isSpace && !betweenWords) { token.push_back( StoreIfNew ( string( input+start, i-start ) ) ); betweenWords = true; } @@ -32,9 +32,11 @@ vector Vocabulary::Tokenize( const char input[] ) { return token; } -WORD_ID Vocabulary::StoreIfNew( const WORD& word ) { +WORD_ID Vocabulary::StoreIfNew( const WORD& word ) +{ - { // read=lock scope + { + // read=lock scope #ifdef WITH_THREADS boost::shared_lock read_lock(m_accessLock); #endif @@ -43,17 +45,18 @@ WORD_ID Vocabulary::StoreIfNew( const WORD& word ) { if( i != lookup.end() ) return i->second; } - + #ifdef WITH_THREADS boost::unique_lock lock(m_accessLock); #endif WORD_ID id = vocab.size(); vocab.push_back( word ); lookup[ word ] = id; - return id; + return id; } -WORD_ID Vocabulary::GetWordID( const WORD &word ) { +WORD_ID Vocabulary::GetWordID( const WORD &word ) +{ #ifdef WITH_THREADS boost::shared_lock read_lock(m_accessLock); #endif diff --git a/moses/TranslationModel/fuzzy-match/Vocabulary.h b/moses/TranslationModel/fuzzy-match/Vocabulary.h index 7be82bcbe..dfa11c1db 100644 --- a/moses/TranslationModel/fuzzy-match/Vocabulary.h +++ b/moses/TranslationModel/fuzzy-match/Vocabulary.h @@ -34,16 +34,20 @@ namespace tmmt typedef std::string WORD; typedef unsigned int WORD_ID; -class Vocabulary { - public: +class Vocabulary +{ +public: std::map lookup; std::vector< WORD > vocab; WORD_ID StoreIfNew( const WORD& ); WORD_ID GetWordID( const WORD& ); std::vector Tokenize( const char[] ); - inline WORD &GetWord( WORD_ID id ) const { WORD &i = (WORD&) vocab[ id ]; return i; } + inline WORD &GetWord( WORD_ID id ) const { + WORD &i = (WORD&) vocab[ id ]; + return i; + } - protected: +protected: #ifdef WITH_THREADS //reader-writer lock mutable boost::shared_mutex m_accessLock; diff --git a/moses/TranslationModel/fuzzy-match/create_xml.cpp b/moses/TranslationModel/fuzzy-match/create_xml.cpp 
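// Illustrative sketch (not part of the patch): Vocabulary::StoreIfNew above
// takes a shared (read) lock for the common lookup-hit case and only takes the
// exclusive lock when a new word must be appended. This standalone version
// uses C++17 std::shared_mutex in place of the boost lock compiled in under
// WITH_THREADS; MiniVocab is a hypothetical stand-in. Note that, as in the
// original, a second lookup under the exclusive lock would be needed to rule
// out two writers racing to insert the same word.
#include <iostream>
#include <map>
#include <mutex>
#include <shared_mutex>
#include <string>
#include <vector>

typedef unsigned int WORD_ID;

class MiniVocab {
  std::map<std::string, WORD_ID> lookup;
  std::vector<std::string> vocab;
  mutable std::shared_mutex m_accessLock;

public:
  WORD_ID StoreIfNew(const std::string &word) {
    {
      // read-lock scope: cheap path when the word is already known
      std::shared_lock<std::shared_mutex> read_lock(m_accessLock);
      std::map<std::string, WORD_ID>::const_iterator i = lookup.find(word);
      if (i != lookup.end()) return i->second;
    }
    std::unique_lock<std::shared_mutex> lock(m_accessLock);
    WORD_ID id = static_cast<WORD_ID>(vocab.size());
    vocab.push_back(word);
    lookup[word] = id;
    return id;
  }
};

int main() {
  MiniVocab v;
  std::cout << v.StoreIfNew("fuzzy") << " " << v.StoreIfNew("match")
            << " " << v.StoreIfNew("fuzzy") << "\n"; // prints 0 1 0
}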
index 783fb93eb..44c1efc9f 100644 --- a/moses/TranslationModel/fuzzy-match/create_xml.cpp +++ b/moses/TranslationModel/fuzzy-match/create_xml.cpp @@ -42,12 +42,10 @@ void create_xml(const string &inPath) string inLine; int step = 0; - while (!inStrme.eof()) - { + while (!inStrme.eof()) { getline(inStrme, inLine); //cout << inLine << endl; - switch (step) - { + switch (step) { case 0: setenceId = Scan(inLine); ++step; @@ -63,8 +61,7 @@ void create_xml(const string &inPath) case 3: if (input == NULL) { input = new string(inLine); - } - else { + } else { assert(inLine == *input); } ++step; @@ -87,9 +84,9 @@ void create_xml(const string &inPath) //print STDOUT $frame."\n"; rule << ret.ruleS << " [X] ||| " << ret.ruleT << " [X] ||| " << ret.ruleAlignment - << " ||| " << count << endl; + << " ||| " << count << endl; ruleInv << ret.ruleT << " [X] ||| " << ret.ruleS << " [X] ||| " << ret.ruleAlignmentInv - << " ||| " << count << endl; + << " ||| " << count << endl; //print STDOUT "$sentenceInd ||| $score ||| $count\n"; ++ruleCount; @@ -112,8 +109,8 @@ CreateXMLRetValues createXML(int ruleCount, const string &source, const string & { CreateXMLRetValues ret; vector sourceToks = Tokenize(source, " ") - ,inputToks = Tokenize(input, " ") - ,targetsToks = Tokenize(target, " "); + ,inputToks = Tokenize(input, " ") + ,targetsToks = Tokenize(target, " "); Alignments alignments(align, sourceToks.size(), targetsToks.size()); map frameInput; map alignI2S; @@ -241,8 +238,7 @@ CreateXMLRetValues createXML(int ruleCount, const string &source, const string & if (action == "M") { inputBitmap.push_back(1); - } - else if (action == "I" || action == "S") { + } else if (action == "I" || action == "S") { inputBitmap.push_back(0); } @@ -358,9 +354,8 @@ CreateXMLRetValues createXML(int ruleCount, const string &source, const string & } // end of tm target inclusion (not included word or inserted input) else if (currently_included - && ( targetBitmap[t] || frameInput.find(t) != frameInput.end() ) - ) - { + && ( targetBitmap[t] || frameInput.find(t) != frameInput.end() ) + ) { // add xml (unless change is at the beginning of the sentence if ( start_t >= 0 ) { string target = ""; diff --git a/moses/TranslationOption.cpp b/moses/TranslationOption.cpp index 87721bc52..dfe3312fc 100644 --- a/moses/TranslationOption.cpp +++ b/moses/TranslationOption.cpp @@ -42,11 +42,11 @@ TranslationOption::TranslationOption(const WordsRange &wordsRange } TranslationOption::TranslationOption(const TranslationOption ©, const WordsRange &sourceWordsRange) -: m_targetPhrase(copy.m_targetPhrase) + : m_targetPhrase(copy.m_targetPhrase) //, m_sourcePhrase(new Phrase(*copy.m_sourcePhrase)) // TODO use when confusion network trans opt for confusion net properly implemented -, m_sourceWordsRange(sourceWordsRange) -, m_futureScore(copy.m_futureScore) -, m_lexReorderingScores(copy.m_lexReorderingScores) + , m_sourceWordsRange(sourceWordsRange) + , m_futureScore(copy.m_futureScore) + , m_lexReorderingScores(copy.m_lexReorderingScores) {} bool TranslationOption::IsCompatible(const Phrase& phrase, const std::vector& featuresToCheck) const diff --git a/moses/TranslationOption.h b/moses/TranslationOption.h index 8e2064f83..b1de31eb1 100644 --- a/moses/TranslationOption.h +++ b/moses/TranslationOption.h @@ -146,18 +146,18 @@ public: void CacheLexReorderingScores(const LexicalReordering &scoreProducer, const Scores &score); TO_STRING(); - - bool operator== (const TranslationOption &rhs) const - { + + bool operator== (const TranslationOption &rhs) const { return 
m_sourceWordsRange == rhs.m_sourceWordsRange && - m_targetPhrase == rhs.m_targetPhrase; - } + m_targetPhrase == rhs.m_targetPhrase; + } }; //XXX: This doesn't look at the alignment. Is this correct? -inline size_t hash_value(const TranslationOption& translationOption) { +inline size_t hash_value(const TranslationOption& translationOption) +{ size_t seed = 0; boost::hash_combine(seed, translationOption.GetTargetPhrase()); boost::hash_combine(seed, translationOption.GetStartPos()); diff --git a/moses/TranslationOptionCollection.cpp b/moses/TranslationOptionCollection.cpp index 16bcce791..2d7024c7a 100644 --- a/moses/TranslationOptionCollection.cpp +++ b/moses/TranslationOptionCollection.cpp @@ -48,11 +48,11 @@ bool CompareTranslationOption(const TranslationOption *a, const TranslationOptio * This fn should be called by inherited classes */ TranslationOptionCollection::TranslationOptionCollection( - InputType const& src, size_t maxNoTransOptPerCoverage, float translationOptionThreshold) + InputType const& src, size_t maxNoTransOptPerCoverage, float translationOptionThreshold) : m_source(src) - ,m_futureScore(src.GetSize()) - ,m_maxNoTransOptPerCoverage(maxNoTransOptPerCoverage) - ,m_translationOptionThreshold(translationOptionThreshold) + ,m_futureScore(src.GetSize()) + ,m_maxNoTransOptPerCoverage(maxNoTransOptPerCoverage) + ,m_translationOptionThreshold(translationOptionThreshold) { // create 2-d vector size_t size = src.GetSize(); @@ -202,73 +202,68 @@ void TranslationOptionCollection::ProcessOneUnknownWord(const Word &sourceWord,s const UnknownWordPenaltyProducer *unknownWordPenaltyProducer = staticData.GetUnknownWordPenaltyProducer(); float unknownScore = FloorScore(TransformScore(0)); - // unknown word, add as trans opt - FactorCollection &factorCollection = FactorCollection::Instance(); + // unknown word, add as trans opt + FactorCollection &factorCollection = FactorCollection::Instance(); - size_t isDigit = 0; - - const Factor *f = sourceWord[0]; // TODO hack. shouldn't know which factor is surface - const StringPiece s = f->GetString(); - bool isEpsilon = (s=="" || s==EPSILON); - if (StaticData::Instance().GetDropUnknown()) - { + size_t isDigit = 0; + + const Factor *f = sourceWord[0]; // TODO hack. 
shouldn't know which factor is surface + const StringPiece s = f->GetString(); + bool isEpsilon = (s=="" || s==EPSILON); + if (StaticData::Instance().GetDropUnknown()) { - isDigit = s.find_first_of("0123456789"); - if (isDigit == 1) - isDigit = 1; - else - isDigit = 0; - // modify the starting bitmap - } - - Phrase* m_unksrc = new Phrase(1); + isDigit = s.find_first_of("0123456789"); + if (isDigit == 1) + isDigit = 1; + else + isDigit = 0; + // modify the starting bitmap + } + + Phrase* m_unksrc = new Phrase(1); m_unksrc->AddWord() = sourceWord; - m_unksrcs.push_back(m_unksrc); + m_unksrcs.push_back(m_unksrc); - TranslationOption *transOpt; - TargetPhrase targetPhrase; - targetPhrase.SetSourcePhrase(*m_unksrc); - - if (!(staticData.GetDropUnknown() || isEpsilon) || isDigit) - { - // add to dictionary + TranslationOption *transOpt; + TargetPhrase targetPhrase; + targetPhrase.SetSourcePhrase(*m_unksrc); - Word &targetWord = targetPhrase.AddWord(); - - for (unsigned int currFactor = 0 ; currFactor < MAX_NUM_FACTORS ; currFactor++) - { - FactorType factorType = static_cast(currFactor); - - const Factor *sourceFactor = sourceWord[currFactor]; - if (sourceFactor == NULL) - targetWord[factorType] = factorCollection.AddFactor(UNKNOWN_FACTOR); - else - targetWord[factorType] = factorCollection.AddFactor(sourceFactor->GetString()); - } - //create a one-to-one alignment between UNKNOWN_FACTOR and its verbatim translation - - targetPhrase.SetAlignmentInfo("0-0"); - - } - else - { - // drop source word. create blank trans opt + if (!(staticData.GetDropUnknown() || isEpsilon) || isDigit) { + // add to dictionary - //targetPhrase.SetAlignment(); + Word &targetWord = targetPhrase.AddWord(); - } + for (unsigned int currFactor = 0 ; currFactor < MAX_NUM_FACTORS ; currFactor++) { + FactorType factorType = static_cast(currFactor); + + const Factor *sourceFactor = sourceWord[currFactor]; + if (sourceFactor == NULL) + targetWord[factorType] = factorCollection.AddFactor(UNKNOWN_FACTOR); + else + targetWord[factorType] = factorCollection.AddFactor(sourceFactor->GetString()); + } + //create a one-to-one alignment between UNKNOWN_FACTOR and its verbatim translation + + targetPhrase.SetAlignmentInfo("0-0"); + + } else { + // drop source word. 
create blank trans opt + + //targetPhrase.SetAlignment(); + + } targetPhrase.GetScoreBreakdown().Assign(unknownWordPenaltyProducer, unknownScore); - if (inputScores != NULL) { - targetPhrase.SetInputScore(*inputScores); - } + if (inputScores != NULL) { + targetPhrase.SetInputScore(*inputScores); + } - targetPhrase.Evaluate(*m_unksrc); + targetPhrase.Evaluate(*m_unksrc); - transOpt = new TranslationOption(WordsRange(sourcePos, sourcePos + length - 1), targetPhrase); - Add(transOpt); + transOpt = new TranslationOption(WordsRange(sourcePos, sourcePos + length - 1), targetPhrase); + Add(transOpt); } @@ -426,19 +421,19 @@ void TranslationOptionCollection::EvaluateWithSource() { const size_t size = m_source.GetSize(); for (size_t startPos = 0 ; startPos < size ; ++startPos) { - size_t maxSize = m_source.GetSize() - startPos; - size_t maxSizePhrase = StaticData::Instance().GetMaxPhraseLength(); - maxSize = std::min(maxSize, maxSizePhrase); + size_t maxSize = m_source.GetSize() - startPos; + size_t maxSizePhrase = StaticData::Instance().GetMaxPhraseLength(); + maxSize = std::min(maxSize, maxSizePhrase); - for (size_t endPos = startPos ; endPos < startPos + maxSize ; ++endPos) { - TranslationOptionList &transOptList = GetTranslationOptionList(startPos, endPos); + for (size_t endPos = startPos ; endPos < startPos + maxSize ; ++endPos) { + TranslationOptionList &transOptList = GetTranslationOptionList(startPos, endPos); - TranslationOptionList::const_iterator iterTransOpt; - for(iterTransOpt = transOptList.begin() ; iterTransOpt != transOptList.end() ; ++iterTransOpt) { - TranslationOption &transOpt = **iterTransOpt; - transOpt.Evaluate(m_source); - } - } + TranslationOptionList::const_iterator iterTransOpt; + for(iterTransOpt = transOptList.begin() ; iterTransOpt != transOptList.end() ; ++iterTransOpt) { + TranslationOption &transOpt = **iterTransOpt; + transOpt.Evaluate(m_source); + } + } } } @@ -514,7 +509,7 @@ void TranslationOptionCollection::CreateTranslationOptionsForRange( for (++iterStep ; iterStep != decodeGraph.end() ; ++iterStep) { - const DecodeStep &decodeStep = **iterStep; + const DecodeStep &decodeStep = **iterStep; PartialTranslOptColl* newPtoc = new PartialTranslOptColl; // go thru each intermediate trans opt just created @@ -634,7 +629,7 @@ std::ostream& operator<<(std::ostream& out, const TranslationOptionCollection& c return out; } -const std::vector& TranslationOptionCollection::GetUnknownSources() const +const std::vector& TranslationOptionCollection::GetUnknownSources() const { return m_unksrcs; } diff --git a/moses/TranslationOptionCollection.h b/moses/TranslationOptionCollection.h index 36164f135..171a082e2 100644 --- a/moses/TranslationOptionCollection.h +++ b/moses/TranslationOptionCollection.h @@ -42,7 +42,7 @@ class InputType; class FactorMask; class Word; class DecodeGraph; - + /** Contains all phrase translations applicable to current input type (a sentence or confusion network). 
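// Illustrative sketch (not part of the patch): it condenses the decision made
// in ProcessOneUnknownWord above. An out-of-vocabulary token is normally
// copied verbatim into the target (aligned 0-0, carrying the unknown-word
// penalty); with drop-unknown enabled it is dropped instead, except that
// digit-bearing tokens are still copied. The names here are hypothetical, and
// the any-digit test below is the presumably intended reading: the code above
// only flags a token whose first digit sits at position 1, which looks
// unintentional.
#include <iostream>
#include <string>

static bool ContainsDigit(const std::string &s) {
  return s.find_first_of("0123456789") != std::string::npos;
}

// Returns the target side produced for an unknown source token:
// the token itself, or "" for a dropped word (blank translation option).
static std::string TranslateUnknown(const std::string &token, bool dropUnknown) {
  if (dropUnknown && !ContainsDigit(token))
    return ""; // drop source word: blank translation option
  return token; // copy through, aligned 0-0
}

int main() {
  std::cout << TranslateUnknown("Schadenfreude", false) << "\n"; // copied
  std::cout << TranslateUnknown("Schadenfreude", true)  << "\n"; // dropped
  std::cout << TranslateUnknown("B612", true)           << "\n"; // kept: has a digit
}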
* A key insight into efficient decoding is that various input * conditions (trelliss, factored input, normal text, xml markup) diff --git a/moses/TranslationOptionCollectionConfusionNet.cpp b/moses/TranslationOptionCollectionConfusionNet.cpp index a25e8cffb..93953ba8a 100644 --- a/moses/TranslationOptionCollectionConfusionNet.cpp +++ b/moses/TranslationOptionCollectionConfusionNet.cpp @@ -10,8 +10,8 @@ namespace Moses /** constructor; just initialize the base class */ TranslationOptionCollectionConfusionNet::TranslationOptionCollectionConfusionNet( - const ConfusionNet &input - , size_t maxNoTransOptPerCoverage, float translationOptionThreshold) + const ConfusionNet &input + , size_t maxNoTransOptPerCoverage, float translationOptionThreshold) : TranslationOptionCollection(input, maxNoTransOptPerCoverage, translationOptionThreshold) {} /* forcibly create translation option for a particular source word. diff --git a/moses/TreeInput.cpp b/moses/TreeInput.cpp index acae0bdb1..166445602 100644 --- a/moses/TreeInput.cpp +++ b/moses/TreeInput.cpp @@ -149,7 +149,7 @@ bool TreeInput::ProcessAndStripXMLTags(string &line, std::vector return false; } - // may be either a input span label ("label"), or a specified output translation "translation" + // may be either a input span label ("label"), or a specified output translation "translation" string label = ParseXmlTagAttribute(tagContent,"label"); string translation = ParseXmlTagAttribute(tagContent,"translation"); @@ -165,18 +165,17 @@ bool TreeInput::ProcessAndStripXMLTags(string &line, std::vector vector altTexts = TokenizeMultiCharSeparator(translation, "||"); vector altLabel = TokenizeMultiCharSeparator(label, "||"); vector altProbs = TokenizeMultiCharSeparator(ParseXmlTagAttribute(tagContent,"prob"), "||"); - //TRACE_ERR("number of translations: " << altTexts.size() << endl); + //TRACE_ERR("number of translations: " << altTexts.size() << endl); for (size_t i=0; i i && altLabel[i].size() > 0) { targetLHSstr = altLabel[i]; - } - else { + } else { const UnknownLHSList &lhsList = StaticData::Instance().GetUnknownLHS(); UnknownLHSList::const_iterator iterLHS = lhsList.begin(); targetLHSstr = iterLHS->first; diff --git a/moses/TrellisPath.cpp b/moses/TrellisPath.cpp index c73575b2c..fed8f9658 100644 --- a/moses/TrellisPath.cpp +++ b/moses/TrellisPath.cpp @@ -41,7 +41,8 @@ TrellisPath::TrellisPath(const Hypothesis *hypo) } } -void TrellisPath::InitScore() { +void TrellisPath::InitScore() +{ m_totalScore = m_path[0]->GetWinningHypo()->GetTotalScore(); m_scoreBreakdown= m_path[0]->GetWinningHypo()->GetScoreBreakdown(); @@ -82,8 +83,8 @@ TrellisPath::TrellisPath(const TrellisPath ©, size_t edgeIndex, const Hypoth InitScore(); } -TrellisPath::TrellisPath(const vector edges) -:m_prevEdgeChanged(NOT_FOUND) +TrellisPath::TrellisPath(const vector edges) + :m_prevEdgeChanged(NOT_FOUND) { m_path.resize(edges.size()); copy(edges.rbegin(),edges.rend(),m_path.begin()); diff --git a/moses/TrellisPath.h b/moses/TrellisPath.h index d8005435c..26e722696 100644 --- a/moses/TrellisPath.h +++ b/moses/TrellisPath.h @@ -59,36 +59,36 @@ protected: void InitScore(); public: - TrellisPath(); // not implemented - - //! create path OF pure hypo - TrellisPath(const Hypothesis *hypo); - - /** create path from another path, deviate at edgeIndex by using arc instead, - * which may change other hypo back from there - */ - TrellisPath(const TrellisPath ©, size_t edgeIndex, const Hypothesis *arc); + TrellisPath(); // not implemented - //! 
get score for this path throught trellis - inline float GetTotalScore() const { return m_totalScore; } + //! create path of pure hypo + TrellisPath(const Hypothesis *hypo); - /** list of each hypo/arcs in path. For anything other than the best hypo, it is not possible just to follow the - * m_prevHypo variable in the hypothesis object - */ - inline const std::vector<const Hypothesis *> &GetEdges() const - { - return m_path; - } + /** create path from another path, deviate at edgeIndex by using arc instead, + * which may change other hypo back from there + */ + TrellisPath(const TrellisPath &copy, size_t edgeIndex, const Hypothesis *arc); - inline size_t GetSize() const - { - return m_path.size(); - } - - //! create a set of next best paths by wiggling 1 of the node at a time. - void CreateDeviantPaths(TrellisPathCollection &pathColl) const; - - //! create a list of next best paths by wiggling 1 of the node at a time. + //! get score for this path through trellis + inline float GetTotalScore() const { + return m_totalScore; + } + + /** list of each hypo/arcs in path. For anything other than the best hypo, it is not possible just to follow the + * m_prevHypo variable in the hypothesis object + */ + inline const std::vector<const Hypothesis *> &GetEdges() const { + return m_path; + } + + inline size_t GetSize() const { + return m_path.size(); + } + + //! create a set of next best paths by wiggling one node at a time. + void CreateDeviantPaths(TrellisPathCollection &pathColl) const; + + //! create a list of next best paths by wiggling one node at a time. void CreateDeviantPaths(TrellisPathList &pathColl) const; inline const ScoreComponentCollection &GetScoreBreakdown() const { diff --git a/moses/TypeDef.h b/moses/TypeDef.h index e44946a2f..2b98b5bc3 100644 --- a/moses/TypeDef.h +++ b/moses/TypeDef.h @@ -129,7 +129,7 @@ enum InputTypeEnum { ,WordLatticeInput = 2 ,TreeInputType = 3 ,WordLatticeInput2 = 4 - + }; enum XmlInputType { @@ -169,8 +169,7 @@ enum WordAlignmentSort { ,TargetOrder = 1 }; -enum FormatType -{ +enum FormatType { MosesFormat ,HieroFormat }; diff --git a/moses/Util.cpp b/moses/Util.cpp index 13cee27f9..f92c32dbb 100644 --- a/moses/Util.cpp +++ b/moses/Util.cpp @@ -182,8 +182,7 @@ std::string PassthroughSGML(std::string &line, const std::string tagName, const size_t close = lline.find(rbrack, open); //check whether the tag is closed with '/>'; if not return the empty string - if (close == std::string::npos) - { + if (close == std::string::npos) { TRACE_ERR("PassthroughSGML error: the tag does not end properly\n"); return meta; } @@ -198,8 +197,7 @@ std::string PassthroughSGML(std::string &line, const std::string tagName, const lline = ToLower(line); open = lline.find(lbrack+tagName); - if (open != std::string::npos) - { + if (open != std::string::npos) { TRACE_ERR("PassthroughSGML error: there are two tags\n"); } return meta; diff --git a/moses/Util.h b/moses/Util.h index 9f43d9dc3..e5bdc820a 100644 --- a/moses/Util.h +++ b/moses/Util.h @@ -363,27 +363,27 @@ std::string PassthroughSGML(std::string &line, const std::string tagName,const s */ inline std::string GetFirstString(const std::string& str, int& first_pos, const std::string& delimiters = " \t") { - - std::string first_str; - // Skip delimiters at beginning. - std::string::size_type lastPos = str.find_first_not_of(delimiters, first_pos); - - // Find first "non-delimiter".
- std::string::size_type pos = str.find_first_of(delimiters, lastPos); - - if (std::string::npos != pos || std::string::npos != lastPos){ - - first_str = str.substr(lastPos, pos - lastPos); - - // Skip delimiters. Note the "not_of" - lastPos = str.find_first_not_of(delimiters, pos); - - } - - first_pos = lastPos; - return first_str; + + std::string first_str; + // Skip delimiters at beginning. + std::string::size_type lastPos = str.find_first_not_of(delimiters, first_pos); + + // Find first "non-delimiter". + std::string::size_type pos = str.find_first_of(delimiters, lastPos); + + if (std::string::npos != pos || std::string::npos != lastPos) { + + first_str = str.substr(lastPos, pos - lastPos); + + // Skip delimiters. Note the "not_of" + lastPos = str.find_first_not_of(delimiters, pos); + + } + + first_pos = lastPos; + return first_str; } - + template T log_sum (T log_a, T log_b) { diff --git a/moses/Word.cpp b/moses/Word.cpp index 69d382c8a..41e5fae03 100644 --- a/moses/Word.cpp +++ b/moses/Word.cpp @@ -87,7 +87,8 @@ std::string Word::GetString(const vector factorType,bool endWithBlan return strme.str(); } -StringPiece Word::GetString(FactorType factorType) const { +StringPiece Word::GetString(FactorType factorType) const +{ return m_factorArray[factorType]->GetString(); } diff --git a/moses/Word.h b/moses/Word.h index d650fb67e..e88b0441b 100644 --- a/moses/Word.h +++ b/moses/Word.h @@ -152,8 +152,9 @@ struct WordComparer { }; -inline size_t hash_value(const Word& word) { - return word.hash(); +inline size_t hash_value(const Word& word) +{ + return word.hash(); } } diff --git a/moses/XmlOption.cpp b/moses/XmlOption.cpp index c8d639e0a..4b703b247 100644 --- a/moses/XmlOption.cpp +++ b/moses/XmlOption.cpp @@ -83,8 +83,8 @@ string TrimXml(const string& str, const std::string& lbrackStr, const std::strin */ bool isXmlTag(const string& tag, const std::string& lbrackStr, const std::string& rbrackStr) { - return (tag.substr(0,lbrackStr.length()) == lbrackStr && - (tag[lbrackStr.length()] == '/' || + return (tag.substr(0,lbrackStr.length()) == lbrackStr && + (tag[lbrackStr.length()] == '/' || (tag[lbrackStr.length()] >= 'a' && tag[lbrackStr.length()] <= 'z') || (tag[lbrackStr.length()] >= 'A' && tag[lbrackStr.length()] <= 'Z'))); } @@ -111,7 +111,7 @@ vector TokenizeXml(const string& str, const std::string& lbrackStr, cons // walk thorugh the string (loop vver cpos) while (cpos != str.size()) { // find the next opening "<" of an xml tag - lpos = str.find(lbrack, cpos); // lpos = str.find_first_of(lbrack, cpos); + lpos = str.find(lbrack, cpos); // lpos = str.find_first_of(lbrack, cpos); if (lpos != string::npos) { // find the end of the xml tag rpos = str.find(rbrack, lpos+lbrackStr.length()-1); // rpos = str.find_first_of(rbrack, lpos); @@ -149,8 +149,8 @@ vector TokenizeXml(const string& str, const std::string& lbrackStr, cons * \param lbrackStr xml tag's left bracket string, typically "<" * \param rbrackStr xml tag's right bracket string, typically ">" */ -bool ProcessAndStripXMLTags(string &line, vector &res, ReorderingConstraint &reorderingConstraint, vector< size_t > &walls, - const std::string& lbrackStr, const std::string& rbrackStr) +bool ProcessAndStripXMLTags(string &line, vector &res, ReorderingConstraint &reorderingConstraint, vector< size_t > &walls, + const std::string& lbrackStr, const std::string& rbrackStr) { //parse XML markup in translation line diff --git a/moses/XmlOption.h b/moses/XmlOption.h index 45989c841..942446b26 100644 --- a/moses/XmlOption.h +++ b/moses/XmlOption.h 
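// Illustrative sketch (not part of the patch): GetFirstString in moses/Util.h
// above returns the token starting at or after first_pos and advances
// first_pos past the trailing delimiters, so repeated calls walk a string
// token by token. The function body is repeated here so the snippet compiles
// on its own; the driver relies on std::string::npos truncating to a negative
// int, which is what the original int-based interface signals end-of-string with.
#include <iostream>
#include <string>

std::string GetFirstString(const std::string &str, int &first_pos,
                           const std::string &delimiters = " \t") {
  std::string first_str;
  // Skip delimiters at beginning.
  std::string::size_type lastPos = str.find_first_not_of(delimiters, first_pos);
  // Find first "non-delimiter".
  std::string::size_type pos = str.find_first_of(delimiters, lastPos);
  if (std::string::npos != pos || std::string::npos != lastPos) {
    first_str = str.substr(lastPos, pos - lastPos);
    lastPos = str.find_first_not_of(delimiters, pos); // skip delimiters
  }
  first_pos = lastPos;
  return first_str;
}

int main() {
  std::string line = "the  quick\tbrown fox";
  int pos = 0;
  while (pos >= 0 && pos < (int)line.size()) {
    std::cout << "[" << GetFirstString(line, pos) << "]";
  }
  std::cout << "\n"; // prints [the][quick][brown][fox]
}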
@@ -30,8 +30,8 @@ std::string TrimXml(const std::string& str, const std::string& lbrackStr="<", co bool isXmlTag(const std::string& tag, const std::string& lbrackStr="<", const std::string& rbrackStr=">"); std::vector TokenizeXml(const std::string& str, const std::string& lbrackStr="<", const std::string& rbrackStr=">"); -bool ProcessAndStripXMLTags(std::string &line, std::vector &res, ReorderingConstraint &reorderingConstraint, std::vector< size_t > &walls, - const std::string& lbrackStr="<", const std::string& rbrackStr=">"); +bool ProcessAndStripXMLTags(std::string &line, std::vector &res, ReorderingConstraint &reorderingConstraint, std::vector< size_t > &walls, + const std::string& lbrackStr="<", const std::string& rbrackStr=">"); } diff --git a/phrase-extract/AlignmentPhrase.h b/phrase-extract/AlignmentPhrase.h index ec6431f18..52d9c85ea 100644 --- a/phrase-extract/AlignmentPhrase.h +++ b/phrase-extract/AlignmentPhrase.h @@ -25,7 +25,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA namespace MosesTraining { - + class WordsRange; class AlignmentElement diff --git a/phrase-extract/ExtractedRule.cpp b/phrase-extract/ExtractedRule.cpp index 985f2f093..50d9085e6 100644 --- a/phrase-extract/ExtractedRule.cpp +++ b/phrase-extract/ExtractedRule.cpp @@ -23,20 +23,19 @@ void ExtractedRule::OutputNTLengths(std::ostream &out) const void ExtractedRule::OutputNTLengths(std::ostringstream &outString) const { std::map >::const_iterator iter; - for (iter = m_ntLengths.begin(); iter != m_ntLengths.end(); ++iter) - { + for (iter = m_ntLengths.begin(); iter != m_ntLengths.end(); ++iter) { size_t sourcePos = iter->first; const std::pair &spanLengths = iter->second; - outString << sourcePos << "=" << spanLengths.first << "," < > m_ntLengths; - + ExtractedRule(int sT, int eT, int sS, int eS) : source() , target() @@ -64,12 +64,11 @@ public: , count(0) , pcfgScore(0.0) {} - - void SetSpanLength(size_t sourcePos, size_t sourceLength, size_t targetLength) - { + + void SetSpanLength(size_t sourcePos, size_t sourceLength, size_t targetLength) { m_ntLengths[sourcePos] = std::pair(sourceLength, targetLength); } - + void OutputNTLengths(std::ostream &out) const; void OutputNTLengths(std::ostringstream &out) const; }; diff --git a/phrase-extract/Hole.h b/phrase-extract/Hole.h index c570ec7a1..efedf2f53 100644 --- a/phrase-extract/Hole.h +++ b/phrase-extract/Hole.h @@ -72,7 +72,7 @@ public: int GetSize(size_t direction) const { return m_end[direction] - m_start[direction] + 1; } - + void SetPos(int pos, size_t direction) { m_pos[direction] = pos; } diff --git a/phrase-extract/HoleCollection.cpp b/phrase-extract/HoleCollection.cpp index fba295993..e63e2eacc 100644 --- a/phrase-extract/HoleCollection.cpp +++ b/phrase-extract/HoleCollection.cpp @@ -64,7 +64,7 @@ int HoleCollection::Scope(const Hole &proposedHole) const const int holeEnd = proposedHole.GetEnd(0); int scope = m_scope.back(); if (holeStart == m_sourcePhraseStart.back() || - find(m_sourceHoleEndPoints.begin(), m_sourceHoleEndPoints.end(), holeStart-1) != m_sourceHoleEndPoints.end()) { + find(m_sourceHoleEndPoints.begin(), m_sourceHoleEndPoints.end(), holeStart-1) != m_sourceHoleEndPoints.end()) { ++scope; // Adding hole would introduce choice point at start of hole. 
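// Illustrative sketch (not part of the patch): HoleCollection::Scope above
// maintains the "scope" of a rule incrementally while holes are added; every
// place where a nonterminal hole touches the phrase boundary or another hole
// is a choice point when the rule is applied. Computed from scratch over a
// pattern (true marks a nonterminal position), the same quantity counts
// boundary nonterminals plus adjacent nonterminal pairs. Scope() here is a
// hypothetical batch version, not the incremental one in the code.
#include <iostream>
#include <vector>

static int Scope(const std::vector<bool> &isNonTerm) {
  int scope = 0;
  for (size_t i = 0; i < isNonTerm.size(); ++i) {
    if (!isNonTerm[i]) continue;
    if (i == 0 || isNonTerm[i - 1]) ++scope; // choice point at start of hole
    if (i + 1 == isNonTerm.size()) ++scope;  // choice point at end of phrase
  }
  return scope;
}

int main() {
  // "X a X": both holes border the phrase edges
  std::cout << Scope({true, false, true}) << "\n";        // prints 2
  // "a X X b": one adjacent nonterminal pair
  std::cout << Scope({false, true, true, false}) << "\n"; // prints 1
}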
} if (holeEnd == m_sourcePhraseEnd.back() || diff --git a/phrase-extract/OutputFileStream.cpp b/phrase-extract/OutputFileStream.cpp index 2cad33bb9..a61ce1ab1 100644 --- a/phrase-extract/OutputFileStream.cpp +++ b/phrase-extract/OutputFileStream.cpp @@ -46,11 +46,11 @@ OutputFileStream::~OutputFileStream() bool OutputFileStream::Open(const std::string &filePath) { - m_outFile = new ofstream(filePath.c_str(), ios_base::out | ios_base::binary); + m_outFile = new ofstream(filePath.c_str(), ios_base::out | ios_base::binary); if (m_outFile->fail()) { return false; } - + if (filePath.size() > 3 && filePath.substr(filePath.size() - 3, 3) == ".gz") { this->push(boost::iostreams::gzip_compressor()); } @@ -64,10 +64,10 @@ void OutputFileStream::Close() if (m_outFile == NULL) { return; } - + this->flush(); this->pop(); // file - + m_outFile->close(); delete m_outFile; m_outFile = NULL; diff --git a/phrase-extract/PhraseAlignment.cpp b/phrase-extract/PhraseAlignment.cpp index bdfead082..f830e411f 100644 --- a/phrase-extract/PhraseAlignment.cpp +++ b/phrase-extract/PhraseAlignment.cpp @@ -29,10 +29,10 @@ extern bool hierarchicalFlag; template inline T Scan(const std::string &input) { - std::stringstream stream(input); - T ret; - stream >> ret; - return ret; + std::stringstream stream(input); + T ret; + stream >> ret; + return ret; } @@ -40,11 +40,10 @@ inline T Scan(const std::string &input) template inline void Scan(std::vector &output, const std::vector< std::string > &input) { - output.resize(input.size()); - for (size_t i = 0 ; i < input.size() ; i++) - { - output[i] = Scan( input[i] ); - } + output.resize(input.size()); + for (size_t i = 0 ; i < input.size() ; i++) { + output[i] = Scan( input[i] ); + } } @@ -56,7 +55,7 @@ inline void Tokenize(std::vector &output std::string::size_type lastPos = str.find_first_not_of(delimiters, 0); // Find first "non-delimiter". std::string::size_type pos = str.find_first_of(delimiters, lastPos); - + while (std::string::npos != pos || std::string::npos != lastPos) { // Found a token, add it to the vector. 
output.push_back(str.substr(lastPos, pos - lastPos)); @@ -70,12 +69,12 @@ inline void Tokenize(std::vector &output // speeded up version of above template inline void Tokenize( std::vector &output - , const std::string &input - , const std::string& delimiters = " \t") + , const std::string &input + , const std::string& delimiters = " \t") { - std::vector stringVector; - Tokenize(stringVector, input, delimiters); - return Scan(output, stringVector ); + std::vector stringVector; + Tokenize(stringVector, input, delimiters); + return Scan(output, stringVector ); } // read in a phrase pair and store it @@ -94,8 +93,7 @@ void PhraseAlignment::create( char line[], int lineID, bool includeSentenceIdFla else if (item == 2) { // target phrase phraseT.push_back( vcbT.storeIfNew( token[j] ) ); - } - else if (item == 3) { // alignment + } else if (item == 3) { // alignment int s,t; sscanf(token[j].c_str(), "%d-%d", &s, &t); if ((size_t)t >= phraseT.size() || (size_t)s >= phraseS.size()) { @@ -135,17 +133,17 @@ void PhraseAlignment::create( char line[], int lineID, bool includeSentenceIdFla void PhraseAlignment::addNTLength(const std::string &tok) { vector< string > tokens; - + Tokenize(tokens, tok, "="); assert(tokens.size() == 2); - + size_t sourcePos = Scan(tokens[0]); assert(sourcePos < phraseS.size()); - + vector< size_t > ntLengths; Tokenize(ntLengths, tokens[1], ","); assert(ntLengths.size() == 2); - + m_ntLengths[sourcePos] = std::pair(ntLengths[0], ntLengths[1]); } @@ -211,13 +209,13 @@ int PhraseAlignment::Compare(const PhraseAlignment &other) const if (this == &other) // comparing with itself return 0; - if (GetTarget() != other.GetTarget()) + if (GetTarget() != other.GetTarget()) return ( GetTarget() < other.GetTarget() ) ? -1 : +1; if (GetSource() != other.GetSource()) - return ( GetSource() < other.GetSource() ) ? -1 : +1; + return ( GetSource() < other.GetSource() ) ? -1 : +1; - if (!hierarchicalFlag) + if (!hierarchicalFlag) return 0; // loop over all words (note: 0 = left hand side of rule) @@ -228,15 +226,14 @@ int PhraseAlignment::Compare(const PhraseAlignment &other) const if (alignedToT[i].size() != 1 || other.alignedToT[i].size() != 1 || - thisAlign != otherAlign) - { + thisAlign != otherAlign) { int ret = (thisAlign < otherAlign) ? 
-1 : +1; return ret; } } } return 0; - + } } diff --git a/phrase-extract/PhraseAlignment.h b/phrase-extract/PhraseAlignment.h index c0df2aa37..06d9cfad0 100644 --- a/phrase-extract/PhraseAlignment.h +++ b/phrase-extract/PhraseAlignment.h @@ -24,7 +24,7 @@ protected: PHRASE phraseT; std::map > m_ntLengths; - + void createAlignVec(size_t sourceSize, size_t targetSize); void addNTLength(const std::string &tok); public: @@ -41,11 +41,10 @@ public: bool equals( const PhraseAlignment& ); bool match( const PhraseAlignment& ); - int Compare(const PhraseAlignment &compare) const; - inline bool operator<(const PhraseAlignment &compare) const - { - return Compare(compare) < 0; - } + int Compare(const PhraseAlignment &compare) const; + inline bool operator<(const PhraseAlignment &compare) const { + return Compare(compare) < 0; + } const PHRASE &GetSource() const { return phraseS; @@ -53,9 +52,10 @@ public: const PHRASE &GetTarget() const { return phraseT; } - - const std::map > &GetNTLengths() const - { return m_ntLengths; } + + const std::map > &GetNTLengths() const { + return m_ntLengths; + } }; @@ -67,8 +67,7 @@ typedef std::vector PhraseAlignmentCollection; class PhraseAlignmentCollectionOrderer { public: - bool operator()(const PhraseAlignmentCollection &collA, const PhraseAlignmentCollection &collB) const - { + bool operator()(const PhraseAlignmentCollection &collA, const PhraseAlignmentCollection &collB) const { assert(collA.size() > 0); assert(collB.size() > 0); @@ -77,7 +76,7 @@ public: bool ret = objA < objB; return ret; - } + } }; @@ -97,10 +96,12 @@ public: std::pair insert ( const PhraseAlignmentCollection& obj ); - const SortedColl &GetSortedColl() const - { return m_sortedColl; } - size_t GetSize() const - { return m_coll.size(); } + const SortedColl &GetSortedColl() const { + return m_sortedColl; + } + size_t GetSize() const { + return m_coll.size(); + } private: SortedColl m_sortedColl; diff --git a/phrase-extract/PhraseExtractionOptions.h b/phrase-extract/PhraseExtractionOptions.h index 2daeaf0ca..60e56b08c 100644 --- a/phrase-extract/PhraseExtractionOptions.h +++ b/phrase-extract/PhraseExtractionOptions.h @@ -29,11 +29,12 @@ enum REO_MODEL_TYPE {REO_MSD, REO_MSLR, REO_MONO}; enum REO_POS {LEFT, RIGHT, DLEFT, DRIGHT, UNKNOWN}; -class PhraseExtractionOptions { - - public: - const int maxPhraseLength; - private: +class PhraseExtractionOptions +{ + +public: + const int maxPhraseLength; +private: bool allModelsOutputFlag; bool wordModel; REO_MODEL_TYPE wordType; @@ -48,103 +49,103 @@ class PhraseExtractionOptions { bool gzOutput; std::string instanceWeightsFile; //weights for each sentence -public: +public: PhraseExtractionOptions(const int initmaxPhraseLength): - maxPhraseLength(initmaxPhraseLength), - allModelsOutputFlag(false), - wordModel(false), - wordType(REO_MSD), - phraseModel(false), - phraseType(REO_MSD), - hierModel(false), - hierType(REO_MSD), - orientationFlag(false), - translationFlag(true), - includeSentenceIdFlag(false), - onlyOutputSpanInfo(false), - gzOutput(false){} - - //functions for initialization of options - void initAllModelsOutputFlag(const bool initallModelsOutputFlag){ - allModelsOutputFlag=initallModelsOutputFlag; - } - void initWordModel(const bool initwordModel){ - wordModel=initwordModel; - } - void initWordType(REO_MODEL_TYPE initwordType ){ - wordType=initwordType; - } - void initPhraseModel(const bool initphraseModel ){ - phraseModel=initphraseModel; - } - void initPhraseType(REO_MODEL_TYPE initphraseType){ - phraseType=initphraseType; - } - void 
initHierModel(const bool inithierModel){ - hierModel=inithierModel; - } - void initHierType(REO_MODEL_TYPE inithierType){ - hierType=inithierType; - } - void initOrientationFlag(const bool initorientationFlag){ - orientationFlag=initorientationFlag; - } - void initTranslationFlag(const bool inittranslationFlag){ - translationFlag=inittranslationFlag; - } - void initIncludeSentenceIdFlag(const bool initincludeSentenceIdFlag){ - includeSentenceIdFlag=initincludeSentenceIdFlag; - } - void initOnlyOutputSpanInfo(const bool initonlyOutputSpanInfo){ - onlyOutputSpanInfo= initonlyOutputSpanInfo; - } - void initGzOutput (const bool initgzOutput){ - gzOutput= initgzOutput; - } - void initInstanceWeightsFile(const char* initInstanceWeightsFile) { - instanceWeightsFile = std::string(initInstanceWeightsFile); - } - - // functions for getting values - bool isAllModelsOutputFlag() const { - return allModelsOutputFlag; - } - bool isWordModel() const { - return wordModel; - } - REO_MODEL_TYPE isWordType() const { - return wordType; - } - bool isPhraseModel() const { - return phraseModel; - } - REO_MODEL_TYPE isPhraseType() const { - return phraseType; - } - bool isHierModel() const { - return hierModel; - } - REO_MODEL_TYPE isHierType() const { - return hierType; - } - bool isOrientationFlag() const { - return orientationFlag; - } - bool isTranslationFlag() const { - return translationFlag; - } - bool isIncludeSentenceIdFlag() const { - return includeSentenceIdFlag; - } - bool isOnlyOutputSpanInfo() const { - return onlyOutputSpanInfo; - } - bool isGzOutput () const { - return gzOutput; - } - std::string getInstanceWeightsFile() const { - return instanceWeightsFile; - } + maxPhraseLength(initmaxPhraseLength), + allModelsOutputFlag(false), + wordModel(false), + wordType(REO_MSD), + phraseModel(false), + phraseType(REO_MSD), + hierModel(false), + hierType(REO_MSD), + orientationFlag(false), + translationFlag(true), + includeSentenceIdFlag(false), + onlyOutputSpanInfo(false), + gzOutput(false) {} + + //functions for initialization of options + void initAllModelsOutputFlag(const bool initallModelsOutputFlag) { + allModelsOutputFlag=initallModelsOutputFlag; + } + void initWordModel(const bool initwordModel) { + wordModel=initwordModel; + } + void initWordType(REO_MODEL_TYPE initwordType ) { + wordType=initwordType; + } + void initPhraseModel(const bool initphraseModel ) { + phraseModel=initphraseModel; + } + void initPhraseType(REO_MODEL_TYPE initphraseType) { + phraseType=initphraseType; + } + void initHierModel(const bool inithierModel) { + hierModel=inithierModel; + } + void initHierType(REO_MODEL_TYPE inithierType) { + hierType=inithierType; + } + void initOrientationFlag(const bool initorientationFlag) { + orientationFlag=initorientationFlag; + } + void initTranslationFlag(const bool inittranslationFlag) { + translationFlag=inittranslationFlag; + } + void initIncludeSentenceIdFlag(const bool initincludeSentenceIdFlag) { + includeSentenceIdFlag=initincludeSentenceIdFlag; + } + void initOnlyOutputSpanInfo(const bool initonlyOutputSpanInfo) { + onlyOutputSpanInfo= initonlyOutputSpanInfo; + } + void initGzOutput (const bool initgzOutput) { + gzOutput= initgzOutput; + } + void initInstanceWeightsFile(const char* initInstanceWeightsFile) { + instanceWeightsFile = std::string(initInstanceWeightsFile); + } + + // functions for getting values + bool isAllModelsOutputFlag() const { + return allModelsOutputFlag; + } + bool isWordModel() const { + return wordModel; + } + REO_MODEL_TYPE isWordType() const { + return 
wordType; + } + bool isPhraseModel() const { + return phraseModel; + } + REO_MODEL_TYPE isPhraseType() const { + return phraseType; + } + bool isHierModel() const { + return hierModel; + } + REO_MODEL_TYPE isHierType() const { + return hierType; + } + bool isOrientationFlag() const { + return orientationFlag; + } + bool isTranslationFlag() const { + return translationFlag; + } + bool isIncludeSentenceIdFlag() const { + return includeSentenceIdFlag; + } + bool isOnlyOutputSpanInfo() const { + return onlyOutputSpanInfo; + } + bool isGzOutput () const { + return gzOutput; + } + std::string getInstanceWeightsFile() const { + return instanceWeightsFile; + } }; } diff --git a/phrase-extract/RuleExtractionOptions.h b/phrase-extract/RuleExtractionOptions.h index 431be58b0..772d803a4 100644 --- a/phrase-extract/RuleExtractionOptions.h +++ b/phrase-extract/RuleExtractionOptions.h @@ -54,7 +54,7 @@ public: bool unpairedExtractFormat; bool conditionOnTargetLhs; bool boundaryRules; - + RuleExtractionOptions() : maxSpan(10) , minHoleSource(2) diff --git a/phrase-extract/ScoreFeature.cpp b/phrase-extract/ScoreFeature.cpp index 5998c528c..25e497df2 100644 --- a/phrase-extract/ScoreFeature.cpp +++ b/phrase-extract/ScoreFeature.cpp @@ -22,82 +22,81 @@ using namespace std; -namespace MosesTraining +namespace MosesTraining { - const string& ScoreFeatureManager::usage() const - { - const static string& usage = "[--[Sparse]Domain[Indicator|Ratio|Subset|Bin] domain-file [bins]]" ; - return usage; - } - - void ScoreFeatureManager::configure(const std::vector args) - { - bool domainAdded = false; - bool sparseDomainAdded = false; - for (size_t i = 0; i < args.size(); ++i) { - if (args[i] == "--IgnoreSentenceId") { - m_includeSentenceId = true; - } - else if (args[i].substr(0,8) == "--Domain") { - string type = args[i].substr(8); - ++i; - UTIL_THROW_IF(i == args.size(), ScoreFeatureArgumentException, "Missing domain file"); - string domainFile = args[i]; - UTIL_THROW_IF(domainAdded, ScoreFeatureArgumentException, - "Only allowed one domain feature"); - if (type == "Subset") { - m_features.push_back(ScoreFeaturePtr(new SubsetDomainFeature(domainFile))); - } else if (type == "Ratio") { - m_features.push_back(ScoreFeaturePtr(new RatioDomainFeature(domainFile))); - } else if (type == "Indicator") { - m_features.push_back(ScoreFeaturePtr(new IndicatorDomainFeature(domainFile))); - } else { - UTIL_THROW(ScoreFeatureArgumentException, "Unknown domain feature type " << type); - } - domainAdded = true; - m_includeSentenceId = true; - } else if (args[i].substr(0,14) == "--SparseDomain") { - string type = args[i].substr(14); - ++i; - UTIL_THROW_IF(i == args.size(), ScoreFeatureArgumentException, "Missing domain file"); - string domainFile = args[i]; - UTIL_THROW_IF(sparseDomainAdded, ScoreFeatureArgumentException, - "Only allowed one sparse domain feature"); - if (type == "Subset") { - m_features.push_back(ScoreFeaturePtr(new SparseSubsetDomainFeature(domainFile))); - } else if (type == "Ratio") { - m_features.push_back(ScoreFeaturePtr(new SparseRatioDomainFeature(domainFile))); - } else if (type == "Indicator") { - m_features.push_back(ScoreFeaturePtr(new SparseIndicatorDomainFeature(domainFile))); - } else { - UTIL_THROW(ScoreFeatureArgumentException, "Unknown domain feature type " << type); - } - sparseDomainAdded = true; - m_includeSentenceId = true; - } else { - UTIL_THROW(ScoreFeatureArgumentException,"Unknown score argument " << args[i]); - } - } - - } - - bool ScoreFeatureManager::equals(const PhraseAlignment& lhs, 
const PhraseAlignment& rhs) const - { - for (size_t i = 0; i < m_features.size(); ++i) { - if (!m_features[i]->equals(lhs,rhs)) return false; - } - return true; - } - - void ScoreFeatureManager::addFeatures(const ScoreFeatureContext& context, - std::vector& denseValues, - std::map& sparseValues) const - { - for (size_t i = 0; i < m_features.size(); ++i) { - m_features[i]->add(context, denseValues, sparseValues); - } - } +const string& ScoreFeatureManager::usage() const +{ + const static string& usage = "[--[Sparse]Domain[Indicator|Ratio|Subset|Bin] domain-file [bins]]" ; + return usage; +} + +void ScoreFeatureManager::configure(const std::vector args) +{ + bool domainAdded = false; + bool sparseDomainAdded = false; + for (size_t i = 0; i < args.size(); ++i) { + if (args[i] == "--IgnoreSentenceId") { + m_includeSentenceId = true; + } else if (args[i].substr(0,8) == "--Domain") { + string type = args[i].substr(8); + ++i; + UTIL_THROW_IF(i == args.size(), ScoreFeatureArgumentException, "Missing domain file"); + string domainFile = args[i]; + UTIL_THROW_IF(domainAdded, ScoreFeatureArgumentException, + "Only allowed one domain feature"); + if (type == "Subset") { + m_features.push_back(ScoreFeaturePtr(new SubsetDomainFeature(domainFile))); + } else if (type == "Ratio") { + m_features.push_back(ScoreFeaturePtr(new RatioDomainFeature(domainFile))); + } else if (type == "Indicator") { + m_features.push_back(ScoreFeaturePtr(new IndicatorDomainFeature(domainFile))); + } else { + UTIL_THROW(ScoreFeatureArgumentException, "Unknown domain feature type " << type); + } + domainAdded = true; + m_includeSentenceId = true; + } else if (args[i].substr(0,14) == "--SparseDomain") { + string type = args[i].substr(14); + ++i; + UTIL_THROW_IF(i == args.size(), ScoreFeatureArgumentException, "Missing domain file"); + string domainFile = args[i]; + UTIL_THROW_IF(sparseDomainAdded, ScoreFeatureArgumentException, + "Only allowed one sparse domain feature"); + if (type == "Subset") { + m_features.push_back(ScoreFeaturePtr(new SparseSubsetDomainFeature(domainFile))); + } else if (type == "Ratio") { + m_features.push_back(ScoreFeaturePtr(new SparseRatioDomainFeature(domainFile))); + } else if (type == "Indicator") { + m_features.push_back(ScoreFeaturePtr(new SparseIndicatorDomainFeature(domainFile))); + } else { + UTIL_THROW(ScoreFeatureArgumentException, "Unknown domain feature type " << type); + } + sparseDomainAdded = true; + m_includeSentenceId = true; + } else { + UTIL_THROW(ScoreFeatureArgumentException,"Unknown score argument " << args[i]); + } + } + +} + +bool ScoreFeatureManager::equals(const PhraseAlignment& lhs, const PhraseAlignment& rhs) const +{ + for (size_t i = 0; i < m_features.size(); ++i) { + if (!m_features[i]->equals(lhs,rhs)) return false; + } + return true; +} + +void ScoreFeatureManager::addFeatures(const ScoreFeatureContext& context, + std::vector& denseValues, + std::map& sparseValues) const +{ + for (size_t i = 0; i < m_features.size(); ++i) { + m_features[i]->add(context, denseValues, sparseValues); + } +} } diff --git a/phrase-extract/ScoreFeature.h b/phrase-extract/ScoreFeature.h index c7d856bcf..76939436f 100644 --- a/phrase-extract/ScoreFeature.h +++ b/phrase-extract/ScoreFeature.h @@ -20,7 +20,7 @@ /** * This contains extra features that can be added to the scorer. To add a new feature: * 1. Implement a subclass of ScoreFeature - * 2. Updated ScoreFeatureManager.configure() to configure your feature, and usage() to + * 2. 
@@ -37,35 +37,37 @@
 #include "PhraseAlignment.h"
 
-namespace MosesTraining
+namespace MosesTraining
 {
 
-struct MaybeLog{
+struct MaybeLog {
   MaybeLog(bool useLog, float negativeLog):
     m_useLog(useLog), m_negativeLog(negativeLog) {}
-
-  inline float operator() (float a) const
-  { return m_useLog ? m_negativeLog*log(a) : a; }
+
+  inline float operator() (float a) const {
+    return m_useLog ? m_negativeLog*log(a) : a;
+  }
 
   float m_useLog;
   float m_negativeLog;
 };
 
-class ScoreFeatureArgumentException : public util::Exception
+class ScoreFeatureArgumentException : public util::Exception
 {
-  public:
-    ScoreFeatureArgumentException() throw() {*this << "Unable to configure features: ";}
-    ~ScoreFeatureArgumentException() throw() {}
+public:
+  ScoreFeatureArgumentException() throw() {
+    *this << "Unable to configure features: ";
+  }
+  ~ScoreFeatureArgumentException() throw() {}
 };
 
 /** Passed to each feature to be used to calculate its values */
-struct ScoreFeatureContext
-{
+struct ScoreFeatureContext {
   ScoreFeatureContext(
     const PhraseAlignmentCollection &thePhrasePair,
     float theCount, /* Total counts of all phrase pairs*/
     const MaybeLog& theMaybeLog
-  ) :
+  ) :
     phrasePair(thePhrasePair),
     count(theCount),
     maybeLog(theMaybeLog)
@@ -82,53 +84,57 @@ struct ScoreFeatureContext
 **/
 class ScoreFeature
 {
-  public:
-    /** Add the values for this feature function. */
-    virtual void add(const ScoreFeatureContext& context,
-                     std::vector<float>& denseValues,
-                     std::map<std::string,float>& sparseValues) const = 0;
+public:
+  /** Add the values for this feature function. */
+  virtual void add(const ScoreFeatureContext& context,
+                   std::vector<float>& denseValues,
+                   std::map<std::string,float>& sparseValues) const = 0;
 
-    /** Return true if the two phrase pairs are equal from the point of this feature. Assume
-        that they already compare true according to PhraseAlignment.equals()
-     **/
-    virtual bool equals(const PhraseAlignment& lhs, const PhraseAlignment& rhs) const = 0;
+  /** Return true if the two phrase pairs are equal from the point of this feature. Assume
+     that they already compare true according to PhraseAlignment.equals()
+  **/
+  virtual bool equals(const PhraseAlignment& lhs, const PhraseAlignment& rhs) const = 0;
 
-    virtual ~ScoreFeature() {}
+  virtual ~ScoreFeature() {}
 };
 
 typedef boost::shared_ptr<ScoreFeature> ScoreFeaturePtr;
 
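// A sketch of the extension point described in the header comment above
// (hypothetical feature, not part of this commit): a new score feature only
// has to implement the two pure virtuals of ScoreFeature and be wired into
// ScoreFeatureManager::configure().
//
//   class ExampleConstantFeature : public ScoreFeature
//   {
//   public:
//     void add(const ScoreFeatureContext& context,
//              std::vector<float>& denseValues,
//              std::map<std::string,float>& sparseValues) const {
//       denseValues.push_back(context.maybeLog(2.718)); // one dense score
//     }
//     bool equals(const PhraseAlignment& lhs, const PhraseAlignment& rhs) const {
//       return true; // adds no distinctions beyond PhraseAlignment.equals()
//     }
//   };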
 class ScoreFeatureManager
 {
-  public:
-    ScoreFeatureManager():
-      m_includeSentenceId(false) {}
+public:
+  ScoreFeatureManager():
+    m_includeSentenceId(false) {}
 
-    /** To be appended to the score usage message */
-    const std::string& usage() const;
-
-    /** Pass the unused command-line arguments to configure the extra features */
-    void configure(const std::vector<std::string> args);
+  /** To be appended to the score usage message */
+  const std::string& usage() const;
 
-    /** Add all the features */
-    void addFeatures(const ScoreFeatureContext& context,
-                     std::vector<float>& denseValues,
-                     std::map<std::string,float>& sparseValues) const;
+  /** Pass the unused command-line arguments to configure the extra features */
+  void configure(const std::vector<std::string> args);
 
-    /**
-     * Used to tell if the PhraseAlignment should be considered the same by all
-     * extended features.
-     **/
-    bool equals(const PhraseAlignment& lhs, const PhraseAlignment& rhs) const;
+  /** Add all the features */
+  void addFeatures(const ScoreFeatureContext& context,
+                   std::vector<float>& denseValues,
+                   std::map<std::string,float>& sparseValues) const;
 
-    const std::vector<ScoreFeaturePtr>& getFeatures() const {return m_features;}
+  /**
+   * Used to tell if the PhraseAlignment should be considered the same by all
+   * extended features.
+   **/
+  bool equals(const PhraseAlignment& lhs, const PhraseAlignment& rhs) const;
 
-    /** Do we need to include sentence ids in phrase pairs? */
-    bool includeSentenceId() const {return m_includeSentenceId;}
+  const std::vector<ScoreFeaturePtr>& getFeatures() const {
+    return m_features;
+  }
 
-  private:
-    std::vector<ScoreFeaturePtr> m_features;
-    bool m_includeSentenceId;
+  /** Do we need to include sentence ids in phrase pairs? */
+  bool includeSentenceId() const {
+    return m_includeSentenceId;
+  }
+
+private:
+  std::vector<ScoreFeaturePtr> m_features;
+  bool m_includeSentenceId;
 };
 
 }
diff --git a/phrase-extract/ScoreFeatureTest.cpp b/phrase-extract/ScoreFeatureTest.cpp
index fecde015a..f4570fe30 100644
--- a/phrase-extract/ScoreFeatureTest.cpp
+++ b/phrase-extract/ScoreFeatureTest.cpp
@@ -31,14 +31,16 @@ using namespace MosesTraining;
 using namespace std;
 
 //pesky global variables
-namespace MosesTraining {
-  bool hierarchicalFlag = false;
-  Vocabulary vcbT;
-  Vocabulary vcbS;
+namespace MosesTraining
+{
+bool hierarchicalFlag = false;
+Vocabulary vcbT;
+Vocabulary vcbS;
 }
 
-const char *DomainFileLocation() {
+const char *DomainFileLocation()
+{
   if (boost::unit_test::framework::master_test_suite().argc < 2) {
     return "test.domain";
   }
@@ -62,7 +64,7 @@ BOOST_AUTO_TEST_CASE(manager_configure_domain_except)
 
 template <class Expected>
 static void checkDomainConfigured(
-    const vector<string>& args)
+  const vector<string>& args)
 {
   ScoreFeatureManager manager;
   manager.configure(args);
@@ -76,17 +78,17 @@ static void checkDomainConfigured(
 BOOST_AUTO_TEST_CASE(manager_config_domain)
 {
   checkDomainConfigured<RatioDomainFeature>
-    (boost::assign::list_of ("--DomainRatio")("/dev/null"));
+  (boost::assign::list_of ("--DomainRatio")("/dev/null"));
   checkDomainConfigured<IndicatorDomainFeature>
-    (boost::assign::list_of("--DomainIndicator")("/dev/null"));
+  (boost::assign::list_of("--DomainIndicator")("/dev/null"));
   checkDomainConfigured<SubsetDomainFeature>
-    (boost::assign::list_of("--DomainSubset")("/dev/null"));
+  (boost::assign::list_of("--DomainSubset")("/dev/null"));
   checkDomainConfigured<SparseRatioDomainFeature>
-    (boost::assign::list_of("--SparseDomainRatio")("/dev/null"));
+  (boost::assign::list_of("--SparseDomainRatio")("/dev/null"));
   checkDomainConfigured<SparseIndicatorDomainFeature>
-    (boost::assign::list_of("--SparseDomainIndicator")("/dev/null"));
+  (boost::assign::list_of("--SparseDomainIndicator")("/dev/null"));
   checkDomainConfigured<SparseSubsetDomainFeature>
-    (boost::assign::list_of("--SparseDomainSubset")("/dev/null"));
+  (boost::assign::list_of("--SparseDomainSubset")("/dev/null"));
 }
 
@@ -98,8 +100,8 @@ BOOST_AUTO_TEST_CASE(domain_equals)
   char buf2[] = "a ||| b ||| 0-0 ||| 2";
   char buf3[] = "a ||| b ||| 0-0 ||| 3";
   a1.create(buf1, 0, true); //domain a
-  a2.create(buf2, 1, true); //domain c
-  a3.create(buf3, 2, true); //domain c
+  a2.create(buf2, 1, true); //domain c
+  a3.create(buf3, 2, true); //domain c
   BOOST_CHECK(feature.equals(a2,a3));
   BOOST_CHECK(!feature.equals(a1,a3));
   BOOST_CHECK(!feature.equals(a1,a3));
diff --git a/phrase-extract/SentenceAlignment.cpp b/phrase-extract/SentenceAlignment.cpp
index 96ef02865..b2d5520aa 100644
--- a/phrase-extract/SentenceAlignment.cpp
+++ b/phrase-extract/SentenceAlignment.cpp
@@ -94,12 +94,12 @@ bool SentenceAlignment::create( char targetString[], char sourceString[], char a
     cerr
<< "T: " << targetString << endl << "S: " << sourceString << endl; return false; } - + if (boundaryRules) { ++s; ++t; } - + // cout << "alignmentSequence[i] " << alignmentSequence[i] << " is " << s << ", " << t << endl; if ((size_t)t >= target.size() || (size_t)s >= source.size()) { cerr << "WARNING: sentence " << sentenceID << " has alignment point (" << s << ", " << t << ") out of bounds (" << source.size() << ", " << target.size() << ")\n"; @@ -109,16 +109,16 @@ bool SentenceAlignment::create( char targetString[], char sourceString[], char a alignedToT[t].push_back( s ); alignedCountS[s]++; } - + if (boundaryRules) { alignedToT[0].push_back(0); alignedCountS[0]++; - + alignedToT.back().push_back(alignedCountS.size() - 1); alignedCountS.back()++; - + } - + return true; } diff --git a/phrase-extract/SentenceAlignment.h b/phrase-extract/SentenceAlignment.h index 76cf950d4..e215f5fef 100644 --- a/phrase-extract/SentenceAlignment.h +++ b/phrase-extract/SentenceAlignment.h @@ -45,7 +45,7 @@ public: bool create(char targetString[], char sourceString[], char alignmentString[], char weightString[], int sentenceID, bool boundaryRules); - + }; } diff --git a/phrase-extract/XmlTree.cpp b/phrase-extract/XmlTree.cpp index 05bcefe20..eedb3b260 100644 --- a/phrase-extract/XmlTree.cpp +++ b/phrase-extract/XmlTree.cpp @@ -364,7 +364,7 @@ bool ProcessAndStripXMLTags(string &line, SyntaxTree &tree, set< string > &label string pcfgString = ParseXmlTagAttribute(tagContent,"pcfg"); float pcfgScore = pcfgString == "" ? 0.0f - : std::atof(pcfgString.c_str()); + : std::atof(pcfgString.c_str()); // report what we have processed so far if (0) { diff --git a/phrase-extract/consolidate-direct-main.cpp b/phrase-extract/consolidate-direct-main.cpp index e7e68e977..3b38f741c 100644 --- a/phrase-extract/consolidate-direct-main.cpp +++ b/phrase-extract/consolidate-direct-main.cpp @@ -89,21 +89,20 @@ int main(int argc, char* argv[]) char* &fileNameConsolidated = argv[2]; ostream *fileConsolidated; - - if (strcmp(fileNameConsolidated, "-") == 0) { - fileConsolidated = &cout; - } - else { + + if (strcmp(fileNameConsolidated, "-") == 0) { + fileConsolidated = &cout; + } else { Moses::OutputFileStream *outputFile = new Moses::OutputFileStream(); - bool success = outputFile->Open(fileNameConsolidated); - if (!success) { - cerr << "ERROR: could not open file phrase table file " - << fileNameConsolidated << endl; - exit(1); - } - fileConsolidated = outputFile; - } - + bool success = outputFile->Open(fileNameConsolidated); + if (!success) { + cerr << "ERROR: could not open file phrase table file " + << fileNameConsolidated << endl; + exit(1); + } + fileConsolidated = outputFile; + } + int i=0; while(true) { i++; @@ -119,8 +118,8 @@ int main(int argc, char* argv[]) // output alignment and probabilities (*fileConsolidated) << itemDirect[2] // prob direct - << " 2.718" // phrase count feature - << " ||| " << itemDirect[3]; // alignment + << " 2.718" // phrase count feature + << " ||| " << itemDirect[3]; // alignment // counts (*fileConsolidated) << "||| 0 " << itemDirect[4]; // indirect @@ -128,11 +127,11 @@ int main(int argc, char* argv[]) } - fileConsolidated->flush(); - if (fileConsolidated != &cout) { - delete fileConsolidated; - } - + fileConsolidated->flush(); + if (fileConsolidated != &cout) { + delete fileConsolidated; + } + cerr << "Finished" << endl; } diff --git a/phrase-extract/consolidate-main.cpp b/phrase-extract/consolidate-main.cpp index fd33907de..67a097910 100644 --- a/phrase-extract/consolidate-main.cpp +++ 
b/phrase-extract/consolidate-main.cpp
@@ -42,7 +42,10 @@ bool goodTuringFlag = false;
 bool kneserNeyFlag = false;
 bool logProbFlag = false;
 bool outputNTLengths = false;
-inline float maybeLogProb( float a ) { return logProbFlag ? log(a) : a; }
+inline float maybeLogProb( float a )
+{
+  return logProbFlag ? log(a) : a;
+}
 
 char line[LINE_MAX_LENGTH];
 void processFiles( char*, char*, char*, char* );
@@ -79,7 +82,7 @@ int main(int argc, char* argv[])
       cerr << "not including the phrase count feature\n";
     } else if (strcmp(argv[i],"--GoodTuring") == 0) {
       goodTuringFlag = true;
-      if (i+1==argc) {
+      if (i+1==argc) {
         cerr << "ERROR: specify count of count files for Good Turing discounting!\n";
         exit(1);
       }
@@ -87,7 +90,7 @@ int main(int argc, char* argv[])
       cerr << "adjusting phrase translation probabilities with Good Turing discounting\n";
     } else if (strcmp(argv[i],"--KneserNey") == 0) {
       kneserNeyFlag = true;
-      if (i+1==argc) {
+      if (i+1==argc) {
         cerr << "ERROR: specify count of count files for Kneser Ney discounting!\n";
         exit(1);
       }
@@ -105,8 +108,11 @@ int main(int argc, char* argv[])
       while(i+1<argc && argv[i+1][0]>='0' && argv[i+1][0]<='9') {
         int binCount = atoi(argv[++i]);
         countBin.push_back( binCount );
-        if (prev+1 == binCount) { cerr << " " << binCount; }
-        else { cerr << " " << (prev+1) << "-" << binCount; }
+        if (prev+1 == binCount) {
+          cerr << " " << binCount;
+        } else {
+          cerr << " " << (prev+1) << "-" << binCount;
+        }
         prev = binCount;
       }
       cerr << " " << (prev+1) << "+\n";
@@ -152,7 +158,7 @@ void loadCountOfCounts( char* fileNameCountOfCounts )
   if (goodTuringFlag) {
     goodTuringDiscount.push_back(0.01); // floor value
     for( size_t i=1; i<countOfCounts.size()-1; i++ ) {
      goodTuringDiscount.push_back(((float)i+1)/(float)i * countOfCounts[i+1]/countOfCounts[i]);
      if (goodTuringDiscount[i]>1)
        goodTuringDiscount[i] = 1;
      if (goodTuringDiscount[i]<goodTuringDiscount[i-1])
        goodTuringDiscount[i] = goodTuringDiscount[i-1];
    }
  }
@@ ... @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
-    if (kneserNeyFlag) {
-      float D = kneserNey_D3;
-      if (countEF < 2) D = kneserNey_D1;
-      else if (countEF < 3) D = kneserNey_D2;
-      if (D > countEF) D = countEF - 0.01; // sanity constraint
+  if (kneserNeyFlag) {
+    float D = kneserNey_D3;
+    if (countEF < 2) D = kneserNey_D1;
+    else if (countEF < 3) D = kneserNey_D2;
+    if (D > countEF) D = countEF - 0.01; // sanity constraint
 
-      float p_b_E = n1_E / totalCount; // target phrase prob based on distinct
-      float alpha_F = D * n1_F / countF; // available mass
-      adjustedCountEF = countEF - D + countF * alpha_F * p_b_E;
+    float p_b_E = n1_E / totalCount; // target phrase prob based on distinct
+    float alpha_F = D * n1_F / countF; // available mass
+    adjustedCountEF = countEF - D + countF * alpha_F * p_b_E;
 
-      // for indirect
-      float p_b_F = n1_F / totalCount; // target phrase prob based on distinct
-      float alpha_E = D * n1_E / countE; // available mass
-      adjustedCountEF_indirect = countEF - D + countE * alpha_E * p_b_F;
-    }
+    // for indirect
+    float p_b_F = n1_F / totalCount; // target phrase prob based on distinct
+    float alpha_E = D * n1_E / countE; // available mass
+    adjustedCountEF_indirect = countEF - D + countE * alpha_E * p_b_F;
+  }
 
   // prob indirect
   if (!onlyDirectFlag) {
@@ -296,30 +302,27 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
       if (!foundBin && countEF <= countBin[i]) {
         fileConsolidated << " " << maybeLogProb(2.718);
         foundBin = true;
-      }
-      else {
+      } else {
         fileConsolidated << " " << maybeLogProb(1);
       }
     }
-    fileConsolidated << " " << maybeLogProb( foundBin ? 1 : 2.718 );
+    fileConsolidated << " " << maybeLogProb( foundBin ? 1 : 2.718 );
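// Worked illustration of the Kneser-Ney adjustment in the hunk above
// (assumed numbers, not part of the commit): take countEF = 5, countF = 10,
// n1_F = 4, n1_E = 3, totalCount = 100, and suppose D resolves to 1.0.
// Then p_b_E = 3/100 = 0.03, alpha_F = 1.0 * 4 / 10 = 0.4, and
//   adjustedCountEF = 5 - 1.0 + 10 * 0.4 * 0.03 = 4.12,
// i.e. the raw co-occurrence count is discounted by D, with a little mass
// added back in proportion to the lower-order estimate p_b_E.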
 }
 
   // alignment
   fileConsolidated << " ||| " << itemDirect[3];
 
   // counts, for debugging
-  fileConsolidated << "||| " << countE << " " << countF << " " << countEF;
+  fileConsolidated << "||| " << countE << " " << countF << " " << countEF;
 
-  if (outputNTLengths)
-  {
+  if (outputNTLengths) {
     fileConsolidated << " ||| " << itemDirect[5];
   }
-
+
   // count bin feature (as a sparse feature)
-  if (sparseCountBinFeatureFlag ||
-      directSparseScores.compare("") != 0 ||
-      indirectSparseScores.compare("") != 0)
-  {
+  if (sparseCountBinFeatureFlag ||
+      directSparseScores.compare("") != 0 ||
+      indirectSparseScores.compare("") != 0) {
     fileConsolidated << " |||";
     if (directSparseScores.compare("") != 0)
       fileConsolidated << " " << directSparseScores;
@@ -351,13 +354,13 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
   fileConsolidated.Close();
 }
 
-void breakdownCoreAndSparse( string combined, string &core, string &sparse )
+void breakdownCoreAndSparse( string combined, string &core, string &sparse )
 {
   core = "";
   sparse = "";
   vector<string> score = tokenize( combined.c_str() );
   for(size_t i=0; i<score.size(); i++) {
-    if ((score[i][0] >= '0' && score[i][0] <= '9') || i+1 == score.size())
+    if ((score[i][0] >= '0' && score[i][0] <= '9') || i+1 == score.size())
       core += " " + score[i];
     else {
       sparse += " " + score[i];
diff --git a/phrase-extract/consolidate-reverse-main.cpp b/phrase-extract/consolidate-reverse-main.cpp
index c86d870c8..6843bf3aa 100644
--- a/phrase-extract/consolidate-reverse-main.cpp
+++ b/phrase-extract/consolidate-reverse-main.cpp
@@ -229,13 +229,12 @@ string reverseAlignment(const string &alignments)
 
   vector<string> alignToks = tokenize(alignments.c_str());
 
-  for (size_t i = 0; i < alignToks.size(); ++i)
-  {
+  for (size_t i = 0; i < alignToks.size(); ++i) {
     string &alignPair = alignToks[i];
     vector<string> alignPoints;
     Tokenize(alignPoints, alignPair, "-");
     assert(alignPoints.size() == 2);
-
+
     ret << alignPoints[1] << "-" << alignPoints[0] << " ";
   }
 
diff --git a/phrase-extract/domain.cpp b/phrase-extract/domain.cpp
index 29ba8ee64..67b4a13c3 100644
--- a/phrase-extract/domain.cpp
+++ b/phrase-extract/domain.cpp
@@ -13,7 +13,8 @@ namespace MosesTraining
 {
 
 // handling of domain names: load database with sentence-id / domain name info
-void Domain::load( const std::string &domainFileName ) {
+void Domain::load( const std::string &domainFileName )
+{
   Moses::InputFileStream fileS( domainFileName );
   istream *fileP = &fileS;
   while(true) {
@@ -39,7 +40,8 @@
 }
 
 // get domain name based on sentence number
-string Domain::getDomainOfSentence( int sentenceId ) const {
+string Domain::getDomainOfSentence( int sentenceId ) const
+{
   for(size_t i=0; i<spec.size(); i++) {
@@ ... @@
-void DomainFeature::add(const ScoreFeatureContext& context,
-                        std::vector<float>& denseValues,
-                        std::map<std::string,float>& sparseValues) const
+void DomainFeature::add(const ScoreFeatureContext& context,
+                        std::vector<float>& denseValues,
+                        std::map<std::string,float>& sparseValues) const
 {
   map< string, float > domainCount;
   for(size_t i=0; i<context.phrasePair.size(); i++) {
@@ ... @@ void SubsetDomainFeature::add(const map<string, float>& domainCount,float count,
-                              const MaybeLog& maybeLog,
-                              std::vector<float>& denseValues,
-                              std::map<std::string,float>& sparseValues) const
+                              const MaybeLog& maybeLog,
+                              std::vector<float>& denseValues,
+                              std::map<std::string,float>& sparseValues) const
 {
-  if (m_domain.list.size() > 6) {
+  if (m_domain.list.size() > 6) {
     UTIL_THROW_IF(m_domain.list.size() > 6, ScoreFeatureArgumentException,
-                  "too many domains for core domain subset features");
+                  "too many domains for core domain subset features");
   }
   size_t bitmap = 0;
   for(size_t bit = 0; bit < m_domain.list.size(); bit++) {
@@ -87,13 +89,13 @@ void SubsetDomainFeature::add(const map<string, float>& domainCount,float count,
   }
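// Illustration of the subset feature computed below (assumed domain list,
// not part of the commit): with two domains {a, b}, a phrase pair whose
// occurrences all fall in domain b has bitmap == 2, so the loop over
// i = 1..3 pushes the dense values (1, 2.718, 1); after MaybeLog the
// 2.718 entry maps to log(2.718) = 1 (up to the sign convention) and the
// others to 0, giving a one-hot indicator of the observed domain subset.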
for(size_t i = 1; i < (1 << m_domain.list.size()); i++) { denseValues.push_back(maybeLog( (bitmap == i) ? 2.718 : 1 )); - } + } } void SparseSubsetDomainFeature::add(const map& domainCount,float count, - const MaybeLog& maybeLog, - std::vector& denseValues, - std::map& sparseValues) const + const MaybeLog& maybeLog, + std::vector& denseValues, + std::map& sparseValues) const { typedef vector::const_iterator I; ostringstream key; @@ -108,9 +110,9 @@ void SparseSubsetDomainFeature::add(const map& domainCount,float c void RatioDomainFeature::add(const map& domainCount,float count, - const MaybeLog& maybeLog, - std::vector& denseValues, - std::map& sparseValues) const + const MaybeLog& maybeLog, + std::vector& denseValues, + std::map& sparseValues) const { typedef vector< string >::const_iterator I; for (I i = m_domain.list.begin(); i != m_domain.list.end(); i++ ) { @@ -125,9 +127,9 @@ void RatioDomainFeature::add(const map& domainCount,float count, void SparseRatioDomainFeature::add(const map& domainCount,float count, - const MaybeLog& maybeLog, - std::vector& denseValues, - std::map& sparseValues) const + const MaybeLog& maybeLog, + std::vector& denseValues, + std::map& sparseValues) const { typedef map< string, float >::const_iterator I; for (I i=domainCount.begin(); i != domainCount.end(); i++) { @@ -137,9 +139,9 @@ void SparseRatioDomainFeature::add(const map& domainCount,float co void IndicatorDomainFeature::add(const map& domainCount,float count, - const MaybeLog& maybeLog, - std::vector& denseValues, - std::map& sparseValues) const + const MaybeLog& maybeLog, + std::vector& denseValues, + std::map& sparseValues) const { typedef vector< string >::const_iterator I; for (I i = m_domain.list.begin(); i != m_domain.list.end(); i++ ) { @@ -154,20 +156,20 @@ void IndicatorDomainFeature::add(const map& domainCount,float coun } void SparseIndicatorDomainFeature::add(const map& domainCount,float count, - const MaybeLog& maybeLog, - std::vector& denseValues, - std::map& sparseValues) const + const MaybeLog& maybeLog, + std::vector& denseValues, + std::map& sparseValues) const { typedef map< string, float >::const_iterator I; for (I i=domainCount.begin(); i != domainCount.end(); i++) { - sparseValues["dom_" + i->first] = 1; + sparseValues["dom_" + i->first] = 1; } } -bool DomainFeature::equals(const PhraseAlignment& lhs, const PhraseAlignment& rhs) const +bool DomainFeature::equals(const PhraseAlignment& lhs, const PhraseAlignment& rhs) const { return m_domain.getDomainOfSentence(lhs.sentenceId) == - m_domain.getDomainOfSentence( rhs.sentenceId); + m_domain.getDomainOfSentence( rhs.sentenceId); } diff --git a/phrase-extract/domain.h b/phrase-extract/domain.h index f3e1e92a3..279496e01 100644 --- a/phrase-extract/domain.h +++ b/phrase-extract/domain.h @@ -31,106 +31,106 @@ public: class DomainFeature : public ScoreFeature { - public: +public: - DomainFeature(const std::string& domainFile); - bool equals(const PhraseAlignment& lhs, const PhraseAlignment& rhs) const; - void add(const ScoreFeatureContext& context, - std::vector& denseValues, - std::map& sparseValues) const; + DomainFeature(const std::string& domainFile); + bool equals(const PhraseAlignment& lhs, const PhraseAlignment& rhs) const; + void add(const ScoreFeatureContext& context, + std::vector& denseValues, + std::map& sparseValues) const; - protected: - /** Overriden in subclass */ - virtual void add(const std::map& domainCounts, float count, - const MaybeLog& maybeLog, - std::vector& denseValues, - std::map& sparseValues) const = 0; - 
+protected: + /** Overriden in subclass */ + virtual void add(const std::map& domainCounts, float count, + const MaybeLog& maybeLog, + std::vector& denseValues, + std::map& sparseValues) const = 0; - Domain m_domain; + + Domain m_domain; }; class SubsetDomainFeature : public DomainFeature { - public: - SubsetDomainFeature(const std::string& domainFile) : - DomainFeature(domainFile) {} +public: + SubsetDomainFeature(const std::string& domainFile) : + DomainFeature(domainFile) {} - protected: - virtual void add(const std::map& domainCounts, float count, - const MaybeLog& maybeLog, - std::vector& denseValues, - std::map& sparseValues) const; +protected: + virtual void add(const std::map& domainCounts, float count, + const MaybeLog& maybeLog, + std::vector& denseValues, + std::map& sparseValues) const; }; class SparseSubsetDomainFeature : public DomainFeature { - public: - SparseSubsetDomainFeature(const std::string& domainFile) : - DomainFeature(domainFile) {} +public: + SparseSubsetDomainFeature(const std::string& domainFile) : + DomainFeature(domainFile) {} - protected: - virtual void add(const std::map& domainCounts, float count, - const MaybeLog& maybeLog, - std::vector& denseValues, - std::map& sparseValues) const; +protected: + virtual void add(const std::map& domainCounts, float count, + const MaybeLog& maybeLog, + std::vector& denseValues, + std::map& sparseValues) const; }; class IndicatorDomainFeature : public DomainFeature { - public: - IndicatorDomainFeature(const std::string& domainFile) : - DomainFeature(domainFile) {} +public: + IndicatorDomainFeature(const std::string& domainFile) : + DomainFeature(domainFile) {} - protected: - virtual void add(const std::map& domainCounts, float count, - const MaybeLog& maybeLog, - std::vector& denseValues, - std::map& sparseValues) const; +protected: + virtual void add(const std::map& domainCounts, float count, + const MaybeLog& maybeLog, + std::vector& denseValues, + std::map& sparseValues) const; }; class SparseIndicatorDomainFeature : public DomainFeature { - public: - SparseIndicatorDomainFeature(const std::string& domainFile) : - DomainFeature(domainFile) {} +public: + SparseIndicatorDomainFeature(const std::string& domainFile) : + DomainFeature(domainFile) {} - protected: - virtual void add(const std::map& domainCounts, float count, - const MaybeLog& maybeLog, - std::vector& denseValues, - std::map& sparseValues) const; +protected: + virtual void add(const std::map& domainCounts, float count, + const MaybeLog& maybeLog, + std::vector& denseValues, + std::map& sparseValues) const; }; class RatioDomainFeature : public DomainFeature { - public: - RatioDomainFeature(const std::string& domainFile) : - DomainFeature(domainFile) {} +public: + RatioDomainFeature(const std::string& domainFile) : + DomainFeature(domainFile) {} - protected: - virtual void add(const std::map& domainCounts, float count, - const MaybeLog& maybeLog, - std::vector& denseValues, - std::map& sparseValues) const; +protected: + virtual void add(const std::map& domainCounts, float count, + const MaybeLog& maybeLog, + std::vector& denseValues, + std::map& sparseValues) const; }; class SparseRatioDomainFeature : public DomainFeature { - public: - SparseRatioDomainFeature(const std::string& domainFile) : - DomainFeature(domainFile) {} +public: + SparseRatioDomainFeature(const std::string& domainFile) : + DomainFeature(domainFile) {} - protected: - virtual void add(const std::map& domainCounts, float count, - const MaybeLog& maybeLog, - std::vector& denseValues, - std::map& 
sparseValues) const;
+protected:
+  virtual void add(const std::map<std::string,float>& domainCounts, float count,
+                   const MaybeLog& maybeLog,
+                   std::vector<float>& denseValues,
+                   std::map<std::string,float>& sparseValues) const;
 };
 
diff --git a/phrase-extract/extract-ghkm/Alignment.cpp b/phrase-extract/extract-ghkm/Alignment.cpp
index fcd5e14e1..744b4b1a2 100644
--- a/phrase-extract/extract-ghkm/Alignment.cpp
+++ b/phrase-extract/extract-ghkm/Alignment.cpp
@@ -1,17 +1,17 @@
 /***********************************************************************
  Moses - statistical machine translation system
  Copyright (C) 2006-2011 University of Edinburgh
- 
+
  This library is free software; you can redistribute it and/or modify
  it under the terms of the GNU Lesser General Public License as published
  by the Free Software Foundation; either version 2.1 of the License, or
  (at your option) any later version.
- 
+
  This library is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  Lesser General Public License for more details.
- 
+
  You should have received a copy of the GNU Lesser General Public
  License along with this library; if not, write to the Free Software
  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
@@ -24,8 +24,10 @@
 #include
 #include
 
-namespace Moses {
-namespace GHKM {
+namespace Moses
+{
+namespace GHKM
+{
 
 Alignment ReadAlignment(const std::string &s)
 {
diff --git a/phrase-extract/extract-ghkm/Alignment.h b/phrase-extract/extract-ghkm/Alignment.h
index bc42191e1..051d5ca92 100644
--- a/phrase-extract/extract-ghkm/Alignment.h
+++ b/phrase-extract/extract-ghkm/Alignment.h
@@ -1,17 +1,17 @@
 /***********************************************************************
  Moses - statistical machine translation system
  Copyright (C) 2006-2011 University of Edinburgh
- 
+
  This library is free software; you can redistribute it and/or modify
  it under the terms of the GNU Lesser General Public License as published
  by the Free Software Foundation; either version 2.1 of the License, or
  (at your option) any later version.
- 
+
  This library is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  Lesser General Public License for more details.
- 
+
  You should have received a copy of the GNU Lesser General Public
  License along with this library; if not, write to the Free Software
  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
@@ -25,8 +25,10 @@
 #include
 #include
 
-namespace Moses {
-namespace GHKM {
+namespace Moses
+{
+namespace GHKM
+{
 
 typedef std::vector<std::pair<int, int> > Alignment;
 
diff --git a/phrase-extract/extract-ghkm/AlignmentGraph.cpp b/phrase-extract/extract-ghkm/AlignmentGraph.cpp
index 6bd32a13b..974188dbd 100644
--- a/phrase-extract/extract-ghkm/AlignmentGraph.cpp
+++ b/phrase-extract/extract-ghkm/AlignmentGraph.cpp
@@ -1,17 +1,17 @@
 /***********************************************************************
  Moses - statistical machine translation system
  Copyright (C) 2006-2011 University of Edinburgh
- 
+
  This library is free software; you can redistribute it and/or modify
  it under the terms of the GNU Lesser General Public License as published
  by the Free Software Foundation; either version 2.1 of the License, or
  (at your option) any later version.
- + This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. - + You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA @@ -30,8 +30,10 @@ #include #include -namespace Moses { -namespace GHKM { +namespace Moses +{ +namespace GHKM +{ AlignmentGraph::AlignmentGraph(const ParseTree *t, const std::vector &s, @@ -84,8 +86,8 @@ AlignmentGraph::~AlignmentGraph() } Subgraph AlignmentGraph::ComputeMinimalFrontierGraphFragment( - Node *root, - const std::set &frontierSet) + Node *root, + const std::set &frontierSet) { std::stack expandableNodes; std::set expandedNodes; @@ -302,7 +304,7 @@ void AlignmentGraph::CalcComplementSpans(Node *root) } void AlignmentGraph::GetTargetTreeLeaves(Node *root, - std::vector &leaves) + std::vector &leaves) { if (root->IsSink()) { leaves.push_back(root); diff --git a/phrase-extract/extract-ghkm/AlignmentGraph.h b/phrase-extract/extract-ghkm/AlignmentGraph.h index 94948758a..cf26b8c27 100644 --- a/phrase-extract/extract-ghkm/AlignmentGraph.h +++ b/phrase-extract/extract-ghkm/AlignmentGraph.h @@ -1,17 +1,17 @@ /*********************************************************************** Moses - statistical machine translation system Copyright (C) 2006-2011 University of Edinburgh - + This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. - + This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
- + You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA @@ -28,8 +28,10 @@ #include #include -namespace Moses { -namespace GHKM { +namespace Moses +{ +namespace GHKM +{ class Node; class ParseTree; @@ -37,20 +39,24 @@ class Subgraph; class AlignmentGraph { - public: +public: AlignmentGraph(const ParseTree *, const std::vector &, const Alignment &); ~AlignmentGraph(); - Node *GetRoot() { return m_root; } - const std::vector &GetTargetNodes() { return m_targetNodes; } + Node *GetRoot() { + return m_root; + } + const std::vector &GetTargetNodes() { + return m_targetNodes; + } void ExtractMinimalRules(const Options &); void ExtractComposedRules(const Options &); - private: +private: // Disallow copying AlignmentGraph(const AlignmentGraph &); AlignmentGraph &operator=(const AlignmentGraph &); @@ -58,11 +64,11 @@ class AlignmentGraph Node *CopyParseTree(const ParseTree *); void ComputeFrontierSet(Node *, const Options &, std::set &) const; void CalcComplementSpans(Node *); - void GetTargetTreeLeaves(Node *, std::vector &); + void GetTargetTreeLeaves(Node *, std::vector &); void AttachUnalignedSourceWords(); Node *DetermineAttachmentPoint(int); Subgraph ComputeMinimalFrontierGraphFragment(Node *, - const std::set &); + const std::set &); void ExtractComposedRules(Node *, const Options &); Node *m_root; diff --git a/phrase-extract/extract-ghkm/ComposedRule.cpp b/phrase-extract/extract-ghkm/ComposedRule.cpp index 8bf3cfc72..e9fc826b7 100644 --- a/phrase-extract/extract-ghkm/ComposedRule.cpp +++ b/phrase-extract/extract-ghkm/ComposedRule.cpp @@ -1,17 +1,17 @@ /*********************************************************************** Moses - statistical machine translation system Copyright (C) 2006-2011 University of Edinburgh - + This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. - + This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
- + You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA @@ -27,14 +27,16 @@ #include #include -namespace Moses { -namespace GHKM { +namespace Moses +{ +namespace GHKM +{ ComposedRule::ComposedRule(const Subgraph &baseRule) - : m_baseRule(baseRule) - , m_depth(baseRule.GetDepth()) - , m_size(baseRule.GetSize()) - , m_nodeCount(baseRule.GetNodeCount()) + : m_baseRule(baseRule) + , m_depth(baseRule.GetDepth()) + , m_size(baseRule.GetSize()) + , m_nodeCount(baseRule.GetNodeCount()) { const std::set &leaves = baseRule.GetLeaves(); for (std::set::const_iterator p = leaves.begin(); @@ -47,12 +49,12 @@ ComposedRule::ComposedRule(const Subgraph &baseRule) ComposedRule::ComposedRule(const ComposedRule &other, const Subgraph &rule, int depth) - : m_baseRule(other.m_baseRule) - , m_attachedRules(other.m_attachedRules) - , m_openAttachmentPoints(other.m_openAttachmentPoints) - , m_depth(depth) - , m_size(other.m_size+rule.GetSize()) - , m_nodeCount(other.m_nodeCount+rule.GetNodeCount()-1) + : m_baseRule(other.m_baseRule) + , m_attachedRules(other.m_attachedRules) + , m_openAttachmentPoints(other.m_openAttachmentPoints) + , m_depth(depth) + , m_size(other.m_size+rule.GetSize()) + , m_nodeCount(other.m_nodeCount+rule.GetNodeCount()-1) { m_attachedRules.push_back(&rule); m_openAttachmentPoints.pop(); @@ -71,7 +73,7 @@ void ComposedRule::CloseAttachmentPoint() } ComposedRule *ComposedRule::AttemptComposition(const Subgraph &rule, - const Options &options) const + const Options &options) const { // The smallest possible rule fragment should be rooted at a tree node. // Note that this differs from the original GHKM definition. diff --git a/phrase-extract/extract-ghkm/ComposedRule.h b/phrase-extract/extract-ghkm/ComposedRule.h index 65ce9ac70..b5f72a492 100644 --- a/phrase-extract/extract-ghkm/ComposedRule.h +++ b/phrase-extract/extract-ghkm/ComposedRule.h @@ -1,17 +1,17 @@ /*********************************************************************** Moses - statistical machine translation system Copyright (C) 2006-2011 University of Edinburgh - + This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. - + This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. - + You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA @@ -26,15 +26,17 @@ #include #include -namespace Moses { -namespace GHKM { +namespace Moses +{ +namespace GHKM +{ class Node; struct Options; class ComposedRule { - public: +public: // Form a 'trivial' ComposedRule from a single existing rule. ComposedRule(const Subgraph &baseRule); @@ -53,7 +55,7 @@ class ComposedRule // Constructs a Subgraph object corresponding to the composed rule. 
Subgraph CreateSubgraph(); - private: +private: ComposedRule(const ComposedRule &, const Subgraph &, int); const Subgraph &m_baseRule; diff --git a/phrase-extract/extract-ghkm/Exception.h b/phrase-extract/extract-ghkm/Exception.h index 9928785f0..a1e623cd1 100644 --- a/phrase-extract/extract-ghkm/Exception.h +++ b/phrase-extract/extract-ghkm/Exception.h @@ -1,17 +1,17 @@ /*********************************************************************** Moses - statistical machine translation system Copyright (C) 2006-2011 University of Edinburgh - + This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. - + This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. - + You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA @@ -23,16 +23,20 @@ #include -namespace Moses { -namespace GHKM { +namespace Moses +{ +namespace GHKM +{ class Exception { - public: +public: Exception(const char *msg) : m_msg(msg) {} Exception(const std::string &msg) : m_msg(msg) {} - const std::string &GetMsg() const { return m_msg; } - private: + const std::string &GetMsg() const { + return m_msg; + } +private: std::string m_msg; }; diff --git a/phrase-extract/extract-ghkm/ExtractGHKM.cpp b/phrase-extract/extract-ghkm/ExtractGHKM.cpp index e3b52943c..80568ccd5 100644 --- a/phrase-extract/extract-ghkm/ExtractGHKM.cpp +++ b/phrase-extract/extract-ghkm/ExtractGHKM.cpp @@ -1,17 +1,17 @@ /*********************************************************************** Moses - statistical machine translation system Copyright (C) 2006-2011 University of Edinburgh - + This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. - + This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. - + You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA @@ -43,8 +43,10 @@ #include #include -namespace Moses { -namespace GHKM { +namespace Moses +{ +namespace GHKM +{ int ExtractGHKM::Main(int argc, char *argv[]) { @@ -107,7 +109,7 @@ int ExtractGHKM::Main(int argc, char *argv[]) ++lineNum; // Parse target tree. - if (targetLine.size() == 0) { + if (targetLine.size() == 0) { std::cerr << "skipping line " << lineNum << " with empty target tree\n"; continue; } @@ -263,64 +265,64 @@ void ExtractGHKM::ProcessOptions(int argc, char *argv[], // Declare the command line options that are visible to the user. 
po::options_description visible(usageTop.str()); visible.add_options() - //("help", "print this help message and exit") - ("AllowUnary", - "allow fully non-lexical unary rules") - ("ConditionOnTargetLHS", - "write target LHS instead of \"X\" as source LHS") - ("GlueGrammar", - po::value(&options.glueGrammarFile), - "write glue grammar to named file") - ("GZOutput", - "write gzipped extract files") - ("MaxNodes", - po::value(&options.maxNodes)->default_value(options.maxNodes), - "set maximum number of tree nodes for composed rules") - ("MaxRuleDepth", - po::value(&options.maxRuleDepth)->default_value(options.maxRuleDepth), - "set maximum depth for composed rules") - ("MaxRuleSize", - po::value(&options.maxRuleSize)->default_value(options.maxRuleSize), - "set maximum size for composed rules") - ("MaxScope", - po::value(&options.maxScope)->default_value(options.maxScope), - "set maximum allowed scope") - ("Minimal", - "extract minimal rules only") - ("PCFG", - "include score based on PCFG scores in target corpus") - ("SentenceOffset", - po::value(&options.sentenceOffset)->default_value(options.sentenceOffset), - "set sentence number offset if processing split corpus") - ("UnknownWordLabel", - po::value(&options.unknownWordFile), - "write unknown word labels to named file") - ("UnknownWordMinRelFreq", - po::value(&options.unknownWordMinRelFreq)->default_value( - options.unknownWordMinRelFreq), - "set minimum relative frequency for unknown word labels") - ("UnknownWordUniform", - "write uniform weights to unknown word label file") - ("UnpairedExtractFormat", - "do not pair non-terminals in extract files") + //("help", "print this help message and exit") + ("AllowUnary", + "allow fully non-lexical unary rules") + ("ConditionOnTargetLHS", + "write target LHS instead of \"X\" as source LHS") + ("GlueGrammar", + po::value(&options.glueGrammarFile), + "write glue grammar to named file") + ("GZOutput", + "write gzipped extract files") + ("MaxNodes", + po::value(&options.maxNodes)->default_value(options.maxNodes), + "set maximum number of tree nodes for composed rules") + ("MaxRuleDepth", + po::value(&options.maxRuleDepth)->default_value(options.maxRuleDepth), + "set maximum depth for composed rules") + ("MaxRuleSize", + po::value(&options.maxRuleSize)->default_value(options.maxRuleSize), + "set maximum size for composed rules") + ("MaxScope", + po::value(&options.maxScope)->default_value(options.maxScope), + "set maximum allowed scope") + ("Minimal", + "extract minimal rules only") + ("PCFG", + "include score based on PCFG scores in target corpus") + ("SentenceOffset", + po::value(&options.sentenceOffset)->default_value(options.sentenceOffset), + "set sentence number offset if processing split corpus") + ("UnknownWordLabel", + po::value(&options.unknownWordFile), + "write unknown word labels to named file") + ("UnknownWordMinRelFreq", + po::value(&options.unknownWordMinRelFreq)->default_value( + options.unknownWordMinRelFreq), + "set minimum relative frequency for unknown word labels") + ("UnknownWordUniform", + "write uniform weights to unknown word label file") + ("UnpairedExtractFormat", + "do not pair non-terminals in extract files") ; // Declare the command line options that are hidden from the user // (these are used as positional options). 
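// Illustration (assumed invocation, not part of the commit): the four
// hidden options declared below are bound positionally, so a typical call
// looks like
//   extract-ghkm corpus.target corpus.source corpus.align extract.out --MaxScope 3
// with the files mapping, in order, to TargetFile, SourceFile,
// AlignmentFile and ExtractFile.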
po::options_description hidden("Hidden options"); hidden.add_options() - ("TargetFile", - po::value(&options.targetFile), - "target file") - ("SourceFile", - po::value(&options.sourceFile), - "source file") - ("AlignmentFile", - po::value(&options.alignmentFile), - "alignment file") - ("ExtractFile", - po::value(&options.extractFile), - "extract file") + ("TargetFile", + po::value(&options.targetFile), + "target file") + ("SourceFile", + po::value(&options.sourceFile), + "source file") + ("AlignmentFile", + po::value(&options.alignmentFile), + "alignment file") + ("ExtractFile", + po::value(&options.extractFile), + "extract file") ; // Compose the full set of command-line options. @@ -337,8 +339,8 @@ void ExtractGHKM::ProcessOptions(int argc, char *argv[], // Process the command-line. po::variables_map vm; const int optionStyle = cls::allow_long - | cls::long_allow_adjacent - | cls::long_allow_next; + | cls::long_allow_adjacent + | cls::long_allow_next; try { po::store(po::command_line_parser(argc, argv).style(optionStyle). options(cmdLineOptions).positional(p).run(), vm); @@ -424,9 +426,9 @@ std::vector ExtractGHKM::ReadTokens(const std::string &s) } void ExtractGHKM::WriteGlueGrammar( - const std::set &labelSet, - const std::map &topLabelSet, - std::ostream &out) + const std::set &labelSet, + const std::map &topLabelSet, + std::ostream &out) { // chose a top label that is not already a label std::string topLabel = "QQQQQQ"; @@ -457,10 +459,10 @@ void ExtractGHKM::WriteGlueGrammar( } void ExtractGHKM::CollectWordLabelCounts( - ParseTree &root, - const Options &options, - std::map &wordCount, - std::map &wordLabel) + ParseTree &root, + const Options &options, + std::map &wordCount, + std::map &wordLabel) { std::vector leaves; root.GetLeaves(std::back_inserter(leaves)); @@ -486,10 +488,10 @@ void ExtractGHKM::CollectWordLabelCounts( } void ExtractGHKM::WriteUnknownWordLabel( - const std::map &wordCount, - const std::map &wordLabel, - const Options &options, - std::ostream &out) + const std::map &wordCount, + const std::map &wordLabel, + const Options &options, + std::ostream &out) { std::map labelCount; int total = 0; diff --git a/phrase-extract/extract-ghkm/ExtractGHKM.h b/phrase-extract/extract-ghkm/ExtractGHKM.h index 6519bf675..c78aea109 100644 --- a/phrase-extract/extract-ghkm/ExtractGHKM.h +++ b/phrase-extract/extract-ghkm/ExtractGHKM.h @@ -1,17 +1,17 @@ /*********************************************************************** Moses - statistical machine translation system Copyright (C) 2006-2011 University of Edinburgh - + This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. - + This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
- + You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA @@ -27,22 +27,26 @@ #include #include -namespace Moses { +namespace Moses +{ class OutputFileStream; -namespace GHKM { +namespace GHKM +{ struct Options; class ParseTree; class ExtractGHKM { - public: +public: ExtractGHKM() : m_name("extract-ghkm") {} - const std::string &GetName() const { return m_name; } + const std::string &GetName() const { + return m_name; + } int Main(int argc, char *argv[]); - private: +private: void Error(const std::string &) const; void OpenInputFileOrDie(const std::string &, std::ifstream &); void OpenOutputFileOrDie(const std::string &, std::ofstream &); @@ -60,7 +64,7 @@ class ExtractGHKM const std::map &, std::ostream &); std::vector ReadTokens(const std::string &); - + void ProcessOptions(int, char *[], Options &) const; std::string m_name; diff --git a/phrase-extract/extract-ghkm/Main.cpp b/phrase-extract/extract-ghkm/Main.cpp index faf3230a6..14064406b 100644 --- a/phrase-extract/extract-ghkm/Main.cpp +++ b/phrase-extract/extract-ghkm/Main.cpp @@ -1,17 +1,17 @@ /*********************************************************************** Moses - statistical machine translation system Copyright (C) 2006-2011 University of Edinburgh - + This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. - + This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. - + You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA diff --git a/phrase-extract/extract-ghkm/Node.cpp b/phrase-extract/extract-ghkm/Node.cpp index beb7470b8..e14d8c050 100644 --- a/phrase-extract/extract-ghkm/Node.cpp +++ b/phrase-extract/extract-ghkm/Node.cpp @@ -1,17 +1,17 @@ /*********************************************************************** Moses - statistical machine translation system Copyright (C) 2006-2011 University of Edinburgh - + This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. - + This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
- + You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA @@ -21,8 +21,10 @@ #include "Subgraph.h" -namespace Moses { -namespace GHKM { +namespace Moses +{ +namespace GHKM +{ Node::~Node() { diff --git a/phrase-extract/extract-ghkm/Node.h b/phrase-extract/extract-ghkm/Node.h index 775473362..2eed01311 100644 --- a/phrase-extract/extract-ghkm/Node.h +++ b/phrase-extract/extract-ghkm/Node.h @@ -1,17 +1,17 @@ /*********************************************************************** Moses - statistical machine translation system Copyright (C) 2006-2011 University of Edinburgh - + This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. - + This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. - + You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA @@ -28,8 +28,10 @@ #include #include -namespace Moses { -namespace GHKM { +namespace Moses +{ +namespace GHKM +{ class Subgraph; @@ -37,34 +39,68 @@ enum NodeType { SOURCE, TARGET, TREE }; class Node { - public: +public: Node(const std::string &label, NodeType type) - : m_label(label) - , m_type(type) - , m_pcfgScore(0.0f) {} + : m_label(label) + , m_type(type) + , m_pcfgScore(0.0f) {} ~Node(); - const std::string &GetLabel() const { return m_label; } - NodeType GetType() const { return m_type; } - const std::vector &GetChildren() const { return m_children; } - const std::vector &GetParents() const { return m_parents; } - float GetPcfgScore() const { return m_pcfgScore; } - const Span &GetSpan() const { return m_span; } - const Span &GetComplementSpan() const { return m_complementSpan; } - const std::vector &GetRules() const { return m_rules; } + const std::string &GetLabel() const { + return m_label; + } + NodeType GetType() const { + return m_type; + } + const std::vector &GetChildren() const { + return m_children; + } + const std::vector &GetParents() const { + return m_parents; + } + float GetPcfgScore() const { + return m_pcfgScore; + } + const Span &GetSpan() const { + return m_span; + } + const Span &GetComplementSpan() const { + return m_complementSpan; + } + const std::vector &GetRules() const { + return m_rules; + } - void SetChildren(const std::vector &c) { m_children = c; } - void SetParents(const std::vector &p) { m_parents = p; } - void SetPcfgScore(float s) { m_pcfgScore = s; } - void SetSpan(const Span &s) { m_span = s; } - void SetComplementSpan(const Span &cs) { m_complementSpan = cs; } + void SetChildren(const std::vector &c) { + m_children = c; + } + void SetParents(const std::vector &p) { + m_parents = p; + } + void SetPcfgScore(float s) { + m_pcfgScore = s; + } + void SetSpan(const Span &s) { + m_span = s; + } + void SetComplementSpan(const Span &cs) { + m_complementSpan = cs; + } - void AddChild(Node *c) { m_children.push_back(c); } - void AddParent(Node *p) { m_parents.push_back(p); } - void AddRule(const Subgraph *s) { m_rules.push_back(s); } + 
void AddChild(Node *c) { + m_children.push_back(c); + } + void AddParent(Node *p) { + m_parents.push_back(p); + } + void AddRule(const Subgraph *s) { + m_rules.push_back(s); + } - bool IsSink() const { return m_children.empty(); } + bool IsSink() const { + return m_children.empty(); + } bool IsPreterminal() const; void PropagateIndex(int); @@ -82,7 +118,7 @@ class Node template static Node *LowestCommonAncestor(InputIterator first, InputIterator last); - private: +private: // Disallow copying Node(const Node &); Node &operator=(const Node &); diff --git a/phrase-extract/extract-ghkm/Options.h b/phrase-extract/extract-ghkm/Options.h index d348a57d8..e54a9ddae 100644 --- a/phrase-extract/extract-ghkm/Options.h +++ b/phrase-extract/extract-ghkm/Options.h @@ -1,17 +1,17 @@ /*********************************************************************** Moses - statistical machine translation system Copyright (C) 2006-2011 University of Edinburgh - + This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. - + This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. - + You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA @@ -23,25 +23,27 @@ #include -namespace Moses { -namespace GHKM { +namespace Moses +{ +namespace GHKM +{ struct Options { - public: +public: Options() - : allowUnary(false) - , conditionOnTargetLhs(false) - , gzOutput(false) - , maxNodes(15) - , maxRuleDepth(3) - , maxRuleSize(3) - , maxScope(3) - , minimal(false) - , pcfg(false) - , sentenceOffset(0) - , unpairedExtractFormat(false) - , unknownWordMinRelFreq(0.03f) - , unknownWordUniform(false) {} + : allowUnary(false) + , conditionOnTargetLhs(false) + , gzOutput(false) + , maxNodes(15) + , maxRuleDepth(3) + , maxRuleSize(3) + , maxScope(3) + , minimal(false) + , pcfg(false) + , sentenceOffset(0) + , unpairedExtractFormat(false) + , unknownWordMinRelFreq(0.03f) + , unknownWordUniform(false) {} // Positional options std::string targetFile; diff --git a/phrase-extract/extract-ghkm/ParseTree.cpp b/phrase-extract/extract-ghkm/ParseTree.cpp index 052b8dee1..f86486487 100644 --- a/phrase-extract/extract-ghkm/ParseTree.cpp +++ b/phrase-extract/extract-ghkm/ParseTree.cpp @@ -1,17 +1,17 @@ /*********************************************************************** Moses - statistical machine translation system Copyright (C) 2006-2011 University of Edinburgh - + This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. - + This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
- + You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA @@ -19,8 +19,10 @@ #include "ParseTree.h" -namespace Moses { -namespace GHKM { +namespace Moses +{ +namespace GHKM +{ ParseTree::~ParseTree() { diff --git a/phrase-extract/extract-ghkm/ParseTree.h b/phrase-extract/extract-ghkm/ParseTree.h index 273e2e04e..03da17735 100644 --- a/phrase-extract/extract-ghkm/ParseTree.h +++ b/phrase-extract/extract-ghkm/ParseTree.h @@ -1,17 +1,17 @@ /*********************************************************************** Moses - statistical machine translation system Copyright (C) 2006-2011 University of Edinburgh - + This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. - + This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. - + You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA @@ -24,27 +24,39 @@ #include #include -namespace Moses { -namespace GHKM { +namespace Moses +{ +namespace GHKM +{ class ParseTree { - public: +public: ParseTree(const std::string &label) - : m_label(label) - , m_parent(0) - , m_pcfgScore(0.0) {} + : m_label(label) + , m_parent(0) + , m_pcfgScore(0.0) {} ~ParseTree(); - const std::string &GetLabel() const { return m_label; } - const std::vector &GetChildren() const { return m_children; } - const ParseTree *GetParent() const { return m_parent; } - float GetPcfgScore() const { return m_pcfgScore; } + const std::string &GetLabel() const { + return m_label; + } + const std::vector &GetChildren() const { + return m_children; + } + const ParseTree *GetParent() const { + return m_parent; + } + float GetPcfgScore() const { + return m_pcfgScore; + } void SetParent(ParseTree *); void SetChildren(const std::vector &); - void SetPcfgScore(float score) { m_pcfgScore = score; } + void SetPcfgScore(float score) { + m_pcfgScore = score; + } void AddChild(ParseTree *); @@ -53,7 +65,7 @@ class ParseTree template void GetLeaves(OutputIterator); - private: +private: // Disallow copying ParseTree(const ParseTree &); ParseTree &operator=(const ParseTree &); diff --git a/phrase-extract/extract-ghkm/ScfgRule.cpp b/phrase-extract/extract-ghkm/ScfgRule.cpp index 5dc70052c..2c901413d 100644 --- a/phrase-extract/extract-ghkm/ScfgRule.cpp +++ b/phrase-extract/extract-ghkm/ScfgRule.cpp @@ -1,17 +1,17 @@ /*********************************************************************** Moses - statistical machine translation system Copyright (C) 2006-2011 University of Edinburgh - + This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. - + This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU Lesser General Public License for more details. - + You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA @@ -24,13 +24,15 @@ #include -namespace Moses { -namespace GHKM { +namespace Moses +{ +namespace GHKM +{ ScfgRule::ScfgRule(const Subgraph &fragment) - : m_sourceLHS("X", NonTerminal) - , m_targetLHS(fragment.GetRoot()->GetLabel(), NonTerminal) - , m_pcfgScore(fragment.GetPcfgScore()) + : m_sourceLHS("X", NonTerminal) + , m_targetLHS(fragment.GetRoot()->GetLabel(), NonTerminal) + , m_pcfgScore(fragment.GetPcfgScore()) { // Source RHS diff --git a/phrase-extract/extract-ghkm/ScfgRule.h b/phrase-extract/extract-ghkm/ScfgRule.h index 2405d8fa3..21a9e9900 100644 --- a/phrase-extract/extract-ghkm/ScfgRule.h +++ b/phrase-extract/extract-ghkm/ScfgRule.h @@ -1,17 +1,17 @@ /*********************************************************************** Moses - statistical machine translation system Copyright (C) 2006-2011 University of Edinburgh - + This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. - + This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. - + You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA @@ -26,42 +26,59 @@ #include #include -namespace Moses { -namespace GHKM { +namespace Moses +{ +namespace GHKM +{ class Node; class Subgraph; enum SymbolType { Terminal, NonTerminal }; -struct Symbol -{ - public: +struct Symbol { +public: Symbol(const std::string &v, SymbolType t) : m_value(v) , m_type(t) {} - const std::string &GetValue() const { return m_value; } - SymbolType GetType() const { return m_type; } + const std::string &GetValue() const { + return m_value; + } + SymbolType GetType() const { + return m_type; + } - private: +private: std::string m_value; SymbolType m_type; }; class ScfgRule { - public: +public: ScfgRule(const Subgraph &fragment); - const Symbol &GetSourceLHS() const { return m_sourceLHS; } - const Symbol &GetTargetLHS() const { return m_targetLHS; } - const std::vector &GetSourceRHS() const { return m_sourceRHS; } - const std::vector &GetTargetRHS() const { return m_targetRHS; } - const Alignment &GetAlignment() const { return m_alignment; } - float GetPcfgScore() const { return m_pcfgScore; } + const Symbol &GetSourceLHS() const { + return m_sourceLHS; + } + const Symbol &GetTargetLHS() const { + return m_targetLHS; + } + const std::vector &GetSourceRHS() const { + return m_sourceRHS; + } + const std::vector &GetTargetRHS() const { + return m_targetRHS; + } + const Alignment &GetAlignment() const { + return m_alignment; + } + float GetPcfgScore() const { + return m_pcfgScore; + } int Scope() const; - private: +private: static bool PartitionOrderComp(const Node *, const Node *); Symbol m_sourceLHS; diff --git a/phrase-extract/extract-ghkm/ScfgRuleWriter.cpp b/phrase-extract/extract-ghkm/ScfgRuleWriter.cpp index cd993d6e8..54b3978d1 100644 --- 
a/phrase-extract/extract-ghkm/ScfgRuleWriter.cpp +++ b/phrase-extract/extract-ghkm/ScfgRuleWriter.cpp @@ -1,17 +1,17 @@ /*********************************************************************** Moses - statistical machine translation system Copyright (C) 2006-2011 University of Edinburgh - + This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. - + This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. - + You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA @@ -30,8 +30,10 @@ #include #include -namespace Moses { -namespace GHKM { +namespace Moses +{ +namespace GHKM +{ void ScfgRuleWriter::Write(const ScfgRule &rule) { @@ -70,8 +72,8 @@ void ScfgRuleWriter::Write(const ScfgRule &rule) } void ScfgRuleWriter::WriteStandardFormat(const ScfgRule &rule, - std::ostream &sourceSS, - std::ostream &targetSS) + std::ostream &sourceSS, + std::ostream &targetSS) { const std::vector &sourceRHS = rule.GetSourceRHS(); const std::vector &targetRHS = rule.GetTargetRHS(); @@ -122,8 +124,8 @@ void ScfgRuleWriter::WriteStandardFormat(const ScfgRule &rule, } void ScfgRuleWriter::WriteUnpairedFormat(const ScfgRule &rule, - std::ostream &sourceSS, - std::ostream &targetSS) + std::ostream &sourceSS, + std::ostream &targetSS) { const std::vector &sourceRHS = rule.GetSourceRHS(); const std::vector &targetRHS = rule.GetTargetRHS(); diff --git a/phrase-extract/extract-ghkm/ScfgRuleWriter.h b/phrase-extract/extract-ghkm/ScfgRuleWriter.h index b92a432a1..ee29e49e5 100644 --- a/phrase-extract/extract-ghkm/ScfgRuleWriter.h +++ b/phrase-extract/extract-ghkm/ScfgRuleWriter.h @@ -1,17 +1,17 @@ /*********************************************************************** Moses - statistical machine translation system Copyright (C) 2006-2011 University of Edinburgh - + This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. - + This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
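An illustrative aside, not part of the patch: Symbol is the small value type that WriteStandardFormat and WriteUnpairedFormat above iterate over; the element type behind GetSourceRHS/GetTargetRHS is assumed to be std::vector<Symbol> (this copy of the diff has lost the angle-bracketed template arguments). Constructing symbols and reading them back:

  #include <cassert>
  using Moses::GHKM::Symbol;

  Symbol lhs("X", Moses::GHKM::NonTerminal);
  Symbol word("house", Moses::GHKM::Terminal);
  assert(lhs.GetType() == Moses::GHKM::NonTerminal);
  assert(word.GetValue() == "house");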
- + You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA @@ -23,8 +23,10 @@ #include -namespace Moses { -namespace GHKM { +namespace Moses +{ +namespace GHKM +{ struct Options; class ScfgRule; @@ -32,15 +34,15 @@ struct Symbol; class ScfgRuleWriter { - public: +public: ScfgRuleWriter(std::ostream &fwd, std::ostream &inv, const Options &options) - : m_fwd(fwd) - , m_inv(inv) - , m_options(options) {} + : m_fwd(fwd) + , m_inv(inv) + , m_options(options) {} void Write(const ScfgRule &); - private: +private: // Disallow copying ScfgRuleWriter(const ScfgRuleWriter &); ScfgRuleWriter &operator=(const ScfgRuleWriter &); diff --git a/phrase-extract/extract-ghkm/Span.cpp b/phrase-extract/extract-ghkm/Span.cpp index f0eccbdf2..d637ec3d2 100644 --- a/phrase-extract/extract-ghkm/Span.cpp +++ b/phrase-extract/extract-ghkm/Span.cpp @@ -1,17 +1,17 @@ /*********************************************************************** Moses - statistical machine translation system Copyright (C) 2006-2011 University of Edinburgh - + This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. - + This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. - + You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA @@ -19,8 +19,10 @@ #include "Span.h" -namespace Moses { -namespace GHKM { +namespace Moses +{ +namespace GHKM +{ bool SpansIntersect(const Span &a, const ContiguousSpan &b) { diff --git a/phrase-extract/extract-ghkm/Span.h b/phrase-extract/extract-ghkm/Span.h index 003d1ef84..c4d146c4e 100644 --- a/phrase-extract/extract-ghkm/Span.h +++ b/phrase-extract/extract-ghkm/Span.h @@ -1,17 +1,17 @@ /*********************************************************************** Moses - statistical machine translation system Copyright (C) 2006-2011 University of Edinburgh - + This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. - + This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
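An illustrative aside, not part of the patch: wiring up the two-stream writer whose constructor appears above. The file names are invented, `fragment` stands for a Subgraph produced by rule extraction, and the scope check is a hypothetical use of ScfgRule::Scope(), not a claim about where the extractor performs it:

  #include <fstream>

  std::ofstream fwd("rules.fwd"), inv("rules.inv");
  Moses::GHKM::Options options;
  Moses::GHKM::ScfgRuleWriter writer(fwd, inv, options);
  Moses::GHKM::ScfgRule rule(fragment);   // fragment: a Subgraph, assumed given
  if (rule.Scope() <= options.maxScope) {
    writer.Write(rule);                   // emits forward and inverse forms
  }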
- + You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA @@ -24,8 +24,10 @@ #include #include -namespace Moses { -namespace GHKM { +namespace Moses +{ +namespace GHKM +{ typedef std::set Span; typedef std::pair ContiguousSpan; diff --git a/phrase-extract/extract-ghkm/Subgraph.cpp b/phrase-extract/extract-ghkm/Subgraph.cpp index e048f2c55..3c0503010 100644 --- a/phrase-extract/extract-ghkm/Subgraph.cpp +++ b/phrase-extract/extract-ghkm/Subgraph.cpp @@ -1,17 +1,17 @@ /*********************************************************************** Moses - statistical machine translation system Copyright (C) 2006-2011 University of Edinburgh - + This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. - + This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. - + You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA @@ -21,8 +21,10 @@ #include "Node.h" -namespace Moses { -namespace GHKM { +namespace Moses +{ +namespace GHKM +{ void Subgraph::GetTargetLeaves(std::vector &result) const { diff --git a/phrase-extract/extract-ghkm/Subgraph.h b/phrase-extract/extract-ghkm/Subgraph.h index ede1233e9..f4d1e0c8d 100644 --- a/phrase-extract/extract-ghkm/Subgraph.h +++ b/phrase-extract/extract-ghkm/Subgraph.h @@ -1,17 +1,17 @@ /*********************************************************************** Moses - statistical machine translation system Copyright (C) 2006-2011 University of Edinburgh - + This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. - + This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. - + You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA @@ -26,47 +26,62 @@ #include #include -namespace Moses { -namespace GHKM { +namespace Moses +{ +namespace GHKM +{ class Node; class Subgraph { - public: +public: Subgraph(const Node *root) - : m_root(root) - , m_depth(0) - , m_size(root->GetType() == TREE ? 1 : 0) - , m_nodeCount(1) - , m_pcfgScore(0.0f) {} + : m_root(root) + , m_depth(0) + , m_size(root->GetType() == TREE ? 
1 : 0) + , m_nodeCount(1) + , m_pcfgScore(0.0f) {} Subgraph(const Node *root, const std::set &leaves) - : m_root(root) - , m_leaves(leaves) - , m_depth(-1) - , m_size(-1) - , m_nodeCount(-1) - , m_pcfgScore(0.0f) - { + : m_root(root) + , m_leaves(leaves) + , m_depth(-1) + , m_size(-1) + , m_nodeCount(-1) + , m_pcfgScore(0.0f) { m_depth = CalcDepth(m_root); m_size = CalcSize(m_root); m_nodeCount = CountNodes(m_root); m_pcfgScore = CalcPcfgScore(); } - const Node *GetRoot() const { return m_root; } - const std::set &GetLeaves() const { return m_leaves; } - int GetDepth() const { return m_depth; } - int GetSize() const { return m_size; } - int GetNodeCount() const { return m_nodeCount; } - float GetPcfgScore() const { return m_pcfgScore; } + const Node *GetRoot() const { + return m_root; + } + const std::set &GetLeaves() const { + return m_leaves; + } + int GetDepth() const { + return m_depth; + } + int GetSize() const { + return m_size; + } + int GetNodeCount() const { + return m_nodeCount; + } + float GetPcfgScore() const { + return m_pcfgScore; + } - bool IsTrivial() const { return m_leaves.empty(); } + bool IsTrivial() const { + return m_leaves.empty(); + } void GetTargetLeaves(std::vector &) const; - private: +private: void GetTargetLeaves(const Node *, std::vector &) const; int CalcDepth(const Node *) const; int CalcSize(const Node *) const; diff --git a/phrase-extract/extract-ghkm/XmlTreeParser.cpp b/phrase-extract/extract-ghkm/XmlTreeParser.cpp index 66024ff01..2f28c3244 100644 --- a/phrase-extract/extract-ghkm/XmlTreeParser.cpp +++ b/phrase-extract/extract-ghkm/XmlTreeParser.cpp @@ -1,17 +1,17 @@ /*********************************************************************** Moses - statistical machine translation system Copyright (C) 2006-2011 University of Edinburgh - + This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. - + This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. - + You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA @@ -29,13 +29,15 @@ using namespace MosesTraining; -namespace Moses { -namespace GHKM { +namespace Moses +{ +namespace GHKM +{ XmlTreeParser::XmlTreeParser(std::set &labelSet, std::map &topLabelSet) - : m_labelSet(labelSet) - , m_topLabelSet(topLabelSet) + : m_labelSet(labelSet) + , m_topLabelSet(topLabelSet) { } @@ -60,8 +62,8 @@ std::auto_ptr XmlTreeParser::Parse(const std::string &line) // Converts a SyntaxNode tree to a Moses::GHKM::ParseTree. 
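An illustrative aside, not part of the patch: driving the GHKM XmlTreeParser shown above. The element types of the two label collections are assumed to be std::set<std::string> and std::map<std::string, int>, and Parse is assumed to return std::auto_ptr<ParseTree>; this copy of the diff has lost those angle-bracketed arguments.

  #include <iostream>
  #include <map>
  #include <memory>
  #include <set>
  #include <string>

  std::set<std::string> labelSet;
  std::map<std::string, int> topLabelSet;
  Moses::GHKM::XmlTreeParser parser(labelSet, topLabelSet);
  std::string line;
  while (std::getline(std::cin, line)) {
    std::auto_ptr<Moses::GHKM::ParseTree> tree = parser.Parse(line);
    if (tree.get()) {
      // one parsed sentence tree per input line
    }
  }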
std::auto_ptr<ParseTree> XmlTreeParser::ConvertTree( - const SyntaxNode &tree, - const std::vector<std::string> &words) + const SyntaxNode &tree, + const std::vector<std::string> &words) { std::auto_ptr<ParseTree> root(new ParseTree(tree.GetLabel())); root->SetPcfgScore(tree.GetPcfgScore()); diff --git a/phrase-extract/extract-ghkm/XmlTreeParser.h b/phrase-extract/extract-ghkm/XmlTreeParser.h index 7b63ae1e4..d00fd7d9f 100644 --- a/phrase-extract/extract-ghkm/XmlTreeParser.h +++ b/phrase-extract/extract-ghkm/XmlTreeParser.h @@ -1,17 +1,17 @@ /*********************************************************************** Moses - statistical machine translation system Copyright (C) 2006-2011 University of Edinburgh - + This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. - + This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. - + You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA @@ -31,18 +31,21 @@ #include #include -namespace Moses { -namespace GHKM { +namespace Moses +{ +namespace GHKM +{ class ParseTree; // Parses a string in Moses' XML parse tree format and returns a ParseTree // object. -class XmlTreeParser { - public: +class XmlTreeParser +{ +public: XmlTreeParser(std::set<std::string> &, std::map<std::string, int> &); std::auto_ptr<ParseTree> Parse(const std::string &); - private: +private: std::auto_ptr<ParseTree> ConvertTree(const MosesTraining::SyntaxNode &, const std::vector<std::string> &); diff --git a/phrase-extract/extract-lex-main.cpp b/phrase-extract/extract-lex-main.cpp index a59450da8..f63015a6a 100644 --- a/phrase-extract/extract-lex-main.cpp +++ b/phrase-extract/extract-lex-main.cpp @@ -10,16 +10,16 @@ using namespace MosesTraining; float COUNT_INCR = 1; -void fix(std::ostream& stream) +void fix(std::ostream& stream) { - stream.setf(std::ios::fixed); - stream.precision(7); + stream.setf(std::ios::fixed); + stream.precision(7); } int main(int argc, char* argv[]) { cerr << "Starting...\n"; - + assert(argc == 6); char* &filePathTarget = argv[1]; char* &filePathSource = argv[2]; @@ -43,8 +43,7 @@ int main(int argc, char* argv[]) size_t lineCount = 0; string lineTarget, lineSource, lineAlign; - while (getline(streamTarget, lineTarget)) - { + while (getline(streamTarget, lineTarget)) { if (lineCount % 10000 == 0) cerr << lineCount << " "; @@ -52,7 +51,7 @@ int main(int argc, char* argv[]) assert(isSource); istream &isAlign = getline(streamAlign, lineAlign); assert(isAlign); - + vector<string> toksTarget, toksSource, toksAlign; Tokenize(toksTarget, lineTarget); Tokenize(toksSource, lineSource); @@ -61,13 +60,13 @@ int main(int argc, char* argv[]) /* cerr << endl << toksTarget.size() << " " << lineTarget << endl - << toksSource.size() << " " << lineSource << endl + << toksSource.size() << " " << lineSource << endl << toksAlign.size() << " " << lineAlign << endl; */ extractSingleton.Process(toksTarget, toksSource, toksAlign, lineCount); - - ++lineCount; + + ++lineCount; } extractSingleton.Output(streamLexS2T, streamLexT2S); @@ -86,35 +85,32 @@ namespace MosesTraining const std::string *Vocab::GetOrAdd(const std::string &word) { - const string *ret =
&(*m_coll.insert(word).first); + const string *ret = &(*m_coll.insert(word).first); return ret; } void ExtractLex::Process(vector &toksTarget, vector &toksSource, vector &toksAlign, size_t lineCount) { std::vector m_sourceAligned(toksSource.size(), false) - , m_targetAligned(toksTarget.size(), false); + , m_targetAligned(toksTarget.size(), false); vector::const_iterator iterAlign; - for (iterAlign = toksAlign.begin(); iterAlign != toksAlign.end(); ++iterAlign) - { + for (iterAlign = toksAlign.begin(); iterAlign != toksAlign.end(); ++iterAlign) { const string &alignTok = *iterAlign; - + vector alignPos; Tokenize(alignPos, alignTok, "-"); assert(alignPos.size() == 2); - if (alignPos[0] >= toksSource.size()) - { - cerr << "ERROR: alignment over source length. Alignment " << alignPos[0] << " at line " << lineCount << endl; - continue; - } - if (alignPos[1] >= toksTarget.size()) - { - cerr << "ERROR: alignment over target length. Alignment " << alignPos[1] << " at line " << lineCount << endl; - continue; - } - + if (alignPos[0] >= toksSource.size()) { + cerr << "ERROR: alignment over source length. Alignment " << alignPos[0] << " at line " << lineCount << endl; + continue; + } + if (alignPos[1] >= toksTarget.size()) { + cerr << "ERROR: alignment over target length. Alignment " << alignPos[1] << " at line " << lineCount << endl; + continue; + } + assert(alignPos[0] < toksSource.size()); assert(alignPos[1] < toksTarget.size()); @@ -123,12 +119,12 @@ void ExtractLex::Process(vector &toksTarget, vector &toksSource, const string &tmpSource = toksSource[ alignPos[0] ]; const string &tmpTarget = toksTarget[ alignPos[1] ]; - + const string *source = m_vocab.GetOrAdd(tmpSource); const string *target = m_vocab.GetOrAdd(tmpTarget); Process(target, source); - + } ProcessUnaligned(toksTarget, toksSource, m_sourceAligned, m_targetAligned); @@ -154,15 +150,13 @@ void ExtractLex::Process(WordCount &wcIn, const std::string *out) } void ExtractLex::ProcessUnaligned(vector &toksTarget, vector &toksSource - , const std::vector &m_sourceAligned, const std::vector &m_targetAligned) + , const std::vector &m_sourceAligned, const std::vector &m_targetAligned) { - const string *nullWord = m_vocab.GetOrAdd("NULL"); + const string *nullWord = m_vocab.GetOrAdd("NULL"); - for (size_t pos = 0; pos < m_sourceAligned.size(); ++pos) - { + for (size_t pos = 0; pos < m_sourceAligned.size(); ++pos) { bool isAlignedCurr = m_sourceAligned[pos]; - if (!isAlignedCurr) - { + if (!isAlignedCurr) { const string &tmpWord = toksSource[pos]; const string *sourceWord = m_vocab.GetOrAdd(tmpWord); @@ -170,11 +164,9 @@ void ExtractLex::ProcessUnaligned(vector &toksTarget, vector &to } } - for (size_t pos = 0; pos < m_targetAligned.size(); ++pos) - { + for (size_t pos = 0; pos < m_targetAligned.size(); ++pos) { bool isAlignedCurr = m_targetAligned[pos]; - if (!isAlignedCurr) - { + if (!isAlignedCurr) { const string &tmpWord = toksTarget[pos]; const string *targetWord = m_vocab.GetOrAdd(tmpWord); @@ -193,16 +185,14 @@ void ExtractLex::Output(std::ofstream &streamLexS2T, std::ofstream &streamLexT2S void ExtractLex::Output(const std::map &coll, std::ofstream &outStream) { std::map::const_iterator iterOuter; - for (iterOuter = coll.begin(); iterOuter != coll.end(); ++iterOuter) - { + for (iterOuter = coll.begin(); iterOuter != coll.end(); ++iterOuter) { const string &inStr = *iterOuter->first; const WordCount &inWC = iterOuter->second; const std::map &outColl = inWC.GetColl(); std::map::const_iterator iterInner; - for (iterInner = outColl.begin(); 
iterInner != outColl.end(); ++iterInner) - { + for (iterInner = outColl.begin(); iterInner != outColl.end(); ++iterInner) { const string &outStr = *iterInner->first; const WordCount &outWC = iterInner->second; diff --git a/phrase-extract/extract-lex.h b/phrase-extract/extract-lex.h index d272cf6ff..d79038fc6 100644 --- a/phrase-extract/extract-lex.h +++ b/phrase-extract/extract-lex.h @@ -14,10 +14,10 @@ namespace MosesTraining template inline T Scan(const std::string &input) { - std::stringstream stream(input); - T ret; - stream >> ret; - return ret; + std::stringstream stream(input); + T ret; + stream >> ret; + return ret; } @@ -25,13 +25,12 @@ inline T Scan(const std::string &input) template inline void Scan(std::vector &output, const std::vector< std::string > &input) { - output.resize(input.size()); - for (size_t i = 0 ; i < input.size() ; i++) - { - output[i] = Scan( input[i] ); - } + output.resize(input.size()); + for (size_t i = 0 ; i < input.size() ; i++) { + output[i] = Scan( input[i] ); + } } - + inline void Tokenize(std::vector &output , const std::string& str @@ -55,17 +54,17 @@ inline void Tokenize(std::vector &output // speeded up version of above template inline void Tokenize( std::vector &output - , const std::string &input - , const std::string& delimiters = " \t") + , const std::string &input + , const std::string& delimiters = " \t") { - std::vector stringVector; - Tokenize(stringVector, input, delimiters); - return Scan(output, stringVector ); + std::vector stringVector; + Tokenize(stringVector, input, delimiters); + return Scan(output, stringVector ); } class WordCount { - friend std::ostream& operator<<(std::ostream&, const WordCount&); + friend std::ostream& operator<<(std::ostream&, const WordCount&); public: float m_count; @@ -83,13 +82,16 @@ public: void AddCount(float incr); - std::map &GetColl() - { return m_coll; } - const std::map &GetColl() const - { return m_coll; } + std::map &GetColl() { + return m_coll; + } + const std::map &GetColl() const { + return m_coll; + } - const float GetCount() const - { return m_count; } + const float GetCount() const { + return m_count; + } }; diff --git a/phrase-extract/extract-main.cpp b/phrase-extract/extract-main.cpp index cab91e92d..a8edb298a 100644 --- a/phrase-extract/extract-main.cpp +++ b/phrase-extract/extract-main.cpp @@ -29,7 +29,8 @@ using namespace std; using namespace MosesTraining; -namespace MosesTraining { +namespace MosesTraining +{ const long int LINE_MAX_LENGTH = 500000 ; @@ -49,37 +50,38 @@ typedef vector < HPhrase > HPhraseVector; // The key of the map is the English index and the value is a set of the source ones typedef map > HSentenceVertices; - REO_POS getOrientWordModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool, +REO_POS getOrientWordModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool, int, int, int, int, int, int, int, bool (*)(int, int), bool (*)(int, int)); - REO_POS getOrientPhraseModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool, +REO_POS getOrientPhraseModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool, int, int, int, int, int, int, int, bool (*)(int, int), bool (*)(int, int), const HSentenceVertices &, const HSentenceVertices &); - REO_POS getOrientHierModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool, +REO_POS getOrientHierModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool, int, int, int, int, int, int, int, bool (*)(int, int), bool (*)(int, int), const HSentenceVertices &, const HSentenceVertices &, const HSentenceVertices &, const HSentenceVertices &, REO_POS); - void 
insertVertex(HSentenceVertices &, int, int); - void insertPhraseVertices(HSentenceVertices &, HSentenceVertices &, HSentenceVertices &, HSentenceVertices &, +void insertVertex(HSentenceVertices &, int, int); +void insertPhraseVertices(HSentenceVertices &, HSentenceVertices &, HSentenceVertices &, HSentenceVertices &, int, int, int, int); - string getOrientString(REO_POS, REO_MODEL_TYPE); +string getOrientString(REO_POS, REO_MODEL_TYPE); - bool ge(int, int); - bool le(int, int); - bool lt(int, int); +bool ge(int, int); +bool le(int, int); +bool lt(int, int); - bool isAligned (SentenceAlignment &, int, int); - int sentenceOffset = 0; +bool isAligned (SentenceAlignment &, int, int); +int sentenceOffset = 0; } -namespace MosesTraining{ +namespace MosesTraining +{ -class ExtractTask +class ExtractTask { public: ExtractTask(size_t id, SentenceAlignment &sentence,PhraseExtractionOptions &initoptions, Moses::OutputFileStream &extractFile, Moses::OutputFileStream &extractFileInv,Moses::OutputFileStream &extractFileOrientation): @@ -87,8 +89,8 @@ public: m_options(initoptions), m_extractFile(extractFile), m_extractFileInv(extractFileInv), - m_extractFileOrientation(extractFileOrientation){} -void Run(); + m_extractFileOrientation(extractFileOrientation) {} + void Run(); private: vector< string > m_extractedPhrases; vector< string > m_extractedPhrasesInv; @@ -98,7 +100,7 @@ private: void extract(SentenceAlignment &); void addPhrase(SentenceAlignment &, int, int, int, int, string &); void writePhrasesToFile(); - + SentenceAlignment &m_sentence; const PhraseExtractionOptions &m_options; Moses::OutputFileStream &m_extractFile; @@ -112,7 +114,7 @@ int main(int argc, char* argv[]) cerr << "PhraseExtract v1.4, written by Philipp Koehn\n" << "phrase extraction from an aligned parallel corpus\n"; - if (argc < 6) { + if (argc < 6) { cerr << "syntax: extract en de align extract max-length [orientation [ --model [wbe|phrase|hier]-[msd|mslr|mono] ] "; cerr<<"| --OnlyOutputSpanInfo | --NoTTable | --GZOutput | --IncludeSentenceId | --SentenceOffset n | --InstanceWeights filename ]\n"; exit(1); @@ -135,7 +137,7 @@ int main(int argc, char* argv[]) } else if (strcmp(argv[i],"--NoTTable") == 0) { options.initTranslationFlag(false); } else if (strcmp(argv[i], "--IncludeSentenceId") == 0) { - options.initIncludeSentenceIdFlag(true); + options.initIncludeSentenceIdFlag(true); } else if (strcmp(argv[i], "--SentenceOffset") == 0) { if (i+1 >= argc || argv[i+1][0] < '0' || argv[i+1][0] > '9') { cerr << "extract: syntax error, used switch --SentenceOffset without a number" << endl; @@ -143,7 +145,7 @@ int main(int argc, char* argv[]) } sentenceOffset = atoi(argv[++i]); } else if (strcmp(argv[i], "--GZOutput") == 0) { - options.initGzOutput(true); + options.initGzOutput(true); } else if (strcmp(argv[i], "--InstanceWeights") == 0) { if (i+1 >= argc) { cerr << "extract: syntax error, used switch --InstanceWeights without file name" << endl; @@ -260,7 +262,7 @@ int main(int argc, char* argv[]) SAFE_GETLINE((*iwFileP), weightString, LINE_MAX_LENGTH, '\n', __FILE__); } SentenceAlignment sentence; - // cout << "read in: " << englishString << " & " << foreignString << " & " << alignmentString << endl; + // cout << "read in: " << englishString << " & " << foreignString << " & " << alignmentString << endl; //az: output src, tgt, and alingment line if (options.isOnlyOutputSpanInfo()) { cout << "LOG: SRC: " << foreignString << endl; @@ -268,8 +270,8 @@ int main(int argc, char* argv[]) cout << "LOG: ALT: " << alignmentString << endl; 
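An illustrative aside, not part of the patch: the templated Tokenize/Scan pair in extract-lex.h above is what turns alignment tokens such as "3-5" into numeric index pairs, which ExtractLex::Process then bounds-checks against the source and target sentence lengths. The element type of alignPos was stripped from this copy of the diff; size_t is assumed:

  #include <cassert>
  #include <string>
  #include <vector>

  std::vector<size_t> alignPos;
  MosesTraining::Tokenize(alignPos, std::string("3-5"), "-");  // split, then Scan<size_t> each piece
  assert(alignPos.size() == 2);
  assert(alignPos[0] == 3 && alignPos[1] == 5);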
cout << "LOG: PHRASES_BEGIN:" << endl; } - if (sentence.create( englishString, foreignString, alignmentString, weightString, i, false)) { - ExtractTask *task = new ExtractTask(i-1, sentence, options, extractFile , extractFileInv, extractFileOrientation); + if (sentence.create( englishString, foreignString, alignmentString, weightString, i, false)) { + ExtractTask *task = new ExtractTask(i-1, sentence, options, extractFile , extractFileInv, extractFileOrientation); task->Run(); delete task; @@ -286,17 +288,18 @@ int main(int argc, char* argv[]) if (options.isTranslationFlag()) { extractFile.Close(); extractFileInv.Close(); - + + } + if (options.isOrientationFlag()) { + extractFileOrientation.Close(); } - if (options.isOrientationFlag()){ - extractFileOrientation.Close(); - } } } namespace MosesTraining { -void ExtractTask::Run() { +void ExtractTask::Run() +{ extract(m_sentence); writePhrasesToFile(); m_extractedPhrases.clear(); @@ -665,16 +668,16 @@ void ExtractTask::addPhrase( SentenceAlignment &sentence, int startE, int endE, { // source // // cout << "adding ( " << startF << "-" << endF << ", " << startE << "-" << endE << ")\n"; - ostringstream outextractstr; - ostringstream outextractstrInv; - ostringstream outextractstrOrientation; + ostringstream outextractstr; + ostringstream outextractstrInv; + ostringstream outextractstrOrientation; if (m_options.isOnlyOutputSpanInfo()) { cout << startF << " " << endF << " " << startE << " " << endE << endl; return; } -for(int fi=startF; fi<=endF; fi++) { + for(int fi=startF; fi<=endF; fi++) { if (m_options.isTranslationFlag()) outextractstr << sentence.source[fi] << " "; if (m_options.isOrientationFlag()) outextractstrOrientation << sentence.source[fi] << " "; } @@ -693,13 +696,13 @@ for(int fi=startF; fi<=endF; fi++) { // source (for inverse) - if (m_options.isTranslationFlag()) { + if (m_options.isTranslationFlag()) { for(int fi=startF; fi<=endF; fi++) outextractstrInv << sentence.source[fi] << " "; outextractstrInv << "|||"; } // alignment - if (m_options.isTranslationFlag()) { + if (m_options.isTranslationFlag()) { for(int ei=startE; ei<=endE; ei++) { for(unsigned int i=0; i::const_iterator phrase=m_extractedPhrases.begin();phrase!=m_extractedPhrases.end();phrase++){ - outextractFile<data(); - } - for(vector::const_iterator phrase=m_extractedPhrasesInv.begin();phrase!=m_extractedPhrasesInv.end();phrase++){ - outextractFileInv<data(); - } - for(vector::const_iterator phrase=m_extractedPhrasesOri.begin();phrase!=m_extractedPhrasesOri.end();phrase++){ - outextractFileOrientation<data(); - } + for(vector::const_iterator phrase=m_extractedPhrases.begin(); phrase!=m_extractedPhrases.end(); phrase++) { + outextractFile<data(); + } + for(vector::const_iterator phrase=m_extractedPhrasesInv.begin(); phrase!=m_extractedPhrasesInv.end(); phrase++) { + outextractFileInv<data(); + } + for(vector::const_iterator phrase=m_extractedPhrasesOri.begin(); phrase!=m_extractedPhrasesOri.end(); phrase++) { + outextractFileOrientation<data(); + } - m_extractFile << outextractFile.str(); - m_extractFileInv << outextractFileInv.str(); - m_extractFileOrientation << outextractFileOrientation.str(); + m_extractFile << outextractFile.str(); + m_extractFileInv << outextractFileInv.str(); + m_extractFileOrientation << outextractFileOrientation.str(); } // if proper conditioning, we need the number of times a source phrase occured void ExtractTask::extractBase( SentenceAlignment &sentence ) { - ostringstream outextractFile; - ostringstream outextractFileInv; + ostringstream 
outextractFile; + ostringstream outextractFileInv; int countF = sentence.source.size(); for(int startF=0; startF LabelIndex; typedef map< int, int > WordIndex; -class ExtractTask +class ExtractTask { private: SentenceAlignmentWithSyntax &m_sentence; @@ -64,31 +64,30 @@ private: Moses::OutputFileStream& m_extractFileInv; vector< ExtractedRule > m_extractedRules; - + // main functions void extractRules(); void addRuleToCollection(ExtractedRule &rule); void consolidateRules(); void writeRulesToFile(); - + // subs void addRule( int, int, int, int, int, RuleExist &ruleExist); void addHieroRule( int startT, int endT, int startS, int endS - , RuleExist &ruleExist, HoleCollection &holeColl, int numHoles, int initStartF, int wordCountT, int wordCountS); + , RuleExist &ruleExist, HoleCollection &holeColl, int numHoles, int initStartF, int wordCountT, int wordCountS); void saveHieroPhrase( int startT, int endT, int startS, int endS , HoleCollection &holeColl, LabelIndex &labelIndex, int countS); string saveTargetHieroPhrase( int startT, int endT, int startS, int endS - , WordIndex &indexT, HoleCollection &holeColl, const LabelIndex &labelIndex, double &logPCFGScore, int countS); + , WordIndex &indexT, HoleCollection &holeColl, const LabelIndex &labelIndex, double &logPCFGScore, int countS); string saveSourceHieroPhrase( int startT, int endT, int startS, int endS , HoleCollection &holeColl, const LabelIndex &labelIndex); void preprocessSourceHieroPhrase( int startT, int endT, int startS, int endS - , WordIndex &indexS, HoleCollection &holeColl, const LabelIndex &labelIndex); + , WordIndex &indexS, HoleCollection &holeColl, const LabelIndex &labelIndex); void saveHieroAlignment( int startT, int endT, int startS, int endS - , const WordIndex &indexS, const WordIndex &indexT, HoleCollection &holeColl, ExtractedRule &rule); + , const WordIndex &indexS, const WordIndex &indexT, HoleCollection &holeColl, ExtractedRule &rule); void saveAllHieroPhrases( int startT, int endT, int startS, int endS, HoleCollection &holeColl, int countS); - - inline string IntToString( int i ) - { + + inline string IntToString( int i ) { stringstream out; out << i; return out.str(); @@ -123,7 +122,7 @@ int main(int argc, char* argv[]) if (argc < 5) { cerr << "syntax: extract-rules corpus.target corpus.source corpus.align extract [" - << " --GlueGrammar FILE" + << " --GlueGrammar FILE" << " | --UnknownWordLabel FILE" << " | --OnlyDirect" << " | --OutputNTLengths" @@ -139,8 +138,8 @@ int main(int argc, char* argv[]) << " | --AllowOnlyUnalignedWords | --DisallowNonTermConsecTarget |--NonTermConsecSource | --NoNonTermFirstWord | --NoFractionalCounting" << " | --UnpairedExtractFormat" << " | --ConditionOnTargetLHS ]" - << " | --BoundaryRules[" << options.boundaryRules << "]"; - + << " | --BoundaryRules[" << options.boundaryRules << "]"; + exit(1); } char* &fileNameT = argv[1]; @@ -212,10 +211,9 @@ int main(int argc, char* argv[]) cerr << "extract error: --MaxScope should be at least 0" << endl; exit(1); } + } else if (strcmp(argv[i], "--GZOutput") == 0) { + options.gzOutput = true; } - else if (strcmp(argv[i], "--GZOutput") == 0) { - options.gzOutput = true; - } // allow consecutive non-terminals (X Y | X Y) else if (strcmp(argv[i],"--TargetSyntax") == 0) { options.targetSyntax = true; @@ -265,7 +263,7 @@ int main(int argc, char* argv[]) options.unpairedExtractFormat = true; } else if (strcmp(argv[i],"--ConditionOnTargetLHS") == 0) { options.conditionOnTargetLhs = true; - } else if (strcmp(argv[i],"-threads") == 0 || + } else if 
(strcmp(argv[i],"-threads") == 0 || strcmp(argv[i],"--threads") == 0 || strcmp(argv[i],"--Threads") == 0) { #ifdef WITH_THREADS @@ -327,8 +325,8 @@ int main(int argc, char* argv[]) SAFE_GETLINE((*aFileP), alignmentString, LINE_MAX_LENGTH, '\n', __FILE__); SentenceAlignmentWithSyntax sentence - (targetLabelCollection, sourceLabelCollection, - targetTopLabelCollection, sourceTopLabelCollection, options); + (targetLabelCollection, sourceLabelCollection, + targetTopLabelCollection, sourceTopLabelCollection, options); //az: output src, tgt, and alingment line if (options.onlyOutputSpanInfo) { cout << "LOG: SRC: " << sourceString << endl; @@ -364,7 +362,8 @@ int main(int argc, char* argv[]) writeUnknownWordLabel(fileNameUnknownWordLabel); } -void ExtractTask::Run() { +void ExtractTask::Run() +{ extractRules(); consolidateRules(); writeRulesToFile(); @@ -471,7 +470,7 @@ void ExtractTask::extractRules() } void ExtractTask::preprocessSourceHieroPhrase( int startT, int endT, int startS, int endS - , WordIndex &indexS, HoleCollection &holeColl, const LabelIndex &labelIndex) + , WordIndex &indexS, HoleCollection &holeColl, const LabelIndex &labelIndex) { vector::iterator iterHoleList = holeColl.GetSortedSourceHoles().begin(); assert(iterHoleList != holeColl.GetSortedSourceHoles().end()); @@ -509,8 +508,8 @@ void ExtractTask::preprocessSourceHieroPhrase( int startT, int endT, int startS, } string ExtractTask::saveTargetHieroPhrase( int startT, int endT, int startS, int endS - , WordIndex &indexT, HoleCollection &holeColl, const LabelIndex &labelIndex, double &logPCFGScore - , int countS) + , WordIndex &indexT, HoleCollection &holeColl, const LabelIndex &labelIndex, double &logPCFGScore + , int countS) { HoleList::iterator iterHoleList = holeColl.GetHoles().begin(); assert(iterHoleList != holeColl.GetHoles().end()); @@ -536,11 +535,11 @@ string ExtractTask::saveTargetHieroPhrase( int startT, int endT, int startS, int if (m_options.targetSyntax) { targetLabel = m_sentence.targetTree.GetNodes(currPos,hole.GetEnd(1))[labelI]->GetLabel(); } else if (m_options.boundaryRules && (startS == 0 || endS == countS - 1)) { - targetLabel = "S"; + targetLabel = "S"; } else { targetLabel = "X"; } - + hole.SetLabel(targetLabel, 1); if (m_options.unpairedExtractFormat) { @@ -571,7 +570,7 @@ string ExtractTask::saveTargetHieroPhrase( int startT, int endT, int startS, int } string ExtractTask::saveSourceHieroPhrase( int startT, int endT, int startS, int endS - , HoleCollection &holeColl, const LabelIndex &labelIndex) + , HoleCollection &holeColl, const LabelIndex &labelIndex) { vector::iterator iterHoleList = holeColl.GetSortedSourceHoles().begin(); assert(iterHoleList != holeColl.GetSortedSourceHoles().end()); @@ -615,7 +614,7 @@ string ExtractTask::saveSourceHieroPhrase( int startT, int endT, int startS, int } void ExtractTask::saveHieroAlignment( int startT, int endT, int startS, int endS - , const WordIndex &indexS, const WordIndex &indexT, HoleCollection &holeColl, ExtractedRule &rule) + , const WordIndex &indexS, const WordIndex &indexT, HoleCollection &holeColl, ExtractedRule &rule) { // print alignment of words for(int ti=startT; ti<=endT; ti++) { @@ -636,13 +635,13 @@ void ExtractTask::saveHieroAlignment( int startT, int endT, int startS, int endS HoleList::const_iterator iterHole; for (iterHole = holeColl.GetHoles().begin(); iterHole != holeColl.GetHoles().end(); ++iterHole) { const Hole &hole = *iterHole; - + std::string sourceSymbolIndex = IntToString(hole.GetPos(0)); std::string targetSymbolIndex = 
IntToString(hole.GetPos(1)); rule.alignment += sourceSymbolIndex + "-" + targetSymbolIndex + " "; if (!m_options.onlyDirectFlag) rule.alignmentInv += targetSymbolIndex + "-" + sourceSymbolIndex + " "; - + rule.SetSpanLength(hole.GetPos(0), hole.GetSize(0), hole.GetSize(1) ) ; } @@ -654,7 +653,7 @@ void ExtractTask::saveHieroAlignment( int startT, int endT, int startS, int endS } void ExtractTask::saveHieroPhrase( int startT, int endT, int startS, int endS - , HoleCollection &holeColl, LabelIndex &labelIndex, int countS) + , HoleCollection &holeColl, LabelIndex &labelIndex, int countS) { WordIndex indexS, indexT; // to keep track of word positions in rule @@ -680,12 +679,12 @@ void ExtractTask::saveHieroPhrase( int startT, int endT, int startS, int endS if (m_options.pcfgScore) { double logPCFGScore = m_sentence.targetTree.GetNodes(startT,endT)[labelIndex[0]]->GetPcfgScore(); rule.target = saveTargetHieroPhrase(startT, endT, startS, endS, indexT, holeColl, labelIndex, logPCFGScore, countS) - + " [" + targetLabel + "]"; + + " [" + targetLabel + "]"; rule.pcfgScore = std::exp(logPCFGScore); } else { double logPCFGScore = 0.0f; rule.target = saveTargetHieroPhrase(startT, endT, startS, endS, indexT, holeColl, labelIndex, logPCFGScore, countS) - + " [" + targetLabel + "]"; + + " [" + targetLabel + "]"; } // source @@ -754,8 +753,8 @@ void ExtractTask::saveAllHieroPhrases( int startT, int endT, int startS, int end // this function is called recursively // it pokes a new hole into the phrase pair, and then calls itself for more holes void ExtractTask::addHieroRule( int startT, int endT, int startS, int endS - , RuleExist &ruleExist, HoleCollection &holeColl - , int numHoles, int initStartT, int wordCountT, int wordCountS) + , RuleExist &ruleExist, HoleCollection &holeColl + , int numHoles, int initStartT, int wordCountT, int wordCountS) { // done, if already the maximum number of non-terminals in phrase pair if (numHoles >= m_options.maxNonTerm) @@ -862,7 +861,7 @@ void ExtractTask::addHieroRule( int startT, int endT, int startS, int endS allowablePhrase = false; // passed all checks... - if (allowablePhrase) + if (allowablePhrase) saveAllHieroPhrases(startT, endT, startS, endS, holeColl, wordCountS); // recursively search for next hole @@ -880,12 +879,12 @@ void ExtractTask::addHieroRule( int startT, int endT, int startS, int endS void ExtractTask::addRule( int startT, int endT, int startS, int endS, int countS, RuleExist &ruleExist) { // contains only or . Don't output - if (m_options.boundaryRules - && ( (startS == 0 && endS == 0) - || (startS == countS-1 && endS == countS-1))) { + if (m_options.boundaryRules + && ( (startS == 0 && endS == 0) + || (startS == countS-1 && endS == countS-1))) { return; } - + if (m_options.onlyOutputSpanInfo) { cout << startS << " " << endS << " " << startT << " " << endT << endl; return; @@ -897,11 +896,10 @@ void ExtractTask::addRule( int startT, int endT, int startS, int endS, int count string targetLabel,sourceLabel; if (m_options.targetSyntax && m_options.conditionOnTargetLhs) { sourceLabel = targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[0]->GetLabel(); - } - else { + } else { sourceLabel = m_options.sourceSyntax ? 
m_sentence.sourceTree.GetNodes(startS,endS)[0]->GetLabel() : "X"; - + if (m_options.targetSyntax) { targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[0]->GetLabel(); } else if (m_options.boundaryRules && (startS == 0 || endS == countS - 1)) { @@ -1008,7 +1006,7 @@ void ExtractTask::writeRulesToFile() << rule->alignment << " ||| " << rule->count << " ||| "; if (m_options.outputNTLengths) { - rule->OutputNTLengths(out); + rule->OutputNTLengths(out); } if (m_options.pcfgScore) { out << " ||| " << rule->pcfgScore; diff --git a/phrase-extract/lexical-reordering/reordering_classes.cpp b/phrase-extract/lexical-reordering/reordering_classes.cpp index e5b3fe7cd..8c5163f9b 100644 --- a/phrase-extract/lexical-reordering/reordering_classes.cpp +++ b/phrase-extract/lexical-reordering/reordering_classes.cpp @@ -57,7 +57,7 @@ void ModelScore::reset_f() } void ModelScore::add_example - (const StringPiece& previous, const StringPiece& next, float weight) +(const StringPiece& previous, const StringPiece& next, float weight) { count_fe_prev[getType(previous)]+=weight; count_f_prev[getType(previous)]+=weight; diff --git a/phrase-extract/lexical-reordering/score.cpp b/phrase-extract/lexical-reordering/score.cpp index 545abf303..d404822b8 100644 --- a/phrase-extract/lexical-reordering/score.cpp +++ b/phrase-extract/lexical-reordering/score.cpp @@ -29,11 +29,11 @@ void get_orientations(const StringPiece& pair, StringPiece& previous, StringPiec class FileFormatException : public util::Exception { - public: - FileFormatException() throw() { - *this << "Invalid extract file format: "; - } - ~FileFormatException() throw() {} +public: + FileFormatException() throw() { + *this << "Invalid extract file format: "; + } + ~FileFormatException() throw() {} }; int main(int argc, char* argv[]) @@ -214,9 +214,10 @@ int main(int argc, char* argv[]) } template StringPiece -GrabOrDie(It &it, const StringPiece& line) { - UTIL_THROW_IF(!it, FileFormatException, line.as_string()); - return *it++; +GrabOrDie(It &it, const StringPiece& line) +{ + UTIL_THROW_IF(!it, FileFormatException, line.as_string()); + return *it++; } @@ -236,12 +237,12 @@ void split_line( | phrase | hier | phrase | hier ||| weight */ - + util::TokenIter pipes(line, util::MultiCharacter(" ||| ")); foreign = GrabOrDie(pipes,line); english = GrabOrDie(pipes,line); StringPiece next = GrabOrDie(pipes,line); - + util::TokenIter singlePipe(next, util::MultiCharacter(" | ")); wbe = GrabOrDie(singlePipe,line); if (singlePipe) { diff --git a/phrase-extract/pcfg-common/exception.h b/phrase-extract/pcfg-common/exception.h index 3dbd59d0e..d9266ca36 100644 --- a/phrase-extract/pcfg-common/exception.h +++ b/phrase-extract/pcfg-common/exception.h @@ -1,17 +1,17 @@ /*********************************************************************** Moses - statistical machine translation system Copyright (C) 2006-2012 University of Edinburgh - + This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. - + This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
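An illustrative aside, not part of the patch: GrabOrDie above is the error-checked field reader used by split_line; a missing field raises FileFormatException through UTIL_THROW_IF. The TokenIter template argument was stripped from this copy of the diff; util::MultiCharacter is assumed, matching the delimiter object it is constructed with:

  StringPiece line("der Hund ||| the dog ||| mono mono mono");
  util::TokenIter<util::MultiCharacter> pipes(line, util::MultiCharacter(" ||| "));
  StringPiece foreign = GrabOrDie(pipes, line);  // "der Hund"
  StringPiece english = GrabOrDie(pipes, line);  // "the dog"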
- + You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA @@ -23,15 +23,20 @@ #include <string> -namespace Moses { -namespace PCFG { +namespace Moses +{ +namespace PCFG +{ -class Exception { - public: +class Exception +{ +public: Exception(const char *msg) : msg_(msg) {} Exception(const std::string &msg) : msg_(msg) {} - const std::string &msg() const { return msg_; } - private: + const std::string &msg() const { + return msg_; + } +private: std::string msg_; }; diff --git a/phrase-extract/pcfg-common/numbered_set.h b/phrase-extract/pcfg-common/numbered_set.h index 15e768b4c..66e960404 100644 --- a/phrase-extract/pcfg-common/numbered_set.h +++ b/phrase-extract/pcfg-common/numbered_set.h @@ -1,17 +1,17 @@ /*********************************************************************** Moses - statistical machine translation system Copyright (C) 2006-2012 University of Edinburgh - + This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. - + This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. - + You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA @@ -29,32 +29,45 @@ #include #include -namespace Moses { -namespace PCFG { +namespace Moses +{ +namespace PCFG +{ // Stores a set of elements of type T, each of which is allocated an integral // ID of type I. IDs are contiguous starting at 0. Individual elements cannot // be removed once inserted (but the whole set can be cleared). template<typename T, typename I=std::size_t> -class NumberedSet { - private: +class NumberedSet +{ +private: typedef boost::unordered_map<T, I> ElementToIdMap; typedef std::vector<const T *> IdToElementMap; - public: +public: typedef I IdType; typedef typename IdToElementMap::const_iterator const_iterator; NumberedSet() {} - const_iterator begin() const { return id_to_element_.begin(); } - const_iterator end() const { return id_to_element_.end(); } + const_iterator begin() const { + return id_to_element_.begin(); + } + const_iterator end() const { + return id_to_element_.end(); + } // Static value - static I NullId() { return std::numeric_limits<I>::max(); } + static I NullId() { + return std::numeric_limits<I>::max(); + } - bool Empty() const { return id_to_element_.empty(); } - std::size_t Size() const { return id_to_element_.size(); } + bool Empty() const { + return id_to_element_.empty(); + } + std::size_t Size() const { + return id_to_element_.size(); + } // Insert the given object and return its ID. I Insert(const T &); @@ -64,19 +77,21 @@ class NumberedSet { void Clear(); - private: +private: ElementToIdMap element_to_id_; IdToElementMap id_to_element_; }; template<typename T, typename I> -I NumberedSet<T, I>::Lookup(const T &s) const { +I NumberedSet<T, I>::Lookup(const T &s) const +{ typename ElementToIdMap::const_iterator p = element_to_id_.find(s); return (p == element_to_id_.end()) ?
NullId() : p->second; } template -const T &NumberedSet::Lookup(I id) const { +const T &NumberedSet::Lookup(I id) const +{ if (id < 0 || id >= id_to_element_.size()) { std::ostringstream msg; msg << "Value not found: " << id; @@ -86,10 +101,11 @@ const T &NumberedSet::Lookup(I id) const { } template -I NumberedSet::Insert(const T &x) { +I NumberedSet::Insert(const T &x) +{ std::pair value(x, id_to_element_.size()); std::pair result = - element_to_id_.insert(value); + element_to_id_.insert(value); if (result.second) { // x is a new element. id_to_element_.push_back(&result.first->first); @@ -98,7 +114,8 @@ I NumberedSet::Insert(const T &x) { } template -void NumberedSet::Clear() { +void NumberedSet::Clear() +{ element_to_id_.clear(); id_to_element_.clear(); } diff --git a/phrase-extract/pcfg-common/pcfg.h b/phrase-extract/pcfg-common/pcfg.h index b87336584..5398cd97e 100644 --- a/phrase-extract/pcfg-common/pcfg.h +++ b/phrase-extract/pcfg-common/pcfg.h @@ -1,17 +1,17 @@ /*********************************************************************** Moses - statistical machine translation system Copyright (C) 2006-2012 University of Edinburgh - + This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. - + This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. - + You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA @@ -28,11 +28,14 @@ #include #include -namespace Moses { -namespace PCFG { +namespace Moses +{ +namespace PCFG +{ -class Pcfg { - public: +class Pcfg +{ +public: typedef std::vector Key; typedef std::map Map; typedef Map::iterator iterator; @@ -40,18 +43,26 @@ class Pcfg { Pcfg() {} - iterator begin() { return rules_.begin(); } - const_iterator begin() const { return rules_.begin(); } + iterator begin() { + return rules_.begin(); + } + const_iterator begin() const { + return rules_.begin(); + } - iterator end() { return rules_.end(); } - const_iterator end() const { return rules_.end(); } + iterator end() { + return rules_.end(); + } + const_iterator end() const { + return rules_.end(); + } void Add(const Key &, double); bool Lookup(const Key &, double &) const; void Read(std::istream &, Vocabulary &); void Write(const Vocabulary &, std::ostream &) const; - private: +private: Map rules_; }; diff --git a/phrase-extract/pcfg-common/pcfg_tree.h b/phrase-extract/pcfg-common/pcfg_tree.h index bdac64dfc..d125cad16 100644 --- a/phrase-extract/pcfg-common/pcfg_tree.h +++ b/phrase-extract/pcfg-common/pcfg_tree.h @@ -1,17 +1,17 @@ /*********************************************************************** Moses - statistical machine translation system Copyright (C) 2006-2012 University of Edinburgh - + This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. 
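An illustrative aside, not part of the patch: a round trip through NumberedSet. The stripped template parameter lists are assumed to be <T, I> with I defaulting to std::size_t, so the IDs of a string set are contiguous size_t values starting at 0:

  #include <cassert>
  #include <string>

  Moses::PCFG::NumberedSet<std::string> ids;
  std::size_t id = ids.Insert("NP");   // first element gets ID 0
  assert(ids.Insert("NP") == id);      // re-inserting returns the existing ID
  assert(ids.Lookup(id) == "NP");      // ID -> element
  assert(ids.Lookup("VP") == Moses::PCFG::NumberedSet<std::string>::NullId());  // absent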
- + This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. - + You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA @@ -26,34 +26,43 @@ #include -namespace Moses { -namespace PCFG { +namespace Moses +{ +namespace PCFG +{ template -class PcfgTreeBase : public SyntaxTreeBase { - public: +class PcfgTreeBase : public SyntaxTreeBase +{ +public: typedef std::string LabelType; typedef SyntaxTreeBase BaseType; PcfgTreeBase(const LabelType &label) : BaseType(label), score_(0.0) {} - double score() const { return score_; } - void set_score(double s) { score_ = s; } + double score() const { + return score_; + } + void set_score(double s) { + score_ = s; + } - private: +private: double score_; }; -class PcfgTree : public PcfgTreeBase { - public: +class PcfgTree : public PcfgTreeBase +{ +public: typedef PcfgTreeBase BaseType; PcfgTree(const BaseType::LabelType &label) : BaseType(label) {} }; // Specialise XmlOutputHandler for PcfgTree. template<> -class XmlOutputHandler { - public: +class XmlOutputHandler +{ +public: typedef std::map AttributeMap; void GetLabel(const PcfgTree &tree, std::string &label) const { diff --git a/phrase-extract/pcfg-common/syntax_tree.h b/phrase-extract/pcfg-common/syntax_tree.h index 89c6ec0c3..93d9dbec9 100644 --- a/phrase-extract/pcfg-common/syntax_tree.h +++ b/phrase-extract/pcfg-common/syntax_tree.h @@ -1,17 +1,17 @@ /*********************************************************************** Moses - statistical machine translation system Copyright (C) 2006-2012 University of Edinburgh - + This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. - + This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. - + You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA @@ -24,62 +24,87 @@ #include #include -namespace Moses { -namespace PCFG { +namespace Moses +{ +namespace PCFG +{ // Base class for SyntaxTree, AgreementTree, and friends. 
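An illustrative aside, not part of the patch: Pcfg above is a map from rule keys to scores. The Key element type was stripped from this copy of the diff; a vector of vocabulary IDs, LHS first and RHS symbols following, is assumed, matching how pcfg-extract builds its keys:

  Moses::PCFG::Pcfg pcfg;
  Moses::PCFG::Pcfg::Key key;
  key.push_back(0);    // LHS ID, e.g. "S"
  key.push_back(1);    // RHS IDs, e.g. "NP"
  key.push_back(2);    //          ... "VP"
  pcfg.Add(key, 0.5);
  double score;
  if (pcfg.Lookup(key, score)) {
    // score is now 0.5
  }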
template -class SyntaxTreeBase { - public: +class SyntaxTreeBase +{ +public: // Constructors SyntaxTreeBase(const T &label) - : label_(label) - , children_() - , parent_(0) {} + : label_(label) + , children_() + , parent_(0) {} SyntaxTreeBase(const T &label, const std::vector &children) - : label_(label) - , children_(children) - , parent_(0) {} + : label_(label) + , children_(children) + , parent_(0) {} // Destructor virtual ~SyntaxTreeBase(); - const T &label() const { return label_; } - const DerivedType *parent() const { return parent_; } - DerivedType *parent() { return parent_; } - const std::vector &children() const { return children_; } - std::vector &children() { return children_; } + const T &label() const { + return label_; + } + const DerivedType *parent() const { + return parent_; + } + DerivedType *parent() { + return parent_; + } + const std::vector &children() const { + return children_; + } + std::vector &children() { + return children_; + } - void set_label(const T &label) { label_ = label; } - void set_parent(DerivedType *parent) { parent_ = parent; } - void set_children(const std::vector &c) { children_ = c; } + void set_label(const T &label) { + label_ = label; + } + void set_parent(DerivedType *parent) { + parent_ = parent; + } + void set_children(const std::vector &c) { + children_ = c; + } - bool IsLeaf() const { return children_.empty(); } + bool IsLeaf() const { + return children_.empty(); + } bool IsPreterminal() const { return children_.size() == 1 && children_[0]->IsLeaf(); } - void AddChild(DerivedType *child) { children_.push_back(child); } + void AddChild(DerivedType *child) { + children_.push_back(child); + } - private: +private: T label_; std::vector children_; DerivedType *parent_; }; template -class SyntaxTree : public SyntaxTreeBase > { - public: +class SyntaxTree : public SyntaxTreeBase > +{ +public: typedef SyntaxTreeBase > BaseType; SyntaxTree(const T &label) : BaseType(label) {} SyntaxTree(const T &label, const std::vector &children) - : BaseType(label, children) {} + : BaseType(label, children) {} }; template -SyntaxTreeBase::~SyntaxTreeBase() { +SyntaxTreeBase::~SyntaxTreeBase() +{ for (std::size_t i = 0; i < children_.size(); ++i) { delete children_[i]; } diff --git a/phrase-extract/pcfg-common/tool.h b/phrase-extract/pcfg-common/tool.h index 0af342569..aada036e3 100644 --- a/phrase-extract/pcfg-common/tool.h +++ b/phrase-extract/pcfg-common/tool.h @@ -1,17 +1,17 @@ /*********************************************************************** Moses - statistical machine translation system Copyright (C) 2006-2012 University of Edinburgh - + This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. - + This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
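An illustrative aside, not part of the patch: SyntaxTree<T> is the concrete curiously-recurring instantiation of SyntaxTreeBase, and the virtual destructor above deletes every child, so heap-allocated children handed to AddChild are owned by their parent. The stripped template parameter lists are assumed to be the usual <T> and <T, DerivedType> pairs:

  #include <cassert>
  #include <string>

  typedef Moses::PCFG::SyntaxTree<std::string> Tree;

  Tree *root = new Tree("S");
  root->AddChild(new Tree("NP"));
  root->AddChild(new Tree("VP"));
  assert(!root->IsLeaf());
  assert(!root->IsPreterminal());  // two children, so not a preterminal
  delete root;                     // recursively frees NP and VP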
- + You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA @@ -28,18 +28,23 @@ #include #include -namespace Moses { -namespace PCFG { +namespace Moses +{ +namespace PCFG +{ -class Tool { - public: +class Tool +{ +public: virtual ~Tool() {} - const std::string &name() const { return name_; } + const std::string &name() const { + return name_; + } virtual int Main(int argc, char *argv[]) = 0; - protected: +protected: Tool(const std::string &name) : name_(name) {} // Returns the boost::program_options style that should be used by all tools. @@ -77,7 +82,7 @@ class Tool { // the file cannot be opened for writing. void OpenNamedOutputOrDie(const std::string &, std::ofstream &); - private: +private: std::string name_; std::istream *input_ptr_; std::ifstream input_file_stream_; diff --git a/phrase-extract/pcfg-common/typedef.h b/phrase-extract/pcfg-common/typedef.h index 49a12d681..ce3e0423b 100644 --- a/phrase-extract/pcfg-common/typedef.h +++ b/phrase-extract/pcfg-common/typedef.h @@ -1,17 +1,17 @@ /*********************************************************************** Moses - statistical machine translation system Copyright (C) 2006-2012 University of Edinburgh - + This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. - + This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. - + You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA @@ -26,8 +26,10 @@ #include -namespace Moses { -namespace PCFG { +namespace Moses +{ +namespace PCFG +{ typedef NumberedSet Vocabulary; diff --git a/phrase-extract/pcfg-common/xml_tree_parser.h b/phrase-extract/pcfg-common/xml_tree_parser.h index 7d01b0684..7eec14033 100644 --- a/phrase-extract/pcfg-common/xml_tree_parser.h +++ b/phrase-extract/pcfg-common/xml_tree_parser.h @@ -1,17 +1,17 @@ /*********************************************************************** Moses - statistical machine translation system Copyright (C) 2006-2012 University of Edinburgh - + This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. - + This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. - + You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA @@ -30,16 +30,19 @@ #include #include -namespace Moses { -namespace PCFG { +namespace Moses +{ +namespace PCFG +{ // Parses a string in Moses' XML parse tree format and returns a PcfgTree // object. 
-class XmlTreeParser { - public: +class XmlTreeParser +{ +public: XmlTreeParser(); std::auto_ptr Parse(const std::string &); - private: +private: std::auto_ptr ConvertTree(const MosesTraining::SyntaxNode &, const std::vector &); diff --git a/phrase-extract/pcfg-common/xml_tree_writer.h b/phrase-extract/pcfg-common/xml_tree_writer.h index 6a9a3de05..426efec17 100644 --- a/phrase-extract/pcfg-common/xml_tree_writer.h +++ b/phrase-extract/pcfg-common/xml_tree_writer.h @@ -1,17 +1,17 @@ /*********************************************************************** Moses - statistical machine translation system Copyright (C) 2006-2012 University of Edinburgh - + This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. - + This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. - + You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA @@ -32,12 +32,15 @@ #include #include -namespace Moses { -namespace PCFG { +namespace Moses +{ +namespace PCFG +{ template -class XmlOutputHandler { - public: +class XmlOutputHandler +{ +public: typedef std::map AttributeMap; void GetLabel(const InputTree &, std::string &) const; @@ -45,17 +48,19 @@ class XmlOutputHandler { }; template -class XmlTreeWriter : public XmlOutputHandler { - public: +class XmlTreeWriter : public XmlOutputHandler +{ +public: typedef XmlOutputHandler Base; void Write(const InputTree &, std::ostream &) const; - private: +private: std::string Escape(const std::string &) const; }; template void XmlTreeWriter::Write(const InputTree &tree, - std::ostream &out) const { + std::ostream &out) const +{ assert(!tree.IsLeaf()); // Opening tag @@ -99,7 +104,8 @@ void XmlTreeWriter::Write(const InputTree &tree, // Escapes XML special characters. template -std::string XmlTreeWriter::Escape(const std::string &s) const { +std::string XmlTreeWriter::Escape(const std::string &s) const +{ std::string t; std::size_t len = s.size(); t.reserve(len); diff --git a/phrase-extract/pcfg-extract/options.h b/phrase-extract/pcfg-extract/options.h index 3acb31b58..2633f025a 100644 --- a/phrase-extract/pcfg-extract/options.h +++ b/phrase-extract/pcfg-extract/options.h @@ -1,17 +1,17 @@ /*********************************************************************** Moses - statistical machine translation system Copyright (C) 2006-2012 University of Edinburgh - + This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. - + This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
- + You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA @@ -23,8 +23,10 @@ #include -namespace Moses { -namespace PCFG { +namespace Moses +{ +namespace PCFG +{ struct Options { std::string corpus_file; diff --git a/phrase-extract/pcfg-extract/pcfg_extract.h b/phrase-extract/pcfg-extract/pcfg_extract.h index 1af6cb4fe..e8c306876 100644 --- a/phrase-extract/pcfg-extract/pcfg_extract.h +++ b/phrase-extract/pcfg-extract/pcfg_extract.h @@ -1,17 +1,17 @@ /*********************************************************************** Moses - statistical machine translation system Copyright (C) 2006-2012 University of Edinburgh - + This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. - + This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. - + You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA @@ -23,16 +23,19 @@ #include "pcfg-common/tool.h" -namespace Moses { -namespace PCFG { +namespace Moses +{ +namespace PCFG +{ class Options; -class PcfgExtract : public Tool { - public: +class PcfgExtract : public Tool +{ +public: PcfgExtract() : Tool("pcfg-extract") {} virtual int Main(int, char *[]); - private: +private: void ProcessOptions(int, char *[], Options &) const; }; diff --git a/phrase-extract/pcfg-extract/rule_collection.h b/phrase-extract/pcfg-extract/rule_collection.h index 452fa0e97..32cb2dc05 100644 --- a/phrase-extract/pcfg-extract/rule_collection.h +++ b/phrase-extract/pcfg-extract/rule_collection.h @@ -1,17 +1,17 @@ /*********************************************************************** Moses - statistical machine translation system Copyright (C) 2006-2012 University of Edinburgh - + This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. - + This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. - + You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA @@ -27,12 +27,15 @@ #include -namespace Moses { -namespace PCFG { +namespace Moses +{ +namespace PCFG +{ // Contains PCFG rules and their counts. 
-class RuleCollection { - public: +class RuleCollection +{ +public: typedef boost::unordered_map, std::size_t> RhsCountMap; typedef boost::unordered_map Map; typedef Map::iterator iterator; @@ -40,16 +43,24 @@ class RuleCollection { RuleCollection() {} - iterator begin() { return collection_.begin(); } - const_iterator begin() const { return collection_.begin(); } + iterator begin() { + return collection_.begin(); + } + const_iterator begin() const { + return collection_.begin(); + } - iterator end() { return collection_.end(); } - const_iterator end() const { return collection_.end(); } + iterator end() { + return collection_.end(); + } + const_iterator end() const { + return collection_.end(); + } void Add(std::size_t, const std::vector &); void CreatePcfg(Pcfg &); - private: +private: Map collection_; }; diff --git a/phrase-extract/pcfg-extract/rule_extractor.h b/phrase-extract/pcfg-extract/rule_extractor.h index 6bcffbc61..e4b411c01 100644 --- a/phrase-extract/pcfg-extract/rule_extractor.h +++ b/phrase-extract/pcfg-extract/rule_extractor.h @@ -1,17 +1,17 @@ /*********************************************************************** Moses - statistical machine translation system Copyright (C) 2006-2012 University of Edinburgh - + This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. - + This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. - + You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA @@ -25,17 +25,20 @@ #include "pcfg-common/typedef.h" -namespace Moses { -namespace PCFG { +namespace Moses +{ +namespace PCFG +{ class PcfgTree; // Extracts PCFG rules from syntax trees and adds them to a RuleCollection. -class RuleExtractor { - public: +class RuleExtractor +{ +public: RuleExtractor(Vocabulary &); void Extract(const PcfgTree &, RuleCollection &) const; - private: +private: Vocabulary &non_term_vocab_; }; diff --git a/phrase-extract/pcfg-score/options.h b/phrase-extract/pcfg-score/options.h index e54b2a0b9..fd54b4b6b 100644 --- a/phrase-extract/pcfg-score/options.h +++ b/phrase-extract/pcfg-score/options.h @@ -1,17 +1,17 @@ /*********************************************************************** Moses - statistical machine translation system Copyright (C) 2006-2012 University of Edinburgh - + This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. - + This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
- + You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA @@ -23,8 +23,10 @@ #include -namespace Moses { -namespace PCFG { +namespace Moses +{ +namespace PCFG +{ struct Options { std::string pcfg_file; diff --git a/phrase-extract/pcfg-score/pcfg_score.h b/phrase-extract/pcfg-score/pcfg_score.h index 5e506c39d..f49c9a0be 100644 --- a/phrase-extract/pcfg-score/pcfg_score.h +++ b/phrase-extract/pcfg-score/pcfg_score.h @@ -1,17 +1,17 @@ /*********************************************************************** Moses - statistical machine translation system Copyright (C) 2006-2012 University of Edinburgh - + This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. - + This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. - + You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA @@ -23,16 +23,19 @@ #include "pcfg-common/tool.h" -namespace Moses { -namespace PCFG { +namespace Moses +{ +namespace PCFG +{ class Options; -class PcfgScore : public Tool { - public: +class PcfgScore : public Tool +{ +public: PcfgScore() : Tool("pcfg-score") {} virtual int Main(int, char *[]); - private: +private: void ProcessOptions(int, char *[], Options &) const; }; diff --git a/phrase-extract/pcfg-score/tree_scorer.h b/phrase-extract/pcfg-score/tree_scorer.h index 36f4e1e99..8cb59c0c2 100644 --- a/phrase-extract/pcfg-score/tree_scorer.h +++ b/phrase-extract/pcfg-score/tree_scorer.h @@ -1,17 +1,17 @@ /*********************************************************************** Moses - statistical machine translation system Copyright (C) 2006-2012 University of Edinburgh - + This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. - + This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. - + You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA @@ -25,18 +25,21 @@ #include "pcfg-common/pcfg_tree.h" #include "pcfg-common/typedef.h" -namespace Moses { -namespace PCFG { +namespace Moses +{ +namespace PCFG +{ -class TreeScorer { - public: +class TreeScorer +{ +public: TreeScorer(const Pcfg &, const Vocabulary &); // Score tree according to PCFG. Returns false if unsuccessful (due to // missing rule). 
bool Score(PcfgTree &) const; - private: +private: const Pcfg &pcfg_; const Vocabulary &non_term_vocab_; }; diff --git a/phrase-extract/score-main.cpp b/phrase-extract/score-main.cpp index 0e4ad57f4..3042cbe3e 100644 --- a/phrase-extract/score-main.cpp +++ b/phrase-extract/score-main.cpp @@ -68,7 +68,7 @@ float minCountHierarchical = 0; Vocabulary vcbT; Vocabulary vcbS; - + } // namespace vector tokenize( const char [] ); @@ -130,18 +130,18 @@ int main(int argc, char* argv[]) cerr << "not computing lexical translation score\n"; } else if (strcmp(argv[i],"--GoodTuring") == 0) { goodTuringFlag = true; - fileNameCountOfCounts = string(fileNamePhraseTable) + ".coc"; + fileNameCountOfCounts = string(fileNamePhraseTable) + ".coc"; cerr << "adjusting phrase translation probabilities with Good Turing discounting\n"; } else if (strcmp(argv[i],"--KneserNey") == 0) { kneserNeyFlag = true; - fileNameCountOfCounts = string(fileNamePhraseTable) + ".coc"; + fileNameCountOfCounts = string(fileNamePhraseTable) + ".coc"; cerr << "adjusting phrase translation probabilities with Kneser Ney discounting\n"; } else if (strcmp(argv[i],"--UnalignedPenalty") == 0) { unalignedFlag = true; cerr << "using unaligned word penalty\n"; } else if (strcmp(argv[i],"--UnalignedFunctionWordPenalty") == 0) { unalignedFWFlag = true; - if (i+1==argc) { + if (i+1==argc) { cerr << "ERROR: specify count of count files for Kneser Ney discounting!\n"; exit(1); } @@ -204,22 +204,21 @@ int main(int argc, char* argv[]) istream &extractFileP = extractFile; // output file: phrase translation table - ostream *phraseTableFile; + ostream *phraseTableFile; + + if (fileNamePhraseTable == "-") { + phraseTableFile = &cout; + } else { + Moses::OutputFileStream *outputFile = new Moses::OutputFileStream(); + bool success = outputFile->Open(fileNamePhraseTable); + if (!success) { + cerr << "ERROR: could not open file phrase table file " + << fileNamePhraseTable << endl; + exit(1); + } + phraseTableFile = outputFile; + } - if (fileNamePhraseTable == "-") { - phraseTableFile = &cout; - } - else { - Moses::OutputFileStream *outputFile = new Moses::OutputFileStream(); - bool success = outputFile->Open(fileNamePhraseTable); - if (!success) { - cerr << "ERROR: could not open file phrase table file " - << fileNamePhraseTable << endl; - exit(1); - } - phraseTableFile = outputFile; - } - // loop through all extracted phrase translations float lastCount = 0.0f; float lastPcfgSum = 0.0f; @@ -250,25 +249,23 @@ int main(int argc, char* argv[]) lastPcfgSum = phrasePair.pcfgSum; // only differs in count? 
just add count - if (lastPhrasePair != NULL - && lastPhrasePair->equals( phrasePair ) - && featureManager.equals(*lastPhrasePair, phrasePair)) { + if (lastPhrasePair != NULL + && lastPhrasePair->equals( phrasePair ) + && featureManager.equals(*lastPhrasePair, phrasePair)) { lastPhrasePair->count += phrasePair.count; lastPhrasePair->pcfgSum += phrasePair.pcfgSum; continue; } - + // if new source phrase, process last batch if (lastPhrasePair != NULL && lastPhrasePair->GetSource() != phrasePair.GetSource()) { processPhrasePairs( phrasePairsWithSameF, *phraseTableFile, isSingleton, featureManager, maybeLogProb ); - + phrasePairsWithSameF.clear(); isSingleton = false; lastPhrasePair = NULL; - } - else - { + } else { isSingleton = true; } @@ -277,11 +274,11 @@ int main(int argc, char* argv[]) lastPhrasePair = &phrasePairsWithSameF.back(); } processPhrasePairs( phrasePairsWithSameF, *phraseTableFile, isSingleton, featureManager, maybeLogProb ); - - phraseTableFile->flush(); - if (phraseTableFile != &cout) { - delete phraseTableFile; - } + + phraseTableFile->flush(); + if (phraseTableFile != &cout) { + delete phraseTableFile; + } // output count of count statistics if (goodTuringFlag || kneserNeyFlag) { @@ -292,13 +289,13 @@ int main(int argc, char* argv[]) void writeCountOfCounts( const string &fileNameCountOfCounts ) { // open file - Moses::OutputFileStream countOfCountsFile; - bool success = countOfCountsFile.Open(fileNameCountOfCounts.c_str()); - if (!success) { - cerr << "ERROR: could not open count-of-counts file " - << fileNameCountOfCounts << endl; + Moses::OutputFileStream countOfCountsFile; + bool success = countOfCountsFile.Open(fileNameCountOfCounts.c_str()); + if (!success) { + cerr << "ERROR: could not open count-of-counts file " + << fileNameCountOfCounts << endl; return; - } + } // Kneser-Ney needs the total number of phrase pairs countOfCountsFile << totalDistinct << endl; @@ -307,7 +304,7 @@ void writeCountOfCounts( const string &fileNameCountOfCounts ) for(int i=1; i<=COC_MAX; i++) { countOfCountsFile << countOfCounts[ i ] << endl; } - countOfCountsFile.Close(); + countOfCountsFile.Close(); } void processPhrasePairs( vector< PhraseAlignment > &phrasePair, ostream &phraseTableFile, bool isSingleton, const ScoreFeatureManager& featureManager, const MaybeLog& maybeLogProb ) @@ -317,65 +314,63 @@ void processPhrasePairs( vector< PhraseAlignment > &phrasePair, ostream &phraseT // group phrase pairs based on alignments that matter // (i.e. that re-arrange non-terminals) PhrasePairGroup phrasePairGroup; - + float totalSource = 0; //cerr << "phrasePair.size() = " << phrasePair.size() << endl; - + // loop through phrase pairs for(size_t i=0; i::const_iterator i = extraSparse.begin(); - i != extraSparse.end(); ++i) { + i != extraSparse.end(); ++i) { phraseTableFile << " " << i->first << " " << i->second; } @@ -633,8 +614,8 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo // alignment info for non-terminals if (! 
inverseFlag) { if (hierarchicalFlag) { - // always output alignment if hiero style, but only for non-terms - // (eh: output all alignments, needed for some feature functions) + // always output alignment if hiero style, but only for non-terms + // (eh: output all alignments, needed for some feature functions) assert(phraseT.size() == bestAlignment.alignedToT.size() + 1); std::vector alignment; for(size_t j = 0; j < phraseT.size() - 1; j++) { @@ -657,15 +638,15 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo std::stringstream point; point << sourcePos << "-" << j; alignment.push_back(point.str()); - } - } - } - // now print all alignments, sorted by source index - sort(alignment.begin(), alignment.end()); - for (size_t i = 0; i < alignment.size(); ++i) { - phraseTableFile << alignment[i] << " "; - } - } else if (wordAlignmentFlag) { + } + } + } + // now print all alignments, sorted by source index + sort(alignment.begin(), alignment.end()); + for (size_t i = 0; i < alignment.size(); ++i) { + phraseTableFile << alignment[i] << " "; + } + } else if (wordAlignmentFlag) { // alignment info in pb model for(size_t j=0; j &aligned = bestAlignment.alignedToT[j]; @@ -678,28 +659,26 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo // counts - + phraseTableFile << " ||| " << totalCount << " " << count; - if (kneserNeyFlag) + if (kneserNeyFlag) phraseTableFile << " " << distinctCount; - - // nt lengths - if (outputNTLengths) - { + + // nt lengths + if (outputNTLengths) { phraseTableFile << " ||| "; - if (!inverseFlag) - { + if (!inverseFlag) { map > sourceProb, targetProb; // 1st sourcePos, 2nd = length, 3rd = prob calcNTLengthProb(phrasePair, sourceProb, targetProb); - + outputNTLengthProbs(phraseTableFile, sourceProb, "S"); outputNTLengthProbs(phraseTableFile, targetProb, "T"); - } + } } - + phraseTableFile << endl; } @@ -878,13 +857,13 @@ void printTargetPhrase(const PHRASE &phraseS, const PHRASE &phraseT, std::pair PhrasePairGroup::insert ( const PhraseAlignmentCollection& obj ) { std::pair ret = m_coll.insert(obj); - - if (ret.second) - { // obj inserted. Also add to sorted vector + + if (ret.second) { + // obj inserted. 
Also add to sorted vector const PhraseAlignmentCollection &insertedObj = *ret.first; m_sortedColl.push_back(&insertedObj); } - + return ret; } diff --git a/phrase-extract/score.h b/phrase-extract/score.h index 59d2cf58f..6a10536c1 100644 --- a/phrase-extract/score.h +++ b/phrase-extract/score.h @@ -32,6 +32,6 @@ inline bool isNonTerminal( const std::string &word ) return (word.length()>=3 && word[0] == '[' && word[word.length()-1] == ']'); } - + } diff --git a/phrase-extract/tables-core.cpp b/phrase-extract/tables-core.cpp index 6b35f371b..30c1544e9 100644 --- a/phrase-extract/tables-core.cpp +++ b/phrase-extract/tables-core.cpp @@ -33,8 +33,9 @@ vector tokenize( const char* input ) namespace MosesTraining { -bool isNonTerminal( const WORD &symbol ) { - return symbol.substr(0, 1) == "[" && symbol.substr(symbol.size()-1, 1) == "]"; +bool isNonTerminal( const WORD &symbol ) +{ + return symbol.substr(0, 1) == "[" && symbol.substr(symbol.size()-1, 1) == "]"; } WORD_ID Vocabulary::storeIfNew( const WORD& word ) @@ -105,7 +106,7 @@ void DTable::load( const string& fileName ) std::cerr << "Error reading from " << fileName << std::endl; abort(); } - + vector token = tokenize(line.c_str()); if (token.size() < 2) { cerr << "line " << i << " in " << fileName << " too short, skipping\n"; diff --git a/scripts/other/beautify.perl b/scripts/other/beautify.perl index 110539893..01afb1af2 100755 --- a/scripts/other/beautify.perl +++ b/scripts/other/beautify.perl @@ -1,31 +1,46 @@ -#!/usr/bin/perl +#!/usr/bin/perl use strict; +use File::Basename; -#my $cmd = "astyle --style='k&r' -s2 -v --recursive *.h *.cpp"; -#print STDERR "Executing: $cmd \n"; -#system($cmd); +sub Beautify($); -opendir(DIR,".") or die "Can't open the current directory: $!\n"; +Beautify("/home/hieu/workspace/github/mosesdecoder"); + +sub Beautify($) +{ +my $path = shift; +opendir(DIR, $path) or die "Can't open the current directory: $!\n"; # read file/directory names in that directory into @names my @names = readdir(DIR) or die "Unable to read current dir:$!\n"; foreach my $name (@names) { - next if ($name eq "."); # skip the current directory entry - next if ($name eq ".."); # skip the parent directory entry - next if ($name eq "boost"); # skip the parent directory entry - next if ($name eq "contrib"); # skip the parent directory entry - next if ($name eq "jam-files"); # skip the parent directory entry - next if ($name eq ".git"); # skip the parent directory entry + next if ($name eq "."); + next if ($name eq ".."); + next if ($name eq "boost"); + next if ($name eq "contrib"); + next if ($name eq "jam-files"); + next if ($name eq ".git"); + next if ($name eq "util"); + next if ($name eq "lm"); + next if ($name eq "search"); - if (-d $name){ # is this a directory? - my $cmd = "astyle --style='k&r' -s2 -v --recursive $name/*.h $name/*.cpp"; - print STDERR "Executing: $cmd \n"; - system($cmd); - - next; # can skip to the next name in the for loop + $name = $path ."/" .$name; + if (-d $name) { + print STDERR "Into: $name \n"; + Beautify($name); + } + else { # is this a directory? 
+ (my $nameOnly, my $pathOnly,my $suffix) = fileparse($name,qr"\..[^.]*$"); + if ($suffix eq ".cpp" || $suffix eq ".h") { + my $cmd = "astyle --style='k&r' -s2 -v $name"; + print STDERR "Executing: $cmd \n"; + system($cmd); + } } } closedir(DIR); +} + diff --git a/symal/symal.cpp b/symal/symal.cpp index da386d973..dbe68f1b9 100644 --- a/symal/symal.cpp +++ b/symal/symal.cpp @@ -411,7 +411,7 @@ int main(int argc, char** argv) "o", CMDSTRINGTYPE, &output, "v", CMDENUMTYPE, &verbose, BoolEnum, "verbose", CMDENUMTYPE, &verbose, BoolEnum, - + (char*)NULL); GetParams(&argc, &argv, (char*)NULL); diff --git a/util/double-conversion/bignum-dtoa.h b/util/double-conversion/bignum-dtoa.h index 34b961992..652a4db9a 100644 --- a/util/double-conversion/bignum-dtoa.h +++ b/util/double-conversion/bignum-dtoa.h @@ -30,7 +30,8 @@ #include "utils.h" -namespace double_conversion { +namespace double_conversion +{ enum BignumDtoaMode { // Return the shortest correct representation. diff --git a/util/double-conversion/bignum.h b/util/double-conversion/bignum.h index 5ec3544f5..5deadbfbe 100644 --- a/util/double-conversion/bignum.h +++ b/util/double-conversion/bignum.h @@ -30,10 +30,12 @@ #include "utils.h" -namespace double_conversion { +namespace double_conversion +{ -class Bignum { - public: +class Bignum +{ +public: // 3584 = 128 * 28. We can represent 2^3584 > 10^1000 accurately. // This bignum can encode much bigger numbers, since it contains an // exponent. @@ -60,7 +62,9 @@ class Bignum { void MultiplyByUInt32(uint32_t factor); void MultiplyByUInt64(uint64_t factor); void MultiplyByPowerOfTen(int exponent); - void Times10() { return MultiplyByUInt32(10); } + void Times10() { + return MultiplyByUInt32(10); + } // Pseudocode: // int result = this / other; // this = this % other; @@ -97,7 +101,7 @@ class Bignum { static bool PlusLess(const Bignum& a, const Bignum& b, const Bignum& c) { return PlusCompare(a, b, c) < 0; } - private: +private: typedef uint32_t Chunk; typedef uint64_t DoubleChunk; @@ -125,7 +129,9 @@ class Bignum { // shift_amount must be < kBigitSize. void BigitsShiftLeft(int shift_amount); // BigitLength includes the "hidden" digits encoded in the exponent. - int BigitLength() const { return used_digits_ + exponent_; } + int BigitLength() const { + return used_digits_ + exponent_; + } Chunk BigitAt(int index) const; void SubtractTimes(const Bignum& other, int factor); diff --git a/util/double-conversion/cached-powers.h b/util/double-conversion/cached-powers.h index 61a50614c..3daf52d51 100644 --- a/util/double-conversion/cached-powers.h +++ b/util/double-conversion/cached-powers.h @@ -30,10 +30,12 @@ #include "diy-fp.h" -namespace double_conversion { +namespace double_conversion +{ -class PowersOfTenCache { - public: +class PowersOfTenCache +{ +public: // Not all powers of ten are cached. The decimal exponent of two neighboring // cached numbers will differ by kDecimalExponentDistance. @@ -45,9 +47,9 @@ class PowersOfTenCache { // Returns a cached power-of-ten with a binary exponent in the range // [min_exponent; max_exponent] (boundaries included). static void GetCachedPowerForBinaryExponentRange(int min_exponent, - int max_exponent, - DiyFp* power, - int* decimal_exponent); + int max_exponent, + DiyFp* power, + int* decimal_exponent); // Returns a cached power of ten x ~= 10^k such that // k <= decimal_exponent < k + kCachedPowersDecimalDistance. 
@@ -55,8 +57,8 @@ class PowersOfTenCache { // kMinDecimalExponent <= requested_exponent, and // requested_exponent < kMaxDecimalExponent + kDecimalExponentDistance. static void GetCachedPowerForDecimalExponent(int requested_exponent, - DiyFp* power, - int* found_exponent); + DiyFp* power, + int* found_exponent); }; } // namespace double_conversion diff --git a/util/double-conversion/diy-fp.h b/util/double-conversion/diy-fp.h index 9dcf8fbdb..39a6bd7dd 100644 --- a/util/double-conversion/diy-fp.h +++ b/util/double-conversion/diy-fp.h @@ -30,15 +30,17 @@ #include "utils.h" -namespace double_conversion { +namespace double_conversion +{ // This "Do It Yourself Floating Point" class implements a floating-point number // with a uint64 significand and an int exponent. Normalized DiyFp numbers will // have the most significant bit of the significand set. // Multiplication and Subtraction do not normalize their results. // DiyFp are not designed to contain special doubles (NaN and Infinity). -class DiyFp { - public: +class DiyFp +{ +public: static const int kSignificandSize = 64; DiyFp() : f_(0), e_(0) {} @@ -100,13 +102,21 @@ class DiyFp { return result; } - uint64_t f() const { return f_; } - int e() const { return e_; } + uint64_t f() const { + return f_; + } + int e() const { + return e_; + } - void set_f(uint64_t new_value) { f_ = new_value; } - void set_e(int new_value) { e_ = new_value; } + void set_f(uint64_t new_value) { + f_ = new_value; + } + void set_e(int new_value) { + e_ = new_value; + } - private: +private: static const uint64_t kUint64MSB = UINT64_2PART_C(0x80000000, 00000000); uint64_t f_; diff --git a/util/double-conversion/double-conversion.h b/util/double-conversion/double-conversion.h index 1c3387d4f..b3e51bae8 100644 --- a/util/double-conversion/double-conversion.h +++ b/util/double-conversion/double-conversion.h @@ -30,10 +30,12 @@ #include "utils.h" -namespace double_conversion { +namespace double_conversion +{ -class DoubleToStringConverter { - public: +class DoubleToStringConverter +{ +public: // When calling ToFixed with a double > 10^kMaxFixedDigitsBeforePoint // or a requested_digits parameter > kMaxFixedDigitsAfterPoint then the // function returns false. @@ -112,20 +114,20 @@ class DoubleToStringConverter { int decimal_in_shortest_high, int max_leading_padding_zeroes_in_precision_mode, int max_trailing_padding_zeroes_in_precision_mode) - : flags_(flags), - infinity_symbol_(infinity_symbol), - nan_symbol_(nan_symbol), - exponent_character_(exponent_character), - decimal_in_shortest_low_(decimal_in_shortest_low), - decimal_in_shortest_high_(decimal_in_shortest_high), - max_leading_padding_zeroes_in_precision_mode_( - max_leading_padding_zeroes_in_precision_mode), - max_trailing_padding_zeroes_in_precision_mode_( - max_trailing_padding_zeroes_in_precision_mode) { + : flags_(flags), + infinity_symbol_(infinity_symbol), + nan_symbol_(nan_symbol), + exponent_character_(exponent_character), + decimal_in_shortest_low_(decimal_in_shortest_low), + decimal_in_shortest_high_(decimal_in_shortest_high), + max_leading_padding_zeroes_in_precision_mode_( + max_leading_padding_zeroes_in_precision_mode), + max_trailing_padding_zeroes_in_precision_mode_( + max_trailing_padding_zeroes_in_precision_mode) { // When 'trailing zero after the point' is set, then 'trailing point' // must be set too. 
ASSERT(((flags & EMIT_TRAILING_DECIMAL_POINT) != 0) || - !((flags & EMIT_TRAILING_ZERO_AFTER_POINT) != 0)); + !((flags & EMIT_TRAILING_ZERO_AFTER_POINT) != 0)); } // Returns a converter following the EcmaScript specification. @@ -341,7 +343,7 @@ class DoubleToStringConverter { int* length, int* point); - private: +private: // Implementation for ToShortest and ToShortestSingle. bool ToShortestIeeeNumber(double value, StringBuilder* result_builder, @@ -378,8 +380,9 @@ class DoubleToStringConverter { }; -class StringToDoubleConverter { - public: +class StringToDoubleConverter +{ +public: // Enumeration for allowing octals and ignoring junk when converting // strings to numbers. enum Flags { @@ -488,11 +491,11 @@ class StringToDoubleConverter { double junk_string_value, const char* infinity_symbol, const char* nan_symbol) - : flags_(flags), - empty_string_value_(empty_string_value), - junk_string_value_(junk_string_value), - infinity_symbol_(infinity_symbol), - nan_symbol_(nan_symbol) { + : flags_(flags), + empty_string_value_(empty_string_value), + junk_string_value_(junk_string_value), + infinity_symbol_(infinity_symbol), + nan_symbol_(nan_symbol) { } // Performs the conversion. @@ -516,7 +519,7 @@ class StringToDoubleConverter { processed_characters_count, false)); } - private: +private: const int flags_; const double empty_string_value_; const double junk_string_value_; diff --git a/util/double-conversion/fast-dtoa.h b/util/double-conversion/fast-dtoa.h index 5f1e8eee5..184f9cade 100644 --- a/util/double-conversion/fast-dtoa.h +++ b/util/double-conversion/fast-dtoa.h @@ -30,7 +30,8 @@ #include "utils.h" -namespace double_conversion { +namespace double_conversion +{ enum FastDtoaMode { // Computes the shortest representation of the given input. The returned diff --git a/util/double-conversion/fixed-dtoa.h b/util/double-conversion/fixed-dtoa.h index 3bdd08e21..9383cb936 100644 --- a/util/double-conversion/fixed-dtoa.h +++ b/util/double-conversion/fixed-dtoa.h @@ -30,7 +30,8 @@ #include "utils.h" -namespace double_conversion { +namespace double_conversion +{ // Produces digits necessary to print a given number with // 'fractional_count' digits after the decimal point. diff --git a/util/double-conversion/ieee.h b/util/double-conversion/ieee.h index 839dc47d4..0922129d5 100644 --- a/util/double-conversion/ieee.h +++ b/util/double-conversion/ieee.h @@ -30,17 +30,31 @@ #include "diy-fp.h" -namespace double_conversion { +namespace double_conversion +{ // We assume that doubles and uint64_t have the same endianness. -static uint64_t double_to_uint64(double d) { return BitCast(d); } -static double uint64_to_double(uint64_t d64) { return BitCast(d64); } -static uint32_t float_to_uint32(float f) { return BitCast(f); } -static float uint32_to_float(uint32_t d32) { return BitCast(d32); } +static uint64_t double_to_uint64(double d) +{ + return BitCast(d); +} +static double uint64_to_double(uint64_t d64) +{ + return BitCast(d64); +} +static uint32_t float_to_uint32(float f) +{ + return BitCast(f); +} +static float uint32_to_float(uint32_t d32) +{ + return BitCast(d32); +} // Helper functions for doubles. 
-class Double { - public: +class Double +{ +public: static const uint64_t kSignMask = UINT64_2PART_C(0x80000000, 00000000); static const uint64_t kExponentMask = UINT64_2PART_C(0x7FF00000, 00000000); static const uint64_t kSignificandMask = UINT64_2PART_C(0x000FFFFF, FFFFFFFF); @@ -113,7 +127,7 @@ class Double { uint64_t d64 = AsUint64(); int biased_e = - static_cast((d64 & kExponentMask) >> kPhysicalSignificandSize); + static_cast((d64 & kExponentMask) >> kPhysicalSignificandSize); return biased_e - kExponentBias; } @@ -143,13 +157,13 @@ class Double { bool IsNan() const { uint64_t d64 = AsUint64(); return ((d64 & kExponentMask) == kExponentMask) && - ((d64 & kSignificandMask) != 0); + ((d64 & kSignificandMask) != 0); } bool IsInfinite() const { uint64_t d64 = AsUint64(); return ((d64 & kExponentMask) == kExponentMask) && - ((d64 & kSignificandMask) == 0); + ((d64 & kSignificandMask) == 0); } int Sign() const { @@ -197,7 +211,9 @@ class Double { return physical_significand_is_zero && (Exponent() != kDenormalExponent); } - double value() const { return uint64_to_double(d64_); } + double value() const { + return uint64_to_double(d64_); + } // Returns the significand size for a given order of magnitude. // If v = f*2^e with 2^p-1 <= f <= 2^p then p+e is v's order of magnitude. @@ -221,7 +237,7 @@ class Double { return Double(kNaN).value(); } - private: +private: static const int kExponentBias = 0x3FF + kPhysicalSignificandSize; static const int kDenormalExponent = -kExponentBias + 1; static const int kMaxExponent = 0x7FF - kExponentBias; @@ -254,12 +270,13 @@ class Double { biased_exponent = static_cast(exponent + kExponentBias); } return (significand & kSignificandMask) | - (biased_exponent << kPhysicalSignificandSize); + (biased_exponent << kPhysicalSignificandSize); } }; -class Single { - public: +class Single +{ +public: static const uint32_t kSignMask = 0x80000000; static const uint32_t kExponentMask = 0x7F800000; static const uint32_t kSignificandMask = 0x007FFFFF; @@ -289,7 +306,7 @@ class Single { uint32_t d32 = AsUint32(); int biased_e = - static_cast((d32 & kExponentMask) >> kPhysicalSignificandSize); + static_cast((d32 & kExponentMask) >> kPhysicalSignificandSize); return biased_e - kExponentBias; } @@ -319,13 +336,13 @@ class Single { bool IsNan() const { uint32_t d32 = AsUint32(); return ((d32 & kExponentMask) == kExponentMask) && - ((d32 & kSignificandMask) != 0); + ((d32 & kSignificandMask) != 0); } bool IsInfinite() const { uint32_t d32 = AsUint32(); return ((d32 & kExponentMask) == kExponentMask) && - ((d32 & kSignificandMask) == 0); + ((d32 & kSignificandMask) == 0); } int Sign() const { @@ -373,7 +390,9 @@ class Single { return physical_significand_is_zero && (Exponent() != kDenormalExponent); } - float value() const { return uint32_to_float(d32_); } + float value() const { + return uint32_to_float(d32_); + } static float Infinity() { return Single(kInfinity).value(); @@ -383,7 +402,7 @@ class Single { return Single(kNaN).value(); } - private: +private: static const int kExponentBias = 0x7F + kPhysicalSignificandSize; static const int kDenormalExponent = -kExponentBias + 1; static const int kMaxExponent = 0xFF - kExponentBias; diff --git a/util/double-conversion/strtod.h b/util/double-conversion/strtod.h index ed0293b8f..1d81078d2 100644 --- a/util/double-conversion/strtod.h +++ b/util/double-conversion/strtod.h @@ -30,7 +30,8 @@ #include "utils.h" -namespace double_conversion { +namespace double_conversion +{ // The buffer must only contain digits in the range [0-9]. 
It must not // contain a dot or a sign. It must not start with '0', and must not be empty. diff --git a/util/double-conversion/utils.h b/util/double-conversion/utils.h index 2bd716050..91f1e6c48 100644 --- a/util/double-conversion/utils.h +++ b/util/double-conversion/utils.h @@ -126,25 +126,29 @@ typedef unsigned __int64 uint64_t; DISALLOW_COPY_AND_ASSIGN(TypeName) #endif -namespace double_conversion { +namespace double_conversion +{ static const int kCharSize = sizeof(char); // Returns the maximum of the two parameters. template -static T Max(T a, T b) { +static T Max(T a, T b) +{ return a < b ? b : a; } // Returns the minimum of the two parameters. template -static T Min(T a, T b) { +static T Min(T a, T b) +{ return a < b ? a : b; } -inline int StrLength(const char* string) { +inline int StrLength(const char* string) +{ size_t length = strlen(string); ASSERT(length == static_cast(static_cast(length))); return static_cast(length); @@ -152,8 +156,9 @@ inline int StrLength(const char* string) { // This is a simplified version of V8's Vector class. template -class Vector { - public: +class Vector +{ +public: Vector() : start_(NULL), length_(0) {} Vector(T* data, int length) : start_(data), length_(length) { ASSERT(length == 0 || (length > 0 && data != NULL)); @@ -169,13 +174,19 @@ class Vector { } // Returns the length of the vector. - int length() const { return length_; } + int length() const { + return length_; + } // Returns whether or not the vector is empty. - bool is_empty() const { return length_ == 0; } + bool is_empty() const { + return length_ == 0; + } // Returns the pointer to the start of the data in the vector. - T* start() const { return start_; } + T* start() const { + return start_; + } // Access individual vector elements - checks bounds in debug mode. T& operator[](int index) const { @@ -183,11 +194,15 @@ class Vector { return start_[index]; } - T& first() { return start_[0]; } + T& first() { + return start_[0]; + } - T& last() { return start_[length_ - 1]; } + T& last() { + return start_[length_ - 1]; + } - private: +private: T* start_; int length_; }; @@ -196,14 +211,19 @@ class Vector { // Helper class for building result strings in a character buffer. The // purpose of the class is to use safe operations that checks the // buffer bounds on all operations in debug mode. -class StringBuilder { - public: +class StringBuilder +{ +public: StringBuilder(char* buffer, int size) - : buffer_(buffer, size), position_(0) { } + : buffer_(buffer, size), position_(0) { } - ~StringBuilder() { if (!is_finalized()) Finalize(); } + ~StringBuilder() { + if (!is_finalized()) Finalize(); + } - int size() const { return buffer_.length(); } + int size() const { + return buffer_.length(); + } // Get the current position in the builder. int position() const { @@ -212,7 +232,9 @@ class StringBuilder { } // Reset the position. - void Reset() { position_ = 0; } + void Reset() { + position_ = 0; + } // Add a single character to the builder. It is not allowed to add // 0-characters; use the Finalize() method to terminate the string @@ -262,11 +284,13 @@ class StringBuilder { return buffer_.start(); } - private: +private: Vector buffer_; int position_; - bool is_finalized() const { return position_ < 0; } + bool is_finalized() const { + return position_ < 0; + } DISALLOW_IMPLICIT_CONSTRUCTORS(StringBuilder); }; @@ -296,7 +320,8 @@ class StringBuilder { // enough that it can no longer see that you have cast one pointer type to // another thus avoiding the warning. 
template <class Dest, class Source> -inline Dest BitCast(const Source& source) { +inline Dest BitCast(const Source& source) +{ // Compile time assertion: sizeof(Dest) == sizeof(Source) // A compile error here means your Dest and Source have different sizes. typedef char VerifySizesAreEqual[sizeof(Dest) == sizeof(Source) ? 1 : -1]; @@ -307,7 +332,8 @@ inline Dest BitCast(const Source& source) { } template <class Dest, class Source> -inline Dest BitCast(Source* source) { +inline Dest BitCast(Source* source) +{ return BitCast<Dest>(reinterpret_cast<const void*>(source)); }
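
For readers skimming this patch: nearly every hunk above is mechanical reformatting produced by scripts/other/beautify.perl, which runs astyle --style='k&r' -s2 over the tree's .cpp and .h files. A minimal sketch of the conventions that command enforces, as seen throughout the hunks: namespace, class, and free-function braces on their own line; control-flow braces cuddled; access specifiers at class indentation; one-line accessors expanded to three lines; two-space indentation.

    namespace Example
    {

    class Widget
    {
    public:
      // One-line accessors are expanded to three lines.
      int size() const {
        return size_;
      }

    private:
      int size_;
    };

    inline int Clamp(int v, int lo, int hi)
    {
      // Control-flow braces stay on the opening line ("cuddled").
      if (v < lo) {
        return lo;
      }
      return v > hi ? hi : v;
    }

    }  // namespace Example
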
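The pcfg-common hunks centre on SyntaxTreeBase, whose virtual destructor recursively deletes its children, so a tree owns its entire subtree. A minimal usage sketch; it assumes the upstream template signatures SyntaxTreeBase<T, DerivedType> and SyntaxTree<T>, and that the pcfg-common headers are on the include path:

    #include <cassert>
    #include <string>
    #include "pcfg-common/syntax_tree.h"  // assumed include path

    using Moses::PCFG::SyntaxTree;

    int main()
    {
      typedef SyntaxTree<std::string> Tree;  // assumed template argument

      // Build (NP (DT the) (NN cat)). Note AddChild only appends;
      // it does not set the child's parent pointer.
      Tree *np = new Tree("NP");
      Tree *dt = new Tree("DT");
      dt->AddChild(new Tree("the"));
      Tree *nn = new Tree("NN");
      nn->AddChild(new Tree("cat"));
      np->AddChild(dt);
      np->AddChild(nn);

      assert(!np->IsLeaf());
      assert(dt->IsPreterminal());  // exactly one child, and it is a leaf

      // Deleting the root frees the whole tree via ~SyntaxTreeBase.
      delete np;
      return 0;
    }
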
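The util/double-conversion hunks reformat Google's double-conversion library without changing behaviour. For context, a sketch of typical use of the DoubleToStringConverter and StringBuilder classes touched above, assuming the library's stock double-conversion.h is on the include path:

    #include <cstdio>
    #include "double-conversion.h"  // assumed include path

    using double_conversion::DoubleToStringConverter;
    using double_conversion::StringBuilder;

    int main()
    {
      char buffer[128];
      StringBuilder builder(buffer, (int)sizeof(buffer));

      // EcmaScriptConverter() yields JavaScript-style shortest output.
      const DoubleToStringConverter &conv =
        DoubleToStringConverter::EcmaScriptConverter();
      conv.ToShortest(0.1, &builder);

      // Finalize() null-terminates the buffer and returns it.
      std::printf("%s\n", builder.Finalize());  // prints "0.1"
      return 0;
    }
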
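Finally, the BitCast template at the end of utils.h is the standard type-punning helper: it copies bytes with memcpy rather than dereferencing a reinterpret_cast, which is what ieee.h's double_to_uint64 and friends rely on. A self-contained illustration of the same technique, using C++11 static_assert in place of the library's typedef-array size check:

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // Same idea as double_conversion::BitCast: the sizes must match,
    // and memcpy sidesteps strict-aliasing problems.
    template <class Dest, class Source>
    inline Dest BitCast(const Source &source)
    {
      static_assert(sizeof(Dest) == sizeof(Source),
                    "Dest and Source must have the same size");
      Dest dest;
      std::memcpy(&dest, &source, sizeof(dest));
      return dest;
    }

    int main()
    {
      // View the IEEE-754 bits of 1.0, as ieee.h's double_to_uint64 does.
      uint64_t bits = BitCast<uint64_t>(1.0);
      std::printf("%llx\n", (unsigned long long)bits);  // 3ff0000000000000
      return 0;
    }
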