diff --git a/moses/src/Hypothesis.cpp b/moses/src/Hypothesis.cpp index e7b4435fa..718b871b2 100755 --- a/moses/src/Hypothesis.cpp +++ b/moses/src/Hypothesis.cpp @@ -32,7 +32,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #include "StaticData.h" #include "Input.h" #include "LMList.h" -#include "md5.h" +#include "hash.h" using namespace std; @@ -53,6 +53,7 @@ Hypothesis::Hypothesis(InputType const& source) , m_id(s_HypothesesCreated++) { // used for initial seeding of trans process // initialize scores + _hash_computed = false; ResetScore(); } @@ -68,6 +69,7 @@ Hypothesis::Hypothesis(const Hypothesis ©) #endif , m_id (s_HypothesesCreated++) { + _hash_computed = false; m_targetPhrase.AddWords( copy.m_targetPhrase ); // initialize scores @@ -95,6 +97,7 @@ Hypothesis::Hypothesis(const Hypothesis &prevHypo, const TranslationOption &tran assert(!m_sourceCompleted.Overlap(m_currSourceWordsRange)); + _hash_computed = false; m_sourceCompleted.SetValue(m_currSourceWordsRange.GetStartPos(), m_currSourceWordsRange.GetEndPos(), true); // add new words from poss trans @@ -209,6 +212,7 @@ bool Hypothesis::IsCompatible(const Phrase &phrase) const return true; } +#if 0 void Hypothesis::GenerateNGramCompareKey(size_t contextSize) { struct MD5Context md5c; @@ -238,6 +242,32 @@ void Hypothesis::GenerateNGramCompareKey(size_t contextSize) } MD5Final(m_compSignature, &md5c); } +#endif + +void Hypothesis::GenerateNGramCompareHash() const +{ + _hash = 0xcafe5137; // random + const size_t thisSize = GetSize(); + + for (size_t currFactor = 0 ; currFactor < NUM_FACTORS ; currFactor++) + { + size_t ngramMax = StaticData::Instance()->GetMaxNGramOrderForFactorId(currFactor); + if (ngramMax < 2) continue; // unigrams have no context + + const size_t minSize = std::min(ngramMax-1, thisSize); + _hash = quick_hash((const char*)&minSize, sizeof(size_t), _hash); + + for (size_t currNGram = 1 ; currNGram <= minSize ; currNGram++) + { + FactorType factorType = static_cast(currFactor); + const Factor *thisFactor = GetFactor(thisSize - currNGram, factorType); + _hash = quick_hash((const char*)&thisFactor, sizeof(const Factor*), _hash); + } + } + vector wordCoverage = m_sourceCompleted.GetCompressedReprentation(); + _hash = quick_hash((const char*)&wordCoverage[0], sizeof(size_t)*wordCoverage.size(), _hash); + _hash_computed = true; +} int Hypothesis::NGramCompare(const Hypothesis &compare) const { // -1 = this < compare diff --git a/moses/src/Hypothesis.h b/moses/src/Hypothesis.h index 7a9fbfcfb..ff74ed69b 100755 --- a/moses/src/Hypothesis.h +++ b/moses/src/Hypothesis.h @@ -77,6 +77,9 @@ protected: void CalcDeletionScore(const Sentence& sourceSentence, const WordsRange& sourceWordsRange, const WordDeletionTable& wordDeletionTable); + void GenerateNGramCompareHash() const; + mutable size_t _hash; + mutable bool _hash_computed; public: @@ -210,6 +213,13 @@ public: return m_sourceCompleted; } + inline size_t hash() const + { + if (_hash_computed) return _hash; + GenerateNGramCompareHash(); + return _hash; + } + /*** * requires that GenerateNGramCompareKey was previously run */ diff --git a/moses/src/HypothesisCollection.cpp b/moses/src/HypothesisCollection.cpp index e5c357cb8..b8e3f5358 100755 --- a/moses/src/HypothesisCollection.cpp +++ b/moses/src/HypothesisCollection.cpp @@ -25,10 +25,11 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #include "HypothesisCollection.h" #include "TypeDef.h" #include "Util.h" +#include "StaticData.h" using namespace std; -size_t CompareHypothesisCollection::s_ngramMaxOrder[NUM_FACTORS] = {0,0,0,0}; +size_t HypothesisRecombinationOrderer::s_ngramMaxOrder[NUM_FACTORS] = {0,0,0,0}; // need to change if we add more factors, or use a macro void HypothesisCollection::RemoveAll() @@ -62,8 +63,10 @@ bool HypothesisCollection::AddPrune(Hypothesis *hypo) { // if returns false, hypothesis not used // caller must take care to delete unused hypo to avoid leak - if (hypo->GetScore(ScoreType::Total) < m_worstScore) + if (hypo->GetScore(ScoreType::Total) < m_worstScore) { + StaticData::Instance()->GetSentenceStats().numPruned++; return false; + } // over threshold // recombine if ngram-equivalent to another hypo @@ -74,6 +77,7 @@ bool HypothesisCollection::AddPrune(Hypothesis *hypo) return true; } + StaticData::Instance()->GetSentenceStats().numRecombinations++; // found existing hypo with same target ending. // keep the best 1 @@ -148,6 +152,7 @@ void HypothesisCollection::PruneToSize(size_t newSize) { iterator iterRemove = iter++; Remove(iterRemove); + StaticData::Instance()->GetSentenceStats().numPruned++; } else { diff --git a/moses/src/HypothesisCollection.h b/moses/src/HypothesisCollection.h index 6c10dfed1..77d01129a 100755 --- a/moses/src/HypothesisCollection.h +++ b/moses/src/HypothesisCollection.h @@ -25,7 +25,12 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #include #include "Hypothesis.h" -class CompareHypothesisCollection +#if 0 +//#ifdef __GNUG__ +#include +#endif + +class HypothesisRecombinationOrderer { protected: // static @@ -64,11 +69,34 @@ public: } }; +struct HypothesisRecombinationComparer +{ + bool operator()(const Hypothesis* hypoA, const Hypothesis* hypoB) const + { + if (hypoA->NGramCompare(*hypoB) != 0) return false; + return (hypoA->GetWordsBitmap().Compare(hypoB->GetWordsBitmap()) == 0); + } +}; + +struct HypothesisRecombinationHasher +{ + size_t operator()(const Hypothesis* hypo) const { + return hypo->hash(); + } +}; + class HypothesisCollection { +private: +#if 0 +//#ifdef __GNUG__ + typedef __gnu_cxx::hash_set< Hypothesis*, HypothesisRecombinationHasher, HypothesisRecombinationComparer > _HCType; +#else + typedef std::set< Hypothesis*, HypothesisRecombinationOrderer > _HCType; +#endif public: - typedef std::set< Hypothesis*, CompareHypothesisCollection >::iterator iterator; - typedef std::set< Hypothesis*, CompareHypothesisCollection >::const_iterator const_iterator; + typedef _HCType::iterator iterator; + typedef _HCType::const_iterator const_iterator; friend std::ostream& operator<<(std::ostream&, const HypothesisCollection&); protected: @@ -76,7 +104,7 @@ protected: float m_worstScore; float m_beamThreshold; size_t m_maxHypoStackSize; - std::set< Hypothesis*, CompareHypothesisCollection > m_hypos; + _HCType m_hypos; void Add(Hypothesis *hypothesis); diff --git a/moses/src/Manager.cpp b/moses/src/Manager.cpp index 38bdaf382..358e85102 100755 --- a/moses/src/Manager.cpp +++ b/moses/src/Manager.cpp @@ -57,6 +57,7 @@ Manager::~Manager() {} */ void Manager::ProcessSentence() { + m_staticData.GetSentenceStats().ZeroAll(); list < DecodeStep > &decodeStepList = m_staticData.GetDecodeStepList(); // create list of all possible translations // this is only valid if: @@ -104,6 +105,7 @@ void Manager::ProcessSentence() // some more logging if (m_staticData.GetVerboseLevel() > 0) { + cerr << m_staticData.GetSentenceStats(); cerr << "Hypotheses created since startup: "<< Hypothesis::s_HypothesesCreated< + +struct SentenceStats +{ + SentenceStats() : numRecombinations(0), numPruned(0) {}; + unsigned int numRecombinations; + unsigned int numPruned; + + void ZeroAll() { numRecombinations = 0; numPruned = 0; } +}; + +inline std::ostream& operator<<(std::ostream& os, const SentenceStats& ss) +{ + return os << "number of hypotheses recombined=" << ss.numRecombinations << std::endl + << " \" \" pruned=" << ss.numPruned << std::endl; +} + +#endif diff --git a/moses/src/StaticData.cpp b/moses/src/StaticData.cpp index 01d90cfdd..fad5111dc 100755 --- a/moses/src/StaticData.cpp +++ b/moses/src/StaticData.cpp @@ -34,6 +34,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #include "LanguageModel.h" #include "LanguageModelFactory.h" #include "LexicalReordering.h" +#include "SentenceStats.h" #ifndef WIN32 #include "PhraseDictionaryTreeAdaptor.h" @@ -240,7 +241,7 @@ bool StaticData::LoadParameters(int argc, char* argv[]) timer.check(("Finished loading LanguageModel " + languageModelFile).c_str()); m_languageModel[type].push_back(lm); - CompareHypothesisCollection::SetMaxNGramOrder(factorType, nGramMaxOrder); + HypothesisRecombinationOrderer::SetMaxNGramOrder(factorType, nGramMaxOrder); } } // flag indicating that language models were loaded, diff --git a/moses/src/StaticData.h b/moses/src/StaticData.h index 7fef7d4ce..14f19fcf7 100755 --- a/moses/src/StaticData.h +++ b/moses/src/StaticData.h @@ -32,6 +32,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #include "InputOutput.h" #include "DecodeStep.h" #include "LMList.h" +#include "SentenceStats.h" //#include "UnknownWordHandler.h" class InputType; @@ -92,6 +93,8 @@ protected: bool m_reportSourceSpan; bool m_reportAllFactors; + mutable SentenceStats m_sentenceStats; + public: StaticData(); ~StaticData(); @@ -277,5 +280,9 @@ public: int GetInputType() const {return m_inputType;} void InitializeBeforeSentenceProcessing(InputType const&); void CleanUpAfterSentenceProcessing(); + SentenceStats& GetSentenceStats() const + { + return m_sentenceStats; + } }; diff --git a/moses/src/WordsBitmap.cpp b/moses/src/WordsBitmap.cpp index 112106324..1b12aa487 100755 --- a/moses/src/WordsBitmap.cpp +++ b/moses/src/WordsBitmap.cpp @@ -57,3 +57,19 @@ int WordsBitmap::GetFutureCosts(int lastPos) const return sum; } + +std::vector WordsBitmap::GetCompressedReprentation() const +{ + std::vector res(1 + (m_size >> (sizeof(int) + 3)), 0); + size_t c=0; size_t x=0; size_t ci=0; + for(size_t i=0;i GetCompressedReprentation() const; inline int Compare (const WordsBitmap &compare) const {