Enable statistics gathering for the number of hypotheses pruned and recombined. Add code for hash-based recombination; it works, but it behaves slightly differently from the tree-based implementation, so it is not enabled.

git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@363 1f5c12ca-751b-0410-a591-d2e778427230
This commit is contained in:
redpony 2006-07-28 21:18:51 +00:00
parent 04a52a546f
commit fdff6911d6
10 changed files with 130 additions and 8 deletions

View File

@ -32,7 +32,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "StaticData.h"
#include "Input.h"
#include "LMList.h"
#include "md5.h"
#include "hash.h"
using namespace std;
@ -53,6 +53,7 @@ Hypothesis::Hypothesis(InputType const& source)
, m_id(s_HypothesesCreated++)
{ // used for initial seeding of trans process
// initialize scores
_hash_computed = false;
ResetScore();
}
@ -68,6 +69,7 @@ Hypothesis::Hypothesis(const Hypothesis &copy)
#endif
, m_id (s_HypothesesCreated++)
{
_hash_computed = false;
m_targetPhrase.AddWords( copy.m_targetPhrase );
// initialize scores
@ -95,6 +97,7 @@ Hypothesis::Hypothesis(const Hypothesis &prevHypo, const TranslationOption &tran
assert(!m_sourceCompleted.Overlap(m_currSourceWordsRange));
_hash_computed = false;
m_sourceCompleted.SetValue(m_currSourceWordsRange.GetStartPos(), m_currSourceWordsRange.GetEndPos(), true);
// add new words from poss trans
@ -209,6 +212,7 @@ bool Hypothesis::IsCompatible(const Phrase &phrase) const
return true;
}
#if 0
void Hypothesis::GenerateNGramCompareKey(size_t contextSize)
{
struct MD5Context md5c;
@ -238,6 +242,32 @@ void Hypothesis::GenerateNGramCompareKey(size_t contextSize)
}
MD5Final(m_compSignature, &md5c);
}
#endif
// Compute and cache a recombination hash for this hypothesis.
// Per factor, the hash folds in the trailing (ngramMax-1) target-side
// factors — the LM context that can still affect future scores — and then
// the compressed source-coverage bitmap, so hypotheses covering different
// source words never land in the same bucket. Results are memoized in the
// mutable members _hash / _hash_computed so the method can stay const
// (see hash() for the lazy accessor).
void Hypothesis::GenerateNGramCompareHash() const
{
	_hash = 0xcafe5137; // arbitrary seed ("random")
	const size_t thisSize = GetSize();
	for (size_t currFactor = 0 ; currFactor < NUM_FACTORS ; currFactor++)
	{
		size_t ngramMax = StaticData::Instance()->GetMaxNGramOrderForFactorId(currFactor);
		if (ngramMax < 2) continue; // unigrams have no context
		// only the last (ngramMax-1) words matter; clamp to hypothesis length
		const size_t minSize = std::min(ngramMax-1, thisSize);
		// mix in the context length itself so short hypotheses don't
		// accidentally collide with long ones sharing a suffix
		_hash = quick_hash((const char*)&minSize, sizeof(size_t), _hash);
		for (size_t currNGram = 1 ; currNGram <= minSize ; currNGram++)
		{
			FactorType factorType = static_cast<FactorType>(currFactor);
			const Factor *thisFactor = GetFactor(thisSize - currNGram, factorType);
			// hashes the Factor POINTER value, not its contents — assumes
			// factors are interned so equal strings share an address
			// (TODO confirm against the Factor collection)
			_hash = quick_hash((const char*)&thisFactor, sizeof(const Factor*), _hash);
		}
	}
	// fold in source coverage: identical target context with different
	// coverage must not recombine
	vector<size_t> wordCoverage = m_sourceCompleted.GetCompressedReprentation();
	_hash = quick_hash((const char*)&wordCoverage[0], sizeof(size_t)*wordCoverage.size(), _hash);
	_hash_computed = true;
}
int Hypothesis::NGramCompare(const Hypothesis &compare) const
{ // -1 = this < compare

View File

@ -77,6 +77,9 @@ protected:
void CalcDeletionScore(const Sentence& sourceSentence, const WordsRange& sourceWordsRange, const WordDeletionTable& wordDeletionTable);
void GenerateNGramCompareHash() const;
mutable size_t _hash;
mutable bool _hash_computed;
public:
@ -210,6 +213,13 @@ public:
return m_sourceCompleted;
}
/** Lazily computed recombination hash; the first call fills the cache. */
inline size_t hash() const
{
	if (!_hash_computed)
		GenerateNGramCompareHash();
	return _hash;
}
/***
* requires that GenerateNGramCompareKey was previously run
*/

View File

@ -25,10 +25,11 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "HypothesisCollection.h"
#include "TypeDef.h"
#include "Util.h"
#include "StaticData.h"
using namespace std;
size_t CompareHypothesisCollection::s_ngramMaxOrder[NUM_FACTORS] = {0,0,0,0};
size_t HypothesisRecombinationOrderer::s_ngramMaxOrder[NUM_FACTORS] = {0,0,0,0};
// need to change if we add more factors, or use a macro
void HypothesisCollection::RemoveAll()
@ -62,8 +63,10 @@ bool HypothesisCollection::AddPrune(Hypothesis *hypo)
{ // if returns false, hypothesis not used
// caller must take care to delete unused hypo to avoid leak
if (hypo->GetScore(ScoreType::Total) < m_worstScore)
if (hypo->GetScore(ScoreType::Total) < m_worstScore) {
StaticData::Instance()->GetSentenceStats().numPruned++;
return false;
}
// over threshold
// recombine if ngram-equivalent to another hypo
@ -74,6 +77,7 @@ bool HypothesisCollection::AddPrune(Hypothesis *hypo)
return true;
}
StaticData::Instance()->GetSentenceStats().numRecombinations++;
// found existing hypo with same target ending.
// keep the best 1
@ -148,6 +152,7 @@ void HypothesisCollection::PruneToSize(size_t newSize)
{
iterator iterRemove = iter++;
Remove(iterRemove);
StaticData::Instance()->GetSentenceStats().numPruned++;
}
else
{

View File

@ -25,7 +25,12 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include <set>
#include "Hypothesis.h"
class CompareHypothesisCollection
#if 0
//#ifdef __GNUG__
#include <ext/hash_set>
#endif
class HypothesisRecombinationOrderer
{
protected:
// static
@ -64,11 +69,34 @@ public:
}
};
// Equality predicate for hash-based recombination: two hypotheses may be
// recombined only when both their n-gram (LM) contexts and their source
// coverage bitmaps compare equal.
struct HypothesisRecombinationComparer
{
	bool operator()(const Hypothesis* hypoA, const Hypothesis* hypoB) const
	{
		return hypoA->NGramCompare(*hypoB) == 0
			&& hypoA->GetWordsBitmap().Compare(hypoB->GetWordsBitmap()) == 0;
	}
};
// Hash functor for the (currently disabled) hash_set-based hypothesis
// stack; simply forwards to the hypothesis's cached recombination hash.
struct HypothesisRecombinationHasher
{
	size_t operator()(const Hypothesis* hypo) const
	{
		return hypo->hash();
	}
};
class HypothesisCollection
{
private:
#if 0
//#ifdef __GNUG__
typedef __gnu_cxx::hash_set< Hypothesis*, HypothesisRecombinationHasher, HypothesisRecombinationComparer > _HCType;
#else
typedef std::set< Hypothesis*, HypothesisRecombinationOrderer > _HCType;
#endif
public:
typedef std::set< Hypothesis*, CompareHypothesisCollection >::iterator iterator;
typedef std::set< Hypothesis*, CompareHypothesisCollection >::const_iterator const_iterator;
typedef _HCType::iterator iterator;
typedef _HCType::const_iterator const_iterator;
friend std::ostream& operator<<(std::ostream&, const HypothesisCollection&);
protected:
@ -76,7 +104,7 @@ protected:
float m_worstScore;
float m_beamThreshold;
size_t m_maxHypoStackSize;
std::set< Hypothesis*, CompareHypothesisCollection > m_hypos;
_HCType m_hypos;
void Add(Hypothesis *hypothesis);

View File

@ -57,6 +57,7 @@ Manager::~Manager() {}
*/
void Manager::ProcessSentence()
{
m_staticData.GetSentenceStats().ZeroAll();
list < DecodeStep > &decodeStepList = m_staticData.GetDecodeStepList();
// create list of all possible translations
// this is only valid if:
@ -104,6 +105,7 @@ void Manager::ProcessSentence()
// some more logging
if (m_staticData.GetVerboseLevel() > 0) {
cerr << m_staticData.GetSentenceStats();
cerr << "Hypotheses created since startup: "<< Hypothesis::s_HypothesesCreated<<endl;
//OutputHypoStack();
//OutputHypoStackSize();

21
moses/src/SentenceStats.h Normal file
View File

@ -0,0 +1,21 @@
#ifndef _SENTENCE_STATS_H_
#define _SENTENCE_STATS_H_
#include <iostream>
/**
 * Per-sentence decoder counters: how many hypotheses were recombined into
 * an existing equivalent hypothesis and how many were discarded by
 * beam/stack pruning while translating the current sentence.
 *
 * Fixes: removed the stray ';' after the constructor body (an empty
 * declaration), and the constructor now delegates to ZeroAll() so the
 * zero-initialization logic exists in exactly one place.
 */
struct SentenceStats
{
	SentenceStats() { ZeroAll(); }

	unsigned int numRecombinations; // hypotheses merged with an equivalent one
	unsigned int numPruned;         // hypotheses dropped by pruning

	// Reset all counters; called before each sentence is decoded.
	void ZeroAll() { numRecombinations = 0; numPruned = 0; }
};

// Human-readable dump used by verbose-mode logging.
inline std::ostream& operator<<(std::ostream& os, const SentenceStats& ss)
{
	return os << "number of hypotheses recombined=" << ss.numRecombinations << std::endl
	          << " \" \" pruned=" << ss.numPruned << std::endl;
}
#endif

View File

@ -34,6 +34,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "LanguageModel.h"
#include "LanguageModelFactory.h"
#include "LexicalReordering.h"
#include "SentenceStats.h"
#ifndef WIN32
#include "PhraseDictionaryTreeAdaptor.h"
@ -240,7 +241,7 @@ bool StaticData::LoadParameters(int argc, char* argv[])
timer.check(("Finished loading LanguageModel " + languageModelFile).c_str());
m_languageModel[type].push_back(lm);
CompareHypothesisCollection::SetMaxNGramOrder(factorType, nGramMaxOrder);
HypothesisRecombinationOrderer::SetMaxNGramOrder(factorType, nGramMaxOrder);
}
}
// flag indicating that language models were loaded,

View File

@ -32,6 +32,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "InputOutput.h"
#include "DecodeStep.h"
#include "LMList.h"
#include "SentenceStats.h"
//#include "UnknownWordHandler.h"
class InputType;
@ -92,6 +93,8 @@ protected:
bool m_reportSourceSpan;
bool m_reportAllFactors;
mutable SentenceStats m_sentenceStats;
public:
StaticData();
~StaticData();
@ -277,5 +280,9 @@ public:
int GetInputType() const {return m_inputType;}
void InitializeBeforeSentenceProcessing(InputType const&);
void CleanUpAfterSentenceProcessing();
SentenceStats& GetSentenceStats() const
{
return m_sentenceStats;
}
};

View File

@ -57,3 +57,19 @@ int WordsBitmap::GetFutureCosts(int lastPos) const
return sum;
}
/**
 * Pack the source-coverage bitmap into machine words so it can be fed to
 * a byte-oriented hash cheaply (consumed by GenerateNGramCompareHash).
 *
 * Fixes over the previous version:
 *  - the bit counter was never reset after a flush, so only the first
 *    word was ever written and all later coverage bits were lost;
 *  - a trailing partial word (when m_size is not a multiple of the word
 *    width) was silently discarded;
 *  - the result was sized with `m_size >> (sizeof(int)+3)` (divide by
 *    128) and flushed every sizeof(int)*8 bits while storing size_t
 *    elements — both now consistently use the size_t word width.
 */
std::vector<size_t> WordsBitmap::GetCompressedReprentation() const
{
	const size_t bitsPerWord = sizeof(size_t) * 8;
	std::vector<size_t> res(1 + m_size / bitsPerWord, 0);
	size_t acc = 0;      // bits accumulated so far
	size_t used = 0;     // number of valid bits in acc
	size_t wordIdx = 0;  // next slot of res to fill
	for (size_t i = 0; i < m_size; ++i) {
		acc = (acc << 1) | (m_bitmap[i] ? 1u : 0u);
		if (++used == bitsPerWord) {
			res[wordIdx++] = acc;
			acc = 0;
			used = 0;
		}
	}
	if (used > 0)
		res[wordIdx] = acc; // flush trailing partial word
	return res;
}

View File

@ -139,6 +139,8 @@ public:
return m_size;
}
std::vector<size_t> GetCompressedReprentation() const;
inline int Compare (const WordsBitmap &compare) const
{
// -1 = less than