mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2025-01-08 12:36:35 +03:00
Enable statistics gathering for the number of hypotheses pruned and recombined. Add code for hash-based recombination; it works, but it behaves slightly differently from the tree-based implementation, so it is not enabled.
git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@363 1f5c12ca-751b-0410-a591-d2e778427230
This commit is contained in:
parent
04a52a546f
commit
fdff6911d6
@ -32,7 +32,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
#include "StaticData.h"
|
||||
#include "Input.h"
|
||||
#include "LMList.h"
|
||||
#include "md5.h"
|
||||
#include "hash.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
@ -53,6 +53,7 @@ Hypothesis::Hypothesis(InputType const& source)
|
||||
, m_id(s_HypothesesCreated++)
|
||||
{ // used for initial seeding of trans process
|
||||
// initialize scores
|
||||
_hash_computed = false;
|
||||
ResetScore();
|
||||
}
|
||||
|
||||
@ -68,6 +69,7 @@ Hypothesis::Hypothesis(const Hypothesis &copy)
|
||||
#endif
|
||||
, m_id (s_HypothesesCreated++)
|
||||
{
|
||||
_hash_computed = false;
|
||||
m_targetPhrase.AddWords( copy.m_targetPhrase );
|
||||
|
||||
// initialize scores
|
||||
@ -95,6 +97,7 @@ Hypothesis::Hypothesis(const Hypothesis &prevHypo, const TranslationOption &tran
|
||||
|
||||
assert(!m_sourceCompleted.Overlap(m_currSourceWordsRange));
|
||||
|
||||
_hash_computed = false;
|
||||
m_sourceCompleted.SetValue(m_currSourceWordsRange.GetStartPos(), m_currSourceWordsRange.GetEndPos(), true);
|
||||
|
||||
// add new words from poss trans
|
||||
@ -209,6 +212,7 @@ bool Hypothesis::IsCompatible(const Phrase &phrase) const
|
||||
return true;
|
||||
}
|
||||
|
||||
#if 0
|
||||
void Hypothesis::GenerateNGramCompareKey(size_t contextSize)
|
||||
{
|
||||
struct MD5Context md5c;
|
||||
@ -238,6 +242,32 @@ void Hypothesis::GenerateNGramCompareKey(size_t contextSize)
|
||||
}
|
||||
MD5Final(m_compSignature, &md5c);
|
||||
}
|
||||
#endif
|
||||
|
||||
void Hypothesis::GenerateNGramCompareHash() const
|
||||
{
|
||||
_hash = 0xcafe5137; // random
|
||||
const size_t thisSize = GetSize();
|
||||
|
||||
for (size_t currFactor = 0 ; currFactor < NUM_FACTORS ; currFactor++)
|
||||
{
|
||||
size_t ngramMax = StaticData::Instance()->GetMaxNGramOrderForFactorId(currFactor);
|
||||
if (ngramMax < 2) continue; // unigrams have no context
|
||||
|
||||
const size_t minSize = std::min(ngramMax-1, thisSize);
|
||||
_hash = quick_hash((const char*)&minSize, sizeof(size_t), _hash);
|
||||
|
||||
for (size_t currNGram = 1 ; currNGram <= minSize ; currNGram++)
|
||||
{
|
||||
FactorType factorType = static_cast<FactorType>(currFactor);
|
||||
const Factor *thisFactor = GetFactor(thisSize - currNGram, factorType);
|
||||
_hash = quick_hash((const char*)&thisFactor, sizeof(const Factor*), _hash);
|
||||
}
|
||||
}
|
||||
vector<size_t> wordCoverage = m_sourceCompleted.GetCompressedReprentation();
|
||||
_hash = quick_hash((const char*)&wordCoverage[0], sizeof(size_t)*wordCoverage.size(), _hash);
|
||||
_hash_computed = true;
|
||||
}
|
||||
|
||||
int Hypothesis::NGramCompare(const Hypothesis &compare) const
|
||||
{ // -1 = this < compare
|
||||
|
@ -77,6 +77,9 @@ protected:
|
||||
|
||||
void CalcDeletionScore(const Sentence& sourceSentence, const WordsRange& sourceWordsRange, const WordDeletionTable& wordDeletionTable);
|
||||
|
||||
void GenerateNGramCompareHash() const;
|
||||
mutable size_t _hash;
|
||||
mutable bool _hash_computed;
|
||||
|
||||
public:
|
||||
|
||||
@ -210,6 +213,13 @@ public:
|
||||
return m_sourceCompleted;
|
||||
}
|
||||
|
||||
inline size_t hash() const
|
||||
{
|
||||
if (_hash_computed) return _hash;
|
||||
GenerateNGramCompareHash();
|
||||
return _hash;
|
||||
}
|
||||
|
||||
/***
|
||||
* requires that GenerateNGramCompareKey was previously run
|
||||
*/
|
||||
|
@ -25,10 +25,11 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
#include "HypothesisCollection.h"
|
||||
#include "TypeDef.h"
|
||||
#include "Util.h"
|
||||
#include "StaticData.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
size_t CompareHypothesisCollection::s_ngramMaxOrder[NUM_FACTORS] = {0,0,0,0};
|
||||
size_t HypothesisRecombinationOrderer::s_ngramMaxOrder[NUM_FACTORS] = {0,0,0,0};
|
||||
// need to change if we add more factors, or use a macro
|
||||
|
||||
void HypothesisCollection::RemoveAll()
|
||||
@ -62,8 +63,10 @@ bool HypothesisCollection::AddPrune(Hypothesis *hypo)
|
||||
{ // if returns false, hypothesis not used
|
||||
// caller must take care to delete unused hypo to avoid leak
|
||||
|
||||
if (hypo->GetScore(ScoreType::Total) < m_worstScore)
|
||||
if (hypo->GetScore(ScoreType::Total) < m_worstScore) {
|
||||
StaticData::Instance()->GetSentenceStats().numPruned++;
|
||||
return false;
|
||||
}
|
||||
|
||||
// over threshold
|
||||
// recombine if ngram-equivalent to another hypo
|
||||
@ -74,6 +77,7 @@ bool HypothesisCollection::AddPrune(Hypothesis *hypo)
|
||||
return true;
|
||||
}
|
||||
|
||||
StaticData::Instance()->GetSentenceStats().numRecombinations++;
|
||||
|
||||
// found existing hypo with same target ending.
|
||||
// keep the best 1
|
||||
@ -148,6 +152,7 @@ void HypothesisCollection::PruneToSize(size_t newSize)
|
||||
{
|
||||
iterator iterRemove = iter++;
|
||||
Remove(iterRemove);
|
||||
StaticData::Instance()->GetSentenceStats().numPruned++;
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -25,7 +25,12 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
#include <set>
|
||||
#include "Hypothesis.h"
|
||||
|
||||
class CompareHypothesisCollection
|
||||
#if 0
|
||||
//#ifdef __GNUG__
|
||||
#include <ext/hash_set>
|
||||
#endif
|
||||
|
||||
class HypothesisRecombinationOrderer
|
||||
{
|
||||
protected:
|
||||
// static
|
||||
@ -64,11 +69,34 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
struct HypothesisRecombinationComparer
|
||||
{
|
||||
bool operator()(const Hypothesis* hypoA, const Hypothesis* hypoB) const
|
||||
{
|
||||
if (hypoA->NGramCompare(*hypoB) != 0) return false;
|
||||
return (hypoA->GetWordsBitmap().Compare(hypoB->GetWordsBitmap()) == 0);
|
||||
}
|
||||
};
|
||||
|
||||
struct HypothesisRecombinationHasher
|
||||
{
|
||||
size_t operator()(const Hypothesis* hypo) const {
|
||||
return hypo->hash();
|
||||
}
|
||||
};
|
||||
|
||||
class HypothesisCollection
|
||||
{
|
||||
private:
|
||||
#if 0
|
||||
//#ifdef __GNUG__
|
||||
typedef __gnu_cxx::hash_set< Hypothesis*, HypothesisRecombinationHasher, HypothesisRecombinationComparer > _HCType;
|
||||
#else
|
||||
typedef std::set< Hypothesis*, HypothesisRecombinationOrderer > _HCType;
|
||||
#endif
|
||||
public:
|
||||
typedef std::set< Hypothesis*, CompareHypothesisCollection >::iterator iterator;
|
||||
typedef std::set< Hypothesis*, CompareHypothesisCollection >::const_iterator const_iterator;
|
||||
typedef _HCType::iterator iterator;
|
||||
typedef _HCType::const_iterator const_iterator;
|
||||
friend std::ostream& operator<<(std::ostream&, const HypothesisCollection&);
|
||||
|
||||
protected:
|
||||
@ -76,7 +104,7 @@ protected:
|
||||
float m_worstScore;
|
||||
float m_beamThreshold;
|
||||
size_t m_maxHypoStackSize;
|
||||
std::set< Hypothesis*, CompareHypothesisCollection > m_hypos;
|
||||
_HCType m_hypos;
|
||||
|
||||
|
||||
void Add(Hypothesis *hypothesis);
|
||||
|
@ -57,6 +57,7 @@ Manager::~Manager() {}
|
||||
*/
|
||||
void Manager::ProcessSentence()
|
||||
{
|
||||
m_staticData.GetSentenceStats().ZeroAll();
|
||||
list < DecodeStep > &decodeStepList = m_staticData.GetDecodeStepList();
|
||||
// create list of all possible translations
|
||||
// this is only valid if:
|
||||
@ -104,6 +105,7 @@ void Manager::ProcessSentence()
|
||||
|
||||
// some more logging
|
||||
if (m_staticData.GetVerboseLevel() > 0) {
|
||||
cerr << m_staticData.GetSentenceStats();
|
||||
cerr << "Hypotheses created since startup: "<< Hypothesis::s_HypothesesCreated<<endl;
|
||||
//OutputHypoStack();
|
||||
//OutputHypoStackSize();
|
||||
|
21
moses/src/SentenceStats.h
Normal file
21
moses/src/SentenceStats.h
Normal file
@ -0,0 +1,21 @@
|
||||
#ifndef _SENTENCE_STATS_H_
|
||||
#define _SENTENCE_STATS_H_
|
||||
|
||||
#include <iostream>
|
||||
|
||||
/***
 * Counters for the hypotheses that were recombined and pruned.
 * Both counters start at zero and can be reset with ZeroAll().
 */
struct SentenceStats
{
	unsigned int numRecombinations; // number of hypotheses recombined
	unsigned int numPruned;         // number of hypotheses pruned

	SentenceStats()
	{
		ZeroAll();
	}

	/** Reset every counter to zero. */
	void ZeroAll()
	{
		numRecombinations = 0;
		numPruned = 0;
	}
};
|
||||
|
||||
inline std::ostream& operator<<(std::ostream& os, const SentenceStats& ss)
|
||||
{
|
||||
return os << "number of hypotheses recombined=" << ss.numRecombinations << std::endl
|
||||
<< " \" \" pruned=" << ss.numPruned << std::endl;
|
||||
}
|
||||
|
||||
#endif
|
@ -34,6 +34,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
#include "LanguageModel.h"
|
||||
#include "LanguageModelFactory.h"
|
||||
#include "LexicalReordering.h"
|
||||
#include "SentenceStats.h"
|
||||
|
||||
#ifndef WIN32
|
||||
#include "PhraseDictionaryTreeAdaptor.h"
|
||||
@ -240,7 +241,7 @@ bool StaticData::LoadParameters(int argc, char* argv[])
|
||||
timer.check(("Finished loading LanguageModel " + languageModelFile).c_str());
|
||||
m_languageModel[type].push_back(lm);
|
||||
|
||||
CompareHypothesisCollection::SetMaxNGramOrder(factorType, nGramMaxOrder);
|
||||
HypothesisRecombinationOrderer::SetMaxNGramOrder(factorType, nGramMaxOrder);
|
||||
}
|
||||
}
|
||||
// flag indicating that language models were loaded,
|
||||
|
@ -32,6 +32,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
#include "InputOutput.h"
|
||||
#include "DecodeStep.h"
|
||||
#include "LMList.h"
|
||||
#include "SentenceStats.h"
|
||||
//#include "UnknownWordHandler.h"
|
||||
|
||||
class InputType;
|
||||
@ -92,6 +93,8 @@ protected:
|
||||
bool m_reportSourceSpan;
|
||||
bool m_reportAllFactors;
|
||||
|
||||
mutable SentenceStats m_sentenceStats;
|
||||
|
||||
public:
|
||||
StaticData();
|
||||
~StaticData();
|
||||
@ -277,5 +280,9 @@ public:
|
||||
int GetInputType() const {return m_inputType;}
|
||||
void InitializeBeforeSentenceProcessing(InputType const&);
|
||||
void CleanUpAfterSentenceProcessing();
|
||||
/***
 * Access the sentence statistics counters.
 * A non-const reference is handed out even from a const StaticData because
 * m_sentenceStats is declared mutable, so callers may update the counters.
 */
SentenceStats& GetSentenceStats() const { return m_sentenceStats; }
|
||||
};
|
||||
|
||||
|
@ -57,3 +57,19 @@ int WordsBitmap::GetFutureCosts(int lastPos) const
|
||||
return sum;
|
||||
}
|
||||
|
||||
|
||||
std::vector<size_t> WordsBitmap::GetCompressedReprentation() const
|
||||
{
|
||||
std::vector<size_t> res(1 + (m_size >> (sizeof(int) + 3)), 0);
|
||||
size_t c=0; size_t x=0; size_t ci=0;
|
||||
for(size_t i=0;i<m_size;++i) {
|
||||
x |= (size_t)m_bitmap[i];
|
||||
x <<= 1;
|
||||
c++;
|
||||
if (c == sizeof(int)*8) {
|
||||
res[ci++] = x; x = 0;
|
||||
}
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
|
@ -138,6 +138,8 @@ public:
|
||||
{
|
||||
return m_size;
|
||||
}
|
||||
|
||||
std::vector<size_t> GetCompressedReprentation() const;
|
||||
|
||||
inline int Compare (const WordsBitmap &compare) const
|
||||
{
|
||||
|
Loading…
Reference in New Issue
Block a user