Merge branch 'hieu_opt_input' of ../hh

This commit is contained in:
Hieu Hoang 2013-08-16 17:05:04 +01:00
commit 51ac2d6567
9 changed files with 141 additions and 32 deletions

View File

@ -131,7 +131,6 @@ Parameter::Parameter()
AddParam("rule-limit", "a little like table limit. But for chart decoding rules. Default is DEFAULT_MAX_TRANS_OPT_SIZE");
AddParam("source-label-overlap", "What happens if a span already has a label. 0=add more. 1=replace. 2=discard. Default is 0");
AddParam("output-hypo-score", "Output the hypo score to stdout with the output string. For search error analysis. Default is false");
AddParam("unknown-lhs", "file containing target lhs of unknown words. 1 per line: LHS prob");
AddParam("show-weights", "print feature weights and exit");
AddParam("start-translation-id", "Id of 1st input. Default = 0");
AddParam("output-unknowns", "Output the unknown (OOV) words to the given file, one line per sentence");

View File

@ -107,7 +107,7 @@ struct CompareTargetPhrase {
};
const TargetPhraseCollection*
PhraseDictionaryCompact::GetTargetPhraseCollection(const Phrase &sourcePhrase) const
PhraseDictionaryCompact::GetTargetPhraseCollectionNonCache(const Phrase &sourcePhrase) const
{
// There is no such source phrase if source phrase is longer than longest
@ -171,6 +171,8 @@ void PhraseDictionaryCompact::CacheForCleanup(TargetPhraseCollection* tpc)
PhraseCache &ref = m_sentenceCache;
#endif
ref.push_back(tpc);
ReduceCache();
}
void PhraseDictionaryCompact::AddEquivPhrase(const Phrase &source,

View File

@ -74,7 +74,7 @@ public:
void Load();
const TargetPhraseCollection* GetTargetPhraseCollection(const Phrase &source) const;
const TargetPhraseCollection* GetTargetPhraseCollectionNonCache(const Phrase &source) const;
TargetPhraseVectorPtr GetTargetPhraseCollectionRaw(const Phrase &source) const;
void AddEquivPhrase(const Phrase &source, const TargetPhrase &targetPhrase);

View File

@ -35,17 +35,17 @@ namespace Moses
PhraseDictionary::PhraseDictionary(const std::string &description, const std::string &line)
:DecodeFeature(description, line)
,m_tableLimit(20) // default
,m_useCache(666)
,m_maxCacheSize(DEFAULT_MAX_TRANS_OPT_CACHE_SIZE)
{
}
const TargetPhraseCollection *PhraseDictionary::GetTargetPhraseCollection(const Phrase& src) const
{
const TargetPhraseCollection *ret;
if (m_useCache) {
if (m_maxCacheSize) {
size_t hash = hash_value(src);
std::map<size_t, const TargetPhraseCollection*>::const_iterator iter;
std::map<size_t, std::pair<const TargetPhraseCollection*, clock_t> >::iterator iter;
{
// scope of read lock
@ -56,19 +56,26 @@ const TargetPhraseCollection *PhraseDictionary::GetTargetPhraseCollection(const
}
if (iter == m_cache.end()) {
// not in cache, need to look up from phrase table
ret = GetTargetPhraseCollectionNonCache(src);
if (ret) {
ret = new TargetPhraseCollection(*ret);
}
std::pair<const TargetPhraseCollection*, clock_t> value(ret, clock());
#ifdef WITH_THREADS
boost::unique_lock<boost::shared_mutex> lock(m_accessLock);
#endif
m_cache[hash] = ret;
m_cache[hash] = value;
} else {
ret = iter->second;
// in cache. just use it
std::pair<const TargetPhraseCollection*, clock_t> &value = iter->second;
value.second = clock();
ret = value.first;
}
} else {
// don't use cache. look up from phrase table
ret = GetTargetPhraseCollectionNonCache(src);
}
@ -91,8 +98,8 @@ GetTargetPhraseCollectionLegacy(InputType const& src,WordsRange const& range) co
void PhraseDictionary::SetParameter(const std::string& key, const std::string& value)
{
if (key == "use-cache") {
m_useCache = Scan<int>(value);
if (key == "cache-size") {
m_maxCacheSize = Scan<size_t>(value);
} else if (key == "path") {
m_filePath = value;
} else if (key == "table-limit") {
@ -126,5 +133,37 @@ void PhraseDictionary::GetTargetPhraseCollectionBatch(const InputPathList &phras
}
}
void PhraseDictionary::ReduceCache() const
{
if (m_cache.size() <= m_maxCacheSize) return; // not full
clock_t t = clock();
// find cutoff for last used time
priority_queue< clock_t > lastUsedTimes;
std::map<size_t, std::pair<const TargetPhraseCollection*,clock_t> >::iterator iter;
iter = m_cache.begin();
while( iter != m_cache.end() ) {
lastUsedTimes.push( iter->second.second );
iter++;
}
for( size_t i=0; i < lastUsedTimes.size()-m_maxCacheSize/2; i++ )
lastUsedTimes.pop();
clock_t cutoffLastUsedTime = lastUsedTimes.top();
// remove all old entries
#ifdef WITH_THREADS
boost::unique_lock<boost::shared_mutex> lock(m_accessLock);
#endif
iter = m_cache.begin();
while( iter != m_cache.end() ) {
if (iter->second.second < cutoffLastUsedTime) {
std::map<size_t, std::pair<const TargetPhraseCollection*,clock_t> >::iterator iterRemove = iter++;
delete iterRemove->second.first;
m_cache.erase(iterRemove);
} else iter++;
}
VERBOSE(2,"Reduced persistent translation option cache in " << ((clock()-t)/(float)CLOCKS_PER_SEC) << " seconds." << std::endl);
}
}

View File

@ -115,15 +115,15 @@ protected:
void SetFeaturesToApply();
// cache
int m_useCache; // 666=not yet set, otherwise act like a bool
mutable std::map<size_t, const TargetPhraseCollection*> m_cache;
size_t m_maxCacheSize; // 0 = no caching
mutable std::map<size_t, std::pair<const TargetPhraseCollection*, clock_t> > m_cache;
#ifdef WITH_THREADS
//reader-writer lock
mutable boost::shared_mutex m_accessLock;
#endif
virtual const TargetPhraseCollection *GetTargetPhraseCollectionNonCache(const Phrase& src) const;
void ReduceCache() const;
};
}

View File

@ -44,6 +44,10 @@ PhraseDictionaryMemory::PhraseDictionaryMemory(const std::string &line)
: RuleTableTrie("PhraseDictionaryMemory", line)
{
ReadParameters();
// caching for memory pt is pointless
m_maxCacheSize = 0;
}
TargetPhraseCollection &PhraseDictionaryMemory::GetOrCreateTargetPhraseCollection(

View File

@ -34,9 +34,10 @@ PhraseDictionaryTreeAdaptor(const std::string &line)
PhraseDictionaryTreeAdaptor::~PhraseDictionaryTreeAdaptor()
{
std::map<size_t, const TargetPhraseCollection*>::const_iterator iter;
std::map<size_t, std::pair<const TargetPhraseCollection*, clock_t> >::const_iterator iter;
for (iter = m_cache.begin(); iter != m_cache.end(); ++iter) {
const TargetPhraseCollection *coll = iter->second;
const std::pair<const TargetPhraseCollection*, clock_t> &value = iter->second;
const TargetPhraseCollection *coll = value.first;
delete coll;
}
}
@ -50,6 +51,8 @@ void PhraseDictionaryTreeAdaptor::InitializeForInput(InputType const& source)
{
const StaticData &staticData = StaticData::Instance();
ReduceCache();
PDTAimp *obj = new PDTAimp(this);
vector<float> weight = staticData.GetWeights(this);

View File

@ -79,6 +79,8 @@ void PhraseDictionaryOnDisk::InitializeForInput(InputType const& source)
{
const StaticData &staticData = StaticData::Instance();
ReduceCache();
OnDiskPt::OnDiskWrapper *obj = new OnDiskPt::OnDiskWrapper();
if (!obj->BeginLoad(m_filePath))
return;
@ -91,13 +93,18 @@ void PhraseDictionaryOnDisk::InitializeForInput(InputType const& source)
m_implementation.reset(obj);
}
void PhraseDictionaryOnDisk::GetTargetPhraseCollectionBatch(const InputPathList &phraseDictionaryQueue) const
void PhraseDictionaryOnDisk::GetTargetPhraseCollectionBatch(const InputPathList &inputPathQueue) const
{
OnDiskPt::OnDiskWrapper &wrapper = const_cast<OnDiskPt::OnDiskWrapper&>(GetImplementation());
InputPathList::const_iterator iter;
for (iter = phraseDictionaryQueue.begin(); iter != phraseDictionaryQueue.end(); ++iter) {
for (iter = inputPathQueue.begin(); iter != inputPathQueue.end(); ++iter) {
InputPath &inputPath = **iter;
GetTargetPhraseCollectionBatch(inputPath);
}
}
void PhraseDictionaryOnDisk::GetTargetPhraseCollectionBatch(InputPath &inputPath) const
{
OnDiskPt::OnDiskWrapper &wrapper = const_cast<OnDiskPt::OnDiskWrapper&>(GetImplementation());
const Phrase &phrase = inputPath.GetPhrase();
const InputPath *prevInputPath = inputPath.GetPrevNode();
@ -122,17 +129,8 @@ void PhraseDictionaryOnDisk::GetTargetPhraseCollectionBatch(const InputPathList
} else {
const OnDiskPt::PhraseNode *ptNode = prevPtNode->GetChild(*lastWordOnDisk, wrapper);
if (ptNode) {
vector<float> weightT = StaticData::Instance().GetWeights(this);
OnDiskPt::Vocab &vocab = wrapper.GetVocab();
const OnDiskPt::TargetPhraseCollection *targetPhrasesOnDisk = ptNode->GetTargetPhraseCollection(m_tableLimit, wrapper);
TargetPhraseCollection *targetPhrases
= targetPhrasesOnDisk->ConvertToMoses(m_input, m_output, *this, weightT, vocab, false);
inputPath.SetTargetPhrases(*this, targetPhrases, ptNode);
delete targetPhrasesOnDisk;
const TargetPhraseCollection *targetPhrases = GetTargetPhraseCollection(ptNode);
inputPath.SetTargetPhrases(*this, targetPhrases, ptNode);
} else {
inputPath.SetTargetPhrases(*this, NULL, NULL);
}
@ -140,8 +138,67 @@ void PhraseDictionaryOnDisk::GetTargetPhraseCollectionBatch(const InputPathList
delete lastWordOnDisk;
}
}
}
}
/**
 * Return the target phrases stored at an on-disk phrase-table node, going
 * through the persistent cache when caching is enabled.
 *
 * @param ptNode  on-disk node to look up; its file position is used as the
 *                cache key.
 * @return the cached collection when m_maxCacheSize is non-zero (a copy of
 *         the looked-up collection is stored on first use and its last-used
 *         timestamp is refreshed on every hit); otherwise the result of
 *         GetTargetPhraseCollectionNonCache() directly. May be NULL.
 *
 * All accesses to m_cache, including the timestamp refresh on a hit, are made
 * while holding m_accessLock exclusively, because the refresh writes to the
 * entry and ReduceCache() may erase entries concurrently. The slow
 * phrase-table lookup itself is done without holding the lock.
 */
const TargetPhraseCollection *PhraseDictionaryOnDisk::GetTargetPhraseCollection(const OnDiskPt::PhraseNode *ptNode) const
{
  if (!m_maxCacheSize) {
    // caching disabled: look up from phrase table
    return GetTargetPhraseCollectionNonCache(ptNode);
  }

  typedef std::pair<const TargetPhraseCollection*, clock_t> CacheEntry;
  typedef std::map<size_t, CacheEntry> CacheMap;

  const size_t hash = (size_t) ptNode->GetFilePos();

  {
    // scope of lock for the cache probe
#ifdef WITH_THREADS
    boost::unique_lock<boost::shared_mutex> lock(m_accessLock);
#endif
    CacheMap::iterator hit = m_cache.find(hash);
    if (hit != m_cache.end()) {
      // in cache. just use it
      hit->second.second = clock();
      return hit->second.first;
    }
  }

  // not in cache, need to look up from phrase table
  const TargetPhraseCollection *ret = GetTargetPhraseCollectionNonCache(ptNode);
  if (ret) {
    // NOTE(review): the collection returned above is copied and the original
    // pointer is then dropped. If the caller owns that original this leaks
    // it - confirm ownership of ConvertToMoses()'s result before changing.
    ret = new TargetPhraseCollection(*ret);
  }

#ifdef WITH_THREADS
  boost::unique_lock<boost::shared_mutex> lock(m_accessLock);
#endif
  std::pair<CacheMap::iterator, bool> inserted =
    m_cache.insert(CacheMap::value_type(hash, CacheEntry(ret, clock())));
  if (!inserted.second) {
    // Another thread cached this node while we were loading it. Keep the
    // entry already in the map (other callers may hold its pointer) and
    // discard our duplicate instead of overwriting and orphaning theirs.
    delete ret;
    inserted.first->second.second = clock();
    ret = inserted.first->second.first;
  }
  return ret;
}
/**
 * Look up the target phrases of an on-disk node without consulting the cache.
 *
 * Reads the node's on-disk collection (limited by m_tableLimit), converts it
 * to a Moses TargetPhraseCollection using this dictionary's feature weights
 * and the on-disk vocabulary, then frees the on-disk representation.
 *
 * @param ptNode  on-disk phrase-table node to read from.
 * @return the converted collection, exactly as produced by ConvertToMoses().
 */
const TargetPhraseCollection *PhraseDictionaryOnDisk::GetTargetPhraseCollectionNonCache(const OnDiskPt::PhraseNode *ptNode) const
{
  // GetImplementation() const hands back a const wrapper, but reading from
  // the on-disk table needs a mutable one.
  OnDiskPt::OnDiskWrapper &diskWrapper =
    const_cast<OnDiskPt::OnDiskWrapper&>(GetImplementation());

  std::vector<float> featureWeights = StaticData::Instance().GetWeights(this);
  OnDiskPt::Vocab &diskVocab = diskWrapper.GetVocab();

  const OnDiskPt::TargetPhraseCollection *onDiskColl =
    ptNode->GetTargetPhraseCollection(m_tableLimit, diskWrapper);

  TargetPhraseCollection *converted =
    onDiskColl->ConvertToMoses(m_input, m_output, *this, featureWeights, diskVocab, false);

  // The on-disk form is only needed for the conversion.
  delete onDiskColl;

  return converted;
}
} // namespace

View File

@ -60,6 +60,8 @@ protected:
OnDiskPt::OnDiskWrapper &GetImplementation();
const OnDiskPt::OnDiskWrapper &GetImplementation() const;
void GetTargetPhraseCollectionBatch(InputPath &inputPath) const;
public:
PhraseDictionaryOnDisk(const std::string &line);
~PhraseDictionaryOnDisk();
@ -75,7 +77,10 @@ public:
const ChartCellCollectionBase &);
virtual void InitializeForInput(InputType const& source);
void GetTargetPhraseCollectionBatch(const InputPathList &phraseDictionaryQueue) const;
void GetTargetPhraseCollectionBatch(const InputPathList &inputPathQueue) const;
const TargetPhraseCollection *GetTargetPhraseCollection(const OnDiskPt::PhraseNode *ptNode) const;
const TargetPhraseCollection *GetTargetPhraseCollectionNonCache(const OnDiskPt::PhraseNode *ptNode) const;
};