port PhraseDictionaryMultiModel to new format

This commit is contained in:
Hieu Hoang 2013-05-10 12:30:01 +01:00
parent 1f5fc77c94
commit ed7ab8146f
4 changed files with 71 additions and 31 deletions

View File

@ -1441,6 +1441,16 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/PhraseDictionaryDynSuffixArray.h</locationURI>
</link>
<link>
<name>TranslationModel/PhraseDictionaryMultiModel.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/PhraseDictionaryMultiModel.cpp</locationURI>
</link>
<link>
<name>TranslationModel/PhraseDictionaryMultiModel.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/PhraseDictionaryMultiModel.h</locationURI>
</link>
<link>
<name>TranslationModel/PhraseDictionaryTree.cpp</name>
<type>1</type>

View File

@ -33,6 +33,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include <string>
#ifdef WITH_THREADS
#include <boost/thread.hpp>
#include <boost/thread/mutex.hpp>
#endif
@ -700,6 +701,40 @@ public:
void CollectFeatureFunctions();
bool CheckWeights() const;
// Stores `weights` as the temporary multi-model weight vector.
// With WITH_THREADS the vector is filed under the calling thread's id;
// otherwise it replaces the single shared temporary vector.
// The method is const; it writes to the mutable member m_multimodelweights_tmp.
void SetTemporaryMultiModelWeightsVector(std::vector<float> weights) const {
#ifdef WITH_THREADS
  // operator[] creates the entry for this thread id if it does not exist yet.
  // NOTE(review): no lock is taken in this method - confirm that callers
  // serialize access to the shared map.
  std::vector<float> &target = m_multimodelweights_tmp[boost::this_thread::get_id()];
#else
  std::vector<float> &target = m_multimodelweights_tmp;
#endif
  target = weights;
}
// multimodel
// Weight vector exposed through GetMultiModelWeightsVector().
std::vector<float> m_multimodelweights;
// Temporary weights written by SetTemporaryMultiModelWeightsVector() and read
// by GetTemporaryMultiModelWeightsVector(). Declared mutable because both of
// those accessors are const member functions.
#ifdef WITH_THREADS
// Threaded builds: one weight vector per thread, keyed by boost::thread::id.
mutable std::map<boost::thread::id, std::vector<float> > m_multimodelweights_tmp;
#else
// Non-threaded builds: a single weight vector.
mutable std::vector<float> m_multimodelweights_tmp;
#endif
// Returns the address of the m_multimodelweights member (never NULL).
// The vector remains owned by this object.
const std::vector<float>* GetMultiModelWeightsVector() const {
  const std::vector<float> &weights = m_multimodelweights;
  return &weights;
}
// Returns the temporary multi-model weights.
// With WITH_THREADS: a pointer to the vector stored under the calling thread's
// id, or NULL when that thread has no entry in m_multimodelweights_tmp.
// Without WITH_THREADS: the address of the single temporary vector (never NULL).
// The returned vector remains owned by this object.
const std::vector<float>* GetTemporaryMultiModelWeightsVector() const {
#ifdef WITH_THREADS
  // Look the thread id up once and reuse the iterator, instead of searching
  // the map a second time to fetch the value that was just found.
  // NOTE(review): no lock is taken in this method - confirm that callers
  // serialize access to the shared map.
  const std::map<boost::thread::id, std::vector<float> > &perThread = m_multimodelweights_tmp;
  const std::map<boost::thread::id, std::vector<float> >::const_iterator entry =
    perThread.find(boost::this_thread::get_id());
  if (entry == perThread.end()) {
    return NULL;
  }
  return &entry->second;
#else
  return &m_multimodelweights_tmp;
#endif
}
};
}

View File

@ -25,10 +25,9 @@ using namespace std;
namespace Moses
{
PhraseDictionaryMultiModel::PhraseDictionaryMultiModel(size_t numScoreComponent,
PhraseDictionaryFeature* feature): PhraseDictionary(numScoreComponent, feature)
PhraseDictionaryMultiModel::PhraseDictionaryMultiModel(const std::string &line)
:PhraseDictionary("PhraseDictionaryMultiModel", line)
{
m_feature_load = feature;
}
PhraseDictionaryMultiModel::~PhraseDictionaryMultiModel()
@ -45,6 +44,7 @@ bool PhraseDictionaryMultiModel::Load(const std::vector<FactorType> &input
, const LMList &languageModels
, float weightWP)
{
/*
m_languageModels = &languageModels;
m_weight = weight;
m_weightWP = weightWP;
@ -63,7 +63,7 @@ bool PhraseDictionaryMultiModel::Load(const std::vector<FactorType> &input
//how many actual scores there are in the phrase tables
//so far, equal to number of log-linear scores, but it is allowed to be smaller (for other combination types)
size_t numPtScores = m_numScoreComponent;
size_t numPtScores = m_numScoreComponents;
if (m_mode != "interpolate") {
ostringstream msg;
@ -88,18 +88,18 @@ bool PhraseDictionaryMultiModel::Load(const std::vector<FactorType> &input
if (!FileExists(file) && FileExists(file + ".gz")) file += ".gz";
PhraseDictionaryMemory* pdm = new PhraseDictionaryMemory(m_numScoreComponent, m_feature_load);
PhraseDictionaryMemory* pdm = new PhraseDictionaryMemory(m_numScoreComponents, m_feature_load);
pdm->SetNumScoreComponentMultiModel(numPtScores); //instead of complaining about inequal number of scores, silently fill up the score vector with zeroes
pdm->Load( input, output, file, m_weight, m_componentTableLimit, languageModels, m_weightWP);
m_pd.push_back(pdm);
} else if (implementation == Binary) {
PhraseDictionaryTreeAdaptor* pdta = new PhraseDictionaryTreeAdaptor(m_numScoreComponent, numInputScores , m_feature_load);
PhraseDictionaryTreeAdaptor* pdta = new PhraseDictionaryTreeAdaptor(m_numScoreComponents, numInputScores , m_feature_load);
pdta->Load(input, output, file, m_weight, m_componentTableLimit, languageModels, m_weightWP);
m_pd.push_back(pdta);
} else if (implementation == Compact) {
#ifndef WIN32
PhraseDictionaryCompact* pdc = new PhraseDictionaryCompact(m_numScoreComponent, implementation, m_feature_load);
pdc->SetNumScoreComponentMultiModel(m_numScoreComponent); //for compact models, we need to pass number of log-linear components to correctly resize the score vector
PhraseDictionaryCompact* pdc = new PhraseDictionaryCompact(m_numScoreComponents, implementation, m_feature_load);
pdc->SetNumScoreComponentMultiModel(m_numScoreComponents); //for compact models, we need to pass number of log-linear components to correctly resize the score vector
pdc->Load( input, output, file, m_weight, m_componentTableLimit, languageModels, m_weightWP);
m_pd.push_back(pdc);
#else
@ -110,6 +110,7 @@ bool PhraseDictionaryMultiModel::Load(const std::vector<FactorType> &input
UTIL_THROW(util::Exception,"PhraseDictionaryMultiModel does not support phrase table type " << implementation);
}
}
*/
return true;
}
@ -122,7 +123,7 @@ const TargetPhraseCollection *PhraseDictionaryMultiModel::GetTargetPhraseCollect
if (m_mode == "interpolate") {
//interpolation of phrase penalty is skipped, and fixed-value (2.718) is used instead. results will be screwed up if phrase penalty is not last feature
size_t numWeights = m_numScoreComponent-1;
size_t numWeights = m_numScoreComponents-1;
multimodelweights = getWeights(numWeights, true);
}
@ -161,7 +162,7 @@ void PhraseDictionaryMultiModel::CollectSufficientStatistics(const Phrase& src,
for (iterTargetPhrase = ret_raw->begin(); iterTargetPhrase != iterLast; ++iterTargetPhrase) {
TargetPhrase * targetPhrase = *iterTargetPhrase;
std::vector<float> raw_scores = targetPhrase->GetScoreBreakdown().GetScoresForProducer(m_feature);
std::vector<float> raw_scores = targetPhrase->GetScoreBreakdown().GetScoresForProducer(this);
std::string targetString = targetPhrase->GetStringRep(m_output);
if (allStats->find(targetString) == allStats->end()) {
@ -169,21 +170,21 @@ void PhraseDictionaryMultiModel::CollectSufficientStatistics(const Phrase& src,
multiModelStatistics * statistics = new multiModelStatistics;
statistics->targetPhrase = new TargetPhrase(*targetPhrase); //make a copy so that we don't overwrite the original phrase table info
Scores scoreVector(m_numScoreComponent);
statistics->p.resize(m_numScoreComponent);
for(size_t j = 0; j < m_numScoreComponent; ++j){
Scores scoreVector(m_numScoreComponents);
statistics->p.resize(m_numScoreComponents);
for(size_t j = 0; j < m_numScoreComponents; ++j){
statistics->p[j].resize(m_numModels);
scoreVector[j] = -raw_scores[j];
}
statistics->targetPhrase->SetScore(m_feature, scoreVector, ScoreComponentCollection(), m_weight, m_weightWP, *m_languageModels); // set scores to 0
statistics->targetPhrase->SetScore(this, scoreVector); // set scores to 0
(*allStats)[targetString] = statistics;
}
multiModelStatistics * statistics = (*allStats)[targetString];
for(size_t j = 0; j < m_numScoreComponent; ++j){
for(size_t j = 0; j < m_numScoreComponents; ++j){
statistics->p[j][i] = UntransformScore(raw_scores[j]);
}
@ -201,16 +202,16 @@ TargetPhraseCollection* PhraseDictionaryMultiModel::CreateTargetPhraseCollection
multiModelStatistics * statistics = iter->second;
Scores scoreVector(m_numScoreComponent);
Scores scoreVector(m_numScoreComponents);
for(size_t i = 0; i < m_numScoreComponent-1; ++i){
for(size_t i = 0; i < m_numScoreComponents-1; ++i){
scoreVector[i] = TransformScore(std::inner_product(statistics->p[i].begin(), statistics->p[i].end(), multimodelweights[i].begin(), 0.0));
}
//assuming that last value is phrase penalty
scoreVector[m_numScoreComponent-1] = 1.0;
scoreVector[m_numScoreComponents-1] = 1.0;
statistics->targetPhrase->SetScore(m_feature, scoreVector, ScoreComponentCollection(), m_weight, m_weightWP, *m_languageModels);
statistics->targetPhrase->SetScore(this, scoreVector);
ret->Add(new TargetPhrase(*statistics->targetPhrase));
}
return ret;
@ -303,7 +304,7 @@ void PhraseDictionaryMultiModel::CacheForCleanup(TargetPhraseCollection* tpc) {
}
void PhraseDictionaryMultiModel::CleanUp(const InputType &source) {
void PhraseDictionaryMultiModel::CleanUpAfterSentenceProcessing(const InputType &source) {
#ifdef WITH_THREADS
boost::mutex::scoped_lock lock(m_sentenceMutex);
PhraseCache &ref = m_sentenceCache[boost::this_thread::get_id()];
@ -327,7 +328,7 @@ void PhraseDictionaryMultiModel::CleanUp(const InputType &source) {
void PhraseDictionaryMultiModel::CleanUpComponentModels(const InputType &source) {
for(size_t i = 0; i < m_numModels; ++i){
m_pd[i]->CleanUp(source);
m_pd[i]->CleanUpAfterSentenceProcessing(source);
}
}
@ -380,10 +381,10 @@ vector<float> PhraseDictionaryMultiModel::MinimizePerplexity(vector<pair<string,
Sentence sentence;
CleanUp(sentence); // free memory used by compact phrase tables
size_t numWeights = m_numScoreComponent;
size_t numWeights = m_numScoreComponents;
if (m_mode == "interpolate") {
//interpolation of phrase penalty is skipped, and fixed-value (2.718) is used instead. results will be screwed up if phrase penalty is not last feature
numWeights = m_numScoreComponent-1;
numWeights = m_numScoreComponents-1;
}
vector<float> ret (m_numModels*numWeights);

View File

@ -21,11 +21,6 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#define moses_PhraseDictionaryMultiModel_h
#include "moses/TranslationModel/PhraseDictionary.h"
#include "moses/TranslationModel/PhraseDictionaryMemory.h"
#include "moses/TranslationModel/PhraseDictionaryTreeAdaptor.h"
#ifndef WIN32
#include "moses/TranslationModel/CompactPT/PhraseDictionaryCompact.h"
#endif
#include <boost/unordered_map.hpp>
@ -62,7 +57,7 @@ friend class CrossEntropy;
#endif
public:
PhraseDictionaryMultiModel(size_t m_numScoreComponent, PhraseDictionaryFeature* feature);
PhraseDictionaryMultiModel(const std::string &line);
~PhraseDictionaryMultiModel();
bool Load(const std::vector<FactorType> &input
, const std::vector<FactorType> &output
@ -77,7 +72,7 @@ public:
std::vector<std::vector<float> > getWeights(size_t numWeights, bool normalize) const;
std::vector<float> normalizeWeights(std::vector<float> &weights) const;
void CacheForCleanup(TargetPhraseCollection* tpc);
void CleanUp(const InputType &source);
void CleanUpAfterSentenceProcessing(const InputType &source);
virtual void CleanUpComponentModels(const InputType &source);
#ifdef WITH_DLIB
virtual std::vector<float> MinimizePerplexity(std::vector<std::pair<std::string, std::string> > &phrase_pair_vector);
@ -100,7 +95,6 @@ protected:
std::vector<FactorType> m_output;
size_t m_numModels;
size_t m_componentTableLimit;
PhraseDictionaryFeature* m_feature_load;
typedef std::vector<TargetPhraseCollection*> PhraseCache;
#ifdef WITH_THREADS