2014-01-22 17:06:10 +04:00
|
|
|
// vim:tabstop=2
|
|
|
|
#include "ProbingPT.h"
|
2014-01-23 22:32:24 +04:00
|
|
|
#include "moses/StaticData.h"
|
2014-01-23 18:24:45 +04:00
|
|
|
#include "moses/FactorCollection.h"
|
2014-01-22 17:06:10 +04:00
|
|
|
#include "moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerSkeleton.h"
|
|
|
|
#include "quering.hh"
|
|
|
|
|
|
|
|
using namespace std;
|
|
|
|
|
|
|
|
namespace Moses
|
|
|
|
{
|
|
|
|
/**
 * Construct the phrase table from its configuration line.
 *
 * The query engine is not created here; m_engine starts out NULL and is
 * allocated by Load().
 *
 * @param line configuration line, forwarded to the PhraseDictionary base.
 */
ProbingPT::ProbingPT(const std::string &line)
  : PhraseDictionary(line)
  , m_engine(NULL)
{
  ReadParameters();

  // The lookup code only ever reads m_input[0] / m_output[0], so exactly one
  // entry is expected in each.
  assert(m_input.size() == 1);
  assert(m_output.size() == 1);
}
|
|
|
|
|
|
|
|
/**
 * Destroy the phrase table and release the query engine.
 *
 * m_engine is NULL until Load() has run; deleting a NULL pointer is a no-op.
 */
ProbingPT::~ProbingPT()
{
  delete m_engine;
}
|
|
|
|
|
|
|
|
void ProbingPT::Load()
|
|
|
|
{
|
|
|
|
SetFeaturesToApply();
|
|
|
|
|
2014-01-23 04:51:03 +04:00
|
|
|
m_engine = new QueryEngine(m_filePath.c_str());
|
2014-01-22 17:06:10 +04:00
|
|
|
|
2014-01-23 18:24:45 +04:00
|
|
|
m_unkId = 456456546456;
|
|
|
|
|
2014-03-19 16:59:55 +04:00
|
|
|
// source vocab
|
|
|
|
const std::map<uint64_t, std::string> &sourceVocab = m_engine->getSourceVocab();
|
2014-03-19 17:20:00 +04:00
|
|
|
std::map<uint64_t, std::string>::const_iterator iterSource;
|
|
|
|
for (iterSource = sourceVocab.begin(); iterSource != sourceVocab.end(); ++iterSource) {
|
|
|
|
const string &wordStr = iterSource->second;
|
2014-03-19 16:59:55 +04:00
|
|
|
const Factor *factor = FactorCollection::Instance().AddFactor(wordStr);
|
|
|
|
|
2014-03-19 17:20:00 +04:00
|
|
|
uint64_t probingId = iterSource->first;
|
2014-03-19 16:59:55 +04:00
|
|
|
|
|
|
|
SourceVocabMap::value_type entry(factor, probingId);
|
|
|
|
m_sourceVocabMap.insert(entry);
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
// target vocab
|
2014-03-19 15:39:27 +04:00
|
|
|
const std::map<unsigned int, std::string> &probingVocab = m_engine->getVocab();
|
|
|
|
std::map<unsigned int, std::string>::const_iterator iter;
|
2014-01-23 04:51:03 +04:00
|
|
|
for (iter = probingVocab.begin(); iter != probingVocab.end(); ++iter) {
|
2014-01-23 18:24:45 +04:00
|
|
|
const string &wordStr = iter->second;
|
|
|
|
const Factor *factor = FactorCollection::Instance().AddFactor(wordStr);
|
|
|
|
|
2014-03-19 15:39:27 +04:00
|
|
|
unsigned int probingId = iter->first;
|
2014-01-23 18:24:45 +04:00
|
|
|
|
2014-03-19 16:59:55 +04:00
|
|
|
TargetVocabMap::value_type entry(factor, probingId);
|
2014-01-23 18:24:45 +04:00
|
|
|
m_vocabMap.insert(entry);
|
2014-01-22 17:06:10 +04:00
|
|
|
|
2014-01-23 04:51:03 +04:00
|
|
|
}
|
2014-01-22 17:06:10 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
 * Per-input hook: trims the phrase-table cache via ReduceCache().
 *
 * @param source the input about to be processed (not used here).
 */
void ProbingPT::InitializeForInput(InputType const& source)
{
  ReduceCache();
}
|
|
|
|
|
|
|
|
void ProbingPT::GetTargetPhraseCollectionBatch(const InputPathList &inputPathQueue) const
|
|
|
|
{
|
|
|
|
CacheColl &cache = GetCache();
|
|
|
|
|
|
|
|
InputPathList::const_iterator iter;
|
|
|
|
for (iter = inputPathQueue.begin(); iter != inputPathQueue.end(); ++iter) {
|
|
|
|
InputPath &inputPath = **iter;
|
|
|
|
const Phrase &sourcePhrase = inputPath.GetPhrase();
|
|
|
|
|
2014-01-23 22:32:24 +04:00
|
|
|
if (sourcePhrase.GetSize() > StaticData::Instance().GetMaxPhraseLength()) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2014-01-22 22:24:51 +04:00
|
|
|
TargetPhraseCollection *tpColl = CreateTargetPhrase(sourcePhrase);
|
2014-01-22 17:06:10 +04:00
|
|
|
|
|
|
|
// add target phrase to phrase-table cache
|
|
|
|
size_t hash = hash_value(sourcePhrase);
|
|
|
|
std::pair<const TargetPhraseCollection*, clock_t> value(tpColl, clock());
|
|
|
|
cache[hash] = value;
|
|
|
|
|
|
|
|
inputPath.SetTargetPhrases(*this, tpColl, NULL);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-03-19 17:20:00 +04:00
|
|
|
std::vector<uint64_t> ProbingPT::ConvertToProbingSourcePhrase(const Phrase &sourcePhrase, bool &ok) const
|
2014-01-23 18:24:45 +04:00
|
|
|
{
|
|
|
|
size_t size = sourcePhrase.GetSize();
|
|
|
|
std::vector<uint64_t> ret(size);
|
|
|
|
for (size_t i = 0; i < size; ++i) {
|
|
|
|
const Factor *factor = sourcePhrase.GetFactor(i, m_input[0]);
|
2014-03-19 17:20:00 +04:00
|
|
|
uint64_t probingId = GetSourceProbingId(factor);
|
2014-01-23 18:24:45 +04:00
|
|
|
if (probingId == m_unkId) {
|
|
|
|
ok = false;
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
ret[i] = probingId;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
ok = true;
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2014-01-22 22:24:51 +04:00
|
|
|
/**
 * Query the probing phrase table for a source phrase.
 *
 * @param sourcePhrase non-empty source phrase to look up.
 * @return a newly allocated collection holding one TargetPhrase per engine
 *         result, pruned with m_tableLimit; NULL when the phrase contains a
 *         word unknown to the table or the engine reports no match.
 */
TargetPhraseCollection *ProbingPT::CreateTargetPhrase(const Phrase &sourcePhrase) const
{
  assert(sourcePhrase.GetSize());

  bool ok;
  std::vector<uint64_t> probingSource = ConvertToProbingSourcePhrase(sourcePhrase, ok);
  if (!ok) {
    // The source phrase contains a word unknown in the pt, so we know
    // immediately that there is no translation for it.
    return NULL;
  }

  // Actual lookup
  std::pair<bool, std::vector<target_text> > query_result = m_engine->query(probingSource);
  if (!query_result.first) {
    return NULL;
  }

  //m_engine->printTargetInfo(query_result.second);
  TargetPhraseCollection *tpColl = new TargetPhraseCollection();

  const std::vector<target_text> &candidates = query_result.second;
  for (std::vector<target_text>::const_iterator cand = candidates.begin();
       cand != candidates.end(); ++cand) {
    tpColl->Add(CreateTargetPhrase(sourcePhrase, *cand));
  }

  tpColl->Prune(true, m_tableLimit);
  return tpColl;
}
|
2014-01-22 17:06:10 +04:00
|
|
|
|
2014-01-22 22:24:51 +04:00
|
|
|
/**
 * Build a Moses TargetPhrase from one engine result.
 *
 * @param sourcePhrase source side, passed on to TargetPhrase::Evaluate().
 * @param probingTargetPhrase engine result; its target_phrase ids become the
 *        words and its prob values (after TransformScore) become this
 *        feature's scores.
 * @return newly allocated target phrase; the caller takes ownership.
 */
TargetPhrase *ProbingPT::CreateTargetPhrase(const Phrase &sourcePhrase, const target_text &probingTargetPhrase) const
{
  TargetPhrase *tp = new TargetPhrase();

  // words
  const std::vector<unsigned int> &targetIds = probingTargetPhrase.target_phrase;
  for (std::vector<unsigned int>::const_iterator id = targetIds.begin();
       id != targetIds.end(); ++id) {
    const Factor *factor = GetTargetFactor(*id);
    assert(factor);

    tp->AddWord().SetFactor(m_output[0], factor);
  }

  // score for this phrase table
  std::vector<float> scores = probingTargetPhrase.prob;
  std::transform(scores.begin(), scores.end(), scores.begin(), TransformScore);
  tp->GetScoreBreakdown().PlusEquals(this, scores);

  // alignment
  // Not populated. The original carried this disabled snippet, kept here
  // for reference only:
  /*
  const std::vector<unsigned char> &alignments = probingTargetPhrase.word_all1;

  AlignmentInfo &aligns = tp->GetAlignTerm();
  for (size_t i = 0; i < alignS.size(); i += 2 ) {
    aligns.Add((size_t) alignments[i], (size_t) alignments[i+1]);
  }
  */

  // score of all other ff when this rule is being loaded
  tp->Evaluate(sourcePhrase, GetFeaturesToApply());
  return tp;
}
|
|
|
|
|
2014-03-19 17:20:00 +04:00
|
|
|
/**
 * Map a target-side probing id back to its factor.
 *
 * @param probingId id to look up on the right side of m_vocabMap.
 * @return the matching factor, or NULL when the id is not in the mapping.
 */
const Factor *ProbingPT::GetTargetFactor(uint64_t probingId) const
{
  TargetVocabMap::right_map::const_iterator found = m_vocabMap.right.find(probingId);
  if (found == m_vocabMap.right.end()) {
    // not in mapping. Must be UNK
    return NULL;
  }
  return found->second;
}
|
|
|
|
|
2014-03-19 17:20:00 +04:00
|
|
|
uint64_t ProbingPT::GetSourceProbingId(const Factor *factor) const
|
2014-01-23 18:24:45 +04:00
|
|
|
{
|
2014-03-19 17:20:00 +04:00
|
|
|
SourceVocabMap::left_map::const_iterator iter;
|
|
|
|
iter = m_sourceVocabMap.left.find(factor);
|
|
|
|
if (iter != m_sourceVocabMap.left.end()) {
|
2014-01-23 18:24:45 +04:00
|
|
|
return iter->second;
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
// not in mapping. Must be UNK
|
|
|
|
return m_unkId;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-01-22 17:06:10 +04:00
|
|
|
/**
 * Chart-decoding entry point; not implemented for this phrase table.
 *
 * Calling it terminates the process through abort(). Both parameters are
 * ignored, and the return statement is never reached.
 */
ChartRuleLookupManager* ProbingPT::CreateRuleLookupManager(
  const ChartParser &parser,
  const ChartCellCollectionBase &cellCollection)
{
  abort();
  return NULL;
}
|
|
|
|
|
|
|
|
// Macro defined outside this file; presumably expands to the ToString()
// implementation for ProbingPT in terms of operator<< -- confirm against the
// macro definition.
TO_STRING_BODY(ProbingPT);
|
|
|
|
|
|
|
|
// friend
|
|
|
|
ostream& operator<<(ostream& out, const ProbingPT& phraseDict)
|
|
|
|
{
|
|
|
|
return out;
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|