prefix subphrase optimization for confusion networks

This commit is contained in:
Hieu Hoang 2013-07-09 16:56:49 +01:00
parent 92163d5091
commit 958d7ca9b8
6 changed files with 71 additions and 44 deletions

View File

@@ -4,7 +4,7 @@
namespace Moses
{
InputPath::InputPath(const Phrase &phrase, const WordsRange &range, const InputPath *prevNode
,const ScoreComponentCollection *inputScore)
,const ScoreComponentCollection *inputScore)
:m_prevNode(prevNode)
,m_phrase(phrase)
,m_range(range)

View File

@@ -41,7 +41,7 @@ public:
}
InputPath(const Phrase &phrase, const WordsRange &range, const InputPath *prevNode
,const ScoreComponentCollection *inputScore);
,const ScoreComponentCollection *inputScore);
~InputPath();
const Phrase &GetPhrase() const {
@@ -62,8 +62,9 @@ public:
}
const TargetPhraseCollection *GetTargetPhrases(const PhraseDictionary &phraseDictionary) const;
const void *GetPtNode(const PhraseDictionary &phraseDictionary) const;
const ScoreComponentCollection *GetInputScore() const
{ return m_inputScore; }
const ScoreComponentCollection *GetInputScore() const {
return m_inputScore;
}
};

View File

@@ -234,12 +234,12 @@ void ChartRuleLookupManagerOnDisk::GetChartRuleCollection(
std::vector<float> weightT = staticData.GetWeights(&m_dictionary);
targetPhraseCollection
= tpcollBerkeleyDb->ConvertToMoses(m_inputFactorsVec
,m_outputFactorsVec
,m_dictionary
,weightT
,m_dbWrapper.GetVocab()
,true);
= tpcollBerkeleyDb->ConvertToMoses(m_inputFactorsVec
,m_outputFactorsVec
,m_dictionary
,weightT
,m_dbWrapper.GetVocab()
,true);
delete tpcollBerkeleyDb;
m_cache[tpCollFilePos] = targetPhraseCollection;

View File

@@ -134,7 +134,7 @@ void PhraseDictionaryOnDisk::SetTargetPhraseFromPtMatrix(const InputPathList &ph
const OnDiskPt::TargetPhraseCollection *targetPhrasesOnDisk = ptNode->GetTargetPhraseCollection(m_tableLimit, wrapper);
TargetPhraseCollection *targetPhrases
= targetPhrasesOnDisk->ConvertToMoses(m_input, m_output, *this, weightT, vocab, false);
= targetPhrasesOnDisk->ConvertToMoses(m_input, m_output, *this, weightT, vocab, false);
node.SetTargetPhrases(*this, targetPhrases, ptNode);

View File

@@ -28,51 +28,76 @@ TranslationOptionCollectionConfusionNet::TranslationOptionCollectionConfusionNet
// 1-word phrases
for (size_t startPos = 0; startPos < size; ++startPos) {
vector<InputPathList> &vec = m_targetPhrasesfromPt[startPos];
vec.push_back(InputPathList());
InputPathList &list = vec.back();
vec.push_back(InputPathList());
InputPathList &list = vec.back();
WordsRange range(startPos, startPos);
WordsRange range(startPos, startPos);
const ConfusionNet::Column &col = input.GetColumn(startPos);
for (size_t i = 0; i < col.size(); ++i) {
const Word &word = col[i].first;
Phrase subphrase;
subphrase.AddWord(word);
const ConfusionNet::Column &col = input.GetColumn(startPos);
for (size_t i = 0; i < col.size(); ++i) {
const Word &word = col[i].first;
Phrase subphrase;
subphrase.AddWord(word);
const std::vector<float> &scores = col[i].second;
ScoreComponentCollection *inputScore = new ScoreComponentCollection();
inputScore->Assign(inputFeature, scores);
const std::vector<float> &scores = col[i].second;
ScoreComponentCollection *inputScore = new ScoreComponentCollection();
inputScore->Assign(inputFeature, scores);
InputPath *node = new InputPath(subphrase, range, NULL, inputScore);
list.push_back(node);
InputPath *node = new InputPath(subphrase, range, NULL, inputScore);
list.push_back(node);
}
m_phraseDictionaryQueue.push_back(node);
}
}
/*
for (size_t phaseSize = 1; phaseSize <= size; ++phaseSize) {
for (size_t startPos = 0; startPos < size - phaseSize + 1; ++startPos) {
size_t endPos = startPos + phaseSize -1;
vector<InputPathList> &vec = m_targetPhrasesfromPt[startPos];
// subphrases of 2+ words
for (size_t phaseSize = 2; phaseSize <= size; ++phaseSize) {
for (size_t startPos = 0; startPos < size - phaseSize + 1; ++startPos) {
size_t endPos = startPos + phaseSize -1;
WordsRange range(startPos, endPos);
Phrase subphrase(input.GetSubString(WordsRange(startPos, endPos)));
WordsRange range(startPos, endPos);
vector<InputPathList> &vec = m_targetPhrasesfromPt[startPos];
InputPathList &list = vec.back();
InputPath *node;
if (range.GetNumWordsCovered() == 1) {
node = new InputPath(subphrase, range, NULL, NULL);
vec.push_back(node);
} else {
const InputPath &prevNode = GetInputPath(startPos, endPos - 1);
node = new InputPath(subphrase, range, &prevNode, NULL);
vec.push_back(node);
}
m_phraseDictionaryQueue.push_back(node);
}
// loop thru every previous path
const InputPathList &prevNodes = GetInputPathList(startPos, endPos - 1);
InputPathList::const_iterator iter;
for (iter = prevNodes.begin(); iter != prevNodes.end(); ++iter) {
const InputPath &prevNode = **iter;
const Phrase &prevPhrase = prevNode.GetPhrase();
const ScoreComponentCollection *prevInputScore = prevNode.GetInputScore();
CHECK(prevInputScore);
// loop thru every word at this position
const ConfusionNet::Column &col = input.GetColumn(startPos);
for (size_t i = 0; i < col.size(); ++i) {
const Word &word = col[i].first;
Phrase subphrase(prevPhrase);
subphrase.AddWord(word);
const std::vector<float> &scores = col[i].second;
ScoreComponentCollection *inputScore = new ScoreComponentCollection(*prevInputScore);
inputScore->PlusEquals(inputFeature, scores);
InputPath *node = new InputPath(subphrase, range, NULL, inputScore);
list.push_back(node);
m_phraseDictionaryQueue.push_back(node);
}
}
}
}
*/
}
InputPathList &TranslationOptionCollectionConfusionNet::GetInputPathList(size_t startPos, size_t endPos)
{
  // Paths covering the span [startPos, endPos] are stored in the vector for
  // startPos, indexed by the span length minus one (i.e. endPos - startPos).
  const size_t spanOffset = endPos - startPos;
  CHECK(spanOffset < m_targetPhrasesfromPt[startPos].size());
  return m_targetPhrasesfromPt[startPos][spanOffset];
}
/* forcibly create translation option for a particular source word.

View File

@@ -21,6 +21,7 @@ public:
protected:
TargetPhraseMatrix m_targetPhrasesfromPt; /*< contains translation options */
InputPathList &GetInputPathList(size_t startPos, size_t endPos);
public:
TranslationOptionCollectionConfusionNet(const ConfusionNet &source, size_t maxNoTransOptPerCoverage, float translationOptionThreshold);