2011-01-24 22:14:19 +03:00
|
|
|
/***********************************************************************
 Moses - factored phrase-based language decoder
 Copyright (C) 2011 University of Edinburgh

 This library is free software; you can redistribute it and/or
 modify it under the terms of the GNU Lesser General Public
 License as published by the Free Software Foundation; either
 version 2.1 of the License, or (at your option) any later version.

 This library is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 Lesser General Public License for more details.

 You should have received a copy of the GNU Lesser General Public
 License along with this library; if not, write to the Free Software
 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 ***********************************************************************/
|
|
|
|
|
|
|
|
#include "ChartRuleLookupManagerOnDisk.h"
|
|
|
|
|
|
|
|
#include <algorithm>
|
|
|
|
|
|
|
|
#include "PhraseDictionaryOnDisk.h"
|
|
|
|
#include "StaticData.h"
|
|
|
|
#include "DotChartOnDisk.h"
|
|
|
|
#include "ChartTranslationOptionList.h"
|
|
|
|
#include "../../OnDiskPt/src/TargetPhraseCollection.h"
|
|
|
|
|
|
|
|
using namespace std;
|
|
|
|
|
|
|
|
namespace Moses
|
|
|
|
{
|
|
|
|
|
|
|
|
/**
 * Set up rule lookup over an on-disk phrase table for one input sentence.
 *
 * The arguments are recorded in the corresponding members, then one
 * DottedRuleStackOnDisk is created per source position. Each stack is seeded
 * (at index 0) with an initial dotted rule that points at the root source
 * node of the on-disk table.
 *
 * @param sentence          input sentence; its size fixes the number of stacks
 * @param cellColl          chart cells, forwarded to the base class
 * @param dictionary        owning on-disk phrase dictionary
 * @param dbWrapper         accessor for the on-disk table
 * @param languageModels    language models used when converting target phrases
 * @param wpProducer        word penalty producer used when converting target phrases
 * @param inputFactorsVec   input factor types
 * @param outputFactorsVec  output factor types
 * @param weight            weights passed on when converting target phrases
 * @param filePath          path of the phrase table
 */
ChartRuleLookupManagerOnDisk::ChartRuleLookupManagerOnDisk(
  const InputType &sentence,
  const ChartCellCollection &cellColl,
  const PhraseDictionaryOnDisk &dictionary,
  OnDiskPt::OnDiskWrapper &dbWrapper,
  const LMList *languageModels,
  const WordPenaltyProducer *wpProducer,
  const std::vector<FactorType> &inputFactorsVec,
  const std::vector<FactorType> &outputFactorsVec,
  const std::vector<float> &weight,
  const std::string &filePath)
  : ChartRuleLookupManager(sentence, cellColl)
  , m_dictionary(dictionary)
  , m_dbWrapper(dbWrapper)
  , m_languageModels(languageModels)
  , m_wpProducer(wpProducer)
  , m_inputFactorsVec(inputFactorsVec)
  , m_outputFactorsVec(outputFactorsVec)
  , m_weight(weight)
  , m_filePath(filePath)
{
  // the stack vector must not have been populated yet
  assert(m_expandableDottedRuleListVec.size() == 0);

  const size_t sourceSize = sentence.GetSize();
  m_expandableDottedRuleListVec.resize(sourceSize);

  for (size_t startPos = 0; startPos < sourceSize; ++startPos) {
    // initial rule: stores the top node of the source tree
    DottedRuleOnDisk *rootRule = new DottedRuleOnDisk(m_dbWrapper.GetRootSourceNode());

    // NOTE(review): the stack is sized (sourceSize - startPos + 1); presumably
    // one slot per span length starting here, plus slot 0 for the initial rule.
    DottedRuleStackOnDisk *ruleStack = new DottedRuleStackOnDisk(sourceSize - startPos + 1);
    ruleStack->Add(0, rootRule);

    m_expandableDottedRuleListVec[startPos] = ruleStack;
  }
}
|
|
|
|
|
|
|
|
/**
 * Release everything this manager allocated during lookup: the cached target
 * phrase collections, the per-position dotted-rule stacks, and the phrase
 * nodes that were recorded in m_sourcePhraseNode for cleanup.
 */
ChartRuleLookupManagerOnDisk::~ChartRuleLookupManagerOnDisk()
{
  typedef std::map<UINT64, const TargetPhraseCollection*> TargetPhraseCache;

  // the cache maps file positions to converted collections; free each one
  for (TargetPhraseCache::const_iterator entry = m_cache.begin();
       entry != m_cache.end(); ++entry) {
    delete entry->second;
  }
  m_cache.clear();

  RemoveAllInColl(m_expandableDottedRuleListVec);
  RemoveAllInColl(m_sourcePhraseNode);
}
|
|
|
|
|
|
|
|
void ChartRuleLookupManagerOnDisk::GetChartRuleCollection(
|
2011-02-24 16:14:42 +03:00
|
|
|
const WordsRange &range,
|
|
|
|
bool adhereTableLimit,
|
|
|
|
ChartTranslationOptionList &outColl)
|
2011-01-24 22:14:19 +03:00
|
|
|
{
|
2011-02-24 16:14:42 +03:00
|
|
|
const StaticData &staticData = StaticData::Instance();
|
|
|
|
size_t rulesLimit = StaticData::Instance().GetRuleLimit();
|
|
|
|
|
|
|
|
size_t relEndPos = range.GetEndPos() - range.GetStartPos();
|
|
|
|
size_t absEndPos = range.GetEndPos();
|
|
|
|
|
|
|
|
// MAIN LOOP. create list of nodes of target phrases
|
2011-03-11 16:08:43 +03:00
|
|
|
DottedRuleStackOnDisk &expandableDottedRuleList = *m_expandableDottedRuleListVec[range.GetStartPos()];
|
2011-02-24 16:14:42 +03:00
|
|
|
|
|
|
|
// sort save nodes so only do nodes with most counts
|
2011-03-11 16:08:43 +03:00
|
|
|
expandableDottedRuleList.SortSavedNodes();
|
2011-02-24 16:14:42 +03:00
|
|
|
size_t numDerivations = 0
|
|
|
|
,maxDerivations = 999999; // staticData.GetMaxDerivations();
|
|
|
|
bool overThreshold = true;
|
|
|
|
|
2011-03-11 16:08:43 +03:00
|
|
|
const DottedRuleStackOnDisk::SavedNodeColl &savedNodeColl = expandableDottedRuleList.GetSavedNodeColl();
|
2011-02-24 16:14:42 +03:00
|
|
|
//cerr << "savedNodeColl=" << savedNodeColl.size() << " ";
|
|
|
|
|
|
|
|
for (size_t ind = 0; ind < (savedNodeColl.size()) && ((numDerivations < maxDerivations) || overThreshold) ; ++ind) {
|
|
|
|
const SavedNodeOnDisk &savedNode = *savedNodeColl[ind];
|
|
|
|
|
2011-03-11 16:08:43 +03:00
|
|
|
const DottedRuleOnDisk &prevDottedRule = savedNode.GetDottedRule();
|
|
|
|
const OnDiskPt::PhraseNode &prevNode = prevDottedRule.GetLastNode();
|
2011-03-11 19:28:36 +03:00
|
|
|
const CoveredChartSpan *prevCoveredChartSpan = prevDottedRule.GetLastCoveredChartSpan();
|
|
|
|
size_t startPos = (prevCoveredChartSpan == NULL) ? range.GetStartPos() : prevCoveredChartSpan->GetWordsRange().GetEndPos() + 1;
|
2011-02-24 16:14:42 +03:00
|
|
|
|
|
|
|
// search for terminal symbol
|
|
|
|
if (startPos == absEndPos) {
|
|
|
|
const Word &sourceWord = GetSentence().GetWord(absEndPos);
|
|
|
|
OnDiskPt::Word *sourceWordBerkeleyDb = m_dbWrapper.ConvertFromMoses(Input, m_inputFactorsVec, sourceWord);
|
|
|
|
|
|
|
|
if (sourceWordBerkeleyDb != NULL) {
|
|
|
|
const OnDiskPt::PhraseNode *node = prevNode.GetChild(*sourceWordBerkeleyDb, m_dbWrapper);
|
|
|
|
if (node != NULL) {
|
|
|
|
// TODO figure out why source word is needed from node, not from sentence
|
|
|
|
// prob to do with factors or non-term
|
|
|
|
//const Word &sourceWord = node->GetSourceWord();
|
2011-03-11 19:28:36 +03:00
|
|
|
CoveredChartSpan *newCoveredChartSpan = new CoveredChartSpan(absEndPos, absEndPos
|
2011-02-24 16:14:42 +03:00
|
|
|
, sourceWord
|
2011-03-11 19:28:36 +03:00
|
|
|
, prevCoveredChartSpan);
|
|
|
|
DottedRuleOnDisk *dottedRule = new DottedRuleOnDisk(*node, newCoveredChartSpan);
|
2011-03-11 16:08:43 +03:00
|
|
|
expandableDottedRuleList.Add(relEndPos+1, dottedRule);
|
2011-02-24 16:14:42 +03:00
|
|
|
|
|
|
|
// cache for cleanup
|
|
|
|
m_sourcePhraseNode.push_back(node);
|
2011-01-24 22:14:19 +03:00
|
|
|
}
|
2011-02-24 16:14:42 +03:00
|
|
|
|
|
|
|
delete sourceWordBerkeleyDb;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// search for non-terminals
|
|
|
|
size_t endPos, stackInd;
|
|
|
|
if (startPos > absEndPos)
|
|
|
|
continue;
|
|
|
|
else if (startPos == range.GetStartPos() && range.GetEndPos() > range.GetStartPos()) {
|
|
|
|
// start.
|
|
|
|
endPos = absEndPos - 1;
|
|
|
|
stackInd = relEndPos;
|
|
|
|
} else {
|
|
|
|
endPos = absEndPos;
|
|
|
|
stackInd = relEndPos + 1;
|
|
|
|
}
|
|
|
|
|
2011-02-24 19:17:38 +03:00
|
|
|
// size_t nonTermNumWordsCovered = endPos - startPos + 1;
|
2011-02-24 16:14:42 +03:00
|
|
|
|
2011-03-11 16:08:43 +03:00
|
|
|
// get target nonterminals in this span from chart
|
|
|
|
const NonTerminalSet &chartNonTermSet = GetCellCollection().GetConstituentLabelSet(WordsRange(startPos, endPos));
|
2011-02-24 16:14:42 +03:00
|
|
|
|
2011-02-24 19:17:38 +03:00
|
|
|
//const Word &defaultSourceNonTerm = staticData.GetInputDefaultNonTerminal()
|
|
|
|
// ,&defaultTargetNonTerm = staticData.GetOutputDefaultNonTerminal();
|
2011-02-24 16:14:42 +03:00
|
|
|
|
|
|
|
// go through each SOURCE lhs
|
|
|
|
const NonTerminalSet &sourceLHSSet = GetSentence().GetLabelSet(startPos, endPos);
|
|
|
|
|
|
|
|
NonTerminalSet::const_iterator iterSourceLHS;
|
|
|
|
for (iterSourceLHS = sourceLHSSet.begin(); iterSourceLHS != sourceLHSSet.end(); ++iterSourceLHS) {
|
|
|
|
const Word &sourceLHS = *iterSourceLHS;
|
|
|
|
|
|
|
|
OnDiskPt::Word *sourceLHSBerkeleyDb = m_dbWrapper.ConvertFromMoses(Input, m_inputFactorsVec, sourceLHS);
|
|
|
|
|
|
|
|
if (sourceLHSBerkeleyDb == NULL) {
|
|
|
|
delete sourceLHSBerkeleyDb;
|
|
|
|
continue; // vocab not in pt. node definately won't be in there
|
|
|
|
}
|
|
|
|
|
|
|
|
const OnDiskPt::PhraseNode *sourceNode = prevNode.GetChild(*sourceLHSBerkeleyDb, m_dbWrapper);
|
|
|
|
delete sourceLHSBerkeleyDb;
|
|
|
|
|
|
|
|
if (sourceNode == NULL)
|
|
|
|
continue; // didn't find source node
|
|
|
|
|
|
|
|
// go through each TARGET lhs
|
2011-03-11 16:08:43 +03:00
|
|
|
NonTerminalSet::const_iterator iterChartNonTerm;
|
|
|
|
for (iterChartNonTerm = chartNonTermSet.begin(); iterChartNonTerm != chartNonTermSet.end(); ++iterChartNonTerm) {
|
|
|
|
const Word &chartNonTerm = *iterChartNonTerm;
|
2011-02-24 16:14:42 +03:00
|
|
|
|
2011-03-11 16:08:43 +03:00
|
|
|
//cerr << sourceLHS << " " << defaultSourceNonTerm << " " << chartNonTerm << " " << defaultTargetNonTerm << endl;
|
2011-02-24 16:14:42 +03:00
|
|
|
|
2011-03-11 16:08:43 +03:00
|
|
|
//bool isSyntaxNonTerm = (sourceLHS != defaultSourceNonTerm) || (chartNonTerm != defaultTargetNonTerm);
|
2011-02-24 16:14:42 +03:00
|
|
|
bool doSearch = true; //isSyntaxNonTerm ? nonTermNumWordsCovered <= maxSyntaxSpan :
|
|
|
|
// nonTermNumWordsCovered <= maxDefaultSpan;
|
|
|
|
|
|
|
|
if (doSearch) {
|
|
|
|
|
2011-03-11 16:08:43 +03:00
|
|
|
OnDiskPt::Word *chartNonTermBerkeleyDb = m_dbWrapper.ConvertFromMoses(Output, m_outputFactorsVec, chartNonTerm);
|
2011-02-24 16:14:42 +03:00
|
|
|
|
2011-03-11 16:08:43 +03:00
|
|
|
if (chartNonTermBerkeleyDb == NULL)
|
2011-01-24 22:14:19 +03:00
|
|
|
continue;
|
2011-02-24 16:14:42 +03:00
|
|
|
|
2011-03-11 16:08:43 +03:00
|
|
|
const OnDiskPt::PhraseNode *node = sourceNode->GetChild(*chartNonTermBerkeleyDb, m_dbWrapper);
|
|
|
|
delete chartNonTermBerkeleyDb;
|
2011-02-24 16:14:42 +03:00
|
|
|
|
|
|
|
if (node == NULL)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
// found matching entry
|
|
|
|
//const Word &sourceWord = node->GetSourceWord();
|
2011-03-11 19:28:36 +03:00
|
|
|
CoveredChartSpan *newCoveredChartSpan = new CoveredChartSpan(startPos, endPos
|
2011-03-11 16:08:43 +03:00
|
|
|
, chartNonTerm
|
2011-03-11 19:28:36 +03:00
|
|
|
, prevCoveredChartSpan);
|
2011-02-24 16:14:42 +03:00
|
|
|
|
2011-03-11 19:28:36 +03:00
|
|
|
DottedRuleOnDisk *dottedRule = new DottedRuleOnDisk(*node, newCoveredChartSpan);
|
2011-03-11 16:08:43 +03:00
|
|
|
expandableDottedRuleList.Add(stackInd, dottedRule);
|
2011-02-24 16:14:42 +03:00
|
|
|
|
|
|
|
m_sourcePhraseNode.push_back(node);
|
2011-01-24 22:14:19 +03:00
|
|
|
}
|
2011-03-11 16:08:43 +03:00
|
|
|
} // for (iterChartNonTerm
|
2011-02-24 16:14:42 +03:00
|
|
|
|
|
|
|
delete sourceNode;
|
|
|
|
|
|
|
|
} // for (iterLabelListf
|
|
|
|
|
|
|
|
// return list of target phrases
|
2011-03-11 16:08:43 +03:00
|
|
|
DottedRuleCollOnDisk &nodes = expandableDottedRuleList.Get(relEndPos + 1);
|
2011-02-24 16:14:42 +03:00
|
|
|
|
|
|
|
// source LHS
|
2011-03-11 16:08:43 +03:00
|
|
|
DottedRuleCollOnDisk::const_iterator iterDottedRuleColl;
|
|
|
|
for (iterDottedRuleColl = nodes.begin(); iterDottedRuleColl != nodes.end(); ++iterDottedRuleColl) {
|
2011-02-24 16:14:42 +03:00
|
|
|
// node of last source word
|
2011-03-11 16:08:43 +03:00
|
|
|
const DottedRuleOnDisk &prevDottedRule = **iterDottedRuleColl;
|
|
|
|
if (prevDottedRule.Done())
|
2011-02-24 16:14:42 +03:00
|
|
|
continue;
|
2011-03-11 16:08:43 +03:00
|
|
|
prevDottedRule.Done(true);
|
2011-02-24 16:14:42 +03:00
|
|
|
|
2011-03-11 19:28:36 +03:00
|
|
|
const CoveredChartSpan *coveredChartSpan = prevDottedRule.GetLastCoveredChartSpan();
|
|
|
|
assert(coveredChartSpan);
|
2011-01-24 22:14:19 +03:00
|
|
|
|
2011-03-11 16:08:43 +03:00
|
|
|
const OnDiskPt::PhraseNode &prevNode = prevDottedRule.GetLastNode();
|
2011-02-24 16:14:42 +03:00
|
|
|
|
|
|
|
//get node for each source LHS
|
|
|
|
const NonTerminalSet &lhsSet = GetSentence().GetLabelSet(range.GetStartPos(), range.GetEndPos());
|
|
|
|
NonTerminalSet::const_iterator iterLabelSet;
|
|
|
|
for (iterLabelSet = lhsSet.begin(); iterLabelSet != lhsSet.end(); ++iterLabelSet) {
|
|
|
|
const Word &sourceLHS = *iterLabelSet;
|
|
|
|
|
|
|
|
OnDiskPt::Word *sourceLHSBerkeleyDb = m_dbWrapper.ConvertFromMoses(Input, m_inputFactorsVec, sourceLHS);
|
|
|
|
if (sourceLHSBerkeleyDb == NULL)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
const TargetPhraseCollection *targetPhraseCollection = NULL;
|
|
|
|
const OnDiskPt::PhraseNode *node = prevNode.GetChild(*sourceLHSBerkeleyDb, m_dbWrapper);
|
|
|
|
if (node) {
|
|
|
|
UINT64 tpCollFilePos = node->GetValue();
|
|
|
|
std::map<UINT64, const TargetPhraseCollection*>::const_iterator iterCache = m_cache.find(tpCollFilePos);
|
|
|
|
if (iterCache == m_cache.end()) {
|
|
|
|
// not in case
|
|
|
|
overThreshold = node->GetCount(0) > staticData.GetRuleCountThreshold();
|
|
|
|
//cerr << node->GetCount(0) << " ";
|
|
|
|
|
|
|
|
const OnDiskPt::TargetPhraseCollection *tpcollBerkeleyDb = node->GetTargetPhraseCollection(m_dictionary.GetTableLimit(), m_dbWrapper);
|
|
|
|
|
|
|
|
targetPhraseCollection
|
|
|
|
= tpcollBerkeleyDb->ConvertToMoses(m_inputFactorsVec
|
|
|
|
,m_outputFactorsVec
|
|
|
|
,m_dictionary
|
|
|
|
,m_weight
|
|
|
|
,m_wpProducer
|
|
|
|
,*m_languageModels
|
|
|
|
,m_filePath
|
|
|
|
, m_dbWrapper.GetVocab());
|
|
|
|
|
|
|
|
delete tpcollBerkeleyDb;
|
|
|
|
m_cache[tpCollFilePos] = targetPhraseCollection;
|
|
|
|
} else {
|
|
|
|
// jsut get out of cache
|
|
|
|
targetPhraseCollection = iterCache->second;
|
|
|
|
}
|
|
|
|
|
|
|
|
assert(targetPhraseCollection);
|
2011-04-05 00:43:02 +04:00
|
|
|
outColl.Add(*targetPhraseCollection, *coveredChartSpan,
|
|
|
|
GetCellCollection(), adhereTableLimit, rulesLimit);
|
2011-02-24 16:14:42 +03:00
|
|
|
|
|
|
|
numDerivations++;
|
|
|
|
|
|
|
|
} // if (node)
|
|
|
|
|
|
|
|
delete node;
|
|
|
|
delete sourceLHSBerkeleyDb;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} // for (size_t ind = 0; ind < savedNodeColl.size(); ++ind)
|
|
|
|
|
|
|
|
outColl.CreateChartRules(rulesLimit);
|
|
|
|
|
|
|
|
//cerr << numDerivations << " ";
|
2011-01-24 22:14:19 +03:00
|
|
|
}
|
2011-02-24 16:14:42 +03:00
|
|
|
|
2011-01-24 22:14:19 +03:00
|
|
|
} // namespace Moses
|