mosesdecoder/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemoryPerSentence.cpp
2015-01-14 11:07:42 +00:00

271 lines
9.8 KiB
C++

/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2011 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include <iostream>
#include "ChartRuleLookupManagerMemoryPerSentence.h"
#include "moses/ChartParser.h"
#include "moses/InputType.h"
#include "moses/Terminal.h"
#include "moses/ChartParserCallback.h"
#include "moses/StaticData.h"
#include "moses/NonTerminal.h"
#include "moses/ChartCellCollection.h"
#include "moses/FactorCollection.h"
#include "moses/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.h"
using namespace std;
namespace Moses
{
// Constructs a lookup manager bound to the given parser, chart cells and fuzzy-match rule table.
// Binds the soft-matching table from StaticData and prepares the completed-rule storage.
ChartRuleLookupManagerMemoryPerSentence::ChartRuleLookupManagerMemoryPerSentence(
  const ChartParser &parser,
  const ChartCellCollectionBase &cellColl,
  const PhraseDictionaryFuzzyMatch &ruleTable)
  : ChartRuleLookupManagerCYKPlus(parser, cellColl)
  , m_ruleTable(ruleTable)
  , m_softMatchingMap(StaticData::Instance().GetSoftMatches())
{
  // computed from m_softMatchingMap, which the initializer list has already bound at this point
  m_isSoftMatching = !m_softMatchingMap.empty();

  // one completed-rule collection per position of the source, as reported by the parser
  m_completedRules.resize(parser.GetSize());
}
void ChartRuleLookupManagerMemoryPerSentence::GetChartRuleCollection(
const InputPath &inputPath,
size_t lastPos,
ChartParserCallback &outColl)
{
const WordsRange &range = inputPath.GetWordsRange();
size_t startPos = range.GetStartPos();
size_t absEndPos = range.GetEndPos();
m_lastPos = lastPos;
m_stackVec.clear();
m_stackScores.clear();
m_outColl = &outColl;
m_unaryPos = absEndPos-1; // rules ending in this position are unary and should not be added to collection
// create/update data structure to quickly look up all chart cells that match start position and label.
UpdateCompressedMatrix(startPos, absEndPos, lastPos);
const PhraseDictionaryNodeMemory &rootNode = m_ruleTable.GetRootNode(GetParser().GetTranslationId());
// all rules starting with terminal
if (startPos == absEndPos) {
GetTerminalExtension(&rootNode, startPos);
}
// all rules starting with nonterminal
else if (absEndPos > startPos) {
GetNonTerminalExtension(&rootNode, startPos);
}
// copy temporarily stored rules to out collection
CompletedRuleCollection & rules = m_completedRules[absEndPos];
for (vector<CompletedRule*>::const_iterator iter = rules.begin(); iter != rules.end(); ++iter) {
outColl.Add((*iter)->GetTPC(), (*iter)->GetStackVector(), range);
}
rules.Clear();
}
// Create/update compressed matrix that stores all valid ChartCellLabels for a given start position and label.
void ChartRuleLookupManagerMemoryPerSentence::UpdateCompressedMatrix(size_t startPos,
size_t origEndPos,
size_t lastPos)
{
std::vector<size_t> endPosVec;
size_t numNonTerms = FactorCollection::Instance().GetNumNonTerminals();
m_compressedMatrixVec.resize(lastPos+1);
// we only need to update cell at [startPos, origEndPos-1] for initial lookup
if (startPos < origEndPos) {
endPosVec.push_back(origEndPos-1);
}
// update all cells starting from startPos+1 for lookup of rule extensions
else if (startPos == origEndPos) {
startPos++;
for (size_t endPos = startPos; endPos <= lastPos; endPos++) {
endPosVec.push_back(endPos);
}
//re-use data structure for cells with later start position, but remove chart cells that would break max-chart-span
for (size_t pos = startPos+1; pos <= lastPos; pos++) {
CompressedMatrix & cellMatrix = m_compressedMatrixVec[pos];
cellMatrix.resize(numNonTerms);
for (size_t i = 0; i < numNonTerms; i++) {
if (!cellMatrix[i].empty() && cellMatrix[i].back().endPos > lastPos) {
cellMatrix[i].pop_back();
}
}
}
}
if (startPos > lastPos) {
return;
}
// populate compressed matrix with all chart cells that start at current start position
CompressedMatrix & cellMatrix = m_compressedMatrixVec[startPos];
cellMatrix.clear();
cellMatrix.resize(numNonTerms);
for (std::vector<size_t>::iterator p = endPosVec.begin(); p != endPosVec.end(); ++p) {
size_t endPos = *p;
// target non-terminal labels for the span
const ChartCellLabelSet &targetNonTerms = GetTargetLabelSet(startPos, endPos);
if (targetNonTerms.GetSize() == 0) {
continue;
}
#if !defined(UNLABELLED_SOURCE)
// source non-terminal labels for the span
const InputPath &inputPath = GetParser().GetInputPath(startPos, endPos);
// can this ever be true? Moses seems to pad the non-terminal set of the input with [X]
if (inputPath.GetNonTerminalSet().size() == 0) {
continue;
}
#endif
for (size_t i = 0; i < numNonTerms; i++) {
const ChartCellLabel *cellLabel = targetNonTerms.Find(i);
if (cellLabel != NULL) {
float score = cellLabel->GetBestScore(m_outColl);
cellMatrix[i].push_back(ChartCellCache(endPos, cellLabel, score));
}
}
}
}
// if a (partial) rule matches, add it to the list of completed rules (if non-unary and non-empty), and try to find expansions that have this partial rule as prefix.
void ChartRuleLookupManagerMemoryPerSentence::AddAndExtend(
const PhraseDictionaryNodeMemory *node,
size_t endPos)
{
const TargetPhraseCollection &tpc = node->GetTargetPhraseCollection();
// add target phrase collection (except if rule is empty or a unary non-terminal rule)
if (!tpc.IsEmpty() && (m_stackVec.empty() || endPos != m_unaryPos)) {
m_completedRules[endPos].Add(tpc, m_stackVec, m_stackScores, *m_outColl);
}
// get all further extensions of rule (until reaching end of sentence or max-chart-span)
if (endPos < m_lastPos) {
if (!node->GetTerminalMap().empty()) {
GetTerminalExtension(node, endPos+1);
}
if (!node->GetNonTerminalMap().empty()) {
GetNonTerminalExtension(node, endPos+1);
}
}
}
// search all possible terminal extensions of a partial rule (pointed at by node) at a given position
// recursively try to expand partial rules into full rules up to m_lastPos.
void ChartRuleLookupManagerMemoryPerSentence::GetTerminalExtension(
const PhraseDictionaryNodeMemory *node,
size_t pos)
{
const Word &sourceWord = GetSourceAt(pos).GetLabel();
const PhraseDictionaryNodeMemory::TerminalMap & terminals = node->GetTerminalMap();
// if node has small number of terminal edges, test word equality for each.
if (terminals.size() < 5) {
for (PhraseDictionaryNodeMemory::TerminalMap::const_iterator iter = terminals.begin(); iter != terminals.end(); ++iter) {
const Word & word = iter->first;
if (TerminalEqualityPred()(word, sourceWord)) {
const PhraseDictionaryNodeMemory *child = & iter->second;
AddAndExtend(child, pos);
break;
}
}
}
// else, do hash lookup
else {
const PhraseDictionaryNodeMemory *child = node->GetChild(sourceWord);
if (child != NULL) {
AddAndExtend(child, pos);
}
}
}
// search all possible nonterminal extensions of a partial rule (pointed at by node) for a variable span (starting from startPos).
// recursively try to expand partial rules into full rules up to m_lastPos.
void ChartRuleLookupManagerMemoryPerSentence::GetNonTerminalExtension(
const PhraseDictionaryNodeMemory *node,
size_t startPos)
{
const CompressedMatrix &compressedMatrix = m_compressedMatrixVec[startPos];
// non-terminal labels in phrase dictionary node
const PhraseDictionaryNodeMemory::NonTerminalMap & nonTermMap = node->GetNonTerminalMap();
// make room for back pointer
m_stackVec.push_back(NULL);
m_stackScores.push_back(0);
// loop over possible expansions of the rule
PhraseDictionaryNodeMemory::NonTerminalMap::const_iterator p;
PhraseDictionaryNodeMemory::NonTerminalMap::const_iterator end = nonTermMap.end();
for (p = nonTermMap.begin(); p != end; ++p) {
// does it match possible source and target non-terminals?
#if defined(UNLABELLED_SOURCE)
const Word &targetNonTerm = p->first;
#else
const Word &targetNonTerm = p->first.second;
#endif
const PhraseDictionaryNodeMemory *child = &p->second;
//soft matching of NTs
if (m_isSoftMatching && !m_softMatchingMap[targetNonTerm[0]->GetId()].empty()) {
const std::vector<Word>& softMatches = m_softMatchingMap[targetNonTerm[0]->GetId()];
for (std::vector<Word>::const_iterator softMatch = softMatches.begin(); softMatch != softMatches.end(); ++softMatch) {
const CompressedColumn &matches = compressedMatrix[(*softMatch)[0]->GetId()];
for (CompressedColumn::const_iterator match = matches.begin(); match != matches.end(); ++match) {
m_stackVec.back() = match->cellLabel;
m_stackScores.back() = match->score;
AddAndExtend(child, match->endPos);
}
}
} // end of soft matches lookup
const CompressedColumn &matches = compressedMatrix[targetNonTerm[0]->GetId()];
for (CompressedColumn::const_iterator match = matches.begin(); match != matches.end(); ++match) {
m_stackVec.back() = match->cellLabel;
m_stackScores.back() = match->score;
AddAndExtend(child, match->endPos);
}
}
// remove last back pointer
m_stackVec.pop_back();
m_stackScores.pop_back();
}
} // namespace Moses