2012-10-11 16:45:36 +04:00
|
|
|
// $Id$
|
|
|
|
// vim:tabstop=2
|
|
|
|
/***********************************************************************
|
|
|
|
Moses - factored phrase-based language decoder
|
|
|
|
Copyright (C) 2010 Hieu Hoang
|
|
|
|
|
|
|
|
This library is free software; you can redistribute it and/or
|
|
|
|
modify it under the terms of the GNU Lesser General Public
|
|
|
|
License as published by the Free Software Foundation; either
|
|
|
|
version 2.1 of the License, or (at your option) any later version.
|
|
|
|
|
|
|
|
This library is distributed in the hope that it will be useful,
|
|
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
Lesser General Public License for more details.
|
|
|
|
|
|
|
|
You should have received a copy of the GNU Lesser General Public
|
|
|
|
License along with this library; if not, write to the Free Software
|
|
|
|
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
|
|
***********************************************************************/
|
|
|
|
|
|
|
|
#include "ChartParser.h"
|
|
|
|
#include "ChartParserCallback.h"
|
|
|
|
#include "ChartRuleLookupManager.h"
|
|
|
|
#include "StaticData.h"
|
|
|
|
#include "TreeInput.h"
|
2013-05-25 03:25:20 +04:00
|
|
|
#include "moses/FF/UnknownWordPenaltyProducer.h"
|
2012-10-11 16:45:36 +04:00
|
|
|
|
|
|
|
using namespace std;
|
|
|
|
using namespace Moses;
|
|
|
|
|
|
|
|
namespace Moses
|
|
|
|
{
|
|
|
|
extern bool g_debug;
|
|
|
|
|
2013-05-11 17:13:26 +04:00
|
|
|
/** Default constructor; performs no work of its own. */
ChartParserUnknown::ChartParserUnknown()
{
}
|
2012-10-11 16:45:36 +04:00
|
|
|
|
2013-05-29 21:16:15 +04:00
|
|
|
/**
 * Destructor: hands both containers filled by Process() to RemoveAllInColl().
 * NOTE(review): RemoveAllInColl presumably deletes every element and empties
 * the container -- confirm against its definition.
 */
ChartParserUnknown::~ChartParserUnknown()
{
  // source phrases created for unknown words
  RemoveAllInColl(m_unksrcs);

  // collections passed to the callback through AddPhraseOOV()
  RemoveAllInColl(m_cacheTargetPhraseCollection);
}
|
|
|
|
|
2013-05-29 21:16:15 +04:00
|
|
|
void ChartParserUnknown::Process(const Word &sourceWord, const WordsRange &range, ChartParserCallback &to)
|
|
|
|
{
|
2012-10-11 16:45:36 +04:00
|
|
|
// unknown word, add as trans opt
|
|
|
|
const StaticData &staticData = StaticData::Instance();
|
2012-12-19 20:22:10 +04:00
|
|
|
const UnknownWordPenaltyProducer *unknownWordPenaltyProducer = staticData.GetUnknownWordPenaltyProducer();
|
2013-05-29 21:16:15 +04:00
|
|
|
|
2012-10-11 16:45:36 +04:00
|
|
|
size_t isDigit = 0;
|
|
|
|
if (staticData.GetDropUnknown()) {
|
|
|
|
const Factor *f = sourceWord[0]; // TODO hack. shouldn't know which factor is surface
|
2013-04-29 21:46:48 +04:00
|
|
|
const StringPiece s = f->GetString();
|
2012-10-11 16:45:36 +04:00
|
|
|
isDigit = s.find_first_of("0123456789");
|
|
|
|
if (isDigit == string::npos)
|
|
|
|
isDigit = 0;
|
|
|
|
else
|
|
|
|
isDigit = 1;
|
|
|
|
// modify the starting bitmap
|
|
|
|
}
|
2013-05-29 21:16:15 +04:00
|
|
|
|
2012-10-11 16:45:36 +04:00
|
|
|
Phrase* unksrc = new Phrase(1);
|
|
|
|
unksrc->AddWord() = sourceWord;
|
2013-06-24 17:45:20 +04:00
|
|
|
Word &newWord = unksrc->GetWord(0);
|
|
|
|
newWord.SetIsOOV(true);
|
|
|
|
|
2012-10-11 16:45:36 +04:00
|
|
|
m_unksrcs.push_back(unksrc);
|
2013-05-29 21:16:15 +04:00
|
|
|
|
2012-10-11 16:45:36 +04:00
|
|
|
//TranslationOption *transOpt;
|
|
|
|
if (! staticData.GetDropUnknown() || isDigit) {
|
|
|
|
// loop
|
|
|
|
const UnknownLHSList &lhsList = staticData.GetUnknownLHS();
|
|
|
|
UnknownLHSList::const_iterator iterLHS;
|
|
|
|
for (iterLHS = lhsList.begin(); iterLHS != lhsList.end(); ++iterLHS) {
|
|
|
|
const string &targetLHSStr = iterLHS->first;
|
|
|
|
float prob = iterLHS->second;
|
2013-05-29 21:16:15 +04:00
|
|
|
|
2012-10-11 16:45:36 +04:00
|
|
|
// lhs
|
|
|
|
//const Word &sourceLHS = staticData.GetInputDefaultNonTerminal();
|
2013-05-22 14:22:17 +04:00
|
|
|
Word *targetLHS = new Word(true);
|
2013-05-29 21:16:15 +04:00
|
|
|
|
2013-05-22 14:22:17 +04:00
|
|
|
targetLHS->CreateFromString(Output, staticData.GetOutputFactorOrder(), targetLHSStr, true);
|
|
|
|
CHECK(targetLHS->GetFactor(0) != NULL);
|
2013-05-29 21:16:15 +04:00
|
|
|
|
2012-10-11 16:45:36 +04:00
|
|
|
// add to dictionary
|
2012-10-22 20:40:23 +04:00
|
|
|
TargetPhrase *targetPhrase = new TargetPhrase();
|
2012-10-11 16:45:36 +04:00
|
|
|
Word &targetWord = targetPhrase->AddWord();
|
|
|
|
targetWord.CreateUnknownWord(sourceWord);
|
2013-05-29 21:16:15 +04:00
|
|
|
|
2012-10-11 16:45:36 +04:00
|
|
|
// scores
|
2013-05-13 18:36:09 +04:00
|
|
|
float unknownScore = FloorScore(TransformScore(prob));
|
|
|
|
|
|
|
|
targetPhrase->GetScoreBreakdown().Assign(unknownWordPenaltyProducer, unknownScore);
|
2013-05-28 03:41:25 +04:00
|
|
|
targetPhrase->Evaluate(*unksrc);
|
2013-05-13 18:36:09 +04:00
|
|
|
|
2012-10-16 00:35:56 +04:00
|
|
|
targetPhrase->SetSourcePhrase(*unksrc);
|
2012-10-11 16:45:36 +04:00
|
|
|
targetPhrase->SetTargetLHS(targetLHS);
|
2013-02-20 20:56:36 +04:00
|
|
|
targetPhrase->SetAlignmentInfo("0-0");
|
|
|
|
|
2012-10-11 16:45:36 +04:00
|
|
|
// chart rule
|
2012-10-12 20:04:36 +04:00
|
|
|
to.AddPhraseOOV(*targetPhrase, m_cacheTargetPhraseCollection, range);
|
2012-10-11 16:45:36 +04:00
|
|
|
} // for (iterLHS
|
|
|
|
} else {
|
|
|
|
// drop source word. create blank trans opt
|
2013-05-13 18:36:09 +04:00
|
|
|
float unknownScore = FloorScore(-numeric_limits<float>::infinity());
|
2013-05-29 21:16:15 +04:00
|
|
|
|
2012-10-22 20:40:23 +04:00
|
|
|
TargetPhrase *targetPhrase = new TargetPhrase();
|
2012-10-11 16:45:36 +04:00
|
|
|
// loop
|
|
|
|
const UnknownLHSList &lhsList = staticData.GetUnknownLHS();
|
|
|
|
UnknownLHSList::const_iterator iterLHS;
|
|
|
|
for (iterLHS = lhsList.begin(); iterLHS != lhsList.end(); ++iterLHS) {
|
|
|
|
const string &targetLHSStr = iterLHS->first;
|
|
|
|
//float prob = iterLHS->second;
|
2013-05-29 21:16:15 +04:00
|
|
|
|
2013-05-22 14:22:17 +04:00
|
|
|
Word *targetLHS = new Word(true);
|
|
|
|
targetLHS->CreateFromString(Output, staticData.GetOutputFactorOrder(), targetLHSStr, true);
|
|
|
|
CHECK(targetLHS->GetFactor(0) != NULL);
|
2013-05-29 21:16:15 +04:00
|
|
|
|
2013-05-13 18:36:09 +04:00
|
|
|
targetPhrase->GetScoreBreakdown().Assign(unknownWordPenaltyProducer, unknownScore);
|
2013-05-28 03:41:25 +04:00
|
|
|
targetPhrase->Evaluate(*unksrc);
|
2013-05-13 18:36:09 +04:00
|
|
|
|
2012-10-16 00:35:56 +04:00
|
|
|
targetPhrase->SetSourcePhrase(*unksrc);
|
2012-10-11 16:45:36 +04:00
|
|
|
targetPhrase->SetTargetLHS(targetLHS);
|
2013-02-20 20:56:36 +04:00
|
|
|
|
2012-10-11 16:45:36 +04:00
|
|
|
// chart rule
|
2012-10-12 20:04:36 +04:00
|
|
|
to.AddPhraseOOV(*targetPhrase, m_cacheTargetPhraseCollection, range);
|
2012-10-11 16:45:36 +04:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2013-05-11 17:13:26 +04:00
|
|
|
/**
 * Set up a parser for one input.
 *
 * Calls StaticData::InitializeForInput() for the input, then asks every
 * phrase dictionary known to StaticData for a rule lookup manager bound to
 * this input and chart cell collection. The managers are stored in
 * m_ruleLookupManagers in dictionary order.
 *
 * \param source the input being parsed
 * \param cells  chart cell collection passed to each rule lookup manager
 */
ChartParser::ChartParser(InputType const &source, ChartCellCollectionBase &cells)
  : m_decodeGraphList(StaticData::Instance().GetDecodeGraphs())
  , m_source(source)
{
  typedef std::vector<PhraseDictionary*> DictionaryList;

  const StaticData &staticData = StaticData::Instance();
  staticData.InitializeForInput(source);

  const DictionaryList &dictionaries = staticData.GetPhraseDictionaries();
  m_ruleLookupManagers.reserve(dictionaries.size());

  for (DictionaryList::const_iterator it = dictionaries.begin(); it != dictionaries.end(); ++it) {
    // the stored pointers are already non-const, so they can be used directly
    PhraseDictionary *dictionary = *it;
    m_ruleLookupManagers.push_back(dictionary->CreateRuleLookupManager(source, cells));
  }
}
|
|
|
|
|
2013-05-29 21:16:15 +04:00
|
|
|
/**
 * Destructor: passes the rule lookup managers created by the constructor to
 * RemoveAllInColl(), then tells StaticData that processing of this input is
 * finished.
 */
ChartParser::~ChartParser()
{
  RemoveAllInColl(m_ruleLookupManagers);

  const StaticData &staticData = StaticData::Instance();
  staticData.CleanUpAfterSentenceProcessing(m_source);
}
|
|
|
|
|
2013-05-29 21:16:15 +04:00
|
|
|
void ChartParser::Create(const WordsRange &wordsRange, ChartParserCallback &to)
|
|
|
|
{
|
2012-10-11 16:45:36 +04:00
|
|
|
assert(m_decodeGraphList.size() == m_ruleLookupManagers.size());
|
2013-05-29 21:16:15 +04:00
|
|
|
|
2012-10-11 16:45:36 +04:00
|
|
|
std::vector <DecodeGraph*>::const_iterator iterDecodeGraph;
|
|
|
|
std::vector <ChartRuleLookupManager*>::const_iterator iterRuleLookupManagers = m_ruleLookupManagers.begin();
|
|
|
|
for (iterDecodeGraph = m_decodeGraphList.begin(); iterDecodeGraph != m_decodeGraphList.end(); ++iterDecodeGraph, ++iterRuleLookupManagers) {
|
|
|
|
const DecodeGraph &decodeGraph = **iterDecodeGraph;
|
|
|
|
assert(decodeGraph.GetSize() == 1);
|
|
|
|
ChartRuleLookupManager &ruleLookupManager = **iterRuleLookupManagers;
|
|
|
|
size_t maxSpan = decodeGraph.GetMaxChartSpan();
|
|
|
|
if (maxSpan == 0 || wordsRange.GetNumWordsCovered() <= maxSpan) {
|
|
|
|
ruleLookupManager.GetChartRuleCollection(wordsRange, to);
|
|
|
|
}
|
|
|
|
}
|
2013-05-29 21:16:15 +04:00
|
|
|
|
2012-10-11 16:45:36 +04:00
|
|
|
if (wordsRange.GetNumWordsCovered() == 1 && wordsRange.GetStartPos() != 0 && wordsRange.GetStartPos() != m_source.GetSize()-1) {
|
|
|
|
bool alwaysCreateDirectTranslationOption = StaticData::Instance().IsAlwaysCreateDirectTranslationOption();
|
|
|
|
if (to.Empty() || alwaysCreateDirectTranslationOption) {
|
|
|
|
// create unknown words for 1 word coverage where we don't have any trans options
|
|
|
|
const Word &sourceWord = m_source.GetWord(wordsRange.GetStartPos());
|
|
|
|
m_unknown.Process(sourceWord, wordsRange, to);
|
|
|
|
}
|
2013-05-29 21:16:15 +04:00
|
|
|
}
|
2012-10-11 16:45:36 +04:00
|
|
|
}
|
2013-05-29 21:16:15 +04:00
|
|
|
|
2012-10-11 16:45:36 +04:00
|
|
|
} // namespace Moses
|