mosesdecoder/moses/Sentence.cpp
Ulrich Germann 524109e2ca Reorganisation of options.
The purpose of this effort is to have options local to the individual translation task,
so that they can be changed in the running system in a multi-threaded system.
2015-08-06 21:11:43 +01:00

378 lines
12 KiB
C++

// $Id$
// vim:tabstop=2
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include <stdexcept>
#include <boost/algorithm/string.hpp>
#include <boost/foreach.hpp>
#include "Sentence.h"
#include "TranslationOptionCollectionText.h"
#include "StaticData.h"
#include "moses/FF/DynamicCacheBasedLanguageModel.h"
#include "moses/TranslationModel/PhraseDictionaryDynamicCacheBased.h"
#include "ChartTranslationOptions.h"
#include "Util.h"
#include "XmlOption.h"
#include "FactorCollection.h"
using namespace std;
namespace Moses
{
/// Default constructor: builds an empty sentence.
/// When StaticData reports syntax mode (IsSyntax()), the configured default
/// input non-terminal is added to m_defaultLabelSet.
Sentence::Sentence()
  : Phrase(0)
  , InputType()
{
  const StaticData& staticData = StaticData::Instance();
  if (staticData.IsSyntax()) {
    m_defaultLabelSet.insert(staticData.GetInputDefaultNonTerminal());
  }
}
/// Destructor: hands the XML option list to RemoveAllInColl() for cleanup.
/// NOTE(review): presumably this deletes every XmlOption the list points to,
/// i.e. the sentence owns its XML options -- confirm against Util.h.
Sentence::~Sentence()
{
  RemoveAllInColl(m_xmlOptions);
}
/**
 * Strip and interpret the "partial translation" prefix of an input line.
 *
 * The expected layout is
 *   <initial target phrase> ||| <coverage bits> ||| <remaining input>
 * If the line does not contain two "|||" separators it is left untouched
 * and no member is modified.
 *
 * On success:
 *  - m_initialTargetPhrase receives the trimmed text before the first "|||";
 *  - m_sourceCompleted gets one entry per character of the trimmed coverage
 *    field, true where that character is '1';
 *  - m_frontSpanCoveredLength is incremented once for every '1' in the
 *    leading uninterrupted run of '1's (init() resets it to 0 beforehand);
 *  - @p line is replaced by whatever follows the second "|||".
 *
 * @param line  raw input line; modified in place as described above.
 */
void
Sentence::
aux_init_partial_translation(string& line)
{
  // Use size_type/npos rather than int/-1 so the "not found" test does not
  // depend on a narrowing conversion.
  const string::size_type sep1 = line.find("|||");
  if (sep1 == string::npos) return;
  const string::size_type sep2 = line.find("|||", sep1 + 3);
  if (sep2 == string::npos) return;

  m_initialTargetPhrase = Trim(line.substr(0, sep1));
  const string scov = Trim(line.substr(sep1 + 3, sep2 - sep1 - 3));
  line = line.substr(sep2 + 3);

  m_sourceCompleted.resize(scov.size());
  bool contiguous = true; // still inside the leading run of covered words?
  for (size_t i = 0; i < scov.size(); ++i) {
    // The coverage bits live in scov. The previous code indexed an unrelated,
    // always-empty string with .at(i), which threw std::out_of_range on the
    // first iteration.
    const bool covered = (scov[i] == '1');
    m_sourceCompleted[i] = covered;
    if (!covered) {
      contiguous = false;
    } else if (contiguous) {
      ++m_frontSpanCoveredLength;
    }
  }
}
/**
 * Consume SGML-style sentence markup (e.g. "<seg id=1> ... </seg>") and apply
 * the attributes it carries to this sentence.
 *
 * ProcessAndStripSGML() returns the attributes as a key/value map (and, going
 * by its name, removes the markup from @p line). Recognised keys:
 *  - "id":             translation id (parsed with atol);
 *  - "docid":          document id (atol); also switches off both topic flags;
 *  - "topic":          split on tab/space; a single field is a topic id,
 *                      anything else is handed over as an id/probability list;
 *  - "weight-setting": stored on the sentence AND pushed into the global
 *                      StaticData instance.
 *
 * @param line  input line, passed by reference to ProcessAndStripSGML().
 */
void
Sentence::
aux_interpret_sgml_markup(string& line)
{
  typedef std::map<std::string, std::string> AttrMap;
  AttrMap attrs = ProcessAndStripSGML(line);
  AttrMap::const_iterator hit;

  hit = attrs.find("id");
  if (hit != attrs.end()) {
    SetTranslationId(atol(hit->second.c_str()));
  }

  hit = attrs.find("docid");
  if (hit != attrs.end()) {
    SetDocumentId(atol(hit->second.c_str()));
    SetUseTopicId(false);
    SetUseTopicIdAndProb(false);
  }

  hit = attrs.find("topic");
  if (hit != attrs.end()) {
    vector<string> fields;
    boost::split(fields, hit->second, boost::is_any_of("\t "));
    if (fields.size() != 1) {
      // several fields: treated as a topic-id/probability list
      SetTopicIdAndProb(fields);
      SetUseTopicId(false);
      SetUseTopicIdAndProb(true);
    } else {
      // exactly one field: a bare topic id
      SetTopicId(atol(fields[0].c_str()));
      SetUseTopicId(true);
      SetUseTopicIdAndProb(false);
    }
  }

  hit = attrs.find("weight-setting");
  if (hit == attrs.end()) {
    SetSpecifiesWeightSetting(false);
  } else {
    SetWeightSetting(hit->second);
    SetSpecifiesWeightSetting(true);
    // The setting is also propagated to the global StaticData object, which
    // the original author (UG) already flagged as undesirable.
    StaticData::Instance().SetWeightSetting(hit->second);
  }
}
/**
 * Consume DLT markup from the input line and dispatch each command to the
 * matching cache-based component.
 *
 * ProcessAndStripDLT() yields one attribute map per command. A command is
 * acted upon only if it has a "type" attribute:
 *  - "cbtm" goes to the PhraseDictionaryDynamicCacheBased instance,
 *  - "cblm" goes to the DynamicCacheBasedLanguageModel instance,
 * looked up by the command's "id" attribute ("default" when absent).
 * Commands whose target instance lookup returns NULL are silently skipped.
 *
 * @param line  input line, passed by reference to ProcessAndStripDLT().
 */
void
Sentence::
aux_interpret_dlt(string& line)
{
  typedef std::map<std::string, std::string> DltAttrs;
  std::vector<DltAttrs> commands = ProcessAndStripDLT(line);

  for (size_t k = 0; k < commands.size(); ++k) {
    DltAttrs const& cmd = commands[k];

    DltAttrs::const_iterator typeAttr = cmd.find("type");
    if (typeAttr == cmd.end()) continue; // nothing to dispatch on

    DltAttrs::const_iterator idAttr = cmd.find("id");
    std::string id;
    if (idAttr == cmd.end()) {
      id = "default";
    } else {
      id = idAttr->second;
    }

    if (typeAttr->second == "cbtm") {
      PhraseDictionaryDynamicCacheBased* cacheTM
        = PhraseDictionaryDynamicCacheBased::InstanceNonConst(id);
      if (cacheTM) cacheTM->ExecuteDlt(cmd);
    }
    if (typeAttr->second == "cblm") {
      DynamicCacheBasedLanguageModel* cacheLM
        = DynamicCacheBasedLanguageModel::InstanceNonConst(id);
      if (cacheLM) cacheLM->ExecuteDlt(cmd);
    }
  }
}
void
Sentence::
aux_interpret_xml(std::string& line, std::vector<size_t> & xmlWalls,
std::vector<std::pair<size_t, std::string> >& placeholders)
{
// parse XML markup in translation line
const StaticData &SD = StaticData::Instance();
using namespace std;
if (SD.GetXmlInputType() != XmlPassThrough) {
int offset = SD.IsSyntax() ? 1 : 0;
bool OK = ProcessAndStripXMLTags(line, m_xmlOptions,
m_reorderingConstraint,
xmlWalls, placeholders, offset,
SD.GetXmlBrackets().first,
SD.GetXmlBrackets().second);
UTIL_THROW_IF2(!OK, "Unable to parse XML in line: " << line);
}
}
/**
 * (Re)initialise this sentence from one line of raw input.
 *
 * Steps, in order:
 *  1. reset the partial-translation state and, if enabled, parse the
 *     "target ||| coverage ||| source" prefix;
 *  2. trim the line and consume SGML ("<seg id=...>") and DLT markup;
 *  3. optionally extract "<passthrough .../>" information;
 *  4. parse XML markup (translation options, walls, placeholders);
 *  5. build the word sequence from what is left of the line;
 *  6. fill the XML coverage map and set up reordering walls/zones.
 *
 * @param line         raw input line (taken by value; modified locally).
 * @param factorOrder  factor order handed to Phrase::CreateFromString().
 */
void
Sentence::
init(string line, std::vector<FactorType> const& factorOrder)
{
  const StaticData &staticData = StaticData::Instance();

  // --- 1. partial translation state
  m_frontSpanCoveredLength = 0;
  m_sourceCompleted.clear();
  if (staticData.ContinuePartialTranslation()) {
    aux_init_partial_translation(line);
  }

  // --- 2. sentence-level markup
  line = Trim(line);
  aux_interpret_sgml_markup(line); // "<seg id=..." markup
  aux_interpret_dlt(line);         // commands for the cache-based models

  // --- 3. "<passthrough tag1=""/>" information
  const bool wantPassthrough = staticData.IsPassthroughEnabled()
                               || staticData.options().nbest.include_passthrough;
  if (wantPassthrough) {
    string passthroughInfo = PassthroughSGML(line,"passthrough");
    SetPassthroughInformation(passthroughInfo);
  }

  // --- 4. XML markup
  vector<size_t> wallPositions;
  vector<pair<size_t, string> > placeholderList;
  aux_interpret_xml(line, wallPositions, placeholderList);

  // --- 5. the actual words
  Phrase::CreateFromString(Input, factorOrder, line, NULL);
  ProcessPlaceholders(placeholderList);
  if (staticData.IsSyntax()) {
    InitStartEndWord();
  }

  // --- 6a. XML coverage: word positions are final now, so mark every
  // position that lies inside the range of some XML option. The map is only
  // filled when XML is actually being parsed.
  if (staticData.GetXmlInputType() != XmlPassThrough) {
    m_xmlCoverageMap.assign(GetSize(), false);
    for (std::vector<XmlOption*>::const_iterator opt = m_xmlOptions.begin();
         opt != m_xmlOptions.end(); ++opt) {
      WordsRange const& span = (*opt)->range;
      for (size_t pos = span.GetStartPos(); pos <= span.GetEndPos(); ++pos) {
        m_xmlCoverageMap[pos] = true;
      }
    }
  }

  // --- 6b. reordering walls and zones
  m_reorderingConstraint.InitializeWalls(GetSize());

  // walls at punctuation, if "-monotone-at-punctuation" is set
  if (staticData.UseReorderingConstraint() && GetSize()) {
    WordsRange wholeSentence(0, GetSize()-1);
    m_reorderingConstraint.SetMonotoneAtPunctuation(GetSubString(wholeSentence));
  }

  // walls requested via XML; positions beyond the sentence are ignored
  for (vector<size_t>::const_iterator wall = wallPositions.begin();
       wall != wallPositions.end(); ++wall) {
    if (*wall < GetSize()) {
      m_reorderingConstraint.SetWall(*wall, true);
    }
  }
  m_reorderingConstraint.FinalizeWalls();
}
/**
 * Read one line from @p in and initialise this sentence from it.
 *
 * @param in           stream to read a single '\n'-terminated line from.
 * @param factorOrder  factor order forwarded to init().
 * @return 1 if a line was read and processed, 0 if nothing could be read
 *         (end of input or stream failure).
 */
int
Sentence::
Read(std::istream& in,const std::vector<FactorType>& factorOrder)
{
  std::string line;
  // Test the stream state rather than eof(): getline sets eofbit (but not
  // failbit) when the last line lacks a trailing newline, so checking eof()
  // silently discarded that line. Conversely, a stream that has failed
  // without reaching EOF never reports eof(), which made this function
  // return 1 with an empty sentence indefinitely.
  if (!std::getline(in, line, '\n'))
    return 0;
  init(line, factorOrder);
  return 1;
}
void
Sentence::
ProcessPlaceholders(const std::vector< std::pair<size_t, std::string> > &placeholders)
{
FactorType placeholderFactor = StaticData::Instance().GetPlaceholderFactor();
if (placeholderFactor == NOT_FOUND) {
return;
}
for (size_t i = 0; i < placeholders.size(); ++i) {
size_t pos = placeholders[i].first;
const string &str = placeholders[i].second;
const Factor *factor = FactorCollection::Instance().AddFactor(str);
Word &word = Phrase::GetWord(pos);
word[placeholderFactor] = factor;
}
}
/**
 * Create the translation option collection for this sentence.
 *
 * @param ttask  translation task the collection is created for.
 * @return a newly allocated TranslationOptionCollectionText, built with the
 *         per-coverage option limit and the option threshold taken from
 *         StaticData. The object is created with new and returned as a raw
 *         pointer; this function does not free it.
 */
TranslationOptionCollection*
Sentence::
CreateTranslationOptionCollection(ttasksptr const& ttask) const
{
  const StaticData &staticData = StaticData::Instance();
  const size_t optionLimit = staticData.GetMaxNoTransOptPerCoverage();
  const float optionThreshold = staticData.GetTranslationOptionThreshold();

  TranslationOptionCollection *collection
    = new TranslationOptionCollectionText(ttask, *this, optionLimit,
                                          optionThreshold);
  assert(collection);
  return collection;
}
void Sentence::Print(std::ostream& out) const
{
out<<*static_cast<Phrase const*>(this);
}
/**
 * Tell whether any word in [startPos, endPos] is covered by an XML option.
 *
 * Positions at or beyond the size of the coverage map count as not covered.
 *
 * @param startPos  first word position of the span (inclusive).
 * @param endPos    last word position of the span (inclusive).
 * @return true if at least one position in the span is marked in
 *         m_xmlCoverageMap, false otherwise.
 */
bool Sentence::XmlOverlap(size_t startPos, size_t endPos) const
{
  size_t cursor = startPos;
  while (cursor <= endPos) {
    const bool insideMap = cursor < m_xmlCoverageMap.size();
    if (insideMap && m_xmlCoverageMap[cursor]) {
      return true;
    }
    cursor++;
  }
  return false;
}
/**
 * Append one TranslationOption per XML option of this sentence to @p list.
 *
 * Each option is allocated with new from the XML option's range and target
 * phrase; the raw pointers are stored in @p list and not freed here.
 *
 * @param list  output vector; new entries are appended.
 */
void Sentence::GetXmlTranslationOptions(std::vector <TranslationOption*> &list) const
{
  for (size_t k = 0; k < m_xmlOptions.size(); ++k) {
    const XmlOption &opt = *m_xmlOptions[k];
    list.push_back(new TranslationOption(opt.range, opt.targetPhrase));
  }
}
/**
 * Append a TranslationOption for every XML option whose source range is
 * exactly [startPos, endPos].
 *
 * Each option is allocated with new; the raw pointers are stored in @p list
 * and not freed here.
 *
 * @param list      output vector; matching options are appended.
 * @param startPos  required start position of the XML option's range.
 * @param endPos    required end position of the XML option's range.
 */
void Sentence::GetXmlTranslationOptions(std::vector <TranslationOption*> &list, size_t startPos, size_t endPos) const
{
  for (size_t k = 0; k < m_xmlOptions.size(); ++k) {
    const XmlOption &opt = *m_xmlOptions[k];
    const WordsRange &span = opt.range;

    // only exact span matches qualify
    if (startPos != span.GetStartPos()) continue;
    if (endPos != span.GetEndPos()) continue;

    list.push_back(new TranslationOption(span, opt.targetPhrase));
  }
}
std::vector <ChartTranslationOptions*> Sentence::GetXmlChartTranslationOptions() const
{
const StaticData &staticData = StaticData::Instance();
std::vector <ChartTranslationOptions*> ret;
// XML Options
// this code is a copy of the 1 in Sentence.
//only fill the vector if we are parsing XML
if (staticData.GetXmlInputType() != XmlPassThrough ) {
//TODO: needed to handle exclusive
//for (size_t i=0; i<GetSize(); i++) {
// m_xmlCoverageMap.push_back(false);
//}
//iterXMLOpts will be empty for XmlIgnore
//look at each column
for(std::vector<XmlOption*>::const_iterator iterXmlOpts = m_xmlOptions.begin();
iterXmlOpts != m_xmlOptions.end(); iterXmlOpts++) {
const XmlOption &xmlOption = **iterXmlOpts;
TargetPhrase *targetPhrase = new TargetPhrase(xmlOption.targetPhrase);
WordsRange *range = new WordsRange(xmlOption.range);
StackVec emptyStackVec; // hmmm... maybe dangerous, but it is never consulted
TargetPhraseCollection *tpc = new TargetPhraseCollection;
tpc->Add(targetPhrase);
ChartTranslationOptions *transOpt = new ChartTranslationOptions(*tpc, emptyStackVec, *range, 0.0f);
ret.push_back(transOpt);
//TODO: needed to handle exclusive
//for(size_t j=transOpt->GetSourceWordsRange().GetStartPos(); j<=transOpt->GetSourceWordsRange().GetEndPos(); j++) {
// m_xmlCoverageMap[j]=true;
//}
}
}
return ret;
}
/**
 * Fill this sentence's words from a plain string, with no markup handling.
 *
 * Forwards to Phrase::CreateFromString() with direction Input and NULL as
 * the last argument.
 *
 * @param FOrder        factor order used to interpret each token.
 * @param phraseString  the text to build the words from.
 */
void Sentence::CreateFromString(vector<FactorType> const& FOrder,
                                string const& phraseString)
{
  this->Phrase::CreateFromString(Input, FOrder, phraseString, NULL);
}
/**
 * Construct a sentence with a given translation id directly from text.
 *
 * @param transId  translation id passed on to InputType.
 * @param stext    raw input line; processed by init().
 * @param IFO      input factor order; when NULL, the order configured in
 *                 StaticData is used instead.
 */
Sentence::
Sentence(size_t const transId, string const& stext,
         vector<FactorType> const* IFO)
  : InputType(transId)
{
  if (!IFO) {
    init(stext, StaticData::Instance().GetInputFactorOrder());
    return;
  }
  init(stext, *IFO);
}
}