mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-10-26 11:28:48 +03:00
244 lines
7.9 KiB
C++
244 lines
7.9 KiB
C++
|
// $Id$
|
||
|
// vim:tabstop=2
|
||
|
|
||
|
/***********************************************************************
|
||
|
Moses - factored phrase-based language decoder
|
||
|
Copyright (C) 2006 University of Edinburgh
|
||
|
|
||
|
This library is free software; you can redistribute it and/or
|
||
|
modify it under the terms of the GNU Lesser General Public
|
||
|
License as published by the Free Software Foundation; either
|
||
|
version 2.1 of the License, or (at your option) any later version.
|
||
|
|
||
|
This library is distributed in the hope that it will be useful,
|
||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||
|
Lesser General Public License for more details.
|
||
|
|
||
|
You should have received a copy of the GNU Lesser General Public
|
||
|
License along with this library; if not, write to the Free Software
|
||
|
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||
|
***********************************************************************/
|
||
|
|
||
|
#include "XmlOption.h"
|
||
|
#include <vector>
|
||
|
#include <string>
|
||
|
#include <iostream>
|
||
|
#include "Util.h"
|
||
|
#include "StaticData.h"
|
||
|
#include "TranslationOption.h"
|
||
|
|
||
|
namespace {
|
||
|
|
||
|
std::string ParseXmlTagAttribute(const std::string& tag,const std::string& attributeName){
|
||
|
/*TODO deal with unescaping \"*/
|
||
|
string tagOpen = attributeName + "=\"";
|
||
|
size_t contentsStart = tag.find(tagOpen);
|
||
|
if (contentsStart == std::string::npos) return "";
|
||
|
contentsStart += tagOpen.size();
|
||
|
size_t contentsEnd = tag.find_first_of('"',contentsStart+1);
|
||
|
if (contentsEnd == std::string::npos) {
|
||
|
TRACE_ERR("Malformed XML attribute: "<< tag);
|
||
|
return "";
|
||
|
}
|
||
|
size_t possibleEnd;
|
||
|
while (tag.at(contentsEnd-1) == '\\' && (possibleEnd = tag.find_first_of('"',contentsEnd+1)) != std::string::npos) {
|
||
|
contentsEnd = possibleEnd;
|
||
|
}
|
||
|
return tag.substr(contentsStart,contentsEnd-contentsStart);
|
||
|
}
|
||
|
|
||
|
std::string TrimXml(const std::string& str) {
|
||
|
if (str.size() < 2) return str;
|
||
|
if (str[0] == '<' && str[str.size() - 1] == '>') {
|
||
|
return str.substr(1, str.size() - 2);
|
||
|
} else { return str; }
|
||
|
}
|
||
|
|
||
|
bool isXmlTag(const std::string& tag)
|
||
|
{
|
||
|
return tag[0] == '<';
|
||
|
}
|
||
|
|
||
|
inline std::vector<std::string> TokenizeXml(const std::string& str)
|
||
|
{
|
||
|
std::string lbrack = "<";
|
||
|
std::string rbrack = ">";
|
||
|
std::vector<std::string> tokens;
|
||
|
// Find first "non-delimiter".
|
||
|
std::string::size_type cpos = 0;
|
||
|
std::string::size_type lpos = 0;
|
||
|
std::string::size_type rpos = 0;
|
||
|
|
||
|
while (cpos != str.size()) {
|
||
|
lpos = str.find_first_of(lbrack, cpos);
|
||
|
if (lpos != std::string::npos) {
|
||
|
rpos = str.find_first_of(rbrack, lpos);
|
||
|
if (rpos == std::string::npos) {
|
||
|
TRACE_ERR("ERROR: malformed XML: " << str << endl);
|
||
|
return tokens;
|
||
|
}
|
||
|
} else {
|
||
|
tokens.push_back(str.substr(cpos));
|
||
|
break;
|
||
|
}
|
||
|
if (lpos - cpos > 0)
|
||
|
tokens.push_back(str.substr(cpos, lpos - cpos));
|
||
|
tokens.push_back(str.substr(lpos, rpos-lpos+1));
|
||
|
cpos = rpos + 1;
|
||
|
}
|
||
|
return tokens;
|
||
|
}
|
||
|
|
||
|
}
|
||
|
|
||
|
std::vector<TranslationOption*> ProcessAndStripXMLTags(std::string& line, const InputType &source) {
|
||
|
//parse XML markup in translation line
|
||
|
std::vector<TranslationOption*> res;
|
||
|
std::string rstr;
|
||
|
std::string linkedStr;
|
||
|
if (line.find_first_of('<') == std::string::npos) { return res; }
|
||
|
std::vector<std::string> xmlTokens = TokenizeXml(line);
|
||
|
std::string tagName = "";
|
||
|
std::string tagContents = "";
|
||
|
std::vector<std::string> altTexts;
|
||
|
std::vector<std::string> altProbs;
|
||
|
size_t tagStart=0;
|
||
|
size_t tagEnd=0;
|
||
|
size_t curWord=0;
|
||
|
int numUnary = 0;
|
||
|
bool doClose = false;
|
||
|
bool isLinked = false;
|
||
|
for (size_t xmlTokenPos = 0 ; xmlTokenPos < xmlTokens.size() ; xmlTokenPos++)
|
||
|
{
|
||
|
if(!isXmlTag(xmlTokens[xmlTokenPos]))
|
||
|
{
|
||
|
//phrase, not tag
|
||
|
rstr += xmlTokens[xmlTokenPos];
|
||
|
curWord = Tokenize(rstr).size();
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
//tag data
|
||
|
std::string tag = Trim(TrimXml(xmlTokens[xmlTokenPos]));
|
||
|
VERBOSE(3,"XML TAG IS: " << tag << std::endl);
|
||
|
std::string::size_type endOfName = tag.find_first_of(' ');
|
||
|
std::string nextTagName = tag;
|
||
|
bool isUnary = tag[tag.size() - 1] == '/';
|
||
|
bool isOpen = tag[0] != '/';
|
||
|
if (endOfName != std::string::npos) {
|
||
|
nextTagName = tag.substr(0,endOfName);
|
||
|
tagContents = tag.substr(endOfName+1);
|
||
|
}
|
||
|
if (nextTagName == "linked") {
|
||
|
isLinked = true;
|
||
|
linkedStr = "";
|
||
|
}
|
||
|
else if (nextTagName == "/linked") {
|
||
|
isLinked = false;
|
||
|
// recurse to process linked tags
|
||
|
std::vector<TranslationOption*> tOptions = ProcessAndStripXMLTags(linkedStr, source);
|
||
|
// link them together
|
||
|
std::vector<TranslationOption*>::const_iterator iterTransOpts1;
|
||
|
std::vector<TranslationOption*>::const_iterator iterTransOpts2;
|
||
|
for (iterTransOpts1 = tOptions.begin(); iterTransOpts1 != tOptions.end(); iterTransOpts1++) {
|
||
|
for (iterTransOpts2 = tOptions.begin(); iterTransOpts2 != tOptions.end(); iterTransOpts2++) {
|
||
|
if (iterTransOpts1 != iterTransOpts2) {
|
||
|
(**iterTransOpts1).AddLinkedTransOpt(*iterTransOpts2);
|
||
|
}
|
||
|
}
|
||
|
res.push_back(*iterTransOpts1);
|
||
|
}
|
||
|
}
|
||
|
else if (isLinked) {
|
||
|
linkedStr += xmlTokens[xmlTokenPos];
|
||
|
}
|
||
|
else if (isOpen)
|
||
|
{
|
||
|
//this is an open tag
|
||
|
tagName = nextTagName;
|
||
|
altTexts = TokenizeMultiCharSeparator(ParseXmlTagAttribute(tagContents,"english"), "||");
|
||
|
altProbs = TokenizeMultiCharSeparator(ParseXmlTagAttribute(tagContents,"prob"), "||");
|
||
|
std::string span = ParseXmlTagAttribute(tagContents,"span");
|
||
|
tagStart = curWord;
|
||
|
if (isUnary) {
|
||
|
numUnary++;
|
||
|
if (span.empty()) {
|
||
|
TRACE_ERR("ERROR: unary tags must have a span attribute: " << line << endl);
|
||
|
return res;
|
||
|
}
|
||
|
std::vector<std::string> ij = Tokenize(span, ",");
|
||
|
if (ij.size() != 2) {
|
||
|
TRACE_ERR("ERROR: span tag must be of the form \"i,j\": " << line << endl);
|
||
|
return res;
|
||
|
}
|
||
|
tagStart = atoi(ij[0].c_str());
|
||
|
tagEnd = atoi(ij[1].c_str());
|
||
|
if (tagEnd < tagStart) {
|
||
|
TRACE_ERR("ERROR: span tag " << span << " invalid" << endl);
|
||
|
return res;
|
||
|
}
|
||
|
doClose = true;
|
||
|
VERBOSE(3,"XML TAG IS UNARY" << endl);
|
||
|
}
|
||
|
VERBOSE(3,"XML TAG NAME IS: '" << tagName << "'" << endl);
|
||
|
VERBOSE(3,"XML TAG ENGLISH IS: '" << altTexts[0] << "'" << endl);
|
||
|
VERBOSE(3,"XML TAG PROB IS: '" << altProbs[0] << "'" << endl);
|
||
|
VERBOSE(3,"XML TAG STARTS AT WORD: " << tagStart << endl);
|
||
|
if (altTexts.size() != altProbs.size()) {
|
||
|
TRACE_ERR("ERROR: Unequal number of probabilities and translation alternatives: " << line << endl);
|
||
|
return res;
|
||
|
}
|
||
|
}
|
||
|
else if ((nextTagName.size() == 0) || (nextTagName.at(0) != '/') || (nextTagName.substr(1) != tagName))
|
||
|
{
|
||
|
//mismatched tag, abort!
|
||
|
TRACE_ERR("ERROR: tried to parse malformed XML with xml-input enabled: " << line << endl);
|
||
|
return res;
|
||
|
}
|
||
|
else {
|
||
|
doClose = true;
|
||
|
tagEnd = curWord-1; //size is inclusive
|
||
|
}
|
||
|
if (doClose) {
|
||
|
VERBOSE(3,"XML END TAG IS: " << nextTagName.substr(1) << endl);
|
||
|
VERBOSE(3,"XML TAG ENDS AT WORD: " << tagEnd << endl);
|
||
|
//store translation options into members
|
||
|
|
||
|
//TODO: deal with multiple XML options here
|
||
|
|
||
|
if (StaticData::Instance().GetXmlInputType() != XmlIgnore) {
|
||
|
const std::vector<FactorType> &outputFactorOrder = StaticData::Instance().GetOutputFactorOrder();
|
||
|
for (size_t i=0; i<altTexts.size(); ++i) {
|
||
|
//only store options if we aren't ignoring them
|
||
|
//set default probability
|
||
|
float probValue = 1;
|
||
|
if (altProbs[i] != "") probValue = Scan<float>(altProbs[i]);
|
||
|
//Convert from prob to log-prob
|
||
|
float scoreValue = FloorScore(TransformScore(probValue));
|
||
|
|
||
|
TargetPhrase targetPhrase(Output);
|
||
|
targetPhrase.CreateFromString(outputFactorOrder,altTexts[i],StaticData::Instance().GetFactorDelimiter());
|
||
|
targetPhrase.SetScore(scoreValue);
|
||
|
WordsRange range(tagStart,tagEnd);
|
||
|
|
||
|
TranslationOption *option = new TranslationOption(range,targetPhrase,source);
|
||
|
assert(option);
|
||
|
|
||
|
res.push_back(option);
|
||
|
}
|
||
|
}
|
||
|
tagName= "";
|
||
|
tagContents = "";
|
||
|
altTexts.clear();
|
||
|
altProbs.clear();
|
||
|
doClose = false;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
line = rstr;
|
||
|
return res;
|
||
|
}
|
||
|
|
||
|
|