mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-26 13:23:25 +03:00
fa51da28c5
Final commit in this round of refactoring (which started with commit 2f735998...). The main changes are: - a general storage mechanism for attribute/value pairs in XML-style tree / lattice input. E.g. the "pcfg-score" and "semantic-role" attributes in: <tree label="PRP" pcfg-score="1.0" semantic-role="AGENT"> I </tree> - consolidation of the various near-duplicate Tree / XmlTreeParser classes that have accumulated over the years (my fault) - miscellaneous de-crufting
431 lines
14 KiB
C++
431 lines
14 KiB
C++
/***********************************************************************
|
|
Moses - factored phrase-based language decoder
|
|
Copyright (C) 2006 University of Edinburgh
|
|
|
|
This library is free software; you can redistribute it and/or
|
|
modify it under the terms of the GNU Lesser General Public
|
|
License as published by the Free Software Foundation; either
|
|
version 2.1 of the License, or (at your option) any later version.
|
|
|
|
This library is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
Lesser General Public License for more details.
|
|
|
|
You should have received a copy of the GNU Lesser General Public
|
|
License along with this library; if not, write to the Free Software
|
|
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
***********************************************************************/
|
|
|
|
#include <cassert>
|
|
#include <vector>
|
|
#include <string>
|
|
#include <set>
|
|
#include <iostream>
|
|
#include <cstdlib>
|
|
#include <sstream>
|
|
|
|
#include "SyntaxNodeCollection.h"
|
|
#include "XmlException.h"
|
|
|
|
using namespace std;
|
|
|
|
namespace MosesTraining
|
|
{
|
|
|
|
inline std::vector<std::string> Tokenize(const std::string& str,
|
|
const std::string& delimiters = " \t")
|
|
{
|
|
std::vector<std::string> tokens;
|
|
// Skip delimiters at beginning.
|
|
std::string::size_type lastPos = str.find_first_not_of(delimiters, 0);
|
|
// Find first "non-delimiter".
|
|
std::string::size_type pos = str.find_first_of(delimiters, lastPos);
|
|
|
|
while (std::string::npos != pos || std::string::npos != lastPos) {
|
|
// Found a token, add it to the vector.
|
|
tokens.push_back(str.substr(lastPos, pos - lastPos));
|
|
// Skip delimiters. Note the "not_of"
|
|
lastPos = str.find_first_not_of(delimiters, pos);
|
|
// Find next "non-delimiter"
|
|
pos = str.find_first_of(delimiters, lastPos);
|
|
}
|
|
|
|
return tokens;
|
|
}
|
|
|
|
std::string Trim(const std::string& str, const std::string dropChars = " \t\n\r")
|
|
{
|
|
std::string res = str;
|
|
res.erase(str.find_last_not_of(dropChars)+1);
|
|
return res.erase(0, res.find_first_not_of(dropChars));
|
|
}
|
|
|
|
string ParseXmlTagAttribute(const string& tag,const string& attributeName)
|
|
{
|
|
/*TODO deal with unescaping \"*/
|
|
string tagOpen = attributeName + "=\"";
|
|
size_t contentsStart = tag.find(tagOpen);
|
|
if (contentsStart == string::npos) return "";
|
|
contentsStart += tagOpen.size();
|
|
size_t contentsEnd = tag.find_first_of('"',contentsStart+1);
|
|
if (contentsEnd == string::npos) {
|
|
cerr << "Malformed XML attribute: "<< tag;
|
|
return "";
|
|
}
|
|
size_t possibleEnd;
|
|
while (tag.at(contentsEnd-1) == '\\' && (possibleEnd = tag.find_first_of('"',contentsEnd+1)) != string::npos) {
|
|
contentsEnd = possibleEnd;
|
|
}
|
|
return tag.substr(contentsStart,contentsEnd-contentsStart);
|
|
}
|
|
|
|
// s should be a sequence of name=attribute pairs separated by whitespace.
|
|
// e.g. "label=\"S\" pcfg=\"-1.452\" foo=\"blah\\\"blah\""
|
|
void ParseXmlTagAttributes(const std::string &s,
|
|
std::map<std::string, std::string> &attributes)
|
|
{
|
|
std::size_t begin = 0;
|
|
while (true) {
|
|
std::size_t pos = s.find('=', begin);
|
|
if (pos == std::string::npos) {
|
|
return;
|
|
}
|
|
std::string name = Trim(s.substr(begin, pos-begin));
|
|
begin = s.find('"', pos+1);
|
|
if (begin == std::string::npos) {
|
|
throw XmlException("invalid tag content");
|
|
}
|
|
pos = s.find('"', begin+1);
|
|
if (pos == std::string::npos) {
|
|
throw XmlException("invalid tag content");
|
|
}
|
|
while (s[pos-1] == '\\') {
|
|
pos = s.find('"', pos+1);
|
|
if (pos == std::string::npos) {
|
|
throw XmlException("invalid tag content");
|
|
}
|
|
}
|
|
if (name != "label" && name != "span") {
|
|
attributes[name] = s.substr(begin+1, pos-begin-1);
|
|
}
|
|
begin = pos+1;
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Remove "<" and ">" from XML tag
|
|
*
|
|
* \param str xml token to be stripped
|
|
*/
|
|
string TrimXml(const string& str)
|
|
{
|
|
// too short to be xml token -> do nothing
|
|
if (str.size() < 2) return str;
|
|
|
|
// strip first and last character
|
|
if (str[0] == '<' && str[str.size() - 1] == '>') {
|
|
return str.substr(1, str.size() - 2);
|
|
}
|
|
// not an xml token -> do nothing
|
|
else {
|
|
return str;
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Check if the token is an XML tag, i.e. starts with "<"
|
|
*
|
|
* \param tag token to be checked
|
|
*/
|
|
bool isXmlTag(const string& tag)
|
|
{
|
|
return tag[0] == '<';
|
|
}
|
|
|
|
/**
|
|
* Unescape XML special characters.
|
|
*/
|
|
string unescape(const string& str)
|
|
{
|
|
string s;
|
|
s.reserve(str.size());
|
|
string::size_type n;
|
|
string::size_type start = 0;
|
|
while ((n = str.find('&', start)) != string::npos) {
|
|
s += str.substr(start, n-start);
|
|
string::size_type end = str.find(';', n);
|
|
assert(n != string::npos);
|
|
string name = str.substr(n+1, end-n-1);
|
|
if (name == "lt") {
|
|
s += string("<");
|
|
} else if (name == "gt") {
|
|
s += string(">");
|
|
} else if (name == "#91") {
|
|
s += string("[");
|
|
} else if (name == "#93") {
|
|
s += string("]");
|
|
} else if (name == "bra") {
|
|
s += string("[");
|
|
} else if (name == "ket") {
|
|
s += string("]");
|
|
} else if (name == "bar" || name == "#124") {
|
|
s += string("|");
|
|
} else if (name == "amp") {
|
|
s += string("&");
|
|
} else if (name == "apos") {
|
|
s += string("'");
|
|
} else if (name == "quot") {
|
|
s += string("\"");
|
|
} else {
|
|
// Currently only handles the following five XML escape sequences:
|
|
// < <
|
|
// > >
|
|
// & &
|
|
// ' '
|
|
// " "
|
|
// Numeric character references (like ö) are not supported.
|
|
std::ostringstream msg;
|
|
msg << "unsupported XML escape sequence: &" << name << ";";
|
|
throw XmlException(msg.str());
|
|
}
|
|
if (end == str.size()-1) {
|
|
return s;
|
|
}
|
|
start = end + 1;
|
|
}
|
|
s += str.substr(start);
|
|
return s;
|
|
}
|
|
|
|
/**
|
|
* Split up the input character string into tokens made up of
|
|
* either XML tags or text.
|
|
* example: this <b> is a </b> test .
|
|
* => (this ), (<b>), ( is a ), (</b>), ( test .)
|
|
*
|
|
* \param str input string
|
|
*/
|
|
vector<string> TokenizeXml(const string& str)
|
|
{
|
|
string lbrack = "<";
|
|
string rbrack = ">";
|
|
vector<string> tokens; // vector of tokens to be returned
|
|
string::size_type cpos = 0; // current position in string
|
|
string::size_type lpos = 0; // left start of xml tag
|
|
string::size_type rpos = 0; // right end of xml tag
|
|
|
|
// walk thorugh the string (loop vver cpos)
|
|
while (cpos != str.size()) {
|
|
// find the next opening "<" of an xml tag
|
|
lpos = str.find_first_of(lbrack, cpos);
|
|
if (lpos != string::npos) {
|
|
// find the end of the xml tag
|
|
rpos = str.find_first_of(rbrack, lpos);
|
|
// sanity check: there has to be closing ">"
|
|
if (rpos == string::npos) {
|
|
cerr << "ERROR: malformed XML: " << str << endl;
|
|
return tokens;
|
|
}
|
|
} else { // no more tags found
|
|
// add the rest as token
|
|
tokens.push_back(str.substr(cpos));
|
|
break;
|
|
}
|
|
|
|
// add stuff before xml tag as token, if there is any
|
|
if (lpos - cpos > 0)
|
|
tokens.push_back(str.substr(cpos, lpos - cpos));
|
|
|
|
// add xml tag as token
|
|
tokens.push_back(str.substr(lpos, rpos-lpos+1));
|
|
cpos = rpos + 1;
|
|
}
|
|
return tokens;
|
|
}
|
|
|
|
/**
|
|
* Process a sentence with XML-style annotation of syntactic nodes.
|
|
*
|
|
* \param line[in,out] in: sentence, out: sentence without the XML
|
|
* \param nodeCollection[out] the collection of SyntaxNode objects for this
|
|
* sentence
|
|
* \param labelCollection[out] label values are inserted into this set
|
|
* \param topLabelCollection[out] top labels (key) and their counts (value)
|
|
* are inserted into this map
|
|
* \param unescapeSpecialChars flag indicating whether XML special characters
|
|
* should be unescaped
|
|
*/
|
|
bool ProcessAndStripXMLTags(string &line, SyntaxNodeCollection &nodeCollection,
|
|
set< string > &labelCollection,
|
|
map< string, int > &topLabelCollection,
|
|
bool unescapeSpecialChars )
|
|
{
|
|
//parse XML markup in translation line
|
|
|
|
// no xml tag? we're done.
|
|
if (line.find_first_of('<') == string::npos) {
|
|
return true;
|
|
}
|
|
|
|
// break up input into a vector of xml tags and text
|
|
// example: (this), (<b>), (is a), (</b>), (test .)
|
|
vector<string> xmlTokens = TokenizeXml(line);
|
|
|
|
// we need to store opened tags, until they are closed
|
|
// tags are stored as tripled (tagname, startpos, contents)
|
|
typedef pair< string, pair< size_t, string > > OpenedTag;
|
|
vector< OpenedTag > tagStack; // stack that contains active opened tags
|
|
|
|
string cleanLine; // return string (text without xml)
|
|
size_t wordPos = 0; // position in sentence (in terms of number of words)
|
|
|
|
// loop through the tokens
|
|
for (size_t xmlTokenPos = 0 ; xmlTokenPos < xmlTokens.size() ; xmlTokenPos++) {
|
|
// not a xml tag, but regular text (may contain many words)
|
|
if(!isXmlTag(xmlTokens[xmlTokenPos])) {
|
|
// add a space at boundary, if necessary
|
|
if (cleanLine.size()>0 &&
|
|
cleanLine[cleanLine.size() - 1] != ' ' &&
|
|
xmlTokens[xmlTokenPos][0] != ' ') {
|
|
cleanLine += " ";
|
|
}
|
|
// add words to output
|
|
if (unescapeSpecialChars) {
|
|
cleanLine += unescape(xmlTokens[xmlTokenPos]);
|
|
} else {
|
|
cleanLine += xmlTokens[xmlTokenPos];
|
|
}
|
|
wordPos = Tokenize(cleanLine).size(); // count all the words
|
|
}
|
|
|
|
// process xml tag
|
|
else {
|
|
// *** get essential information about tag ***
|
|
|
|
// strip extra boundary spaces and "<" and ">"
|
|
string tag = Trim(TrimXml(xmlTokens[xmlTokenPos]));
|
|
// cerr << "XML TAG IS: " << tag << std::endl;
|
|
|
|
if (tag.size() == 0) {
|
|
cerr << "ERROR: empty tag name: " << line << endl;
|
|
return false;
|
|
}
|
|
|
|
// check if unary (e.g., "<wall/>")
|
|
bool isUnary = ( tag[tag.size() - 1] == '/' );
|
|
|
|
// check if opening tag (e.g. "<a>", not "</a>")g
|
|
bool isClosed = ( tag[0] == '/' );
|
|
bool isOpen = !isClosed;
|
|
|
|
if (isClosed && isUnary) {
|
|
cerr << "ERROR: can't have both closed and unary tag <" << tag << ">: " << line << endl;
|
|
return false;
|
|
}
|
|
|
|
if (isClosed)
|
|
tag = tag.substr(1); // remove "/" at the beginning
|
|
if (isUnary)
|
|
tag = tag.substr(0,tag.size()-1); // remove "/" at the end
|
|
|
|
// find the tag name and contents
|
|
string::size_type endOfName = tag.find_first_of(' ');
|
|
string tagName = tag;
|
|
string tagContent = "";
|
|
if (endOfName != string::npos) {
|
|
tagName = tag.substr(0,endOfName);
|
|
tagContent = tag.substr(endOfName+1);
|
|
}
|
|
|
|
// *** process new tag ***
|
|
|
|
if (isOpen || isUnary) {
|
|
// put the tag on the tag stack
|
|
OpenedTag openedTag = make_pair( tagName, make_pair( wordPos, tagContent ) );
|
|
tagStack.push_back( openedTag );
|
|
// cerr << "XML TAG " << tagName << " (" << tagContent << ") added to stack, now size " << tagStack.size() << endl;
|
|
}
|
|
|
|
// *** process completed tag ***
|
|
|
|
if (isClosed || isUnary) {
|
|
// pop last opened tag from stack;
|
|
if (tagStack.size() == 0) {
|
|
cerr << "ERROR: tag " << tagName << " closed, but not opened" << ":" << line << endl;
|
|
return false;
|
|
}
|
|
OpenedTag openedTag = tagStack.back();
|
|
tagStack.pop_back();
|
|
|
|
// tag names have to match
|
|
if (openedTag.first != tagName) {
|
|
cerr << "ERROR: tag " << openedTag.first << " closed by tag " << tagName << ": " << line << endl;
|
|
return false;
|
|
}
|
|
|
|
// assemble remaining information about tag
|
|
size_t startPos = openedTag.second.first;
|
|
string tagContent = openedTag.second.second;
|
|
size_t endPos = wordPos;
|
|
|
|
// span attribute overwrites position
|
|
string span = ParseXmlTagAttribute(tagContent,"span");
|
|
if (! span.empty()) {
|
|
vector<string> ij = Tokenize(span, "-");
|
|
if (ij.size() != 1 && ij.size() != 2) {
|
|
cerr << "ERROR: span attribute must be of the form \"i-j\" or \"i\": " << line << endl;
|
|
return false;
|
|
}
|
|
startPos = atoi(ij[0].c_str());
|
|
if (ij.size() == 1) endPos = startPos + 1;
|
|
else endPos = atoi(ij[1].c_str()) + 1;
|
|
}
|
|
|
|
// cerr << "XML TAG " << tagName << " (" << tagContent << ") spanning " << startPos << " to " << (endPos-1) << " complete, commence processing" << endl;
|
|
|
|
if (startPos > endPos) {
|
|
cerr << "ERROR: tag " << tagName << " startPos is bigger than endPos (" << startPos << "-" << endPos << "): " << line << endl;
|
|
return false;
|
|
} else if (startPos == endPos) {
|
|
cerr << "WARNING: tag " << tagName << ". Ignoring 0 span (" << startPos << "-" << endPos << "): " << line << endl;
|
|
continue;
|
|
}
|
|
|
|
string label = ParseXmlTagAttribute(tagContent,"label");
|
|
labelCollection.insert( label );
|
|
|
|
// report what we have processed so far
|
|
if (0) {
|
|
cerr << "XML TAG NAME IS: '" << tagName << "'" << endl;
|
|
cerr << "XML TAG LABEL IS: '" << label << "'" << endl;
|
|
cerr << "XML SPAN IS: " << startPos << "-" << (endPos-1) << endl;
|
|
}
|
|
SyntaxNode *node = nodeCollection.AddNode( startPos, endPos-1, label );
|
|
ParseXmlTagAttributes(tagContent, node->attributes);
|
|
}
|
|
}
|
|
}
|
|
// we are done. check if there are tags that are still open
|
|
if (tagStack.size() > 0) {
|
|
cerr << "ERROR: some opened tags were never closed: " << line << endl;
|
|
return false;
|
|
}
|
|
|
|
// collect top labels
|
|
const vector< SyntaxNode* >& topNodes = nodeCollection.GetNodes( 0, wordPos-1 );
|
|
for( vector< SyntaxNode* >::const_iterator node = topNodes.begin(); node != topNodes.end(); node++ ) {
|
|
SyntaxNode *n = *node;
|
|
const string &label = n->label;
|
|
if (topLabelCollection.find( label ) == topLabelCollection.end())
|
|
topLabelCollection[ label ] = 0;
|
|
topLabelCollection[ label ]++;
|
|
}
|
|
|
|
// return de-xml'ed sentence in line
|
|
line = cleanLine;
|
|
return true;
|
|
}
|
|
|
|
}
|