mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-28 14:32:38 +03:00
escaping
This commit is contained in:
parent
2a7e2c5fc5
commit
5c45762f67
@ -6,6 +6,7 @@
|
||||
*/
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <algorithm>
|
||||
#include <fstream>
|
||||
#include "EnOpenNLPChunker.h"
|
||||
#include "moses/Util.h"
|
||||
@ -32,6 +33,7 @@ void EnOpenNLPChunker::Process(std::istream &in, std::ostream &out)
|
||||
|
||||
string line;
|
||||
while (getline(in, line)) {
|
||||
Unescape(line);
|
||||
inFile << line << endl;
|
||||
}
|
||||
inFile.close();
|
||||
@ -46,7 +48,7 @@ void EnOpenNLPChunker::Process(std::istream &in, std::ostream &out)
|
||||
+ m_openNLPPath + "/bin/opennlp ChunkerME "
|
||||
+ m_openNLPPath + "/models/en-chunker.bin > "
|
||||
+ outStr;
|
||||
//cerr << "Executing:" << cmd << endl;
|
||||
//g << "Executing:" << cmd << endl;
|
||||
int ret = system(cmd.c_str());
|
||||
|
||||
// read result of chunker and output as Moses xml trees
|
||||
@ -68,35 +70,53 @@ void EnOpenNLPChunker::Process(std::istream &in, std::ostream &out)
|
||||
|
||||
void EnOpenNLPChunker::MosesReformat(const string &line, std::ostream &out)
|
||||
{
|
||||
cerr << "REFORMATING:" << line << endl;
|
||||
//cerr << "REFORMATING:" << line << endl;
|
||||
vector<string> toks;
|
||||
Moses::Tokenize(toks, line);
|
||||
for (size_t i = 0; i < toks.size(); ++i) {
|
||||
const string &tok = toks[i];
|
||||
|
||||
if (tok.substr(0, 1) == "[") {
|
||||
if (tok.substr(0, 1) == "[" && tok.substr(1,1) != "_") {
|
||||
// start of chunk
|
||||
string label = tok.substr(1);
|
||||
out << "<tree label='" << label << "'>";
|
||||
out << "<tree label=\"" << label << "\">";
|
||||
}
|
||||
else if (tok.substr(tok.size()-1, 1) == "]") {
|
||||
// end of chunk
|
||||
if (tok.size() > 1) {
|
||||
if (tok.substr(1,1) == "_") {
|
||||
// just a word that happens to be ]
|
||||
vector<string> factors;
|
||||
Moses::Tokenize(factors, tok, "_");
|
||||
assert(factors.size() == 2);
|
||||
|
||||
Escape(factors[0]);
|
||||
out << factors[0] << " ";
|
||||
}
|
||||
else {
|
||||
// a word and end of tree
|
||||
string word = tok.substr(0, tok.size()-1);
|
||||
|
||||
vector<string> factors;
|
||||
Moses::Tokenize(factors, word, "_");
|
||||
assert(factors.size() == 2);
|
||||
|
||||
Escape(factors[0]);
|
||||
out << factors[0] << " ";
|
||||
}
|
||||
|
||||
out << "</tree> ";
|
||||
}
|
||||
else {
|
||||
out << "</tree> ";
|
||||
}
|
||||
|
||||
}
|
||||
else {
|
||||
// lexical item
|
||||
vector<string> factors;
|
||||
Moses::Tokenize(factors, tok, "_");
|
||||
if (factors.size() == 2) {
|
||||
Escape(factors[0]);
|
||||
out << factors[0] << " ";
|
||||
}
|
||||
else if (factors.size() == 1) {
|
||||
@ -110,3 +130,47 @@ void EnOpenNLPChunker::MosesReformat(const string &line, std::ostream &out)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Return a copy of `original` in which every non-overlapping occurrence of
 * `before` has been replaced by `after`.
 *
 * Matching proceeds left to right and resumes immediately after each
 * replaced occurrence, so text inserted by `after` is never re-scanned.
 *
 * @param original  text to search
 * @param before    substring to look for; if empty, `original` is returned
 *                  unchanged
 * @param after     replacement text
 * @return the rewritten string
 */
std::string
replaceAll( std::string const& original,
            std::string const& before,
            std::string const& after )
{
  // An empty pattern matches at every position without consuming any input,
  // so the scan below would never advance and would loop forever.
  if ( before.empty() ) {
    return original;
  }

  std::string retval;
  retval.reserve( original.size() );

  std::string::size_type current = 0;
  std::string::size_type next = original.find( before, current );
  while ( next != std::string::npos ) {
    // copy the untouched text before the match, then the replacement
    retval.append( original, current, next - current );
    retval.append( after );
    current = next + before.size();
    next = original.find( before, current );
  }
  // copy whatever follows the last match (or the whole input if none)
  retval.append( original, current, std::string::npos );
  return retval;
}
|
||||
|
||||
/**
 * Replace characters that are special in Moses/XML markup with their
 * entity encodings, in place.
 *
 * NOTE(review): the entity strings below follow the Moses tokenizer
 * escaping convention; confirm against upstream before merging.
 *
 * @param line  text to escape; overwritten with the escaped form
 */
void EnOpenNLPChunker::Escape(string &line)
{
  // '&' must be handled first: every later replacement introduces an '&'
  // that must not be escaped a second time.
  line = replaceAll(line, "&", "&amp;");
  line = replaceAll(line, "|", "&#124;");
  line = replaceAll(line, "<", "&lt;");
  line = replaceAll(line, ">", "&gt;");
  line = replaceAll(line, "'", "&apos;");
  line = replaceAll(line, "\"", "&quot;");
  line = replaceAll(line, "[", "&#91;");
  line = replaceAll(line, "]", "&#93;");
}
|
||||
|
||||
/**
 * Reverse of Escape(): turn the entity encodings of the Moses special
 * characters back into the literal characters, in place.
 *
 * NOTE(review): the entity strings below follow the Moses tokenizer
 * escaping convention; confirm against upstream before merging.
 *
 * @param line  text to unescape; overwritten with the unescaped form
 */
void EnOpenNLPChunker::Unescape(string &line)
{
  line = replaceAll(line, "&#124;", "|");
  line = replaceAll(line, "&lt;", "<");
  line = replaceAll(line, "&gt;", ">");
  line = replaceAll(line, "&quot;", "\"");
  line = replaceAll(line, "&apos;", "'");
  line = replaceAll(line, "&#91;", "[");
  line = replaceAll(line, "&#93;", "]");
  // '&amp;' is decoded last so that an escaped ampersand followed by entity
  // text (e.g. "&amp;lt;") is not decoded twice.
  line = replaceAll(line, "&amp;", "&");
}
|
||||
|
@ -18,6 +18,9 @@ public:
|
||||
protected:
|
||||
const std::string m_openNLPPath;
|
||||
|
||||
void Escape(std::string &line);
|
||||
void Unescape(std::string &line);
|
||||
|
||||
void MosesReformat(const std::string &line, std::ostream &out);
|
||||
};
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user