mosesdecoder/moses/FF/Dsg-Feature/Desegmenter.cpp
2016-04-11 00:00:43 +01:00

88 lines
2.3 KiB
C++

#include <fstream>
#include <iostream>
#include<string>
#include<sstream>
#include<vector>
#include<map>
#include "Desegmenter.h"
#include <boost/algorithm/string/replace.hpp>
using namespace std;
namespace Moses
{
/**
 * Load the desegmentation table from a tab-separated text file.
 *
 * Every line is split on '\t'. The third column is used as the key and the
 * second column as the value of an entry inserted into mmDesegTable.
 * Lines with fewer than three columns (blank or truncated lines) cannot
 * provide both fields and are skipped; the number of skipped lines is
 * reported once on stderr.
 *
 * If the file cannot be opened, an error is printed to stderr and the table
 * is left unchanged.
 *
 * @param filename path of the desegmentation table file.
 */
void Desegmenter::Load(const string filename)
{
  std::ifstream myFile(filename.c_str());
  if (!myFile.is_open()) {
    cerr << "open() failed: check if Desegmentation file is in right folder" << endl;
    return;
  }
  cerr << "Desegmentation File open successful." << endl;

  size_t skippedLines = 0;
  string line;
  while (getline(myFile, line)) {
    // Split the line into its tab-separated columns.
    stringstream ss(line);
    string token;
    vector<string> fields;
    while (getline(ss, token, '\t')) {
      fields.push_back(token);
    }

    // Indexing fields[1] / fields[2] on a short line would be out of bounds.
    if (fields.size() < 3) {
      ++skippedLines;
      continue;
    }
    mmDesegTable.insert(pair<string, string>(fields[2], fields[1]));
  }

  if (skippedLines > 0) {
    cerr << "Desegmentation File: skipped " << skippedLines
         << " line(s) with fewer than 3 tab-separated columns." << endl;
  }
  // The ifstream destructor closes the file when myFile goes out of scope.
}
/**
 * Look up the desegmented form(s) of a segmented token.
 *
 * @param myKey segmented token to look up in mmDesegTable.
 * @return every value stored under myKey in the table; if the key is not
 *         present, a single-element vector holding ApplyRules(myKey).
 */
vector<string> Desegmenter::Search(string myKey)
{
  typedef multimap<string, string>::const_iterator TableIter;

  // equal_range yields exactly the run of entries whose key equals myKey.
  // (find() may return any element of that run, so stepping count() times
  // from it could leave the run or walk past end().)
  const pair<TableIter, TableIter> matches = mmDesegTable.equal_range(myKey);

  vector<string> result;
  for (TableIter it = matches.first; it != matches.second; ++it) {
    result.push_back(it->second);
  }

  // Key not in the table: fall back to rule-based desegmentation.
  if (result.empty()) {
    result.push_back(ApplyRules(myKey));
  }
  return result;
}
/**
 * Desegment a token with fixed string-replacement rules.
 *
 * A copy of segToken is rewritten in place by successive boost::replace_all
 * calls. The extended rules run only when the `simple` member is false; the
 * three joining rules that remove "+" markers and the adjacent space always
 * run, and run last.
 *
 * Rule order is significant: each replacement operates on the output of the
 * previous ones.
 *
 * @param segToken segmented token; it is not modified.
 * @return the rewritten token.
 */
string Desegmenter::ApplyRules(string & segToken)
{
  // {pattern, replacement} pairs applied only when `simple` is false.
  // NOTE(review): "An +lA" maps to the same replacement as "En +m" ("Em");
  // confirm this is intended and not a copy-paste slip.
  static const char* const kExtendedRules[][2] = {
    {"l+ All", "ll"},
    {"l+ Al", "ll"},
    {"y+ y ", "y"},
    {"p+ ", "t"},
    {"' +", "}"},
    {"y +", "A"},
    {"n +n", "n"},
    {"mn +m", "mm"},
    {"En +m", "Em"},
    {"An +lA", "Em"},
    {"-LRB-", "("},
    {"-RRB-", ")"}
  };
  // {pattern, replacement} pairs that are always applied.
  static const char* const kJoinRules[][2] = {
    {"+ +", ""},
    {"+ ", ""},
    {" +", ""}
  };

  string output(segToken);

  if (!simple) {
    const size_t extendedCount = sizeof(kExtendedRules) / sizeof(kExtendedRules[0]);
    for (size_t i = 0; i < extendedCount; ++i) {
      boost::replace_all(output, string(kExtendedRules[i][0]), string(kExtendedRules[i][1]));
    }
  }

  const size_t joinCount = sizeof(kJoinRules) / sizeof(kJoinRules[0]);
  for (size_t i = 0; i < joinCount; ++i) {
    boost::replace_all(output, string(kJoinRules[i][0]), string(kJoinRules[i][1]));
  }

  return output;
}
// Destructor: the body is empty; no explicit cleanup is performed here.
Desegmenter::~Desegmenter() {}
}