mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-10-26 19:37:58 +03:00
88 lines
2.3 KiB
C++
88 lines
2.3 KiB
C++
#include <fstream>
|
|
#include <iostream>
|
|
#include<string>
|
|
#include<sstream>
|
|
#include<vector>
|
|
#include<map>
|
|
#include "Desegmenter.h"
|
|
#include <boost/algorithm/string/replace.hpp>
|
|
|
|
using namespace std;
|
|
|
|
namespace Moses
|
|
{
|
|
void Desegmenter::Load(const string filename)
|
|
{
|
|
|
|
std::ifstream myFile(filename.c_str() );
|
|
if (myFile.is_open()) {
|
|
cerr << "Desegmentation File open successful." << endl;
|
|
string line;
|
|
while (getline(myFile, line)) {
|
|
stringstream ss(line);
|
|
string token;
|
|
vector<string> myline;
|
|
while (getline(ss, token, '\t')) {
|
|
myline.push_back(token);
|
|
}
|
|
mmDesegTable.insert(pair<string, string>(myline[2], myline[1] ));
|
|
}
|
|
myFile.close();
|
|
} else
|
|
cerr << "open() failed: check if Desegmentation file is in right folder" << endl;
|
|
}
|
|
|
|
|
|
vector<string> Desegmenter::Search(string myKey)
|
|
{
|
|
multimap<string, string>::const_iterator mmiPairFound = mmDesegTable.find(myKey);
|
|
vector<string> result;
|
|
if (mmiPairFound != mmDesegTable.end()) {
|
|
size_t nNumPairsInMap = mmDesegTable.count(myKey);
|
|
for (size_t nValuesCounter = 0; nValuesCounter < nNumPairsInMap; ++nValuesCounter) {
|
|
if (mmiPairFound != mmDesegTable.end()) {
|
|
result.push_back(mmiPairFound->second);
|
|
}
|
|
++mmiPairFound;
|
|
}
|
|
return result;
|
|
} else {
|
|
string rule_deseg ;
|
|
rule_deseg = ApplyRules(myKey);
|
|
result.push_back(rule_deseg);
|
|
return result;
|
|
}
|
|
}
|
|
|
|
|
|
string Desegmenter::ApplyRules(string & segToken)
|
|
{
|
|
|
|
string desegToken=segToken;
|
|
if (!simple) {
|
|
boost::replace_all(desegToken, "l+ All", "ll");
|
|
boost::replace_all(desegToken, "l+ Al", "ll");
|
|
boost::replace_all(desegToken, "y+ y ", "y");
|
|
boost::replace_all(desegToken, "p+ ", "t");
|
|
boost::replace_all(desegToken, "' +", "}");
|
|
boost::replace_all(desegToken, "y +", "A");
|
|
boost::replace_all(desegToken, "n +n", "n");
|
|
boost::replace_all(desegToken, "mn +m", "mm");
|
|
boost::replace_all(desegToken, "En +m", "Em");
|
|
boost::replace_all(desegToken, "An +lA", "Em");
|
|
boost::replace_all(desegToken, "-LRB-", "(");
|
|
boost::replace_all(desegToken, "-RRB-", ")");
|
|
}
|
|
|
|
boost::replace_all(desegToken, "+ +", "");
|
|
boost::replace_all(desegToken, "+ ", "");
|
|
boost::replace_all(desegToken, " +", "");
|
|
|
|
return desegToken;
|
|
}
|
|
|
|
// Destructor. The body is intentionally empty: no explicit cleanup is
// performed here.
Desegmenter::~Desegmenter()
{
}
|
|
|
|
}
|