Merge ../mosesdecoder into perf_moses2

This commit is contained in:
Hieu Hoang 2016-04-12 23:04:51 +04:00
commit 1ff1d04f76
9 changed files with 641 additions and 612 deletions

View File

@ -11,73 +11,74 @@ using namespace std;
namespace Moses
{
// Load the desegmentation table from a tab-separated file into mmDesegTable.
// NOTE(review): this span is merge/diff residue — TWO copies of the body are
// concatenated (old formatting first, reformatted second), so `myFile` is
// declared twice; only one copy should survive.
void Desegmenter::Load(const string filename){
void Desegmenter::Load(const string filename)
{
std::ifstream myFile(filename.c_str() );
if (myFile.is_open()){
cerr << "Desegmentation File open successful." << endl;
string line;
while (getline(myFile, line)){
stringstream ss(line);
string token;
vector<string> myline;
while (getline(ss, token, '\t')){
myline.push_back(token);
}
// Column 2 is the key (segmented form), column 1 the value (surface form).
// NOTE(review): assumes every line has >= 3 tab-separated fields — unchecked.
mmDesegTable.insert(pair<string, string>(myline[2], myline[1] ));
}
myFile.close();
}
else
cerr << "open() failed: check if Desegmentation file is in right folder" << endl;
// Second (reformatted) copy of the same body — diff artifact.
std::ifstream myFile(filename.c_str() );
if (myFile.is_open()) {
cerr << "Desegmentation File open successful." << endl;
string line;
while (getline(myFile, line)) {
stringstream ss(line);
string token;
vector<string> myline;
while (getline(ss, token, '\t')) {
myline.push_back(token);
}
mmDesegTable.insert(pair<string, string>(myline[2], myline[1] ));
}
myFile.close();
} else
cerr << "open() failed: check if Desegmentation file is in right folder" << endl;
}
vector<string> Desegmenter::Search(string myKey){
multimap<string, string>::const_iterator mmiPairFound = mmDesegTable.find(myKey);
vector<string> result;
if (mmiPairFound != mmDesegTable.end()){
size_t nNumPairsInMap = mmDesegTable.count(myKey);
for (size_t nValuesCounter = 0; nValuesCounter < nNumPairsInMap; ++nValuesCounter){
if (mmiPairFound != mmDesegTable.end()) {
result.push_back(mmiPairFound->second);
}
++mmiPairFound;
}
return result;
}
else{
string rule_deseg ;
rule_deseg = ApplyRules(myKey);
result.push_back(rule_deseg);
return result;
}
// Return every desegmented surface form stored for myKey in the table.
// If the key is unknown, back off to rule-based desegmentation
// (ApplyRules) and return that single candidate.
vector<string> Desegmenter::Search(string myKey)
{
  vector<string> candidates;
  typedef multimap<string, string>::const_iterator TableIter;
  std::pair<TableIter, TableIter> range = mmDesegTable.equal_range(myKey);
  if (range.first != range.second) {
    // Known key: collect all stored desegmentations for it.
    for (TableIter it = range.first; it != range.second; ++it) {
      candidates.push_back(it->second);
    }
  } else {
    // Unknown key: fall back to the rule-based desegmenter.
    candidates.push_back(ApplyRules(myKey));
  }
  return candidates;
}
// Rule-based desegmentation: rewrite a space-separated sequence of
// segmented morphemes into a single surface token via string substitution.
// The '+' characters mark morpheme boundaries; the replacement pairs look
// like Buckwalter-transliterated Arabic — TODO confirm.
// NOTE(review): this span is merge/diff residue — two copies of the body
// appear back to back; the second copy is unreachable after the first
// `return desegToken;` and should be removed when the merge is resolved.
string Desegmenter::ApplyRules(string & segToken){
string Desegmenter::ApplyRules(string & segToken)
{
string desegToken=segToken;
// `simple` selects between the plain marker-stripping scheme (true) and
// the full rewrite-rule scheme (false).
if (!simple){
boost::replace_all(desegToken, "l+ All", "ll");
boost::replace_all(desegToken, "l+ Al", "ll");
boost::replace_all(desegToken, "y+ y ", "y");
boost::replace_all(desegToken, "p+ ", "t");
boost::replace_all(desegToken, "' +", "}");
boost::replace_all(desegToken, "y +", "A");
boost::replace_all(desegToken, "n +n", "n");
boost::replace_all(desegToken, "mn +m", "mm");
boost::replace_all(desegToken, "En +m", "Em");
boost::replace_all(desegToken, "An +lA", "Em");
boost::replace_all(desegToken, "-LRB-", "(");
boost::replace_all(desegToken, "-RRB-", ")");
}
// Finally erase the remaining segmentation markers and joining spaces.
boost::replace_all(desegToken, "+ +", "");
boost::replace_all(desegToken, "+ ", "");
boost::replace_all(desegToken, " +", "");
return desegToken;
// Second (reformatted) copy of the same body — diff artifact, unreachable.
string desegToken=segToken;
if (!simple) {
boost::replace_all(desegToken, "l+ All", "ll");
boost::replace_all(desegToken, "l+ Al", "ll");
boost::replace_all(desegToken, "y+ y ", "y");
boost::replace_all(desegToken, "p+ ", "t");
boost::replace_all(desegToken, "' +", "}");
boost::replace_all(desegToken, "y +", "A");
boost::replace_all(desegToken, "n +n", "n");
boost::replace_all(desegToken, "mn +m", "mm");
boost::replace_all(desegToken, "En +m", "Em");
boost::replace_all(desegToken, "An +lA", "Em");
boost::replace_all(desegToken, "-LRB-", "(");
boost::replace_all(desegToken, "-RRB-", ")");
}
boost::replace_all(desegToken, "+ +", "");
boost::replace_all(desegToken, "+ ", "");
boost::replace_all(desegToken, " +", "");
return desegToken;
}
Desegmenter::~Desegmenter()

View File

@ -11,21 +11,23 @@ namespace Moses
// Maps segmented morpheme sequences back to surface word forms, using a
// table loaded from disk with a rule-based fallback (ApplyRules).
// NOTE(review): merge/diff residue — the private fields and the
// constructor/getFileName definitions each appear TWICE below; the
// duplicates are redefinitions and one copy must be removed.
class Desegmenter
{
private:
std::multimap<string, string> mmDesegTable;
std::string filename;
bool simple;
void Load(const string filename);
std::multimap<string, string> mmDesegTable;
std::string filename;
bool simple;
void Load(const string filename);
public:
// Loads the table eagerly at construction; `scheme` selects the simple
// desegmentation scheme (true) vs. the rule-based one (false).
Desegmenter(const std::string& file, const bool scheme){
filename = file;
simple=scheme;
Load(filename);
}
string getFileName(){ return filename; }
vector<string> Search(string myKey);
string ApplyRules(string &);
~Desegmenter();
Desegmenter(const std::string& file, const bool scheme) {
filename = file;
simple=scheme;
Load(filename);
}
string getFileName() {
return filename;
}
// Table lookup with rule-based backoff; returns all candidates.
vector<string> Search(string myKey);
string ApplyRules(string &);
~Desegmenter();
};
}

View File

@ -10,147 +10,147 @@ using namespace lm::ngram;
namespace Moses
{
DesegModel::DesegModel(const std::string &line)
:StatefulFeatureFunction(5, line )
{
tFactor = 0;
order=5;
numFeatures = 5;
optimistic = 1;
ReadParameters();
// Stateful desegmentation LM feature: 5 score components by default
// (LM prob + 3 discontiguity features + unsegmented-word penalty).
// ReadParameters() dispatches the moses.ini key=value pairs to SetParameter.
DesegModel::DesegModel(const std::string &line)
:StatefulFeatureFunction(5, line )
{
tFactor = 0;
order=5;
numFeatures = 5;
optimistic = 1;
ReadParameters();
}
// Release the heap objects created in readLanguageModel().
// Fix: the original deleted only DSGM and leaked the Desegmenter
// pointed to by desegT.
// NOTE(review): neither pointer is null-initialised in the constructor,
// so destroying a DesegModel whose Load() was never called was already
// undefined for DSGM — confirm Load() always precedes destruction.
DesegModel::~DesegModel()
{
  delete DSGM;
  delete desegT;
}
// Build the desegmentation LM and the desegmentation table.
// NOTE(review): the `lmFile` parameter is ignored — the LM is constructed
// from the m_lmPath member instead; `startState` is an unused local; and a
// second call would leak the previously allocated DSGM and desegT.
void DesegModel :: readLanguageModel(const char *lmFile)
{
DSGM = ConstructDsgLM(m_lmPath.c_str());
State startState = DSGM->NullContextState();
desegT=new Desegmenter(m_desegPath,m_simple);// Desegmentation Table
}
// Framework entry point: store decoder options, then construct the LM and
// desegmentation table (paths were set earlier via SetParameter).
void DesegModel::Load(AllOptions::ptr const& opts)
{
m_options = opts;
readLanguageModel(m_lmPath.c_str());
}
void DesegModel:: EvaluateInIsolation(const Phrase &source
, const TargetPhrase &targetPhrase
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection &estimatedScores) const
{
dsgHypothesis obj;
vector <string> myTargetPhrase;
vector<float> scores;
vector<string> targ_phrase; //stores the segmented tokens in the target phrase
const AlignmentInfo &align = targetPhrase.GetAlignTerm();
for (int i = 0; i < targetPhrase.GetSize(); i++) {
targ_phrase.push_back(targetPhrase.GetWord(i).GetFactor(tFactor)->GetString().as_string());
}
DesegModel::~DesegModel()
{
delete DSGM;
obj.setState(DSGM->NullContextState());
obj.setPhrases(targ_phrase);
obj.calculateDsgProbinIsol(*DSGM,*desegT,align);
obj.populateScores(scores,numFeatures);
estimatedScores.PlusEquals(this, scores);
}
FFState* DesegModel::EvaluateWhenApplied(
const Hypothesis& cur_hypo,
const FFState* prev_state,
ScoreComponentCollection* accumulator) const
{
const TargetPhrase &target = cur_hypo.GetCurrTargetPhrase();
const Range &src_rng =cur_hypo.GetCurrSourceWordsRange();
const AlignmentInfo &align = cur_hypo.GetCurrTargetPhrase().GetAlignTerm();
size_t sourceOffset = src_rng.GetStartPos();
dsgHypothesis obj;
vector<float> scores;
vector<string> targ_phrase; //stores the segmented tokens in the target phrase
bool isCompleted;
isCompleted=cur_hypo.IsSourceCompleted();
for (int i = 0; i < cur_hypo.GetCurrTargetLength(); i++) {
targ_phrase.push_back(target.GetWord(i).GetFactor(tFactor)->GetString().as_string());
}
void DesegModel :: readLanguageModel(const char *lmFile)
{
DSGM = ConstructDsgLM(m_lmPath.c_str());
State startState = DSGM->NullContextState();
desegT=new Desegmenter(m_desegPath,m_simple);// Desegmentation Table
}
obj.setState(prev_state);
obj.setPhrases( targ_phrase );
obj.calculateDsgProb(*DSGM,*desegT,isCompleted,align, sourceOffset, optimistic);
obj.populateScores(scores,numFeatures);
accumulator->PlusEquals(this, scores);
return obj.saveState();
}
// Chart (hierarchical) decoding is not implemented for this feature;
// reaching this overload is a fatal configuration error.
// Fix: the original message read "not support by UTIL_THROW2" — it named
// the throw macro instead of the feature and was ungrammatical.
FFState* DesegModel::EvaluateWhenApplied(
  const ChartHypothesis& /* cur_hypo */,
  int /* featureID - used to index the state in the previous hypotheses */,
  ScoreComponentCollection* accumulator) const
{
  UTIL_THROW2("Chart decoding not supported by DesegModel");
}
// Initial feature state for an empty hypothesis: wraps the LM's
// begin-of-sentence context. Returns a heap copy; the caller owns it.
const FFState* DesegModel::EmptyHypothesisState(const InputType &input) const
{
VERBOSE(3,"DesegModel::EmptyHypothesisState()" << endl);
State startState = DSGM->BeginSentenceState();
dsgState ss= dsgState(startState);
return new dsgState(ss);
}
// Short name used for this feature's weights in configuration/reporting.
std::string DesegModel::GetScoreProducerWeightShortName(unsigned idx) const
{
return "dsg";
}
// NOTE(review): duplicate of DesegModel::Load above — merge/diff residue;
// a second definition is an ODR violation and one copy must be removed.
void DesegModel::Load(AllOptions::ptr const& opts)
{
m_options = opts;
readLanguageModel(m_lmPath.c_str());
}
void DesegModel::SetParameter(const std::string& key, const std::string& value)
{
// Estimate the desegmentation-LM score of a target phrase in isolation
// (no preceding context): score from the LM's null-context state and add
// the result to the phrase's estimated (future) scores.
void DesegModel:: EvaluateInIsolation(const Phrase &source
, const TargetPhrase &targetPhrase
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection &estimatedScores) const
{
dsgHypothesis obj;
vector <string> myTargetPhrase; // NOTE(review): unused local
vector<float> scores;
vector<string> targ_phrase; //stores the segmented tokens in the target phrase
const AlignmentInfo &align = targetPhrase.GetAlignTerm();
// Collect the string of each target word's tFactor factor.
// NOTE(review): signed i vs. unsigned GetSize() — harmless but noisy.
for (int i = 0; i < targetPhrase.GetSize(); i++) {
targ_phrase.push_back(targetPhrase.GetWord(i).GetFactor(tFactor)->GetString().as_string());
}
obj.setState(DSGM->NullContextState());
obj.setPhrases(targ_phrase);
obj.calculateDsgProbinIsol(*DSGM,*desegT,align);
obj.populateScores(scores,numFeatures);
estimatedScores.PlusEquals(this, scores);
}
// Score a hypothesis extension during phrase-based search: continue the
// desegmentation LM from the previous state over the newly added target
// words, accumulate the feature scores, and return the new state
// (heap-allocated; ownership passes to the search).
FFState* DesegModel::EvaluateWhenApplied(
const Hypothesis& cur_hypo,
const FFState* prev_state,
ScoreComponentCollection* accumulator) const
{
const TargetPhrase &target = cur_hypo.GetCurrTargetPhrase();
const Range &src_rng =cur_hypo.GetCurrSourceWordsRange();
const AlignmentInfo &align = cur_hypo.GetCurrTargetPhrase().GetAlignTerm();
// Offset to turn phrase-local alignment points into sentence positions.
size_t sourceOffset = src_rng.GetStartPos();
dsgHypothesis obj;
vector<float> scores;
vector<string> targ_phrase; //stores the segmented tokens in the target phrase
bool isCompleted;
isCompleted=cur_hypo.IsSourceCompleted();
for (int i = 0; i < cur_hypo.GetCurrTargetLength(); i++) {
targ_phrase.push_back(target.GetWord(i).GetFactor(tFactor)->GetString().as_string());
}
obj.setState(prev_state);
obj.setPhrases( targ_phrase );
obj.calculateDsgProb(*DSGM,*desegT,isCompleted,align, sourceOffset, optimistic);
obj.populateScores(scores,numFeatures);
accumulator->PlusEquals(this, scores);
return obj.saveState();
}
// Chart (hierarchical) decoding is not implemented for this feature.
// Fix: the original message read "not support by UTIL_THROW2" — it named
// the throw macro instead of the feature and was ungrammatical.
// NOTE(review): this definition duplicates the one above (merge residue);
// one copy must be removed when the merge is resolved.
FFState* DesegModel::EvaluateWhenApplied(
  const ChartHypothesis& /* cur_hypo */,
  int /* featureID - used to index the state in the previous hypotheses */,
  ScoreComponentCollection* accumulator) const
{
  UTIL_THROW2("Chart decoding not supported by DesegModel");
}
// NOTE(review): duplicate definition (merge/diff residue) — see the copy
// above; one must be removed. Returns a heap copy; the caller owns it.
const FFState* DesegModel::EmptyHypothesisState(const InputType &input) const
{
VERBOSE(3,"DesegModel::EmptyHypothesisState()" << endl);
State startState = DSGM->BeginSentenceState();
dsgState ss= dsgState(startState);
return new dsgState(ss);
}
// NOTE(review): duplicate definition (merge/diff residue) — see above.
std::string DesegModel::GetScoreProducerWeightShortName(unsigned idx) const
{
return "dsg";
}
/**
 * Configure the feature from a moses.ini "key=value" pair.
 * Unrecognised keys are forwarded to the base class.
 *
 * Fixes relative to the merged original:
 *  - "deseg-path" assigned the table path via Scan<int>(value); that int
 *    bound to std::string::operator=(char) and silently replaced the path
 *    with a single garbage character. The path is now stored verbatim.
 *  - the diff residue (the old copy of the if/else chain nested inside
 *    the "optimistic" branch) is collapsed into one chain.
 */
void DesegModel::SetParameter(const std::string& key, const std::string& value)
{
  if (key == "path") {
    m_lmPath = value;
  } else if (key == "contiguity-features") {
    // "no" keeps only the LM score; otherwise all 5 components are active.
    if (value == "no")
      numFeatures = 1;
    else
      numFeatures = 5;
  } else if (key == "output-factor") {
    tFactor = Scan<int>(value);
  } else if (key == "optimistic") {
    if (value == "n")
      optimistic = 0;
    else
      optimistic = 1;
  } else if (key == "deseg-path") {
    // Path to the desegmentation table (string, not an int).
    m_desegPath = value;
  } else if (key == "deseg-scheme") {
    // "s" selects the simple scheme; anything else the rule-based one.
    if (value == "s")
      m_simple = 1;
    else
      m_simple = 0;
  } else if (key == "order") {
    order = Scan<int>(value);
  } else {
    StatefulFeatureFunction::SetParameter(key, value);
  }
}
// The desegmentation model reads only factor 0, so the feature is usable
// exactly when that factor is present in the mask.
bool DesegModel::IsUseable(const FactorMask &mask) const
{
  return mask[0];
}
// NOTE(review): duplicate definition (merge/diff residue) — see above.
// Usable iff factor 0 is present in the mask.
bool DesegModel::IsUseable(const FactorMask &mask) const
{
bool ret = mask[0];
return ret;
}
} // namespace

View File

@ -13,52 +13,52 @@
namespace Moses
{
class DesegModel : public StatefulFeatureFunction
{
public:
class DesegModel : public StatefulFeatureFunction
{
public:
DsgLM * DSGM;
Desegmenter* desegT;
int tFactor;// Target Factor ...
int order;
int numFeatures; // Number of features used an be 1 (unsegmented LM)or 5 (with 3 contiguity features and 1 UnsegWP)
bool optimistic;
DsgLM * DSGM;
Desegmenter* desegT;
int tFactor;// Target Factor ...
int order;
int numFeatures; // Number of features used an be 1 (unsegmented LM)or 5 (with 3 contiguity features and 1 UnsegWP)
bool optimistic;
DesegModel(const std::string &line);
~DesegModel();
DesegModel(const std::string &line);
~DesegModel();
void readLanguageModel(const char *);
void Load(AllOptions::ptr const& opts);
void readLanguageModel(const char *);
void Load(AllOptions::ptr const& opts);
FFState* EvaluateWhenApplied(
const Hypothesis& cur_hypo,
const FFState* prev_state,
ScoreComponentCollection* accumulator) const;
FFState* EvaluateWhenApplied(
const Hypothesis& cur_hypo,
const FFState* prev_state,
ScoreComponentCollection* accumulator) const;
virtual FFState* EvaluateWhenApplied(
const ChartHypothesis& /* cur_hypo */,
int /* featureID - used to index the state in the previous hypotheses */,
ScoreComponentCollection* accumulator) const;
virtual FFState* EvaluateWhenApplied(
const ChartHypothesis& /* cur_hypo */,
int /* featureID - used to index the state in the previous hypotheses */,
ScoreComponentCollection* accumulator) const;
void EvaluateInIsolation(const Phrase &source
, const TargetPhrase &targetPhrase
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection &estimatedScores) const;
void EvaluateInIsolation(const Phrase &source
, const TargetPhrase &targetPhrase
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection &estimatedScores) const;
virtual const FFState* EmptyHypothesisState(const InputType &input) const;
virtual const FFState* EmptyHypothesisState(const InputType &input) const;
virtual std::string GetScoreProducerWeightShortName(unsigned idx=0) const;
virtual std::string GetScoreProducerWeightShortName(unsigned idx=0) const;
void SetParameter(const std::string& key, const std::string& value);
void SetParameter(const std::string& key, const std::string& value);
bool IsUseable(const FactorMask &mask) const;
bool IsUseable(const FactorMask &mask) const;
protected:
typedef std::vector<float> Scores;
std::string m_lmPath;
std::string m_desegPath;
bool m_simple; //desegmentation scheme; if 1 then use simple, else use rule and backoff to simple
};
protected:
typedef std::vector<float> Scores;
std::string m_lmPath;
std::string m_desegPath;
bool m_simple; //desegmentation scheme; if 1 then use simple, else use rule and backoff to simple
};
}

View File

@ -3,32 +3,32 @@
namespace Moses
{
DsgLM* ConstructDsgLM(const char *file)
{
lm::ngram::ModelType model_type;
lm::ngram::Config config;
if (lm::ngram::RecognizeBinary(file, model_type)) {
switch(model_type) {
case lm::ngram::PROBING:
return new KenDsg<lm::ngram::ProbingModel>(file, config);
case lm::ngram::REST_PROBING:
return new KenDsg<lm::ngram::RestProbingModel>(file, config);
case lm::ngram::TRIE:
return new KenDsg<lm::ngram::TrieModel>(file, config);
case lm::ngram::QUANT_TRIE:
return new KenDsg<lm::ngram::QuantTrieModel>(file, config);
case lm::ngram::ARRAY_TRIE:
return new KenDsg<lm::ngram::ArrayTrieModel>(file, config);
case lm::ngram::QUANT_ARRAY_TRIE:
return new KenDsg<lm::ngram::QuantArrayTrieModel>(file, config);
default:
UTIL_THROW2("Unrecognized kenlm model type " << model_type);
}
} else {
DsgLM* ConstructDsgLM(const char *file)
{
lm::ngram::ModelType model_type;
lm::ngram::Config config;
if (lm::ngram::RecognizeBinary(file, model_type)) {
switch(model_type) {
case lm::ngram::PROBING:
return new KenDsg<lm::ngram::ProbingModel>(file, config);
}
}
} // namespace
case lm::ngram::REST_PROBING:
return new KenDsg<lm::ngram::RestProbingModel>(file, config);
case lm::ngram::TRIE:
return new KenDsg<lm::ngram::TrieModel>(file, config);
case lm::ngram::QUANT_TRIE:
return new KenDsg<lm::ngram::QuantTrieModel>(file, config);
case lm::ngram::ARRAY_TRIE:
return new KenDsg<lm::ngram::ArrayTrieModel>(file, config);
case lm::ngram::QUANT_ARRAY_TRIE:
return new KenDsg<lm::ngram::QuantArrayTrieModel>(file, config);
default:
UTIL_THROW2("Unrecognized kenlm model type " << model_type);
}
} else {
return new KenDsg<lm::ngram::ProbingModel>(file, config);
}
}
} // namespace

View File

@ -8,7 +8,7 @@ namespace Moses
class KenDsgBase
{
public:
public:
virtual ~KenDsgBase() {}
virtual float Score(const lm::ngram::State&, StringPiece,
@ -22,17 +22,17 @@ class KenDsgBase
};
template <class KenModel>
class KenDsg : public KenDsgBase
class KenDsg : public KenDsgBase
{
public:
public:
KenDsg(const char *file, const lm::ngram::Config &config)
: m_kenlm(file, config) {}
float Score(const lm::ngram::State &in_state,
StringPiece word,
lm::ngram::State &out_state) const {
StringPiece word,
lm::ngram::State &out_state) const {
return m_kenlm.Score(in_state, m_kenlm.GetVocabulary().Index(word),
out_state);
out_state);
}
const lm::ngram::State &BeginSentenceState() const {
@ -48,13 +48,13 @@ template <class KenModel>
}
private:
private:
KenModel m_kenlm;
};
typedef KenDsgBase DsgLM;
typedef KenDsgBase DsgLM;
DsgLM* ConstructDsgLM(const char *file);
DsgLM* ConstructDsgLM(const char *file);
} // namespace

View File

@ -2,9 +2,9 @@
#include <sstream>
#include <boost/algorithm/string.hpp>
#include <algorithm>
#include <cstdlib>
#include <math.h>
#include <map>
#include <cstdlib>
#include <math.h>
#include <map>
using namespace std;
@ -12,356 +12,380 @@ using namespace lm::ngram;
namespace Moses
{
// Search state for the desegmentation feature: wraps a KenLM context state.
// NOTE(review): each definition below appears TWICE (merge/diff residue —
// an ODR violation); one copy of each must be removed.
dsgState::dsgState(const State & val)
{
lmState = val;
}
dsgState::dsgState(const State & val)
{
lmState = val;
}
// Store the carried-over partial word (danglingTok), its source-position
// span, and the optimistic-scoring correction delta.
// NOTE(review): both vectors are taken by value — a copy per call.
void dsgState::saveState( std::vector<std::string> danglingTok, std::vector<int> srcSpans,float deltaValue)
{
buffer = danglingTok;
span=srcSpans;
delta=deltaValue;
}
void dsgState::saveState( std::vector<std::string> danglingTok, std::vector<int> srcSpans,float deltaValue)
{
buffer = danglingTok;
span=srcSpans;
delta=deltaValue;
}
size_t dsgState::hash() const
{
size_t dsgState::hash() const
{
size_t ret = 0;
boost::hash_combine(ret, lmState);
size_t ret = 0;
boost::hash_combine(ret, lmState);
/*size_t ret = delta;
/*size_t ret = delta;
boost::hash_combine(ret, buffer);
boost::hash_combine(ret, span);
boost::hash_combine(ret, lmState.length);
return ret;*/
}
bool dsgState::operator==(const FFState& otherBase) const //CHECK
{
const dsgState &other = static_cast<const dsgState&>(otherBase);
if (lmState < other.lmState) return false;
if (lmState == other.lmState) return true;
return false;
}
// ----------------------------------------
// Debug/display name for this state type (fixed string).
std::string dsgState :: getName() const
{
return "done";
}
// Fresh per-extension scoring scratchpad: zero all feature accumulators
// and clear the carried-over partial word.
dsgHypothesis :: dsgHypothesis()
{
lmProb = 0;
discontig0 = 0;
discontig1 = 0;
discontig2 = 0;
UnsegWP = 0;
m_buffer.clear();//="";
}
// Import the previous hypothesis' dsgState (partial-word buffer, source
// span, LM state, optimistic delta). A NULL prev_state leaves the
// constructor defaults in place.
void dsgHypothesis :: setState(const FFState* prev_state)
{
if(prev_state != NULL) {
m_buffer = static_cast <const dsgState *> (prev_state)->getBuffer();
m_span = static_cast <const dsgState *> (prev_state)->getSpan();
lmState = static_cast <const dsgState *> (prev_state)->getLMState();
delta = static_cast <const dsgState *> (prev_state)->getDelta(); //NEW
}
}
bool dsgState::operator==(const FFState& otherBase) const //CHECK
{
const dsgState &other = static_cast<const dsgState&>(otherBase);
// Package the current buffer/span/delta into a new heap-allocated
// dsgState (wrapping the current LM state). Caller takes ownership.
dsgState * dsgHypothesis :: saveState()
{
dsgState * statePtr = new dsgState(lmState);
statePtr->saveState(m_buffer, m_span, delta);
return statePtr;
}
if (lmState < other.lmState) return false;
if (lmState == other.lmState) return true;
// Export this hypothesis' feature values in scorer order: LM probability
// first; unless the feature set is reduced to a single component
// (numFeatures == 1), follow with the three discontiguity counts and the
// unsegmented-word penalty.
void dsgHypothesis :: populateScores(vector <float> & scores , const int numFeatures)
{
  scores.clear();
  scores.push_back(lmProb);
  if (numFeatures != 1) {
    scores.push_back(discontig0);
    scores.push_back(discontig1);
    scores.push_back(discontig2);
    scores.push_back(UnsegWP);
  }
}
// A token is a prefix morpheme iff it ends with the segmentation marker
// '+' but is not the bare "+" token itself.
// (As in the original, an empty token makes at() throw std::out_of_range.)
bool dsgHypothesis::isPrefix(const std::string &tok)
{
  return (tok.at(tok.size() - 1) == '+') && (tok != "+");
}
// A token is a suffix morpheme iff it starts with the segmentation marker
// '+' but is not the bare "+" token itself.
// (As in the original, an empty token makes at() throw std::out_of_range.)
bool dsgHypothesis::isSuffix(const std::string &tok)
{
  return (tok.at(0) == '+') && (tok != "+");
}
// A token is a stem iff it carries the segmentation marker '+' at neither
// end. (As in the original, an empty token makes at() throw
// std::out_of_range.)
bool dsgHypothesis::isStem(const std::string &tok)
{
  return (tok.at(0) != '+') && (tok.at(tok.size() - 1) != '+');
}
/**
* chain stores segmented tokens that are in process of building a word
* The function checks if tok contributes to the word being formed in chain
*
*/
bool dsgHypothesis::isValidChain(const std::string &tok, std::vector<std::string> &chain)
{
std::string last_tok;
if (chain.size() >= 1) {
last_tok = chain[chain.size() - 1];
} else {
last_tok = "NULL";
}
if(tok=="+") {
return false;
}
// ----------------------------------------
std::string dsgState :: getName() const
{
return "done";
if (isPrefix(tok) && (chain.size() == 0 || isPrefix(last_tok))) {
return true;
} else if (isSuffix(tok) && (chain.size() != 0 && ( isStem(last_tok) || isPrefix(last_tok)))) {
return true; // allows one suffix ONLY
}
dsgHypothesis :: dsgHypothesis()
{
lmProb = 0;
discontig0 = 0;
discontig1 = 0;
discontig2 = 0;
UnsegWP = 0;
m_buffer.clear();//="";
//else if (isSuffix(tok) && (chain.size() != 0 && ( isStem(last_tok) || isPrefix(last_tok) || isSuffix(last_tok) ))) { return true; } // allows multiple suffixes
else if (isStem(tok) && (chain.size() == 0 || isPrefix(last_tok))) {
return true;
} else {
return false;
}
}
void dsgHypothesis :: setState(const FFState* prev_state)
{
if(prev_state != NULL) {
m_buffer = static_cast <const dsgState *> (prev_state)->getBuffer();
m_span = static_cast <const dsgState *> (prev_state)->getSpan();
lmState = static_cast <const dsgState *> (prev_state)->getLMState();
delta = static_cast <const dsgState *> (prev_state)->getDelta(); //NEW
/**
* grouper function groups tokens that form a word together
*/
vector<string> dsgHypothesis::grouper(std::vector<std::string> &phr_vec,vector<vector<int> > &allchain_ids, int sourceOffset,const AlignmentInfo &align, bool isolation)
{
std::vector<std::string> chain;
std::vector<int> chain_ids;
std::vector<std::string> allchains;
chain_ids=m_span;
if (!m_buffer.empty() && !isolation) { // if evaluate in isolation is called, then do not add buffer content
for (int i = 0; i < m_buffer.size(); i++) { // initialize chain with the content of the buffer
chain.push_back(m_buffer[i]);
}
}
dsgState * dsgHypothesis :: saveState()
{
dsgState * statePtr = new dsgState(lmState);
statePtr->saveState(m_buffer, m_span, delta);
return statePtr;
}
for (int i = 0; i < phr_vec.size(); i++) {
std::set<std::size_t> sourcePosSet = align.GetAlignmentsForTarget(i);
void dsgHypothesis :: populateScores(vector <float> & scores , const int numFeatures)
{
scores.clear();
scores.push_back(lmProb);
if (numFeatures == 1)
return;
scores.push_back(discontig0);
scores.push_back(discontig1);
scores.push_back(discontig2);
scores.push_back(UnsegWP);
}
bool dsgHypothesis::isPrefix(const std::string &tok){
if ((tok.at(tok.size() - 1) == '+' )&& (tok != "+")) { return true; }
else { return false; };
}
bool dsgHypothesis::isSuffix(const std::string &tok){
if ((tok.at(0) == '+' )&& (tok != "+")) { return true; }
else { return false; };
}
bool dsgHypothesis::isStem(const std::string &tok){
if ((tok.at(0) != '+') && (tok.at(tok.size() - 1) != '+')){ return true; }
else { return false; };
}
/**
* chain stores segmented tokens that are in process of building a word
* The function checks if tok contributes to the word being formed in chain
*
*/
bool dsgHypothesis::isValidChain(const std::string &tok, std::vector<std::string> &chain){
std::string last_tok;
if (chain.size() >= 1){
last_tok = chain[chain.size() - 1];
}
else{
last_tok = "NULL";
}
if(tok=="+"){return false;}
if (isPrefix(tok) && (chain.size() == 0 || isPrefix(last_tok))) { return true; }
else if (isSuffix(tok) && (chain.size() != 0 && ( isStem(last_tok) || isPrefix(last_tok)))) { return true; } // allows one suffix ONLY
//else if (isSuffix(tok) && (chain.size() != 0 && ( isStem(last_tok) || isPrefix(last_tok) || isSuffix(last_tok) ))) { return true; } // allows multiple suffixes
else if (isStem(tok) && (chain.size() == 0 || isPrefix(last_tok))) { return true; }
else { return false; }
}
/**
* grouper function groups tokens that form a word together
*/
vector<string> dsgHypothesis::grouper(std::vector<std::string> &phr_vec,vector<vector<int> > &allchain_ids, int sourceOffset,const AlignmentInfo &align, bool isolation){
std::vector<std::string> chain;
std::vector<int> chain_ids;
std::vector<std::string> allchains;
chain_ids=m_span;
if (!m_buffer.empty() && !isolation){// if evaluate in isolation is called, then do not add buffer content
for (int i = 0; i < m_buffer.size(); i++){ // initialize chain with the content of the buffer
chain.push_back(m_buffer[i]);
if (isValidChain(phr_vec[i], chain)) {
chain.push_back(phr_vec[i]);
if (sourcePosSet.empty()==false) {
for (std::set<size_t>::iterator it(sourcePosSet.begin()); it != sourcePosSet.end(); it++) {
int cur=*it;
chain_ids.push_back(cur+sourceOffset);
}
}
}
for (int i = 0; i < phr_vec.size(); i++){
std::set<std::size_t> sourcePosSet = align.GetAlignmentsForTarget(i);
if (isValidChain(phr_vec[i], chain)){
chain.push_back(phr_vec[i]);
if (sourcePosSet.empty()==false){
for (std::set<size_t>::iterator it(sourcePosSet.begin());it != sourcePosSet.end(); it++) {
int cur=*it;
chain_ids.push_back(cur+sourceOffset);
}
}
}
else if (chain.size() == 0) { // start of a suffix at hypothesis0
allchains.push_back(phr_vec[i]);
allchain_ids.push_back(chain_ids);
chain_ids.clear();//={};
}
else { // tokens formed a complete word; add tokens segmented by space to allchains
std::string joined = boost::algorithm::join(chain, " ");
allchains.push_back(joined);
allchain_ids.push_back(chain_ids);
chain.clear();// = {};
chain_ids.clear();//={};
chain.push_back(phr_vec[i]);
if (sourcePosSet.empty()==false){
for (std::set<size_t>::iterator it(sourcePosSet.begin());it != sourcePosSet.end(); it++) {
int cur=*it;
chain_ids.push_back(cur+sourceOffset);
}
}
}
else if (chain.size() == 0) { // start of a suffix at hypothesis0
allchains.push_back(phr_vec[i]);
allchain_ids.push_back(chain_ids);
chain_ids.clear();//={};
}
if (!chain.empty()){
else { // tokens formed a complete word; add tokens segmented by space to allchains
std::string joined = boost::algorithm::join(chain, " ");
allchains.push_back(joined);
allchain_ids.push_back(chain_ids);
chain.clear();// = {};
chain_ids.clear();//={};
chain.push_back(phr_vec[i]);
if (sourcePosSet.empty()==false) {
for (std::set<size_t>::iterator it(sourcePosSet.begin()); it != sourcePosSet.end(); it++) {
int cur=*it;
chain_ids.push_back(cur+sourceOffset);
}
}
}
return allchains;
}
if (!chain.empty()) {
std::string joined = boost::algorithm::join(chain, " ");
allchains.push_back(joined);
allchain_ids.push_back(chain_ids);
}
return allchains;
}
void dsgHypothesis :: calculateDsgProbinIsol(DsgLM & ptrDsgLM, Desegmenter &desegT, const AlignmentInfo &align ){
lmProb = 0;
State currState = lmState;
State temp;
string desegmented="";
vector <string> words;
vector <string> currFVec;
discontig0=0;
discontig1=0;
discontig2=0;
UnsegWP=0;
void dsgHypothesis :: calculateDsgProbinIsol(DsgLM & ptrDsgLM, Desegmenter &desegT, const AlignmentInfo &align )
{
lmProb = 0;
State currState = lmState;
State temp;
string desegmented="";
vector <string> words;
vector <string> currFVec;
currFVec = m_buffer;
currFVec.insert( currFVec.end(), m_curr_phr.begin(), m_curr_phr.end() );
discontig0=0;
discontig1=0;
discontig2=0;
UnsegWP=0;
int vecSize=currFVec.size();
currFVec = m_buffer;
currFVec.insert( currFVec.end(), m_curr_phr.begin(), m_curr_phr.end() );
// phrases with suffix-starts and prefix-end
if (currFVec.size()>0 && isPrefix (currFVec.back())) {
UnsegWP-=0.5;}
if (currFVec.size()>0 && isSuffix (currFVec.front())) {
UnsegWP-=0.5;}
int vecSize=currFVec.size();
/* //Dropping prefix-end and suffix-start
while (currFVec.size()>0 && isPrefix (currFVec.back())){
currFVec.pop_back(); //drop prefix appearing at end of phrase
}
// phrases with suffix-starts and prefix-end
if (currFVec.size()>0 && isPrefix (currFVec.back())) {
UnsegWP-=0.5;
}
if (currFVec.size()>0 && isSuffix (currFVec.front())) {
UnsegWP-=0.5;
}
while (currFVec.size()>0 && isSuffix (currFVec.front())){
currFVec.erase (currFVec.begin()); //drop suffix appearning at start of a phrase
} */
/* //Dropping prefix-end and suffix-start
while (currFVec.size()>0 && isPrefix (currFVec.back())){
currFVec.pop_back(); //drop prefix appearing at end of phrase
}
vector<vector<int> > chain_ids;
words = grouper(currFVec,chain_ids,0,align,1);
while (currFVec.size()>0 && isSuffix (currFVec.front())){
currFVec.erase (currFVec.begin()); //drop suffix appearning at start of a phrase
} */
for (int i = 0; i<words.size(); i++) {
vector<vector<int> > chain_ids;
words = grouper(currFVec,chain_ids,0,align,1);
for (int i = 0; i<words.size(); i++) {
UnsegWP+=1;
temp = currState;
if (words[i].find(" ")!=std::string::npos) {
desegmented=desegT.Search(words[i])[0];
lmProb += ptrDsgLM.Score(temp,desegmented,currState);
} else {
boost::replace_all(words[i], "-LRB-", "(");
boost::replace_all(words[i], "-RRB-", ")");
lmProb += ptrDsgLM.Score(temp,words[i],currState);
}
}
lmState = currState;
}
void dsgHypothesis :: calculateDsgProb(DsgLM& ptrDsgLM, Desegmenter &desegT, bool isCompleted , const AlignmentInfo &align, int sourceOffset, bool optimistic)
{
lmProb = 0;
discontig0=0;
discontig1=0;
discontig2=0;
UnsegWP=0;
State currState = lmState;
State temp;
string desegmented="";
vector <string> words;
vector <string> currFVec;
bool completePhraseSuffixEnd = false;
vector<vector<int> > all_chain_ids;
double pscore;
currFVec=m_curr_phr;
// Check if the the phrase ends in a suffix, which means that it completes a full word;Make sure to change the isValidChain
if (isSuffix (currFVec.back()) && (currFVec.back()!="+")) {
completePhraseSuffixEnd=true;
}
words = grouper(currFVec,all_chain_ids,sourceOffset,align,0);
for (int i = 0; i < words.size(); i++) {
temp = currState;
if (i==words.size()-1) {
if (completePhraseSuffixEnd) { //i.e if phrase ends with suffix, which marks an end of a word
m_buffer.clear();// ="";
m_span.clear();// ={};
} else if (!isCompleted) { // not end of sentence( or final hypothesis), and probably the last token is not a complete word
m_buffer.clear();
if (optimistic == 1) {
if ( isPrefix (currFVec.back())) { // this will delay scoring of prefix in prefix-ending phrases until the next hypothesis arrives
//pscore = ptrDsgLM.Score(temp,desegmented,currState);
lmProb -= delta;
delta = 0.0;
}
else if (words[i].find(" ")!=std::string::npos) {
desegmented=desegT.Search(words[i])[0];
pscore=ptrDsgLM.Score(temp,desegmented,currState);
lmProb = lmProb + pscore - delta;
delta=pscore;
currState=temp;
} else {
boost::replace_all(words[i], "-LRB-", "(");
boost::replace_all(words[i], "-RRB-", ")");
pscore=ptrDsgLM.Score(temp,words[i],currState);
lmProb = lmProb + pscore - delta;
delta=pscore;
currState=temp;
}
}
m_buffer.push_back(words.back());
m_span=all_chain_ids.back();
break;
}
}
//temp = currState;
if (words[i].find(" ")!=std::string::npos) {
UnsegWP+=1;
temp = currState;
if (words[i].find(" ")!=std::string::npos){
desegmented=desegT.Search(words[i])[0];
lmProb += ptrDsgLM.Score(temp,desegmented,currState);
}
else{
boost::replace_all(words[i], "-LRB-", "(");
boost::replace_all(words[i], "-RRB-", ")");
lmProb += ptrDsgLM.Score(temp,words[i],currState);
desegmented=desegT.Search(words[i])[0];
std::set<int> cur_chain_ids(all_chain_ids[i].begin(),all_chain_ids[i].end());
if (cur_chain_ids.size()>1) {
vector<int> dsc;
for (std::set<int>::iterator it(cur_chain_ids.begin()), next(it); it != cur_chain_ids.end() && ++next != cur_chain_ids.end(); it = next) {
int cur=*it;
int mynext=*next;
if (std::abs(cur - mynext)>= 3) {
dsc.push_back(3);
} else if (std::abs(cur - mynext)== 2) {
dsc.push_back(2);
} else if (std::abs(cur - mynext)<= 1) {
dsc.push_back(1);
}
}
int mymax=*std::max_element(dsc.begin(),dsc.end());
if (mymax==3) {
discontig2+=1;
} else if (mymax==2) {
discontig1+=1;
} else {
discontig0+=1;
}
} else {
discontig0 += 1;
}
lmProb += ptrDsgLM.Score(temp,desegmented,currState);
} else {
UnsegWP+=1;
boost::replace_all(words[i], "-LRB-", "(");
boost::replace_all(words[i], "-RRB-", ")");
lmProb += ptrDsgLM.Score(temp,words[i],currState);
}
lmState = currState;
}
/**
 * Score the current phrase with the desegmented language model.
 *
 * Updates lmProb, the discontiguity counters (discontig0/1/2), the word
 * penalty UnsegWP, the dangling-affix buffer (m_buffer / m_span) and the
 * cached KenLM state (lmState).
 *
 * @param ptrDsgLM      desegmented language model used for scoring
 * @param desegT        desegmentation table (lookup + rule fallback)
 * @param isCompleted   true for the final hypothesis (end of sentence)
 * @param align         source-target alignment of the current phrase
 * @param sourceOffset  offset of the phrase on the source side
 * @param optimistic    if set, score incomplete last words optimistically
 *                      and correct the estimate (via `delta`) later
 */
void dsgHypothesis :: calculateDsgProb(DsgLM& ptrDsgLM, Desegmenter &desegT, bool isCompleted , const AlignmentInfo &align, int sourceOffset, bool optimistic)
{
  lmProb = 0;
  discontig0 = 0;
  discontig1 = 0;
  discontig2 = 0;
  UnsegWP = 0;
  State currState = lmState;
  State temp;
  string desegmented = "";
  vector<string> words;
  vector<string> currFVec;
  bool completePhraseSuffixEnd = false;
  vector<vector<int> > all_chain_ids;
  double pscore;
  currFVec = m_curr_phr;

  // Check if the phrase ends in a suffix, which means that it completes a
  // full word; make sure to change the isValidChain accordingly.
  if (isSuffix(currFVec.back()) && (currFVec.back() != "+")) {
    completePhraseSuffixEnd = true;
  }

  words = grouper(currFVec, all_chain_ids, sourceOffset, align, 0);

  // size_t avoids the signed/unsigned comparison against words.size().
  for (size_t i = 0; i < words.size(); i++) {
    temp = currState;
    if (i == words.size() - 1) {
      if (completePhraseSuffixEnd) { // phrase ends with a suffix, which marks the end of a word
        m_buffer.clear();
        m_span.clear();
      } else if (!isCompleted) {
        // Not the end of the sentence (or final hypothesis): the last token
        // is probably not a complete word, so buffer it for the next hypothesis.
        m_buffer.clear();
        if (optimistic == 1) {
          if (isPrefix(currFVec.back())) {
            // Delay scoring of prefix-ending phrases until the next
            // hypothesis arrives; undo the previously added estimate.
            lmProb -= delta;
            delta = 0.0;
          } else if (words[i].find(" ") != std::string::npos) {
            desegmented = desegT.Search(words[i])[0];
            pscore = ptrDsgLM.Score(temp, desegmented, currState);
            lmProb = lmProb + pscore - delta;
            delta = pscore; // remember the optimistic part so it can be corrected later
            currState = temp;
          } else {
            boost::replace_all(words[i], "-LRB-", "(");
            boost::replace_all(words[i], "-RRB-", ")");
            pscore = ptrDsgLM.Score(temp, words[i], currState);
            lmProb = lmProb + pscore - delta;
            delta = pscore;
            currState = temp;
          }
        }
        m_buffer.push_back(words.back());
        m_span = all_chain_ids.back();
        break;
      }
    }

    if (words[i].find(" ") != std::string::npos) {
      // Multi-token group: desegment it and classify how discontiguous its
      // source-side chain ids are (gap >= 3, == 2, or <= 1).
      UnsegWP += 1;
      desegmented = desegT.Search(words[i])[0];
      std::set<int> cur_chain_ids(all_chain_ids[i].begin(), all_chain_ids[i].end());
      if (cur_chain_ids.size() > 1) {
        vector<int> dsc;
        // Walk consecutive pairs of the (sorted) chain-id set.
        for (std::set<int>::iterator it(cur_chain_ids.begin()), next(it); it != cur_chain_ids.end() && ++next != cur_chain_ids.end(); it = next) {
          int cur = *it;
          int mynext = *next;
          if (std::abs(cur - mynext) >= 3) {
            dsc.push_back(3);
          } else if (std::abs(cur - mynext) == 2) {
            dsc.push_back(2);
          } else if (std::abs(cur - mynext) <= 1) {
            dsc.push_back(1);
          }
        }
        // The largest gap decides which discontiguity counter is bumped.
        int mymax = *std::max_element(dsc.begin(), dsc.end());
        if (mymax == 3) {
          discontig2 += 1;
        } else if (mymax == 2) {
          discontig1 += 1;
        } else {
          discontig0 += 1;
        }
      } else {
        discontig0 += 1;
      }
      lmProb += ptrDsgLM.Score(temp, desegmented, currState);
    } else {
      UnsegWP += 1;
      boost::replace_all(words[i], "-LRB-", "(");
      boost::replace_all(words[i], "-RRB-", ")");
      lmProb += ptrDsgLM.Score(temp, words[i], currState);
    }
  }

  // NOTE(review): the original contained this end-of-sentence block twice
  // (a diff-merge duplication), which would double-apply ScoreEndSentence
  // and double-subtract delta; it must run exactly once.
  if (isCompleted) {
    temp = currState;
    lmProb = lmProb + ptrDsgLM.ScoreEndSentence(temp, currState) - delta;
  }
  lmState = currState;
}
// Debug hook: intentionally a no-op for now.
// NOTE(review): the original text defined dsgHypothesis::print() twice
// (a diff-merge duplication), which is a redefinition error; keep one.
void dsgHypothesis :: print()
{}
} // namespace

View File

@ -14,53 +14,53 @@
namespace Moses
{
class dsgState : public FFState
{
public:
class dsgState : public FFState
{
public:
dsgState(const lm::ngram::State & val);
virtual bool operator==(const FFState& other) const;
void saveState( std::vector<std::string> bufferVal,std::vector<int> spanVal, float deltaValue);
dsgState(const lm::ngram::State & val);
virtual bool operator==(const FFState& other) const;
void saveState( std::vector<std::string> bufferVal,std::vector<int> spanVal, float deltaValue);
std::vector<std::string> getBuffer() const {
return buffer;
}
std::vector<std::string> getBuffer() const {
return buffer;
}
std::vector<int> getSpan() const {
return span;
}
std::vector<int> getSpan() const {
return span;
}
lm::ngram::State getLMState() const {
return lmState;
}
lm::ngram::State getLMState() const {
return lmState;
}
float getDelta() const {
return delta;
}
float getDelta() const {
return delta;
}
void setDelta(double val1 ) {
delta = val1;
}
void setDelta(double val1 ) {
delta = val1;
}
void print() const;
std::string getName() const;
void print() const;
std::string getName() const;
virtual size_t hash() const;
virtual size_t hash() const;
protected:
std::vector<std::string> buffer;
std::vector<int> span;
lm::ngram::State lmState;
double delta; //NEW
};
protected:
std::vector<std::string> buffer;
std::vector<int> span;
lm::ngram::State lmState;
double delta; //NEW
};
class dsgHypothesis
{
private:
private:
std::vector<std::string> m_buffer;// maintains dangling affix from previous hypothesis
std::vector<int> m_span;// maintains source alignment for dangling affix from previous hypothesis
lm::ngram::State lmState; // KenLM's Model State ...
@ -73,7 +73,7 @@ class dsgHypothesis
int discontig2;
double UnsegWP; //Word Penalty score based on count of words
public:
public:
dsgHypothesis();
~dsgHypothesis() {};
@ -84,7 +84,7 @@ class dsgHypothesis
m_curr_phr = val1;
}
void setDelta(double val1 ) {
void setDelta(double val1 ) {
delta = val1;
}

View File

@ -173,8 +173,10 @@ bool TryHuge(std::size_t size, uint8_t alignment_bits, bool populate, util::scop
// Second try: manually configured hugetlb pages exist, but kernel too old to
// pick size or not available. This might pick the wrong size huge pages,
// but the sysadmin must have made them available in the first place.
#ifdef MAP_HUGETLB
if (AnonymousMap(size, MAP_HUGETLB, populate, to))
return true;
#endif
// Third try: align to a multiple of the huge page size by overallocating.
// I feel bad about doing this, but it's also how posix_memalign is