mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-10-26 19:37:58 +03:00
daily automatic beautifier
This commit is contained in:
parent
ba0a3d92f4
commit
7b205b0c8a
@ -11,73 +11,74 @@ using namespace std;
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
void Desegmenter::Load(const string filename){
|
||||
void Desegmenter::Load(const string filename)
|
||||
{
|
||||
|
||||
std::ifstream myFile(filename.c_str() );
|
||||
if (myFile.is_open()){
|
||||
cerr << "Desegmentation File open successful." << endl;
|
||||
string line;
|
||||
while (getline(myFile, line)){
|
||||
stringstream ss(line);
|
||||
string token;
|
||||
vector<string> myline;
|
||||
while (getline(ss, token, '\t')){
|
||||
myline.push_back(token);
|
||||
}
|
||||
mmDesegTable.insert(pair<string, string>(myline[2], myline[1] ));
|
||||
}
|
||||
myFile.close();
|
||||
}
|
||||
else
|
||||
cerr << "open() failed: check if Desegmentation file is in right folder" << endl;
|
||||
std::ifstream myFile(filename.c_str() );
|
||||
if (myFile.is_open()) {
|
||||
cerr << "Desegmentation File open successful." << endl;
|
||||
string line;
|
||||
while (getline(myFile, line)) {
|
||||
stringstream ss(line);
|
||||
string token;
|
||||
vector<string> myline;
|
||||
while (getline(ss, token, '\t')) {
|
||||
myline.push_back(token);
|
||||
}
|
||||
mmDesegTable.insert(pair<string, string>(myline[2], myline[1] ));
|
||||
}
|
||||
myFile.close();
|
||||
} else
|
||||
cerr << "open() failed: check if Desegmentation file is in right folder" << endl;
|
||||
}
|
||||
|
||||
|
||||
/**
 * Returns every value stored in mmDesegTable under myKey.
 *
 * When the key is absent, the result holds exactly one entry: the output of
 * ApplyRules(myKey).
 *
 * equal_range is used rather than find() + count(): find() on a multimap may
 * return any element with an equal key, so stepping forward count() times
 * from it is not guaranteed to stay inside the matching range.
 */
vector<string> Desegmenter::Search(string myKey)
{
  typedef multimap<string, string>::const_iterator TableIter;

  vector<string> result;
  const pair<TableIter, TableIter> matches = mmDesegTable.equal_range(myKey);

  if (matches.first == matches.second) {
    // Key not in the table: fall back to the rewrite rules.
    result.push_back(ApplyRules(myKey));
    return result;
  }

  for (TableIter it = matches.first; it != matches.second; ++it) {
    result.push_back(it->second);
  }
  return result;
}
|
||||
|
||||
|
||||
/**
 * Builds a desegmented copy of segToken by applying literal substring
 * replacements in a fixed order. segToken itself is not modified.
 *
 * The first group of replacements is applied only when `simple` is false;
 * the final three (which delete "+ +", "+ " and " +") are always applied.
 */
string Desegmenter::ApplyRules(string & segToken)
{
  // {search, replacement} pairs; order matters because later rules see the
  // output of earlier ones.
  static const char* const kSchemeRules[][2] = {
    {"l+ All", "ll"},
    {"l+ Al", "ll"},
    {"y+ y ", "y"},
    {"p+ ", "t"},
    {"' +", "}"},
    {"y +", "A"},
    {"n +n", "n"},
    {"mn +m", "mm"},
    {"En +m", "Em"},
    {"An +lA", "Em"},
    {"-LRB-", "("},
    {"-RRB-", ")"}
  };
  static const char* const kMarkerRules[][2] = {
    {"+ +", ""},
    {"+ ", ""},
    {" +", ""}
  };

  string desegToken = segToken;

  if (!simple) {
    const size_t numSchemeRules = sizeof(kSchemeRules) / sizeof(kSchemeRules[0]);
    for (size_t r = 0; r < numSchemeRules; ++r) {
      boost::replace_all(desegToken, kSchemeRules[r][0], kSchemeRules[r][1]);
    }
  }

  const size_t numMarkerRules = sizeof(kMarkerRules) / sizeof(kMarkerRules[0]);
  for (size_t r = 0; r < numMarkerRules; ++r) {
    boost::replace_all(desegToken, kMarkerRules[r][0], kMarkerRules[r][1]);
  }

  return desegToken;
}
|
||||
|
||||
Desegmenter::~Desegmenter()
|
||||
|
@ -11,21 +11,23 @@ namespace Moses
|
||||
class Desegmenter
|
||||
{
|
||||
private:
|
||||
std::multimap<string, string> mmDesegTable;
|
||||
std::string filename;
|
||||
bool simple;
|
||||
void Load(const string filename);
|
||||
std::multimap<string, string> mmDesegTable;
|
||||
std::string filename;
|
||||
bool simple;
|
||||
void Load(const string filename);
|
||||
|
||||
public:
|
||||
Desegmenter(const std::string& file, const bool scheme){
|
||||
filename = file;
|
||||
simple=scheme;
|
||||
Load(filename);
|
||||
}
|
||||
string getFileName(){ return filename; }
|
||||
|
||||
vector<string> Search(string myKey);
|
||||
string ApplyRules(string &);
|
||||
~Desegmenter();
|
||||
Desegmenter(const std::string& file, const bool scheme) {
|
||||
filename = file;
|
||||
simple=scheme;
|
||||
Load(filename);
|
||||
}
|
||||
string getFileName() {
|
||||
return filename;
|
||||
}
|
||||
|
||||
vector<string> Search(string myKey);
|
||||
string ApplyRules(string &);
|
||||
~Desegmenter();
|
||||
};
|
||||
}
|
||||
|
@ -10,147 +10,147 @@ using namespace lm::ngram;
|
||||
namespace Moses
|
||||
{
|
||||
|
||||
/**
 * Constructs the feature with 5 score components and default settings, then
 * lets ReadParameters() override them.
 *
 * DSGM and desegT start as NULL so the destructor's delete is well defined
 * even if Load() is never called. m_simple gets a definite value so it is
 * never read uninitialized when no "deseg-scheme" parameter is given.
 * NOTE(review): false (rule scheme) is an assumed default — confirm.
 */
DesegModel::DesegModel(const std::string &line)
  :StatefulFeatureFunction(5, line )
  ,DSGM(NULL)
  ,desegT(NULL)
  ,tFactor(0)
  ,order(5)
  ,numFeatures(5)
  ,optimistic(true)
  ,m_simple(false)
{
  ReadParameters();
}
|
||||
|
||||
/**
 * Releases both objects allocated in readLanguageModel(): the language model
 * and the desegmentation table (the latter was previously never freed).
 */
DesegModel::~DesegModel()
{
  delete DSGM;
  delete desegT;
}
|
||||
|
||||
void DesegModel :: readLanguageModel(const char *lmFile)
|
||||
{
|
||||
DSGM = ConstructDsgLM(m_lmPath.c_str());
|
||||
State startState = DSGM->NullContextState();
|
||||
desegT=new Desegmenter(m_desegPath,m_simple);// Desegmentation Table
|
||||
}
|
||||
|
||||
|
||||
/**
 * Remembers the decoder options and loads the language model and the
 * desegmentation table via readLanguageModel().
 */
void DesegModel::Load(AllOptions::ptr const& opts)
{
  m_options = opts;

  const char *lmFile = m_lmPath.c_str();
  readLanguageModel(lmFile);
}
|
||||
|
||||
|
||||
|
||||
void DesegModel:: EvaluateInIsolation(const Phrase &source
|
||||
, const TargetPhrase &targetPhrase
|
||||
, ScoreComponentCollection &scoreBreakdown
|
||||
, ScoreComponentCollection &estimatedScores) const
|
||||
{
|
||||
|
||||
dsgHypothesis obj;
|
||||
vector <string> myTargetPhrase;
|
||||
vector<float> scores;
|
||||
vector<string> targ_phrase; //stores the segmented tokens in the target phrase
|
||||
const AlignmentInfo &align = targetPhrase.GetAlignTerm();
|
||||
|
||||
for (int i = 0; i < targetPhrase.GetSize(); i++) {
|
||||
targ_phrase.push_back(targetPhrase.GetWord(i).GetFactor(tFactor)->GetString().as_string());
|
||||
}
|
||||
|
||||
DesegModel::~DesegModel()
|
||||
{
|
||||
delete DSGM;
|
||||
obj.setState(DSGM->NullContextState());
|
||||
obj.setPhrases(targ_phrase);
|
||||
obj.calculateDsgProbinIsol(*DSGM,*desegT,align);
|
||||
obj.populateScores(scores,numFeatures);
|
||||
estimatedScores.PlusEquals(this, scores);
|
||||
}
|
||||
|
||||
|
||||
FFState* DesegModel::EvaluateWhenApplied(
|
||||
const Hypothesis& cur_hypo,
|
||||
const FFState* prev_state,
|
||||
ScoreComponentCollection* accumulator) const
|
||||
{
|
||||
const TargetPhrase &target = cur_hypo.GetCurrTargetPhrase();
|
||||
const Range &src_rng =cur_hypo.GetCurrSourceWordsRange();
|
||||
const AlignmentInfo &align = cur_hypo.GetCurrTargetPhrase().GetAlignTerm();
|
||||
size_t sourceOffset = src_rng.GetStartPos();
|
||||
|
||||
dsgHypothesis obj;
|
||||
vector<float> scores;
|
||||
vector<string> targ_phrase; //stores the segmented tokens in the target phrase
|
||||
bool isCompleted;
|
||||
|
||||
isCompleted=cur_hypo.IsSourceCompleted();
|
||||
for (int i = 0; i < cur_hypo.GetCurrTargetLength(); i++) {
|
||||
targ_phrase.push_back(target.GetWord(i).GetFactor(tFactor)->GetString().as_string());
|
||||
}
|
||||
|
||||
void DesegModel :: readLanguageModel(const char *lmFile)
|
||||
{
|
||||
DSGM = ConstructDsgLM(m_lmPath.c_str());
|
||||
State startState = DSGM->NullContextState();
|
||||
desegT=new Desegmenter(m_desegPath,m_simple);// Desegmentation Table
|
||||
}
|
||||
obj.setState(prev_state);
|
||||
obj.setPhrases( targ_phrase );
|
||||
obj.calculateDsgProb(*DSGM,*desegT,isCompleted,align, sourceOffset, optimistic);
|
||||
obj.populateScores(scores,numFeatures);
|
||||
accumulator->PlusEquals(this, scores);
|
||||
return obj.saveState();
|
||||
|
||||
}
|
||||
|
||||
FFState* DesegModel::EvaluateWhenApplied(
|
||||
const ChartHypothesis& /* cur_hypo */,
|
||||
int /* featureID - used to index the state in the previous hypotheses */,
|
||||
ScoreComponentCollection* accumulator) const
|
||||
{
|
||||
UTIL_THROW2("Chart decoding not support by UTIL_THROW2");
|
||||
}
|
||||
|
||||
const FFState* DesegModel::EmptyHypothesisState(const InputType &input) const
|
||||
{
|
||||
VERBOSE(3,"DesegModel::EmptyHypothesisState()" << endl);
|
||||
State startState = DSGM->BeginSentenceState();
|
||||
dsgState ss= dsgState(startState);
|
||||
return new dsgState(ss);
|
||||
}
|
||||
|
||||
std::string DesegModel::GetScoreProducerWeightShortName(unsigned idx) const
|
||||
{
|
||||
return "dsg";
|
||||
}
|
||||
|
||||
|
||||
void DesegModel::Load(AllOptions::ptr const& opts)
|
||||
{
|
||||
m_options = opts;
|
||||
readLanguageModel(m_lmPath.c_str());
|
||||
}
|
||||
void DesegModel::SetParameter(const std::string& key, const std::string& value)
|
||||
{
|
||||
|
||||
|
||||
|
||||
void DesegModel:: EvaluateInIsolation(const Phrase &source
|
||||
, const TargetPhrase &targetPhrase
|
||||
, ScoreComponentCollection &scoreBreakdown
|
||||
, ScoreComponentCollection &estimatedScores) const
|
||||
{
|
||||
|
||||
dsgHypothesis obj;
|
||||
vector <string> myTargetPhrase;
|
||||
vector<float> scores;
|
||||
vector<string> targ_phrase; //stores the segmented tokens in the target phrase
|
||||
const AlignmentInfo &align = targetPhrase.GetAlignTerm();
|
||||
|
||||
for (int i = 0; i < targetPhrase.GetSize(); i++) {
|
||||
targ_phrase.push_back(targetPhrase.GetWord(i).GetFactor(tFactor)->GetString().as_string());
|
||||
}
|
||||
|
||||
obj.setState(DSGM->NullContextState());
|
||||
obj.setPhrases(targ_phrase);
|
||||
obj.calculateDsgProbinIsol(*DSGM,*desegT,align);
|
||||
obj.populateScores(scores,numFeatures);
|
||||
estimatedScores.PlusEquals(this, scores);
|
||||
}
|
||||
|
||||
|
||||
/**
 * Scores the current target phrase of cur_hypo in the context carried by
 * prev_state, adds the scores to accumulator, and returns the new state.
 *
 * @return a heap-allocated state produced by dsgHypothesis::saveState().
 */
FFState* DesegModel::EvaluateWhenApplied(
  const Hypothesis& cur_hypo,
  const FFState* prev_state,
  ScoreComponentCollection* accumulator) const
{
  const TargetPhrase &target = cur_hypo.GetCurrTargetPhrase();
  const Range &src_rng = cur_hypo.GetCurrSourceWordsRange();
  const AlignmentInfo &align = target.GetAlignTerm();
  const size_t sourceOffset = src_rng.GetStartPos();
  const bool isCompleted = cur_hypo.IsSourceCompleted();

  // Segmented tokens of the target phrase, taken from factor tFactor.
  const size_t numWords = cur_hypo.GetCurrTargetLength();
  vector<string> targ_phrase;
  targ_phrase.reserve(numWords);
  for (size_t i = 0; i < numWords; ++i) {
    targ_phrase.push_back(target.GetWord(i).GetFactor(tFactor)->GetString().as_string());
  }

  dsgHypothesis obj;
  obj.setState(prev_state);
  obj.setPhrases( targ_phrase );
  obj.calculateDsgProb(*DSGM,*desegT,isCompleted,align, sourceOffset, optimistic);

  vector<float> scores;
  obj.populateScores(scores,numFeatures);
  accumulator->PlusEquals(this, scores);

  return obj.saveState();
}
|
||||
|
||||
/**
 * Chart-decoding entry point. This feature does not implement it, so the
 * call always throws via UTIL_THROW2.
 */
FFState* DesegModel::EvaluateWhenApplied(
  const ChartHypothesis& /* cur_hypo */,
  int /* featureID - used to index the state in the previous hypotheses */,
  ScoreComponentCollection* accumulator) const
{
  UTIL_THROW2("Chart decoding not supported by DesegModel");
}
|
||||
|
||||
/**
 * Returns a newly allocated dsgState wrapping the language model's
 * begin-of-sentence state. The input sentence is not consulted.
 */
const FFState* DesegModel::EmptyHypothesisState(const InputType &input) const
{
  VERBOSE(3,"DesegModel::EmptyHypothesisState()" << endl);
  return new dsgState(DSGM->BeginSentenceState());
}
|
||||
|
||||
std::string DesegModel::GetScoreProducerWeightShortName(unsigned idx) const
|
||||
{
|
||||
return "dsg";
|
||||
}
|
||||
|
||||
|
||||
void DesegModel::SetParameter(const std::string& key, const std::string& value)
|
||||
{
|
||||
|
||||
if (key == "path") {
|
||||
m_lmPath = value;
|
||||
} else if (key == "contiguity-features") {
|
||||
if(value == "no")
|
||||
numFeatures = 1;
|
||||
else
|
||||
numFeatures = 5;
|
||||
} else if (key == "output-factor") {
|
||||
tFactor = Scan<int>(value);
|
||||
} else if (key == "optimistic") {
|
||||
if (value == "n")
|
||||
if (key == "path") {
|
||||
m_lmPath = value;
|
||||
} else if (key == "contiguity-features") {
|
||||
if(value == "no")
|
||||
numFeatures = 1;
|
||||
else
|
||||
numFeatures = 5;
|
||||
} else if (key == "output-factor") {
|
||||
tFactor = Scan<int>(value);
|
||||
} else if (key == "optimistic") {
|
||||
if (value == "n")
|
||||
optimistic = 0;
|
||||
else
|
||||
else
|
||||
optimistic = 1;
|
||||
} else if (key == "deseg-path") {
|
||||
m_desegPath = Scan<int>(value);
|
||||
} else if (key == "deseg-scheme") {
|
||||
if(value == "s")
|
||||
m_simple = 1;
|
||||
else
|
||||
m_simple = 0;
|
||||
} else if (key == "order") {
|
||||
order = Scan<int>(value);
|
||||
} else {
|
||||
StatefulFeatureFunction::SetParameter(key, value);
|
||||
}
|
||||
} else if (key == "deseg-path") {
|
||||
m_desegPath = Scan<int>(value);
|
||||
} else if (key == "deseg-scheme") {
|
||||
if(value == "s")
|
||||
m_simple = 1;
|
||||
else
|
||||
m_simple = 0;
|
||||
} else if (key == "order") {
|
||||
order = Scan<int>(value);
|
||||
} else {
|
||||
StatefulFeatureFunction::SetParameter(key, value);
|
||||
}
|
||||
}
|
||||
|
||||
bool DesegModel::IsUseable(const FactorMask &mask) const
|
||||
{
|
||||
bool ret = mask[0];
|
||||
return ret;
|
||||
}
|
||||
bool DesegModel::IsUseable(const FactorMask &mask) const
|
||||
{
|
||||
bool ret = mask[0];
|
||||
return ret;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
@ -13,52 +13,52 @@
|
||||
namespace Moses
|
||||
{
|
||||
|
||||
class DesegModel : public StatefulFeatureFunction
|
||||
{
|
||||
public:
|
||||
class DesegModel : public StatefulFeatureFunction
|
||||
{
|
||||
public:
|
||||
|
||||
DsgLM * DSGM;
|
||||
Desegmenter* desegT;
|
||||
int tFactor;// Target Factor ...
|
||||
int order;
|
||||
int numFeatures; // Number of features used an be 1 (unsegmented LM)or 5 (with 3 contiguity features and 1 UnsegWP)
|
||||
bool optimistic;
|
||||
DsgLM * DSGM;
|
||||
Desegmenter* desegT;
|
||||
int tFactor;// Target Factor ...
|
||||
int order;
|
||||
int numFeatures; // Number of features used an be 1 (unsegmented LM)or 5 (with 3 contiguity features and 1 UnsegWP)
|
||||
bool optimistic;
|
||||
|
||||
DesegModel(const std::string &line);
|
||||
~DesegModel();
|
||||
DesegModel(const std::string &line);
|
||||
~DesegModel();
|
||||
|
||||
void readLanguageModel(const char *);
|
||||
void Load(AllOptions::ptr const& opts);
|
||||
void readLanguageModel(const char *);
|
||||
void Load(AllOptions::ptr const& opts);
|
||||
|
||||
FFState* EvaluateWhenApplied(
|
||||
const Hypothesis& cur_hypo,
|
||||
const FFState* prev_state,
|
||||
ScoreComponentCollection* accumulator) const;
|
||||
FFState* EvaluateWhenApplied(
|
||||
const Hypothesis& cur_hypo,
|
||||
const FFState* prev_state,
|
||||
ScoreComponentCollection* accumulator) const;
|
||||
|
||||
virtual FFState* EvaluateWhenApplied(
|
||||
const ChartHypothesis& /* cur_hypo */,
|
||||
int /* featureID - used to index the state in the previous hypotheses */,
|
||||
ScoreComponentCollection* accumulator) const;
|
||||
virtual FFState* EvaluateWhenApplied(
|
||||
const ChartHypothesis& /* cur_hypo */,
|
||||
int /* featureID - used to index the state in the previous hypotheses */,
|
||||
ScoreComponentCollection* accumulator) const;
|
||||
|
||||
void EvaluateInIsolation(const Phrase &source
|
||||
, const TargetPhrase &targetPhrase
|
||||
, ScoreComponentCollection &scoreBreakdown
|
||||
, ScoreComponentCollection &estimatedScores) const;
|
||||
void EvaluateInIsolation(const Phrase &source
|
||||
, const TargetPhrase &targetPhrase
|
||||
, ScoreComponentCollection &scoreBreakdown
|
||||
, ScoreComponentCollection &estimatedScores) const;
|
||||
|
||||
virtual const FFState* EmptyHypothesisState(const InputType &input) const;
|
||||
virtual const FFState* EmptyHypothesisState(const InputType &input) const;
|
||||
|
||||
virtual std::string GetScoreProducerWeightShortName(unsigned idx=0) const;
|
||||
virtual std::string GetScoreProducerWeightShortName(unsigned idx=0) const;
|
||||
|
||||
void SetParameter(const std::string& key, const std::string& value);
|
||||
void SetParameter(const std::string& key, const std::string& value);
|
||||
|
||||
bool IsUseable(const FactorMask &mask) const;
|
||||
bool IsUseable(const FactorMask &mask) const;
|
||||
|
||||
protected:
|
||||
typedef std::vector<float> Scores;
|
||||
std::string m_lmPath;
|
||||
std::string m_desegPath;
|
||||
bool m_simple; //desegmentation scheme; if 1 then use simple, else use rule and backoff to simple
|
||||
};
|
||||
protected:
|
||||
typedef std::vector<float> Scores;
|
||||
std::string m_lmPath;
|
||||
std::string m_desegPath;
|
||||
bool m_simple; //desegmentation scheme; if 1 then use simple, else use rule and backoff to simple
|
||||
};
|
||||
|
||||
|
||||
}
|
||||
|
@ -3,32 +3,32 @@
|
||||
namespace Moses
|
||||
{
|
||||
|
||||
/**
 * Creates the KenDsg wrapper matching the model stored in `file`.
 *
 * If RecognizeBinary() reports a binary file, the wrapper is chosen by the
 * detected model type (an unknown type throws via UTIL_THROW2). Otherwise
 * the file is handed to a ProbingModel-based wrapper.
 *
 * @return a heap-allocated model owned by the caller.
 */
DsgLM* ConstructDsgLM(const char *file)
{
  lm::ngram::ModelType model_type;
  lm::ngram::Config config;

  if (!lm::ngram::RecognizeBinary(file, model_type)) {
    return new KenDsg<lm::ngram::ProbingModel>(file, config);
  }

  switch(model_type) {
  case lm::ngram::PROBING:
    return new KenDsg<lm::ngram::ProbingModel>(file, config);
  case lm::ngram::REST_PROBING:
    return new KenDsg<lm::ngram::RestProbingModel>(file, config);
  case lm::ngram::TRIE:
    return new KenDsg<lm::ngram::TrieModel>(file, config);
  case lm::ngram::QUANT_TRIE:
    return new KenDsg<lm::ngram::QuantTrieModel>(file, config);
  case lm::ngram::ARRAY_TRIE:
    return new KenDsg<lm::ngram::ArrayTrieModel>(file, config);
  case lm::ngram::QUANT_ARRAY_TRIE:
    return new KenDsg<lm::ngram::QuantArrayTrieModel>(file, config);
  default:
    UTIL_THROW2("Unrecognized kenlm model type " << model_type);
  }
}
|
||||
|
||||
} // namespace
|
||||
|
||||
|
||||
|
@ -8,7 +8,7 @@ namespace Moses
|
||||
|
||||
class KenDsgBase
|
||||
{
|
||||
public:
|
||||
public:
|
||||
virtual ~KenDsgBase() {}
|
||||
|
||||
virtual float Score(const lm::ngram::State&, StringPiece,
|
||||
@ -22,17 +22,17 @@ class KenDsgBase
|
||||
};
|
||||
|
||||
template <class KenModel>
|
||||
class KenDsg : public KenDsgBase
|
||||
class KenDsg : public KenDsgBase
|
||||
{
|
||||
public:
|
||||
public:
|
||||
KenDsg(const char *file, const lm::ngram::Config &config)
|
||||
: m_kenlm(file, config) {}
|
||||
|
||||
float Score(const lm::ngram::State &in_state,
|
||||
StringPiece word,
|
||||
lm::ngram::State &out_state) const {
|
||||
StringPiece word,
|
||||
lm::ngram::State &out_state) const {
|
||||
return m_kenlm.Score(in_state, m_kenlm.GetVocabulary().Index(word),
|
||||
out_state);
|
||||
out_state);
|
||||
}
|
||||
|
||||
const lm::ngram::State &BeginSentenceState() const {
|
||||
@ -48,13 +48,13 @@ template <class KenModel>
|
||||
}
|
||||
|
||||
|
||||
private:
|
||||
private:
|
||||
KenModel m_kenlm;
|
||||
};
|
||||
|
||||
typedef KenDsgBase DsgLM;
|
||||
typedef KenDsgBase DsgLM;
|
||||
|
||||
DsgLM* ConstructDsgLM(const char *file);
|
||||
DsgLM* ConstructDsgLM(const char *file);
|
||||
|
||||
|
||||
} // namespace
|
||||
|
@ -2,9 +2,9 @@
|
||||
#include <sstream>
|
||||
#include <boost/algorithm/string.hpp>
|
||||
#include <algorithm>
|
||||
#include <cstdlib>
|
||||
#include <math.h>
|
||||
#include <map>
|
||||
#include <cstdlib>
|
||||
#include <math.h>
|
||||
#include <map>
|
||||
|
||||
|
||||
using namespace std;
|
||||
@ -12,356 +12,380 @@ using namespace lm::ngram;
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
/**
 * Creates a state holding the given language-model state.
 *
 * delta is zeroed here: a state built only through this constructor (as in
 * DesegModel::EmptyHypothesisState) is later read through getDelta(), which
 * would otherwise return an uninitialized value.
 */
dsgState::dsgState(const State & val)
{
  lmState = val;
  delta = 0;
}
|
||||
|
||||
void dsgState::saveState( std::vector<std::string> danglingTok, std::vector<int> srcSpans,float deltaValue)
|
||||
{
|
||||
buffer = danglingTok;
|
||||
span=srcSpans;
|
||||
delta=deltaValue;
|
||||
}
|
||||
void dsgState::saveState( std::vector<std::string> danglingTok, std::vector<int> srcSpans,float deltaValue)
|
||||
{
|
||||
buffer = danglingTok;
|
||||
span=srcSpans;
|
||||
delta=deltaValue;
|
||||
}
|
||||
|
||||
|
||||
size_t dsgState::hash() const
|
||||
{
|
||||
size_t dsgState::hash() const
|
||||
{
|
||||
|
||||
size_t ret = 0;
|
||||
boost::hash_combine(ret, lmState);
|
||||
size_t ret = 0;
|
||||
boost::hash_combine(ret, lmState);
|
||||
|
||||
/*size_t ret = delta;
|
||||
/*size_t ret = delta;
|
||||
boost::hash_combine(ret, buffer);
|
||||
boost::hash_combine(ret, span);
|
||||
boost::hash_combine(ret, lmState.length);
|
||||
return ret;*/
|
||||
}
|
||||
|
||||
bool dsgState::operator==(const FFState& otherBase) const //CHECK
|
||||
{
|
||||
const dsgState &other = static_cast<const dsgState&>(otherBase);
|
||||
|
||||
if (lmState < other.lmState) return false;
|
||||
if (lmState == other.lmState) return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
// ----------------------------------------
|
||||
|
||||
std::string dsgState :: getName() const
|
||||
{
|
||||
return "done";
|
||||
}
|
||||
|
||||
/**
 * Starts with all scores at zero and an empty token buffer.
 *
 * delta is zeroed as well: setState() leaves it untouched when given NULL,
 * and saveState() reads it.
 */
dsgHypothesis :: dsgHypothesis()
{
  lmProb = 0;
  discontig0 = 0;
  discontig1 = 0;
  discontig2 = 0;
  UnsegWP = 0;
  delta = 0;
  m_buffer.clear();//="";
}
|
||||
|
||||
void dsgHypothesis :: setState(const FFState* prev_state)
|
||||
{
|
||||
if(prev_state != NULL) {
|
||||
m_buffer = static_cast <const dsgState *> (prev_state)->getBuffer();
|
||||
m_span = static_cast <const dsgState *> (prev_state)->getSpan();
|
||||
lmState = static_cast <const dsgState *> (prev_state)->getLMState();
|
||||
delta = static_cast <const dsgState *> (prev_state)->getDelta(); //NEW
|
||||
}
|
||||
}
|
||||
|
||||
bool dsgState::operator==(const FFState& otherBase) const //CHECK
|
||||
{
|
||||
const dsgState &other = static_cast<const dsgState&>(otherBase);
|
||||
/**
 * Packs the current language-model state, buffer, span and delta into a
 * newly allocated dsgState, which the caller receives as a raw pointer.
 */
dsgState * dsgHypothesis :: saveState()
{
  dsgState * const snapshot = new dsgState(lmState);
  snapshot->saveState(m_buffer, m_span, delta);
  return snapshot;
}
|
||||
|
||||
if (lmState < other.lmState) return false;
|
||||
if (lmState == other.lmState) return true;
|
||||
/**
 * Replaces the contents of `scores` with this hypothesis' feature values:
 * lmProb alone when numFeatures is 1, otherwise lmProb followed by
 * discontig0, discontig1, discontig2 and UnsegWP.
 */
void dsgHypothesis :: populateScores(vector <float> & scores , const int numFeatures)
{
  scores.clear();
  scores.push_back(lmProb);

  if (numFeatures != 1) {
    scores.push_back(discontig0);
    scores.push_back(discontig1);
    scores.push_back(discontig2);
    scores.push_back(UnsegWP);
  }
}
|
||||
|
||||
|
||||
|
||||
/**
 * A token is a prefix when it ends in '+' and is not the bare "+" token.
 * An empty token is not a prefix (previously at() threw std::out_of_range).
 */
bool dsgHypothesis::isPrefix(const std::string &tok)
{
  if (tok.empty()) {
    return false;
  }
  return tok[tok.size() - 1] == '+' && tok != "+";
}
|
||||
|
||||
/**
 * A token is a suffix when it starts with '+' and is not the bare "+" token.
 * An empty token is not a suffix (previously at() threw std::out_of_range).
 */
bool dsgHypothesis::isSuffix(const std::string &tok)
{
  if (tok.empty()) {
    return false;
  }
  return tok[0] == '+' && tok != "+";
}
|
||||
|
||||
/**
 * A token is a stem when it neither starts nor ends with '+'.
 * An empty token carries no '+' marker and is therefore reported as a stem
 * (previously at() threw std::out_of_range).
 */
bool dsgHypothesis::isStem(const std::string &tok)
{
  if (tok.empty()) {
    return true;
  }
  return tok[0] != '+' && tok[tok.size() - 1] != '+';
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* chain stores segmented tokens that are in process of building a word
|
||||
* The function checks if tok contributes to the word being formed in chain
|
||||
*
|
||||
*/
|
||||
bool dsgHypothesis::isValidChain(const std::string &tok, std::vector<std::string> &chain)
|
||||
{
|
||||
std::string last_tok;
|
||||
if (chain.size() >= 1) {
|
||||
last_tok = chain[chain.size() - 1];
|
||||
} else {
|
||||
last_tok = "NULL";
|
||||
}
|
||||
if(tok=="+") {
|
||||
return false;
|
||||
}
|
||||
|
||||
// ----------------------------------------
|
||||
|
||||
std::string dsgState :: getName() const
|
||||
{
|
||||
return "done";
|
||||
if (isPrefix(tok) && (chain.size() == 0 || isPrefix(last_tok))) {
|
||||
return true;
|
||||
} else if (isSuffix(tok) && (chain.size() != 0 && ( isStem(last_tok) || isPrefix(last_tok)))) {
|
||||
return true; // allows one suffix ONLY
|
||||
}
|
||||
|
||||
dsgHypothesis :: dsgHypothesis()
|
||||
{
|
||||
lmProb = 0;
|
||||
discontig0 = 0;
|
||||
discontig1 = 0;
|
||||
discontig2 = 0;
|
||||
UnsegWP = 0;
|
||||
m_buffer.clear();//="";
|
||||
//else if (isSuffix(tok) && (chain.size() != 0 && ( isStem(last_tok) || isPrefix(last_tok) || isSuffix(last_tok) ))) { return true; } // allows multiple suffixes
|
||||
else if (isStem(tok) && (chain.size() == 0 || isPrefix(last_tok))) {
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
void dsgHypothesis :: setState(const FFState* prev_state)
|
||||
{
|
||||
if(prev_state != NULL) {
|
||||
m_buffer = static_cast <const dsgState *> (prev_state)->getBuffer();
|
||||
m_span = static_cast <const dsgState *> (prev_state)->getSpan();
|
||||
lmState = static_cast <const dsgState *> (prev_state)->getLMState();
|
||||
delta = static_cast <const dsgState *> (prev_state)->getDelta(); //NEW
|
||||
/**
|
||||
* grouper function groups tokens that form a word together
|
||||
*/
|
||||
vector<string> dsgHypothesis::grouper(std::vector<std::string> &phr_vec,vector<vector<int> > &allchain_ids, int sourceOffset,const AlignmentInfo &align, bool isolation)
|
||||
{
|
||||
|
||||
std::vector<std::string> chain;
|
||||
std::vector<int> chain_ids;
|
||||
std::vector<std::string> allchains;
|
||||
chain_ids=m_span;
|
||||
|
||||
if (!m_buffer.empty() && !isolation) { // if evaluate in isolation is called, then do not add buffer content
|
||||
for (int i = 0; i < m_buffer.size(); i++) { // initialize chain with the content of the buffer
|
||||
chain.push_back(m_buffer[i]);
|
||||
}
|
||||
}
|
||||
|
||||
dsgState * dsgHypothesis :: saveState()
|
||||
{
|
||||
dsgState * statePtr = new dsgState(lmState);
|
||||
statePtr->saveState(m_buffer, m_span, delta);
|
||||
return statePtr;
|
||||
}
|
||||
for (int i = 0; i < phr_vec.size(); i++) {
|
||||
std::set<std::size_t> sourcePosSet = align.GetAlignmentsForTarget(i);
|
||||
|
||||
void dsgHypothesis :: populateScores(vector <float> & scores , const int numFeatures)
|
||||
{
|
||||
scores.clear();
|
||||
scores.push_back(lmProb);
|
||||
|
||||
if (numFeatures == 1)
|
||||
return;
|
||||
scores.push_back(discontig0);
|
||||
scores.push_back(discontig1);
|
||||
scores.push_back(discontig2);
|
||||
scores.push_back(UnsegWP);
|
||||
}
|
||||
|
||||
|
||||
|
||||
bool dsgHypothesis::isPrefix(const std::string &tok){
|
||||
if ((tok.at(tok.size() - 1) == '+' )&& (tok != "+")) { return true; }
|
||||
else { return false; };
|
||||
}
|
||||
|
||||
bool dsgHypothesis::isSuffix(const std::string &tok){
|
||||
if ((tok.at(0) == '+' )&& (tok != "+")) { return true; }
|
||||
else { return false; };
|
||||
}
|
||||
|
||||
bool dsgHypothesis::isStem(const std::string &tok){
|
||||
if ((tok.at(0) != '+') && (tok.at(tok.size() - 1) != '+')){ return true; }
|
||||
else { return false; };
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* chain stores segmented tokens that are in process of building a word
|
||||
* The function checks if tok contributes to the word being formed in chain
|
||||
*
|
||||
*/
|
||||
bool dsgHypothesis::isValidChain(const std::string &tok, std::vector<std::string> &chain){
|
||||
std::string last_tok;
|
||||
if (chain.size() >= 1){
|
||||
last_tok = chain[chain.size() - 1];
|
||||
}
|
||||
else{
|
||||
last_tok = "NULL";
|
||||
}
|
||||
if(tok=="+"){return false;}
|
||||
if (isPrefix(tok) && (chain.size() == 0 || isPrefix(last_tok))) { return true; }
|
||||
else if (isSuffix(tok) && (chain.size() != 0 && ( isStem(last_tok) || isPrefix(last_tok)))) { return true; } // allows one suffix ONLY
|
||||
//else if (isSuffix(tok) && (chain.size() != 0 && ( isStem(last_tok) || isPrefix(last_tok) || isSuffix(last_tok) ))) { return true; } // allows multiple suffixes
|
||||
else if (isStem(tok) && (chain.size() == 0 || isPrefix(last_tok))) { return true; }
|
||||
else { return false; }
|
||||
}
|
||||
|
||||
/**
|
||||
* grouper function groups tokens that form a word together
|
||||
*/
|
||||
vector<string> dsgHypothesis::grouper(std::vector<std::string> &phr_vec,vector<vector<int> > &allchain_ids, int sourceOffset,const AlignmentInfo &align, bool isolation){
|
||||
|
||||
std::vector<std::string> chain;
|
||||
std::vector<int> chain_ids;
|
||||
std::vector<std::string> allchains;
|
||||
chain_ids=m_span;
|
||||
|
||||
if (!m_buffer.empty() && !isolation){// if evaluate in isolation is called, then do not add buffer content
|
||||
for (int i = 0; i < m_buffer.size(); i++){ // initialize chain with the content of the buffer
|
||||
chain.push_back(m_buffer[i]);
|
||||
if (isValidChain(phr_vec[i], chain)) {
|
||||
chain.push_back(phr_vec[i]);
|
||||
if (sourcePosSet.empty()==false) {
|
||||
for (std::set<size_t>::iterator it(sourcePosSet.begin()); it != sourcePosSet.end(); it++) {
|
||||
int cur=*it;
|
||||
chain_ids.push_back(cur+sourceOffset);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < phr_vec.size(); i++){
|
||||
std::set<std::size_t> sourcePosSet = align.GetAlignmentsForTarget(i);
|
||||
|
||||
if (isValidChain(phr_vec[i], chain)){
|
||||
chain.push_back(phr_vec[i]);
|
||||
if (sourcePosSet.empty()==false){
|
||||
for (std::set<size_t>::iterator it(sourcePosSet.begin());it != sourcePosSet.end(); it++) {
|
||||
int cur=*it;
|
||||
chain_ids.push_back(cur+sourceOffset);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
else if (chain.size() == 0) { // start of a suffix at hypothesis0
|
||||
allchains.push_back(phr_vec[i]);
|
||||
allchain_ids.push_back(chain_ids);
|
||||
chain_ids.clear();//={};
|
||||
}
|
||||
|
||||
else { // tokens formed a complete word; add tokens segmented by space to allchains
|
||||
std::string joined = boost::algorithm::join(chain, " ");
|
||||
allchains.push_back(joined);
|
||||
allchain_ids.push_back(chain_ids);
|
||||
|
||||
chain.clear();// = {};
|
||||
chain_ids.clear();//={};
|
||||
|
||||
chain.push_back(phr_vec[i]);
|
||||
if (sourcePosSet.empty()==false){
|
||||
for (std::set<size_t>::iterator it(sourcePosSet.begin());it != sourcePosSet.end(); it++) {
|
||||
int cur=*it;
|
||||
chain_ids.push_back(cur+sourceOffset);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
else if (chain.size() == 0) { // start of a suffix at hypothesis0
|
||||
allchains.push_back(phr_vec[i]);
|
||||
allchain_ids.push_back(chain_ids);
|
||||
chain_ids.clear();//={};
|
||||
}
|
||||
|
||||
if (!chain.empty()){
|
||||
else { // tokens formed a complete word; add tokens segmented by space to allchains
|
||||
std::string joined = boost::algorithm::join(chain, " ");
|
||||
allchains.push_back(joined);
|
||||
allchain_ids.push_back(chain_ids);
|
||||
|
||||
chain.clear();// = {};
|
||||
chain_ids.clear();//={};
|
||||
|
||||
chain.push_back(phr_vec[i]);
|
||||
if (sourcePosSet.empty()==false) {
|
||||
for (std::set<size_t>::iterator it(sourcePosSet.begin()); it != sourcePosSet.end(); it++) {
|
||||
int cur=*it;
|
||||
chain_ids.push_back(cur+sourceOffset);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
return allchains;
|
||||
|
||||
}
|
||||
|
||||
if (!chain.empty()) {
|
||||
std::string joined = boost::algorithm::join(chain, " ");
|
||||
allchains.push_back(joined);
|
||||
allchain_ids.push_back(chain_ids);
|
||||
}
|
||||
return allchains;
|
||||
}
|
||||
|
||||
|
||||
void dsgHypothesis :: calculateDsgProbinIsol(DsgLM & ptrDsgLM, Desegmenter &desegT, const AlignmentInfo &align ){
|
||||
lmProb = 0;
|
||||
State currState = lmState;
|
||||
State temp;
|
||||
string desegmented="";
|
||||
vector <string> words;
|
||||
vector <string> currFVec;
|
||||
|
||||
discontig0=0;
|
||||
discontig1=0;
|
||||
discontig2=0;
|
||||
UnsegWP=0;
|
||||
void dsgHypothesis :: calculateDsgProbinIsol(DsgLM & ptrDsgLM, Desegmenter &desegT, const AlignmentInfo &align )
|
||||
{
|
||||
lmProb = 0;
|
||||
State currState = lmState;
|
||||
State temp;
|
||||
string desegmented="";
|
||||
vector <string> words;
|
||||
vector <string> currFVec;
|
||||
|
||||
currFVec = m_buffer;
|
||||
currFVec.insert( currFVec.end(), m_curr_phr.begin(), m_curr_phr.end() );
|
||||
discontig0=0;
|
||||
discontig1=0;
|
||||
discontig2=0;
|
||||
UnsegWP=0;
|
||||
|
||||
int vecSize=currFVec.size();
|
||||
currFVec = m_buffer;
|
||||
currFVec.insert( currFVec.end(), m_curr_phr.begin(), m_curr_phr.end() );
|
||||
|
||||
// phrases with suffix-starts and prefix-end
|
||||
if (currFVec.size()>0 && isPrefix (currFVec.back())) {
|
||||
UnsegWP-=0.5;}
|
||||
if (currFVec.size()>0 && isSuffix (currFVec.front())) {
|
||||
UnsegWP-=0.5;}
|
||||
int vecSize=currFVec.size();
|
||||
|
||||
/* //Dropping prefix-end and suffix-start
|
||||
while (currFVec.size()>0 && isPrefix (currFVec.back())){
|
||||
currFVec.pop_back(); //drop prefix appearing at end of phrase
|
||||
}
|
||||
// phrases with suffix-starts and prefix-end
|
||||
if (currFVec.size()>0 && isPrefix (currFVec.back())) {
|
||||
UnsegWP-=0.5;
|
||||
}
|
||||
if (currFVec.size()>0 && isSuffix (currFVec.front())) {
|
||||
UnsegWP-=0.5;
|
||||
}
|
||||
|
||||
while (currFVec.size()>0 && isSuffix (currFVec.front())){
|
||||
currFVec.erase (currFVec.begin()); //drop suffix appearing at start of a phrase
|
||||
} */
|
||||
/* //Dropping prefix-end and suffix-start
|
||||
while (currFVec.size()>0 && isPrefix (currFVec.back())){
|
||||
currFVec.pop_back(); //drop prefix appearing at end of phrase
|
||||
}
|
||||
|
||||
vector<vector<int> > chain_ids;
|
||||
words = grouper(currFVec,chain_ids,0,align,1);
|
||||
while (currFVec.size()>0 && isSuffix (currFVec.front())){
|
||||
currFVec.erase (currFVec.begin()); //drop suffix appearing at start of a phrase
|
||||
} */
|
||||
|
||||
for (int i = 0; i<words.size(); i++) {
|
||||
vector<vector<int> > chain_ids;
|
||||
words = grouper(currFVec,chain_ids,0,align,1);
|
||||
|
||||
for (int i = 0; i<words.size(); i++) {
|
||||
UnsegWP+=1;
|
||||
temp = currState;
|
||||
if (words[i].find(" ")!=std::string::npos) {
|
||||
desegmented=desegT.Search(words[i])[0];
|
||||
lmProb += ptrDsgLM.Score(temp,desegmented,currState);
|
||||
} else {
|
||||
boost::replace_all(words[i], "-LRB-", "(");
|
||||
boost::replace_all(words[i], "-RRB-", ")");
|
||||
lmProb += ptrDsgLM.Score(temp,words[i],currState);
|
||||
}
|
||||
}
|
||||
lmState = currState;
|
||||
}
|
||||
|
||||
void dsgHypothesis :: calculateDsgProb(DsgLM& ptrDsgLM, Desegmenter &desegT, bool isCompleted , const AlignmentInfo &align, int sourceOffset, bool optimistic)
|
||||
{
|
||||
lmProb = 0;
|
||||
discontig0=0;
|
||||
discontig1=0;
|
||||
discontig2=0;
|
||||
UnsegWP=0;
|
||||
|
||||
State currState = lmState;
|
||||
State temp;
|
||||
string desegmented="";
|
||||
vector <string> words;
|
||||
vector <string> currFVec;
|
||||
bool completePhraseSuffixEnd = false;
|
||||
vector<vector<int> > all_chain_ids;
|
||||
double pscore;
|
||||
currFVec=m_curr_phr;
|
||||
|
||||
// Check if the phrase ends in a suffix, which means that it completes a full word; make sure to change isValidChain
|
||||
if (isSuffix (currFVec.back()) && (currFVec.back()!="+")) {
|
||||
completePhraseSuffixEnd=true;
|
||||
}
|
||||
|
||||
words = grouper(currFVec,all_chain_ids,sourceOffset,align,0);
|
||||
|
||||
for (int i = 0; i < words.size(); i++) {
|
||||
temp = currState;
|
||||
|
||||
if (i==words.size()-1) {
|
||||
if (completePhraseSuffixEnd) { //i.e if phrase ends with suffix, which marks an end of a word
|
||||
m_buffer.clear();// ="";
|
||||
m_span.clear();// ={};
|
||||
} else if (!isCompleted) { // not end of sentence( or final hypothesis), and probably the last token is not a complete word
|
||||
m_buffer.clear();
|
||||
if (optimistic == 1) {
|
||||
if ( isPrefix (currFVec.back())) { // this will delay scoring of prefix in prefix-ending phrases until the next hypothesis arrives
|
||||
//pscore = ptrDsgLM.Score(temp,desegmented,currState);
|
||||
lmProb -= delta;
|
||||
delta = 0.0;
|
||||
}
|
||||
|
||||
else if (words[i].find(" ")!=std::string::npos) {
|
||||
desegmented=desegT.Search(words[i])[0];
|
||||
pscore=ptrDsgLM.Score(temp,desegmented,currState);
|
||||
lmProb = lmProb + pscore - delta;
|
||||
delta=pscore;
|
||||
currState=temp;
|
||||
} else {
|
||||
boost::replace_all(words[i], "-LRB-", "(");
|
||||
boost::replace_all(words[i], "-RRB-", ")");
|
||||
pscore=ptrDsgLM.Score(temp,words[i],currState);
|
||||
lmProb = lmProb + pscore - delta;
|
||||
delta=pscore;
|
||||
currState=temp;
|
||||
}
|
||||
}
|
||||
|
||||
m_buffer.push_back(words.back());
|
||||
m_span=all_chain_ids.back();
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
//temp = currState;
|
||||
if (words[i].find(" ")!=std::string::npos) {
|
||||
UnsegWP+=1;
|
||||
temp = currState;
|
||||
if (words[i].find(" ")!=std::string::npos){
|
||||
desegmented=desegT.Search(words[i])[0];
|
||||
lmProb += ptrDsgLM.Score(temp,desegmented,currState);
|
||||
}
|
||||
else{
|
||||
boost::replace_all(words[i], "-LRB-", "(");
|
||||
boost::replace_all(words[i], "-RRB-", ")");
|
||||
lmProb += ptrDsgLM.Score(temp,words[i],currState);
|
||||
desegmented=desegT.Search(words[i])[0];
|
||||
std::set<int> cur_chain_ids(all_chain_ids[i].begin(),all_chain_ids[i].end());
|
||||
if (cur_chain_ids.size()>1) {
|
||||
vector<int> dsc;
|
||||
for (std::set<int>::iterator it(cur_chain_ids.begin()), next(it); it != cur_chain_ids.end() && ++next != cur_chain_ids.end(); it = next) {
|
||||
int cur=*it;
|
||||
int mynext=*next;
|
||||
if (std::abs(cur - mynext)>= 3) {
|
||||
dsc.push_back(3);
|
||||
} else if (std::abs(cur - mynext)== 2) {
|
||||
dsc.push_back(2);
|
||||
} else if (std::abs(cur - mynext)<= 1) {
|
||||
dsc.push_back(1);
|
||||
}
|
||||
}
|
||||
int mymax=*std::max_element(dsc.begin(),dsc.end());
|
||||
if (mymax==3) {
|
||||
discontig2+=1;
|
||||
} else if (mymax==2) {
|
||||
discontig1+=1;
|
||||
} else {
|
||||
discontig0+=1;
|
||||
}
|
||||
} else {
|
||||
discontig0 += 1;
|
||||
}
|
||||
|
||||
lmProb += ptrDsgLM.Score(temp,desegmented,currState);
|
||||
} else {
|
||||
UnsegWP+=1;
|
||||
boost::replace_all(words[i], "-LRB-", "(");
|
||||
boost::replace_all(words[i], "-RRB-", ")");
|
||||
lmProb += ptrDsgLM.Score(temp,words[i],currState);
|
||||
}
|
||||
lmState = currState;
|
||||
}
|
||||
|
||||
void dsgHypothesis :: calculateDsgProb(DsgLM& ptrDsgLM, Desegmenter &desegT, bool isCompleted , const AlignmentInfo &align, int sourceOffset, bool optimistic)
|
||||
{
|
||||
lmProb = 0;
|
||||
discontig0=0;
|
||||
discontig1=0;
|
||||
discontig2=0;
|
||||
UnsegWP=0;
|
||||
|
||||
State currState = lmState;
|
||||
State temp;
|
||||
string desegmented="";
|
||||
vector <string> words;
|
||||
vector <string> currFVec;
|
||||
bool completePhraseSuffixEnd = false;
|
||||
vector<vector<int> > all_chain_ids;
|
||||
double pscore;
|
||||
currFVec=m_curr_phr;
|
||||
|
||||
// Check if the phrase ends in a suffix, which means that it completes a full word; make sure to change isValidChain
|
||||
if (isSuffix (currFVec.back()) && (currFVec.back()!="+")){completePhraseSuffixEnd=true;}
|
||||
|
||||
words = grouper(currFVec,all_chain_ids,sourceOffset,align,0);
|
||||
|
||||
for (int i = 0; i < words.size(); i++) {
|
||||
temp = currState;
|
||||
|
||||
if (i==words.size()-1){
|
||||
if (completePhraseSuffixEnd){ //i.e if phrase ends with suffix, which marks an end of a word
|
||||
m_buffer.clear();// ="";
|
||||
m_span.clear();// ={};
|
||||
}
|
||||
else if (!isCompleted) { // not end of sentence( or final hypothesis), and probably the last token is not a complete word
|
||||
m_buffer.clear();
|
||||
if (optimistic == 1){
|
||||
if ( isPrefix (currFVec.back())){ // this will delay scoring of prefix in prefix-ending phrases until the next hypothesis arrives
|
||||
//pscore = ptrDsgLM.Score(temp,desegmented,currState);
|
||||
lmProb -= delta;
|
||||
delta = 0.0;
|
||||
}
|
||||
|
||||
else if (words[i].find(" ")!=std::string::npos){
|
||||
desegmented=desegT.Search(words[i])[0];
|
||||
pscore=ptrDsgLM.Score(temp,desegmented,currState);
|
||||
lmProb = lmProb + pscore - delta;
|
||||
delta=pscore;
|
||||
currState=temp;
|
||||
}
|
||||
else{
|
||||
boost::replace_all(words[i], "-LRB-", "(");
|
||||
boost::replace_all(words[i], "-RRB-", ")");
|
||||
pscore=ptrDsgLM.Score(temp,words[i],currState);
|
||||
lmProb = lmProb + pscore - delta;
|
||||
delta=pscore;
|
||||
currState=temp;
|
||||
} }
|
||||
|
||||
m_buffer.push_back(words.back());
|
||||
m_span=all_chain_ids.back();
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
//temp = currState;
|
||||
if (words[i].find(" ")!=std::string::npos){
|
||||
UnsegWP+=1;
|
||||
desegmented=desegT.Search(words[i])[0];
|
||||
std::set<int> cur_chain_ids(all_chain_ids[i].begin(),all_chain_ids[i].end());
|
||||
if (cur_chain_ids.size()>1){
|
||||
vector<int> dsc;
|
||||
for (std::set<int>::iterator it(cur_chain_ids.begin()), next(it);it != cur_chain_ids.end() && ++next != cur_chain_ids.end(); it = next) {
|
||||
int cur=*it;
|
||||
int mynext=*next;
|
||||
if (std::abs(cur - mynext)>= 3) {
|
||||
dsc.push_back(3);
|
||||
}
|
||||
else if (std::abs(cur - mynext)== 2){
|
||||
dsc.push_back(2);
|
||||
}
|
||||
else if (std::abs(cur - mynext)<= 1){
|
||||
dsc.push_back(1);
|
||||
}
|
||||
}
|
||||
int mymax=*std::max_element(dsc.begin(),dsc.end());
|
||||
if (mymax==3){discontig2+=1;}
|
||||
else if (mymax==2){discontig1+=1;}
|
||||
else{discontig0+=1;}
|
||||
}
|
||||
else{
|
||||
discontig0 += 1;
|
||||
}
|
||||
|
||||
lmProb += ptrDsgLM.Score(temp,desegmented,currState);
|
||||
}
|
||||
else{
|
||||
UnsegWP+=1;
|
||||
boost::replace_all(words[i], "-LRB-", "(");
|
||||
boost::replace_all(words[i], "-RRB-", ")");
|
||||
lmProb += ptrDsgLM.Score(temp,words[i],currState);
|
||||
}
|
||||
}
|
||||
|
||||
if (isCompleted){
|
||||
temp = currState;
|
||||
lmProb = lmProb + ptrDsgLM.ScoreEndSentence(temp,currState) - delta;
|
||||
}
|
||||
lmState = currState;
|
||||
if (isCompleted) {
|
||||
temp = currState;
|
||||
lmProb = lmProb + ptrDsgLM.ScoreEndSentence(temp,currState) - delta;
|
||||
}
|
||||
lmState = currState;
|
||||
}
|
||||
|
||||
|
||||
void dsgHypothesis :: print()
|
||||
{}
|
||||
void dsgHypothesis :: print()
|
||||
{}
|
||||
|
||||
|
||||
} // namespace
|
||||
|
@ -14,53 +14,53 @@
|
||||
namespace Moses
|
||||
{
|
||||
|
||||
class dsgState : public FFState
|
||||
{
|
||||
public:
|
||||
class dsgState : public FFState
|
||||
{
|
||||
public:
|
||||
|
||||
dsgState(const lm::ngram::State & val);
|
||||
virtual bool operator==(const FFState& other) const;
|
||||
void saveState( std::vector<std::string> bufferVal,std::vector<int> spanVal, float deltaValue);
|
||||
dsgState(const lm::ngram::State & val);
|
||||
virtual bool operator==(const FFState& other) const;
|
||||
void saveState( std::vector<std::string> bufferVal,std::vector<int> spanVal, float deltaValue);
|
||||
|
||||
std::vector<std::string> getBuffer() const {
|
||||
return buffer;
|
||||
}
|
||||
std::vector<std::string> getBuffer() const {
|
||||
return buffer;
|
||||
}
|
||||
|
||||
std::vector<int> getSpan() const {
|
||||
return span;
|
||||
}
|
||||
std::vector<int> getSpan() const {
|
||||
return span;
|
||||
}
|
||||
|
||||
lm::ngram::State getLMState() const {
|
||||
return lmState;
|
||||
}
|
||||
lm::ngram::State getLMState() const {
|
||||
return lmState;
|
||||
}
|
||||
|
||||
float getDelta() const {
|
||||
return delta;
|
||||
}
|
||||
float getDelta() const {
|
||||
return delta;
|
||||
}
|
||||
|
||||
void setDelta(double val1 ) {
|
||||
delta = val1;
|
||||
}
|
||||
void setDelta(double val1 ) {
|
||||
delta = val1;
|
||||
}
|
||||
|
||||
void print() const;
|
||||
std::string getName() const;
|
||||
void print() const;
|
||||
std::string getName() const;
|
||||
|
||||
virtual size_t hash() const;
|
||||
virtual size_t hash() const;
|
||||
|
||||
|
||||
protected:
|
||||
std::vector<std::string> buffer;
|
||||
std::vector<int> span;
|
||||
lm::ngram::State lmState;
|
||||
double delta; //NEW
|
||||
};
|
||||
protected:
|
||||
std::vector<std::string> buffer;
|
||||
std::vector<int> span;
|
||||
lm::ngram::State lmState;
|
||||
double delta; //NEW
|
||||
};
|
||||
|
||||
|
||||
|
||||
class dsgHypothesis
|
||||
{
|
||||
|
||||
private:
|
||||
private:
|
||||
std::vector<std::string> m_buffer;// maintains dangling affix from previous hypothesis
|
||||
std::vector<int> m_span;// maintains source alignment for dangling affix from previous hypothesis
|
||||
lm::ngram::State lmState; // KenLM's Model State ...
|
||||
@ -73,7 +73,7 @@ class dsgHypothesis
|
||||
int discontig2;
|
||||
double UnsegWP; //Word Penalty score based on count of words
|
||||
|
||||
public:
|
||||
public:
|
||||
|
||||
dsgHypothesis();
|
||||
~dsgHypothesis() {};
|
||||
@ -84,7 +84,7 @@ class dsgHypothesis
|
||||
m_curr_phr = val1;
|
||||
}
|
||||
|
||||
void setDelta(double val1 ) {
|
||||
void setDelta(double val1 ) {
|
||||
delta = val1;
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user