diff --git a/moses/FF/Dsg-Feature/Desegmenter.cpp b/moses/FF/Dsg-Feature/Desegmenter.cpp index a9713831b..677de6e6e 100644 --- a/moses/FF/Dsg-Feature/Desegmenter.cpp +++ b/moses/FF/Dsg-Feature/Desegmenter.cpp @@ -11,73 +11,74 @@ using namespace std; namespace Moses { -void Desegmenter::Load(const string filename){ +void Desegmenter::Load(const string filename) +{ - std::ifstream myFile(filename.c_str() ); - if (myFile.is_open()){ - cerr << "Desegmentation File open successful." << endl; - string line; - while (getline(myFile, line)){ - stringstream ss(line); - string token; - vector myline; - while (getline(ss, token, '\t')){ - myline.push_back(token); - } - mmDesegTable.insert(pair(myline[2], myline[1] )); - } - myFile.close(); - } - else - cerr << "open() failed: check if Desegmentation file is in right folder" << endl; + std::ifstream myFile(filename.c_str() ); + if (myFile.is_open()) { + cerr << "Desegmentation File open successful." << endl; + string line; + while (getline(myFile, line)) { + stringstream ss(line); + string token; + vector myline; + while (getline(ss, token, '\t')) { + myline.push_back(token); + } + mmDesegTable.insert(pair(myline[2], myline[1] )); + } + myFile.close(); + } else + cerr << "open() failed: check if Desegmentation file is in right folder" << endl; } -vector Desegmenter::Search(string myKey){ - multimap::const_iterator mmiPairFound = mmDesegTable.find(myKey); - vector result; - if (mmiPairFound != mmDesegTable.end()){ - size_t nNumPairsInMap = mmDesegTable.count(myKey); - for (size_t nValuesCounter = 0; nValuesCounter < nNumPairsInMap; ++nValuesCounter){ - if (mmiPairFound != mmDesegTable.end()) { - result.push_back(mmiPairFound->second); - } - ++mmiPairFound; - } - return result; - } - else{ - string rule_deseg ; - rule_deseg = ApplyRules(myKey); - result.push_back(rule_deseg); - return result; - } +vector Desegmenter::Search(string myKey) +{ + multimap::const_iterator mmiPairFound = mmDesegTable.find(myKey); + vector result; + if (mmiPairFound != mmDesegTable.end()) { + size_t nNumPairsInMap = mmDesegTable.count(myKey); + for (size_t nValuesCounter = 0; nValuesCounter < nNumPairsInMap; ++nValuesCounter) { + if (mmiPairFound != mmDesegTable.end()) { + result.push_back(mmiPairFound->second); + } + ++mmiPairFound; + } + return result; + } else { + string rule_deseg ; + rule_deseg = ApplyRules(myKey); + result.push_back(rule_deseg); + return result; + } } -string Desegmenter::ApplyRules(string & segToken){ +string Desegmenter::ApplyRules(string & segToken) +{ - string desegToken=segToken; - if (!simple){ - boost::replace_all(desegToken, "l+ All", "ll"); - boost::replace_all(desegToken, "l+ Al", "ll"); - boost::replace_all(desegToken, "y+ y ", "y"); - boost::replace_all(desegToken, "p+ ", "t"); - boost::replace_all(desegToken, "' +", "}"); - boost::replace_all(desegToken, "y +", "A"); - boost::replace_all(desegToken, "n +n", "n"); - boost::replace_all(desegToken, "mn +m", "mm"); - boost::replace_all(desegToken, "En +m", "Em"); - boost::replace_all(desegToken, "An +lA", "Em"); - boost::replace_all(desegToken, "-LRB-", "("); - boost::replace_all(desegToken, "-RRB-", ")"); - } - - boost::replace_all(desegToken, "+ +", ""); - boost::replace_all(desegToken, "+ ", ""); - boost::replace_all(desegToken, " +", ""); - - return desegToken; + string desegToken=segToken; + if (!simple) { + boost::replace_all(desegToken, "l+ All", "ll"); + boost::replace_all(desegToken, "l+ Al", "ll"); + boost::replace_all(desegToken, "y+ y ", "y"); + boost::replace_all(desegToken, "p+ ", "t"); + boost::replace_all(desegToken, "' +", "}"); + boost::replace_all(desegToken, "y +", "A"); + boost::replace_all(desegToken, "n +n", "n"); + boost::replace_all(desegToken, "mn +m", "mm"); + boost::replace_all(desegToken, "En +m", "Em"); + boost::replace_all(desegToken, "An +lA", "Em"); + boost::replace_all(desegToken, "-LRB-", "("); + boost::replace_all(desegToken, "-RRB-", ")"); + } + + boost::replace_all(desegToken, "+ +", ""); + boost::replace_all(desegToken, "+ ", ""); + boost::replace_all(desegToken, " +", ""); + + return desegToken; } Desegmenter::~Desegmenter() diff --git a/moses/FF/Dsg-Feature/Desegmenter.h b/moses/FF/Dsg-Feature/Desegmenter.h index 397140f91..21da78d2e 100644 --- a/moses/FF/Dsg-Feature/Desegmenter.h +++ b/moses/FF/Dsg-Feature/Desegmenter.h @@ -11,21 +11,23 @@ namespace Moses class Desegmenter { private: - std::multimap mmDesegTable; - std::string filename; - bool simple; - void Load(const string filename); + std::multimap mmDesegTable; + std::string filename; + bool simple; + void Load(const string filename); public: - Desegmenter(const std::string& file, const bool scheme){ - filename = file; - simple=scheme; - Load(filename); - } - string getFileName(){ return filename; } - - vector Search(string myKey); - string ApplyRules(string &); - ~Desegmenter(); + Desegmenter(const std::string& file, const bool scheme) { + filename = file; + simple=scheme; + Load(filename); + } + string getFileName() { + return filename; + } + + vector Search(string myKey); + string ApplyRules(string &); + ~Desegmenter(); }; } diff --git a/moses/FF/Dsg-Feature/DsgModel.cpp b/moses/FF/Dsg-Feature/DsgModel.cpp index 090b5545a..0bd25a50b 100644 --- a/moses/FF/Dsg-Feature/DsgModel.cpp +++ b/moses/FF/Dsg-Feature/DsgModel.cpp @@ -10,147 +10,147 @@ using namespace lm::ngram; namespace Moses { - DesegModel::DesegModel(const std::string &line) - :StatefulFeatureFunction(5, line ) - { - tFactor = 0; - order=5; - numFeatures = 5; - optimistic = 1; - ReadParameters(); +DesegModel::DesegModel(const std::string &line) + :StatefulFeatureFunction(5, line ) +{ + tFactor = 0; + order=5; + numFeatures = 5; + optimistic = 1; + ReadParameters(); +} + +DesegModel::~DesegModel() +{ + delete DSGM; +} + +void DesegModel :: readLanguageModel(const char *lmFile) +{ + DSGM = ConstructDsgLM(m_lmPath.c_str()); + State startState = DSGM->NullContextState(); + desegT=new Desegmenter(m_desegPath,m_simple);// Desegmentation Table +} + + +void DesegModel::Load(AllOptions::ptr const& opts) +{ + m_options = opts; + readLanguageModel(m_lmPath.c_str()); +} + + + +void DesegModel:: EvaluateInIsolation(const Phrase &source + , const TargetPhrase &targetPhrase + , ScoreComponentCollection &scoreBreakdown + , ScoreComponentCollection &estimatedScores) const +{ + + dsgHypothesis obj; + vector myTargetPhrase; + vector scores; + vector targ_phrase; //stores the segmented tokens in the target phrase + const AlignmentInfo &align = targetPhrase.GetAlignTerm(); + + for (int i = 0; i < targetPhrase.GetSize(); i++) { + targ_phrase.push_back(targetPhrase.GetWord(i).GetFactor(tFactor)->GetString().as_string()); } - DesegModel::~DesegModel() - { - delete DSGM; + obj.setState(DSGM->NullContextState()); + obj.setPhrases(targ_phrase); + obj.calculateDsgProbinIsol(*DSGM,*desegT,align); + obj.populateScores(scores,numFeatures); + estimatedScores.PlusEquals(this, scores); +} + + +FFState* DesegModel::EvaluateWhenApplied( + const Hypothesis& cur_hypo, + const FFState* prev_state, + ScoreComponentCollection* accumulator) const +{ + const TargetPhrase &target = cur_hypo.GetCurrTargetPhrase(); + const Range &src_rng =cur_hypo.GetCurrSourceWordsRange(); + const AlignmentInfo &align = cur_hypo.GetCurrTargetPhrase().GetAlignTerm(); + size_t sourceOffset = src_rng.GetStartPos(); + + dsgHypothesis obj; + vector scores; + vector targ_phrase; //stores the segmented tokens in the target phrase + bool isCompleted; + + isCompleted=cur_hypo.IsSourceCompleted(); + for (int i = 0; i < cur_hypo.GetCurrTargetLength(); i++) { + targ_phrase.push_back(target.GetWord(i).GetFactor(tFactor)->GetString().as_string()); } - void DesegModel :: readLanguageModel(const char *lmFile) - { - DSGM = ConstructDsgLM(m_lmPath.c_str()); - State startState = DSGM->NullContextState(); - desegT=new Desegmenter(m_desegPath,m_simple);// Desegmentation Table - } + obj.setState(prev_state); + obj.setPhrases( targ_phrase ); + obj.calculateDsgProb(*DSGM,*desegT,isCompleted,align, sourceOffset, optimistic); + obj.populateScores(scores,numFeatures); + accumulator->PlusEquals(this, scores); + return obj.saveState(); + +} + +FFState* DesegModel::EvaluateWhenApplied( + const ChartHypothesis& /* cur_hypo */, + int /* featureID - used to index the state in the previous hypotheses */, + ScoreComponentCollection* accumulator) const +{ + UTIL_THROW2("Chart decoding not support by UTIL_THROW2"); +} + +const FFState* DesegModel::EmptyHypothesisState(const InputType &input) const +{ + VERBOSE(3,"DesegModel::EmptyHypothesisState()" << endl); + State startState = DSGM->BeginSentenceState(); + dsgState ss= dsgState(startState); + return new dsgState(ss); +} + +std::string DesegModel::GetScoreProducerWeightShortName(unsigned idx) const +{ + return "dsg"; +} - void DesegModel::Load(AllOptions::ptr const& opts) - { - m_options = opts; - readLanguageModel(m_lmPath.c_str()); - } +void DesegModel::SetParameter(const std::string& key, const std::string& value) +{ - - - void DesegModel:: EvaluateInIsolation(const Phrase &source - , const TargetPhrase &targetPhrase - , ScoreComponentCollection &scoreBreakdown - , ScoreComponentCollection &estimatedScores) const - { - - dsgHypothesis obj; - vector myTargetPhrase; - vector scores; - vector targ_phrase; //stores the segmented tokens in the target phrase - const AlignmentInfo &align = targetPhrase.GetAlignTerm(); - - for (int i = 0; i < targetPhrase.GetSize(); i++) { - targ_phrase.push_back(targetPhrase.GetWord(i).GetFactor(tFactor)->GetString().as_string()); - } - - obj.setState(DSGM->NullContextState()); - obj.setPhrases(targ_phrase); - obj.calculateDsgProbinIsol(*DSGM,*desegT,align); - obj.populateScores(scores,numFeatures); - estimatedScores.PlusEquals(this, scores); - } - - - FFState* DesegModel::EvaluateWhenApplied( - const Hypothesis& cur_hypo, - const FFState* prev_state, - ScoreComponentCollection* accumulator) const - { - const TargetPhrase &target = cur_hypo.GetCurrTargetPhrase(); - const Range &src_rng =cur_hypo.GetCurrSourceWordsRange(); - const AlignmentInfo &align = cur_hypo.GetCurrTargetPhrase().GetAlignTerm(); - size_t sourceOffset = src_rng.GetStartPos(); - - dsgHypothesis obj; - vector scores; - vector targ_phrase; //stores the segmented tokens in the target phrase - bool isCompleted; - - isCompleted=cur_hypo.IsSourceCompleted(); - for (int i = 0; i < cur_hypo.GetCurrTargetLength(); i++) { - targ_phrase.push_back(target.GetWord(i).GetFactor(tFactor)->GetString().as_string()); - } - - obj.setState(prev_state); - obj.setPhrases( targ_phrase ); - obj.calculateDsgProb(*DSGM,*desegT,isCompleted,align, sourceOffset, optimistic); - obj.populateScores(scores,numFeatures); - accumulator->PlusEquals(this, scores); - return obj.saveState(); - - } - - FFState* DesegModel::EvaluateWhenApplied( - const ChartHypothesis& /* cur_hypo */, - int /* featureID - used to index the state in the previous hypotheses */, - ScoreComponentCollection* accumulator) const - { - UTIL_THROW2("Chart decoding not support by UTIL_THROW2"); - } - - const FFState* DesegModel::EmptyHypothesisState(const InputType &input) const - { - VERBOSE(3,"DesegModel::EmptyHypothesisState()" << endl); - State startState = DSGM->BeginSentenceState(); - dsgState ss= dsgState(startState); - return new dsgState(ss); - } - - std::string DesegModel::GetScoreProducerWeightShortName(unsigned idx) const - { - return "dsg"; - } - - - void DesegModel::SetParameter(const std::string& key, const std::string& value) - { - - if (key == "path") { - m_lmPath = value; - } else if (key == "contiguity-features") { - if(value == "no") - numFeatures = 1; - else - numFeatures = 5; - } else if (key == "output-factor") { - tFactor = Scan(value); - } else if (key == "optimistic") { - if (value == "n") + if (key == "path") { + m_lmPath = value; + } else if (key == "contiguity-features") { + if(value == "no") + numFeatures = 1; + else + numFeatures = 5; + } else if (key == "output-factor") { + tFactor = Scan(value); + } else if (key == "optimistic") { + if (value == "n") optimistic = 0; - else + else optimistic = 1; - } else if (key == "deseg-path") { - m_desegPath = Scan(value); - } else if (key == "deseg-scheme") { - if(value == "s") - m_simple = 1; - else - m_simple = 0; - } else if (key == "order") { - order = Scan(value); - } else { - StatefulFeatureFunction::SetParameter(key, value); - } + } else if (key == "deseg-path") { + m_desegPath = Scan(value); + } else if (key == "deseg-scheme") { + if(value == "s") + m_simple = 1; + else + m_simple = 0; + } else if (key == "order") { + order = Scan(value); + } else { + StatefulFeatureFunction::SetParameter(key, value); } +} - bool DesegModel::IsUseable(const FactorMask &mask) const - { - bool ret = mask[0]; - return ret; - } +bool DesegModel::IsUseable(const FactorMask &mask) const +{ + bool ret = mask[0]; + return ret; +} } // namespace diff --git a/moses/FF/Dsg-Feature/DsgModel.h b/moses/FF/Dsg-Feature/DsgModel.h index f456123d2..7c19f0a63 100644 --- a/moses/FF/Dsg-Feature/DsgModel.h +++ b/moses/FF/Dsg-Feature/DsgModel.h @@ -13,52 +13,52 @@ namespace Moses { - class DesegModel : public StatefulFeatureFunction - { - public: +class DesegModel : public StatefulFeatureFunction +{ +public: - DsgLM * DSGM; - Desegmenter* desegT; - int tFactor;// Target Factor ... - int order; - int numFeatures; // Number of features used an be 1 (unsegmented LM)or 5 (with 3 contiguity features and 1 UnsegWP) - bool optimistic; + DsgLM * DSGM; + Desegmenter* desegT; + int tFactor;// Target Factor ... + int order; + int numFeatures; // Number of features used an be 1 (unsegmented LM)or 5 (with 3 contiguity features and 1 UnsegWP) + bool optimistic; - DesegModel(const std::string &line); - ~DesegModel(); + DesegModel(const std::string &line); + ~DesegModel(); - void readLanguageModel(const char *); - void Load(AllOptions::ptr const& opts); + void readLanguageModel(const char *); + void Load(AllOptions::ptr const& opts); - FFState* EvaluateWhenApplied( - const Hypothesis& cur_hypo, - const FFState* prev_state, - ScoreComponentCollection* accumulator) const; + FFState* EvaluateWhenApplied( + const Hypothesis& cur_hypo, + const FFState* prev_state, + ScoreComponentCollection* accumulator) const; - virtual FFState* EvaluateWhenApplied( - const ChartHypothesis& /* cur_hypo */, - int /* featureID - used to index the state in the previous hypotheses */, - ScoreComponentCollection* accumulator) const; + virtual FFState* EvaluateWhenApplied( + const ChartHypothesis& /* cur_hypo */, + int /* featureID - used to index the state in the previous hypotheses */, + ScoreComponentCollection* accumulator) const; - void EvaluateInIsolation(const Phrase &source - , const TargetPhrase &targetPhrase - , ScoreComponentCollection &scoreBreakdown - , ScoreComponentCollection &estimatedScores) const; + void EvaluateInIsolation(const Phrase &source + , const TargetPhrase &targetPhrase + , ScoreComponentCollection &scoreBreakdown + , ScoreComponentCollection &estimatedScores) const; - virtual const FFState* EmptyHypothesisState(const InputType &input) const; + virtual const FFState* EmptyHypothesisState(const InputType &input) const; - virtual std::string GetScoreProducerWeightShortName(unsigned idx=0) const; + virtual std::string GetScoreProducerWeightShortName(unsigned idx=0) const; - void SetParameter(const std::string& key, const std::string& value); + void SetParameter(const std::string& key, const std::string& value); - bool IsUseable(const FactorMask &mask) const; + bool IsUseable(const FactorMask &mask) const; - protected: - typedef std::vector Scores; - std::string m_lmPath; - std::string m_desegPath; - bool m_simple; //desegmentation scheme; if 1 then use simple, else use rule and backoff to simple - }; +protected: + typedef std::vector Scores; + std::string m_lmPath; + std::string m_desegPath; + bool m_simple; //desegmentation scheme; if 1 then use simple, else use rule and backoff to simple +}; } diff --git a/moses/FF/Dsg-Feature/KenDsg.cpp b/moses/FF/Dsg-Feature/KenDsg.cpp index 08a8dd0ed..b9fd92aef 100644 --- a/moses/FF/Dsg-Feature/KenDsg.cpp +++ b/moses/FF/Dsg-Feature/KenDsg.cpp @@ -3,32 +3,32 @@ namespace Moses { - DsgLM* ConstructDsgLM(const char *file) - { - lm::ngram::ModelType model_type; - lm::ngram::Config config; - if (lm::ngram::RecognizeBinary(file, model_type)) { - switch(model_type) { - case lm::ngram::PROBING: - return new KenDsg(file, config); - case lm::ngram::REST_PROBING: - return new KenDsg(file, config); - case lm::ngram::TRIE: - return new KenDsg(file, config); - case lm::ngram::QUANT_TRIE: - return new KenDsg(file, config); - case lm::ngram::ARRAY_TRIE: - return new KenDsg(file, config); - case lm::ngram::QUANT_ARRAY_TRIE: - return new KenDsg(file, config); - default: - UTIL_THROW2("Unrecognized kenlm model type " << model_type); - } - } else { +DsgLM* ConstructDsgLM(const char *file) +{ + lm::ngram::ModelType model_type; + lm::ngram::Config config; + if (lm::ngram::RecognizeBinary(file, model_type)) { + switch(model_type) { + case lm::ngram::PROBING: return new KenDsg(file, config); - } - } - -} // namespace - - + case lm::ngram::REST_PROBING: + return new KenDsg(file, config); + case lm::ngram::TRIE: + return new KenDsg(file, config); + case lm::ngram::QUANT_TRIE: + return new KenDsg(file, config); + case lm::ngram::ARRAY_TRIE: + return new KenDsg(file, config); + case lm::ngram::QUANT_ARRAY_TRIE: + return new KenDsg(file, config); + default: + UTIL_THROW2("Unrecognized kenlm model type " << model_type); + } + } else { + return new KenDsg(file, config); + } +} + +} // namespace + + diff --git a/moses/FF/Dsg-Feature/KenDsg.h b/moses/FF/Dsg-Feature/KenDsg.h index d32a2d98a..3fc07003f 100644 --- a/moses/FF/Dsg-Feature/KenDsg.h +++ b/moses/FF/Dsg-Feature/KenDsg.h @@ -8,7 +8,7 @@ namespace Moses class KenDsgBase { - public: +public: virtual ~KenDsgBase() {} virtual float Score(const lm::ngram::State&, StringPiece, @@ -22,17 +22,17 @@ class KenDsgBase }; template - class KenDsg : public KenDsgBase +class KenDsg : public KenDsgBase { - public: +public: KenDsg(const char *file, const lm::ngram::Config &config) : m_kenlm(file, config) {} float Score(const lm::ngram::State &in_state, - StringPiece word, - lm::ngram::State &out_state) const { + StringPiece word, + lm::ngram::State &out_state) const { return m_kenlm.Score(in_state, m_kenlm.GetVocabulary().Index(word), - out_state); + out_state); } const lm::ngram::State &BeginSentenceState() const { @@ -48,13 +48,13 @@ template } - private: +private: KenModel m_kenlm; }; - typedef KenDsgBase DsgLM; +typedef KenDsgBase DsgLM; - DsgLM* ConstructDsgLM(const char *file); +DsgLM* ConstructDsgLM(const char *file); } // namespace diff --git a/moses/FF/Dsg-Feature/dsgHyp.cpp b/moses/FF/Dsg-Feature/dsgHyp.cpp index 7daba14c0..d33262d11 100644 --- a/moses/FF/Dsg-Feature/dsgHyp.cpp +++ b/moses/FF/Dsg-Feature/dsgHyp.cpp @@ -2,9 +2,9 @@ #include #include #include -#include -#include -#include +#include +#include +#include using namespace std; @@ -12,356 +12,380 @@ using namespace lm::ngram; namespace Moses { - dsgState::dsgState(const State & val) - { - lmState = val; - } +dsgState::dsgState(const State & val) +{ + lmState = val; +} - void dsgState::saveState( std::vector danglingTok, std::vector srcSpans,float deltaValue) - { - buffer = danglingTok; - span=srcSpans; - delta=deltaValue; - } +void dsgState::saveState( std::vector danglingTok, std::vector srcSpans,float deltaValue) +{ + buffer = danglingTok; + span=srcSpans; + delta=deltaValue; +} - size_t dsgState::hash() const - { +size_t dsgState::hash() const +{ - size_t ret = 0; - boost::hash_combine(ret, lmState); + size_t ret = 0; + boost::hash_combine(ret, lmState); - /*size_t ret = delta; + /*size_t ret = delta; boost::hash_combine(ret, buffer); boost::hash_combine(ret, span); boost::hash_combine(ret, lmState.length); return ret;*/ +} + +bool dsgState::operator==(const FFState& otherBase) const //CHECK +{ + const dsgState &other = static_cast(otherBase); + + if (lmState < other.lmState) return false; + if (lmState == other.lmState) return true; + return false; +} + +// ---------------------------------------- + +std::string dsgState :: getName() const +{ + return "done"; +} + +dsgHypothesis :: dsgHypothesis() +{ + lmProb = 0; + discontig0 = 0; + discontig1 = 0; + discontig2 = 0; + UnsegWP = 0; + m_buffer.clear();//=""; +} + +void dsgHypothesis :: setState(const FFState* prev_state) +{ + if(prev_state != NULL) { + m_buffer = static_cast (prev_state)->getBuffer(); + m_span = static_cast (prev_state)->getSpan(); + lmState = static_cast (prev_state)->getLMState(); + delta = static_cast (prev_state)->getDelta(); //NEW } +} - bool dsgState::operator==(const FFState& otherBase) const //CHECK - { - const dsgState &other = static_cast(otherBase); +dsgState * dsgHypothesis :: saveState() +{ + dsgState * statePtr = new dsgState(lmState); + statePtr->saveState(m_buffer, m_span, delta); + return statePtr; +} - if (lmState < other.lmState) return false; - if (lmState == other.lmState) return true; +void dsgHypothesis :: populateScores(vector & scores , const int numFeatures) +{ + scores.clear(); + scores.push_back(lmProb); + + if (numFeatures == 1) + return; + scores.push_back(discontig0); + scores.push_back(discontig1); + scores.push_back(discontig2); + scores.push_back(UnsegWP); +} + + + +bool dsgHypothesis::isPrefix(const std::string &tok) +{ + if ((tok.at(tok.size() - 1) == '+' )&& (tok != "+")) { + return true; + } else { + return false; + }; +} + +bool dsgHypothesis::isSuffix(const std::string &tok) +{ + if ((tok.at(0) == '+' )&& (tok != "+")) { + return true; + } else { + return false; + }; +} + +bool dsgHypothesis::isStem(const std::string &tok) +{ + if ((tok.at(0) != '+') && (tok.at(tok.size() - 1) != '+')) { + return true; + } else { + return false; + }; +} + + + +/** + * chain stores segmented tokens that are in process of building a word + * The function checks if tok contributes to the word being formed in chain + * + */ +bool dsgHypothesis::isValidChain(const std::string &tok, std::vector &chain) +{ + std::string last_tok; + if (chain.size() >= 1) { + last_tok = chain[chain.size() - 1]; + } else { + last_tok = "NULL"; + } + if(tok=="+") { return false; } - - // ---------------------------------------- - - std::string dsgState :: getName() const - { - return "done"; + if (isPrefix(tok) && (chain.size() == 0 || isPrefix(last_tok))) { + return true; + } else if (isSuffix(tok) && (chain.size() != 0 && ( isStem(last_tok) || isPrefix(last_tok)))) { + return true; // allows one suffix ONLY } - - dsgHypothesis :: dsgHypothesis() - { - lmProb = 0; - discontig0 = 0; - discontig1 = 0; - discontig2 = 0; - UnsegWP = 0; - m_buffer.clear();//=""; + //else if (isSuffix(tok) && (chain.size() != 0 && ( isStem(last_tok) || isPrefix(last_tok) || isSuffix(last_tok) ))) { return true; } // allows multiple suffixes + else if (isStem(tok) && (chain.size() == 0 || isPrefix(last_tok))) { + return true; + } else { + return false; } +} - void dsgHypothesis :: setState(const FFState* prev_state) - { - if(prev_state != NULL) { - m_buffer = static_cast (prev_state)->getBuffer(); - m_span = static_cast (prev_state)->getSpan(); - lmState = static_cast (prev_state)->getLMState(); - delta = static_cast (prev_state)->getDelta(); //NEW +/** + * grouper function groups tokens that form a word together + */ +vector dsgHypothesis::grouper(std::vector &phr_vec,vector > &allchain_ids, int sourceOffset,const AlignmentInfo &align, bool isolation) +{ + + std::vector chain; + std::vector chain_ids; + std::vector allchains; + chain_ids=m_span; + + if (!m_buffer.empty() && !isolation) { // if evaluate in isolation is called, then do not add buffer content + for (int i = 0; i < m_buffer.size(); i++) { // initialize chain with the content of the buffer + chain.push_back(m_buffer[i]); } } - dsgState * dsgHypothesis :: saveState() - { - dsgState * statePtr = new dsgState(lmState); - statePtr->saveState(m_buffer, m_span, delta); - return statePtr; - } + for (int i = 0; i < phr_vec.size(); i++) { + std::set sourcePosSet = align.GetAlignmentsForTarget(i); - void dsgHypothesis :: populateScores(vector & scores , const int numFeatures) - { - scores.clear(); - scores.push_back(lmProb); - - if (numFeatures == 1) - return; - scores.push_back(discontig0); - scores.push_back(discontig1); - scores.push_back(discontig2); - scores.push_back(UnsegWP); - } - - - - bool dsgHypothesis::isPrefix(const std::string &tok){ - if ((tok.at(tok.size() - 1) == '+' )&& (tok != "+")) { return true; } - else { return false; }; - } - - bool dsgHypothesis::isSuffix(const std::string &tok){ - if ((tok.at(0) == '+' )&& (tok != "+")) { return true; } - else { return false; }; - } - - bool dsgHypothesis::isStem(const std::string &tok){ - if ((tok.at(0) != '+') && (tok.at(tok.size() - 1) != '+')){ return true; } - else { return false; }; - } - - - - /** - * chain stores segmented tokens that are in process of building a word - * The function checks if tok contributes to the word being formed in chain - * - */ - bool dsgHypothesis::isValidChain(const std::string &tok, std::vector &chain){ - std::string last_tok; - if (chain.size() >= 1){ - last_tok = chain[chain.size() - 1]; - } - else{ - last_tok = "NULL"; - } - if(tok=="+"){return false;} - if (isPrefix(tok) && (chain.size() == 0 || isPrefix(last_tok))) { return true; } - else if (isSuffix(tok) && (chain.size() != 0 && ( isStem(last_tok) || isPrefix(last_tok)))) { return true; } // allows one suffix ONLY - //else if (isSuffix(tok) && (chain.size() != 0 && ( isStem(last_tok) || isPrefix(last_tok) || isSuffix(last_tok) ))) { return true; } // allows multiple suffixes - else if (isStem(tok) && (chain.size() == 0 || isPrefix(last_tok))) { return true; } - else { return false; } - } - - /** - * grouper function groups tokens that form a word together - */ - vector dsgHypothesis::grouper(std::vector &phr_vec,vector > &allchain_ids, int sourceOffset,const AlignmentInfo &align, bool isolation){ - - std::vector chain; - std::vector chain_ids; - std::vector allchains; - chain_ids=m_span; - - if (!m_buffer.empty() && !isolation){// if evaluate in isolation is called, then do not add buffer content - for (int i = 0; i < m_buffer.size(); i++){ // initialize chain with the content of the buffer - chain.push_back(m_buffer[i]); + if (isValidChain(phr_vec[i], chain)) { + chain.push_back(phr_vec[i]); + if (sourcePosSet.empty()==false) { + for (std::set::iterator it(sourcePosSet.begin()); it != sourcePosSet.end(); it++) { + int cur=*it; + chain_ids.push_back(cur+sourceOffset); + } } } - for (int i = 0; i < phr_vec.size(); i++){ - std::set sourcePosSet = align.GetAlignmentsForTarget(i); - - if (isValidChain(phr_vec[i], chain)){ - chain.push_back(phr_vec[i]); - if (sourcePosSet.empty()==false){ - for (std::set::iterator it(sourcePosSet.begin());it != sourcePosSet.end(); it++) { - int cur=*it; - chain_ids.push_back(cur+sourceOffset); - } - } - } - - else if (chain.size() == 0) { // start of a suffix at hypothesis0 - allchains.push_back(phr_vec[i]); - allchain_ids.push_back(chain_ids); - chain_ids.clear();//={}; - } - - else { // tokens formed a complete word; add tokens segmented by space to allchains - std::string joined = boost::algorithm::join(chain, " "); - allchains.push_back(joined); - allchain_ids.push_back(chain_ids); - - chain.clear();// = {}; - chain_ids.clear();//={}; - - chain.push_back(phr_vec[i]); - if (sourcePosSet.empty()==false){ - for (std::set::iterator it(sourcePosSet.begin());it != sourcePosSet.end(); it++) { - int cur=*it; - chain_ids.push_back(cur+sourceOffset); - } - } - - } - + else if (chain.size() == 0) { // start of a suffix at hypothesis0 + allchains.push_back(phr_vec[i]); + allchain_ids.push_back(chain_ids); + chain_ids.clear();//={}; } - if (!chain.empty()){ + else { // tokens formed a complete word; add tokens segmented by space to allchains std::string joined = boost::algorithm::join(chain, " "); allchains.push_back(joined); allchain_ids.push_back(chain_ids); + + chain.clear();// = {}; + chain_ids.clear();//={}; + + chain.push_back(phr_vec[i]); + if (sourcePosSet.empty()==false) { + for (std::set::iterator it(sourcePosSet.begin()); it != sourcePosSet.end(); it++) { + int cur=*it; + chain_ids.push_back(cur+sourceOffset); + } + } + } - return allchains; + } + if (!chain.empty()) { + std::string joined = boost::algorithm::join(chain, " "); + allchains.push_back(joined); + allchain_ids.push_back(chain_ids); + } + return allchains; +} - void dsgHypothesis :: calculateDsgProbinIsol(DsgLM & ptrDsgLM, Desegmenter &desegT, const AlignmentInfo &align ){ - lmProb = 0; - State currState = lmState; - State temp; - string desegmented=""; - vector words; - vector currFVec; - discontig0=0; - discontig1=0; - discontig2=0; - UnsegWP=0; +void dsgHypothesis :: calculateDsgProbinIsol(DsgLM & ptrDsgLM, Desegmenter &desegT, const AlignmentInfo &align ) +{ + lmProb = 0; + State currState = lmState; + State temp; + string desegmented=""; + vector words; + vector currFVec; - currFVec = m_buffer; - currFVec.insert( currFVec.end(), m_curr_phr.begin(), m_curr_phr.end() ); + discontig0=0; + discontig1=0; + discontig2=0; + UnsegWP=0; - int vecSize=currFVec.size(); + currFVec = m_buffer; + currFVec.insert( currFVec.end(), m_curr_phr.begin(), m_curr_phr.end() ); - // phrases with suffix-starts and prefix-end - if (currFVec.size()>0 && isPrefix (currFVec.back())) { - UnsegWP-=0.5;} - if (currFVec.size()>0 && isSuffix (currFVec.front())) { - UnsegWP-=0.5;} + int vecSize=currFVec.size(); - /* //Dropping prefix-end and suffix-start - while (currFVec.size()>0 && isPrefix (currFVec.back())){ - currFVec.pop_back(); //drop prefix appearing at end of phrase - } + // phrases with suffix-starts and prefix-end + if (currFVec.size()>0 && isPrefix (currFVec.back())) { + UnsegWP-=0.5; + } + if (currFVec.size()>0 && isSuffix (currFVec.front())) { + UnsegWP-=0.5; + } - while (currFVec.size()>0 && isSuffix (currFVec.front())){ - currFVec.erase (currFVec.begin()); //drop suffix appearning at start of a phrase - } */ + /* //Dropping prefix-end and suffix-start + while (currFVec.size()>0 && isPrefix (currFVec.back())){ + currFVec.pop_back(); //drop prefix appearing at end of phrase + } - vector > chain_ids; - words = grouper(currFVec,chain_ids,0,align,1); + while (currFVec.size()>0 && isSuffix (currFVec.front())){ + currFVec.erase (currFVec.begin()); //drop suffix appearning at start of a phrase + } */ - for (int i = 0; i > chain_ids; + words = grouper(currFVec,chain_ids,0,align,1); + + for (int i = 0; i words; + vector currFVec; + bool completePhraseSuffixEnd = false; + vector > all_chain_ids; + double pscore; + currFVec=m_curr_phr; + + // Check if the the phrase ends in a suffix, which means that it completes a full word;Make sure to change the isValidChain + if (isSuffix (currFVec.back()) && (currFVec.back()!="+")) { + completePhraseSuffixEnd=true; + } + + words = grouper(currFVec,all_chain_ids,sourceOffset,align,0); + + for (int i = 0; i < words.size(); i++) { + temp = currState; + + if (i==words.size()-1) { + if (completePhraseSuffixEnd) { //i.e if phrase ends with suffix, which marks an end of a word + m_buffer.clear();// =""; + m_span.clear();// ={}; + } else if (!isCompleted) { // not end of sentence( or final hypothesis), and probably the last token is not a complete word + m_buffer.clear(); + if (optimistic == 1) { + if ( isPrefix (currFVec.back())) { // this will delay scoring of prefix in prefix-ending phrases until the next hypothesis arrives + //pscore = ptrDsgLM.Score(temp,desegmented,currState); + lmProb -= delta; + delta = 0.0; + } + + else if (words[i].find(" ")!=std::string::npos) { + desegmented=desegT.Search(words[i])[0]; + pscore=ptrDsgLM.Score(temp,desegmented,currState); + lmProb = lmProb + pscore - delta; + delta=pscore; + currState=temp; + } else { + boost::replace_all(words[i], "-LRB-", "("); + boost::replace_all(words[i], "-RRB-", ")"); + pscore=ptrDsgLM.Score(temp,words[i],currState); + lmProb = lmProb + pscore - delta; + delta=pscore; + currState=temp; + } + } + + m_buffer.push_back(words.back()); + m_span=all_chain_ids.back(); + break; + } + } + + //temp = currState; + if (words[i].find(" ")!=std::string::npos) { UnsegWP+=1; - temp = currState; - if (words[i].find(" ")!=std::string::npos){ - desegmented=desegT.Search(words[i])[0]; - lmProb += ptrDsgLM.Score(temp,desegmented,currState); - } - else{ - boost::replace_all(words[i], "-LRB-", "("); - boost::replace_all(words[i], "-RRB-", ")"); - lmProb += ptrDsgLM.Score(temp,words[i],currState); + desegmented=desegT.Search(words[i])[0]; + std::set cur_chain_ids(all_chain_ids[i].begin(),all_chain_ids[i].end()); + if (cur_chain_ids.size()>1) { + vector dsc; + for (std::set::iterator it(cur_chain_ids.begin()), next(it); it != cur_chain_ids.end() && ++next != cur_chain_ids.end(); it = next) { + int cur=*it; + int mynext=*next; + if (std::abs(cur - mynext)>= 3) { + dsc.push_back(3); + } else if (std::abs(cur - mynext)== 2) { + dsc.push_back(2); + } else if (std::abs(cur - mynext)<= 1) { + dsc.push_back(1); + } + } + int mymax=*std::max_element(dsc.begin(),dsc.end()); + if (mymax==3) { + discontig2+=1; + } else if (mymax==2) { + discontig1+=1; + } else { + discontig0+=1; + } + } else { + discontig0 += 1; } + + lmProb += ptrDsgLM.Score(temp,desegmented,currState); + } else { + UnsegWP+=1; + boost::replace_all(words[i], "-LRB-", "("); + boost::replace_all(words[i], "-RRB-", ")"); + lmProb += ptrDsgLM.Score(temp,words[i],currState); } - lmState = currState; } - void dsgHypothesis :: calculateDsgProb(DsgLM& ptrDsgLM, Desegmenter &desegT, bool isCompleted , const AlignmentInfo &align, int sourceOffset, bool optimistic) - { - lmProb = 0; - discontig0=0; - discontig1=0; - discontig2=0; - UnsegWP=0; - - State currState = lmState; - State temp; - string desegmented=""; - vector words; - vector currFVec; - bool completePhraseSuffixEnd = false; - vector > all_chain_ids; - double pscore; - currFVec=m_curr_phr; - - // Check if the the phrase ends in a suffix, which means that it completes a full word;Make sure to change the isValidChain - if (isSuffix (currFVec.back()) && (currFVec.back()!="+")){completePhraseSuffixEnd=true;} - - words = grouper(currFVec,all_chain_ids,sourceOffset,align,0); - - for (int i = 0; i < words.size(); i++) { - temp = currState; - - if (i==words.size()-1){ - if (completePhraseSuffixEnd){ //i.e if phrase ends with suffix, which marks an end of a word - m_buffer.clear();// =""; - m_span.clear();// ={}; - } - else if (!isCompleted) { // not end of sentence( or final hypothesis), and probably the last token is not a complete word - m_buffer.clear(); - if (optimistic == 1){ - if ( isPrefix (currFVec.back())){ // this will delay scoring of prefix in prefix-ending phrases until the next hypothesis arrives - //pscore = ptrDsgLM.Score(temp,desegmented,currState); - lmProb -= delta; - delta = 0.0; - } - - else if (words[i].find(" ")!=std::string::npos){ - desegmented=desegT.Search(words[i])[0]; - pscore=ptrDsgLM.Score(temp,desegmented,currState); - lmProb = lmProb + pscore - delta; - delta=pscore; - currState=temp; - } - else{ - boost::replace_all(words[i], "-LRB-", "("); - boost::replace_all(words[i], "-RRB-", ")"); - pscore=ptrDsgLM.Score(temp,words[i],currState); - lmProb = lmProb + pscore - delta; - delta=pscore; - currState=temp; - } } - - m_buffer.push_back(words.back()); - m_span=all_chain_ids.back(); - break; - } - } - - //temp = currState; - if (words[i].find(" ")!=std::string::npos){ - UnsegWP+=1; - desegmented=desegT.Search(words[i])[0]; - std::set cur_chain_ids(all_chain_ids[i].begin(),all_chain_ids[i].end()); - if (cur_chain_ids.size()>1){ - vector dsc; - for (std::set::iterator it(cur_chain_ids.begin()), next(it);it != cur_chain_ids.end() && ++next != cur_chain_ids.end(); it = next) { - int cur=*it; - int mynext=*next; - if (std::abs(cur - mynext)>= 3) { - dsc.push_back(3); - } - else if (std::abs(cur - mynext)== 2){ - dsc.push_back(2); - } - else if (std::abs(cur - mynext)<= 1){ - dsc.push_back(1); - } - } - int mymax=*std::max_element(dsc.begin(),dsc.end()); - if (mymax==3){discontig2+=1;} - else if (mymax==2){discontig1+=1;} - else{discontig0+=1;} - } - else{ - discontig0 += 1; - } - - lmProb += ptrDsgLM.Score(temp,desegmented,currState); - } - else{ - UnsegWP+=1; - boost::replace_all(words[i], "-LRB-", "("); - boost::replace_all(words[i], "-RRB-", ")"); - lmProb += ptrDsgLM.Score(temp,words[i],currState); - } - } - - if (isCompleted){ - temp = currState; - lmProb = lmProb + ptrDsgLM.ScoreEndSentence(temp,currState) - delta; - } - lmState = currState; + if (isCompleted) { + temp = currState; + lmProb = lmProb + ptrDsgLM.ScoreEndSentence(temp,currState) - delta; } + lmState = currState; +} - void dsgHypothesis :: print() - {} +void dsgHypothesis :: print() +{} } // namespace diff --git a/moses/FF/Dsg-Feature/dsgHyp.h b/moses/FF/Dsg-Feature/dsgHyp.h index 0df4af11a..d36ad0530 100644 --- a/moses/FF/Dsg-Feature/dsgHyp.h +++ b/moses/FF/Dsg-Feature/dsgHyp.h @@ -14,53 +14,53 @@ namespace Moses { - class dsgState : public FFState - { - public: +class dsgState : public FFState +{ +public: - dsgState(const lm::ngram::State & val); - virtual bool operator==(const FFState& other) const; - void saveState( std::vector bufferVal,std::vector spanVal, float deltaValue); + dsgState(const lm::ngram::State & val); + virtual bool operator==(const FFState& other) const; + void saveState( std::vector bufferVal,std::vector spanVal, float deltaValue); - std::vector getBuffer() const { - return buffer; - } + std::vector getBuffer() const { + return buffer; + } - std::vector getSpan() const { - return span; - } + std::vector getSpan() const { + return span; + } - lm::ngram::State getLMState() const { - return lmState; - } + lm::ngram::State getLMState() const { + return lmState; + } - float getDelta() const { - return delta; - } + float getDelta() const { + return delta; + } - void setDelta(double val1 ) { - delta = val1; - } + void setDelta(double val1 ) { + delta = val1; + } - void print() const; - std::string getName() const; + void print() const; + std::string getName() const; - virtual size_t hash() const; + virtual size_t hash() const; - protected: - std::vector buffer; - std::vector span; - lm::ngram::State lmState; - double delta; //NEW - }; +protected: + std::vector buffer; + std::vector span; + lm::ngram::State lmState; + double delta; //NEW +}; class dsgHypothesis { - private: +private: std::vector m_buffer;// maintains dangling affix from previous hypothesis std::vector m_span;// maintains source alignment for dangling affix from previous hypothesis lm::ngram::State lmState; // KenLM's Model State ... @@ -73,7 +73,7 @@ class dsgHypothesis int discontig2; double UnsegWP; //Word Penalty score based on count of words - public: +public: dsgHypothesis(); ~dsgHypothesis() {}; @@ -84,7 +84,7 @@ class dsgHypothesis m_curr_phr = val1; } - void setDelta(double val1 ) { + void setDelta(double val1 ) { delta = val1; }