Fix a few bugs in BilingualLM for phrase-based decoding.

2024-12-27 14:05:29 +03:00 · 2014-09-26 12:02:12 +01:00 · 2014-09-26 12:02:12 +01:00 · 3624bd776c
commit 3624bd776c
parent 5f87cf94d8
2 changed files with 86 additions and 109 deletions
--- a/moses/LM/BilingualLM.cpp
+++ b/moses/LM/BilingualLM.cpp
@ -106,12 +106,12 @@ int BilingualLM::getNeuralLMId(const Word& word) const{

 //Populates words with amount words from the targetPhrase from the previous hypothesis where
 //words[0] is the last word of the previous hypothesis, words[1] is the second last etc...
-void BilingualLM::requestPrevTargetNgrams(const Hypothesis &cur_hypo, int amount, std::vector<int> &words) const {
+void BilingualLM::requestPrevTargetNgrams(
+    const Hypothesis &cur_hypo, int amount, std::vector<int> &words) const {
  const Hypothesis * prev_hyp = cur_hypo.GetPrevHypo();
  int found = 0;

-  while (found != amount){
-    if (prev_hyp){
+  while (prev_hyp && found != amount) {
    const TargetPhrase& currTargetPhrase = prev_hyp->GetCurrTargetPhrase();
    for (int i = currTargetPhrase.GetSize() - 1; i> -1; i--){
      if (found != amount){
@ -122,9 +122,7 @@ void BilingualLM::requestPrevTargetNgrams(const Hypothesis &cur_hypo, int amount
        return; //We have gotten everything needed
      }
    }
-    } else {
-      break; //We have reached the beginning of the hypothesis
-    }
+
    prev_hyp = prev_hyp->GetPrevHypo();
  }

@ -132,43 +130,40 @@ void BilingualLM::requestPrevTargetNgrams(const Hypothesis &cur_hypo, int amount
  for (int i = found; i < amount; i++){
    words[i] = neuralLM_wordID;
  }
-
 }

 //Populates the words vector with target_ngrams sized that also contains the current word we are looking at. 
 //(in effect target_ngrams + 1)
-void BilingualLM::getTargetWords(const Hypothesis &cur_hypo
-                , const TargetPhrase &targetPhrase
-                , int current_word_index
-                , std::vector<int> &words) const {
-
+void BilingualLM::getTargetWords(
+    const Hypothesis &cur_hypo,
+    const TargetPhrase &targetPhrase,
+    int current_word_index,
+    std::vector<int> &words) const {
  //Check if we need to look at previous target phrases
  int additional_needed = current_word_index - target_ngrams;
  if (additional_needed < 0) {
    additional_needed = -additional_needed;
    std::vector<int> prev_words(additional_needed);
    requestPrevTargetNgrams(cur_hypo, additional_needed, prev_words);
-    for (int i=additional_needed -1 ; i>-1; i--){
+    for (int i = additional_needed - 1; i >= 0; i--) {
      words.push_back(prev_words[i]);
    }
  }

-  if (words.size()!=source_ngrams){
+  if (words.size() > 0) {
    //We have added some words from previous phrases
    //Just add until we reach current_word_index
-    for (int i = 0; i<current_word_index + 1; i++){
+    for (int i = 0; i <= current_word_index; i++) {
      const Word& word = targetPhrase.GetWord(i);
      words.push_back(getNeuralLMId(word));
    }
-
  } else {
    //We haven't added any words, proceed as before
-    for (int i = current_word_index - target_ngrams; i < current_word_index + 1; i++){
+    for (int i = current_word_index - target_ngrams; i <= current_word_index; i++){
      const Word& word = targetPhrase.GetWord(i);
      words.push_back(getNeuralLMId(word));
    }
  }
-
 }

 //Returns target_ngrams sized word vector that contains the current word we are looking at. (in effect target_ngrams + 1)
@ -202,95 +197,88 @@ void BilingualLM::getTargetWords(Phrase &whole_phrase
 */
 //Returns source words in the way NeuralLM expects them.

-void BilingualLM::getSourceWords(const TargetPhrase &targetPhrase
-                , int targetWordIdx
-                , const Sentence &source_sent
-                , const WordsRange &sourceWordRange
-                , std::vector<int> &words) const {
+size_t BilingualLM::selectMiddleAlignment(
+    const set<size_t>& alignment_links) const {
+  assert(alignment_links.size() > 0);
+
+  set<size_t>::iterator it = alignment_links.begin();
+  for (int i = 0; i < (alignment_links.size() - 1) / 2; ++i) {
+    ++it;
+  }
+
+  return *it;
+}
+
+void BilingualLM::getSourceWords(
+    const TargetPhrase &targetPhrase,
+    int targetWordIdx,
+    const Sentence &source_sent,
+    const WordsRange &sourceWordRange,
+    std::vector<int> &words) const {
  //Get source context

  //Get alignment for the word we require
  const AlignmentInfo& alignments = targetPhrase.GetAlignTerm();

-  //We are getting word alignment for targetPhrase.GetWord(i + target_ngrams -1) according to the paper.
-  //Try to get some alignment, because the word we desire might be unaligned.
+  // We are getting word alignment for targetPhrase.GetWord(i + target_ngrams -1) according to the paper.
+  // Find the closest target word with alignment links.
  std::set<size_t> last_word_al;
-  for (int j = 0; j < targetPhrase.GetSize(); j++){
-    //Sometimes our word will not be aligned, so find the nearest aligned word right
+  for (int j = 0; j < targetPhrase.GetSize(); j++) {
+    // Find the nearest aligned word; at equal distance, prefer the one to the right.
    if ((targetWordIdx + j) < targetPhrase.GetSize()){
      last_word_al = alignments.GetAlignmentsForTarget(targetWordIdx + j);
-      if (!last_word_al.empty()){
+      if (!last_word_al.empty()) {
        break;
      }
-    } else if ((targetWordIdx - j) > 0) {
-      //We couldn't find word on the right, try the left.
+    }
+
+    // No aligned word at this distance on the right; try the left.
+    if ((targetWordIdx - j) >= 0) {
      last_word_al = alignments.GetAlignmentsForTarget(targetWordIdx - j);
-      if (!last_word_al.empty()){
+      if (!last_word_al.empty()) {
        break;
      }
-
    }
-    
  }

  //Assume we have gotten some alignment here. If we couldn't get an alignment from the above routine it means
  //that none of the words in the target phrase aligned to any word in the source phrase

-  //Now we get the source words.
-  size_t source_center_index;
-  if (last_word_al.size() == 1) {
-    //We have only one word aligned
-    source_center_index = *last_word_al.begin();
-  } else { //We have more than one alignments, take the middle one
-    int tempidx = 0; //Temporary index to track where the iterator is.
-    for (std::set<size_t>::iterator it = last_word_al.begin(); it != last_word_al.end(); it++){
-      if (tempidx == last_word_al.size()/2){
-        source_center_index = *(it);
-        break;
-      }
-    }
-  }
-
-  //We have found the alignment. Now determine how much to shift by to get the actual source word index.
+  // Now we get the source words. First select middle alignment.
+  size_t source_center_index = selectMiddleAlignment(last_word_al);
+  // We have found the alignment. Now determine how much to shift by to get the actual source word index.
  size_t phrase_start_pos = sourceWordRange.GetStartPos();
-  size_t source_word_mid_idx = phrase_start_pos + targetWordIdx; //Account for how far the current word is from the start of the phrase.
-
+  // Offset the selected alignment index by the start position of the source phrase.
+  size_t source_word_mid_idx = phrase_start_pos + source_center_index;

  appendSourceWordsToVector(source_sent, words, source_word_mid_idx);
-
 }

 size_t BilingualLM::getState(const Hypothesis& cur_hypo) const {
-
  const TargetPhrase &targetPhrase = cur_hypo.GetCurrTargetPhrase();
-
  size_t hashCode = 0;

-  //Check if we need to look at previous target phrases
+  // Check if we need to look at previous target phrases
  int additional_needed = targetPhrase.GetSize() - target_ngrams;
  if (additional_needed < 0) {
    additional_needed = -additional_needed;
    std::vector<int> prev_words(additional_needed);
    requestPrevTargetNgrams(cur_hypo, additional_needed, prev_words);
-    for (int i=additional_needed - 1; i>-1; i--) {
+    for (int i = additional_needed - 1; i >= 0; i--) {
      boost::hash_combine(hashCode, prev_words[i]);
    }
-    //Get the rest of the phrases needed
+
+    // Get the rest of the phrases needed
    for (int i = 0; i < targetPhrase.GetSize(); i++) {
-      int neuralLM_wordID;
-
      const Word& word = targetPhrase.GetWord(i);
-      neuralLM_wordID = getNeuralLMId(word);
-
+      int neuralLM_wordID = getNeuralLMId(word);
      boost::hash_combine(hashCode, neuralLM_wordID);
    }
-
  } else {
+    // We just need the last target_ngrams words from the current target phrase.
    for (int i = targetPhrase.GetSize() - target_ngrams; i < targetPhrase.GetSize(); i++) {
-      int neuralLM_wordID;
-
      const Word& word = targetPhrase.GetWord(i);
-      neuralLM_wordID = getNeuralLMId(word);
+      int neuralLM_wordID = getNeuralLMId(word);

      boost::hash_combine(hashCode, neuralLM_wordID);
    }
@ -439,45 +427,30 @@ void BilingualLM::EvaluateWithSourceContext(const InputType &input
 FFState* BilingualLM::EvaluateWhenApplied(
    const Hypothesis& cur_hypo,
    const FFState* prev_state,
-  ScoreComponentCollection* accumulator) const
-{
-
+    ScoreComponentCollection* accumulator) const {
  Manager& manager = cur_hypo.GetManager();
  const Sentence& source_sent = static_cast<const Sentence&>(manager.GetSource());

-
-
-  //Init vectors
+  // Init vectors.
  std::vector<int> source_words;
  source_words.reserve(source_ngrams);
  std::vector<int> target_words;
  target_words.reserve(target_ngrams);

  float value = 0;
-
  const TargetPhrase& currTargetPhrase = cur_hypo.GetCurrTargetPhrase();
  const WordsRange& sourceWordRange = cur_hypo.GetCurrSourceWordsRange(); //Source words range to calculate offsets

-  //For each word in the current target phrase get its LM score
+  // For each word in the current target phrase get its LM score.
  for (int i = 0; i < currTargetPhrase.GetSize(); i++){
-    //std::cout << "Size of Before Words " << all_words.size() << std::endl;
-    getSourceWords(currTargetPhrase
-              , i //The current target phrase
-              , source_sent
-              , sourceWordRange
-              , source_words);
-
-    getTargetWords(cur_hypo
-              , currTargetPhrase
-              , i
-              , target_words);
-
+    getSourceWords(
+        currTargetPhrase, i, source_sent, sourceWordRange, source_words);
+    getTargetWords(cur_hypo, currTargetPhrase, i, target_words);
    value += Score(source_words, target_words);

-    //Clear the vector
+    // Clear the vectors.
    source_words.clear();
    target_words.clear();
-
  }

  size_t new_state = getState(cur_hypo); 
--- a/moses/LM/BilingualLM.h
+++ b/moses/LM/BilingualLM.h
@ -47,18 +47,22 @@ private:
  virtual void loadModel() const = 0;
  virtual bool parseAdditionalSettings(const std::string& key, const std::string& value) = 0;

-  void getSourceWords(const TargetPhrase &targetPhrase
-                , int targetWordIdx
-                , const Sentence &source_sent
-                , const WordsRange &sourceWordRange
-                , std::vector<int> &words) const;
+  size_t selectMiddleAlignment(const std::set<size_t>& alignment_links) const;
+
+  void getSourceWords(
+      const TargetPhrase &targetPhrase,
+      int targetWordIdx,
+      const Sentence &source_sent,
+      const WordsRange &sourceWordRange,
+      std::vector<int> &words) const;

  void appendSourceWordsToVector(const Sentence &source_sent, std::vector<int> &words, int source_word_mid_idx) const;

-  void getTargetWords(const Hypothesis &cur_hypo
-                , const TargetPhrase &targetPhrase
-                , int current_word_index
-                , std::vector<int> &words) const;
+  void getTargetWords(
+      const Hypothesis &cur_hypo,
+      const TargetPhrase &targetPhrase,
+      int current_word_index,
+      std::vector<int> &words) const;

  //size_t getState(const TargetPhrase &targetPhrase, std::vector<int> &prev_words) const;