mosesdecoder/moses/LM/BilingualLM.cpp

#include <vector>
#include "BilingualLM.h"
#include "moses/ScoreComponentCollection.h"

using namespace std;

namespace Moses
{

////////////////////////////////////////////////////////////////
// Constructs the feature function from its configuration line.
// Forwards 1 and `line` to the StatefulFeatureFunction base, selects factor
// type 0 for words, and prepares the BOS_/EOS_ boundary words that the
// context-building routines use as padding.
BilingualLM::BilingualLM(const std::string &line)
  : StatefulFeatureFunction(1, line),
    word_factortype(0)
{
  // Factor collection used to register the BOS_ and EOS_ factors.
  FactorCollection& factors = FactorCollection::Instance();

  // Register the boundary factors (BOS_ first, then EOS_).
  BOS_factor = factors.AddFactor(BOS_);
  EOS_factor = factors.AddFactor(EOS_);

  // Store each boundary factor as factor 0 of its boundary word.
  BOS_word.SetFactor(0, BOS_factor);
  EOS_word.SetFactor(0, EOS_factor);
}

// Stores the decoder options on the feature, then calls ReadParameters()
// followed by loadModel(). The order is kept: options first, parameters
// second, model last.
void BilingualLM::Load(AllOptions::ptr const& opts)
{
  this->m_options = opts;

  this->ReadParameters();
  this->loadModel();
}

//Populates words with amount words from the targetPhrase from the previous hypothesis where
//words[0] is the last word of the previous hypothesis, words[1] is the second last etc...
void BilingualLM::requestPrevTargetNgrams(
  const Hypothesis &cur_hypo, int amount, std::vector<int> &words) const
{
  const Hypothesis * prev_hyp = cur_hypo.GetPrevHypo();
  int found = 0;

  while (prev_hyp && found != amount) {
    const TargetPhrase& currTargetPhrase = prev_hyp->GetCurrTargetPhrase();
    for (int i = currTargetPhrase.GetSize() - 1; i> -1; i--) {
      if (found != amount) {
        const Word& word = currTargetPhrase.GetWord(i);
        words[found] = getNeuralLMId(word, false);
        found++;
      } else {
        return; //We have gotten everything needed
      }
    }

    prev_hyp = prev_hyp->GetPrevHypo();
  }

  int neuralLM_wordID = getNeuralLMId(BOS_word, false);
  for (int i = found; i < amount; i++) {
    words[i] = neuralLM_wordID;
  }
}

//Populates the words vector with target_ngrams sized that also contains the current word we are looking at.
//(in effect target_ngrams + 1)
void BilingualLM::getTargetWords(
  const Hypothesis &cur_hypo,
  const TargetPhrase &targetPhrase,
  int current_word_index,
  std::vector<int> &words) const
{
  //Check if we need to look at previous target phrases
  int additional_needed = current_word_index - target_ngrams;
  if (additional_needed < 0) {
    additional_needed = -additional_needed;
    std::vector<int> prev_words(additional_needed);
    requestPrevTargetNgrams(cur_hypo, additional_needed, prev_words);
    for (int i = additional_needed - 1; i >= 0; i--) {
      words.push_back(prev_words[i]);
    }
  }

  if (words.size() > 0) {
    //We have added some words from previous phrases
    //Just add until we reach current_word_index
    for (int i = 0; i <= current_word_index; i++) {
      const Word& word = targetPhrase.GetWord(i);
      words.push_back(getNeuralLMId(word, false));
    }
  } else {
    //We haven't added any words, proceed as before
    for (int i = current_word_index - target_ngrams; i <= current_word_index; i++) {
      const Word& word = targetPhrase.GetWord(i);
      words.push_back(getNeuralLMId(word, false));
    }
  }
}

//Source-side helpers: choose an alignment link and return the source words in the way NeuralLM expects them.

size_t BilingualLM::selectMiddleAlignment(
  const set<size_t>& alignment_links) const
{

  set<size_t>::iterator it = alignment_links.begin();
  for (size_t i = 0; i < (alignment_links.size() - 1) / 2; ++i) {
    ++it;
  }

  return *it;
}

// Appends to `words` the source-side context for the target word at
// targetWordIdx of targetPhrase.
// The word's terminal alignment links are looked up; if it has none, the
// nearest target word that does have links is used, trying the word `dist`
// positions to the right before the word `dist` positions to the left for
// increasing dist. The middle link is then shifted by the start of
// sourceWordRange to obtain a position in source_sent, and the window around
// that position is appended through appendSourceWordsToVector.
// Throws if no word of the target phrase has any alignment link.
void BilingualLM::getSourceWords(
  const TargetPhrase &targetPhrase,
  int targetWordIdx,
  const Sentence &source_sent,
  const Range &sourceWordRange,
  std::vector<int> &words) const
{
  const AlignmentInfo& alignments = targetPhrase.GetAlignTerm();
  const size_t phrase_len = targetPhrase.GetSize();

  // Search outwards from targetWordIdx for the closest aligned target word.
  std::set<size_t> last_word_al;
  for (int dist = 0; static_cast<size_t>(dist) < phrase_len; ++dist) {
    // Prefer the candidate on the right.
    const int right = targetWordIdx + dist;
    if (static_cast<size_t>(right) < phrase_len) {
      last_word_al = alignments.GetAlignmentsForTarget(right);
      if (!last_word_al.empty()) {
        break;
      }
    }

    // Nothing on the right at this distance, so try the left.
    const int left = targetWordIdx - dist;
    if (left >= 0) {
      last_word_al = alignments.GetAlignmentsForTarget(left);
      if (!last_word_al.empty()) {
        break;
      }
    }
  }

  // An empty set here means that no word of the target phrase is aligned to
  // any source word. That should never happen, but it has been seen with
  // corrupt phrase tables, and failing here makes the problem far easier to
  // debug.
  UTIL_THROW_IF2(last_word_al.size() == 0,
                 "A target phrase with no alignments detected! " << targetPhrase << "Check if there is something wrong with your phrase table.");

  // Pick the middle link, then shift it by the phrase's start position to get
  // the index of the source word within the sentence.
  const size_t center_in_phrase = selectMiddleAlignment(last_word_al);
  const size_t range_start = sourceWordRange.GetStartPos();
  const size_t source_word_mid_idx = range_start + center_in_phrase;

  appendSourceWordsToVector(source_sent, words, source_word_mid_idx);
}

// Computes the hash that identifies the feature state after cur_hypo: the
// neural LM ids of the last target_ngrams target words, combined oldest to
// newest with boost::hash_combine. When the current target phrase is shorter
// than target_ngrams, the missing ids are taken from the previous hypotheses
// via requestPrevTargetNgrams.
size_t BilingualLM::getState(const Hypothesis& cur_hypo) const
{
  const TargetPhrase &targetPhrase = cur_hypo.GetCurrTargetPhrase();
  size_t hashCode = 0;

  // Index of the first word of the current phrase that belongs to the state;
  // negative when the phrase alone does not supply target_ngrams words.
  const int first_needed = targetPhrase.GetSize() - target_ngrams;
  int start = first_needed;

  if (first_needed < 0) {
    const int missing = -first_needed;
    std::vector<int> history(missing);
    requestPrevTargetNgrams(cur_hypo, missing, history);

    // history is ordered newest-first; hash it oldest-first.
    for (std::vector<int>::const_reverse_iterator rit = history.rbegin();
         rit != history.rend(); ++rit) {
      boost::hash_combine(hashCode, *rit);
    }

    // The whole current phrase is part of the state.
    start = 0;
  }

  // Hash the relevant tail of the current target phrase.
  for (int pos = start; pos < targetPhrase.GetSize(); ++pos) {
    const int word_id = getNeuralLMId(targetPhrase.GetWord(pos), false);
    boost::hash_combine(hashCode, word_id);
  }

  return hashCode;
}

// Phrase-based scoring of a hypothesis.
// For every word of the current target phrase, gathers its source context
// (getSourceWords) and target context (getTargetWords), sums the values
// returned by Score, and adds the total to `accumulator` for this feature.
// Returns a newly allocated BilingualLMState built from getState(cur_hypo).
// prev_state is not read by this implementation.
FFState* BilingualLM::EvaluateWhenApplied(
  const Hypothesis& cur_hypo,
  const FFState* prev_state,
  ScoreComponentCollection* accumulator) const
{
  Manager& manager = cur_hypo.GetManager();
  const Sentence& source_sent = static_cast<const Sentence&>(manager.GetSource());

  // Init vectors. They are reused for every word; clear() keeps capacity.
  std::vector<int> source_words;
  source_words.reserve(source_ngrams);
  std::vector<int> target_words;
  // getTargetWords produces the target_ngrams context words plus the current
  // word, so reserve one extra slot (matches the chart-based overload).
  target_words.reserve(target_ngrams + 1);

  float value = 0;
  const TargetPhrase& currTargetPhrase = cur_hypo.GetCurrTargetPhrase();
  const Range& sourceWordRange = cur_hypo.GetCurrSourceWordsRange(); //Source words range to calculate offsets

  // For each word in the current target phrase get its LM score.
  const int phrase_size = static_cast<int>(currTargetPhrase.GetSize());
  for (int i = 0; i < phrase_size; i++) {
    getSourceWords(
      currTargetPhrase, i, source_sent, sourceWordRange, source_words);
    getTargetWords(cur_hypo, currTargetPhrase, i, target_words);
    value += Score(source_words, target_words);

    // Clear the vectors before the next word.
    source_words.clear();
    target_words.clear();
  }

  size_t new_state = getState(cur_hypo);
  accumulator->PlusEquals(this, value);

  return new BilingualLMState(new_state);
}

// Appends to wordIds the neural LM ids of the full target yield of cur_hypo.
// Terminals of the current target phrase are converted with getNeuralLMId;
// each non-terminal is replaced by the word ids stored in the
// BilingualLMState (looked up with featureID) of the previous hypothesis it
// maps to.
void BilingualLM::getAllTargetIdsChart(const ChartHypothesis& cur_hypo, size_t featureID, std::vector<int>& wordIds) const
{
  // Bind by reference: copying the phrase on every call is unnecessary.
  const TargetPhrase& targetPhrase = cur_hypo.GetCurrTargetPhrase();

  const size_t phrase_size = targetPhrase.GetSize();
  for (size_t i = 0; i < phrase_size; i++) {
    if (targetPhrase.GetWord(i).IsNonTerminal()) { //Nonterminal get from prev state
      const ChartHypothesis * prev_hypo = cur_hypo.GetPrevHypo(targetPhrase.GetAlignNonTerm().GetNonTermIndexMap()[i]);
      const BilingualLMState * prev_state = static_cast<const BilingualLMState *>(prev_hypo->GetFFState(featureID));
      // A const reference avoids copying the id vector before appending it.
      const std::vector<int>& prevWordIDs = prev_state->GetWordIdsVector();
      wordIds.insert(wordIds.end(), prevWordIDs.begin(), prevWordIDs.end());
    } else {
      wordIds.push_back(getNeuralLMId(targetPhrase.GetWord(i), false));
    }
  }
}

// Appends to word_alignments one source-sentence position per target word of
// the full target yield of cur_hypo.
// - Non-terminals contribute the alignment vector stored in the
//   BilingualLMState (looked up with featureID) of their previous hypothesis.
// - Aligned terminals contribute the absolute position of their middle
//   alignment link.
// - Unaligned terminals borrow from the nearest neighbour, checking distance
//   j to the right before distance j to the left: a neighbouring non-terminal
//   supplies the first (right) or last (left) entry of its stored alignment
//   vector, a neighbouring aligned terminal supplies its own links.
// Throws if a terminal cannot be resolved to any alignment.
void BilingualLM::getAllAlignments(const ChartHypothesis& cur_hypo, size_t featureID, std::vector<int>& word_alignments) const
{
  // Bind by reference: copying the phrase on every call is unnecessary.
  const TargetPhrase& targetPhrase = cur_hypo.GetCurrTargetPhrase();
  const int target_size = static_cast<int>(targetPhrase.GetSize());

  //Get the terminal alignments of the rule
  const AlignmentInfo& alignments = targetPhrase.GetAlignTerm();

  // get absolute position in source sentence for each source word in rule
  std::vector<int> absolute_source_position (cur_hypo.GetCurrSourceRange().GetNumWordsCovered(), 0); //we actually only need number of source symbols in rule; can we get this number cheaply?

  if (!absolute_source_position.empty()) {
    absolute_source_position[0] = cur_hypo.GetCurrSourceRange().GetStartPos();
  }
  // get last absolute position of each source nonterminal symbol
  for (int i = 0; i < target_size; i++) {
    if (targetPhrase.GetWord(i).IsNonTerminal()) {
      const ChartHypothesis * prev_hypo = cur_hypo.GetPrevHypo(targetPhrase.GetAlignNonTerm().GetNonTermIndexMap()[i]);
      absolute_source_position[targetPhrase.GetAlignNonTerm().GetNonTermIndexMap2()[i]] = prev_hypo->GetCurrSourceRange().GetEndPos();
    }
  }

  // set absolute position of all source terminal symbols based on absolute position of previous symbol
  // (entries still holding the initial 0 are the ones not set above)
  for (size_t i = 1; i < absolute_source_position.size(); i++) {
    if (absolute_source_position[i] == 0) {
      absolute_source_position[i] = absolute_source_position[i-1] + 1;
    }
  }

  for (int i = 0; i < target_size; i++) {
    //Sometimes we have to traverse more than one target words because of
    //unaligned words. This is O(n^2) in worst case, but usually closer to O(n)
    if (targetPhrase.GetWord(i).IsNonTerminal()) {
      //If we have a non terminal we can get the alignments from the previous state
      const ChartHypothesis * prev_hypo = cur_hypo.GetPrevHypo(targetPhrase.GetAlignNonTerm().GetNonTermIndexMap()[i]);
      const BilingualLMState * prev_state = static_cast<const BilingualLMState *>(prev_hypo->GetFFState(featureID));
      // A const reference avoids copying the alignment vector before appending it.
      const std::vector<int>& prevWordAls = prev_state->GetWordAlignmentVector();
      word_alignments.insert(word_alignments.end(), prevWordAls.begin(), prevWordAls.end());
    } else {
      int source_word_mid_idx = 0; //The word alignment; always overwritten below before it is stored
      bool resolvedIndexis = false; //If we are aligning to an existing nonterm we don't need to calculate offsets
      std::set<size_t> word_al = alignments.GetAlignmentsForTarget(i);
      if (word_al.empty()) {
        for (int j = 1; j < target_size; j++) {
          //Try to get alignment from the current word and if it is unaligned,
          //try from the first word to the right and then to the left
          if ((i+j) < target_size) {
            //TODO: this will always succeed, even if first word in previous hypo is unaligned. should it?
            if (targetPhrase.GetWord(i + j).IsNonTerminal()) {
              const ChartHypothesis * prev_hypo = cur_hypo.GetPrevHypo(targetPhrase.GetAlignNonTerm().GetNonTermIndexMap()[i+j]);
              const BilingualLMState * prev_state = static_cast<const BilingualLMState *>(prev_hypo->GetFFState(featureID));
              source_word_mid_idx = prev_state->GetWordAlignmentVector().front(); // The first word on the right of our word
              resolvedIndexis = true;
              break;
            }
            word_al = alignments.GetAlignmentsForTarget(i + j);
            if (!word_al.empty()) {
              break;
            }
          }

          if ((i - j) >= 0) {
            //TODO: this will always succeed, even if last word in previous hypo is unaligned. should it?
            if (targetPhrase.GetWord(i - j).IsNonTerminal()) {
              const ChartHypothesis * prev_hypo = cur_hypo.GetPrevHypo(targetPhrase.GetAlignNonTerm().GetNonTermIndexMap()[i-j]);
              const BilingualLMState * prev_state = static_cast<const BilingualLMState *>(prev_hypo->GetFFState(featureID));
              source_word_mid_idx = prev_state->GetWordAlignmentVector().back(); // The first word on the left of our word
              resolvedIndexis = true;
              break;
            }

            word_al = alignments.GetAlignmentsForTarget(i - j);
            if (!word_al.empty()) {
              break;
            }
          }
        }
      }

      if (!resolvedIndexis) {
        //It should never be the case that the word_al size would be zero, but several times this has happened because
        //of a corrupt phrase table. It is best to have this check here, as it makes debugging the problem a lot easier.
        UTIL_THROW_IF2(word_al.size() == 0,
                       "A target phrase with no alignments detected! " << targetPhrase << "Check if there is something wrong with your phrase table.");
        size_t source_center_index = selectMiddleAlignment(word_al);
        // We have found the alignment. Translate the rule-internal source index into an absolute sentence position.
        source_word_mid_idx = absolute_source_position[source_center_index];
      }
      word_alignments.push_back(source_word_mid_idx);
    }
  }

}

// Computes the state hash for chart decoding from the last target_ngrams
// entries of neuralLMids, combined oldest to newest with boost::hash_combine.
// When fewer than target_ngrams ids are available, the missing leading
// positions are hashed as the id of BOS_word.
size_t BilingualLM::getStateChart(std::vector<int>& neuralLMids) const
{
  size_t hashCode = 0;

  // Work in signed arithmetic. Comparing a negative int index against the
  // unsigned size() converts the index to a huge unsigned value, which made
  // the loop exit immediately whenever padding was required: the hash stayed
  // 0 and the BOS branch below was never reached.
  const int num_ids = static_cast<int>(neuralLMids.size());
  const int context_size = static_cast<int>(target_ngrams);

  for (int i = num_ids - context_size; i < num_ids; i++) {
    int neuralLM_wordID;
    if (i < 0) {
      // Position before the first available id: pad with BOS.
      neuralLM_wordID = getNeuralLMId(BOS_word, false);
    } else {
      neuralLM_wordID = neuralLMids[i];
    }
    boost::hash_combine(hashCode, neuralLM_wordID);
  }
  return hashCode;
}

// Appends to `words` the target context window for position
// current_word_index of neuralLMids: the ids at positions
// current_word_index - target_ngrams .. current_word_index, inclusive.
// Positions before the start of neuralLMids are padded with the id of
// BOS_word when sentence_begin is true, and with the id of getNullWord()
// otherwise.
void BilingualLM::getTargetWordsChart(
  std::vector<int>& neuralLMids,
  int current_word_index,
  std::vector<int>& words,
  bool sentence_begin) const
{
  const int window_start = current_word_index - target_ngrams;

  for (int pos = window_start; pos <= current_word_index; ++pos) {
    if (pos >= 0) {
      // Inside the available ids: copy straight through.
      words.push_back(neuralLMids[pos]);
      continue;
    }

    // Before the first available id: choose the padding by sentence position.
    const int pad_id = sentence_begin
                       ? getNeuralLMId(BOS_word, false)
                       : getNeuralLMId(getNullWord(), false);
    words.push_back(pad_id);
  }
}

void BilingualLM::appendSourceWordsToVector(const Sentence &source_sent, std::vector<int> &words, int source_word_mid_idx) const
{
  //Define begin and end indexes of the lookup. Cases for even and odd ngrams
  //This can result in indexes which span larger than the length of the source phrase.
  //In this case we just
  int begin_idx;
  int end_idx;

  if (source_ngrams % 2 == 0) {
    begin_idx = source_word_mid_idx - source_ngrams / 2 + 1;
    end_idx = source_word_mid_idx + source_ngrams / 2;
  } else {
    begin_idx = source_word_mid_idx - (source_ngrams - 1) / 2;
    end_idx = source_word_mid_idx + (source_ngrams - 1) / 2;
  }

  //Add words to vector
  for (int j = begin_idx; j <= end_idx; j++) {
    int neuralLM_wordID;
    if (j < 0) {
      neuralLM_wordID = getNeuralLMId(BOS_word, true);
    } else if (j >= source_sent.GetSize()) {
      neuralLM_wordID = getNeuralLMId(EOS_word, true);
    } else {
      const Word& word = source_sent.GetWord(j);
      neuralLM_wordID = getNeuralLMId(word, true);
    }
    words.push_back(neuralLM_wordID);
  }
}

// Chart-based scoring of a hypothesis.
// Expands the hypothesis into the ids of its full target yield
// (getAllTargetIdsChart) and one source position per target word
// (getAllAlignments), scores every target word with its source and target
// context, subtracts the scores already recorded for this feature on the
// previous hypotheses, and adds the result to `accumulator`.
// featureID indexes this feature's state in the previous hypotheses.
// Returns a newly allocated BilingualLMState holding the state hash, the
// alignments and the target word ids.
FFState* BilingualLM::EvaluateWhenApplied(
  const ChartHypothesis& cur_hypo,
  int featureID, /* - used to index the state in the previous hypotheses */
  ScoreComponentCollection* accumulator) const
{
  //Init vectors
  std::vector<int> source_words;
  source_words.reserve(source_ngrams);
  std::vector<int> target_words;
  target_words.reserve(target_ngrams+1);

  float value = 0; //NeuralLM score
  const TargetPhrase& currTargetPhrase = cur_hypo.GetCurrTargetPhrase();

  std::vector<int> neuralLMids; //Equivalent more or less to whole_phrase. Contains all word ids but not as expensive
  std::vector<int> alignments;
  //Estimate size and reserve vectors to avoid reallocation
  int future_size = currTargetPhrase.GetNumTerminals();
  for (int i =0; i<currTargetPhrase.GetNumNonTerminals(); i++) {
    const ChartHypothesis * prev_hypo = cur_hypo.GetPrevHypo(i); //We need to look at the nonterm on the left.
    future_size += prev_hypo->GetCurrTargetPhrase().GetSize();
  }
  neuralLMids.reserve(future_size);
  alignments.reserve(future_size);

  getAllTargetIdsChart(cur_hypo, featureID, neuralLMids);
  getAllAlignments(cur_hypo, featureID, alignments);

  //Check if this hypothesis' target words are located in the beginning of the sentence.
  //Guard against an empty yield: reading neuralLMids[0] of an empty vector is undefined behaviour.
  const bool sentence_begin =
    !neuralLMids.empty() && neuralLMids[0] == getNeuralLMId(BOS_word, false);

  //Get source sentence
  const ChartManager& manager = cur_hypo.GetManager();
  const Sentence& source_sent = static_cast<const Sentence&>(manager.GetSource());

  const int num_ids = static_cast<int>(neuralLMids.size());
  for (int i = 0; i < num_ids; i++) { //This loop should be bigger as non terminals expand

    //We already have resolved the nonterminals, we are left with a simple loop.
    appendSourceWordsToVector(source_sent, source_words, alignments[i]);
    getTargetWordsChart(neuralLMids, i, target_words, sentence_begin);

    value += Score(source_words, target_words); // Get the score

    //Clear the vectors before the next iteration
    source_words.clear();
    target_words.clear();

  }
  size_t new_state = getStateChart(neuralLMids);

  // we're rescoring the full hypothesis, so we need to detract scores from previous hypos
  for (std::vector<const ChartHypothesis*>::const_iterator iter = cur_hypo.GetPrevHypos().begin(); iter != cur_hypo.GetPrevHypos().end(); ++iter) {
    const ChartHypothesis &prevHypo = **iter;
    value -= (prevHypo.GetScoreBreakdown().GetScoreForProducer(this));
  }

  accumulator->PlusEquals(this, value);

  return new BilingualLMState(new_state, alignments, neuralLMids);
}

// Handles one key/value pair from the feature's configuration.
// "path" is stored in m_filePath; every other key is forwarded to
// StatefulFeatureFunction::SetParameter.
void BilingualLM::SetParameter(const std::string& key, const std::string& value)
{
  if (key != "path") {
    // Not ours: let the base class deal with it.
    StatefulFeatureFunction::SetParameter(key, value);
    return;
  }

  m_filePath = value;
}

} // namespace Moses