mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-10-26 19:37:58 +03:00
479 lines
18 KiB
C++
479 lines
18 KiB
C++
#include <vector>
|
|
#include "BilingualLM.h"
|
|
#include "moses/ScoreComponentCollection.h"
|
|
|
|
using namespace std;
|
|
|
|
namespace Moses
|
|
{
|
|
|
|
////////////////////////////////////////////////////////////////
|
|
// Constructs the feature from its configuration line and sets up the
// sentence-boundary words (BOS/EOS) that are later used as padding.
// The base class is initialised with (1, line) and word_factortype with 0.
BilingualLM::BilingualLM(const std::string &line)
  : StatefulFeatureFunction(1, line),
    word_factortype(0)
{
  // Both boundary markers are registered through the shared factor collection.
  FactorCollection& factors = FactorCollection::Instance();

  BOS_factor = factors.AddFactor(BOS_);
  EOS_factor = factors.AddFactor(EOS_);

  // Each boundary word carries its factor in slot 0.
  BOS_word.SetFactor(0, BOS_factor);
  EOS_word.SetFactor(0, EOS_factor);
}
|
|
|
|
// Stores the supplied options in m_options, then calls ReadParameters()
// followed by loadModel(). The order of the three steps is kept as-is.
void BilingualLM::Load(AllOptions::ptr const& opts)
{
  this->m_options = opts;
  this->ReadParameters();
  this->loadModel();
}
|
|
|
|
//Populates words with amount words from the targetPhrase from the previous hypothesis where
|
|
//words[0] is the last word of the previous hypothesis, words[1] is the second last etc...
|
|
void BilingualLM::requestPrevTargetNgrams(
|
|
const Hypothesis &cur_hypo, int amount, std::vector<int> &words) const
|
|
{
|
|
const Hypothesis * prev_hyp = cur_hypo.GetPrevHypo();
|
|
int found = 0;
|
|
|
|
while (prev_hyp && found != amount) {
|
|
const TargetPhrase& currTargetPhrase = prev_hyp->GetCurrTargetPhrase();
|
|
for (int i = currTargetPhrase.GetSize() - 1; i> -1; i--) {
|
|
if (found != amount) {
|
|
const Word& word = currTargetPhrase.GetWord(i);
|
|
words[found] = getNeuralLMId(word, false);
|
|
found++;
|
|
} else {
|
|
return; //We have gotten everything needed
|
|
}
|
|
}
|
|
|
|
prev_hyp = prev_hyp->GetPrevHypo();
|
|
}
|
|
|
|
int neuralLM_wordID = getNeuralLMId(BOS_word, false);
|
|
for (int i = found; i < amount; i++) {
|
|
words[i] = neuralLM_wordID;
|
|
}
|
|
}
|
|
|
|
//Populates the words vector with target_ngrams sized that also contains the current word we are looking at.
|
|
//(in effect target_ngrams + 1)
|
|
void BilingualLM::getTargetWords(
|
|
const Hypothesis &cur_hypo,
|
|
const TargetPhrase &targetPhrase,
|
|
int current_word_index,
|
|
std::vector<int> &words) const
|
|
{
|
|
//Check if we need to look at previous target phrases
|
|
int additional_needed = current_word_index - target_ngrams;
|
|
if (additional_needed < 0) {
|
|
additional_needed = -additional_needed;
|
|
std::vector<int> prev_words(additional_needed);
|
|
requestPrevTargetNgrams(cur_hypo, additional_needed, prev_words);
|
|
for (int i = additional_needed - 1; i >= 0; i--) {
|
|
words.push_back(prev_words[i]);
|
|
}
|
|
}
|
|
|
|
if (words.size() > 0) {
|
|
//We have added some words from previous phrases
|
|
//Just add until we reach current_word_index
|
|
for (int i = 0; i <= current_word_index; i++) {
|
|
const Word& word = targetPhrase.GetWord(i);
|
|
words.push_back(getNeuralLMId(word, false));
|
|
}
|
|
} else {
|
|
//We haven't added any words, proceed as before
|
|
for (int i = current_word_index - target_ngrams; i <= current_word_index; i++) {
|
|
const Word& word = targetPhrase.GetWord(i);
|
|
words.push_back(getNeuralLMId(word, false));
|
|
}
|
|
}
|
|
}
|
|
|
|
//Returns source words in the way NeuralLM expects them.
|
|
|
|
size_t BilingualLM::selectMiddleAlignment(
|
|
const set<size_t>& alignment_links) const
|
|
{
|
|
|
|
set<size_t>::iterator it = alignment_links.begin();
|
|
for (size_t i = 0; i < (alignment_links.size() - 1) / 2; ++i) {
|
|
++it;
|
|
}
|
|
|
|
return *it;
|
|
}
|
|
|
|
void BilingualLM::getSourceWords(
|
|
const TargetPhrase &targetPhrase,
|
|
int targetWordIdx,
|
|
const Sentence &source_sent,
|
|
const Range &sourceWordRange,
|
|
std::vector<int> &words) const
|
|
{
|
|
//Get source context
|
|
|
|
//Get alignment for the word we require
|
|
const AlignmentInfo& alignments = targetPhrase.GetAlignTerm();
|
|
|
|
// We are getting word alignment for targetPhrase.GetWord(i + target_ngrams -1) according to the paper.
|
|
// Find the closest target word with alignment links.
|
|
std::set<size_t> last_word_al;
|
|
for (int j = 0; j < targetPhrase.GetSize(); j++) {
|
|
// Find the nearest aligned word with preference for right.
|
|
if ((targetWordIdx + j) < targetPhrase.GetSize()) {
|
|
last_word_al = alignments.GetAlignmentsForTarget(targetWordIdx + j);
|
|
if (!last_word_al.empty()) {
|
|
break;
|
|
}
|
|
}
|
|
|
|
// We couldn't find word on the right, try to the left.
|
|
if ((targetWordIdx - j) >= 0) {
|
|
last_word_al = alignments.GetAlignmentsForTarget(targetWordIdx - j);
|
|
if (!last_word_al.empty()) {
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
//Assume we have gotten some alignment here. If we couldn't get an alignment from the above routine it means
|
|
//that none of the words in the target phrase aligned to any word in the source phrase
|
|
|
|
// Now we get the source words. First select middle alignment.
|
|
//It should never be the case the the word_al size would be zero, but several times this has happened because
|
|
//of a corrupt phrase table. It is best to have this check here, as it makes debugging the problem a lot easier.
|
|
UTIL_THROW_IF2(last_word_al.size() == 0,
|
|
"A target phrase with no alignments detected! " << targetPhrase << "Check if there is something wrong with your phrase table.");
|
|
size_t source_center_index = selectMiddleAlignment(last_word_al);
|
|
// We have found the alignment. Now determine how much to shift by to get the actual source word index.
|
|
size_t phrase_start_pos = sourceWordRange.GetStartPos();
|
|
// Account for how far the current word is from the start of the phrase.
|
|
size_t source_word_mid_idx = phrase_start_pos + source_center_index;
|
|
|
|
appendSourceWordsToVector(source_sent, words, source_word_mid_idx);
|
|
}
|
|
|
|
size_t BilingualLM::getState(const Hypothesis& cur_hypo) const
|
|
{
|
|
const TargetPhrase &targetPhrase = cur_hypo.GetCurrTargetPhrase();
|
|
size_t hashCode = 0;
|
|
|
|
// Check if we need to look at previous target phrases
|
|
int additional_needed = targetPhrase.GetSize() - target_ngrams;
|
|
if (additional_needed < 0) {
|
|
additional_needed = -additional_needed;
|
|
std::vector<int> prev_words(additional_needed);
|
|
requestPrevTargetNgrams(cur_hypo, additional_needed, prev_words);
|
|
for (int i = additional_needed - 1; i >= 0; i--) {
|
|
boost::hash_combine(hashCode, prev_words[i]);
|
|
}
|
|
|
|
// Get the rest of the phrases needed
|
|
for (int i = 0; i < targetPhrase.GetSize(); i++) {
|
|
const Word& word = targetPhrase.GetWord(i);
|
|
int neuralLM_wordID = getNeuralLMId(word, false);
|
|
boost::hash_combine(hashCode, neuralLM_wordID);
|
|
}
|
|
} else {
|
|
// We just need the last target_ngrams from the current target phrase.
|
|
for (int i = targetPhrase.GetSize() - target_ngrams; i < targetPhrase.GetSize(); i++) {
|
|
const Word& word = targetPhrase.GetWord(i);
|
|
int neuralLM_wordID = getNeuralLMId(word, false);
|
|
|
|
boost::hash_combine(hashCode, neuralLM_wordID);
|
|
}
|
|
}
|
|
|
|
return hashCode;
|
|
}
|
|
|
|
// Scores the target phrase of cur_hypo (Hypothesis overload).
//
// For every word of the current target phrase the source context
// (getSourceWords) and the target history (getTargetWords) are gathered and
// passed to Score(); the per-word scores are summed and added to
// `accumulator` for this feature.
//
// prev_state is not read: the history is recovered by walking cur_hypo's
// previous hypotheses instead.
//
// Returns a newly allocated BilingualLMState holding the hash from getState().
FFState* BilingualLM::EvaluateWhenApplied(
  const Hypothesis& cur_hypo,
  const FFState* prev_state,
  ScoreComponentCollection* accumulator) const
{
  Manager& manager = cur_hypo.GetManager();
  const Sentence& source_sent = static_cast<const Sentence&>(manager.GetSource());

  // Scratch vectors, reused for every word.
  std::vector<int> source_words;
  source_words.reserve(source_ngrams);
  std::vector<int> target_words;
  // getTargetWords() appends the history plus the current word, i.e.
  // target_ngrams + 1 entries; reserve that much to avoid a reallocation.
  target_words.reserve(target_ngrams + 1);

  float value = 0;
  const TargetPhrase& currTargetPhrase = cur_hypo.GetCurrTargetPhrase();
  // Source range of the current phrase, needed to turn phrase-relative
  // alignment points into sentence positions.
  const Range& sourceWordRange = cur_hypo.GetCurrSourceWordsRange();

  // Accumulate the score of every word in the current target phrase.
  for (int i = 0; i < currTargetPhrase.GetSize(); i++) {
    getSourceWords(
      currTargetPhrase, i, source_sent, sourceWordRange, source_words);
    getTargetWords(cur_hypo, currTargetPhrase, i, target_words);
    value += Score(source_words, target_words);

    // clear() keeps the capacity, so the buffers are reused next iteration.
    source_words.clear();
    target_words.clear();
  }

  size_t new_state = getState(cur_hypo);
  accumulator->PlusEquals(this, value);

  return new BilingualLMState(new_state);
}
|
|
|
|
void BilingualLM::getAllTargetIdsChart(const ChartHypothesis& cur_hypo, size_t featureID, std::vector<int>& wordIds) const
|
|
{
|
|
const TargetPhrase targetPhrase = cur_hypo.GetCurrTargetPhrase();
|
|
|
|
for (int i = 0; i < targetPhrase.GetSize(); i++) {
|
|
if (targetPhrase.GetWord(i).IsNonTerminal()) { //Nonterminal get from prev state
|
|
const ChartHypothesis * prev_hypo = cur_hypo.GetPrevHypo(targetPhrase.GetAlignNonTerm().GetNonTermIndexMap()[i]);
|
|
const BilingualLMState * prev_state = static_cast<const BilingualLMState *>(prev_hypo->GetFFState(featureID));
|
|
const std::vector<int> prevWordIDs = prev_state->GetWordIdsVector();
|
|
for (std::vector<int>::const_iterator it = prevWordIDs.begin(); it!= prevWordIDs.end(); it++) {
|
|
wordIds.push_back(*it);
|
|
}
|
|
} else {
|
|
wordIds.push_back(getNeuralLMId(targetPhrase.GetWord(i), false));
|
|
}
|
|
}
|
|
}
|
|
|
|
void BilingualLM::getAllAlignments(const ChartHypothesis& cur_hypo, size_t featureID, std::vector<int>& word_alignments) const
|
|
{
|
|
const TargetPhrase targetPhrase = cur_hypo.GetCurrTargetPhrase();
|
|
int source_word_mid_idx; //The word alignment
|
|
|
|
//Get source sent
|
|
const AlignmentInfo& alignments = targetPhrase.GetAlignTerm();
|
|
|
|
// get absolute position in source sentence for each source word in rule
|
|
std::vector<int> absolute_source_position (cur_hypo.GetCurrSourceRange().GetNumWordsCovered(), 0); //we actually only need number of source symbols in rule; can we get this number cheaply?
|
|
|
|
absolute_source_position[0] = cur_hypo.GetCurrSourceRange().GetStartPos();
|
|
// get last absolute position of each source nonterminal symbol
|
|
for (int i = 0; i < targetPhrase.GetSize(); i++) {
|
|
if (targetPhrase.GetWord(i).IsNonTerminal()) {
|
|
const ChartHypothesis * prev_hypo = cur_hypo.GetPrevHypo(targetPhrase.GetAlignNonTerm().GetNonTermIndexMap()[i]);
|
|
absolute_source_position[targetPhrase.GetAlignNonTerm().GetNonTermIndexMap2()[i]] = prev_hypo->GetCurrSourceRange().GetEndPos();
|
|
}
|
|
}
|
|
|
|
// set absolute position of all source terminal symbols based on absolute position of previous symbol
|
|
for (int i = 0; i != absolute_source_position.size(); i++) {
|
|
if (i && absolute_source_position[i] == 0) {
|
|
absolute_source_position[i] = absolute_source_position[i-1] + 1;
|
|
}
|
|
}
|
|
|
|
for (int i = 0; i < targetPhrase.GetSize(); i++) {
|
|
//Sometimes we have to traverse more than one target words because of
|
|
//unaligned words. This is O(n^2) in worst case, but usually closer to O(n)
|
|
if (targetPhrase.GetWord(i).IsNonTerminal()) {
|
|
//If we have a non terminal we can get the alignments from the previous state
|
|
const ChartHypothesis * prev_hypo = cur_hypo.GetPrevHypo(targetPhrase.GetAlignNonTerm().GetNonTermIndexMap()[i]);
|
|
const BilingualLMState * prev_state = static_cast<const BilingualLMState *>(prev_hypo->GetFFState(featureID));
|
|
const std::vector<int> prevWordAls = prev_state->GetWordAlignmentVector();
|
|
for (std::vector<int>::const_iterator it = prevWordAls.begin(); it!= prevWordAls.end(); it++) {
|
|
word_alignments.push_back(*it);
|
|
}
|
|
} else {
|
|
bool resolvedIndexis = false; //If we are aligning to an existing nonterm we don't need to calculate offsets
|
|
std::set<size_t> word_al = alignments.GetAlignmentsForTarget(i);
|
|
if (word_al.empty()) {
|
|
for (int j = 1; j < targetPhrase.GetSize(); j++) {
|
|
//Try to get alignment from the current word and if it is unaligned,
|
|
//try from the first word to the right and then to the left
|
|
if ((i+j) < targetPhrase.GetSize()) {
|
|
//TODO: this will always succeed, even if first word in previous hypo is unaligned. should it?
|
|
if (targetPhrase.GetWord(i + j).IsNonTerminal()) {
|
|
const ChartHypothesis * prev_hypo = cur_hypo.GetPrevHypo(targetPhrase.GetAlignNonTerm().GetNonTermIndexMap()[i+j]);
|
|
const BilingualLMState * prev_state = static_cast<const BilingualLMState *>(prev_hypo->GetFFState(featureID));
|
|
source_word_mid_idx = prev_state->GetWordAlignmentVector().front(); // The first word on the right of our word
|
|
resolvedIndexis = true;
|
|
break;
|
|
}
|
|
word_al = alignments.GetAlignmentsForTarget(i + j);
|
|
if (!word_al.empty()) {
|
|
break;
|
|
}
|
|
}
|
|
|
|
if ((i - j) >= 0) {
|
|
//TODO: this will always succeed, even if last word in previous hypo is unaligned. should it?
|
|
if (targetPhrase.GetWord(i - j).IsNonTerminal()) {
|
|
const ChartHypothesis * prev_hypo = cur_hypo.GetPrevHypo(targetPhrase.GetAlignNonTerm().GetNonTermIndexMap()[i-j]);
|
|
const BilingualLMState * prev_state = static_cast<const BilingualLMState *>(prev_hypo->GetFFState(featureID));
|
|
source_word_mid_idx = prev_state->GetWordAlignmentVector().back(); // The first word on the left of our word
|
|
resolvedIndexis = true;
|
|
break;
|
|
}
|
|
|
|
word_al = alignments.GetAlignmentsForTarget(i - j);
|
|
if (!word_al.empty()) {
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
if (!resolvedIndexis) {
|
|
//It should never be the case the the word_al size would be zero, but several times this has happened because
|
|
//of a corrupt phrase table. It is best to have this check here, as it makes debugging the problem a lot easier.
|
|
UTIL_THROW_IF2(word_al.size() == 0,
|
|
"A target phrase with no alignments detected! " << targetPhrase << "Check if there is something wrong with your phrase table.");
|
|
size_t source_center_index = selectMiddleAlignment(word_al);
|
|
// We have found the alignment. Now determine how much to shift by to get the actual source word index.
|
|
source_word_mid_idx = absolute_source_position[source_center_index];
|
|
}
|
|
word_alignments.push_back(source_word_mid_idx);
|
|
}
|
|
}
|
|
|
|
}
|
|
|
|
size_t BilingualLM::getStateChart(std::vector<int>& neuralLMids) const
|
|
{
|
|
size_t hashCode = 0;
|
|
for (int i = neuralLMids.size() - target_ngrams; i < neuralLMids.size(); i++) {
|
|
int neuralLM_wordID;
|
|
if (i < 0) {
|
|
neuralLM_wordID = getNeuralLMId(BOS_word, false);
|
|
} else {
|
|
neuralLM_wordID = neuralLMids[i];
|
|
}
|
|
boost::hash_combine(hashCode, neuralLM_wordID);
|
|
}
|
|
return hashCode;
|
|
}
|
|
|
|
void BilingualLM::getTargetWordsChart(
|
|
std::vector<int>& neuralLMids,
|
|
int current_word_index,
|
|
std::vector<int>& words,
|
|
bool sentence_begin) const
|
|
{
|
|
|
|
for (int i = current_word_index - target_ngrams; i <= current_word_index; i++) {
|
|
if (i < 0) {
|
|
if (sentence_begin) {
|
|
words.push_back(getNeuralLMId(BOS_word, false));
|
|
} else {
|
|
words.push_back(getNeuralLMId(getNullWord(), false));
|
|
}
|
|
} else {
|
|
words.push_back(neuralLMids[i]);
|
|
}
|
|
}
|
|
}
|
|
|
|
void BilingualLM::appendSourceWordsToVector(const Sentence &source_sent, std::vector<int> &words, int source_word_mid_idx) const
|
|
{
|
|
//Define begin and end indexes of the lookup. Cases for even and odd ngrams
|
|
//This can result in indexes which span larger than the length of the source phrase.
|
|
//In this case we just
|
|
int begin_idx;
|
|
int end_idx;
|
|
|
|
if (source_ngrams % 2 == 0) {
|
|
begin_idx = source_word_mid_idx - source_ngrams / 2 + 1;
|
|
end_idx = source_word_mid_idx + source_ngrams / 2;
|
|
} else {
|
|
begin_idx = source_word_mid_idx - (source_ngrams - 1) / 2;
|
|
end_idx = source_word_mid_idx + (source_ngrams - 1) / 2;
|
|
}
|
|
|
|
//Add words to vector
|
|
for (int j = begin_idx; j <= end_idx; j++) {
|
|
int neuralLM_wordID;
|
|
if (j < 0) {
|
|
neuralLM_wordID = getNeuralLMId(BOS_word, true);
|
|
} else if (j >= source_sent.GetSize()) {
|
|
neuralLM_wordID = getNeuralLMId(EOS_word, true);
|
|
} else {
|
|
const Word& word = source_sent.GetWord(j);
|
|
neuralLM_wordID = getNeuralLMId(word, true);
|
|
}
|
|
words.push_back(neuralLM_wordID);
|
|
}
|
|
}
|
|
|
|
// Scores cur_hypo (ChartHypothesis overload).
//
// All target word ids and their source alignment positions are collected for
// the whole hypothesis, with non-terminals expanded from the states of the
// previous hypotheses. Every position is then scored with Score(). Because
// this rescoring covers the words of the sub-hypotheses again, the scores
// they already contributed for this feature are subtracted before the result
// is added to `accumulator`.
//
// featureID indexes this feature's state in the previous hypotheses.
// Returns a newly allocated BilingualLMState holding the state hash, the
// alignments and the word ids.
FFState* BilingualLM::EvaluateWhenApplied(
  const ChartHypothesis& cur_hypo,
  int featureID, /* - used to index the state in the previous hypotheses */
  ScoreComponentCollection* accumulator) const
{
  // Scratch vectors, reused for every word.
  std::vector<int> source_words;
  source_words.reserve(source_ngrams);
  std::vector<int> target_words;
  target_words.reserve(target_ngrams+1);

  float value = 0; // NeuralLM score
  const TargetPhrase& currTargetPhrase = cur_hypo.GetCurrTargetPhrase();

  std::vector<int> neuralLMids; // All word ids of the hypothesis, non-terminals expanded.
  std::vector<int> alignments;  // One source position per entry of neuralLMids.

  // Estimate the final size so the vectors can be reserved up front. This is
  // only a capacity hint and does not affect the result.
  int future_size = currTargetPhrase.GetNumTerminals();
  for (int i = 0; i < currTargetPhrase.GetNumNonTerminals(); i++) {
    const ChartHypothesis * prev_hypo = cur_hypo.GetPrevHypo(i);
    future_size += prev_hypo->GetCurrTargetPhrase().GetSize();
  }
  neuralLMids.reserve(future_size);
  alignments.reserve(future_size);

  getAllTargetIdsChart(cur_hypo, featureID, neuralLMids);
  getAllAlignments(cur_hypo, featureID, alignments);

  // The hypothesis starts the sentence when its first id is BOS. The
  // emptiness check keeps neuralLMids[0] from being read on an empty vector.
  const bool sentence_begin =
    !neuralLMids.empty() && neuralLMids[0] == getNeuralLMId(BOS_word, false);

  // Source sentence
  const ChartManager& manager = cur_hypo.GetManager();
  const Sentence& source_sent = static_cast<const Sentence&>(manager.GetSource());

  // Non-terminals are already expanded, so a flat loop over all ids suffices.
  for (int i = 0; i < neuralLMids.size(); i++) {
    appendSourceWordsToVector(source_sent, source_words, alignments[i]);
    getTargetWordsChart(neuralLMids, i, target_words, sentence_begin);

    value += Score(source_words, target_words);

    // clear() keeps the capacity, so the buffers are reused next iteration.
    source_words.clear();
    target_words.clear();
  }
  size_t new_state = getStateChart(neuralLMids);

  // The full hypothesis was rescored, so the scores the previous hypotheses
  // already contributed for this feature have to be taken out again.
  const std::vector<const ChartHypothesis*>& prevHypos = cur_hypo.GetPrevHypos();
  for (std::vector<const ChartHypothesis*>::const_iterator iter = prevHypos.begin(); iter != prevHypos.end(); ++iter) {
    const ChartHypothesis &prevHypo = **iter;
    value -= (prevHypo.GetScoreBreakdown().GetScoreForProducer(this));
  }

  accumulator->PlusEquals(this, value);

  return new BilingualLMState(new_state, alignments, neuralLMids);
}
|
|
|
|
void BilingualLM::SetParameter(const std::string& key, const std::string& value)
|
|
{
|
|
if (key == "path") {
|
|
m_filePath = value;
|
|
} else {
|
|
StatefulFeatureFunction::SetParameter(key, value);
|
|
}
|
|
}
|
|
|
|
} // namespace Moses
|
|
|