added CacheFreqWords() to speed up decoding with suffix array PTs

git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@4064 1f5c12ca-751b-0410-a591-d2e778427230
leven101 2011-07-01 14:36:28 +00:00
parent 126739f3f1
commit e0174b413c
2 changed files with 59 additions and 36 deletions

moses/src/BilingualDynSuffixArray.cpp

@@ -56,13 +56,16 @@ bool BilingualDynSuffixArray::Load(
m_srcSA = new DynSuffixArray(m_srcCorpus);
if(!m_srcSA) return false;
cerr << "Building Target Suffix Array...\n";
m_trgSA = new DynSuffixArray(m_trgCorpus);
if(!m_trgSA) return false;
//m_trgSA = new DynSuffixArray(m_trgCorpus);
//if(!m_trgSA) return false;
cerr << "\t(Skipped. Not used)\n";
InputFileStream alignStrme(alignments);
cerr << "Loading Alignment File...\n";
LoadRawAlignments(alignStrme);
//LoadAlignments(alignStrme);
cerr << "Building frequent word cache...\n";
CacheFreqWords();
return true;
}
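Note: the target-side suffix array is no longer built at load time; lookups in this phrase table only go through the source suffix array, and the new CacheFreqWords() call precomputes translation probabilities for the most frequent source words before decoding starts.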
@@ -169,7 +172,7 @@ bool BilingualDynSuffixArray::ExtractPhrases(const int& sntIndex, const int& wor
void BilingualDynSuffixArray::CleanUp()
{
m_wordPairCache.clear();
//m_wordPairCache.clear();
}
int BilingualDynSuffixArray::LoadCorpus(InputFileStream& corpus, const FactorList& factors,
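Note: with the clear() commented out, m_wordPairCache now persists across CleanUp() calls, so the probabilities cached for frequent words at load time are not recomputed for every sentence.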
@@ -218,7 +221,7 @@ bool BilingualDynSuffixArray::GetLocalVocabIDs(const Phrase& src, SAPhrase &outp
pair<float, float> BilingualDynSuffixArray::GetLexicalWeight(const PhrasePair& phrasepair) const
{
//return pair<float, float>(0, 0);
//return pair<float, float>(1, 1);
float srcLexWeight(1.0), trgLexWeight(1.0);
std::map<pair<wordID_t, wordID_t>, float> targetProbs; // collect sum of target probs given source words
//const SentenceAlignment& alignment = m_alignments[phrasepair.m_sntIndex];
@@ -229,8 +232,9 @@ pair<float, float> BilingualDynSuffixArray::GetLexicalWeight(const PhrasePair& p
float srcSumPairProbs(0);
wordID_t srcWord = m_srcCorpus->at(srcIdx + m_srcSntBreaks[phrasepair.m_sntIndex]); // localIDs
const std::vector<int>& srcWordAlignments = alignment.alignedList.at(srcIdx);
// for each target word aligned to this source word in this alignment
if(srcWordAlignments.size() == 0) { // get p(NULL|src)
pair<wordID_t, wordID_t> wordpair = std::make_pair(srcWord, m_srcVocab->GetkOOVWordID());
pair<wordID_t, wordID_t> wordpair = make_pair(srcWord, m_srcVocab->GetkOOVWordID());
itrCache = m_wordPairCache.find(wordpair);
if(itrCache == m_wordPairCache.end()) { // if not in cache
CacheWordProbs(srcWord);
@@ -245,7 +249,7 @@ pair<float, float> BilingualDynSuffixArray::GetLexicalWeight(const PhrasePair& p
int trgIdx = srcWordAlignments[i];
wordID_t trgWord = m_trgCorpus->at(trgIdx + m_trgSntBreaks[phrasepair.m_sntIndex]);
// get probability of this source->target word pair
pair<wordID_t, wordID_t> wordpair = std::make_pair(srcWord, trgWord);
pair<wordID_t, wordID_t> wordpair = make_pair(srcWord, trgWord);
itrCache = m_wordPairCache.find(wordpair);
if(itrCache == m_wordPairCache.end()) { // if not in cache
CacheWordProbs(srcWord);
@@ -275,14 +279,35 @@ pair<float, float> BilingualDynSuffixArray::GetLexicalWeight(const PhrasePair& p
// TODO::Need to get p(NULL|trg)
return pair<float, float>(srcLexWeight, trgLexWeight);
}
void BilingualDynSuffixArray::CacheFreqWords() const {
std::multimap<int, wordID_t> wordCnts;
// for each source word in vocab
Vocab::Word2Id::const_iterator it;
for(it = m_srcVocab->VocabStart(); it != m_srcVocab->VocabEnd(); ++it) {
// get its frequency
wordID_t srcWord = it->second;
std::vector<wordID_t> sword(1, srcWord), wrdIndices;
m_srcSA->GetCorpusIndex(&sword, &wrdIndices);
if(wrdIndices.size() >= 1000) { // min count
wordCnts.insert(make_pair(wrdIndices.size(), srcWord));
}
}
int numSoFar(0);
std::multimap<int, wordID_t>::reverse_iterator ritr;
for(ritr = wordCnts.rbegin(); ritr != wordCnts.rend(); ++ritr) {
m_freqWordsCached.insert(ritr->second);
CacheWordProbs(ritr->second);
if(++numSoFar == 50) break; // get top counts
}
cerr << "\tCached " << m_freqWordsCached.size() << " source words\n";
}
void BilingualDynSuffixArray::CacheWordProbs(wordID_t srcWord) const
{
std::map<wordID_t, int> counts;
std::vector<wordID_t> vword(1, srcWord), wrdIndices;
bool ret = m_srcSA->GetCorpusIndex(&vword, &wrdIndices);
std::vector<wordID_t> sword(1, srcWord), wrdIndices;
bool ret = m_srcSA->GetCorpusIndex(&sword, &wrdIndices);
assert(ret);
std::vector<int> sntIndexes = GetSntIndexes(wrdIndices, 1);
std::vector<int> sntIndexes = GetSntIndexes(wrdIndices, 1, m_srcSntBreaks);
float denom(0);
// for each occurrence of this word
for(size_t snt = 0; snt < sntIndexes.size(); ++snt) {
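The top-k selection in CacheFreqWords() above relies on std::multimap keeping its keys in ascending order, so iterating in reverse visits the highest counts first. A minimal standalone sketch of that pattern (word IDs, counts, and the cutoff of 2 are illustrative, not the real vocabulary types or the cutoff of 50 used above):

#include <iostream>
#include <map>
#include <set>

int main() {
  std::multimap<int, int> wordCnts; // count -> word ID, sorted ascending by count
  wordCnts.insert(std::make_pair(4500, 3));
  wordCnts.insert(std::make_pair(1200, 7));
  wordCnts.insert(std::make_pair(1000, 9));
  std::set<int> freqWordsCached;
  int numSoFar(0);
  // reverse iteration yields the most frequent words first
  for(std::multimap<int, int>::reverse_iterator ritr = wordCnts.rbegin();
      ritr != wordCnts.rend(); ++ritr) {
    freqWordsCached.insert(ritr->second);
    if(++numSoFar == 2) break; // keep only the top counts
  }
  std::cout << "cached " << freqWordsCached.size() << " words\n"; // word IDs 3 and 7
}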
@@ -290,7 +315,6 @@ void BilingualDynSuffixArray::CacheWordProbs(wordID_t srcWord) const
assert(sntIdx != -1);
int srcWrdSntIdx = wrdIndices.at(snt) - m_srcSntBreaks.at(sntIdx); // get word index in sentence
const std::vector<int> srcAlg = GetSentenceAlignment(sntIdx).alignedList.at(srcWrdSntIdx); // list of target words for this source word
//const std::vector<int>& srcAlg = m_alignments.at(sntIdx).alignedList.at(srcWrdSntIdx); // list of target words for this source word
if(srcAlg.size() == 0) {
++counts[m_srcVocab->GetkOOVWordID()]; // if not aligned then align to NULL word
++denom;
@@ -307,7 +331,7 @@ void BilingualDynSuffixArray::CacheWordProbs(wordID_t srcWord) const
// get probs and cache all pairs
for(std::map<wordID_t, int>::const_iterator itrCnt = counts.begin();
itrCnt != counts.end(); ++itrCnt) {
pair<wordID_t, wordID_t> wordPair = std::make_pair(srcWord, itrCnt->first);
pair<wordID_t, wordID_t> wordPair = make_pair(srcWord, itrCnt->first);
float srcTrgPrb = float(itrCnt->second) / float(denom); // gives p(src->trg)
float trgSrcPrb = float(itrCnt->second) / float(counts.size()); // gives p(trg->src)
m_wordPairCache[wordPair] = pair<float, float>(srcTrgPrb, trgSrcPrb);
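Note on the two cached values: denom counts every aligned occurrence of srcWord across the matched sentences, so srcTrgPrb = count(src,trg) / denom is a relative-frequency estimate of p(trg|src). trgSrcPrb divides by counts.size(), the number of distinct aligned target words, so it appears to be only a rough stand-in for p(src|trg) rather than a true reverse relative frequency.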
@@ -347,12 +371,14 @@ void BilingualDynSuffixArray::GetTargetPhrasesByLexicalWeight(const Phrase& src,
if(!GetLocalVocabIDs(src, localIDs)) return;
float totalTrgPhrases(0);
std::map<SAPhrase, int> phraseCounts;
//std::map<SAPhrase, PhrasePair> phraseColl; // (one of) the word indexes this phrase was taken from
std::map<SAPhrase, pair<float, float> > lexicalWeights;
std::map<SAPhrase, pair<float, float> >::iterator itrLexW;
std::vector<unsigned> wrdIndices;
// extract sentence IDs from SA and return rightmost index of phrases
if(!m_srcSA->GetCorpusIndex(&(localIDs.words), &wrdIndices)) return;
std::vector<int> sntIndexes = GetSntIndexes(wrdIndices, sourceSize);
SampleSelection(wrdIndices);
std::vector<int> sntIndexes = GetSntIndexes(wrdIndices, sourceSize, m_srcSntBreaks);
// for each sentence with this phrase
for(size_t snt = 0; snt < sntIndexes.size(); ++snt) {
std::vector<PhrasePair*> phrasePairs; // to store all phrases possible from current sentence
@@ -365,8 +391,8 @@ void BilingualDynSuffixArray::GetTargetPhrasesByLexicalWeight(const Phrase& src,
for (iterPhrasePair = phrasePairs.begin(); iterPhrasePair != phrasePairs.end(); ++iterPhrasePair) {
SAPhrase phrase = TrgPhraseFromSntIdx(**iterPhrasePair);
phraseCounts[phrase]++; // count each unique phrase
// TODO::Inefficient to extract lexical weight here. Should do it later
// once the top phrases have been chosen by phrase prob p(e|f)
// NOTE::Correct but slow to extract lexical weight here. Could do
// it later for only the top phrases chosen by phrase prob p(e|f)
pair<float, float> lexWeight = GetLexicalWeight(**iterPhrasePair); // get lexical weighting for this phrase pair
itrLexW = lexicalWeights.find(phrase); // check if phrase already has lexical weight attached
if((itrLexW != lexicalWeights.end()) && (itrLexW->second.first < lexWeight.first))
@@ -388,7 +414,7 @@ void BilingualDynSuffixArray::GetTargetPhrasesByLexicalWeight(const Phrase& src,
scoreVector[0] = trg2SrcMLE;
scoreVector[1] = itrLexW->second.first;
scoreVector[2] = 2.718; // exp(1);
phraseScores.insert(pair<Scores, const SAPhrase*>(scoreVector, &iterPhrases->first));
phraseScores.insert(make_pair(scoreVector, &iterPhrases->first));
}
// return top scoring phrases
std::multimap<Scores, const SAPhrase*, ScoresComp>::reverse_iterator ritr;
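Note: each phrase's score vector holds the target-to-source MLE in slot 0, the best lexical weight in slot 1, and the constant e (2.718) in slot 2, which presumably serves as the phrase-penalty feature so that its log contributes exactly 1.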
@@ -398,19 +424,18 @@ void BilingualDynSuffixArray::GetTargetPhrasesByLexicalWeight(const Phrase& src,
target.push_back(make_pair( scoreVector, targetPhrase));
if(target.size() == m_maxSampleSize) break;
}
return;
}
std::vector<int> BilingualDynSuffixArray::GetSntIndexes(std::vector<unsigned>& wrdIndices,
const int sourceSize) const
const int sourceSize, const std::vector<unsigned>& sntBreaks) const
{
std::vector<unsigned>::const_iterator vit;
std::vector<int> sntIndexes;
for(size_t i=0; i < wrdIndices.size(); ++i) {
vit = std::upper_bound(m_srcSntBreaks.begin(), m_srcSntBreaks.end(), wrdIndices[i]);
int index = int(vit - m_srcSntBreaks.begin()) - 1;
vit = std::upper_bound(sntBreaks.begin(), sntBreaks.end(), wrdIndices[i]);
int index = int(vit - sntBreaks.begin()) - 1;
// check for phrases that cross sentence boundaries
if(wrdIndices[i] - sourceSize + 1 < m_srcSntBreaks.at(index))
if(wrdIndices[i] - sourceSize + 1 < sntBreaks.at(index))
sntIndexes.push_back(-1); // set bad flag
else
sntIndexes.push_back(index); // store the index of the sentence in the corpus
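The sentence lookup above works because sntBreaks stores the corpus offset of each sentence's first word: std::upper_bound finds the first break past the word index, and stepping back one gives the containing sentence. A small self-contained illustration (the offsets are made up):

#include <algorithm>
#include <iostream>
#include <vector>

int main() {
  std::vector<unsigned> sntBreaks;
  sntBreaks.push_back(0);  // sentence 0 starts at word 0
  sntBreaks.push_back(5);  // sentence 1 starts at word 5
  sntBreaks.push_back(12); // sentence 2 starts at word 12
  unsigned wrdIndex = 7;
  std::vector<unsigned>::const_iterator vit =
    std::upper_bound(sntBreaks.begin(), sntBreaks.end(), wrdIndex);
  int index = int(vit - sntBreaks.begin()) - 1;
  std::cout << "word " << wrdIndex << " is in sentence " << index << "\n"; // prints 1
}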
@@ -418,17 +443,13 @@ std::vector<int> BilingualDynSuffixArray::GetSntIndexes(std::vector<unsigned>& w
return sntIndexes;
}
std::vector<unsigned> BilingualDynSuffixArray::SampleSelection(std::vector<unsigned> sample) const
int BilingualDynSuffixArray::SampleSelection(std::vector<unsigned>& sample,
int sampleSize) const
{
//sample.erase(wrdIndices.begin()+m_maxSampleSize, wrdIndices.end());
//return sample;
int size = sample.size();
//if(size < m_maxSampleSize) return sample;
std::vector<unsigned> subSample;
int jump = size / m_maxSampleSize;
for(int i=0; i < size; i+=jump)
subSample.push_back(sample.at(i));
return subSample;
// only use top 'sampleSize' number of samples
if(sample.size() > sampleSize)
sample.erase(sample.begin()+sampleSize, sample.end());
return sample.size();
}
void BilingualDynSuffixArray::addSntPair(string& source, string& target, string& alignment) {
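Note: SampleSelection() changes behavior here. The old version spread the samples over the whole hit list by stepping through it in jumps of size / m_maxSampleSize; the new version simply truncates to the first sampleSize hits (300 by default, per the header) and returns how many remain, trading an even spread over the corpus for a cheaper cut.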
@@ -458,7 +479,7 @@ void BilingualDynSuffixArray::addSntPair(string& source, string& target, string&
}
m_trgSntBreaks.push_back(oldTrgCrpSize);
m_srcSA->Insert(&srcFactor, oldSrcCrpSize);
m_trgSA->Insert(&trgFactor, oldTrgCrpSize);
//m_trgSA->Insert(&trgFactor, oldTrgCrpSize);
LoadRawAlignments(alignment);
m_trgVocab->MakeClosed();
}
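Note: consistent with Load(), addSntPair() no longer inserts into the target suffix array, since m_trgSA is never built.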

moses/src/BilingualDynSuffixArray.h

@@ -105,6 +105,7 @@ private:
std::vector<std::vector<short> > m_rawAlignments;
mutable std::map<std::pair<wordID_t, wordID_t>, std::pair<float, float> > m_wordPairCache;
mutable std::set<wordID_t> m_freqWordsCached;
const size_t m_maxPhraseLength, m_maxSampleSize;
int LoadCorpus(InputFileStream&, const std::vector<FactorType>& factors,
@@ -116,13 +117,14 @@ private:
bool ExtractPhrases(const int&, const int&, const int&, std::vector<PhrasePair*>&, bool=false) const;
SentenceAlignment GetSentenceAlignment(const int, bool=false) const;
std::vector<unsigned> SampleSelection(std::vector<unsigned>) const;
int SampleSelection(std::vector<unsigned>&, int = 300) const;
std::vector<int> GetSntIndexes(std::vector<unsigned>&, const int) const;
std::vector<int> GetSntIndexes(std::vector<unsigned>&, int, const std::vector<unsigned>&) const;
TargetPhrase* GetMosesFactorIDs(const SAPhrase&) const;
SAPhrase TrgPhraseFromSntIdx(const PhrasePair&) const;
bool GetLocalVocabIDs(const Phrase&, SAPhrase &) const;
void CacheWordProbs(wordID_t) const;
void CacheFreqWords() const;
std::pair<float, float> GetLexicalWeight(const PhrasePair&) const;
int GetSourceSentenceSize(size_t sentenceId) const