#include "BilingualDynSuffixArray.h"
|
|
#include "moses/TranslationModel/DynSAInclude/utils.h"
|
|
#include "moses/FactorCollection.h"
|
|
#include "moses/StaticData.h"
|
|
#include "moses/TargetPhrase.h"
|
|
|
|
#include "moses/generic/sorting/NBestList.h"
|
|
#include "moses/generic/sampling/Sampling.h"
|
|
|
|
#include <boost/foreach.hpp>
|
|
#include <iomanip>
|
|
|
|
using namespace std;
|
|
|
|
namespace Moses
|
|
{
|
|
|
|
BilingualDynSuffixArray::
BilingualDynSuffixArray():
  m_maxPhraseLength(StaticData::Instance().GetMaxPhraseLength()),
  m_maxSampleSize(20), m_maxPTEntries(20)
{
  m_srcSA = 0;
  m_trgSA = 0;
  m_srcCorpus = new vector<wordID_t>();
  m_trgCorpus = new vector<wordID_t>();
  m_srcVocab = new Vocab(false);
  m_trgVocab = new Vocab(false);
  m_scoreCmp = 0;
}

BilingualDynSuffixArray::
~BilingualDynSuffixArray()
{
  if(m_srcSA) delete m_srcSA;
  if(m_trgSA) delete m_trgSA;
  if(m_srcVocab) delete m_srcVocab;
  if(m_trgVocab) delete m_trgVocab;
  if(m_srcCorpus) delete m_srcCorpus;
  if(m_trgCorpus) delete m_trgCorpus;
  if(m_scoreCmp) delete m_scoreCmp;
}
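// Load() reads the two sides of the parallel corpus and the word
// alignments, builds a dynamic suffix array over each side, and collects
// word co-occurrence counts for lexical weighting.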
bool
BilingualDynSuffixArray::
Load(
  const vector<FactorType>& inputFactors,
  const vector<FactorType>& outputFactors,
  string source, string target, string alignments,
  const vector<float> &weight)
{
  m_inputFactors = inputFactors;
  m_outputFactors = outputFactors;

  // m_scoreCmp = new ScoresComp(weight);
  InputFileStream sourceStrme(source);
  InputFileStream targetStrme(target);
  cerr << "Loading source corpus...\n";
  // Input and Output are FactorDirection values defined in TypeDef.h
  LoadCorpus(Input, sourceStrme, m_inputFactors, *m_srcCorpus, m_srcSntBreaks, m_srcVocab);
  cerr << "Loading target corpus...\n";
  LoadCorpus(Output, targetStrme, m_outputFactors, *m_trgCorpus, m_trgSntBreaks, m_trgVocab);
  CHECK(m_srcSntBreaks.size() == m_trgSntBreaks.size());

  // build suffix arrays and auxiliary arrays
  cerr << "Building Source Suffix Array...\n";
  m_srcSA = new DynSuffixArray(m_srcCorpus);
  if(!m_srcSA) return false;
  cerr << "Building Target Suffix Array...\n";
  m_trgSA = new DynSuffixArray(m_trgCorpus);
  if(!m_trgSA) return false;

  InputFileStream alignStrme(alignments);
  cerr << "Loading Alignment File...\n";
  LoadRawAlignments(alignStrme);
  cerr << m_srcSntBreaks.size() << " "
       << m_trgSntBreaks.size() << " "
       << m_rawAlignments.size() << endl;
  //LoadAlignments(alignStrme);

  // verify the line counts before the alignments are indexed below
  if (m_srcSntBreaks.size() != m_trgSntBreaks.size() ||
      m_rawAlignments.size() != m_trgSntBreaks.size()) {
    cerr << "FATAL ERROR: Line counts don't match!\n"
         << "Source side text corpus: " << m_srcSntBreaks.size() << "\n"
         << "Target side text corpus: " << m_trgSntBreaks.size() << "\n"
         << "Word alignments: " << m_rawAlignments.size() << endl;
    exit(1);
  }

  cerr << "Building frequent word cache...\n";
  CacheFreqWords();

  wordID_t const* s = &(*m_srcCorpus)[0];
  wordID_t const* t = &(*m_trgCorpus)[0];
  for (size_t sid = 0; sid < m_srcSntBreaks.size(); ++sid) {
    wordID_t const* se = s + GetSourceSentenceSize(sid);
    wordID_t const* te = t + GetTargetSentenceSize(sid);
    vector<short> const& a = m_rawAlignments[sid];
    m_wrd_cooc.Count(vector<wordID_t>(s,se),
                     vector<wordID_t>(t,te), a,
                     m_srcVocab->GetkOOVWordID(),
                     m_trgVocab->GetkOOVWordID());
    s = se;
    t = te;
  }
  return true;
}
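// The alignment file has one line per sentence pair, each line holding
// Moses-style alignment points. For example, the line "0-0 1-2 2-1" is
// split by splitToInt() on the delimiter set "- " into the flat integer
// sequence {0, 0, 1, 2, 2, 1}, i.e. consecutive (source, target) pairs.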
int
BilingualDynSuffixArray::
LoadRawAlignments(InputFileStream& align)
{
  // stores the alignments in the raw file format
  string line;
  // vector<int> vtmp;
  // int lineNum = 0;
  while(getline(align, line)) {
    // if (++lineNum % 10000 == 0) cerr << lineNum << endl;
    LoadRawAlignments(line);
  }
  return m_rawAlignments.size();
}

int
BilingualDynSuffixArray::
LoadRawAlignments(string& align)
{
  // stores one sentence's alignments in the raw format
  vector<int> vtmp;
  Utils::splitToInt(align, vtmp, "- ");
  CHECK(vtmp.size() % 2 == 0);
  vector<short> vAlgn; // store as short ints to save memory
  for (vector<int>::const_iterator itr = vtmp.begin();
       itr != vtmp.end(); ++itr) {
    vAlgn.push_back(short(*itr));
  }
  m_rawAlignments.push_back(vAlgn);
  return m_rawAlignments.size();
}
// int
// BilingualDynSuffixArray::
// LoadAlignments(InputFileStream& align)
// {
//   string line;
//   vector<int> vtmp;
//   int sntIndex = 0;
//
//   while(getline(align, line))
//     {
//       Utils::splitToInt(line, vtmp, "- ");
//       CHECK(vtmp.size() % 2 == 0);
//
//       int sourceSize = GetSourceSentenceSize(sntIndex);
//       int targetSize = GetTargetSentenceSize(sntIndex);
//
//       SentenceAlignment curSnt(sntIndex, sourceSize, targetSize); // initialize empty sentence
//       for(int i=0; i < (int)vtmp.size(); i+=2)
//         {
//           int sourcePos = vtmp[i];
//           int targetPos = vtmp[i+1];
//           CHECK(sourcePos < sourceSize);
//           CHECK(targetPos < targetSize);
//
//           curSnt.alignedList[sourcePos].push_back(targetPos); // list of target nodes for each source word
//           curSnt.numberAligned[targetPos]++; // cnt of how many source words connect to this target word
//         }
//       curSnt.srcSnt = m_srcCorpus + sntIndex; // point source and target sentence
//       curSnt.trgSnt = m_trgCorpus + sntIndex;
//       m_alignments.push_back(curSnt);
//
//       sntIndex++;
//     }
//   return m_alignments.size();
// }
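// Rebuild a SentenceAlignment object for one sentence pair from the raw
// alignment points; with trg2Src set, source and target switch roles so
// that extraction can run in either direction.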
SentenceAlignment
BilingualDynSuffixArray::
GetSentenceAlignment(const int sntIndex, bool trg2Src) const
{
  // retrieves the alignments in the format used by SentenceAlignment.Extract()
  int t = GetTargetSentenceSize(sntIndex);
  int s = GetSourceSentenceSize(sntIndex);
  int sntGiven = trg2Src ? t : s;
  int sntExtract = trg2Src ? s : t;
  SentenceAlignment curSnt(sntIndex, sntGiven, sntExtract); // initialize empty sentence
  vector<short> const& a = m_rawAlignments.at(sntIndex);
  for(size_t i=0; i < a.size(); i+=2) {
    int sourcePos = a[i];
    int targetPos = a[i+1];
    if(trg2Src) {
      curSnt.alignedList[targetPos].push_back(sourcePos); // list of source words for each target word
      curSnt.numberAligned[sourcePos]++; // count of target words linked to this source word
    } else {
      curSnt.alignedList[sourcePos].push_back(targetPos); // list of target words for each source word
      curSnt.numberAligned[targetPos]++; // count of source words linked to this target word
    }
  }
  curSnt.srcSnt = m_srcCorpus + sntIndex; // point to the source and target sentences
  curSnt.trgSnt = m_trgCorpus + sntIndex;

  return curSnt;
}
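// wordIndex is a corpus-wide position of the rightmost source word of
// the match; the span is converted to sentence-local indices before
// handing off to SentenceAlignment::Extract().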
bool
BilingualDynSuffixArray::
ExtractPhrases(const int& sntIndex,
               const int& wordIndex,
               const int& sourceSize,
               vector<PhrasePair*>& phrasePairs,
               bool trg2Src) const
{
  /* ExtractPhrases() can extract the matching phrases for both directions
   * by using the trg2Src parameter */
  SentenceAlignment curSnt = GetSentenceAlignment(sntIndex, trg2Src);
  // get span of phrase in source sentence
  int beginSentence = m_srcSntBreaks[sntIndex];
  int rightIdx = wordIndex - beginSentence;
  int leftIdx = rightIdx - sourceSize + 1;
  // extract all phrase alignments in the sentence
  return curSnt.Extract(m_maxPhraseLength, phrasePairs, leftIdx, rightIdx);
}

void
BilingualDynSuffixArray::
CleanUp(const InputType& source)
{
  //m_wordPairCache.clear();
}
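// Read one side of the training corpus: each surface word is mapped to a
// vocab ID and appended to cArray, while sntArray records the offset at
// which each sentence starts. Returns the corpus size in words.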
int
BilingualDynSuffixArray::
LoadCorpus(FactorDirection direction,
           InputFileStream & corpus,
           const FactorList & factors,
           vector<wordID_t> & cArray,
           vector<wordID_t> & sntArray,
           Vocab* vocab)
{
  string line, word;
  int sntIdx(0);
  // corpus.seekg(0); Seems needless -> commented out to allow
  // loading of gzipped corpora (gzfilebuf doesn't support seeking).
  const string& factorDelimiter = StaticData::Instance().GetFactorDelimiter();
  while(getline(corpus, line)) {
    sntArray.push_back(sntIdx);
    Phrase phrase(ARRAY_SIZE_INCR);
    // parse phrase
    phrase.CreateFromString(direction, factors, line, factorDelimiter, NULL);
    // store words in vocabulary and corpus
    for(size_t i = 0; i < phrase.GetSize(); ++i) {
      cArray.push_back(vocab->GetWordID(phrase.GetWord(i)));
    }
    sntIdx += phrase.GetSize();
  }
  //cArray.push_back(vocab->GetkOOVWordID); // signify end of corpus
  vocab->MakeClosed(); // avoid adding words
  return cArray.size();
}
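// Map the words of a Moses source phrase to suffix-array vocab IDs,
// failing as soon as any word is out of vocabulary.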
bool
BilingualDynSuffixArray::
GetLocalVocabIDs(const Phrase& src, SAPhrase &output) const
{
  // looks up the SA vocab ids for the current src phrase
  size_t phraseSize = src.GetSize();
  for (size_t pos = 0; pos < phraseSize; ++pos) {
    const Word &word = src.GetWord(pos);
    wordID_t arrayId = m_srcVocab->GetWordID(word);
    if (arrayId == m_srcVocab->GetkOOVWordID()) {
      // oov
      return false;
    } else {
      output.SetId(pos, arrayId);
    }
  }
  return true;
}
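// Compute lexical weights for a phrase pair in both directions, in the
// style familiar from phrase-based SMT: each word contributes the average
// link probability over its alignment points,
//
//   lex = prod_i ( sum_{j in a(i)} p(w_i, w_j) / |a(i)| ),
//
// and unaligned words are scored against the NULL word, represented here
// by the OOV ID.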
pair<float, float>
BilingualDynSuffixArray::
GetLexicalWeight(const PhrasePair& pp) const
{
  // sp,tp: sum of link probabilities
  // sc,tc: count of links
  int src_size = pp.GetSourceSize();
  int trg_size = pp.GetTargetSize();
  vector<float> sp(src_size, 0), tp(trg_size, 0);
  vector<int> sc(src_size,0), tc(trg_size,0);
  wordID_t const* sw = &(m_srcCorpus->at(m_srcSntBreaks.at(pp.m_sntIndex)));
  wordID_t const* tw = &(m_trgCorpus->at(m_trgSntBreaks.at(pp.m_sntIndex)));
  vector<short> const & a = m_rawAlignments.at(pp.m_sntIndex);
  for (size_t i = 0; i < a.size(); i += 2) {
    int s = a[i], t = a.at(i+1), sx, tx;
    // sx, tx: local positions within the phrase pair

    if (s < pp.m_startSource || t < pp.m_startTarget) continue;
    if ((sx = s - pp.m_startSource) >= src_size) continue;
    if ((tx = t - pp.m_startTarget) >= trg_size) continue;
    sp[sx] += m_wrd_cooc.pbwd(sw[s],tw[t]);
    tp[tx] += m_wrd_cooc.pfwd(sw[s],tw[t]);
    ++sc[sx];
    ++tc[tx];
  }
  pair<float,float> ret(1,1);
  wordID_t null_trg = m_trgVocab->GetkOOVWordID();
  wordID_t null_src = m_srcVocab->GetkOOVWordID();
  for (size_t i = 0, k = pp.m_startSource; i < sp.size(); ++i, ++k) {
    if (sc[i]) ret.first *= sp[i]/sc[i];
    else ret.first *= m_wrd_cooc.pbwd(sw[k], null_trg);
  }
  for (size_t i = 0, k = pp.m_startTarget; i < tp.size(); ++i, ++k) {
    if (tc[i]) ret.second *= tp[i]/tc[i];
    else ret.second *= m_wrd_cooc.pfwd(null_src,tw[k]);
  }
  return ret;

  // //return pair<float, float>(1, 1);
  // float srcLexWeight(1.0), trgLexWeight(1.0);
  // map<pair<wordID_t, wordID_t>, float> targetProbs;
  // // collects sum of target probs given source words
  //
  // //const SentenceAlignment& alignment = m_alignments[phrasepair.m_sntIndex];
  // const SentenceAlignment alignment = GetSentenceAlignment(phrasepair.m_sntIndex);
  // map<pair<wordID_t, wordID_t>, pair<float, float> >::const_iterator itrCache;
  // // for each source word
  // for(int srcIdx = phrasepair.m_startSource; srcIdx <= phrasepair.m_endSource; ++srcIdx) {
  //   float srcSumPairProbs(0);
  //   wordID_t srcWord = m_srcCorpus->at(srcIdx + m_srcSntBreaks[phrasepair.m_sntIndex]); // localIDs
  //   const vector<int>& srcWordAlignments = alignment.alignedList.at(srcIdx);
  //   // for each target word aligned to this source word in this alignment
  //   if(srcWordAlignments.size() == 0) { // get p(NULL|src)
  //     pair<wordID_t, wordID_t> wordpair = make_pair(srcWord, m_srcVocab->GetkOOVWordID());
  //     itrCache = m_wordPairCache.find(wordpair);
  //     if(itrCache == m_wordPairCache.end()) { // if not in cache
  //       CacheWordProbs(srcWord);
  //       itrCache = m_wordPairCache.find(wordpair); // search cache again
  //     }
  //     CHECK(itrCache != m_wordPairCache.end());
  //     srcSumPairProbs += itrCache->second.first;
  //     targetProbs[wordpair] = itrCache->second.second;
  //   }
  //   else { // extract p(trg|src)
  //     for(size_t i = 0; i < srcWordAlignments.size(); ++i) { // for each aligned word
  //       int trgIdx = srcWordAlignments[i];
  //       wordID_t trgWord = m_trgCorpus->at(trgIdx + m_trgSntBreaks[phrasepair.m_sntIndex]);
  //       // get probability of this source->target word pair
  //       pair<wordID_t, wordID_t> wordpair = make_pair(srcWord, trgWord);
  //       itrCache = m_wordPairCache.find(wordpair);
  //       if(itrCache == m_wordPairCache.end()) { // if not in cache
  //         CacheWordProbs(srcWord);
  //         itrCache = m_wordPairCache.find(wordpair); // search cache again
  //       }
  //       CHECK(itrCache != m_wordPairCache.end());
  //       srcSumPairProbs += itrCache->second.first;
  //       targetProbs[wordpair] = itrCache->second.second;
  //     }
  //   }
  //   float srcNormalizer = srcWordAlignments.size() < 2 ? 1.0 : 1.0 / float(srcWordAlignments.size());
  //   srcLexWeight *= (srcNormalizer * srcSumPairProbs);
  // } // end for each source word
  //
  // for(int trgIdx = phrasepair.m_startTarget; trgIdx <= phrasepair.m_endTarget; ++trgIdx) {
  //   float trgSumPairProbs(0);
  //   wordID_t trgWord = m_trgCorpus->at(trgIdx + m_trgSntBreaks[phrasepair.m_sntIndex]);
  //   for (map<pair<wordID_t, wordID_t>, float>::const_iterator trgItr
  //        = targetProbs.begin(); trgItr != targetProbs.end(); ++trgItr) {
  //     if(trgItr->first.second == trgWord)
  //       trgSumPairProbs += trgItr->second;
  //   }
  //   if(trgSumPairProbs == 0) continue; // currently don't store target-side SA
  //   int noAligned = alignment.numberAligned.at(trgIdx);
  //   float trgNormalizer = noAligned < 2 ? 1.0 : 1.0 / float(noAligned);
  //   trgLexWeight *= (trgNormalizer * trgSumPairProbs);
  // }
  //
  // // TODO::Need to get p(NULL|trg)
  //
  // return pair<float, float>(srcLexWeight, trgLexWeight);
}
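// Precompute translation probabilities for frequent source words (at
// least 1000 occurrences; the 50 most frequent qualify) and store them in
// m_wordPairCache. Note that the active GetLexicalWeight() above works
// from m_wrd_cooc, so this cache chiefly serves the older, commented-out
// scoring code.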
void
BilingualDynSuffixArray::
CacheFreqWords() const
{
  multimap<int, wordID_t> wordCnts;
  // for each source word in the vocab
  Vocab::Word2Id::const_iterator it;
  for(it = m_srcVocab->VocabStart(); it != m_srcVocab->VocabEnd(); ++it) {
    // get its frequency
    wordID_t srcWord = it->second;
    vector<wordID_t> sword(1, srcWord), wrdIndices;
    m_srcSA->GetCorpusIndex(&sword, &wrdIndices);
    if(wrdIndices.size() >= 1000) { // min count
      wordCnts.insert(make_pair(wrdIndices.size(), srcWord));
    }
  }
  int numSoFar(0);
  multimap<int, wordID_t>::reverse_iterator ritr;
  for(ritr = wordCnts.rbegin(); ritr != wordCnts.rend(); ++ritr) {
    m_freqWordsCached.insert(ritr->second);
    CacheWordProbs(ritr->second);
    if(++numSoFar == 50) break; // get top counts
  }
  cerr << "\tCached " << m_freqWordsCached.size() << " source words\n";
}
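// For a single source word, visit every occurrence in the corpus, count
// the target words it is aligned to (using the OOV ID as NULL when it is
// unaligned), and cache the resulting relative frequencies.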
void
BilingualDynSuffixArray::
CacheWordProbs(wordID_t srcWord) const
{
  map<wordID_t, int> counts;
  vector<wordID_t> sword(1, srcWord), wrdIndices;
  bool ret = m_srcSA->GetCorpusIndex(&sword, &wrdIndices);
  CHECK(ret);
  vector<int> sntIndexes = GetSntIndexes(wrdIndices, 1, m_srcSntBreaks);
  float denom(0);
  // for each occurrence of this word
  for(size_t snt = 0; snt < sntIndexes.size(); ++snt) {
    int sntIdx = sntIndexes.at(snt); // get corpus index for the sentence
    CHECK(sntIdx != -1);
    int srcWrdSntIdx = wrdIndices.at(snt) - m_srcSntBreaks.at(sntIdx); // get word index in the sentence
    // list of target words aligned to this source word
    const vector<int> srcAlg = GetSentenceAlignment(sntIdx).alignedList.at(srcWrdSntIdx);
    if(srcAlg.size() == 0) {
      ++counts[m_srcVocab->GetkOOVWordID()]; // if not aligned, align to the NULL word
      ++denom;
    } else { // get target words aligned to srcWord in this sentence
      for(size_t i=0; i < srcAlg.size(); ++i) {
        wordID_t trgWord = m_trgCorpus->at(srcAlg[i] + m_trgSntBreaks[sntIdx]);
        ++counts[trgWord];
        ++denom;
      }
    }
  }
  // now that we have counts of all target words aligned to this source word,
  // compute the probabilities and cache all pairs
  for(map<wordID_t, int>::const_iterator itrCnt = counts.begin();
      itrCnt != counts.end(); ++itrCnt) {
    pair<wordID_t, wordID_t> wordPair = make_pair(srcWord, itrCnt->first);
    float srcTrgPrb = float(itrCnt->second) / float(denom); // gives p(src->trg)
    float trgSrcPrb = float(itrCnt->second) / float(counts.size()); // gives p(trg->src)
    m_wordPairCache[wordPair] = pair<float, float>(srcTrgPrb, trgSrcPrb);
  }
}
SAPhrase
BilingualDynSuffixArray::
TrgPhraseFromSntIdx(const PhrasePair& phrasepair) const
{
  // takes sentence indexes and looks up vocab IDs
  SAPhrase phraseIds(phrasepair.GetTargetSize());
  int sntIndex = phrasepair.m_sntIndex;
  int id(-1), pos(0);
  for(int i=phrasepair.m_startTarget; i <= phrasepair.m_endTarget; ++i) { // look up trg words
    id = m_trgCorpus->at(m_trgSntBreaks[sntIndex] + i);
    phraseIds.SetId(pos++, id);
  }
  return phraseIds;
}

TargetPhrase*
BilingualDynSuffixArray::
GetMosesFactorIDs(const SAPhrase& phrase, const Phrase& sourcePhrase) const
{
  TargetPhrase* targetPhrase = new TargetPhrase();
  for(size_t i=0; i < phrase.words.size(); ++i) { // look up trg words
    Word& word = m_trgVocab->GetWord(phrase.words[i]);
    CHECK(word != m_trgVocab->GetkOOVWord());
    targetPhrase->AddWord(word);
  }
  targetPhrase->SetSourcePhrase(sourcePhrase);
  // scoring
  return targetPhrase;
}
// Gather translation candidates for the source phrase /src/ and store raw
// phrase pair statistics in /pstats/. Returns the sample rate
// (number of samples considered / total number of hits) and the total
// number of phrase pairs.
pair<float,float>
BilingualDynSuffixArray::
GatherCands(Phrase const& src, map<SAPhrase, vector<float> >& pstats) const
{
  typedef map<SAPhrase, vector<float> >::iterator pstat_iter;
  typedef map<SAPhrase, vector<float> >::value_type pstat_entry;
  pair<float,float> ret(0,0);
  float& sampleRate = ret.first;
  float& totalPhrases = ret.second;
  size_t srcSize = src.GetSize();
  SAPhrase localIDs(srcSize);
  vector<unsigned> wrdIndices;
  if(!GetLocalVocabIDs(src, localIDs) ||
     !m_srcSA->GetCorpusIndex(&(localIDs.words), &wrdIndices))
    return ret; // source phrase contains OOVs

  // select a sample of the occurrences for phrase extraction
  size_t m1 = wrdIndices.size();
  SampleSelection(wrdIndices); // careful! SampleSelection alters wrdIndices!
  sampleRate = float(wrdIndices.size())/m1;

  // determine the sentences in which these phrases occur
  vector<int> sntIndices = GetSntIndexes(wrdIndices, srcSize, m_srcSntBreaks);
  for(size_t s = 0; s < sntIndices.size(); ++s) {
    int sntStart = sntIndices.at(s);
    if(sntStart == -1) continue; // marked as bad by GetSntIndexes()
    vector<PhrasePair*> phrasePairs;
    ExtractPhrases(sntStart, wrdIndices[s], srcSize, phrasePairs);
    totalPhrases += phrasePairs.size();
    vector<PhrasePair*>::iterator p;
    for (p = phrasePairs.begin(); p != phrasePairs.end(); ++p) {
      assert(*p);
      pair<float, float> lex = GetLexicalWeight(**p);
      pstat_entry entry(TrgPhraseFromSntIdx(**p), Scores(5));
      pair<pstat_iter, bool> foo = pstats.insert(entry);
      Scores& feats = foo.first->second;
      if (foo.second) { // new target phrase
        feats[0] = 1; // count
        feats[1] = lex.first;
        feats[3] = lex.second;
      } else { // seen before: bump the count, keep the best lexical scores
        feats[0] += 1;
        feats[1] = max(feats[1], lex.first);
        feats[3] = max(feats[3], lex.second);
      }
      delete *p;
    }
  } // done with all sentences
  BOOST_FOREACH(pstat_entry & e, pstats) {
    Scores& feats = e.second;
    // 0: bwd phrase prob
    // 1: lex 1
    // 2: fwd phrase prob
    // 3: lex 2
    // 4: phrase penalty
    float x = m_trgSA->GetCount(e.first.words) - feats[0] * sampleRate;
    feats[4] = 1;
    feats[3] = log(feats[3]);
    feats[2] = log(feats[0]) - log(totalPhrases);
    feats[1] = log(feats[1]);
    feats[0] = log(feats[0]) - log(feats[0] + x);
  }
  return ret;
}
// void
// BilingualDynSuffixArray::
// GetTargetPhrasesByLexicalWeight
// (const Phrase& src, vector< pair<Scores, TargetPhrase*> > & target)
//   const
// {
//   size_t sourceSize = src.GetSize();
//   SAPhrase localIDs(sourceSize);
//   if(!GetLocalVocabIDs(src, localIDs))
//     return; // source phrase contains OOVs
//
//   float totalTrgPhrases(0);
//   map<SAPhrase, int> phraseCounts;
//   map<SAPhrase, pair<float, float> > lexicalWeights;
//   map<SAPhrase, pair<float, float> >::iterator itrLexW;
//
//   // find all occurrences of the phrase in the corpus;
//   // wrdIndices stores the rightmost position
//   vector<unsigned> wrdIndices;
//   if(!m_srcSA->GetCorpusIndex(&(localIDs.words), &wrdIndices))
//     return; // none found
//
//   // select a sample of the occurrences for phrase extraction
//   size_t m1 = wrdIndices.size();
//   SampleSelection(wrdIndices);
//   float sampleRate = float(wrdIndices.size())/m1;
//
//   // determine the sentences in which these phrases occur
//   vector<int> sntIndexes = GetSntIndexes(wrdIndices, sourceSize, m_srcSntBreaks);
//
//   // for each sentence with this phrase
//   for(size_t s = 0; s < sntIndexes.size(); ++s)
//     {
//       vector<PhrasePair*> phrasePairs;
//       int sntIndex = sntIndexes.at(s);
//       if(sntIndex == -1) continue; // bad flag set by GetSntIndexes()
//       ExtractPhrases(sntIndex, wrdIndices[s], sourceSize, phrasePairs);
//       totalTrgPhrases += phrasePairs.size();
//       vector<PhrasePair*>::iterator p;
//       for (p = phrasePairs.begin(); p != phrasePairs.end(); ++p)
//         {
//           PhrasePair* px = *p;
//           assert(px);
//           SAPhrase phrase = TrgPhraseFromSntIdx(*px);
//           phraseCounts[phrase]++; // count each unique phrase
//           // NOTE::Correct but slow to extract lexical weight here. could do
//           // it later for only the top phrases chosen by phrase prob p(e|f)
//           pair<float, float> lexWeight = GetLexicalWeight(**p);
//           itrLexW = lexicalWeights.find(phrase);
//           if((itrLexW != lexicalWeights.end()) &&
//              (itrLexW->second.first < lexWeight.first))
//             itrLexW->second = lexWeight; // if this lex weight is greater save it
//           else lexicalWeights[phrase] = lexWeight; // else save
//         }
//       // done with sentence. delete SA phrase pairs
//       BOOST_FOREACH(PhrasePair* p, phrasePairs) delete p;
//     } // done with all sentences
//
//   cerr << "Done extracting ... " << endl;
//
//   // convert to moses phrase pairs
//   map<SAPhrase, int>::iterator pcnt;
//   BetterPhrase better(*m_scoreCmp);
//   NBestList<pair<Scores,SAPhrase const*>,BetterPhrase> nbest(m_maxPTEntries,better);
//   for(pcnt = phraseCounts.begin(); pcnt != phraseCounts.end(); ++pcnt)
//     {
//       float tmarginal = (m_trgSA->GetCount(pcnt->first.words) * sampleRate);
//       float pfwd = pcnt->second / totalTrgPhrases;
//       float pbwd = pcnt->second / tmarginal;
//       pair<float, float> lexWeight = lexicalWeights[pcnt->first];
//       pair<Scores, SAPhrase const*> entry;
//       entry.first.resize(5);
//       entry.first[0] = pbwd;
//       entry.first[1] = itrLexW->second.first;
//       entry.first[2] = pfwd;
//       entry.first[3] = itrLexW->second.second;
//       entry.first[4] = 2.718; // exp(1);
//       entry.second = &pcnt->first;
//       nbest.add(entry);
//     }
//
//   // return top scoring phrases
//   for (size_t n = 0; n < nbest.size(); ++n)
//     {
//       pair<Scores, SAPhrase const*> e = nbest[n];
//       target.push_back(make_pair(e.first,GetMosesFactorIDs(*e.second, src)));
//     }
// }
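// Map corpus positions returned by the suffix array to sentence indices
// by binary search over the sentence-break offsets; occurrences whose
// source span would cross a sentence boundary are flagged with -1.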
vector<int>
BilingualDynSuffixArray::
GetSntIndexes(vector<unsigned>& wrdIndices,
              const int sourceSize,
              const vector<unsigned>& sntBreaks) const
{
  vector<unsigned>::const_iterator vit;
  vector<int> sntIndices;
  for(size_t i=0; i < wrdIndices.size(); ++i) {
    vit = upper_bound(sntBreaks.begin(), sntBreaks.end(), wrdIndices[i]);
    int index = int(vit - sntBreaks.begin()) - 1;
    // check for phrases that cross sentence boundaries
    if(wrdIndices[i] - sourceSize + 1 < sntBreaks.at(index))
      sntIndices.push_back(-1); // set bad flag
    else
      sntIndices.push_back(index); // store the index of the sentence in the corpus
  }
  return sntIndices;
}
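// Thin the occurrence list to at most 'sampleSize' entries in place;
// randomSample() (from moses/generic/sampling) chooses which index
// positions survive.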
int
BilingualDynSuffixArray::
SampleSelection(vector<unsigned>& sample, int sampleSize) const
{
  // keep only a sample of at most 'sampleSize' occurrences
  vector<unsigned> s;
  randomSample<unsigned>(s, sampleSize, sample.size());
  for (size_t i = 0; i < s.size(); ++i)
    s[i] = sample[s[i]];
  sample.swap(s);
  return sample.size();
}
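// Append a new sentence pair to the corpora, vocabularies, suffix arrays,
// raw alignments, and co-occurrence counts. This is the operation that
// makes the suffix arrays "dynamic": the model grows at runtime without a
// reload.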
void
BilingualDynSuffixArray::
addSntPair(string& source, string& target, string& alignment)
{
  vuint_t srcFactor, trgFactor;
  cerr << "source, target, alignment = " << source << ", "
       << target << ", " << alignment << endl;
  const string& factorDelimiter = StaticData::Instance().GetFactorDelimiter();
  const unsigned oldSrcCrpSize = m_srcCorpus->size(), oldTrgCrpSize = m_trgCorpus->size();
  cerr << "old source corpus size = " << oldSrcCrpSize << "\told target size = " << oldTrgCrpSize << endl;
  Phrase sphrase(ARRAY_SIZE_INCR);
  sphrase.CreateFromString(Input, m_inputFactors, source, factorDelimiter, NULL);
  m_srcVocab->MakeOpen();
  vector<wordID_t> sIDs(sphrase.GetSize());
  // store words in the vocabulary and the corpus
  for(int i = sphrase.GetSize()-1; i >= 0; --i) {
    sIDs[i] = m_srcVocab->GetWordID(sphrase.GetWord(i)); // get vocab ids backwards
  }
  for(size_t i = 0; i < sphrase.GetSize(); ++i) {
    srcFactor.push_back(sIDs[i]);
    cerr << "srcFactor[" << (srcFactor.size() - 1) << "] = " << srcFactor.back() << endl;
    m_srcCorpus->push_back(srcFactor.back()); // add word to the corpus
  }
  m_srcSntBreaks.push_back(oldSrcCrpSize); // former end of corpus is the index of the new sentence
  m_srcVocab->MakeClosed();

  Phrase tphrase(ARRAY_SIZE_INCR);
  tphrase.CreateFromString(Output, m_outputFactors, target, factorDelimiter, NULL);
  m_trgVocab->MakeOpen();
  vector<wordID_t> tIDs(tphrase.GetSize());
  for(int i = tphrase.GetSize()-1; i >= 0; --i) {
    tIDs[i] = m_trgVocab->GetWordID(tphrase.GetWord(i)); // get vocab ids backwards
  }
  for(size_t i = 0; i < tphrase.GetSize(); ++i) {
    trgFactor.push_back(tIDs[i]);
    cerr << "trgFactor[" << (trgFactor.size() - 1) << "] = " << trgFactor.back() << endl;
    m_trgCorpus->push_back(trgFactor.back());
  }
  m_trgSntBreaks.push_back(oldTrgCrpSize);
  m_srcSA->Insert(&srcFactor, oldSrcCrpSize);
  m_trgSA->Insert(&trgFactor, oldTrgCrpSize);
  LoadRawAlignments(alignment);
  m_trgVocab->MakeClosed();

  m_wrd_cooc.Count(sIDs, tIDs, m_rawAlignments.back(),
                   m_srcVocab->GetkOOVWordID(),
                   m_trgVocab->GetkOOVWordID());

  //for(size_t i=0; i < sphrase.GetSize(); ++i)
  //  ClearWordInCache(sIDs[i]);
}
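// Invalidate the cached word-pair probabilities for srcWord, unless it is
// one of the frequent words that stay cached permanently.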
void
BilingualDynSuffixArray::
ClearWordInCache(wordID_t srcWord)
{
  if(m_freqWordsCached.find(srcWord) != m_freqWordsCached.end())
    return;
  // erase every cached pair whose source side is srcWord; erasing through
  // a post-incremented iterator keeps the iterator valid
  map<pair<wordID_t, wordID_t>, pair<float, float> >::iterator it
  = m_wordPairCache.begin();
  while(it != m_wordPairCache.end()) {
    if(it->first.first == srcWord)
      m_wordPairCache.erase(it++);
    else
      ++it;
  }
}
SentenceAlignment::
SentenceAlignment(int sntIndex, int sourceSize, int targetSize)
  : m_sntIndex(sntIndex)
  , numberAligned(targetSize, 0)
  , alignedList(sourceSize)
{
  // What is the code below supposed to accomplish??? UG.
  // for(int i=0; i < sourceSize; ++i) {
  //   vector<int> trgWrd;
  //   alignedList[i] = trgWrd;
  // }
}
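// Classic consistent-phrase extraction for one source span: project the
// span onto the target side, reject it if any target word in the
// projection also links outside the span, then let the target span grow
// over unaligned words in both directions, emitting one PhrasePair per
// consistent target span.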
bool
SentenceAlignment::
Extract(int maxPhraseLength, vector<PhrasePair*> &ret, int startSource, int endSource) const
{
  // foreign = target, F=T
  // english = source, E=S
  int countTarget = numberAligned.size();

  int minTarget = 9999;
  int maxTarget = -1;
  vector<int> usedTarget = numberAligned;
  for(int sourcePos = startSource; sourcePos <= endSource; sourcePos++) {
    for(int ind=0; ind < (int)alignedList[sourcePos].size(); ind++) {
      int targetPos = alignedList[sourcePos][ind];
      // cout << "point (" << targetPos << ", " << sourcePos << ")\n";
      if (targetPos < minTarget) {
        minTarget = targetPos;
      }
      if (targetPos > maxTarget) {
        maxTarget = targetPos;
      }
      usedTarget[targetPos]--;
    } // for(int ind=0; ...
  } // for(int sourcePos=startSource; ...

  // cout << "f projected ( " << minTarget << "-" << maxTarget << ", " << startSource << "," << endSource << ")\n";

  if (maxTarget >= 0 && // aligned to any target words at all
      maxTarget - minTarget < maxPhraseLength) {
    // target phrase within length limits

    // check if any target word is aligned to a source word outside the span
    bool out_of_bounds = false;
    for(int targetPos=minTarget; targetPos <= maxTarget && !out_of_bounds; targetPos++) {
      if (usedTarget[targetPos] > 0) {
        // cout << "out of bounds: " << targetPos << "\n";
        out_of_bounds = true;
      }
    }

    // cout << "doing if for ( " << minTarget << "-" << maxTarget << ", " << startSource << "," << endSource << ")\n";
    if (!out_of_bounds) {
      // start point of the target phrase may retreat over unaligned words
      for(int startTarget = minTarget;
          (startTarget >= 0 &&
           startTarget > maxTarget - maxPhraseLength && // within length limit
           (startTarget == minTarget || numberAligned[startTarget] == 0)); // unaligned
          startTarget--) {
        // end point of the target phrase may advance over unaligned words
        for (int endTarget = maxTarget;
             (endTarget < countTarget &&
              endTarget < startTarget + maxPhraseLength && // within length limit
              (endTarget == maxTarget || numberAligned[endTarget] == 0)); // unaligned
             endTarget++) {
          PhrasePair *phrasePair = new PhrasePair(startTarget, endTarget, startSource, endSource, m_sntIndex);
          ret.push_back(phrasePair);
        } // for (int endTarget=maxTarget; ...
      } // for(int startTarget=minTarget; ...
    } // if (!out_of_bounds)
  } // if (maxTarget >= 0 && ...
  return (ret.size() > 0);
}
int
BilingualDynSuffixArray::
GetSourceSentenceSize(size_t sentenceId) const
{
  return (sentenceId == m_srcSntBreaks.size()-1) ?
         m_srcCorpus->size() - m_srcSntBreaks.at(sentenceId) :
         m_srcSntBreaks.at(sentenceId+1) - m_srcSntBreaks.at(sentenceId);
}

int
BilingualDynSuffixArray::
GetTargetSentenceSize(size_t sentenceId) const
{
  return (sentenceId == m_trgSntBreaks.size()-1) ?
         m_trgCorpus->size() - m_trgSntBreaks.at(sentenceId) :
         m_trgSntBreaks.at(sentenceId+1) - m_trgSntBreaks.at(sentenceId);
}
BetterPhrase::
BetterPhrase(ScoresComp const& sc)
  : cmp(sc) {}

// bool
// BetterPhrase::
// operator()(pair<Scores, TargetPhrase const*> const& a,
//            pair<Scores, TargetPhrase const*> const& b) const
// {
//   return cmp(b.first, a.first);
// }

bool
BetterPhrase::
operator()(pair<Scores, SAPhrase const*> const& a,
           pair<Scores, SAPhrase const*> const& b) const
{
  return cmp(b.first, a.first);
}

} // end namespace Moses