Merge branch 'nadir_osm' of github.com:moses-smt/mosesdecoder into nadir_osm

This commit is contained in:
Nadir Durrani 2013-07-01 11:07:21 +01:00
commit ba72c70c6e
25 changed files with 1146 additions and 435 deletions

View File

@ -1571,6 +1571,16 @@
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
<name>TranslationModel/WordCoocTable.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/WordCoocTable.cpp</locationURI>
</link>
<link>
<name>TranslationModel/WordCoocTable.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/WordCoocTable.h</locationURI>
</link>
<link>
<name>TranslationModel/fuzzy-match</name>
<type>2</type>

View File

@ -22,7 +22,8 @@ struct DistortionState_traditional : public FFState {
};
DistortionScoreProducer::DistortionScoreProducer(const std::string &line)
: StatefulFeatureFunction("Distortion", 1, line) {
: StatefulFeatureFunction("Distortion", 1, line)
{
ReadParameters();
}

View File

@ -102,8 +102,8 @@ void FeatureFunction::SetParameter(const std::string& key, const std::string& va
void FeatureFunction::ReadParameters()
{
while (!m_args.empty()) {
const vector<string> &args = m_args[0];
SetParameter(args[0], args[1]);
const vector<string> &args = m_args[0];
SetParameter(args[0], args[1]);
m_args.erase(m_args.begin());
}

View File

@ -5,14 +5,15 @@
namespace Moses
{
PhrasePenalty::PhrasePenalty(const std::string &line)
: StatelessFeatureFunction("PhrasePenalty",1, line) {
: StatelessFeatureFunction("PhrasePenalty",1, line)
{
ReadParameters();
}
void PhrasePenalty::Evaluate(const Phrase &source
, const TargetPhrase &targetPhrase
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection &estimatedFutureScore) const
, const TargetPhrase &targetPhrase
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection &estimatedFutureScore) const
{
scoreBreakdown.Assign(this, 1.0f);
}

View File

@ -11,13 +11,13 @@ public:
PhrasePenalty(const std::string &line);
bool IsUseable(const FactorMask &mask) const {
return true;
return true;
}
virtual void Evaluate(const Phrase &source
, const TargetPhrase &targetPhrase
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection &estimatedFutureScore) const;
, const TargetPhrase &targetPhrase
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection &estimatedFutureScore) const;
};
} //namespace

View File

@ -7,7 +7,8 @@ using namespace std;
namespace Moses
{
UnknownWordPenaltyProducer::UnknownWordPenaltyProducer(const std::string &line)
: StatelessFeatureFunction("UnknownWordPenalty",1, line) {
: StatelessFeatureFunction("UnknownWordPenalty",1, line)
{
m_tuneable = false;
ReadParameters();
}

View File

@ -7,7 +7,8 @@ using namespace std;
namespace Moses
{
WordPenaltyProducer::WordPenaltyProducer(const std::string &line)
: StatelessFeatureFunction("WordPenalty",1, line) {
: StatelessFeatureFunction("WordPenalty",1, line)
{
ReadParameters();
}

View File

@ -275,13 +275,15 @@ bool Parameter::LoadParam(int argc, char* argv[])
}
// overwrite parameters with values from switches
for(PARAM_STRING::const_iterator iterParam = m_description.begin(); iterParam != m_description.end(); iterParam++) {
for(PARAM_STRING::const_iterator iterParam = m_description.begin();
iterParam != m_description.end(); iterParam++) {
const string paramName = iterParam->first;
OverwriteParam("-" + paramName, paramName, argc, argv);
}
// ... also shortcuts
for(PARAM_STRING::const_iterator iterParam = m_abbreviation.begin(); iterParam != m_abbreviation.end(); iterParam++) {
for(PARAM_STRING::const_iterator iterParam = m_abbreviation.begin();
iterParam != m_abbreviation.end(); iterParam++) {
const string paramName = iterParam->first;
const string paramShortName = iterParam->second;
OverwriteParam("-" + paramShortName, paramName, argc, argv);
@ -294,7 +296,8 @@ bool Parameter::LoadParam(int argc, char* argv[])
verbose = Scan<int>(m_setting["verbose"][0]);
if (verbose >= 1) { // only if verbose
TRACE_ERR( "Defined parameters (per moses.ini or switch):" << endl);
for(PARAM_MAP::const_iterator iterParam = m_setting.begin() ; iterParam != m_setting.end(); iterParam++) {
for(PARAM_MAP::const_iterator iterParam = m_setting.begin() ;
iterParam != m_setting.end(); iterParam++) {
TRACE_ERR( "\t" << iterParam->first << ": ");
for ( size_t i = 0; i < iterParam->second.size(); i++ )
TRACE_ERR( iterParam->second[i] << " ");
@ -303,7 +306,8 @@ bool Parameter::LoadParam(int argc, char* argv[])
}
// convert old weights args to new format
if (!isParamSpecified("feature"))
// WHAT IS GOING ON HERE??? - UG
if (!isParamSpecified("feature")) // UG
ConvertWeightArgs();
CreateWeightsMap();
WeightOverwrite();
@ -331,11 +335,11 @@ std::vector<float> &Parameter::GetWeights(const std::string &name)
{
std::vector<float> &ret = m_weights[name];
cerr << "WEIGHT " << name << "=";
for (size_t i = 0; i < ret.size(); ++i) {
cerr << ret[i] << ",";
}
cerr << endl;
// cerr << "WEIGHT " << name << "=";
// for (size_t i = 0; i < ret.size(); ++i) {
// cerr << ret[i] << ",";
// }
// cerr << endl;
return ret;
}
@ -357,7 +361,10 @@ void Parameter::SetWeight(const std::string &name, size_t ind, const vector<floa
newWeights.push_back(line);
}
void Parameter::AddWeight(const std::string &name, size_t ind, const std::vector<float> &weights)
void
Parameter::
AddWeight(const std::string &name, size_t ind,
const std::vector<float> &weights)
{
PARAM_VEC &newWeights = m_setting["weight"];
@ -478,6 +485,12 @@ void Parameter::ConvertWeightArgsPhraseModel(const string &oldWeightName)
case Compact:
ptType = "PhraseDictionaryCompact";
break;
case SuffixArray:
ptType = "PhraseDictionarySuffixArray";
break;
case DSuffixArray:
ptType = "PhraseDictionaryDynSuffixArray";
break;
default:
break;
}
@ -502,6 +515,9 @@ void Parameter::ConvertWeightArgsPhraseModel(const string &oldWeightName)
++currOldInd;
}
// cerr << weights.size() << " PHRASE TABLE WEIGHTS "
// << __FILE__ << ":" << __LINE__ << endl;
AddWeight(ptType, ptInd, weights);
// actual pt
@ -527,7 +543,7 @@ void Parameter::ConvertWeightArgsPhraseModel(const string &oldWeightName)
ptLine << "num-features=" << numScoreComponent << " ";
ptLine << "table-limit=" << maxTargetPhrase[currDict] << " ";
if (implementation == SuffixArray) {
if (implementation == SuffixArray || implementation == DSuffixArray) {
ptLine << "target-path=" << token[5] << " ";
ptLine << "alignment-path=" << token[6] << " ";
}

View File

@ -63,7 +63,9 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "moses/FF/InputFeature.h"
#include "moses/FF/PhrasePenalty.h"
#ifdef LM_SRI
#include "moses/FF/OSM-Feature/OpSequenceModel.h"
#endif
#include "LM/Ken.h"
#ifdef LM_IRST
@ -695,13 +697,17 @@ bool StaticData::LoadData(Parameter *parameter)
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
SetWeights(model, weights);
} else if (feature == "OpSequenceModel") {
#ifdef HAVE_SRI
OpSequenceModel* model = new OpSequenceModel(line);
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
SetWeights(model, weights);
#else
UTIL_THROW(util::Exception, "TODO(nadir): Fix OSM to work without SRILM");
#endif
} else if (feature == "PhrasePenalty") {
PhrasePenalty* model = new PhrasePenalty(line);
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
SetWeights(model, weights);
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
SetWeights(model, weights);
}
#ifdef HAVE_SYNLM
@ -1177,7 +1183,6 @@ void StaticData::LoadFeatureFunctions()
}
}
// load phrase table
for (size_t i = 0; i < m_phraseDictionary.size(); ++i) {
PhraseDictionary *pt = m_phraseDictionary[i];
pt->Load();

View File

@ -35,11 +35,11 @@ struct CompareTargetPhrase {
void TargetPhraseCollection::NthElement(size_t tableLimit)
{
vector<TargetPhrase*>::iterator
iterMiddle = (tableLimit == 0 || m_collection.size() < tableLimit) ?m_collection.end() : m_collection.begin() + tableLimit;
//std::sort(m_collection.begin(), m_collection.end(), CompareTargetPhrase());
std::nth_element(m_collection.begin(), iterMiddle, m_collection.end(), CompareTargetPhrase());
vector<TargetPhrase*>::iterator nth;
nth = (tableLimit && tableLimit <= m_collection.size()
? m_collection.begin() + tableLimit
: m_collection.end());
std::nth_element(m_collection.begin(), nth, m_collection.end(), CompareTargetPhrase());
}
void TargetPhraseCollection::Prune(bool adhereTableLimit, size_t tableLimit)

File diff suppressed because it is too large Load Diff

View File

@ -5,23 +5,29 @@
#include "moses/TranslationModel/DynSAInclude/vocab.h"
#include "moses/TranslationModel/DynSAInclude/types.h"
#include "moses/TranslationModel/DynSAInclude/utils.h"
#include "moses/TranslationModel/WordCoocTable.h"
#include "moses/InputFileStream.h"
#include "moses/FactorTypeSet.h"
#include "moses/TargetPhrase.h"
#include <boost/dynamic_bitset.hpp>
#include "moses/TargetPhraseCollection.h"
#include <map>
using namespace std;
namespace Moses
{
class PhraseDictionaryDynSuffixArray;
/** @todo ask Abbey Levenberg
*/
class SAPhrase
{
public:
std::vector<wordID_t> words;
vector<wordID_t> words;
SAPhrase(size_t phraseSize)
:words(phraseSize) {
}
:words(phraseSize)
{}
void SetId(size_t pos, wordID_t id) {
CHECK(pos < words.size());
@ -43,12 +49,16 @@ public:
, m_endTarget(endTarget)
, m_startSource(startSource)
, m_endSource(endSource)
, m_sntIndex(sntIndex) {
}
, m_sntIndex(sntIndex)
{}
size_t GetTargetSize() const {
return m_endTarget - m_startTarget + 1;
}
size_t GetSourceSize() const {
return m_endSource - m_startSource + 1;
}
};
/** @todo ask Abbey Levenberg
@ -58,32 +68,43 @@ class SentenceAlignment
public:
SentenceAlignment(int sntIndex, int sourceSize, int targetSize);
int m_sntIndex;
std::vector<wordID_t>* trgSnt;
std::vector<wordID_t>* srcSnt;
std::vector<int> numberAligned;
std::vector< std::vector<int> > alignedList;
bool Extract(int maxPhraseLength, std::vector<PhrasePair*> &ret, int startSource, int endSource) const;
vector<wordID_t>* trgSnt;
vector<wordID_t>* srcSnt;
vector<int> numberAligned;
vector< vector<int> > alignedList;
bool Extract(int maxPhraseLength, vector<PhrasePair*> &ret,
int startSource, int endSource) const;
};
class ScoresComp
{
public:
ScoresComp(const std::vector<float>& weights): m_weights(weights) {}
ScoresComp(const vector<float>& weights): m_weights(weights) {}
bool operator()(const Scores& s1, const Scores& s2) const {
return s1[0] < s2[0]; // just p(e|f) as approximation
/*float score1(0), score2(0);
int idx1(0), idx2(0);
for (Scores::const_iterator itr = s1.begin();
itr != s1.end(); ++itr) {
score1 += log(*itr * m_weights.at(idx1++));
}
for (Scores::const_iterator itr = s2.begin();
itr != s2.end(); ++itr) {
score2 += log(*itr * m_weights.at(idx2++));
}
return score1 < score2;*/
// float score1(0), score2(0);
// int idx1(0), idx2(0);
// for (Scores::const_iterator itr = s1.begin();
// itr != s1.end(); ++itr) {
// score1 += log(*itr * m_weights.at(idx1++));
// }
// for (Scores::const_iterator itr = s2.begin();
// itr != s2.end(); ++itr) {
// score2 += log(*itr * m_weights.at(idx2++));
// }
// return score1 < score2;
}
private:
const std::vector<float>& m_weights;
const vector<float>& m_weights;
};
struct BetterPhrase {
ScoresComp const& cmp;
BetterPhrase(ScoresComp const& sc);
// bool operator()(pair<Scores, TargetPhrase const*> const& a,
// pair<Scores, TargetPhrase const*> const& b) const;
bool operator()(pair<Scores, SAPhrase const*> const& a,
pair<Scores, SAPhrase const*> const& b) const;
};
/** @todo ask Abbey Levenberg
@ -93,66 +114,70 @@ class BilingualDynSuffixArray
public:
BilingualDynSuffixArray();
~BilingualDynSuffixArray();
bool Load( const std::vector<FactorType>& inputFactors,
const std::vector<FactorType>& outputTactors,
std::string source, std::string target, std::string alignments,
const std::vector<float> &weight);
bool LoadTM( const std::vector<FactorType>& inputFactors,
const std::vector<FactorType>& outputTactors,
std::string source, std::string target, std::string alignments,
const std::vector<float> &weight);
void GetTargetPhrasesByLexicalWeight(const Phrase& src, std::vector< std::pair<Scores, TargetPhrase*> >& target) const;
void addSntPair(string& source, string& target, string& alignment);
private:
DynSuffixArray* m_srcSA;
DynSuffixArray* m_trgSA;
std::vector<wordID_t>* m_srcCorpus;
std::vector<wordID_t>* m_trgCorpus;
std::vector<FactorType> m_inputFactors;
std::vector<FactorType> m_outputFactors;
bool Load( const vector<FactorType>& inputFactors,
const vector<FactorType>& outputTactors,
string source, string target, string alignments,
const vector<float> &weight);
// bool LoadTM( const vector<FactorType>& inputFactors,
// const vector<FactorType>& outputTactors,
// string source, string target, string alignments,
// const vector<float> &weight);
void GetTargetPhrasesByLexicalWeight(const Phrase& src, vector< pair<Scores, TargetPhrase*> >& target) const;
std::vector<unsigned> m_srcSntBreaks, m_trgSntBreaks;
void CleanUp(const InputType& source);
void addSntPair(string& source, string& target, string& alignment);
pair<float,float>
GatherCands(Phrase const& src, map<SAPhrase, vector<float> >& pstats) const;
TargetPhrase*
GetMosesFactorIDs(const SAPhrase&, const Phrase& sourcePhrase) const;
private:
mutable WordCoocTable m_wrd_cooc;
DynSuffixArray * m_srcSA;
DynSuffixArray * m_trgSA;
vector<wordID_t>* m_srcCorpus;
vector<wordID_t>* m_trgCorpus;
vector<FactorType> m_inputFactors;
vector<FactorType> m_outputFactors;
vector<unsigned> m_srcSntBreaks, m_trgSntBreaks;
Vocab* m_srcVocab, *m_trgVocab;
ScoresComp* m_scoreCmp;
std::vector<SentenceAlignment> m_alignments;
std::vector<std::vector<short> > m_rawAlignments;
vector<SentenceAlignment> m_alignments;
vector<vector<short> > m_rawAlignments;
mutable std::map<std::pair<wordID_t, wordID_t>, std::pair<float, float> > m_wordPairCache;
mutable std::set<wordID_t> m_freqWordsCached;
mutable map<pair<wordID_t, wordID_t>, pair<float, float> > m_wordPairCache;
mutable set<wordID_t> m_freqWordsCached;
const size_t m_maxPhraseLength, m_maxSampleSize;
int LoadCorpus(FactorDirection direction, InputFileStream&, const std::vector<FactorType>& factors,
std::vector<wordID_t>&, std::vector<wordID_t>&,
const size_t m_maxPTEntries;
int LoadCorpus(FactorDirection direction,
InputFileStream&, const vector<FactorType>& factors,
vector<wordID_t>&, vector<wordID_t>&,
Vocab*);
int LoadAlignments(InputFileStream& aligs);
int LoadRawAlignments(InputFileStream& aligs);
int LoadRawAlignments(string& aligs);
bool ExtractPhrases(const int&, const int&, const int&, std::vector<PhrasePair*>&, bool=false) const;
bool ExtractPhrases(const int&, const int&, const int&, vector<PhrasePair*>&, bool=false) const;
SentenceAlignment GetSentenceAlignment(const int, bool=false) const;
int SampleSelection(std::vector<unsigned>&, int = 300) const;
int SampleSelection(vector<unsigned>&, int = 300) const;
std::vector<int> GetSntIndexes(std::vector<unsigned>&, int, const std::vector<unsigned>&) const;
TargetPhrase* GetMosesFactorIDs(const SAPhrase&, const Phrase& sourcePhrase) const;
vector<int> GetSntIndexes(vector<unsigned>&, int, const vector<unsigned>&) const;
SAPhrase TrgPhraseFromSntIdx(const PhrasePair&) const;
bool GetLocalVocabIDs(const Phrase&, SAPhrase &) const;
void CacheWordProbs(wordID_t) const;
void CacheFreqWords() const;
void ClearWordInCache(wordID_t);
std::pair<float, float> GetLexicalWeight(const PhrasePair&) const;
pair<float, float> GetLexicalWeight(const PhrasePair&) const;
int GetSourceSentenceSize(size_t sentenceId) const;
int GetTargetSentenceSize(size_t sentenceId) const;
int GetSourceSentenceSize(size_t sentenceId) const {
return (sentenceId==m_srcSntBreaks.size()-1) ?
m_srcCorpus->size() - m_srcSntBreaks.at(sentenceId) :
m_srcSntBreaks.at(sentenceId+1) - m_srcSntBreaks.at(sentenceId);
}
int GetTargetSentenceSize(size_t sentenceId) const {
return (sentenceId==m_trgSntBreaks.size()-1) ?
m_trgCorpus->size() - m_trgSntBreaks.at(sentenceId) :
m_trgSntBreaks.at(sentenceId+1) - m_trgSntBreaks.at(sentenceId);
}
};
} // end namespace
#endif

View File

@ -1,5 +1,6 @@
#include "DynSuffixArray.h"
#include <iostream>
#include <boost/foreach.hpp>
using namespace std;
@ -215,8 +216,37 @@ void DynSuffixArray::Substitute(vuint_t* /* newSents */, unsigned /* newIndex */
return;
}
// Binds the comparator to a corpus (crp) and a suffix array (sfa).
// Both vectors are stored by reference, not copied, so they must outlive
// this object.
ComparePosition::
ComparePosition(vuint_t const& crp, vuint_t const& sfa)
: m_crp(crp), m_sfa(sfa) { }
// (position, phrase) ordering: returns true when the corpus suffix starting
// at corpus index i sorts strictly before phrase. This is the argument order
// in which GetCount() hands the comparator to lower_bound.
//
// Tokens are compared pairwise; the first mismatch decides. If no mismatch is
// found, the suffix is "less" only when the scan reached the end marker while
// part of the phrase was still unmatched (the suffix is a proper prefix of
// the phrase).
//
// NOTE(review): the end marker e is &m_crp.back(), i.e. the address of the
// LAST token, not one past it, so the final corpus token is never compared.
// Presumably the corpus ends in a sentinel token -- confirm.
// NOTE(review): m_crp.back() on an empty corpus is undefined; m_crp.at(i)
// throws for an out-of-range i.
bool
ComparePosition::
operator()(unsigned const& i, vector<wordID_t> const& phrase) const
{
unsigned const* x = &m_crp.at(i);
unsigned const* e = &m_crp.back();
size_t k = 0;
for (; k < phrase.size() && x < e; ++k, ++x)
if (*x != phrase[k]) return *x < phrase[k];
return (x == e && k < phrase.size());
}
// (phrase, position) ordering: returns true when phrase sorts strictly before
// the corpus suffix starting at corpus index i. This is the argument order in
// which GetCount() hands the comparator to upper_bound.
//
// Only a token mismatch can make the phrase "less". When the loop ends
// without a mismatch the result is always false, so a suffix that merely
// starts with the phrase is treated as equivalent to it rather than greater.
//
// NOTE(review): as in the sibling overload, e points AT the last corpus
// token, so that token never takes part in the comparison -- confirm that a
// trailing sentinel makes this intentional.
bool
ComparePosition::
operator()(vector<wordID_t> const& phrase, unsigned const& i) const
{
unsigned const* x = &m_crp.at(i);
unsigned const* e = &m_crp.back();
size_t k = 0;
for (; k < phrase.size() && x < e; ++k, ++x)
if (*x != phrase[k]) return phrase[k] < *x;
return false; // (k == phrase.size() && x < e);
}
bool DynSuffixArray::GetCorpusIndex(const vuint_t* phrase, vuint_t* indices)
{
// DOES THIS EVEN WORK WHEN A DynSuffixArray has been saved and reloaded????
pair<vuint_t::iterator,vuint_t::iterator> bounds;
indices->clear();
size_t phrasesize = phrase->size();
@ -251,6 +281,16 @@ bool DynSuffixArray::GetCorpusIndex(const vuint_t* phrase, vuint_t* indices)
return (indices->size() > 0);
}
/// Number of suffix-array entries whose corpus suffix compares equivalent to
/// /phrase/ under ComparePosition, i.e. the width of the [lower_bound,
/// upper_bound) range of /phrase/ in the suffix array.
size_t
DynSuffixArray::
GetCount(vuint_t const& phrase) const
{
  ComparePosition const byPhrase(*m_corpus, *m_SA);
  vuint_t::const_iterator const first = m_SA->begin();
  vuint_t::const_iterator const last  = m_SA->end();
  vuint_t::const_iterator const lo = lower_bound(first, last, phrase, byPhrase);
  vuint_t::const_iterator const hi = upper_bound(first, last, phrase, byPhrase);
  return hi - lo;
}
void DynSuffixArray::Save(FILE* fout)
{
fWriteVector(fout, *m_SA);

View File

@ -11,9 +11,25 @@
namespace Moses
{
using namespace std;
typedef std::vector<unsigned> vuint_t;
/// Comparator between a corpus position /i/ and a reference phrase /phrase/.
/// The two operator() overloads cover both argument orders, so one object
/// can be passed to both lower_bound and upper_bound over a suffix array.
/// Holds references to the corpus /m_crp/ and the suffix array /m_sfa/;
/// neither is copied, so both must outlive the comparator.
// added by Ulrich Germann
class ComparePosition
{
// corpus as a flat sequence of token ids
vuint_t const& m_crp;
// suffix array over m_crp
vuint_t const& m_sfa;
public:
ComparePosition(vuint_t const& crp, vuint_t const& sfa);
// true iff the suffix at position i orders before phrase
bool operator()(unsigned const& i, vector<wordID_t> const& phrase) const;
// true iff phrase orders before the suffix at position i
bool operator()(vector<wordID_t> const& phrase, unsigned const& i) const;
};
/** @todo ask Abbey Levenberg
*/
class DynSuffixArray
@ -30,6 +46,8 @@ public:
void Delete(unsigned, unsigned);
void Substitute(vuint_t*, unsigned);
size_t GetCount(vuint_t const& phrase) const;
private:
vuint_t* m_SA;
vuint_t* m_ISA;
@ -46,10 +64,10 @@ private:
void PrintAuxArrays() {
std::cerr << "SA\tISA\tF\tL\n";
for(size_t i=0; i < m_SA->size(); ++i)
std::cerr << m_SA->at(i) << "\t" << m_ISA->at(i) << "\t" << m_F->at(i) << "\t" << m_L->at(i) << std::endl;
std::cerr << m_SA->at(i) << "\t" << m_ISA->at(i) << "\t"
<< m_F->at(i) << "\t" << m_L->at(i) << std::endl;
}
};
} //end namespace
#endif

View File

@ -0,0 +1,4 @@
Specifying Dynamic Suffix Array-based Phrase Tables in moses.ini
[ttable-file]
14 0 0 5 <source language text file> <target language text file> <file with alignment info in symal format>

View File

@ -3,70 +3,32 @@
#include "moses/StaticData.h"
#include "moses/TargetPhrase.h"
#include <iomanip>
#include <boost/foreach.hpp>
using namespace std;
namespace Moses
{
PhraseDictionaryDynSuffixArray::PhraseDictionaryDynSuffixArray(const std::string &line)
:PhraseDictionary("PhraseDictionaryDynSuffixArray", line)
PhraseDictionaryDynSuffixArray::
PhraseDictionaryDynSuffixArray(const std::string &line)
: PhraseDictionary("PhraseDictionaryDynSuffixArray", line)
,m_biSA(new BilingualDynSuffixArray())
{
ReadParameters();
}
PhraseDictionaryDynSuffixArray::~PhraseDictionaryDynSuffixArray()
{
delete m_biSA;
}
void PhraseDictionaryDynSuffixArray::Load()
{
SetFeaturesToApply();
const StaticData &staticData = StaticData::Instance();
vector<float> weight = staticData.GetWeights(this);
m_biSA->Load( m_input, m_output, m_source, m_target, m_alignments, weight);
vector<float> weight = StaticData::Instance().GetWeights(this);
m_biSA->Load(m_input, m_output, m_source, m_target, m_alignments, weight);
}
const TargetPhraseCollection *PhraseDictionaryDynSuffixArray::GetTargetPhraseCollection(const Phrase& src) const
PhraseDictionaryDynSuffixArray::
~PhraseDictionaryDynSuffixArray()
{
TargetPhraseCollection *ret = new TargetPhraseCollection();
std::vector< std::pair< Scores, TargetPhrase*> > trg;
// extract target phrases and their scores from suffix array
m_biSA->GetTargetPhrasesByLexicalWeight( src, trg);
std::vector< std::pair< Scores, TargetPhrase*> >::iterator itr;
for(itr = trg.begin(); itr != trg.end(); ++itr) {
Scores scoreVector = itr->first;
TargetPhrase *targetPhrase = itr->second;
//std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),NegateScore);
std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),FloorScore);
targetPhrase->GetScoreBreakdown().Assign(this, scoreVector);
targetPhrase->Evaluate(src);
//cout << *targetPhrase << "\t" << std::setprecision(8) << scoreVector[2] << endl;
ret->Add(targetPhrase);
}
ret->NthElement(m_tableLimit); // sort the phrases for the dcoder
return ret;
}
void PhraseDictionaryDynSuffixArray::insertSnt(string& source, string& target, string& alignment)
{
m_biSA->addSntPair(source, target, alignment); // insert sentence pair into suffix arrays
//StaticData::Instance().ClearTransOptionCache(); // clear translation option cache
}
void PhraseDictionaryDynSuffixArray::deleteSnt(unsigned /* idx */, unsigned /* num2Del */)
{
// need to implement --
}
ChartRuleLookupManager *PhraseDictionaryDynSuffixArray::CreateRuleLookupManager(const InputType&, const ChartCellCollectionBase&)
{
throw "Chart decoding not supported by PhraseDictionaryDynSuffixArray";
delete m_biSA;
}
void PhraseDictionaryDynSuffixArray::SetParameter(const std::string& key, const std::string& value)
@ -82,4 +44,62 @@ void PhraseDictionaryDynSuffixArray::SetParameter(const std::string& key, const
}
}
/// Build the translation options for source phrase /src/.
///
/// Candidates and their score vectors are gathered from the bilingual suffix
/// array, converted to TargetPhrase objects, scored under this feature and
/// added to a freshly allocated collection, which is handed to the caller.
///
/// The previous implementation went through GetTargetPhrasesByLexicalWeight,
/// floored the scores with FloorScore and called Evaluate() on each target
/// phrase; none of that happens on this path.
const TargetPhraseCollection*
PhraseDictionaryDynSuffixArray::
GetTargetPhraseCollection(const Phrase& src) const
{
  typedef map<SAPhrase, vector<float> > PhraseStats;

  PhraseStats pstats; // phrase (pair) statistics
  m_biSA->GatherCands(src,pstats);

  TargetPhraseCollection *ret = new TargetPhraseCollection();
  for (PhraseStats::iterator cand = pstats.begin();
       cand != pstats.end(); ++cand) {
    TargetPhrase* tp = m_biSA->GetMosesFactorIDs(cand->first, src);
    tp->GetScoreBreakdown().Assign(this,cand->second);
    ret->Add(tp);
  }

  ret->NthElement(m_tableLimit); // sort the phrases for the decoder
  return ret;
}
// Adds one sentence pair to the underlying bilingual suffix array by
// forwarding the source text, target text and alignment to addSntPair().
// The translation option cache is not cleared here (that call is disabled
// below).
void
PhraseDictionaryDynSuffixArray::
insertSnt(string& source, string& target, string& alignment)
{
m_biSA->addSntPair(source, target, alignment); // insert sentence pair into suffix arrays
//StaticData::Instance().ClearTransOptionCache(); // clear translation option cache
}
// Placeholder for sentence deletion. Both parameters are ignored and the call
// currently does nothing.
void
PhraseDictionaryDynSuffixArray::
deleteSnt(unsigned /* idx */, unsigned /* num2Del */)
{
// need to implement --
}
// Chart-decoding hook that this phrase table does not implement: the body
// fails a CHECK unconditionally and never builds a manager.
// NOTE(review): presumably CHECK(false) aborts, so the return below only
// serves to satisfy the signature -- confirm against the CHECK macro.
ChartRuleLookupManager*
PhraseDictionaryDynSuffixArray::
CreateRuleLookupManager(const InputType&, const ChartCellCollectionBase&)
{
CHECK(false);
return 0;
}
}// end namepsace

View File

@ -17,21 +17,19 @@ class PhraseDictionaryDynSuffixArray: public PhraseDictionary
public:
PhraseDictionaryDynSuffixArray(const std::string &line);
~PhraseDictionaryDynSuffixArray();
bool InitDictionary();
void Load();
// functions below required by base class
const TargetPhraseCollection* GetTargetPhraseCollection(const Phrase& src) const;
void insertSnt(string&, string&, string&);
void deleteSnt(unsigned, unsigned);
ChartRuleLookupManager *CreateRuleLookupManager(const InputType&, const ChartCellCollectionBase&);
void SetParameter(const std::string& key, const std::string& value);
private:
BilingualDynSuffixArray *m_biSA;
std::string m_source, m_target, m_alignments;
std::vector<float> m_weight;
};
} // end namespace

View File

@ -31,7 +31,8 @@ using namespace std;
namespace Moses
{
PhraseDictionaryOnDisk::PhraseDictionaryOnDisk(const std::string &line)
: MyBase("PhraseDictionaryOnDisk", line) {
: MyBase("PhraseDictionaryOnDisk", line)
{
ReadParameters();
}

View File

@ -48,12 +48,6 @@ public:
void Load();
// Required by PhraseDictionary.
virtual const TargetPhraseCollection *GetTargetPhraseCollection(const Phrase &) const {
CHECK(false);
return NULL;
}
private:
friend class RuleTableLoader;

View File

@ -0,0 +1,72 @@
#include "moses/TranslationModel/WordCoocTable.h"
using namespace std;
namespace Moses
{
// Creates an empty table and reserves capacity for one million entries in
// each of the three vectors. The vectors stay empty (size 0); Count() grows
// them on demand.
WordCoocTable::
WordCoocTable()
{
m_cooc.reserve(1000000);
m_marg1.reserve(1000000);
m_marg2.reserve(1000000);
}
// Creates a table pre-sized for known vocabularies: VocabSize1 empty
// co-occurrence maps and zeroed marginals on the first side, VocabSize2
// zeroed marginals on the second side.
WordCoocTable::
WordCoocTable(wordID_t const VocabSize1, wordID_t const VocabSize2)
: m_cooc(VocabSize1), m_marg1(VocabSize1,0), m_marg2(VocabSize2, 0)
{}
/// Record one co-occurrence of first-side id /a/ with second-side id /b/.
/// Any table that is too short for the id is extended with zero counts
/// (and empty maps) first; then both marginals and the joint count are
/// incremented by one.
void
WordCoocTable::
Count(size_t const a, size_t const b)
{
  if (a >= m_marg1.size()) {
    // m_cooc and m_marg1 grow in lock-step, one slot per missing id
    size_t const missing = a + 1 - m_marg1.size();
    m_cooc.insert(m_cooc.end(), missing, my_map_t());
    m_marg1.insert(m_marg1.end(), missing, uint32_t(0));
  }
  if (b >= m_marg2.size())
    m_marg2.resize(b + 1, 0);

  m_marg1[a] += 1;
  m_marg2[b] += 1;
  m_cooc[a][b] += 1;
}
uint32_t
WordCoocTable::
GetJoint(size_t const a, size_t const b) const
{
if (a >= m_marg1.size() || b >= m_marg2.size()) return 0;
my_map_t::const_iterator m = m_cooc.at(a).find(b);
if (m == m_cooc[a].end()) return 0;
return m->second;
}
/// Marginal count of first-side id /x/; 0 for ids beyond the table.
uint32_t
WordCoocTable::
GetMarg1(size_t const x) const
{
  if (x < m_marg1.size())
    return m_marg1[x];
  return 0;
}
/// Marginal count of second-side id /x/; 0 for ids beyond the table.
uint32_t
WordCoocTable::
GetMarg2(size_t const x) const
{
  if (x < m_marg2.size())
    return m_marg2[x];
  return 0;
}
/// Relative frequency of the pair given the first-side id:
/// joint(a,b) / marginal1(a).
///
/// Returns 0 when /a/ has never been counted. Without this guard the
/// expression is 0/0, which is NaN in float arithmetic and would then
/// propagate through any score computed from it.
float
WordCoocTable::
pfwd(size_t const a, size_t const b) const
{
  uint32_t const marginal = GetMarg1(a);
  if (marginal == 0) return 0.0f;
  return float(GetJoint(a,b))/marginal;
}
/// Relative frequency of the pair given the second-side id:
/// joint(a,b) / marginal2(b).
///
/// Returns 0 when /b/ has never been counted. Without this guard the
/// expression is 0/0, which is NaN in float arithmetic and would then
/// propagate through any score computed from it.
float
WordCoocTable::
pbwd(size_t const a, size_t const b) const
{
  // cerr << "at " << __FILE__ << ":" << __LINE__ << endl;
  uint32_t const marginal = GetMarg2(b);
  if (marginal == 0) return 0.0f;
  return float(GetJoint(a,b))/marginal;
}
}

View File

@ -0,0 +1,72 @@
#ifndef moses_WordCoocTable_h
#define moses_WordCoocTable_h
#include "moses/TranslationModel/DynSAInclude/vocab.h"
#include "moses/TranslationModel/DynSAInclude/types.h"
#include "moses/TranslationModel/DynSAInclude/utils.h"
#include "moses/InputFileStream.h"
#include "moses/FactorTypeSet.h"
#include "moses/TargetPhrase.h"
#include <boost/dynamic_bitset.hpp>
#include <map>
namespace Moses
{
using namespace std;
// Shorthand for a dynamically sized bitset with 64-bit blocks.
// NOTE(review): #ifndef tests preprocessor macros only, not typedef names, so
// this guard does not detect an earlier `bitvector` typedef; it is skipped
// only if a macro named `bitvector` is defined.
#ifndef bitvector
typedef boost::dynamic_bitset<uint64_t> bitvector;
#endif
/**
* Stores word cooccurrence counts
*
* Keeps, for every first-side word id, a map from second-side word id to the
* joint count, plus one marginal count vector per side. Ids are used directly
* as vector indices.
* @todo ask Uli Germann
*/
class WordCoocTable
{
// second-side id -> joint count, for one first-side id
typedef map<wordID_t,uint32_t> my_map_t;
// indexed by first-side id; kept the same length as m_marg1
vector<my_map_t> m_cooc;
// marginal counts, indexed by first-side id
vector<uint32_t> m_marg1;
// marginal counts, indexed by second-side id
vector<uint32_t> m_marg2;
public:
// empty table with capacity reserved up front
WordCoocTable();
// table pre-sized for the two vocabulary sizes
WordCoocTable(wordID_t const VocabSize1, wordID_t const VocabSize2);
// joint count of (a,b); 0 if out of range or never counted
uint32_t GetJoint(size_t const a, size_t const b) const;
// marginal count of first-side id x; 0 if out of range
uint32_t GetMarg1(size_t const x) const;
// marginal count of second-side id x; 0 if out of range
uint32_t GetMarg2(size_t const x) const;
// GetJoint(a,b) divided by GetMarg1(a)
float pfwd(size_t const a, size_t const b) const;
// GetJoint(a,b) divided by GetMarg2(b)
float pbwd(size_t const a, size_t const b) const;
// count a single co-occurrence of a and b, growing the tables if needed
void
Count(size_t const a, size_t const b);
// count every aligned word pair of a sentence pair; aln is a flat list of
// (index into s1, index into s2) pairs, and words left unaligned are counted
// against NULL2 / NULL1 respectively
template<typename idvec, typename alnvec>
void
Count(idvec const& s1, idvec const& s2, alnvec const& aln,
wordID_t const NULL1, wordID_t const NULL2);
};
/// Count all word co-occurrences of one aligned sentence pair.
///
/// @param s1    word ids of the first sentence
/// @param s2    word ids of the second sentence
/// @param aln   flat alignment: aln[2k] indexes s1, aln[2k+1] indexes s2
/// @param NULL1 id counted on the first side for unaligned words of s2
/// @param NULL2 id counted on the second side for unaligned words of s1
///
/// Every alignment link is counted as a pair. Words that take part in no
/// link are then counted against the NULL id of the opposite side.
/// A trailing entry without a partner (odd-length /aln/) is ignored; the
/// loop previously read aln[aln.size()] in that case.
/// Alignment indices themselves are not range-checked against s1 / s2.
template<typename idvec, typename alnvec>
void
WordCoocTable::
Count(idvec const& s1, idvec const& s2, alnvec const& aln,
      wordID_t const NULL1, wordID_t const NULL2)
{
  // one bit per word, initially set = "not aligned yet"
  boost::dynamic_bitset<uint64_t> check1(s1.size()), check2(s2.size());
  check1.set();
  check2.set();
  for (size_t i = 0; i + 1 < aln.size(); i += 2) {
    Count(s1[aln[i]], s2[aln[i+1]]);
    check1.reset(aln[i]);
    check2.reset(aln[i+1]);
  }
  // find_first/find_next return npos when exhausted, which fails i < size()
  for (size_t i = check1.find_first(); i < check1.size(); i = check1.find_next(i))
    Count(s1[i], NULL2);
  for (size_t i = check2.find_first(); i < check2.size(); i = check2.find_next(i))
    Count(NULL1, s2[i]);
}
}
#endif

View File

@ -121,6 +121,7 @@ enum PhraseTableImplementation {
,FuzzyMatch = 11
,Compact = 12
,Interpolated = 13
,DSuffixArray = 14
};
enum InputTypeEnum {

View File

@ -0,0 +1,51 @@
#ifndef __sampling_h
#define __sampling_h
// Utility functions for proper sub-sampling.
// (c) 2007-2012 Ulrich Germann
namespace Moses
{
// Scale one rand() draw to an integer index: rand() is mapped to a fraction
// in [0,1) by dividing by RAND_MAX+1, multiplied by N and truncated.
inline
size_t
randInt(size_t N)
{
  double const unit = rand() / (RAND_MAX + 1.);
  return N * unit;
}
// select a random sample of size /s/ without restitution from the range of
// integers [0,N);
template<typename idx_t>
void
randomSample(vector<idx_t>& v, size_t s, size_t N)
{
// see also Knuth: Art of Computer Programming Vol. 2, p. 142
s = min(s,N);
v.resize(s);
// the first option tries to be a bit more efficient than O(N) in picking
// the samples. The threshold is an ad-hoc, off-the-cuff guess. I still
// need to figure out the optimal break-even point between a linear sweep
// and repeatedly picking random numbers with the risk of hitting the same
// number many times.
if (s*10<N) {
boost::dynamic_bitset<uint64_t> check(N,0);
for (size_t i = 0; i < v.size(); i++) {
size_t x = randInt(N);
while (check[x]) x = randInt(N);
check[x]=true;
v[i] = x;
}
} else {
size_t m=0;
for (size_t t = 0; m <= s && t < N; t++)
if (s==N || randInt(N-t) < s-m) v[m++] = t;
}
}
};
#endif

View File

@ -0,0 +1,85 @@
#ifndef __n_best_list_h
#define __n_best_list_h
#include <algorithm>
#include "moses/generic/sorting/VectorIndexSorter.h"
// NBest List; (c) 2007-2012 Ulrich Germann
//
// The 'trick' used in this implementation is to maintain a heap of size <= N
// such that the lowest-scoring item is on top of the heap. For each incoming
// item we can then determine easily if it is in the top N.
namespace Moses
{
using namespace std;
// Bounded collection that keeps the best items seen so far.
// Items live in m_list; m_heap holds indices into m_list arranged as a heap
// under m_better, so the entry add() would evict next is always at the front.
// NOTE(review): the size limit is not stored; add() treats
// m_heap.capacity() as the limit, which assumes reserve(max_size) yields
// exactly that capacity -- confirm for the target standard library.
template<typename THINGY, typename CMP>
class
NBestList
{
// heap of indices into m_list
vector<uint32_t> m_heap;
// the stored items; slots are overwritten in place on eviction
vector<THINGY> m_list;
// compares two m_list indices by comparing the items they refer to
VectorIndexSorter<THINGY, CMP, uint32_t> m_better;
// cached ranking of the indices, rebuilt lazily by operator[]
mutable vector<uint32_t> m_order;
// true when m_order is stale with respect to m_heap
mutable bool m_changed;
public:
// list of at most max_size items, ranked with the caller's comparator
NBestList(size_t const max_size, CMP const& cmp);
// list of at most max_size items, ranked with a default-constructed CMP
NBestList(size_t const max_size);
// offer an item; returns true iff it was stored
bool add(THINGY const& item);
// item at rank i; a negative i counts from the end
THINGY const& operator[](int i) const;
// number of items currently held
size_t size() const {
return m_heap.size();
}
};
// Creates an empty list that ranks items with the caller-supplied comparator.
// cmp is passed on to the index sorter by reference.
// NOTE(review): presumably cmp must therefore outlive this list -- confirm
// against VectorIndexSorter.
// The reserve() call is what establishes the size limit: add() compares
// m_heap.size() against m_heap.capacity().
template<typename THINGY, typename CMP>
NBestList<THINGY,CMP>::
NBestList(size_t const max_size, CMP const& cmp)
: m_better(m_list, cmp), m_changed(false)
{
m_heap.reserve(max_size);
}
// Creates an empty list that ranks items with a default-constructed CMP.
//
// The index sorter must look items up in m_list, exactly as the two-argument
// constructor does. It was previously bound to m_heap, which holds indices
// (vector<uint32_t>), not items.
// The reserve() call establishes the size limit that add() checks against.
template<typename THINGY, typename CMP>
NBestList<THINGY,CMP>::
NBestList(size_t const max_size)
  : m_better(m_list), m_changed(false)
{
  m_heap.reserve(max_size);
}
// Offer /item/ to the list.
//
// While there is room the item is simply appended. Once the list is full the
// item is accepted only if the comparator ranks it ahead of the current
// front-of-heap entry, whose slot in m_list it then takes over.
//
// Returns true iff the item was stored; in that case the cached ranking used
// by operator[] is marked stale.
// A list created with max_size 0 rejects every item (previously this path
// threw std::out_of_range from m_heap.at(0)).
template<typename THINGY, typename CMP>
bool
NBestList<THINGY,CMP>::
add(THINGY const& item)
{
  if (m_heap.size() == m_heap.capacity()) {
    if (m_heap.empty()) return false; // zero-capacity list: nothing to evict
    if (!m_better.Compare(item, m_list[m_heap.at(0)])) return false;
    // move the evicted index to the back and reuse its slot for the new item
    pop_heap(m_heap.begin(),m_heap.end(),m_better);
    m_list[m_heap.back()] = item;
  } else {
    m_list.push_back(item);
    m_heap.push_back(m_heap.size());
  }
  push_heap(m_heap.begin(),m_heap.end(),m_better);
  return m_changed = true;
}
// Return the item at rank /i/; a negative /i/ counts from the end
// (-1 is the last entry). Throws std::out_of_range for an invalid rank.
//
// The ranking is rebuilt lazily after the list has changed: the heap of
// indices is copied and heap-sorted with m_better, the same comparator the
// heap was built with. The sort previously ran pop_heap without a
// comparator, i.e. it ordered the raw index values with operator< on a range
// that is not a heap under that ordering.
template<typename THINGY, typename CMP>
THINGY const&
NBestList<THINGY,CMP>::
operator[](int i) const
{
  if (m_changed) {
    m_order.assign(m_heap.begin(),m_heap.end());
    std::sort_heap(m_order.begin(), m_order.end(), m_better);
    m_changed = false;
  }
  if (i < 0) i += m_order.size();
  return m_list[m_order.at(i)];
}
}
#endif

View File

@ -0,0 +1,69 @@
#ifndef __vector_index_sorter_h
#define __vector_index_sorter_h
// VectorIndexSorter; (c) 2007-2012 Ulrich Germann
// A VectorIndexSorter is a function object for sorting indices into a vector
// of objects (instead of sorting the vector itself).
//
// typical use:
// vector<thingy> my_vector;
// VectorIndexSorter<thingy,less<thingy>,int> sorter(my_vector);
// vector<int> order;
// sorter.GetOrder(order);
namespace Moses
{
template<typename VAL, typename COMP = greater<VAL>, typename IDX_T=size_t>
class
VectorIndexSorter : public binary_function<IDX_T const&, IDX_T const&, bool>
{
vector<VAL> const& m_vecref;
boost::shared_ptr<COMP> m_comp;
public:
COMP const& Compare;
VectorIndexSorter(vector<VAL> const& v, COMP const& comp)
: m_vecref(v), Compare(comp)
{ }
VectorIndexSorter(vector<VAL> const& v)
: m_vecref(v), m_comp(new COMP()), Compare(*m_comp)
{ }
bool operator()(IDX_T const & a, IDX_T const & b) const {
bool fwd = Compare(m_vecref.at(a) ,m_vecref.at(b));
bool bwd = Compare(m_vecref[b], m_vecref[a]);
return (fwd == bwd ? a < b : fwd);
}
boost::shared_ptr<vector<IDX_T> >
GetOrder() const;
void
GetOrder(vector<IDX_T> & order) const;
};
// Returns a newly allocated vector holding every index of the referenced
// vector, sorted with this object as the comparator.
//
// Delegates to the GetOrder(vector&) overload. The body previously called
// get_order(), a name this class does not declare, so the function could not
// be instantiated.
template<typename VAL, typename COMP, typename IDX_T>
boost::shared_ptr<std::vector<IDX_T> >
VectorIndexSorter<VAL,COMP,IDX_T>::
GetOrder() const
{
  boost::shared_ptr<std::vector<IDX_T> > ret(new std::vector<IDX_T>(m_vecref.size()));
  GetOrder(*ret);
  return ret;
}
// Fills /order/ with the indices 0 .. size-1 of the referenced vector and
// sorts them using this object as the comparator. Any previous content of
// /order/ is overwritten.
template<typename VAL, typename COMP, typename IDX_T>
void
VectorIndexSorter<VAL,COMP,IDX_T>::
GetOrder(vector<IDX_T> & order) const
{
  order.resize(m_vecref.size());

  // start from the identity permutation
  IDX_T const count = IDX_T(m_vecref.size());
  IDX_T idx = 0;
  while (idx < count) {
    order[idx] = idx;
    ++idx;
  }

  sort(order.begin(), order.end(), *this);
}
}
#endif