This commit is contained in:
Hieu Hoang 2013-05-29 18:16:15 +01:00
parent 59bd7deb4b
commit 6249432407
501 changed files with 20914 additions and 20027 deletions

View File

@ -50,14 +50,14 @@ int main (int argc, char * const argv[])
} }
int numSourceFactors = Moses::Scan<int>(argv[1]) int numSourceFactors = Moses::Scan<int>(argv[1])
, numTargetFactors = Moses::Scan<int>(argv[2]) , numTargetFactors = Moses::Scan<int>(argv[2])
, numScores = Moses::Scan<int>(argv[3]) , numScores = Moses::Scan<int>(argv[3])
, tableLimit = Moses::Scan<int>(argv[4]); , tableLimit = Moses::Scan<int>(argv[4]);
TargetPhraseCollection::s_sortScoreInd = Moses::Scan<int>(argv[5]); TargetPhraseCollection::s_sortScoreInd = Moses::Scan<int>(argv[5]);
assert(TargetPhraseCollection::s_sortScoreInd < numScores); assert(TargetPhraseCollection::s_sortScoreInd < numScores);
const string filePath = argv[6] const string filePath = argv[6]
,destPath = argv[7]; ,destPath = argv[7];
Moses::InputFileStream inStream(filePath); Moses::InputFileStream inStream(filePath);
@ -128,10 +128,10 @@ OnDiskPt::PhrasePtr Tokenize(SourcePhrase &sourcePhrase, TargetPhrase &targetPhr
} else { } else {
switch (stage) { switch (stage) {
case 0: { case 0: {
WordPtr w = Tokenize(sourcePhrase, tok, true, true, onDiskWrapper); WordPtr w = Tokenize(sourcePhrase, tok, true, true, onDiskWrapper);
if (w != NULL) if (w != NULL)
out->AddWord(w); out->AddWord(w);
break; break;
} }
case 1: { case 1: {
@ -146,19 +146,19 @@ OnDiskPt::PhrasePtr Tokenize(SourcePhrase &sourcePhrase, TargetPhrase &targetPhr
} }
case 3: { case 3: {
//targetPhrase.Create1AlignFromString(tok); //targetPhrase.Create1AlignFromString(tok);
targetPhrase.CreateAlignFromString(tok); targetPhrase.CreateAlignFromString(tok);
break; break;
} }
case 4: case 4:
++stage; ++stage;
break; break;
/* case 5: { /* case 5: {
// count info. Only store the 2nd one // count info. Only store the 2nd one
float val = Moses::Scan<float>(tok); float val = Moses::Scan<float>(tok);
misc[0] = val; misc[0] = val;
++stage; ++stage;
break; break;
}*/ }*/
case 5: { case 5: {
// count info. Only store the 2nd one // count info. Only store the 2nd one
//float val = Moses::Scan<float>(tok); //float val = Moses::Scan<float>(tok);
@ -167,12 +167,12 @@ OnDiskPt::PhrasePtr Tokenize(SourcePhrase &sourcePhrase, TargetPhrase &targetPhr
break; break;
} }
case 6: { case 6: {
// store only the 3rd one (rule count) // store only the 3rd one (rule count)
float val = Moses::Scan<float>(tok); float val = Moses::Scan<float>(tok);
misc[0] = val; misc[0] = val;
++stage; ++stage;
break; break;
} }
default: default:
cerr << "ERROR in line " << line << endl; cerr << "ERROR in line " << line << endl;
assert(false); assert(false);
@ -189,8 +189,8 @@ OnDiskPt::PhrasePtr Tokenize(SourcePhrase &sourcePhrase, TargetPhrase &targetPhr
} // Tokenize() } // Tokenize()
OnDiskPt::WordPtr Tokenize(OnDiskPt::Phrase &phrase OnDiskPt::WordPtr Tokenize(OnDiskPt::Phrase &phrase
, const std::string &token, bool addSourceNonTerm, bool addTargetNonTerm , const std::string &token, bool addSourceNonTerm, bool addTargetNonTerm
, OnDiskPt::OnDiskWrapper &onDiskWrapper) , OnDiskPt::OnDiskWrapper &onDiskWrapper)
{ {
bool nonTerm = false; bool nonTerm = false;
@ -218,7 +218,7 @@ OnDiskPt::WordPtr Tokenize(OnDiskPt::Phrase &phrase
if (addSourceNonTerm) { if (addSourceNonTerm) {
WordPtr word(new Word()); WordPtr word(new Word());
word->CreateFromString(wordStr, onDiskWrapper.GetVocab()); word->CreateFromString(wordStr, onDiskWrapper.GetVocab());
phrase.AddWord(word); phrase.AddWord(word);
} }
wordStr = token.substr(splitPos, tokSize - splitPos); wordStr = token.substr(splitPos, tokSize - splitPos);
@ -237,7 +237,7 @@ OnDiskPt::WordPtr Tokenize(OnDiskPt::Phrase &phrase
phrase.AddWord(word); phrase.AddWord(word);
out = word; out = word;
} }
return out; return out;
} }

View File

@ -26,12 +26,12 @@ typedef std::pair<size_t, size_t> AlignPair;
typedef std::vector<AlignPair> AlignType; typedef std::vector<AlignPair> AlignType;
OnDiskPt::WordPtr Tokenize(OnDiskPt::Phrase &phrase OnDiskPt::WordPtr Tokenize(OnDiskPt::Phrase &phrase
, const std::string &token, bool addSourceNonTerm, bool addTargetNonTerm , const std::string &token, bool addSourceNonTerm, bool addTargetNonTerm
, OnDiskPt::OnDiskWrapper &onDiskWrapper); , OnDiskPt::OnDiskWrapper &onDiskWrapper);
OnDiskPt::PhrasePtr Tokenize(OnDiskPt::SourcePhrase &sourcePhrase, OnDiskPt::TargetPhrase &targetPhrase OnDiskPt::PhrasePtr Tokenize(OnDiskPt::SourcePhrase &sourcePhrase, OnDiskPt::TargetPhrase &targetPhrase
, char *line, OnDiskPt::OnDiskWrapper &onDiskWrapper , char *line, OnDiskPt::OnDiskWrapper &onDiskWrapper
, int numScores , int numScores
, std::vector<float> &misc); , std::vector<float> &misc);
void InsertTargetNonTerminals(std::vector<std::string> &sourceToks, const std::vector<std::string> &targetToks, const AlignType &alignments); void InsertTargetNonTerminals(std::vector<std::string> &sourceToks, const std::vector<std::string> &targetToks, const AlignType &alignments);
void SortAlign(AlignType &alignments); void SortAlign(AlignType &alignments);

View File

@ -3,10 +3,10 @@
namespace OnDiskPt namespace OnDiskPt
{ {
void OnDiskQuery::Tokenize(Phrase &phrase, void OnDiskQuery::Tokenize(Phrase &phrase,
const std::string &token, const std::string &token,
bool addSourceNonTerm, bool addSourceNonTerm,
bool addTargetNonTerm) bool addTargetNonTerm)
{ {
bool nonTerm = false; bool nonTerm = false;
size_t tokSize = token.size(); size_t tokSize = token.size();
@ -50,13 +50,13 @@ void OnDiskQuery::Tokenize(Phrase &phrase,
phrase.AddWord(word); phrase.AddWord(word);
} }
} }
SourcePhrase OnDiskQuery::Tokenize(const std::vector<std::string>& tokens) SourcePhrase OnDiskQuery::Tokenize(const std::vector<std::string>& tokens)
{ {
SourcePhrase sourcePhrase; SourcePhrase sourcePhrase;
if (tokens.size() > 0){ if (tokens.size() > 0) {
std::vector<std::string>::const_iterator token = tokens.begin(); std::vector<std::string>::const_iterator token = tokens.begin();
for (; token + 1 != tokens.end(); ++token){ for (; token + 1 != tokens.end(); ++token) {
Tokenize(sourcePhrase, *token, true, true); Tokenize(sourcePhrase, *token, true, true);
} }
// last position. LHS non-term // last position. LHS non-term
@ -64,22 +64,20 @@ SourcePhrase OnDiskQuery::Tokenize(const std::vector<std::string>& tokens)
} }
return sourcePhrase; return sourcePhrase;
} }
const PhraseNode* OnDiskQuery::Query(const SourcePhrase& sourcePhrase) const PhraseNode* OnDiskQuery::Query(const SourcePhrase& sourcePhrase)
{ {
const PhraseNode *node = &m_wrapper.GetRootSourceNode(); const PhraseNode *node = &m_wrapper.GetRootSourceNode();
assert(node); assert(node);
for (size_t pos = 0; pos < sourcePhrase.GetSize(); ++pos) for (size_t pos = 0; pos < sourcePhrase.GetSize(); ++pos) {
{ const Word &word = sourcePhrase.GetWord(pos);
const Word &word = sourcePhrase.GetWord(pos); node = node->GetChild(word, m_wrapper);
node = node->GetChild(word, m_wrapper); if (node == NULL) {
if (node == NULL) break;
{
break;
}
} }
return node; }
return node;
} }
} }

View File

@ -18,22 +18,21 @@ private:
public: public:
OnDiskQuery(OnDiskWrapper &wrapper):m_wrapper(wrapper){} OnDiskQuery(OnDiskWrapper &wrapper):m_wrapper(wrapper) {}
void Tokenize(Phrase &phrase,
const std::string &token,
bool addSourceNonTerm,
bool addTargetNonTerm);
void Tokenize(Phrase &phrase,
const std::string &token,
bool addSourceNonTerm,
bool addTargetNonTerm);
SourcePhrase Tokenize(const std::vector<std::string>& tokens); SourcePhrase Tokenize(const std::vector<std::string>& tokens);
const PhraseNode *Query(const SourcePhrase& sourcePhrase); const PhraseNode *Query(const SourcePhrase& sourcePhrase);
inline const PhraseNode *Query(const std::vector<std::string>& tokens) inline const PhraseNode *Query(const std::vector<std::string>& tokens) {
{
return Query(Tokenize(tokens)); return Query(Tokenize(tokens));
} }
}; };

View File

@ -204,16 +204,16 @@ Word *OnDiskWrapper::ConvertFromMoses(Moses::FactorDirection /* direction */
Word *newWord = new Word(isNonTerminal); Word *newWord = new Word(isNonTerminal);
stringstream strme; stringstream strme;
size_t factorType = factorsVec[0]; size_t factorType = factorsVec[0];
const Moses::Factor *factor = origWord.GetFactor(factorType); const Moses::Factor *factor = origWord.GetFactor(factorType);
CHECK(factor); CHECK(factor);
strme << factor->GetString(); strme << factor->GetString();
for (size_t ind = 1 ; ind < factorsVec.size() ; ++ind) { for (size_t ind = 1 ; ind < factorsVec.size() ; ++ind) {
size_t factorType = factorsVec[ind]; size_t factorType = factorsVec[ind];
const Moses::Factor *factor = origWord.GetFactor(factorType); const Moses::Factor *factor = origWord.GetFactor(factorType);
if (factor == NULL) if (factor == NULL) {
{ // can have less factors than factorType.size() // can have less factors than factorType.size()
break; break;
} }
CHECK(factor); CHECK(factor);

View File

@ -28,7 +28,7 @@ namespace OnDiskPt
{ {
const float DEFAULT_COUNT = 66666; const float DEFAULT_COUNT = 66666;
/** Global class with misc information need to create and use the on-disk rule table. /** Global class with misc information need to create and use the on-disk rule table.
* 1 object of this class should be instantiated per rule table. * 1 object of this class should be instantiated per rule table.
* Currently only hierarchical/syntax models use this, but can & should be used with pb models too * Currently only hierarchical/syntax models use this, but can & should be used with pb models too
*/ */

View File

@ -38,7 +38,7 @@ size_t PhraseNode::GetNodeSize(size_t numChildren, size_t wordSize, size_t count
} }
PhraseNode::PhraseNode() PhraseNode::PhraseNode()
: m_value(0) : m_value(0)
,m_currChild(NULL) ,m_currChild(NULL)
,m_saved(false) ,m_saved(false)
,m_memLoad(NULL) ,m_memLoad(NULL)
@ -58,7 +58,7 @@ PhraseNode::PhraseNode(UINT64 filePos, OnDiskWrapper &onDiskWrapper)
CHECK(filePos == (UINT64)file.tellg()); CHECK(filePos == (UINT64)file.tellg());
file.read((char*) &m_numChildrenLoad, sizeof(UINT64)); file.read((char*) &m_numChildrenLoad, sizeof(UINT64));
size_t memAlloc = GetNodeSize(m_numChildrenLoad, onDiskWrapper.GetSourceWordSize(), countSize); size_t memAlloc = GetNodeSize(m_numChildrenLoad, onDiskWrapper.GetSourceWordSize(), countSize);
m_memLoad = (char*) malloc(memAlloc); m_memLoad = (char*) malloc(memAlloc);
@ -168,7 +168,7 @@ void PhraseNode::AddTargetPhrase(const SourcePhrase &sourcePhrase, TargetPhrase
void PhraseNode::AddTargetPhrase(size_t pos, const SourcePhrase &sourcePhrase void PhraseNode::AddTargetPhrase(size_t pos, const SourcePhrase &sourcePhrase
, TargetPhrase *targetPhrase, OnDiskWrapper &onDiskWrapper , TargetPhrase *targetPhrase, OnDiskWrapper &onDiskWrapper
, size_t tableLimit, const std::vector<float> &counts, OnDiskPt::PhrasePtr spShort) , size_t tableLimit, const std::vector<float> &counts, OnDiskPt::PhrasePtr spShort)
{ {
size_t phraseSize = sourcePhrase.GetSize(); size_t phraseSize = sourcePhrase.GetSize();
if (pos < phraseSize) { if (pos < phraseSize) {
const Word &word = sourcePhrase.GetWord(pos); const Word &word = sourcePhrase.GetWord(pos);
@ -185,7 +185,7 @@ void PhraseNode::AddTargetPhrase(size_t pos, const SourcePhrase &sourcePhrase
m_currChild = &node; m_currChild = &node;
} }
// keep searching for target phrase node.. // keep searching for target phrase node..
node.AddTargetPhrase(pos + 1, sourcePhrase, targetPhrase, onDiskWrapper, tableLimit, counts, spShort); node.AddTargetPhrase(pos + 1, sourcePhrase, targetPhrase, onDiskWrapper, tableLimit, counts, spShort);
} else { } else {
// drilled down to the right node // drilled down to the right node

View File

@ -53,7 +53,7 @@ protected:
void AddTargetPhrase(size_t pos, const SourcePhrase &sourcePhrase void AddTargetPhrase(size_t pos, const SourcePhrase &sourcePhrase
, TargetPhrase *targetPhrase, OnDiskWrapper &onDiskWrapper , TargetPhrase *targetPhrase, OnDiskWrapper &onDiskWrapper
, size_t tableLimit, const std::vector<float> &counts, OnDiskPt::PhrasePtr spShort); , size_t tableLimit, const std::vector<float> &counts, OnDiskPt::PhrasePtr spShort);
size_t ReadChild(Word &wordFound, UINT64 &childFilePos, const char *mem) const; size_t ReadChild(Word &wordFound, UINT64 &childFilePos, const char *mem) const;
void GetChild(Word &wordFound, UINT64 &childFilePos, size_t ind, OnDiskWrapper &onDiskWrapper) const; void GetChild(Word &wordFound, UINT64 &childFilePos, size_t ind, OnDiskWrapper &onDiskWrapper) const;
public: public:

View File

@ -64,13 +64,13 @@ void TargetPhrase::Create1AlignFromString(const std::string &align1Str)
void TargetPhrase::CreateAlignFromString(const std::string &alignStr) void TargetPhrase::CreateAlignFromString(const std::string &alignStr)
{ {
vector<std::string> alignPairs; vector<std::string> alignPairs;
boost::split(alignPairs, alignStr, boost::is_any_of("\t ")); boost::split(alignPairs, alignStr, boost::is_any_of("\t "));
for (size_t i = 0; i < alignPairs.size(); ++i) { for (size_t i = 0; i < alignPairs.size(); ++i) {
vector<size_t> alignPoints; vector<size_t> alignPoints;
Moses::Tokenize<size_t>(alignPoints, alignPairs[i], "-"); Moses::Tokenize<size_t>(alignPoints, alignPairs[i], "-");
m_align.push_back(pair<size_t, size_t>(alignPoints[0], alignPoints[1]) ); m_align.push_back(pair<size_t, size_t>(alignPoints[0], alignPoints[1]) );
} }
} }
@ -97,16 +97,16 @@ char *TargetPhrase::WriteToMemory(OnDiskWrapper &onDiskWrapper, size_t &memUsed)
{ {
size_t phraseSize = GetSize(); size_t phraseSize = GetSize();
size_t targetWordSize = onDiskWrapper.GetTargetWordSize(); size_t targetWordSize = onDiskWrapper.GetTargetWordSize();
const PhrasePtr sp = GetSourcePhrase(); const PhrasePtr sp = GetSourcePhrase();
size_t spSize = sp->GetSize(); size_t spSize = sp->GetSize();
size_t sourceWordSize = onDiskWrapper.GetSourceWordSize(); size_t sourceWordSize = onDiskWrapper.GetSourceWordSize();
size_t memNeeded = sizeof(UINT64) // num of words size_t memNeeded = sizeof(UINT64) // num of words
+ targetWordSize * phraseSize // actual words. lhs as last words + targetWordSize * phraseSize // actual words. lhs as last words
+ sizeof(UINT64) // num source words + sizeof(UINT64) // num source words
+ sourceWordSize * spSize; // actual source words + sourceWordSize * spSize; // actual source words
memUsed = 0; memUsed = 0;
UINT64 *mem = (UINT64*) malloc(memNeeded); UINT64 *mem = (UINT64*) malloc(memNeeded);
@ -125,13 +125,13 @@ char *TargetPhrase::WriteToMemory(OnDiskWrapper &onDiskWrapper, size_t &memUsed)
char *currPtr = (char*)mem + memUsed; char *currPtr = (char*)mem + memUsed;
UINT64 *memTmp = (UINT64*) currPtr; UINT64 *memTmp = (UINT64*) currPtr;
memTmp[0] = spSize; memTmp[0] = spSize;
memUsed += sizeof(UINT64); memUsed += sizeof(UINT64);
for (size_t pos = 0; pos < spSize; ++pos) { for (size_t pos = 0; pos < spSize; ++pos) {
const Word &word = sp->GetWord(pos); const Word &word = sp->GetWord(pos);
char *currPtr = (char*)mem + memUsed; char *currPtr = (char*)mem + memUsed;
memUsed += word.WriteToMemory((char*) currPtr); memUsed += word.WriteToMemory((char*) currPtr);
} }
CHECK(memUsed == memNeeded); CHECK(memUsed == memNeeded);
return (char *) mem; return (char *) mem;
} }
@ -174,7 +174,7 @@ char *TargetPhrase::WriteOtherInfoToMemory(OnDiskWrapper &onDiskWrapper, size_t
// phrase id // phrase id
memcpy(mem, &m_filePos, sizeof(UINT64)); memcpy(mem, &m_filePos, sizeof(UINT64));
memUsed += sizeof(UINT64); memUsed += sizeof(UINT64);
// align // align
size_t tmp = WriteAlignToMemory(mem + memUsed); size_t tmp = WriteAlignToMemory(mem + memUsed);
memUsed += tmp; memUsed += tmp;
@ -223,7 +223,7 @@ size_t TargetPhrase::WriteScoresToMemory(char *mem) const
} }
Moses::TargetPhrase *TargetPhrase::ConvertToMoses(const std::vector<Moses::FactorType> & inputFactors Moses::TargetPhrase *TargetPhrase::ConvertToMoses(const std::vector<Moses::FactorType> & inputFactors
, const std::vector<Moses::FactorType> &outputFactors , const std::vector<Moses::FactorType> &outputFactors
, const Vocab &vocab , const Vocab &vocab
, const Moses::PhraseDictionary &phraseDict , const Moses::PhraseDictionary &phraseDict
@ -244,7 +244,7 @@ Moses::TargetPhrase *TargetPhrase::ConvertToMoses(const std::vector<Moses::Facto
int index = 0; int index = 0;
Moses::AlignmentInfo::CollType alignTerm, alignNonTerm; Moses::AlignmentInfo::CollType alignTerm, alignNonTerm;
std::set<std::pair<size_t, size_t> > alignmentInfo; std::set<std::pair<size_t, size_t> > alignmentInfo;
const PhrasePtr sp = GetSourcePhrase(); const PhrasePtr sp = GetSourcePhrase();
for (size_t ind = 0; ind < m_align.size(); ++ind) { for (size_t ind = 0; ind < m_align.size(); ++ind) {
const std::pair<size_t, size_t> &entry = m_align[ind]; const std::pair<size_t, size_t> &entry = m_align[ind];
alignmentInfo.insert(entry); alignmentInfo.insert(entry);
@ -252,11 +252,10 @@ Moses::TargetPhrase *TargetPhrase::ConvertToMoses(const std::vector<Moses::Facto
size_t targetPos = entry.second; size_t targetPos = entry.second;
if (GetWord(targetPos).IsNonTerminal()) { if (GetWord(targetPos).IsNonTerminal()) {
alignNonTerm.insert(std::pair<size_t,size_t>(sourcePos, targetPos)); alignNonTerm.insert(std::pair<size_t,size_t>(sourcePos, targetPos));
} else {
alignTerm.insert(std::pair<size_t,size_t>(sourcePos, targetPos));
} }
else {
alignTerm.insert(std::pair<size_t,size_t>(sourcePos, targetPos));
}
} }
ret->SetAlignTerm(alignTerm); ret->SetAlignTerm(alignTerm);
@ -313,7 +312,7 @@ UINT64 TargetPhrase::ReadFromFile(std::fstream &fileTP)
bytesRead += word->ReadFromFile(fileTP); bytesRead += word->ReadFromFile(fileTP);
AddWord(word); AddWord(word);
} }
// read source words // read source words
UINT64 numSourceWords; UINT64 numSourceWords;
fileTP.read((char*) &numSourceWords, sizeof(UINT64)); fileTP.read((char*) &numSourceWords, sizeof(UINT64));
@ -371,7 +370,7 @@ UINT64 TargetPhrase::ReadScoresFromFile(std::fstream &fileTPColl)
void TargetPhrase::DebugPrint(ostream &out, const Vocab &vocab) const void TargetPhrase::DebugPrint(ostream &out, const Vocab &vocab) const
{ {
Phrase::DebugPrint(out, vocab); Phrase::DebugPrint(out, vocab);
for (size_t ind = 0; ind < m_align.size(); ++ind) { for (size_t ind = 0; ind < m_align.size(); ++ind) {
const AlignPair &alignPair = m_align[ind]; const AlignPair &alignPair = m_align[ind];
out << alignPair.first << "-" << alignPair.second << " "; out << alignPair.first << "-" << alignPair.second << " ";

View File

@ -49,7 +49,7 @@ class TargetPhrase: public Phrase
friend std::ostream& operator<<(std::ostream&, const TargetPhrase&); friend std::ostream& operator<<(std::ostream&, const TargetPhrase&);
protected: protected:
AlignType m_align; AlignType m_align;
PhrasePtr m_sourcePhrase; PhrasePtr m_sourcePhrase;
std::vector<float> m_scores; std::vector<float> m_scores;
UINT64 m_filePos; UINT64 m_filePos;
@ -73,10 +73,10 @@ public:
const PhrasePtr GetSourcePhrase() const { const PhrasePtr GetSourcePhrase() const {
return m_sourcePhrase; return m_sourcePhrase;
} }
const std::vector<float> &GetScores() const{ const std::vector<float> &GetScores() const {
return m_scores; return m_scores;
} }
void SetLHS(WordPtr lhs); void SetLHS(WordPtr lhs);
void Create1AlignFromString(const std::string &align1Str); void Create1AlignFromString(const std::string &align1Str);
@ -107,7 +107,7 @@ public:
UINT64 ReadOtherInfoFromFile(UINT64 filePos, std::fstream &fileTPColl); UINT64 ReadOtherInfoFromFile(UINT64 filePos, std::fstream &fileTPColl);
UINT64 ReadFromFile(std::fstream &fileTP); UINT64 ReadFromFile(std::fstream &fileTP);
virtual void DebugPrint(std::ostream &out, const Vocab &vocab) const; virtual void DebugPrint(std::ostream &out, const Vocab &vocab) const;
}; };

View File

@ -82,7 +82,7 @@ void TargetPhraseCollection::Save(OnDiskWrapper &onDiskWrapper)
CollType::iterator iter; CollType::iterator iter;
for (iter = m_coll.begin(); iter != m_coll.end(); ++iter) { for (iter = m_coll.begin(); iter != m_coll.end(); ++iter) {
// save phrase // save phrase
TargetPhrase &targetPhrase = **iter; TargetPhrase &targetPhrase = **iter;
targetPhrase.Save(onDiskWrapper); targetPhrase.Save(onDiskWrapper);
// save coll // save coll
@ -150,9 +150,9 @@ void TargetPhraseCollection::ReadFromFile(size_t tableLimit, UINT64 filePos, OnD
{ {
fstream &fileTPColl = onDiskWrapper.GetFileTargetColl(); fstream &fileTPColl = onDiskWrapper.GetFileTargetColl();
fstream &fileTP = onDiskWrapper.GetFileTargetInd(); fstream &fileTP = onDiskWrapper.GetFileTargetInd();
size_t numScores = onDiskWrapper.GetNumScores(); size_t numScores = onDiskWrapper.GetNumScores();
UINT64 numPhrases; UINT64 numPhrases;
@ -164,9 +164,9 @@ void TargetPhraseCollection::ReadFromFile(size_t tableLimit, UINT64 filePos, OnD
numPhrases = std::min(numPhrases, (UINT64) tableLimit); numPhrases = std::min(numPhrases, (UINT64) tableLimit);
currFilePos += sizeof(UINT64); currFilePos += sizeof(UINT64);
for (size_t ind = 0; ind < numPhrases; ++ind) { for (size_t ind = 0; ind < numPhrases; ++ind) {
TargetPhrase *tp = new TargetPhrase(numScores); TargetPhrase *tp = new TargetPhrase(numScores);
UINT64 sizeOtherInfo = tp->ReadOtherInfoFromFile(currFilePos, fileTPColl); UINT64 sizeOtherInfo = tp->ReadOtherInfoFromFile(currFilePos, fileTPColl);
tp->ReadFromFile(fileTP); tp->ReadFromFile(fileTP);
@ -197,7 +197,7 @@ const TargetPhrase &TargetPhraseCollection::GetTargetPhrase(size_t ind) const
assert(ind < GetSize()); assert(ind < GetSize());
return *m_coll[ind]; return *m_coll[ind];
} }
} }

View File

@ -64,9 +64,9 @@ public:
size_t GetSize() const { size_t GetSize() const {
return m_coll.size(); return m_coll.size();
} }
const TargetPhrase &GetTargetPhrase(size_t ind) const; const TargetPhrase &GetTargetPhrase(size_t ind) const;
UINT64 GetFilePos() const; UINT64 GetFilePos() const;
Moses::TargetPhraseCollection *ConvertToMoses(const std::vector<Moses::FactorType> &inputFactors Moses::TargetPhraseCollection *ConvertToMoses(const std::vector<Moses::FactorType> &inputFactors

View File

@ -44,7 +44,7 @@ bool Vocab::Load(OnDiskWrapper &onDiskWrapper)
// assume contiguous vocab id // assume contiguous vocab id
m_lookup.resize(m_vocabColl.size() + 1); m_lookup.resize(m_vocabColl.size() + 1);
m_nextId = m_lookup.size(); m_nextId = m_lookup.size();
CollType::const_iterator iter; CollType::const_iterator iter;
for (iter = m_vocabColl.begin(); iter != m_vocabColl.end(); ++iter) { for (iter = m_vocabColl.begin(); iter != m_vocabColl.end(); ++iter) {
UINT32 vocabId = iter->second; UINT32 vocabId = iter->second;

View File

@ -97,13 +97,14 @@ size_t Word::ReadFromFile(std::fstream &file)
} }
void Word::ConvertToMoses( void Word::ConvertToMoses(
const std::vector<Moses::FactorType> &outputFactorsVec, const std::vector<Moses::FactorType> &outputFactorsVec,
const Vocab &vocab, const Vocab &vocab,
Moses::Word &overwrite) const { Moses::Word &overwrite) const
{
Moses::FactorCollection &factorColl = Moses::FactorCollection::Instance(); Moses::FactorCollection &factorColl = Moses::FactorCollection::Instance();
overwrite = Moses::Word(m_isNonTerminal); overwrite = Moses::Word(m_isNonTerminal);
// TODO: this conversion should have been done at load time. // TODO: this conversion should have been done at load time.
util::TokenIter<util::SingleCharacter> tok(vocab.GetString(m_vocabId), '|'); util::TokenIter<util::SingleCharacter> tok(vocab.GetString(m_vocabId), '|');
for (std::vector<Moses::FactorType>::const_iterator t = outputFactorsVec.begin(); t != outputFactorsVec.end(); ++t, ++tok) { for (std::vector<Moses::FactorType>::const_iterator t = outputFactorsVec.begin(); t != outputFactorsVec.end(); ++t, ++tok) {
@ -144,14 +145,14 @@ bool Word::operator==(const Word &compare) const
void Word::DebugPrint(ostream &out, const Vocab &vocab) const void Word::DebugPrint(ostream &out, const Vocab &vocab) const
{ {
const string &str = vocab.GetString(m_vocabId); const string &str = vocab.GetString(m_vocabId);
out << str; out << str;
} }
std::ostream& operator<<(std::ostream &out, const Word &word) std::ostream& operator<<(std::ostream &out, const Word &word)
{ {
out << "("; out << "(";
out << word.m_vocabId; out << word.m_vocabId;
out << (word.m_isNonTerminal ? "n" : "t"); out << (word.m_isNonTerminal ? "n" : "t");
out << ")"; out << ")";

View File

@ -50,8 +50,8 @@ public:
{} {}
explicit Word(bool isNonTerminal) explicit Word(bool isNonTerminal)
:m_isNonTerminal(isNonTerminal) :m_isNonTerminal(isNonTerminal)
,m_vocabId(0) ,m_vocabId(0)
{} {}
Word(const Word &copy); Word(const Word &copy);
@ -77,8 +77,7 @@ public:
Moses::Word &overwrite) const; Moses::Word &overwrite) const;
void DebugPrint(std::ostream &out, const Vocab &vocab) const; void DebugPrint(std::ostream &out, const Vocab &vocab) const;
inline const std::string &GetString(const Vocab &vocab) const inline const std::string &GetString(const Vocab &vocab) const {
{
return vocab.GetString(m_vocabId); return vocab.GetString(m_vocabId);
} }

View File

@ -33,8 +33,7 @@ int main(int argc, char **argv)
if(i + 1 == argc) if(i + 1 == argc)
usage(); usage();
ttable = argv[++i]; ttable = argv[++i];
} } else
else
usage(); usage();
} }
@ -55,30 +54,27 @@ int main(int argc, char **argv)
cerr << "line: " << line << endl; cerr << "line: " << line << endl;
const PhraseNode* node = onDiskQuery.Query(tokens); const PhraseNode* node = onDiskQuery.Query(tokens);
if (node) if (node) {
{ // source phrase points to a bunch of rules // source phrase points to a bunch of rules
const TargetPhraseCollection *coll = node->GetTargetPhraseCollection(tableLimit, onDiskWrapper); const TargetPhraseCollection *coll = node->GetTargetPhraseCollection(tableLimit, onDiskWrapper);
string str = coll->GetDebugStr(); string str = coll->GetDebugStr();
cout << "Found " << coll->GetSize() << endl; cout << "Found " << coll->GetSize() << endl;
for (size_t ind = 0; ind < coll->GetSize(); ++ind) for (size_t ind = 0; ind < coll->GetSize(); ++ind) {
{
const TargetPhrase &targetPhrase = coll->GetTargetPhrase(ind); const TargetPhrase &targetPhrase = coll->GetTargetPhrase(ind);
cerr << " "; cerr << " ";
targetPhrase.DebugPrint(cerr, onDiskWrapper.GetVocab()); targetPhrase.DebugPrint(cerr, onDiskWrapper.GetVocab());
cerr << endl; cerr << endl;
} }
} } else {
else
{
cout << "Not found" << endl; cout << "Not found" << endl;
} }
std::cout << '\n'; std::cout << '\n';
std::cout.flush(); std::cout.flush();
} }
cerr << "Finished." << endl; cerr << "Finished." << endl;
} }

View File

@ -5,7 +5,8 @@
#include <stdlib.h> #include <stdlib.h>
#include <cstring> #include <cstring>
namespace { namespace
{
const int LINE_MAX_LENGTH = 10000; const int LINE_MAX_LENGTH = 10000;
@ -84,10 +85,10 @@ void Alignment::Create(const string& fileName)
} }
Alignment::Alignment() Alignment::Alignment()
: m_array(NULL), : m_array(NULL),
m_sentenceEnd(NULL), m_sentenceEnd(NULL),
m_size(0), m_size(0),
m_sentenceCount(0) {} m_sentenceCount(0) {}
Alignment::~Alignment() Alignment::~Alignment()
{ {

View File

@ -23,16 +23,16 @@ enum {
}; };
Mismatch::Mismatch( SuffixArray *sa, TargetCorpus *tc, Alignment *a, INDEX sentence_id, INDEX position, int source_length, int target_length, int source_start, int source_end ) Mismatch::Mismatch( SuffixArray *sa, TargetCorpus *tc, Alignment *a, INDEX sentence_id, INDEX position, int source_length, int target_length, int source_start, int source_end )
:m_suffixArray(sa) :m_suffixArray(sa)
,m_targetCorpus(tc) ,m_targetCorpus(tc)
,m_alignment(a) ,m_alignment(a)
,m_sentence_id(sentence_id) ,m_sentence_id(sentence_id)
,m_source_length(source_length) ,m_source_length(source_length)
,m_target_length(target_length) ,m_target_length(target_length)
,m_source_position(position) ,m_source_position(position)
,m_source_start(source_start) ,m_source_start(source_start)
,m_source_end(source_end) ,m_source_end(source_end)
,m_unaligned(true) ,m_unaligned(true)
{ {
// initialize unaligned indexes // initialize unaligned indexes
for (int i = 0; i < m_source_length; i++) { for (int i = 0; i < m_source_length; i++) {
@ -42,7 +42,7 @@ Mismatch::Mismatch( SuffixArray *sa, TargetCorpus *tc, Alignment *a, INDEX sente
m_target_unaligned[i] = true; m_target_unaligned[i] = true;
} }
m_num_alignment_points = m_num_alignment_points =
m_alignment->GetNumberOfAlignmentPoints( sentence_id ); m_alignment->GetNumberOfAlignmentPoints( sentence_id );
for(INDEX ap=0; ap<m_num_alignment_points; ap++) { for(INDEX ap=0; ap<m_num_alignment_points; ap++) {
m_source_unaligned[ (int)m_alignment->GetSourceWord( sentence_id, ap ) ] = false; m_source_unaligned[ (int)m_alignment->GetSourceWord( sentence_id, ap ) ] = false;
m_target_unaligned[ (int)m_alignment->GetTargetWord( sentence_id, ap ) ] = false; m_target_unaligned[ (int)m_alignment->GetTargetWord( sentence_id, ap ) ] = false;
@ -58,234 +58,235 @@ Mismatch::~Mismatch () {}
void Mismatch::PrintClippedHTML( ostream* out, int width ) void Mismatch::PrintClippedHTML( ostream* out, int width )
{ {
int source_annotation[256], target_annotation[256]; int source_annotation[256], target_annotation[256];
vector< string > label_class; vector< string > label_class;
label_class.push_back( "" ); label_class.push_back( "" );
label_class.push_back( "mismatch_pre_aligned" ); label_class.push_back( "mismatch_pre_aligned" );
label_class.push_back( "mismatch_post_aligned" ); label_class.push_back( "mismatch_post_aligned" );
label_class.push_back( "null_aligned" ); label_class.push_back( "null_aligned" );
label_class.push_back( "mismatch_misaligned" ); label_class.push_back( "mismatch_misaligned" );
label_class.push_back( "mismatch_aligned" ); label_class.push_back( "mismatch_aligned" );
for(int i=0; i<m_source_length;i++) source_annotation[i] = UNANNOTATED; for(int i=0; i<m_source_length; i++) source_annotation[i] = UNANNOTATED;
for(int i=0; i<m_target_length;i++) target_annotation[i] = UNANNOTATED; for(int i=0; i<m_target_length; i++) target_annotation[i] = UNANNOTATED;
if (m_unaligned) {
// find alignment points for prior and next word(s) and
// center target phrase around those.
bool found_aligned = false;
for(int i=1; i<m_source_length && !found_aligned; i++) {
if (m_source_start-i >= 0) {
int word_id = m_source_start-i;
source_annotation[ word_id ] = UNALIGNED;
if (!m_source_unaligned[ word_id ]) {
found_aligned = true;
LabelSourceMatches( source_annotation, target_annotation, word_id, PRE_ALIGNED );
}
}
if (m_source_end+i < m_source_length) { if (m_unaligned) {
int word_id = m_source_end+i; // find alignment points for prior and next word(s) and
source_annotation[ word_id ] = UNALIGNED; // center target phrase around those.
if (!m_source_unaligned[ word_id ]) { bool found_aligned = false;
found_aligned = true; for(int i=1; i<m_source_length && !found_aligned; i++) {
LabelSourceMatches( source_annotation, target_annotation, word_id, POST_ALIGNED ); if (m_source_start-i >= 0) {
} int word_id = m_source_start-i;
} source_annotation[ word_id ] = UNALIGNED;
} if (!m_source_unaligned[ word_id ]) {
found_aligned = true;
} LabelSourceMatches( source_annotation, target_annotation, word_id, PRE_ALIGNED );
// misalignment }
else { }
// label aligned output words
for(int i=m_source_start; i<=m_source_end; i++)
LabelSourceMatches( source_annotation, target_annotation, i, ALIGNED );
// find first and last if (m_source_end+i < m_source_length) {
int target_start = -1; int word_id = m_source_end+i;
int target_end; source_annotation[ word_id ] = UNALIGNED;
for(int i=0; i<m_target_length; i++) if (!m_source_unaligned[ word_id ]) {
if (target_annotation[i] == ALIGNED) { found_aligned = true;
if (target_start == -1) LabelSourceMatches( source_annotation, target_annotation, word_id, POST_ALIGNED );
target_start = i; }
target_end = i; }
} }
// go over all enclosed target words
for(int i=target_start; i<=target_end; i++) { }
// label other target words as unaligned or misaligned // misalignment
if (m_target_unaligned[ i ]) else {
target_annotation[ i ] = UNALIGNED; // label aligned output words
else { for(int i=m_source_start; i<=m_source_end; i++)
if (target_annotation[ i ] != ALIGNED) LabelSourceMatches( source_annotation, target_annotation, i, ALIGNED );
target_annotation[ i ] = MISALIGNED;
// loop over aligned source words // find first and last
for(INDEX ap=0; ap<m_num_alignment_points; ap++) { int target_start = -1;
if (m_alignment->GetTargetWord( m_sentence_id, ap ) == i) { int target_end;
int source_word = m_alignment->GetSourceWord( m_sentence_id, ap ); for(int i=0; i<m_target_length; i++)
// if not part of the source phrase -> also misaligned if (target_annotation[i] == ALIGNED) {
if (source_word < m_source_start || source_word > m_source_end) if (target_start == -1)
source_annotation[ source_word ] = MISALIGNED; target_start = i;
} target_end = i;
} }
} // go over all enclosed target words
} for(int i=target_start; i<=target_end; i++) {
// closure // label other target words as unaligned or misaligned
bool change = true; if (m_target_unaligned[ i ])
while(change) { target_annotation[ i ] = UNALIGNED;
change = false; else {
for(INDEX ap=0; ap<m_num_alignment_points; ap++) { if (target_annotation[ i ] != ALIGNED)
int source_word = m_alignment->GetSourceWord( m_sentence_id, ap ); target_annotation[ i ] = MISALIGNED;
int target_word = m_alignment->GetTargetWord( m_sentence_id, ap ); // loop over aligned source words
if (source_annotation[source_word] != UNANNOTATED && for(INDEX ap=0; ap<m_num_alignment_points; ap++) {
target_annotation[target_word] == UNANNOTATED) { if (m_alignment->GetTargetWord( m_sentence_id, ap ) == i) {
target_annotation[target_word] = MISALIGNED; int source_word = m_alignment->GetSourceWord( m_sentence_id, ap );
change = true; // if not part of the source phrase -> also misaligned
} if (source_word < m_source_start || source_word > m_source_end)
if (source_annotation[source_word] == UNANNOTATED && source_annotation[ source_word ] = MISALIGNED;
target_annotation[target_word] != UNANNOTATED) { }
source_annotation[source_word] = MISALIGNED; }
change = true; }
} }
} // closure
} bool change = true;
} while(change) {
change = false;
// print source for(INDEX ap=0; ap<m_num_alignment_points; ap++) {
// shorten source context if too long int source_word = m_alignment->GetSourceWord( m_sentence_id, ap );
int target_word = m_alignment->GetTargetWord( m_sentence_id, ap );
if (source_annotation[source_word] != UNANNOTATED &&
target_annotation[target_word] == UNANNOTATED) {
target_annotation[target_word] = MISALIGNED;
change = true;
}
if (source_annotation[source_word] == UNANNOTATED &&
target_annotation[target_word] != UNANNOTATED) {
source_annotation[source_word] = MISALIGNED;
change = true;
}
}
}
}
// print source
// shorten source context if too long
int sentence_start = m_source_position - m_source_start; int sentence_start = m_source_position - m_source_start;
int context_space = width/2; int context_space = width/2;
for(int i=m_source_start;i<=m_source_end;i++) for(int i=m_source_start; i<=m_source_end; i++)
context_space -= m_suffixArray->GetWord( sentence_start + i ).size() + 1; context_space -= m_suffixArray->GetWord( sentence_start + i ).size() + 1;
context_space /= 2; context_space /= 2;
int remaining = context_space; int remaining = context_space;
int start_word = m_source_start; int start_word = m_source_start;
for(;start_word>0 && remaining>0; start_word--) for(; start_word>0 && remaining>0; start_word--)
remaining -= m_suffixArray->GetWord( sentence_start + start_word-1 ).size() + 1; remaining -= m_suffixArray->GetWord( sentence_start + start_word-1 ).size() + 1;
if (remaining<0 || start_word == -1) start_word++; if (remaining<0 || start_word == -1) start_word++;
remaining = context_space; remaining = context_space;
int end_word = m_source_end; int end_word = m_source_end;
for(;end_word<m_source_length && remaining>0; end_word++) for(; end_word<m_source_length && remaining>0; end_word++)
remaining -= m_suffixArray->GetWord( sentence_start + end_word ).size() + 1; remaining -= m_suffixArray->GetWord( sentence_start + end_word ).size() + 1;
end_word--; end_word--;
// output with markup // output with markup
*out << "<tr><td class=\"pp_source_left\">"; *out << "<tr><td class=\"pp_source_left\">";
char current_label = UNANNOTATED; char current_label = UNANNOTATED;
if (start_word>0) { if (start_word>0) {
current_label = source_annotation[start_word-1]; current_label = source_annotation[start_word-1];
*out << "... "; *out << "... ";
} }
for(int i=start_word; i<=end_word; i++) { for(int i=start_word; i<=end_word; i++) {
// change to phrase block // change to phrase block
if (i == m_source_start) { if (i == m_source_start) {
if (current_label != UNANNOTATED && i!=start_word) if (current_label != UNANNOTATED && i!=start_word)
*out << "</span>"; *out << "</span>";
*out << "</td><td class=\"pp_source\">"; *out << "</td><td class=\"pp_source\">";
current_label = UNANNOTATED; current_label = UNANNOTATED;
} }
// change to labeled word // change to labeled word
else if (source_annotation[i] != current_label && else if (source_annotation[i] != current_label &&
source_annotation[i] != ALIGNED) { source_annotation[i] != ALIGNED) {
if (current_label != UNANNOTATED && i!=start_word) if (current_label != UNANNOTATED && i!=start_word)
*out << "</span>"; *out << "</span>";
if (source_annotation[i] != UNANNOTATED) if (source_annotation[i] != UNANNOTATED)
*out << "<span class=\"" *out << "<span class=\""
<< label_class[ source_annotation[i] ] << label_class[ source_annotation[i] ]
<< "\">"; << "\">";
current_label = source_annotation[i]; current_label = source_annotation[i];
} }
// output word // output word
*out << m_suffixArray->GetWord( sentence_start + i ) << " "; *out << m_suffixArray->GetWord( sentence_start + i ) << " ";
// change to right context block // change to right context block
if (i == m_source_end) { if (i == m_source_end) {
*out << "</td><td class=\"pp_source_right\">"; *out << "</td><td class=\"pp_source_right\">";
current_label = UNANNOTATED; current_label = UNANNOTATED;
} }
} }
if (current_label != UNANNOTATED && end_word>m_source_end) if (current_label != UNANNOTATED && end_word>m_source_end)
*out << "</span>"; *out << "</span>";
if (end_word<m_source_length-1) if (end_word<m_source_length-1)
*out << "... "; *out << "... ";
// print target // print target
// shorten target context if too long // shorten target context if too long
int target_start = -1; int target_start = -1;
int target_end; int target_end;
for(int i=0; i<m_target_length; i++) for(int i=0; i<m_target_length; i++)
if (target_annotation[i] != UNANNOTATED) { if (target_annotation[i] != UNANNOTATED) {
if (target_start == -1) if (target_start == -1)
target_start = i; target_start = i;
target_end = i; target_end = i;
} }
context_space = width/2; context_space = width/2;
for(int i=target_start;i<=target_end;i++) for(int i=target_start; i<=target_end; i++)
context_space -= m_targetCorpus->GetWord( m_sentence_id, i ).size() + 1; context_space -= m_targetCorpus->GetWord( m_sentence_id, i ).size() + 1;
while (context_space < 0) { // shorten matched part, if too long while (context_space < 0) { // shorten matched part, if too long
context_space += context_space +=
m_targetCorpus->GetWord( m_sentence_id, target_start ).size() + m_targetCorpus->GetWord( m_sentence_id, target_start ).size() +
m_targetCorpus->GetWord( m_sentence_id, target_end ).size() + 2; m_targetCorpus->GetWord( m_sentence_id, target_end ).size() + 2;
target_start++; target_start++;
target_end--; target_end--;
} }
context_space /= 2; context_space /= 2;
remaining = context_space; remaining = context_space;
start_word = target_start; start_word = target_start;
for(;start_word>0 && remaining>0; start_word--) { for(; start_word>0 && remaining>0; start_word--) {
//cerr << "remaining: " << remaining << ", start_word: " << start_word << endl; //cerr << "remaining: " << remaining << ", start_word: " << start_word << endl;
remaining -= m_targetCorpus->GetWord( m_sentence_id, start_word-1 ).size() + 1; remaining -= m_targetCorpus->GetWord( m_sentence_id, start_word-1 ).size() + 1;
} }
if (remaining<0 || start_word == -1) start_word++; if (remaining<0 || start_word == -1) start_word++;
remaining = context_space; remaining = context_space;
end_word = target_end; end_word = target_end;
for(;end_word<m_target_length && remaining>0; end_word++) { for(; end_word<m_target_length && remaining>0; end_word++) {
//cerr << "remaining: " << remaining << ", end_word: " << end_word << endl; //cerr << "remaining: " << remaining << ", end_word: " << end_word << endl;
remaining -= m_targetCorpus->GetWord( m_sentence_id, end_word ).size() + 1; remaining -= m_targetCorpus->GetWord( m_sentence_id, end_word ).size() + 1;
} }
end_word--; end_word--;
// output with markup // output with markup
*out << "</td><td class=\"mismatch_target\">"; *out << "</td><td class=\"mismatch_target\">";
current_label = UNANNOTATED; current_label = UNANNOTATED;
if (start_word>0) { if (start_word>0) {
current_label = target_annotation[start_word-1]; current_label = target_annotation[start_word-1];
*out << "... "; *out << "... ";
} }
for(int i=start_word; i<=end_word; i++) { for(int i=start_word; i<=end_word; i++) {
if (target_annotation[i] != current_label) { if (target_annotation[i] != current_label) {
if (current_label != UNANNOTATED && i!=start_word) if (current_label != UNANNOTATED && i!=start_word)
*out << "</span>"; *out << "</span>";
if (target_annotation[i] != UNANNOTATED) if (target_annotation[i] != UNANNOTATED)
*out << "<span class=\"" *out << "<span class=\""
<< label_class[ target_annotation[i] ] << label_class[ target_annotation[i] ]
<< "\">"; << "\">";
current_label = target_annotation[i]; current_label = target_annotation[i];
} }
// output word // output word
*out << m_targetCorpus->GetWord( m_sentence_id, i ) << " "; *out << m_targetCorpus->GetWord( m_sentence_id, i ) << " ";
} }
if (current_label != UNANNOTATED && end_word>target_end) if (current_label != UNANNOTATED && end_word>target_end)
*out << "</span>"; *out << "</span>";
if (end_word<m_target_length-1) if (end_word<m_target_length-1)
*out << "... "; *out << "... ";
*out << "</td></tr>"; *out << "</td></tr>";
} }
void Mismatch::LabelSourceMatches(int *source_annotation, int *target_annotation, int source_id, int label ) { void Mismatch::LabelSourceMatches(int *source_annotation, int *target_annotation, int source_id, int label )
for(INDEX ap=0; ap<m_num_alignment_points; ap++) { {
if (m_alignment->GetSourceWord( m_sentence_id, ap ) == source_id) { for(INDEX ap=0; ap<m_num_alignment_points; ap++) {
source_annotation[ source_id ] = label; if (m_alignment->GetSourceWord( m_sentence_id, ap ) == source_id) {
target_annotation[ m_alignment->GetTargetWord( m_sentence_id, ap ) ] = label; source_annotation[ source_id ] = label;
} target_annotation[ m_alignment->GetTargetWord( m_sentence_id, ap ) ] = label;
} }
}
} }

View File

@ -34,7 +34,9 @@ public:
Mismatch( SuffixArray *sa, TargetCorpus *tc, Alignment *a, INDEX sentence_id, INDEX position, int source_length, int target_length, int source_start, int source_end ); Mismatch( SuffixArray *sa, TargetCorpus *tc, Alignment *a, INDEX sentence_id, INDEX position, int source_length, int target_length, int source_start, int source_end );
~Mismatch(); ~Mismatch();
bool Unaligned() const { return m_unaligned; } bool Unaligned() const {
return m_unaligned;
}
void PrintClippedHTML(std::ostream* out, int width ); void PrintClippedHTML(std::ostream* out, int width );
void LabelSourceMatches(int *source_annotation, int *target_annotation, int source_id, int label ); void LabelSourceMatches(int *source_annotation, int *target_annotation, int source_id, int label );
}; };

View File

@ -37,7 +37,7 @@ void PhrasePair::Print( ostream* out ) const
INDEX ap_points = m_alignment->GetNumberOfAlignmentPoints( m_sentence_id ); INDEX ap_points = m_alignment->GetNumberOfAlignmentPoints( m_sentence_id );
for( INDEX i=0; i<ap_points; i++) { for( INDEX i=0; i<ap_points; i++) {
*out << " " << m_alignment->GetSourceWord( m_sentence_id, i ) *out << " " << m_alignment->GetSourceWord( m_sentence_id, i )
<< "-" << m_alignment->GetTargetWord( m_sentence_id, i ); << "-" << m_alignment->GetTargetWord( m_sentence_id, i );
} }
*out << endl; *out << endl;
@ -185,27 +185,27 @@ void PhrasePair::PrintClippedHTML( ostream* out, int width ) const
size_t source_pre_width = (source_width-source.size())/2; size_t source_pre_width = (source_width-source.size())/2;
size_t source_post_width = (source_width-source.size()+1)/2; size_t source_post_width = (source_width-source.size()+1)/2;
// if phrase is too long, don't show any context // if phrase is too long, don't show any context
if (source.size() > (size_t)width) { if (source.size() > (size_t)width) {
source_pre_width = 0; source_pre_width = 0;
source_post_width = 0; source_post_width = 0;
} }
// too long -> truncate and add "..." // too long -> truncate and add "..."
if (source_pre.size() > source_pre_width) { if (source_pre.size() > source_pre_width) {
// first skip up to a space // first skip up to a space
while(source_pre_width>0 && while(source_pre_width>0 &&
source_pre.substr(source_pre.size()-source_pre_width,1) != " ") { source_pre.substr(source_pre.size()-source_pre_width,1) != " ") {
source_pre_width--; source_pre_width--;
} }
source_pre = "..." + source_pre.substr( source_pre.size()-source_pre_width, source_pre_width ); source_pre = "..." + source_pre.substr( source_pre.size()-source_pre_width, source_pre_width );
} }
if (source_post.size() > source_post_width) { if (source_post.size() > source_post_width) {
while(source_post_width>0 && while(source_post_width>0 &&
source_post.substr(source_post_width-1,1) != " ") { source_post.substr(source_post_width-1,1) != " ") {
source_post_width--; source_post_width--;
} }
source_post = source_post.substr( 0, source_post_width ) + "..."; source_post = source_post.substr( 0, source_post_width ) + "...";
} }
*out << "<tr><td class=\"pp_source_left\">" *out << "<tr><td class=\"pp_source_left\">"
<< source_pre << source_pre
@ -220,13 +220,13 @@ void PhrasePair::PrintClippedHTML( ostream* out, int width ) const
string target_pre = ""; string target_pre = "";
string target = ""; string target = "";
string target_post = ""; string target_post = "";
size_t target_pre_null_width = 0; size_t target_pre_null_width = 0;
size_t target_post_null_width = 0; size_t target_post_null_width = 0;
for( char i=0; i<m_target_start; i++ ) { for( char i=0; i<m_target_start; i++ ) {
WORD word = m_targetCorpus->GetWord( m_sentence_id, i); WORD word = m_targetCorpus->GetWord( m_sentence_id, i);
target_pre += " " + word; target_pre += " " + word;
if (i >= m_target_start-m_pre_null) if (i >= m_target_start-m_pre_null)
target_pre_null_width += word.size() + 1; target_pre_null_width += word.size() + 1;
} }
for( char i=m_target_start; i<=m_target_end; i++ ) { for( char i=m_target_start; i<=m_target_end; i++ ) {
if (i>m_target_start) target += " "; if (i>m_target_start) target += " ";
@ -234,11 +234,11 @@ void PhrasePair::PrintClippedHTML( ostream* out, int width ) const
} }
for( char i=m_target_end+1; i<m_target_length; i++ ) { for( char i=m_target_end+1; i<m_target_length; i++ ) {
if (i>m_target_end+1) target_post += " "; if (i>m_target_end+1) target_post += " ";
WORD word = m_targetCorpus->GetWord( m_sentence_id, i); WORD word = m_targetCorpus->GetWord( m_sentence_id, i);
target_post += word; target_post += word;
if (i-(m_target_end+1) < m_post_null) { if (i-(m_target_end+1) < m_post_null) {
target_post_null_width += word.size() + 1; target_post_null_width += word.size() + 1;
} }
} }
size_t target_pre_width = (target_width-target.size())/2; size_t target_pre_width = (target_width-target.size())/2;
@ -249,46 +249,45 @@ void PhrasePair::PrintClippedHTML( ostream* out, int width ) const
target_post_width = 0; target_post_width = 0;
} }
if (target_pre.size() < target_pre_width) if (target_pre.size() < target_pre_width)
target_pre_width = target_pre.size(); target_pre_width = target_pre.size();
else { else {
while(target_pre_width>0 && while(target_pre_width>0 &&
target_pre.substr(target_pre.size()-target_pre_width,1) != " ") { target_pre.substr(target_pre.size()-target_pre_width,1) != " ") {
target_pre_width--; target_pre_width--;
} }
target_pre = "..." + target_pre.substr( target_pre.size()-target_pre_width, target_pre_width ); target_pre = "..." + target_pre.substr( target_pre.size()-target_pre_width, target_pre_width );
} }
if (target_post.size() < target_post_width) { if (target_post.size() < target_post_width) {
target_post_width = target_post.size(); target_post_width = target_post.size();
} } else {
else { while(target_post_width>0 &&
while(target_post_width>0 && target_post.substr(target_post_width-1,1) != " ") {
target_post.substr(target_post_width-1,1) != " ") { target_post_width--;
target_post_width--; }
} target_post = target_post.substr( 0, target_post_width ) + "...";
target_post = target_post.substr( 0, target_post_width ) + "..."; }
}
if (m_pre_null) { if (m_pre_null) {
//cerr << endl << "target_pre_width=" << target_pre_width << ", target_pre_null_width=" << target_pre_null_width << ", target_pre.size()=" << target_pre.size() << endl; //cerr << endl << "target_pre_width=" << target_pre_width << ", target_pre_null_width=" << target_pre_null_width << ", target_pre.size()=" << target_pre.size() << endl;
if (target_pre_width < target_pre.size()) if (target_pre_width < target_pre.size())
target_pre_null_width -= target_pre.size()-target_pre_width; target_pre_null_width -= target_pre.size()-target_pre_width;
target_pre = target_pre.substr(0,target_pre_width-target_pre_null_width) target_pre = target_pre.substr(0,target_pre_width-target_pre_null_width)
+ "<span class=\"null_aligned\">" + "<span class=\"null_aligned\">"
+ target_pre.substr(target_pre_width-target_pre_null_width) + target_pre.substr(target_pre_width-target_pre_null_width)
+ "</span>"; + "</span>";
} }
if (m_post_null) { if (m_post_null) {
//cerr << endl << "target_post_width=" << target_post_width << ", target_post_null_width=" << target_post_null_width << ", target_post.size()=" << target_post.size() << endl; //cerr << endl << "target_post_width=" << target_post_width << ", target_post_null_width=" << target_post_null_width << ", target_post.size()=" << target_post.size() << endl;
if (target_post_null_width > target_post.size()) { if (target_post_null_width > target_post.size()) {
target_post_null_width = target_post.size(); target_post_null_width = target_post.size();
} }
target_post = "<span class=\"null_aligned\">" target_post = "<span class=\"null_aligned\">"
+ target_post.substr(0,target_post_null_width) + target_post.substr(0,target_post_null_width)
+ "</span>" + "</span>"
+ target_post.substr(target_post_null_width); + target_post.substr(target_post_null_width);
} }
*out << "<td class=\"pp_target_left\">" *out << "<td class=\"pp_target_left\">"
<< target_pre << target_pre

View File

@ -47,15 +47,15 @@ int PhrasePairCollection::GetCollection( const vector< string >& sourceString )
int sentence_length = m_suffixArray->GetSentenceLength( sentence_id ); int sentence_length = m_suffixArray->GetSentenceLength( sentence_id );
int target_length = m_targetCorpus->GetSentenceLength( sentence_id ); int target_length = m_targetCorpus->GetSentenceLength( sentence_id );
//cerr << "match " << (i-first_match) //cerr << "match " << (i-first_match)
//<< " in sentence " << sentence_id //<< " in sentence " << sentence_id
//<< ", starting at word " << source_start //<< ", starting at word " << source_start
//<< " of " << sentence_length //<< " of " << sentence_length
//<< ". target sentence has " << target_length << " words."; //<< ". target sentence has " << target_length << " words.";
int target_start, target_end, pre_null, post_null; int target_start, target_end, pre_null, post_null;
if (m_alignment->PhraseAlignment( sentence_id, target_length, source_start, source_end, target_start, target_end, pre_null, post_null)) { if (m_alignment->PhraseAlignment( sentence_id, target_length, source_start, source_end, target_start, target_end, pre_null, post_null)) {
//cerr << " aligned to [" << (int)target_start << "," << (int)target_end << "]"; //cerr << " aligned to [" << (int)target_start << "," << (int)target_end << "]";
//cerr << " +(" << (int)pre_null << "," << (int)post_null << ")"; //cerr << " +(" << (int)pre_null << "," << (int)post_null << ")";
bool null_boundary_words = false; bool null_boundary_words = false;
for (int pre = 0; pre <= pre_null && (pre == 0 || null_boundary_words); pre++ ) { for (int pre = 0; pre <= pre_null && (pre == 0 || null_boundary_words); pre++ ) {
for (int post = 0; post <= post_null && (post == 0 || null_boundary_words); post++ ) { for (int post = 0; post <= post_null && (post == 0 || null_boundary_words); post++ ) {
vector< WORD_ID > targetString; vector< WORD_ID > targetString;
@ -75,19 +75,18 @@ int PhrasePairCollection::GetCollection( const vector< string >& sourceString )
m_size++; m_size++;
} }
} }
} else {
//cerr << "mismatch " << (i-first_match)
// << " in sentence " << sentence_id
// << ", starting at word " << source_start
// << " of " << sentence_length
// << ". target sentence has " << target_length << " words.";
Mismatch *mismatch = new Mismatch( m_suffixArray, m_targetCorpus, m_alignment, sentence_id, position, sentence_length, target_length, source_start, source_end );
if (mismatch->Unaligned())
m_unaligned.push_back( mismatch );
else
m_mismatch.push_back( mismatch );
} }
else {
//cerr << "mismatch " << (i-first_match)
// << " in sentence " << sentence_id
// << ", starting at word " << source_start
// << " of " << sentence_length
// << ". target sentence has " << target_length << " words.";
Mismatch *mismatch = new Mismatch( m_suffixArray, m_targetCorpus, m_alignment, sentence_id, position, sentence_length, target_length, source_start, source_end );
if (mismatch->Unaligned())
m_unaligned.push_back( mismatch );
else
m_mismatch.push_back( mismatch );
}
//cerr << endl; //cerr << endl;
if (found > (INDEX)m_max_lookup) { if (found > (INDEX)m_max_lookup) {
@ -111,8 +110,7 @@ void PhrasePairCollection::Print(bool pretty) const
for(int j=0; j<ppWithSameTarget->size() && j<m_max_example; j++, p++ ) { for(int j=0; j<ppWithSameTarget->size() && j<m_max_example; j++, p++ ) {
if (pretty) { if (pretty) {
(*p)->PrintPretty( &cout, 100 ); (*p)->PrintPretty( &cout, 100 );
} } else {
else {
(*p)->Print( &cout ); (*p)->Print( &cout );
} }
if (ppWithSameTarget->size() > m_max_example) { if (ppWithSameTarget->size() > m_max_example) {
@ -125,33 +123,32 @@ void PhrasePairCollection::Print(bool pretty) const
void PhrasePairCollection::PrintHTML() const void PhrasePairCollection::PrintHTML() const
{ {
int pp_target = 0; int pp_target = 0;
bool singleton = false; bool singleton = false;
// loop over all translations // loop over all translations
vector< vector<PhrasePair*> >::const_iterator ppWithSameTarget; vector< vector<PhrasePair*> >::const_iterator ppWithSameTarget;
for( ppWithSameTarget = m_collection.begin(); ppWithSameTarget != m_collection.end() && pp_target<m_max_translation; ppWithSameTarget++, pp_target++ ) { for( ppWithSameTarget = m_collection.begin(); ppWithSameTarget != m_collection.end() && pp_target<m_max_translation; ppWithSameTarget++, pp_target++ ) {
int count = ppWithSameTarget->size(); int count = ppWithSameTarget->size();
if (!singleton) { if (!singleton) {
if (count == 1) { if (count == 1) {
singleton = true; singleton = true;
cout << "<p class=\"pp_singleton_header\">singleton" cout << "<p class=\"pp_singleton_header\">singleton"
<< (m_collection.end() - ppWithSameTarget==1?"":"s") << " (" << (m_collection.end() - ppWithSameTarget==1?"":"s") << " ("
<< (m_collection.end() - ppWithSameTarget) << (m_collection.end() - ppWithSameTarget)
<< "/" << m_size << ")</p>"; << "/" << m_size << ")</p>";
} } else {
else { cout << "<p class=\"pp_target_header\">";
cout << "<p class=\"pp_target_header\">"; (*(ppWithSameTarget->begin()))->PrintTarget( &cout );
(*(ppWithSameTarget->begin()))->PrintTarget( &cout ); cout << " (" << count << "/" << m_size << ")" << endl;
cout << " (" << count << "/" << m_size << ")" << endl; cout << "<p><div id=\"pp_" << pp_target << "\">";
cout << "<p><div id=\"pp_" << pp_target << "\">"; }
} cout << "<table align=\"center\">";
cout << "<table align=\"center\">"; }
}
vector< PhrasePair* >::const_iterator p; vector< PhrasePair* >::const_iterator p;
// loop over all sentences where translation occurs // loop over all sentences where translation occurs
int pp=0; int pp=0;
int i=0; int i=0;
for(p = ppWithSameTarget->begin(); i<10 && pp<count && p != ppWithSameTarget->end(); p++, pp++, i++ ) { for(p = ppWithSameTarget->begin(); i<10 && pp<count && p != ppWithSameTarget->end(); p++, pp++, i++ ) {
(*p)->PrintClippedHTML( &cout, 160 ); (*p)->PrintClippedHTML( &cout, 160 );
if (count > m_max_example) { if (count > m_max_example) {
@ -159,54 +156,54 @@ void PhrasePairCollection::PrintHTML() const
pp += count/m_max_example-1; pp += count/m_max_example-1;
} }
} }
if (i == 10 && pp < count) { if (i == 10 && pp < count) {
// extended table // extended table
cout << "<tr><td colspan=7 align=center class=\"pp_more\" onclick=\"javascript:document.getElementById('pp_" << pp_target << "').style.display = 'none'; document.getElementById('pp_ext_" << pp_target << "').style.display = 'block';\">(more)</td></tr></table></div>"; cout << "<tr><td colspan=7 align=center class=\"pp_more\" onclick=\"javascript:document.getElementById('pp_" << pp_target << "').style.display = 'none'; document.getElementById('pp_ext_" << pp_target << "').style.display = 'block';\">(more)</td></tr></table></div>";
cout << "<div id=\"pp_ext_" << pp_target << "\" style=\"display:none;\";\">"; cout << "<div id=\"pp_ext_" << pp_target << "\" style=\"display:none;\";\">";
cout << "<table align=\"center\">"; cout << "<table align=\"center\">";
for(i=0, pp=0, p = ppWithSameTarget->begin(); i<m_max_example && pp<count && p != ppWithSameTarget->end(); p++, pp++, i++ ) { for(i=0, pp=0, p = ppWithSameTarget->begin(); i<m_max_example && pp<count && p != ppWithSameTarget->end(); p++, pp++, i++ ) {
(*p)->PrintClippedHTML( &cout, 160 ); (*p)->PrintClippedHTML( &cout, 160 );
if (count > m_max_example) { if (count > m_max_example) {
p += count/m_max_example-1; p += count/m_max_example-1;
pp += count/m_max_example-1; pp += count/m_max_example-1;
} }
} }
} }
if (!singleton) cout << "</table></div>\n"; if (!singleton) cout << "</table></div>\n";
if (!singleton && pp_target == 9) { if (!singleton && pp_target == 9) {
cout << "<div id=\"pp_toggle\" onclick=\"javascript:document.getElementById('pp_toggle').style.display = 'none'; document.getElementById('pp_additional').style.display = 'block';\">"; cout << "<div id=\"pp_toggle\" onclick=\"javascript:document.getElementById('pp_toggle').style.display = 'none'; document.getElementById('pp_additional').style.display = 'block';\">";
cout << "<p class=\"pp_target_header\">(more)</p></div>"; cout << "<p class=\"pp_target_header\">(more)</p></div>";
cout << "<div id=\"pp_additional\" style=\"display:none;\";\">"; cout << "<div id=\"pp_additional\" style=\"display:none;\";\">";
} }
} }
if (singleton) cout << "</table></div>\n"; if (singleton) cout << "</table></div>\n";
else if (pp_target > 9) cout << "</div>"; else if (pp_target > 9) cout << "</div>";
size_t max_mismatch = m_max_example/3; size_t max_mismatch = m_max_example/3;
// unaligned phrases // unaligned phrases
if (m_unaligned.size() > 0) { if (m_unaligned.size() > 0) {
cout << "<p class=\"pp_singleton_header\">unaligned" cout << "<p class=\"pp_singleton_header\">unaligned"
<< " (" << (m_unaligned.size()) << ")</p>"; << " (" << (m_unaligned.size()) << ")</p>";
cout << "<table align=\"center\">"; cout << "<table align=\"center\">";
int step_size = 1; int step_size = 1;
if (m_unaligned.size() > max_mismatch) if (m_unaligned.size() > max_mismatch)
step_size = (m_unaligned.size()+max_mismatch-1) / max_mismatch; step_size = (m_unaligned.size()+max_mismatch-1) / max_mismatch;
for(size_t i=0;i<m_unaligned.size();i+=step_size) for(size_t i=0; i<m_unaligned.size(); i+=step_size)
m_unaligned[i]->PrintClippedHTML( &cout, 160 ); m_unaligned[i]->PrintClippedHTML( &cout, 160 );
cout << "</table>"; cout << "</table>";
} }
// mismatched phrases // mismatched phrases
if (m_mismatch.size() > 0) { if (m_mismatch.size() > 0) {
cout << "<p class=\"pp_singleton_header\">mismatched" cout << "<p class=\"pp_singleton_header\">mismatched"
<< " (" << (m_mismatch.size()) << ")</p>"; << " (" << (m_mismatch.size()) << ")</p>";
cout << "<table align=\"center\">"; cout << "<table align=\"center\">";
int step_size = 1; int step_size = 1;
if (m_mismatch.size() > max_mismatch) if (m_mismatch.size() > max_mismatch)
step_size = (m_mismatch.size()+max_mismatch-1) / max_mismatch; step_size = (m_mismatch.size()+max_mismatch-1) / max_mismatch;
for(size_t i=0;i<m_mismatch.size();i+=step_size) for(size_t i=0; i<m_mismatch.size(); i+=step_size)
m_mismatch[i]->PrintClippedHTML( &cout, 160 ); m_mismatch[i]->PrintClippedHTML( &cout, 160 );
cout << "</table>"; cout << "</table>";
} }
} }

View File

@ -5,7 +5,8 @@
#include <stdlib.h> #include <stdlib.h>
#include <cstring> #include <cstring>
namespace { namespace
{
const int LINE_MAX_LENGTH = 10000; const int LINE_MAX_LENGTH = 10000;
@ -14,15 +15,15 @@ const int LINE_MAX_LENGTH = 10000;
using namespace std; using namespace std;
SuffixArray::SuffixArray() SuffixArray::SuffixArray()
: m_array(NULL), : m_array(NULL),
m_index(NULL), m_index(NULL),
m_buffer(NULL), m_buffer(NULL),
m_wordInSentence(NULL), m_wordInSentence(NULL),
m_sentence(NULL), m_sentence(NULL),
m_sentenceLength(NULL), m_sentenceLength(NULL),
m_vcb(), m_vcb(),
m_size(0), m_size(0),
m_sentenceCount(0) { } m_sentenceCount(0) { }
SuffixArray::~SuffixArray() SuffixArray::~SuffixArray()
{ {

View File

@ -5,7 +5,8 @@
#include <stdlib.h> #include <stdlib.h>
#include <cstring> #include <cstring>
namespace { namespace
{
const int LINE_MAX_LENGTH = 10000; const int LINE_MAX_LENGTH = 10000;
@ -14,11 +15,11 @@ const int LINE_MAX_LENGTH = 10000;
using namespace std; using namespace std;
TargetCorpus::TargetCorpus() TargetCorpus::TargetCorpus()
: m_array(NULL), : m_array(NULL),
m_sentenceEnd(NULL), m_sentenceEnd(NULL),
m_vcb(), m_vcb(),
m_size(0), m_size(0),
m_sentenceCount(0) {} m_sentenceCount(0) {}
TargetCorpus::~TargetCorpus() TargetCorpus::~TargetCorpus()
{ {

View File

@ -2,7 +2,8 @@
#include "Vocabulary.h" #include "Vocabulary.h"
#include <fstream> #include <fstream>
namespace { namespace
{
const int MAX_LENGTH = 10000; const int MAX_LENGTH = 10000;

View File

@ -1,4 +1,4 @@
/* /*
base64.cpp and base64.h base64.cpp and base64.h
Copyright (C) 2004-2008 René Nyffenegger Copyright (C) 2004-2008 René Nyffenegger
@ -28,17 +28,19 @@
#include "base64.h" #include "base64.h"
#include <iostream> #include <iostream>
static const std::string base64_chars = static const std::string base64_chars =
"ABCDEFGHIJKLMNOPQRSTUVWXYZ" "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
"abcdefghijklmnopqrstuvwxyz" "abcdefghijklmnopqrstuvwxyz"
"0123456789+/"; "0123456789+/";
static inline bool is_base64(unsigned char c) { static inline bool is_base64(unsigned char c)
{
return (isalnum(c) || (c == '+') || (c == '/')); return (isalnum(c) || (c == '+') || (c == '/'));
} }
std::string base64_encode(unsigned char const* bytes_to_encode, unsigned int in_len) { std::string base64_encode(unsigned char const* bytes_to_encode, unsigned int in_len)
{
std::string ret; std::string ret;
int i = 0; int i = 0;
int j = 0; int j = 0;
@ -59,8 +61,7 @@ std::string base64_encode(unsigned char const* bytes_to_encode, unsigned int in_
} }
} }
if (i) if (i) {
{
for(j = i; j < 3; j++) for(j = i; j < 3; j++)
char_array_3[j] = '\0'; char_array_3[j] = '\0';
@ -81,7 +82,8 @@ std::string base64_encode(unsigned char const* bytes_to_encode, unsigned int in_
} }
std::string base64_decode(std::string const& encoded_string) { std::string base64_decode(std::string const& encoded_string)
{
int in_len = encoded_string.size(); int in_len = encoded_string.size();
int i = 0; int i = 0;
int j = 0; int j = 0;
@ -90,7 +92,8 @@ std::string base64_decode(std::string const& encoded_string) {
std::string ret; std::string ret;
while (in_len-- && ( encoded_string[in_] != '=') && is_base64(encoded_string[in_])) { while (in_len-- && ( encoded_string[in_] != '=') && is_base64(encoded_string[in_])) {
char_array_4[i++] = encoded_string[in_]; in_++; char_array_4[i++] = encoded_string[in_];
in_++;
if (i ==4) { if (i ==4) {
for (i = 0; i <4; i++) for (i = 0; i <4; i++)
char_array_4[i] = base64_chars.find(char_array_4[i]); char_array_4[i] = base64_chars.find(char_array_4[i]);

View File

@ -150,22 +150,19 @@ int main(int argc, char* argv[])
cout << "TOTAL: " << total << endl; cout << "TOTAL: " << total << endl;
if (htmlFlag) { if (htmlFlag) {
ppCollection.PrintHTML(); ppCollection.PrintHTML();
} } else {
else { ppCollection.Print(prettyFlag);
ppCollection.Print(prettyFlag);
} }
cout << "-|||- BICONCOR END -|||-" << endl << flush; cout << "-|||- BICONCOR END -|||-" << endl << flush;
} }
} } else if (queryFlag) {
else if (queryFlag) {
cerr << "query is " << query << endl; cerr << "query is " << query << endl;
vector< string > queryString = alignment.Tokenize( query.c_str() ); vector< string > queryString = alignment.Tokenize( query.c_str() );
PhrasePairCollection ppCollection( &suffixArray, &targetCorpus, &alignment, max_translation, max_example ); PhrasePairCollection ppCollection( &suffixArray, &targetCorpus, &alignment, max_translation, max_example );
ppCollection.GetCollection( queryString ); ppCollection.GetCollection( queryString );
if (htmlFlag) { if (htmlFlag) {
ppCollection.PrintHTML(); ppCollection.PrintHTML();
} } else {
else {
ppCollection.Print(prettyFlag); ppCollection.Print(prettyFlag);
} }
} }

View File

@ -29,155 +29,158 @@ using namespace std;
namespace Moses namespace Moses
{ {
PhraseDictionaryInterpolated::PhraseDictionaryInterpolated PhraseDictionaryInterpolated::PhraseDictionaryInterpolated
(size_t numScoreComponent,size_t numInputScores,const PhraseDictionaryFeature* feature): (size_t numScoreComponent,size_t numInputScores,const PhraseDictionaryFeature* feature):
PhraseDictionary(numScoreComponent,feature), PhraseDictionary(numScoreComponent,feature),
m_targetPhrases(NULL), m_targetPhrases(NULL),
m_languageModels(NULL) {} m_languageModels(NULL) {}
bool PhraseDictionaryInterpolated::Load( bool PhraseDictionaryInterpolated::Load(
const std::vector<FactorType> &input const std::vector<FactorType> &input
, const std::vector<FactorType> &output , const std::vector<FactorType> &output
, const std::vector<std::string>& config , const std::vector<std::string>& config
, const std::vector<float> &weightT , const std::vector<float> &weightT
, size_t tableLimit , size_t tableLimit
, const LMList &languageModels , const LMList &languageModels
, float weightWP) { , float weightWP)
{
m_languageModels = &languageModels; m_languageModels = &languageModels;
m_weightT = weightT; m_weightT = weightT;
m_tableLimit = tableLimit; m_tableLimit = tableLimit;
m_weightWP = weightWP; m_weightWP = weightWP;
//The config should be as follows: //The config should be as follows:
//0-3: type factor factor num-components (as usual) //0-3: type factor factor num-components (as usual)
//4: combination mode (e.g. naive) //4: combination mode (e.g. naive)
//5-(length-2): List of phrase-table files //5-(length-2): List of phrase-table files
//length-1: Weight string, in the same format as used for tmcombine //length-1: Weight string, in the same format as used for tmcombine
UTIL_THROW_IF(config.size() < 7, util::Exception, "Missing fields from phrase table configuration: expected at least 7"); UTIL_THROW_IF(config.size() < 7, util::Exception, "Missing fields from phrase table configuration: expected at least 7");
UTIL_THROW_IF(config[4] != "naive", util::Exception, "Unsupported combination mode: '" << config[4] << "'"); UTIL_THROW_IF(config[4] != "naive", util::Exception, "Unsupported combination mode: '" << config[4] << "'");
// Create the dictionaries
for (size_t i = 5; i < config.size()-1; ++i) {
m_dictionaries.push_back(DictionaryHandle(new PhraseDictionaryTreeAdaptor(
GetFeature()->GetNumScoreComponents(),
GetFeature()->GetNumInputScores(),
GetFeature())));
bool ret = m_dictionaries.back()->Load(
input,
output,
config[i],
weightT,
0,
languageModels,
weightWP);
if (!ret) return ret;
}
//Parse the weight strings // Create the dictionaries
for (util::TokenIter<util::SingleCharacter, false> featureWeights(config.back(), util::SingleCharacter(';')); featureWeights; ++featureWeights) { for (size_t i = 5; i < config.size()-1; ++i) {
m_weights.push_back(vector<float>()); m_dictionaries.push_back(DictionaryHandle(new PhraseDictionaryTreeAdaptor(
float sum = 0; GetFeature()->GetNumScoreComponents(),
for (util::TokenIter<util::SingleCharacter, false> tableWeights(*featureWeights, util::SingleCharacter(',')); tableWeights; ++tableWeights) { GetFeature()->GetNumInputScores(),
const float weight = boost::lexical_cast<float>(*tableWeights); GetFeature())));
m_weights.back().push_back(weight); bool ret = m_dictionaries.back()->Load(
sum += weight; input,
} output,
UTIL_THROW_IF(m_weights.back().size() != m_dictionaries.size(), util::Exception, config[i],
"Number of weights (" << m_weights.back().size() << weightT,
") does not match number of dictionaries to combine (" << m_dictionaries.size() << ")"); 0,
UTIL_THROW_IF(abs(sum - 1) > 0.01, util::Exception, "Weights not normalised"); languageModels,
weightWP);
} if (!ret) return ret;
//check number of weight sets. Make sure there is a weight for every score component
//except for the last - which is assumed to be the phrase penalty.
UTIL_THROW_IF(m_weights.size() != 1 && m_weights.size() != GetFeature()->GetNumScoreComponents()-1, util::Exception, "Unexpected number of weight sets");
//if 1 weight set, then repeat
if (m_weights.size() == 1) {
while(m_weights.size() < GetFeature()->GetNumScoreComponents()-1) {
m_weights.push_back(m_weights[0]);
}
}
return true;
} }
void PhraseDictionaryInterpolated::InitializeForInput(InputType const& source) { //Parse the weight strings
for (size_t i = 0; i < m_dictionaries.size(); ++i) { for (util::TokenIter<util::SingleCharacter, false> featureWeights(config.back(), util::SingleCharacter(';')); featureWeights; ++featureWeights) {
m_dictionaries[i]->InitializeForInput(source); m_weights.push_back(vector<float>());
float sum = 0;
for (util::TokenIter<util::SingleCharacter, false> tableWeights(*featureWeights, util::SingleCharacter(',')); tableWeights; ++tableWeights) {
const float weight = boost::lexical_cast<float>(*tableWeights);
m_weights.back().push_back(weight);
sum += weight;
}
UTIL_THROW_IF(m_weights.back().size() != m_dictionaries.size(), util::Exception,
"Number of weights (" << m_weights.back().size() <<
") does not match number of dictionaries to combine (" << m_dictionaries.size() << ")");
UTIL_THROW_IF(abs(sum - 1) > 0.01, util::Exception, "Weights not normalised");
}
//check number of weight sets. Make sure there is a weight for every score component
//except for the last - which is assumed to be the phrase penalty.
UTIL_THROW_IF(m_weights.size() != 1 && m_weights.size() != GetFeature()->GetNumScoreComponents()-1, util::Exception, "Unexpected number of weight sets");
//if 1 weight set, then repeat
if (m_weights.size() == 1) {
while(m_weights.size() < GetFeature()->GetNumScoreComponents()-1) {
m_weights.push_back(m_weights[0]);
} }
} }
typedef return true;
boost::unordered_set<TargetPhrase*,PhrasePtrHasher,PhrasePtrComparator> PhraseSet; }
const TargetPhraseCollection* void PhraseDictionaryInterpolated::InitializeForInput(InputType const& source)
PhraseDictionaryInterpolated::GetTargetPhraseCollection(const Phrase& src) const { {
for (size_t i = 0; i < m_dictionaries.size(); ++i) {
m_dictionaries[i]->InitializeForInput(source);
}
}
delete m_targetPhrases; typedef
m_targetPhrases = new TargetPhraseCollection(); boost::unordered_set<TargetPhrase*,PhrasePtrHasher,PhrasePtrComparator> PhraseSet;
PhraseSet allPhrases;
vector<PhraseSet> phrasesByTable(m_dictionaries.size());
for (size_t i = 0; i < m_dictionaries.size(); ++i) { const TargetPhraseCollection*
const TargetPhraseCollection* phrases = m_dictionaries[i]->GetTargetPhraseCollection(src); PhraseDictionaryInterpolated::GetTargetPhraseCollection(const Phrase& src) const
if (phrases) { {
for (TargetPhraseCollection::const_iterator j = phrases->begin();
j != phrases->end(); ++j) { delete m_targetPhrases;
allPhrases.insert(*j); m_targetPhrases = new TargetPhraseCollection();
phrasesByTable[i].insert(*j); PhraseSet allPhrases;
vector<PhraseSet> phrasesByTable(m_dictionaries.size());
for (size_t i = 0; i < m_dictionaries.size(); ++i) {
const TargetPhraseCollection* phrases = m_dictionaries[i]->GetTargetPhraseCollection(src);
if (phrases) {
for (TargetPhraseCollection::const_iterator j = phrases->begin();
j != phrases->end(); ++j) {
allPhrases.insert(*j);
phrasesByTable[i].insert(*j);
}
}
}
ScoreComponentCollection sparseVector;
for (PhraseSet::const_iterator i = allPhrases.begin(); i != allPhrases.end(); ++i) {
TargetPhrase* combinedPhrase = new TargetPhrase((Phrase)**i);
//combinedPhrase->ResetScore();
//cerr << *combinedPhrase << " " << combinedPhrase->GetScoreBreakdown() << endl;
combinedPhrase->SetSourcePhrase((*i)->GetSourcePhrase());
combinedPhrase->SetAlignTerm(&((*i)->GetAlignTerm()));
combinedPhrase->SetAlignNonTerm(&((*i)->GetAlignTerm()));
Scores combinedScores(GetFeature()->GetNumScoreComponents());
for (size_t j = 0; j < phrasesByTable.size(); ++j) {
PhraseSet::const_iterator tablePhrase = phrasesByTable[j].find(combinedPhrase);
if (tablePhrase != phrasesByTable[j].end()) {
Scores tableScores = (*tablePhrase)->GetScoreBreakdown()
.GetScoresForProducer(GetFeature());
//cerr << "Scores from " << j << " table: ";
for (size_t k = 0; k < tableScores.size()-1; ++k) {
//cerr << tableScores[k] << "(" << exp(tableScores[k]) << ") ";
combinedScores[k] += m_weights[k][j] * exp(tableScores[k]);
//cerr << m_weights[k][j] * exp(tableScores[k]) << " ";
} }
//cerr << endl;
} }
} }
ScoreComponentCollection sparseVector; //map back to log space
for (PhraseSet::const_iterator i = allPhrases.begin(); i != allPhrases.end(); ++i) { //cerr << "Combined ";
TargetPhrase* combinedPhrase = new TargetPhrase((Phrase)**i); for (size_t k = 0; k < combinedScores.size()-1; ++k) {
//combinedPhrase->ResetScore(); //cerr << combinedScores[k] << " ";
//cerr << *combinedPhrase << " " << combinedPhrase->GetScoreBreakdown() << endl; combinedScores[k] = log(combinedScores[k]);
combinedPhrase->SetSourcePhrase((*i)->GetSourcePhrase()); //cerr << combinedScores[k] << " ";
combinedPhrase->SetAlignTerm(&((*i)->GetAlignTerm()));
combinedPhrase->SetAlignNonTerm(&((*i)->GetAlignTerm()));
Scores combinedScores(GetFeature()->GetNumScoreComponents());
for (size_t j = 0; j < phrasesByTable.size(); ++j) {
PhraseSet::const_iterator tablePhrase = phrasesByTable[j].find(combinedPhrase);
if (tablePhrase != phrasesByTable[j].end()) {
Scores tableScores = (*tablePhrase)->GetScoreBreakdown()
.GetScoresForProducer(GetFeature());
//cerr << "Scores from " << j << " table: ";
for (size_t k = 0; k < tableScores.size()-1; ++k) {
//cerr << tableScores[k] << "(" << exp(tableScores[k]) << ") ";
combinedScores[k] += m_weights[k][j] * exp(tableScores[k]);
//cerr << m_weights[k][j] * exp(tableScores[k]) << " ";
}
//cerr << endl;
}
}
//map back to log space
//cerr << "Combined ";
for (size_t k = 0; k < combinedScores.size()-1; ++k) {
//cerr << combinedScores[k] << " ";
combinedScores[k] = log(combinedScores[k]);
//cerr << combinedScores[k] << " ";
}
//cerr << endl;
combinedScores.back() = 1; //assume last is penalty
combinedPhrase->SetScore(
GetFeature(),
combinedScores,
sparseVector,
m_weightT,
m_weightWP,
*m_languageModels);
//cerr << *combinedPhrase << " " << combinedPhrase->GetScoreBreakdown() << endl;
m_targetPhrases->Add(combinedPhrase);
} }
//cerr << endl;
m_targetPhrases->Prune(true,m_tableLimit); combinedScores.back() = 1; //assume last is penalty
combinedPhrase->SetScore(
GetFeature(),
return m_targetPhrases; combinedScores,
sparseVector,
m_weightT,
m_weightWP,
*m_languageModels);
//cerr << *combinedPhrase << " " << combinedPhrase->GetScoreBreakdown() << endl;
m_targetPhrases->Add(combinedPhrase);
} }
m_targetPhrases->Prune(true,m_tableLimit);
return m_targetPhrases;
}
} }

View File

@ -34,12 +34,14 @@ namespace Moses
**/ **/
class PhraseDictionaryInterpolated : public PhraseDictionary class PhraseDictionaryInterpolated : public PhraseDictionary
{ {
public: public:
PhraseDictionaryInterpolated PhraseDictionaryInterpolated
(size_t numScoreComponent,size_t numInputScores,const PhraseDictionaryFeature* feature); (size_t numScoreComponent,size_t numInputScores,const PhraseDictionaryFeature* feature);
// Releases the target phrase collection held in m_targetPhrases
// (deleting a null pointer is a no-op).
virtual ~PhraseDictionaryInterpolated()
{
  delete m_targetPhrases;
}
// initialize ... // initialize ...
bool Load(const std::vector<FactorType> &input bool Load(const std::vector<FactorType> &input
@ -58,7 +60,7 @@ class PhraseDictionaryInterpolated : public PhraseDictionary
throw std::logic_error("PhraseDictionaryInterpolated.CreateRuleLookupManager() Not implemented"); throw std::logic_error("PhraseDictionaryInterpolated.CreateRuleLookupManager() Not implemented");
} }
private: private:
typedef boost::shared_ptr<PhraseDictionaryTreeAdaptor> DictionaryHandle; typedef boost::shared_ptr<PhraseDictionaryTreeAdaptor> DictionaryHandle;
std::vector<DictionaryHandle> m_dictionaries; std::vector<DictionaryHandle> m_dictionaries;

View File

@ -31,7 +31,8 @@ BOOST_AUTO_TEST_SUITE(phrase_length_feature)
//TODO: Factor out setup code so that it can be reused //TODO: Factor out setup code so that it can be reused
static Word MakeWord(string text) { static Word MakeWord(string text)
{
FactorCollection &factorCollection = FactorCollection::Instance(); FactorCollection &factorCollection = FactorCollection::Instance();
const Factor* f = factorCollection.AddFactor(Input,0,text); const Factor* f = factorCollection.AddFactor(Input,0,text);
Word w; Word w;
@ -40,7 +41,8 @@ static Word MakeWord(string text) {
} }
BOOST_AUTO_TEST_CASE(evaluate) { BOOST_AUTO_TEST_CASE(evaluate)
{
Word w1 = MakeWord("w1"); Word w1 = MakeWord("w1");
Word w2 = MakeWord("y2"); Word w2 = MakeWord("y2");
Word w3 = MakeWord("x3"); Word w3 = MakeWord("x3");
@ -78,7 +80,7 @@ BOOST_AUTO_TEST_CASE(evaluate) {
PhraseBasedFeatureContext context1(topt1,sentence); PhraseBasedFeatureContext context1(topt1,sentence);
PhraseBasedFeatureContext context2(topt2,sentence); PhraseBasedFeatureContext context2(topt2,sentence);
PhraseBasedFeatureContext context3(topt3,sentence); PhraseBasedFeatureContext context3(topt3,sentence);
PhraseLengthFeature plf; PhraseLengthFeature plf;
ScoreComponentCollection acc1,acc2,acc3; ScoreComponentCollection acc1,acc2,acc3;

View File

@ -34,12 +34,13 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
using namespace std; using namespace std;
using namespace Moses; using namespace Moses;
namespace MosesTest namespace MosesTest
{ {
BOOST_AUTO_TEST_SUITE(target_bigram) BOOST_AUTO_TEST_SUITE(target_bigram)
static Word MakeWord(string text) { static Word MakeWord(string text)
{
FactorCollection &factorCollection = FactorCollection::Instance(); FactorCollection &factorCollection = FactorCollection::Instance();
const Factor* f = factorCollection.AddFactor(Input,0,text); const Factor* f = factorCollection.AddFactor(Input,0,text);
Word w; Word w;
@ -47,34 +48,32 @@ static Word MakeWord(string text) {
return w; return w;
} }
class VocabFileFixture { class VocabFileFixture
public: {
template<class I> public:
VocabFileFixture(I begin, I end) template<class I>
{ VocabFileFixture(I begin, I end) {
char name[] = "TargetBigramXXXXXX"; char name[] = "TargetBigramXXXXXX";
int fd = mkstemp(name); int fd = mkstemp(name);
BOOST_CHECK(fd != -1); BOOST_CHECK(fd != -1);
BOOST_CHECK(!close(fd)); BOOST_CHECK(!close(fd));
filename = name; filename = name;
ofstream out(name); ofstream out(name);
for (I i = begin; i != end; ++i) for (I i = begin; i != end; ++i) {
{ out << *i << endl;
out << *i << endl;
}
out.close();
} }
out.close();
}
~VocabFileFixture() ~VocabFileFixture() {
{ BOOST_CHECK(!remove(filename.c_str()));
BOOST_CHECK(!remove(filename.c_str())); }
}
string filename; string filename;
}; };
/* /*
BOOST_AUTO_TEST_CASE(Test2) BOOST_AUTO_TEST_CASE(Test2)
{ {
HypothesisFixture hypos; HypothesisFixture hypos;
cerr << hypos.empty() << ", " << *hypos.empty() << endl; cerr << hypos.empty() << ", " << *hypos.empty() << endl;
@ -113,7 +112,7 @@ BOOST_AUTO_TEST_CASE(score_components)
ScoreProducer::unlimited); ScoreProducer::unlimited);
} }
BOOST_AUTO_TEST_CASE(empty_hypo) BOOST_AUTO_TEST_CASE(empty_hypo)
{ {
Sentence s; Sentence s;
TargetBigramFeature tbf; TargetBigramFeature tbf;
@ -124,7 +123,7 @@ BOOST_AUTO_TEST_CASE(empty_hypo)
} }
//Test of evaluate() where a vocab is specified //Test of evaluate() where a vocab is specified
BOOST_AUTO_TEST_CASE(evaluate_vocab) BOOST_AUTO_TEST_CASE(evaluate_vocab)
{ {
string vocab[] = {"i", "do"}; string vocab[] = {"i", "do"};
VocabFileFixture vocabFile(vocab,vocab+2); VocabFileFixture vocabFile(vocab,vocab+2);
@ -156,7 +155,7 @@ BOOST_AUTO_TEST_CASE(evaluate_all)
BOOST_CHECK_EQUAL(scc.GetScoreForProducer(&tbf, "do:not"),1); BOOST_CHECK_EQUAL(scc.GetScoreForProducer(&tbf, "do:not"),1);
BOOST_CHECK_EQUAL(scc.GetScoreForProducer(&tbf, "not:</s>"),0); BOOST_CHECK_EQUAL(scc.GetScoreForProducer(&tbf, "not:</s>"),0);
BOOST_CHECK(! currState->Compare(TargetBigramState(MakeWord("not")))); BOOST_CHECK(! currState->Compare(TargetBigramState(MakeWord("not"))));
} }
BOOST_AUTO_TEST_CASE(evaluate_empty) BOOST_AUTO_TEST_CASE(evaluate_empty)
@ -171,7 +170,7 @@ BOOST_AUTO_TEST_CASE(evaluate_empty)
BOOST_CHECK(! currState->Compare(*prevState)); BOOST_CHECK(! currState->Compare(*prevState));
} }
BOOST_AUTO_TEST_CASE(evaluate_eos) BOOST_AUTO_TEST_CASE(evaluate_eos)
{ {
HypothesisFixture hypos; HypothesisFixture hypos;
TargetBigramFeature tbf; TargetBigramFeature tbf;

View File

@ -18,7 +18,8 @@
using namespace std; using namespace std;
namespace { namespace
{
// configure regularisation // configure regularisation
const char KEY_REFLEN[] = "reflen"; const char KEY_REFLEN[] = "reflen";
@ -33,8 +34,9 @@ namespace MosesTuning
BleuScorer::BleuScorer(const string& config) BleuScorer::BleuScorer(const string& config)
: StatisticsBasedScorer("BLEU", config), : StatisticsBasedScorer("BLEU", config),
m_ref_length_type(CLOSEST) { m_ref_length_type(CLOSEST)
{
const string reflen = getConfig(KEY_REFLEN, REFLEN_CLOSEST); const string reflen = getConfig(KEY_REFLEN, REFLEN_CLOSEST);
if (reflen == REFLEN_AVERAGE) { if (reflen == REFLEN_AVERAGE) {
m_ref_length_type = AVERAGE; m_ref_length_type = AVERAGE;
@ -101,7 +103,8 @@ void BleuScorer::setReferenceFiles(const vector<string>& referenceFiles)
} }
} }
bool BleuScorer::OpenReference(const char* filename, size_t file_id) { bool BleuScorer::OpenReference(const char* filename, size_t file_id)
{
ifstream ifs(filename); ifstream ifs(filename);
if (!ifs) { if (!ifs) {
cerr << "Cannot open " << filename << endl; cerr << "Cannot open " << filename << endl;
@ -110,7 +113,8 @@ bool BleuScorer::OpenReference(const char* filename, size_t file_id) {
return OpenReferenceStream(&ifs, file_id); return OpenReferenceStream(&ifs, file_id);
} }
bool BleuScorer::OpenReferenceStream(istream* is, size_t file_id) { bool BleuScorer::OpenReferenceStream(istream* is, size_t file_id)
{
if (is == NULL) return false; if (is == NULL) return false;
string line; string line;
@ -203,25 +207,27 @@ statscore_t BleuScorer::calculateScore(const vector<int>& comps) const
return exp(logbleu); return exp(logbleu);
} }
int BleuScorer::CalcReferenceLength(size_t sentence_id, size_t length) { int BleuScorer::CalcReferenceLength(size_t sentence_id, size_t length)
{
switch (m_ref_length_type) { switch (m_ref_length_type) {
case AVERAGE: case AVERAGE:
return m_references[sentence_id]->CalcAverage(); return m_references[sentence_id]->CalcAverage();
break; break;
case CLOSEST: case CLOSEST:
return m_references[sentence_id]->CalcClosest(length); return m_references[sentence_id]->CalcClosest(length);
break; break;
case SHORTEST: case SHORTEST:
return m_references[sentence_id]->CalcShortest(); return m_references[sentence_id]->CalcShortest();
break; break;
default: default:
cerr << "unknown reference types." << endl; cerr << "unknown reference types." << endl;
exit(1); exit(1);
} }
} }
void BleuScorer::DumpCounts(ostream* os, void BleuScorer::DumpCounts(ostream* os,
const NgramCounts& counts) const { const NgramCounts& counts) const
{
for (NgramCounts::const_iterator it = counts.begin(); for (NgramCounts::const_iterator it = counts.begin();
it != counts.end(); ++it) { it != counts.end(); ++it) {
*os << "("; *os << "(";
@ -238,7 +244,8 @@ void BleuScorer::DumpCounts(ostream* os,
} }
float smoothedSentenceBleu float smoothedSentenceBleu
(const std::vector<float>& stats, float smoothing, bool smoothBP) { (const std::vector<float>& stats, float smoothing, bool smoothBP)
{
CHECK(stats.size() == kBleuNgramOrder * 2 + 1); CHECK(stats.size() == kBleuNgramOrder * 2 + 1);
@ -247,8 +254,8 @@ float smoothedSentenceBleu
logbleu += log(stats[2 * j] + smoothing) - log(stats[2 * j + 1] + smoothing); logbleu += log(stats[2 * j] + smoothing) - log(stats[2 * j + 1] + smoothing);
} }
logbleu /= kBleuNgramOrder; logbleu /= kBleuNgramOrder;
const float reflength = stats[(kBleuNgramOrder * 2)] + const float reflength = stats[(kBleuNgramOrder * 2)] +
(smoothBP ? smoothing : 0.0f); (smoothBP ? smoothing : 0.0f);
const float brevity = 1.0 - reflength / stats[1]; const float brevity = 1.0 - reflength / stats[1];
if (brevity < 0.0) { if (brevity < 0.0) {
@ -263,7 +270,7 @@ float sentenceLevelBackgroundBleu(const std::vector<float>& sent, const std::vec
std::vector<float> stats; std::vector<float> stats;
CHECK(sent.size()==bg.size()); CHECK(sent.size()==bg.size());
CHECK(sent.size()==kBleuNgramOrder*2+1); CHECK(sent.size()==kBleuNgramOrder*2+1);
for(size_t i=0;i<sent.size();i++) for(size_t i=0; i<sent.size(); i++)
stats.push_back(sent[i]+bg[i]); stats.push_back(sent[i]+bg[i]);
// Calculate BLEU // Calculate BLEU
@ -282,7 +289,8 @@ float sentenceLevelBackgroundBleu(const std::vector<float>& sent, const std::vec
return exp(logbleu) * stats[kBleuNgramOrder*2]; return exp(logbleu) * stats[kBleuNgramOrder*2];
} }
float unsmoothedBleu(const std::vector<float>& stats) { float unsmoothedBleu(const std::vector<float>& stats)
{
CHECK(stats.size() == kBleuNgramOrder * 2 + 1); CHECK(stats.size() == kBleuNgramOrder * 2 + 1);
float logbleu = 0.0; float logbleu = 0.0;
@ -298,50 +306,51 @@ float unsmoothedBleu(const std::vector<float>& stats) {
return exp(logbleu); return exp(logbleu);
} }
/**
 * Score every hypothesis found in a score file / feature file pair with
 * smoothedSentenceBleu() (default smoothing arguments).
 *
 * @param scoreFile   path handed to ScoreDataIterator
 * @param featureFile path handed to FeatureDataIterator
 * @return one BLEU value per hypothesis, in file order
 *
 * Any inconsistency (iterator already at end, differing sizes) prints a
 * message to stderr and terminates the process with exit(1).
 */
vector<float> BleuScorer::ScoreNbestList(const string& scoreFile, const string& featureFile)
{
  // The rest of the routine is written for lists of files; wrap the single
  // pair that was passed in.
  vector<string> scoreFiles(1, scoreFile);
  vector<string> featureFiles(1, featureFile);

  vector<FeatureDataIterator> featureDataIters;
  vector<ScoreDataIterator> scoreDataIters;
  for (size_t fileIdx = 0; fileIdx < featureFiles.size(); ++fileIdx) {
    featureDataIters.push_back(FeatureDataIterator(featureFiles[fileIdx]));
    scoreDataIters.push_back(ScoreDataIterator(scoreFiles[fileIdx]));
  }

  if (featureDataIters[0] == FeatureDataIterator::end()) {
    cerr << "Error: at the end of feature data iterator" << endl;
    exit(1);
  }

  // Collect (file index, hypothesis index) pairs after validating each file.
  vector<pair<size_t,size_t> > hypotheses;
  for (size_t fileIdx = 0; fileIdx < featureFiles.size(); ++fileIdx) {
    if (featureDataIters[fileIdx] == FeatureDataIterator::end()) {
      cerr << "Error: Feature file " << fileIdx << " ended prematurely" << endl;
      exit(1);
    }
    if (scoreDataIters[fileIdx] == ScoreDataIterator::end()) {
      cerr << "Error: Score file " << fileIdx << " ended prematurely" << endl;
      exit(1);
    }
    if (featureDataIters[fileIdx]->size() != scoreDataIters[fileIdx]->size()) {
      cerr << "Error: features and scores have different size" << endl;
      exit(1);
    }
    for (size_t hypIdx = 0; hypIdx < featureDataIters[fileIdx]->size(); ++hypIdx) {
      hypotheses.push_back(pair<size_t,size_t>(fileIdx,hypIdx));
    }
  }

  // score the nbest list
  vector<float> bleuScores;
  for (size_t h = 0; h < hypotheses.size(); ++h) {
    const size_t fileIdx = hypotheses[h].first;
    const size_t hypIdx = hypotheses[h].second;
    bleuScores.push_back(smoothedSentenceBleu(scoreDataIters[fileIdx]->operator[](hypIdx)));
  }
  return bleuScores;
}

View File

@ -38,14 +38,22 @@ public:
virtual void setReferenceFiles(const std::vector<std::string>& referenceFiles); virtual void setReferenceFiles(const std::vector<std::string>& referenceFiles);
virtual void prepareStats(std::size_t sid, const std::string& text, ScoreStats& entry); virtual void prepareStats(std::size_t sid, const std::string& text, ScoreStats& entry);
virtual statscore_t calculateScore(const std::vector<int>& comps) const; virtual statscore_t calculateScore(const std::vector<int>& comps) const;
// Number of statistics per sentence: two per n-gram order plus one more
// (2 * kBleuNgramOrder + 1).
virtual std::size_t NumberOfScores() const
{
  return 2 * kBleuNgramOrder + 1;
}
int CalcReferenceLength(std::size_t sentence_id, std::size_t length); int CalcReferenceLength(std::size_t sentence_id, std::size_t length);
// Accessors for the reference-length strategy stored in m_ref_length_type.
ReferenceLengthType GetReferenceLengthType() const
{
  return m_ref_length_type;
}

void SetReferenceLengthType(ReferenceLengthType type)
{
  m_ref_length_type = type;
}
// Read-only view of the loaded references, as returned by m_references.get().
const std::vector<Reference*>& GetReferences() const
{
  return m_references.get();
}
/** /**
* Count the ngrams of each type, up to the given length in the input line. * Count the ngrams of each type, up to the given length in the input line.
@ -74,7 +82,7 @@ private:
* This function is used in PRO. * This function is used in PRO.
*/ */
float smoothedSentenceBleu float smoothedSentenceBleu
(const std::vector<float>& stats, float smoothing=1.0, bool smoothBP=false); (const std::vector<float>& stats, float smoothing=1.0, bool smoothBP=false);
/** Computes sentence-level BLEU score given a background corpus. /** Computes sentence-level BLEU score given a background corpus.
* This function is used in batch MIRA. * This function is used in batch MIRA.

View File

@ -10,16 +10,19 @@
using namespace MosesTuning; using namespace MosesTuning;
namespace { namespace
{
NgramCounts* g_counts = NULL; NgramCounts* g_counts = NULL;
// Returns the n-gram counts registered with SetNgramCounts().
// Asserts that a non-null pointer has been registered.
NgramCounts* GetNgramCounts()
{
  assert(g_counts);
  return g_counts;
}
// Registers the n-gram counts that the Check*gram() helpers look up in.
// Only the pointer is stored.
void SetNgramCounts(NgramCounts* counts)
{
  g_counts = counts;
}
@ -58,33 +61,38 @@ struct Fourgram {
NgramCounts::Key instance; NgramCounts::Key instance;
}; };
bool CheckUnigram(const std::string& str) { bool CheckUnigram(const std::string& str)
{
Unigram unigram(str); Unigram unigram(str);
NgramCounts::Value v; NgramCounts::Value v;
return GetNgramCounts()->Lookup(unigram.instance, &v); return GetNgramCounts()->Lookup(unigram.instance, &v);
} }
bool CheckBigram(const std::string& a, const std::string& b) { bool CheckBigram(const std::string& a, const std::string& b)
{
Bigram bigram(a, b); Bigram bigram(a, b);
NgramCounts::Value v; NgramCounts::Value v;
return GetNgramCounts()->Lookup(bigram.instance, &v); return GetNgramCounts()->Lookup(bigram.instance, &v);
} }
bool CheckTrigram(const std::string& a, const std::string& b, bool CheckTrigram(const std::string& a, const std::string& b,
const std::string& c) { const std::string& c)
{
Trigram trigram(a, b, c); Trigram trigram(a, b, c);
NgramCounts::Value v; NgramCounts::Value v;
return GetNgramCounts()->Lookup(trigram.instance, &v); return GetNgramCounts()->Lookup(trigram.instance, &v);
} }
bool CheckFourgram(const std::string& a, const std::string& b, bool CheckFourgram(const std::string& a, const std::string& b,
const std::string& c, const std::string& d) { const std::string& c, const std::string& d)
{
Fourgram fourgram(a, b, c, d); Fourgram fourgram(a, b, c, d);
NgramCounts::Value v; NgramCounts::Value v;
return GetNgramCounts()->Lookup(fourgram.instance, &v); return GetNgramCounts()->Lookup(fourgram.instance, &v);
} }
void SetUpReferences(BleuScorer& scorer) { void SetUpReferences(BleuScorer& scorer)
{
// The following examples are taken from Koehn, "Statistical Machine Translation", // The following examples are taken from Koehn, "Statistical Machine Translation",
// Cambridge University Press, 2010. // Cambridge University Press, 2010.
{ {
@ -115,7 +123,8 @@ void SetUpReferences(BleuScorer& scorer) {
} // namespace } // namespace
BOOST_AUTO_TEST_CASE(bleu_reference_type) { BOOST_AUTO_TEST_CASE(bleu_reference_type)
{
BleuScorer scorer; BleuScorer scorer;
// BleuScorer will use "closest" by default. // BleuScorer will use "closest" by default.
BOOST_CHECK_EQUAL(scorer.GetReferenceLengthType(), BleuScorer::CLOSEST); BOOST_CHECK_EQUAL(scorer.GetReferenceLengthType(), BleuScorer::CLOSEST);
@ -127,7 +136,8 @@ BOOST_AUTO_TEST_CASE(bleu_reference_type) {
BOOST_CHECK_EQUAL(scorer.GetReferenceLengthType(), BleuScorer::SHORTEST); BOOST_CHECK_EQUAL(scorer.GetReferenceLengthType(), BleuScorer::SHORTEST);
} }
BOOST_AUTO_TEST_CASE(bleu_reference_type_with_config) { BOOST_AUTO_TEST_CASE(bleu_reference_type_with_config)
{
{ {
BleuScorer scorer("reflen:average"); BleuScorer scorer("reflen:average");
BOOST_CHECK_EQUAL(scorer.GetReferenceLengthType(), BleuScorer::AVERAGE); BOOST_CHECK_EQUAL(scorer.GetReferenceLengthType(), BleuScorer::AVERAGE);
@ -139,7 +149,8 @@ BOOST_AUTO_TEST_CASE(bleu_reference_type_with_config) {
} }
} }
BOOST_AUTO_TEST_CASE(bleu_count_ngrams) { BOOST_AUTO_TEST_CASE(bleu_count_ngrams)
{
BleuScorer scorer; BleuScorer scorer;
std::string line = "I saw a girl with a telescope ."; std::string line = "I saw a girl with a telescope .";
@ -198,7 +209,8 @@ BOOST_AUTO_TEST_CASE(bleu_count_ngrams) {
BOOST_CHECK(CheckFourgram("with", "a", "telescope", ".")); BOOST_CHECK(CheckFourgram("with", "a", "telescope", "."));
} }
BOOST_AUTO_TEST_CASE(bleu_clipped_counts) { BOOST_AUTO_TEST_CASE(bleu_clipped_counts)
{
BleuScorer scorer; BleuScorer scorer;
SetUpReferences(scorer); SetUpReferences(scorer);
std::string line("israeli officials responsibility of airport safety"); std::string line("israeli officials responsibility of airport safety");
@ -220,7 +232,8 @@ BOOST_AUTO_TEST_CASE(bleu_clipped_counts) {
BOOST_CHECK_EQUAL(entry.get(7), 3); // fourgram BOOST_CHECK_EQUAL(entry.get(7), 3); // fourgram
} }
BOOST_AUTO_TEST_CASE(calculate_actual_score) { BOOST_AUTO_TEST_CASE(calculate_actual_score)
{
BOOST_REQUIRE(4 == kBleuNgramOrder); BOOST_REQUIRE(4 == kBleuNgramOrder);
std::vector<int> stats(2 * kBleuNgramOrder + 1); std::vector<int> stats(2 * kBleuNgramOrder + 1);
BleuScorer scorer; BleuScorer scorer;
@ -247,7 +260,8 @@ BOOST_AUTO_TEST_CASE(calculate_actual_score) {
BOOST_CHECK_CLOSE(0.5115f, scorer.calculateScore(stats), 0.01); BOOST_CHECK_CLOSE(0.5115f, scorer.calculateScore(stats), 0.01);
} }
BOOST_AUTO_TEST_CASE(sentence_level_bleu) { BOOST_AUTO_TEST_CASE(sentence_level_bleu)
{
BOOST_REQUIRE(4 == kBleuNgramOrder); BOOST_REQUIRE(4 == kBleuNgramOrder);
std::vector<float> stats(2 * kBleuNgramOrder + 1); std::vector<float> stats(2 * kBleuNgramOrder + 1);

View File

@ -6,9 +6,11 @@
using namespace std; using namespace std;
namespace { namespace
{
// Cost of aligning two word ids: 0 when they are equal, 1 otherwise.
inline int CalcDistance(int word1, int word2)
{
  if (word1 == word2) {
    return 0;
  }
  return 1;
}
@ -16,11 +18,11 @@ inline int CalcDistance(int word1, int word2) {
namespace MosesTuning namespace MosesTuning
{ {
// Passes "CDER" to the base class when long jumps are allowed and "WER"
// otherwise; `config` is forwarded unchanged and the flag is remembered.
CderScorer::CderScorer(const string& config, bool allowed_long_jumps)
  : StatisticsBasedScorer(allowed_long_jumps ? "CDER" : "WER", config)
  , m_allowed_long_jumps(allowed_long_jumps)
{
}
CderScorer::~CderScorer() {} CderScorer::~CderScorer() {}
@ -82,7 +84,8 @@ float CderScorer::calculateScore(const vector<int>& comps) const
} }
void CderScorer::computeCD(const sent_t& cand, const sent_t& ref, void CderScorer::computeCD(const sent_t& cand, const sent_t& ref,
vector<int>& stats) const { vector<int>& stats) const
{
int I = cand.size() + 1; // Number of inter-words positions in candidate sentence int I = cand.size() + 1; // Number of inter-words positions in candidate sentence
int L = ref.size() + 1; // Number of inter-words positions in reference sentence int L = ref.size() + 1; // Number of inter-words positions in reference sentence
@ -95,11 +98,9 @@ void CderScorer::computeCD(const sent_t& cand, const sent_t& ref,
for (int i = 1; i < I; ++i) (*row)[i] = 1; for (int i = 1; i < I; ++i) (*row)[i] = 1;
// Calculating costs for next row using costs from the previous row. // Calculating costs for next row using costs from the previous row.
while (++l < L) while (++l < L) {
{
vector<int>* nextRow = new vector<int>(I); vector<int>* nextRow = new vector<int>(I);
for (int i = 0; i < I; ++i) for (int i = 0; i < I; ++i) {
{
vector<int> possibleCosts; vector<int> possibleCosts;
if (i > 0) { if (i > 0) {
possibleCosts.push_back((*nextRow)[i-1] + 1); // Deletion possibleCosts.push_back((*nextRow)[i-1] + 1); // Deletion

View File

@ -8,13 +8,14 @@
namespace MosesTuning namespace MosesTuning
{ {
/** /**
 * The CderScorer class can compute both the CDER and WER metrics.
*/ */
class CderScorer: public StatisticsBasedScorer { class CderScorer: public StatisticsBasedScorer
public: {
public:
explicit CderScorer(const std::string& config, bool allowed_long_jumps = true); explicit CderScorer(const std::string& config, bool allowed_long_jumps = true);
~CderScorer(); ~CderScorer();
@ -24,11 +25,13 @@ class CderScorer: public StatisticsBasedScorer {
virtual void prepareStatsVector(std::size_t sid, const std::string& text, std::vector<int>& stats); virtual void prepareStatsVector(std::size_t sid, const std::string& text, std::vector<int>& stats);
virtual std::size_t NumberOfScores() const { return 2; } virtual std::size_t NumberOfScores() const {
return 2;
}
virtual float calculateScore(const std::vector<int>& comps) const; virtual float calculateScore(const std::vector<int>& comps) const;
private: private:
bool m_allowed_long_jumps; bool m_allowed_long_jumps;
typedef std::vector<int> sent_t; typedef std::vector<int> sent_t;

View File

@ -27,11 +27,11 @@ namespace MosesTuning
{ {
Data::Data(Scorer* scorer, const string& sparse_weights_file) Data::Data(Scorer* scorer, const string& sparse_weights_file)
: m_scorer(scorer), : m_scorer(scorer),
m_score_type(m_scorer->getName()), m_score_type(m_scorer->getName()),
m_num_scores(0), m_num_scores(0),
m_score_data(new ScoreData(m_scorer)), m_score_data(new ScoreData(m_scorer)),
m_feature_data(new FeatureData) m_feature_data(new FeatureData)
{ {
TRACE_ERR("Data::m_score_type " << m_score_type << endl); TRACE_ERR("Data::m_score_type " << m_score_type << endl);
TRACE_ERR("Data::Scorer type from Scorer: " << m_scorer->getName() << endl); TRACE_ERR("Data::Scorer type from Scorer: " << m_scorer->getName() << endl);
@ -48,7 +48,8 @@ Data::Data(Scorer* scorer, const string& sparse_weights_file)
//ADDED BY TS //ADDED BY TS
// TODO: This is too long; consider creating additional functions to // TODO: This is too long; consider creating additional functions to
// reduce the lines of this function. // reduce the lines of this function.
void Data::removeDuplicates() { void Data::removeDuplicates()
{
size_t nSentences = m_feature_data->size(); size_t nSentences = m_feature_data->size();
assert(m_score_data->size() == nSentences); assert(m_score_data->size() == nSentences);
@ -128,7 +129,8 @@ void Data::removeDuplicates() {
} }
//END_ADDED //END_ADDED
void Data::load(const std::string &featfile, const std::string &scorefile) { void Data::load(const std::string &featfile, const std::string &scorefile)
{
m_feature_data->load(featfile, m_sparse_weights); m_feature_data->load(featfile, m_sparse_weights);
m_score_data->load(scorefile); m_score_data->load(scorefile);
} }
@ -192,7 +194,8 @@ void Data::loadNBest(const string &file)
} }
} }
void Data::save(const std::string &featfile, const std::string &scorefile, bool bin) { void Data::save(const std::string &featfile, const std::string &scorefile, bool bin)
{
if (bin) if (bin)
cerr << "Binary write mode is selected" << endl; cerr << "Binary write mode is selected" << endl;
else else
@ -202,7 +205,8 @@ void Data::save(const std::string &featfile, const std::string &scorefile, bool
m_score_data->save(scorefile, bin); m_score_data->save(scorefile, bin);
} }
void Data::InitFeatureMap(const string& str) { void Data::InitFeatureMap(const string& str)
{
string buf = str; string buf = str;
string substr; string substr;
string features = ""; string features = "";
@ -231,7 +235,8 @@ void Data::InitFeatureMap(const string& str) {
} }
void Data::AddFeatures(const string& str, void Data::AddFeatures(const string& str,
int sentence_index) { int sentence_index)
{
string buf = str; string buf = str;
string substr; string substr;
FeatureStats feature_entry; FeatureStats feature_entry;

View File

@ -44,18 +44,28 @@ public:
m_feature_data->clear(); m_feature_data->clear();
} }
// Handle to the score data held by this object.
ScoreDataHandle getScoreData()
{
  return m_score_data;
}

// Handle to the feature data held by this object.
FeatureDataHandle getFeatureData()
{
  return m_feature_data;
}

// The scorer pointer stored in this object.
Scorer* getScorer()
{
  return m_scorer;
}

// Number of features, as reported by the feature data.
std::size_t NumberOfFeatures() const
{
  return m_feature_data->NumberOfFeatures();
}

// Feature-name string, as reported by the feature data.
std::string Features() const
{
  return m_feature_data->Features();
}

// Forwards a new feature-name string to the feature data.
void Features(const std::string &f)
{
  m_feature_data->Features(f);
}
void loadNBest(const std::string &file); void loadNBest(const std::string &file);

View File

@ -10,7 +10,8 @@
using namespace MosesTuning; using namespace MosesTuning;
//very basic test of sharding //very basic test of sharding
BOOST_AUTO_TEST_CASE(shard_basic) { BOOST_AUTO_TEST_CASE(shard_basic)
{
boost::scoped_ptr<Scorer> scorer(ScorerFactory::getScorer("BLEU", "")); boost::scoped_ptr<Scorer> scorer(ScorerFactory::getScorer("BLEU", ""));
Data data(scorer.get()); Data data(scorer.get());
FeatureArray fa1, fa2, fa3, fa4; FeatureArray fa1, fa2, fa3, fa4;
@ -39,7 +40,8 @@ BOOST_AUTO_TEST_CASE(shard_basic) {
BOOST_CHECK_EQUAL(shards[1].getFeatureData()->size(),(std::size_t)2); BOOST_CHECK_EQUAL(shards[1].getFeatureData()->size(),(std::size_t)2);
} }
BOOST_AUTO_TEST_CASE(init_feature_map_test) { BOOST_AUTO_TEST_CASE(init_feature_map_test)
{
boost::scoped_ptr<Scorer> scorer(ScorerFactory::getScorer("BLEU", "")); boost::scoped_ptr<Scorer> scorer(ScorerFactory::getScorer("BLEU", ""));
Data data(scorer.get()); Data data(scorer.get());
@ -49,7 +51,8 @@ BOOST_AUTO_TEST_CASE(init_feature_map_test) {
BOOST_CHECK_EQUAL(expected, data.Features()); BOOST_CHECK_EQUAL(expected, data.Features());
} }
BOOST_AUTO_TEST_CASE(add_features_test) { BOOST_AUTO_TEST_CASE(add_features_test)
{
boost::scoped_ptr<Scorer> scorer(ScorerFactory::getScorer("BLEU", "")); boost::scoped_ptr<Scorer> scorer(ScorerFactory::getScorer("BLEU", ""));
Data data(scorer.get()); Data data(scorer.get());

View File

@ -13,27 +13,27 @@
#define BUFFER_SIZE (32768) #define BUFFER_SIZE (32768)
namespace MosesTuning namespace MosesTuning
{ {
class _fdstream class _fdstream
{ {
protected: protected:
// Creates an unopened stream: descriptor -1 and no stream buffer.
_fdstream()
  : _file_descriptor(-1), _filebuf(NULL) {}
_fdstream(int file_descriptor, std::ios_base::openmode openmode) : _fdstream(int file_descriptor, std::ios_base::openmode openmode) :
_file_descriptor(file_descriptor), _openmode(openmode) _file_descriptor(file_descriptor), _openmode(openmode) {
{
_filebuf = NULL; _filebuf = NULL;
open(file_descriptor, openmode); open(file_descriptor, openmode);
} }
std::ios_base::openmode openmode() const { return _openmode; } std::ios_base::openmode openmode() const {
return _openmode;
}
void open(int file_descriptor, std::ios_base::openmode openmode) void open(int file_descriptor, std::ios_base::openmode openmode) {
{
if (!_filebuf) if (!_filebuf)
// We create a C++ stream from a file descriptor // We create a C++ stream from a file descriptor
// stdio_filebuf is not synced with stdio. // stdio_filebuf is not synced with stdio.
@ -41,11 +41,10 @@ protected:
// You can also create the filebuf from a FILE* with // You can also create the filebuf from a FILE* with
// FILE* f = fdopen(file_descriptor, mode); // FILE* f = fdopen(file_descriptor, mode);
_filebuf = new __gnu_cxx::stdio_filebuf<char> (file_descriptor, _filebuf = new __gnu_cxx::stdio_filebuf<char> (file_descriptor,
openmode); openmode);
} }
virtual ~_fdstream() virtual ~_fdstream() {
{
close(_file_descriptor); close(_file_descriptor);
delete _filebuf; delete _filebuf;
_filebuf = NULL; _filebuf = NULL;
@ -60,59 +59,51 @@ class ifdstream : public _fdstream
{ {
public: public:
// Creates an unopened input stream; open() must be called before reading.
ifdstream()
  : _fdstream(), _stream(NULL) {}
ifdstream(int file_descriptor) : ifdstream(int file_descriptor) :
_fdstream(file_descriptor, std::ios_base::in) _fdstream(file_descriptor, std::ios_base::in) {
{
_stream = new std::istream(_filebuf); _stream = new std::istream(_filebuf);
} }
void open(int file_descriptor) void open(int file_descriptor) {
{ if (!_stream) {
if (!_stream) _fdstream::open(file_descriptor, std::ios_base::in);
{ _stream = new std::istream(_filebuf);
_fdstream::open(file_descriptor, std::ios_base::in); }
_stream = new std::istream(_filebuf);
}
} }
// Extracts one whitespace-delimited token from the underlying stream.
ifdstream& operator>> (std::string& str)
{
  std::istream& input = *_stream;
  input >> str;
  return *this;
}
std::size_t getline(std::string& str) std::size_t getline(std::string& str) {
{
char tmp[BUFFER_SIZE]; char tmp[BUFFER_SIZE];
std::size_t ret = getline(tmp, BUFFER_SIZE); std::size_t ret = getline(tmp, BUFFER_SIZE);
str = tmp; str = tmp;
return ret; return ret;
} }
std::size_t getline(char* s, std::streamsize n) std::size_t getline(char* s, std::streamsize n) {
{
return (getline(s, n, '\n')); return (getline(s, n, '\n'));
} }
std::size_t getline(char* s, std::streamsize n, char delim) std::size_t getline(char* s, std::streamsize n, char delim) {
{
int i = 0; int i = 0;
do{ do {
s[i] = _stream->get(); s[i] = _stream->get();
i++; i++;
}while(i < n-1 && s[i-1] != delim && s[i-1] != '\0'); } while(i < n-1 && s[i-1] != delim && s[i-1] != '\0');
s[i-1] = '\0'; // overwrite the delimiter given with string end s[i-1] = '\0'; // overwrite the delimiter given with string end
return i-1; return i-1;
} }
// Releases the std::istream wrapper owned by this object.
~ifdstream()
{
  delete _stream;
}
@ -125,27 +116,23 @@ class ofdstream : public _fdstream
{ {
public: public:
// Creates an unopened output stream; open() must be called before writing.
ofdstream()
  : _fdstream(), _stream(NULL) {}
ofdstream(int file_descriptor) : ofdstream(int file_descriptor) :
_fdstream(file_descriptor, std::ios_base::out) _fdstream(file_descriptor, std::ios_base::out) {
{
_stream = new std::ostream(_filebuf); _stream = new std::ostream(_filebuf);
} }
void open(int file_descriptor) void open(int file_descriptor) {
{ if (!_stream) {
if (!_stream)
{
_fdstream::open(file_descriptor, std::ios_base::out); _fdstream::open(file_descriptor, std::ios_base::out);
_stream = new std::ostream(_filebuf); _stream = new std::ostream(_filebuf);
} }
} }
ofdstream& operator<< (const std::string& str) ofdstream& operator<< (const std::string& str) {
{
if (_stream->good()) if (_stream->good())
(*_stream) << str; (*_stream) << str;
@ -153,8 +140,7 @@ public:
return *this; return *this;
} }
// Releases the std::ostream wrapper owned by this object.
~ofdstream()
{
  delete _stream;
}

View File

@ -19,14 +19,14 @@ namespace MosesTuning
FeatureArray::FeatureArray() FeatureArray::FeatureArray()
: m_index(0), m_num_features(0){} : m_index(0), m_num_features(0) {}
FeatureArray::~FeatureArray() {} FeatureArray::~FeatureArray() {}
void FeatureArray::savetxt(ostream* os) void FeatureArray::savetxt(ostream* os)
{ {
*os << FEATURES_TXT_BEGIN << " " << m_index << " " << m_array.size() *os << FEATURES_TXT_BEGIN << " " << m_index << " " << m_array.size()
<< " " << m_num_features << " " << m_features << endl; << " " << m_num_features << " " << m_features << endl;
for (featarray_t::iterator i = m_array.begin(); i != m_array.end(); ++i) { for (featarray_t::iterator i = m_array.begin(); i != m_array.end(); ++i) {
i->savetxt(os); i->savetxt(os);
*os << endl; *os << endl;
@ -37,7 +37,7 @@ void FeatureArray::savetxt(ostream* os)
void FeatureArray::savebin(ostream* os) void FeatureArray::savebin(ostream* os)
{ {
*os << FEATURES_BIN_BEGIN << " " << m_index << " " << m_array.size() *os << FEATURES_BIN_BEGIN << " " << m_index << " " << m_array.size()
<< " " << m_num_features << " " << m_features << endl; << " " << m_num_features << " " << m_features << endl;
for (featarray_t::iterator i = m_array.begin(); i != m_array.end(); ++i) for (featarray_t::iterator i = m_array.begin(); i != m_array.end(); ++i)
i->savebin(os); i->savebin(os);

View File

@ -36,16 +36,28 @@ public:
FeatureArray(); FeatureArray();
~FeatureArray(); ~FeatureArray();
// Removes every entry from the array.
void clear()
{
  m_array.clear();
}

// Returns the stored index.
int getIndex() const
{
  return m_index;
}

// Replaces the stored index.
void setIndex(const int value)
{
  m_index = value;
}

// Bounds-checked element access (std::vector::at).
FeatureStats& get(std::size_t i)
{
  return m_array.at(i);
}

// Bounds-checked read-only element access (std::vector::at).
const FeatureStats& get(std::size_t i) const
{
  return m_array.at(i);
}

// Appends a copy of `e` to the array.
void add(FeatureStats& e)
{
  m_array.push_back(e);
}
//ADDED BY TS //ADDED BY TS
void swap(std::size_t i, std::size_t j) { void swap(std::size_t i, std::size_t j) {
@ -59,13 +71,23 @@ public:
void merge(FeatureArray& e); void merge(FeatureArray& e);
// Number of entries currently stored.
std::size_t size() const
{
  return m_array.size();
}

// Returns the recorded number of features.
std::size_t NumberOfFeatures() const
{
  return m_num_features;
}

// Sets the recorded number of features.
void NumberOfFeatures(std::size_t v)
{
  m_num_features = v;
}

// Returns the feature-name string.
std::string Features() const
{
  return m_features;
}

// Sets the feature-name string.
void Features(const std::string& f)
{
  m_features = f;
}
void savetxt(std::ostream* os); void savetxt(std::ostream* os);
void savebin(std::ostream* os); void savebin(std::ostream* os);

View File

@ -20,7 +20,7 @@ namespace MosesTuning
FeatureData::FeatureData() FeatureData::FeatureData()
: m_num_features(0) {} : m_num_features(0) {}
void FeatureData::save(ostream* os, bool bin) void FeatureData::save(ostream* os, bool bin)
{ {
@ -38,7 +38,8 @@ void FeatureData::save(const string &file, bool bin)
ofs.close(); ofs.close();
} }
void FeatureData::save(bool bin) { void FeatureData::save(bool bin)
{
save(&cout, bin); save(&cout, bin);
} }
@ -145,7 +146,8 @@ void FeatureData::setFeatureMap(const string& feat)
} }
} }
string FeatureData::ToString() const { string FeatureData::ToString() const
{
string res; string res;
{ {

View File

@ -33,7 +33,9 @@ public:
FeatureData(); FeatureData();
~FeatureData() {} ~FeatureData() {}
// Removes every FeatureArray held by this object.
void clear()
{
  m_array.clear();
}
FeatureArray& get(size_t idx) { FeatureArray& get(size_t idx) {
return m_array.at(idx); return m_array.at(idx);
@ -61,13 +63,23 @@ public:
void add(FeatureArray& e); void add(FeatureArray& e);
void add(FeatureStats& e, int sent_idx); void add(FeatureStats& e, int sent_idx);
// Number of feature arrays currently stored.
std::size_t size() const
{
  return m_array.size();
}

// Returns the recorded number of features.
std::size_t NumberOfFeatures() const
{
  return m_num_features;
}

// Sets the recorded number of features.
void NumberOfFeatures(std::size_t v)
{
  m_num_features = v;
}

// Returns the feature-name string.
std::string Features() const
{
  return m_features;
}

// Sets the feature-name string.
void Features(const std::string& f)
{
  m_features = f;
}
void save(const std::string &file, bool bin=false); void save(const std::string &file, bool bin=false);
void save(std::ostream* os, bool bin=false); void save(std::ostream* os, bool bin=false);

View File

@ -32,9 +32,10 @@ using namespace util;
namespace MosesTuning namespace MosesTuning
{ {
int ParseInt(const StringPiece& str ) {
int ParseInt(const StringPiece& str )
{
char* errIndex; char* errIndex;
//could wrap? //could wrap?
int value = static_cast<int>(strtol(str.data(), &errIndex,10)); int value = static_cast<int>(strtol(str.data(), &errIndex,10));
@ -44,7 +45,8 @@ int ParseInt(const StringPiece& str ) {
return value; return value;
} }
float ParseFloat(const StringPiece& str) { float ParseFloat(const StringPiece& str)
{
char* errIndex; char* errIndex;
float value = static_cast<float>(strtod(str.data(), &errIndex)); float value = static_cast<float>(strtod(str.data(), &errIndex));
if (errIndex == str.data()) { if (errIndex == str.data()) {
@ -53,11 +55,13 @@ float ParseFloat(const StringPiece& str) {
return value; return value;
} }
// Two items are equal when both their dense and their sparse feature vectors
// are equal.
//
// The earlier version compared item1 against itself on both sides, so it
// reported every pair of items as equal.
bool operator==(FeatureDataItem const& item1, FeatureDataItem const& item2)
{
  return item1.dense == item2.dense && item1.sparse == item2.sparse;
}
size_t hash_value(FeatureDataItem const& item) { size_t hash_value(FeatureDataItem const& item)
{
size_t seed = 0; size_t seed = 0;
boost::hash_combine(seed,item.dense); boost::hash_combine(seed,item.dense);
boost::hash_combine(seed,item.sparse); boost::hash_combine(seed,item.sparse);
@ -67,14 +71,16 @@ size_t hash_value(FeatureDataItem const& item) {
FeatureDataIterator::FeatureDataIterator() {} FeatureDataIterator::FeatureDataIterator() {}
// Opens `filename` through a FilePiece and reads the first entry so the
// iterator can be dereferenced immediately.
FeatureDataIterator::FeatureDataIterator(const string& filename)
  : m_in(new FilePiece(filename.c_str()))
{
  readNext();
}
FeatureDataIterator::~FeatureDataIterator() {} FeatureDataIterator::~FeatureDataIterator() {}
void FeatureDataIterator::readNext() { void FeatureDataIterator::readNext()
{
m_next.clear(); m_next.clear();
try { try {
StringPiece marker = m_in->ReadDelimited(); StringPiece marker = m_in->ReadDelimited();
@ -101,7 +107,7 @@ void FeatureDataIterator::readNext() {
//sparse feature //sparse feature
StringPiece second = *value; StringPiece second = *value;
float floatValue = ParseFloat(second); float floatValue = ParseFloat(second);
m_next.back().sparse.set(first.as_string(),floatValue); m_next.back().sparse.set(first.as_string(),floatValue);
} }
} }
if (length != m_next.back().dense.size()) { if (length != m_next.back().dense.size()) {
@ -117,11 +123,13 @@ void FeatureDataIterator::readNext() {
} }
} }
void FeatureDataIterator::increment() { void FeatureDataIterator::increment()
{
readNext(); readNext();
} }
bool FeatureDataIterator::equal(const FeatureDataIterator& rhs) const { bool FeatureDataIterator::equal(const FeatureDataIterator& rhs) const
{
if (!m_in && !rhs.m_in) { if (!m_in && !rhs.m_in) {
return true; return true;
} else if (!m_in) { } else if (!m_in) {
@ -129,12 +137,13 @@ bool FeatureDataIterator::equal(const FeatureDataIterator& rhs) const {
} else if (!rhs.m_in) { } else if (!rhs.m_in) {
return false; return false;
} else { } else {
return m_in->FileName() == rhs.m_in->FileName() && return m_in->FileName() == rhs.m_in->FileName() &&
m_in->Offset() == rhs.m_in->Offset(); m_in->Offset() == rhs.m_in->Offset();
} }
} }
// iterator_facade hook: exposes the items buffered in m_next.
const vector<FeatureDataItem>& FeatureDataIterator::dereference() const
{
  const vector<FeatureDataItem>& current = m_next;
  return current;
}

View File

@ -37,18 +37,21 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "FeatureStats.h" #include "FeatureStats.h"
namespace util { class FilePiece; } namespace util
{
class FilePiece;
}
namespace MosesTuning namespace MosesTuning
{ {
class FileFormatException : public util::Exception
class FileFormatException : public util::Exception
{ {
public: public:
explicit FileFormatException(const std::string& filename, const std::string& line) { explicit FileFormatException(const std::string& filename, const std::string& line) {
*this << "Error in line \"" << line << "\" of " << filename; *this << "Error in line \"" << line << "\" of " << filename;
} }
}; };
@ -56,45 +59,45 @@ class FileFormatException : public util::Exception
int ParseInt(const StringPiece& str ); int ParseInt(const StringPiece& str );
/** Assumes a delimiter, so only apply to tokens */ /** Assumes a delimiter, so only apply to tokens */
float ParseFloat(const StringPiece& str); float ParseFloat(const StringPiece& str);
class FeatureDataItem class FeatureDataItem
{ {
public: public:
std::vector<float> dense; std::vector<float> dense;
SparseVector sparse; SparseVector sparse;
}; };
bool operator==(FeatureDataItem const& item1, FeatureDataItem const& item2); bool operator==(FeatureDataItem const& item1, FeatureDataItem const& item2);
std::size_t hash_value(FeatureDataItem const& item); std::size_t hash_value(FeatureDataItem const& item);
// Forward iterator over a feature file. Dereferencing yields the vector of
// FeatureDataItem that readNext() buffered most recently.
class FeatureDataIterator :
  public boost::iterator_facade<FeatureDataIterator,
                                const std::vector<FeatureDataItem>,
                                boost::forward_traversal_tag>
{
public:
  FeatureDataIterator();
  explicit FeatureDataIterator(const std::string& filename);
  ~FeatureDataIterator();

  // A default-constructed iterator serves as the end marker.
  static FeatureDataIterator end()
  {
    return FeatureDataIterator();
  }

private:
  // iterator_facade reaches the three hooks below through this friend.
  friend class boost::iterator_core_access;

  void increment();
  bool equal(const FeatureDataIterator& rhs) const;
  const std::vector<FeatureDataItem>& dereference() const;

  void readNext();

  boost::shared_ptr<util::FilePiece> m_in;  // input file; null for end()
  std::vector<FeatureDataItem> m_next;      // items returned by dereference()
};
} }

View File

@ -7,10 +7,12 @@
using namespace MosesTuning; using namespace MosesTuning;
namespace { namespace
{
void CheckFeatureMap(const FeatureData* feature_data, void CheckFeatureMap(const FeatureData* feature_data,
const char* str, int num_feature, int* cnt) { const char* str, int num_feature, int* cnt)
{
for (int i = 0; i < num_feature; ++i) { for (int i = 0; i < num_feature; ++i) {
std::stringstream ss; std::stringstream ss;
ss << str << "_" << i; ss << str << "_" << i;
@ -23,7 +25,8 @@ void CheckFeatureMap(const FeatureData* feature_data,
} // namespace } // namespace
BOOST_AUTO_TEST_CASE(set_feature_map) { BOOST_AUTO_TEST_CASE(set_feature_map)
{
std::string str("d_0 d_1 d_2 d_3 d_4 d_5 d_6 lm_0 lm_1 tm_0 tm_1 tm_2 tm_3 tm_4 w_0 "); std::string str("d_0 d_1 d_2 d_3 d_4 d_5 d_6 lm_0 lm_1 tm_0 tm_1 tm_2 tm_3 tm_4 w_0 ");
FeatureData feature_data; FeatureData feature_data;

View File

@ -18,31 +18,35 @@
using namespace std; using namespace std;
namespace { namespace
{
const int kAvailableSize = 8; const int kAvailableSize = 8;
} // namespace } // namespace
namespace MosesTuning namespace MosesTuning
{ {
SparseVector::name2id_t SparseVector::m_name_to_id; SparseVector::name2id_t SparseVector::m_name_to_id;
SparseVector::id2name_t SparseVector::m_id_to_name; SparseVector::id2name_t SparseVector::m_id_to_name;
// Looks a feature up by name; a name with no registered id yields 0.
FeatureStatsType SparseVector::get(const string& name) const
{
  const name2id_t::const_iterator found = m_name_to_id.find(name);
  if (found != m_name_to_id.end()) {
    const size_t id = found->second;
    return get(id);
  }
  return 0;
}
// Looks a feature up by id; an id with no stored value yields 0.
FeatureStatsType SparseVector::get(size_t id) const
{
  const fvector_t::const_iterator found = m_fvector.find(id);
  if (found != m_fvector.end()) {
    return found->second;
  }
  return 0;
}
void SparseVector::set(const string& name, FeatureStatsType value) { void SparseVector::set(const string& name, FeatureStatsType value)
{
name2id_t::const_iterator name2id_iter = m_name_to_id.find(name); name2id_t::const_iterator name2id_iter = m_name_to_id.find(name);
size_t id = 0; size_t id = 0;
if (name2id_iter == m_name_to_id.end()) { if (name2id_iter == m_name_to_id.end()) {
@ -55,7 +59,8 @@ void SparseVector::set(const string& name, FeatureStatsType value) {
m_fvector[id] = value; m_fvector[id] = value;
} }
void SparseVector::write(ostream& out, const string& sep) const { void SparseVector::write(ostream& out, const string& sep) const
{
for (fvector_t::const_iterator i = m_fvector.begin(); i != m_fvector.end(); ++i) { for (fvector_t::const_iterator i = m_fvector.begin(); i != m_fvector.end(); ++i) {
if (abs(i->second) < 0.00001) continue; if (abs(i->second) < 0.00001) continue;
string name = m_id_to_name[i->first]; string name = m_id_to_name[i->first];
@ -63,11 +68,13 @@ void SparseVector::write(ostream& out, const string& sep) const {
} }
} }
void SparseVector::clear() { void SparseVector::clear()
{
m_fvector.clear(); m_fvector.clear();
} }
void SparseVector::load(const string& file) { void SparseVector::load(const string& file)
{
ifstream in(file.c_str()); ifstream in(file.c_str());
if (!in) { if (!in) {
throw runtime_error("Failed to open sparse weights file: " + file); throw runtime_error("Failed to open sparse weights file: " + file);
@ -84,39 +91,44 @@ void SparseVector::load(const string& file) {
} }
} }
// Subtracts every entry of `rhs` from this vector, treating ids that are
// absent on this side as 0.
SparseVector& SparseVector::operator-=(const SparseVector& rhs)
{
  fvector_t::const_iterator entry = rhs.m_fvector.begin();
  const fvector_t::const_iterator last = rhs.m_fvector.end();
  while (entry != last) {
    const FeatureStatsType difference = get(entry->first) - (entry->second);
    m_fvector[entry->first] = difference;
    ++entry;
  }
  return *this;
}
// Dot product with `rhs`, iterating over this vector's stored entries.
FeatureStatsType SparseVector::inner_product(const SparseVector& rhs) const
{
  FeatureStatsType product = 0.0;
  fvector_t::const_iterator entry = m_fvector.begin();
  while (entry != m_fvector.end()) {
    product += ((entry->second) * (rhs.get(entry->first)));
    ++entry;
  }
  return product;
}
// Element-wise difference, built from a copy of `lhs` and operator-=.
SparseVector operator-(const SparseVector& lhs, const SparseVector& rhs)
{
  SparseVector difference(lhs);
  difference -= rhs;
  return difference;
}
// Dot product of two sparse vectors. The loop runs over the vector with
// fewer stored entries (over `rhs` on a tie) and looks values up in the other.
FeatureStatsType inner_product(const SparseVector& lhs, const SparseVector& rhs)
{
  const bool iterate_rhs = lhs.size() >= rhs.size();
  const SparseVector& iterated = iterate_rhs ? rhs : lhs;
  const SparseVector& looked_up = iterate_rhs ? lhs : rhs;
  return iterated.inner_product(looked_up);
}
std::vector<std::size_t> SparseVector::feats() const { std::vector<std::size_t> SparseVector::feats() const
{
std::vector<std::size_t> toRet; std::vector<std::size_t> toRet;
for(fvector_t::const_iterator iter = m_fvector.begin(); for(fvector_t::const_iterator iter = m_fvector.begin();
iter!=m_fvector.end(); iter!=m_fvector.end();
@ -126,7 +138,8 @@ std::vector<std::size_t> SparseVector::feats() const {
return toRet; return toRet;
} }
std::size_t SparseVector::encode(const std::string& name) { std::size_t SparseVector::encode(const std::string& name)
{
name2id_t::const_iterator name2id_iter = m_name_to_id.find(name); name2id_t::const_iterator name2id_iter = m_name_to_id.find(name);
size_t id = 0; size_t id = 0;
if (name2id_iter == m_name_to_id.end()) { if (name2id_iter == m_name_to_id.end()) {
@ -139,26 +152,29 @@ std::size_t SparseVector::encode(const std::string& name) {
return id; return id;
} }
std::string SparseVector::decode(std::size_t id) { std::string SparseVector::decode(std::size_t id)
{
return m_id_to_name[id]; return m_id_to_name[id];
} }
// Two sparse vectors are equal when their id-to-value maps are equal.
bool operator==(SparseVector const& item1, SparseVector const& item2)
{
  const SparseVector::fvector_t& left = item1.m_fvector;
  const SparseVector::fvector_t& right = item2.m_fvector;
  return left == right;
}
// Hashes a sparse vector by hashing its id-to-value map with boost::hash.
std::size_t hash_value(SparseVector const& item)
{
  return boost::hash<SparseVector::fvector_t>()(item.m_fvector);
}
// Starts with no entries and storage for kAvailableSize values.
FeatureStats::FeatureStats()
  : m_available_size(kAvailableSize)
  , m_entries(0)
  , m_array(new FeatureStatsType[m_available_size])
{
}
// Allocates `size` entries, all counted as used, and zeroes the whole array.
FeatureStats::FeatureStats(const size_t size)
  : m_available_size(size)
  , m_entries(size)
  , m_array(new FeatureStatsType[m_available_size])
{
  const size_t byte_count = GetArraySizeWithBytes();
  memset(m_array, 0, byte_count);
}
@ -276,7 +292,8 @@ void FeatureStats::savetxt(ostream* os)
*os << *this; *os << *this;
} }
void FeatureStats::savetxt() { void FeatureStats::savetxt()
{
savetxt(&cout); savetxt(&cout);
} }
@ -298,7 +315,8 @@ ostream& operator<<(ostream& o, const FeatureStats& e)
return o; return o;
} }
bool operator==(const FeatureStats& f1, const FeatureStats& f2) { bool operator==(const FeatureStats& f1, const FeatureStats& f2)
{
size_t size = f1.size(); size_t size = f1.size();
if (size != f2.size()) if (size != f2.size())

View File

@ -18,10 +18,11 @@
namespace MosesTuning namespace MosesTuning
{ {
// Minimal sparse vector // Minimal sparse vector
class SparseVector { class SparseVector
{
public: public:
typedef std::map<std::size_t,FeatureStatsType> fvector_t; typedef std::map<std::size_t,FeatureStatsType> fvector_t;
typedef std::map<std::string, std::size_t> name2id_t; typedef std::map<std::string, std::size_t> name2id_t;
@ -32,8 +33,10 @@ public:
void set(const std::string& name, FeatureStatsType value); void set(const std::string& name, FeatureStatsType value);
void clear(); void clear();
void load(const std::string& file); void load(const std::string& file);
std::size_t size() const { return m_fvector.size(); } std::size_t size() const {
return m_fvector.size();
}
void write(std::ostream& out, const std::string& sep = " ") const; void write(std::ostream& out, const std::string& sep = " ") const;
SparseVector& operator-=(const SparseVector& rhs); SparseVector& operator-=(const SparseVector& rhs);
@ -78,7 +81,9 @@ public:
void Copy(const FeatureStats &stats); void Copy(const FeatureStats &stats);
bool isfull() const { return (m_entries < m_available_size) ? 0 : 1; } bool isfull() const {
return (m_entries < m_available_size) ? 0 : 1;
}
void expand(); void expand();
void add(FeatureStatsType v); void add(FeatureStatsType v);
void addSparse(const std::string& name, FeatureStatsType v); void addSparse(const std::string& name, FeatureStatsType v);
@ -93,23 +98,37 @@ public:
clear(); clear();
} }
FeatureStatsType get(std::size_t i) { return m_array[i]; } FeatureStatsType get(std::size_t i) {
FeatureStatsType get(std::size_t i)const { return m_array[i]; } return m_array[i];
featstats_t getArray() const { return m_array; } }
FeatureStatsType get(std::size_t i)const {
return m_array[i];
}
featstats_t getArray() const {
return m_array;
}
const SparseVector& getSparse() const { return m_map; } const SparseVector& getSparse() const {
return m_map;
}
void set(std::string &theString, const SparseVector& sparseWeights); void set(std::string &theString, const SparseVector& sparseWeights);
inline std::size_t bytes() const { return GetArraySizeWithBytes(); } inline std::size_t bytes() const {
return GetArraySizeWithBytes();
}
std::size_t GetArraySizeWithBytes() const { std::size_t GetArraySizeWithBytes() const {
return m_entries * sizeof(FeatureStatsType); return m_entries * sizeof(FeatureStatsType);
} }
std::size_t size() const { return m_entries; } std::size_t size() const {
return m_entries;
}
std::size_t available() const { return m_available_size; } std::size_t available() const {
return m_available_size;
}
void savetxt(const std::string &file); void savetxt(const std::string &file);
void savetxt(std::ostream* os); void savetxt(std::ostream* os);

View File

@ -5,15 +5,17 @@
using namespace std; using namespace std;
namespace { namespace
bool IsGzipFile(const std::string &filename) { {
bool IsGzipFile(const std::string &filename)
{
return filename.size() > 3 && return filename.size() > 3 &&
filename.substr(filename.size() - 3, 3) == ".gz"; filename.substr(filename.size() - 3, 3) == ".gz";
} }
} // namespace } // namespace
inputfilestream::inputfilestream(const std::string &filePath) inputfilestream::inputfilestream(const std::string &filePath)
: std::istream(0), m_streambuf(0), m_is_good(false) : std::istream(0), m_streambuf(0), m_is_good(false)
{ {
// check if file is readable // check if file is readable
std::filebuf* fb = new std::filebuf(); std::filebuf* fb = new std::filebuf();
@ -40,7 +42,7 @@ void inputfilestream::close()
} }
outputfilestream::outputfilestream(const std::string &filePath) outputfilestream::outputfilestream(const std::string &filePath)
: std::ostream(0), m_streambuf(0), m_is_good(false) : std::ostream(0), m_streambuf(0), m_is_good(false)
{ {
// check if file is readable // check if file is readable
std::filebuf* fb = new std::filebuf(); std::filebuf* fb = new std::filebuf();

View File

@ -16,7 +16,9 @@ public:
explicit inputfilestream(const std::string &filePath); explicit inputfilestream(const std::string &filePath);
virtual ~inputfilestream(); virtual ~inputfilestream();
bool good() const { return m_is_good; } bool good() const {
return m_is_good;
}
void close(); void close();
}; };
@ -30,7 +32,9 @@ public:
explicit outputfilestream(const std::string &filePath); explicit outputfilestream(const std::string &filePath);
virtual ~outputfilestream(); virtual ~outputfilestream();
bool good() const { return m_is_good; } bool good() const {
return m_is_good;
}
void close(); void close();
}; };

View File

@ -5,7 +5,8 @@
#include <cstdio> #include <cstdio>
#include <iostream> #include <iostream>
GzFileBuf::GzFileBuf(const char* filename) { GzFileBuf::GzFileBuf(const char* filename)
{
m_gz_file = gzopen(filename, "rb"); m_gz_file = gzopen(filename, "rb");
if (m_gz_file == NULL) { if (m_gz_file == NULL) {
std::cerr << "ERROR: Failed to open " << filename << std::endl; std::cerr << "ERROR: Failed to open " << filename << std::endl;
@ -16,16 +17,19 @@ GzFileBuf::GzFileBuf(const char* filename) {
m_buf + sizeof(int)); // end position m_buf + sizeof(int)); // end position
} }
GzFileBuf::~GzFileBuf() { GzFileBuf::~GzFileBuf()
{
gzclose(m_gz_file); gzclose(m_gz_file);
} }
int GzFileBuf::overflow(int_type c) { int GzFileBuf::overflow(int_type c)
{
throw; throw;
} }
// read one character // read one character
int GzFileBuf::underflow() { int GzFileBuf::underflow()
{
// is read position before end of m_buf? // is read position before end of m_buf?
if (gptr() < egptr()) { if (gptr() < egptr()) {
return traits_type::to_int_type(*gptr()); return traits_type::to_int_type(*gptr());
@ -64,17 +68,20 @@ int GzFileBuf::underflow() {
} }
std::streampos GzFileBuf::seekpos( std::streampos GzFileBuf::seekpos(
std::streampos sp, std::streampos sp,
std::ios_base::openmode which) { std::ios_base::openmode which)
{
throw; throw;
} }
std::streamsize GzFileBuf::xsgetn(char* s, std::streamsize GzFileBuf::xsgetn(char* s,
std::streamsize num) { std::streamsize num)
{
return static_cast<std::streamsize>(gzread(m_gz_file,s,num)); return static_cast<std::streamsize>(gzread(m_gz_file,s,num));
} }
std::streamsize GzFileBuf::xsputn(const char* s, std::streamsize GzFileBuf::xsputn(const char* s,
std::streamsize num) { std::streamsize num)
{
throw; throw;
} }

View File

@ -17,8 +17,8 @@ protected:
virtual int_type underflow(); virtual int_type underflow();
virtual std::streampos seekpos( virtual std::streampos seekpos(
std::streampos sp, std::streampos sp,
std::ios_base::openmode which = std::ios_base::in | std::ios_base::out); std::ios_base::openmode which = std::ios_base::in | std::ios_base::out);
virtual std::streamsize xsgetn(char* s, std::streamsize num); virtual std::streamsize xsgetn(char* s, std::streamsize num);

View File

@ -8,13 +8,13 @@ using namespace std;
namespace MosesTuning namespace MosesTuning
{ {
StreamingHypPackEnumerator::StreamingHypPackEnumerator StreamingHypPackEnumerator::StreamingHypPackEnumerator
( (
vector<std::string> const& featureFiles, vector<std::string> const& featureFiles,
vector<std::string> const& scoreFiles vector<std::string> const& scoreFiles
) )
: m_featureFiles(featureFiles), : m_featureFiles(featureFiles),
m_scoreFiles(scoreFiles) m_scoreFiles(scoreFiles)
{ {
@ -22,19 +22,20 @@ StreamingHypPackEnumerator::StreamingHypPackEnumerator
cerr << "No data to process" << endl; cerr << "No data to process" << endl;
exit(0); exit(0);
} }
if (featureFiles.size() != scoreFiles.size()) { if (featureFiles.size() != scoreFiles.size()) {
cerr << "Error: Number of feature files (" << featureFiles.size() << cerr << "Error: Number of feature files (" << featureFiles.size() <<
") does not match number of score files (" << scoreFiles.size() << ")" << endl; ") does not match number of score files (" << scoreFiles.size() << ")" << endl;
exit(1); exit(1);
} }
m_num_lists = scoreFiles.size(); m_num_lists = scoreFiles.size();
m_primed = false; m_primed = false;
m_iNumDense = -1; m_iNumDense = -1;
} }
size_t StreamingHypPackEnumerator::num_dense() const { size_t StreamingHypPackEnumerator::num_dense() const
{
if(m_iNumDense<0) { if(m_iNumDense<0) {
cerr << "Error: Requested num_dense() for an unprimed StreamingHypPackEnumerator" << endl; cerr << "Error: Requested num_dense() for an unprimed StreamingHypPackEnumerator" << endl;
exit(1); exit(1);
@ -42,12 +43,13 @@ size_t StreamingHypPackEnumerator::num_dense() const {
return (size_t) m_iNumDense; return (size_t) m_iNumDense;
} }
void StreamingHypPackEnumerator::prime(){ void StreamingHypPackEnumerator::prime()
{
m_current_indexes.clear(); m_current_indexes.clear();
m_current_featureVectors.clear(); m_current_featureVectors.clear();
boost::unordered_set<FeatureDataItem> seen; boost::unordered_set<FeatureDataItem> seen;
m_primed = true; m_primed = true;
for (size_t i = 0; i < m_num_lists; ++i) { for (size_t i = 0; i < m_num_lists; ++i) {
if (m_featureDataIters[i] == FeatureDataIterator::end()) { if (m_featureDataIters[i] == FeatureDataIterator::end()) {
cerr << "Error: Feature file " << i << " ended prematurely" << endl; cerr << "Error: Feature file " << i << " ended prematurely" << endl;
@ -78,13 +80,14 @@ void StreamingHypPackEnumerator::prime(){
} }
// Store item for retrieval // Store item for retrieval
m_current_indexes.push_back(pair<size_t,size_t>(i,j)); m_current_indexes.push_back(pair<size_t,size_t>(i,j));
m_current_featureVectors.push_back(MiraFeatureVector(item)); m_current_featureVectors.push_back(MiraFeatureVector(item));
} }
} }
} }
} }
void StreamingHypPackEnumerator::reset(){ void StreamingHypPackEnumerator::reset()
{
m_featureDataIters.clear(); m_featureDataIters.clear();
m_scoreDataIters.clear(); m_scoreDataIters.clear();
for (size_t i = 0; i < m_num_lists; ++i) { for (size_t i = 0; i < m_num_lists; ++i) {
@ -95,11 +98,13 @@ void StreamingHypPackEnumerator::reset(){
prime(); prime();
} }
bool StreamingHypPackEnumerator::finished(){ bool StreamingHypPackEnumerator::finished()
{
return m_featureDataIters[0]==FeatureDataIterator::end(); return m_featureDataIters[0]==FeatureDataIterator::end();
} }
void StreamingHypPackEnumerator::next(){ void StreamingHypPackEnumerator::next()
{
if(!m_primed) { if(!m_primed) {
cerr << "Enumerating an unprimed HypPackEnumerator" << endl; cerr << "Enumerating an unprimed HypPackEnumerator" << endl;
exit(1); exit(1);
@ -113,7 +118,8 @@ void StreamingHypPackEnumerator::next(){
if(!finished()) prime(); if(!finished()) prime();
} }
size_t StreamingHypPackEnumerator::cur_size(){ size_t StreamingHypPackEnumerator::cur_size()
{
if(!m_primed) { if(!m_primed) {
cerr << "Querying size from an unprimed HypPackEnumerator" << endl; cerr << "Querying size from an unprimed HypPackEnumerator" << endl;
exit(1); exit(1);
@ -121,7 +127,8 @@ size_t StreamingHypPackEnumerator::cur_size(){
return m_current_indexes.size(); return m_current_indexes.size();
} }
const MiraFeatureVector& StreamingHypPackEnumerator::featuresAt(size_t index){ const MiraFeatureVector& StreamingHypPackEnumerator::featuresAt(size_t index)
{
if(!m_primed) { if(!m_primed) {
cerr << "Querying features from an unprimed HypPackEnumerator" << endl; cerr << "Querying features from an unprimed HypPackEnumerator" << endl;
exit(1); exit(1);
@ -129,7 +136,8 @@ const MiraFeatureVector& StreamingHypPackEnumerator::featuresAt(size_t index){
return m_current_featureVectors[index]; return m_current_featureVectors[index];
} }
const ScoreDataItem& StreamingHypPackEnumerator::scoresAt(size_t index) { const ScoreDataItem& StreamingHypPackEnumerator::scoresAt(size_t index)
{
if(!m_primed) { if(!m_primed) {
cerr << "Querying scores from an unprimed HypPackEnumerator" << endl; cerr << "Querying scores from an unprimed HypPackEnumerator" << endl;
exit(1); exit(1);
@ -138,22 +146,23 @@ const ScoreDataItem& StreamingHypPackEnumerator::scoresAt(size_t index) {
return m_scoreDataIters[pij.first]->operator[](pij.second); return m_scoreDataIters[pij.first]->operator[](pij.second);
} }
size_t StreamingHypPackEnumerator::cur_id() { size_t StreamingHypPackEnumerator::cur_id()
{
return m_sentenceId; return m_sentenceId;
} }
/* --------- RandomAccessHypPackEnumerator ------------- */ /* --------- RandomAccessHypPackEnumerator ------------- */
RandomAccessHypPackEnumerator::RandomAccessHypPackEnumerator(vector<string> const& featureFiles, RandomAccessHypPackEnumerator::RandomAccessHypPackEnumerator(vector<string> const& featureFiles,
vector<string> const& scoreFiles, vector<string> const& scoreFiles,
bool no_shuffle) bool no_shuffle)
{ {
StreamingHypPackEnumerator train(featureFiles,scoreFiles); StreamingHypPackEnumerator train(featureFiles,scoreFiles);
size_t index=0; size_t index=0;
for(train.reset(); !train.finished(); train.next()) { for(train.reset(); !train.finished(); train.next()) {
m_features.push_back(vector<MiraFeatureVector>()); m_features.push_back(vector<MiraFeatureVector>());
m_scores.push_back(vector<ScoreDataItem>()); m_scores.push_back(vector<ScoreDataItem>());
for(size_t j=0;j<train.cur_size();j++) { for(size_t j=0; j<train.cur_size(); j++) {
m_features.back().push_back(train.featuresAt(j)); m_features.back().push_back(train.featuresAt(j));
m_scores.back().push_back(train.scoresAt(j)); m_scores.back().push_back(train.scoresAt(j));
} }
@ -165,35 +174,43 @@ RandomAccessHypPackEnumerator::RandomAccessHypPackEnumerator(vector<string> cons
m_num_dense = train.num_dense(); m_num_dense = train.num_dense();
} }
size_t RandomAccessHypPackEnumerator::num_dense() const { size_t RandomAccessHypPackEnumerator::num_dense() const
{
return m_num_dense; return m_num_dense;
} }
void RandomAccessHypPackEnumerator::reset() { void RandomAccessHypPackEnumerator::reset()
{
m_cur_index = 0; m_cur_index = 0;
if(!m_no_shuffle) random_shuffle(m_indexes.begin(),m_indexes.end()); if(!m_no_shuffle) random_shuffle(m_indexes.begin(),m_indexes.end());
} }
bool RandomAccessHypPackEnumerator::finished() { bool RandomAccessHypPackEnumerator::finished()
{
return m_cur_index >= m_indexes.size(); return m_cur_index >= m_indexes.size();
} }
void RandomAccessHypPackEnumerator::next() { void RandomAccessHypPackEnumerator::next()
{
m_cur_index++; m_cur_index++;
} }
size_t RandomAccessHypPackEnumerator::cur_size() { size_t RandomAccessHypPackEnumerator::cur_size()
{
assert(m_features[m_indexes[m_cur_index]].size()==m_scores[m_indexes[m_cur_index]].size()); assert(m_features[m_indexes[m_cur_index]].size()==m_scores[m_indexes[m_cur_index]].size());
return m_features[m_indexes[m_cur_index]].size(); return m_features[m_indexes[m_cur_index]].size();
} }
const MiraFeatureVector& RandomAccessHypPackEnumerator::featuresAt(size_t i) { const MiraFeatureVector& RandomAccessHypPackEnumerator::featuresAt(size_t i)
{
return m_features[m_indexes[m_cur_index]][i]; return m_features[m_indexes[m_cur_index]][i];
} }
const ScoreDataItem& RandomAccessHypPackEnumerator::scoresAt(size_t i) { const ScoreDataItem& RandomAccessHypPackEnumerator::scoresAt(size_t i)
{
return m_scores[m_indexes[m_cur_index]][i]; return m_scores[m_indexes[m_cur_index]][i];
} }
size_t RandomAccessHypPackEnumerator::cur_id() { size_t RandomAccessHypPackEnumerator::cur_id()
{
return m_indexes[m_cur_index]; return m_indexes[m_cur_index];
} }
// --Emacs trickery-- // --Emacs trickery--
// Local Variables: // Local Variables:
// mode:c++ // mode:c++

View File

@ -20,11 +20,12 @@
namespace MosesTuning namespace MosesTuning
{ {
// Start with these abstract classes // Start with these abstract classes
class HypPackEnumerator { class HypPackEnumerator
{
public: public:
virtual ~HypPackEnumerator() {} virtual ~HypPackEnumerator() {}
@ -41,7 +42,8 @@ public:
// Instantiation that streams from disk // Instantiation that streams from disk
// Low-memory, low-speed, sequential access // Low-memory, low-speed, sequential access
class StreamingHypPackEnumerator : public HypPackEnumerator { class StreamingHypPackEnumerator : public HypPackEnumerator
{
public: public:
StreamingHypPackEnumerator(std::vector<std::string> const& featureFiles, StreamingHypPackEnumerator(std::vector<std::string> const& featureFiles,
std::vector<std::string> const& scoreFiles); std::vector<std::string> const& scoreFiles);
@ -75,7 +77,8 @@ private:
// Instantiation that reads into memory // Instantiation that reads into memory
// High-memory, high-speed, random access // High-memory, high-speed, random access
// (Actually randomizes with each call to reset) // (Actually randomizes with each call to reset)
class RandomAccessHypPackEnumerator : public HypPackEnumerator { class RandomAccessHypPackEnumerator : public HypPackEnumerator
{
public: public:
RandomAccessHypPackEnumerator(std::vector<std::string> const& featureFiles, RandomAccessHypPackEnumerator(std::vector<std::string> const& featureFiles,
std::vector<std::string> const& scoreFiles, std::vector<std::string> const& scoreFiles,

View File

@ -11,7 +11,7 @@ namespace MosesTuning
// TODO: This is too long. Consider creating a function for // TODO: This is too long. Consider creating a function for
// initialization such as Init(). // initialization such as Init().
InterpolatedScorer::InterpolatedScorer(const string& name, const string& config) InterpolatedScorer::InterpolatedScorer(const string& name, const string& config)
: Scorer(name,config) : Scorer(name,config)
{ {
// name would be: HAMMING,BLEU or similar // name would be: HAMMING,BLEU or similar
string scorers = name; string scorers = name;
@ -66,7 +66,8 @@ InterpolatedScorer::InterpolatedScorer(const string& name, const string& config)
cerr <<endl; cerr <<endl;
} }
bool InterpolatedScorer::useAlignment() const { bool InterpolatedScorer::useAlignment() const
{
//cout << "InterpolatedScorer::useAlignment" << endl; //cout << "InterpolatedScorer::useAlignment" << endl;
for (vector<Scorer*>::const_iterator itsc = m_scorers.begin(); itsc < m_scorers.end(); itsc++) { for (vector<Scorer*>::const_iterator itsc = m_scorers.begin(); itsc < m_scorers.end(); itsc++) {
if ((*itsc)->useAlignment()) { if ((*itsc)->useAlignment()) {
@ -176,8 +177,7 @@ void InterpolatedScorer::prepareStats(size_t sid, const string& text, ScoreStats
ScoreStats tempEntry; ScoreStats tempEntry;
if ((*itsc)->useAlignment()) { if ((*itsc)->useAlignment()) {
(*itsc)->prepareStats(sid, text, tempEntry); (*itsc)->prepareStats(sid, text, tempEntry);
} } else {
else {
(*itsc)->prepareStats(sid, sentence, tempEntry); (*itsc)->prepareStats(sid, sentence, tempEntry);
} }
if (i > 0) buff << " "; if (i > 0) buff << " ";
@ -206,17 +206,17 @@ void InterpolatedScorer::setFactors(const string& factors)
void InterpolatedScorer::setFilter(const string& filterCommand) void InterpolatedScorer::setFilter(const string& filterCommand)
{ {
if (filterCommand.empty()) return; if (filterCommand.empty()) return;
vector<string> csplit; vector<string> csplit;
split(filterCommand, ',', csplit); split(filterCommand, ',', csplit);
if (csplit.size() != m_scorers.size()) if (csplit.size() != m_scorers.size())
throw runtime_error("Number of command specifications does not equal number of interpolated scorers."); throw runtime_error("Number of command specifications does not equal number of interpolated scorers.");
for (size_t i = 0; i < m_scorers.size(); ++i) { for (size_t i = 0; i < m_scorers.size(); ++i) {
m_scorers[i]->setFilter(csplit[i]); m_scorers[i]->setFilter(csplit[i]);
} }
} }
} }

View File

@ -10,7 +10,7 @@
namespace MosesTuning namespace MosesTuning
{ {
/** /**
* Class that includes other scorers eg. * Class that includes other scorers eg.

View File

@ -7,7 +7,7 @@ using namespace std;
namespace MosesTuning namespace MosesTuning
{ {
MiraFeatureVector::MiraFeatureVector(const FeatureDataItem& vec) MiraFeatureVector::MiraFeatureVector(const FeatureDataItem& vec)
: m_dense(vec.dense) : m_dense(vec.dense)
@ -17,8 +17,7 @@ MiraFeatureVector::MiraFeatureVector(const FeatureDataItem& vec)
size_t lastFeat = 0; size_t lastFeat = 0;
m_sparseFeats.reserve(sparseFeats.size()); m_sparseFeats.reserve(sparseFeats.size());
m_sparseVals.reserve(sparseFeats.size()); m_sparseVals.reserve(sparseFeats.size());
for(size_t i=0;i<sparseFeats.size();i++) for(size_t i=0; i<sparseFeats.size(); i++) {
{
size_t feat = m_dense.size() + sparseFeats[i]; size_t feat = m_dense.size() + sparseFeats[i];
m_sparseFeats.push_back(feat); m_sparseFeats.push_back(feat);
m_sparseVals.push_back(vec.sparse.get(sparseFeats[i])); m_sparseVals.push_back(vec.sparse.get(sparseFeats[i]));
@ -26,8 +25,7 @@ MiraFeatureVector::MiraFeatureVector(const FeatureDataItem& vec)
// Check ordered property // Check ordered property
if(bFirst) { if(bFirst) {
bFirst = false; bFirst = false;
} } else {
else {
if(lastFeat>=feat) { if(lastFeat>=feat) {
cerr << "Error: Feature indeces must be strictly ascending coming out of SparseVector" << endl; cerr << "Error: Feature indeces must be strictly ascending coming out of SparseVector" << endl;
exit(1); exit(1);
@ -61,29 +59,33 @@ MiraFeatureVector::MiraFeatureVector(const vector<ValType>& dense,
} }
} }
ValType MiraFeatureVector::val(size_t index) const { ValType MiraFeatureVector::val(size_t index) const
{
if(index < m_dense.size()) if(index < m_dense.size())
return m_dense[index]; return m_dense[index];
else else
return m_sparseVals[index-m_dense.size()]; return m_sparseVals[index-m_dense.size()];
} }
size_t MiraFeatureVector::feat(size_t index) const { size_t MiraFeatureVector::feat(size_t index) const
{
if(index < m_dense.size()) if(index < m_dense.size())
return index; return index;
else else
return m_sparseFeats[index-m_dense.size()]; return m_sparseFeats[index-m_dense.size()];
} }
size_t MiraFeatureVector::size() const { size_t MiraFeatureVector::size() const
{
return m_dense.size() + m_sparseVals.size(); return m_dense.size() + m_sparseVals.size();
} }
ValType MiraFeatureVector::sqrNorm() const { ValType MiraFeatureVector::sqrNorm() const
{
ValType toRet = 0.0; ValType toRet = 0.0;
for(size_t i=0;i<m_dense.size();i++) for(size_t i=0; i<m_dense.size(); i++)
toRet += m_dense[i]*m_dense[i]; toRet += m_dense[i]*m_dense[i];
for(size_t i=0;i<m_sparseVals.size();i++) for(size_t i=0; i<m_sparseVals.size(); i++)
toRet += m_sparseVals[i] * m_sparseVals[i]; toRet += m_sparseVals[i] * m_sparseVals[i];
return toRet; return toRet;
} }
@ -96,7 +98,7 @@ MiraFeatureVector operator-(const MiraFeatureVector& a, const MiraFeatureVector&
cerr << "Mismatching dense vectors passed to MiraFeatureVector subtraction" << endl; cerr << "Mismatching dense vectors passed to MiraFeatureVector subtraction" << endl;
exit(1); exit(1);
} }
for(size_t i=0;i<a.m_dense.size();i++) { for(size_t i=0; i<a.m_dense.size(); i++) {
dense.push_back(a.m_dense[i] - b.m_dense[i]); dense.push_back(a.m_dense[i] - b.m_dense[i]);
} }
@ -148,7 +150,7 @@ MiraFeatureVector operator-(const MiraFeatureVector& a, const MiraFeatureVector&
ostream& operator<<(ostream& o, const MiraFeatureVector& e) ostream& operator<<(ostream& o, const MiraFeatureVector& e)
{ {
for(size_t i=0;i<e.size();i++) { for(size_t i=0; i<e.size(); i++) {
if(i>0) o << " "; if(i>0) o << " ";
o << e.feat(i) << ":" << e.val(i); o << e.feat(i) << ":" << e.val(i);
} }

View File

@ -19,11 +19,12 @@
namespace MosesTuning namespace MosesTuning
{ {
typedef FeatureStatsType ValType; typedef FeatureStatsType ValType;
class MiraFeatureVector { class MiraFeatureVector
{
public: public:
MiraFeatureVector(const FeatureDataItem& vec); MiraFeatureVector(const FeatureDataItem& vec);
MiraFeatureVector(const MiraFeatureVector& other); MiraFeatureVector(const MiraFeatureVector& other);

View File

@ -6,7 +6,7 @@ using namespace std;
namespace MosesTuning namespace MosesTuning
{ {
/** /**
* Constructor, initializes to the zero vector * Constructor, initializes to the zero vector
@ -36,9 +36,10 @@ MiraWeightVector::MiraWeightVector(const vector<ValType>& init)
* \param fv Feature vector to be added to the weights * \param fv Feature vector to be added to the weights
* \param tau FV will be scaled by this value before update * \param tau FV will be scaled by this value before update
*/ */
void MiraWeightVector::update(const MiraFeatureVector& fv, float tau) { void MiraWeightVector::update(const MiraFeatureVector& fv, float tau)
{
m_numUpdates++; m_numUpdates++;
for(size_t i=0;i<fv.size();i++) { for(size_t i=0; i<fv.size(); i++) {
update(fv.feat(i), fv.val(i)*tau); update(fv.feat(i), fv.val(i)*tau);
} }
} }
@ -46,7 +47,8 @@ void MiraWeightVector::update(const MiraFeatureVector& fv, float tau) {
/** /**
* Perform an empty update (affects averaging) * Perform an empty update (affects averaging)
*/ */
void MiraWeightVector::tick() { void MiraWeightVector::tick()
{
m_numUpdates++; m_numUpdates++;
} }
@ -54,7 +56,8 @@ void MiraWeightVector::tick() {
* Score a feature vector according to the model * Score a feature vector according to the model
* \param fv Feature vector to be scored * \param fv Feature vector to be scored
*/ */
ValType MiraWeightVector::score(const MiraFeatureVector& fv) const { ValType MiraWeightVector::score(const MiraFeatureVector& fv) const
{
ValType toRet = 0.0; ValType toRet = 0.0;
for(size_t i=0; i<fv.size(); i++) { for(size_t i=0; i<fv.size(); i++) {
toRet += weight(fv.feat(i)) * fv.val(i); toRet += weight(fv.feat(i)) * fv.val(i);
@ -65,7 +68,8 @@ ValType MiraWeightVector::score(const MiraFeatureVector& fv) const {
/** /**
* Return an averaged view of this weight vector * Return an averaged view of this weight vector
*/ */
AvgWeightVector MiraWeightVector::avg() { AvgWeightVector MiraWeightVector::avg()
{
this->fixTotals(); this->fixTotals();
return AvgWeightVector(*this); return AvgWeightVector(*this);
} }
@ -73,7 +77,8 @@ AvgWeightVector MiraWeightVector::avg() {
/** /**
* Updates a weight and lazily updates its total * Updates a weight and lazily updates its total
*/ */
void MiraWeightVector::update(size_t index, ValType delta) { void MiraWeightVector::update(size_t index, ValType delta)
{
// Handle previously unseen weights // Handle previously unseen weights
while(index>=m_weights.size()) { while(index>=m_weights.size()) {
@ -91,25 +96,27 @@ void MiraWeightVector::update(size_t index, ValType delta) {
/** /**
* Make sure everyone's total is up-to-date * Make sure everyone's total is up-to-date
*/ */
void MiraWeightVector::fixTotals() { void MiraWeightVector::fixTotals()
{
for(size_t i=0; i<m_weights.size(); i++) update(i,0); for(size_t i=0; i<m_weights.size(); i++) update(i,0);
} }
/** /**
* Helper to handle out of range weights * Helper to handle out of range weights
*/ */
ValType MiraWeightVector::weight(size_t index) const { ValType MiraWeightVector::weight(size_t index) const
{
if(index < m_weights.size()) { if(index < m_weights.size()) {
return m_weights[index]; return m_weights[index];
} } else {
else {
return 0; return 0;
} }
} }
ValType MiraWeightVector::sqrNorm() const { ValType MiraWeightVector::sqrNorm() const
{
ValType toRet = 0; ValType toRet = 0;
for(size_t i=0;i<m_weights.size();i++) { for(size_t i=0; i<m_weights.size(); i++) {
toRet += weight(i) * weight(i); toRet += weight(i) * weight(i);
} }
return toRet; return toRet;
@ -119,9 +126,9 @@ AvgWeightVector::AvgWeightVector(const MiraWeightVector& wv)
:m_wv(wv) :m_wv(wv)
{} {}
ostream& operator<<(ostream& o, const MiraWeightVector& e) ostream& operator<<(ostream& o, const MiraWeightVector& e)
{ {
for(size_t i=0;i<e.m_weights.size();i++) { for(size_t i=0; i<e.m_weights.size(); i++) {
if(abs(e.m_weights[i])>1e-8) { if(abs(e.m_weights[i])>1e-8) {
if(i>0) o << " "; if(i>0) o << " ";
cerr << i << ":" << e.m_weights[i]; cerr << i << ":" << e.m_weights[i];
@ -136,14 +143,14 @@ ValType AvgWeightVector::weight(size_t index) const
else { else {
if(index < m_wv.m_totals.size()) { if(index < m_wv.m_totals.size()) {
return m_wv.m_totals[index] / m_wv.m_numUpdates; return m_wv.m_totals[index] / m_wv.m_numUpdates;
} } else {
else {
return 0; return 0;
} }
} }
} }
ValType AvgWeightVector::score(const MiraFeatureVector& fv) const { ValType AvgWeightVector::score(const MiraFeatureVector& fv) const
{
ValType toRet = 0.0; ValType toRet = 0.0;
for(size_t i=0; i<fv.size(); i++) { for(size_t i=0; i<fv.size(); i++) {
toRet += weight(fv.feat(i)) * fv.val(i); toRet += weight(fv.feat(i)) * fv.val(i);
@ -151,7 +158,8 @@ ValType AvgWeightVector::score(const MiraFeatureVector& fv) const {
return toRet; return toRet;
} }
size_t AvgWeightVector::size() const { size_t AvgWeightVector::size() const
{
return m_wv.m_weights.size(); return m_wv.m_weights.size();
} }

View File

@ -17,11 +17,12 @@
namespace MosesTuning namespace MosesTuning
{ {
class AvgWeightVector; class AvgWeightVector;
class MiraWeightVector { class MiraWeightVector
{
public: public:
/** /**
* Constructor, initializes to the zero vector * Constructor, initializes to the zero vector
@ -91,7 +92,8 @@ private:
/** /**
* Averaged view of a weight vector * Averaged view of a weight vector
*/ */
class AvgWeightVector { class AvgWeightVector
{
public: public:
AvgWeightVector(const MiraWeightVector& wv); AvgWeightVector(const MiraWeightVector& wv);
ValType score(const MiraFeatureVector& fv) const; ValType score(const MiraFeatureVector& fv) const;

View File

@ -13,8 +13,9 @@ namespace MosesTuning
* typical accessors and mutaors, but we intentionally does not allow * typical accessors and mutaors, but we intentionally does not allow
* erasing elements. * erasing elements.
*/ */
class NgramCounts { class NgramCounts
public: {
public:
// Used to construct the ngram map // Used to construct the ngram map
struct NgramComparator { struct NgramComparator {
bool operator()(const std::vector<int>& a, const std::vector<int>& b) const { bool operator()(const std::vector<int>& a, const std::vector<int>& b) const {
@ -45,7 +46,9 @@ class NgramCounts {
/** /**
* If the specified "ngram" is found, we add counts. * If the specified "ngram" is found, we add counts.
* If not, we insert the default count in the container. */ * If not, we insert the default count in the container. */
inline void Add(const Key& ngram) { m_counts[ngram]++; } inline void Add(const Key& ngram) {
m_counts[ngram]++;
}
/** /**
* Return true iff the specified "ngram" is found in the container. * Return true iff the specified "ngram" is found in the container.
@ -60,34 +63,58 @@ class NgramCounts {
/** /**
* Clear all elments in the container. * Clear all elments in the container.
*/ */
void clear() { m_counts.clear(); } void clear() {
m_counts.clear();
}
/** /**
* Return true iff the container is empty. * Return true iff the container is empty.
*/ */
bool empty() const { return m_counts.empty(); } bool empty() const {
return m_counts.empty();
}
/** /**
* Return the the number of elements in the container. * Return the the number of elements in the container.
*/ */
std::size_t size() const { return m_counts.size(); } std::size_t size() const {
return m_counts.size();
}
std::size_t max_size() const { return m_counts.max_size(); } std::size_t max_size() const {
return m_counts.max_size();
}
// Note: This is mainly used by unit tests. // Note: This is mainly used by unit tests.
int get_default_count() const { return kDefaultCount; } int get_default_count() const {
return kDefaultCount;
}
iterator find(const Key& ngram) { return m_counts.find(ngram); } iterator find(const Key& ngram) {
const_iterator find(const Key& ngram) const { return m_counts.find(ngram); } return m_counts.find(ngram);
}
const_iterator find(const Key& ngram) const {
return m_counts.find(ngram);
}
Value& operator[](const Key& ngram) { return m_counts[ngram]; } Value& operator[](const Key& ngram) {
return m_counts[ngram];
}
iterator begin() { return m_counts.begin(); } iterator begin() {
const_iterator begin() const { return m_counts.begin(); } return m_counts.begin();
iterator end() { return m_counts.end(); } }
const_iterator end() const { return m_counts.end(); } const_iterator begin() const {
return m_counts.begin();
}
iterator end() {
return m_counts.end();
}
const_iterator end() const {
return m_counts.end();
}
private: private:
const int kDefaultCount; const int kDefaultCount;
boost::unordered_map<Key, Value> m_counts; boost::unordered_map<Key, Value> m_counts;
}; };

View File

@ -5,7 +5,8 @@
using namespace MosesTuning; using namespace MosesTuning;
BOOST_AUTO_TEST_CASE(ngram_basic) { BOOST_AUTO_TEST_CASE(ngram_basic)
{
NgramCounts counts; NgramCounts counts;
NgramCounts::Key key; NgramCounts::Key key;
key.push_back(1); key.push_back(1);
@ -25,7 +26,8 @@ BOOST_AUTO_TEST_CASE(ngram_basic) {
BOOST_CHECK_EQUAL(it->second, 1); BOOST_CHECK_EQUAL(it->second, 1);
} }
BOOST_AUTO_TEST_CASE(ngram_Add) { BOOST_AUTO_TEST_CASE(ngram_Add)
{
NgramCounts counts; NgramCounts counts;
NgramCounts::Key key; NgramCounts::Key key;
key.push_back(1); key.push_back(1);
@ -49,7 +51,8 @@ BOOST_AUTO_TEST_CASE(ngram_Add) {
BOOST_CHECK_EQUAL(counts[key3], counts.get_default_count()); BOOST_CHECK_EQUAL(counts[key3], counts.get_default_count());
} }
BOOST_AUTO_TEST_CASE(ngram_lookup) { BOOST_AUTO_TEST_CASE(ngram_lookup)
{
NgramCounts counts; NgramCounts counts;
NgramCounts::Key key; NgramCounts::Key key;
key.push_back(1); key.push_back(1);

View File

@ -17,7 +17,8 @@ using namespace std;
static const float MIN_FLOAT = -1.0 * numeric_limits<float>::max(); static const float MIN_FLOAT = -1.0 * numeric_limits<float>::max();
static const float MAX_FLOAT = numeric_limits<float>::max(); static const float MAX_FLOAT = numeric_limits<float>::max();
namespace { namespace
{
/** /**
* Compute the intersection of 2 lines. * Compute the intersection of 2 lines.
@ -35,7 +36,7 @@ inline float intersect(float m1, float b1, float m2, float b2)
namespace MosesTuning namespace MosesTuning
{ {
Optimizer::Optimizer(unsigned Pd, const vector<unsigned>& i2O, const vector<bool>& pos, const vector<parameter_t>& start, unsigned int nrandom) Optimizer::Optimizer(unsigned Pd, const vector<unsigned>& i2O, const vector<bool>& pos, const vector<parameter_t>& start, unsigned int nrandom)
: m_scorer(NULL), m_feature_data(), m_num_random_directions(nrandom), m_positive(pos) : m_scorer(NULL), m_feature_data(), m_num_random_directions(nrandom), m_positive(pos)
@ -198,7 +199,7 @@ statscore_t Optimizer::LineOptimize(const Point& origin, const Point& direction,
thresholdmap.erase(previnserted); // erase old previnsert thresholdmap.erase(previnserted); // erase old previnsert
previnserted = thresholdmap.find(leftmostx); // point previnsert to the new threshold previnserted = thresholdmap.find(leftmostx); // point previnsert to the new threshold
previnserted->second.back()=newd; // We update the diff for sentence S previnserted->second.back()=newd; // We update the diff for sentence S
// Threshold already exists but is not the previous one. // Threshold already exists but is not the previous one.
} else { } else {
// We append the diffs in previnsert to tit before destroying previnsert. // We append the diffs in previnsert to tit before destroying previnsert.
tit->second.insert(tit->second.end(),previnserted->second.begin(),previnserted->second.end()); tit->second.insert(tit->second.end(),previnserted->second.begin(),previnserted->second.end());
@ -405,8 +406,7 @@ statscore_t SimpleOptimizer::TrueRun(Point& P) const
for (unsigned int i = 0; i < Point::getdim(); i++) for (unsigned int i = 0; i < Point::getdim(); i++)
direction[i]=0.0; direction[i]=0.0;
direction[d]=1.0; direction[d]=1.0;
} } else { // random direction update
else { // random direction update
direction.Randomize(); direction.Randomize();
} }
statscore_t curscore = LineOptimize(P, direction, linebest);//find the minimum on the line statscore_t curscore = LineOptimize(P, direction, linebest);//find the minimum on the line
@ -443,8 +443,7 @@ statscore_t RandomDirectionOptimizer::TrueRun(Point& P) const
// do specified number of random direction optimizations // do specified number of random direction optimizations
unsigned int nrun = 0; unsigned int nrun = 0;
unsigned int nrun_no_change = 0; unsigned int nrun_no_change = 0;
for (; nrun_no_change < m_num_random_directions; nrun++, nrun_no_change++) for (; nrun_no_change < m_num_random_directions; nrun++, nrun_no_change++) {
{
// choose a random direction in which to optimize // choose a random direction in which to optimize
Point direction; Point direction;
direction.Randomize(); direction.Randomize();

View File

@ -12,7 +12,7 @@ static const float kMaxFloat = std::numeric_limits<float>::max();
namespace MosesTuning namespace MosesTuning
{ {
class Point; class Point;
@ -31,8 +31,12 @@ protected:
public: public:
Optimizer(unsigned Pd, const std::vector<unsigned>& i2O, const std::vector<bool>& positive, const std::vector<parameter_t>& start, unsigned int nrandom); Optimizer(unsigned Pd, const std::vector<unsigned>& i2O, const std::vector<bool>& positive, const std::vector<parameter_t>& start, unsigned int nrandom);
void SetScorer(Scorer *scorer) { m_scorer = scorer; } void SetScorer(Scorer *scorer) {
void SetFeatureData(FeatureDataHandle feature_data) { m_feature_data = feature_data; } m_scorer = scorer;
}
void SetFeatureData(FeatureDataHandle feature_data) {
m_feature_data = feature_data;
}
virtual ~Optimizer(); virtual ~Optimizer();
unsigned size() const { unsigned size() const {
@ -97,7 +101,7 @@ private:
public: public:
RandomDirectionOptimizer(unsigned dim, const std::vector<unsigned>& i2O, const std::vector<bool>& positive, RandomDirectionOptimizer(unsigned dim, const std::vector<unsigned>& i2O, const std::vector<bool>& positive,
const std::vector<parameter_t>& start, unsigned int nrandom) const std::vector<parameter_t>& start, unsigned int nrandom)
: Optimizer(dim, i2O, positive, start, nrandom), kEPS(0.0001f) {} : Optimizer(dim, i2O, positive, start, nrandom), kEPS(0.0001f) {}
virtual statscore_t TrueRun(Point&) const; virtual statscore_t TrueRun(Point&) const;
}; };
@ -109,7 +113,7 @@ class RandomOptimizer : public Optimizer
public: public:
RandomOptimizer(unsigned dim, const std::vector<unsigned>& i2O, const std::vector<bool>& positive, RandomOptimizer(unsigned dim, const std::vector<unsigned>& i2O, const std::vector<bool>& positive,
const std::vector<parameter_t>& start, unsigned int nrandom) const std::vector<parameter_t>& start, unsigned int nrandom)
: Optimizer(dim, i2O, positive, start, nrandom) {} : Optimizer(dim, i2O, positive, start, nrandom) {}
virtual statscore_t TrueRun(Point&) const; virtual statscore_t TrueRun(Point&) const;
}; };

View File

@ -5,7 +5,7 @@ using namespace std;
namespace MosesTuning namespace MosesTuning
{ {
vector<string> OptimizerFactory::m_type_names; vector<string> OptimizerFactory::m_type_names;
@ -38,11 +38,11 @@ OptimizerFactory::OptimizerType OptimizerFactory::GetOptimizerType(const string&
} }
Optimizer* OptimizerFactory::BuildOptimizer(unsigned dim, Optimizer* OptimizerFactory::BuildOptimizer(unsigned dim,
const vector<unsigned>& i2o, const vector<unsigned>& i2o,
const std::vector<bool>& positive, const std::vector<bool>& positive,
const vector<parameter_t>& start, const vector<parameter_t>& start,
const string& type, const string& type,
unsigned int nrandom) unsigned int nrandom)
{ {
OptimizerType opt_type = GetOptimizerType(type); OptimizerType opt_type = GetOptimizerType(type);
if (opt_type == NOPTIMIZER) { if (opt_type == NOPTIMIZER) {
@ -55,18 +55,18 @@ Optimizer* OptimizerFactory::BuildOptimizer(unsigned dim,
} }
switch (opt_type) { switch (opt_type) {
case POWELL: case POWELL:
return new SimpleOptimizer(dim, i2o, positive, start, nrandom); return new SimpleOptimizer(dim, i2o, positive, start, nrandom);
break; break;
case RANDOM_DIRECTION: case RANDOM_DIRECTION:
return new RandomDirectionOptimizer(dim, i2o, positive, start, nrandom); return new RandomDirectionOptimizer(dim, i2o, positive, start, nrandom);
break; break;
case RANDOM: case RANDOM:
return new RandomOptimizer(dim, i2o, positive, start, nrandom); return new RandomOptimizer(dim, i2o, positive, start, nrandom);
break; break;
default: default:
cerr << "Error: unknown optimizer" << type << endl; cerr << "Error: unknown optimizer" << type << endl;
return NULL; return NULL;
} }
} }

View File

@ -6,13 +6,13 @@
namespace MosesTuning namespace MosesTuning
{ {
class Optimizer; class Optimizer;
class OptimizerFactory class OptimizerFactory
{ {
public: public:
// NOTE: Add new optimizer here BEFORE NOPTIMZER // NOTE: Add new optimizer here BEFORE NOPTIMZER
enum OptimizerType { enum OptimizerType {
POWELL = 0, POWELL = 0,
@ -36,7 +36,7 @@ class OptimizerFactory
const std::string& type, const std::string& type,
unsigned int nrandom); unsigned int nrandom);
private: private:
OptimizerFactory() {} OptimizerFactory() {}
~OptimizerFactory() {} ~OptimizerFactory() {}

View File

@ -7,21 +7,24 @@
using namespace MosesTuning; using namespace MosesTuning;
namespace { namespace
{
inline bool CheckBuildOptimizer(unsigned dim, inline bool CheckBuildOptimizer(unsigned dim,
const std::vector<unsigned>& to_optimize, const std::vector<unsigned>& to_optimize,
const std::vector<bool>& positive, const std::vector<bool>& positive,
const std::vector<parameter_t>& start, const std::vector<parameter_t>& start,
const std::string& type, const std::string& type,
unsigned int num_random) { unsigned int num_random)
{
boost::scoped_ptr<Optimizer> optimizer(OptimizerFactory::BuildOptimizer(dim, to_optimize, positive, start, type, num_random)); boost::scoped_ptr<Optimizer> optimizer(OptimizerFactory::BuildOptimizer(dim, to_optimize, positive, start, type, num_random));
return optimizer.get() != NULL; return optimizer.get() != NULL;
} }
} // namespace } // namespace
BOOST_AUTO_TEST_CASE(optimizer_type) { BOOST_AUTO_TEST_CASE(optimizer_type)
{
BOOST_CHECK_EQUAL(OptimizerFactory::GetOptimizerType("powell"), BOOST_CHECK_EQUAL(OptimizerFactory::GetOptimizerType("powell"),
OptimizerFactory::POWELL); OptimizerFactory::POWELL);
BOOST_CHECK_EQUAL(OptimizerFactory::GetOptimizerType("random"), BOOST_CHECK_EQUAL(OptimizerFactory::GetOptimizerType("random"),
@ -30,7 +33,8 @@ BOOST_AUTO_TEST_CASE(optimizer_type) {
OptimizerFactory::RANDOM_DIRECTION); OptimizerFactory::RANDOM_DIRECTION);
} }
BOOST_AUTO_TEST_CASE(optimizer_build) { BOOST_AUTO_TEST_CASE(optimizer_build)
{
const unsigned dim = 3; const unsigned dim = 3;
std::vector<unsigned> to_optimize; std::vector<unsigned> to_optimize;
to_optimize.push_back(1); to_optimize.push_back(1);

View File

@ -10,7 +10,7 @@ using namespace std;
namespace MosesTuning namespace MosesTuning
{ {
// Build a PER scorer: forwards the fixed scorer name "PER" and the
// caller-supplied configuration string to the StatisticsBasedScorer base.
PerScorer::PerScorer(const string& config)
  : StatisticsBasedScorer("PER",config)
{
}

View File

@ -9,7 +9,7 @@
namespace MosesTuning namespace MosesTuning
{ {
class ScoreStats; class ScoreStats;
@ -27,7 +27,9 @@ public:
virtual void setReferenceFiles(const std::vector<std::string>& referenceFiles); virtual void setReferenceFiles(const std::vector<std::string>& referenceFiles);
virtual void prepareStats(std::size_t sid, const std::string& text, ScoreStats& entry); virtual void prepareStats(std::size_t sid, const std::string& text, ScoreStats& entry);
virtual std::size_t NumberOfScores() const { return 3; } virtual std::size_t NumberOfScores() const {
return 3;
}
virtual float calculateScore(const std::vector<int>& comps) const; virtual float calculateScore(const std::vector<int>& comps) const;
private: private:

View File

@ -16,7 +16,7 @@ using namespace std;
namespace MosesTuning namespace MosesTuning
{ {
Permutation::Permutation(const string &alignment, const int sourceLength, const int targetLength ) Permutation::Permutation(const string &alignment, const int sourceLength, const int targetLength )
{ {
@ -86,7 +86,7 @@ void Permutation::set(const string & alignment,const int sourceLength)
//cout << "SP:" << sourcePos << " TP:" << targetPos << endl; //cout << "SP:" << sourcePos << " TP:" << targetPos << endl;
if (sourcePos > sourceLength) { if (sourcePos > sourceLength) {
cerr << "Source sentence length:" << sourceLength << " is smaller than alignment source position:" << sourcePos << endl; cerr << "Source sentence length:" << sourceLength << " is smaller than alignment source position:" << sourcePos << endl;
cerr << "******** Permutation::set :" << alignment << ": len : " << sourceLength <<endl; cerr << "******** Permutation::set :" << alignment << ": len : " << sourceLength <<endl;
exit(1); exit(1);
} }
//If have multiple target pos aligned to one source, //If have multiple target pos aligned to one source,
@ -187,7 +187,7 @@ float Permutation::distance(const Permutation &permCompare, const distanceMetric
float score=0; float score=0;
//bool debug= (verboselevel()>3); // TODO: fix verboselevel() //bool debug= (verboselevel()>3); // TODO: fix verboselevel()
bool debug=false; bool debug=false;
if (debug) { if (debug) {
cout << "*****Permutation::distance" <<endl; cout << "*****Permutation::distance" <<endl;
cout << "Hypo:" << endl; cout << "Hypo:" << endl;

View File

@ -19,7 +19,7 @@
namespace MosesTuning namespace MosesTuning
{ {
class Permutation class Permutation
{ {

View File

@ -5,7 +5,7 @@ using namespace std;
namespace MosesTuning namespace MosesTuning
{ {
const int PermutationScorer::SCORE_PRECISION = 5; const int PermutationScorer::SCORE_PRECISION = 5;
const int PermutationScorer::SCORE_MULTFACT = 100000; // 100000=10^SCORE_PRECISION const int PermutationScorer::SCORE_MULTFACT = 100000; // 100000=10^SCORE_PRECISION
@ -147,7 +147,7 @@ int PermutationScorer::getNumberWords (const string& text) const
void PermutationScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry) void PermutationScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
{ {
//bool debug= (verboselevel()>3); // TODO: fix verboselevel() //bool debug= (verboselevel()>3); // TODO: fix verboselevel()
bool debug=false; bool debug=false;
if (debug) { if (debug) {
cout << "*******prepareStats" ; cout << "*******prepareStats" ;
cout << text << endl; cout << text << endl;

View File

@ -19,7 +19,7 @@
namespace MosesTuning namespace MosesTuning
{ {
/** /**
* Permutation * Permutation
**/ **/

View File

@ -29,7 +29,7 @@ Point::Point() : vector<parameter_t>(m_dim), m_score(0.0) {}
Point::Point(const vector<parameter_t>& init, Point::Point(const vector<parameter_t>& init,
const vector<parameter_t>& min, const vector<parameter_t>& min,
const vector<parameter_t>& max) const vector<parameter_t>& max)
: vector<parameter_t>(Point::m_dim), m_score(0.0) : vector<parameter_t>(Point::m_dim), m_score(0.0)
{ {
m_min.resize(Point::m_dim); m_min.resize(Point::m_dim);
m_max.resize(Point::m_dim); m_max.resize(Point::m_dim);

View File

@ -8,7 +8,7 @@
namespace MosesTuning namespace MosesTuning
{ {
class FeatureStats; class FeatureStats;
class Optimizer; class Optimizer;
@ -53,11 +53,19 @@ private:
statscore_t m_score; statscore_t m_score;
public: public:
static unsigned int getdim() { return m_dim; } static unsigned int getdim() {
static void setdim(std::size_t d) { m_dim = d; } return m_dim;
}
static void setdim(std::size_t d) {
m_dim = d;
}
static unsigned int getpdim() { return m_pdim; } static unsigned int getpdim() {
static void setpdim(std::size_t pd) { m_pdim = pd; } return m_pdim;
}
static void setpdim(std::size_t pd) {
m_pdim = pd;
}
static void set_optindices(const std::vector<unsigned int>& indices) { static void set_optindices(const std::vector<unsigned int>& indices) {
m_opt_indices = indices; m_opt_indices = indices;
@ -90,7 +98,9 @@ public:
*/ */
friend std::ostream& operator<<(std::ostream& o,const Point& P); friend std::ostream& operator<<(std::ostream& o,const Point& P);
void Normalize() { NormalizeL2(); } void Normalize() {
NormalizeL2();
}
void NormalizeL2(); void NormalizeL2();
void NormalizeL1(); void NormalizeL1();
@ -100,8 +110,12 @@ public:
*/ */
void GetAllWeights(std::vector<parameter_t>& w) const; void GetAllWeights(std::vector<parameter_t>& w) const;
statscore_t GetScore() const { return m_score; } statscore_t GetScore() const {
void SetScore(statscore_t score) { m_score = score; } return m_score;
}
void SetScore(statscore_t score) {
m_score = score;
}
}; };
} }

View File

@ -9,7 +9,8 @@
using namespace std; using namespace std;
using namespace MosesTuning; using namespace MosesTuning;
BOOST_AUTO_TEST_CASE(point_operators) { BOOST_AUTO_TEST_CASE(point_operators)
{
const unsigned int dim = 5; const unsigned int dim = 5;
vector<float> init(dim); vector<float> init(dim);
init[0] = 1.0f; init[0] = 1.0f;

View File

@ -18,7 +18,7 @@ using namespace std;
namespace MosesTuning namespace MosesTuning
{ {
// Child exec error signal // Child exec error signal
void exec_failed (int sig) void exec_failed (int sig)
@ -28,116 +28,108 @@ void exec_failed (int sig)
} }
/**
 * Start `filterCommand` as a child process (via `/bin/bash -c`) and connect
 * to it through two pipes: one feeding the child's standard input, one
 * collecting its standard output.
 *
 * On success m_toFilter wraps the write end of the input pipe and
 * m_fromFilter wraps the read end of the output pipe. Any failure to install
 * the signal handler, create a pipe, or fork terminates the process with
 * EXIT_FAILURE.
 *
 * NOTE(review): the CHILD_* names are defined outside this view; they are
 * presumably macros selecting the ends of pipefds_input / pipefds_output —
 * confirm against the top of the file.
 *
 * @param filterCommand shell command line executed by bash in the child.
 */
PreProcessFilter::PreProcessFilter(const string& filterCommand)
  : m_toFilter(NULL),
    m_fromFilter(NULL)
{
  // Child error signal install
  // sigaction is the replacement for the traditional signal() method
  struct sigaction action;
  action.sa_handler = exec_failed;
  sigemptyset(&action.sa_mask);
  action.sa_flags = 0;
  if (sigaction(SIGUSR1, &action, NULL) < 0) {
    perror("SIGUSR1 install error");
    exit(EXIT_FAILURE);
  }

  int pipe_status;
  int pipefds_input[2];
  int pipefds_output[2];
  // int pipefds_error[2];

  // Create the pipes
  // We do this before the fork so both processes will know about
  // the same pipe and they can communicate.

  pipe_status = pipe(pipefds_input);
  if (pipe_status == -1) {
    perror("Error creating the pipe");
    exit(EXIT_FAILURE);
  }

  pipe_status = pipe(pipefds_output);
  if (pipe_status == -1) {
    perror("Error creating the pipe");
    exit(EXIT_FAILURE);
  }

  /*
  pipe_status = pipe(pipefds_error);
  if (pipe_status == -1)
  {
      perror("Error creating the pipe");
      exit(EXIT_FAILURE);
  }
  */

  pid_t pid;
  // Create child process; both processes continue from here
  pid = fork();

  if (pid == pid_t(0)) {
    // Child process

    // When the child process finishes sends a SIGCHLD signal
    // to the parent

    // Tie the standard input, output and error streams to the
    // appropiate pipe ends
    // The file descriptor 0 is the standard input
    // We tie it to the read end of the pipe as we will use
    // this end of the pipe to read from it
    if (dup2 (CHILD_STDIN_READ,0) == -1 || dup2 (CHILD_STDOUT_WRITE,1) == -1) {
      // Without the redirection the filter would talk to the wrong streams,
      // so treat this like a failed exec: notify the parent and bail out.
      perror("Error redirecting the filter's standard streams");
      kill(getppid(), SIGUSR1);
      _exit(EXIT_FAILURE);
    }
    // dup2 (CHILD_STDERR_WRITE,2);
    // Close in the child the unused ends of the pipes
    close(CHILD_STDIN_WRITE);
    close(CHILD_STDOUT_READ);
    //close(CHILD_STDERR_READ);

    // Execute the program
    execl("/bin/bash", "bash", "-c", filterCommand.c_str() , (char*)NULL);

    // We should never reach this point
    // Tell the parent the exec failed
    kill(getppid(), SIGUSR1);
    // _exit, not exit: a forked child must not run the parent's atexit
    // handlers or flush stdio buffers it inherited from the parent.
    _exit(EXIT_FAILURE);
  } else if (pid > pid_t(0)) {
    // Parent

    // Close in the parent the unused ends of the pipes
    close(CHILD_STDIN_READ);
    close(CHILD_STDOUT_WRITE);
    // close(CHILD_STDERR_WRITE);

    m_toFilter = new ofdstream(CHILD_STDIN_WRITE);
    m_fromFilter = new ifdstream(CHILD_STDOUT_READ);
  } else {
    perror("Error: fork failed");
    exit(EXIT_FAILURE);
  }
}
/**
 * Push one sentence through the external filter.
 *
 * Writes `sentence` followed by a newline to the stream feeding the filter,
 * then reads one line back from the filter's output stream.
 *
 * NOTE(review): this presumably blocks until the filter emits a line, so the
 * filter is expected to answer line by line — confirm against ifdstream.
 *
 * @param sentence text to send to the filter (newline is appended here).
 * @return the line read back from the filter.
 */
string PreProcessFilter::ProcessSentence(const string& sentence)
{
  ofdstream& filterInput = *m_toFilter;
  filterInput << sentence << "\n";

  string filtered;
  m_fromFilter->getline(filtered);
  return filtered;
}
// Release both stream wrappers allocated by the constructor. The stream
// towards the filter is destroyed first, then the stream coming back from it.
PreProcessFilter::~PreProcessFilter()
{
  delete m_toFilter;    // stream writing to the filter
  delete m_fromFilter;  // stream reading from the filter
}
} }

View File

@ -5,7 +5,7 @@
namespace MosesTuning namespace MosesTuning
{ {
class ofdstream; class ofdstream;
class ifdstream; class ifdstream;
@ -22,8 +22,8 @@ public:
~PreProcessFilter(); ~PreProcessFilter();
private: private:
ofdstream* m_toFilter; ofdstream* m_toFilter;
ifdstream* m_fromFilter; ifdstream* m_fromFilter;
}; };
} }

View File

@ -9,38 +9,57 @@
namespace MosesTuning namespace MosesTuning
{ {
/** /**
* Reference class represents reference translations for an output * Reference class represents reference translations for an output
* translation used in calculating BLEU score. * translation used in calculating BLEU score.
*/ */
class Reference { class Reference
public: {
public:
// for m_length // for m_length
typedef std::vector<std::size_t>::iterator iterator; typedef std::vector<std::size_t>::iterator iterator;
typedef std::vector<std::size_t>::const_iterator const_iterator; typedef std::vector<std::size_t>::const_iterator const_iterator;
Reference() : m_counts(new NgramCounts) { } Reference() : m_counts(new NgramCounts) { }
~Reference() { delete m_counts; } ~Reference() {
delete m_counts;
}
NgramCounts* get_counts() { return m_counts; } NgramCounts* get_counts() {
const NgramCounts* get_counts() const { return m_counts; } return m_counts;
}
const NgramCounts* get_counts() const {
return m_counts;
}
iterator begin() { return m_length.begin(); } iterator begin() {
const_iterator begin() const { return m_length.begin(); } return m_length.begin();
iterator end() { return m_length.end(); } }
const_iterator end() const { return m_length.end(); } const_iterator begin() const {
return m_length.begin();
}
iterator end() {
return m_length.end();
}
const_iterator end() const {
return m_length.end();
}
void push_back(std::size_t len) { m_length.push_back(len); } void push_back(std::size_t len) {
m_length.push_back(len);
}
std::size_t num_references() const { return m_length.size(); } std::size_t num_references() const {
return m_length.size();
}
int CalcAverage() const; int CalcAverage() const;
int CalcClosest(std::size_t length) const; int CalcClosest(std::size_t length) const;
int CalcShortest() const; int CalcShortest() const;
private: private:
NgramCounts* m_counts; NgramCounts* m_counts;
// multiple reference lengths // multiple reference lengths
@ -49,16 +68,18 @@ class Reference {
// TODO(tetsuok): fix this function and related stuff. // TODO(tetsuok): fix this function and related stuff.
// "average" reference length should not be calculated at sentence-level unlike "closest". // "average" reference length should not be calculated at sentence-level unlike "closest".
inline int Reference::CalcAverage() const { inline int Reference::CalcAverage() const
{
int total = 0; int total = 0;
for (std::size_t i = 0; i < m_length.size(); ++i) { for (std::size_t i = 0; i < m_length.size(); ++i) {
total += m_length[i]; total += m_length[i];
} }
return static_cast<int>( return static_cast<int>(
static_cast<float>(total) / m_length.size()); static_cast<float>(total) / m_length.size());
} }
inline int Reference::CalcClosest(std::size_t length) const { inline int Reference::CalcClosest(std::size_t length) const
{
int min_diff = INT_MAX; int min_diff = INT_MAX;
int closest_ref_id = 0; // an index of the closest reference translation int closest_ref_id = 0; // an index of the closest reference translation
for (std::size_t i = 0; i < m_length.size(); ++i) { for (std::size_t i = 0; i < m_length.size(); ++i) {
@ -79,7 +100,8 @@ inline int Reference::CalcClosest(std::size_t length) const {
return static_cast<int>(m_length[closest_ref_id]); return static_cast<int>(m_length[closest_ref_id]);
} }
inline int Reference::CalcShortest() const { inline int Reference::CalcShortest() const
{
return *std::min_element(m_length.begin(), m_length.end()); return *std::min_element(m_length.begin(), m_length.end());
} }

View File

@ -5,12 +5,14 @@
using namespace MosesTuning; using namespace MosesTuning;
// A default-constructed Reference must expose a non-NULL n-gram count table.
BOOST_AUTO_TEST_CASE(refernece_count)
{
  Reference ref;
  NgramCounts* const counts = ref.get_counts();
  BOOST_CHECK(counts != NULL);
}
BOOST_AUTO_TEST_CASE(refernece_length_iterator) { BOOST_AUTO_TEST_CASE(refernece_length_iterator)
{
Reference ref; Reference ref;
ref.push_back(4); ref.push_back(4);
ref.push_back(2); ref.push_back(2);
@ -24,7 +26,8 @@ BOOST_AUTO_TEST_CASE(refernece_length_iterator) {
BOOST_CHECK(it == ref.end()); BOOST_CHECK(it == ref.end());
} }
BOOST_AUTO_TEST_CASE(refernece_length_average) { BOOST_AUTO_TEST_CASE(refernece_length_average)
{
{ {
Reference ref; Reference ref;
ref.push_back(4); ref.push_back(4);
@ -49,7 +52,8 @@ BOOST_AUTO_TEST_CASE(refernece_length_average) {
} }
} }
BOOST_AUTO_TEST_CASE(refernece_length_closest) { BOOST_AUTO_TEST_CASE(refernece_length_closest)
{
{ {
Reference ref; Reference ref;
ref.push_back(4); ref.push_back(4);
@ -92,7 +96,8 @@ BOOST_AUTO_TEST_CASE(refernece_length_closest) {
} }
} }
BOOST_AUTO_TEST_CASE(refernece_length_shortest) { BOOST_AUTO_TEST_CASE(refernece_length_shortest)
{
{ {
Reference ref; Reference ref;
ref.push_back(4); ref.push_back(4);

View File

@ -5,19 +5,26 @@
namespace MosesTuning namespace MosesTuning
{ {
template <class T> template <class T>
class ScopedVector { class ScopedVector
public: {
public:
typedef typename std::vector<T*>::iterator iterator; typedef typename std::vector<T*>::iterator iterator;
typedef typename std::vector<T*>::const_iterator const_iterator; typedef typename std::vector<T*>::const_iterator const_iterator;
ScopedVector() {} ScopedVector() {}
virtual ~ScopedVector() { reset(); } virtual ~ScopedVector() {
reset();
}
bool empty() const { return m_vec.empty(); } bool empty() const {
return m_vec.empty();
}
void push_back(T *e) { m_vec.push_back(e); } void push_back(T *e) {
m_vec.push_back(e);
}
void reset() { void reset() {
for (iterator it = m_vec.begin(); it != m_vec.end(); ++it) { for (iterator it = m_vec.begin(); it != m_vec.end(); ++it) {
@ -26,27 +33,53 @@ class ScopedVector {
m_vec.clear(); m_vec.clear();
} }
void reserve(std::size_t capacity) { m_vec.reserve(capacity); } void reserve(std::size_t capacity) {
void resize(std::size_t size) { m_vec.resize(size); } m_vec.reserve(capacity);
}
void resize(std::size_t size) {
m_vec.resize(size);
}
std::size_t size() const {return m_vec.size(); } std::size_t size() const {
return m_vec.size();
}
iterator begin() { return m_vec.begin(); } iterator begin() {
const_iterator begin() const { return m_vec.begin(); } return m_vec.begin();
}
const_iterator begin() const {
return m_vec.begin();
}
iterator end() { return m_vec.end(); } iterator end() {
const_iterator end() const { return m_vec.end(); } return m_vec.end();
}
const_iterator end() const {
return m_vec.end();
}
std::vector<T*>& get() { return m_vec; } std::vector<T*>& get() {
const std::vector<T*>& get() const { return m_vec; } return m_vec;
}
const std::vector<T*>& get() const {
return m_vec;
}
std::vector<T*>* operator->() { return &m_vec; } std::vector<T*>* operator->() {
const std::vector<T*>* operator->() const { return &m_vec; } return &m_vec;
}
const std::vector<T*>* operator->() const {
return &m_vec;
}
T*& operator[](std::size_t i) { return m_vec[i]; } T*& operator[](std::size_t i) {
const T* operator[](std::size_t i) const { return m_vec[i]; } return m_vec[i];
}
const T* operator[](std::size_t i) const {
return m_vec[i];
}
private: private:
std::vector<T*> m_vec; std::vector<T*> m_vec;
// no copying allowed. // no copying allowed.

View File

@ -17,12 +17,12 @@ namespace MosesTuning
// Start with an empty array: no scores per entry yet and index 0.
ScoreArray::ScoreArray()
  : m_num_scores(0),
    m_index(0)
{
}
void ScoreArray::savetxt(ostream* os, const string& sctype) void ScoreArray::savetxt(ostream* os, const string& sctype)
{ {
*os << SCORES_TXT_BEGIN << " " << m_index << " " << m_array.size() *os << SCORES_TXT_BEGIN << " " << m_index << " " << m_array.size()
<< " " << m_num_scores << " " << sctype << endl; << " " << m_num_scores << " " << sctype << endl;
for (scorearray_t::iterator i = m_array.begin(); i !=m_array.end(); i++) { for (scorearray_t::iterator i = m_array.begin(); i !=m_array.end(); i++) {
i->savetxt(os); i->savetxt(os);
*os << endl; *os << endl;
@ -33,7 +33,7 @@ void ScoreArray::savetxt(ostream* os, const string& sctype)
void ScoreArray::savebin(ostream* os, const string& score_type) void ScoreArray::savebin(ostream* os, const string& score_type)
{ {
*os << SCORES_BIN_BEGIN << " " << m_index << " " << m_array.size() *os << SCORES_BIN_BEGIN << " " << m_index << " " << m_array.size()
<< " " << m_num_scores << " " << score_type << endl; << " " << m_num_scores << " " << score_type << endl;
for (scorearray_t::iterator i = m_array.begin(); for (scorearray_t::iterator i = m_array.begin();
i != m_array.end(); i++) { i != m_array.end(); i++) {
i->savebin(os); i->savebin(os);
@ -63,7 +63,8 @@ void ScoreArray::save(const string &file, const string& score_type, bool bin)
ofs.close(); ofs.close();
} }
void ScoreArray::save(const string& score_type, bool bin) { void ScoreArray::save(const string& score_type, bool bin)
{
save(&cout, score_type, bin); save(&cout, score_type, bin);
} }

View File

@ -25,7 +25,7 @@ const char SCORES_BIN_END[] = "SCORES_BIN_END_0";
class ScoreArray class ScoreArray
{ {
private: private:
scorearray_t m_array; scorearray_t m_array;
std::string m_score_type; std::string m_score_type;
std::size_t m_num_scores; std::size_t m_num_scores;
@ -38,17 +38,29 @@ public:
ScoreArray(); ScoreArray();
~ScoreArray() {} ~ScoreArray() {}
// Drops every stored entry.
void clear() {
  m_array.clear();
}

// Index accessors: m_index is stored and returned verbatim.
int getIndex() const {
  return m_index;
}

void setIndex(int value) {
  m_index = value;
}

// Element access goes through the container's at(); what happens for an
// out-of-range i is whatever scorearray_t::at does (presumably it throws
// std::out_of_range -- confirm against the scorearray_t typedef).
ScoreStats& get(std::size_t i) {
  return m_array.at(i);
}

const ScoreStats& get(std::size_t i) const {
  return m_array.at(i);
}

// Appends e at the end of the array via push_back.
void add(const ScoreStats& e) {
  m_array.push_back(e);
}
//ADDED BY TS //ADDED BY TS
void swap(std::size_t i, std::size_t j) { void swap(std::size_t i, std::size_t j) {
@ -62,15 +74,25 @@ public:
void merge(ScoreArray& e); void merge(ScoreArray& e);
// Returns a copy of the score type label.
std::string name() const {
  return m_score_type;
}

// Sets the score type label. The argument is only read, so it is taken by
// const reference: const strings and temporaries are accepted, and every
// caller that passed a non-const lvalue keeps compiling.
void name(const std::string &score_type) {
  m_score_type = score_type;
}

// Number of entries currently held in the array.
std::size_t size() const {
  return m_array.size();
}

// Accessors for m_num_scores; the value is stored and returned verbatim.
std::size_t NumberOfScores() const {
  return m_num_scores;
}

void NumberOfScores(std::size_t v) {
  m_num_scores = v;
}
void savetxt(std::ostream* os, const std::string& score_type); void savetxt(std::ostream* os, const std::string& score_type);
void savebin(std::ostream* os, const std::string& score_type); void savebin(std::ostream* os, const std::string& score_type);

View File

@ -50,7 +50,8 @@ void ScoreData::save(const string &file, bool bin)
ofs.close(); ofs.close();
} }
void ScoreData::save(bool bin) { void ScoreData::save(bool bin)
{
save(&cout, bin); save(&cout, bin);
} }

View File

@ -40,7 +40,9 @@ public:
ScoreData(Scorer* scorer); ScoreData(Scorer* scorer);
~ScoreData() {} ~ScoreData() {}
// Drops every stored element of m_array.
void clear() {
  m_array.clear();
}
inline ScoreArray& get(std::size_t idx) { inline ScoreArray& get(std::size_t idx) {
return m_array.at(idx); return m_array.at(idx);
@ -66,7 +68,9 @@ public:
return m_array.at(i).get(j); return m_array.at(i).get(j);
} }
// Returns a copy of the score type label.
std::string name() const {
  return m_score_type;
}
std::string name(const std::string &score_type) { std::string name(const std::string &score_type) {
return m_score_type = score_type; return m_score_type = score_type;
@ -75,8 +79,12 @@ public:
void add(ScoreArray& e); void add(ScoreArray& e);
void add(const ScoreStats& e, int sent_idx); void add(const ScoreStats& e, int sent_idx);
// Value of m_num_scores, returned verbatim.
std::size_t NumberOfScores() const {
  return m_num_scores;
}

// Number of elements currently held in m_array.
std::size_t size() const {
  return m_array.size();
}
void save(const std::string &file, bool bin=false); void save(const std::string &file, bool bin=false);
void save(std::ostream* os, bool bin=false); void save(std::ostream* os, bool bin=false);

View File

@ -29,18 +29,20 @@ using namespace util;
namespace MosesTuning namespace MosesTuning
{ {
// Default-constructed iterator: m_in stays empty. This is the value end()
// hands out, and equal() considers two iterators with an empty m_in equal.
ScoreDataIterator::ScoreDataIterator() {}

// Opens `filename` through a FilePiece and immediately loads the first
// record so the iterator can be dereferenced right away.
ScoreDataIterator::ScoreDataIterator(const string& filename)
  : m_in(new FilePiece(filename.c_str()))
{
  readNext();
}

ScoreDataIterator::~ScoreDataIterator() {}
void ScoreDataIterator::readNext() { void ScoreDataIterator::readNext()
{
m_next.clear(); m_next.clear();
try { try {
StringPiece marker = m_in->ReadDelimited(); StringPiece marker = m_in->ReadDelimited();
@ -71,12 +73,14 @@ void ScoreDataIterator::readNext() {
} }
} }
void ScoreDataIterator::increment() { void ScoreDataIterator::increment()
{
readNext(); readNext();
} }
bool ScoreDataIterator::equal(const ScoreDataIterator& rhs) const { bool ScoreDataIterator::equal(const ScoreDataIterator& rhs) const
{
if (!m_in && !rhs.m_in) { if (!m_in && !rhs.m_in) {
return true; return true;
} else if (!m_in) { } else if (!m_in) {
@ -84,13 +88,14 @@ bool ScoreDataIterator::equal(const ScoreDataIterator& rhs) const {
} else if (!rhs.m_in) { } else if (!rhs.m_in) {
return false; return false;
} else { } else {
return m_in->FileName() == rhs.m_in->FileName() && return m_in->FileName() == rhs.m_in->FileName() &&
m_in->Offset() == rhs.m_in->Offset(); m_in->Offset() == rhs.m_in->Offset();
} }
} }
// Exposes the record most recently loaded by readNext(). The reference is to
// the member m_next, which readNext() clears and refills.
const vector<ScoreDataItem>& ScoreDataIterator::dereference() const
{
  const vector<ScoreDataItem>& current = m_next;
  return current;
}

View File

@ -33,40 +33,43 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "FeatureDataIterator.h" #include "FeatureDataIterator.h"
// Forward declaration only: this header just stores a smart pointer to
// util::FilePiece, so the full definition is not needed here.
namespace util
{
class FilePiece;
}
namespace MosesTuning namespace MosesTuning
{ {
typedef std::vector<float> ScoreDataItem; typedef std::vector<float> ScoreDataItem;
class ScoreDataIterator : class ScoreDataIterator :
public boost::iterator_facade<ScoreDataIterator, public boost::iterator_facade<ScoreDataIterator,
const std::vector<ScoreDataItem>, const std::vector<ScoreDataItem>,
boost::forward_traversal_tag> boost::forward_traversal_tag>
{ {
public: public:
ScoreDataIterator(); ScoreDataIterator();
explicit ScoreDataIterator(const std::string& filename); explicit ScoreDataIterator(const std::string& filename);
~ScoreDataIterator(); ~ScoreDataIterator();
static ScoreDataIterator end() { static ScoreDataIterator end() {
return ScoreDataIterator(); return ScoreDataIterator();
} }
private: private:
friend class boost::iterator_core_access; friend class boost::iterator_core_access;
void increment(); void increment();
bool equal(const ScoreDataIterator& rhs) const; bool equal(const ScoreDataIterator& rhs) const;
const std::vector<ScoreDataItem>& dereference() const; const std::vector<ScoreDataItem>& dereference() const;
void readNext(); void readNext();
boost::shared_ptr<util::FilePiece> m_in; boost::shared_ptr<util::FilePiece> m_in;
std::vector<ScoreDataItem> m_next; std::vector<ScoreDataItem> m_next;
}; };
} }

View File

@ -13,21 +13,22 @@
using namespace std; using namespace std;
namespace
{
// Capacity, in entries, that a default-constructed ScoreStats allocates.
const int kAvailableSize = 8;
} // namespace
namespace MosesTuning namespace MosesTuning
{ {
// Default constructor: allocates room for kAvailableSize entries but counts
// none of them as used (m_entries == 0).
// NOTE(review): m_array's initializer reads m_available_size, so this relies
// on m_available_size being declared before m_array in the class -- confirm.
ScoreStats::ScoreStats()
  : m_available_size(kAvailableSize),
    m_entries(0),
    m_array(new ScoreStatsType[m_available_size])
{
}

// Sized constructor: allocates exactly `size` entries, counts all of them as
// used, and zero-fills the m_entries * sizeof(ScoreStatsType) bytes reported
// by GetArraySizeWithBytes().
ScoreStats::ScoreStats(const size_t size)
  : m_available_size(size),
    m_entries(size),
    m_array(new ScoreStatsType[m_available_size])
{
  memset(m_array, 0, GetArraySizeWithBytes());
}
@ -123,7 +124,8 @@ void ScoreStats::savetxt(ostream* os)
*os << *this; *os << *this;
} }
void ScoreStats::savetxt() { void ScoreStats::savetxt()
{
savetxt(&cout); savetxt(&cout);
} }
@ -140,7 +142,8 @@ ostream& operator<<(ostream& o, const ScoreStats& e)
return o; return o;
} }
bool operator==(const ScoreStats& s1, const ScoreStats& s2) { bool operator==(const ScoreStats& s1, const ScoreStats& s2)
{
size_t size = s1.size(); size_t size = s1.size();
if (size != s2.size()) if (size != s2.size())

View File

@ -18,7 +18,7 @@
namespace MosesTuning namespace MosesTuning
{ {
class ScoreStats class ScoreStats
{ {
@ -41,7 +41,9 @@ public:
void Copy(const ScoreStats &stats); void Copy(const ScoreStats &stats);
// True once the number of used entries is no longer below the allocated
// capacity.
bool isfull() const {
  return !(m_entries < m_available_size);
}
void expand(); void expand();
void add(ScoreStatsType v); void add(ScoreStatsType v);
@ -55,9 +57,15 @@ public:
clear(); clear();
} }
// Unchecked element access: i is used directly as an index into m_array.
ScoreStatsType get(std::size_t i) {
  return m_array[i];
}

ScoreStatsType get(std::size_t i) const {
  return m_array[i];
}

// Hands out m_array itself, not a copy of the elements.
scorestats_t getArray() const {
  return m_array;
}
void set(const std::string& str); void set(const std::string& str);
@ -69,15 +77,21 @@ public:
} }
} }
std::size_t bytes() const { return GetArraySizeWithBytes(); } std::size_t bytes() const {
return GetArraySizeWithBytes();
}
std::size_t GetArraySizeWithBytes() const { std::size_t GetArraySizeWithBytes() const {
return m_entries * sizeof(ScoreStatsType); return m_entries * sizeof(ScoreStatsType);
} }
std::size_t size() const { return m_entries; } std::size_t size() const {
return m_entries;
}
std::size_t available() const { return m_available_size; } std::size_t available() const {
return m_available_size;
}
void savetxt(const std::string &file); void savetxt(const std::string &file);
void savetxt(std::ostream* os); void savetxt(std::ostream* os);

View File

@ -12,27 +12,31 @@ using namespace std;
namespace MosesTuning namespace MosesTuning
{ {
namespace { namespace
{
// For tokenizing a hypothesis translation, we may encounter unknown tokens which // For tokenizing a hypothesis translation, we may encounter unknown tokens which
// do not exist in the corresponding reference translations. // do not exist in the corresponding reference translations.
const int kUnknownToken = -1; const int kUnknownToken = -1;
} // namespace } // namespace
// Stores the scorer name, fetches the vocabulary from VocabularyFactory,
// starts with no filter and no score data, enables case preservation, and
// then hands the configuration string to InitConfig().
Scorer::Scorer(const string& name, const string& config)
  : m_name(name),
    m_vocab(mert::VocabularyFactory::GetVocabulary()),
    m_filter(NULL),
    m_score_data(NULL),
    m_enable_preserve_case(true)
{
  InitConfig(config);
}
// Releases the vocabulary singleton and the optional preprocessing filter.
// NOTE(review): Singleton<mert::Vocabulary>::Delete() runs from every Scorer
// destructor -- confirm that no other live Scorer still relies on the
// singleton at that point.
Scorer::~Scorer()
{
  Singleton<mert::Vocabulary>::Delete();
  // m_filter is NULL unless setFilter() installed one; delete handles both.
  delete m_filter;
}
void Scorer::InitConfig(const string& config) { void Scorer::InitConfig(const string& config)
{
// cerr << "Scorer config string: " << config << endl; // cerr << "Scorer config string: " << config << endl;
size_t start = 0; size_t start = 0;
while (start < config.size()) { while (start < config.size()) {
@ -53,7 +57,8 @@ void Scorer::InitConfig(const string& config) {
} }
} }
void Scorer::TokenizeAndEncode(const string& line, vector<int>& encoded) { void Scorer::TokenizeAndEncode(const string& line, vector<int>& encoded)
{
for (util::TokenIter<util::AnyCharacter, true> it(line, util::AnyCharacter(" ")); for (util::TokenIter<util::AnyCharacter, true> it(line, util::AnyCharacter(" "));
it; ++it) { it; ++it) {
if (!m_enable_preserve_case) { if (!m_enable_preserve_case) {
@ -69,7 +74,8 @@ void Scorer::TokenizeAndEncode(const string& line, vector<int>& encoded) {
} }
} }
void Scorer::TokenizeAndEncodeTesting(const string& line, vector<int>& encoded) { void Scorer::TokenizeAndEncodeTesting(const string& line, vector<int>& encoded)
{
for (util::TokenIter<util::AnyCharacter, true> it(line, util::AnyCharacter(" ")); for (util::TokenIter<util::AnyCharacter, true> it(line, util::AnyCharacter(" "));
it; ++it) { it; ++it) {
if (!m_enable_preserve_case) { if (!m_enable_preserve_case) {
@ -103,8 +109,7 @@ void Scorer::setFactors(const string& factors)
if (factors.empty()) return; if (factors.empty()) return;
vector<string> factors_vec; vector<string> factors_vec;
split(factors, '|', factors_vec); split(factors, '|', factors_vec);
for(vector<string>::iterator it = factors_vec.begin(); it != factors_vec.end(); ++it) for(vector<string>::iterator it = factors_vec.begin(); it != factors_vec.end(); ++it) {
{
int factor = atoi(it->c_str()); int factor = atoi(it->c_str());
m_factors.push_back(factor); m_factors.push_back(factor);
} }
@ -115,8 +120,8 @@ void Scorer::setFactors(const string& factors)
*/ */
void Scorer::setFilter(const string& filterCommand) void Scorer::setFilter(const string& filterCommand)
{ {
if (filterCommand.empty()) return; if (filterCommand.empty()) return;
m_filter = new PreProcessFilter(filterCommand); m_filter = new PreProcessFilter(filterCommand);
} }
/** /**
@ -130,8 +135,7 @@ string Scorer::applyFactors(const string& sentence) const
split(sentence, ' ', tokens); split(sentence, ' ', tokens);
stringstream sstream; stringstream sstream;
for (size_t i = 0; i < tokens.size(); ++i) for (size_t i = 0; i < tokens.size(); ++i) {
{
if (tokens[i] == "") continue; if (tokens[i] == "") continue;
vector<string> factors; vector<string> factors;
@ -141,8 +145,7 @@ string Scorer::applyFactors(const string& sentence) const
if (i > 0) sstream << " "; if (i > 0) sstream << " ";
for (size_t j = 0; j < m_factors.size(); ++j) for (size_t j = 0; j < m_factors.size(); ++j) {
{
int findex = m_factors[j]; int findex = m_factors[j];
if (findex < 0 || findex >= fsize) throw runtime_error("Factor index is out of range."); if (findex < 0 || findex >= fsize) throw runtime_error("Factor index is out of range.");
@ -158,17 +161,15 @@ string Scorer::applyFactors(const string& sentence) const
*/ */
string Scorer::applyFilter(const string& sentence) const
{
  // No filter configured: the sentence passes through unchanged.
  if (!m_filter) {
    return sentence;
  }
  return m_filter->ProcessSentence(sentence);
}
float Scorer::score(const candidates_t& candidates) const { float Scorer::score(const candidates_t& candidates) const
{
diffs_t diffs; diffs_t diffs;
statscores_t scores; statscores_t scores;
score(candidates, diffs, scores); score(candidates, diffs, scores);

View File

@ -10,7 +10,8 @@
#include "Types.h" #include "Types.h"
#include "ScoreData.h" #include "ScoreData.h"
namespace mert { namespace mert
{
class Vocabulary; class Vocabulary;
@ -32,7 +33,7 @@ enum ScorerRegularisationStrategy {REG_NONE, REG_AVERAGE, REG_MINIMUM};
*/ */
class Scorer class Scorer
{ {
public: public:
Scorer(const std::string& name, const std::string& config); Scorer(const std::string& name, const std::string& config);
virtual ~Scorer(); virtual ~Scorer();
@ -117,14 +118,16 @@ class Scorer
*/ */
virtual void setFactors(const std::string& factors); virtual void setFactors(const std::string& factors);
// Returns the vocabulary pointer obtained in the constructor; ownership is
// not transferred to the caller.
mert::Vocabulary* GetVocab() const {
  return m_vocab;
}
/** /**
* Set unix filter, which will be used to preprocess the sentences * Set unix filter, which will be used to preprocess the sentences
*/ */
virtual void setFilter(const std::string& filterCommand); virtual void setFilter(const std::string& filterCommand);
private: private:
void InitConfig(const std::string& config); void InitConfig(const std::string& config);
/** /**
@ -143,7 +146,7 @@ class Scorer
std::vector<int> m_factors; std::vector<int> m_factors;
PreProcessFilter* m_filter; PreProcessFilter* m_filter;
protected: protected:
ScoreData* m_score_data; ScoreData* m_score_data;
bool m_enable_preserve_case; bool m_enable_preserve_case;
@ -173,40 +176,40 @@ class Scorer
/** /**
* Every inherited scorer should call this function for each sentence * Every inherited scorer should call this function for each sentence
*/ */
std::string preprocessSentence(const std::string& sentence) const std::string preprocessSentence(const std::string& sentence) const {
{
return applyFactors(applyFilter(sentence)); return applyFactors(applyFilter(sentence));
} }
}; };
namespace { namespace
{
//regularisation strategies //regularisation strategies
inline float score_min(const statscores_t& scores, size_t start, size_t end) inline float score_min(const statscores_t& scores, size_t start, size_t end)
{ {
float min = std::numeric_limits<float>::max(); float min = std::numeric_limits<float>::max();
for (size_t i = start; i < end; ++i) { for (size_t i = start; i < end; ++i) {
if (scores[i] < min) { if (scores[i] < min) {
min = scores[i]; min = scores[i];
}
} }
return min; }
return min;
}
inline float score_average(const statscores_t& scores, size_t start, size_t end)
{
if ((end - start) < 1) {
// this shouldn't happen
return 0;
}
float total = 0;
for (size_t j = start; j < end; ++j) {
total += scores[j];
} }
inline float score_average(const statscores_t& scores, size_t start, size_t end) return total / (end - start);
{ }
if ((end - start) < 1) {
// this shouldn't happen
return 0;
}
float total = 0;
for (size_t j = start; j < end; ++j) {
total += scores[j];
}
return total / (end - start);
}
} // namespace } // namespace

View File

@ -14,9 +14,10 @@ using namespace std;
namespace MosesTuning namespace MosesTuning
{ {
vector<string> ScorerFactory::getTypes() {
vector<string> ScorerFactory::getTypes()
{
vector<string> types; vector<string> types;
types.push_back(string("BLEU")); types.push_back(string("BLEU"));
types.push_back(string("PER")); types.push_back(string("PER"));
@ -29,7 +30,8 @@ vector<string> ScorerFactory::getTypes() {
return types; return types;
} }
Scorer* ScorerFactory::getScorer(const string& type, const string& config) { Scorer* ScorerFactory::getScorer(const string& type, const string& config)
{
if (type == "BLEU") { if (type == "BLEU") {
return new BleuScorer(config); return new BleuScorer(config);
} else if (type == "PER") { } else if (type == "PER") {
@ -48,8 +50,7 @@ Scorer* ScorerFactory::getScorer(const string& type, const string& config) {
} else { } else {
if (type.find(',') != string::npos) { if (type.find(',') != string::npos) {
return new InterpolatedScorer(type, config); return new InterpolatedScorer(type, config);
} } else {
else {
throw runtime_error("Unknown scorer type: " + type); throw runtime_error("Unknown scorer type: " + type);
} }
} }

View File

@ -6,7 +6,7 @@
namespace MosesTuning namespace MosesTuning
{ {
class Scorer; class Scorer;

View File

@ -6,7 +6,8 @@
using namespace std; using namespace std;
namespace { namespace
{
MosesTuning::SemposOverlapping* g_overlapping = NULL; MosesTuning::SemposOverlapping* g_overlapping = NULL;
@ -14,9 +15,10 @@ MosesTuning::SemposOverlapping* g_overlapping = NULL;
namespace MosesTuning namespace MosesTuning
{ {
SemposOverlapping* SemposOverlappingFactory::GetOverlapping(const string& str, const SemposScorer* sempos) {
SemposOverlapping* SemposOverlappingFactory::GetOverlapping(const string& str, const SemposScorer* sempos)
{
if (str == "cap-micro") { if (str == "cap-micro") {
return new CapMicroOverlapping(sempos); return new CapMicroOverlapping(sempos);
} else if (str == "cap-macro") { } else if (str == "cap-macro") {
@ -26,7 +28,8 @@ SemposOverlapping* SemposOverlappingFactory::GetOverlapping(const string& str, c
} }
} }
// Records `ovr` in the file-local g_overlapping pointer. Nothing is deleted
// or copied here; only the pointer value is stored.
void SemposOverlappingFactory::SetOverlapping(SemposOverlapping* ovr)
{
  g_overlapping = ovr;
}
@ -41,15 +44,13 @@ vector<int> CapMicroOverlapping::prepareStats(const sentence_t& cand, const sent
int multCoeff = 1000; int multCoeff = 1000;
float interSum = 0; float interSum = 0;
for (sentence_t::iterator it = intersection.begin(); it != intersection.end(); it++) for (sentence_t::iterator it = intersection.begin(); it != intersection.end(); it++) {
{
interSum += semposScorer->weight(it->first); interSum += semposScorer->weight(it->first);
} }
float refSum = 0; float refSum = 0;
for (sentence_t::iterator it = ref.begin(); it != ref.end(); it++) for (sentence_t::iterator it = ref.begin(); it != ref.end(); it++) {
{ refSum += semposScorer->weight(it->first);
refSum += semposScorer->weight(it->first);
} }
stats[0] = (int)(multCoeff * interSum); stats[0] = (int)(multCoeff * interSum);

View File

@ -9,7 +9,7 @@
namespace MosesTuning namespace MosesTuning
{ {
class SemposScorer; class SemposScorer;
@ -36,14 +36,15 @@ public:
virtual std::size_t NumberOfScores() const = 0; virtual std::size_t NumberOfScores() const = 0;
}; };
class SemposOverlappingFactory { class SemposOverlappingFactory
public: {
public:
static SemposOverlapping* GetOverlapping(const std::string& str, const SemposScorer* sempos); static SemposOverlapping* GetOverlapping(const std::string& str, const SemposScorer* sempos);
// dependency injection for unit testing. // dependency injection for unit testing.
static void SetOverlapping(SemposOverlapping* ovr); static void SetOverlapping(SemposOverlapping* ovr);
private: private:
SemposOverlappingFactory() {} SemposOverlappingFactory() {}
~SemposOverlappingFactory() {} ~SemposOverlappingFactory() {}
}; };
@ -62,9 +63,11 @@ public:
virtual std::vector<int> prepareStats(const sentence_t& cand, const sentence_t& ref); virtual std::vector<int> prepareStats(const sentence_t& cand, const sentence_t& ref);
virtual float calculateScore(const std::vector<int>& stats) const; virtual float calculateScore(const std::vector<int>& stats) const;
// This overlapping always reports two statistics.
virtual std::size_t NumberOfScores() const {
  return 2;
}
private: private:
// no copying allowed. // no copying allowed.
CapMicroOverlapping(const CapMicroOverlapping&); CapMicroOverlapping(const CapMicroOverlapping&);
CapMicroOverlapping& operator=(const CapMicroOverlapping&); CapMicroOverlapping& operator=(const CapMicroOverlapping&);
@ -82,9 +85,11 @@ public:
virtual std::vector<int> prepareStats(const sentence_t& cand, const sentence_t& ref); virtual std::vector<int> prepareStats(const sentence_t& cand, const sentence_t& ref);
virtual float calculateScore(const std::vector<int>& stats) const; virtual float calculateScore(const std::vector<int>& stats) const;
// This overlapping reports twice kMaxNOC statistics.
virtual std::size_t NumberOfScores() const {
  return 2 * kMaxNOC;
}
private: private:
// no copying allowed. // no copying allowed.
CapMacroOverlapping(const CapMacroOverlapping&); CapMacroOverlapping(const CapMacroOverlapping&);
CapMacroOverlapping& operator=(const CapMacroOverlapping&); CapMacroOverlapping& operator=(const CapMacroOverlapping&);

View File

@ -12,7 +12,7 @@ using namespace std;
namespace MosesTuning namespace MosesTuning
{ {
SemposScorer::SemposScorer(const string& config) SemposScorer::SemposScorer(const string& config)
: StatisticsBasedScorer("SEMPOS", config), : StatisticsBasedScorer("SEMPOS", config),
@ -25,8 +25,7 @@ SemposScorer::SemposScorer(const string& config)
m_semposMap.clear(); m_semposMap.clear();
string weightsfile = getConfig("weightsfile", ""); string weightsfile = getConfig("weightsfile", "");
if (weightsfile != "") if (weightsfile != "") {
{
loadWeights(weightsfile); loadWeights(weightsfile);
} }
} }
@ -144,42 +143,35 @@ int SemposScorer::encodeSempos(const string& sempos)
float SemposScorer::weight(int item) const float SemposScorer::weight(int item) const
{ {
std::map<int,float>::const_iterator it = weightsMap.find(item); std::map<int,float>::const_iterator it = weightsMap.find(item);
if (it == weightsMap.end()) if (it == weightsMap.end()) {
{ return 1.0f;
return 1.0f; } else {
} return it->second;
else }
{
return it->second;
}
} }
void SemposScorer::loadWeights(const string& weightsfile) void SemposScorer::loadWeights(const string& weightsfile)
{ {
string line; string line;
ifstream myfile; ifstream myfile;
myfile.open(weightsfile.c_str(), ifstream::in); myfile.open(weightsfile.c_str(), ifstream::in);
if (myfile.is_open()) if (myfile.is_open()) {
{ while ( myfile.good() ) {
while ( myfile.good() ) getline (myfile,line);
{ vector<string> fields;
getline (myfile,line); if (line == "") continue;
vector<string> fields; split(line, '\t', fields);
if (line == "") continue; if (fields.size() != 2) throw std::runtime_error("Bad format of a row in weights file.");
split(line, '\t', fields); int encoded = encodeString(fields[0]);
if (fields.size() != 2) throw std::runtime_error("Bad format of a row in weights file."); float weight = atof(fields[1].c_str());
int encoded = encodeString(fields[0]); weightsMap[encoded] = weight;
float weight = atof(fields[1].c_str());
weightsMap[encoded] = weight;
}
myfile.close();
}
else
{
cerr << "Unable to open file "<< weightsfile << endl;
exit(1);
} }
myfile.close();
} else {
cerr << "Unable to open file "<< weightsfile << endl;
exit(1);
}
} }

View File

@ -19,7 +19,7 @@
namespace MosesTuning namespace MosesTuning
{ {
/** /**
* This class represents sempos based metrics. * This class represents sempos based metrics.
@ -32,12 +32,16 @@ public:
virtual void setReferenceFiles(const std::vector<std::string>& referenceFiles); virtual void setReferenceFiles(const std::vector<std::string>& referenceFiles);
virtual void prepareStats(std::size_t sindex, const std::string& text, ScoreStats& entry); virtual void prepareStats(std::size_t sindex, const std::string& text, ScoreStats& entry);
// Number of statistics, as reported by the overlapping object m_ovr.
virtual std::size_t NumberOfScores() const {
  return m_ovr->NumberOfScores();
}

// Score computation is forwarded unchanged to m_ovr.
virtual float calculateScore(const std::vector<int>& comps) const {
  return m_ovr->calculateScore(comps);
}

// Current value of the debug flag.
bool EnableDebug() const {
  return m_enable_debug;
}
float weight(int item) const; float weight(int item) const;

View File

@ -17,48 +17,50 @@ namespace MosesTuning
{ {
// Forwards name and config to Scorer, starts with no regularisation and a
// zero-width window, then lets Init() apply the configured values.
SentenceLevelScorer::SentenceLevelScorer(const string& name, const string& config)
  : Scorer(name, config),
    m_regularisationStrategy(REG_NONE),
    m_regularisationWindow(0)
{
  Init();
}

SentenceLevelScorer::~SentenceLevelScorer() {}
void SentenceLevelScorer::Init() { void SentenceLevelScorer::Init()
// Configure regularisation. {
static string KEY_TYPE = "regtype"; // Configure regularisation.
static string KEY_WINDOW = "regwin"; static string KEY_TYPE = "regtype";
static string KEY_CASE = "case"; static string KEY_WINDOW = "regwin";
static string TYPE_NONE = "none"; static string KEY_CASE = "case";
static string TYPE_AVERAGE = "average"; static string TYPE_NONE = "none";
static string TYPE_MINIMUM = "min"; static string TYPE_AVERAGE = "average";
static string TRUE = "true"; static string TYPE_MINIMUM = "min";
static string FALSE = "false"; static string TRUE = "true";
static string FALSE = "false";
const string type = getConfig(KEY_TYPE, TYPE_NONE); const string type = getConfig(KEY_TYPE, TYPE_NONE);
if (type == TYPE_NONE) { if (type == TYPE_NONE) {
m_regularisationStrategy = REG_NONE; m_regularisationStrategy = REG_NONE;
} else if (type == TYPE_AVERAGE) { } else if (type == TYPE_AVERAGE) {
m_regularisationStrategy = REG_AVERAGE; m_regularisationStrategy = REG_AVERAGE;
} else if (type == TYPE_MINIMUM) { } else if (type == TYPE_MINIMUM) {
m_regularisationStrategy = REG_MINIMUM; m_regularisationStrategy = REG_MINIMUM;
} else { } else {
throw boost::lexer::runtime_error("Unknown scorer regularisation strategy: " + type); throw boost::lexer::runtime_error("Unknown scorer regularisation strategy: " + type);
} }
cerr << "Using scorer regularisation strategy: " << type << endl; cerr << "Using scorer regularisation strategy: " << type << endl;
const string window = getConfig(KEY_WINDOW, "0"); const string window = getConfig(KEY_WINDOW, "0");
m_regularisationWindow = atoi(window.c_str()); m_regularisationWindow = atoi(window.c_str());
cerr << "Using scorer regularisation window: " << m_regularisationWindow << endl; cerr << "Using scorer regularisation window: " << m_regularisationWindow << endl;
const string preservecase = getConfig(KEY_CASE, TRUE); const string preservecase = getConfig(KEY_CASE, TRUE);
if (preservecase == TRUE) { if (preservecase == TRUE) {
m_enable_preserve_case = true; m_enable_preserve_case = true;
} else if (preservecase == FALSE) { } else if (preservecase == FALSE) {
m_enable_preserve_case = false; m_enable_preserve_case = false;
} }
cerr << "Using case preservation: " << m_enable_preserve_case << endl; cerr << "Using case preservation: " << m_enable_preserve_case << endl;
} }
void SentenceLevelScorer::score(const candidates_t& candidates, const diffs_t& diffs, void SentenceLevelScorer::score(const candidates_t& candidates, const diffs_t& diffs,
@ -83,8 +85,8 @@ void SentenceLevelScorer::score(const candidates_t& candidates, const diffs_t&
if (stats.size() != totals.size()) { if (stats.size() != totals.size()) {
stringstream msg; stringstream msg;
msg << "Statistics for (" << "," << candidates[i] << ") have incorrect " msg << "Statistics for (" << "," << candidates[i] << ") have incorrect "
<< "number of fields. Found: " << stats.size() << " Expected: " << "number of fields. Found: " << stats.size() << " Expected: "
<< totals.size(); << totals.size();
throw runtime_error(msg.str()); throw runtime_error(msg.str());
} }
//Add up scores for all sentences, would normally be just one score //Add up scores for all sentences, would normally be just one score

View File

@ -5,13 +5,14 @@
namespace MosesTuning namespace MosesTuning
{ {
// thread *un*safe singleton. // thread *un*safe singleton.
// TODO: replace this with thread-safe singleton. // TODO: replace this with thread-safe singleton.
template <typename T> template <typename T>
class Singleton { class Singleton
public: {
public:
static T* GetInstance() { static T* GetInstance() {
if (m_instance == NULL) { if (m_instance == NULL) {
m_instance = new T; m_instance = new T;
@ -26,7 +27,7 @@ class Singleton {
} }
} }
private: private:
Singleton(); Singleton();
static T* m_instance; static T* m_instance;
}; };

View File

@ -5,19 +5,24 @@
using namespace MosesTuning; using namespace MosesTuning;
namespace
{

// Counts how many Instance objects have been constructed in this file.
int g_count = 0;

// Minimal type for the singleton test: its only effect is to bump g_count
// on construction.
class Instance
{
public:
  Instance() {
    g_count += 1;
  }
  ~Instance() {}
};

} // namespace
BOOST_AUTO_TEST_CASE(singleton_basic) { BOOST_AUTO_TEST_CASE(singleton_basic)
{
Instance* instance1 = Singleton<Instance>::GetInstance(); Instance* instance1 = Singleton<Instance>::GetInstance();
Instance* instance2 = Singleton<Instance>::GetInstance(); Instance* instance2 = Singleton<Instance>::GetInstance();
Instance* instance3 = Singleton<Instance>::GetInstance(); Instance* instance3 = Singleton<Instance>::GetInstance();

Some files were not shown because too many files have changed in this diff Show More