mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-25 04:43:03 +03:00
beautify
This commit is contained in:
parent
59bd7deb4b
commit
6249432407
@ -50,14 +50,14 @@ int main (int argc, char * const argv[])
|
||||
}
|
||||
|
||||
int numSourceFactors = Moses::Scan<int>(argv[1])
|
||||
, numTargetFactors = Moses::Scan<int>(argv[2])
|
||||
, numScores = Moses::Scan<int>(argv[3])
|
||||
, tableLimit = Moses::Scan<int>(argv[4]);
|
||||
, numTargetFactors = Moses::Scan<int>(argv[2])
|
||||
, numScores = Moses::Scan<int>(argv[3])
|
||||
, tableLimit = Moses::Scan<int>(argv[4]);
|
||||
TargetPhraseCollection::s_sortScoreInd = Moses::Scan<int>(argv[5]);
|
||||
assert(TargetPhraseCollection::s_sortScoreInd < numScores);
|
||||
|
||||
|
||||
const string filePath = argv[6]
|
||||
,destPath = argv[7];
|
||||
,destPath = argv[7];
|
||||
|
||||
Moses::InputFileStream inStream(filePath);
|
||||
|
||||
@ -128,10 +128,10 @@ OnDiskPt::PhrasePtr Tokenize(SourcePhrase &sourcePhrase, TargetPhrase &targetPhr
|
||||
} else {
|
||||
switch (stage) {
|
||||
case 0: {
|
||||
WordPtr w = Tokenize(sourcePhrase, tok, true, true, onDiskWrapper);
|
||||
if (w != NULL)
|
||||
out->AddWord(w);
|
||||
|
||||
WordPtr w = Tokenize(sourcePhrase, tok, true, true, onDiskWrapper);
|
||||
if (w != NULL)
|
||||
out->AddWord(w);
|
||||
|
||||
break;
|
||||
}
|
||||
case 1: {
|
||||
@ -146,19 +146,19 @@ OnDiskPt::PhrasePtr Tokenize(SourcePhrase &sourcePhrase, TargetPhrase &targetPhr
|
||||
}
|
||||
case 3: {
|
||||
//targetPhrase.Create1AlignFromString(tok);
|
||||
targetPhrase.CreateAlignFromString(tok);
|
||||
targetPhrase.CreateAlignFromString(tok);
|
||||
break;
|
||||
}
|
||||
case 4:
|
||||
++stage;
|
||||
break;
|
||||
/* case 5: {
|
||||
// count info. Only store the 2nd one
|
||||
float val = Moses::Scan<float>(tok);
|
||||
misc[0] = val;
|
||||
++stage;
|
||||
break;
|
||||
}*/
|
||||
/* case 5: {
|
||||
// count info. Only store the 2nd one
|
||||
float val = Moses::Scan<float>(tok);
|
||||
misc[0] = val;
|
||||
++stage;
|
||||
break;
|
||||
}*/
|
||||
case 5: {
|
||||
// count info. Only store the 2nd one
|
||||
//float val = Moses::Scan<float>(tok);
|
||||
@ -167,12 +167,12 @@ OnDiskPt::PhrasePtr Tokenize(SourcePhrase &sourcePhrase, TargetPhrase &targetPhr
|
||||
break;
|
||||
}
|
||||
case 6: {
|
||||
// store only the 3rd one (rule count)
|
||||
// store only the 3rd one (rule count)
|
||||
float val = Moses::Scan<float>(tok);
|
||||
misc[0] = val;
|
||||
++stage;
|
||||
break;
|
||||
}
|
||||
}
|
||||
default:
|
||||
cerr << "ERROR in line " << line << endl;
|
||||
assert(false);
|
||||
@ -189,8 +189,8 @@ OnDiskPt::PhrasePtr Tokenize(SourcePhrase &sourcePhrase, TargetPhrase &targetPhr
|
||||
} // Tokenize()
|
||||
|
||||
OnDiskPt::WordPtr Tokenize(OnDiskPt::Phrase &phrase
|
||||
, const std::string &token, bool addSourceNonTerm, bool addTargetNonTerm
|
||||
, OnDiskPt::OnDiskWrapper &onDiskWrapper)
|
||||
, const std::string &token, bool addSourceNonTerm, bool addTargetNonTerm
|
||||
, OnDiskPt::OnDiskWrapper &onDiskWrapper)
|
||||
{
|
||||
|
||||
bool nonTerm = false;
|
||||
@ -218,7 +218,7 @@ OnDiskPt::WordPtr Tokenize(OnDiskPt::Phrase &phrase
|
||||
if (addSourceNonTerm) {
|
||||
WordPtr word(new Word());
|
||||
word->CreateFromString(wordStr, onDiskWrapper.GetVocab());
|
||||
phrase.AddWord(word);
|
||||
phrase.AddWord(word);
|
||||
}
|
||||
|
||||
wordStr = token.substr(splitPos, tokSize - splitPos);
|
||||
@ -237,7 +237,7 @@ OnDiskPt::WordPtr Tokenize(OnDiskPt::Phrase &phrase
|
||||
phrase.AddWord(word);
|
||||
out = word;
|
||||
}
|
||||
|
||||
|
||||
return out;
|
||||
}
|
||||
|
||||
|
@ -26,12 +26,12 @@ typedef std::pair<size_t, size_t> AlignPair;
|
||||
typedef std::vector<AlignPair> AlignType;
|
||||
|
||||
OnDiskPt::WordPtr Tokenize(OnDiskPt::Phrase &phrase
|
||||
, const std::string &token, bool addSourceNonTerm, bool addTargetNonTerm
|
||||
, OnDiskPt::OnDiskWrapper &onDiskWrapper);
|
||||
, const std::string &token, bool addSourceNonTerm, bool addTargetNonTerm
|
||||
, OnDiskPt::OnDiskWrapper &onDiskWrapper);
|
||||
OnDiskPt::PhrasePtr Tokenize(OnDiskPt::SourcePhrase &sourcePhrase, OnDiskPt::TargetPhrase &targetPhrase
|
||||
, char *line, OnDiskPt::OnDiskWrapper &onDiskWrapper
|
||||
, int numScores
|
||||
, std::vector<float> &misc);
|
||||
, char *line, OnDiskPt::OnDiskWrapper &onDiskWrapper
|
||||
, int numScores
|
||||
, std::vector<float> &misc);
|
||||
|
||||
void InsertTargetNonTerminals(std::vector<std::string> &sourceToks, const std::vector<std::string> &targetToks, const AlignType &alignments);
|
||||
void SortAlign(AlignType &alignments);
|
||||
|
@ -3,10 +3,10 @@
|
||||
namespace OnDiskPt
|
||||
{
|
||||
|
||||
void OnDiskQuery::Tokenize(Phrase &phrase,
|
||||
const std::string &token,
|
||||
bool addSourceNonTerm,
|
||||
bool addTargetNonTerm)
|
||||
void OnDiskQuery::Tokenize(Phrase &phrase,
|
||||
const std::string &token,
|
||||
bool addSourceNonTerm,
|
||||
bool addTargetNonTerm)
|
||||
{
|
||||
bool nonTerm = false;
|
||||
size_t tokSize = token.size();
|
||||
@ -50,13 +50,13 @@ void OnDiskQuery::Tokenize(Phrase &phrase,
|
||||
phrase.AddWord(word);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
SourcePhrase OnDiskQuery::Tokenize(const std::vector<std::string>& tokens)
|
||||
{
|
||||
SourcePhrase sourcePhrase;
|
||||
if (tokens.size() > 0){
|
||||
if (tokens.size() > 0) {
|
||||
std::vector<std::string>::const_iterator token = tokens.begin();
|
||||
for (; token + 1 != tokens.end(); ++token){
|
||||
for (; token + 1 != tokens.end(); ++token) {
|
||||
Tokenize(sourcePhrase, *token, true, true);
|
||||
}
|
||||
// last position. LHS non-term
|
||||
@ -64,22 +64,20 @@ SourcePhrase OnDiskQuery::Tokenize(const std::vector<std::string>& tokens)
|
||||
}
|
||||
return sourcePhrase;
|
||||
}
|
||||
|
||||
|
||||
const PhraseNode* OnDiskQuery::Query(const SourcePhrase& sourcePhrase)
|
||||
{
|
||||
const PhraseNode *node = &m_wrapper.GetRootSourceNode();
|
||||
assert(node);
|
||||
|
||||
for (size_t pos = 0; pos < sourcePhrase.GetSize(); ++pos)
|
||||
{
|
||||
const Word &word = sourcePhrase.GetWord(pos);
|
||||
node = node->GetChild(word, m_wrapper);
|
||||
if (node == NULL)
|
||||
{
|
||||
break;
|
||||
}
|
||||
const PhraseNode *node = &m_wrapper.GetRootSourceNode();
|
||||
assert(node);
|
||||
|
||||
for (size_t pos = 0; pos < sourcePhrase.GetSize(); ++pos) {
|
||||
const Word &word = sourcePhrase.GetWord(pos);
|
||||
node = node->GetChild(word, m_wrapper);
|
||||
if (node == NULL) {
|
||||
break;
|
||||
}
|
||||
return node;
|
||||
}
|
||||
return node;
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -18,22 +18,21 @@ private:
|
||||
|
||||
public:
|
||||
|
||||
OnDiskQuery(OnDiskWrapper &wrapper):m_wrapper(wrapper){}
|
||||
OnDiskQuery(OnDiskWrapper &wrapper):m_wrapper(wrapper) {}
|
||||
|
||||
void Tokenize(Phrase &phrase,
|
||||
const std::string &token,
|
||||
bool addSourceNonTerm,
|
||||
bool addTargetNonTerm);
|
||||
|
||||
void Tokenize(Phrase &phrase,
|
||||
const std::string &token,
|
||||
bool addSourceNonTerm,
|
||||
bool addTargetNonTerm);
|
||||
|
||||
SourcePhrase Tokenize(const std::vector<std::string>& tokens);
|
||||
|
||||
const PhraseNode *Query(const SourcePhrase& sourcePhrase);
|
||||
|
||||
inline const PhraseNode *Query(const std::vector<std::string>& tokens)
|
||||
{
|
||||
inline const PhraseNode *Query(const std::vector<std::string>& tokens) {
|
||||
return Query(Tokenize(tokens));
|
||||
}
|
||||
|
||||
|
||||
};
|
||||
|
||||
|
||||
|
@ -204,16 +204,16 @@ Word *OnDiskWrapper::ConvertFromMoses(Moses::FactorDirection /* direction */
|
||||
Word *newWord = new Word(isNonTerminal);
|
||||
stringstream strme;
|
||||
|
||||
size_t factorType = factorsVec[0];
|
||||
size_t factorType = factorsVec[0];
|
||||
const Moses::Factor *factor = origWord.GetFactor(factorType);
|
||||
CHECK(factor);
|
||||
CHECK(factor);
|
||||
strme << factor->GetString();
|
||||
|
||||
for (size_t ind = 1 ; ind < factorsVec.size() ; ++ind) {
|
||||
size_t factorType = factorsVec[ind];
|
||||
const Moses::Factor *factor = origWord.GetFactor(factorType);
|
||||
if (factor == NULL)
|
||||
{ // can have less factors than factorType.size()
|
||||
if (factor == NULL) {
|
||||
// can have less factors than factorType.size()
|
||||
break;
|
||||
}
|
||||
CHECK(factor);
|
||||
|
@ -28,7 +28,7 @@ namespace OnDiskPt
|
||||
{
|
||||
const float DEFAULT_COUNT = 66666;
|
||||
|
||||
/** Global class with misc information need to create and use the on-disk rule table.
|
||||
/** Global class with misc information need to create and use the on-disk rule table.
|
||||
* 1 object of this class should be instantiated per rule table.
|
||||
* Currently only hierarchical/syntax models use this, but can & should be used with pb models too
|
||||
*/
|
||||
|
@ -38,7 +38,7 @@ size_t PhraseNode::GetNodeSize(size_t numChildren, size_t wordSize, size_t count
|
||||
}
|
||||
|
||||
PhraseNode::PhraseNode()
|
||||
: m_value(0)
|
||||
: m_value(0)
|
||||
,m_currChild(NULL)
|
||||
,m_saved(false)
|
||||
,m_memLoad(NULL)
|
||||
@ -58,7 +58,7 @@ PhraseNode::PhraseNode(UINT64 filePos, OnDiskWrapper &onDiskWrapper)
|
||||
CHECK(filePos == (UINT64)file.tellg());
|
||||
|
||||
file.read((char*) &m_numChildrenLoad, sizeof(UINT64));
|
||||
|
||||
|
||||
size_t memAlloc = GetNodeSize(m_numChildrenLoad, onDiskWrapper.GetSourceWordSize(), countSize);
|
||||
m_memLoad = (char*) malloc(memAlloc);
|
||||
|
||||
@ -168,7 +168,7 @@ void PhraseNode::AddTargetPhrase(const SourcePhrase &sourcePhrase, TargetPhrase
|
||||
void PhraseNode::AddTargetPhrase(size_t pos, const SourcePhrase &sourcePhrase
|
||||
, TargetPhrase *targetPhrase, OnDiskWrapper &onDiskWrapper
|
||||
, size_t tableLimit, const std::vector<float> &counts, OnDiskPt::PhrasePtr spShort)
|
||||
{
|
||||
{
|
||||
size_t phraseSize = sourcePhrase.GetSize();
|
||||
if (pos < phraseSize) {
|
||||
const Word &word = sourcePhrase.GetWord(pos);
|
||||
@ -185,7 +185,7 @@ void PhraseNode::AddTargetPhrase(size_t pos, const SourcePhrase &sourcePhrase
|
||||
m_currChild = &node;
|
||||
}
|
||||
|
||||
// keep searching for target phrase node..
|
||||
// keep searching for target phrase node..
|
||||
node.AddTargetPhrase(pos + 1, sourcePhrase, targetPhrase, onDiskWrapper, tableLimit, counts, spShort);
|
||||
} else {
|
||||
// drilled down to the right node
|
||||
|
@ -53,7 +53,7 @@ protected:
|
||||
void AddTargetPhrase(size_t pos, const SourcePhrase &sourcePhrase
|
||||
, TargetPhrase *targetPhrase, OnDiskWrapper &onDiskWrapper
|
||||
, size_t tableLimit, const std::vector<float> &counts, OnDiskPt::PhrasePtr spShort);
|
||||
size_t ReadChild(Word &wordFound, UINT64 &childFilePos, const char *mem) const;
|
||||
size_t ReadChild(Word &wordFound, UINT64 &childFilePos, const char *mem) const;
|
||||
void GetChild(Word &wordFound, UINT64 &childFilePos, size_t ind, OnDiskWrapper &onDiskWrapper) const;
|
||||
|
||||
public:
|
||||
|
@ -64,13 +64,13 @@ void TargetPhrase::Create1AlignFromString(const std::string &align1Str)
|
||||
|
||||
void TargetPhrase::CreateAlignFromString(const std::string &alignStr)
|
||||
{
|
||||
vector<std::string> alignPairs;
|
||||
boost::split(alignPairs, alignStr, boost::is_any_of("\t "));
|
||||
for (size_t i = 0; i < alignPairs.size(); ++i) {
|
||||
vector<size_t> alignPoints;
|
||||
Moses::Tokenize<size_t>(alignPoints, alignPairs[i], "-");
|
||||
m_align.push_back(pair<size_t, size_t>(alignPoints[0], alignPoints[1]) );
|
||||
}
|
||||
vector<std::string> alignPairs;
|
||||
boost::split(alignPairs, alignStr, boost::is_any_of("\t "));
|
||||
for (size_t i = 0; i < alignPairs.size(); ++i) {
|
||||
vector<size_t> alignPoints;
|
||||
Moses::Tokenize<size_t>(alignPoints, alignPairs[i], "-");
|
||||
m_align.push_back(pair<size_t, size_t>(alignPoints[0], alignPoints[1]) );
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -97,16 +97,16 @@ char *TargetPhrase::WriteToMemory(OnDiskWrapper &onDiskWrapper, size_t &memUsed)
|
||||
{
|
||||
size_t phraseSize = GetSize();
|
||||
size_t targetWordSize = onDiskWrapper.GetTargetWordSize();
|
||||
|
||||
|
||||
const PhrasePtr sp = GetSourcePhrase();
|
||||
size_t spSize = sp->GetSize();
|
||||
size_t sourceWordSize = onDiskWrapper.GetSourceWordSize();
|
||||
|
||||
|
||||
size_t memNeeded = sizeof(UINT64) // num of words
|
||||
+ targetWordSize * phraseSize // actual words. lhs as last words
|
||||
+ sizeof(UINT64) // num source words
|
||||
+ sourceWordSize * spSize; // actual source words
|
||||
|
||||
+ sizeof(UINT64) // num source words
|
||||
+ sourceWordSize * spSize; // actual source words
|
||||
|
||||
memUsed = 0;
|
||||
UINT64 *mem = (UINT64*) malloc(memNeeded);
|
||||
|
||||
@ -125,13 +125,13 @@ char *TargetPhrase::WriteToMemory(OnDiskWrapper &onDiskWrapper, size_t &memUsed)
|
||||
char *currPtr = (char*)mem + memUsed;
|
||||
UINT64 *memTmp = (UINT64*) currPtr;
|
||||
memTmp[0] = spSize;
|
||||
memUsed += sizeof(UINT64);
|
||||
memUsed += sizeof(UINT64);
|
||||
for (size_t pos = 0; pos < spSize; ++pos) {
|
||||
const Word &word = sp->GetWord(pos);
|
||||
char *currPtr = (char*)mem + memUsed;
|
||||
memUsed += word.WriteToMemory((char*) currPtr);
|
||||
}
|
||||
|
||||
|
||||
CHECK(memUsed == memNeeded);
|
||||
return (char *) mem;
|
||||
}
|
||||
@ -174,7 +174,7 @@ char *TargetPhrase::WriteOtherInfoToMemory(OnDiskWrapper &onDiskWrapper, size_t
|
||||
// phrase id
|
||||
memcpy(mem, &m_filePos, sizeof(UINT64));
|
||||
memUsed += sizeof(UINT64);
|
||||
|
||||
|
||||
// align
|
||||
size_t tmp = WriteAlignToMemory(mem + memUsed);
|
||||
memUsed += tmp;
|
||||
@ -223,7 +223,7 @@ size_t TargetPhrase::WriteScoresToMemory(char *mem) const
|
||||
}
|
||||
|
||||
|
||||
Moses::TargetPhrase *TargetPhrase::ConvertToMoses(const std::vector<Moses::FactorType> & inputFactors
|
||||
Moses::TargetPhrase *TargetPhrase::ConvertToMoses(const std::vector<Moses::FactorType> & inputFactors
|
||||
, const std::vector<Moses::FactorType> &outputFactors
|
||||
, const Vocab &vocab
|
||||
, const Moses::PhraseDictionary &phraseDict
|
||||
@ -244,7 +244,7 @@ Moses::TargetPhrase *TargetPhrase::ConvertToMoses(const std::vector<Moses::Facto
|
||||
int index = 0;
|
||||
Moses::AlignmentInfo::CollType alignTerm, alignNonTerm;
|
||||
std::set<std::pair<size_t, size_t> > alignmentInfo;
|
||||
const PhrasePtr sp = GetSourcePhrase();
|
||||
const PhrasePtr sp = GetSourcePhrase();
|
||||
for (size_t ind = 0; ind < m_align.size(); ++ind) {
|
||||
const std::pair<size_t, size_t> &entry = m_align[ind];
|
||||
alignmentInfo.insert(entry);
|
||||
@ -252,11 +252,10 @@ Moses::TargetPhrase *TargetPhrase::ConvertToMoses(const std::vector<Moses::Facto
|
||||
size_t targetPos = entry.second;
|
||||
|
||||
if (GetWord(targetPos).IsNonTerminal()) {
|
||||
alignNonTerm.insert(std::pair<size_t,size_t>(sourcePos, targetPos));
|
||||
alignNonTerm.insert(std::pair<size_t,size_t>(sourcePos, targetPos));
|
||||
} else {
|
||||
alignTerm.insert(std::pair<size_t,size_t>(sourcePos, targetPos));
|
||||
}
|
||||
else {
|
||||
alignTerm.insert(std::pair<size_t,size_t>(sourcePos, targetPos));
|
||||
}
|
||||
|
||||
}
|
||||
ret->SetAlignTerm(alignTerm);
|
||||
@ -313,7 +312,7 @@ UINT64 TargetPhrase::ReadFromFile(std::fstream &fileTP)
|
||||
bytesRead += word->ReadFromFile(fileTP);
|
||||
AddWord(word);
|
||||
}
|
||||
|
||||
|
||||
// read source words
|
||||
UINT64 numSourceWords;
|
||||
fileTP.read((char*) &numSourceWords, sizeof(UINT64));
|
||||
@ -371,7 +370,7 @@ UINT64 TargetPhrase::ReadScoresFromFile(std::fstream &fileTPColl)
|
||||
void TargetPhrase::DebugPrint(ostream &out, const Vocab &vocab) const
|
||||
{
|
||||
Phrase::DebugPrint(out, vocab);
|
||||
|
||||
|
||||
for (size_t ind = 0; ind < m_align.size(); ++ind) {
|
||||
const AlignPair &alignPair = m_align[ind];
|
||||
out << alignPair.first << "-" << alignPair.second << " ";
|
||||
|
@ -49,7 +49,7 @@ class TargetPhrase: public Phrase
|
||||
friend std::ostream& operator<<(std::ostream&, const TargetPhrase&);
|
||||
protected:
|
||||
AlignType m_align;
|
||||
PhrasePtr m_sourcePhrase;
|
||||
PhrasePtr m_sourcePhrase;
|
||||
|
||||
std::vector<float> m_scores;
|
||||
UINT64 m_filePos;
|
||||
@ -73,10 +73,10 @@ public:
|
||||
const PhrasePtr GetSourcePhrase() const {
|
||||
return m_sourcePhrase;
|
||||
}
|
||||
const std::vector<float> &GetScores() const{
|
||||
const std::vector<float> &GetScores() const {
|
||||
return m_scores;
|
||||
}
|
||||
|
||||
|
||||
void SetLHS(WordPtr lhs);
|
||||
|
||||
void Create1AlignFromString(const std::string &align1Str);
|
||||
@ -107,7 +107,7 @@ public:
|
||||
UINT64 ReadOtherInfoFromFile(UINT64 filePos, std::fstream &fileTPColl);
|
||||
UINT64 ReadFromFile(std::fstream &fileTP);
|
||||
|
||||
virtual void DebugPrint(std::ostream &out, const Vocab &vocab) const;
|
||||
virtual void DebugPrint(std::ostream &out, const Vocab &vocab) const;
|
||||
|
||||
};
|
||||
|
||||
|
@ -82,7 +82,7 @@ void TargetPhraseCollection::Save(OnDiskWrapper &onDiskWrapper)
|
||||
CollType::iterator iter;
|
||||
for (iter = m_coll.begin(); iter != m_coll.end(); ++iter) {
|
||||
// save phrase
|
||||
TargetPhrase &targetPhrase = **iter;
|
||||
TargetPhrase &targetPhrase = **iter;
|
||||
targetPhrase.Save(onDiskWrapper);
|
||||
|
||||
// save coll
|
||||
@ -150,9 +150,9 @@ void TargetPhraseCollection::ReadFromFile(size_t tableLimit, UINT64 filePos, OnD
|
||||
{
|
||||
fstream &fileTPColl = onDiskWrapper.GetFileTargetColl();
|
||||
fstream &fileTP = onDiskWrapper.GetFileTargetInd();
|
||||
|
||||
|
||||
size_t numScores = onDiskWrapper.GetNumScores();
|
||||
|
||||
|
||||
|
||||
UINT64 numPhrases;
|
||||
|
||||
@ -164,9 +164,9 @@ void TargetPhraseCollection::ReadFromFile(size_t tableLimit, UINT64 filePos, OnD
|
||||
numPhrases = std::min(numPhrases, (UINT64) tableLimit);
|
||||
|
||||
currFilePos += sizeof(UINT64);
|
||||
|
||||
|
||||
for (size_t ind = 0; ind < numPhrases; ++ind) {
|
||||
TargetPhrase *tp = new TargetPhrase(numScores);
|
||||
TargetPhrase *tp = new TargetPhrase(numScores);
|
||||
|
||||
UINT64 sizeOtherInfo = tp->ReadOtherInfoFromFile(currFilePos, fileTPColl);
|
||||
tp->ReadFromFile(fileTP);
|
||||
@ -197,7 +197,7 @@ const TargetPhrase &TargetPhraseCollection::GetTargetPhrase(size_t ind) const
|
||||
assert(ind < GetSize());
|
||||
return *m_coll[ind];
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
@ -64,9 +64,9 @@ public:
|
||||
size_t GetSize() const {
|
||||
return m_coll.size();
|
||||
}
|
||||
|
||||
|
||||
const TargetPhrase &GetTargetPhrase(size_t ind) const;
|
||||
|
||||
|
||||
UINT64 GetFilePos() const;
|
||||
|
||||
Moses::TargetPhraseCollection *ConvertToMoses(const std::vector<Moses::FactorType> &inputFactors
|
||||
|
@ -44,7 +44,7 @@ bool Vocab::Load(OnDiskWrapper &onDiskWrapper)
|
||||
// assume contiguous vocab id
|
||||
m_lookup.resize(m_vocabColl.size() + 1);
|
||||
m_nextId = m_lookup.size();
|
||||
|
||||
|
||||
CollType::const_iterator iter;
|
||||
for (iter = m_vocabColl.begin(); iter != m_vocabColl.end(); ++iter) {
|
||||
UINT32 vocabId = iter->second;
|
||||
|
@ -97,13 +97,14 @@ size_t Word::ReadFromFile(std::fstream &file)
|
||||
}
|
||||
|
||||
void Word::ConvertToMoses(
|
||||
const std::vector<Moses::FactorType> &outputFactorsVec,
|
||||
const Vocab &vocab,
|
||||
Moses::Word &overwrite) const {
|
||||
const std::vector<Moses::FactorType> &outputFactorsVec,
|
||||
const Vocab &vocab,
|
||||
Moses::Word &overwrite) const
|
||||
{
|
||||
Moses::FactorCollection &factorColl = Moses::FactorCollection::Instance();
|
||||
overwrite = Moses::Word(m_isNonTerminal);
|
||||
|
||||
// TODO: this conversion should have been done at load time.
|
||||
// TODO: this conversion should have been done at load time.
|
||||
util::TokenIter<util::SingleCharacter> tok(vocab.GetString(m_vocabId), '|');
|
||||
|
||||
for (std::vector<Moses::FactorType>::const_iterator t = outputFactorsVec.begin(); t != outputFactorsVec.end(); ++t, ++tok) {
|
||||
@ -144,14 +145,14 @@ bool Word::operator==(const Word &compare) const
|
||||
|
||||
void Word::DebugPrint(ostream &out, const Vocab &vocab) const
|
||||
{
|
||||
const string &str = vocab.GetString(m_vocabId);
|
||||
const string &str = vocab.GetString(m_vocabId);
|
||||
out << str;
|
||||
}
|
||||
|
||||
std::ostream& operator<<(std::ostream &out, const Word &word)
|
||||
{
|
||||
out << "(";
|
||||
out << word.m_vocabId;
|
||||
out << word.m_vocabId;
|
||||
|
||||
out << (word.m_isNonTerminal ? "n" : "t");
|
||||
out << ")";
|
||||
|
@ -50,8 +50,8 @@ public:
|
||||
{}
|
||||
|
||||
explicit Word(bool isNonTerminal)
|
||||
:m_isNonTerminal(isNonTerminal)
|
||||
,m_vocabId(0)
|
||||
:m_isNonTerminal(isNonTerminal)
|
||||
,m_vocabId(0)
|
||||
{}
|
||||
|
||||
Word(const Word ©);
|
||||
@ -77,8 +77,7 @@ public:
|
||||
Moses::Word &overwrite) const;
|
||||
|
||||
void DebugPrint(std::ostream &out, const Vocab &vocab) const;
|
||||
inline const std::string &GetString(const Vocab &vocab) const
|
||||
{
|
||||
inline const std::string &GetString(const Vocab &vocab) const {
|
||||
return vocab.GetString(m_vocabId);
|
||||
}
|
||||
|
||||
|
@ -33,8 +33,7 @@ int main(int argc, char **argv)
|
||||
if(i + 1 == argc)
|
||||
usage();
|
||||
ttable = argv[++i];
|
||||
}
|
||||
else
|
||||
} else
|
||||
usage();
|
||||
}
|
||||
|
||||
@ -55,30 +54,27 @@ int main(int argc, char **argv)
|
||||
|
||||
cerr << "line: " << line << endl;
|
||||
const PhraseNode* node = onDiskQuery.Query(tokens);
|
||||
|
||||
if (node)
|
||||
{ // source phrase points to a bunch of rules
|
||||
|
||||
if (node) {
|
||||
// source phrase points to a bunch of rules
|
||||
const TargetPhraseCollection *coll = node->GetTargetPhraseCollection(tableLimit, onDiskWrapper);
|
||||
string str = coll->GetDebugStr();
|
||||
cout << "Found " << coll->GetSize() << endl;
|
||||
|
||||
for (size_t ind = 0; ind < coll->GetSize(); ++ind)
|
||||
{
|
||||
|
||||
for (size_t ind = 0; ind < coll->GetSize(); ++ind) {
|
||||
const TargetPhrase &targetPhrase = coll->GetTargetPhrase(ind);
|
||||
cerr << " ";
|
||||
targetPhrase.DebugPrint(cerr, onDiskWrapper.GetVocab());
|
||||
cerr << endl;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
} else {
|
||||
cout << "Not found" << endl;
|
||||
}
|
||||
|
||||
|
||||
std::cout << '\n';
|
||||
std::cout.flush();
|
||||
}
|
||||
|
||||
|
||||
cerr << "Finished." << endl;
|
||||
}
|
||||
|
||||
|
@ -5,7 +5,8 @@
|
||||
#include <stdlib.h>
|
||||
#include <cstring>
|
||||
|
||||
namespace {
|
||||
namespace
|
||||
{
|
||||
|
||||
const int LINE_MAX_LENGTH = 10000;
|
||||
|
||||
@ -84,10 +85,10 @@ void Alignment::Create(const string& fileName)
|
||||
}
|
||||
|
||||
Alignment::Alignment()
|
||||
: m_array(NULL),
|
||||
m_sentenceEnd(NULL),
|
||||
m_size(0),
|
||||
m_sentenceCount(0) {}
|
||||
: m_array(NULL),
|
||||
m_sentenceEnd(NULL),
|
||||
m_size(0),
|
||||
m_sentenceCount(0) {}
|
||||
|
||||
Alignment::~Alignment()
|
||||
{
|
||||
|
@ -23,16 +23,16 @@ enum {
|
||||
};
|
||||
|
||||
Mismatch::Mismatch( SuffixArray *sa, TargetCorpus *tc, Alignment *a, INDEX sentence_id, INDEX position, int source_length, int target_length, int source_start, int source_end )
|
||||
:m_suffixArray(sa)
|
||||
,m_targetCorpus(tc)
|
||||
,m_alignment(a)
|
||||
,m_sentence_id(sentence_id)
|
||||
,m_source_length(source_length)
|
||||
,m_target_length(target_length)
|
||||
,m_source_position(position)
|
||||
,m_source_start(source_start)
|
||||
,m_source_end(source_end)
|
||||
,m_unaligned(true)
|
||||
:m_suffixArray(sa)
|
||||
,m_targetCorpus(tc)
|
||||
,m_alignment(a)
|
||||
,m_sentence_id(sentence_id)
|
||||
,m_source_length(source_length)
|
||||
,m_target_length(target_length)
|
||||
,m_source_position(position)
|
||||
,m_source_start(source_start)
|
||||
,m_source_end(source_end)
|
||||
,m_unaligned(true)
|
||||
{
|
||||
// initialize unaligned indexes
|
||||
for (int i = 0; i < m_source_length; i++) {
|
||||
@ -42,7 +42,7 @@ Mismatch::Mismatch( SuffixArray *sa, TargetCorpus *tc, Alignment *a, INDEX sente
|
||||
m_target_unaligned[i] = true;
|
||||
}
|
||||
m_num_alignment_points =
|
||||
m_alignment->GetNumberOfAlignmentPoints( sentence_id );
|
||||
m_alignment->GetNumberOfAlignmentPoints( sentence_id );
|
||||
for(INDEX ap=0; ap<m_num_alignment_points; ap++) {
|
||||
m_source_unaligned[ (int)m_alignment->GetSourceWord( sentence_id, ap ) ] = false;
|
||||
m_target_unaligned[ (int)m_alignment->GetTargetWord( sentence_id, ap ) ] = false;
|
||||
@ -58,234 +58,235 @@ Mismatch::~Mismatch () {}
|
||||
|
||||
void Mismatch::PrintClippedHTML( ostream* out, int width )
|
||||
{
|
||||
int source_annotation[256], target_annotation[256];
|
||||
vector< string > label_class;
|
||||
label_class.push_back( "" );
|
||||
label_class.push_back( "mismatch_pre_aligned" );
|
||||
label_class.push_back( "mismatch_post_aligned" );
|
||||
label_class.push_back( "null_aligned" );
|
||||
label_class.push_back( "mismatch_misaligned" );
|
||||
label_class.push_back( "mismatch_aligned" );
|
||||
int source_annotation[256], target_annotation[256];
|
||||
vector< string > label_class;
|
||||
label_class.push_back( "" );
|
||||
label_class.push_back( "mismatch_pre_aligned" );
|
||||
label_class.push_back( "mismatch_post_aligned" );
|
||||
label_class.push_back( "null_aligned" );
|
||||
label_class.push_back( "mismatch_misaligned" );
|
||||
label_class.push_back( "mismatch_aligned" );
|
||||
|
||||
for(int i=0; i<m_source_length;i++) source_annotation[i] = UNANNOTATED;
|
||||
for(int i=0; i<m_target_length;i++) target_annotation[i] = UNANNOTATED;
|
||||
|
||||
if (m_unaligned) {
|
||||
// find alignment points for prior and next word(s) and
|
||||
// center target phrase around those.
|
||||
bool found_aligned = false;
|
||||
for(int i=1; i<m_source_length && !found_aligned; i++) {
|
||||
if (m_source_start-i >= 0) {
|
||||
int word_id = m_source_start-i;
|
||||
source_annotation[ word_id ] = UNALIGNED;
|
||||
if (!m_source_unaligned[ word_id ]) {
|
||||
found_aligned = true;
|
||||
LabelSourceMatches( source_annotation, target_annotation, word_id, PRE_ALIGNED );
|
||||
}
|
||||
}
|
||||
for(int i=0; i<m_source_length; i++) source_annotation[i] = UNANNOTATED;
|
||||
for(int i=0; i<m_target_length; i++) target_annotation[i] = UNANNOTATED;
|
||||
|
||||
if (m_source_end+i < m_source_length) {
|
||||
int word_id = m_source_end+i;
|
||||
source_annotation[ word_id ] = UNALIGNED;
|
||||
if (!m_source_unaligned[ word_id ]) {
|
||||
found_aligned = true;
|
||||
LabelSourceMatches( source_annotation, target_annotation, word_id, POST_ALIGNED );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
// misalignment
|
||||
else {
|
||||
// label aligned output words
|
||||
for(int i=m_source_start; i<=m_source_end; i++)
|
||||
LabelSourceMatches( source_annotation, target_annotation, i, ALIGNED );
|
||||
if (m_unaligned) {
|
||||
// find alignment points for prior and next word(s) and
|
||||
// center target phrase around those.
|
||||
bool found_aligned = false;
|
||||
for(int i=1; i<m_source_length && !found_aligned; i++) {
|
||||
if (m_source_start-i >= 0) {
|
||||
int word_id = m_source_start-i;
|
||||
source_annotation[ word_id ] = UNALIGNED;
|
||||
if (!m_source_unaligned[ word_id ]) {
|
||||
found_aligned = true;
|
||||
LabelSourceMatches( source_annotation, target_annotation, word_id, PRE_ALIGNED );
|
||||
}
|
||||
}
|
||||
|
||||
// find first and last
|
||||
int target_start = -1;
|
||||
int target_end;
|
||||
for(int i=0; i<m_target_length; i++)
|
||||
if (target_annotation[i] == ALIGNED) {
|
||||
if (target_start == -1)
|
||||
target_start = i;
|
||||
target_end = i;
|
||||
}
|
||||
// go over all enclosed target words
|
||||
for(int i=target_start; i<=target_end; i++) {
|
||||
// label other target words as unaligned or misaligned
|
||||
if (m_target_unaligned[ i ])
|
||||
target_annotation[ i ] = UNALIGNED;
|
||||
else {
|
||||
if (target_annotation[ i ] != ALIGNED)
|
||||
target_annotation[ i ] = MISALIGNED;
|
||||
// loop over aligned source words
|
||||
for(INDEX ap=0; ap<m_num_alignment_points; ap++) {
|
||||
if (m_alignment->GetTargetWord( m_sentence_id, ap ) == i) {
|
||||
int source_word = m_alignment->GetSourceWord( m_sentence_id, ap );
|
||||
// if not part of the source phrase -> also misaligned
|
||||
if (source_word < m_source_start || source_word > m_source_end)
|
||||
source_annotation[ source_word ] = MISALIGNED;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// closure
|
||||
bool change = true;
|
||||
while(change) {
|
||||
change = false;
|
||||
for(INDEX ap=0; ap<m_num_alignment_points; ap++) {
|
||||
int source_word = m_alignment->GetSourceWord( m_sentence_id, ap );
|
||||
int target_word = m_alignment->GetTargetWord( m_sentence_id, ap );
|
||||
if (source_annotation[source_word] != UNANNOTATED &&
|
||||
target_annotation[target_word] == UNANNOTATED) {
|
||||
target_annotation[target_word] = MISALIGNED;
|
||||
change = true;
|
||||
}
|
||||
if (source_annotation[source_word] == UNANNOTATED &&
|
||||
target_annotation[target_word] != UNANNOTATED) {
|
||||
source_annotation[source_word] = MISALIGNED;
|
||||
change = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// print source
|
||||
// shorten source context if too long
|
||||
if (m_source_end+i < m_source_length) {
|
||||
int word_id = m_source_end+i;
|
||||
source_annotation[ word_id ] = UNALIGNED;
|
||||
if (!m_source_unaligned[ word_id ]) {
|
||||
found_aligned = true;
|
||||
LabelSourceMatches( source_annotation, target_annotation, word_id, POST_ALIGNED );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
// misalignment
|
||||
else {
|
||||
// label aligned output words
|
||||
for(int i=m_source_start; i<=m_source_end; i++)
|
||||
LabelSourceMatches( source_annotation, target_annotation, i, ALIGNED );
|
||||
|
||||
// find first and last
|
||||
int target_start = -1;
|
||||
int target_end;
|
||||
for(int i=0; i<m_target_length; i++)
|
||||
if (target_annotation[i] == ALIGNED) {
|
||||
if (target_start == -1)
|
||||
target_start = i;
|
||||
target_end = i;
|
||||
}
|
||||
// go over all enclosed target words
|
||||
for(int i=target_start; i<=target_end; i++) {
|
||||
// label other target words as unaligned or misaligned
|
||||
if (m_target_unaligned[ i ])
|
||||
target_annotation[ i ] = UNALIGNED;
|
||||
else {
|
||||
if (target_annotation[ i ] != ALIGNED)
|
||||
target_annotation[ i ] = MISALIGNED;
|
||||
// loop over aligned source words
|
||||
for(INDEX ap=0; ap<m_num_alignment_points; ap++) {
|
||||
if (m_alignment->GetTargetWord( m_sentence_id, ap ) == i) {
|
||||
int source_word = m_alignment->GetSourceWord( m_sentence_id, ap );
|
||||
// if not part of the source phrase -> also misaligned
|
||||
if (source_word < m_source_start || source_word > m_source_end)
|
||||
source_annotation[ source_word ] = MISALIGNED;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// closure
|
||||
bool change = true;
|
||||
while(change) {
|
||||
change = false;
|
||||
for(INDEX ap=0; ap<m_num_alignment_points; ap++) {
|
||||
int source_word = m_alignment->GetSourceWord( m_sentence_id, ap );
|
||||
int target_word = m_alignment->GetTargetWord( m_sentence_id, ap );
|
||||
if (source_annotation[source_word] != UNANNOTATED &&
|
||||
target_annotation[target_word] == UNANNOTATED) {
|
||||
target_annotation[target_word] = MISALIGNED;
|
||||
change = true;
|
||||
}
|
||||
if (source_annotation[source_word] == UNANNOTATED &&
|
||||
target_annotation[target_word] != UNANNOTATED) {
|
||||
source_annotation[source_word] = MISALIGNED;
|
||||
change = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// print source
|
||||
// shorten source context if too long
|
||||
int sentence_start = m_source_position - m_source_start;
|
||||
int context_space = width/2;
|
||||
for(int i=m_source_start;i<=m_source_end;i++)
|
||||
context_space -= m_suffixArray->GetWord( sentence_start + i ).size() + 1;
|
||||
context_space /= 2;
|
||||
int context_space = width/2;
|
||||
for(int i=m_source_start; i<=m_source_end; i++)
|
||||
context_space -= m_suffixArray->GetWord( sentence_start + i ).size() + 1;
|
||||
context_space /= 2;
|
||||
|
||||
int remaining = context_space;
|
||||
int start_word = m_source_start;
|
||||
for(;start_word>0 && remaining>0; start_word--)
|
||||
remaining -= m_suffixArray->GetWord( sentence_start + start_word-1 ).size() + 1;
|
||||
if (remaining<0 || start_word == -1) start_word++;
|
||||
int remaining = context_space;
|
||||
int start_word = m_source_start;
|
||||
for(; start_word>0 && remaining>0; start_word--)
|
||||
remaining -= m_suffixArray->GetWord( sentence_start + start_word-1 ).size() + 1;
|
||||
if (remaining<0 || start_word == -1) start_word++;
|
||||
|
||||
remaining = context_space;
|
||||
int end_word = m_source_end;
|
||||
for(;end_word<m_source_length && remaining>0; end_word++)
|
||||
remaining -= m_suffixArray->GetWord( sentence_start + end_word ).size() + 1;
|
||||
end_word--;
|
||||
remaining = context_space;
|
||||
int end_word = m_source_end;
|
||||
for(; end_word<m_source_length && remaining>0; end_word++)
|
||||
remaining -= m_suffixArray->GetWord( sentence_start + end_word ).size() + 1;
|
||||
end_word--;
|
||||
|
||||
// output with markup
|
||||
*out << "<tr><td class=\"pp_source_left\">";
|
||||
char current_label = UNANNOTATED;
|
||||
if (start_word>0) {
|
||||
current_label = source_annotation[start_word-1];
|
||||
*out << "... ";
|
||||
}
|
||||
for(int i=start_word; i<=end_word; i++) {
|
||||
// change to phrase block
|
||||
if (i == m_source_start) {
|
||||
if (current_label != UNANNOTATED && i!=start_word)
|
||||
*out << "</span>";
|
||||
*out << "</td><td class=\"pp_source\">";
|
||||
current_label = UNANNOTATED;
|
||||
}
|
||||
// output with markup
|
||||
*out << "<tr><td class=\"pp_source_left\">";
|
||||
char current_label = UNANNOTATED;
|
||||
if (start_word>0) {
|
||||
current_label = source_annotation[start_word-1];
|
||||
*out << "... ";
|
||||
}
|
||||
for(int i=start_word; i<=end_word; i++) {
|
||||
// change to phrase block
|
||||
if (i == m_source_start) {
|
||||
if (current_label != UNANNOTATED && i!=start_word)
|
||||
*out << "</span>";
|
||||
*out << "</td><td class=\"pp_source\">";
|
||||
current_label = UNANNOTATED;
|
||||
}
|
||||
|
||||
// change to labeled word
|
||||
else if (source_annotation[i] != current_label &&
|
||||
source_annotation[i] != ALIGNED) {
|
||||
if (current_label != UNANNOTATED && i!=start_word)
|
||||
*out << "</span>";
|
||||
if (source_annotation[i] != UNANNOTATED)
|
||||
*out << "<span class=\""
|
||||
<< label_class[ source_annotation[i] ]
|
||||
<< "\">";
|
||||
current_label = source_annotation[i];
|
||||
}
|
||||
// change to labeled word
|
||||
else if (source_annotation[i] != current_label &&
|
||||
source_annotation[i] != ALIGNED) {
|
||||
if (current_label != UNANNOTATED && i!=start_word)
|
||||
*out << "</span>";
|
||||
if (source_annotation[i] != UNANNOTATED)
|
||||
*out << "<span class=\""
|
||||
<< label_class[ source_annotation[i] ]
|
||||
<< "\">";
|
||||
current_label = source_annotation[i];
|
||||
}
|
||||
|
||||
// output word
|
||||
*out << m_suffixArray->GetWord( sentence_start + i ) << " ";
|
||||
// output word
|
||||
*out << m_suffixArray->GetWord( sentence_start + i ) << " ";
|
||||
|
||||
// change to right context block
|
||||
if (i == m_source_end) {
|
||||
*out << "</td><td class=\"pp_source_right\">";
|
||||
current_label = UNANNOTATED;
|
||||
}
|
||||
}
|
||||
// change to right context block
|
||||
if (i == m_source_end) {
|
||||
*out << "</td><td class=\"pp_source_right\">";
|
||||
current_label = UNANNOTATED;
|
||||
}
|
||||
}
|
||||
|
||||
if (current_label != UNANNOTATED && end_word>m_source_end)
|
||||
*out << "</span>";
|
||||
if (end_word<m_source_length-1)
|
||||
*out << "... ";
|
||||
if (current_label != UNANNOTATED && end_word>m_source_end)
|
||||
*out << "</span>";
|
||||
if (end_word<m_source_length-1)
|
||||
*out << "... ";
|
||||
|
||||
// print target
|
||||
// shorten target context if too long
|
||||
int target_start = -1;
|
||||
int target_end;
|
||||
for(int i=0; i<m_target_length; i++)
|
||||
if (target_annotation[i] != UNANNOTATED) {
|
||||
if (target_start == -1)
|
||||
target_start = i;
|
||||
target_end = i;
|
||||
}
|
||||
// print target
|
||||
// shorten target context if too long
|
||||
int target_start = -1;
|
||||
int target_end;
|
||||
for(int i=0; i<m_target_length; i++)
|
||||
if (target_annotation[i] != UNANNOTATED) {
|
||||
if (target_start == -1)
|
||||
target_start = i;
|
||||
target_end = i;
|
||||
}
|
||||
|
||||
context_space = width/2;
|
||||
for(int i=target_start;i<=target_end;i++)
|
||||
context_space -= m_targetCorpus->GetWord( m_sentence_id, i ).size() + 1;
|
||||
while (context_space < 0) { // shorten matched part, if too long
|
||||
context_space +=
|
||||
m_targetCorpus->GetWord( m_sentence_id, target_start ).size() +
|
||||
m_targetCorpus->GetWord( m_sentence_id, target_end ).size() + 2;
|
||||
target_start++;
|
||||
target_end--;
|
||||
}
|
||||
context_space /= 2;
|
||||
context_space = width/2;
|
||||
for(int i=target_start; i<=target_end; i++)
|
||||
context_space -= m_targetCorpus->GetWord( m_sentence_id, i ).size() + 1;
|
||||
while (context_space < 0) { // shorten matched part, if too long
|
||||
context_space +=
|
||||
m_targetCorpus->GetWord( m_sentence_id, target_start ).size() +
|
||||
m_targetCorpus->GetWord( m_sentence_id, target_end ).size() + 2;
|
||||
target_start++;
|
||||
target_end--;
|
||||
}
|
||||
context_space /= 2;
|
||||
|
||||
remaining = context_space;
|
||||
start_word = target_start;
|
||||
for(;start_word>0 && remaining>0; start_word--) {
|
||||
//cerr << "remaining: " << remaining << ", start_word: " << start_word << endl;
|
||||
remaining -= m_targetCorpus->GetWord( m_sentence_id, start_word-1 ).size() + 1;
|
||||
}
|
||||
if (remaining<0 || start_word == -1) start_word++;
|
||||
remaining = context_space;
|
||||
start_word = target_start;
|
||||
for(; start_word>0 && remaining>0; start_word--) {
|
||||
//cerr << "remaining: " << remaining << ", start_word: " << start_word << endl;
|
||||
remaining -= m_targetCorpus->GetWord( m_sentence_id, start_word-1 ).size() + 1;
|
||||
}
|
||||
if (remaining<0 || start_word == -1) start_word++;
|
||||
|
||||
remaining = context_space;
|
||||
end_word = target_end;
|
||||
for(;end_word<m_target_length && remaining>0; end_word++) {
|
||||
//cerr << "remaining: " << remaining << ", end_word: " << end_word << endl;
|
||||
remaining -= m_targetCorpus->GetWord( m_sentence_id, end_word ).size() + 1;
|
||||
}
|
||||
end_word--;
|
||||
remaining = context_space;
|
||||
end_word = target_end;
|
||||
for(; end_word<m_target_length && remaining>0; end_word++) {
|
||||
//cerr << "remaining: " << remaining << ", end_word: " << end_word << endl;
|
||||
remaining -= m_targetCorpus->GetWord( m_sentence_id, end_word ).size() + 1;
|
||||
}
|
||||
end_word--;
|
||||
|
||||
// output with markup
|
||||
*out << "</td><td class=\"mismatch_target\">";
|
||||
current_label = UNANNOTATED;
|
||||
if (start_word>0) {
|
||||
current_label = target_annotation[start_word-1];
|
||||
*out << "... ";
|
||||
}
|
||||
for(int i=start_word; i<=end_word; i++) {
|
||||
if (target_annotation[i] != current_label) {
|
||||
if (current_label != UNANNOTATED && i!=start_word)
|
||||
*out << "</span>";
|
||||
if (target_annotation[i] != UNANNOTATED)
|
||||
*out << "<span class=\""
|
||||
<< label_class[ target_annotation[i] ]
|
||||
<< "\">";
|
||||
current_label = target_annotation[i];
|
||||
}
|
||||
// output with markup
|
||||
*out << "</td><td class=\"mismatch_target\">";
|
||||
current_label = UNANNOTATED;
|
||||
if (start_word>0) {
|
||||
current_label = target_annotation[start_word-1];
|
||||
*out << "... ";
|
||||
}
|
||||
for(int i=start_word; i<=end_word; i++) {
|
||||
if (target_annotation[i] != current_label) {
|
||||
if (current_label != UNANNOTATED && i!=start_word)
|
||||
*out << "</span>";
|
||||
if (target_annotation[i] != UNANNOTATED)
|
||||
*out << "<span class=\""
|
||||
<< label_class[ target_annotation[i] ]
|
||||
<< "\">";
|
||||
current_label = target_annotation[i];
|
||||
}
|
||||
|
||||
// output word
|
||||
*out << m_targetCorpus->GetWord( m_sentence_id, i ) << " ";
|
||||
}
|
||||
// output word
|
||||
*out << m_targetCorpus->GetWord( m_sentence_id, i ) << " ";
|
||||
}
|
||||
|
||||
if (current_label != UNANNOTATED && end_word>target_end)
|
||||
*out << "</span>";
|
||||
if (end_word<m_target_length-1)
|
||||
*out << "... ";
|
||||
*out << "</td></tr>";
|
||||
if (current_label != UNANNOTATED && end_word>target_end)
|
||||
*out << "</span>";
|
||||
if (end_word<m_target_length-1)
|
||||
*out << "... ";
|
||||
*out << "</td></tr>";
|
||||
}
|
||||
|
||||
void Mismatch::LabelSourceMatches(int *source_annotation, int *target_annotation, int source_id, int label ) {
|
||||
for(INDEX ap=0; ap<m_num_alignment_points; ap++) {
|
||||
if (m_alignment->GetSourceWord( m_sentence_id, ap ) == source_id) {
|
||||
source_annotation[ source_id ] = label;
|
||||
target_annotation[ m_alignment->GetTargetWord( m_sentence_id, ap ) ] = label;
|
||||
}
|
||||
}
|
||||
void Mismatch::LabelSourceMatches(int *source_annotation, int *target_annotation, int source_id, int label )
|
||||
{
|
||||
for(INDEX ap=0; ap<m_num_alignment_points; ap++) {
|
||||
if (m_alignment->GetSourceWord( m_sentence_id, ap ) == source_id) {
|
||||
source_annotation[ source_id ] = label;
|
||||
target_annotation[ m_alignment->GetTargetWord( m_sentence_id, ap ) ] = label;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -34,7 +34,9 @@ public:
|
||||
Mismatch( SuffixArray *sa, TargetCorpus *tc, Alignment *a, INDEX sentence_id, INDEX position, int source_length, int target_length, int source_start, int source_end );
|
||||
~Mismatch();
|
||||
|
||||
bool Unaligned() const { return m_unaligned; }
|
||||
bool Unaligned() const {
|
||||
return m_unaligned;
|
||||
}
|
||||
void PrintClippedHTML(std::ostream* out, int width );
|
||||
void LabelSourceMatches(int *source_annotation, int *target_annotation, int source_id, int label );
|
||||
};
|
||||
|
@ -37,7 +37,7 @@ void PhrasePair::Print( ostream* out ) const
|
||||
INDEX ap_points = m_alignment->GetNumberOfAlignmentPoints( m_sentence_id );
|
||||
for( INDEX i=0; i<ap_points; i++) {
|
||||
*out << " " << m_alignment->GetSourceWord( m_sentence_id, i )
|
||||
<< "-" << m_alignment->GetTargetWord( m_sentence_id, i );
|
||||
<< "-" << m_alignment->GetTargetWord( m_sentence_id, i );
|
||||
}
|
||||
|
||||
*out << endl;
|
||||
@ -185,27 +185,27 @@ void PhrasePair::PrintClippedHTML( ostream* out, int width ) const
|
||||
size_t source_pre_width = (source_width-source.size())/2;
|
||||
size_t source_post_width = (source_width-source.size()+1)/2;
|
||||
|
||||
// if phrase is too long, don't show any context
|
||||
// if phrase is too long, don't show any context
|
||||
if (source.size() > (size_t)width) {
|
||||
source_pre_width = 0;
|
||||
source_post_width = 0;
|
||||
}
|
||||
// too long -> truncate and add "..."
|
||||
// too long -> truncate and add "..."
|
||||
if (source_pre.size() > source_pre_width) {
|
||||
// first skip up to a space
|
||||
while(source_pre_width>0 &&
|
||||
source_pre.substr(source_pre.size()-source_pre_width,1) != " ") {
|
||||
source_pre_width--;
|
||||
}
|
||||
// first skip up to a space
|
||||
while(source_pre_width>0 &&
|
||||
source_pre.substr(source_pre.size()-source_pre_width,1) != " ") {
|
||||
source_pre_width--;
|
||||
}
|
||||
source_pre = "..." + source_pre.substr( source_pre.size()-source_pre_width, source_pre_width );
|
||||
}
|
||||
}
|
||||
if (source_post.size() > source_post_width) {
|
||||
while(source_post_width>0 &&
|
||||
source_post.substr(source_post_width-1,1) != " ") {
|
||||
source_post_width--;
|
||||
}
|
||||
while(source_post_width>0 &&
|
||||
source_post.substr(source_post_width-1,1) != " ") {
|
||||
source_post_width--;
|
||||
}
|
||||
source_post = source_post.substr( 0, source_post_width ) + "...";
|
||||
}
|
||||
}
|
||||
|
||||
*out << "<tr><td class=\"pp_source_left\">"
|
||||
<< source_pre
|
||||
@ -220,13 +220,13 @@ void PhrasePair::PrintClippedHTML( ostream* out, int width ) const
|
||||
string target_pre = "";
|
||||
string target = "";
|
||||
string target_post = "";
|
||||
size_t target_pre_null_width = 0;
|
||||
size_t target_post_null_width = 0;
|
||||
size_t target_pre_null_width = 0;
|
||||
size_t target_post_null_width = 0;
|
||||
for( char i=0; i<m_target_start; i++ ) {
|
||||
WORD word = m_targetCorpus->GetWord( m_sentence_id, i);
|
||||
WORD word = m_targetCorpus->GetWord( m_sentence_id, i);
|
||||
target_pre += " " + word;
|
||||
if (i >= m_target_start-m_pre_null)
|
||||
target_pre_null_width += word.size() + 1;
|
||||
if (i >= m_target_start-m_pre_null)
|
||||
target_pre_null_width += word.size() + 1;
|
||||
}
|
||||
for( char i=m_target_start; i<=m_target_end; i++ ) {
|
||||
if (i>m_target_start) target += " ";
|
||||
@ -234,11 +234,11 @@ void PhrasePair::PrintClippedHTML( ostream* out, int width ) const
|
||||
}
|
||||
for( char i=m_target_end+1; i<m_target_length; i++ ) {
|
||||
if (i>m_target_end+1) target_post += " ";
|
||||
WORD word = m_targetCorpus->GetWord( m_sentence_id, i);
|
||||
WORD word = m_targetCorpus->GetWord( m_sentence_id, i);
|
||||
target_post += word;
|
||||
if (i-(m_target_end+1) < m_post_null) {
|
||||
target_post_null_width += word.size() + 1;
|
||||
}
|
||||
if (i-(m_target_end+1) < m_post_null) {
|
||||
target_post_null_width += word.size() + 1;
|
||||
}
|
||||
}
|
||||
|
||||
size_t target_pre_width = (target_width-target.size())/2;
|
||||
@ -249,46 +249,45 @@ void PhrasePair::PrintClippedHTML( ostream* out, int width ) const
|
||||
target_post_width = 0;
|
||||
}
|
||||
|
||||
if (target_pre.size() < target_pre_width)
|
||||
target_pre_width = target_pre.size();
|
||||
else {
|
||||
while(target_pre_width>0 &&
|
||||
target_pre.substr(target_pre.size()-target_pre_width,1) != " ") {
|
||||
target_pre_width--;
|
||||
}
|
||||
if (target_pre.size() < target_pre_width)
|
||||
target_pre_width = target_pre.size();
|
||||
else {
|
||||
while(target_pre_width>0 &&
|
||||
target_pre.substr(target_pre.size()-target_pre_width,1) != " ") {
|
||||
target_pre_width--;
|
||||
}
|
||||
target_pre = "..." + target_pre.substr( target_pre.size()-target_pre_width, target_pre_width );
|
||||
}
|
||||
}
|
||||
|
||||
if (target_post.size() < target_post_width) {
|
||||
target_post_width = target_post.size();
|
||||
}
|
||||
else {
|
||||
while(target_post_width>0 &&
|
||||
target_post.substr(target_post_width-1,1) != " ") {
|
||||
target_post_width--;
|
||||
}
|
||||
target_post = target_post.substr( 0, target_post_width ) + "...";
|
||||
}
|
||||
if (target_post.size() < target_post_width) {
|
||||
target_post_width = target_post.size();
|
||||
} else {
|
||||
while(target_post_width>0 &&
|
||||
target_post.substr(target_post_width-1,1) != " ") {
|
||||
target_post_width--;
|
||||
}
|
||||
target_post = target_post.substr( 0, target_post_width ) + "...";
|
||||
}
|
||||
|
||||
if (m_pre_null) {
|
||||
//cerr << endl << "target_pre_width=" << target_pre_width << ", target_pre_null_width=" << target_pre_null_width << ", target_pre.size()=" << target_pre.size() << endl;
|
||||
if (target_pre_width < target_pre.size())
|
||||
target_pre_null_width -= target_pre.size()-target_pre_width;
|
||||
target_pre = target_pre.substr(0,target_pre_width-target_pre_null_width)
|
||||
+ "<span class=\"null_aligned\">"
|
||||
+ target_pre.substr(target_pre_width-target_pre_null_width)
|
||||
+ "</span>";
|
||||
}
|
||||
if (m_post_null) {
|
||||
//cerr << endl << "target_post_width=" << target_post_width << ", target_post_null_width=" << target_post_null_width << ", target_post.size()=" << target_post.size() << endl;
|
||||
if (target_post_null_width > target_post.size()) {
|
||||
target_post_null_width = target_post.size();
|
||||
}
|
||||
target_post = "<span class=\"null_aligned\">"
|
||||
+ target_post.substr(0,target_post_null_width)
|
||||
+ "</span>"
|
||||
+ target_post.substr(target_post_null_width);
|
||||
}
|
||||
if (m_pre_null) {
|
||||
//cerr << endl << "target_pre_width=" << target_pre_width << ", target_pre_null_width=" << target_pre_null_width << ", target_pre.size()=" << target_pre.size() << endl;
|
||||
if (target_pre_width < target_pre.size())
|
||||
target_pre_null_width -= target_pre.size()-target_pre_width;
|
||||
target_pre = target_pre.substr(0,target_pre_width-target_pre_null_width)
|
||||
+ "<span class=\"null_aligned\">"
|
||||
+ target_pre.substr(target_pre_width-target_pre_null_width)
|
||||
+ "</span>";
|
||||
}
|
||||
if (m_post_null) {
|
||||
//cerr << endl << "target_post_width=" << target_post_width << ", target_post_null_width=" << target_post_null_width << ", target_post.size()=" << target_post.size() << endl;
|
||||
if (target_post_null_width > target_post.size()) {
|
||||
target_post_null_width = target_post.size();
|
||||
}
|
||||
target_post = "<span class=\"null_aligned\">"
|
||||
+ target_post.substr(0,target_post_null_width)
|
||||
+ "</span>"
|
||||
+ target_post.substr(target_post_null_width);
|
||||
}
|
||||
|
||||
*out << "<td class=\"pp_target_left\">"
|
||||
<< target_pre
|
||||
|
@ -47,15 +47,15 @@ int PhrasePairCollection::GetCollection( const vector< string >& sourceString )
|
||||
int sentence_length = m_suffixArray->GetSentenceLength( sentence_id );
|
||||
int target_length = m_targetCorpus->GetSentenceLength( sentence_id );
|
||||
//cerr << "match " << (i-first_match)
|
||||
//<< " in sentence " << sentence_id
|
||||
//<< ", starting at word " << source_start
|
||||
//<< " of " << sentence_length
|
||||
//<< ". target sentence has " << target_length << " words.";
|
||||
//<< " in sentence " << sentence_id
|
||||
//<< ", starting at word " << source_start
|
||||
//<< " of " << sentence_length
|
||||
//<< ". target sentence has " << target_length << " words.";
|
||||
int target_start, target_end, pre_null, post_null;
|
||||
if (m_alignment->PhraseAlignment( sentence_id, target_length, source_start, source_end, target_start, target_end, pre_null, post_null)) {
|
||||
//cerr << " aligned to [" << (int)target_start << "," << (int)target_end << "]";
|
||||
//cerr << " +(" << (int)pre_null << "," << (int)post_null << ")";
|
||||
bool null_boundary_words = false;
|
||||
bool null_boundary_words = false;
|
||||
for (int pre = 0; pre <= pre_null && (pre == 0 || null_boundary_words); pre++ ) {
|
||||
for (int post = 0; post <= post_null && (post == 0 || null_boundary_words); post++ ) {
|
||||
vector< WORD_ID > targetString;
|
||||
@ -75,19 +75,18 @@ int PhrasePairCollection::GetCollection( const vector< string >& sourceString )
|
||||
m_size++;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
//cerr << "mismatch " << (i-first_match)
|
||||
// << " in sentence " << sentence_id
|
||||
// << ", starting at word " << source_start
|
||||
// << " of " << sentence_length
|
||||
// << ". target sentence has " << target_length << " words.";
|
||||
Mismatch *mismatch = new Mismatch( m_suffixArray, m_targetCorpus, m_alignment, sentence_id, position, sentence_length, target_length, source_start, source_end );
|
||||
if (mismatch->Unaligned())
|
||||
m_unaligned.push_back( mismatch );
|
||||
else
|
||||
m_mismatch.push_back( mismatch );
|
||||
}
|
||||
else {
|
||||
//cerr << "mismatch " << (i-first_match)
|
||||
// << " in sentence " << sentence_id
|
||||
// << ", starting at word " << source_start
|
||||
// << " of " << sentence_length
|
||||
// << ". target sentence has " << target_length << " words.";
|
||||
Mismatch *mismatch = new Mismatch( m_suffixArray, m_targetCorpus, m_alignment, sentence_id, position, sentence_length, target_length, source_start, source_end );
|
||||
if (mismatch->Unaligned())
|
||||
m_unaligned.push_back( mismatch );
|
||||
else
|
||||
m_mismatch.push_back( mismatch );
|
||||
}
|
||||
//cerr << endl;
|
||||
|
||||
if (found > (INDEX)m_max_lookup) {
|
||||
@ -111,8 +110,7 @@ void PhrasePairCollection::Print(bool pretty) const
|
||||
for(int j=0; j<ppWithSameTarget->size() && j<m_max_example; j++, p++ ) {
|
||||
if (pretty) {
|
||||
(*p)->PrintPretty( &cout, 100 );
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
(*p)->Print( &cout );
|
||||
}
|
||||
if (ppWithSameTarget->size() > m_max_example) {
|
||||
@ -125,33 +123,32 @@ void PhrasePairCollection::Print(bool pretty) const
|
||||
void PhrasePairCollection::PrintHTML() const
|
||||
{
|
||||
int pp_target = 0;
|
||||
bool singleton = false;
|
||||
// loop over all translations
|
||||
bool singleton = false;
|
||||
// loop over all translations
|
||||
vector< vector<PhrasePair*> >::const_iterator ppWithSameTarget;
|
||||
for( ppWithSameTarget = m_collection.begin(); ppWithSameTarget != m_collection.end() && pp_target<m_max_translation; ppWithSameTarget++, pp_target++ ) {
|
||||
|
||||
int count = ppWithSameTarget->size();
|
||||
if (!singleton) {
|
||||
if (count == 1) {
|
||||
singleton = true;
|
||||
cout << "<p class=\"pp_singleton_header\">singleton"
|
||||
<< (m_collection.end() - ppWithSameTarget==1?"":"s") << " ("
|
||||
<< (m_collection.end() - ppWithSameTarget)
|
||||
<< "/" << m_size << ")</p>";
|
||||
}
|
||||
else {
|
||||
cout << "<p class=\"pp_target_header\">";
|
||||
(*(ppWithSameTarget->begin()))->PrintTarget( &cout );
|
||||
cout << " (" << count << "/" << m_size << ")" << endl;
|
||||
cout << "<p><div id=\"pp_" << pp_target << "\">";
|
||||
}
|
||||
cout << "<table align=\"center\">";
|
||||
}
|
||||
int count = ppWithSameTarget->size();
|
||||
if (!singleton) {
|
||||
if (count == 1) {
|
||||
singleton = true;
|
||||
cout << "<p class=\"pp_singleton_header\">singleton"
|
||||
<< (m_collection.end() - ppWithSameTarget==1?"":"s") << " ("
|
||||
<< (m_collection.end() - ppWithSameTarget)
|
||||
<< "/" << m_size << ")</p>";
|
||||
} else {
|
||||
cout << "<p class=\"pp_target_header\">";
|
||||
(*(ppWithSameTarget->begin()))->PrintTarget( &cout );
|
||||
cout << " (" << count << "/" << m_size << ")" << endl;
|
||||
cout << "<p><div id=\"pp_" << pp_target << "\">";
|
||||
}
|
||||
cout << "<table align=\"center\">";
|
||||
}
|
||||
|
||||
vector< PhrasePair* >::const_iterator p;
|
||||
// loop over all sentences where translation occurs
|
||||
// loop over all sentences where translation occurs
|
||||
int pp=0;
|
||||
int i=0;
|
||||
int i=0;
|
||||
for(p = ppWithSameTarget->begin(); i<10 && pp<count && p != ppWithSameTarget->end(); p++, pp++, i++ ) {
|
||||
(*p)->PrintClippedHTML( &cout, 160 );
|
||||
if (count > m_max_example) {
|
||||
@ -159,54 +156,54 @@ void PhrasePairCollection::PrintHTML() const
|
||||
pp += count/m_max_example-1;
|
||||
}
|
||||
}
|
||||
if (i == 10 && pp < count) {
|
||||
// extended table
|
||||
cout << "<tr><td colspan=7 align=center class=\"pp_more\" onclick=\"javascript:document.getElementById('pp_" << pp_target << "').style.display = 'none'; document.getElementById('pp_ext_" << pp_target << "').style.display = 'block';\">(more)</td></tr></table></div>";
|
||||
cout << "<div id=\"pp_ext_" << pp_target << "\" style=\"display:none;\";\">";
|
||||
cout << "<table align=\"center\">";
|
||||
for(i=0, pp=0, p = ppWithSameTarget->begin(); i<m_max_example && pp<count && p != ppWithSameTarget->end(); p++, pp++, i++ ) {
|
||||
(*p)->PrintClippedHTML( &cout, 160 );
|
||||
if (count > m_max_example) {
|
||||
p += count/m_max_example-1;
|
||||
pp += count/m_max_example-1;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!singleton) cout << "</table></div>\n";
|
||||
|
||||
if (!singleton && pp_target == 9) {
|
||||
cout << "<div id=\"pp_toggle\" onclick=\"javascript:document.getElementById('pp_toggle').style.display = 'none'; document.getElementById('pp_additional').style.display = 'block';\">";
|
||||
cout << "<p class=\"pp_target_header\">(more)</p></div>";
|
||||
cout << "<div id=\"pp_additional\" style=\"display:none;\";\">";
|
||||
}
|
||||
if (i == 10 && pp < count) {
|
||||
// extended table
|
||||
cout << "<tr><td colspan=7 align=center class=\"pp_more\" onclick=\"javascript:document.getElementById('pp_" << pp_target << "').style.display = 'none'; document.getElementById('pp_ext_" << pp_target << "').style.display = 'block';\">(more)</td></tr></table></div>";
|
||||
cout << "<div id=\"pp_ext_" << pp_target << "\" style=\"display:none;\";\">";
|
||||
cout << "<table align=\"center\">";
|
||||
for(i=0, pp=0, p = ppWithSameTarget->begin(); i<m_max_example && pp<count && p != ppWithSameTarget->end(); p++, pp++, i++ ) {
|
||||
(*p)->PrintClippedHTML( &cout, 160 );
|
||||
if (count > m_max_example) {
|
||||
p += count/m_max_example-1;
|
||||
pp += count/m_max_example-1;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!singleton) cout << "</table></div>\n";
|
||||
|
||||
if (!singleton && pp_target == 9) {
|
||||
cout << "<div id=\"pp_toggle\" onclick=\"javascript:document.getElementById('pp_toggle').style.display = 'none'; document.getElementById('pp_additional').style.display = 'block';\">";
|
||||
cout << "<p class=\"pp_target_header\">(more)</p></div>";
|
||||
cout << "<div id=\"pp_additional\" style=\"display:none;\";\">";
|
||||
}
|
||||
}
|
||||
if (singleton) cout << "</table></div>\n";
|
||||
else if (pp_target > 9) cout << "</div>";
|
||||
if (singleton) cout << "</table></div>\n";
|
||||
else if (pp_target > 9) cout << "</div>";
|
||||
|
||||
size_t max_mismatch = m_max_example/3;
|
||||
// unaligned phrases
|
||||
if (m_unaligned.size() > 0) {
|
||||
cout << "<p class=\"pp_singleton_header\">unaligned"
|
||||
<< " (" << (m_unaligned.size()) << ")</p>";
|
||||
cout << "<table align=\"center\">";
|
||||
int step_size = 1;
|
||||
if (m_unaligned.size() > max_mismatch)
|
||||
step_size = (m_unaligned.size()+max_mismatch-1) / max_mismatch;
|
||||
for(size_t i=0;i<m_unaligned.size();i+=step_size)
|
||||
m_unaligned[i]->PrintClippedHTML( &cout, 160 );
|
||||
cout << "</table>";
|
||||
}
|
||||
size_t max_mismatch = m_max_example/3;
|
||||
// unaligned phrases
|
||||
if (m_unaligned.size() > 0) {
|
||||
cout << "<p class=\"pp_singleton_header\">unaligned"
|
||||
<< " (" << (m_unaligned.size()) << ")</p>";
|
||||
cout << "<table align=\"center\">";
|
||||
int step_size = 1;
|
||||
if (m_unaligned.size() > max_mismatch)
|
||||
step_size = (m_unaligned.size()+max_mismatch-1) / max_mismatch;
|
||||
for(size_t i=0; i<m_unaligned.size(); i+=step_size)
|
||||
m_unaligned[i]->PrintClippedHTML( &cout, 160 );
|
||||
cout << "</table>";
|
||||
}
|
||||
|
||||
// mismatched phrases
|
||||
if (m_mismatch.size() > 0) {
|
||||
cout << "<p class=\"pp_singleton_header\">mismatched"
|
||||
<< " (" << (m_mismatch.size()) << ")</p>";
|
||||
cout << "<table align=\"center\">";
|
||||
int step_size = 1;
|
||||
if (m_mismatch.size() > max_mismatch)
|
||||
step_size = (m_mismatch.size()+max_mismatch-1) / max_mismatch;
|
||||
for(size_t i=0;i<m_mismatch.size();i+=step_size)
|
||||
m_mismatch[i]->PrintClippedHTML( &cout, 160 );
|
||||
cout << "</table>";
|
||||
}
|
||||
// mismatched phrases
|
||||
if (m_mismatch.size() > 0) {
|
||||
cout << "<p class=\"pp_singleton_header\">mismatched"
|
||||
<< " (" << (m_mismatch.size()) << ")</p>";
|
||||
cout << "<table align=\"center\">";
|
||||
int step_size = 1;
|
||||
if (m_mismatch.size() > max_mismatch)
|
||||
step_size = (m_mismatch.size()+max_mismatch-1) / max_mismatch;
|
||||
for(size_t i=0; i<m_mismatch.size(); i+=step_size)
|
||||
m_mismatch[i]->PrintClippedHTML( &cout, 160 );
|
||||
cout << "</table>";
|
||||
}
|
||||
}
|
||||
|
@ -5,7 +5,8 @@
|
||||
#include <stdlib.h>
|
||||
#include <cstring>
|
||||
|
||||
namespace {
|
||||
namespace
|
||||
{
|
||||
|
||||
const int LINE_MAX_LENGTH = 10000;
|
||||
|
||||
@ -14,15 +15,15 @@ const int LINE_MAX_LENGTH = 10000;
|
||||
using namespace std;
|
||||
|
||||
SuffixArray::SuffixArray()
|
||||
: m_array(NULL),
|
||||
m_index(NULL),
|
||||
m_buffer(NULL),
|
||||
m_wordInSentence(NULL),
|
||||
m_sentence(NULL),
|
||||
m_sentenceLength(NULL),
|
||||
m_vcb(),
|
||||
m_size(0),
|
||||
m_sentenceCount(0) { }
|
||||
: m_array(NULL),
|
||||
m_index(NULL),
|
||||
m_buffer(NULL),
|
||||
m_wordInSentence(NULL),
|
||||
m_sentence(NULL),
|
||||
m_sentenceLength(NULL),
|
||||
m_vcb(),
|
||||
m_size(0),
|
||||
m_sentenceCount(0) { }
|
||||
|
||||
SuffixArray::~SuffixArray()
|
||||
{
|
||||
|
@ -5,7 +5,8 @@
|
||||
#include <stdlib.h>
|
||||
#include <cstring>
|
||||
|
||||
namespace {
|
||||
namespace
|
||||
{
|
||||
|
||||
const int LINE_MAX_LENGTH = 10000;
|
||||
|
||||
@ -14,11 +15,11 @@ const int LINE_MAX_LENGTH = 10000;
|
||||
using namespace std;
|
||||
|
||||
TargetCorpus::TargetCorpus()
|
||||
: m_array(NULL),
|
||||
m_sentenceEnd(NULL),
|
||||
m_vcb(),
|
||||
m_size(0),
|
||||
m_sentenceCount(0) {}
|
||||
: m_array(NULL),
|
||||
m_sentenceEnd(NULL),
|
||||
m_vcb(),
|
||||
m_size(0),
|
||||
m_sentenceCount(0) {}
|
||||
|
||||
TargetCorpus::~TargetCorpus()
|
||||
{
|
||||
|
@ -2,7 +2,8 @@
|
||||
#include "Vocabulary.h"
|
||||
#include <fstream>
|
||||
|
||||
namespace {
|
||||
namespace
|
||||
{
|
||||
|
||||
const int MAX_LENGTH = 10000;
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
/*
|
||||
/*
|
||||
base64.cpp and base64.h
|
||||
|
||||
Copyright (C) 2004-2008 René Nyffenegger
|
||||
@ -28,17 +28,19 @@
|
||||
#include "base64.h"
|
||||
#include <iostream>
|
||||
|
||||
static const std::string base64_chars =
|
||||
"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
|
||||
"abcdefghijklmnopqrstuvwxyz"
|
||||
"0123456789+/";
|
||||
static const std::string base64_chars =
|
||||
"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
|
||||
"abcdefghijklmnopqrstuvwxyz"
|
||||
"0123456789+/";
|
||||
|
||||
|
||||
static inline bool is_base64(unsigned char c) {
|
||||
static inline bool is_base64(unsigned char c)
|
||||
{
|
||||
return (isalnum(c) || (c == '+') || (c == '/'));
|
||||
}
|
||||
|
||||
std::string base64_encode(unsigned char const* bytes_to_encode, unsigned int in_len) {
|
||||
std::string base64_encode(unsigned char const* bytes_to_encode, unsigned int in_len)
|
||||
{
|
||||
std::string ret;
|
||||
int i = 0;
|
||||
int j = 0;
|
||||
@ -59,8 +61,7 @@ std::string base64_encode(unsigned char const* bytes_to_encode, unsigned int in_
|
||||
}
|
||||
}
|
||||
|
||||
if (i)
|
||||
{
|
||||
if (i) {
|
||||
for(j = i; j < 3; j++)
|
||||
char_array_3[j] = '\0';
|
||||
|
||||
@ -81,7 +82,8 @@ std::string base64_encode(unsigned char const* bytes_to_encode, unsigned int in_
|
||||
|
||||
}
|
||||
|
||||
std::string base64_decode(std::string const& encoded_string) {
|
||||
std::string base64_decode(std::string const& encoded_string)
|
||||
{
|
||||
int in_len = encoded_string.size();
|
||||
int i = 0;
|
||||
int j = 0;
|
||||
@ -90,7 +92,8 @@ std::string base64_decode(std::string const& encoded_string) {
|
||||
std::string ret;
|
||||
|
||||
while (in_len-- && ( encoded_string[in_] != '=') && is_base64(encoded_string[in_])) {
|
||||
char_array_4[i++] = encoded_string[in_]; in_++;
|
||||
char_array_4[i++] = encoded_string[in_];
|
||||
in_++;
|
||||
if (i ==4) {
|
||||
for (i = 0; i <4; i++)
|
||||
char_array_4[i] = base64_chars.find(char_array_4[i]);
|
||||
|
@ -150,22 +150,19 @@ int main(int argc, char* argv[])
|
||||
cout << "TOTAL: " << total << endl;
|
||||
if (htmlFlag) {
|
||||
ppCollection.PrintHTML();
|
||||
}
|
||||
else {
|
||||
ppCollection.Print(prettyFlag);
|
||||
} else {
|
||||
ppCollection.Print(prettyFlag);
|
||||
}
|
||||
cout << "-|||- BICONCOR END -|||-" << endl << flush;
|
||||
}
|
||||
}
|
||||
else if (queryFlag) {
|
||||
} else if (queryFlag) {
|
||||
cerr << "query is " << query << endl;
|
||||
vector< string > queryString = alignment.Tokenize( query.c_str() );
|
||||
PhrasePairCollection ppCollection( &suffixArray, &targetCorpus, &alignment, max_translation, max_example );
|
||||
ppCollection.GetCollection( queryString );
|
||||
if (htmlFlag) {
|
||||
ppCollection.PrintHTML();
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
ppCollection.Print(prettyFlag);
|
||||
}
|
||||
}
|
||||
|
@ -29,155 +29,158 @@ using namespace std;
|
||||
namespace Moses
|
||||
{
|
||||
|
||||
PhraseDictionaryInterpolated::PhraseDictionaryInterpolated
|
||||
(size_t numScoreComponent,size_t numInputScores,const PhraseDictionaryFeature* feature):
|
||||
PhraseDictionary(numScoreComponent,feature),
|
||||
m_targetPhrases(NULL),
|
||||
m_languageModels(NULL) {}
|
||||
PhraseDictionaryInterpolated::PhraseDictionaryInterpolated
|
||||
(size_t numScoreComponent,size_t numInputScores,const PhraseDictionaryFeature* feature):
|
||||
PhraseDictionary(numScoreComponent,feature),
|
||||
m_targetPhrases(NULL),
|
||||
m_languageModels(NULL) {}
|
||||
|
||||
bool PhraseDictionaryInterpolated::Load(
|
||||
const std::vector<FactorType> &input
|
||||
, const std::vector<FactorType> &output
|
||||
, const std::vector<std::string>& config
|
||||
, const std::vector<float> &weightT
|
||||
, size_t tableLimit
|
||||
, const LMList &languageModels
|
||||
, float weightWP) {
|
||||
bool PhraseDictionaryInterpolated::Load(
|
||||
const std::vector<FactorType> &input
|
||||
, const std::vector<FactorType> &output
|
||||
, const std::vector<std::string>& config
|
||||
, const std::vector<float> &weightT
|
||||
, size_t tableLimit
|
||||
, const LMList &languageModels
|
||||
, float weightWP)
|
||||
{
|
||||
|
||||
m_languageModels = &languageModels;
|
||||
m_weightT = weightT;
|
||||
m_tableLimit = tableLimit;
|
||||
m_weightWP = weightWP;
|
||||
m_languageModels = &languageModels;
|
||||
m_weightT = weightT;
|
||||
m_tableLimit = tableLimit;
|
||||
m_weightWP = weightWP;
|
||||
|
||||
//The config should be as follows:
|
||||
//0-3: type factor factor num-components (as usual)
|
||||
//4: combination mode (e.g. naive)
|
||||
//5-(length-2): List of phrase-table files
|
||||
//length-1: Weight string, in the same format as used for tmcombine
|
||||
//The config should be as follows:
|
||||
//0-3: type factor factor num-components (as usual)
|
||||
//4: combination mode (e.g. naive)
|
||||
//5-(length-2): List of phrase-table files
|
||||
//length-1: Weight string, in the same format as used for tmcombine
|
||||
|
||||
UTIL_THROW_IF(config.size() < 7, util::Exception, "Missing fields from phrase table configuration: expected at least 7");
|
||||
UTIL_THROW_IF(config[4] != "naive", util::Exception, "Unsupported combination mode: '" << config[4] << "'");
|
||||
|
||||
// Create the dictionaries
|
||||
for (size_t i = 5; i < config.size()-1; ++i) {
|
||||
m_dictionaries.push_back(DictionaryHandle(new PhraseDictionaryTreeAdaptor(
|
||||
GetFeature()->GetNumScoreComponents(),
|
||||
GetFeature()->GetNumInputScores(),
|
||||
GetFeature())));
|
||||
bool ret = m_dictionaries.back()->Load(
|
||||
input,
|
||||
output,
|
||||
config[i],
|
||||
weightT,
|
||||
0,
|
||||
languageModels,
|
||||
weightWP);
|
||||
if (!ret) return ret;
|
||||
}
|
||||
UTIL_THROW_IF(config.size() < 7, util::Exception, "Missing fields from phrase table configuration: expected at least 7");
|
||||
UTIL_THROW_IF(config[4] != "naive", util::Exception, "Unsupported combination mode: '" << config[4] << "'");
|
||||
|
||||
//Parse the weight strings
|
||||
for (util::TokenIter<util::SingleCharacter, false> featureWeights(config.back(), util::SingleCharacter(';')); featureWeights; ++featureWeights) {
|
||||
m_weights.push_back(vector<float>());
|
||||
float sum = 0;
|
||||
for (util::TokenIter<util::SingleCharacter, false> tableWeights(*featureWeights, util::SingleCharacter(',')); tableWeights; ++tableWeights) {
|
||||
const float weight = boost::lexical_cast<float>(*tableWeights);
|
||||
m_weights.back().push_back(weight);
|
||||
sum += weight;
|
||||
}
|
||||
UTIL_THROW_IF(m_weights.back().size() != m_dictionaries.size(), util::Exception,
|
||||
"Number of weights (" << m_weights.back().size() <<
|
||||
") does not match number of dictionaries to combine (" << m_dictionaries.size() << ")");
|
||||
UTIL_THROW_IF(abs(sum - 1) > 0.01, util::Exception, "Weights not normalised");
|
||||
|
||||
}
|
||||
|
||||
//check number of weight sets. Make sure there is a weight for every score component
|
||||
//except for the last - which is assumed to be the phrase penalty.
|
||||
UTIL_THROW_IF(m_weights.size() != 1 && m_weights.size() != GetFeature()->GetNumScoreComponents()-1, util::Exception, "Unexpected number of weight sets");
|
||||
//if 1 weight set, then repeat
|
||||
if (m_weights.size() == 1) {
|
||||
while(m_weights.size() < GetFeature()->GetNumScoreComponents()-1) {
|
||||
m_weights.push_back(m_weights[0]);
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
// Create the dictionaries
|
||||
for (size_t i = 5; i < config.size()-1; ++i) {
|
||||
m_dictionaries.push_back(DictionaryHandle(new PhraseDictionaryTreeAdaptor(
|
||||
GetFeature()->GetNumScoreComponents(),
|
||||
GetFeature()->GetNumInputScores(),
|
||||
GetFeature())));
|
||||
bool ret = m_dictionaries.back()->Load(
|
||||
input,
|
||||
output,
|
||||
config[i],
|
||||
weightT,
|
||||
0,
|
||||
languageModels,
|
||||
weightWP);
|
||||
if (!ret) return ret;
|
||||
}
|
||||
|
||||
void PhraseDictionaryInterpolated::InitializeForInput(InputType const& source) {
|
||||
for (size_t i = 0; i < m_dictionaries.size(); ++i) {
|
||||
m_dictionaries[i]->InitializeForInput(source);
|
||||
//Parse the weight strings
|
||||
for (util::TokenIter<util::SingleCharacter, false> featureWeights(config.back(), util::SingleCharacter(';')); featureWeights; ++featureWeights) {
|
||||
m_weights.push_back(vector<float>());
|
||||
float sum = 0;
|
||||
for (util::TokenIter<util::SingleCharacter, false> tableWeights(*featureWeights, util::SingleCharacter(',')); tableWeights; ++tableWeights) {
|
||||
const float weight = boost::lexical_cast<float>(*tableWeights);
|
||||
m_weights.back().push_back(weight);
|
||||
sum += weight;
|
||||
}
|
||||
UTIL_THROW_IF(m_weights.back().size() != m_dictionaries.size(), util::Exception,
|
||||
"Number of weights (" << m_weights.back().size() <<
|
||||
") does not match number of dictionaries to combine (" << m_dictionaries.size() << ")");
|
||||
UTIL_THROW_IF(abs(sum - 1) > 0.01, util::Exception, "Weights not normalised");
|
||||
|
||||
}
|
||||
|
||||
//check number of weight sets. Make sure there is a weight for every score component
|
||||
//except for the last - which is assumed to be the phrase penalty.
|
||||
UTIL_THROW_IF(m_weights.size() != 1 && m_weights.size() != GetFeature()->GetNumScoreComponents()-1, util::Exception, "Unexpected number of weight sets");
|
||||
//if 1 weight set, then repeat
|
||||
if (m_weights.size() == 1) {
|
||||
while(m_weights.size() < GetFeature()->GetNumScoreComponents()-1) {
|
||||
m_weights.push_back(m_weights[0]);
|
||||
}
|
||||
}
|
||||
|
||||
typedef
|
||||
boost::unordered_set<TargetPhrase*,PhrasePtrHasher,PhrasePtrComparator> PhraseSet;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
const TargetPhraseCollection*
|
||||
PhraseDictionaryInterpolated::GetTargetPhraseCollection(const Phrase& src) const {
|
||||
void PhraseDictionaryInterpolated::InitializeForInput(InputType const& source)
|
||||
{
|
||||
for (size_t i = 0; i < m_dictionaries.size(); ++i) {
|
||||
m_dictionaries[i]->InitializeForInput(source);
|
||||
}
|
||||
}
|
||||
|
||||
delete m_targetPhrases;
|
||||
m_targetPhrases = new TargetPhraseCollection();
|
||||
PhraseSet allPhrases;
|
||||
vector<PhraseSet> phrasesByTable(m_dictionaries.size());
|
||||
for (size_t i = 0; i < m_dictionaries.size(); ++i) {
|
||||
const TargetPhraseCollection* phrases = m_dictionaries[i]->GetTargetPhraseCollection(src);
|
||||
if (phrases) {
|
||||
for (TargetPhraseCollection::const_iterator j = phrases->begin();
|
||||
j != phrases->end(); ++j) {
|
||||
allPhrases.insert(*j);
|
||||
phrasesByTable[i].insert(*j);
|
||||
typedef
|
||||
boost::unordered_set<TargetPhrase*,PhrasePtrHasher,PhrasePtrComparator> PhraseSet;
|
||||
|
||||
|
||||
const TargetPhraseCollection*
|
||||
PhraseDictionaryInterpolated::GetTargetPhraseCollection(const Phrase& src) const
|
||||
{
|
||||
|
||||
delete m_targetPhrases;
|
||||
m_targetPhrases = new TargetPhraseCollection();
|
||||
PhraseSet allPhrases;
|
||||
vector<PhraseSet> phrasesByTable(m_dictionaries.size());
|
||||
for (size_t i = 0; i < m_dictionaries.size(); ++i) {
|
||||
const TargetPhraseCollection* phrases = m_dictionaries[i]->GetTargetPhraseCollection(src);
|
||||
if (phrases) {
|
||||
for (TargetPhraseCollection::const_iterator j = phrases->begin();
|
||||
j != phrases->end(); ++j) {
|
||||
allPhrases.insert(*j);
|
||||
phrasesByTable[i].insert(*j);
|
||||
}
|
||||
}
|
||||
}
|
||||
ScoreComponentCollection sparseVector;
|
||||
for (PhraseSet::const_iterator i = allPhrases.begin(); i != allPhrases.end(); ++i) {
|
||||
TargetPhrase* combinedPhrase = new TargetPhrase((Phrase)**i);
|
||||
//combinedPhrase->ResetScore();
|
||||
//cerr << *combinedPhrase << " " << combinedPhrase->GetScoreBreakdown() << endl;
|
||||
combinedPhrase->SetSourcePhrase((*i)->GetSourcePhrase());
|
||||
combinedPhrase->SetAlignTerm(&((*i)->GetAlignTerm()));
|
||||
combinedPhrase->SetAlignNonTerm(&((*i)->GetAlignTerm()));
|
||||
Scores combinedScores(GetFeature()->GetNumScoreComponents());
|
||||
for (size_t j = 0; j < phrasesByTable.size(); ++j) {
|
||||
PhraseSet::const_iterator tablePhrase = phrasesByTable[j].find(combinedPhrase);
|
||||
if (tablePhrase != phrasesByTable[j].end()) {
|
||||
Scores tableScores = (*tablePhrase)->GetScoreBreakdown()
|
||||
.GetScoresForProducer(GetFeature());
|
||||
//cerr << "Scores from " << j << " table: ";
|
||||
for (size_t k = 0; k < tableScores.size()-1; ++k) {
|
||||
//cerr << tableScores[k] << "(" << exp(tableScores[k]) << ") ";
|
||||
combinedScores[k] += m_weights[k][j] * exp(tableScores[k]);
|
||||
//cerr << m_weights[k][j] * exp(tableScores[k]) << " ";
|
||||
}
|
||||
//cerr << endl;
|
||||
}
|
||||
}
|
||||
ScoreComponentCollection sparseVector;
|
||||
for (PhraseSet::const_iterator i = allPhrases.begin(); i != allPhrases.end(); ++i) {
|
||||
TargetPhrase* combinedPhrase = new TargetPhrase((Phrase)**i);
|
||||
//combinedPhrase->ResetScore();
|
||||
//cerr << *combinedPhrase << " " << combinedPhrase->GetScoreBreakdown() << endl;
|
||||
combinedPhrase->SetSourcePhrase((*i)->GetSourcePhrase());
|
||||
combinedPhrase->SetAlignTerm(&((*i)->GetAlignTerm()));
|
||||
combinedPhrase->SetAlignNonTerm(&((*i)->GetAlignTerm()));
|
||||
Scores combinedScores(GetFeature()->GetNumScoreComponents());
|
||||
for (size_t j = 0; j < phrasesByTable.size(); ++j) {
|
||||
PhraseSet::const_iterator tablePhrase = phrasesByTable[j].find(combinedPhrase);
|
||||
if (tablePhrase != phrasesByTable[j].end()) {
|
||||
Scores tableScores = (*tablePhrase)->GetScoreBreakdown()
|
||||
.GetScoresForProducer(GetFeature());
|
||||
//cerr << "Scores from " << j << " table: ";
|
||||
for (size_t k = 0; k < tableScores.size()-1; ++k) {
|
||||
//cerr << tableScores[k] << "(" << exp(tableScores[k]) << ") ";
|
||||
combinedScores[k] += m_weights[k][j] * exp(tableScores[k]);
|
||||
//cerr << m_weights[k][j] * exp(tableScores[k]) << " ";
|
||||
}
|
||||
//cerr << endl;
|
||||
}
|
||||
}
|
||||
//map back to log space
|
||||
//cerr << "Combined ";
|
||||
for (size_t k = 0; k < combinedScores.size()-1; ++k) {
|
||||
//cerr << combinedScores[k] << " ";
|
||||
combinedScores[k] = log(combinedScores[k]);
|
||||
//cerr << combinedScores[k] << " ";
|
||||
}
|
||||
//cerr << endl;
|
||||
combinedScores.back() = 1; //assume last is penalty
|
||||
combinedPhrase->SetScore(
|
||||
GetFeature(),
|
||||
combinedScores,
|
||||
sparseVector,
|
||||
m_weightT,
|
||||
m_weightWP,
|
||||
*m_languageModels);
|
||||
//cerr << *combinedPhrase << " " << combinedPhrase->GetScoreBreakdown() << endl;
|
||||
m_targetPhrases->Add(combinedPhrase);
|
||||
//map back to log space
|
||||
//cerr << "Combined ";
|
||||
for (size_t k = 0; k < combinedScores.size()-1; ++k) {
|
||||
//cerr << combinedScores[k] << " ";
|
||||
combinedScores[k] = log(combinedScores[k]);
|
||||
//cerr << combinedScores[k] << " ";
|
||||
}
|
||||
|
||||
m_targetPhrases->Prune(true,m_tableLimit);
|
||||
|
||||
|
||||
return m_targetPhrases;
|
||||
//cerr << endl;
|
||||
combinedScores.back() = 1; //assume last is penalty
|
||||
combinedPhrase->SetScore(
|
||||
GetFeature(),
|
||||
combinedScores,
|
||||
sparseVector,
|
||||
m_weightT,
|
||||
m_weightWP,
|
||||
*m_languageModels);
|
||||
//cerr << *combinedPhrase << " " << combinedPhrase->GetScoreBreakdown() << endl;
|
||||
m_targetPhrases->Add(combinedPhrase);
|
||||
}
|
||||
|
||||
m_targetPhrases->Prune(true,m_tableLimit);
|
||||
|
||||
|
||||
return m_targetPhrases;
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -34,12 +34,14 @@ namespace Moses
|
||||
**/
|
||||
class PhraseDictionaryInterpolated : public PhraseDictionary
|
||||
{
|
||||
public:
|
||||
public:
|
||||
|
||||
PhraseDictionaryInterpolated
|
||||
(size_t numScoreComponent,size_t numInputScores,const PhraseDictionaryFeature* feature);
|
||||
(size_t numScoreComponent,size_t numInputScores,const PhraseDictionaryFeature* feature);
|
||||
|
||||
virtual ~PhraseDictionaryInterpolated() {delete m_targetPhrases;}
|
||||
virtual ~PhraseDictionaryInterpolated() {
|
||||
delete m_targetPhrases;
|
||||
}
|
||||
|
||||
// initialize ...
|
||||
bool Load(const std::vector<FactorType> &input
|
||||
@ -58,7 +60,7 @@ class PhraseDictionaryInterpolated : public PhraseDictionary
|
||||
throw std::logic_error("PhraseDictionaryInterpolated.CreateRuleLookupManager() Not implemented");
|
||||
}
|
||||
|
||||
private:
|
||||
private:
|
||||
|
||||
typedef boost::shared_ptr<PhraseDictionaryTreeAdaptor> DictionaryHandle;
|
||||
std::vector<DictionaryHandle> m_dictionaries;
|
||||
|
@ -31,7 +31,8 @@ BOOST_AUTO_TEST_SUITE(phrase_length_feature)
|
||||
|
||||
//TODO: Factor out setup code so that it can be reused
|
||||
|
||||
static Word MakeWord(string text) {
|
||||
static Word MakeWord(string text)
|
||||
{
|
||||
FactorCollection &factorCollection = FactorCollection::Instance();
|
||||
const Factor* f = factorCollection.AddFactor(Input,0,text);
|
||||
Word w;
|
||||
@ -40,7 +41,8 @@ static Word MakeWord(string text) {
|
||||
}
|
||||
|
||||
|
||||
BOOST_AUTO_TEST_CASE(evaluate) {
|
||||
BOOST_AUTO_TEST_CASE(evaluate)
|
||||
{
|
||||
Word w1 = MakeWord("w1");
|
||||
Word w2 = MakeWord("y2");
|
||||
Word w3 = MakeWord("x3");
|
||||
@ -78,7 +80,7 @@ BOOST_AUTO_TEST_CASE(evaluate) {
|
||||
PhraseBasedFeatureContext context1(topt1,sentence);
|
||||
PhraseBasedFeatureContext context2(topt2,sentence);
|
||||
PhraseBasedFeatureContext context3(topt3,sentence);
|
||||
|
||||
|
||||
PhraseLengthFeature plf;
|
||||
|
||||
ScoreComponentCollection acc1,acc2,acc3;
|
||||
|
@ -34,12 +34,13 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
using namespace std;
|
||||
using namespace Moses;
|
||||
|
||||
namespace MosesTest
|
||||
namespace MosesTest
|
||||
{
|
||||
|
||||
BOOST_AUTO_TEST_SUITE(target_bigram)
|
||||
|
||||
static Word MakeWord(string text) {
|
||||
static Word MakeWord(string text)
|
||||
{
|
||||
FactorCollection &factorCollection = FactorCollection::Instance();
|
||||
const Factor* f = factorCollection.AddFactor(Input,0,text);
|
||||
Word w;
|
||||
@ -47,34 +48,32 @@ static Word MakeWord(string text) {
|
||||
return w;
|
||||
}
|
||||
|
||||
class VocabFileFixture {
|
||||
public:
|
||||
template<class I>
|
||||
VocabFileFixture(I begin, I end)
|
||||
{
|
||||
char name[] = "TargetBigramXXXXXX";
|
||||
int fd = mkstemp(name);
|
||||
BOOST_CHECK(fd != -1);
|
||||
BOOST_CHECK(!close(fd));
|
||||
filename = name;
|
||||
ofstream out(name);
|
||||
for (I i = begin; i != end; ++i)
|
||||
{
|
||||
out << *i << endl;
|
||||
}
|
||||
out.close();
|
||||
class VocabFileFixture
|
||||
{
|
||||
public:
|
||||
template<class I>
|
||||
VocabFileFixture(I begin, I end) {
|
||||
char name[] = "TargetBigramXXXXXX";
|
||||
int fd = mkstemp(name);
|
||||
BOOST_CHECK(fd != -1);
|
||||
BOOST_CHECK(!close(fd));
|
||||
filename = name;
|
||||
ofstream out(name);
|
||||
for (I i = begin; i != end; ++i) {
|
||||
out << *i << endl;
|
||||
}
|
||||
out.close();
|
||||
}
|
||||
|
||||
~VocabFileFixture()
|
||||
{
|
||||
BOOST_CHECK(!remove(filename.c_str()));
|
||||
}
|
||||
~VocabFileFixture() {
|
||||
BOOST_CHECK(!remove(filename.c_str()));
|
||||
}
|
||||
|
||||
string filename;
|
||||
string filename;
|
||||
};
|
||||
|
||||
/*
|
||||
BOOST_AUTO_TEST_CASE(Test2)
|
||||
BOOST_AUTO_TEST_CASE(Test2)
|
||||
{
|
||||
HypothesisFixture hypos;
|
||||
cerr << hypos.empty() << ", " << *hypos.empty() << endl;
|
||||
@ -113,7 +112,7 @@ BOOST_AUTO_TEST_CASE(score_components)
|
||||
ScoreProducer::unlimited);
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE(empty_hypo)
|
||||
BOOST_AUTO_TEST_CASE(empty_hypo)
|
||||
{
|
||||
Sentence s;
|
||||
TargetBigramFeature tbf;
|
||||
@ -124,7 +123,7 @@ BOOST_AUTO_TEST_CASE(empty_hypo)
|
||||
}
|
||||
|
||||
//Test of evaluate() where a vocab is specified
|
||||
BOOST_AUTO_TEST_CASE(evaluate_vocab)
|
||||
BOOST_AUTO_TEST_CASE(evaluate_vocab)
|
||||
{
|
||||
string vocab[] = {"i", "do"};
|
||||
VocabFileFixture vocabFile(vocab,vocab+2);
|
||||
@ -156,7 +155,7 @@ BOOST_AUTO_TEST_CASE(evaluate_all)
|
||||
BOOST_CHECK_EQUAL(scc.GetScoreForProducer(&tbf, "do:not"),1);
|
||||
BOOST_CHECK_EQUAL(scc.GetScoreForProducer(&tbf, "not:</s>"),0);
|
||||
BOOST_CHECK(! currState->Compare(TargetBigramState(MakeWord("not"))));
|
||||
|
||||
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE(evaluate_empty)
|
||||
@ -171,7 +170,7 @@ BOOST_AUTO_TEST_CASE(evaluate_empty)
|
||||
BOOST_CHECK(! currState->Compare(*prevState));
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE(evaluate_eos)
|
||||
BOOST_AUTO_TEST_CASE(evaluate_eos)
|
||||
{
|
||||
HypothesisFixture hypos;
|
||||
TargetBigramFeature tbf;
|
||||
|
@ -18,7 +18,8 @@
|
||||
|
||||
using namespace std;
|
||||
|
||||
namespace {
|
||||
namespace
|
||||
{
|
||||
|
||||
// configure regularisation
|
||||
const char KEY_REFLEN[] = "reflen";
|
||||
@ -33,8 +34,9 @@ namespace MosesTuning
|
||||
|
||||
|
||||
BleuScorer::BleuScorer(const string& config)
|
||||
: StatisticsBasedScorer("BLEU", config),
|
||||
m_ref_length_type(CLOSEST) {
|
||||
: StatisticsBasedScorer("BLEU", config),
|
||||
m_ref_length_type(CLOSEST)
|
||||
{
|
||||
const string reflen = getConfig(KEY_REFLEN, REFLEN_CLOSEST);
|
||||
if (reflen == REFLEN_AVERAGE) {
|
||||
m_ref_length_type = AVERAGE;
|
||||
@ -101,7 +103,8 @@ void BleuScorer::setReferenceFiles(const vector<string>& referenceFiles)
|
||||
}
|
||||
}
|
||||
|
||||
bool BleuScorer::OpenReference(const char* filename, size_t file_id) {
|
||||
bool BleuScorer::OpenReference(const char* filename, size_t file_id)
|
||||
{
|
||||
ifstream ifs(filename);
|
||||
if (!ifs) {
|
||||
cerr << "Cannot open " << filename << endl;
|
||||
@ -110,7 +113,8 @@ bool BleuScorer::OpenReference(const char* filename, size_t file_id) {
|
||||
return OpenReferenceStream(&ifs, file_id);
|
||||
}
|
||||
|
||||
bool BleuScorer::OpenReferenceStream(istream* is, size_t file_id) {
|
||||
bool BleuScorer::OpenReferenceStream(istream* is, size_t file_id)
|
||||
{
|
||||
if (is == NULL) return false;
|
||||
|
||||
string line;
|
||||
@ -203,25 +207,27 @@ statscore_t BleuScorer::calculateScore(const vector<int>& comps) const
|
||||
return exp(logbleu);
|
||||
}
|
||||
|
||||
int BleuScorer::CalcReferenceLength(size_t sentence_id, size_t length) {
|
||||
int BleuScorer::CalcReferenceLength(size_t sentence_id, size_t length)
|
||||
{
|
||||
switch (m_ref_length_type) {
|
||||
case AVERAGE:
|
||||
return m_references[sentence_id]->CalcAverage();
|
||||
break;
|
||||
case CLOSEST:
|
||||
return m_references[sentence_id]->CalcClosest(length);
|
||||
break;
|
||||
case SHORTEST:
|
||||
return m_references[sentence_id]->CalcShortest();
|
||||
break;
|
||||
default:
|
||||
cerr << "unknown reference types." << endl;
|
||||
exit(1);
|
||||
case AVERAGE:
|
||||
return m_references[sentence_id]->CalcAverage();
|
||||
break;
|
||||
case CLOSEST:
|
||||
return m_references[sentence_id]->CalcClosest(length);
|
||||
break;
|
||||
case SHORTEST:
|
||||
return m_references[sentence_id]->CalcShortest();
|
||||
break;
|
||||
default:
|
||||
cerr << "unknown reference types." << endl;
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
void BleuScorer::DumpCounts(ostream* os,
|
||||
const NgramCounts& counts) const {
|
||||
const NgramCounts& counts) const
|
||||
{
|
||||
for (NgramCounts::const_iterator it = counts.begin();
|
||||
it != counts.end(); ++it) {
|
||||
*os << "(";
|
||||
@ -238,7 +244,8 @@ void BleuScorer::DumpCounts(ostream* os,
|
||||
}
|
||||
|
||||
float smoothedSentenceBleu
|
||||
(const std::vector<float>& stats, float smoothing, bool smoothBP) {
|
||||
(const std::vector<float>& stats, float smoothing, bool smoothBP)
|
||||
{
|
||||
|
||||
CHECK(stats.size() == kBleuNgramOrder * 2 + 1);
|
||||
|
||||
@ -247,8 +254,8 @@ float smoothedSentenceBleu
|
||||
logbleu += log(stats[2 * j] + smoothing) - log(stats[2 * j + 1] + smoothing);
|
||||
}
|
||||
logbleu /= kBleuNgramOrder;
|
||||
const float reflength = stats[(kBleuNgramOrder * 2)] +
|
||||
(smoothBP ? smoothing : 0.0f);
|
||||
const float reflength = stats[(kBleuNgramOrder * 2)] +
|
||||
(smoothBP ? smoothing : 0.0f);
|
||||
const float brevity = 1.0 - reflength / stats[1];
|
||||
|
||||
if (brevity < 0.0) {
|
||||
@ -263,7 +270,7 @@ float sentenceLevelBackgroundBleu(const std::vector<float>& sent, const std::vec
|
||||
std::vector<float> stats;
|
||||
CHECK(sent.size()==bg.size());
|
||||
CHECK(sent.size()==kBleuNgramOrder*2+1);
|
||||
for(size_t i=0;i<sent.size();i++)
|
||||
for(size_t i=0; i<sent.size(); i++)
|
||||
stats.push_back(sent[i]+bg[i]);
|
||||
|
||||
// Calculate BLEU
|
||||
@ -282,7 +289,8 @@ float sentenceLevelBackgroundBleu(const std::vector<float>& sent, const std::vec
|
||||
return exp(logbleu) * stats[kBleuNgramOrder*2];
|
||||
}
|
||||
|
||||
float unsmoothedBleu(const std::vector<float>& stats) {
|
||||
float unsmoothedBleu(const std::vector<float>& stats)
|
||||
{
|
||||
CHECK(stats.size() == kBleuNgramOrder * 2 + 1);
|
||||
|
||||
float logbleu = 0.0;
|
||||
@ -298,50 +306,51 @@ float unsmoothedBleu(const std::vector<float>& stats) {
|
||||
return exp(logbleu);
|
||||
}
|
||||
|
||||
vector<float> BleuScorer::ScoreNbestList(const string& scoreFile, const string& featureFile) {
|
||||
vector<string> scoreFiles;
|
||||
vector<string> featureFiles;
|
||||
scoreFiles.push_back(scoreFile);
|
||||
featureFiles.push_back(featureFile);
|
||||
vector<float> BleuScorer::ScoreNbestList(const string& scoreFile, const string& featureFile)
|
||||
{
|
||||
vector<string> scoreFiles;
|
||||
vector<string> featureFiles;
|
||||
scoreFiles.push_back(scoreFile);
|
||||
featureFiles.push_back(featureFile);
|
||||
|
||||
vector<FeatureDataIterator> featureDataIters;
|
||||
vector<ScoreDataIterator> scoreDataIters;
|
||||
for (size_t i = 0; i < featureFiles.size(); ++i) {
|
||||
featureDataIters.push_back(FeatureDataIterator(featureFiles[i]));
|
||||
scoreDataIters.push_back(ScoreDataIterator(scoreFiles[i]));
|
||||
}
|
||||
vector<FeatureDataIterator> featureDataIters;
|
||||
vector<ScoreDataIterator> scoreDataIters;
|
||||
for (size_t i = 0; i < featureFiles.size(); ++i) {
|
||||
featureDataIters.push_back(FeatureDataIterator(featureFiles[i]));
|
||||
scoreDataIters.push_back(ScoreDataIterator(scoreFiles[i]));
|
||||
}
|
||||
|
||||
vector<pair<size_t,size_t> > hypotheses;
|
||||
if (featureDataIters[0] == FeatureDataIterator::end()) {
|
||||
cerr << "Error: at the end of feature data iterator" << endl;
|
||||
exit(1);
|
||||
}
|
||||
for (size_t i = 0; i < featureFiles.size(); ++i) {
|
||||
if (featureDataIters[i] == FeatureDataIterator::end()) {
|
||||
cerr << "Error: Feature file " << i << " ended prematurely" << endl;
|
||||
exit(1);
|
||||
}
|
||||
if (scoreDataIters[i] == ScoreDataIterator::end()) {
|
||||
cerr << "Error: Score file " << i << " ended prematurely" << endl;
|
||||
exit(1);
|
||||
}
|
||||
if (featureDataIters[i]->size() != scoreDataIters[i]->size()) {
|
||||
cerr << "Error: features and scores have different size" << endl;
|
||||
exit(1);
|
||||
}
|
||||
for (size_t j = 0; j < featureDataIters[i]->size(); ++j) {
|
||||
hypotheses.push_back(pair<size_t,size_t>(i,j));
|
||||
}
|
||||
}
|
||||
vector<pair<size_t,size_t> > hypotheses;
|
||||
if (featureDataIters[0] == FeatureDataIterator::end()) {
|
||||
cerr << "Error: at the end of feature data iterator" << endl;
|
||||
exit(1);
|
||||
}
|
||||
for (size_t i = 0; i < featureFiles.size(); ++i) {
|
||||
if (featureDataIters[i] == FeatureDataIterator::end()) {
|
||||
cerr << "Error: Feature file " << i << " ended prematurely" << endl;
|
||||
exit(1);
|
||||
}
|
||||
if (scoreDataIters[i] == ScoreDataIterator::end()) {
|
||||
cerr << "Error: Score file " << i << " ended prematurely" << endl;
|
||||
exit(1);
|
||||
}
|
||||
if (featureDataIters[i]->size() != scoreDataIters[i]->size()) {
|
||||
cerr << "Error: features and scores have different size" << endl;
|
||||
exit(1);
|
||||
}
|
||||
for (size_t j = 0; j < featureDataIters[i]->size(); ++j) {
|
||||
hypotheses.push_back(pair<size_t,size_t>(i,j));
|
||||
}
|
||||
}
|
||||
|
||||
// score the nbest list
|
||||
vector<float> bleuScores;
|
||||
for (size_t i=0; i < hypotheses.size(); ++i) {
|
||||
pair<size_t,size_t> translation = hypotheses[i];
|
||||
float bleu = smoothedSentenceBleu(scoreDataIters[translation.first]->operator[](translation.second));
|
||||
bleuScores.push_back(bleu);
|
||||
}
|
||||
return bleuScores;
|
||||
// score the nbest list
|
||||
vector<float> bleuScores;
|
||||
for (size_t i=0; i < hypotheses.size(); ++i) {
|
||||
pair<size_t,size_t> translation = hypotheses[i];
|
||||
float bleu = smoothedSentenceBleu(scoreDataIters[translation.first]->operator[](translation.second));
|
||||
bleuScores.push_back(bleu);
|
||||
}
|
||||
return bleuScores;
|
||||
}
|
||||
|
||||
|
||||
|
@ -38,14 +38,22 @@ public:
|
||||
virtual void setReferenceFiles(const std::vector<std::string>& referenceFiles);
|
||||
virtual void prepareStats(std::size_t sid, const std::string& text, ScoreStats& entry);
|
||||
virtual statscore_t calculateScore(const std::vector<int>& comps) const;
|
||||
virtual std::size_t NumberOfScores() const { return 2 * kBleuNgramOrder + 1; }
|
||||
virtual std::size_t NumberOfScores() const {
|
||||
return 2 * kBleuNgramOrder + 1;
|
||||
}
|
||||
|
||||
int CalcReferenceLength(std::size_t sentence_id, std::size_t length);
|
||||
|
||||
ReferenceLengthType GetReferenceLengthType() const { return m_ref_length_type; }
|
||||
void SetReferenceLengthType(ReferenceLengthType type) { m_ref_length_type = type; }
|
||||
ReferenceLengthType GetReferenceLengthType() const {
|
||||
return m_ref_length_type;
|
||||
}
|
||||
void SetReferenceLengthType(ReferenceLengthType type) {
|
||||
m_ref_length_type = type;
|
||||
}
|
||||
|
||||
const std::vector<Reference*>& GetReferences() const { return m_references.get(); }
|
||||
const std::vector<Reference*>& GetReferences() const {
|
||||
return m_references.get();
|
||||
}
|
||||
|
||||
/**
|
||||
* Count the ngrams of each type, up to the given length in the input line.
|
||||
@ -74,7 +82,7 @@ private:
|
||||
* This function is used in PRO.
|
||||
*/
|
||||
float smoothedSentenceBleu
|
||||
(const std::vector<float>& stats, float smoothing=1.0, bool smoothBP=false);
|
||||
(const std::vector<float>& stats, float smoothing=1.0, bool smoothBP=false);
|
||||
|
||||
/** Computes sentence-level BLEU score given a background corpus.
|
||||
* This function is used in batch MIRA.
|
||||
|
@ -10,16 +10,19 @@
|
||||
|
||||
using namespace MosesTuning;
|
||||
|
||||
namespace {
|
||||
namespace
|
||||
{
|
||||
|
||||
NgramCounts* g_counts = NULL;
|
||||
|
||||
NgramCounts* GetNgramCounts() {
|
||||
NgramCounts* GetNgramCounts()
|
||||
{
|
||||
assert(g_counts);
|
||||
return g_counts;
|
||||
}
|
||||
|
||||
void SetNgramCounts(NgramCounts* counts) {
|
||||
void SetNgramCounts(NgramCounts* counts)
|
||||
{
|
||||
g_counts = counts;
|
||||
}
|
||||
|
||||
@ -58,33 +61,38 @@ struct Fourgram {
|
||||
NgramCounts::Key instance;
|
||||
};
|
||||
|
||||
bool CheckUnigram(const std::string& str) {
|
||||
bool CheckUnigram(const std::string& str)
|
||||
{
|
||||
Unigram unigram(str);
|
||||
NgramCounts::Value v;
|
||||
return GetNgramCounts()->Lookup(unigram.instance, &v);
|
||||
}
|
||||
|
||||
bool CheckBigram(const std::string& a, const std::string& b) {
|
||||
bool CheckBigram(const std::string& a, const std::string& b)
|
||||
{
|
||||
Bigram bigram(a, b);
|
||||
NgramCounts::Value v;
|
||||
return GetNgramCounts()->Lookup(bigram.instance, &v);
|
||||
}
|
||||
|
||||
bool CheckTrigram(const std::string& a, const std::string& b,
|
||||
const std::string& c) {
|
||||
const std::string& c)
|
||||
{
|
||||
Trigram trigram(a, b, c);
|
||||
NgramCounts::Value v;
|
||||
return GetNgramCounts()->Lookup(trigram.instance, &v);
|
||||
}
|
||||
|
||||
bool CheckFourgram(const std::string& a, const std::string& b,
|
||||
const std::string& c, const std::string& d) {
|
||||
const std::string& c, const std::string& d)
|
||||
{
|
||||
Fourgram fourgram(a, b, c, d);
|
||||
NgramCounts::Value v;
|
||||
return GetNgramCounts()->Lookup(fourgram.instance, &v);
|
||||
}
|
||||
|
||||
void SetUpReferences(BleuScorer& scorer) {
|
||||
void SetUpReferences(BleuScorer& scorer)
|
||||
{
|
||||
// The following examples are taken from Koehn, "Statistical Machine Translation",
|
||||
// Cambridge University Press, 2010.
|
||||
{
|
||||
@ -115,7 +123,8 @@ void SetUpReferences(BleuScorer& scorer) {
|
||||
|
||||
} // namespace
|
||||
|
||||
BOOST_AUTO_TEST_CASE(bleu_reference_type) {
|
||||
BOOST_AUTO_TEST_CASE(bleu_reference_type)
|
||||
{
|
||||
BleuScorer scorer;
|
||||
// BleuScorer will use "closest" by default.
|
||||
BOOST_CHECK_EQUAL(scorer.GetReferenceLengthType(), BleuScorer::CLOSEST);
|
||||
@ -127,7 +136,8 @@ BOOST_AUTO_TEST_CASE(bleu_reference_type) {
|
||||
BOOST_CHECK_EQUAL(scorer.GetReferenceLengthType(), BleuScorer::SHORTEST);
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE(bleu_reference_type_with_config) {
|
||||
BOOST_AUTO_TEST_CASE(bleu_reference_type_with_config)
|
||||
{
|
||||
{
|
||||
BleuScorer scorer("reflen:average");
|
||||
BOOST_CHECK_EQUAL(scorer.GetReferenceLengthType(), BleuScorer::AVERAGE);
|
||||
@ -139,7 +149,8 @@ BOOST_AUTO_TEST_CASE(bleu_reference_type_with_config) {
|
||||
}
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE(bleu_count_ngrams) {
|
||||
BOOST_AUTO_TEST_CASE(bleu_count_ngrams)
|
||||
{
|
||||
BleuScorer scorer;
|
||||
|
||||
std::string line = "I saw a girl with a telescope .";
|
||||
@ -198,7 +209,8 @@ BOOST_AUTO_TEST_CASE(bleu_count_ngrams) {
|
||||
BOOST_CHECK(CheckFourgram("with", "a", "telescope", "."));
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE(bleu_clipped_counts) {
|
||||
BOOST_AUTO_TEST_CASE(bleu_clipped_counts)
|
||||
{
|
||||
BleuScorer scorer;
|
||||
SetUpReferences(scorer);
|
||||
std::string line("israeli officials responsibility of airport safety");
|
||||
@ -220,7 +232,8 @@ BOOST_AUTO_TEST_CASE(bleu_clipped_counts) {
|
||||
BOOST_CHECK_EQUAL(entry.get(7), 3); // fourgram
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE(calculate_actual_score) {
|
||||
BOOST_AUTO_TEST_CASE(calculate_actual_score)
|
||||
{
|
||||
BOOST_REQUIRE(4 == kBleuNgramOrder);
|
||||
std::vector<int> stats(2 * kBleuNgramOrder + 1);
|
||||
BleuScorer scorer;
|
||||
@ -247,7 +260,8 @@ BOOST_AUTO_TEST_CASE(calculate_actual_score) {
|
||||
BOOST_CHECK_CLOSE(0.5115f, scorer.calculateScore(stats), 0.01);
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE(sentence_level_bleu) {
|
||||
BOOST_AUTO_TEST_CASE(sentence_level_bleu)
|
||||
{
|
||||
BOOST_REQUIRE(4 == kBleuNgramOrder);
|
||||
std::vector<float> stats(2 * kBleuNgramOrder + 1);
|
||||
|
||||
|
@ -6,9 +6,11 @@
|
||||
|
||||
using namespace std;
|
||||
|
||||
namespace {
|
||||
namespace
|
||||
{
|
||||
|
||||
inline int CalcDistance(int word1, int word2) {
|
||||
inline int CalcDistance(int word1, int word2)
|
||||
{
|
||||
return word1 == word2 ? 0 : 1;
|
||||
}
|
||||
|
||||
@ -16,11 +18,11 @@ inline int CalcDistance(int word1, int word2) {
|
||||
|
||||
namespace MosesTuning
|
||||
{
|
||||
|
||||
|
||||
|
||||
CderScorer::CderScorer(const string& config, bool allowed_long_jumps)
|
||||
: StatisticsBasedScorer(allowed_long_jumps ? "CDER" : "WER", config),
|
||||
m_allowed_long_jumps(allowed_long_jumps) {}
|
||||
: StatisticsBasedScorer(allowed_long_jumps ? "CDER" : "WER", config),
|
||||
m_allowed_long_jumps(allowed_long_jumps) {}
|
||||
|
||||
CderScorer::~CderScorer() {}
|
||||
|
||||
@ -82,7 +84,8 @@ float CderScorer::calculateScore(const vector<int>& comps) const
|
||||
}
|
||||
|
||||
void CderScorer::computeCD(const sent_t& cand, const sent_t& ref,
|
||||
vector<int>& stats) const {
|
||||
vector<int>& stats) const
|
||||
{
|
||||
int I = cand.size() + 1; // Number of inter-words positions in candidate sentence
|
||||
int L = ref.size() + 1; // Number of inter-words positions in reference sentence
|
||||
|
||||
@ -95,11 +98,9 @@ void CderScorer::computeCD(const sent_t& cand, const sent_t& ref,
|
||||
for (int i = 1; i < I; ++i) (*row)[i] = 1;
|
||||
|
||||
// Calculating costs for next row using costs from the previous row.
|
||||
while (++l < L)
|
||||
{
|
||||
while (++l < L) {
|
||||
vector<int>* nextRow = new vector<int>(I);
|
||||
for (int i = 0; i < I; ++i)
|
||||
{
|
||||
for (int i = 0; i < I; ++i) {
|
||||
vector<int> possibleCosts;
|
||||
if (i > 0) {
|
||||
possibleCosts.push_back((*nextRow)[i-1] + 1); // Deletion
|
||||
|
@ -8,13 +8,14 @@
|
||||
|
||||
namespace MosesTuning
|
||||
{
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* CderScorer class can compute both CDER and WER metric.
|
||||
*/
|
||||
class CderScorer: public StatisticsBasedScorer {
|
||||
public:
|
||||
class CderScorer: public StatisticsBasedScorer
|
||||
{
|
||||
public:
|
||||
explicit CderScorer(const std::string& config, bool allowed_long_jumps = true);
|
||||
~CderScorer();
|
||||
|
||||
@ -24,11 +25,13 @@ class CderScorer: public StatisticsBasedScorer {
|
||||
|
||||
virtual void prepareStatsVector(std::size_t sid, const std::string& text, std::vector<int>& stats);
|
||||
|
||||
virtual std::size_t NumberOfScores() const { return 2; }
|
||||
virtual std::size_t NumberOfScores() const {
|
||||
return 2;
|
||||
}
|
||||
|
||||
virtual float calculateScore(const std::vector<int>& comps) const;
|
||||
|
||||
private:
|
||||
private:
|
||||
bool m_allowed_long_jumps;
|
||||
|
||||
typedef std::vector<int> sent_t;
|
||||
|
@ -27,11 +27,11 @@ namespace MosesTuning
|
||||
{
|
||||
|
||||
Data::Data(Scorer* scorer, const string& sparse_weights_file)
|
||||
: m_scorer(scorer),
|
||||
m_score_type(m_scorer->getName()),
|
||||
m_num_scores(0),
|
||||
m_score_data(new ScoreData(m_scorer)),
|
||||
m_feature_data(new FeatureData)
|
||||
: m_scorer(scorer),
|
||||
m_score_type(m_scorer->getName()),
|
||||
m_num_scores(0),
|
||||
m_score_data(new ScoreData(m_scorer)),
|
||||
m_feature_data(new FeatureData)
|
||||
{
|
||||
TRACE_ERR("Data::m_score_type " << m_score_type << endl);
|
||||
TRACE_ERR("Data::Scorer type from Scorer: " << m_scorer->getName() << endl);
|
||||
@ -48,7 +48,8 @@ Data::Data(Scorer* scorer, const string& sparse_weights_file)
|
||||
//ADDED BY TS
|
||||
// TODO: This is too long; consider creating additional functions to
|
||||
// reduce the lines of this function.
|
||||
void Data::removeDuplicates() {
|
||||
void Data::removeDuplicates()
|
||||
{
|
||||
size_t nSentences = m_feature_data->size();
|
||||
assert(m_score_data->size() == nSentences);
|
||||
|
||||
@ -128,7 +129,8 @@ void Data::removeDuplicates() {
|
||||
}
|
||||
//END_ADDED
|
||||
|
||||
void Data::load(const std::string &featfile, const std::string &scorefile) {
|
||||
void Data::load(const std::string &featfile, const std::string &scorefile)
|
||||
{
|
||||
m_feature_data->load(featfile, m_sparse_weights);
|
||||
m_score_data->load(scorefile);
|
||||
}
|
||||
@ -192,7 +194,8 @@ void Data::loadNBest(const string &file)
|
||||
}
|
||||
}
|
||||
|
||||
void Data::save(const std::string &featfile, const std::string &scorefile, bool bin) {
|
||||
void Data::save(const std::string &featfile, const std::string &scorefile, bool bin)
|
||||
{
|
||||
if (bin)
|
||||
cerr << "Binary write mode is selected" << endl;
|
||||
else
|
||||
@ -202,7 +205,8 @@ void Data::save(const std::string &featfile, const std::string &scorefile, bool
|
||||
m_score_data->save(scorefile, bin);
|
||||
}
|
||||
|
||||
void Data::InitFeatureMap(const string& str) {
|
||||
void Data::InitFeatureMap(const string& str)
|
||||
{
|
||||
string buf = str;
|
||||
string substr;
|
||||
string features = "";
|
||||
@ -231,7 +235,8 @@ void Data::InitFeatureMap(const string& str) {
|
||||
}
|
||||
|
||||
void Data::AddFeatures(const string& str,
|
||||
int sentence_index) {
|
||||
int sentence_index)
|
||||
{
|
||||
string buf = str;
|
||||
string substr;
|
||||
FeatureStats feature_entry;
|
||||
|
20
mert/Data.h
20
mert/Data.h
@ -44,18 +44,28 @@ public:
|
||||
m_feature_data->clear();
|
||||
}
|
||||
|
||||
ScoreDataHandle getScoreData() { return m_score_data; }
|
||||
ScoreDataHandle getScoreData() {
|
||||
return m_score_data;
|
||||
}
|
||||
|
||||
FeatureDataHandle getFeatureData() { return m_feature_data; }
|
||||
FeatureDataHandle getFeatureData() {
|
||||
return m_feature_data;
|
||||
}
|
||||
|
||||
Scorer* getScorer() { return m_scorer; }
|
||||
Scorer* getScorer() {
|
||||
return m_scorer;
|
||||
}
|
||||
|
||||
std::size_t NumberOfFeatures() const {
|
||||
return m_feature_data->NumberOfFeatures();
|
||||
}
|
||||
|
||||
std::string Features() const { return m_feature_data->Features(); }
|
||||
void Features(const std::string &f) { m_feature_data->Features(f); }
|
||||
std::string Features() const {
|
||||
return m_feature_data->Features();
|
||||
}
|
||||
void Features(const std::string &f) {
|
||||
m_feature_data->Features(f);
|
||||
}
|
||||
|
||||
void loadNBest(const std::string &file);
|
||||
|
||||
|
@ -10,7 +10,8 @@
|
||||
using namespace MosesTuning;
|
||||
|
||||
//very basic test of sharding
|
||||
BOOST_AUTO_TEST_CASE(shard_basic) {
|
||||
BOOST_AUTO_TEST_CASE(shard_basic)
|
||||
{
|
||||
boost::scoped_ptr<Scorer> scorer(ScorerFactory::getScorer("BLEU", ""));
|
||||
Data data(scorer.get());
|
||||
FeatureArray fa1, fa2, fa3, fa4;
|
||||
@ -39,7 +40,8 @@ BOOST_AUTO_TEST_CASE(shard_basic) {
|
||||
BOOST_CHECK_EQUAL(shards[1].getFeatureData()->size(),(std::size_t)2);
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE(init_feature_map_test) {
|
||||
BOOST_AUTO_TEST_CASE(init_feature_map_test)
|
||||
{
|
||||
boost::scoped_ptr<Scorer> scorer(ScorerFactory::getScorer("BLEU", ""));
|
||||
Data data(scorer.get());
|
||||
|
||||
@ -49,7 +51,8 @@ BOOST_AUTO_TEST_CASE(init_feature_map_test) {
|
||||
BOOST_CHECK_EQUAL(expected, data.Features());
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE(add_features_test) {
|
||||
BOOST_AUTO_TEST_CASE(add_features_test)
|
||||
{
|
||||
boost::scoped_ptr<Scorer> scorer(ScorerFactory::getScorer("BLEU", ""));
|
||||
Data data(scorer.get());
|
||||
|
||||
|
@ -13,27 +13,27 @@
|
||||
|
||||
#define BUFFER_SIZE (32768)
|
||||
|
||||
namespace MosesTuning
|
||||
namespace MosesTuning
|
||||
{
|
||||
|
||||
class _fdstream
|
||||
{
|
||||
protected:
|
||||
_fdstream() :
|
||||
_file_descriptor(-1), _filebuf(NULL)
|
||||
_file_descriptor(-1), _filebuf(NULL)
|
||||
{ }
|
||||
|
||||
_fdstream(int file_descriptor, std::ios_base::openmode openmode) :
|
||||
_file_descriptor(file_descriptor), _openmode(openmode)
|
||||
{
|
||||
_file_descriptor(file_descriptor), _openmode(openmode) {
|
||||
_filebuf = NULL;
|
||||
open(file_descriptor, openmode);
|
||||
}
|
||||
|
||||
std::ios_base::openmode openmode() const { return _openmode; }
|
||||
std::ios_base::openmode openmode() const {
|
||||
return _openmode;
|
||||
}
|
||||
|
||||
void open(int file_descriptor, std::ios_base::openmode openmode)
|
||||
{
|
||||
void open(int file_descriptor, std::ios_base::openmode openmode) {
|
||||
if (!_filebuf)
|
||||
// We create a C++ stream from a file descriptor
|
||||
// stdio_filebuf is not synced with stdio.
|
||||
@ -41,11 +41,10 @@ protected:
|
||||
// You can also create the filebuf from a FILE* with
|
||||
// FILE* f = fdopen(file_descriptor, mode);
|
||||
_filebuf = new __gnu_cxx::stdio_filebuf<char> (file_descriptor,
|
||||
openmode);
|
||||
openmode);
|
||||
}
|
||||
|
||||
virtual ~_fdstream()
|
||||
{
|
||||
virtual ~_fdstream() {
|
||||
close(_file_descriptor);
|
||||
delete _filebuf;
|
||||
_filebuf = NULL;
|
||||
@ -60,59 +59,51 @@ class ifdstream : public _fdstream
|
||||
{
|
||||
public:
|
||||
ifdstream() :
|
||||
_fdstream(), _stream(NULL)
|
||||
_fdstream(), _stream(NULL)
|
||||
{ }
|
||||
|
||||
ifdstream(int file_descriptor) :
|
||||
_fdstream(file_descriptor, std::ios_base::in)
|
||||
{
|
||||
_fdstream(file_descriptor, std::ios_base::in) {
|
||||
_stream = new std::istream(_filebuf);
|
||||
}
|
||||
|
||||
void open(int file_descriptor)
|
||||
{
|
||||
if (!_stream)
|
||||
{
|
||||
_fdstream::open(file_descriptor, std::ios_base::in);
|
||||
_stream = new std::istream(_filebuf);
|
||||
}
|
||||
void open(int file_descriptor) {
|
||||
if (!_stream) {
|
||||
_fdstream::open(file_descriptor, std::ios_base::in);
|
||||
_stream = new std::istream(_filebuf);
|
||||
}
|
||||
}
|
||||
|
||||
ifdstream& operator>> (std::string& str)
|
||||
{
|
||||
ifdstream& operator>> (std::string& str) {
|
||||
(*_stream) >> str;
|
||||
|
||||
return *this;
|
||||
}
|
||||
|
||||
std::size_t getline(std::string& str)
|
||||
{
|
||||
std::size_t getline(std::string& str) {
|
||||
char tmp[BUFFER_SIZE];
|
||||
std::size_t ret = getline(tmp, BUFFER_SIZE);
|
||||
str = tmp;
|
||||
return ret;
|
||||
}
|
||||
|
||||
std::size_t getline(char* s, std::streamsize n)
|
||||
{
|
||||
std::size_t getline(char* s, std::streamsize n) {
|
||||
return (getline(s, n, '\n'));
|
||||
}
|
||||
|
||||
std::size_t getline(char* s, std::streamsize n, char delim)
|
||||
{
|
||||
std::size_t getline(char* s, std::streamsize n, char delim) {
|
||||
int i = 0;
|
||||
do{
|
||||
do {
|
||||
s[i] = _stream->get();
|
||||
i++;
|
||||
}while(i < n-1 && s[i-1] != delim && s[i-1] != '\0');
|
||||
} while(i < n-1 && s[i-1] != delim && s[i-1] != '\0');
|
||||
|
||||
s[i-1] = '\0'; // overwrite the delimiter given with string end
|
||||
|
||||
return i-1;
|
||||
}
|
||||
|
||||
~ifdstream()
|
||||
{
|
||||
~ifdstream() {
|
||||
//this->~_fdstream();
|
||||
delete _stream;
|
||||
}
|
||||
@ -125,27 +116,23 @@ class ofdstream : public _fdstream
|
||||
{
|
||||
public:
|
||||
ofdstream() :
|
||||
_fdstream(), _stream(NULL)
|
||||
_fdstream(), _stream(NULL)
|
||||
{ }
|
||||
|
||||
ofdstream(int file_descriptor) :
|
||||
_fdstream(file_descriptor, std::ios_base::out)
|
||||
{
|
||||
_fdstream(file_descriptor, std::ios_base::out) {
|
||||
_stream = new std::ostream(_filebuf);
|
||||
}
|
||||
|
||||
void open(int file_descriptor)
|
||||
{
|
||||
if (!_stream)
|
||||
{
|
||||
void open(int file_descriptor) {
|
||||
if (!_stream) {
|
||||
_fdstream::open(file_descriptor, std::ios_base::out);
|
||||
_stream = new std::ostream(_filebuf);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
ofdstream& operator<< (const std::string& str)
|
||||
{
|
||||
ofdstream& operator<< (const std::string& str) {
|
||||
if (_stream->good())
|
||||
(*_stream) << str;
|
||||
|
||||
@ -153,8 +140,7 @@ public:
|
||||
return *this;
|
||||
}
|
||||
|
||||
~ofdstream()
|
||||
{
|
||||
~ofdstream() {
|
||||
//this->~_fdstream();
|
||||
delete _stream;
|
||||
}
|
||||
|
@ -19,14 +19,14 @@ namespace MosesTuning
|
||||
|
||||
|
||||
FeatureArray::FeatureArray()
|
||||
: m_index(0), m_num_features(0){}
|
||||
: m_index(0), m_num_features(0) {}
|
||||
|
||||
FeatureArray::~FeatureArray() {}
|
||||
|
||||
void FeatureArray::savetxt(ostream* os)
|
||||
{
|
||||
*os << FEATURES_TXT_BEGIN << " " << m_index << " " << m_array.size()
|
||||
<< " " << m_num_features << " " << m_features << endl;
|
||||
<< " " << m_num_features << " " << m_features << endl;
|
||||
for (featarray_t::iterator i = m_array.begin(); i != m_array.end(); ++i) {
|
||||
i->savetxt(os);
|
||||
*os << endl;
|
||||
@ -37,7 +37,7 @@ void FeatureArray::savetxt(ostream* os)
|
||||
void FeatureArray::savebin(ostream* os)
|
||||
{
|
||||
*os << FEATURES_BIN_BEGIN << " " << m_index << " " << m_array.size()
|
||||
<< " " << m_num_features << " " << m_features << endl;
|
||||
<< " " << m_num_features << " " << m_features << endl;
|
||||
for (featarray_t::iterator i = m_array.begin(); i != m_array.end(); ++i)
|
||||
i->savebin(os);
|
||||
|
||||
|
@ -36,16 +36,28 @@ public:
|
||||
FeatureArray();
|
||||
~FeatureArray();
|
||||
|
||||
void clear() { m_array.clear(); }
|
||||
void clear() {
|
||||
m_array.clear();
|
||||
}
|
||||
|
||||
|
||||
int getIndex() const { return m_index; }
|
||||
void setIndex(const int value) { m_index = value; }
|
||||
int getIndex() const {
|
||||
return m_index;
|
||||
}
|
||||
void setIndex(const int value) {
|
||||
m_index = value;
|
||||
}
|
||||
|
||||
FeatureStats& get(std::size_t i) { return m_array.at(i); }
|
||||
const FeatureStats& get(std::size_t i) const { return m_array.at(i); }
|
||||
FeatureStats& get(std::size_t i) {
|
||||
return m_array.at(i);
|
||||
}
|
||||
const FeatureStats& get(std::size_t i) const {
|
||||
return m_array.at(i);
|
||||
}
|
||||
|
||||
void add(FeatureStats& e) { m_array.push_back(e); }
|
||||
void add(FeatureStats& e) {
|
||||
m_array.push_back(e);
|
||||
}
|
||||
|
||||
//ADDED BY TS
|
||||
void swap(std::size_t i, std::size_t j) {
|
||||
@ -59,13 +71,23 @@ public:
|
||||
|
||||
void merge(FeatureArray& e);
|
||||
|
||||
std::size_t size() const { return m_array.size(); }
|
||||
std::size_t size() const {
|
||||
return m_array.size();
|
||||
}
|
||||
|
||||
std::size_t NumberOfFeatures() const { return m_num_features; }
|
||||
void NumberOfFeatures(std::size_t v) { m_num_features = v; }
|
||||
std::size_t NumberOfFeatures() const {
|
||||
return m_num_features;
|
||||
}
|
||||
void NumberOfFeatures(std::size_t v) {
|
||||
m_num_features = v;
|
||||
}
|
||||
|
||||
std::string Features() const { return m_features; }
|
||||
void Features(const std::string& f) { m_features = f; }
|
||||
std::string Features() const {
|
||||
return m_features;
|
||||
}
|
||||
void Features(const std::string& f) {
|
||||
m_features = f;
|
||||
}
|
||||
|
||||
void savetxt(std::ostream* os);
|
||||
void savebin(std::ostream* os);
|
||||
|
@ -20,7 +20,7 @@ namespace MosesTuning
|
||||
|
||||
|
||||
FeatureData::FeatureData()
|
||||
: m_num_features(0) {}
|
||||
: m_num_features(0) {}
|
||||
|
||||
void FeatureData::save(ostream* os, bool bin)
|
||||
{
|
||||
@ -38,7 +38,8 @@ void FeatureData::save(const string &file, bool bin)
|
||||
ofs.close();
|
||||
}
|
||||
|
||||
void FeatureData::save(bool bin) {
|
||||
void FeatureData::save(bool bin)
|
||||
{
|
||||
save(&cout, bin);
|
||||
}
|
||||
|
||||
@ -145,7 +146,8 @@ void FeatureData::setFeatureMap(const string& feat)
|
||||
}
|
||||
}
|
||||
|
||||
string FeatureData::ToString() const {
|
||||
string FeatureData::ToString() const
|
||||
{
|
||||
string res;
|
||||
|
||||
{
|
||||
|
@ -33,7 +33,9 @@ public:
|
||||
FeatureData();
|
||||
~FeatureData() {}
|
||||
|
||||
void clear() { m_array.clear(); }
|
||||
void clear() {
|
||||
m_array.clear();
|
||||
}
|
||||
|
||||
FeatureArray& get(size_t idx) {
|
||||
return m_array.at(idx);
|
||||
@ -61,13 +63,23 @@ public:
|
||||
void add(FeatureArray& e);
|
||||
void add(FeatureStats& e, int sent_idx);
|
||||
|
||||
std::size_t size() const { return m_array.size(); }
|
||||
std::size_t size() const {
|
||||
return m_array.size();
|
||||
}
|
||||
|
||||
std::size_t NumberOfFeatures() const { return m_num_features; }
|
||||
void NumberOfFeatures(std::size_t v) { m_num_features = v; }
|
||||
std::size_t NumberOfFeatures() const {
|
||||
return m_num_features;
|
||||
}
|
||||
void NumberOfFeatures(std::size_t v) {
|
||||
m_num_features = v;
|
||||
}
|
||||
|
||||
std::string Features() const { return m_features; }
|
||||
void Features(const std::string& f) { m_features = f; }
|
||||
std::string Features() const {
|
||||
return m_features;
|
||||
}
|
||||
void Features(const std::string& f) {
|
||||
m_features = f;
|
||||
}
|
||||
|
||||
void save(const std::string &file, bool bin=false);
|
||||
void save(std::ostream* os, bool bin=false);
|
||||
|
@ -32,9 +32,10 @@ using namespace util;
|
||||
|
||||
namespace MosesTuning
|
||||
{
|
||||
|
||||
|
||||
int ParseInt(const StringPiece& str ) {
|
||||
|
||||
int ParseInt(const StringPiece& str )
|
||||
{
|
||||
char* errIndex;
|
||||
//could wrap?
|
||||
int value = static_cast<int>(strtol(str.data(), &errIndex,10));
|
||||
@ -44,7 +45,8 @@ int ParseInt(const StringPiece& str ) {
|
||||
return value;
|
||||
}
|
||||
|
||||
float ParseFloat(const StringPiece& str) {
|
||||
float ParseFloat(const StringPiece& str)
|
||||
{
|
||||
char* errIndex;
|
||||
float value = static_cast<float>(strtod(str.data(), &errIndex));
|
||||
if (errIndex == str.data()) {
|
||||
@ -53,11 +55,13 @@ float ParseFloat(const StringPiece& str) {
|
||||
return value;
|
||||
}
|
||||
|
||||
bool operator==(FeatureDataItem const& item1, FeatureDataItem const& item2) {
|
||||
bool operator==(FeatureDataItem const& item1, FeatureDataItem const& item2)
|
||||
{
|
||||
return item1.dense==item1.dense && item1.sparse==item1.sparse;
|
||||
}
|
||||
|
||||
size_t hash_value(FeatureDataItem const& item) {
|
||||
size_t hash_value(FeatureDataItem const& item)
|
||||
{
|
||||
size_t seed = 0;
|
||||
boost::hash_combine(seed,item.dense);
|
||||
boost::hash_combine(seed,item.sparse);
|
||||
@ -67,14 +71,16 @@ size_t hash_value(FeatureDataItem const& item) {
|
||||
|
||||
FeatureDataIterator::FeatureDataIterator() {}
|
||||
|
||||
FeatureDataIterator::FeatureDataIterator(const string& filename) {
|
||||
FeatureDataIterator::FeatureDataIterator(const string& filename)
|
||||
{
|
||||
m_in.reset(new FilePiece(filename.c_str()));
|
||||
readNext();
|
||||
}
|
||||
|
||||
FeatureDataIterator::~FeatureDataIterator() {}
|
||||
|
||||
void FeatureDataIterator::readNext() {
|
||||
void FeatureDataIterator::readNext()
|
||||
{
|
||||
m_next.clear();
|
||||
try {
|
||||
StringPiece marker = m_in->ReadDelimited();
|
||||
@ -101,7 +107,7 @@ void FeatureDataIterator::readNext() {
|
||||
//sparse feature
|
||||
StringPiece second = *value;
|
||||
float floatValue = ParseFloat(second);
|
||||
m_next.back().sparse.set(first.as_string(),floatValue);
|
||||
m_next.back().sparse.set(first.as_string(),floatValue);
|
||||
}
|
||||
}
|
||||
if (length != m_next.back().dense.size()) {
|
||||
@ -117,11 +123,13 @@ void FeatureDataIterator::readNext() {
|
||||
}
|
||||
}
|
||||
|
||||
void FeatureDataIterator::increment() {
|
||||
void FeatureDataIterator::increment()
|
||||
{
|
||||
readNext();
|
||||
}
|
||||
|
||||
bool FeatureDataIterator::equal(const FeatureDataIterator& rhs) const {
|
||||
bool FeatureDataIterator::equal(const FeatureDataIterator& rhs) const
|
||||
{
|
||||
if (!m_in && !rhs.m_in) {
|
||||
return true;
|
||||
} else if (!m_in) {
|
||||
@ -129,12 +137,13 @@ bool FeatureDataIterator::equal(const FeatureDataIterator& rhs) const {
|
||||
} else if (!rhs.m_in) {
|
||||
return false;
|
||||
} else {
|
||||
return m_in->FileName() == rhs.m_in->FileName() &&
|
||||
m_in->Offset() == rhs.m_in->Offset();
|
||||
return m_in->FileName() == rhs.m_in->FileName() &&
|
||||
m_in->Offset() == rhs.m_in->Offset();
|
||||
}
|
||||
}
|
||||
|
||||
const vector<FeatureDataItem>& FeatureDataIterator::dereference() const {
|
||||
const vector<FeatureDataItem>& FeatureDataIterator::dereference() const
|
||||
{
|
||||
return m_next;
|
||||
}
|
||||
|
||||
|
@ -37,18 +37,21 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
|
||||
#include "FeatureStats.h"
|
||||
|
||||
namespace util { class FilePiece; }
|
||||
namespace util
|
||||
{
|
||||
class FilePiece;
|
||||
}
|
||||
|
||||
namespace MosesTuning
|
||||
{
|
||||
|
||||
|
||||
class FileFormatException : public util::Exception
|
||||
|
||||
class FileFormatException : public util::Exception
|
||||
{
|
||||
public:
|
||||
explicit FileFormatException(const std::string& filename, const std::string& line) {
|
||||
*this << "Error in line \"" << line << "\" of " << filename;
|
||||
}
|
||||
public:
|
||||
explicit FileFormatException(const std::string& filename, const std::string& line) {
|
||||
*this << "Error in line \"" << line << "\" of " << filename;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@ -56,45 +59,45 @@ class FileFormatException : public util::Exception
|
||||
int ParseInt(const StringPiece& str );
|
||||
|
||||
/** Assumes a delimiter, so only apply to tokens */
|
||||
float ParseFloat(const StringPiece& str);
|
||||
float ParseFloat(const StringPiece& str);
|
||||
|
||||
|
||||
class FeatureDataItem
|
||||
class FeatureDataItem
|
||||
{
|
||||
public:
|
||||
std::vector<float> dense;
|
||||
SparseVector sparse;
|
||||
public:
|
||||
std::vector<float> dense;
|
||||
SparseVector sparse;
|
||||
};
|
||||
|
||||
bool operator==(FeatureDataItem const& item1, FeatureDataItem const& item2);
|
||||
std::size_t hash_value(FeatureDataItem const& item);
|
||||
|
||||
class FeatureDataIterator :
|
||||
class FeatureDataIterator :
|
||||
public boost::iterator_facade<FeatureDataIterator,
|
||||
const std::vector<FeatureDataItem>,
|
||||
boost::forward_traversal_tag>
|
||||
const std::vector<FeatureDataItem>,
|
||||
boost::forward_traversal_tag>
|
||||
{
|
||||
public:
|
||||
FeatureDataIterator();
|
||||
explicit FeatureDataIterator(const std::string& filename);
|
||||
~FeatureDataIterator();
|
||||
public:
|
||||
FeatureDataIterator();
|
||||
explicit FeatureDataIterator(const std::string& filename);
|
||||
~FeatureDataIterator();
|
||||
|
||||
static FeatureDataIterator end() {
|
||||
return FeatureDataIterator();
|
||||
}
|
||||
static FeatureDataIterator end() {
|
||||
return FeatureDataIterator();
|
||||
}
|
||||
|
||||
|
||||
private:
|
||||
friend class boost::iterator_core_access;
|
||||
private:
|
||||
friend class boost::iterator_core_access;
|
||||
|
||||
void increment();
|
||||
bool equal(const FeatureDataIterator& rhs) const;
|
||||
const std::vector<FeatureDataItem>& dereference() const;
|
||||
void increment();
|
||||
bool equal(const FeatureDataIterator& rhs) const;
|
||||
const std::vector<FeatureDataItem>& dereference() const;
|
||||
|
||||
void readNext();
|
||||
void readNext();
|
||||
|
||||
boost::shared_ptr<util::FilePiece> m_in;
|
||||
std::vector<FeatureDataItem> m_next;
|
||||
boost::shared_ptr<util::FilePiece> m_in;
|
||||
std::vector<FeatureDataItem> m_next;
|
||||
};
|
||||
|
||||
}
|
||||
|
@ -7,10 +7,12 @@
|
||||
|
||||
using namespace MosesTuning;
|
||||
|
||||
namespace {
|
||||
namespace
|
||||
{
|
||||
|
||||
void CheckFeatureMap(const FeatureData* feature_data,
|
||||
const char* str, int num_feature, int* cnt) {
|
||||
const char* str, int num_feature, int* cnt)
|
||||
{
|
||||
for (int i = 0; i < num_feature; ++i) {
|
||||
std::stringstream ss;
|
||||
ss << str << "_" << i;
|
||||
@ -23,7 +25,8 @@ void CheckFeatureMap(const FeatureData* feature_data,
|
||||
|
||||
} // namespace
|
||||
|
||||
BOOST_AUTO_TEST_CASE(set_feature_map) {
|
||||
BOOST_AUTO_TEST_CASE(set_feature_map)
|
||||
{
|
||||
std::string str("d_0 d_1 d_2 d_3 d_4 d_5 d_6 lm_0 lm_1 tm_0 tm_1 tm_2 tm_3 tm_4 w_0 ");
|
||||
FeatureData feature_data;
|
||||
|
||||
|
@ -18,31 +18,35 @@
|
||||
|
||||
using namespace std;
|
||||
|
||||
namespace {
|
||||
namespace
|
||||
{
|
||||
const int kAvailableSize = 8;
|
||||
} // namespace
|
||||
|
||||
namespace MosesTuning
|
||||
{
|
||||
|
||||
|
||||
|
||||
SparseVector::name2id_t SparseVector::m_name_to_id;
|
||||
SparseVector::id2name_t SparseVector::m_id_to_name;
|
||||
|
||||
FeatureStatsType SparseVector::get(const string& name) const {
|
||||
FeatureStatsType SparseVector::get(const string& name) const
|
||||
{
|
||||
name2id_t::const_iterator name2id_iter = m_name_to_id.find(name);
|
||||
if (name2id_iter == m_name_to_id.end()) return 0;
|
||||
size_t id = name2id_iter->second;
|
||||
return get(id);
|
||||
}
|
||||
|
||||
FeatureStatsType SparseVector::get(size_t id) const {
|
||||
FeatureStatsType SparseVector::get(size_t id) const
|
||||
{
|
||||
fvector_t::const_iterator fvector_iter = m_fvector.find(id);
|
||||
if (fvector_iter == m_fvector.end()) return 0;
|
||||
return fvector_iter->second;
|
||||
}
|
||||
|
||||
void SparseVector::set(const string& name, FeatureStatsType value) {
|
||||
void SparseVector::set(const string& name, FeatureStatsType value)
|
||||
{
|
||||
name2id_t::const_iterator name2id_iter = m_name_to_id.find(name);
|
||||
size_t id = 0;
|
||||
if (name2id_iter == m_name_to_id.end()) {
|
||||
@ -55,7 +59,8 @@ void SparseVector::set(const string& name, FeatureStatsType value) {
|
||||
m_fvector[id] = value;
|
||||
}
|
||||
|
||||
void SparseVector::write(ostream& out, const string& sep) const {
|
||||
void SparseVector::write(ostream& out, const string& sep) const
|
||||
{
|
||||
for (fvector_t::const_iterator i = m_fvector.begin(); i != m_fvector.end(); ++i) {
|
||||
if (abs(i->second) < 0.00001) continue;
|
||||
string name = m_id_to_name[i->first];
|
||||
@ -63,11 +68,13 @@ void SparseVector::write(ostream& out, const string& sep) const {
|
||||
}
|
||||
}
|
||||
|
||||
void SparseVector::clear() {
|
||||
void SparseVector::clear()
|
||||
{
|
||||
m_fvector.clear();
|
||||
}
|
||||
|
||||
void SparseVector::load(const string& file) {
|
||||
void SparseVector::load(const string& file)
|
||||
{
|
||||
ifstream in(file.c_str());
|
||||
if (!in) {
|
||||
throw runtime_error("Failed to open sparse weights file: " + file);
|
||||
@ -84,39 +91,44 @@ void SparseVector::load(const string& file) {
|
||||
}
|
||||
}
|
||||
|
||||
SparseVector& SparseVector::operator-=(const SparseVector& rhs) {
|
||||
SparseVector& SparseVector::operator-=(const SparseVector& rhs)
|
||||
{
|
||||
|
||||
for (fvector_t::const_iterator i = rhs.m_fvector.begin();
|
||||
i != rhs.m_fvector.end(); ++i) {
|
||||
i != rhs.m_fvector.end(); ++i) {
|
||||
m_fvector[i->first] = get(i->first) - (i->second);
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
FeatureStatsType SparseVector::inner_product(const SparseVector& rhs) const {
|
||||
FeatureStatsType SparseVector::inner_product(const SparseVector& rhs) const
|
||||
{
|
||||
FeatureStatsType product = 0.0;
|
||||
for (fvector_t::const_iterator i = m_fvector.begin();
|
||||
i != m_fvector.end(); ++i) {
|
||||
i != m_fvector.end(); ++i) {
|
||||
product += ((i->second) * (rhs.get(i->first)));
|
||||
}
|
||||
return product;
|
||||
}
|
||||
|
||||
SparseVector operator-(const SparseVector& lhs, const SparseVector& rhs) {
|
||||
SparseVector operator-(const SparseVector& lhs, const SparseVector& rhs)
|
||||
{
|
||||
SparseVector res(lhs);
|
||||
res -= rhs;
|
||||
return res;
|
||||
}
|
||||
|
||||
FeatureStatsType inner_product(const SparseVector& lhs, const SparseVector& rhs) {
|
||||
if (lhs.size() >= rhs.size()) {
|
||||
return rhs.inner_product(lhs);
|
||||
} else {
|
||||
return lhs.inner_product(rhs);
|
||||
}
|
||||
FeatureStatsType inner_product(const SparseVector& lhs, const SparseVector& rhs)
|
||||
{
|
||||
if (lhs.size() >= rhs.size()) {
|
||||
return rhs.inner_product(lhs);
|
||||
} else {
|
||||
return lhs.inner_product(rhs);
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<std::size_t> SparseVector::feats() const {
|
||||
std::vector<std::size_t> SparseVector::feats() const
|
||||
{
|
||||
std::vector<std::size_t> toRet;
|
||||
for(fvector_t::const_iterator iter = m_fvector.begin();
|
||||
iter!=m_fvector.end();
|
||||
@ -126,7 +138,8 @@ std::vector<std::size_t> SparseVector::feats() const {
|
||||
return toRet;
|
||||
}
|
||||
|
||||
std::size_t SparseVector::encode(const std::string& name) {
|
||||
std::size_t SparseVector::encode(const std::string& name)
|
||||
{
|
||||
name2id_t::const_iterator name2id_iter = m_name_to_id.find(name);
|
||||
size_t id = 0;
|
||||
if (name2id_iter == m_name_to_id.end()) {
|
||||
@ -139,26 +152,29 @@ std::size_t SparseVector::encode(const std::string& name) {
|
||||
return id;
|
||||
}
|
||||
|
||||
std::string SparseVector::decode(std::size_t id) {
|
||||
std::string SparseVector::decode(std::size_t id)
|
||||
{
|
||||
return m_id_to_name[id];
|
||||
}
|
||||
|
||||
bool operator==(SparseVector const& item1, SparseVector const& item2) {
|
||||
bool operator==(SparseVector const& item1, SparseVector const& item2)
|
||||
{
|
||||
return item1.m_fvector==item2.m_fvector;
|
||||
}
|
||||
|
||||
std::size_t hash_value(SparseVector const& item) {
|
||||
std::size_t hash_value(SparseVector const& item)
|
||||
{
|
||||
boost::hash<SparseVector::fvector_t> hasher;
|
||||
return hasher(item.m_fvector);
|
||||
}
|
||||
|
||||
FeatureStats::FeatureStats()
|
||||
: m_available_size(kAvailableSize), m_entries(0),
|
||||
m_array(new FeatureStatsType[m_available_size]) {}
|
||||
: m_available_size(kAvailableSize), m_entries(0),
|
||||
m_array(new FeatureStatsType[m_available_size]) {}
|
||||
|
||||
FeatureStats::FeatureStats(const size_t size)
|
||||
: m_available_size(size), m_entries(size),
|
||||
m_array(new FeatureStatsType[m_available_size])
|
||||
: m_available_size(size), m_entries(size),
|
||||
m_array(new FeatureStatsType[m_available_size])
|
||||
{
|
||||
memset(m_array, 0, GetArraySizeWithBytes());
|
||||
}
|
||||
@ -276,7 +292,8 @@ void FeatureStats::savetxt(ostream* os)
|
||||
*os << *this;
|
||||
}
|
||||
|
||||
void FeatureStats::savetxt() {
|
||||
void FeatureStats::savetxt()
|
||||
{
|
||||
savetxt(&cout);
|
||||
}
|
||||
|
||||
@ -298,7 +315,8 @@ ostream& operator<<(ostream& o, const FeatureStats& e)
|
||||
return o;
|
||||
}
|
||||
|
||||
bool operator==(const FeatureStats& f1, const FeatureStats& f2) {
|
||||
bool operator==(const FeatureStats& f1, const FeatureStats& f2)
|
||||
{
|
||||
size_t size = f1.size();
|
||||
|
||||
if (size != f2.size())
|
||||
|
@ -18,10 +18,11 @@
|
||||
|
||||
namespace MosesTuning
|
||||
{
|
||||
|
||||
|
||||
|
||||
// Minimal sparse vector
|
||||
class SparseVector {
|
||||
class SparseVector
|
||||
{
|
||||
public:
|
||||
typedef std::map<std::size_t,FeatureStatsType> fvector_t;
|
||||
typedef std::map<std::string, std::size_t> name2id_t;
|
||||
@ -32,8 +33,10 @@ public:
|
||||
void set(const std::string& name, FeatureStatsType value);
|
||||
void clear();
|
||||
void load(const std::string& file);
|
||||
std::size_t size() const { return m_fvector.size(); }
|
||||
|
||||
std::size_t size() const {
|
||||
return m_fvector.size();
|
||||
}
|
||||
|
||||
void write(std::ostream& out, const std::string& sep = " ") const;
|
||||
|
||||
SparseVector& operator-=(const SparseVector& rhs);
|
||||
@ -78,7 +81,9 @@ public:
|
||||
|
||||
void Copy(const FeatureStats &stats);
|
||||
|
||||
bool isfull() const { return (m_entries < m_available_size) ? 0 : 1; }
|
||||
bool isfull() const {
|
||||
return (m_entries < m_available_size) ? 0 : 1;
|
||||
}
|
||||
void expand();
|
||||
void add(FeatureStatsType v);
|
||||
void addSparse(const std::string& name, FeatureStatsType v);
|
||||
@ -93,23 +98,37 @@ public:
|
||||
clear();
|
||||
}
|
||||
|
||||
FeatureStatsType get(std::size_t i) { return m_array[i]; }
|
||||
FeatureStatsType get(std::size_t i)const { return m_array[i]; }
|
||||
featstats_t getArray() const { return m_array; }
|
||||
FeatureStatsType get(std::size_t i) {
|
||||
return m_array[i];
|
||||
}
|
||||
FeatureStatsType get(std::size_t i)const {
|
||||
return m_array[i];
|
||||
}
|
||||
featstats_t getArray() const {
|
||||
return m_array;
|
||||
}
|
||||
|
||||
const SparseVector& getSparse() const { return m_map; }
|
||||
const SparseVector& getSparse() const {
|
||||
return m_map;
|
||||
}
|
||||
|
||||
void set(std::string &theString, const SparseVector& sparseWeights);
|
||||
|
||||
inline std::size_t bytes() const { return GetArraySizeWithBytes(); }
|
||||
inline std::size_t bytes() const {
|
||||
return GetArraySizeWithBytes();
|
||||
}
|
||||
|
||||
std::size_t GetArraySizeWithBytes() const {
|
||||
return m_entries * sizeof(FeatureStatsType);
|
||||
}
|
||||
|
||||
std::size_t size() const { return m_entries; }
|
||||
std::size_t size() const {
|
||||
return m_entries;
|
||||
}
|
||||
|
||||
std::size_t available() const { return m_available_size; }
|
||||
std::size_t available() const {
|
||||
return m_available_size;
|
||||
}
|
||||
|
||||
void savetxt(const std::string &file);
|
||||
void savetxt(std::ostream* os);
|
||||
|
@ -5,15 +5,17 @@
|
||||
|
||||
using namespace std;
|
||||
|
||||
namespace {
|
||||
bool IsGzipFile(const std::string &filename) {
|
||||
namespace
|
||||
{
|
||||
bool IsGzipFile(const std::string &filename)
|
||||
{
|
||||
return filename.size() > 3 &&
|
||||
filename.substr(filename.size() - 3, 3) == ".gz";
|
||||
filename.substr(filename.size() - 3, 3) == ".gz";
|
||||
}
|
||||
} // namespace
|
||||
|
||||
inputfilestream::inputfilestream(const std::string &filePath)
|
||||
: std::istream(0), m_streambuf(0), m_is_good(false)
|
||||
: std::istream(0), m_streambuf(0), m_is_good(false)
|
||||
{
|
||||
// check if file is readable
|
||||
std::filebuf* fb = new std::filebuf();
|
||||
@ -40,7 +42,7 @@ void inputfilestream::close()
|
||||
}
|
||||
|
||||
outputfilestream::outputfilestream(const std::string &filePath)
|
||||
: std::ostream(0), m_streambuf(0), m_is_good(false)
|
||||
: std::ostream(0), m_streambuf(0), m_is_good(false)
|
||||
{
|
||||
// check if file is readable
|
||||
std::filebuf* fb = new std::filebuf();
|
||||
|
@ -16,7 +16,9 @@ public:
|
||||
explicit inputfilestream(const std::string &filePath);
|
||||
virtual ~inputfilestream();
|
||||
|
||||
bool good() const { return m_is_good; }
|
||||
bool good() const {
|
||||
return m_is_good;
|
||||
}
|
||||
void close();
|
||||
};
|
||||
|
||||
@ -30,7 +32,9 @@ public:
|
||||
explicit outputfilestream(const std::string &filePath);
|
||||
virtual ~outputfilestream();
|
||||
|
||||
bool good() const { return m_is_good; }
|
||||
bool good() const {
|
||||
return m_is_good;
|
||||
}
|
||||
void close();
|
||||
};
|
||||
|
||||
|
@ -5,7 +5,8 @@
|
||||
#include <cstdio>
|
||||
#include <iostream>
|
||||
|
||||
GzFileBuf::GzFileBuf(const char* filename) {
|
||||
GzFileBuf::GzFileBuf(const char* filename)
|
||||
{
|
||||
m_gz_file = gzopen(filename, "rb");
|
||||
if (m_gz_file == NULL) {
|
||||
std::cerr << "ERROR: Failed to open " << filename << std::endl;
|
||||
@ -16,16 +17,19 @@ GzFileBuf::GzFileBuf(const char* filename) {
|
||||
m_buf + sizeof(int)); // end position
|
||||
}
|
||||
|
||||
GzFileBuf::~GzFileBuf() {
|
||||
GzFileBuf::~GzFileBuf()
|
||||
{
|
||||
gzclose(m_gz_file);
|
||||
}
|
||||
|
||||
int GzFileBuf::overflow(int_type c) {
|
||||
int GzFileBuf::overflow(int_type c)
|
||||
{
|
||||
throw;
|
||||
}
|
||||
|
||||
// read one character
|
||||
int GzFileBuf::underflow() {
|
||||
int GzFileBuf::underflow()
|
||||
{
|
||||
// is read position before end of m_buf?
|
||||
if (gptr() < egptr()) {
|
||||
return traits_type::to_int_type(*gptr());
|
||||
@ -64,17 +68,20 @@ int GzFileBuf::underflow() {
|
||||
}
|
||||
|
||||
std::streampos GzFileBuf::seekpos(
|
||||
std::streampos sp,
|
||||
std::ios_base::openmode which) {
|
||||
std::streampos sp,
|
||||
std::ios_base::openmode which)
|
||||
{
|
||||
throw;
|
||||
}
|
||||
|
||||
std::streamsize GzFileBuf::xsgetn(char* s,
|
||||
std::streamsize num) {
|
||||
std::streamsize num)
|
||||
{
|
||||
return static_cast<std::streamsize>(gzread(m_gz_file,s,num));
|
||||
}
|
||||
|
||||
std::streamsize GzFileBuf::xsputn(const char* s,
|
||||
std::streamsize num) {
|
||||
std::streamsize num)
|
||||
{
|
||||
throw;
|
||||
}
|
||||
|
@ -17,8 +17,8 @@ protected:
|
||||
virtual int_type underflow();
|
||||
|
||||
virtual std::streampos seekpos(
|
||||
std::streampos sp,
|
||||
std::ios_base::openmode which = std::ios_base::in | std::ios_base::out);
|
||||
std::streampos sp,
|
||||
std::ios_base::openmode which = std::ios_base::in | std::ios_base::out);
|
||||
|
||||
virtual std::streamsize xsgetn(char* s, std::streamsize num);
|
||||
|
||||
|
@ -8,13 +8,13 @@ using namespace std;
|
||||
|
||||
namespace MosesTuning
|
||||
{
|
||||
|
||||
|
||||
|
||||
StreamingHypPackEnumerator::StreamingHypPackEnumerator
|
||||
(
|
||||
vector<std::string> const& featureFiles,
|
||||
vector<std::string> const& scoreFiles
|
||||
)
|
||||
vector<std::string> const& featureFiles,
|
||||
vector<std::string> const& scoreFiles
|
||||
)
|
||||
: m_featureFiles(featureFiles),
|
||||
m_scoreFiles(scoreFiles)
|
||||
{
|
||||
@ -22,19 +22,20 @@ StreamingHypPackEnumerator::StreamingHypPackEnumerator
|
||||
cerr << "No data to process" << endl;
|
||||
exit(0);
|
||||
}
|
||||
|
||||
|
||||
if (featureFiles.size() != scoreFiles.size()) {
|
||||
cerr << "Error: Number of feature files (" << featureFiles.size() <<
|
||||
") does not match number of score files (" << scoreFiles.size() << ")" << endl;
|
||||
") does not match number of score files (" << scoreFiles.size() << ")" << endl;
|
||||
exit(1);
|
||||
}
|
||||
|
||||
|
||||
m_num_lists = scoreFiles.size();
|
||||
m_primed = false;
|
||||
m_iNumDense = -1;
|
||||
}
|
||||
|
||||
size_t StreamingHypPackEnumerator::num_dense() const {
|
||||
size_t StreamingHypPackEnumerator::num_dense() const
|
||||
{
|
||||
if(m_iNumDense<0) {
|
||||
cerr << "Error: Requested num_dense() for an unprimed StreamingHypPackEnumerator" << endl;
|
||||
exit(1);
|
||||
@ -42,12 +43,13 @@ size_t StreamingHypPackEnumerator::num_dense() const {
|
||||
return (size_t) m_iNumDense;
|
||||
}
|
||||
|
||||
void StreamingHypPackEnumerator::prime(){
|
||||
void StreamingHypPackEnumerator::prime()
|
||||
{
|
||||
m_current_indexes.clear();
|
||||
m_current_featureVectors.clear();
|
||||
boost::unordered_set<FeatureDataItem> seen;
|
||||
m_primed = true;
|
||||
|
||||
|
||||
for (size_t i = 0; i < m_num_lists; ++i) {
|
||||
if (m_featureDataIters[i] == FeatureDataIterator::end()) {
|
||||
cerr << "Error: Feature file " << i << " ended prematurely" << endl;
|
||||
@ -78,13 +80,14 @@ void StreamingHypPackEnumerator::prime(){
|
||||
}
|
||||
// Store item for retrieval
|
||||
m_current_indexes.push_back(pair<size_t,size_t>(i,j));
|
||||
m_current_featureVectors.push_back(MiraFeatureVector(item));
|
||||
m_current_featureVectors.push_back(MiraFeatureVector(item));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void StreamingHypPackEnumerator::reset(){
|
||||
void StreamingHypPackEnumerator::reset()
|
||||
{
|
||||
m_featureDataIters.clear();
|
||||
m_scoreDataIters.clear();
|
||||
for (size_t i = 0; i < m_num_lists; ++i) {
|
||||
@ -95,11 +98,13 @@ void StreamingHypPackEnumerator::reset(){
|
||||
prime();
|
||||
}
|
||||
|
||||
bool StreamingHypPackEnumerator::finished(){
|
||||
bool StreamingHypPackEnumerator::finished()
|
||||
{
|
||||
return m_featureDataIters[0]==FeatureDataIterator::end();
|
||||
}
|
||||
|
||||
void StreamingHypPackEnumerator::next(){
|
||||
void StreamingHypPackEnumerator::next()
|
||||
{
|
||||
if(!m_primed) {
|
||||
cerr << "Enumerating an unprimed HypPackEnumerator" << endl;
|
||||
exit(1);
|
||||
@ -113,7 +118,8 @@ void StreamingHypPackEnumerator::next(){
|
||||
if(!finished()) prime();
|
||||
}
|
||||
|
||||
size_t StreamingHypPackEnumerator::cur_size(){
|
||||
size_t StreamingHypPackEnumerator::cur_size()
|
||||
{
|
||||
if(!m_primed) {
|
||||
cerr << "Querying size from an unprimed HypPackEnumerator" << endl;
|
||||
exit(1);
|
||||
@ -121,7 +127,8 @@ size_t StreamingHypPackEnumerator::cur_size(){
|
||||
return m_current_indexes.size();
|
||||
}
|
||||
|
||||
const MiraFeatureVector& StreamingHypPackEnumerator::featuresAt(size_t index){
|
||||
const MiraFeatureVector& StreamingHypPackEnumerator::featuresAt(size_t index)
|
||||
{
|
||||
if(!m_primed) {
|
||||
cerr << "Querying features from an unprimed HypPackEnumerator" << endl;
|
||||
exit(1);
|
||||
@ -129,7 +136,8 @@ const MiraFeatureVector& StreamingHypPackEnumerator::featuresAt(size_t index){
|
||||
return m_current_featureVectors[index];
|
||||
}
|
||||
|
||||
const ScoreDataItem& StreamingHypPackEnumerator::scoresAt(size_t index) {
|
||||
const ScoreDataItem& StreamingHypPackEnumerator::scoresAt(size_t index)
|
||||
{
|
||||
if(!m_primed) {
|
||||
cerr << "Querying scores from an unprimed HypPackEnumerator" << endl;
|
||||
exit(1);
|
||||
@ -138,22 +146,23 @@ const ScoreDataItem& StreamingHypPackEnumerator::scoresAt(size_t index) {
|
||||
return m_scoreDataIters[pij.first]->operator[](pij.second);
|
||||
}
|
||||
|
||||
size_t StreamingHypPackEnumerator::cur_id() {
|
||||
size_t StreamingHypPackEnumerator::cur_id()
|
||||
{
|
||||
return m_sentenceId;
|
||||
}
|
||||
|
||||
/* --------- RandomAccessHypPackEnumerator ------------- */
|
||||
|
||||
RandomAccessHypPackEnumerator::RandomAccessHypPackEnumerator(vector<string> const& featureFiles,
|
||||
vector<string> const& scoreFiles,
|
||||
bool no_shuffle)
|
||||
vector<string> const& scoreFiles,
|
||||
bool no_shuffle)
|
||||
{
|
||||
StreamingHypPackEnumerator train(featureFiles,scoreFiles);
|
||||
size_t index=0;
|
||||
for(train.reset(); !train.finished(); train.next()) {
|
||||
m_features.push_back(vector<MiraFeatureVector>());
|
||||
m_scores.push_back(vector<ScoreDataItem>());
|
||||
for(size_t j=0;j<train.cur_size();j++) {
|
||||
for(size_t j=0; j<train.cur_size(); j++) {
|
||||
m_features.back().push_back(train.featuresAt(j));
|
||||
m_scores.back().push_back(train.scoresAt(j));
|
||||
}
|
||||
@ -165,35 +174,43 @@ RandomAccessHypPackEnumerator::RandomAccessHypPackEnumerator(vector<string> cons
|
||||
m_num_dense = train.num_dense();
|
||||
}
|
||||
|
||||
size_t RandomAccessHypPackEnumerator::num_dense() const {
|
||||
size_t RandomAccessHypPackEnumerator::num_dense() const
|
||||
{
|
||||
return m_num_dense;
|
||||
}
|
||||
|
||||
void RandomAccessHypPackEnumerator::reset() {
|
||||
|
||||
void RandomAccessHypPackEnumerator::reset()
|
||||
{
|
||||
m_cur_index = 0;
|
||||
if(!m_no_shuffle) random_shuffle(m_indexes.begin(),m_indexes.end());
|
||||
}
|
||||
bool RandomAccessHypPackEnumerator::finished() {
|
||||
bool RandomAccessHypPackEnumerator::finished()
|
||||
{
|
||||
return m_cur_index >= m_indexes.size();
|
||||
}
|
||||
void RandomAccessHypPackEnumerator::next() {
|
||||
void RandomAccessHypPackEnumerator::next()
|
||||
{
|
||||
m_cur_index++;
|
||||
}
|
||||
|
||||
size_t RandomAccessHypPackEnumerator::cur_size() {
|
||||
size_t RandomAccessHypPackEnumerator::cur_size()
|
||||
{
|
||||
assert(m_features[m_indexes[m_cur_index]].size()==m_scores[m_indexes[m_cur_index]].size());
|
||||
return m_features[m_indexes[m_cur_index]].size();
|
||||
}
|
||||
const MiraFeatureVector& RandomAccessHypPackEnumerator::featuresAt(size_t i) {
|
||||
const MiraFeatureVector& RandomAccessHypPackEnumerator::featuresAt(size_t i)
|
||||
{
|
||||
return m_features[m_indexes[m_cur_index]][i];
|
||||
}
|
||||
const ScoreDataItem& RandomAccessHypPackEnumerator::scoresAt(size_t i) {
|
||||
const ScoreDataItem& RandomAccessHypPackEnumerator::scoresAt(size_t i)
|
||||
{
|
||||
return m_scores[m_indexes[m_cur_index]][i];
|
||||
}
|
||||
|
||||
size_t RandomAccessHypPackEnumerator::cur_id() {
|
||||
size_t RandomAccessHypPackEnumerator::cur_id()
|
||||
{
|
||||
return m_indexes[m_cur_index];
|
||||
}
|
||||
}
|
||||
// --Emacs trickery--
|
||||
// Local Variables:
|
||||
// mode:c++
|
||||
|
@ -20,11 +20,12 @@
|
||||
|
||||
namespace MosesTuning
|
||||
{
|
||||
|
||||
|
||||
|
||||
// Start with these abstract classes
|
||||
|
||||
class HypPackEnumerator {
|
||||
class HypPackEnumerator
|
||||
{
|
||||
public:
|
||||
virtual ~HypPackEnumerator() {}
|
||||
|
||||
@ -41,7 +42,8 @@ public:
|
||||
|
||||
// Instantiation that streams from disk
|
||||
// Low-memory, low-speed, sequential access
|
||||
class StreamingHypPackEnumerator : public HypPackEnumerator {
|
||||
class StreamingHypPackEnumerator : public HypPackEnumerator
|
||||
{
|
||||
public:
|
||||
StreamingHypPackEnumerator(std::vector<std::string> const& featureFiles,
|
||||
std::vector<std::string> const& scoreFiles);
|
||||
@ -75,7 +77,8 @@ private:
|
||||
// Instantiation that reads into memory
|
||||
// High-memory, high-speed, random access
|
||||
// (Actually randomizes with each call to reset)
|
||||
class RandomAccessHypPackEnumerator : public HypPackEnumerator {
|
||||
class RandomAccessHypPackEnumerator : public HypPackEnumerator
|
||||
{
|
||||
public:
|
||||
RandomAccessHypPackEnumerator(std::vector<std::string> const& featureFiles,
|
||||
std::vector<std::string> const& scoreFiles,
|
||||
|
@ -11,7 +11,7 @@ namespace MosesTuning
|
||||
// TODO: This is too long. Consider creating a function for
|
||||
// initialization such as Init().
|
||||
InterpolatedScorer::InterpolatedScorer(const string& name, const string& config)
|
||||
: Scorer(name,config)
|
||||
: Scorer(name,config)
|
||||
{
|
||||
// name would be: HAMMING,BLEU or similar
|
||||
string scorers = name;
|
||||
@ -66,7 +66,8 @@ InterpolatedScorer::InterpolatedScorer(const string& name, const string& config)
|
||||
cerr <<endl;
|
||||
}
|
||||
|
||||
bool InterpolatedScorer::useAlignment() const {
|
||||
bool InterpolatedScorer::useAlignment() const
|
||||
{
|
||||
//cout << "InterpolatedScorer::useAlignment" << endl;
|
||||
for (vector<Scorer*>::const_iterator itsc = m_scorers.begin(); itsc < m_scorers.end(); itsc++) {
|
||||
if ((*itsc)->useAlignment()) {
|
||||
@ -176,8 +177,7 @@ void InterpolatedScorer::prepareStats(size_t sid, const string& text, ScoreStats
|
||||
ScoreStats tempEntry;
|
||||
if ((*itsc)->useAlignment()) {
|
||||
(*itsc)->prepareStats(sid, text, tempEntry);
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
(*itsc)->prepareStats(sid, sentence, tempEntry);
|
||||
}
|
||||
if (i > 0) buff << " ";
|
||||
@ -206,17 +206,17 @@ void InterpolatedScorer::setFactors(const string& factors)
|
||||
|
||||
void InterpolatedScorer::setFilter(const string& filterCommand)
|
||||
{
|
||||
if (filterCommand.empty()) return;
|
||||
if (filterCommand.empty()) return;
|
||||
|
||||
vector<string> csplit;
|
||||
split(filterCommand, ',', csplit);
|
||||
vector<string> csplit;
|
||||
split(filterCommand, ',', csplit);
|
||||
|
||||
if (csplit.size() != m_scorers.size())
|
||||
throw runtime_error("Number of command specifications does not equal number of interpolated scorers.");
|
||||
if (csplit.size() != m_scorers.size())
|
||||
throw runtime_error("Number of command specifications does not equal number of interpolated scorers.");
|
||||
|
||||
for (size_t i = 0; i < m_scorers.size(); ++i) {
|
||||
m_scorers[i]->setFilter(csplit[i]);
|
||||
}
|
||||
for (size_t i = 0; i < m_scorers.size(); ++i) {
|
||||
m_scorers[i]->setFilter(csplit[i]);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -10,7 +10,7 @@
|
||||
|
||||
namespace MosesTuning
|
||||
{
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Class that includes other scorers eg.
|
||||
|
@ -7,7 +7,7 @@ using namespace std;
|
||||
|
||||
namespace MosesTuning
|
||||
{
|
||||
|
||||
|
||||
|
||||
MiraFeatureVector::MiraFeatureVector(const FeatureDataItem& vec)
|
||||
: m_dense(vec.dense)
|
||||
@ -17,8 +17,7 @@ MiraFeatureVector::MiraFeatureVector(const FeatureDataItem& vec)
|
||||
size_t lastFeat = 0;
|
||||
m_sparseFeats.reserve(sparseFeats.size());
|
||||
m_sparseVals.reserve(sparseFeats.size());
|
||||
for(size_t i=0;i<sparseFeats.size();i++)
|
||||
{
|
||||
for(size_t i=0; i<sparseFeats.size(); i++) {
|
||||
size_t feat = m_dense.size() + sparseFeats[i];
|
||||
m_sparseFeats.push_back(feat);
|
||||
m_sparseVals.push_back(vec.sparse.get(sparseFeats[i]));
|
||||
@ -26,8 +25,7 @@ MiraFeatureVector::MiraFeatureVector(const FeatureDataItem& vec)
|
||||
// Check ordered property
|
||||
if(bFirst) {
|
||||
bFirst = false;
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
if(lastFeat>=feat) {
|
||||
cerr << "Error: Feature indeces must be strictly ascending coming out of SparseVector" << endl;
|
||||
exit(1);
|
||||
@ -61,29 +59,33 @@ MiraFeatureVector::MiraFeatureVector(const vector<ValType>& dense,
|
||||
}
|
||||
}
|
||||
|
||||
ValType MiraFeatureVector::val(size_t index) const {
|
||||
ValType MiraFeatureVector::val(size_t index) const
|
||||
{
|
||||
if(index < m_dense.size())
|
||||
return m_dense[index];
|
||||
else
|
||||
return m_sparseVals[index-m_dense.size()];
|
||||
}
|
||||
|
||||
size_t MiraFeatureVector::feat(size_t index) const {
|
||||
size_t MiraFeatureVector::feat(size_t index) const
|
||||
{
|
||||
if(index < m_dense.size())
|
||||
return index;
|
||||
else
|
||||
return m_sparseFeats[index-m_dense.size()];
|
||||
}
|
||||
|
||||
size_t MiraFeatureVector::size() const {
|
||||
size_t MiraFeatureVector::size() const
|
||||
{
|
||||
return m_dense.size() + m_sparseVals.size();
|
||||
}
|
||||
|
||||
ValType MiraFeatureVector::sqrNorm() const {
|
||||
ValType MiraFeatureVector::sqrNorm() const
|
||||
{
|
||||
ValType toRet = 0.0;
|
||||
for(size_t i=0;i<m_dense.size();i++)
|
||||
for(size_t i=0; i<m_dense.size(); i++)
|
||||
toRet += m_dense[i]*m_dense[i];
|
||||
for(size_t i=0;i<m_sparseVals.size();i++)
|
||||
for(size_t i=0; i<m_sparseVals.size(); i++)
|
||||
toRet += m_sparseVals[i] * m_sparseVals[i];
|
||||
return toRet;
|
||||
}
|
||||
@ -96,7 +98,7 @@ MiraFeatureVector operator-(const MiraFeatureVector& a, const MiraFeatureVector&
|
||||
cerr << "Mismatching dense vectors passed to MiraFeatureVector subtraction" << endl;
|
||||
exit(1);
|
||||
}
|
||||
for(size_t i=0;i<a.m_dense.size();i++) {
|
||||
for(size_t i=0; i<a.m_dense.size(); i++) {
|
||||
dense.push_back(a.m_dense[i] - b.m_dense[i]);
|
||||
}
|
||||
|
||||
@ -148,7 +150,7 @@ MiraFeatureVector operator-(const MiraFeatureVector& a, const MiraFeatureVector&
|
||||
|
||||
ostream& operator<<(ostream& o, const MiraFeatureVector& e)
|
||||
{
|
||||
for(size_t i=0;i<e.size();i++) {
|
||||
for(size_t i=0; i<e.size(); i++) {
|
||||
if(i>0) o << " ";
|
||||
o << e.feat(i) << ":" << e.val(i);
|
||||
}
|
||||
|
@ -19,11 +19,12 @@
|
||||
|
||||
namespace MosesTuning
|
||||
{
|
||||
|
||||
|
||||
|
||||
typedef FeatureStatsType ValType;
|
||||
|
||||
class MiraFeatureVector {
|
||||
class MiraFeatureVector
|
||||
{
|
||||
public:
|
||||
MiraFeatureVector(const FeatureDataItem& vec);
|
||||
MiraFeatureVector(const MiraFeatureVector& other);
|
||||
|
@ -6,7 +6,7 @@ using namespace std;
|
||||
|
||||
namespace MosesTuning
|
||||
{
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Constructor, initializes to the zero vector
|
||||
@ -36,9 +36,10 @@ MiraWeightVector::MiraWeightVector(const vector<ValType>& init)
|
||||
* \param fv Feature vector to be added to the weights
|
||||
* \param tau FV will be scaled by this value before update
|
||||
*/
|
||||
void MiraWeightVector::update(const MiraFeatureVector& fv, float tau) {
|
||||
void MiraWeightVector::update(const MiraFeatureVector& fv, float tau)
|
||||
{
|
||||
m_numUpdates++;
|
||||
for(size_t i=0;i<fv.size();i++) {
|
||||
for(size_t i=0; i<fv.size(); i++) {
|
||||
update(fv.feat(i), fv.val(i)*tau);
|
||||
}
|
||||
}
|
||||
@ -46,7 +47,8 @@ void MiraWeightVector::update(const MiraFeatureVector& fv, float tau) {
|
||||
/**
|
||||
* Perform an empty update (affects averaging)
|
||||
*/
|
||||
void MiraWeightVector::tick() {
|
||||
void MiraWeightVector::tick()
|
||||
{
|
||||
m_numUpdates++;
|
||||
}
|
||||
|
||||
@ -54,7 +56,8 @@ void MiraWeightVector::tick() {
|
||||
* Score a feature vector according to the model
|
||||
* \param fv Feature vector to be scored
|
||||
*/
|
||||
ValType MiraWeightVector::score(const MiraFeatureVector& fv) const {
|
||||
ValType MiraWeightVector::score(const MiraFeatureVector& fv) const
|
||||
{
|
||||
ValType toRet = 0.0;
|
||||
for(size_t i=0; i<fv.size(); i++) {
|
||||
toRet += weight(fv.feat(i)) * fv.val(i);
|
||||
@ -65,7 +68,8 @@ ValType MiraWeightVector::score(const MiraFeatureVector& fv) const {
|
||||
/**
|
||||
* Return an averaged view of this weight vector
|
||||
*/
|
||||
AvgWeightVector MiraWeightVector::avg() {
|
||||
AvgWeightVector MiraWeightVector::avg()
|
||||
{
|
||||
this->fixTotals();
|
||||
return AvgWeightVector(*this);
|
||||
}
|
||||
@ -73,7 +77,8 @@ AvgWeightVector MiraWeightVector::avg() {
|
||||
/**
|
||||
* Updates a weight and lazily updates its total
|
||||
*/
|
||||
void MiraWeightVector::update(size_t index, ValType delta) {
|
||||
void MiraWeightVector::update(size_t index, ValType delta)
|
||||
{
|
||||
|
||||
// Handle previously unseen weights
|
||||
while(index>=m_weights.size()) {
|
||||
@ -91,25 +96,27 @@ void MiraWeightVector::update(size_t index, ValType delta) {
|
||||
/**
|
||||
* Make sure everyone's total is up-to-date
|
||||
*/
|
||||
void MiraWeightVector::fixTotals() {
|
||||
void MiraWeightVector::fixTotals()
|
||||
{
|
||||
for(size_t i=0; i<m_weights.size(); i++) update(i,0);
|
||||
}
|
||||
|
||||
/**
|
||||
* Helper to handle out of range weights
|
||||
*/
|
||||
ValType MiraWeightVector::weight(size_t index) const {
|
||||
ValType MiraWeightVector::weight(size_t index) const
|
||||
{
|
||||
if(index < m_weights.size()) {
|
||||
return m_weights[index];
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
ValType MiraWeightVector::sqrNorm() const {
|
||||
ValType MiraWeightVector::sqrNorm() const
|
||||
{
|
||||
ValType toRet = 0;
|
||||
for(size_t i=0;i<m_weights.size();i++) {
|
||||
for(size_t i=0; i<m_weights.size(); i++) {
|
||||
toRet += weight(i) * weight(i);
|
||||
}
|
||||
return toRet;
|
||||
@ -119,9 +126,9 @@ AvgWeightVector::AvgWeightVector(const MiraWeightVector& wv)
|
||||
:m_wv(wv)
|
||||
{}
|
||||
|
||||
ostream& operator<<(ostream& o, const MiraWeightVector& e)
|
||||
ostream& operator<<(ostream& o, const MiraWeightVector& e)
|
||||
{
|
||||
for(size_t i=0;i<e.m_weights.size();i++) {
|
||||
for(size_t i=0; i<e.m_weights.size(); i++) {
|
||||
if(abs(e.m_weights[i])>1e-8) {
|
||||
if(i>0) o << " ";
|
||||
cerr << i << ":" << e.m_weights[i];
|
||||
@ -136,14 +143,14 @@ ValType AvgWeightVector::weight(size_t index) const
|
||||
else {
|
||||
if(index < m_wv.m_totals.size()) {
|
||||
return m_wv.m_totals[index] / m_wv.m_numUpdates;
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ValType AvgWeightVector::score(const MiraFeatureVector& fv) const {
|
||||
ValType AvgWeightVector::score(const MiraFeatureVector& fv) const
|
||||
{
|
||||
ValType toRet = 0.0;
|
||||
for(size_t i=0; i<fv.size(); i++) {
|
||||
toRet += weight(fv.feat(i)) * fv.val(i);
|
||||
@ -151,7 +158,8 @@ ValType AvgWeightVector::score(const MiraFeatureVector& fv) const {
|
||||
return toRet;
|
||||
}
|
||||
|
||||
size_t AvgWeightVector::size() const {
|
||||
size_t AvgWeightVector::size() const
|
||||
{
|
||||
return m_wv.m_weights.size();
|
||||
}
|
||||
|
||||
|
@ -17,11 +17,12 @@
|
||||
|
||||
namespace MosesTuning
|
||||
{
|
||||
|
||||
|
||||
|
||||
class AvgWeightVector;
|
||||
|
||||
class MiraWeightVector {
|
||||
class MiraWeightVector
|
||||
{
|
||||
public:
|
||||
/**
|
||||
* Constructor, initializes to the zero vector
|
||||
@ -91,7 +92,8 @@ private:
|
||||
/**
|
||||
* Averaged view of a weight vector
|
||||
*/
|
||||
class AvgWeightVector {
|
||||
class AvgWeightVector
|
||||
{
|
||||
public:
|
||||
AvgWeightVector(const MiraWeightVector& wv);
|
||||
ValType score(const MiraFeatureVector& fv) const;
|
||||
|
59
mert/Ngram.h
59
mert/Ngram.h
@ -13,8 +13,9 @@ namespace MosesTuning
|
||||
 * typical accessors and mutators, but we intentionally do not allow
|
||||
* erasing elements.
|
||||
*/
|
||||
class NgramCounts {
|
||||
public:
|
||||
class NgramCounts
|
||||
{
|
||||
public:
|
||||
// Used to construct the ngram map
|
||||
struct NgramComparator {
|
||||
bool operator()(const std::vector<int>& a, const std::vector<int>& b) const {
|
||||
@ -45,7 +46,9 @@ class NgramCounts {
|
||||
/**
|
||||
* If the specified "ngram" is found, we add counts.
|
||||
* If not, we insert the default count in the container. */
|
||||
inline void Add(const Key& ngram) { m_counts[ngram]++; }
|
||||
inline void Add(const Key& ngram) {
|
||||
m_counts[ngram]++;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return true iff the specified "ngram" is found in the container.
|
||||
@ -60,34 +63,58 @@ class NgramCounts {
|
||||
/**
|
||||
   * Clear all elements in the container.
|
||||
*/
|
||||
void clear() { m_counts.clear(); }
|
||||
void clear() {
|
||||
m_counts.clear();
|
||||
}
|
||||
|
||||
/**
|
||||
* Return true iff the container is empty.
|
||||
*/
|
||||
bool empty() const { return m_counts.empty(); }
|
||||
bool empty() const {
|
||||
return m_counts.empty();
|
||||
}
|
||||
|
||||
/**
|
||||
   * Return the number of elements in the container.
|
||||
*/
|
||||
std::size_t size() const { return m_counts.size(); }
|
||||
std::size_t size() const {
|
||||
return m_counts.size();
|
||||
}
|
||||
|
||||
std::size_t max_size() const { return m_counts.max_size(); }
|
||||
std::size_t max_size() const {
|
||||
return m_counts.max_size();
|
||||
}
|
||||
|
||||
// Note: This is mainly used by unit tests.
|
||||
int get_default_count() const { return kDefaultCount; }
|
||||
int get_default_count() const {
|
||||
return kDefaultCount;
|
||||
}
|
||||
|
||||
iterator find(const Key& ngram) { return m_counts.find(ngram); }
|
||||
const_iterator find(const Key& ngram) const { return m_counts.find(ngram); }
|
||||
iterator find(const Key& ngram) {
|
||||
return m_counts.find(ngram);
|
||||
}
|
||||
const_iterator find(const Key& ngram) const {
|
||||
return m_counts.find(ngram);
|
||||
}
|
||||
|
||||
Value& operator[](const Key& ngram) { return m_counts[ngram]; }
|
||||
Value& operator[](const Key& ngram) {
|
||||
return m_counts[ngram];
|
||||
}
|
||||
|
||||
iterator begin() { return m_counts.begin(); }
|
||||
const_iterator begin() const { return m_counts.begin(); }
|
||||
iterator end() { return m_counts.end(); }
|
||||
const_iterator end() const { return m_counts.end(); }
|
||||
iterator begin() {
|
||||
return m_counts.begin();
|
||||
}
|
||||
const_iterator begin() const {
|
||||
return m_counts.begin();
|
||||
}
|
||||
iterator end() {
|
||||
return m_counts.end();
|
||||
}
|
||||
const_iterator end() const {
|
||||
return m_counts.end();
|
||||
}
|
||||
|
||||
private:
|
||||
private:
|
||||
const int kDefaultCount;
|
||||
boost::unordered_map<Key, Value> m_counts;
|
||||
};
|
||||
|
@ -5,7 +5,8 @@
|
||||
|
||||
using namespace MosesTuning;
|
||||
|
||||
BOOST_AUTO_TEST_CASE(ngram_basic) {
|
||||
BOOST_AUTO_TEST_CASE(ngram_basic)
|
||||
{
|
||||
NgramCounts counts;
|
||||
NgramCounts::Key key;
|
||||
key.push_back(1);
|
||||
@ -25,7 +26,8 @@ BOOST_AUTO_TEST_CASE(ngram_basic) {
|
||||
BOOST_CHECK_EQUAL(it->second, 1);
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE(ngram_Add) {
|
||||
BOOST_AUTO_TEST_CASE(ngram_Add)
|
||||
{
|
||||
NgramCounts counts;
|
||||
NgramCounts::Key key;
|
||||
key.push_back(1);
|
||||
@ -49,7 +51,8 @@ BOOST_AUTO_TEST_CASE(ngram_Add) {
|
||||
BOOST_CHECK_EQUAL(counts[key3], counts.get_default_count());
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE(ngram_lookup) {
|
||||
BOOST_AUTO_TEST_CASE(ngram_lookup)
|
||||
{
|
||||
NgramCounts counts;
|
||||
NgramCounts::Key key;
|
||||
key.push_back(1);
|
||||
|
@ -17,7 +17,8 @@ using namespace std;
|
||||
static const float MIN_FLOAT = -1.0 * numeric_limits<float>::max();
|
||||
static const float MAX_FLOAT = numeric_limits<float>::max();
|
||||
|
||||
namespace {
|
||||
namespace
|
||||
{
|
||||
|
||||
/**
|
||||
* Compute the intersection of 2 lines.
|
||||
@ -35,7 +36,7 @@ inline float intersect(float m1, float b1, float m2, float b2)
|
||||
|
||||
namespace MosesTuning
|
||||
{
|
||||
|
||||
|
||||
|
||||
Optimizer::Optimizer(unsigned Pd, const vector<unsigned>& i2O, const vector<bool>& pos, const vector<parameter_t>& start, unsigned int nrandom)
|
||||
: m_scorer(NULL), m_feature_data(), m_num_random_directions(nrandom), m_positive(pos)
|
||||
@ -198,7 +199,7 @@ statscore_t Optimizer::LineOptimize(const Point& origin, const Point& direction,
|
||||
thresholdmap.erase(previnserted); // erase old previnsert
|
||||
previnserted = thresholdmap.find(leftmostx); // point previnsert to the new threshold
|
||||
previnserted->second.back()=newd; // We update the diff for sentence S
|
||||
// Threshold already exists but is not the previous one.
|
||||
// Threshold already exists but is not the previous one.
|
||||
} else {
|
||||
// We append the diffs in previnsert to tit before destroying previnsert.
|
||||
tit->second.insert(tit->second.end(),previnserted->second.begin(),previnserted->second.end());
|
||||
@ -405,8 +406,7 @@ statscore_t SimpleOptimizer::TrueRun(Point& P) const
|
||||
for (unsigned int i = 0; i < Point::getdim(); i++)
|
||||
direction[i]=0.0;
|
||||
direction[d]=1.0;
|
||||
}
|
||||
else { // random direction update
|
||||
} else { // random direction update
|
||||
direction.Randomize();
|
||||
}
|
||||
statscore_t curscore = LineOptimize(P, direction, linebest);//find the minimum on the line
|
||||
@ -443,8 +443,7 @@ statscore_t RandomDirectionOptimizer::TrueRun(Point& P) const
|
||||
// do specified number of random direction optimizations
|
||||
unsigned int nrun = 0;
|
||||
unsigned int nrun_no_change = 0;
|
||||
for (; nrun_no_change < m_num_random_directions; nrun++, nrun_no_change++)
|
||||
{
|
||||
for (; nrun_no_change < m_num_random_directions; nrun++, nrun_no_change++) {
|
||||
// choose a random direction in which to optimize
|
||||
Point direction;
|
||||
direction.Randomize();
|
||||
|
@ -12,7 +12,7 @@ static const float kMaxFloat = std::numeric_limits<float>::max();
|
||||
|
||||
namespace MosesTuning
|
||||
{
|
||||
|
||||
|
||||
|
||||
class Point;
|
||||
|
||||
@ -31,8 +31,12 @@ protected:
|
||||
public:
|
||||
Optimizer(unsigned Pd, const std::vector<unsigned>& i2O, const std::vector<bool>& positive, const std::vector<parameter_t>& start, unsigned int nrandom);
|
||||
|
||||
void SetScorer(Scorer *scorer) { m_scorer = scorer; }
|
||||
void SetFeatureData(FeatureDataHandle feature_data) { m_feature_data = feature_data; }
|
||||
void SetScorer(Scorer *scorer) {
|
||||
m_scorer = scorer;
|
||||
}
|
||||
void SetFeatureData(FeatureDataHandle feature_data) {
|
||||
m_feature_data = feature_data;
|
||||
}
|
||||
virtual ~Optimizer();
|
||||
|
||||
unsigned size() const {
|
||||
@ -97,7 +101,7 @@ private:
|
||||
public:
|
||||
RandomDirectionOptimizer(unsigned dim, const std::vector<unsigned>& i2O, const std::vector<bool>& positive,
|
||||
const std::vector<parameter_t>& start, unsigned int nrandom)
|
||||
: Optimizer(dim, i2O, positive, start, nrandom), kEPS(0.0001f) {}
|
||||
: Optimizer(dim, i2O, positive, start, nrandom), kEPS(0.0001f) {}
|
||||
virtual statscore_t TrueRun(Point&) const;
|
||||
};
|
||||
|
||||
@ -109,7 +113,7 @@ class RandomOptimizer : public Optimizer
|
||||
public:
|
||||
RandomOptimizer(unsigned dim, const std::vector<unsigned>& i2O, const std::vector<bool>& positive,
|
||||
const std::vector<parameter_t>& start, unsigned int nrandom)
|
||||
: Optimizer(dim, i2O, positive, start, nrandom) {}
|
||||
: Optimizer(dim, i2O, positive, start, nrandom) {}
|
||||
virtual statscore_t TrueRun(Point&) const;
|
||||
};
|
||||
|
||||
|
@ -5,7 +5,7 @@ using namespace std;
|
||||
|
||||
namespace MosesTuning
|
||||
{
|
||||
|
||||
|
||||
|
||||
vector<string> OptimizerFactory::m_type_names;
|
||||
|
||||
@ -38,11 +38,11 @@ OptimizerFactory::OptimizerType OptimizerFactory::GetOptimizerType(const string&
|
||||
}
|
||||
|
||||
Optimizer* OptimizerFactory::BuildOptimizer(unsigned dim,
|
||||
const vector<unsigned>& i2o,
|
||||
const std::vector<bool>& positive,
|
||||
const vector<parameter_t>& start,
|
||||
const string& type,
|
||||
unsigned int nrandom)
|
||||
const vector<unsigned>& i2o,
|
||||
const std::vector<bool>& positive,
|
||||
const vector<parameter_t>& start,
|
||||
const string& type,
|
||||
unsigned int nrandom)
|
||||
{
|
||||
OptimizerType opt_type = GetOptimizerType(type);
|
||||
if (opt_type == NOPTIMIZER) {
|
||||
@ -55,18 +55,18 @@ Optimizer* OptimizerFactory::BuildOptimizer(unsigned dim,
|
||||
}
|
||||
|
||||
switch (opt_type) {
|
||||
case POWELL:
|
||||
return new SimpleOptimizer(dim, i2o, positive, start, nrandom);
|
||||
break;
|
||||
case RANDOM_DIRECTION:
|
||||
return new RandomDirectionOptimizer(dim, i2o, positive, start, nrandom);
|
||||
break;
|
||||
case RANDOM:
|
||||
return new RandomOptimizer(dim, i2o, positive, start, nrandom);
|
||||
break;
|
||||
default:
|
||||
cerr << "Error: unknown optimizer" << type << endl;
|
||||
return NULL;
|
||||
case POWELL:
|
||||
return new SimpleOptimizer(dim, i2o, positive, start, nrandom);
|
||||
break;
|
||||
case RANDOM_DIRECTION:
|
||||
return new RandomDirectionOptimizer(dim, i2o, positive, start, nrandom);
|
||||
break;
|
||||
case RANDOM:
|
||||
return new RandomOptimizer(dim, i2o, positive, start, nrandom);
|
||||
break;
|
||||
default:
|
||||
cerr << "Error: unknown optimizer" << type << endl;
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -6,13 +6,13 @@
|
||||
|
||||
namespace MosesTuning
|
||||
{
|
||||
|
||||
|
||||
|
||||
class Optimizer;
|
||||
|
||||
class OptimizerFactory
|
||||
{
|
||||
public:
|
||||
public:
|
||||
// NOTE: Add new optimizer here BEFORE NOPTIMZER
|
||||
enum OptimizerType {
|
||||
POWELL = 0,
|
||||
@ -36,7 +36,7 @@ class OptimizerFactory
|
||||
const std::string& type,
|
||||
unsigned int nrandom);
|
||||
|
||||
private:
|
||||
private:
|
||||
OptimizerFactory() {}
|
||||
~OptimizerFactory() {}
|
||||
|
||||
|
@ -7,21 +7,24 @@
|
||||
|
||||
using namespace MosesTuning;
|
||||
|
||||
namespace {
|
||||
namespace
|
||||
{
|
||||
|
||||
inline bool CheckBuildOptimizer(unsigned dim,
|
||||
const std::vector<unsigned>& to_optimize,
|
||||
const std::vector<bool>& positive,
|
||||
const std::vector<parameter_t>& start,
|
||||
const std::string& type,
|
||||
unsigned int num_random) {
|
||||
unsigned int num_random)
|
||||
{
|
||||
boost::scoped_ptr<Optimizer> optimizer(OptimizerFactory::BuildOptimizer(dim, to_optimize, positive, start, type, num_random));
|
||||
return optimizer.get() != NULL;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
BOOST_AUTO_TEST_CASE(optimizer_type) {
|
||||
BOOST_AUTO_TEST_CASE(optimizer_type)
|
||||
{
|
||||
BOOST_CHECK_EQUAL(OptimizerFactory::GetOptimizerType("powell"),
|
||||
OptimizerFactory::POWELL);
|
||||
BOOST_CHECK_EQUAL(OptimizerFactory::GetOptimizerType("random"),
|
||||
@ -30,7 +33,8 @@ BOOST_AUTO_TEST_CASE(optimizer_type) {
|
||||
OptimizerFactory::RANDOM_DIRECTION);
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE(optimizer_build) {
|
||||
BOOST_AUTO_TEST_CASE(optimizer_build)
|
||||
{
|
||||
const unsigned dim = 3;
|
||||
std::vector<unsigned> to_optimize;
|
||||
to_optimize.push_back(1);
|
||||
|
@ -10,7 +10,7 @@ using namespace std;
|
||||
|
||||
namespace MosesTuning
|
||||
{
|
||||
|
||||
|
||||
|
||||
PerScorer::PerScorer(const string& config)
|
||||
: StatisticsBasedScorer("PER",config) {}
|
||||
|
@ -9,7 +9,7 @@
|
||||
|
||||
namespace MosesTuning
|
||||
{
|
||||
|
||||
|
||||
|
||||
class ScoreStats;
|
||||
|
||||
@ -27,7 +27,9 @@ public:
|
||||
|
||||
virtual void setReferenceFiles(const std::vector<std::string>& referenceFiles);
|
||||
virtual void prepareStats(std::size_t sid, const std::string& text, ScoreStats& entry);
|
||||
virtual std::size_t NumberOfScores() const { return 3; }
|
||||
virtual std::size_t NumberOfScores() const {
|
||||
return 3;
|
||||
}
|
||||
virtual float calculateScore(const std::vector<int>& comps) const;
|
||||
|
||||
private:
|
||||
|
@ -16,7 +16,7 @@ using namespace std;
|
||||
|
||||
namespace MosesTuning
|
||||
{
|
||||
|
||||
|
||||
|
||||
Permutation::Permutation(const string &alignment, const int sourceLength, const int targetLength )
|
||||
{
|
||||
@ -86,7 +86,7 @@ void Permutation::set(const string & alignment,const int sourceLength)
|
||||
//cout << "SP:" << sourcePos << " TP:" << targetPos << endl;
|
||||
if (sourcePos > sourceLength) {
|
||||
cerr << "Source sentence length:" << sourceLength << " is smaller than alignment source position:" << sourcePos << endl;
|
||||
cerr << "******** Permutation::set :" << alignment << ": len : " << sourceLength <<endl;
|
||||
cerr << "******** Permutation::set :" << alignment << ": len : " << sourceLength <<endl;
|
||||
exit(1);
|
||||
}
|
||||
//If have multiple target pos aligned to one source,
|
||||
@ -187,7 +187,7 @@ float Permutation::distance(const Permutation &permCompare, const distanceMetric
|
||||
float score=0;
|
||||
|
||||
//bool debug= (verboselevel()>3); // TODO: fix verboselevel()
|
||||
bool debug=false;
|
||||
bool debug=false;
|
||||
if (debug) {
|
||||
cout << "*****Permutation::distance" <<endl;
|
||||
cout << "Hypo:" << endl;
|
||||
|
@ -19,7 +19,7 @@
|
||||
|
||||
namespace MosesTuning
|
||||
{
|
||||
|
||||
|
||||
|
||||
class Permutation
|
||||
{
|
||||
|
@ -5,7 +5,7 @@ using namespace std;
|
||||
|
||||
namespace MosesTuning
|
||||
{
|
||||
|
||||
|
||||
|
||||
const int PermutationScorer::SCORE_PRECISION = 5;
|
||||
const int PermutationScorer::SCORE_MULTFACT = 100000; // 100000=10^SCORE_PRECISION
|
||||
@ -147,7 +147,7 @@ int PermutationScorer::getNumberWords (const string& text) const
|
||||
void PermutationScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
|
||||
{
|
||||
//bool debug= (verboselevel()>3); // TODO: fix verboselevel()
|
||||
bool debug=false;
|
||||
bool debug=false;
|
||||
if (debug) {
|
||||
cout << "*******prepareStats" ;
|
||||
cout << text << endl;
|
||||
|
@ -19,7 +19,7 @@
|
||||
|
||||
namespace MosesTuning
|
||||
{
|
||||
|
||||
|
||||
/**
|
||||
* Permutation
|
||||
**/
|
||||
|
@ -29,7 +29,7 @@ Point::Point() : vector<parameter_t>(m_dim), m_score(0.0) {}
|
||||
Point::Point(const vector<parameter_t>& init,
|
||||
const vector<parameter_t>& min,
|
||||
const vector<parameter_t>& max)
|
||||
: vector<parameter_t>(Point::m_dim), m_score(0.0)
|
||||
: vector<parameter_t>(Point::m_dim), m_score(0.0)
|
||||
{
|
||||
m_min.resize(Point::m_dim);
|
||||
m_max.resize(Point::m_dim);
|
||||
|
30
mert/Point.h
30
mert/Point.h
@ -8,7 +8,7 @@
|
||||
|
||||
namespace MosesTuning
|
||||
{
|
||||
|
||||
|
||||
|
||||
class FeatureStats;
|
||||
class Optimizer;
|
||||
@ -53,11 +53,19 @@ private:
|
||||
statscore_t m_score;
|
||||
|
||||
public:
|
||||
static unsigned int getdim() { return m_dim; }
|
||||
static void setdim(std::size_t d) { m_dim = d; }
|
||||
static unsigned int getdim() {
|
||||
return m_dim;
|
||||
}
|
||||
static void setdim(std::size_t d) {
|
||||
m_dim = d;
|
||||
}
|
||||
|
||||
static unsigned int getpdim() { return m_pdim; }
|
||||
static void setpdim(std::size_t pd) { m_pdim = pd; }
|
||||
static unsigned int getpdim() {
|
||||
return m_pdim;
|
||||
}
|
||||
static void setpdim(std::size_t pd) {
|
||||
m_pdim = pd;
|
||||
}
|
||||
|
||||
static void set_optindices(const std::vector<unsigned int>& indices) {
|
||||
m_opt_indices = indices;
|
||||
@ -90,7 +98,9 @@ public:
|
||||
*/
|
||||
friend std::ostream& operator<<(std::ostream& o,const Point& P);
|
||||
|
||||
void Normalize() { NormalizeL2(); }
|
||||
void Normalize() {
|
||||
NormalizeL2();
|
||||
}
|
||||
void NormalizeL2();
|
||||
void NormalizeL1();
|
||||
|
||||
@ -100,8 +110,12 @@ public:
|
||||
*/
|
||||
void GetAllWeights(std::vector<parameter_t>& w) const;
|
||||
|
||||
statscore_t GetScore() const { return m_score; }
|
||||
void SetScore(statscore_t score) { m_score = score; }
|
||||
statscore_t GetScore() const {
|
||||
return m_score;
|
||||
}
|
||||
void SetScore(statscore_t score) {
|
||||
m_score = score;
|
||||
}
|
||||
};
|
||||
|
||||
}
|
||||
|
@ -9,7 +9,8 @@
|
||||
using namespace std;
|
||||
using namespace MosesTuning;
|
||||
|
||||
BOOST_AUTO_TEST_CASE(point_operators) {
|
||||
BOOST_AUTO_TEST_CASE(point_operators)
|
||||
{
|
||||
const unsigned int dim = 5;
|
||||
vector<float> init(dim);
|
||||
init[0] = 1.0f;
|
||||
|
@ -18,7 +18,7 @@ using namespace std;
|
||||
|
||||
namespace MosesTuning
|
||||
{
|
||||
|
||||
|
||||
|
||||
// Child exec error signal
|
||||
void exec_failed (int sig)
|
||||
@ -28,116 +28,108 @@ void exec_failed (int sig)
|
||||
}
|
||||
|
||||
PreProcessFilter::PreProcessFilter(const string& filterCommand)
|
||||
: m_toFilter(NULL),
|
||||
m_fromFilter(NULL)
|
||||
: m_toFilter(NULL),
|
||||
m_fromFilter(NULL)
|
||||
{
|
||||
// Child error signal install
|
||||
// sigaction is the replacement for the traditional signal() method
|
||||
struct sigaction action;
|
||||
action.sa_handler = exec_failed;
|
||||
sigemptyset(&action.sa_mask);
|
||||
action.sa_flags = 0;
|
||||
if (sigaction(SIGUSR1, &action, NULL) < 0)
|
||||
{
|
||||
perror("SIGUSR1 install error");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
// Child error signal install
|
||||
// sigaction is the replacement for the traditional signal() method
|
||||
struct sigaction action;
|
||||
action.sa_handler = exec_failed;
|
||||
sigemptyset(&action.sa_mask);
|
||||
action.sa_flags = 0;
|
||||
if (sigaction(SIGUSR1, &action, NULL) < 0) {
|
||||
perror("SIGUSR1 install error");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
int pipe_status;
|
||||
int pipefds_input[2];
|
||||
int pipefds_output[2];
|
||||
// int pipefds_error[2];
|
||||
int pipe_status;
|
||||
int pipefds_input[2];
|
||||
int pipefds_output[2];
|
||||
// int pipefds_error[2];
|
||||
|
||||
// Create the pipes
|
||||
// We do this before the fork so both processes will know about
|
||||
// the same pipe and they can communicate.
|
||||
// Create the pipes
|
||||
// We do this before the fork so both processes will know about
|
||||
// the same pipe and they can communicate.
|
||||
|
||||
pipe_status = pipe(pipefds_input);
|
||||
if (pipe_status == -1)
|
||||
{
|
||||
perror("Error creating the pipe");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
pipe_status = pipe(pipefds_input);
|
||||
if (pipe_status == -1) {
|
||||
perror("Error creating the pipe");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
pipe_status = pipe(pipefds_output);
|
||||
if (pipe_status == -1)
|
||||
{
|
||||
perror("Error creating the pipe");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
pipe_status = pipe(pipefds_output);
|
||||
if (pipe_status == -1) {
|
||||
perror("Error creating the pipe");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
/*
|
||||
pipe_status = pipe(pipefds_error);
|
||||
if (pipe_status == -1)
|
||||
{
|
||||
perror("Error creating the pipe");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
*/
|
||||
/*
|
||||
pipe_status = pipe(pipefds_error);
|
||||
if (pipe_status == -1)
|
||||
{
|
||||
perror("Error creating the pipe");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
*/
|
||||
|
||||
pid_t pid;
|
||||
// Create child process; both processes continue from here
|
||||
pid = fork();
|
||||
pid_t pid;
|
||||
// Create child process; both processes continue from here
|
||||
pid = fork();
|
||||
|
||||
if (pid == pid_t(0))
|
||||
{
|
||||
// Child process
|
||||
if (pid == pid_t(0)) {
|
||||
// Child process
|
||||
|
||||
// When the child process finishes sends a SIGCHLD signal
|
||||
// to the parent
|
||||
// When the child process finishes sends a SIGCHLD signal
|
||||
// to the parent
|
||||
|
||||
// Tie the standard input, output and error streams to the
|
||||
// appropiate pipe ends
|
||||
// The file descriptor 0 is the standard input
|
||||
// We tie it to the read end of the pipe as we will use
|
||||
// this end of the pipe to read from it
|
||||
dup2 (CHILD_STDIN_READ,0);
|
||||
dup2 (CHILD_STDOUT_WRITE,1);
|
||||
// dup2 (CHILD_STDERR_WRITE,2);
|
||||
// Close in the child the unused ends of the pipes
|
||||
close(CHILD_STDIN_WRITE);
|
||||
close(CHILD_STDOUT_READ);
|
||||
//close(CHILD_STDERR_READ);
|
||||
// Tie the standard input, output and error streams to the
|
||||
// appropiate pipe ends
|
||||
// The file descriptor 0 is the standard input
|
||||
// We tie it to the read end of the pipe as we will use
|
||||
// this end of the pipe to read from it
|
||||
dup2 (CHILD_STDIN_READ,0);
|
||||
dup2 (CHILD_STDOUT_WRITE,1);
|
||||
// dup2 (CHILD_STDERR_WRITE,2);
|
||||
// Close in the child the unused ends of the pipes
|
||||
close(CHILD_STDIN_WRITE);
|
||||
close(CHILD_STDOUT_READ);
|
||||
//close(CHILD_STDERR_READ);
|
||||
|
||||
// Execute the program
|
||||
execl("/bin/bash", "bash", "-c", filterCommand.c_str() , (char*)NULL);
|
||||
// Execute the program
|
||||
execl("/bin/bash", "bash", "-c", filterCommand.c_str() , (char*)NULL);
|
||||
|
||||
// We should never reach this point
|
||||
// Tell the parent the exec failed
|
||||
kill(getppid(), SIGUSR1);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
else if (pid > pid_t(0))
|
||||
{
|
||||
// Parent
|
||||
// We should never reach this point
|
||||
// Tell the parent the exec failed
|
||||
kill(getppid(), SIGUSR1);
|
||||
exit(EXIT_FAILURE);
|
||||
} else if (pid > pid_t(0)) {
|
||||
// Parent
|
||||
|
||||
// Close in the parent the unused ends of the pipes
|
||||
close(CHILD_STDIN_READ);
|
||||
close(CHILD_STDOUT_WRITE);
|
||||
// close(CHILD_STDERR_WRITE);
|
||||
// Close in the parent the unused ends of the pipes
|
||||
close(CHILD_STDIN_READ);
|
||||
close(CHILD_STDOUT_WRITE);
|
||||
// close(CHILD_STDERR_WRITE);
|
||||
|
||||
m_toFilter = new ofdstream(CHILD_STDIN_WRITE);
|
||||
m_fromFilter = new ifdstream(CHILD_STDOUT_READ);
|
||||
}
|
||||
else
|
||||
{
|
||||
perror("Error: fork failed");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
m_toFilter = new ofdstream(CHILD_STDIN_WRITE);
|
||||
m_fromFilter = new ifdstream(CHILD_STDOUT_READ);
|
||||
} else {
|
||||
perror("Error: fork failed");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
}
|
||||
|
||||
string PreProcessFilter::ProcessSentence(const string& sentence)
|
||||
{
|
||||
*m_toFilter << sentence << "\n";
|
||||
string processedSentence;
|
||||
m_fromFilter->getline(processedSentence);
|
||||
return processedSentence;
|
||||
*m_toFilter << sentence << "\n";
|
||||
string processedSentence;
|
||||
m_fromFilter->getline(processedSentence);
|
||||
return processedSentence;
|
||||
}
|
||||
|
||||
PreProcessFilter::~PreProcessFilter()
|
||||
{
|
||||
delete m_toFilter;
|
||||
delete m_fromFilter;
|
||||
delete m_toFilter;
|
||||
delete m_fromFilter;
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -5,7 +5,7 @@
|
||||
|
||||
namespace MosesTuning
|
||||
{
|
||||
|
||||
|
||||
|
||||
class ofdstream;
|
||||
class ifdstream;
|
||||
@ -22,8 +22,8 @@ public:
|
||||
~PreProcessFilter();
|
||||
|
||||
private:
|
||||
ofdstream* m_toFilter;
|
||||
ifdstream* m_fromFilter;
|
||||
ofdstream* m_toFilter;
|
||||
ifdstream* m_fromFilter;
|
||||
};
|
||||
|
||||
}
|
||||
|
@ -9,38 +9,57 @@
|
||||
|
||||
namespace MosesTuning
|
||||
{
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Reference class represents reference translations for an output
|
||||
* translation used in calculating BLEU score.
|
||||
*/
|
||||
class Reference {
|
||||
public:
|
||||
class Reference
|
||||
{
|
||||
public:
|
||||
// for m_length
|
||||
typedef std::vector<std::size_t>::iterator iterator;
|
||||
typedef std::vector<std::size_t>::const_iterator const_iterator;
|
||||
|
||||
Reference() : m_counts(new NgramCounts) { }
|
||||
~Reference() { delete m_counts; }
|
||||
~Reference() {
|
||||
delete m_counts;
|
||||
}
|
||||
|
||||
NgramCounts* get_counts() { return m_counts; }
|
||||
const NgramCounts* get_counts() const { return m_counts; }
|
||||
NgramCounts* get_counts() {
|
||||
return m_counts;
|
||||
}
|
||||
const NgramCounts* get_counts() const {
|
||||
return m_counts;
|
||||
}
|
||||
|
||||
iterator begin() { return m_length.begin(); }
|
||||
const_iterator begin() const { return m_length.begin(); }
|
||||
iterator end() { return m_length.end(); }
|
||||
const_iterator end() const { return m_length.end(); }
|
||||
iterator begin() {
|
||||
return m_length.begin();
|
||||
}
|
||||
const_iterator begin() const {
|
||||
return m_length.begin();
|
||||
}
|
||||
iterator end() {
|
||||
return m_length.end();
|
||||
}
|
||||
const_iterator end() const {
|
||||
return m_length.end();
|
||||
}
|
||||
|
||||
void push_back(std::size_t len) { m_length.push_back(len); }
|
||||
void push_back(std::size_t len) {
|
||||
m_length.push_back(len);
|
||||
}
|
||||
|
||||
std::size_t num_references() const { return m_length.size(); }
|
||||
std::size_t num_references() const {
|
||||
return m_length.size();
|
||||
}
|
||||
|
||||
int CalcAverage() const;
|
||||
int CalcClosest(std::size_t length) const;
|
||||
int CalcShortest() const;
|
||||
|
||||
private:
|
||||
private:
|
||||
NgramCounts* m_counts;
|
||||
|
||||
// multiple reference lengths
|
||||
@ -49,16 +68,18 @@ class Reference {
|
||||
|
||||
// TODO(tetsuok): fix this function and related stuff.
|
||||
// "average" reference length should not be calculated at sentence-level unlike "closest".
|
||||
inline int Reference::CalcAverage() const {
|
||||
inline int Reference::CalcAverage() const
|
||||
{
|
||||
int total = 0;
|
||||
for (std::size_t i = 0; i < m_length.size(); ++i) {
|
||||
total += m_length[i];
|
||||
}
|
||||
return static_cast<int>(
|
||||
static_cast<float>(total) / m_length.size());
|
||||
static_cast<float>(total) / m_length.size());
|
||||
}
|
||||
|
||||
inline int Reference::CalcClosest(std::size_t length) const {
|
||||
inline int Reference::CalcClosest(std::size_t length) const
|
||||
{
|
||||
int min_diff = INT_MAX;
|
||||
int closest_ref_id = 0; // an index of the closest reference translation
|
||||
for (std::size_t i = 0; i < m_length.size(); ++i) {
|
||||
@ -79,7 +100,8 @@ inline int Reference::CalcClosest(std::size_t length) const {
|
||||
return static_cast<int>(m_length[closest_ref_id]);
|
||||
}
|
||||
|
||||
inline int Reference::CalcShortest() const {
|
||||
inline int Reference::CalcShortest() const
|
||||
{
|
||||
return *std::min_element(m_length.begin(), m_length.end());
|
||||
}
|
||||
|
||||
|
@ -5,12 +5,14 @@
|
||||
|
||||
using namespace MosesTuning;
|
||||
|
||||
BOOST_AUTO_TEST_CASE(refernece_count) {
|
||||
BOOST_AUTO_TEST_CASE(refernece_count)
|
||||
{
|
||||
Reference ref;
|
||||
BOOST_CHECK(ref.get_counts() != NULL);
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE(refernece_length_iterator) {
|
||||
BOOST_AUTO_TEST_CASE(refernece_length_iterator)
|
||||
{
|
||||
Reference ref;
|
||||
ref.push_back(4);
|
||||
ref.push_back(2);
|
||||
@ -24,7 +26,8 @@ BOOST_AUTO_TEST_CASE(refernece_length_iterator) {
|
||||
BOOST_CHECK(it == ref.end());
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE(refernece_length_average) {
|
||||
BOOST_AUTO_TEST_CASE(refernece_length_average)
|
||||
{
|
||||
{
|
||||
Reference ref;
|
||||
ref.push_back(4);
|
||||
@ -49,7 +52,8 @@ BOOST_AUTO_TEST_CASE(refernece_length_average) {
|
||||
}
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE(refernece_length_closest) {
|
||||
BOOST_AUTO_TEST_CASE(refernece_length_closest)
|
||||
{
|
||||
{
|
||||
Reference ref;
|
||||
ref.push_back(4);
|
||||
@ -92,7 +96,8 @@ BOOST_AUTO_TEST_CASE(refernece_length_closest) {
|
||||
}
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE(refernece_length_shortest) {
|
||||
BOOST_AUTO_TEST_CASE(refernece_length_shortest)
|
||||
{
|
||||
{
|
||||
Reference ref;
|
||||
ref.push_back(4);
|
||||
|
@ -5,19 +5,26 @@
|
||||
|
||||
namespace MosesTuning
|
||||
{
|
||||
|
||||
|
||||
template <class T>
|
||||
class ScopedVector {
|
||||
public:
|
||||
class ScopedVector
|
||||
{
|
||||
public:
|
||||
typedef typename std::vector<T*>::iterator iterator;
|
||||
typedef typename std::vector<T*>::const_iterator const_iterator;
|
||||
|
||||
ScopedVector() {}
|
||||
virtual ~ScopedVector() { reset(); }
|
||||
virtual ~ScopedVector() {
|
||||
reset();
|
||||
}
|
||||
|
||||
bool empty() const { return m_vec.empty(); }
|
||||
bool empty() const {
|
||||
return m_vec.empty();
|
||||
}
|
||||
|
||||
void push_back(T *e) { m_vec.push_back(e); }
|
||||
void push_back(T *e) {
|
||||
m_vec.push_back(e);
|
||||
}
|
||||
|
||||
void reset() {
|
||||
for (iterator it = m_vec.begin(); it != m_vec.end(); ++it) {
|
||||
@ -26,27 +33,53 @@ class ScopedVector {
|
||||
m_vec.clear();
|
||||
}
|
||||
|
||||
void reserve(std::size_t capacity) { m_vec.reserve(capacity); }
|
||||
void resize(std::size_t size) { m_vec.resize(size); }
|
||||
void reserve(std::size_t capacity) {
|
||||
m_vec.reserve(capacity);
|
||||
}
|
||||
void resize(std::size_t size) {
|
||||
m_vec.resize(size);
|
||||
}
|
||||
|
||||
std::size_t size() const {return m_vec.size(); }
|
||||
std::size_t size() const {
|
||||
return m_vec.size();
|
||||
}
|
||||
|
||||
iterator begin() { return m_vec.begin(); }
|
||||
const_iterator begin() const { return m_vec.begin(); }
|
||||
iterator begin() {
|
||||
return m_vec.begin();
|
||||
}
|
||||
const_iterator begin() const {
|
||||
return m_vec.begin();
|
||||
}
|
||||
|
||||
iterator end() { return m_vec.end(); }
|
||||
const_iterator end() const { return m_vec.end(); }
|
||||
iterator end() {
|
||||
return m_vec.end();
|
||||
}
|
||||
const_iterator end() const {
|
||||
return m_vec.end();
|
||||
}
|
||||
|
||||
std::vector<T*>& get() { return m_vec; }
|
||||
const std::vector<T*>& get() const { return m_vec; }
|
||||
std::vector<T*>& get() {
|
||||
return m_vec;
|
||||
}
|
||||
const std::vector<T*>& get() const {
|
||||
return m_vec;
|
||||
}
|
||||
|
||||
std::vector<T*>* operator->() { return &m_vec; }
|
||||
const std::vector<T*>* operator->() const { return &m_vec; }
|
||||
std::vector<T*>* operator->() {
|
||||
return &m_vec;
|
||||
}
|
||||
const std::vector<T*>* operator->() const {
|
||||
return &m_vec;
|
||||
}
|
||||
|
||||
T*& operator[](std::size_t i) { return m_vec[i]; }
|
||||
const T* operator[](std::size_t i) const { return m_vec[i]; }
|
||||
T*& operator[](std::size_t i) {
|
||||
return m_vec[i];
|
||||
}
|
||||
const T* operator[](std::size_t i) const {
|
||||
return m_vec[i];
|
||||
}
|
||||
|
||||
private:
|
||||
private:
|
||||
std::vector<T*> m_vec;
|
||||
|
||||
// no copying allowed.
|
||||
|
@ -17,12 +17,12 @@ namespace MosesTuning
|
||||
|
||||
|
||||
ScoreArray::ScoreArray()
|
||||
: m_num_scores(0), m_index(0) {}
|
||||
: m_num_scores(0), m_index(0) {}
|
||||
|
||||
void ScoreArray::savetxt(ostream* os, const string& sctype)
|
||||
{
|
||||
*os << SCORES_TXT_BEGIN << " " << m_index << " " << m_array.size()
|
||||
<< " " << m_num_scores << " " << sctype << endl;
|
||||
<< " " << m_num_scores << " " << sctype << endl;
|
||||
for (scorearray_t::iterator i = m_array.begin(); i !=m_array.end(); i++) {
|
||||
i->savetxt(os);
|
||||
*os << endl;
|
||||
@ -33,7 +33,7 @@ void ScoreArray::savetxt(ostream* os, const string& sctype)
|
||||
void ScoreArray::savebin(ostream* os, const string& score_type)
|
||||
{
|
||||
*os << SCORES_BIN_BEGIN << " " << m_index << " " << m_array.size()
|
||||
<< " " << m_num_scores << " " << score_type << endl;
|
||||
<< " " << m_num_scores << " " << score_type << endl;
|
||||
for (scorearray_t::iterator i = m_array.begin();
|
||||
i != m_array.end(); i++) {
|
||||
i->savebin(os);
|
||||
@ -63,7 +63,8 @@ void ScoreArray::save(const string &file, const string& score_type, bool bin)
|
||||
ofs.close();
|
||||
}
|
||||
|
||||
void ScoreArray::save(const string& score_type, bool bin) {
|
||||
void ScoreArray::save(const string& score_type, bool bin)
|
||||
{
|
||||
save(&cout, score_type, bin);
|
||||
}
|
||||
|
||||
|
@ -25,7 +25,7 @@ const char SCORES_BIN_END[] = "SCORES_BIN_END_0";
|
||||
|
||||
class ScoreArray
|
||||
{
|
||||
private:
|
||||
private:
|
||||
scorearray_t m_array;
|
||||
std::string m_score_type;
|
||||
std::size_t m_num_scores;
|
||||
@ -38,17 +38,29 @@ public:
|
||||
ScoreArray();
|
||||
~ScoreArray() {}
|
||||
|
||||
void clear() { m_array.clear(); }
|
||||
void clear() {
|
||||
m_array.clear();
|
||||
}
|
||||
|
||||
int getIndex() const { return m_index; }
|
||||
int getIndex() const {
|
||||
return m_index;
|
||||
}
|
||||
|
||||
void setIndex(int value) { m_index = value; }
|
||||
void setIndex(int value) {
|
||||
m_index = value;
|
||||
}
|
||||
|
||||
ScoreStats& get(std::size_t i) { return m_array.at(i); }
|
||||
ScoreStats& get(std::size_t i) {
|
||||
return m_array.at(i);
|
||||
}
|
||||
|
||||
const ScoreStats& get(std::size_t i) const { return m_array.at(i); }
|
||||
const ScoreStats& get(std::size_t i) const {
|
||||
return m_array.at(i);
|
||||
}
|
||||
|
||||
void add(const ScoreStats& e) { m_array.push_back(e); }
|
||||
void add(const ScoreStats& e) {
|
||||
m_array.push_back(e);
|
||||
}
|
||||
|
||||
//ADDED BY TS
|
||||
void swap(std::size_t i, std::size_t j) {
|
||||
@ -62,15 +74,25 @@ public:
|
||||
|
||||
void merge(ScoreArray& e);
|
||||
|
||||
std::string name() const { return m_score_type; }
|
||||
std::string name() const {
|
||||
return m_score_type;
|
||||
}
|
||||
|
||||
void name(std::string &score_type) { m_score_type = score_type; }
|
||||
void name(std::string &score_type) {
|
||||
m_score_type = score_type;
|
||||
}
|
||||
|
||||
std::size_t size() const { return m_array.size(); }
|
||||
std::size_t size() const {
|
||||
return m_array.size();
|
||||
}
|
||||
|
||||
std::size_t NumberOfScores() const { return m_num_scores; }
|
||||
std::size_t NumberOfScores() const {
|
||||
return m_num_scores;
|
||||
}
|
||||
|
||||
void NumberOfScores(std::size_t v) { m_num_scores = v; }
|
||||
void NumberOfScores(std::size_t v) {
|
||||
m_num_scores = v;
|
||||
}
|
||||
|
||||
void savetxt(std::ostream* os, const std::string& score_type);
|
||||
void savebin(std::ostream* os, const std::string& score_type);
|
||||
|
@ -50,7 +50,8 @@ void ScoreData::save(const string &file, bool bin)
|
||||
ofs.close();
|
||||
}
|
||||
|
||||
void ScoreData::save(bool bin) {
|
||||
void ScoreData::save(bool bin)
|
||||
{
|
||||
save(&cout, bin);
|
||||
}
|
||||
|
||||
|
@ -40,7 +40,9 @@ public:
|
||||
ScoreData(Scorer* scorer);
|
||||
~ScoreData() {}
|
||||
|
||||
void clear() { m_array.clear(); }
|
||||
void clear() {
|
||||
m_array.clear();
|
||||
}
|
||||
|
||||
inline ScoreArray& get(std::size_t idx) {
|
||||
return m_array.at(idx);
|
||||
@ -66,7 +68,9 @@ public:
|
||||
return m_array.at(i).get(j);
|
||||
}
|
||||
|
||||
std::string name() const { return m_score_type; }
|
||||
std::string name() const {
|
||||
return m_score_type;
|
||||
}
|
||||
|
||||
std::string name(const std::string &score_type) {
|
||||
return m_score_type = score_type;
|
||||
@ -75,8 +79,12 @@ public:
|
||||
void add(ScoreArray& e);
|
||||
void add(const ScoreStats& e, int sent_idx);
|
||||
|
||||
std::size_t NumberOfScores() const { return m_num_scores; }
|
||||
std::size_t size() const { return m_array.size(); }
|
||||
std::size_t NumberOfScores() const {
|
||||
return m_num_scores;
|
||||
}
|
||||
std::size_t size() const {
|
||||
return m_array.size();
|
||||
}
|
||||
|
||||
void save(const std::string &file, bool bin=false);
|
||||
void save(std::ostream* os, bool bin=false);
|
||||
|
@ -29,18 +29,20 @@ using namespace util;
|
||||
|
||||
namespace MosesTuning
|
||||
{
|
||||
|
||||
|
||||
|
||||
ScoreDataIterator::ScoreDataIterator() {}
|
||||
|
||||
ScoreDataIterator::ScoreDataIterator(const string& filename) {
|
||||
ScoreDataIterator::ScoreDataIterator(const string& filename)
|
||||
{
|
||||
m_in.reset(new FilePiece(filename.c_str()));
|
||||
readNext();
|
||||
}
|
||||
|
||||
ScoreDataIterator::~ScoreDataIterator() {}
|
||||
|
||||
void ScoreDataIterator::readNext() {
|
||||
void ScoreDataIterator::readNext()
|
||||
{
|
||||
m_next.clear();
|
||||
try {
|
||||
StringPiece marker = m_in->ReadDelimited();
|
||||
@ -71,12 +73,14 @@ void ScoreDataIterator::readNext() {
|
||||
}
|
||||
}
|
||||
|
||||
void ScoreDataIterator::increment() {
|
||||
void ScoreDataIterator::increment()
|
||||
{
|
||||
readNext();
|
||||
}
|
||||
|
||||
|
||||
bool ScoreDataIterator::equal(const ScoreDataIterator& rhs) const {
|
||||
bool ScoreDataIterator::equal(const ScoreDataIterator& rhs) const
|
||||
{
|
||||
if (!m_in && !rhs.m_in) {
|
||||
return true;
|
||||
} else if (!m_in) {
|
||||
@ -84,13 +88,14 @@ bool ScoreDataIterator::equal(const ScoreDataIterator& rhs) const {
|
||||
} else if (!rhs.m_in) {
|
||||
return false;
|
||||
} else {
|
||||
return m_in->FileName() == rhs.m_in->FileName() &&
|
||||
m_in->Offset() == rhs.m_in->Offset();
|
||||
return m_in->FileName() == rhs.m_in->FileName() &&
|
||||
m_in->Offset() == rhs.m_in->Offset();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
const vector<ScoreDataItem>& ScoreDataIterator::dereference() const {
|
||||
const vector<ScoreDataItem>& ScoreDataIterator::dereference() const
|
||||
{
|
||||
return m_next;
|
||||
}
|
||||
|
||||
|
@ -33,40 +33,43 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
|
||||
#include "FeatureDataIterator.h"
|
||||
|
||||
namespace util { class FilePiece; }
|
||||
namespace util
|
||||
{
|
||||
class FilePiece;
|
||||
}
|
||||
|
||||
namespace MosesTuning
|
||||
{
|
||||
|
||||
|
||||
|
||||
typedef std::vector<float> ScoreDataItem;
|
||||
|
||||
class ScoreDataIterator :
|
||||
public boost::iterator_facade<ScoreDataIterator,
|
||||
const std::vector<ScoreDataItem>,
|
||||
boost::forward_traversal_tag>
|
||||
const std::vector<ScoreDataItem>,
|
||||
boost::forward_traversal_tag>
|
||||
{
|
||||
public:
|
||||
ScoreDataIterator();
|
||||
explicit ScoreDataIterator(const std::string& filename);
|
||||
public:
|
||||
ScoreDataIterator();
|
||||
explicit ScoreDataIterator(const std::string& filename);
|
||||
|
||||
~ScoreDataIterator();
|
||||
~ScoreDataIterator();
|
||||
|
||||
static ScoreDataIterator end() {
|
||||
return ScoreDataIterator();
|
||||
}
|
||||
static ScoreDataIterator end() {
|
||||
return ScoreDataIterator();
|
||||
}
|
||||
|
||||
private:
|
||||
friend class boost::iterator_core_access;
|
||||
private:
|
||||
friend class boost::iterator_core_access;
|
||||
|
||||
void increment();
|
||||
bool equal(const ScoreDataIterator& rhs) const;
|
||||
const std::vector<ScoreDataItem>& dereference() const;
|
||||
void increment();
|
||||
bool equal(const ScoreDataIterator& rhs) const;
|
||||
const std::vector<ScoreDataItem>& dereference() const;
|
||||
|
||||
void readNext();
|
||||
void readNext();
|
||||
|
||||
boost::shared_ptr<util::FilePiece> m_in;
|
||||
std::vector<ScoreDataItem> m_next;
|
||||
boost::shared_ptr<util::FilePiece> m_in;
|
||||
std::vector<ScoreDataItem> m_next;
|
||||
};
|
||||
|
||||
}
|
||||
|
@ -13,21 +13,22 @@
|
||||
|
||||
using namespace std;
|
||||
|
||||
namespace {
|
||||
namespace
|
||||
{
|
||||
const int kAvailableSize = 8;
|
||||
} // namespace
|
||||
|
||||
namespace MosesTuning
|
||||
{
|
||||
|
||||
|
||||
|
||||
ScoreStats::ScoreStats()
|
||||
: m_available_size(kAvailableSize), m_entries(0),
|
||||
m_array(new ScoreStatsType[m_available_size]) {}
|
||||
: m_available_size(kAvailableSize), m_entries(0),
|
||||
m_array(new ScoreStatsType[m_available_size]) {}
|
||||
|
||||
ScoreStats::ScoreStats(const size_t size)
|
||||
: m_available_size(size), m_entries(size),
|
||||
m_array(new ScoreStatsType[m_available_size])
|
||||
: m_available_size(size), m_entries(size),
|
||||
m_array(new ScoreStatsType[m_available_size])
|
||||
{
|
||||
memset(m_array, 0, GetArraySizeWithBytes());
|
||||
}
|
||||
@ -123,7 +124,8 @@ void ScoreStats::savetxt(ostream* os)
|
||||
*os << *this;
|
||||
}
|
||||
|
||||
void ScoreStats::savetxt() {
|
||||
void ScoreStats::savetxt()
|
||||
{
|
||||
savetxt(&cout);
|
||||
}
|
||||
|
||||
@ -140,7 +142,8 @@ ostream& operator<<(ostream& o, const ScoreStats& e)
|
||||
return o;
|
||||
}
|
||||
|
||||
bool operator==(const ScoreStats& s1, const ScoreStats& s2) {
|
||||
bool operator==(const ScoreStats& s1, const ScoreStats& s2)
|
||||
{
|
||||
size_t size = s1.size();
|
||||
|
||||
if (size != s2.size())
|
||||
|
@ -18,7 +18,7 @@
|
||||
|
||||
namespace MosesTuning
|
||||
{
|
||||
|
||||
|
||||
|
||||
class ScoreStats
|
||||
{
|
||||
@ -41,7 +41,9 @@ public:
|
||||
|
||||
void Copy(const ScoreStats &stats);
|
||||
|
||||
bool isfull() const { return (m_entries < m_available_size) ? 0 : 1; }
|
||||
bool isfull() const {
|
||||
return (m_entries < m_available_size) ? 0 : 1;
|
||||
}
|
||||
|
||||
void expand();
|
||||
void add(ScoreStatsType v);
|
||||
@ -55,9 +57,15 @@ public:
|
||||
clear();
|
||||
}
|
||||
|
||||
ScoreStatsType get(std::size_t i) { return m_array[i]; }
|
||||
ScoreStatsType get(std::size_t i) const { return m_array[i]; }
|
||||
scorestats_t getArray() const { return m_array; }
|
||||
ScoreStatsType get(std::size_t i) {
|
||||
return m_array[i];
|
||||
}
|
||||
ScoreStatsType get(std::size_t i) const {
|
||||
return m_array[i];
|
||||
}
|
||||
scorestats_t getArray() const {
|
||||
return m_array;
|
||||
}
|
||||
|
||||
void set(const std::string& str);
|
||||
|
||||
@ -69,15 +77,21 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
std::size_t bytes() const { return GetArraySizeWithBytes(); }
|
||||
std::size_t bytes() const {
|
||||
return GetArraySizeWithBytes();
|
||||
}
|
||||
|
||||
std::size_t GetArraySizeWithBytes() const {
|
||||
return m_entries * sizeof(ScoreStatsType);
|
||||
}
|
||||
|
||||
std::size_t size() const { return m_entries; }
|
||||
std::size_t size() const {
|
||||
return m_entries;
|
||||
}
|
||||
|
||||
std::size_t available() const { return m_available_size; }
|
||||
std::size_t available() const {
|
||||
return m_available_size;
|
||||
}
|
||||
|
||||
void savetxt(const std::string &file);
|
||||
void savetxt(std::ostream* os);
|
||||
|
@ -12,27 +12,31 @@ using namespace std;
|
||||
namespace MosesTuning
|
||||
{
|
||||
|
||||
namespace {
|
||||
namespace
|
||||
{
|
||||
// For tokenizing a hypothesis translation, we may encounter unknown tokens which
|
||||
// do not exist in the corresponding reference translations.
|
||||
const int kUnknownToken = -1;
|
||||
} // namespace
|
||||
|
||||
Scorer::Scorer(const string& name, const string& config)
|
||||
: m_name(name),
|
||||
m_vocab(mert::VocabularyFactory::GetVocabulary()),
|
||||
m_filter(NULL),
|
||||
m_score_data(NULL),
|
||||
m_enable_preserve_case(true) {
|
||||
: m_name(name),
|
||||
m_vocab(mert::VocabularyFactory::GetVocabulary()),
|
||||
m_filter(NULL),
|
||||
m_score_data(NULL),
|
||||
m_enable_preserve_case(true)
|
||||
{
|
||||
InitConfig(config);
|
||||
}
|
||||
|
||||
Scorer::~Scorer() {
|
||||
Scorer::~Scorer()
|
||||
{
|
||||
Singleton<mert::Vocabulary>::Delete();
|
||||
delete m_filter;
|
||||
}
|
||||
|
||||
void Scorer::InitConfig(const string& config) {
|
||||
void Scorer::InitConfig(const string& config)
|
||||
{
|
||||
// cerr << "Scorer config string: " << config << endl;
|
||||
size_t start = 0;
|
||||
while (start < config.size()) {
|
||||
@ -53,7 +57,8 @@ void Scorer::InitConfig(const string& config) {
|
||||
}
|
||||
}
|
||||
|
||||
void Scorer::TokenizeAndEncode(const string& line, vector<int>& encoded) {
|
||||
void Scorer::TokenizeAndEncode(const string& line, vector<int>& encoded)
|
||||
{
|
||||
for (util::TokenIter<util::AnyCharacter, true> it(line, util::AnyCharacter(" "));
|
||||
it; ++it) {
|
||||
if (!m_enable_preserve_case) {
|
||||
@ -69,7 +74,8 @@ void Scorer::TokenizeAndEncode(const string& line, vector<int>& encoded) {
|
||||
}
|
||||
}
|
||||
|
||||
void Scorer::TokenizeAndEncodeTesting(const string& line, vector<int>& encoded) {
|
||||
void Scorer::TokenizeAndEncodeTesting(const string& line, vector<int>& encoded)
|
||||
{
|
||||
for (util::TokenIter<util::AnyCharacter, true> it(line, util::AnyCharacter(" "));
|
||||
it; ++it) {
|
||||
if (!m_enable_preserve_case) {
|
||||
@ -103,8 +109,7 @@ void Scorer::setFactors(const string& factors)
|
||||
if (factors.empty()) return;
|
||||
vector<string> factors_vec;
|
||||
split(factors, '|', factors_vec);
|
||||
for(vector<string>::iterator it = factors_vec.begin(); it != factors_vec.end(); ++it)
|
||||
{
|
||||
for(vector<string>::iterator it = factors_vec.begin(); it != factors_vec.end(); ++it) {
|
||||
int factor = atoi(it->c_str());
|
||||
m_factors.push_back(factor);
|
||||
}
|
||||
@ -115,8 +120,8 @@ void Scorer::setFactors(const string& factors)
|
||||
*/
|
||||
void Scorer::setFilter(const string& filterCommand)
|
||||
{
|
||||
if (filterCommand.empty()) return;
|
||||
m_filter = new PreProcessFilter(filterCommand);
|
||||
if (filterCommand.empty()) return;
|
||||
m_filter = new PreProcessFilter(filterCommand);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -130,8 +135,7 @@ string Scorer::applyFactors(const string& sentence) const
|
||||
split(sentence, ' ', tokens);
|
||||
|
||||
stringstream sstream;
|
||||
for (size_t i = 0; i < tokens.size(); ++i)
|
||||
{
|
||||
for (size_t i = 0; i < tokens.size(); ++i) {
|
||||
if (tokens[i] == "") continue;
|
||||
|
||||
vector<string> factors;
|
||||
@ -141,8 +145,7 @@ string Scorer::applyFactors(const string& sentence) const
|
||||
|
||||
if (i > 0) sstream << " ";
|
||||
|
||||
for (size_t j = 0; j < m_factors.size(); ++j)
|
||||
{
|
||||
for (size_t j = 0; j < m_factors.size(); ++j) {
|
||||
int findex = m_factors[j];
|
||||
if (findex < 0 || findex >= fsize) throw runtime_error("Factor index is out of range.");
|
||||
|
||||
@ -158,17 +161,15 @@ string Scorer::applyFactors(const string& sentence) const
|
||||
*/
|
||||
string Scorer::applyFilter(const string& sentence) const
|
||||
{
|
||||
if (m_filter)
|
||||
{
|
||||
if (m_filter) {
|
||||
return m_filter->ProcessSentence(sentence);
|
||||
}
|
||||
else
|
||||
{
|
||||
} else {
|
||||
return sentence;
|
||||
}
|
||||
}
|
||||
|
||||
float Scorer::score(const candidates_t& candidates) const {
|
||||
float Scorer::score(const candidates_t& candidates) const
|
||||
{
|
||||
diffs_t diffs;
|
||||
statscores_t scores;
|
||||
score(candidates, diffs, scores);
|
||||
|
@ -10,7 +10,8 @@
|
||||
#include "Types.h"
|
||||
#include "ScoreData.h"
|
||||
|
||||
namespace mert {
|
||||
namespace mert
|
||||
{
|
||||
|
||||
class Vocabulary;
|
||||
|
||||
@ -32,7 +33,7 @@ enum ScorerRegularisationStrategy {REG_NONE, REG_AVERAGE, REG_MINIMUM};
|
||||
*/
|
||||
class Scorer
|
||||
{
|
||||
public:
|
||||
public:
|
||||
Scorer(const std::string& name, const std::string& config);
|
||||
virtual ~Scorer();
|
||||
|
||||
@ -117,14 +118,16 @@ class Scorer
|
||||
*/
|
||||
virtual void setFactors(const std::string& factors);
|
||||
|
||||
mert::Vocabulary* GetVocab() const { return m_vocab; }
|
||||
mert::Vocabulary* GetVocab() const {
|
||||
return m_vocab;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set unix filter, which will be used to preprocess the sentences
|
||||
*/
|
||||
virtual void setFilter(const std::string& filterCommand);
|
||||
|
||||
private:
|
||||
private:
|
||||
void InitConfig(const std::string& config);
|
||||
|
||||
/**
|
||||
@ -143,7 +146,7 @@ class Scorer
|
||||
std::vector<int> m_factors;
|
||||
PreProcessFilter* m_filter;
|
||||
|
||||
protected:
|
||||
protected:
|
||||
ScoreData* m_score_data;
|
||||
bool m_enable_preserve_case;
|
||||
|
||||
@ -173,40 +176,40 @@ class Scorer
|
||||
/**
|
||||
* Every inherited scorer should call this function for each sentence
|
||||
*/
|
||||
std::string preprocessSentence(const std::string& sentence) const
|
||||
{
|
||||
std::string preprocessSentence(const std::string& sentence) const {
|
||||
return applyFactors(applyFilter(sentence));
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
namespace {
|
||||
namespace
|
||||
{
|
||||
|
||||
//regularisation strategies
|
||||
inline float score_min(const statscores_t& scores, size_t start, size_t end)
|
||||
{
|
||||
float min = std::numeric_limits<float>::max();
|
||||
for (size_t i = start; i < end; ++i) {
|
||||
if (scores[i] < min) {
|
||||
min = scores[i];
|
||||
}
|
||||
//regularisation strategies
|
||||
inline float score_min(const statscores_t& scores, size_t start, size_t end)
|
||||
{
|
||||
float min = std::numeric_limits<float>::max();
|
||||
for (size_t i = start; i < end; ++i) {
|
||||
if (scores[i] < min) {
|
||||
min = scores[i];
|
||||
}
|
||||
return min;
|
||||
}
|
||||
return min;
|
||||
}
|
||||
|
||||
inline float score_average(const statscores_t& scores, size_t start, size_t end)
|
||||
{
|
||||
if ((end - start) < 1) {
|
||||
// this shouldn't happen
|
||||
return 0;
|
||||
}
|
||||
float total = 0;
|
||||
for (size_t j = start; j < end; ++j) {
|
||||
total += scores[j];
|
||||
}
|
||||
|
||||
inline float score_average(const statscores_t& scores, size_t start, size_t end)
|
||||
{
|
||||
if ((end - start) < 1) {
|
||||
// this shouldn't happen
|
||||
return 0;
|
||||
}
|
||||
float total = 0;
|
||||
for (size_t j = start; j < end; ++j) {
|
||||
total += scores[j];
|
||||
}
|
||||
|
||||
return total / (end - start);
|
||||
}
|
||||
return total / (end - start);
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
|
@ -14,9 +14,10 @@ using namespace std;
|
||||
|
||||
namespace MosesTuning
|
||||
{
|
||||
|
||||
|
||||
vector<string> ScorerFactory::getTypes() {
|
||||
|
||||
vector<string> ScorerFactory::getTypes()
|
||||
{
|
||||
vector<string> types;
|
||||
types.push_back(string("BLEU"));
|
||||
types.push_back(string("PER"));
|
||||
@ -29,7 +30,8 @@ vector<string> ScorerFactory::getTypes() {
|
||||
return types;
|
||||
}
|
||||
|
||||
Scorer* ScorerFactory::getScorer(const string& type, const string& config) {
|
||||
Scorer* ScorerFactory::getScorer(const string& type, const string& config)
|
||||
{
|
||||
if (type == "BLEU") {
|
||||
return new BleuScorer(config);
|
||||
} else if (type == "PER") {
|
||||
@ -48,8 +50,7 @@ Scorer* ScorerFactory::getScorer(const string& type, const string& config) {
|
||||
} else {
|
||||
if (type.find(',') != string::npos) {
|
||||
return new InterpolatedScorer(type, config);
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
throw runtime_error("Unknown scorer type: " + type);
|
||||
}
|
||||
}
|
||||
|
@ -6,7 +6,7 @@
|
||||
|
||||
namespace MosesTuning
|
||||
{
|
||||
|
||||
|
||||
|
||||
class Scorer;
|
||||
|
||||
|
@ -6,7 +6,8 @@
|
||||
|
||||
using namespace std;
|
||||
|
||||
namespace {
|
||||
namespace
|
||||
{
|
||||
|
||||
MosesTuning::SemposOverlapping* g_overlapping = NULL;
|
||||
|
||||
@ -14,9 +15,10 @@ MosesTuning::SemposOverlapping* g_overlapping = NULL;
|
||||
|
||||
namespace MosesTuning
|
||||
{
|
||||
|
||||
|
||||
SemposOverlapping* SemposOverlappingFactory::GetOverlapping(const string& str, const SemposScorer* sempos) {
|
||||
|
||||
SemposOverlapping* SemposOverlappingFactory::GetOverlapping(const string& str, const SemposScorer* sempos)
|
||||
{
|
||||
if (str == "cap-micro") {
|
||||
return new CapMicroOverlapping(sempos);
|
||||
} else if (str == "cap-macro") {
|
||||
@ -26,7 +28,8 @@ SemposOverlapping* SemposOverlappingFactory::GetOverlapping(const string& str, c
|
||||
}
|
||||
}
|
||||
|
||||
void SemposOverlappingFactory::SetOverlapping(SemposOverlapping* ovr) {
|
||||
void SemposOverlappingFactory::SetOverlapping(SemposOverlapping* ovr)
|
||||
{
|
||||
g_overlapping = ovr;
|
||||
}
|
||||
|
||||
@ -41,15 +44,13 @@ vector<int> CapMicroOverlapping::prepareStats(const sentence_t& cand, const sent
|
||||
int multCoeff = 1000;
|
||||
|
||||
float interSum = 0;
|
||||
for (sentence_t::iterator it = intersection.begin(); it != intersection.end(); it++)
|
||||
{
|
||||
for (sentence_t::iterator it = intersection.begin(); it != intersection.end(); it++) {
|
||||
interSum += semposScorer->weight(it->first);
|
||||
}
|
||||
|
||||
float refSum = 0;
|
||||
for (sentence_t::iterator it = ref.begin(); it != ref.end(); it++)
|
||||
{
|
||||
refSum += semposScorer->weight(it->first);
|
||||
for (sentence_t::iterator it = ref.begin(); it != ref.end(); it++) {
|
||||
refSum += semposScorer->weight(it->first);
|
||||
}
|
||||
|
||||
stats[0] = (int)(multCoeff * interSum);
|
||||
|
@ -9,7 +9,7 @@
|
||||
|
||||
namespace MosesTuning
|
||||
{
|
||||
|
||||
|
||||
|
||||
class SemposScorer;
|
||||
|
||||
@ -36,14 +36,15 @@ public:
|
||||
virtual std::size_t NumberOfScores() const = 0;
|
||||
};
|
||||
|
||||
class SemposOverlappingFactory {
|
||||
public:
|
||||
class SemposOverlappingFactory
|
||||
{
|
||||
public:
|
||||
static SemposOverlapping* GetOverlapping(const std::string& str, const SemposScorer* sempos);
|
||||
|
||||
// dependency injection for unit testing.
|
||||
static void SetOverlapping(SemposOverlapping* ovr);
|
||||
|
||||
private:
|
||||
private:
|
||||
SemposOverlappingFactory() {}
|
||||
~SemposOverlappingFactory() {}
|
||||
};
|
||||
@ -62,9 +63,11 @@ public:
|
||||
|
||||
virtual std::vector<int> prepareStats(const sentence_t& cand, const sentence_t& ref);
|
||||
virtual float calculateScore(const std::vector<int>& stats) const;
|
||||
virtual std::size_t NumberOfScores() const { return 2; }
|
||||
virtual std::size_t NumberOfScores() const {
|
||||
return 2;
|
||||
}
|
||||
|
||||
private:
|
||||
private:
|
||||
// no copying allowed.
|
||||
CapMicroOverlapping(const CapMicroOverlapping&);
|
||||
CapMicroOverlapping& operator=(const CapMicroOverlapping&);
|
||||
@ -82,9 +85,11 @@ public:
|
||||
|
||||
virtual std::vector<int> prepareStats(const sentence_t& cand, const sentence_t& ref);
|
||||
virtual float calculateScore(const std::vector<int>& stats) const;
|
||||
virtual std::size_t NumberOfScores() const { return kMaxNOC * 2; }
|
||||
virtual std::size_t NumberOfScores() const {
|
||||
return kMaxNOC * 2;
|
||||
}
|
||||
|
||||
private:
|
||||
private:
|
||||
// no copying allowed.
|
||||
CapMacroOverlapping(const CapMacroOverlapping&);
|
||||
CapMacroOverlapping& operator=(const CapMacroOverlapping&);
|
||||
|
@ -12,7 +12,7 @@ using namespace std;
|
||||
|
||||
namespace MosesTuning
|
||||
{
|
||||
|
||||
|
||||
|
||||
SemposScorer::SemposScorer(const string& config)
|
||||
: StatisticsBasedScorer("SEMPOS", config),
|
||||
@ -25,8 +25,7 @@ SemposScorer::SemposScorer(const string& config)
|
||||
m_semposMap.clear();
|
||||
|
||||
string weightsfile = getConfig("weightsfile", "");
|
||||
if (weightsfile != "")
|
||||
{
|
||||
if (weightsfile != "") {
|
||||
loadWeights(weightsfile);
|
||||
}
|
||||
}
|
||||
@ -144,42 +143,35 @@ int SemposScorer::encodeSempos(const string& sempos)
|
||||
|
||||
float SemposScorer::weight(int item) const
|
||||
{
|
||||
std::map<int,float>::const_iterator it = weightsMap.find(item);
|
||||
if (it == weightsMap.end())
|
||||
{
|
||||
return 1.0f;
|
||||
}
|
||||
else
|
||||
{
|
||||
return it->second;
|
||||
}
|
||||
std::map<int,float>::const_iterator it = weightsMap.find(item);
|
||||
if (it == weightsMap.end()) {
|
||||
return 1.0f;
|
||||
} else {
|
||||
return it->second;
|
||||
}
|
||||
}
|
||||
|
||||
void SemposScorer::loadWeights(const string& weightsfile)
|
||||
{
|
||||
string line;
|
||||
ifstream myfile;
|
||||
myfile.open(weightsfile.c_str(), ifstream::in);
|
||||
if (myfile.is_open())
|
||||
{
|
||||
while ( myfile.good() )
|
||||
{
|
||||
getline (myfile,line);
|
||||
vector<string> fields;
|
||||
if (line == "") continue;
|
||||
split(line, '\t', fields);
|
||||
if (fields.size() != 2) throw std::runtime_error("Bad format of a row in weights file.");
|
||||
int encoded = encodeString(fields[0]);
|
||||
float weight = atof(fields[1].c_str());
|
||||
weightsMap[encoded] = weight;
|
||||
}
|
||||
myfile.close();
|
||||
}
|
||||
else
|
||||
{
|
||||
cerr << "Unable to open file "<< weightsfile << endl;
|
||||
exit(1);
|
||||
string line;
|
||||
ifstream myfile;
|
||||
myfile.open(weightsfile.c_str(), ifstream::in);
|
||||
if (myfile.is_open()) {
|
||||
while ( myfile.good() ) {
|
||||
getline (myfile,line);
|
||||
vector<string> fields;
|
||||
if (line == "") continue;
|
||||
split(line, '\t', fields);
|
||||
if (fields.size() != 2) throw std::runtime_error("Bad format of a row in weights file.");
|
||||
int encoded = encodeString(fields[0]);
|
||||
float weight = atof(fields[1].c_str());
|
||||
weightsMap[encoded] = weight;
|
||||
}
|
||||
myfile.close();
|
||||
} else {
|
||||
cerr << "Unable to open file "<< weightsfile << endl;
|
||||
exit(1);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
@ -19,7 +19,7 @@
|
||||
|
||||
namespace MosesTuning
|
||||
{
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* This class represents sempos based metrics.
|
||||
@ -32,12 +32,16 @@ public:
|
||||
|
||||
virtual void setReferenceFiles(const std::vector<std::string>& referenceFiles);
|
||||
virtual void prepareStats(std::size_t sindex, const std::string& text, ScoreStats& entry);
|
||||
virtual std::size_t NumberOfScores() const { return m_ovr->NumberOfScores(); }
|
||||
virtual std::size_t NumberOfScores() const {
|
||||
return m_ovr->NumberOfScores();
|
||||
}
|
||||
virtual float calculateScore(const std::vector<int>& comps) const {
|
||||
return m_ovr->calculateScore(comps);
|
||||
}
|
||||
|
||||
bool EnableDebug() const { return m_enable_debug; }
|
||||
bool EnableDebug() const {
|
||||
return m_enable_debug;
|
||||
}
|
||||
|
||||
float weight(int item) const;
|
||||
|
||||
|
@ -17,48 +17,50 @@ namespace MosesTuning
|
||||
{
|
||||
|
||||
SentenceLevelScorer::SentenceLevelScorer(const string& name, const string& config)
|
||||
: Scorer(name, config),
|
||||
m_regularisationStrategy(REG_NONE),
|
||||
m_regularisationWindow(0) {
|
||||
: Scorer(name, config),
|
||||
m_regularisationStrategy(REG_NONE),
|
||||
m_regularisationWindow(0)
|
||||
{
|
||||
Init();
|
||||
}
|
||||
|
||||
SentenceLevelScorer::~SentenceLevelScorer() {}
|
||||
|
||||
void SentenceLevelScorer::Init() {
|
||||
// Configure regularisation.
|
||||
static string KEY_TYPE = "regtype";
|
||||
static string KEY_WINDOW = "regwin";
|
||||
static string KEY_CASE = "case";
|
||||
static string TYPE_NONE = "none";
|
||||
static string TYPE_AVERAGE = "average";
|
||||
static string TYPE_MINIMUM = "min";
|
||||
static string TRUE = "true";
|
||||
static string FALSE = "false";
|
||||
void SentenceLevelScorer::Init()
|
||||
{
|
||||
// Configure regularisation.
|
||||
static string KEY_TYPE = "regtype";
|
||||
static string KEY_WINDOW = "regwin";
|
||||
static string KEY_CASE = "case";
|
||||
static string TYPE_NONE = "none";
|
||||
static string TYPE_AVERAGE = "average";
|
||||
static string TYPE_MINIMUM = "min";
|
||||
static string TRUE = "true";
|
||||
static string FALSE = "false";
|
||||
|
||||
const string type = getConfig(KEY_TYPE, TYPE_NONE);
|
||||
if (type == TYPE_NONE) {
|
||||
m_regularisationStrategy = REG_NONE;
|
||||
} else if (type == TYPE_AVERAGE) {
|
||||
m_regularisationStrategy = REG_AVERAGE;
|
||||
} else if (type == TYPE_MINIMUM) {
|
||||
m_regularisationStrategy = REG_MINIMUM;
|
||||
} else {
|
||||
throw boost::lexer::runtime_error("Unknown scorer regularisation strategy: " + type);
|
||||
}
|
||||
cerr << "Using scorer regularisation strategy: " << type << endl;
|
||||
const string type = getConfig(KEY_TYPE, TYPE_NONE);
|
||||
if (type == TYPE_NONE) {
|
||||
m_regularisationStrategy = REG_NONE;
|
||||
} else if (type == TYPE_AVERAGE) {
|
||||
m_regularisationStrategy = REG_AVERAGE;
|
||||
} else if (type == TYPE_MINIMUM) {
|
||||
m_regularisationStrategy = REG_MINIMUM;
|
||||
} else {
|
||||
throw boost::lexer::runtime_error("Unknown scorer regularisation strategy: " + type);
|
||||
}
|
||||
cerr << "Using scorer regularisation strategy: " << type << endl;
|
||||
|
||||
const string window = getConfig(KEY_WINDOW, "0");
|
||||
m_regularisationWindow = atoi(window.c_str());
|
||||
cerr << "Using scorer regularisation window: " << m_regularisationWindow << endl;
|
||||
const string window = getConfig(KEY_WINDOW, "0");
|
||||
m_regularisationWindow = atoi(window.c_str());
|
||||
cerr << "Using scorer regularisation window: " << m_regularisationWindow << endl;
|
||||
|
||||
const string preservecase = getConfig(KEY_CASE, TRUE);
|
||||
if (preservecase == TRUE) {
|
||||
m_enable_preserve_case = true;
|
||||
} else if (preservecase == FALSE) {
|
||||
m_enable_preserve_case = false;
|
||||
}
|
||||
cerr << "Using case preservation: " << m_enable_preserve_case << endl;
|
||||
const string preservecase = getConfig(KEY_CASE, TRUE);
|
||||
if (preservecase == TRUE) {
|
||||
m_enable_preserve_case = true;
|
||||
} else if (preservecase == FALSE) {
|
||||
m_enable_preserve_case = false;
|
||||
}
|
||||
cerr << "Using case preservation: " << m_enable_preserve_case << endl;
|
||||
}
|
||||
|
||||
void SentenceLevelScorer::score(const candidates_t& candidates, const diffs_t& diffs,
|
||||
@ -83,8 +85,8 @@ void SentenceLevelScorer::score(const candidates_t& candidates, const diffs_t&
|
||||
if (stats.size() != totals.size()) {
|
||||
stringstream msg;
|
||||
msg << "Statistics for (" << "," << candidates[i] << ") have incorrect "
|
||||
<< "number of fields. Found: " << stats.size() << " Expected: "
|
||||
<< totals.size();
|
||||
<< "number of fields. Found: " << stats.size() << " Expected: "
|
||||
<< totals.size();
|
||||
throw runtime_error(msg.str());
|
||||
}
|
||||
//Add up scores for all sentences, would normally be just one score
|
||||
|
@ -5,13 +5,14 @@
|
||||
|
||||
namespace MosesTuning
|
||||
{
|
||||
|
||||
|
||||
|
||||
// thread *un*safe singleton.
|
||||
// TODO: replace this with thread-safe singleton.
|
||||
template <typename T>
|
||||
class Singleton {
|
||||
public:
|
||||
class Singleton
|
||||
{
|
||||
public:
|
||||
static T* GetInstance() {
|
||||
if (m_instance == NULL) {
|
||||
m_instance = new T;
|
||||
@ -26,7 +27,7 @@ class Singleton {
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
private:
|
||||
Singleton();
|
||||
static T* m_instance;
|
||||
};
|
||||
|
@ -5,19 +5,24 @@
|
||||
|
||||
using namespace MosesTuning;
|
||||
|
||||
namespace {
|
||||
namespace
|
||||
{
|
||||
|
||||
static int g_count = 0;
|
||||
|
||||
class Instance {
|
||||
public:
|
||||
Instance() { ++g_count; }
|
||||
class Instance
|
||||
{
|
||||
public:
|
||||
Instance() {
|
||||
++g_count;
|
||||
}
|
||||
~Instance() {}
|
||||
};
|
||||
|
||||
} // namespace
|
||||
|
||||
BOOST_AUTO_TEST_CASE(singleton_basic) {
|
||||
BOOST_AUTO_TEST_CASE(singleton_basic)
|
||||
{
|
||||
Instance* instance1 = Singleton<Instance>::GetInstance();
|
||||
Instance* instance2 = Singleton<Instance>::GetInstance();
|
||||
Instance* instance3 = Singleton<Instance>::GetInstance();
|
||||
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user