This commit is contained in:
Hieu Hoang 2013-05-29 18:16:15 +01:00
parent 59bd7deb4b
commit 6249432407
501 changed files with 20914 additions and 20027 deletions

View File

@ -50,14 +50,14 @@ int main (int argc, char * const argv[])
}
int numSourceFactors = Moses::Scan<int>(argv[1])
, numTargetFactors = Moses::Scan<int>(argv[2])
, numScores = Moses::Scan<int>(argv[3])
, tableLimit = Moses::Scan<int>(argv[4]);
, numTargetFactors = Moses::Scan<int>(argv[2])
, numScores = Moses::Scan<int>(argv[3])
, tableLimit = Moses::Scan<int>(argv[4]);
TargetPhraseCollection::s_sortScoreInd = Moses::Scan<int>(argv[5]);
assert(TargetPhraseCollection::s_sortScoreInd < numScores);
const string filePath = argv[6]
,destPath = argv[7];
,destPath = argv[7];
Moses::InputFileStream inStream(filePath);
@ -128,9 +128,9 @@ OnDiskPt::PhrasePtr Tokenize(SourcePhrase &sourcePhrase, TargetPhrase &targetPhr
} else {
switch (stage) {
case 0: {
WordPtr w = Tokenize(sourcePhrase, tok, true, true, onDiskWrapper);
if (w != NULL)
out->AddWord(w);
WordPtr w = Tokenize(sourcePhrase, tok, true, true, onDiskWrapper);
if (w != NULL)
out->AddWord(w);
break;
}
@ -146,19 +146,19 @@ OnDiskPt::PhrasePtr Tokenize(SourcePhrase &sourcePhrase, TargetPhrase &targetPhr
}
case 3: {
//targetPhrase.Create1AlignFromString(tok);
targetPhrase.CreateAlignFromString(tok);
targetPhrase.CreateAlignFromString(tok);
break;
}
case 4:
++stage;
break;
/* case 5: {
// count info. Only store the 2nd one
float val = Moses::Scan<float>(tok);
misc[0] = val;
++stage;
break;
}*/
/* case 5: {
// count info. Only store the 2nd one
float val = Moses::Scan<float>(tok);
misc[0] = val;
++stage;
break;
}*/
case 5: {
// count info. Only store the 2nd one
//float val = Moses::Scan<float>(tok);
@ -167,12 +167,12 @@ OnDiskPt::PhrasePtr Tokenize(SourcePhrase &sourcePhrase, TargetPhrase &targetPhr
break;
}
case 6: {
// store only the 3rd one (rule count)
// store only the 3rd one (rule count)
float val = Moses::Scan<float>(tok);
misc[0] = val;
++stage;
break;
}
}
default:
cerr << "ERROR in line " << line << endl;
assert(false);
@ -189,8 +189,8 @@ OnDiskPt::PhrasePtr Tokenize(SourcePhrase &sourcePhrase, TargetPhrase &targetPhr
} // Tokenize()
OnDiskPt::WordPtr Tokenize(OnDiskPt::Phrase &phrase
, const std::string &token, bool addSourceNonTerm, bool addTargetNonTerm
, OnDiskPt::OnDiskWrapper &onDiskWrapper)
, const std::string &token, bool addSourceNonTerm, bool addTargetNonTerm
, OnDiskPt::OnDiskWrapper &onDiskWrapper)
{
bool nonTerm = false;

View File

@ -26,12 +26,12 @@ typedef std::pair<size_t, size_t> AlignPair;
typedef std::vector<AlignPair> AlignType;
OnDiskPt::WordPtr Tokenize(OnDiskPt::Phrase &phrase
, const std::string &token, bool addSourceNonTerm, bool addTargetNonTerm
, OnDiskPt::OnDiskWrapper &onDiskWrapper);
, const std::string &token, bool addSourceNonTerm, bool addTargetNonTerm
, OnDiskPt::OnDiskWrapper &onDiskWrapper);
OnDiskPt::PhrasePtr Tokenize(OnDiskPt::SourcePhrase &sourcePhrase, OnDiskPt::TargetPhrase &targetPhrase
, char *line, OnDiskPt::OnDiskWrapper &onDiskWrapper
, int numScores
, std::vector<float> &misc);
, char *line, OnDiskPt::OnDiskWrapper &onDiskWrapper
, int numScores
, std::vector<float> &misc);
void InsertTargetNonTerminals(std::vector<std::string> &sourceToks, const std::vector<std::string> &targetToks, const AlignType &alignments);
void SortAlign(AlignType &alignments);

View File

@ -4,9 +4,9 @@ namespace OnDiskPt
{
void OnDiskQuery::Tokenize(Phrase &phrase,
const std::string &token,
bool addSourceNonTerm,
bool addTargetNonTerm)
const std::string &token,
bool addSourceNonTerm,
bool addTargetNonTerm)
{
bool nonTerm = false;
size_t tokSize = token.size();
@ -54,9 +54,9 @@ void OnDiskQuery::Tokenize(Phrase &phrase,
SourcePhrase OnDiskQuery::Tokenize(const std::vector<std::string>& tokens)
{
SourcePhrase sourcePhrase;
if (tokens.size() > 0){
if (tokens.size() > 0) {
std::vector<std::string>::const_iterator token = tokens.begin();
for (; token + 1 != tokens.end(); ++token){
for (; token + 1 != tokens.end(); ++token) {
Tokenize(sourcePhrase, *token, true, true);
}
// last position. LHS non-term
@ -67,19 +67,17 @@ SourcePhrase OnDiskQuery::Tokenize(const std::vector<std::string>& tokens)
const PhraseNode* OnDiskQuery::Query(const SourcePhrase& sourcePhrase)
{
const PhraseNode *node = &m_wrapper.GetRootSourceNode();
assert(node);
const PhraseNode *node = &m_wrapper.GetRootSourceNode();
assert(node);
for (size_t pos = 0; pos < sourcePhrase.GetSize(); ++pos)
{
const Word &word = sourcePhrase.GetWord(pos);
node = node->GetChild(word, m_wrapper);
if (node == NULL)
{
break;
}
for (size_t pos = 0; pos < sourcePhrase.GetSize(); ++pos) {
const Word &word = sourcePhrase.GetWord(pos);
node = node->GetChild(word, m_wrapper);
if (node == NULL) {
break;
}
return node;
}
return node;
}
}

View File

@ -18,19 +18,18 @@ private:
public:
OnDiskQuery(OnDiskWrapper &wrapper):m_wrapper(wrapper){}
OnDiskQuery(OnDiskWrapper &wrapper):m_wrapper(wrapper) {}
void Tokenize(Phrase &phrase,
const std::string &token,
bool addSourceNonTerm,
bool addTargetNonTerm);
const std::string &token,
bool addSourceNonTerm,
bool addTargetNonTerm);
SourcePhrase Tokenize(const std::vector<std::string>& tokens);
const PhraseNode *Query(const SourcePhrase& sourcePhrase);
inline const PhraseNode *Query(const std::vector<std::string>& tokens)
{
inline const PhraseNode *Query(const std::vector<std::string>& tokens) {
return Query(Tokenize(tokens));
}

View File

@ -212,8 +212,8 @@ Word *OnDiskWrapper::ConvertFromMoses(Moses::FactorDirection /* direction */
for (size_t ind = 1 ; ind < factorsVec.size() ; ++ind) {
size_t factorType = factorsVec[ind];
const Moses::Factor *factor = origWord.GetFactor(factorType);
if (factor == NULL)
{ // can have less factors than factorType.size()
if (factor == NULL) {
// can have less factors than factorType.size()
break;
}
CHECK(factor);

View File

@ -64,13 +64,13 @@ void TargetPhrase::Create1AlignFromString(const std::string &align1Str)
void TargetPhrase::CreateAlignFromString(const std::string &alignStr)
{
vector<std::string> alignPairs;
boost::split(alignPairs, alignStr, boost::is_any_of("\t "));
for (size_t i = 0; i < alignPairs.size(); ++i) {
vector<size_t> alignPoints;
Moses::Tokenize<size_t>(alignPoints, alignPairs[i], "-");
m_align.push_back(pair<size_t, size_t>(alignPoints[0], alignPoints[1]) );
}
vector<std::string> alignPairs;
boost::split(alignPairs, alignStr, boost::is_any_of("\t "));
for (size_t i = 0; i < alignPairs.size(); ++i) {
vector<size_t> alignPoints;
Moses::Tokenize<size_t>(alignPoints, alignPairs[i], "-");
m_align.push_back(pair<size_t, size_t>(alignPoints[0], alignPoints[1]) );
}
}
@ -105,7 +105,7 @@ char *TargetPhrase::WriteToMemory(OnDiskWrapper &onDiskWrapper, size_t &memUsed)
size_t memNeeded = sizeof(UINT64) // num of words
+ targetWordSize * phraseSize // actual words. lhs as last words
+ sizeof(UINT64) // num source words
+ sourceWordSize * spSize; // actual source words
+ sourceWordSize * spSize; // actual source words
memUsed = 0;
UINT64 *mem = (UINT64*) malloc(memNeeded);
@ -252,11 +252,10 @@ Moses::TargetPhrase *TargetPhrase::ConvertToMoses(const std::vector<Moses::Facto
size_t targetPos = entry.second;
if (GetWord(targetPos).IsNonTerminal()) {
alignNonTerm.insert(std::pair<size_t,size_t>(sourcePos, targetPos));
alignNonTerm.insert(std::pair<size_t,size_t>(sourcePos, targetPos));
} else {
alignTerm.insert(std::pair<size_t,size_t>(sourcePos, targetPos));
}
else {
alignTerm.insert(std::pair<size_t,size_t>(sourcePos, targetPos));
}
}
ret->SetAlignTerm(alignTerm);

View File

@ -73,7 +73,7 @@ public:
const PhrasePtr GetSourcePhrase() const {
return m_sourcePhrase;
}
const std::vector<float> &GetScores() const{
const std::vector<float> &GetScores() const {
return m_scores;
}
@ -107,7 +107,7 @@ public:
UINT64 ReadOtherInfoFromFile(UINT64 filePos, std::fstream &fileTPColl);
UINT64 ReadFromFile(std::fstream &fileTP);
virtual void DebugPrint(std::ostream &out, const Vocab &vocab) const;
virtual void DebugPrint(std::ostream &out, const Vocab &vocab) const;
};

View File

@ -82,7 +82,7 @@ void TargetPhraseCollection::Save(OnDiskWrapper &onDiskWrapper)
CollType::iterator iter;
for (iter = m_coll.begin(); iter != m_coll.end(); ++iter) {
// save phrase
TargetPhrase &targetPhrase = **iter;
TargetPhrase &targetPhrase = **iter;
targetPhrase.Save(onDiskWrapper);
// save coll

View File

@ -97,9 +97,10 @@ size_t Word::ReadFromFile(std::fstream &file)
}
void Word::ConvertToMoses(
const std::vector<Moses::FactorType> &outputFactorsVec,
const Vocab &vocab,
Moses::Word &overwrite) const {
const std::vector<Moses::FactorType> &outputFactorsVec,
const Vocab &vocab,
Moses::Word &overwrite) const
{
Moses::FactorCollection &factorColl = Moses::FactorCollection::Instance();
overwrite = Moses::Word(m_isNonTerminal);
@ -144,14 +145,14 @@ bool Word::operator==(const Word &compare) const
void Word::DebugPrint(ostream &out, const Vocab &vocab) const
{
const string &str = vocab.GetString(m_vocabId);
const string &str = vocab.GetString(m_vocabId);
out << str;
}
std::ostream& operator<<(std::ostream &out, const Word &word)
{
out << "(";
out << word.m_vocabId;
out << word.m_vocabId;
out << (word.m_isNonTerminal ? "n" : "t");
out << ")";

View File

@ -50,8 +50,8 @@ public:
{}
explicit Word(bool isNonTerminal)
:m_isNonTerminal(isNonTerminal)
,m_vocabId(0)
:m_isNonTerminal(isNonTerminal)
,m_vocabId(0)
{}
Word(const Word &copy);
@ -77,8 +77,7 @@ public:
Moses::Word &overwrite) const;
void DebugPrint(std::ostream &out, const Vocab &vocab) const;
inline const std::string &GetString(const Vocab &vocab) const
{
inline const std::string &GetString(const Vocab &vocab) const {
return vocab.GetString(m_vocabId);
}

View File

@ -33,8 +33,7 @@ int main(int argc, char **argv)
if(i + 1 == argc)
usage();
ttable = argv[++i];
}
else
} else
usage();
}
@ -56,22 +55,19 @@ int main(int argc, char **argv)
cerr << "line: " << line << endl;
const PhraseNode* node = onDiskQuery.Query(tokens);
if (node)
{ // source phrase points to a bunch of rules
if (node) {
// source phrase points to a bunch of rules
const TargetPhraseCollection *coll = node->GetTargetPhraseCollection(tableLimit, onDiskWrapper);
string str = coll->GetDebugStr();
cout << "Found " << coll->GetSize() << endl;
for (size_t ind = 0; ind < coll->GetSize(); ++ind)
{
for (size_t ind = 0; ind < coll->GetSize(); ++ind) {
const TargetPhrase &targetPhrase = coll->GetTargetPhrase(ind);
cerr << " ";
targetPhrase.DebugPrint(cerr, onDiskWrapper.GetVocab());
cerr << endl;
}
}
else
{
} else {
cout << "Not found" << endl;
}

View File

@ -5,7 +5,8 @@
#include <stdlib.h>
#include <cstring>
namespace {
namespace
{
const int LINE_MAX_LENGTH = 10000;
@ -84,10 +85,10 @@ void Alignment::Create(const string& fileName)
}
Alignment::Alignment()
: m_array(NULL),
m_sentenceEnd(NULL),
m_size(0),
m_sentenceCount(0) {}
: m_array(NULL),
m_sentenceEnd(NULL),
m_size(0),
m_sentenceCount(0) {}
Alignment::~Alignment()
{

View File

@ -23,16 +23,16 @@ enum {
};
Mismatch::Mismatch( SuffixArray *sa, TargetCorpus *tc, Alignment *a, INDEX sentence_id, INDEX position, int source_length, int target_length, int source_start, int source_end )
:m_suffixArray(sa)
,m_targetCorpus(tc)
,m_alignment(a)
,m_sentence_id(sentence_id)
,m_source_length(source_length)
,m_target_length(target_length)
,m_source_position(position)
,m_source_start(source_start)
,m_source_end(source_end)
,m_unaligned(true)
:m_suffixArray(sa)
,m_targetCorpus(tc)
,m_alignment(a)
,m_sentence_id(sentence_id)
,m_source_length(source_length)
,m_target_length(target_length)
,m_source_position(position)
,m_source_start(source_start)
,m_source_end(source_end)
,m_unaligned(true)
{
// initialize unaligned indexes
for (int i = 0; i < m_source_length; i++) {
@ -42,7 +42,7 @@ Mismatch::Mismatch( SuffixArray *sa, TargetCorpus *tc, Alignment *a, INDEX sente
m_target_unaligned[i] = true;
}
m_num_alignment_points =
m_alignment->GetNumberOfAlignmentPoints( sentence_id );
m_alignment->GetNumberOfAlignmentPoints( sentence_id );
for(INDEX ap=0; ap<m_num_alignment_points; ap++) {
m_source_unaligned[ (int)m_alignment->GetSourceWord( sentence_id, ap ) ] = false;
m_target_unaligned[ (int)m_alignment->GetTargetWord( sentence_id, ap ) ] = false;
@ -58,234 +58,235 @@ Mismatch::~Mismatch () {}
void Mismatch::PrintClippedHTML( ostream* out, int width )
{
int source_annotation[256], target_annotation[256];
vector< string > label_class;
label_class.push_back( "" );
label_class.push_back( "mismatch_pre_aligned" );
label_class.push_back( "mismatch_post_aligned" );
label_class.push_back( "null_aligned" );
label_class.push_back( "mismatch_misaligned" );
label_class.push_back( "mismatch_aligned" );
int source_annotation[256], target_annotation[256];
vector< string > label_class;
label_class.push_back( "" );
label_class.push_back( "mismatch_pre_aligned" );
label_class.push_back( "mismatch_post_aligned" );
label_class.push_back( "null_aligned" );
label_class.push_back( "mismatch_misaligned" );
label_class.push_back( "mismatch_aligned" );
for(int i=0; i<m_source_length;i++) source_annotation[i] = UNANNOTATED;
for(int i=0; i<m_target_length;i++) target_annotation[i] = UNANNOTATED;
for(int i=0; i<m_source_length; i++) source_annotation[i] = UNANNOTATED;
for(int i=0; i<m_target_length; i++) target_annotation[i] = UNANNOTATED;
if (m_unaligned) {
// find alignment points for prior and next word(s) and
// center target phrase around those.
bool found_aligned = false;
for(int i=1; i<m_source_length && !found_aligned; i++) {
if (m_source_start-i >= 0) {
int word_id = m_source_start-i;
source_annotation[ word_id ] = UNALIGNED;
if (!m_source_unaligned[ word_id ]) {
found_aligned = true;
LabelSourceMatches( source_annotation, target_annotation, word_id, PRE_ALIGNED );
}
}
if (m_unaligned) {
// find alignment points for prior and next word(s) and
// center target phrase around those.
bool found_aligned = false;
for(int i=1; i<m_source_length && !found_aligned; i++) {
if (m_source_start-i >= 0) {
int word_id = m_source_start-i;
source_annotation[ word_id ] = UNALIGNED;
if (!m_source_unaligned[ word_id ]) {
found_aligned = true;
LabelSourceMatches( source_annotation, target_annotation, word_id, PRE_ALIGNED );
}
}
if (m_source_end+i < m_source_length) {
int word_id = m_source_end+i;
source_annotation[ word_id ] = UNALIGNED;
if (!m_source_unaligned[ word_id ]) {
found_aligned = true;
LabelSourceMatches( source_annotation, target_annotation, word_id, POST_ALIGNED );
}
}
}
if (m_source_end+i < m_source_length) {
int word_id = m_source_end+i;
source_annotation[ word_id ] = UNALIGNED;
if (!m_source_unaligned[ word_id ]) {
found_aligned = true;
LabelSourceMatches( source_annotation, target_annotation, word_id, POST_ALIGNED );
}
}
}
}
// misalignment
else {
// label aligned output words
for(int i=m_source_start; i<=m_source_end; i++)
LabelSourceMatches( source_annotation, target_annotation, i, ALIGNED );
}
// misalignment
else {
// label aligned output words
for(int i=m_source_start; i<=m_source_end; i++)
LabelSourceMatches( source_annotation, target_annotation, i, ALIGNED );
// find first and last
int target_start = -1;
int target_end;
for(int i=0; i<m_target_length; i++)
if (target_annotation[i] == ALIGNED) {
if (target_start == -1)
target_start = i;
target_end = i;
}
// go over all enclosed target words
for(int i=target_start; i<=target_end; i++) {
// label other target words as unaligned or misaligned
if (m_target_unaligned[ i ])
target_annotation[ i ] = UNALIGNED;
else {
if (target_annotation[ i ] != ALIGNED)
target_annotation[ i ] = MISALIGNED;
// loop over aligned source words
for(INDEX ap=0; ap<m_num_alignment_points; ap++) {
if (m_alignment->GetTargetWord( m_sentence_id, ap ) == i) {
int source_word = m_alignment->GetSourceWord( m_sentence_id, ap );
// if not part of the source phrase -> also misaligned
if (source_word < m_source_start || source_word > m_source_end)
source_annotation[ source_word ] = MISALIGNED;
}
}
}
}
// closure
bool change = true;
while(change) {
change = false;
for(INDEX ap=0; ap<m_num_alignment_points; ap++) {
int source_word = m_alignment->GetSourceWord( m_sentence_id, ap );
int target_word = m_alignment->GetTargetWord( m_sentence_id, ap );
if (source_annotation[source_word] != UNANNOTATED &&
target_annotation[target_word] == UNANNOTATED) {
target_annotation[target_word] = MISALIGNED;
change = true;
}
if (source_annotation[source_word] == UNANNOTATED &&
target_annotation[target_word] != UNANNOTATED) {
source_annotation[source_word] = MISALIGNED;
change = true;
}
}
}
}
// find first and last
int target_start = -1;
int target_end;
for(int i=0; i<m_target_length; i++)
if (target_annotation[i] == ALIGNED) {
if (target_start == -1)
target_start = i;
target_end = i;
}
// go over all enclosed target words
for(int i=target_start; i<=target_end; i++) {
// label other target words as unaligned or misaligned
if (m_target_unaligned[ i ])
target_annotation[ i ] = UNALIGNED;
else {
if (target_annotation[ i ] != ALIGNED)
target_annotation[ i ] = MISALIGNED;
// loop over aligned source words
for(INDEX ap=0; ap<m_num_alignment_points; ap++) {
if (m_alignment->GetTargetWord( m_sentence_id, ap ) == i) {
int source_word = m_alignment->GetSourceWord( m_sentence_id, ap );
// if not part of the source phrase -> also misaligned
if (source_word < m_source_start || source_word > m_source_end)
source_annotation[ source_word ] = MISALIGNED;
}
}
}
}
// closure
bool change = true;
while(change) {
change = false;
for(INDEX ap=0; ap<m_num_alignment_points; ap++) {
int source_word = m_alignment->GetSourceWord( m_sentence_id, ap );
int target_word = m_alignment->GetTargetWord( m_sentence_id, ap );
if (source_annotation[source_word] != UNANNOTATED &&
target_annotation[target_word] == UNANNOTATED) {
target_annotation[target_word] = MISALIGNED;
change = true;
}
if (source_annotation[source_word] == UNANNOTATED &&
target_annotation[target_word] != UNANNOTATED) {
source_annotation[source_word] = MISALIGNED;
change = true;
}
}
}
}
// print source
// shorten source context if too long
// print source
// shorten source context if too long
int sentence_start = m_source_position - m_source_start;
int context_space = width/2;
for(int i=m_source_start;i<=m_source_end;i++)
context_space -= m_suffixArray->GetWord( sentence_start + i ).size() + 1;
context_space /= 2;
int context_space = width/2;
for(int i=m_source_start; i<=m_source_end; i++)
context_space -= m_suffixArray->GetWord( sentence_start + i ).size() + 1;
context_space /= 2;
int remaining = context_space;
int start_word = m_source_start;
for(;start_word>0 && remaining>0; start_word--)
remaining -= m_suffixArray->GetWord( sentence_start + start_word-1 ).size() + 1;
if (remaining<0 || start_word == -1) start_word++;
int remaining = context_space;
int start_word = m_source_start;
for(; start_word>0 && remaining>0; start_word--)
remaining -= m_suffixArray->GetWord( sentence_start + start_word-1 ).size() + 1;
if (remaining<0 || start_word == -1) start_word++;
remaining = context_space;
int end_word = m_source_end;
for(;end_word<m_source_length && remaining>0; end_word++)
remaining -= m_suffixArray->GetWord( sentence_start + end_word ).size() + 1;
end_word--;
remaining = context_space;
int end_word = m_source_end;
for(; end_word<m_source_length && remaining>0; end_word++)
remaining -= m_suffixArray->GetWord( sentence_start + end_word ).size() + 1;
end_word--;
// output with markup
*out << "<tr><td class=\"pp_source_left\">";
char current_label = UNANNOTATED;
if (start_word>0) {
current_label = source_annotation[start_word-1];
*out << "... ";
}
for(int i=start_word; i<=end_word; i++) {
// change to phrase block
if (i == m_source_start) {
if (current_label != UNANNOTATED && i!=start_word)
*out << "</span>";
*out << "</td><td class=\"pp_source\">";
current_label = UNANNOTATED;
}
// output with markup
*out << "<tr><td class=\"pp_source_left\">";
char current_label = UNANNOTATED;
if (start_word>0) {
current_label = source_annotation[start_word-1];
*out << "... ";
}
for(int i=start_word; i<=end_word; i++) {
// change to phrase block
if (i == m_source_start) {
if (current_label != UNANNOTATED && i!=start_word)
*out << "</span>";
*out << "</td><td class=\"pp_source\">";
current_label = UNANNOTATED;
}
// change to labeled word
else if (source_annotation[i] != current_label &&
source_annotation[i] != ALIGNED) {
if (current_label != UNANNOTATED && i!=start_word)
*out << "</span>";
if (source_annotation[i] != UNANNOTATED)
*out << "<span class=\""
<< label_class[ source_annotation[i] ]
<< "\">";
current_label = source_annotation[i];
}
// change to labeled word
else if (source_annotation[i] != current_label &&
source_annotation[i] != ALIGNED) {
if (current_label != UNANNOTATED && i!=start_word)
*out << "</span>";
if (source_annotation[i] != UNANNOTATED)
*out << "<span class=\""
<< label_class[ source_annotation[i] ]
<< "\">";
current_label = source_annotation[i];
}
// output word
*out << m_suffixArray->GetWord( sentence_start + i ) << " ";
// output word
*out << m_suffixArray->GetWord( sentence_start + i ) << " ";
// change to right context block
if (i == m_source_end) {
*out << "</td><td class=\"pp_source_right\">";
current_label = UNANNOTATED;
}
}
// change to right context block
if (i == m_source_end) {
*out << "</td><td class=\"pp_source_right\">";
current_label = UNANNOTATED;
}
}
if (current_label != UNANNOTATED && end_word>m_source_end)
*out << "</span>";
if (end_word<m_source_length-1)
*out << "... ";
if (current_label != UNANNOTATED && end_word>m_source_end)
*out << "</span>";
if (end_word<m_source_length-1)
*out << "... ";
// print target
// shorten target context if too long
int target_start = -1;
int target_end;
for(int i=0; i<m_target_length; i++)
if (target_annotation[i] != UNANNOTATED) {
if (target_start == -1)
target_start = i;
target_end = i;
}
// print target
// shorten target context if too long
int target_start = -1;
int target_end;
for(int i=0; i<m_target_length; i++)
if (target_annotation[i] != UNANNOTATED) {
if (target_start == -1)
target_start = i;
target_end = i;
}
context_space = width/2;
for(int i=target_start;i<=target_end;i++)
context_space -= m_targetCorpus->GetWord( m_sentence_id, i ).size() + 1;
while (context_space < 0) { // shorten matched part, if too long
context_space +=
m_targetCorpus->GetWord( m_sentence_id, target_start ).size() +
m_targetCorpus->GetWord( m_sentence_id, target_end ).size() + 2;
target_start++;
target_end--;
}
context_space /= 2;
context_space = width/2;
for(int i=target_start; i<=target_end; i++)
context_space -= m_targetCorpus->GetWord( m_sentence_id, i ).size() + 1;
while (context_space < 0) { // shorten matched part, if too long
context_space +=
m_targetCorpus->GetWord( m_sentence_id, target_start ).size() +
m_targetCorpus->GetWord( m_sentence_id, target_end ).size() + 2;
target_start++;
target_end--;
}
context_space /= 2;
remaining = context_space;
start_word = target_start;
for(;start_word>0 && remaining>0; start_word--) {
//cerr << "remaining: " << remaining << ", start_word: " << start_word << endl;
remaining -= m_targetCorpus->GetWord( m_sentence_id, start_word-1 ).size() + 1;
}
if (remaining<0 || start_word == -1) start_word++;
remaining = context_space;
start_word = target_start;
for(; start_word>0 && remaining>0; start_word--) {
//cerr << "remaining: " << remaining << ", start_word: " << start_word << endl;
remaining -= m_targetCorpus->GetWord( m_sentence_id, start_word-1 ).size() + 1;
}
if (remaining<0 || start_word == -1) start_word++;
remaining = context_space;
end_word = target_end;
for(;end_word<m_target_length && remaining>0; end_word++) {
//cerr << "remaining: " << remaining << ", end_word: " << end_word << endl;
remaining -= m_targetCorpus->GetWord( m_sentence_id, end_word ).size() + 1;
}
end_word--;
remaining = context_space;
end_word = target_end;
for(; end_word<m_target_length && remaining>0; end_word++) {
//cerr << "remaining: " << remaining << ", end_word: " << end_word << endl;
remaining -= m_targetCorpus->GetWord( m_sentence_id, end_word ).size() + 1;
}
end_word--;
// output with markup
*out << "</td><td class=\"mismatch_target\">";
current_label = UNANNOTATED;
if (start_word>0) {
current_label = target_annotation[start_word-1];
*out << "... ";
}
for(int i=start_word; i<=end_word; i++) {
if (target_annotation[i] != current_label) {
if (current_label != UNANNOTATED && i!=start_word)
*out << "</span>";
if (target_annotation[i] != UNANNOTATED)
*out << "<span class=\""
<< label_class[ target_annotation[i] ]
<< "\">";
current_label = target_annotation[i];
}
// output with markup
*out << "</td><td class=\"mismatch_target\">";
current_label = UNANNOTATED;
if (start_word>0) {
current_label = target_annotation[start_word-1];
*out << "... ";
}
for(int i=start_word; i<=end_word; i++) {
if (target_annotation[i] != current_label) {
if (current_label != UNANNOTATED && i!=start_word)
*out << "</span>";
if (target_annotation[i] != UNANNOTATED)
*out << "<span class=\""
<< label_class[ target_annotation[i] ]
<< "\">";
current_label = target_annotation[i];
}
// output word
*out << m_targetCorpus->GetWord( m_sentence_id, i ) << " ";
}
// output word
*out << m_targetCorpus->GetWord( m_sentence_id, i ) << " ";
}
if (current_label != UNANNOTATED && end_word>target_end)
*out << "</span>";
if (end_word<m_target_length-1)
*out << "... ";
*out << "</td></tr>";
if (current_label != UNANNOTATED && end_word>target_end)
*out << "</span>";
if (end_word<m_target_length-1)
*out << "... ";
*out << "</td></tr>";
}
void Mismatch::LabelSourceMatches(int *source_annotation, int *target_annotation, int source_id, int label ) {
for(INDEX ap=0; ap<m_num_alignment_points; ap++) {
if (m_alignment->GetSourceWord( m_sentence_id, ap ) == source_id) {
source_annotation[ source_id ] = label;
target_annotation[ m_alignment->GetTargetWord( m_sentence_id, ap ) ] = label;
}
}
void Mismatch::LabelSourceMatches(int *source_annotation, int *target_annotation, int source_id, int label )
{
for(INDEX ap=0; ap<m_num_alignment_points; ap++) {
if (m_alignment->GetSourceWord( m_sentence_id, ap ) == source_id) {
source_annotation[ source_id ] = label;
target_annotation[ m_alignment->GetTargetWord( m_sentence_id, ap ) ] = label;
}
}
}

View File

@ -34,7 +34,9 @@ public:
Mismatch( SuffixArray *sa, TargetCorpus *tc, Alignment *a, INDEX sentence_id, INDEX position, int source_length, int target_length, int source_start, int source_end );
~Mismatch();
bool Unaligned() const { return m_unaligned; }
bool Unaligned() const {
return m_unaligned;
}
void PrintClippedHTML(std::ostream* out, int width );
void LabelSourceMatches(int *source_annotation, int *target_annotation, int source_id, int label );
};

View File

@ -37,7 +37,7 @@ void PhrasePair::Print( ostream* out ) const
INDEX ap_points = m_alignment->GetNumberOfAlignmentPoints( m_sentence_id );
for( INDEX i=0; i<ap_points; i++) {
*out << " " << m_alignment->GetSourceWord( m_sentence_id, i )
<< "-" << m_alignment->GetTargetWord( m_sentence_id, i );
<< "-" << m_alignment->GetTargetWord( m_sentence_id, i );
}
*out << endl;
@ -185,27 +185,27 @@ void PhrasePair::PrintClippedHTML( ostream* out, int width ) const
size_t source_pre_width = (source_width-source.size())/2;
size_t source_post_width = (source_width-source.size()+1)/2;
// if phrase is too long, don't show any context
// if phrase is too long, don't show any context
if (source.size() > (size_t)width) {
source_pre_width = 0;
source_post_width = 0;
}
// too long -> truncate and add "..."
// too long -> truncate and add "..."
if (source_pre.size() > source_pre_width) {
// first skip up to a space
while(source_pre_width>0 &&
source_pre.substr(source_pre.size()-source_pre_width,1) != " ") {
source_pre_width--;
}
// first skip up to a space
while(source_pre_width>0 &&
source_pre.substr(source_pre.size()-source_pre_width,1) != " ") {
source_pre_width--;
}
source_pre = "..." + source_pre.substr( source_pre.size()-source_pre_width, source_pre_width );
}
}
if (source_post.size() > source_post_width) {
while(source_post_width>0 &&
source_post.substr(source_post_width-1,1) != " ") {
source_post_width--;
}
while(source_post_width>0 &&
source_post.substr(source_post_width-1,1) != " ") {
source_post_width--;
}
source_post = source_post.substr( 0, source_post_width ) + "...";
}
}
*out << "<tr><td class=\"pp_source_left\">"
<< source_pre
@ -220,13 +220,13 @@ void PhrasePair::PrintClippedHTML( ostream* out, int width ) const
string target_pre = "";
string target = "";
string target_post = "";
size_t target_pre_null_width = 0;
size_t target_post_null_width = 0;
size_t target_pre_null_width = 0;
size_t target_post_null_width = 0;
for( char i=0; i<m_target_start; i++ ) {
WORD word = m_targetCorpus->GetWord( m_sentence_id, i);
WORD word = m_targetCorpus->GetWord( m_sentence_id, i);
target_pre += " " + word;
if (i >= m_target_start-m_pre_null)
target_pre_null_width += word.size() + 1;
if (i >= m_target_start-m_pre_null)
target_pre_null_width += word.size() + 1;
}
for( char i=m_target_start; i<=m_target_end; i++ ) {
if (i>m_target_start) target += " ";
@ -234,11 +234,11 @@ void PhrasePair::PrintClippedHTML( ostream* out, int width ) const
}
for( char i=m_target_end+1; i<m_target_length; i++ ) {
if (i>m_target_end+1) target_post += " ";
WORD word = m_targetCorpus->GetWord( m_sentence_id, i);
WORD word = m_targetCorpus->GetWord( m_sentence_id, i);
target_post += word;
if (i-(m_target_end+1) < m_post_null) {
target_post_null_width += word.size() + 1;
}
if (i-(m_target_end+1) < m_post_null) {
target_post_null_width += word.size() + 1;
}
}
size_t target_pre_width = (target_width-target.size())/2;
@ -250,45 +250,44 @@ void PhrasePair::PrintClippedHTML( ostream* out, int width ) const
}
if (target_pre.size() < target_pre_width)
target_pre_width = target_pre.size();
else {
while(target_pre_width>0 &&
target_pre.substr(target_pre.size()-target_pre_width,1) != " ") {
target_pre_width--;
}
target_pre_width = target_pre.size();
else {
while(target_pre_width>0 &&
target_pre.substr(target_pre.size()-target_pre_width,1) != " ") {
target_pre_width--;
}
target_pre = "..." + target_pre.substr( target_pre.size()-target_pre_width, target_pre_width );
}
}
if (target_post.size() < target_post_width) {
target_post_width = target_post.size();
}
else {
while(target_post_width>0 &&
target_post.substr(target_post_width-1,1) != " ") {
target_post_width--;
}
target_post = target_post.substr( 0, target_post_width ) + "...";
}
if (target_post.size() < target_post_width) {
target_post_width = target_post.size();
} else {
while(target_post_width>0 &&
target_post.substr(target_post_width-1,1) != " ") {
target_post_width--;
}
target_post = target_post.substr( 0, target_post_width ) + "...";
}
if (m_pre_null) {
//cerr << endl << "target_pre_width=" << target_pre_width << ", target_pre_null_width=" << target_pre_null_width << ", target_pre.size()=" << target_pre.size() << endl;
if (target_pre_width < target_pre.size())
target_pre_null_width -= target_pre.size()-target_pre_width;
target_pre = target_pre.substr(0,target_pre_width-target_pre_null_width)
+ "<span class=\"null_aligned\">"
+ target_pre.substr(target_pre_width-target_pre_null_width)
+ "</span>";
}
if (m_post_null) {
//cerr << endl << "target_post_width=" << target_post_width << ", target_post_null_width=" << target_post_null_width << ", target_post.size()=" << target_post.size() << endl;
if (target_post_null_width > target_post.size()) {
target_post_null_width = target_post.size();
}
target_post = "<span class=\"null_aligned\">"
+ target_post.substr(0,target_post_null_width)
+ "</span>"
+ target_post.substr(target_post_null_width);
}
if (m_pre_null) {
//cerr << endl << "target_pre_width=" << target_pre_width << ", target_pre_null_width=" << target_pre_null_width << ", target_pre.size()=" << target_pre.size() << endl;
if (target_pre_width < target_pre.size())
target_pre_null_width -= target_pre.size()-target_pre_width;
target_pre = target_pre.substr(0,target_pre_width-target_pre_null_width)
+ "<span class=\"null_aligned\">"
+ target_pre.substr(target_pre_width-target_pre_null_width)
+ "</span>";
}
if (m_post_null) {
//cerr << endl << "target_post_width=" << target_post_width << ", target_post_null_width=" << target_post_null_width << ", target_post.size()=" << target_post.size() << endl;
if (target_post_null_width > target_post.size()) {
target_post_null_width = target_post.size();
}
target_post = "<span class=\"null_aligned\">"
+ target_post.substr(0,target_post_null_width)
+ "</span>"
+ target_post.substr(target_post_null_width);
}
*out << "<td class=\"pp_target_left\">"
<< target_pre

View File

@ -47,15 +47,15 @@ int PhrasePairCollection::GetCollection( const vector< string >& sourceString )
int sentence_length = m_suffixArray->GetSentenceLength( sentence_id );
int target_length = m_targetCorpus->GetSentenceLength( sentence_id );
//cerr << "match " << (i-first_match)
//<< " in sentence " << sentence_id
//<< ", starting at word " << source_start
//<< " of " << sentence_length
//<< ". target sentence has " << target_length << " words.";
//<< " in sentence " << sentence_id
//<< ", starting at word " << source_start
//<< " of " << sentence_length
//<< ". target sentence has " << target_length << " words.";
int target_start, target_end, pre_null, post_null;
if (m_alignment->PhraseAlignment( sentence_id, target_length, source_start, source_end, target_start, target_end, pre_null, post_null)) {
//cerr << " aligned to [" << (int)target_start << "," << (int)target_end << "]";
//cerr << " +(" << (int)pre_null << "," << (int)post_null << ")";
bool null_boundary_words = false;
bool null_boundary_words = false;
for (int pre = 0; pre <= pre_null && (pre == 0 || null_boundary_words); pre++ ) {
for (int post = 0; post <= post_null && (post == 0 || null_boundary_words); post++ ) {
vector< WORD_ID > targetString;
@ -75,19 +75,18 @@ int PhrasePairCollection::GetCollection( const vector< string >& sourceString )
m_size++;
}
}
} else {
//cerr << "mismatch " << (i-first_match)
// << " in sentence " << sentence_id
// << ", starting at word " << source_start
// << " of " << sentence_length
// << ". target sentence has " << target_length << " words.";
Mismatch *mismatch = new Mismatch( m_suffixArray, m_targetCorpus, m_alignment, sentence_id, position, sentence_length, target_length, source_start, source_end );
if (mismatch->Unaligned())
m_unaligned.push_back( mismatch );
else
m_mismatch.push_back( mismatch );
}
else {
//cerr << "mismatch " << (i-first_match)
// << " in sentence " << sentence_id
// << ", starting at word " << source_start
// << " of " << sentence_length
// << ". target sentence has " << target_length << " words.";
Mismatch *mismatch = new Mismatch( m_suffixArray, m_targetCorpus, m_alignment, sentence_id, position, sentence_length, target_length, source_start, source_end );
if (mismatch->Unaligned())
m_unaligned.push_back( mismatch );
else
m_mismatch.push_back( mismatch );
}
//cerr << endl;
if (found > (INDEX)m_max_lookup) {
@ -111,8 +110,7 @@ void PhrasePairCollection::Print(bool pretty) const
for(int j=0; j<ppWithSameTarget->size() && j<m_max_example; j++, p++ ) {
if (pretty) {
(*p)->PrintPretty( &cout, 100 );
}
else {
} else {
(*p)->Print( &cout );
}
if (ppWithSameTarget->size() > m_max_example) {
@ -125,33 +123,32 @@ void PhrasePairCollection::Print(bool pretty) const
void PhrasePairCollection::PrintHTML() const
{
int pp_target = 0;
bool singleton = false;
// loop over all translations
bool singleton = false;
// loop over all translations
vector< vector<PhrasePair*> >::const_iterator ppWithSameTarget;
for( ppWithSameTarget = m_collection.begin(); ppWithSameTarget != m_collection.end() && pp_target<m_max_translation; ppWithSameTarget++, pp_target++ ) {
int count = ppWithSameTarget->size();
if (!singleton) {
if (count == 1) {
singleton = true;
cout << "<p class=\"pp_singleton_header\">singleton"
<< (m_collection.end() - ppWithSameTarget==1?"":"s") << " ("
<< (m_collection.end() - ppWithSameTarget)
<< "/" << m_size << ")</p>";
}
else {
cout << "<p class=\"pp_target_header\">";
(*(ppWithSameTarget->begin()))->PrintTarget( &cout );
cout << " (" << count << "/" << m_size << ")" << endl;
cout << "<p><div id=\"pp_" << pp_target << "\">";
}
cout << "<table align=\"center\">";
}
int count = ppWithSameTarget->size();
if (!singleton) {
if (count == 1) {
singleton = true;
cout << "<p class=\"pp_singleton_header\">singleton"
<< (m_collection.end() - ppWithSameTarget==1?"":"s") << " ("
<< (m_collection.end() - ppWithSameTarget)
<< "/" << m_size << ")</p>";
} else {
cout << "<p class=\"pp_target_header\">";
(*(ppWithSameTarget->begin()))->PrintTarget( &cout );
cout << " (" << count << "/" << m_size << ")" << endl;
cout << "<p><div id=\"pp_" << pp_target << "\">";
}
cout << "<table align=\"center\">";
}
vector< PhrasePair* >::const_iterator p;
// loop over all sentences where translation occurs
// loop over all sentences where translation occurs
int pp=0;
int i=0;
int i=0;
for(p = ppWithSameTarget->begin(); i<10 && pp<count && p != ppWithSameTarget->end(); p++, pp++, i++ ) {
(*p)->PrintClippedHTML( &cout, 160 );
if (count > m_max_example) {
@ -159,54 +156,54 @@ void PhrasePairCollection::PrintHTML() const
pp += count/m_max_example-1;
}
}
if (i == 10 && pp < count) {
// extended table
cout << "<tr><td colspan=7 align=center class=\"pp_more\" onclick=\"javascript:document.getElementById('pp_" << pp_target << "').style.display = 'none'; document.getElementById('pp_ext_" << pp_target << "').style.display = 'block';\">(more)</td></tr></table></div>";
cout << "<div id=\"pp_ext_" << pp_target << "\" style=\"display:none;\";\">";
cout << "<table align=\"center\">";
for(i=0, pp=0, p = ppWithSameTarget->begin(); i<m_max_example && pp<count && p != ppWithSameTarget->end(); p++, pp++, i++ ) {
(*p)->PrintClippedHTML( &cout, 160 );
if (count > m_max_example) {
p += count/m_max_example-1;
pp += count/m_max_example-1;
}
}
}
if (!singleton) cout << "</table></div>\n";
if (i == 10 && pp < count) {
// extended table
cout << "<tr><td colspan=7 align=center class=\"pp_more\" onclick=\"javascript:document.getElementById('pp_" << pp_target << "').style.display = 'none'; document.getElementById('pp_ext_" << pp_target << "').style.display = 'block';\">(more)</td></tr></table></div>";
cout << "<div id=\"pp_ext_" << pp_target << "\" style=\"display:none;\";\">";
cout << "<table align=\"center\">";
for(i=0, pp=0, p = ppWithSameTarget->begin(); i<m_max_example && pp<count && p != ppWithSameTarget->end(); p++, pp++, i++ ) {
(*p)->PrintClippedHTML( &cout, 160 );
if (count > m_max_example) {
p += count/m_max_example-1;
pp += count/m_max_example-1;
}
}
}
if (!singleton) cout << "</table></div>\n";
if (!singleton && pp_target == 9) {
cout << "<div id=\"pp_toggle\" onclick=\"javascript:document.getElementById('pp_toggle').style.display = 'none'; document.getElementById('pp_additional').style.display = 'block';\">";
cout << "<p class=\"pp_target_header\">(more)</p></div>";
cout << "<div id=\"pp_additional\" style=\"display:none;\";\">";
}
if (!singleton && pp_target == 9) {
cout << "<div id=\"pp_toggle\" onclick=\"javascript:document.getElementById('pp_toggle').style.display = 'none'; document.getElementById('pp_additional').style.display = 'block';\">";
cout << "<p class=\"pp_target_header\">(more)</p></div>";
cout << "<div id=\"pp_additional\" style=\"display:none;\";\">";
}
}
if (singleton) cout << "</table></div>\n";
else if (pp_target > 9) cout << "</div>";
if (singleton) cout << "</table></div>\n";
else if (pp_target > 9) cout << "</div>";
size_t max_mismatch = m_max_example/3;
// unaligned phrases
if (m_unaligned.size() > 0) {
cout << "<p class=\"pp_singleton_header\">unaligned"
<< " (" << (m_unaligned.size()) << ")</p>";
cout << "<table align=\"center\">";
int step_size = 1;
if (m_unaligned.size() > max_mismatch)
step_size = (m_unaligned.size()+max_mismatch-1) / max_mismatch;
for(size_t i=0;i<m_unaligned.size();i+=step_size)
m_unaligned[i]->PrintClippedHTML( &cout, 160 );
cout << "</table>";
}
size_t max_mismatch = m_max_example/3;
// unaligned phrases
if (m_unaligned.size() > 0) {
cout << "<p class=\"pp_singleton_header\">unaligned"
<< " (" << (m_unaligned.size()) << ")</p>";
cout << "<table align=\"center\">";
int step_size = 1;
if (m_unaligned.size() > max_mismatch)
step_size = (m_unaligned.size()+max_mismatch-1) / max_mismatch;
for(size_t i=0; i<m_unaligned.size(); i+=step_size)
m_unaligned[i]->PrintClippedHTML( &cout, 160 );
cout << "</table>";
}
// mismatched phrases
if (m_mismatch.size() > 0) {
cout << "<p class=\"pp_singleton_header\">mismatched"
<< " (" << (m_mismatch.size()) << ")</p>";
cout << "<table align=\"center\">";
int step_size = 1;
if (m_mismatch.size() > max_mismatch)
step_size = (m_mismatch.size()+max_mismatch-1) / max_mismatch;
for(size_t i=0;i<m_mismatch.size();i+=step_size)
m_mismatch[i]->PrintClippedHTML( &cout, 160 );
cout << "</table>";
}
// mismatched phrases
if (m_mismatch.size() > 0) {
cout << "<p class=\"pp_singleton_header\">mismatched"
<< " (" << (m_mismatch.size()) << ")</p>";
cout << "<table align=\"center\">";
int step_size = 1;
if (m_mismatch.size() > max_mismatch)
step_size = (m_mismatch.size()+max_mismatch-1) / max_mismatch;
for(size_t i=0; i<m_mismatch.size(); i+=step_size)
m_mismatch[i]->PrintClippedHTML( &cout, 160 );
cout << "</table>";
}
}

View File

@ -5,7 +5,8 @@
#include <stdlib.h>
#include <cstring>
namespace {
namespace
{
const int LINE_MAX_LENGTH = 10000;
@ -14,15 +15,15 @@ const int LINE_MAX_LENGTH = 10000;
using namespace std;
/**
 * Create an empty suffix array: every buffer pointer starts out NULL and
 * both counters start at zero.
 */
SuffixArray::SuffixArray()
  : m_array(NULL)
  , m_index(NULL)
  , m_buffer(NULL)
  , m_wordInSentence(NULL)
  , m_sentence(NULL)
  , m_sentenceLength(NULL)
  , m_vcb()
  , m_size(0)
  , m_sentenceCount(0)
{
}
SuffixArray::~SuffixArray()
{

View File

@ -5,7 +5,8 @@
#include <stdlib.h>
#include <cstring>
namespace {
namespace
{
const int LINE_MAX_LENGTH = 10000;
@ -14,11 +15,11 @@ const int LINE_MAX_LENGTH = 10000;
using namespace std;
/**
 * Create an empty target corpus: both array pointers start out NULL and
 * both counters start at zero.
 */
TargetCorpus::TargetCorpus()
  : m_array(NULL)
  , m_sentenceEnd(NULL)
  , m_vcb()
  , m_size(0)
  , m_sentenceCount(0)
{
}
TargetCorpus::~TargetCorpus()
{

View File

@ -2,7 +2,8 @@
#include "Vocabulary.h"
#include <fstream>
namespace {
namespace
{
const int MAX_LENGTH = 10000;

View File

@ -29,16 +29,18 @@
#include <iostream>
static const std::string base64_chars =
"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
"abcdefghijklmnopqrstuvwxyz"
"0123456789+/";
"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
"abcdefghijklmnopqrstuvwxyz"
"0123456789+/";
static inline bool is_base64(unsigned char c) {
/**
 * Tell whether c is a base64 payload character: an alphanumeric character,
 * '+' or '/'. The padding character '=' is not accepted here.
 */
static inline bool is_base64(unsigned char c)
{
  if (c == '+' || c == '/') {
    return true;
  }
  return isalnum(c) != 0;
}
std::string base64_encode(unsigned char const* bytes_to_encode, unsigned int in_len) {
std::string base64_encode(unsigned char const* bytes_to_encode, unsigned int in_len)
{
std::string ret;
int i = 0;
int j = 0;
@ -59,8 +61,7 @@ std::string base64_encode(unsigned char const* bytes_to_encode, unsigned int in_
}
}
if (i)
{
if (i) {
for(j = i; j < 3; j++)
char_array_3[j] = '\0';
@ -81,7 +82,8 @@ std::string base64_encode(unsigned char const* bytes_to_encode, unsigned int in_
}
std::string base64_decode(std::string const& encoded_string) {
std::string base64_decode(std::string const& encoded_string)
{
int in_len = encoded_string.size();
int i = 0;
int j = 0;
@ -90,7 +92,8 @@ std::string base64_decode(std::string const& encoded_string) {
std::string ret;
while (in_len-- && ( encoded_string[in_] != '=') && is_base64(encoded_string[in_])) {
char_array_4[i++] = encoded_string[in_]; in_++;
char_array_4[i++] = encoded_string[in_];
in_++;
if (i ==4) {
for (i = 0; i <4; i++)
char_array_4[i] = base64_chars.find(char_array_4[i]);

View File

@ -150,22 +150,19 @@ int main(int argc, char* argv[])
cout << "TOTAL: " << total << endl;
if (htmlFlag) {
ppCollection.PrintHTML();
}
else {
ppCollection.Print(prettyFlag);
} else {
ppCollection.Print(prettyFlag);
}
cout << "-|||- BICONCOR END -|||-" << endl << flush;
}
}
else if (queryFlag) {
} else if (queryFlag) {
cerr << "query is " << query << endl;
vector< string > queryString = alignment.Tokenize( query.c_str() );
PhrasePairCollection ppCollection( &suffixArray, &targetCorpus, &alignment, max_translation, max_example );
ppCollection.GetCollection( queryString );
if (htmlFlag) {
ppCollection.PrintHTML();
}
else {
} else {
ppCollection.Print(prettyFlag);
}
}

View File

@ -29,155 +29,158 @@ using namespace std;
namespace Moses
{
PhraseDictionaryInterpolated::PhraseDictionaryInterpolated
(size_t numScoreComponent,size_t numInputScores,const PhraseDictionaryFeature* feature):
PhraseDictionary(numScoreComponent,feature),
m_targetPhrases(NULL),
m_languageModels(NULL) {}
/**
 * Construct a dictionary with no target phrases and no language models
 * attached yet.
 *
 * numScoreComponent and feature are passed on to the PhraseDictionary base;
 * numInputScores is accepted but not used by this constructor.
 */
PhraseDictionaryInterpolated::PhraseDictionaryInterpolated(
  size_t numScoreComponent,
  size_t numInputScores,
  const PhraseDictionaryFeature* feature)
  : PhraseDictionary(numScoreComponent, feature)
  , m_targetPhrases(NULL)
  , m_languageModels(NULL)
{
}
bool PhraseDictionaryInterpolated::Load(
const std::vector<FactorType> &input
, const std::vector<FactorType> &output
, const std::vector<std::string>& config
, const std::vector<float> &weightT
, size_t tableLimit
, const LMList &languageModels
, float weightWP) {
bool PhraseDictionaryInterpolated::Load(
const std::vector<FactorType> &input
, const std::vector<FactorType> &output
, const std::vector<std::string>& config
, const std::vector<float> &weightT
, size_t tableLimit
, const LMList &languageModels
, float weightWP)
{
m_languageModels = &languageModels;
m_weightT = weightT;
m_tableLimit = tableLimit;
m_weightWP = weightWP;
m_languageModels = &languageModels;
m_weightT = weightT;
m_tableLimit = tableLimit;
m_weightWP = weightWP;
//The config should be as follows:
//0-3: type factor factor num-components (as usual)
//4: combination mode (e.g. naive)
//5-(length-2): List of phrase-table files
//length-1: Weight string, in the same format as used for tmcombine
//The config should be as follows:
//0-3: type factor factor num-components (as usual)
//4: combination mode (e.g. naive)
//5-(length-2): List of phrase-table files
//length-1: Weight string, in the same format as used for tmcombine
UTIL_THROW_IF(config.size() < 7, util::Exception, "Missing fields from phrase table configuration: expected at least 7");
UTIL_THROW_IF(config[4] != "naive", util::Exception, "Unsupported combination mode: '" << config[4] << "'");
UTIL_THROW_IF(config.size() < 7, util::Exception, "Missing fields from phrase table configuration: expected at least 7");
UTIL_THROW_IF(config[4] != "naive", util::Exception, "Unsupported combination mode: '" << config[4] << "'");
// Create the dictionaries
for (size_t i = 5; i < config.size()-1; ++i) {
m_dictionaries.push_back(DictionaryHandle(new PhraseDictionaryTreeAdaptor(
GetFeature()->GetNumScoreComponents(),
GetFeature()->GetNumInputScores(),
GetFeature())));
bool ret = m_dictionaries.back()->Load(
input,
output,
config[i],
weightT,
0,
languageModels,
weightWP);
if (!ret) return ret;
}
//Parse the weight strings
for (util::TokenIter<util::SingleCharacter, false> featureWeights(config.back(), util::SingleCharacter(';')); featureWeights; ++featureWeights) {
m_weights.push_back(vector<float>());
float sum = 0;
for (util::TokenIter<util::SingleCharacter, false> tableWeights(*featureWeights, util::SingleCharacter(',')); tableWeights; ++tableWeights) {
const float weight = boost::lexical_cast<float>(*tableWeights);
m_weights.back().push_back(weight);
sum += weight;
}
UTIL_THROW_IF(m_weights.back().size() != m_dictionaries.size(), util::Exception,
"Number of weights (" << m_weights.back().size() <<
") does not match number of dictionaries to combine (" << m_dictionaries.size() << ")");
UTIL_THROW_IF(abs(sum - 1) > 0.01, util::Exception, "Weights not normalised");
}
//check number of weight sets. Make sure there is a weight for every score component
//except for the last - which is assumed to be the phrase penalty.
UTIL_THROW_IF(m_weights.size() != 1 && m_weights.size() != GetFeature()->GetNumScoreComponents()-1, util::Exception, "Unexpected number of weight sets");
//if 1 weight set, then repeat
if (m_weights.size() == 1) {
while(m_weights.size() < GetFeature()->GetNumScoreComponents()-1) {
m_weights.push_back(m_weights[0]);
}
}
return true;
// Create the dictionaries
for (size_t i = 5; i < config.size()-1; ++i) {
m_dictionaries.push_back(DictionaryHandle(new PhraseDictionaryTreeAdaptor(
GetFeature()->GetNumScoreComponents(),
GetFeature()->GetNumInputScores(),
GetFeature())));
bool ret = m_dictionaries.back()->Load(
input,
output,
config[i],
weightT,
0,
languageModels,
weightWP);
if (!ret) return ret;
}
void PhraseDictionaryInterpolated::InitializeForInput(InputType const& source) {
for (size_t i = 0; i < m_dictionaries.size(); ++i) {
m_dictionaries[i]->InitializeForInput(source);
//Parse the weight strings
for (util::TokenIter<util::SingleCharacter, false> featureWeights(config.back(), util::SingleCharacter(';')); featureWeights; ++featureWeights) {
m_weights.push_back(vector<float>());
float sum = 0;
for (util::TokenIter<util::SingleCharacter, false> tableWeights(*featureWeights, util::SingleCharacter(',')); tableWeights; ++tableWeights) {
const float weight = boost::lexical_cast<float>(*tableWeights);
m_weights.back().push_back(weight);
sum += weight;
}
UTIL_THROW_IF(m_weights.back().size() != m_dictionaries.size(), util::Exception,
"Number of weights (" << m_weights.back().size() <<
") does not match number of dictionaries to combine (" << m_dictionaries.size() << ")");
UTIL_THROW_IF(abs(sum - 1) > 0.01, util::Exception, "Weights not normalised");
}
//check number of weight sets. Make sure there is a weight for every score component
//except for the last - which is assumed to be the phrase penalty.
UTIL_THROW_IF(m_weights.size() != 1 && m_weights.size() != GetFeature()->GetNumScoreComponents()-1, util::Exception, "Unexpected number of weight sets");
//if 1 weight set, then repeat
if (m_weights.size() == 1) {
while(m_weights.size() < GetFeature()->GetNumScoreComponents()-1) {
m_weights.push_back(m_weights[0]);
}
}
typedef
boost::unordered_set<TargetPhrase*,PhrasePtrHasher,PhrasePtrComparator> PhraseSet;
return true;
}
/**
 * Forward the per-input initialisation to every component dictionary, in
 * the order in which the dictionaries are stored.
 */
void PhraseDictionaryInterpolated::InitializeForInput(InputType const& source)
{
  size_t table = 0;
  while (table < m_dictionaries.size()) {
    m_dictionaries[table]->InitializeForInput(source);
    ++table;
  }
}
typedef
boost::unordered_set<TargetPhrase*,PhrasePtrHasher,PhrasePtrComparator> PhraseSet;
/**
 * Build the interpolated translation options for src.
 *
 * Every component dictionary is queried; for each distinct target phrase the
 * component scores are taken out of log space with exp(), combined as a
 * weighted sum using m_weights[score][table], and mapped back with log().
 * A table that does not contain the phrase contributes nothing to the sum.
 * The last score component is not interpolated; it is set to 1 (assumed to
 * be the phrase penalty).
 *
 * The returned collection is owned by this object and is deleted and rebuilt
 * on the next call, so the pointer must not be kept across calls.
 */
const TargetPhraseCollection*
PhraseDictionaryInterpolated::GetTargetPhraseCollection(const Phrase& src) const
{
  delete m_targetPhrases;
  m_targetPhrases = new TargetPhraseCollection();

  // Gather the union of all target phrases, remembering which table
  // contributed which phrase.
  PhraseSet allPhrases;
  vector<PhraseSet> phrasesByTable(m_dictionaries.size());
  for (size_t i = 0; i < m_dictionaries.size(); ++i) {
    const TargetPhraseCollection* phrases = m_dictionaries[i]->GetTargetPhraseCollection(src);
    if (phrases) {
      for (TargetPhraseCollection::const_iterator j = phrases->begin();
           j != phrases->end(); ++j) {
        allPhrases.insert(*j);
        phrasesByTable[i].insert(*j);
      }
    }
  }

  ScoreComponentCollection sparseVector;
  for (PhraseSet::const_iterator i = allPhrases.begin(); i != allPhrases.end(); ++i) {
    TargetPhrase* combinedPhrase = new TargetPhrase((Phrase)**i);
    combinedPhrase->SetSourcePhrase((*i)->GetSourcePhrase());
    combinedPhrase->SetAlignTerm(&((*i)->GetAlignTerm()));
    // Was GetAlignTerm(): the non-terminal alignment was being overwritten
    // with the terminal alignment.
    combinedPhrase->SetAlignNonTerm(&((*i)->GetAlignNonTerm()));

    Scores combinedScores(GetFeature()->GetNumScoreComponents());
    for (size_t j = 0; j < phrasesByTable.size(); ++j) {
      PhraseSet::const_iterator tablePhrase = phrasesByTable[j].find(combinedPhrase);
      if (tablePhrase != phrasesByTable[j].end()) {
        Scores tableScores = (*tablePhrase)->GetScoreBreakdown()
                             .GetScoresForProducer(GetFeature());
        // "k + 1 < size" skips the last component without the unsigned
        // underflow that "size - 1" has on an empty vector.
        for (size_t k = 0; k + 1 < tableScores.size(); ++k) {
          combinedScores[k] += m_weights[k][j] * exp(tableScores[k]);
        }
      }
    }

    //map back to log space
    for (size_t k = 0; k + 1 < combinedScores.size(); ++k) {
      combinedScores[k] = log(combinedScores[k]);
    }
    if (!combinedScores.empty()) {
      combinedScores.back() = 1; //assume last is penalty
    }

    combinedPhrase->SetScore(
      GetFeature(),
      combinedScores,
      sparseVector,
      m_weightT,
      m_weightWP,
      *m_languageModels);
    m_targetPhrases->Add(combinedPhrase);
  }

  m_targetPhrases->Prune(true,m_tableLimit);
  return m_targetPhrases;
}
}

View File

@ -34,12 +34,14 @@ namespace Moses
**/
class PhraseDictionaryInterpolated : public PhraseDictionary
{
public:
public:
PhraseDictionaryInterpolated
(size_t numScoreComponent,size_t numInputScores,const PhraseDictionaryFeature* feature);
(size_t numScoreComponent,size_t numInputScores,const PhraseDictionaryFeature* feature);
virtual ~PhraseDictionaryInterpolated() {delete m_targetPhrases;}
/** Frees the phrase collection currently held in m_targetPhrases. */
virtual ~PhraseDictionaryInterpolated() {
delete m_targetPhrases;
}
// initialize ...
bool Load(const std::vector<FactorType> &input
@ -58,7 +60,7 @@ class PhraseDictionaryInterpolated : public PhraseDictionary
throw std::logic_error("PhraseDictionaryInterpolated.CreateRuleLookupManager() Not implemented");
}
private:
private:
typedef boost::shared_ptr<PhraseDictionaryTreeAdaptor> DictionaryHandle;
std::vector<DictionaryHandle> m_dictionaries;

View File

@ -31,7 +31,8 @@ BOOST_AUTO_TEST_SUITE(phrase_length_feature)
//TODO: Factor out setup code so that it can be reused
static Word MakeWord(string text) {
static Word MakeWord(string text)
{
FactorCollection &factorCollection = FactorCollection::Instance();
const Factor* f = factorCollection.AddFactor(Input,0,text);
Word w;
@ -40,7 +41,8 @@ static Word MakeWord(string text) {
}
BOOST_AUTO_TEST_CASE(evaluate) {
BOOST_AUTO_TEST_CASE(evaluate)
{
Word w1 = MakeWord("w1");
Word w2 = MakeWord("y2");
Word w3 = MakeWord("x3");

View File

@ -39,7 +39,8 @@ namespace MosesTest
BOOST_AUTO_TEST_SUITE(target_bigram)
static Word MakeWord(string text) {
static Word MakeWord(string text)
{
FactorCollection &factorCollection = FactorCollection::Instance();
const Factor* f = factorCollection.AddFactor(Input,0,text);
Word w;
@ -47,30 +48,28 @@ static Word MakeWord(string text) {
return w;
}
class VocabFileFixture {
public:
template<class I>
VocabFileFixture(I begin, I end)
{
char name[] = "TargetBigramXXXXXX";
int fd = mkstemp(name);
BOOST_CHECK(fd != -1);
BOOST_CHECK(!close(fd));
filename = name;
ofstream out(name);
for (I i = begin; i != end; ++i)
{
out << *i << endl;
}
out.close();
class VocabFileFixture
{
public:
template<class I>
VocabFileFixture(I begin, I end) {
char name[] = "TargetBigramXXXXXX";
int fd = mkstemp(name);
BOOST_CHECK(fd != -1);
BOOST_CHECK(!close(fd));
filename = name;
ofstream out(name);
for (I i = begin; i != end; ++i) {
out << *i << endl;
}
out.close();
}
~VocabFileFixture()
{
BOOST_CHECK(!remove(filename.c_str()));
}
~VocabFileFixture() {
BOOST_CHECK(!remove(filename.c_str()));
}
string filename;
string filename;
};
/*

View File

@ -18,7 +18,8 @@
using namespace std;
namespace {
namespace
{
// configure regularisation
const char KEY_REFLEN[] = "reflen";
@ -33,8 +34,9 @@ namespace MosesTuning
BleuScorer::BleuScorer(const string& config)
: StatisticsBasedScorer("BLEU", config),
m_ref_length_type(CLOSEST) {
: StatisticsBasedScorer("BLEU", config),
m_ref_length_type(CLOSEST)
{
const string reflen = getConfig(KEY_REFLEN, REFLEN_CLOSEST);
if (reflen == REFLEN_AVERAGE) {
m_ref_length_type = AVERAGE;
@ -101,7 +103,8 @@ void BleuScorer::setReferenceFiles(const vector<string>& referenceFiles)
}
}
bool BleuScorer::OpenReference(const char* filename, size_t file_id) {
bool BleuScorer::OpenReference(const char* filename, size_t file_id)
{
ifstream ifs(filename);
if (!ifs) {
cerr << "Cannot open " << filename << endl;
@ -110,7 +113,8 @@ bool BleuScorer::OpenReference(const char* filename, size_t file_id) {
return OpenReferenceStream(&ifs, file_id);
}
bool BleuScorer::OpenReferenceStream(istream* is, size_t file_id) {
bool BleuScorer::OpenReferenceStream(istream* is, size_t file_id)
{
if (is == NULL) return false;
string line;
@ -203,25 +207,27 @@ statscore_t BleuScorer::calculateScore(const vector<int>& comps) const
return exp(logbleu);
}
int BleuScorer::CalcReferenceLength(size_t sentence_id, size_t length) {
int BleuScorer::CalcReferenceLength(size_t sentence_id, size_t length)
{
switch (m_ref_length_type) {
case AVERAGE:
return m_references[sentence_id]->CalcAverage();
break;
case CLOSEST:
return m_references[sentence_id]->CalcClosest(length);
break;
case SHORTEST:
return m_references[sentence_id]->CalcShortest();
break;
default:
cerr << "unknown reference types." << endl;
exit(1);
case AVERAGE:
return m_references[sentence_id]->CalcAverage();
break;
case CLOSEST:
return m_references[sentence_id]->CalcClosest(length);
break;
case SHORTEST:
return m_references[sentence_id]->CalcShortest();
break;
default:
cerr << "unknown reference types." << endl;
exit(1);
}
}
void BleuScorer::DumpCounts(ostream* os,
const NgramCounts& counts) const {
const NgramCounts& counts) const
{
for (NgramCounts::const_iterator it = counts.begin();
it != counts.end(); ++it) {
*os << "(";
@ -238,7 +244,8 @@ void BleuScorer::DumpCounts(ostream* os,
}
float smoothedSentenceBleu
(const std::vector<float>& stats, float smoothing, bool smoothBP) {
(const std::vector<float>& stats, float smoothing, bool smoothBP)
{
CHECK(stats.size() == kBleuNgramOrder * 2 + 1);
@ -248,7 +255,7 @@ float smoothedSentenceBleu
}
logbleu /= kBleuNgramOrder;
const float reflength = stats[(kBleuNgramOrder * 2)] +
(smoothBP ? smoothing : 0.0f);
(smoothBP ? smoothing : 0.0f);
const float brevity = 1.0 - reflength / stats[1];
if (brevity < 0.0) {
@ -263,7 +270,7 @@ float sentenceLevelBackgroundBleu(const std::vector<float>& sent, const std::vec
std::vector<float> stats;
CHECK(sent.size()==bg.size());
CHECK(sent.size()==kBleuNgramOrder*2+1);
for(size_t i=0;i<sent.size();i++)
for(size_t i=0; i<sent.size(); i++)
stats.push_back(sent[i]+bg[i]);
// Calculate BLEU
@ -282,7 +289,8 @@ float sentenceLevelBackgroundBleu(const std::vector<float>& sent, const std::vec
return exp(logbleu) * stats[kBleuNgramOrder*2];
}
float unsmoothedBleu(const std::vector<float>& stats) {
float unsmoothedBleu(const std::vector<float>& stats)
{
CHECK(stats.size() == kBleuNgramOrder * 2 + 1);
float logbleu = 0.0;
@ -298,50 +306,51 @@ float unsmoothedBleu(const std::vector<float>& stats) {
return exp(logbleu);
}
/**
 * Score the n-best list stored in scoreFile / featureFile.
 *
 * Returns one smoothedSentenceBleu value per entry, in file order. The
 * program exits with status 1 if either file yields no data or if the two
 * files disagree on the number of entries.
 */
vector<float> BleuScorer::ScoreNbestList(const string& scoreFile, const string& featureFile)
{
  // Single-entry lists; the code below is written for several files.
  vector<string> scoreFiles(1, scoreFile);
  vector<string> featureFiles(1, featureFile);

  vector<FeatureDataIterator> featureDataIters;
  vector<ScoreDataIterator> scoreDataIters;
  for (size_t file = 0; file < featureFiles.size(); ++file) {
    featureDataIters.push_back(FeatureDataIterator(featureFiles[file]));
    scoreDataIters.push_back(ScoreDataIterator(scoreFiles[file]));
  }

  typedef pair<size_t,size_t> HypothesisRef;  // (file index, entry index)
  vector<HypothesisRef> hypotheses;
  if (featureDataIters[0] == FeatureDataIterator::end()) {
    cerr << "Error: at the end of feature data iterator" << endl;
    exit(1);
  }
  for (size_t file = 0; file < featureFiles.size(); ++file) {
    if (featureDataIters[file] == FeatureDataIterator::end()) {
      cerr << "Error: Feature file " << file << " ended prematurely" << endl;
      exit(1);
    }
    if (scoreDataIters[file] == ScoreDataIterator::end()) {
      cerr << "Error: Score file " << file << " ended prematurely" << endl;
      exit(1);
    }
    if (featureDataIters[file]->size() != scoreDataIters[file]->size()) {
      cerr << "Error: features and scores have different size" << endl;
      exit(1);
    }
    for (size_t entry = 0; entry < featureDataIters[file]->size(); ++entry) {
      hypotheses.push_back(HypothesisRef(file, entry));
    }
  }

  // score the nbest list
  vector<float> bleuScores;
  for (vector<HypothesisRef>::const_iterator hyp = hypotheses.begin();
       hyp != hypotheses.end(); ++hyp) {
    bleuScores.push_back(
      smoothedSentenceBleu(scoreDataIters[hyp->first]->operator[](hyp->second)));
  }
  return bleuScores;
}

View File

@ -38,14 +38,22 @@ public:
virtual void setReferenceFiles(const std::vector<std::string>& referenceFiles);
virtual void prepareStats(std::size_t sid, const std::string& text, ScoreStats& entry);
virtual statscore_t calculateScore(const std::vector<int>& comps) const;
/** Number of statistics per entry: 2 * kBleuNgramOrder + 1. */
virtual std::size_t NumberOfScores() const {
  return 2 * kBleuNgramOrder + 1;
}
int CalcReferenceLength(std::size_t sentence_id, std::size_t length);
/** Returns the currently selected reference length type. */
ReferenceLengthType GetReferenceLengthType() const {
  return m_ref_length_type;
}

/** Selects the reference length type. */
void SetReferenceLengthType(ReferenceLengthType type) {
  m_ref_length_type = type;
}
/** Returns the reference list obtained from m_references.get(). */
const std::vector<Reference*>& GetReferences() const {
  return m_references.get();
}
/**
* Count the ngrams of each type, up to the given length in the input line.
@ -74,7 +82,7 @@ private:
* This function is used in PRO.
*/
float smoothedSentenceBleu
(const std::vector<float>& stats, float smoothing=1.0, bool smoothBP=false);
/** Computes sentence-level BLEU score given a background corpus.
* This function is used in batch MIRA.

View File

@ -10,16 +10,19 @@
using namespace MosesTuning;
namespace {
namespace
{
NgramCounts* g_counts = NULL;
// Returns the shared NgramCounts pointer; asserts that it has been set.
NgramCounts* GetNgramCounts()
{
  assert(g_counts);
  return g_counts;
}
// Stores the NgramCounts pointer that GetNgramCounts() hands out.
void SetNgramCounts(NgramCounts* counts)
{
  g_counts = counts;
}
@ -58,33 +61,38 @@ struct Fourgram {
NgramCounts::Key instance;
};
bool CheckUnigram(const std::string& str) {
bool CheckUnigram(const std::string& str)
{
Unigram unigram(str);
NgramCounts::Value v;
return GetNgramCounts()->Lookup(unigram.instance, &v);
}
bool CheckBigram(const std::string& a, const std::string& b) {
bool CheckBigram(const std::string& a, const std::string& b)
{
Bigram bigram(a, b);
NgramCounts::Value v;
return GetNgramCounts()->Lookup(bigram.instance, &v);
}
bool CheckTrigram(const std::string& a, const std::string& b,
const std::string& c) {
const std::string& c)
{
Trigram trigram(a, b, c);
NgramCounts::Value v;
return GetNgramCounts()->Lookup(trigram.instance, &v);
}
bool CheckFourgram(const std::string& a, const std::string& b,
const std::string& c, const std::string& d) {
const std::string& c, const std::string& d)
{
Fourgram fourgram(a, b, c, d);
NgramCounts::Value v;
return GetNgramCounts()->Lookup(fourgram.instance, &v);
}
void SetUpReferences(BleuScorer& scorer) {
void SetUpReferences(BleuScorer& scorer)
{
// The following examples are taken from Koehn, "Statistical Machine Translation",
// Cambridge University Press, 2010.
{
@ -115,7 +123,8 @@ void SetUpReferences(BleuScorer& scorer) {
} // namespace
BOOST_AUTO_TEST_CASE(bleu_reference_type) {
BOOST_AUTO_TEST_CASE(bleu_reference_type)
{
BleuScorer scorer;
// BleuScorer will use "closest" by default.
BOOST_CHECK_EQUAL(scorer.GetReferenceLengthType(), BleuScorer::CLOSEST);
@ -127,7 +136,8 @@ BOOST_AUTO_TEST_CASE(bleu_reference_type) {
BOOST_CHECK_EQUAL(scorer.GetReferenceLengthType(), BleuScorer::SHORTEST);
}
BOOST_AUTO_TEST_CASE(bleu_reference_type_with_config) {
BOOST_AUTO_TEST_CASE(bleu_reference_type_with_config)
{
{
BleuScorer scorer("reflen:average");
BOOST_CHECK_EQUAL(scorer.GetReferenceLengthType(), BleuScorer::AVERAGE);
@ -139,7 +149,8 @@ BOOST_AUTO_TEST_CASE(bleu_reference_type_with_config) {
}
}
BOOST_AUTO_TEST_CASE(bleu_count_ngrams) {
BOOST_AUTO_TEST_CASE(bleu_count_ngrams)
{
BleuScorer scorer;
std::string line = "I saw a girl with a telescope .";
@ -198,7 +209,8 @@ BOOST_AUTO_TEST_CASE(bleu_count_ngrams) {
BOOST_CHECK(CheckFourgram("with", "a", "telescope", "."));
}
BOOST_AUTO_TEST_CASE(bleu_clipped_counts) {
BOOST_AUTO_TEST_CASE(bleu_clipped_counts)
{
BleuScorer scorer;
SetUpReferences(scorer);
std::string line("israeli officials responsibility of airport safety");
@ -220,7 +232,8 @@ BOOST_AUTO_TEST_CASE(bleu_clipped_counts) {
BOOST_CHECK_EQUAL(entry.get(7), 3); // fourgram
}
BOOST_AUTO_TEST_CASE(calculate_actual_score) {
BOOST_AUTO_TEST_CASE(calculate_actual_score)
{
BOOST_REQUIRE(4 == kBleuNgramOrder);
std::vector<int> stats(2 * kBleuNgramOrder + 1);
BleuScorer scorer;
@ -247,7 +260,8 @@ BOOST_AUTO_TEST_CASE(calculate_actual_score) {
BOOST_CHECK_CLOSE(0.5115f, scorer.calculateScore(stats), 0.01);
}
BOOST_AUTO_TEST_CASE(sentence_level_bleu) {
BOOST_AUTO_TEST_CASE(sentence_level_bleu)
{
BOOST_REQUIRE(4 == kBleuNgramOrder);
std::vector<float> stats(2 * kBleuNgramOrder + 1);

View File

@ -6,9 +6,11 @@
using namespace std;
namespace {
namespace
{
// Substitution cost between two word ids: 0 when equal, 1 otherwise.
inline int CalcDistance(int word1, int word2)
{
  return word1 == word2 ? 0 : 1;
}
@ -19,8 +21,8 @@ namespace MosesTuning
// The scorer is named "CDER" when long jumps are allowed and "WER" otherwise.
CderScorer::CderScorer(const string& config, bool allowed_long_jumps)
  : StatisticsBasedScorer(allowed_long_jumps ? "CDER" : "WER", config),
    m_allowed_long_jumps(allowed_long_jumps) {}
// Destructor: the body is empty, no explicit cleanup happens here.
CderScorer::~CderScorer() {}
@ -82,7 +84,8 @@ float CderScorer::calculateScore(const vector<int>& comps) const
}
void CderScorer::computeCD(const sent_t& cand, const sent_t& ref,
vector<int>& stats) const {
vector<int>& stats) const
{
int I = cand.size() + 1; // Number of inter-words positions in candidate sentence
int L = ref.size() + 1; // Number of inter-words positions in reference sentence
@ -95,11 +98,9 @@ void CderScorer::computeCD(const sent_t& cand, const sent_t& ref,
for (int i = 1; i < I; ++i) (*row)[i] = 1;
// Calculating costs for next row using costs from the previous row.
while (++l < L)
{
while (++l < L) {
vector<int>* nextRow = new vector<int>(I);
for (int i = 0; i < I; ++i)
{
for (int i = 0; i < I; ++i) {
vector<int> possibleCosts;
if (i > 0) {
possibleCosts.push_back((*nextRow)[i-1] + 1); // Deletion

View File

@ -13,8 +13,9 @@ namespace MosesTuning
/**
* CderScorer class can compute both CDER and WER metric.
*/
class CderScorer: public StatisticsBasedScorer {
public:
class CderScorer: public StatisticsBasedScorer
{
public:
explicit CderScorer(const std::string& config, bool allowed_long_jumps = true);
~CderScorer();
@ -24,11 +25,13 @@ class CderScorer: public StatisticsBasedScorer {
virtual void prepareStatsVector(std::size_t sid, const std::string& text, std::vector<int>& stats);
/** This scorer reports two statistics per entry. */
virtual std::size_t NumberOfScores() const {
  return 2;
}
virtual float calculateScore(const std::vector<int>& comps) const;
private:
private:
bool m_allowed_long_jumps;
typedef std::vector<int> sent_t;

View File

@ -27,11 +27,11 @@ namespace MosesTuning
{
Data::Data(Scorer* scorer, const string& sparse_weights_file)
: m_scorer(scorer),
m_score_type(m_scorer->getName()),
m_num_scores(0),
m_score_data(new ScoreData(m_scorer)),
m_feature_data(new FeatureData)
: m_scorer(scorer),
m_score_type(m_scorer->getName()),
m_num_scores(0),
m_score_data(new ScoreData(m_scorer)),
m_feature_data(new FeatureData)
{
TRACE_ERR("Data::m_score_type " << m_score_type << endl);
TRACE_ERR("Data::Scorer type from Scorer: " << m_scorer->getName() << endl);
@ -48,7 +48,8 @@ Data::Data(Scorer* scorer, const string& sparse_weights_file)
//ADDED BY TS
// TODO: This is too long; consider creating additional functions to
// reduce the lines of this function.
void Data::removeDuplicates() {
void Data::removeDuplicates()
{
size_t nSentences = m_feature_data->size();
assert(m_score_data->size() == nSentences);
@ -128,7 +129,8 @@ void Data::removeDuplicates() {
}
//END_ADDED
void Data::load(const std::string &featfile, const std::string &scorefile) {
void Data::load(const std::string &featfile, const std::string &scorefile)
{
m_feature_data->load(featfile, m_sparse_weights);
m_score_data->load(scorefile);
}
@ -192,7 +194,8 @@ void Data::loadNBest(const string &file)
}
}
void Data::save(const std::string &featfile, const std::string &scorefile, bool bin) {
void Data::save(const std::string &featfile, const std::string &scorefile, bool bin)
{
if (bin)
cerr << "Binary write mode is selected" << endl;
else
@ -202,7 +205,8 @@ void Data::save(const std::string &featfile, const std::string &scorefile, bool
m_score_data->save(scorefile, bin);
}
void Data::InitFeatureMap(const string& str) {
void Data::InitFeatureMap(const string& str)
{
string buf = str;
string substr;
string features = "";
@ -231,7 +235,8 @@ void Data::InitFeatureMap(const string& str) {
}
void Data::AddFeatures(const string& str,
int sentence_index) {
int sentence_index)
{
string buf = str;
string substr;
FeatureStats feature_entry;

View File

@ -44,18 +44,28 @@ public:
m_feature_data->clear();
}
/** Returns the handle to the score data. */
ScoreDataHandle getScoreData() {
  return m_score_data;
}
/** Returns the handle to the feature data. */
FeatureDataHandle getFeatureData() {
  return m_feature_data;
}
/** Returns the scorer pointer stored in m_scorer. */
Scorer* getScorer() {
  return m_scorer;
}
/** Forwards to the feature data's NumberOfFeatures(). */
std::size_t NumberOfFeatures() const {
return m_feature_data->NumberOfFeatures();
}
/** Returns the feature-name string held by the feature data. */
std::string Features() const {
  return m_feature_data->Features();
}

/** Forwards the feature-name string f to the feature data. */
void Features(const std::string &f) {
  m_feature_data->Features(f);
}
void loadNBest(const std::string &file);

View File

@ -10,7 +10,8 @@
using namespace MosesTuning;
//very basic test of sharding
BOOST_AUTO_TEST_CASE(shard_basic) {
BOOST_AUTO_TEST_CASE(shard_basic)
{
boost::scoped_ptr<Scorer> scorer(ScorerFactory::getScorer("BLEU", ""));
Data data(scorer.get());
FeatureArray fa1, fa2, fa3, fa4;
@ -39,7 +40,8 @@ BOOST_AUTO_TEST_CASE(shard_basic) {
BOOST_CHECK_EQUAL(shards[1].getFeatureData()->size(),(std::size_t)2);
}
BOOST_AUTO_TEST_CASE(init_feature_map_test) {
BOOST_AUTO_TEST_CASE(init_feature_map_test)
{
boost::scoped_ptr<Scorer> scorer(ScorerFactory::getScorer("BLEU", ""));
Data data(scorer.get());
@ -49,7 +51,8 @@ BOOST_AUTO_TEST_CASE(init_feature_map_test) {
BOOST_CHECK_EQUAL(expected, data.Features());
}
BOOST_AUTO_TEST_CASE(add_features_test) {
BOOST_AUTO_TEST_CASE(add_features_test)
{
boost::scoped_ptr<Scorer> scorer(ScorerFactory::getScorer("BLEU", ""));
Data data(scorer.get());

View File

@ -20,20 +20,20 @@ class _fdstream
{
protected:
_fdstream() :
_file_descriptor(-1), _filebuf(NULL)
_file_descriptor(-1), _filebuf(NULL)
{ }
_fdstream(int file_descriptor, std::ios_base::openmode openmode) :
_file_descriptor(file_descriptor), _openmode(openmode)
{
_file_descriptor(file_descriptor), _openmode(openmode) {
_filebuf = NULL;
open(file_descriptor, openmode);
}
// Returns the stored open mode.
std::ios_base::openmode openmode() const {
  return _openmode;
}
void open(int file_descriptor, std::ios_base::openmode openmode)
{
void open(int file_descriptor, std::ios_base::openmode openmode) {
if (!_filebuf)
// We create a C++ stream from a file descriptor
// stdio_filebuf is not synced with stdio.
@ -41,11 +41,10 @@ protected:
// You can also create the filebuf from a FILE* with
// FILE* f = fdopen(file_descriptor, mode);
_filebuf = new __gnu_cxx::stdio_filebuf<char> (file_descriptor,
openmode);
openmode);
}
virtual ~_fdstream()
{
virtual ~_fdstream() {
close(_file_descriptor);
delete _filebuf;
_filebuf = NULL;
@ -60,59 +59,51 @@ class ifdstream : public _fdstream
{
public:
// Creates an unopened input stream; open() must be called before use.
ifdstream() :
  _fdstream(), _stream(NULL)
{ }
ifdstream(int file_descriptor) :
_fdstream(file_descriptor, std::ios_base::in)
{
_fdstream(file_descriptor, std::ios_base::in) {
_stream = new std::istream(_filebuf);
}
// Opens the descriptor for reading. Does nothing when a stream already exists.
void open(int file_descriptor) {
  if (!_stream) {
    _fdstream::open(file_descriptor, std::ios_base::in);
    _stream = new std::istream(_filebuf);
  }
}
// Extracts one token into str using the underlying istream's operator>>.
ifdstream& operator>> (std::string& str) {
  (*_stream) >> str;
  return *this;
}
std::size_t getline(std::string& str)
{
std::size_t getline(std::string& str) {
char tmp[BUFFER_SIZE];
std::size_t ret = getline(tmp, BUFFER_SIZE);
str = tmp;
return ret;
}
std::size_t getline(char* s, std::streamsize n)
{
std::size_t getline(char* s, std::streamsize n) {
return (getline(s, n, '\n'));
}
std::size_t getline(char* s, std::streamsize n, char delim)
{
std::size_t getline(char* s, std::streamsize n, char delim) {
int i = 0;
do{
do {
s[i] = _stream->get();
i++;
}while(i < n-1 && s[i-1] != delim && s[i-1] != '\0');
} while(i < n-1 && s[i-1] != delim && s[i-1] != '\0');
s[i-1] = '\0'; // overwrite the delimiter given with string end
return i-1;
}
// Releases the istream wrapper allocated by the constructor or open().
~ifdstream() {
  //this->~_fdstream();
  delete _stream;
}
@ -125,27 +116,23 @@ class ofdstream : public _fdstream
{
public:
// Creates an unopened output stream; open() must be called before use.
ofdstream() :
  _fdstream(), _stream(NULL)
{ }
ofdstream(int file_descriptor) :
_fdstream(file_descriptor, std::ios_base::out)
{
_fdstream(file_descriptor, std::ios_base::out) {
_stream = new std::ostream(_filebuf);
}
void open(int file_descriptor)
{
if (!_stream)
{
void open(int file_descriptor) {
if (!_stream) {
_fdstream::open(file_descriptor, std::ios_base::out);
_stream = new std::ostream(_filebuf);
}
}
ofdstream& operator<< (const std::string& str)
{
ofdstream& operator<< (const std::string& str) {
if (_stream->good())
(*_stream) << str;
@ -153,8 +140,7 @@ public:
return *this;
}
// Releases the ostream wrapper allocated by the constructor or open().
~ofdstream() {
  //this->~_fdstream();
  delete _stream;
}

View File

@ -19,14 +19,14 @@ namespace MosesTuning
// Starts with index 0 and a feature count of 0.
FeatureArray::FeatureArray()
  : m_index(0), m_num_features(0) {}
// Destructor: the body is empty, no explicit cleanup happens here.
FeatureArray::~FeatureArray() {}
void FeatureArray::savetxt(ostream* os)
{
*os << FEATURES_TXT_BEGIN << " " << m_index << " " << m_array.size()
<< " " << m_num_features << " " << m_features << endl;
<< " " << m_num_features << " " << m_features << endl;
for (featarray_t::iterator i = m_array.begin(); i != m_array.end(); ++i) {
i->savetxt(os);
*os << endl;
@ -37,7 +37,7 @@ void FeatureArray::savetxt(ostream* os)
void FeatureArray::savebin(ostream* os)
{
*os << FEATURES_BIN_BEGIN << " " << m_index << " " << m_array.size()
<< " " << m_num_features << " " << m_features << endl;
<< " " << m_num_features << " " << m_features << endl;
for (featarray_t::iterator i = m_array.begin(); i != m_array.end(); ++i)
i->savebin(os);

View File

@ -36,16 +36,28 @@ public:
FeatureArray();
~FeatureArray();
// Removes every stored entry from m_array.
void clear() {
  m_array.clear();
}
// Returns the index stored for this array.
int getIndex() const {
  return m_index;
}

// Stores value as the index of this array.
void setIndex(const int value) {
  m_index = value;
}
// Returns the i-th entry via m_array.at(i).
FeatureStats& get(std::size_t i) {
  return m_array.at(i);
}

// Const overload of get().
const FeatureStats& get(std::size_t i) const {
  return m_array.at(i);
}
// Appends a copy of e to the array.
void add(FeatureStats& e) {
  m_array.push_back(e);
}
//ADDED BY TS
void swap(std::size_t i, std::size_t j) {
@ -59,13 +71,23 @@ public:
void merge(FeatureArray& e);
// Number of entries currently stored.
std::size_t size() const {
  return m_array.size();
}
// Returns the stored feature count.
std::size_t NumberOfFeatures() const {
  return m_num_features;
}

// Stores v as the feature count.
void NumberOfFeatures(std::size_t v) {
  m_num_features = v;
}
// Returns a copy of the stored feature-name string.
std::string Features() const {
  return m_features;
}

// Stores f as the feature-name string.
void Features(const std::string& f) {
  m_features = f;
}
void savetxt(std::ostream* os);
void savebin(std::ostream* os);

View File

@ -20,7 +20,7 @@ namespace MosesTuning
// Starts with a feature count of 0.
FeatureData::FeatureData()
  : m_num_features(0) {}
void FeatureData::save(ostream* os, bool bin)
{
@ -38,7 +38,8 @@ void FeatureData::save(const string &file, bool bin)
ofs.close();
}
void FeatureData::save(bool bin) {
void FeatureData::save(bool bin)
{
save(&cout, bin);
}
@ -145,7 +146,8 @@ void FeatureData::setFeatureMap(const string& feat)
}
}
string FeatureData::ToString() const {
string FeatureData::ToString() const
{
string res;
{

View File

@ -33,7 +33,9 @@ public:
FeatureData();
~FeatureData() {}
// Removes every stored element from m_array.
void clear() {
  m_array.clear();
}
FeatureArray& get(size_t idx) {
return m_array.at(idx);
@ -61,13 +63,23 @@ public:
void add(FeatureArray& e);
void add(FeatureStats& e, int sent_idx);
// Number of elements currently stored in m_array.
std::size_t size() const {
  return m_array.size();
}
// Returns the stored feature count.
std::size_t NumberOfFeatures() const {
  return m_num_features;
}

// Stores v as the feature count.
void NumberOfFeatures(std::size_t v) {
  m_num_features = v;
}
// Returns a copy of the stored feature-name string.
std::string Features() const {
  return m_features;
}

// Stores f as the feature-name string.
void Features(const std::string& f) {
  m_features = f;
}
void save(const std::string &file, bool bin=false);
void save(std::ostream* os, bool bin=false);

View File

@ -34,7 +34,8 @@ namespace MosesTuning
{
int ParseInt(const StringPiece& str ) {
int ParseInt(const StringPiece& str )
{
char* errIndex;
//could wrap?
int value = static_cast<int>(strtol(str.data(), &errIndex,10));
@ -44,7 +45,8 @@ int ParseInt(const StringPiece& str ) {
return value;
}
float ParseFloat(const StringPiece& str) {
float ParseFloat(const StringPiece& str)
{
char* errIndex;
float value = static_cast<float>(strtod(str.data(), &errIndex));
if (errIndex == str.data()) {
@ -53,11 +55,13 @@ float ParseFloat(const StringPiece& str) {
return value;
}
// Two items are equal when both their dense and their sparse parts match.
bool operator==(FeatureDataItem const& item1, FeatureDataItem const& item2)
{
  // Compare item1 against item2; comparing item1 with itself would ignore
  // item2 entirely.
  return item1.dense==item2.dense && item1.sparse==item2.sparse;
}
size_t hash_value(FeatureDataItem const& item) {
size_t hash_value(FeatureDataItem const& item)
{
size_t seed = 0;
boost::hash_combine(seed,item.dense);
boost::hash_combine(seed,item.sparse);
@ -67,14 +71,16 @@ size_t hash_value(FeatureDataItem const& item) {
// Default constructor: opens no input file.
FeatureDataIterator::FeatureDataIterator() {}
// Opens filename through a FilePiece and immediately reads the first item.
FeatureDataIterator::FeatureDataIterator(const string& filename)
{
  m_in.reset(new FilePiece(filename.c_str()));
  readNext();
}
// Destructor: the body is empty, no explicit cleanup happens here.
FeatureDataIterator::~FeatureDataIterator() {}
void FeatureDataIterator::readNext() {
void FeatureDataIterator::readNext()
{
m_next.clear();
try {
StringPiece marker = m_in->ReadDelimited();
@ -117,11 +123,13 @@ void FeatureDataIterator::readNext() {
}
}
void FeatureDataIterator::increment() {
void FeatureDataIterator::increment()
{
readNext();
}
bool FeatureDataIterator::equal(const FeatureDataIterator& rhs) const {
bool FeatureDataIterator::equal(const FeatureDataIterator& rhs) const
{
if (!m_in && !rhs.m_in) {
return true;
} else if (!m_in) {
@ -130,11 +138,12 @@ bool FeatureDataIterator::equal(const FeatureDataIterator& rhs) const {
return false;
} else {
return m_in->FileName() == rhs.m_in->FileName() &&
m_in->Offset() == rhs.m_in->Offset();
m_in->Offset() == rhs.m_in->Offset();
}
}
// Returns the items filled in by the most recent readNext().
const vector<FeatureDataItem>& FeatureDataIterator::dereference() const
{
  return m_next;
}

View File

@ -37,7 +37,10 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "FeatureStats.h"
// Forward declaration so this header does not need the FilePiece definition.
namespace util
{
class FilePiece;
}
namespace MosesTuning
{
@ -45,10 +48,10 @@ namespace MosesTuning
// Exception whose message names the offending line and the file it came from.
class FileFormatException : public util::Exception
{
public:
  explicit FileFormatException(const std::string& filename, const std::string& line) {
    *this << "Error in line \"" << line << "\" of " << filename;
  }
};
@ -61,9 +64,9 @@ float ParseFloat(const StringPiece& str);
// One feature record: a dense float vector plus a sparse vector.
class FeatureDataItem
{
public:
  std::vector<float> dense;
  SparseVector sparse;
};
bool operator==(FeatureDataItem const& item1, FeatureDataItem const& item2);
@ -71,30 +74,30 @@ std::size_t hash_value(FeatureDataItem const& item);
// Forward-traversal iterator (built on boost::iterator_facade) whose value is
// a const std::vector<FeatureDataItem>. A default-constructed iterator serves
// as the end marker returned by end().
class FeatureDataIterator :
  public boost::iterator_facade<FeatureDataIterator,
  const std::vector<FeatureDataItem>,
  boost::forward_traversal_tag>
{
public:
  FeatureDataIterator();
  explicit FeatureDataIterator(const std::string& filename);
  ~FeatureDataIterator();

  // Returns a default-constructed iterator to compare against.
  static FeatureDataIterator end() {
    return FeatureDataIterator();
  }

private:
  // iterator_facade reaches the three hooks below through this friend.
  friend class boost::iterator_core_access;

  void increment();
  bool equal(const FeatureDataIterator& rhs) const;
  const std::vector<FeatureDataItem>& dereference() const;

  void readNext();

  boost::shared_ptr<util::FilePiece> m_in;
  std::vector<FeatureDataItem> m_next;
};
}

View File

@ -7,10 +7,12 @@
using namespace MosesTuning;
namespace {
namespace
{
void CheckFeatureMap(const FeatureData* feature_data,
const char* str, int num_feature, int* cnt) {
const char* str, int num_feature, int* cnt)
{
for (int i = 0; i < num_feature; ++i) {
std::stringstream ss;
ss << str << "_" << i;
@ -23,7 +25,8 @@ void CheckFeatureMap(const FeatureData* feature_data,
} // namespace
BOOST_AUTO_TEST_CASE(set_feature_map) {
BOOST_AUTO_TEST_CASE(set_feature_map)
{
std::string str("d_0 d_1 d_2 d_3 d_4 d_5 d_6 lm_0 lm_1 tm_0 tm_1 tm_2 tm_3 tm_4 w_0 ");
FeatureData feature_data;

View File

@ -18,7 +18,8 @@
using namespace std;
namespace
{
// Capacity that the default FeatureStats constructor allocates.
const int kAvailableSize = 8;
} // namespace
@ -29,20 +30,23 @@ namespace MosesTuning
SparseVector::name2id_t SparseVector::m_name_to_id;
SparseVector::id2name_t SparseVector::m_id_to_name;
// Returns the value stored for the feature called name, or 0 when the name
// has no id in m_name_to_id.
FeatureStatsType SparseVector::get(const string& name) const
{
  name2id_t::const_iterator name2id_iter = m_name_to_id.find(name);
  if (name2id_iter == m_name_to_id.end()) return 0;
  size_t id = name2id_iter->second;
  return get(id);
}
// Returns the value stored under id, or 0 when this vector has no such entry.
FeatureStatsType SparseVector::get(size_t id) const
{
  fvector_t::const_iterator fvector_iter = m_fvector.find(id);
  if (fvector_iter == m_fvector.end()) return 0;
  return fvector_iter->second;
}
void SparseVector::set(const string& name, FeatureStatsType value) {
void SparseVector::set(const string& name, FeatureStatsType value)
{
name2id_t::const_iterator name2id_iter = m_name_to_id.find(name);
size_t id = 0;
if (name2id_iter == m_name_to_id.end()) {
@ -55,7 +59,8 @@ void SparseVector::set(const string& name, FeatureStatsType value) {
m_fvector[id] = value;
}
void SparseVector::write(ostream& out, const string& sep) const {
void SparseVector::write(ostream& out, const string& sep) const
{
for (fvector_t::const_iterator i = m_fvector.begin(); i != m_fvector.end(); ++i) {
if (abs(i->second) < 0.00001) continue;
string name = m_id_to_name[i->first];
@ -63,11 +68,13 @@ void SparseVector::write(ostream& out, const string& sep) const {
}
}
void SparseVector::clear() {
void SparseVector::clear()
{
m_fvector.clear();
}
void SparseVector::load(const string& file) {
void SparseVector::load(const string& file)
{
ifstream in(file.c_str());
if (!in) {
throw runtime_error("Failed to open sparse weights file: " + file);
@ -84,39 +91,44 @@ void SparseVector::load(const string& file) {
}
}
// Subtracts rhs element-wise. Ids that exist only in rhs are treated as 0 on
// the left-hand side (get() returns 0 for them) and are inserted.
SparseVector& SparseVector::operator-=(const SparseVector& rhs)
{
  for (fvector_t::const_iterator i = rhs.m_fvector.begin();
       i != rhs.m_fvector.end(); ++i) {
    m_fvector[i->first] = get(i->first) - (i->second);
  }
  return *this;
}
// Dot product with rhs, iterating over this vector's entries; ids missing
// from rhs contribute 0.
FeatureStatsType SparseVector::inner_product(const SparseVector& rhs) const
{
  FeatureStatsType product = 0.0;
  for (fvector_t::const_iterator i = m_fvector.begin();
       i != m_fvector.end(); ++i) {
    product += ((i->second) * (rhs.get(i->first)));
  }
  return product;
}
// Returns lhs - rhs as a new vector, built from a copy of lhs and operator-=.
SparseVector operator-(const SparseVector& lhs, const SparseVector& rhs)
{
  SparseVector res(lhs);
  res -= rhs;
  return res;
}
// Dot product of two sparse vectors. The member version iterates over the
// vector it is called on, so it is invoked on the smaller of the two.
FeatureStatsType inner_product(const SparseVector& lhs, const SparseVector& rhs)
{
  if (lhs.size() >= rhs.size()) {
    return rhs.inner_product(lhs);
  } else {
    return lhs.inner_product(rhs);
  }
}
std::vector<std::size_t> SparseVector::feats() const {
std::vector<std::size_t> SparseVector::feats() const
{
std::vector<std::size_t> toRet;
for(fvector_t::const_iterator iter = m_fvector.begin();
iter!=m_fvector.end();
@ -126,7 +138,8 @@ std::vector<std::size_t> SparseVector::feats() const {
return toRet;
}
std::size_t SparseVector::encode(const std::string& name) {
std::size_t SparseVector::encode(const std::string& name)
{
name2id_t::const_iterator name2id_iter = m_name_to_id.find(name);
size_t id = 0;
if (name2id_iter == m_name_to_id.end()) {
@ -139,26 +152,29 @@ std::size_t SparseVector::encode(const std::string& name) {
return id;
}
std::string SparseVector::decode(std::size_t id) {
std::string SparseVector::decode(std::size_t id)
{
return m_id_to_name[id];
}
// Two sparse vectors are equal when their id-to-value maps are equal.
bool operator==(SparseVector const& item1, SparseVector const& item2)
{
  return item1.m_fvector==item2.m_fvector;
}
// Hashes a sparse vector by hashing its id-to-value map with boost::hash.
std::size_t hash_value(SparseVector const& item)
{
  boost::hash<SparseVector::fvector_t> hasher;
  return hasher(item.m_fvector);
}
// Allocates room for kAvailableSize values and starts with zero entries.
FeatureStats::FeatureStats()
  : m_available_size(kAvailableSize), m_entries(0),
    m_array(new FeatureStatsType[m_available_size]) {}
// Allocates exactly size entries and zero-fills them.
FeatureStats::FeatureStats(const size_t size)
  : m_available_size(size), m_entries(size),
    m_array(new FeatureStatsType[m_available_size])
{
  memset(m_array, 0, GetArraySizeWithBytes());
}
@ -276,7 +292,8 @@ void FeatureStats::savetxt(ostream* os)
*os << *this;
}
void FeatureStats::savetxt() {
void FeatureStats::savetxt()
{
savetxt(&cout);
}
@ -298,7 +315,8 @@ ostream& operator<<(ostream& o, const FeatureStats& e)
return o;
}
bool operator==(const FeatureStats& f1, const FeatureStats& f2) {
bool operator==(const FeatureStats& f1, const FeatureStats& f2)
{
size_t size = f1.size();
if (size != f2.size())

View File

@ -21,7 +21,8 @@ namespace MosesTuning
// Minimal sparse vector
class SparseVector {
class SparseVector
{
public:
typedef std::map<std::size_t,FeatureStatsType> fvector_t;
typedef std::map<std::string, std::size_t> name2id_t;
@ -32,7 +33,9 @@ public:
void set(const std::string& name, FeatureStatsType value);
void clear();
void load(const std::string& file);
// Number of entries stored in this sparse vector.
std::size_t size() const {
  return m_fvector.size();
}
void write(std::ostream& out, const std::string& sep = " ") const;
@ -78,7 +81,9 @@ public:
void Copy(const FeatureStats &stats);
// True once the number of entries has reached the allocated capacity.
bool isfull() const {
  return m_entries >= m_available_size;
}
void expand();
void add(FeatureStatsType v);
void addSparse(const std::string& name, FeatureStatsType v);
@ -93,23 +98,37 @@ public:
clear();
}
// Returns the i-th dense value; the index is not range-checked here.
FeatureStatsType get(std::size_t i) {
  return m_array[i];
}

// Const overload of get().
FeatureStatsType get(std::size_t i)const {
  return m_array[i];
}

// Returns the underlying dense array.
featstats_t getArray() const {
  return m_array;
}
// Returns the sparse part of the stats.
const SparseVector& getSparse() const {
  return m_map;
}
void set(std::string &theString, const SparseVector& sparseWeights);
// Same value as GetArraySizeWithBytes().
inline std::size_t bytes() const {
  return GetArraySizeWithBytes();
}
// Size in bytes of the entries in use (m_entries values, not the capacity).
std::size_t GetArraySizeWithBytes() const {
return m_entries * sizeof(FeatureStatsType);
}
// Number of dense entries in use.
std::size_t size() const {
  return m_entries;
}
// Number of entries the dense array has room for.
std::size_t available() const {
  return m_available_size;
}
void savetxt(const std::string &file);
void savetxt(std::ostream* os);

View File

@ -5,15 +5,17 @@
using namespace std;
namespace
{
// True when filename is longer than three characters and ends in ".gz".
bool IsGzipFile(const std::string &filename)
{
  return filename.size() > 3 &&
         filename.substr(filename.size() - 3, 3) == ".gz";
}
} // namespace
inputfilestream::inputfilestream(const std::string &filePath)
: std::istream(0), m_streambuf(0), m_is_good(false)
: std::istream(0), m_streambuf(0), m_is_good(false)
{
// check if file is readable
std::filebuf* fb = new std::filebuf();
@ -40,7 +42,7 @@ void inputfilestream::close()
}
outputfilestream::outputfilestream(const std::string &filePath)
: std::ostream(0), m_streambuf(0), m_is_good(false)
: std::ostream(0), m_streambuf(0), m_is_good(false)
{
// check if file is readable
std::filebuf* fb = new std::filebuf();

View File

@ -16,7 +16,9 @@ public:
explicit inputfilestream(const std::string &filePath);
virtual ~inputfilestream();
// Returns the stored m_is_good flag.
bool good() const {
  return m_is_good;
}
void close();
};
@ -30,7 +32,9 @@ public:
explicit outputfilestream(const std::string &filePath);
virtual ~outputfilestream();
// Returns the stored m_is_good flag.
bool good() const {
  return m_is_good;
}
void close();
};

View File

@ -5,7 +5,8 @@
#include <cstdio>
#include <iostream>
GzFileBuf::GzFileBuf(const char* filename) {
GzFileBuf::GzFileBuf(const char* filename)
{
m_gz_file = gzopen(filename, "rb");
if (m_gz_file == NULL) {
std::cerr << "ERROR: Failed to open " << filename << std::endl;
@ -16,16 +17,19 @@ GzFileBuf::GzFileBuf(const char* filename) {
m_buf + sizeof(int)); // end position
}
// Closes the gzip handle opened by the constructor.
GzFileBuf::~GzFileBuf()
{
  gzclose(m_gz_file);
}
// Writing is not supported by this buffer. Report failure with eof(), the
// streambuf convention; a bare `throw;` with no exception in flight would
// call std::terminate.
int GzFileBuf::overflow(int_type c)
{
  (void)c;  // nothing is ever written
  return traits_type::eof();
}
// read one character
int GzFileBuf::underflow() {
int GzFileBuf::underflow()
{
// is read position before end of m_buf?
if (gptr() < egptr()) {
return traits_type::to_int_type(*gptr());
@ -64,17 +68,20 @@ int GzFileBuf::underflow() {
}
std::streampos GzFileBuf::seekpos(
std::streampos sp,
std::ios_base::openmode which) {
std::streampos sp,
std::ios_base::openmode which)
{
throw;
}
std::streamsize GzFileBuf::xsgetn(char* s,
std::streamsize num) {
std::streamsize num)
{
return static_cast<std::streamsize>(gzread(m_gz_file,s,num));
}
std::streamsize GzFileBuf::xsputn(const char* s,
std::streamsize num) {
std::streamsize num)
{
throw;
}

View File

@ -17,8 +17,8 @@ protected:
virtual int_type underflow();
virtual std::streampos seekpos(
std::streampos sp,
std::ios_base::openmode which = std::ios_base::in | std::ios_base::out);
std::streampos sp,
std::ios_base::openmode which = std::ios_base::in | std::ios_base::out);
virtual std::streamsize xsgetn(char* s, std::streamsize num);

View File

@ -12,9 +12,9 @@ namespace MosesTuning
StreamingHypPackEnumerator::StreamingHypPackEnumerator
(
vector<std::string> const& featureFiles,
vector<std::string> const& scoreFiles
)
vector<std::string> const& featureFiles,
vector<std::string> const& scoreFiles
)
: m_featureFiles(featureFiles),
m_scoreFiles(scoreFiles)
{
@ -25,7 +25,7 @@ StreamingHypPackEnumerator::StreamingHypPackEnumerator
if (featureFiles.size() != scoreFiles.size()) {
cerr << "Error: Number of feature files (" << featureFiles.size() <<
") does not match number of score files (" << scoreFiles.size() << ")" << endl;
") does not match number of score files (" << scoreFiles.size() << ")" << endl;
exit(1);
}
@ -34,7 +34,8 @@ StreamingHypPackEnumerator::StreamingHypPackEnumerator
m_iNumDense = -1;
}
size_t StreamingHypPackEnumerator::num_dense() const {
size_t StreamingHypPackEnumerator::num_dense() const
{
if(m_iNumDense<0) {
cerr << "Error: Requested num_dense() for an unprimed StreamingHypPackEnumerator" << endl;
exit(1);
@ -42,7 +43,8 @@ size_t StreamingHypPackEnumerator::num_dense() const {
return (size_t) m_iNumDense;
}
void StreamingHypPackEnumerator::prime(){
void StreamingHypPackEnumerator::prime()
{
m_current_indexes.clear();
m_current_featureVectors.clear();
boost::unordered_set<FeatureDataItem> seen;
@ -78,13 +80,14 @@ void StreamingHypPackEnumerator::prime(){
}
// Store item for retrieval
m_current_indexes.push_back(pair<size_t,size_t>(i,j));
m_current_featureVectors.push_back(MiraFeatureVector(item));
m_current_featureVectors.push_back(MiraFeatureVector(item));
}
}
}
}
void StreamingHypPackEnumerator::reset(){
void StreamingHypPackEnumerator::reset()
{
m_featureDataIters.clear();
m_scoreDataIters.clear();
for (size_t i = 0; i < m_num_lists; ++i) {
@ -95,11 +98,13 @@ void StreamingHypPackEnumerator::reset(){
prime();
}
bool StreamingHypPackEnumerator::finished(){
bool StreamingHypPackEnumerator::finished()
{
return m_featureDataIters[0]==FeatureDataIterator::end();
}
void StreamingHypPackEnumerator::next(){
void StreamingHypPackEnumerator::next()
{
if(!m_primed) {
cerr << "Enumerating an unprimed HypPackEnumerator" << endl;
exit(1);
@ -113,7 +118,8 @@ void StreamingHypPackEnumerator::next(){
if(!finished()) prime();
}
size_t StreamingHypPackEnumerator::cur_size(){
size_t StreamingHypPackEnumerator::cur_size()
{
if(!m_primed) {
cerr << "Querying size from an unprimed HypPackEnumerator" << endl;
exit(1);
@ -121,7 +127,8 @@ size_t StreamingHypPackEnumerator::cur_size(){
return m_current_indexes.size();
}
const MiraFeatureVector& StreamingHypPackEnumerator::featuresAt(size_t index){
const MiraFeatureVector& StreamingHypPackEnumerator::featuresAt(size_t index)
{
if(!m_primed) {
cerr << "Querying features from an unprimed HypPackEnumerator" << endl;
exit(1);
@ -129,7 +136,8 @@ const MiraFeatureVector& StreamingHypPackEnumerator::featuresAt(size_t index){
return m_current_featureVectors[index];
}
const ScoreDataItem& StreamingHypPackEnumerator::scoresAt(size_t index) {
const ScoreDataItem& StreamingHypPackEnumerator::scoresAt(size_t index)
{
if(!m_primed) {
cerr << "Querying scores from an unprimed HypPackEnumerator" << endl;
exit(1);
@ -138,22 +146,23 @@ const ScoreDataItem& StreamingHypPackEnumerator::scoresAt(size_t index) {
return m_scoreDataIters[pij.first]->operator[](pij.second);
}
size_t StreamingHypPackEnumerator::cur_id() {
size_t StreamingHypPackEnumerator::cur_id()
{
return m_sentenceId;
}
/* --------- RandomAccessHypPackEnumerator ------------- */
RandomAccessHypPackEnumerator::RandomAccessHypPackEnumerator(vector<string> const& featureFiles,
vector<string> const& scoreFiles,
bool no_shuffle)
vector<string> const& scoreFiles,
bool no_shuffle)
{
StreamingHypPackEnumerator train(featureFiles,scoreFiles);
size_t index=0;
for(train.reset(); !train.finished(); train.next()) {
m_features.push_back(vector<MiraFeatureVector>());
m_scores.push_back(vector<ScoreDataItem>());
for(size_t j=0;j<train.cur_size();j++) {
for(size_t j=0; j<train.cur_size(); j++) {
m_features.back().push_back(train.featuresAt(j));
m_scores.back().push_back(train.scoresAt(j));
}
@ -165,33 +174,41 @@ RandomAccessHypPackEnumerator::RandomAccessHypPackEnumerator(vector<string> cons
m_num_dense = train.num_dense();
}
size_t RandomAccessHypPackEnumerator::num_dense() const {
size_t RandomAccessHypPackEnumerator::num_dense() const
{
return m_num_dense;
}
void RandomAccessHypPackEnumerator::reset() {
void RandomAccessHypPackEnumerator::reset()
{
m_cur_index = 0;
if(!m_no_shuffle) random_shuffle(m_indexes.begin(),m_indexes.end());
}
bool RandomAccessHypPackEnumerator::finished() {
bool RandomAccessHypPackEnumerator::finished()
{
return m_cur_index >= m_indexes.size();
}
void RandomAccessHypPackEnumerator::next() {
void RandomAccessHypPackEnumerator::next()
{
m_cur_index++;
}
size_t RandomAccessHypPackEnumerator::cur_size() {
size_t RandomAccessHypPackEnumerator::cur_size()
{
assert(m_features[m_indexes[m_cur_index]].size()==m_scores[m_indexes[m_cur_index]].size());
return m_features[m_indexes[m_cur_index]].size();
}
const MiraFeatureVector& RandomAccessHypPackEnumerator::featuresAt(size_t i) {
const MiraFeatureVector& RandomAccessHypPackEnumerator::featuresAt(size_t i)
{
return m_features[m_indexes[m_cur_index]][i];
}
const ScoreDataItem& RandomAccessHypPackEnumerator::scoresAt(size_t i) {
const ScoreDataItem& RandomAccessHypPackEnumerator::scoresAt(size_t i)
{
return m_scores[m_indexes[m_cur_index]][i];
}
size_t RandomAccessHypPackEnumerator::cur_id() {
size_t RandomAccessHypPackEnumerator::cur_id()
{
return m_indexes[m_cur_index];
}
// --Emacs trickery--

View File

@ -24,7 +24,8 @@ namespace MosesTuning
// Start with these abstract classes
class HypPackEnumerator {
class HypPackEnumerator
{
public:
virtual ~HypPackEnumerator() {}
@ -41,7 +42,8 @@ public:
// Instantiation that streams from disk
// Low-memory, low-speed, sequential access
class StreamingHypPackEnumerator : public HypPackEnumerator {
class StreamingHypPackEnumerator : public HypPackEnumerator
{
public:
StreamingHypPackEnumerator(std::vector<std::string> const& featureFiles,
std::vector<std::string> const& scoreFiles);
@ -75,7 +77,8 @@ private:
// Instantiation that reads into memory
// High-memory, high-speed, random access
// (Actually randomizes with each call to reset)
class RandomAccessHypPackEnumerator : public HypPackEnumerator {
class RandomAccessHypPackEnumerator : public HypPackEnumerator
{
public:
RandomAccessHypPackEnumerator(std::vector<std::string> const& featureFiles,
std::vector<std::string> const& scoreFiles,

View File

@ -11,7 +11,7 @@ namespace MosesTuning
// TODO: This is too long. Consider creating a function for
// initialization such as Init().
InterpolatedScorer::InterpolatedScorer(const string& name, const string& config)
: Scorer(name,config)
: Scorer(name,config)
{
// name would be: HAMMING,BLEU or similar
string scorers = name;
@ -66,7 +66,8 @@ InterpolatedScorer::InterpolatedScorer(const string& name, const string& config)
cerr <<endl;
}
bool InterpolatedScorer::useAlignment() const {
bool InterpolatedScorer::useAlignment() const
{
//cout << "InterpolatedScorer::useAlignment" << endl;
for (vector<Scorer*>::const_iterator itsc = m_scorers.begin(); itsc < m_scorers.end(); itsc++) {
if ((*itsc)->useAlignment()) {
@ -176,8 +177,7 @@ void InterpolatedScorer::prepareStats(size_t sid, const string& text, ScoreStats
ScoreStats tempEntry;
if ((*itsc)->useAlignment()) {
(*itsc)->prepareStats(sid, text, tempEntry);
}
else {
} else {
(*itsc)->prepareStats(sid, sentence, tempEntry);
}
if (i > 0) buff << " ";
@ -206,17 +206,17 @@ void InterpolatedScorer::setFactors(const string& factors)
void InterpolatedScorer::setFilter(const string& filterCommand)
{
if (filterCommand.empty()) return;
if (filterCommand.empty()) return;
vector<string> csplit;
split(filterCommand, ',', csplit);
vector<string> csplit;
split(filterCommand, ',', csplit);
if (csplit.size() != m_scorers.size())
throw runtime_error("Number of command specifications does not equal number of interpolated scorers.");
if (csplit.size() != m_scorers.size())
throw runtime_error("Number of command specifications does not equal number of interpolated scorers.");
for (size_t i = 0; i < m_scorers.size(); ++i) {
m_scorers[i]->setFilter(csplit[i]);
}
for (size_t i = 0; i < m_scorers.size(); ++i) {
m_scorers[i]->setFilter(csplit[i]);
}
}
}

View File

@ -17,8 +17,7 @@ MiraFeatureVector::MiraFeatureVector(const FeatureDataItem& vec)
size_t lastFeat = 0;
m_sparseFeats.reserve(sparseFeats.size());
m_sparseVals.reserve(sparseFeats.size());
for(size_t i=0;i<sparseFeats.size();i++)
{
for(size_t i=0; i<sparseFeats.size(); i++) {
size_t feat = m_dense.size() + sparseFeats[i];
m_sparseFeats.push_back(feat);
m_sparseVals.push_back(vec.sparse.get(sparseFeats[i]));
@ -26,8 +25,7 @@ MiraFeatureVector::MiraFeatureVector(const FeatureDataItem& vec)
// Check ordered property
if(bFirst) {
bFirst = false;
}
else {
} else {
if(lastFeat>=feat) {
cerr << "Error: Feature indeces must be strictly ascending coming out of SparseVector" << endl;
exit(1);
@ -61,29 +59,33 @@ MiraFeatureVector::MiraFeatureVector(const vector<ValType>& dense,
}
}
ValType MiraFeatureVector::val(size_t index) const {
ValType MiraFeatureVector::val(size_t index) const
{
if(index < m_dense.size())
return m_dense[index];
else
return m_sparseVals[index-m_dense.size()];
}
size_t MiraFeatureVector::feat(size_t index) const {
size_t MiraFeatureVector::feat(size_t index) const
{
if(index < m_dense.size())
return index;
else
return m_sparseFeats[index-m_dense.size()];
}
size_t MiraFeatureVector::size() const {
size_t MiraFeatureVector::size() const
{
return m_dense.size() + m_sparseVals.size();
}
ValType MiraFeatureVector::sqrNorm() const {
ValType MiraFeatureVector::sqrNorm() const
{
ValType toRet = 0.0;
for(size_t i=0;i<m_dense.size();i++)
for(size_t i=0; i<m_dense.size(); i++)
toRet += m_dense[i]*m_dense[i];
for(size_t i=0;i<m_sparseVals.size();i++)
for(size_t i=0; i<m_sparseVals.size(); i++)
toRet += m_sparseVals[i] * m_sparseVals[i];
return toRet;
}
@ -96,7 +98,7 @@ MiraFeatureVector operator-(const MiraFeatureVector& a, const MiraFeatureVector&
cerr << "Mismatching dense vectors passed to MiraFeatureVector subtraction" << endl;
exit(1);
}
for(size_t i=0;i<a.m_dense.size();i++) {
for(size_t i=0; i<a.m_dense.size(); i++) {
dense.push_back(a.m_dense[i] - b.m_dense[i]);
}
@ -148,7 +150,7 @@ MiraFeatureVector operator-(const MiraFeatureVector& a, const MiraFeatureVector&
ostream& operator<<(ostream& o, const MiraFeatureVector& e)
{
for(size_t i=0;i<e.size();i++) {
for(size_t i=0; i<e.size(); i++) {
if(i>0) o << " ";
o << e.feat(i) << ":" << e.val(i);
}

View File

@ -23,7 +23,8 @@ namespace MosesTuning
typedef FeatureStatsType ValType;
class MiraFeatureVector {
class MiraFeatureVector
{
public:
MiraFeatureVector(const FeatureDataItem& vec);
MiraFeatureVector(const MiraFeatureVector& other);

View File

@ -36,9 +36,10 @@ MiraWeightVector::MiraWeightVector(const vector<ValType>& init)
* \param fv Feature vector to be added to the weights
* \param tau FV will be scaled by this value before update
*/
void MiraWeightVector::update(const MiraFeatureVector& fv, float tau) {
void MiraWeightVector::update(const MiraFeatureVector& fv, float tau)
{
m_numUpdates++;
for(size_t i=0;i<fv.size();i++) {
for(size_t i=0; i<fv.size(); i++) {
update(fv.feat(i), fv.val(i)*tau);
}
}
@ -46,7 +47,8 @@ void MiraWeightVector::update(const MiraFeatureVector& fv, float tau) {
/**
* Perform an empty update (affects averaging)
*/
void MiraWeightVector::tick() {
void MiraWeightVector::tick()
{
m_numUpdates++;
}
@ -54,7 +56,8 @@ void MiraWeightVector::tick() {
* Score a feature vector according to the model
* \param fv Feature vector to be scored
*/
ValType MiraWeightVector::score(const MiraFeatureVector& fv) const {
ValType MiraWeightVector::score(const MiraFeatureVector& fv) const
{
ValType toRet = 0.0;
for(size_t i=0; i<fv.size(); i++) {
toRet += weight(fv.feat(i)) * fv.val(i);
@ -65,7 +68,8 @@ ValType MiraWeightVector::score(const MiraFeatureVector& fv) const {
/**
* Return an averaged view of this weight vector
*/
AvgWeightVector MiraWeightVector::avg() {
AvgWeightVector MiraWeightVector::avg()
{
this->fixTotals();
return AvgWeightVector(*this);
}
@ -73,7 +77,8 @@ AvgWeightVector MiraWeightVector::avg() {
/**
* Updates a weight and lazily updates its total
*/
void MiraWeightVector::update(size_t index, ValType delta) {
void MiraWeightVector::update(size_t index, ValType delta)
{
// Handle previously unseen weights
while(index>=m_weights.size()) {
@ -91,25 +96,27 @@ void MiraWeightVector::update(size_t index, ValType delta) {
/**
* Make sure everyone's total is up-to-date
*/
void MiraWeightVector::fixTotals() {
void MiraWeightVector::fixTotals()
{
for(size_t i=0; i<m_weights.size(); i++) update(i,0);
}
/**
* Helper to handle out of range weights
*/
ValType MiraWeightVector::weight(size_t index) const {
ValType MiraWeightVector::weight(size_t index) const
{
if(index < m_weights.size()) {
return m_weights[index];
}
else {
} else {
return 0;
}
}
ValType MiraWeightVector::sqrNorm() const {
ValType MiraWeightVector::sqrNorm() const
{
ValType toRet = 0;
for(size_t i=0;i<m_weights.size();i++) {
for(size_t i=0; i<m_weights.size(); i++) {
toRet += weight(i) * weight(i);
}
return toRet;
@ -121,7 +128,7 @@ AvgWeightVector::AvgWeightVector(const MiraWeightVector& wv)
ostream& operator<<(ostream& o, const MiraWeightVector& e)
{
for(size_t i=0;i<e.m_weights.size();i++) {
for(size_t i=0; i<e.m_weights.size(); i++) {
if(abs(e.m_weights[i])>1e-8) {
if(i>0) o << " ";
cerr << i << ":" << e.m_weights[i];
@ -136,14 +143,14 @@ ValType AvgWeightVector::weight(size_t index) const
else {
if(index < m_wv.m_totals.size()) {
return m_wv.m_totals[index] / m_wv.m_numUpdates;
}
else {
} else {
return 0;
}
}
}
ValType AvgWeightVector::score(const MiraFeatureVector& fv) const {
ValType AvgWeightVector::score(const MiraFeatureVector& fv) const
{
ValType toRet = 0.0;
for(size_t i=0; i<fv.size(); i++) {
toRet += weight(fv.feat(i)) * fv.val(i);
@ -151,7 +158,8 @@ ValType AvgWeightVector::score(const MiraFeatureVector& fv) const {
return toRet;
}
size_t AvgWeightVector::size() const {
size_t AvgWeightVector::size() const
{
return m_wv.m_weights.size();
}

View File

@ -21,7 +21,8 @@ namespace MosesTuning
class AvgWeightVector;
class MiraWeightVector {
class MiraWeightVector
{
public:
/**
* Constructor, initializes to the zero vector
@ -91,7 +92,8 @@ private:
/**
* Averaged view of a weight vector
*/
class AvgWeightVector {
class AvgWeightVector
{
public:
AvgWeightVector(const MiraWeightVector& wv);
ValType score(const MiraFeatureVector& fv) const;

View File

@ -13,8 +13,9 @@ namespace MosesTuning
* typical accessors and mutators, but we intentionally do not allow
* erasing elements.
*/
class NgramCounts {
public:
class NgramCounts
{
public:
// Used to construct the ngram map
struct NgramComparator {
bool operator()(const std::vector<int>& a, const std::vector<int>& b) const {
@ -45,7 +46,9 @@ class NgramCounts {
/**
* If the specified "ngram" is found, we add counts.
* If not, we insert the default count in the container. */
inline void Add(const Key& ngram) { m_counts[ngram]++; }
inline void Add(const Key& ngram) {
m_counts[ngram]++;
}
/**
* Return true iff the specified "ngram" is found in the container.
@ -60,34 +63,58 @@ class NgramCounts {
/**
* Clear all elements in the container.
*/
void clear() { m_counts.clear(); }
void clear() {
m_counts.clear();
}
/**
* Return true iff the container is empty.
*/
bool empty() const { return m_counts.empty(); }
bool empty() const {
return m_counts.empty();
}
/**
* Return the number of elements in the container.
*/
std::size_t size() const { return m_counts.size(); }
std::size_t size() const {
return m_counts.size();
}
std::size_t max_size() const { return m_counts.max_size(); }
std::size_t max_size() const {
return m_counts.max_size();
}
// Note: This is mainly used by unit tests.
int get_default_count() const { return kDefaultCount; }
int get_default_count() const {
return kDefaultCount;
}
iterator find(const Key& ngram) { return m_counts.find(ngram); }
const_iterator find(const Key& ngram) const { return m_counts.find(ngram); }
iterator find(const Key& ngram) {
return m_counts.find(ngram);
}
const_iterator find(const Key& ngram) const {
return m_counts.find(ngram);
}
Value& operator[](const Key& ngram) { return m_counts[ngram]; }
Value& operator[](const Key& ngram) {
return m_counts[ngram];
}
iterator begin() { return m_counts.begin(); }
const_iterator begin() const { return m_counts.begin(); }
iterator end() { return m_counts.end(); }
const_iterator end() const { return m_counts.end(); }
iterator begin() {
return m_counts.begin();
}
const_iterator begin() const {
return m_counts.begin();
}
iterator end() {
return m_counts.end();
}
const_iterator end() const {
return m_counts.end();
}
private:
private:
const int kDefaultCount;
boost::unordered_map<Key, Value> m_counts;
};

View File

@ -5,7 +5,8 @@
using namespace MosesTuning;
BOOST_AUTO_TEST_CASE(ngram_basic) {
BOOST_AUTO_TEST_CASE(ngram_basic)
{
NgramCounts counts;
NgramCounts::Key key;
key.push_back(1);
@ -25,7 +26,8 @@ BOOST_AUTO_TEST_CASE(ngram_basic) {
BOOST_CHECK_EQUAL(it->second, 1);
}
BOOST_AUTO_TEST_CASE(ngram_Add) {
BOOST_AUTO_TEST_CASE(ngram_Add)
{
NgramCounts counts;
NgramCounts::Key key;
key.push_back(1);
@ -49,7 +51,8 @@ BOOST_AUTO_TEST_CASE(ngram_Add) {
BOOST_CHECK_EQUAL(counts[key3], counts.get_default_count());
}
BOOST_AUTO_TEST_CASE(ngram_lookup) {
BOOST_AUTO_TEST_CASE(ngram_lookup)
{
NgramCounts counts;
NgramCounts::Key key;
key.push_back(1);

View File

@ -17,7 +17,8 @@ using namespace std;
static const float MIN_FLOAT = -1.0 * numeric_limits<float>::max();
static const float MAX_FLOAT = numeric_limits<float>::max();
namespace {
namespace
{
/**
* Compute the intersection of 2 lines.
@ -198,7 +199,7 @@ statscore_t Optimizer::LineOptimize(const Point& origin, const Point& direction,
thresholdmap.erase(previnserted); // erase old previnsert
previnserted = thresholdmap.find(leftmostx); // point previnsert to the new threshold
previnserted->second.back()=newd; // We update the diff for sentence S
// Threshold already exists but is not the previous one.
// Threshold already exists but is not the previous one.
} else {
// We append the diffs in previnsert to tit before destroying previnsert.
tit->second.insert(tit->second.end(),previnserted->second.begin(),previnserted->second.end());
@ -405,8 +406,7 @@ statscore_t SimpleOptimizer::TrueRun(Point& P) const
for (unsigned int i = 0; i < Point::getdim(); i++)
direction[i]=0.0;
direction[d]=1.0;
}
else { // random direction update
} else { // random direction update
direction.Randomize();
}
statscore_t curscore = LineOptimize(P, direction, linebest);//find the minimum on the line
@ -443,8 +443,7 @@ statscore_t RandomDirectionOptimizer::TrueRun(Point& P) const
// do specified number of random direction optimizations
unsigned int nrun = 0;
unsigned int nrun_no_change = 0;
for (; nrun_no_change < m_num_random_directions; nrun++, nrun_no_change++)
{
for (; nrun_no_change < m_num_random_directions; nrun++, nrun_no_change++) {
// choose a random direction in which to optimize
Point direction;
direction.Randomize();

View File

@ -31,8 +31,12 @@ protected:
public:
Optimizer(unsigned Pd, const std::vector<unsigned>& i2O, const std::vector<bool>& positive, const std::vector<parameter_t>& start, unsigned int nrandom);
void SetScorer(Scorer *scorer) { m_scorer = scorer; }
void SetFeatureData(FeatureDataHandle feature_data) { m_feature_data = feature_data; }
void SetScorer(Scorer *scorer) {
m_scorer = scorer;
}
void SetFeatureData(FeatureDataHandle feature_data) {
m_feature_data = feature_data;
}
virtual ~Optimizer();
unsigned size() const {
@ -97,7 +101,7 @@ private:
public:
RandomDirectionOptimizer(unsigned dim, const std::vector<unsigned>& i2O, const std::vector<bool>& positive,
const std::vector<parameter_t>& start, unsigned int nrandom)
: Optimizer(dim, i2O, positive, start, nrandom), kEPS(0.0001f) {}
: Optimizer(dim, i2O, positive, start, nrandom), kEPS(0.0001f) {}
virtual statscore_t TrueRun(Point&) const;
};
@ -109,7 +113,7 @@ class RandomOptimizer : public Optimizer
public:
RandomOptimizer(unsigned dim, const std::vector<unsigned>& i2O, const std::vector<bool>& positive,
const std::vector<parameter_t>& start, unsigned int nrandom)
: Optimizer(dim, i2O, positive, start, nrandom) {}
: Optimizer(dim, i2O, positive, start, nrandom) {}
virtual statscore_t TrueRun(Point&) const;
};

View File

@ -38,11 +38,11 @@ OptimizerFactory::OptimizerType OptimizerFactory::GetOptimizerType(const string&
}
Optimizer* OptimizerFactory::BuildOptimizer(unsigned dim,
const vector<unsigned>& i2o,
const std::vector<bool>& positive,
const vector<parameter_t>& start,
const string& type,
unsigned int nrandom)
const vector<unsigned>& i2o,
const std::vector<bool>& positive,
const vector<parameter_t>& start,
const string& type,
unsigned int nrandom)
{
OptimizerType opt_type = GetOptimizerType(type);
if (opt_type == NOPTIMIZER) {
@ -55,18 +55,18 @@ Optimizer* OptimizerFactory::BuildOptimizer(unsigned dim,
}
switch (opt_type) {
case POWELL:
return new SimpleOptimizer(dim, i2o, positive, start, nrandom);
break;
case RANDOM_DIRECTION:
return new RandomDirectionOptimizer(dim, i2o, positive, start, nrandom);
break;
case RANDOM:
return new RandomOptimizer(dim, i2o, positive, start, nrandom);
break;
default:
cerr << "Error: unknown optimizer" << type << endl;
return NULL;
case POWELL:
return new SimpleOptimizer(dim, i2o, positive, start, nrandom);
break;
case RANDOM_DIRECTION:
return new RandomDirectionOptimizer(dim, i2o, positive, start, nrandom);
break;
case RANDOM:
return new RandomOptimizer(dim, i2o, positive, start, nrandom);
break;
default:
cerr << "Error: unknown optimizer" << type << endl;
return NULL;
}
}

View File

@ -12,7 +12,7 @@ class Optimizer;
class OptimizerFactory
{
public:
public:
// NOTE: Add new optimizer here BEFORE NOPTIMIZER
enum OptimizerType {
POWELL = 0,
@ -36,7 +36,7 @@ class OptimizerFactory
const std::string& type,
unsigned int nrandom);
private:
private:
OptimizerFactory() {}
~OptimizerFactory() {}

View File

@ -7,21 +7,24 @@
using namespace MosesTuning;
namespace {
namespace
{
inline bool CheckBuildOptimizer(unsigned dim,
const std::vector<unsigned>& to_optimize,
const std::vector<bool>& positive,
const std::vector<parameter_t>& start,
const std::string& type,
unsigned int num_random) {
unsigned int num_random)
{
boost::scoped_ptr<Optimizer> optimizer(OptimizerFactory::BuildOptimizer(dim, to_optimize, positive, start, type, num_random));
return optimizer.get() != NULL;
}
} // namespace
BOOST_AUTO_TEST_CASE(optimizer_type) {
BOOST_AUTO_TEST_CASE(optimizer_type)
{
BOOST_CHECK_EQUAL(OptimizerFactory::GetOptimizerType("powell"),
OptimizerFactory::POWELL);
BOOST_CHECK_EQUAL(OptimizerFactory::GetOptimizerType("random"),
@ -30,7 +33,8 @@ BOOST_AUTO_TEST_CASE(optimizer_type) {
OptimizerFactory::RANDOM_DIRECTION);
}
BOOST_AUTO_TEST_CASE(optimizer_build) {
BOOST_AUTO_TEST_CASE(optimizer_build)
{
const unsigned dim = 3;
std::vector<unsigned> to_optimize;
to_optimize.push_back(1);

View File

@ -27,7 +27,9 @@ public:
virtual void setReferenceFiles(const std::vector<std::string>& referenceFiles);
virtual void prepareStats(std::size_t sid, const std::string& text, ScoreStats& entry);
virtual std::size_t NumberOfScores() const { return 3; }
virtual std::size_t NumberOfScores() const {
return 3;
}
virtual float calculateScore(const std::vector<int>& comps) const;
private:

View File

@ -29,7 +29,7 @@ Point::Point() : vector<parameter_t>(m_dim), m_score(0.0) {}
Point::Point(const vector<parameter_t>& init,
const vector<parameter_t>& min,
const vector<parameter_t>& max)
: vector<parameter_t>(Point::m_dim), m_score(0.0)
: vector<parameter_t>(Point::m_dim), m_score(0.0)
{
m_min.resize(Point::m_dim);
m_max.resize(Point::m_dim);

View File

@ -53,11 +53,19 @@ private:
statscore_t m_score;
public:
static unsigned int getdim() { return m_dim; }
static void setdim(std::size_t d) { m_dim = d; }
static unsigned int getdim() {
return m_dim;
}
static void setdim(std::size_t d) {
m_dim = d;
}
static unsigned int getpdim() { return m_pdim; }
static void setpdim(std::size_t pd) { m_pdim = pd; }
static unsigned int getpdim() {
return m_pdim;
}
static void setpdim(std::size_t pd) {
m_pdim = pd;
}
static void set_optindices(const std::vector<unsigned int>& indices) {
m_opt_indices = indices;
@ -90,7 +98,9 @@ public:
*/
friend std::ostream& operator<<(std::ostream& o,const Point& P);
void Normalize() { NormalizeL2(); }
void Normalize() {
NormalizeL2();
}
void NormalizeL2();
void NormalizeL1();
@ -100,8 +110,12 @@ public:
*/
void GetAllWeights(std::vector<parameter_t>& w) const;
statscore_t GetScore() const { return m_score; }
void SetScore(statscore_t score) { m_score = score; }
statscore_t GetScore() const {
return m_score;
}
void SetScore(statscore_t score) {
m_score = score;
}
};
}

View File

@ -9,7 +9,8 @@
using namespace std;
using namespace MosesTuning;
BOOST_AUTO_TEST_CASE(point_operators) {
BOOST_AUTO_TEST_CASE(point_operators)
{
const unsigned int dim = 5;
vector<float> init(dim);
init[0] = 1.0f;

View File

@ -28,116 +28,108 @@ void exec_failed (int sig)
}
PreProcessFilter::PreProcessFilter(const string& filterCommand)
: m_toFilter(NULL),
m_fromFilter(NULL)
: m_toFilter(NULL),
m_fromFilter(NULL)
{
// Child error signal install
// sigaction is the replacement for the traditional signal() method
struct sigaction action;
action.sa_handler = exec_failed;
sigemptyset(&action.sa_mask);
action.sa_flags = 0;
if (sigaction(SIGUSR1, &action, NULL) < 0)
{
perror("SIGUSR1 install error");
exit(EXIT_FAILURE);
}
// Child error signal install
// sigaction is the replacement for the traditional signal() method
struct sigaction action;
action.sa_handler = exec_failed;
sigemptyset(&action.sa_mask);
action.sa_flags = 0;
if (sigaction(SIGUSR1, &action, NULL) < 0) {
perror("SIGUSR1 install error");
exit(EXIT_FAILURE);
}
int pipe_status;
int pipefds_input[2];
int pipefds_output[2];
// int pipefds_error[2];
int pipe_status;
int pipefds_input[2];
int pipefds_output[2];
// int pipefds_error[2];
// Create the pipes
// We do this before the fork so both processes will know about
// the same pipe and they can communicate.
// Create the pipes
// We do this before the fork so both processes will know about
// the same pipe and they can communicate.
pipe_status = pipe(pipefds_input);
if (pipe_status == -1)
{
perror("Error creating the pipe");
exit(EXIT_FAILURE);
}
pipe_status = pipe(pipefds_input);
if (pipe_status == -1) {
perror("Error creating the pipe");
exit(EXIT_FAILURE);
}
pipe_status = pipe(pipefds_output);
if (pipe_status == -1)
{
perror("Error creating the pipe");
exit(EXIT_FAILURE);
}
pipe_status = pipe(pipefds_output);
if (pipe_status == -1) {
perror("Error creating the pipe");
exit(EXIT_FAILURE);
}
/*
pipe_status = pipe(pipefds_error);
if (pipe_status == -1)
{
perror("Error creating the pipe");
exit(EXIT_FAILURE);
}
*/
/*
pipe_status = pipe(pipefds_error);
if (pipe_status == -1)
{
perror("Error creating the pipe");
exit(EXIT_FAILURE);
}
*/
pid_t pid;
// Create child process; both processes continue from here
pid = fork();
pid_t pid;
// Create child process; both processes continue from here
pid = fork();
if (pid == pid_t(0))
{
// Child process
if (pid == pid_t(0)) {
// Child process
// When the child process finishes, it sends a SIGCHLD signal
// to the parent
// When the child process finishes, it sends a SIGCHLD signal
// to the parent
// Tie the standard input, output and error streams to the
// appropriate pipe ends
// The file descriptor 0 is the standard input
// We tie it to the read end of the pipe as we will use
// this end of the pipe to read from it
dup2 (CHILD_STDIN_READ,0);
dup2 (CHILD_STDOUT_WRITE,1);
// dup2 (CHILD_STDERR_WRITE,2);
// Close in the child the unused ends of the pipes
close(CHILD_STDIN_WRITE);
close(CHILD_STDOUT_READ);
//close(CHILD_STDERR_READ);
// Tie the standard input, output and error streams to the
// appropriate pipe ends
// The file descriptor 0 is the standard input
// We tie it to the read end of the pipe as we will use
// this end of the pipe to read from it
dup2 (CHILD_STDIN_READ,0);
dup2 (CHILD_STDOUT_WRITE,1);
// dup2 (CHILD_STDERR_WRITE,2);
// Close in the child the unused ends of the pipes
close(CHILD_STDIN_WRITE);
close(CHILD_STDOUT_READ);
//close(CHILD_STDERR_READ);
// Execute the program
execl("/bin/bash", "bash", "-c", filterCommand.c_str() , (char*)NULL);
// Execute the program
execl("/bin/bash", "bash", "-c", filterCommand.c_str() , (char*)NULL);
// We should never reach this point
// Tell the parent the exec failed
kill(getppid(), SIGUSR1);
exit(EXIT_FAILURE);
}
else if (pid > pid_t(0))
{
// Parent
// We should never reach this point
// Tell the parent the exec failed
kill(getppid(), SIGUSR1);
exit(EXIT_FAILURE);
} else if (pid > pid_t(0)) {
// Parent
// Close in the parent the unused ends of the pipes
close(CHILD_STDIN_READ);
close(CHILD_STDOUT_WRITE);
// close(CHILD_STDERR_WRITE);
// Close in the parent the unused ends of the pipes
close(CHILD_STDIN_READ);
close(CHILD_STDOUT_WRITE);
// close(CHILD_STDERR_WRITE);
m_toFilter = new ofdstream(CHILD_STDIN_WRITE);
m_fromFilter = new ifdstream(CHILD_STDOUT_READ);
}
else
{
perror("Error: fork failed");
exit(EXIT_FAILURE);
}
m_toFilter = new ofdstream(CHILD_STDIN_WRITE);
m_fromFilter = new ifdstream(CHILD_STDOUT_READ);
} else {
perror("Error: fork failed");
exit(EXIT_FAILURE);
}
}
string PreProcessFilter::ProcessSentence(const string& sentence)
{
*m_toFilter << sentence << "\n";
string processedSentence;
m_fromFilter->getline(processedSentence);
return processedSentence;
*m_toFilter << sentence << "\n";
string processedSentence;
m_fromFilter->getline(processedSentence);
return processedSentence;
}
PreProcessFilter::~PreProcessFilter()
{
delete m_toFilter;
delete m_fromFilter;
delete m_toFilter;
delete m_fromFilter;
}
}

View File

@ -22,8 +22,8 @@ public:
~PreProcessFilter();
private:
ofdstream* m_toFilter;
ifdstream* m_fromFilter;
ofdstream* m_toFilter;
ifdstream* m_fromFilter;
};
}

View File

@ -15,32 +15,51 @@ namespace MosesTuning
* Reference class represents reference translations for an output
* translation used in calculating BLEU score.
*/
class Reference {
public:
class Reference
{
public:
// for m_length
typedef std::vector<std::size_t>::iterator iterator;
typedef std::vector<std::size_t>::const_iterator const_iterator;
Reference() : m_counts(new NgramCounts) { }
~Reference() { delete m_counts; }
~Reference() {
delete m_counts;
}
NgramCounts* get_counts() { return m_counts; }
const NgramCounts* get_counts() const { return m_counts; }
NgramCounts* get_counts() {
return m_counts;
}
const NgramCounts* get_counts() const {
return m_counts;
}
iterator begin() { return m_length.begin(); }
const_iterator begin() const { return m_length.begin(); }
iterator end() { return m_length.end(); }
const_iterator end() const { return m_length.end(); }
iterator begin() {
return m_length.begin();
}
const_iterator begin() const {
return m_length.begin();
}
iterator end() {
return m_length.end();
}
const_iterator end() const {
return m_length.end();
}
void push_back(std::size_t len) { m_length.push_back(len); }
void push_back(std::size_t len) {
m_length.push_back(len);
}
std::size_t num_references() const { return m_length.size(); }
std::size_t num_references() const {
return m_length.size();
}
int CalcAverage() const;
int CalcClosest(std::size_t length) const;
int CalcShortest() const;
private:
private:
NgramCounts* m_counts;
// multiple reference lengths
@ -49,16 +68,18 @@ class Reference {
// TODO(tetsuok): fix this function and related stuff.
// "average" reference length should not be calculated at sentence-level unlike "closest".
inline int Reference::CalcAverage() const {
inline int Reference::CalcAverage() const
{
int total = 0;
for (std::size_t i = 0; i < m_length.size(); ++i) {
total += m_length[i];
}
return static_cast<int>(
static_cast<float>(total) / m_length.size());
static_cast<float>(total) / m_length.size());
}
inline int Reference::CalcClosest(std::size_t length) const {
inline int Reference::CalcClosest(std::size_t length) const
{
int min_diff = INT_MAX;
int closest_ref_id = 0; // an index of the closest reference translation
for (std::size_t i = 0; i < m_length.size(); ++i) {
@ -79,7 +100,8 @@ inline int Reference::CalcClosest(std::size_t length) const {
return static_cast<int>(m_length[closest_ref_id]);
}
inline int Reference::CalcShortest() const {
inline int Reference::CalcShortest() const
{
return *std::min_element(m_length.begin(), m_length.end());
}

View File

@ -5,12 +5,14 @@
using namespace MosesTuning;
BOOST_AUTO_TEST_CASE(refernece_count) {
BOOST_AUTO_TEST_CASE(refernece_count)
{
Reference ref;
BOOST_CHECK(ref.get_counts() != NULL);
}
BOOST_AUTO_TEST_CASE(refernece_length_iterator) {
BOOST_AUTO_TEST_CASE(refernece_length_iterator)
{
Reference ref;
ref.push_back(4);
ref.push_back(2);
@ -24,7 +26,8 @@ BOOST_AUTO_TEST_CASE(refernece_length_iterator) {
BOOST_CHECK(it == ref.end());
}
BOOST_AUTO_TEST_CASE(refernece_length_average) {
BOOST_AUTO_TEST_CASE(refernece_length_average)
{
{
Reference ref;
ref.push_back(4);
@ -49,7 +52,8 @@ BOOST_AUTO_TEST_CASE(refernece_length_average) {
}
}
BOOST_AUTO_TEST_CASE(refernece_length_closest) {
BOOST_AUTO_TEST_CASE(refernece_length_closest)
{
{
Reference ref;
ref.push_back(4);
@ -92,7 +96,8 @@ BOOST_AUTO_TEST_CASE(refernece_length_closest) {
}
}
BOOST_AUTO_TEST_CASE(refernece_length_shortest) {
BOOST_AUTO_TEST_CASE(refernece_length_shortest)
{
{
Reference ref;
ref.push_back(4);

View File

@ -7,17 +7,24 @@ namespace MosesTuning
{
template <class T>
class ScopedVector {
public:
class ScopedVector
{
public:
typedef typename std::vector<T*>::iterator iterator;
typedef typename std::vector<T*>::const_iterator const_iterator;
ScopedVector() {}
virtual ~ScopedVector() { reset(); }
virtual ~ScopedVector() {
reset();
}
bool empty() const { return m_vec.empty(); }
bool empty() const {
return m_vec.empty();
}
void push_back(T *e) { m_vec.push_back(e); }
void push_back(T *e) {
m_vec.push_back(e);
}
void reset() {
for (iterator it = m_vec.begin(); it != m_vec.end(); ++it) {
@ -26,27 +33,53 @@ class ScopedVector {
m_vec.clear();
}
void reserve(std::size_t capacity) { m_vec.reserve(capacity); }
void resize(std::size_t size) { m_vec.resize(size); }
void reserve(std::size_t capacity) {
m_vec.reserve(capacity);
}
void resize(std::size_t size) {
m_vec.resize(size);
}
std::size_t size() const {return m_vec.size(); }
std::size_t size() const {
return m_vec.size();
}
iterator begin() { return m_vec.begin(); }
const_iterator begin() const { return m_vec.begin(); }
iterator begin() {
return m_vec.begin();
}
const_iterator begin() const {
return m_vec.begin();
}
iterator end() { return m_vec.end(); }
const_iterator end() const { return m_vec.end(); }
iterator end() {
return m_vec.end();
}
const_iterator end() const {
return m_vec.end();
}
std::vector<T*>& get() { return m_vec; }
const std::vector<T*>& get() const { return m_vec; }
std::vector<T*>& get() {
return m_vec;
}
const std::vector<T*>& get() const {
return m_vec;
}
std::vector<T*>* operator->() { return &m_vec; }
const std::vector<T*>* operator->() const { return &m_vec; }
std::vector<T*>* operator->() {
return &m_vec;
}
const std::vector<T*>* operator->() const {
return &m_vec;
}
T*& operator[](std::size_t i) { return m_vec[i]; }
const T* operator[](std::size_t i) const { return m_vec[i]; }
T*& operator[](std::size_t i) {
return m_vec[i];
}
const T* operator[](std::size_t i) const {
return m_vec[i];
}
private:
private:
std::vector<T*> m_vec;
// no copying allowed.

View File

@ -17,12 +17,12 @@ namespace MosesTuning
// Zero-initialises the per-entry score count and the array index.
ScoreArray::ScoreArray()
  : m_num_scores(0),
    m_index(0)
{
}
void ScoreArray::savetxt(ostream* os, const string& sctype)
{
*os << SCORES_TXT_BEGIN << " " << m_index << " " << m_array.size()
<< " " << m_num_scores << " " << sctype << endl;
<< " " << m_num_scores << " " << sctype << endl;
for (scorearray_t::iterator i = m_array.begin(); i !=m_array.end(); i++) {
i->savetxt(os);
*os << endl;
@ -33,7 +33,7 @@ void ScoreArray::savetxt(ostream* os, const string& sctype)
void ScoreArray::savebin(ostream* os, const string& score_type)
{
*os << SCORES_BIN_BEGIN << " " << m_index << " " << m_array.size()
<< " " << m_num_scores << " " << score_type << endl;
<< " " << m_num_scores << " " << score_type << endl;
for (scorearray_t::iterator i = m_array.begin();
i != m_array.end(); i++) {
i->savebin(os);
@ -63,7 +63,8 @@ void ScoreArray::save(const string &file, const string& score_type, bool bin)
ofs.close();
}
void ScoreArray::save(const string& score_type, bool bin) {
void ScoreArray::save(const string& score_type, bool bin)
{
save(&cout, score_type, bin);
}

View File

@ -25,7 +25,7 @@ const char SCORES_BIN_END[] = "SCORES_BIN_END_0";
class ScoreArray
{
private:
private:
scorearray_t m_array;
std::string m_score_type;
std::size_t m_num_scores;
@ -38,17 +38,29 @@ public:
ScoreArray();
~ScoreArray() {}
void clear() { m_array.clear(); }
void clear() {
m_array.clear();
}
int getIndex() const { return m_index; }
int getIndex() const {
return m_index;
}
void setIndex(int value) { m_index = value; }
void setIndex(int value) {
m_index = value;
}
ScoreStats& get(std::size_t i) { return m_array.at(i); }
ScoreStats& get(std::size_t i) {
return m_array.at(i);
}
const ScoreStats& get(std::size_t i) const { return m_array.at(i); }
const ScoreStats& get(std::size_t i) const {
return m_array.at(i);
}
void add(const ScoreStats& e) { m_array.push_back(e); }
void add(const ScoreStats& e) {
m_array.push_back(e);
}
//ADDED BY TS
void swap(std::size_t i, std::size_t j) {
@ -62,15 +74,25 @@ public:
void merge(ScoreArray& e);
std::string name() const { return m_score_type; }
std::string name() const {
return m_score_type;
}
// Sets the score type name. The argument is only read, so it is taken by
// const reference; this also accepts temporaries and const strings while
// remaining compatible with callers that pass a non-const lvalue.
void name(const std::string &score_type) {
  m_score_type = score_type;
}
std::size_t size() const { return m_array.size(); }
std::size_t size() const {
return m_array.size();
}
std::size_t NumberOfScores() const { return m_num_scores; }
std::size_t NumberOfScores() const {
return m_num_scores;
}
void NumberOfScores(std::size_t v) { m_num_scores = v; }
void NumberOfScores(std::size_t v) {
m_num_scores = v;
}
void savetxt(std::ostream* os, const std::string& score_type);
void savebin(std::ostream* os, const std::string& score_type);

View File

@ -50,7 +50,8 @@ void ScoreData::save(const string &file, bool bin)
ofs.close();
}
void ScoreData::save(bool bin) {
void ScoreData::save(bool bin)
{
save(&cout, bin);
}

View File

@ -40,7 +40,9 @@ public:
ScoreData(Scorer* scorer);
~ScoreData() {}
void clear() { m_array.clear(); }
void clear() {
m_array.clear();
}
inline ScoreArray& get(std::size_t idx) {
return m_array.at(idx);
@ -66,7 +68,9 @@ public:
return m_array.at(i).get(j);
}
std::string name() const { return m_score_type; }
std::string name() const {
return m_score_type;
}
std::string name(const std::string &score_type) {
return m_score_type = score_type;
@ -75,8 +79,12 @@ public:
void add(ScoreArray& e);
void add(const ScoreStats& e, int sent_idx);
std::size_t NumberOfScores() const { return m_num_scores; }
std::size_t size() const { return m_array.size(); }
std::size_t NumberOfScores() const {
return m_num_scores;
}
std::size_t size() const {
return m_array.size();
}
void save(const std::string &file, bool bin=false);
void save(std::ostream* os, bool bin=false);

View File

@ -33,14 +33,16 @@ namespace MosesTuning
ScoreDataIterator::ScoreDataIterator() {}
// Opens the named file through a FilePiece and immediately loads the first
// record so the iterator can be dereferenced right away.
ScoreDataIterator::ScoreDataIterator(const string& filename)
  : m_in(new FilePiece(filename.c_str()))
{
  readNext();
}
ScoreDataIterator::~ScoreDataIterator() {}
void ScoreDataIterator::readNext() {
void ScoreDataIterator::readNext()
{
m_next.clear();
try {
StringPiece marker = m_in->ReadDelimited();
@ -71,12 +73,14 @@ void ScoreDataIterator::readNext() {
}
}
void ScoreDataIterator::increment() {
void ScoreDataIterator::increment()
{
readNext();
}
bool ScoreDataIterator::equal(const ScoreDataIterator& rhs) const {
bool ScoreDataIterator::equal(const ScoreDataIterator& rhs) const
{
if (!m_in && !rhs.m_in) {
return true;
} else if (!m_in) {
@ -85,12 +89,13 @@ bool ScoreDataIterator::equal(const ScoreDataIterator& rhs) const {
return false;
} else {
return m_in->FileName() == rhs.m_in->FileName() &&
m_in->Offset() == rhs.m_in->Offset();
m_in->Offset() == rhs.m_in->Offset();
}
}
const vector<ScoreDataItem>& ScoreDataIterator::dereference() const {
const vector<ScoreDataItem>& ScoreDataIterator::dereference() const
{
return m_next;
}

View File

@ -33,7 +33,10 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "FeatureDataIterator.h"
namespace util { class FilePiece; }
namespace util
{
class FilePiece;
}
namespace MosesTuning
{
@ -43,30 +46,30 @@ typedef std::vector<float> ScoreDataItem;
/**
 * Forward iterator (built on boost::iterator_facade) whose value is a
 * vector of ScoreDataItem read from a file.
 *
 * A default-constructed iterator has no input attached and serves as the
 * end marker returned by end().
 */
class ScoreDataIterator :
  public boost::iterator_facade<ScoreDataIterator,
  const std::vector<ScoreDataItem>,
  boost::forward_traversal_tag>
{
public:
  // Creates an iterator with no input attached.
  ScoreDataIterator();

  // Opens the given file and reads the first record.
  explicit ScoreDataIterator(const std::string& filename);

  ~ScoreDataIterator();

  // Past-the-end marker: simply a default-constructed iterator.
  static ScoreDataIterator end() {
    return ScoreDataIterator();
  }

private:
  // iterator_facade reaches the three hooks below through this friend.
  friend class boost::iterator_core_access;

  void increment();
  bool equal(const ScoreDataIterator& rhs) const;
  const std::vector<ScoreDataItem>& dereference() const;

  // Replaces m_next with the next record from m_in.
  void readNext();

  boost::shared_ptr<util::FilePiece> m_in;  // input source
  std::vector<ScoreDataItem> m_next;        // record handed out by dereference()
};
}

View File

@ -13,7 +13,8 @@
using namespace std;
namespace {
namespace
{
const int kAvailableSize = 8;
} // namespace
@ -22,12 +23,12 @@ namespace MosesTuning
// Creates an empty statistics buffer with room for kAvailableSize entries.
// The array length is taken from the constant directly, so it does not depend
// on the order in which the members are initialised.
ScoreStats::ScoreStats()
  : m_available_size(kAvailableSize),
    m_entries(0),
    m_array(new ScoreStatsType[kAvailableSize])
{
}
// Creates a buffer holding exactly `size` entries, all bytes cleared to zero.
ScoreStats::ScoreStats(const size_t size)
  : m_available_size(size),
    m_entries(size),
    m_array(new ScoreStatsType[size])
{
  // Same byte count GetArraySizeWithBytes() yields once m_entries == size.
  memset(m_array, 0, size * sizeof(ScoreStatsType));
}
@ -123,7 +124,8 @@ void ScoreStats::savetxt(ostream* os)
*os << *this;
}
void ScoreStats::savetxt() {
void ScoreStats::savetxt()
{
savetxt(&cout);
}
@ -140,7 +142,8 @@ ostream& operator<<(ostream& o, const ScoreStats& e)
return o;
}
bool operator==(const ScoreStats& s1, const ScoreStats& s2) {
bool operator==(const ScoreStats& s1, const ScoreStats& s2)
{
size_t size = s1.size();
if (size != s2.size())

View File

@ -41,7 +41,9 @@ public:
void Copy(const ScoreStats &stats);
// True once the number of stored entries has reached the allocated capacity.
bool isfull() const {
  return !(m_entries < m_available_size);
}
void expand();
void add(ScoreStatsType v);
@ -55,9 +57,15 @@ public:
clear();
}
ScoreStatsType get(std::size_t i) { return m_array[i]; }
ScoreStatsType get(std::size_t i) const { return m_array[i]; }
scorestats_t getArray() const { return m_array; }
ScoreStatsType get(std::size_t i) {
return m_array[i];
}
ScoreStatsType get(std::size_t i) const {
return m_array[i];
}
scorestats_t getArray() const {
return m_array;
}
void set(const std::string& str);
@ -69,15 +77,21 @@ public:
}
}
std::size_t bytes() const { return GetArraySizeWithBytes(); }
std::size_t bytes() const {
return GetArraySizeWithBytes();
}
std::size_t GetArraySizeWithBytes() const {
return m_entries * sizeof(ScoreStatsType);
}
std::size_t size() const { return m_entries; }
std::size_t size() const {
return m_entries;
}
std::size_t available() const { return m_available_size; }
std::size_t available() const {
return m_available_size;
}
void savetxt(const std::string &file);
void savetxt(std::ostream* os);

View File

@ -12,27 +12,31 @@ using namespace std;
namespace MosesTuning
{
namespace {
namespace
{
// For tokenizing a hypothesis translation, we may encounter unknown tokens which
// do not exist in the corresponding reference translations.
const int kUnknownToken = -1;
} // namespace
// Builds a scorer called `name`: obtains the vocabulary from
// VocabularyFactory, starts with no filter and no score data, enables case
// preservation, then parses `config` via InitConfig().
Scorer::Scorer(const string& name, const string& config)
  : m_name(name)
  , m_vocab(mert::VocabularyFactory::GetVocabulary())
  , m_filter(NULL)
  , m_score_data(NULL)
  , m_enable_preserve_case(true)
{
  InitConfig(config);
}
// Tears down the vocabulary singleton and frees the optional filter.
// NOTE(review): Singleton<mert::Vocabulary>::Delete() is process-wide;
// presumably only one Scorer is alive when this runs -- confirm.
Scorer::~Scorer()
{
  Singleton<mert::Vocabulary>::Delete();
  delete m_filter;  // may be NULL when no filter was ever set
}
void Scorer::InitConfig(const string& config) {
void Scorer::InitConfig(const string& config)
{
// cerr << "Scorer config string: " << config << endl;
size_t start = 0;
while (start < config.size()) {
@ -53,7 +57,8 @@ void Scorer::InitConfig(const string& config) {
}
}
void Scorer::TokenizeAndEncode(const string& line, vector<int>& encoded) {
void Scorer::TokenizeAndEncode(const string& line, vector<int>& encoded)
{
for (util::TokenIter<util::AnyCharacter, true> it(line, util::AnyCharacter(" "));
it; ++it) {
if (!m_enable_preserve_case) {
@ -69,7 +74,8 @@ void Scorer::TokenizeAndEncode(const string& line, vector<int>& encoded) {
}
}
void Scorer::TokenizeAndEncodeTesting(const string& line, vector<int>& encoded) {
void Scorer::TokenizeAndEncodeTesting(const string& line, vector<int>& encoded)
{
for (util::TokenIter<util::AnyCharacter, true> it(line, util::AnyCharacter(" "));
it; ++it) {
if (!m_enable_preserve_case) {
@ -103,8 +109,7 @@ void Scorer::setFactors(const string& factors)
if (factors.empty()) return;
vector<string> factors_vec;
split(factors, '|', factors_vec);
for(vector<string>::iterator it = factors_vec.begin(); it != factors_vec.end(); ++it)
{
for(vector<string>::iterator it = factors_vec.begin(); it != factors_vec.end(); ++it) {
int factor = atoi(it->c_str());
m_factors.push_back(factor);
}
@ -115,8 +120,8 @@ void Scorer::setFactors(const string& factors)
*/
void Scorer::setFilter(const string& filterCommand)
{
if (filterCommand.empty()) return;
m_filter = new PreProcessFilter(filterCommand);
if (filterCommand.empty()) return;
m_filter = new PreProcessFilter(filterCommand);
}
/**
@ -130,8 +135,7 @@ string Scorer::applyFactors(const string& sentence) const
split(sentence, ' ', tokens);
stringstream sstream;
for (size_t i = 0; i < tokens.size(); ++i)
{
for (size_t i = 0; i < tokens.size(); ++i) {
if (tokens[i] == "") continue;
vector<string> factors;
@ -141,8 +145,7 @@ string Scorer::applyFactors(const string& sentence) const
if (i > 0) sstream << " ";
for (size_t j = 0; j < m_factors.size(); ++j)
{
for (size_t j = 0; j < m_factors.size(); ++j) {
int findex = m_factors[j];
if (findex < 0 || findex >= fsize) throw runtime_error("Factor index is out of range.");
@ -158,17 +161,15 @@ string Scorer::applyFactors(const string& sentence) const
*/
string Scorer::applyFilter(const string& sentence) const
{
  // Without a configured filter the sentence passes through unchanged.
  if (!m_filter) {
    return sentence;
  }
  return m_filter->ProcessSentence(sentence);
}
float Scorer::score(const candidates_t& candidates) const {
float Scorer::score(const candidates_t& candidates) const
{
diffs_t diffs;
statscores_t scores;
score(candidates, diffs, scores);

View File

@ -10,7 +10,8 @@
#include "Types.h"
#include "ScoreData.h"
namespace mert {
namespace mert
{
class Vocabulary;
@ -32,7 +33,7 @@ enum ScorerRegularisationStrategy {REG_NONE, REG_AVERAGE, REG_MINIMUM};
*/
class Scorer
{
public:
public:
Scorer(const std::string& name, const std::string& config);
virtual ~Scorer();
@ -117,14 +118,16 @@ class Scorer
*/
virtual void setFactors(const std::string& factors);
mert::Vocabulary* GetVocab() const { return m_vocab; }
mert::Vocabulary* GetVocab() const {
return m_vocab;
}
/**
* Set unix filter, which will be used to preprocess the sentences
*/
virtual void setFilter(const std::string& filterCommand);
private:
private:
void InitConfig(const std::string& config);
/**
@ -143,7 +146,7 @@ class Scorer
std::vector<int> m_factors;
PreProcessFilter* m_filter;
protected:
protected:
ScoreData* m_score_data;
bool m_enable_preserve_case;
@ -173,40 +176,40 @@ class Scorer
/**
* Every inherited scorer should call this function for each sentence
*/
std::string preprocessSentence(const std::string& sentence) const
{
std::string preprocessSentence(const std::string& sentence) const {
return applyFactors(applyFilter(sentence));
}
};
namespace
{

//regularisation strategies

// Smallest value of scores[start, end). For an empty range the initial
// sentinel, std::numeric_limits<float>::max(), is returned unchanged.
inline float score_min(const statscores_t& scores, size_t start, size_t end)
{
  float smallest = std::numeric_limits<float>::max();
  for (size_t idx = start; idx < end; ++idx) {
    if (scores[idx] < smallest) {
      smallest = scores[idx];
    }
  }
  return smallest;
}

// Mean of scores[start, end); yields 0 when the range is empty.
inline float score_average(const statscores_t& scores, size_t start, size_t end)
{
  const size_t count = end - start;
  if (count == 0) {
    // this shouldn't happen
    return 0;
  }
  float sum = 0;
  for (size_t idx = start; idx < end; ++idx) {
    sum += scores[idx];
  }
  return sum / count;
}

} // namespace

View File

@ -16,7 +16,8 @@ namespace MosesTuning
{
vector<string> ScorerFactory::getTypes() {
vector<string> ScorerFactory::getTypes()
{
vector<string> types;
types.push_back(string("BLEU"));
types.push_back(string("PER"));
@ -29,7 +30,8 @@ vector<string> ScorerFactory::getTypes() {
return types;
}
Scorer* ScorerFactory::getScorer(const string& type, const string& config) {
Scorer* ScorerFactory::getScorer(const string& type, const string& config)
{
if (type == "BLEU") {
return new BleuScorer(config);
} else if (type == "PER") {
@ -48,8 +50,7 @@ Scorer* ScorerFactory::getScorer(const string& type, const string& config) {
} else {
if (type.find(',') != string::npos) {
return new InterpolatedScorer(type, config);
}
else {
} else {
throw runtime_error("Unknown scorer type: " + type);
}
}

View File

@ -6,7 +6,8 @@
using namespace std;
namespace {
namespace
{
MosesTuning::SemposOverlapping* g_overlapping = NULL;
@ -16,7 +17,8 @@ namespace MosesTuning
{
SemposOverlapping* SemposOverlappingFactory::GetOverlapping(const string& str, const SemposScorer* sempos) {
SemposOverlapping* SemposOverlappingFactory::GetOverlapping(const string& str, const SemposScorer* sempos)
{
if (str == "cap-micro") {
return new CapMicroOverlapping(sempos);
} else if (str == "cap-macro") {
@ -26,7 +28,8 @@ SemposOverlapping* SemposOverlappingFactory::GetOverlapping(const string& str, c
}
}
// Records `ovr` in the file-local g_overlapping pointer (the header describes
// this as dependency injection for unit testing). No ownership handling
// happens here: the pointer is stored as given.
void SemposOverlappingFactory::SetOverlapping(SemposOverlapping* ovr)
{
  g_overlapping = ovr;
}
@ -41,14 +44,12 @@ vector<int> CapMicroOverlapping::prepareStats(const sentence_t& cand, const sent
int multCoeff = 1000;
float interSum = 0;
for (sentence_t::iterator it = intersection.begin(); it != intersection.end(); it++)
{
for (sentence_t::iterator it = intersection.begin(); it != intersection.end(); it++) {
interSum += semposScorer->weight(it->first);
}
float refSum = 0;
for (sentence_t::iterator it = ref.begin(); it != ref.end(); it++)
{
for (sentence_t::iterator it = ref.begin(); it != ref.end(); it++) {
refSum += semposScorer->weight(it->first);
}

View File

@ -36,14 +36,15 @@ public:
virtual std::size_t NumberOfScores() const = 0;
};
class SemposOverlappingFactory {
public:
class SemposOverlappingFactory
{
public:
static SemposOverlapping* GetOverlapping(const std::string& str, const SemposScorer* sempos);
// dependency injection for unit testing.
static void SetOverlapping(SemposOverlapping* ovr);
private:
private:
SemposOverlappingFactory() {}
~SemposOverlappingFactory() {}
};
@ -62,9 +63,11 @@ public:
virtual std::vector<int> prepareStats(const sentence_t& cand, const sentence_t& ref);
virtual float calculateScore(const std::vector<int>& stats) const;
virtual std::size_t NumberOfScores() const { return 2; }
virtual std::size_t NumberOfScores() const {
return 2;
}
private:
private:
// no copying allowed.
CapMicroOverlapping(const CapMicroOverlapping&);
CapMicroOverlapping& operator=(const CapMicroOverlapping&);
@ -82,9 +85,11 @@ public:
virtual std::vector<int> prepareStats(const sentence_t& cand, const sentence_t& ref);
virtual float calculateScore(const std::vector<int>& stats) const;
virtual std::size_t NumberOfScores() const { return kMaxNOC * 2; }
virtual std::size_t NumberOfScores() const {
return kMaxNOC * 2;
}
private:
private:
// no copying allowed.
CapMacroOverlapping(const CapMacroOverlapping&);
CapMacroOverlapping& operator=(const CapMacroOverlapping&);

View File

@ -25,8 +25,7 @@ SemposScorer::SemposScorer(const string& config)
m_semposMap.clear();
string weightsfile = getConfig("weightsfile", "");
if (weightsfile != "")
{
if (weightsfile != "") {
loadWeights(weightsfile);
}
}
@ -144,42 +143,35 @@ int SemposScorer::encodeSempos(const string& sempos)
// Looks up the weight stored for `item`; items without an entry in
// weightsMap get the neutral weight 1.0.
float SemposScorer::weight(int item) const
{
  const std::map<int,float>::const_iterator found = weightsMap.find(item);
  return (found != weightsMap.end()) ? found->second : 1.0f;
}
// Fills weightsMap from a text file with one "<string>\t<weight>" pair per
// line. Empty lines are skipped. A line that does not split into exactly two
// tab-separated fields raises std::runtime_error; a file that cannot be opened
// prints a message to stderr and terminates the process with exit(1).
void SemposScorer::loadWeights(const string& weightsfile)
{
  ifstream myfile(weightsfile.c_str(), ifstream::in);
  if (!myfile.is_open()) {
    cerr << "Unable to open file "<< weightsfile << endl;
    exit(1);
  }

  string line;
  while (getline(myfile, line)) {
    if (line.empty()) continue;

    vector<string> fields;
    split(line, '\t', fields);
    if (fields.size() != 2) throw std::runtime_error("Bad format of a row in weights file.");

    // First field is encoded to its integer id, second is the weight.
    const int encoded = encodeString(fields[0]);
    const float weight = atof(fields[1].c_str());
    weightsMap[encoded] = weight;
  }
  myfile.close();
}

View File

@ -32,12 +32,16 @@ public:
virtual void setReferenceFiles(const std::vector<std::string>& referenceFiles);
virtual void prepareStats(std::size_t sindex, const std::string& text, ScoreStats& entry);
virtual std::size_t NumberOfScores() const { return m_ovr->NumberOfScores(); }
virtual std::size_t NumberOfScores() const {
return m_ovr->NumberOfScores();
}
virtual float calculateScore(const std::vector<int>& comps) const {
return m_ovr->calculateScore(comps);
}
bool EnableDebug() const { return m_enable_debug; }
bool EnableDebug() const {
return m_enable_debug;
}
float weight(int item) const;

View File

@ -17,48 +17,50 @@ namespace MosesTuning
{
// Starts with regularisation switched off (REG_NONE, window 0); Init() then
// overrides these defaults from the parsed configuration.
SentenceLevelScorer::SentenceLevelScorer(const string& name, const string& config)
  : Scorer(name, config)
  , m_regularisationStrategy(REG_NONE)
  , m_regularisationWindow(0)
{
  Init();
}
SentenceLevelScorer::~SentenceLevelScorer() {}
void SentenceLevelScorer::Init() {
// Configure regularisation.
static string KEY_TYPE = "regtype";
static string KEY_WINDOW = "regwin";
static string KEY_CASE = "case";
static string TYPE_NONE = "none";
static string TYPE_AVERAGE = "average";
static string TYPE_MINIMUM = "min";
static string TRUE = "true";
static string FALSE = "false";
void SentenceLevelScorer::Init()
{
// Configure regularisation.
static string KEY_TYPE = "regtype";
static string KEY_WINDOW = "regwin";
static string KEY_CASE = "case";
static string TYPE_NONE = "none";
static string TYPE_AVERAGE = "average";
static string TYPE_MINIMUM = "min";
static string TRUE = "true";
static string FALSE = "false";
const string type = getConfig(KEY_TYPE, TYPE_NONE);
if (type == TYPE_NONE) {
m_regularisationStrategy = REG_NONE;
} else if (type == TYPE_AVERAGE) {
m_regularisationStrategy = REG_AVERAGE;
} else if (type == TYPE_MINIMUM) {
m_regularisationStrategy = REG_MINIMUM;
} else {
throw boost::lexer::runtime_error("Unknown scorer regularisation strategy: " + type);
}
cerr << "Using scorer regularisation strategy: " << type << endl;
const string type = getConfig(KEY_TYPE, TYPE_NONE);
if (type == TYPE_NONE) {
m_regularisationStrategy = REG_NONE;
} else if (type == TYPE_AVERAGE) {
m_regularisationStrategy = REG_AVERAGE;
} else if (type == TYPE_MINIMUM) {
m_regularisationStrategy = REG_MINIMUM;
} else {
throw boost::lexer::runtime_error("Unknown scorer regularisation strategy: " + type);
}
cerr << "Using scorer regularisation strategy: " << type << endl;
const string window = getConfig(KEY_WINDOW, "0");
m_regularisationWindow = atoi(window.c_str());
cerr << "Using scorer regularisation window: " << m_regularisationWindow << endl;
const string window = getConfig(KEY_WINDOW, "0");
m_regularisationWindow = atoi(window.c_str());
cerr << "Using scorer regularisation window: " << m_regularisationWindow << endl;
const string preservecase = getConfig(KEY_CASE, TRUE);
if (preservecase == TRUE) {
m_enable_preserve_case = true;
} else if (preservecase == FALSE) {
m_enable_preserve_case = false;
}
cerr << "Using case preservation: " << m_enable_preserve_case << endl;
const string preservecase = getConfig(KEY_CASE, TRUE);
if (preservecase == TRUE) {
m_enable_preserve_case = true;
} else if (preservecase == FALSE) {
m_enable_preserve_case = false;
}
cerr << "Using case preservation: " << m_enable_preserve_case << endl;
}
void SentenceLevelScorer::score(const candidates_t& candidates, const diffs_t& diffs,
@ -83,8 +85,8 @@ void SentenceLevelScorer::score(const candidates_t& candidates, const diffs_t&
if (stats.size() != totals.size()) {
stringstream msg;
msg << "Statistics for (" << "," << candidates[i] << ") have incorrect "
<< "number of fields. Found: " << stats.size() << " Expected: "
<< totals.size();
<< "number of fields. Found: " << stats.size() << " Expected: "
<< totals.size();
throw runtime_error(msg.str());
}
//Add up scores for all sentences, would normally be just one score

View File

@ -10,8 +10,9 @@ namespace MosesTuning
// thread *un*safe singleton.
// TODO: replace this with thread-safe singleton.
template <typename T>
class Singleton {
public:
class Singleton
{
public:
static T* GetInstance() {
if (m_instance == NULL) {
m_instance = new T;
@ -26,7 +27,7 @@ class Singleton {
}
}
private:
private:
Singleton();
static T* m_instance;
};

View File

@ -5,19 +5,24 @@
using namespace MosesTuning;
namespace {
namespace
{
static int g_count = 0;
class Instance {
public:
Instance() { ++g_count; }
class Instance
{
public:
Instance() {
++g_count;
}
~Instance() {}
};
} // namespace
BOOST_AUTO_TEST_CASE(singleton_basic) {
BOOST_AUTO_TEST_CASE(singleton_basic)
{
Instance* instance1 = Singleton<Instance>::GetInstance();
Instance* instance2 = Singleton<Instance>::GetInstance();
Instance* instance3 = Singleton<Instance>::GetInstance();

View File

@ -16,7 +16,8 @@ namespace MosesTuning
StatisticsBasedScorer::StatisticsBasedScorer(const string& name, const string& config)
: Scorer(name,config) {
: Scorer(name,config)
{
//configure regularisation
static string KEY_TYPE = "regtype";
static string KEY_WINDOW = "regwin";
@ -72,8 +73,8 @@ void StatisticsBasedScorer::score(const candidates_t& candidates, const diffs_t
if (stats.size() != totals.size()) {
stringstream msg;
msg << "Statistics for (" << "," << candidates[i] << ") have incorrect "
<< "number of fields. Found: " << stats.size() << " Expected: "
<< totals.size();
<< "number of fields. Found: " << stats.size() << " Expected: "
<< totals.size();
throw runtime_error(msg.str());
}
for (size_t k = 0; k < totals.size(); ++k) {
@ -91,7 +92,7 @@ void StatisticsBasedScorer::score(const candidates_t& candidates, const diffs_t
size_t last_nid = last_candidates[sid];
for (size_t k = 0; k < totals.size(); ++k) {
int diff = m_score_data->get(sid,nid).get(k)
- m_score_data->get(sid,last_nid).get(k);
- m_score_data->get(sid,last_nid).get(k);
totals[k] += diff;
}
last_candidates[sid] = nid;

View File

@ -17,7 +17,7 @@ namespace MosesTuning
// Registers the scorer under the name "TER" and fixes kLENGTH at 2.
TerScorer::TerScorer(const string& config)
  : StatisticsBasedScorer("TER",config),
    kLENGTH(2)
{
}
TerScorer::~TerScorer() {}

View File

@ -6,14 +6,17 @@
#include <sys/time.h>
#endif
namespace {
namespace
{
#if !defined(_WIN32) && !defined(_WIN64)
// Flattens a timeval (seconds + microseconds) into one microsecond count.
uint64_t GetMicroSeconds(const struct timeval& tv)
{
  // Widen before multiplying so the product is computed in 64 bits.
  const uint64_t whole_seconds_usec = static_cast<uint64_t>(tv.tv_sec) * 1000000;
  return whole_seconds_usec + tv.tv_usec;
}
uint64_t GetTimeOfDayMicroSeconds() {
uint64_t GetTimeOfDayMicroSeconds()
{
struct timeval tv;
gettimeofday(&tv, NULL);
return static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
@ -26,7 +29,8 @@ namespace MosesTuning
{
void Timer::GetCPUTimeMicroSeconds(Timer::CPUTime* cpu_time) const {
void Timer::GetCPUTimeMicroSeconds(Timer::CPUTime* cpu_time) const
{
#if !defined(_WIN32) && !defined(_WIN64)
struct rusage usage;
if (getrusage(RUSAGE_SELF, &usage)) {
@ -41,22 +45,26 @@ void Timer::GetCPUTimeMicroSeconds(Timer::CPUTime* cpu_time) const {
#endif
}
double Timer::get_elapsed_cpu_time() const {
double Timer::get_elapsed_cpu_time() const
{
return static_cast<double>(get_elapsed_cpu_time_microseconds()) * 1e-6;
}
uint64_t Timer::get_elapsed_cpu_time_microseconds() const {
uint64_t Timer::get_elapsed_cpu_time_microseconds() const
{
CPUTime e;
GetCPUTimeMicroSeconds(&e);
return (e.user_time - m_start_time.user_time) +
(e.sys_time - m_start_time.sys_time);
(e.sys_time - m_start_time.sys_time);
}
double Timer::get_elapsed_wall_time() const {
double Timer::get_elapsed_wall_time() const
{
return static_cast<double>(get_elapsed_wall_time_microseconds()) * 1e-6;
}
uint64_t Timer::get_elapsed_wall_time_microseconds() const {
uint64_t Timer::get_elapsed_wall_time_microseconds() const
{
return GetTimeOfDayMicroSeconds() - m_wall;
}
@ -92,7 +100,8 @@ void Timer::check(const char* msg)
}
}
std::string Timer::ToString() const {
std::string Timer::ToString() const
{
std::string res;
const double wall = get_elapsed_wall_time();
CPUTime e;

View File

@ -11,7 +11,7 @@ namespace MosesTuning
class Timer
{
private:
private:
// Time values are stored in microseconds.
struct CPUTime {
uint64_t user_time; // user CPU time
@ -30,15 +30,15 @@ class Timer
Timer(const Timer&);
void operator=(const Timer&);
public:
public:
/**
* 'm_is_running' is initially false. A timer needs to be explicitly started
* using 'start'.
*
* The constructor also zeroes 'm_wall' and value-initializes 'm_start_time',
* so a freshly constructed timer holds no stale time readings.
*/
Timer()
: m_is_running(false),
m_wall(0),
m_start_time() {}
: m_is_running(false),
m_wall(0),
m_start_time() {}
~Timer() {}
@ -61,7 +61,9 @@ class Timer
/**
* Returns the current value of 'm_is_running' (false for a freshly
* constructed timer).
*/
bool is_running() const { return m_is_running; }
bool is_running() const {
return m_is_running;
}
/**
* Return the total time in seconds that the timer has been in the
@ -97,7 +99,8 @@ class Timer
* for an ostream 'os' and a timer 't'. For example, "cout << t" will
* print out the total amount of time 't' has been "running".
*/
inline std::ostream& operator<<(std::ostream& os, const Timer& t) {
inline std::ostream& operator<<(std::ostream& os, const Timer& t)
{
if (t.is_running()) {
os << t.ToString();
} else {

View File

@ -8,7 +8,8 @@
using namespace MosesTuning;
BOOST_AUTO_TEST_CASE(timer_basic_test) {
BOOST_AUTO_TEST_CASE(timer_basic_test)
{
Timer timer;
const int sleep_time_microsec = 40; // ad-hoc microseconds to pass unit tests.

View File

@ -11,7 +11,8 @@
using namespace std;
namespace {
namespace
{
MosesTuning::Timer g_timer;
int g_verbose = 0;
@ -56,7 +57,8 @@ size_t getNextPound(std::string &str, std::string &substr,
return pos;
}
void split(const std::string &s, char delim, std::vector<std::string> &elems) {
void split(const std::string &s, char delim, std::vector<std::string> &elems)
{
std::stringstream ss(s);
std::string item;
while(std::getline(ss, item, delim)) {
@ -65,7 +67,8 @@ void split(const std::string &s, char delim, std::vector<std::string> &elems) {
}
void Tokenize(const char *str, const char delim,
std::vector<std::string> *res) {
std::vector<std::string> *res)
{
while (1) {
const char *begin = str;
while (*str != delim && *str) str++;

View File

@ -40,7 +40,8 @@ int setverboselevel(int v);
const float kEPS = 0.0001f;
template <typename T>
bool IsAlmostEqual(T expected, T actual, float round=kEPS) {
bool IsAlmostEqual(T expected, T actual, float round=kEPS)
{
if (std::abs(expected - actual) < round) {
return true;
} else {
@ -86,7 +87,8 @@ inline T Scan(const std::string &input)
* Returns true iff "str" ends with "suffix".
* e.g., Given str = "abc:" and suffix = ":", this function returns true.
*/
inline bool EndsWith(const std::string& str, const char* suffix)
{
  // Compare the whole suffix against the tail of "str". The previous
  // find_last_of()-based check matched ANY single character of "suffix" and,
  // because npos == size() - 1 for an empty string, reported true for
  // EndsWith("", ":").
  // "suffix" must be a valid NUL-terminated string; an empty suffix matches
  // every string.
  const std::string::size_type suffix_len =
    std::char_traits<char>::length(suffix);
  if (str.size() < suffix_len) {
    return false;  // "str" is too short to contain the suffix at all.
  }
  return str.compare(str.size() - suffix_len, suffix_len, suffix) == 0;
}

View File

@ -5,7 +5,8 @@
using namespace MosesTuning;
BOOST_AUTO_TEST_CASE(util_get_next_pound_test) {
BOOST_AUTO_TEST_CASE(util_get_next_pound_test)
{
{
std::string str("9 9 7 ");
std::string substr;
@ -38,7 +39,8 @@ BOOST_AUTO_TEST_CASE(util_get_next_pound_test) {
}
}
BOOST_AUTO_TEST_CASE(util_tokenize_test) {
BOOST_AUTO_TEST_CASE(util_tokenize_test)
{
{
std::vector<std::string> res;
Tokenize("9 9 7", ' ', &res);
@ -66,7 +68,8 @@ BOOST_AUTO_TEST_CASE(util_tokenize_test) {
}
}
BOOST_AUTO_TEST_CASE(util_ends_with_test) {
BOOST_AUTO_TEST_CASE(util_ends_with_test)
{
BOOST_CHECK(EndsWith("abc:", ":"));
BOOST_CHECK(EndsWith("a b c:", ":"));
BOOST_CHECK(!EndsWith("a", ":"));

View File

@ -1,34 +1,39 @@
#include "Vocabulary.h"
#include "Singleton.h"
namespace mert {
namespace {
namespace mert
{
namespace
{
Vocabulary* g_vocab = NULL;
} // namespace
// Maps 'token' to its integer id. A token that is already in m_vocab gets its
// stored id back; an unseen token is assigned the current map size as its id,
// is inserted into m_vocab, and that new id is returned.
int Vocabulary::Encode(const std::string& token) {
iterator it = m_vocab.find(token);
int encoded_token;
if (it == m_vocab.end()) {
// Add a new entry to the vocabulary.
encoded_token = static_cast<int>(m_vocab.size());
int Vocabulary::Encode(const std::string& token)
{
iterator it = m_vocab.find(token);
int encoded_token;
if (it == m_vocab.end()) {
// Add a new entry to the vocabulary.
encoded_token = static_cast<int>(m_vocab.size());
m_vocab[token] = encoded_token;
} else {
encoded_token = it->second;
}
return encoded_token;
m_vocab[token] = encoded_token;
} else {
encoded_token = it->second;
}
return encoded_token;
}
// Read-only lookup of 'str'. On a hit the stored id is written to *v and true
// is returned; on a miss false is returned and *v is left untouched. Unlike
// Encode(), this never inserts into m_vocab.
bool Vocabulary::Lookup(const std::string&str , int* v) const {
bool Vocabulary::Lookup(const std::string&str , int* v) const
{
const_iterator it = m_vocab.find(str);
if (it == m_vocab.end()) return false;
*v = it->second;
return true;
const_iterator it = m_vocab.find(str);
if (it == m_vocab.end()) return false;
*v = it->second;
return true;
}
Vocabulary* VocabularyFactory::GetVocabulary() {
Vocabulary* VocabularyFactory::GetVocabulary()
{
if (g_vocab == NULL) {
return MosesTuning::Singleton<Vocabulary>::GetInstance();
} else {
@ -36,7 +41,8 @@ Vocabulary* VocabularyFactory::GetVocabulary() {
}
}
// Stores 'vocab' in the file-local g_vocab pointer. Only the pointer is kept;
// nothing is copied or deleted here. Passing NULL resets g_vocab, in which
// case GetVocabulary() falls back to the Singleton<Vocabulary> instance.
// NOTE(review): presumably GetVocabulary() returns this pointer when it is
// non-NULL -- that branch is not visible here; confirm.
void VocabularyFactory::SetVocabulary(Vocabulary* vocab) {
void VocabularyFactory::SetVocabulary(Vocabulary* vocab)
{
g_vocab = vocab;
}

View File

@ -4,7 +4,8 @@
#include <boost/unordered_map.hpp>
#include <string>
namespace mert {
namespace mert
{
/**
* A map to handle vocabularies to calculate
@ -12,8 +13,9 @@ namespace mert {
*
* TODO: replace this with more efficient data structure.
*/
class Vocabulary {
public:
class Vocabulary
{
public:
typedef boost::unordered_map<std::string, int>::iterator iterator;
typedef boost::unordered_map<std::string, int>::const_iterator const_iterator;
@ -28,32 +30,53 @@ class Vocabulary {
*/
bool Lookup(const std::string&str , int* v) const;
// Removes every token/id pair from the underlying map.
void clear() { m_vocab.clear(); }
void clear() {
m_vocab.clear();
}
// Returns true when the underlying map holds no entries.
bool empty() const { return m_vocab.empty(); }
bool empty() const {
return m_vocab.empty();
}
// Returns the number of entries currently stored in the underlying map.
std::size_t size() const { return m_vocab.size(); }
std::size_t size() const {
return m_vocab.size();
}
// Mutable and const lookups of 'str'; both forward to the underlying map's
// find() and therefore return end() when 'str' is not present.
iterator find(const std::string& str) { return m_vocab.find(str); }
const_iterator find(const std::string& str) const { return m_vocab.find(str); }
iterator find(const std::string& str) {
return m_vocab.find(str);
}
const_iterator find(const std::string& str) const {
return m_vocab.find(str);
}
// Forwards to the underlying map's operator[] and returns a reference to the
// id mapped to 'str'. As with any unordered_map operator[], a missing key is
// inserted with a value-initialized (0) id, so prefer find()/Lookup() for
// read-only queries.
int& operator[](const std::string& str) { return m_vocab[str]; }
int& operator[](const std::string& str) {
return m_vocab[str];
}
// Iteration support: mutable and const begin()/end() forwarding to the
// underlying map.
iterator begin() { return m_vocab.begin(); }
const_iterator begin() const { return m_vocab.begin(); }
iterator end() { return m_vocab.end(); }
const_iterator end() const { return m_vocab.end(); }
iterator begin() {
return m_vocab.begin();
}
const_iterator begin() const {
return m_vocab.begin();
}
iterator end() {
return m_vocab.end();
}
const_iterator end() const {
return m_vocab.end();
}
private:
private:
boost::unordered_map<std::string, int> m_vocab;
};
// Static-only access point for the Vocabulary instance in use.
// The constructor and destructor are private, so the class cannot be
// instantiated; callers use the two static functions only.
class VocabularyFactory {
public:
class VocabularyFactory
{
public:
// Returns the vocabulary to use.
static Vocabulary* GetVocabulary();
// Replaces the vocabulary to use with 'vocab'.
static void SetVocabulary(Vocabulary* vocab);
private:
private:
// Not constructible or destructible from outside the class.
VocabularyFactory() {}
virtual ~VocabularyFactory() {}
};

View File

@ -6,16 +6,20 @@
using namespace MosesTuning;
namespace mert {
namespace {
namespace mert
{
namespace
{
// Test helper: destroys the Vocabulary instance held by Singleton<Vocabulary>.
void TearDown() {
void TearDown()
{
Singleton<Vocabulary>::Delete();
}
} // namespace
BOOST_AUTO_TEST_CASE(vocab_basic) {
BOOST_AUTO_TEST_CASE(vocab_basic)
{
Vocabulary vocab;
BOOST_REQUIRE(vocab.empty());
vocab.clear();
@ -39,7 +43,8 @@ BOOST_AUTO_TEST_CASE(vocab_basic) {
BOOST_CHECK(!vocab.Lookup("world", &v));
}
BOOST_AUTO_TEST_CASE(vocab_factory_test) {
BOOST_AUTO_TEST_CASE(vocab_factory_test)
{
Vocabulary* vocab1 = VocabularyFactory::GetVocabulary();
Vocabulary* vocab2 = VocabularyFactory::GetVocabulary();
Vocabulary* vocab3 = VocabularyFactory::GetVocabulary();

View File

@ -14,7 +14,8 @@
using namespace std;
using namespace MosesTuning;
namespace {
namespace
{
Scorer* g_scorer = NULL;
bool g_has_more_files = false;
@ -22,13 +23,14 @@ bool g_has_more_scorers = false;
const float g_alpha = 0.05;
// Static-only helper collection for the evaluator tool. The constructor and
// destructor are private, so the class cannot be instantiated.
class EvaluatorUtil {
public:
class EvaluatorUtil
{
public:
// Scores the candidate translations read from 'candFile' with the global
// g_scorer. A non-zero 'bootstrap' selects bootstrap resampling with that many
// iterations; zero scores the whole file once.
static void evaluate(const string& candFile, int bootstrap);
// NOTE(review): presumably the arithmetic mean of 'list' -- definition not
// visible here; confirm.
static float average(const vector<float>& list);
// NOTE(review): presumably the decimal string form of 'n' -- definition not
// visible here; confirm.
static string int2string(int n);
private:
private:
EvaluatorUtil() {}
~EvaluatorUtil() {}
};
@ -43,22 +45,18 @@ void EvaluatorUtil::evaluate(const string& candFile, int bootstrap)
// Loading sentences and preparing statistics
ScoreStats scoreentry;
string line;
while (getline(cand, line))
{
while (getline(cand, line)) {
g_scorer->prepareStats(entries.size(), line, scoreentry);
entries.push_back(scoreentry);
}
int n = entries.size();
if (bootstrap)
{
if (bootstrap) {
vector<float> scores;
for (int i = 0; i < bootstrap; ++i)
{
for (int i = 0; i < bootstrap; ++i) {
// TODO: Use smart pointer for exceptional-safety.
ScoreData* scoredata = new ScoreData(g_scorer);
for (int j = 0; j < n; ++j)
{
for (int j = 0; j < n; ++j) {
int randomIndex = random() % n;
scoredata->add(entries[randomIndex], j);
}
@ -85,13 +83,10 @@ void EvaluatorUtil::evaluate(const string& candFile, int bootstrap)
cout.setf(ios::fixed, ios::floatfield);
cout.precision(4);
cout << avg << "\t[" << lb << "," << rb << "]" << endl;
}
else
{
} else {
// TODO: Use smart pointer for exceptional-safety.
ScoreData* scoredata = new ScoreData(g_scorer);
for (int sid = 0; sid < n; ++sid)
{
for (int sid = 0; sid < n; ++sid) {
scoredata->add(entries[sid], sid);
}
g_scorer->setScoreData(scoredata);
@ -184,56 +179,56 @@ struct ProgramOption {
bool has_seed;
// Defaults: empty reference and candidate strings, bootstrap == 0 (main() and
// evaluate() treat zero as "no bootstrap resampling"), seed 0 and has_seed
// false (no seed supplied on the command line).
ProgramOption()
: reference(""),
candidate(""),
bootstrap(0),
seed(0),
has_seed(false) { }
: reference(""),
candidate(""),
bootstrap(0),
seed(0),
has_seed(false) { }
};
void ParseCommandOptions(int argc, char** argv, ProgramOption* opt) {
void ParseCommandOptions(int argc, char** argv, ProgramOption* opt)
{
int c;
int option_index;
int last_scorer_index = -1;
while ((c = getopt_long(argc, argv, "s:c:R:C:b:r:f:l:h", long_options, &option_index)) != -1) {
switch(c) {
case 's':
opt->scorer_types.push_back(string(optarg));
opt->scorer_configs.push_back(string(""));
opt->scorer_factors.push_back(string(""));
opt->scorer_filter.push_back(string(""));
last_scorer_index++;
break;
case 'c':
opt->scorer_configs[last_scorer_index] = string(optarg);
break;
case 'R':
opt->reference = string(optarg);
break;
case 'C':
opt->candidate = string(optarg);
break;
case 'b':
opt->bootstrap = atoi(optarg);
break;
case 'r':
opt->seed = strtol(optarg, NULL, 10);
opt->has_seed = true;
break;
case 'f':
opt->scorer_factors[last_scorer_index] = string(optarg);
break;
case 'l':
opt->scorer_filter[last_scorer_index] = string(optarg);
break;
default:
usage();
case 's':
opt->scorer_types.push_back(string(optarg));
opt->scorer_configs.push_back(string(""));
opt->scorer_factors.push_back(string(""));
opt->scorer_filter.push_back(string(""));
last_scorer_index++;
break;
case 'c':
opt->scorer_configs[last_scorer_index] = string(optarg);
break;
case 'R':
opt->reference = string(optarg);
break;
case 'C':
opt->candidate = string(optarg);
break;
case 'b':
opt->bootstrap = atoi(optarg);
break;
case 'r':
opt->seed = strtol(optarg, NULL, 10);
opt->has_seed = true;
break;
case 'f':
opt->scorer_factors[last_scorer_index] = string(optarg);
break;
case 'l':
opt->scorer_filter[last_scorer_index] = string(optarg);
break;
default:
usage();
}
}
// Add default scorer if no scorer provided
if (opt->scorer_types.size() == 0)
{
if (opt->scorer_types.size() == 0) {
opt->scorer_types.push_back(string("BLEU"));
opt->scorer_configs.push_back(string(""));
opt->scorer_factors.push_back(string(""));
@ -241,7 +236,8 @@ void ParseCommandOptions(int argc, char** argv, ProgramOption* opt) {
}
}
void InitSeed(const ProgramOption *opt) {
void InitSeed(const ProgramOption *opt)
{
if (opt->has_seed) {
cerr << "Seeding random numbers with " << opt->seed << endl;
srandom(opt->seed);
@ -260,8 +256,7 @@ int main(int argc, char** argv)
ProgramOption option;
ParseCommandOptions(argc, argv, &option);
if (option.bootstrap)
{
if (option.bootstrap) {
InitSeed(&option);
}
@ -278,17 +273,15 @@ int main(int argc, char** argv)
if (candFiles.size() > 1) g_has_more_files = true;
if (option.scorer_types.size() > 1) g_has_more_scorers = true;
for (vector<string>::const_iterator fileIt = candFiles.begin(); fileIt != candFiles.end(); ++fileIt)
{
for (size_t i = 0; i < option.scorer_types.size(); i++)
{
g_scorer = ScorerFactory::getScorer(option.scorer_types[i], option.scorer_configs[i]);
g_scorer->setFactors(option.scorer_factors[i]);
g_scorer->setFilter(option.scorer_filter[i]);
g_scorer->setReferenceFiles(refFiles);
EvaluatorUtil::evaluate(*fileIt, option.bootstrap);
delete g_scorer;
}
for (vector<string>::const_iterator fileIt = candFiles.begin(); fileIt != candFiles.end(); ++fileIt) {
for (size_t i = 0; i < option.scorer_types.size(); i++) {
g_scorer = ScorerFactory::getScorer(option.scorer_types[i], option.scorer_configs[i]);
g_scorer->setFactors(option.scorer_factors[i]);
g_scorer->setFilter(option.scorer_filter[i]);
g_scorer->setReferenceFiles(refFiles);
EvaluatorUtil::evaluate(*fileIt, option.bootstrap);
delete g_scorer;
}
}
return EXIT_SUCCESS;
} catch (const exception& e) {

Some files were not shown because too many files have changed in this diff Show More