Create a private class to encapsulate the encoding process.

Instead of only using typedefs inside the class,
it might be better to create a private class that does the same thing.
This commit is contained in:
Tetsuo Kiso 2012-02-01 21:19:25 +09:00
parent a351a74c18
commit 17e864e446
6 changed files with 67 additions and 45 deletions

View File

@ -37,7 +37,7 @@ size_t BleuScorer::countNgrams(const string& line, counts_t& counts, unsigned in
{
vector<int> encoded_tokens;
//cerr << line << endl;
encode(line,encoded_tokens);
TokenizeAndEncode(line, encoded_tokens);
//copy(encoded_tokens.begin(), encoded_tokens.end(), ostream_iterator<int>(cerr," "));
//cerr << endl;
for (size_t k = 1; k <= n; ++k) {
@ -70,7 +70,7 @@ void BleuScorer::setReferenceFiles(const vector<string>& referenceFiles)
//make sure reference data is clear
m_ref_counts.reset();
m_ref_lengths.clear();
m_encodings.clear();
ClearEncoder();
//load reference data
for (size_t i = 0; i < referenceFiles.size(); ++i) {

View File

@ -22,7 +22,7 @@ void CderScorer::setReferenceFiles(const vector<string>& referenceFiles)
string line;
while (getline(refin,line)) {
sent_t encoded;
encode(line, encoded);
TokenizeAndEncode(line, encoded);
m_ref_sentences[rid].push_back(encoded);
}
}
@ -31,7 +31,7 @@ void CderScorer::setReferenceFiles(const vector<string>& referenceFiles)
void CderScorer::prepareStatsVector(size_t sid, const string& text, vector<int>& stats)
{
sent_t cand;
encode(text, cand);
TokenizeAndEncode(text, cand);
float max = -2;
for (size_t rid = 0; rid < m_ref_sentences.size(); ++rid) {

View File

@ -30,7 +30,7 @@ void PerScorer::setReferenceFiles(const vector<string>& referenceFiles)
int sid = 0;
while (getline(in,line)) {
vector<int> tokens;
encode(line,tokens);
TokenizeAndEncode(line, tokens);
m_ref_tokens.push_back(multiset<int>());
for (size_t i = 0; i < tokens.size(); ++i) {
m_ref_tokens.back().insert(tokens[i]);
@ -55,7 +55,7 @@ void PerScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
// Calculate correct, output_length and ref_length for
// the line and store it in entry
vector<int> testtokens;
encode(text,testtokens);
TokenizeAndEncode(text, testtokens);
multiset<int> testtokens_all(testtokens.begin(),testtokens.end());
set<int> testtokens_unique(testtokens.begin(),testtokens.end());
int correct = 0;

View File

@ -2,7 +2,10 @@
#include <limits>
Scorer::Scorer(const string& name, const string& config)
: m_name(name), m_score_data(0), m_enable_preserve_case(true) {
: m_name(name),
m_encoder(new Encoder),
m_score_data(0),
m_enable_preserve_case(true) {
// cerr << "Scorer config string: " << config << endl;
size_t start = 0;
while (start < config.size()) {
@ -23,6 +26,41 @@ Scorer::Scorer(const string& name, const string& config)
}
}
// Frees the Encoder that the constructor allocated with `new`.
Scorer::~Scorer() {
delete m_encoder;
}
// Constructs an encoder whose vocabulary (m_vocab) starts out empty.
Scorer::Encoder::Encoder() {}
// Nothing to release by hand; m_vocab is destroyed automatically.
Scorer::Encoder::~Encoder() {}
// Returns the integer id associated with `token`.
// A token that is not yet in the vocabulary is added to it and receives the
// vocabulary size at the time of insertion as its id.
int Scorer::Encoder::Encode(const string& token) {
  // Candidate id, only used when the token turns out to be new.
  const int next_id = static_cast<int>(m_vocab.size());
  // insert() leaves an existing entry untouched and returns an iterator to
  // it, so known and unknown tokens share a single code path.
  return m_vocab.insert(map<string, int>::value_type(token, next_id))
      .first->second;
}
// Splits `line` on whitespace and appends the integer id of every token to
// `encoded`; anything already stored in `encoded` is kept.
// When m_enable_preserve_case is false, each token is lower-cased before it
// is handed to the encoder.
void Scorer::TokenizeAndEncode(const string& line, vector<int>& encoded) {
  std::istringstream in(line);
  std::string token;
  while (in >> token) {
    if (!m_enable_preserve_case) {
      for (std::string::iterator it = token.begin();
           it != token.end(); ++it) {
        // tolower() is only defined for values representable as unsigned
        // char (or EOF). A plain char may be negative for non-ASCII bytes,
        // so convert through unsigned char first.
        *it = static_cast<char>(tolower(static_cast<unsigned char>(*it)));
      }
    }
    encoded.push_back(m_encoder->Encode(token));
  }
}
//regularisation strategies
static float score_min(const statscores_t& scores, size_t start, size_t end)
{

View File

@ -21,12 +21,9 @@ class ScoreStats;
*/
class Scorer
{
private:
string m_name;
public:
public:
Scorer(const string& name, const string& config);
virtual ~Scorer() {}
virtual ~Scorer();
/**
* Return the number of statistics needed for the computation of the score.
@ -102,12 +99,24 @@ public:
m_score_data = data;
}
protected:
typedef map<string,int> encodings_t;
typedef map<string,int>::iterator encodings_it;
private:
// Maps token strings to integer ids. Declared in the private section of
// Scorer.
class Encoder {
public:
Encoder();
virtual ~Encoder();
// Returns the id stored for `token`; an unknown token is added to the
// vocabulary first.
int Encode(const std::string& token);
// Removes every entry from the vocabulary.
void Clear() { m_vocab.clear(); }
private:
// Token -> id table.
std::map<std::string, int> m_vocab;
};
string m_name;
Encoder* m_encoder;
map<string, string> m_config;
protected:
ScoreData* m_score_data;
encodings_t m_encodings;
bool m_enable_preserve_case;
/**
@ -122,38 +131,13 @@ protected:
}
}
/**
* Tokenise line and encode.
* Note: We assume that all tokens are separated by single spaces.
*/
// Splits `line` on whitespace and appends the integer id of every token to
// `encoded`. Tokens are lower-cased first unless m_enable_preserve_case is
// set. A token not yet present in m_encodings is assigned the current size
// of m_encodings as its id.
void encode(const string& line, vector<int>& encoded) {
  istringstream in (line);
  string token;
  while (in >> token) {
    if (!m_enable_preserve_case) {
      for (string::iterator i = token.begin(); i != token.end(); ++i) {
        // tolower() is undefined for negative arguments other than EOF;
        // go through unsigned char so non-ASCII bytes are handled safely.
        *i = static_cast<char>(tolower(static_cast<unsigned char>(*i)));
      }
    }
    encodings_it encoding = m_encodings.find(token);
    int encoded_token;
    if (encoding == m_encodings.end()) {
      // First occurrence: allocate the next free id.
      encoded_token = static_cast<int>(m_encodings.size());
      m_encodings[token] = encoded_token;
    } else {
      encoded_token = encoding->second;
    }
    encoded.push_back(encoded_token);
  }
}
void TokenizeAndEncode(const string& line, vector<int>& encoded);
private:
map<string, string> m_config;
// Empties the encoder's vocabulary by forwarding to Encoder::Clear().
void ClearEncoder() { m_encoder->Clear(); }
};

View File

@ -34,7 +34,7 @@ void TerScorer::setReferenceFiles ( const vector<string>& referenceFiles )
int sid = 0;
while ( getline ( in, line ) ) {
vector<int> tokens;
encode ( line, tokens );
TokenizeAndEncode(line, tokens);
m_references.push_back ( tokens );
TRACE_ERR ( "." );
++sid;
@ -74,7 +74,7 @@ void TerScorer::prepareStats ( size_t sid, const string& text, ScoreStats& entry
averageLength+=(double)m_multi_references.at ( incRefsBis ).at ( sid ).size();
}
averageLength=averageLength/( double ) m_multi_references.size();
encode ( text, testtokens );
TokenizeAndEncode(text, testtokens);
terCalc * evaluation=new terCalc();
evaluation->setDebugMode ( false );
terAlignment tmp_result = evaluation->TER ( reftokens, testtokens );