mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-26 13:23:25 +03:00
beautify
This commit is contained in:
parent
f93a1db381
commit
7dc6ad4255
@ -6,391 +6,370 @@
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
|
||||
DynamicCacheBasedLanguageModel::DynamicCacheBasedLanguageModel(const std::string &line)
|
||||
: StatelessFeatureFunction("DynamicCacheBasedLanguageModel", line)
|
||||
{
|
||||
std::cerr << "Initializing DynamicCacheBasedLanguageModel feature.." << std::endl;
|
||||
|
||||
query_type = CBLM_QUERY_TYPE_ALLSUBSTRINGS;
|
||||
score_type = CBLM_SCORE_TYPE_HYPERBOLA;
|
||||
maxAge = 1000;
|
||||
DynamicCacheBasedLanguageModel::DynamicCacheBasedLanguageModel(const std::string &line)
|
||||
: StatelessFeatureFunction("DynamicCacheBasedLanguageModel", line)
|
||||
{
|
||||
std::cerr << "Initializing DynamicCacheBasedLanguageModel feature.." << std::endl;
|
||||
|
||||
ReadParameters();
|
||||
}
|
||||
query_type = CBLM_QUERY_TYPE_ALLSUBSTRINGS;
|
||||
score_type = CBLM_SCORE_TYPE_HYPERBOLA;
|
||||
maxAge = 1000;
|
||||
|
||||
DynamicCacheBasedLanguageModel::~DynamicCacheBasedLanguageModel(){};
|
||||
|
||||
void DynamicCacheBasedLanguageModel::SetPreComputedScores()
|
||||
{
|
||||
ReadParameters();
|
||||
}
|
||||
|
||||
DynamicCacheBasedLanguageModel::~DynamicCacheBasedLanguageModel() {};
|
||||
|
||||
void DynamicCacheBasedLanguageModel::SetPreComputedScores()
|
||||
{
|
||||
#ifdef WITH_THREADS
|
||||
boost::shared_lock<boost::shared_mutex> lock(m_cacheLock);
|
||||
#endif
|
||||
precomputedScores.clear();
|
||||
for (size_t i=0; i<maxAge; i++)
|
||||
{
|
||||
precomputedScores.push_back(decaying_score(i));
|
||||
}
|
||||
boost::shared_lock<boost::shared_mutex> lock(m_cacheLock);
|
||||
#endif
|
||||
precomputedScores.clear();
|
||||
for (size_t i=0; i<maxAge; i++) {
|
||||
precomputedScores.push_back(decaying_score(i));
|
||||
}
|
||||
|
||||
if ( score_type == CBLM_SCORE_TYPE_HYPERBOLA
|
||||
|| score_type == CBLM_SCORE_TYPE_POWER
|
||||
|| score_type == CBLM_SCORE_TYPE_EXPONENTIAL
|
||||
|| score_type == CBLM_SCORE_TYPE_COSINE )
|
||||
{
|
||||
precomputedScores.push_back(decaying_score(maxAge));
|
||||
}else{ // score_type = CBLM_SCORE_TYPE_XXXXXXXXX_REWARD
|
||||
precomputedScores.push_back(0.0);
|
||||
}
|
||||
}
|
||||
if ( score_type == CBLM_SCORE_TYPE_HYPERBOLA
|
||||
|| score_type == CBLM_SCORE_TYPE_POWER
|
||||
|| score_type == CBLM_SCORE_TYPE_EXPONENTIAL
|
||||
|| score_type == CBLM_SCORE_TYPE_COSINE ) {
|
||||
precomputedScores.push_back(decaying_score(maxAge));
|
||||
} else { // score_type = CBLM_SCORE_TYPE_XXXXXXXXX_REWARD
|
||||
precomputedScores.push_back(0.0);
|
||||
}
|
||||
}
|
||||
|
||||
void DynamicCacheBasedLanguageModel::SetParameter(const std::string& key, const std::string& value)
|
||||
{
|
||||
std::cerr << "DynamicCacheBasedLanguageModel::SetParameter" << std::endl;
|
||||
if (key == "cblm-query-type") {
|
||||
query_type = Scan<size_t>(value);
|
||||
}
|
||||
else if (key == "cblm-score-type") {
|
||||
score_type = Scan<size_t>(value);
|
||||
}
|
||||
else if (key == "cblm-file") {
|
||||
m_initfiles = Scan<std::string>(value);
|
||||
} else {
|
||||
StatelessFeatureFunction::SetParameter(key, value);
|
||||
}
|
||||
}
|
||||
void DynamicCacheBasedLanguageModel::SetParameter(const std::string& key, const std::string& value)
|
||||
{
|
||||
std::cerr << "DynamicCacheBasedLanguageModel::SetParameter" << std::endl;
|
||||
if (key == "cblm-query-type") {
|
||||
query_type = Scan<size_t>(value);
|
||||
} else if (key == "cblm-score-type") {
|
||||
score_type = Scan<size_t>(value);
|
||||
} else if (key == "cblm-file") {
|
||||
m_initfiles = Scan<std::string>(value);
|
||||
} else {
|
||||
StatelessFeatureFunction::SetParameter(key, value);
|
||||
}
|
||||
}
|
||||
|
||||
void DynamicCacheBasedLanguageModel::Evaluate(const Phrase &sp
|
||||
, const TargetPhrase &tp
|
||||
, ScoreComponentCollection &scoreBreakdown
|
||||
, ScoreComponentCollection &estimatedFutureScore) const
|
||||
{
|
||||
float score;
|
||||
switch(query_type){
|
||||
case CBLM_QUERY_TYPE_WHOLESTRING:
|
||||
score = Evaluate_Whole_String(tp);
|
||||
break;
|
||||
case CBLM_QUERY_TYPE_ALLSUBSTRINGS:
|
||||
score = Evaluate_All_Substrings(tp);
|
||||
break;
|
||||
default:
|
||||
CHECK(false);
|
||||
}
|
||||
void DynamicCacheBasedLanguageModel::Evaluate(const Phrase &sp
|
||||
, const TargetPhrase &tp
|
||||
, ScoreComponentCollection &scoreBreakdown
|
||||
, ScoreComponentCollection &estimatedFutureScore) const
|
||||
{
|
||||
float score;
|
||||
switch(query_type) {
|
||||
case CBLM_QUERY_TYPE_WHOLESTRING:
|
||||
score = Evaluate_Whole_String(tp);
|
||||
break;
|
||||
case CBLM_QUERY_TYPE_ALLSUBSTRINGS:
|
||||
score = Evaluate_All_Substrings(tp);
|
||||
break;
|
||||
default:
|
||||
CHECK(false);
|
||||
}
|
||||
|
||||
VERBOSE(2,"cblm::Evaluate: score:|" << score << "|" << std::endl);
|
||||
scoreBreakdown.Assign(this, score);
|
||||
}
|
||||
VERBOSE(2,"cblm::Evaluate: score:|" << score << "|" << std::endl);
|
||||
scoreBreakdown.Assign(this, score);
|
||||
}
|
||||
|
||||
float DynamicCacheBasedLanguageModel::Evaluate_Whole_String(const TargetPhrase& tp) const
|
||||
{
|
||||
//consider all words in the TargetPhrase as one n-gram
|
||||
// and compute the decaying_score for all words
|
||||
// and return their sum
|
||||
float DynamicCacheBasedLanguageModel::Evaluate_Whole_String(const TargetPhrase& tp) const
|
||||
{
|
||||
//consider all words in the TargetPhrase as one n-gram
|
||||
// and compute the decaying_score for all words
|
||||
// and return their sum
|
||||
|
||||
decaying_cache_t::const_iterator it;
|
||||
float score = 0.0;
|
||||
decaying_cache_t::const_iterator it;
|
||||
float score = 0.0;
|
||||
|
||||
std::string w = "";
|
||||
size_t endpos = tp.GetSize();
|
||||
for (size_t pos = 0 ; pos < endpos ; ++pos) {
|
||||
w += tp.GetWord(pos).GetFactor(0)->GetString().as_string();
|
||||
if ((pos == 0) && (endpos > 1)){
|
||||
w += " ";
|
||||
}
|
||||
}
|
||||
it = m_cache.find(w);
|
||||
std::string w = "";
|
||||
size_t endpos = tp.GetSize();
|
||||
for (size_t pos = 0 ; pos < endpos ; ++pos) {
|
||||
w += tp.GetWord(pos).GetFactor(0)->GetString().as_string();
|
||||
if ((pos == 0) && (endpos > 1)) {
|
||||
w += " ";
|
||||
}
|
||||
}
|
||||
it = m_cache.find(w);
|
||||
// VERBOSE(1,"cblm::Evaluate: cheching cache for w:|" << w << "|" << std::endl);
|
||||
|
||||
if (it != m_cache.end()) //found!
|
||||
{
|
||||
score += ((*it).second).second;
|
||||
VERBOSE(3,"cblm::Evaluate_Whole_String: found w:|" << w << "| actual score:|" << ((*it).second).second << "| score:|" <<
|
||||
score << "|" << std::endl);
|
||||
}
|
||||
if (it != m_cache.end()) { //found!
|
||||
score += ((*it).second).second;
|
||||
VERBOSE(3,"cblm::Evaluate_Whole_String: found w:|" << w << "| actual score:|" << ((*it).second).second << "| score:|" <<
|
||||
score << "|" << std::endl);
|
||||
}
|
||||
|
||||
VERBOSE(3,"cblm::Evaluate_Whole_String: returning score:|" << score << "|" << std::endl);
|
||||
return score;
|
||||
}
|
||||
VERBOSE(3,"cblm::Evaluate_Whole_String: returning score:|" << score << "|" << std::endl);
|
||||
return score;
|
||||
}
|
||||
|
||||
float DynamicCacheBasedLanguageModel::Evaluate_All_Substrings(const TargetPhrase& tp) const
|
||||
{
|
||||
//loop over all n-grams in the TargetPhrase (no matter of n)
|
||||
// and compute the decaying_score for all words
|
||||
// and return their sum
|
||||
float DynamicCacheBasedLanguageModel::Evaluate_All_Substrings(const TargetPhrase& tp) const
|
||||
{
|
||||
//loop over all n-grams in the TargetPhrase (no matter of n)
|
||||
// and compute the decaying_score for all words
|
||||
// and return their sum
|
||||
|
||||
decaying_cache_t::const_iterator it;
|
||||
float score = 0.0;
|
||||
for (size_t startpos = 0 ; startpos < tp.GetSize() ; ++startpos) {
|
||||
std::string w = "";
|
||||
for (size_t endpos = startpos; endpos < tp.GetSize() ; ++endpos) {
|
||||
w += tp.GetWord(endpos).GetFactor(0)->GetString().as_string();
|
||||
it = m_cache.find(w);
|
||||
decaying_cache_t::const_iterator it;
|
||||
float score = 0.0;
|
||||
for (size_t startpos = 0 ; startpos < tp.GetSize() ; ++startpos) {
|
||||
std::string w = "";
|
||||
for (size_t endpos = startpos; endpos < tp.GetSize() ; ++endpos) {
|
||||
w += tp.GetWord(endpos).GetFactor(0)->GetString().as_string();
|
||||
it = m_cache.find(w);
|
||||
|
||||
// VERBOSE(1,"cblm::Evaluate_All_Substrings: cheching cache for w:|" << w << "|" << std::endl);
|
||||
if (it != m_cache.end()) //found!
|
||||
{
|
||||
score += ((*it).second).second;
|
||||
VERBOSE(3,"cblm::Evaluate_All_Substrings: found w:|" << w << "| actual score:|" << ((*it).second).second << "| score:|" << score << "|" << std::endl);
|
||||
}
|
||||
if (it != m_cache.end()) { //found!
|
||||
score += ((*it).second).second;
|
||||
VERBOSE(3,"cblm::Evaluate_All_Substrings: found w:|" << w << "| actual score:|" << ((*it).second).second << "| score:|" << score << "|" << std::endl);
|
||||
}
|
||||
|
||||
if (endpos == startpos){
|
||||
w += " ";
|
||||
}
|
||||
if (endpos == startpos) {
|
||||
w += " ";
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
VERBOSE(3,"cblm::Evaluate_All_Substrings: returning score:|" << score << "|" << std::endl);
|
||||
return score;
|
||||
}
|
||||
|
||||
void DynamicCacheBasedLanguageModel::Print() const
|
||||
{
|
||||
#ifdef WITH_THREADS
|
||||
boost::shared_lock<boost::shared_mutex> read_lock(m_cacheLock);
|
||||
#endif
|
||||
decaying_cache_t::const_iterator it;
|
||||
std::cout << "Content of the cache of Cache-Based Language Model" << std::endl;
|
||||
for ( it=m_cache.begin() ; it != m_cache.end(); it++ )
|
||||
{
|
||||
std::cout << "word:|" << (*it).first << "| age:|" << ((*it).second).first << "| score:|" << ((*it).second).second << "|" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
void DynamicCacheBasedLanguageModel::Decay()
|
||||
{
|
||||
#ifdef WITH_THREADS
|
||||
boost::shared_lock<boost::shared_mutex> lock(m_cacheLock);
|
||||
#endif
|
||||
decaying_cache_t::iterator it;
|
||||
|
||||
int age;
|
||||
float score;
|
||||
for ( it=m_cache.begin() ; it != m_cache.end(); it++ )
|
||||
{
|
||||
age=((*it).second).first + 1;
|
||||
if (age > 1000)
|
||||
{
|
||||
m_cache.erase(it);
|
||||
it--;
|
||||
}
|
||||
else
|
||||
{
|
||||
score = decaying_score(age);
|
||||
decaying_cache_value_t p (age, score);
|
||||
(*it).second = p;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void DynamicCacheBasedLanguageModel::Update(std::vector<std::string> words, int age)
|
||||
{
|
||||
#ifdef WITH_THREADS
|
||||
boost::shared_lock<boost::shared_mutex> lock(m_cacheLock);
|
||||
#endif
|
||||
for (size_t j=0; j<words.size(); j++)
|
||||
{
|
||||
words[j] = Trim(words[j]);
|
||||
VERBOSE(3,"CacheBasedLanguageModel::Update word[" << j << "]:"<< words[j] << " age:" << age << " decaying_score(age):" << decaying_score(age) << std::endl);
|
||||
decaying_cache_value_t p (age,decaying_score(age));
|
||||
std::pair<std::string, decaying_cache_value_t> e (words[j],p);
|
||||
m_cache.erase(words[j]); //always erase the element (do nothing if the entry does not exist)
|
||||
m_cache.insert(e); //insert the entry
|
||||
}
|
||||
}
|
||||
|
||||
void DynamicCacheBasedLanguageModel::Insert(std::string &entries)
|
||||
{
|
||||
if (entries != "")
|
||||
{
|
||||
VERBOSE(1,"entries:|" << entries << "|" << std::endl);
|
||||
std::vector<std::string> elements = TokenizeMultiCharSeparator(entries, "||");
|
||||
VERBOSE(1,"elements.size() after:|" << elements.size() << "|" << std::endl);
|
||||
Insert(elements);
|
||||
}
|
||||
}
|
||||
|
||||
void DynamicCacheBasedLanguageModel::Insert(std::vector<std::string> ngrams)
|
||||
{
|
||||
VERBOSE(1,"CacheBasedLanguageModel Insert ngrams.size():|" << ngrams.size() << "|" << std::endl);
|
||||
Decay();
|
||||
Update(ngrams,1);
|
||||
IFVERBOSE(2) Print();
|
||||
}
|
||||
|
||||
void DynamicCacheBasedLanguageModel::Execute(std::string command)
|
||||
{
|
||||
VERBOSE(1,"CacheBasedLanguageModel::Execute(std::string command:|" << command << "|" << std::endl);
|
||||
std::vector<std::string> commands = Tokenize(command, "||");
|
||||
Execute(commands);
|
||||
}
|
||||
|
||||
void DynamicCacheBasedLanguageModel::Execute(std::vector<std::string> commands)
|
||||
{
|
||||
for (size_t j=0; j<commands.size(); j++)
|
||||
{
|
||||
Execute_Single_Command(commands[j]);
|
||||
}
|
||||
IFVERBOSE(2) Print();
|
||||
}
|
||||
|
||||
void DynamicCacheBasedLanguageModel::Execute_Single_Command(std::string command)
|
||||
{
|
||||
VERBOSE(1,"CacheBasedLanguageModel::Execute_Single_Command(std::string command:|" << command << "|" << std::endl);
|
||||
if (command == "clear")
|
||||
{
|
||||
VERBOSE(1,"CacheBasedLanguageModel Execute command:|"<< command << "|. Cache cleared." << std::endl);
|
||||
Clear();
|
||||
}
|
||||
else if (command == "settype_wholestring")
|
||||
{
|
||||
VERBOSE(1,"CacheBasedLanguageModel Execute command:|"<< command << "|. Query type set to " << CBLM_QUERY_TYPE_WHOLESTRING << " (CBLM_QUERY_TYPE_WHOLESTRING)." << std::endl);
|
||||
SetQueryType(CBLM_QUERY_TYPE_WHOLESTRING);
|
||||
}
|
||||
else if (command == "settype_allsubstrings")
|
||||
{
|
||||
VERBOSE(1,"CacheBasedLanguageModel Execute command:|"<< command << "|. Query type set to " << CBLM_QUERY_TYPE_ALLSUBSTRINGS << " (CBLM_QUERY_TYPE_ALLSUBSTRINGS)." << std::endl);
|
||||
SetQueryType(CBLM_QUERY_TYPE_ALLSUBSTRINGS);
|
||||
}
|
||||
else
|
||||
{
|
||||
VERBOSE(1,"CacheBasedLanguageModel Execute command:|"<< command << "| is unknown. Skipped." << std::endl);
|
||||
}
|
||||
}
|
||||
|
||||
void DynamicCacheBasedLanguageModel::Clear()
|
||||
{
|
||||
#ifdef WITH_THREADS
|
||||
boost::shared_lock<boost::shared_mutex> lock(m_cacheLock);
|
||||
#endif
|
||||
m_cache.clear();
|
||||
}
|
||||
|
||||
void DynamicCacheBasedLanguageModel::Load()
|
||||
{
|
||||
VERBOSE(2,"DynamicCacheBasedLanguageModel::Load()" << std::endl);
|
||||
Load(m_initfiles);
|
||||
}
|
||||
|
||||
void DynamicCacheBasedLanguageModel::Load(const std::string file)
|
||||
{
|
||||
VERBOSE(2,"DynamicCacheBasedLanguageModel::Loadconst std::string file()" << std::endl);
|
||||
std::vector<std::string> files = Tokenize(m_initfiles, "||");
|
||||
Load(files);
|
||||
}
|
||||
|
||||
|
||||
void DynamicCacheBasedLanguageModel::Load(std::vector<std::string> files)
|
||||
{
|
||||
for(size_t j = 0; j < files.size(); ++j)
|
||||
{
|
||||
Load_Single_File(files[j]);
|
||||
}
|
||||
}
|
||||
|
||||
void DynamicCacheBasedLanguageModel::Load_Single_File(const std::string file)
|
||||
{
|
||||
//file format
|
||||
//age || n-gram
|
||||
//age || n-gram || n-gram || n-gram || ...
|
||||
//....
|
||||
//each n-gram is a sequence of n words (no matter of n)
|
||||
//
|
||||
//there is no limit on the size of n
|
||||
//
|
||||
//entries can be repeated, but the last entry overwrites the previous
|
||||
|
||||
|
||||
VERBOSE(2,"Loading data from the cache file " << file << std::endl);
|
||||
InputFileStream cacheFile(file);
|
||||
|
||||
std::string line;
|
||||
int age;
|
||||
std::vector<std::string> words;
|
||||
|
||||
while (getline(cacheFile, line)) {
|
||||
std::vector<std::string> vecStr = TokenizeMultiCharSeparator( line , "||" );
|
||||
if (vecStr.size() >= 2) {
|
||||
age = Scan<int>(vecStr[0]);
|
||||
vecStr.erase(vecStr.begin());
|
||||
Update(vecStr,age);
|
||||
} else {
|
||||
TRACE_ERR("ERROR: The format of the loaded file is wrong: " << line << std::endl);
|
||||
CHECK(false);
|
||||
}
|
||||
}
|
||||
IFVERBOSE(2) Print();
|
||||
}
|
||||
|
||||
void DynamicCacheBasedLanguageModel::SetQueryType(size_t type) {
|
||||
#ifdef WITH_THREADS
|
||||
boost::shared_lock<boost::shared_mutex> read_lock(m_cacheLock);
|
||||
#endif
|
||||
|
||||
query_type = type;
|
||||
if ( query_type != CBLM_QUERY_TYPE_WHOLESTRING
|
||||
&& query_type != CBLM_QUERY_TYPE_ALLSUBSTRINGS )
|
||||
{
|
||||
VERBOSE(2, "This query type " << query_type << " is unknown. Instead used " << CBLM_QUERY_TYPE_ALLSUBSTRINGS << "." << std::endl);
|
||||
query_type = CBLM_QUERY_TYPE_ALLSUBSTRINGS;
|
||||
}
|
||||
VERBOSE(2, "CacheBasedLanguageModel QueryType: " << query_type << std::endl);
|
||||
|
||||
};
|
||||
|
||||
void DynamicCacheBasedLanguageModel::SetScoreType(size_t type) {
|
||||
#ifdef WITH_THREADS
|
||||
boost::shared_lock<boost::shared_mutex> read_lock(m_cacheLock);
|
||||
#endif
|
||||
score_type = type;
|
||||
if ( score_type != CBLM_SCORE_TYPE_HYPERBOLA
|
||||
&& score_type != CBLM_SCORE_TYPE_POWER
|
||||
&& score_type != CBLM_SCORE_TYPE_EXPONENTIAL
|
||||
&& score_type != CBLM_SCORE_TYPE_COSINE
|
||||
&& score_type != CBLM_SCORE_TYPE_HYPERBOLA_REWARD
|
||||
&& score_type != CBLM_SCORE_TYPE_POWER_REWARD
|
||||
&& score_type != CBLM_SCORE_TYPE_EXPONENTIAL_REWARD )
|
||||
{
|
||||
VERBOSE(2, "This score type " << score_type << " is unknown. Instead used " << CBLM_SCORE_TYPE_HYPERBOLA << "." << std::endl);
|
||||
score_type = CBLM_SCORE_TYPE_HYPERBOLA;
|
||||
}
|
||||
VERBOSE(2, "CacheBasedLanguageModel ScoreType: " << score_type << std::endl);
|
||||
};
|
||||
|
||||
void DynamicCacheBasedLanguageModel::SetMaxAge(unsigned int age) {
|
||||
#ifdef WITH_THREADS
|
||||
boost::shared_lock<boost::shared_mutex> read_lock(m_cacheLock);
|
||||
#endif
|
||||
maxAge = age;
|
||||
VERBOSE(2, "CacheBasedLanguageModel MaxAge: " << maxAge << std::endl);
|
||||
};
|
||||
|
||||
float DynamicCacheBasedLanguageModel::decaying_score(const int age)
|
||||
{
|
||||
float sc;
|
||||
switch(score_type){
|
||||
case CBLM_SCORE_TYPE_HYPERBOLA:
|
||||
sc = (float) 1.0/age - 1.0;
|
||||
break;
|
||||
case CBLM_SCORE_TYPE_POWER:
|
||||
sc = (float) pow(age, -0.25) - 1.0;
|
||||
break;
|
||||
case CBLM_SCORE_TYPE_EXPONENTIAL:
|
||||
sc = (age == 1) ? 0.0 : (float) exp( 1.0/age ) / exp(1.0) - 1.0;
|
||||
break;
|
||||
case CBLM_SCORE_TYPE_COSINE:
|
||||
sc = (float) cos( (age-1) * (PI/2) / maxAge ) - 1.0;
|
||||
break;
|
||||
case CBLM_SCORE_TYPE_HYPERBOLA_REWARD:
|
||||
sc = (float) 1.0/age;
|
||||
break;
|
||||
case CBLM_SCORE_TYPE_POWER_REWARD:
|
||||
sc = (float) pow(age, -0.25);
|
||||
break;
|
||||
case CBLM_SCORE_TYPE_EXPONENTIAL_REWARD:
|
||||
sc = (age == 1) ? 1.0 : (float) exp( 1.0/age ) / exp(1.0);
|
||||
break;
|
||||
default:
|
||||
sc = -1.0;
|
||||
}
|
||||
return sc;
|
||||
}
|
||||
}
|
||||
}
|
||||
VERBOSE(3,"cblm::Evaluate_All_Substrings: returning score:|" << score << "|" << std::endl);
|
||||
return score;
|
||||
}
|
||||
|
||||
void DynamicCacheBasedLanguageModel::Print() const
|
||||
{
|
||||
#ifdef WITH_THREADS
|
||||
boost::shared_lock<boost::shared_mutex> read_lock(m_cacheLock);
|
||||
#endif
|
||||
decaying_cache_t::const_iterator it;
|
||||
std::cout << "Content of the cache of Cache-Based Language Model" << std::endl;
|
||||
for ( it=m_cache.begin() ; it != m_cache.end(); it++ ) {
|
||||
std::cout << "word:|" << (*it).first << "| age:|" << ((*it).second).first << "| score:|" << ((*it).second).second << "|" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
void DynamicCacheBasedLanguageModel::Decay()
|
||||
{
|
||||
#ifdef WITH_THREADS
|
||||
boost::shared_lock<boost::shared_mutex> lock(m_cacheLock);
|
||||
#endif
|
||||
decaying_cache_t::iterator it;
|
||||
|
||||
int age;
|
||||
float score;
|
||||
for ( it=m_cache.begin() ; it != m_cache.end(); it++ ) {
|
||||
age=((*it).second).first + 1;
|
||||
if (age > 1000) {
|
||||
m_cache.erase(it);
|
||||
it--;
|
||||
} else {
|
||||
score = decaying_score(age);
|
||||
decaying_cache_value_t p (age, score);
|
||||
(*it).second = p;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void DynamicCacheBasedLanguageModel::Update(std::vector<std::string> words, int age)
|
||||
{
|
||||
#ifdef WITH_THREADS
|
||||
boost::shared_lock<boost::shared_mutex> lock(m_cacheLock);
|
||||
#endif
|
||||
for (size_t j=0; j<words.size(); j++) {
|
||||
words[j] = Trim(words[j]);
|
||||
VERBOSE(3,"CacheBasedLanguageModel::Update word[" << j << "]:"<< words[j] << " age:" << age << " decaying_score(age):" << decaying_score(age) << std::endl);
|
||||
decaying_cache_value_t p (age,decaying_score(age));
|
||||
std::pair<std::string, decaying_cache_value_t> e (words[j],p);
|
||||
m_cache.erase(words[j]); //always erase the element (do nothing if the entry does not exist)
|
||||
m_cache.insert(e); //insert the entry
|
||||
}
|
||||
}
|
||||
|
||||
void DynamicCacheBasedLanguageModel::Insert(std::string &entries)
|
||||
{
|
||||
if (entries != "") {
|
||||
VERBOSE(1,"entries:|" << entries << "|" << std::endl);
|
||||
std::vector<std::string> elements = TokenizeMultiCharSeparator(entries, "||");
|
||||
VERBOSE(1,"elements.size() after:|" << elements.size() << "|" << std::endl);
|
||||
Insert(elements);
|
||||
}
|
||||
}
|
||||
|
||||
void DynamicCacheBasedLanguageModel::Insert(std::vector<std::string> ngrams)
|
||||
{
|
||||
VERBOSE(1,"CacheBasedLanguageModel Insert ngrams.size():|" << ngrams.size() << "|" << std::endl);
|
||||
Decay();
|
||||
Update(ngrams,1);
|
||||
IFVERBOSE(2) Print();
|
||||
}
|
||||
|
||||
void DynamicCacheBasedLanguageModel::Execute(std::string command)
|
||||
{
|
||||
VERBOSE(1,"CacheBasedLanguageModel::Execute(std::string command:|" << command << "|" << std::endl);
|
||||
std::vector<std::string> commands = Tokenize(command, "||");
|
||||
Execute(commands);
|
||||
}
|
||||
|
||||
void DynamicCacheBasedLanguageModel::Execute(std::vector<std::string> commands)
|
||||
{
|
||||
for (size_t j=0; j<commands.size(); j++) {
|
||||
Execute_Single_Command(commands[j]);
|
||||
}
|
||||
IFVERBOSE(2) Print();
|
||||
}
|
||||
|
||||
void DynamicCacheBasedLanguageModel::Execute_Single_Command(std::string command)
|
||||
{
|
||||
VERBOSE(1,"CacheBasedLanguageModel::Execute_Single_Command(std::string command:|" << command << "|" << std::endl);
|
||||
if (command == "clear") {
|
||||
VERBOSE(1,"CacheBasedLanguageModel Execute command:|"<< command << "|. Cache cleared." << std::endl);
|
||||
Clear();
|
||||
} else if (command == "settype_wholestring") {
|
||||
VERBOSE(1,"CacheBasedLanguageModel Execute command:|"<< command << "|. Query type set to " << CBLM_QUERY_TYPE_WHOLESTRING << " (CBLM_QUERY_TYPE_WHOLESTRING)." << std::endl);
|
||||
SetQueryType(CBLM_QUERY_TYPE_WHOLESTRING);
|
||||
} else if (command == "settype_allsubstrings") {
|
||||
VERBOSE(1,"CacheBasedLanguageModel Execute command:|"<< command << "|. Query type set to " << CBLM_QUERY_TYPE_ALLSUBSTRINGS << " (CBLM_QUERY_TYPE_ALLSUBSTRINGS)." << std::endl);
|
||||
SetQueryType(CBLM_QUERY_TYPE_ALLSUBSTRINGS);
|
||||
} else {
|
||||
VERBOSE(1,"CacheBasedLanguageModel Execute command:|"<< command << "| is unknown. Skipped." << std::endl);
|
||||
}
|
||||
}
|
||||
|
||||
void DynamicCacheBasedLanguageModel::Clear()
|
||||
{
|
||||
#ifdef WITH_THREADS
|
||||
boost::shared_lock<boost::shared_mutex> lock(m_cacheLock);
|
||||
#endif
|
||||
m_cache.clear();
|
||||
}
|
||||
|
||||
void DynamicCacheBasedLanguageModel::Load()
|
||||
{
|
||||
VERBOSE(2,"DynamicCacheBasedLanguageModel::Load()" << std::endl);
|
||||
Load(m_initfiles);
|
||||
}
|
||||
|
||||
void DynamicCacheBasedLanguageModel::Load(const std::string file)
|
||||
{
|
||||
VERBOSE(2,"DynamicCacheBasedLanguageModel::Loadconst std::string file()" << std::endl);
|
||||
std::vector<std::string> files = Tokenize(m_initfiles, "||");
|
||||
Load(files);
|
||||
}
|
||||
|
||||
|
||||
void DynamicCacheBasedLanguageModel::Load(std::vector<std::string> files)
|
||||
{
|
||||
for(size_t j = 0; j < files.size(); ++j) {
|
||||
Load_Single_File(files[j]);
|
||||
}
|
||||
}
|
||||
|
||||
void DynamicCacheBasedLanguageModel::Load_Single_File(const std::string file)
|
||||
{
|
||||
//file format
|
||||
//age || n-gram
|
||||
//age || n-gram || n-gram || n-gram || ...
|
||||
//....
|
||||
//each n-gram is a sequence of n words (no matter of n)
|
||||
//
|
||||
//there is no limit on the size of n
|
||||
//
|
||||
//entries can be repeated, but the last entry overwrites the previous
|
||||
|
||||
|
||||
VERBOSE(2,"Loading data from the cache file " << file << std::endl);
|
||||
InputFileStream cacheFile(file);
|
||||
|
||||
std::string line;
|
||||
int age;
|
||||
std::vector<std::string> words;
|
||||
|
||||
while (getline(cacheFile, line)) {
|
||||
std::vector<std::string> vecStr = TokenizeMultiCharSeparator( line , "||" );
|
||||
if (vecStr.size() >= 2) {
|
||||
age = Scan<int>(vecStr[0]);
|
||||
vecStr.erase(vecStr.begin());
|
||||
Update(vecStr,age);
|
||||
} else {
|
||||
TRACE_ERR("ERROR: The format of the loaded file is wrong: " << line << std::endl);
|
||||
CHECK(false);
|
||||
}
|
||||
}
|
||||
IFVERBOSE(2) Print();
|
||||
}
|
||||
|
||||
void DynamicCacheBasedLanguageModel::SetQueryType(size_t type)
|
||||
{
|
||||
#ifdef WITH_THREADS
|
||||
boost::shared_lock<boost::shared_mutex> read_lock(m_cacheLock);
|
||||
#endif
|
||||
|
||||
query_type = type;
|
||||
if ( query_type != CBLM_QUERY_TYPE_WHOLESTRING
|
||||
&& query_type != CBLM_QUERY_TYPE_ALLSUBSTRINGS ) {
|
||||
VERBOSE(2, "This query type " << query_type << " is unknown. Instead used " << CBLM_QUERY_TYPE_ALLSUBSTRINGS << "." << std::endl);
|
||||
query_type = CBLM_QUERY_TYPE_ALLSUBSTRINGS;
|
||||
}
|
||||
VERBOSE(2, "CacheBasedLanguageModel QueryType: " << query_type << std::endl);
|
||||
|
||||
};
|
||||
|
||||
void DynamicCacheBasedLanguageModel::SetScoreType(size_t type)
|
||||
{
|
||||
#ifdef WITH_THREADS
|
||||
boost::shared_lock<boost::shared_mutex> read_lock(m_cacheLock);
|
||||
#endif
|
||||
score_type = type;
|
||||
if ( score_type != CBLM_SCORE_TYPE_HYPERBOLA
|
||||
&& score_type != CBLM_SCORE_TYPE_POWER
|
||||
&& score_type != CBLM_SCORE_TYPE_EXPONENTIAL
|
||||
&& score_type != CBLM_SCORE_TYPE_COSINE
|
||||
&& score_type != CBLM_SCORE_TYPE_HYPERBOLA_REWARD
|
||||
&& score_type != CBLM_SCORE_TYPE_POWER_REWARD
|
||||
&& score_type != CBLM_SCORE_TYPE_EXPONENTIAL_REWARD ) {
|
||||
VERBOSE(2, "This score type " << score_type << " is unknown. Instead used " << CBLM_SCORE_TYPE_HYPERBOLA << "." << std::endl);
|
||||
score_type = CBLM_SCORE_TYPE_HYPERBOLA;
|
||||
}
|
||||
VERBOSE(2, "CacheBasedLanguageModel ScoreType: " << score_type << std::endl);
|
||||
};
|
||||
|
||||
void DynamicCacheBasedLanguageModel::SetMaxAge(unsigned int age)
|
||||
{
|
||||
#ifdef WITH_THREADS
|
||||
boost::shared_lock<boost::shared_mutex> read_lock(m_cacheLock);
|
||||
#endif
|
||||
maxAge = age;
|
||||
VERBOSE(2, "CacheBasedLanguageModel MaxAge: " << maxAge << std::endl);
|
||||
};
|
||||
|
||||
float DynamicCacheBasedLanguageModel::decaying_score(const int age)
|
||||
{
|
||||
float sc;
|
||||
switch(score_type) {
|
||||
case CBLM_SCORE_TYPE_HYPERBOLA:
|
||||
sc = (float) 1.0/age - 1.0;
|
||||
break;
|
||||
case CBLM_SCORE_TYPE_POWER:
|
||||
sc = (float) pow(age, -0.25) - 1.0;
|
||||
break;
|
||||
case CBLM_SCORE_TYPE_EXPONENTIAL:
|
||||
sc = (age == 1) ? 0.0 : (float) exp( 1.0/age ) / exp(1.0) - 1.0;
|
||||
break;
|
||||
case CBLM_SCORE_TYPE_COSINE:
|
||||
sc = (float) cos( (age-1) * (PI/2) / maxAge ) - 1.0;
|
||||
break;
|
||||
case CBLM_SCORE_TYPE_HYPERBOLA_REWARD:
|
||||
sc = (float) 1.0/age;
|
||||
break;
|
||||
case CBLM_SCORE_TYPE_POWER_REWARD:
|
||||
sc = (float) pow(age, -0.25);
|
||||
break;
|
||||
case CBLM_SCORE_TYPE_EXPONENTIAL_REWARD:
|
||||
sc = (age == 1) ? 1.0 : (float) exp( 1.0/age ) / exp(1.0);
|
||||
break;
|
||||
default:
|
||||
sc = -1.0;
|
||||
}
|
||||
return sc;
|
||||
}
|
||||
}
|
||||
|
@ -10,8 +10,8 @@
|
||||
#include <boost/thread/locks.hpp>
|
||||
#endif
|
||||
|
||||
typedef std::pair<int, float> decaying_cache_value_t;
|
||||
typedef std::map<std::string, decaying_cache_value_t > decaying_cache_t;
|
||||
typedef std::pair<int, float> decaying_cache_value_t;
|
||||
typedef std::map<std::string, decaying_cache_value_t > decaying_cache_t;
|
||||
|
||||
#define CBLM_QUERY_TYPE_ALLSUBSTRINGS 0
|
||||
#define CBLM_QUERY_TYPE_WHOLESTRING 1
|
||||
@ -57,10 +57,10 @@ class DynamicCacheBasedLanguageModel : public StatelessFeatureFunction
|
||||
|
||||
void Decay();
|
||||
void Update(std::vector<std::string> words, int age);
|
||||
|
||||
|
||||
void Execute(std::vector<std::string> commands);
|
||||
void Execute_Single_Command(std::string command);
|
||||
|
||||
|
||||
void Load(std::vector<std::string> files);
|
||||
void Load_Single_File(const std::string file);
|
||||
|
||||
@ -91,9 +91,9 @@ public:
|
||||
void Insert(std::string &entries);
|
||||
|
||||
void Evaluate(const Phrase &source
|
||||
, const TargetPhrase &targetPhrase
|
||||
, ScoreComponentCollection &scoreBreakdown
|
||||
, ScoreComponentCollection &estimatedFutureScore) const;
|
||||
, const TargetPhrase &targetPhrase
|
||||
, ScoreComponentCollection &scoreBreakdown
|
||||
, ScoreComponentCollection &estimatedFutureScore) const;
|
||||
|
||||
};
|
||||
|
||||
@ -116,7 +116,7 @@ class DynamicCacheBasedLanguageModel : public StatelessFeatureFunction
|
||||
|
||||
void Decay();
|
||||
void Update(std::vector<std::string> words, int age);
|
||||
|
||||
|
||||
void Execute(std::vector<std::string> commands);
|
||||
void Execute_Single_Command(std::string command);
|
||||
|
||||
@ -130,7 +130,7 @@ class DynamicCacheBasedLanguageModel : public StatelessFeatureFunction
|
||||
void Evaluate( const TargetPhrase&, ScoreComponentCollection* ) const;
|
||||
|
||||
public:
|
||||
|
||||
|
||||
DynamicCacheBasedLanguageModel(const std::string &line);
|
||||
~DynamicCacheBasedLanguageModel();
|
||||
|
||||
|
@ -480,15 +480,15 @@ public:
|
||||
return m_dynamicCBLM;
|
||||
}
|
||||
|
||||
/*
|
||||
DynamicCacheBasedPhraseDictionary *GetDynamicCacheBasedPhraseDictionary() const {
|
||||
return m_dynamicCBPD;
|
||||
}
|
||||
/*
|
||||
DynamicCacheBasedPhraseDictionary *GetDynamicCacheBasedPhraseDictionary() const {
|
||||
return m_dynamicCBPD;
|
||||
}
|
||||
|
||||
const DynamicCacheBasedPhraseDictionary *GetDynamicCacheBasedPhraseDictionary() { // for mira
|
||||
return m_dynamicCBPD;
|
||||
}
|
||||
*/
|
||||
const DynamicCacheBasedPhraseDictionary *GetDynamicCacheBasedPhraseDictionary() { // for mira
|
||||
return m_dynamicCBPD;
|
||||
}
|
||||
*/
|
||||
|
||||
const ScoreComponentCollection& GetAllWeights() const {
|
||||
return m_allWeights;
|
||||
|
@ -122,7 +122,7 @@ enum PhraseTableImplementation {
|
||||
,Compact = 12
|
||||
,Interpolated = 13
|
||||
,DSuffixArray = 14
|
||||
,DCacheBased = 32
|
||||
,DCacheBased = 32
|
||||
};
|
||||
|
||||
enum InputTypeEnum {
|
||||
|
@ -112,8 +112,8 @@ std::vector< std::map<std::string, std::string> > ProcessAndStripDLT(std::string
|
||||
std::vector< std::map<std::string, std::string> > meta;
|
||||
std::string lline = ToLower(line);
|
||||
bool check_dlt = true;
|
||||
|
||||
std::cerr << "GLOBAL START" << endl;
|
||||
|
||||
std::cerr << "GLOBAL START" << endl;
|
||||
while (check_dlt) {
|
||||
size_t start = lline.find("<dlt");
|
||||
if (start == std::string::npos) {
|
||||
@ -133,50 +133,50 @@ std::vector< std::map<std::string, std::string> > ProcessAndStripDLT(std::string
|
||||
line.erase(start,close-start+2);
|
||||
lline.erase(start,close-start+2);
|
||||
|
||||
if (dlt != ""){
|
||||
if (dlt != "") {
|
||||
|
||||
std::map<std::string, std::string> tmp_meta;
|
||||
for (size_t i = 1; i < dlt.size(); i++) {
|
||||
if (dlt[i] == '=') {
|
||||
std::string label = dlt.substr(0, i);
|
||||
std::string val = dlt.substr(i+1);
|
||||
std::cerr << "label:|" << label << "|" << endl;
|
||||
std::cerr << "val:|" << val << "|" << endl;
|
||||
if (val[0] == '"') {
|
||||
val = val.substr(1);
|
||||
// it admits any double quotation mark in the value of the attribute
|
||||
// it assumes that just one attribute is present in the tag,
|
||||
// it assumes that the value starts and ends with double quotation mark
|
||||
size_t close = val.rfind('"');
|
||||
if (close == std::string::npos) {
|
||||
TRACE_ERR("SGML parse error: missing \"\n");
|
||||
dlt = "";
|
||||
i = 0;
|
||||
std::map<std::string, std::string> tmp_meta;
|
||||
for (size_t i = 1; i < dlt.size(); i++) {
|
||||
if (dlt[i] == '=') {
|
||||
std::string label = dlt.substr(0, i);
|
||||
std::string val = dlt.substr(i+1);
|
||||
std::cerr << "label:|" << label << "|" << endl;
|
||||
std::cerr << "val:|" << val << "|" << endl;
|
||||
if (val[0] == '"') {
|
||||
val = val.substr(1);
|
||||
// it admits any double quotation mark in the value of the attribute
|
||||
// it assumes that just one attribute is present in the tag,
|
||||
// it assumes that the value starts and ends with double quotation mark
|
||||
size_t close = val.rfind('"');
|
||||
if (close == std::string::npos) {
|
||||
TRACE_ERR("SGML parse error: missing \"\n");
|
||||
dlt = "";
|
||||
i = 0;
|
||||
} else {
|
||||
dlt = val.substr(close+1);
|
||||
val = val.substr(0, close);
|
||||
i = 0;
|
||||
}
|
||||
} else {
|
||||
dlt = val.substr(close+1);
|
||||
val = val.substr(0, close);
|
||||
i = 0;
|
||||
}
|
||||
} else {
|
||||
size_t close = val.find(' ');
|
||||
if (close == std::string::npos) {
|
||||
dlt = "";
|
||||
i = 0;
|
||||
} else {
|
||||
dlt = val.substr(close+1);
|
||||
val = val.substr(0, close);
|
||||
size_t close = val.find(' ');
|
||||
if (close == std::string::npos) {
|
||||
dlt = "";
|
||||
i = 0;
|
||||
} else {
|
||||
dlt = val.substr(close+1);
|
||||
val = val.substr(0, close);
|
||||
}
|
||||
}
|
||||
label = Trim(label);
|
||||
dlt = Trim(dlt);
|
||||
|
||||
tmp_meta[label] = val;
|
||||
std::cerr << "tmp_meta:|" << tmp_meta[label] << "|" << endl;
|
||||
}
|
||||
label = Trim(label);
|
||||
dlt = Trim(dlt);
|
||||
|
||||
tmp_meta[label] = val;
|
||||
std::cerr << "tmp_meta:|" << tmp_meta[label] << "|" << endl;
|
||||
}
|
||||
}
|
||||
|
||||
meta.push_back(tmp_meta);
|
||||
}
|
||||
meta.push_back(tmp_meta);
|
||||
}
|
||||
}
|
||||
std::cerr << "GLOBAL END" << endl;
|
||||
return meta;
|
||||
|
Loading…
Reference in New Issue
Block a user