Remove trailing whitespace in C++ files.

Jeroen Vermeulen 2015-04-30 12:05:11 +07:00
parent 85acdc62b1
commit eca5824100
368 changed files with 5749 additions and 5749 deletions


@ -1,101 +1,101 @@
// $Id: Vocabulary.cpp 1565 2008-02-22 14:42:01Z bojar $
#include "Vocabulary.h"
#include <fstream>
namespace
{
const int MAX_LENGTH = 10000;
} // namespace
using namespace std;
// as in beamdecoder/tables.cpp
vector<WORD_ID> Vocabulary::Tokenize( const char input[] )
{
vector< WORD_ID > token;
bool betweenWords = true;
int start=0;
int i=0;
for(; input[i] != '\0'; i++) {
bool isSpace = (input[i] == ' ' || input[i] == '\t');
if (!isSpace && betweenWords) {
start = i;
betweenWords = false;
} else if (isSpace && !betweenWords) {
token.push_back( StoreIfNew ( string( input+start, i-start ) ) );
betweenWords = true;
}
}
if (!betweenWords)
token.push_back( StoreIfNew ( string( input+start, i-start ) ) );
return token;
}
WORD_ID Vocabulary::StoreIfNew( const WORD& word )
{
map<WORD, WORD_ID>::iterator i = lookup.find( word );
if( i != lookup.end() )
return i->second;
WORD_ID id = vocab.size();
vocab.push_back( word );
lookup[ word ] = id;
return id;
}
WORD_ID Vocabulary::GetWordID( const WORD &word ) const
{
map<WORD, WORD_ID>::const_iterator i = lookup.find( word );
if( i == lookup.end() )
return 0;
WORD_ID w= (WORD_ID) i->second;
return w;
}
void Vocabulary::Save(const string& fileName ) const
{
ofstream vcbFile;
vcbFile.open( fileName.c_str(), ios::out | ios::ate | ios::trunc);
if (!vcbFile) {
cerr << "Failed to open " << fileName << endl;
exit(1);
}
vector< WORD >::const_iterator i;
for(i = vocab.begin(); i != vocab.end(); i++) {
const string &word = *i;
vcbFile << word << endl;
}
vcbFile.close();
}
void Vocabulary::Load(const string& fileName )
{
ifstream vcbFile;
char line[MAX_LENGTH];
vcbFile.open(fileName.c_str());
if (!vcbFile) {
cerr << "no such file or directory: " << fileName << endl;
exit(1);
}
cerr << "loading from " << fileName << endl;
istream *fileP = &vcbFile;
int count = 0;
while(!fileP->eof()) {
SAFE_GETLINE((*fileP), line, MAX_LENGTH, '\n');
if (fileP->eof()) break;
int length = 0;
for(; line[length] != '\0'; length++);
StoreIfNew( string( line, length ) );
count++;
}
vcbFile.close();
cerr << count << " words read, vocabulary size " << vocab.size() << endl;
}
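A minimal usage sketch for the Vocabulary class implemented above (illustrative only: it assumes Vocabulary.h declares Tokenize(), StoreIfNew(), GetWordID(), Save() and Load() as used in this file, that WORD is a std::string typedef, and that WORD_ID is an integral typedef):

// Illustrative driver, not part of the commit.
#include "Vocabulary.h"
#include <cstddef>
#include <iostream>

int main()
{
  Vocabulary vocab;
  // Tokenize() splits on spaces/tabs; each unseen word gets a fresh id
  // via StoreIfNew(), and repeated words reuse their existing id.
  std::vector<WORD_ID> ids = vocab.Tokenize("the quick brown fox the");
  for (std::size_t i = 0; i < ids.size(); ++i)
    std::cout << ids[i] << ' ';
  std::cout << std::endl;
  // GetWordID() is a read-only lookup; per the code above it returns 0
  // for words that were never stored.
  std::cout << vocab.GetWordID("quick") << ' '
            << vocab.GetWordID("never-seen") << std::endl;
  return 0;
}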


@ -46,7 +46,7 @@ RE2 curr_en_x("^[Nn]?[\'][\\p{L}]"); // english contraction suffixes conjoin to
RE2 pre_en_x(".*[\\p{L}\\p{N}]+$"); // valid english contraction prefixes
RE2 curr_fr_x(".*[\\p{L}\\p{N}]+[\']"); // french/italian contraction prefixes conjoin to the right
RE2 post_fr_x("^[\\p{L}\\p{N}]*"); // valid french/italian contraction suffixes
// anything rarely used will just be given as a string and compiled on demand by RE2
const char *
SPC_BYTE = " ";
@ -85,8 +85,8 @@ const char *ESCAPE_MOSES[] = {
"&apos;", // ' 6 (27)
"&quot;", // " 7 (22)
};
const std::set<std::string>
ESCAPE_SET = {
std::string(ESCAPE_MOSES[0]),
std::string(ESCAPE_MOSES[1]),
@ -98,7 +98,7 @@ ESCAPE_SET = {
std::string(ESCAPE_MOSES[7]),
};
const std::map<std::wstring,gunichar>
ENTITY_MAP = {
{ std::wstring(L"&quot;"), L'"' },
{ std::wstring(L"&amp;"), L'&' },
@ -355,7 +355,7 @@ ENTITY_MAP = {
{ std::wstring(L"&diams;"), L'\u2666' }
};
inline gunichar
get_entity(gunichar *ptr, size_t len) {
// try hex, decimal entity first
gunichar ech(0);
@ -380,16 +380,16 @@ get_entity(gunichar *ptr, size_t len) {
ech = 0;
}
}
if (ech)
return ech;
std::map<std::wstring,gunichar>::const_iterator it =
ENTITY_MAP.find(std::wstring((wchar_t *)(ptr),len));
return it != ENTITY_MAP.end() ? it->second : gunichar(0);
}
inline gunichar
get_entity(char *ptr, size_t len) {
glong ulen = 0;
gunichar *gtmp = g_utf8_to_ucs4_fast((const gchar *)ptr, len, &ulen);
@ -399,7 +399,7 @@ get_entity(char *ptr, size_t len) {
}
inline std::string
trim(const std::string& in)
{
std::size_t start = 0;
@ -413,7 +413,7 @@ trim(const std::string& in)
}
inline std::vector<std::string>
split(const std::string& in)
{
std::vector<std::string> outv;
@ -476,7 +476,7 @@ Tokenizer::Tokenizer(const Parameters& _)
//
// dtor deletes dynamically allocated per-language RE2 compiled expressions
//
Tokenizer::~Tokenizer()
{
for (auto& ptr : prot_pat_vec) {
if (ptr == &numprefixed_x || ptr == &quasinumeric_x)
@ -491,7 +491,7 @@ Tokenizer::~Tokenizer()
// others into nbpre_gen_set
//
std::pair<int,int>
Tokenizer::load_prefixes(std::ifstream& ifs)
{
RE2 numonly("(.*)[\\s]+(\\#NUMERIC_ONLY\\#)");
std::string line;
@ -547,7 +547,7 @@ Tokenizer::init(const char *cfg_dir_optional) {
try {
std::pair<int,int> counts = load_prefixes(cfg);
if (verbose_p) {
std::cerr << "loaded " << counts.first << " non-numeric, "
<< counts.second << " numeric prefixes from "
<< nbpre_path << std::endl;
}
@ -570,7 +570,7 @@ Tokenizer::init(const char *cfg_dir_optional) {
std::string protpat_path(cfg_dir);
protpat_path.append("/protected_pattern.").append(lang_iso);
// default to generic version
if (::access(protpat_path.c_str(),R_OK))
protpat_path = protpat_path.substr(0,protpat_path.size()-lang_iso.size()-1);
prot_pat_vec.push_back(&numprefixed_x);
@ -596,7 +596,7 @@ Tokenizer::init(const char *cfg_dir_optional) {
throw std::runtime_error(ess.str());
}
if (verbose_p) {
std::cerr << "loaded " << npat << " protected patterns from "
<< protpat_path << std::endl;
}
} else if (verbose_p) {
@ -612,7 +612,7 @@ Tokenizer::reset() {
//
// apply ctor-selected tokenization to a string, in-place, no newlines allowed,
// assumes protections are applied already, some invariants are in place,
// e.g. that successive chars <= ' ' have been normalized to a single ' '
//
void
@ -633,7 +633,7 @@ Tokenizer::protected_tokenize(std::string& text) {
}
if (pos < textpc.size() && textpc[pos] != ' ')
words.push_back(textpc.substr(pos,textpc.size()-pos));
// regurgitate words with look-ahead handling for tokens with final mumble
std::string outs;
std::size_t nwords(words.size());
@ -659,7 +659,7 @@ Tokenizer::protected_tokenize(std::string& text) {
// lower-case look-ahead does not break
sentence_break_p = false;
}
}
}
outs.append(words[ii].data(),len);
if (sentence_break_p)
@ -671,15 +671,15 @@ Tokenizer::protected_tokenize(std::string& text) {
}
bool
Tokenizer::unescape(std::string& word) {
std::ostringstream oss;
std::size_t was = 0; // last processed
std::size_t pos = 0; // last unprocessed
std::size_t len = 0; // processed length
bool hit = false;
for (std::size_t endp=0;
(pos = word.find('&',was)) != std::string::npos && (endp = word.find(';',pos)) != std::string::npos;
was = endp == std::string::npos ? pos : 1+endp) {
len = endp - pos + 1;
glong ulen(0);
@ -703,7 +703,7 @@ Tokenizer::unescape(std::string& word) {
}
g_free(gtmp);
}
if (was < word.size())
oss << word.substr(was);
if (hit)
word = oss.str();
@ -727,7 +727,7 @@ Tokenizer::escape(std::string& text) {
if (mod_p)
outs.append(pp,pt-pp+1);
} else {
if (mod_p)
outs.append(pp,mk-pp);
pt = --mk;
}
@ -751,7 +751,7 @@ Tokenizer::escape(std::string& text) {
} else if (*pt > ']') {
if (*pt =='|') { // 7c
sequence_p = ESCAPE_MOSES[0];
}
} else if (*pt > 'Z') {
if (*pt == '<') { // 3e
sequence_p = ESCAPE_MOSES[4];
@ -761,11 +761,11 @@ Tokenizer::escape(std::string& text) {
sequence_p = ESCAPE_MOSES[1];
} else if (*pt == ']') { // 5d
sequence_p = ESCAPE_MOSES[2];
}
}
if (sequence_p) {
if (pt > pp)
outs.append(pp,pt-pp);
outs.append(sequence_p);
mod_p = true;
@ -774,7 +774,7 @@ Tokenizer::escape(std::string& text) {
++pt;
}
}
if (mod_p) {
if (pp < pt) {
outs.append(pp,pt-pp);
@ -795,13 +795,13 @@ Tokenizer::penn_tokenize(const std::string& buf)
std::string text(buf);
std::string outs;
if (skip_alltags_p)
RE2::GlobalReplace(&text,genl_tags_x,SPC_BYTE);
// directed quote patches
size_t len = text.size();
if (len > 2 && text.substr(0,2) == "``")
text.replace(0,2,"`` ",3);
else if (text[0] == '"')
text.replace(0,1,"`` ",3);
else if (text[0] == '`' || text[0] == '\'')
@ -811,9 +811,9 @@ Tokenizer::penn_tokenize(const std::string& buf)
RE2::GlobalReplace(&text,x1_v_gg,one_gg);
RE2::GlobalReplace(&text,x1_v_g,"\\1 ` \\2");
RE2::GlobalReplace(&text,x1_v_q,"\\1 ` ");
// protect ellipsis
for (size_t pos = text.find("..."); pos != std::string::npos; pos = text.find("...",pos+11))
text.replace(pos,3,"MANYELIPSIS",11);
// numeric commas
@ -826,13 +826,13 @@ Tokenizer::penn_tokenize(const std::string& buf)
// isolable slash
RE2::GlobalReplace(&text,slash_x,special_refs);
// isolate final period
RE2::GlobalReplace(&text,final_x,"\\1 \\2\\3");
// isolate q.m., e.m.
RE2::GlobalReplace(&text,qx_x,isolate_ref);
// isolate braces
RE2::GlobalReplace(&text,braces_x,isolate_ref);
@ -866,7 +866,7 @@ Tokenizer::penn_tokenize(const std::string& buf)
}
std::string ntext(SPC_BYTE);
ntext.append(text);
// convert double quote to paired single-quotes
RE2::GlobalReplace(&ntext,"\""," '' ");
@ -894,7 +894,7 @@ Tokenizer::penn_tokenize(const std::string& buf)
RE2::GlobalReplace(&ntext," ([Ww])anna "," \\1an na ");
protected_tokenize(ntext);
// restore ellipsis
RE2::GlobalReplace(&ntext,"MANYELIPSIS","...");
@ -919,7 +919,7 @@ Tokenizer::quik_tokenize(const std::string& buf)
int num = 0;
// this is the main moses-compatible tokenizer
// push all the prefixes matching protected patterns
std::vector<std::string> prot_stack;
std::string match;
@ -942,7 +942,7 @@ Tokenizer::quik_tokenize(const std::string& buf)
}
}
}
const char *pt(text.c_str());
const char *ep(pt + text.size());
while (pt < ep && *pt >= 0 && *pt <= ' ')
@ -990,8 +990,8 @@ Tokenizer::quik_tokenize(const std::string& buf)
if (!since_start) {
if (std::isalpha(char(*ucs4)))
alpha_prefix++;
} else if (alpha_prefix == since_start
&& char(*ucs4) == ':'
&& next_type != G_UNICODE_SPACE_SEPARATOR) {
in_url_p = true;
}
@ -1018,7 +1018,7 @@ Tokenizer::quik_tokenize(const std::string& buf)
// fallthough
case G_UNICODE_UPPERCASE_LETTER:
case G_UNICODE_LOWERCASE_LETTER:
if (downcase_p && curr_type == G_UNICODE_UPPERCASE_LETTER)
curr_uch = g_unichar_tolower(*ucs4);
break;
case G_UNICODE_SPACING_MARK:
@ -1082,8 +1082,8 @@ Tokenizer::quik_tokenize(const std::string& buf)
substitute_p = L"@-@";
post_break_p = pre_break_p = true;
} else if ( ( curr_uch > gunichar(L'\u002D') && curr_uch < gunichar(L'\u2010') ) ||
( curr_uch > gunichar(L'\u2011')
&& curr_uch != gunichar(L'\u30A0')
&& curr_uch < gunichar(L'\uFE63') ) ) {
// dash, not a hyphen
post_break_p = pre_break_p = true;
@ -1151,7 +1151,7 @@ Tokenizer::quik_tokenize(const std::string& buf)
default:
post_break_p = pre_break_p = prev_uch != curr_uch;
break;
}
}
}
break;
@ -1159,8 +1159,8 @@ Tokenizer::quik_tokenize(const std::string& buf)
switch (curr_uch) {
case gunichar(L':'):
case gunichar(L'/'):
if (refined_p && !in_url_p
&& prev_type == G_UNICODE_DECIMAL_NUMBER
&& next_type == G_UNICODE_DECIMAL_NUMBER) {
break;
}
@ -1178,7 +1178,7 @@ Tokenizer::quik_tokenize(const std::string& buf)
break;
case gunichar(L'&'):
if (unescape_p) {
if (next_type == G_UNICODE_LOWERCASE_LETTER || next_type == G_UNICODE_UPPERCASE_LETTER
|| next_type == G_UNICODE_DECIMAL_NUMBER || next_uch == gunichar(L'#')) {
gunichar *eptr = nxt4;
GUnicodeType eptr_type(G_UNICODE_UNASSIGNED);
@ -1223,16 +1223,16 @@ Tokenizer::quik_tokenize(const std::string& buf)
next_type = nxt4 < lim4 ? g_unichar_type(next_uch) : G_UNICODE_UNASSIGNED;
goto retry;
}
}
post_break_p = pre_break_p = !in_url_p || next_type != G_UNICODE_SPACE_SEPARATOR;
if (escape_p)
substitute_p = L"&amp;";
break;
case gunichar(L'\''):
if (english_p) {
if (!in_url_p) {
bool next_letter_p = next_type == G_UNICODE_LOWERCASE_LETTER
|| next_type == G_UNICODE_UPPERCASE_LETTER;
pre_break_p = true;
if (next_letter_p && refined_p) {
@ -1241,9 +1241,9 @@ Tokenizer::quik_tokenize(const std::string& buf)
*(uptr - 1) = gunichar(L' ');
*(uptr++) = prev_uch;
pre_break_p = false;
}
}
}
post_break_p = since_start == 0
|| (!next_letter_p && next_type != G_UNICODE_DECIMAL_NUMBER);
}
} else if (latin_p) {
@ -1252,12 +1252,12 @@ Tokenizer::quik_tokenize(const std::string& buf)
} else {
post_break_p = pre_break_p = !in_url_p;
}
if (escape_p)
substitute_p = L"&apos;";
break;
case gunichar(L'"'):
post_break_p = pre_break_p = true;
if (escape_p)
substitute_p = L"&quot;";
break;
case gunichar(L','):
@ -1303,7 +1303,7 @@ Tokenizer::quik_tokenize(const std::string& buf)
}
}
// terminal isolated letter does not break
} else if (class_follows_p(nxt4,lim4,G_UNICODE_LOWERCASE_LETTER) ||
g_unichar_type(*nxt4) == G_UNICODE_DASH_PUNCTUATION) {
// lower-case look-ahead does not break
} else {
@ -1315,7 +1315,7 @@ Tokenizer::quik_tokenize(const std::string& buf)
pre_break_p = true;
break;
}
}
}
break;
}
} else {
@ -1346,11 +1346,11 @@ Tokenizer::quik_tokenize(const std::string& buf)
case gunichar(L')'):
break;
case gunichar(L'['):
if (escape_p)
substitute_p = L"&#91;";
break;
case gunichar(L']'):
if (escape_p)
substitute_p = L"&#93;";
break;
default:
@ -1377,7 +1377,7 @@ Tokenizer::quik_tokenize(const std::string& buf)
if (english_p) {
if (!in_url_p) {
pre_break_p = true;
post_break_p = since_start == 0 ||
(next_type != G_UNICODE_LOWERCASE_LETTER && next_type != G_UNICODE_UPPERCASE_LETTER && next_type != G_UNICODE_DECIMAL_NUMBER);
}
} else if (latin_p) {
@ -1386,23 +1386,23 @@ Tokenizer::quik_tokenize(const std::string& buf)
} else {
post_break_p = pre_break_p = !in_url_p;
}
if (escape_p)
substitute_p = L"&apos;";
else
curr_uch = gunichar(L'\'');
break;
case gunichar(L'|'):
if (escape_p)
substitute_p = L"&#124;";
post_break_p = pre_break_p = true;
break;
case gunichar(L'<'):
if (escape_p)
substitute_p = L"&lt;";
post_break_p = pre_break_p = true;
break;
case gunichar(L'>'):
if (escape_p)
substitute_p = L"&gt;";
post_break_p = pre_break_p = true;
break;
@ -1414,7 +1414,7 @@ Tokenizer::quik_tokenize(const std::string& buf)
case gunichar(L'='):
case gunichar(L'~'):
in_num_p = false;
post_break_p = pre_break_p = !in_url_p;
break;
case gunichar(L'+'):
post_break_p = pre_break_p = !in_url_p;
@ -1444,12 +1444,12 @@ Tokenizer::quik_tokenize(const std::string& buf)
curr_uch = gunichar(L' ');
} else if (curr_uch < gunichar(L' ')) {
curr_uch = gunichar(L' ');
} else if (curr_uch == gunichar(L'\u0092') &&
(next_type == G_UNICODE_LOWERCASE_LETTER || next_type == G_UNICODE_UPPERCASE_LETTER)) {
// observed corpus corruption case
if (english_p) {
pre_break_p = true;
post_break_p = since_start == 0 ||
(next_type != G_UNICODE_LOWERCASE_LETTER && next_type != G_UNICODE_UPPERCASE_LETTER && next_type != G_UNICODE_DECIMAL_NUMBER);
} else if (latin_p) {
post_break_p = true;
@ -1457,9 +1457,9 @@ Tokenizer::quik_tokenize(const std::string& buf)
} else {
post_break_p = pre_break_p = true;
}
if (escape_p)
substitute_p = L"&apos;";
else
curr_uch = gunichar(L'\'');
} else {
post_break_p = pre_break_p = true;
@ -1491,7 +1491,7 @@ Tokenizer::quik_tokenize(const std::string& buf)
in_url_p = in_num_p = false;
break;
}
if (pre_break_p || curr_uch == gunichar(L' ') || (bad_length && curr_type != G_UNICODE_UNASSIGNED)) {
if (since_start) {
// non-empty token emitted previously, so pre-break must emit token separator
@ -1501,8 +1501,8 @@ Tokenizer::quik_tokenize(const std::string& buf)
if (curr_uch == gunichar(L' '))
// suppress emission below, fall-through to substitute logic
curr_uch = 0;
}
}
if (substitute_p) {
for (gunichar *sptr = (gunichar *)substitute_p; *sptr; ++sptr) {
*uptr++ = *sptr;
@ -1521,7 +1521,7 @@ Tokenizer::quik_tokenize(const std::string& buf)
glong nbytes = 0;
gchar *utf8 = g_ucs4_to_utf8(ubuf,uptr-ubuf,0,&nbytes,0); // g_free
if (utf8[nbytes-1] == ' ')
--nbytes;
text.assign((const char *)utf8,(const char *)(utf8 + nbytes));
g_free(utf8);
@ -1552,7 +1552,7 @@ Tokenizer::quik_tokenize(const std::string& buf)
}
std::size_t
Tokenizer::tokenize(std::istream& is, std::ostream& os)
{
std::size_t line_no = 0;
@ -1561,10 +1561,10 @@ Tokenizer::tokenize(std::istream& is, std::ostream& os)
std::vector< std::vector< std::string > > results(nthreads);
std::vector< boost::thread > workers(nthreads);
bool done_p = !(is.good() && os.good());
for (std::size_t tranche = 0; !done_p; ++tranche) {
// for loop starting threads for chunks of input
for (std::size_t ithread = 0; ithread < nthreads; ++ithread) {
@ -1589,19 +1589,19 @@ Tokenizer::tokenize(std::istream& is, std::ostream& os)
results[ithread].resize(line_pos);
break;
}
lines[ithread][line_pos].clear();
} else if (skip_xml_p &&
(RE2::FullMatch(istr,tag_line_x) || RE2::FullMatch(istr,white_line_x))) {
lines[ithread][line_pos].clear();
} else {
lines[ithread][line_pos] =
std::string(SPC_BYTE).append(istr).append(SPC_BYTE);
}
}
}
if (line_pos) {
workers[ithread] =
boost::thread(VectorTokenizerCallable(this,lines[ithread],results[ithread]));
}
} // end for loop starting threads
@ -1616,22 +1616,22 @@ Tokenizer::tokenize(std::istream& is, std::ostream& os)
if (nlin != nres) {
std::ostringstream emsg;
emsg << "Tranche " << tranche
<< " worker " << ithread << "/" << nthreads
<< " |lines|==" << nlin << " != |results|==" << nres;
throw std::runtime_error(emsg.str());
}
for (std::size_t ires = 0; ires < nres; ++ires)
os << results[ithread][ires] << std::endl;
} // end loop over joined results
if (verbose_p) {
std::cerr << line_no << ' ';
std::cerr.flush();
}
} // end loop over chunks
return line_no;
@ -1642,18 +1642,18 @@ std::string
Tokenizer::detokenize(const std::string& buf)
{
std::vector<std::string> words = split(trim(buf));
std::size_t squotes = 0;
std::size_t dquotes = 0;
std::string prepends("");
std::ostringstream oss;
std::size_t nwords = words.size();
std::size_t iword = 0;
if (unescape_p)
for (auto &word: words)
unescape(word);
for (auto &word: words) {
@ -1665,13 +1665,13 @@ Tokenizer::detokenize(const std::string& buf)
} else if (RE2::FullMatch(word,left_x)) {
oss << word;
prepends = SPC_BYTE;
} else if (english_p && iword
&& RE2::FullMatch(word,curr_en_x)
&& RE2::FullMatch(words[iword-1],pre_en_x)) {
oss << word;
prepends = SPC_BYTE;
} else if (latin_p && iword < nwords - 2
&& RE2::FullMatch(word,curr_fr_x)
&& RE2::FullMatch(words[iword+1],post_fr_x)) {
oss << prepends << word;
prepends.clear();
@ -1679,7 +1679,7 @@ Tokenizer::detokenize(const std::string& buf)
if ((word.at(0) == '\'' && ((squotes % 2) == 0 )) ||
(word.at(0) == '"' && ((dquotes % 2) == 0))) {
if (english_p && iword
&& word.at(0) == '\''
&& std::tolower(words[iword-1].at(words[iword-1].size()-1)) == 's') {
oss << word;
prepends = SPC_BYTE;
@ -1698,7 +1698,7 @@ Tokenizer::detokenize(const std::string& buf)
prepends = SPC_BYTE;
if (word.at(0) == '\'')
squotes++;
else if (word.at(0) == '"')
dquotes++;
}
} else {
@ -1707,8 +1707,8 @@ Tokenizer::detokenize(const std::string& buf)
}
iword++;
}
std::string text(oss.str());
RE2::GlobalReplace(&text," +",SPC_BYTE);
RE2::GlobalReplace(&text,"\n ","\n");
@ -1718,14 +1718,14 @@ Tokenizer::detokenize(const std::string& buf)
std::size_t
Tokenizer::detokenize(std::istream& is, std::ostream& os)
{
size_t line_no = 0;
while (is.good() && os.good()) {
std::string istr;
std::getline(is,istr);
line_no ++;
if (istr.empty())
continue;
if (skip_xml_p && (RE2::FullMatch(istr,tag_line_x) || RE2::FullMatch(istr,white_line_x))) {
os << istr << std::endl;
@ -1749,7 +1749,7 @@ Tokenizer::splitter(const std::string &istr, bool *continuation_ptr) {
return parts;
}
gunichar *uout = (gunichar *)g_malloc0(2*ncp*sizeof(gunichar));
const wchar_t GENL_HYPH = L'\u2010';
const wchar_t IDEO_STOP = L'\u3002';
const wchar_t KANA_MDOT = L'\u30FB';
@ -1786,7 +1786,7 @@ Tokenizer::splitter(const std::string &istr, bool *continuation_ptr) {
std::vector<std::size_t> breaks;
std::set<std::size_t> suppress;
for (; icp <= ncp; ++icp) {
currwc = wchar_t(ucs4[icp]);
curr_type = g_unichar_type(currwc);
@ -1798,7 +1798,7 @@ Tokenizer::splitter(const std::string &istr, bool *continuation_ptr) {
case G_UNICODE_OTHER_NUMBER:
curr_class = numba;
curr_word_p = true;
break;
case G_UNICODE_LOWERCASE_LETTER:
case G_UNICODE_MODIFIER_LETTER:
case G_UNICODE_OTHER_LETTER:
@ -1822,7 +1822,7 @@ Tokenizer::splitter(const std::string &istr, bool *continuation_ptr) {
} else if (currwc >= SMAL_HYPH) {
curr_word_p = true;
} else {
curr_word_p = (currwc >= WAVE_DASH) && (currwc <= KANA_DHYP);
}
break;
case G_UNICODE_CLOSE_PUNCTUATION:
@ -1860,7 +1860,7 @@ Tokenizer::splitter(const std::string &istr, bool *continuation_ptr) {
curr_word_p = false;
break;
}
// # condition for prefix test
// $words[$i] =~ /([\p{IsAlnum}\.\-]*)([\'\"\)\]\%\p{IsPf}]*)(\.+)$/
// $words[$i+1] =~ /^([ ]*[\'\"\(\[\¿\¡\p{IsPi}]*[ ]*[\p{IsUpper}0-9])/
@ -1875,7 +1875,7 @@ Tokenizer::splitter(const std::string &istr, bool *continuation_ptr) {
} else if (curr_word_p) {
if (!fini_word) {
init_word = ocp;
}
fini_word = ocp+1;
dotslen = finilen = 0;
} else if (curr_class >= quote && curr_class <= pfpct && curr_class != pinit) {
@ -1893,7 +1893,7 @@ Tokenizer::splitter(const std::string &istr, bool *continuation_ptr) {
} else {
init_word = fini_word = 0;
}
if (check_abbr_p) {
// not a valid word character or post-word punctuation character: check word
std::wstring k((wchar_t *)uout+init_word,fini_word-init_word);
@ -1986,7 +1986,7 @@ Tokenizer::splitter(const std::string &istr, bool *continuation_ptr) {
}
init_word = fini_word = 0;
}
if (seqpos >= SEQ_LIM) {
seqpos = 0;
}
@ -2015,7 +2015,7 @@ Tokenizer::splitter(const std::string &istr, bool *continuation_ptr) {
continue;
}
}
if (!seqpos) {
if (curr_class != blank) {
uout[ocp++] = gunichar(currwc);
@ -2024,7 +2024,7 @@ Tokenizer::splitter(const std::string &istr, bool *continuation_ptr) {
}
continue;
}
if (curr_class == blank) {
if (prev_class != blank) {
seq[seqpos] = blank;
@ -2034,7 +2034,7 @@ Tokenizer::splitter(const std::string &istr, bool *continuation_ptr) {
}
if (icp < ncp)
continue;
}
}
if (curr_class >= quote && curr_class <= pfini) {
if (prev_class < quote || prev_class > pfini) {
@ -2158,8 +2158,8 @@ Tokenizer::splitter(const std::string &istr, bool *continuation_ptr) {
if (uout[chkpos] == L'\n' || uout[chkpos] == L' ') {
endpos = chkpos;
continue;
}
if (g_unichar_isgraph(uout[chkpos]))
break;
endpos = chkpos;
}
@ -2171,17 +2171,17 @@ Tokenizer::splitter(const std::string &istr, bool *continuation_ptr) {
if (continuation_ptr)
*continuation_ptr = endpos > iop;
iop = nextpos;
}
}
g_free(uout);
g_free(ucs4);
return parts;
}
std::pair<std::size_t,std::size_t>
Tokenizer::splitter(std::istream& is, std::ostream& os)
{
std::pair<std::size_t,std::size_t> counts = { 0, 0 };
bool continuation_p = false;
@ -2197,7 +2197,7 @@ Tokenizer::splitter(std::istream& is, std::ostream& os)
if (istr.empty() && (is.eof() ||!para_marks_p))
continue;
if (skip_xml_p && (RE2::FullMatch(istr,tag_line_x) || RE2::FullMatch(istr,white_line_x)))
continue;
std::vector<std::string> sentences(splitter(istr,&continuation_p));
@ -2221,13 +2221,13 @@ Tokenizer::splitter(std::istream& is, std::ostream& os)
os << " ";
pending_gap = false;
}
for (std::size_t ii = 0; ii < nsents-1; ++ii)
os << sentences[ii] << std::endl;
os << sentences[nsents-1];
if (continuation_p)
pending_gap = !split_breaks_p;
if (!pending_gap)
os << std::endl;
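The quik_tokenize() hunks above keep setting pre_break_p, post_break_p and substitute_p for the current character. As a rough illustration of that pattern only (not the Moses tokenizer logic), a stripped-down per-character loop over plain ASCII might look like this:

// Simplified illustration of the pre_break_p / post_break_p / substitute_p
// pattern used in quik_tokenize(): classify each character, optionally break
// before and/or after it, and optionally replace it with an escape sequence.
// This is NOT the Moses tokenizer; it handles only a couple of ASCII cases.
#include <cstddef>
#include <string>

static std::string toy_tokenize(const std::string& in)
{
  std::string out;
  for (std::size_t i = 0; i < in.size(); ++i) {
    char c = in[i];
    bool pre_break_p = false, post_break_p = false;
    const char *substitute_p = 0;
    switch (c) {
    case '"':
      pre_break_p = post_break_p = true;
      substitute_p = "&quot;";            // escape, as in ESCAPE_MOSES
      break;
    case ',':
    case '.':
      pre_break_p = post_break_p = true;  // isolate punctuation
      break;
    default:
      break;
    }
    if (pre_break_p && !out.empty() && out[out.size()-1] != ' ')
      out += ' ';
    if (substitute_p)
      out += substitute_p;
    else
      out += c;
    if (post_break_p)
      out += ' ';
  }
  return out;
}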


@ -26,7 +26,7 @@ class Tokenizer {
private:
typedef enum {
empty = 0,
blank,
upper, // upper case
@ -56,7 +56,7 @@ private:
// non-breaking prefixes (other) ucs4
std::set<std::wstring> nbpre_gen_ucs4;
// compiled protected patterns
std::vector<re2::RE2 *> prot_pat_vec;
protected:
@ -96,10 +96,10 @@ protected:
Tokenizer *tokenizer;
std::vector<std::string>& in;
std::vector<std::string>& out;
VectorTokenizerCallable(Tokenizer *_tokenizer,
std::vector<std::string>& _in,
std::vector<std::string>& _out)
: tokenizer(_tokenizer)
, in(_in)
, out(_out) {
@ -107,10 +107,10 @@ protected:
void operator()() {
out.resize(in.size());
for (std::size_t ii = 0; ii < in.size(); ++ii)
if (in[ii].empty())
out[ii] = in[ii];
else if (tokenizer->penn_p)
out[ii] = tokenizer->penn_tokenize(in[ii]);
else
out[ii] = tokenizer->quik_tokenize(in[ii]);


@ -10,8 +10,8 @@ using namespace TOKENIZER_NAMESPACE ;
#endif
void
usage(const char *path)
{
std::cerr << "Usage: " << path << " [-{v|x|p|a|e|s|u|n|N}]* [LL] [-{c|o} PATH]* INFILE*" << std::endl;
std::cerr << " -a -- aggressive hyphenization" << std::endl;
@ -89,7 +89,7 @@ copy_words(Tokenizer& tize, std::istream& ifs, std::ostream& ofs) {
int nlines = 0;
std::string line;
while (ifs.good() && std::getline(ifs,line)) {
if (line.empty())
continue;
std::vector<std::string> tokens(tize.tokens(line));
int count = 0;
@ -127,7 +127,7 @@ copy_words(Tokenizer& tize, std::istream& ifs, std::ostream& ofs) {
}
int main(int ac, char **av)
{
int rc = 0;
Parameters params;
@ -140,7 +140,7 @@ int main(int ac, char **av)
if (!detokenize_p)
params.split_p = std::strstr(av[0],"splitter") != 0;
while (++av,--ac) {
if (**av == '-') {
switch (av[0][1]) {
case 'a':
@ -244,7 +244,7 @@ int main(int ac, char **av)
if (comma) {
*comma++ = 0;
params.chunksize = std::strtoul(comma,0,0);
}
params.nthreads = std::strtoul(*av,0,0);
} else {
params.args.push_back(std::string(*av));
@ -275,7 +275,7 @@ int main(int ac, char **av)
cfg_mos_str.append("/moses");
if (!::access(cfg_mos_str.c_str(),X_OK)) {
params.cfg_path = strdup(cfg_mos_str.c_str());
} else if (!::access(cfg_shr_str.c_str(),X_OK)) {
params.cfg_path = strdup(cfg_shr_str.c_str());
} else if (!::access(cfg_dir_str.c_str(),X_OK)) {
params.cfg_path = strdup(cfg_dir_str.c_str());
@ -287,7 +287,7 @@ int main(int ac, char **av)
if (params.verbose_p) {
std::cerr << "config path: " << params.cfg_path << std::endl;
}
}
std::unique_ptr<std::ofstream> pofs = 0;
if (!params.out_path.empty()) {
@ -345,7 +345,7 @@ int main(int ac, char **av)
if (plines.second) {
std::cerr << "%%% " << plines.second << " sentences." << std::endl;
}
}
return rc;
}


@ -1,236 +1,236 @@
/**
* ISS (Indexed Strings Storage) - memory efficient storage for permanent strings.
*
* Implementation note: use #define USE_HASHSET to switch between implementation
* using __gnu_cxx::hash_set and implementation using std::set.
*
* (C) Ceslav Przywara, UFAL MFF UK, 2011
*
* $Id$
*/
#ifndef _ISS_H
#define _ISS_H
#include <limits>
#include <vector>
#include <string.h>
// Use hashset instead of std::set for string-to-number indexing?
#ifdef USE_HASHSET
#include <ext/hash_set>
#else
#include <set>
#endif
#include <boost/pool/pool.hpp>
#ifdef USE_HASHSET
// Forward declaration of comparator functor.
template<class IndType>
class StringsEqualComparator;
template<class IndType>
class Hasher;
#else
// Forward declaration of comparator functor.
template<class IndType>
class StringsLessComparator;
#endif
/**
*/
template<class IndType>
class IndexedStringsStorage {
public:
typedef IndType index_type;
#ifdef USE_HASHSET
typedef StringsEqualComparator<IndType> equality_comparator_t;
typedef Hasher<IndType> hasher_t;
/** @typedef Hash set used as lookup table (string -> numeric index). */
typedef __gnu_cxx::hash_set<IndType, hasher_t, equality_comparator_t> index_t;
#else
typedef StringsLessComparator<IndType> less_comparator_t;
/** @typedef Set used as lookup table (string -> numeric index). */
typedef std::set<IndType, less_comparator_t> index_t;
#endif
/** @typedef Container of pointers to stored C-strings. Acts as
* conversion table: numeric index -> string.
*/
typedef std::vector<const char*> table_t;
private:
/** @var memory pool used to store C-strings */
boost::pool<> _storage;
/** @var index-to-string conversion table */
table_t _table;
/** @var index lookup table */
index_t _index;
public:
/** Default constructor.
*/
IndexedStringsStorage(void);
/** @return True, if the indices are exhausted (new strings cannot be stored).
*/
inline bool is_full(void) const { return _table.size() == std::numeric_limits<IndType>::max(); }
/** Retrieves pointer to C-string instance represented by given index.
* Note: No range checks are performed!
* @param index Index of C-string to retrieve.
* @return Pointer to stored C-string instance.
*/
inline const char* get(IndType index) const { return _table[index]; }
/** Stores the string and returns its numeric index.
* @param str Pointer to C-string to store.
* @return Index of stored copy of str.
* @throw std::bad_alloc When insertion of new string would cause
* overflow of indices datatype.
*/
IndType put(const char* str);
/** @return Number of unique strings stored so far.
*/
inline table_t::size_type size(void) const { return _table.size(); }
};
/** Functor designed for less than comparison of C-strings stored within StringStore.
* @param IndType Type of numerical indices of strings within given StringStore.
*/
#ifdef USE_HASHSET
template<class IndType>
class StringsEqualComparator: public std::binary_function<IndType, IndType, bool> {
#else
template<class IndType>
class StringsLessComparator: public std::binary_function<IndType, IndType, bool> {
#endif
/** @var conversion table: index -> string (necessary for indices comparison) */
const typename IndexedStringsStorage<IndType>::table_t& _table;
public:
#ifdef USE_HASHSET
StringsEqualComparator<IndType>(const typename IndexedStringsStorage<IndType>::table_t& table): _table(table) {}
#else
StringsLessComparator<IndType>(const typename IndexedStringsStorage<IndType>::table_t& table): _table(table) {}
#endif
/** Comparison of two pointers to C-strings.
* @param lhs Pointer to 1st C-string.
* @param rhs Pointer to 2nd C-string.
* @return True, if 1st argument is equal/less than 2nd argument.
*/
inline bool operator()(IndType lhs, IndType rhs) const {
#ifdef USE_HASHSET
return strcmp(_table[lhs], _table[rhs]) == 0;
#else
return strcmp(_table[lhs], _table[rhs]) < 0;
#endif
}
};
#ifdef USE_HASHSET
/** Functor... TODO.
*/
template<class IndType>
class Hasher: public std::unary_function<IndType, size_t> {
__gnu_cxx::hash<const char*> _hash;
/** @var conversion table: index -> string (necessary for indices comparison) */
const typename IndexedStringsStorage<IndType>::table_t& _table;
public:
/** */
Hasher<IndType>(const typename IndexedStringsStorage<IndType>::table_t& table): _hash(), _table(table) {}
/** Hashing function.
* @param index
* @return Counted hash.
*/
inline size_t operator()(const IndType index) const {
return _hash(_table[index]);
}
};
#endif
template <class IndType>
#ifdef USE_HASHSET
IndexedStringsStorage<IndType>::IndexedStringsStorage(void): _storage(sizeof(char)), _table(), _index(100, hasher_t(_table), equality_comparator_t(_table)) {}
#else
IndexedStringsStorage<IndType>::IndexedStringsStorage(void): _storage(sizeof(char)), _table(), _index(less_comparator_t(_table)) {}
#endif
template <class IndType>
IndType IndexedStringsStorage<IndType>::put(const char* str) {
if ( this->is_full() ) {
// What a pity, not a single index left to spend.
throw std::bad_alloc();
}
// To use the index for lookup we first have to store passed string
// in the conversion table (because during lookup we compare the strings indirectly
// by using their indices).
// Note: thread unsafe! TODO: Redesign.
IndType index = static_cast<IndType>(_table.size());
_table.push_back(str);
#ifdef USE_HASHSET
//
typename index_t::iterator iIndex = _index.find(index);
#else
// A lower_bound() search enables us to use "found" iterator as a hint for
// eventual insertion.
typename index_t::iterator iIndex = _index.lower_bound(index);
#endif
if ( (iIndex != _index.end())
#ifndef USE_HASHSET
// In case of lower_bound() search we have to also compare found item
// with passed string.
&& (strcmp(_table[*iIndex], str) == 0)
#endif
) {
// String is already present in storage!
// Pop back temporary stored pointer...
_table.pop_back();
// ...and return numeric index to already stored copy of `str`.
return static_cast<IndType>(*iIndex);
}
// String not found within storage.
// Allocate memory required for string storage...
char* mem = static_cast<char*>(_storage.ordered_malloc(strlen(str) + 1));
// ...and fill it with copy of passed string.
strcpy(mem, str);
// Overwrite temporary stored pointer to `str` with pointer to freshly
// saved copy.
_table[index] = mem;
#ifdef USE_HASHSET
// Insert the index into lookup table.
_index.insert(index);
#else
// Insert the index into lookup table (use previously retrieved iterator
// as a hint).
_index.insert(iIndex, index);
#endif
// Finally.
return index;
}
#endif
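A short usage sketch for the IndexedStringsStorage template declared above, using the std::set variant (USE_HASHSET not defined). The uint32_t index type is an arbitrary choice and the header filename in the comment is an assumption:

// Illustrative only: store strings, get stable numeric indices back.
#include <cstdint>
#include <iostream>
// #include "iss.h"   // the header shown above; exact filename is an assumption

int main()
{
  IndexedStringsStorage<uint32_t> store;
  uint32_t a  = store.put("alpha");
  uint32_t b  = store.put("beta");
  uint32_t a2 = store.put("alpha");        // duplicate: same index returned
  std::cout << (a == a2) << std::endl;     // prints 1
  std::cout << store.get(b) << std::endl;  // prints "beta"
  std::cout << store.size() << std::endl;  // prints 2 (unique strings only)
  return 0;
}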


@ -83,7 +83,7 @@ public:
const counter_t bucketWidth; // ceil(1/error)
private:
/** @var Current epoch bucket ID (b-current) */
counter_t _bucketId;
@ -182,7 +182,7 @@ class LossyCounterIterator: public std::iterator<std::forward_iterator_tag, type
public:
typedef LossyCounterIterator<T> self_type;
typedef typename LossyCounter<T>::storage_t::const_iterator const_iterator;
protected:
@ -288,7 +288,7 @@ protected:
template<class T>
void LossyCounter<T>::add(const T& item) {
typename storage_t::iterator iter = _storage.find(item);
if ( iter == _storage.end() ) {
@ -330,7 +330,7 @@ void LossyCounter<T>::prune(void) {
////////////////////////////////////////////////////////////////////////////////
template<class T>
LossyCounterIterator<T> LossyCounterIterator<T>::operator++(void) {
this->forward();
return *this;
}
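For background, LossyCounter above implements lossy counting (Manku and Motwani): counts are approximate, the undercount is bounded by a chosen error rate, and bucketWidth = ceil(1/error) as the field comment above notes. A generic sketch of that algorithm follows; it is not the repository's LossyCounter<T> API, and every name in it is made up for illustration:

// Generic lossy-counting sketch: keep (count, allowed undercount) per item,
// and prune low-count items at the end of each bucket of bucketWidth items.
#include <cmath>
#include <cstddef>
#include <map>
#include <string>

struct ToyLossyCounter {
  double epsilon;
  std::size_t bucketWidth;                 // ceil(1/epsilon)
  std::size_t n;                           // items seen so far
  std::size_t bucketId;                    // current epoch bucket
  // item -> (observed count, maximum possible undercount at insertion)
  std::map<std::string, std::pair<std::size_t, std::size_t> > counts;

  explicit ToyLossyCounter(double eps)
    : epsilon(eps),
      bucketWidth(static_cast<std::size_t>(std::ceil(1.0 / eps))),
      n(0), bucketId(1) {}

  void add(const std::string& item) {
    std::map<std::string, std::pair<std::size_t, std::size_t> >::iterator
      it = counts.find(item);
    if (it != counts.end())
      it->second.first += 1;
    else
      counts[item] = std::make_pair(std::size_t(1), bucketId - 1);
    if (++n % bucketWidth == 0) {          // end of bucket: prune
      for (it = counts.begin(); it != counts.end(); ) {
        if (it->second.first + it->second.second <= bucketId)
          counts.erase(it++);
        else
          ++it;
      }
      ++bucketId;
    }
  }
};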


@ -92,7 +92,7 @@ int main(int argc, char* argv[]) {
// Init lossy counters.
std::string lossyCountersParams;
int paramIdx = 5;
while ( (argc > paramIdx) && (*argv[paramIdx] != '-') ) {
std::string param = std::string(argv[paramIdx]);
if ( !parse_lossy_counting_params(param) ) {
@ -113,7 +113,7 @@ int main(int argc, char* argv[]) {
usage(argv[0]);
}
}
if ( (argc > paramIdx) && (strcmp(argv[paramIdx], "--compact") == 0) ) {
compactOutputFlag = true;
++paramIdx;
@ -154,7 +154,7 @@ int main(int argc, char* argv[]) {
readInput(eFile, fFile, aFile);
std::cerr << std::endl; // Leave the progress bar end on previous line.
// close input files
eFile.close();
fFile.close();


@ -32,14 +32,14 @@ typedef std::vector<output_pair_t> output_vector_t;
class PhraseComp {
/** @var If true, sort by target phrase first. */
bool _inverted;
bool compareAlignments(const indexed_phrases_pair_t& a, const indexed_phrases_pair_t& b);
int comparePhrases(const indexed_phrases_pair_t::phrase_t& a, const indexed_phrases_pair_t::phrase_t& b);
public:
PhraseComp(bool inverted): _inverted(inverted) {}
bool operator()(const output_pair_t& a, const output_pair_t& b);
};
@ -448,9 +448,9 @@ void extract(SentenceAlignment &sentence) {
((phraseModel)? getOrientString(phrasePrevOrient, phraseType) + " " + getOrientString(phraseNextOrient, phraseType) : "") + " | " +
((hierModel)? getOrientString(hierPrevOrient, hierType) + " " + getOrientString(hierNextOrient, hierType) : "");
}
addPhrase(sentence, startE, endE, startF, endF, orientationInfo);
} // end of for loop through inbound phrases
} // end if buildExtraStructure
@ -567,7 +567,7 @@ bool PhraseComp::operator()(const output_pair_t& a, const output_pair_t& b) {
else {
return cmp < 0;
}
}
@ -607,7 +607,7 @@ bool PhraseComp::compareAlignments(const indexed_phrases_pair_t& a, const indexe
return cmp < 0;
}
}
// Note: LC_ALL=C GNU sort treats shorter item as lesser than longer one.
return (cmp == 0) ? (aSize < bSize) : (cmp < 0);
@ -685,7 +685,7 @@ void processSortedOutput(OutputProcessor& processor) {
void processUnsortedOutput(OutputProcessor& processor) {
LossyCountersVector::value_type current = NULL, prev = NULL;
for ( size_t i = 1; i < lossyCounters.size(); ++i ) { // Intentionally skip 0.
@ -759,7 +759,7 @@ void printStats(void) {
if ( (current == NULL) || ((current != prev) && (prev != NULL)) ) {
// Time to print.
to = i-1;
// Increment overall stats.
outputMass += prev->outputMass;
outputSize += prev->outputSize;
@ -787,7 +787,7 @@ void printStats(void) {
from = i;
}
prev = current;
}


@ -10,15 +10,15 @@ int main(int argc, char* argv[])
using namespace boost::locale;
using namespace std;
generator gen;
locale loc=gen("");
cout.imbue(loc);
cout << "Hello, World" << endl;
cout << "This is how we show currency in this locale " << as::currency << 103.34 << endl;
return 0;
}

File diff suppressed because it is too large.


@ -1,231 +1,231 @@
// XGetopt.cpp Version 1.2
//
// Author: Hans Dietrich
// hdietrich2@hotmail.com
//
// Description:
// XGetopt.cpp implements getopt(), a function to parse command lines.
//
// History
// Version 1.2 - 2003 May 17
// - Added Unicode support
//
// Version 1.1 - 2002 March 10
// - Added example to XGetopt.cpp module header
//
// This software is released into the public domain.
// You are free to use it in any way you like.
//
// This software is provided "as is" with no expressed
// or implied warranty. I accept no liability for any
// damage or loss of business that this software may cause.
//
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
// if you are using precompiled headers then include this line:
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
// if you are not using precompiled headers then include these lines:
//#include <windows.h>
//#include <cstdio>
//#include <tchar.h>
///////////////////////////////////////////////////////////////////////////////
#include <cstdio>
#include <cstring>
#include <cmath>
#include "WIN32_functions.h"
///////////////////////////////////////////////////////////////////////////////
//
// X G e t o p t . c p p
//
//
// NAME
// getopt -- parse command line options
//
// SYNOPSIS
// int getopt(int argc, char *argv[], char *optstring)
//
// extern char *optarg;
// extern int optind;
//
// DESCRIPTION
// The getopt() function parses the command line arguments. Its
// arguments argc and argv are the argument count and array as
// passed into the application on program invocation. In the case
// of Visual C++ programs, argc and argv are available via the
// variables __argc and __argv (double underscores), respectively.
// getopt returns the next option letter in argv that matches a
// letter in optstring. (Note: Unicode programs should use
// __targv instead of __argv. Also, all character and string
// literals should be enclosed in ( ) ).
//
// optstring is a string of recognized option letters; if a letter
// is followed by a colon, the option is expected to have an argument
// that may or may not be separated from it by white space. optarg
// is set to point to the start of the option argument on return from
// getopt.
//
// Option letters may be combined, e.g., "-ab" is equivalent to
// "-a -b". Option letters are case sensitive.
//
// getopt places in the external variable optind the argv index
// of the next argument to be processed. optind is initialized
// to 0 before the first call to getopt.
//
// When all options have been processed (i.e., up to the first
// non-option argument), getopt returns EOF, optarg will point
// to the argument, and optind will be set to the argv index of
// the argument. If there are no non-option arguments, optarg
// will be set to NULL.
//
// The special option "--" may be used to delimit the end of the
// options; EOF will be returned, and "--" (and everything after it)
// will be skipped.
//
// RETURN VALUE
// For option letters contained in the string optstring, getopt
// will return the option letter. getopt returns a question mark (?)
// when it encounters an option letter not included in optstring.
// EOF is returned when processing is finished.
//
// BUGS
// 1) Long options are not supported.
// 2) The GNU double-colon extension is not supported.
// 3) The environment variable POSIXLY_CORRECT is not supported.
// 4) The + syntax is not supported.
// 5) The automatic permutation of arguments is not supported.
// 6) This implementation of getopt() returns EOF if an error is
// encountered, instead of -1 as the latest standard requires.
//
// EXAMPLE
// BOOL CMyApp::ProcessCommandLine(int argc, char *argv[])
// {
// int c;
//
// while ((c = getopt(argc, argv, ("aBn:"))) != EOF)
// {
// switch (c)
// {
// case ('a'):
// TRACE(("option a\n"));
// //
// // set some flag here
// //
// break;
//
// case ('B'):
// TRACE( ("option B\n"));
// //
// // set some other flag here
// //
// break;
//
// case ('n'):
// TRACE(("option n: value=%d\n"), atoi(optarg));
// //
// // do something with value here
// //
// break;
//
// case ('?'):
// TRACE(("ERROR: illegal option %s\n"), argv[optind-1]);
// return FALSE;
// break;
//
// default:
// TRACE(("WARNING: no handler for option %c\n"), c);
// return FALSE;
// break;
// }
// }
// //
// // check for non-option args here
// //
// return TRUE;
// }
//
///////////////////////////////////////////////////////////////////////////////
char *optarg; // global argument pointer
int optind = 0; // global argv index
int getopt(int argc, char *argv[], char *optstring)
{
static char *next = NULL;
if (optind == 0)
next = NULL;
optarg = NULL;
if (next == NULL || *next =='\0') {
if (optind == 0)
optind++;
if (optind >= argc || argv[optind][0] != ('-') || argv[optind][1] == ('\0')) {
optarg = NULL;
if (optind < argc)
optarg = argv[optind];
return EOF;
}
if (strcmp(argv[optind], "--") == 0) {
optind++;
optarg = NULL;
if (optind < argc)
optarg = argv[optind];
return EOF;
}
next = argv[optind];
next++; // skip past -
optind++;
}
char c = *next++;
char *cp = strchr(optstring, c);
if (cp == NULL || c == (':'))
return ('?');
cp++;
if (*cp == (':')) {
if (*next != ('\0')) {
optarg = next;
next = NULL;
} else if (optind < argc) {
optarg = argv[optind];
optind++;
} else {
return ('?');
}
}
return c;
}
// for an overview, see
// W. Press, S. Teukolsky and W. Vetterling. (1992) Numerical Recipes in C. Chapter 6.1.
double lgamma(int x)
{
// size_t xx=(size_t)x; xx--; size_t sum=1; while (xx) { sum *= xx--; } return log((double)(sum));
if (x <= 2) {
return 0.0;
}
static double coefs[6] = {76.18009172947146, -86.50532032941677, 24.01409824083091, -1.231739572450155, 0.1208650973866179e-2, -0.5395239384953e-5};
double tmp=(double)x+5.5;
tmp -= (((double)x)+0.5)*log(tmp);
double y=(double)x;
double sum = 1.000000000190015;
for (size_t j=0; j<6; ++j) {
sum += coefs[j]/++y;
}
return -tmp+log(2.5066282746310005*sum/(double)x);
}
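As a quick sanity check of the lgamma(int) defined above: for integer x, Gamma(x) = (x-1)!, so lgamma(6) should come out very close to log(120), about 4.7875. The check below is illustrative only and assumes WIN32_functions.h declares lgamma(int):

// Illustrative check, not part of the commit.
#include <cmath>
#include <cstdio>
#include "WIN32_functions.h"   // assumed to declare double lgamma(int)

int check_lgamma()
{
  double got  = lgamma(6);          // log(Gamma(6)) = log(5!) = log(120)
  double want = std::log(120.0);
  std::printf("lgamma(6) = %.6f, log(120) = %.6f\n", got, want);
  return std::fabs(got - want) < 1e-6 ? 0 : 1;
}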
// XGetopt.cpp Version 1.2
//
// Author: Hans Dietrich
// hdietrich2@hotmail.com
//
// Description:
// XGetopt.cpp implements getopt(), a function to parse command lines.
//
// History
// Version 1.2 - 2003 May 17
// - Added Unicode support
//
// Version 1.1 - 2002 March 10
// - Added example to XGetopt.cpp module header
//
// This software is released into the public domain.
// You are free to use it in any way you like.
//
// This software is provided "as is" with no expressed
// or implied warranty. I accept no liability for any
// damage or loss of business that this software may cause.
//
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
// if you are using precompiled headers then include this line:
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
// if you are not using precompiled headers then include these lines:
//#include <windows.h>
//#include <cstdio>
//#include <tchar.h>
///////////////////////////////////////////////////////////////////////////////
#include <cstdio>
#include <cstring>
#include <cmath>
#include "WIN32_functions.h"
///////////////////////////////////////////////////////////////////////////////
//
// X G e t o p t . c p p
//
//
// NAME
// getopt -- parse command line options
//
// SYNOPSIS
// int getopt(int argc, char *argv[], char *optstring)
//
// extern char *optarg;
// extern int optind;
//
// DESCRIPTION
// The getopt() function parses the command line arguments. Its
// arguments argc and argv are the argument count and array as
// passed into the application on program invocation. In the case
// of Visual C++ programs, argc and argv are available via the
// variables __argc and __argv (double underscores), respectively.
// getopt returns the next option letter in argv that matches a
// letter in optstring. (Note: Unicode programs should use
// __targv instead of __argv. Also, all character and string
// literals should be enclosed in ( ) ).
//
// optstring is a string of recognized option letters; if a letter
// is followed by a colon, the option is expected to have an argument
// that may or may not be separated from it by white space. optarg
// is set to point to the start of the option argument on return from
// getopt.
//
// Option letters may be combined, e.g., "-ab" is equivalent to
// "-a -b". Option letters are case sensitive.
//
// getopt places in the external variable optind the argv index
// of the next argument to be processed. optind is initialized
// to 0 before the first call to getopt.
//
// When all options have been processed (i.e., up to the first
// non-option argument), getopt returns EOF, optarg will point
// to the argument, and optind will be set to the argv index of
// the argument. If there are no non-option arguments, optarg
// will be set to NULL.
//
// The special option "--" may be used to delimit the end of the
// options; EOF will be returned, and "--" (and everything after it)
// will be skipped.
//
// RETURN VALUE
// For option letters contained in the string optstring, getopt
// will return the option letter. getopt returns a question mark (?)
// when it encounters an option letter not included in optstring.
// EOF is returned when processing is finished.
//
// BUGS
// 1) Long options are not supported.
// 2) The GNU double-colon extension is not supported.
// 3) The environment variable POSIXLY_CORRECT is not supported.
// 4) The + syntax is not supported.
// 5) The automatic permutation of arguments is not supported.
// 6) This implementation of getopt() returns EOF if an error is
// encountered, instead of -1 as the latest standard requires.
//
// EXAMPLE
// BOOL CMyApp::ProcessCommandLine(int argc, char *argv[])
// {
// int c;
//
// while ((c = getopt(argc, argv, ("aBn:"))) != EOF)
// {
// switch (c)
// {
// case ('a'):
// TRACE(("option a\n"));
// //
// // set some flag here
// //
// break;
//
// case ('B'):
// TRACE( ("option B\n"));
// //
// // set some other flag here
// //
// break;
//
// case ('n'):
// TRACE(("option n: value=%d\n"), atoi(optarg));
// //
// // do something with value here
// //
// break;
//
// case ('?'):
// TRACE(("ERROR: illegal option %s\n"), argv[optind-1]);
// return FALSE;
// break;
//
// default:
// TRACE(("WARNING: no handler for option %c\n"), c);
// return FALSE;
// break;
// }
// }
// //
// // check for non-option args here
// //
// return TRUE;
// }
//
///////////////////////////////////////////////////////////////////////////////
char *optarg; // global argument pointer
int optind = 0; // global argv index
int getopt(int argc, char *argv[], char *optstring)
{
static char *next = NULL;
if (optind == 0)
next = NULL;
optarg = NULL;
if (next == NULL || *next =='\0') {
if (optind == 0)
optind++;
if (optind >= argc || argv[optind][0] != ('-') || argv[optind][1] == ('\0')) {
optarg = NULL;
if (optind < argc)
optarg = argv[optind];
return EOF;
}
if (strcmp(argv[optind], "--") == 0) {
optind++;
optarg = NULL;
if (optind < argc)
optarg = argv[optind];
return EOF;
}
next = argv[optind];
next++; // skip past -
optind++;
}
char c = *next++;
char *cp = strchr(optstring, c);
if (cp == NULL || c == (':'))
return ('?');
cp++;
if (*cp == (':')) {
if (*next != ('\0')) {
optarg = next;
next = NULL;
} else if (optind < argc) {
optarg = argv[optind];
optind++;
} else {
return ('?');
}
}
return c;
}
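The EXAMPLE in the header comment above is written against MFC (BOOL, TRACE). Against this implementation alone, a plain standard C++ driver would look roughly like the sketch below; the option letters mirror the "aBn:" example above, and the include path and linking against WIN32_functions.cpp are assumptions rather than anything this commit prescribes.

// Sketch only: drive the getopt() defined above from a plain main().
// Assumes this file is compiled together with WIN32_functions.cpp and
// that WIN32_functions.h is on the include path.
#include <cstdio>
#include <cstdlib>
#include "WIN32_functions.h"

int main(int argc, char* argv[])
{
  char optstring[] = "aBn:";   // same letters as the EXAMPLE above
  int c;
  while ((c = getopt(argc, argv, optstring)) != EOF) {
    switch (c) {
    case 'a':
      std::printf("option a\n");
      break;
    case 'B':
      std::printf("option B\n");
      break;
    case 'n':
      std::printf("option n: value=%d\n", std::atoi(optarg));
      break;
    default:   // '?' -- letter not in optstring, or missing argument
      std::fprintf(stderr, "illegal option near argv[%d]\n", optind - 1);
      return 1;
    }
  }
  // After EOF, optind indexes the first non-option argument (if any).
  for (int i = optind; i < argc; ++i)
    std::printf("non-option arg: %s\n", argv[i]);
  return 0;
}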
// for an overview, see
// W. Press, S. Teukolsky and W. Vetterling. (1992) Numerical Recipes in C. Chapter 6.1.
double lgamma(int x)
{
// size_t xx=(size_t)x; xx--; size_t sum=1; while (xx) { sum *= xx--; } return log((double)(sum));
if (x <= 2) {
return 0.0;
}
static double coefs[6] = {76.18009172947146, -86.50532032941677, 24.01409824083091, -1.231739572450155, 0.1208650973866179e-2, -0.5395239384953e-5};
double tmp=(double)x+5.5;
tmp -= (((double)x)+0.5)*log(tmp);
double y=(double)x;
double sum = 1.000000000190015;
for (size_t j=0; j<6; ++j) {
sum += coefs[j]/++y;
}
return -tmp+log(2.5066282746310005*sum/(double)x);
}
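For reference, the lgamma() above (and its duplicates later in this commit) is the Lanczos approximation from the cited Numerical Recipes chapter, specialised to integer arguments and returning 0 for x <= 2, consistent with Gamma(1) = Gamma(2) = 1. Writing c_0 = 1.000000000190015 and c_1 ... c_6 for the entries of coefs, the returned value is

% Lanczos approximation (g = 5, six-term series) as implemented above;
% the constant 2.5066282746310005 is sqrt(2*pi).
\ln\Gamma(x) \approx \left(x + \tfrac{1}{2}\right)\ln(x + 5.5) - (x + 5.5)
              + \ln\!\left( \frac{\sqrt{2\pi}}{x}\left[ c_0 + \sum_{j=1}^{6} \frac{c_j}{x + j} \right] \right)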

View File

@ -1,24 +1,24 @@
// XGetopt.h Version 1.2
//
// Author: Hans Dietrich
// hdietrich2@hotmail.com
//
// This software is released into the public domain.
// You are free to use it in any way you like.
//
// This software is provided "as is" with no expressed
// or implied warranty. I accept no liability for any
// damage or loss of business that this software may cause.
//
///////////////////////////////////////////////////////////////////////////////
#ifndef XGETOPT_H
#define XGETOPT_H
extern int optind, opterr;
extern char *optarg;
int getopt(int argc, char *argv[], char *optstring);
double lgamma(int x);
#endif //XGETOPT_H
// XGetopt.h Version 1.2
//
// Author: Hans Dietrich
// hdietrich2@hotmail.com
//
// This software is released into the public domain.
// You are free to use it in any way you like.
//
// This software is provided "as is" with no expressed
// or implied warranty. I accept no liability for any
// damage or loss of business that this software may cause.
//
///////////////////////////////////////////////////////////////////////////////
#ifndef XGETOPT_H
#define XGETOPT_H
extern int optind, opterr;
extern char *optarg;
int getopt(int argc, char *argv[], char *optstring);
double lgamma(int x);
#endif //XGETOPT_H

View File

@ -1,5 +1,5 @@
#include <cstring>
#include <cstring>
#include <cassert>
#include <cstdio>
#include <cstdlib>

View File

@ -234,13 +234,13 @@ void OutputAlignment(ostream &out, const AlignmentInfo &ai, size_t sourceOffset,
{
typedef std::vector< const std::pair<size_t,size_t>* > AlignVec;
AlignVec alignments = ai.GetSortedAlignments();
AlignVec::const_iterator it;
for (it = alignments.begin(); it != alignments.end(); ++it) {
const std::pair<size_t,size_t> &alignment = **it;
out << alignment.first + sourceOffset << "-" << alignment.second + targetOffset << " ";
}
}
void OutputAlignment(ostream &out, const vector<const Hypothesis *> &edges)
@ -251,7 +251,7 @@ void OutputAlignment(ostream &out, const vector<const Hypothesis *> &edges)
const Hypothesis &edge = *edges[currEdge];
const TargetPhrase &tp = edge.GetCurrTargetPhrase();
size_t sourceOffset = edge.GetCurrSourceWordsRange().GetStartPos();
OutputAlignment(out, tp.GetAlignmentInfo(), sourceOffset, targetOffset);
targetOffset += tp.GetSize();
@ -263,7 +263,7 @@ void OutputAlignment(OutputCollector* collector, size_t lineNo , const vector<co
{
ostringstream out;
OutputAlignment(out, edges);
collector->Write(lineNo,out.str());
}
@ -477,7 +477,7 @@ void OutputNBest(std::ostream& out, const Moses::TrellisPathList &nBestList, con
const int sourceOffset = sourceRange.GetStartPos();
const int targetOffset = targetRange.GetStartPos();
const AlignmentInfo &ai = edge.GetCurrTargetPhrase().GetAlignmentInfo();
OutputAlignment(out, ai, sourceOffset, targetOffset);
}

View File

@ -168,18 +168,18 @@ static void ShowWeights()
int main(int argc, char** argv)
{
try {
// echo command line, if verbose
IFVERBOSE(1) {
TRACE_ERR("command: ");
for(int i=0; i<argc; ++i) TRACE_ERR(argv[i]<<" ");
TRACE_ERR(endl);
}
// set number of significant decimals in output
fix(cout,PRECISION);
fix(cerr,PRECISION);
// load all the settings into the Parameter class
// (stores them as strings, or array of strings)
Parameter* params = new Parameter();
@ -187,34 +187,34 @@ int main(int argc, char** argv)
params->Explain();
exit(1);
}
// initialize all "global" variables, which are stored in StaticData
// note: this also loads models such as the language model, etc.
if (!StaticData::LoadDataStatic(params, argv[0])) {
exit(1);
}
// setting "-show-weights" -> just dump out weights and exit
if (params->isParamSpecified("show-weights")) {
ShowWeights();
exit(0);
}
// shorthand for accessing information in StaticData
const StaticData& staticData = StaticData::Instance();
//initialise random numbers
rand_init();
// set up read/writing class
IOWrapper* ioWrapper = GetIOWrapper(staticData);
if (!ioWrapper) {
cerr << "Error; Failed to create IO object" << endl;
exit(1);
}
// check on weights
vector<float> weights = staticData.GetAllWeights();
IFVERBOSE(2) {
@ -233,7 +233,7 @@ int main(int argc, char** argv)
// setting lexicalized reordering setup
PhraseBasedReorderingState::m_useFirstBackwardScore = false;
auto_ptr<OutputCollector> outputCollector;
outputCollector.reset(new OutputCollector());
@ -241,7 +241,7 @@ int main(int argc, char** argv)
#ifdef WITH_THREADS
ThreadPool pool(staticData.ThreadCount());
#endif
// main loop over set of input sentences
InputType* source = NULL;
size_t lineCount = 0;
@ -259,11 +259,11 @@ int main(int argc, char** argv)
task->Run();
delete task;
#endif
source = NULL; //make sure it doesn't get deleted
++lineCount;
}
// we are done, finishing up
#ifdef WITH_THREADS
pool.Stop(true); //flush remaining jobs
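A note on the fix(cout, PRECISION) calls in the first hunk above: the helper itself is not part of this diff, but judging from the ShowWeights() code further down in this commit (setf(std::ios::fixed) plus precision(6)), it amounts to pinning a stream to fixed-point output at a given precision. A stand-alone sketch of that idea, with the name fix and the value of PRECISION assumed rather than taken from the Moses sources:

// Sketch only: fix an ostream to fixed-point notation with a set precision.
#include <iostream>

namespace {
const int PRECISION = 3;   // assumed value, for illustration only

void fix(std::ostream& stream, int size)
{
  stream.setf(std::ios::fixed);
  stream.precision(size);
}
} // namespace

int main()
{
  fix(std::cout, PRECISION);
  std::cout << 1.0 / 3.0 << std::endl;   // prints 0.333
  return 0;
}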

View File

@ -70,7 +70,7 @@ namespace MosesCmd
if (neg_log_div > 100){
return 100;
}
return neg_log_div;
return neg_log_div;
}
void RelativeEntropyCalc::ConcatOutputPhraseRecursive(Phrase& phrase, const Hypothesis *hypo){

View File

@ -57,7 +57,7 @@ void PrintTranslationAnalysis(const TranslationSystem* system, std::ostream &os,
}
}
}
bool epsilon = false;
if (target == "") {
target="<EPSILON>";

View File

@ -60,12 +60,12 @@ static void add(const string& e, const vector<float> scores,
static void finalise(Probs& p_e_given_f, Probs& p_f_given_e) {
//cerr << "Sizes: p(e|f): " << p_e_given_f.size() << " p(f|e): " << p_f_given_e.size() << endl;
for (Probs::const_iterator e1_iter = p_f_given_e.begin() ;
for (Probs::const_iterator e1_iter = p_f_given_e.begin() ;
e1_iter != p_f_given_e.end(); ++e1_iter) {
for (Probs::const_iterator e2_iter = p_e_given_f.begin() ;
e2_iter != p_e_given_f.end(); ++e2_iter) {
if (e1_iter->second == e2_iter->second) continue;
if (e1_iter->second == e2_iter->second) continue;
cout << e1_iter->second << " ||| " << e2_iter->second << " ||| " <<
e1_iter->first * e2_iter->first << " ||| " << endl;
}

View File

@ -3,10 +3,10 @@
// The separate moses server executable is being phased out.
// Since there were problems with the migration into the main
// executable, this separate program is still included in the
// distribution for legacy reasons. Contributors are encouraged
// to add their contributions to moses/server rather than
// distribution for legacy reasons. Contributors are encouraged
// to add their contributions to moses/server rather than
// contrib/server. This recommendation does not apply to wrapper
// scripts.
// scripts.
// The future is this:
/** main function of the command line version of the decoder **/
@ -83,7 +83,7 @@ public:
pdsa->add(source_,target_,alignment_);
#else
const PhraseDictionary* pdf = PhraseDictionary::GetColl()[0];
PhraseDictionaryDynSuffixArray*
PhraseDictionaryDynSuffixArray*
pdsa = (PhraseDictionaryDynSuffixArray*) pdf;
cerr << "Inserting into address " << pdsa << endl;
pdsa->insertSnt(source_, target_, alignment_);
@ -146,7 +146,7 @@ public:
}
}
*/
void breakOutParams(const params_t& params) {
params_t::const_iterator si = params.find("source");
if(si == params.end())
@ -236,7 +236,7 @@ public:
class TranslationTask : public virtual Moses::TranslationTask {
protected:
TranslationTask(xmlrpc_c::paramList const& paramList,
boost::condition_variable& cond, boost::mutex& mut)
boost::condition_variable& cond, boost::mutex& mut)
: m_paramList(paramList),
m_cond(cond),
m_mut(mut),
@ -244,7 +244,7 @@ protected:
{}
public:
static boost::shared_ptr<TranslationTask>
static boost::shared_ptr<TranslationTask>
create(xmlrpc_c::paramList const& paramList,
boost::condition_variable& cond, boost::mutex& mut)
{
@ -252,15 +252,15 @@ public:
ret->m_self = ret;
return ret;
}
virtual bool DeleteAfterExecution() {return false;}
bool IsDone() const {return m_done;}
const map<string, xmlrpc_c::value>& GetRetData() { return m_retData;}
virtual void
Run()
virtual void
Run()
{
using namespace xmlrpc_c;
const params_t params = m_paramList.getStruct(0);
@ -292,25 +292,25 @@ public:
vector<float> multiModelWeights;
si = params.find("lambda");
if (si != params.end())
if (si != params.end())
{
value_array multiModelArray = value_array(si->second);
vector<value> multiModelValueVector(multiModelArray.vectorValueValue());
for (size_t i=0;i < multiModelValueVector.size();i++)
for (size_t i=0;i < multiModelValueVector.size();i++)
{
multiModelWeights.push_back(value_double(multiModelValueVector[i]));
}
}
si = params.find("model_name");
if (si != params.end() && multiModelWeights.size() > 0)
if (si != params.end() && multiModelWeights.size() > 0)
{
const string model_name = value_string(si->second);
PhraseDictionaryMultiModel* pdmm
PhraseDictionaryMultiModel* pdmm
= (PhraseDictionaryMultiModel*) FindPhraseDictionary(model_name);
pdmm->SetTemporaryMultiModelWeightsVector(multiModelWeights);
}
const StaticData &staticData = StaticData::Instance();
//Make sure alternative paths are retained, if necessary
@ -321,7 +321,7 @@ public:
stringstream out, graphInfo, transCollOpts;
if (staticData.IsSyntax())
if (staticData.IsSyntax())
{
boost::shared_ptr<TreeInput> tinput(new TreeInput);
const vector<FactorType>& IFO = staticData.GetInputFactorOrder();
@ -338,8 +338,8 @@ public:
manager.OutputSearchGraphMoses(sgstream);
m_retData["sg"] = value_string(sgstream.str());
}
}
else
}
else
{
// size_t lineNumber = 0; // TODO: Include sentence request number here?
boost::shared_ptr<Sentence> sentence(new Sentence(0,source));
@ -351,30 +351,30 @@ public:
vector<xmlrpc_c::value> alignInfo;
outputHypo(out,hypo,addAlignInfo,alignInfo,reportAllFactors);
if (addAlignInfo) m_retData["align"] = value_array(alignInfo);
if (addWordAlignInfo)
if (addWordAlignInfo)
{
stringstream wordAlignment;
hypo->OutputAlignment(wordAlignment);
vector<xmlrpc_c::value> alignments;
string alignmentPair;
while (wordAlignment >> alignmentPair)
while (wordAlignment >> alignmentPair)
{
int pos = alignmentPair.find('-');
map<string, xmlrpc_c::value> wordAlignInfo;
wordAlignInfo["source-word"]
wordAlignInfo["source-word"]
= value_int(atoi(alignmentPair.substr(0, pos).c_str()));
wordAlignInfo["target-word"]
wordAlignInfo["target-word"]
= value_int(atoi(alignmentPair.substr(pos + 1).c_str()));
alignments.push_back(value_struct(wordAlignInfo));
}
m_retData["word-align"] = value_array(alignments);
}
if (addGraphInfo) insertGraphInfo(manager,m_retData);
if (addTopts) insertTranslationOptions(manager,m_retData);
if (nbest_size > 0)
if (nbest_size > 0)
{
outputNBest(manager, m_retData, nbest_size, nbest_distinct,
outputNBest(manager, m_retData, nbest_size, nbest_distinct,
reportAllFactors, addAlignInfo, addScoreBreakdown);
}
(const_cast<StaticData&>(staticData)).SetOutputSearchGraph(false);
@ -389,11 +389,11 @@ public:
}
void outputHypo(ostream& out, const Hypothesis* hypo,
bool addAlignmentInfo, vector<xmlrpc_c::value>& alignInfo,
void outputHypo(ostream& out, const Hypothesis* hypo,
bool addAlignmentInfo, vector<xmlrpc_c::value>& alignInfo,
bool reportAllFactors = false) {
if (hypo->GetPrevHypo() != NULL) {
outputHypo(out,hypo->GetPrevHypo(),addAlignmentInfo,
outputHypo(out,hypo->GetPrevHypo(),addAlignmentInfo,
alignInfo, reportAllFactors);
Phrase p = hypo->GetCurrTargetPhrase();
if(reportAllFactors) {
@ -547,14 +547,14 @@ public:
retData.insert(pair<string, xmlrpc_c::value>("nbest", xmlrpc_c::value_array(nBestXml)));
}
void
insertTranslationOptions(Manager& manager, map<string, xmlrpc_c::value>& retData)
void
insertTranslationOptions(Manager& manager, map<string, xmlrpc_c::value>& retData)
{
const TranslationOptionCollection* toptsColl = manager.getSntTranslationOptions();
vector<xmlrpc_c::value> toptsXml;
size_t const stop = toptsColl->GetSource().GetSize();
TranslationOptionList const* tol;
for (size_t s = 0 ; s < stop ; ++s)
for (size_t s = 0 ; s < stop ; ++s)
{
for (size_t e = s; (tol = toptsColl->GetTranslationOptionList(s,e)) != NULL; ++e)
{
@ -569,11 +569,11 @@ public:
toptXml["start"] = xmlrpc_c::value_int(s);
toptXml["end"] = xmlrpc_c::value_int(e);
vector<xmlrpc_c::value> scoresXml;
const std::valarray<FValue> &scores
const std::valarray<FValue> &scores
= topt->GetScoreBreakdown().getCoreFeatures();
for (size_t j = 0; j < scores.size(); ++j)
for (size_t j = 0; j < scores.size(); ++j)
scoresXml.push_back(xmlrpc_c::value_double(scores[j]));
toptXml["scores"] = xmlrpc_c::value_array(scoresXml);
toptsXml.push_back(xmlrpc_c::value_struct(toptXml));
}
@ -581,7 +581,7 @@ public:
}
retData.insert(pair<string, xmlrpc_c::value>("topt", xmlrpc_c::value_array(toptsXml)));
}
private:
xmlrpc_c::paramList const& m_paramList;
map<string, xmlrpc_c::value> m_retData;
@ -619,8 +619,8 @@ private:
Moses::ThreadPool m_threadPool;
};
static
void
static
void
PrintFeatureWeight(ostream& out, const FeatureFunction* ff)
{
out << ff->GetScoreProducerDescription() << "=";
@ -632,16 +632,16 @@ PrintFeatureWeight(ostream& out, const FeatureFunction* ff)
out << endl;
}
static
void
static
void
ShowWeights(ostream& out)
{
// adapted from moses-cmd/Main.cpp
std::ios::fmtflags old_flags = out.setf(std::ios::fixed);
size_t old_precision = out.precision(6);
const vector<const StatelessFeatureFunction*>&
const vector<const StatelessFeatureFunction*>&
slf = StatelessFeatureFunction::GetStatelessFeatureFunctions();
const vector<const StatefulFeatureFunction*>&
const vector<const StatefulFeatureFunction*>&
sff = StatefulFeatureFunction::GetStatefulFeatureFunctions();
for (size_t i = 0; i < sff.size(); ++i) {
@ -662,7 +662,7 @@ ShowWeights(ostream& out)
out << ff->GetScoreProducerDescription() << " UNTUNEABLE" << endl;
}
}
if (! (old_flags & std::ios::fixed))
if (! (old_flags & std::ios::fixed))
out.unsetf(std::ios::fixed);
out.precision(old_precision);
}
@ -754,7 +754,7 @@ int main(int argc, char** argv)
.allowOrigin("*")
);
*/
XVERBOSE(1,"Listening on port " << port << endl);
if (isSerial) {
while(1) myAbyssServer.runOnce();
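The word-alignment branch above reads whitespace-separated "source-target" index pairs (e.g. "0-0 1-2 2-1") out of a stringstream and splits each token at the '-'. Stripped of the xmlrpc_c wrapping, that parsing step is essentially the following self-contained sketch (illustrative only, not the server code itself):

// Sketch: parse "src-tgt" word-alignment pairs from a stream, as the
// addWordAlignInfo branch above does before building xmlrpc_c structs.
#include <cstddef>
#include <cstdlib>
#include <iostream>
#include <sstream>
#include <string>
#include <utility>
#include <vector>

int main()
{
  std::stringstream wordAlignment("0-0 1-2 2-1");
  std::vector<std::pair<int, int> > alignments;

  std::string alignmentPair;
  while (wordAlignment >> alignmentPair) {
    std::string::size_type pos = alignmentPair.find('-');
    if (pos == std::string::npos) continue;   // malformed token, skip
    int sourceWord = std::atoi(alignmentPair.substr(0, pos).c_str());
    int targetWord = std::atoi(alignmentPair.substr(pos + 1).c_str());
    alignments.push_back(std::make_pair(sourceWord, targetWord));
  }

  for (std::size_t i = 0; i < alignments.size(); ++i)
    std::cout << alignments[i].first << " -> " << alignments[i].second << "\n";
  return 0;
}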

View File

@ -1,231 +1,231 @@
// XGetopt.cpp Version 1.2
//
// Author: Hans Dietrich
// hdietrich2@hotmail.com
//
// Description:
// XGetopt.cpp implements getopt(), a function to parse command lines.
//
// History
// Version 1.2 - 2003 May 17
// - Added Unicode support
//
// Version 1.1 - 2002 March 10
// - Added example to XGetopt.cpp module header
//
// This software is released into the public domain.
// You are free to use it in any way you like.
//
// This software is provided "as is" with no expressed
// or implied warranty. I accept no liability for any
// damage or loss of business that this software may cause.
//
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
// if you are using precompiled headers then include this line:
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
// if you are not using precompiled headers then include these lines:
//#include <windows.h>
//#include <cstdio>
//#include <tchar.h>
///////////////////////////////////////////////////////////////////////////////
#include <cstdio>
#include <cstring>
#include <cmath>
#include "WIN32_functions.h"
///////////////////////////////////////////////////////////////////////////////
//
// X G e t o p t . c p p
//
//
// NAME
// getopt -- parse command line options
//
// SYNOPSIS
// int getopt(int argc, char *argv[], char *optstring)
//
// extern char *optarg;
// extern int optind;
//
// DESCRIPTION
// The getopt() function parses the command line arguments. Its
// arguments argc and argv are the argument count and array as
// passed into the application on program invocation. In the case
// of Visual C++ programs, argc and argv are available via the
// variables __argc and __argv (double underscores), respectively.
// getopt returns the next option letter in argv that matches a
// letter in optstring. (Note: Unicode programs should use
// __targv instead of __argv. Also, all character and string
// literals should be enclosed in ( ) ).
//
// optstring is a string of recognized option letters; if a letter
// is followed by a colon, the option is expected to have an argument
// that may or may not be separated from it by white space. optarg
// is set to point to the start of the option argument on return from
// getopt.
//
// Option letters may be combined, e.g., "-ab" is equivalent to
// "-a -b". Option letters are case sensitive.
//
// getopt places in the external variable optind the argv index
// of the next argument to be processed. optind is initialized
// to 0 before the first call to getopt.
//
// When all options have been processed (i.e., up to the first
// non-option argument), getopt returns EOF, optarg will point
// to the argument, and optind will be set to the argv index of
// the argument. If there are no non-option arguments, optarg
// will be set to NULL.
//
// The special option "--" may be used to delimit the end of the
// options; EOF will be returned, and "--" (and everything after it)
// will be skipped.
//
// RETURN VALUE
// For option letters contained in the string optstring, getopt
// will return the option letter. getopt returns a question mark (?)
// when it encounters an option letter not included in optstring.
// EOF is returned when processing is finished.
//
// BUGS
// 1) Long options are not supported.
// 2) The GNU double-colon extension is not supported.
// 3) The environment variable POSIXLY_CORRECT is not supported.
// 4) The + syntax is not supported.
// 5) The automatic permutation of arguments is not supported.
// 6) This implementation of getopt() returns EOF if an error is
// encountered, instead of -1 as the latest standard requires.
//
// EXAMPLE
// BOOL CMyApp::ProcessCommandLine(int argc, char *argv[])
// {
// int c;
//
// while ((c = getopt(argc, argv, ("aBn:"))) != EOF)
// {
// switch (c)
// {
// case ('a'):
// TRACE(("option a\n"));
// //
// // set some flag here
// //
// break;
//
// case ('B'):
// TRACE( ("option B\n"));
// //
// // set some other flag here
// //
// break;
//
// case ('n'):
// TRACE(("option n: value=%d\n"), atoi(optarg));
// //
// // do something with value here
// //
// break;
//
// case ('?'):
// TRACE(("ERROR: illegal option %s\n"), argv[optind-1]);
// return FALSE;
// break;
//
// default:
// TRACE(("WARNING: no handler for option %c\n"), c);
// return FALSE;
// break;
// }
// }
// //
// // check for non-option args here
// //
// return TRUE;
// }
//
///////////////////////////////////////////////////////////////////////////////
char *optarg; // global argument pointer
int optind = 0; // global argv index
int getopt(int argc, char *argv[], char *optstring)
{
static char *next = NULL;
if (optind == 0)
next = NULL;
optarg = NULL;
if (next == NULL || *next =='\0') {
if (optind == 0)
optind++;
if (optind >= argc || argv[optind][0] != ('-') || argv[optind][1] == ('\0')) {
optarg = NULL;
if (optind < argc)
optarg = argv[optind];
return EOF;
}
if (strcmp(argv[optind], "--") == 0) {
optind++;
optarg = NULL;
if (optind < argc)
optarg = argv[optind];
return EOF;
}
next = argv[optind];
next++; // skip past -
optind++;
}
char c = *next++;
char *cp = strchr(optstring, c);
if (cp == NULL || c == (':'))
return ('?');
cp++;
if (*cp == (':')) {
if (*next != ('\0')) {
optarg = next;
next = NULL;
} else if (optind < argc) {
optarg = argv[optind];
optind++;
} else {
return ('?');
}
}
return c;
}
// for an overview, see
// W. Press, S. Teukolsky and W. Vetterling. (1992) Numerical Recipes in C. Chapter 6.1.
double lgamma(int x)
{
// size_t xx=(size_t)x; xx--; size_t sum=1; while (xx) { sum *= xx--; } return log((double)(sum));
if (x <= 2) {
return 0.0;
}
static double coefs[6] = {76.18009172947146, -86.50532032941677, 24.01409824083091, -1.231739572450155, 0.1208650973866179e-2, -0.5395239384953e-5};
double tmp=(double)x+5.5;
tmp -= (((double)x)+0.5)*log(tmp);
double y=(double)x;
double sum = 1.000000000190015;
for (size_t j=0; j<6; ++j) {
sum += coefs[j]/++y;
}
return -tmp+log(2.5066282746310005*sum/(double)x);
}
// XGetopt.cpp Version 1.2
//
// Author: Hans Dietrich
// hdietrich2@hotmail.com
//
// Description:
// XGetopt.cpp implements getopt(), a function to parse command lines.
//
// History
// Version 1.2 - 2003 May 17
// - Added Unicode support
//
// Version 1.1 - 2002 March 10
// - Added example to XGetopt.cpp module header
//
// This software is released into the public domain.
// You are free to use it in any way you like.
//
// This software is provided "as is" with no expressed
// or implied warranty. I accept no liability for any
// damage or loss of business that this software may cause.
//
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
// if you are using precompiled headers then include this line:
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
// if you are not using precompiled headers then include these lines:
//#include <windows.h>
//#include <cstdio>
//#include <tchar.h>
///////////////////////////////////////////////////////////////////////////////
#include <cstdio>
#include <cstring>
#include <cmath>
#include "WIN32_functions.h"
///////////////////////////////////////////////////////////////////////////////
//
// X G e t o p t . c p p
//
//
// NAME
// getopt -- parse command line options
//
// SYNOPSIS
// int getopt(int argc, char *argv[], char *optstring)
//
// extern char *optarg;
// extern int optind;
//
// DESCRIPTION
// The getopt() function parses the command line arguments. Its
// arguments argc and argv are the argument count and array as
// passed into the application on program invocation. In the case
// of Visual C++ programs, argc and argv are available via the
// variables __argc and __argv (double underscores), respectively.
// getopt returns the next option letter in argv that matches a
// letter in optstring. (Note: Unicode programs should use
// __targv instead of __argv. Also, all character and string
// literals should be enclosed in ( ) ).
//
// optstring is a string of recognized option letters; if a letter
// is followed by a colon, the option is expected to have an argument
// that may or may not be separated from it by white space. optarg
// is set to point to the start of the option argument on return from
// getopt.
//
// Option letters may be combined, e.g., "-ab" is equivalent to
// "-a -b". Option letters are case sensitive.
//
// getopt places in the external variable optind the argv index
// of the next argument to be processed. optind is initialized
// to 0 before the first call to getopt.
//
// When all options have been processed (i.e., up to the first
// non-option argument), getopt returns EOF, optarg will point
// to the argument, and optind will be set to the argv index of
// the argument. If there are no non-option arguments, optarg
// will be set to NULL.
//
// The special option "--" may be used to delimit the end of the
// options; EOF will be returned, and "--" (and everything after it)
// will be skipped.
//
// RETURN VALUE
// For option letters contained in the string optstring, getopt
// will return the option letter. getopt returns a question mark (?)
// when it encounters an option letter not included in optstring.
// EOF is returned when processing is finished.
//
// BUGS
// 1) Long options are not supported.
// 2) The GNU double-colon extension is not supported.
// 3) The environment variable POSIXLY_CORRECT is not supported.
// 4) The + syntax is not supported.
// 5) The automatic permutation of arguments is not supported.
// 6) This implementation of getopt() returns EOF if an error is
// encountered, instead of -1 as the latest standard requires.
//
// EXAMPLE
// BOOL CMyApp::ProcessCommandLine(int argc, char *argv[])
// {
// int c;
//
// while ((c = getopt(argc, argv, ("aBn:"))) != EOF)
// {
// switch (c)
// {
// case ('a'):
// TRACE(("option a\n"));
// //
// // set some flag here
// //
// break;
//
// case ('B'):
// TRACE( ("option B\n"));
// //
// // set some other flag here
// //
// break;
//
// case ('n'):
// TRACE(("option n: value=%d\n"), atoi(optarg));
// //
// // do something with value here
// //
// break;
//
// case ('?'):
// TRACE(("ERROR: illegal option %s\n"), argv[optind-1]);
// return FALSE;
// break;
//
// default:
// TRACE(("WARNING: no handler for option %c\n"), c);
// return FALSE;
// break;
// }
// }
// //
// // check for non-option args here
// //
// return TRUE;
// }
//
///////////////////////////////////////////////////////////////////////////////
char *optarg; // global argument pointer
int optind = 0; // global argv index
int getopt(int argc, char *argv[], char *optstring)
{
static char *next = NULL;
if (optind == 0)
next = NULL;
optarg = NULL;
if (next == NULL || *next =='\0') {
if (optind == 0)
optind++;
if (optind >= argc || argv[optind][0] != ('-') || argv[optind][1] == ('\0')) {
optarg = NULL;
if (optind < argc)
optarg = argv[optind];
return EOF;
}
if (strcmp(argv[optind], "--") == 0) {
optind++;
optarg = NULL;
if (optind < argc)
optarg = argv[optind];
return EOF;
}
next = argv[optind];
next++; // skip past -
optind++;
}
char c = *next++;
char *cp = strchr(optstring, c);
if (cp == NULL || c == (':'))
return ('?');
cp++;
if (*cp == (':')) {
if (*next != ('\0')) {
optarg = next;
next = NULL;
} else if (optind < argc) {
optarg = argv[optind];
optind++;
} else {
return ('?');
}
}
return c;
}
// for an overview, see
// W. Press, S. Teukolsky and W. Vetterling. (1992) Numerical Recipes in C. Chapter 6.1.
double lgamma(int x)
{
// size_t xx=(size_t)x; xx--; size_t sum=1; while (xx) { sum *= xx--; } return log((double)(sum));
if (x <= 2) {
return 0.0;
}
static double coefs[6] = {76.18009172947146, -86.50532032941677, 24.01409824083091, -1.231739572450155, 0.1208650973866179e-2, -0.5395239384953e-5};
double tmp=(double)x+5.5;
tmp -= (((double)x)+0.5)*log(tmp);
double y=(double)x;
double sum = 1.000000000190015;
for (size_t j=0; j<6; ++j) {
sum += coefs[j]/++y;
}
return -tmp+log(2.5066282746310005*sum/(double)x);
}

View File

@ -1,24 +1,24 @@
// XGetopt.h Version 1.2
//
// Author: Hans Dietrich
// hdietrich2@hotmail.com
//
// This software is released into the public domain.
// You are free to use it in any way you like.
//
// This software is provided "as is" with no expressed
// or implied warranty. I accept no liability for any
// damage or loss of business that this software may cause.
//
///////////////////////////////////////////////////////////////////////////////
#ifndef XGETOPT_H
#define XGETOPT_H
extern int optind, opterr;
extern char *optarg;
int getopt(int argc, char *argv[], char *optstring);
double lgamma(int x);
#endif //XGETOPT_H
// XGetopt.h Version 1.2
//
// Author: Hans Dietrich
// hdietrich2@hotmail.com
//
// This software is released into the public domain.
// You are free to use it in any way you like.
//
// This software is provided "as is" with no expressed
// or implied warranty. I accept no liability for any
// damage or loss of business that this software may cause.
//
///////////////////////////////////////////////////////////////////////////////
#ifndef XGETOPT_H
#define XGETOPT_H
extern int optind, opterr;
extern char *optarg;
int getopt(int argc, char *argv[], char *optstring);
double lgamma(int x);
#endif //XGETOPT_H

View File

@ -1,5 +1,5 @@
#include <cstring>
#include <cstring>
#include <cassert>
#include <cstdio>
#include <cstdlib>
@ -14,7 +14,7 @@
#include <set>
#include <boost/thread/tss.hpp>
#include <boost/thread.hpp>
#include <boost/thread.hpp>
#include <boost/unordered_map.hpp>
#ifdef WIN32
@ -58,9 +58,9 @@ typedef boost::shared_ptr<std::vector<TextLenType> > SentIdSet;
class Cache {
typedef std::pair<SentIdSet, clock_t> ClockedSet;
typedef boost::unordered_map<std::string, ClockedSet> ClockedMap;
public:
SentIdSet get(const std::string& phrase) {
boost::shared_lock<boost::shared_mutex> lock(m_mutex);
if(m_cont.count(phrase)) {
@ -70,27 +70,27 @@ class Cache {
}
return SentIdSet( new SentIdSet::element_type() );
}
void put(const std::string& phrase, const SentIdSet set) {
boost::unique_lock<boost::shared_mutex> lock(m_mutex);
m_cont[phrase] = std::make_pair(set, clock());
}
static void set_max_cache(size_t max_cache) {
s_max_cache = max_cache;
}
void prune() {
if(s_max_cache > 0) {
boost::upgrade_lock<boost::shared_mutex> lock(m_mutex);
if(m_cont.size() > s_max_cache) {
std::vector<clock_t> clocks;
for(ClockedMap::iterator it = m_cont.begin(); it != m_cont.end(); it++)
for(ClockedMap::iterator it = m_cont.begin(); it != m_cont.end(); it++)
clocks.push_back(it->second.second);
std::sort(clocks.begin(), clocks.end());
clock_t out = clocks[m_cont.size() - s_max_cache];
boost::upgrade_to_unique_lock<boost::shared_mutex> uniq_lock(lock);
for(ClockedMap::iterator it = m_cont.begin(); it != m_cont.end(); it++)
if(it->second.second < out)
@ -98,7 +98,7 @@ class Cache {
}
}
}
private:
ClockedMap m_cont;
boost::shared_mutex m_mutex;
@ -282,12 +282,12 @@ void lookup_phrase(SentIdSet& ids, const std::string& phrase,
i != locations.end(); ++i) {
ids->push_back(i->sentIdInCorpus);
}
std::sort(ids->begin(), ids->end());
SentIdSet::element_type::iterator it =
std::unique(ids->begin(), ids->end());
ids->resize(it - ids->begin());
if(ids->size() >= MINIMUM_SIZE_TO_KEEP)
cache.put(phrase, ids);
}
@ -295,8 +295,8 @@ void lookup_phrase(SentIdSet& ids, const std::string& phrase,
void lookup_multiple_phrases(SentIdSet& ids, vector<std::string> & phrases,
C_SuffixArraySearchApplicationBase & my_sa,
const std::string & rule, Cache& cache)
{
const std::string & rule, Cache& cache)
{
if (phrases.size() == 1) {
lookup_phrase(ids, phrases.front(), my_sa, cache);
@ -372,32 +372,32 @@ void compute_cooc_stats_and_filter(std::vector<PTEntry*>& options,
delete *i;
options.erase(options.begin() + pfe_filter_limit,options.end());
}
if (pef_filter_only)
return;
if (options.empty())
return;
SentIdSet fset( new SentIdSet::element_type() );
find_occurrences(fset, options.front()->f_phrase, f_sa, f_cache);
size_t cf = fset->size();
for (std::vector<PTEntry*>::iterator i = options.begin();
i != options.end(); ++i) {
const std::string& e_phrase = (*i)->e_phrase;
SentIdSet eset( new SentIdSet::element_type() );
find_occurrences(eset, e_phrase, e_sa, e_cache);
size_t ce = eset->size();
SentIdSet efset( new SentIdSet::element_type() );
ordered_set_intersect(efset, fset, eset);
size_t cef = efset->size();
double nlp = -log(fisher_exact(cef, cf, ce));
(*i)->set_cooc_stats(cef, cf, ce, nlp);
}
std::vector<PTEntry*>::iterator new_end =
std::remove_if(options.begin(), options.end(),
NlogSigThresholder(sig_filter_limit));
@ -406,7 +406,7 @@ void compute_cooc_stats_and_filter(std::vector<PTEntry*>& options,
}
void filter(std::istream* in, std::ostream* out, int pfe_index) {
std::vector<std::string> lines;
std::string prev = "";
std::vector<PTEntry*> options;
@ -415,23 +415,23 @@ void filter(std::istream* in, std::ostream* out, int pfe_index) {
boost::mutex::scoped_lock lock(in_mutex);
if(in->eof())
break;
lines.clear();
std::string line;
while(getline(*in, line) && lines.size() < 500000)
lines.push_back(line);
}
std::stringstream out_temp;
for(std::vector<std::string>::iterator it = lines.begin(); it != lines.end(); it++) {
size_t tmp_lines = ++pt_lines;
if(tmp_lines % 10000 == 0) {
boost::mutex::scoped_lock lock(err_mutex);
std::cerr << ".";
if(tmp_lines % 500000 == 0)
std::cerr << "[n:" << tmp_lines << "]\n";
if(tmp_lines % 10000000 == 0) {
float pfefper = (100.0*(float)nremoved_pfefilter)/(float)pt_lines;
float sigfper = (100.0*(float)nremoved_sigfilter)/(float)pt_lines;
@ -446,30 +446,30 @@ void filter(std::istream* in, std::ostream* out, int pfe_index) {
<< "------------------------------------------------------\n";
}
}
if(pt_lines % 10000 == 0) {
f_cache.prune();
e_cache.prune();
}
if(it->length() > 0) {
PTEntry* pp = new PTEntry(it->c_str(), pfe_index);
if (prev != pp->f_phrase) {
prev = pp->f_phrase;
if (!options.empty()) { // always true after first line
compute_cooc_stats_and_filter(options, f_cache, e_cache);
}
for (std::vector<PTEntry*>::iterator i = options.begin();
i != options.end(); ++i) {
out_temp << **i << '\n';
delete *i;
}
options.clear();
options.push_back(pp);
} else {
options.push_back(pp);
}
@ -479,7 +479,7 @@ void filter(std::istream* in, std::ostream* out, int pfe_index) {
*out << out_temp.str() << std::flush;
}
compute_cooc_stats_and_filter(options, f_cache, e_cache);
boost::mutex::scoped_lock lock(out_mutex);
for (std::vector<PTEntry*>::iterator i = options.begin();
i != options.end(); ++i) {
@ -512,11 +512,11 @@ int main(int argc, char * argv[])
pfe_filter_limit = atoi(optarg);
std::cerr << "P(f|e) filter limit: " << pfe_filter_limit << std::endl;
break;
case 't':
case 't':
threads = atoi(optarg);
std::cerr << "Using threads: " << threads << std::endl;
break;
case 'm':
case 'm':
max_cache = atoi(optarg);
std::cerr << "Using max phrases in caches: " << max_cache << std::endl;
break;
@ -548,13 +548,13 @@ int main(int argc, char * argv[])
usage();
}
}
if (sig_filter_limit == 0.0) pef_filter_only = true;
//-----------------------------------------------------------------------------
if (optind != argc || ((!efile || !ffile) && !pef_filter_only)) {
usage();
}
//load the indexed corpus with vocabulary(noVoc=false) and with offset(noOffset=false)
if (!pef_filter_only) {
e_sa.loadData_forSearch(efile, false, false);
@ -582,15 +582,15 @@ int main(int argc, char * argv[])
Cache::set_max_cache(max_cache);
std::ios_base::sync_with_stdio(false);
boost::thread_group threadGroup;
for(int i = 0; i < threads; i++)
for(int i = 0; i < threads; i++)
threadGroup.add_thread(new boost::thread(filter, &std::cin, &std::cout, pfe_index));
threadGroup.join_all();
float pfefper = (100.0*(float)nremoved_pfefilter)/(float)pt_lines;
float sigfper = (100.0*(float)nremoved_sigfilter)/(float)pt_lines;
std::cerr << "\n\n------------------------------------------------------\n"
<< " unfiltered phrases pairs: " << pt_lines << "\n"
<< "\n"
@ -599,5 +599,5 @@ int main(int argc, char * argv[])
<< " TOTAL FILTERED: " << (nremoved_pfefilter + nremoved_sigfilter) << " (" << (sigfper + pfefper) << "%)\n"
<< "\n"
<< " FILTERED phrase pairs: " << (pt_lines - nremoved_pfefilter - nremoved_sigfilter) << " (" << (100.0-sigfper - pfefper) << "%)\n"
<< "------------------------------------------------------\n";
<< "------------------------------------------------------\n";
}
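The significance filter above keeps each phrase's sentence IDs as a sorted, de-duplicated vector (the sort / unique / resize sequence in lookup_phrase) and then intersects the source-side and target-side sets to obtain the co-occurrence count cef fed into fisher_exact. The body of ordered_set_intersect is not in this hunk; one standard way to realise that step over sorted vectors is std::set_intersection, as in this sketch (types and names here are illustrative, not the filter's own):

// Sketch: sorted, de-duplicated sentence-ID vectors and their intersection,
// mirroring the lookup_phrase / ordered_set_intersect steps above.
#include <algorithm>
#include <iostream>
#include <iterator>
#include <vector>

typedef unsigned int SentId;                 // stand-in for TextLenType
typedef std::vector<SentId> SentIdVec;

// Sort and de-duplicate, as lookup_phrase does before caching.
void normalise(SentIdVec& ids)
{
  std::sort(ids.begin(), ids.end());
  SentIdVec::iterator it = std::unique(ids.begin(), ids.end());
  ids.resize(it - ids.begin());
}

// Intersection of two sorted ID vectors; the result size plays the role of cef.
void ordered_intersect(SentIdVec& out, const SentIdVec& a, const SentIdVec& b)
{
  out.clear();
  std::set_intersection(a.begin(), a.end(), b.begin(), b.end(),
                        std::back_inserter(out));
}

int main()
{
  SentId f[] = {5, 1, 3, 3, 9};
  SentId e[] = {3, 9, 10};
  SentIdVec fset(f, f + 5), eset(e, e + 3), efset;
  normalise(fset);                 // -> 1 3 5 9   (cf  = 4)
  normalise(eset);                 // -> 3 9 10    (ce  = 3)
  ordered_intersect(efset, fset, eset);
  std::cout << "cef = " << efset.size() << std::endl;   // cef = 2
  return 0;
}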

View File

@ -65,7 +65,7 @@ class Numbered : public T {
friend String& operator<< ( String& str, const Numbered<SD1,I,SD2,T,SD3>& rv ) { return str<<SD1<<rv.i<<SD2<<rv.getT()<<SD3; }
friend pair<StringInput,Numbered<SD1,I,SD2,T,SD3>*> operator>> ( StringInput ps, Numbered<SD1,I,SD2,T,SD3>& rv ) { return pair<StringInput,Numbered<SD1,I,SD2,T,SD3>*>(ps,&rv); }
friend StringInput operator>> ( pair<StringInput,Numbered<SD1,I,SD2,T,SD3>*> delimbuff, const char* psPostDelim ) {
return ( (SD3[0]=='\0') ? delimbuff.first>>SD1>>delimbuff.second->i>>SD2>>delimbuff.second->setT()>>psPostDelim
return ( (SD3[0]=='\0') ? delimbuff.first>>SD1>>delimbuff.second->i>>SD2>>delimbuff.second->setT()>>psPostDelim
: delimbuff.first>>SD1>>delimbuff.second->i>>SD2>>delimbuff.second->setT()>>SD3>>psPostDelim );
}
};
@ -106,7 +106,7 @@ template<class V>
pair<typename V::ElementType,int> ArchetypeSet<V>::getDistanceOfNearest ( const V& v ) const {
//const Scored<typename V::ElementType,pair<int,SafePtr<const V> > > sipvDummy ( DBL_MAX );
//MinHeap<Scored<typename V::ElementType,pair<int,SafePtr<const V> > > > hsiv ( MapType::size()+1, sipvDummy );
MinHeap<Scored<typename V::ElementType,pair<int,SafePtr<const NV> > > >& hsiv =
MinHeap<Scored<typename V::ElementType,pair<int,SafePtr<const NV> > > >& hsiv =
const_cast<MinHeap<Scored<typename V::ElementType,pair<int,SafePtr<const NV> > > >&> ( hsivCalc );
hsiv.clear();
@ -120,7 +120,7 @@ pair<typename V::ElementType,int> ArchetypeSet<V>::getDistanceOfNearest ( const
typename V::ElementType d = v.getMarginalDistance ( hsiv.get(iNext).first, hsiv.get(iNext).second.getRef() );
hsiv.set(iNext).setScore() = d;
//hsiv.set(iNext).setScore() = v.getMarginalDistance ( hsiv.getMin().first, iUpper->second.second );
////int j =
////int j =
hsiv.fixDecr(iNext);
////cerr<<" adding ln"<<&hsiv.get(j).second.getRef()<<" marg-dist="<<d<<" new-score="<<double(hsiv.get(j).getScore())<<" new-pos="<<j<<"\n";
iNext++;
@ -140,7 +140,7 @@ pair<typename V::ElementType,int> ArchetypeSet<V>::getDistanceOfNearest ( const
typename V::ElementType d = v.getMarginalDistance ( ++hsiv.setMin().first, hsiv.getMin().second.getRef() );
hsiv.setMin().setScore() += d;
////cerr<<" matching ln"<<&hsiv.getMin().second.getRef()<<" i="<<hsiv.setMin().first<<" marg-dist="<<d<<" new-score="<<hsiv.getMin().getScore();
////int j =
////int j =
hsiv.fixIncr(0);
////cerr<<" new-pos="<<j<<"\n";
////if(j!=0) for(int i=0;i<iNext;i++) cerr<<" "<<i<<": ln"<<hsiv.get(i).second.getRef().lineNum.toInt()<<" new-score="<<double(hsiv.get(i).getScore())<<"\n";
@ -151,7 +151,7 @@ pair<typename V::ElementType,int> ArchetypeSet<V>::getDistanceOfNearest ( const
hsiv.set(iNext).second = SafePtr<const NV> ( iUpper->second );
typename V::ElementType d = v.getMarginalDistance ( hsiv.get(iNext).first, hsiv.get(iNext).second.getRef() );
hsiv.set(iNext).setScore() = d;
////int j =
////int j =
hsiv.fixDecr(iNext);
////cerr<<" adding ln"<<&hsiv.get(j).second.getRef()<<" marg-dist="<<d<<" new-score="<<double(hsiv.get(j).getScore())<<" new-pos="<<j<<"\n";
iNext++;
@ -164,7 +164,7 @@ pair<typename V::ElementType,int> ArchetypeSet<V>::getDistanceOfNearest ( const
hsiv.set(iNext).second = SafePtr<const NV> ( iLower->second );
typename V::ElementType d = v.getMarginalDistance ( hsiv.get(iNext).first, hsiv.get(iNext).second.getRef() );
hsiv.set(iNext).setScore() = d;
////int j =
////int j =
hsiv.fixDecr(iNext);
////cerr<<" adding ln"<<&hsiv.get(j).second.getRef()<<" marg-dist="<<d<<" new-score="<<double(hsiv.get(j).getScore())<<" new-pos="<<j<<"\n";
iNext++;

View File

@ -27,7 +27,7 @@
#include <cassert>
#include <iostream>
using namespace std;
using namespace std;
////////////////////////////////////////////////////////////////////////////////

View File

@ -101,8 +101,8 @@ class Beam {
void write(FILE *pf){
/* for (typename BeamMap::const_iterator i = mkid.begin(); i != mkid.end(); i++){
i->first.write(pf);
fprintf(pf, " %d ", i->second.first);
// i->second.second.write(pf);
fprintf(pf, " %d ", i->second.first);
// i->second.second.write(pf);
fprintf(pf, "\n");
}
*/

View File

@ -394,7 +394,7 @@ class SimpleMap : public map<X,Y> {
private:
typedef map<X,Y> OrigMap;
static const Y yDummy;
public:
// Constructor / destructor methods...
SimpleMap ( ) : OrigMap() { }
@ -899,7 +899,7 @@ class GenericHidVarCPTModel : public SimpleHash<K,typename Y::template ArrayDist
const typename Y::template ArrayDistrib<P>& getDistrib ( const K& k ) const {
return HKYP::get(k);
}
P& setProb ( const Y& y, const K& k ) {
pair<typename Y::BaseType,P>& yp = HKYP::set(k).add();
yp.first = y;

View File

@ -36,7 +36,7 @@
//
////////////////////////////////////////////////////////////////////////////////
template <class Y,class X1,class X2>
template <class Y,class X1,class X2>
class CRF3DModeledRV : public Y {
private:
@ -90,7 +90,7 @@ template <class Y,class X1,class X2> SafeArray5D<Id<int>,int,int,int,int,float>
////////////////////////////////////////////////////////////////////////////////
template <class Y,class X1,class X2>
template <class Y,class X1,class X2>
Prob CRF3DModeledRV<Y,X1,X2>::getProb( const X1& x1, const X2& x2 ) const {
SafeArray2D<int,int,int> aaCnds ( cardOff, cardSh ) ;
@ -131,7 +131,7 @@ Prob CRF3DModeledRV<Y,X1,X2>::getProb( const X1& x1, const X2& x2 ) const {
for ( int configRghtValSite=0; configRghtValSite<(1<<bitsValSite); configRghtValSite++ )
for ( int configValOverlap=0; configValOverlap<(1<<(bitsVal-bitsValSite)); configValOverlap++ ) {
int configRghtVal = (configValOverlap<<bitsValSite)+configRghtValSite;
// For each possible preceding trellis node...
// For each possible preceding trellis node...
for ( int configLeftValSite=0; configLeftValSite<(1<<bitsValSite); configLeftValSite++ ) {
int configLeftVal = (configLeftValSite<<(bitsVal-bitsValSite))+configValOverlap;
// Add product of result and previous trellis cell to current trellis cell...
@ -158,7 +158,7 @@ Prob CRF3DModeledRV<Y,X1,X2>::getProb( const X1& x1, const X2& x2 ) const {
////////////////////////////////////////////////////////////////////////////////
template <class Y,class X1,class X2>
template <class Y,class X1,class X2>
bool CRF3DModeledRV<Y,X1,X2>::readModelFields ( char* aps[], int numFields ) {
if ( 7==numFields )
setPotential ( X1(string(aps[1])), // globals
@ -172,7 +172,7 @@ bool CRF3DModeledRV<Y,X1,X2>::readModelFields ( char* aps[], int numFields ) {
////////////////////////////////////////////////////////////////////////////////
template <class Y,class X1,class X2>
template <class Y,class X1,class X2>
void CRF3DModeledRV<Y,X1,X2>::writeObservCliqueConfigs ( FILE* pf, int frame, const char* psMdl,
const X1& x1, const X2& x2, bool bObsVal ) const {
fprintf ( pf, "%04d> %s ", frame, psMdl );
@ -199,7 +199,7 @@ void CRF3DModeledRV<Y,X1,X2>::writeObservCliqueConfigs ( FILE* pf, int frame, co
//
////////////////////////////////////////////////////////////////////////////////
template <class Y,class X1,class X2,class X3>
template <class Y,class X1,class X2,class X3>
class CRF4DModeledRV : public Y {
private:
@ -247,13 +247,13 @@ template <class Y,class X1,class X2,class X3> int CRF4DModeledRV<Y,X1,X2,X3>::c
template <class Y,class X1,class X2,class X3> int CRF4DModeledRV<Y,X1,X2,X3>::cardCnd = 0;
template <class Y,class X1,class X2,class X3> int CRF4DModeledRV<Y,X1,X2,X3>::bitsVal = 0;
template <class Y,class X1,class X2,class X3> int CRF4DModeledRV<Y,X1,X2,X3>::bitsValSite = 0;
template <class Y,class X1,class X2,class X3> SafeArray5D<Id<int>,int,int,int,int,float>
template <class Y,class X1,class X2,class X3> SafeArray5D<Id<int>,int,int,int,int,float>
CRF4DModeledRV<Y,X1,X2,X3>::aaaaaPotentials;
/* template <class Y,class X1,class X2> SafeArray3D<int> CRF4DModeledRV<Y,X1,X2>::aaaCnds; */
////////////////////////////////////////////////////////////////////////////////
template <class Y,class X1,class X2,class X3>
template <class Y,class X1,class X2,class X3>
Prob CRF4DModeledRV<Y,X1,X2,X3>::getProb( const X1& x1, const X2& x2, const X3& x3 ) const {
SafeArray2D<int,int,int> aaCnds ( cardOff, cardSh ) ;
@ -294,7 +294,7 @@ Prob CRF4DModeledRV<Y,X1,X2,X3>::getProb( const X1& x1, const X2& x2, const X3&
for ( int configRghtValSite=0; configRghtValSite<(1<<bitsValSite); configRghtValSite++ )
for ( int configValOverlap=0; configValOverlap<(1<<(bitsVal-bitsValSite)); configValOverlap++ ) {
int configRghtVal = (configValOverlap<<bitsValSite)+configRghtValSite;
// For each possible preceding trellis node...
// For each possible preceding trellis node...
for ( int configLeftValSite=0; configLeftValSite<(1<<bitsValSite); configLeftValSite++ ) {
int configLeftVal = (configLeftValSite<<(bitsVal-bitsValSite))+configValOverlap;
// Add product of result and previous trellis cell to current trellis cell...
@ -321,7 +321,7 @@ Prob CRF4DModeledRV<Y,X1,X2,X3>::getProb( const X1& x1, const X2& x2, const X3&
////////////////////////////////////////////////////////////////////////////////
template <class Y,class X1,class X2,class X3>
template <class Y,class X1,class X2,class X3>
bool CRF4DModeledRV<Y,X1,X2,X3>::readModelFields ( char* aps[], int numFields ) {
if ( 7==numFields )
setPotential ( X1(string(aps[1])), // globals
@ -335,9 +335,9 @@ bool CRF4DModeledRV<Y,X1,X2,X3>::readModelFields ( char* aps[], int numFields )
////////////////////////////////////////////////////////////////////////////////
template <class Y,class X1,class X2, class X3>
template <class Y,class X1,class X2, class X3>
void CRF4DModeledRV<Y,X1,X2,X3>::writeObservCliqueConfigs ( FILE* pf, int frame, const char* psMdl,
const X1& x1, const X2& x2,
const X1& x1, const X2& x2,
const X3& x3, bool bObsVal ) const {
fprintf ( pf, "%04d> %s ", frame, psMdl );
// For each shape (feature slope)...
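The getProb() loops above walk a trellis of bit-packed value configurations: for every configuration of the current site they accumulate the product of the clique potential with each compatible cell of the preceding column ("Add product of result and previous trellis cell to current trellis cell"). Abstracting away the packing of overlapping bits, the recursion being computed has the familiar forward-algorithm form

% Generic forward recursion over a trellis; psi_t(i, j) stands for the
% potential of moving from configuration i to configuration j at step t.
\alpha_t(j) = \sum_{i} \alpha_{t-1}(i)\,\psi_t(i, j),
\qquad
P = \sum_{j} \alpha_T(j)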

View File

@ -80,7 +80,7 @@ void VecE<N,I,RC>::read ( char* ps, const ReaderContext& rc ) {
*/
char* psT; int i=0;
for ( char* psU=strtok_r(ps,",",&psT);
psU && i<NUM_ENTS;
psU && i<NUM_ENTS;
psU=strtok_r(NULL,",",&psT),i++ )
StaticSafeArray<N,I>::set(i) = psU;
}
@ -166,7 +166,7 @@ void VecV<N,I,RC,ND1,ND2>::read ( char* ps, VecVReaderContext& rc ) {
// Chop into individual coinds strings...
char* psT; int i=0;
for ( char* psU=strtok_r(ps,",",&psT);
psU && i<NUM_ENTS;
psU && i<NUM_ENTS;
psU=strtok_r(NULL,",",&psT), i++ )
asV.set(i) = psU;
@ -230,7 +230,7 @@ class JointVecV { //// : public StaticSafeArray<V1::NUM_ENTS+V2::NUM_ENTS,I> {
static const int NUM_ENTS;
// Constructor / destructor methods...
JointVecV ( ) { }
JointVecV ( const V1& a1, const V2& a2 ) {
JointVecV ( const V1& a1, const V2& a2 ) {
////fprintf(stderr,"iJoin "); a1.V1::write(stderr); fprintf(stderr," "); a2.V2::write(stderr); fprintf(stderr,"\n");
for (int i=0; i<NUM_ENTS; i++) {
if ( i<V1::NUM_ENTS ) set(i) = (a1.get(i)==-1) ? IntType(-1) : (a1.get(i)<V1::NUM_ENTS) ? IntType(a1.get(i)) : a1.get(i)+V2::NUM_ENTS;

View File

@ -75,7 +75,7 @@ class ContDTree2DModel : public Generic2DModel<Y,X,P>, public Tree<ContDecisNode
// Extraction methods...
const P getProb ( const Y y, const X& x ) const {
const Tree<ContDecisNode<Y,P> >* ptr = this;
while ( !ptr->isTerm() ) {
while ( !ptr->isTerm() ) {
double sumsqr=0.0;
for(A a;a<X::getSize();a.setNext()) sumsqr += pow(x.get(a.toInt()),2.0) / X::getSize();
Wt wtdavg = -Tree<ContDecisNode<Y,P> >::getWt();
@ -112,7 +112,7 @@ class ContDTree2DModel : public Generic2DModel<Y,X,P>, public Tree<ContDecisNode
};
////////////////////
template <class Y,class X, class P>
template <class Y,class X, class P>
bool ContDTree2DModel<Y,X,P>::readFields ( char* aps[], int numFields ) {
if ( /*aps[0]==sId &&*/ (3==numFields || 4==numFields) ) {
//fprintf(stderr,"%s,%d\n",aps[3],numFields);
@ -171,7 +171,7 @@ class ContDTree3DModel : public Generic3DModel<Y,X1,X2,P> {
};
////////////////////
template <class Y,class X1,class X2, class P>
template <class Y,class X1,class X2, class P>
bool ContDTree3DModel<Y,X1,X2,P>::readFields ( char* aps[], int numFields ) {
if ( /*aps[0]==sId &&*/ (4==numFields || 5==numFields) ) {
//fprintf(stderr,"%s,%d\n",aps[3],numFields);
@ -212,7 +212,7 @@ bool ContDTree3DModel<Y,X1,X2,P>::readFields ( char* aps[], int numFields ) {
////////////////////////////////////////////////////////////////////////////////
template<class Y, class X, class P>
class TrainableContDTree2DModel : public ContDTree2DModel<Y,X,P> {
class TrainableContDTree2DModel : public ContDTree2DModel<Y,X,P> {
private:
List<Joint2DRV<X,Y> > lxy;
public:
@ -225,7 +225,7 @@ class TrainableContDTree2DModel : public ContDTree2DModel<Y,X,P> {
void train ( List<Joint2DRV<X,Y> >&, const double ) ;
void train ( const double d ) { train(lxy,d); }
////// Input / output methods...
bool readData ( char* vs[], int numFields ) {
bool readData ( char* vs[], int numFields ) {
if ( 3==numFields ) lxy.add() = Joint2DRV<X,Y> ( X(vs[1]), Y(vs[2]) );
else return false;
return true;
@ -312,7 +312,7 @@ void TrainableContDTree2DModel<Y,X,P>::train ( List<Joint2DRV<X,Y> >& lxy, cons
// if ( double(rand())/double(RAND_MAX) < prRarest/modelY.getProb(pxy->getSub2()) ) {
dCtr++;
double gamma = dTot/(dTot+dCtr); // 1.0/(double(epoch)+dCtr/dTot); // 1.0/double(epoch); // 1.0/(double(epoch)+dCtr/(dTot*prRarest*2.0)); //
double gamma = dTot/(dTot+dCtr); // 1.0/(double(epoch)+dCtr/dTot); // 1.0/double(epoch); // 1.0/(double(epoch)+dCtr/(dTot*prRarest*2.0)); //
// Weight deltas for next epoch...
Wt wDelta = 0.0;
@ -333,7 +333,7 @@ void TrainableContDTree2DModel<Y,X,P>::train ( List<Joint2DRV<X,Y> >& lxy, cons
P prY = 1.0 / ( 1.0 + exp(-wtdavg) );
// Calc deltas for each feature/attribute/dimension...
double dEachWt = 1.0/dTot; // 1.0/dTot * modelY.getProb ( Y(1-pxy->getSub2().toInt()) ); // 1.0/(dTot*prRarest*2.0); //
double dEachWt = 1.0/dTot; // 1.0/dTot * modelY.getProb ( Y(1-pxy->getSub2().toInt()) ); // 1.0/(dTot*prRarest*2.0); //
wDelta += dEachWt * -1 * ( prY - P(double(pxy->getSub2().toInt())) );
for ( A a; a<X::getSize(); a.setNext() )
awDeltas.set(a) += dEachWt * pxy->getSub1().get(a.toInt()) * ( prY - P(double(pxy->getSub2().toInt())) );
@ -439,7 +439,7 @@ void TrainableContDTree2DModel<Y,X,P>::train ( List<Joint2DRV<X,Y> >& lxy, cons
////////////////////////////////////////////////////////////////////////////////
template<class Y, class X1, class X2, class P>
class TrainableContDTree3DModel : public ContDTree3DModel<Y,X1,X2,P> {
class TrainableContDTree3DModel : public ContDTree3DModel<Y,X1,X2,P> {
private:
@ -455,7 +455,7 @@ class TrainableContDTree3DModel : public ContDTree3DModel<Y,X1,X2,P> {
TrainableContDTree2DModel<Y,X2,P>& setTree(const X1& x1) { return static_cast<TrainableContDTree2DModel<Y,X2,P>&>(ContDTree3DModel<Y,X1,X2,P>::setTree(x1)); }
////// Add training data to per-subphone lists...
bool readData ( char* vs[], int numFields ) {
bool readData ( char* vs[], int numFields ) {
if ( 4==numFields ) {
mqlxy[X1(vs[1])].add() = Joint2DRV<X2,Y> ( X2(vs[2]), Y(vs[3]) );
////mqlxy[X1(vs[1])].getLast()->write(stderr); fprintf(stderr,"\n");
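The training loop shown above is a logistic-regression-style update applied at each tree node: prY = 1 / (1 + exp(-w·x)), and each feature's delta accumulates (prY - y) * x_a scaled by 1/dTot, with a step size gamma = dTot / (dTot + dCtr) that decays as examples are processed. The application of the accumulated deltas is outside this hunk; assuming a standard descent step, a stripped-down version of a few epochs looks like this sketch (toy data, not the ContDTree code):

// Sketch: accumulated logistic-regression gradient updates, mirroring the
// prY / awDeltas / gamma quantities in the training loop above.
#include <cmath>
#include <cstddef>
#include <iostream>
#include <vector>

int main()
{
  double xs[2][3]  = { {1.0, 0.5, -1.0}, {0.2, -0.3, 0.8} };   // toy examples
  int    labels[2] = { 1, 0 };
  std::vector<double> w(3, 0.0);
  const double dTot = 2.0;                  // number of training examples
  double dCtr = 0.0;

  for (int epoch = 0; epoch < 10; ++epoch) {
    std::vector<double> deltas(3, 0.0);
    for (int n = 0; n < 2; ++n) {
      dCtr += 1.0;
      double dot = 0.0;
      for (std::size_t a = 0; a < w.size(); ++a) dot += w[a] * xs[n][a];
      double prY = 1.0 / (1.0 + std::exp(-dot));               // as above
      for (std::size_t a = 0; a < w.size(); ++a)
        deltas[a] += (1.0 / dTot) * xs[n][a] * (prY - labels[n]);
    }
    double gamma = dTot / (dTot + dCtr);                       // decaying step
    for (std::size_t a = 0; a < w.size(); ++a) w[a] -= gamma * deltas[a];
  }

  std::cout << "w = " << w[0] << " " << w[1] << " " << w[2] << std::endl;
  return 0;
}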

View File

@ -129,8 +129,8 @@ class DTree2DModel : public Tree < typename X::ElementType, DecisNode<X,Y,P> > {
friend StringInput operator>> ( pair<StringInput,DTree2DModel<Y,X,P>*> si_m, const char* psD ) {
if (StringInput(NULL)==si_m.first) return si_m.first;
Y y; String xs; StringInput si,si2; si=si_m.first; DTree2DModel<Y,X,P>* pm=si_m.second;
while((si2=si>>" ")!=NULL)si=si2;
si=si>>xs>>" ";
while((si2=si>>" ")!=NULL)si=si2;
si=si>>xs>>" ";
while((si2=si>>" ")!=NULL)si=si2;
// Find appropriate node, creating nodes as necessary...
for(int i=1; i<int(strlen(xs.c_array()))-1; i++) {
@ -140,22 +140,22 @@ class DTree2DModel : public Tree < typename X::ElementType, DecisNode<X,Y,P> > {
if ( si!=NULL && si[0]==':' ) {
si=si>>": ";
while((si2=si>>" ")!=NULL)si=si2;
while((si2=si>>" ")!=NULL)si=si2;
si=si>>y>>" ";
while((si2=si>>" ")!=NULL)si=si2;
while((si2=si>>" ")!=NULL)si=si2;
si=si>>"= ";
while((si2=si>>" ")!=NULL)si=si2;
while((si2=si>>" ")!=NULL)si=si2;
// Specify attribute number (at nonterminal) or probability in distribution (at terminal)...
return (si!=NULL) ? si>>pm->setProb(y)>>psD : si;
}
else if ( si!=NULL && si[0]=='=' ) {
si=si>>"= "; //cerr<<" in after equals "<<((si==NULL) ? "yes" : "no") << endl;
while((si2=si>>" ")!=NULL)si=si2;
while((si2=si>>" ")!=NULL)si=si2;
//m.setA() = atoi(si.c_str());
int aVar = 0;
si=si>>aVar>>psD;
pm->setA()=aVar;
si=si>>aVar>>psD;
pm->setA()=aVar;
////cerr<<" at end "<<((si==NULL) ? "yes" : "no") << endl;
////cerr<<" m.getA() is "<< m.getA().toInt() << endl;
return si;
@ -169,15 +169,15 @@ class DTree2DModel : public Tree < typename X::ElementType, DecisNode<X,Y,P> > {
si=si_m.first;
sRt = si.c_str();
if (sRt.find(':')!=string::npos) {
while((si2=si>>" [")!=NULL)si=si2;
si=si>>xs>>"] ";
while((si2=si>>" ")!=NULL)si=si2;
while((si2=si>>" [")!=NULL)si=si2;
si=si>>xs>>"] ";
while((si2=si>>" ")!=NULL)si=si2;
si=si>>": ";
while((si2=si>>" ")!=NULL)si=si2;
while((si2=si>>" ")!=NULL)si=si2;
si=si>>y>>" ";
while((si2=si>>" ")!=NULL)si=si2;
while((si2=si>>" ")!=NULL)si=si2;
si=si>>"= ";
// For DTree, must find the node labeled by X
//Tree<B,DecisNode<X,Y,P> >* ptr = m;
//assert(ptr);
@ -189,15 +189,15 @@ class DTree2DModel : public Tree < typename X::ElementType, DecisNode<X,Y,P> > {
// Specify attribute number (at nonterminal) or probability in distribution (at terminal)...
return (si!=NULL) ? si>>m.setProb(y)>>psD : si;
} else {
while((si2=si>>" [")!=NULL)si=si2;
while((si2=si>>" [")!=NULL)si=si2;
si=si>>xs>>"] "; //cerr<<" in bracket "<<((si==NULL) ? "yes" : "no") << endl;
while((si2=si>>" ")!=NULL)si=si2;
while((si2=si>>" ")!=NULL)si=si2;
si=si>>"= "; //cerr<<" in after equals "<<((si==NULL) ? "yes" : "no") << endl;
//m.setA() = atoi(si.c_str());
int aVar = 0;
si=si>>aVar>>psD;
m.setA()=aVar;
si=si>>aVar>>psD;
m.setA()=aVar;
//cerr<<" at end "<<((si==NULL) ? "yes" : "no") << endl;
//cerr<<" m.getA() is "<< m.getA().toInt() << endl;
return si;
@ -209,7 +209,7 @@ class DTree2DModel : public Tree < typename X::ElementType, DecisNode<X,Y,P> > {
};
////////////////////
template <class Y,class X, class P>
template <class Y,class X, class P>
bool DTree2DModel<Y,X,P>::readFields ( Array<char*>& aps ) {
if ( /*aps[0]==sId &&*/ (3==aps.size() || 4==aps.size()) ) {
//fprintf(stderr,"%s,%d\n",aps[3],numFields);
@ -269,7 +269,7 @@ class DTree3DModel {
};
////////////////////
template <class Y,class X1,class X2, class P>
template <class Y,class X1,class X2, class P>
bool DTree3DModel<Y,X1,X2,P>::readFields ( char* aps[], int numFields ) {
if ( /*aps[0]==sId &&*/ (4==numFields || 5==numFields) ) {
//fprintf(stderr,"%s,%d\n",aps[3],numFields);
@ -307,7 +307,7 @@ bool DTree3DModel<Y,X1,X2,P>::readFields ( char* aps[], int numFields ) {
////////////////////////////////////////////////////////////////////////////////
template<class Y, class X, class P>
class TrainableDTree2DModel : public DTree2DModel<Y,X,P> {
class TrainableDTree2DModel : public DTree2DModel<Y,X,P> {
private:
// Type members...
typedef typename X::ElementType B;
@ -485,7 +485,7 @@ void TrainableDTree2DModel<Y,X,P>::train ( List<Joint2DRV<X,Y> >& lxy, const De
////////////////////////////////////////////////////////////////////////////////
template<class Y, class X1, class X2, class P>
class TrainableDTree3DModel : public DTree3DModel<Y,X1,X2,P> {
class TrainableDTree3DModel : public DTree3DModel<Y,X1,X2,P> {
private:


@ -34,7 +34,7 @@ class Matrix : public SafeArray2D<Id<int>,Id<int>,T> {
Matrix ( ) : SafeArray2D<Id<int>,Id<int>,T>( ) { }//{ xSize=0; ySize=0; }
Matrix (int x, int y) : SafeArray2D<Id<int>,Id<int>,T>(x,y) { }//{ xSize=x; ySize=y; }
Matrix (int x, int y, const T& t) : SafeArray2D<Id<int>,Id<int>,T>(x,y,t) { }//{ xSize=x; ySize=y; }
Matrix (const Matrix& a) : SafeArray2D<Id<int>,Id<int>,T>(a.xSize(),a.ySize()) { //xSize=a.xSize; ySize=a.ySize;
Matrix (const Matrix& a) : SafeArray2D<Id<int>,Id<int>,T>(a.xSize(),a.ySize()) { //xSize=a.xSize; ySize=a.ySize;
for(int i=0;i<xSize();i++) for(int j=0;j<ySize();j++) this->set(i,j)=a.get(i,j); }
// Specification methods...
//Matrix& operator= ( const Matrix<T>& sat )
@ -195,34 +195,34 @@ class Matrix : public SafeArray2D<Id<int>,Id<int>,T> {
}
return false;
}
bool operator== ( const Matrix<T>& a ) const {
bool operator== ( const Matrix<T>& a ) const {
if (xSize()!=a.xSize() || ySize()!=a.ySize()) return false;
for (int i=0;i<a.xSize();i++)
for (int i=0;i<a.xSize();i++)
for (int j=0;j<a.ySize();j++)
if (this->get(Id<int>(i),Id<int>(j))!=a.get(Id<int>(i),Id<int>(j))) return false;
return true;
}
// Input/output methods...
friend ostream& operator<< ( ostream& os, const Matrix<T>& a ) {
friend ostream& operator<< ( ostream& os, const Matrix<T>& a ) {
os<<"\n ";
for (int i=0;i<a.xSize();i++) {
for (int j=0;j<a.ySize();j++) {
os<<((j==0)?"":",")<<a.get(Id<int>(i),Id<int>(j));
}
}
os<<(i==a.xSize()-1?"\n":"\n ");
}
return os;
return os;
}
friend String& operator<< ( String& str, const Matrix<T>& a ) {
friend String& operator<< ( String& str, const Matrix<T>& a ) {
str<<"\n ";
for (int i=0;i<a.xSize();i++) {
for (int j=0;j<a.ySize();j++) {
str<<((j==0)?"":",")<<a.get(Id<int>(i),Id<int>(j));
}
}
str<<";";
}
return str;
return str;
}
string getString( ) const;
@ -234,7 +234,7 @@ string Matrix<T>::getString() const {
for (int j=0;j<ySize();j++) {
str += ((j==0)?"":",");
str += this->get(Id<int>(i),Id<int>(j));
}
}
str += ";";
}
return str;


@ -43,7 +43,7 @@ static const PDFVal VARIANCE_THRESHOLD = 0.01; //0.0001; //0
//
////////////////////////////////////////////////////////////////////////////////
template <class Y>
template <class Y>
class DiagGauss1DModel : public Generic1DModel<Y,PDFVal> {
private:
// Member variables...
@ -53,7 +53,7 @@ class DiagGauss1DModel : public Generic1DModel<Y,PDFVal> {
SimpleHash<Id<int>,PDFVal> aMeans;
SimpleHash<Id<int>,PDFVal> aVariances;
PDFVal prInvRootNormVariances;
PDFVal prProduct;
PDFVal prProduct;
SimpleHash<Id<int>,PDFVal> algprNegHalfInvVariances;
public:
// Constructor / destructor methods...
@ -78,7 +78,7 @@ class DiagGauss1DModel : public Generic1DModel<Y,PDFVal> {
};
////////////////////////////////////////
template <class Y>
template <class Y>
inline void DiagGauss1DModel<Y>::precomputeVarianceTerms ( ) {
// Inverse square root of norm of variances...
setInvRootNormVar() = 1.0;
@ -92,7 +92,7 @@ inline void DiagGauss1DModel<Y>::precomputeVarianceTerms ( ) {
}
////////////////////////////////////////
template <class Y>
template <class Y>
inline PDFVal DiagGauss1DModel<Y>::getProb ( const Y& y ) const {
// fprintf(stderr,"--------------------\n");
// y.write(stderr);
@ -109,7 +109,7 @@ inline PDFVal DiagGauss1DModel<Y>::getProb ( const Y& y ) const {
}
////////////////////////////////////////
template <class Y>
template <class Y>
bool DiagGauss1DModel<Y>::readFields ( char* as[], int numFields ) {
if ( 0==strcmp(as[1],"m") && numFields>2 ) {
char* psT;
@ -126,12 +126,12 @@ bool DiagGauss1DModel<Y>::readFields ( char* as[], int numFields ) {
}
////////////////////////////////////////
template <class Y>
template <class Y>
void DiagGauss1DModel<Y>::writeFields ( FILE* pf, const string& sPref ) const {
fprintf(pf,"%s m = ",sPref.c_str());
for(int i=0; i<getNumFeats(); i++) fprintf(pf,"%s%f",(0==i)?"":"_",getMean(i));
fprintf ( pf, "\n" ) ;
fprintf(pf,"%s v = ",sPref.c_str());
for(int i=0; i<getNumFeats(); i++) fprintf(pf,"%s%f",(0==i)?"":"_",getVariance(i));
fprintf ( pf, "\n" ) ;
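The cached terms named above (prInvRootNormVariances, algprNegHalfInvVariances) reflect the usual diagonal-Gaussian factorization: the normalizing constant and the per-feature -1/(2*variance) factors do not depend on the observation, so they only need to be recomputed when the variances change. A minimal standalone sketch of that idea, using plain doubles instead of this library's PDFVal/SimpleHash types (my own simplification, not code from this commit):

#include <cmath>
#include <cstddef>
#include <cstdio>
#include <vector>

// Density of a diagonal Gaussian: prod_i N(y_i; mean_i, var_i).
struct DiagGaussSketch {
  std::vector<double> mean, negHalfInvVar;
  double invRootNorm;  // 1 / sqrt((2*pi)^d * prod var_i), cached once

  void precompute(const std::vector<double>& var) {
    const double kPi = 3.14159265358979323846;
    negHalfInvVar.clear();
    invRootNorm = 1.0;
    for (double v : var) {
      invRootNorm /= std::sqrt(2.0 * kPi * v);
      negHalfInvVar.push_back(-0.5 / v);  // per-feature exponent factor
    }
  }

  double pdf(const std::vector<double>& y) const {
    double expo = 0.0;
    for (std::size_t i = 0; i < y.size(); ++i) {
      double d = y[i] - mean[i];
      expo += negHalfInvVar[i] * d * d;
    }
    return invRootNorm * std::exp(expo);
  }
};

int main() {
  DiagGaussSketch g;
  g.mean = {0.0, 1.0};
  g.precompute({1.0, 4.0});
  std::printf("%g\n", g.pdf({0.0, 1.0}));  // density at the mean: ~0.0795775
  return 0;
}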
@ -141,7 +141,7 @@ void DiagGauss1DModel<Y>::writeFields ( FILE* pf, const string& sPref ) const {
////////////////////////////////////////////////////////////////////////////////
/*
template <class Y,class X>
template <class Y,class X>
class DiagGauss2DModel : public Generic2DModel<Y,X,PDFVal> {
private:
// Member variables...
@ -177,7 +177,7 @@ class DiagGauss2DModel : public Generic2DModel<Y,X,PDFVal> {
////////////////////////////////////////////////////////////////////////////////
template <class Y,class X1,class X2>
template <class Y,class X1,class X2>
class DiagGauss3DModel : public Generic3DModel<Y,X1,X2,PDFVal> {
private:
// Member variables...
@ -220,7 +220,7 @@ class DiagGauss3DModel : public Generic3DModel<Y,X1,X2,PDFVal> {
//
////////////////////////////////////////////////////////////////////////////////
template <class Y>
template <class Y>
class TrainableDiagGauss1DModel : public DiagGauss1DModel<Y> {
public:
TrainableDiagGauss1DModel ( ) : DiagGauss1DModel<Y>() { }


@ -54,7 +54,7 @@ class SimpleHash : public hash_map<X,Y,SimpleHashFn<X>,SimpleHashEqual<X> > /*pu
// tr1::unordered_map<X,Y,SimpleHashFn<X>,SimpleHashEqual<X> > mxy;
static const Y yDummy;
//static Y yNonconstDummy;
public:
// typedef typename OrigHash::const_iterator const_iterator;
// typedef typename OrigHash::iterator iterator;


@ -209,7 +209,7 @@ template <class MY, class MX, class S, class B>
void HMM<MY,MX,S,B>::debugPrint() const{
for (int frame=0, numFrames=aatnTrellis.getxSize(); frame<numFrames; frame++) {
for (int beamIndex=0, beamSize=aatnTrellis.getySize(); beamIndex<beamSize; beamIndex++) {
if (aatnTrellis.get(frame,beamIndex).getLogProb().toDouble() > 0) {
@ -306,7 +306,7 @@ void HMM<MY,MX,S,B>::updateRanked ( const typename MX::RandVarType& x, bool b1 )
// Add best transition (top of queue)...
//mx.getProb(o,my.setTrellDat(ashpiQueue.getTop().first,ashpiQueue.getTop().second));
if ( ashpiQueue.getSize() > 0 ) {
S s; my.setTrellDat(s,ashpiQueue.getTop().second);
S s; my.setTrellDat(s,ashpiQueue.getTop().second);
bFull |= btn.tryAdd ( s, IB(ashpiQueue.getTop().first,my.setBackDat(ashpiQueue.getTop().second)), ashpiQueue.getTop().third );
////cerr<<ashpiQueue.getSize()<<" queue elems A "<<ashpiQueue.getTop()<<"\n";
////cerr<<"/-----A-----\\\n"<<ashpiQueue<<"\\-----A-----/\n";
@ -379,7 +379,7 @@ void HMM<MY,MX,S,B>::updateSerial ( const typename MX::RandVarType& x ) {
// Incorporate into trellis...
btn.tryAdd ( s, IB(i,my.setBackDat(y)), lgprFull );
//if(OUTPUT_VERYNOISY)
// fprintf ( stderr," (S_t-1:[e^%0.6f] * Y:e^%0.6f * X:e^%0.6f = S_t:[e^%0.6f])\n",
// fprintf ( stderr," (S_t-1:[e^%0.6f] * Y:e^%0.6f * X:e^%0.6f = S_t:[e^%0.6f])\n",
// float(aatnTrellis.get(frameLast-1,i).getLogProb().toInt())/100.0,
// float(lgprY.toInt())/100.0,
// float(lgprX.toInt())/100.0,
@ -389,7 +389,7 @@ void HMM<MY,MX,S,B>::updateSerial ( const typename MX::RandVarType& x ) {
}
// for(int i=0;i<BEAM_WIDTH;i++) {
// fprintf(stderr,"> "); btn.get(i)->first.write(stderr); fprintf(stderr,"\n");
// fprintf(stderr,"> "); btn.get(i)->first.write(stderr); fprintf(stderr,"\n");
// }
btn.sort(atnSorted);
@ -429,8 +429,8 @@ void HMM<MY,MX,S,B>::each ( const typename MX::RandVarType& x, Beam<LogProb,S,IB
const TrellNode<S,B>& tnsbPrev = aatnTrellis.get(frameLast-1,i);
// If prob still not below beam minimum...
if ( tnsbPrev.getLogProb() > btn.getMin().getScore() ) {
//if (OUTPUT_VERYNOISY) { fprintf(stderr,"FROM: "); tnsbPrev.getId().write(stderr); fprintf(stderr,"\n"); }
//if (OUTPUT_VERYNOISY) { fprintf(stderr,"FROM: "); tnsbPrev.getId().write(stderr); fprintf(stderr,"\n"); }
// For each possible transition...
const S& sPrev = tnsbPrev.getId();
typename MY::IterVal y;
@ -447,7 +447,7 @@ void HMM<MY,MX,S,B>::each ( const typename MX::RandVarType& x, Beam<LogProb,S,IB
lgprX = mx.getProb(x,my.setTrellDat(s,y)); if ( !OUTPUT_VERYNOISY && LogProb()==lgprX ) continue;
#endif /////////////////////////////////////////////////////////////////
lgprFull = tnsbPrev.getLogProb() * lgprY * lgprX;
if (OUTPUT_VERYNOISY) {
if (OUTPUT_VERYNOISY) {
boost::mutex::scoped_lock lock1(mutexHmmParanoiaLock);
//fprintf(stderr," TO: "); y.write(stderr); fprintf(stderr,"\n");
cout<<" "<<tnsbPrev.getId()<<" ==("<<tnsbPrev.getLogProb().toInt()<<"*"<<lgprY.toInt()<<"*"<<lgprX.toInt()<<"="<<lgprFull.toInt()<<")==> "<<y<<"\n";
@ -459,7 +459,7 @@ void HMM<MY,MX,S,B>::each ( const typename MX::RandVarType& x, Beam<LogProb,S,IB
// Incorporate into trellis...
btn.tryAdd ( s, IB(i,my.setBackDat(y)), lgprFull );
// if(OUTPUT_VERYNOISY)
// fprintf ( stderr," (S_t-1:[e^%0.6f] * Y:e^%0.6f * X:e^%0.6f = S_t:[e^%0.6f])\n",
// fprintf ( stderr," (S_t-1:[e^%0.6f] * Y:e^%0.6f * X:e^%0.6f = S_t:[e^%0.6f])\n",
// float(aatnTrellis.get(frameLast-1,i).getLogProb().toInt())/100.0,
// float(lgprY.toInt())/100.0,
// float(lgprO.toInt())/100.0,
@ -695,7 +695,7 @@ std::list<string> HMM<MY,MX,S,B>::getMLS(const S& sLast) const {
//// sprintf(tmp,"HYPOTH %04d> ", fr-1);
//// string tString(tmp);
//// tString +=
string tString =
string tString =
//// aatnTrellis.get(fr,iBest).getId().getString() + " " +
aatnTrellis.get(fr,iBest).getBackData().getString()
//// + "\n"
@ -737,7 +737,7 @@ template <class MY, class MX, class S, class B>
void HMM<MY,MX,S,B>::writeCurr ( ostream& os, int f=-1 ) const {
if ( -1==f ) f=frameLast;
if ( 0<=f && f<=frameLast )
for ( int i=0; i<BEAM_WIDTH; i++ )
for ( int i=0; i<BEAM_WIDTH; i++ )
if(!(aatnTrellis.get(f,i).getLogProb() == LogProb())){
//fprintf(pf,"at f=%04d b=%04d: ",f,i);
os<<"at "<<std::setfill('0')<<std::setw(4)<<f<<" "<<std::setw(4)<<i<<": ";
@ -765,7 +765,7 @@ void HMM<MY,MX,S,B>::writeCurrSum ( FILE* pf, int f=-1 ) const {
if ( 0<=f && f<=frameLast ) {
LogProb sum = 0.0;
LogProb logtop = 0.0;
for ( int i=0; i<BEAM_WIDTH; i++ )
for ( int i=0; i<BEAM_WIDTH; i++ )
if(!(aatnTrellis.get(f,i).getLogProb() == LogProb())){
if(i==0) { logtop=aatnTrellis.get(f,i).getLogProb(); }
LogProb big1 = sum - logtop;
@ -818,7 +818,7 @@ void HMM<MY,MX,S,B>::gatherElementsInBeam( SafeArray1D<Id<int>,pair<S,LogProb> >
result->init(BEAM_WIDTH);
if ( -1==f ) f=frameLast;
if ( 0<=f && f<=frameLast ) {
for ( int i=0; i<BEAM_WIDTH && &(aatnTrellis.get(f,i))!=NULL; i++ ) {
for ( int i=0; i<BEAM_WIDTH && &(aatnTrellis.get(f,i))!=NULL; i++ ) {
result->set(i).first = aatnTrellis.get(f,i).getId();
result->set(i).second = aatnTrellis.get(f,i).getLogProb();
}
@ -836,7 +836,7 @@ void HMM<MY,MX,S,B>::writeCurrEntropy ( FILE* pf, int f=-1 ) const {
if ( 0<=f && f<=frameLast ) {
LogProb logh = 0.0;
LogProb logtop = 0.0;
for ( int i=0; i<BEAM_WIDTH; i++ )
for ( int i=0; i<BEAM_WIDTH; i++ )
if(!(aatnTrellis.get(f,i).getLogProb() == LogProb())){
if(i==0) { logtop=aatnTrellis.get(f,i).getLogProb(); }
LogProb big1 = logh - logtop;
@ -862,12 +862,12 @@ void HMM<MY,MX,S,B>::writeCurrDepths ( FILE* pf, int f=-1 ) const {
Array<int> depths = Array<int>();
Array<LogProb> logprobs = Array<LogProb>();
double avgdepth = 0.0;
for ( int i=0; i<BEAM_WIDTH; i++ )
for ( int i=0; i<BEAM_WIDTH; i++ )
if(!(aatnTrellis.get(f,i).getLogProb() == LogProb())){
if(i==0) { logtop=aatnTrellis.get(f,i).getLogProb(); }
logprobs.set(i) = aatnTrellis.get(f,i).getLogProb();
// loop over values in S node to find lowest meaningful depth
for ( int j=0; j<aatnTrellis.get(f,i).getId().first.getSize(); j++) {
// store the depth, if it's equal to G_BOT/G_BOT
@ -996,7 +996,7 @@ int HMM<MY,MX,S,B>::getBeamUsed ( int f=-1 ) const {
if ( -1==f ) f=frameLast;
int ctr=0;
if ( 0<=f && f<=frameLast )
for ( int i=0; i<BEAM_WIDTH; i++ )
for ( int i=0; i<BEAM_WIDTH; i++ )
if(!(aatnTrellis.get(f,i).getLogProb() == LogProb())){
ctr++;
}
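The updateSerial()/each() loops in this file implement a beam-pruned Viterbi step: every surviving state of the previous frame proposes successors, each successor is scored by transition times observation probability, and only the best BEAM_WIDTH hypotheses are kept in the new trellis column. A self-contained sketch of that pattern with ordinary STL containers (the names and the log-space doubles are my own, not this library's Beam/TrellNode API):

#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <map>
#include <vector>

struct Hyp { int state; double logProb; int backPtr; };

static bool hypBetter(const Hyp& a, const Hyp& b) { return a.logProb > b.logProb; }

// One beam update: prev holds surviving hypotheses of the last frame,
// logTrans[s][s'] the transition scores, logObs[s'] the observation score.
// Keep the best incoming path per successor state, then prune to beamWidth.
std::vector<Hyp> beamStep(const std::vector<Hyp>& prev,
                          const std::vector<std::vector<double> >& logTrans,
                          const std::vector<double>& logObs,
                          std::size_t beamWidth) {
  std::map<int, Hyp> best;
  for (std::size_t i = 0; i < prev.size(); ++i) {
    const std::vector<double>& row = logTrans[prev[i].state];
    for (int s = 0; s < static_cast<int>(row.size()); ++s) {
      double lp = prev[i].logProb + row[s] + logObs[s];
      std::map<int, Hyp>::iterator it = best.find(s);
      if (it == best.end() || lp > it->second.logProb) {
        Hyp h = { s, lp, static_cast<int>(i) };  // backPtr records the predecessor
        best[s] = h;
      }
    }
  }
  std::vector<Hyp> out;
  for (std::map<int, Hyp>::iterator it = best.begin(); it != best.end(); ++it)
    out.push_back(it->second);
  std::sort(out.begin(), out.end(), hypBetter);
  if (out.size() > beamWidth) out.resize(beamWidth);
  return out;
}

int main() {
  std::vector<Hyp> prev(1);
  prev[0].state = 0; prev[0].logProb = 0.0; prev[0].backPtr = -1;
  std::vector<std::vector<double> > logTrans(2, std::vector<double>(2, -1.0));
  std::vector<double> logObs(2, -0.5);
  std::vector<Hyp> cur = beamStep(prev, logTrans, logObs, 1);
  std::printf("kept state %d with score %g\n", cur[0].state, cur[0].logProb);
  return 0;
}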


@ -269,7 +269,7 @@ void HMM<MH,MO,X,B>::updateRanked ( const typename MO::RandVarType& o ) {
// Add best transition (top of queue)...
//mo.getProb(o,mh.setTrellDat(axhpiQueue.getTop().first,axhpiQueue.getTop().second));
if ( axhpiQueue.getSize() > 0 ) {
X x; mh.setTrellDat(x,axhpiQueue.getTop().second);
X x; mh.setTrellDat(x,axhpiQueue.getTop().second);
bFull |= btn.tryAdd ( x, IB(axhpiQueue.getTop().first,mh.setBackDat(axhpiQueue.getTop().second)), axhpiQueue.getTop().third );
//cerr<<axhpiQueue.getSize()<<" queue elems A "<<axhpiQueue.getTop()<<"\n";
//cerr<<"/-----A-----\\\n + bFull: "<<bFull<<"\naxhpiQueue: \n"<<axhpiQueue<<"\\-----A-----/\n";
@ -341,7 +341,7 @@ void HMM<MH,MO,X,B>::updateSerial ( const typename MO::RandVarType& o ) {
// Incorporate into trellis...
btn.tryAdd ( x, IB(i,mh.setBackDat(h)), lgprFull );
//if(OUTPUT_VERYNOISY)
// fprintf ( stderr," (X_t-1:[e^%0.6f] * H:e^%0.6f * O:e^%0.6f = X_t:[e^%0.6f])\n",
// fprintf ( stderr," (X_t-1:[e^%0.6f] * H:e^%0.6f * O:e^%0.6f = X_t:[e^%0.6f])\n",
// float(aatnTrellis.get(frameLast-1,i).getLogProb().toInt())/100.0,
// float(lgprH.toInt())/100.0,
// float(lgprO.toInt())/100.0,
@ -351,7 +351,7 @@ void HMM<MH,MO,X,B>::updateSerial ( const typename MO::RandVarType& o ) {
}
// for(int i=0;i<BEAM_WIDTH;i++) {
// fprintf(stderr,"> "); btn.get(i)->first.write(stderr); fprintf(stderr,"\n");
// fprintf(stderr,"> "); btn.get(i)->first.write(stderr); fprintf(stderr,"\n");
// }
btn.sort(atnSorted);
@ -390,8 +390,8 @@ void HMM<MH,MO,X,B>::each ( const typename MO::RandVarType& o, Beam<LogProb,X,IB
const TrellNode<X,B>& tnxbPrev = aatnTrellis.get(frameLast-1,i);
// If prob still not below beam minimum...
if ( tnxbPrev.getLogProb() > btn.getMin().getScore() ) {
//if (OUTPUT_VERYNOISY) { fprintf(stderr,"FROM: "); tnxbPrev.getId().write(stderr); fprintf(stderr,"\n"); }
//if (OUTPUT_VERYNOISY) { fprintf(stderr,"FROM: "); tnxbPrev.getId().write(stderr); fprintf(stderr,"\n"); }
// For each possible transition...
const X& xPrev = tnxbPrev.getId();
typename MH::IterVal h;
@ -408,7 +408,7 @@ void HMM<MH,MO,X,B>::each ( const typename MO::RandVarType& o, Beam<LogProb,X,IB
lgprO = mo.getProb(o,mh.setTrellDat(x,h)); if ( !OUTPUT_VERYNOISY && LogProb()==lgprO ) continue;
#endif /////////////////////////////////////////////////////////////////
lgprFull = tnxbPrev.getLogProb() * lgprH * lgprO;
if (OUTPUT_VERYNOISY) {
if (OUTPUT_VERYNOISY) {
boost::mutex::scoped_lock lock1(mutexHmmParanoiaLock);
//fprintf(stderr," TO: "); h.write(stderr); fprintf(stderr,"\n");
cout<<" "<<tnxbPrev.getId()<<" ==("<<tnxbPrev.getLogProb().toInt()<<"*"<<lgprH.toInt()<<"*"<<lgprO.toInt()<<"="<<lgprFull.toInt()<<")==> "<<h<<"\n";
@ -420,7 +420,7 @@ void HMM<MH,MO,X,B>::each ( const typename MO::RandVarType& o, Beam<LogProb,X,IB
// Incorporate into trellis...
btn.tryAdd ( x, IB(i,mh.setBackDat(h)), lgprFull );
// if(OUTPUT_VERYNOISY)
// fprintf ( stderr," (X_t-1:[e^%0.6f] * H:e^%0.6f * O:e^%0.6f = X_t:[e^%0.6f])\n",
// fprintf ( stderr," (X_t-1:[e^%0.6f] * H:e^%0.6f * O:e^%0.6f = X_t:[e^%0.6f])\n",
// float(aatnTrellis.get(frameLast-1,i).getLogProb().toInt())/100.0,
// float(lgprH.toInt())/100.0,
// float(lgprO.toInt())/100.0,
@ -656,7 +656,7 @@ std::list<string> HMM<MH,MO,X,B>::getMLS(const X& xLast) const {
//// sprintf(tmp,"HYPOTH %04d> ", fr-1);
//// string tString(tmp);
//// tString +=
string tString =
string tString =
//// aatnTrellis.get(fr,iBest).getId().getString() + " " +
aatnTrellis.get(fr,iBest).getBackData().getString()
//// + "\n"
@ -697,7 +697,7 @@ template <class MH, class MO, class X, class B>
void HMM<MH,MO,X,B>::writeCurr ( FILE* pf, int f=-1 ) const {
if ( -1==f ) f=frameLast;
if ( 0<=f && f<=frameLast )
for ( int i=0; i<BEAM_WIDTH; i++ )
for ( int i=0; i<BEAM_WIDTH; i++ )
if(!(aatnTrellis.get(f,i).getLogProb() == LogProb())){
fprintf(pf,"at f=%04d b=%04d: ",f,i);
String str; str<<aatnTrellis.get(f,i).getId(); //.write(pf);
@ -721,7 +721,7 @@ void HMM<MH,MO,X,B>::writeCurrSum ( FILE* pf, int f=-1 ) const {
if ( 0<=f && f<=frameLast ) {
LogProb sum = 0.0;
LogProb logtop = 0.0;
for ( int i=0; i<BEAM_WIDTH; i++ )
for ( int i=0; i<BEAM_WIDTH; i++ )
if(!(aatnTrellis.get(f,i).getLogProb() == LogProb())){
if(i==0) { logtop=aatnTrellis.get(f,i).getLogProb(); }
LogProb big1 = sum - logtop;
@ -741,7 +741,7 @@ void HMM<MH,MO,X,B>::writeCurrEntropy ( FILE* pf, int f=-1 ) const {
if ( 0<=f && f<=frameLast ) {
LogProb logh = 0.0;
LogProb logtop = 0.0;
for ( int i=0; i<BEAM_WIDTH; i++ )
for ( int i=0; i<BEAM_WIDTH; i++ )
if(!(aatnTrellis.get(f,i).getLogProb() == LogProb())){
if(i==0) { logtop=aatnTrellis.get(f,i).getLogProb(); }
LogProb big1 = logh - logtop;
@ -768,12 +768,12 @@ void HMM<MH,MO,X,B>::writeCurrDepths ( FILE* pf, int f=-1 ) const {
Array<int> depths = Array<int>();
Array<LogProb> logprobs = Array<LogProb>();
double avgdepth = 0.0;
for ( int i=0; i<BEAM_WIDTH; i++ )
for ( int i=0; i<BEAM_WIDTH; i++ )
if(!(aatnTrellis.get(f,i).getLogProb() == LogProb())){
if(i==0) { logtop=aatnTrellis.get(f,i).getLogProb(); }
logprobs.set(i) = aatnTrellis.get(f,i).getLogProb();
// loop over values in S node to find lowest meaningful depth
for ( int j=0; j<aatnTrellis.get(f,i).getId().first.getSize(); j++) {
// store the depth, if it's equal to G_BOT/G_BOT
@ -900,7 +900,7 @@ int HMM<MH,MO,X,B>::getBeamUsed ( int f=-1 ) const {
if ( -1==f ) f=frameLast;
int ctr=0;
if ( 0<=f && f<=frameLast )
for ( int i=0; i<BEAM_WIDTH; i++ )
for ( int i=0; i<BEAM_WIDTH; i++ )
if(!(aatnTrellis.get(f,i).getLogProb() == LogProb())){
ctr++;
}


@ -348,7 +348,7 @@ const TrellNode<S,B>& HMMLoop<MY,MX,S,B>::update ( const typename MX::RandVarTyp
//modX.getProb(o,modY.setTrellDat(ashpiQueue.getTop().first,ashpiQueue.getTop().second));
if ( ashpiQueue.getSize() > 0 ) {
S s ( ashpiQueue.getTop().second );
////S s; modY.setTrellDat(s,ashpiQueue.getTop().second);
////S s; modY.setTrellDat(s,ashpiQueue.getTop().second);
bFull |= btn.tryAdd ( s, IB(ashpiQueue.getTop().first,B(ashpiQueue.getTop().second)), ashpiQueue.getTop().third );
////cerr<<ashpiQueue.getSize()<<" queue elems A "<<ashpiQueue.getTop()<<"\n";
////cerr<<"/-----A-----\\\n"<<ashpiQueue<<"\\-----A-----/\n";


@ -90,8 +90,8 @@ class Vector : public X {
Vector<X> operator- ( ElementType d ) const { Vector<X> vO; for(uint i=0;i<SIZE;i++) vO[i] = X::get(i)-d; return vO; }
friend Vector<X> operator* ( ElementType d, const Vector<X>& v ) { Vector<X> vO; for(uint i=0;i<SIZE;i++) vO[i] = d*v[i]; return vO; }
friend Vector<X> operator/ ( ElementType d, const Vector<X>& v ) { Vector<X> vO; for(uint i=0;i<SIZE;i++) vO[i] = d/v[i]; return vO; }
friend Vector<X> operator+ ( ElementType d, const Vector<X>& v ) { Vector<X> vO; for(uint i=0;i<SIZE;i++) vO[i] = d+v[i]; return vO; }
friend Vector<X> operator- ( ElementType d, const Vector<X>& v ) { Vector<X> vO; for(uint i=0;i<SIZE;i++) vO[i] = d-v[i]; return vO; }
friend Vector<X> operator+ ( ElementType d, const Vector<X>& v ) { Vector<X> vO; for(uint i=0;i<SIZE;i++) vO[i] = d+v[i]; return vO; }
friend Vector<X> operator- ( ElementType d, const Vector<X>& v ) { Vector<X> vO; for(uint i=0;i<SIZE;i++) vO[i] = d-v[i]; return vO; }
Vector<X>& operator*= ( ElementType d ) { for(uint i=0;i<SIZE;i++) X::set(i)*=d; return *this; }
Vector<X>& operator/= ( ElementType d ) { for(uint i=0;i<SIZE;i++) X::set(i)/=d; return *this; }
Vector<X>& operator+= ( ElementType d ) { for(uint i=0;i<SIZE;i++) X::set(i)+=d; return *this; }


@ -97,7 +97,7 @@ class Mixture3DModel : public Generic2DModel<Y,X,Prob> {
//
////////////////////////////////////////////////////////////////////////////////
template <template <class MY> class M,class Y,class C>
template <template <class MY> class M,class Y,class C>
class TrainableMixture2DModel : public Mixture2DModel<M,Y,C> {
// private:
// LogPDFVal logpdfPrevDataAvg;
@ -110,7 +110,7 @@ class TrainableMixture2DModel : public Mixture2DModel<M,Y,C> {
};
////////////////////////////////////////
template <template <class MY> class M,class Y,class C>
template <template <class MY> class M,class Y,class C>
void TrainableMixture2DModel<M,Y,C>::updateFields ( const List<Joint2DRV<Y,Prob> >& lyp, const PDFVal WEIGHT_LIMIT, bool& bShouldStop ) {
LogPDFVal logpdfData = 0.0;
CPT1DModel<C,Prob> mprPseudoEmpC; // pseudo-empirical prob marginal
@ -178,7 +178,7 @@ void TrainableMixture2DModel<M,Y,C>::updateFields ( const List<Joint2DRV<Y,Prob>
}
////////////////////////////////////////
template <template <class MY> class M,class Y,class C>
template <template <class MY> class M,class Y,class C>
void TrainableMixture2DModel<M,Y,C>::train ( List<Joint2DRV<Y,Prob> >& lyp, const int EPOCH_LIMIT, const PDFVal WEIGHT_LIMIT ) {
// Normalize model...
@ -204,7 +204,7 @@ void TrainableMixture2DModel<M,Y,C>::train ( List<Joint2DRV<Y,Prob> >& lyp, cons
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
template <template <class MY> class M,class Y,class X,class C>
template <template <class MY> class M,class Y,class X,class C>
class TrainableMixture3DModel : public Generic2DModel<Y,X,C> {
private:
string sId;
@ -225,7 +225,7 @@ class TrainableMixture3DModel : public Generic2DModel<Y,X,C> {
};
////////////////////////////////////////
template <template <class MY> class M,class Y,class X,class C>
template <template <class MY> class M,class Y,class X,class C>
void TrainableMixture3DModel<M,Y,X,C>::train ( const int EPOCH_LIMIT, const PDFVal WEIGHT_LIMIT ) {
// Update each subphone from list...
int ctr = 0;
@ -237,7 +237,7 @@ void TrainableMixture3DModel<M,Y,X,C>::train ( const int EPOCH_LIMIT, const PDFV
}
////////////////////////////////////////
template <template <class MY> class M,class Y,class X,class C>
template <template <class MY> class M,class Y,class X,class C>
void TrainableMixture3DModel<M,Y,X,C>::train ( const List<Joint3DRV<X,Y,Prob> >& lxyp, const int EPOCH_LIMIT, const PDFVal WEIGHT_LIMIT ) {
// Chop list into phone-specific sub-lists...
ListedObject<Joint3DRV<X,Y,Prob> >* pxyp;
@ -248,7 +248,7 @@ void TrainableMixture3DModel<M,Y,X,C>::train ( const List<Joint3DRV<X,Y,Prob> >&
}
////////////////////////////////////////
template <template <class MY> class M,class Y,class X,class C>
template <template <class MY> class M,class Y,class X,class C>
bool TrainableMixture3DModel<M,Y,X,C>::readData ( char* as[], int numFields ) {
if ( /*as[0]!=sId+"dat" ||*/ numFields!=3 ) return false;
alyp.set(X(as[1])).add() = Joint2DRV<Y,Prob>(Y(as[2]),Prob(1.0));
@ -256,7 +256,7 @@ bool TrainableMixture3DModel<M,Y,X,C>::readData ( char* as[], int numFields ) {
}
////////////////////////////////////////
template <template <class MY> class M,class Y,class X,class C>
template <template <class MY> class M,class Y,class X,class C>
void TrainableMixture3DModel<M,Y,X,C>::writeFields ( FILE* pf, string sPref ) {
X x; for ( bool b=x.setFirst(); b; b=x.setNext() ) {
am.get(x).writeFields(pf,sPref+" "+x.getString());


@ -37,7 +37,7 @@ void processModelFilePtr ( FILE* pf, bool rF(Array<char*>&) ) {
int i=0; int numFields=0; int c=' '; int line=1;
CONSUME_ALL(pf,c,WHITESPACE(c),line); // Get to first record
while ( c!=EOF ) { // For each record
if ( c=='#' ) CONSUME_ALL(pf, c, c!='\n' && c!='\0', line ) ; // If comment, consume
if ( c=='#' ) CONSUME_ALL(pf, c, c!='\n' && c!='\0', line ) ; // If comment, consume
else { // If no comment,
Array<char*> aps(100);
String psBuff(1000);
@ -49,7 +49,7 @@ void processModelFilePtr ( FILE* pf, bool rF(Array<char*>&) ) {
if (!z) break;
aps[i]=z;
}
if ( !rF(aps) ) // Try to process fields, else complain
fprintf( stderr, "\nERROR: %d %d-arg %s in line %d\n\n", numFields, aps.size(), aps[0], line);
}
@ -75,7 +75,7 @@ void processModelSocket ( const int tSockfd, int& c, bool rF(Array<char*>&) ) {
int i=0; int numFields=0; int line=1;
CONSUME_ALL_SOCKET(tSockfd,c,WHITESPACE(c),line); // Get to first record
while ( c!='\0' && c!='\5' ) { // For each record
if ( c=='#' ) CONSUME_ALL_SOCKET(tSockfd, c, (c!='\n' && c!='\0' && c!='\5'), line ) ; // If comment, consume
if ( c=='#' ) CONSUME_ALL_SOCKET(tSockfd, c, (c!='\n' && c!='\0' && c!='\5'), line ) ; // If comment, consume
else { // If no comment,
Array<char*> aps(100);
String psBuff(1000);
@ -88,7 +88,7 @@ void processModelSocket ( const int tSockfd, int& c, bool rF(Array<char*>&) ) {
if (!z) break;
aps[i]=z;
}
if ( !rF(aps) ) // Try to process fields, else complain
fprintf( stderr, "\nERROR: %d-arg %s in line %d\n\n", numFields, aps[0], line);
}
@ -97,7 +97,7 @@ void processModelSocket ( const int tSockfd, int& c, bool rF(Array<char*>&) ) {
}
void processModelSocket ( const int tSockfd, bool rF(Array<char*>&) ) {
int c=' ';
int c=' ';
processModelSocket ( tSockfd, c, rF );
}


@ -80,12 +80,12 @@ class binuint {
// Input / output methods...
friend StringInput operator>> ( StringInput si, binuint& i ) {
if(si==NULL) return si;
i.b=0;
i.b=0;
for ( char c=si[0]; '0'<=c && c<='1'; ++si,c=si[0])
{ i.b=i.b*2+c-'0'; }
return si; }
friend ostream& operator<< ( ostream& os, binuint i ) { for(int e=uint(log2(i.b));e>=0;e--)os <<((i.b>>e)%2); return os; }
friend String& operator<< ( String& str, binuint i ) { for(int e=uint(log2(i.b));e>=0;e--)str<<((i.b>>e)%2); return str; }
friend ostream& operator<< ( ostream& os, binuint i ) { for(int e=uint(log2(i.b));e>=0;e--)os <<((i.b>>e)%2); return os; }
friend String& operator<< ( String& str, binuint i ) { for(int e=uint(log2(i.b));e>=0;e--)str<<((i.b>>e)%2); return str; }
};
////////////////////////////////////////////////////////////////////////////////


@ -43,7 +43,7 @@ class Prob {
Prob ( ) { gVal = 0.0; }
Prob (double d) { gVal = d; }
Prob (const char* ps) { gVal = atof(ps); }
operator double() const { return gVal; }
double toDouble() const { return gVal; }
Prob& operator+= ( const Prob p ) { gVal += p.gVal; return *this; }
@ -54,7 +54,7 @@ class Prob {
friend ostream& operator<< ( ostream& os, const Prob& pr ) { return os<<pr.toDouble(); }
friend String& operator<< ( String& str, const Prob& pr ) { return str<<pr.toDouble(); }
friend pair<StringInput,Prob*> operator>> ( StringInput si, Prob& n ) { return pair<StringInput,Prob*>(si,&n); }
friend StringInput operator>> ( pair<StringInput,Prob*> si_n, const char* psDlm ) {
friend StringInput operator>> ( pair<StringInput,Prob*> si_n, const char* psDlm ) {
double d=0.0; StringInput si=si_n.first>>d>>psDlm; *si_n.second=Prob(d); return si; }
};
@ -129,7 +129,7 @@ class LogProb : public Id<int> {
friend ostream& operator<< ( ostream& os, const LogProb& lp ) { return os<<lp.toInt(); }
friend String& operator<< ( String& str, const LogProb& lp ) { return str<<lp.toInt(); }
friend pair<StringInput,LogProb*> operator>> ( StringInput si, LogProb& n ) { return pair<StringInput,LogProb*>(si,&n); }
friend StringInput operator>> ( pair<StringInput,LogProb*> si_n, const char* psDlm ) {
friend StringInput operator>> ( pair<StringInput,LogProb*> si_n, const char* psDlm ) {
double d=0.0; StringInput si=si_n.first>>d>>psDlm; *si_n.second=LogProb(d); return si; }
};


@ -33,7 +33,7 @@
//
////////////////////////////////////////////////////////////////////////////////
template<class Y,class P>
template<class Y,class P>
class Generic1DModel {
public:
typedef Y RVType;
@ -45,7 +45,7 @@ class Generic1DModel {
////////////////////////////////////////////////////////////
template<class Y,class X1,class P>
template<class Y,class X1,class P>
class Generic2DModel {
public:
typedef Y RVType;
@ -60,7 +60,7 @@ class Generic2DModel {
////////////////////////////////////////////////////////////
template<class Y,class X1,class X2,class P>
template<class Y,class X1,class X2,class P>
class Generic3DModel {
public:
typedef Y RVType;
@ -76,7 +76,7 @@ class Generic3DModel {
////////////////////////////////////////////////////////////
template<class Y,class X1,class X2,class X3,class P>
template<class Y,class X1,class X2,class X3,class P>
class Generic4DModel {
public:
typedef Y RVType;
@ -93,7 +93,7 @@ class Generic4DModel {
////////////////////////////////////////////////////////////
template<class Y,class X1,class X2,class X3,class X4,class P>
template<class Y,class X1,class X2,class X3,class X4,class P>
class Generic5DModel {
public:
typedef Y RVType;
@ -111,7 +111,7 @@ class Generic5DModel {
////////////////////////////////////////////////////////////
template<class Y,class X1,class X2,class X3,class X4,class X5,class P>
template<class Y,class X1,class X2,class X3,class X4,class X5,class P>
class Generic6DModel {
public:
typedef Y RVType;
@ -130,7 +130,7 @@ class Generic6DModel {
////////////////////////////////////////////////////////////
template<class Y,class X1,class X2,class X3,class X4,class X5,class X6,class P>
template<class Y,class X1,class X2,class X3,class X4,class X5,class X6,class P>
class Generic7DModel {
public:
typedef Y RVType;
@ -302,7 +302,7 @@ class Modeled5DRV : public M::RVType {
const typename M::Dep2Type& x2,
const typename M::Dep3Type& x3,
const typename M::Dep4Type& x4 ) const { return m.getProb(*this,x1,x2,x3,x4); }
};
///////////////////////////////////////////////////////////////////////////////
@ -346,7 +346,7 @@ class Modeled6DRV : public M::RVType {
const typename M::Dep3Type& x3,
const typename M::Dep4Type& x4,
const typename M::Dep5Type& x5 ) const { return m.getProb(*this,x1,x2,x3,x4,x5); }
};
///////////////////////////////////////////////////////////////////////////////
@ -395,7 +395,7 @@ class Modeled7DRV : public M::RVType {
const typename M::Dep4Type& x4,
const typename M::Dep5Type& x5,
const typename M::Dep6Type& x6 ) const { return m.getProb(*this,x1,x2,x3,x4,x5,x6); }
};
///////////////////////////////////////////////////////////////////////////////


@ -42,7 +42,7 @@ class GenericRACPTModel : public SimpleHash<K,P> {
return ( SimpleHash<K,P>::contains(k) );
}
/*
/*
P getProb ( const IterVal& ikyp, const K& k ) const {
if ( ikyp.iter.first == ikyp.iter.second ) { cerr<<"ERROR: no iterator to fix probability: "<<k<<endl; return P(); }
return ( ikyp.iter.first->second );
@ -91,7 +91,7 @@ class GenericRACPTModel : public SimpleHash<K,P> {
for ( typename HKP::const_iterator ik=HKP::begin(); ik!=HKP::end(); ik++ ) {
K k=ik->first;
os << psId<<" "<<k<<" = "<<getProb(k).toDouble()<<endl;
// IterVal y;
// for ( bool b=setFirst(y,k); b; b=setNext(y,k) )
// os<<psId<<" "<<k<<" : "<<y<<" = "<<getProb(y,k).toDouble()<<"\n";
@ -110,14 +110,14 @@ class GenericRACPTModel : public SimpleHash<K,P> {
friend pair<StringInput,GenericRACPTModel<K,P>*> operator>> ( StringInput si, GenericRACPTModel<K,P>& m ) {
return pair<StringInput,GenericRACPTModel<K,P>*>(si,&m); }
friend StringInput operator>> ( pair<StringInput,GenericRACPTModel<K,P>*> delimbuff, const char* psD ) {
K k;
StringInput si,si2,si3;
K k;
StringInput si,si2,si3;
GenericRACPTModel<K,P>& m = *delimbuff.second;
si=delimbuff.first;
if ( si==NULL ) return si;
// Kill the colon since we're treating the whole thing as the condition
char * str = si.c_str();
char * p = strchr(str, ':');
@ -125,17 +125,17 @@ class GenericRACPTModel : public SimpleHash<K,P> {
p[0] = ' ';
}
si=str;
while((si2=si>>" ")!=NULL)si=si2;
while((si2=si>>" ")!=NULL)si=si2;
si=si>>k>>" ";
while((si2=si>>" ")!=NULL)si=si2;
while((si2=si>>" ")!=NULL)si=si2;
si=si>>"= ";
while((si2=si>>" ")!=NULL)si=si2;
while((si2=si>>" ")!=NULL)si=si2;
return (si!=NULL) ? si>>m.setProb(k)>>psD : si;
}
};
template<class Y, class P>
template<class Y, class P>
class RandAccCPT1DModel : public GenericRACPTModel<MapKey1D<Y>,P> {
public:
// typedef typename GenericCPTModel<Y,MapKey1D<Unit>,P>::IterVal IterVal;
@ -170,7 +170,7 @@ P& setProb ( const Y& y ) {
////////////////////
template<class Y, class X1, class P>
template<class Y, class X1, class P>
class RandAccCPT2DModel : public GenericRACPTModel<MapKey2D<X1,Y>,P> {
public:
@ -187,7 +187,7 @@ class RandAccCPT2DModel : public GenericRACPTModel<MapKey2D<X1,Y>,P> {
P getProb ( const Y& y, const X1& x1 ) const {
return GenericRACPTModel<MapKey2D<X1,Y>,P>::getProb ( MapKey2D<X1,Y>(x1,y) );
}
/*
P& setProb ( const Y& y, const X1& x1 ) {
cerr << "setProb called on racpt2d" << endl;
@ -199,7 +199,7 @@ class RandAccCPT2DModel : public GenericRACPTModel<MapKey2D<X1,Y>,P> {
////////////////////
template<class Y, class X1, class X2, class P>
template<class Y, class X1, class X2, class P>
class RandAccCPT3DModel : public GenericRACPTModel<MapKey3D<X1,X2,Y>,P> {
public:
@ -219,7 +219,7 @@ class RandAccCPT3DModel : public GenericRACPTModel<MapKey3D<X1,X2,Y>,P> {
/*
////////////////////
template<class Y, class X1, class X2, class X3, class P>
template<class Y, class X1, class X2, class X3, class P>
class CPT4DModel : public GenericCPTModel<Y,MapKey3D<X1,X2,X3>,P> {
public:
typedef typename GenericCPTModel<Y,MapKey3D<X1,X2,X3>,P>::IterVal IterVal;
@ -256,7 +256,7 @@ class CPT4DModel : public GenericCPTModel<Y,MapKey3D<X1,X2,X3>,P> {
////////////////////
template<class Y, class X1, class X2, class X3, class X4, class P>
template<class Y, class X1, class X2, class X3, class X4, class P>
class CPT5DModel : public GenericCPTModel<Y,MapKey4D<X1,X2,X3,X4>,P> {
public:
typedef typename GenericCPTModel<Y,MapKey4D<X1,X2,X3,X4>,P>::IterVal IterVal;
@ -293,7 +293,7 @@ class CPT5DModel : public GenericCPTModel<Y,MapKey4D<X1,X2,X3,X4>,P> {
////////////////////
template<class Y, class X1, class X2, class X3, class X4, class X5, class P>
template<class Y, class X1, class X2, class X3, class X4, class X5, class P>
class RACPT6DModel : public GenericCPTModel<Y,MapKey5D<X1,X2,X3,X4,X5>,P> {
public:
typedef typename GenericCPTModel<Y,MapKey5D<X1,X2,X3,X4,X5>,P>::IterVal IterVal;


@ -129,7 +129,7 @@ class DiscreteDomainRV : public Id<T> {
friend pair<StringInput,DiscreteDomainRV<T,domain>*> operator>> ( const StringInput ps, DiscreteDomainRV<T,domain>& rv ) { return pair<StringInput,DiscreteDomainRV<T,domain>*>(ps,&rv); }
friend StringInput operator>> ( pair<StringInput,DiscreteDomainRV<T,domain>*> delimbuff, const char* psDlm ) {
if (StringInput(NULL)==delimbuff.first) return delimbuff.first;
////assert(*delimbuff.second<domain.getSize());
////assert(*delimbuff.second<domain.getSize());
int j=0;
StringInput psIn = delimbuff.first;
if(psDlm[0]=='\0') { *delimbuff.second=psIn.c_str(); return psIn+strlen(psIn.c_str()); }
@ -203,7 +203,7 @@ template <class T> const T RefRV<T>::DUMMY;
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////
template<class V1,class V2>
template<class V1,class V2>
class Joint2DRV {
public:
@ -216,7 +216,7 @@ class Joint2DRV {
Joint2DRV ( const V1& v1, const V2& v2 ) { first=v1; second=v2; }
// Extraction methods...
size_t getHashKey ( ) const { size_t k=rotLeft(first.getHashKey(),3); k^=second.getHashKey();
size_t getHashKey ( ) const { size_t k=rotLeft(first.getHashKey(),3); k^=second.getHashKey();
/*fprintf(stderr," (%d) %d ^& %d = %d\n",sizeof(*this),x1.getHashKey(),x2.getHashKey(),k);*/ return k; }
bool operator< ( const Joint2DRV<V1,V2>& j ) const { return ( (first<j.first) ||
(first==j.first && second<j.second) ); }
@ -276,7 +276,7 @@ class DelimitedJoint2DRV : public Joint2DRV<V1,V2> {
friend pair<StringInput,DelimitedJoint2DRV<SD1,V1,SD2,V2,SD3>*> operator>> ( StringInput ps, DelimitedJoint2DRV<SD1,V1,SD2,V2,SD3>& rv ) { return pair<StringInput,DelimitedJoint2DRV<SD1,V1,SD2,V2,SD3>*>(ps,&rv); }
friend StringInput operator>> ( pair<StringInput,DelimitedJoint2DRV<SD1,V1,SD2,V2,SD3>*> delimbuff, const char* psDlm ) {
if (StringInput(NULL)==delimbuff.first) return delimbuff.first;
return ( (SD3[0]=='\0') ? delimbuff.first>>SD1>>delimbuff.second->first>>SD2>>delimbuff.second->second>>psDlm
return ( (SD3[0]=='\0') ? delimbuff.first>>SD1>>delimbuff.second->first>>SD2>>delimbuff.second->second>>psDlm
: delimbuff.first>>SD1>>delimbuff.second->first>>SD2>>delimbuff.second->second>>SD3>>psDlm );
}
};
@ -290,7 +290,7 @@ class DelimitedJoint2DRV : public Joint2DRV<V1,V2> {
//
////////////////////////////////////////////////////////////////////////////////
template<class V1,class V2,class V3>
template<class V1,class V2,class V3>
class Joint3DRV {
public:
@ -361,7 +361,7 @@ class DelimitedJoint3DRV : public Joint3DRV<V1,V2,V3> {
return pair<StringInput,DelimitedJoint3DRV<SD1,V1,SD2,V2,SD3,V3,SD4>*>(ps,&rv); }
friend StringInput operator>> ( pair<StringInput,DelimitedJoint3DRV<SD1,V1,SD2,V2,SD3,V3,SD4>*> delimbuff, const char* psDlm ) {
if (StringInput(NULL)==delimbuff.first) return delimbuff.first;
return ( (SD4[0]=='\0') ? delimbuff.first>>SD1>>delimbuff.second->first>>SD2>>delimbuff.second->second>>SD3>>delimbuff.second->third>>psDlm
return ( (SD4[0]=='\0') ? delimbuff.first>>SD1>>delimbuff.second->first>>SD2>>delimbuff.second->second>>SD3>>delimbuff.second->third>>psDlm
: delimbuff.first>>SD1>>delimbuff.second->first>>SD2>>delimbuff.second->second>>SD3>>delimbuff.second->third>>SD4>>psDlm );
}
};
@ -453,7 +453,7 @@ class DelimitedJoint4DRV : public Joint4DRV<V1,V2,V3,V4> {
//
////////////////////////////////////////////////////////////////////////////////
template <int I, class T>
template <int I, class T>
class JointArrayRV {
private:
// Data members...
@ -491,7 +491,7 @@ class JointArrayRV {
////////////////////////////////////////////////////////////////////////////////
template <int I, char* SD, class T>
template <int I, char* SD, class T>
class DelimitedJointArrayRV : public JointArrayRV<I,T> {
public:
@ -569,7 +569,7 @@ class History {
/*
void read ( char* ps, const ReaderContext& rc=ReaderContext() ) { char* psT; for(int i=0;i<N;i++){char* z=strtok_r((0==i)?ps:NULL,";",&psT); assert(z); at.set(i).read(z);} }
//at.set(i).read(strtok_r((0==i)?ps:NULL,";",&psT)); }
*/
*/
friend ostream& operator<< ( ostream& os, const History<N,T>& a ) { for(int i=0;i<N;i++)os<<((i==0)?"":";")<<a.getBack(i); return os; }
friend pair<StringInput,History<N,T>*> operator>> ( StringInput ps, History<N,T>& a ) { return pair<StringInput,History<N,T>*>(ps,&a); }


@ -30,7 +30,7 @@
#include "nl-stream.h"
#include <iostream>
using namespace std;
using namespace std;
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
@ -39,7 +39,7 @@ using namespace std;
//
////////////////////////////////////////////////////////////////////////////////
template <int I, class T>
template <int I, class T>
class StaticSafeArray {
private:
// Data members...
@ -84,7 +84,7 @@ class StaticSafeArray {
////////////////////////////////////////////////////////////////////////////////
template <int I, char* SD, class T>
template <int I, char* SD, class T>
class DelimitedStaticSafeArray : public StaticSafeArray<I,T> {
public:
DelimitedStaticSafeArray ( ) : StaticSafeArray<I,T>() { }
@ -349,7 +349,7 @@ class SafeArray2D {
// Extraction methods...
const T& get (const X1& x,const X2& y) const { assert(at!=NULL);
assert(x.toInt()>=0); assert(x.toInt()<xSize);
assert(y.toInt()>=0);
assert(y.toInt()>=0);
//this assert failed when compile without -DNDEBUG (needed for debugging). Have to figure out why before adding this assert back in
//assert(y.toInt()<ySize);
return at[x.toInt()*ySize + y.toInt()];}
@ -423,7 +423,7 @@ class SafeArray4D {
{ delete[] at; wSize=sat.wSize; xSize=sat.xSize; ySize=sat.ySize;
zSize=sat.zSize; at=new T[wSize*xSize*ySize*zSize];
for(int i=0;i<wSize*xSize*ySize*zSize;i++) at[i]=sat.at[i]; return *this; }
void init (int w,int x,int y,int z)
void init (int w,int x,int y,int z)
{ delete[] at; wSize=w; xSize=x; ySize=y; zSize=z; at=new T[w*x*y*z]; }
void init (int w,int x,int y,int z,const T& t)
{ delete[] at; wSize=w; xSize=x; ySize=y; zSize=z; at=new T[w*x*y*z];
@ -472,7 +472,7 @@ class SafeArray5D {
{ delete[] at; vSize=sat.vSize; wSize=sat.wSize; xSize=sat.xSize;
ySize=sat.ySize; zSize=sat.zSize; at=new T[vSize*wSize*xSize*ySize*zSize];
for(int i=0;i<vSize*wSize*xSize*ySize*zSize;i++) at[i]=sat.at[i]; return *this; }
void init(int v,int w,int x,int y,int z)
void init(int v,int w,int x,int y,int z)
{ delete[] at; vSize=v; wSize=w; xSize=x; ySize=y; zSize=z; at=new T[v*w*x*y*z]; }
void init(int v,int w,int x,int y,int z,const T& t)
{ delete[] at; vSize=v; wSize=w; xSize=x; ySize=y; zSize=z; at=new T[v*w*x*y*z];


@ -86,7 +86,7 @@ class IStream {
friend ostream& operator<< ( ostream& os, const IStream& is ) { return os<<is.iIndex<<","<<is.psrc<<","<<*is.psrc; }
// Match single char...
friend IStream operator>> ( IStream is, char& c ) {
friend IStream operator>> ( IStream is, char& c ) {
// Propagate fail...
if (IStream()==is) return IStream();
c=is.get(is.iIndex);
@ -106,7 +106,7 @@ class IStream {
// Match anything else followed by zero-terminated string delimiter...
template<class X> friend pair<IStream,X*> operator>> ( IStream is, X& x ) { return pair<IStream,X*>(is,&x); }
template<class X> friend IStream operator>> ( pair<IStream,X*> is_x, const char* psDlm ) {
template<class X> friend IStream operator>> ( pair<IStream,X*> is_x, const char* psDlm ) {
IStream& is = is_x.first;
X& x = *is_x.second;
// Propagate fail...
@ -129,7 +129,7 @@ class IStream {
}
// Match integer followed by zero-terminated string delimiter...
friend IStream operator>> ( pair<IStream,int*> is_x, const char* psDlm ) {
friend IStream operator>> ( pair<IStream,int*> is_x, const char* psDlm ) {
IStream& is = is_x.first;
int& x = *is_x.second;
// Propagate fail...
@ -151,7 +151,7 @@ class IStream {
}
// Match unsigned int followed by zero-terminated string delimiter...
friend IStream operator>> ( pair<IStream,unsigned int*> is_x, const char* psDlm ) {
friend IStream operator>> ( pair<IStream,unsigned int*> is_x, const char* psDlm ) {
IStream& is = is_x.first;
unsigned int& x = *is_x.second;
// Propagate fail...
@ -173,7 +173,7 @@ class IStream {
}
// Match float followed by zero-terminated string delimiter...
friend IStream operator>> ( pair<IStream,float*> is_x, const char* psDlm ) {
friend IStream operator>> ( pair<IStream,float*> is_x, const char* psDlm ) {
IStream& is = is_x.first;
float& x = *is_x.second;
// Propagate fail...
@ -195,7 +195,7 @@ class IStream {
}
// Match double followed by zero-terminated string delimiter...
friend IStream operator>> ( pair<IStream,double*> is_x, const char* psDlm ) {
friend IStream operator>> ( pair<IStream,double*> is_x, const char* psDlm ) {
IStream& is = is_x.first;
double& x = *is_x.second;
// Propagate fail...
@ -217,7 +217,7 @@ class IStream {
}
// Match void pointer followed by zero-terminated string delimiter...
friend IStream operator>> ( pair<IStream,void**> is_x, const char* psDlm ) {
friend IStream operator>> ( pair<IStream,void**> is_x, const char* psDlm ) {
IStream& is = is_x.first;
// Propagate fail...
if (IStream()==is) return IStream();


@ -68,13 +68,13 @@ class StringInput {
friend StringInput operator>> ( StringInput psIn, const char* psDlm ) {
if (StringInput(NULL)==psIn) return psIn;
int i;
for (i=0; psIn[i]!='\0' && psDlm[i]!='\0'; i++)
for (i=0; psIn[i]!='\0' && psDlm[i]!='\0'; i++)
if(psIn[i]!=psDlm[i]) return StringInput(NULL); //psIn;
return (psDlm[i]!='\0') ? StringInput(NULL) : (psIn[i]!='\0') ? psIn+i : SI_EOS;
}
friend pair<StringInput,int*> operator>> ( StringInput ps, int& n ) { return pair<StringInput,int*>(ps,&n); }
friend StringInput operator>> ( pair<StringInput,int*> delimbuff, const char* psDlm ) {
friend StringInput operator>> ( pair<StringInput,int*> delimbuff, const char* psDlm ) {
if (StringInput(NULL)==delimbuff.first) return delimbuff.first;
///int i; for(i=0;psIn[i]!='\0';i++) if(psIn[i]==psDlm[i]) return psIn; return psIn+i;
int j=0;
@ -90,7 +90,7 @@ class StringInput {
}
friend pair<StringInput,unsigned int*> operator>> ( StringInput ps, unsigned int& n ) { return pair<StringInput,unsigned int*>(ps,&n); }
friend StringInput operator>> ( pair<StringInput,unsigned int*> delimbuff, const char* psDlm ) {
friend StringInput operator>> ( pair<StringInput,unsigned int*> delimbuff, const char* psDlm ) {
if (StringInput(NULL)==delimbuff.first) return delimbuff.first;
///int i; for(i=0;psIn[i]!='\0';i++) if(psIn[i]==psDlm[i]) return psIn; return psIn+i;
int j=0;
@ -106,7 +106,7 @@ class StringInput {
}
friend pair<StringInput,double*> operator>> ( StringInput ps, double& d ) { return pair<StringInput,double*>(ps,&d); }
friend StringInput operator>> ( pair<StringInput,double*> delimbuff, const char* psDlm ) {
friend StringInput operator>> ( pair<StringInput,double*> delimbuff, const char* psDlm ) {
if (StringInput(NULL)==delimbuff.first) return delimbuff.first;
///int i; for(i=0;psIn[i]!='\0';i++) if(psIn[i]==psDlm[i]) return psIn; return psIn+i;
int j=0;
@ -191,7 +191,7 @@ class String : public Array<char> {
friend pair<StringInput,String*> operator>> ( const StringInput ps, String& s ) { return pair<StringInput,String*>(ps,&s); }
friend StringInput operator>> ( pair<StringInput,String*> delimbuff, const char* psDlm ) {
if (StringInput(NULL)==delimbuff.first) return delimbuff.first;
////assert(*delimbuff.second<domain.getSize());
////assert(*delimbuff.second<domain.getSize());
int j=0;
StringInput psIn = delimbuff.first;
if(psDlm[0]=='\0') { *delimbuff.second=String(psIn.c_str()); return psIn+strlen(psIn.c_str()); }


@ -38,7 +38,7 @@ class StringIndex{
map <string, int> msi;
map <int, string> mis;
int maxIndex;
public:
// Constructor / destructor methods...


@ -22,7 +22,7 @@
///////////////////////////////////////////////////////////////////////////////
/***********************************************
* nl-tetrahex.h
* nl-tetrahex.h
* a little header with some base conversion stuff
* so that we can represent base 16, 32 or 64 with
* one character.


@ -41,7 +41,7 @@ class Timer {
}
double elapsed ( ) { // in milliseconds.
return (double(kept.tv_sec)*1000.0 + double(kept.tv_usec)/1000.0);
//struct timeval end; gettimeofday(&end,NULL);
//struct timeval end; gettimeofday(&end,NULL);
//double beg_time_s = (double) beg.tv_sec + (double) ((double)beg.tv_usec / 1000000.0);
//double end_time_s = (double) end.tv_sec + (double) ((double)end.tv_usec / 1000000.0);
//return ( (end_time_s - beg_time_s) * 1000.0 );


@ -136,7 +136,7 @@ class Rd : public DiscreteDomainRV<int,domRd> {
}
if (!hToG.contains(*this)) {
size_t i=s.find(',');
assert(i!=string::npos);
assert(i!=string::npos);
hToG.set(*this) = G(s.substr(i+1).c_str());
if ( '1'==s[0] )
hFromG.set(G(s.substr(i+1).c_str())) = *this;


@ -42,11 +42,11 @@ typedef HidVarCPT2DModel<P,C,LogProb> PgivCModel;
class WModel {
private:
TrainableDTree2DModel<P,W,LogProb> modPgivWdt;
RandAccCPT2DModel<P,W,LogProb> modPgivWs;
RandAccCPT1DModel<P,LogProb> modP;
RandAccCPT1DModel<W,LogProb> modW;
public:
//LogProb getProb ( const W& w, const HidVarCPT1DModel<P,LogProb>::IterVal& p ) const {
LogProb getProb ( const W& w, const P::ArrayIterator<LogProb>& p ) const {
@ -93,8 +93,8 @@ class OModel {
};
typedef DistribModeledWgivC RandVarType;
void calcProb ( OModel::RandVarType& o, const W& w ) const {
o.clear();
@ -106,7 +106,7 @@ class OModel {
for (LogProb pr=modPgivC.setIterProb(p,c,aCtr); pr!=LogProb(); pr = modPgivC.setIterProb(p,c,aCtr=0) ){
o.setProb(c) += modPgivC.getProb(p,c).toProb() * modWgivP.getProb(w,p).toProb();
}
}
}
@ -134,7 +134,7 @@ class XModel {
RandAccCPT2DModel<P,W,Prob> modPgivW;
RandAccCPT1DModel<P,Prob> modP;
RandAccCPT1DModel<W,Prob> modW;
public:
typedef X RandVarType;


@ -11,12 +11,12 @@ namespace lm {
namespace ngram {
namespace trie {
DontBhiksha::DontBhiksha(const void * /*base*/, uint64_t /*max_offset*/, uint64_t max_next, const Config &/*config*/) :
DontBhiksha::DontBhiksha(const void * /*base*/, uint64_t /*max_offset*/, uint64_t max_next, const Config &/*config*/) :
next_(util::BitsMask::ByMax(max_next)) {}
const uint8_t kArrayBhikshaVersion = 0;
// TODO: put this in binary file header instead when I change the binary file format again.
// TODO: put this in binary file header instead when I change the binary file format again.
void ArrayBhiksha::UpdateConfigFromBinary(const BinaryFormat &file, uint64_t offset, Config &config) {
uint8_t buffer[2];
file.ReadForConfig(buffer, 2, offset);
@ -33,7 +33,7 @@ uint8_t ChopBits(uint64_t max_offset, uint64_t max_next, const Config &config) {
uint8_t required = util::RequiredBits(max_next);
uint8_t best_chop = 0;
int64_t lowest_change = std::numeric_limits<int64_t>::max();
// There are probably faster ways but I don't care because this is only done once per order at construction time.
// There are probably faster ways but I don't care because this is only done once per order at construction time.
for (uint8_t chop = 0; chop <= std::min(required, config.pointer_bhiksha_bits); ++chop) {
int64_t change = (max_next >> (required - chop)) * 64 /* table cost in bits */
- max_offset * static_cast<int64_t>(chop); /* savings in bits*/
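The cost expression above weighs a small lookup table (64 bits per entry, roughly max_next >> (required - chop) entries) against the chop bits saved on each of the max_offset stored pointers; the chop with the most negative change wins. A toy calculation of the same expression with made-up sizes (the constants below are illustrative assumptions, not measurements):

#include <cstdint>
#include <cstdio>

int main() {
  const uint64_t max_next = 1ULL << 20;    // hypothetical number of next pointers
  const uint64_t max_offset = 1ULL << 24;  // hypothetical number of stored offsets
  const uint8_t required = 20;             // bits needed to address max_next
  for (uint8_t chop = 0; chop <= 4; ++chop) {
    int64_t table_cost = static_cast<int64_t>(max_next >> (required - chop)) * 64;
    int64_t savings = static_cast<int64_t>(max_offset) * chop;
    std::printf("chop=%u  table=%lld bits  saved=%lld bits  change=%lld\n",
                static_cast<unsigned>(chop),
                static_cast<long long>(table_cost),
                static_cast<long long>(savings),
                static_cast<long long>(table_cost - savings));
  }
  return 0;
}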


@ -7,7 +7,7 @@
* pages={388--391},
* }
*
* Currently only used for next pointers.
* Currently only used for next pointers.
*/
#ifndef LM_BHIKSHA_H
@ -86,9 +86,9 @@ class ArrayBhiksha {
// assert(end_it == std::upper_bound(offset_begin_, offset_end_, index + 1));
--end_it;
// assert(end_it >= begin_it);
out.begin = ((begin_it - offset_begin_) << next_inline_.bits) |
out.begin = ((begin_it - offset_begin_) << next_inline_.bits) |
util::ReadInt57(base, bit_offset, next_inline_.bits, next_inline_.mask);
out.end = ((end_it - offset_begin_) << next_inline_.bits) |
out.end = ((end_it - offset_begin_) << next_inline_.bits) |
util::ReadInt57(base, bit_offset + total_bits, next_inline_.bits, next_inline_.mask);
// If this fails, consider rebuilding your model using KenLM after 1e333d786b748555e8f368d2bbba29a016c98052
assert(out.end >= out.begin);


@ -135,7 +135,7 @@ void MatchCheck(ModelType model_type, unsigned int search_version, const Paramet
const std::size_t kInvalidSize = static_cast<std::size_t>(-1);
BinaryFormat::BinaryFormat(const Config &config)
BinaryFormat::BinaryFormat(const Config &config)
: write_method_(config.write_method), write_mmap_(config.write_mmap), load_method_(config.load_method),
header_size_(kInvalidSize), vocab_size_(kInvalidSize), vocab_string_offset_(kInvalidOffset) {}


@ -19,18 +19,18 @@ namespace ngram {
extern const char *kModelNames[6];
/*Inspect a file to determine if it is a binary lm. If not, return false.
/*Inspect a file to determine if it is a binary lm. If not, return false.
* If so, return true and set recognized to the type. This is the only API in
* this header designed for use by decoder authors.
* this header designed for use by decoder authors.
*/
bool RecognizeBinary(const char *file, ModelType &recognized);
struct FixedWidthParameters {
unsigned char order;
float probing_multiplier;
// What type of model is this?
// What type of model is this?
ModelType model_type;
// Does the end of the file have the actual strings in the vocabulary?
// Does the end of the file have the actual strings in the vocabulary?
bool has_vocabulary;
unsigned int search_version;
};
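RecognizeBinary, declared above, is the one call in this header intended for decoder authors: it inspects a file and reports whether it is a binary LM and, if so, which ModelType it contains. A minimal usage sketch, assuming the lm/binary_format.hh include path and the lm::ngram namespace shown in the hunk context (both are my assumptions, not stated by this commit):

#include <cstdio>

#include "lm/binary_format.hh"  // assumed header location for the declaration above

int main(int argc, char* argv[]) {
  if (argc != 2) { std::fprintf(stderr, "Usage: %s lm_file\n", argv[0]); return 1; }
  lm::ngram::ModelType type;
  if (lm::ngram::RecognizeBinary(argv[1], type)) {
    std::printf("%s is a binary LM (ModelType %d)\n", argv[1], static_cast<int>(type));
  } else {
    std::printf("%s is not a recognized binary LM\n", argv[1]);
  }
  return 0;
}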
@ -38,7 +38,7 @@ struct FixedWidthParameters {
// This is a macro instead of an inline function so constants can be assigned using it.
#define ALIGN8(a) ((std::ptrdiff_t(((a)-1)/8)+1)*8)
// Parameters stored in the header of a binary file.
// Parameters stored in the header of a binary file.
struct Parameters {
FixedWidthParameters fixed;
std::vector<uint64_t> counts;
@ -79,7 +79,7 @@ class BinaryFormat {
const char *write_mmap_;
util::LoadMethod load_method_;
// File behind memory, if any.
// File behind memory, if any.
util::scoped_fd file_;
// If there is a file involved, a single mapping.


@ -15,9 +15,9 @@ namespace ngram {
* kNoExtensionBackoff. If the n-gram might be extended, then out_state must
* contain the full n-gram, in which case kExtensionBackoff is set. In any
* case, if an n-gram has non-zero backoff, the full state is returned so
* backoff can be properly charged.
* backoff can be properly charged.
* These differ only in sign bit because the backoff is in fact zero in either
* case.
* case.
*/
const float kNoExtensionBackoff = -0.0;
const float kExtensionBackoff = 0.0;
@ -28,7 +28,7 @@ inline void SetExtension(float &backoff) {
if (backoff == kNoExtensionBackoff) backoff = kExtensionBackoff;
}
// This compiles down nicely.
// This compiles down nicely.
inline bool HasExtension(const float &backoff) {
typedef union { float f; uint32_t i; } UnionValue;
UnionValue compare, interpret;
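The comment above relies on -0.0f and 0.0f comparing equal while differing only in their sign bit, so the "has extension" flag costs nothing in the stored backoff value. A standalone check of that property, using memcpy instead of the union shown here, purely as an illustration:

#include <cassert>
#include <cstdint>
#include <cstring>

static bool SignBitSet(float f) {
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));  // inspect the raw IEEE-754 encoding
  return (bits >> 31) != 0;
}

int main() {
  const float kNoExtension = -0.0f;  // mirrors kNoExtensionBackoff above
  const float kExtension = 0.0f;     // mirrors kExtensionBackoff above
  assert(kNoExtension == kExtension);  // numerically the same zero backoff
  assert(SignBitSet(kNoExtension));    // ...but the sign bit tells them apart
  assert(!SignBitSet(kExtension));
  return 0;
}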


@ -56,7 +56,7 @@ void Usage(const char *name, const char *default_mem) {
exit(1);
}
// I could really use boost::lexical_cast right about now.
// I could really use boost::lexical_cast right about now.
float ParseFloat(const char *from) {
char *end;
float ret = strtod(from, &end);


@ -114,7 +114,7 @@ class CollapseStream {
current_(NULL, NGram::OrderFromSize(position.GetChain().EntrySize())),
prune_threshold_(prune_threshold),
prune_words_(prune_words),
block_(position) {
block_(position) {
StartBlock();
}
@ -125,27 +125,27 @@ class CollapseStream {
CollapseStream &operator++() {
assert(block_);
if (current_.begin()[1] == kBOS && current_.Base() < copy_from_) {
memcpy(current_.Base(), copy_from_, current_.TotalSize());
UpdateCopyFrom();
// Mark highest order n-grams for later pruning
if(current_.Count() <= prune_threshold_) {
current_.Mark();
current_.Mark();
}
if(!prune_words_.empty()) {
for(WordIndex* i = current_.begin(); i != current_.end(); i++) {
if(prune_words_[*i]) {
current_.Mark();
current_.Mark();
break;
}
}
}
}
current_.NextInMemory();
uint8_t *block_base = static_cast<uint8_t*>(block_->Get());
if (current_.Base() == block_base + block_->ValidSize()) {
@ -153,21 +153,21 @@ class CollapseStream {
++block_;
StartBlock();
}
// Mark highest order n-grams for later pruning
if(current_.Count() <= prune_threshold_) {
current_.Mark();
current_.Mark();
}
if(!prune_words_.empty()) {
for(WordIndex* i = current_.begin(); i != current_.end(); i++) {
if(prune_words_[*i]) {
current_.Mark();
current_.Mark();
break;
}
}
}
return *this;
}
@ -180,21 +180,21 @@ class CollapseStream {
current_.ReBase(block_->Get());
copy_from_ = static_cast<uint8_t*>(block_->Get()) + block_->ValidSize();
UpdateCopyFrom();
// Mark highest order n-grams for later pruning
if(current_.Count() <= prune_threshold_) {
current_.Mark();
current_.Mark();
}
if(!prune_words_.empty()) {
for(WordIndex* i = current_.begin(); i != current_.end(); i++) {
if(prune_words_[*i]) {
current_.Mark();
current_.Mark();
break;
}
}
}
}
// Find last without bos.
@ -222,18 +222,18 @@ void AdjustCounts::Run(const util::stream::ChainPositions &positions) {
StatCollector stats(order, counts_, counts_pruned_, discounts_);
if (order == 1) {
// Only unigrams. Just collect stats.
// Only unigrams. Just collect stats.
for (NGramStream full(positions[0]); full; ++full) {
// Do not prune <s> </s> <unk>
if(*full->begin() > 2) {
if(full->Count() <= prune_thresholds_[0])
full->Mark();
if(!prune_words_.empty() && prune_words_[*full->begin()])
full->Mark();
}
stats.AddFull(full->UnmarkedCount(), full->IsMarked());
}
@ -243,7 +243,7 @@ void AdjustCounts::Run(const util::stream::ChainPositions &positions) {
NGramStreams streams;
streams.Init(positions, positions.size() - 1);
CollapseStream full(positions[positions.size() - 1], prune_thresholds_.back(), prune_words_);
// Initialization: <unk> has count 0 and so does <s>.
@ -261,7 +261,7 @@ void AdjustCounts::Run(const util::stream::ChainPositions &positions) {
std::vector<uint64_t> actual_counts(positions.size(), 0);
// Something of a hack: don't prune <s>.
actual_counts[0] = std::numeric_limits<uint64_t>::max();
// Iterate over full (the stream of the highest order ngrams)
for (; full; ++full) {
const WordIndex *different = FindDifference(*full, **lower_valid);
@ -272,16 +272,16 @@ void AdjustCounts::Run(const util::stream::ChainPositions &positions) {
uint64_t order_minus_1 = lower_valid - streams_begin;
if(actual_counts[order_minus_1] <= prune_thresholds_[order_minus_1])
(*lower_valid)->Mark();
if(!prune_words_.empty()) {
for(WordIndex* i = (*lower_valid)->begin(); i != (*lower_valid)->end(); i++) {
if(prune_words_[*i]) {
(*lower_valid)->Mark();
(*lower_valid)->Mark();
break;
}
}
}
stats.Add(order_minus_1, (*lower_valid)->UnmarkedCount(), (*lower_valid)->IsMarked());
++*lower_valid;
}
@ -327,16 +327,16 @@ void AdjustCounts::Run(const util::stream::ChainPositions &positions) {
uint64_t lower_count = actual_counts[(*s)->Order() - 1];
if(lower_count <= prune_thresholds_[(*s)->Order() - 1])
(*s)->Mark();
if(!prune_words_.empty()) {
for(WordIndex* i = (*s)->begin(); i != (*s)->end(); i++) {
if(prune_words_[*i]) {
(*s)->Mark();
(*s)->Mark();
break;
}
}
}
stats.Add(s - streams.begin(), lower_count, (*s)->IsMarked());
++*s;
}


@ -30,9 +30,9 @@ struct DiscountConfig {
WarningAction bad_action;
};
/* Compute adjusted counts.
/* Compute adjusted counts.
* Input: unique suffix sorted N-grams (and just the N-grams) with raw counts.
* Output: [1,N]-grams with adjusted counts.
* Output: [1,N]-grams with adjusted counts.
* [1,N)-grams are in suffix order
* N-grams are in undefined order (they're going to be sorted anyway).
*/
@ -50,13 +50,13 @@ class AdjustCounts {
const DiscountConfig &discount_config,
std::vector<Discount> &discounts)
: prune_thresholds_(prune_thresholds), counts_(counts), counts_pruned_(counts_pruned),
prune_words_(prune_words), discount_config_(discount_config), discounts_(discounts)
prune_words_(prune_words), discount_config_(discount_config), discounts_(discounts)
{}
void Run(const util::stream::ChainPositions &positions);
private:
const std::vector<uint64_t> &prune_thresholds_;
const std::vector<uint64_t> &prune_thresholds_;
std::vector<uint64_t> &counts_;
std::vector<uint64_t> &counts_pruned_;
const std::vector<bool> &prune_words_;


@ -82,7 +82,7 @@ BOOST_AUTO_TEST_CASE(Simple) {
}
BOOST_REQUIRE_EQUAL(4UL, counts.size());
BOOST_CHECK_EQUAL(4UL, counts[0]);
// These are no longer set because the discounts are bad.
// These are no longer set because the discounts are bad.
/* BOOST_CHECK_EQUAL(4UL, counts[1]);
BOOST_CHECK_EQUAL(3UL, counts[2]);
BOOST_CHECK_EQUAL(3UL, counts[3]);*/


@ -45,7 +45,7 @@ class DedupeHash : public std::unary_function<const WordIndex *, bool> {
std::size_t operator()(const WordIndex *start) const {
return util::MurmurHashNative(start, size_);
}
private:
const std::size_t size_;
};
@ -53,11 +53,11 @@ class DedupeHash : public std::unary_function<const WordIndex *, bool> {
class DedupeEquals : public std::binary_function<const WordIndex *, const WordIndex *, bool> {
public:
explicit DedupeEquals(std::size_t order) : size_(order * sizeof(WordIndex)) {}
bool operator()(const WordIndex *first, const WordIndex *second) const {
return !memcmp(first, second, size_);
}
}
private:
const std::size_t size_;
};
@ -82,7 +82,7 @@ typedef util::ProbingHashTable<DedupeEntry, DedupeHash, DedupeEquals> Dedupe;
class Writer {
public:
Writer(std::size_t order, const util::stream::ChainPosition &position, void *dedupe_mem, std::size_t dedupe_mem_size)
Writer(std::size_t order, const util::stream::ChainPosition &position, void *dedupe_mem, std::size_t dedupe_mem_size)
: block_(position), gram_(block_->Get(), order),
dedupe_invalid_(order, std::numeric_limits<WordIndex>::max()),
dedupe_(dedupe_mem, dedupe_mem_size, &dedupe_invalid_[0], DedupeHash(order), DedupeEquals(order)),
@ -91,7 +91,7 @@ class Writer {
dedupe_.Clear();
assert(Dedupe::Size(position.GetChain().BlockSize() / position.GetChain().EntrySize(), kProbingMultiplier) == dedupe_mem_size);
if (order == 1) {
// Add special words. AdjustCounts is responsible if order != 1.
// Add special words. AdjustCounts is responsible if order != 1.
AddUnigramWord(kUNK);
AddUnigramWord(kBOS);
}
@ -121,16 +121,16 @@ class Writer {
memmove(gram_.begin(), gram_.begin() + 1, sizeof(WordIndex) * (gram_.Order() - 1));
return;
}
// Complete the write.
// Complete the write.
gram_.Count() = 1;
// Prepare the next n-gram.
// Prepare the next n-gram.
if (reinterpret_cast<uint8_t*>(gram_.begin()) + gram_.TotalSize() != static_cast<uint8_t*>(block_->Get()) + block_size_) {
NGram last(gram_);
gram_.NextInMemory();
std::copy(last.begin() + 1, last.end(), gram_.begin());
return;
}
// Block end. Need to store the context in a temporary buffer.
// Block end. Need to store the context in a temporary buffer.
std::copy(gram_.begin() + 1, gram_.end(), buffer_.get());
dedupe_.Clear();
block_->SetValidSize(block_size_);
@ -158,7 +158,7 @@ class Writer {
// Hash table combiner implementation.
Dedupe dedupe_;
// Small buffer to hold existing ngrams when shifting across a block boundary.
// Small buffer to hold existing ngrams when shifting across a block boundary.
boost::scoped_array<WordIndex> buffer_;
const std::size_t block_size_;
@ -224,12 +224,12 @@ void CorpusCount::Run(const util::stream::ChainPosition &position) {
} catch (const util::EndOfFileException &e) {}
token_count_ = count;
type_count_ = vocab.Size();
// Create list of unigrams that are supposed to be pruned
if (!prune_vocab_filename_.empty()) {
try {
util::FilePiece prune_vocab_file(prune_vocab_filename_.c_str());
prune_words_.resize(vocab.Size(), true);
try {
while (true) {
@ -238,12 +238,12 @@ void CorpusCount::Run(const util::stream::ChainPosition &position) {
prune_words_[vocab.Index(*w)] = false;
}
} catch (const util::EndOfFileException &e) {}
// Never prune <unk>, <s>, </s>
prune_words_[kUNK] = false;
prune_words_[kBOS] = false;
prune_words_[kEOS] = false;
} catch (const util::Exception &e) {
std::cerr << e.what() << std::endl;
abort();


@ -40,7 +40,7 @@ class CorpusCount {
uint64_t &token_count_;
WordIndex &type_count_;
std::vector<bool>& prune_words_;
const std::string& prune_vocab_filename_;
const std::string& prune_vocab_filename_;
std::size_t dedupe_mem_size_;
util::scoped_malloc dedupe_mem_;


@ -27,9 +27,9 @@ struct HashBufferEntry : public BufferEntry {
uint64_t hash_value;
};
// Reads all entries in order like NGramStream does.
// Reads all entries in order like NGramStream does.
// But deletes any entries that have CutoffCount below or equal to pruning
// threshold.
// threshold.
class PruneNGramStream {
public:
PruneNGramStream(const util::stream::ChainPosition &position) :
@ -37,7 +37,7 @@ class PruneNGramStream {
dest_(NULL, NGram::OrderFromSize(position.GetChain().EntrySize())),
currentCount_(0),
block_(position)
{
{
StartBlock();
}
@ -50,7 +50,7 @@ class PruneNGramStream {
PruneNGramStream &operator++() {
assert(block_);
if(current_.Order() == 1 && *current_.begin() <= 2)
dest_.NextInMemory();
else if(currentCount_ > 0) {
@ -59,9 +59,9 @@ class PruneNGramStream {
}
dest_.NextInMemory();
}
current_.NextInMemory();
uint8_t *block_base = static_cast<uint8_t*>(block_->Get());
if (current_.Base() == block_base + block_->ValidSize()) {
block_->SetValidSize(dest_.Base() - block_base);
@ -70,13 +70,13 @@ class PruneNGramStream {
if (block_) {
currentCount_ = current_.CutoffCount();
}
} else {
} else {
currentCount_ = current_.CutoffCount();
}
return *this;
}
private:
void StartBlock() {
for (; ; ++block_) {
@ -85,13 +85,13 @@ class PruneNGramStream {
}
current_.ReBase(block_->Get());
currentCount_ = current_.CutoffCount();
dest_.ReBase(block_->Get());
}
NGram current_; // input iterator
NGram dest_; // output iterator
uint64_t currentCount_;
util::stream::Link block_;
@ -155,24 +155,24 @@ class AddRight {
memcpy(previous_raw, in->begin(), size);
uint64_t denominator = 0;
uint64_t normalizer = 0;
uint64_t counts[4];
memset(counts, 0, sizeof(counts));
do {
denominator += in->UnmarkedCount();
// Collect unused probability mass from pruning.
// Becomes 0 for unpruned ngrams.
normalizer += in->UnmarkedCount() - in->CutoffCount();
// Chen&Goodman do not mention counting based on cutoffs, but
// backoff becomes larger than 1 otherwise, so probably needs
// to count cutoffs. Counts normally without pruning.
if(in->CutoffCount() > 0)
++counts[std::min(in->CutoffCount(), static_cast<uint64_t>(3))];
} while (++in && !memcmp(previous_raw, in->begin(), size));
BufferEntry &entry = *reinterpret_cast<BufferEntry*>(out.Get());
entry.denominator = static_cast<float>(denominator);
entry.gamma = 0.0;
@ -182,9 +182,9 @@ class AddRight {
// Makes model sum to 1 with pruning (I hope).
entry.gamma += normalizer;
entry.gamma /= entry.denominator;
if(pruning_) {
// If pruning is enabled the stream actually contains HashBufferEntry, see InitialProbabilities(...),
// so add a hash value that identifies the current ngram.
@ -244,13 +244,13 @@ class MergeRight {
++summed;
return;
}
std::vector<WordIndex> previous(grams->Order() - 1);
const std::size_t size = sizeof(WordIndex) * previous.size();
for (; grams; ++summed) {
memcpy(&previous[0], grams->begin(), size);
const BufferEntry &sums = *static_cast<const BufferEntry*>(summed.Get());
do {
Payload &pay = grams->Value();
pay.uninterp.prob = discount_.Apply(grams->UnmarkedCount()) / sums.denominator;
@ -288,7 +288,7 @@ void InitialProbabilities(
gamma_out[i] >> AddRight(discounts[i], second, prune_vocab || prune_thresholds[i] > 0);
primary[i] >> MergeRight(config.interpolate_unigrams, gamma_out[i].Add(), discounts[i]);
// Don't bother with the OnlyGamma thread for something to discard.
if (i) gamma_out[i] >> OnlyGamma(prune_vocab || prune_thresholds[i] > 0);
}


@ -15,17 +15,17 @@ struct InitialProbabilitiesConfig {
// These should be small buffers to keep the adder from getting too far ahead
util::stream::ChainConfig adder_in;
util::stream::ChainConfig adder_out;
// SRILM doesn't normally interpolate unigrams.
// SRILM doesn't normally interpolate unigrams.
bool interpolate_unigrams;
};
/* Compute initial (uninterpolated) probabilities
* primary: the normal chain of n-grams. Incoming is context sorted adjusted
* counts. Outgoing has uninterpolated probabilities for use by Interpolate.
* second_in: a second copy of the primary input. Discard the output.
* second_in: a second copy of the primary input. Discard the output.
* gamma_out: Computed gamma values are output on these chains in suffix order.
* The values are bare floats and should be buffered for interpolation to
* use.
* use.
*/
void InitialProbabilities(
const InitialProbabilitiesConfig &config,


@ -47,7 +47,7 @@ class OutputQ {
private:
// Product of backoffs in the numerator divided by backoffs in the
// denominator. Does not include
// denominator. Does not include
std::vector<float> q_delta_;
};
@ -81,7 +81,7 @@ template <class Output> class Callback {
if(prune_vocab_ || prune_thresholds_[i + 1] > 0)
while(backoffs_[i])
++backoffs_[i];
if (backoffs_[i]) {
std::cerr << "Backoffs do not match for order " << (i + 1) << std::endl;
abort();
@ -99,7 +99,7 @@ template <class Output> class Callback {
if(prune_vocab_ || prune_thresholds_[order_minus_1 + 1] > 0) {
//Compute hash value for current context
uint64_t current_hash = util::MurmurHashNative(gram.begin(), gram.Order() * sizeof(WordIndex));
const HashGamma *hashed_backoff = static_cast<const HashGamma*>(backoffs_[order_minus_1].Get());
while(current_hash != hashed_backoff->hash_value && ++backoffs_[order_minus_1])
hashed_backoff = static_cast<const HashGamma*>(backoffs_[order_minus_1].Get());


@ -8,8 +8,8 @@
#include <stdint.h>
namespace lm { namespace builder {
/* Interpolate step.
/* Interpolate step.
* Input: suffix sorted n-grams with (p_uninterpolated, gamma) from
* InitialProbabilities.
* Output: suffix sorted n-grams with complete probability


@ -35,7 +35,7 @@ template <class Callback, class Compare> void JointOrder(const util::stream::Cha
// Does the context match the lower one?
if (!memcmp(streams[static_cast<int>(current) - 1]->begin(), streams[current]->begin() + Compare::kMatchOffset, sizeof(WordIndex) * current)) {
callback.Enter(current, *streams[current]);
// Transition to looking for extensions.
// Transition to looking for extensions.
if (++current < order) continue;
}
#ifdef DEBUG
@ -46,16 +46,16 @@ template <class Callback, class Compare> void JointOrder(const util::stream::Cha
abort();
}
#endif // DEBUG
// No extension left.
// No extension left.
while(true) {
assert(current > 0);
--current;
callback.Exit(current, *streams[current]);
if (++streams[current]) break;
UTIL_THROW_IF(order != current + 1, FormatLoadException, "Detected n-gram without matching suffix");
order = current;
if (!order) return;
}


@ -53,7 +53,7 @@ std::vector<uint64_t> ParsePruning(const std::vector<std::string> &param, std::s
// throw if each n-gram order has not threshold specified
UTIL_THROW_IF(prune_thresholds.size() > order, util::Exception, "You specified pruning thresholds for orders 1 through " << prune_thresholds.size() << " but the model only has order " << order);
// threshold for unigram can only be 0 (no pruning)
// check if threshold are not in decreasing order
uint64_t lower_threshold = 0;
for (std::vector<uint64_t>::iterator it = prune_thresholds.begin(); it != prune_thresholds.end(); ++it) {
@ -124,7 +124,7 @@ int main(int argc, char *argv[]) {
po::store(po::parse_command_line(argc, argv, options), vm);
if (argc == 1 || vm["help"].as<bool>()) {
std::cerr <<
std::cerr <<
"Builds unpruned language models with modified Kneser-Ney smoothing.\n\n"
"Please cite:\n"
"@inproceedings{Heafield-estimate,\n"
@ -147,7 +147,7 @@ int main(int argc, char *argv[]) {
std::cerr << "This machine has " << mem << " bytes of memory.\n\n";
} else {
std::cerr << "Unable to determine the amount of memory on this machine.\n\n";
}
}
std::cerr << options << std::endl;
return 1;
}
@ -191,11 +191,11 @@ int main(int argc, char *argv[]) {
else {
pipeline.prune_vocab = false;
}
util::NormalizeTempPrefix(pipeline.sort.temp_prefix);
lm::builder::InitialProbabilitiesConfig &initial = pipeline.initial_probs;
// TODO: evaluate options for these.
// TODO: evaluate options for these.
initial.adder_in.total_memory = 32768;
initial.adder_in.block_count = 2;
initial.adder_out.total_memory = 32768;


@ -68,26 +68,26 @@ class NGram {
assert(size == TotalSize(ret));
return ret;
}
// manipulate msb to signal that ngram can be pruned
/*mjd**********************************************************************/
bool IsMarked() const {
return Value().count >> (sizeof(Value().count) * 8 - 1);
}
void Mark() {
Value().count |= (1ul << (sizeof(Value().count) * 8 - 1));
}
void Unmark() {
Value().count &= ~(1ul << (sizeof(Value().count) * 8 - 1));
}
uint64_t UnmarkedCount() const {
return Value().count & ~(1ul << (sizeof(Value().count) * 8 - 1));
}
uint64_t CutoffCount() const {
return IsMarked() ? 0 : UnmarkedCount();
}


@ -37,7 +37,7 @@ void PrintStatistics(const std::vector<uint64_t> &counts, const std::vector<uint
class Master {
public:
explicit Master(PipelineConfig &config)
explicit Master(PipelineConfig &config)
: config_(config), chains_(config.order), files_(config.order) {
config_.minimum_block = std::max(NGram::TotalSize(config_.order), config_.minimum_block);
}
@ -64,7 +64,7 @@ class Master {
CreateChains(config_.TotalMemory() - merge_using, count_bounds);
ngrams.Output(chains_.back(), merge_using);
// Setup unigram file.
// Setup unigram file.
files_.push_back(util::MakeTemp(config_.TempPrefix()));
}
@ -204,7 +204,7 @@ class Master {
PipelineConfig &config_;
util::stream::Chains chains_;
// Often only unigrams, but sometimes all orders.
// Often only unigrams, but sometimes all orders.
util::FixedArray<util::stream::FileBuffer> files_;
};
@ -214,7 +214,7 @@ void CountText(int text_file /* input */, int vocab_file /* output */, Master &m
const std::size_t vocab_usage = CorpusCount::VocabUsage(config.vocab_estimate);
UTIL_THROW_IF(config.TotalMemory() < vocab_usage, util::Exception, "Vocab hash size estimate " << vocab_usage << " exceeds total memory " << config.TotalMemory());
std::size_t memory_for_chain =
std::size_t memory_for_chain =
// This much memory to work with after vocab hash table.
static_cast<float>(config.TotalMemory() - vocab_usage) /
// Solve for block size including the dedupe multiplier for one block.
@ -252,7 +252,7 @@ void InitialProbabilities(const std::vector<uint64_t> &counts, const std::vector
util::stream::Chains gamma_chains(config.order);
InitialProbabilities(config.initial_probs, discounts, master.MutableChains(), second, gamma_chains, prune_thresholds, prune_vocab);
// Don't care about gamma for 0.
// Don't care about gamma for 0.
gamma_chains[0] >> util::stream::kRecycle;
gammas.Init(config.order - 1);
for (std::size_t i = 1; i < config.order; ++i) {
@ -307,16 +307,16 @@ void Pipeline(PipelineConfig &config, int text_file, Output &output) {
// master's destructor will wait for chains. But they might be deadlocked if
// this thread dies because e.g. it ran out of memory.
try {
util::scoped_fd vocab_file(config.vocab_file.empty() ?
util::MakeTemp(config.TempPrefix()) :
util::scoped_fd vocab_file(config.vocab_file.empty() ?
util::MakeTemp(config.TempPrefix()) :
util::CreateOrThrow(config.vocab_file.c_str()));
output.SetVocabFD(vocab_file.get());
uint64_t token_count;
std::string text_file_name;
std::vector<bool> prune_words;
CountText(text_file, vocab_file.get(), master, token_count, text_file_name, prune_words);
std::vector<uint64_t> counts;
std::vector<uint64_t> counts_pruned;
std::vector<Discount> discounts;


@ -44,7 +44,7 @@ struct PipelineConfig {
// Compute collapsed q values instead of probability and backoff
bool output_q;
/* Computing the perplexity of LMs with different vocabularies is hard. For
* example, the lowest perplexity is attained by a unigram model that
* predicts p(<unk>) = 1 and has no other vocabulary. Also, linearly


@ -55,7 +55,7 @@ void PrintARPA::Run(const util::stream::ChainPositions &positions) {
if (order != positions.size())
out << '\t' << stream->Value().complete.backoff;
out << '\n';
}
out << '\n';
}


@ -14,7 +14,7 @@
// Warning: print routines read all unigrams before all bigrams before all
// trigrams etc. So if other parts of the chain move jointly, you'll have to
// buffer.
// buffer.
namespace lm { namespace builder {
@ -42,7 +42,7 @@ class VocabReconstitute {
std::vector<const char*> map_;
};
// Not defined, only specialized.
// Not defined, only specialized.
template <class T> void PrintPayload(util::FakeOFStream &to, const Payload &payload);
template <> inline void PrintPayload<uint64_t>(util::FakeOFStream &to, const Payload &payload) {
// TODO slow
@ -55,7 +55,7 @@ template <> inline void PrintPayload<ProbBackoff>(util::FakeOFStream &to, const
to << payload.complete.prob << ' ' << payload.complete.backoff;
}
// template parameter is the type stored.
// template parameter is the type stored.
template <class V> class Print {
public:
static void DumpSeparateFiles(const VocabReconstitute &vocab, const std::string &file_base, util::stream::Chains &chains) {


@ -19,7 +19,7 @@ namespace builder {
*/
template <class Child> class Comparator : public std::binary_function<const void *, const void *, bool> {
public:
/**
* Constructs a comparator capable of comparing two n-grams.
*
@ -51,8 +51,8 @@ template <class Child> class Comparator : public std::binary_function<const void
/**
* N-gram comparator that compares n-grams according to their reverse (suffix) order.
*
* This comparator compares n-grams lexicographically, one word at a time,
* beginning with the last word of each n-gram and ending with the first word of each n-gram.
* This comparator compares n-grams lexicographically, one word at a time,
* beginning with the last word of each n-gram and ending with the first word of each n-gram.
*
* Some examples of n-gram comparisons as defined by this comparator:
* - a b c == a b c
@ -64,8 +64,8 @@ template <class Child> class Comparator : public std::binary_function<const void
*/
class SuffixOrder : public Comparator<SuffixOrder> {
public:
/**
/**
* Constructs a comparator capable of comparing two n-grams.
*
* @param order Number of words in each n-gram
@ -73,7 +73,7 @@ class SuffixOrder : public Comparator<SuffixOrder> {
explicit SuffixOrder(std::size_t order) : Comparator<SuffixOrder>(order) {}
/**
* Compares two n-grams lexicographically, one word at a time,
* Compares two n-grams lexicographically, one word at a time,
* beginning with the last word of each n-gram and ending with the first word of each n-gram.
*
* @param lhs A pointer to the n-gram on the left-hand side of the comparison
@ -90,11 +90,11 @@ class SuffixOrder : public Comparator<SuffixOrder> {
static const unsigned kMatchOffset = 1;
};
/**
* N-gram comparator that compares n-grams according to the reverse (suffix) order of the n-gram context.
*
* This comparator compares n-grams lexicographically, one word at a time,
* This comparator compares n-grams lexicographically, one word at a time,
* beginning with the penultimate word of each n-gram and ending with the first word of each n-gram;
* finally, this comparator compares the last word of each n-gram.
*
@ -108,8 +108,8 @@ class SuffixOrder : public Comparator<SuffixOrder> {
*/
class ContextOrder : public Comparator<ContextOrder> {
public:
/**
/**
* Constructs a comparator capable of comparing two n-grams.
*
* @param order Number of words in each n-gram
@ -117,7 +117,7 @@ class ContextOrder : public Comparator<ContextOrder> {
explicit ContextOrder(std::size_t order) : Comparator<ContextOrder>(order) {}
/**
* Compares two n-grams lexicographically, one word at a time,
* Compares two n-grams lexicographically, one word at a time,
* beginning with the penultimate word of each n-gram and ending with the first word of each n-gram;
* finally, this comparator compares the last word of each n-gram.
*
@ -136,7 +136,7 @@ class ContextOrder : public Comparator<ContextOrder> {
/**
* N-gram comparator that compares n-grams according to their natural (prefix) order.
*
* This comparator compares n-grams lexicographically, one word at a time,
* This comparator compares n-grams lexicographically, one word at a time,
* beginning with the first word of each n-gram and ending with the last word of each n-gram.
*
* Some examples of n-gram comparisons as defined by this comparator:
@ -149,8 +149,8 @@ class ContextOrder : public Comparator<ContextOrder> {
*/
class PrefixOrder : public Comparator<PrefixOrder> {
public:
/**
/**
* Constructs a comparator capable of comparing two n-grams.
*
* @param order Number of words in each n-gram
@ -158,7 +158,7 @@ class PrefixOrder : public Comparator<PrefixOrder> {
explicit PrefixOrder(std::size_t order) : Comparator<PrefixOrder>(order) {}
/**
* Compares two n-grams lexicographically, one word at a time,
* Compares two n-grams lexicographically, one word at a time,
* beginning with the first word of each n-gram and ending with the last word of each n-gram.
*
* @param lhs A pointer to the n-gram on the left-hand side of the comparison
@ -171,7 +171,7 @@ class PrefixOrder : public Comparator<PrefixOrder> {
}
return false;
}
static const unsigned kMatchOffset = 0;
};
@ -179,7 +179,7 @@ class PrefixOrder : public Comparator<PrefixOrder> {
struct AddCombiner {
bool operator()(void *first_void, const void *second_void, const SuffixOrder &compare) const {
NGram first(first_void, compare.Order());
// There isn't a const version of NGram.
// There isn't a const version of NGram.
NGram second(const_cast<void*>(second_void), compare.Order());
if (memcmp(first.begin(), second.begin(), sizeof(WordIndex) * compare.Order())) return false;
first.Count() += second.Count();
@ -204,10 +204,10 @@ template <class Compare> class Sorts : public util::FixedArray<util::stream::Sor
typedef util::FixedArray<S> P;
public:
/**
* Constructs, but does not initialize.
*
*
* @ref util::FixedArray::Init() "Init" must be called before use.
*
* @see util::FixedArray::Init()
@ -222,7 +222,7 @@ template <class Compare> class Sorts : public util::FixedArray<util::stream::Sor
*/
explicit Sorts(std::size_t number) : util::FixedArray<util::stream::Sort<Compare> >(number) {}
/**
/**
* Constructs a new @ref util::stream::Sort "Sort" object which is stored in this @ref util::FixedArray "array".
*
* The new @ref util::stream::Sort "Sort" object is constructed using the provided @ref util::stream::SortConfig "SortConfig" and @ref Comparator "ngram comparator";


@ -10,7 +10,7 @@ namespace lm {
* and implement Add. Then put a pointer in Config.enumerate_vocab; it does
* not take ownership. Add is called once per vocab word. index starts at 0
* and increases by 1 each time. This is only used by the Model constructor;
* the pointer is not retained by the class.
* the pointer is not retained by the class.
*/
class EnumerateVocab {
public:


@ -9,8 +9,8 @@
namespace lm {
namespace base {
// Common model interface that depends on knowing the specific classes.
// Curiously recurring template pattern.
// Common model interface that depends on knowing the specific classes.
// Curiously recurring template pattern.
template <class Child, class StateT, class VocabularyT> class ModelFacade : public Model {
public:
typedef StateT State;
@ -32,7 +32,7 @@ template <class Child, class StateT, class VocabularyT> class ModelFacade : publ
*reinterpret_cast<State*>(out_state));
}
// Default Score function calls FullScore. Model can override this.
// Default Score function calls FullScore. Model can override this.
float Score(const State &in_state, const WordIndex new_word, State &out_state) const {
return static_cast<const Child*>(this)->FullScore(in_state, new_word, out_state).prob;
}
@ -53,7 +53,7 @@ template <class Child, class StateT, class VocabularyT> class ModelFacade : publ
virtual ~ModelFacade() {}
// begin_sentence and null_context can disappear after. vocab should stay.
// begin_sentence and null_context can disappear after. vocab should stay.
void Init(const State &begin_sentence, const State &null_context, const Vocabulary &vocab, unsigned char order) {
begin_sentence_ = begin_sentence;
null_context_ = null_context;


@ -33,7 +33,7 @@ class CountOutput : boost::noncopyable {
class CountBatch {
public:
explicit CountBatch(std::streamsize initial_read)
explicit CountBatch(std::streamsize initial_read)
: initial_read_(initial_read) {
buffer_.reserve(initial_read);
}
@ -66,7 +66,7 @@ class CountBatch {
private:
std::streamsize initial_read_;
// This could have been a std::string but that's less happy with raw writes.
// This could have been a std::string but that's less happy with raw writes.
std::vector<char> buffer_;
};


@ -58,7 +58,7 @@ typedef enum {MODE_COPY, MODE_SINGLE, MODE_MULTIPLE, MODE_UNION, MODE_UNSET} Fil
typedef enum {FORMAT_ARPA, FORMAT_COUNT} Format;
struct Config {
Config() :
Config() :
#ifndef NTHREAD
batch_size(25000),
threads(boost::thread::hardware_concurrency()),


@ -134,12 +134,12 @@ struct CountFormat {
/* For multithreading, the buffer classes hold batches of filter inputs and
* outputs in memory. The strings get reused a lot, so keep them around
* instead of clearing each time.
* instead of clearing each time.
*/
class InputBuffer {
public:
InputBuffer() : actual_(0) {}
void Reserve(size_t size) { lines_.reserve(size); }
template <class Output> void AddNGram(const StringPiece &ngram, const StringPiece &line, Output &output) {
@ -179,18 +179,18 @@ class BinaryOutputBuffer {
void Reserve(size_t size) {
lines_.reserve(size);
}
void AddNGram(const StringPiece &line) {
lines_.push_back(line);
}
template <class Output> void Flush(Output &output) {
for (std::vector<StringPiece>::const_iterator i = lines_.begin(); i != lines_.end(); ++i) {
output.AddNGram(*i);
}
lines_.clear();
}
private:
std::vector<StringPiece> lines_;
};
@ -234,7 +234,7 @@ class MultipleOutputBuffer {
private:
struct Annotated {
// If this is empty, send to all systems.
// If this is empty, send to all systems.
// A filter should never send to all systems and send to a single one.
std::vector<size_t> systems;
StringPiece line;


@ -31,14 +31,14 @@ unsigned int ReadMultiple(std::istream &in, Substrings &out) {
word.clear();
}
if (c == ' ') continue;
// It's more than just a space. Close out the phrase.
// It's more than just a space. Close out the phrase.
if (!phrase.empty()) {
sentence_content = true;
out.AddPhrase(sentence_id, phrase.begin(), phrase.end());
phrase.clear();
}
if (c == '\t' || c == '\v') continue;
// It's more than a space or tab: a newline.
// It's more than a space or tab: a newline.
if (sentence_content) {
++sentence_id;
sentence_content = false;
@ -53,7 +53,7 @@ typedef unsigned int Sentence;
typedef std::vector<Sentence> Sentences;
} // namespace
namespace detail {
namespace detail {
const StringPiece kEndSentence("</s>");
@ -61,7 +61,7 @@ class Arc {
public:
Arc() {}
// For arcs from one vertex to another.
// For arcs from one vertex to another.
void SetPhrase(detail::Vertex &from, detail::Vertex &to, const Sentences &intersect) {
Set(to, intersect);
from_ = &from;
@ -69,7 +69,7 @@ class Arc {
/* For arcs from before the n-gram begins to somewhere in the n-gram (right
* aligned). These have no from_ vertex; it implictly matches every
* sentence. This also handles when the n-gram is a substring of a phrase.
* sentence. This also handles when the n-gram is a substring of a phrase.
*/
void SetRight(detail::Vertex &to, const Sentences &complete) {
Set(to, complete);
@ -87,12 +87,12 @@ class Arc {
/* When this function returns:
* If Empty() then there's nothing left from this intersection.
*
* If Current() == to then to is part of the intersection.
* If Current() == to then to is part of the intersection.
*
* Otherwise, Current() > to. In this case, to is not part of the
* intersection and neither is anything < Current(). To determine if
* any value >= Current() is in the intersection, call LowerBound again
* with the value.
* with the value.
*/
void LowerBound(const Sentence to);
@ -160,15 +160,15 @@ void Arc::Set(Vertex &to, const Sentences &sentences) {
void Vertex::LowerBound(const Sentence to) {
if (Empty()) return;
// Union lower bound.
// Union lower bound.
while (true) {
Arc *top = incoming_.top();
if (top->Current() > to) {
current_ = top->Current();
return;
}
// If top->Current() == to, we still need to verify that's an actual
// element and not just a bound.
// If top->Current() == to, we still need to verify that's an actual
// element and not just a bound.
incoming_.pop();
top->LowerBound(to);
if (!top->Empty()) {
@ -213,13 +213,13 @@ void BuildGraph(const Substrings &phrase, const std::vector<Hash> &hashes, detai
}
}
// Phrases starting at the second or later word in the n-gram.
// Phrases starting at the second or later word in the n-gram.
Vertex *vertex_from = vertices;
for (const Hash *word_from = first_word + 1; word_from != &*hashes.end(); ++word_from, ++vertex_from) {
hash = 0;
Vertex *vertex_to = vertex_from + 1;
for (const Hash *word_to = word_from; ; ++word_to, ++vertex_to) {
// Notice that word_to and vertex_to have the same index.
// Notice that word_to and vertex_to have the same index.
hash = util::MurmurHashNative(&hash, sizeof(uint64_t), *word_to);
// Now hash covers [word_from, word_to].
if (word_to == last_word) {
@ -250,7 +250,7 @@ detail::Vertex &ConditionCommon::MakeGraph() {
vertices_.clear();
vertices_.resize(hashes_.size());
arcs_.clear();
// One for every substring.
// One for every substring.
arcs_.resize(((hashes_.size() + 1) * hashes_.size()) / 2);
BuildGraph(substrings_, hashes_, &*vertices_.begin(), &*arcs_.begin());
return vertices_[hashes_.size() - 1];


@ -27,7 +27,7 @@ class Substrings {
private:
/* This is the value in a hash table where the key is a string. It indicates
* four sets of sentences:
* substring is sentences with a phrase containing the key as a substring.
* substring is sentences with a phrase containing the key as a substring.
* left is sentencess with a phrase that begins with the key (left aligned).
* right is sentences with a phrase that ends with the key (right aligned).
* phrase is sentences where the key is a phrase.
@ -39,8 +39,8 @@ class Substrings {
/* Most of the CPU is hash table lookups, so let's not complicate it with
* vector equality comparisons. If a collision happens, the SentenceRelation
* structure will contain the union of sentence ids over the colliding strings.
* In that case, the filter will be slightly more permissive.
* The key here is the same as boost's hash of std::vector<std::string>.
* In that case, the filter will be slightly more permissive.
* The key here is the same as boost's hash of std::vector<std::string>.
*/
typedef boost::unordered_map<Hash, SentenceRelation> Table;
@ -58,9 +58,9 @@ class Substrings {
LM_FILTER_PHRASE_METHOD(Phrase, phrase)
#pragma GCC diagnostic ignored "-Wuninitialized" // end != finish so there's always an initialization
// sentence_id must be non-decreasing. Iterators are over words in the phrase.
// sentence_id must be non-decreasing. Iterators are over words in the phrase.
template <class Iterator> void AddPhrase(unsigned int sentence_id, const Iterator &begin, const Iterator &end) {
// Iterate over all substrings.
// Iterate over all substrings.
for (Iterator start = begin; start != end; ++start) {
Hash hash = 0;
SentenceRelation *relation;
@ -85,7 +85,7 @@ class Substrings {
};
// Read a file with one sentence per line containing tab-delimited phrases of
// space-separated words.
// space-separated words.
unsigned int ReadMultiple(std::istream &in, Substrings &out);
namespace detail {
@ -94,7 +94,7 @@ extern const StringPiece kEndSentence;
template <class Iterator> void MakeHashes(Iterator i, const Iterator &end, std::vector<Hash> &hashes) {
hashes.clear();
if (i == end) return;
// TODO: check strict phrase boundaries after <s> and before </s>. For now, just skip tags.
// TODO: check strict phrase boundaries after <s> and before </s>. For now, just skip tags.
if ((i->data()[0] == '<') && (i->data()[i->size() - 1] == '>')) {
++i;
}


@ -88,7 +88,7 @@ class TargetWords {
class Input {
public:
explicit Input(std::size_t max_length)
explicit Input(std::size_t max_length)
: max_length_(max_length), sentence_id_(0), empty_() {}
void AddSentence(StringPiece sentence, TargetWords &targets) {
@ -125,7 +125,7 @@ class Input {
Map map_;
std::size_t sentence_id_;
// Temporaries in AddSentence.
std::string canonical_;
std::vector<std::size_t> starts_;


@ -13,29 +13,29 @@ namespace lm {
template <class OutputBuffer> class ThreadBatch {
public:
ThreadBatch() {}
void Reserve(size_t size) {
input_.Reserve(size);
output_.Reserve(size);
}
// File reading thread.
// File reading thread.
InputBuffer &Fill(uint64_t sequence) {
sequence_ = sequence;
// Why wait until now to clear instead of after output? free in the same
// thread as allocated.
// thread as allocated.
input_.Clear();
return input_;
}
// Filter worker thread.
// Filter worker thread.
template <class Filter> void CallFilter(Filter &filter) {
input_.CallFilter(filter, output_);
}
uint64_t Sequence() const { return sequence_; }
// File writing thread.
// File writing thread.
template <class RealOutput> void Flush(RealOutput &output) {
output_.Flush(output);
}
@ -73,7 +73,7 @@ template <class Batch, class Output> class OutputWorker {
void operator()(Request request) {
assert(request->Sequence() >= base_sequence_);
// Assemble the output in order.
// Assemble the output in order.
uint64_t pos = request->Sequence() - base_sequence_;
if (pos >= ordering_.size()) {
ordering_.resize(pos + 1, NULL);
@ -102,7 +102,7 @@ template <class Filter, class OutputBuffer, class RealOutput> class Controller :
typedef ThreadBatch<OutputBuffer> Batch;
public:
Controller(size_t batch_size, size_t queue, size_t workers, const Filter &filter, RealOutput &output)
Controller(size_t batch_size, size_t queue, size_t workers, const Filter &filter, RealOutput &output)
: batch_size_(batch_size), queue_size_(queue),
batches_(queue),
to_read_(queue),


@ -30,7 +30,7 @@ bool IsLineEnd(std::istream &in) {
}// namespace
// Read space separated words in enter separated lines. These lines can be
// very long, so don't read an entire line at a time.
// very long, so don't read an entire line at a time.
unsigned int ReadMultiple(std::istream &in, boost::unordered_map<std::string, std::vector<unsigned int> > &out) {
in.exceptions(std::istream::badbit);
unsigned int sentence = 0;


@ -26,7 +26,7 @@ unsigned int ReadMultiple(std::istream &in, boost::unordered_map<std::string, st
/* Is this a special tag like <s> or <UNK>? This actually includes anything
* surrounded with < and >, which most tokenizers separate for real words, so
* this should not catch real words as it looks at a single token.
* this should not catch real words as it looks at a single token.
*/
inline bool IsTag(const StringPiece &value) {
// The parser should never give an empty string.


@ -13,7 +13,7 @@ namespace lm {
// multiple-output filter so clients code against one interface.
template <class Binary> class BinaryFilter {
public:
// Binary modes are just references (and a set) and it makes the API cleaner to copy them.
// Binary modes are just references (and a set) and it makes the API cleaner to copy them.
explicit BinaryFilter(Binary binary) : binary_(binary) {}
template <class Iterator, class Output> void AddNGram(const Iterator &begin, const Iterator &end, const StringPiece &line, Output &output) {


@ -1,22 +1,22 @@
/* Efficient left and right language model state for sentence fragments.
* Intended usage:
* Store ChartState with every chart entry.
* Store ChartState with every chart entry.
* To do a rule application:
* 1. Make a ChartState object for your new entry.
* 2. Construct RuleScore.
* 3. Going from left to right, call Terminal or NonTerminal.
* For terminals, just pass the vocab id.
* 1. Make a ChartState object for your new entry.
* 2. Construct RuleScore.
* 3. Going from left to right, call Terminal or NonTerminal.
* For terminals, just pass the vocab id.
* For non-terminals, pass that non-terminal's ChartState.
* If your decoder expects scores inclusive of subtree scores (i.e. you
* label entries with the highest-scoring path), pass the non-terminal's
* score as prob.
* score as prob.
* If your decoder expects relative scores and will walk the chart later,
* pass prob = 0.0.
* pass prob = 0.0.
* In other words, the only effect of prob is that it gets added to the
* returned log probability.
* 4. Call Finish. It returns the log probability.
* returned log probability.
* 4. Call Finish. It returns the log probability.
*
* There's a couple more details:
* There's a couple more details:
* Do not pass <s> to Terminal as it is formally not a word in the sentence,
* only context. Instead, call BeginSentence. If called, it should be the
* first call after RuleScore is constructed (since <s> is always the
@ -27,12 +27,12 @@
* Hashing and sorting comparison operators are provided. All state objects
* are POD. If you intend to use memcmp on raw state objects, you must call
* ZeroRemaining first, as the value of array entries beyond length is
* otherwise undefined.
* otherwise undefined.
*
* Usage is of course not limited to chart decoding. Anything that generates
* sentence fragments missing left context could benefit. For example, a
* phrase-based decoder could pre-score phrases, storing ChartState with each
* phrase, even if hypotheses are generated left-to-right.
* phrase, even if hypotheses are generated left-to-right.
*/
#ifndef LM_LEFT_H
@ -77,7 +77,7 @@ template <class M> class RuleScore {
left_done_ = true;
}
// Faster version of NonTerminal for the case where the rule begins with a non-terminal.
// Faster version of NonTerminal for the case where the rule begins with a non-terminal.
void BeginNonTerminal(const ChartState &in, float prob = 0.0) {
prob_ = prob;
*out_ = in;
@ -86,7 +86,7 @@ template <class M> class RuleScore {
void NonTerminal(const ChartState &in, float prob = 0.0) {
prob_ += prob;
if (!in.left.length) {
if (in.left.full) {
for (const float *i = out_->right.backoff; i < out_->right.backoff + out_->right.length; ++i) prob_ += *i;
@ -131,26 +131,26 @@ template <class M> class RuleScore {
return;
}
// Right state was minimized, so it's already independent of the new words to the left.
// Right state was minimized, so it's already independent of the new words to the left.
if (in.right.length < in.left.length) {
out_->right = in.right;
return;
}
// Shift exisiting words down.
// Shift exisiting words down.
for (WordIndex *i = out_->right.words + next_use - 1; i >= out_->right.words; --i) {
*(i + in.right.length) = *i;
}
// Add words from in.right.
// Add words from in.right.
std::copy(in.right.words, in.right.words + in.right.length, out_->right.words);
// Assemble backoff composed on the existing state's backoff followed by the new state's backoff.
// Assemble backoff composed on the existing state's backoff followed by the new state's backoff.
std::copy(in.right.backoff, in.right.backoff + in.right.length, out_->right.backoff);
std::copy(back, back + next_use, out_->right.backoff + in.right.length);
out_->right.length = in.right.length + next_use;
}
float Finish() {
// A N-1-gram might extend left and right but we should still set full to true because it's an N-1-gram.
// A N-1-gram might extend left and right but we should still set full to true because it's an N-1-gram.
out_->left.full = left_done_ || (out_->left.length == model_.Order() - 1);
return prob_;
}
@ -173,17 +173,17 @@ template <class M> class RuleScore {
back_in, // Backoffs to use
in.left.pointers[extend_length - 1], extend_length, // Words to be extended
back_out, // Backoffs for the next score
next_use)); // Length of n-gram to use in next scoring.
next_use)); // Length of n-gram to use in next scoring.
if (next_use != out_->right.length) {
left_done_ = true;
if (!next_use) {
// Early exit.
// Early exit.
out_->right = in.right;
prob_ += model_.UnRest(in.left.pointers + extend_length, in.left.pointers + in.left.length, extend_length + 1);
return true;
}
}
// Continue scoring.
// Continue scoring.
return false;
}


@ -16,7 +16,7 @@ namespace {
#define Term(word) score.Terminal(m.GetVocabulary().Index(word));
#define VCheck(word, value) BOOST_CHECK_EQUAL(m.GetVocabulary().Index(word), value);
// Apparently some Boost versions use templates and are pretty strict about types matching.
// Apparently some Boost versions use templates and are pretty strict about types matching.
#define SLOPPY_CHECK_CLOSE(ref, value, tol) BOOST_CHECK_CLOSE(static_cast<double>(ref), static_cast<double>(value), static_cast<double>(tol));
template <class M> void Short(const M &m) {
@ -175,7 +175,7 @@ template <class M> void LookupVocab(const M &m, const StringPiece &str, std::vec
SLOPPY_CHECK_CLOSE(expect, RightToLeft(m, words, rest), 0.001); \
SLOPPY_CHECK_CLOSE(expect, TreeMiddle(m, words, rest), 0.001); \
// Build sentences, or parts thereof, from right to left.
// Build sentences, or parts thereof, from right to left.
template <class M> void GrowBig(const M &m, bool rest = false) {
std::vector<WordIndex> words;
float expect;


@ -1,7 +1,7 @@
#ifndef LM_LM_EXCEPTION_H
#define LM_LM_EXCEPTION_H
// Named to avoid conflict with util/exception.hh.
// Named to avoid conflict with util/exception.hh.
#include "util/exception.hh"
#include "util/string_piece.hh"


@ -1,7 +1,7 @@
#ifndef LM_MAX_ORDER_H
#define LM_MAX_ORDER_H
/* IF YOUR BUILD SYSTEM PASSES -DKENLM_MAX_ORDER, THEN CHANGE THE BUILD SYSTEM.
* If not, this is the default maximum order.
* If not, this is the default maximum order.
* Having this limit means that State can be
* (kMaxOrder - 1) * sizeof(float) bytes instead of
* sizeof(float*) + (kMaxOrder - 1) * sizeof(float) + malloc overhead


@ -25,7 +25,7 @@ namespace lm {
namespace ngram {
namespace detail {
// Should return the same results as SRI.
// Should return the same results as SRI.
// ModelFacade typedefs Vocabulary so we use VocabularyT to avoid naming conflicts.
template <class Search, class VocabularyT> class GenericModel : public base::ModelFacade<GenericModel<Search, VocabularyT>, State, VocabularyT> {
private:
@ -38,7 +38,7 @@ template <class Search, class VocabularyT> class GenericModel : public base::Mod
/* Get the size of memory that will be mapped given ngram counts. This
* does not include small non-mapped control structures, such as this class
* itself.
* itself.
*/
static uint64_t Size(const std::vector<uint64_t> &counts, const Config &config = Config());
@ -46,47 +46,47 @@ template <class Search, class VocabularyT> class GenericModel : public base::Mod
* files must have the format expected by this class or you'll get an
* exception. So TrieModel can only load ARPA or binary created by
* TrieModel. To classify binary files, call RecognizeBinary in
* lm/binary_format.hh.
* lm/binary_format.hh.
*/
explicit GenericModel(const char *file, const Config &config = Config());
/* Score p(new_word | in_state) and incorporate new_word into out_state.
* Note that in_state and out_state must be different references:
* &in_state != &out_state.
* &in_state != &out_state.
*/
FullScoreReturn FullScore(const State &in_state, const WordIndex new_word, State &out_state) const;
/* Slower call without in_state. Try to remember state, but sometimes it
* would cost too much memory or your decoder isn't setup properly.
* would cost too much memory or your decoder isn't setup properly.
* To use this function, make an array of WordIndex containing the context
* vocabulary ids in reverse order. Then, pass the bounds of the array:
* [context_rbegin, context_rend). The new_word is not part of the context
* array unless you intend to repeat words.
* array unless you intend to repeat words.
*/
FullScoreReturn FullScoreForgotState(const WordIndex *context_rbegin, const WordIndex *context_rend, const WordIndex new_word, State &out_state) const;
/* Get the state for a context. Don't use this if you can avoid it. Use
* BeginSentenceState or NullContextState and extend from those. If
* you're only going to use this state to call FullScore once, use
* FullScoreForgotState.
* FullScoreForgotState.
* To use this function, make an array of WordIndex containing the context
* vocabulary ids in reverse order. Then, pass the bounds of the array:
* [context_rbegin, context_rend).
* [context_rbegin, context_rend).
*/
void GetState(const WordIndex *context_rbegin, const WordIndex *context_rend, State &out_state) const;
/* More efficient version of FullScore where a partial n-gram has already
* been scored.
* NOTE: THE RETURNED .rest AND .prob ARE RELATIVE TO THE .rest RETURNED BEFORE.
* been scored.
* NOTE: THE RETURNED .rest AND .prob ARE RELATIVE TO THE .rest RETURNED BEFORE.
*/
FullScoreReturn ExtendLeft(
// Additional context in reverse order. This will update add_rend to
// Additional context in reverse order. This will update add_rend to
const WordIndex *add_rbegin, const WordIndex *add_rend,
// Backoff weights to use.
// Backoff weights to use.
const float *backoff_in,
// extend_left returned by a previous query.
uint64_t extend_pointer,
// Length of n-gram that the pointer corresponds to.
// Length of n-gram that the pointer corresponds to.
unsigned char extend_length,
// Where to write additional backoffs for [extend_length + 1, min(Order() - 1, return.ngram_length)]
float *backoff_out,
@ -95,17 +95,17 @@ template <class Search, class VocabularyT> class GenericModel : public base::Mod
/* Return probabilities minus rest costs for an array of pointers. The
* first length should be the length of the n-gram to which pointers_begin
* points.
* points.
*/
float UnRest(const uint64_t *pointers_begin, const uint64_t *pointers_end, unsigned char first_length) const {
// Compiler should optimize this if away.
// Compiler should optimize this if away.
return Search::kDifferentRest ? InternalUnRest(pointers_begin, pointers_end, first_length) : 0.0;
}
private:
FullScoreReturn ScoreExceptBackoff(const WordIndex *const context_rbegin, const WordIndex *const context_rend, const WordIndex new_word, State &out_state) const;
// Score bigrams and above. Do not include backoff.
// Score bigrams and above. Do not include backoff.
void ResumeScore(const WordIndex *context_rbegin, const WordIndex *const context_rend, unsigned char starting_order_minus_2, typename Search::Node &node, float *backoff_out, unsigned char &next_use, FullScoreReturn &ret) const;
// Appears after Size in the cc file.
@ -116,7 +116,7 @@ template <class Search, class VocabularyT> class GenericModel : public base::Mod
float InternalUnRest(const uint64_t *pointers_begin, const uint64_t *pointers_end, unsigned char first_length) const;
BinaryFormat backing_;
VocabularyT vocab_;
Search search_;
@ -124,8 +124,8 @@ template <class Search, class VocabularyT> class GenericModel : public base::Mod
} // namespace detail
// Instead of typedef, inherit. This allows the Model etc to be forward declared.
// Oh the joys of C and C++.
// Instead of typedef, inherit. This allows the Model etc to be forward declared.
// Oh the joys of C and C++.
#define LM_COMMA() ,
#define LM_NAME_MODEL(name, from)\
class name : public from {\
@ -140,7 +140,7 @@ LM_NAME_MODEL(ArrayTrieModel, detail::GenericModel<trie::TrieSearch<DontQuantize
LM_NAME_MODEL(QuantTrieModel, detail::GenericModel<trie::TrieSearch<SeparatelyQuantize LM_COMMA() trie::DontBhiksha> LM_COMMA() SortedVocabulary>);
LM_NAME_MODEL(QuantArrayTrieModel, detail::GenericModel<trie::TrieSearch<SeparatelyQuantize LM_COMMA() trie::ArrayBhiksha> LM_COMMA() SortedVocabulary>);
// Default implementation. No real reason for it to be the default.
// Default implementation. No real reason for it to be the default.
typedef ::lm::ngram::ProbingVocabulary Vocabulary;
typedef ProbingModel Model;


@ -7,7 +7,7 @@
#include <boost/test/unit_test.hpp>
#include <boost/test/floating_point_comparison.hpp>
// Apparently some Boost versions use templates and are pretty strict about types matching.
// Apparently some Boost versions use templates and are pretty strict about types matching.
#define SLOPPY_CHECK_CLOSE(ref, value, tol) BOOST_CHECK_CLOSE(static_cast<double>(ref), static_cast<double>(value), static_cast<double>(tol));
namespace lm {
@ -118,7 +118,7 @@ template <class M> void Blanks(const M &model) {
AppendTest("not_found", 1, -1.995635 - 7.0 - 0.30103, true);
state = model.NullContextState();
// higher looking is a blank.
// higher looking is a blank.
AppendTest("higher", 1, -1.509559, false);
AppendTest("looking", 2, -1.285941 - 0.30103, false);
@ -150,7 +150,7 @@ template <class M> void Unknowns(const M &model) {
State preserve = state;
AppendTest("not_found2", 2, -15.0, true);
AppendTest("not_found3", 2, -15.0 - 2.0, true);
state = preserve;
AppendTest("however", 2, -4, true);
AppendTest("not_found3", 3, -6, true);
@ -167,7 +167,7 @@ template <class M> void MinimalState(const M &model) {
AppendTest("foo", 1, -3.141592, true);
BOOST_CHECK_EQUAL(1, state.length);
AppendTest("bar", 2, -6.0, true);
// Has to include the backoff weight.
// Has to include the backoff weight.
BOOST_CHECK_EQUAL(1, state.length);
AppendTest("bar", 1, -2.718281 + 3.0, true);
BOOST_CHECK_EQUAL(1, state.length);
@ -263,7 +263,7 @@ template <class M> void Stateless(const M &model) {
// the
AppendTest("the", 1, -4.04005, true);
StatelessTest(5, 5, 1, -4.04005);
// No context of the.
// No context of the.
StatelessTest(5, 0, 1, -1.687872);
// biarritz
StatelessTest(6, 1, 1, -1.9889);


@ -8,7 +8,7 @@ namespace ngram {
* and I want to preserve existing binary files. */
typedef enum {PROBING=0, REST_PROBING=1, TRIE=2, QUANT_TRIE=3, ARRAY_TRIE=4, QUANT_ARRAY_TRIE=5} ModelType;
// Historical names.
// Historical names.
const ModelType HASH_PROBING = PROBING;
const ModelType TRIE_SORTED = TRIE;
const ModelType QUANT_TRIE_SORTED = QUANT_TRIE;


@ -22,7 +22,7 @@ struct BasicPrint {
std::cout << "Total: " << total << " OOV: " << oov << '\n';
}
void Summary(double, double, uint64_t, uint64_t) {}
};
struct FullPrint : public BasicPrint {
@ -31,7 +31,7 @@ struct FullPrint : public BasicPrint {
}
void Summary(double ppl_including_oov, double ppl_excluding_oov, uint64_t corpus_oov, uint64_t corpus_tokens) {
std::cout <<
std::cout <<
"Perplexity including OOVs:\t" << ppl_including_oov << "\n"
"Perplexity excluding OOVs:\t" << ppl_excluding_oov << "\n"
"OOVs:\t" << corpus_oov << "\n"


@ -35,9 +35,9 @@ template <class Model> ExtendReturn ExtendLoop(
unsigned char i = 0;
unsigned char length = pointers_end - pointers;
// pointers_write is NULL means that the existing left state is full, so we should use completed probabilities.
// pointers_write is NULL means that the existing left state is full, so we should use completed probabilities.
if (pointers_write) {
// Using full context, writing to new left state.
// Using full context, writing to new left state.
for (; i < length; ++i) {
FullScoreReturn ret(model.ExtendLeft(
add_rbegin, add_rbegin + value.next_use,
@ -61,7 +61,7 @@ template <class Model> ExtendReturn ExtendLoop(
}
}
}
// Using some of the new context.
// Using some of the new context.
for (; i < length && value.next_use; ++i) {
FullScoreReturn ret(model.ExtendLeft(
add_rbegin, add_rbegin + value.next_use,
@ -73,7 +73,7 @@ template <class Model> ExtendReturn ExtendLoop(
value.adjust += ret.prob;
}
float unrest = model.UnRest(pointers + i, pointers_end, i + seen + 1);
// Using none of the new context.
// Using none of the new context.
value.adjust += unrest;
std::copy(backoff_in, backoff_in + value.next_use, backoff_write);
@ -100,7 +100,7 @@ template <class Model> float RevealBefore(const Model &model, const Right &revea
if (left.full) {
for (unsigned char i = 0; i < value.next_use; ++i) value.adjust += backoff_buffer[i];
} else {
// If left wasn't full when it came in, put words into right state.
// If left wasn't full when it came in, put words into right state.
std::copy(reveal.words + seen, reveal.words + seen + value.next_use, right.words + right.length);
right.length += value.next_use;
left.full = value.make_full || (right.length == model.Order() - 1);


@ -123,7 +123,7 @@ BOOST_AUTO_TEST_CASE(EndSentence) {
before.words[1] = loin;
before.backoff[0] = -0.845098;
before.backoff[1] = 0.0;
before.length = 1;
BOOST_CHECK_CLOSE(-0.0410707, RevealBefore(m, before, 0, true, between.left, between.right), 0.001);
BOOST_CHECK_EQUAL(0, between.left.length);
@ -159,7 +159,7 @@ void CheckAdjustment(const RestProbingModel &model, float expect, const Right &b
if (before_full) {
got += RevealBefore(model, before, before.length, true, between.left, between.right);
}
// Sometimes they're zero and BOOST_CHECK_CLOSE fails for this.
// Sometimes they're zero and BOOST_CHECK_CLOSE fails for this.
BOOST_CHECK(fabs(expect - got) < 0.001);
}


@ -50,12 +50,12 @@ void SeparatelyQuantize::UpdateConfigFromBinary(const BinaryFormat &file, uint64
void SeparatelyQuantize::SetupMemory(void *base, unsigned char order, const Config &config) {
prob_bits_ = config.prob_bits;
backoff_bits_ = config.backoff_bits;
// We need the reserved values.
// We need the reserved values.
if (config.prob_bits == 0) UTIL_THROW(ConfigException, "You can't quantize probability to zero");
if (config.backoff_bits == 0) UTIL_THROW(ConfigException, "You can't quantize backoff to zero");
if (config.prob_bits > 25) UTIL_THROW(ConfigException, "For efficiency reasons, quantizing probability supports at most 25 bits. Currently you have requested " << static_cast<unsigned>(config.prob_bits) << " bits.");
if (config.backoff_bits > 25) UTIL_THROW(ConfigException, "For efficiency reasons, quantizing backoff supports at most 25 bits. Currently you have requested " << static_cast<unsigned>(config.backoff_bits) << " bits.");
// Reserve 8 byte header for bit counts.
// Reserve 8 byte header for bit counts.
actual_base_ = static_cast<uint8_t*>(base);
float *start = reinterpret_cast<float*>(actual_base_ + 8);
for (unsigned char i = 0; i < order - 2; ++i) {


@ -85,7 +85,7 @@ class DontQuantize {
void SetupMemory(void * /*start*/, unsigned char /*order*/, const Config & /*config*/) {}
static const bool kTrain = false;
// These should never be called because kTrain is false.
// These should never be called because kTrain is false.
void Train(uint8_t /*order*/, std::vector<float> &/*prob*/, std::vector<float> &/*backoff*/) {}
void TrainProb(uint8_t, std::vector<float> &/*prob*/) {}
@ -142,7 +142,7 @@ class SeparatelyQuantize {
static uint64_t Size(uint8_t order, const Config &config) {
uint64_t longest_table = (static_cast<uint64_t>(1) << static_cast<uint64_t>(config.prob_bits)) * sizeof(float);
uint64_t middle_table = (static_cast<uint64_t>(1) << static_cast<uint64_t>(config.backoff_bits)) * sizeof(float) + longest_table;
// unigrams are currently not quantized so no need for a table.
// unigrams are currently not quantized so no need for a table.
return (order - 2) * middle_table + longest_table + /* for the bit counts and alignment padding) */ 8;
}
@ -168,7 +168,7 @@ class SeparatelyQuantize {
float Rest() const { return Prob(); }
void Write(float prob, float backoff) const {
util::WriteInt57(address_.base, address_.offset, ProbBins().Bits() + BackoffBins().Bits(),
util::WriteInt57(address_.base, address_.offset, ProbBins().Bits() + BackoffBins().Bits(),
(ProbBins().EncodeProb(prob) << BackoffBins().Bits()) | BackoffBins().EncodeBackoff(backoff));
}
@ -183,7 +183,7 @@ class SeparatelyQuantize {
class LongestPointer {
public:
LongestPointer(const SeparatelyQuantize &quant, const util::BitAddress &address) : table_(&quant.LongestTable()), address_(address) {}
LongestPointer() : address_(NULL, 0) {}
bool Found() const { return address_.base != NULL; }
@ -206,7 +206,7 @@ class SeparatelyQuantize {
void SetupMemory(void *start, unsigned char order, const Config &config);
static const bool kTrain = true;
// Assumes 0.0 is removed from backoff.
// Assumes 0.0 is removed from backoff.
void Train(uint8_t order, std::vector<float> &prob, std::vector<float> &backoff);
// Train just probabilities (for longest order).
void TrainProb(uint8_t order, std::vector<float> &prob);


@ -9,7 +9,7 @@ struct FullScoreReturn {
// log10 probability
float prob;
/* The length of n-gram matched. Do not use this for recombination.
/* The length of n-gram matched. Do not use this for recombination.
* Consider a model containing only the following n-grams:
* -1 foo
* -3.14 bar
@ -18,9 +18,9 @@ struct FullScoreReturn {
*
* If you score ``bar'' then ngram_length is 1 and recombination state is the
* empty string because bar has zero backoff and does not extend to the
* right.
* If you score ``foo'' then ngram_length is 1 and recombination state is
* ``foo''.
* right.
* If you score ``foo'' then ngram_length is 1 and recombination state is
* ``foo''.
*
* Ideally, keep output states around and compare them. Failing that,
* get out_state.ValidLength() and use that length for recombination.
@ -29,7 +29,7 @@ struct FullScoreReturn {
/* Left extension information. If independent_left is set, then prob is
* independent of words to the left (up to additional backoff). Otherwise,
* extend_left indicates how to efficiently extend further to the left.
* extend_left indicates how to efficiently extend further to the left.
*/
bool independent_left;
uint64_t extend_left; // Defined only if independent_left

Some files were not shown because too many files have changed in this diff.