Remove trailing whitespace in C++ files.

This commit is contained in:
Jeroen Vermeulen 2015-04-30 12:05:11 +07:00
parent 85acdc62b1
commit eca5824100
368 changed files with 5749 additions and 5749 deletions

View File

@ -1,101 +1,101 @@
// $Id: Vocabulary.cpp 1565 2008-02-22 14:42:01Z bojar $ // $Id: Vocabulary.cpp 1565 2008-02-22 14:42:01Z bojar $
#include "Vocabulary.h" #include "Vocabulary.h"
#include <fstream> #include <fstream>
namespace
{

// Capacity, in chars, of the fixed line buffer that Vocabulary::Load()
// declares and passes to SAFE_GETLINE.
const int MAX_LENGTH = 10000;

} // namespace
using namespace std; using namespace std;
// as in beamdecoder/tables.cpp
//
// Splits a NUL-terminated character array into words separated by runs of
// spaces and/or tabs and returns the id of each word in order of appearance.
// Every word is passed through StoreIfNew(), so words seen for the first
// time are added to the vocabulary as a side effect.
vector<WORD_ID> Vocabulary::Tokenize( const char input[] )
{
  vector< WORD_ID > ids;
  const char *cursor = input;
  while (*cursor != '\0') {
    // Skip the separator run in front of the next word (if any).
    while (*cursor == ' ' || *cursor == '\t')
      ++cursor;
    if (*cursor == '\0')
      break;  // only trailing separators were left
    // Consume one word: everything up to the next separator or the end.
    const char *wordBegin = cursor;
    while (*cursor != '\0' && *cursor != ' ' && *cursor != '\t')
      ++cursor;
    ids.push_back( StoreIfNew ( string( wordBegin, cursor - wordBegin ) ) );
  }
  return ids;
}
// Returns the id of `word`, registering it first when it is not yet known.
// A new word is appended to `vocab` and receives the size `vocab` had
// before the append as its id; the word -> id mapping is recorded in `lookup`.
WORD_ID Vocabulary::StoreIfNew( const WORD& word )
{
  map<WORD, WORD_ID>::iterator known = lookup.find( word );
  if( known == lookup.end() ) {
    // Unknown word: the next free id is the current vocabulary size.
    WORD_ID freshId = vocab.size();
    vocab.push_back( word );
    lookup[ word ] = freshId;
    return freshId;
  }
  return known->second;
}
// Looks up the id of `word` without modifying the vocabulary.
// Returns the stored id when the word is present in `lookup`, otherwise 0.
// NOTE(review): StoreIfNew() hands out vocab.size() as the id, so the first
// word stored into an empty vocabulary also gets 0 -- confirm whether callers
// need to distinguish "unknown" from that entry.
WORD_ID Vocabulary::GetWordID( const WORD &word ) const
{
  map<WORD, WORD_ID>::const_iterator hit = lookup.find( word );
  if( hit != lookup.end() ) {
    WORD_ID found = (WORD_ID) hit->second;
    return found;
  }
  return 0;
}
void Vocabulary::Save(const string& fileName ) const void Vocabulary::Save(const string& fileName ) const
{ {
ofstream vcbFile; ofstream vcbFile;
vcbFile.open( fileName.c_str(), ios::out | ios::ate | ios::trunc); vcbFile.open( fileName.c_str(), ios::out | ios::ate | ios::trunc);
if (!vcbFile) { if (!vcbFile) {
cerr << "Failed to open " << vcbFile << endl; cerr << "Failed to open " << vcbFile << endl;
exit(1); exit(1);
} }
vector< WORD >::const_iterator i; vector< WORD >::const_iterator i;
for(i = vocab.begin(); i != vocab.end(); i++) { for(i = vocab.begin(); i != vocab.end(); i++) {
const string &word = *i; const string &word = *i;
vcbFile << word << endl; vcbFile << word << endl;
} }
vcbFile.close(); vcbFile.close();
} }
void Vocabulary::Load(const string& fileName ) void Vocabulary::Load(const string& fileName )
{ {
ifstream vcbFile; ifstream vcbFile;
char line[MAX_LENGTH]; char line[MAX_LENGTH];
vcbFile.open(fileName.c_str()); vcbFile.open(fileName.c_str());
if (!vcbFile) { if (!vcbFile) {
cerr << "no such file or directory: " << vcbFile << endl; cerr << "no such file or directory: " << vcbFile << endl;
exit(1); exit(1);
} }
cerr << "loading from " << fileName << endl; cerr << "loading from " << fileName << endl;
istream *fileP = &vcbFile; istream *fileP = &vcbFile;
int count = 0; int count = 0;
while(!fileP->eof()) { while(!fileP->eof()) {
SAFE_GETLINE((*fileP), line, MAX_LENGTH, '\n'); SAFE_GETLINE((*fileP), line, MAX_LENGTH, '\n');
if (fileP->eof()) break; if (fileP->eof()) break;
int length = 0; int length = 0;
for(; line[length] != '\0'; length++); for(; line[length] != '\0'; length++);
StoreIfNew( string( line, length ) ); StoreIfNew( string( line, length ) );
count++; count++;
} }
vcbFile.close(); vcbFile.close();
cerr << count << " word read, vocabulary size " << vocab.size() << endl; cerr << count << " word read, vocabulary size " << vocab.size() << endl;
} }

View File

@ -46,7 +46,7 @@ RE2 curr_en_x("^[Nn]?[\'][\\p{L}]"); // english contraction suffixes conjoin to
RE2 pre_en_x(".*[\\p{L}\\p{N}]+$"); // valid english contraction prefixes RE2 pre_en_x(".*[\\p{L}\\p{N}]+$"); // valid english contraction prefixes
RE2 curr_fr_x(".*[\\p{L}\\p{N}]+[\']"); // french/italian contraction prefixes conjoin to the right RE2 curr_fr_x(".*[\\p{L}\\p{N}]+[\']"); // french/italian contraction prefixes conjoin to the right
RE2 post_fr_x("^[\\p{L}\\p{N}]*"); // valid french/italian contraction suffixes RE2 post_fr_x("^[\\p{L}\\p{N}]*"); // valid french/italian contraction suffixes
// anything rarely used will just be given as a string and compiled on demand by RE2 // anything rarely used will just be given as a string and compiled on demand by RE2
const char * const char *
SPC_BYTE = " "; SPC_BYTE = " ";
@ -85,8 +85,8 @@ const char *ESCAPE_MOSES[] = {
"&apos;", // ' 6 (27) "&apos;", // ' 6 (27)
"&quot;", // " 7 (22) "&quot;", // " 7 (22)
}; };
const std::set<std::string> const std::set<std::string>
ESCAPE_SET = { ESCAPE_SET = {
std::string(ESCAPE_MOSES[0]), std::string(ESCAPE_MOSES[0]),
std::string(ESCAPE_MOSES[1]), std::string(ESCAPE_MOSES[1]),
@ -98,7 +98,7 @@ ESCAPE_SET = {
std::string(ESCAPE_MOSES[7]), std::string(ESCAPE_MOSES[7]),
}; };
const std::map<std::wstring,gunichar> const std::map<std::wstring,gunichar>
ENTITY_MAP = { ENTITY_MAP = {
{ std::wstring(L"&quot;"), L'"' }, { std::wstring(L"&quot;"), L'"' },
{ std::wstring(L"&amp;"), L'&' }, { std::wstring(L"&amp;"), L'&' },
@ -355,7 +355,7 @@ ENTITY_MAP = {
{ std::wstring(L"&diams;"), L'\u2666' } { std::wstring(L"&diams;"), L'\u2666' }
}; };
inline gunichar inline gunichar
get_entity(gunichar *ptr, size_t len) { get_entity(gunichar *ptr, size_t len) {
// try hex, decimal entity first // try hex, decimal entity first
gunichar ech(0); gunichar ech(0);
@ -380,16 +380,16 @@ get_entity(gunichar *ptr, size_t len) {
ech = 0; ech = 0;
} }
} }
if (ech) if (ech)
return ech; return ech;
std::map<std::wstring,gunichar>::const_iterator it = std::map<std::wstring,gunichar>::const_iterator it =
ENTITY_MAP.find(std::wstring((wchar_t *)(ptr),len)); ENTITY_MAP.find(std::wstring((wchar_t *)(ptr),len));
return it != ENTITY_MAP.end() ? it->second : gunichar(0); return it != ENTITY_MAP.end() ? it->second : gunichar(0);
} }
inline gunichar inline gunichar
get_entity(char *ptr, size_t len) { get_entity(char *ptr, size_t len) {
glong ulen = 0; glong ulen = 0;
gunichar *gtmp = g_utf8_to_ucs4_fast((const gchar *)ptr, len, &ulen); gunichar *gtmp = g_utf8_to_ucs4_fast((const gchar *)ptr, len, &ulen);
@ -399,7 +399,7 @@ get_entity(char *ptr, size_t len) {
} }
inline std::string inline std::string
trim(const std::string& in) trim(const std::string& in)
{ {
std::size_t start = 0; std::size_t start = 0;
@ -413,7 +413,7 @@ trim(const std::string& in)
} }
inline std::vector<std::string> inline std::vector<std::string>
split(const std::string& in) split(const std::string& in)
{ {
std::vector<std::string> outv; std::vector<std::string> outv;
@ -476,7 +476,7 @@ Tokenizer::Tokenizer(const Parameters& _)
// //
// dtor deletes dynamically allocated per-language RE2 compiled expressions // dtor deletes dynamically allocated per-language RE2 compiled expressions
// //
Tokenizer::~Tokenizer() Tokenizer::~Tokenizer()
{ {
for (auto& ptr : prot_pat_vec) { for (auto& ptr : prot_pat_vec) {
if (ptr == &numprefixed_x || ptr == &quasinumeric_x) if (ptr == &numprefixed_x || ptr == &quasinumeric_x)
@ -491,7 +491,7 @@ Tokenizer::~Tokenizer()
// others into nbpre_gen_set // others into nbpre_gen_set
// //
std::pair<int,int> std::pair<int,int>
Tokenizer::load_prefixes(std::ifstream& ifs) Tokenizer::load_prefixes(std::ifstream& ifs)
{ {
RE2 numonly("(.*)[\\s]+(\\#NUMERIC_ONLY\\#)"); RE2 numonly("(.*)[\\s]+(\\#NUMERIC_ONLY\\#)");
std::string line; std::string line;
@ -547,7 +547,7 @@ Tokenizer::init(const char *cfg_dir_optional) {
try { try {
std::pair<int,int> counts = load_prefixes(cfg); std::pair<int,int> counts = load_prefixes(cfg);
if (verbose_p) { if (verbose_p) {
std::cerr << "loaded " << counts.first << " non-numeric, " std::cerr << "loaded " << counts.first << " non-numeric, "
<< counts.second << " numeric prefixes from " << counts.second << " numeric prefixes from "
<< nbpre_path << std::endl; << nbpre_path << std::endl;
} }
@ -570,7 +570,7 @@ Tokenizer::init(const char *cfg_dir_optional) {
std::string protpat_path(cfg_dir); std::string protpat_path(cfg_dir);
protpat_path.append("/protected_pattern.").append(lang_iso); protpat_path.append("/protected_pattern.").append(lang_iso);
// default to generic version // default to generic version
if (::access(protpat_path.c_str(),R_OK)) if (::access(protpat_path.c_str(),R_OK))
protpat_path = protpat_path.substr(0,protpat_path.size()-lang_iso.size()-1); protpat_path = protpat_path.substr(0,protpat_path.size()-lang_iso.size()-1);
prot_pat_vec.push_back(&numprefixed_x); prot_pat_vec.push_back(&numprefixed_x);
@ -596,7 +596,7 @@ Tokenizer::init(const char *cfg_dir_optional) {
throw std::runtime_error(ess.str()); throw std::runtime_error(ess.str());
} }
if (verbose_p) { if (verbose_p) {
std::cerr << "loaded " << npat << " protected patterns from " std::cerr << "loaded " << npat << " protected patterns from "
<< protpat_path << std::endl; << protpat_path << std::endl;
} }
} else if (verbose_p) { } else if (verbose_p) {
@ -612,7 +612,7 @@ Tokenizer::reset() {
// //
// apply ctor-selected tokenization to a string, in-place, no newlines allowed, // apply ctor-selected tokenization to a string, in-place, no newlines allowed,
// assumes protections are applied already, some invariants are in place, // assumes protections are applied already, some invariants are in place,
// e.g. that successive chars <= ' ' have been normalized to a single ' ' // e.g. that successive chars <= ' ' have been normalized to a single ' '
// //
void void
@ -633,7 +633,7 @@ Tokenizer::protected_tokenize(std::string& text) {
} }
if (pos < textpc.size() && textpc[pos] != ' ') if (pos < textpc.size() && textpc[pos] != ' ')
words.push_back(textpc.substr(pos,textpc.size()-pos)); words.push_back(textpc.substr(pos,textpc.size()-pos));
// regurgitate words with look-ahead handling for tokens with final mumble // regurgitate words with look-ahead handling for tokens with final mumble
std::string outs; std::string outs;
std::size_t nwords(words.size()); std::size_t nwords(words.size());
@ -659,7 +659,7 @@ Tokenizer::protected_tokenize(std::string& text) {
// lower-case look-ahead does not break // lower-case look-ahead does not break
sentence_break_p = false; sentence_break_p = false;
} }
} }
outs.append(words[ii].data(),len); outs.append(words[ii].data(),len);
if (sentence_break_p) if (sentence_break_p)
@ -671,15 +671,15 @@ Tokenizer::protected_tokenize(std::string& text) {
} }
bool bool
Tokenizer::unescape(std::string& word) { Tokenizer::unescape(std::string& word) {
std::ostringstream oss; std::ostringstream oss;
std::size_t was = 0; // last processed std::size_t was = 0; // last processed
std::size_t pos = 0; // last unprocessed std::size_t pos = 0; // last unprocessed
std::size_t len = 0; // processed length std::size_t len = 0; // processed length
bool hit = false; bool hit = false;
for (std::size_t endp=0; for (std::size_t endp=0;
(pos = word.find('&',was)) != std::string::npos && (endp = word.find(';',pos)) != std::string::npos; (pos = word.find('&',was)) != std::string::npos && (endp = word.find(';',pos)) != std::string::npos;
was = endp == std::string::npos ? pos : 1+endp) { was = endp == std::string::npos ? pos : 1+endp) {
len = endp - pos + 1; len = endp - pos + 1;
glong ulen(0); glong ulen(0);
@ -703,7 +703,7 @@ Tokenizer::unescape(std::string& word) {
} }
g_free(gtmp); g_free(gtmp);
} }
if (was < word.size()) if (was < word.size())
oss << word.substr(was); oss << word.substr(was);
if (hit) if (hit)
word = oss.str(); word = oss.str();
@ -727,7 +727,7 @@ Tokenizer::escape(std::string& text) {
if (mod_p) if (mod_p)
outs.append(pp,pt-pp+1); outs.append(pp,pt-pp+1);
} else { } else {
if (mod_p) if (mod_p)
outs.append(pp,mk-pp); outs.append(pp,mk-pp);
pt = --mk; pt = --mk;
} }
@ -751,7 +751,7 @@ Tokenizer::escape(std::string& text) {
} else if (*pt > ']') { } else if (*pt > ']') {
if (*pt =='|') { // 7c if (*pt =='|') { // 7c
sequence_p = ESCAPE_MOSES[0]; sequence_p = ESCAPE_MOSES[0];
} }
} else if (*pt > 'Z') { } else if (*pt > 'Z') {
if (*pt == '<') { // 3e if (*pt == '<') { // 3e
sequence_p = ESCAPE_MOSES[4]; sequence_p = ESCAPE_MOSES[4];
@ -761,11 +761,11 @@ Tokenizer::escape(std::string& text) {
sequence_p = ESCAPE_MOSES[1]; sequence_p = ESCAPE_MOSES[1];
} else if (*pt == ']') { // 5d } else if (*pt == ']') { // 5d
sequence_p = ESCAPE_MOSES[2]; sequence_p = ESCAPE_MOSES[2];
} }
} }
if (sequence_p) { if (sequence_p) {
if (pt > pp) if (pt > pp)
outs.append(pp,pt-pp); outs.append(pp,pt-pp);
outs.append(sequence_p); outs.append(sequence_p);
mod_p = true; mod_p = true;
@ -774,7 +774,7 @@ Tokenizer::escape(std::string& text) {
++pt; ++pt;
} }
} }
if (mod_p) { if (mod_p) {
if (pp < pt) { if (pp < pt) {
outs.append(pp,pt-pp); outs.append(pp,pt-pp);
@ -795,13 +795,13 @@ Tokenizer::penn_tokenize(const std::string& buf)
std::string text(buf); std::string text(buf);
std::string outs; std::string outs;
if (skip_alltags_p) if (skip_alltags_p)
RE2::GlobalReplace(&text,genl_tags_x,SPC_BYTE); RE2::GlobalReplace(&text,genl_tags_x,SPC_BYTE);
// directed quote patches // directed quote patches
size_t len = text.size(); size_t len = text.size();
if (len > 2 && text.substr(0,2) == "``") if (len > 2 && text.substr(0,2) == "``")
text.replace(0,2,"`` ",3); text.replace(0,2,"`` ",3);
else if (text[0] == '"') else if (text[0] == '"')
text.replace(0,1,"`` ",3); text.replace(0,1,"`` ",3);
else if (text[0] == '`' || text[0] == '\'') else if (text[0] == '`' || text[0] == '\'')
@ -811,9 +811,9 @@ Tokenizer::penn_tokenize(const std::string& buf)
RE2::GlobalReplace(&text,x1_v_gg,one_gg); RE2::GlobalReplace(&text,x1_v_gg,one_gg);
RE2::GlobalReplace(&text,x1_v_g,"\\1 ` \\2"); RE2::GlobalReplace(&text,x1_v_g,"\\1 ` \\2");
RE2::GlobalReplace(&text,x1_v_q,"\\1 ` "); RE2::GlobalReplace(&text,x1_v_q,"\\1 ` ");
// protect ellipsis // protect ellipsis
for (size_t pos = text.find("..."); pos != std::string::npos; pos = text.find("...",pos+11)) for (size_t pos = text.find("..."); pos != std::string::npos; pos = text.find("...",pos+11))
text.replace(pos,3,"MANYELIPSIS",11); text.replace(pos,3,"MANYELIPSIS",11);
// numeric commas // numeric commas
@ -826,13 +826,13 @@ Tokenizer::penn_tokenize(const std::string& buf)
// isolable slash // isolable slash
RE2::GlobalReplace(&text,slash_x,special_refs); RE2::GlobalReplace(&text,slash_x,special_refs);
// isolate final period // isolate final period
RE2::GlobalReplace(&text,final_x,"\\1 \\2\\3"); RE2::GlobalReplace(&text,final_x,"\\1 \\2\\3");
// isolate q.m., e.m. // isolate q.m., e.m.
RE2::GlobalReplace(&text,qx_x,isolate_ref); RE2::GlobalReplace(&text,qx_x,isolate_ref);
// isolate braces // isolate braces
RE2::GlobalReplace(&text,braces_x,isolate_ref); RE2::GlobalReplace(&text,braces_x,isolate_ref);
@ -866,7 +866,7 @@ Tokenizer::penn_tokenize(const std::string& buf)
} }
std::string ntext(SPC_BYTE); std::string ntext(SPC_BYTE);
ntext.append(text); ntext.append(text);
// convert double quote to paired single-quotes // convert double quote to paired single-quotes
RE2::GlobalReplace(&ntext,"\""," '' "); RE2::GlobalReplace(&ntext,"\""," '' ");
@ -894,7 +894,7 @@ Tokenizer::penn_tokenize(const std::string& buf)
RE2::GlobalReplace(&ntext," ([Ww])anna "," \\1an na "); RE2::GlobalReplace(&ntext," ([Ww])anna "," \\1an na ");
protected_tokenize(ntext); protected_tokenize(ntext);
// restore ellipsis // restore ellipsis
RE2::GlobalReplace(&ntext,"MANYELIPSIS","..."); RE2::GlobalReplace(&ntext,"MANYELIPSIS","...");
@ -919,7 +919,7 @@ Tokenizer::quik_tokenize(const std::string& buf)
int num = 0; int num = 0;
// this is the main moses-compatible tokenizer // this is the main moses-compatible tokenizer
// push all the prefixes matching protected patterns // push all the prefixes matching protected patterns
std::vector<std::string> prot_stack; std::vector<std::string> prot_stack;
std::string match; std::string match;
@ -942,7 +942,7 @@ Tokenizer::quik_tokenize(const std::string& buf)
} }
} }
} }
const char *pt(text.c_str()); const char *pt(text.c_str());
const char *ep(pt + text.size()); const char *ep(pt + text.size());
while (pt < ep && *pt >= 0 && *pt <= ' ') while (pt < ep && *pt >= 0 && *pt <= ' ')
@ -990,8 +990,8 @@ Tokenizer::quik_tokenize(const std::string& buf)
if (!since_start) { if (!since_start) {
if (std::isalpha(char(*ucs4))) if (std::isalpha(char(*ucs4)))
alpha_prefix++; alpha_prefix++;
} else if (alpha_prefix == since_start } else if (alpha_prefix == since_start
&& char(*ucs4) == ':' && char(*ucs4) == ':'
&& next_type != G_UNICODE_SPACE_SEPARATOR) { && next_type != G_UNICODE_SPACE_SEPARATOR) {
in_url_p = true; in_url_p = true;
} }
@ -1018,7 +1018,7 @@ Tokenizer::quik_tokenize(const std::string& buf)
// fallthrough // fallthrough
case G_UNICODE_UPPERCASE_LETTER: case G_UNICODE_UPPERCASE_LETTER:
case G_UNICODE_LOWERCASE_LETTER: case G_UNICODE_LOWERCASE_LETTER:
if (downcase_p && curr_type == G_UNICODE_UPPERCASE_LETTER) if (downcase_p && curr_type == G_UNICODE_UPPERCASE_LETTER)
curr_uch = g_unichar_tolower(*ucs4); curr_uch = g_unichar_tolower(*ucs4);
break; break;
case G_UNICODE_SPACING_MARK: case G_UNICODE_SPACING_MARK:
@ -1082,8 +1082,8 @@ Tokenizer::quik_tokenize(const std::string& buf)
substitute_p = L"@-@"; substitute_p = L"@-@";
post_break_p = pre_break_p = true; post_break_p = pre_break_p = true;
} else if ( ( curr_uch > gunichar(L'\u002D') && curr_uch < gunichar(L'\u2010') ) || } else if ( ( curr_uch > gunichar(L'\u002D') && curr_uch < gunichar(L'\u2010') ) ||
( curr_uch > gunichar(L'\u2011') ( curr_uch > gunichar(L'\u2011')
&& curr_uch != gunichar(L'\u30A0') && curr_uch != gunichar(L'\u30A0')
&& curr_uch < gunichar(L'\uFE63') ) ) { && curr_uch < gunichar(L'\uFE63') ) ) {
// dash, not a hyphen // dash, not a hyphen
post_break_p = pre_break_p = true; post_break_p = pre_break_p = true;
@ -1151,7 +1151,7 @@ Tokenizer::quik_tokenize(const std::string& buf)
default: default:
post_break_p = pre_break_p = prev_uch != curr_uch; post_break_p = pre_break_p = prev_uch != curr_uch;
break; break;
} }
} }
} }
break; break;
@ -1159,8 +1159,8 @@ Tokenizer::quik_tokenize(const std::string& buf)
switch (curr_uch) { switch (curr_uch) {
case gunichar(L':'): case gunichar(L':'):
case gunichar(L'/'): case gunichar(L'/'):
if (refined_p && !in_url_p if (refined_p && !in_url_p
&& prev_type == G_UNICODE_DECIMAL_NUMBER && prev_type == G_UNICODE_DECIMAL_NUMBER
&& next_type == G_UNICODE_DECIMAL_NUMBER) { && next_type == G_UNICODE_DECIMAL_NUMBER) {
break; break;
} }
@ -1178,7 +1178,7 @@ Tokenizer::quik_tokenize(const std::string& buf)
break; break;
case gunichar(L'&'): case gunichar(L'&'):
if (unescape_p) { if (unescape_p) {
if (next_type == G_UNICODE_LOWERCASE_LETTER || next_type == G_UNICODE_UPPERCASE_LETTER if (next_type == G_UNICODE_LOWERCASE_LETTER || next_type == G_UNICODE_UPPERCASE_LETTER
|| next_type == G_UNICODE_DECIMAL_NUMBER || next_uch == gunichar(L'#')) { || next_type == G_UNICODE_DECIMAL_NUMBER || next_uch == gunichar(L'#')) {
gunichar *eptr = nxt4; gunichar *eptr = nxt4;
GUnicodeType eptr_type(G_UNICODE_UNASSIGNED); GUnicodeType eptr_type(G_UNICODE_UNASSIGNED);
@ -1223,16 +1223,16 @@ Tokenizer::quik_tokenize(const std::string& buf)
next_type = nxt4 < lim4 ? g_unichar_type(next_uch) : G_UNICODE_UNASSIGNED; next_type = nxt4 < lim4 ? g_unichar_type(next_uch) : G_UNICODE_UNASSIGNED;
goto retry; goto retry;
} }
} }
post_break_p = pre_break_p = !in_url_p || next_type != G_UNICODE_SPACE_SEPARATOR; post_break_p = pre_break_p = !in_url_p || next_type != G_UNICODE_SPACE_SEPARATOR;
if (escape_p) if (escape_p)
substitute_p = L"&amp;"; substitute_p = L"&amp;";
break; break;
case gunichar(L'\''): case gunichar(L'\''):
if (english_p) { if (english_p) {
if (!in_url_p) { if (!in_url_p) {
bool next_letter_p = next_type == G_UNICODE_LOWERCASE_LETTER bool next_letter_p = next_type == G_UNICODE_LOWERCASE_LETTER
|| next_type == G_UNICODE_UPPERCASE_LETTER; || next_type == G_UNICODE_UPPERCASE_LETTER;
pre_break_p = true; pre_break_p = true;
if (next_letter_p && refined_p) { if (next_letter_p && refined_p) {
@ -1241,9 +1241,9 @@ Tokenizer::quik_tokenize(const std::string& buf)
*(uptr - 1) = gunichar(L' '); *(uptr - 1) = gunichar(L' ');
*(uptr++) = prev_uch; *(uptr++) = prev_uch;
pre_break_p = false; pre_break_p = false;
} }
} }
post_break_p = since_start == 0 post_break_p = since_start == 0
|| (!next_letter_p && next_type != G_UNICODE_DECIMAL_NUMBER); || (!next_letter_p && next_type != G_UNICODE_DECIMAL_NUMBER);
} }
} else if (latin_p) { } else if (latin_p) {
@ -1252,12 +1252,12 @@ Tokenizer::quik_tokenize(const std::string& buf)
} else { } else {
post_break_p = pre_break_p = !in_url_p; post_break_p = pre_break_p = !in_url_p;
} }
if (escape_p) if (escape_p)
substitute_p = L"&apos;"; substitute_p = L"&apos;";
break; break;
case gunichar(L'"'): case gunichar(L'"'):
post_break_p = pre_break_p = true; post_break_p = pre_break_p = true;
if (escape_p) if (escape_p)
substitute_p = L"&quot;"; substitute_p = L"&quot;";
break; break;
case gunichar(L','): case gunichar(L','):
@ -1303,7 +1303,7 @@ Tokenizer::quik_tokenize(const std::string& buf)
} }
} }
// terminal isolated letter does not break // terminal isolated letter does not break
} else if (class_follows_p(nxt4,lim4,G_UNICODE_LOWERCASE_LETTER) || } else if (class_follows_p(nxt4,lim4,G_UNICODE_LOWERCASE_LETTER) ||
g_unichar_type(*nxt4) == G_UNICODE_DASH_PUNCTUATION) { g_unichar_type(*nxt4) == G_UNICODE_DASH_PUNCTUATION) {
// lower-case look-ahead does not break // lower-case look-ahead does not break
} else { } else {
@ -1315,7 +1315,7 @@ Tokenizer::quik_tokenize(const std::string& buf)
pre_break_p = true; pre_break_p = true;
break; break;
} }
} }
break; break;
} }
} else { } else {
@ -1346,11 +1346,11 @@ Tokenizer::quik_tokenize(const std::string& buf)
case gunichar(L')'): case gunichar(L')'):
break; break;
case gunichar(L'['): case gunichar(L'['):
if (escape_p) if (escape_p)
substitute_p = L"&#91;"; substitute_p = L"&#91;";
break; break;
case gunichar(L']'): case gunichar(L']'):
if (escape_p) if (escape_p)
substitute_p = L"&#93;"; substitute_p = L"&#93;";
break; break;
default: default:
@ -1377,7 +1377,7 @@ Tokenizer::quik_tokenize(const std::string& buf)
if (english_p) { if (english_p) {
if (!in_url_p) { if (!in_url_p) {
pre_break_p = true; pre_break_p = true;
post_break_p = since_start == 0 || post_break_p = since_start == 0 ||
(next_type != G_UNICODE_LOWERCASE_LETTER && next_type != G_UNICODE_UPPERCASE_LETTER && next_type != G_UNICODE_DECIMAL_NUMBER); (next_type != G_UNICODE_LOWERCASE_LETTER && next_type != G_UNICODE_UPPERCASE_LETTER && next_type != G_UNICODE_DECIMAL_NUMBER);
} }
} else if (latin_p) { } else if (latin_p) {
@ -1386,23 +1386,23 @@ Tokenizer::quik_tokenize(const std::string& buf)
} else { } else {
post_break_p = pre_break_p = !in_url_p; post_break_p = pre_break_p = !in_url_p;
} }
if (escape_p) if (escape_p)
substitute_p = L"&apos;"; substitute_p = L"&apos;";
else else
curr_uch = gunichar(L'\''); curr_uch = gunichar(L'\'');
break; break;
case gunichar(L'|'): case gunichar(L'|'):
if (escape_p) if (escape_p)
substitute_p = L"&#124;"; substitute_p = L"&#124;";
post_break_p = pre_break_p = true; post_break_p = pre_break_p = true;
break; break;
case gunichar(L'<'): case gunichar(L'<'):
if (escape_p) if (escape_p)
substitute_p = L"&lt;"; substitute_p = L"&lt;";
post_break_p = pre_break_p = true; post_break_p = pre_break_p = true;
break; break;
case gunichar(L'>'): case gunichar(L'>'):
if (escape_p) if (escape_p)
substitute_p = L"&gt;"; substitute_p = L"&gt;";
post_break_p = pre_break_p = true; post_break_p = pre_break_p = true;
break; break;
@ -1414,7 +1414,7 @@ Tokenizer::quik_tokenize(const std::string& buf)
case gunichar(L'='): case gunichar(L'='):
case gunichar(L'~'): case gunichar(L'~'):
in_num_p = false; in_num_p = false;
post_break_p = pre_break_p = !in_url_p; post_break_p = pre_break_p = !in_url_p;
break; break;
case gunichar(L'+'): case gunichar(L'+'):
post_break_p = pre_break_p = !in_url_p; post_break_p = pre_break_p = !in_url_p;
@ -1444,12 +1444,12 @@ Tokenizer::quik_tokenize(const std::string& buf)
curr_uch = gunichar(L' '); curr_uch = gunichar(L' ');
} else if (curr_uch < gunichar(L' ')) { } else if (curr_uch < gunichar(L' ')) {
curr_uch = gunichar(L' '); curr_uch = gunichar(L' ');
} else if (curr_uch == gunichar(L'\u0092') && } else if (curr_uch == gunichar(L'\u0092') &&
(next_type == G_UNICODE_LOWERCASE_LETTER || next_type == G_UNICODE_UPPERCASE_LETTER)) { (next_type == G_UNICODE_LOWERCASE_LETTER || next_type == G_UNICODE_UPPERCASE_LETTER)) {
// observed corpus corruption case // observed corpus corruption case
if (english_p) { if (english_p) {
pre_break_p = true; pre_break_p = true;
post_break_p = since_start == 0 || post_break_p = since_start == 0 ||
(next_type != G_UNICODE_LOWERCASE_LETTER && next_type != G_UNICODE_UPPERCASE_LETTER && next_type != G_UNICODE_DECIMAL_NUMBER); (next_type != G_UNICODE_LOWERCASE_LETTER && next_type != G_UNICODE_UPPERCASE_LETTER && next_type != G_UNICODE_DECIMAL_NUMBER);
} else if (latin_p) { } else if (latin_p) {
post_break_p = true; post_break_p = true;
@ -1457,9 +1457,9 @@ Tokenizer::quik_tokenize(const std::string& buf)
} else { } else {
post_break_p = pre_break_p = true; post_break_p = pre_break_p = true;
} }
if (escape_p) if (escape_p)
substitute_p = L"&apos;"; substitute_p = L"&apos;";
else else
curr_uch = gunichar(L'\''); curr_uch = gunichar(L'\'');
} else { } else {
post_break_p = pre_break_p = true; post_break_p = pre_break_p = true;
@ -1491,7 +1491,7 @@ Tokenizer::quik_tokenize(const std::string& buf)
in_url_p = in_num_p = false; in_url_p = in_num_p = false;
break; break;
} }
if (pre_break_p || curr_uch == gunichar(L' ') || (bad_length && curr_type != G_UNICODE_UNASSIGNED)) { if (pre_break_p || curr_uch == gunichar(L' ') || (bad_length && curr_type != G_UNICODE_UNASSIGNED)) {
if (since_start) { if (since_start) {
// non-empty token emitted previously, so pre-break must emit token separator // non-empty token emitted previously, so pre-break must emit token separator
@ -1501,8 +1501,8 @@ Tokenizer::quik_tokenize(const std::string& buf)
if (curr_uch == gunichar(L' ')) if (curr_uch == gunichar(L' '))
// suppress emission below, fall-through to substitute logic // suppress emission below, fall-through to substitute logic
curr_uch = 0; curr_uch = 0;
} }
if (substitute_p) { if (substitute_p) {
for (gunichar *sptr = (gunichar *)substitute_p; *sptr; ++sptr) { for (gunichar *sptr = (gunichar *)substitute_p; *sptr; ++sptr) {
*uptr++ = *sptr; *uptr++ = *sptr;
@ -1521,7 +1521,7 @@ Tokenizer::quik_tokenize(const std::string& buf)
glong nbytes = 0; glong nbytes = 0;
gchar *utf8 = g_ucs4_to_utf8(ubuf,uptr-ubuf,0,&nbytes,0); // g_free gchar *utf8 = g_ucs4_to_utf8(ubuf,uptr-ubuf,0,&nbytes,0); // g_free
if (utf8[nbytes-1] == ' ') if (utf8[nbytes-1] == ' ')
--nbytes; --nbytes;
text.assign((const char *)utf8,(const char *)(utf8 + nbytes)); text.assign((const char *)utf8,(const char *)(utf8 + nbytes));
g_free(utf8); g_free(utf8);
@ -1552,7 +1552,7 @@ Tokenizer::quik_tokenize(const std::string& buf)
} }
std::size_t std::size_t
Tokenizer::tokenize(std::istream& is, std::ostream& os) Tokenizer::tokenize(std::istream& is, std::ostream& os)
{ {
std::size_t line_no = 0; std::size_t line_no = 0;
@ -1561,10 +1561,10 @@ Tokenizer::tokenize(std::istream& is, std::ostream& os)
std::vector< std::vector< std::string > > results(nthreads); std::vector< std::vector< std::string > > results(nthreads);
std::vector< boost::thread > workers(nthreads); std::vector< boost::thread > workers(nthreads);
bool done_p = !(is.good() && os.good()); bool done_p = !(is.good() && os.good());
for (std::size_t tranche = 0; !done_p; ++tranche) { for (std::size_t tranche = 0; !done_p; ++tranche) {
// for loop starting threads for chunks of input // for loop starting threads for chunks of input
for (std::size_t ithread = 0; ithread < nthreads; ++ithread) { for (std::size_t ithread = 0; ithread < nthreads; ++ithread) {
@ -1589,19 +1589,19 @@ Tokenizer::tokenize(std::istream& is, std::ostream& os)
results[ithread].resize(line_pos); results[ithread].resize(line_pos);
break; break;
} }
lines[ithread][line_pos].clear(); lines[ithread][line_pos].clear();
} else if (skip_xml_p && } else if (skip_xml_p &&
(RE2::FullMatch(istr,tag_line_x) || RE2::FullMatch(istr,white_line_x))) { (RE2::FullMatch(istr,tag_line_x) || RE2::FullMatch(istr,white_line_x))) {
lines[ithread][line_pos].clear(); lines[ithread][line_pos].clear();
} else { } else {
lines[ithread][line_pos] = lines[ithread][line_pos] =
std::string(SPC_BYTE).append(istr).append(SPC_BYTE); std::string(SPC_BYTE).append(istr).append(SPC_BYTE);
} }
} }
if (line_pos) { if (line_pos) {
workers[ithread] = workers[ithread] =
boost::thread(VectorTokenizerCallable(this,lines[ithread],results[ithread])); boost::thread(VectorTokenizerCallable(this,lines[ithread],results[ithread]));
} }
} // end for loop starting threads } // end for loop starting threads
@ -1616,22 +1616,22 @@ Tokenizer::tokenize(std::istream& is, std::ostream& os)
if (nlin != nres) { if (nlin != nres) {
std::ostringstream emsg; std::ostringstream emsg;
emsg << "Tranche " << tranche emsg << "Tranche " << tranche
<< " worker " << ithread << "/" << nthreads << " worker " << ithread << "/" << nthreads
<< " |lines|==" << nlin << " != |results|==" << nres; << " |lines|==" << nlin << " != |results|==" << nres;
throw std::runtime_error(emsg.str()); throw std::runtime_error(emsg.str());
} }
for (std::size_t ires = 0; ires < nres; ++ires) for (std::size_t ires = 0; ires < nres; ++ires)
os << results[ithread][ires] << std::endl; os << results[ithread][ires] << std::endl;
} // end loop over joined results } // end loop over joined results
if (verbose_p) { if (verbose_p) {
std::cerr << line_no << ' '; std::cerr << line_no << ' ';
std::cerr.flush(); std::cerr.flush();
} }
} // end loop over chunks } // end loop over chunks
return line_no; return line_no;
@ -1642,18 +1642,18 @@ std::string
Tokenizer::detokenize(const std::string& buf) Tokenizer::detokenize(const std::string& buf)
{ {
std::vector<std::string> words = split(trim(buf)); std::vector<std::string> words = split(trim(buf));
std::size_t squotes = 0; std::size_t squotes = 0;
std::size_t dquotes = 0; std::size_t dquotes = 0;
std::string prepends(""); std::string prepends("");
std::ostringstream oss; std::ostringstream oss;
std::size_t nwords = words.size(); std::size_t nwords = words.size();
std::size_t iword = 0; std::size_t iword = 0;
if (unescape_p) if (unescape_p)
for (auto &word: words) for (auto &word: words)
unescape(word); unescape(word);
for (auto &word: words) { for (auto &word: words) {
@ -1665,13 +1665,13 @@ Tokenizer::detokenize(const std::string& buf)
} else if (RE2::FullMatch(word,left_x)) { } else if (RE2::FullMatch(word,left_x)) {
oss << word; oss << word;
prepends = SPC_BYTE; prepends = SPC_BYTE;
} else if (english_p && iword } else if (english_p && iword
&& RE2::FullMatch(word,curr_en_x) && RE2::FullMatch(word,curr_en_x)
&& RE2::FullMatch(words[iword-1],pre_en_x)) { && RE2::FullMatch(words[iword-1],pre_en_x)) {
oss << word; oss << word;
prepends = SPC_BYTE; prepends = SPC_BYTE;
} else if (latin_p && iword < nwords - 2 } else if (latin_p && iword < nwords - 2
&& RE2::FullMatch(word,curr_fr_x) && RE2::FullMatch(word,curr_fr_x)
&& RE2::FullMatch(words[iword+1],post_fr_x)) { && RE2::FullMatch(words[iword+1],post_fr_x)) {
oss << prepends << word; oss << prepends << word;
prepends.clear(); prepends.clear();
@ -1679,7 +1679,7 @@ Tokenizer::detokenize(const std::string& buf)
if ((word.at(0) == '\'' && ((squotes % 2) == 0 )) || if ((word.at(0) == '\'' && ((squotes % 2) == 0 )) ||
(word.at(0) == '"' && ((dquotes % 2) == 0))) { (word.at(0) == '"' && ((dquotes % 2) == 0))) {
if (english_p && iword if (english_p && iword
&& word.at(0) == '\'' && word.at(0) == '\''
&& std::tolower(words[iword-1].at(words[iword-1].size()-1)) == 's') { && std::tolower(words[iword-1].at(words[iword-1].size()-1)) == 's') {
oss << word; oss << word;
prepends = SPC_BYTE; prepends = SPC_BYTE;
@ -1698,7 +1698,7 @@ Tokenizer::detokenize(const std::string& buf)
prepends = SPC_BYTE; prepends = SPC_BYTE;
if (word.at(0) == '\'') if (word.at(0) == '\'')
squotes++; squotes++;
else if (word.at(0) == '"') else if (word.at(0) == '"')
dquotes++; dquotes++;
} }
} else { } else {
@ -1707,8 +1707,8 @@ Tokenizer::detokenize(const std::string& buf)
} }
iword++; iword++;
} }
std::string text(oss.str()); std::string text(oss.str());
RE2::GlobalReplace(&text," +",SPC_BYTE); RE2::GlobalReplace(&text," +",SPC_BYTE);
RE2::GlobalReplace(&text,"\n ","\n"); RE2::GlobalReplace(&text,"\n ","\n");
@ -1718,14 +1718,14 @@ Tokenizer::detokenize(const std::string& buf)
std::size_t std::size_t
Tokenizer::detokenize(std::istream& is, std::ostream& os) Tokenizer::detokenize(std::istream& is, std::ostream& os)
{ {
size_t line_no = 0; size_t line_no = 0;
while (is.good() && os.good()) { while (is.good() && os.good()) {
std::string istr; std::string istr;
std::getline(is,istr); std::getline(is,istr);
line_no ++; line_no ++;
if (istr.empty()) if (istr.empty())
continue; continue;
if (skip_xml_p && (RE2::FullMatch(istr,tag_line_x) || RE2::FullMatch(istr,white_line_x))) { if (skip_xml_p && (RE2::FullMatch(istr,tag_line_x) || RE2::FullMatch(istr,white_line_x))) {
os << istr << std::endl; os << istr << std::endl;
@ -1749,7 +1749,7 @@ Tokenizer::splitter(const std::string &istr, bool *continuation_ptr) {
return parts; return parts;
} }
gunichar *uout = (gunichar *)g_malloc0(2*ncp*sizeof(gunichar)); gunichar *uout = (gunichar *)g_malloc0(2*ncp*sizeof(gunichar));
const wchar_t GENL_HYPH = L'\u2010'; const wchar_t GENL_HYPH = L'\u2010';
const wchar_t IDEO_STOP = L'\u3002'; const wchar_t IDEO_STOP = L'\u3002';
const wchar_t KANA_MDOT = L'\u30FB'; const wchar_t KANA_MDOT = L'\u30FB';
@ -1786,7 +1786,7 @@ Tokenizer::splitter(const std::string &istr, bool *continuation_ptr) {
std::vector<std::size_t> breaks; std::vector<std::size_t> breaks;
std::set<std::size_t> suppress; std::set<std::size_t> suppress;
for (; icp <= ncp; ++icp) { for (; icp <= ncp; ++icp) {
currwc = wchar_t(ucs4[icp]); currwc = wchar_t(ucs4[icp]);
curr_type = g_unichar_type(currwc); curr_type = g_unichar_type(currwc);
@ -1798,7 +1798,7 @@ Tokenizer::splitter(const std::string &istr, bool *continuation_ptr) {
case G_UNICODE_OTHER_NUMBER: case G_UNICODE_OTHER_NUMBER:
curr_class = numba; curr_class = numba;
curr_word_p = true; curr_word_p = true;
break; break;
case G_UNICODE_LOWERCASE_LETTER: case G_UNICODE_LOWERCASE_LETTER:
case G_UNICODE_MODIFIER_LETTER: case G_UNICODE_MODIFIER_LETTER:
case G_UNICODE_OTHER_LETTER: case G_UNICODE_OTHER_LETTER:
@ -1822,7 +1822,7 @@ Tokenizer::splitter(const std::string &istr, bool *continuation_ptr) {
} else if (currwc >= SMAL_HYPH) { } else if (currwc >= SMAL_HYPH) {
curr_word_p = true; curr_word_p = true;
} else { } else {
curr_word_p = (currwc >= WAVE_DASH) && (currwc <= KANA_DHYP); curr_word_p = (currwc >= WAVE_DASH) && (currwc <= KANA_DHYP);
} }
break; break;
case G_UNICODE_CLOSE_PUNCTUATION: case G_UNICODE_CLOSE_PUNCTUATION:
@ -1860,7 +1860,7 @@ Tokenizer::splitter(const std::string &istr, bool *continuation_ptr) {
curr_word_p = false; curr_word_p = false;
break; break;
} }
// # condition for prefix test // # condition for prefix test
// $words[$i] =~ /([\p{IsAlnum}\.\-]*)([\'\"\)\]\%\p{IsPf}]*)(\.+)$/ // $words[$i] =~ /([\p{IsAlnum}\.\-]*)([\'\"\)\]\%\p{IsPf}]*)(\.+)$/
// $words[$i+1] =~ /^([ ]*[\'\"\(\[\¿\¡\p{IsPi}]*[ ]*[\p{IsUpper}0-9])/ // $words[$i+1] =~ /^([ ]*[\'\"\(\[\¿\¡\p{IsPi}]*[ ]*[\p{IsUpper}0-9])/
@ -1875,7 +1875,7 @@ Tokenizer::splitter(const std::string &istr, bool *continuation_ptr) {
} else if (curr_word_p) { } else if (curr_word_p) {
if (!fini_word) { if (!fini_word) {
init_word = ocp; init_word = ocp;
} }
fini_word = ocp+1; fini_word = ocp+1;
dotslen = finilen = 0; dotslen = finilen = 0;
} else if (curr_class >= quote && curr_class <= pfpct && curr_class != pinit) { } else if (curr_class >= quote && curr_class <= pfpct && curr_class != pinit) {
@ -1893,7 +1893,7 @@ Tokenizer::splitter(const std::string &istr, bool *continuation_ptr) {
} else { } else {
init_word = fini_word = 0; init_word = fini_word = 0;
} }
if (check_abbr_p) { if (check_abbr_p) {
// not a valid word character or post-word punctuation character: check word // not a valid word character or post-word punctuation character: check word
std::wstring k((wchar_t *)uout+init_word,fini_word-init_word); std::wstring k((wchar_t *)uout+init_word,fini_word-init_word);
@ -1986,7 +1986,7 @@ Tokenizer::splitter(const std::string &istr, bool *continuation_ptr) {
} }
init_word = fini_word = 0; init_word = fini_word = 0;
} }
if (seqpos >= SEQ_LIM) { if (seqpos >= SEQ_LIM) {
seqpos = 0; seqpos = 0;
} }
@ -2015,7 +2015,7 @@ Tokenizer::splitter(const std::string &istr, bool *continuation_ptr) {
continue; continue;
} }
} }
if (!seqpos) { if (!seqpos) {
if (curr_class != blank) { if (curr_class != blank) {
uout[ocp++] = gunichar(currwc); uout[ocp++] = gunichar(currwc);
@ -2024,7 +2024,7 @@ Tokenizer::splitter(const std::string &istr, bool *continuation_ptr) {
} }
continue; continue;
} }
if (curr_class == blank) { if (curr_class == blank) {
if (prev_class != blank) { if (prev_class != blank) {
seq[seqpos] = blank; seq[seqpos] = blank;
@ -2034,7 +2034,7 @@ Tokenizer::splitter(const std::string &istr, bool *continuation_ptr) {
} }
if (icp < ncp) if (icp < ncp)
continue; continue;
} }
if (curr_class >= quote && curr_class <= pfini) { if (curr_class >= quote && curr_class <= pfini) {
if (prev_class < quote || prev_class > pfini) { if (prev_class < quote || prev_class > pfini) {
@ -2158,8 +2158,8 @@ Tokenizer::splitter(const std::string &istr, bool *continuation_ptr) {
if (uout[chkpos] == L'\n' || uout[chkpos] == L' ') { if (uout[chkpos] == L'\n' || uout[chkpos] == L' ') {
endpos = chkpos; endpos = chkpos;
continue; continue;
} }
if (g_unichar_isgraph(uout[chkpos])) if (g_unichar_isgraph(uout[chkpos]))
break; break;
endpos = chkpos; endpos = chkpos;
} }
@ -2171,17 +2171,17 @@ Tokenizer::splitter(const std::string &istr, bool *continuation_ptr) {
if (continuation_ptr) if (continuation_ptr)
*continuation_ptr = endpos > iop; *continuation_ptr = endpos > iop;
iop = nextpos; iop = nextpos;
} }
g_free(uout); g_free(uout);
g_free(ucs4); g_free(ucs4);
return parts; return parts;
} }
std::pair<std::size_t,std::size_t> std::pair<std::size_t,std::size_t>
Tokenizer::splitter(std::istream& is, std::ostream& os) Tokenizer::splitter(std::istream& is, std::ostream& os)
{ {
std::pair<std::size_t,std::size_t> counts = { 0, 0 }; std::pair<std::size_t,std::size_t> counts = { 0, 0 };
bool continuation_p = false; bool continuation_p = false;
@ -2197,7 +2197,7 @@ Tokenizer::splitter(std::istream& is, std::ostream& os)
if (istr.empty() && (is.eof() ||!para_marks_p)) if (istr.empty() && (is.eof() ||!para_marks_p))
continue; continue;
if (skip_xml_p && (RE2::FullMatch(istr,tag_line_x) || RE2::FullMatch(istr,white_line_x))) if (skip_xml_p && (RE2::FullMatch(istr,tag_line_x) || RE2::FullMatch(istr,white_line_x)))
continue; continue;
std::vector<std::string> sentences(splitter(istr,&continuation_p)); std::vector<std::string> sentences(splitter(istr,&continuation_p));
@ -2221,13 +2221,13 @@ Tokenizer::splitter(std::istream& is, std::ostream& os)
os << " "; os << " ";
pending_gap = false; pending_gap = false;
} }
for (std::size_t ii = 0; ii < nsents-1; ++ii) for (std::size_t ii = 0; ii < nsents-1; ++ii)
os << sentences[ii] << std::endl; os << sentences[ii] << std::endl;
os << sentences[nsents-1]; os << sentences[nsents-1];
if (continuation_p) if (continuation_p)
pending_gap = !split_breaks_p; pending_gap = !split_breaks_p;
if (!pending_gap) if (!pending_gap)
os << std::endl; os << std::endl;

View File

@ -26,7 +26,7 @@ class Tokenizer {
private: private:
typedef enum { typedef enum {
empty = 0, empty = 0,
blank, blank,
upper, // upper case upper, // upper case
@ -56,7 +56,7 @@ private:
// non-breaking prefixes (other) ucs4 // non-breaking prefixes (other) ucs4
std::set<std::wstring> nbpre_gen_ucs4; std::set<std::wstring> nbpre_gen_ucs4;
// compiled protected patterns // compiled protected patterns
std::vector<re2::RE2 *> prot_pat_vec; std::vector<re2::RE2 *> prot_pat_vec;
protected: protected:
@ -96,10 +96,10 @@ protected:
Tokenizer *tokenizer; Tokenizer *tokenizer;
std::vector<std::string>& in; std::vector<std::string>& in;
std::vector<std::string>& out; std::vector<std::string>& out;
VectorTokenizerCallable(Tokenizer *_tokenizer, VectorTokenizerCallable(Tokenizer *_tokenizer,
std::vector<std::string>& _in, std::vector<std::string>& _in,
std::vector<std::string>& _out) std::vector<std::string>& _out)
: tokenizer(_tokenizer) : tokenizer(_tokenizer)
, in(_in) , in(_in)
, out(_out) { , out(_out) {
@ -107,10 +107,10 @@ protected:
void operator()() { void operator()() {
out.resize(in.size()); out.resize(in.size());
for (std::size_t ii = 0; ii < in.size(); ++ii) for (std::size_t ii = 0; ii < in.size(); ++ii)
if (in[ii].empty()) if (in[ii].empty())
out[ii] = in[ii]; out[ii] = in[ii];
else if (tokenizer->penn_p) else if (tokenizer->penn_p)
out[ii] = tokenizer->penn_tokenize(in[ii]); out[ii] = tokenizer->penn_tokenize(in[ii]);
else else
out[ii] = tokenizer->quik_tokenize(in[ii]); out[ii] = tokenizer->quik_tokenize(in[ii]);

View File

@ -10,8 +10,8 @@ using namespace TOKENIZER_NAMESPACE ;
#endif #endif
void void
usage(const char *path) usage(const char *path)
{ {
std::cerr << "Usage: " << path << "[-{v|x|p|a|e|s|u|n|N]* [LL] [-{c|o} PATH]* INFILE*" << std::endl; std::cerr << "Usage: " << path << "[-{v|x|p|a|e|s|u|n|N]* [LL] [-{c|o} PATH]* INFILE*" << std::endl;
std::cerr << " -a -- aggressive hyphenization" << std::endl; std::cerr << " -a -- aggressive hyphenization" << std::endl;
@ -89,7 +89,7 @@ copy_words(Tokenizer& tize, std::istream& ifs, std::ostream& ofs) {
int nlines = 0; int nlines = 0;
std::string line; std::string line;
while (ifs.good() && std::getline(ifs,line)) { while (ifs.good() && std::getline(ifs,line)) {
if (line.empty()) if (line.empty())
continue; continue;
std::vector<std::string> tokens(tize.tokens(line)); std::vector<std::string> tokens(tize.tokens(line));
int count = 0; int count = 0;
@ -127,7 +127,7 @@ copy_words(Tokenizer& tize, std::istream& ifs, std::ostream& ofs) {
} }
int main(int ac, char **av) int main(int ac, char **av)
{ {
int rc = 0; int rc = 0;
Parameters params; Parameters params;
@ -140,7 +140,7 @@ int main(int ac, char **av)
if (!detokenize_p) if (!detokenize_p)
params.split_p = std::strstr(av[0],"splitter") != 0; params.split_p = std::strstr(av[0],"splitter") != 0;
while (++av,--ac) { while (++av,--ac) {
if (**av == '-') { if (**av == '-') {
switch (av[0][1]) { switch (av[0][1]) {
case 'a': case 'a':
@ -244,7 +244,7 @@ int main(int ac, char **av)
if (comma) { if (comma) {
*comma++ = 0; *comma++ = 0;
params.chunksize = std::strtoul(comma,0,0); params.chunksize = std::strtoul(comma,0,0);
} }
params.nthreads = std::strtoul(*av,0,0); params.nthreads = std::strtoul(*av,0,0);
} else { } else {
params.args.push_back(std::string(*av)); params.args.push_back(std::string(*av));
@ -275,7 +275,7 @@ int main(int ac, char **av)
cfg_mos_str.append("/moses"); cfg_mos_str.append("/moses");
if (!::access(cfg_mos_str.c_str(),X_OK)) { if (!::access(cfg_mos_str.c_str(),X_OK)) {
params.cfg_path = strdup(cfg_mos_str.c_str()); params.cfg_path = strdup(cfg_mos_str.c_str());
} else if (!::access(cfg_shr_str.c_str(),X_OK)) { } else if (!::access(cfg_shr_str.c_str(),X_OK)) {
params.cfg_path = strdup(cfg_shr_str.c_str()); params.cfg_path = strdup(cfg_shr_str.c_str());
} else if (!::access(cfg_dir_str.c_str(),X_OK)) { } else if (!::access(cfg_dir_str.c_str(),X_OK)) {
params.cfg_path = strdup(cfg_dir_str.c_str()); params.cfg_path = strdup(cfg_dir_str.c_str());
@ -287,7 +287,7 @@ int main(int ac, char **av)
if (params.verbose_p) { if (params.verbose_p) {
std::cerr << "config path: " << params.cfg_path << std::endl; std::cerr << "config path: " << params.cfg_path << std::endl;
} }
} }
std::unique_ptr<std::ofstream> pofs = 0; std::unique_ptr<std::ofstream> pofs = 0;
if (!params.out_path.empty()) { if (!params.out_path.empty()) {
@ -345,7 +345,7 @@ int main(int ac, char **av)
if (plines.second) { if (plines.second) {
std::cerr << "%%% " << plines.second << " sentences." << std::endl; std::cerr << "%%% " << plines.second << " sentences." << std::endl;
} }
} }
return rc; return rc;
} }

View File

@ -1,236 +1,236 @@
/** /**
* ISS (Indexed Strings Storage) - memory efficient storage for permanent strings. * ISS (Indexed Strings Storage) - memory efficient storage for permanent strings.
* *
* Implementation note: use #define USE_HASHSET to switch between implementation * Implementation note: use #define USE_HASHSET to switch between implementation
* using __gnu_cxx::hash_set and implementation using std::set. * using __gnu_cxx::hash_set and implementation using std::set.
* *
* (C) Ceslav Przywara, UFAL MFF UK, 2011 * (C) Ceslav Przywara, UFAL MFF UK, 2011
* *
* $Id$ * $Id$
*/ */
#ifndef _ISS_H #ifndef _ISS_H
#define _ISS_H #define _ISS_H
#include <limits> #include <limits>
#include <vector> #include <vector>
#include <string.h> #include <string.h>
// Use hashset instead of std::set for string-to-number indexing? // Use hashset instead of std::set for string-to-number indexing?
#ifdef USE_HASHSET #ifdef USE_HASHSET
#include <ext/hash_set> #include <ext/hash_set>
#else #else
#include <set> #include <set>
#endif #endif
#include <boost/pool/pool.hpp> #include <boost/pool/pool.hpp>
#ifdef USE_HASHSET #ifdef USE_HASHSET
// Forward declaration of comparator functor. // Forward declaration of comparator functor.
template<class IndType> template<class IndType>
class StringsEqualComparator; class StringsEqualComparator;
template<class IndType> template<class IndType>
class Hasher; class Hasher;
#else #else
// Forward declaration of comparator functor. // Forward declaration of comparator functor.
template<class IndType> template<class IndType>
class StringsLessComparator; class StringsLessComparator;
#endif #endif
/** /**
*/ */
template<class IndType> template<class IndType>
class IndexedStringsStorage { class IndexedStringsStorage {
public: public:
typedef IndType index_type; typedef IndType index_type;
#ifdef USE_HASHSET #ifdef USE_HASHSET
typedef StringsEqualComparator<IndType> equality_comparator_t; typedef StringsEqualComparator<IndType> equality_comparator_t;
typedef Hasher<IndType> hasher_t; typedef Hasher<IndType> hasher_t;
/** @typedef Hash set used as lookup table (string -> numeric index). */ /** @typedef Hash set used as lookup table (string -> numeric index). */
typedef __gnu_cxx::hash_set<IndType, hasher_t, equality_comparator_t> index_t; typedef __gnu_cxx::hash_set<IndType, hasher_t, equality_comparator_t> index_t;
#else #else
typedef StringsLessComparator<IndType> less_comparator_t; typedef StringsLessComparator<IndType> less_comparator_t;
/** @typedef Set used as lookup table (string -> numeric index). */ /** @typedef Set used as lookup table (string -> numeric index). */
typedef std::set<IndType, less_comparator_t> index_t; typedef std::set<IndType, less_comparator_t> index_t;
#endif #endif
/** @typedef Container of pointers to stored C-strings. Acts as /** @typedef Container of pointers to stored C-strings. Acts as
* conversion table: numeric index -> string. * conversion table: numeric index -> string.
*/ */
typedef std::vector<const char*> table_t; typedef std::vector<const char*> table_t;
private: private:
/** @var memory pool used to store C-strings */ /** @var memory pool used to store C-strings */
boost::pool<> _storage; boost::pool<> _storage;
/** @var index-to-string conversion table */ /** @var index-to-string conversion table */
table_t _table; table_t _table;
/** @var index lookup table */ /** @var index lookup table */
index_t _index; index_t _index;
public: public:
/** Default constructor. /** Default constructor.
*/ */
IndexedStringsStorage(void); IndexedStringsStorage(void);
/** @return True, if the indices are exhausted (new strings cannot be stored). /** @return True, if the indices are exhausted (new strings cannot be stored).
*/ */
inline bool is_full(void) const { return _table.size() == std::numeric_limits<IndType>::max(); } inline bool is_full(void) const { return _table.size() == std::numeric_limits<IndType>::max(); }
/** Retrieves pointer to C-string instance represented by given index. /** Retrieves pointer to C-string instance represented by given index.
* Note: No range checks are performed! * Note: No range checks are performed!
* @param index Index of C-string to retrieve. * @param index Index of C-string to retrieve.
* @return Pointer to stored C-string instance. * @return Pointer to stored C-string instance.
*/ */
inline const char* get(IndType index) const { return _table[index]; } inline const char* get(IndType index) const { return _table[index]; }
/** Stores the string and returns its numeric index. /** Stores the string and returns its numeric index.
* @param str Pointer to C-string to store. * @param str Pointer to C-string to store.
* @return Index of stored copy of str. * @return Index of stored copy of str.
* @throw std::bad_alloc When insertion of new string would cause * @throw std::bad_alloc When insertion of new string would cause
* overflow of indices datatype. * overflow of indices datatype.
*/ */
IndType put(const char* str); IndType put(const char* str);
/** @return Number of unique strings stored so far. /** @return Number of unique strings stored so far.
*/ */
inline table_t::size_type size(void) const { return _table.size(); } inline table_t::size_type size(void) const { return _table.size(); }
}; };
/** Functor designed for less than comparison of C-strings stored within StringStore. /** Functor designed for less than comparison of C-strings stored within StringStore.
* @param IndType Type of numerical indices of strings within given StringStore. * @param IndType Type of numerical indices of strings within given StringStore.
*/ */
#ifdef USE_HASHSET #ifdef USE_HASHSET
template<class IndType> template<class IndType>
class StringsEqualComparator: public std::binary_function<IndType, IndType, bool> { class StringsEqualComparator: public std::binary_function<IndType, IndType, bool> {
#else #else
template<class IndType> template<class IndType>
class StringsLessComparator: public std::binary_function<IndType, IndType, bool> { class StringsLessComparator: public std::binary_function<IndType, IndType, bool> {
#endif #endif
/** @var conversion table: index -> string (necessary for indices comparison) */ /** @var conversion table: index -> string (necessary for indices comparison) */
const typename IndexedStringsStorage<IndType>::table_t& _table; const typename IndexedStringsStorage<IndType>::table_t& _table;
public: public:
#ifdef USE_HASHSET #ifdef USE_HASHSET
StringsEqualComparator<IndType>(const typename IndexedStringsStorage<IndType>::table_t& table): _table(table) {} StringsEqualComparator<IndType>(const typename IndexedStringsStorage<IndType>::table_t& table): _table(table) {}
#else #else
StringsLessComparator<IndType>(const typename IndexedStringsStorage<IndType>::table_t& table): _table(table) {} StringsLessComparator<IndType>(const typename IndexedStringsStorage<IndType>::table_t& table): _table(table) {}
#endif #endif
/** Comparison of two pointers to C-strings. /** Comparison of two pointers to C-strings.
* @param lhs Pointer to 1st C-string. * @param lhs Pointer to 1st C-string.
* @param rhs Pointer to 2nd C-string. * @param rhs Pointer to 2nd C-string.
* @return True, if 1st argument is equal/less than 2nd argument. * @return True, if 1st argument is equal/less than 2nd argument.
*/ */
inline bool operator()(IndType lhs, IndType rhs) const { inline bool operator()(IndType lhs, IndType rhs) const {
#ifdef USE_HASHSET #ifdef USE_HASHSET
return strcmp(_table[lhs], _table[rhs]) == 0; return strcmp(_table[lhs], _table[rhs]) == 0;
#else #else
return strcmp(_table[lhs], _table[rhs]) < 0; return strcmp(_table[lhs], _table[rhs]) < 0;
#endif #endif
} }
}; };
#ifdef USE_HASHSET
/** Functor hashing a string held by an IndexedStringsStorage through its
 * numeric index: the index is resolved to the stored C-string, which is then
 * hashed with __gnu_cxx::hash<const char*>.
 *
 * The functor does not derive from std::unary_function (removed from the
 * standard library in C++17); the member typedefs that base class used to
 * provide are declared directly instead.
 *
 * @param IndType Type of numerical indices of strings within the storage.
 */
template<class IndType>
class Hasher {
  /** Hash function applied to the resolved C-string. */
  __gnu_cxx::hash<const char*> _hash;

  /** Conversion table: index -> string (necessary to hash an index by the
   * string it refers to). Held by reference, so the table has to outlive
   * this functor.
   */
  const typename IndexedStringsStorage<IndType>::table_t& _table;

public:
  // Typedefs formerly inherited from std::unary_function<IndType, size_t>.
  typedef IndType argument_type;
  typedef size_t result_type;

  /** @param table Conversion table used to resolve indices to strings. */
  Hasher(const typename IndexedStringsStorage<IndType>::table_t& table): _hash(), _table(table) {}

  /** Hashing function.
   * Note: no range check is performed on the index.
   * @param index Index of the string to hash.
   * @return Hash of the string the index refers to.
   */
  size_t operator()(const IndType index) const {
    return _hash(_table[index]);
  }
};
#endif
/** Default constructor.
 * Creates the string pool with a requested chunk size of sizeof(char), an
 * empty conversion table, and an empty lookup set whose functors refer to
 * that table. In the USE_HASHSET build, 100 is the initial size argument
 * passed to the hash set.
 */
template <class IndType>
IndexedStringsStorage<IndType>::IndexedStringsStorage(void)
  : _storage(sizeof(char))
  , _table()
#ifdef USE_HASHSET
  , _index(100, hasher_t(_table), equality_comparator_t(_table))
#else
  , _index(less_comparator_t(_table))
#endif
{
}
template <class IndType> template <class IndType>
IndType IndexedStringsStorage<IndType>::put(const char* str) { IndType IndexedStringsStorage<IndType>::put(const char* str) {
if ( this->is_full() ) { if ( this->is_full() ) {
// What a pity, not a single index left to spend. // What a pity, not a single index left to spend.
throw std::bad_alloc(); throw std::bad_alloc();
} }
// To use the index for lookup we first have to store passed string // To use the index for lookup we first have to store passed string
// in conversion table (cause during lookup we compare the strings indirectly // in conversion table (cause during lookup we compare the strings indirectly
// by using their indices). // by using their indices).
// Note: thread unsafe! TODO: Redesing. // Note: thread unsafe! TODO: Redesing.
IndType index = static_cast<IndType>(_table.size()); IndType index = static_cast<IndType>(_table.size());
_table.push_back(str); _table.push_back(str);
#ifdef USE_HASHSET #ifdef USE_HASHSET
// //
typename index_t::iterator iIndex = _index.find(index); typename index_t::iterator iIndex = _index.find(index);
#else #else
// A lower_bound() search enables us to use "found" iterator as a hint for // A lower_bound() search enables us to use "found" iterator as a hint for
// eventual insertion. // eventual insertion.
typename index_t::iterator iIndex = _index.lower_bound(index); typename index_t::iterator iIndex = _index.lower_bound(index);
#endif #endif
if ( (iIndex != _index.end()) if ( (iIndex != _index.end())
#ifndef USE_HASHSET #ifndef USE_HASHSET
// In case of lower_bound() search we have to also compare found item // In case of lower_bound() search we have to also compare found item
// with passed string. // with passed string.
&& (strcmp(_table[*iIndex], str) == 0) && (strcmp(_table[*iIndex], str) == 0)
#endif #endif
) { ) {
// String is already present in storage! // String is already present in storage!
// Pop back temporary stored pointer... // Pop back temporary stored pointer...
_table.pop_back(); _table.pop_back();
// ...and return numeric index to already stored copy of `str`. // ...and return numeric index to already stored copy of `str`.
return static_cast<IndType>(*iIndex); return static_cast<IndType>(*iIndex);
} }
// String not found within storage. // String not found within storage.
// Allocate memory required for string storage... // Allocate memory required for string storage...
char* mem = static_cast<char*>(_storage.ordered_malloc(strlen(str) + 1)); char* mem = static_cast<char*>(_storage.ordered_malloc(strlen(str) + 1));
// ...and fill it with copy of passed string. // ...and fill it with copy of passed string.
strcpy(mem, str); strcpy(mem, str);
// Overwrite temporary stored pointer to `str` with pointer to freshly // Overwrite temporary stored pointer to `str` with pointer to freshly
// saved copy. // saved copy.
_table[index] = mem; _table[index] = mem;
#ifdef USE_HASHSET #ifdef USE_HASHSET
// Insert the index into lookup table. // Insert the index into lookup table.
_index.insert(index); _index.insert(index);
#else #else
// Insert the index into lookup table (use previously retrieved iterator // Insert the index into lookup table (use previously retrieved iterator
// as a hint). // as a hint).
_index.insert(iIndex, index); _index.insert(iIndex, index);
#endif #endif
// Finally. // Finally.
return index; return index;
} }
#endif #endif

View File

@ -83,7 +83,7 @@ public:
const counter_t bucketWidth; // ceil(1/error) const counter_t bucketWidth; // ceil(1/error)
private: private:
/** @var Current epoch bucket ID (b-current) */ /** @var Current epoch bucket ID (b-current) */
counter_t _bucketId; counter_t _bucketId;
@ -182,7 +182,7 @@ class LossyCounterIterator: public std::iterator<std::forward_iterator_tag, type
public: public:
typedef LossyCounterIterator<T> self_type; typedef LossyCounterIterator<T> self_type;
typedef typename LossyCounter<T>::storage_t::const_iterator const_iterator; typedef typename LossyCounter<T>::storage_t::const_iterator const_iterator;
protected: protected:
@ -288,7 +288,7 @@ protected:
template<class T> template<class T>
void LossyCounter<T>::add(const T& item) { void LossyCounter<T>::add(const T& item) {
typename storage_t::iterator iter = _storage.find(item); typename storage_t::iterator iter = _storage.find(item);
if ( iter == _storage.end() ) { if ( iter == _storage.end() ) {
@ -330,7 +330,7 @@ void LossyCounter<T>::prune(void) {
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
/** Prefix increment: advances the iterator via forward() and returns a copy
 * of the advanced iterator.
 */
template<class T>
LossyCounterIterator<T> LossyCounterIterator<T>::operator++(void)
{
  LossyCounterIterator<T>& self = *this;
  self.forward();
  return self;
}

View File

@ -92,7 +92,7 @@ int main(int argc, char* argv[]) {
// Init lossy counters. // Init lossy counters.
std::string lossyCountersParams; std::string lossyCountersParams;
int paramIdx = 5; int paramIdx = 5;
while ( (argc > paramIdx) && (*argv[paramIdx] != '-') ) { while ( (argc > paramIdx) && (*argv[paramIdx] != '-') ) {
std::string param = std::string(argv[paramIdx]); std::string param = std::string(argv[paramIdx]);
if ( !parse_lossy_counting_params(param) ) { if ( !parse_lossy_counting_params(param) ) {
@ -113,7 +113,7 @@ int main(int argc, char* argv[]) {
usage(argv[0]); usage(argv[0]);
} }
} }
if ( (argc > paramIdx) && (strcmp(argv[paramIdx], "--compact") == 0) ) { if ( (argc > paramIdx) && (strcmp(argv[paramIdx], "--compact") == 0) ) {
compactOutputFlag = true; compactOutputFlag = true;
++paramIdx; ++paramIdx;
@ -154,7 +154,7 @@ int main(int argc, char* argv[]) {
readInput(eFile, fFile, aFile); readInput(eFile, fFile, aFile);
std::cerr << std::endl; // Leave the progress bar end on previous line. std::cerr << std::endl; // Leave the progress bar end on previous line.
// close input files // close input files
eFile.close(); eFile.close();
fFile.close(); fFile.close();

View File

@ -32,14 +32,14 @@ typedef std::vector<output_pair_t> output_vector_t;
class PhraseComp { class PhraseComp {
/** @var If true, sort by target phrase first. */ /** @var If true, sort by target phrase first. */
bool _inverted; bool _inverted;
bool compareAlignments(const indexed_phrases_pair_t& a, const indexed_phrases_pair_t& b); bool compareAlignments(const indexed_phrases_pair_t& a, const indexed_phrases_pair_t& b);
int comparePhrases(const indexed_phrases_pair_t::phrase_t& a, const indexed_phrases_pair_t::phrase_t& b); int comparePhrases(const indexed_phrases_pair_t::phrase_t& a, const indexed_phrases_pair_t::phrase_t& b);
public: public:
PhraseComp(bool inverted): _inverted(inverted) {} PhraseComp(bool inverted): _inverted(inverted) {}
bool operator()(const output_pair_t& a, const output_pair_t& b); bool operator()(const output_pair_t& a, const output_pair_t& b);
}; };
@ -448,9 +448,9 @@ void extract(SentenceAlignment &sentence) {
((phraseModel)? getOrientString(phrasePrevOrient, phraseType) + " " + getOrientString(phraseNextOrient, phraseType) : "") + " | " + ((phraseModel)? getOrientString(phrasePrevOrient, phraseType) + " " + getOrientString(phraseNextOrient, phraseType) : "") + " | " +
((hierModel)? getOrientString(hierPrevOrient, hierType) + " " + getOrientString(hierNextOrient, hierType) : ""); ((hierModel)? getOrientString(hierPrevOrient, hierType) + " " + getOrientString(hierNextOrient, hierType) : "");
} }
addPhrase(sentence, startE, endE, startF, endF, orientationInfo); addPhrase(sentence, startE, endE, startF, endF, orientationInfo);
} // end of for loop through inbound phrases } // end of for loop through inbound phrases
} // end if buildExtraStructure } // end if buildExtraStructure
@ -567,7 +567,7 @@ bool PhraseComp::operator()(const output_pair_t& a, const output_pair_t& b) {
else { else {
return cmp < 0; return cmp < 0;
} }
} }
@ -607,7 +607,7 @@ bool PhraseComp::compareAlignments(const indexed_phrases_pair_t& a, const indexe
return cmp < 0; return cmp < 0;
} }
} }
// Note: LC_ALL=C GNU sort treats shorter item as lesser than longer one. // Note: LC_ALL=C GNU sort treats shorter item as lesser than longer one.
return (cmp == 0) ? (aSize < bSize) : (cmp < 0); return (cmp == 0) ? (aSize < bSize) : (cmp < 0);
@ -685,7 +685,7 @@ void processSortedOutput(OutputProcessor& processor) {
void processUnsortedOutput(OutputProcessor& processor) { void processUnsortedOutput(OutputProcessor& processor) {
LossyCountersVector::value_type current = NULL, prev = NULL; LossyCountersVector::value_type current = NULL, prev = NULL;
for ( size_t i = 1; i < lossyCounters.size(); ++i ) { // Intentionally skip 0. for ( size_t i = 1; i < lossyCounters.size(); ++i ) { // Intentionally skip 0.
@ -759,7 +759,7 @@ void printStats(void) {
if ( (current == NULL) || ((current != prev) && (prev != NULL)) ) { if ( (current == NULL) || ((current != prev) && (prev != NULL)) ) {
// Time to print. // Time to print.
to = i-1; to = i-1;
// Increment overall stats. // Increment overall stats.
outputMass += prev->outputMass; outputMass += prev->outputMass;
outputSize += prev->outputSize; outputSize += prev->outputSize;
@ -787,7 +787,7 @@ void printStats(void) {
from = i; from = i;
} }
prev = current; prev = current;
} }

View File

@ -10,15 +10,15 @@ int main(int argc, char* argv[])
using namespace boost::locale; using namespace boost::locale;
using namespace std; using namespace std;
generator gen; generator gen;
locale loc=gen(""); locale loc=gen("");
cout.imbue(loc); cout.imbue(loc);
cout << "Hello, World" << endl; cout << "Hello, World" << endl;
cout << "This is how we show currency in this locale " << as::currency << 103.34 << endl; cout << "This is how we show currency in this locale " << as::currency << 103.34 << endl;
return 0; return 0;
} }

File diff suppressed because it is too large Load Diff

View File

@ -1,231 +1,231 @@
// XGetopt.cpp Version 1.2 // XGetopt.cpp Version 1.2
// //
// Author: Hans Dietrich // Author: Hans Dietrich
// hdietrich2@hotmail.com // hdietrich2@hotmail.com
// //
// Description: // Description:
// XGetopt.cpp implements getopt(), a function to parse command lines. // XGetopt.cpp implements getopt(), a function to parse command lines.
// //
// History // History
// Version 1.2 - 2003 May 17 // Version 1.2 - 2003 May 17
// - Added Unicode support // - Added Unicode support
// //
// Version 1.1 - 2002 March 10 // Version 1.1 - 2002 March 10
// - Added example to XGetopt.cpp module header // - Added example to XGetopt.cpp module header
// //
// This software is released into the public domain. // This software is released into the public domain.
// You are free to use it in any way you like. // You are free to use it in any way you like.
// //
// This software is provided "as is" with no expressed // This software is provided "as is" with no expressed
// or implied warranty. I accept no liability for any // or implied warranty. I accept no liability for any
// damage or loss of business that this software may cause. // damage or loss of business that this software may cause.
// //
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
// if you are using precompiled headers then include this line: // if you are using precompiled headers then include this line:
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
// if you are not using precompiled headers then include these lines: // if you are not using precompiled headers then include these lines:
//#include <windows.h> //#include <windows.h>
//#include <cstdio> //#include <cstdio>
//#include <tchar.h> //#include <tchar.h>
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
#include <cstdio> #include <cstdio>
#include <cstring> #include <cstring>
#include <cmath> #include <cmath>
#include "WIN32_functions.h" #include "WIN32_functions.h"
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
// //
// X G e t o p t . c p p // X G e t o p t . c p p
// //
// //
// NAME // NAME
// getopt -- parse command line options // getopt -- parse command line options
// //
// SYNOPSIS // SYNOPSIS
// int getopt(int argc, char *argv[], char *optstring) // int getopt(int argc, char *argv[], char *optstring)
// //
// extern char *optarg; // extern char *optarg;
// extern int optind; // extern int optind;
// //
// DESCRIPTION // DESCRIPTION
// The getopt() function parses the command line arguments. Its // The getopt() function parses the command line arguments. Its
// arguments argc and argv are the argument count and array as // arguments argc and argv are the argument count and array as
// passed into the application on program invocation. In the case // passed into the application on program invocation. In the case
// of Visual C++ programs, argc and argv are available via the // of Visual C++ programs, argc and argv are available via the
// variables __argc and __argv (double underscores), respectively. // variables __argc and __argv (double underscores), respectively.
// getopt returns the next option letter in argv that matches a // getopt returns the next option letter in argv that matches a
// letter in optstring. (Note: Unicode programs should use // letter in optstring. (Note: Unicode programs should use
// __targv instead of __argv. Also, all character and string // __targv instead of __argv. Also, all character and string
// literals should be enclosed in ( ) ). // literals should be enclosed in ( ) ).
// //
// optstring is a string of recognized option letters; if a letter // optstring is a string of recognized option letters; if a letter
// is followed by a colon, the option is expected to have an argument // is followed by a colon, the option is expected to have an argument
// that may or may not be separated from it by white space. optarg // that may or may not be separated from it by white space. optarg
// is set to point to the start of the option argument on return from // is set to point to the start of the option argument on return from
// getopt. // getopt.
// //
// Option letters may be combined, e.g., "-ab" is equivalent to // Option letters may be combined, e.g., "-ab" is equivalent to
// "-a -b". Option letters are case sensitive. // "-a -b". Option letters are case sensitive.
// //
// getopt places in the external variable optind the argv index // getopt places in the external variable optind the argv index
// of the next argument to be processed. optind is initialized // of the next argument to be processed. optind is initialized
// to 0 before the first call to getopt. // to 0 before the first call to getopt.
// //
// When all options have been processed (i.e., up to the first // When all options have been processed (i.e., up to the first
// non-option argument), getopt returns EOF, optarg will point // non-option argument), getopt returns EOF, optarg will point
// to the argument, and optind will be set to the argv index of // to the argument, and optind will be set to the argv index of
// the argument. If there are no non-option arguments, optarg // the argument. If there are no non-option arguments, optarg
// will be set to NULL. // will be set to NULL.
// //
// The special option "--" may be used to delimit the end of the // The special option "--" may be used to delimit the end of the
// options; EOF will be returned, and "--" (and everything after it) // options; EOF will be returned, and "--" (and everything after it)
// will be skipped. // will be skipped.
// //
// RETURN VALUE // RETURN VALUE
// For option letters contained in the string optstring, getopt // For option letters contained in the string optstring, getopt
// will return the option letter. getopt returns a question mark (?) // will return the option letter. getopt returns a question mark (?)
// when it encounters an option letter not included in optstring. // when it encounters an option letter not included in optstring.
// EOF is returned when processing is finished. // EOF is returned when processing is finished.
// //
// BUGS // BUGS
// 1) Long options are not supported. // 1) Long options are not supported.
// 2) The GNU double-colon extension is not supported. // 2) The GNU double-colon extension is not supported.
// 3) The environment variable POSIXLY_CORRECT is not supported. // 3) The environment variable POSIXLY_CORRECT is not supported.
// 4) The + syntax is not supported. // 4) The + syntax is not supported.
// 5) The automatic permutation of arguments is not supported. // 5) The automatic permutation of arguments is not supported.
// 6) This implementation of getopt() returns EOF when option processing
// is finished, instead of -1 as the latest standard requires.
// //
// EXAMPLE // EXAMPLE
// BOOL CMyApp::ProcessCommandLine(int argc, char *argv[]) // BOOL CMyApp::ProcessCommandLine(int argc, char *argv[])
// { // {
// int c; // int c;
// //
// while ((c = getopt(argc, argv, ("aBn:"))) != EOF) // while ((c = getopt(argc, argv, ("aBn:"))) != EOF)
// { // {
// switch (c) // switch (c)
// { // {
// case ('a'): // case ('a'):
// TRACE(("option a\n")); // TRACE(("option a\n"));
// // // //
// // set some flag here // // set some flag here
// // // //
// break; // break;
// //
// case ('B'): // case ('B'):
// TRACE( ("option B\n")); // TRACE( ("option B\n"));
// // // //
// // set some other flag here // // set some other flag here
// // // //
// break; // break;
// //
// case ('n'): // case ('n'):
// TRACE(("option n: value=%d\n"), atoi(optarg)); // TRACE(("option n: value=%d\n"), atoi(optarg));
// // // //
// // do something with value here // // do something with value here
// // // //
// break; // break;
// //
// case ('?'): // case ('?'):
// TRACE(("ERROR: illegal option %s\n"), argv[optind-1]); // TRACE(("ERROR: illegal option %s\n"), argv[optind-1]);
// return FALSE; // return FALSE;
// break; // break;
// //
// default: // default:
// TRACE(("WARNING: no handler for option %c\n"), c); // TRACE(("WARNING: no handler for option %c\n"), c);
// return FALSE; // return FALSE;
// break; // break;
// } // }
// } // }
// // // //
// // check for non-option args here // // check for non-option args here
// // // //
// return TRUE; // return TRUE;
// } // }
// //
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
// State shared between getopt() and its callers.
char *optarg;     // global argument pointer: argument of the option just
                  // returned, or the first non-option argument on EOF
int optind = 0;   // global argv index: next argv entry to examine; setting it
                  // back to 0 restarts parsing

/**
 * Parse the next single-letter option from the command line.
 *
 * @param argc      number of entries in argv
 * @param argv      argument vector; argv[0] is skipped on the first call
 * @param optstring recognised option letters; a letter followed by ':' takes
 *                  an argument, either attached ("-n5") or as the next argv
 *                  entry ("-n 5")
 * @return the option letter; '?' for a letter not in optstring (or ':'
 *         itself) and for an option whose required argument is missing;
 *         EOF once the first non-option argument, a lone "-", "--", or the
 *         end of argv is reached.
 *
 * On EOF, optarg points at argv[optind] if one remains (for "--" that is the
 * entry after it), otherwise it is NULL.
 */
int getopt(int argc, char *argv[], char *optstring)
{
  // Cursor inside the option cluster currently being scanned, e.g. the
  // "ab" of "-ab"; NULL or pointing at '\0' when a new argv entry is needed.
  static char *next = NULL;

  if (optind == 0)
    next = NULL;            // caller restarted parsing: forget any old cluster

  optarg = NULL;

  if (next == NULL || *next == '\0') {
    if (optind == 0)
      optind++;             // skip argv[0]

    // End of options: argv exhausted, entry does not start with '-',
    // or entry is exactly "-".
    if (optind >= argc || argv[optind][0] != '-' || argv[optind][1] == '\0') {
      if (optind < argc)
        optarg = argv[optind];
      return EOF;
    }

    // Explicit end-of-options marker: step over it.
    if (strcmp(argv[optind], "--") == 0) {
      optind++;
      if (optind < argc)
        optarg = argv[optind];
      return EOF;
    }

    next = argv[optind] + 1;  // skip past -
    optind++;
  }

  char c = *next++;
  char *cp = strchr(optstring, c);

  // ':' is only a marker inside optstring, never a valid option letter.
  if (cp == NULL || c == ':')
    return '?';

  if (cp[1] == ':') {
    if (*next != '\0') {
      // Argument attached to the option: "-n5".
      optarg = next;
      next = NULL;
    } else if (optind < argc) {
      // Argument is the following argv entry: "-n 5".
      optarg = argv[optind];
      optind++;
    } else {
      // Required argument is missing.
      return '?';
    }
  }

  return c;
}
// for an overview, see // for an overview, see
// W. Press, S. Teukolsky and W. Vetterling. (1992) Numerical Recipes in C. Chapter 6.1. // W. Press, S. Teukolsky and W. Vetterling. (1992) Numerical Recipes in C. Chapter 6.1.
/**
 * Natural logarithm of the gamma function for an integer argument,
 * i.e. log((x-1)!) for x >= 1.
 *
 * @param x integer argument
 * @return 0.0 for every x <= 2 (exact for x = 1 and x = 2; values below 1
 *         are also mapped to 0.0 rather than being reported as an error);
 *         otherwise the six-coefficient series approximation below.
 */
double lgamma(int x)
{
  // A direct factorial product was used here originally; it is replaced by
  // the series approximation below.
  if (x <= 2) {
    return 0.0;
  }

  static const double coefs[6] = {
    76.18009172947146,     -86.50532032941677,    24.01409824083091,
    -1.231739572450155,    0.1208650973866179e-2, -0.5395239384953e-5
  };

  const double xd = static_cast<double>(x);

  double tmp = xd + 5.5;
  tmp -= (xd + 0.5) * log(tmp);

  // Series term: each coefficient is divided by x+1, x+2, ..., x+6.
  double y = xd;
  double sum = 1.000000000190015;
  for (size_t j = 0; j < 6; ++j) {
    sum += coefs[j] / ++y;
  }

  // 2.5066282746310005 is sqrt(2*pi).
  return -tmp + log(2.5066282746310005 * sum / xd);
}

View File

@ -1,24 +1,24 @@
// XGetopt.h Version 1.2 // XGetopt.h Version 1.2
// //
// Author: Hans Dietrich // Author: Hans Dietrich
// hdietrich2@hotmail.com // hdietrich2@hotmail.com
// //
// This software is released into the public domain. // This software is released into the public domain.
// You are free to use it in any way you like. // You are free to use it in any way you like.
// //
// This software is provided "as is" with no expressed // This software is provided "as is" with no expressed
// or implied warranty. I accept no liability for any // or implied warranty. I accept no liability for any
// damage or loss of business that this software may cause. // damage or loss of business that this software may cause.
// //
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
#ifndef XGETOPT_H
#define XGETOPT_H

// Parser state exposed to callers of getopt().
// NOTE(review): opterr is declared here, but the XGetopt.cpp shown alongside
// this header defines only optarg and optind -- confirm a definition exists
// before referencing opterr.
extern int optind, opterr;
extern char *optarg;

// Returns the next option letter found in argv, '?' for an unrecognised
// option or a missing option argument, and EOF when option processing ends.
int getopt(int argc, char *argv[], char *optstring);

// Log-gamma for an integer argument; returns 0.0 for x <= 2.
double lgamma(int x);

#endif //XGETOPT_H

View File

@ -1,5 +1,5 @@
#include <cstring> #include <cstring>
#include <cassert> #include <cassert>
#include <cstdio> #include <cstdio>
#include <cstdlib> #include <cstdlib>

View File

@ -234,13 +234,13 @@ void OutputAlignment(ostream &out, const AlignmentInfo &ai, size_t sourceOffset,
{ {
typedef std::vector< const std::pair<size_t,size_t>* > AlignVec; typedef std::vector< const std::pair<size_t,size_t>* > AlignVec;
AlignVec alignments = ai.GetSortedAlignments(); AlignVec alignments = ai.GetSortedAlignments();
AlignVec::const_iterator it; AlignVec::const_iterator it;
for (it = alignments.begin(); it != alignments.end(); ++it) { for (it = alignments.begin(); it != alignments.end(); ++it) {
const std::pair<size_t,size_t> &alignment = **it; const std::pair<size_t,size_t> &alignment = **it;
out << alignment.first + sourceOffset << "-" << alignment.second + targetOffset << " "; out << alignment.first + sourceOffset << "-" << alignment.second + targetOffset << " ";
} }
} }
void OutputAlignment(ostream &out, const vector<const Hypothesis *> &edges) void OutputAlignment(ostream &out, const vector<const Hypothesis *> &edges)
@ -251,7 +251,7 @@ void OutputAlignment(ostream &out, const vector<const Hypothesis *> &edges)
const Hypothesis &edge = *edges[currEdge]; const Hypothesis &edge = *edges[currEdge];
const TargetPhrase &tp = edge.GetCurrTargetPhrase(); const TargetPhrase &tp = edge.GetCurrTargetPhrase();
size_t sourceOffset = edge.GetCurrSourceWordsRange().GetStartPos(); size_t sourceOffset = edge.GetCurrSourceWordsRange().GetStartPos();
OutputAlignment(out, tp.GetAlignmentInfo(), sourceOffset, targetOffset); OutputAlignment(out, tp.GetAlignmentInfo(), sourceOffset, targetOffset);
targetOffset += tp.GetSize(); targetOffset += tp.GetSize();
@ -263,7 +263,7 @@ void OutputAlignment(OutputCollector* collector, size_t lineNo , const vector<co
{ {
ostringstream out; ostringstream out;
OutputAlignment(out, edges); OutputAlignment(out, edges);
collector->Write(lineNo,out.str()); collector->Write(lineNo,out.str());
} }
@ -477,7 +477,7 @@ void OutputNBest(std::ostream& out, const Moses::TrellisPathList &nBestList, con
const int sourceOffset = sourceRange.GetStartPos(); const int sourceOffset = sourceRange.GetStartPos();
const int targetOffset = targetRange.GetStartPos(); const int targetOffset = targetRange.GetStartPos();
const AlignmentInfo &ai = edge.GetCurrTargetPhrase().GetAlignmentInfo(); const AlignmentInfo &ai = edge.GetCurrTargetPhrase().GetAlignmentInfo();
OutputAlignment(out, ai, sourceOffset, targetOffset); OutputAlignment(out, ai, sourceOffset, targetOffset);
} }

View File

@ -168,18 +168,18 @@ static void ShowWeights()
int main(int argc, char** argv) int main(int argc, char** argv)
{ {
try { try {
// echo command line, if verbose // echo command line, if verbose
IFVERBOSE(1) { IFVERBOSE(1) {
TRACE_ERR("command: "); TRACE_ERR("command: ");
for(int i=0; i<argc; ++i) TRACE_ERR(argv[i]<<" "); for(int i=0; i<argc; ++i) TRACE_ERR(argv[i]<<" ");
TRACE_ERR(endl); TRACE_ERR(endl);
} }
// set number of significant decimals in output // set number of significant decimals in output
fix(cout,PRECISION); fix(cout,PRECISION);
fix(cerr,PRECISION); fix(cerr,PRECISION);
// load all the settings into the Parameter class // load all the settings into the Parameter class
// (stores them as strings, or array of strings) // (stores them as strings, or array of strings)
Parameter* params = new Parameter(); Parameter* params = new Parameter();
@ -187,34 +187,34 @@ int main(int argc, char** argv)
params->Explain(); params->Explain();
exit(1); exit(1);
} }
// initialize all "global" variables, which are stored in StaticData // initialize all "global" variables, which are stored in StaticData
// note: this also loads models such as the language model, etc. // note: this also loads models such as the language model, etc.
if (!StaticData::LoadDataStatic(params, argv[0])) { if (!StaticData::LoadDataStatic(params, argv[0])) {
exit(1); exit(1);
} }
// setting "-show-weights" -> just dump out weights and exit // setting "-show-weights" -> just dump out weights and exit
if (params->isParamSpecified("show-weights")) { if (params->isParamSpecified("show-weights")) {
ShowWeights(); ShowWeights();
exit(0); exit(0);
} }
// shorthand for accessing information in StaticData // shorthand for accessing information in StaticData
const StaticData& staticData = StaticData::Instance(); const StaticData& staticData = StaticData::Instance();
//initialise random numbers //initialise random numbers
rand_init(); rand_init();
// set up read/writing class // set up read/writing class
IOWrapper* ioWrapper = GetIOWrapper(staticData); IOWrapper* ioWrapper = GetIOWrapper(staticData);
if (!ioWrapper) { if (!ioWrapper) {
cerr << "Error; Failed to create IO object" << endl; cerr << "Error; Failed to create IO object" << endl;
exit(1); exit(1);
} }
// check on weights // check on weights
vector<float> weights = staticData.GetAllWeights(); vector<float> weights = staticData.GetAllWeights();
IFVERBOSE(2) { IFVERBOSE(2) {
@ -233,7 +233,7 @@ int main(int argc, char** argv)
// setting lexicalized reordering setup // setting lexicalized reordering setup
PhraseBasedReorderingState::m_useFirstBackwardScore = false; PhraseBasedReorderingState::m_useFirstBackwardScore = false;
auto_ptr<OutputCollector> outputCollector; auto_ptr<OutputCollector> outputCollector;
outputCollector.reset(new OutputCollector()); outputCollector.reset(new OutputCollector());
@ -241,7 +241,7 @@ int main(int argc, char** argv)
#ifdef WITH_THREADS #ifdef WITH_THREADS
ThreadPool pool(staticData.ThreadCount()); ThreadPool pool(staticData.ThreadCount());
#endif #endif
// main loop over set of input sentences // main loop over set of input sentences
InputType* source = NULL; InputType* source = NULL;
size_t lineCount = 0; size_t lineCount = 0;
@ -259,11 +259,11 @@ int main(int argc, char** argv)
task->Run(); task->Run();
delete task; delete task;
#endif #endif
source = NULL; //make sure it doesn't get deleted source = NULL; //make sure it doesn't get deleted
++lineCount; ++lineCount;
} }
// we are done, finishing up // we are done, finishing up
#ifdef WITH_THREADS #ifdef WITH_THREADS
pool.Stop(true); //flush remaining jobs pool.Stop(true); //flush remaining jobs

View File

@ -70,7 +70,7 @@ namespace MosesCmd
if (neg_log_div > 100){ if (neg_log_div > 100){
return 100; return 100;
} }
return neg_log_div; return neg_log_div;
} }
void RelativeEntropyCalc::ConcatOutputPhraseRecursive(Phrase& phrase, const Hypothesis *hypo){ void RelativeEntropyCalc::ConcatOutputPhraseRecursive(Phrase& phrase, const Hypothesis *hypo){

View File

@ -57,7 +57,7 @@ void PrintTranslationAnalysis(const TranslationSystem* system, std::ostream &os,
} }
} }
} }
bool epsilon = false; bool epsilon = false;
if (target == "") { if (target == "") {
target="<EPSILON>"; target="<EPSILON>";

View File

@ -60,12 +60,12 @@ static void add(const string& e, const vector<float> scores,
static void finalise(Probs& p_e_given_f, Probs& p_f_given_e) { static void finalise(Probs& p_e_given_f, Probs& p_f_given_e) {
//cerr << "Sizes: p(e|f): " << p_e_given_f.size() << " p(f|e): " << p_f_given_e.size() << endl; //cerr << "Sizes: p(e|f): " << p_e_given_f.size() << " p(f|e): " << p_f_given_e.size() << endl;
for (Probs::const_iterator e1_iter = p_f_given_e.begin() ; for (Probs::const_iterator e1_iter = p_f_given_e.begin() ;
e1_iter != p_f_given_e.end(); ++e1_iter) { e1_iter != p_f_given_e.end(); ++e1_iter) {
for (Probs::const_iterator e2_iter = p_e_given_f.begin() ; for (Probs::const_iterator e2_iter = p_e_given_f.begin() ;
e2_iter != p_e_given_f.end(); ++e2_iter) { e2_iter != p_e_given_f.end(); ++e2_iter) {
if (e1_iter->second == e2_iter->second) continue; if (e1_iter->second == e2_iter->second) continue;
cout << e1_iter->second << " ||| " << e2_iter->second << " ||| " << cout << e1_iter->second << " ||| " << e2_iter->second << " ||| " <<
e1_iter->first * e2_iter->first << " ||| " << endl; e1_iter->first * e2_iter->first << " ||| " << endl;
} }

View File

@ -3,10 +3,10 @@
// The separate moses server executable is being phased out. // The separate moses server executable is being phased out.
// Since there were problems with the migration into the main // Since there were problems with the migration into the main
// executable, this separate program is still included in the // executable, this separate program is still included in the
// distribution for legacy reasons. Contributors are encouraged // distribution for legacy reasons. Contributors are encouraged
// to add their contributions to moses/server rather than // to add their contributions to moses/server rather than
// contrib/server. This recommendation does not apply to wrapper // contrib/server. This recommendation does not apply to wrapper
// scripts. // scripts.
// The future is this: // The future is this:
/** main function of the command line version of the decoder **/ /** main function of the command line version of the decoder **/
@ -83,7 +83,7 @@ public:
pdsa->add(source_,target_,alignment_); pdsa->add(source_,target_,alignment_);
#else #else
const PhraseDictionary* pdf = PhraseDictionary::GetColl()[0]; const PhraseDictionary* pdf = PhraseDictionary::GetColl()[0];
PhraseDictionaryDynSuffixArray* PhraseDictionaryDynSuffixArray*
pdsa = (PhraseDictionaryDynSuffixArray*) pdf; pdsa = (PhraseDictionaryDynSuffixArray*) pdf;
cerr << "Inserting into address " << pdsa << endl; cerr << "Inserting into address " << pdsa << endl;
pdsa->insertSnt(source_, target_, alignment_); pdsa->insertSnt(source_, target_, alignment_);
@ -146,7 +146,7 @@ public:
} }
} }
*/ */
void breakOutParams(const params_t& params) { void breakOutParams(const params_t& params) {
params_t::const_iterator si = params.find("source"); params_t::const_iterator si = params.find("source");
if(si == params.end()) if(si == params.end())
@ -236,7 +236,7 @@ public:
class TranslationTask : public virtual Moses::TranslationTask { class TranslationTask : public virtual Moses::TranslationTask {
protected: protected:
TranslationTask(xmlrpc_c::paramList const& paramList, TranslationTask(xmlrpc_c::paramList const& paramList,
boost::condition_variable& cond, boost::mutex& mut) boost::condition_variable& cond, boost::mutex& mut)
: m_paramList(paramList), : m_paramList(paramList),
m_cond(cond), m_cond(cond),
m_mut(mut), m_mut(mut),
@ -244,7 +244,7 @@ protected:
{} {}
public: public:
static boost::shared_ptr<TranslationTask> static boost::shared_ptr<TranslationTask>
create(xmlrpc_c::paramList const& paramList, create(xmlrpc_c::paramList const& paramList,
boost::condition_variable& cond, boost::mutex& mut) boost::condition_variable& cond, boost::mutex& mut)
{ {
@ -252,15 +252,15 @@ public:
ret->m_self = ret; ret->m_self = ret;
return ret; return ret;
} }
virtual bool DeleteAfterExecution() {return false;} virtual bool DeleteAfterExecution() {return false;}
bool IsDone() const {return m_done;} bool IsDone() const {return m_done;}
const map<string, xmlrpc_c::value>& GetRetData() { return m_retData;} const map<string, xmlrpc_c::value>& GetRetData() { return m_retData;}
virtual void virtual void
Run() Run()
{ {
using namespace xmlrpc_c; using namespace xmlrpc_c;
const params_t params = m_paramList.getStruct(0); const params_t params = m_paramList.getStruct(0);
@ -292,25 +292,25 @@ public:
vector<float> multiModelWeights; vector<float> multiModelWeights;
si = params.find("lambda"); si = params.find("lambda");
if (si != params.end()) if (si != params.end())
{ {
value_array multiModelArray = value_array(si->second); value_array multiModelArray = value_array(si->second);
vector<value> multiModelValueVector(multiModelArray.vectorValueValue()); vector<value> multiModelValueVector(multiModelArray.vectorValueValue());
for (size_t i=0;i < multiModelValueVector.size();i++) for (size_t i=0;i < multiModelValueVector.size();i++)
{ {
multiModelWeights.push_back(value_double(multiModelValueVector[i])); multiModelWeights.push_back(value_double(multiModelValueVector[i]));
} }
} }
si = params.find("model_name"); si = params.find("model_name");
if (si != params.end() && multiModelWeights.size() > 0) if (si != params.end() && multiModelWeights.size() > 0)
{ {
const string model_name = value_string(si->second); const string model_name = value_string(si->second);
PhraseDictionaryMultiModel* pdmm PhraseDictionaryMultiModel* pdmm
= (PhraseDictionaryMultiModel*) FindPhraseDictionary(model_name); = (PhraseDictionaryMultiModel*) FindPhraseDictionary(model_name);
pdmm->SetTemporaryMultiModelWeightsVector(multiModelWeights); pdmm->SetTemporaryMultiModelWeightsVector(multiModelWeights);
} }
const StaticData &staticData = StaticData::Instance(); const StaticData &staticData = StaticData::Instance();
//Make sure alternative paths are retained, if necessary //Make sure alternative paths are retained, if necessary
@ -321,7 +321,7 @@ public:
stringstream out, graphInfo, transCollOpts; stringstream out, graphInfo, transCollOpts;
if (staticData.IsSyntax()) if (staticData.IsSyntax())
{ {
boost::shared_ptr<TreeInput> tinput(new TreeInput); boost::shared_ptr<TreeInput> tinput(new TreeInput);
const vector<FactorType>& IFO = staticData.GetInputFactorOrder(); const vector<FactorType>& IFO = staticData.GetInputFactorOrder();
@ -338,8 +338,8 @@ public:
manager.OutputSearchGraphMoses(sgstream); manager.OutputSearchGraphMoses(sgstream);
m_retData["sg"] = value_string(sgstream.str()); m_retData["sg"] = value_string(sgstream.str());
} }
} }
else else
{ {
// size_t lineNumber = 0; // TODO: Include sentence request number here? // size_t lineNumber = 0; // TODO: Include sentence request number here?
boost::shared_ptr<Sentence> sentence(new Sentence(0,source)); boost::shared_ptr<Sentence> sentence(new Sentence(0,source));
@ -351,30 +351,30 @@ public:
vector<xmlrpc_c::value> alignInfo; vector<xmlrpc_c::value> alignInfo;
outputHypo(out,hypo,addAlignInfo,alignInfo,reportAllFactors); outputHypo(out,hypo,addAlignInfo,alignInfo,reportAllFactors);
if (addAlignInfo) m_retData["align"] = value_array(alignInfo); if (addAlignInfo) m_retData["align"] = value_array(alignInfo);
if (addWordAlignInfo) if (addWordAlignInfo)
{ {
stringstream wordAlignment; stringstream wordAlignment;
hypo->OutputAlignment(wordAlignment); hypo->OutputAlignment(wordAlignment);
vector<xmlrpc_c::value> alignments; vector<xmlrpc_c::value> alignments;
string alignmentPair; string alignmentPair;
while (wordAlignment >> alignmentPair) while (wordAlignment >> alignmentPair)
{ {
int pos = alignmentPair.find('-'); int pos = alignmentPair.find('-');
map<string, xmlrpc_c::value> wordAlignInfo; map<string, xmlrpc_c::value> wordAlignInfo;
wordAlignInfo["source-word"] wordAlignInfo["source-word"]
= value_int(atoi(alignmentPair.substr(0, pos).c_str())); = value_int(atoi(alignmentPair.substr(0, pos).c_str()));
wordAlignInfo["target-word"] wordAlignInfo["target-word"]
= value_int(atoi(alignmentPair.substr(pos + 1).c_str())); = value_int(atoi(alignmentPair.substr(pos + 1).c_str()));
alignments.push_back(value_struct(wordAlignInfo)); alignments.push_back(value_struct(wordAlignInfo));
} }
m_retData["word-align"] = value_array(alignments); m_retData["word-align"] = value_array(alignments);
} }
if (addGraphInfo) insertGraphInfo(manager,m_retData); if (addGraphInfo) insertGraphInfo(manager,m_retData);
if (addTopts) insertTranslationOptions(manager,m_retData); if (addTopts) insertTranslationOptions(manager,m_retData);
if (nbest_size > 0) if (nbest_size > 0)
{ {
outputNBest(manager, m_retData, nbest_size, nbest_distinct, outputNBest(manager, m_retData, nbest_size, nbest_distinct,
reportAllFactors, addAlignInfo, addScoreBreakdown); reportAllFactors, addAlignInfo, addScoreBreakdown);
} }
(const_cast<StaticData&>(staticData)).SetOutputSearchGraph(false); (const_cast<StaticData&>(staticData)).SetOutputSearchGraph(false);
@ -389,11 +389,11 @@ public:
} }
void outputHypo(ostream& out, const Hypothesis* hypo, void outputHypo(ostream& out, const Hypothesis* hypo,
bool addAlignmentInfo, vector<xmlrpc_c::value>& alignInfo, bool addAlignmentInfo, vector<xmlrpc_c::value>& alignInfo,
bool reportAllFactors = false) { bool reportAllFactors = false) {
if (hypo->GetPrevHypo() != NULL) { if (hypo->GetPrevHypo() != NULL) {
outputHypo(out,hypo->GetPrevHypo(),addAlignmentInfo, outputHypo(out,hypo->GetPrevHypo(),addAlignmentInfo,
alignInfo, reportAllFactors); alignInfo, reportAllFactors);
Phrase p = hypo->GetCurrTargetPhrase(); Phrase p = hypo->GetCurrTargetPhrase();
if(reportAllFactors) { if(reportAllFactors) {
@ -547,14 +547,14 @@ public:
retData.insert(pair<string, xmlrpc_c::value>("nbest", xmlrpc_c::value_array(nBestXml))); retData.insert(pair<string, xmlrpc_c::value>("nbest", xmlrpc_c::value_array(nBestXml)));
} }
void void
insertTranslationOptions(Manager& manager, map<string, xmlrpc_c::value>& retData) insertTranslationOptions(Manager& manager, map<string, xmlrpc_c::value>& retData)
{ {
const TranslationOptionCollection* toptsColl = manager.getSntTranslationOptions(); const TranslationOptionCollection* toptsColl = manager.getSntTranslationOptions();
vector<xmlrpc_c::value> toptsXml; vector<xmlrpc_c::value> toptsXml;
size_t const stop = toptsColl->GetSource().GetSize(); size_t const stop = toptsColl->GetSource().GetSize();
TranslationOptionList const* tol; TranslationOptionList const* tol;
for (size_t s = 0 ; s < stop ; ++s) for (size_t s = 0 ; s < stop ; ++s)
{ {
for (size_t e = s; (tol = toptsColl->GetTranslationOptionList(s,e)) != NULL; ++e) for (size_t e = s; (tol = toptsColl->GetTranslationOptionList(s,e)) != NULL; ++e)
{ {
@ -569,11 +569,11 @@ public:
toptXml["start"] = xmlrpc_c::value_int(s); toptXml["start"] = xmlrpc_c::value_int(s);
toptXml["end"] = xmlrpc_c::value_int(e); toptXml["end"] = xmlrpc_c::value_int(e);
vector<xmlrpc_c::value> scoresXml; vector<xmlrpc_c::value> scoresXml;
const std::valarray<FValue> &scores const std::valarray<FValue> &scores
= topt->GetScoreBreakdown().getCoreFeatures(); = topt->GetScoreBreakdown().getCoreFeatures();
for (size_t j = 0; j < scores.size(); ++j) for (size_t j = 0; j < scores.size(); ++j)
scoresXml.push_back(xmlrpc_c::value_double(scores[j])); scoresXml.push_back(xmlrpc_c::value_double(scores[j]));
toptXml["scores"] = xmlrpc_c::value_array(scoresXml); toptXml["scores"] = xmlrpc_c::value_array(scoresXml);
toptsXml.push_back(xmlrpc_c::value_struct(toptXml)); toptsXml.push_back(xmlrpc_c::value_struct(toptXml));
} }
@ -581,7 +581,7 @@ public:
} }
retData.insert(pair<string, xmlrpc_c::value>("topt", xmlrpc_c::value_array(toptsXml))); retData.insert(pair<string, xmlrpc_c::value>("topt", xmlrpc_c::value_array(toptsXml)));
} }
private: private:
xmlrpc_c::paramList const& m_paramList; xmlrpc_c::paramList const& m_paramList;
map<string, xmlrpc_c::value> m_retData; map<string, xmlrpc_c::value> m_retData;
@ -619,8 +619,8 @@ private:
Moses::ThreadPool m_threadPool; Moses::ThreadPool m_threadPool;
}; };
static static
void void
PrintFeatureWeight(ostream& out, const FeatureFunction* ff) PrintFeatureWeight(ostream& out, const FeatureFunction* ff)
{ {
out << ff->GetScoreProducerDescription() << "="; out << ff->GetScoreProducerDescription() << "=";
@ -632,16 +632,16 @@ PrintFeatureWeight(ostream& out, const FeatureFunction* ff)
out << endl; out << endl;
} }
static static
void void
ShowWeights(ostream& out) ShowWeights(ostream& out)
{ {
// adapted from moses-cmd/Main.cpp // adapted from moses-cmd/Main.cpp
std::ios::fmtflags old_flags = out.setf(std::ios::fixed); std::ios::fmtflags old_flags = out.setf(std::ios::fixed);
size_t old_precision = out.precision(6); size_t old_precision = out.precision(6);
const vector<const StatelessFeatureFunction*>& const vector<const StatelessFeatureFunction*>&
slf = StatelessFeatureFunction::GetStatelessFeatureFunctions(); slf = StatelessFeatureFunction::GetStatelessFeatureFunctions();
const vector<const StatefulFeatureFunction*>& const vector<const StatefulFeatureFunction*>&
sff = StatefulFeatureFunction::GetStatefulFeatureFunctions(); sff = StatefulFeatureFunction::GetStatefulFeatureFunctions();
for (size_t i = 0; i < sff.size(); ++i) { for (size_t i = 0; i < sff.size(); ++i) {
@ -662,7 +662,7 @@ ShowWeights(ostream& out)
out << ff->GetScoreProducerDescription() << " UNTUNEABLE" << endl; out << ff->GetScoreProducerDescription() << " UNTUNEABLE" << endl;
} }
} }
if (! (old_flags & std::ios::fixed)) if (! (old_flags & std::ios::fixed))
out.unsetf(std::ios::fixed); out.unsetf(std::ios::fixed);
out.precision(old_precision); out.precision(old_precision);
} }
@ -754,7 +754,7 @@ int main(int argc, char** argv)
.allowOrigin("*") .allowOrigin("*")
); );
*/ */
XVERBOSE(1,"Listening on port " << port << endl); XVERBOSE(1,"Listening on port " << port << endl);
if (isSerial) { if (isSerial) {
while(1) myAbyssServer.runOnce(); while(1) myAbyssServer.runOnce();

View File

@ -1,231 +1,231 @@
// XGetopt.cpp Version 1.2 // XGetopt.cpp Version 1.2
// //
// Author: Hans Dietrich // Author: Hans Dietrich
// hdietrich2@hotmail.com // hdietrich2@hotmail.com
// //
// Description: // Description:
// XGetopt.cpp implements getopt(), a function to parse command lines. // XGetopt.cpp implements getopt(), a function to parse command lines.
// //
// History // History
// Version 1.2 - 2003 May 17 // Version 1.2 - 2003 May 17
// - Added Unicode support // - Added Unicode support
// //
// Version 1.1 - 2002 March 10 // Version 1.1 - 2002 March 10
// - Added example to XGetopt.cpp module header // - Added example to XGetopt.cpp module header
// //
// This software is released into the public domain. // This software is released into the public domain.
// You are free to use it in any way you like. // You are free to use it in any way you like.
// //
// This software is provided "as is" with no expressed // This software is provided "as is" with no expressed
// or implied warranty. I accept no liability for any // or implied warranty. I accept no liability for any
// damage or loss of business that this software may cause. // damage or loss of business that this software may cause.
// //
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
// if you are using precompiled headers then include this line: // if you are using precompiled headers then include this line:
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
// if you are not using precompiled headers then include these lines: // if you are not using precompiled headers then include these lines:
//#include <windows.h> //#include <windows.h>
//#include <cstdio> //#include <cstdio>
//#include <tchar.h> //#include <tchar.h>
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
#include <cstdio> #include <cstdio>
#include <cstring> #include <cstring>
#include <cmath> #include <cmath>
#include "WIN32_functions.h" #include "WIN32_functions.h"
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
// //
// X G e t o p t . c p p // X G e t o p t . c p p
// //
// //
// NAME // NAME
// getopt -- parse command line options // getopt -- parse command line options
// //
// SYNOPSIS // SYNOPSIS
// int getopt(int argc, char *argv[], char *optstring) // int getopt(int argc, char *argv[], char *optstring)
// //
// extern char *optarg; // extern char *optarg;
// extern int optind; // extern int optind;
// //
// DESCRIPTION // DESCRIPTION
// The getopt() function parses the command line arguments. Its // The getopt() function parses the command line arguments. Its
// arguments argc and argv are the argument count and array as // arguments argc and argv are the argument count and array as
// passed into the application on program invocation. In the case // passed into the application on program invocation. In the case
// of Visual C++ programs, argc and argv are available via the // of Visual C++ programs, argc and argv are available via the
// variables __argc and __argv (double underscores), respectively. // variables __argc and __argv (double underscores), respectively.
// getopt returns the next option letter in argv that matches a // getopt returns the next option letter in argv that matches a
// letter in optstring. (Note: Unicode programs should use // letter in optstring. (Note: Unicode programs should use
// __targv instead of __argv. Also, all character and string // __targv instead of __argv. Also, all character and string
// literals should be enclosed in ( ) ). // literals should be enclosed in ( ) ).
// //
// optstring is a string of recognized option letters; if a letter // optstring is a string of recognized option letters; if a letter
// is followed by a colon, the option is expected to have an argument // is followed by a colon, the option is expected to have an argument
// that may or may not be separated from it by white space. optarg // that may or may not be separated from it by white space. optarg
// is set to point to the start of the option argument on return from // is set to point to the start of the option argument on return from
// getopt. // getopt.
// //
// Option letters may be combined, e.g., "-ab" is equivalent to // Option letters may be combined, e.g., "-ab" is equivalent to
// "-a -b". Option letters are case sensitive. // "-a -b". Option letters are case sensitive.
// //
// getopt places in the external variable optind the argv index // getopt places in the external variable optind the argv index
// of the next argument to be processed. optind is initialized // of the next argument to be processed. optind is initialized
// to 0 before the first call to getopt. // to 0 before the first call to getopt.
// //
// When all options have been processed (i.e., up to the first // When all options have been processed (i.e., up to the first
// non-option argument), getopt returns EOF, optarg will point // non-option argument), getopt returns EOF, optarg will point
// to the argument, and optind will be set to the argv index of // to the argument, and optind will be set to the argv index of
// the argument. If there are no non-option arguments, optarg // the argument. If there are no non-option arguments, optarg
// will be set to NULL. // will be set to NULL.
// //
// The special option "--" may be used to delimit the end of the // The special option "--" may be used to delimit the end of the
// options; EOF will be returned, and "--" (and everything after it) // options; EOF will be returned, and "--" (and everything after it)
// will be skipped. // will be skipped.
// //
// RETURN VALUE // RETURN VALUE
// For option letters contained in the string optstring, getopt // For option letters contained in the string optstring, getopt
// will return the option letter. getopt returns a question mark (?) // will return the option letter. getopt returns a question mark (?)
// when it encounters an option letter not included in optstring. // when it encounters an option letter not included in optstring.
// EOF is returned when processing is finished. // EOF is returned when processing is finished.
// //
// BUGS // BUGS
// 1) Long options are not supported. // 1) Long options are not supported.
// 2) The GNU double-colon extension is not supported. // 2) The GNU double-colon extension is not supported.
// 3) The environment variable POSIXLY_CORRECT is not supported. // 3) The environment variable POSIXLY_CORRECT is not supported.
// 4) The + syntax is not supported. // 4) The + syntax is not supported.
// 5) The automatic permutation of arguments is not supported. // 5) The automatic permutation of arguments is not supported.
// 6) This implementation of getopt() returns EOF when option
// processing is finished, instead of -1 as the latest standard requires.
// //
// EXAMPLE // EXAMPLE
// BOOL CMyApp::ProcessCommandLine(int argc, char *argv[]) // BOOL CMyApp::ProcessCommandLine(int argc, char *argv[])
// { // {
// int c; // int c;
// //
// while ((c = getopt(argc, argv, ("aBn:"))) != EOF) // while ((c = getopt(argc, argv, ("aBn:"))) != EOF)
// { // {
// switch (c) // switch (c)
// { // {
// case ('a'): // case ('a'):
// TRACE(("option a\n")); // TRACE(("option a\n"));
// // // //
// // set some flag here // // set some flag here
// // // //
// break; // break;
// //
// case ('B'): // case ('B'):
// TRACE( ("option B\n")); // TRACE( ("option B\n"));
// // // //
// // set some other flag here // // set some other flag here
// // // //
// break; // break;
// //
// case ('n'): // case ('n'):
// TRACE(("option n: value=%d\n"), atoi(optarg)); // TRACE(("option n: value=%d\n"), atoi(optarg));
// // // //
// // do something with value here // // do something with value here
// // // //
// break; // break;
// //
// case ('?'): // case ('?'):
// TRACE(("ERROR: illegal option %s\n"), argv[optind-1]); // TRACE(("ERROR: illegal option %s\n"), argv[optind-1]);
// return FALSE; // return FALSE;
// break; // break;
// //
// default: // default:
// TRACE(("WARNING: no handler for option %c\n"), c); // TRACE(("WARNING: no handler for option %c\n"), c);
// return FALSE; // return FALSE;
// break; // break;
// } // }
// } // }
// // // //
// // check for non-option args here // // check for non-option args here
// // // //
// return TRUE; // return TRUE;
// } // }
// //
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
// Parser state shared with callers (declared extern in XGetopt.h).
char *optarg;     // argument of the option just returned, or the first
                  // non-option argument when getopt() returns EOF
int optind = 0;   // argv index of the next element to examine; setting it
                  // back to 0 restarts the scan on the next call

// Minimal getopt(): returns the next single-letter option found in argv.
//
//   argc, argv  argument vector; argv[0] is never examined.
//   optstring   accepted option letters; a letter followed by ':' takes an
//               argument, either attached ("-n7") or as the next argv
//               element ("-n 7").
//
// Returns the option letter, '?' for a letter that is not in optstring (or
// is ':' itself) and for an option whose required argument is missing, and
// EOF once the first non-option argument, a lone "-", "--", or the end of
// argv is reached.  On EOF, optarg points at the argument at optind (the
// one following "--" when "--" was seen) or is NULL when argv is exhausted.
int getopt(int argc, char *argv[], char *optstring)
{
  // Position inside the argv element whose letters are being consumed, so
  // that grouped options ("-ab") are handed out over successive calls.
  static char *cursor = NULL;

  if (optind == 0) {
    cursor = NULL;              // caller asked for a fresh scan
  }
  optarg = NULL;

  const bool needNewElement = (cursor == NULL) || (*cursor == '\0');
  if (needNewElement) {
    if (optind == 0) {
      optind = 1;               // skip the program name
    }

    // End of options: argv exhausted, a non-option word, or a lone "-".
    const bool exhausted = (optind >= argc);
    if (exhausted || argv[optind][0] != '-' || argv[optind][1] == '\0') {
      if (!exhausted) {
        optarg = argv[optind];
      }
      return EOF;
    }

    // Explicit terminator: step over "--" and report what follows it.
    if (strcmp(argv[optind], "--") == 0) {
      ++optind;
      if (optind < argc) {
        optarg = argv[optind];
      }
      return EOF;
    }

    cursor = argv[optind] + 1;  // first letter after the leading '-'
    ++optind;
  }

  const char letter = *cursor++;
  const char *match = strchr(optstring, letter);

  // ':' is only a marker inside optstring, never a valid option letter.
  if (letter == ':' || match == NULL) {
    return '?';
  }

  // Plain flag: nothing more to collect.
  if (match[1] != ':') {
    return letter;
  }

  // Option with an argument attached to the same argv element.
  if (*cursor != '\0') {
    optarg = cursor;
    cursor = NULL;              // the rest of this element was the argument
    return letter;
  }

  // Option with its argument in the next argv element, if there is one.
  if (optind >= argc) {
    return '?';
  }
  optarg = argv[optind++];
  return letter;
}
// Natural logarithm of the gamma function for an integer argument, i.e.
// log((x-1)!) for x >= 1, computed with the six-term series from
// W. Press, S. Teukolsky and W. Vetterling (1992), Numerical Recipes in C,
// chapter 6.1.
//
// Every x <= 2 short-circuits to 0.0; that is exact for x == 1 and x == 2,
// and is also what non-positive x (where the function is undefined) yields.
double lgamma(int x)
{
  if (x <= 2) {
    return 0.0;
  }

  // Series coefficients from the reference above.
  static const double kSeries[6] = {
    76.18009172947146,
    -86.50532032941677,
    24.01409824083091,
    -1.231739572450155,
    0.1208650973866179e-2,
    -0.5395239384953e-5
  };

  const double xd = (double)x;

  // ser = c0 + sum_k kSeries[k] / (x + 1 + k)
  double ser = 1.000000000190015;
  double denom = xd;
  for (int k = 0; k < 6; ++k) {
    ++denom;
    ser += kSeries[k] / denom;
  }

  // tmp = (x + 5.5) - (x + 0.5) * log(x + 5.5)
  double tmp = xd + 5.5;
  tmp -= (xd + 0.5) * log(tmp);

  // 2.5066282746310005 is sqrt(2*pi).
  return -tmp + log(2.5066282746310005 * ser / xd);
}

View File

@ -1,24 +1,24 @@
// XGetopt.h Version 1.2 // XGetopt.h Version 1.2
// //
// Author: Hans Dietrich // Author: Hans Dietrich
// hdietrich2@hotmail.com // hdietrich2@hotmail.com
// //
// This software is released into the public domain. // This software is released into the public domain.
// You are free to use it in any way you like. // You are free to use it in any way you like.
// //
// This software is provided "as is" with no expressed // This software is provided "as is" with no expressed
// or implied warranty. I accept no liability for any // or implied warranty. I accept no liability for any
// damage or loss of business that this software may cause. // damage or loss of business that this software may cause.
// //
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
#ifndef XGETOPT_H #ifndef XGETOPT_H
#define XGETOPT_H #define XGETOPT_H
extern int optind, opterr; extern int optind, opterr;
extern char *optarg; extern char *optarg;
int getopt(int argc, char *argv[], char *optstring); int getopt(int argc, char *argv[], char *optstring);
double lgamma(int x); double lgamma(int x);
#endif //XGETOPT_H #endif //XGETOPT_H

View File

@ -1,5 +1,5 @@
#include <cstring> #include <cstring>
#include <cassert> #include <cassert>
#include <cstdio> #include <cstdio>
#include <cstdlib> #include <cstdlib>
@ -14,7 +14,7 @@
#include <set> #include <set>
#include <boost/thread/tss.hpp> #include <boost/thread/tss.hpp>
#include <boost/thread.hpp> #include <boost/thread.hpp>
#include <boost/unordered_map.hpp> #include <boost/unordered_map.hpp>
#ifdef WIN32 #ifdef WIN32
@ -58,9 +58,9 @@ typedef boost::shared_ptr<std::vector<TextLenType> > SentIdSet;
class Cache { class Cache {
typedef std::pair<SentIdSet, clock_t> ClockedSet; typedef std::pair<SentIdSet, clock_t> ClockedSet;
typedef boost::unordered_map<std::string, ClockedSet> ClockedMap; typedef boost::unordered_map<std::string, ClockedSet> ClockedMap;
public: public:
SentIdSet get(const std::string& phrase) { SentIdSet get(const std::string& phrase) {
boost::shared_lock<boost::shared_mutex> lock(m_mutex); boost::shared_lock<boost::shared_mutex> lock(m_mutex);
if(m_cont.count(phrase)) { if(m_cont.count(phrase)) {
@ -70,27 +70,27 @@ class Cache {
} }
return SentIdSet( new SentIdSet::element_type() ); return SentIdSet( new SentIdSet::element_type() );
} }
void put(const std::string& phrase, const SentIdSet set) { void put(const std::string& phrase, const SentIdSet set) {
boost::unique_lock<boost::shared_mutex> lock(m_mutex); boost::unique_lock<boost::shared_mutex> lock(m_mutex);
m_cont[phrase] = std::make_pair(set, clock()); m_cont[phrase] = std::make_pair(set, clock());
} }
static void set_max_cache(size_t max_cache) { static void set_max_cache(size_t max_cache) {
s_max_cache = max_cache; s_max_cache = max_cache;
} }
void prune() { void prune() {
if(s_max_cache > 0) { if(s_max_cache > 0) {
boost::upgrade_lock<boost::shared_mutex> lock(m_mutex); boost::upgrade_lock<boost::shared_mutex> lock(m_mutex);
if(m_cont.size() > s_max_cache) { if(m_cont.size() > s_max_cache) {
std::vector<clock_t> clocks; std::vector<clock_t> clocks;
for(ClockedMap::iterator it = m_cont.begin(); it != m_cont.end(); it++) for(ClockedMap::iterator it = m_cont.begin(); it != m_cont.end(); it++)
clocks.push_back(it->second.second); clocks.push_back(it->second.second);
std::sort(clocks.begin(), clocks.end()); std::sort(clocks.begin(), clocks.end());
clock_t out = clocks[m_cont.size() - s_max_cache]; clock_t out = clocks[m_cont.size() - s_max_cache];
boost::upgrade_to_unique_lock<boost::shared_mutex> uniq_lock(lock); boost::upgrade_to_unique_lock<boost::shared_mutex> uniq_lock(lock);
for(ClockedMap::iterator it = m_cont.begin(); it != m_cont.end(); it++) for(ClockedMap::iterator it = m_cont.begin(); it != m_cont.end(); it++)
if(it->second.second < out) if(it->second.second < out)
@ -98,7 +98,7 @@ class Cache {
} }
} }
} }
private: private:
ClockedMap m_cont; ClockedMap m_cont;
boost::shared_mutex m_mutex; boost::shared_mutex m_mutex;
@ -282,12 +282,12 @@ void lookup_phrase(SentIdSet& ids, const std::string& phrase,
i != locations.end(); ++i) { i != locations.end(); ++i) {
ids->push_back(i->sentIdInCorpus); ids->push_back(i->sentIdInCorpus);
} }
std::sort(ids->begin(), ids->end()); std::sort(ids->begin(), ids->end());
SentIdSet::element_type::iterator it = SentIdSet::element_type::iterator it =
std::unique(ids->begin(), ids->end()); std::unique(ids->begin(), ids->end());
ids->resize(it - ids->begin()); ids->resize(it - ids->begin());
if(ids->size() >= MINIMUM_SIZE_TO_KEEP) if(ids->size() >= MINIMUM_SIZE_TO_KEEP)
cache.put(phrase, ids); cache.put(phrase, ids);
} }
@ -295,8 +295,8 @@ void lookup_phrase(SentIdSet& ids, const std::string& phrase,
void lookup_multiple_phrases(SentIdSet& ids, vector<std::string> & phrases, void lookup_multiple_phrases(SentIdSet& ids, vector<std::string> & phrases,
C_SuffixArraySearchApplicationBase & my_sa, C_SuffixArraySearchApplicationBase & my_sa,
const std::string & rule, Cache& cache) const std::string & rule, Cache& cache)
{ {
if (phrases.size() == 1) { if (phrases.size() == 1) {
lookup_phrase(ids, phrases.front(), my_sa, cache); lookup_phrase(ids, phrases.front(), my_sa, cache);
@ -372,32 +372,32 @@ void compute_cooc_stats_and_filter(std::vector<PTEntry*>& options,
delete *i; delete *i;
options.erase(options.begin() + pfe_filter_limit,options.end()); options.erase(options.begin() + pfe_filter_limit,options.end());
} }
if (pef_filter_only) if (pef_filter_only)
return; return;
if (options.empty()) if (options.empty())
return; return;
SentIdSet fset( new SentIdSet::element_type() ); SentIdSet fset( new SentIdSet::element_type() );
find_occurrences(fset, options.front()->f_phrase, f_sa, f_cache); find_occurrences(fset, options.front()->f_phrase, f_sa, f_cache);
size_t cf = fset->size(); size_t cf = fset->size();
for (std::vector<PTEntry*>::iterator i = options.begin(); for (std::vector<PTEntry*>::iterator i = options.begin();
i != options.end(); ++i) { i != options.end(); ++i) {
const std::string& e_phrase = (*i)->e_phrase; const std::string& e_phrase = (*i)->e_phrase;
SentIdSet eset( new SentIdSet::element_type() ); SentIdSet eset( new SentIdSet::element_type() );
find_occurrences(eset, e_phrase, e_sa, e_cache); find_occurrences(eset, e_phrase, e_sa, e_cache);
size_t ce = eset->size(); size_t ce = eset->size();
SentIdSet efset( new SentIdSet::element_type() ); SentIdSet efset( new SentIdSet::element_type() );
ordered_set_intersect(efset, fset, eset); ordered_set_intersect(efset, fset, eset);
size_t cef = efset->size(); size_t cef = efset->size();
double nlp = -log(fisher_exact(cef, cf, ce)); double nlp = -log(fisher_exact(cef, cf, ce));
(*i)->set_cooc_stats(cef, cf, ce, nlp); (*i)->set_cooc_stats(cef, cf, ce, nlp);
} }
std::vector<PTEntry*>::iterator new_end = std::vector<PTEntry*>::iterator new_end =
std::remove_if(options.begin(), options.end(), std::remove_if(options.begin(), options.end(),
NlogSigThresholder(sig_filter_limit)); NlogSigThresholder(sig_filter_limit));
@ -406,7 +406,7 @@ void compute_cooc_stats_and_filter(std::vector<PTEntry*>& options,
} }
void filter(std::istream* in, std::ostream* out, int pfe_index) { void filter(std::istream* in, std::ostream* out, int pfe_index) {
std::vector<std::string> lines; std::vector<std::string> lines;
std::string prev = ""; std::string prev = "";
std::vector<PTEntry*> options; std::vector<PTEntry*> options;
@ -415,23 +415,23 @@ void filter(std::istream* in, std::ostream* out, int pfe_index) {
boost::mutex::scoped_lock lock(in_mutex); boost::mutex::scoped_lock lock(in_mutex);
if(in->eof()) if(in->eof())
break; break;
lines.clear(); lines.clear();
std::string line; std::string line;
while(getline(*in, line) && lines.size() < 500000) while(getline(*in, line) && lines.size() < 500000)
lines.push_back(line); lines.push_back(line);
} }
std::stringstream out_temp; std::stringstream out_temp;
for(std::vector<std::string>::iterator it = lines.begin(); it != lines.end(); it++) { for(std::vector<std::string>::iterator it = lines.begin(); it != lines.end(); it++) {
size_t tmp_lines = ++pt_lines; size_t tmp_lines = ++pt_lines;
if(tmp_lines % 10000 == 0) { if(tmp_lines % 10000 == 0) {
boost::mutex::scoped_lock lock(err_mutex); boost::mutex::scoped_lock lock(err_mutex);
std::cerr << "."; std::cerr << ".";
if(tmp_lines % 500000 == 0) if(tmp_lines % 500000 == 0)
std::cerr << "[n:" << tmp_lines << "]\n"; std::cerr << "[n:" << tmp_lines << "]\n";
if(tmp_lines % 10000000 == 0) { if(tmp_lines % 10000000 == 0) {
float pfefper = (100.0*(float)nremoved_pfefilter)/(float)pt_lines; float pfefper = (100.0*(float)nremoved_pfefilter)/(float)pt_lines;
float sigfper = (100.0*(float)nremoved_sigfilter)/(float)pt_lines; float sigfper = (100.0*(float)nremoved_sigfilter)/(float)pt_lines;
@ -446,30 +446,30 @@ void filter(std::istream* in, std::ostream* out, int pfe_index) {
<< "------------------------------------------------------\n"; << "------------------------------------------------------\n";
} }
} }
if(pt_lines % 10000 == 0) { if(pt_lines % 10000 == 0) {
f_cache.prune(); f_cache.prune();
e_cache.prune(); e_cache.prune();
} }
if(it->length() > 0) { if(it->length() > 0) {
PTEntry* pp = new PTEntry(it->c_str(), pfe_index); PTEntry* pp = new PTEntry(it->c_str(), pfe_index);
if (prev != pp->f_phrase) { if (prev != pp->f_phrase) {
prev = pp->f_phrase; prev = pp->f_phrase;
if (!options.empty()) { // always true after first line if (!options.empty()) { // always true after first line
compute_cooc_stats_and_filter(options, f_cache, e_cache); compute_cooc_stats_and_filter(options, f_cache, e_cache);
} }
for (std::vector<PTEntry*>::iterator i = options.begin(); for (std::vector<PTEntry*>::iterator i = options.begin();
i != options.end(); ++i) { i != options.end(); ++i) {
out_temp << **i << '\n'; out_temp << **i << '\n';
delete *i; delete *i;
} }
options.clear(); options.clear();
options.push_back(pp); options.push_back(pp);
} else { } else {
options.push_back(pp); options.push_back(pp);
} }
@ -479,7 +479,7 @@ void filter(std::istream* in, std::ostream* out, int pfe_index) {
*out << out_temp.str() << std::flush; *out << out_temp.str() << std::flush;
} }
compute_cooc_stats_and_filter(options, f_cache, e_cache); compute_cooc_stats_and_filter(options, f_cache, e_cache);
boost::mutex::scoped_lock lock(out_mutex); boost::mutex::scoped_lock lock(out_mutex);
for (std::vector<PTEntry*>::iterator i = options.begin(); for (std::vector<PTEntry*>::iterator i = options.begin();
i != options.end(); ++i) { i != options.end(); ++i) {
@ -512,11 +512,11 @@ int main(int argc, char * argv[])
pfe_filter_limit = atoi(optarg); pfe_filter_limit = atoi(optarg);
std::cerr << "P(f|e) filter limit: " << pfe_filter_limit << std::endl; std::cerr << "P(f|e) filter limit: " << pfe_filter_limit << std::endl;
break; break;
case 't': case 't':
threads = atoi(optarg); threads = atoi(optarg);
std::cerr << "Using threads: " << threads << std::endl; std::cerr << "Using threads: " << threads << std::endl;
break; break;
case 'm': case 'm':
max_cache = atoi(optarg); max_cache = atoi(optarg);
std::cerr << "Using max phrases in caches: " << max_cache << std::endl; std::cerr << "Using max phrases in caches: " << max_cache << std::endl;
break; break;
@ -548,13 +548,13 @@ int main(int argc, char * argv[])
usage(); usage();
} }
} }
if (sig_filter_limit == 0.0) pef_filter_only = true; if (sig_filter_limit == 0.0) pef_filter_only = true;
//----------------------------------------------------------------------------- //-----------------------------------------------------------------------------
if (optind != argc || ((!efile || !ffile) && !pef_filter_only)) { if (optind != argc || ((!efile || !ffile) && !pef_filter_only)) {
usage(); usage();
} }
//load the indexed corpus with vocabulary(noVoc=false) and with offset(noOffset=false) //load the indexed corpus with vocabulary(noVoc=false) and with offset(noOffset=false)
if (!pef_filter_only) { if (!pef_filter_only) {
e_sa.loadData_forSearch(efile, false, false); e_sa.loadData_forSearch(efile, false, false);
@ -582,15 +582,15 @@ int main(int argc, char * argv[])
Cache::set_max_cache(max_cache); Cache::set_max_cache(max_cache);
std::ios_base::sync_with_stdio(false); std::ios_base::sync_with_stdio(false);
boost::thread_group threadGroup; boost::thread_group threadGroup;
for(int i = 0; i < threads; i++) for(int i = 0; i < threads; i++)
threadGroup.add_thread(new boost::thread(filter, &std::cin, &std::cout, pfe_index)); threadGroup.add_thread(new boost::thread(filter, &std::cin, &std::cout, pfe_index));
threadGroup.join_all(); threadGroup.join_all();
float pfefper = (100.0*(float)nremoved_pfefilter)/(float)pt_lines; float pfefper = (100.0*(float)nremoved_pfefilter)/(float)pt_lines;
float sigfper = (100.0*(float)nremoved_sigfilter)/(float)pt_lines; float sigfper = (100.0*(float)nremoved_sigfilter)/(float)pt_lines;
std::cerr << "\n\n------------------------------------------------------\n" std::cerr << "\n\n------------------------------------------------------\n"
<< " unfiltered phrases pairs: " << pt_lines << "\n" << " unfiltered phrases pairs: " << pt_lines << "\n"
<< "\n" << "\n"
@ -599,5 +599,5 @@ int main(int argc, char * argv[])
<< " TOTAL FILTERED: " << (nremoved_pfefilter + nremoved_sigfilter) << " (" << (sigfper + pfefper) << "%)\n" << " TOTAL FILTERED: " << (nremoved_pfefilter + nremoved_sigfilter) << " (" << (sigfper + pfefper) << "%)\n"
<< "\n" << "\n"
<< " FILTERED phrase pairs: " << (pt_lines - nremoved_pfefilter - nremoved_sigfilter) << " (" << (100.0-sigfper - pfefper) << "%)\n" << " FILTERED phrase pairs: " << (pt_lines - nremoved_pfefilter - nremoved_sigfilter) << " (" << (100.0-sigfper - pfefper) << "%)\n"
<< "------------------------------------------------------\n"; << "------------------------------------------------------\n";
} }

View File

@ -65,7 +65,7 @@ class Numbered : public T {
friend String& operator<< ( String& str, const Numbered<SD1,I,SD2,T,SD3>& rv ) { return str<<SD1<<rv.i<<SD2<<rv.getT()<<SD3; } friend String& operator<< ( String& str, const Numbered<SD1,I,SD2,T,SD3>& rv ) { return str<<SD1<<rv.i<<SD2<<rv.getT()<<SD3; }
friend pair<StringInput,Numbered<SD1,I,SD2,T,SD3>*> operator>> ( StringInput ps, Numbered<SD1,I,SD2,T,SD3>& rv ) { return pair<StringInput,Numbered<SD1,I,SD2,T,SD3>*>(ps,&rv); } friend pair<StringInput,Numbered<SD1,I,SD2,T,SD3>*> operator>> ( StringInput ps, Numbered<SD1,I,SD2,T,SD3>& rv ) { return pair<StringInput,Numbered<SD1,I,SD2,T,SD3>*>(ps,&rv); }
friend StringInput operator>> ( pair<StringInput,Numbered<SD1,I,SD2,T,SD3>*> delimbuff, const char* psPostDelim ) { friend StringInput operator>> ( pair<StringInput,Numbered<SD1,I,SD2,T,SD3>*> delimbuff, const char* psPostDelim ) {
return ( (SD3[0]=='\0') ? delimbuff.first>>SD1>>delimbuff.second->i>>SD2>>delimbuff.second->setT()>>psPostDelim return ( (SD3[0]=='\0') ? delimbuff.first>>SD1>>delimbuff.second->i>>SD2>>delimbuff.second->setT()>>psPostDelim
: delimbuff.first>>SD1>>delimbuff.second->i>>SD2>>delimbuff.second->setT()>>SD3>>psPostDelim ); : delimbuff.first>>SD1>>delimbuff.second->i>>SD2>>delimbuff.second->setT()>>SD3>>psPostDelim );
} }
}; };
@ -106,7 +106,7 @@ template<class V>
pair<typename V::ElementType,int> ArchetypeSet<V>::getDistanceOfNearest ( const V& v ) const { pair<typename V::ElementType,int> ArchetypeSet<V>::getDistanceOfNearest ( const V& v ) const {
//const Scored<typename V::ElementType,pair<int,SafePtr<const V> > > sipvDummy ( DBL_MAX ); //const Scored<typename V::ElementType,pair<int,SafePtr<const V> > > sipvDummy ( DBL_MAX );
//MinHeap<Scored<typename V::ElementType,pair<int,SafePtr<const V> > > > hsiv ( MapType::size()+1, sipvDummy ); //MinHeap<Scored<typename V::ElementType,pair<int,SafePtr<const V> > > > hsiv ( MapType::size()+1, sipvDummy );
MinHeap<Scored<typename V::ElementType,pair<int,SafePtr<const NV> > > >& hsiv = MinHeap<Scored<typename V::ElementType,pair<int,SafePtr<const NV> > > >& hsiv =
const_cast<MinHeap<Scored<typename V::ElementType,pair<int,SafePtr<const NV> > > >&> ( hsivCalc ); const_cast<MinHeap<Scored<typename V::ElementType,pair<int,SafePtr<const NV> > > >&> ( hsivCalc );
hsiv.clear(); hsiv.clear();
@ -120,7 +120,7 @@ pair<typename V::ElementType,int> ArchetypeSet<V>::getDistanceOfNearest ( const
typename V::ElementType d = v.getMarginalDistance ( hsiv.get(iNext).first, hsiv.get(iNext).second.getRef() ); typename V::ElementType d = v.getMarginalDistance ( hsiv.get(iNext).first, hsiv.get(iNext).second.getRef() );
hsiv.set(iNext).setScore() = d; hsiv.set(iNext).setScore() = d;
//hsiv.set(iNext).setScore() = v.getMarginalDistance ( hsiv.getMin().first, iUpper->second.second ); //hsiv.set(iNext).setScore() = v.getMarginalDistance ( hsiv.getMin().first, iUpper->second.second );
////int j = ////int j =
hsiv.fixDecr(iNext); hsiv.fixDecr(iNext);
////cerr<<" adding ln"<<&hsiv.get(j).second.getRef()<<" marg-dist="<<d<<" new-score="<<double(hsiv.get(j).getScore())<<" new-pos="<<j<<"\n"; ////cerr<<" adding ln"<<&hsiv.get(j).second.getRef()<<" marg-dist="<<d<<" new-score="<<double(hsiv.get(j).getScore())<<" new-pos="<<j<<"\n";
iNext++; iNext++;
@ -140,7 +140,7 @@ pair<typename V::ElementType,int> ArchetypeSet<V>::getDistanceOfNearest ( const
typename V::ElementType d = v.getMarginalDistance ( ++hsiv.setMin().first, hsiv.getMin().second.getRef() ); typename V::ElementType d = v.getMarginalDistance ( ++hsiv.setMin().first, hsiv.getMin().second.getRef() );
hsiv.setMin().setScore() += d; hsiv.setMin().setScore() += d;
////cerr<<" matching ln"<<&hsiv.getMin().second.getRef()<<" i="<<hsiv.setMin().first<<" marg-dist="<<d<<" new-score="<<hsiv.getMin().getScore(); ////cerr<<" matching ln"<<&hsiv.getMin().second.getRef()<<" i="<<hsiv.setMin().first<<" marg-dist="<<d<<" new-score="<<hsiv.getMin().getScore();
////int j = ////int j =
hsiv.fixIncr(0); hsiv.fixIncr(0);
////cerr<<" new-pos="<<j<<"\n"; ////cerr<<" new-pos="<<j<<"\n";
////if(j!=0) for(int i=0;i<iNext;i++) cerr<<" "<<i<<": ln"<<hsiv.get(i).second.getRef().lineNum.toInt()<<" new-score="<<double(hsiv.get(i).getScore())<<"\n"; ////if(j!=0) for(int i=0;i<iNext;i++) cerr<<" "<<i<<": ln"<<hsiv.get(i).second.getRef().lineNum.toInt()<<" new-score="<<double(hsiv.get(i).getScore())<<"\n";
@ -151,7 +151,7 @@ pair<typename V::ElementType,int> ArchetypeSet<V>::getDistanceOfNearest ( const
hsiv.set(iNext).second = SafePtr<const NV> ( iUpper->second ); hsiv.set(iNext).second = SafePtr<const NV> ( iUpper->second );
typename V::ElementType d = v.getMarginalDistance ( hsiv.get(iNext).first, hsiv.get(iNext).second.getRef() ); typename V::ElementType d = v.getMarginalDistance ( hsiv.get(iNext).first, hsiv.get(iNext).second.getRef() );
hsiv.set(iNext).setScore() = d; hsiv.set(iNext).setScore() = d;
////int j = ////int j =
hsiv.fixDecr(iNext); hsiv.fixDecr(iNext);
////cerr<<" adding ln"<<&hsiv.get(j).second.getRef()<<" marg-dist="<<d<<" new-score="<<double(hsiv.get(j).getScore())<<" new-pos="<<j<<"\n"; ////cerr<<" adding ln"<<&hsiv.get(j).second.getRef()<<" marg-dist="<<d<<" new-score="<<double(hsiv.get(j).getScore())<<" new-pos="<<j<<"\n";
iNext++; iNext++;
@ -164,7 +164,7 @@ pair<typename V::ElementType,int> ArchetypeSet<V>::getDistanceOfNearest ( const
hsiv.set(iNext).second = SafePtr<const NV> ( iLower->second ); hsiv.set(iNext).second = SafePtr<const NV> ( iLower->second );
typename V::ElementType d = v.getMarginalDistance ( hsiv.get(iNext).first, hsiv.get(iNext).second.getRef() ); typename V::ElementType d = v.getMarginalDistance ( hsiv.get(iNext).first, hsiv.get(iNext).second.getRef() );
hsiv.set(iNext).setScore() = d; hsiv.set(iNext).setScore() = d;
////int j = ////int j =
hsiv.fixDecr(iNext); hsiv.fixDecr(iNext);
////cerr<<" adding ln"<<&hsiv.get(j).second.getRef()<<" marg-dist="<<d<<" new-score="<<double(hsiv.get(j).getScore())<<" new-pos="<<j<<"\n"; ////cerr<<" adding ln"<<&hsiv.get(j).second.getRef()<<" marg-dist="<<d<<" new-score="<<double(hsiv.get(j).getScore())<<" new-pos="<<j<<"\n";
iNext++; iNext++;

View File

@ -27,7 +27,7 @@
#include <cassert> #include <cassert>
#include <iostream> #include <iostream>
using namespace std; using namespace std;
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////

View File

@ -101,8 +101,8 @@ class Beam {
void write(FILE *pf){ void write(FILE *pf){
/* for (typename BeamMap::const_iterator i = mkid.begin(); i != mkid.end(); i++){ /* for (typename BeamMap::const_iterator i = mkid.begin(); i != mkid.end(); i++){
i->first.write(pf); i->first.write(pf);
fprintf(pf, " %d ", i->second.first); fprintf(pf, " %d ", i->second.first);
// i->second.second.write(pf); // i->second.second.write(pf);
fprintf(pf, "\n"); fprintf(pf, "\n");
} }
*/ */

View File

@ -394,7 +394,7 @@ class SimpleMap : public map<X,Y> {
private: private:
typedef map<X,Y> OrigMap; typedef map<X,Y> OrigMap;
static const Y yDummy; static const Y yDummy;
public: public:
// Constructor / destructor methods... // Constructor / destructor methods...
SimpleMap ( ) : OrigMap() { } SimpleMap ( ) : OrigMap() { }
@ -899,7 +899,7 @@ class GenericHidVarCPTModel : public SimpleHash<K,typename Y::template ArrayDist
const typename Y::template ArrayDistrib<P>& getDistrib ( const K& k ) const { const typename Y::template ArrayDistrib<P>& getDistrib ( const K& k ) const {
return HKYP::get(k); return HKYP::get(k);
} }
P& setProb ( const Y& y, const K& k ) { P& setProb ( const Y& y, const K& k ) {
pair<typename Y::BaseType,P>& yp = HKYP::set(k).add(); pair<typename Y::BaseType,P>& yp = HKYP::set(k).add();
yp.first = y; yp.first = y;

View File

@ -36,7 +36,7 @@
// //
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
template <class Y,class X1,class X2> template <class Y,class X1,class X2>
class CRF3DModeledRV : public Y { class CRF3DModeledRV : public Y {
private: private:
@ -90,7 +90,7 @@ template <class Y,class X1,class X2> SafeArray5D<Id<int>,int,int,int,int,float>
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
template <class Y,class X1,class X2> template <class Y,class X1,class X2>
Prob CRF3DModeledRV<Y,X1,X2>::getProb( const X1& x1, const X2& x2 ) const { Prob CRF3DModeledRV<Y,X1,X2>::getProb( const X1& x1, const X2& x2 ) const {
SafeArray2D<int,int,int> aaCnds ( cardOff, cardSh ) ; SafeArray2D<int,int,int> aaCnds ( cardOff, cardSh ) ;
@ -131,7 +131,7 @@ Prob CRF3DModeledRV<Y,X1,X2>::getProb( const X1& x1, const X2& x2 ) const {
for ( int configRghtValSite=0; configRghtValSite<(1<<bitsValSite); configRghtValSite++ ) for ( int configRghtValSite=0; configRghtValSite<(1<<bitsValSite); configRghtValSite++ )
for ( int configValOverlap=0; configValOverlap<(1<<(bitsVal-bitsValSite)); configValOverlap++ ) { for ( int configValOverlap=0; configValOverlap<(1<<(bitsVal-bitsValSite)); configValOverlap++ ) {
int configRghtVal = (configValOverlap<<bitsValSite)+configRghtValSite; int configRghtVal = (configValOverlap<<bitsValSite)+configRghtValSite;
// For each possible preceding trellis node... // For each possible preceding trellis node...
for ( int configLeftValSite=0; configLeftValSite<(1<<bitsValSite); configLeftValSite++ ) { for ( int configLeftValSite=0; configLeftValSite<(1<<bitsValSite); configLeftValSite++ ) {
int configLeftVal = (configLeftValSite<<(bitsVal-bitsValSite))+configValOverlap; int configLeftVal = (configLeftValSite<<(bitsVal-bitsValSite))+configValOverlap;
// Add product of result and previous trellis cell to current trellis cell... // Add product of result and previous trellis cell to current trellis cell...
@ -158,7 +158,7 @@ Prob CRF3DModeledRV<Y,X1,X2>::getProb( const X1& x1, const X2& x2 ) const {
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
template <class Y,class X1,class X2> template <class Y,class X1,class X2>
bool CRF3DModeledRV<Y,X1,X2>::readModelFields ( char* aps[], int numFields ) { bool CRF3DModeledRV<Y,X1,X2>::readModelFields ( char* aps[], int numFields ) {
if ( 7==numFields ) if ( 7==numFields )
setPotential ( X1(string(aps[1])), // globals setPotential ( X1(string(aps[1])), // globals
@ -172,7 +172,7 @@ bool CRF3DModeledRV<Y,X1,X2>::readModelFields ( char* aps[], int numFields ) {
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
template <class Y,class X1,class X2> template <class Y,class X1,class X2>
void CRF3DModeledRV<Y,X1,X2>::writeObservCliqueConfigs ( FILE* pf, int frame, const char* psMdl, void CRF3DModeledRV<Y,X1,X2>::writeObservCliqueConfigs ( FILE* pf, int frame, const char* psMdl,
const X1& x1, const X2& x2, bool bObsVal ) const { const X1& x1, const X2& x2, bool bObsVal ) const {
fprintf ( pf, "%04d> %s ", frame, psMdl ); fprintf ( pf, "%04d> %s ", frame, psMdl );
@ -199,7 +199,7 @@ void CRF3DModeledRV<Y,X1,X2>::writeObservCliqueConfigs ( FILE* pf, int frame, co
// //
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
template <class Y,class X1,class X2,class X3> template <class Y,class X1,class X2,class X3>
class CRF4DModeledRV : public Y { class CRF4DModeledRV : public Y {
private: private:
@ -247,13 +247,13 @@ template <class Y,class X1,class X2,class X3> int CRF4DModeledRV<Y,X1,X2,X3>::c
template <class Y,class X1,class X2,class X3> int CRF4DModeledRV<Y,X1,X2,X3>::cardCnd = 0; template <class Y,class X1,class X2,class X3> int CRF4DModeledRV<Y,X1,X2,X3>::cardCnd = 0;
template <class Y,class X1,class X2,class X3> int CRF4DModeledRV<Y,X1,X2,X3>::bitsVal = 0; template <class Y,class X1,class X2,class X3> int CRF4DModeledRV<Y,X1,X2,X3>::bitsVal = 0;
template <class Y,class X1,class X2,class X3> int CRF4DModeledRV<Y,X1,X2,X3>::bitsValSite = 0; template <class Y,class X1,class X2,class X3> int CRF4DModeledRV<Y,X1,X2,X3>::bitsValSite = 0;
template <class Y,class X1,class X2,class X3> SafeArray5D<Id<int>,int,int,int,int,float> template <class Y,class X1,class X2,class X3> SafeArray5D<Id<int>,int,int,int,int,float>
CRF4DModeledRV<Y,X1,X2,X3>::aaaaaPotentials; CRF4DModeledRV<Y,X1,X2,X3>::aaaaaPotentials;
/* template <class Y,class X1,class X2> SafeArray3D<int> CRF4DModeledRV<Y,X1,X2>::aaaCnds; */ /* template <class Y,class X1,class X2> SafeArray3D<int> CRF4DModeledRV<Y,X1,X2>::aaaCnds; */
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
template <class Y,class X1,class X2,class X3> template <class Y,class X1,class X2,class X3>
Prob CRF4DModeledRV<Y,X1,X2,X3>::getProb( const X1& x1, const X2& x2, const X3& x3 ) const { Prob CRF4DModeledRV<Y,X1,X2,X3>::getProb( const X1& x1, const X2& x2, const X3& x3 ) const {
SafeArray2D<int,int,int> aaCnds ( cardOff, cardSh ) ; SafeArray2D<int,int,int> aaCnds ( cardOff, cardSh ) ;
@ -294,7 +294,7 @@ Prob CRF4DModeledRV<Y,X1,X2,X3>::getProb( const X1& x1, const X2& x2, const X3&
for ( int configRghtValSite=0; configRghtValSite<(1<<bitsValSite); configRghtValSite++ ) for ( int configRghtValSite=0; configRghtValSite<(1<<bitsValSite); configRghtValSite++ )
for ( int configValOverlap=0; configValOverlap<(1<<(bitsVal-bitsValSite)); configValOverlap++ ) { for ( int configValOverlap=0; configValOverlap<(1<<(bitsVal-bitsValSite)); configValOverlap++ ) {
int configRghtVal = (configValOverlap<<bitsValSite)+configRghtValSite; int configRghtVal = (configValOverlap<<bitsValSite)+configRghtValSite;
// For each possible preceding trellis node... // For each possible preceding trellis node...
for ( int configLeftValSite=0; configLeftValSite<(1<<bitsValSite); configLeftValSite++ ) { for ( int configLeftValSite=0; configLeftValSite<(1<<bitsValSite); configLeftValSite++ ) {
int configLeftVal = (configLeftValSite<<(bitsVal-bitsValSite))+configValOverlap; int configLeftVal = (configLeftValSite<<(bitsVal-bitsValSite))+configValOverlap;
// Add product of result and previous trellis cell to current trellis cell... // Add product of result and previous trellis cell to current trellis cell...
@ -321,7 +321,7 @@ Prob CRF4DModeledRV<Y,X1,X2,X3>::getProb( const X1& x1, const X2& x2, const X3&
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
template <class Y,class X1,class X2,class X3> template <class Y,class X1,class X2,class X3>
bool CRF4DModeledRV<Y,X1,X2,X3>::readModelFields ( char* aps[], int numFields ) { bool CRF4DModeledRV<Y,X1,X2,X3>::readModelFields ( char* aps[], int numFields ) {
if ( 7==numFields ) if ( 7==numFields )
setPotential ( X1(string(aps[1])), // globals setPotential ( X1(string(aps[1])), // globals
@ -335,9 +335,9 @@ bool CRF4DModeledRV<Y,X1,X2,X3>::readModelFields ( char* aps[], int numFields )
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
template <class Y,class X1,class X2, class X3> template <class Y,class X1,class X2, class X3>
void CRF4DModeledRV<Y,X1,X2,X3>::writeObservCliqueConfigs ( FILE* pf, int frame, const char* psMdl, void CRF4DModeledRV<Y,X1,X2,X3>::writeObservCliqueConfigs ( FILE* pf, int frame, const char* psMdl,
const X1& x1, const X2& x2, const X1& x1, const X2& x2,
const X3& x3, bool bObsVal ) const { const X3& x3, bool bObsVal ) const {
fprintf ( pf, "%04d> %s ", frame, psMdl ); fprintf ( pf, "%04d> %s ", frame, psMdl );
// For each shape (feature slope)... // For each shape (feature slope)...

View File

@ -80,7 +80,7 @@ void VecE<N,I,RC>::read ( char* ps, const ReaderContext& rc ) {
*/ */
char* psT; int i=0; char* psT; int i=0;
for ( char* psU=strtok_r(ps,",",&psT); for ( char* psU=strtok_r(ps,",",&psT);
psU && i<NUM_ENTS; psU && i<NUM_ENTS;
psU=strtok_r(NULL,",",&psT),i++ ) psU=strtok_r(NULL,",",&psT),i++ )
StaticSafeArray<N,I>::set(i) = psU; StaticSafeArray<N,I>::set(i) = psU;
} }
@ -166,7 +166,7 @@ void VecV<N,I,RC,ND1,ND2>::read ( char* ps, VecVReaderContext& rc ) {
// Chop into individual coinds strings... // Chop into individual coinds strings...
char* psT; int i=0; char* psT; int i=0;
for ( char* psU=strtok_r(ps,",",&psT); for ( char* psU=strtok_r(ps,",",&psT);
psU && i<NUM_ENTS; psU && i<NUM_ENTS;
psU=strtok_r(NULL,",",&psT), i++ ) psU=strtok_r(NULL,",",&psT), i++ )
asV.set(i) = psU; asV.set(i) = psU;
@ -230,7 +230,7 @@ class JointVecV { //// : public StaticSafeArray<V1::NUM_ENTS+V2::NUM_ENTS,I> {
static const int NUM_ENTS; static const int NUM_ENTS;
// Constructor / destructor methods... // Constructor / destructor methods...
JointVecV ( ) { } JointVecV ( ) { }
JointVecV ( const V1& a1, const V2& a2 ) { JointVecV ( const V1& a1, const V2& a2 ) {
////fprintf(stderr,"iJoin "); a1.V1::write(stderr); fprintf(stderr," "); a2.V2::write(stderr); fprintf(stderr,"\n"); ////fprintf(stderr,"iJoin "); a1.V1::write(stderr); fprintf(stderr," "); a2.V2::write(stderr); fprintf(stderr,"\n");
for (int i=0; i<NUM_ENTS; i++) { for (int i=0; i<NUM_ENTS; i++) {
if ( i<V1::NUM_ENTS ) set(i) = (a1.get(i)==-1) ? IntType(-1) : (a1.get(i)<V1::NUM_ENTS) ? IntType(a1.get(i)) : a1.get(i)+V2::NUM_ENTS; if ( i<V1::NUM_ENTS ) set(i) = (a1.get(i)==-1) ? IntType(-1) : (a1.get(i)<V1::NUM_ENTS) ? IntType(a1.get(i)) : a1.get(i)+V2::NUM_ENTS;

View File

@ -75,7 +75,7 @@ class ContDTree2DModel : public Generic2DModel<Y,X,P>, public Tree<ContDecisNode
// Extraction methods... // Extraction methods...
const P getProb ( const Y y, const X& x ) const { const P getProb ( const Y y, const X& x ) const {
const Tree<ContDecisNode<Y,P> >* ptr = this; const Tree<ContDecisNode<Y,P> >* ptr = this;
while ( !ptr->isTerm() ) { while ( !ptr->isTerm() ) {
double sumsqr=0.0; double sumsqr=0.0;
for(A a;a<X::getSize();a.setNext()) sumsqr += pow(x.get(a.toInt()),2.0) / X::getSize(); for(A a;a<X::getSize();a.setNext()) sumsqr += pow(x.get(a.toInt()),2.0) / X::getSize();
Wt wtdavg = -Tree<ContDecisNode<Y,P> >::getWt(); Wt wtdavg = -Tree<ContDecisNode<Y,P> >::getWt();
@ -112,7 +112,7 @@ class ContDTree2DModel : public Generic2DModel<Y,X,P>, public Tree<ContDecisNode
}; };
//////////////////// ////////////////////
template <class Y,class X, class P> template <class Y,class X, class P>
bool ContDTree2DModel<Y,X,P>::readFields ( char* aps[], int numFields ) { bool ContDTree2DModel<Y,X,P>::readFields ( char* aps[], int numFields ) {
if ( /*aps[0]==sId &&*/ (3==numFields || 4==numFields) ) { if ( /*aps[0]==sId &&*/ (3==numFields || 4==numFields) ) {
//fprintf(stderr,"%s,%d\n",aps[3],numFields); //fprintf(stderr,"%s,%d\n",aps[3],numFields);
@ -171,7 +171,7 @@ class ContDTree3DModel : public Generic3DModel<Y,X1,X2,P> {
}; };
//////////////////// ////////////////////
template <class Y,class X1,class X2, class P> template <class Y,class X1,class X2, class P>
bool ContDTree3DModel<Y,X1,X2,P>::readFields ( char* aps[], int numFields ) { bool ContDTree3DModel<Y,X1,X2,P>::readFields ( char* aps[], int numFields ) {
if ( /*aps[0]==sId &&*/ (4==numFields || 5==numFields) ) { if ( /*aps[0]==sId &&*/ (4==numFields || 5==numFields) ) {
//fprintf(stderr,"%s,%d\n",aps[3],numFields); //fprintf(stderr,"%s,%d\n",aps[3],numFields);
@ -212,7 +212,7 @@ bool ContDTree3DModel<Y,X1,X2,P>::readFields ( char* aps[], int numFields ) {
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
template<class Y, class X, class P> template<class Y, class X, class P>
class TrainableContDTree2DModel : public ContDTree2DModel<Y,X,P> { class TrainableContDTree2DModel : public ContDTree2DModel<Y,X,P> {
private: private:
List<Joint2DRV<X,Y> > lxy; List<Joint2DRV<X,Y> > lxy;
public: public:
@ -225,7 +225,7 @@ class TrainableContDTree2DModel : public ContDTree2DModel<Y,X,P> {
void train ( List<Joint2DRV<X,Y> >&, const double ) ; void train ( List<Joint2DRV<X,Y> >&, const double ) ;
void train ( const double d ) { train(lxy,d); } void train ( const double d ) { train(lxy,d); }
////// Input / output methods... ////// Input / output methods...
bool readData ( char* vs[], int numFields ) { bool readData ( char* vs[], int numFields ) {
if ( 3==numFields ) lxy.add() = Joint2DRV<X,Y> ( X(vs[1]), Y(vs[2]) ); if ( 3==numFields ) lxy.add() = Joint2DRV<X,Y> ( X(vs[1]), Y(vs[2]) );
else return false; else return false;
return true; return true;
@ -312,7 +312,7 @@ void TrainableContDTree2DModel<Y,X,P>::train ( List<Joint2DRV<X,Y> >& lxy, cons
// if ( double(rand())/double(RAND_MAX) < prRarest/modelY.getProb(pxy->getSub2()) ) { // if ( double(rand())/double(RAND_MAX) < prRarest/modelY.getProb(pxy->getSub2()) ) {
dCtr++; dCtr++;
double gamma = dTot/(dTot+dCtr); // 1.0/(double(epoch)+dCtr/dTot); // 1.0/double(epoch); // 1.0/(double(epoch)+dCtr/(dTot*prRarest*2.0)); // double gamma = dTot/(dTot+dCtr); // 1.0/(double(epoch)+dCtr/dTot); // 1.0/double(epoch); // 1.0/(double(epoch)+dCtr/(dTot*prRarest*2.0)); //
// Weight deltas for next epoch... // Weight deltas for next epoch...
Wt wDelta = 0.0; Wt wDelta = 0.0;
@ -333,7 +333,7 @@ void TrainableContDTree2DModel<Y,X,P>::train ( List<Joint2DRV<X,Y> >& lxy, cons
P prY = 1.0 / ( 1.0 + exp(-wtdavg) ); P prY = 1.0 / ( 1.0 + exp(-wtdavg) );
// Calc deltas for each feature/attribute/dimension... // Calc deltas for each feature/attribute/dimension...
double dEachWt = 1.0/dTot; // 1.0/dTot * modelY.getProb ( Y(1-pxy->getSub2().toInt()) ); // 1.0/(dTot*prRarest*2.0); // double dEachWt = 1.0/dTot; // 1.0/dTot * modelY.getProb ( Y(1-pxy->getSub2().toInt()) ); // 1.0/(dTot*prRarest*2.0); //
wDelta += dEachWt * -1 * ( prY - P(double(pxy->getSub2().toInt())) ); wDelta += dEachWt * -1 * ( prY - P(double(pxy->getSub2().toInt())) );
for ( A a; a<X::getSize(); a.setNext() ) for ( A a; a<X::getSize(); a.setNext() )
awDeltas.set(a) += dEachWt * pxy->getSub1().get(a.toInt()) * ( prY - P(double(pxy->getSub2().toInt())) ); awDeltas.set(a) += dEachWt * pxy->getSub1().get(a.toInt()) * ( prY - P(double(pxy->getSub2().toInt())) );
@ -439,7 +439,7 @@ void TrainableContDTree2DModel<Y,X,P>::train ( List<Joint2DRV<X,Y> >& lxy, cons
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
template<class Y, class X1, class X2, class P> template<class Y, class X1, class X2, class P>
class TrainableContDTree3DModel : public ContDTree3DModel<Y,X1,X2,P> { class TrainableContDTree3DModel : public ContDTree3DModel<Y,X1,X2,P> {
private: private:
@ -455,7 +455,7 @@ class TrainableContDTree3DModel : public ContDTree3DModel<Y,X1,X2,P> {
TrainableContDTree2DModel<Y,X2,P>& setTree(const X1& x1) { return static_cast<TrainableContDTree2DModel<Y,X2,P>&>(ContDTree3DModel<Y,X1,X2,P>::setTree(x1)); } TrainableContDTree2DModel<Y,X2,P>& setTree(const X1& x1) { return static_cast<TrainableContDTree2DModel<Y,X2,P>&>(ContDTree3DModel<Y,X1,X2,P>::setTree(x1)); }
////// Add training data to per-subphone lists... ////// Add training data to per-subphone lists...
bool readData ( char* vs[], int numFields ) { bool readData ( char* vs[], int numFields ) {
if ( 4==numFields ) { if ( 4==numFields ) {
mqlxy[X1(vs[1])].add() = Joint2DRV<X2,Y> ( X2(vs[2]), Y(vs[3]) ); mqlxy[X1(vs[1])].add() = Joint2DRV<X2,Y> ( X2(vs[2]), Y(vs[3]) );
////mqlxy[X1(vs[1])].getLast()->write(stderr); fprintf(stderr,"\n"); ////mqlxy[X1(vs[1])].getLast()->write(stderr); fprintf(stderr,"\n");

View File

@ -129,8 +129,8 @@ class DTree2DModel : public Tree < typename X::ElementType, DecisNode<X,Y,P> > {
friend StringInput operator>> ( pair<StringInput,DTree2DModel<Y,X,P>*> si_m, const char* psD ) { friend StringInput operator>> ( pair<StringInput,DTree2DModel<Y,X,P>*> si_m, const char* psD ) {
if (StringInput(NULL)==si_m.first) return si_m.first; if (StringInput(NULL)==si_m.first) return si_m.first;
Y y; String xs; StringInput si,si2; si=si_m.first; DTree2DModel<Y,X,P>* pm=si_m.second; Y y; String xs; StringInput si,si2; si=si_m.first; DTree2DModel<Y,X,P>* pm=si_m.second;
while((si2=si>>" ")!=NULL)si=si2; while((si2=si>>" ")!=NULL)si=si2;
si=si>>xs>>" "; si=si>>xs>>" ";
while((si2=si>>" ")!=NULL)si=si2; while((si2=si>>" ")!=NULL)si=si2;
// Find appropriate node, creating nodes as necessary... // Find appropriate node, creating nodes as necessary...
for(int i=1; i<int(strlen(xs.c_array()))-1; i++) { for(int i=1; i<int(strlen(xs.c_array()))-1; i++) {
@ -140,22 +140,22 @@ class DTree2DModel : public Tree < typename X::ElementType, DecisNode<X,Y,P> > {
if ( si!=NULL && si[0]==':' ) { if ( si!=NULL && si[0]==':' ) {
si=si>>": "; si=si>>": ";
while((si2=si>>" ")!=NULL)si=si2; while((si2=si>>" ")!=NULL)si=si2;
si=si>>y>>" "; si=si>>y>>" ";
while((si2=si>>" ")!=NULL)si=si2; while((si2=si>>" ")!=NULL)si=si2;
si=si>>"= "; si=si>>"= ";
while((si2=si>>" ")!=NULL)si=si2; while((si2=si>>" ")!=NULL)si=si2;
// Specify attribute number (at nonterminal) or probability in distribution (at terminal)... // Specify attribute number (at nonterminal) or probability in distribution (at terminal)...
return (si!=NULL) ? si>>pm->setProb(y)>>psD : si; return (si!=NULL) ? si>>pm->setProb(y)>>psD : si;
} }
else if ( si!=NULL && si[0]=='=' ) { else if ( si!=NULL && si[0]=='=' ) {
si=si>>"= "; //cerr<<" in after equals "<<((si==NULL) ? "yes" : "no") << endl; si=si>>"= "; //cerr<<" in after equals "<<((si==NULL) ? "yes" : "no") << endl;
while((si2=si>>" ")!=NULL)si=si2; while((si2=si>>" ")!=NULL)si=si2;
//m.setA() = atoi(si.c_str()); //m.setA() = atoi(si.c_str());
int aVar = 0; int aVar = 0;
si=si>>aVar>>psD; si=si>>aVar>>psD;
pm->setA()=aVar; pm->setA()=aVar;
////cerr<<" at end "<<((si==NULL) ? "yes" : "no") << endl; ////cerr<<" at end "<<((si==NULL) ? "yes" : "no") << endl;
////cerr<<" m.getA() is "<< m.getA().toInt() << endl; ////cerr<<" m.getA() is "<< m.getA().toInt() << endl;
return si; return si;
@ -169,15 +169,15 @@ class DTree2DModel : public Tree < typename X::ElementType, DecisNode<X,Y,P> > {
si=si_m.first; si=si_m.first;
sRt = si.c_str(); sRt = si.c_str();
if (sRt.find(':')!=string::npos) { if (sRt.find(':')!=string::npos) {
while((si2=si>>" [")!=NULL)si=si2; while((si2=si>>" [")!=NULL)si=si2;
si=si>>xs>>"] "; si=si>>xs>>"] ";
while((si2=si>>" ")!=NULL)si=si2; while((si2=si>>" ")!=NULL)si=si2;
si=si>>": "; si=si>>": ";
while((si2=si>>" ")!=NULL)si=si2; while((si2=si>>" ")!=NULL)si=si2;
si=si>>y>>" "; si=si>>y>>" ";
while((si2=si>>" ")!=NULL)si=si2; while((si2=si>>" ")!=NULL)si=si2;
si=si>>"= "; si=si>>"= ";
// For DTree, must find the node labeled by X // For DTree, must find the node labeled by X
//Tree<B,DecisNode<X,Y,P> >* ptr = m; //Tree<B,DecisNode<X,Y,P> >* ptr = m;
//assert(ptr); //assert(ptr);
@ -189,15 +189,15 @@ class DTree2DModel : public Tree < typename X::ElementType, DecisNode<X,Y,P> > {
// Specify attribute number (at nonterminal) or probability in distribution (at terminal)... // Specify attribute number (at nonterminal) or probability in distribution (at terminal)...
return (si!=NULL) ? si>>m.setProb(y)>>psD : si; return (si!=NULL) ? si>>m.setProb(y)>>psD : si;
} else { } else {
while((si2=si>>" [")!=NULL)si=si2; while((si2=si>>" [")!=NULL)si=si2;
si=si>>xs>>"] "; //cerr<<" in bracket "<<((si==NULL) ? "yes" : "no") << endl; si=si>>xs>>"] "; //cerr<<" in bracket "<<((si==NULL) ? "yes" : "no") << endl;
while((si2=si>>" ")!=NULL)si=si2; while((si2=si>>" ")!=NULL)si=si2;
si=si>>"= "; //cerr<<" in after equals "<<((si==NULL) ? "yes" : "no") << endl; si=si>>"= "; //cerr<<" in after equals "<<((si==NULL) ? "yes" : "no") << endl;
//m.setA() = atoi(si.c_str()); //m.setA() = atoi(si.c_str());
int aVar = 0; int aVar = 0;
si=si>>aVar>>psD; si=si>>aVar>>psD;
m.setA()=aVar; m.setA()=aVar;
//cerr<<" at end "<<((si==NULL) ? "yes" : "no") << endl; //cerr<<" at end "<<((si==NULL) ? "yes" : "no") << endl;
//cerr<<" m.getA() is "<< m.getA().toInt() << endl; //cerr<<" m.getA() is "<< m.getA().toInt() << endl;
return si; return si;
@ -209,7 +209,7 @@ class DTree2DModel : public Tree < typename X::ElementType, DecisNode<X,Y,P> > {
}; };
//////////////////// ////////////////////
template <class Y,class X, class P> template <class Y,class X, class P>
bool DTree2DModel<Y,X,P>::readFields ( Array<char*>& aps ) { bool DTree2DModel<Y,X,P>::readFields ( Array<char*>& aps ) {
if ( /*aps[0]==sId &&*/ (3==aps.size() || 4==aps.size()) ) { if ( /*aps[0]==sId &&*/ (3==aps.size() || 4==aps.size()) ) {
//fprintf(stderr,"%s,%d\n",aps[3],numFields); //fprintf(stderr,"%s,%d\n",aps[3],numFields);
@ -269,7 +269,7 @@ class DTree3DModel {
}; };
//////////////////// ////////////////////
template <class Y,class X1,class X2, class P> template <class Y,class X1,class X2, class P>
bool DTree3DModel<Y,X1,X2,P>::readFields ( char* aps[], int numFields ) { bool DTree3DModel<Y,X1,X2,P>::readFields ( char* aps[], int numFields ) {
if ( /*aps[0]==sId &&*/ (4==numFields || 5==numFields) ) { if ( /*aps[0]==sId &&*/ (4==numFields || 5==numFields) ) {
//fprintf(stderr,"%s,%d\n",aps[3],numFields); //fprintf(stderr,"%s,%d\n",aps[3],numFields);
@ -307,7 +307,7 @@ bool DTree3DModel<Y,X1,X2,P>::readFields ( char* aps[], int numFields ) {
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
template<class Y, class X, class P> template<class Y, class X, class P>
class TrainableDTree2DModel : public DTree2DModel<Y,X,P> { class TrainableDTree2DModel : public DTree2DModel<Y,X,P> {
private: private:
// Type members... // Type members...
typedef typename X::ElementType B; typedef typename X::ElementType B;
@ -485,7 +485,7 @@ void TrainableDTree2DModel<Y,X,P>::train ( List<Joint2DRV<X,Y> >& lxy, const De
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
template<class Y, class X1, class X2, class P> template<class Y, class X1, class X2, class P>
class TrainableDTree3DModel : public DTree3DModel<Y,X1,X2,P> { class TrainableDTree3DModel : public DTree3DModel<Y,X1,X2,P> {
private: private:

View File

@ -34,7 +34,7 @@ class Matrix : public SafeArray2D<Id<int>,Id<int>,T> {
Matrix ( ) : SafeArray2D<Id<int>,Id<int>,T>( ) { }//{ xSize=0; ySize=0; } Matrix ( ) : SafeArray2D<Id<int>,Id<int>,T>( ) { }//{ xSize=0; ySize=0; }
Matrix (int x, int y) : SafeArray2D<Id<int>,Id<int>,T>(x,y) { }//{ xSize=x; ySize=y; } Matrix (int x, int y) : SafeArray2D<Id<int>,Id<int>,T>(x,y) { }//{ xSize=x; ySize=y; }
Matrix (int x, int y, const T& t) : SafeArray2D<Id<int>,Id<int>,T>(x,y,t) { }//{ xSize=x; ySize=y; } Matrix (int x, int y, const T& t) : SafeArray2D<Id<int>,Id<int>,T>(x,y,t) { }//{ xSize=x; ySize=y; }
Matrix (const Matrix& a) : SafeArray2D<Id<int>,Id<int>,T>(a.xSize(),a.ySize()) { //xSize=a.xSize; ySize=a.ySize; Matrix (const Matrix& a) : SafeArray2D<Id<int>,Id<int>,T>(a.xSize(),a.ySize()) { //xSize=a.xSize; ySize=a.ySize;
for(int i=0;i<xSize();i++) for(int j=0;j<ySize();j++) this->set(i,j)=a.get(i,j); } for(int i=0;i<xSize();i++) for(int j=0;j<ySize();j++) this->set(i,j)=a.get(i,j); }
// Specification methods... // Specification methods...
//Matrix& operator= ( const Matrix<T>& sat ) //Matrix& operator= ( const Matrix<T>& sat )
@ -195,34 +195,34 @@ class Matrix : public SafeArray2D<Id<int>,Id<int>,T> {
} }
return false; return false;
} }
bool operator== ( const Matrix<T>& a ) const { bool operator== ( const Matrix<T>& a ) const {
if (xSize()!=a.xSize() || ySize()!=a.ySize()) return false; if (xSize()!=a.xSize() || ySize()!=a.ySize()) return false;
for (int i=0;i<a.xSize();i++) for (int i=0;i<a.xSize();i++)
for (int j=0;j<a.ySize();j++) for (int j=0;j<a.ySize();j++)
if (this->get(Id<int>(i),Id<int>(j))!=a.get(Id<int>(i),Id<int>(j))) return false; if (this->get(Id<int>(i),Id<int>(j))!=a.get(Id<int>(i),Id<int>(j))) return false;
return true; return true;
} }
// Input/output methods... // Input/output methods...
friend ostream& operator<< ( ostream& os, const Matrix<T>& a ) { friend ostream& operator<< ( ostream& os, const Matrix<T>& a ) {
os<<"\n "; os<<"\n ";
for (int i=0;i<a.xSize();i++) { for (int i=0;i<a.xSize();i++) {
for (int j=0;j<a.ySize();j++) { for (int j=0;j<a.ySize();j++) {
os<<((j==0)?"":",")<<a.get(Id<int>(i),Id<int>(j)); os<<((j==0)?"":",")<<a.get(Id<int>(i),Id<int>(j));
} }
os<<(i==a.xSize()-1?"\n":"\n "); os<<(i==a.xSize()-1?"\n":"\n ");
} }
return os; return os;
} }
friend String& operator<< ( String& str, const Matrix<T>& a ) { friend String& operator<< ( String& str, const Matrix<T>& a ) {
str<<"\n "; str<<"\n ";
for (int i=0;i<a.xSize();i++) { for (int i=0;i<a.xSize();i++) {
for (int j=0;j<a.ySize();j++) { for (int j=0;j<a.ySize();j++) {
str<<((j==0)?"":",")<<a.get(Id<int>(i),Id<int>(j)); str<<((j==0)?"":",")<<a.get(Id<int>(i),Id<int>(j));
} }
str<<";"; str<<";";
} }
return str; return str;
} }
string getString( ) const; string getString( ) const;
@ -234,7 +234,7 @@ string Matrix<T>::getString() const {
for (int j=0;j<ySize();j++) { for (int j=0;j<ySize();j++) {
str += ((j==0)?"":","); str += ((j==0)?"":",");
str += this->get(Id<int>(i),Id<int>(j)); str += this->get(Id<int>(i),Id<int>(j));
} }
str += ";"; str += ";";
} }
return str; return str;

View File

@ -43,7 +43,7 @@ static const PDFVal VARIANCE_THRESHOLD = 0.01; //0.0001; //0
// //
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
template <class Y> template <class Y>
class DiagGauss1DModel : public Generic1DModel<Y,PDFVal> { class DiagGauss1DModel : public Generic1DModel<Y,PDFVal> {
private: private:
// Member variables... // Member variables...
@ -53,7 +53,7 @@ class DiagGauss1DModel : public Generic1DModel<Y,PDFVal> {
SimpleHash<Id<int>,PDFVal> aMeans; SimpleHash<Id<int>,PDFVal> aMeans;
SimpleHash<Id<int>,PDFVal> aVariances; SimpleHash<Id<int>,PDFVal> aVariances;
PDFVal prInvRootNormVariances; PDFVal prInvRootNormVariances;
PDFVal prProduct; PDFVal prProduct;
SimpleHash<Id<int>,PDFVal> algprNegHalfInvVariances; SimpleHash<Id<int>,PDFVal> algprNegHalfInvVariances;
public: public:
// Constructor / destructor methods... // Constructor / destructor methods...
@ -78,7 +78,7 @@ class DiagGauss1DModel : public Generic1DModel<Y,PDFVal> {
}; };
//////////////////////////////////////// ////////////////////////////////////////
template <class Y> template <class Y>
inline void DiagGauss1DModel<Y>::precomputeVarianceTerms ( ) { inline void DiagGauss1DModel<Y>::precomputeVarianceTerms ( ) {
// Inverse square root of norm of variances... // Inverse square root of norm of variances...
setInvRootNormVar() = 1.0; setInvRootNormVar() = 1.0;
@ -92,7 +92,7 @@ inline void DiagGauss1DModel<Y>::precomputeVarianceTerms ( ) {
} }
//////////////////////////////////////// ////////////////////////////////////////
template <class Y> template <class Y>
inline PDFVal DiagGauss1DModel<Y>::getProb ( const Y& y ) const { inline PDFVal DiagGauss1DModel<Y>::getProb ( const Y& y ) const {
// fprintf(stderr,"--------------------\n"); // fprintf(stderr,"--------------------\n");
// y.write(stderr); // y.write(stderr);
@ -109,7 +109,7 @@ inline PDFVal DiagGauss1DModel<Y>::getProb ( const Y& y ) const {
} }
//////////////////////////////////////// ////////////////////////////////////////
template <class Y> template <class Y>
bool DiagGauss1DModel<Y>::readFields ( char* as[], int numFields ) { bool DiagGauss1DModel<Y>::readFields ( char* as[], int numFields ) {
if ( 0==strcmp(as[1],"m") && numFields>2 ) { if ( 0==strcmp(as[1],"m") && numFields>2 ) {
char* psT; char* psT;
@ -126,12 +126,12 @@ bool DiagGauss1DModel<Y>::readFields ( char* as[], int numFields ) {
} }
//////////////////////////////////////// ////////////////////////////////////////
template <class Y> template <class Y>
void DiagGauss1DModel<Y>::writeFields ( FILE* pf, const string& sPref ) const { void DiagGauss1DModel<Y>::writeFields ( FILE* pf, const string& sPref ) const {
fprintf(pf,"%s m = ",sPref.c_str()); fprintf(pf,"%s m = ",sPref.c_str());
for(int i=0; i<getNumFeats(); i++) fprintf(pf,"%s%f",(0==i)?"":"_",getMean(i)); for(int i=0; i<getNumFeats(); i++) fprintf(pf,"%s%f",(0==i)?"":"_",getMean(i));
fprintf ( pf, "\n" ) ; fprintf ( pf, "\n" ) ;
fprintf(pf,"%s v = ",sPref.c_str()); fprintf(pf,"%s v = ",sPref.c_str());
for(int i=0; i<getNumFeats(); i++) fprintf(pf,"%s%f",(0==i)?"":"_",getVariance(i)); for(int i=0; i<getNumFeats(); i++) fprintf(pf,"%s%f",(0==i)?"":"_",getVariance(i));
fprintf ( pf, "\n" ) ; fprintf ( pf, "\n" ) ;
@ -141,7 +141,7 @@ void DiagGauss1DModel<Y>::writeFields ( FILE* pf, const string& sPref ) const {
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
/* /*
template <class Y,class X> template <class Y,class X>
class DiagGauss2DModel : public Generic2DModel<Y,X,PDFVal> { class DiagGauss2DModel : public Generic2DModel<Y,X,PDFVal> {
private: private:
// Member variables... // Member variables...
@ -177,7 +177,7 @@ class DiagGauss2DModel : public Generic2DModel<Y,X,PDFVal> {
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
template <class Y,class X1,class X2> template <class Y,class X1,class X2>
class DiagGauss3DModel : public Generic3DModel<Y,X1,X2,PDFVal> { class DiagGauss3DModel : public Generic3DModel<Y,X1,X2,PDFVal> {
private: private:
// Member variables... // Member variables...
@ -220,7 +220,7 @@ class DiagGauss3DModel : public Generic3DModel<Y,X1,X2,PDFVal> {
// //
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
template <class Y> template <class Y>
class TrainableDiagGauss1DModel : public DiagGauss1DModel<Y> { class TrainableDiagGauss1DModel : public DiagGauss1DModel<Y> {
public: public:
TrainableDiagGauss1DModel ( ) : DiagGauss1DModel<Y>() { } TrainableDiagGauss1DModel ( ) : DiagGauss1DModel<Y>() { }

View File

@ -54,7 +54,7 @@ class SimpleHash : public hash_map<X,Y,SimpleHashFn<X>,SimpleHashEqual<X> > /*pu
// tr1::unordered_map<X,Y,SimpleHashFn<X>,SimpleHashEqual<X> > mxy; // tr1::unordered_map<X,Y,SimpleHashFn<X>,SimpleHashEqual<X> > mxy;
static const Y yDummy; static const Y yDummy;
//static Y yNonconstDummy; //static Y yNonconstDummy;
public: public:
// typedef typename OrigHash::const_iterator const_iterator; // typedef typename OrigHash::const_iterator const_iterator;
// typedef typename OrigHash::iterator iterator; // typedef typename OrigHash::iterator iterator;

View File

@ -209,7 +209,7 @@ template <class MY, class MX, class S, class B>
void HMM<MY,MX,S,B>::debugPrint() const{ void HMM<MY,MX,S,B>::debugPrint() const{
for (int frame=0, numFrames=aatnTrellis.getxSize(); frame<numFrames; frame++) { for (int frame=0, numFrames=aatnTrellis.getxSize(); frame<numFrames; frame++) {
for (int beamIndex=0, beamSize=aatnTrellis.getySize(); beamIndex<beamSize; beamIndex++) { for (int beamIndex=0, beamSize=aatnTrellis.getySize(); beamIndex<beamSize; beamIndex++) {
if (aatnTrellis.get(frame,beamIndex).getLogProb().toDouble() > 0) { if (aatnTrellis.get(frame,beamIndex).getLogProb().toDouble() > 0) {
@ -306,7 +306,7 @@ void HMM<MY,MX,S,B>::updateRanked ( const typename MX::RandVarType& x, bool b1 )
// Add best transition (top of queue)... // Add best transition (top of queue)...
//mx.getProb(o,my.setTrellDat(ashpiQueue.getTop().first,ashpiQueue.getTop().second)); //mx.getProb(o,my.setTrellDat(ashpiQueue.getTop().first,ashpiQueue.getTop().second));
if ( ashpiQueue.getSize() > 0 ) { if ( ashpiQueue.getSize() > 0 ) {
S s; my.setTrellDat(s,ashpiQueue.getTop().second); S s; my.setTrellDat(s,ashpiQueue.getTop().second);
bFull |= btn.tryAdd ( s, IB(ashpiQueue.getTop().first,my.setBackDat(ashpiQueue.getTop().second)), ashpiQueue.getTop().third ); bFull |= btn.tryAdd ( s, IB(ashpiQueue.getTop().first,my.setBackDat(ashpiQueue.getTop().second)), ashpiQueue.getTop().third );
////cerr<<ashpiQueue.getSize()<<" queue elems A "<<ashpiQueue.getTop()<<"\n"; ////cerr<<ashpiQueue.getSize()<<" queue elems A "<<ashpiQueue.getTop()<<"\n";
////cerr<<"/-----A-----\\\n"<<ashpiQueue<<"\\-----A-----/\n"; ////cerr<<"/-----A-----\\\n"<<ashpiQueue<<"\\-----A-----/\n";
@ -379,7 +379,7 @@ void HMM<MY,MX,S,B>::updateSerial ( const typename MX::RandVarType& x ) {
// Incorporate into trellis... // Incorporate into trellis...
btn.tryAdd ( s, IB(i,my.setBackDat(y)), lgprFull ); btn.tryAdd ( s, IB(i,my.setBackDat(y)), lgprFull );
//if(OUTPUT_VERYNOISY) //if(OUTPUT_VERYNOISY)
// fprintf ( stderr," (S_t-1:[e^%0.6f] * Y:e^%0.6f * X:e^%0.6f = S_t:[e^%0.6f])\n", // fprintf ( stderr," (S_t-1:[e^%0.6f] * Y:e^%0.6f * X:e^%0.6f = S_t:[e^%0.6f])\n",
// float(aatnTrellis.get(frameLast-1,i).getLogProb().toInt())/100.0, // float(aatnTrellis.get(frameLast-1,i).getLogProb().toInt())/100.0,
// float(lgprY.toInt())/100.0, // float(lgprY.toInt())/100.0,
// float(lgprX.toInt())/100.0, // float(lgprX.toInt())/100.0,
@ -389,7 +389,7 @@ void HMM<MY,MX,S,B>::updateSerial ( const typename MX::RandVarType& x ) {
} }
// for(int i=0;i<BEAM_WIDTH;i++) { // for(int i=0;i<BEAM_WIDTH;i++) {
// fprintf(stderr,"> "); btn.get(i)->first.write(stderr); fprintf(stderr,"\n"); // fprintf(stderr,"> "); btn.get(i)->first.write(stderr); fprintf(stderr,"\n");
// } // }
btn.sort(atnSorted); btn.sort(atnSorted);
@ -429,8 +429,8 @@ void HMM<MY,MX,S,B>::each ( const typename MX::RandVarType& x, Beam<LogProb,S,IB
const TrellNode<S,B>& tnsbPrev = aatnTrellis.get(frameLast-1,i); const TrellNode<S,B>& tnsbPrev = aatnTrellis.get(frameLast-1,i);
// If prob still not below beam minimum... // If prob still not below beam minimum...
if ( tnsbPrev.getLogProb() > btn.getMin().getScore() ) { if ( tnsbPrev.getLogProb() > btn.getMin().getScore() ) {
//if (OUTPUT_VERYNOISY) { fprintf(stderr,"FROM: "); tnsbPrev.getId().write(stderr); fprintf(stderr,"\n"); } //if (OUTPUT_VERYNOISY) { fprintf(stderr,"FROM: "); tnsbPrev.getId().write(stderr); fprintf(stderr,"\n"); }
// For each possible transition... // For each possible transition...
const S& sPrev = tnsbPrev.getId(); const S& sPrev = tnsbPrev.getId();
typename MY::IterVal y; typename MY::IterVal y;
@ -447,7 +447,7 @@ void HMM<MY,MX,S,B>::each ( const typename MX::RandVarType& x, Beam<LogProb,S,IB
lgprX = mx.getProb(x,my.setTrellDat(s,y)); if ( !OUTPUT_VERYNOISY && LogProb()==lgprX ) continue; lgprX = mx.getProb(x,my.setTrellDat(s,y)); if ( !OUTPUT_VERYNOISY && LogProb()==lgprX ) continue;
#endif ///////////////////////////////////////////////////////////////// #endif /////////////////////////////////////////////////////////////////
lgprFull = tnsbPrev.getLogProb() * lgprY * lgprX; lgprFull = tnsbPrev.getLogProb() * lgprY * lgprX;
if (OUTPUT_VERYNOISY) { if (OUTPUT_VERYNOISY) {
boost::mutex::scoped_lock lock1(mutexHmmParanoiaLock); boost::mutex::scoped_lock lock1(mutexHmmParanoiaLock);
//fprintf(stderr," TO: "); y.write(stderr); fprintf(stderr,"\n"); //fprintf(stderr," TO: "); y.write(stderr); fprintf(stderr,"\n");
cout<<" "<<tnsbPrev.getId()<<" ==("<<tnsbPrev.getLogProb().toInt()<<"*"<<lgprY.toInt()<<"*"<<lgprX.toInt()<<"="<<lgprFull.toInt()<<")==> "<<y<<"\n"; cout<<" "<<tnsbPrev.getId()<<" ==("<<tnsbPrev.getLogProb().toInt()<<"*"<<lgprY.toInt()<<"*"<<lgprX.toInt()<<"="<<lgprFull.toInt()<<")==> "<<y<<"\n";
@ -459,7 +459,7 @@ void HMM<MY,MX,S,B>::each ( const typename MX::RandVarType& x, Beam<LogProb,S,IB
// Incorporate into trellis... // Incorporate into trellis...
btn.tryAdd ( s, IB(i,my.setBackDat(y)), lgprFull ); btn.tryAdd ( s, IB(i,my.setBackDat(y)), lgprFull );
// if(OUTPUT_VERYNOISY) // if(OUTPUT_VERYNOISY)
// fprintf ( stderr," (S_t-1:[e^%0.6f] * Y:e^%0.6f * X:e^%0.6f = S_t:[e^%0.6f])\n", // fprintf ( stderr," (S_t-1:[e^%0.6f] * Y:e^%0.6f * X:e^%0.6f = S_t:[e^%0.6f])\n",
// float(aatnTrellis.get(frameLast-1,i).getLogProb().toInt())/100.0, // float(aatnTrellis.get(frameLast-1,i).getLogProb().toInt())/100.0,
// float(lgprY.toInt())/100.0, // float(lgprY.toInt())/100.0,
// float(lgprO.toInt())/100.0, // float(lgprO.toInt())/100.0,
@ -695,7 +695,7 @@ std::list<string> HMM<MY,MX,S,B>::getMLS(const S& sLast) const {
//// sprintf(tmp,"HYPOTH %04d> ", fr-1); //// sprintf(tmp,"HYPOTH %04d> ", fr-1);
//// string tString(tmp); //// string tString(tmp);
//// tString += //// tString +=
string tString = string tString =
//// aatnTrellis.get(fr,iBest).getId().getString() + " " + //// aatnTrellis.get(fr,iBest).getId().getString() + " " +
aatnTrellis.get(fr,iBest).getBackData().getString() aatnTrellis.get(fr,iBest).getBackData().getString()
//// + "\n" //// + "\n"
@ -737,7 +737,7 @@ template <class MY, class MX, class S, class B>
void HMM<MY,MX,S,B>::writeCurr ( ostream& os, int f=-1 ) const { void HMM<MY,MX,S,B>::writeCurr ( ostream& os, int f=-1 ) const {
if ( -1==f ) f=frameLast; if ( -1==f ) f=frameLast;
if ( 0<=f && f<=frameLast ) if ( 0<=f && f<=frameLast )
for ( int i=0; i<BEAM_WIDTH; i++ ) for ( int i=0; i<BEAM_WIDTH; i++ )
if(!(aatnTrellis.get(f,i).getLogProb() == LogProb())){ if(!(aatnTrellis.get(f,i).getLogProb() == LogProb())){
//fprintf(pf,"at f=%04d b=%04d: ",f,i); //fprintf(pf,"at f=%04d b=%04d: ",f,i);
os<<"at "<<std::setfill('0')<<std::setw(4)<<f<<" "<<std::setw(4)<<i<<": "; os<<"at "<<std::setfill('0')<<std::setw(4)<<f<<" "<<std::setw(4)<<i<<": ";
@ -765,7 +765,7 @@ void HMM<MY,MX,S,B>::writeCurrSum ( FILE* pf, int f=-1 ) const {
if ( 0<=f && f<=frameLast ) { if ( 0<=f && f<=frameLast ) {
LogProb sum = 0.0; LogProb sum = 0.0;
LogProb logtop = 0.0; LogProb logtop = 0.0;
for ( int i=0; i<BEAM_WIDTH; i++ ) for ( int i=0; i<BEAM_WIDTH; i++ )
if(!(aatnTrellis.get(f,i).getLogProb() == LogProb())){ if(!(aatnTrellis.get(f,i).getLogProb() == LogProb())){
if(i==0) { logtop=aatnTrellis.get(f,i).getLogProb(); } if(i==0) { logtop=aatnTrellis.get(f,i).getLogProb(); }
LogProb big1 = sum - logtop; LogProb big1 = sum - logtop;
@ -818,7 +818,7 @@ void HMM<MY,MX,S,B>::gatherElementsInBeam( SafeArray1D<Id<int>,pair<S,LogProb> >
result->init(BEAM_WIDTH); result->init(BEAM_WIDTH);
if ( -1==f ) f=frameLast; if ( -1==f ) f=frameLast;
if ( 0<=f && f<=frameLast ) { if ( 0<=f && f<=frameLast ) {
for ( int i=0; i<BEAM_WIDTH && &(aatnTrellis.get(f,i))!=NULL; i++ ) { for ( int i=0; i<BEAM_WIDTH && &(aatnTrellis.get(f,i))!=NULL; i++ ) {
result->set(i).first = aatnTrellis.get(f,i).getId(); result->set(i).first = aatnTrellis.get(f,i).getId();
result->set(i).second = aatnTrellis.get(f,i).getLogProb(); result->set(i).second = aatnTrellis.get(f,i).getLogProb();
} }
@ -836,7 +836,7 @@ void HMM<MY,MX,S,B>::writeCurrEntropy ( FILE* pf, int f=-1 ) const {
if ( 0<=f && f<=frameLast ) { if ( 0<=f && f<=frameLast ) {
LogProb logh = 0.0; LogProb logh = 0.0;
LogProb logtop = 0.0; LogProb logtop = 0.0;
for ( int i=0; i<BEAM_WIDTH; i++ ) for ( int i=0; i<BEAM_WIDTH; i++ )
if(!(aatnTrellis.get(f,i).getLogProb() == LogProb())){ if(!(aatnTrellis.get(f,i).getLogProb() == LogProb())){
if(i==0) { logtop=aatnTrellis.get(f,i).getLogProb(); } if(i==0) { logtop=aatnTrellis.get(f,i).getLogProb(); }
LogProb big1 = logh - logtop; LogProb big1 = logh - logtop;
@ -862,12 +862,12 @@ void HMM<MY,MX,S,B>::writeCurrDepths ( FILE* pf, int f=-1 ) const {
Array<int> depths = Array<int>(); Array<int> depths = Array<int>();
Array<LogProb> logprobs = Array<LogProb>(); Array<LogProb> logprobs = Array<LogProb>();
double avgdepth = 0.0; double avgdepth = 0.0;
for ( int i=0; i<BEAM_WIDTH; i++ ) for ( int i=0; i<BEAM_WIDTH; i++ )
if(!(aatnTrellis.get(f,i).getLogProb() == LogProb())){ if(!(aatnTrellis.get(f,i).getLogProb() == LogProb())){
if(i==0) { logtop=aatnTrellis.get(f,i).getLogProb(); } if(i==0) { logtop=aatnTrellis.get(f,i).getLogProb(); }
logprobs.set(i) = aatnTrellis.get(f,i).getLogProb(); logprobs.set(i) = aatnTrellis.get(f,i).getLogProb();
// loop over values in S node to find lowest meaningful depth // loop over values in S node to find lowest meaningful depth
for ( int j=0; j<aatnTrellis.get(f,i).getId().first.getSize(); j++) { for ( int j=0; j<aatnTrellis.get(f,i).getId().first.getSize(); j++) {
// store the depth, if it's equal to G_BOT/G_BOT // store the depth, if it's equal to G_BOT/G_BOT
@ -996,7 +996,7 @@ int HMM<MY,MX,S,B>::getBeamUsed ( int f=-1 ) const {
if ( -1==f ) f=frameLast; if ( -1==f ) f=frameLast;
int ctr=0; int ctr=0;
if ( 0<=f && f<=frameLast ) if ( 0<=f && f<=frameLast )
for ( int i=0; i<BEAM_WIDTH; i++ ) for ( int i=0; i<BEAM_WIDTH; i++ )
if(!(aatnTrellis.get(f,i).getLogProb() == LogProb())){ if(!(aatnTrellis.get(f,i).getLogProb() == LogProb())){
ctr++; ctr++;
} }

View File

@ -269,7 +269,7 @@ void HMM<MH,MO,X,B>::updateRanked ( const typename MO::RandVarType& o ) {
// Add best transition (top of queue)... // Add best transition (top of queue)...
//mo.getProb(o,mh.setTrellDat(axhpiQueue.getTop().first,axhpiQueue.getTop().second)); //mo.getProb(o,mh.setTrellDat(axhpiQueue.getTop().first,axhpiQueue.getTop().second));
if ( axhpiQueue.getSize() > 0 ) { if ( axhpiQueue.getSize() > 0 ) {
X x; mh.setTrellDat(x,axhpiQueue.getTop().second); X x; mh.setTrellDat(x,axhpiQueue.getTop().second);
bFull |= btn.tryAdd ( x, IB(axhpiQueue.getTop().first,mh.setBackDat(axhpiQueue.getTop().second)), axhpiQueue.getTop().third ); bFull |= btn.tryAdd ( x, IB(axhpiQueue.getTop().first,mh.setBackDat(axhpiQueue.getTop().second)), axhpiQueue.getTop().third );
//cerr<<axhpiQueue.getSize()<<" queue elems A "<<axhpiQueue.getTop()<<"\n"; //cerr<<axhpiQueue.getSize()<<" queue elems A "<<axhpiQueue.getTop()<<"\n";
//cerr<<"/-----A-----\\\n + bFull: "<<bFull<<"\naxhpiQueue: \n"<<axhpiQueue<<"\\-----A-----/\n"; //cerr<<"/-----A-----\\\n + bFull: "<<bFull<<"\naxhpiQueue: \n"<<axhpiQueue<<"\\-----A-----/\n";
@ -341,7 +341,7 @@ void HMM<MH,MO,X,B>::updateSerial ( const typename MO::RandVarType& o ) {
// Incorporate into trellis... // Incorporate into trellis...
btn.tryAdd ( x, IB(i,mh.setBackDat(h)), lgprFull ); btn.tryAdd ( x, IB(i,mh.setBackDat(h)), lgprFull );
//if(OUTPUT_VERYNOISY) //if(OUTPUT_VERYNOISY)
// fprintf ( stderr," (X_t-1:[e^%0.6f] * H:e^%0.6f * O:e^%0.6f = X_t:[e^%0.6f])\n", // fprintf ( stderr," (X_t-1:[e^%0.6f] * H:e^%0.6f * O:e^%0.6f = X_t:[e^%0.6f])\n",
// float(aatnTrellis.get(frameLast-1,i).getLogProb().toInt())/100.0, // float(aatnTrellis.get(frameLast-1,i).getLogProb().toInt())/100.0,
// float(lgprH.toInt())/100.0, // float(lgprH.toInt())/100.0,
// float(lgprO.toInt())/100.0, // float(lgprO.toInt())/100.0,
@ -351,7 +351,7 @@ void HMM<MH,MO,X,B>::updateSerial ( const typename MO::RandVarType& o ) {
} }
// for(int i=0;i<BEAM_WIDTH;i++) { // for(int i=0;i<BEAM_WIDTH;i++) {
// fprintf(stderr,"> "); btn.get(i)->first.write(stderr); fprintf(stderr,"\n"); // fprintf(stderr,"> "); btn.get(i)->first.write(stderr); fprintf(stderr,"\n");
// } // }
btn.sort(atnSorted); btn.sort(atnSorted);
@ -390,8 +390,8 @@ void HMM<MH,MO,X,B>::each ( const typename MO::RandVarType& o, Beam<LogProb,X,IB
const TrellNode<X,B>& tnxbPrev = aatnTrellis.get(frameLast-1,i); const TrellNode<X,B>& tnxbPrev = aatnTrellis.get(frameLast-1,i);
// If prob still not below beam minimum... // If prob still not below beam minimum...
if ( tnxbPrev.getLogProb() > btn.getMin().getScore() ) { if ( tnxbPrev.getLogProb() > btn.getMin().getScore() ) {
//if (OUTPUT_VERYNOISY) { fprintf(stderr,"FROM: "); tnxbPrev.getId().write(stderr); fprintf(stderr,"\n"); } //if (OUTPUT_VERYNOISY) { fprintf(stderr,"FROM: "); tnxbPrev.getId().write(stderr); fprintf(stderr,"\n"); }
// For each possible transition... // For each possible transition...
const X& xPrev = tnxbPrev.getId(); const X& xPrev = tnxbPrev.getId();
typename MH::IterVal h; typename MH::IterVal h;
@ -408,7 +408,7 @@ void HMM<MH,MO,X,B>::each ( const typename MO::RandVarType& o, Beam<LogProb,X,IB
lgprO = mo.getProb(o,mh.setTrellDat(x,h)); if ( !OUTPUT_VERYNOISY && LogProb()==lgprO ) continue; lgprO = mo.getProb(o,mh.setTrellDat(x,h)); if ( !OUTPUT_VERYNOISY && LogProb()==lgprO ) continue;
#endif ///////////////////////////////////////////////////////////////// #endif /////////////////////////////////////////////////////////////////
lgprFull = tnxbPrev.getLogProb() * lgprH * lgprO; lgprFull = tnxbPrev.getLogProb() * lgprH * lgprO;
if (OUTPUT_VERYNOISY) { if (OUTPUT_VERYNOISY) {
boost::mutex::scoped_lock lock1(mutexHmmParanoiaLock); boost::mutex::scoped_lock lock1(mutexHmmParanoiaLock);
//fprintf(stderr," TO: "); h.write(stderr); fprintf(stderr,"\n"); //fprintf(stderr," TO: "); h.write(stderr); fprintf(stderr,"\n");
cout<<" "<<tnxbPrev.getId()<<" ==("<<tnxbPrev.getLogProb().toInt()<<"*"<<lgprH.toInt()<<"*"<<lgprO.toInt()<<"="<<lgprFull.toInt()<<")==> "<<h<<"\n"; cout<<" "<<tnxbPrev.getId()<<" ==("<<tnxbPrev.getLogProb().toInt()<<"*"<<lgprH.toInt()<<"*"<<lgprO.toInt()<<"="<<lgprFull.toInt()<<")==> "<<h<<"\n";
@ -420,7 +420,7 @@ void HMM<MH,MO,X,B>::each ( const typename MO::RandVarType& o, Beam<LogProb,X,IB
// Incorporate into trellis... // Incorporate into trellis...
btn.tryAdd ( x, IB(i,mh.setBackDat(h)), lgprFull ); btn.tryAdd ( x, IB(i,mh.setBackDat(h)), lgprFull );
// if(OUTPUT_VERYNOISY) // if(OUTPUT_VERYNOISY)
// fprintf ( stderr," (X_t-1:[e^%0.6f] * H:e^%0.6f * O:e^%0.6f = X_t:[e^%0.6f])\n", // fprintf ( stderr," (X_t-1:[e^%0.6f] * H:e^%0.6f * O:e^%0.6f = X_t:[e^%0.6f])\n",
// float(aatnTrellis.get(frameLast-1,i).getLogProb().toInt())/100.0, // float(aatnTrellis.get(frameLast-1,i).getLogProb().toInt())/100.0,
// float(lgprH.toInt())/100.0, // float(lgprH.toInt())/100.0,
// float(lgprO.toInt())/100.0, // float(lgprO.toInt())/100.0,
@ -656,7 +656,7 @@ std::list<string> HMM<MH,MO,X,B>::getMLS(const X& xLast) const {
//// sprintf(tmp,"HYPOTH %04d> ", fr-1); //// sprintf(tmp,"HYPOTH %04d> ", fr-1);
//// string tString(tmp); //// string tString(tmp);
//// tString += //// tString +=
string tString = string tString =
//// aatnTrellis.get(fr,iBest).getId().getString() + " " + //// aatnTrellis.get(fr,iBest).getId().getString() + " " +
aatnTrellis.get(fr,iBest).getBackData().getString() aatnTrellis.get(fr,iBest).getBackData().getString()
//// + "\n" //// + "\n"
@ -697,7 +697,7 @@ template <class MH, class MO, class X, class B>
void HMM<MH,MO,X,B>::writeCurr ( FILE* pf, int f=-1 ) const { void HMM<MH,MO,X,B>::writeCurr ( FILE* pf, int f=-1 ) const {
if ( -1==f ) f=frameLast; if ( -1==f ) f=frameLast;
if ( 0<=f && f<=frameLast ) if ( 0<=f && f<=frameLast )
for ( int i=0; i<BEAM_WIDTH; i++ ) for ( int i=0; i<BEAM_WIDTH; i++ )
if(!(aatnTrellis.get(f,i).getLogProb() == LogProb())){ if(!(aatnTrellis.get(f,i).getLogProb() == LogProb())){
fprintf(pf,"at f=%04d b=%04d: ",f,i); fprintf(pf,"at f=%04d b=%04d: ",f,i);
String str; str<<aatnTrellis.get(f,i).getId(); //.write(pf); String str; str<<aatnTrellis.get(f,i).getId(); //.write(pf);
@ -721,7 +721,7 @@ void HMM<MH,MO,X,B>::writeCurrSum ( FILE* pf, int f=-1 ) const {
if ( 0<=f && f<=frameLast ) { if ( 0<=f && f<=frameLast ) {
LogProb sum = 0.0; LogProb sum = 0.0;
LogProb logtop = 0.0; LogProb logtop = 0.0;
for ( int i=0; i<BEAM_WIDTH; i++ ) for ( int i=0; i<BEAM_WIDTH; i++ )
if(!(aatnTrellis.get(f,i).getLogProb() == LogProb())){ if(!(aatnTrellis.get(f,i).getLogProb() == LogProb())){
if(i==0) { logtop=aatnTrellis.get(f,i).getLogProb(); } if(i==0) { logtop=aatnTrellis.get(f,i).getLogProb(); }
LogProb big1 = sum - logtop; LogProb big1 = sum - logtop;
@ -741,7 +741,7 @@ void HMM<MH,MO,X,B>::writeCurrEntropy ( FILE* pf, int f=-1 ) const {
if ( 0<=f && f<=frameLast ) { if ( 0<=f && f<=frameLast ) {
LogProb logh = 0.0; LogProb logh = 0.0;
LogProb logtop = 0.0; LogProb logtop = 0.0;
for ( int i=0; i<BEAM_WIDTH; i++ ) for ( int i=0; i<BEAM_WIDTH; i++ )
if(!(aatnTrellis.get(f,i).getLogProb() == LogProb())){ if(!(aatnTrellis.get(f,i).getLogProb() == LogProb())){
if(i==0) { logtop=aatnTrellis.get(f,i).getLogProb(); } if(i==0) { logtop=aatnTrellis.get(f,i).getLogProb(); }
LogProb big1 = logh - logtop; LogProb big1 = logh - logtop;
@ -768,12 +768,12 @@ void HMM<MH,MO,X,B>::writeCurrDepths ( FILE* pf, int f=-1 ) const {
Array<int> depths = Array<int>(); Array<int> depths = Array<int>();
Array<LogProb> logprobs = Array<LogProb>(); Array<LogProb> logprobs = Array<LogProb>();
double avgdepth = 0.0; double avgdepth = 0.0;
for ( int i=0; i<BEAM_WIDTH; i++ ) for ( int i=0; i<BEAM_WIDTH; i++ )
if(!(aatnTrellis.get(f,i).getLogProb() == LogProb())){ if(!(aatnTrellis.get(f,i).getLogProb() == LogProb())){
if(i==0) { logtop=aatnTrellis.get(f,i).getLogProb(); } if(i==0) { logtop=aatnTrellis.get(f,i).getLogProb(); }
logprobs.set(i) = aatnTrellis.get(f,i).getLogProb(); logprobs.set(i) = aatnTrellis.get(f,i).getLogProb();
// loop over values in S node to find lowest meaningful depth // loop over values in S node to find lowest meaningful depth
for ( int j=0; j<aatnTrellis.get(f,i).getId().first.getSize(); j++) { for ( int j=0; j<aatnTrellis.get(f,i).getId().first.getSize(); j++) {
// store the depth, if it's equal to G_BOT/G_BOT // store the depth, if it's equal to G_BOT/G_BOT
@ -900,7 +900,7 @@ int HMM<MH,MO,X,B>::getBeamUsed ( int f=-1 ) const {
if ( -1==f ) f=frameLast; if ( -1==f ) f=frameLast;
int ctr=0; int ctr=0;
if ( 0<=f && f<=frameLast ) if ( 0<=f && f<=frameLast )
for ( int i=0; i<BEAM_WIDTH; i++ ) for ( int i=0; i<BEAM_WIDTH; i++ )
if(!(aatnTrellis.get(f,i).getLogProb() == LogProb())){ if(!(aatnTrellis.get(f,i).getLogProb() == LogProb())){
ctr++; ctr++;
} }

View File

@ -348,7 +348,7 @@ const TrellNode<S,B>& HMMLoop<MY,MX,S,B>::update ( const typename MX::RandVarTyp
//modX.getProb(o,modY.setTrellDat(ashpiQueue.getTop().first,ashpiQueue.getTop().second)); //modX.getProb(o,modY.setTrellDat(ashpiQueue.getTop().first,ashpiQueue.getTop().second));
if ( ashpiQueue.getSize() > 0 ) { if ( ashpiQueue.getSize() > 0 ) {
S s ( ashpiQueue.getTop().second ); S s ( ashpiQueue.getTop().second );
////S s; modY.setTrellDat(s,ashpiQueue.getTop().second); ////S s; modY.setTrellDat(s,ashpiQueue.getTop().second);
bFull |= btn.tryAdd ( s, IB(ashpiQueue.getTop().first,B(ashpiQueue.getTop().second)), ashpiQueue.getTop().third ); bFull |= btn.tryAdd ( s, IB(ashpiQueue.getTop().first,B(ashpiQueue.getTop().second)), ashpiQueue.getTop().third );
////cerr<<ashpiQueue.getSize()<<" queue elems A "<<ashpiQueue.getTop()<<"\n"; ////cerr<<ashpiQueue.getSize()<<" queue elems A "<<ashpiQueue.getTop()<<"\n";
////cerr<<"/-----A-----\\\n"<<ashpiQueue<<"\\-----A-----/\n"; ////cerr<<"/-----A-----\\\n"<<ashpiQueue<<"\\-----A-----/\n";

View File

@ -90,8 +90,8 @@ class Vector : public X {
Vector<X> operator- ( ElementType d ) const { Vector<X> vO; for(uint i=0;i<SIZE;i++) vO[i] = X::get(i)-d; return vO; } Vector<X> operator- ( ElementType d ) const { Vector<X> vO; for(uint i=0;i<SIZE;i++) vO[i] = X::get(i)-d; return vO; }
friend Vector<X> operator* ( ElementType d, const Vector<X>& v ) { Vector<X> vO; for(uint i=0;i<SIZE;i++) vO[i] = d*v[i]; return vO; } friend Vector<X> operator* ( ElementType d, const Vector<X>& v ) { Vector<X> vO; for(uint i=0;i<SIZE;i++) vO[i] = d*v[i]; return vO; }
friend Vector<X> operator/ ( ElementType d, const Vector<X>& v ) { Vector<X> vO; for(uint i=0;i<SIZE;i++) vO[i] = d/v[i]; return vO; } friend Vector<X> operator/ ( ElementType d, const Vector<X>& v ) { Vector<X> vO; for(uint i=0;i<SIZE;i++) vO[i] = d/v[i]; return vO; }
friend Vector<X> operator+ ( ElementType d, const Vector<X>& v ) { Vector<X> vO; for(uint i=0;i<SIZE;i++) vO[i] = d+v[i]; return vO; } friend Vector<X> operator+ ( ElementType d, const Vector<X>& v ) { Vector<X> vO; for(uint i=0;i<SIZE;i++) vO[i] = d+v[i]; return vO; }
friend Vector<X> operator- ( ElementType d, const Vector<X>& v ) { Vector<X> vO; for(uint i=0;i<SIZE;i++) vO[i] = d-v[i]; return vO; } friend Vector<X> operator- ( ElementType d, const Vector<X>& v ) { Vector<X> vO; for(uint i=0;i<SIZE;i++) vO[i] = d-v[i]; return vO; }
Vector<X>& operator*= ( ElementType d ) { for(uint i=0;i<SIZE;i++) X::set(i)*=d; return *this; } Vector<X>& operator*= ( ElementType d ) { for(uint i=0;i<SIZE;i++) X::set(i)*=d; return *this; }
Vector<X>& operator/= ( ElementType d ) { for(uint i=0;i<SIZE;i++) X::set(i)/=d; return *this; } Vector<X>& operator/= ( ElementType d ) { for(uint i=0;i<SIZE;i++) X::set(i)/=d; return *this; }
Vector<X>& operator+= ( ElementType d ) { for(uint i=0;i<SIZE;i++) X::set(i)+=d; return *this; } Vector<X>& operator+= ( ElementType d ) { for(uint i=0;i<SIZE;i++) X::set(i)+=d; return *this; }

View File

@ -97,7 +97,7 @@ class Mixture3DModel : public Generic2DModel<Y,X,Prob> {
// //
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
template <template <class MY> class M,class Y,class C> template <template <class MY> class M,class Y,class C>
class TrainableMixture2DModel : public Mixture2DModel<M,Y,C> { class TrainableMixture2DModel : public Mixture2DModel<M,Y,C> {
// private: // private:
// LogPDFVal logpdfPrevDataAvg; // LogPDFVal logpdfPrevDataAvg;
@ -110,7 +110,7 @@ class TrainableMixture2DModel : public Mixture2DModel<M,Y,C> {
}; };
//////////////////////////////////////// ////////////////////////////////////////
template <template <class MY> class M,class Y,class C> template <template <class MY> class M,class Y,class C>
void TrainableMixture2DModel<M,Y,C>::updateFields ( const List<Joint2DRV<Y,Prob> >& lyp, const PDFVal WEIGHT_LIMIT, bool& bShouldStop ) { void TrainableMixture2DModel<M,Y,C>::updateFields ( const List<Joint2DRV<Y,Prob> >& lyp, const PDFVal WEIGHT_LIMIT, bool& bShouldStop ) {
LogPDFVal logpdfData = 0.0; LogPDFVal logpdfData = 0.0;
CPT1DModel<C,Prob> mprPseudoEmpC; // pseudo-empirical prob marginal CPT1DModel<C,Prob> mprPseudoEmpC; // pseudo-empirical prob marginal
@ -178,7 +178,7 @@ void TrainableMixture2DModel<M,Y,C>::updateFields ( const List<Joint2DRV<Y,Prob>
} }
//////////////////////////////////////// ////////////////////////////////////////
template <template <class MY> class M,class Y,class C> template <template <class MY> class M,class Y,class C>
void TrainableMixture2DModel<M,Y,C>::train ( List<Joint2DRV<Y,Prob> >& lyp, const int EPOCH_LIMIT, const PDFVal WEIGHT_LIMIT ) { void TrainableMixture2DModel<M,Y,C>::train ( List<Joint2DRV<Y,Prob> >& lyp, const int EPOCH_LIMIT, const PDFVal WEIGHT_LIMIT ) {
// Normalize model... // Normalize model...
@ -204,7 +204,7 @@ void TrainableMixture2DModel<M,Y,C>::train ( List<Joint2DRV<Y,Prob> >& lyp, cons
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
template <template <class MY> class M,class Y,class X,class C> template <template <class MY> class M,class Y,class X,class C>
class TrainableMixture3DModel : public Generic2DModel<Y,X,C> { class TrainableMixture3DModel : public Generic2DModel<Y,X,C> {
private: private:
string sId; string sId;
@ -225,7 +225,7 @@ class TrainableMixture3DModel : public Generic2DModel<Y,X,C> {
}; };
//////////////////////////////////////// ////////////////////////////////////////
template <template <class MY> class M,class Y,class X,class C> template <template <class MY> class M,class Y,class X,class C>
void TrainableMixture3DModel<M,Y,X,C>::train ( const int EPOCH_LIMIT, const PDFVal WEIGHT_LIMIT ) { void TrainableMixture3DModel<M,Y,X,C>::train ( const int EPOCH_LIMIT, const PDFVal WEIGHT_LIMIT ) {
// Update each subphone from list... // Update each subphone from list...
int ctr = 0; int ctr = 0;
@ -237,7 +237,7 @@ void TrainableMixture3DModel<M,Y,X,C>::train ( const int EPOCH_LIMIT, const PDFV
} }
//////////////////////////////////////// ////////////////////////////////////////
template <template <class MY> class M,class Y,class X,class C> template <template <class MY> class M,class Y,class X,class C>
void TrainableMixture3DModel<M,Y,X,C>::train ( const List<Joint3DRV<X,Y,Prob> >& lxyp, const int EPOCH_LIMIT, const PDFVal WEIGHT_LIMIT ) { void TrainableMixture3DModel<M,Y,X,C>::train ( const List<Joint3DRV<X,Y,Prob> >& lxyp, const int EPOCH_LIMIT, const PDFVal WEIGHT_LIMIT ) {
// Chop list into phone-specific sub-lists... // Chop list into phone-specific sub-lists...
ListedObject<Joint3DRV<X,Y,Prob> >* pxyp; ListedObject<Joint3DRV<X,Y,Prob> >* pxyp;
@ -248,7 +248,7 @@ void TrainableMixture3DModel<M,Y,X,C>::train ( const List<Joint3DRV<X,Y,Prob> >&
} }
//////////////////////////////////////// ////////////////////////////////////////
template <template <class MY> class M,class Y,class X,class C> template <template <class MY> class M,class Y,class X,class C>
bool TrainableMixture3DModel<M,Y,X,C>::readData ( char* as[], int numFields ) { bool TrainableMixture3DModel<M,Y,X,C>::readData ( char* as[], int numFields ) {
if ( /*as[0]!=sId+"dat" ||*/ numFields!=3 ) return false; if ( /*as[0]!=sId+"dat" ||*/ numFields!=3 ) return false;
alyp.set(X(as[1])).add() = Joint2DRV<Y,Prob>(Y(as[2]),Prob(1.0)); alyp.set(X(as[1])).add() = Joint2DRV<Y,Prob>(Y(as[2]),Prob(1.0));
@ -256,7 +256,7 @@ bool TrainableMixture3DModel<M,Y,X,C>::readData ( char* as[], int numFields ) {
} }
//////////////////////////////////////// ////////////////////////////////////////
template <template <class MY> class M,class Y,class X,class C> template <template <class MY> class M,class Y,class X,class C>
void TrainableMixture3DModel<M,Y,X,C>::writeFields ( FILE* pf, string sPref ) { void TrainableMixture3DModel<M,Y,X,C>::writeFields ( FILE* pf, string sPref ) {
X x; for ( bool b=x.setFirst(); b; b=x.setNext() ) { X x; for ( bool b=x.setFirst(); b; b=x.setNext() ) {
am.get(x).writeFields(pf,sPref+" "+x.getString()); am.get(x).writeFields(pf,sPref+" "+x.getString());

View File

@ -37,7 +37,7 @@ void processModelFilePtr ( FILE* pf, bool rF(Array<char*>&) ) {
int i=0; int numFields=0; int c=' '; int line=1; int i=0; int numFields=0; int c=' '; int line=1;
CONSUME_ALL(pf,c,WHITESPACE(c),line); // Get to first record CONSUME_ALL(pf,c,WHITESPACE(c),line); // Get to first record
while ( c!=EOF ) { // For each record while ( c!=EOF ) { // For each record
if ( c=='#' ) CONSUME_ALL(pf, c, c!='\n' && c!='\0', line ) ; // If comment, consume if ( c=='#' ) CONSUME_ALL(pf, c, c!='\n' && c!='\0', line ) ; // If comment, consume
else { // If no comment, else { // If no comment,
Array<char*> aps(100); Array<char*> aps(100);
String psBuff(1000); String psBuff(1000);
@ -49,7 +49,7 @@ void processModelFilePtr ( FILE* pf, bool rF(Array<char*>&) ) {
if (!z) break; if (!z) break;
aps[i]=z; aps[i]=z;
} }
if ( !rF(aps) ) // Try to process fields, else complain if ( !rF(aps) ) // Try to process fields, else complain
fprintf( stderr, "\nERROR: %d %d-arg %s in line %d\n\n", numFields, aps.size(), aps[0], line); fprintf( stderr, "\nERROR: %d %d-arg %s in line %d\n\n", numFields, aps.size(), aps[0], line);
} }
@ -75,7 +75,7 @@ void processModelSocket ( const int tSockfd, int& c, bool rF(Array<char*>&) ) {
int i=0; int numFields=0; int line=1; int i=0; int numFields=0; int line=1;
CONSUME_ALL_SOCKET(tSockfd,c,WHITESPACE(c),line); // Get to first record CONSUME_ALL_SOCKET(tSockfd,c,WHITESPACE(c),line); // Get to first record
while ( c!='\0' && c!='\5' ) { // For each record while ( c!='\0' && c!='\5' ) { // For each record
if ( c=='#' ) CONSUME_ALL_SOCKET(tSockfd, c, (c!='\n' && c!='\0' && c!='\5'), line ) ; // If comment, consume if ( c=='#' ) CONSUME_ALL_SOCKET(tSockfd, c, (c!='\n' && c!='\0' && c!='\5'), line ) ; // If comment, consume
else { // If no comment, else { // If no comment,
Array<char*> aps(100); Array<char*> aps(100);
String psBuff(1000); String psBuff(1000);
@ -88,7 +88,7 @@ void processModelSocket ( const int tSockfd, int& c, bool rF(Array<char*>&) ) {
if (!z) break; if (!z) break;
aps[i]=z; aps[i]=z;
} }
if ( !rF(aps) ) // Try to process fields, else complain if ( !rF(aps) ) // Try to process fields, else complain
fprintf( stderr, "\nERROR: %d-arg %s in line %d\n\n", numFields, aps[0], line); fprintf( stderr, "\nERROR: %d-arg %s in line %d\n\n", numFields, aps[0], line);
} }
@ -97,7 +97,7 @@ void processModelSocket ( const int tSockfd, int& c, bool rF(Array<char*>&) ) {
} }
void processModelSocket ( const int tSockfd, bool rF(Array<char*>&) ) { void processModelSocket ( const int tSockfd, bool rF(Array<char*>&) ) {
int c=' '; int c=' ';
processModelSocket ( tSockfd, c, rF ); processModelSocket ( tSockfd, c, rF );
} }

View File

@ -80,12 +80,12 @@ class binuint {
// Input / output methods... // Input / output methods...
friend StringInput operator>> ( StringInput si, binuint& i ) { friend StringInput operator>> ( StringInput si, binuint& i ) {
if(si==NULL) return si; if(si==NULL) return si;
i.b=0; i.b=0;
for ( char c=si[0]; '0'<=c && c<='1'; ++si,c=si[0]) for ( char c=si[0]; '0'<=c && c<='1'; ++si,c=si[0])
{ i.b=i.b*2+c-'0'; } { i.b=i.b*2+c-'0'; }
return si; } return si; }
friend ostream& operator<< ( ostream& os, binuint i ) { for(int e=uint(log2(i.b));e>=0;e--)os <<((i.b>>e)%2); return os; } friend ostream& operator<< ( ostream& os, binuint i ) { for(int e=uint(log2(i.b));e>=0;e--)os <<((i.b>>e)%2); return os; }
friend String& operator<< ( String& str, binuint i ) { for(int e=uint(log2(i.b));e>=0;e--)str<<((i.b>>e)%2); return str; } friend String& operator<< ( String& str, binuint i ) { for(int e=uint(log2(i.b));e>=0;e--)str<<((i.b>>e)%2); return str; }
}; };
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////

View File

@ -43,7 +43,7 @@ class Prob {
Prob ( ) { gVal = 0.0; } Prob ( ) { gVal = 0.0; }
Prob (double d) { gVal = d; } Prob (double d) { gVal = d; }
Prob (const char* ps) { gVal = atof(ps); } Prob (const char* ps) { gVal = atof(ps); }
operator double() const { return gVal; } operator double() const { return gVal; }
double toDouble() const { return gVal; } double toDouble() const { return gVal; }
Prob& operator+= ( const Prob p ) { gVal += p.gVal; return *this; } Prob& operator+= ( const Prob p ) { gVal += p.gVal; return *this; }
@ -54,7 +54,7 @@ class Prob {
friend ostream& operator<< ( ostream& os, const Prob& pr ) { return os<<pr.toDouble(); } friend ostream& operator<< ( ostream& os, const Prob& pr ) { return os<<pr.toDouble(); }
friend String& operator<< ( String& str, const Prob& pr ) { return str<<pr.toDouble(); } friend String& operator<< ( String& str, const Prob& pr ) { return str<<pr.toDouble(); }
friend pair<StringInput,Prob*> operator>> ( StringInput si, Prob& n ) { return pair<StringInput,Prob*>(si,&n); } friend pair<StringInput,Prob*> operator>> ( StringInput si, Prob& n ) { return pair<StringInput,Prob*>(si,&n); }
friend StringInput operator>> ( pair<StringInput,Prob*> si_n, const char* psDlm ) { friend StringInput operator>> ( pair<StringInput,Prob*> si_n, const char* psDlm ) {
double d=0.0; StringInput si=si_n.first>>d>>psDlm; *si_n.second=Prob(d); return si; } double d=0.0; StringInput si=si_n.first>>d>>psDlm; *si_n.second=Prob(d); return si; }
}; };
@ -129,7 +129,7 @@ class LogProb : public Id<int> {
friend ostream& operator<< ( ostream& os, const LogProb& lp ) { return os<<lp.toInt(); } friend ostream& operator<< ( ostream& os, const LogProb& lp ) { return os<<lp.toInt(); }
friend String& operator<< ( String& str, const LogProb& lp ) { return str<<lp.toInt(); } friend String& operator<< ( String& str, const LogProb& lp ) { return str<<lp.toInt(); }
friend pair<StringInput,LogProb*> operator>> ( StringInput si, LogProb& n ) { return pair<StringInput,LogProb*>(si,&n); } friend pair<StringInput,LogProb*> operator>> ( StringInput si, LogProb& n ) { return pair<StringInput,LogProb*>(si,&n); }
friend StringInput operator>> ( pair<StringInput,LogProb*> si_n, const char* psDlm ) { friend StringInput operator>> ( pair<StringInput,LogProb*> si_n, const char* psDlm ) {
double d=0.0; StringInput si=si_n.first>>d>>psDlm; *si_n.second=LogProb(d); return si; } double d=0.0; StringInput si=si_n.first>>d>>psDlm; *si_n.second=LogProb(d); return si; }
}; };

View File

@ -33,7 +33,7 @@
// //
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
template<class Y,class P> template<class Y,class P>
class Generic1DModel { class Generic1DModel {
public: public:
typedef Y RVType; typedef Y RVType;
@ -45,7 +45,7 @@ class Generic1DModel {
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
template<class Y,class X1,class P> template<class Y,class X1,class P>
class Generic2DModel { class Generic2DModel {
public: public:
typedef Y RVType; typedef Y RVType;
@ -60,7 +60,7 @@ class Generic2DModel {
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
template<class Y,class X1,class X2,class P> template<class Y,class X1,class X2,class P>
class Generic3DModel { class Generic3DModel {
public: public:
typedef Y RVType; typedef Y RVType;
@ -76,7 +76,7 @@ class Generic3DModel {
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
template<class Y,class X1,class X2,class X3,class P> template<class Y,class X1,class X2,class X3,class P>
class Generic4DModel { class Generic4DModel {
public: public:
typedef Y RVType; typedef Y RVType;
@ -93,7 +93,7 @@ class Generic4DModel {
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
template<class Y,class X1,class X2,class X3,class X4,class P> template<class Y,class X1,class X2,class X3,class X4,class P>
class Generic5DModel { class Generic5DModel {
public: public:
typedef Y RVType; typedef Y RVType;
@ -111,7 +111,7 @@ class Generic5DModel {
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
template<class Y,class X1,class X2,class X3,class X4,class X5,class P> template<class Y,class X1,class X2,class X3,class X4,class X5,class P>
class Generic6DModel { class Generic6DModel {
public: public:
typedef Y RVType; typedef Y RVType;
@ -130,7 +130,7 @@ class Generic6DModel {
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
template<class Y,class X1,class X2,class X3,class X4,class X5,class X6,class P> template<class Y,class X1,class X2,class X3,class X4,class X5,class X6,class P>
class Generic7DModel { class Generic7DModel {
public: public:
typedef Y RVType; typedef Y RVType;
@ -302,7 +302,7 @@ class Modeled5DRV : public M::RVType {
const typename M::Dep2Type& x2, const typename M::Dep2Type& x2,
const typename M::Dep3Type& x3, const typename M::Dep3Type& x3,
const typename M::Dep4Type& x4 ) const { return m.getProb(*this,x1,x2,x3,x4); } const typename M::Dep4Type& x4 ) const { return m.getProb(*this,x1,x2,x3,x4); }
}; };
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
@ -346,7 +346,7 @@ class Modeled6DRV : public M::RVType {
const typename M::Dep3Type& x3, const typename M::Dep3Type& x3,
const typename M::Dep4Type& x4, const typename M::Dep4Type& x4,
const typename M::Dep5Type& x5 ) const { return m.getProb(*this,x1,x2,x3,x4,x5); } const typename M::Dep5Type& x5 ) const { return m.getProb(*this,x1,x2,x3,x4,x5); }
}; };
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
@ -395,7 +395,7 @@ class Modeled7DRV : public M::RVType {
const typename M::Dep4Type& x4, const typename M::Dep4Type& x4,
const typename M::Dep5Type& x5, const typename M::Dep5Type& x5,
const typename M::Dep6Type& x6 ) const { return m.getProb(*this,x1,x2,x3,x4,x5,x6); } const typename M::Dep6Type& x6 ) const { return m.getProb(*this,x1,x2,x3,x4,x5,x6); }
}; };
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////

View File

@ -42,7 +42,7 @@ class GenericRACPTModel : public SimpleHash<K,P> {
return ( SimpleHash<K,P>::contains(k) ); return ( SimpleHash<K,P>::contains(k) );
} }
/* /*
P getProb ( const IterVal& ikyp, const K& k ) const { P getProb ( const IterVal& ikyp, const K& k ) const {
if ( ikyp.iter.first == ikyp.iter.second ) { cerr<<"ERROR: no iterator to fix probability: "<<k<<endl; return P(); } if ( ikyp.iter.first == ikyp.iter.second ) { cerr<<"ERROR: no iterator to fix probability: "<<k<<endl; return P(); }
return ( ikyp.iter.first->second ); return ( ikyp.iter.first->second );
@ -91,7 +91,7 @@ class GenericRACPTModel : public SimpleHash<K,P> {
for ( typename HKP::const_iterator ik=HKP::begin(); ik!=HKP::end(); ik++ ) { for ( typename HKP::const_iterator ik=HKP::begin(); ik!=HKP::end(); ik++ ) {
K k=ik->first; K k=ik->first;
os << psId<<" "<<k<<" = "<<getProb(k).toDouble()<<endl; os << psId<<" "<<k<<" = "<<getProb(k).toDouble()<<endl;
// IterVal y; // IterVal y;
// for ( bool b=setFirst(y,k); b; b=setNext(y,k) ) // for ( bool b=setFirst(y,k); b; b=setNext(y,k) )
// os<<psId<<" "<<k<<" : "<<y<<" = "<<getProb(y,k).toDouble()<<"\n"; // os<<psId<<" "<<k<<" : "<<y<<" = "<<getProb(y,k).toDouble()<<"\n";
@ -110,14 +110,14 @@ class GenericRACPTModel : public SimpleHash<K,P> {
friend pair<StringInput,GenericRACPTModel<K,P>*> operator>> ( StringInput si, GenericRACPTModel<K,P>& m ) { friend pair<StringInput,GenericRACPTModel<K,P>*> operator>> ( StringInput si, GenericRACPTModel<K,P>& m ) {
return pair<StringInput,GenericRACPTModel<K,P>*>(si,&m); } return pair<StringInput,GenericRACPTModel<K,P>*>(si,&m); }
friend StringInput operator>> ( pair<StringInput,GenericRACPTModel<K,P>*> delimbuff, const char* psD ) { friend StringInput operator>> ( pair<StringInput,GenericRACPTModel<K,P>*> delimbuff, const char* psD ) {
K k; K k;
StringInput si,si2,si3; StringInput si,si2,si3;
GenericRACPTModel<K,P>& m = *delimbuff.second; GenericRACPTModel<K,P>& m = *delimbuff.second;
si=delimbuff.first; si=delimbuff.first;
if ( si==NULL ) return si; if ( si==NULL ) return si;
// Kill the colon since we're treating the whole thing as the condition // Kill the colon since we're treating the whole thing as the condition
char * str = si.c_str(); char * str = si.c_str();
char * p = strchr(str, ':'); char * p = strchr(str, ':');
@ -125,17 +125,17 @@ class GenericRACPTModel : public SimpleHash<K,P> {
p[0] = ' '; p[0] = ' ';
} }
si=str; si=str;
while((si2=si>>" ")!=NULL)si=si2; while((si2=si>>" ")!=NULL)si=si2;
si=si>>k>>" "; si=si>>k>>" ";
while((si2=si>>" ")!=NULL)si=si2; while((si2=si>>" ")!=NULL)si=si2;
si=si>>"= "; si=si>>"= ";
while((si2=si>>" ")!=NULL)si=si2; while((si2=si>>" ")!=NULL)si=si2;
return (si!=NULL) ? si>>m.setProb(k)>>psD : si; return (si!=NULL) ? si>>m.setProb(k)>>psD : si;
} }
}; };
template<class Y, class P> template<class Y, class P>
class RandAccCPT1DModel : public GenericRACPTModel<MapKey1D<Y>,P> { class RandAccCPT1DModel : public GenericRACPTModel<MapKey1D<Y>,P> {
public: public:
// typedef typename GenericCPTModel<Y,MapKey1D<Unit>,P>::IterVal IterVal; // typedef typename GenericCPTModel<Y,MapKey1D<Unit>,P>::IterVal IterVal;
@ -170,7 +170,7 @@ P& setProb ( const Y& y ) {
//////////////////// ////////////////////
template<class Y, class X1, class P> template<class Y, class X1, class P>
class RandAccCPT2DModel : public GenericRACPTModel<MapKey2D<X1,Y>,P> { class RandAccCPT2DModel : public GenericRACPTModel<MapKey2D<X1,Y>,P> {
public: public:
@ -187,7 +187,7 @@ class RandAccCPT2DModel : public GenericRACPTModel<MapKey2D<X1,Y>,P> {
P getProb ( const Y& y, const X1& x1 ) const { P getProb ( const Y& y, const X1& x1 ) const {
return GenericRACPTModel<MapKey2D<X1,Y>,P>::getProb ( MapKey2D<X1,Y>(x1,y) ); return GenericRACPTModel<MapKey2D<X1,Y>,P>::getProb ( MapKey2D<X1,Y>(x1,y) );
} }
/* /*
P& setProb ( const Y& y, const X1& x1 ) { P& setProb ( const Y& y, const X1& x1 ) {
cerr << "setProb called on racpt2d" << endl; cerr << "setProb called on racpt2d" << endl;
@ -199,7 +199,7 @@ class RandAccCPT2DModel : public GenericRACPTModel<MapKey2D<X1,Y>,P> {
//////////////////// ////////////////////
template<class Y, class X1, class X2, class P> template<class Y, class X1, class X2, class P>
class RandAccCPT3DModel : public GenericRACPTModel<MapKey3D<X1,X2,Y>,P> { class RandAccCPT3DModel : public GenericRACPTModel<MapKey3D<X1,X2,Y>,P> {
public: public:
@ -219,7 +219,7 @@ class RandAccCPT3DModel : public GenericRACPTModel<MapKey3D<X1,X2,Y>,P> {
/* /*
//////////////////// ////////////////////
template<class Y, class X1, class X2, class X3, class P> template<class Y, class X1, class X2, class X3, class P>
class CPT4DModel : public GenericCPTModel<Y,MapKey3D<X1,X2,X3>,P> { class CPT4DModel : public GenericCPTModel<Y,MapKey3D<X1,X2,X3>,P> {
public: public:
typedef typename GenericCPTModel<Y,MapKey3D<X1,X2,X3>,P>::IterVal IterVal; typedef typename GenericCPTModel<Y,MapKey3D<X1,X2,X3>,P>::IterVal IterVal;
@ -256,7 +256,7 @@ class CPT4DModel : public GenericCPTModel<Y,MapKey3D<X1,X2,X3>,P> {
//////////////////// ////////////////////
template<class Y, class X1, class X2, class X3, class X4, class P> template<class Y, class X1, class X2, class X3, class X4, class P>
class CPT5DModel : public GenericCPTModel<Y,MapKey4D<X1,X2,X3,X4>,P> { class CPT5DModel : public GenericCPTModel<Y,MapKey4D<X1,X2,X3,X4>,P> {
public: public:
typedef typename GenericCPTModel<Y,MapKey4D<X1,X2,X3,X4>,P>::IterVal IterVal; typedef typename GenericCPTModel<Y,MapKey4D<X1,X2,X3,X4>,P>::IterVal IterVal;
@ -293,7 +293,7 @@ class CPT5DModel : public GenericCPTModel<Y,MapKey4D<X1,X2,X3,X4>,P> {
//////////////////// ////////////////////
template<class Y, class X1, class X2, class X3, class X4, class X5, class P> template<class Y, class X1, class X2, class X3, class X4, class X5, class P>
class RACPT6DModel : public GenericCPTModel<Y,MapKey5D<X1,X2,X3,X4,X5>,P> { class RACPT6DModel : public GenericCPTModel<Y,MapKey5D<X1,X2,X3,X4,X5>,P> {
public: public:
typedef typename GenericCPTModel<Y,MapKey5D<X1,X2,X3,X4,X5>,P>::IterVal IterVal; typedef typename GenericCPTModel<Y,MapKey5D<X1,X2,X3,X4,X5>,P>::IterVal IterVal;

View File

@ -129,7 +129,7 @@ class DiscreteDomainRV : public Id<T> {
friend pair<StringInput,DiscreteDomainRV<T,domain>*> operator>> ( const StringInput ps, DiscreteDomainRV<T,domain>& rv ) { return pair<StringInput,DiscreteDomainRV<T,domain>*>(ps,&rv); } friend pair<StringInput,DiscreteDomainRV<T,domain>*> operator>> ( const StringInput ps, DiscreteDomainRV<T,domain>& rv ) { return pair<StringInput,DiscreteDomainRV<T,domain>*>(ps,&rv); }
friend StringInput operator>> ( pair<StringInput,DiscreteDomainRV<T,domain>*> delimbuff, const char* psDlm ) { friend StringInput operator>> ( pair<StringInput,DiscreteDomainRV<T,domain>*> delimbuff, const char* psDlm ) {
if (StringInput(NULL)==delimbuff.first) return delimbuff.first; if (StringInput(NULL)==delimbuff.first) return delimbuff.first;
////assert(*delimbuff.second<domain.getSize()); ////assert(*delimbuff.second<domain.getSize());
int j=0; int j=0;
StringInput psIn = delimbuff.first; StringInput psIn = delimbuff.first;
if(psDlm[0]=='\0') { *delimbuff.second=psIn.c_str(); return psIn+strlen(psIn.c_str()); } if(psDlm[0]=='\0') { *delimbuff.second=psIn.c_str(); return psIn+strlen(psIn.c_str()); }
@ -203,7 +203,7 @@ template <class T> const T RefRV<T>::DUMMY;
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
template<class V1,class V2> template<class V1,class V2>
class Joint2DRV { class Joint2DRV {
public: public:
@ -216,7 +216,7 @@ class Joint2DRV {
Joint2DRV ( const V1& v1, const V2& v2 ) { first=v1; second=v2; } Joint2DRV ( const V1& v1, const V2& v2 ) { first=v1; second=v2; }
// Extraction methods... // Extraction methods...
size_t getHashKey ( ) const { size_t k=rotLeft(first.getHashKey(),3); k^=second.getHashKey(); size_t getHashKey ( ) const { size_t k=rotLeft(first.getHashKey(),3); k^=second.getHashKey();
/*fprintf(stderr," (%d) %d ^& %d = %d\n",sizeof(*this),x1.getHashKey(),x2.getHashKey(),k);*/ return k; } /*fprintf(stderr," (%d) %d ^& %d = %d\n",sizeof(*this),x1.getHashKey(),x2.getHashKey(),k);*/ return k; }
bool operator< ( const Joint2DRV<V1,V2>& j ) const { return ( (first<j.first) || bool operator< ( const Joint2DRV<V1,V2>& j ) const { return ( (first<j.first) ||
(first==j.first && second<j.second) ); } (first==j.first && second<j.second) ); }
@ -276,7 +276,7 @@ class DelimitedJoint2DRV : public Joint2DRV<V1,V2> {
friend pair<StringInput,DelimitedJoint2DRV<SD1,V1,SD2,V2,SD3>*> operator>> ( StringInput ps, DelimitedJoint2DRV<SD1,V1,SD2,V2,SD3>& rv ) { return pair<StringInput,DelimitedJoint2DRV<SD1,V1,SD2,V2,SD3>*>(ps,&rv); } friend pair<StringInput,DelimitedJoint2DRV<SD1,V1,SD2,V2,SD3>*> operator>> ( StringInput ps, DelimitedJoint2DRV<SD1,V1,SD2,V2,SD3>& rv ) { return pair<StringInput,DelimitedJoint2DRV<SD1,V1,SD2,V2,SD3>*>(ps,&rv); }
friend StringInput operator>> ( pair<StringInput,DelimitedJoint2DRV<SD1,V1,SD2,V2,SD3>*> delimbuff, const char* psDlm ) { friend StringInput operator>> ( pair<StringInput,DelimitedJoint2DRV<SD1,V1,SD2,V2,SD3>*> delimbuff, const char* psDlm ) {
if (StringInput(NULL)==delimbuff.first) return delimbuff.first; if (StringInput(NULL)==delimbuff.first) return delimbuff.first;
return ( (SD3[0]=='\0') ? delimbuff.first>>SD1>>delimbuff.second->first>>SD2>>delimbuff.second->second>>psDlm return ( (SD3[0]=='\0') ? delimbuff.first>>SD1>>delimbuff.second->first>>SD2>>delimbuff.second->second>>psDlm
: delimbuff.first>>SD1>>delimbuff.second->first>>SD2>>delimbuff.second->second>>SD3>>psDlm ); : delimbuff.first>>SD1>>delimbuff.second->first>>SD2>>delimbuff.second->second>>SD3>>psDlm );
} }
}; };
@ -290,7 +290,7 @@ class DelimitedJoint2DRV : public Joint2DRV<V1,V2> {
// //
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
template<class V1,class V2,class V3> template<class V1,class V2,class V3>
class Joint3DRV { class Joint3DRV {
public: public:
@ -361,7 +361,7 @@ class DelimitedJoint3DRV : public Joint3DRV<V1,V2,V3> {
return pair<StringInput,DelimitedJoint3DRV<SD1,V1,SD2,V2,SD3,V3,SD4>*>(ps,&rv); } return pair<StringInput,DelimitedJoint3DRV<SD1,V1,SD2,V2,SD3,V3,SD4>*>(ps,&rv); }
friend StringInput operator>> ( pair<StringInput,DelimitedJoint3DRV<SD1,V1,SD2,V2,SD3,V3,SD4>*> delimbuff, const char* psDlm ) { friend StringInput operator>> ( pair<StringInput,DelimitedJoint3DRV<SD1,V1,SD2,V2,SD3,V3,SD4>*> delimbuff, const char* psDlm ) {
if (StringInput(NULL)==delimbuff.first) return delimbuff.first; if (StringInput(NULL)==delimbuff.first) return delimbuff.first;
return ( (SD4[0]=='\0') ? delimbuff.first>>SD1>>delimbuff.second->first>>SD2>>delimbuff.second->second>>SD3>>delimbuff.second->third>>psDlm return ( (SD4[0]=='\0') ? delimbuff.first>>SD1>>delimbuff.second->first>>SD2>>delimbuff.second->second>>SD3>>delimbuff.second->third>>psDlm
: delimbuff.first>>SD1>>delimbuff.second->first>>SD2>>delimbuff.second->second>>SD3>>delimbuff.second->third>>SD4>>psDlm ); : delimbuff.first>>SD1>>delimbuff.second->first>>SD2>>delimbuff.second->second>>SD3>>delimbuff.second->third>>SD4>>psDlm );
} }
}; };
@ -453,7 +453,7 @@ class DelimitedJoint4DRV : public Joint4DRV<V1,V2,V3,V4> {
// //
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
template <int I, class T> template <int I, class T>
class JointArrayRV { class JointArrayRV {
private: private:
// Data members... // Data members...
@ -491,7 +491,7 @@ class JointArrayRV {
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
template <int I, char* SD, class T> template <int I, char* SD, class T>
class DelimitedJointArrayRV : public JointArrayRV<I,T> { class DelimitedJointArrayRV : public JointArrayRV<I,T> {
public: public:
@ -569,7 +569,7 @@ class History {
/* /*
void read ( char* ps, const ReaderContext& rc=ReaderContext() ) { char* psT; for(int i=0;i<N;i++){char* z=strtok_r((0==i)?ps:NULL,";",&psT); assert(z); at.set(i).read(z);} } void read ( char* ps, const ReaderContext& rc=ReaderContext() ) { char* psT; for(int i=0;i<N;i++){char* z=strtok_r((0==i)?ps:NULL,";",&psT); assert(z); at.set(i).read(z);} }
//at.set(i).read(strtok_r((0==i)?ps:NULL,";",&psT)); } //at.set(i).read(strtok_r((0==i)?ps:NULL,";",&psT)); }
*/ */
friend ostream& operator<< ( ostream& os, const History<N,T>& a ) { for(int i=0;i<N;i++)os<<((i==0)?"":";")<<a.getBack(i); return os; } friend ostream& operator<< ( ostream& os, const History<N,T>& a ) { for(int i=0;i<N;i++)os<<((i==0)?"":";")<<a.getBack(i); return os; }
friend pair<StringInput,History<N,T>*> operator>> ( StringInput ps, History<N,T>& a ) { return pair<StringInput,History<N,T>*>(ps,&a); } friend pair<StringInput,History<N,T>*> operator>> ( StringInput ps, History<N,T>& a ) { return pair<StringInput,History<N,T>*>(ps,&a); }

View File

@ -30,7 +30,7 @@
#include "nl-stream.h" #include "nl-stream.h"
#include <iostream> #include <iostream>
using namespace std; using namespace std;
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
@ -39,7 +39,7 @@ using namespace std;
// //
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
template <int I, class T> template <int I, class T>
class StaticSafeArray { class StaticSafeArray {
private: private:
// Data members... // Data members...
@ -84,7 +84,7 @@ class StaticSafeArray {
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
template <int I, char* SD, class T> template <int I, char* SD, class T>
class DelimitedStaticSafeArray : public StaticSafeArray<I,T> { class DelimitedStaticSafeArray : public StaticSafeArray<I,T> {
public: public:
DelimitedStaticSafeArray ( ) : StaticSafeArray<I,T>() { } DelimitedStaticSafeArray ( ) : StaticSafeArray<I,T>() { }
@ -349,7 +349,7 @@ class SafeArray2D {
// Extraction methods... // Extraction methods...
const T& get (const X1& x,const X2& y) const { assert(at!=NULL); const T& get (const X1& x,const X2& y) const { assert(at!=NULL);
assert(x.toInt()>=0); assert(x.toInt()<xSize); assert(x.toInt()>=0); assert(x.toInt()<xSize);
assert(y.toInt()>=0); assert(y.toInt()>=0);
//this assert failed when compile without -DNDEBUG (needed for debugging). Have to figure out why before adding this assert back in //this assert failed when compile without -DNDEBUG (needed for debugging). Have to figure out why before adding this assert back in
//assert(y.toInt()<ySize); //assert(y.toInt()<ySize);
return at[x.toInt()*ySize + y.toInt()];} return at[x.toInt()*ySize + y.toInt()];}
@ -423,7 +423,7 @@ class SafeArray4D {
{ delete[] at; wSize=sat.wSize; xSize=sat.xSize; ySize=sat.ySize; { delete[] at; wSize=sat.wSize; xSize=sat.xSize; ySize=sat.ySize;
zSize=sat.zSize; at=new T[wSize*xSize*ySize*zSize]; zSize=sat.zSize; at=new T[wSize*xSize*ySize*zSize];
for(int i=0;i<wSize*xSize*ySize*zSize;i++) at[i]=sat.at[i]; return *this; } for(int i=0;i<wSize*xSize*ySize*zSize;i++) at[i]=sat.at[i]; return *this; }
void init (int w,int x,int y,int z) void init (int w,int x,int y,int z)
{ delete[] at; wSize=w; xSize=x; ySize=y; zSize=z; at=new T[w*x*y*z]; } { delete[] at; wSize=w; xSize=x; ySize=y; zSize=z; at=new T[w*x*y*z]; }
void init (int w,int x,int y,int z,const T& t) void init (int w,int x,int y,int z,const T& t)
{ delete[] at; wSize=w; xSize=x; ySize=y; zSize=z; at=new T[w*x*y*z]; { delete[] at; wSize=w; xSize=x; ySize=y; zSize=z; at=new T[w*x*y*z];
@ -472,7 +472,7 @@ class SafeArray5D {
{ delete[] at; vSize=sat.vSize; wSize=sat.wSize; xSize=sat.xSize; { delete[] at; vSize=sat.vSize; wSize=sat.wSize; xSize=sat.xSize;
ySize=sat.ySize; zSize=sat.zSize; at=new T[vSize*wSize*xSize*ySize*zSize]; ySize=sat.ySize; zSize=sat.zSize; at=new T[vSize*wSize*xSize*ySize*zSize];
for(int i=0;i<vSize*wSize*xSize*ySize*zSize;i++) at[i]=sat.at[i]; return *this; } for(int i=0;i<vSize*wSize*xSize*ySize*zSize;i++) at[i]=sat.at[i]; return *this; }
void init(int v,int w,int x,int y,int z) void init(int v,int w,int x,int y,int z)
{ delete[] at; vSize=v; wSize=w; xSize=x; ySize=y; zSize=z; at=new T[v*w*x*y*z]; } { delete[] at; vSize=v; wSize=w; xSize=x; ySize=y; zSize=z; at=new T[v*w*x*y*z]; }
void init(int v,int w,int x,int y,int z,const T& t) void init(int v,int w,int x,int y,int z,const T& t)
{ delete[] at; vSize=v; wSize=w; xSize=x; ySize=y; zSize=z; at=new T[v*w*x*y*z]; { delete[] at; vSize=v; wSize=w; xSize=x; ySize=y; zSize=z; at=new T[v*w*x*y*z];

View File

@ -86,7 +86,7 @@ class IStream {
friend ostream& operator<< ( ostream& os, const IStream& is ) { return os<<is.iIndex<<","<<is.psrc<<","<<*is.psrc; } friend ostream& operator<< ( ostream& os, const IStream& is ) { return os<<is.iIndex<<","<<is.psrc<<","<<*is.psrc; }
// Match single char... // Match single char...
friend IStream operator>> ( IStream is, char& c ) { friend IStream operator>> ( IStream is, char& c ) {
// Propagate fail... // Propagate fail...
if (IStream()==is) return IStream(); if (IStream()==is) return IStream();
c=is.get(is.iIndex); c=is.get(is.iIndex);
@ -106,7 +106,7 @@ class IStream {
// Match anything else followed by zero-terminated string delimiter... // Match anything else followed by zero-terminated string delimiter...
template<class X> friend pair<IStream,X*> operator>> ( IStream is, X& x ) { return pair<IStream,X*>(is,&x); } template<class X> friend pair<IStream,X*> operator>> ( IStream is, X& x ) { return pair<IStream,X*>(is,&x); }
template<class X> friend IStream operator>> ( pair<IStream,X*> is_x, const char* psDlm ) { template<class X> friend IStream operator>> ( pair<IStream,X*> is_x, const char* psDlm ) {
IStream& is = is_x.first; IStream& is = is_x.first;
X& x = *is_x.second; X& x = *is_x.second;
// Propagate fail... // Propagate fail...
@ -129,7 +129,7 @@ class IStream {
} }
// Match integer followed by zero-terminated string delimiter... // Match integer followed by zero-terminated string delimiter...
friend IStream operator>> ( pair<IStream,int*> is_x, const char* psDlm ) { friend IStream operator>> ( pair<IStream,int*> is_x, const char* psDlm ) {
IStream& is = is_x.first; IStream& is = is_x.first;
int& x = *is_x.second; int& x = *is_x.second;
// Propagate fail... // Propagate fail...
@ -151,7 +151,7 @@ class IStream {
} }
// Match unsigned int followed by zero-terminated string delimiter... // Match unsigned int followed by zero-terminated string delimiter...
friend IStream operator>> ( pair<IStream,unsigned int*> is_x, const char* psDlm ) { friend IStream operator>> ( pair<IStream,unsigned int*> is_x, const char* psDlm ) {
IStream& is = is_x.first; IStream& is = is_x.first;
unsigned int& x = *is_x.second; unsigned int& x = *is_x.second;
// Propagate fail... // Propagate fail...
@ -173,7 +173,7 @@ class IStream {
} }
// Match float followed by zero-terminated string delimiter... // Match float followed by zero-terminated string delimiter...
friend IStream operator>> ( pair<IStream,float*> is_x, const char* psDlm ) { friend IStream operator>> ( pair<IStream,float*> is_x, const char* psDlm ) {
IStream& is = is_x.first; IStream& is = is_x.first;
float& x = *is_x.second; float& x = *is_x.second;
// Propagate fail... // Propagate fail...
@ -195,7 +195,7 @@ class IStream {
} }
// Match double followed by zero-terminated string delimiter... // Match double followed by zero-terminated string delimiter...
friend IStream operator>> ( pair<IStream,double*> is_x, const char* psDlm ) { friend IStream operator>> ( pair<IStream,double*> is_x, const char* psDlm ) {
IStream& is = is_x.first; IStream& is = is_x.first;
double& x = *is_x.second; double& x = *is_x.second;
// Propagate fail... // Propagate fail...
@ -217,7 +217,7 @@ class IStream {
} }
// Match void pointer followed by zero-terminated string delimiter... // Match void pointer followed by zero-terminated string delimiter...
friend IStream operator>> ( pair<IStream,void**> is_x, const char* psDlm ) { friend IStream operator>> ( pair<IStream,void**> is_x, const char* psDlm ) {
IStream& is = is_x.first; IStream& is = is_x.first;
// Propagate fail... // Propagate fail...
if (IStream()==is) return IStream(); if (IStream()==is) return IStream();

View File

@ -68,13 +68,13 @@ class StringInput {
friend StringInput operator>> ( StringInput psIn, const char* psDlm ) { friend StringInput operator>> ( StringInput psIn, const char* psDlm ) {
if (StringInput(NULL)==psIn) return psIn; if (StringInput(NULL)==psIn) return psIn;
int i; int i;
for (i=0; psIn[i]!='\0' && psDlm[i]!='\0'; i++) for (i=0; psIn[i]!='\0' && psDlm[i]!='\0'; i++)
if(psIn[i]!=psDlm[i]) return StringInput(NULL); //psIn; if(psIn[i]!=psDlm[i]) return StringInput(NULL); //psIn;
return (psDlm[i]!='\0') ? StringInput(NULL) : (psIn[i]!='\0') ? psIn+i : SI_EOS; return (psDlm[i]!='\0') ? StringInput(NULL) : (psIn[i]!='\0') ? psIn+i : SI_EOS;
} }
friend pair<StringInput,int*> operator>> ( StringInput ps, int& n ) { return pair<StringInput,int*>(ps,&n); } friend pair<StringInput,int*> operator>> ( StringInput ps, int& n ) { return pair<StringInput,int*>(ps,&n); }
friend StringInput operator>> ( pair<StringInput,int*> delimbuff, const char* psDlm ) { friend StringInput operator>> ( pair<StringInput,int*> delimbuff, const char* psDlm ) {
if (StringInput(NULL)==delimbuff.first) return delimbuff.first; if (StringInput(NULL)==delimbuff.first) return delimbuff.first;
///int i; for(i=0;psIn[i]!='\0';i++) if(psIn[i]==psDlm[i]) return psIn; return psIn+i; ///int i; for(i=0;psIn[i]!='\0';i++) if(psIn[i]==psDlm[i]) return psIn; return psIn+i;
int j=0; int j=0;
@ -90,7 +90,7 @@ class StringInput {
} }
friend pair<StringInput,unsigned int*> operator>> ( StringInput ps, unsigned int& n ) { return pair<StringInput,unsigned int*>(ps,&n); } friend pair<StringInput,unsigned int*> operator>> ( StringInput ps, unsigned int& n ) { return pair<StringInput,unsigned int*>(ps,&n); }
friend StringInput operator>> ( pair<StringInput,unsigned int*> delimbuff, const char* psDlm ) { friend StringInput operator>> ( pair<StringInput,unsigned int*> delimbuff, const char* psDlm ) {
if (StringInput(NULL)==delimbuff.first) return delimbuff.first; if (StringInput(NULL)==delimbuff.first) return delimbuff.first;
///int i; for(i=0;psIn[i]!='\0';i++) if(psIn[i]==psDlm[i]) return psIn; return psIn+i; ///int i; for(i=0;psIn[i]!='\0';i++) if(psIn[i]==psDlm[i]) return psIn; return psIn+i;
int j=0; int j=0;
@ -106,7 +106,7 @@ class StringInput {
} }
friend pair<StringInput,double*> operator>> ( StringInput ps, double& d ) { return pair<StringInput,double*>(ps,&d); } friend pair<StringInput,double*> operator>> ( StringInput ps, double& d ) { return pair<StringInput,double*>(ps,&d); }
friend StringInput operator>> ( pair<StringInput,double*> delimbuff, const char* psDlm ) { friend StringInput operator>> ( pair<StringInput,double*> delimbuff, const char* psDlm ) {
if (StringInput(NULL)==delimbuff.first) return delimbuff.first; if (StringInput(NULL)==delimbuff.first) return delimbuff.first;
///int i; for(i=0;psIn[i]!='\0';i++) if(psIn[i]==psDlm[i]) return psIn; return psIn+i; ///int i; for(i=0;psIn[i]!='\0';i++) if(psIn[i]==psDlm[i]) return psIn; return psIn+i;
int j=0; int j=0;
@ -191,7 +191,7 @@ class String : public Array<char> {
friend pair<StringInput,String*> operator>> ( const StringInput ps, String& s ) { return pair<StringInput,String*>(ps,&s); } friend pair<StringInput,String*> operator>> ( const StringInput ps, String& s ) { return pair<StringInput,String*>(ps,&s); }
friend StringInput operator>> ( pair<StringInput,String*> delimbuff, const char* psDlm ) { friend StringInput operator>> ( pair<StringInput,String*> delimbuff, const char* psDlm ) {
if (StringInput(NULL)==delimbuff.first) return delimbuff.first; if (StringInput(NULL)==delimbuff.first) return delimbuff.first;
////assert(*delimbuff.second<domain.getSize()); ////assert(*delimbuff.second<domain.getSize());
int j=0; int j=0;
StringInput psIn = delimbuff.first; StringInput psIn = delimbuff.first;
if(psDlm[0]=='\0') { *delimbuff.second=String(psIn.c_str()); return psIn+strlen(psIn.c_str()); } if(psDlm[0]=='\0') { *delimbuff.second=String(psIn.c_str()); return psIn+strlen(psIn.c_str()); }

View File

@ -38,7 +38,7 @@ class StringIndex{
map <string, int> msi; map <string, int> msi;
map <int, string> mis; map <int, string> mis;
int maxIndex; int maxIndex;
public: public:
// Constructor / destructor methods... // Constructor / destructor methods...

View File

@ -22,7 +22,7 @@
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
/*********************************************** /***********************************************
* nl-tetrahex.h * nl-tetrahex.h
* a little header with some base conversion stuff * a little header with some base conversion stuff
* so that we can represent base 16, 32 or 64 with * so that we can represent base 16, 32 or 64 with
* one character. * one character.

View File

@ -41,7 +41,7 @@ class Timer {
} }
double elapsed ( ) { // in milliseconds. double elapsed ( ) { // in milliseconds.
return (double(kept.tv_sec)*1000.0 + double(kept.tv_usec)/1000.0); return (double(kept.tv_sec)*1000.0 + double(kept.tv_usec)/1000.0);
//struct timeval end; gettimeofday(&end,NULL); //struct timeval end; gettimeofday(&end,NULL);
//double beg_time_s = (double) beg.tv_sec + (double) ((double)beg.tv_usec / 1000000.0); //double beg_time_s = (double) beg.tv_sec + (double) ((double)beg.tv_usec / 1000000.0);
//double end_time_s = (double) end.tv_sec + (double) ((double)end.tv_usec / 1000000.0); //double end_time_s = (double) end.tv_sec + (double) ((double)end.tv_usec / 1000000.0);
//return ( (end_time_s - beg_time_s) * 1000.0 ); //return ( (end_time_s - beg_time_s) * 1000.0 );

View File

@ -136,7 +136,7 @@ class Rd : public DiscreteDomainRV<int,domRd> {
} }
if (!hToG.contains(*this)) { if (!hToG.contains(*this)) {
size_t i=s.find(','); size_t i=s.find(',');
assert(i!=string::npos); assert(i!=string::npos);
hToG.set(*this) = G(s.substr(i+1).c_str()); hToG.set(*this) = G(s.substr(i+1).c_str());
if ( '1'==s[0] ) if ( '1'==s[0] )
hFromG.set(G(s.substr(i+1).c_str())) = *this; hFromG.set(G(s.substr(i+1).c_str())) = *this;

View File

@ -42,11 +42,11 @@ typedef HidVarCPT2DModel<P,C,LogProb> PgivCModel;
class WModel { class WModel {
private: private:
TrainableDTree2DModel<P,W,LogProb> modPgivWdt; TrainableDTree2DModel<P,W,LogProb> modPgivWdt;
RandAccCPT2DModel<P,W,LogProb> modPgivWs; RandAccCPT2DModel<P,W,LogProb> modPgivWs;
RandAccCPT1DModel<P,LogProb> modP; RandAccCPT1DModel<P,LogProb> modP;
RandAccCPT1DModel<W,LogProb> modW; RandAccCPT1DModel<W,LogProb> modW;
public: public:
//LogProb getProb ( const W& w, const HidVarCPT1DModel<P,LogProb>::IterVal& p ) const { //LogProb getProb ( const W& w, const HidVarCPT1DModel<P,LogProb>::IterVal& p ) const {
LogProb getProb ( const W& w, const P::ArrayIterator<LogProb>& p ) const { LogProb getProb ( const W& w, const P::ArrayIterator<LogProb>& p ) const {
@ -93,8 +93,8 @@ class OModel {
}; };
typedef DistribModeledWgivC RandVarType; typedef DistribModeledWgivC RandVarType;
void calcProb ( OModel::RandVarType& o, const W& w ) const { void calcProb ( OModel::RandVarType& o, const W& w ) const {
o.clear(); o.clear();
@ -106,7 +106,7 @@ class OModel {
for (LogProb pr=modPgivC.setIterProb(p,c,aCtr); pr!=LogProb(); pr = modPgivC.setIterProb(p,c,aCtr=0) ){ for (LogProb pr=modPgivC.setIterProb(p,c,aCtr); pr!=LogProb(); pr = modPgivC.setIterProb(p,c,aCtr=0) ){
o.setProb(c) += modPgivC.getProb(p,c).toProb() * modWgivP.getProb(w,p).toProb(); o.setProb(c) += modPgivC.getProb(p,c).toProb() * modWgivP.getProb(w,p).toProb();
} }
} }
} }
@ -134,7 +134,7 @@ class XModel {
RandAccCPT2DModel<P,W,Prob> modPgivW; RandAccCPT2DModel<P,W,Prob> modPgivW;
RandAccCPT1DModel<P,Prob> modP; RandAccCPT1DModel<P,Prob> modP;
RandAccCPT1DModel<W,Prob> modW; RandAccCPT1DModel<W,Prob> modW;
public: public:
typedef X RandVarType; typedef X RandVarType;

View File

@ -11,12 +11,12 @@ namespace lm {
namespace ngram { namespace ngram {
namespace trie { namespace trie {
DontBhiksha::DontBhiksha(const void * /*base*/, uint64_t /*max_offset*/, uint64_t max_next, const Config &/*config*/) : DontBhiksha::DontBhiksha(const void * /*base*/, uint64_t /*max_offset*/, uint64_t max_next, const Config &/*config*/) :
next_(util::BitsMask::ByMax(max_next)) {} next_(util::BitsMask::ByMax(max_next)) {}
const uint8_t kArrayBhikshaVersion = 0; const uint8_t kArrayBhikshaVersion = 0;
// TODO: put this in binary file header instead when I change the binary file format again. // TODO: put this in binary file header instead when I change the binary file format again.
void ArrayBhiksha::UpdateConfigFromBinary(const BinaryFormat &file, uint64_t offset, Config &config) { void ArrayBhiksha::UpdateConfigFromBinary(const BinaryFormat &file, uint64_t offset, Config &config) {
uint8_t buffer[2]; uint8_t buffer[2];
file.ReadForConfig(buffer, 2, offset); file.ReadForConfig(buffer, 2, offset);
@ -33,7 +33,7 @@ uint8_t ChopBits(uint64_t max_offset, uint64_t max_next, const Config &config) {
uint8_t required = util::RequiredBits(max_next); uint8_t required = util::RequiredBits(max_next);
uint8_t best_chop = 0; uint8_t best_chop = 0;
int64_t lowest_change = std::numeric_limits<int64_t>::max(); int64_t lowest_change = std::numeric_limits<int64_t>::max();
// There are probably faster ways but I don't care because this is only done once per order at construction time. // There are probably faster ways but I don't care because this is only done once per order at construction time.
for (uint8_t chop = 0; chop <= std::min(required, config.pointer_bhiksha_bits); ++chop) { for (uint8_t chop = 0; chop <= std::min(required, config.pointer_bhiksha_bits); ++chop) {
int64_t change = (max_next >> (required - chop)) * 64 /* table cost in bits */ int64_t change = (max_next >> (required - chop)) * 64 /* table cost in bits */
- max_offset * static_cast<int64_t>(chop); /* savings in bits*/ - max_offset * static_cast<int64_t>(chop); /* savings in bits*/

View File

@ -7,7 +7,7 @@
* pages={388--391}, * pages={388--391},
* } * }
* *
* Currently only used for next pointers. * Currently only used for next pointers.
*/ */
#ifndef LM_BHIKSHA_H #ifndef LM_BHIKSHA_H
@ -86,9 +86,9 @@ class ArrayBhiksha {
// assert(end_it == std::upper_bound(offset_begin_, offset_end_, index + 1)); // assert(end_it == std::upper_bound(offset_begin_, offset_end_, index + 1));
--end_it; --end_it;
// assert(end_it >= begin_it); // assert(end_it >= begin_it);
out.begin = ((begin_it - offset_begin_) << next_inline_.bits) | out.begin = ((begin_it - offset_begin_) << next_inline_.bits) |
util::ReadInt57(base, bit_offset, next_inline_.bits, next_inline_.mask); util::ReadInt57(base, bit_offset, next_inline_.bits, next_inline_.mask);
out.end = ((end_it - offset_begin_) << next_inline_.bits) | out.end = ((end_it - offset_begin_) << next_inline_.bits) |
util::ReadInt57(base, bit_offset + total_bits, next_inline_.bits, next_inline_.mask); util::ReadInt57(base, bit_offset + total_bits, next_inline_.bits, next_inline_.mask);
// If this fails, consider rebuilding your model using KenLM after 1e333d786b748555e8f368d2bbba29a016c98052 // If this fails, consider rebuilding your model using KenLM after 1e333d786b748555e8f368d2bbba29a016c98052
assert(out.end >= out.begin); assert(out.end >= out.begin);

View File

@ -135,7 +135,7 @@ void MatchCheck(ModelType model_type, unsigned int search_version, const Paramet
const std::size_t kInvalidSize = static_cast<std::size_t>(-1); const std::size_t kInvalidSize = static_cast<std::size_t>(-1);
BinaryFormat::BinaryFormat(const Config &config) BinaryFormat::BinaryFormat(const Config &config)
: write_method_(config.write_method), write_mmap_(config.write_mmap), load_method_(config.load_method), : write_method_(config.write_method), write_mmap_(config.write_mmap), load_method_(config.load_method),
header_size_(kInvalidSize), vocab_size_(kInvalidSize), vocab_string_offset_(kInvalidOffset) {} header_size_(kInvalidSize), vocab_size_(kInvalidSize), vocab_string_offset_(kInvalidOffset) {}

View File

@ -19,18 +19,18 @@ namespace ngram {
extern const char *kModelNames[6]; extern const char *kModelNames[6];
/*Inspect a file to determine if it is a binary lm. If not, return false. /*Inspect a file to determine if it is a binary lm. If not, return false.
* If so, return true and set recognized to the type. This is the only API in * If so, return true and set recognized to the type. This is the only API in
* this header designed for use by decoder authors. * this header designed for use by decoder authors.
*/ */
bool RecognizeBinary(const char *file, ModelType &recognized); bool RecognizeBinary(const char *file, ModelType &recognized);
struct FixedWidthParameters { struct FixedWidthParameters {
unsigned char order; unsigned char order;
float probing_multiplier; float probing_multiplier;
// What type of model is this? // What type of model is this?
ModelType model_type; ModelType model_type;
// Does the end of the file have the actual strings in the vocabulary? // Does the end of the file have the actual strings in the vocabulary?
bool has_vocabulary; bool has_vocabulary;
unsigned int search_version; unsigned int search_version;
}; };
@ -38,7 +38,7 @@ struct FixedWidthParameters {
// This is a macro instead of an inline function so constants can be assigned using it. // This is a macro instead of an inline function so constants can be assigned using it.
#define ALIGN8(a) ((std::ptrdiff_t(((a)-1)/8)+1)*8) #define ALIGN8(a) ((std::ptrdiff_t(((a)-1)/8)+1)*8)
// Parameters stored in the header of a binary file. // Parameters stored in the header of a binary file.
struct Parameters { struct Parameters {
FixedWidthParameters fixed; FixedWidthParameters fixed;
std::vector<uint64_t> counts; std::vector<uint64_t> counts;
@ -79,7 +79,7 @@ class BinaryFormat {
const char *write_mmap_; const char *write_mmap_;
util::LoadMethod load_method_; util::LoadMethod load_method_;
// File behind memory, if any. // File behind memory, if any.
util::scoped_fd file_; util::scoped_fd file_;
// If there is a file involved, a single mapping. // If there is a file involved, a single mapping.

View File

@ -15,9 +15,9 @@ namespace ngram {
* kNoExtensionBackoff. If the n-gram might be extended, then out_state must * kNoExtensionBackoff. If the n-gram might be extended, then out_state must
* contain the full n-gram, in which case kExtensionBackoff is set. In any * contain the full n-gram, in which case kExtensionBackoff is set. In any
* case, if an n-gram has non-zero backoff, the full state is returned so * case, if an n-gram has non-zero backoff, the full state is returned so
* backoff can be properly charged. * backoff can be properly charged.
* These differ only in sign bit because the backoff is in fact zero in either * These differ only in sign bit because the backoff is in fact zero in either
* case. * case.
*/ */
const float kNoExtensionBackoff = -0.0; const float kNoExtensionBackoff = -0.0;
const float kExtensionBackoff = 0.0; const float kExtensionBackoff = 0.0;
@ -28,7 +28,7 @@ inline void SetExtension(float &backoff) {
if (backoff == kNoExtensionBackoff) backoff = kExtensionBackoff; if (backoff == kNoExtensionBackoff) backoff = kExtensionBackoff;
} }
// This compiles down nicely. // This compiles down nicely.
inline bool HasExtension(const float &backoff) { inline bool HasExtension(const float &backoff) {
typedef union { float f; uint32_t i; } UnionValue; typedef union { float f; uint32_t i; } UnionValue;
UnionValue compare, interpret; UnionValue compare, interpret;

View File

@ -56,7 +56,7 @@ void Usage(const char *name, const char *default_mem) {
exit(1); exit(1);
} }
// I could really use boost::lexical_cast right about now. // I could really use boost::lexical_cast right about now.
float ParseFloat(const char *from) { float ParseFloat(const char *from) {
char *end; char *end;
float ret = strtod(from, &end); float ret = strtod(from, &end);

View File

@ -114,7 +114,7 @@ class CollapseStream {
current_(NULL, NGram::OrderFromSize(position.GetChain().EntrySize())), current_(NULL, NGram::OrderFromSize(position.GetChain().EntrySize())),
prune_threshold_(prune_threshold), prune_threshold_(prune_threshold),
prune_words_(prune_words), prune_words_(prune_words),
block_(position) { block_(position) {
StartBlock(); StartBlock();
} }
@ -125,27 +125,27 @@ class CollapseStream {
CollapseStream &operator++() { CollapseStream &operator++() {
assert(block_); assert(block_);
if (current_.begin()[1] == kBOS && current_.Base() < copy_from_) { if (current_.begin()[1] == kBOS && current_.Base() < copy_from_) {
memcpy(current_.Base(), copy_from_, current_.TotalSize()); memcpy(current_.Base(), copy_from_, current_.TotalSize());
UpdateCopyFrom(); UpdateCopyFrom();
// Mark highest order n-grams for later pruning // Mark highest order n-grams for later pruning
if(current_.Count() <= prune_threshold_) { if(current_.Count() <= prune_threshold_) {
current_.Mark(); current_.Mark();
} }
if(!prune_words_.empty()) { if(!prune_words_.empty()) {
for(WordIndex* i = current_.begin(); i != current_.end(); i++) { for(WordIndex* i = current_.begin(); i != current_.end(); i++) {
if(prune_words_[*i]) { if(prune_words_[*i]) {
current_.Mark(); current_.Mark();
break; break;
} }
} }
} }
} }
current_.NextInMemory(); current_.NextInMemory();
uint8_t *block_base = static_cast<uint8_t*>(block_->Get()); uint8_t *block_base = static_cast<uint8_t*>(block_->Get());
if (current_.Base() == block_base + block_->ValidSize()) { if (current_.Base() == block_base + block_->ValidSize()) {
@ -153,21 +153,21 @@ class CollapseStream {
++block_; ++block_;
StartBlock(); StartBlock();
} }
// Mark highest order n-grams for later pruning // Mark highest order n-grams for later pruning
if(current_.Count() <= prune_threshold_) { if(current_.Count() <= prune_threshold_) {
current_.Mark(); current_.Mark();
} }
if(!prune_words_.empty()) { if(!prune_words_.empty()) {
for(WordIndex* i = current_.begin(); i != current_.end(); i++) { for(WordIndex* i = current_.begin(); i != current_.end(); i++) {
if(prune_words_[*i]) { if(prune_words_[*i]) {
current_.Mark(); current_.Mark();
break; break;
} }
} }
} }
return *this; return *this;
} }
@ -180,21 +180,21 @@ class CollapseStream {
current_.ReBase(block_->Get()); current_.ReBase(block_->Get());
copy_from_ = static_cast<uint8_t*>(block_->Get()) + block_->ValidSize(); copy_from_ = static_cast<uint8_t*>(block_->Get()) + block_->ValidSize();
UpdateCopyFrom(); UpdateCopyFrom();
// Mark highest order n-grams for later pruning // Mark highest order n-grams for later pruning
if(current_.Count() <= prune_threshold_) { if(current_.Count() <= prune_threshold_) {
current_.Mark(); current_.Mark();
} }
if(!prune_words_.empty()) { if(!prune_words_.empty()) {
for(WordIndex* i = current_.begin(); i != current_.end(); i++) { for(WordIndex* i = current_.begin(); i != current_.end(); i++) {
if(prune_words_[*i]) { if(prune_words_[*i]) {
current_.Mark(); current_.Mark();
break; break;
} }
} }
} }
} }
// Find last without bos. // Find last without bos.
@ -222,18 +222,18 @@ void AdjustCounts::Run(const util::stream::ChainPositions &positions) {
StatCollector stats(order, counts_, counts_pruned_, discounts_); StatCollector stats(order, counts_, counts_pruned_, discounts_);
if (order == 1) { if (order == 1) {
// Only unigrams. Just collect stats. // Only unigrams. Just collect stats.
for (NGramStream full(positions[0]); full; ++full) { for (NGramStream full(positions[0]); full; ++full) {
// Do not prune <s> </s> <unk> // Do not prune <s> </s> <unk>
if(*full->begin() > 2) { if(*full->begin() > 2) {
if(full->Count() <= prune_thresholds_[0]) if(full->Count() <= prune_thresholds_[0])
full->Mark(); full->Mark();
if(!prune_words_.empty() && prune_words_[*full->begin()]) if(!prune_words_.empty() && prune_words_[*full->begin()])
full->Mark(); full->Mark();
} }
stats.AddFull(full->UnmarkedCount(), full->IsMarked()); stats.AddFull(full->UnmarkedCount(), full->IsMarked());
} }
@ -243,7 +243,7 @@ void AdjustCounts::Run(const util::stream::ChainPositions &positions) {
NGramStreams streams; NGramStreams streams;
streams.Init(positions, positions.size() - 1); streams.Init(positions, positions.size() - 1);
CollapseStream full(positions[positions.size() - 1], prune_thresholds_.back(), prune_words_); CollapseStream full(positions[positions.size() - 1], prune_thresholds_.back(), prune_words_);
// Initialization: <unk> has count 0 and so does <s>. // Initialization: <unk> has count 0 and so does <s>.
@ -261,7 +261,7 @@ void AdjustCounts::Run(const util::stream::ChainPositions &positions) {
std::vector<uint64_t> actual_counts(positions.size(), 0); std::vector<uint64_t> actual_counts(positions.size(), 0);
// Something of a hack: don't prune <s>. // Something of a hack: don't prune <s>.
actual_counts[0] = std::numeric_limits<uint64_t>::max(); actual_counts[0] = std::numeric_limits<uint64_t>::max();
// Iterate over full (the stream of the highest order ngrams) // Iterate over full (the stream of the highest order ngrams)
for (; full; ++full) { for (; full; ++full) {
const WordIndex *different = FindDifference(*full, **lower_valid); const WordIndex *different = FindDifference(*full, **lower_valid);
@ -272,16 +272,16 @@ void AdjustCounts::Run(const util::stream::ChainPositions &positions) {
uint64_t order_minus_1 = lower_valid - streams_begin; uint64_t order_minus_1 = lower_valid - streams_begin;
if(actual_counts[order_minus_1] <= prune_thresholds_[order_minus_1]) if(actual_counts[order_minus_1] <= prune_thresholds_[order_minus_1])
(*lower_valid)->Mark(); (*lower_valid)->Mark();
if(!prune_words_.empty()) { if(!prune_words_.empty()) {
for(WordIndex* i = (*lower_valid)->begin(); i != (*lower_valid)->end(); i++) { for(WordIndex* i = (*lower_valid)->begin(); i != (*lower_valid)->end(); i++) {
if(prune_words_[*i]) { if(prune_words_[*i]) {
(*lower_valid)->Mark(); (*lower_valid)->Mark();
break; break;
} }
} }
} }
stats.Add(order_minus_1, (*lower_valid)->UnmarkedCount(), (*lower_valid)->IsMarked()); stats.Add(order_minus_1, (*lower_valid)->UnmarkedCount(), (*lower_valid)->IsMarked());
++*lower_valid; ++*lower_valid;
} }
@ -327,16 +327,16 @@ void AdjustCounts::Run(const util::stream::ChainPositions &positions) {
uint64_t lower_count = actual_counts[(*s)->Order() - 1]; uint64_t lower_count = actual_counts[(*s)->Order() - 1];
if(lower_count <= prune_thresholds_[(*s)->Order() - 1]) if(lower_count <= prune_thresholds_[(*s)->Order() - 1])
(*s)->Mark(); (*s)->Mark();
if(!prune_words_.empty()) { if(!prune_words_.empty()) {
for(WordIndex* i = (*s)->begin(); i != (*s)->end(); i++) { for(WordIndex* i = (*s)->begin(); i != (*s)->end(); i++) {
if(prune_words_[*i]) { if(prune_words_[*i]) {
(*s)->Mark(); (*s)->Mark();
break; break;
} }
} }
} }
stats.Add(s - streams.begin(), lower_count, (*s)->IsMarked()); stats.Add(s - streams.begin(), lower_count, (*s)->IsMarked());
++*s; ++*s;
} }

View File

@ -30,9 +30,9 @@ struct DiscountConfig {
WarningAction bad_action; WarningAction bad_action;
}; };
/* Compute adjusted counts. /* Compute adjusted counts.
* Input: unique suffix sorted N-grams (and just the N-grams) with raw counts. * Input: unique suffix sorted N-grams (and just the N-grams) with raw counts.
* Output: [1,N]-grams with adjusted counts. * Output: [1,N]-grams with adjusted counts.
* [1,N)-grams are in suffix order * [1,N)-grams are in suffix order
* N-grams are in undefined order (they're going to be sorted anyway). * N-grams are in undefined order (they're going to be sorted anyway).
*/ */
@ -50,13 +50,13 @@ class AdjustCounts {
const DiscountConfig &discount_config, const DiscountConfig &discount_config,
std::vector<Discount> &discounts) std::vector<Discount> &discounts)
: prune_thresholds_(prune_thresholds), counts_(counts), counts_pruned_(counts_pruned), : prune_thresholds_(prune_thresholds), counts_(counts), counts_pruned_(counts_pruned),
prune_words_(prune_words), discount_config_(discount_config), discounts_(discounts) prune_words_(prune_words), discount_config_(discount_config), discounts_(discounts)
{} {}
void Run(const util::stream::ChainPositions &positions); void Run(const util::stream::ChainPositions &positions);
private: private:
const std::vector<uint64_t> &prune_thresholds_; const std::vector<uint64_t> &prune_thresholds_;
std::vector<uint64_t> &counts_; std::vector<uint64_t> &counts_;
std::vector<uint64_t> &counts_pruned_; std::vector<uint64_t> &counts_pruned_;
const std::vector<bool> &prune_words_; const std::vector<bool> &prune_words_;

View File

@ -82,7 +82,7 @@ BOOST_AUTO_TEST_CASE(Simple) {
} }
BOOST_REQUIRE_EQUAL(4UL, counts.size()); BOOST_REQUIRE_EQUAL(4UL, counts.size());
BOOST_CHECK_EQUAL(4UL, counts[0]); BOOST_CHECK_EQUAL(4UL, counts[0]);
// These are no longer set because the discounts are bad. // These are no longer set because the discounts are bad.
/* BOOST_CHECK_EQUAL(4UL, counts[1]); /* BOOST_CHECK_EQUAL(4UL, counts[1]);
BOOST_CHECK_EQUAL(3UL, counts[2]); BOOST_CHECK_EQUAL(3UL, counts[2]);
BOOST_CHECK_EQUAL(3UL, counts[3]);*/ BOOST_CHECK_EQUAL(3UL, counts[3]);*/

View File

@ -45,7 +45,7 @@ class DedupeHash : public std::unary_function<const WordIndex *, bool> {
std::size_t operator()(const WordIndex *start) const { std::size_t operator()(const WordIndex *start) const {
return util::MurmurHashNative(start, size_); return util::MurmurHashNative(start, size_);
} }
private: private:
const std::size_t size_; const std::size_t size_;
}; };
@ -53,11 +53,11 @@ class DedupeHash : public std::unary_function<const WordIndex *, bool> {
class DedupeEquals : public std::binary_function<const WordIndex *, const WordIndex *, bool> { class DedupeEquals : public std::binary_function<const WordIndex *, const WordIndex *, bool> {
public: public:
explicit DedupeEquals(std::size_t order) : size_(order * sizeof(WordIndex)) {} explicit DedupeEquals(std::size_t order) : size_(order * sizeof(WordIndex)) {}
bool operator()(const WordIndex *first, const WordIndex *second) const { bool operator()(const WordIndex *first, const WordIndex *second) const {
return !memcmp(first, second, size_); return !memcmp(first, second, size_);
} }
private: private:
const std::size_t size_; const std::size_t size_;
}; };
@ -82,7 +82,7 @@ typedef util::ProbingHashTable<DedupeEntry, DedupeHash, DedupeEquals> Dedupe;
class Writer { class Writer {
public: public:
Writer(std::size_t order, const util::stream::ChainPosition &position, void *dedupe_mem, std::size_t dedupe_mem_size) Writer(std::size_t order, const util::stream::ChainPosition &position, void *dedupe_mem, std::size_t dedupe_mem_size)
: block_(position), gram_(block_->Get(), order), : block_(position), gram_(block_->Get(), order),
dedupe_invalid_(order, std::numeric_limits<WordIndex>::max()), dedupe_invalid_(order, std::numeric_limits<WordIndex>::max()),
dedupe_(dedupe_mem, dedupe_mem_size, &dedupe_invalid_[0], DedupeHash(order), DedupeEquals(order)), dedupe_(dedupe_mem, dedupe_mem_size, &dedupe_invalid_[0], DedupeHash(order), DedupeEquals(order)),
@ -91,7 +91,7 @@ class Writer {
dedupe_.Clear(); dedupe_.Clear();
assert(Dedupe::Size(position.GetChain().BlockSize() / position.GetChain().EntrySize(), kProbingMultiplier) == dedupe_mem_size); assert(Dedupe::Size(position.GetChain().BlockSize() / position.GetChain().EntrySize(), kProbingMultiplier) == dedupe_mem_size);
if (order == 1) { if (order == 1) {
// Add special words. AdjustCounts is responsible if order != 1. // Add special words. AdjustCounts is responsible if order != 1.
AddUnigramWord(kUNK); AddUnigramWord(kUNK);
AddUnigramWord(kBOS); AddUnigramWord(kBOS);
} }
@ -121,16 +121,16 @@ class Writer {
memmove(gram_.begin(), gram_.begin() + 1, sizeof(WordIndex) * (gram_.Order() - 1)); memmove(gram_.begin(), gram_.begin() + 1, sizeof(WordIndex) * (gram_.Order() - 1));
return; return;
} }
// Complete the write. // Complete the write.
gram_.Count() = 1; gram_.Count() = 1;
// Prepare the next n-gram. // Prepare the next n-gram.
if (reinterpret_cast<uint8_t*>(gram_.begin()) + gram_.TotalSize() != static_cast<uint8_t*>(block_->Get()) + block_size_) { if (reinterpret_cast<uint8_t*>(gram_.begin()) + gram_.TotalSize() != static_cast<uint8_t*>(block_->Get()) + block_size_) {
NGram last(gram_); NGram last(gram_);
gram_.NextInMemory(); gram_.NextInMemory();
std::copy(last.begin() + 1, last.end(), gram_.begin()); std::copy(last.begin() + 1, last.end(), gram_.begin());
return; return;
} }
// Block end. Need to store the context in a temporary buffer. // Block end. Need to store the context in a temporary buffer.
std::copy(gram_.begin() + 1, gram_.end(), buffer_.get()); std::copy(gram_.begin() + 1, gram_.end(), buffer_.get());
dedupe_.Clear(); dedupe_.Clear();
block_->SetValidSize(block_size_); block_->SetValidSize(block_size_);
@ -158,7 +158,7 @@ class Writer {
// Hash table combiner implementation. // Hash table combiner implementation.
Dedupe dedupe_; Dedupe dedupe_;
// Small buffer to hold existing ngrams when shifting across a block boundary. // Small buffer to hold existing ngrams when shifting across a block boundary.
boost::scoped_array<WordIndex> buffer_; boost::scoped_array<WordIndex> buffer_;
const std::size_t block_size_; const std::size_t block_size_;
@ -224,12 +224,12 @@ void CorpusCount::Run(const util::stream::ChainPosition &position) {
} catch (const util::EndOfFileException &e) {} } catch (const util::EndOfFileException &e) {}
token_count_ = count; token_count_ = count;
type_count_ = vocab.Size(); type_count_ = vocab.Size();
// Create list of unigrams that are supposed to be pruned // Create list of unigrams that are supposed to be pruned
if (!prune_vocab_filename_.empty()) { if (!prune_vocab_filename_.empty()) {
try { try {
util::FilePiece prune_vocab_file(prune_vocab_filename_.c_str()); util::FilePiece prune_vocab_file(prune_vocab_filename_.c_str());
prune_words_.resize(vocab.Size(), true); prune_words_.resize(vocab.Size(), true);
try { try {
while (true) { while (true) {
@ -238,12 +238,12 @@ void CorpusCount::Run(const util::stream::ChainPosition &position) {
prune_words_[vocab.Index(*w)] = false; prune_words_[vocab.Index(*w)] = false;
} }
} catch (const util::EndOfFileException &e) {} } catch (const util::EndOfFileException &e) {}
// Never prune <unk>, <s>, </s> // Never prune <unk>, <s>, </s>
prune_words_[kUNK] = false; prune_words_[kUNK] = false;
prune_words_[kBOS] = false; prune_words_[kBOS] = false;
prune_words_[kEOS] = false; prune_words_[kEOS] = false;
} catch (const util::Exception &e) { } catch (const util::Exception &e) {
std::cerr << e.what() << std::endl; std::cerr << e.what() << std::endl;
abort(); abort();

View File

@ -40,7 +40,7 @@ class CorpusCount {
uint64_t &token_count_; uint64_t &token_count_;
WordIndex &type_count_; WordIndex &type_count_;
std::vector<bool>& prune_words_; std::vector<bool>& prune_words_;
const std::string& prune_vocab_filename_; const std::string& prune_vocab_filename_;
std::size_t dedupe_mem_size_; std::size_t dedupe_mem_size_;
util::scoped_malloc dedupe_mem_; util::scoped_malloc dedupe_mem_;

View File

@ -27,9 +27,9 @@ struct HashBufferEntry : public BufferEntry {
uint64_t hash_value; uint64_t hash_value;
}; };
// Reads all entries in order like NGramStream does. // Reads all entries in order like NGramStream does.
// But deletes any entries that have CutoffCount below or equal to pruning // But deletes any entries that have CutoffCount below or equal to pruning
// threshold. // threshold.
class PruneNGramStream { class PruneNGramStream {
public: public:
PruneNGramStream(const util::stream::ChainPosition &position) : PruneNGramStream(const util::stream::ChainPosition &position) :
@ -37,7 +37,7 @@ class PruneNGramStream {
dest_(NULL, NGram::OrderFromSize(position.GetChain().EntrySize())), dest_(NULL, NGram::OrderFromSize(position.GetChain().EntrySize())),
currentCount_(0), currentCount_(0),
block_(position) block_(position)
{ {
StartBlock(); StartBlock();
} }
@ -50,7 +50,7 @@ class PruneNGramStream {
PruneNGramStream &operator++() { PruneNGramStream &operator++() {
assert(block_); assert(block_);
if(current_.Order() == 1 && *current_.begin() <= 2) if(current_.Order() == 1 && *current_.begin() <= 2)
dest_.NextInMemory(); dest_.NextInMemory();
else if(currentCount_ > 0) { else if(currentCount_ > 0) {
@ -59,9 +59,9 @@ class PruneNGramStream {
} }
dest_.NextInMemory(); dest_.NextInMemory();
} }
current_.NextInMemory(); current_.NextInMemory();
uint8_t *block_base = static_cast<uint8_t*>(block_->Get()); uint8_t *block_base = static_cast<uint8_t*>(block_->Get());
if (current_.Base() == block_base + block_->ValidSize()) { if (current_.Base() == block_base + block_->ValidSize()) {
block_->SetValidSize(dest_.Base() - block_base); block_->SetValidSize(dest_.Base() - block_base);
@ -70,13 +70,13 @@ class PruneNGramStream {
if (block_) { if (block_) {
currentCount_ = current_.CutoffCount(); currentCount_ = current_.CutoffCount();
} }
} else { } else {
currentCount_ = current_.CutoffCount(); currentCount_ = current_.CutoffCount();
} }
return *this; return *this;
} }
private: private:
void StartBlock() { void StartBlock() {
for (; ; ++block_) { for (; ; ++block_) {
@ -85,13 +85,13 @@ class PruneNGramStream {
} }
current_.ReBase(block_->Get()); current_.ReBase(block_->Get());
currentCount_ = current_.CutoffCount(); currentCount_ = current_.CutoffCount();
dest_.ReBase(block_->Get()); dest_.ReBase(block_->Get());
} }
NGram current_; // input iterator NGram current_; // input iterator
NGram dest_; // output iterator NGram dest_; // output iterator
uint64_t currentCount_; uint64_t currentCount_;
util::stream::Link block_; util::stream::Link block_;
@ -155,24 +155,24 @@ class AddRight {
memcpy(previous_raw, in->begin(), size); memcpy(previous_raw, in->begin(), size);
uint64_t denominator = 0; uint64_t denominator = 0;
uint64_t normalizer = 0; uint64_t normalizer = 0;
uint64_t counts[4]; uint64_t counts[4];
memset(counts, 0, sizeof(counts)); memset(counts, 0, sizeof(counts));
do { do {
denominator += in->UnmarkedCount(); denominator += in->UnmarkedCount();
// Collect unused probability mass from pruning. // Collect unused probability mass from pruning.
// Becomes 0 for unpruned ngrams. // Becomes 0 for unpruned ngrams.
normalizer += in->UnmarkedCount() - in->CutoffCount(); normalizer += in->UnmarkedCount() - in->CutoffCount();
// Chen&Goodman do not mention counting based on cutoffs, but // Chen&Goodman do not mention counting based on cutoffs, but
// backoff becomes larger than 1 otherwise, so probably needs // backoff becomes larger than 1 otherwise, so probably needs
// to count cutoffs. Counts normally without pruning. // to count cutoffs. Counts normally without pruning.
if(in->CutoffCount() > 0) if(in->CutoffCount() > 0)
++counts[std::min(in->CutoffCount(), static_cast<uint64_t>(3))]; ++counts[std::min(in->CutoffCount(), static_cast<uint64_t>(3))];
} while (++in && !memcmp(previous_raw, in->begin(), size)); } while (++in && !memcmp(previous_raw, in->begin(), size));
BufferEntry &entry = *reinterpret_cast<BufferEntry*>(out.Get()); BufferEntry &entry = *reinterpret_cast<BufferEntry*>(out.Get());
entry.denominator = static_cast<float>(denominator); entry.denominator = static_cast<float>(denominator);
entry.gamma = 0.0; entry.gamma = 0.0;
@ -182,9 +182,9 @@ class AddRight {
// Makes model sum to 1 with pruning (I hope). // Makes model sum to 1 with pruning (I hope).
entry.gamma += normalizer; entry.gamma += normalizer;
entry.gamma /= entry.denominator; entry.gamma /= entry.denominator;
if(pruning_) { if(pruning_) {
// If pruning is enabled the stream actually contains HashBufferEntry, see InitialProbabilities(...), // If pruning is enabled the stream actually contains HashBufferEntry, see InitialProbabilities(...),
// so add a hash value that identifies the current ngram. // so add a hash value that identifies the current ngram.
@ -244,13 +244,13 @@ class MergeRight {
++summed; ++summed;
return; return;
} }
std::vector<WordIndex> previous(grams->Order() - 1); std::vector<WordIndex> previous(grams->Order() - 1);
const std::size_t size = sizeof(WordIndex) * previous.size(); const std::size_t size = sizeof(WordIndex) * previous.size();
for (; grams; ++summed) { for (; grams; ++summed) {
memcpy(&previous[0], grams->begin(), size); memcpy(&previous[0], grams->begin(), size);
const BufferEntry &sums = *static_cast<const BufferEntry*>(summed.Get()); const BufferEntry &sums = *static_cast<const BufferEntry*>(summed.Get());
do { do {
Payload &pay = grams->Value(); Payload &pay = grams->Value();
pay.uninterp.prob = discount_.Apply(grams->UnmarkedCount()) / sums.denominator; pay.uninterp.prob = discount_.Apply(grams->UnmarkedCount()) / sums.denominator;
@ -288,7 +288,7 @@ void InitialProbabilities(
gamma_out[i] >> AddRight(discounts[i], second, prune_vocab || prune_thresholds[i] > 0); gamma_out[i] >> AddRight(discounts[i], second, prune_vocab || prune_thresholds[i] > 0);
primary[i] >> MergeRight(config.interpolate_unigrams, gamma_out[i].Add(), discounts[i]); primary[i] >> MergeRight(config.interpolate_unigrams, gamma_out[i].Add(), discounts[i]);
// Don't bother with the OnlyGamma thread for something to discard. // Don't bother with the OnlyGamma thread for something to discard.
if (i) gamma_out[i] >> OnlyGamma(prune_vocab || prune_thresholds[i] > 0); if (i) gamma_out[i] >> OnlyGamma(prune_vocab || prune_thresholds[i] > 0);
} }

View File

@ -15,17 +15,17 @@ struct InitialProbabilitiesConfig {
// These should be small buffers to keep the adder from getting too far ahead // These should be small buffers to keep the adder from getting too far ahead
util::stream::ChainConfig adder_in; util::stream::ChainConfig adder_in;
util::stream::ChainConfig adder_out; util::stream::ChainConfig adder_out;
// SRILM doesn't normally interpolate unigrams. // SRILM doesn't normally interpolate unigrams.
bool interpolate_unigrams; bool interpolate_unigrams;
}; };
/* Compute initial (uninterpolated) probabilities /* Compute initial (uninterpolated) probabilities
* primary: the normal chain of n-grams. Incoming is context sorted adjusted * primary: the normal chain of n-grams. Incoming is context sorted adjusted
* counts. Outgoing has uninterpolated probabilities for use by Interpolate. * counts. Outgoing has uninterpolated probabilities for use by Interpolate.
* second_in: a second copy of the primary input. Discard the output. * second_in: a second copy of the primary input. Discard the output.
* gamma_out: Computed gamma values are output on these chains in suffix order. * gamma_out: Computed gamma values are output on these chains in suffix order.
* The values are bare floats and should be buffered for interpolation to * The values are bare floats and should be buffered for interpolation to
* use. * use.
*/ */
void InitialProbabilities( void InitialProbabilities(
const InitialProbabilitiesConfig &config, const InitialProbabilitiesConfig &config,

View File

@ -47,7 +47,7 @@ class OutputQ {
private: private:
// Product of backoffs in the numerator divided by backoffs in the // Product of backoffs in the numerator divided by backoffs in the
// denominator. Does not include // denominator. Does not include
std::vector<float> q_delta_; std::vector<float> q_delta_;
}; };
@ -81,7 +81,7 @@ template <class Output> class Callback {
if(prune_vocab_ || prune_thresholds_[i + 1] > 0) if(prune_vocab_ || prune_thresholds_[i + 1] > 0)
while(backoffs_[i]) while(backoffs_[i])
++backoffs_[i]; ++backoffs_[i];
if (backoffs_[i]) { if (backoffs_[i]) {
std::cerr << "Backoffs do not match for order " << (i + 1) << std::endl; std::cerr << "Backoffs do not match for order " << (i + 1) << std::endl;
abort(); abort();
@ -99,7 +99,7 @@ template <class Output> class Callback {
if(prune_vocab_ || prune_thresholds_[order_minus_1 + 1] > 0) { if(prune_vocab_ || prune_thresholds_[order_minus_1 + 1] > 0) {
//Compute hash value for current context //Compute hash value for current context
uint64_t current_hash = util::MurmurHashNative(gram.begin(), gram.Order() * sizeof(WordIndex)); uint64_t current_hash = util::MurmurHashNative(gram.begin(), gram.Order() * sizeof(WordIndex));
const HashGamma *hashed_backoff = static_cast<const HashGamma*>(backoffs_[order_minus_1].Get()); const HashGamma *hashed_backoff = static_cast<const HashGamma*>(backoffs_[order_minus_1].Get());
while(current_hash != hashed_backoff->hash_value && ++backoffs_[order_minus_1]) while(current_hash != hashed_backoff->hash_value && ++backoffs_[order_minus_1])
hashed_backoff = static_cast<const HashGamma*>(backoffs_[order_minus_1].Get()); hashed_backoff = static_cast<const HashGamma*>(backoffs_[order_minus_1].Get());

View File

@ -8,8 +8,8 @@
#include <stdint.h> #include <stdint.h>
namespace lm { namespace builder { namespace lm { namespace builder {
/* Interpolate step. /* Interpolate step.
* Input: suffix sorted n-grams with (p_uninterpolated, gamma) from * Input: suffix sorted n-grams with (p_uninterpolated, gamma) from
* InitialProbabilities. * InitialProbabilities.
* Output: suffix sorted n-grams with complete probability * Output: suffix sorted n-grams with complete probability

View File

@ -35,7 +35,7 @@ template <class Callback, class Compare> void JointOrder(const util::stream::Cha
// Does the context match the lower one? // Does the context match the lower one?
if (!memcmp(streams[static_cast<int>(current) - 1]->begin(), streams[current]->begin() + Compare::kMatchOffset, sizeof(WordIndex) * current)) { if (!memcmp(streams[static_cast<int>(current) - 1]->begin(), streams[current]->begin() + Compare::kMatchOffset, sizeof(WordIndex) * current)) {
callback.Enter(current, *streams[current]); callback.Enter(current, *streams[current]);
// Transition to looking for extensions. // Transition to looking for extensions.
if (++current < order) continue; if (++current < order) continue;
} }
#ifdef DEBUG #ifdef DEBUG
@ -46,16 +46,16 @@ template <class Callback, class Compare> void JointOrder(const util::stream::Cha
abort(); abort();
} }
#endif // DEBUG #endif // DEBUG
// No extension left. // No extension left.
while(true) { while(true) {
assert(current > 0); assert(current > 0);
--current; --current;
callback.Exit(current, *streams[current]); callback.Exit(current, *streams[current]);
if (++streams[current]) break; if (++streams[current]) break;
UTIL_THROW_IF(order != current + 1, FormatLoadException, "Detected n-gram without matching suffix"); UTIL_THROW_IF(order != current + 1, FormatLoadException, "Detected n-gram without matching suffix");
order = current; order = current;
if (!order) return; if (!order) return;
} }

View File

@ -53,7 +53,7 @@ std::vector<uint64_t> ParsePruning(const std::vector<std::string> &param, std::s
// throw if each n-gram order has not threshold specified // throw if each n-gram order has not threshold specified
UTIL_THROW_IF(prune_thresholds.size() > order, util::Exception, "You specified pruning thresholds for orders 1 through " << prune_thresholds.size() << " but the model only has order " << order); UTIL_THROW_IF(prune_thresholds.size() > order, util::Exception, "You specified pruning thresholds for orders 1 through " << prune_thresholds.size() << " but the model only has order " << order);
// threshold for unigram can only be 0 (no pruning) // threshold for unigram can only be 0 (no pruning)
// check if threshold are not in decreasing order // check if threshold are not in decreasing order
uint64_t lower_threshold = 0; uint64_t lower_threshold = 0;
for (std::vector<uint64_t>::iterator it = prune_thresholds.begin(); it != prune_thresholds.end(); ++it) { for (std::vector<uint64_t>::iterator it = prune_thresholds.begin(); it != prune_thresholds.end(); ++it) {
@ -124,7 +124,7 @@ int main(int argc, char *argv[]) {
po::store(po::parse_command_line(argc, argv, options), vm); po::store(po::parse_command_line(argc, argv, options), vm);
if (argc == 1 || vm["help"].as<bool>()) { if (argc == 1 || vm["help"].as<bool>()) {
std::cerr << std::cerr <<
"Builds unpruned language models with modified Kneser-Ney smoothing.\n\n" "Builds unpruned language models with modified Kneser-Ney smoothing.\n\n"
"Please cite:\n" "Please cite:\n"
"@inproceedings{Heafield-estimate,\n" "@inproceedings{Heafield-estimate,\n"
@ -147,7 +147,7 @@ int main(int argc, char *argv[]) {
std::cerr << "This machine has " << mem << " bytes of memory.\n\n"; std::cerr << "This machine has " << mem << " bytes of memory.\n\n";
} else { } else {
std::cerr << "Unable to determine the amount of memory on this machine.\n\n"; std::cerr << "Unable to determine the amount of memory on this machine.\n\n";
} }
std::cerr << options << std::endl; std::cerr << options << std::endl;
return 1; return 1;
} }
@ -191,11 +191,11 @@ int main(int argc, char *argv[]) {
else { else {
pipeline.prune_vocab = false; pipeline.prune_vocab = false;
} }
util::NormalizeTempPrefix(pipeline.sort.temp_prefix); util::NormalizeTempPrefix(pipeline.sort.temp_prefix);
lm::builder::InitialProbabilitiesConfig &initial = pipeline.initial_probs; lm::builder::InitialProbabilitiesConfig &initial = pipeline.initial_probs;
// TODO: evaluate options for these. // TODO: evaluate options for these.
initial.adder_in.total_memory = 32768; initial.adder_in.total_memory = 32768;
initial.adder_in.block_count = 2; initial.adder_in.block_count = 2;
initial.adder_out.total_memory = 32768; initial.adder_out.total_memory = 32768;

View File

@ -68,26 +68,26 @@ class NGram {
assert(size == TotalSize(ret)); assert(size == TotalSize(ret));
return ret; return ret;
} }
// manipulate msb to signal that ngram can be pruned // manipulate msb to signal that ngram can be pruned
/*mjd**********************************************************************/ /*mjd**********************************************************************/
bool IsMarked() const { bool IsMarked() const {
return Value().count >> (sizeof(Value().count) * 8 - 1); return Value().count >> (sizeof(Value().count) * 8 - 1);
} }
void Mark() { void Mark() {
Value().count |= (1ul << (sizeof(Value().count) * 8 - 1)); Value().count |= (1ul << (sizeof(Value().count) * 8 - 1));
} }
void Unmark() { void Unmark() {
Value().count &= ~(1ul << (sizeof(Value().count) * 8 - 1)); Value().count &= ~(1ul << (sizeof(Value().count) * 8 - 1));
} }
uint64_t UnmarkedCount() const { uint64_t UnmarkedCount() const {
return Value().count & ~(1ul << (sizeof(Value().count) * 8 - 1)); return Value().count & ~(1ul << (sizeof(Value().count) * 8 - 1));
} }
uint64_t CutoffCount() const { uint64_t CutoffCount() const {
return IsMarked() ? 0 : UnmarkedCount(); return IsMarked() ? 0 : UnmarkedCount();
} }

View File

@ -37,7 +37,7 @@ void PrintStatistics(const std::vector<uint64_t> &counts, const std::vector<uint
class Master { class Master {
public: public:
explicit Master(PipelineConfig &config) explicit Master(PipelineConfig &config)
: config_(config), chains_(config.order), files_(config.order) { : config_(config), chains_(config.order), files_(config.order) {
config_.minimum_block = std::max(NGram::TotalSize(config_.order), config_.minimum_block); config_.minimum_block = std::max(NGram::TotalSize(config_.order), config_.minimum_block);
} }
@ -64,7 +64,7 @@ class Master {
CreateChains(config_.TotalMemory() - merge_using, count_bounds); CreateChains(config_.TotalMemory() - merge_using, count_bounds);
ngrams.Output(chains_.back(), merge_using); ngrams.Output(chains_.back(), merge_using);
// Setup unigram file. // Setup unigram file.
files_.push_back(util::MakeTemp(config_.TempPrefix())); files_.push_back(util::MakeTemp(config_.TempPrefix()));
} }
@ -204,7 +204,7 @@ class Master {
PipelineConfig &config_; PipelineConfig &config_;
util::stream::Chains chains_; util::stream::Chains chains_;
// Often only unigrams, but sometimes all orders. // Often only unigrams, but sometimes all orders.
util::FixedArray<util::stream::FileBuffer> files_; util::FixedArray<util::stream::FileBuffer> files_;
}; };
@ -214,7 +214,7 @@ void CountText(int text_file /* input */, int vocab_file /* output */, Master &m
const std::size_t vocab_usage = CorpusCount::VocabUsage(config.vocab_estimate); const std::size_t vocab_usage = CorpusCount::VocabUsage(config.vocab_estimate);
UTIL_THROW_IF(config.TotalMemory() < vocab_usage, util::Exception, "Vocab hash size estimate " << vocab_usage << " exceeds total memory " << config.TotalMemory()); UTIL_THROW_IF(config.TotalMemory() < vocab_usage, util::Exception, "Vocab hash size estimate " << vocab_usage << " exceeds total memory " << config.TotalMemory());
std::size_t memory_for_chain = std::size_t memory_for_chain =
// This much memory to work with after vocab hash table. // This much memory to work with after vocab hash table.
static_cast<float>(config.TotalMemory() - vocab_usage) / static_cast<float>(config.TotalMemory() - vocab_usage) /
// Solve for block size including the dedupe multiplier for one block. // Solve for block size including the dedupe multiplier for one block.
@ -252,7 +252,7 @@ void InitialProbabilities(const std::vector<uint64_t> &counts, const std::vector
util::stream::Chains gamma_chains(config.order); util::stream::Chains gamma_chains(config.order);
InitialProbabilities(config.initial_probs, discounts, master.MutableChains(), second, gamma_chains, prune_thresholds, prune_vocab); InitialProbabilities(config.initial_probs, discounts, master.MutableChains(), second, gamma_chains, prune_thresholds, prune_vocab);
// Don't care about gamma for 0. // Don't care about gamma for 0.
gamma_chains[0] >> util::stream::kRecycle; gamma_chains[0] >> util::stream::kRecycle;
gammas.Init(config.order - 1); gammas.Init(config.order - 1);
for (std::size_t i = 1; i < config.order; ++i) { for (std::size_t i = 1; i < config.order; ++i) {
@ -307,16 +307,16 @@ void Pipeline(PipelineConfig &config, int text_file, Output &output) {
// master's destructor will wait for chains. But they might be deadlocked if // master's destructor will wait for chains. But they might be deadlocked if
// this thread dies because e.g. it ran out of memory. // this thread dies because e.g. it ran out of memory.
try { try {
util::scoped_fd vocab_file(config.vocab_file.empty() ? util::scoped_fd vocab_file(config.vocab_file.empty() ?
util::MakeTemp(config.TempPrefix()) : util::MakeTemp(config.TempPrefix()) :
util::CreateOrThrow(config.vocab_file.c_str())); util::CreateOrThrow(config.vocab_file.c_str()));
output.SetVocabFD(vocab_file.get()); output.SetVocabFD(vocab_file.get());
uint64_t token_count; uint64_t token_count;
std::string text_file_name; std::string text_file_name;
std::vector<bool> prune_words; std::vector<bool> prune_words;
CountText(text_file, vocab_file.get(), master, token_count, text_file_name, prune_words); CountText(text_file, vocab_file.get(), master, token_count, text_file_name, prune_words);
std::vector<uint64_t> counts; std::vector<uint64_t> counts;
std::vector<uint64_t> counts_pruned; std::vector<uint64_t> counts_pruned;
std::vector<Discount> discounts; std::vector<Discount> discounts;

View File

@ -44,7 +44,7 @@ struct PipelineConfig {
// Compute collapsed q values instead of probability and backoff // Compute collapsed q values instead of probability and backoff
bool output_q; bool output_q;
/* Computing the perplexity of LMs with different vocabularies is hard. For /* Computing the perplexity of LMs with different vocabularies is hard. For
* example, the lowest perplexity is attained by a unigram model that * example, the lowest perplexity is attained by a unigram model that
* predicts p(<unk>) = 1 and has no other vocabulary. Also, linearly * predicts p(<unk>) = 1 and has no other vocabulary. Also, linearly

View File

@ -55,7 +55,7 @@ void PrintARPA::Run(const util::stream::ChainPositions &positions) {
if (order != positions.size()) if (order != positions.size())
out << '\t' << stream->Value().complete.backoff; out << '\t' << stream->Value().complete.backoff;
out << '\n'; out << '\n';
} }
out << '\n'; out << '\n';
} }

View File

@ -14,7 +14,7 @@
// Warning: print routines read all unigrams before all bigrams before all // Warning: print routines read all unigrams before all bigrams before all
// trigrams etc. So if other parts of the chain move jointly, you'll have to // trigrams etc. So if other parts of the chain move jointly, you'll have to
// buffer. // buffer.
namespace lm { namespace builder { namespace lm { namespace builder {
@ -42,7 +42,7 @@ class VocabReconstitute {
std::vector<const char*> map_; std::vector<const char*> map_;
}; };
// Not defined, only specialized. // Not defined, only specialized.
template <class T> void PrintPayload(util::FakeOFStream &to, const Payload &payload); template <class T> void PrintPayload(util::FakeOFStream &to, const Payload &payload);
template <> inline void PrintPayload<uint64_t>(util::FakeOFStream &to, const Payload &payload) { template <> inline void PrintPayload<uint64_t>(util::FakeOFStream &to, const Payload &payload) {
// TODO slow // TODO slow
@ -55,7 +55,7 @@ template <> inline void PrintPayload<ProbBackoff>(util::FakeOFStream &to, const
to << payload.complete.prob << ' ' << payload.complete.backoff; to << payload.complete.prob << ' ' << payload.complete.backoff;
} }
// template parameter is the type stored. // template parameter is the type stored.
template <class V> class Print { template <class V> class Print {
public: public:
static void DumpSeparateFiles(const VocabReconstitute &vocab, const std::string &file_base, util::stream::Chains &chains) { static void DumpSeparateFiles(const VocabReconstitute &vocab, const std::string &file_base, util::stream::Chains &chains) {

View File

@ -19,7 +19,7 @@ namespace builder {
*/ */
template <class Child> class Comparator : public std::binary_function<const void *, const void *, bool> { template <class Child> class Comparator : public std::binary_function<const void *, const void *, bool> {
public: public:
/** /**
* Constructs a comparator capable of comparing two n-grams. * Constructs a comparator capable of comparing two n-grams.
* *
@ -51,8 +51,8 @@ template <class Child> class Comparator : public std::binary_function<const void
/** /**
* N-gram comparator that compares n-grams according to their reverse (suffix) order. * N-gram comparator that compares n-grams according to their reverse (suffix) order.
* *
* This comparator compares n-grams lexicographically, one word at a time, * This comparator compares n-grams lexicographically, one word at a time,
* beginning with the last word of each n-gram and ending with the first word of each n-gram. * beginning with the last word of each n-gram and ending with the first word of each n-gram.
* *
* Some examples of n-gram comparisons as defined by this comparator: * Some examples of n-gram comparisons as defined by this comparator:
* - a b c == a b c * - a b c == a b c
@ -64,8 +64,8 @@ template <class Child> class Comparator : public std::binary_function<const void
*/ */
class SuffixOrder : public Comparator<SuffixOrder> { class SuffixOrder : public Comparator<SuffixOrder> {
public: public:
/** /**
* Constructs a comparator capable of comparing two n-grams. * Constructs a comparator capable of comparing two n-grams.
* *
* @param order Number of words in each n-gram * @param order Number of words in each n-gram
@ -73,7 +73,7 @@ class SuffixOrder : public Comparator<SuffixOrder> {
explicit SuffixOrder(std::size_t order) : Comparator<SuffixOrder>(order) {} explicit SuffixOrder(std::size_t order) : Comparator<SuffixOrder>(order) {}
/** /**
* Compares two n-grams lexicographically, one word at a time, * Compares two n-grams lexicographically, one word at a time,
* beginning with the last word of each n-gram and ending with the first word of each n-gram. * beginning with the last word of each n-gram and ending with the first word of each n-gram.
* *
* @param lhs A pointer to the n-gram on the left-hand side of the comparison * @param lhs A pointer to the n-gram on the left-hand side of the comparison
@ -90,11 +90,11 @@ class SuffixOrder : public Comparator<SuffixOrder> {
static const unsigned kMatchOffset = 1; static const unsigned kMatchOffset = 1;
}; };
/** /**
* N-gram comparator that compares n-grams according to the reverse (suffix) order of the n-gram context. * N-gram comparator that compares n-grams according to the reverse (suffix) order of the n-gram context.
* *
* This comparator compares n-grams lexicographically, one word at a time, * This comparator compares n-grams lexicographically, one word at a time,
* beginning with the penultimate word of each n-gram and ending with the first word of each n-gram; * beginning with the penultimate word of each n-gram and ending with the first word of each n-gram;
* finally, this comparator compares the last word of each n-gram. * finally, this comparator compares the last word of each n-gram.
* *
@ -108,8 +108,8 @@ class SuffixOrder : public Comparator<SuffixOrder> {
*/ */
class ContextOrder : public Comparator<ContextOrder> { class ContextOrder : public Comparator<ContextOrder> {
public: public:
/** /**
* Constructs a comparator capable of comparing two n-grams. * Constructs a comparator capable of comparing two n-grams.
* *
* @param order Number of words in each n-gram * @param order Number of words in each n-gram
@ -117,7 +117,7 @@ class ContextOrder : public Comparator<ContextOrder> {
explicit ContextOrder(std::size_t order) : Comparator<ContextOrder>(order) {} explicit ContextOrder(std::size_t order) : Comparator<ContextOrder>(order) {}
/** /**
* Compares two n-grams lexicographically, one word at a time, * Compares two n-grams lexicographically, one word at a time,
* beginning with the penultimate word of each n-gram and ending with the first word of each n-gram; * beginning with the penultimate word of each n-gram and ending with the first word of each n-gram;
* finally, this comparator compares the last word of each n-gram. * finally, this comparator compares the last word of each n-gram.
* *
@ -136,7 +136,7 @@ class ContextOrder : public Comparator<ContextOrder> {
/** /**
* N-gram comparator that compares n-grams according to their natural (prefix) order. * N-gram comparator that compares n-grams according to their natural (prefix) order.
* *
* This comparator compares n-grams lexicographically, one word at a time, * This comparator compares n-grams lexicographically, one word at a time,
* beginning with the first word of each n-gram and ending with the last word of each n-gram. * beginning with the first word of each n-gram and ending with the last word of each n-gram.
* *
* Some examples of n-gram comparisons as defined by this comparator: * Some examples of n-gram comparisons as defined by this comparator:
@ -149,8 +149,8 @@ class ContextOrder : public Comparator<ContextOrder> {
*/ */
class PrefixOrder : public Comparator<PrefixOrder> { class PrefixOrder : public Comparator<PrefixOrder> {
public: public:
/** /**
* Constructs a comparator capable of comparing two n-grams. * Constructs a comparator capable of comparing two n-grams.
* *
* @param order Number of words in each n-gram * @param order Number of words in each n-gram
@ -158,7 +158,7 @@ class PrefixOrder : public Comparator<PrefixOrder> {
explicit PrefixOrder(std::size_t order) : Comparator<PrefixOrder>(order) {} explicit PrefixOrder(std::size_t order) : Comparator<PrefixOrder>(order) {}
/** /**
* Compares two n-grams lexicographically, one word at a time, * Compares two n-grams lexicographically, one word at a time,
* beginning with the first word of each n-gram and ending with the last word of each n-gram. * beginning with the first word of each n-gram and ending with the last word of each n-gram.
* *
* @param lhs A pointer to the n-gram on the left-hand side of the comparison * @param lhs A pointer to the n-gram on the left-hand side of the comparison
@ -171,7 +171,7 @@ class PrefixOrder : public Comparator<PrefixOrder> {
} }
return false; return false;
} }
static const unsigned kMatchOffset = 0; static const unsigned kMatchOffset = 0;
}; };
@ -179,7 +179,7 @@ class PrefixOrder : public Comparator<PrefixOrder> {
struct AddCombiner { struct AddCombiner {
bool operator()(void *first_void, const void *second_void, const SuffixOrder &compare) const { bool operator()(void *first_void, const void *second_void, const SuffixOrder &compare) const {
NGram first(first_void, compare.Order()); NGram first(first_void, compare.Order());
// There isn't a const version of NGram. // There isn't a const version of NGram.
NGram second(const_cast<void*>(second_void), compare.Order()); NGram second(const_cast<void*>(second_void), compare.Order());
if (memcmp(first.begin(), second.begin(), sizeof(WordIndex) * compare.Order())) return false; if (memcmp(first.begin(), second.begin(), sizeof(WordIndex) * compare.Order())) return false;
first.Count() += second.Count(); first.Count() += second.Count();
@ -204,10 +204,10 @@ template <class Compare> class Sorts : public util::FixedArray<util::stream::Sor
typedef util::FixedArray<S> P; typedef util::FixedArray<S> P;
public: public:
/** /**
* Constructs, but does not initialize. * Constructs, but does not initialize.
* *
* @ref util::FixedArray::Init() "Init" must be called before use. * @ref util::FixedArray::Init() "Init" must be called before use.
* *
* @see util::FixedArray::Init() * @see util::FixedArray::Init()
@ -222,7 +222,7 @@ template <class Compare> class Sorts : public util::FixedArray<util::stream::Sor
*/ */
explicit Sorts(std::size_t number) : util::FixedArray<util::stream::Sort<Compare> >(number) {} explicit Sorts(std::size_t number) : util::FixedArray<util::stream::Sort<Compare> >(number) {}
/** /**
* Constructs a new @ref util::stream::Sort "Sort" object which is stored in this @ref util::FixedArray "array". * Constructs a new @ref util::stream::Sort "Sort" object which is stored in this @ref util::FixedArray "array".
* *
* The new @ref util::stream::Sort "Sort" object is constructed using the provided @ref util::stream::SortConfig "SortConfig" and @ref Comparator "ngram comparator"; * The new @ref util::stream::Sort "Sort" object is constructed using the provided @ref util::stream::SortConfig "SortConfig" and @ref Comparator "ngram comparator";

View File

@ -10,7 +10,7 @@ namespace lm {
* and implement Add. Then put a pointer in Config.enumerate_vocab; it does * and implement Add. Then put a pointer in Config.enumerate_vocab; it does
* not take ownership. Add is called once per vocab word. index starts at 0 * not take ownership. Add is called once per vocab word. index starts at 0
* and increases by 1 each time. This is only used by the Model constructor; * and increases by 1 each time. This is only used by the Model constructor;
* the pointer is not retained by the class. * the pointer is not retained by the class.
*/ */
class EnumerateVocab { class EnumerateVocab {
public: public:

View File

@ -9,8 +9,8 @@
namespace lm { namespace lm {
namespace base { namespace base {
// Common model interface that depends on knowing the specific classes. // Common model interface that depends on knowing the specific classes.
// Curiously recurring template pattern. // Curiously recurring template pattern.
template <class Child, class StateT, class VocabularyT> class ModelFacade : public Model { template <class Child, class StateT, class VocabularyT> class ModelFacade : public Model {
public: public:
typedef StateT State; typedef StateT State;
@ -32,7 +32,7 @@ template <class Child, class StateT, class VocabularyT> class ModelFacade : publ
*reinterpret_cast<State*>(out_state)); *reinterpret_cast<State*>(out_state));
} }
// Default Score function calls FullScore. Model can override this. // Default Score function calls FullScore. Model can override this.
float Score(const State &in_state, const WordIndex new_word, State &out_state) const { float Score(const State &in_state, const WordIndex new_word, State &out_state) const {
return static_cast<const Child*>(this)->FullScore(in_state, new_word, out_state).prob; return static_cast<const Child*>(this)->FullScore(in_state, new_word, out_state).prob;
} }
@ -53,7 +53,7 @@ template <class Child, class StateT, class VocabularyT> class ModelFacade : publ
virtual ~ModelFacade() {} virtual ~ModelFacade() {}
// begin_sentence and null_context can disappear after. vocab should stay. // begin_sentence and null_context can disappear after. vocab should stay.
void Init(const State &begin_sentence, const State &null_context, const Vocabulary &vocab, unsigned char order) { void Init(const State &begin_sentence, const State &null_context, const Vocabulary &vocab, unsigned char order) {
begin_sentence_ = begin_sentence; begin_sentence_ = begin_sentence;
null_context_ = null_context; null_context_ = null_context;

View File

@ -33,7 +33,7 @@ class CountOutput : boost::noncopyable {
class CountBatch { class CountBatch {
public: public:
explicit CountBatch(std::streamsize initial_read) explicit CountBatch(std::streamsize initial_read)
: initial_read_(initial_read) { : initial_read_(initial_read) {
buffer_.reserve(initial_read); buffer_.reserve(initial_read);
} }
@ -66,7 +66,7 @@ class CountBatch {
private: private:
std::streamsize initial_read_; std::streamsize initial_read_;
// This could have been a std::string but that's less happy with raw writes. // This could have been a std::string but that's less happy with raw writes.
std::vector<char> buffer_; std::vector<char> buffer_;
}; };

View File

@ -58,7 +58,7 @@ typedef enum {MODE_COPY, MODE_SINGLE, MODE_MULTIPLE, MODE_UNION, MODE_UNSET} Fil
typedef enum {FORMAT_ARPA, FORMAT_COUNT} Format; typedef enum {FORMAT_ARPA, FORMAT_COUNT} Format;
struct Config { struct Config {
Config() : Config() :
#ifndef NTHREAD #ifndef NTHREAD
batch_size(25000), batch_size(25000),
threads(boost::thread::hardware_concurrency()), threads(boost::thread::hardware_concurrency()),

View File

@ -134,12 +134,12 @@ struct CountFormat {
/* For multithreading, the buffer classes hold batches of filter inputs and /* For multithreading, the buffer classes hold batches of filter inputs and
* outputs in memory. The strings get reused a lot, so keep them around * outputs in memory. The strings get reused a lot, so keep them around
* instead of clearing each time. * instead of clearing each time.
*/ */
class InputBuffer { class InputBuffer {
public: public:
InputBuffer() : actual_(0) {} InputBuffer() : actual_(0) {}
void Reserve(size_t size) { lines_.reserve(size); } void Reserve(size_t size) { lines_.reserve(size); }
template <class Output> void AddNGram(const StringPiece &ngram, const StringPiece &line, Output &output) { template <class Output> void AddNGram(const StringPiece &ngram, const StringPiece &line, Output &output) {
@ -179,18 +179,18 @@ class BinaryOutputBuffer {
// Reserves capacity for `size` buffered lines in lines_.
void Reserve(size_t size) { void Reserve(size_t size) {
lines_.reserve(size); lines_.reserve(size);
} }
// Appends one line to the buffer; it is emitted by the next Flush.
// NOTE(review): only the StringPiece is stored, presumably a non-owning view,
// so the referenced text must stay alive until Flush — confirm.
void AddNGram(const StringPiece &line) { void AddNGram(const StringPiece &line) {
lines_.push_back(line); lines_.push_back(line);
} }
// Passes every buffered line to output.AddNGram in the order it was added,
// then empties the buffer so it can be refilled.
template <class Output> void Flush(Output &output) { template <class Output> void Flush(Output &output) {
for (std::vector<StringPiece>::const_iterator i = lines_.begin(); i != lines_.end(); ++i) { for (std::vector<StringPiece>::const_iterator i = lines_.begin(); i != lines_.end(); ++i) {
output.AddNGram(*i); output.AddNGram(*i);
} }
lines_.clear(); lines_.clear();
} }
private: private:
std::vector<StringPiece> lines_; std::vector<StringPiece> lines_;
}; };
@ -234,7 +234,7 @@ class MultipleOutputBuffer {
private: private:
struct Annotated { struct Annotated {
// If this is empty, send to all systems. // If this is empty, send to all systems.
// A filter should never send to all systems and send to a single one. // A filter should never send to all systems and send to a single one.
std::vector<size_t> systems; std::vector<size_t> systems;
StringPiece line; StringPiece line;

View File

@ -31,14 +31,14 @@ unsigned int ReadMultiple(std::istream &in, Substrings &out) {
word.clear(); word.clear();
} }
if (c == ' ') continue; if (c == ' ') continue;
// It's more than just a space. Close out the phrase. // It's more than just a space. Close out the phrase.
if (!phrase.empty()) { if (!phrase.empty()) {
sentence_content = true; sentence_content = true;
out.AddPhrase(sentence_id, phrase.begin(), phrase.end()); out.AddPhrase(sentence_id, phrase.begin(), phrase.end());
phrase.clear(); phrase.clear();
} }
if (c == '\t' || c == '\v') continue; if (c == '\t' || c == '\v') continue;
// It's more than a space or tab: a newline. // It's more than a space or tab: a newline.
if (sentence_content) { if (sentence_content) {
++sentence_id; ++sentence_id;
sentence_content = false; sentence_content = false;
@ -53,7 +53,7 @@ typedef unsigned int Sentence;
typedef std::vector<Sentence> Sentences; typedef std::vector<Sentence> Sentences;
} // namespace } // namespace
namespace detail { namespace detail {
const StringPiece kEndSentence("</s>"); const StringPiece kEndSentence("</s>");
@ -61,7 +61,7 @@ class Arc {
public: public:
Arc() {} Arc() {}
// For arcs from one vertex to another. // For arcs from one vertex to another.
void SetPhrase(detail::Vertex &from, detail::Vertex &to, const Sentences &intersect) { void SetPhrase(detail::Vertex &from, detail::Vertex &to, const Sentences &intersect) {
Set(to, intersect); Set(to, intersect);
from_ = &from; from_ = &from;
@ -69,7 +69,7 @@ class Arc {
/* For arcs from before the n-gram begins to somewhere in the n-gram (right /* For arcs from before the n-gram begins to somewhere in the n-gram (right
* aligned). These have no from_ vertex; it implicitly matches every * aligned). These have no from_ vertex; it implicitly matches every
* sentence. This also handles when the n-gram is a substring of a phrase. * sentence. This also handles when the n-gram is a substring of a phrase.
*/ */
void SetRight(detail::Vertex &to, const Sentences &complete) { void SetRight(detail::Vertex &to, const Sentences &complete) {
Set(to, complete); Set(to, complete);
@ -87,12 +87,12 @@ class Arc {
/* When this function returns: /* When this function returns:
* If Empty() then there's nothing left from this intersection. * If Empty() then there's nothing left from this intersection.
* *
* If Current() == to then to is part of the intersection. * If Current() == to then to is part of the intersection.
* *
* Otherwise, Current() > to. In this case, to is not part of the * Otherwise, Current() > to. In this case, to is not part of the
* intersection and neither is anything < Current(). To determine if * intersection and neither is anything < Current(). To determine if
* any value >= Current() is in the intersection, call LowerBound again * any value >= Current() is in the intersection, call LowerBound again
* with the value. * with the value.
*/ */
void LowerBound(const Sentence to); void LowerBound(const Sentence to);
@ -160,15 +160,15 @@ void Arc::Set(Vertex &to, const Sentences &sentences) {
void Vertex::LowerBound(const Sentence to) { void Vertex::LowerBound(const Sentence to) {
if (Empty()) return; if (Empty()) return;
// Union lower bound. // Union lower bound.
while (true) { while (true) {
Arc *top = incoming_.top(); Arc *top = incoming_.top();
if (top->Current() > to) { if (top->Current() > to) {
current_ = top->Current(); current_ = top->Current();
return; return;
} }
// If top->Current() == to, we still need to verify that's an actual // If top->Current() == to, we still need to verify that's an actual
// element and not just a bound. // element and not just a bound.
incoming_.pop(); incoming_.pop();
top->LowerBound(to); top->LowerBound(to);
if (!top->Empty()) { if (!top->Empty()) {
@ -213,13 +213,13 @@ void BuildGraph(const Substrings &phrase, const std::vector<Hash> &hashes, detai
} }
} }
// Phrases starting at the second or later word in the n-gram. // Phrases starting at the second or later word in the n-gram.
Vertex *vertex_from = vertices; Vertex *vertex_from = vertices;
for (const Hash *word_from = first_word + 1; word_from != &*hashes.end(); ++word_from, ++vertex_from) { for (const Hash *word_from = first_word + 1; word_from != &*hashes.end(); ++word_from, ++vertex_from) {
hash = 0; hash = 0;
Vertex *vertex_to = vertex_from + 1; Vertex *vertex_to = vertex_from + 1;
for (const Hash *word_to = word_from; ; ++word_to, ++vertex_to) { for (const Hash *word_to = word_from; ; ++word_to, ++vertex_to) {
// Notice that word_to and vertex_to have the same index. // Notice that word_to and vertex_to have the same index.
hash = util::MurmurHashNative(&hash, sizeof(uint64_t), *word_to); hash = util::MurmurHashNative(&hash, sizeof(uint64_t), *word_to);
// Now hash covers [word_from, word_to]. // Now hash covers [word_from, word_to].
if (word_to == last_word) { if (word_to == last_word) {
@ -250,7 +250,7 @@ detail::Vertex &ConditionCommon::MakeGraph() {
vertices_.clear(); vertices_.clear();
vertices_.resize(hashes_.size()); vertices_.resize(hashes_.size());
arcs_.clear(); arcs_.clear();
// One for every substring. // One for every substring.
arcs_.resize(((hashes_.size() + 1) * hashes_.size()) / 2); arcs_.resize(((hashes_.size() + 1) * hashes_.size()) / 2);
BuildGraph(substrings_, hashes_, &*vertices_.begin(), &*arcs_.begin()); BuildGraph(substrings_, hashes_, &*vertices_.begin(), &*arcs_.begin());
return vertices_[hashes_.size() - 1]; return vertices_[hashes_.size() - 1];

View File

@ -27,7 +27,7 @@ class Substrings {
private: private:
/* This is the value in a hash table where the key is a string. It indicates /* This is the value in a hash table where the key is a string. It indicates
* four sets of sentences: * four sets of sentences:
* substring is sentences with a phrase containing the key as a substring. * substring is sentences with a phrase containing the key as a substring.
* left is sentences with a phrase that begins with the key (left aligned). * left is sentences with a phrase that begins with the key (left aligned).
* right is sentences with a phrase that ends with the key (right aligned). * right is sentences with a phrase that ends with the key (right aligned).
* phrase is sentences where the key is a phrase. * phrase is sentences where the key is a phrase.
@ -39,8 +39,8 @@ class Substrings {
/* Most of the CPU is hash table lookups, so let's not complicate it with /* Most of the CPU is hash table lookups, so let's not complicate it with
* vector equality comparisons. If a collision happens, the SentenceRelation * vector equality comparisons. If a collision happens, the SentenceRelation
* structure will contain the union of sentence ids over the colliding strings. * structure will contain the union of sentence ids over the colliding strings.
* In that case, the filter will be slightly more permissive. * In that case, the filter will be slightly more permissive.
* The key here is the same as boost's hash of std::vector<std::string>. * The key here is the same as boost's hash of std::vector<std::string>.
*/ */
typedef boost::unordered_map<Hash, SentenceRelation> Table; typedef boost::unordered_map<Hash, SentenceRelation> Table;
@ -58,9 +58,9 @@ class Substrings {
LM_FILTER_PHRASE_METHOD(Phrase, phrase) LM_FILTER_PHRASE_METHOD(Phrase, phrase)
#pragma GCC diagnostic ignored "-Wuninitialized" // end != finish so there's always an initialization #pragma GCC diagnostic ignored "-Wuninitialized" // end != finish so there's always an initialization
// sentence_id must be non-decreasing. Iterators are over words in the phrase. // sentence_id must be non-decreasing. Iterators are over words in the phrase.
template <class Iterator> void AddPhrase(unsigned int sentence_id, const Iterator &begin, const Iterator &end) { template <class Iterator> void AddPhrase(unsigned int sentence_id, const Iterator &begin, const Iterator &end) {
// Iterate over all substrings. // Iterate over all substrings.
for (Iterator start = begin; start != end; ++start) { for (Iterator start = begin; start != end; ++start) {
Hash hash = 0; Hash hash = 0;
SentenceRelation *relation; SentenceRelation *relation;
@ -85,7 +85,7 @@ class Substrings {
}; };
// Read a file with one sentence per line containing tab-delimited phrases of // Read a file with one sentence per line containing tab-delimited phrases of
// space-separated words. // space-separated words.
unsigned int ReadMultiple(std::istream &in, Substrings &out); unsigned int ReadMultiple(std::istream &in, Substrings &out);
namespace detail { namespace detail {
@ -94,7 +94,7 @@ extern const StringPiece kEndSentence;
template <class Iterator> void MakeHashes(Iterator i, const Iterator &end, std::vector<Hash> &hashes) { template <class Iterator> void MakeHashes(Iterator i, const Iterator &end, std::vector<Hash> &hashes) {
hashes.clear(); hashes.clear();
if (i == end) return; if (i == end) return;
// TODO: check strict phrase boundaries after <s> and before </s>. For now, just skip tags. // TODO: check strict phrase boundaries after <s> and before </s>. For now, just skip tags.
if ((i->data()[0] == '<') && (i->data()[i->size() - 1] == '>')) { if ((i->data()[0] == '<') && (i->data()[i->size() - 1] == '>')) {
++i; ++i;
} }

View File

@ -88,7 +88,7 @@ class TargetWords {
class Input { class Input {
public: public:
explicit Input(std::size_t max_length) explicit Input(std::size_t max_length)
: max_length_(max_length), sentence_id_(0), empty_() {} : max_length_(max_length), sentence_id_(0), empty_() {}
void AddSentence(StringPiece sentence, TargetWords &targets) { void AddSentence(StringPiece sentence, TargetWords &targets) {
@ -125,7 +125,7 @@ class Input {
Map map_; Map map_;
std::size_t sentence_id_; std::size_t sentence_id_;
// Temporaries in AddSentence. // Temporaries in AddSentence.
std::string canonical_; std::string canonical_;
std::vector<std::size_t> starts_; std::vector<std::size_t> starts_;

View File

@ -13,29 +13,29 @@ namespace lm {
template <class OutputBuffer> class ThreadBatch { template <class OutputBuffer> class ThreadBatch {
public: public:
ThreadBatch() {} ThreadBatch() {}
// Forwards the same reservation size to both the input and the output buffer.
void Reserve(size_t size) { void Reserve(size_t size) {
input_.Reserve(size); input_.Reserve(size);
output_.Reserve(size); output_.Reserve(size);
} }
// File reading thread. // File reading thread.
// Records `sequence` for this batch, clears the input buffer, and returns the
// input buffer by reference so the caller can fill it.
InputBuffer &Fill(uint64_t sequence) { InputBuffer &Fill(uint64_t sequence) {
sequence_ = sequence; sequence_ = sequence;
// Why wait until now to clear instead of after output? free in the same // Why wait until now to clear instead of after output? free in the same
// thread as allocated. // thread as allocated.
input_.Clear(); input_.Clear();
return input_; return input_;
} }
// Filter worker thread. // Filter worker thread.
// Hands `filter` and this batch's output buffer to input_.CallFilter.
template <class Filter> void CallFilter(Filter &filter) { template <class Filter> void CallFilter(Filter &filter) {
input_.CallFilter(filter, output_); input_.CallFilter(filter, output_);
} }
// Returns the sequence number most recently recorded by Fill.
uint64_t Sequence() const { return sequence_; } uint64_t Sequence() const { return sequence_; }
// File writing thread. // File writing thread.
// Forwards this batch's buffered output to `output` via output_.Flush.
template <class RealOutput> void Flush(RealOutput &output) { template <class RealOutput> void Flush(RealOutput &output) {
output_.Flush(output); output_.Flush(output);
} }
@ -73,7 +73,7 @@ template <class Batch, class Output> class OutputWorker {
void operator()(Request request) { void operator()(Request request) {
assert(request->Sequence() >= base_sequence_); assert(request->Sequence() >= base_sequence_);
// Assemble the output in order. // Assemble the output in order.
uint64_t pos = request->Sequence() - base_sequence_; uint64_t pos = request->Sequence() - base_sequence_;
if (pos >= ordering_.size()) { if (pos >= ordering_.size()) {
ordering_.resize(pos + 1, NULL); ordering_.resize(pos + 1, NULL);
@ -102,7 +102,7 @@ template <class Filter, class OutputBuffer, class RealOutput> class Controller :
typedef ThreadBatch<OutputBuffer> Batch; typedef ThreadBatch<OutputBuffer> Batch;
public: public:
Controller(size_t batch_size, size_t queue, size_t workers, const Filter &filter, RealOutput &output) Controller(size_t batch_size, size_t queue, size_t workers, const Filter &filter, RealOutput &output)
: batch_size_(batch_size), queue_size_(queue), : batch_size_(batch_size), queue_size_(queue),
batches_(queue), batches_(queue),
to_read_(queue), to_read_(queue),

View File

@ -30,7 +30,7 @@ bool IsLineEnd(std::istream &in) {
}// namespace }// namespace
// Read space-separated words in newline-separated lines. These lines can be // Read space-separated words in newline-separated lines. These lines can be
// very long, so don't read an entire line at a time. // very long, so don't read an entire line at a time.
unsigned int ReadMultiple(std::istream &in, boost::unordered_map<std::string, std::vector<unsigned int> > &out) { unsigned int ReadMultiple(std::istream &in, boost::unordered_map<std::string, std::vector<unsigned int> > &out) {
in.exceptions(std::istream::badbit); in.exceptions(std::istream::badbit);
unsigned int sentence = 0; unsigned int sentence = 0;

View File

@ -26,7 +26,7 @@ unsigned int ReadMultiple(std::istream &in, boost::unordered_map<std::string, st
/* Is this a special tag like <s> or <UNK>? This actually includes anything /* Is this a special tag like <s> or <UNK>? This actually includes anything
* surrounded with < and >, which most tokenizers separate for real words, so * surrounded with < and >, which most tokenizers separate for real words, so
* this should not catch real words as it looks at a single token. * this should not catch real words as it looks at a single token.
*/ */
inline bool IsTag(const StringPiece &value) { inline bool IsTag(const StringPiece &value) {
// The parser should never give an empty string. // The parser should never give an empty string.

View File

@ -13,7 +13,7 @@ namespace lm {
// multiple-output filter so clients code against one interface. // multiple-output filter so clients code against one interface.
template <class Binary> class BinaryFilter { template <class Binary> class BinaryFilter {
public: public:
// Binary modes are just references (and a set) and it makes the API cleaner to copy them. // Binary modes are just references (and a set) and it makes the API cleaner to copy them.
explicit BinaryFilter(Binary binary) : binary_(binary) {} explicit BinaryFilter(Binary binary) : binary_(binary) {}
template <class Iterator, class Output> void AddNGram(const Iterator &begin, const Iterator &end, const StringPiece &line, Output &output) { template <class Iterator, class Output> void AddNGram(const Iterator &begin, const Iterator &end, const StringPiece &line, Output &output) {

View File

@ -1,22 +1,22 @@
/* Efficient left and right language model state for sentence fragments. /* Efficient left and right language model state for sentence fragments.
* Intended usage: * Intended usage:
* Store ChartState with every chart entry. * Store ChartState with every chart entry.
* To do a rule application: * To do a rule application:
* 1. Make a ChartState object for your new entry. * 1. Make a ChartState object for your new entry.
* 2. Construct RuleScore. * 2. Construct RuleScore.
* 3. Going from left to right, call Terminal or NonTerminal. * 3. Going from left to right, call Terminal or NonTerminal.
* For terminals, just pass the vocab id. * For terminals, just pass the vocab id.
* For non-terminals, pass that non-terminal's ChartState. * For non-terminals, pass that non-terminal's ChartState.
* If your decoder expects scores inclusive of subtree scores (i.e. you * If your decoder expects scores inclusive of subtree scores (i.e. you
* label entries with the highest-scoring path), pass the non-terminal's * label entries with the highest-scoring path), pass the non-terminal's
* score as prob. * score as prob.
* If your decoder expects relative scores and will walk the chart later, * If your decoder expects relative scores and will walk the chart later,
* pass prob = 0.0. * pass prob = 0.0.
* In other words, the only effect of prob is that it gets added to the * In other words, the only effect of prob is that it gets added to the
* returned log probability. * returned log probability.
* 4. Call Finish. It returns the log probability. * 4. Call Finish. It returns the log probability.
* *
* There's a couple more details: * There's a couple more details:
* Do not pass <s> to Terminal as it is formally not a word in the sentence, * Do not pass <s> to Terminal as it is formally not a word in the sentence,
* only context. Instead, call BeginSentence. If called, it should be the * only context. Instead, call BeginSentence. If called, it should be the
* first call after RuleScore is constructed (since <s> is always the * first call after RuleScore is constructed (since <s> is always the
@ -27,12 +27,12 @@
* Hashing and sorting comparison operators are provided. All state objects * Hashing and sorting comparison operators are provided. All state objects
* are POD. If you intend to use memcmp on raw state objects, you must call * are POD. If you intend to use memcmp on raw state objects, you must call
* ZeroRemaining first, as the value of array entries beyond length is * ZeroRemaining first, as the value of array entries beyond length is
* otherwise undefined. * otherwise undefined.
* *
* Usage is of course not limited to chart decoding. Anything that generates * Usage is of course not limited to chart decoding. Anything that generates
* sentence fragments missing left context could benefit. For example, a * sentence fragments missing left context could benefit. For example, a
* phrase-based decoder could pre-score phrases, storing ChartState with each * phrase-based decoder could pre-score phrases, storing ChartState with each
* phrase, even if hypotheses are generated left-to-right. * phrase, even if hypotheses are generated left-to-right.
*/ */
#ifndef LM_LEFT_H #ifndef LM_LEFT_H
@ -77,7 +77,7 @@ template <class M> class RuleScore {
left_done_ = true; left_done_ = true;
} }
// Faster version of NonTerminal for the case where the rule begins with a non-terminal. // Faster version of NonTerminal for the case where the rule begins with a non-terminal.
void BeginNonTerminal(const ChartState &in, float prob = 0.0) { void BeginNonTerminal(const ChartState &in, float prob = 0.0) {
prob_ = prob; prob_ = prob;
*out_ = in; *out_ = in;
@ -86,7 +86,7 @@ template <class M> class RuleScore {
void NonTerminal(const ChartState &in, float prob = 0.0) { void NonTerminal(const ChartState &in, float prob = 0.0) {
prob_ += prob; prob_ += prob;
if (!in.left.length) { if (!in.left.length) {
if (in.left.full) { if (in.left.full) {
for (const float *i = out_->right.backoff; i < out_->right.backoff + out_->right.length; ++i) prob_ += *i; for (const float *i = out_->right.backoff; i < out_->right.backoff + out_->right.length; ++i) prob_ += *i;
@ -131,26 +131,26 @@ template <class M> class RuleScore {
return; return;
} }
// Right state was minimized, so it's already independent of the new words to the left. // Right state was minimized, so it's already independent of the new words to the left.
if (in.right.length < in.left.length) { if (in.right.length < in.left.length) {
out_->right = in.right; out_->right = in.right;
return; return;
} }
// Shift existing words down. // Shift existing words down.
for (WordIndex *i = out_->right.words + next_use - 1; i >= out_->right.words; --i) { for (WordIndex *i = out_->right.words + next_use - 1; i >= out_->right.words; --i) {
*(i + in.right.length) = *i; *(i + in.right.length) = *i;
} }
// Add words from in.right. // Add words from in.right.
std::copy(in.right.words, in.right.words + in.right.length, out_->right.words); std::copy(in.right.words, in.right.words + in.right.length, out_->right.words);
// Assemble backoff composed on the existing state's backoff followed by the new state's backoff. // Assemble backoff composed on the existing state's backoff followed by the new state's backoff.
std::copy(in.right.backoff, in.right.backoff + in.right.length, out_->right.backoff); std::copy(in.right.backoff, in.right.backoff + in.right.length, out_->right.backoff);
std::copy(back, back + next_use, out_->right.backoff + in.right.length); std::copy(back, back + next_use, out_->right.backoff + in.right.length);
out_->right.length = in.right.length + next_use; out_->right.length = in.right.length + next_use;
} }
// Completes scoring: sets the output left state's `full` flag and returns the
// accumulated prob_.
float Finish() { float Finish() {
// An (N-1)-gram might extend left and right but we should still set full to true because it's an (N-1)-gram. // An (N-1)-gram might extend left and right but we should still set full to true because it's an (N-1)-gram.
// full is true when left_done_ is already set or left.length equals Order() - 1.
out_->left.full = left_done_ || (out_->left.length == model_.Order() - 1); out_->left.full = left_done_ || (out_->left.length == model_.Order() - 1);
return prob_; return prob_;
} }
@ -173,17 +173,17 @@ template <class M> class RuleScore {
back_in, // Backoffs to use back_in, // Backoffs to use
in.left.pointers[extend_length - 1], extend_length, // Words to be extended in.left.pointers[extend_length - 1], extend_length, // Words to be extended
back_out, // Backoffs for the next score back_out, // Backoffs for the next score
next_use)); // Length of n-gram to use in next scoring. next_use)); // Length of n-gram to use in next scoring.
if (next_use != out_->right.length) { if (next_use != out_->right.length) {
left_done_ = true; left_done_ = true;
if (!next_use) { if (!next_use) {
// Early exit. // Early exit.
out_->right = in.right; out_->right = in.right;
prob_ += model_.UnRest(in.left.pointers + extend_length, in.left.pointers + in.left.length, extend_length + 1); prob_ += model_.UnRest(in.left.pointers + extend_length, in.left.pointers + in.left.length, extend_length + 1);
return true; return true;
} }
} }
// Continue scoring. // Continue scoring.
return false; return false;
} }

View File

@ -16,7 +16,7 @@ namespace {
#define Term(word) score.Terminal(m.GetVocabulary().Index(word)); #define Term(word) score.Terminal(m.GetVocabulary().Index(word));
#define VCheck(word, value) BOOST_CHECK_EQUAL(m.GetVocabulary().Index(word), value); #define VCheck(word, value) BOOST_CHECK_EQUAL(m.GetVocabulary().Index(word), value);
// Apparently some Boost versions use templates and are pretty strict about types matching. // Apparently some Boost versions use templates and are pretty strict about types matching.
#define SLOPPY_CHECK_CLOSE(ref, value, tol) BOOST_CHECK_CLOSE(static_cast<double>(ref), static_cast<double>(value), static_cast<double>(tol)); #define SLOPPY_CHECK_CLOSE(ref, value, tol) BOOST_CHECK_CLOSE(static_cast<double>(ref), static_cast<double>(value), static_cast<double>(tol));
template <class M> void Short(const M &m) { template <class M> void Short(const M &m) {
@ -175,7 +175,7 @@ template <class M> void LookupVocab(const M &m, const StringPiece &str, std::vec
SLOPPY_CHECK_CLOSE(expect, RightToLeft(m, words, rest), 0.001); \ SLOPPY_CHECK_CLOSE(expect, RightToLeft(m, words, rest), 0.001); \
SLOPPY_CHECK_CLOSE(expect, TreeMiddle(m, words, rest), 0.001); \ SLOPPY_CHECK_CLOSE(expect, TreeMiddle(m, words, rest), 0.001); \
// Build sentences, or parts thereof, from right to left. // Build sentences, or parts thereof, from right to left.
template <class M> void GrowBig(const M &m, bool rest = false) { template <class M> void GrowBig(const M &m, bool rest = false) {
std::vector<WordIndex> words; std::vector<WordIndex> words;
float expect; float expect;

View File

@ -1,7 +1,7 @@
#ifndef LM_LM_EXCEPTION_H #ifndef LM_LM_EXCEPTION_H
#define LM_LM_EXCEPTION_H #define LM_LM_EXCEPTION_H
// Named to avoid conflict with util/exception.hh. // Named to avoid conflict with util/exception.hh.
#include "util/exception.hh" #include "util/exception.hh"
#include "util/string_piece.hh" #include "util/string_piece.hh"

View File

@ -1,7 +1,7 @@
#ifndef LM_MAX_ORDER_H #ifndef LM_MAX_ORDER_H
#define LM_MAX_ORDER_H #define LM_MAX_ORDER_H
/* IF YOUR BUILD SYSTEM PASSES -DKENLM_MAX_ORDER, THEN CHANGE THE BUILD SYSTEM. /* IF YOUR BUILD SYSTEM PASSES -DKENLM_MAX_ORDER, THEN CHANGE THE BUILD SYSTEM.
* If not, this is the default maximum order. * If not, this is the default maximum order.
* Having this limit means that State can be * Having this limit means that State can be
* (kMaxOrder - 1) * sizeof(float) bytes instead of * (kMaxOrder - 1) * sizeof(float) bytes instead of
* sizeof(float*) + (kMaxOrder - 1) * sizeof(float) + malloc overhead * sizeof(float*) + (kMaxOrder - 1) * sizeof(float) + malloc overhead

View File

@ -25,7 +25,7 @@ namespace lm {
namespace ngram { namespace ngram {
namespace detail { namespace detail {
// Should return the same results as SRI. // Should return the same results as SRI.
// ModelFacade typedefs Vocabulary so we use VocabularyT to avoid naming conflicts. // ModelFacade typedefs Vocabulary so we use VocabularyT to avoid naming conflicts.
template <class Search, class VocabularyT> class GenericModel : public base::ModelFacade<GenericModel<Search, VocabularyT>, State, VocabularyT> { template <class Search, class VocabularyT> class GenericModel : public base::ModelFacade<GenericModel<Search, VocabularyT>, State, VocabularyT> {
private: private:
@ -38,7 +38,7 @@ template <class Search, class VocabularyT> class GenericModel : public base::Mod
/* Get the size of memory that will be mapped given ngram counts. This /* Get the size of memory that will be mapped given ngram counts. This
* does not include small non-mapped control structures, such as this class * does not include small non-mapped control structures, such as this class
* itself. * itself.
*/ */
static uint64_t Size(const std::vector<uint64_t> &counts, const Config &config = Config()); static uint64_t Size(const std::vector<uint64_t> &counts, const Config &config = Config());
@ -46,47 +46,47 @@ template <class Search, class VocabularyT> class GenericModel : public base::Mod
* files must have the format expected by this class or you'll get an * files must have the format expected by this class or you'll get an
* exception. So TrieModel can only load ARPA or binary created by * exception. So TrieModel can only load ARPA or binary created by
* TrieModel. To classify binary files, call RecognizeBinary in * TrieModel. To classify binary files, call RecognizeBinary in
* lm/binary_format.hh. * lm/binary_format.hh.
*/ */
explicit GenericModel(const char *file, const Config &config = Config()); explicit GenericModel(const char *file, const Config &config = Config());
/* Score p(new_word | in_state) and incorporate new_word into out_state. /* Score p(new_word | in_state) and incorporate new_word into out_state.
* Note that in_state and out_state must be different references: * Note that in_state and out_state must be different references:
* &in_state != &out_state. * &in_state != &out_state.
*/ */
FullScoreReturn FullScore(const State &in_state, const WordIndex new_word, State &out_state) const; FullScoreReturn FullScore(const State &in_state, const WordIndex new_word, State &out_state) const;
/* Slower call without in_state. Try to remember state, but sometimes it /* Slower call without in_state. Try to remember state, but sometimes it
* would cost too much memory or your decoder isn't set up properly. * would cost too much memory or your decoder isn't set up properly.
* To use this function, make an array of WordIndex containing the context * To use this function, make an array of WordIndex containing the context
* vocabulary ids in reverse order. Then, pass the bounds of the array: * vocabulary ids in reverse order. Then, pass the bounds of the array:
* [context_rbegin, context_rend). The new_word is not part of the context * [context_rbegin, context_rend). The new_word is not part of the context
* array unless you intend to repeat words. * array unless you intend to repeat words.
*/ */
FullScoreReturn FullScoreForgotState(const WordIndex *context_rbegin, const WordIndex *context_rend, const WordIndex new_word, State &out_state) const; FullScoreReturn FullScoreForgotState(const WordIndex *context_rbegin, const WordIndex *context_rend, const WordIndex new_word, State &out_state) const;
/* Get the state for a context. Don't use this if you can avoid it. Use /* Get the state for a context. Don't use this if you can avoid it. Use
* BeginSentenceState or NullContextState and extend from those. If * BeginSentenceState or NullContextState and extend from those. If
* you're only going to use this state to call FullScore once, use * you're only going to use this state to call FullScore once, use
* FullScoreForgotState. * FullScoreForgotState.
* To use this function, make an array of WordIndex containing the context * To use this function, make an array of WordIndex containing the context
* vocabulary ids in reverse order. Then, pass the bounds of the array: * vocabulary ids in reverse order. Then, pass the bounds of the array:
* [context_rbegin, context_rend). * [context_rbegin, context_rend).
*/ */
void GetState(const WordIndex *context_rbegin, const WordIndex *context_rend, State &out_state) const; void GetState(const WordIndex *context_rbegin, const WordIndex *context_rend, State &out_state) const;
/* More efficient version of FullScore where a partial n-gram has already /* More efficient version of FullScore where a partial n-gram has already
* been scored. * been scored.
* NOTE: THE RETURNED .rest AND .prob ARE RELATIVE TO THE .rest RETURNED BEFORE. * NOTE: THE RETURNED .rest AND .prob ARE RELATIVE TO THE .rest RETURNED BEFORE.
*/ */
FullScoreReturn ExtendLeft( FullScoreReturn ExtendLeft(
// Additional context in reverse order. This will update add_rend to // Additional context in reverse order. This will update add_rend to
const WordIndex *add_rbegin, const WordIndex *add_rend, const WordIndex *add_rbegin, const WordIndex *add_rend,
// Backoff weights to use. // Backoff weights to use.
const float *backoff_in, const float *backoff_in,
// extend_left returned by a previous query. // extend_left returned by a previous query.
uint64_t extend_pointer, uint64_t extend_pointer,
// Length of n-gram that the pointer corresponds to. // Length of n-gram that the pointer corresponds to.
unsigned char extend_length, unsigned char extend_length,
// Where to write additional backoffs for [extend_length + 1, min(Order() - 1, return.ngram_length)] // Where to write additional backoffs for [extend_length + 1, min(Order() - 1, return.ngram_length)]
float *backoff_out, float *backoff_out,
@ -95,17 +95,17 @@ template <class Search, class VocabularyT> class GenericModel : public base::Mod
/* Return probabilities minus rest costs for an array of pointers. The /* Return probabilities minus rest costs for an array of pointers. The
* first length should be the length of the n-gram to which pointers_begin * first length should be the length of the n-gram to which pointers_begin
* points. * points.
*/ */
float UnRest(const uint64_t *pointers_begin, const uint64_t *pointers_end, unsigned char first_length) const { float UnRest(const uint64_t *pointers_begin, const uint64_t *pointers_end, unsigned char first_length) const {
// Compiler should optimize this conditional away. // Compiler should optimize this conditional away.
return Search::kDifferentRest ? InternalUnRest(pointers_begin, pointers_end, first_length) : 0.0; return Search::kDifferentRest ? InternalUnRest(pointers_begin, pointers_end, first_length) : 0.0;
} }
private: private:
FullScoreReturn ScoreExceptBackoff(const WordIndex *const context_rbegin, const WordIndex *const context_rend, const WordIndex new_word, State &out_state) const; FullScoreReturn ScoreExceptBackoff(const WordIndex *const context_rbegin, const WordIndex *const context_rend, const WordIndex new_word, State &out_state) const;
// Score bigrams and above. Do not include backoff. // Score bigrams and above. Do not include backoff.
void ResumeScore(const WordIndex *context_rbegin, const WordIndex *const context_rend, unsigned char starting_order_minus_2, typename Search::Node &node, float *backoff_out, unsigned char &next_use, FullScoreReturn &ret) const; void ResumeScore(const WordIndex *context_rbegin, const WordIndex *const context_rend, unsigned char starting_order_minus_2, typename Search::Node &node, float *backoff_out, unsigned char &next_use, FullScoreReturn &ret) const;
// Appears after Size in the cc file. // Appears after Size in the cc file.
@ -116,7 +116,7 @@ template <class Search, class VocabularyT> class GenericModel : public base::Mod
float InternalUnRest(const uint64_t *pointers_begin, const uint64_t *pointers_end, unsigned char first_length) const; float InternalUnRest(const uint64_t *pointers_begin, const uint64_t *pointers_end, unsigned char first_length) const;
BinaryFormat backing_; BinaryFormat backing_;
VocabularyT vocab_; VocabularyT vocab_;
Search search_; Search search_;
@ -124,8 +124,8 @@ template <class Search, class VocabularyT> class GenericModel : public base::Mod
} // namespace detail } // namespace detail
// Instead of typedef, inherit. This allows the Model etc to be forward declared. // Instead of typedef, inherit. This allows the Model etc to be forward declared.
// Oh the joys of C and C++. // Oh the joys of C and C++.
#define LM_COMMA() , #define LM_COMMA() ,
#define LM_NAME_MODEL(name, from)\ #define LM_NAME_MODEL(name, from)\
class name : public from {\ class name : public from {\
@ -140,7 +140,7 @@ LM_NAME_MODEL(ArrayTrieModel, detail::GenericModel<trie::TrieSearch<DontQuantize
LM_NAME_MODEL(QuantTrieModel, detail::GenericModel<trie::TrieSearch<SeparatelyQuantize LM_COMMA() trie::DontBhiksha> LM_COMMA() SortedVocabulary>); LM_NAME_MODEL(QuantTrieModel, detail::GenericModel<trie::TrieSearch<SeparatelyQuantize LM_COMMA() trie::DontBhiksha> LM_COMMA() SortedVocabulary>);
LM_NAME_MODEL(QuantArrayTrieModel, detail::GenericModel<trie::TrieSearch<SeparatelyQuantize LM_COMMA() trie::ArrayBhiksha> LM_COMMA() SortedVocabulary>); LM_NAME_MODEL(QuantArrayTrieModel, detail::GenericModel<trie::TrieSearch<SeparatelyQuantize LM_COMMA() trie::ArrayBhiksha> LM_COMMA() SortedVocabulary>);
// Default implementation. No real reason for it to be the default. // Default implementation. No real reason for it to be the default.
typedef ::lm::ngram::ProbingVocabulary Vocabulary; typedef ::lm::ngram::ProbingVocabulary Vocabulary;
typedef ProbingModel Model; typedef ProbingModel Model;

View File

@ -7,7 +7,7 @@
#include <boost/test/unit_test.hpp> #include <boost/test/unit_test.hpp>
#include <boost/test/floating_point_comparison.hpp> #include <boost/test/floating_point_comparison.hpp>
// Apparently some Boost versions use templates and are pretty strict about types matching. // Apparently some Boost versions use templates and are pretty strict about types matching.
#define SLOPPY_CHECK_CLOSE(ref, value, tol) BOOST_CHECK_CLOSE(static_cast<double>(ref), static_cast<double>(value), static_cast<double>(tol)); #define SLOPPY_CHECK_CLOSE(ref, value, tol) BOOST_CHECK_CLOSE(static_cast<double>(ref), static_cast<double>(value), static_cast<double>(tol));
namespace lm { namespace lm {
@ -118,7 +118,7 @@ template <class M> void Blanks(const M &model) {
AppendTest("not_found", 1, -1.995635 - 7.0 - 0.30103, true); AppendTest("not_found", 1, -1.995635 - 7.0 - 0.30103, true);
state = model.NullContextState(); state = model.NullContextState();
// higher looking is a blank. // higher looking is a blank.
AppendTest("higher", 1, -1.509559, false); AppendTest("higher", 1, -1.509559, false);
AppendTest("looking", 2, -1.285941 - 0.30103, false); AppendTest("looking", 2, -1.285941 - 0.30103, false);
@ -150,7 +150,7 @@ template <class M> void Unknowns(const M &model) {
State preserve = state; State preserve = state;
AppendTest("not_found2", 2, -15.0, true); AppendTest("not_found2", 2, -15.0, true);
AppendTest("not_found3", 2, -15.0 - 2.0, true); AppendTest("not_found3", 2, -15.0 - 2.0, true);
state = preserve; state = preserve;
AppendTest("however", 2, -4, true); AppendTest("however", 2, -4, true);
AppendTest("not_found3", 3, -6, true); AppendTest("not_found3", 3, -6, true);
@ -167,7 +167,7 @@ template <class M> void MinimalState(const M &model) {
AppendTest("foo", 1, -3.141592, true); AppendTest("foo", 1, -3.141592, true);
BOOST_CHECK_EQUAL(1, state.length); BOOST_CHECK_EQUAL(1, state.length);
AppendTest("bar", 2, -6.0, true); AppendTest("bar", 2, -6.0, true);
// Has to include the backoff weight. // Has to include the backoff weight.
BOOST_CHECK_EQUAL(1, state.length); BOOST_CHECK_EQUAL(1, state.length);
AppendTest("bar", 1, -2.718281 + 3.0, true); AppendTest("bar", 1, -2.718281 + 3.0, true);
BOOST_CHECK_EQUAL(1, state.length); BOOST_CHECK_EQUAL(1, state.length);
@ -263,7 +263,7 @@ template <class M> void Stateless(const M &model) {
// the // the
AppendTest("the", 1, -4.04005, true); AppendTest("the", 1, -4.04005, true);
StatelessTest(5, 5, 1, -4.04005); StatelessTest(5, 5, 1, -4.04005);
// No context of the. // No context of the.
StatelessTest(5, 0, 1, -1.687872); StatelessTest(5, 0, 1, -1.687872);
// biarritz // biarritz
StatelessTest(6, 1, 1, -1.9889); StatelessTest(6, 1, 1, -1.9889);

View File

@ -8,7 +8,7 @@ namespace ngram {
* and I want to preserve existing binary files. */ * and I want to preserve existing binary files. */
typedef enum {PROBING=0, REST_PROBING=1, TRIE=2, QUANT_TRIE=3, ARRAY_TRIE=4, QUANT_ARRAY_TRIE=5} ModelType; typedef enum {PROBING=0, REST_PROBING=1, TRIE=2, QUANT_TRIE=3, ARRAY_TRIE=4, QUANT_ARRAY_TRIE=5} ModelType;
// Historical names. // Historical names.
const ModelType HASH_PROBING = PROBING; const ModelType HASH_PROBING = PROBING;
const ModelType TRIE_SORTED = TRIE; const ModelType TRIE_SORTED = TRIE;
const ModelType QUANT_TRIE_SORTED = QUANT_TRIE; const ModelType QUANT_TRIE_SORTED = QUANT_TRIE;

View File

@ -22,7 +22,7 @@ struct BasicPrint {
std::cout << "Total: " << total << " OOV: " << oov << '\n'; std::cout << "Total: " << total << " OOV: " << oov << '\n';
} }
void Summary(double, double, uint64_t, uint64_t) {} void Summary(double, double, uint64_t, uint64_t) {}
}; };
struct FullPrint : public BasicPrint { struct FullPrint : public BasicPrint {
@ -31,7 +31,7 @@ struct FullPrint : public BasicPrint {
} }
void Summary(double ppl_including_oov, double ppl_excluding_oov, uint64_t corpus_oov, uint64_t corpus_tokens) { void Summary(double ppl_including_oov, double ppl_excluding_oov, uint64_t corpus_oov, uint64_t corpus_tokens) {
std::cout << std::cout <<
"Perplexity including OOVs:\t" << ppl_including_oov << "\n" "Perplexity including OOVs:\t" << ppl_including_oov << "\n"
"Perplexity excluding OOVs:\t" << ppl_excluding_oov << "\n" "Perplexity excluding OOVs:\t" << ppl_excluding_oov << "\n"
"OOVs:\t" << corpus_oov << "\n" "OOVs:\t" << corpus_oov << "\n"

View File

@ -35,9 +35,9 @@ template <class Model> ExtendReturn ExtendLoop(
unsigned char i = 0; unsigned char i = 0;
unsigned char length = pointers_end - pointers; unsigned char length = pointers_end - pointers;
// pointers_write being NULL means that the existing left state is full, so we should use completed probabilities. // pointers_write being NULL means that the existing left state is full, so we should use completed probabilities.
if (pointers_write) { if (pointers_write) {
// Using full context, writing to new left state. // Using full context, writing to new left state.
for (; i < length; ++i) { for (; i < length; ++i) {
FullScoreReturn ret(model.ExtendLeft( FullScoreReturn ret(model.ExtendLeft(
add_rbegin, add_rbegin + value.next_use, add_rbegin, add_rbegin + value.next_use,
@ -61,7 +61,7 @@ template <class Model> ExtendReturn ExtendLoop(
} }
} }
} }
// Using some of the new context. // Using some of the new context.
for (; i < length && value.next_use; ++i) { for (; i < length && value.next_use; ++i) {
FullScoreReturn ret(model.ExtendLeft( FullScoreReturn ret(model.ExtendLeft(
add_rbegin, add_rbegin + value.next_use, add_rbegin, add_rbegin + value.next_use,
@ -73,7 +73,7 @@ template <class Model> ExtendReturn ExtendLoop(
value.adjust += ret.prob; value.adjust += ret.prob;
} }
float unrest = model.UnRest(pointers + i, pointers_end, i + seen + 1); float unrest = model.UnRest(pointers + i, pointers_end, i + seen + 1);
// Using none of the new context. // Using none of the new context.
value.adjust += unrest; value.adjust += unrest;
std::copy(backoff_in, backoff_in + value.next_use, backoff_write); std::copy(backoff_in, backoff_in + value.next_use, backoff_write);
@ -100,7 +100,7 @@ template <class Model> float RevealBefore(const Model &model, const Right &revea
if (left.full) { if (left.full) {
for (unsigned char i = 0; i < value.next_use; ++i) value.adjust += backoff_buffer[i]; for (unsigned char i = 0; i < value.next_use; ++i) value.adjust += backoff_buffer[i];
} else { } else {
// If left wasn't full when it came in, put words into right state. // If left wasn't full when it came in, put words into right state.
std::copy(reveal.words + seen, reveal.words + seen + value.next_use, right.words + right.length); std::copy(reveal.words + seen, reveal.words + seen + value.next_use, right.words + right.length);
right.length += value.next_use; right.length += value.next_use;
left.full = value.make_full || (right.length == model.Order() - 1); left.full = value.make_full || (right.length == model.Order() - 1);

View File

@ -123,7 +123,7 @@ BOOST_AUTO_TEST_CASE(EndSentence) {
before.words[1] = loin; before.words[1] = loin;
before.backoff[0] = -0.845098; before.backoff[0] = -0.845098;
before.backoff[1] = 0.0; before.backoff[1] = 0.0;
before.length = 1; before.length = 1;
BOOST_CHECK_CLOSE(-0.0410707, RevealBefore(m, before, 0, true, between.left, between.right), 0.001); BOOST_CHECK_CLOSE(-0.0410707, RevealBefore(m, before, 0, true, between.left, between.right), 0.001);
BOOST_CHECK_EQUAL(0, between.left.length); BOOST_CHECK_EQUAL(0, between.left.length);
@ -159,7 +159,7 @@ void CheckAdjustment(const RestProbingModel &model, float expect, const Right &b
if (before_full) { if (before_full) {
got += RevealBefore(model, before, before.length, true, between.left, between.right); got += RevealBefore(model, before, before.length, true, between.left, between.right);
} }
// Sometimes they're zero and BOOST_CHECK_CLOSE fails for this. // Sometimes they're zero and BOOST_CHECK_CLOSE fails for this.
BOOST_CHECK(fabs(expect - got) < 0.001); BOOST_CHECK(fabs(expect - got) < 0.001);
} }

View File

@ -50,12 +50,12 @@ void SeparatelyQuantize::UpdateConfigFromBinary(const BinaryFormat &file, uint64
void SeparatelyQuantize::SetupMemory(void *base, unsigned char order, const Config &config) { void SeparatelyQuantize::SetupMemory(void *base, unsigned char order, const Config &config) {
prob_bits_ = config.prob_bits; prob_bits_ = config.prob_bits;
backoff_bits_ = config.backoff_bits; backoff_bits_ = config.backoff_bits;
// We need the reserved values. // We need the reserved values.
if (config.prob_bits == 0) UTIL_THROW(ConfigException, "You can't quantize probability to zero"); if (config.prob_bits == 0) UTIL_THROW(ConfigException, "You can't quantize probability to zero");
if (config.backoff_bits == 0) UTIL_THROW(ConfigException, "You can't quantize backoff to zero"); if (config.backoff_bits == 0) UTIL_THROW(ConfigException, "You can't quantize backoff to zero");
if (config.prob_bits > 25) UTIL_THROW(ConfigException, "For efficiency reasons, quantizing probability supports at most 25 bits. Currently you have requested " << static_cast<unsigned>(config.prob_bits) << " bits."); if (config.prob_bits > 25) UTIL_THROW(ConfigException, "For efficiency reasons, quantizing probability supports at most 25 bits. Currently you have requested " << static_cast<unsigned>(config.prob_bits) << " bits.");
if (config.backoff_bits > 25) UTIL_THROW(ConfigException, "For efficiency reasons, quantizing backoff supports at most 25 bits. Currently you have requested " << static_cast<unsigned>(config.backoff_bits) << " bits."); if (config.backoff_bits > 25) UTIL_THROW(ConfigException, "For efficiency reasons, quantizing backoff supports at most 25 bits. Currently you have requested " << static_cast<unsigned>(config.backoff_bits) << " bits.");
// Reserve 8 byte header for bit counts. // Reserve 8 byte header for bit counts.
actual_base_ = static_cast<uint8_t*>(base); actual_base_ = static_cast<uint8_t*>(base);
float *start = reinterpret_cast<float*>(actual_base_ + 8); float *start = reinterpret_cast<float*>(actual_base_ + 8);
for (unsigned char i = 0; i < order - 2; ++i) { for (unsigned char i = 0; i < order - 2; ++i) {

View File

@ -85,7 +85,7 @@ class DontQuantize {
void SetupMemory(void * /*start*/, unsigned char /*order*/, const Config & /*config*/) {} void SetupMemory(void * /*start*/, unsigned char /*order*/, const Config & /*config*/) {}
static const bool kTrain = false; static const bool kTrain = false;
// These should never be called because kTrain is false. // These should never be called because kTrain is false.
void Train(uint8_t /*order*/, std::vector<float> &/*prob*/, std::vector<float> &/*backoff*/) {} void Train(uint8_t /*order*/, std::vector<float> &/*prob*/, std::vector<float> &/*backoff*/) {}
void TrainProb(uint8_t, std::vector<float> &/*prob*/) {} void TrainProb(uint8_t, std::vector<float> &/*prob*/) {}
@ -142,7 +142,7 @@ class SeparatelyQuantize {
static uint64_t Size(uint8_t order, const Config &config) { static uint64_t Size(uint8_t order, const Config &config) {
uint64_t longest_table = (static_cast<uint64_t>(1) << static_cast<uint64_t>(config.prob_bits)) * sizeof(float); uint64_t longest_table = (static_cast<uint64_t>(1) << static_cast<uint64_t>(config.prob_bits)) * sizeof(float);
uint64_t middle_table = (static_cast<uint64_t>(1) << static_cast<uint64_t>(config.backoff_bits)) * sizeof(float) + longest_table; uint64_t middle_table = (static_cast<uint64_t>(1) << static_cast<uint64_t>(config.backoff_bits)) * sizeof(float) + longest_table;
// unigrams are currently not quantized so no need for a table. // unigrams are currently not quantized so no need for a table.
return (order - 2) * middle_table + longest_table + /* for the bit counts and alignment padding */ 8; return (order - 2) * middle_table + longest_table + /* for the bit counts and alignment padding */ 8;
} }
@ -168,7 +168,7 @@ class SeparatelyQuantize {
float Rest() const { return Prob(); } float Rest() const { return Prob(); }
void Write(float prob, float backoff) const { void Write(float prob, float backoff) const {
util::WriteInt57(address_.base, address_.offset, ProbBins().Bits() + BackoffBins().Bits(), util::WriteInt57(address_.base, address_.offset, ProbBins().Bits() + BackoffBins().Bits(),
(ProbBins().EncodeProb(prob) << BackoffBins().Bits()) | BackoffBins().EncodeBackoff(backoff)); (ProbBins().EncodeProb(prob) << BackoffBins().Bits()) | BackoffBins().EncodeBackoff(backoff));
} }
@ -183,7 +183,7 @@ class SeparatelyQuantize {
class LongestPointer { class LongestPointer {
public: public:
LongestPointer(const SeparatelyQuantize &quant, const util::BitAddress &address) : table_(&quant.LongestTable()), address_(address) {} LongestPointer(const SeparatelyQuantize &quant, const util::BitAddress &address) : table_(&quant.LongestTable()), address_(address) {}
LongestPointer() : address_(NULL, 0) {} LongestPointer() : address_(NULL, 0) {}
bool Found() const { return address_.base != NULL; } bool Found() const { return address_.base != NULL; }
@ -206,7 +206,7 @@ class SeparatelyQuantize {
void SetupMemory(void *start, unsigned char order, const Config &config); void SetupMemory(void *start, unsigned char order, const Config &config);
static const bool kTrain = true; static const bool kTrain = true;
// Assumes 0.0 is removed from backoff. // Assumes 0.0 is removed from backoff.
void Train(uint8_t order, std::vector<float> &prob, std::vector<float> &backoff); void Train(uint8_t order, std::vector<float> &prob, std::vector<float> &backoff);
// Train just probabilities (for longest order). // Train just probabilities (for longest order).
void TrainProb(uint8_t order, std::vector<float> &prob); void TrainProb(uint8_t order, std::vector<float> &prob);

View File

@ -9,7 +9,7 @@ struct FullScoreReturn {
// log10 probability // log10 probability
float prob; float prob;
/* The length of n-gram matched. Do not use this for recombination. /* The length of n-gram matched. Do not use this for recombination.
* Consider a model containing only the following n-grams: * Consider a model containing only the following n-grams:
* -1 foo * -1 foo
* -3.14 bar * -3.14 bar
@ -18,9 +18,9 @@ struct FullScoreReturn {
* *
* If you score ``bar'' then ngram_length is 1 and recombination state is the * If you score ``bar'' then ngram_length is 1 and recombination state is the
* empty string because bar has zero backoff and does not extend to the * empty string because bar has zero backoff and does not extend to the
* right. * right.
* If you score ``foo'' then ngram_length is 1 and recombination state is * If you score ``foo'' then ngram_length is 1 and recombination state is
* ``foo''. * ``foo''.
* *
* Ideally, keep output states around and compare them. Failing that, * Ideally, keep output states around and compare them. Failing that,
* get out_state.ValidLength() and use that length for recombination. * get out_state.ValidLength() and use that length for recombination.
@ -29,7 +29,7 @@ struct FullScoreReturn {
/* Left extension information. If independent_left is set, then prob is /* Left extension information. If independent_left is set, then prob is
* independent of words to the left (up to additional backoff). Otherwise, * independent of words to the left (up to additional backoff). Otherwise,
* extend_left indicates how to efficiently extend further to the left. * extend_left indicates how to efficiently extend further to the left.
*/ */
bool independent_left; bool independent_left;
uint64_t extend_left; // Defined only if !independent_left uint64_t extend_left; // Defined only if !independent_left

Some files were not shown because too many files have changed in this diff Show More