Mirror of https://github.com/moses-smt/mosesdecoder.git, synced 2024-12-28 14:32:38 +03:00

Commit 2b671e67dd: Merge branch 'master' of https://github.com/moses-smt/mosesdecoder

.gitignore (vendored)
@ -68,6 +68,7 @@ contrib/other-builds/*.xcodeproj/xcuserdata/
*/*.xcodeproj/xcuserdata

mert/sentence-bleu
mert/sentence-bleu-nbest
._*
.DS_Store
*.pbxuser
@ -1,4 +1,3 @@
Please see the Moses website on how to compile and run Moses
http://www.statmt.org/moses/?n=Development.GetStarted
Instructions for building and installing Moses are online:

blah blah blah
http://www.statmt.org/moses/?n=Development.GetStarted
@ -1,101 +1,101 @@
// $Id: Vocabulary.cpp 1565 2008-02-22 14:42:01Z bojar $
#include "Vocabulary.h"
#include <fstream>

namespace
{

const int MAX_LENGTH = 10000;

} // namespace

using namespace std;

// as in beamdecoder/tables.cpp
vector<WORD_ID> Vocabulary::Tokenize( const char input[] )
{
  vector< WORD_ID > token;
  bool betweenWords = true;
  int start=0;
  int i=0;
  for(; input[i] != '\0'; i++) {
    bool isSpace = (input[i] == ' ' || input[i] == '\t');

    if (!isSpace && betweenWords) {
      start = i;
      betweenWords = false;
    } else if (isSpace && !betweenWords) {
      token.push_back( StoreIfNew ( string( input+start, i-start ) ) );
      betweenWords = true;
    }
  }
  if (!betweenWords)
    token.push_back( StoreIfNew ( string( input+start, i-start ) ) );
  return token;
}

// return the id of a known word, or assign and return a fresh id
WORD_ID Vocabulary::StoreIfNew( const WORD& word )
{
  map<WORD, WORD_ID>::iterator i = lookup.find( word );

  if( i != lookup.end() )
    return i->second;

  WORD_ID id = vocab.size();
  vocab.push_back( word );
  lookup[ word ] = id;
  return id;
}

// look up a word without inserting it; unknown words map to 0
WORD_ID Vocabulary::GetWordID( const WORD &word ) const
{
  map<WORD, WORD_ID>::const_iterator i = lookup.find( word );
  if( i == lookup.end() )
    return 0;
  WORD_ID w = (WORD_ID) i->second;
  return w;
}

void Vocabulary::Save(const string& fileName ) const
{
  ofstream vcbFile;
  vcbFile.open( fileName.c_str(), ios::out | ios::ate | ios::trunc);

  if (!vcbFile) {
    cerr << "Failed to open " << fileName << endl;
    exit(1);
  }

  vector< WORD >::const_iterator i;
  for(i = vocab.begin(); i != vocab.end(); i++) {
    const string &word = *i;
    vcbFile << word << endl;
  }
  vcbFile.close();
}

void Vocabulary::Load(const string& fileName )
{
  ifstream vcbFile;
  char line[MAX_LENGTH];
  vcbFile.open(fileName.c_str());

  if (!vcbFile) {
    cerr << "no such file or directory: " << fileName << endl;
    exit(1);
  }

  cerr << "loading from " << fileName << endl;
  istream *fileP = &vcbFile;
  int count = 0;
  while(!fileP->eof()) {
    SAFE_GETLINE((*fileP), line, MAX_LENGTH, '\n');
    if (fileP->eof()) break;
    int length = 0;
    for(; line[length] != '\0'; length++);
    StoreIfNew( string( line, length ) );
    count++;
  }
  vcbFile.close();
  cerr << count << " words read, vocabulary size " << vocab.size() << endl;
}
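For orientation, a minimal usage sketch of the class above. This is a hedged illustration, not part of the commit: it assumes the Vocabulary class declaration, the WORD/WORD_ID typedefs, and a default constructor from Vocabulary.h, which is not shown in this diff.

#include <iostream>
#include <vector>

#include "Vocabulary.h" // declares Vocabulary, WORD, WORD_ID (not shown in this hunk)

int main()
{
  Vocabulary vocab;

  // Split a whitespace-separated line into word ids; unseen words get fresh ids.
  std::vector<WORD_ID> ids = vocab.Tokenize("the quick brown fox");

  // Look up a word without inserting it; GetWordID returns 0 for unknown words.
  std::cerr << "quick -> " << vocab.GetWordID("quick")
            << ", tokens: " << ids.size() << std::endl;

  // Persist one surface form per line, then reload into a fresh vocabulary.
  vocab.Save("vocab.txt");
  Vocabulary reloaded;
  reloaded.Load("vocab.txt");
  return 0;
}

As written, GetWordID returns 0 both for unknown words and for the word stored at index 0, so callers that care about that distinction need to reserve the first slot themselves.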
@ -46,7 +46,7 @@ RE2 curr_en_x("^[Nn]?[\'][\\p{L}]"); // english contraction suffixes conjoin to
|
||||
RE2 pre_en_x(".*[\\p{L}\\p{N}]+$"); // valid english contraction prefixes
|
||||
RE2 curr_fr_x(".*[\\p{L}\\p{N}]+[\']"); // french/italian contraction prefixes conjoin to the right
|
||||
RE2 post_fr_x("^[\\p{L}\\p{N}]*"); // valid french/italian contraction suffixes
|
||||
// anything rarely used will just be given as a string and compiled on demand by RE2
|
||||
// anything rarely used will just be given as a string and compiled on demand by RE2
|
||||
|
||||
const char *
|
||||
SPC_BYTE = " ";
|
||||
@ -85,8 +85,8 @@ const char *ESCAPE_MOSES[] = {
|
||||
"'", // ' 6 (27)
|
||||
""", // " 7 (22)
|
||||
};
|
||||
|
||||
const std::set<std::string>
|
||||
|
||||
const std::set<std::string>
|
||||
ESCAPE_SET = {
|
||||
std::string(ESCAPE_MOSES[0]),
|
||||
std::string(ESCAPE_MOSES[1]),
|
||||
@ -98,7 +98,7 @@ ESCAPE_SET = {
|
||||
std::string(ESCAPE_MOSES[7]),
|
||||
};
|
||||
|
||||
const std::map<std::wstring,gunichar>
|
||||
const std::map<std::wstring,gunichar>
|
||||
ENTITY_MAP = {
|
||||
{ std::wstring(L"""), L'"' },
|
||||
{ std::wstring(L"&"), L'&' },
|
||||
@ -355,7 +355,7 @@ ENTITY_MAP = {
|
||||
{ std::wstring(L"♦"), L'\u2666' }
|
||||
};
|
||||
|
||||
inline gunichar
|
||||
inline gunichar
|
||||
get_entity(gunichar *ptr, size_t len) {
|
||||
// try hex, decimal entity first
|
||||
gunichar ech(0);
|
||||
@ -380,16 +380,16 @@ get_entity(gunichar *ptr, size_t len) {
|
||||
ech = 0;
|
||||
}
|
||||
}
|
||||
if (ech)
|
||||
if (ech)
|
||||
return ech;
|
||||
|
||||
std::map<std::wstring,gunichar>::const_iterator it =
|
||||
std::map<std::wstring,gunichar>::const_iterator it =
|
||||
ENTITY_MAP.find(std::wstring((wchar_t *)(ptr),len));
|
||||
return it != ENTITY_MAP.end() ? it->second : gunichar(0);
|
||||
}
|
||||
|
||||
|
||||
inline gunichar
|
||||
inline gunichar
|
||||
get_entity(char *ptr, size_t len) {
|
||||
glong ulen = 0;
|
||||
gunichar *gtmp = g_utf8_to_ucs4_fast((const gchar *)ptr, len, &ulen);
|
||||
@ -399,7 +399,7 @@ get_entity(char *ptr, size_t len) {
|
||||
}
|
||||
|
||||
|
||||
inline std::string
|
||||
inline std::string
|
||||
trim(const std::string& in)
|
||||
{
|
||||
std::size_t start = 0;
|
||||
@ -413,7 +413,7 @@ trim(const std::string& in)
|
||||
}
|
||||
|
||||
|
||||
inline std::vector<std::string>
|
||||
inline std::vector<std::string>
|
||||
split(const std::string& in)
|
||||
{
|
||||
std::vector<std::string> outv;
|
||||
@ -476,7 +476,7 @@ Tokenizer::Tokenizer(const Parameters& _)
|
||||
//
|
||||
// dtor deletes dynamically allocated per-language RE2 compiled expressions
|
||||
//
|
||||
Tokenizer::~Tokenizer()
|
||||
Tokenizer::~Tokenizer()
|
||||
{
|
||||
for (auto& ptr : prot_pat_vec) {
|
||||
if (ptr == &numprefixed_x || ptr == &quasinumeric_x)
|
||||
@ -491,7 +491,7 @@ Tokenizer::~Tokenizer()
|
||||
// others into nbpre_gen_set
|
||||
//
|
||||
std::pair<int,int>
|
||||
Tokenizer::load_prefixes(std::ifstream& ifs)
|
||||
Tokenizer::load_prefixes(std::ifstream& ifs)
|
||||
{
|
||||
RE2 numonly("(.*)[\\s]+(\\#NUMERIC_ONLY\\#)");
|
||||
std::string line;
|
||||
@ -547,7 +547,7 @@ Tokenizer::init(const char *cfg_dir_optional) {
|
||||
try {
|
||||
std::pair<int,int> counts = load_prefixes(cfg);
|
||||
if (verbose_p) {
|
||||
std::cerr << "loaded " << counts.first << " non-numeric, "
|
||||
std::cerr << "loaded " << counts.first << " non-numeric, "
|
||||
<< counts.second << " numeric prefixes from "
|
||||
<< nbpre_path << std::endl;
|
||||
}
|
||||
@ -570,7 +570,7 @@ Tokenizer::init(const char *cfg_dir_optional) {
|
||||
std::string protpat_path(cfg_dir);
|
||||
protpat_path.append("/protected_pattern.").append(lang_iso);
|
||||
// default to generic version
|
||||
if (::access(protpat_path.c_str(),R_OK))
|
||||
if (::access(protpat_path.c_str(),R_OK))
|
||||
protpat_path = protpat_path.substr(0,protpat_path.size()-lang_iso.size()-1);
|
||||
|
||||
prot_pat_vec.push_back(&numprefixed_x);
|
||||
@ -596,7 +596,7 @@ Tokenizer::init(const char *cfg_dir_optional) {
|
||||
throw std::runtime_error(ess.str());
|
||||
}
|
||||
if (verbose_p) {
|
||||
std::cerr << "loaded " << npat << " protected patterns from "
|
||||
std::cerr << "loaded " << npat << " protected patterns from "
|
||||
<< protpat_path << std::endl;
|
||||
}
|
||||
} else if (verbose_p) {
|
||||
@ -612,7 +612,7 @@ Tokenizer::reset() {
|
||||
|
||||
//
|
||||
// apply ctor-selected tokenization to a string, in-place, no newlines allowed,
|
||||
// assumes protections are applied already, some invariants are in place,
|
||||
// assumes protections are applied already, some invariants are in place,
|
||||
// e.g. that successive chars <= ' ' have been normalized to a single ' '
|
||||
//
|
||||
void
|
||||
@ -633,7 +633,7 @@ Tokenizer::protected_tokenize(std::string& text) {
|
||||
}
|
||||
if (pos < textpc.size() && textpc[pos] != ' ')
|
||||
words.push_back(textpc.substr(pos,textpc.size()-pos));
|
||||
|
||||
|
||||
// regurgitate words with look-ahead handling for tokens with final mumble
|
||||
std::string outs;
|
||||
std::size_t nwords(words.size());
|
||||
@ -659,7 +659,7 @@ Tokenizer::protected_tokenize(std::string& text) {
|
||||
// lower-case look-ahead does not break
|
||||
sentence_break_p = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
outs.append(words[ii].data(),len);
|
||||
if (sentence_break_p)
|
||||
@ -671,15 +671,15 @@ Tokenizer::protected_tokenize(std::string& text) {
|
||||
}
|
||||
|
||||
|
||||
bool
|
||||
bool
|
||||
Tokenizer::unescape(std::string& word) {
|
||||
std::ostringstream oss;
|
||||
std::size_t was = 0; // last processed
|
||||
std::size_t pos = 0; // last unprocessed
|
||||
std::size_t len = 0; // processed length
|
||||
bool hit = false;
|
||||
for (std::size_t endp=0;
|
||||
(pos = word.find('&',was)) != std::string::npos && (endp = word.find(';',pos)) != std::string::npos;
|
||||
for (std::size_t endp=0;
|
||||
(pos = word.find('&',was)) != std::string::npos && (endp = word.find(';',pos)) != std::string::npos;
|
||||
was = endp == std::string::npos ? pos : 1+endp) {
|
||||
len = endp - pos + 1;
|
||||
glong ulen(0);
|
||||
@ -703,7 +703,7 @@ Tokenizer::unescape(std::string& word) {
|
||||
}
|
||||
g_free(gtmp);
|
||||
}
|
||||
if (was < word.size())
|
||||
if (was < word.size())
|
||||
oss << word.substr(was);
|
||||
if (hit)
|
||||
word = oss.str();
|
||||
@ -727,7 +727,7 @@ Tokenizer::escape(std::string& text) {
|
||||
if (mod_p)
|
||||
outs.append(pp,pt-pp+1);
|
||||
} else {
|
||||
if (mod_p)
|
||||
if (mod_p)
|
||||
outs.append(pp,mk-pp);
|
||||
pt = --mk;
|
||||
}
|
||||
@ -751,7 +751,7 @@ Tokenizer::escape(std::string& text) {
|
||||
} else if (*pt > ']') {
|
||||
if (*pt =='|') { // 7c
|
||||
sequence_p = ESCAPE_MOSES[0];
|
||||
}
|
||||
}
|
||||
} else if (*pt > 'Z') {
|
||||
if (*pt == '<') { // 3e
|
||||
sequence_p = ESCAPE_MOSES[4];
|
||||
@ -761,11 +761,11 @@ Tokenizer::escape(std::string& text) {
|
||||
sequence_p = ESCAPE_MOSES[1];
|
||||
} else if (*pt == ']') { // 5d
|
||||
sequence_p = ESCAPE_MOSES[2];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (sequence_p) {
|
||||
if (pt > pp)
|
||||
if (pt > pp)
|
||||
outs.append(pp,pt-pp);
|
||||
outs.append(sequence_p);
|
||||
mod_p = true;
|
||||
@ -774,7 +774,7 @@ Tokenizer::escape(std::string& text) {
|
||||
++pt;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if (mod_p) {
|
||||
if (pp < pt) {
|
||||
outs.append(pp,pt-pp);
|
||||
@ -795,13 +795,13 @@ Tokenizer::penn_tokenize(const std::string& buf)
|
||||
|
||||
std::string text(buf);
|
||||
std::string outs;
|
||||
if (skip_alltags_p)
|
||||
if (skip_alltags_p)
|
||||
RE2::GlobalReplace(&text,genl_tags_x,SPC_BYTE);
|
||||
|
||||
// directed quote patches
|
||||
size_t len = text.size();
|
||||
if (len > 2 && text.substr(0,2) == "``")
|
||||
text.replace(0,2,"`` ",3);
|
||||
if (len > 2 && text.substr(0,2) == "``")
|
||||
text.replace(0,2,"`` ",3);
|
||||
else if (text[0] == '"')
|
||||
text.replace(0,1,"`` ",3);
|
||||
else if (text[0] == '`' || text[0] == '\'')
|
||||
@ -811,9 +811,9 @@ Tokenizer::penn_tokenize(const std::string& buf)
|
||||
RE2::GlobalReplace(&text,x1_v_gg,one_gg);
|
||||
RE2::GlobalReplace(&text,x1_v_g,"\\1 ` \\2");
|
||||
RE2::GlobalReplace(&text,x1_v_q,"\\1 ` ");
|
||||
|
||||
|
||||
// protect ellipsis
|
||||
for (size_t pos = text.find("..."); pos != std::string::npos; pos = text.find("...",pos+11))
|
||||
for (size_t pos = text.find("..."); pos != std::string::npos; pos = text.find("...",pos+11))
|
||||
text.replace(pos,3,"MANYELIPSIS",11);
|
||||
|
||||
// numeric commas
|
||||
@ -826,13 +826,13 @@ Tokenizer::penn_tokenize(const std::string& buf)
|
||||
|
||||
// isolable slash
|
||||
RE2::GlobalReplace(&text,slash_x,special_refs);
|
||||
|
||||
|
||||
// isolate final period
|
||||
RE2::GlobalReplace(&text,final_x,"\\1 \\2\\3");
|
||||
|
||||
|
||||
// isolate q.m., e.m.
|
||||
RE2::GlobalReplace(&text,qx_x,isolate_ref);
|
||||
|
||||
|
||||
// isolate braces
|
||||
RE2::GlobalReplace(&text,braces_x,isolate_ref);
|
||||
|
||||
@ -866,7 +866,7 @@ Tokenizer::penn_tokenize(const std::string& buf)
|
||||
}
|
||||
std::string ntext(SPC_BYTE);
|
||||
ntext.append(text);
|
||||
|
||||
|
||||
// convert double quote to paired single-quotes
|
||||
RE2::GlobalReplace(&ntext,"\""," '' ");
|
||||
|
||||
@ -894,7 +894,7 @@ Tokenizer::penn_tokenize(const std::string& buf)
|
||||
RE2::GlobalReplace(&ntext," ([Ww])anna "," \\1an na ");
|
||||
|
||||
protected_tokenize(ntext);
|
||||
|
||||
|
||||
// restore ellipsis
|
||||
RE2::GlobalReplace(&ntext,"MANYELIPSIS","...");
|
||||
|
||||
@ -919,7 +919,7 @@ Tokenizer::quik_tokenize(const std::string& buf)
|
||||
int num = 0;
|
||||
|
||||
// this is the main moses-compatible tokenizer
|
||||
|
||||
|
||||
// push all the prefixes matching protected patterns
|
||||
std::vector<std::string> prot_stack;
|
||||
std::string match;
|
||||
@ -942,7 +942,7 @@ Tokenizer::quik_tokenize(const std::string& buf)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
const char *pt(text.c_str());
|
||||
const char *ep(pt + text.size());
|
||||
while (pt < ep && *pt >= 0 && *pt <= ' ')
|
||||
@ -990,8 +990,8 @@ Tokenizer::quik_tokenize(const std::string& buf)
|
||||
if (!since_start) {
|
||||
if (std::isalpha(char(*ucs4)))
|
||||
alpha_prefix++;
|
||||
} else if (alpha_prefix == since_start
|
||||
&& char(*ucs4) == ':'
|
||||
} else if (alpha_prefix == since_start
|
||||
&& char(*ucs4) == ':'
|
||||
&& next_type != G_UNICODE_SPACE_SEPARATOR) {
|
||||
in_url_p = true;
|
||||
}
|
||||
@ -1018,7 +1018,7 @@ Tokenizer::quik_tokenize(const std::string& buf)
|
||||
// fallthough
|
||||
case G_UNICODE_UPPERCASE_LETTER:
|
||||
case G_UNICODE_LOWERCASE_LETTER:
|
||||
if (downcase_p && curr_type == G_UNICODE_UPPERCASE_LETTER)
|
||||
if (downcase_p && curr_type == G_UNICODE_UPPERCASE_LETTER)
|
||||
curr_uch = g_unichar_tolower(*ucs4);
|
||||
break;
|
||||
case G_UNICODE_SPACING_MARK:
|
||||
@ -1082,8 +1082,8 @@ Tokenizer::quik_tokenize(const std::string& buf)
|
||||
substitute_p = L"@-@";
|
||||
post_break_p = pre_break_p = true;
|
||||
} else if ( ( curr_uch > gunichar(L'\u002D') && curr_uch < gunichar(L'\u2010') ) ||
|
||||
( curr_uch > gunichar(L'\u2011')
|
||||
&& curr_uch != gunichar(L'\u30A0')
|
||||
( curr_uch > gunichar(L'\u2011')
|
||||
&& curr_uch != gunichar(L'\u30A0')
|
||||
&& curr_uch < gunichar(L'\uFE63') ) ) {
|
||||
// dash, not a hyphen
|
||||
post_break_p = pre_break_p = true;
|
||||
@ -1151,7 +1151,7 @@ Tokenizer::quik_tokenize(const std::string& buf)
|
||||
default:
|
||||
post_break_p = pre_break_p = prev_uch != curr_uch;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
@ -1159,8 +1159,8 @@ Tokenizer::quik_tokenize(const std::string& buf)
|
||||
switch (curr_uch) {
|
||||
case gunichar(L':'):
|
||||
case gunichar(L'/'):
|
||||
if (refined_p && !in_url_p
|
||||
&& prev_type == G_UNICODE_DECIMAL_NUMBER
|
||||
if (refined_p && !in_url_p
|
||||
&& prev_type == G_UNICODE_DECIMAL_NUMBER
|
||||
&& next_type == G_UNICODE_DECIMAL_NUMBER) {
|
||||
break;
|
||||
}
|
||||
@ -1178,7 +1178,7 @@ Tokenizer::quik_tokenize(const std::string& buf)
|
||||
break;
|
||||
case gunichar(L'&'):
|
||||
if (unescape_p) {
|
||||
if (next_type == G_UNICODE_LOWERCASE_LETTER || next_type == G_UNICODE_UPPERCASE_LETTER
|
||||
if (next_type == G_UNICODE_LOWERCASE_LETTER || next_type == G_UNICODE_UPPERCASE_LETTER
|
||||
|| next_type == G_UNICODE_DECIMAL_NUMBER || next_uch == gunichar(L'#')) {
|
||||
gunichar *eptr = nxt4;
|
||||
GUnicodeType eptr_type(G_UNICODE_UNASSIGNED);
|
||||
@ -1223,16 +1223,16 @@ Tokenizer::quik_tokenize(const std::string& buf)
|
||||
next_type = nxt4 < lim4 ? g_unichar_type(next_uch) : G_UNICODE_UNASSIGNED;
|
||||
goto retry;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
post_break_p = pre_break_p = !in_url_p || next_type != G_UNICODE_SPACE_SEPARATOR;
|
||||
if (escape_p)
|
||||
if (escape_p)
|
||||
substitute_p = L"&";
|
||||
break;
|
||||
case gunichar(L'\''):
|
||||
if (english_p) {
|
||||
if (!in_url_p) {
|
||||
bool next_letter_p = next_type == G_UNICODE_LOWERCASE_LETTER
|
||||
bool next_letter_p = next_type == G_UNICODE_LOWERCASE_LETTER
|
||||
|| next_type == G_UNICODE_UPPERCASE_LETTER;
|
||||
pre_break_p = true;
|
||||
if (next_letter_p && refined_p) {
|
||||
@ -1241,9 +1241,9 @@ Tokenizer::quik_tokenize(const std::string& buf)
|
||||
*(uptr - 1) = gunichar(L' ');
|
||||
*(uptr++) = prev_uch;
|
||||
pre_break_p = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
post_break_p = since_start == 0
|
||||
post_break_p = since_start == 0
|
||||
|| (!next_letter_p && next_type != G_UNICODE_DECIMAL_NUMBER);
|
||||
}
|
||||
} else if (latin_p) {
|
||||
@ -1252,12 +1252,12 @@ Tokenizer::quik_tokenize(const std::string& buf)
|
||||
} else {
|
||||
post_break_p = pre_break_p = !in_url_p;
|
||||
}
|
||||
if (escape_p)
|
||||
if (escape_p)
|
||||
substitute_p = L"'";
|
||||
break;
|
||||
case gunichar(L'"'):
|
||||
post_break_p = pre_break_p = true;
|
||||
if (escape_p)
|
||||
if (escape_p)
|
||||
substitute_p = L""";
|
||||
break;
|
||||
case gunichar(L','):
|
||||
@ -1303,7 +1303,7 @@ Tokenizer::quik_tokenize(const std::string& buf)
|
||||
}
|
||||
}
|
||||
// terminal isolated letter does not break
|
||||
} else if (class_follows_p(nxt4,lim4,G_UNICODE_LOWERCASE_LETTER) ||
|
||||
} else if (class_follows_p(nxt4,lim4,G_UNICODE_LOWERCASE_LETTER) ||
|
||||
g_unichar_type(*nxt4) == G_UNICODE_DASH_PUNCTUATION) {
|
||||
// lower-case look-ahead does not break
|
||||
} else {
|
||||
@ -1315,7 +1315,7 @@ Tokenizer::quik_tokenize(const std::string& buf)
|
||||
pre_break_p = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
@ -1346,11 +1346,11 @@ Tokenizer::quik_tokenize(const std::string& buf)
|
||||
case gunichar(L')'):
|
||||
break;
|
||||
case gunichar(L'['):
|
||||
if (escape_p)
|
||||
if (escape_p)
|
||||
substitute_p = L"[";
|
||||
break;
|
||||
case gunichar(L']'):
|
||||
if (escape_p)
|
||||
if (escape_p)
|
||||
substitute_p = L"]";
|
||||
break;
|
||||
default:
|
||||
@ -1377,7 +1377,7 @@ Tokenizer::quik_tokenize(const std::string& buf)
|
||||
if (english_p) {
|
||||
if (!in_url_p) {
|
||||
pre_break_p = true;
|
||||
post_break_p = since_start == 0 ||
|
||||
post_break_p = since_start == 0 ||
|
||||
(next_type != G_UNICODE_LOWERCASE_LETTER && next_type != G_UNICODE_UPPERCASE_LETTER && next_type != G_UNICODE_DECIMAL_NUMBER);
|
||||
}
|
||||
} else if (latin_p) {
|
||||
@ -1386,23 +1386,23 @@ Tokenizer::quik_tokenize(const std::string& buf)
|
||||
} else {
|
||||
post_break_p = pre_break_p = !in_url_p;
|
||||
}
|
||||
if (escape_p)
|
||||
if (escape_p)
|
||||
substitute_p = L"'";
|
||||
else
|
||||
else
|
||||
curr_uch = gunichar(L'\'');
|
||||
break;
|
||||
case gunichar(L'|'):
|
||||
if (escape_p)
|
||||
if (escape_p)
|
||||
substitute_p = L"|";
|
||||
post_break_p = pre_break_p = true;
|
||||
break;
|
||||
case gunichar(L'<'):
|
||||
if (escape_p)
|
||||
if (escape_p)
|
||||
substitute_p = L"<";
|
||||
post_break_p = pre_break_p = true;
|
||||
break;
|
||||
case gunichar(L'>'):
|
||||
if (escape_p)
|
||||
if (escape_p)
|
||||
substitute_p = L">";
|
||||
post_break_p = pre_break_p = true;
|
||||
break;
|
||||
@ -1414,7 +1414,7 @@ Tokenizer::quik_tokenize(const std::string& buf)
|
||||
case gunichar(L'='):
|
||||
case gunichar(L'~'):
|
||||
in_num_p = false;
|
||||
post_break_p = pre_break_p = !in_url_p;
|
||||
post_break_p = pre_break_p = !in_url_p;
|
||||
break;
|
||||
case gunichar(L'+'):
|
||||
post_break_p = pre_break_p = !in_url_p;
|
||||
@ -1444,12 +1444,12 @@ Tokenizer::quik_tokenize(const std::string& buf)
|
||||
curr_uch = gunichar(L' ');
|
||||
} else if (curr_uch < gunichar(L' ')) {
|
||||
curr_uch = gunichar(L' ');
|
||||
} else if (curr_uch == gunichar(L'\u0092') &&
|
||||
} else if (curr_uch == gunichar(L'\u0092') &&
|
||||
(next_type == G_UNICODE_LOWERCASE_LETTER || next_type == G_UNICODE_UPPERCASE_LETTER)) {
|
||||
// observed corpus corruption case
|
||||
if (english_p) {
|
||||
pre_break_p = true;
|
||||
post_break_p = since_start == 0 ||
|
||||
post_break_p = since_start == 0 ||
|
||||
(next_type != G_UNICODE_LOWERCASE_LETTER && next_type != G_UNICODE_UPPERCASE_LETTER && next_type != G_UNICODE_DECIMAL_NUMBER);
|
||||
} else if (latin_p) {
|
||||
post_break_p = true;
|
||||
@ -1457,9 +1457,9 @@ Tokenizer::quik_tokenize(const std::string& buf)
|
||||
} else {
|
||||
post_break_p = pre_break_p = true;
|
||||
}
|
||||
if (escape_p)
|
||||
if (escape_p)
|
||||
substitute_p = L"'";
|
||||
else
|
||||
else
|
||||
curr_uch = gunichar(L'\'');
|
||||
} else {
|
||||
post_break_p = pre_break_p = true;
|
||||
@ -1491,7 +1491,7 @@ Tokenizer::quik_tokenize(const std::string& buf)
|
||||
in_url_p = in_num_p = false;
|
||||
break;
|
||||
}
|
||||
|
||||
|
||||
if (pre_break_p || curr_uch == gunichar(L' ') || (bad_length && curr_type != G_UNICODE_UNASSIGNED)) {
|
||||
if (since_start) {
|
||||
// non-empty token emitted previously, so pre-break must emit token separator
|
||||
@ -1501,8 +1501,8 @@ Tokenizer::quik_tokenize(const std::string& buf)
|
||||
if (curr_uch == gunichar(L' '))
|
||||
// suppress emission below, fall-through to substitute logic
|
||||
curr_uch = 0;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
if (substitute_p) {
|
||||
for (gunichar *sptr = (gunichar *)substitute_p; *sptr; ++sptr) {
|
||||
*uptr++ = *sptr;
|
||||
@ -1521,7 +1521,7 @@ Tokenizer::quik_tokenize(const std::string& buf)
|
||||
|
||||
glong nbytes = 0;
|
||||
gchar *utf8 = g_ucs4_to_utf8(ubuf,uptr-ubuf,0,&nbytes,0); // g_free
|
||||
if (utf8[nbytes-1] == ' ')
|
||||
if (utf8[nbytes-1] == ' ')
|
||||
--nbytes;
|
||||
text.assign((const char *)utf8,(const char *)(utf8 + nbytes));
|
||||
g_free(utf8);
|
||||
@ -1552,7 +1552,7 @@ Tokenizer::quik_tokenize(const std::string& buf)
|
||||
}
|
||||
|
||||
|
||||
std::size_t
|
||||
std::size_t
|
||||
Tokenizer::tokenize(std::istream& is, std::ostream& os)
|
||||
{
|
||||
std::size_t line_no = 0;
|
||||
@ -1561,10 +1561,10 @@ Tokenizer::tokenize(std::istream& is, std::ostream& os)
|
||||
std::vector< std::vector< std::string > > results(nthreads);
|
||||
std::vector< boost::thread > workers(nthreads);
|
||||
bool done_p = !(is.good() && os.good());
|
||||
|
||||
|
||||
|
||||
for (std::size_t tranche = 0; !done_p; ++tranche) {
|
||||
|
||||
|
||||
// for loop starting threads for chunks of input
|
||||
for (std::size_t ithread = 0; ithread < nthreads; ++ithread) {
|
||||
|
||||
@ -1589,19 +1589,19 @@ Tokenizer::tokenize(std::istream& is, std::ostream& os)
|
||||
results[ithread].resize(line_pos);
|
||||
break;
|
||||
}
|
||||
lines[ithread][line_pos].clear();
|
||||
} else if (skip_xml_p &&
|
||||
(RE2::FullMatch(istr,tag_line_x) || RE2::FullMatch(istr,white_line_x))) {
|
||||
lines[ithread][line_pos].clear();
|
||||
lines[ithread][line_pos].clear();
|
||||
} else if (skip_xml_p &&
|
||||
(RE2::FullMatch(istr,tag_line_x) || RE2::FullMatch(istr,white_line_x))) {
|
||||
lines[ithread][line_pos].clear();
|
||||
} else {
|
||||
lines[ithread][line_pos] =
|
||||
std::string(SPC_BYTE).append(istr).append(SPC_BYTE);
|
||||
lines[ithread][line_pos] =
|
||||
std::string(SPC_BYTE).append(istr).append(SPC_BYTE);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (line_pos) {
|
||||
workers[ithread] =
|
||||
boost::thread(VectorTokenizerCallable(this,lines[ithread],results[ithread]));
|
||||
workers[ithread] =
|
||||
boost::thread(VectorTokenizerCallable(this,lines[ithread],results[ithread]));
|
||||
}
|
||||
} // end for loop starting threads
|
||||
|
||||
@ -1616,22 +1616,22 @@ Tokenizer::tokenize(std::istream& is, std::ostream& os)
|
||||
|
||||
if (nlin != nres) {
|
||||
std::ostringstream emsg;
|
||||
emsg << "Tranche " << tranche
|
||||
<< " worker " << ithread << "/" << nthreads
|
||||
emsg << "Tranche " << tranche
|
||||
<< " worker " << ithread << "/" << nthreads
|
||||
<< " |lines|==" << nlin << " != |results|==" << nres;
|
||||
throw std::runtime_error(emsg.str());
|
||||
}
|
||||
|
||||
for (std::size_t ires = 0; ires < nres; ++ires)
|
||||
for (std::size_t ires = 0; ires < nres; ++ires)
|
||||
os << results[ithread][ires] << std::endl;
|
||||
|
||||
} // end loop over joined results
|
||||
|
||||
|
||||
if (verbose_p) {
|
||||
std::cerr << line_no << ' ';
|
||||
std::cerr.flush();
|
||||
}
|
||||
|
||||
|
||||
} // end loop over chunks
|
||||
|
||||
return line_no;
|
||||
@ -1642,18 +1642,18 @@ std::string
|
||||
Tokenizer::detokenize(const std::string& buf)
|
||||
{
|
||||
std::vector<std::string> words = split(trim(buf));
|
||||
|
||||
|
||||
std::size_t squotes = 0;
|
||||
std::size_t dquotes = 0;
|
||||
std::string prepends("");
|
||||
|
||||
std::ostringstream oss;
|
||||
|
||||
|
||||
std::size_t nwords = words.size();
|
||||
std::size_t iword = 0;
|
||||
|
||||
if (unescape_p)
|
||||
for (auto &word: words)
|
||||
if (unescape_p)
|
||||
for (auto &word: words)
|
||||
unescape(word);
|
||||
|
||||
for (auto &word: words) {
|
||||
@ -1665,13 +1665,13 @@ Tokenizer::detokenize(const std::string& buf)
|
||||
} else if (RE2::FullMatch(word,left_x)) {
|
||||
oss << word;
|
||||
prepends = SPC_BYTE;
|
||||
} else if (english_p && iword
|
||||
&& RE2::FullMatch(word,curr_en_x)
|
||||
} else if (english_p && iword
|
||||
&& RE2::FullMatch(word,curr_en_x)
|
||||
&& RE2::FullMatch(words[iword-1],pre_en_x)) {
|
||||
oss << word;
|
||||
prepends = SPC_BYTE;
|
||||
} else if (latin_p && iword < nwords - 2
|
||||
&& RE2::FullMatch(word,curr_fr_x)
|
||||
} else if (latin_p && iword < nwords - 2
|
||||
&& RE2::FullMatch(word,curr_fr_x)
|
||||
&& RE2::FullMatch(words[iword+1],post_fr_x)) {
|
||||
oss << prepends << word;
|
||||
prepends.clear();
|
||||
@ -1679,7 +1679,7 @@ Tokenizer::detokenize(const std::string& buf)
|
||||
if ((word.at(0) == '\'' && ((squotes % 2) == 0 )) ||
|
||||
(word.at(0) == '"' && ((dquotes % 2) == 0))) {
|
||||
if (english_p && iword
|
||||
&& word.at(0) == '\''
|
||||
&& word.at(0) == '\''
|
||||
&& std::tolower(words[iword-1].at(words[iword-1].size()-1)) == 's') {
|
||||
oss << word;
|
||||
prepends = SPC_BYTE;
|
||||
@ -1698,7 +1698,7 @@ Tokenizer::detokenize(const std::string& buf)
|
||||
prepends = SPC_BYTE;
|
||||
if (word.at(0) == '\'')
|
||||
squotes++;
|
||||
else if (word.at(0) == '"')
|
||||
else if (word.at(0) == '"')
|
||||
dquotes++;
|
||||
}
|
||||
} else {
|
||||
@ -1707,8 +1707,8 @@ Tokenizer::detokenize(const std::string& buf)
|
||||
}
|
||||
iword++;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
std::string text(oss.str());
|
||||
RE2::GlobalReplace(&text," +",SPC_BYTE);
|
||||
RE2::GlobalReplace(&text,"\n ","\n");
|
||||
@ -1718,14 +1718,14 @@ Tokenizer::detokenize(const std::string& buf)
|
||||
|
||||
|
||||
std::size_t
|
||||
Tokenizer::detokenize(std::istream& is, std::ostream& os)
|
||||
Tokenizer::detokenize(std::istream& is, std::ostream& os)
|
||||
{
|
||||
size_t line_no = 0;
|
||||
while (is.good() && os.good()) {
|
||||
std::string istr;
|
||||
std::getline(is,istr);
|
||||
line_no ++;
|
||||
if (istr.empty())
|
||||
if (istr.empty())
|
||||
continue;
|
||||
if (skip_xml_p && (RE2::FullMatch(istr,tag_line_x) || RE2::FullMatch(istr,white_line_x))) {
|
||||
os << istr << std::endl;
|
||||
@ -1749,7 +1749,7 @@ Tokenizer::splitter(const std::string &istr, bool *continuation_ptr) {
|
||||
return parts;
|
||||
}
|
||||
gunichar *uout = (gunichar *)g_malloc0(2*ncp*sizeof(gunichar));
|
||||
|
||||
|
||||
const wchar_t GENL_HYPH = L'\u2010';
|
||||
const wchar_t IDEO_STOP = L'\u3002';
|
||||
const wchar_t KANA_MDOT = L'\u30FB';
|
||||
@ -1786,7 +1786,7 @@ Tokenizer::splitter(const std::string &istr, bool *continuation_ptr) {
|
||||
|
||||
std::vector<std::size_t> breaks;
|
||||
std::set<std::size_t> suppress;
|
||||
|
||||
|
||||
for (; icp <= ncp; ++icp) {
|
||||
currwc = wchar_t(ucs4[icp]);
|
||||
curr_type = g_unichar_type(currwc);
|
||||
@ -1798,7 +1798,7 @@ Tokenizer::splitter(const std::string &istr, bool *continuation_ptr) {
|
||||
case G_UNICODE_OTHER_NUMBER:
|
||||
curr_class = numba;
|
||||
curr_word_p = true;
|
||||
break;
|
||||
break;
|
||||
case G_UNICODE_LOWERCASE_LETTER:
|
||||
case G_UNICODE_MODIFIER_LETTER:
|
||||
case G_UNICODE_OTHER_LETTER:
|
||||
@ -1822,7 +1822,7 @@ Tokenizer::splitter(const std::string &istr, bool *continuation_ptr) {
|
||||
} else if (currwc >= SMAL_HYPH) {
|
||||
curr_word_p = true;
|
||||
} else {
|
||||
curr_word_p = (currwc >= WAVE_DASH) && (currwc <= KANA_DHYP);
|
||||
curr_word_p = (currwc >= WAVE_DASH) && (currwc <= KANA_DHYP);
|
||||
}
|
||||
break;
|
||||
case G_UNICODE_CLOSE_PUNCTUATION:
|
||||
@ -1860,7 +1860,7 @@ Tokenizer::splitter(const std::string &istr, bool *continuation_ptr) {
|
||||
curr_word_p = false;
|
||||
break;
|
||||
}
|
||||
|
||||
|
||||
// # condition for prefix test
|
||||
// $words[$i] =~ /([\p{IsAlnum}\.\-]*)([\'\"\)\]\%\p{IsPf}]*)(\.+)$/
|
||||
// $words[$i+1] =~ /^([ ]*[\'\"\(\[\¿\¡\p{IsPi}]*[ ]*[\p{IsUpper}0-9])/
|
||||
@ -1875,7 +1875,7 @@ Tokenizer::splitter(const std::string &istr, bool *continuation_ptr) {
|
||||
} else if (curr_word_p) {
|
||||
if (!fini_word) {
|
||||
init_word = ocp;
|
||||
}
|
||||
}
|
||||
fini_word = ocp+1;
|
||||
dotslen = finilen = 0;
|
||||
} else if (curr_class >= quote && curr_class <= pfpct && curr_class != pinit) {
|
||||
@ -1893,7 +1893,7 @@ Tokenizer::splitter(const std::string &istr, bool *continuation_ptr) {
|
||||
} else {
|
||||
init_word = fini_word = 0;
|
||||
}
|
||||
|
||||
|
||||
if (check_abbr_p) {
|
||||
// not a valid word character or post-word punctuation character: check word
|
||||
std::wstring k((wchar_t *)uout+init_word,fini_word-init_word);
|
||||
@ -1986,7 +1986,7 @@ Tokenizer::splitter(const std::string &istr, bool *continuation_ptr) {
|
||||
}
|
||||
init_word = fini_word = 0;
|
||||
}
|
||||
|
||||
|
||||
if (seqpos >= SEQ_LIM) {
|
||||
seqpos = 0;
|
||||
}
|
||||
@ -2015,7 +2015,7 @@ Tokenizer::splitter(const std::string &istr, bool *continuation_ptr) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if (!seqpos) {
|
||||
if (curr_class != blank) {
|
||||
uout[ocp++] = gunichar(currwc);
|
||||
@ -2024,7 +2024,7 @@ Tokenizer::splitter(const std::string &istr, bool *continuation_ptr) {
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
|
||||
if (curr_class == blank) {
|
||||
if (prev_class != blank) {
|
||||
seq[seqpos] = blank;
|
||||
@ -2034,7 +2034,7 @@ Tokenizer::splitter(const std::string &istr, bool *continuation_ptr) {
|
||||
}
|
||||
if (icp < ncp)
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
if (curr_class >= quote && curr_class <= pfini) {
|
||||
if (prev_class < quote || prev_class > pfini) {
|
||||
@ -2158,8 +2158,8 @@ Tokenizer::splitter(const std::string &istr, bool *continuation_ptr) {
|
||||
if (uout[chkpos] == L'\n' || uout[chkpos] == L' ') {
|
||||
endpos = chkpos;
|
||||
continue;
|
||||
}
|
||||
if (g_unichar_isgraph(uout[chkpos]))
|
||||
}
|
||||
if (g_unichar_isgraph(uout[chkpos]))
|
||||
break;
|
||||
endpos = chkpos;
|
||||
}
|
||||
@ -2171,17 +2171,17 @@ Tokenizer::splitter(const std::string &istr, bool *continuation_ptr) {
|
||||
if (continuation_ptr)
|
||||
*continuation_ptr = endpos > iop;
|
||||
iop = nextpos;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
g_free(uout);
|
||||
g_free(ucs4);
|
||||
|
||||
|
||||
return parts;
|
||||
}
|
||||
|
||||
|
||||
std::pair<std::size_t,std::size_t>
|
||||
Tokenizer::splitter(std::istream& is, std::ostream& os)
|
||||
Tokenizer::splitter(std::istream& is, std::ostream& os)
|
||||
{
|
||||
std::pair<std::size_t,std::size_t> counts = { 0, 0 };
|
||||
bool continuation_p = false;
|
||||
@ -2197,7 +2197,7 @@ Tokenizer::splitter(std::istream& is, std::ostream& os)
|
||||
if (istr.empty() && (is.eof() ||!para_marks_p))
|
||||
continue;
|
||||
|
||||
if (skip_xml_p && (RE2::FullMatch(istr,tag_line_x) || RE2::FullMatch(istr,white_line_x)))
|
||||
if (skip_xml_p && (RE2::FullMatch(istr,tag_line_x) || RE2::FullMatch(istr,white_line_x)))
|
||||
continue;
|
||||
|
||||
std::vector<std::string> sentences(splitter(istr,&continuation_p));
|
||||
@ -2221,13 +2221,13 @@ Tokenizer::splitter(std::istream& is, std::ostream& os)
|
||||
os << " ";
|
||||
pending_gap = false;
|
||||
}
|
||||
|
||||
for (std::size_t ii = 0; ii < nsents-1; ++ii)
|
||||
|
||||
for (std::size_t ii = 0; ii < nsents-1; ++ii)
|
||||
os << sentences[ii] << std::endl;
|
||||
|
||||
|
||||
os << sentences[nsents-1];
|
||||
|
||||
if (continuation_p)
|
||||
if (continuation_p)
|
||||
pending_gap = !split_breaks_p;
|
||||
if (!pending_gap)
|
||||
os << std::endl;
|
||||
|
@ -26,7 +26,7 @@ class Tokenizer {
|
||||
|
||||
private:
|
||||
|
||||
typedef enum {
|
||||
typedef enum {
|
||||
empty = 0,
|
||||
blank,
|
||||
upper, // upper case
|
||||
@ -56,7 +56,7 @@ private:
|
||||
// non-breaking prefixes (other) ucs4
|
||||
std::set<std::wstring> nbpre_gen_ucs4;
|
||||
|
||||
// compiled protected patterns
|
||||
// compiled protected patterns
|
||||
std::vector<re2::RE2 *> prot_pat_vec;
|
||||
|
||||
protected:
|
||||
@ -96,10 +96,10 @@ protected:
|
||||
Tokenizer *tokenizer;
|
||||
std::vector<std::string>& in;
|
||||
std::vector<std::string>& out;
|
||||
|
||||
VectorTokenizerCallable(Tokenizer *_tokenizer,
|
||||
std::vector<std::string>& _in,
|
||||
std::vector<std::string>& _out)
|
||||
|
||||
VectorTokenizerCallable(Tokenizer *_tokenizer,
|
||||
std::vector<std::string>& _in,
|
||||
std::vector<std::string>& _out)
|
||||
: tokenizer(_tokenizer)
|
||||
, in(_in)
|
||||
, out(_out) {
|
||||
@ -107,10 +107,10 @@ protected:
|
||||
|
||||
void operator()() {
|
||||
out.resize(in.size());
|
||||
for (std::size_t ii = 0; ii < in.size(); ++ii)
|
||||
for (std::size_t ii = 0; ii < in.size(); ++ii)
|
||||
if (in[ii].empty())
|
||||
out[ii] = in[ii];
|
||||
else if (tokenizer->penn_p)
|
||||
else if (tokenizer->penn_p)
|
||||
out[ii] = tokenizer->penn_tokenize(in[ii]);
|
||||
else
|
||||
out[ii] = tokenizer->quik_tokenize(in[ii]);
|
||||
|
@ -10,8 +10,8 @@ using namespace TOKENIZER_NAMESPACE ;
|
||||
#endif
|
||||
|
||||
|
||||
void
|
||||
usage(const char *path)
|
||||
void
|
||||
usage(const char *path)
|
||||
{
|
||||
std::cerr << "Usage: " << path << "[-{v|x|p|a|e|s|u|n|N]* [LL] [-{c|o} PATH]* INFILE*" << std::endl;
|
||||
std::cerr << " -a -- aggressive hyphenization" << std::endl;
|
||||
@ -89,7 +89,7 @@ copy_words(Tokenizer& tize, std::istream& ifs, std::ostream& ofs) {
|
||||
int nlines = 0;
|
||||
std::string line;
|
||||
while (ifs.good() && std::getline(ifs,line)) {
|
||||
if (line.empty())
|
||||
if (line.empty())
|
||||
continue;
|
||||
std::vector<std::string> tokens(tize.tokens(line));
|
||||
int count = 0;
|
||||
@ -127,7 +127,7 @@ copy_words(Tokenizer& tize, std::istream& ifs, std::ostream& ofs) {
|
||||
}
|
||||
|
||||
|
||||
int main(int ac, char **av)
|
||||
int main(int ac, char **av)
|
||||
{
|
||||
int rc = 0;
|
||||
Parameters params;
|
||||
@ -140,7 +140,7 @@ int main(int ac, char **av)
|
||||
if (!detokenize_p)
|
||||
params.split_p = std::strstr(av[0],"splitter") != 0;
|
||||
|
||||
while (++av,--ac) {
|
||||
while (++av,--ac) {
|
||||
if (**av == '-') {
|
||||
switch (av[0][1]) {
|
||||
case 'a':
|
||||
@ -244,7 +244,7 @@ int main(int ac, char **av)
|
||||
if (comma) {
|
||||
*comma++ = 0;
|
||||
params.chunksize = std::strtoul(comma,0,0);
|
||||
}
|
||||
}
|
||||
params.nthreads = std::strtoul(*av,0,0);
|
||||
} else {
|
||||
params.args.push_back(std::string(*av));
|
||||
@ -275,7 +275,7 @@ int main(int ac, char **av)
|
||||
cfg_mos_str.append("/moses");
|
||||
if (!::access(cfg_mos_str.c_str(),X_OK)) {
|
||||
params.cfg_path = strdup(cfg_mos_str.c_str());
|
||||
} else if (!::access(cfg_shr_str.c_str(),X_OK)) {
|
||||
} else if (!::access(cfg_shr_str.c_str(),X_OK)) {
|
||||
params.cfg_path = strdup(cfg_shr_str.c_str());
|
||||
} else if (!::access(cfg_dir_str.c_str(),X_OK)) {
|
||||
params.cfg_path = strdup(cfg_dir_str.c_str());
|
||||
@ -287,7 +287,7 @@ int main(int ac, char **av)
|
||||
if (params.verbose_p) {
|
||||
std::cerr << "config path: " << params.cfg_path << std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
std::unique_ptr<std::ofstream> pofs = 0;
|
||||
if (!params.out_path.empty()) {
|
||||
@ -345,7 +345,7 @@ int main(int ac, char **av)
|
||||
if (plines.second) {
|
||||
std::cerr << "%%% " << plines.second << " sentences." << std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
@ -1,236 +1,236 @@
/**
 * ISS (Indexed Strings Storage) - memory efficient storage for permanent strings.
 *
 * Implementation note: use #define USE_HASHSET to switch between implementation
 * using __gnu_cxx::hash_set and implementation using std::set.
 *
 * (C) Ceslav Przywara, UFAL MFF UK, 2011
 *
 * $Id$
 */

#ifndef _ISS_H
#define _ISS_H

#include <limits>
#include <vector>
#include <string.h>

// Use hashset instead of std::set for string-to-number indexing?
#ifdef USE_HASHSET
#include <ext/hash_set>
#else
#include <set>
#endif

#include <boost/pool/pool.hpp>

#ifdef USE_HASHSET
// Forward declaration of comparator functor.
template<class IndType>
class StringsEqualComparator;

template<class IndType>
class Hasher;
#else
// Forward declaration of comparator functor.
template<class IndType>
class StringsLessComparator;
#endif

/**
 */
template<class IndType>
class IndexedStringsStorage {

public:

    typedef IndType index_type;

#ifdef USE_HASHSET
    typedef StringsEqualComparator<IndType> equality_comparator_t;

    typedef Hasher<IndType> hasher_t;

    /** @typedef Hash set used as lookup table (string -> numeric index). */
    typedef __gnu_cxx::hash_set<IndType, hasher_t, equality_comparator_t> index_t;
#else
    typedef StringsLessComparator<IndType> less_comparator_t;

    /** @typedef Set used as lookup table (string -> numeric index). */
    typedef std::set<IndType, less_comparator_t> index_t;
#endif
    /** @typedef Container of pointers to stored C-strings. Acts as
     * conversion table: numeric index -> string.
     */
    typedef std::vector<const char*> table_t;

private:

    /** @var memory pool used to store C-strings */
    boost::pool<> _storage;

    /** @var index-to-string conversion table */
    table_t _table;

    /** @var index lookup table */
    index_t _index;

public:
    /** Default constructor.
     */
    IndexedStringsStorage(void);

    /** @return True, if the indices are exhausted (new strings cannot be stored).
     */
    inline bool is_full(void) const { return _table.size() == std::numeric_limits<IndType>::max(); }

    /** Retrieves pointer to C-string instance represented by given index.
     * Note: No range checks are performed!
     * @param index Index of C-string to retrieve.
     * @return Pointer to stored C-string instance.
     */
    inline const char* get(IndType index) const { return _table[index]; }

    /** Stores the string and returns its numeric index.
     * @param str Pointer to C-string to store.
     * @return Index of stored copy of str.
     * @throw std::bad_alloc When insertion of new string would cause
     * overflow of indices datatype.
     */
    IndType put(const char* str);

    /** @return Number of unique strings stored so far.
     */
    inline table_t::size_type size(void) const { return _table.size(); }
};


/** Functor designed for less than comparison of C-strings stored within StringStore.
 * @param IndType Type of numerical indices of strings within given StringStore.
 */
#ifdef USE_HASHSET
template<class IndType>
class StringsEqualComparator: public std::binary_function<IndType, IndType, bool> {
#else
template<class IndType>
class StringsLessComparator: public std::binary_function<IndType, IndType, bool> {
#endif
    /** @var conversion table: index -> string (necessary for indices comparison) */
    const typename IndexedStringsStorage<IndType>::table_t& _table;
public:
#ifdef USE_HASHSET
    StringsEqualComparator<IndType>(const typename IndexedStringsStorage<IndType>::table_t& table): _table(table) {}
#else
    StringsLessComparator<IndType>(const typename IndexedStringsStorage<IndType>::table_t& table): _table(table) {}
#endif

    /** Comparison of two pointers to C-strings.
     * @param lhs Pointer to 1st C-string.
     * @param rhs Pointer to 2nd C-string.
     * @return True, if 1st argument is equal/less than 2nd argument.
     */
    inline bool operator()(IndType lhs, IndType rhs) const {
#ifdef USE_HASHSET
        return strcmp(_table[lhs], _table[rhs]) == 0;
#else
        return strcmp(_table[lhs], _table[rhs]) < 0;
#endif
    }
};

#ifdef USE_HASHSET
/** Functor... TODO.
 */
template<class IndType>
class Hasher: public std::unary_function<IndType, size_t> {

    __gnu_cxx::hash<const char*> _hash;

    /** @var conversion table: index -> string (necessary for indices comparison) */
    const typename IndexedStringsStorage<IndType>::table_t& _table;

public:
    /** */
    Hasher<IndType>(const typename IndexedStringsStorage<IndType>::table_t& table): _hash(), _table(table) {}

    /** Hashing function.
     * @param index
     * @return Counted hash.
     */
    inline size_t operator()(const IndType index) const {
        return _hash(_table[index]);
    }
};
#endif

template <class IndType>
#ifdef USE_HASHSET
IndexedStringsStorage<IndType>::IndexedStringsStorage(void): _storage(sizeof(char)), _table(), _index(100, hasher_t(_table), equality_comparator_t(_table)) {}
#else
IndexedStringsStorage<IndType>::IndexedStringsStorage(void): _storage(sizeof(char)), _table(), _index(less_comparator_t(_table)) {}
#endif

template <class IndType>
IndType IndexedStringsStorage<IndType>::put(const char* str) {

    if ( this->is_full() ) {
        // What a pity, not a single index left to spend.
        throw std::bad_alloc();
    }

    // To use the index for lookup we first have to store passed string
    // in conversion table (cause during lookup we compare the strings indirectly
    // by using their indices).
    // Note: thread unsafe! TODO: Redesign.
    IndType index = static_cast<IndType>(_table.size());
    _table.push_back(str);

#ifdef USE_HASHSET
    //
    typename index_t::iterator iIndex = _index.find(index);
#else
    // A lower_bound() search enables us to use "found" iterator as a hint for
    // eventual insertion.
    typename index_t::iterator iIndex = _index.lower_bound(index);
#endif

    if ( (iIndex != _index.end())
#ifndef USE_HASHSET
        // In case of lower_bound() search we have to also compare found item
        // with passed string.
        && (strcmp(_table[*iIndex], str) == 0)
#endif
    ) {
        // String is already present in storage!
        // Pop back temporary stored pointer...
        _table.pop_back();
        // ...and return numeric index to already stored copy of `str`.
        return static_cast<IndType>(*iIndex);
    }

    // String not found within storage.

    // Allocate memory required for string storage...
    char* mem = static_cast<char*>(_storage.ordered_malloc(strlen(str) + 1));
    // ...and fill it with copy of passed string.
    strcpy(mem, str);

    // Overwrite temporary stored pointer to `str` with pointer to freshly
    // saved copy.
    _table[index] = mem;

#ifdef USE_HASHSET
    // Insert the index into lookup table.
    _index.insert(index);
#else
    // Insert the index into lookup table (use previously retrieved iterator
    // as a hint).
    _index.insert(iIndex, index);
#endif

    // Finally.
    return index;
}

#endif
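A short usage sketch of the storage class defined above. This is an illustrative fragment, not part of the commit; the include name iss.h and the choice of uint32_t as the index type are assumptions.

#include <stdint.h>
#include <iostream>

#include "iss.h" // assumed name for the header above

int main()
{
    // 32-bit indices; put() throws std::bad_alloc once the index space is exhausted.
    IndexedStringsStorage<uint32_t> storage;

    uint32_t a = storage.put("phrase");
    uint32_t b = storage.put("table");
    uint32_t c = storage.put("phrase"); // duplicate: the index of the first copy is returned

    std::cout << (a == c)              // 1 -- duplicates share one index
              << " " << storage.get(b) // "table"
              << " " << storage.size() // 2 unique strings stored so far
              << std::endl;
    return 0;
}

The duplicate check works because the comparator reads the freshly pushed pointer through _table, which is why put() temporarily appends str to the conversion table before doing the lookup.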
@@ -83,7 +83,7 @@ public:
  const counter_t bucketWidth; // ceil(1/error)

private:

  /** @var Current epoch bucket ID (b-current) */
  counter_t _bucketId;

@@ -182,7 +182,7 @@ class LossyCounterIterator: public std::iterator<std::forward_iterator_tag, type
public:

  typedef LossyCounterIterator<T> self_type;

  typedef typename LossyCounter<T>::storage_t::const_iterator const_iterator;

protected:
@@ -288,7 +288,7 @@ protected:

template<class T>
void LossyCounter<T>::add(const T& item) {

  typename storage_t::iterator iter = _storage.find(item);

  if ( iter == _storage.end() ) {
@@ -330,7 +330,7 @@ void LossyCounter<T>::prune(void) {
////////////////////////////////////////////////////////////////////////////////

template<class T>
LossyCounterIterator<T> LossyCounterIterator<T>::operator++(void) {
  this->forward();
  return *this;
}
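The fragments above belong to a lossy-counting frequency counter (note bucketWidth = ceil(1/error), the current bucket ID, and the add/prune pair). As a rough reference for how the underlying Manku-Motwani lossy counting algorithm works, here is a compact sketch; the names LossyCount and counts_ and the std::map layout are assumptions for illustration and do not mirror the Moses class exactly.

// Sketch of Manku-Motwani lossy counting (illustrative; not the Moses class layout).
#include <cmath>
#include <cstddef>
#include <iostream>
#include <map>
#include <string>

class LossyCount {                        // hypothetical name
  std::size_t bucketWidth_;               // ceil(1/error)
  std::size_t bucketId_;                  // current bucket (b-current)
  std::size_t seen_;                      // items observed so far
  std::map<std::string, std::pair<std::size_t, std::size_t> > counts_;  // item -> (count, max error)
public:
  explicit LossyCount(double error)
    : bucketWidth_(static_cast<std::size_t>(std::ceil(1.0 / error))), bucketId_(1), seen_(0) {}

  void add(const std::string& item) {
    std::map<std::string, std::pair<std::size_t, std::size_t> >::iterator it = counts_.find(item);
    if (it == counts_.end())
      counts_[item] = std::make_pair(std::size_t(1), bucketId_ - 1);  // new entry: count 1, error bound b-1
    else
      ++it->second.first;
    if (++seen_ % bucketWidth_ == 0) {    // bucket boundary: prune and advance
      prune();
      ++bucketId_;
    }
  }

  void prune() {
    std::map<std::string, std::pair<std::size_t, std::size_t> >::iterator it = counts_.begin();
    while (it != counts_.end()) {
      if (it->second.first + it->second.second <= bucketId_) counts_.erase(it++);
      else ++it;
    }
  }

  std::size_t size() const { return counts_.size(); }
};

int main() {
  LossyCount counter(0.1);                // error = 0.1 -> bucket width 10
  for (int i = 0; i < 100; ++i) {
    counter.add("frequent");
    if (i % 10 == 0) counter.add("rare"); // infrequent items get pruned at bucket boundaries
  }
  std::cout << "tracked items: " << counter.size() << std::endl;
  return 0;
}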
@@ -92,7 +92,7 @@ int main(int argc, char* argv[]) {
  // Init lossy counters.
  std::string lossyCountersParams;
  int paramIdx = 5;

  while ( (argc > paramIdx) && (*argv[paramIdx] != '-') ) {
    std::string param = std::string(argv[paramIdx]);
    if ( !parse_lossy_counting_params(param) ) {
@@ -113,7 +113,7 @@ int main(int argc, char* argv[]) {
      usage(argv[0]);
    }
  }

  if ( (argc > paramIdx) && (strcmp(argv[paramIdx], "--compact") == 0) ) {
    compactOutputFlag = true;
    ++paramIdx;
@@ -154,7 +154,7 @@ int main(int argc, char* argv[]) {
  readInput(eFile, fFile, aFile);

  std::cerr << std::endl; // Leave the progress bar end on previous line.

  // close input files
  eFile.close();
  fFile.close();
@@ -32,14 +32,14 @@ typedef std::vector<output_pair_t> output_vector_t;
class PhraseComp {
  /** @var If true, sort by target phrase first. */
  bool _inverted;

  bool compareAlignments(const indexed_phrases_pair_t& a, const indexed_phrases_pair_t& b);

  int comparePhrases(const indexed_phrases_pair_t::phrase_t& a, const indexed_phrases_pair_t::phrase_t& b);

public:
  PhraseComp(bool inverted): _inverted(inverted) {}

  bool operator()(const output_pair_t& a, const output_pair_t& b);
};

@@ -448,9 +448,9 @@ void extract(SentenceAlignment &sentence) {
          ((phraseModel)? getOrientString(phrasePrevOrient, phraseType) + " " + getOrientString(phraseNextOrient, phraseType) : "") + " | " +
          ((hierModel)? getOrientString(hierPrevOrient, hierType) + " " + getOrientString(hierNextOrient, hierType) : "");
      }

      addPhrase(sentence, startE, endE, startF, endF, orientationInfo);

    } // end of for loop through inbound phrases

  } // end if buildExtraStructure
@@ -567,7 +567,7 @@ bool PhraseComp::operator()(const output_pair_t& a, const output_pair_t& b) {
  else {
    return cmp < 0;
  }

}

@@ -607,7 +607,7 @@ bool PhraseComp::compareAlignments(const indexed_phrases_pair_t& a, const indexe
      return cmp < 0;
    }
  }

  // Note: LC_ALL=C GNU sort treats shorter item as lesser than longer one.
  return (cmp == 0) ? (aSize < bSize) : (cmp < 0);

@@ -685,7 +685,7 @@ void processSortedOutput(OutputProcessor& processor) {

void processUnsortedOutput(OutputProcessor& processor) {

  LossyCountersVector::value_type current = NULL, prev = NULL;

  for ( size_t i = 1; i < lossyCounters.size(); ++i ) { // Intentionally skip 0.
@@ -759,7 +759,7 @@ void printStats(void) {
    if ( (current == NULL) || ((current != prev) && (prev != NULL)) ) {
      // Time to print.
      to = i-1;

      // Increment overall stats.
      outputMass += prev->outputMass;
      outputSize += prev->outputSize;
@@ -787,7 +787,7 @@ void printStats(void) {

      from = i;
    }

    prev = current;

  }
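The comparator above mimics what "LC_ALL=C" GNU sort does: compare byte-wise and, when one string is a prefix of the other, treat the shorter one as smaller. The following small sketch shows that ordering in isolation, so output sorted in-process matches what piping through LC_ALL=C sort would produce; the function name byte_less is an illustrative assumption, not a Moses symbol.

// Byte-wise "C locale" ordering with the shorter-string tie-break noted above
// (illustrative sketch; byte_less is not a Moses function).
#include <algorithm>
#include <cstring>
#include <iostream>
#include <string>
#include <vector>

static bool byte_less(const std::string& a, const std::string& b) {
  const std::size_t n = std::min(a.size(), b.size());
  const int cmp = std::memcmp(a.data(), b.data(), n);
  // Equal prefixes: the shorter string sorts first, as LC_ALL=C sort does.
  return (cmp == 0) ? (a.size() < b.size()) : (cmp < 0);
}

int main() {
  std::vector<std::string> v;
  v.push_back("houses");
  v.push_back("house");
  v.push_back("Haus");
  std::sort(v.begin(), v.end(), byte_less);
  for (std::size_t i = 0; i < v.size(); ++i) std::cout << v[i] << "\n";
  // Prints: Haus, house, houses (uppercase bytes sort before lowercase in the C locale).
  return 0;
}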
@@ -26,11 +26,18 @@ def parse_cmd():
    arguments = parser.parse_args()
    return arguments

def repoinit(testconfig):
    """Determines revision and sets up the repo."""
def repoinit(testconfig, profiler=True):
    """Determines revision and sets up the repo. If given the profiler optional
    argument, wil init the profiler repo instead of the default one."""
    revision = ''
    #Update the repo
    os.chdir(testconfig.repo)
    if profiler:
        if testconfig.repo_prof is not None:
            os.chdir(testconfig.repo_prof)
        else:
            raise ValueError('Profiling repo is not defined')
    else:
        os.chdir(testconfig.repo)
    #Checkout specific branch, else maintain main branch
    if testconfig.branch != 'master':
        subprocess.call(['git', 'checkout', testconfig.branch])
@@ -49,13 +56,14 @@ def repoinit(testconfig):
    rev, _ = subprocess.Popen(['git rev-parse HEAD'], stdout=subprocess.PIPE,\
        stderr=subprocess.PIPE, shell=True).communicate()
    revision = str(rev).replace("\\n'", '').replace("b'", '')

    return revision

class Configuration:
    """A simple class to hold all of the configuration constatns"""
    def __init__(self, repo, drop_caches, tests, testlogs, basebranch, baserev):
    def __init__(self, repo, drop_caches, tests, testlogs, basebranch, baserev, repo_prof=None):
        self.repo = repo
        self.repo_prof = repo_prof
        self.drop_caches = drop_caches
        self.tests = tests
        self.testlogs = testlogs
@@ -80,15 +88,16 @@ class Configuration:

class Test:
    """A simple class to contain all information about tests"""
    def __init__(self, name, command, ldopts, permutations):
    def __init__(self, name, command, ldopts, permutations, prof_command=None):
        self.name = name
        self.command = command
        self.prof_command = prof_command
        self.ldopts = ldopts.replace(' ', '').split(',') #Not tested yet
        self.permutations = permutations

def parse_configfile(conffile, testdir, moses_repo):
def parse_configfile(conffile, testdir, moses_repo, moses_prof_repo=None):
    """Parses the config file"""
    command, ldopts = '', ''
    command, ldopts, prof_command = '', '', None
    permutations = []
    fileopen = open(conffile, 'r')
    for line in fileopen:
@@ -99,6 +108,8 @@ def parse_configfile(conffile, testdir, moses_repo):

        if opt == 'Command:':
            command = args.replace('\n', '')
            if moses_prof is not None: # Get optional command for profiling
                prof_command = moses_prof_repo + '/bin/' + command
            command = moses_repo + '/bin/' + command
        elif opt == 'LDPRE:':
            ldopts = args.replace('\n', '')
@@ -107,14 +118,14 @@ def parse_configfile(conffile, testdir, moses_repo):
        else:
            raise ValueError('Unrecognized option ' + opt)
    #We use the testdir as the name.
    testcase = Test(testdir, command, ldopts, permutations)
    testcase = Test(testdir, command, ldopts, permutations, prof_command)
    fileopen.close()
    return testcase

def parse_testconfig(conffile):
    """Parses the config file for the whole testsuite."""
    repo_path, drop_caches, tests_dir, testlog_dir = '', '', '', ''
    basebranch, baserev = '', ''
    basebranch, baserev, repo_prof_path = '', '', None
    fileopen = open(conffile, 'r')
    for line in fileopen:
        line = line.split('#')[0] # Discard comments
@@ -133,10 +144,12 @@ def parse_testconfig(conffile):
            basebranch = args.replace('\n', '')
        elif opt == 'BASEREV:':
            baserev = args.replace('\n', '')
        elif opt == 'MOSES_PROFILER_PATH:': # Optional
            repo_prof_path = args.replace('\n', '')
        else:
            raise ValueError('Unrecognized option ' + opt)
    config = Configuration(repo_path, drop_caches, tests_dir, testlog_dir,\
        basebranch, baserev)
        basebranch, baserev, repo_prof_path)
    fileopen.close()
    return config

@@ -146,6 +159,8 @@ def get_config():
    config = parse_testconfig(args.configfile)
    config.additional_args(args.singletestdir, args.revision, args.branch)
    revision = repoinit(config)
    if config.repo_prof is not None:
        repoinit(config, True)
    config.set_revision(revision)
    return config

@@ -221,6 +236,10 @@ def execute_tests(testcase, cur_directory, config):
        stderr=None, shell=True).communicate()
    write_log('/tmp/time_moses_tests', testcase.name + '_ldpre_' +opt +'_cached', config)

    #if 'profile' in testcase.permutations:
        #TODO Separate the above into functions so we can execute them with profiling moses.
        #Fix the logic in the main

# Go through all the test directories and executes tests
if __name__ == '__main__':
    CONFIG = get_config()
@@ -260,7 +279,7 @@ if __name__ == '__main__':
    #Create a new configuration for base version tests:
    BASECONFIG = Configuration(CONFIG.repo, CONFIG.drop_caches,\
        CONFIG.tests, CONFIG.testlogs, CONFIG.basebranch,\
        CONFIG.baserev)
        CONFIG.baserev, CONFIG.repo_prof)
    BASECONFIG.additional_args(None, CONFIG.baserev, CONFIG.basebranch)
    #Set up the repository and get its revision:
    REVISION = repoinit(BASECONFIG)
@@ -268,6 +287,11 @@ if __name__ == '__main__':
    #Build
    os.chdir(BASECONFIG.repo)
    subprocess.call(['./previous.sh'], shell=True)
    #If profiler configuration exists also init it
    if BASECONFIG.repo_prof is not None:
        repoinit(BASECONFIG, True)
        os.chdir(BASECONFIG.repo_prof)
        subprocess.call(['./previous.sh'], shell=True)

    #Perform tests
    for directory in FIRSTTIME:
@@ -277,10 +301,15 @@ if __name__ == '__main__':

    #Reset back the repository to the normal configuration
    repoinit(CONFIG)
    if BASECONFIG.repo_prof is not None:
        repoinit(CONFIG, True)

    #Builds moses
    os.chdir(CONFIG.repo)
    subprocess.call(['./previous.sh'], shell=True)
    if CONFIG.repo_prof is not None:
        os.chdir(CONFIG.repo_prof)
        subprocess.call(['./previous.sh'], shell=True)

    if CONFIG.singletest:
        TESTCASE = parse_configfile(CONFIG.tests + '/' +\
@@ -1,13 +1,13 @@
<?xml version="1.0" encoding="UTF-8"?>
<CodeLite_Workspace Name="all" Database="all.tags">
  <Project Name="manual-label" Path="manual-label/manual-label.project" Active="No"/>
  <Project Name="manual-label" Path="manual-label/manual-label.project" Active="Yes"/>
  <Project Name="extract" Path="extract/extract.project" Active="No"/>
  <Project Name="util" Path="util/util.project" Active="No"/>
  <Project Name="extract-mixed-syntax" Path="extract-mixed-syntax/extract-mixed-syntax.project" Active="No"/>
  <Project Name="lm" Path="lm/lm.project" Active="No"/>
  <Project Name="OnDiskPt" Path="OnDiskPt/OnDiskPt.project" Active="No"/>
  <Project Name="search" Path="search/search.project" Active="No"/>
  <Project Name="moses" Path="moses/moses.project" Active="Yes"/>
  <Project Name="moses" Path="moses/moses.project" Active="No"/>
  <Project Name="moses-cmd" Path="moses-cmd/moses-cmd.project" Active="No"/>
  <Project Name="score" Path="score/score.project" Active="No"/>
  <Project Name="consolidate" Path="consolidate/consolidate.project" Active="No"/>
@@ -10,15 +10,15 @@ int main(int argc, char* argv[])

  using namespace boost::locale;
  using namespace std;

  generator gen;
  locale loc=gen("");

  cout.imbue(loc);

  cout << "Hello, World" << endl;

  cout << "This is how we show currency in this locale " << as::currency << 103.34 << endl;

  return 0;
}
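The snippet above imbues cout with the locale generated from the empty name, i.e. the process environment's default. A variation that requests a named locale instead is sketched below; the locale string "en_US.UTF-8" is an assumed example and must be available on the system for the currency formatting to take effect.

// Variation on the snippet above: ask boost::locale for a named locale
// ("en_US.UTF-8" is an assumed example; availability depends on the system).
#include <boost/locale.hpp>
#include <iostream>

int main() {
  boost::locale::generator gen;
  std::locale loc = gen("en_US.UTF-8");
  std::cout.imbue(loc);
  std::cout << "Currency: " << boost::locale::as::currency << 103.34 << std::endl;
  return 0;
}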
File diff suppressed because it is too large
@ -1,231 +1,231 @@
|
||||
// XGetopt.cpp Version 1.2
|
||||
//
|
||||
// Author: Hans Dietrich
|
||||
// hdietrich2@hotmail.com
|
||||
//
|
||||
// Description:
|
||||
// XGetopt.cpp implements getopt(), a function to parse command lines.
|
||||
//
|
||||
// History
|
||||
// Version 1.2 - 2003 May 17
|
||||
// - Added Unicode support
|
||||
//
|
||||
// Version 1.1 - 2002 March 10
|
||||
// - Added example to XGetopt.cpp module header
|
||||
//
|
||||
// This software is released into the public domain.
|
||||
// You are free to use it in any way you like.
|
||||
//
|
||||
// This software is provided "as is" with no expressed
|
||||
// or implied warranty. I accept no liability for any
|
||||
// damage or loss of business that this software may cause.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// if you are using precompiled headers then include this line:
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// if you are not using precompiled headers then include these lines:
|
||||
//#include <windows.h>
|
||||
//#include <cstdio>
|
||||
//#include <tchar.h>
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <cmath>
|
||||
#include "WIN32_functions.h"
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// X G e t o p t . c p p
|
||||
//
|
||||
//
|
||||
// NAME
|
||||
// getopt -- parse command line options
|
||||
//
|
||||
// SYNOPSIS
|
||||
// int getopt(int argc, char *argv[], char *optstring)
|
||||
//
|
||||
// extern char *optarg;
|
||||
// extern int optind;
|
||||
//
|
||||
// DESCRIPTION
|
||||
// The getopt() function parses the command line arguments. Its
|
||||
// arguments argc and argv are the argument count and array as
|
||||
// passed into the application on program invocation. In the case
|
||||
// of Visual C++ programs, argc and argv are available via the
|
||||
// variables __argc and __argv (double underscores), respectively.
|
||||
// getopt returns the next option letter in argv that matches a
|
||||
// letter in optstring. (Note: Unicode programs should use
|
||||
// __targv instead of __argv. Also, all character and string
|
||||
// literals should be enclosed in ( ) ).
|
||||
//
|
||||
// optstring is a string of recognized option letters; if a letter
|
||||
// is followed by a colon, the option is expected to have an argument
|
||||
// that may or may not be separated from it by white space. optarg
|
||||
// is set to point to the start of the option argument on return from
|
||||
// getopt.
|
||||
//
|
||||
// Option letters may be combined, e.g., "-ab" is equivalent to
|
||||
// "-a -b". Option letters are case sensitive.
|
||||
//
|
||||
// getopt places in the external variable optind the argv index
|
||||
// of the next argument to be processed. optind is initialized
|
||||
// to 0 before the first call to getopt.
|
||||
//
|
||||
// When all options have been processed (i.e., up to the first
|
||||
// non-option argument), getopt returns EOF, optarg will point
|
||||
// to the argument, and optind will be set to the argv index of
|
||||
// the argument. If there are no non-option arguments, optarg
|
||||
// will be set to NULL.
|
||||
//
|
||||
// The special option "--" may be used to delimit the end of the
|
||||
// options; EOF will be returned, and "--" (and everything after it)
|
||||
// will be skipped.
|
||||
//
|
||||
// RETURN VALUE
|
||||
// For option letters contained in the string optstring, getopt
|
||||
// will return the option letter. getopt returns a question mark (?)
|
||||
// when it encounters an option letter not included in optstring.
|
||||
// EOF is returned when processing is finished.
|
||||
//
|
||||
// BUGS
|
||||
// 1) Long options are not supported.
|
||||
// 2) The GNU double-colon extension is not supported.
|
||||
// 3) The environment variable POSIXLY_CORRECT is not supported.
|
||||
// 4) The + syntax is not supported.
|
||||
// 5) The automatic permutation of arguments is not supported.
|
||||
// 6) This implementation of getopt() returns EOF if an error is
|
||||
// encountered, instead of -1 as the latest standard requires.
|
||||
//
|
||||
// EXAMPLE
|
||||
// BOOL CMyApp::ProcessCommandLine(int argc, char *argv[])
|
||||
// {
|
||||
// int c;
|
||||
//
|
||||
// while ((c = getopt(argc, argv, ("aBn:"))) != EOF)
|
||||
// {
|
||||
// switch (c)
|
||||
// {
|
||||
// case ('a'):
|
||||
// TRACE(("option a\n"));
|
||||
// //
|
||||
// // set some flag here
|
||||
// //
|
||||
// break;
|
||||
//
|
||||
// case ('B'):
|
||||
// TRACE( ("option B\n"));
|
||||
// //
|
||||
// // set some other flag here
|
||||
// //
|
||||
// break;
|
||||
//
|
||||
// case ('n'):
|
||||
// TRACE(("option n: value=%d\n"), atoi(optarg));
|
||||
// //
|
||||
// // do something with value here
|
||||
// //
|
||||
// break;
|
||||
//
|
||||
// case ('?'):
|
||||
// TRACE(("ERROR: illegal option %s\n"), argv[optind-1]);
|
||||
// return FALSE;
|
||||
// break;
|
||||
//
|
||||
// default:
|
||||
// TRACE(("WARNING: no handler for option %c\n"), c);
|
||||
// return FALSE;
|
||||
// break;
|
||||
// }
|
||||
// }
|
||||
// //
|
||||
// // check for non-option args here
|
||||
// //
|
||||
// return TRUE;
|
||||
// }
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
char *optarg; // global argument pointer
|
||||
int optind = 0; // global argv index
|
||||
|
||||
int getopt(int argc, char *argv[], char *optstring)
|
||||
{
|
||||
static char *next = NULL;
|
||||
if (optind == 0)
|
||||
next = NULL;
|
||||
|
||||
optarg = NULL;
|
||||
|
||||
if (next == NULL || *next =='\0') {
|
||||
if (optind == 0)
|
||||
optind++;
|
||||
|
||||
if (optind >= argc || argv[optind][0] != ('-') || argv[optind][1] == ('\0')) {
|
||||
optarg = NULL;
|
||||
if (optind < argc)
|
||||
optarg = argv[optind];
|
||||
return EOF;
|
||||
}
|
||||
|
||||
if (strcmp(argv[optind], "--") == 0) {
|
||||
optind++;
|
||||
optarg = NULL;
|
||||
if (optind < argc)
|
||||
optarg = argv[optind];
|
||||
return EOF;
|
||||
}
|
||||
|
||||
next = argv[optind];
|
||||
next++; // skip past -
|
||||
optind++;
|
||||
}
|
||||
|
||||
char c = *next++;
|
||||
char *cp = strchr(optstring, c);
|
||||
|
||||
if (cp == NULL || c == (':'))
|
||||
return ('?');
|
||||
|
||||
cp++;
|
||||
if (*cp == (':')) {
|
||||
if (*next != ('\0')) {
|
||||
optarg = next;
|
||||
next = NULL;
|
||||
} else if (optind < argc) {
|
||||
optarg = argv[optind];
|
||||
optind++;
|
||||
} else {
|
||||
return ('?');
|
||||
}
|
||||
}
|
||||
|
||||
return c;
|
||||
}
|
||||
|
||||
// for an overview, see
|
||||
// W. Press, S. Teukolsky and W. Vetterling. (1992) Numerical Recipes in C. Chapter 6.1.
|
||||
double lgamma(int x)
|
||||
{
|
||||
// size_t xx=(size_t)x; xx--; size_t sum=1; while (xx) { sum *= xx--; } return log((double)(sum));
|
||||
if (x <= 2) {
|
||||
return 0.0;
|
||||
}
|
||||
static double coefs[6] = {76.18009172947146, -86.50532032941677, 24.01409824083091, -1.231739572450155, 0.1208650973866179e-2, -0.5395239384953e-5};
|
||||
double tmp=(double)x+5.5;
|
||||
tmp -= (((double)x)+0.5)*log(tmp);
|
||||
double y=(double)x;
|
||||
double sum = 1.000000000190015;
|
||||
for (size_t j=0; j<6; ++j) {
|
||||
sum += coefs[j]/++y;
|
||||
}
|
||||
return -tmp+log(2.5066282746310005*sum/(double)x);
|
||||
}
|
||||
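The lgamma() above follows the Lanczos series given in Numerical Recipes, so for integer arguments it should agree with log((x-1)!). A quick sanity check is sketched below; it assumes the double lgamma(int) declaration that appears in XGetopt.h later in this diff and links against the definition above.

// Sanity check for the Lanczos-based lgamma above: lgamma(n) ~ log((n-1)!).
// Assumes the declaration from the accompanying header in this diff.
#include <cmath>
#include <cstdio>

double lgamma(int x);   // provided by the file above

int main() {
  // log(4!) = log(24) ~ 3.178054
  std::printf("lgamma(5) = %f (expected ~%f)\n", lgamma(5), std::log(24.0));
  return 0;
}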
@@ -1,24 +1,24 @@
// XGetopt.h Version 1.2
//
// Author: Hans Dietrich
// hdietrich2@hotmail.com
//
// This software is released into the public domain.
// You are free to use it in any way you like.
//
// This software is provided "as is" with no expressed
// or implied warranty. I accept no liability for any
// damage or loss of business that this software may cause.
//
///////////////////////////////////////////////////////////////////////////////

#ifndef XGETOPT_H
#define XGETOPT_H

extern int optind, opterr;
extern char *optarg;

int getopt(int argc, char *argv[], char *optstring);
double lgamma(int x);

#endif //XGETOPT_H
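The header above declares the drop-in getopt() used for Windows builds; per the documentation in the implementation file, a colon after an option letter means that option takes an argument. A minimal usage sketch follows; the option letters "v" and "o:" are arbitrary examples chosen for illustration, and the const_cast reflects the non-const char* optstring in the declared signature.

// Minimal use of the getopt() declared above ("v" is a flag, "o:" takes an argument).
#include <cstdio>
#include "XGetopt.h"

int main(int argc, char* argv[]) {
  int c;
  const char* output = "default.txt";
  while ((c = getopt(argc, argv, const_cast<char*>("vo:"))) != EOF) {
    switch (c) {
    case 'v': std::printf("verbose on\n"); break;
    case 'o': output = optarg; break;      // optarg points at the option's argument
    default:  std::printf("unknown option\n"); return 1;
    }
  }
  std::printf("output file: %s\n", output);
  return 0;
}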
@@ -1,5 +1,5 @@

#include <cstring>
#include <cassert>
#include <cstdio>
#include <cstdlib>
@ -234,13 +234,13 @@ void OutputAlignment(ostream &out, const AlignmentInfo &ai, size_t sourceOffset,
|
||||
{
|
||||
typedef std::vector< const std::pair<size_t,size_t>* > AlignVec;
|
||||
AlignVec alignments = ai.GetSortedAlignments();
|
||||
|
||||
|
||||
AlignVec::const_iterator it;
|
||||
for (it = alignments.begin(); it != alignments.end(); ++it) {
|
||||
const std::pair<size_t,size_t> &alignment = **it;
|
||||
out << alignment.first + sourceOffset << "-" << alignment.second + targetOffset << " ";
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
void OutputAlignment(ostream &out, const vector<const Hypothesis *> &edges)
|
||||
@ -251,7 +251,7 @@ void OutputAlignment(ostream &out, const vector<const Hypothesis *> &edges)
|
||||
const Hypothesis &edge = *edges[currEdge];
|
||||
const TargetPhrase &tp = edge.GetCurrTargetPhrase();
|
||||
size_t sourceOffset = edge.GetCurrSourceWordsRange().GetStartPos();
|
||||
|
||||
|
||||
OutputAlignment(out, tp.GetAlignmentInfo(), sourceOffset, targetOffset);
|
||||
|
||||
targetOffset += tp.GetSize();
|
||||
@ -263,7 +263,7 @@ void OutputAlignment(OutputCollector* collector, size_t lineNo , const vector<co
|
||||
{
|
||||
ostringstream out;
|
||||
OutputAlignment(out, edges);
|
||||
|
||||
|
||||
collector->Write(lineNo,out.str());
|
||||
}
|
||||
|
||||
@ -477,7 +477,7 @@ void OutputNBest(std::ostream& out, const Moses::TrellisPathList &nBestList, con
|
||||
const int sourceOffset = sourceRange.GetStartPos();
|
||||
const int targetOffset = targetRange.GetStartPos();
|
||||
const AlignmentInfo &ai = edge.GetCurrTargetPhrase().GetAlignmentInfo();
|
||||
|
||||
|
||||
OutputAlignment(out, ai, sourceOffset, targetOffset);
|
||||
|
||||
}
|
||||
|
@ -168,18 +168,18 @@ static void ShowWeights()
|
||||
int main(int argc, char** argv)
|
||||
{
|
||||
try {
|
||||
|
||||
|
||||
// echo command line, if verbose
|
||||
IFVERBOSE(1) {
|
||||
TRACE_ERR("command: ");
|
||||
for(int i=0; i<argc; ++i) TRACE_ERR(argv[i]<<" ");
|
||||
TRACE_ERR(endl);
|
||||
}
|
||||
|
||||
|
||||
// set number of significant decimals in output
|
||||
fix(cout,PRECISION);
|
||||
fix(cerr,PRECISION);
|
||||
|
||||
|
||||
// load all the settings into the Parameter class
|
||||
// (stores them as strings, or array of strings)
|
||||
Parameter* params = new Parameter();
|
||||
@ -187,34 +187,34 @@ int main(int argc, char** argv)
|
||||
params->Explain();
|
||||
exit(1);
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
// initialize all "global" variables, which are stored in StaticData
|
||||
// note: this also loads models such as the language model, etc.
|
||||
if (!StaticData::LoadDataStatic(params, argv[0])) {
|
||||
exit(1);
|
||||
}
|
||||
|
||||
|
||||
// setting "-show-weights" -> just dump out weights and exit
|
||||
if (params->isParamSpecified("show-weights")) {
|
||||
ShowWeights();
|
||||
exit(0);
|
||||
}
|
||||
|
||||
|
||||
// shorthand for accessing information in StaticData
|
||||
const StaticData& staticData = StaticData::Instance();
|
||||
|
||||
|
||||
|
||||
|
||||
//initialise random numbers
|
||||
rand_init();
|
||||
|
||||
|
||||
// set up read/writing class
|
||||
IOWrapper* ioWrapper = GetIOWrapper(staticData);
|
||||
if (!ioWrapper) {
|
||||
cerr << "Error; Failed to create IO object" << endl;
|
||||
exit(1);
|
||||
}
|
||||
|
||||
|
||||
// check on weights
|
||||
vector<float> weights = staticData.GetAllWeights();
|
||||
IFVERBOSE(2) {
|
||||
@ -233,7 +233,7 @@ int main(int argc, char** argv)
|
||||
|
||||
// setting lexicalized reordering setup
|
||||
PhraseBasedReorderingState::m_useFirstBackwardScore = false;
|
||||
|
||||
|
||||
|
||||
auto_ptr<OutputCollector> outputCollector;
|
||||
outputCollector.reset(new OutputCollector());
|
||||
@ -241,7 +241,7 @@ int main(int argc, char** argv)
|
||||
#ifdef WITH_THREADS
|
||||
ThreadPool pool(staticData.ThreadCount());
|
||||
#endif
|
||||
|
||||
|
||||
// main loop over set of input sentences
|
||||
InputType* source = NULL;
|
||||
size_t lineCount = 0;
|
||||
@ -259,11 +259,11 @@ int main(int argc, char** argv)
|
||||
task->Run();
|
||||
delete task;
|
||||
#endif
|
||||
|
||||
|
||||
source = NULL; //make sure it doesn't get deleted
|
||||
++lineCount;
|
||||
}
|
||||
|
||||
|
||||
// we are done, finishing up
|
||||
#ifdef WITH_THREADS
|
||||
pool.Stop(true); //flush remaining jobs
|
||||
|
@ -70,7 +70,7 @@ namespace MosesCmd
|
||||
if (neg_log_div > 100){
|
||||
return 100;
|
||||
}
|
||||
return neg_log_div;
|
||||
return neg_log_div;
|
||||
}
|
||||
|
||||
void RelativeEntropyCalc::ConcatOutputPhraseRecursive(Phrase& phrase, const Hypothesis *hypo){
|
||||
|
@ -57,7 +57,7 @@ void PrintTranslationAnalysis(const TranslationSystem* system, std::ostream &os,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
bool epsilon = false;
|
||||
if (target == "") {
|
||||
target="<EPSILON>";
|
||||
|
@ -60,12 +60,12 @@ static void add(const string& e, const vector<float> scores,
|
||||
|
||||
static void finalise(Probs& p_e_given_f, Probs& p_f_given_e) {
|
||||
//cerr << "Sizes: p(e|f): " << p_e_given_f.size() << " p(f|e): " << p_f_given_e.size() << endl;
|
||||
for (Probs::const_iterator e1_iter = p_f_given_e.begin() ;
|
||||
for (Probs::const_iterator e1_iter = p_f_given_e.begin() ;
|
||||
e1_iter != p_f_given_e.end(); ++e1_iter) {
|
||||
for (Probs::const_iterator e2_iter = p_e_given_f.begin() ;
|
||||
e2_iter != p_e_given_f.end(); ++e2_iter) {
|
||||
|
||||
if (e1_iter->second == e2_iter->second) continue;
|
||||
if (e1_iter->second == e2_iter->second) continue;
|
||||
cout << e1_iter->second << " ||| " << e2_iter->second << " ||| " <<
|
||||
e1_iter->first * e2_iter->first << " ||| " << endl;
|
||||
}
|
||||
|
@ -3,10 +3,10 @@
|
||||
// The separate moses server executable is being phased out.
|
||||
// Since there were problems with the migration into the main
|
||||
// executable, this separate program is still included in the
|
||||
// distribution for legacy reasons. Contributors are encouraged
|
||||
// to add their contributions to moses/server rather than
|
||||
// distribution for legacy reasons. Contributors are encouraged
|
||||
// to add their contributions to moses/server rather than
|
||||
// contrib/server. This recommendation does not apply to wrapper
|
||||
// scripts.
|
||||
// scripts.
|
||||
// The future is this:
|
||||
|
||||
/** main function of the command line version of the decoder **/
|
||||
@ -83,7 +83,7 @@ public:
|
||||
pdsa->add(source_,target_,alignment_);
|
||||
#else
|
||||
const PhraseDictionary* pdf = PhraseDictionary::GetColl()[0];
|
||||
PhraseDictionaryDynSuffixArray*
|
||||
PhraseDictionaryDynSuffixArray*
|
||||
pdsa = (PhraseDictionaryDynSuffixArray*) pdf;
|
||||
cerr << "Inserting into address " << pdsa << endl;
|
||||
pdsa->insertSnt(source_, target_, alignment_);
|
||||
@ -146,7 +146,7 @@ public:
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
void breakOutParams(const params_t& params) {
|
||||
params_t::const_iterator si = params.find("source");
|
||||
if(si == params.end())
|
||||
@ -236,7 +236,7 @@ public:
|
||||
class TranslationTask : public virtual Moses::TranslationTask {
|
||||
protected:
|
||||
TranslationTask(xmlrpc_c::paramList const& paramList,
|
||||
boost::condition_variable& cond, boost::mutex& mut)
|
||||
boost::condition_variable& cond, boost::mutex& mut)
|
||||
: m_paramList(paramList),
|
||||
m_cond(cond),
|
||||
m_mut(mut),
|
||||
@ -244,7 +244,7 @@ protected:
|
||||
{}
|
||||
|
||||
public:
|
||||
static boost::shared_ptr<TranslationTask>
|
||||
static boost::shared_ptr<TranslationTask>
|
||||
create(xmlrpc_c::paramList const& paramList,
|
||||
boost::condition_variable& cond, boost::mutex& mut)
|
||||
{
|
||||
@ -252,15 +252,15 @@ public:
|
||||
ret->m_self = ret;
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
virtual bool DeleteAfterExecution() {return false;}
|
||||
|
||||
bool IsDone() const {return m_done;}
|
||||
|
||||
const map<string, xmlrpc_c::value>& GetRetData() { return m_retData;}
|
||||
|
||||
virtual void
|
||||
Run()
|
||||
virtual void
|
||||
Run()
|
||||
{
|
||||
using namespace xmlrpc_c;
|
||||
const params_t params = m_paramList.getStruct(0);
|
||||
@ -292,25 +292,25 @@ public:
|
||||
|
||||
vector<float> multiModelWeights;
|
||||
si = params.find("lambda");
|
||||
if (si != params.end())
|
||||
if (si != params.end())
|
||||
{
|
||||
value_array multiModelArray = value_array(si->second);
|
||||
vector<value> multiModelValueVector(multiModelArray.vectorValueValue());
|
||||
for (size_t i=0;i < multiModelValueVector.size();i++)
|
||||
for (size_t i=0;i < multiModelValueVector.size();i++)
|
||||
{
|
||||
multiModelWeights.push_back(value_double(multiModelValueVector[i]));
|
||||
}
|
||||
}
|
||||
|
||||
si = params.find("model_name");
|
||||
if (si != params.end() && multiModelWeights.size() > 0)
|
||||
if (si != params.end() && multiModelWeights.size() > 0)
|
||||
{
|
||||
const string model_name = value_string(si->second);
|
||||
PhraseDictionaryMultiModel* pdmm
|
||||
PhraseDictionaryMultiModel* pdmm
|
||||
= (PhraseDictionaryMultiModel*) FindPhraseDictionary(model_name);
|
||||
pdmm->SetTemporaryMultiModelWeightsVector(multiModelWeights);
|
||||
}
|
||||
|
||||
|
||||
const StaticData &staticData = StaticData::Instance();
|
||||
|
||||
//Make sure alternative paths are retained, if necessary
|
||||
@ -321,7 +321,7 @@ public:
|
||||
|
||||
stringstream out, graphInfo, transCollOpts;
|
||||
|
||||
if (staticData.IsSyntax())
|
||||
if (staticData.IsSyntax())
|
||||
{
|
||||
boost::shared_ptr<TreeInput> tinput(new TreeInput);
|
||||
const vector<FactorType>& IFO = staticData.GetInputFactorOrder();
|
||||
@ -338,8 +338,8 @@ public:
|
||||
manager.OutputSearchGraphMoses(sgstream);
|
||||
m_retData["sg"] = value_string(sgstream.str());
|
||||
}
|
||||
}
|
||||
else
|
||||
}
|
||||
else
|
||||
{
|
||||
// size_t lineNumber = 0; // TODO: Include sentence request number here?
|
||||
boost::shared_ptr<Sentence> sentence(new Sentence(0,source));
|
||||
@ -351,30 +351,30 @@ public:
|
||||
vector<xmlrpc_c::value> alignInfo;
|
||||
outputHypo(out,hypo,addAlignInfo,alignInfo,reportAllFactors);
|
||||
if (addAlignInfo) m_retData["align"] = value_array(alignInfo);
|
||||
if (addWordAlignInfo)
|
||||
if (addWordAlignInfo)
|
||||
{
|
||||
stringstream wordAlignment;
|
||||
hypo->OutputAlignment(wordAlignment);
|
||||
vector<xmlrpc_c::value> alignments;
|
||||
string alignmentPair;
|
||||
while (wordAlignment >> alignmentPair)
|
||||
while (wordAlignment >> alignmentPair)
|
||||
{
|
||||
int pos = alignmentPair.find('-');
|
||||
map<string, xmlrpc_c::value> wordAlignInfo;
|
||||
wordAlignInfo["source-word"]
|
||||
wordAlignInfo["source-word"]
|
||||
= value_int(atoi(alignmentPair.substr(0, pos).c_str()));
|
||||
wordAlignInfo["target-word"]
|
||||
wordAlignInfo["target-word"]
|
||||
= value_int(atoi(alignmentPair.substr(pos + 1).c_str()));
|
||||
alignments.push_back(value_struct(wordAlignInfo));
|
||||
}
|
||||
m_retData["word-align"] = value_array(alignments);
|
||||
}
|
||||
|
||||
|
||||
if (addGraphInfo) insertGraphInfo(manager,m_retData);
|
||||
if (addTopts) insertTranslationOptions(manager,m_retData);
|
||||
if (nbest_size > 0)
|
||||
if (nbest_size > 0)
|
||||
{
|
||||
outputNBest(manager, m_retData, nbest_size, nbest_distinct,
|
||||
outputNBest(manager, m_retData, nbest_size, nbest_distinct,
|
||||
reportAllFactors, addAlignInfo, addScoreBreakdown);
|
||||
}
|
||||
(const_cast<StaticData&>(staticData)).SetOutputSearchGraph(false);
|
||||
@ -389,11 +389,11 @@ public:
|
||||
|
||||
}
|
||||
|
||||
void outputHypo(ostream& out, const Hypothesis* hypo,
|
||||
bool addAlignmentInfo, vector<xmlrpc_c::value>& alignInfo,
|
||||
void outputHypo(ostream& out, const Hypothesis* hypo,
|
||||
bool addAlignmentInfo, vector<xmlrpc_c::value>& alignInfo,
|
||||
bool reportAllFactors = false) {
|
||||
if (hypo->GetPrevHypo() != NULL) {
|
||||
outputHypo(out,hypo->GetPrevHypo(),addAlignmentInfo,
|
||||
outputHypo(out,hypo->GetPrevHypo(),addAlignmentInfo,
|
||||
alignInfo, reportAllFactors);
|
||||
Phrase p = hypo->GetCurrTargetPhrase();
|
||||
if(reportAllFactors) {
|
||||
@ -547,14 +547,14 @@ public:
|
||||
retData.insert(pair<string, xmlrpc_c::value>("nbest", xmlrpc_c::value_array(nBestXml)));
|
||||
}
|
||||
|
||||
void
|
||||
insertTranslationOptions(Manager& manager, map<string, xmlrpc_c::value>& retData)
|
||||
void
|
||||
insertTranslationOptions(Manager& manager, map<string, xmlrpc_c::value>& retData)
|
||||
{
|
||||
const TranslationOptionCollection* toptsColl = manager.getSntTranslationOptions();
|
||||
vector<xmlrpc_c::value> toptsXml;
|
||||
size_t const stop = toptsColl->GetSource().GetSize();
|
||||
TranslationOptionList const* tol;
|
||||
for (size_t s = 0 ; s < stop ; ++s)
|
||||
for (size_t s = 0 ; s < stop ; ++s)
|
||||
{
|
||||
for (size_t e = s; (tol = toptsColl->GetTranslationOptionList(s,e)) != NULL; ++e)
|
||||
{
|
||||
@ -569,11 +569,11 @@ public:
|
||||
toptXml["start"] = xmlrpc_c::value_int(s);
|
||||
toptXml["end"] = xmlrpc_c::value_int(e);
|
||||
vector<xmlrpc_c::value> scoresXml;
|
||||
const std::valarray<FValue> &scores
|
||||
const std::valarray<FValue> &scores
|
||||
= topt->GetScoreBreakdown().getCoreFeatures();
|
||||
for (size_t j = 0; j < scores.size(); ++j)
|
||||
for (size_t j = 0; j < scores.size(); ++j)
|
||||
scoresXml.push_back(xmlrpc_c::value_double(scores[j]));
|
||||
|
||||
|
||||
toptXml["scores"] = xmlrpc_c::value_array(scoresXml);
|
||||
toptsXml.push_back(xmlrpc_c::value_struct(toptXml));
|
||||
}
|
||||
@ -581,7 +581,7 @@ public:
|
||||
}
|
||||
retData.insert(pair<string, xmlrpc_c::value>("topt", xmlrpc_c::value_array(toptsXml)));
|
||||
}
|
||||
|
||||
|
||||
private:
|
||||
xmlrpc_c::paramList const& m_paramList;
|
||||
map<string, xmlrpc_c::value> m_retData;
|
||||
@ -619,8 +619,8 @@ private:
|
||||
Moses::ThreadPool m_threadPool;
|
||||
};
|
||||
|
||||
static
|
||||
void
|
||||
static
|
||||
void
|
||||
PrintFeatureWeight(ostream& out, const FeatureFunction* ff)
|
||||
{
|
||||
out << ff->GetScoreProducerDescription() << "=";
|
||||
@ -632,16 +632,16 @@ PrintFeatureWeight(ostream& out, const FeatureFunction* ff)
|
||||
out << endl;
|
||||
}
|
||||
|
||||
static
|
||||
void
|
||||
static
|
||||
void
|
||||
ShowWeights(ostream& out)
|
||||
{
|
||||
// adapted from moses-cmd/Main.cpp
|
||||
std::ios::fmtflags old_flags = out.setf(std::ios::fixed);
|
||||
size_t old_precision = out.precision(6);
|
||||
const vector<const StatelessFeatureFunction*>&
|
||||
const vector<const StatelessFeatureFunction*>&
|
||||
slf = StatelessFeatureFunction::GetStatelessFeatureFunctions();
|
||||
const vector<const StatefulFeatureFunction*>&
|
||||
const vector<const StatefulFeatureFunction*>&
|
||||
sff = StatefulFeatureFunction::GetStatefulFeatureFunctions();
|
||||
|
||||
for (size_t i = 0; i < sff.size(); ++i) {
|
||||
@ -662,7 +662,7 @@ ShowWeights(ostream& out)
|
||||
out << ff->GetScoreProducerDescription() << " UNTUNEABLE" << endl;
|
||||
}
|
||||
}
|
||||
if (! (old_flags & std::ios::fixed))
|
||||
if (! (old_flags & std::ios::fixed))
|
||||
out.unsetf(std::ios::fixed);
|
||||
out.precision(old_precision);
|
||||
}
|
||||
@ -754,7 +754,7 @@ int main(int argc, char** argv)
|
||||
.allowOrigin("*")
|
||||
);
|
||||
*/
|
||||
|
||||
|
||||
XVERBOSE(1,"Listening on port " << port << endl);
|
||||
if (isSerial) {
|
||||
while(1) myAbyssServer.runOnce();
|
||||
|
@ -1,231 +1,231 @@
|
||||
// XGetopt.cpp Version 1.2
|
||||
//
|
||||
// Author: Hans Dietrich
|
||||
// hdietrich2@hotmail.com
|
||||
//
|
||||
// Description:
|
||||
// XGetopt.cpp implements getopt(), a function to parse command lines.
|
||||
//
|
||||
// History
|
||||
// Version 1.2 - 2003 May 17
|
||||
// - Added Unicode support
|
||||
//
|
||||
// Version 1.1 - 2002 March 10
|
||||
// - Added example to XGetopt.cpp module header
|
||||
//
|
||||
// This software is released into the public domain.
|
||||
// You are free to use it in any way you like.
|
||||
//
|
||||
// This software is provided "as is" with no expressed
|
||||
// or implied warranty. I accept no liability for any
|
||||
// damage or loss of business that this software may cause.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// if you are using precompiled headers then include this line:
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// if you are not using precompiled headers then include these lines:
|
||||
//#include <windows.h>
|
||||
//#include <cstdio>
|
||||
//#include <tchar.h>
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <cmath>
|
||||
#include "WIN32_functions.h"
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// X G e t o p t . c p p
|
||||
//
|
||||
//
|
||||
// NAME
|
||||
// getopt -- parse command line options
|
||||
//
|
||||
// SYNOPSIS
|
||||
// int getopt(int argc, char *argv[], char *optstring)
|
||||
//
|
||||
// extern char *optarg;
|
||||
// extern int optind;
|
||||
//
|
||||
// DESCRIPTION
|
||||
// The getopt() function parses the command line arguments. Its
|
||||
// arguments argc and argv are the argument count and array as
|
||||
// passed into the application on program invocation. In the case
|
||||
// of Visual C++ programs, argc and argv are available via the
|
||||
// variables __argc and __argv (double underscores), respectively.
|
||||
// getopt returns the next option letter in argv that matches a
|
||||
// letter in optstring. (Note: Unicode programs should use
|
||||
// __targv instead of __argv. Also, all character and string
|
||||
// literals should be enclosed in ( ) ).
|
||||
//
|
||||
// optstring is a string of recognized option letters; if a letter
|
||||
// is followed by a colon, the option is expected to have an argument
|
||||
// that may or may not be separated from it by white space. optarg
|
||||
// is set to point to the start of the option argument on return from
|
||||
// getopt.
|
||||
//
|
||||
// Option letters may be combined, e.g., "-ab" is equivalent to
|
||||
// "-a -b". Option letters are case sensitive.
|
||||
//
|
||||
// getopt places in the external variable optind the argv index
|
||||
// of the next argument to be processed. optind is initialized
|
||||
// to 0 before the first call to getopt.
|
||||
//
|
||||
// When all options have been processed (i.e., up to the first
|
||||
// non-option argument), getopt returns EOF, optarg will point
|
||||
// to the argument, and optind will be set to the argv index of
|
||||
// the argument. If there are no non-option arguments, optarg
|
||||
// will be set to NULL.
|
||||
//
|
||||
// The special option "--" may be used to delimit the end of the
|
||||
// options; EOF will be returned, and "--" (and everything after it)
|
||||
// will be skipped.
|
||||
//
|
||||
// RETURN VALUE
|
||||
// For option letters contained in the string optstring, getopt
|
||||
// will return the option letter. getopt returns a question mark (?)
|
||||
// when it encounters an option letter not included in optstring.
|
||||
// EOF is returned when processing is finished.
|
||||
//
|
||||
// BUGS
|
||||
// 1) Long options are not supported.
|
||||
// 2) The GNU double-colon extension is not supported.
|
||||
// 3) The environment variable POSIXLY_CORRECT is not supported.
|
||||
// 4) The + syntax is not supported.
|
||||
// 5) The automatic permutation of arguments is not supported.
|
||||
// 6) This implementation of getopt() returns EOF if an error is
|
||||
// encountered, instead of -1 as the latest standard requires.
|
||||
//
|
||||
// EXAMPLE
|
||||
// BOOL CMyApp::ProcessCommandLine(int argc, char *argv[])
|
||||
// {
|
||||
// int c;
|
||||
//
|
||||
// while ((c = getopt(argc, argv, ("aBn:"))) != EOF)
|
||||
// {
|
||||
// switch (c)
|
||||
// {
|
||||
// case ('a'):
|
||||
// TRACE(("option a\n"));
|
||||
// //
|
||||
// // set some flag here
|
||||
// //
|
||||
// break;
|
||||
//
|
||||
// case ('B'):
|
||||
// TRACE( ("option B\n"));
|
||||
// //
|
||||
// // set some other flag here
|
||||
// //
|
||||
// break;
|
||||
//
|
||||
// case ('n'):
|
||||
// TRACE(("option n: value=%d\n"), atoi(optarg));
|
||||
// //
|
||||
// // do something with value here
|
||||
// //
|
||||
// break;
|
||||
//
|
||||
// case ('?'):
|
||||
// TRACE(("ERROR: illegal option %s\n"), argv[optind-1]);
|
||||
// return FALSE;
|
||||
// break;
|
||||
//
|
||||
// default:
|
||||
// TRACE(("WARNING: no handler for option %c\n"), c);
|
||||
// return FALSE;
|
||||
// break;
|
||||
// }
|
||||
// }
|
||||
// //
|
||||
// // check for non-option args here
|
||||
// //
|
||||
// return TRUE;
|
||||
// }
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
char *optarg; // global argument pointer
|
||||
int optind = 0; // global argv index
|
||||
|
||||
int getopt(int argc, char *argv[], char *optstring)
|
||||
{
|
||||
static char *next = NULL;
|
||||
if (optind == 0)
|
||||
next = NULL;
|
||||
|
||||
optarg = NULL;
|
||||
|
||||
if (next == NULL || *next =='\0') {
|
||||
if (optind == 0)
|
||||
optind++;
|
||||
|
||||
if (optind >= argc || argv[optind][0] != ('-') || argv[optind][1] == ('\0')) {
|
||||
optarg = NULL;
|
||||
if (optind < argc)
|
||||
optarg = argv[optind];
|
||||
return EOF;
|
||||
}
|
||||
|
||||
if (strcmp(argv[optind], "--") == 0) {
|
||||
optind++;
|
||||
optarg = NULL;
|
||||
if (optind < argc)
|
||||
optarg = argv[optind];
|
||||
return EOF;
|
||||
}
|
||||
|
||||
next = argv[optind];
|
||||
next++; // skip past -
|
||||
optind++;
|
||||
}
|
||||
|
||||
char c = *next++;
|
||||
char *cp = strchr(optstring, c);
|
||||
|
||||
if (cp == NULL || c == (':'))
|
||||
return ('?');
|
||||
|
||||
cp++;
|
||||
if (*cp == (':')) {
|
||||
if (*next != ('\0')) {
|
||||
optarg = next;
|
||||
next = NULL;
|
||||
} else if (optind < argc) {
|
||||
optarg = argv[optind];
|
||||
optind++;
|
||||
} else {
|
||||
return ('?');
|
||||
}
|
||||
}
|
||||
|
||||
return c;
|
||||
}
|
||||
|
||||
// for an overview, see
|
||||
// W. Press, S. Teukolsky and W. Vetterling. (1992) Numerical Recipes in C. Chapter 6.1.
|
||||
double lgamma(int x)
|
||||
{
|
||||
// size_t xx=(size_t)x; xx--; size_t sum=1; while (xx) { sum *= xx--; } return log((double)(sum));
|
||||
if (x <= 2) {
|
||||
return 0.0;
|
||||
}
|
||||
static double coefs[6] = {76.18009172947146, -86.50532032941677, 24.01409824083091, -1.231739572450155, 0.1208650973866179e-2, -0.5395239384953e-5};
|
||||
double tmp=(double)x+5.5;
|
||||
tmp -= (((double)x)+0.5)*log(tmp);
|
||||
double y=(double)x;
|
||||
double sum = 1.000000000190015;
|
||||
for (size_t j=0; j<6; ++j) {
|
||||
sum += coefs[j]/++y;
|
||||
}
|
||||
return -tmp+log(2.5066282746310005*sum/(double)x);
|
||||
}
|
||||
// XGetopt.cpp Version 1.2
|
||||
//
|
||||
// Author: Hans Dietrich
|
||||
// hdietrich2@hotmail.com
|
||||
//
|
||||
// Description:
|
||||
// XGetopt.cpp implements getopt(), a function to parse command lines.
|
||||
//
|
||||
// History
|
||||
// Version 1.2 - 2003 May 17
|
||||
// - Added Unicode support
|
||||
//
|
||||
// Version 1.1 - 2002 March 10
|
||||
// - Added example to XGetopt.cpp module header
|
||||
//
|
||||
// This software is released into the public domain.
|
||||
// You are free to use it in any way you like.
|
||||
//
|
||||
// This software is provided "as is" with no expressed
|
||||
// or implied warranty. I accept no liability for any
|
||||
// damage or loss of business that this software may cause.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// if you are using precompiled headers then include this line:
|
||||
///////////////////////////////////////////////////////////////////////////////

///////////////////////////////////////////////////////////////////////////////
// if you are not using precompiled headers then include these lines:
//#include <windows.h>
//#include <cstdio>
//#include <tchar.h>
///////////////////////////////////////////////////////////////////////////////

#include <cstdio>
#include <cstring>
#include <cmath>
#include "WIN32_functions.h"

///////////////////////////////////////////////////////////////////////////////
//
// X G e t o p t . c p p
//
//
// NAME
//     getopt -- parse command line options
//
// SYNOPSIS
//     int getopt(int argc, char *argv[], char *optstring)
//
//     extern char *optarg;
//     extern int optind;
//
// DESCRIPTION
//     The getopt() function parses the command line arguments. Its
//     arguments argc and argv are the argument count and array as
//     passed into the application on program invocation. In the case
//     of Visual C++ programs, argc and argv are available via the
//     variables __argc and __argv (double underscores), respectively.
//     getopt returns the next option letter in argv that matches a
//     letter in optstring. (Note: Unicode programs should use
//     __targv instead of __argv. Also, all character and string
//     literals should be enclosed in ( ) ).
//
//     optstring is a string of recognized option letters; if a letter
//     is followed by a colon, the option is expected to have an argument
//     that may or may not be separated from it by white space. optarg
//     is set to point to the start of the option argument on return from
//     getopt.
//
//     Option letters may be combined, e.g., "-ab" is equivalent to
//     "-a -b". Option letters are case sensitive.
//
//     getopt places in the external variable optind the argv index
//     of the next argument to be processed. optind is initialized
//     to 0 before the first call to getopt.
//
//     When all options have been processed (i.e., up to the first
//     non-option argument), getopt returns EOF, optarg will point
//     to the argument, and optind will be set to the argv index of
//     the argument. If there are no non-option arguments, optarg
//     will be set to NULL.
//
//     The special option "--" may be used to delimit the end of the
//     options; EOF will be returned, and "--" (and everything after it)
//     will be skipped.
//
// RETURN VALUE
//     For option letters contained in the string optstring, getopt
//     will return the option letter. getopt returns a question mark (?)
//     when it encounters an option letter not included in optstring.
//     EOF is returned when processing is finished.
//
// BUGS
//     1) Long options are not supported.
//     2) The GNU double-colon extension is not supported.
//     3) The environment variable POSIXLY_CORRECT is not supported.
//     4) The + syntax is not supported.
//     5) The automatic permutation of arguments is not supported.
//     6) This implementation of getopt() returns EOF if an error is
//        encountered, instead of -1 as the latest standard requires.
//
// EXAMPLE
//     BOOL CMyApp::ProcessCommandLine(int argc, char *argv[])
//     {
//         int c;
//
//         while ((c = getopt(argc, argv, ("aBn:"))) != EOF)
//         {
//             switch (c)
//             {
//                 case ('a'):
//                     TRACE(("option a\n"));
//                     //
//                     // set some flag here
//                     //
//                     break;
//
//                 case ('B'):
//                     TRACE( ("option B\n"));
//                     //
//                     // set some other flag here
//                     //
//                     break;
//
//                 case ('n'):
//                     TRACE(("option n: value=%d\n"), atoi(optarg));
//                     //
//                     // do something with value here
//                     //
//                     break;
//
//                 case ('?'):
//                     TRACE(("ERROR: illegal option %s\n"), argv[optind-1]);
//                     return FALSE;
//                     break;
//
//                 default:
//                     TRACE(("WARNING: no handler for option %c\n"), c);
//                     return FALSE;
//                     break;
//             }
//         }
//         //
//         // check for non-option args here
//         //
//         return TRUE;
//     }
//
///////////////////////////////////////////////////////////////////////////////

char *optarg; // global argument pointer
int optind = 0; // global argv index

int getopt(int argc, char *argv[], char *optstring)
{
  static char *next = NULL;
  if (optind == 0)
    next = NULL;

  optarg = NULL;

  if (next == NULL || *next =='\0') {
    if (optind == 0)
      optind++;

    if (optind >= argc || argv[optind][0] != ('-') || argv[optind][1] == ('\0')) {
      optarg = NULL;
      if (optind < argc)
        optarg = argv[optind];
      return EOF;
    }

    if (strcmp(argv[optind], "--") == 0) {
      optind++;
      optarg = NULL;
      if (optind < argc)
        optarg = argv[optind];
      return EOF;
    }

    next = argv[optind];
    next++; // skip past -
    optind++;
  }

  char c = *next++;
  char *cp = strchr(optstring, c);

  if (cp == NULL || c == (':'))
    return ('?');

  cp++;
  if (*cp == (':')) {
    if (*next != ('\0')) {
      optarg = next;
      next = NULL;
    } else if (optind < argc) {
      optarg = argv[optind];
      optind++;
    } else {
      return ('?');
    }
  }

  return c;
}

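The following is an editor's illustration, not part of this commit: a minimal sketch of how the getopt() above and the optarg/optind globals are typically driven from a plain main(), as a console-style counterpart to the MFC-flavoured EXAMPLE in the header comment. It assumes only that WIN32_functions.h declares getopt(), optarg and optind as in the header shown further below.

#include <cstdio>
#include <cstdlib>
#include "WIN32_functions.h"

int main(int argc, char *argv[])
{
  char opts[] = "an:";   // -a is a flag, -n takes an argument
  bool a = false;
  int n = 0;
  int c;
  while ((c = getopt(argc, argv, opts)) != EOF) {
    switch (c) {
    case 'a': a = true; break;
    case 'n': n = std::atoi(optarg); break;   // optarg points at the option's argument
    default:  std::fprintf(stderr, "unknown option\n"); return 1;
    }
  }
  // optind now indexes the first non-option argument, if any
  std::printf("a=%d n=%d rest=%s\n", a ? 1 : 0, n,
              optind < argc ? argv[optind] : "(none)");
  return 0;
}
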
// for an overview, see
// W. Press, S. Teukolsky and W. Vetterling. (1992) Numerical Recipes in C. Chapter 6.1.
double lgamma(int x)
{
  // size_t xx=(size_t)x; xx--; size_t sum=1; while (xx) { sum *= xx--; } return log((double)(sum));
  if (x <= 2) {
    return 0.0;
  }
  static double coefs[6] = {76.18009172947146, -86.50532032941677, 24.01409824083091, -1.231739572450155, 0.1208650973866179e-2, -0.5395239384953e-5};
  double tmp=(double)x+5.5;
  tmp -= (((double)x)+0.5)*log(tmp);
  double y=(double)x;
  double sum = 1.000000000190015;
  for (size_t j=0; j<6; ++j) {
    sum += coefs[j]/++y;
  }
  return -tmp+log(2.5066282746310005*sum/(double)x);
}
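As a side note (an editor's sketch, not part of this commit): the function above is the Lanczos series approximation of ln Gamma(x) from Numerical Recipes, and since Gamma(n) = (n-1)! for positive integers, a quick way to sanity-check it is to compare its output against the log of a directly computed factorial. The snippet assumes the declaration double lgamma(int) from WIN32_functions.h; on non-Windows toolchains the C99 lgamma(double) from <cmath> may already provide the same values.

#include <cmath>
#include <cstdio>
#include "WIN32_functions.h"   // declares double lgamma(int x)

int main()
{
  for (int n = 3; n <= 10; ++n) {
    double logfact = 0.0;                  // accumulate log((n-1)!)
    for (int k = 2; k < n; ++k)
      logfact += std::log((double)k);
    std::printf("n=%2d  lgamma(n)=%.6f  log((n-1)!)=%.6f\n",
                n, lgamma(n), logfact);
  }
  return 0;
}
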
@ -1,24 +1,24 @@
// XGetopt.h Version 1.2
//
// Author: Hans Dietrich
// hdietrich2@hotmail.com
//
// This software is released into the public domain.
// You are free to use it in any way you like.
//
// This software is provided "as is" with no expressed
// or implied warranty. I accept no liability for any
// damage or loss of business that this software may cause.
//
///////////////////////////////////////////////////////////////////////////////

#ifndef XGETOPT_H
#define XGETOPT_H

extern int optind, opterr;
extern char *optarg;

int getopt(int argc, char *argv[], char *optstring);
double lgamma(int x);

#endif //XGETOPT_H
// XGetopt.h Version 1.2
//
// Author: Hans Dietrich
// hdietrich2@hotmail.com
//
// This software is released into the public domain.
// You are free to use it in any way you like.
//
// This software is provided "as is" with no expressed
// or implied warranty. I accept no liability for any
// damage or loss of business that this software may cause.
//
///////////////////////////////////////////////////////////////////////////////

#ifndef XGETOPT_H
#define XGETOPT_H

extern int optind, opterr;
extern char *optarg;

int getopt(int argc, char *argv[], char *optstring);
double lgamma(int x);

#endif //XGETOPT_H

@ -1,5 +1,5 @@
|
||||
|
||||
#include <cstring>
|
||||
#include <cstring>
|
||||
#include <cassert>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
@ -14,7 +14,7 @@
|
||||
#include <set>
|
||||
|
||||
#include <boost/thread/tss.hpp>
|
||||
#include <boost/thread.hpp>
|
||||
#include <boost/thread.hpp>
|
||||
#include <boost/unordered_map.hpp>
|
||||
|
||||
#ifdef WIN32
|
||||
@ -58,9 +58,9 @@ typedef boost::shared_ptr<std::vector<TextLenType> > SentIdSet;
|
||||
class Cache {
|
||||
typedef std::pair<SentIdSet, clock_t> ClockedSet;
|
||||
typedef boost::unordered_map<std::string, ClockedSet> ClockedMap;
|
||||
|
||||
|
||||
public:
|
||||
|
||||
|
||||
SentIdSet get(const std::string& phrase) {
|
||||
boost::shared_lock<boost::shared_mutex> lock(m_mutex);
|
||||
if(m_cont.count(phrase)) {
|
||||
@ -70,27 +70,27 @@ class Cache {
|
||||
}
|
||||
return SentIdSet( new SentIdSet::element_type() );
|
||||
}
|
||||
|
||||
|
||||
void put(const std::string& phrase, const SentIdSet set) {
|
||||
boost::unique_lock<boost::shared_mutex> lock(m_mutex);
|
||||
m_cont[phrase] = std::make_pair(set, clock());
|
||||
}
|
||||
|
||||
|
||||
static void set_max_cache(size_t max_cache) {
|
||||
s_max_cache = max_cache;
|
||||
}
|
||||
|
||||
|
||||
void prune() {
|
||||
if(s_max_cache > 0) {
|
||||
boost::upgrade_lock<boost::shared_mutex> lock(m_mutex);
|
||||
if(m_cont.size() > s_max_cache) {
|
||||
std::vector<clock_t> clocks;
|
||||
for(ClockedMap::iterator it = m_cont.begin(); it != m_cont.end(); it++)
|
||||
for(ClockedMap::iterator it = m_cont.begin(); it != m_cont.end(); it++)
|
||||
clocks.push_back(it->second.second);
|
||||
|
||||
|
||||
std::sort(clocks.begin(), clocks.end());
|
||||
clock_t out = clocks[m_cont.size() - s_max_cache];
|
||||
|
||||
|
||||
boost::upgrade_to_unique_lock<boost::shared_mutex> uniq_lock(lock);
|
||||
for(ClockedMap::iterator it = m_cont.begin(); it != m_cont.end(); it++)
|
||||
if(it->second.second < out)
|
||||
@ -98,7 +98,7 @@ class Cache {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private:
|
||||
ClockedMap m_cont;
|
||||
boost::shared_mutex m_mutex;
|
||||
@ -282,12 +282,12 @@ void lookup_phrase(SentIdSet& ids, const std::string& phrase,
|
||||
i != locations.end(); ++i) {
|
||||
ids->push_back(i->sentIdInCorpus);
|
||||
}
|
||||
|
||||
|
||||
std::sort(ids->begin(), ids->end());
|
||||
SentIdSet::element_type::iterator it =
|
||||
std::unique(ids->begin(), ids->end());
|
||||
ids->resize(it - ids->begin());
|
||||
|
||||
|
||||
if(ids->size() >= MINIMUM_SIZE_TO_KEEP)
|
||||
cache.put(phrase, ids);
|
||||
}
|
||||
@ -295,8 +295,8 @@ void lookup_phrase(SentIdSet& ids, const std::string& phrase,
|
||||
|
||||
void lookup_multiple_phrases(SentIdSet& ids, vector<std::string> & phrases,
|
||||
C_SuffixArraySearchApplicationBase & my_sa,
|
||||
const std::string & rule, Cache& cache)
|
||||
{
|
||||
const std::string & rule, Cache& cache)
|
||||
{
|
||||
|
||||
if (phrases.size() == 1) {
|
||||
lookup_phrase(ids, phrases.front(), my_sa, cache);
|
||||
@ -372,32 +372,32 @@ void compute_cooc_stats_and_filter(std::vector<PTEntry*>& options,
|
||||
delete *i;
|
||||
options.erase(options.begin() + pfe_filter_limit,options.end());
|
||||
}
|
||||
|
||||
|
||||
if (pef_filter_only)
|
||||
return;
|
||||
|
||||
|
||||
if (options.empty())
|
||||
return;
|
||||
|
||||
|
||||
SentIdSet fset( new SentIdSet::element_type() );
|
||||
find_occurrences(fset, options.front()->f_phrase, f_sa, f_cache);
|
||||
size_t cf = fset->size();
|
||||
|
||||
|
||||
for (std::vector<PTEntry*>::iterator i = options.begin();
|
||||
i != options.end(); ++i) {
|
||||
const std::string& e_phrase = (*i)->e_phrase;
|
||||
SentIdSet eset( new SentIdSet::element_type() );
|
||||
find_occurrences(eset, e_phrase, e_sa, e_cache);
|
||||
size_t ce = eset->size();
|
||||
|
||||
|
||||
SentIdSet efset( new SentIdSet::element_type() );
|
||||
ordered_set_intersect(efset, fset, eset);
|
||||
size_t cef = efset->size();
|
||||
|
||||
|
||||
double nlp = -log(fisher_exact(cef, cf, ce));
|
||||
(*i)->set_cooc_stats(cef, cf, ce, nlp);
|
||||
}
|
||||
|
||||
|
||||
std::vector<PTEntry*>::iterator new_end =
|
||||
std::remove_if(options.begin(), options.end(),
|
||||
NlogSigThresholder(sig_filter_limit));
|
||||
@ -406,7 +406,7 @@ void compute_cooc_stats_and_filter(std::vector<PTEntry*>& options,
|
||||
}
|
||||
|
||||
void filter(std::istream* in, std::ostream* out, int pfe_index) {
|
||||
|
||||
|
||||
std::vector<std::string> lines;
|
||||
std::string prev = "";
|
||||
std::vector<PTEntry*> options;
|
||||
@ -415,23 +415,23 @@ void filter(std::istream* in, std::ostream* out, int pfe_index) {
|
||||
boost::mutex::scoped_lock lock(in_mutex);
|
||||
if(in->eof())
|
||||
break;
|
||||
|
||||
|
||||
lines.clear();
|
||||
std::string line;
|
||||
while(getline(*in, line) && lines.size() < 500000)
|
||||
lines.push_back(line);
|
||||
}
|
||||
|
||||
|
||||
std::stringstream out_temp;
|
||||
for(std::vector<std::string>::iterator it = lines.begin(); it != lines.end(); it++) {
|
||||
size_t tmp_lines = ++pt_lines;
|
||||
if(tmp_lines % 10000 == 0) {
|
||||
boost::mutex::scoped_lock lock(err_mutex);
|
||||
std::cerr << ".";
|
||||
|
||||
|
||||
if(tmp_lines % 500000 == 0)
|
||||
std::cerr << "[n:" << tmp_lines << "]\n";
|
||||
|
||||
|
||||
if(tmp_lines % 10000000 == 0) {
|
||||
float pfefper = (100.0*(float)nremoved_pfefilter)/(float)pt_lines;
|
||||
float sigfper = (100.0*(float)nremoved_sigfilter)/(float)pt_lines;
|
||||
@ -446,30 +446,30 @@ void filter(std::istream* in, std::ostream* out, int pfe_index) {
|
||||
<< "------------------------------------------------------\n";
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if(pt_lines % 10000 == 0) {
|
||||
f_cache.prune();
|
||||
e_cache.prune();
|
||||
}
|
||||
|
||||
|
||||
if(it->length() > 0) {
|
||||
PTEntry* pp = new PTEntry(it->c_str(), pfe_index);
|
||||
if (prev != pp->f_phrase) {
|
||||
prev = pp->f_phrase;
|
||||
|
||||
|
||||
if (!options.empty()) { // always true after first line
|
||||
compute_cooc_stats_and_filter(options, f_cache, e_cache);
|
||||
}
|
||||
|
||||
|
||||
for (std::vector<PTEntry*>::iterator i = options.begin();
|
||||
i != options.end(); ++i) {
|
||||
out_temp << **i << '\n';
|
||||
delete *i;
|
||||
}
|
||||
|
||||
|
||||
options.clear();
|
||||
options.push_back(pp);
|
||||
|
||||
|
||||
} else {
|
||||
options.push_back(pp);
|
||||
}
|
||||
@ -479,7 +479,7 @@ void filter(std::istream* in, std::ostream* out, int pfe_index) {
|
||||
*out << out_temp.str() << std::flush;
|
||||
}
|
||||
compute_cooc_stats_and_filter(options, f_cache, e_cache);
|
||||
|
||||
|
||||
boost::mutex::scoped_lock lock(out_mutex);
|
||||
for (std::vector<PTEntry*>::iterator i = options.begin();
|
||||
i != options.end(); ++i) {
|
||||
@ -512,11 +512,11 @@ int main(int argc, char * argv[])
|
||||
pfe_filter_limit = atoi(optarg);
|
||||
std::cerr << "P(f|e) filter limit: " << pfe_filter_limit << std::endl;
|
||||
break;
|
||||
case 't':
|
||||
case 't':
|
||||
threads = atoi(optarg);
|
||||
std::cerr << "Using threads: " << threads << std::endl;
|
||||
break;
|
||||
case 'm':
|
||||
case 'm':
|
||||
max_cache = atoi(optarg);
|
||||
std::cerr << "Using max phrases in caches: " << max_cache << std::endl;
|
||||
break;
|
||||
@ -548,13 +548,13 @@ int main(int argc, char * argv[])
|
||||
usage();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if (sig_filter_limit == 0.0) pef_filter_only = true;
|
||||
//-----------------------------------------------------------------------------
|
||||
if (optind != argc || ((!efile || !ffile) && !pef_filter_only)) {
|
||||
usage();
|
||||
}
|
||||
|
||||
|
||||
//load the indexed corpus with vocabulary(noVoc=false) and with offset(noOffset=false)
|
||||
if (!pef_filter_only) {
|
||||
e_sa.loadData_forSearch(efile, false, false);
|
||||
@ -582,15 +582,15 @@ int main(int argc, char * argv[])
|
||||
|
||||
Cache::set_max_cache(max_cache);
|
||||
std::ios_base::sync_with_stdio(false);
|
||||
|
||||
|
||||
boost::thread_group threadGroup;
|
||||
for(int i = 0; i < threads; i++)
|
||||
for(int i = 0; i < threads; i++)
|
||||
threadGroup.add_thread(new boost::thread(filter, &std::cin, &std::cout, pfe_index));
|
||||
threadGroup.join_all();
|
||||
|
||||
float pfefper = (100.0*(float)nremoved_pfefilter)/(float)pt_lines;
|
||||
float sigfper = (100.0*(float)nremoved_sigfilter)/(float)pt_lines;
|
||||
|
||||
|
||||
std::cerr << "\n\n------------------------------------------------------\n"
|
||||
<< " unfiltered phrases pairs: " << pt_lines << "\n"
|
||||
<< "\n"
|
||||
@ -599,5 +599,5 @@ int main(int argc, char * argv[])
|
||||
<< " TOTAL FILTERED: " << (nremoved_pfefilter + nremoved_sigfilter) << " (" << (sigfper + pfefper) << "%)\n"
|
||||
<< "\n"
|
||||
<< " FILTERED phrase pairs: " << (pt_lines - nremoved_pfefilter - nremoved_sigfilter) << " (" << (100.0-sigfper - pfefper) << "%)\n"
|
||||
<< "------------------------------------------------------\n";
|
||||
<< "------------------------------------------------------\n";
|
||||
}
|
||||
|
@ -65,7 +65,7 @@ class Numbered : public T {
|
||||
friend String& operator<< ( String& str, const Numbered<SD1,I,SD2,T,SD3>& rv ) { return str<<SD1<<rv.i<<SD2<<rv.getT()<<SD3; }
|
||||
friend pair<StringInput,Numbered<SD1,I,SD2,T,SD3>*> operator>> ( StringInput ps, Numbered<SD1,I,SD2,T,SD3>& rv ) { return pair<StringInput,Numbered<SD1,I,SD2,T,SD3>*>(ps,&rv); }
|
||||
friend StringInput operator>> ( pair<StringInput,Numbered<SD1,I,SD2,T,SD3>*> delimbuff, const char* psPostDelim ) {
|
||||
return ( (SD3[0]=='\0') ? delimbuff.first>>SD1>>delimbuff.second->i>>SD2>>delimbuff.second->setT()>>psPostDelim
|
||||
return ( (SD3[0]=='\0') ? delimbuff.first>>SD1>>delimbuff.second->i>>SD2>>delimbuff.second->setT()>>psPostDelim
|
||||
: delimbuff.first>>SD1>>delimbuff.second->i>>SD2>>delimbuff.second->setT()>>SD3>>psPostDelim );
|
||||
}
|
||||
};
|
||||
@ -106,7 +106,7 @@ template<class V>
|
||||
pair<typename V::ElementType,int> ArchetypeSet<V>::getDistanceOfNearest ( const V& v ) const {
|
||||
//const Scored<typename V::ElementType,pair<int,SafePtr<const V> > > sipvDummy ( DBL_MAX );
|
||||
//MinHeap<Scored<typename V::ElementType,pair<int,SafePtr<const V> > > > hsiv ( MapType::size()+1, sipvDummy );
|
||||
MinHeap<Scored<typename V::ElementType,pair<int,SafePtr<const NV> > > >& hsiv =
|
||||
MinHeap<Scored<typename V::ElementType,pair<int,SafePtr<const NV> > > >& hsiv =
|
||||
const_cast<MinHeap<Scored<typename V::ElementType,pair<int,SafePtr<const NV> > > >&> ( hsivCalc );
|
||||
hsiv.clear();
|
||||
|
||||
@ -120,7 +120,7 @@ pair<typename V::ElementType,int> ArchetypeSet<V>::getDistanceOfNearest ( const
|
||||
typename V::ElementType d = v.getMarginalDistance ( hsiv.get(iNext).first, hsiv.get(iNext).second.getRef() );
|
||||
hsiv.set(iNext).setScore() = d;
|
||||
//hsiv.set(iNext).setScore() = v.getMarginalDistance ( hsiv.getMin().first, iUpper->second.second );
|
||||
////int j =
|
||||
////int j =
|
||||
hsiv.fixDecr(iNext);
|
||||
////cerr<<" adding ln"<<&hsiv.get(j).second.getRef()<<" marg-dist="<<d<<" new-score="<<double(hsiv.get(j).getScore())<<" new-pos="<<j<<"\n";
|
||||
iNext++;
|
||||
@ -140,7 +140,7 @@ pair<typename V::ElementType,int> ArchetypeSet<V>::getDistanceOfNearest ( const
|
||||
typename V::ElementType d = v.getMarginalDistance ( ++hsiv.setMin().first, hsiv.getMin().second.getRef() );
|
||||
hsiv.setMin().setScore() += d;
|
||||
////cerr<<" matching ln"<<&hsiv.getMin().second.getRef()<<" i="<<hsiv.setMin().first<<" marg-dist="<<d<<" new-score="<<hsiv.getMin().getScore();
|
||||
////int j =
|
||||
////int j =
|
||||
hsiv.fixIncr(0);
|
||||
////cerr<<" new-pos="<<j<<"\n";
|
||||
////if(j!=0) for(int i=0;i<iNext;i++) cerr<<" "<<i<<": ln"<<hsiv.get(i).second.getRef().lineNum.toInt()<<" new-score="<<double(hsiv.get(i).getScore())<<"\n";
|
||||
@ -151,7 +151,7 @@ pair<typename V::ElementType,int> ArchetypeSet<V>::getDistanceOfNearest ( const
|
||||
hsiv.set(iNext).second = SafePtr<const NV> ( iUpper->second );
|
||||
typename V::ElementType d = v.getMarginalDistance ( hsiv.get(iNext).first, hsiv.get(iNext).second.getRef() );
|
||||
hsiv.set(iNext).setScore() = d;
|
||||
////int j =
|
||||
////int j =
|
||||
hsiv.fixDecr(iNext);
|
||||
////cerr<<" adding ln"<<&hsiv.get(j).second.getRef()<<" marg-dist="<<d<<" new-score="<<double(hsiv.get(j).getScore())<<" new-pos="<<j<<"\n";
|
||||
iNext++;
|
||||
@ -164,7 +164,7 @@ pair<typename V::ElementType,int> ArchetypeSet<V>::getDistanceOfNearest ( const
|
||||
hsiv.set(iNext).second = SafePtr<const NV> ( iLower->second );
|
||||
typename V::ElementType d = v.getMarginalDistance ( hsiv.get(iNext).first, hsiv.get(iNext).second.getRef() );
|
||||
hsiv.set(iNext).setScore() = d;
|
||||
////int j =
|
||||
////int j =
|
||||
hsiv.fixDecr(iNext);
|
||||
////cerr<<" adding ln"<<&hsiv.get(j).second.getRef()<<" marg-dist="<<d<<" new-score="<<double(hsiv.get(j).getScore())<<" new-pos="<<j<<"\n";
|
||||
iNext++;
|
||||
|
@ -27,7 +27,7 @@
|
||||
|
||||
#include <cassert>
|
||||
#include <iostream>
|
||||
using namespace std;
|
||||
using namespace std;
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
|
@ -101,8 +101,8 @@ class Beam {
|
||||
void write(FILE *pf){
|
||||
/* for (typename BeamMap::const_iterator i = mkid.begin(); i != mkid.end(); i++){
|
||||
i->first.write(pf);
|
||||
fprintf(pf, " %d ", i->second.first);
|
||||
// i->second.second.write(pf);
|
||||
fprintf(pf, " %d ", i->second.first);
|
||||
// i->second.second.write(pf);
|
||||
fprintf(pf, "\n");
|
||||
}
|
||||
*/
|
||||
|
@ -394,7 +394,7 @@ class SimpleMap : public map<X,Y> {
|
||||
private:
|
||||
typedef map<X,Y> OrigMap;
|
||||
static const Y yDummy;
|
||||
|
||||
|
||||
public:
|
||||
// Constructor / destructor methods...
|
||||
SimpleMap ( ) : OrigMap() { }
|
||||
@ -899,7 +899,7 @@ class GenericHidVarCPTModel : public SimpleHash<K,typename Y::template ArrayDist
|
||||
const typename Y::template ArrayDistrib<P>& getDistrib ( const K& k ) const {
|
||||
return HKYP::get(k);
|
||||
}
|
||||
|
||||
|
||||
P& setProb ( const Y& y, const K& k ) {
|
||||
pair<typename Y::BaseType,P>& yp = HKYP::set(k).add();
|
||||
yp.first = y;
|
||||
|
@ -36,7 +36,7 @@
|
||||
//
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
template <class Y,class X1,class X2>
|
||||
template <class Y,class X1,class X2>
|
||||
class CRF3DModeledRV : public Y {
|
||||
|
||||
private:
|
||||
@ -90,7 +90,7 @@ template <class Y,class X1,class X2> SafeArray5D<Id<int>,int,int,int,int,float>
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
template <class Y,class X1,class X2>
|
||||
template <class Y,class X1,class X2>
|
||||
Prob CRF3DModeledRV<Y,X1,X2>::getProb( const X1& x1, const X2& x2 ) const {
|
||||
|
||||
SafeArray2D<int,int,int> aaCnds ( cardOff, cardSh ) ;
|
||||
@ -131,7 +131,7 @@ Prob CRF3DModeledRV<Y,X1,X2>::getProb( const X1& x1, const X2& x2 ) const {
|
||||
for ( int configRghtValSite=0; configRghtValSite<(1<<bitsValSite); configRghtValSite++ )
|
||||
for ( int configValOverlap=0; configValOverlap<(1<<(bitsVal-bitsValSite)); configValOverlap++ ) {
|
||||
int configRghtVal = (configValOverlap<<bitsValSite)+configRghtValSite;
|
||||
// For each possible preceding trellis node...
|
||||
// For each possible preceding trellis node...
|
||||
for ( int configLeftValSite=0; configLeftValSite<(1<<bitsValSite); configLeftValSite++ ) {
|
||||
int configLeftVal = (configLeftValSite<<(bitsVal-bitsValSite))+configValOverlap;
|
||||
// Add product of result and previous trellis cell to current trellis cell...
|
||||
@ -158,7 +158,7 @@ Prob CRF3DModeledRV<Y,X1,X2>::getProb( const X1& x1, const X2& x2 ) const {
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
template <class Y,class X1,class X2>
|
||||
template <class Y,class X1,class X2>
|
||||
bool CRF3DModeledRV<Y,X1,X2>::readModelFields ( char* aps[], int numFields ) {
|
||||
if ( 7==numFields )
|
||||
setPotential ( X1(string(aps[1])), // globals
|
||||
@ -172,7 +172,7 @@ bool CRF3DModeledRV<Y,X1,X2>::readModelFields ( char* aps[], int numFields ) {
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
template <class Y,class X1,class X2>
|
||||
template <class Y,class X1,class X2>
|
||||
void CRF3DModeledRV<Y,X1,X2>::writeObservCliqueConfigs ( FILE* pf, int frame, const char* psMdl,
|
||||
const X1& x1, const X2& x2, bool bObsVal ) const {
|
||||
fprintf ( pf, "%04d> %s ", frame, psMdl );
|
||||
@ -199,7 +199,7 @@ void CRF3DModeledRV<Y,X1,X2>::writeObservCliqueConfigs ( FILE* pf, int frame, co
|
||||
//
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
template <class Y,class X1,class X2,class X3>
|
||||
template <class Y,class X1,class X2,class X3>
|
||||
class CRF4DModeledRV : public Y {
|
||||
|
||||
private:
|
||||
@ -247,13 +247,13 @@ template <class Y,class X1,class X2,class X3> int CRF4DModeledRV<Y,X1,X2,X3>::c
|
||||
template <class Y,class X1,class X2,class X3> int CRF4DModeledRV<Y,X1,X2,X3>::cardCnd = 0;
|
||||
template <class Y,class X1,class X2,class X3> int CRF4DModeledRV<Y,X1,X2,X3>::bitsVal = 0;
|
||||
template <class Y,class X1,class X2,class X3> int CRF4DModeledRV<Y,X1,X2,X3>::bitsValSite = 0;
|
||||
template <class Y,class X1,class X2,class X3> SafeArray5D<Id<int>,int,int,int,int,float>
|
||||
template <class Y,class X1,class X2,class X3> SafeArray5D<Id<int>,int,int,int,int,float>
|
||||
CRF4DModeledRV<Y,X1,X2,X3>::aaaaaPotentials;
|
||||
/* template <class Y,class X1,class X2> SafeArray3D<int> CRF4DModeledRV<Y,X1,X2>::aaaCnds; */
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
template <class Y,class X1,class X2,class X3>
|
||||
template <class Y,class X1,class X2,class X3>
|
||||
Prob CRF4DModeledRV<Y,X1,X2,X3>::getProb( const X1& x1, const X2& x2, const X3& x3 ) const {
|
||||
|
||||
SafeArray2D<int,int,int> aaCnds ( cardOff, cardSh ) ;
|
||||
@ -294,7 +294,7 @@ Prob CRF4DModeledRV<Y,X1,X2,X3>::getProb( const X1& x1, const X2& x2, const X3&
|
||||
for ( int configRghtValSite=0; configRghtValSite<(1<<bitsValSite); configRghtValSite++ )
|
||||
for ( int configValOverlap=0; configValOverlap<(1<<(bitsVal-bitsValSite)); configValOverlap++ ) {
|
||||
int configRghtVal = (configValOverlap<<bitsValSite)+configRghtValSite;
|
||||
// For each possible preceding trellis node...
|
||||
// For each possible preceding trellis node...
|
||||
for ( int configLeftValSite=0; configLeftValSite<(1<<bitsValSite); configLeftValSite++ ) {
|
||||
int configLeftVal = (configLeftValSite<<(bitsVal-bitsValSite))+configValOverlap;
|
||||
// Add product of result and previous trellis cell to current trellis cell...
|
||||
@ -321,7 +321,7 @@ Prob CRF4DModeledRV<Y,X1,X2,X3>::getProb( const X1& x1, const X2& x2, const X3&
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
template <class Y,class X1,class X2,class X3>
|
||||
template <class Y,class X1,class X2,class X3>
|
||||
bool CRF4DModeledRV<Y,X1,X2,X3>::readModelFields ( char* aps[], int numFields ) {
|
||||
if ( 7==numFields )
|
||||
setPotential ( X1(string(aps[1])), // globals
|
||||
@ -335,9 +335,9 @@ bool CRF4DModeledRV<Y,X1,X2,X3>::readModelFields ( char* aps[], int numFields )
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
template <class Y,class X1,class X2, class X3>
|
||||
template <class Y,class X1,class X2, class X3>
|
||||
void CRF4DModeledRV<Y,X1,X2,X3>::writeObservCliqueConfigs ( FILE* pf, int frame, const char* psMdl,
|
||||
const X1& x1, const X2& x2,
|
||||
const X1& x1, const X2& x2,
|
||||
const X3& x3, bool bObsVal ) const {
|
||||
fprintf ( pf, "%04d> %s ", frame, psMdl );
|
||||
// For each shape (feature slope)...
|
||||
|
@ -80,7 +80,7 @@ void VecE<N,I,RC>::read ( char* ps, const ReaderContext& rc ) {
|
||||
*/
|
||||
char* psT; int i=0;
|
||||
for ( char* psU=strtok_r(ps,",",&psT);
|
||||
psU && i<NUM_ENTS;
|
||||
psU && i<NUM_ENTS;
|
||||
psU=strtok_r(NULL,",",&psT),i++ )
|
||||
StaticSafeArray<N,I>::set(i) = psU;
|
||||
}
|
||||
@ -166,7 +166,7 @@ void VecV<N,I,RC,ND1,ND2>::read ( char* ps, VecVReaderContext& rc ) {
|
||||
// Chop into individual coinds strings...
|
||||
char* psT; int i=0;
|
||||
for ( char* psU=strtok_r(ps,",",&psT);
|
||||
psU && i<NUM_ENTS;
|
||||
psU && i<NUM_ENTS;
|
||||
psU=strtok_r(NULL,",",&psT), i++ )
|
||||
asV.set(i) = psU;
|
||||
|
||||
@ -230,7 +230,7 @@ class JointVecV { //// : public StaticSafeArray<V1::NUM_ENTS+V2::NUM_ENTS,I> {
|
||||
static const int NUM_ENTS;
|
||||
// Constructor / destructor methods...
|
||||
JointVecV ( ) { }
|
||||
JointVecV ( const V1& a1, const V2& a2 ) {
|
||||
JointVecV ( const V1& a1, const V2& a2 ) {
|
||||
////fprintf(stderr,"iJoin "); a1.V1::write(stderr); fprintf(stderr," "); a2.V2::write(stderr); fprintf(stderr,"\n");
|
||||
for (int i=0; i<NUM_ENTS; i++) {
|
||||
if ( i<V1::NUM_ENTS ) set(i) = (a1.get(i)==-1) ? IntType(-1) : (a1.get(i)<V1::NUM_ENTS) ? IntType(a1.get(i)) : a1.get(i)+V2::NUM_ENTS;
|
||||
|
@ -75,7 +75,7 @@ class ContDTree2DModel : public Generic2DModel<Y,X,P>, public Tree<ContDecisNode
|
||||
// Extraction methods...
|
||||
const P getProb ( const Y y, const X& x ) const {
|
||||
const Tree<ContDecisNode<Y,P> >* ptr = this;
|
||||
while ( !ptr->isTerm() ) {
|
||||
while ( !ptr->isTerm() ) {
|
||||
double sumsqr=0.0;
|
||||
for(A a;a<X::getSize();a.setNext()) sumsqr += pow(x.get(a.toInt()),2.0) / X::getSize();
|
||||
Wt wtdavg = -Tree<ContDecisNode<Y,P> >::getWt();
|
||||
@ -112,7 +112,7 @@ class ContDTree2DModel : public Generic2DModel<Y,X,P>, public Tree<ContDecisNode
|
||||
};
|
||||
|
||||
////////////////////
|
||||
template <class Y,class X, class P>
|
||||
template <class Y,class X, class P>
|
||||
bool ContDTree2DModel<Y,X,P>::readFields ( char* aps[], int numFields ) {
|
||||
if ( /*aps[0]==sId &&*/ (3==numFields || 4==numFields) ) {
|
||||
//fprintf(stderr,"%s,%d\n",aps[3],numFields);
|
||||
@ -171,7 +171,7 @@ class ContDTree3DModel : public Generic3DModel<Y,X1,X2,P> {
|
||||
};
|
||||
|
||||
////////////////////
|
||||
template <class Y,class X1,class X2, class P>
|
||||
template <class Y,class X1,class X2, class P>
|
||||
bool ContDTree3DModel<Y,X1,X2,P>::readFields ( char* aps[], int numFields ) {
|
||||
if ( /*aps[0]==sId &&*/ (4==numFields || 5==numFields) ) {
|
||||
//fprintf(stderr,"%s,%d\n",aps[3],numFields);
|
||||
@ -212,7 +212,7 @@ bool ContDTree3DModel<Y,X1,X2,P>::readFields ( char* aps[], int numFields ) {
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
template<class Y, class X, class P>
|
||||
class TrainableContDTree2DModel : public ContDTree2DModel<Y,X,P> {
|
||||
class TrainableContDTree2DModel : public ContDTree2DModel<Y,X,P> {
|
||||
private:
|
||||
List<Joint2DRV<X,Y> > lxy;
|
||||
public:
|
||||
@ -225,7 +225,7 @@ class TrainableContDTree2DModel : public ContDTree2DModel<Y,X,P> {
|
||||
void train ( List<Joint2DRV<X,Y> >&, const double ) ;
|
||||
void train ( const double d ) { train(lxy,d); }
|
||||
////// Input / output methods...
|
||||
bool readData ( char* vs[], int numFields ) {
|
||||
bool readData ( char* vs[], int numFields ) {
|
||||
if ( 3==numFields ) lxy.add() = Joint2DRV<X,Y> ( X(vs[1]), Y(vs[2]) );
|
||||
else return false;
|
||||
return true;
|
||||
@ -312,7 +312,7 @@ void TrainableContDTree2DModel<Y,X,P>::train ( List<Joint2DRV<X,Y> >& lxy, cons
|
||||
// if ( double(rand())/double(RAND_MAX) < prRarest/modelY.getProb(pxy->getSub2()) ) {
|
||||
|
||||
dCtr++;
|
||||
double gamma = dTot/(dTot+dCtr); // 1.0/(double(epoch)+dCtr/dTot); // 1.0/double(epoch); // 1.0/(double(epoch)+dCtr/(dTot*prRarest*2.0)); //
|
||||
double gamma = dTot/(dTot+dCtr); // 1.0/(double(epoch)+dCtr/dTot); // 1.0/double(epoch); // 1.0/(double(epoch)+dCtr/(dTot*prRarest*2.0)); //
|
||||
|
||||
// Weight deltas for next epoch...
|
||||
Wt wDelta = 0.0;
|
||||
@ -333,7 +333,7 @@ void TrainableContDTree2DModel<Y,X,P>::train ( List<Joint2DRV<X,Y> >& lxy, cons
|
||||
P prY = 1.0 / ( 1.0 + exp(-wtdavg) );
|
||||
|
||||
// Calc deltas for each feature/attribute/dimension...
|
||||
double dEachWt = 1.0/dTot; // 1.0/dTot * modelY.getProb ( Y(1-pxy->getSub2().toInt()) ); // 1.0/(dTot*prRarest*2.0); //
|
||||
double dEachWt = 1.0/dTot; // 1.0/dTot * modelY.getProb ( Y(1-pxy->getSub2().toInt()) ); // 1.0/(dTot*prRarest*2.0); //
|
||||
wDelta += dEachWt * -1 * ( prY - P(double(pxy->getSub2().toInt())) );
|
||||
for ( A a; a<X::getSize(); a.setNext() )
|
||||
awDeltas.set(a) += dEachWt * pxy->getSub1().get(a.toInt()) * ( prY - P(double(pxy->getSub2().toInt())) );
|
||||
@ -439,7 +439,7 @@ void TrainableContDTree2DModel<Y,X,P>::train ( List<Joint2DRV<X,Y> >& lxy, cons
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
template<class Y, class X1, class X2, class P>
|
||||
class TrainableContDTree3DModel : public ContDTree3DModel<Y,X1,X2,P> {
|
||||
class TrainableContDTree3DModel : public ContDTree3DModel<Y,X1,X2,P> {
|
||||
|
||||
private:
|
||||
|
||||
@ -455,7 +455,7 @@ class TrainableContDTree3DModel : public ContDTree3DModel<Y,X1,X2,P> {
|
||||
TrainableContDTree2DModel<Y,X2,P>& setTree(const X1& x1) { return static_cast<TrainableContDTree2DModel<Y,X2,P>&>(ContDTree3DModel<Y,X1,X2,P>::setTree(x1)); }
|
||||
|
||||
////// Add training data to per-subphone lists...
|
||||
bool readData ( char* vs[], int numFields ) {
|
||||
bool readData ( char* vs[], int numFields ) {
|
||||
if ( 4==numFields ) {
|
||||
mqlxy[X1(vs[1])].add() = Joint2DRV<X2,Y> ( X2(vs[2]), Y(vs[3]) );
|
||||
////mqlxy[X1(vs[1])].getLast()->write(stderr); fprintf(stderr,"\n");
|
||||
|
@ -129,8 +129,8 @@ class DTree2DModel : public Tree < typename X::ElementType, DecisNode<X,Y,P> > {
|
||||
friend StringInput operator>> ( pair<StringInput,DTree2DModel<Y,X,P>*> si_m, const char* psD ) {
|
||||
if (StringInput(NULL)==si_m.first) return si_m.first;
|
||||
Y y; String xs; StringInput si,si2; si=si_m.first; DTree2DModel<Y,X,P>* pm=si_m.second;
|
||||
while((si2=si>>" ")!=NULL)si=si2;
|
||||
si=si>>xs>>" ";
|
||||
while((si2=si>>" ")!=NULL)si=si2;
|
||||
si=si>>xs>>" ";
|
||||
while((si2=si>>" ")!=NULL)si=si2;
|
||||
// Find appropriate node, creating nodes as necessary...
|
||||
for(int i=1; i<int(strlen(xs.c_array()))-1; i++) {
|
||||
@ -140,22 +140,22 @@ class DTree2DModel : public Tree < typename X::ElementType, DecisNode<X,Y,P> > {
|
||||
|
||||
if ( si!=NULL && si[0]==':' ) {
|
||||
si=si>>": ";
|
||||
while((si2=si>>" ")!=NULL)si=si2;
|
||||
while((si2=si>>" ")!=NULL)si=si2;
|
||||
si=si>>y>>" ";
|
||||
while((si2=si>>" ")!=NULL)si=si2;
|
||||
while((si2=si>>" ")!=NULL)si=si2;
|
||||
si=si>>"= ";
|
||||
while((si2=si>>" ")!=NULL)si=si2;
|
||||
while((si2=si>>" ")!=NULL)si=si2;
|
||||
// Specify attribute number (at nonterminal) or probability in distribution (at terminal)...
|
||||
return (si!=NULL) ? si>>pm->setProb(y)>>psD : si;
|
||||
}
|
||||
else if ( si!=NULL && si[0]=='=' ) {
|
||||
si=si>>"= "; //cerr<<" in after equals "<<((si==NULL) ? "yes" : "no") << endl;
|
||||
while((si2=si>>" ")!=NULL)si=si2;
|
||||
while((si2=si>>" ")!=NULL)si=si2;
|
||||
|
||||
//m.setA() = atoi(si.c_str());
|
||||
int aVar = 0;
|
||||
si=si>>aVar>>psD;
|
||||
pm->setA()=aVar;
|
||||
si=si>>aVar>>psD;
|
||||
pm->setA()=aVar;
|
||||
////cerr<<" at end "<<((si==NULL) ? "yes" : "no") << endl;
|
||||
////cerr<<" m.getA() is "<< m.getA().toInt() << endl;
|
||||
return si;
|
||||
@ -169,15 +169,15 @@ class DTree2DModel : public Tree < typename X::ElementType, DecisNode<X,Y,P> > {
|
||||
si=si_m.first;
|
||||
sRt = si.c_str();
|
||||
if (sRt.find(':')!=string::npos) {
|
||||
while((si2=si>>" [")!=NULL)si=si2;
|
||||
si=si>>xs>>"] ";
|
||||
while((si2=si>>" ")!=NULL)si=si2;
|
||||
while((si2=si>>" [")!=NULL)si=si2;
|
||||
si=si>>xs>>"] ";
|
||||
while((si2=si>>" ")!=NULL)si=si2;
|
||||
si=si>>": ";
|
||||
while((si2=si>>" ")!=NULL)si=si2;
|
||||
while((si2=si>>" ")!=NULL)si=si2;
|
||||
si=si>>y>>" ";
|
||||
while((si2=si>>" ")!=NULL)si=si2;
|
||||
while((si2=si>>" ")!=NULL)si=si2;
|
||||
si=si>>"= ";
|
||||
|
||||
|
||||
// For DTree, must find the node labeled by X
|
||||
//Tree<B,DecisNode<X,Y,P> >* ptr = m;
|
||||
//assert(ptr);
|
||||
@ -189,15 +189,15 @@ class DTree2DModel : public Tree < typename X::ElementType, DecisNode<X,Y,P> > {
|
||||
// Specify attribute number (at nonterminal) or probability in distribution (at terminal)...
|
||||
return (si!=NULL) ? si>>m.setProb(y)>>psD : si;
|
||||
} else {
|
||||
while((si2=si>>" [")!=NULL)si=si2;
|
||||
while((si2=si>>" [")!=NULL)si=si2;
|
||||
si=si>>xs>>"] "; //cerr<<" in bracket "<<((si==NULL) ? "yes" : "no") << endl;
|
||||
while((si2=si>>" ")!=NULL)si=si2;
|
||||
while((si2=si>>" ")!=NULL)si=si2;
|
||||
si=si>>"= "; //cerr<<" in after equals "<<((si==NULL) ? "yes" : "no") << endl;
|
||||
|
||||
//m.setA() = atoi(si.c_str());
|
||||
int aVar = 0;
|
||||
si=si>>aVar>>psD;
|
||||
m.setA()=aVar;
|
||||
si=si>>aVar>>psD;
|
||||
m.setA()=aVar;
|
||||
//cerr<<" at end "<<((si==NULL) ? "yes" : "no") << endl;
|
||||
//cerr<<" m.getA() is "<< m.getA().toInt() << endl;
|
||||
return si;
|
||||
@ -209,7 +209,7 @@ class DTree2DModel : public Tree < typename X::ElementType, DecisNode<X,Y,P> > {
|
||||
};
|
||||
|
||||
////////////////////
|
||||
template <class Y,class X, class P>
|
||||
template <class Y,class X, class P>
|
||||
bool DTree2DModel<Y,X,P>::readFields ( Array<char*>& aps ) {
|
||||
if ( /*aps[0]==sId &&*/ (3==aps.size() || 4==aps.size()) ) {
|
||||
//fprintf(stderr,"%s,%d\n",aps[3],numFields);
|
||||
@ -269,7 +269,7 @@ class DTree3DModel {
|
||||
};
|
||||
|
||||
////////////////////
|
||||
template <class Y,class X1,class X2, class P>
|
||||
template <class Y,class X1,class X2, class P>
|
||||
bool DTree3DModel<Y,X1,X2,P>::readFields ( char* aps[], int numFields ) {
|
||||
if ( /*aps[0]==sId &&*/ (4==numFields || 5==numFields) ) {
|
||||
//fprintf(stderr,"%s,%d\n",aps[3],numFields);
|
||||
@ -307,7 +307,7 @@ bool DTree3DModel<Y,X1,X2,P>::readFields ( char* aps[], int numFields ) {
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
template<class Y, class X, class P>
|
||||
class TrainableDTree2DModel : public DTree2DModel<Y,X,P> {
|
||||
class TrainableDTree2DModel : public DTree2DModel<Y,X,P> {
|
||||
private:
|
||||
// Type members...
|
||||
typedef typename X::ElementType B;
|
||||
@ -485,7 +485,7 @@ void TrainableDTree2DModel<Y,X,P>::train ( List<Joint2DRV<X,Y> >& lxy, const De
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
template<class Y, class X1, class X2, class P>
|
||||
class TrainableDTree3DModel : public DTree3DModel<Y,X1,X2,P> {
|
||||
class TrainableDTree3DModel : public DTree3DModel<Y,X1,X2,P> {
|
||||
|
||||
private:
|
||||
|
||||
|
@ -34,7 +34,7 @@ class Matrix : public SafeArray2D<Id<int>,Id<int>,T> {
|
||||
Matrix ( ) : SafeArray2D<Id<int>,Id<int>,T>( ) { }//{ xSize=0; ySize=0; }
|
||||
Matrix (int x, int y) : SafeArray2D<Id<int>,Id<int>,T>(x,y) { }//{ xSize=x; ySize=y; }
|
||||
Matrix (int x, int y, const T& t) : SafeArray2D<Id<int>,Id<int>,T>(x,y,t) { }//{ xSize=x; ySize=y; }
|
||||
Matrix (const Matrix& a) : SafeArray2D<Id<int>,Id<int>,T>(a.xSize(),a.ySize()) { //xSize=a.xSize; ySize=a.ySize;
|
||||
Matrix (const Matrix& a) : SafeArray2D<Id<int>,Id<int>,T>(a.xSize(),a.ySize()) { //xSize=a.xSize; ySize=a.ySize;
|
||||
for(int i=0;i<xSize();i++) for(int j=0;j<ySize();j++) this->set(i,j)=a.get(i,j); }
|
||||
// Specification methods...
|
||||
//Matrix& operator= ( const Matrix<T>& sat )
|
||||
@ -195,34 +195,34 @@ class Matrix : public SafeArray2D<Id<int>,Id<int>,T> {
|
||||
}
|
||||
return false;
|
||||
}
|
||||
bool operator== ( const Matrix<T>& a ) const {
|
||||
bool operator== ( const Matrix<T>& a ) const {
|
||||
if (xSize()!=a.xSize() || ySize()!=a.ySize()) return false;
|
||||
for (int i=0;i<a.xSize();i++)
|
||||
for (int i=0;i<a.xSize();i++)
|
||||
for (int j=0;j<a.ySize();j++)
|
||||
if (this->get(Id<int>(i),Id<int>(j))!=a.get(Id<int>(i),Id<int>(j))) return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
// Input/output methods...
|
||||
friend ostream& operator<< ( ostream& os, const Matrix<T>& a ) {
|
||||
friend ostream& operator<< ( ostream& os, const Matrix<T>& a ) {
|
||||
os<<"\n ";
|
||||
for (int i=0;i<a.xSize();i++) {
|
||||
for (int j=0;j<a.ySize();j++) {
|
||||
os<<((j==0)?"":",")<<a.get(Id<int>(i),Id<int>(j));
|
||||
}
|
||||
}
|
||||
os<<(i==a.xSize()-1?"\n":"\n ");
|
||||
}
|
||||
return os;
|
||||
return os;
|
||||
}
|
||||
friend String& operator<< ( String& str, const Matrix<T>& a ) {
|
||||
friend String& operator<< ( String& str, const Matrix<T>& a ) {
|
||||
str<<"\n ";
|
||||
for (int i=0;i<a.xSize();i++) {
|
||||
for (int j=0;j<a.ySize();j++) {
|
||||
str<<((j==0)?"":",")<<a.get(Id<int>(i),Id<int>(j));
|
||||
}
|
||||
}
|
||||
str<<";";
|
||||
}
|
||||
return str;
|
||||
return str;
|
||||
}
|
||||
string getString( ) const;
|
||||
|
||||
@ -234,7 +234,7 @@ string Matrix<T>::getString() const {
|
||||
for (int j=0;j<ySize();j++) {
|
||||
str += ((j==0)?"":",");
|
||||
str += this->get(Id<int>(i),Id<int>(j));
|
||||
}
|
||||
}
|
||||
str += ";";
|
||||
}
|
||||
return str;
|
||||
|
@ -43,7 +43,7 @@ static const PDFVal VARIANCE_THRESHOLD = 0.01; //0.0001; //0
|
||||
//
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
template <class Y>
|
||||
template <class Y>
|
||||
class DiagGauss1DModel : public Generic1DModel<Y,PDFVal> {
|
||||
private:
|
||||
// Member variables...
|
||||
@ -53,7 +53,7 @@ class DiagGauss1DModel : public Generic1DModel<Y,PDFVal> {
|
||||
SimpleHash<Id<int>,PDFVal> aMeans;
|
||||
SimpleHash<Id<int>,PDFVal> aVariances;
|
||||
PDFVal prInvRootNormVariances;
|
||||
PDFVal prProduct;
|
||||
PDFVal prProduct;
|
||||
SimpleHash<Id<int>,PDFVal> algprNegHalfInvVariances;
|
||||
public:
|
||||
// Constructor / destructor methods...
|
||||
@ -78,7 +78,7 @@ class DiagGauss1DModel : public Generic1DModel<Y,PDFVal> {
|
||||
};
|
||||
|
||||
////////////////////////////////////////
|
||||
template <class Y>
|
||||
template <class Y>
|
||||
inline void DiagGauss1DModel<Y>::precomputeVarianceTerms ( ) {
|
||||
// Inverse square root of norm of variances...
|
||||
setInvRootNormVar() = 1.0;
|
||||
@ -92,7 +92,7 @@ inline void DiagGauss1DModel<Y>::precomputeVarianceTerms ( ) {
|
||||
}
|
||||
|
||||
////////////////////////////////////////
|
||||
template <class Y>
|
||||
template <class Y>
|
||||
inline PDFVal DiagGauss1DModel<Y>::getProb ( const Y& y ) const {
|
||||
// fprintf(stderr,"--------------------\n");
|
||||
// y.write(stderr);
|
||||
@ -109,7 +109,7 @@ inline PDFVal DiagGauss1DModel<Y>::getProb ( const Y& y ) const {
|
||||
}
|
||||
|
||||
////////////////////////////////////////
|
||||
template <class Y>
|
||||
template <class Y>
|
||||
bool DiagGauss1DModel<Y>::readFields ( char* as[], int numFields ) {
|
||||
if ( 0==strcmp(as[1],"m") && numFields>2 ) {
|
||||
char* psT;
|
||||
@ -126,12 +126,12 @@ bool DiagGauss1DModel<Y>::readFields ( char* as[], int numFields ) {
|
||||
}
|
||||
|
||||
////////////////////////////////////////
|
||||
template <class Y>
|
||||
template <class Y>
|
||||
void DiagGauss1DModel<Y>::writeFields ( FILE* pf, const string& sPref ) const {
|
||||
fprintf(pf,"%s m = ",sPref.c_str());
|
||||
for(int i=0; i<getNumFeats(); i++) fprintf(pf,"%s%f",(0==i)?"":"_",getMean(i));
|
||||
fprintf ( pf, "\n" ) ;
|
||||
|
||||
|
||||
fprintf(pf,"%s v = ",sPref.c_str());
|
||||
for(int i=0; i<getNumFeats(); i++) fprintf(pf,"%s%f",(0==i)?"":"_",getVariance(i));
|
||||
fprintf ( pf, "\n" ) ;
|
||||
@ -141,7 +141,7 @@ void DiagGauss1DModel<Y>::writeFields ( FILE* pf, const string& sPref ) const {
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/*
|
||||
template <class Y,class X>
|
||||
template <class Y,class X>
|
||||
class DiagGauss2DModel : public Generic2DModel<Y,X,PDFVal> {
|
||||
private:
|
||||
// Member variables...
|
||||
@ -177,7 +177,7 @@ class DiagGauss2DModel : public Generic2DModel<Y,X,PDFVal> {
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
template <class Y,class X1,class X2>
|
||||
template <class Y,class X1,class X2>
|
||||
class DiagGauss3DModel : public Generic3DModel<Y,X1,X2,PDFVal> {
|
||||
private:
|
||||
// Member variables...
|
||||
@ -220,7 +220,7 @@ class DiagGauss3DModel : public Generic3DModel<Y,X1,X2,PDFVal> {
|
||||
//
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
template <class Y>
|
||||
template <class Y>
|
||||
class TrainableDiagGauss1DModel : public DiagGauss1DModel<Y> {
|
||||
public:
|
||||
TrainableDiagGauss1DModel ( ) : DiagGauss1DModel<Y>() { }
|
||||
|
@ -54,7 +54,7 @@ class SimpleHash : public hash_map<X,Y,SimpleHashFn<X>,SimpleHashEqual<X> > /*pu
|
||||
// tr1::unordered_map<X,Y,SimpleHashFn<X>,SimpleHashEqual<X> > mxy;
|
||||
static const Y yDummy;
|
||||
//static Y yNonconstDummy;
|
||||
|
||||
|
||||
public:
|
||||
// typedef typename OrigHash::const_iterator const_iterator;
|
||||
// typedef typename OrigHash::iterator iterator;
|
||||
|
@ -209,7 +209,7 @@ template <class MY, class MX, class S, class B>
|
||||
void HMM<MY,MX,S,B>::debugPrint() const{
|
||||
|
||||
for (int frame=0, numFrames=aatnTrellis.getxSize(); frame<numFrames; frame++) {
|
||||
|
||||
|
||||
for (int beamIndex=0, beamSize=aatnTrellis.getySize(); beamIndex<beamSize; beamIndex++) {
|
||||
|
||||
if (aatnTrellis.get(frame,beamIndex).getLogProb().toDouble() > 0) {
|
||||
@ -306,7 +306,7 @@ void HMM<MY,MX,S,B>::updateRanked ( const typename MX::RandVarType& x, bool b1 )
|
||||
// Add best transition (top of queue)...
|
||||
//mx.getProb(o,my.setTrellDat(ashpiQueue.getTop().first,ashpiQueue.getTop().second));
|
||||
if ( ashpiQueue.getSize() > 0 ) {
|
||||
S s; my.setTrellDat(s,ashpiQueue.getTop().second);
|
||||
S s; my.setTrellDat(s,ashpiQueue.getTop().second);
|
||||
bFull |= btn.tryAdd ( s, IB(ashpiQueue.getTop().first,my.setBackDat(ashpiQueue.getTop().second)), ashpiQueue.getTop().third );
|
||||
////cerr<<ashpiQueue.getSize()<<" queue elems A "<<ashpiQueue.getTop()<<"\n";
|
||||
////cerr<<"/-----A-----\\\n"<<ashpiQueue<<"\\-----A-----/\n";
|
||||
@ -379,7 +379,7 @@ void HMM<MY,MX,S,B>::updateSerial ( const typename MX::RandVarType& x ) {
|
||||
// Incorporate into trellis...
|
||||
btn.tryAdd ( s, IB(i,my.setBackDat(y)), lgprFull );
|
||||
//if(OUTPUT_VERYNOISY)
|
||||
// fprintf ( stderr," (S_t-1:[e^%0.6f] * Y:e^%0.6f * X:e^%0.6f = S_t:[e^%0.6f])\n",
|
||||
// fprintf ( stderr," (S_t-1:[e^%0.6f] * Y:e^%0.6f * X:e^%0.6f = S_t:[e^%0.6f])\n",
|
||||
// float(aatnTrellis.get(frameLast-1,i).getLogProb().toInt())/100.0,
|
||||
// float(lgprY.toInt())/100.0,
|
||||
// float(lgprX.toInt())/100.0,
|
||||
@ -389,7 +389,7 @@ void HMM<MY,MX,S,B>::updateSerial ( const typename MX::RandVarType& x ) {
|
||||
}
|
||||
|
||||
// for(int i=0;i<BEAM_WIDTH;i++) {
|
||||
// fprintf(stderr,"> "); btn.get(i)->first.write(stderr); fprintf(stderr,"\n");
|
||||
// fprintf(stderr,"> "); btn.get(i)->first.write(stderr); fprintf(stderr,"\n");
|
||||
// }
|
||||
|
||||
btn.sort(atnSorted);
|
||||
@ -429,8 +429,8 @@ void HMM<MY,MX,S,B>::each ( const typename MX::RandVarType& x, Beam<LogProb,S,IB
|
||||
const TrellNode<S,B>& tnsbPrev = aatnTrellis.get(frameLast-1,i);
|
||||
// If prob still not below beam minimum...
|
||||
if ( tnsbPrev.getLogProb() > btn.getMin().getScore() ) {
|
||||
//if (OUTPUT_VERYNOISY) { fprintf(stderr,"FROM: "); tnsbPrev.getId().write(stderr); fprintf(stderr,"\n"); }
|
||||
|
||||
//if (OUTPUT_VERYNOISY) { fprintf(stderr,"FROM: "); tnsbPrev.getId().write(stderr); fprintf(stderr,"\n"); }
|
||||
|
||||
// For each possible transition...
|
||||
const S& sPrev = tnsbPrev.getId();
|
||||
typename MY::IterVal y;
|
||||
@ -447,7 +447,7 @@ void HMM<MY,MX,S,B>::each ( const typename MX::RandVarType& x, Beam<LogProb,S,IB
|
||||
lgprX = mx.getProb(x,my.setTrellDat(s,y)); if ( !OUTPUT_VERYNOISY && LogProb()==lgprX ) continue;
|
||||
#endif /////////////////////////////////////////////////////////////////
|
||||
lgprFull = tnsbPrev.getLogProb() * lgprY * lgprX;
|
||||
if (OUTPUT_VERYNOISY) {
|
||||
if (OUTPUT_VERYNOISY) {
|
||||
boost::mutex::scoped_lock lock1(mutexHmmParanoiaLock);
|
||||
//fprintf(stderr," TO: "); y.write(stderr); fprintf(stderr,"\n");
|
||||
cout<<" "<<tnsbPrev.getId()<<" ==("<<tnsbPrev.getLogProb().toInt()<<"*"<<lgprY.toInt()<<"*"<<lgprX.toInt()<<"="<<lgprFull.toInt()<<")==> "<<y<<"\n";
|
||||
@ -459,7 +459,7 @@ void HMM<MY,MX,S,B>::each ( const typename MX::RandVarType& x, Beam<LogProb,S,IB
|
||||
// Incorporate into trellis...
|
||||
btn.tryAdd ( s, IB(i,my.setBackDat(y)), lgprFull );
|
||||
// if(OUTPUT_VERYNOISY)
|
||||
// fprintf ( stderr," (S_t-1:[e^%0.6f] * Y:e^%0.6f * X:e^%0.6f = S_t:[e^%0.6f])\n",
|
||||
// fprintf ( stderr," (S_t-1:[e^%0.6f] * Y:e^%0.6f * X:e^%0.6f = S_t:[e^%0.6f])\n",
|
||||
// float(aatnTrellis.get(frameLast-1,i).getLogProb().toInt())/100.0,
|
||||
// float(lgprY.toInt())/100.0,
|
||||
// float(lgprO.toInt())/100.0,
|
||||
@ -695,7 +695,7 @@ std::list<string> HMM<MY,MX,S,B>::getMLS(const S& sLast) const {
|
||||
//// sprintf(tmp,"HYPOTH %04d> ", fr-1);
|
||||
//// string tString(tmp);
|
||||
//// tString +=
|
||||
string tString =
|
||||
string tString =
|
||||
//// aatnTrellis.get(fr,iBest).getId().getString() + " " +
|
||||
aatnTrellis.get(fr,iBest).getBackData().getString()
|
||||
//// + "\n"
|
||||
@ -737,7 +737,7 @@ template <class MY, class MX, class S, class B>
|
||||
void HMM<MY,MX,S,B>::writeCurr ( ostream& os, int f=-1 ) const {
|
||||
if ( -1==f ) f=frameLast;
|
||||
if ( 0<=f && f<=frameLast )
|
||||
for ( int i=0; i<BEAM_WIDTH; i++ )
|
||||
for ( int i=0; i<BEAM_WIDTH; i++ )
|
||||
if(!(aatnTrellis.get(f,i).getLogProb() == LogProb())){
|
||||
//fprintf(pf,"at f=%04d b=%04d: ",f,i);
|
||||
os<<"at "<<std::setfill('0')<<std::setw(4)<<f<<" "<<std::setw(4)<<i<<": ";
|
||||
@ -765,7 +765,7 @@ void HMM<MY,MX,S,B>::writeCurrSum ( FILE* pf, int f=-1 ) const {
|
||||
if ( 0<=f && f<=frameLast ) {
|
||||
LogProb sum = 0.0;
|
||||
LogProb logtop = 0.0;
|
||||
for ( int i=0; i<BEAM_WIDTH; i++ )
|
||||
for ( int i=0; i<BEAM_WIDTH; i++ )
|
||||
if(!(aatnTrellis.get(f,i).getLogProb() == LogProb())){
|
||||
if(i==0) { logtop=aatnTrellis.get(f,i).getLogProb(); }
|
||||
LogProb big1 = sum - logtop;
|
||||
@ -818,7 +818,7 @@ void HMM<MY,MX,S,B>::gatherElementsInBeam( SafeArray1D<Id<int>,pair<S,LogProb> >
|
||||
result->init(BEAM_WIDTH);
|
||||
if ( -1==f ) f=frameLast;
|
||||
if ( 0<=f && f<=frameLast ) {
|
||||
for ( int i=0; i<BEAM_WIDTH && &(aatnTrellis.get(f,i))!=NULL; i++ ) {
|
||||
for ( int i=0; i<BEAM_WIDTH && &(aatnTrellis.get(f,i))!=NULL; i++ ) {
|
||||
result->set(i).first = aatnTrellis.get(f,i).getId();
|
||||
result->set(i).second = aatnTrellis.get(f,i).getLogProb();
|
||||
}
|
||||
@ -836,7 +836,7 @@ void HMM<MY,MX,S,B>::writeCurrEntropy ( FILE* pf, int f=-1 ) const {
|
||||
if ( 0<=f && f<=frameLast ) {
|
||||
LogProb logh = 0.0;
|
||||
LogProb logtop = 0.0;
|
||||
for ( int i=0; i<BEAM_WIDTH; i++ )
|
||||
for ( int i=0; i<BEAM_WIDTH; i++ )
|
||||
if(!(aatnTrellis.get(f,i).getLogProb() == LogProb())){
|
||||
if(i==0) { logtop=aatnTrellis.get(f,i).getLogProb(); }
|
||||
LogProb big1 = logh - logtop;
|
||||
@ -862,12 +862,12 @@ void HMM<MY,MX,S,B>::writeCurrDepths ( FILE* pf, int f=-1 ) const {
|
||||
Array<int> depths = Array<int>();
|
||||
Array<LogProb> logprobs = Array<LogProb>();
|
||||
double avgdepth = 0.0;
|
||||
for ( int i=0; i<BEAM_WIDTH; i++ )
|
||||
for ( int i=0; i<BEAM_WIDTH; i++ )
|
||||
if(!(aatnTrellis.get(f,i).getLogProb() == LogProb())){
|
||||
|
||||
if(i==0) { logtop=aatnTrellis.get(f,i).getLogProb(); }
|
||||
logprobs.set(i) = aatnTrellis.get(f,i).getLogProb();
|
||||
|
||||
|
||||
// loop over values in S node to find lowest meaningful depth
|
||||
for ( int j=0; j<aatnTrellis.get(f,i).getId().first.getSize(); j++) {
|
||||
// store the depth, if it's equal to G_BOT/G_BOT
|
||||
@ -996,7 +996,7 @@ int HMM<MY,MX,S,B>::getBeamUsed ( int f=-1 ) const {
|
||||
if ( -1==f ) f=frameLast;
|
||||
int ctr=0;
|
||||
if ( 0<=f && f<=frameLast )
|
||||
for ( int i=0; i<BEAM_WIDTH; i++ )
|
||||
for ( int i=0; i<BEAM_WIDTH; i++ )
|
||||
if(!(aatnTrellis.get(f,i).getLogProb() == LogProb())){
|
||||
ctr++;
|
||||
}
|
||||
|
@ -269,7 +269,7 @@ void HMM<MH,MO,X,B>::updateRanked ( const typename MO::RandVarType& o ) {
|
||||
// Add best transition (top of queue)...
|
||||
//mo.getProb(o,mh.setTrellDat(axhpiQueue.getTop().first,axhpiQueue.getTop().second));
|
||||
if ( axhpiQueue.getSize() > 0 ) {
|
||||
X x; mh.setTrellDat(x,axhpiQueue.getTop().second);
|
||||
X x; mh.setTrellDat(x,axhpiQueue.getTop().second);
|
||||
bFull |= btn.tryAdd ( x, IB(axhpiQueue.getTop().first,mh.setBackDat(axhpiQueue.getTop().second)), axhpiQueue.getTop().third );
|
||||
//cerr<<axhpiQueue.getSize()<<" queue elems A "<<axhpiQueue.getTop()<<"\n";
|
||||
//cerr<<"/-----A-----\\\n + bFull: "<<bFull<<"\naxhpiQueue: \n"<<axhpiQueue<<"\\-----A-----/\n";
|
||||
@ -341,7 +341,7 @@ void HMM<MH,MO,X,B>::updateSerial ( const typename MO::RandVarType& o ) {
|
||||
// Incorporate into trellis...
|
||||
btn.tryAdd ( x, IB(i,mh.setBackDat(h)), lgprFull );
|
||||
//if(OUTPUT_VERYNOISY)
|
||||
// fprintf ( stderr," (X_t-1:[e^%0.6f] * H:e^%0.6f * O:e^%0.6f = X_t:[e^%0.6f])\n",
|
||||
// fprintf ( stderr," (X_t-1:[e^%0.6f] * H:e^%0.6f * O:e^%0.6f = X_t:[e^%0.6f])\n",
|
||||
// float(aatnTrellis.get(frameLast-1,i).getLogProb().toInt())/100.0,
|
||||
// float(lgprH.toInt())/100.0,
|
||||
// float(lgprO.toInt())/100.0,
|
||||
@ -351,7 +351,7 @@ void HMM<MH,MO,X,B>::updateSerial ( const typename MO::RandVarType& o ) {
|
||||
}
|
||||
|
||||
// for(int i=0;i<BEAM_WIDTH;i++) {
|
||||
// fprintf(stderr,"> "); btn.get(i)->first.write(stderr); fprintf(stderr,"\n");
|
||||
// fprintf(stderr,"> "); btn.get(i)->first.write(stderr); fprintf(stderr,"\n");
|
||||
// }
|
||||
|
||||
btn.sort(atnSorted);
|
||||
@ -390,8 +390,8 @@ void HMM<MH,MO,X,B>::each ( const typename MO::RandVarType& o, Beam<LogProb,X,IB
|
||||
const TrellNode<X,B>& tnxbPrev = aatnTrellis.get(frameLast-1,i);
|
||||
// If prob still not below beam minimum...
|
||||
if ( tnxbPrev.getLogProb() > btn.getMin().getScore() ) {
|
||||
//if (OUTPUT_VERYNOISY) { fprintf(stderr,"FROM: "); tnxbPrev.getId().write(stderr); fprintf(stderr,"\n"); }
|
||||
|
||||
//if (OUTPUT_VERYNOISY) { fprintf(stderr,"FROM: "); tnxbPrev.getId().write(stderr); fprintf(stderr,"\n"); }
|
||||
|
||||
// For each possible transition...
|
||||
const X& xPrev = tnxbPrev.getId();
|
||||
typename MH::IterVal h;
|
||||
@ -408,7 +408,7 @@ void HMM<MH,MO,X,B>::each ( const typename MO::RandVarType& o, Beam<LogProb,X,IB
|
||||
lgprO = mo.getProb(o,mh.setTrellDat(x,h)); if ( !OUTPUT_VERYNOISY && LogProb()==lgprO ) continue;
|
||||
#endif /////////////////////////////////////////////////////////////////
|
||||
lgprFull = tnxbPrev.getLogProb() * lgprH * lgprO;
|
||||
if (OUTPUT_VERYNOISY) {
|
||||
if (OUTPUT_VERYNOISY) {
|
||||
boost::mutex::scoped_lock lock1(mutexHmmParanoiaLock);
|
||||
//fprintf(stderr," TO: "); h.write(stderr); fprintf(stderr,"\n");
|
||||
cout<<" "<<tnxbPrev.getId()<<" ==("<<tnxbPrev.getLogProb().toInt()<<"*"<<lgprH.toInt()<<"*"<<lgprO.toInt()<<"="<<lgprFull.toInt()<<")==> "<<h<<"\n";
|
||||
@ -420,7 +420,7 @@ void HMM<MH,MO,X,B>::each ( const typename MO::RandVarType& o, Beam<LogProb,X,IB
|
||||
// Incorporate into trellis...
|
||||
btn.tryAdd ( x, IB(i,mh.setBackDat(h)), lgprFull );
|
||||
// if(OUTPUT_VERYNOISY)
|
||||
// fprintf ( stderr," (X_t-1:[e^%0.6f] * H:e^%0.6f * O:e^%0.6f = X_t:[e^%0.6f])\n",
|
||||
// fprintf ( stderr," (X_t-1:[e^%0.6f] * H:e^%0.6f * O:e^%0.6f = X_t:[e^%0.6f])\n",
|
||||
// float(aatnTrellis.get(frameLast-1,i).getLogProb().toInt())/100.0,
|
||||
// float(lgprH.toInt())/100.0,
|
||||
// float(lgprO.toInt())/100.0,
|
||||
@ -656,7 +656,7 @@ std::list<string> HMM<MH,MO,X,B>::getMLS(const X& xLast) const {
|
||||
//// sprintf(tmp,"HYPOTH %04d> ", fr-1);
|
||||
//// string tString(tmp);
|
||||
//// tString +=
|
||||
string tString =
|
||||
string tString =
|
||||
//// aatnTrellis.get(fr,iBest).getId().getString() + " " +
|
||||
aatnTrellis.get(fr,iBest).getBackData().getString()
|
||||
//// + "\n"
|
||||
@ -697,7 +697,7 @@ template <class MH, class MO, class X, class B>
|
||||
void HMM<MH,MO,X,B>::writeCurr ( FILE* pf, int f=-1 ) const {
|
||||
if ( -1==f ) f=frameLast;
|
||||
if ( 0<=f && f<=frameLast )
|
||||
for ( int i=0; i<BEAM_WIDTH; i++ )
|
||||
for ( int i=0; i<BEAM_WIDTH; i++ )
|
||||
if(!(aatnTrellis.get(f,i).getLogProb() == LogProb())){
|
||||
fprintf(pf,"at f=%04d b=%04d: ",f,i);
|
||||
String str; str<<aatnTrellis.get(f,i).getId(); //.write(pf);
|
||||
@ -721,7 +721,7 @@ void HMM<MH,MO,X,B>::writeCurrSum ( FILE* pf, int f=-1 ) const {
|
||||
if ( 0<=f && f<=frameLast ) {
|
||||
LogProb sum = 0.0;
|
||||
LogProb logtop = 0.0;
|
||||
for ( int i=0; i<BEAM_WIDTH; i++ )
|
||||
for ( int i=0; i<BEAM_WIDTH; i++ )
|
||||
if(!(aatnTrellis.get(f,i).getLogProb() == LogProb())){
|
||||
if(i==0) { logtop=aatnTrellis.get(f,i).getLogProb(); }
|
||||
LogProb big1 = sum - logtop;
|
||||
@ -741,7 +741,7 @@ void HMM<MH,MO,X,B>::writeCurrEntropy ( FILE* pf, int f=-1 ) const {
|
||||
if ( 0<=f && f<=frameLast ) {
|
||||
LogProb logh = 0.0;
|
||||
LogProb logtop = 0.0;
|
||||
for ( int i=0; i<BEAM_WIDTH; i++ )
|
||||
for ( int i=0; i<BEAM_WIDTH; i++ )
|
||||
if(!(aatnTrellis.get(f,i).getLogProb() == LogProb())){
|
||||
if(i==0) { logtop=aatnTrellis.get(f,i).getLogProb(); }
|
||||
LogProb big1 = logh - logtop;
|
||||
@ -768,12 +768,12 @@ void HMM<MH,MO,X,B>::writeCurrDepths ( FILE* pf, int f=-1 ) const {
|
||||
Array<int> depths = Array<int>();
|
||||
Array<LogProb> logprobs = Array<LogProb>();
|
||||
double avgdepth = 0.0;
|
||||
for ( int i=0; i<BEAM_WIDTH; i++ )
|
||||
for ( int i=0; i<BEAM_WIDTH; i++ )
|
||||
if(!(aatnTrellis.get(f,i).getLogProb() == LogProb())){
|
||||
|
||||
if(i==0) { logtop=aatnTrellis.get(f,i).getLogProb(); }
|
||||
logprobs.set(i) = aatnTrellis.get(f,i).getLogProb();
|
||||
|
||||
|
||||
// loop over values in S node to find lowest meaningful depth
|
||||
for ( int j=0; j<aatnTrellis.get(f,i).getId().first.getSize(); j++) {
|
||||
// store the depth, if it's equal to G_BOT/G_BOT
|
||||
@ -900,7 +900,7 @@ int HMM<MH,MO,X,B>::getBeamUsed ( int f=-1 ) const {
if ( -1==f ) f=frameLast;
int ctr=0;
if ( 0<=f && f<=frameLast )
for ( int i=0; i<BEAM_WIDTH; i++ )
if(!(aatnTrellis.get(f,i).getLogProb() == LogProb())){
ctr++;
}
@ -348,7 +348,7 @@ const TrellNode<S,B>& HMMLoop<MY,MX,S,B>::update ( const typename MX::RandVarTyp
//modX.getProb(o,modY.setTrellDat(ashpiQueue.getTop().first,ashpiQueue.getTop().second));
if ( ashpiQueue.getSize() > 0 ) {
S s ( ashpiQueue.getTop().second );
////S s; modY.setTrellDat(s,ashpiQueue.getTop().second);
bFull |= btn.tryAdd ( s, IB(ashpiQueue.getTop().first,B(ashpiQueue.getTop().second)), ashpiQueue.getTop().third );
////cerr<<ashpiQueue.getSize()<<" queue elems A "<<ashpiQueue.getTop()<<"\n";
////cerr<<"/-----A-----\\\n"<<ashpiQueue<<"\\-----A-----/\n";
@ -90,8 +90,8 @@ class Vector : public X {
Vector<X> operator- ( ElementType d ) const { Vector<X> vO; for(uint i=0;i<SIZE;i++) vO[i] = X::get(i)-d; return vO; }
friend Vector<X> operator* ( ElementType d, const Vector<X>& v ) { Vector<X> vO; for(uint i=0;i<SIZE;i++) vO[i] = d*v[i]; return vO; }
friend Vector<X> operator/ ( ElementType d, const Vector<X>& v ) { Vector<X> vO; for(uint i=0;i<SIZE;i++) vO[i] = d/v[i]; return vO; }
friend Vector<X> operator+ ( ElementType d, const Vector<X>& v ) { Vector<X> vO; for(uint i=0;i<SIZE;i++) vO[i] = d+v[i]; return vO; }
friend Vector<X> operator- ( ElementType d, const Vector<X>& v ) { Vector<X> vO; for(uint i=0;i<SIZE;i++) vO[i] = d-v[i]; return vO; }
Vector<X>& operator*= ( ElementType d ) { for(uint i=0;i<SIZE;i++) X::set(i)*=d; return *this; }
Vector<X>& operator/= ( ElementType d ) { for(uint i=0;i<SIZE;i++) X::set(i)/=d; return *this; }
Vector<X>& operator+= ( ElementType d ) { for(uint i=0;i<SIZE;i++) X::set(i)+=d; return *this; }
@ -97,7 +97,7 @@ class Mixture3DModel : public Generic2DModel<Y,X,Prob> {
//
////////////////////////////////////////////////////////////////////////////////

template <template <class MY> class M,class Y,class C>
class TrainableMixture2DModel : public Mixture2DModel<M,Y,C> {
// private:
// LogPDFVal logpdfPrevDataAvg;
@ -110,7 +110,7 @@ class TrainableMixture2DModel : public Mixture2DModel<M,Y,C> {
};

////////////////////////////////////////
template <template <class MY> class M,class Y,class C>
void TrainableMixture2DModel<M,Y,C>::updateFields ( const List<Joint2DRV<Y,Prob> >& lyp, const PDFVal WEIGHT_LIMIT, bool& bShouldStop ) {
LogPDFVal logpdfData = 0.0;
CPT1DModel<C,Prob> mprPseudoEmpC; // pseudo-empirical prob marginal
@ -178,7 +178,7 @@ void TrainableMixture2DModel<M,Y,C>::updateFields ( const List<Joint2DRV<Y,Prob>
}

////////////////////////////////////////
template <template <class MY> class M,class Y,class C>
void TrainableMixture2DModel<M,Y,C>::train ( List<Joint2DRV<Y,Prob> >& lyp, const int EPOCH_LIMIT, const PDFVal WEIGHT_LIMIT ) {

// Normalize model...
@ -204,7 +204,7 @@ void TrainableMixture2DModel<M,Y,C>::train ( List<Joint2DRV<Y,Prob> >& lyp, cons
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////

template <template <class MY> class M,class Y,class X,class C>
class TrainableMixture3DModel : public Generic2DModel<Y,X,C> {
private:
string sId;
@ -225,7 +225,7 @@ class TrainableMixture3DModel : public Generic2DModel<Y,X,C> {
};

////////////////////////////////////////
template <template <class MY> class M,class Y,class X,class C>
void TrainableMixture3DModel<M,Y,X,C>::train ( const int EPOCH_LIMIT, const PDFVal WEIGHT_LIMIT ) {
// Update each subphone from list...
int ctr = 0;
@ -237,7 +237,7 @@ void TrainableMixture3DModel<M,Y,X,C>::train ( const int EPOCH_LIMIT, const PDFV
}

////////////////////////////////////////
template <template <class MY> class M,class Y,class X,class C>
void TrainableMixture3DModel<M,Y,X,C>::train ( const List<Joint3DRV<X,Y,Prob> >& lxyp, const int EPOCH_LIMIT, const PDFVal WEIGHT_LIMIT ) {
// Chop list into phone-specific sub-lists...
ListedObject<Joint3DRV<X,Y,Prob> >* pxyp;
@ -248,7 +248,7 @@ void TrainableMixture3DModel<M,Y,X,C>::train ( const List<Joint3DRV<X,Y,Prob> >&
}

////////////////////////////////////////
template <template <class MY> class M,class Y,class X,class C>
bool TrainableMixture3DModel<M,Y,X,C>::readData ( char* as[], int numFields ) {
if ( /*as[0]!=sId+"dat" ||*/ numFields!=3 ) return false;
alyp.set(X(as[1])).add() = Joint2DRV<Y,Prob>(Y(as[2]),Prob(1.0));
@ -256,7 +256,7 @@ bool TrainableMixture3DModel<M,Y,X,C>::readData ( char* as[], int numFields ) {
}

////////////////////////////////////////
template <template <class MY> class M,class Y,class X,class C>
void TrainableMixture3DModel<M,Y,X,C>::writeFields ( FILE* pf, string sPref ) {
X x; for ( bool b=x.setFirst(); b; b=x.setNext() ) {
am.get(x).writeFields(pf,sPref+" "+x.getString());
@ -37,7 +37,7 @@ void processModelFilePtr ( FILE* pf, bool rF(Array<char*>&) ) {
int i=0; int numFields=0; int c=' '; int line=1;
CONSUME_ALL(pf,c,WHITESPACE(c),line); // Get to first record
while ( c!=EOF ) { // For each record
if ( c=='#' ) CONSUME_ALL(pf, c, c!='\n' && c!='\0', line ) ; // If comment, consume
else { // If no comment,
Array<char*> aps(100);
String psBuff(1000);
@ -49,7 +49,7 @@ void processModelFilePtr ( FILE* pf, bool rF(Array<char*>&) ) {
if (!z) break;
aps[i]=z;
}

if ( !rF(aps) ) // Try to process fields, else complain
fprintf( stderr, "\nERROR: %d %d-arg %s in line %d\n\n", numFields, aps.size(), aps[0], line);
}
@ -75,7 +75,7 @@ void processModelSocket ( const int tSockfd, int& c, bool rF(Array<char*>&) ) {
int i=0; int numFields=0; int line=1;
CONSUME_ALL_SOCKET(tSockfd,c,WHITESPACE(c),line); // Get to first record
while ( c!='\0' && c!='\5' ) { // For each record
if ( c=='#' ) CONSUME_ALL_SOCKET(tSockfd, c, (c!='\n' && c!='\0' && c!='\5'), line ) ; // If comment, consume
else { // If no comment,
Array<char*> aps(100);
String psBuff(1000);
@ -88,7 +88,7 @@ void processModelSocket ( const int tSockfd, int& c, bool rF(Array<char*>&) ) {
if (!z) break;
aps[i]=z;
}

if ( !rF(aps) ) // Try to process fields, else complain
fprintf( stderr, "\nERROR: %d-arg %s in line %d\n\n", numFields, aps[0], line);
}
@ -97,7 +97,7 @@ void processModelSocket ( const int tSockfd, int& c, bool rF(Array<char*>&) ) {
}

void processModelSocket ( const int tSockfd, bool rF(Array<char*>&) ) {
int c=' ';
processModelSocket ( tSockfd, c, rF );
}
@ -80,12 +80,12 @@ class binuint {
// Input / output methods...
friend StringInput operator>> ( StringInput si, binuint& i ) {
if(si==NULL) return si;
i.b=0;
for ( char c=si[0]; '0'<=c && c<='1'; ++si,c=si[0])
{ i.b=i.b*2+c-'0'; }
return si; }
friend ostream& operator<< ( ostream& os, binuint i ) { for(int e=uint(log2(i.b));e>=0;e--)os <<((i.b>>e)%2); return os; }
friend String& operator<< ( String& str, binuint i ) { for(int e=uint(log2(i.b));e>=0;e--)str<<((i.b>>e)%2); return str; }
};

////////////////////////////////////////////////////////////////////////////////
@ -43,7 +43,7 @@ class Prob {
Prob ( ) { gVal = 0.0; }
Prob (double d) { gVal = d; }
Prob (const char* ps) { gVal = atof(ps); }

operator double() const { return gVal; }
double toDouble() const { return gVal; }
Prob& operator+= ( const Prob p ) { gVal += p.gVal; return *this; }
@ -54,7 +54,7 @@ class Prob {
friend ostream& operator<< ( ostream& os, const Prob& pr ) { return os<<pr.toDouble(); }
friend String& operator<< ( String& str, const Prob& pr ) { return str<<pr.toDouble(); }
friend pair<StringInput,Prob*> operator>> ( StringInput si, Prob& n ) { return pair<StringInput,Prob*>(si,&n); }
friend StringInput operator>> ( pair<StringInput,Prob*> si_n, const char* psDlm ) {
double d=0.0; StringInput si=si_n.first>>d>>psDlm; *si_n.second=Prob(d); return si; }
};

@ -129,7 +129,7 @@ class LogProb : public Id<int> {
friend ostream& operator<< ( ostream& os, const LogProb& lp ) { return os<<lp.toInt(); }
friend String& operator<< ( String& str, const LogProb& lp ) { return str<<lp.toInt(); }
friend pair<StringInput,LogProb*> operator>> ( StringInput si, LogProb& n ) { return pair<StringInput,LogProb*>(si,&n); }
friend StringInput operator>> ( pair<StringInput,LogProb*> si_n, const char* psDlm ) {
double d=0.0; StringInput si=si_n.first>>d>>psDlm; *si_n.second=LogProb(d); return si; }
};
@ -33,7 +33,7 @@
//
////////////////////////////////////////////////////////////////////////////////

template<class Y,class P>
class Generic1DModel {
public:
typedef Y RVType;
@ -45,7 +45,7 @@ class Generic1DModel {

////////////////////////////////////////////////////////////

template<class Y,class X1,class P>
class Generic2DModel {
public:
typedef Y RVType;
@ -60,7 +60,7 @@ class Generic2DModel {

////////////////////////////////////////////////////////////

template<class Y,class X1,class X2,class P>
class Generic3DModel {
public:
typedef Y RVType;
@ -76,7 +76,7 @@ class Generic3DModel {

////////////////////////////////////////////////////////////

template<class Y,class X1,class X2,class X3,class P>
class Generic4DModel {
public:
typedef Y RVType;
@ -93,7 +93,7 @@ class Generic4DModel {

////////////////////////////////////////////////////////////

template<class Y,class X1,class X2,class X3,class X4,class P>
class Generic5DModel {
public:
typedef Y RVType;
@ -111,7 +111,7 @@ class Generic5DModel {

////////////////////////////////////////////////////////////

template<class Y,class X1,class X2,class X3,class X4,class X5,class P>
class Generic6DModel {
public:
typedef Y RVType;
@ -130,7 +130,7 @@ class Generic6DModel {

////////////////////////////////////////////////////////////

template<class Y,class X1,class X2,class X3,class X4,class X5,class X6,class P>
class Generic7DModel {
public:
typedef Y RVType;
@ -302,7 +302,7 @@ class Modeled5DRV : public M::RVType {
const typename M::Dep2Type& x2,
const typename M::Dep3Type& x3,
const typename M::Dep4Type& x4 ) const { return m.getProb(*this,x1,x2,x3,x4); }

};

///////////////////////////////////////////////////////////////////////////////
@ -346,7 +346,7 @@ class Modeled6DRV : public M::RVType {
const typename M::Dep3Type& x3,
const typename M::Dep4Type& x4,
const typename M::Dep5Type& x5 ) const { return m.getProb(*this,x1,x2,x3,x4,x5); }

};

///////////////////////////////////////////////////////////////////////////////
@ -395,7 +395,7 @@ class Modeled7DRV : public M::RVType {
const typename M::Dep4Type& x4,
const typename M::Dep5Type& x5,
const typename M::Dep6Type& x6 ) const { return m.getProb(*this,x1,x2,x3,x4,x5,x6); }

};

///////////////////////////////////////////////////////////////////////////////
@ -42,7 +42,7 @@ class GenericRACPTModel : public SimpleHash<K,P> {
return ( SimpleHash<K,P>::contains(k) );
}

/*
P getProb ( const IterVal& ikyp, const K& k ) const {
if ( ikyp.iter.first == ikyp.iter.second ) { cerr<<"ERROR: no iterator to fix probability: "<<k<<endl; return P(); }
return ( ikyp.iter.first->second );
@ -91,7 +91,7 @@ class GenericRACPTModel : public SimpleHash<K,P> {
for ( typename HKP::const_iterator ik=HKP::begin(); ik!=HKP::end(); ik++ ) {
K k=ik->first;
os << psId<<" "<<k<<" = "<<getProb(k).toDouble()<<endl;

// IterVal y;
// for ( bool b=setFirst(y,k); b; b=setNext(y,k) )
// os<<psId<<" "<<k<<" : "<<y<<" = "<<getProb(y,k).toDouble()<<"\n";
@ -110,14 +110,14 @@ class GenericRACPTModel : public SimpleHash<K,P> {

friend pair<StringInput,GenericRACPTModel<K,P>*> operator>> ( StringInput si, GenericRACPTModel<K,P>& m ) {
return pair<StringInput,GenericRACPTModel<K,P>*>(si,&m); }

friend StringInput operator>> ( pair<StringInput,GenericRACPTModel<K,P>*> delimbuff, const char* psD ) {
K k;
StringInput si,si2,si3;
GenericRACPTModel<K,P>& m = *delimbuff.second;
si=delimbuff.first;
if ( si==NULL ) return si;

// Kill the colon since we're treating the whole thing as the condition
char * str = si.c_str();
char * p = strchr(str, ':');
@ -125,17 +125,17 @@ class GenericRACPTModel : public SimpleHash<K,P> {
p[0] = ' ';
}
si=str;
while((si2=si>>" ")!=NULL)si=si2;
si=si>>k>>" ";
while((si2=si>>" ")!=NULL)si=si2;
si=si>>"= ";
while((si2=si>>" ")!=NULL)si=si2;
return (si!=NULL) ? si>>m.setProb(k)>>psD : si;
}
};


template<class Y, class P>
class RandAccCPT1DModel : public GenericRACPTModel<MapKey1D<Y>,P> {
public:
// typedef typename GenericCPTModel<Y,MapKey1D<Unit>,P>::IterVal IterVal;
@ -170,7 +170,7 @@ P& setProb ( const Y& y ) {


////////////////////
template<class Y, class X1, class P>
class RandAccCPT2DModel : public GenericRACPTModel<MapKey2D<X1,Y>,P> {
public:

@ -187,7 +187,7 @@ class RandAccCPT2DModel : public GenericRACPTModel<MapKey2D<X1,Y>,P> {
P getProb ( const Y& y, const X1& x1 ) const {
return GenericRACPTModel<MapKey2D<X1,Y>,P>::getProb ( MapKey2D<X1,Y>(x1,y) );
}

/*
P& setProb ( const Y& y, const X1& x1 ) {
cerr << "setProb called on racpt2d" << endl;
@ -199,7 +199,7 @@ class RandAccCPT2DModel : public GenericRACPTModel<MapKey2D<X1,Y>,P> {


////////////////////
template<class Y, class X1, class X2, class P>
class RandAccCPT3DModel : public GenericRACPTModel<MapKey3D<X1,X2,Y>,P> {
public:

@ -219,7 +219,7 @@ class RandAccCPT3DModel : public GenericRACPTModel<MapKey3D<X1,X2,Y>,P> {

/*
////////////////////
template<class Y, class X1, class X2, class X3, class P>
class CPT4DModel : public GenericCPTModel<Y,MapKey3D<X1,X2,X3>,P> {
public:
typedef typename GenericCPTModel<Y,MapKey3D<X1,X2,X3>,P>::IterVal IterVal;
@ -256,7 +256,7 @@ class CPT4DModel : public GenericCPTModel<Y,MapKey3D<X1,X2,X3>,P> {


////////////////////
template<class Y, class X1, class X2, class X3, class X4, class P>
class CPT5DModel : public GenericCPTModel<Y,MapKey4D<X1,X2,X3,X4>,P> {
public:
typedef typename GenericCPTModel<Y,MapKey4D<X1,X2,X3,X4>,P>::IterVal IterVal;
@ -293,7 +293,7 @@ class CPT5DModel : public GenericCPTModel<Y,MapKey4D<X1,X2,X3,X4>,P> {


////////////////////
template<class Y, class X1, class X2, class X3, class X4, class X5, class P>
class RACPT6DModel : public GenericCPTModel<Y,MapKey5D<X1,X2,X3,X4,X5>,P> {
public:
typedef typename GenericCPTModel<Y,MapKey5D<X1,X2,X3,X4,X5>,P>::IterVal IterVal;
@ -129,7 +129,7 @@ class DiscreteDomainRV : public Id<T> {
friend pair<StringInput,DiscreteDomainRV<T,domain>*> operator>> ( const StringInput ps, DiscreteDomainRV<T,domain>& rv ) { return pair<StringInput,DiscreteDomainRV<T,domain>*>(ps,&rv); }
friend StringInput operator>> ( pair<StringInput,DiscreteDomainRV<T,domain>*> delimbuff, const char* psDlm ) {
if (StringInput(NULL)==delimbuff.first) return delimbuff.first;
////assert(*delimbuff.second<domain.getSize());
int j=0;
StringInput psIn = delimbuff.first;
if(psDlm[0]=='\0') { *delimbuff.second=psIn.c_str(); return psIn+strlen(psIn.c_str()); }
@ -203,7 +203,7 @@ template <class T> const T RefRV<T>::DUMMY;
////////////////////////////////////////////////////////////////////////////////

////////////////////////////////////////////////////////////
template<class V1,class V2>
class Joint2DRV {

public:
@ -216,7 +216,7 @@ class Joint2DRV {
Joint2DRV ( const V1& v1, const V2& v2 ) { first=v1; second=v2; }

// Extraction methods...
size_t getHashKey ( ) const { size_t k=rotLeft(first.getHashKey(),3); k^=second.getHashKey();
/*fprintf(stderr," (%d) %d ^& %d = %d\n",sizeof(*this),x1.getHashKey(),x2.getHashKey(),k);*/ return k; }
bool operator< ( const Joint2DRV<V1,V2>& j ) const { return ( (first<j.first) ||
(first==j.first && second<j.second) ); }
@ -276,7 +276,7 @@ class DelimitedJoint2DRV : public Joint2DRV<V1,V2> {
friend pair<StringInput,DelimitedJoint2DRV<SD1,V1,SD2,V2,SD3>*> operator>> ( StringInput ps, DelimitedJoint2DRV<SD1,V1,SD2,V2,SD3>& rv ) { return pair<StringInput,DelimitedJoint2DRV<SD1,V1,SD2,V2,SD3>*>(ps,&rv); }
friend StringInput operator>> ( pair<StringInput,DelimitedJoint2DRV<SD1,V1,SD2,V2,SD3>*> delimbuff, const char* psDlm ) {
if (StringInput(NULL)==delimbuff.first) return delimbuff.first;
return ( (SD3[0]=='\0') ? delimbuff.first>>SD1>>delimbuff.second->first>>SD2>>delimbuff.second->second>>psDlm
: delimbuff.first>>SD1>>delimbuff.second->first>>SD2>>delimbuff.second->second>>SD3>>psDlm );
}
};
@ -290,7 +290,7 @@ class DelimitedJoint2DRV : public Joint2DRV<V1,V2> {
//
////////////////////////////////////////////////////////////////////////////////

template<class V1,class V2,class V3>
class Joint3DRV {

public:
@ -361,7 +361,7 @@ class DelimitedJoint3DRV : public Joint3DRV<V1,V2,V3> {
return pair<StringInput,DelimitedJoint3DRV<SD1,V1,SD2,V2,SD3,V3,SD4>*>(ps,&rv); }
friend StringInput operator>> ( pair<StringInput,DelimitedJoint3DRV<SD1,V1,SD2,V2,SD3,V3,SD4>*> delimbuff, const char* psDlm ) {
if (StringInput(NULL)==delimbuff.first) return delimbuff.first;
return ( (SD4[0]=='\0') ? delimbuff.first>>SD1>>delimbuff.second->first>>SD2>>delimbuff.second->second>>SD3>>delimbuff.second->third>>psDlm
: delimbuff.first>>SD1>>delimbuff.second->first>>SD2>>delimbuff.second->second>>SD3>>delimbuff.second->third>>SD4>>psDlm );
}
};
@ -453,7 +453,7 @@ class DelimitedJoint4DRV : public Joint4DRV<V1,V2,V3,V4> {
//
////////////////////////////////////////////////////////////////////////////////

template <int I, class T>
class JointArrayRV {
private:
// Data members...
@ -491,7 +491,7 @@ class JointArrayRV {

////////////////////////////////////////////////////////////////////////////////

template <int I, char* SD, class T>
class DelimitedJointArrayRV : public JointArrayRV<I,T> {
public:

@ -569,7 +569,7 @@ class History {
/*
void read ( char* ps, const ReaderContext& rc=ReaderContext() ) { char* psT; for(int i=0;i<N;i++){char* z=strtok_r((0==i)?ps:NULL,";",&psT); assert(z); at.set(i).read(z);} }
//at.set(i).read(strtok_r((0==i)?ps:NULL,";",&psT)); }
*/

friend ostream& operator<< ( ostream& os, const History<N,T>& a ) { for(int i=0;i<N;i++)os<<((i==0)?"":";")<<a.getBack(i); return os; }
friend pair<StringInput,History<N,T>*> operator>> ( StringInput ps, History<N,T>& a ) { return pair<StringInput,History<N,T>*>(ps,&a); }
@ -30,7 +30,7 @@
#include "nl-stream.h"

#include <iostream>
using namespace std;

////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
@ -39,7 +39,7 @@ using namespace std;
//
////////////////////////////////////////////////////////////////////////////////

template <int I, class T>
class StaticSafeArray {
private:
// Data members...
@ -84,7 +84,7 @@ class StaticSafeArray {

////////////////////////////////////////////////////////////////////////////////

template <int I, char* SD, class T>
class DelimitedStaticSafeArray : public StaticSafeArray<I,T> {
public:
DelimitedStaticSafeArray ( ) : StaticSafeArray<I,T>() { }
@ -349,7 +349,7 @@ class SafeArray2D {
// Extraction methods...
const T& get (const X1& x,const X2& y) const { assert(at!=NULL);
assert(x.toInt()>=0); assert(x.toInt()<xSize);
assert(y.toInt()>=0);
//this assert failed when compile without -DNDEBUG (needed for debugging). Have to figure out why before adding this assert back in
//assert(y.toInt()<ySize);
return at[x.toInt()*ySize + y.toInt()];}
@ -423,7 +423,7 @@ class SafeArray4D {
{ delete[] at; wSize=sat.wSize; xSize=sat.xSize; ySize=sat.ySize;
zSize=sat.zSize; at=new T[wSize*xSize*ySize*zSize];
for(int i=0;i<wSize*xSize*ySize*zSize;i++) at[i]=sat.at[i]; return *this; }
void init (int w,int x,int y,int z)
{ delete[] at; wSize=w; xSize=x; ySize=y; zSize=z; at=new T[w*x*y*z]; }
void init (int w,int x,int y,int z,const T& t)
{ delete[] at; wSize=w; xSize=x; ySize=y; zSize=z; at=new T[w*x*y*z];
@ -472,7 +472,7 @@ class SafeArray5D {
{ delete[] at; vSize=sat.vSize; wSize=sat.wSize; xSize=sat.xSize;
ySize=sat.ySize; zSize=sat.zSize; at=new T[vSize*wSize*xSize*ySize*zSize];
for(int i=0;i<vSize*wSize*xSize*ySize*zSize;i++) at[i]=sat.at[i]; return *this; }
void init(int v,int w,int x,int y,int z)
{ delete[] at; vSize=v; wSize=w; xSize=x; ySize=y; zSize=z; at=new T[v*w*x*y*z]; }
void init(int v,int w,int x,int y,int z,const T& t)
{ delete[] at; vSize=v; wSize=w; xSize=x; ySize=y; zSize=z; at=new T[v*w*x*y*z];
@ -86,7 +86,7 @@ class IStream {
friend ostream& operator<< ( ostream& os, const IStream& is ) { return os<<is.iIndex<<","<<is.psrc<<","<<*is.psrc; }

// Match single char...
friend IStream operator>> ( IStream is, char& c ) {
// Propagate fail...
if (IStream()==is) return IStream();
c=is.get(is.iIndex);
@ -106,7 +106,7 @@ class IStream {

// Match anything else followed by zero-terminated string delimiter...
template<class X> friend pair<IStream,X*> operator>> ( IStream is, X& x ) { return pair<IStream,X*>(is,&x); }
template<class X> friend IStream operator>> ( pair<IStream,X*> is_x, const char* psDlm ) {
IStream& is = is_x.first;
X& x = *is_x.second;
// Propagate fail...
@ -129,7 +129,7 @@ class IStream {
}

// Match integer followed by zero-terminated string delimiter...
friend IStream operator>> ( pair<IStream,int*> is_x, const char* psDlm ) {
IStream& is = is_x.first;
int& x = *is_x.second;
// Propagate fail...
@ -151,7 +151,7 @@ class IStream {
}

// Match unsigned int followed by zero-terminated string delimiter...
friend IStream operator>> ( pair<IStream,unsigned int*> is_x, const char* psDlm ) {
IStream& is = is_x.first;
unsigned int& x = *is_x.second;
// Propagate fail...
@ -173,7 +173,7 @@ class IStream {
}

// Match float followed by zero-terminated string delimiter...
friend IStream operator>> ( pair<IStream,float*> is_x, const char* psDlm ) {
IStream& is = is_x.first;
float& x = *is_x.second;
// Propagate fail...
@ -195,7 +195,7 @@ class IStream {
}

// Match double followed by zero-terminated string delimiter...
friend IStream operator>> ( pair<IStream,double*> is_x, const char* psDlm ) {
IStream& is = is_x.first;
double& x = *is_x.second;
// Propagate fail...
@ -217,7 +217,7 @@ class IStream {
}

// Match void pointer followed by zero-terminated string delimiter...
friend IStream operator>> ( pair<IStream,void**> is_x, const char* psDlm ) {
IStream& is = is_x.first;
// Propagate fail...
if (IStream()==is) return IStream();
@ -68,13 +68,13 @@ class StringInput {
friend StringInput operator>> ( StringInput psIn, const char* psDlm ) {
if (StringInput(NULL)==psIn) return psIn;
int i;
for (i=0; psIn[i]!='\0' && psDlm[i]!='\0'; i++)
if(psIn[i]!=psDlm[i]) return StringInput(NULL); //psIn;
return (psDlm[i]!='\0') ? StringInput(NULL) : (psIn[i]!='\0') ? psIn+i : SI_EOS;
}

friend pair<StringInput,int*> operator>> ( StringInput ps, int& n ) { return pair<StringInput,int*>(ps,&n); }
friend StringInput operator>> ( pair<StringInput,int*> delimbuff, const char* psDlm ) {
if (StringInput(NULL)==delimbuff.first) return delimbuff.first;
///int i; for(i=0;psIn[i]!='\0';i++) if(psIn[i]==psDlm[i]) return psIn; return psIn+i;
int j=0;
@ -90,7 +90,7 @@ class StringInput {
}

friend pair<StringInput,unsigned int*> operator>> ( StringInput ps, unsigned int& n ) { return pair<StringInput,unsigned int*>(ps,&n); }
friend StringInput operator>> ( pair<StringInput,unsigned int*> delimbuff, const char* psDlm ) {
if (StringInput(NULL)==delimbuff.first) return delimbuff.first;
///int i; for(i=0;psIn[i]!='\0';i++) if(psIn[i]==psDlm[i]) return psIn; return psIn+i;
int j=0;
@ -106,7 +106,7 @@ class StringInput {
}

friend pair<StringInput,double*> operator>> ( StringInput ps, double& d ) { return pair<StringInput,double*>(ps,&d); }
friend StringInput operator>> ( pair<StringInput,double*> delimbuff, const char* psDlm ) {
if (StringInput(NULL)==delimbuff.first) return delimbuff.first;
///int i; for(i=0;psIn[i]!='\0';i++) if(psIn[i]==psDlm[i]) return psIn; return psIn+i;
int j=0;
@ -191,7 +191,7 @@ class String : public Array<char> {
friend pair<StringInput,String*> operator>> ( const StringInput ps, String& s ) { return pair<StringInput,String*>(ps,&s); }
friend StringInput operator>> ( pair<StringInput,String*> delimbuff, const char* psDlm ) {
if (StringInput(NULL)==delimbuff.first) return delimbuff.first;
////assert(*delimbuff.second<domain.getSize());
int j=0;
StringInput psIn = delimbuff.first;
if(psDlm[0]=='\0') { *delimbuff.second=String(psIn.c_str()); return psIn+strlen(psIn.c_str()); }
@ -38,7 +38,7 @@ class StringIndex{
map <string, int> msi;
map <int, string> mis;
int maxIndex;

public:

// Constructor / destructor methods...
@ -22,7 +22,7 @@
///////////////////////////////////////////////////////////////////////////////

/***********************************************
* nl-tetrahex.h
* a little header with some base conversion stuff
* so that we can represent base 16, 32 or 64 with
* one character.
@ -41,7 +41,7 @@ class Timer {
}
double elapsed ( ) { // in milliseconds.
return (double(kept.tv_sec)*1000.0 + double(kept.tv_usec)/1000.0);
//struct timeval end; gettimeofday(&end,NULL);
//double beg_time_s = (double) beg.tv_sec + (double) ((double)beg.tv_usec / 1000000.0);
//double end_time_s = (double) end.tv_sec + (double) ((double)end.tv_usec / 1000000.0);
//return ( (end_time_s - beg_time_s) * 1000.0 );
@ -136,7 +136,7 @@ class Rd : public DiscreteDomainRV<int,domRd> {
}
if (!hToG.contains(*this)) {
size_t i=s.find(',');
assert(i!=string::npos);
hToG.set(*this) = G(s.substr(i+1).c_str());
if ( '1'==s[0] )
hFromG.set(G(s.substr(i+1).c_str())) = *this;
@ -42,11 +42,11 @@ typedef HidVarCPT2DModel<P,C,LogProb> PgivCModel;
class WModel {
private:
TrainableDTree2DModel<P,W,LogProb> modPgivWdt;

RandAccCPT2DModel<P,W,LogProb> modPgivWs;
RandAccCPT1DModel<P,LogProb> modP;
RandAccCPT1DModel<W,LogProb> modW;

public:
//LogProb getProb ( const W& w, const HidVarCPT1DModel<P,LogProb>::IterVal& p ) const {
LogProb getProb ( const W& w, const P::ArrayIterator<LogProb>& p ) const {
@ -93,8 +93,8 @@ class OModel {
};

typedef DistribModeledWgivC RandVarType;


void calcProb ( OModel::RandVarType& o, const W& w ) const {
o.clear();
@ -106,7 +106,7 @@ class OModel {
for (LogProb pr=modPgivC.setIterProb(p,c,aCtr); pr!=LogProb(); pr = modPgivC.setIterProb(p,c,aCtr=0) ){
o.setProb(c) += modPgivC.getProb(p,c).toProb() * modWgivP.getProb(w,p).toProb();
}

}
}

@ -134,7 +134,7 @@ class XModel {
RandAccCPT2DModel<P,W,Prob> modPgivW;
RandAccCPT1DModel<P,Prob> modP;
RandAccCPT1DModel<W,Prob> modW;

public:

typedef X RandVarType;
@ -11,12 +11,12 @@ namespace lm {
namespace ngram {
namespace trie {

DontBhiksha::DontBhiksha(const void * /*base*/, uint64_t /*max_offset*/, uint64_t max_next, const Config &/*config*/) :
next_(util::BitsMask::ByMax(max_next)) {}

const uint8_t kArrayBhikshaVersion = 0;

// TODO: put this in binary file header instead when I change the binary file format again.
void ArrayBhiksha::UpdateConfigFromBinary(const BinaryFormat &file, uint64_t offset, Config &config) {
uint8_t buffer[2];
file.ReadForConfig(buffer, 2, offset);
@ -33,7 +33,7 @@ uint8_t ChopBits(uint64_t max_offset, uint64_t max_next, const Config &config) {
uint8_t required = util::RequiredBits(max_next);
uint8_t best_chop = 0;
int64_t lowest_change = std::numeric_limits<int64_t>::max();
// There are probably faster ways but I don't care because this is only done once per order at construction time.
for (uint8_t chop = 0; chop <= std::min(required, config.pointer_bhiksha_bits); ++chop) {
int64_t change = (max_next >> (required - chop)) * 64 /* table cost in bits */
- max_offset * static_cast<int64_t>(chop); /* savings in bits*/
@ -7,7 +7,7 @@
* pages={388--391},
* }
*
* Currently only used for next pointers.
*/

#ifndef LM_BHIKSHA_H
@ -86,9 +86,9 @@ class ArrayBhiksha {
// assert(end_it == std::upper_bound(offset_begin_, offset_end_, index + 1));
--end_it;
// assert(end_it >= begin_it);
out.begin = ((begin_it - offset_begin_) << next_inline_.bits) |
util::ReadInt57(base, bit_offset, next_inline_.bits, next_inline_.mask);
out.end = ((end_it - offset_begin_) << next_inline_.bits) |
util::ReadInt57(base, bit_offset + total_bits, next_inline_.bits, next_inline_.mask);
// If this fails, consider rebuilding your model using KenLM after 1e333d786b748555e8f368d2bbba29a016c98052
assert(out.end >= out.begin);
@ -135,7 +135,7 @@ void MatchCheck(ModelType model_type, unsigned int search_version, const Paramet

const std::size_t kInvalidSize = static_cast<std::size_t>(-1);

BinaryFormat::BinaryFormat(const Config &config)
: write_method_(config.write_method), write_mmap_(config.write_mmap), load_method_(config.load_method),
header_size_(kInvalidSize), vocab_size_(kInvalidSize), vocab_string_offset_(kInvalidOffset) {}

@ -19,18 +19,18 @@ namespace ngram {

extern const char *kModelNames[6];

/*Inspect a file to determine if it is a binary lm. If not, return false.
* If so, return true and set recognized to the type. This is the only API in
* this header designed for use by decoder authors.
*/
bool RecognizeBinary(const char *file, ModelType &recognized);

struct FixedWidthParameters {
unsigned char order;
float probing_multiplier;
// What type of model is this?
ModelType model_type;
// Does the end of the file have the actual strings in the vocabulary?
bool has_vocabulary;
unsigned int search_version;
};
@ -38,7 +38,7 @@ struct FixedWidthParameters {
// This is a macro instead of an inline function so constants can be assigned using it.
#define ALIGN8(a) ((std::ptrdiff_t(((a)-1)/8)+1)*8)

// Parameters stored in the header of a binary file.
struct Parameters {
FixedWidthParameters fixed;
std::vector<uint64_t> counts;
@ -79,7 +79,7 @@ class BinaryFormat {
const char *write_mmap_;
util::LoadMethod load_method_;

// File behind memory, if any.
util::scoped_fd file_;

// If there is a file involved, a single mapping.
@ -15,9 +15,9 @@ namespace ngram {
* kNoExtensionBackoff. If the n-gram might be extended, then out_state must
* contain the full n-gram, in which case kExtensionBackoff is set. In any
* case, if an n-gram has non-zero backoff, the full state is returned so
* backoff can be properly charged.
* These differ only in sign bit because the backoff is in fact zero in either
* case.
*/
const float kNoExtensionBackoff = -0.0;
const float kExtensionBackoff = 0.0;
@ -28,7 +28,7 @@ inline void SetExtension(float &backoff) {
if (backoff == kNoExtensionBackoff) backoff = kExtensionBackoff;
}

// This compiles down nicely.
inline bool HasExtension(const float &backoff) {
typedef union { float f; uint32_t i; } UnionValue;
UnionValue compare, interpret;
@ -56,7 +56,7 @@ void Usage(const char *name, const char *default_mem) {
exit(1);
}

// I could really use boost::lexical_cast right about now.
float ParseFloat(const char *from) {
char *end;
float ret = strtod(from, &end);
@ -114,7 +114,7 @@ class CollapseStream {
current_(NULL, NGram::OrderFromSize(position.GetChain().EntrySize())),
prune_threshold_(prune_threshold),
prune_words_(prune_words),
block_(position) {
StartBlock();
}

@ -125,27 +125,27 @@ class CollapseStream {

CollapseStream &operator++() {
assert(block_);

if (current_.begin()[1] == kBOS && current_.Base() < copy_from_) {
memcpy(current_.Base(), copy_from_, current_.TotalSize());
UpdateCopyFrom();

// Mark highest order n-grams for later pruning
if(current_.Count() <= prune_threshold_) {
current_.Mark();
}

if(!prune_words_.empty()) {
for(WordIndex* i = current_.begin(); i != current_.end(); i++) {
if(prune_words_[*i]) {
current_.Mark();
break;
}
}
}

}

current_.NextInMemory();
uint8_t *block_base = static_cast<uint8_t*>(block_->Get());
if (current_.Base() == block_base + block_->ValidSize()) {
@ -153,21 +153,21 @@ class CollapseStream {
++block_;
StartBlock();
}

// Mark highest order n-grams for later pruning
if(current_.Count() <= prune_threshold_) {
current_.Mark();
}

if(!prune_words_.empty()) {
for(WordIndex* i = current_.begin(); i != current_.end(); i++) {
if(prune_words_[*i]) {
current_.Mark();
break;
}
}
}

return *this;
}

@ -180,21 +180,21 @@ class CollapseStream {
current_.ReBase(block_->Get());
copy_from_ = static_cast<uint8_t*>(block_->Get()) + block_->ValidSize();
UpdateCopyFrom();

// Mark highest order n-grams for later pruning
if(current_.Count() <= prune_threshold_) {
current_.Mark();
}

if(!prune_words_.empty()) {
for(WordIndex* i = current_.begin(); i != current_.end(); i++) {
if(prune_words_[*i]) {
current_.Mark();
break;
}
}
}

}

// Find last without bos.
@ -222,18 +222,18 @@ void AdjustCounts::Run(const util::stream::ChainPositions &positions) {
StatCollector stats(order, counts_, counts_pruned_, discounts_);
if (order == 1) {

// Only unigrams. Just collect stats.
for (NGramStream full(positions[0]); full; ++full) {

// Do not prune <s> </s> <unk>
if(*full->begin() > 2) {
if(full->Count() <= prune_thresholds_[0])
full->Mark();

if(!prune_words_.empty() && prune_words_[*full->begin()])
full->Mark();
}

stats.AddFull(full->UnmarkedCount(), full->IsMarked());
}

@ -243,7 +243,7 @@ void AdjustCounts::Run(const util::stream::ChainPositions &positions) {

NGramStreams streams;
streams.Init(positions, positions.size() - 1);

CollapseStream full(positions[positions.size() - 1], prune_thresholds_.back(), prune_words_);

// Initialization: <unk> has count 0 and so does <s>.
@ -261,7 +261,7 @@ void AdjustCounts::Run(const util::stream::ChainPositions &positions) {
std::vector<uint64_t> actual_counts(positions.size(), 0);
// Something of a hack: don't prune <s>.
actual_counts[0] = std::numeric_limits<uint64_t>::max();

// Iterate over full (the stream of the highest order ngrams)
for (; full; ++full) {
const WordIndex *different = FindDifference(*full, **lower_valid);
@ -272,16 +272,16 @@ void AdjustCounts::Run(const util::stream::ChainPositions &positions) {
uint64_t order_minus_1 = lower_valid - streams_begin;
if(actual_counts[order_minus_1] <= prune_thresholds_[order_minus_1])
(*lower_valid)->Mark();

if(!prune_words_.empty()) {
for(WordIndex* i = (*lower_valid)->begin(); i != (*lower_valid)->end(); i++) {
if(prune_words_[*i]) {
(*lower_valid)->Mark();
break;
}
}
}

stats.Add(order_minus_1, (*lower_valid)->UnmarkedCount(), (*lower_valid)->IsMarked());
++*lower_valid;
}
@ -327,16 +327,16 @@ void AdjustCounts::Run(const util::stream::ChainPositions &positions) {
uint64_t lower_count = actual_counts[(*s)->Order() - 1];
if(lower_count <= prune_thresholds_[(*s)->Order() - 1])
(*s)->Mark();

if(!prune_words_.empty()) {
for(WordIndex* i = (*s)->begin(); i != (*s)->end(); i++) {
if(prune_words_[*i]) {
(*s)->Mark();
break;
}
}
}

stats.Add(s - streams.begin(), lower_count, (*s)->IsMarked());
++*s;
}
@ -30,9 +30,9 @@ struct DiscountConfig {
WarningAction bad_action;
};

/* Compute adjusted counts.
* Input: unique suffix sorted N-grams (and just the N-grams) with raw counts.
* Output: [1,N]-grams with adjusted counts.
* [1,N)-grams are in suffix order
* N-grams are in undefined order (they're going to be sorted anyway).
*/
@ -50,13 +50,13 @@ class AdjustCounts {
const DiscountConfig &discount_config,
std::vector<Discount> &discounts)
: prune_thresholds_(prune_thresholds), counts_(counts), counts_pruned_(counts_pruned),
prune_words_(prune_words), discount_config_(discount_config), discounts_(discounts)
{}

void Run(const util::stream::ChainPositions &positions);

private:
const std::vector<uint64_t> &prune_thresholds_;
std::vector<uint64_t> &counts_;
std::vector<uint64_t> &counts_pruned_;
const std::vector<bool> &prune_words_;
@ -82,7 +82,7 @@ BOOST_AUTO_TEST_CASE(Simple) {
}
BOOST_REQUIRE_EQUAL(4UL, counts.size());
BOOST_CHECK_EQUAL(4UL, counts[0]);
// These are no longer set because the discounts are bad.
/* BOOST_CHECK_EQUAL(4UL, counts[1]);
BOOST_CHECK_EQUAL(3UL, counts[2]);
BOOST_CHECK_EQUAL(3UL, counts[3]);*/
@ -45,7 +45,7 @@ class DedupeHash : public std::unary_function<const WordIndex *, bool> {
std::size_t operator()(const WordIndex *start) const {
return util::MurmurHashNative(start, size_);
}

private:
const std::size_t size_;
};
@ -53,11 +53,11 @@ class DedupeHash : public std::unary_function<const WordIndex *, bool> {
class DedupeEquals : public std::binary_function<const WordIndex *, const WordIndex *, bool> {
public:
explicit DedupeEquals(std::size_t order) : size_(order * sizeof(WordIndex)) {}

bool operator()(const WordIndex *first, const WordIndex *second) const {
return !memcmp(first, second, size_);
}

private:
const std::size_t size_;
};
@ -82,7 +82,7 @@ typedef util::ProbingHashTable<DedupeEntry, DedupeHash, DedupeEquals> Dedupe;

class Writer {
public:
Writer(std::size_t order, const util::stream::ChainPosition &position, void *dedupe_mem, std::size_t dedupe_mem_size)
: block_(position), gram_(block_->Get(), order),
dedupe_invalid_(order, std::numeric_limits<WordIndex>::max()),
dedupe_(dedupe_mem, dedupe_mem_size, &dedupe_invalid_[0], DedupeHash(order), DedupeEquals(order)),
@ -91,7 +91,7 @@ class Writer {
dedupe_.Clear();
assert(Dedupe::Size(position.GetChain().BlockSize() / position.GetChain().EntrySize(), kProbingMultiplier) == dedupe_mem_size);
if (order == 1) {
// Add special words. AdjustCounts is responsible if order != 1.
AddUnigramWord(kUNK);
AddUnigramWord(kBOS);
}
@ -121,16 +121,16 @@ class Writer {
memmove(gram_.begin(), gram_.begin() + 1, sizeof(WordIndex) * (gram_.Order() - 1));
return;
}
// Complete the write.
gram_.Count() = 1;
// Prepare the next n-gram.
if (reinterpret_cast<uint8_t*>(gram_.begin()) + gram_.TotalSize() != static_cast<uint8_t*>(block_->Get()) + block_size_) {
NGram last(gram_);
gram_.NextInMemory();
std::copy(last.begin() + 1, last.end(), gram_.begin());
return;
}
// Block end. Need to store the context in a temporary buffer.
std::copy(gram_.begin() + 1, gram_.end(), buffer_.get());
dedupe_.Clear();
block_->SetValidSize(block_size_);
@ -158,7 +158,7 @@ class Writer {
// Hash table combiner implementation.
Dedupe dedupe_;

// Small buffer to hold existing ngrams when shifting across a block boundary.
boost::scoped_array<WordIndex> buffer_;

const std::size_t block_size_;
@ -224,12 +224,12 @@ void CorpusCount::Run(const util::stream::ChainPosition &position) {
} catch (const util::EndOfFileException &e) {}
token_count_ = count;
type_count_ = vocab.Size();

// Create list of unigrams that are supposed to be pruned
if (!prune_vocab_filename_.empty()) {
try {
util::FilePiece prune_vocab_file(prune_vocab_filename_.c_str());

prune_words_.resize(vocab.Size(), true);
try {
while (true) {
@ -238,12 +238,12 @@ void CorpusCount::Run(const util::stream::ChainPosition &position) {
prune_words_[vocab.Index(*w)] = false;
}
} catch (const util::EndOfFileException &e) {}

// Never prune <unk>, <s>, </s>
prune_words_[kUNK] = false;
prune_words_[kBOS] = false;
prune_words_[kEOS] = false;

} catch (const util::Exception &e) {
std::cerr << e.what() << std::endl;
abort();
@ -40,7 +40,7 @@ class CorpusCount {
uint64_t &token_count_;
WordIndex &type_count_;
std::vector<bool>& prune_words_;
const std::string& prune_vocab_filename_;

std::size_t dedupe_mem_size_;
util::scoped_malloc dedupe_mem_;
@ -27,9 +27,9 @@ struct HashBufferEntry : public BufferEntry {
uint64_t hash_value;
};

// Reads all entries in order like NGramStream does.
// But deletes any entries that have CutoffCount below or equal to pruning
// threshold.
class PruneNGramStream {
public:
PruneNGramStream(const util::stream::ChainPosition &position) :
@ -37,7 +37,7 @@ class PruneNGramStream {
dest_(NULL, NGram::OrderFromSize(position.GetChain().EntrySize())),
currentCount_(0),
block_(position)
{
StartBlock();
}

@ -50,7 +50,7 @@ class PruneNGramStream {

PruneNGramStream &operator++() {
assert(block_);

if(current_.Order() == 1 && *current_.begin() <= 2)
dest_.NextInMemory();
else if(currentCount_ > 0) {
@ -59,9 +59,9 @@ class PruneNGramStream {
}
dest_.NextInMemory();
}

current_.NextInMemory();

uint8_t *block_base = static_cast<uint8_t*>(block_->Get());
if (current_.Base() == block_base + block_->ValidSize()) {
block_->SetValidSize(dest_.Base() - block_base);
@ -70,13 +70,13 @@ class PruneNGramStream {
if (block_) {
currentCount_ = current_.CutoffCount();
}
} else {
currentCount_ = current_.CutoffCount();
}

return *this;
}

private:
void StartBlock() {
for (; ; ++block_) {
@ -85,13 +85,13 @@ class PruneNGramStream {
}
current_.ReBase(block_->Get());
currentCount_ = current_.CutoffCount();

dest_.ReBase(block_->Get());
}

NGram current_; // input iterator
NGram dest_; // output iterator

uint64_t currentCount_;

util::stream::Link block_;
@ -155,24 +155,24 @@ class AddRight {
memcpy(previous_raw, in->begin(), size);
uint64_t denominator = 0;
uint64_t normalizer = 0;

uint64_t counts[4];
memset(counts, 0, sizeof(counts));
do {
denominator += in->UnmarkedCount();

// Collect unused probability mass from pruning.
// Becomes 0 for unpruned ngrams.
normalizer += in->UnmarkedCount() - in->CutoffCount();

// Chen&Goodman do not mention counting based on cutoffs, but
// backoff becomes larger than 1 otherwise, so probably needs
// to count cutoffs. Counts normally without pruning.
if(in->CutoffCount() > 0)
++counts[std::min(in->CutoffCount(), static_cast<uint64_t>(3))];

} while (++in && !memcmp(previous_raw, in->begin(), size));

BufferEntry &entry = *reinterpret_cast<BufferEntry*>(out.Get());
entry.denominator = static_cast<float>(denominator);
entry.gamma = 0.0;
@ -182,9 +182,9 @@ class AddRight {

// Makes model sum to 1 with pruning (I hope).
entry.gamma += normalizer;

entry.gamma /= entry.denominator;

if(pruning_) {
// If pruning is enabled the stream actually contains HashBufferEntry, see InitialProbabilities(...),
// so add a hash value that identifies the current ngram.
@ -244,13 +244,13 @@ class MergeRight {
++summed;
return;
}

std::vector<WordIndex> previous(grams->Order() - 1);
const std::size_t size = sizeof(WordIndex) * previous.size();
for (; grams; ++summed) {
memcpy(&previous[0], grams->begin(), size);
const BufferEntry &sums = *static_cast<const BufferEntry*>(summed.Get());

do {
Payload &pay = grams->Value();
pay.uninterp.prob = discount_.Apply(grams->UnmarkedCount()) / sums.denominator;
@ -288,7 +288,7 @@ void InitialProbabilities(
gamma_out[i] >> AddRight(discounts[i], second, prune_vocab || prune_thresholds[i] > 0);

primary[i] >> MergeRight(config.interpolate_unigrams, gamma_out[i].Add(), discounts[i]);

// Don't bother with the OnlyGamma thread for something to discard.
if (i) gamma_out[i] >> OnlyGamma(prune_vocab || prune_thresholds[i] > 0);
}
@ -15,17 +15,17 @@ struct InitialProbabilitiesConfig {
// These should be small buffers to keep the adder from getting too far ahead
util::stream::ChainConfig adder_in;
util::stream::ChainConfig adder_out;
// SRILM doesn't normally interpolate unigrams.
bool interpolate_unigrams;
};

/* Compute initial (uninterpolated) probabilities
* primary: the normal chain of n-grams. Incoming is context sorted adjusted
* counts. Outgoing has uninterpolated probabilities for use by Interpolate.
* second_in: a second copy of the primary input. Discard the output.
* gamma_out: Computed gamma values are output on these chains in suffix order.
* The values are bare floats and should be buffered for interpolation to
* use.
*/
void InitialProbabilities(
const InitialProbabilitiesConfig &config,
@ -47,7 +47,7 @@ class OutputQ {

private:
// Product of backoffs in the numerator divided by backoffs in the
// denominator. Does not include
std::vector<float> q_delta_;
};

@ -81,7 +81,7 @@ template <class Output> class Callback {
if(prune_vocab_ || prune_thresholds_[i + 1] > 0)
while(backoffs_[i])
++backoffs_[i];

if (backoffs_[i]) {
std::cerr << "Backoffs do not match for order " << (i + 1) << std::endl;
abort();
@ -99,7 +99,7 @@ template <class Output> class Callback {
if(prune_vocab_ || prune_thresholds_[order_minus_1 + 1] > 0) {
//Compute hash value for current context
uint64_t current_hash = util::MurmurHashNative(gram.begin(), gram.Order() * sizeof(WordIndex));

const HashGamma *hashed_backoff = static_cast<const HashGamma*>(backoffs_[order_minus_1].Get());
while(current_hash != hashed_backoff->hash_value && ++backoffs_[order_minus_1])
hashed_backoff = static_cast<const HashGamma*>(backoffs_[order_minus_1].Get());
@ -8,8 +8,8 @@
|
||||
#include <stdint.h>
|
||||
|
||||
namespace lm { namespace builder {
|
||||
|
||||
/* Interpolate step.
|
||||
|
||||
/* Interpolate step.
|
||||
* Input: suffix sorted n-grams with (p_uninterpolated, gamma) from
|
||||
* InitialProbabilities.
|
||||
* Output: suffix sorted n-grams with complete probability
|
||||
|
@ -35,7 +35,7 @@ template <class Callback, class Compare> void JointOrder(const util::stream::Cha
|
||||
// Does the context match the lower one?
|
||||
if (!memcmp(streams[static_cast<int>(current) - 1]->begin(), streams[current]->begin() + Compare::kMatchOffset, sizeof(WordIndex) * current)) {
|
||||
callback.Enter(current, *streams[current]);
|
||||
// Transition to looking for extensions.
|
||||
// Transition to looking for extensions.
|
||||
if (++current < order) continue;
|
||||
}
|
||||
#ifdef DEBUG
|
||||
@ -46,16 +46,16 @@ template <class Callback, class Compare> void JointOrder(const util::stream::Cha
|
||||
abort();
|
||||
}
|
||||
#endif // DEBUG
|
||||
// No extension left.
|
||||
// No extension left.
|
||||
while(true) {
|
||||
assert(current > 0);
|
||||
--current;
|
||||
callback.Exit(current, *streams[current]);
|
||||
|
||||
|
||||
if (++streams[current]) break;
|
||||
|
||||
|
||||
UTIL_THROW_IF(order != current + 1, FormatLoadException, "Detected n-gram without matching suffix");
|
||||
|
||||
|
||||
order = current;
|
||||
if (!order) return;
|
||||
}
|
||||
|
@ -53,7 +53,7 @@ std::vector<uint64_t> ParsePruning(const std::vector<std::string> ¶m, std::s
|
||||
// throw if each n-gram order has not threshold specified
|
||||
UTIL_THROW_IF(prune_thresholds.size() > order, util::Exception, "You specified pruning thresholds for orders 1 through " << prune_thresholds.size() << " but the model only has order " << order);
|
||||
// threshold for unigram can only be 0 (no pruning)
|
||||
|
||||
|
||||
// check if threshold are not in decreasing order
|
||||
uint64_t lower_threshold = 0;
|
||||
for (std::vector<uint64_t>::iterator it = prune_thresholds.begin(); it != prune_thresholds.end(); ++it) {
|
||||
@ -124,7 +124,7 @@ int main(int argc, char *argv[]) {
|
||||
po::store(po::parse_command_line(argc, argv, options), vm);
|
||||
|
||||
if (argc == 1 || vm["help"].as<bool>()) {
|
||||
std::cerr <<
|
||||
std::cerr <<
|
||||
"Builds unpruned language models with modified Kneser-Ney smoothing.\n\n"
|
||||
"Please cite:\n"
|
||||
"@inproceedings{Heafield-estimate,\n"
|
||||
@ -147,7 +147,7 @@ int main(int argc, char *argv[]) {
|
||||
std::cerr << "This machine has " << mem << " bytes of memory.\n\n";
|
||||
} else {
|
||||
std::cerr << "Unable to determine the amount of memory on this machine.\n\n";
|
||||
}
|
||||
}
|
||||
std::cerr << options << std::endl;
|
||||
return 1;
|
||||
}
|
||||
@ -191,11 +191,11 @@ int main(int argc, char *argv[]) {
|
||||
else {
|
||||
pipeline.prune_vocab = false;
|
||||
}
|
||||
|
||||
|
||||
util::NormalizeTempPrefix(pipeline.sort.temp_prefix);
|
||||
|
||||
lm::builder::InitialProbabilitiesConfig &initial = pipeline.initial_probs;
|
||||
// TODO: evaluate options for these.
|
||||
// TODO: evaluate options for these.
|
||||
initial.adder_in.total_memory = 32768;
|
||||
initial.adder_in.block_count = 2;
|
||||
initial.adder_out.total_memory = 32768;
|
||||
|
@ -68,26 +68,26 @@ class NGram {
|
||||
assert(size == TotalSize(ret));
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
// manipulate msb to signal that ngram can be pruned
|
||||
/*mjd**********************************************************************/
|
||||
|
||||
bool IsMarked() const {
|
||||
return Value().count >> (sizeof(Value().count) * 8 - 1);
|
||||
}
|
||||
|
||||
|
||||
void Mark() {
|
||||
Value().count |= (1ul << (sizeof(Value().count) * 8 - 1));
|
||||
}
|
||||
|
||||
|
||||
void Unmark() {
|
||||
Value().count &= ~(1ul << (sizeof(Value().count) * 8 - 1));
|
||||
}
|
||||
|
||||
|
||||
uint64_t UnmarkedCount() const {
|
||||
return Value().count & ~(1ul << (sizeof(Value().count) * 8 - 1));
|
||||
}
|
||||
|
||||
|
||||
uint64_t CutoffCount() const {
|
||||
return IsMarked() ? 0 : UnmarkedCount();
|
||||
}
|
||||
|
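A minimal standalone illustration of the most-significant-bit trick used by Mark/Unmark/UnmarkedCount above; the constant name below is made up for the example, not part of this diff:

    #include <cstdint>
    #include <cassert>

    int main() {
      uint64_t count = 42;
      const uint64_t kMsb = 1ull << 63;   // top bit of a 64-bit count
      count |= kMsb;                      // Mark(): flag the n-gram as pruned
      assert(count >> 63);                // IsMarked() is now true
      assert((count & ~kMsb) == 42);      // UnmarkedCount(): the real count survives
      count &= ~kMsb;                     // Unmark(): back to the plain count
      return 0;
    }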
@@ -37,7 +37,7 @@ void PrintStatistics(const std::vector<uint64_t> &counts, const std::vector<uint

class Master {
public:
explicit Master(PipelineConfig &config)
: config_(config), chains_(config.order), files_(config.order) {
config_.minimum_block = std::max(NGram::TotalSize(config_.order), config_.minimum_block);
}
@@ -64,7 +64,7 @@ class Master {
CreateChains(config_.TotalMemory() - merge_using, count_bounds);
ngrams.Output(chains_.back(), merge_using);

// Setup unigram file.
files_.push_back(util::MakeTemp(config_.TempPrefix()));
}

@@ -204,7 +204,7 @@ class Master {
PipelineConfig &config_;

util::stream::Chains chains_;
// Often only unigrams, but sometimes all orders.
util::FixedArray<util::stream::FileBuffer> files_;
};

@@ -214,7 +214,7 @@ void CountText(int text_file /* input */, int vocab_file /* output */, Master &m

const std::size_t vocab_usage = CorpusCount::VocabUsage(config.vocab_estimate);
UTIL_THROW_IF(config.TotalMemory() < vocab_usage, util::Exception, "Vocab hash size estimate " << vocab_usage << " exceeds total memory " << config.TotalMemory());
std::size_t memory_for_chain =
// This much memory to work with after vocab hash table.
static_cast<float>(config.TotalMemory() - vocab_usage) /
// Solve for block size including the dedupe multiplier for one block.
@@ -252,7 +252,7 @@ void InitialProbabilities(const std::vector<uint64_t> &counts, const std::vector

util::stream::Chains gamma_chains(config.order);
InitialProbabilities(config.initial_probs, discounts, master.MutableChains(), second, gamma_chains, prune_thresholds, prune_vocab);
// Don't care about gamma for 0.
gamma_chains[0] >> util::stream::kRecycle;
gammas.Init(config.order - 1);
for (std::size_t i = 1; i < config.order; ++i) {
@@ -307,16 +307,16 @@ void Pipeline(PipelineConfig &config, int text_file, Output &output) {
// master's destructor will wait for chains. But they might be deadlocked if
// this thread dies because e.g. it ran out of memory.
try {
util::scoped_fd vocab_file(config.vocab_file.empty() ?
util::MakeTemp(config.TempPrefix()) :
util::CreateOrThrow(config.vocab_file.c_str()));
output.SetVocabFD(vocab_file.get());
uint64_t token_count;
std::string text_file_name;

std::vector<bool> prune_words;
CountText(text_file, vocab_file.get(), master, token_count, text_file_name, prune_words);

std::vector<uint64_t> counts;
std::vector<uint64_t> counts_pruned;
std::vector<Discount> discounts;

@@ -44,7 +44,7 @@ struct PipelineConfig {

// Compute collapsed q values instead of probability and backoff
bool output_q;

/* Computing the perplexity of LMs with different vocabularies is hard. For
 * example, the lowest perplexity is attained by a unigram model that
 * predicts p(<unk>) = 1 and has no other vocabulary. Also, linearly

@@ -55,7 +55,7 @@ void PrintARPA::Run(const util::stream::ChainPositions &positions) {
if (order != positions.size())
out << '\t' << stream->Value().complete.backoff;
out << '\n';

}
out << '\n';
}

@@ -14,7 +14,7 @@

// Warning: print routines read all unigrams before all bigrams before all
// trigrams etc. So if other parts of the chain move jointly, you'll have to
// buffer.

namespace lm { namespace builder {

@@ -42,7 +42,7 @@ class VocabReconstitute {
std::vector<const char*> map_;
};

// Not defined, only specialized.
template <class T> void PrintPayload(util::FakeOFStream &to, const Payload &payload);
template <> inline void PrintPayload<uint64_t>(util::FakeOFStream &to, const Payload &payload) {
// TODO slow
@@ -55,7 +55,7 @@ template <> inline void PrintPayload<ProbBackoff>(util::FakeOFStream &to, const
to << payload.complete.prob << ' ' << payload.complete.backoff;
}

// template parameter is the type stored.
template <class V> class Print {
public:
static void DumpSeparateFiles(const VocabReconstitute &vocab, const std::string &file_base, util::stream::Chains &chains) {

@@ -19,7 +19,7 @@ namespace builder {
 */
template <class Child> class Comparator : public std::binary_function<const void *, const void *, bool> {
public:

/**
 * Constructs a comparator capable of comparing two n-grams.
 *
@@ -51,8 +51,8 @@ template <class Child> class Comparator : public std::binary_function<const void
/**
 * N-gram comparator that compares n-grams according to their reverse (suffix) order.
 *
 * This comparator compares n-grams lexicographically, one word at a time,
 * beginning with the last word of each n-gram and ending with the first word of each n-gram.
 *
 * Some examples of n-gram comparisons as defined by this comparator:
 * - a b c == a b c
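A small self-contained sketch of the suffix-order comparison described above; the function and type names are illustrative, not part of this diff. It walks the word ids from the last position backwards and decides at the first difference:

    #include <cstddef>
    #include <cstdint>
    typedef uint32_t WordIndex;

    // True if lhs sorts before rhs in suffix (reverse) order, for n-grams of the given order.
    bool SuffixLess(const WordIndex *lhs, const WordIndex *rhs, std::size_t order) {
      for (std::size_t i = order; i-- > 0;) {      // last word first
        if (lhs[i] != rhs[i]) return lhs[i] < rhs[i];
      }
      return false;                                // equal n-grams
    }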
@@ -64,8 +64,8 @@ template <class Child> class Comparator : public std::binary_function<const void
 */
class SuffixOrder : public Comparator<SuffixOrder> {
public:

/**
 * Constructs a comparator capable of comparing two n-grams.
 *
 * @param order Number of words in each n-gram
@@ -73,7 +73,7 @@ class SuffixOrder : public Comparator<SuffixOrder> {
explicit SuffixOrder(std::size_t order) : Comparator<SuffixOrder>(order) {}

/**
 * Compares two n-grams lexicographically, one word at a time,
 * beginning with the last word of each n-gram and ending with the first word of each n-gram.
 *
 * @param lhs A pointer to the n-gram on the left-hand side of the comparison
@@ -90,11 +90,11 @@ class SuffixOrder : public Comparator<SuffixOrder> {
static const unsigned kMatchOffset = 1;
};

/**
 * N-gram comparator that compares n-grams according to the reverse (suffix) order of the n-gram context.
 *
 * This comparator compares n-grams lexicographically, one word at a time,
 * beginning with the penultimate word of each n-gram and ending with the first word of each n-gram;
 * finally, this comparator compares the last word of each n-gram.
 *
@@ -108,8 +108,8 @@ class SuffixOrder : public Comparator<SuffixOrder> {
 */
class ContextOrder : public Comparator<ContextOrder> {
public:

/**
 * Constructs a comparator capable of comparing two n-grams.
 *
 * @param order Number of words in each n-gram
@@ -117,7 +117,7 @@ class ContextOrder : public Comparator<ContextOrder> {
explicit ContextOrder(std::size_t order) : Comparator<ContextOrder>(order) {}

/**
 * Compares two n-grams lexicographically, one word at a time,
 * beginning with the penultimate word of each n-gram and ending with the first word of each n-gram;
 * finally, this comparator compares the last word of each n-gram.
 *
@@ -136,7 +136,7 @@ class ContextOrder : public Comparator<ContextOrder> {
/**
 * N-gram comparator that compares n-grams according to their natural (prefix) order.
 *
 * This comparator compares n-grams lexicographically, one word at a time,
 * beginning with the first word of each n-gram and ending with the last word of each n-gram.
 *
 * Some examples of n-gram comparisons as defined by this comparator:
@@ -149,8 +149,8 @@ class ContextOrder : public Comparator<ContextOrder> {
 */
class PrefixOrder : public Comparator<PrefixOrder> {
public:

/**
 * Constructs a comparator capable of comparing two n-grams.
 *
 * @param order Number of words in each n-gram
@@ -158,7 +158,7 @@ class PrefixOrder : public Comparator<PrefixOrder> {
explicit PrefixOrder(std::size_t order) : Comparator<PrefixOrder>(order) {}

/**
 * Compares two n-grams lexicographically, one word at a time,
 * beginning with the first word of each n-gram and ending with the last word of each n-gram.
 *
 * @param lhs A pointer to the n-gram on the left-hand side of the comparison
@@ -171,7 +171,7 @@ class PrefixOrder : public Comparator<PrefixOrder> {
}
return false;
}

static const unsigned kMatchOffset = 0;
};

@@ -179,7 +179,7 @@ class PrefixOrder : public Comparator<PrefixOrder> {
struct AddCombiner {
bool operator()(void *first_void, const void *second_void, const SuffixOrder &compare) const {
NGram first(first_void, compare.Order());
// There isn't a const version of NGram.
NGram second(const_cast<void*>(second_void), compare.Order());
if (memcmp(first.begin(), second.begin(), sizeof(WordIndex) * compare.Order())) return false;
first.Count() += second.Count();
@@ -204,10 +204,10 @@ template <class Compare> class Sorts : public util::FixedArray<util::stream::Sor
typedef util::FixedArray<S> P;

public:

/**
 * Constructs, but does not initialize.
 *
 * @ref util::FixedArray::Init() "Init" must be called before use.
 *
 * @see util::FixedArray::Init()
@@ -222,7 +222,7 @@ template <class Compare> class Sorts : public util::FixedArray<util::stream::Sor
 */
explicit Sorts(std::size_t number) : util::FixedArray<util::stream::Sort<Compare> >(number) {}

/**
 * Constructs a new @ref util::stream::Sort "Sort" object which is stored in this @ref util::FixedArray "array".
 *
 * The new @ref util::stream::Sort "Sort" object is constructed using the provided @ref util::stream::SortConfig "SortConfig" and @ref Comparator "ngram comparator";

@@ -10,7 +10,7 @@ namespace lm {
 * and implement Add. Then put a pointer in Config.enumerate_vocab; it does
 * not take ownership. Add is called once per vocab word. index starts at 0
 * and increases by 1 each time. This is only used by the Model constructor;
 * the pointer is not retained by the class.
 */
class EnumerateVocab {
public:
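A usage sketch for the hook described above; the subclass name and the ARPA path are made up for illustration. Derive from EnumerateVocab, override Add, and point Config::enumerate_vocab at the instance before constructing the model:

    #include <iostream>
    #include "lm/enumerate_vocab.hh"
    #include "lm/model.hh"

    class PrintVocab : public lm::EnumerateVocab {
     public:
      void Add(lm::WordIndex index, const StringPiece &str) {
        std::cout << index << '\t' << str << '\n';   // called once per vocabulary word
      }
    };

    int main() {
      PrintVocab printer;
      lm::ngram::Config config;
      config.enumerate_vocab = &printer;             // not owned; only used during construction
      lm::ngram::Model model("example.arpa", config); // hypothetical file name
      return 0;
    }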
@@ -9,8 +9,8 @@
namespace lm {
namespace base {

// Common model interface that depends on knowing the specific classes.
// Curiously recurring template pattern.
template <class Child, class StateT, class VocabularyT> class ModelFacade : public Model {
public:
typedef StateT State;
@@ -32,7 +32,7 @@ template <class Child, class StateT, class VocabularyT> class ModelFacade : publ
*reinterpret_cast<State*>(out_state));
}

// Default Score function calls FullScore. Model can override this.
float Score(const State &in_state, const WordIndex new_word, State &out_state) const {
return static_cast<const Child*>(this)->FullScore(in_state, new_word, out_state).prob;
}
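For readers unfamiliar with the pattern named above, a generic sketch of CRTP delegation (not the actual ModelFacade; the toy names are invented): the base template casts this to the derived type, so the call resolves at compile time with no virtual dispatch.

    template <class Child> struct FacadeSketch {
      // Base provides the convenience wrapper; Child supplies FullScore.
      float Score(int word) const {
        return static_cast<const Child *>(this)->FullScore(word);
      }
    };

    struct ToyModel : FacadeSketch<ToyModel> {
      float FullScore(int /*word*/) const { return -1.0f; }
    };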
@@ -53,7 +53,7 @@ template <class Child, class StateT, class VocabularyT> class ModelFacade : publ

virtual ~ModelFacade() {}

// begin_sentence and null_context can disappear after. vocab should stay.
void Init(const State &begin_sentence, const State &null_context, const Vocabulary &vocab, unsigned char order) {
begin_sentence_ = begin_sentence;
null_context_ = null_context;

@@ -33,7 +33,7 @@ class CountOutput : boost::noncopyable {

class CountBatch {
public:
explicit CountBatch(std::streamsize initial_read)
: initial_read_(initial_read) {
buffer_.reserve(initial_read);
}
@@ -66,7 +66,7 @@ class CountBatch {
private:
std::streamsize initial_read_;

// This could have been a std::string but that's less happy with raw writes.
std::vector<char> buffer_;
};

@@ -58,7 +58,7 @@ typedef enum {MODE_COPY, MODE_SINGLE, MODE_MULTIPLE, MODE_UNION, MODE_UNSET} Fil
typedef enum {FORMAT_ARPA, FORMAT_COUNT} Format;

struct Config {
Config() :
#ifndef NTHREAD
batch_size(25000),
threads(boost::thread::hardware_concurrency()),

@@ -134,12 +134,12 @@ struct CountFormat {

/* For multithreading, the buffer classes hold batches of filter inputs and
 * outputs in memory. The strings get reused a lot, so keep them around
 * instead of clearing each time.
 */
class InputBuffer {
public:
InputBuffer() : actual_(0) {}

void Reserve(size_t size) { lines_.reserve(size); }

template <class Output> void AddNGram(const StringPiece &ngram, const StringPiece &line, Output &output) {
@@ -179,18 +179,18 @@ class BinaryOutputBuffer {
void Reserve(size_t size) {
lines_.reserve(size);
}

void AddNGram(const StringPiece &line) {
lines_.push_back(line);
}

template <class Output> void Flush(Output &output) {
for (std::vector<StringPiece>::const_iterator i = lines_.begin(); i != lines_.end(); ++i) {
output.AddNGram(*i);
}
lines_.clear();
}

private:
std::vector<StringPiece> lines_;
};
@@ -234,7 +234,7 @@ class MultipleOutputBuffer {

private:
struct Annotated {
// If this is empty, send to all systems.
// A filter should never send to all systems and send to a single one.
std::vector<size_t> systems;
StringPiece line;

@@ -31,14 +31,14 @@ unsigned int ReadMultiple(std::istream &in, Substrings &out) {
word.clear();
}
if (c == ' ') continue;
// It's more than just a space. Close out the phrase.
if (!phrase.empty()) {
sentence_content = true;
out.AddPhrase(sentence_id, phrase.begin(), phrase.end());
phrase.clear();
}
if (c == '\t' || c == '\v') continue;
// It's more than a space or tab: a newline.
if (sentence_content) {
++sentence_id;
sentence_content = false;
@@ -53,7 +53,7 @@ typedef unsigned int Sentence;
typedef std::vector<Sentence> Sentences;
} // namespace

namespace detail {

const StringPiece kEndSentence("</s>");

@@ -61,7 +61,7 @@ class Arc {
public:
Arc() {}

// For arcs from one vertex to another.
void SetPhrase(detail::Vertex &from, detail::Vertex &to, const Sentences &intersect) {
Set(to, intersect);
from_ = &from;
@@ -69,7 +69,7 @@ class Arc {

/* For arcs from before the n-gram begins to somewhere in the n-gram (right
 * aligned). These have no from_ vertex; it implictly matches every
 * sentence. This also handles when the n-gram is a substring of a phrase.
 */
void SetRight(detail::Vertex &to, const Sentences &complete) {
Set(to, complete);
@@ -87,12 +87,12 @@ class Arc {
/* When this function returns:
 * If Empty() then there's nothing left from this intersection.
 *
 * If Current() == to then to is part of the intersection.
 *
 * Otherwise, Current() > to. In this case, to is not part of the
 * intersection and neither is anything < Current(). To determine if
 * any value >= Current() is in the intersection, call LowerBound again
 * with the value.
 */
void LowerBound(const Sentence to);

@@ -160,15 +160,15 @@ void Arc::Set(Vertex &to, const Sentences &sentences) {

void Vertex::LowerBound(const Sentence to) {
if (Empty()) return;
// Union lower bound.
while (true) {
Arc *top = incoming_.top();
if (top->Current() > to) {
current_ = top->Current();
return;
}
// If top->Current() == to, we still need to verify that's an actual
// element and not just a bound.
incoming_.pop();
top->LowerBound(to);
if (!top->Empty()) {
@@ -213,13 +213,13 @@ void BuildGraph(const Substrings &phrase, const std::vector<Hash> &hashes, detai
}
}

// Phrases starting at the second or later word in the n-gram.
Vertex *vertex_from = vertices;
for (const Hash *word_from = first_word + 1; word_from != &*hashes.end(); ++word_from, ++vertex_from) {
hash = 0;
Vertex *vertex_to = vertex_from + 1;
for (const Hash *word_to = word_from; ; ++word_to, ++vertex_to) {
// Notice that word_to and vertex_to have the same index.
hash = util::MurmurHashNative(&hash, sizeof(uint64_t), *word_to);
// Now hash covers [word_from, word_to].
if (word_to == last_word) {
@@ -250,7 +250,7 @@ detail::Vertex &ConditionCommon::MakeGraph() {
vertices_.clear();
vertices_.resize(hashes_.size());
arcs_.clear();
// One for every substring.
arcs_.resize(((hashes_.size() + 1) * hashes_.size()) / 2);
BuildGraph(substrings_, hashes_, &*vertices_.begin(), &*arcs_.begin());
return vertices_[hashes_.size() - 1];
|
@ -27,7 +27,7 @@ class Substrings {
|
||||
private:
|
||||
/* This is the value in a hash table where the key is a string. It indicates
|
||||
* four sets of sentences:
|
||||
* substring is sentences with a phrase containing the key as a substring.
|
||||
* substring is sentences with a phrase containing the key as a substring.
|
||||
* left is sentencess with a phrase that begins with the key (left aligned).
|
||||
* right is sentences with a phrase that ends with the key (right aligned).
|
||||
* phrase is sentences where the key is a phrase.
|
||||
@ -39,8 +39,8 @@ class Substrings {
|
||||
/* Most of the CPU is hash table lookups, so let's not complicate it with
|
||||
* vector equality comparisons. If a collision happens, the SentenceRelation
|
||||
* structure will contain the union of sentence ids over the colliding strings.
|
||||
* In that case, the filter will be slightly more permissive.
|
||||
* The key here is the same as boost's hash of std::vector<std::string>.
|
||||
* In that case, the filter will be slightly more permissive.
|
||||
* The key here is the same as boost's hash of std::vector<std::string>.
|
||||
*/
|
||||
typedef boost::unordered_map<Hash, SentenceRelation> Table;
|
||||
|
||||
@ -58,9 +58,9 @@ class Substrings {
|
||||
LM_FILTER_PHRASE_METHOD(Phrase, phrase)
|
||||
|
||||
#pragma GCC diagnostic ignored "-Wuninitialized" // end != finish so there's always an initialization
|
||||
// sentence_id must be non-decreasing. Iterators are over words in the phrase.
|
||||
// sentence_id must be non-decreasing. Iterators are over words in the phrase.
|
||||
template <class Iterator> void AddPhrase(unsigned int sentence_id, const Iterator &begin, const Iterator &end) {
|
||||
// Iterate over all substrings.
|
||||
// Iterate over all substrings.
|
||||
for (Iterator start = begin; start != end; ++start) {
|
||||
Hash hash = 0;
|
||||
SentenceRelation *relation;
|
||||
@ -85,7 +85,7 @@ class Substrings {
|
||||
};
|
||||
|
||||
// Read a file with one sentence per line containing tab-delimited phrases of
|
||||
// space-separated words.
|
||||
// space-separated words.
|
||||
unsigned int ReadMultiple(std::istream &in, Substrings &out);
|
||||
|
||||
namespace detail {
|
||||
@ -94,7 +94,7 @@ extern const StringPiece kEndSentence;
|
||||
template <class Iterator> void MakeHashes(Iterator i, const Iterator &end, std::vector<Hash> &hashes) {
|
||||
hashes.clear();
|
||||
if (i == end) return;
|
||||
// TODO: check strict phrase boundaries after <s> and before </s>. For now, just skip tags.
|
||||
// TODO: check strict phrase boundaries after <s> and before </s>. For now, just skip tags.
|
||||
if ((i->data()[0] == '<') && (i->data()[i->size() - 1] == '>')) {
|
||||
++i;
|
||||
}
|
||||
|
@ -88,7 +88,7 @@ class TargetWords {
|
||||
|
||||
class Input {
|
||||
public:
|
||||
explicit Input(std::size_t max_length)
|
||||
explicit Input(std::size_t max_length)
|
||||
: max_length_(max_length), sentence_id_(0), empty_() {}
|
||||
|
||||
void AddSentence(StringPiece sentence, TargetWords &targets) {
|
||||
@ -125,7 +125,7 @@ class Input {
|
||||
Map map_;
|
||||
|
||||
std::size_t sentence_id_;
|
||||
|
||||
|
||||
// Temporaries in AddSentence.
|
||||
std::string canonical_;
|
||||
std::vector<std::size_t> starts_;
|
||||
|
@ -13,29 +13,29 @@ namespace lm {
|
||||
template <class OutputBuffer> class ThreadBatch {
|
||||
public:
|
||||
ThreadBatch() {}
|
||||
|
||||
|
||||
void Reserve(size_t size) {
|
||||
input_.Reserve(size);
|
||||
output_.Reserve(size);
|
||||
}
|
||||
|
||||
// File reading thread.
|
||||
// File reading thread.
|
||||
InputBuffer &Fill(uint64_t sequence) {
|
||||
sequence_ = sequence;
|
||||
// Why wait until now to clear instead of after output? free in the same
|
||||
// thread as allocated.
|
||||
// thread as allocated.
|
||||
input_.Clear();
|
||||
return input_;
|
||||
}
|
||||
|
||||
// Filter worker thread.
|
||||
// Filter worker thread.
|
||||
template <class Filter> void CallFilter(Filter &filter) {
|
||||
input_.CallFilter(filter, output_);
|
||||
}
|
||||
|
||||
uint64_t Sequence() const { return sequence_; }
|
||||
|
||||
// File writing thread.
|
||||
// File writing thread.
|
||||
template <class RealOutput> void Flush(RealOutput &output) {
|
||||
output_.Flush(output);
|
||||
}
|
||||
@ -73,7 +73,7 @@ template <class Batch, class Output> class OutputWorker {
|
||||
|
||||
void operator()(Request request) {
|
||||
assert(request->Sequence() >= base_sequence_);
|
||||
// Assemble the output in order.
|
||||
// Assemble the output in order.
|
||||
uint64_t pos = request->Sequence() - base_sequence_;
|
||||
if (pos >= ordering_.size()) {
|
||||
ordering_.resize(pos + 1, NULL);
|
||||
@ -102,7 +102,7 @@ template <class Filter, class OutputBuffer, class RealOutput> class Controller :
|
||||
typedef ThreadBatch<OutputBuffer> Batch;
|
||||
|
||||
public:
|
||||
Controller(size_t batch_size, size_t queue, size_t workers, const Filter &filter, RealOutput &output)
|
||||
Controller(size_t batch_size, size_t queue, size_t workers, const Filter &filter, RealOutput &output)
|
||||
: batch_size_(batch_size), queue_size_(queue),
|
||||
batches_(queue),
|
||||
to_read_(queue),
|
||||
|
@ -30,7 +30,7 @@ bool IsLineEnd(std::istream &in) {
|
||||
}// namespace
|
||||
|
||||
// Read space separated words in enter separated lines. These lines can be
|
||||
// very long, so don't read an entire line at a time.
|
||||
// very long, so don't read an entire line at a time.
|
||||
unsigned int ReadMultiple(std::istream &in, boost::unordered_map<std::string, std::vector<unsigned int> > &out) {
|
||||
in.exceptions(std::istream::badbit);
|
||||
unsigned int sentence = 0;
|
||||
|
@ -26,7 +26,7 @@ unsigned int ReadMultiple(std::istream &in, boost::unordered_map<std::string, st
|
||||
|
||||
/* Is this a special tag like <s> or <UNK>? This actually includes anything
|
||||
* surrounded with < and >, which most tokenizers separate for real words, so
|
||||
* this should not catch real words as it looks at a single token.
|
||||
* this should not catch real words as it looks at a single token.
|
||||
*/
|
||||
inline bool IsTag(const StringPiece &value) {
|
||||
// The parser should never give an empty string.
|
||||
|
@ -13,7 +13,7 @@ namespace lm {
|
||||
// multiple-output filter so clients code against one interface.
|
||||
template <class Binary> class BinaryFilter {
|
||||
public:
|
||||
// Binary modes are just references (and a set) and it makes the API cleaner to copy them.
|
||||
// Binary modes are just references (and a set) and it makes the API cleaner to copy them.
|
||||
explicit BinaryFilter(Binary binary) : binary_(binary) {}
|
||||
|
||||
template <class Iterator, class Output> void AddNGram(const Iterator &begin, const Iterator &end, const StringPiece &line, Output &output) {
|
||||
|
44
lm/left.hh
@@ -1,22 +1,22 @@
/* Efficient left and right language model state for sentence fragments.
 * Intended usage:
 * Store ChartState with every chart entry.
 * To do a rule application:
 * 1. Make a ChartState object for your new entry.
 * 2. Construct RuleScore.
 * 3. Going from left to right, call Terminal or NonTerminal.
 * For terminals, just pass the vocab id.
 * For non-terminals, pass that non-terminal's ChartState.
 * If your decoder expects scores inclusive of subtree scores (i.e. you
 * label entries with the highest-scoring path), pass the non-terminal's
 * score as prob.
 * If your decoder expects relative scores and will walk the chart later,
 * pass prob = 0.0.
 * In other words, the only effect of prob is that it gets added to the
 * returned log probability.
 * 4. Call Finish. It returns the log probability.
 *
 * There's a couple more details:
 * Do not pass <s> to Terminal as it is formally not a word in the sentence,
 * only context. Instead, call BeginSentence. If called, it should be the
 * first call after RuleScore is constructed (since <s> is always the
@@ -27,12 +27,12 @@
 * Hashing and sorting comparison operators are provided. All state objects
 * are POD. If you intend to use memcmp on raw state objects, you must call
 * ZeroRemaining first, as the value of array entries beyond length is
 * otherwise undefined.
 *
 * Usage is of course not limited to chart decoding. Anything that generates
 * sentence fragments missing left context could benefit. For example, a
 * phrase-based decoder could pre-score phrases, storing ChartState with each
 * phrase, even if hypotheses are generated left-to-right.
 */

#ifndef LM_LEFT_H
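A sketch of the usage the comment above describes; the rule, the word strings, and the wrapper function are illustrative only. Build a ChartState for the new entry, feed the rule from left to right, then call Finish:

    #include "lm/left.hh"
    #include "lm/model.hh"

    float ScoreRule(const lm::ngram::Model &model,
                    const lm::ngram::ChartState &nonterminal_state,
                    float nonterminal_score,
                    lm::ngram::ChartState &out) {
      lm::ngram::RuleScore<lm::ngram::Model> scorer(model, out);
      scorer.Terminal(model.GetVocabulary().Index("the"));      // a terminal word
      scorer.NonTerminal(nonterminal_state, nonterminal_score); // or 0.0 for relative scoring
      scorer.Terminal(model.GetVocabulary().Index("."));
      return scorer.Finish();                                   // log probability of the fragment
    }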
@@ -77,7 +77,7 @@ template <class M> class RuleScore {
left_done_ = true;
}

// Faster version of NonTerminal for the case where the rule begins with a non-terminal.
void BeginNonTerminal(const ChartState &in, float prob = 0.0) {
prob_ = prob;
*out_ = in;
@@ -86,7 +86,7 @@ template <class M> class RuleScore {

void NonTerminal(const ChartState &in, float prob = 0.0) {
prob_ += prob;

if (!in.left.length) {
if (in.left.full) {
for (const float *i = out_->right.backoff; i < out_->right.backoff + out_->right.length; ++i) prob_ += *i;
@@ -131,26 +131,26 @@ template <class M> class RuleScore {
return;
}

// Right state was minimized, so it's already independent of the new words to the left.
if (in.right.length < in.left.length) {
out_->right = in.right;
return;
}

// Shift exisiting words down.
for (WordIndex *i = out_->right.words + next_use - 1; i >= out_->right.words; --i) {
*(i + in.right.length) = *i;
}
// Add words from in.right.
std::copy(in.right.words, in.right.words + in.right.length, out_->right.words);
// Assemble backoff composed on the existing state's backoff followed by the new state's backoff.
std::copy(in.right.backoff, in.right.backoff + in.right.length, out_->right.backoff);
std::copy(back, back + next_use, out_->right.backoff + in.right.length);
out_->right.length = in.right.length + next_use;
}

float Finish() {
// A N-1-gram might extend left and right but we should still set full to true because it's an N-1-gram.
out_->left.full = left_done_ || (out_->left.length == model_.Order() - 1);
return prob_;
}
@@ -173,17 +173,17 @@ template <class M> class RuleScore {
back_in, // Backoffs to use
in.left.pointers[extend_length - 1], extend_length, // Words to be extended
back_out, // Backoffs for the next score
next_use)); // Length of n-gram to use in next scoring.
if (next_use != out_->right.length) {
left_done_ = true;
if (!next_use) {
// Early exit.
out_->right = in.right;
prob_ += model_.UnRest(in.left.pointers + extend_length, in.left.pointers + in.left.length, extend_length + 1);
return true;
}
}
// Continue scoring.
return false;
}

@@ -16,7 +16,7 @@ namespace {
#define Term(word) score.Terminal(m.GetVocabulary().Index(word));
#define VCheck(word, value) BOOST_CHECK_EQUAL(m.GetVocabulary().Index(word), value);

// Apparently some Boost versions use templates and are pretty strict about types matching.
#define SLOPPY_CHECK_CLOSE(ref, value, tol) BOOST_CHECK_CLOSE(static_cast<double>(ref), static_cast<double>(value), static_cast<double>(tol));

template <class M> void Short(const M &m) {
@@ -175,7 +175,7 @@ template <class M> void LookupVocab(const M &m, const StringPiece &str, std::vec
SLOPPY_CHECK_CLOSE(expect, RightToLeft(m, words, rest), 0.001); \
SLOPPY_CHECK_CLOSE(expect, TreeMiddle(m, words, rest), 0.001); \

// Build sentences, or parts thereof, from right to left.
template <class M> void GrowBig(const M &m, bool rest = false) {
std::vector<WordIndex> words;
float expect;

@@ -1,7 +1,7 @@
#ifndef LM_LM_EXCEPTION_H
#define LM_LM_EXCEPTION_H

// Named to avoid conflict with util/exception.hh.

#include "util/exception.hh"
#include "util/string_piece.hh"

@@ -1,7 +1,7 @@
#ifndef LM_MAX_ORDER_H
#define LM_MAX_ORDER_H
/* IF YOUR BUILD SYSTEM PASSES -DKENLM_MAX_ORDER, THEN CHANGE THE BUILD SYSTEM.
 * If not, this is the default maximum order.
 * Having this limit means that State can be
 * (kMaxOrder - 1) * sizeof(float) bytes instead of
 * sizeof(float*) + (kMaxOrder - 1) * sizeof(float) + malloc overhead
40
lm/model.hh
@@ -25,7 +25,7 @@ namespace lm {
namespace ngram {
namespace detail {

// Should return the same results as SRI.
// ModelFacade typedefs Vocabulary so we use VocabularyT to avoid naming conflicts.
template <class Search, class VocabularyT> class GenericModel : public base::ModelFacade<GenericModel<Search, VocabularyT>, State, VocabularyT> {
private:
@@ -38,7 +38,7 @@ template <class Search, class VocabularyT> class GenericModel : public base::Mod

/* Get the size of memory that will be mapped given ngram counts. This
 * does not include small non-mapped control structures, such as this class
 * itself.
 */
static uint64_t Size(const std::vector<uint64_t> &counts, const Config &config = Config());

@@ -46,47 +46,47 @@ template <class Search, class VocabularyT> class GenericModel : public base::Mod
 * files must have the format expected by this class or you'll get an
 * exception. So TrieModel can only load ARPA or binary created by
 * TrieModel. To classify binary files, call RecognizeBinary in
 * lm/binary_format.hh.
 */
explicit GenericModel(const char *file, const Config &config = Config());

/* Score p(new_word | in_state) and incorporate new_word into out_state.
 * Note that in_state and out_state must be different references:
 * &in_state != &out_state.
 */
FullScoreReturn FullScore(const State &in_state, const WordIndex new_word, State &out_state) const;

/* Slower call without in_state. Try to remember state, but sometimes it
 * would cost too much memory or your decoder isn't setup properly.
 * To use this function, make an array of WordIndex containing the context
 * vocabulary ids in reverse order. Then, pass the bounds of the array:
 * [context_rbegin, context_rend). The new_word is not part of the context
 * array unless you intend to repeat words.
 */
FullScoreReturn FullScoreForgotState(const WordIndex *context_rbegin, const WordIndex *context_rend, const WordIndex new_word, State &out_state) const;

/* Get the state for a context. Don't use this if you can avoid it. Use
 * BeginSentenceState or NullContextState and extend from those. If
 * you're only going to use this state to call FullScore once, use
 * FullScoreForgotState.
 * To use this function, make an array of WordIndex containing the context
 * vocabulary ids in reverse order. Then, pass the bounds of the array:
 * [context_rbegin, context_rend).
 */
void GetState(const WordIndex *context_rbegin, const WordIndex *context_rend, State &out_state) const;

/* More efficient version of FullScore where a partial n-gram has already
 * been scored.
 * NOTE: THE RETURNED .rest AND .prob ARE RELATIVE TO THE .rest RETURNED BEFORE.
 */
FullScoreReturn ExtendLeft(
// Additional context in reverse order. This will update add_rend to
const WordIndex *add_rbegin, const WordIndex *add_rend,
// Backoff weights to use.
const float *backoff_in,
// extend_left returned by a previous query.
uint64_t extend_pointer,
// Length of n-gram that the pointer corresponds to.
unsigned char extend_length,
// Where to write additional backoffs for [extend_length + 1, min(Order() - 1, return.ngram_length)]
float *backoff_out,
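A sketch of the reverse-order context convention described above; the words and the wrapper function are illustrative only:

    #include "lm/model.hh"

    float ScoreFox(const lm::ngram::Model &model) {
      const lm::ngram::Vocabulary &vocab = model.GetVocabulary();
      // Context "the quick brown", most recent word first.
      lm::WordIndex context[3];
      context[0] = vocab.Index("brown");
      context[1] = vocab.Index("quick");
      context[2] = vocab.Index("the");
      lm::ngram::State out_state;
      lm::FullScoreReturn ret = model.FullScoreForgotState(context, context + 3, vocab.Index("fox"), out_state);
      return ret.prob;
    }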
@@ -95,17 +95,17 @@ template <class Search, class VocabularyT> class GenericModel : public base::Mod

/* Return probabilities minus rest costs for an array of pointers. The
 * first length should be the length of the n-gram to which pointers_begin
 * points.
 */
float UnRest(const uint64_t *pointers_begin, const uint64_t *pointers_end, unsigned char first_length) const {
// Compiler should optimize this if away.
return Search::kDifferentRest ? InternalUnRest(pointers_begin, pointers_end, first_length) : 0.0;
}

private:
FullScoreReturn ScoreExceptBackoff(const WordIndex *const context_rbegin, const WordIndex *const context_rend, const WordIndex new_word, State &out_state) const;

// Score bigrams and above. Do not include backoff.
void ResumeScore(const WordIndex *context_rbegin, const WordIndex *const context_rend, unsigned char starting_order_minus_2, typename Search::Node &node, float *backoff_out, unsigned char &next_use, FullScoreReturn &ret) const;

// Appears after Size in the cc file.
@@ -116,7 +116,7 @@ template <class Search, class VocabularyT> class GenericModel : public base::Mod
float InternalUnRest(const uint64_t *pointers_begin, const uint64_t *pointers_end, unsigned char first_length) const;

BinaryFormat backing_;

VocabularyT vocab_;

Search search_;
@@ -124,8 +124,8 @@ template <class Search, class VocabularyT> class GenericModel : public base::Mod

} // namespace detail

// Instead of typedef, inherit. This allows the Model etc to be forward declared.
// Oh the joys of C and C++.
#define LM_COMMA() ,
#define LM_NAME_MODEL(name, from)\
class name : public from {\
@@ -140,7 +140,7 @@ LM_NAME_MODEL(ArrayTrieModel, detail::GenericModel<trie::TrieSearch<DontQuantize
LM_NAME_MODEL(QuantTrieModel, detail::GenericModel<trie::TrieSearch<SeparatelyQuantize LM_COMMA() trie::DontBhiksha> LM_COMMA() SortedVocabulary>);
LM_NAME_MODEL(QuantArrayTrieModel, detail::GenericModel<trie::TrieSearch<SeparatelyQuantize LM_COMMA() trie::ArrayBhiksha> LM_COMMA() SortedVocabulary>);

// Default implementation. No real reason for it to be the default.
typedef ::lm::ngram::ProbingVocabulary Vocabulary;
typedef ProbingModel Model;

@@ -7,7 +7,7 @@
#include <boost/test/unit_test.hpp>
#include <boost/test/floating_point_comparison.hpp>

// Apparently some Boost versions use templates and are pretty strict about types matching.
#define SLOPPY_CHECK_CLOSE(ref, value, tol) BOOST_CHECK_CLOSE(static_cast<double>(ref), static_cast<double>(value), static_cast<double>(tol));

namespace lm {
@@ -118,7 +118,7 @@ template <class M> void Blanks(const M &model) {
AppendTest("not_found", 1, -1.995635 - 7.0 - 0.30103, true);

state = model.NullContextState();
// higher looking is a blank.
AppendTest("higher", 1, -1.509559, false);
AppendTest("looking", 2, -1.285941 - 0.30103, false);

@@ -150,7 +150,7 @@ template <class M> void Unknowns(const M &model) {
State preserve = state;
AppendTest("not_found2", 2, -15.0, true);
AppendTest("not_found3", 2, -15.0 - 2.0, true);

state = preserve;
AppendTest("however", 2, -4, true);
AppendTest("not_found3", 3, -6, true);
@@ -167,7 +167,7 @@ template <class M> void MinimalState(const M &model) {
AppendTest("foo", 1, -3.141592, true);
BOOST_CHECK_EQUAL(1, state.length);
AppendTest("bar", 2, -6.0, true);
// Has to include the backoff weight.
BOOST_CHECK_EQUAL(1, state.length);
AppendTest("bar", 1, -2.718281 + 3.0, true);
BOOST_CHECK_EQUAL(1, state.length);
@@ -263,7 +263,7 @@ template <class M> void Stateless(const M &model) {
// the
AppendTest("the", 1, -4.04005, true);
StatelessTest(5, 5, 1, -4.04005);
// No context of the.
StatelessTest(5, 0, 1, -1.687872);
// biarritz
StatelessTest(6, 1, 1, -1.9889);

@@ -8,7 +8,7 @@ namespace ngram {
 * and I want to preserve existing binary files. */
typedef enum {PROBING=0, REST_PROBING=1, TRIE=2, QUANT_TRIE=3, ARRAY_TRIE=4, QUANT_ARRAY_TRIE=5} ModelType;

// Historical names.
const ModelType HASH_PROBING = PROBING;
const ModelType TRIE_SORTED = TRIE;
const ModelType QUANT_TRIE_SORTED = QUANT_TRIE;

@@ -22,7 +22,7 @@ struct BasicPrint {
std::cout << "Total: " << total << " OOV: " << oov << '\n';
}
void Summary(double, double, uint64_t, uint64_t) {}

};

struct FullPrint : public BasicPrint {
@@ -31,7 +31,7 @@ struct FullPrint : public BasicPrint {
}

void Summary(double ppl_including_oov, double ppl_excluding_oov, uint64_t corpus_oov, uint64_t corpus_tokens) {
std::cout <<
"Perplexity including OOVs:\t" << ppl_including_oov << "\n"
"Perplexity excluding OOVs:\t" << ppl_excluding_oov << "\n"
"OOVs:\t" << corpus_oov << "\n"

@@ -35,9 +35,9 @@ template <class Model> ExtendReturn ExtendLoop(

unsigned char i = 0;
unsigned char length = pointers_end - pointers;
// pointers_write is NULL means that the existing left state is full, so we should use completed probabilities.
if (pointers_write) {
// Using full context, writing to new left state.
for (; i < length; ++i) {
FullScoreReturn ret(model.ExtendLeft(
add_rbegin, add_rbegin + value.next_use,
@@ -61,7 +61,7 @@ template <class Model> ExtendReturn ExtendLoop(
}
}
}
// Using some of the new context.
for (; i < length && value.next_use; ++i) {
FullScoreReturn ret(model.ExtendLeft(
add_rbegin, add_rbegin + value.next_use,
@@ -73,7 +73,7 @@ template <class Model> ExtendReturn ExtendLoop(
value.adjust += ret.prob;
}
float unrest = model.UnRest(pointers + i, pointers_end, i + seen + 1);
// Using none of the new context.
value.adjust += unrest;

std::copy(backoff_in, backoff_in + value.next_use, backoff_write);
@@ -100,7 +100,7 @@ template <class Model> float RevealBefore(const Model &model, const Right &revea
if (left.full) {
for (unsigned char i = 0; i < value.next_use; ++i) value.adjust += backoff_buffer[i];
} else {
// If left wasn't full when it came in, put words into right state.
std::copy(reveal.words + seen, reveal.words + seen + value.next_use, right.words + right.length);
right.length += value.next_use;
left.full = value.make_full || (right.length == model.Order() - 1);
Some files were not shown because too many files have changed in this diff.