2007-03-15 01:22:36 +03:00
|
|
|
// $Id$
|
2006-08-08 22:54:28 +04:00
|
|
|
//#include "beammain.h"
|
|
|
|
#include "tables-core.h"
|
|
|
|
|
|
|
|
// Not referenced in the visible part of this file; presumably an upper bound
// on the length of one table line for buffer-based readers -- TODO confirm.
#define TABLE_LINE_MAX_LENGTH 1000
|
|
|
|
// Not referenced in the visible part of this file; presumably the surface
// string used for unknown words -- TODO confirm against the users of the macro.
#define UNKNOWNSTR "UNK"
|
|
|
|
|
2012-05-30 16:04:02 +04:00
|
|
|
using namespace std;
|
|
|
|
|
2006-08-08 23:04:59 +04:00
|
|
|
// as in beamdecoder/tables.cpp
|
2011-02-24 16:57:11 +03:00
|
|
|
/**
 * Split a NUL-terminated C string into tokens.
 *
 * Only the space character and the tab character act as separators; any
 * other character (including '\n') is part of a token.  Consecutive,
 * leading and trailing separators never produce empty tokens.
 *
 * @param input NUL-terminated string to split (must not be a null pointer).
 * @return the tokens in order of appearance; empty if there are none.
 */
vector<string> tokenize( const char* input )
{
  vector< string > words;
  const char* cursor = input;

  while (*cursor != '\0') {
    // Step over the separators in front of the next word.
    while (*cursor == ' ' || *cursor == '\t') {
      ++cursor;
    }
    if (*cursor == '\0') {
      break;
    }

    // Consume the word itself, up to the next separator or the terminator.
    const char* wordBegin = cursor;
    while (*cursor != '\0' && *cursor != ' ' && *cursor != '\t') {
      ++cursor;
    }
    words.push_back( string( wordBegin, cursor - wordBegin ) );
  }

  return words;
}
|
|
|
|
|
2011-09-07 20:37:33 +04:00
|
|
|
bool isNonTerminal( const WORD &symbol ) {
|
|
|
|
return symbol.substr(0, 1) == "[" && symbol.substr(symbol.size()-1, 1) == "]";
|
|
|
|
}
|
|
|
|
|
2011-02-24 16:57:11 +03:00
|
|
|
/**
 * Return the id of a word, registering the word first if it is unknown.
 *
 * A word already present in `lookup` keeps its id.  A new word is appended
 * to `vocab`, and its id is the size `vocab` had before the append.
 *
 * @param word the word to look up or register.
 * @return the existing or newly assigned id.
 */
WORD_ID Vocabulary::storeIfNew( const WORD& word )
{
  const map<WORD, WORD_ID>::iterator known = lookup.find( word );

  if( known == lookup.end() ) {
    // First occurrence: the next free id equals the current vocabulary size.
    const WORD_ID freshId = vocab.size();
    vocab.push_back( word );
    lookup[ word ] = freshId;
    return freshId;
  }

  return known->second;
}
|
|
|
|
|
2011-02-24 16:57:11 +03:00
|
|
|
/**
 * Look up the id of a word without modifying the vocabulary.
 *
 * @param word the word to look up.
 * @return the id stored in `lookup`, or 0 when the word is not present.
 *
 * NOTE(review): 0 is also the id storeIfNew() hands out to the first word
 * stored in an empty vocabulary, so callers cannot tell "unknown" from that
 * entry unless index 0 is reserved elsewhere -- confirm.
 */
WORD_ID Vocabulary::getWordID( const WORD& word )
{
  const map<WORD, WORD_ID>::iterator hit = lookup.find( word );

  if( hit != lookup.end() ) {
    return hit->second;
  }
  return 0;
}
|
|
|
|
|
2011-02-24 16:57:11 +03:00
|
|
|
/**
 * Return the id of a phrase, registering the phrase first if it is unknown.
 *
 * A phrase already present in `lookup` keeps its id.  A new phrase is
 * appended to `phraseTable`, and its id is the size `phraseTable` had before
 * the append.
 *
 * @param phrase the phrase to look up or register.
 * @return the existing or newly assigned id.
 */
PHRASE_ID PhraseTable::storeIfNew( const PHRASE& phrase )
{
  const map< PHRASE, PHRASE_ID >::iterator known = lookup.find( phrase );

  if( known == lookup.end() ) {
    // First occurrence: the next free id equals the current table size.
    const PHRASE_ID freshId = phraseTable.size();
    phraseTable.push_back( phrase );
    lookup[ phrase ] = freshId;
    return freshId;
  }

  return known->second;
}
|
|
|
|
|
2011-02-24 16:57:11 +03:00
|
|
|
/**
 * Look up the id of a phrase without modifying the table.
 *
 * @param phrase the phrase to look up.
 * @return the id stored in `lookup`, or 0 when the phrase is not present.
 *
 * NOTE(review): 0 is also the id storeIfNew() hands out to the first phrase
 * stored in an empty table, so "unknown" and that entry are
 * indistinguishable unless index 0 is reserved elsewhere -- confirm.
 */
PHRASE_ID PhraseTable::getPhraseID( const PHRASE& phrase )
{
  const map< PHRASE, PHRASE_ID >::iterator hit = lookup.find( phrase );

  if( hit != lookup.end() ) {
    return hit->second;
  }
  return 0;
}
|
|
|
|
|
2011-02-24 16:57:11 +03:00
|
|
|
void PhraseTable::clear()
|
|
|
|
{
|
2006-08-08 22:54:28 +04:00
|
|
|
lookup.clear();
|
|
|
|
phraseTable.clear();
|
|
|
|
}
|
|
|
|
|
2011-02-24 16:57:11 +03:00
|
|
|
void DTable::init()
|
|
|
|
{
|
2006-08-08 22:54:28 +04:00
|
|
|
for(int i = -10; i<10; i++)
|
|
|
|
dtable[i] = -abs( i );
|
|
|
|
}
|
|
|
|
|
2011-02-24 16:57:11 +03:00
|
|
|
void DTable::load( const string& fileName )
|
|
|
|
{
|
2006-08-08 22:54:28 +04:00
|
|
|
ifstream inFile;
|
|
|
|
inFile.open(fileName.c_str());
|
|
|
|
|
2012-05-10 20:42:56 +04:00
|
|
|
std::string line;
|
2006-08-08 22:54:28 +04:00
|
|
|
int i=0;
|
|
|
|
while(true) {
|
|
|
|
i++;
|
2012-05-10 20:42:56 +04:00
|
|
|
getline(inFile, line);
|
|
|
|
if (inFile.eof()) break;
|
|
|
|
if (!inFile) {
|
|
|
|
std::cerr << "Error reading from " << fileName << std::endl;
|
|
|
|
abort();
|
|
|
|
}
|
|
|
|
|
|
|
|
vector<string> token = tokenize(line.c_str());
|
2006-08-08 22:54:28 +04:00
|
|
|
if (token.size() < 2) {
|
|
|
|
cerr << "line " << i << " in " << fileName << " too short, skipping\n";
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
int d = atoi( token[0].c_str() );
|
|
|
|
double prob = log( atof( token[1].c_str() ) );
|
|
|
|
dtable[ d ] = prob;
|
2011-02-24 16:57:11 +03:00
|
|
|
}
|
2006-08-08 22:54:28 +04:00
|
|
|
}
|
|
|
|
|
2011-02-24 16:57:11 +03:00
|
|
|
double DTable::get( int distortion )
|
|
|
|
{
|
2006-08-08 22:54:28 +04:00
|
|
|
if (dtable.find( distortion ) == dtable.end())
|
|
|
|
return log( 0.00001 );
|
|
|
|
return dtable[ distortion ];
|
|
|
|
}
|
2007-10-03 01:43:54 +04:00
|
|
|
|