// $Id$

//#include "beammain.h"
#include "util/tokenize.hh"
#include "tables-core.h"

#define TABLE_LINE_MAX_LENGTH 1000
#define UNKNOWNSTR "UNK"

using namespace std;

namespace MosesTraining
{
WORD_ID Vocabulary::storeIfNew( const WORD& word )
{
  map<WORD, WORD_ID>::iterator i = lookup.find( word );

  // word already known: return its existing ID
  if( i != lookup.end() )
    return i->second;

  // otherwise assign the next free ID and remember it
  WORD_ID id = vocab.size();
  vocab.push_back( word );
  lookup[ word ] = id;
  return id;
}

WORD_ID Vocabulary::getWordID( const WORD& word )
{
  map<WORD, WORD_ID>::iterator i = lookup.find( word );
  if( i == lookup.end() )
    return 0;
  return i->second;
}
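// Illustrative sketch (not part of the build) of how the two lookups above fit
// together, assuming WORD is a plain string type as it appears to be in
// tables-core.h; the literals are made up for the example:
//
//   Vocabulary vocab;
//   WORD_ID id = vocab.storeIfNew( "house" );    // unseen word: gets a fresh ID
//   assert( vocab.getWordID( "house" ) == id );  // repeat lookups return the same ID
//   assert( vocab.getWordID( "banana" ) == 0 );  // unknown words map to 0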
PHRASE_ID PhraseTable::storeIfNew( const PHRASE& phrase )
{
  map< PHRASE, PHRASE_ID >::iterator i = lookup.find( phrase );
  if( i != lookup.end() )
    return i->second;

  PHRASE_ID id = phraseTable.size();
  phraseTable.push_back( phrase );
  lookup[ phrase ] = id;
  return id;
}

PHRASE_ID PhraseTable::getPhraseID( const PHRASE& phrase )
{
  map< PHRASE, PHRASE_ID >::iterator i = lookup.find( phrase );
  if( i == lookup.end() )
    return 0;
  return i->second;
}

void PhraseTable::clear()
{
  lookup.clear();
  phraseTable.clear();
}
void DTable::init()
{
  // default model: distortion cost -|d| for distances -10 .. 9
  for(int i = -10; i<10; i++)
    dtable[i] = -abs( i );
}

void DTable::load( const string& fileName )
{
  ifstream inFile;
  inFile.open(fileName.c_str());

  std::string line;
  int i=0;
  while(true) {
    i++;
    getline(inFile, line);
    if (inFile.eof()) break;
    if (!inFile) {
      std::cerr << "Error reading from " << fileName << std::endl;
      abort();
    }

    const vector<string> token = util::tokenize(line);
    if (token.size() < 2) {
      cerr << "line " << i << " in " << fileName << " too short, skipping\n";
      continue;
    }

    // first token is the distortion distance, second its probability;
    // the table stores log probabilities
    int d = atoi( token[0].c_str() );
    double prob = log( atof( token[1].c_str() ) );
    dtable[ d ] = prob;
  }
}
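// The table file read by load() is assumed to hold one distortion/probability
// pair per line, tokens separated however util::tokenize splits them
// (presumably whitespace). The values below are made up for illustration:
//
//   -2 0.05
//   -1 0.20
//   0 0.50
//   1 0.20
//   2 0.05
//
// Lines with fewer than two tokens are skipped; extra tokens are ignored.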
double DTable::get( int distortion )
{
  // distortions never seen in training fall back to a small constant probability
  if (dtable.find( distortion ) == dtable.end())
    return log( 0.00001 );
  return dtable[ distortion ];
}
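// Rough usage sketch for DTable (file name is hypothetical):
//
//   DTable dtable;
//   dtable.init();                   // or dtable.load( "distortion.txt" );
//   double score = dtable.get( 3 );  // stored value, or log(0.00001) if 3 was never set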
}