// $Id$

//#include "beammain.h"
#include "util/tokenize.hh"
#include "tables-core.h"

#define TABLE_LINE_MAX_LENGTH 1000
#define UNKNOWNSTR "UNK"

using namespace std;

namespace MosesTraining
{
WORD_ID Vocabulary::storeIfNew( const WORD& word )
{
  map<WORD, WORD_ID>::iterator i = lookup.find( word );

  // word already known: return its existing ID
  if( i != lookup.end() )
    return i->second;

  // otherwise assign the next free ID and remember it
  WORD_ID id = vocab.size();
  vocab.push_back( word );
  lookup[ word ] = id;
  return id;
}

WORD_ID Vocabulary::getWordID( const WORD& word )
{
  map<WORD, WORD_ID>::iterator i = lookup.find( word );
  if( i == lookup.end() )
    return 0;
  return i->second;
}
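// Illustrative sketch (not part of the build) of how the two lookups above fit
// together, assuming WORD is a plain string type as it appears to be in
// tables-core.h; the literals are made up for the example:
//
//   Vocabulary vocab;
//   WORD_ID id = vocab.storeIfNew( "house" );    // unseen word: gets a fresh ID
//   assert( vocab.getWordID( "house" ) == id );  // repeat lookups return the same ID
//   assert( vocab.getWordID( "banana" ) == 0 );  // unknown words map to 0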
PHRASE_ID PhraseTable::storeIfNew( const PHRASE& phrase )
{
  map< PHRASE, PHRASE_ID >::iterator i = lookup.find( phrase );
  if( i != lookup.end() )
    return i->second;

  PHRASE_ID id = phraseTable.size();
  phraseTable.push_back( phrase );
  lookup[ phrase ] = id;
  return id;
}

PHRASE_ID PhraseTable::getPhraseID( const PHRASE& phrase )
{
  map< PHRASE, PHRASE_ID >::iterator i = lookup.find( phrase );
  if( i == lookup.end() )
    return 0;
  return i->second;
}

void PhraseTable::clear()
{
  lookup.clear();
  phraseTable.clear();
}
void DTable::init()
{
  // default model: distortion cost -|d| for distances -10 .. 9
  for(int i = -10; i<10; i++)
    dtable[i] = -abs( i );
}

void DTable::load( const string& fileName )
{
  ifstream inFile;
  inFile.open(fileName.c_str());

  std::string line;
  int i=0;
  while(true) {
    i++;
    getline(inFile, line);
    if (inFile.eof()) break;
    if (!inFile) {
      std::cerr << "Error reading from " << fileName << std::endl;
      abort();
    }

    const vector<string> token = util::tokenize(line);
    if (token.size() < 2) {
      cerr << "line " << i << " in " << fileName << " too short, skipping\n";
      continue;
    }

    // first token is the distortion distance, second its probability;
    // the table stores log probabilities
    int d = atoi( token[0].c_str() );
    double prob = log( atof( token[1].c_str() ) );
    dtable[ d ] = prob;
  }
}
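// The table file read by load() is assumed to hold one distortion/probability
// pair per line, tokens separated however util::tokenize splits them
// (presumably whitespace). The values below are made up for illustration:
//
//   -2 0.05
//   -1 0.20
//   0 0.50
//   1 0.20
//   2 0.05
//
// Lines with fewer than two tokens are skipped; extra tokens are ignored.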
double DTable::get( int distortion )
{
  // distortions never seen in training fall back to a small constant probability
  if (dtable.find( distortion ) == dtable.end())
    return log( 0.00001 );
  return dtable[ distortion ];
}
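// Rough usage sketch for DTable (file name is hypothetical):
//
//   DTable dtable;
//   dtable.init();                   // or dtable.load( "distortion.txt" );
//   double score = dtable.get( 3 );  // stored value, or log(0.00001) if 3 was never set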
}