Merge ../mosesdecoder into hieu

This commit is contained in:
Hieu Hoang 2014-06-08 17:07:41 +01:00
commit 3c6a31128d
18 changed files with 111 additions and 221 deletions

View File

@ -17,12 +17,8 @@ License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include "util/exception.hh"
#include "moses/TranslationModel/PhraseDictionaryMultiModelCounts.h"
#define LINE_MAX_LENGTH 100000
#include "phrase-extract/SafeGetline.h" // for SAFE_GETLINE()
using namespace std;
template<typename T>
@ -461,16 +457,14 @@ void PhraseDictionaryMultiModelCounts::LoadLexicalTable( string &fileName, lexic
}
istream *inFileP = &inFile;
char line[LINE_MAX_LENGTH];
int i=0;
while(true) {
string line;
while(getline(*inFileP, line)) {
i++;
if (i%100000 == 0) cerr << "." << flush;
SAFE_GETLINE((*inFileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
if (inFileP->eof()) break;
vector<string> token = tokenize( line );
vector<string> token = tokenize( line.c_str() );
if (token.size() != 4) {
cerr << "line " << i << " in " << fileName
<< " has wrong number of tokens, skipping:\n"

View File

@ -413,11 +413,9 @@ void FuzzyMatchWrapper::load_corpus( const std::string &fileName, vector< vector
istream *fileStreamP = &fileStream;
char line[LINE_MAX_LENGTH];
while(true) {
SAFE_GETLINE((*fileStreamP), line, LINE_MAX_LENGTH, '\n');
if (fileStreamP->eof()) break;
corpus.push_back( GetVocabulary().Tokenize( line ) );
string line;
while(getline(*fileStreamP, line)) {
corpus.push_back( GetVocabulary().Tokenize( line.c_str() ) );
}
}
@ -436,12 +434,9 @@ void FuzzyMatchWrapper::load_target(const std::string &fileName, vector< vector<
WORD_ID delimiter = GetVocabulary().StoreIfNew("|||");
int lineNum = 0;
char line[LINE_MAX_LENGTH];
while(true) {
SAFE_GETLINE((*fileStreamP), line, LINE_MAX_LENGTH, '\n');
if (fileStreamP->eof()) break;
vector<WORD_ID> toks = GetVocabulary().Tokenize( line );
string line;
while(getline(*fileStreamP, line)) {
vector<WORD_ID> toks = GetVocabulary().Tokenize( line.c_str() );
corpus.push_back(vector< SentenceAlignment >());
vector< SentenceAlignment > &vec = corpus.back();
@ -493,11 +488,8 @@ void FuzzyMatchWrapper::load_alignment(const std::string &fileName, vector< vect
string delimiter = "|||";
int lineNum = 0;
char line[LINE_MAX_LENGTH];
while(true) {
SAFE_GETLINE((*fileStreamP), line, LINE_MAX_LENGTH, '\n');
if (fileStreamP->eof()) break;
string line;
while(getline(*fileStreamP, line)) {
vector< SentenceAlignment > &vec = corpus[lineNum];
size_t targetInd = 0;
SentenceAlignment *sentence = &vec[targetInd];

View File

@ -14,17 +14,16 @@ SuffixArray::SuffixArray( string fileName )
m_endOfSentence = m_vcb.StoreIfNew( "<s>" );
ifstream extractFile;
char line[LINE_MAX_LENGTH];
// count the number of words first;
extractFile.open(fileName.c_str());
istream *fileP = &extractFile;
m_size = 0;
size_t sentenceCount = 0;
while(!fileP->eof()) {
SAFE_GETLINE((*fileP), line, LINE_MAX_LENGTH, '\n');
if (fileP->eof()) break;
vector< WORD_ID > words = m_vcb.Tokenize( line );
string line;
while(getline(*fileP, line)) {
vector< WORD_ID > words = m_vcb.Tokenize( line.c_str() );
m_size += words.size() + 1;
sentenceCount++;
}
@ -43,10 +42,8 @@ SuffixArray::SuffixArray( string fileName )
int sentenceId = 0;
extractFile.open(fileName.c_str());
fileP = &extractFile;
while(!fileP->eof()) {
SAFE_GETLINE((*fileP), line, LINE_MAX_LENGTH, '\n');
if (fileP->eof()) break;
vector< WORD_ID > words = m_vcb.Tokenize( line );
while(getline(*fileP, line)) {
vector< WORD_ID > words = m_vcb.Tokenize( line.c_str() );
// add to corpus vector
corpus.push_back(words);

View File

@ -17,20 +17,6 @@
namespace tmmt
{
#define MAX_LENGTH 10000
// SAFE_GETLINE(_IS, _LINE, _SIZE, _DELIM)
//
// Reads one _DELIM-terminated line from the input stream _IS into the
// character buffer _LINE, which must have room for _SIZE chars (including
// the terminating NUL written by istream::getline).
//
// If the buffer fills up before the delimiter is found, an error message is
// printed to std::cerr and the program terminates with exit status 1.
//
// Overflow is detected from the stream state (failbit set, but neither
// badbit nor eofbit) together with gcount(), not from gcount() alone:
// gcount() also equals _SIZE-1 when a line of _SIZE-2 characters plus its
// delimiter fits exactly, and that case must not abort.
//
// Any other recoverable failure (failbit without badbit/eofbit) is cleared
// so that reading can continue. A failure caused by end of file is left
// untouched, so callers can keep testing _IS.eof() after the call.
#define SAFE_GETLINE(_IS, _LINE, _SIZE, _DELIM) { \
    (_IS).getline(_LINE, _SIZE, _DELIM); \
    if ((_IS).fail() && !(_IS).bad() && !(_IS).eof()) { \
      if ((_IS).gcount() == (_SIZE)-1) { \
        std::cerr << "Line too long! Buffer overflow. Delete lines >=" \
                  << _SIZE << " chars or raise MAX_LENGTH in phrase-extract/tables-core.cpp" \
                  << std::endl; \
        std::exit(1); \
      } \
      (_IS).clear(); \
    } \
  }
typedef std::string WORD;
typedef unsigned int WORD_ID;

View File

@ -2,9 +2,6 @@
#include "ExtractionPhrasePair.h"
#include "tables-core.h"
#include "InputFileStream.h"
#include "SafeGetline.h"
#define TABLE_LINE_MAX_LENGTH 1000
using namespace std;
@ -16,12 +13,11 @@ void Domain::load( const std::string &domainFileName )
{
Moses::InputFileStream fileS( domainFileName );
istream *fileP = &fileS;
while(true) {
char line[TABLE_LINE_MAX_LENGTH];
SAFE_GETLINE((*fileP), line, TABLE_LINE_MAX_LENGTH, '\n', __FILE__);
if (fileP->eof()) break;
string line;
while(getline(*fileP, line)) {
// read
vector< string > domainSpecLine = tokenize( line );
vector< string > domainSpecLine = tokenize( line.c_str() );
int lineNumber;
if (domainSpecLine.size() != 2 ||
! sscanf(domainSpecLine[0].c_str(), "%d", &lineNumber)) {

View File

@ -19,7 +19,6 @@
#include <sstream>
#include "ExtractionPhrasePair.h"
#include "SafeGetline.h"
#include "tables-core.h"
#include "score.h"
#include "moses/Util.h"

View File

@ -1,35 +0,0 @@
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2010 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#pragma once
#ifndef SAFE_GETLINE_INCLUDED_
#define SAFE_GETLINE_INCLUDED_
// SAFE_GETLINE(_IS, _LINE, _SIZE, _DELIM, _FILE)
//
// Reads one _DELIM-terminated line from the input stream _IS into the
// character buffer _LINE, which must have room for _SIZE chars (including
// the terminating NUL written by istream::getline). _FILE is only used in
// the error message, to tell the user which source file defines the limit
// (callers pass __FILE__).
//
// If the buffer fills up before the delimiter is found, an error message is
// printed to std::cerr and the program terminates with exit status 1.
//
// Overflow is detected from the stream state (failbit set, but neither
// badbit nor eofbit) together with gcount(), not from gcount() alone:
// gcount() also equals _SIZE-1 when a line of _SIZE-2 characters plus its
// delimiter fits exactly, and that case must not abort.
//
// Any other recoverable failure (failbit without badbit/eofbit) is cleared
// so that reading can continue. A failure caused by end of file is left
// untouched, so callers can keep testing _IS.eof() after the call.
#define SAFE_GETLINE(_IS, _LINE, _SIZE, _DELIM, _FILE) { \
    (_IS).getline(_LINE, _SIZE, _DELIM); \
    if ((_IS).fail() && !(_IS).bad() && !(_IS).eof()) { \
      if ((_IS).gcount() == (_SIZE)-1) { \
        std::cerr << "Line too long! Buffer overflow. Delete lines >=" \
                  << _SIZE << " chars or raise LINE_MAX_LENGTH in " << _FILE \
                  << std::endl; \
        std::exit(1); \
      } \
      (_IS).clear(); \
    } \
  }
#endif

View File

@ -54,7 +54,11 @@ bool SentenceAlignment::processSourceSentence(const char * sourceString, int, bo
return true;
}
bool SentenceAlignment::create( char targetString[], char sourceString[], char alignmentString[], char weightString[], int sentenceID, bool boundaryRules)
bool SentenceAlignment::create(const char targetString[],
const char sourceString[],
const char alignmentString[],
const char weightString[],
int sentenceID, bool boundaryRules)
{
using namespace std;
this->sentenceID = sentenceID;

View File

@ -43,8 +43,11 @@ public:
virtual bool processSourceSentence(const char *, int, bool boundaryRules);
bool create(char targetString[], char sourceString[],
char alignmentString[], char weightString[], int sentenceID, bool boundaryRules);
bool create(const char targetString[],
const char sourceString[],
const char alignmentString[],
const char weightString[],
int sentenceID, bool boundaryRules);
void invertAlignment();

View File

@ -26,16 +26,9 @@
#include "InputFileStream.h"
#include "OutputFileStream.h"
#include "SafeGetline.h"
#define LINE_MAX_LENGTH 10000
using namespace std;
char line[LINE_MAX_LENGTH];
vector< string > splitLine()
vector< string > splitLine(const char *line)
{
vector< string > item;
int start=0;
@ -61,14 +54,15 @@ bool getLine( istream &fileP, vector< string > &item )
{
if (fileP.eof())
return false;
SAFE_GETLINE((fileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
if (fileP.eof())
string line;
if (getline(fileP, line)) {
item = splitLine(line.c_str());
return false;
item = splitLine();
return true;
}
else {
return false;
}
}

View File

@ -26,7 +26,6 @@
#include <cstring>
#include "tables-core.h"
#include "SafeGetline.h"
#include "InputFileStream.h"
#include "OutputFileStream.h"

View File

@ -27,23 +27,19 @@
#include <cstring>
#include "tables-core.h"
#include "SafeGetline.h"
#include "InputFileStream.h"
#define LINE_MAX_LENGTH 10000
using namespace std;
bool hierarchicalFlag = false;
bool onlyDirectFlag = false;
bool phraseCountFlag = true;
bool logProbFlag = false;
char line[LINE_MAX_LENGTH];
void processFiles( char*, char*, char* );
bool getLine( istream &fileP, vector< string > &item );
string reverseAlignment(const string &alignments);
vector< string > splitLine();
vector< string > splitLine(const char *lin);
inline void Tokenize(std::vector<std::string> &output
, const std::string& str
@ -190,17 +186,18 @@ bool getLine( istream &fileP, vector< string > &item )
{
if (fileP.eof())
return false;
SAFE_GETLINE((fileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
if (fileP.eof())
string line;
if (getline(fileP, line)) {
item = splitLine(line.c_str());
return false;
item = splitLine();
return true;
}
else {
return false;
}
}
vector< string > splitLine()
vector< string > splitLine(const char *line)
{
vector< string > item;
bool betweenWords = true;

View File

@ -19,7 +19,6 @@
#include <set>
#include <vector>
#include "SafeGetline.h"
#include "SentenceAlignment.h"
#include "tables-core.h"
#include "InputFileStream.h"
@ -32,10 +31,6 @@ using namespace MosesTraining;
namespace MosesTraining
{
const long int LINE_MAX_LENGTH = 500000 ;
// HPhraseVertex represents a point in the alignment matrix
typedef pair <int, int> HPhraseVertex;
@ -277,20 +272,18 @@ int main(int argc, char* argv[])
int i = sentenceOffset;
while(true) {
string englishString, foreignString, alignmentString, weightString;
while(getline(*eFileP, englishString)) {
i++;
if (i%10000 == 0) cerr << "." << flush;
char englishString[LINE_MAX_LENGTH];
char foreignString[LINE_MAX_LENGTH];
char alignmentString[LINE_MAX_LENGTH];
char weightString[LINE_MAX_LENGTH];
SAFE_GETLINE((*eFileP), englishString, LINE_MAX_LENGTH, '\n', __FILE__);
if (eFileP->eof()) break;
SAFE_GETLINE((*fFileP), foreignString, LINE_MAX_LENGTH, '\n', __FILE__);
SAFE_GETLINE((*aFileP), alignmentString, LINE_MAX_LENGTH, '\n', __FILE__);
getline(*fFileP, foreignString);
getline(*aFileP, alignmentString);
if (iwFileP) {
SAFE_GETLINE((*iwFileP), weightString, LINE_MAX_LENGTH, '\n', __FILE__);
getline(*iwFileP, weightString);
}
SentenceAlignment sentence;
// cout << "read in: " << englishString << " & " << foreignString << " & " << alignmentString << endl;
//az: output src, tgt, and alignment line
@ -300,7 +293,11 @@ int main(int argc, char* argv[])
cout << "LOG: ALT: " << alignmentString << endl;
cout << "LOG: PHRASES_BEGIN:" << endl;
}
if (sentence.create( englishString, foreignString, alignmentString, weightString, i, false)) {
if (sentence.create( englishString.c_str(),
foreignString.c_str(),
alignmentString.c_str(),
weightString.c_str(),
i, false)) {
if (options.placeholders.size()) {
sentence.invertAlignment();
}

View File

@ -19,7 +19,6 @@
#include <set>
#include <vector>
#include "SafeGetline.h"
#include "SentenceAlignment.h"
#include "tables-core.h"
#include "InputFileStream.h"
@ -32,10 +31,6 @@ using namespace MosesTraining;
namespace MosesTraining
{
const long int LINE_MAX_LENGTH = 500000 ;
// HPhraseVertex represents a point in the alignment matrix
typedef pair <int, int> HPhraseVertex;
@ -246,20 +241,20 @@ int main(int argc, char* argv[])
int i = sentenceOffset;
while(true) {
string englishString, foreignString, alignmentString, weightString;
while(getline(*eFileP, englishString)) {
i++;
if (i%10000 == 0) cerr << "." << flush;
char englishString[LINE_MAX_LENGTH];
char foreignString[LINE_MAX_LENGTH];
char alignmentString[LINE_MAX_LENGTH];
char weightString[LINE_MAX_LENGTH];
SAFE_GETLINE((*eFileP), englishString, LINE_MAX_LENGTH, '\n', __FILE__);
if (eFileP->eof()) break;
SAFE_GETLINE((*fFileP), foreignString, LINE_MAX_LENGTH, '\n', __FILE__);
SAFE_GETLINE((*aFileP), alignmentString, LINE_MAX_LENGTH, '\n', __FILE__);
getline(*eFileP, englishString);
getline(*fFileP, foreignString);
getline(*aFileP, alignmentString);
if (iwFileP) {
SAFE_GETLINE((*iwFileP), weightString, LINE_MAX_LENGTH, '\n', __FILE__);
getline(*iwFileP, weightString);
}
if (i%10000 == 0) cerr << "." << flush;
SentenceAlignment sentence;
// cout << "read in: " << englishString << " & " << foreignString << " & " << alignmentString << endl;
//az: output src, tgt, and alignment line
@ -269,7 +264,7 @@ int main(int argc, char* argv[])
cout << "LOG: ALT: " << alignmentString << endl;
cout << "LOG: PHRASES_BEGIN:" << endl;
}
if (sentence.create( englishString, foreignString, alignmentString, weightString, i, false)) {
if (sentence.create( englishString.c_str(), foreignString.c_str(), alignmentString.c_str(), weightString.c_str(), i, false)) {
ExtractTask *task = new ExtractTask(i-1, sentence, options, extractFileOrientation);
task->Run();
delete task;

View File

@ -39,7 +39,6 @@
#include "Hole.h"
#include "HoleCollection.h"
#include "RuleExist.h"
#include "SafeGetline.h"
#include "SentenceAlignmentWithSyntax.h"
#include "SyntaxTree.h"
#include "tables-core.h"
@ -47,8 +46,6 @@
#include "InputFileStream.h"
#include "OutputFileStream.h"
#define LINE_MAX_LENGTH 500000
using namespace std;
using namespace MosesTraining;
@ -326,17 +323,15 @@ int main(int argc, char* argv[])
// loop through all sentence pairs
size_t i=sentenceOffset;
while(true) {
i++;
if (i%1000 == 0) cerr << i << " " << flush;
string targetString, sourceString, alignmentString;
char targetString[LINE_MAX_LENGTH];
char sourceString[LINE_MAX_LENGTH];
char alignmentString[LINE_MAX_LENGTH];
SAFE_GETLINE((*tFileP), targetString, LINE_MAX_LENGTH, '\n', __FILE__);
if (tFileP->eof()) break;
SAFE_GETLINE((*sFileP), sourceString, LINE_MAX_LENGTH, '\n', __FILE__);
SAFE_GETLINE((*aFileP), alignmentString, LINE_MAX_LENGTH, '\n', __FILE__);
while(getline(*tFileP, targetString)) {
i++;
getline(*sFileP, sourceString);
getline(*aFileP, alignmentString);
if (i%1000 == 0) cerr << i << " " << flush;
SentenceAlignmentWithSyntax sentence
(targetLabelCollection, sourceLabelCollection,
@ -349,7 +344,7 @@ int main(int argc, char* argv[])
cout << "LOG: PHRASES_BEGIN:" << endl;
}
if (sentence.create(targetString, sourceString, alignmentString,"", i, options.boundaryRules)) {
if (sentence.create(targetString.c_str(), sourceString.c_str(), alignmentString.c_str(),"", i, options.boundaryRules)) {
if (options.unknownWordLabelFlag) {
collectWordLabelCounts(sentence);
}

View File

@ -33,17 +33,13 @@ int main(int argc, char* argv[])
// loop through all sentences
int i=0;
char inBuffer[LINE_MAX_LENGTH];
while(true) {
string inBuffer;
while(getline(cin, inBuffer)) {
i++;
if (i%1000 == 0) cerr << "." << flush;
if (i%10000 == 0) cerr << ":" << flush;
if (i%100000 == 0) cerr << "!" << flush;
// get line from stdin
SAFE_GETLINE( cin, inBuffer, LINE_MAX_LENGTH, '\n', __FILE__);
if (cin.eof()) break;
// process into syntax tree representation
string inBufferString = string( inBuffer );
set< string > labelCollection; // set of labels, not used

View File

@ -29,7 +29,6 @@
#include <vector>
#include <algorithm>
#include "SafeGetline.h"
#include "ScoreFeature.h"
#include "tables-core.h"
#include "ExtractionPhrasePair.h"
@ -40,8 +39,6 @@
using namespace std;
using namespace MosesTraining;
#define LINE_MAX_LENGTH 100000
namespace MosesTraining
{
LexicalTable lexTable;
@ -236,7 +233,7 @@ int main(int argc, char* argv[])
}
// loop through all extracted phrase translations
char line[LINE_MAX_LENGTH], lastLine[LINE_MAX_LENGTH];
string line, lastLine;
lastLine[0] = '\0';
ExtractionPhrasePair *phrasePair = NULL;
std::vector< ExtractionPhrasePair* > phrasePairsWithSameSource;
@ -249,8 +246,8 @@ int main(int argc, char* argv[])
float tmpCount=0.0f, tmpPcfgSum=0.0f;
int i=0;
SAFE_GETLINE( (extractFileP), line, LINE_MAX_LENGTH, '\n', __FILE__ );
if ( !extractFileP.eof() ) {
// TODO why read only the 1st line?
if ( getline(extractFileP, line)) {
++i;
tmpPhraseSource = new PHRASE();
tmpPhraseTarget = new PHRASE();
@ -269,23 +266,21 @@ int main(int argc, char* argv[])
if ( hierarchicalFlag ) {
phrasePairsWithSameSourceAndTarget.push_back( phrasePair );
}
strcpy( lastLine, line );
SAFE_GETLINE( (extractFileP), line, LINE_MAX_LENGTH, '\n', __FILE__ );
lastLine = line;
}
while ( !extractFileP.eof() ) {
while ( getline(extractFileP, line) ) {
if ( ++i % 100000 == 0 ) {
std::cerr << "." << std::flush;
}
// identical to last line? just add count
if (strcmp(line,lastLine) == 0) {
if (line == lastLine) {
phrasePair->IncrementPrevious(tmpCount,tmpPcfgSum);
SAFE_GETLINE((extractFileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
continue;
} else {
strcpy( lastLine, line );
lastLine = line;
}
tmpPhraseSource = new PHRASE();
@ -363,8 +358,6 @@ int main(int argc, char* argv[])
}
}
SAFE_GETLINE((extractFileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
}
processPhrasePairs( phrasePairsWithSameSource, *phraseTableFile, featureManager, maybeLogProb );
@ -758,11 +751,9 @@ void loadFunctionWords( const string &fileName )
}
istream *inFileP = &inFile;
char line[LINE_MAX_LENGTH];
while(true) {
SAFE_GETLINE((*inFileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
if (inFileP->eof()) break;
std::vector<string> token = tokenize( line );
string line;
while(getline(*inFileP, line)) {
std::vector<string> token = tokenize( line.c_str() );
if (token.size() > 0)
functionWordList.insert( token[0] );
}
@ -807,16 +798,13 @@ void LexicalTable::load( const string &fileName )
}
istream *inFileP = &inFile;
char line[LINE_MAX_LENGTH];
string line;
int i=0;
while(true) {
while(getline(*inFileP, line)) {
i++;
if (i%100000 == 0) std::cerr << "." << flush;
SAFE_GETLINE((*inFileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
if (inFileP->eof()) break;
std::vector<string> token = tokenize( line );
std::vector<string> token = tokenize( line.c_str() );
if (token.size() != 3) {
std::cerr << "line " << i << " in " << fileName
<< " has wrong number of tokens, skipping:" << std::endl

View File

@ -12,15 +12,12 @@
#include <time.h>
#include "AlignmentPhrase.h"
#include "SafeGetline.h"
#include "tables-core.h"
#include "InputFileStream.h"
using namespace std;
using namespace MosesTraining;
#define LINE_MAX_LENGTH 10000
namespace MosesTraining
{
@ -31,7 +28,7 @@ public:
vector< vector<size_t> > alignedToE;
vector< vector<size_t> > alignedToF;
bool create( char*, int );
bool create( const char*, int );
void clear();
bool equals( const PhraseAlignment& );
};
@ -106,16 +103,14 @@ int main(int argc, char* argv[])
vector< PhraseAlignment > phrasePairsWithSameF;
int i=0;
int fileCount = 0;
while(true) {
string line;
while(getline(extractFileP, line)) {
if (extractFileP.eof()) break;
if (++i % 100000 == 0) cerr << "." << flush;
char line[LINE_MAX_LENGTH];
SAFE_GETLINE((extractFileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
// if (fileCount>0)
if (extractFileP.eof())
break;
PhraseAlignment phrasePair;
bool isPhrasePair = phrasePair.create( line, i );
bool isPhrasePair = phrasePair.create( line.c_str(), i );
if (lastForeign >= 0 && lastForeign != phrasePair.foreign) {
processPhrasePairs( phrasePairsWithSameF );
for(size_t j=0; j<phrasePairsWithSameF.size(); j++)
@ -124,7 +119,7 @@ int main(int argc, char* argv[])
phraseTableE.clear();
phraseTableF.clear();
phrasePair.clear(); // process line again, since phrase tables flushed
phrasePair.create( line, i );
phrasePair.create( line.c_str(), i );
phrasePairBase = 0;
}
lastForeign = phrasePair.foreign;
@ -242,7 +237,7 @@ void processPhrasePairs( vector< PhraseAlignment > &phrasePair )
}
}
bool PhraseAlignment::create( char line[], int lineID )
bool PhraseAlignment::create(const char line[], int lineID )
{
vector< string > token = tokenize( line );
int item = 1;
@ -321,16 +316,14 @@ void LexicalTable::load( const string &filePath )
}
istream *inFileP = &inFile;
char line[LINE_MAX_LENGTH];
string line;
int i=0;
while(true) {
while(getline(*inFileP, line)) {
i++;
if (i%100000 == 0) cerr << "." << flush;
SAFE_GETLINE((*inFileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
if (inFileP->eof()) break;
vector<string> token = tokenize( line );
vector<string> token = tokenize( line.c_str() );
if (token.size() != 3) {
cerr << "line " << i << " in " << filePath << " has wrong number of tokens, skipping:\n" <<
token.size() << " " << token[0] << " " << line << endl;