mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-25 12:52:29 +03:00
Merge ../mosesdecoder into hieu
This commit is contained in:
commit
3c6a31128d
@ -17,12 +17,8 @@ License along with this library; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
***********************************************************************/
|
||||
#include "util/exception.hh"
|
||||
|
||||
#include "moses/TranslationModel/PhraseDictionaryMultiModelCounts.h"
|
||||
|
||||
#define LINE_MAX_LENGTH 100000
|
||||
#include "phrase-extract/SafeGetline.h" // for SAFE_GETLINE()
|
||||
|
||||
using namespace std;
|
||||
|
||||
template<typename T>
|
||||
@ -461,16 +457,14 @@ void PhraseDictionaryMultiModelCounts::LoadLexicalTable( string &fileName, lexic
|
||||
}
|
||||
istream *inFileP = &inFile;
|
||||
|
||||
char line[LINE_MAX_LENGTH];
|
||||
|
||||
int i=0;
|
||||
while(true) {
|
||||
string line;
|
||||
|
||||
while(getline(*inFileP, line)) {
|
||||
i++;
|
||||
if (i%100000 == 0) cerr << "." << flush;
|
||||
SAFE_GETLINE((*inFileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
|
||||
if (inFileP->eof()) break;
|
||||
|
||||
vector<string> token = tokenize( line );
|
||||
vector<string> token = tokenize( line.c_str() );
|
||||
if (token.size() != 4) {
|
||||
cerr << "line " << i << " in " << fileName
|
||||
<< " has wrong number of tokens, skipping:\n"
|
||||
|
@ -413,11 +413,9 @@ void FuzzyMatchWrapper::load_corpus( const std::string &fileName, vector< vector
|
||||
|
||||
istream *fileStreamP = &fileStream;
|
||||
|
||||
char line[LINE_MAX_LENGTH];
|
||||
while(true) {
|
||||
SAFE_GETLINE((*fileStreamP), line, LINE_MAX_LENGTH, '\n');
|
||||
if (fileStreamP->eof()) break;
|
||||
corpus.push_back( GetVocabulary().Tokenize( line ) );
|
||||
string line;
|
||||
while(getline(*fileStreamP, line)) {
|
||||
corpus.push_back( GetVocabulary().Tokenize( line.c_str() ) );
|
||||
}
|
||||
}
|
||||
|
||||
@ -436,12 +434,9 @@ void FuzzyMatchWrapper::load_target(const std::string &fileName, vector< vector<
|
||||
WORD_ID delimiter = GetVocabulary().StoreIfNew("|||");
|
||||
|
||||
int lineNum = 0;
|
||||
char line[LINE_MAX_LENGTH];
|
||||
while(true) {
|
||||
SAFE_GETLINE((*fileStreamP), line, LINE_MAX_LENGTH, '\n');
|
||||
if (fileStreamP->eof()) break;
|
||||
|
||||
vector<WORD_ID> toks = GetVocabulary().Tokenize( line );
|
||||
string line;
|
||||
while(getline(*fileStreamP, line)) {
|
||||
vector<WORD_ID> toks = GetVocabulary().Tokenize( line.c_str() );
|
||||
|
||||
corpus.push_back(vector< SentenceAlignment >());
|
||||
vector< SentenceAlignment > &vec = corpus.back();
|
||||
@ -493,11 +488,8 @@ void FuzzyMatchWrapper::load_alignment(const std::string &fileName, vector< vect
|
||||
string delimiter = "|||";
|
||||
|
||||
int lineNum = 0;
|
||||
char line[LINE_MAX_LENGTH];
|
||||
while(true) {
|
||||
SAFE_GETLINE((*fileStreamP), line, LINE_MAX_LENGTH, '\n');
|
||||
if (fileStreamP->eof()) break;
|
||||
|
||||
string line;
|
||||
while(getline(*fileStreamP, line)) {
|
||||
vector< SentenceAlignment > &vec = corpus[lineNum];
|
||||
size_t targetInd = 0;
|
||||
SentenceAlignment *sentence = &vec[targetInd];
|
||||
|
@ -14,17 +14,16 @@ SuffixArray::SuffixArray( string fileName )
|
||||
m_endOfSentence = m_vcb.StoreIfNew( "<s>" );
|
||||
|
||||
ifstream extractFile;
|
||||
char line[LINE_MAX_LENGTH];
|
||||
|
||||
// count the number of words first;
|
||||
extractFile.open(fileName.c_str());
|
||||
istream *fileP = &extractFile;
|
||||
m_size = 0;
|
||||
size_t sentenceCount = 0;
|
||||
while(!fileP->eof()) {
|
||||
SAFE_GETLINE((*fileP), line, LINE_MAX_LENGTH, '\n');
|
||||
if (fileP->eof()) break;
|
||||
vector< WORD_ID > words = m_vcb.Tokenize( line );
|
||||
string line;
|
||||
while(getline(*fileP, line)) {
|
||||
|
||||
vector< WORD_ID > words = m_vcb.Tokenize( line.c_str() );
|
||||
m_size += words.size() + 1;
|
||||
sentenceCount++;
|
||||
}
|
||||
@ -43,10 +42,8 @@ SuffixArray::SuffixArray( string fileName )
|
||||
int sentenceId = 0;
|
||||
extractFile.open(fileName.c_str());
|
||||
fileP = &extractFile;
|
||||
while(!fileP->eof()) {
|
||||
SAFE_GETLINE((*fileP), line, LINE_MAX_LENGTH, '\n');
|
||||
if (fileP->eof()) break;
|
||||
vector< WORD_ID > words = m_vcb.Tokenize( line );
|
||||
while(getline(*fileP, line)) {
|
||||
vector< WORD_ID > words = m_vcb.Tokenize( line.c_str() );
|
||||
|
||||
// add to corpus vector
|
||||
corpus.push_back(words);
|
||||
|
@ -17,20 +17,6 @@
|
||||
|
||||
namespace tmmt
|
||||
{
|
||||
|
||||
#define MAX_LENGTH 10000
|
||||
|
||||
#define SAFE_GETLINE(_IS, _LINE, _SIZE, _DELIM) { \
|
||||
_IS.getline(_LINE, _SIZE, _DELIM); \
|
||||
if(_IS.fail() && !_IS.bad() && !_IS.eof()) _IS.clear(); \
|
||||
if (_IS.gcount() == _SIZE-1) { \
|
||||
cerr << "Line too long! Buffer overflow. Delete lines >=" \
|
||||
<< _SIZE << " chars or raise MAX_LENGTH in phrase-extract/tables-core.cpp" \
|
||||
<< endl; \
|
||||
exit(1); \
|
||||
} \
|
||||
}
|
||||
|
||||
typedef std::string WORD;
|
||||
typedef unsigned int WORD_ID;
|
||||
|
||||
|
@ -2,9 +2,6 @@
|
||||
#include "ExtractionPhrasePair.h"
|
||||
#include "tables-core.h"
|
||||
#include "InputFileStream.h"
|
||||
#include "SafeGetline.h"
|
||||
|
||||
#define TABLE_LINE_MAX_LENGTH 1000
|
||||
|
||||
using namespace std;
|
||||
|
||||
@ -16,12 +13,11 @@ void Domain::load( const std::string &domainFileName )
|
||||
{
|
||||
Moses::InputFileStream fileS( domainFileName );
|
||||
istream *fileP = &fileS;
|
||||
while(true) {
|
||||
char line[TABLE_LINE_MAX_LENGTH];
|
||||
SAFE_GETLINE((*fileP), line, TABLE_LINE_MAX_LENGTH, '\n', __FILE__);
|
||||
if (fileP->eof()) break;
|
||||
|
||||
string line;
|
||||
while(getline(*fileP, line)) {
|
||||
// read
|
||||
vector< string > domainSpecLine = tokenize( line );
|
||||
vector< string > domainSpecLine = tokenize( line.c_str() );
|
||||
int lineNumber;
|
||||
if (domainSpecLine.size() != 2 ||
|
||||
! sscanf(domainSpecLine[0].c_str(), "%d", &lineNumber)) {
|
||||
|
@ -19,7 +19,6 @@
|
||||
|
||||
#include <sstream>
|
||||
#include "ExtractionPhrasePair.h"
|
||||
#include "SafeGetline.h"
|
||||
#include "tables-core.h"
|
||||
#include "score.h"
|
||||
#include "moses/Util.h"
|
||||
|
@ -1,35 +0,0 @@
|
||||
/***********************************************************************
|
||||
Moses - factored phrase-based language decoder
|
||||
Copyright (C) 2010 University of Edinburgh
|
||||
|
||||
This library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
This library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with this library; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
***********************************************************************/
|
||||
|
||||
#pragma once
|
||||
#ifndef SAFE_GETLINE_INCLUDED_
|
||||
#define SAFE_GETLINE_INCLUDED_
|
||||
|
||||
#define SAFE_GETLINE(_IS, _LINE, _SIZE, _DELIM, _FILE) { \
|
||||
_IS.getline(_LINE, _SIZE, _DELIM); \
|
||||
if(_IS.fail() && !_IS.bad() && !_IS.eof()) _IS.clear(); \
|
||||
if (_IS.gcount() == _SIZE-1) { \
|
||||
cerr << "Line too long! Buffer overflow. Delete lines >=" \
|
||||
<< _SIZE << " chars or raise LINE_MAX_LENGTH in " << _FILE \
|
||||
<< endl; \
|
||||
exit(1); \
|
||||
} \
|
||||
}
|
||||
|
||||
#endif
|
@ -54,7 +54,11 @@ bool SentenceAlignment::processSourceSentence(const char * sourceString, int, bo
|
||||
return true;
|
||||
}
|
||||
|
||||
bool SentenceAlignment::create( char targetString[], char sourceString[], char alignmentString[], char weightString[], int sentenceID, bool boundaryRules)
|
||||
bool SentenceAlignment::create(const char targetString[],
|
||||
const char sourceString[],
|
||||
const char alignmentString[],
|
||||
const char weightString[],
|
||||
int sentenceID, bool boundaryRules)
|
||||
{
|
||||
using namespace std;
|
||||
this->sentenceID = sentenceID;
|
||||
|
@ -43,8 +43,11 @@ public:
|
||||
|
||||
virtual bool processSourceSentence(const char *, int, bool boundaryRules);
|
||||
|
||||
bool create(char targetString[], char sourceString[],
|
||||
char alignmentString[], char weightString[], int sentenceID, bool boundaryRules);
|
||||
bool create(const char targetString[],
|
||||
const char sourceString[],
|
||||
const char alignmentString[],
|
||||
const char weightString[],
|
||||
int sentenceID, bool boundaryRules);
|
||||
|
||||
void invertAlignment();
|
||||
|
||||
|
@ -26,16 +26,9 @@
|
||||
#include "InputFileStream.h"
|
||||
#include "OutputFileStream.h"
|
||||
|
||||
#include "SafeGetline.h"
|
||||
|
||||
#define LINE_MAX_LENGTH 10000
|
||||
|
||||
using namespace std;
|
||||
|
||||
char line[LINE_MAX_LENGTH];
|
||||
|
||||
|
||||
vector< string > splitLine()
|
||||
vector< string > splitLine(const char *line)
|
||||
{
|
||||
vector< string > item;
|
||||
int start=0;
|
||||
@ -61,14 +54,15 @@ bool getLine( istream &fileP, vector< string > &item )
|
||||
{
|
||||
if (fileP.eof())
|
||||
return false;
|
||||
|
||||
SAFE_GETLINE((fileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
|
||||
if (fileP.eof())
|
||||
|
||||
string line;
|
||||
if (getline(fileP, line)) {
|
||||
item = splitLine(line.c_str());
|
||||
return false;
|
||||
|
||||
item = splitLine();
|
||||
|
||||
return true;
|
||||
}
|
||||
else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
@ -26,7 +26,6 @@
|
||||
#include <cstring>
|
||||
|
||||
#include "tables-core.h"
|
||||
#include "SafeGetline.h"
|
||||
#include "InputFileStream.h"
|
||||
#include "OutputFileStream.h"
|
||||
|
||||
|
@ -27,23 +27,19 @@
|
||||
#include <cstring>
|
||||
|
||||
#include "tables-core.h"
|
||||
#include "SafeGetline.h"
|
||||
#include "InputFileStream.h"
|
||||
|
||||
#define LINE_MAX_LENGTH 10000
|
||||
|
||||
using namespace std;
|
||||
|
||||
bool hierarchicalFlag = false;
|
||||
bool onlyDirectFlag = false;
|
||||
bool phraseCountFlag = true;
|
||||
bool logProbFlag = false;
|
||||
char line[LINE_MAX_LENGTH];
|
||||
|
||||
void processFiles( char*, char*, char* );
|
||||
bool getLine( istream &fileP, vector< string > &item );
|
||||
string reverseAlignment(const string &alignments);
|
||||
vector< string > splitLine();
|
||||
vector< string > splitLine(const char *lin);
|
||||
|
||||
inline void Tokenize(std::vector<std::string> &output
|
||||
, const std::string& str
|
||||
@ -190,17 +186,18 @@ bool getLine( istream &fileP, vector< string > &item )
|
||||
{
|
||||
if (fileP.eof())
|
||||
return false;
|
||||
|
||||
SAFE_GETLINE((fileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
|
||||
if (fileP.eof())
|
||||
|
||||
string line;
|
||||
if (getline(fileP, line)) {
|
||||
item = splitLine(line.c_str());
|
||||
return false;
|
||||
|
||||
item = splitLine();
|
||||
|
||||
return true;
|
||||
}
|
||||
else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
vector< string > splitLine()
|
||||
vector< string > splitLine(const char *line)
|
||||
{
|
||||
vector< string > item;
|
||||
bool betweenWords = true;
|
||||
|
@ -19,7 +19,6 @@
|
||||
#include <set>
|
||||
#include <vector>
|
||||
|
||||
#include "SafeGetline.h"
|
||||
#include "SentenceAlignment.h"
|
||||
#include "tables-core.h"
|
||||
#include "InputFileStream.h"
|
||||
@ -32,10 +31,6 @@ using namespace MosesTraining;
|
||||
namespace MosesTraining
|
||||
{
|
||||
|
||||
|
||||
const long int LINE_MAX_LENGTH = 500000 ;
|
||||
|
||||
|
||||
// HPhraseVertex represents a point in the alignment matrix
|
||||
typedef pair <int, int> HPhraseVertex;
|
||||
|
||||
@ -277,20 +272,18 @@ int main(int argc, char* argv[])
|
||||
|
||||
int i = sentenceOffset;
|
||||
|
||||
while(true) {
|
||||
string englishString, foreignString, alignmentString, weightString;
|
||||
|
||||
while(getline(*eFileP, englishString)) {
|
||||
i++;
|
||||
if (i%10000 == 0) cerr << "." << flush;
|
||||
char englishString[LINE_MAX_LENGTH];
|
||||
char foreignString[LINE_MAX_LENGTH];
|
||||
char alignmentString[LINE_MAX_LENGTH];
|
||||
char weightString[LINE_MAX_LENGTH];
|
||||
SAFE_GETLINE((*eFileP), englishString, LINE_MAX_LENGTH, '\n', __FILE__);
|
||||
if (eFileP->eof()) break;
|
||||
SAFE_GETLINE((*fFileP), foreignString, LINE_MAX_LENGTH, '\n', __FILE__);
|
||||
SAFE_GETLINE((*aFileP), alignmentString, LINE_MAX_LENGTH, '\n', __FILE__);
|
||||
|
||||
getline(*fFileP, foreignString);
|
||||
getline(*aFileP, alignmentString);
|
||||
if (iwFileP) {
|
||||
SAFE_GETLINE((*iwFileP), weightString, LINE_MAX_LENGTH, '\n', __FILE__);
|
||||
getline(*iwFileP, weightString);
|
||||
}
|
||||
|
||||
SentenceAlignment sentence;
|
||||
// cout << "read in: " << englishString << " & " << foreignString << " & " << alignmentString << endl;
|
||||
//az: output src, tgt, and alingment line
|
||||
@ -300,7 +293,11 @@ int main(int argc, char* argv[])
|
||||
cout << "LOG: ALT: " << alignmentString << endl;
|
||||
cout << "LOG: PHRASES_BEGIN:" << endl;
|
||||
}
|
||||
if (sentence.create( englishString, foreignString, alignmentString, weightString, i, false)) {
|
||||
if (sentence.create( englishString.c_str(),
|
||||
foreignString.c_str(),
|
||||
alignmentString.c_str(),
|
||||
weightString.c_str(),
|
||||
i, false)) {
|
||||
if (options.placeholders.size()) {
|
||||
sentence.invertAlignment();
|
||||
}
|
||||
|
@ -19,7 +19,6 @@
|
||||
#include <set>
|
||||
#include <vector>
|
||||
|
||||
#include "SafeGetline.h"
|
||||
#include "SentenceAlignment.h"
|
||||
#include "tables-core.h"
|
||||
#include "InputFileStream.h"
|
||||
@ -32,10 +31,6 @@ using namespace MosesTraining;
|
||||
namespace MosesTraining
|
||||
{
|
||||
|
||||
|
||||
const long int LINE_MAX_LENGTH = 500000 ;
|
||||
|
||||
|
||||
// HPhraseVertex represents a point in the alignment matrix
|
||||
typedef pair <int, int> HPhraseVertex;
|
||||
|
||||
@ -246,20 +241,20 @@ int main(int argc, char* argv[])
|
||||
|
||||
int i = sentenceOffset;
|
||||
|
||||
while(true) {
|
||||
string englishString, foreignString, alignmentString, weightString;
|
||||
|
||||
while(getline(*eFileP, englishString)) {
|
||||
i++;
|
||||
if (i%10000 == 0) cerr << "." << flush;
|
||||
char englishString[LINE_MAX_LENGTH];
|
||||
char foreignString[LINE_MAX_LENGTH];
|
||||
char alignmentString[LINE_MAX_LENGTH];
|
||||
char weightString[LINE_MAX_LENGTH];
|
||||
SAFE_GETLINE((*eFileP), englishString, LINE_MAX_LENGTH, '\n', __FILE__);
|
||||
if (eFileP->eof()) break;
|
||||
SAFE_GETLINE((*fFileP), foreignString, LINE_MAX_LENGTH, '\n', __FILE__);
|
||||
SAFE_GETLINE((*aFileP), alignmentString, LINE_MAX_LENGTH, '\n', __FILE__);
|
||||
|
||||
getline(*eFileP, englishString);
|
||||
getline(*fFileP, foreignString);
|
||||
getline(*aFileP, alignmentString);
|
||||
if (iwFileP) {
|
||||
SAFE_GETLINE((*iwFileP), weightString, LINE_MAX_LENGTH, '\n', __FILE__);
|
||||
getline(*iwFileP, weightString);
|
||||
}
|
||||
|
||||
if (i%10000 == 0) cerr << "." << flush;
|
||||
|
||||
SentenceAlignment sentence;
|
||||
// cout << "read in: " << englishString << " & " << foreignString << " & " << alignmentString << endl;
|
||||
//az: output src, tgt, and alingment line
|
||||
@ -269,7 +264,7 @@ int main(int argc, char* argv[])
|
||||
cout << "LOG: ALT: " << alignmentString << endl;
|
||||
cout << "LOG: PHRASES_BEGIN:" << endl;
|
||||
}
|
||||
if (sentence.create( englishString, foreignString, alignmentString, weightString, i, false)) {
|
||||
if (sentence.create( englishString.c_str(), foreignString.c_str(), alignmentString.c_str(), weightString.c_str(), i, false)) {
|
||||
ExtractTask *task = new ExtractTask(i-1, sentence, options, extractFileOrientation);
|
||||
task->Run();
|
||||
delete task;
|
||||
|
@ -39,7 +39,6 @@
|
||||
#include "Hole.h"
|
||||
#include "HoleCollection.h"
|
||||
#include "RuleExist.h"
|
||||
#include "SafeGetline.h"
|
||||
#include "SentenceAlignmentWithSyntax.h"
|
||||
#include "SyntaxTree.h"
|
||||
#include "tables-core.h"
|
||||
@ -47,8 +46,6 @@
|
||||
#include "InputFileStream.h"
|
||||
#include "OutputFileStream.h"
|
||||
|
||||
#define LINE_MAX_LENGTH 500000
|
||||
|
||||
using namespace std;
|
||||
using namespace MosesTraining;
|
||||
|
||||
@ -326,17 +323,15 @@ int main(int argc, char* argv[])
|
||||
|
||||
// loop through all sentence pairs
|
||||
size_t i=sentenceOffset;
|
||||
while(true) {
|
||||
i++;
|
||||
if (i%1000 == 0) cerr << i << " " << flush;
|
||||
string targetString, sourceString, alignmentString;
|
||||
|
||||
char targetString[LINE_MAX_LENGTH];
|
||||
char sourceString[LINE_MAX_LENGTH];
|
||||
char alignmentString[LINE_MAX_LENGTH];
|
||||
SAFE_GETLINE((*tFileP), targetString, LINE_MAX_LENGTH, '\n', __FILE__);
|
||||
if (tFileP->eof()) break;
|
||||
SAFE_GETLINE((*sFileP), sourceString, LINE_MAX_LENGTH, '\n', __FILE__);
|
||||
SAFE_GETLINE((*aFileP), alignmentString, LINE_MAX_LENGTH, '\n', __FILE__);
|
||||
while(getline(*tFileP, targetString)) {
|
||||
i++;
|
||||
|
||||
getline(*sFileP, sourceString);
|
||||
getline(*aFileP, alignmentString);
|
||||
|
||||
if (i%1000 == 0) cerr << i << " " << flush;
|
||||
|
||||
SentenceAlignmentWithSyntax sentence
|
||||
(targetLabelCollection, sourceLabelCollection,
|
||||
@ -349,7 +344,7 @@ int main(int argc, char* argv[])
|
||||
cout << "LOG: PHRASES_BEGIN:" << endl;
|
||||
}
|
||||
|
||||
if (sentence.create(targetString, sourceString, alignmentString,"", i, options.boundaryRules)) {
|
||||
if (sentence.create(targetString.c_str(), sourceString.c_str(), alignmentString.c_str(),"", i, options.boundaryRules)) {
|
||||
if (options.unknownWordLabelFlag) {
|
||||
collectWordLabelCounts(sentence);
|
||||
}
|
||||
|
@ -33,17 +33,13 @@ int main(int argc, char* argv[])
|
||||
|
||||
// loop through all sentences
|
||||
int i=0;
|
||||
char inBuffer[LINE_MAX_LENGTH];
|
||||
while(true) {
|
||||
string inBuffer;
|
||||
while(getline(cin, inBuffer)) {
|
||||
i++;
|
||||
if (i%1000 == 0) cerr << "." << flush;
|
||||
if (i%10000 == 0) cerr << ":" << flush;
|
||||
if (i%100000 == 0) cerr << "!" << flush;
|
||||
|
||||
// get line from stdin
|
||||
SAFE_GETLINE( cin, inBuffer, LINE_MAX_LENGTH, '\n', __FILE__);
|
||||
if (cin.eof()) break;
|
||||
|
||||
// process into syntax tree representation
|
||||
string inBufferString = string( inBuffer );
|
||||
set< string > labelCollection; // set of labels, not used
|
||||
|
@ -29,7 +29,6 @@
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
|
||||
#include "SafeGetline.h"
|
||||
#include "ScoreFeature.h"
|
||||
#include "tables-core.h"
|
||||
#include "ExtractionPhrasePair.h"
|
||||
@ -40,8 +39,6 @@
|
||||
using namespace std;
|
||||
using namespace MosesTraining;
|
||||
|
||||
#define LINE_MAX_LENGTH 100000
|
||||
|
||||
namespace MosesTraining
|
||||
{
|
||||
LexicalTable lexTable;
|
||||
@ -236,7 +233,7 @@ int main(int argc, char* argv[])
|
||||
}
|
||||
|
||||
// loop through all extracted phrase translations
|
||||
char line[LINE_MAX_LENGTH], lastLine[LINE_MAX_LENGTH];
|
||||
string line, lastLine;
|
||||
lastLine[0] = '\0';
|
||||
ExtractionPhrasePair *phrasePair = NULL;
|
||||
std::vector< ExtractionPhrasePair* > phrasePairsWithSameSource;
|
||||
@ -249,8 +246,8 @@ int main(int argc, char* argv[])
|
||||
float tmpCount=0.0f, tmpPcfgSum=0.0f;
|
||||
|
||||
int i=0;
|
||||
SAFE_GETLINE( (extractFileP), line, LINE_MAX_LENGTH, '\n', __FILE__ );
|
||||
if ( !extractFileP.eof() ) {
|
||||
// TODO why read only the 1st line?
|
||||
if ( getline(extractFileP, line)) {
|
||||
++i;
|
||||
tmpPhraseSource = new PHRASE();
|
||||
tmpPhraseTarget = new PHRASE();
|
||||
@ -269,23 +266,21 @@ int main(int argc, char* argv[])
|
||||
if ( hierarchicalFlag ) {
|
||||
phrasePairsWithSameSourceAndTarget.push_back( phrasePair );
|
||||
}
|
||||
strcpy( lastLine, line );
|
||||
SAFE_GETLINE( (extractFileP), line, LINE_MAX_LENGTH, '\n', __FILE__ );
|
||||
lastLine = line;
|
||||
}
|
||||
|
||||
while ( !extractFileP.eof() ) {
|
||||
while ( getline(extractFileP, line) ) {
|
||||
|
||||
if ( ++i % 100000 == 0 ) {
|
||||
std::cerr << "." << std::flush;
|
||||
}
|
||||
|
||||
// identical to last line? just add count
|
||||
if (strcmp(line,lastLine) == 0) {
|
||||
if (line == lastLine) {
|
||||
phrasePair->IncrementPrevious(tmpCount,tmpPcfgSum);
|
||||
SAFE_GETLINE((extractFileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
|
||||
continue;
|
||||
} else {
|
||||
strcpy( lastLine, line );
|
||||
lastLine = line;
|
||||
}
|
||||
|
||||
tmpPhraseSource = new PHRASE();
|
||||
@ -363,8 +358,6 @@ int main(int argc, char* argv[])
|
||||
}
|
||||
}
|
||||
|
||||
SAFE_GETLINE((extractFileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
|
||||
|
||||
}
|
||||
|
||||
processPhrasePairs( phrasePairsWithSameSource, *phraseTableFile, featureManager, maybeLogProb );
|
||||
@ -758,11 +751,9 @@ void loadFunctionWords( const string &fileName )
|
||||
}
|
||||
istream *inFileP = &inFile;
|
||||
|
||||
char line[LINE_MAX_LENGTH];
|
||||
while(true) {
|
||||
SAFE_GETLINE((*inFileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
|
||||
if (inFileP->eof()) break;
|
||||
std::vector<string> token = tokenize( line );
|
||||
string line;
|
||||
while(getline(*inFileP, line)) {
|
||||
std::vector<string> token = tokenize( line.c_str() );
|
||||
if (token.size() > 0)
|
||||
functionWordList.insert( token[0] );
|
||||
}
|
||||
@ -807,16 +798,13 @@ void LexicalTable::load( const string &fileName )
|
||||
}
|
||||
istream *inFileP = &inFile;
|
||||
|
||||
char line[LINE_MAX_LENGTH];
|
||||
|
||||
string line;
|
||||
int i=0;
|
||||
while(true) {
|
||||
while(getline(*inFileP, line)) {
|
||||
i++;
|
||||
if (i%100000 == 0) std::cerr << "." << flush;
|
||||
SAFE_GETLINE((*inFileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
|
||||
if (inFileP->eof()) break;
|
||||
|
||||
std::vector<string> token = tokenize( line );
|
||||
std::vector<string> token = tokenize( line.c_str() );
|
||||
if (token.size() != 3) {
|
||||
std::cerr << "line " << i << " in " << fileName
|
||||
<< " has wrong number of tokens, skipping:" << std::endl
|
||||
|
@ -12,15 +12,12 @@
|
||||
#include <time.h>
|
||||
|
||||
#include "AlignmentPhrase.h"
|
||||
#include "SafeGetline.h"
|
||||
#include "tables-core.h"
|
||||
#include "InputFileStream.h"
|
||||
|
||||
using namespace std;
|
||||
using namespace MosesTraining;
|
||||
|
||||
#define LINE_MAX_LENGTH 10000
|
||||
|
||||
namespace MosesTraining
|
||||
{
|
||||
|
||||
@ -31,7 +28,7 @@ public:
|
||||
vector< vector<size_t> > alignedToE;
|
||||
vector< vector<size_t> > alignedToF;
|
||||
|
||||
bool create( char*, int );
|
||||
bool create( const char*, int );
|
||||
void clear();
|
||||
bool equals( const PhraseAlignment& );
|
||||
};
|
||||
@ -106,16 +103,14 @@ int main(int argc, char* argv[])
|
||||
vector< PhraseAlignment > phrasePairsWithSameF;
|
||||
int i=0;
|
||||
int fileCount = 0;
|
||||
while(true) {
|
||||
|
||||
string line;
|
||||
while(getline(extractFileP, line)) {
|
||||
if (extractFileP.eof()) break;
|
||||
if (++i % 100000 == 0) cerr << "." << flush;
|
||||
char line[LINE_MAX_LENGTH];
|
||||
SAFE_GETLINE((extractFileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
|
||||
// if (fileCount>0)
|
||||
if (extractFileP.eof())
|
||||
break;
|
||||
|
||||
PhraseAlignment phrasePair;
|
||||
bool isPhrasePair = phrasePair.create( line, i );
|
||||
bool isPhrasePair = phrasePair.create( line.c_str(), i );
|
||||
if (lastForeign >= 0 && lastForeign != phrasePair.foreign) {
|
||||
processPhrasePairs( phrasePairsWithSameF );
|
||||
for(size_t j=0; j<phrasePairsWithSameF.size(); j++)
|
||||
@ -124,7 +119,7 @@ int main(int argc, char* argv[])
|
||||
phraseTableE.clear();
|
||||
phraseTableF.clear();
|
||||
phrasePair.clear(); // process line again, since phrase tables flushed
|
||||
phrasePair.create( line, i );
|
||||
phrasePair.create( line.c_str(), i );
|
||||
phrasePairBase = 0;
|
||||
}
|
||||
lastForeign = phrasePair.foreign;
|
||||
@ -242,7 +237,7 @@ void processPhrasePairs( vector< PhraseAlignment > &phrasePair )
|
||||
}
|
||||
}
|
||||
|
||||
bool PhraseAlignment::create( char line[], int lineID )
|
||||
bool PhraseAlignment::create(const char line[], int lineID )
|
||||
{
|
||||
vector< string > token = tokenize( line );
|
||||
int item = 1;
|
||||
@ -321,16 +316,14 @@ void LexicalTable::load( const string &filePath )
|
||||
}
|
||||
istream *inFileP = &inFile;
|
||||
|
||||
char line[LINE_MAX_LENGTH];
|
||||
string line;
|
||||
|
||||
int i=0;
|
||||
while(true) {
|
||||
while(getline(*inFileP, line)) {
|
||||
i++;
|
||||
if (i%100000 == 0) cerr << "." << flush;
|
||||
SAFE_GETLINE((*inFileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
|
||||
if (inFileP->eof()) break;
|
||||
|
||||
vector<string> token = tokenize( line );
|
||||
vector<string> token = tokenize( line.c_str() );
|
||||
if (token.size() != 3) {
|
||||
cerr << "line " << i << " in " << filePath << " has wrong number of tokens, skipping:\n" <<
|
||||
token.size() << " " << token[0] << " " << line << endl;
|
||||
|
Loading…
Reference in New Issue
Block a user