some moderate modifications in phrase-extract/score-main.cpp

(e.g., use Moses::Scan<>() rather than atof()/atoi())
This commit is contained in:
Matthias Huck 2015-03-09 18:49:32 +00:00
parent 973fd98052
commit 559077f6f8

View File

@ -18,10 +18,6 @@
***********************************************************************/
#include <sstream>
#include <cstdio>
#include <iostream>
#include <fstream>
#include <stdlib.h>
#include <assert.h>
#include <cstring>
#include <map>
@ -38,7 +34,8 @@
#include "InputFileStream.h"
#include "OutputFileStream.h"
using namespace std;
#include "moses/Util.h"
using namespace boost::algorithm;
using namespace MosesTraining;
@ -96,7 +93,6 @@ Vocabulary vcbS;
} // namespace
std::vector<std::string> tokenize( const char [] );
void processLine( std::string line,
int lineID, bool includeSentenceIdFlag, int &sentenceId,
@ -109,18 +105,18 @@ void writeLeftHandSideLabelCounts( const boost::unordered_map<std::string,float>
const std::string &fileNameLeftHandSideSourceLabelCounts,
const std::string &fileNameLeftHandSideTargetSourceLabelCounts );
void writeLabelSet( const std::set<std::string> &labelSet, const std::string &fileName );
void processPhrasePairs( std::vector< ExtractionPhrasePair* > &phrasePairsWithSameSource, ostream &phraseTableFile,
void processPhrasePairs( std::vector< ExtractionPhrasePair* > &phrasePairsWithSameSource, std::ostream &phraseTableFile,
const ScoreFeatureManager& featureManager, const MaybeLog& maybeLogProb );
void outputPhrasePair(const ExtractionPhrasePair &phrasePair, float, int, ostream &phraseTableFile, const ScoreFeatureManager &featureManager, const MaybeLog &maybeLog );
void outputPhrasePair(const ExtractionPhrasePair &phrasePair, float, int, std::ostream &phraseTableFile, const ScoreFeatureManager &featureManager, const MaybeLog &maybeLog );
double computeLexicalTranslation( const PHRASE *phraseSource, const PHRASE *phraseTarget, const ALIGNMENT *alignmentTargetToSource );
double computeUnalignedPenalty( const ALIGNMENT *alignmentTargetToSource );
set<std::string> functionWordList;
std::set<std::string> functionWordList;
void loadOrientationPriors(const std::string &fileNamePhraseOrientationPriors, std::vector<float> &orientationClassPriorsL2R, std::vector<float> &orientationClassPriorsR2L);
void loadFunctionWords( const string &fileNameFunctionWords );
void loadFunctionWords( const std::string &fileNameFunctionWords );
double computeUnalignedFWPenalty( const PHRASE *phraseTarget, const ALIGNMENT *alignmentTargetToSource );
int calcCrossedNonTerm( const PHRASE *phraseTarget, const ALIGNMENT *alignmentTargetToSource );
void printSourcePhrase( const PHRASE *phraseSource, const PHRASE *phraseTarget, const ALIGNMENT *targetToSourceAlignment, ostream &out );
void printTargetPhrase( const PHRASE *phraseSource, const PHRASE *phraseTarget, const ALIGNMENT *targetToSourceAlignment, ostream &out );
void printSourcePhrase( const PHRASE *phraseSource, const PHRASE *phraseTarget, const ALIGNMENT *targetToSourceAlignment, std::ostream &out );
void printTargetPhrase( const PHRASE *phraseSource, const PHRASE *phraseTarget, const ALIGNMENT *targetToSourceAlignment, std::ostream &out );
void invertAlignment( const PHRASE *phraseSource, const PHRASE *phraseTarget, const ALIGNMENT *inTargetToSourceAlignment, ALIGNMENT *outSourceToTargetAlignment );
@ -228,7 +224,7 @@ int main(int argc, char* argv[])
negLogProb = -1;
std::cerr << "using negative log-probabilities" << std::endl;
} else if (strcmp(argv[i],"--MinCountHierarchical") == 0) {
minCountHierarchical = atof(argv[++i]);
minCountHierarchical = Moses::Scan<float>( argv[++i] );
std::cerr << "dropping all phrase pairs occurring less than " << minCountHierarchical << " times" << std::endl;
minCountHierarchical -= 0.00001; // account for rounding
} else if (strcmp(argv[i],"--CrossedNonTerm") == 0) {
@ -291,10 +287,9 @@ int main(int argc, char* argv[])
std::cerr << "ERROR: could not open extract file " << fileNameExtract << std::endl;
exit(1);
}
istream &extractFileP = extractFile;
// output file: phrase translation table
ostream *phraseTableFile;
std::ostream *phraseTableFile;
if (fileNamePhraseTable == "-") {
phraseTableFile = &std::cout;
@ -310,7 +305,7 @@ int main(int argc, char* argv[])
}
// loop through all extracted phrase translations
string line, lastLine;
std::string line, lastLine;
lastLine[0] = '\0';
ExtractionPhrasePair *phrasePair = NULL;
std::vector< ExtractionPhrasePair* > phrasePairsWithSameSource;
@ -323,8 +318,7 @@ int main(int argc, char* argv[])
float tmpCount=0.0f, tmpPcfgSum=0.0f;
int i=0;
// TODO why read only the 1st line?
if ( getline(extractFileP, line) ) {
if ( getline(extractFile, line) ) {
++i;
tmpPhraseSource = new PHRASE();
tmpPhraseTarget = new PHRASE();
@ -346,7 +340,7 @@ int main(int argc, char* argv[])
lastLine = line;
}
while ( getline(extractFileP, line) ) {
while ( getline(extractFile, line) ) {
if ( ++i % 100000 == 0 ) {
std::cerr << "." << std::flush;
@ -503,7 +497,8 @@ void processLine( std::string line,
phraseTarget->clear();
targetToSourceAlignment->clear();
std::vector<std::string> token = tokenize( line.c_str() );
std::vector<std::string> token;
Moses::Tokenize( token, line );
int item = 1;
for ( size_t j=0; j<token.size(); ++j ) {
if (token[j] == "|||") {
@ -534,7 +529,7 @@ void processLine( std::string line,
} else if (item + (includeSentenceIdFlag?-1:0) == 4) { // count
sscanf(token[j].c_str(), "%f", &count);
} else if (item + (includeSentenceIdFlag?-1:0) == 5) { // target syntax PCFG score
float pcfgScore = std::atof(token[j].c_str());
float pcfgScore = Moses::Scan<float>( token[j] );
pcfgSum = pcfgScore * count;
}
}
@ -548,17 +543,17 @@ void processLine( std::string line,
count = 1.0;
}
if (item < 3 || item > (includeSentenceIdFlag?7:6)) {
std::cerr << "ERROR: faulty line " << lineID << ": " << line << endl;
std::cerr << "ERROR: faulty line " << lineID << ": " << line << std::endl;
}
}
void writeCountOfCounts( const string &fileNameCountOfCounts )
void writeCountOfCounts( const std::string &fileNameCountOfCounts )
{
// open file
Moses::OutputFileStream countOfCountsFile;
bool success = countOfCountsFile.Open(fileNameCountOfCounts.c_str());
bool success = countOfCountsFile.Open(fileNameCountOfCounts);
if (!success) {
std::cerr << "ERROR: could not open count-of-counts file "
<< fileNameCountOfCounts << std::endl;
@ -583,7 +578,7 @@ void writeLeftHandSideLabelCounts( const boost::unordered_map<std::string,float>
{
// open file
Moses::OutputFileStream leftHandSideSourceLabelCounts;
bool success = leftHandSideSourceLabelCounts.Open(fileNameLeftHandSideSourceLabelCounts.c_str());
bool success = leftHandSideSourceLabelCounts.Open(fileNameLeftHandSideSourceLabelCounts);
if (!success) {
std::cerr << "ERROR: could not open left-hand side label counts file "
<< fileNameLeftHandSideSourceLabelCounts << std::endl;
@ -600,7 +595,7 @@ void writeLeftHandSideLabelCounts( const boost::unordered_map<std::string,float>
// open file
Moses::OutputFileStream leftHandSideTargetSourceLabelCounts;
success = leftHandSideTargetSourceLabelCounts.Open(fileNameLeftHandSideTargetSourceLabelCounts.c_str());
success = leftHandSideTargetSourceLabelCounts.Open(fileNameLeftHandSideTargetSourceLabelCounts);
if (!success) {
std::cerr << "ERROR: could not open left-hand side label joint counts file "
<< fileNameLeftHandSideTargetSourceLabelCounts << std::endl;
@ -624,7 +619,7 @@ void writeLabelSet( const std::set<std::string> &labelSet, const std::string &fi
{
// open file
Moses::OutputFileStream out;
bool success = out.Open(fileName.c_str());
bool success = out.Open(fileName);
if (!success) {
std::cerr << "ERROR: could not open file "
<< fileName << " for writing" << std::endl;
@ -640,7 +635,7 @@ void writeLabelSet( const std::set<std::string> &labelSet, const std::string &fi
}
void processPhrasePairs( std::vector< ExtractionPhrasePair* > &phrasePairsWithSameSource, ostream &phraseTableFile,
void processPhrasePairs( std::vector< ExtractionPhrasePair* > &phrasePairsWithSameSource, std::ostream &phraseTableFile,
const ScoreFeatureManager& featureManager, const MaybeLog& maybeLogProb )
{
if (phrasePairsWithSameSource.size() == 0) {
@ -668,7 +663,7 @@ void processPhrasePairs( std::vector< ExtractionPhrasePair* > &phrasePairsWithSa
void outputPhrasePair(const ExtractionPhrasePair &phrasePair,
float totalCount, int distinctCount,
ostream &phraseTableFile,
std::ostream &phraseTableFile,
const ScoreFeatureManager& featureManager,
const MaybeLog& maybeLogProb )
{
@ -677,7 +672,7 @@ void outputPhrasePair(const ExtractionPhrasePair &phrasePair,
const ALIGNMENT *bestAlignmentT2S = phrasePair.FindBestAlignmentTargetToSource();
float count = phrasePair.GetCount();
map< string, float > domainCount;
std::map< std::string, float > domainCount;
// collect count of count statistics
if (goodTuringFlag || kneserNeyFlag) {
@ -796,13 +791,13 @@ void outputPhrasePair(const ExtractionPhrasePair &phrasePair,
// extra features
ScoreFeatureContext context(phrasePair, maybeLogProb);
std::vector<float> extraDense;
map<string,float> extraSparse;
std::map<std::string,float> extraSparse;
featureManager.addFeatures(context, extraDense, extraSparse);
for (size_t i = 0; i < extraDense.size(); ++i) {
phraseTableFile << " " << extraDense[i];
}
for (map<string,float>::const_iterator i = extraSparse.begin();
for (std::map<std::string,float>::const_iterator i = extraSparse.begin();
i != extraSparse.end(); ++i) {
phraseTableFile << " " << i->first << " " << i->second;
}
@ -882,14 +877,14 @@ void outputPhrasePair(const ExtractionPhrasePair &phrasePair,
}
if (spanLength && !inverseFlag) {
string propValue = phrasePair.CollectAllPropertyValues("SpanLength");
std::string propValue = phrasePair.CollectAllPropertyValues("SpanLength");
if (!propValue.empty()) {
phraseTableFile << " {{SpanLength " << propValue << "}}";
}
}
if (nonTermContext && !inverseFlag) {
string propValue = phrasePair.CollectAllPropertyValues("NonTermContext");
std::string propValue = phrasePair.CollectAllPropertyValues("NonTermContext");
if (!propValue.empty()) {
phraseTableFile << " {{NonTermContext " << propValue << "}}";
}
@ -907,8 +902,7 @@ void loadOrientationPriors(const std::string &fileNamePhraseOrientationPriors,
assert(orientationClassPriorsL2R.size()==4 && orientationClassPriorsR2L.size()==4); // mono swap dleft dright
std::cerr << "Loading phrase orientation priors from " << fileNamePhraseOrientationPriors;
ifstream inFile;
inFile.open(fileNamePhraseOrientationPriors.c_str());
Moses::InputFileStream inFile(fileNamePhraseOrientationPriors);
if (inFile.fail()) {
std::cerr << " - ERROR: could not open file" << std::endl;
exit(1);
@ -919,7 +913,7 @@ void loadOrientationPriors(const std::string &fileNamePhraseOrientationPriors,
float l2rSum = 0;
float r2lSum = 0;
while (getline(inFile, line)) {
istringstream tokenizer(line);
std::istringstream tokenizer(line);
std::string key;
tokenizer >> key;
@ -983,7 +977,7 @@ void loadOrientationPriors(const std::string &fileNamePhraseOrientationPriors,
}
std::cerr << " - read " << linesRead << " lines from orientation priors file" << std::endl;
inFile.close();
inFile.Close();
}
@ -1038,7 +1032,7 @@ double computeUnalignedPenalty( const ALIGNMENT *alignmentTargetToSource )
double unaligned = 1.0;
// only checking target words - source words are caught when computing inverse
for(size_t ti=0; ti<alignmentTargetToSource->size(); ++ti) {
const set< size_t > & srcIndices = alignmentTargetToSource->at(ti);
const std::set< size_t > & srcIndices = alignmentTargetToSource->at(ti);
if (srcIndices.empty()) {
unaligned *= 2.718;
}
@ -1053,7 +1047,7 @@ double computeUnalignedFWPenalty( const PHRASE *phraseTarget, const ALIGNMENT *a
double unaligned = 1.0;
// only checking target words - source words are caught when computing inverse
for(size_t ti=0; ti<alignmentTargetToSource->size(); ++ti) {
const set< size_t > & srcIndices = alignmentTargetToSource->at(ti);
const std::set< size_t > & srcIndices = alignmentTargetToSource->at(ti);
if (srcIndices.empty() && functionWordList.find( vcbT.getWord( phraseTarget->at(ti) ) ) != functionWordList.end()) {
unaligned *= 2.718;
}
@ -1061,26 +1055,25 @@ double computeUnalignedFWPenalty( const PHRASE *phraseTarget, const ALIGNMENT *a
return unaligned;
}
void loadFunctionWords( const string &fileName )
void loadFunctionWords( const std::string &fileName )
{
std::cerr << "Loading function word list from " << fileName;
ifstream inFile;
inFile.open(fileName.c_str());
Moses::InputFileStream inFile(fileName);
if (inFile.fail()) {
std::cerr << " - ERROR: could not open file" << std::endl;
exit(1);
}
istream *inFileP = &inFile;
string line;
while(getline(*inFileP, line)) {
std::vector<string> token = tokenize( line.c_str() );
std::string line;
while(getline(inFile, line)) {
std::vector<std::string> token;
Moses::Tokenize( token, line );
if (token.size() > 0)
functionWordList.insert( token[0] );
}
std::cerr << " - read " << functionWordList.size() << " function words" << std::endl;
inFile.close();
inFile.Close();
}
@ -1091,14 +1084,14 @@ double computeLexicalTranslation( const PHRASE *phraseSource, const PHRASE *phra
int null = vcbS.getWordID("NULL");
// all target words have to be explained
for(size_t ti=0; ti<alignmentTargetToSource->size(); ti++) {
const set< size_t > & srcIndices = alignmentTargetToSource->at(ti);
const std::set< size_t > & srcIndices = alignmentTargetToSource->at(ti);
if (srcIndices.empty()) {
// explain unaligned word by NULL
lexScore *= lexTable.permissiveLookup( null, phraseTarget->at(ti) );
} else {
// go through all the aligned words to compute average
double thisWordScore = 0;
for (set< size_t >::const_iterator p(srcIndices.begin()); p != srcIndices.end(); ++p) {
for (std::set< size_t >::const_iterator p(srcIndices.begin()); p != srcIndices.end(); ++p) {
thisWordScore += lexTable.permissiveLookup( phraseSource->at(*p), phraseTarget->at(ti) );
}
lexScore *= thisWordScore / (double)srcIndices.size();
@ -1108,24 +1101,23 @@ double computeLexicalTranslation( const PHRASE *phraseSource, const PHRASE *phra
}
void LexicalTable::load( const string &fileName )
void LexicalTable::load( const std::string &fileName )
{
std::cerr << "Loading lexical translation table from " << fileName;
ifstream inFile;
inFile.open(fileName.c_str());
Moses::InputFileStream inFile(fileName);
if (inFile.fail()) {
std::cerr << " - ERROR: could not open file" << std::endl;
exit(1);
}
istream *inFileP = &inFile;
string line;
std::string line;
int i=0;
while(getline(*inFileP, line)) {
while(getline(inFile, line)) {
i++;
if (i%100000 == 0) std::cerr << "." << flush;
if (i%100000 == 0) std::cerr << "." << std::flush;
std::vector<string> token = tokenize( line.c_str() );
std::vector<std::string> token;
Moses::Tokenize( token, line );
if (token.size() != 3) {
std::cerr << "line " << i << " in " << fileName
<< " has wrong number of tokens, skipping:" << std::endl
@ -1133,7 +1125,7 @@ void LexicalTable::load( const string &fileName )
continue;
}
double prob = atof( token[2].c_str() );
double prob = Moses::Scan<double>( token[2] );
WORD_ID wordT = vcbT.storeIfNew( token[0] );
WORD_ID wordS = vcbS.storeIfNew( token[1] );
ltable[ wordS ][ wordT ] = prob;
@ -1143,7 +1135,7 @@ void LexicalTable::load( const string &fileName )
void printSourcePhrase(const PHRASE *phraseSource, const PHRASE *phraseTarget,
const ALIGNMENT *targetToSourceAlignment, ostream &out)
const ALIGNMENT *targetToSourceAlignment, std::ostream &out)
{
// get corresponding target non-terminal and output pair
ALIGNMENT *sourceToTargetAlignment = new ALIGNMENT();
@ -1175,7 +1167,7 @@ void printSourcePhrase(const PHRASE *phraseSource, const PHRASE *phraseTarget,
void printTargetPhrase(const PHRASE *phraseSource, const PHRASE *phraseTarget,
const ALIGNMENT *targetToSourceAlignment, ostream &out)
const ALIGNMENT *targetToSourceAlignment, std::ostream &out)
{
// output target symbols, except root, in rule table format
for (std::size_t i = 0; i < phraseTarget->size()-1; ++i) {