mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-09-19 07:07:24 +03:00
some moderate modifications in phrase-extract/score-main.cpp
(e.g., use Moses::Scan<>() rather than atof()/atoi())
This commit is contained in:
parent
973fd98052
commit
559077f6f8
@ -18,10 +18,6 @@
|
||||
***********************************************************************/
|
||||
|
||||
#include <sstream>
|
||||
#include <cstdio>
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
#include <stdlib.h>
|
||||
#include <assert.h>
|
||||
#include <cstring>
|
||||
#include <map>
|
||||
@ -38,7 +34,8 @@
|
||||
#include "InputFileStream.h"
|
||||
#include "OutputFileStream.h"
|
||||
|
||||
using namespace std;
|
||||
#include "moses/Util.h"
|
||||
|
||||
using namespace boost::algorithm;
|
||||
using namespace MosesTraining;
|
||||
|
||||
@ -96,7 +93,6 @@ Vocabulary vcbS;
|
||||
|
||||
} // namespace
|
||||
|
||||
std::vector<std::string> tokenize( const char [] );
|
||||
|
||||
void processLine( std::string line,
|
||||
int lineID, bool includeSentenceIdFlag, int &sentenceId,
|
||||
@ -109,18 +105,18 @@ void writeLeftHandSideLabelCounts( const boost::unordered_map<std::string,float>
|
||||
const std::string &fileNameLeftHandSideSourceLabelCounts,
|
||||
const std::string &fileNameLeftHandSideTargetSourceLabelCounts );
|
||||
void writeLabelSet( const std::set<std::string> &labelSet, const std::string &fileName );
|
||||
void processPhrasePairs( std::vector< ExtractionPhrasePair* > &phrasePairsWithSameSource, ostream &phraseTableFile,
|
||||
void processPhrasePairs( std::vector< ExtractionPhrasePair* > &phrasePairsWithSameSource, std::ostream &phraseTableFile,
|
||||
const ScoreFeatureManager& featureManager, const MaybeLog& maybeLogProb );
|
||||
void outputPhrasePair(const ExtractionPhrasePair &phrasePair, float, int, ostream &phraseTableFile, const ScoreFeatureManager &featureManager, const MaybeLog &maybeLog );
|
||||
void outputPhrasePair(const ExtractionPhrasePair &phrasePair, float, int, std::ostream &phraseTableFile, const ScoreFeatureManager &featureManager, const MaybeLog &maybeLog );
|
||||
double computeLexicalTranslation( const PHRASE *phraseSource, const PHRASE *phraseTarget, const ALIGNMENT *alignmentTargetToSource );
|
||||
double computeUnalignedPenalty( const ALIGNMENT *alignmentTargetToSource );
|
||||
set<std::string> functionWordList;
|
||||
std::set<std::string> functionWordList;
|
||||
void loadOrientationPriors(const std::string &fileNamePhraseOrientationPriors, std::vector<float> &orientationClassPriorsL2R, std::vector<float> &orientationClassPriorsR2L);
|
||||
void loadFunctionWords( const string &fileNameFunctionWords );
|
||||
void loadFunctionWords( const std::string &fileNameFunctionWords );
|
||||
double computeUnalignedFWPenalty( const PHRASE *phraseTarget, const ALIGNMENT *alignmentTargetToSource );
|
||||
int calcCrossedNonTerm( const PHRASE *phraseTarget, const ALIGNMENT *alignmentTargetToSource );
|
||||
void printSourcePhrase( const PHRASE *phraseSource, const PHRASE *phraseTarget, const ALIGNMENT *targetToSourceAlignment, ostream &out );
|
||||
void printTargetPhrase( const PHRASE *phraseSource, const PHRASE *phraseTarget, const ALIGNMENT *targetToSourceAlignment, ostream &out );
|
||||
void printSourcePhrase( const PHRASE *phraseSource, const PHRASE *phraseTarget, const ALIGNMENT *targetToSourceAlignment, std::ostream &out );
|
||||
void printTargetPhrase( const PHRASE *phraseSource, const PHRASE *phraseTarget, const ALIGNMENT *targetToSourceAlignment, std::ostream &out );
|
||||
void invertAlignment( const PHRASE *phraseSource, const PHRASE *phraseTarget, const ALIGNMENT *inTargetToSourceAlignment, ALIGNMENT *outSourceToTargetAlignment );
|
||||
|
||||
|
||||
@ -228,7 +224,7 @@ int main(int argc, char* argv[])
|
||||
negLogProb = -1;
|
||||
std::cerr << "using negative log-probabilities" << std::endl;
|
||||
} else if (strcmp(argv[i],"--MinCountHierarchical") == 0) {
|
||||
minCountHierarchical = atof(argv[++i]);
|
||||
minCountHierarchical = Moses::Scan<float>( argv[++i] );
|
||||
std::cerr << "dropping all phrase pairs occurring less than " << minCountHierarchical << " times" << std::endl;
|
||||
minCountHierarchical -= 0.00001; // account for rounding
|
||||
} else if (strcmp(argv[i],"--CrossedNonTerm") == 0) {
|
||||
@ -291,10 +287,9 @@ int main(int argc, char* argv[])
|
||||
std::cerr << "ERROR: could not open extract file " << fileNameExtract << std::endl;
|
||||
exit(1);
|
||||
}
|
||||
istream &extractFileP = extractFile;
|
||||
|
||||
// output file: phrase translation table
|
||||
ostream *phraseTableFile;
|
||||
std::ostream *phraseTableFile;
|
||||
|
||||
if (fileNamePhraseTable == "-") {
|
||||
phraseTableFile = &std::cout;
|
||||
@ -310,7 +305,7 @@ int main(int argc, char* argv[])
|
||||
}
|
||||
|
||||
// loop through all extracted phrase translations
|
||||
string line, lastLine;
|
||||
std::string line, lastLine;
|
||||
lastLine[0] = '\0';
|
||||
ExtractionPhrasePair *phrasePair = NULL;
|
||||
std::vector< ExtractionPhrasePair* > phrasePairsWithSameSource;
|
||||
@ -323,8 +318,7 @@ int main(int argc, char* argv[])
|
||||
float tmpCount=0.0f, tmpPcfgSum=0.0f;
|
||||
|
||||
int i=0;
|
||||
// TODO why read only the 1st line?
|
||||
if ( getline(extractFileP, line) ) {
|
||||
if ( getline(extractFile, line) ) {
|
||||
++i;
|
||||
tmpPhraseSource = new PHRASE();
|
||||
tmpPhraseTarget = new PHRASE();
|
||||
@ -346,7 +340,7 @@ int main(int argc, char* argv[])
|
||||
lastLine = line;
|
||||
}
|
||||
|
||||
while ( getline(extractFileP, line) ) {
|
||||
while ( getline(extractFile, line) ) {
|
||||
|
||||
if ( ++i % 100000 == 0 ) {
|
||||
std::cerr << "." << std::flush;
|
||||
@ -503,7 +497,8 @@ void processLine( std::string line,
|
||||
phraseTarget->clear();
|
||||
targetToSourceAlignment->clear();
|
||||
|
||||
std::vector<std::string> token = tokenize( line.c_str() );
|
||||
std::vector<std::string> token;
|
||||
Moses::Tokenize( token, line );
|
||||
int item = 1;
|
||||
for ( size_t j=0; j<token.size(); ++j ) {
|
||||
if (token[j] == "|||") {
|
||||
@ -534,7 +529,7 @@ void processLine( std::string line,
|
||||
} else if (item + (includeSentenceIdFlag?-1:0) == 4) { // count
|
||||
sscanf(token[j].c_str(), "%f", &count);
|
||||
} else if (item + (includeSentenceIdFlag?-1:0) == 5) { // target syntax PCFG score
|
||||
float pcfgScore = std::atof(token[j].c_str());
|
||||
float pcfgScore = Moses::Scan<float>( token[j] );
|
||||
pcfgSum = pcfgScore * count;
|
||||
}
|
||||
}
|
||||
@ -548,17 +543,17 @@ void processLine( std::string line,
|
||||
count = 1.0;
|
||||
}
|
||||
if (item < 3 || item > (includeSentenceIdFlag?7:6)) {
|
||||
std::cerr << "ERROR: faulty line " << lineID << ": " << line << endl;
|
||||
std::cerr << "ERROR: faulty line " << lineID << ": " << line << std::endl;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
void writeCountOfCounts( const string &fileNameCountOfCounts )
|
||||
void writeCountOfCounts( const std::string &fileNameCountOfCounts )
|
||||
{
|
||||
// open file
|
||||
Moses::OutputFileStream countOfCountsFile;
|
||||
bool success = countOfCountsFile.Open(fileNameCountOfCounts.c_str());
|
||||
bool success = countOfCountsFile.Open(fileNameCountOfCounts);
|
||||
if (!success) {
|
||||
std::cerr << "ERROR: could not open count-of-counts file "
|
||||
<< fileNameCountOfCounts << std::endl;
|
||||
@ -583,7 +578,7 @@ void writeLeftHandSideLabelCounts( const boost::unordered_map<std::string,float>
|
||||
{
|
||||
// open file
|
||||
Moses::OutputFileStream leftHandSideSourceLabelCounts;
|
||||
bool success = leftHandSideSourceLabelCounts.Open(fileNameLeftHandSideSourceLabelCounts.c_str());
|
||||
bool success = leftHandSideSourceLabelCounts.Open(fileNameLeftHandSideSourceLabelCounts);
|
||||
if (!success) {
|
||||
std::cerr << "ERROR: could not open left-hand side label counts file "
|
||||
<< fileNameLeftHandSideSourceLabelCounts << std::endl;
|
||||
@ -600,7 +595,7 @@ void writeLeftHandSideLabelCounts( const boost::unordered_map<std::string,float>
|
||||
|
||||
// open file
|
||||
Moses::OutputFileStream leftHandSideTargetSourceLabelCounts;
|
||||
success = leftHandSideTargetSourceLabelCounts.Open(fileNameLeftHandSideTargetSourceLabelCounts.c_str());
|
||||
success = leftHandSideTargetSourceLabelCounts.Open(fileNameLeftHandSideTargetSourceLabelCounts);
|
||||
if (!success) {
|
||||
std::cerr << "ERROR: could not open left-hand side label joint counts file "
|
||||
<< fileNameLeftHandSideTargetSourceLabelCounts << std::endl;
|
||||
@ -624,7 +619,7 @@ void writeLabelSet( const std::set<std::string> &labelSet, const std::string &fi
|
||||
{
|
||||
// open file
|
||||
Moses::OutputFileStream out;
|
||||
bool success = out.Open(fileName.c_str());
|
||||
bool success = out.Open(fileName);
|
||||
if (!success) {
|
||||
std::cerr << "ERROR: could not open file "
|
||||
<< fileName << " for writing" << std::endl;
|
||||
@ -640,7 +635,7 @@ void writeLabelSet( const std::set<std::string> &labelSet, const std::string &fi
|
||||
}
|
||||
|
||||
|
||||
void processPhrasePairs( std::vector< ExtractionPhrasePair* > &phrasePairsWithSameSource, ostream &phraseTableFile,
|
||||
void processPhrasePairs( std::vector< ExtractionPhrasePair* > &phrasePairsWithSameSource, std::ostream &phraseTableFile,
|
||||
const ScoreFeatureManager& featureManager, const MaybeLog& maybeLogProb )
|
||||
{
|
||||
if (phrasePairsWithSameSource.size() == 0) {
|
||||
@ -668,7 +663,7 @@ void processPhrasePairs( std::vector< ExtractionPhrasePair* > &phrasePairsWithSa
|
||||
|
||||
void outputPhrasePair(const ExtractionPhrasePair &phrasePair,
|
||||
float totalCount, int distinctCount,
|
||||
ostream &phraseTableFile,
|
||||
std::ostream &phraseTableFile,
|
||||
const ScoreFeatureManager& featureManager,
|
||||
const MaybeLog& maybeLogProb )
|
||||
{
|
||||
@ -677,7 +672,7 @@ void outputPhrasePair(const ExtractionPhrasePair &phrasePair,
|
||||
const ALIGNMENT *bestAlignmentT2S = phrasePair.FindBestAlignmentTargetToSource();
|
||||
float count = phrasePair.GetCount();
|
||||
|
||||
map< string, float > domainCount;
|
||||
std::map< std::string, float > domainCount;
|
||||
|
||||
// collect count of count statistics
|
||||
if (goodTuringFlag || kneserNeyFlag) {
|
||||
@ -796,13 +791,13 @@ void outputPhrasePair(const ExtractionPhrasePair &phrasePair,
|
||||
// extra features
|
||||
ScoreFeatureContext context(phrasePair, maybeLogProb);
|
||||
std::vector<float> extraDense;
|
||||
map<string,float> extraSparse;
|
||||
std::map<std::string,float> extraSparse;
|
||||
featureManager.addFeatures(context, extraDense, extraSparse);
|
||||
for (size_t i = 0; i < extraDense.size(); ++i) {
|
||||
phraseTableFile << " " << extraDense[i];
|
||||
}
|
||||
|
||||
for (map<string,float>::const_iterator i = extraSparse.begin();
|
||||
for (std::map<std::string,float>::const_iterator i = extraSparse.begin();
|
||||
i != extraSparse.end(); ++i) {
|
||||
phraseTableFile << " " << i->first << " " << i->second;
|
||||
}
|
||||
@ -882,14 +877,14 @@ void outputPhrasePair(const ExtractionPhrasePair &phrasePair,
|
||||
}
|
||||
|
||||
if (spanLength && !inverseFlag) {
|
||||
string propValue = phrasePair.CollectAllPropertyValues("SpanLength");
|
||||
std::string propValue = phrasePair.CollectAllPropertyValues("SpanLength");
|
||||
if (!propValue.empty()) {
|
||||
phraseTableFile << " {{SpanLength " << propValue << "}}";
|
||||
}
|
||||
}
|
||||
|
||||
if (nonTermContext && !inverseFlag) {
|
||||
string propValue = phrasePair.CollectAllPropertyValues("NonTermContext");
|
||||
std::string propValue = phrasePair.CollectAllPropertyValues("NonTermContext");
|
||||
if (!propValue.empty()) {
|
||||
phraseTableFile << " {{NonTermContext " << propValue << "}}";
|
||||
}
|
||||
@ -907,8 +902,7 @@ void loadOrientationPriors(const std::string &fileNamePhraseOrientationPriors,
|
||||
assert(orientationClassPriorsL2R.size()==4 && orientationClassPriorsR2L.size()==4); // mono swap dleft dright
|
||||
|
||||
std::cerr << "Loading phrase orientation priors from " << fileNamePhraseOrientationPriors;
|
||||
ifstream inFile;
|
||||
inFile.open(fileNamePhraseOrientationPriors.c_str());
|
||||
Moses::InputFileStream inFile(fileNamePhraseOrientationPriors);
|
||||
if (inFile.fail()) {
|
||||
std::cerr << " - ERROR: could not open file" << std::endl;
|
||||
exit(1);
|
||||
@ -919,7 +913,7 @@ void loadOrientationPriors(const std::string &fileNamePhraseOrientationPriors,
|
||||
float l2rSum = 0;
|
||||
float r2lSum = 0;
|
||||
while (getline(inFile, line)) {
|
||||
istringstream tokenizer(line);
|
||||
std::istringstream tokenizer(line);
|
||||
std::string key;
|
||||
tokenizer >> key;
|
||||
|
||||
@ -983,7 +977,7 @@ void loadOrientationPriors(const std::string &fileNamePhraseOrientationPriors,
|
||||
}
|
||||
|
||||
std::cerr << " - read " << linesRead << " lines from orientation priors file" << std::endl;
|
||||
inFile.close();
|
||||
inFile.Close();
|
||||
}
|
||||
|
||||
|
||||
@ -1038,7 +1032,7 @@ double computeUnalignedPenalty( const ALIGNMENT *alignmentTargetToSource )
|
||||
double unaligned = 1.0;
|
||||
// only checking target words - source words are caught when computing inverse
|
||||
for(size_t ti=0; ti<alignmentTargetToSource->size(); ++ti) {
|
||||
const set< size_t > & srcIndices = alignmentTargetToSource->at(ti);
|
||||
const std::set< size_t > & srcIndices = alignmentTargetToSource->at(ti);
|
||||
if (srcIndices.empty()) {
|
||||
unaligned *= 2.718;
|
||||
}
|
||||
@ -1053,7 +1047,7 @@ double computeUnalignedFWPenalty( const PHRASE *phraseTarget, const ALIGNMENT *a
|
||||
double unaligned = 1.0;
|
||||
// only checking target words - source words are caught when computing inverse
|
||||
for(size_t ti=0; ti<alignmentTargetToSource->size(); ++ti) {
|
||||
const set< size_t > & srcIndices = alignmentTargetToSource->at(ti);
|
||||
const std::set< size_t > & srcIndices = alignmentTargetToSource->at(ti);
|
||||
if (srcIndices.empty() && functionWordList.find( vcbT.getWord( phraseTarget->at(ti) ) ) != functionWordList.end()) {
|
||||
unaligned *= 2.718;
|
||||
}
|
||||
@ -1061,26 +1055,25 @@ double computeUnalignedFWPenalty( const PHRASE *phraseTarget, const ALIGNMENT *a
|
||||
return unaligned;
|
||||
}
|
||||
|
||||
void loadFunctionWords( const string &fileName )
|
||||
void loadFunctionWords( const std::string &fileName )
|
||||
{
|
||||
std::cerr << "Loading function word list from " << fileName;
|
||||
ifstream inFile;
|
||||
inFile.open(fileName.c_str());
|
||||
Moses::InputFileStream inFile(fileName);
|
||||
if (inFile.fail()) {
|
||||
std::cerr << " - ERROR: could not open file" << std::endl;
|
||||
exit(1);
|
||||
}
|
||||
istream *inFileP = &inFile;
|
||||
|
||||
string line;
|
||||
while(getline(*inFileP, line)) {
|
||||
std::vector<string> token = tokenize( line.c_str() );
|
||||
std::string line;
|
||||
while(getline(inFile, line)) {
|
||||
std::vector<std::string> token;
|
||||
Moses::Tokenize( token, line );
|
||||
if (token.size() > 0)
|
||||
functionWordList.insert( token[0] );
|
||||
}
|
||||
|
||||
std::cerr << " - read " << functionWordList.size() << " function words" << std::endl;
|
||||
inFile.close();
|
||||
inFile.Close();
|
||||
}
|
||||
|
||||
|
||||
@ -1091,14 +1084,14 @@ double computeLexicalTranslation( const PHRASE *phraseSource, const PHRASE *phra
|
||||
int null = vcbS.getWordID("NULL");
|
||||
// all target words have to be explained
|
||||
for(size_t ti=0; ti<alignmentTargetToSource->size(); ti++) {
|
||||
const set< size_t > & srcIndices = alignmentTargetToSource->at(ti);
|
||||
const std::set< size_t > & srcIndices = alignmentTargetToSource->at(ti);
|
||||
if (srcIndices.empty()) {
|
||||
// explain unaligned word by NULL
|
||||
lexScore *= lexTable.permissiveLookup( null, phraseTarget->at(ti) );
|
||||
} else {
|
||||
// go through all the aligned words to compute average
|
||||
double thisWordScore = 0;
|
||||
for (set< size_t >::const_iterator p(srcIndices.begin()); p != srcIndices.end(); ++p) {
|
||||
for (std::set< size_t >::const_iterator p(srcIndices.begin()); p != srcIndices.end(); ++p) {
|
||||
thisWordScore += lexTable.permissiveLookup( phraseSource->at(*p), phraseTarget->at(ti) );
|
||||
}
|
||||
lexScore *= thisWordScore / (double)srcIndices.size();
|
||||
@ -1108,24 +1101,23 @@ double computeLexicalTranslation( const PHRASE *phraseSource, const PHRASE *phra
|
||||
}
|
||||
|
||||
|
||||
void LexicalTable::load( const string &fileName )
|
||||
void LexicalTable::load( const std::string &fileName )
|
||||
{
|
||||
std::cerr << "Loading lexical translation table from " << fileName;
|
||||
ifstream inFile;
|
||||
inFile.open(fileName.c_str());
|
||||
Moses::InputFileStream inFile(fileName);
|
||||
if (inFile.fail()) {
|
||||
std::cerr << " - ERROR: could not open file" << std::endl;
|
||||
exit(1);
|
||||
}
|
||||
istream *inFileP = &inFile;
|
||||
|
||||
string line;
|
||||
std::string line;
|
||||
int i=0;
|
||||
while(getline(*inFileP, line)) {
|
||||
while(getline(inFile, line)) {
|
||||
i++;
|
||||
if (i%100000 == 0) std::cerr << "." << flush;
|
||||
if (i%100000 == 0) std::cerr << "." << std::flush;
|
||||
|
||||
std::vector<string> token = tokenize( line.c_str() );
|
||||
std::vector<std::string> token;
|
||||
Moses::Tokenize( token, line );
|
||||
if (token.size() != 3) {
|
||||
std::cerr << "line " << i << " in " << fileName
|
||||
<< " has wrong number of tokens, skipping:" << std::endl
|
||||
@ -1133,7 +1125,7 @@ void LexicalTable::load( const string &fileName )
|
||||
continue;
|
||||
}
|
||||
|
||||
double prob = atof( token[2].c_str() );
|
||||
double prob = Moses::Scan<double>( token[2] );
|
||||
WORD_ID wordT = vcbT.storeIfNew( token[0] );
|
||||
WORD_ID wordS = vcbS.storeIfNew( token[1] );
|
||||
ltable[ wordS ][ wordT ] = prob;
|
||||
@ -1143,7 +1135,7 @@ void LexicalTable::load( const string &fileName )
|
||||
|
||||
|
||||
void printSourcePhrase(const PHRASE *phraseSource, const PHRASE *phraseTarget,
|
||||
const ALIGNMENT *targetToSourceAlignment, ostream &out)
|
||||
const ALIGNMENT *targetToSourceAlignment, std::ostream &out)
|
||||
{
|
||||
// get corresponding target non-terminal and output pair
|
||||
ALIGNMENT *sourceToTargetAlignment = new ALIGNMENT();
|
||||
@ -1175,7 +1167,7 @@ void printSourcePhrase(const PHRASE *phraseSource, const PHRASE *phraseTarget,
|
||||
|
||||
|
||||
void printTargetPhrase(const PHRASE *phraseSource, const PHRASE *phraseTarget,
|
||||
const ALIGNMENT *targetToSourceAlignment, ostream &out)
|
||||
const ALIGNMENT *targetToSourceAlignment, std::ostream &out)
|
||||
{
|
||||
// output target symbols, except root, in rule table format
|
||||
for (std::size_t i = 0; i < phraseTarget->size()-1; ++i) {
|
||||
|
Loading…
Reference in New Issue
Block a user