Add --RuleCount option to score; consolidate now passes the rule count through when present.

git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@4032 1f5c12ca-751b-0410-a591-d2e778427230
This commit is contained in:
pjwilliams 2011-06-24 16:24:33 +00:00
parent 0484d43a22
commit 108dc4d12e
3 changed files with 20 additions and 21 deletions

View File

@ -16,7 +16,7 @@ extract-rules: tables-core.o SentenceAlignment.o SentenceAlignmentWithSyntax.o S
score: tables-core.o AlignmentPhrase.o score.o PhraseAlignment.o InputFileStream.o
$(CXX) $^ -lz -o score
consolidate: consolidate.o
consolidate: consolidate.o tables-core.o
$(CXX) $^ -o consolidate
consolidate-direct: consolidate-direct.o InputFileStream.o

View File

@ -25,6 +25,7 @@
#include <cstdlib>
#include <cstring>
#include "tables-core.h"
#include "SafeGetline.h"
#define LINE_MAX_LENGTH 10000
@ -118,23 +119,6 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
// indirect: source target probabilities
// consistency checks
/*
size_t expectedSize = (hierarchicalFlag ? 5 : 4);
if (itemDirect.size() != expectedSize)
{
cerr << "ERROR: expected " << expectedSize << " items in file "
<< fileNameDirect << ", line " << i << endl;
exit(1);
}
if (itemIndirect.size() != 4)
{
cerr << "ERROR: expected 4 items in file "
<< fileNameIndirect << ", line " << i << endl;
exit(1);
}
*/
if (itemDirect[0].compare( itemIndirect[0] ) != 0) {
cerr << "ERROR: target phrase does not match in line " << i << ": '"
<< itemDirect[0] << "' != '" << itemIndirect[0] << "'" << endl;
@ -164,8 +148,15 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
fileConsolidated << " ||| " << itemDirect[3];
// counts, for debugging
fileConsolidated << "||| " << itemIndirect[4] << " " // indirect
<< itemDirect[4]; // direct
vector<string> directCounts = tokenize(itemDirect[4].c_str());
vector<string> indirectCounts = tokenize(itemIndirect[4].c_str());
fileConsolidated << "||| " << indirectCounts[0] << " " << directCounts[0];
// output rule count if present in either file
if (directCounts.size() > 1) {
fileConsolidated << " " << directCounts[1];
} else if (indirectCounts.size() > 1) {
fileConsolidated << " " << indirectCounts[1];
}
fileConsolidated << endl;
}

View File

@ -74,6 +74,7 @@ bool lexFlag = true;
int countOfCounts[GT_MAX+1];
float discountFactor[GT_MAX+1];
int maxLinesGTDiscount = -1;
bool ruleCountFlag = false;
int main(int argc, char* argv[])
{
@ -81,7 +82,7 @@ int main(int argc, char* argv[])
<< "scoring methods for extracted rules\n";
if (argc < 4) {
cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--WordAlignment [--MaxLinesGTDiscount num] file]\n";
cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--WordAlignment] [--MaxLinesGTDiscount num] [--RuleCount]\n";
exit(1);
}
char* fileNameExtract = argv[1];
@ -115,6 +116,9 @@ int main(int argc, char* argv[])
++i;
maxLinesGTDiscount = atoi(argv[i]);
cerr << "maxLinesGTDiscount=" << maxLinesGTDiscount << endl;
} else if (strcmp(argv[i],"--RuleCount") == 0) {
ruleCountFlag = true;
cerr << "outputting rule counts" << endl;
} else {
cerr << "ERROR: unknown option " << argv[i] << endl;
exit(1);
@ -345,6 +349,7 @@ void outputPhrasePair( vector< PhraseAlignment* > &phrasePair, float totalCount,
for(size_t i=0; i<phrasePair.size(); i++) {
count += phrasePair[i]->count;
}
const float originalCount = count;
const PHRASE &phraseS = phrasePair[0]->GetSource();
const PHRASE &phraseT = phrasePair[0]->GetTarget();
@ -418,6 +423,9 @@ void outputPhrasePair( vector< PhraseAlignment* > &phrasePair, float totalCount,
}
phraseTableFile << " ||| " << totalCount;
if (ruleCountFlag) {
phraseTableFile << " " << originalCount;
}
phraseTableFile << endl;
}