Fix bug affecting Good-Turing discounting: repeated phrase pairs were always
contributing a count of 1 because PhraseAlignment::addToCount() was looking
for counts in the fifth column, not the fourth.


git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3775 1f5c12ca-751b-0410-a591-d2e778427230
This commit is contained in:
pjwilliams 2010-12-14 16:31:53 +00:00
parent 41c5b3a1c2
commit 627d8edf8e
3 changed files with 7 additions and 22 deletions

View File

@ -26,24 +26,6 @@ PhraseAlignment::PhraseAlignment()
,targetPhraseId(999999)
{}
// Add the count carried by a repeated extract line to this phrase pair.
//
// The line is split with tokenize() and scanned for "|||" field separators.
// Every token in the count field -- the fourth field, i.e. the one following
// the third separator -- that parses as a number is added to `count`.
// If the line has fewer than three separators there is no count field, and
// the line contributes a count of one.
void PhraseAlignment::addToCount( char line[] )
{
  // zero-based index of the "|||"-separated field that holds the count
  const int countField = 3;

  vector< string > token = tokenize( line );
  int item = 0;
  for (size_t j=0; j<token.size(); j++)
  {
    if (token[j] == "|||")
    {
      // a separator only advances the field index; it is never a count
      item++;
      continue;
    }
    if (item == countField)
    {
      float addCount = 0.0f;
      // accumulate only when the token really parsed as a number, so a
      // malformed token can never add an indeterminate value
      if (sscanf(token[j].c_str(), "%f", &addCount) == 1)
        count += addCount;
    }
  }
  if (item < countField) // no specified counts -> counts as one
    count += 1.0;
}
// read in a phrase pair and store it
void PhraseAlignment::create( char line[], int lineID )
{

View File

@ -25,7 +25,6 @@ public:
PhraseAlignment();
void create( char*, int );
void addToCount( char* );
void clear();
bool equals( const PhraseAlignment& );
bool match( const PhraseAlignment& );

View File

@ -167,6 +167,7 @@ int main(int argc, char* argv[])
// loop through all extracted phrase translations
int lastSource = -1;
float lastCount = 0.0f;
vector< PhraseAlignment > phrasePairsWithSameF;
int i=0;
char line[LINE_MAX_LENGTH],lastLine[LINE_MAX_LENGTH];
@ -179,9 +180,9 @@ int main(int argc, char* argv[])
if (extractFileP.eof()) break;
// identical to last line? just add count
if (lastSource > 0 && strcmp(line,lastLine) == 0)
if (strcmp(line,lastLine) == 0)
{
lastPhrasePair->addToCount( line );
lastPhrasePair->count += lastCount;
continue;
}
strcpy( lastLine, line );
@ -189,6 +190,7 @@ int main(int argc, char* argv[])
// create new phrase pair
PhraseAlignment phrasePair;
phrasePair.create( line, i );
lastCount = phrasePair.count;
// only differs in count? just add count
if (lastPhrasePair != NULL && lastPhrasePair->equals( phrasePair ))
@ -236,6 +238,7 @@ void computeCountOfCounts( char* fileNameExtract, int maxLines )
int lineNum = 0;
char line[LINE_MAX_LENGTH],lastLine[LINE_MAX_LENGTH];
lastLine[0] = '\0';
float lastCount = 0.0f;
PhraseAlignment *lastPhrasePair = NULL;
while(true) {
if (extractFileP.eof()) break;
@ -247,7 +250,7 @@ void computeCountOfCounts( char* fileNameExtract, int maxLines )
// identical to last line? just add count
if (strcmp(line,lastLine) == 0)
{
lastPhrasePair->addToCount( line );
lastPhrasePair->count += lastCount;
continue;
}
strcpy( lastLine, line );
@ -255,6 +258,7 @@ void computeCountOfCounts( char* fileNameExtract, int maxLines )
// create new phrase pair
PhraseAlignment *phrasePair = new PhraseAlignment();
phrasePair->create( line, lineNum );
lastCount = phrasePair->count;
if (lineNum == 1)
{