mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-09-20 15:48:05 +03:00
Fix bug affecting Good-Turing discounting: repeated phrase pairs were always
contributing a count of 1 because PhraseAlignment::addToCount() was looking for
counts in the fifth column, not the fourth.

git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3775 1f5c12ca-751b-0410-a591-d2e778427230
This commit is contained in:
parent
41c5b3a1c2
commit
627d8edf8e
@ -26,24 +26,6 @@ PhraseAlignment::PhraseAlignment()
|
||||
,targetPhraseId(999999)
|
||||
{}
|
||||
|
||||
void PhraseAlignment::addToCount( char line[] )
|
||||
{
|
||||
vector< string > token = tokenize( line );
|
||||
int item = 0;
|
||||
for (int j=0; j<token.size(); j++)
|
||||
{
|
||||
if (token[j] == "|||") item++;
|
||||
if (item == 4)
|
||||
{
|
||||
float addCount;
|
||||
sscanf(token[j].c_str(), "%f", &addCount);
|
||||
count += addCount;
|
||||
}
|
||||
}
|
||||
if (item < 4) // no specified counts -> counts as one
|
||||
count += 1.0;
|
||||
}
|
||||
|
||||
// read in a phrase pair and store it
|
||||
void PhraseAlignment::create( char line[], int lineID )
|
||||
{
|
||||
|
@ -25,7 +25,6 @@ public:
|
||||
PhraseAlignment();
|
||||
|
||||
void create( char*, int );
|
||||
void addToCount( char* );
|
||||
void clear();
|
||||
bool equals( const PhraseAlignment& );
|
||||
bool match( const PhraseAlignment& );
|
||||
|
@ -167,6 +167,7 @@ int main(int argc, char* argv[])
|
||||
|
||||
// loop through all extracted phrase translations
|
||||
int lastSource = -1;
|
||||
float lastCount = 0.0f;
|
||||
vector< PhraseAlignment > phrasePairsWithSameF;
|
||||
int i=0;
|
||||
char line[LINE_MAX_LENGTH],lastLine[LINE_MAX_LENGTH];
|
||||
@ -179,9 +180,9 @@ int main(int argc, char* argv[])
|
||||
if (extractFileP.eof()) break;
|
||||
|
||||
// identical to last line? just add count
|
||||
if (lastSource > 0 && strcmp(line,lastLine) == 0)
|
||||
if (strcmp(line,lastLine) == 0)
|
||||
{
|
||||
lastPhrasePair->addToCount( line );
|
||||
lastPhrasePair->count += lastCount;
|
||||
continue;
|
||||
}
|
||||
strcpy( lastLine, line );
|
||||
@ -189,6 +190,7 @@ int main(int argc, char* argv[])
|
||||
// create new phrase pair
|
||||
PhraseAlignment phrasePair;
|
||||
phrasePair.create( line, i );
|
||||
lastCount = phrasePair.count;
|
||||
|
||||
// only differs in count? just add count
|
||||
if (lastPhrasePair != NULL && lastPhrasePair->equals( phrasePair ))
|
||||
@ -236,6 +238,7 @@ void computeCountOfCounts( char* fileNameExtract, int maxLines )
|
||||
int lineNum = 0;
|
||||
char line[LINE_MAX_LENGTH],lastLine[LINE_MAX_LENGTH];
|
||||
lastLine[0] = '\0';
|
||||
float lastCount = 0.0f;
|
||||
PhraseAlignment *lastPhrasePair = NULL;
|
||||
while(true) {
|
||||
if (extractFileP.eof()) break;
|
||||
@ -247,7 +250,7 @@ void computeCountOfCounts( char* fileNameExtract, int maxLines )
|
||||
// identical to last line? just add count
|
||||
if (strcmp(line,lastLine) == 0)
|
||||
{
|
||||
lastPhrasePair->addToCount( line );
|
||||
lastPhrasePair->count += lastCount;
|
||||
continue;
|
||||
}
|
||||
strcpy( lastLine, line );
|
||||
@ -255,6 +258,7 @@ void computeCountOfCounts( char* fileNameExtract, int maxLines )
|
||||
// create new phrase pair
|
||||
PhraseAlignment *phrasePair = new PhraseAlignment();
|
||||
phrasePair->create( line, lineNum );
|
||||
lastCount = phrasePair->count;
|
||||
|
||||
if (lineNum == 1)
|
||||
{
|
||||
|
Loading…
Reference in New Issue
Block a user