mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-26 21:42:19 +03:00
program to compute counts for phrase pairs
git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@2647 1f5c12ca-751b-0410-a591-d2e778427230
This commit is contained in:
parent
34d9feccc8
commit
3ad833d136
347
scripts/training/phrase-extract/statistics.cpp
Normal file
347
scripts/training/phrase-extract/statistics.cpp
Normal file
@ -0,0 +1,347 @@
|
||||
// $Id$
|
||||
// vim:tabstop=2
|
||||
|
||||
#include <sstream>
|
||||
#include <cstdio>
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <stdlib.h>
|
||||
#include <assert.h>
|
||||
#include <time.h>
|
||||
#include "AlignmentPhrase.h"
|
||||
#include "tables-core.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
// SAFE_GETLINE(_IS, _LINE, _SIZE, _DELIM)
//
// Reads one _DELIM-terminated line from the input stream _IS into the char
// buffer _LINE, whose capacity is _SIZE bytes (including the terminating NUL).
//
// * If the buffer fills up before the delimiter is found, istream::getline
//   sets failbit (and nothing else); only in that case is the line really too
//   long, so an error is printed and the program exits with status 1.
//   A line that fits exactly (_SIZE-2 characters plus the delimiter) is
//   accepted, even though gcount() is _SIZE-1 for it as well.
// * eofbit / badbit are left untouched so that the caller can inspect them.
//
// The body is wrapped in do { } while (0) so that the macro behaves like a
// single statement (safe inside un-braced if/else), and every argument is
// parenthesised so that expressions can be passed.
#define SAFE_GETLINE(_IS, _LINE, _SIZE, _DELIM) do { \
    (_IS).getline((_LINE), (_SIZE), (_DELIM)); \
    if ((_IS).fail() && !(_IS).bad() && !(_IS).eof()) { \
      /* failbit alone: no delimiter seen within the buffer */ \
      (_IS).clear(); \
      if ((_IS).gcount() == (_SIZE)-1) { \
        std::cerr << "Line too long! Buffer overflow. Delete lines >=" \
                  << (_SIZE) << " chars or raise LINE_MAX_LENGTH in phrase-extract/statistics.cpp" \
                  << std::endl; \
        exit(1); \
      } \
    } \
  } while (0)

// Capacity, in bytes, of the line buffers handed to SAFE_GETLINE.
#define LINE_MAX_LENGTH 10000
|
||||
|
||||
// One line of the extract file: a phrase pair plus its word alignment.
class PhraseAlignment {
public:
  // Ids of the two phrases as assigned by create(); -1 until create() has
  // stored the phrases, so that reading them never yields an indeterminate
  // value.
  int english, foreign;

  // alignedToE[e] lists the foreign positions f of every alignment point
  // "f-e" accepted by create(); alignedToF[f] lists the matching e positions.
  std::vector< std::vector<size_t> > alignedToE;
  std::vector< std::vector<size_t> > alignedToF;

  PhraseAlignment() : english(-1), foreign(-1) {}

  // Parses one line (second argument: line number used in warnings).
  // Returns true when the line is a real phrase pair.
  bool create( char*, int );

  // Drops all alignment information.
  void clear();

  // True when both objects describe the same phrase pair with the same
  // alignment points.
  bool equals( const PhraseAlignment& );
};
|
||||
|
||||
class LexicalTable {
|
||||
public:
|
||||
map< WORD_ID, map< WORD_ID, double > > ltable;
|
||||
void load( char[] );
|
||||
};
|
||||
|
||||
vector<string> tokenize( char [] );
|
||||
|
||||
void processPhrasePairs( vector< PhraseAlignment > & );
|
||||
|
||||
ofstream phraseTableFile;
|
||||
|
||||
Vocabulary vcbE;
|
||||
Vocabulary vcbF;
|
||||
LexicalTable lexTable;
|
||||
PhraseTable phraseTableE;
|
||||
PhraseTable phraseTableF;
|
||||
bool inverseFlag;
|
||||
int phrasePairBase = 0; // only used for "proper" conditioning
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
cerr << "PhraseStatistics v1.1 written by Nicola Bertoldi\n"
|
||||
<< "modifying PhraseScore v1.4 written by Philipp Koehn\n"
|
||||
<< "It computes statistics for extracted phrase pairs\n"
|
||||
<< "if (direct):\n"
|
||||
<< "src_phrase ||| trg_phrase || freq(src_phrase, trg_phrase) freq(src_phrase) length(src_phrase) length(trg_phrase)\n"
|
||||
<< "if (inverse)\n"
|
||||
<< "src_phrase ||| trg_phrase || freq(src_phrase, trg_phrase) freq(trg_phrase) length(src_phrase) length(trg_phrase)\n";
|
||||
time_t starttime = time(NULL);
|
||||
|
||||
if (argc != 4 && argc != 5) {
|
||||
cerr << "syntax: statistics extract lex phrase-table [inverse]\n";
|
||||
exit(1);
|
||||
}
|
||||
char* &fileNameExtract = argv[1];
|
||||
char* &fileNameLex = argv[2];
|
||||
char* &fileNamePhraseTable = argv[3];
|
||||
inverseFlag = false;
|
||||
if (argc > 4) {
|
||||
inverseFlag = true;
|
||||
cerr << "using inverse mode\n";
|
||||
}
|
||||
|
||||
// lexical translation table
|
||||
lexTable.load( fileNameLex );
|
||||
|
||||
// sorted phrase extraction file
|
||||
ifstream extractFile;
|
||||
|
||||
extractFile.open(fileNameExtract);
|
||||
if (extractFile.fail()) {
|
||||
cerr << "ERROR: could not open extract file " << fileNameExtract << endl;
|
||||
exit(1);
|
||||
}
|
||||
istream &extractFileP = extractFile;
|
||||
|
||||
// output file: phrase translation table
|
||||
phraseTableFile.open(fileNamePhraseTable);
|
||||
if (phraseTableFile.fail()) {
|
||||
cerr << "ERROR: could not open file phrase table file "
|
||||
<< fileNamePhraseTable << endl;
|
||||
exit(1);
|
||||
}
|
||||
|
||||
// loop through all extracted phrase translations
|
||||
int lastForeign = -1;
|
||||
vector< PhraseAlignment > phrasePairsWithSameF;
|
||||
int i=0;
|
||||
int fileCount = 0;
|
||||
while(true) {
|
||||
if (extractFileP.eof()) break;
|
||||
if (++i % 100000 == 0) cerr << "." << flush;
|
||||
char line[LINE_MAX_LENGTH];
|
||||
SAFE_GETLINE((extractFileP), line, LINE_MAX_LENGTH, '\n');
|
||||
// if (fileCount>0)
|
||||
if (extractFileP.eof())
|
||||
break;
|
||||
PhraseAlignment phrasePair;
|
||||
bool isPhrasePair = phrasePair.create( line, i );
|
||||
if (lastForeign >= 0 && lastForeign != phrasePair.foreign) {
|
||||
processPhrasePairs( phrasePairsWithSameF );
|
||||
for(int j=0;j<phrasePairsWithSameF.size();j++)
|
||||
phrasePairsWithSameF[j].clear();
|
||||
phrasePairsWithSameF.clear();
|
||||
phraseTableE.clear();
|
||||
phraseTableF.clear();
|
||||
phrasePair.clear(); // process line again, since phrase tables flushed
|
||||
phrasePair.create( line, i );
|
||||
phrasePairBase = 0;
|
||||
}
|
||||
lastForeign = phrasePair.foreign;
|
||||
if (isPhrasePair)
|
||||
phrasePairsWithSameF.push_back( phrasePair );
|
||||
else
|
||||
phrasePairBase++;
|
||||
}
|
||||
processPhrasePairs( phrasePairsWithSameF );
|
||||
phraseTableFile.close();
|
||||
}
|
||||
|
||||
void processPhrasePairs( vector< PhraseAlignment > &phrasePair ) {
|
||||
if (phrasePair.size() == 0) return;
|
||||
map<int, int> countE;
|
||||
map<int, int> alignmentE;
|
||||
int totalCount = 0;
|
||||
int currentCount = 0;
|
||||
int maxSameCount = 0;
|
||||
int maxSame = -1;
|
||||
int old = -1;
|
||||
for(int i=0;i<phrasePair.size();i++) {
|
||||
if (i>0) {
|
||||
if (phrasePair[old].english == phrasePair[i].english) {
|
||||
if (! phrasePair[i].equals( phrasePair[old] )) {
|
||||
if (currentCount > maxSameCount) {
|
||||
maxSameCount = currentCount;
|
||||
maxSame = i-1;
|
||||
}
|
||||
currentCount = 0;
|
||||
}
|
||||
}
|
||||
else {
|
||||
// wrap up old E
|
||||
if (currentCount > maxSameCount) {
|
||||
maxSameCount = currentCount;
|
||||
maxSame = i-1;
|
||||
}
|
||||
|
||||
alignmentE[ phrasePair[old].english ] = maxSame;
|
||||
// if (maxSameCount != totalCount)
|
||||
// cout << "max count is " << maxSameCount << "/" << totalCount << endl;
|
||||
|
||||
// get ready for new E
|
||||
totalCount = 0;
|
||||
currentCount = 0;
|
||||
maxSameCount = 0;
|
||||
maxSame = -1;
|
||||
}
|
||||
}
|
||||
countE[ phrasePair[i].english ]++;
|
||||
old = i;
|
||||
currentCount++;
|
||||
totalCount++;
|
||||
}
|
||||
|
||||
// wrap up old E
|
||||
if (currentCount > maxSameCount) {
|
||||
maxSameCount = currentCount;
|
||||
maxSame = phrasePair.size()-1;
|
||||
}
|
||||
alignmentE[ phrasePair[old].english ] = maxSame;
|
||||
// if (maxSameCount != totalCount)
|
||||
// cout << "max count is " << maxSameCount << "/" << totalCount << endl;
|
||||
|
||||
// output table
|
||||
typedef map< int, int >::iterator II;
|
||||
PHRASE phraseF = phraseTableF.getPhrase( phrasePair[0].foreign );
|
||||
size_t index = 0;
|
||||
for(II i = countE.begin(); i != countE.end(); i++) {
|
||||
//cout << "\tp( " << i->first << " | " << phrasePair[0].foreign << " ; " << phraseF.size() << " ) = ...\n";
|
||||
//cerr << index << endl;
|
||||
|
||||
// foreign phrase (unless inverse)
|
||||
if (! inverseFlag) {
|
||||
for(int j=0;j<phraseF.size();j++)
|
||||
{
|
||||
phraseTableFile << vcbF.getWord( phraseF[j] );
|
||||
phraseTableFile << " ";
|
||||
}
|
||||
phraseTableFile << "||| ";
|
||||
}
|
||||
|
||||
// english phrase
|
||||
PHRASE phraseE = phraseTableE.getPhrase( i->first );
|
||||
for(int j=0;j<phraseE.size();j++)
|
||||
{
|
||||
phraseTableFile << vcbE.getWord( phraseE[j] );
|
||||
phraseTableFile << " ";
|
||||
}
|
||||
phraseTableFile << "||| ";
|
||||
|
||||
// foreign phrase (if inverse)
|
||||
if (inverseFlag) {
|
||||
for(int j=0;j<phraseF.size();j++)
|
||||
{
|
||||
phraseTableFile << vcbF.getWord( phraseF[j] );
|
||||
phraseTableFile << " ";
|
||||
}
|
||||
phraseTableFile << "||| ";
|
||||
}
|
||||
|
||||
// phrase pair frequency
|
||||
phraseTableFile << i->second;
|
||||
|
||||
//source phrase pair frequency
|
||||
phraseTableFile << " " << phrasePair.size();
|
||||
|
||||
// source phrase length
|
||||
phraseTableFile << " " << phraseF.size();
|
||||
|
||||
// target phrase length
|
||||
phraseTableFile << " " << phraseE.size();
|
||||
|
||||
phraseTableFile << endl;
|
||||
|
||||
index += i->second;
|
||||
}
|
||||
}
|
||||
|
||||
bool PhraseAlignment::create( char line[], int lineID ) {
|
||||
vector< string > token = tokenize( line );
|
||||
int item = 1;
|
||||
PHRASE phraseF, phraseE;
|
||||
for (int j=0; j<token.size(); j++) {
|
||||
if (token[j] == "|||") item++;
|
||||
else {
|
||||
if (item == 1)
|
||||
phraseF.push_back( vcbF.storeIfNew( token[j] ) );
|
||||
else if (item == 2)
|
||||
phraseE.push_back( vcbE.storeIfNew( token[j] ) );
|
||||
else if (item == 3) {
|
||||
int e,f;
|
||||
sscanf(token[j].c_str(), "%d-%d", &f, &e);
|
||||
if (e >= phraseE.size() || f >= phraseF.size()) {
|
||||
cerr << "WARNING: sentence " << lineID << " has alignment point (" << f << ", " << e << ") out of bounds (" << phraseF.size() << ", " << phraseE.size() << ")\n"; }
|
||||
else {
|
||||
if (alignedToE.size() == 0) {
|
||||
vector< size_t > dummy;
|
||||
for(int i=0;i<phraseE.size();i++)
|
||||
alignedToE.push_back( dummy );
|
||||
for(int i=0;i<phraseF.size();i++)
|
||||
alignedToF.push_back( dummy );
|
||||
foreign = phraseTableF.storeIfNew( phraseF );
|
||||
english = phraseTableE.storeIfNew( phraseE );
|
||||
}
|
||||
alignedToE[e].push_back( f );
|
||||
alignedToF[f].push_back( e );
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return (item>2); // real phrase pair, not just foreign phrase
|
||||
}
|
||||
|
||||
// Discards all alignment rows of both directions. The phrase ids (english,
// foreign) are left as they are.
void PhraseAlignment::clear() {
  // clearing the outer vectors destroys the per-position rows as well
  alignedToE.clear();
  alignedToF.clear();
}
|
||||
|
||||
bool PhraseAlignment::equals( const PhraseAlignment& other ) {
|
||||
if (this == &other) return true;
|
||||
if (other.english != english) return false;
|
||||
if (other.foreign != foreign) return false;
|
||||
PHRASE phraseE = phraseTableE.getPhrase( english );
|
||||
PHRASE phraseF = phraseTableF.getPhrase( foreign );
|
||||
for(int i=0;i<phraseE.size();i++) {
|
||||
if (alignedToE[i].size() != other.alignedToE[i].size()) return false;
|
||||
for(int j=0; j<alignedToE[i].size(); j++) {
|
||||
if (alignedToE[i][j] != other.alignedToE[i][j]) return false;
|
||||
}
|
||||
}
|
||||
for(int i=0;i<phraseF.size();i++) {
|
||||
if (alignedToF[i].size() != other.alignedToF[i].size()) return false;
|
||||
for(int j=0; j<alignedToF[i].size(); j++) {
|
||||
if (alignedToF[i][j] != other.alignedToF[i][j]) return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
void LexicalTable::load( char *fileName ) {
|
||||
cerr << "Loading lexical translation table from " << fileName;
|
||||
ifstream inFile;
|
||||
inFile.open(fileName);
|
||||
if (inFile.fail()) {
|
||||
cerr << " - ERROR: could not open file\n";
|
||||
exit(1);
|
||||
}
|
||||
istream *inFileP = &inFile;
|
||||
|
||||
char line[LINE_MAX_LENGTH];
|
||||
|
||||
int i=0;
|
||||
while(true) {
|
||||
i++;
|
||||
if (i%100000 == 0) cerr << "." << flush;
|
||||
SAFE_GETLINE((*inFileP), line, LINE_MAX_LENGTH, '\n');
|
||||
if (inFileP->eof()) break;
|
||||
|
||||
vector<string> token = tokenize( line );
|
||||
if (token.size() != 3) {
|
||||
cerr << "line " << i << " in " << fileName << " has wrong number of tokens, skipping:\n" <<
|
||||
token.size() << " " << token[0] << " " << line << endl;
|
||||
continue;
|
||||
}
|
||||
|
||||
double prob = atof( token[2].c_str() );
|
||||
WORD_ID wordE = vcbE.storeIfNew( token[0] );
|
||||
WORD_ID wordF = vcbF.storeIfNew( token[1] );
|
||||
ltable[ wordF ][ wordE ] = prob;
|
||||
}
|
||||
cerr << endl;
|
||||
}
|
Loading…
Reference in New Issue
Block a user