mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-26 21:42:19 +03:00
346 lines
9.7 KiB
C++
346 lines
9.7 KiB
C++
|
// $Id$
|
||
|
// vim:tabstop=2
|
||
|
|
||
|
#include <sstream>
|
||
|
#include <cstdio>
|
||
|
#include <iostream>
|
||
|
#include <fstream>
|
||
|
#include <vector>
|
||
|
#include <string>
|
||
|
#include <stdlib.h>
|
||
|
#include <assert.h>
|
||
|
#include <time.h>
|
||
|
#include "AlignmentPhrase.h"
|
||
|
#include "tables-core.h"
|
||
|
|
||
|
using namespace std;
|
||
|
|
||
|
// SAFE_GETLINE(_IS, _LINE, _SIZE, _DELIM)
//   Reads one _DELIM-terminated record from the input stream _IS into the
//   character buffer _LINE, whose capacity is _SIZE bytes.
//   - A pure failbit (not badbit, not eofbit) left by getline() is cleared so
//     that the caller can keep reading.
//   - If getline() reports _SIZE-1 extracted characters the record is treated
//     as too long for the buffer: a diagnostic is printed to stderr and the
//     program terminates with exit status 1.
//   The body is wrapped in do { ... } while (0) so the macro expands to a
//   single statement (safe inside an unbraced if/else), every parameter is
//   parenthesized, and the stream names are std-qualified so the expansion
//   does not depend on a using-directive at the call site.
#define SAFE_GETLINE(_IS, _LINE, _SIZE, _DELIM) do {                          \
    (_IS).getline((_LINE), (_SIZE), (_DELIM));                                \
    if ((_IS).fail() && !(_IS).bad() && !(_IS).eof()) (_IS).clear();          \
    if ((_IS).gcount() == (_SIZE) - 1) {                                      \
      std::cerr << "Line too long! Buffer overflow. Delete lines >="          \
                << (_SIZE) << " chars or raise LINE_MAX_LENGTH in phrase-extract/statistics.cpp" \
                << std::endl;                                                 \
      exit(1);                                                                \
    }                                                                         \
  } while (0)

// Capacity, in bytes, of the line buffers handed to SAFE_GETLINE in this file.
#define LINE_MAX_LENGTH 10000
|
||
|
|
||
|
// One line of the extract file: a foreign/english phrase pair plus the word
// alignment points listed for it.
class PhraseAlignment
{
public:
  // Phrase ids; create() obtains them from the global phrase tables
  // (phraseTableE / phraseTableF) via storeIfNew().
  int english;
  int foreign;

  // alignedToE[e] holds the foreign word positions aligned to english word e;
  // alignedToF[f] holds the english word positions aligned to foreign word f.
  std::vector< std::vector<size_t> > alignedToE;
  std::vector< std::vector<size_t> > alignedToF;

  // Parses one extract line; the int is the line number used in warnings.
  // Returns true when the line contained more than two "|||"-separated items.
  bool create( char line[], int lineID );

  // Drops all stored alignment points.
  void clear();

  // True when both objects refer to the same phrase pair with the same
  // alignment points.
  bool equals( const PhraseAlignment& other );
};
|
||
|
|
||
|
class LexicalTable {
|
||
|
public:
|
||
|
map< WORD_ID, map< WORD_ID, double > > ltable;
|
||
|
void load( char[] );
|
||
|
};
|
||
|
|
||
|
void processPhrasePairs( vector< PhraseAlignment > & );
|
||
|
|
||
|
ofstream phraseTableFile;
|
||
|
|
||
|
Vocabulary vcbE;
|
||
|
Vocabulary vcbF;
|
||
|
LexicalTable lexTable;
|
||
|
PhraseTable phraseTableE;
|
||
|
PhraseTable phraseTableF;
|
||
|
bool inverseFlag;
|
||
|
int phrasePairBase = 0; // only used for "proper" conditioning
|
||
|
|
||
|
int main(int argc, char* argv[])
|
||
|
{
|
||
|
cerr << "PhraseStatistics v1.1 written by Nicola Bertoldi\n"
|
||
|
<< "modifying PhraseScore v1.4 written by Philipp Koehn\n"
|
||
|
<< "It computes statistics for extracted phrase pairs\n"
|
||
|
<< "if (direct):\n"
|
||
|
<< "src_phrase ||| trg_phrase || freq(src_phrase, trg_phrase) freq(src_phrase) length(src_phrase) length(trg_phrase)\n"
|
||
|
<< "if (inverse)\n"
|
||
|
<< "src_phrase ||| trg_phrase || freq(src_phrase, trg_phrase) freq(trg_phrase) length(src_phrase) length(trg_phrase)\n";
|
||
|
time_t starttime = time(NULL);
|
||
|
|
||
|
if (argc != 4 && argc != 5) {
|
||
|
cerr << "syntax: statistics extract lex phrase-table [inverse]\n";
|
||
|
exit(1);
|
||
|
}
|
||
|
char* &fileNameExtract = argv[1];
|
||
|
char* &fileNameLex = argv[2];
|
||
|
char* &fileNamePhraseTable = argv[3];
|
||
|
inverseFlag = false;
|
||
|
if (argc > 4) {
|
||
|
inverseFlag = true;
|
||
|
cerr << "using inverse mode\n";
|
||
|
}
|
||
|
|
||
|
// lexical translation table
|
||
|
lexTable.load( fileNameLex );
|
||
|
|
||
|
// sorted phrase extraction file
|
||
|
ifstream extractFile;
|
||
|
|
||
|
extractFile.open(fileNameExtract);
|
||
|
if (extractFile.fail()) {
|
||
|
cerr << "ERROR: could not open extract file " << fileNameExtract << endl;
|
||
|
exit(1);
|
||
|
}
|
||
|
istream &extractFileP = extractFile;
|
||
|
|
||
|
// output file: phrase translation table
|
||
|
phraseTableFile.open(fileNamePhraseTable);
|
||
|
if (phraseTableFile.fail()) {
|
||
|
cerr << "ERROR: could not open file phrase table file "
|
||
|
<< fileNamePhraseTable << endl;
|
||
|
exit(1);
|
||
|
}
|
||
|
|
||
|
// loop through all extracted phrase translations
|
||
|
int lastForeign = -1;
|
||
|
vector< PhraseAlignment > phrasePairsWithSameF;
|
||
|
int i=0;
|
||
|
int fileCount = 0;
|
||
|
while(true) {
|
||
|
if (extractFileP.eof()) break;
|
||
|
if (++i % 100000 == 0) cerr << "." << flush;
|
||
|
char line[LINE_MAX_LENGTH];
|
||
|
SAFE_GETLINE((extractFileP), line, LINE_MAX_LENGTH, '\n');
|
||
|
// if (fileCount>0)
|
||
|
if (extractFileP.eof())
|
||
|
break;
|
||
|
PhraseAlignment phrasePair;
|
||
|
bool isPhrasePair = phrasePair.create( line, i );
|
||
|
if (lastForeign >= 0 && lastForeign != phrasePair.foreign) {
|
||
|
processPhrasePairs( phrasePairsWithSameF );
|
||
|
for(int j=0;j<phrasePairsWithSameF.size();j++)
|
||
|
phrasePairsWithSameF[j].clear();
|
||
|
phrasePairsWithSameF.clear();
|
||
|
phraseTableE.clear();
|
||
|
phraseTableF.clear();
|
||
|
phrasePair.clear(); // process line again, since phrase tables flushed
|
||
|
phrasePair.create( line, i );
|
||
|
phrasePairBase = 0;
|
||
|
}
|
||
|
lastForeign = phrasePair.foreign;
|
||
|
if (isPhrasePair)
|
||
|
phrasePairsWithSameF.push_back( phrasePair );
|
||
|
else
|
||
|
phrasePairBase++;
|
||
|
}
|
||
|
processPhrasePairs( phrasePairsWithSameF );
|
||
|
phraseTableFile.close();
|
||
|
}
|
||
|
|
||
|
void processPhrasePairs( vector< PhraseAlignment > &phrasePair ) {
|
||
|
if (phrasePair.size() == 0) return;
|
||
|
map<int, int> countE;
|
||
|
map<int, int> alignmentE;
|
||
|
int totalCount = 0;
|
||
|
int currentCount = 0;
|
||
|
int maxSameCount = 0;
|
||
|
int maxSame = -1;
|
||
|
int old = -1;
|
||
|
for(int i=0;i<phrasePair.size();i++) {
|
||
|
if (i>0) {
|
||
|
if (phrasePair[old].english == phrasePair[i].english) {
|
||
|
if (! phrasePair[i].equals( phrasePair[old] )) {
|
||
|
if (currentCount > maxSameCount) {
|
||
|
maxSameCount = currentCount;
|
||
|
maxSame = i-1;
|
||
|
}
|
||
|
currentCount = 0;
|
||
|
}
|
||
|
}
|
||
|
else {
|
||
|
// wrap up old E
|
||
|
if (currentCount > maxSameCount) {
|
||
|
maxSameCount = currentCount;
|
||
|
maxSame = i-1;
|
||
|
}
|
||
|
|
||
|
alignmentE[ phrasePair[old].english ] = maxSame;
|
||
|
// if (maxSameCount != totalCount)
|
||
|
// cout << "max count is " << maxSameCount << "/" << totalCount << endl;
|
||
|
|
||
|
// get ready for new E
|
||
|
totalCount = 0;
|
||
|
currentCount = 0;
|
||
|
maxSameCount = 0;
|
||
|
maxSame = -1;
|
||
|
}
|
||
|
}
|
||
|
countE[ phrasePair[i].english ]++;
|
||
|
old = i;
|
||
|
currentCount++;
|
||
|
totalCount++;
|
||
|
}
|
||
|
|
||
|
// wrap up old E
|
||
|
if (currentCount > maxSameCount) {
|
||
|
maxSameCount = currentCount;
|
||
|
maxSame = phrasePair.size()-1;
|
||
|
}
|
||
|
alignmentE[ phrasePair[old].english ] = maxSame;
|
||
|
// if (maxSameCount != totalCount)
|
||
|
// cout << "max count is " << maxSameCount << "/" << totalCount << endl;
|
||
|
|
||
|
// output table
|
||
|
typedef map< int, int >::iterator II;
|
||
|
PHRASE phraseF = phraseTableF.getPhrase( phrasePair[0].foreign );
|
||
|
size_t index = 0;
|
||
|
for(II i = countE.begin(); i != countE.end(); i++) {
|
||
|
//cout << "\tp( " << i->first << " | " << phrasePair[0].foreign << " ; " << phraseF.size() << " ) = ...\n";
|
||
|
//cerr << index << endl;
|
||
|
|
||
|
// foreign phrase (unless inverse)
|
||
|
if (! inverseFlag) {
|
||
|
for(int j=0;j<phraseF.size();j++)
|
||
|
{
|
||
|
phraseTableFile << vcbF.getWord( phraseF[j] );
|
||
|
phraseTableFile << " ";
|
||
|
}
|
||
|
phraseTableFile << "||| ";
|
||
|
}
|
||
|
|
||
|
// english phrase
|
||
|
PHRASE phraseE = phraseTableE.getPhrase( i->first );
|
||
|
for(int j=0;j<phraseE.size();j++)
|
||
|
{
|
||
|
phraseTableFile << vcbE.getWord( phraseE[j] );
|
||
|
phraseTableFile << " ";
|
||
|
}
|
||
|
phraseTableFile << "||| ";
|
||
|
|
||
|
// foreign phrase (if inverse)
|
||
|
if (inverseFlag) {
|
||
|
for(int j=0;j<phraseF.size();j++)
|
||
|
{
|
||
|
phraseTableFile << vcbF.getWord( phraseF[j] );
|
||
|
phraseTableFile << " ";
|
||
|
}
|
||
|
phraseTableFile << "||| ";
|
||
|
}
|
||
|
|
||
|
// phrase pair frequency
|
||
|
phraseTableFile << i->second;
|
||
|
|
||
|
//source phrase pair frequency
|
||
|
phraseTableFile << " " << phrasePair.size();
|
||
|
|
||
|
// source phrase length
|
||
|
phraseTableFile << " " << phraseF.size();
|
||
|
|
||
|
// target phrase length
|
||
|
phraseTableFile << " " << phraseE.size();
|
||
|
|
||
|
phraseTableFile << endl;
|
||
|
|
||
|
index += i->second;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
bool PhraseAlignment::create( char line[], int lineID ) {
|
||
|
vector< string > token = tokenize( line );
|
||
|
int item = 1;
|
||
|
PHRASE phraseF, phraseE;
|
||
|
for (int j=0; j<token.size(); j++) {
|
||
|
if (token[j] == "|||") item++;
|
||
|
else {
|
||
|
if (item == 1)
|
||
|
phraseF.push_back( vcbF.storeIfNew( token[j] ) );
|
||
|
else if (item == 2)
|
||
|
phraseE.push_back( vcbE.storeIfNew( token[j] ) );
|
||
|
else if (item == 3) {
|
||
|
int e,f;
|
||
|
sscanf(token[j].c_str(), "%d-%d", &f, &e);
|
||
|
if (e >= phraseE.size() || f >= phraseF.size()) {
|
||
|
cerr << "WARNING: sentence " << lineID << " has alignment point (" << f << ", " << e << ") out of bounds (" << phraseF.size() << ", " << phraseE.size() << ")\n"; }
|
||
|
else {
|
||
|
if (alignedToE.size() == 0) {
|
||
|
vector< size_t > dummy;
|
||
|
for(int i=0;i<phraseE.size();i++)
|
||
|
alignedToE.push_back( dummy );
|
||
|
for(int i=0;i<phraseF.size();i++)
|
||
|
alignedToF.push_back( dummy );
|
||
|
foreign = phraseTableF.storeIfNew( phraseF );
|
||
|
english = phraseTableE.storeIfNew( phraseE );
|
||
|
}
|
||
|
alignedToE[e].push_back( f );
|
||
|
alignedToF[f].push_back( e );
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
return (item>2); // real phrase pair, not just foreign phrase
|
||
|
}
|
||
|
|
||
|
// Removes all stored alignment points. Clearing the outer vectors destroys
// their inner vectors too, so the former per-row clear() loops (which also
// compared an int index against an unsigned size) were redundant.
void PhraseAlignment::clear() {
  alignedToE.clear();
  alignedToF.clear();
}
|
||
|
|
||
|
bool PhraseAlignment::equals( const PhraseAlignment& other ) {
|
||
|
if (this == &other) return true;
|
||
|
if (other.english != english) return false;
|
||
|
if (other.foreign != foreign) return false;
|
||
|
PHRASE phraseE = phraseTableE.getPhrase( english );
|
||
|
PHRASE phraseF = phraseTableF.getPhrase( foreign );
|
||
|
for(int i=0;i<phraseE.size();i++) {
|
||
|
if (alignedToE[i].size() != other.alignedToE[i].size()) return false;
|
||
|
for(int j=0; j<alignedToE[i].size(); j++) {
|
||
|
if (alignedToE[i][j] != other.alignedToE[i][j]) return false;
|
||
|
}
|
||
|
}
|
||
|
for(int i=0;i<phraseF.size();i++) {
|
||
|
if (alignedToF[i].size() != other.alignedToF[i].size()) return false;
|
||
|
for(int j=0; j<alignedToF[i].size(); j++) {
|
||
|
if (alignedToF[i][j] != other.alignedToF[i][j]) return false;
|
||
|
}
|
||
|
}
|
||
|
return true;
|
||
|
}
|
||
|
|
||
|
void LexicalTable::load( char *fileName ) {
|
||
|
cerr << "Loading lexical translation table from " << fileName;
|
||
|
ifstream inFile;
|
||
|
inFile.open(fileName);
|
||
|
if (inFile.fail()) {
|
||
|
cerr << " - ERROR: could not open file\n";
|
||
|
exit(1);
|
||
|
}
|
||
|
istream *inFileP = &inFile;
|
||
|
|
||
|
char line[LINE_MAX_LENGTH];
|
||
|
|
||
|
int i=0;
|
||
|
while(true) {
|
||
|
i++;
|
||
|
if (i%100000 == 0) cerr << "." << flush;
|
||
|
SAFE_GETLINE((*inFileP), line, LINE_MAX_LENGTH, '\n');
|
||
|
if (inFileP->eof()) break;
|
||
|
|
||
|
vector<string> token = tokenize( line );
|
||
|
if (token.size() != 3) {
|
||
|
cerr << "line " << i << " in " << fileName << " has wrong number of tokens, skipping:\n" <<
|
||
|
token.size() << " " << token[0] << " " << line << endl;
|
||
|
continue;
|
||
|
}
|
||
|
|
||
|
double prob = atof( token[2].c_str() );
|
||
|
WORD_ID wordE = vcbE.storeIfNew( token[0] );
|
||
|
WORD_ID wordF = vcbF.storeIfNew( token[1] );
|
||
|
ltable[ wordF ][ wordE ] = prob;
|
||
|
}
|
||
|
cerr << endl;
|
||
|
}
|