2010-01-08 20:16:37 +03:00
|
|
|
// $Id$
|
|
|
|
// vim:tabstop=2
|
|
|
|
|
|
|
|
#include <sstream>
|
|
|
|
#include <cstdio>
|
|
|
|
#include <iostream>
|
|
|
|
#include <fstream>
|
|
|
|
#include <vector>
|
|
|
|
#include <string>
|
2015-03-28 15:56:20 +03:00
|
|
|
#include <cstdlib>
|
|
|
|
#include <cassert>
|
|
|
|
#include <ctime>
|
2010-04-13 20:29:55 +04:00
|
|
|
|
2010-01-08 20:16:37 +03:00
|
|
|
#include "AlignmentPhrase.h"
|
|
|
|
#include "tables-core.h"
|
2011-10-13 22:57:23 +04:00
|
|
|
#include "InputFileStream.h"
|
Unify tokenize() into util, and unit-test it.
The duplicate definition works fine in environments where the inline
definition becomes a weak symbol in the object file, but if it gets
generated as a regular definition, the duplicate definition causes link
problems.
In most call sites the return value could easily be made const, which
gives both the reader and the compiler a bit more certainty about the code's
intentions. In theory this may help performance, but it's mainly for clarity.
The comments are based on reverse-engineering, and the unit tests are based
on the comments. It's possible that some of what's in there is not essential,
in which case, don't feel bad about changing it!
I left a third identical definition in place, though I updated it with my
changes to avoid creeping divergence, and noted the duplication in a comment.
It would be nice to get rid of this definition as well, but it'd introduce
headers from the main Moses tree into biconcor, which may be against policy.
2015-04-22 05:59:05 +03:00
|
|
|
#include "util/tokenize.hh"
|
2010-01-08 20:16:37 +03:00
|
|
|
|
|
|
|
using namespace std;
|
2012-06-30 19:56:53 +04:00
|
|
|
using namespace MosesTraining;
|
2010-01-08 20:16:37 +03:00
|
|
|
|
2012-07-02 20:05:11 +04:00
|
|
|
namespace MosesTraining
|
|
|
|
{
|
|
|
|
|
2011-02-24 16:57:11 +03:00
|
|
|
// One extracted phrase pair: interned source/target phrase IDs plus the
// word alignment between them, parsed from a single extract-file line.
class PhraseAlignment
{
public:
  int english, foreign; // phrase-table IDs assigned by create() via storeIfNew()
  // alignedToE[e] lists foreign word positions aligned to English word e;
  // alignedToF[f] lists English word positions aligned to foreign word f.
  vector< vector<size_t> > alignedToE;
  vector< vector<size_t> > alignedToF;

  // Parse one extract line ("foreign ||| english ||| alignment"); the int is
  // the 1-based line number used in warnings. Returns true for a real phrase
  // pair (alignment section present).
  bool create( const char*, int );
  // Drop all recorded alignment points.
  void clear();
  // Deep equality: same interned phrases and identical alignments.
  bool equals( const PhraseAlignment& );
};
|
|
|
|
|
2011-02-24 16:57:11 +03:00
|
|
|
// Lexical translation probabilities, indexed as ltable[foreign][english].
class LexicalTable
{
public:
  map< WORD_ID, map< WORD_ID, double > > ltable;
  // Load "english foreign probability" triples from the given file,
  // exiting on open failure and skipping malformed lines.
  void load( const string &);
};
|
|
|
|
|
2012-07-02 20:05:11 +04:00
|
|
|
}
|
|
|
|
|
2010-01-08 20:16:37 +03:00
|
|
|
// Score and print one batch of phrase pairs that all share the same
// foreign phrase (defined below main()).
void processPhrasePairs( vector< PhraseAlignment > & );

// Output stream for the generated phrase table (opened in main()).
ofstream phraseTableFile;

// File-scope state shared by main(), processPhrasePairs() and the
// PhraseAlignment / LexicalTable member functions.
Vocabulary vcbE;          // English (target-side) word vocabulary
Vocabulary vcbF;          // foreign (source-side) word vocabulary
LexicalTable lexTable;    // lexical translation table loaded in main()
PhraseTable phraseTableE; // interned English phrases
PhraseTable phraseTableF; // interned foreign phrases
bool inverseFlag;         // set in main() when the optional 4th arg is given
int phrasePairBase = 0; // only used for "proper" conditioning
|
|
|
|
|
2011-02-24 16:57:11 +03:00
|
|
|
int main(int argc, char* argv[])
|
2010-01-08 20:16:37 +03:00
|
|
|
{
|
|
|
|
cerr << "PhraseStatistics v1.1 written by Nicola Bertoldi\n"
|
2011-02-24 16:57:11 +03:00
|
|
|
<< "modifying PhraseScore v1.4 written by Philipp Koehn\n"
|
2010-01-08 20:16:37 +03:00
|
|
|
<< "It computes statistics for extracted phrase pairs\n"
|
|
|
|
<< "if (direct):\n"
|
|
|
|
<< "src_phrase ||| trg_phrase || freq(src_phrase, trg_phrase) freq(src_phrase) length(src_phrase) length(trg_phrase)\n"
|
|
|
|
<< "if (inverse)\n"
|
|
|
|
<< "src_phrase ||| trg_phrase || freq(src_phrase, trg_phrase) freq(trg_phrase) length(src_phrase) length(trg_phrase)\n";
|
|
|
|
|
|
|
|
if (argc != 4 && argc != 5) {
|
|
|
|
cerr << "syntax: statistics extract lex phrase-table [inverse]\n";
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
char* &fileNameExtract = argv[1];
|
|
|
|
char* &fileNameLex = argv[2];
|
|
|
|
char* &fileNamePhraseTable = argv[3];
|
|
|
|
inverseFlag = false;
|
|
|
|
if (argc > 4) {
|
|
|
|
inverseFlag = true;
|
|
|
|
cerr << "using inverse mode\n";
|
|
|
|
}
|
|
|
|
|
|
|
|
// lexical translation table
|
|
|
|
lexTable.load( fileNameLex );
|
2011-02-24 16:57:11 +03:00
|
|
|
|
2010-01-08 20:16:37 +03:00
|
|
|
// sorted phrase extraction file
|
2011-10-13 22:57:23 +04:00
|
|
|
Moses::InputFileStream extractFile(fileNameExtract);
|
2010-01-08 20:16:37 +03:00
|
|
|
|
|
|
|
if (extractFile.fail()) {
|
|
|
|
cerr << "ERROR: could not open extract file " << fileNameExtract << endl;
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
istream &extractFileP = extractFile;
|
|
|
|
|
|
|
|
// output file: phrase translation table
|
|
|
|
phraseTableFile.open(fileNamePhraseTable);
|
|
|
|
if (phraseTableFile.fail()) {
|
2011-02-24 16:57:11 +03:00
|
|
|
cerr << "ERROR: could not open file phrase table file "
|
|
|
|
<< fileNamePhraseTable << endl;
|
2010-01-08 20:16:37 +03:00
|
|
|
exit(1);
|
|
|
|
}
|
2011-02-24 16:57:11 +03:00
|
|
|
|
2010-01-08 20:16:37 +03:00
|
|
|
// loop through all extracted phrase translations
|
|
|
|
int lastForeign = -1;
|
|
|
|
vector< PhraseAlignment > phrasePairsWithSameF;
|
|
|
|
int i=0;
|
2014-06-08 19:23:14 +04:00
|
|
|
|
|
|
|
string line;
|
|
|
|
while(getline(extractFileP, line)) {
|
2010-01-08 20:16:37 +03:00
|
|
|
if (extractFileP.eof()) break;
|
|
|
|
if (++i % 100000 == 0) cerr << "." << flush;
|
2014-06-08 19:23:14 +04:00
|
|
|
|
2010-01-08 20:16:37 +03:00
|
|
|
PhraseAlignment phrasePair;
|
2014-06-08 19:23:14 +04:00
|
|
|
bool isPhrasePair = phrasePair.create( line.c_str(), i );
|
2010-01-08 20:16:37 +03:00
|
|
|
if (lastForeign >= 0 && lastForeign != phrasePair.foreign) {
|
|
|
|
processPhrasePairs( phrasePairsWithSameF );
|
2012-05-10 16:48:51 +04:00
|
|
|
for(size_t j=0; j<phrasePairsWithSameF.size(); j++)
|
2011-02-24 16:57:11 +03:00
|
|
|
phrasePairsWithSameF[j].clear();
|
2010-01-08 20:16:37 +03:00
|
|
|
phrasePairsWithSameF.clear();
|
|
|
|
phraseTableE.clear();
|
|
|
|
phraseTableF.clear();
|
|
|
|
phrasePair.clear(); // process line again, since phrase tables flushed
|
2014-06-08 19:23:14 +04:00
|
|
|
phrasePair.create( line.c_str(), i );
|
2010-01-08 20:16:37 +03:00
|
|
|
phrasePairBase = 0;
|
|
|
|
}
|
|
|
|
lastForeign = phrasePair.foreign;
|
|
|
|
if (isPhrasePair)
|
|
|
|
phrasePairsWithSameF.push_back( phrasePair );
|
|
|
|
else
|
|
|
|
phrasePairBase++;
|
|
|
|
}
|
|
|
|
processPhrasePairs( phrasePairsWithSameF );
|
|
|
|
phraseTableFile.close();
|
|
|
|
}
|
|
|
|
|
2011-02-24 16:57:11 +03:00
|
|
|
// Score one batch of phrase pairs that all share the same foreign phrase.
// Counts how often each English phrase occurs in the batch (countE), tracks
// for each English phrase the index of its most frequent alignment variant
// (alignmentE), and prints one phrase-table line per distinct English phrase.
// NOTE(review): assumes the batch is grouped by English phrase (the extract
// file is sorted), so equal .english values are adjacent — confirm with caller.
void processPhrasePairs( vector< PhraseAlignment > &phrasePair )
{
  if (phrasePair.size() == 0) return;
  map<int, int> countE;     // English phrase ID -> occurrence count in batch
  map<int, int> alignmentE; // English phrase ID -> index of dominant alignment
  int totalCount = 0;       // pairs seen for the current English phrase
  int currentCount = 0;     // run length of the current identical alignment
  int maxSameCount = 0;     // longest identical-alignment run so far
  int maxSame = -1;         // last index of that longest run
  int old = -1;             // index of the previously inspected pair
  for(size_t i=0; i<phrasePair.size(); i++) {
    if (i>0) {
      if (phrasePair[old].english == phrasePair[i].english) {
        // same English phrase: a differing alignment ends the current run
        if (! phrasePair[i].equals( phrasePair[old] )) {
          if (currentCount > maxSameCount) {
            maxSameCount = currentCount;
            maxSame = i-1;
          }
          currentCount = 0;
        }
      } else {
        // wrap up old E
        if (currentCount > maxSameCount) {
          maxSameCount = currentCount;
          maxSame = i-1;
        }

        alignmentE[ phrasePair[old].english ] = maxSame;
        // if (maxSameCount != totalCount)
        //  cout << "max count is " << maxSameCount << "/" << totalCount << endl;

        // get ready for new E
        totalCount = 0;
        currentCount = 0;
        maxSameCount = 0;
        maxSame = -1;
      }
    }
    countE[ phrasePair[i].english ]++;
    old = i;
    currentCount++;
    totalCount++;
  }

  // wrap up old E
  if (currentCount > maxSameCount) {
    maxSameCount = currentCount;
    maxSame = phrasePair.size()-1;
  }
  alignmentE[ phrasePair[old].english ] = maxSame;
  // if (maxSameCount != totalCount)
  //  cout << "max count is " << maxSameCount << "/" << totalCount << endl;

  // output table: one line per distinct English phrase in the batch
  typedef map< int, int >::iterator II;
  PHRASE phraseF = phraseTableF.getPhrase( phrasePair[0].foreign );
  size_t index = 0;
  for(II i = countE.begin(); i != countE.end(); i++) {
    //cout << "\tp( " << i->first << " | " << phrasePair[0].foreign << " ; " << phraseF.size() << " ) = ...\n";
    //cerr << index << endl;

    // foreign phrase (unless inverse)
    if (! inverseFlag) {
      for(size_t j=0; j<phraseF.size(); j++) {
        phraseTableFile << vcbF.getWord( phraseF[j] );
        phraseTableFile << " ";
      }
      phraseTableFile << "||| ";
    }

    // english phrase
    PHRASE phraseE = phraseTableE.getPhrase( i->first );
    for(size_t j=0; j<phraseE.size(); j++) {
      phraseTableFile << vcbE.getWord( phraseE[j] );
      phraseTableFile << " ";
    }
    phraseTableFile << "||| ";

    // foreign phrase (if inverse)
    if (inverseFlag) {
      for(size_t j=0; j<phraseF.size(); j++) {
        phraseTableFile << vcbF.getWord( phraseF[j] );
        phraseTableFile << " ";
      }
      phraseTableFile << "||| ";
    }

    // phrase pair frequency
    phraseTableFile << i->second;

    //source phrase pair frequency
    phraseTableFile << " " << phrasePair.size();

    // source phrase length
    phraseTableFile << " " << phraseF.size();

    // target phrase length
    phraseTableFile << " " << phraseE.size();

    phraseTableFile << endl;

    index += i->second;
  }
}
|
|
|
|
|
2014-06-08 19:23:14 +04:00
|
|
|
bool PhraseAlignment::create(const char line[], int lineID )
|
2011-02-24 16:57:11 +03:00
|
|
|
{
|
Unify tokenize() into util, and unit-test it.
The duplicate definition works fine in environments where the inline
definition becomes a weak symbol in the object file, but if it gets
generated as a regular definition, the duplicate definition causes link
problems.
In most call sites the return value could easily be made const, which
gives both the reader and the compiler a bit more certainty about the code's
intentions. In theory this may help performance, but it's mainly for clarity.
The comments are based on reverse-engineering, and the unit tests are based
on the comments. It's possible that some of what's in there is not essential,
in which case, don't feel bad about changing it!
I left a third identical definition in place, though I updated it with my
changes to avoid creeping divergence, and noted the duplication in a comment.
It would be nice to get rid of this definition as well, but it'd introduce
headers from the main Moses tree into biconcor, which may be against policy.
2015-04-22 05:59:05 +03:00
|
|
|
const vector< string > token = util::tokenize( line );
|
2010-01-08 20:16:37 +03:00
|
|
|
int item = 1;
|
|
|
|
PHRASE phraseF, phraseE;
|
2012-05-10 16:48:51 +04:00
|
|
|
for (size_t j=0; j<token.size(); j++) {
|
2010-01-08 20:16:37 +03:00
|
|
|
if (token[j] == "|||") item++;
|
|
|
|
else {
|
|
|
|
if (item == 1)
|
2011-02-24 16:57:11 +03:00
|
|
|
phraseF.push_back( vcbF.storeIfNew( token[j] ) );
|
2010-01-08 20:16:37 +03:00
|
|
|
else if (item == 2)
|
2011-02-24 16:57:11 +03:00
|
|
|
phraseE.push_back( vcbE.storeIfNew( token[j] ) );
|
2010-01-08 20:16:37 +03:00
|
|
|
else if (item == 3) {
|
2011-02-24 16:57:11 +03:00
|
|
|
int e,f;
|
|
|
|
sscanf(token[j].c_str(), "%d-%d", &f, &e);
|
2012-05-10 16:48:51 +04:00
|
|
|
if ((size_t)e >= phraseE.size() || (size_t)f >= phraseF.size()) {
|
2011-02-24 16:57:11 +03:00
|
|
|
cerr << "WARNING: sentence " << lineID << " has alignment point (" << f << ", " << e << ") out of bounds (" << phraseF.size() << ", " << phraseE.size() << ")\n";
|
|
|
|
} else {
|
|
|
|
if (alignedToE.size() == 0) {
|
|
|
|
vector< size_t > dummy;
|
2012-05-10 16:48:51 +04:00
|
|
|
for(size_t i=0; i<phraseE.size(); i++)
|
2011-02-24 16:57:11 +03:00
|
|
|
alignedToE.push_back( dummy );
|
2012-05-10 16:48:51 +04:00
|
|
|
for(size_t i=0; i<phraseF.size(); i++)
|
2011-02-24 16:57:11 +03:00
|
|
|
alignedToF.push_back( dummy );
|
|
|
|
foreign = phraseTableF.storeIfNew( phraseF );
|
|
|
|
english = phraseTableE.storeIfNew( phraseE );
|
|
|
|
}
|
|
|
|
alignedToE[e].push_back( f );
|
|
|
|
alignedToF[f].push_back( e );
|
|
|
|
}
|
2010-01-08 20:16:37 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return (item>2); // real phrase pair, not just foreign phrase
|
|
|
|
}
|
|
|
|
|
2011-02-24 16:57:11 +03:00
|
|
|
void PhraseAlignment::clear()
|
|
|
|
{
|
2012-05-10 16:48:51 +04:00
|
|
|
for(size_t i=0; i<alignedToE.size(); i++)
|
2010-01-08 20:16:37 +03:00
|
|
|
alignedToE[i].clear();
|
2012-05-10 16:48:51 +04:00
|
|
|
for(size_t i=0; i<alignedToF.size(); i++)
|
2010-01-08 20:16:37 +03:00
|
|
|
alignedToF[i].clear();
|
|
|
|
alignedToE.clear();
|
|
|
|
alignedToF.clear();
|
|
|
|
}
|
|
|
|
|
2011-02-24 16:57:11 +03:00
|
|
|
// Deep equality for phrase pairs: same interned English and foreign phrase
// IDs AND identical word alignments in both directions.
bool PhraseAlignment::equals( const PhraseAlignment& other )
{
  if (this == &other) return true; // trivially equal to itself
  // cheap ID comparisons first; IDs come from the global phrase tables
  if (other.english != english) return false;
  if (other.foreign != foreign) return false;
  PHRASE phraseE = phraseTableE.getPhrase( english );
  PHRASE phraseF = phraseTableF.getPhrase( foreign );
  // NOTE(review): indexes alignedToE by English phrase length and alignedToF
  // by foreign phrase length — assumes both objects were fully initialized by
  // create() with matching phrase sizes; confirm no partially-built pair
  // (no in-bounds alignment points) ever reaches this comparison.
  for(size_t i=0; i<phraseE.size(); i++) {
    if (alignedToE[i].size() != other.alignedToE[i].size()) return false;
    for(size_t j=0; j<alignedToE[i].size(); j++) {
      if (alignedToE[i][j] != other.alignedToE[i][j]) return false;
    }
  }
  for(size_t i=0; i<phraseF.size(); i++) {
    if (alignedToF[i].size() != other.alignedToF[i].size()) return false;
    for(size_t j=0; j<alignedToF[i].size(); j++) {
      if (alignedToF[i][j] != other.alignedToF[i][j]) return false;
    }
  }
  return true;
}
|
|
|
|
|
2012-07-31 05:21:48 +04:00
|
|
|
void LexicalTable::load( const string &filePath )
|
2011-02-24 16:57:11 +03:00
|
|
|
{
|
2012-07-31 05:21:48 +04:00
|
|
|
cerr << "Loading lexical translation table from " << filePath;
|
2010-01-08 20:16:37 +03:00
|
|
|
ifstream inFile;
|
2012-07-31 05:21:48 +04:00
|
|
|
inFile.open(filePath.c_str());
|
2010-01-08 20:16:37 +03:00
|
|
|
if (inFile.fail()) {
|
|
|
|
cerr << " - ERROR: could not open file\n";
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
istream *inFileP = &inFile;
|
|
|
|
|
2014-06-08 19:23:14 +04:00
|
|
|
string line;
|
2010-01-08 20:16:37 +03:00
|
|
|
|
|
|
|
int i=0;
|
2014-06-08 19:23:14 +04:00
|
|
|
while(getline(*inFileP, line)) {
|
2010-01-08 20:16:37 +03:00
|
|
|
i++;
|
|
|
|
if (i%100000 == 0) cerr << "." << flush;
|
|
|
|
|
Unify tokenize() into util, and unit-test it.
The duplicate definition works fine in environments where the inline
definition becomes a weak symbol in the object file, but if it gets
generated as a regular definition, the duplicate definition causes link
problems.
In most call sites the return value could easily be made const, which
gives both the reader and the compiler a bit more certainty about the code's
intentions. In theory this may help performance, but it's mainly for clarity.
The comments are based on reverse-engineering, and the unit tests are based
on the comments. It's possible that some of what's in there is not essential,
in which case, don't feel bad about changing it!
I left a third identical definition in place, though I updated it with my
changes to avoid creeping divergence, and noted the duplication in a comment.
It would be nice to get rid of this definition as well, but it'd introduce
headers from the main Moses tree into biconcor, which may be against policy.
2015-04-22 05:59:05 +03:00
|
|
|
const vector<string> token = util::tokenize( line.c_str() );
|
2010-01-08 20:16:37 +03:00
|
|
|
if (token.size() != 3) {
|
2012-07-31 05:32:58 +04:00
|
|
|
cerr << "line " << i << " in " << filePath << " has wrong number of tokens, skipping:\n" <<
|
2011-02-24 16:57:11 +03:00
|
|
|
token.size() << " " << token[0] << " " << line << endl;
|
2010-01-08 20:16:37 +03:00
|
|
|
continue;
|
|
|
|
}
|
2011-02-24 16:57:11 +03:00
|
|
|
|
2010-01-08 20:16:37 +03:00
|
|
|
double prob = atof( token[2].c_str() );
|
|
|
|
WORD_ID wordE = vcbE.storeIfNew( token[0] );
|
|
|
|
WORD_ID wordF = vcbF.storeIfNew( token[1] );
|
|
|
|
ltable[ wordF ][ wordE ] = prob;
|
|
|
|
}
|
|
|
|
cerr << endl;
|
|
|
|
}
|