mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-09-20 15:48:05 +03:00
extended extract/score, to allow for one big file, not just parts
git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@1903 1f5c12ca-751b-0410-a591-d2e778427230
This commit is contained in:
parent
d9d1b8f748
commit
614876771d
@ -34,6 +34,7 @@ class SentenceAlignment {
|
||||
// void clear() { delete(alignment); };
|
||||
};
|
||||
|
||||
void extractBase( SentenceAlignment & );
|
||||
void extract( SentenceAlignment & );
|
||||
void addPhrase( SentenceAlignment &, int, int, int, int );
|
||||
vector<string> tokenize( char [] );
|
||||
@ -45,17 +46,20 @@ ofstream extractFileOrientation;
|
||||
int maxPhraseLength;
|
||||
int phraseCount = 0;
|
||||
char* fileNameExtract;
|
||||
bool orientationFlag;
|
||||
bool onlyOutputSpanInfo;
|
||||
bool orientationFlag = false;
|
||||
bool onlyOutputSpanInfo = false;
|
||||
bool noFileLimit = false;
|
||||
bool zipFiles = false;
|
||||
bool properConditioning = false;
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
cerr << "PhraseExtract v1.3.0, written by Philipp Koehn\n"
|
||||
cerr << "PhraseExtract v1.4, written by Philipp Koehn\n"
|
||||
<< "phrase extraction from an aligned parallel corpus\n";
|
||||
time_t starttime = time(NULL);
|
||||
|
||||
if (argc != 6 && argc != 7) {
|
||||
cerr << "syntax: phrase-extract en de align extract max-length [orientation | --OnlyOutputSpanInfo]\n";
|
||||
if (argc < 6) {
|
||||
cerr << "syntax: phrase-extract en de align extract max-length [orientation | --OnlyOutputSpanInfo | --NoFileLimit | --ProperConditioning ]\n";
|
||||
exit(1);
|
||||
}
|
||||
char* &fileNameE = argv[1];
|
||||
@ -63,15 +67,28 @@ int main(int argc, char* argv[])
|
||||
char* &fileNameA = argv[3];
|
||||
fileNameExtract = argv[4];
|
||||
maxPhraseLength = atoi(argv[5]);
|
||||
onlyOutputSpanInfo = argc == 7 && strcmp(argv[6],"--OnlyOutputSpanInfo") == 0; //az
|
||||
if (onlyOutputSpanInfo) cerr << "Only outputting span info in format (starting from 0): SrcBegin SrcEnd TgtBegin TgtEnd\n"; //az
|
||||
orientationFlag = (argc == 7 && !onlyOutputSpanInfo);
|
||||
if (orientationFlag) cerr << "(also extracting orientation)\n";
|
||||
|
||||
// string fileNameE = "/data/nlp/koehn/europarl-v2/models/de-en/model/aligned.en";
|
||||
// string fileNameF = "/data/nlp/koehn/europarl-v2/models/de-en/model/aligned.de";
|
||||
// string fileNameA = "/data/nlp/koehn/europarl-v2/models/de-en/model/aligned.grow-diag-final";
|
||||
|
||||
for(int i=6;i<argc;i++) {
|
||||
if (strcmp(argv[i],"--OnlyOutputSpanInfo") == 0) {
|
||||
onlyOutputSpanInfo = true;
|
||||
}
|
||||
else if (strcmp(argv[i],"--NoFileLimit") == 0) {
|
||||
noFileLimit = true;
|
||||
}
|
||||
else if (strcmp(argv[i],"orientation") == 0 || strcmp(argv[i],"--Orientation") == 0) {
|
||||
orientationFlag = true;
|
||||
}
|
||||
else if (strcmp(argv[i],"--ZipFiles") == 0) {
|
||||
zipFiles = true;
|
||||
}
|
||||
else if (strcmp(argv[i],"--ProperConditioning") == 0) {
|
||||
properConditioning = true;
|
||||
}
|
||||
else {
|
||||
cerr << "extract: syntax error, unknown option '" << string(argv[i]) << "'\n";
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
ifstream eFile;
|
||||
ifstream fFile;
|
||||
ifstream aFile;
|
||||
@ -82,8 +99,6 @@ int main(int argc, char* argv[])
|
||||
istream *fFileP = &fFile;
|
||||
istream *aFileP = &aFile;
|
||||
|
||||
// string fileNameExtract = "/data/nlp/koehn/europarl-v2/models/de-en/model/new-extract";
|
||||
|
||||
int i=0;
|
||||
while(true) {
|
||||
i++;
|
||||
@ -105,8 +120,10 @@ int main(int argc, char* argv[])
|
||||
cout << "LOG: PHRASES_BEGIN:" << endl;
|
||||
}
|
||||
|
||||
if (sentence.create( englishString, foreignString, alignmentString, i ))
|
||||
if (sentence.create( englishString, foreignString, alignmentString, i )) {
|
||||
extract(sentence);
|
||||
if (properConditioning) extractBase(sentence);
|
||||
}
|
||||
if (onlyOutputSpanInfo) cout << "LOG: PHRASES_END:" << endl; //az: mark end of phrases
|
||||
}
|
||||
|
||||
@ -121,6 +138,33 @@ int main(int argc, char* argv[])
|
||||
}
|
||||
}
|
||||
|
||||
// if proper conditioning, we need the number of times a foreign phrase occured
|
||||
void extractBase( SentenceAlignment &sentence ) {
|
||||
int countF = sentence.foreign.size();
|
||||
for(int startF=0;startF<countF;startF++) {
|
||||
for(int endF=startF;
|
||||
(endF<countF && endF<startF+maxPhraseLength);
|
||||
endF++) {
|
||||
for(int fi=startF;fi<=endF;fi++) {
|
||||
extractFile << sentence.foreign[fi] << " ";
|
||||
}
|
||||
extractFile << "|||" << endl;
|
||||
}
|
||||
}
|
||||
|
||||
int countE = sentence.english.size();
|
||||
for(int startE=0;startE<countE;startE++) {
|
||||
for(int endE=startE;
|
||||
(endE<countE && endE<startE+maxPhraseLength);
|
||||
endE++) {
|
||||
for(int ei=startE;ei<=endE;ei++) {
|
||||
extractFileInv << sentence.english[ei] << " ";
|
||||
}
|
||||
extractFileInv << "|||" << endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void extract( SentenceAlignment &sentence ) {
|
||||
int countE = sentence.english.size();
|
||||
int countF = sentence.foreign.size();
|
||||
@ -181,25 +225,41 @@ void addPhrase( SentenceAlignment &sentence, int startE, int endE, int startF, i
|
||||
// foreign
|
||||
// cout << "adding ( " << startF << "-" << endF << ", " << startE << "-" << endE << ")\n";
|
||||
|
||||
if (onlyOutputSpanInfo) {
|
||||
cout << startF << " " << endF << " " << startE << " " << endE << endl;
|
||||
} else {
|
||||
if (onlyOutputSpanInfo) {
|
||||
cout << startF << " " << endF << " " << startE << " " << endE << endl;
|
||||
return;
|
||||
}
|
||||
|
||||
if (phraseCount % 10000000 == 0) {
|
||||
if (phraseCount>0) {
|
||||
// new file every 1e7 phrases
|
||||
if (phraseCount % 10000000 == 0 // new file every 1e7 phrases
|
||||
&& (!noFileLimit || phraseCount == 0)) { // only new partial file, if file limit
|
||||
|
||||
// close old file
|
||||
if (!noFileLimit && phraseCount>0) {
|
||||
extractFile.close();
|
||||
extractFileInv.close();
|
||||
if (orientationFlag) extractFileOrientation.close();
|
||||
}
|
||||
|
||||
// construct file name
|
||||
char part[10];
|
||||
sprintf(part,".part%04d",phraseCount/10000000);
|
||||
if (noFileLimit)
|
||||
part[0] = '\0';
|
||||
else
|
||||
sprintf(part,".part%04d",phraseCount/10000000);
|
||||
string fileNameExtractPart = string(fileNameExtract) + part;
|
||||
string fileNameExtractInvPart = string(fileNameExtract) + ".inv" + part;
|
||||
string fileNameExtractOrientationPart = string(fileNameExtract) + ".o" + part;
|
||||
|
||||
|
||||
// open files
|
||||
extractFile.open(fileNameExtractPart.c_str());
|
||||
extractFileInv.open(fileNameExtractInvPart.c_str());
|
||||
if (orientationFlag) extractFileOrientation.open(fileNameExtractOrientationPart.c_str());
|
||||
if (orientationFlag)
|
||||
extractFileOrientation.open(fileNameExtractOrientationPart.c_str());
|
||||
}
|
||||
|
||||
|
||||
phraseCount++;
|
||||
|
||||
for(int fi=startF;fi<=endF;fi++) {
|
||||
@ -258,7 +318,6 @@ void addPhrase( SentenceAlignment &sentence, int startE, int endE, int startF, i
|
||||
extractFile << "\n";
|
||||
extractFileInv << "\n";
|
||||
if (orientationFlag) extractFileOrientation << "\n";
|
||||
} // end: if (onlyOutputSpanInfo)
|
||||
}
|
||||
|
||||
bool isAligned ( SentenceAlignment &sentence, int fi, int ei ) {
|
||||
|
@ -33,7 +33,7 @@ public:
|
||||
vector< vector<size_t> > alignedToE;
|
||||
vector< vector<size_t> > alignedToF;
|
||||
|
||||
void create( char*, int );
|
||||
bool create( char*, int );
|
||||
void clear();
|
||||
bool equals( const PhraseAlignment& );
|
||||
};
|
||||
@ -56,10 +56,11 @@ LexicalTable lexTable;
|
||||
PhraseTable phraseTableE;
|
||||
PhraseTable phraseTableF;
|
||||
bool inverseFlag;
|
||||
int phrasePairBase = 0; // only used for "proper" conditioning
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
cerr << "PhraseScore v1.2.1, written by Philipp Koehn\n"
|
||||
cerr << "PhraseScore v1.4 written by Philipp Koehn\n"
|
||||
<< "phrase scoring methods for extracted phrases\n";
|
||||
time_t starttime = time(NULL);
|
||||
|
||||
@ -75,9 +76,6 @@ int main(int argc, char* argv[])
|
||||
inverseFlag = true;
|
||||
cerr << "using inverse mode\n";
|
||||
}
|
||||
// char[] fileNameExtract& = "/data/nlp/koehn/europarl-v2/models/de-en/model/new-extract.sorted";
|
||||
// string fileNameLex = "/data/nlp/koehn/europarl-v2/models/de-en/model/lex.f2n";
|
||||
// string fileNamePhraseTable = "/data/nlp/koehn/europarl-v2/models/de-en/model/new-phrase-table-half.f2n";
|
||||
|
||||
// lexical translation table
|
||||
lexTable.load( fileNameLex );
|
||||
@ -114,7 +112,7 @@ int main(int argc, char* argv[])
|
||||
if (extractFileP.eof())
|
||||
break;
|
||||
PhraseAlignment phrasePair;
|
||||
phrasePair.create( line, i );
|
||||
bool isPhrasePair = phrasePair.create( line, i );
|
||||
if (lastForeign >= 0 && lastForeign != phrasePair.foreign) {
|
||||
processPhrasePairs( phrasePairsWithSameF );
|
||||
for(int j=0;j<phrasePairsWithSameF.size();j++)
|
||||
@ -124,9 +122,13 @@ int main(int argc, char* argv[])
|
||||
phraseTableF.clear();
|
||||
phrasePair.clear(); // process line again, since phrase tables flushed
|
||||
phrasePair.create( line, i );
|
||||
phrasePairBase = 0;
|
||||
}
|
||||
lastForeign = phrasePair.foreign;
|
||||
phrasePairsWithSameF.push_back( phrasePair );
|
||||
if (isPhrasePair)
|
||||
phrasePairsWithSameF.push_back( phrasePair );
|
||||
else
|
||||
phrasePairBase++;
|
||||
}
|
||||
processPhrasePairs( phrasePairsWithSameF );
|
||||
phraseTableFile.close();
|
||||
@ -154,6 +156,7 @@ void outputAlignment(const AlignmentPhrase &alignmentPhrase)
|
||||
}
|
||||
|
||||
void processPhrasePairs( vector< PhraseAlignment > &phrasePair ) {
|
||||
if (phrasePair.size() == 0) return;
|
||||
map<int, int> countE;
|
||||
map<int, int> alignmentE;
|
||||
int totalCount = 0;
|
||||
@ -217,11 +220,9 @@ void processPhrasePairs( vector< PhraseAlignment > &phrasePair ) {
|
||||
if (! inverseFlag) {
|
||||
for(int j=0;j<phraseF.size();j++)
|
||||
{
|
||||
//cerr << vcbF.getWord( phraseF[j] ) << " ";
|
||||
phraseTableFile << vcbF.getWord( phraseF[j] );
|
||||
phraseTableFile << " ";
|
||||
}
|
||||
//cerr << endl;
|
||||
phraseTableFile << "||| ";
|
||||
}
|
||||
|
||||
@ -229,24 +230,18 @@ void processPhrasePairs( vector< PhraseAlignment > &phrasePair ) {
|
||||
PHRASE phraseE = phraseTableE.getPhrase( i->first );
|
||||
for(int j=0;j<phraseE.size();j++)
|
||||
{
|
||||
//if ( vcbE.getWord( phraseE[j] ) == "herr")
|
||||
// cerr << "";
|
||||
//cerr << vcbE.getWord( phraseE[j] ) << " ";
|
||||
phraseTableFile << vcbE.getWord( phraseE[j] );
|
||||
phraseTableFile << " ";
|
||||
}
|
||||
//cerr << endl;
|
||||
phraseTableFile << "||| ";
|
||||
|
||||
// foreign phrase (if inverse)
|
||||
if (inverseFlag) {
|
||||
for(int j=0;j<phraseF.size();j++)
|
||||
{
|
||||
//cerr << vcbF.getWord( phraseF[j] ) << " ";
|
||||
phraseTableFile << vcbF.getWord( phraseF[j] );
|
||||
phraseTableFile << " ";
|
||||
}
|
||||
//cerr << endl;
|
||||
phraseTableFile << "||| ";
|
||||
}
|
||||
|
||||
@ -287,8 +282,12 @@ void processPhrasePairs( vector< PhraseAlignment > &phrasePair ) {
|
||||
}
|
||||
|
||||
// phrase translation probability
|
||||
phraseTableFile << ((double) i->second / (double) phrasePair.size())
|
||||
<< " " << lexScore;
|
||||
if (phrasePairBase > 0) // "proper" conditioning
|
||||
phraseTableFile << ((double) i->second / (double) phrasePairBase);
|
||||
else
|
||||
phraseTableFile << ((double) i->second / (double) phrasePair.size());
|
||||
|
||||
phraseTableFile << " " << lexScore;
|
||||
|
||||
// model 1 score
|
||||
|
||||
@ -300,7 +299,7 @@ void processPhrasePairs( vector< PhraseAlignment > &phrasePair ) {
|
||||
}
|
||||
}
|
||||
|
||||
void PhraseAlignment::create( char line[], int lineID ) {
|
||||
bool PhraseAlignment::create( char line[], int lineID ) {
|
||||
vector< string > token = tokenize( line );
|
||||
int item = 1;
|
||||
PHRASE phraseF, phraseE;
|
||||
@ -332,6 +331,7 @@ void PhraseAlignment::create( char line[], int lineID ) {
|
||||
}
|
||||
}
|
||||
}
|
||||
return (item>2); // real phrase pair, not just foreign phrase
|
||||
}
|
||||
|
||||
void PhraseAlignment::clear() {
|
||||
|
Loading…
Reference in New Issue
Block a user