From b83fc72dd2773943f510d2e9edf858bddab26725 Mon Sep 17 00:00:00 2001 From: phkoehn Date: Tue, 8 Aug 2006 18:54:28 +0000 Subject: [PATCH] initial version of phrase-extract and phrase-score used by training script git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@567 1f5c12ca-751b-0410-a591-d2e778427230 --- scripts/training/phrase-extract/Makefile | 50 +++ scripts/training/phrase-extract/extract.cpp | 308 +++++++++++++++++ scripts/training/phrase-extract/score.cpp | 323 ++++++++++++++++++ .../training/phrase-extract/tables-core.cpp | 101 ++++++ scripts/training/phrase-extract/tables-core.h | 57 ++++ 5 files changed, 839 insertions(+) create mode 100644 scripts/training/phrase-extract/Makefile create mode 100644 scripts/training/phrase-extract/extract.cpp create mode 100644 scripts/training/phrase-extract/score.cpp create mode 100644 scripts/training/phrase-extract/tables-core.cpp create mode 100644 scripts/training/phrase-extract/tables-core.h diff --git a/scripts/training/phrase-extract/Makefile b/scripts/training/phrase-extract/Makefile new file mode 100644 index 000000000..d4b3ca13a --- /dev/null +++ b/scripts/training/phrase-extract/Makefile @@ -0,0 +1,50 @@ +DECODER_FLAG = -D_DECODER + +# ================================================================ +#LINUX VARS +CC = /usr/bin/gcc# -Wno-deprecated +CXX = /usr/bin/g++# -Wno-deprecated +CCFLAGS = $(BYTESWAP_FLAG) $(DECODER_FLAG) -O3 +BYTESWAP_FLAG = -DSLM_SWAP_BYTES +OSNAME = linux + +# ================================================================ +# DIRECTORIES TO BE USED +# general variables +TopDir := $(shell pwd) +BinDir = $(TopDir)/bin +SrcDir = $(TopDir) $(TopDir)/extract +INCLUDES = -I$(TopDir) -I$(TopDir)/extract +ObjDir = $(TopDir)/extractobj.$(OSNAME) + +# ========================================================= +# Variables for the EXTRACTOR + +ExtractObjFiles = extract.o + +ExtractObjFilesWithPath = $(addprefix $(ObjDir)/, $(ExtractObjFiles)) + +# 
========================================================= + +extract: OutputPaths $(ExtractObjFiles) + $(CXX) $(CCFLAGS) $(INCLUDES) -o $(BinDir)/phrase-extract $(ExtractObjFilesWithPath) + +OutputPaths: MakeObjDir MakeBinDir + +MakeObjDir: + @mkdir -p $(ObjDir) +MakeBinDir: + @mkdir -p $(BinDir) + +VPATH = $(ObjDir) $(SrcDir) + +%.o : %.cpp + @echo @$(CXX) -c $(CCFLAGS) $(INCLUDES) $< -o $(ObjDir)/$(@F) + @echo Compiling $(@F) + @$(CXX) -c $(CCFLAGS) $(INCLUDES) $< -o $(ObjDir)/$(@F) + @echo + +%.o : %.c + @echo Compiling $(@F) + @$(CC) -c $(CCFLAGS) $(INCLUDES) $< -o $(ObjDir)/$(@F) + @echo diff --git a/scripts/training/phrase-extract/extract.cpp b/scripts/training/phrase-extract/extract.cpp new file mode 100644 index 000000000..ac65d463f --- /dev/null +++ b/scripts/training/phrase-extract/extract.cpp @@ -0,0 +1,308 @@ +using namespace std; + +#include +#include +#include +#include +#include +#include +#include + +#define SAFE_GETLINE(_IS, _LINE, _SIZE, _DELIM) {_IS.getline(_LINE, _SIZE, _DELIM); if(_IS.fail() && !_IS.bad() && !_IS.eof()) _IS.clear();} +#define LINE_MAX_LENGTH 10000 + +class SentenceAlignment { + public: + vector english; + vector foreign; + vector alignedCountF; + vector< vector > alignedToE; + + int create( char[], char[], char[], int ); + // void clear() { delete(alignment); }; +}; + +void extract( SentenceAlignment & ); +void addPhrase( SentenceAlignment &, int, int, int, int ); +vector tokenize( char [] ); +bool isAligned ( SentenceAlignment &, int, int ); + +ofstream extractFile; +ofstream extractFileInv; +ofstream extractFileOrientation; +int maxPhraseLength; +int phraseCount = 0; +char* fileNameExtract; +bool orientationFlag; + +int main(int argc, char* argv[]) +{ + cerr << "PhraseExtract v1.3.0, written by Philipp Koehn\n" + << "phrase extraction from an aligned parallel corpus\n"; + time_t starttime = time(NULL); + + if (argc != 6 && argc != 7) { + cerr << "syntax: phrase-extract en de align extract max-length [orientation]\n"; + 
exit(0); + } + char* &fileNameE = argv[1]; + char* &fileNameF = argv[2]; + char* &fileNameA = argv[3]; + fileNameExtract = argv[4]; + maxPhraseLength = atoi(argv[5]); + orientationFlag = (argc == 7); + if (orientationFlag) cerr << "(also extracting orientation)\n"; + + // string fileNameE = "/data/nlp/koehn/europarl-v2/models/de-en/model/aligned.en"; + // string fileNameF = "/data/nlp/koehn/europarl-v2/models/de-en/model/aligned.de"; + // string fileNameA = "/data/nlp/koehn/europarl-v2/models/de-en/model/aligned.grow-diag-final"; + + ifstream eFile; + ifstream fFile; + ifstream aFile; + eFile.open(fileNameE); + fFile.open(fileNameF); + aFile.open(fileNameA); + istream *eFileP = &eFile; + istream *fFileP = &fFile; + istream *aFileP = &aFile; + + // string fileNameExtract = "/data/nlp/koehn/europarl-v2/models/de-en/model/new-extract"; + + int i=0; + while(true) { + i++; + if (i%10000 == 0) cerr << "." << flush; + char englishString[LINE_MAX_LENGTH]; + char foreignString[LINE_MAX_LENGTH]; + char alignmentString[LINE_MAX_LENGTH]; + SAFE_GETLINE((*eFileP), englishString, LINE_MAX_LENGTH, '\n'); + if (eFileP->eof()) break; + SAFE_GETLINE((*fFileP), foreignString, LINE_MAX_LENGTH, '\n'); + SAFE_GETLINE((*aFileP), alignmentString, LINE_MAX_LENGTH, '\n'); + SentenceAlignment sentence; + // cout << "read in: " << englishString << " & " << foreignString << " & " << alignmentString << endl; + if (sentence.create( englishString, foreignString, alignmentString, i )) + extract(sentence); + } + + eFile.close(); + fFile.close(); + aFile.close(); + extractFile.close(); + extractFileInv.close(); +} + +void extract( SentenceAlignment &sentence ) { + int countE = sentence.english.size(); + int countF = sentence.foreign.size(); + + // check alignments for english phrase startE...endE + for(int startE=0;startE usedF = sentence.alignedCountF; + for(int ei=startE;ei<=endE;ei++) { + for(int i=0;imaxF) { maxF = fi; } + usedF[ fi ]--; + } + } + + // cout << "f projected ( " << minF << "-" << 
maxF << ", " << startE << "," << endE << ")\n"; + + if (maxF >= 0 && // aligned to any foreign words at all + maxF-minF < maxPhraseLength) { // foreign phrase within limits + + // check if foreign words are aligned to out of bound english words + bool out_of_bounds = false; + for(int fi=minF;fi<=maxF && !out_of_bounds;fi++) + if (usedF[fi]>0) { + // cout << "ouf of bounds: " << fi << "\n"; + out_of_bounds = true; + } + + // cout << "doing if for ( " << minF << "-" << maxF << ", " << startE << "," << endE << ")\n"; + if (!out_of_bounds) + // start point of foreign phrase may retreat over unaligned + for(int startF=minF; + (startF>=0 && + startF>maxF-maxPhraseLength && // within length limit + (startF==minF || sentence.alignedCountF[startF]==0)); // unaligned + startF--) + // end point of foreign phrase may advance over unaligned + for(int endF=maxF; + (endF0) { + extractFile.close(); + extractFileInv.close(); + if (orientationFlag) extractFileOrientation.close(); + } + char part[10]; + sprintf(part,".part%04d",phraseCount/10000000); + string fileNameExtractPart = string(fileNameExtract) + part; + string fileNameExtractInvPart = string(fileNameExtract) + ".inv" + part; + string fileNameExtractOrientationPart = string(fileNameExtract) + ".o" + part; + extractFile.open(fileNameExtractPart.c_str()); + extractFileInv.open(fileNameExtractInvPart.c_str()); + if (orientationFlag) extractFileOrientation.open(fileNameExtractOrientationPart.c_str()); + } + phraseCount++; + + for(int fi=startF;fi<=endF;fi++) { + extractFile << sentence.foreign[fi] << " "; + if (orientationFlag) extractFileOrientation << sentence.foreign[fi] << " "; + } + extractFile << "||| "; + if (orientationFlag) extractFileOrientation << "||| "; + + // english + for(int ei=startE;ei<=endE;ei++) { + extractFile << sentence.english[ei] << " "; + extractFileInv << sentence.english[ei] << " "; + if (orientationFlag) extractFileOrientation << sentence.english[ei] << " "; + } + extractFile << "|||"; + 
// Split a NUL-terminated line into its whitespace-separated tokens.
//
// Only ' ' and '\t' act as separators. Runs of separators are collapsed, so
// an empty token is never produced, and leading/trailing separators are
// ignored.
//
// input:   NUL-terminated buffer; it is read but never modified.
// returns: the tokens in order of appearance; empty when the line is empty
//          or consists solely of separators.
std::vector<std::string> tokenize( char input[] ) {
  std::vector<std::string> token;
  bool betweenWords = true;
  int start = 0; // first character of the token being scanned; only read while !betweenWords
  int i = 0;
  for(; input[i] != '\0'; i++) {
    const bool isSpace = (input[i] == ' ' || input[i] == '\t');

    if (!isSpace && betweenWords) {
      // first character of a new token
      start = i;
      betweenWords = false;
    }
    else if (isSpace && !betweenWords) {
      // separator right after a token: the token is input[start, i)
      token.push_back( std::string( input+start, i-start ) );
      betweenWords = true;
    }
  }
  // a token that runs up to the terminator has not been flushed by the loop
  if (!betweenWords)
    token.push_back( std::string( input+start, i-start ) );
  return token;
}
+ + if (english.size() == 0 || foreign.size() == 0) { + cerr << "no english (" << english.size() << ") or foreign (" << foreign.size() << ") words << end insentence " << sentenceID << endl; + cerr << "E: " << englishString << endl << "F: " << foreignString << endl; + return 0; + } + // cout << "english.size = " << english.size() << endl; + // cout << "foreign.size = " << foreign.size() << endl; + + // cout << "xxx\n"; + for(int i=0; i dummy; + alignedToE.push_back( dummy ); + } + // cout << "\nscanning...\n"; + + vector alignmentSequence = tokenize( alignmentString ); + for(int i=0; i= english.size() || f >= foreign.size()) { + cerr << "WARNING: sentence " << sentenceID << " has alignment point (" << f << ", " << e << ") out of bounds (" << foreign.size() << ", " << english.size() << ")\n"; + cerr << "E: " << englishString << endl << "F: " << foreignString << endl; + return 0; + } + alignedToE[e].push_back( f ); + alignedCountF[f]++; + } + return 1; +} + diff --git a/scripts/training/phrase-extract/score.cpp b/scripts/training/phrase-extract/score.cpp new file mode 100644 index 000000000..267cb669e --- /dev/null +++ b/scripts/training/phrase-extract/score.cpp @@ -0,0 +1,323 @@ +using namespace std; + +#include +#include +#include +#include +#include +#include +#include + +#include "tables-core.h" + +#define SAFE_GETLINE(_IS, _LINE, _SIZE, _DELIM) {_IS.getline(_LINE, _SIZE, _DELIM); if(_IS.fail() && !_IS.bad() && !_IS.eof()) _IS.clear();} +#define LINE_MAX_LENGTH 10000 + +class PhraseAlignment { +public: + int english, foreign; + vector< vector > alignedToE; + vector< vector > alignedToF; + + void create( char*, int ); + void clear(); + bool equals( PhraseAlignment ); +}; + +class LexicalTable { +public: + map< WORD_ID, map< WORD_ID, double > > ltable; + void load( char[] ); +}; + +vector tokenize( char [] ); + +void processPhrasePairs( vector< PhraseAlignment > & ); + +ofstream phraseTableFile; + +Vocabulary vcbE; +Vocabulary vcbF; +LexicalTable lexTable; 
+PhraseTable phraseTableE; +PhraseTable phraseTableF; +bool inverseFlag; + +int main(int argc, char* argv[]) +{ + cerr << "PhraseScore v1.2.1, written by Philipp Koehn\n" + << "phrase scoring methods for extracted phrases\n"; + time_t starttime = time(NULL); + + if (argc != 4 && argc != 5) { + cerr << "syntax: phrase-score extract lex phrase-table [inverse]\n"; + exit(0); + } + char* &fileNameExtract = argv[1]; + char* &fileNameLex = argv[2]; + char* &fileNamePhraseTable = argv[3]; + inverseFlag = false; + if (argc > 4) { + inverseFlag = true; + cerr << "using inverse mode\n"; + } + // char[] fileNameExtract& = "/data/nlp/koehn/europarl-v2/models/de-en/model/new-extract.sorted"; + // string fileNameLex = "/data/nlp/koehn/europarl-v2/models/de-en/model/lex.f2n"; + // string fileNamePhraseTable = "/data/nlp/koehn/europarl-v2/models/de-en/model/new-phrase-table-half.f2n"; + + // lexical translation table + lexTable.load( fileNameLex ); + + // sorted phrase extraction file + ifstream extractFile; + + extractFile.open(fileNameExtract); + if (extractFile.fail()) { + cerr << "ERROR: could not open extract file " << fileNameExtract << endl; + exit(0); + } + istream *extractFileP = &extractFile; + + // output file: phrase translation table + phraseTableFile.open(fileNamePhraseTable); + if (phraseTableFile.fail()) { + cerr << "ERROR: could not open file phrase table file " + << fileNamePhraseTable << endl; + exit(0); + } + + // loop through all extracted phrase translations + int lastForeign = -1; + vector< PhraseAlignment > phrasePairsWithSameF; + int i=0; + int fileCount = 0; + while(true) { + if (extractFileP->eof()) break; + if (++i % 100000 == 0) cerr << "." 
<< flush; + char line[LINE_MAX_LENGTH]; + SAFE_GETLINE((*extractFileP), line, LINE_MAX_LENGTH, '\n'); + // if (fileCount>0) + if (extractFileP->eof()) break; + PhraseAlignment phrasePair; + phrasePair.create( line, i ); + if (lastForeign >= 0 && lastForeign != phrasePair.foreign) { + processPhrasePairs( phrasePairsWithSameF ); + for(int j=0;j &phrasePair ) { + map countE; + map alignmentE; + int totalCount = 0; + int currentCount = 0; + int maxSameCount = 0; + int maxSame = -1; + int old = -1; + for(int i=0;i0) { + if (phrasePair[old].english == phrasePair[i].english) { + if (! phrasePair[i].equals( phrasePair[old] )) { + if (currentCount > maxSameCount) { + maxSameCount = currentCount; + maxSame = i-1; + } + currentCount = 0; + } + } + else { + // wrap up old E + if (currentCount > maxSameCount) { + maxSameCount = currentCount; + maxSame = i-1; + } + + alignmentE[ phrasePair[old].english ] = maxSame; + // if (maxSameCount != totalCount) + // cout << "max count is " << maxSameCount << "/" << totalCount << endl; + + // get ready for new E + totalCount = 0; + currentCount = 0; + maxSameCount = 0; + maxSame = -1; + } + } + countE[ phrasePair[i].english ]++; + old = i; + currentCount++; + totalCount++; + } + + // wrap up old E + if (currentCount > maxSameCount) { + maxSameCount = currentCount; + maxSame = phrasePair.size()-1; + } + alignmentE[ phrasePair[old].english ] = maxSame; + // if (maxSameCount != totalCount) + // cout << "max count is " << maxSameCount << "/" << totalCount << endl; + + // output table + typedef map< int, int >::iterator II; + PHRASE phraseF = phraseTableF.getPhrase( phrasePair[0].foreign ); + for(II i = countE.begin(); i != countE.end(); i++) { + // cout << "\tp( " << i->first << " | " << phrasePair[0].foreign << " ; " << phraseF.size() << " ) = ...\n"; + + // foreign phrase (unless inverse) + if (! 
// Split a NUL-terminated line into its whitespace-separated tokens.
//
// Only ' ' and '\t' act as separators. Runs of separators are collapsed, so
// an empty token is never produced, and leading/trailing separators are
// ignored.
//
// input:   NUL-terminated buffer; it is read but never modified.
// returns: the tokens in order of appearance; empty when the line is empty
//          or consists solely of separators.
std::vector<std::string> tokenize( char input[] ) {
  std::vector<std::string> token;
  bool betweenWords = true;
  int start = 0; // first character of the token being scanned; only read while !betweenWords
  int i = 0;
  for(; input[i] != '\0'; i++) {
    const bool isSpace = (input[i] == ' ' || input[i] == '\t');

    if (!isSpace && betweenWords) {
      // first character of a new token
      start = i;
      betweenWords = false;
    }
    else if (isSpace && !betweenWords) {
      // separator right after a token: the token is input[start, i)
      token.push_back( std::string( input+start, i-start ) );
      betweenWords = true;
    }
  }
  // A token that runs up to the terminator ends at i (exclusive), exactly
  // like one ended by a separator. The length must therefore be i-start;
  // i-start+1 would copy the terminating '\0' into the token and make the
  // last word of a line differ from the same word anywhere else.
  if (!betweenWords)
    token.push_back( std::string( input+start, i-start ) );
  return token;
}
#ifndef _TABLES_H
#define _TABLES_H
// NOTE(review): identifiers that start with an underscore followed by an
// upper-case letter are reserved for the implementation; the guard name is
// kept unchanged here in case other files test it.

// Headers needed by this file and by tables-core.cpp, which includes nothing
// else: containers and strings for the declarations below, stream classes for
// file loading and diagnostics, and the C math/stdlib functions (log, abs,
// atoi, atof).
#include <cmath>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <map>
#include <string>
#include <vector>

// Kept because the translation units including this header use unqualified
// standard names; it has to follow the includes so that namespace std is
// already declared.
using namespace std;

// Splits a NUL-terminated line on spaces and tabs.
vector<string> tokenize( char[] );

typedef string WORD;
typedef unsigned int WORD_ID;

// Bidirectional mapping between words and dense integer ids.
class Vocabulary {
 public:
  map<WORD, WORD_ID> lookup;  // word -> id
  vector< WORD > vocab;       // id -> word
  // Returns the id of the word, assigning the next free id if it is new.
  WORD_ID storeIfNew( WORD );
  // Returns the id of the word; 0 is returned for an unknown word, which is
  // indistinguishable from the first stored word.
  WORD_ID getWordID( WORD );
  // No bounds check: id must have been returned by storeIfNew.
  inline WORD &getWord( WORD_ID id ) { return vocab[ id ]; }
};

typedef vector< WORD_ID > PHRASE;
typedef unsigned int PHRASE_ID;

// Bidirectional mapping between phrases (word id sequences) and dense ids.
class PhraseTable {
 public:
  map< PHRASE, PHRASE_ID > lookup;  // phrase -> id
  vector< PHRASE > phraseTable;     // id -> phrase
  // Returns the id of the phrase, assigning the next free id if it is new.
  PHRASE_ID storeIfNew( PHRASE );
  // Returns the id of the phrase; 0 is returned for an unknown phrase.
  PHRASE_ID getPhraseID( PHRASE );
  // Forgets all phrases and ids.
  void clear();
  // No bounds check: id must have been returned by storeIfNew.
  inline PHRASE &getPhrase( const PHRASE_ID id ) { return phraseTable[ id ]; }
};

typedef vector< pair< PHRASE_ID, double > > PHRASEPROBVEC;

// Plain containers keyed by phrase id; each entry pairs a phrase id with one
// score (ttable) or with a vector of scores (ttableMulti).
class TTable {
 public:
  map< PHRASE_ID, vector< pair< PHRASE_ID, double > > > ttable;
  map< PHRASE_ID, vector< pair< PHRASE_ID, vector< double > > > > ttableMulti;
};

// Table of scores indexed by a signed integer distortion.
class DTable {
 public:
  map< int, double > dtable;
  // Fills entries -10..9 with the negated absolute distortion.
  void init();
  // Reads "distortion probability" lines from the named file and stores the
  // logarithm of each probability.
  void load( string );
  // Returns the stored score, or log(0.00001) for a distortion not in the table.
  double get( int );
};

#endif