mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-09-20 23:58:15 +03:00
initial version of phrase-extract and phrase-score used by training script
git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@567 1f5c12ca-751b-0410-a591-d2e778427230
This commit is contained in:
parent
7f8914c6d5
commit
b83fc72dd2
50
scripts/training/phrase-extract/Makefile
Normal file
50
scripts/training/phrase-extract/Makefile
Normal file
@ -0,0 +1,50 @@
|
||||
# Makefile for the phrase-extract tool.
# Running `make extract` compiles extract.cpp into $(ObjDir) and links
# $(BinDir)/phrase-extract.

DECODER_FLAG = -D_DECODER

# ================================================================
#LINUX VARS
CC = /usr/bin/gcc# -Wno-deprecated
CXX = /usr/bin/g++# -Wno-deprecated
# CCFLAGS is recursively expanded, so BYTESWAP_FLAG may be defined after it.
CCFLAGS = $(BYTESWAP_FLAG) $(DECODER_FLAG) -O3
BYTESWAP_FLAG = -DSLM_SWAP_BYTES
OSNAME = linux

# ================================================================
# DIRECTORIES TO BE USED
# general variables
TopDir := $(shell pwd)
BinDir = $(TopDir)/bin
SrcDir = $(TopDir) $(TopDir)/extract
INCLUDES = -I$(TopDir) -I$(TopDir)/extract
ObjDir = $(TopDir)/extractobj.$(OSNAME)

# =========================================================
# Variables for the EXTRACTOR

ExtractObjFiles = extract.o

ExtractObjFilesWithPath = $(addprefix $(ObjDir)/, $(ExtractObjFiles))

# =========================================================

# None of these targets produce a file of the same name. Declaring them
# phony also keeps `extract` from being confused with the extract/ source
# directory listed in SrcDir.
.PHONY: extract OutputPaths MakeObjDir MakeBinDir

extract: OutputPaths $(ExtractObjFiles)
	$(CXX) $(CCFLAGS) $(INCLUDES) -o $(BinDir)/phrase-extract $(ExtractObjFilesWithPath)

OutputPaths: MakeObjDir MakeBinDir

MakeObjDir:
	@mkdir -p $(ObjDir)
MakeBinDir:
	@mkdir -p $(BinDir)

# Object files are written to $(ObjDir); VPATH lets make find them there
# and find the sources in $(SrcDir).
VPATH = $(ObjDir) $(SrcDir)

%.o : %.cpp
	@echo @$(CXX) -c $(CCFLAGS) $(INCLUDES) $< -o $(ObjDir)/$(@F)
	@echo Compiling $(@F)
	@$(CXX) -c $(CCFLAGS) $(INCLUDES) $< -o $(ObjDir)/$(@F)
	@echo

%.o : %.c
	@echo Compiling $(@F)
	@$(CC) -c $(CCFLAGS) $(INCLUDES) $< -o $(ObjDir)/$(@F)
	@echo
|
308
scripts/training/phrase-extract/extract.cpp
Normal file
308
scripts/training/phrase-extract/extract.cpp
Normal file
@ -0,0 +1,308 @@
|
||||
using namespace std;
|
||||
|
||||
#include <cstdio>
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <stdlib.h>
|
||||
#include <assert.h>
|
||||
|
||||
// Reads one _DELIM-terminated line of at most _SIZE-1 characters from stream
// _IS into the buffer _LINE. If the read set failbit without badbit or eofbit
// (for example because the line did not fit into the buffer), the error state
// is cleared so that the next call continues with the rest of the input.
// Wrapped in do/while(0) so the macro behaves like a single statement, and all
// arguments are parenthesised so that arbitrary expressions can be passed.
#define SAFE_GETLINE(_IS, _LINE, _SIZE, _DELIM) \
  do { \
    (_IS).getline((_LINE), (_SIZE), (_DELIM)); \
    if ((_IS).fail() && !(_IS).bad() && !(_IS).eof()) (_IS).clear(); \
  } while (0)

// Size of the line buffers, i.e. the longest line (plus terminator) read in one piece.
#define LINE_MAX_LENGTH 10000
|
||||
|
||||
// One sentence pair together with its word alignment.
class SentenceAlignment {
public:
  // Tokens of the English and of the foreign sentence.
  vector<string> english;
  vector<string> foreign;

  // alignedCountF[f]: number of alignment points that involve foreign word f.
  vector<int> alignedCountF;

  // alignedToE[e]: positions of the foreign words aligned to English word e.
  vector< vector<int> > alignedToE;

  // Fills all members from the three raw input lines; sentenceID is only used
  // in diagnostics. Returns 1 on success and 0 if the sentence pair is unusable.
  int create( char englishString[], char foreignString[], char alignmentString[], int sentenceID );
};
|
||||
|
||||
// Forward declarations of the functions defined further down in this file.
void extract( SentenceAlignment & );
|
||||
void addPhrase( SentenceAlignment &, int, int, int, int );
|
||||
vector<string> tokenize( char [] );
|
||||
bool isAligned ( SentenceAlignment &, int, int );
|
||||
|
||||
// Output streams for the current part files; (re)opened inside addPhrase().
ofstream extractFile;
|
||||
ofstream extractFileInv;
|
||||
ofstream extractFileOrientation;
|
||||
// Phrase length limit, set from the command line in main().
int maxPhraseLength;
|
||||
// Number of phrase pairs written so far; addPhrase() uses it to pick the part file.
int phraseCount = 0;
|
||||
// Base name of the output files, set from the command line in main().
char* fileNameExtract;
|
||||
// True when the optional orientation argument was given on the command line.
bool orientationFlag;
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
cerr << "PhraseExtract v1.3.0, written by Philipp Koehn\n"
|
||||
<< "phrase extraction from an aligned parallel corpus\n";
|
||||
time_t starttime = time(NULL);
|
||||
|
||||
if (argc != 6 && argc != 7) {
|
||||
cerr << "syntax: phrase-extract en de align extract max-length [orientation]\n";
|
||||
exit(0);
|
||||
}
|
||||
char* &fileNameE = argv[1];
|
||||
char* &fileNameF = argv[2];
|
||||
char* &fileNameA = argv[3];
|
||||
fileNameExtract = argv[4];
|
||||
maxPhraseLength = atoi(argv[5]);
|
||||
orientationFlag = (argc == 7);
|
||||
if (orientationFlag) cerr << "(also extracting orientation)\n";
|
||||
|
||||
// string fileNameE = "/data/nlp/koehn/europarl-v2/models/de-en/model/aligned.en";
|
||||
// string fileNameF = "/data/nlp/koehn/europarl-v2/models/de-en/model/aligned.de";
|
||||
// string fileNameA = "/data/nlp/koehn/europarl-v2/models/de-en/model/aligned.grow-diag-final";
|
||||
|
||||
ifstream eFile;
|
||||
ifstream fFile;
|
||||
ifstream aFile;
|
||||
eFile.open(fileNameE);
|
||||
fFile.open(fileNameF);
|
||||
aFile.open(fileNameA);
|
||||
istream *eFileP = &eFile;
|
||||
istream *fFileP = &fFile;
|
||||
istream *aFileP = &aFile;
|
||||
|
||||
// string fileNameExtract = "/data/nlp/koehn/europarl-v2/models/de-en/model/new-extract";
|
||||
|
||||
int i=0;
|
||||
while(true) {
|
||||
i++;
|
||||
if (i%10000 == 0) cerr << "." << flush;
|
||||
char englishString[LINE_MAX_LENGTH];
|
||||
char foreignString[LINE_MAX_LENGTH];
|
||||
char alignmentString[LINE_MAX_LENGTH];
|
||||
SAFE_GETLINE((*eFileP), englishString, LINE_MAX_LENGTH, '\n');
|
||||
if (eFileP->eof()) break;
|
||||
SAFE_GETLINE((*fFileP), foreignString, LINE_MAX_LENGTH, '\n');
|
||||
SAFE_GETLINE((*aFileP), alignmentString, LINE_MAX_LENGTH, '\n');
|
||||
SentenceAlignment sentence;
|
||||
// cout << "read in: " << englishString << " & " << foreignString << " & " << alignmentString << endl;
|
||||
if (sentence.create( englishString, foreignString, alignmentString, i ))
|
||||
extract(sentence);
|
||||
}
|
||||
|
||||
eFile.close();
|
||||
fFile.close();
|
||||
aFile.close();
|
||||
extractFile.close();
|
||||
extractFileInv.close();
|
||||
}
|
||||
|
||||
// Enumerates the phrase pairs of one sentence pair and hands each of them to
// addPhrase(). For every English span of at most maxPhraseLength words, the
// aligned foreign span is computed. The pair is kept only if no foreign word
// inside that span is also aligned to an English word outside the English
// span. The foreign span is then additionally extended over neighbouring
// unaligned words, within the length limit.
void extract( SentenceAlignment &sentence ) {
  const int numE = sentence.english.size();
  const int numF = sentence.foreign.size();

  for (int startE = 0; startE < numE; ++startE) {
    for (int endE = startE;
         endE < numE && endE < startE+maxPhraseLength;
         ++endE) {

      // Project the English span onto the foreign side. After the loop,
      // linksOutside[f] is the number of alignment points of foreign word f
      // that lie outside the English span.
      int lowF = 9999;
      int highF = -1;
      vector<int> linksOutside = sentence.alignedCountF;
      for (int ei = startE; ei <= endE; ++ei) {
        const vector<int> &aligned = sentence.alignedToE[ei];
        for (size_t k = 0; k < aligned.size(); ++k) {
          const int fi = aligned[k];
          if (fi < lowF) lowF = fi;
          if (fi > highF) highF = fi;
          --linksOutside[fi];
        }
      }

      // The span must be aligned to at least one foreign word ...
      if (highF < 0) continue;
      // ... and the foreign span must respect the length limit.
      if (!(highF-lowF < maxPhraseLength)) continue;

      // Reject the span if a foreign word in it is aligned outside the English span.
      bool consistent = true;
      for (int fi = lowF; fi <= highF && consistent; ++fi)
        if (linksOutside[fi] > 0)
          consistent = false;
      if (!consistent) continue;

      // The start of the foreign phrase may retreat over unaligned words.
      for (int startF = lowF;
           (startF >= 0 &&
            startF > highF-maxPhraseLength &&
            (startF == lowF || sentence.alignedCountF[startF] == 0));
           --startF) {
        // The end of the foreign phrase may advance over unaligned words.
        for (int endF = highF;
             (endF < numF &&
              endF < startF+maxPhraseLength &&
              (endF == highF || sentence.alignedCountF[endF] == 0));
             ++endF) {
          addPhrase(sentence, startE, endE, startF, endF);
        }
      }
    }
  }
}
|
||||
|
||||
// Writes one phrase pair (English span startE..endE, foreign span
// startF..endF, all bounds inclusive) to the output files:
//   extract file:      "f ... ||| e ... ||| f-e f-e ..."
//   inverse file:      "e ... ||| f ... ||| e-f e-f ..."
//   orientation file:  "f ... ||| e ... ||| <previous> <following>"
// where the alignment points are relative to the start of the phrases and
// each orientation is one of mono, swap or other. The orientation file is only
// written when orientationFlag is set.
// A new set of part files (suffix .partNNNN) is started every 10 million
// phrase pairs. Exits with status 1 if a part file cannot be opened.
void addPhrase( SentenceAlignment &sentence, int startE, int endE, int startF, int endF ) {
  if (phraseCount % 10000000 == 0) {
    if (phraseCount>0) {
      extractFile.close();
      extractFileInv.close();
      if (orientationFlag) extractFileOrientation.close();
    }
    // snprintf with a roomy buffer: the suffix can never overrun it.
    char part[32];
    snprintf(part, sizeof(part), ".part%04d", phraseCount/10000000);
    const string baseName(fileNameExtract);
    const string fileNameExtractPart = baseName + part;
    const string fileNameExtractInvPart = baseName + ".inv" + part;
    const string fileNameExtractOrientationPart = baseName + ".o" + part;
    extractFile.open(fileNameExtractPart.c_str());
    extractFileInv.open(fileNameExtractInvPart.c_str());
    if (orientationFlag) extractFileOrientation.open(fileNameExtractOrientationPart.c_str());

    // Without this check every phrase would be silently discarded.
    if (!extractFile.is_open() || !extractFileInv.is_open() ||
        (orientationFlag && !extractFileOrientation.is_open())) {
      cerr << "ERROR: could not open output files for " << fileNameExtractPart << endl;
      exit(1);
    }
  }
  phraseCount++;

  // foreign
  for(int fi=startF;fi<=endF;fi++) {
    extractFile << sentence.foreign[fi] << " ";
    if (orientationFlag) extractFileOrientation << sentence.foreign[fi] << " ";
  }
  extractFile << "||| ";
  if (orientationFlag) extractFileOrientation << "||| ";

  // english
  for(int ei=startE;ei<=endE;ei++) {
    extractFile << sentence.english[ei] << " ";
    extractFileInv << sentence.english[ei] << " ";
    if (orientationFlag) extractFileOrientation << sentence.english[ei] << " ";
  }
  extractFile << "|||";
  extractFileInv << "||| ";
  if (orientationFlag) extractFileOrientation << "||| ";

  // foreign (for inverse)
  for(int fi=startF;fi<=endF;fi++)
    extractFileInv << sentence.foreign[fi] << " ";
  extractFileInv << "|||";

  // alignment points, relative to the start of the phrases
  for(int ei=startE;ei<=endE;ei++) {
    const vector<int> &aligned = sentence.alignedToE[ei];
    for(size_t i=0;i<aligned.size();i++) {
      const int fi = aligned[i];
      extractFile << " " << fi-startF << "-" << ei-startE;
      extractFileInv << " " << ei-startE << "-" << fi-startF;
    }
  }

  if (orientationFlag) {
    // orientation to previous E
    const bool connectedLeftTop  = isAligned( sentence, startF-1, startE-1 );
    const bool connectedRightTop = isAligned( sentence, endF+1,   startE-1 );
    if ( connectedLeftTop && !connectedRightTop)
      extractFileOrientation << "mono";
    else if (!connectedLeftTop && connectedRightTop)
      extractFileOrientation << "swap";
    else
      extractFileOrientation << "other";

    // orientation to following E
    const bool connectedLeftBottom  = isAligned( sentence, startF-1, endE+1 );
    const bool connectedRightBottom = isAligned( sentence, endF+1,   endE+1 );
    if ( connectedLeftBottom && !connectedRightBottom)
      extractFileOrientation << " swap";
    else if (!connectedLeftBottom && connectedRightBottom)
      extractFileOrientation << " mono";
    else
      extractFileOrientation << " other";
  }

  extractFile << "\n";
  extractFileInv << "\n";
  if (orientationFlag) extractFileOrientation << "\n";
}
|
||||
|
||||
// Tells whether foreign position fi and English position ei are connected by
// an alignment point. The positions just before the sentence start (-1, -1)
// and just after the sentence end (foreign size, English size) count as
// aligned to each other; any other position outside the sentence is unaligned.
bool isAligned ( SentenceAlignment &sentence, int fi, int ei ) {
  // Before the start: only the pair (-1, -1) is connected.
  if (ei <= -1 || fi <= -1)
    return ei == -1 && fi == -1;

  // After the end: only the pair (sizeF, sizeE) is connected.
  if (ei >= sentence.english.size() || fi >= sentence.foreign.size())
    return ei == sentence.english.size() && fi == sentence.foreign.size();

  const vector<int> &aligned = sentence.alignedToE[ei];
  for (vector<int>::const_iterator it = aligned.begin(); it != aligned.end(); ++it) {
    if (*it == fi) return true;
  }
  return false;
}
|
||||
|
||||
// as in beamdecoder/tables.cpp
// Splits a NUL-terminated string into its tokens. Tokens are separated by
// runs of spaces and tabs; leading and trailing separators are ignored, so no
// empty token is ever returned.
vector<string> tokenize( char input[] ) {
  vector< string > words;
  int pos = 0;
  while (true) {
    // skip the separators in front of the next token
    while (input[pos] == ' ' || input[pos] == '\t') pos++;
    if (input[pos] == '\0') break;

    // consume the token itself
    const int begin = pos;
    while (input[pos] != '\0' && input[pos] != ' ' && input[pos] != '\t') pos++;
    words.push_back( string( input+begin, pos-begin ) );
  }
  return words;
}
|
||||
|
||||
int SentenceAlignment::create( char englishString[], char foreignString[], char alignmentString[], int sentenceID ) {
|
||||
english = tokenize( englishString );
|
||||
foreign = tokenize( foreignString );
|
||||
// alignment = new bool[foreign.size()*english.size()];
|
||||
// alignment = (bool**) calloc(english.size()*foreign.size(),sizeof(bool)); // is this right?
|
||||
|
||||
if (english.size() == 0 || foreign.size() == 0) {
|
||||
cerr << "no english (" << english.size() << ") or foreign (" << foreign.size() << ") words << end insentence " << sentenceID << endl;
|
||||
cerr << "E: " << englishString << endl << "F: " << foreignString << endl;
|
||||
return 0;
|
||||
}
|
||||
// cout << "english.size = " << english.size() << endl;
|
||||
// cout << "foreign.size = " << foreign.size() << endl;
|
||||
|
||||
// cout << "xxx\n";
|
||||
for(int i=0; i<foreign.size(); i++) {
|
||||
// cout << "i" << i << endl;
|
||||
alignedCountF.push_back( 0 );
|
||||
}
|
||||
for(int i=0; i<english.size(); i++) {
|
||||
vector< int > dummy;
|
||||
alignedToE.push_back( dummy );
|
||||
}
|
||||
// cout << "\nscanning...\n";
|
||||
|
||||
vector<string> alignmentSequence = tokenize( alignmentString );
|
||||
for(int i=0; i<alignmentSequence.size(); i++) {
|
||||
int e,f;
|
||||
// cout << "scaning " << alignmentSequence[i].c_str() << endl;
|
||||
if (! sscanf(alignmentSequence[i].c_str(), "%d-%d", &f, &e)) {
|
||||
cerr << "WARNING: " << alignmentSequence[i] << " is a bad alignment point in sentnce " << sentenceID << endl;
|
||||
cerr << "E: " << englishString << endl << "F: " << foreignString << endl;
|
||||
return 0;
|
||||
}
|
||||
// cout << "alignmentSequence[i] " << alignmentSequence[i] << " is " << f << ", " << e << endl;
|
||||
if (e >= english.size() || f >= foreign.size()) {
|
||||
cerr << "WARNING: sentence " << sentenceID << " has alignment point (" << f << ", " << e << ") out of bounds (" << foreign.size() << ", " << english.size() << ")\n";
|
||||
cerr << "E: " << englishString << endl << "F: " << foreignString << endl;
|
||||
return 0;
|
||||
}
|
||||
alignedToE[e].push_back( f );
|
||||
alignedCountF[f]++;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
323
scripts/training/phrase-extract/score.cpp
Normal file
323
scripts/training/phrase-extract/score.cpp
Normal file
@ -0,0 +1,323 @@
|
||||
using namespace std;
|
||||
|
||||
#include <cstdio>
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <stdlib.h>
|
||||
#include <assert.h>
|
||||
|
||||
#include "tables-core.h"
|
||||
|
||||
// Reads one _DELIM-terminated line of at most _SIZE-1 characters from stream
// _IS into the buffer _LINE. If the read set failbit without badbit or eofbit
// (for example because the line did not fit into the buffer), the error state
// is cleared so that the next call continues with the rest of the input.
// Wrapped in do/while(0) so the macro behaves like a single statement, and all
// arguments are parenthesised so that arbitrary expressions can be passed.
#define SAFE_GETLINE(_IS, _LINE, _SIZE, _DELIM) \
  do { \
    (_IS).getline((_LINE), (_SIZE), (_DELIM)); \
    if ((_IS).fail() && !(_IS).bad() && !(_IS).eof()) (_IS).clear(); \
  } while (0)

// Size of the line buffers, i.e. the longest line (plus terminator) read in one piece.
#define LINE_MAX_LENGTH 10000
|
||||
|
||||
// One extracted phrase pair with its phrase-internal word alignment.
class PhraseAlignment {
public:
  // IDs of the two phrases as handed out by phraseTableE / phraseTableF.
  int english;
  int foreign;

  // alignedToE[e]: foreign positions aligned to English word e.
  vector< vector<int> > alignedToE;
  // alignedToF[f]: English positions aligned to foreign word f.
  vector< vector<int> > alignedToF;

  // Parses one line of the extract file; lineID is only used in diagnostics.
  void create( char line[], int lineID );
  // Drops all alignment information.
  void clear();
  // True if `other` has the same phrase IDs and the same alignment.
  bool equals( PhraseAlignment other );
};
|
||||
|
||||
class LexicalTable {
|
||||
public:
|
||||
map< WORD_ID, map< WORD_ID, double > > ltable;
|
||||
void load( char[] );
|
||||
};
|
||||
|
||||
// Declarations of functions used before (or outside of) their definition.
vector<string> tokenize( char [] );
|
||||
|
||||
void processPhrasePairs( vector< PhraseAlignment > & );
|
||||
|
||||
// Output stream of the phrase translation table; opened in main().
ofstream phraseTableFile;
|
||||
|
||||
// Vocabularies, lexical table and phrase tables shared by all functions in this file.
Vocabulary vcbE;
|
||||
Vocabulary vcbF;
|
||||
LexicalTable lexTable;
|
||||
PhraseTable phraseTableE;
|
||||
PhraseTable phraseTableF;
|
||||
// True when the optional fourth command line argument was given (see main()).
bool inverseFlag;
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
cerr << "PhraseScore v1.2.1, written by Philipp Koehn\n"
|
||||
<< "phrase scoring methods for extracted phrases\n";
|
||||
time_t starttime = time(NULL);
|
||||
|
||||
if (argc != 4 && argc != 5) {
|
||||
cerr << "syntax: phrase-score extract lex phrase-table [inverse]\n";
|
||||
exit(0);
|
||||
}
|
||||
char* &fileNameExtract = argv[1];
|
||||
char* &fileNameLex = argv[2];
|
||||
char* &fileNamePhraseTable = argv[3];
|
||||
inverseFlag = false;
|
||||
if (argc > 4) {
|
||||
inverseFlag = true;
|
||||
cerr << "using inverse mode\n";
|
||||
}
|
||||
// char[] fileNameExtract& = "/data/nlp/koehn/europarl-v2/models/de-en/model/new-extract.sorted";
|
||||
// string fileNameLex = "/data/nlp/koehn/europarl-v2/models/de-en/model/lex.f2n";
|
||||
// string fileNamePhraseTable = "/data/nlp/koehn/europarl-v2/models/de-en/model/new-phrase-table-half.f2n";
|
||||
|
||||
// lexical translation table
|
||||
lexTable.load( fileNameLex );
|
||||
|
||||
// sorted phrase extraction file
|
||||
ifstream extractFile;
|
||||
|
||||
extractFile.open(fileNameExtract);
|
||||
if (extractFile.fail()) {
|
||||
cerr << "ERROR: could not open extract file " << fileNameExtract << endl;
|
||||
exit(0);
|
||||
}
|
||||
istream *extractFileP = &extractFile;
|
||||
|
||||
// output file: phrase translation table
|
||||
phraseTableFile.open(fileNamePhraseTable);
|
||||
if (phraseTableFile.fail()) {
|
||||
cerr << "ERROR: could not open file phrase table file "
|
||||
<< fileNamePhraseTable << endl;
|
||||
exit(0);
|
||||
}
|
||||
|
||||
// loop through all extracted phrase translations
|
||||
int lastForeign = -1;
|
||||
vector< PhraseAlignment > phrasePairsWithSameF;
|
||||
int i=0;
|
||||
int fileCount = 0;
|
||||
while(true) {
|
||||
if (extractFileP->eof()) break;
|
||||
if (++i % 100000 == 0) cerr << "." << flush;
|
||||
char line[LINE_MAX_LENGTH];
|
||||
SAFE_GETLINE((*extractFileP), line, LINE_MAX_LENGTH, '\n');
|
||||
// if (fileCount>0)
|
||||
if (extractFileP->eof()) break;
|
||||
PhraseAlignment phrasePair;
|
||||
phrasePair.create( line, i );
|
||||
if (lastForeign >= 0 && lastForeign != phrasePair.foreign) {
|
||||
processPhrasePairs( phrasePairsWithSameF );
|
||||
for(int j=0;j<phrasePairsWithSameF.size();j++)
|
||||
phrasePairsWithSameF[j].clear();
|
||||
phrasePairsWithSameF.clear();
|
||||
phraseTableE.clear();
|
||||
phraseTableF.clear();
|
||||
phrasePair.clear(); // process line again, since phrase tables flushed
|
||||
phrasePair.create( line, i );
|
||||
}
|
||||
lastForeign = phrasePair.foreign;
|
||||
phrasePairsWithSameF.push_back( phrasePair );
|
||||
}
|
||||
processPhrasePairs( phrasePairsWithSameF );
|
||||
phraseTableFile.close();
|
||||
}
|
||||
|
||||
void processPhrasePairs( vector< PhraseAlignment > &phrasePair ) {
|
||||
map<int, int> countE;
|
||||
map<int, int> alignmentE;
|
||||
int totalCount = 0;
|
||||
int currentCount = 0;
|
||||
int maxSameCount = 0;
|
||||
int maxSame = -1;
|
||||
int old = -1;
|
||||
for(int i=0;i<phrasePair.size();i++) {
|
||||
if (i>0) {
|
||||
if (phrasePair[old].english == phrasePair[i].english) {
|
||||
if (! phrasePair[i].equals( phrasePair[old] )) {
|
||||
if (currentCount > maxSameCount) {
|
||||
maxSameCount = currentCount;
|
||||
maxSame = i-1;
|
||||
}
|
||||
currentCount = 0;
|
||||
}
|
||||
}
|
||||
else {
|
||||
// wrap up old E
|
||||
if (currentCount > maxSameCount) {
|
||||
maxSameCount = currentCount;
|
||||
maxSame = i-1;
|
||||
}
|
||||
|
||||
alignmentE[ phrasePair[old].english ] = maxSame;
|
||||
// if (maxSameCount != totalCount)
|
||||
// cout << "max count is " << maxSameCount << "/" << totalCount << endl;
|
||||
|
||||
// get ready for new E
|
||||
totalCount = 0;
|
||||
currentCount = 0;
|
||||
maxSameCount = 0;
|
||||
maxSame = -1;
|
||||
}
|
||||
}
|
||||
countE[ phrasePair[i].english ]++;
|
||||
old = i;
|
||||
currentCount++;
|
||||
totalCount++;
|
||||
}
|
||||
|
||||
// wrap up old E
|
||||
if (currentCount > maxSameCount) {
|
||||
maxSameCount = currentCount;
|
||||
maxSame = phrasePair.size()-1;
|
||||
}
|
||||
alignmentE[ phrasePair[old].english ] = maxSame;
|
||||
// if (maxSameCount != totalCount)
|
||||
// cout << "max count is " << maxSameCount << "/" << totalCount << endl;
|
||||
|
||||
// output table
|
||||
typedef map< int, int >::iterator II;
|
||||
PHRASE phraseF = phraseTableF.getPhrase( phrasePair[0].foreign );
|
||||
for(II i = countE.begin(); i != countE.end(); i++) {
|
||||
// cout << "\tp( " << i->first << " | " << phrasePair[0].foreign << " ; " << phraseF.size() << " ) = ...\n";
|
||||
|
||||
// foreign phrase (unless inverse)
|
||||
if (! inverseFlag) {
|
||||
for(int j=0;j<phraseF.size();j++)
|
||||
phraseTableFile << vcbF.getWord( phraseF[j] ) << " ";
|
||||
phraseTableFile << "||| ";
|
||||
}
|
||||
|
||||
// english phrase
|
||||
PHRASE phraseE = phraseTableE.getPhrase( i->first );
|
||||
for(int j=0;j<phraseE.size();j++)
|
||||
phraseTableFile << vcbE.getWord( phraseE[j] ) << " ";
|
||||
phraseTableFile << "||| ";
|
||||
|
||||
// foreign phrase (if inverse)
|
||||
if (inverseFlag) {
|
||||
for(int j=0;j<phraseF.size();j++)
|
||||
phraseTableFile << vcbF.getWord( phraseF[j] ) << " ";
|
||||
phraseTableFile << "||| ";
|
||||
}
|
||||
|
||||
// phrase translation probability
|
||||
phraseTableFile << ((double) i->second / (double) phrasePair.size());
|
||||
|
||||
// lexical translation probability
|
||||
double lexScore = 1;
|
||||
int null = vcbF.getWordID("NULL");
|
||||
PhraseAlignment ¤t = phrasePair[ alignmentE[ i->first ] ];
|
||||
for(int ei=0;ei<phraseE.size();ei++) { // all english words have to be explained
|
||||
if (current.alignedToE[ ei ].size() == 0)
|
||||
lexScore *= lexTable.ltable[ null ][ phraseE[ ei ] ]; // by NULL if neccessary
|
||||
else {
|
||||
double thisWordScore = 0;
|
||||
for(int j=0;j<current.alignedToE[ ei ].size();j++) {
|
||||
thisWordScore += lexTable.ltable[ phraseF[current.alignedToE[ ei ][ j ] ] ][ phraseE[ ei ] ];
|
||||
// cout << "lex" << j << "(" << vcbE.getWord( phraseE[ ei ] ) << "|" << vcbF.getWord( phraseF[current.alignedToE[ ei ][ j ] ] ) << ")=" << lexTable.ltable[ phraseF[current.alignedToE[ ei ][ j ] ] ][ phraseE[ ei ] ] << " ";
|
||||
}
|
||||
lexScore *= thisWordScore / (double)current.alignedToE[ ei ].size();
|
||||
}
|
||||
// cout << " => " << lexScore << endl;
|
||||
}
|
||||
phraseTableFile << " " << lexScore;
|
||||
|
||||
// model 1 score
|
||||
|
||||
// zens&ney lexical score
|
||||
|
||||
phraseTableFile << endl;
|
||||
}
|
||||
}
|
||||
|
||||
void PhraseAlignment::create( char line[], int lineID ) {
|
||||
vector< string > token = tokenize( line );
|
||||
int item = 1;
|
||||
PHRASE phraseF, phraseE;
|
||||
for (int j=0; j<token.size(); j++) {
|
||||
if (token[j] == "|||") item++;
|
||||
else {
|
||||
if (item == 1)
|
||||
phraseF.push_back( vcbF.storeIfNew( token[j] ) );
|
||||
else if (item == 2)
|
||||
phraseE.push_back( vcbE.storeIfNew( token[j] ) );
|
||||
else if (item == 3) {
|
||||
int e,f;
|
||||
sscanf(token[j].c_str(), "%d-%d", &f, &e);
|
||||
if (e >= phraseE.size() || f >= phraseF.size()) {
|
||||
cerr << "WARNING: sentence " << lineID << " has alignment point (" << f << ", " << e << ") out of bounds (" << phraseF.size() << ", " << phraseE.size() << ")\n"; }
|
||||
else {
|
||||
if (alignedToE.size() == 0) {
|
||||
vector< int > dummy;
|
||||
for(int i=0;i<phraseE.size();i++)
|
||||
alignedToE.push_back( dummy );
|
||||
for(int i=0;i<phraseF.size();i++)
|
||||
alignedToF.push_back( dummy );
|
||||
foreign = phraseTableF.storeIfNew( phraseF );
|
||||
english = phraseTableE.storeIfNew( phraseE );
|
||||
}
|
||||
alignedToE[e].push_back( f );
|
||||
alignedToF[f].push_back( e );
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Removes all alignment information. Clearing the outer vectors destroys the
// per-word vectors as well. The phrase IDs are left untouched.
void PhraseAlignment::clear() {
  alignedToE.clear();
  alignedToF.clear();
}
|
||||
|
||||
// Returns true if `other` refers to the same English and foreign phrase and
// has exactly the same alignment points, in the same order, in both
// directions. The vectors are compared as a whole, which also covers pairs
// whose alignment vectors have a different number of rows.
bool PhraseAlignment::equals( PhraseAlignment other ) {
  if (other.english != english) return false;
  if (other.foreign != foreign) return false;
  return alignedToE == other.alignedToE
      && alignedToF == other.alignedToF;
}
|
||||
|
||||
void LexicalTable::load( char *fileName ) {
|
||||
cerr << "Loading lexical translation table from " << fileName;
|
||||
ifstream inFile;
|
||||
inFile.open(fileName);
|
||||
if (inFile.fail()) {
|
||||
cerr << " - ERROR: could not open file\n";
|
||||
exit(0);
|
||||
}
|
||||
istream *inFileP = &inFile;
|
||||
|
||||
char line[LINE_MAX_LENGTH];
|
||||
|
||||
int i=0;
|
||||
while(true) {
|
||||
i++;
|
||||
if (i%100000 == 0) cerr << "." << flush;
|
||||
SAFE_GETLINE((*inFileP), line, LINE_MAX_LENGTH, '\n');
|
||||
if (inFileP->eof()) break;
|
||||
|
||||
vector<string> token = tokenize( line );
|
||||
if (token.size() != 3) {
|
||||
cerr << "line " << i << " in " << fileName << " has wrong number of tokens, skipping:\n" <<
|
||||
token.size() << " " << token[0] << " " << line << endl;
|
||||
continue;
|
||||
}
|
||||
|
||||
double prob = atof( token[2].c_str() );
|
||||
WORD_ID wordE = vcbE.storeIfNew( token[0] );
|
||||
WORD_ID wordF = vcbF.storeIfNew( token[1] );
|
||||
ltable[ wordF ][ wordE ] = prob;
|
||||
}
|
||||
cerr << endl;
|
||||
}
|
101
scripts/training/phrase-extract/tables-core.cpp
Normal file
101
scripts/training/phrase-extract/tables-core.cpp
Normal file
@ -0,0 +1,101 @@
|
||||
//#include "beammain.h"
|
||||
#include "tables-core.h"
|
||||
|
||||
// Size of the line buffer used when reading table files.
#define TABLE_LINE_MAX_LENGTH 1000
#define UNKNOWNSTR "UNK"

// Reads one _DELIM-terminated line of at most _SIZE-1 characters from stream
// _IS into the buffer _LINE. If the read set failbit without badbit or eofbit
// (for example because the line did not fit into the buffer), the error state
// is cleared so that the next call continues with the rest of the input.
// Wrapped in do/while(0) so the macro behaves like a single statement, and all
// arguments are parenthesised so that arbitrary expressions can be passed.
#define SAFE_GETLINE(_IS, _LINE, _SIZE, _DELIM) \
  do { \
    (_IS).getline((_LINE), (_SIZE), (_DELIM)); \
    if ((_IS).fail() && !(_IS).bad() && !(_IS).eof()) (_IS).clear(); \
  } while (0)
|
||||
|
||||
// Splits a NUL-terminated string into its tokens. Tokens are separated by
// runs of spaces and tabs; leading and trailing separators are ignored, so no
// empty token is ever returned. The terminating NUL is never part of a token.
vector<string> tokenize( char input[] ) {
  vector< string > token;
  int pos = 0;
  while (true) {
    // skip the separators in front of the next token
    while (input[pos] == ' ' || input[pos] == '\t') pos++;
    if (input[pos] == '\0') break;

    // consume the token itself; pos ends on the separator or the NUL
    const int start = pos;
    while (input[pos] != '\0' && input[pos] != ' ' && input[pos] != '\t') pos++;
    token.push_back( string( input+start, pos-start ) );
  }
  return token;
}
|
||||
|
||||
// Returns the ID of `word`. An unknown word is appended to the vocabulary
// first and receives the next free ID (its index in `vocab`).
WORD_ID Vocabulary::storeIfNew( WORD word ) {
  map<WORD, WORD_ID>::iterator known = lookup.find( word );
  if (known != lookup.end())
    return known->second;

  const WORD_ID newId = vocab.size();
  vocab.push_back( word );
  lookup[ word ] = newId;
  return newId;
}
|
||||
|
||||
// Returns the ID of `word`, or 0 if the word is not in the vocabulary.
// 0 is also a regular ID, so the result alone does not tell the two cases apart.
WORD_ID Vocabulary::getWordID( WORD word ) {
  map<WORD, WORD_ID>::iterator known = lookup.find( word );
  if (known != lookup.end())
    return known->second;
  return 0;
}
|
||||
|
||||
// Returns the ID of `phrase`. An unknown phrase is appended to the table
// first and receives the next free ID (its index in `phraseTable`).
PHRASE_ID PhraseTable::storeIfNew( PHRASE phrase ) {
  map< PHRASE, PHRASE_ID >::iterator known = lookup.find( phrase );
  if (known != lookup.end())
    return known->second;

  const PHRASE_ID newId = phraseTable.size();
  phraseTable.push_back( phrase );
  lookup[ phrase ] = newId;
  return newId;
}
|
||||
|
||||
// Returns the ID of `phrase`, or 0 if the phrase is not in the table.
// 0 is also a regular ID, so the result alone does not tell the two cases apart.
PHRASE_ID PhraseTable::getPhraseID( PHRASE phrase ) {
  map< PHRASE, PHRASE_ID >::iterator known = lookup.find( phrase );
  if (known != lookup.end())
    return known->second;
  return 0;
}
|
||||
|
||||
// Forgets all phrases; IDs handed out afterwards start again at 0.
void PhraseTable::clear() {
  phraseTable.clear();
  lookup.clear();
}
|
||||
|
||||
void DTable::init() {
|
||||
for(int i = -10; i<10; i++)
|
||||
dtable[i] = -abs( i );
|
||||
}
|
||||
|
||||
void DTable::load( string fileName ) {
|
||||
ifstream inFile;
|
||||
inFile.open(fileName.c_str());
|
||||
istream *inFileP = &inFile;
|
||||
|
||||
char line[TABLE_LINE_MAX_LENGTH];
|
||||
int i=0;
|
||||
while(true) {
|
||||
i++;
|
||||
SAFE_GETLINE((*inFileP), line, TABLE_LINE_MAX_LENGTH, '\n');
|
||||
if (inFileP->eof()) break;
|
||||
|
||||
vector<string> token = tokenize( line );
|
||||
if (token.size() < 2) {
|
||||
cerr << "line " << i << " in " << fileName << " too short, skipping\n";
|
||||
continue;
|
||||
}
|
||||
|
||||
int d = atoi( token[0].c_str() );
|
||||
double prob = log( atof( token[1].c_str() ) );
|
||||
dtable[ d ] = prob;
|
||||
}
|
||||
}
|
||||
|
||||
double DTable::get( int distortion ) {
|
||||
if (dtable.find( distortion ) == dtable.end())
|
||||
return log( 0.00001 );
|
||||
return dtable[ distortion ];
|
||||
}
|
57
scripts/training/phrase-extract/tables-core.h
Normal file
57
scripts/training/phrase-extract/tables-core.h
Normal file
@ -0,0 +1,57 @@
|
||||
// Core tables shared by the phrase extraction and scoring tools: vocabulary,
// phrase table, translation table and distortion table.
#ifndef TABLES_CORE_H
#define TABLES_CORE_H

#include <assert.h>
#include <stdlib.h>

#include <cmath>     // log(), used by the DTable implementation
#include <fstream>
#include <iostream>
#include <map>
#include <queue>
#include <string>
#include <vector>    // vector is used throughout this header

// The sources that include this header rely on unqualified standard names.
using namespace std;

// Splits a NUL-terminated string at spaces and tabs.
vector<string> tokenize( char input[] );

typedef string WORD;
typedef unsigned int WORD_ID;

// Two-way mapping between words and numeric IDs. An ID is the index of the
// word in `vocab`.
class Vocabulary {
public:
  map<WORD, WORD_ID> lookup;   // word -> ID
  vector< WORD > vocab;        // ID -> word

  // Returns the ID of the word, adding the word first if it is unknown.
  WORD_ID storeIfNew( WORD word );
  // Returns the ID of the word, or 0 if it is unknown.
  WORD_ID getWordID( WORD word );
  // Returns the word with the given ID; the ID is not range-checked.
  inline WORD &getWord( WORD_ID id ) { return vocab[ id ]; }
};

typedef vector< WORD_ID > PHRASE;
typedef unsigned int PHRASE_ID;

// Two-way mapping between phrases (sequences of word IDs) and numeric IDs.
// An ID is the index of the phrase in `phraseTable`.
class PhraseTable {
public:
  map< PHRASE, PHRASE_ID > lookup;   // phrase -> ID
  vector< PHRASE > phraseTable;      // ID -> phrase

  // Returns the ID of the phrase, adding the phrase first if it is unknown.
  PHRASE_ID storeIfNew( PHRASE phrase );
  // Returns the ID of the phrase, or 0 if it is unknown.
  PHRASE_ID getPhraseID( PHRASE phrase );
  // Removes all phrases.
  void clear();
  // Returns the phrase with the given ID; the ID is not range-checked.
  inline PHRASE &getPhrase( const PHRASE_ID id ) { return phraseTable[ id ]; }
};

typedef vector< pair< PHRASE_ID, double > > PHRASEPROBVEC;

// Translation tables keyed by phrase ID, holding one score or a vector of
// scores per (phrase ID, phrase ID) entry.
class TTable {
public:
  map< PHRASE_ID, vector< pair< PHRASE_ID, double > > > ttable;
  map< PHRASE_ID, vector< pair< PHRASE_ID, vector< double > > > > ttableMulti;
};

// Distortion table: maps a distortion (signed distance) to its score.
class DTable {
public:
  map< int, double > dtable;

  // Fills the table with default values.
  void init();
  // Loads the table from the named file.
  void load( string fileName );
  // Returns the score for the distortion, or a fallback if there is no entry.
  double get( int distortion );
};

#endif
|
Loading…
Reference in New Issue
Block a user