faster scorer

git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@4119 1f5c12ca-751b-0410-a591-d2e778427230
This commit is contained in:
hieuhoang1972 2011-08-05 10:27:15 +00:00
parent b4c79f721e
commit 30ca534b86
9 changed files with 101 additions and 612 deletions

View File

@ -5,7 +5,7 @@ use strict;
# if your tests need a new version of the test data, increment this
# and make sure that a moses-regression-tests-vX.Y is available for
# download from statmt.org (redpony AT umd dot edu for more info)
use constant TESTING_DATA_VERSION => '6';
use constant TESTING_DATA_VERSION => '7';
# find the data directory in a few likely locations and make sure
# that it is the correct version

View File

@ -105,7 +105,7 @@ if($NBEST > 0){
run_command("gzip $results/run.nbest");
}
($o, $ec, $sig) = run_command("$BIN_TEST/compare-results.pl $results $truth");
($o, $ec, $sig) = run_command("$BIN_TEST/compare-results.perl $results $truth");
print $o;
if ($ec) {
print STDERR "FAILURE, for debugging, local moses.ini=$local_moses_ini\n";

View File

@ -8,8 +8,8 @@ DS?=$(shell date '+%Y%m%d')
# Set TARGETDIR to directory where you want the compiled scripts to be copied
# to.
# Set BINDIR to the directory where GIZA++ and other tools are installed.
TARGETDIR=/mnt/odin3/bhaddow/moses
BINDIR=/mnt/odin3/bhaddow/moses/bin
TARGETDIR=/opt/AO/sw/edinburgh-code/
BINDIR=/opt/AO/sw/edinburgh-code/
MAIN_SCRIPTS_TARGET_DIR=$(TARGETDIR)
# MAIN_SCRIPTS_TARGET_DIR=$(shell echo `pwd`/temp)

View File

@ -1,5 +1,5 @@
all: consolidate consolidate-direct consolidate-reverse extract extract-rules relax-parse \
score score2 statistics extract-lex
score statistics extract-lex
clean:
rm -f *.o
@ -19,9 +19,6 @@ extract-lex: extract-lex.o
score: tables-core.o AlignmentPhrase.o score.o PhraseAlignment.o InputFileStream.o
$(CXX) $^ -lz -o score
score2: tables-core.o AlignmentPhrase.o score2.o PhraseAlignment.o InputFileStream.o
$(CXX) $^ -lz -o score2
consolidate: consolidate.o tables-core.o InputFileStream.o
$(CXX) $^ -lz -o consolidate

View File

@ -58,8 +58,8 @@ vector<string> tokenize( const char [] );
void computeCountOfCounts( char* fileNameExtract, int maxLines );
void processPhrasePairs( vector< PhraseAlignment > & , ostream &phraseTableFile);
PhraseAlignment* findBestAlignment( vector< PhraseAlignment* > & );
void outputPhrasePair( vector< PhraseAlignment * > &, float, ostream &phraseTableFile );
PhraseAlignment* findBestAlignment(const PhraseAlignmentCollection & );
void outputPhrasePair(const PhraseAlignmentCollection &, float, ostream &phraseTableFile );
double computeLexicalTranslation( const PHRASE &, const PHRASE &, PhraseAlignment * );
LexicalTable lexTable;
@ -168,7 +168,11 @@ int main(int argc, char* argv[])
PhraseAlignment *lastPhrasePair = NULL;
while(true) {
if (extractFileP.eof()) break;
if (++i % 100000 == 0) cerr << "." << flush;
if (++i % 100000 == 0)
{
cerr << i << " " << flush;
}
SAFE_GETLINE((extractFileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
if (extractFileP.eof()) break;
@ -193,6 +197,7 @@ int main(int argc, char* argv[])
// if new source phrase, process last batch
if (lastPhrasePair != NULL &&
lastPhrasePair->GetSource() != phrasePair.GetSource()) {
processPhrasePairs( phrasePairsWithSameF, *phraseTableFile );
phrasePairsWithSameF.clear();
lastPhrasePair = NULL;
@ -291,39 +296,44 @@ void processPhrasePairs( vector< PhraseAlignment > &phrasePair, ostream &phraseT
// group phrase pairs based on alignments that matter
// (i.e. that re-arrange non-terminals)
vector< vector< PhraseAlignment * > > phrasePairGroup;
PhrasePairGroup phrasePairGroup;
float totalSource = 0;
//cerr << "phrasePair.size() = " << phrasePair.size() << endl;
// loop through phrase pairs
for(size_t i=0; i<phrasePair.size(); i++) {
// add to total count
PhraseAlignment &currPhrasePair = phrasePair[i];
totalSource += phrasePair[i].count;
bool matched = false;
// check for matches
for(size_t g=0; g<phrasePairGroup.size(); g++) {
vector< PhraseAlignment* > &group = phrasePairGroup[g];
// matched? place into same group
if ( group[0]->match( phrasePair[i] )) {
group.push_back( &phrasePair[i] );
matched = true;
}
}
// not matched? create new group
if (! matched) {
vector< PhraseAlignment* > newGroup;
newGroup.push_back( &phrasePair[i] );
phrasePairGroup.push_back( newGroup );
//cerr << "phrasePairGroup.size() = " << phrasePairGroup.size() << endl;
PhraseAlignmentCollection phraseAlignColl;
phraseAlignColl.push_back(&currPhrasePair);
pair<PhrasePairGroup::iterator, bool> retInsert;
retInsert = phrasePairGroup.insert(phraseAlignColl);
if (!retInsert.second)
{ // already exist. Add to that collection instead
PhraseAlignmentCollection &existingColl = const_cast<PhraseAlignmentCollection&>(*retInsert.first);
existingColl.push_back(&currPhrasePair);
}
}
for(size_t g=0; g<phrasePairGroup.size(); g++) {
vector< PhraseAlignment* > &group = phrasePairGroup[g];
const PhrasePairGroup::SortedColl &sortedColl = phrasePairGroup.GetSortedColl();
PhrasePairGroup::SortedColl::const_iterator iter;
for(iter = sortedColl.begin(); iter != sortedColl.end(); ++iter)
{
const PhraseAlignmentCollection &group = **iter;
outputPhrasePair( group, totalSource, phraseTableFile );
}
}
PhraseAlignment* findBestAlignment( vector< PhraseAlignment* > &phrasePair )
PhraseAlignment* findBestAlignment(const PhraseAlignmentCollection &phrasePair )
{
float bestAlignmentCount = -1;
PhraseAlignment* bestAlignment;
@ -338,7 +348,7 @@ PhraseAlignment* findBestAlignment( vector< PhraseAlignment* > &phrasePair )
return bestAlignment;
}
void outputPhrasePair( vector< PhraseAlignment* > &phrasePair, float totalCount, ostream &phraseTableFile )
void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCount, ostream &phraseTableFile )
{
if (phrasePair.size() == 0) return;
@ -488,3 +498,18 @@ void LexicalTable::load( char *fileName )
cerr << endl;
}
std::pair<PhrasePairGroup::Coll::iterator,bool> PhrasePairGroup::insert ( const PhraseAlignmentCollection& obj )
{
std::pair<iterator,bool> ret = m_coll.insert(obj);
if (ret.second)
{ // obj inserted. Also add to sorted vector
const PhraseAlignmentCollection &insertedObj = *ret.first;
m_sortedColl.push_back(&insertedObj);
}
return ret;
}

View File

@ -8,11 +8,60 @@
*
*/
#include <string>
#include <vector>
class PhraseAlignment;
typedef std::vector<PhraseAlignment*> PhraseAlignmentCollection;
//typedef std::vector<PhraseAlignmentCollection> PhrasePairGroup;
class PhraseAlignmentCollectionOrderer
{
public:
bool operator()(const PhraseAlignmentCollection &collA, const PhraseAlignmentCollection &collB) const
{
assert(collA.size() > 0);
assert(collB.size() > 0);
const PhraseAlignment &objA = *collA[0];
const PhraseAlignment &objB = *collB[0];
bool ret = objA < objB;
return ret;
}
};
//typedef std::set<PhraseAlignmentCollection, PhraseAlignmentCollectionOrderer> PhrasePairGroup;
// Set of PhraseAlignmentCollection objects (ordered by
// PhraseAlignmentCollectionOrderer) paired with a vector of pointers to the
// stored collections, exposed through GetSortedColl().
class PhrasePairGroup
{
private:
  typedef std::set<PhraseAlignmentCollection, PhraseAlignmentCollectionOrderer> Coll;

public:
  typedef Coll::iterator iterator;
  typedef Coll::const_iterator const_iterator;
  typedef std::vector<const PhraseAlignmentCollection *> SortedColl;

  // Inserts obj into the set; see the out-of-line definition for details.
  std::pair<Coll::iterator,bool> insert ( const PhraseAlignmentCollection& obj );

  // Read-only access to the pointer vector maintained by insert().
  const SortedColl &GetSortedColl() const
  {
    return m_sortedColl;
  }

private:
  Coll m_coll;
  SortedColl m_sortedColl;
};
// other functions *********************************************
// Returns true when word has the shape of a non-terminal label: at least
// three characters long, beginning with '[' and ending with ']'.
// Takes the word by const reference so const strings and temporaries are
// accepted, and compares single characters instead of building substrings.
inline bool isNonTerminal( const std::string &word )
{
  const std::string::size_type len = word.length();
  return len >= 3 &&
         word[0] == '[' &&
         word[len - 1] == ']';
}

View File

@ -1,515 +0,0 @@
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2009 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include <sstream>
#include <cstdio>
#include <iostream>
#include <fstream>
#include <vector>
#include <stdlib.h>
#include <assert.h>
#include <cstring>
#include <set>
#include "SafeGetline.h"
#include "tables-core.h"
#include "PhraseAlignment.h"
#include "score2.h"
#include "InputFileStream.h"
using namespace std;
// Size of the fixed character buffers that input lines are read into
// via SAFE_GETLINE.
#define LINE_MAX_LENGTH 100000
// Vocabulary used for target-phrase words (and the first column of the
// lexical table file).
Vocabulary vcbT;
// Vocabulary used for source-phrase words (and the second column of the
// lexical table file); also queried for the "NULL" word.
Vocabulary vcbS;
class LexicalTable
{
public:
map< WORD_ID, map< WORD_ID, double > > ltable;
void load( char[] );
double permissiveLookup( WORD_ID wordS, WORD_ID wordT ) {
// cout << endl << vcbS.getWord( wordS ) << "-" << vcbT.getWord( wordT ) << ":";
if (ltable.find( wordS ) == ltable.end()) return 1.0;
if (ltable[ wordS ].find( wordT ) == ltable[ wordS ].end()) return 1.0;
// cout << ltable[ wordS ][ wordT ];
return ltable[ wordS ][ wordT ];
}
};
// Forward declarations of helpers used in this file.
vector<string> tokenize( const char [] );
void computeCountOfCounts( char* fileNameExtract, int maxLines );
void processPhrasePairs( vector< PhraseAlignment > & , ostream &phraseTableFile);
PhraseAlignment* findBestAlignment(const PhraseAlignmentCollection & );
void outputPhrasePair(const PhraseAlignmentCollection &, float, ostream &phraseTableFile );
double computeLexicalTranslation( const PHRASE &, const PHRASE &, PhraseAlignment * );
// Lexical translation table, loaded in main() unless --NoLex is given.
LexicalTable lexTable;
// Option flags below are set by the command-line parsing in main().
bool inverseFlag = false; // "inverse" / --Inverse
bool hierarchicalFlag = false; // --Hierarchical
bool wordAlignmentFlag = false; // --WordAlignment
bool goodTuringFlag = false; // --GoodTuring
// Largest (rounded-up) count tracked in countOfCounts / discountFactor.
#define GT_MAX 10
bool logProbFlag = false; // --LogProb or --NegLogProb
// Multiplier applied to log scores: 1 by default, -1 with --NegLogProb.
int negLogProb = 1;
bool lexFlag = true; // cleared by --NoLex
// countOfCounts[c]: number of phrase pairs tallied with rounded-up count c.
int countOfCounts[GT_MAX+1];
// discountFactor[c]: Good-Turing discount computed by computeCountOfCounts().
float discountFactor[GT_MAX+1];
// Line limit for computeCountOfCounts(); values <= 0 mean no limit.
int maxLinesGTDiscount = -1;
bool phrasePairCountFlag = false; // --PhrasePairCount
int main(int argc, char* argv[])
{
cerr << "Score v2.0 written by Philipp Koehn\n"
<< "scoring methods for extracted rules\n";
if (argc < 4) {
cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--WordAlignment] [--MaxLinesGTDiscount num] [--PhrasePairCount]\n";
exit(1);
}
char* fileNameExtract = argv[1];
char* fileNameLex = argv[2];
char* fileNamePhraseTable = argv[3];
for(int i=4; i<argc; i++) {
if (strcmp(argv[i],"inverse") == 0 || strcmp(argv[i],"--Inverse") == 0) {
inverseFlag = true;
cerr << "using inverse mode\n";
} else if (strcmp(argv[i],"--Hierarchical") == 0) {
hierarchicalFlag = true;
cerr << "processing hierarchical rules\n";
} else if (strcmp(argv[i],"--WordAlignment") == 0) {
wordAlignmentFlag = true;
cerr << "outputing word alignment" << endl;
} else if (strcmp(argv[i],"--NoLex") == 0) {
lexFlag = false;
cerr << "not computing lexical translation score\n";
} else if (strcmp(argv[i],"--GoodTuring") == 0) {
goodTuringFlag = true;
cerr << "using Good Turing discounting\n";
} else if (strcmp(argv[i],"--LogProb") == 0) {
logProbFlag = true;
cerr << "using log-probabilities\n";
} else if (strcmp(argv[i],"--NegLogProb") == 0) {
logProbFlag = true;
negLogProb = -1;
cerr << "using negative log-probabilities\n";
} else if (strcmp(argv[i],"--MaxLinesGTDiscount") == 0) {
++i;
maxLinesGTDiscount = atoi(argv[i]);
cerr << "maxLinesGTDiscount=" << maxLinesGTDiscount << endl;
} else if (strcmp(argv[i],"--PhrasePairCount") == 0) {
phrasePairCountFlag = true;
cerr << "outputting phrase pair counts" << endl;
} else {
cerr << "ERROR: unknown option " << argv[i] << endl;
exit(1);
}
}
// lexical translation table
if (lexFlag)
lexTable.load( fileNameLex );
// compute count of counts for Good Turing discounting
if (goodTuringFlag)
computeCountOfCounts( fileNameExtract, maxLinesGTDiscount );
// sorted phrase extraction file
Moses::InputFileStream extractFile(fileNameExtract);
if (extractFile.fail()) {
cerr << "ERROR: could not open extract file " << fileNameExtract << endl;
exit(1);
}
istream &extractFileP = extractFile;
// output file: phrase translation table
ostream *phraseTableFile;
if (strcmp(fileNamePhraseTable, "-") == 0) {
phraseTableFile = &cout;
}
else {
ofstream *outputFile = new ofstream();
outputFile->open(fileNamePhraseTable);
if (outputFile->fail()) {
cerr << "ERROR: could not open file phrase table file "
<< fileNamePhraseTable << endl;
exit(1);
}
phraseTableFile = outputFile;
}
// loop through all extracted phrase translations
float lastCount = 0.0f;
vector< PhraseAlignment > phrasePairsWithSameF;
int i=0;
char line[LINE_MAX_LENGTH],lastLine[LINE_MAX_LENGTH];
lastLine[0] = '\0';
PhraseAlignment *lastPhrasePair = NULL;
while(true) {
if (extractFileP.eof()) break;
if (++i % 100000 == 0)
{
cerr << i << " " << flush;
}
SAFE_GETLINE((extractFileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
if (extractFileP.eof()) break;
// identical to last line? just add count
if (strcmp(line,lastLine) == 0) {
lastPhrasePair->count += lastCount;
continue;
}
strcpy( lastLine, line );
// create new phrase pair
PhraseAlignment phrasePair;
phrasePair.create( line, i );
lastCount = phrasePair.count;
// only differs in count? just add count
if (lastPhrasePair != NULL && lastPhrasePair->equals( phrasePair )) {
lastPhrasePair->count += phrasePair.count;
continue;
}
// if new source phrase, process last batch
if (lastPhrasePair != NULL &&
lastPhrasePair->GetSource() != phrasePair.GetSource()) {
processPhrasePairs( phrasePairsWithSameF, *phraseTableFile );
phrasePairsWithSameF.clear();
lastPhrasePair = NULL;
}
// add phrase pairs to list, it's now the last one
phrasePairsWithSameF.push_back( phrasePair );
lastPhrasePair = &phrasePairsWithSameF.back();
}
processPhrasePairs( phrasePairsWithSameF, *phraseTableFile );
phraseTableFile->flush();
if (phraseTableFile != &cout) {
(dynamic_cast<ofstream*>(phraseTableFile))->close();
delete phraseTableFile;
}
}
// First pass over the extract file for Good-Turing discounting.
//
// Merges consecutive lines belonging to the same phrase pair, rounds each
// merged count up to an integer and tallies it in countOfCounts[] when it
// is <= GT_MAX, then derives discountFactor[1..GT_MAX-1] from neighbouring
// tallies (discountFactor[0] is a fixed floor of 0.01).
//
// fileNameExtract  path of the extract file
// maxLines         stop after this many lines; values <= 0 mean no limit
//
// Exits with status 1 if the file cannot be opened.
// NOTE(review): the last phrase pair read is deleted without being tallied.
void computeCountOfCounts( char* fileNameExtract, int maxLines )
{
  cerr << "computing counts of counts";
  for(int i=1; i<=GT_MAX; i++) countOfCounts[i] = 0;

  Moses::InputFileStream extractFile(fileNameExtract);
  if (extractFile.fail()) {
    cerr << "ERROR: could not open extract file " << fileNameExtract << endl;
    exit(1);
  }
  istream &extractFileP = extractFile;

  // loop through all extracted phrase translations
  int lineNum = 0;
  char line[LINE_MAX_LENGTH],lastLine[LINE_MAX_LENGTH];
  lastLine[0] = '\0';
  float lastCount = 0.0f;
  PhraseAlignment *lastPhrasePair = NULL;
  while(true) {
    if (extractFileP.eof()) break;
    if (maxLines > 0 && lineNum >= maxLines) break;
    if (++lineNum % 100000 == 0) cerr << "." << flush;
    SAFE_GETLINE((extractFileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
    if (extractFileP.eof()) break;

    // identical to last line? just add count
    // (the NULL check covers a blank first line, which compares equal to
    // the initially empty lastLine before any phrase pair exists)
    if (lastPhrasePair != NULL && strcmp(line,lastLine) == 0) {
      lastPhrasePair->count += lastCount;
      continue;
    }
    strcpy( lastLine, line );

    // create new phrase pair
    PhraseAlignment *phrasePair = new PhraseAlignment();
    phrasePair->create( line, lineNum );
    lastCount = phrasePair->count;

    // the very first phrase pair has nothing to be compared against
    if (lineNum == 1) {
      lastPhrasePair = phrasePair;
      continue;
    }

    // only differs in count? just add count
    if (lastPhrasePair->match( *phrasePair )) {
      lastPhrasePair->count += phrasePair->count;
      phrasePair->clear();
      delete(phrasePair);
      continue;
    }

    // previous phrase pair is complete: round its count up and tally it
    int count = lastPhrasePair->count + 0.99999;
    if(count <= GT_MAX)
      countOfCounts[ count ]++;
    lastPhrasePair->clear();
    delete( lastPhrasePair );
    lastPhrasePair = phrasePair;
  }
  delete lastPhrasePair;

  discountFactor[0] = 0.01; // floor
  cerr << "\n";
  for(int i=1; i<GT_MAX; i++) {
    discountFactor[i] = ((float)i+1)/(float)i*(((float)countOfCounts[i+1]+0.1) / ((float)countOfCounts[i]+0.1));
    cerr << "count " << i << ": " << countOfCounts[ i ] << ", discount factor: " << discountFactor[i];
    // some smoothing: cap at 1 and never drop below the previous factor
    if (discountFactor[i]>1)
      discountFactor[i] = 1;
    if (discountFactor[i]<discountFactor[i-1])
      discountFactor[i] = discountFactor[i-1];
    cerr << " -> " << discountFactor[i]*i << endl;
  }
}
// Scores one batch of phrase pairs.
// Each phrase pair is wrapped in a one-element collection and offered to a
// PhrasePairGroup; if an equivalent collection is already present the pair
// is appended to that one instead. Every resulting group is then written
// with outputPhrasePair(), together with the summed count of the batch.
// Does nothing for an empty batch.
void processPhrasePairs( vector< PhraseAlignment > &phrasePair, ostream &phraseTableFile )
{
  if (phrasePair.empty()) return;

  PhrasePairGroup groups;
  float totalSource = 0;

  for (size_t idx = 0; idx < phrasePair.size(); ++idx) {
    PhraseAlignment &current = phrasePair[idx];
    totalSource += current.count;

    PhraseAlignmentCollection singleton;
    singleton.push_back(&current);

    pair<PhrasePairGroup::iterator, bool> outcome = groups.insert(singleton);
    if (outcome.second) continue;   // a new group was created for this pair

    // group already exists: append to it. Set elements are const, hence the
    // cast; the appended pointer goes to the back, the first element is untouched.
    const_cast<PhraseAlignmentCollection&>(*outcome.first).push_back(&current);
  }

  const PhrasePairGroup::SortedColl &ordered = groups.GetSortedColl();
  for (size_t g = 0; g < ordered.size(); ++g) {
    const PhraseAlignmentCollection &group = *ordered[g];
    outputPhrasePair( group, totalSource, phraseTableFile );
  }
}
// Returns the element of phrasePair with the largest count; on ties the
// earliest such element wins.
// Returns NULL for an empty collection. If no element has a count above -1
// the first element is returned, rather than an uninitialized pointer.
PhraseAlignment* findBestAlignment(const PhraseAlignmentCollection &phrasePair )
{
  if (phrasePair.empty()) return NULL;

  float bestAlignmentCount = -1;
  PhraseAlignment* bestAlignment = phrasePair[0];
  for(size_t i=0; i<phrasePair.size(); i++) {
    if (phrasePair[i]->count > bestAlignmentCount) {
      bestAlignmentCount = phrasePair[i]->count;
      bestAlignment = phrasePair[i];
    }
  }
  return bestAlignment;
}
// Writes one phrase-table line for a group of phrase pairs.
//
// phrasePair       the group; source and target are taken from its first
//                  element, the alignment from findBestAlignment()
// totalCount       summed count of the batch the group belongs to
// phraseTableFile  destination stream
//
// Line layout (fields separated by "|||"):
//   source ||| target ||| scores ||| alignment ||| totalCount [groupCount]
// With inverseFlag the target is written before the source and no alignment
// is written. groupCount appears only with phrasePairCountFlag.
// Does nothing for an empty group.
void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCount, ostream &phraseTableFile )
{
  if (phrasePair.size() == 0) return;

  PhraseAlignment *bestAlignment = findBestAlignment( phrasePair );

  // compute count
  float count = 0;
  for(size_t i=0; i<phrasePair.size(); i++) {
    count += phrasePair[i]->count;
  }
  const float originalCount = count;

  const PHRASE &phraseS = phrasePair[0]->GetSource();
  const PHRASE &phraseT = phrasePair[0]->GetTarget();

  // labels (if hierarchical)

  // source phrase (unless inverse)
  if (! inverseFlag) {
    for(size_t j=0; j<phraseS.size(); j++) {
      phraseTableFile << vcbS.getWord( phraseS[j] );
      phraseTableFile << " ";
    }
    phraseTableFile << "||| ";
  }

  // target phrase
  for(size_t j=0; j<phraseT.size(); j++) {
    phraseTableFile << vcbT.getWord( phraseT[j] );
    phraseTableFile << " ";
  }
  phraseTableFile << "||| ";

  // source phrase (if inverse)
  if (inverseFlag) {
    for(size_t j=0; j<phraseS.size(); j++) {
      phraseTableFile << vcbS.getWord( phraseS[j] );
      phraseTableFile << " ";
    }
    phraseTableFile << "||| ";
  }

  // phrase translation probability
  if (goodTuringFlag && count<GT_MAX) {
    // Round the count up to pick its discount bucket. A fractional count in
    // (GT_MAX-1, GT_MAX) rounds up to GT_MAX, for which no discount factor
    // is ever computed; leave such counts undiscounted instead of
    // multiplying them by the unset (zero) entry.
    const int bucket = (int)(count+0.99999);
    if (bucket < GT_MAX)
      count *= discountFactor[bucket];
  }
  double condScore = count / totalCount;
  phraseTableFile << ( logProbFlag ? negLogProb*log(condScore) : condScore );

  // lexical translation probability
  if (lexFlag) {
    double lexScore = computeLexicalTranslation( phraseS, phraseT, bestAlignment);
    phraseTableFile << " " << ( logProbFlag ? negLogProb*log(lexScore) : lexScore );
  }

  phraseTableFile << " ||| ";

  // alignment info for non-terminals
  if (! inverseFlag) {
    if (hierarchicalFlag) {
      // always output alignment if hiero style, but only for non-terms
      // (the last target token is excluded from the alignment vector)
      assert(phraseT.size() == bestAlignment->alignedToT.size() + 1);
      for(size_t j = 0; j + 1 < phraseT.size(); j++) {
        if (isNonTerminal(vcbT.getWord( phraseT[j] ))) {
          if (bestAlignment->alignedToT[ j ].size() != 1) {
            cerr << "Error: unequal numbers of non-terminals. Make sure the text does not contain words in square brackets (like [xxx])." << endl;
            // flush what has been written so far before the assert aborts
            phraseTableFile.flush();
            assert(bestAlignment->alignedToT[ j ].size() == 1);
          }
          int sourcePos = *(bestAlignment->alignedToT[ j ].begin());
          phraseTableFile << sourcePos << "-" << j << " ";
        }
      }
    } else if (wordAlignmentFlag) {
      // alignment info in pb model
      for(size_t j=0; j<bestAlignment->alignedToT.size(); j++) {
        const set< size_t > &aligned = bestAlignment->alignedToT[j];
        for (set< size_t >::const_iterator p(aligned.begin()); p != aligned.end(); ++p) {
          phraseTableFile << *p << "-" << j << " ";
        }
      }
    }
  }

  phraseTableFile << " ||| " << totalCount;
  if (phrasePairCountFlag) {
    phraseTableFile << " " << originalCount;
  }
  phraseTableFile << endl;
}
double computeLexicalTranslation( const PHRASE &phraseS, const PHRASE &phraseT, PhraseAlignment *alignment )
{
// lexical translation probability
double lexScore = 1.0;
int null = vcbS.getWordID("NULL");
// all target words have to be explained
for(int ti=0; ti<alignment->alignedToT.size(); ti++) {
const set< size_t > & srcIndices = alignment->alignedToT[ ti ];
if (srcIndices.empty()) {
// explain unaligned word by NULL
lexScore *= lexTable.permissiveLookup( null, phraseT[ ti ] );
} else {
// go through all the aligned words to compute average
double thisWordScore = 0;
for (set< size_t >::const_iterator p(srcIndices.begin()); p != srcIndices.end(); ++p) {
thisWordScore += lexTable.permissiveLookup( phraseS[ *p ], phraseT[ ti ] );
}
lexScore *= thisWordScore / (double)srcIndices.size();
}
}
return lexScore;
}
void LexicalTable::load( char *fileName )
{
cerr << "Loading lexical translation table from " << fileName;
ifstream inFile;
inFile.open(fileName);
if (inFile.fail()) {
cerr << " - ERROR: could not open file\n";
exit(1);
}
istream *inFileP = &inFile;
char line[LINE_MAX_LENGTH];
int i=0;
while(true) {
i++;
if (i%100000 == 0) cerr << "." << flush;
SAFE_GETLINE((*inFileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
if (inFileP->eof()) break;
vector<string> token = tokenize( line );
if (token.size() != 3) {
cerr << "line " << i << " in " << fileName
<< " has wrong number of tokens, skipping:\n"
<< token.size() << " " << token[0] << " " << line << endl;
continue;
}
double prob = atof( token[2].c_str() );
WORD_ID wordT = vcbT.storeIfNew( token[0] );
WORD_ID wordS = vcbS.storeIfNew( token[1] );
ltable[ wordS ][ wordT ] = prob;
}
cerr << endl;
}
std::pair<PhrasePairGroup::Coll::iterator,bool> PhrasePairGroup::insert ( const PhraseAlignmentCollection& obj )
{
std::pair<iterator,bool> ret = m_coll.insert(obj);
if (ret.second)
{ // obj inserted. Also add to sorted vector
const PhraseAlignmentCollection &insertedObj = *ret.first;
m_sortedColl.push_back(&insertedObj);
}
return ret;
}

View File

@ -1,67 +0,0 @@
#pragma once
/*
* score.h
* extract
*
* Created by Hieu Hoang on 28/07/2010.
* Copyright 2010 __MyCompanyName__. All rights reserved.
*
*/
#include <string>
#include <vector>
class PhraseAlignment;
typedef std::vector<PhraseAlignment*> PhraseAlignmentCollection;
//typedef std::vector<PhraseAlignmentCollection> PhrasePairGroup;
class PhraseAlignmentCollectionOrderer
{
public:
bool operator()(const PhraseAlignmentCollection &collA, const PhraseAlignmentCollection &collB) const
{
assert(collA.size() > 0);
assert(collB.size() > 0);
const PhraseAlignment &objA = *collA[0];
const PhraseAlignment &objB = *collB[0];
bool ret = objA < objB;
return ret;
}
};
//typedef std::set<PhraseAlignmentCollection, PhraseAlignmentCollectionOrderer> PhrasePairGroup;
// Set of PhraseAlignmentCollection objects (ordered by
// PhraseAlignmentCollectionOrderer) paired with a vector of pointers to the
// stored collections, exposed through GetSortedColl().
class PhrasePairGroup
{
private:
  typedef std::set<PhraseAlignmentCollection, PhraseAlignmentCollectionOrderer> Coll;

public:
  typedef Coll::iterator iterator;
  typedef Coll::const_iterator const_iterator;
  typedef std::vector<const PhraseAlignmentCollection *> SortedColl;

  // Inserts obj into the set; see the out-of-line definition for details.
  std::pair<Coll::iterator,bool> insert ( const PhraseAlignmentCollection& obj );

  // Read-only access to the pointer vector maintained by insert().
  const SortedColl &GetSortedColl() const
  {
    return m_sortedColl;
  }

private:
  Coll m_coll;
  SortedColl m_sortedColl;
};
// other functions *********************************************
// Returns true when word has the shape of a non-terminal label: at least
// three characters long, beginning with '[' and ending with ']'.
// Takes the word by const reference so const strings and temporaries are
// accepted, and compares single characters instead of building substrings.
inline bool isNonTerminal( const std::string &word )
{
  const std::string::size_type len = word.length();
  return len >= 3 &&
         word[0] == '[' &&
         word[len - 1] == ']';
}