This commit is contained in:
Eva 2012-04-28 23:11:30 -07:00
parent b8b3000daf
commit 6f39ad0b3e
13 changed files with 149 additions and 40 deletions

View File

@ -55,7 +55,6 @@ int main (int argc, char * const argv[])
const string filePath = argv[6]
,destPath = argv[7];
Moses::InputFileStream inStream(filePath);
OnDiskWrapper onDiskWrapper;
@ -138,7 +137,8 @@ void Tokenize(SourcePhrase &sourcePhrase, TargetPhrase &targetPhrase, char *line
break;
}
case 3: {
targetPhrase.Create1AlignFromString(tok);
//targetPhrase.Create1AlignFromString(tok);
targetPhrase.CreateAlignFromString(tok);
break;
}
case 4:

View File

@ -27,6 +27,8 @@
#include "TargetPhrase.h"
#include "OnDiskWrapper.h"
#include <boost/algorithm/string.hpp>
using namespace std;
namespace OnDiskPt
@ -61,6 +63,18 @@ void TargetPhrase::Create1AlignFromString(const std::string &align1Str)
m_align.push_back(pair<size_t, size_t>(alignPoints[0], alignPoints[1]) );
}
// Parse a whitespace-separated list of "src-tgt" alignment points
// (e.g. "0-0 1-2 2-1") and append each as a pair to m_align.
// Malformed or empty tokens are skipped instead of causing
// out-of-bounds reads.
void TargetPhrase::CreateAlignFromString(const std::string &alignStr)
{
  vector<std::string> alignPairs;
  boost::split(alignPairs, alignStr, boost::is_any_of("\t "));
  for (size_t i = 0; i < alignPairs.size(); ++i) {
    // boost::split emits empty tokens for consecutive or trailing
    // delimiters; skip them rather than tokenizing an empty string.
    if (alignPairs[i].empty())
      continue;
    vector<size_t> alignPoints;
    Moses::Tokenize<size_t>(alignPoints, alignPairs[i], "-");
    // A well-formed point has at least "src-tgt"; guard before
    // indexing so a stray token cannot read past the vector end.
    if (alignPoints.size() < 2)
      continue;
    m_align.push_back(pair<size_t, size_t>(alignPoints[0], alignPoints[1]) );
  }
}
void TargetPhrase::SetScore(float score, size_t ind)
{
CHECK(ind < m_scores.size());
@ -143,9 +157,10 @@ char *TargetPhrase::WriteOtherInfoToMemory(OnDiskWrapper &onDiskWrapper, size_t
// phrase id
memcpy(mem, &m_filePos, sizeof(UINT64));
memUsed += sizeof(UINT64);
// align
memUsed += WriteAlignToMemory(mem + memUsed);
size_t tmp = WriteAlignToMemory(mem + memUsed);
memUsed += tmp;
// scores
memUsed += WriteScoresToMemory(mem + memUsed);
@ -176,6 +191,7 @@ size_t TargetPhrase::WriteAlignToMemory(char *mem) const
memUsed += sizeof(alignPair.second);
}
std::cerr << "align memory used: " << memUsed << std::endl;
return memUsed;
}
@ -269,12 +285,14 @@ UINT64 TargetPhrase::ReadFromFile(std::fstream &fileTP, size_t numFactors)
UINT64 TargetPhrase::ReadAlignFromFile(std::fstream &fileTPColl)
{
std::cerr << "read alignment.." << std::endl;
UINT64 bytesRead = 0;
UINT64 numAlign;
fileTPColl.read((char*) &numAlign, sizeof(UINT64));
bytesRead += sizeof(UINT64);
std::cerr << "numAlign: " << numAlign << std::endl;
for (size_t ind = 0; ind < numAlign; ++ind) {
AlignPair alignPair;
fileTPColl.read((char*) &alignPair.first, sizeof(UINT64));
@ -284,6 +302,7 @@ UINT64 TargetPhrase::ReadAlignFromFile(std::fstream &fileTPColl)
bytesRead += sizeof(UINT64) * 2;
}
std::cerr << "Align bytes read: " << bytesRead << std::endl;
return bytesRead;
}

View File

@ -63,6 +63,7 @@ public:
void SetLHS(Word *lhs);
void Create1AlignFromString(const std::string &align1Str);
void CreateAlignFromString(const std::string &align1Str);
void SetScore(float score, size_t ind);
const AlignType &GetAlign() const {

View File

@ -173,11 +173,14 @@ void TargetPhraseCollection::ReadFromFile(size_t tableLimit, UINT64 filePos, OnD
TargetPhrase *tp = new TargetPhrase(numScores);
UINT64 sizeOtherInfo = tp->ReadOtherInfoFromFile(currFilePos, fileTPColl);
std::cerr << "other info done." << std::endl;
tp->ReadFromFile(fileTP, numTargetFactors);
std::cerr << "done reading from file." << std::endl;
currFilePos += sizeOtherInfo;
m_coll.push_back(tp);
std::cerr << "tp done." << std::endl;
}
}

View File

@ -7,6 +7,8 @@
#include <iterator>
#include <stdexcept>
#include "Util.h"
#include "ScoreDataIterator.h"
#include "FeatureDataIterator.h"
BleuScorer::BleuScorer(const string& config)
: StatisticsBasedScorer("BLEU",config),
@ -212,3 +214,65 @@ void BleuScorer::dump_counts(counts_t& counts) const {
}
cerr << endl;
}
// Compute a smoothed sentence-level BLEU score for every hypothesis in
// an n-best list, given parallel score/feature files produced by the
// extractor. Returns one BLEU value per hypothesis, in file order.
// Exits the process (exit(1)) on truncated or mismatched input files.
vector<float> BleuScorer::ScoreNbestList(string scoreFile, string featureFile) {
// Wrapped in vectors so the iterator-driven loop below mirrors the
// multi-file code path used elsewhere (e.g. pro.cpp), even though a
// single file pair is passed in.
vector<string> scoreFiles;
vector<string> featureFiles;
scoreFiles.push_back(scoreFile);
featureFiles.push_back(featureFile);
vector<FeatureDataIterator> featureDataIters;
vector<ScoreDataIterator> scoreDataIters;
for (size_t i = 0; i < featureFiles.size(); ++i) {
featureDataIters.push_back(FeatureDataIterator(featureFiles[i]));
scoreDataIters.push_back(ScoreDataIterator(scoreFiles[i]));
}
// (file index, hypothesis index) pairs identifying each n-best entry.
vector<pair<size_t,size_t> > hypotheses;
// An iterator equal to end() right after construction means the
// feature file was empty or unreadable.
if (featureDataIters[0] == FeatureDataIterator::end()) {
cerr << "Error: at the end of feature data iterator" << endl;
exit(1);
}
for (size_t i = 0; i < featureFiles.size(); ++i) {
if (featureDataIters[i] == FeatureDataIterator::end()) {
cerr << "Error: Feature file " << i << " ended prematurely" << endl;
exit(1);
}
if (scoreDataIters[i] == ScoreDataIterator::end()) {
cerr << "Error: Score file " << i << " ended prematurely" << endl;
exit(1);
}
// Feature and score entries must correspond one-to-one per sentence.
if (featureDataIters[i]->size() != scoreDataIters[i]->size()) {
cerr << "Error: features and scores have different size" << endl;
exit(1);
}
for (size_t j = 0; j < featureDataIters[i]->size(); ++j) {
hypotheses.push_back(pair<size_t,size_t>(i,j));
}
}
// score the nbest list
// NOTE(review): only the first entry of each iterator is consumed —
// the iterators are never advanced, so this scores the n-best list of
// a single sentence per file; confirm against callers.
vector<float> bleuScores;
for (size_t i=0; i < hypotheses.size(); ++i) {
pair<size_t,size_t> translation = hypotheses[i];
float bleu = sentenceLevelBleuPlusOne(scoreDataIters[translation.first]->operator[](translation.second));
bleuScores.push_back(bleu);
}
return bleuScores;
}
// Smoothed ("plus one") sentence-level BLEU: one is added to every
// n-gram match and guess count so a zero match at any order does not
// collapse the geometric mean to zero.
// Expected stats layout: {match_1, total_1, ..., match_4, total_4,
// ref_length}; stats[1] (unigram total) doubles as hypothesis length
// for the brevity penalty.
float BleuScorer::sentenceLevelBleuPlusOne(const vector<float>& stats) {
  const unsigned int kOrder = 4;
  float sumLogPrecision = 0.0;
  for (unsigned int n = 0; n < kOrder; ++n) {
    const float matches = stats[2 * n] + 1;
    const float guesses = stats[2 * n + 1] + 1;
    sumLogPrecision += log(matches) - log(guesses);
  }
  float score = sumLogPrecision / kOrder;
  // Brevity penalty only fires when the hypothesis is shorter than
  // the reference (brevity < 0); longer hypotheses are not rewarded.
  float brevity = 1.0 - (float)stats[kOrder * 2] / stats[1];
  if (brevity < 0.0) {
    score += brevity;
  }
  return exp(score);
}

View File

@ -23,6 +23,9 @@ class BleuScorer: public StatisticsBasedScorer
public:
explicit BleuScorer(const string& config = "");
~BleuScorer();
static vector<float> ScoreNbestList(string scoreFile, string featureFile);
static float sentenceLevelBleuPlusOne(const vector<float>& stats);
virtual void setReferenceFiles(const vector<string>& referenceFiles);
virtual void prepareStats(size_t sid, const string& text, ScoreStats& entry);

View File

@ -9,7 +9,6 @@ ScoreDataIterator.cpp
FeatureStats.cpp FeatureArray.cpp FeatureData.cpp
FeatureDataIterator.cpp
Data.cpp
BleuScorer.cpp
Point.cpp
PerScorer.cpp
Scorer.cpp
@ -31,14 +30,16 @@ CderScorer.cpp
MergeScorer.cpp
../util//kenutil m ..//z ;
exe mert : mert.cpp mert_lib ../moses/src//ThreadPool ;
exe mert : mert.cpp mert_lib bleu_lib ../moses/src//ThreadPool ;
exe extractor : extractor.cpp mert_lib ;
exe extractor : extractor.cpp mert_lib bleu_lib ;
exe evaluator : evaluator.cpp mert_lib ;
exe evaluator : evaluator.cpp mert_lib bleu_lib ;
exe pro : pro.cpp mert_lib ..//boost_program_options ;
exe pro : pro.cpp mert_lib bleu_lib ..//boost_program_options ;
alias programs : mert extractor evaluator pro ;
install legacy : programs : <location>. ;
lib bleu_lib : BleuScorer.cpp mert_lib : : : <include>. ;

View File

@ -39,6 +39,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "FeatureDataIterator.h"
#include "ScoreDataIterator.h"
#include "BleuScorer.h"
using namespace std;
@ -67,23 +68,6 @@ public:
const pair<size_t,size_t>& getTranslation2() const { return translation2; }
};
// Smoothed ("plus one") sentence-level BLEU: adds one to every n-gram
// match/guess count so a zero match count cannot zero out the whole
// geometric mean.
// stats layout: {match_1, total_1, ..., match_4, total_4, ref_length};
// stats[1] (unigram total) is the hypothesis length used for the
// brevity penalty.
static float sentenceLevelBleuPlusOne(const std::vector<float>& stats) {
  const unsigned int order = 4;
  float logSum = 0.0;
  for (unsigned int n = 0; n < order; ++n) {
    logSum += std::log(stats[2 * n] + 1) - std::log(stats[2 * n + 1] + 1);
  }
  float result = logSum / order;
  // Penalize only hypotheses shorter than the reference.
  float brevity = 1.0 - (float)stats[order * 2] / stats[1];
  if (brevity < 0.0) {
    result += brevity;
  }
  return std::exp(result);
}
static void outputSample(ostream& out, const FeatureDataItem& f1, const FeatureDataItem& f2) {
// difference in score in regular features
for(unsigned int j=0; j<f1.dense.size(); j++)
@ -209,11 +193,11 @@ int main(int argc, char** argv)
for(size_t i=0; i<n_candidates; i++) {
size_t rand1 = rand() % n_translations;
pair<size_t,size_t> translation1 = hypotheses[rand1];
float bleu1 = sentenceLevelBleuPlusOne(scoreDataIters[translation1.first]->operator[](translation1.second));
float bleu1 = BleuScorer::sentenceLevelBleuPlusOne(scoreDataIters[translation1.first]->operator[](translation1.second));
size_t rand2 = rand() % n_translations;
pair<size_t,size_t> translation2 = hypotheses[rand2];
float bleu2 = sentenceLevelBleuPlusOne(scoreDataIters[translation2.first]->operator[](translation2.second));
float bleu2 = BleuScorer::sentenceLevelBleuPlusOne(scoreDataIters[translation2.first]->operator[](translation2.second));
/*
cerr << "t(" << translation1.first << "," << translation1.second << ") = " << bleu1 <<

View File

@ -99,7 +99,8 @@ namespace Mira {
bool distinct,
bool avgRefLength,
size_t rank,
size_t epoch)
size_t epoch,
string filename)
{
StaticData &staticData = StaticData::InstanceNonConst();
initialize(staticData, source, sentenceid, bleuObjectiveWeight, bleuScoreWeight, avgRefLength);
@ -115,7 +116,7 @@ namespace Mira {
SearchAlgorithm search = staticData.GetSearchAlgorithm();
return runDecoder(source, sentenceid, nBestSize, bleuObjectiveWeight, bleuScoreWeight,
featureValues, bleuScores, modelScores, numReturnedTranslations, distinct, rank, epoch,
search, system);
search, system, filename);
}
}
@ -132,12 +133,26 @@ namespace Mira {
size_t rank,
size_t epoch,
SearchAlgorithm& search,
const TranslationSystem& system) {
const TranslationSystem& system,
string filename) {
// run the decoder
m_manager = new Moses::Manager(*m_sentence, search, &system);
m_manager->ProcessSentence();
TrellisPathList nBestList;
m_manager->CalcNBest(nBestSize, nBestList, distinct);
// optionally print nbest to file (to extract scores and features.. currently just for sentence bleu scoring)
if (filename != "") {
ofstream out(filename.c_str());
if (!out) {
ostringstream msg;
msg << "Unable to open " << filename;
throw runtime_error(msg.str());
}
// TODO: handle sentence id (for now always 0)
OutputNBest(out, nBestList, StaticData::Instance().GetOutputFactorOrder(),m_manager->GetTranslationSystem(), 0);
out.close();
}
// read off the feature values and bleu scores for each sentence in the nbest list
Moses::TrellisPathList::const_iterator iter;
@ -184,7 +199,6 @@ namespace Mira {
translations.push_back(translation);
}
// cerr << "Rank " << rank << ", use cache: " << staticData.GetUseTransOptCache() << ", weights: " << staticData.GetAllWeights() << endl;
return translations;
}
@ -307,8 +321,8 @@ namespace Mira {
out.close();
}
else {
OutputNBest(streamOut, nBestList, StaticData::Instance().GetOutputFactorOrder(),m_manager->GetTranslationSystem(), sentenceid);
streamOut.flush();
OutputNBest(streamOut, nBestList, StaticData::Instance().GetOutputFactorOrder(),m_manager->GetTranslationSystem(), sentenceid);
streamOut.flush();
}
}
}

View File

@ -62,7 +62,8 @@ class MosesDecoder {
bool distinct,
bool avgRefLength,
size_t rank,
size_t epoch);
size_t epoch,
std::string filename);
std::vector< std::vector<const Moses::Word*> > runDecoder(const std::string& source,
size_t sentenceid,
size_t nbestSize,
@ -76,7 +77,8 @@ class MosesDecoder {
size_t rank,
size_t epoch,
Moses::SearchAlgorithm& seach,
const Moses::TranslationSystem& system);
const Moses::TranslationSystem& system,
std::string filename);
std::vector< std::vector<const Moses::Word*> > runChartDecoder(const std::string& source,
size_t sentenceid,
size_t nbestSize,

View File

@ -1,6 +1,6 @@
lib mira_lib :
[ glob *.cpp : *Test.cpp Main.cpp ]
../moses-cmd/src//IOWrapper_lib ../moses/src//moses ../OnDiskPt//OnDiskPt ..//boost_program_options ;
../moses-cmd/src//IOWrapper_lib ../mert//bleu_lib ../moses/src//moses ../OnDiskPt//OnDiskPt ..//boost_program_options ;
exe mira : Main.cpp mira_lib ;

View File

@ -762,6 +762,25 @@ int main(int argc, char** argv) {
cerr << endl;
}
// ################
ostringstream hope_nbest_filename, fear_nbest_filename, model_nbest_filename, ref_filename;
hope_nbest_filename << "decode_hope_sent" << *sid << "." << hope_n << "best";
fear_nbest_filename << "decode_fear_sent" << *sid << "." << fear_n << "best";
model_nbest_filename << "decode_model_sent" << *sid << "." << n << "best";
// save reference
ref_filename << "decode_ref_sent" << *sid;
referenceFileMegam = ref_filename.str();
ofstream ref_out(referenceFileMegam.c_str());
if (!ref_out) {
ostringstream msg;
msg << "Unable to open " << referenceFileMegam;
throw runtime_error(msg.str());
}
ref_out << referenceSentences[decoder->getShortestReferenceIndex(*sid)][*sid] << "\n";
ref_out.close();
// ################
// check LM weight
for (LMList::const_iterator i = lmList.begin(); i != lmList.end(); ++i) {
float lmWeight = mosesWeights.GetScoreForProducer(*i);
@ -770,14 +789,13 @@ int main(int argc, char** argv) {
cerr << "ERROR: language model weight should never be <= 0." << endl;
}
// HOPE
if (clear_static) {
delete decoder;
StaticData::ClearDataStatic();
decoder = new MosesDecoder(configFile, verbosity, decoder_params.size(), decoder_params);
decoder->setBleuParameters(sentenceLevelBleu, scaleByInputLength, scaleByAvgInputLength, scaleByInverseLength, scaleByAvgInverseLength, scaleByX, historySmoothing, bleu_smoothing_scheme);
decoder->setWeights(mosesWeights);
}
}
// ################
ostringstream hope_nbest_filename, fear_nbest_filename, model_nbest_filename, ref_filename;

View File

@ -54,7 +54,7 @@ void PhrasePairFeature::Evaluate(const Hypothesis& cur_hypo, ScoreComponentColle
namestr << targetFactor->GetString();
}
// temporary:
// temporary: limit training to particular phrases
if (!m_unrestricted) {
string feature = namestr.str();
if (m_limitedFeatures.find(feature) != m_limitedFeatures.end() )