This commit is contained in:
Eva 2012-04-28 23:11:30 -07:00
parent b8b3000daf
commit 6f39ad0b3e
13 changed files with 149 additions and 40 deletions

View File

@ -55,7 +55,6 @@ int main (int argc, char * const argv[])
const string filePath = argv[6]
,destPath = argv[7];
Moses::InputFileStream inStream(filePath);
OnDiskWrapper onDiskWrapper;
@ -138,7 +137,8 @@ void Tokenize(SourcePhrase &sourcePhrase, TargetPhrase &targetPhrase, char *line
break;
}
case 3: {
targetPhrase.Create1AlignFromString(tok);
//targetPhrase.Create1AlignFromString(tok);
targetPhrase.CreateAlignFromString(tok);
break;
}
case 4:

View File

@ -27,6 +27,8 @@
#include "TargetPhrase.h"
#include "OnDiskWrapper.h"
#include <boost/algorithm/string.hpp>
using namespace std;
namespace OnDiskPt
@ -61,6 +63,18 @@ void TargetPhrase::Create1AlignFromString(const std::string &align1Str)
m_align.push_back(pair<size_t, size_t>(alignPoints[0], alignPoints[1]) );
}
// Parse a whitespace-separated list of "src-tgt" alignment points
// (e.g. "0-0 1-2 2-1") and append each as a pair to m_align.
// Malformed or empty tokens are skipped instead of causing
// out-of-bounds reads.
void TargetPhrase::CreateAlignFromString(const std::string &alignStr)
{
  vector<std::string> alignPairs;
  boost::split(alignPairs, alignStr, boost::is_any_of("\t "));
  for (size_t i = 0; i < alignPairs.size(); ++i) {
    // boost::split emits empty tokens for consecutive or trailing
    // delimiters; skip them rather than tokenizing an empty string.
    if (alignPairs[i].empty())
      continue;
    vector<size_t> alignPoints;
    Moses::Tokenize<size_t>(alignPoints, alignPairs[i], "-");
    // A well-formed point has at least "src-tgt"; guard before
    // indexing so a stray token cannot read past the vector end.
    if (alignPoints.size() < 2)
      continue;
    m_align.push_back(pair<size_t, size_t>(alignPoints[0], alignPoints[1]) );
  }
}
void TargetPhrase::SetScore(float score, size_t ind)
{
CHECK(ind < m_scores.size());
@ -143,9 +157,10 @@ char *TargetPhrase::WriteOtherInfoToMemory(OnDiskWrapper &onDiskWrapper, size_t
// phrase id
memcpy(mem, &m_filePos, sizeof(UINT64));
memUsed += sizeof(UINT64);
// align
memUsed += WriteAlignToMemory(mem + memUsed);
size_t tmp = WriteAlignToMemory(mem + memUsed);
memUsed += tmp;
// scores
memUsed += WriteScoresToMemory(mem + memUsed);
@ -176,6 +191,7 @@ size_t TargetPhrase::WriteAlignToMemory(char *mem) const
memUsed += sizeof(alignPair.second);
}
std::cerr << "align memory used: " << memUsed << std::endl;
return memUsed;
}
@ -269,12 +285,14 @@ UINT64 TargetPhrase::ReadFromFile(std::fstream &fileTP, size_t numFactors)
UINT64 TargetPhrase::ReadAlignFromFile(std::fstream &fileTPColl)
{
std::cerr << "read alignment.." << std::endl;
UINT64 bytesRead = 0;
UINT64 numAlign;
fileTPColl.read((char*) &numAlign, sizeof(UINT64));
bytesRead += sizeof(UINT64);
std::cerr << "numAlign: " << numAlign << std::endl;
for (size_t ind = 0; ind < numAlign; ++ind) {
AlignPair alignPair;
fileTPColl.read((char*) &alignPair.first, sizeof(UINT64));
@ -284,6 +302,7 @@ UINT64 TargetPhrase::ReadAlignFromFile(std::fstream &fileTPColl)
bytesRead += sizeof(UINT64) * 2;
}
std::cerr << "Align bytes read: " << bytesRead << std::endl;
return bytesRead;
}

View File

@ -63,6 +63,7 @@ public:
void SetLHS(Word *lhs);
void Create1AlignFromString(const std::string &align1Str);
void CreateAlignFromString(const std::string &align1Str);
void SetScore(float score, size_t ind);
const AlignType &GetAlign() const {

View File

@ -173,11 +173,14 @@ void TargetPhraseCollection::ReadFromFile(size_t tableLimit, UINT64 filePos, OnD
TargetPhrase *tp = new TargetPhrase(numScores);
UINT64 sizeOtherInfo = tp->ReadOtherInfoFromFile(currFilePos, fileTPColl);
std::cerr << "other info done." << std::endl;
tp->ReadFromFile(fileTP, numTargetFactors);
std::cerr << "done reading from file." << std::endl;
currFilePos += sizeOtherInfo;
m_coll.push_back(tp);
std::cerr << "tp done." << std::endl;
}
}

View File

@ -7,6 +7,8 @@
#include <iterator>
#include <stdexcept>
#include "Util.h"
#include "ScoreDataIterator.h"
#include "FeatureDataIterator.h"
BleuScorer::BleuScorer(const string& config)
: StatisticsBasedScorer("BLEU",config),
@ -212,3 +214,65 @@ void BleuScorer::dump_counts(counts_t& counts) const {
}
cerr << endl;
}
// Compute a smoothed sentence-level BLEU score for every hypothesis in
// an n-best list, given parallel score/feature files produced by the
// extractor. Returns one BLEU value per hypothesis, in file order.
// Exits the process (exit(1)) on truncated or mismatched input files.
vector<float> BleuScorer::ScoreNbestList(string scoreFile, string featureFile) {
// Wrapped in vectors so the iterator-driven loop below mirrors the
// multi-file code path used elsewhere (e.g. pro.cpp), even though a
// single file pair is passed in.
vector<string> scoreFiles;
vector<string> featureFiles;
scoreFiles.push_back(scoreFile);
featureFiles.push_back(featureFile);
vector<FeatureDataIterator> featureDataIters;
vector<ScoreDataIterator> scoreDataIters;
for (size_t i = 0; i < featureFiles.size(); ++i) {
featureDataIters.push_back(FeatureDataIterator(featureFiles[i]));
scoreDataIters.push_back(ScoreDataIterator(scoreFiles[i]));
}
// (file index, hypothesis index) pairs identifying each n-best entry.
vector<pair<size_t,size_t> > hypotheses;
// An iterator equal to end() right after construction means the
// feature file was empty or unreadable.
if (featureDataIters[0] == FeatureDataIterator::end()) {
cerr << "Error: at the end of feature data iterator" << endl;
exit(1);
}
for (size_t i = 0; i < featureFiles.size(); ++i) {
if (featureDataIters[i] == FeatureDataIterator::end()) {
cerr << "Error: Feature file " << i << " ended prematurely" << endl;
exit(1);
}
if (scoreDataIters[i] == ScoreDataIterator::end()) {
cerr << "Error: Score file " << i << " ended prematurely" << endl;
exit(1);
}
// Feature and score entries must correspond one-to-one per sentence.
if (featureDataIters[i]->size() != scoreDataIters[i]->size()) {
cerr << "Error: features and scores have different size" << endl;
exit(1);
}
for (size_t j = 0; j < featureDataIters[i]->size(); ++j) {
hypotheses.push_back(pair<size_t,size_t>(i,j));
}
}
// score the nbest list
// NOTE(review): only the first entry of each iterator is consumed —
// the iterators are never advanced, so this scores the n-best list of
// a single sentence per file; confirm against callers.
vector<float> bleuScores;
for (size_t i=0; i < hypotheses.size(); ++i) {
pair<size_t,size_t> translation = hypotheses[i];
float bleu = sentenceLevelBleuPlusOne(scoreDataIters[translation.first]->operator[](translation.second));
bleuScores.push_back(bleu);
}
return bleuScores;
}
// Smoothed ("plus one") sentence-level BLEU: one is added to every
// n-gram match and guess count so a zero match at any order does not
// collapse the geometric mean to zero.
// Expected stats layout: {match_1, total_1, ..., match_4, total_4,
// ref_length}; stats[1] (unigram total) doubles as hypothesis length
// for the brevity penalty.
float BleuScorer::sentenceLevelBleuPlusOne(const vector<float>& stats) {
  const unsigned int kOrder = 4;
  float sumLogPrecision = 0.0;
  for (unsigned int n = 0; n < kOrder; ++n) {
    const float matches = stats[2 * n] + 1;
    const float guesses = stats[2 * n + 1] + 1;
    sumLogPrecision += log(matches) - log(guesses);
  }
  float score = sumLogPrecision / kOrder;
  // Brevity penalty only fires when the hypothesis is shorter than
  // the reference (brevity < 0); longer hypotheses are not rewarded.
  float brevity = 1.0 - (float)stats[kOrder * 2] / stats[1];
  if (brevity < 0.0) {
    score += brevity;
  }
  return exp(score);
}

View File

@ -23,6 +23,9 @@ class BleuScorer: public StatisticsBasedScorer
public:
explicit BleuScorer(const string& config = "");
~BleuScorer();
static vector<float> ScoreNbestList(string scoreFile, string featureFile);
static float sentenceLevelBleuPlusOne(const vector<float>& stats);
virtual void setReferenceFiles(const vector<string>& referenceFiles);
virtual void prepareStats(size_t sid, const string& text, ScoreStats& entry);

View File

@ -9,7 +9,6 @@ ScoreDataIterator.cpp
FeatureStats.cpp FeatureArray.cpp FeatureData.cpp
FeatureDataIterator.cpp
Data.cpp
BleuScorer.cpp
Point.cpp
PerScorer.cpp
Scorer.cpp
@ -31,14 +30,16 @@ CderScorer.cpp
MergeScorer.cpp
../util//kenutil m ..//z ;
exe mert : mert.cpp mert_lib ../moses/src//ThreadPool ;
exe mert : mert.cpp mert_lib bleu_lib ../moses/src//ThreadPool ;
exe extractor : extractor.cpp mert_lib ;
exe extractor : extractor.cpp mert_lib bleu_lib ;
exe evaluator : evaluator.cpp mert_lib ;
exe evaluator : evaluator.cpp mert_lib bleu_lib ;
exe pro : pro.cpp mert_lib ..//boost_program_options ;
exe pro : pro.cpp mert_lib bleu_lib ..//boost_program_options ;
alias programs : mert extractor evaluator pro ;
install legacy : programs : <location>. ;
lib bleu_lib : BleuScorer.cpp mert_lib : : : <include>. ;

View File

@ -39,6 +39,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "FeatureDataIterator.h"
#include "ScoreDataIterator.h"
#include "BleuScorer.h"
using namespace std;
@ -67,23 +68,6 @@ public:
const pair<size_t,size_t>& getTranslation2() const { return translation2; }
};
// Smoothed ("plus one") sentence-level BLEU: adds one to every n-gram
// match/guess count so a zero match count cannot zero out the whole
// geometric mean.
// stats layout: {match_1, total_1, ..., match_4, total_4, ref_length};
// stats[1] (unigram total) is the hypothesis length used for the
// brevity penalty.
static float sentenceLevelBleuPlusOne(const std::vector<float>& stats) {
  const unsigned int order = 4;
  float logSum = 0.0;
  for (unsigned int n = 0; n < order; ++n) {
    logSum += std::log(stats[2 * n] + 1) - std::log(stats[2 * n + 1] + 1);
  }
  float result = logSum / order;
  // Penalize only hypotheses shorter than the reference.
  float brevity = 1.0 - (float)stats[order * 2] / stats[1];
  if (brevity < 0.0) {
    result += brevity;
  }
  return std::exp(result);
}
static void outputSample(ostream& out, const FeatureDataItem& f1, const FeatureDataItem& f2) {
// difference in score in regular features
for(unsigned int j=0; j<f1.dense.size(); j++)
@ -209,11 +193,11 @@ int main(int argc, char** argv)
for(size_t i=0; i<n_candidates; i++) {
size_t rand1 = rand() % n_translations;
pair<size_t,size_t> translation1 = hypotheses[rand1];
float bleu1 = sentenceLevelBleuPlusOne(scoreDataIters[translation1.first]->operator[](translation1.second));
float bleu1 = BleuScorer::sentenceLevelBleuPlusOne(scoreDataIters[translation1.first]->operator[](translation1.second));
size_t rand2 = rand() % n_translations;
pair<size_t,size_t> translation2 = hypotheses[rand2];
float bleu2 = sentenceLevelBleuPlusOne(scoreDataIters[translation2.first]->operator[](translation2.second));
float bleu2 = BleuScorer::sentenceLevelBleuPlusOne(scoreDataIters[translation2.first]->operator[](translation2.second));
/*
cerr << "t(" << translation1.first << "," << translation1.second << ") = " << bleu1 <<

View File

@ -99,7 +99,8 @@ namespace Mira {
bool distinct,
bool avgRefLength,
size_t rank,
size_t epoch)
size_t epoch,
string filename)
{
StaticData &staticData = StaticData::InstanceNonConst();
initialize(staticData, source, sentenceid, bleuObjectiveWeight, bleuScoreWeight, avgRefLength);
@ -115,7 +116,7 @@ namespace Mira {
SearchAlgorithm search = staticData.GetSearchAlgorithm();
return runDecoder(source, sentenceid, nBestSize, bleuObjectiveWeight, bleuScoreWeight,
featureValues, bleuScores, modelScores, numReturnedTranslations, distinct, rank, epoch,
search, system);
search, system, filename);
}
}
@ -132,12 +133,26 @@ namespace Mira {
size_t rank,
size_t epoch,
SearchAlgorithm& search,
const TranslationSystem& system) {
const TranslationSystem& system,
string filename) {
// run the decoder
m_manager = new Moses::Manager(*m_sentence, search, &system);
m_manager->ProcessSentence();
TrellisPathList nBestList;
m_manager->CalcNBest(nBestSize, nBestList, distinct);
// optionally print nbest to file (to extract scores and features.. currently just for sentence bleu scoring)
if (filename != "") {
ofstream out(filename.c_str());
if (!out) {
ostringstream msg;
msg << "Unable to open " << filename;
throw runtime_error(msg.str());
}
// TODO: handle sentence id (for now always 0)
OutputNBest(out, nBestList, StaticData::Instance().GetOutputFactorOrder(),m_manager->GetTranslationSystem(), 0);
out.close();
}
// read off the feature values and bleu scores for each sentence in the nbest list
Moses::TrellisPathList::const_iterator iter;
@ -184,7 +199,6 @@ namespace Mira {
translations.push_back(translation);
}
// cerr << "Rank " << rank << ", use cache: " << staticData.GetUseTransOptCache() << ", weights: " << staticData.GetAllWeights() << endl;
return translations;
}
@ -307,8 +321,8 @@ namespace Mira {
out.close();
}
else {
OutputNBest(streamOut, nBestList, StaticData::Instance().GetOutputFactorOrder(),m_manager->GetTranslationSystem(), sentenceid);
streamOut.flush();
OutputNBest(streamOut, nBestList, StaticData::Instance().GetOutputFactorOrder(),m_manager->GetTranslationSystem(), sentenceid);
streamOut.flush();
}
}
}

View File

@ -62,7 +62,8 @@ class MosesDecoder {
bool distinct,
bool avgRefLength,
size_t rank,
size_t epoch);
size_t epoch,
std::string filename);
std::vector< std::vector<const Moses::Word*> > runDecoder(const std::string& source,
size_t sentenceid,
size_t nbestSize,
@ -76,7 +77,8 @@ class MosesDecoder {
size_t rank,
size_t epoch,
Moses::SearchAlgorithm& seach,
const Moses::TranslationSystem& system);
const Moses::TranslationSystem& system,
std::string filename);
std::vector< std::vector<const Moses::Word*> > runChartDecoder(const std::string& source,
size_t sentenceid,
size_t nbestSize,

View File

@ -1,6 +1,6 @@
lib mira_lib :
[ glob *.cpp : *Test.cpp Main.cpp ]
../moses-cmd/src//IOWrapper_lib ../moses/src//moses ../OnDiskPt//OnDiskPt ..//boost_program_options ;
../moses-cmd/src//IOWrapper_lib ../mert//bleu_lib ../moses/src//moses ../OnDiskPt//OnDiskPt ..//boost_program_options ;
exe mira : Main.cpp mira_lib ;

View File

@ -762,6 +762,25 @@ int main(int argc, char** argv) {
cerr << endl;
}
// ################
ostringstream hope_nbest_filename, fear_nbest_filename, model_nbest_filename, ref_filename;
hope_nbest_filename << "decode_hope_sent" << *sid << "." << hope_n << "best";
fear_nbest_filename << "decode_fear_sent" << *sid << "." << fear_n << "best";
model_nbest_filename << "decode_model_sent" << *sid << "." << n << "best";
// save reference
ref_filename << "decode_ref_sent" << *sid;
referenceFileMegam = ref_filename.str();
ofstream ref_out(referenceFileMegam.c_str());
if (!ref_out) {
ostringstream msg;
msg << "Unable to open " << referenceFileMegam;
throw runtime_error(msg.str());
}
ref_out << referenceSentences[decoder->getShortestReferenceIndex(*sid)][*sid] << "\n";
ref_out.close();
// ################
// check LM weight
for (LMList::const_iterator i = lmList.begin(); i != lmList.end(); ++i) {
float lmWeight = mosesWeights.GetScoreForProducer(*i);
@ -770,14 +789,13 @@ int main(int argc, char** argv) {
cerr << "ERROR: language model weight should never be <= 0." << endl;
}
// HOPE
if (clear_static) {
delete decoder;
StaticData::ClearDataStatic();
decoder = new MosesDecoder(configFile, verbosity, decoder_params.size(), decoder_params);
decoder->setBleuParameters(sentenceLevelBleu, scaleByInputLength, scaleByAvgInputLength, scaleByInverseLength, scaleByAvgInverseLength, scaleByX, historySmoothing, bleu_smoothing_scheme);
decoder->setWeights(mosesWeights);
}
}
// ################
ostringstream hope_nbest_filename, fear_nbest_filename, model_nbest_filename, ref_filename;

View File

@ -54,7 +54,7 @@ void PhrasePairFeature::Evaluate(const Hypothesis& cur_hypo, ScoreComponentColle
namestr << targetFactor->GetString();
}
// temporary:
// temporary: limit training to particular phrases
if (!m_unrestricted) {
string feature = namestr.str();
if (m_limitedFeatures.find(feature) != m_limitedFeatures.end() )