// $Id$
// vim:tabstop=2

/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2011- University of Edinburgh

This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.

This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/

/**
 * This is part of the PRO implementation. It converts the features and scores
 * files into a form suitable for input into the megam maxent trainer.
 *
 * For details of PRO, refer to Hopkins & May (EMNLP 2011).
 **/

#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <ctime>
#include <fstream>
#include <iostream>
#include <utility>
#include <vector>

#include <boost/program_options.hpp>

#include "FeatureDataIterator.h"
#include "ScoreDataIterator.h"

using namespace std;

namespace po = boost::program_options;

// A pair of sampled hypotheses, ordered so that translation1 always has the
// higher metric score. Each hypothesis is identified by a
// (file index, hypothesis index) pair.
class SampledPair {
private:
  pair<size_t,size_t> translation1;
  pair<size_t,size_t> translation2;
  float scoreDiff;
public:
  SampledPair(const pair<size_t,size_t>& t1, const pair<size_t,size_t>& t2, float diff) {
    if (diff > 0) {
      translation1 = t1;
      translation2 = t2;
      scoreDiff = diff;
    } else {
      translation1 = t2;
      translation2 = t1;
      scoreDiff = -diff;
    }
  }

  float getDiff() const { return scoreDiff; }
  const pair<size_t,size_t>& getTranslation1() const { return translation1; }
  const pair<size_t,size_t>& getTranslation2() const { return translation2; }
};

// Smoothed sentence-level BLEU (BLEU+1): add 1 to each n-gram match and
// total count before taking logs, then apply the brevity penalty.
// stats holds interleaved match/total counts for n-grams of order 1..4,
// followed by the reference length.
static float sentenceLevelBleuPlusOne(const vector<float>& stats)
{
  float logbleu = 0.0;
  const unsigned int bleu_order = 4;
  for (unsigned int j = 0; j < bleu_order; j++) {
    logbleu += log(stats[2*j]+1) - log(stats[2*j+1]+1);
  }
  logbleu /= bleu_order;
  float brevity = 1.0 - static_cast<float>(stats[(bleu_order*2)]) / stats[1];
  if (brevity < 0.0) {
    logbleu += brevity;
  }
  //cerr << brevity << " -> " << exp(logbleu) << endl;
  return exp(logbleu);
}

static void outputSample(ostream& out, const FeatureDataItem& f1, const FeatureDataItem& f2)
{
  // difference in score in regular features
  for (unsigned int j = 0; j < f1.dense.size(); j++)
    if (abs(f1.dense[j]-f2.dense[j]) > 0.00001)
      out << " F" << j << " " << (f1.dense[j]-f2.dense[j]);

  if (f1.sparse.size() || f2.sparse.size()) {
    out << " ";

    // sparse features
    const SparseVector &s1 = f1.sparse;
    const SparseVector &s2 = f2.sparse;
    SparseVector diff = s1 - s2;
    diff.write(out);
  }
}
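// Each sampled pair is written as two complementary training instances:
// a class label (1 or 0) followed by alternating feature names and
// feature-difference values, the "name value" format that megam accepts
// when run with explicit feature values. An output line might look like
// this (feature indices and values are illustrative only):
//
//   1 F0 0.25 F3 -1.5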
int main(int argc, char** argv)
{
  bool help;
  vector<string> scoreFiles;
  vector<string> featureFiles;
  int seed;
  string outputFile;
  //TODO: options
  const unsigned int n_candidates = 5000; // Gamma, in Hopkins & May
  const unsigned int n_samples = 50;      // Xi, in Hopkins & May
  const float min_diff = 0.05;

  po::options_description desc("Allowed options");
  desc.add_options()
  ("help,h", po::value(&help)->zero_tokens()->default_value(false), "Print this help message and exit")
  ("scfile,S", po::value<vector<string> >(&scoreFiles), "Scorer data files")
  ("ffile,F", po::value<vector<string> >(&featureFiles), "Feature data files")
  ("random-seed,r", po::value<int>(&seed), "Seed for random number generation")
  ("output-file,o", po::value<string>(&outputFile), "Output file")
  ;

  po::options_description cmdline_options;
  cmdline_options.add(desc);
  po::variables_map vm;
  po::store(po::command_line_parser(argc, argv).
            options(cmdline_options).run(), vm);
  po::notify(vm);
  if (help) {
    cout << "Usage: " + string(argv[0]) + " [options]" << endl;
    cout << desc << endl;
    exit(0);
  }

  if (vm.count("random-seed")) {
    cerr << "Initialising random seed to " << seed << endl;
    srand(seed);
  } else {
    cerr << "Initialising random seed from system clock" << endl;
    srand(time(NULL));
  }

  if (scoreFiles.size() == 0 || featureFiles.size() == 0) {
    cerr << "No data to process" << endl;
    exit(0);
  }

  if (featureFiles.size() != scoreFiles.size()) {
    cerr << "Error: Number of feature files (" << featureFiles.size() <<
         ") does not match number of score files (" << scoreFiles.size() << ")" << endl;
    exit(1);
  }

  ostream* out;
  ofstream outFile;
  if (!outputFile.empty()) {
    outFile.open(outputFile.c_str());
    if (!(outFile)) {
      cerr << "Error: Failed to open " << outputFile << endl;
      exit(1);
    }
    out = &outFile;
  } else {
    out = &cout;
  }

  vector<FeatureDataIterator> featureDataIters;
  vector<ScoreDataIterator> scoreDataIters;
  for (size_t i = 0; i < featureFiles.size(); ++i) {
    featureDataIters.push_back(FeatureDataIterator(featureFiles[i]));
    scoreDataIters.push_back(ScoreDataIterator(scoreFiles[i]));
  }

  //loop through nbest lists
  size_t sentenceId = 0;
  while (1) {
    vector<pair<size_t,size_t> > hypotheses;
    //TODO: de-duping. Collect hashes of score,feature pairs and
    //only add index if it's unique.
    if (featureDataIters[0] == FeatureDataIterator::end()) {
      break;
    }
    for (size_t i = 0; i < featureFiles.size(); ++i) {
      if (featureDataIters[i] == FeatureDataIterator::end()) {
        cerr << "Error: Feature file " << i << " ended prematurely" << endl;
        exit(1);
      }
      if (scoreDataIters[i] == ScoreDataIterator::end()) {
        cerr << "Error: Score file " << i << " ended prematurely" << endl;
        exit(1);
      }
      if (featureDataIters[i]->size() != scoreDataIters[i]->size()) {
        cerr << "Error: For sentence " << sentenceId <<
             " features and scores have different size" << endl;
        exit(1);
      }
      for (size_t j = 0; j < featureDataIters[i]->size(); ++j) {
        hypotheses.push_back(pair<size_t,size_t>(i, j));
      }
    }

    //collect the candidates
    vector<SampledPair> samples;
    vector<float> scores;
    size_t n_translations = hypotheses.size();
    for (size_t i = 0; i < n_candidates; i++) {
      size_t rand1 = rand() % n_translations;
      pair<size_t,size_t> translation1 = hypotheses[rand1];
      float bleu1 = sentenceLevelBleuPlusOne(
        scoreDataIters[translation1.first]->operator[](translation1.second));

      size_t rand2 = rand() % n_translations;
      pair<size_t,size_t> translation2 = hypotheses[rand2];
      float bleu2 = sentenceLevelBleuPlusOne(
        scoreDataIters[translation2.first]->operator[](translation2.second));

      /*
      cerr << "t(" << translation1.first << "," << translation1.second << ") = " << bleu1 <<
        " t(" << translation2.first << "," << translation2.second << ") = " <<
           bleu2 << " diff = " << abs(bleu1-bleu2) << endl;
      */
      if (abs(bleu1-bleu2) < min_diff)
        continue;

      samples.push_back(SampledPair(translation1, translation2, bleu1-bleu2));
      scores.push_back(1.0 - abs(bleu1-bleu2));
    }

    float sample_threshold = -1.0;
    if (samples.size() > n_samples) {
      nth_element(scores.begin(), scores.begin() + (n_samples-1), scores.end());
      sample_threshold = 0.99999 - scores[n_samples-1];
    }
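    // scores[i] stores 1 - |bleu1 - bleu2|, so the (n_samples)-th smallest
    // entry found by nth_element corresponds to the (n_samples)-th largest
    // BLEU difference. Pairs whose difference falls below sample_threshold
    // are skipped in the loop below, keeping at most the n_samples pairs
    // with the largest metric difference (the Xi samples of Hopkins & May).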
    size_t collected = 0;
    for (size_t i = 0; collected < n_samples && i < samples.size(); ++i) {
      if (samples[i].getDiff() < sample_threshold) continue;
      ++collected;
      size_t file_id1 = samples[i].getTranslation1().first;
      size_t hypo_id1 = samples[i].getTranslation1().second;
      size_t file_id2 = samples[i].getTranslation2().first;
      size_t hypo_id2 = samples[i].getTranslation2().second;
      *out << "1";
      outputSample(*out, featureDataIters[file_id1]->operator[](hypo_id1),
                   featureDataIters[file_id2]->operator[](hypo_id2));
      *out << endl;
      *out << "0";
      outputSample(*out, featureDataIters[file_id2]->operator[](hypo_id2),
                   featureDataIters[file_id1]->operator[](hypo_id1));
      *out << endl;
    }

    //advance all iterators
    for (size_t i = 0; i < featureFiles.size(); ++i) {
      ++featureDataIters[i];
      ++scoreDataIters[i];
    }
    ++sentenceId;
  }

  outFile.close();
}
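// A typical invocation, assuming feature and score statistics produced by
// the Moses extractor (file names are illustrative, and the exact megam
// flags depend on the local setup):
//
//   ./pro --ffile features.dat --scfile scores.dat -o pro.data
//   megam -fvals -maxi 30 -nobias binary pro.data > megam.weights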