mosesdecoder/mert/Data.cpp

/*
 *  Data.cpp
 *  mert - Minimum Error Rate Training
 *
 *  Created by Nicola Bertoldi on 13/05/08.
 *
 */

#include <algorithm>
#include "util/check.hh"
#include <cmath>
#include <fstream>

#include "Data.h"
#include "FileStream.h"
#include "Scorer.h"
#include "ScorerFactory.h"
#include "Util.h"

// Default constructor: produces a Data object with no scorer attached
// (theScorer is NULL), a score count of zero and the sparse-feature flag
// cleared. scoredata and featdata are value-initialised; score_type is not
// listed and therefore gets its default initialisation.
Data::Data()
    : theScorer(NULL),
      number_of_scores(0),
      _sparse_flag(false),
      scoredata(),
      featdata()
{
}

// Builds a Data object bound to the scorer `ptr`.
//
// The address of `ptr` is stored in theScorer; the caller must keep the scorer
// alive for as long as this object uses it (ownership is not visible from this
// file -- NOTE(review): confirm who deletes the scorer).
// score_type is taken from the scorer's getName(), and fresh ScoreData /
// FeatureData objects are allocated for scoredata / featdata.
//
// The initialisers deliberately read from the constructor argument `ptr`
// rather than from the member theScorer: members are initialised in class
// declaration order, not in the order written here, so going through the
// member would only be correct if theScorer happened to be declared first.
Data::Data(Scorer& ptr)
    : theScorer(&ptr),
      score_type(ptr.getName()),
      number_of_scores(0),
      _sparse_flag(false),
      scoredata(new ScoreData(ptr)),
      featdata(new FeatureData)
{
  TRACE_ERR("Data::score_type " << score_type << std::endl);
  TRACE_ERR("Data::Scorer type from Scorer: " << theScorer->getName() << std::endl);
}

//ADDED BY TS
// Removes repeated entries from every sentence held in featdata / scoredata.
//
// Within one sentence, entry k is a duplicate of an earlier entry j when both
// the feature entries and the score entries compare equal with operator==.
// The first occurrence is kept. A duplicate is exchanged (via swap()) with the
// last still-live entry and the arrays are shortened with resize() at the end,
// so the surviving entries do not necessarily keep their original order.
//
// An empty sentence array is handled: the loop simply does not run.
void Data::remove_duplicates() {

  const size_t nSentences = featdata->size();
  assert(scoredata->size() == nSentences);

  for (size_t s = 0; s < nSentences; s++) {

    FeatureArray& feat_array = featdata->get(s);
    ScoreArray& score_array = scoredata->get(s);

    assert(feat_array.size() == score_array.size());

    // Serves as a hash-map: the sum of the dense feature values is the bucket
    // key, the bucket holds the indices of the entries kept so far. Equal
    // feature vectors always land in the same bucket; the exact comparison
    // below resolves collisions.
    std::map<double, std::vector<size_t> > lookup;

    // Entries [0, live) are kept or still to be examined; entries at
    // index >= live are duplicates waiting to be cut off by resize().
    size_t live = feat_array.size();
    size_t nRemoved = 0;

    size_t k = 0;
    while (k < live) {

      const FeatureStats& cur_feats = feat_array.get(k);

      double sum = 0.0;
      for (size_t l = 0; l < cur_feats.size(); l++)
        sum += cur_feats.get(l);

      // operator[] creates an empty bucket the first time a sum is seen, so a
      // single lookup covers both the "new key" and "existing key" cases.
      std::vector<size_t>& bucket = lookup[sum];

      bool is_duplicate = false;
      for (size_t b = 0; b < bucket.size(); b++) {

        const size_t j = bucket[b];

        if (cur_feats == feat_array.get(j)
            && score_array.get(k) == score_array.get(j)) {
          is_duplicate = true;
          break;
        }
      }

      if (!is_duplicate) {
        bucket.push_back(k);
        k++;
        continue;
      }

      // Retire entry k: shrink the live range and, unless k already was the
      // last live entry, move that last entry into slot k. k is not advanced,
      // so the entry that was just moved in is examined on the next pass.
      live--;
      nRemoved++;
      if (k < live) {
        feat_array.swap(k, live);
        score_array.swap(k, live);
      }
    }

    if (nRemoved > 0) {
      feat_array.resize(live);
      score_array.resize(live);
    }
  }
}
//END_ADDED


// Loads an n-best list from `file` into scoredata and featdata.
//
// Every non-empty line is split on "|||" into three leading fields:
//   1. sentence index, 2. candidate sentence, 3. feature string.
// The scorer turns (index, sentence) into score statistics, which are added to
// scoredata under the sentence index. The feature string is parsed by
// AddFeatures(); if no feature names are known yet (existsFeatureNames() is
// false) InitFeatureMap() is first called on that line's feature string.
//
// Throws runtime_error when the input stream is not good() after opening.
void Data::loadnbest(const std::string &file)
{
  TRACE_ERR("loading nbest from " << file << std::endl);

  inputfilestream inp(file); // matches a stream with a file. Opens the file

  if (!inp.good())
    throw runtime_error("Unable to open: " + file);

  ScoreStats scoreentry;   // reused for every line, cleared before each use
  std::string stringBuf;   // current line; consumed field by field
  std::string sentence_index, sentence, feature_str;

  while (getline(inp, stringBuf, '\n')) {
    if (stringBuf.empty()) continue;
    // adding statistics for error measures
    scoreentry.clear();

    getNextPound(stringBuf, sentence_index, "|||"); // first field
    getNextPound(stringBuf, sentence, "|||");       // second field
    getNextPound(stringBuf, feature_str, "|||");    // third field

    theScorer->prepareStats(sentence_index, sentence, scoreentry);
    scoredata->add(scoreentry, sentence_index);

    // examine first line for name of features
    if (!existsFeatureNames()) {
      InitFeatureMap(feature_str);
    }
    AddFeatures(feature_str, sentence_index);
  }
  inp.close();
}

// Derives the list of dense feature names from one feature string and hands it
// to featdata->setFeatureMap().
//
// `str` is consumed token by token with getNextPound() (using that helper's
// default delimiter, which is declared outside this file). Tokens are handled
// as follows:
//   * token whose last character is not ':'  -> a feature value; emits the
//     name "<current label>_<running index> " and increments the index;
//   * token ending in ':' that contains '_'  -> sparse feature name; it and
//     the token after it (its value) are ignored;
//   * any other token ending in ':'          -> new current label (the ':' is
//     stripped) and the running index restarts at 0.
// An empty token takes the last branch, exactly like a label.
//
// Names are appended straight to the result string, so labels of any length
// are kept intact; only the numeric index goes through a fixed buffer.
void Data::InitFeatureMap(const string& str) {
  string buf = str;
  string substr;
  string features;
  string tmp_name;
  size_t tmp_index = 0;
  char index_buf[32];                   // decimal digits of tmp_index only

  while (!buf.empty()) {
    getNextPound(buf, substr);

    // string ending with ":" are skipped, because they are the names of the features
    if (substr.find_last_of(":") != substr.length()-1) {
      // size_t is not guaranteed to be unsigned long, so convert explicitly
      // to match the %lu conversion.
      snprintf(index_buf, sizeof(index_buf), "%lu",
               static_cast<unsigned long>(tmp_index));
      features.append(tmp_name);
      features.append("_");
      features.append(index_buf);
      features.append(" ");

      tmp_index++;
    } else if (substr.find("_") != string::npos) {
      // ignore sparse feature name and its value
      getNextPound(buf, substr);
    } else {                              // update current feature name
      tmp_index = 0;
      tmp_name = substr.substr(0, substr.size() - 1);
    }
  }
  featdata->setFeatureMap(features);
}

void Data::AddFeatures(const string& str,
                       const string& sentence_index) {
  string::size_type loc;
  string buf = str;
  string substr;
  FeatureStats feature_entry;
  feature_entry.reset();

  while (!buf.empty()) {
    getNextPound(buf, substr);

    // no ':' -> feature value that needs to be stored
    if ((loc = substr.find_last_of(":")) != substr.length()-1) {
      feature_entry.add(ConvertStringToFeatureStatsType(substr));
    } else if (substr.find("_") != string::npos) {
      // sparse feature name? store as well
      std::string name = substr;
      getNextPound(buf, substr);
      feature_entry.addSparse(name, atof(substr.c_str()));
      _sparse_flag = true;
    }
  }
  featdata->add(feature_entry, sentence_index);
}

// TODO
// Not implemented: writes an error message to std::cerr and terminates the
// process with exit status 1.
void Data::mergeSparseFeatures() {
  static const char kSparseUnsupported[] =
      "ERROR: sparse features can only be trained with pairwise ranked optimizer (PRO), not traditional MERT\n";

  std::cerr << kSparseUnsupported;
  exit(1);
}

void Data::createShards(size_t shard_count, float shard_size, const string& scorerconfig,
                        std::vector<Data>& shards)
{
  CHECK(shard_count);
  CHECK(shard_size >= 0);
  CHECK(shard_size <= 1);

  size_t data_size = scoredata->size();
  CHECK(data_size == featdata->size());

  shard_size *= data_size;

  for (size_t shard_id = 0; shard_id < shard_count; ++shard_id) {
    vector<size_t> shard_contents;
    if (shard_size == 0) {
      //split into roughly equal size shards
      const size_t shard_start = floor(0.5 + shard_id * static_cast<float>(data_size) / shard_count);
      const size_t shard_end = floor(0.5 + (shard_id + 1) * static_cast<float>(data_size) / shard_count);
      for (size_t i = shard_start; i < shard_end; ++i) {
        shard_contents.push_back(i);
      }
    } else {
      //create shards by randomly sampling
      for (size_t i = 0; i < floor(shard_size+0.5); ++i) {
        shard_contents.push_back(rand() % data_size);
      }
    }

    Scorer* scorer = ScorerFactory::getScorer(score_type, scorerconfig);

    shards.push_back(Data(*scorer));
    shards.back().score_type = score_type;
    shards.back().number_of_scores = number_of_scores;
    shards.back()._sparse_flag = _sparse_flag;
    for (size_t i = 0; i < shard_contents.size(); ++i) {
      shards.back().featdata->add(featdata->get(shard_contents[i]));
      shards.back().scoredata->add(scoredata->get(shard_contents[i]));
    }
    //cerr << endl;
  }
}
main command for managing feature and error statistics is ready; small example fortesting is available in directory example git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@1692 1f5c12ca-751b-0410-a591-d2e778427230 2008-05-15 12:35:56 +04:00			`/*`
			`* Data.cpp`
Fix typo. 2012-02-20 03:29:53 +04:00			`* mert - Minimum Error Rate Training`
main command for managing feature and error statistics is ready; small example fortesting is available in directory example git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@1692 1f5c12ca-751b-0410-a591-d2e778427230 2008-05-15 12:35:56 +04:00			`*`
			`* Created by Nicola Bertoldi on 13/05/08.`
			`*`
			`*/`

Add missing headers. 2011-11-14 14:52:21 +04:00			`#include <algorithm>`
Replace assert with CHECK until people learn how to use assert properly 2011-11-18 16:07:41 +04:00			`#include "util/check.hh"`
Minimize using #include headers in headers. Should use it in .cpp files. 2011-11-14 10:15:30 +04:00			`#include <cmath>`
main command for managing feature and error statistics is ready; small example fortesting is available in directory example git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@1692 1f5c12ca-751b-0410-a591-d2e778427230 2008-05-15 12:35:56 +04:00			`#include <fstream>`
Implementation of sharding and resampling in mert. git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@4226 1f5c12ca-751b-0410-a591-d2e778427230 2011-09-15 21:45:35 +04:00
Move filestream wrapper classes to FileStream.{h,cpp} 2011-11-12 06:44:39 +04:00			`#include "Data.h"`
			`#include "FileStream.h"`
main command for managing feature and error statistics is ready; small example fortesting is available in directory example git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@1692 1f5c12ca-751b-0410-a591-d2e778427230 2008-05-15 12:35:56 +04:00			`#include "Scorer.h"`
Implementation of sharding and resampling in mert. git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@4226 1f5c12ca-751b-0410-a591-d2e778427230 2011-09-15 21:45:35 +04:00			`#include "ScorerFactory.h"`
main command for managing feature and error statistics is ready; small example fortesting is available in directory example git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@1692 1f5c12ca-751b-0410-a591-d2e778427230 2008-05-15 12:35:56 +04:00			`#include "Util.h"`

Fix bugs, private members were not initialized when the instances are created. When the objects of Data and FeatureData classes are created, the primitive private members were not initialized properly. 2011-11-14 07:20:04 +04:00			`Data::Data()`
			`: theScorer(NULL),`
			`number_of_scores(0),`
			`_sparse_flag(false),`
Fix sharding bug 2012-02-08 21:11:56 +04:00			`scoredata(),`
			`featdata() {}`
Fix bugs, private members were not initialized when the instances are created. When the objects of Data and FeatureData classes are created, the primitive private members were not initialized properly. 2011-11-14 07:20:04 +04:00
Small change: modify initialization of the Data class. 2011-11-12 17:04:22 +04:00			`Data::Data(Scorer& ptr)`
			`: theScorer(&ptr),`
			`score_type(theScorer->getName()),`
			`number_of_scores(0),`
			`_sparse_flag(false),`
			`scoredata(new ScoreData(*theScorer)),`
			`featdata(new FeatureData)`
main command for managing feature and error statistics is ready; small example fortesting is available in directory example git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@1692 1f5c12ca-751b-0410-a591-d2e778427230 2008-05-15 12:35:56 +04:00			`{`
run beautify.perl. Consistent formatting for .h & .cpp files git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3899 1f5c12ca-751b-0410-a591-d2e778427230 2011-02-24 15:42:19 +03:00			`TRACE_ERR("Data::score_type " << score_type << std::endl);`
			`TRACE_ERR("Data::Scorer type from Scorer: " << theScorer->getName() << endl);`
Fix memory leaks in extractor. 2011-11-11 14:11:10 +04:00			`}`

revert 2011-12-12 17:48:42 +04:00			`//ADDED BY TS`
			`void Data::remove_duplicates() {`

uint -> size_t 2011-12-12 20:27:27 +04:00			`size_t nSentences = featdata->size();`
revert 2011-12-12 17:48:42 +04:00			`assert(scoredata->size() == nSentences);`

uint -> size_t 2011-12-12 20:27:27 +04:00			`for (size_t s=0; s < nSentences; s++) {`
revert 2011-12-12 17:48:42 +04:00
			`FeatureArray& feat_array = featdata->get(s);`
			`ScoreArray& score_array = scoredata->get(s);`

			`assert(feat_array.size() == score_array.size());`

			`//serves as a hash-map:`
uint -> size_t 2011-12-12 20:27:27 +04:00			`std::map<double, std::vector<size_t> > lookup;`
revert 2011-12-12 17:48:42 +04:00
uint -> size_t 2011-12-12 20:27:27 +04:00			`size_t end_pos = feat_array.size() - 1;`
revert 2011-12-12 17:48:42 +04:00
uint -> size_t 2011-12-12 20:27:27 +04:00			`size_t nRemoved = 0;`
			`for (size_t k=0; k <= end_pos; k++) {`
revert 2011-12-12 17:48:42 +04:00
			`const FeatureStats& cur_feats = feat_array.get(k);`

			`double sum = 0.0;`
uint -> size_t 2011-12-12 20:27:27 +04:00			`for (size_t l=0; l < cur_feats.size(); l++)`
revert 2011-12-12 17:48:42 +04:00			`sum += cur_feats.get(l);`

			`if (lookup.find(sum) != lookup.end()) {`

			`//std::cerr << "hit" << std::endl;`

uint -> size_t 2011-12-12 20:27:27 +04:00			`std::vector<size_t>& cur_list = lookup[sum];`
revert 2011-12-12 17:48:42 +04:00
uint -> size_t 2011-12-12 20:27:27 +04:00			`size_t l=0;`
revert 2011-12-12 17:48:42 +04:00			`for (l=0; l < cur_list.size(); l++) {`
Clean up Data::loadnbest(). Add helper functions. 2012-03-07 02:01:28 +04:00
uint -> size_t 2011-12-12 20:27:27 +04:00			`size_t j=cur_list[l];`
revert 2011-12-12 17:48:42 +04:00
			`if (cur_feats == feat_array.get(j)`
			`&& score_array.get(k) == score_array.get(j)) {`

			`if (k < end_pos) {`
Clean up Data::loadnbest(). Add helper functions. 2012-03-07 02:01:28 +04:00
revert 2011-12-12 17:48:42 +04:00			`feat_array.swap(k,end_pos);`
			`score_array.swap(k,end_pos);`
Clean up Data::loadnbest(). Add helper functions. 2012-03-07 02:01:28 +04:00
revert 2011-12-12 17:48:42 +04:00			`k--;`
			`}`
Clean up Data::loadnbest(). Add helper functions. 2012-03-07 02:01:28 +04:00
revert 2011-12-12 17:48:42 +04:00			`end_pos--;`
			`nRemoved++;`
			`break;`
			`}`
			`}`

			`if (l == lookup[sum].size())`
			`cur_list.push_back(k);`
			`}`
			`else`
			`lookup[sum].push_back(k);`

uint -> size_t 2011-12-12 20:27:27 +04:00			`// for (size_t j=0; j < k; j++) {`
revert 2011-12-12 17:48:42 +04:00
			`// if (feat_array.get(k) == feat_array.get(j)`
			`// && score_array.get(k) == score_array.get(j)) {`

			`// if (k < end_pos) {`

			`// feat_array.swap(k,end_pos);`
			`// score_array.swap(k,end_pos);`

			`// k--;`
			`// }`

			`// end_pos--;`
			`// nRemoved++;`
			`// break;`
			`// }`
			`// }`
			`}`


			`if (nRemoved > 0) {`

			`feat_array.resize(end_pos+1);`
			`score_array.resize(end_pos+1);`
			`}`
			`}`
			`}`
			`//END_ADDED`


main command for managing feature and error statistics is ready; small example fortesting is available in directory example git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@1692 1f5c12ca-751b-0410-a591-d2e778427230 2008-05-15 12:35:56 +04:00			`void Data::loadnbest(const std::string &file)`
			`{`
run beautify.perl. Consistent formatting for .h & .cpp files git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3899 1f5c12ca-751b-0410-a591-d2e778427230 2011-02-24 15:42:19 +03:00			`TRACE_ERR("loading nbest from " << file << std::endl);`
main command for managing feature and error statistics is ready; small example fortesting is available in directory example git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@1692 1f5c12ca-751b-0410-a591-d2e778427230 2008-05-15 12:35:56 +04:00
run beautify.perl. Consistent formatting for .h & .cpp files git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3899 1f5c12ca-751b-0410-a591-d2e778427230 2011-02-24 15:42:19 +03:00			`ScoreStats scoreentry;`
main command for managing feature and error statistics is ready; small example fortesting is available in directory example git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@1692 1f5c12ca-751b-0410-a591-d2e778427230 2008-05-15 12:35:56 +04:00
run beautify.perl. Consistent formatting for .h & .cpp files git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3899 1f5c12ca-751b-0410-a591-d2e778427230 2011-02-24 15:42:19 +03:00			`inputfilestream inp(file); // matches a stream with a file. Opens the file`
main command for managing feature and error statistics is ready; small example fortesting is available in directory example git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@1692 1f5c12ca-751b-0410-a591-d2e778427230 2008-05-15 12:35:56 +04:00
run beautify.perl. Consistent formatting for .h & .cpp files git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3899 1f5c12ca-751b-0410-a591-d2e778427230 2011-02-24 15:42:19 +03:00			`if (!inp.good())`
			`throw runtime_error("Unable to open: " + file);`
main command for managing feature and error statistics is ready; small example fortesting is available in directory example git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@1692 1f5c12ca-751b-0410-a591-d2e778427230 2008-05-15 12:35:56 +04:00
Clean up Data::loadnbest(). Add helper functions. 2012-03-07 02:01:28 +04:00			`std::string subsubstring, stringBuf;`
			`std::string sentence_index, sentence, feature_str;`
run beautify.perl. Consistent formatting for .h & .cpp files git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3899 1f5c12ca-751b-0410-a591-d2e778427230 2011-02-24 15:42:19 +03:00			`std::string::size_type loc;`
main command for managing feature and error statistics is ready; small example fortesting is available in directory example git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@1692 1f5c12ca-751b-0410-a591-d2e778427230 2008-05-15 12:35:56 +04:00
run beautify.perl. Consistent formatting for .h & .cpp files git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3899 1f5c12ca-751b-0410-a591-d2e778427230 2011-02-24 15:42:19 +03:00			`while (getline(inp,stringBuf,'\n')) {`
			`if (stringBuf.empty()) continue;`
Fix indentation. 2011-11-12 04:24:19 +04:00			`// adding statistics for error measures`
run beautify.perl. Consistent formatting for .h & .cpp files git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3899 1f5c12ca-751b-0410-a591-d2e778427230 2011-02-24 15:42:19 +03:00			`scoreentry.clear();`

Clean up Data::loadnbest(). Add helper functions. 2012-03-07 02:01:28 +04:00			`getNextPound(stringBuf, sentence_index, "\|\|\|"); // first field`
			`getNextPound(stringBuf, sentence, "\|\|\|"); // second field`
			`getNextPound(stringBuf, feature_str, "\|\|\|"); // third field`
run beautify.perl. Consistent formatting for .h & .cpp files git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3899 1f5c12ca-751b-0410-a591-d2e778427230 2011-02-24 15:42:19 +03:00
Clean up Data::loadnbest(). Add helper functions. 2012-03-07 02:01:28 +04:00			`theScorer->prepareStats(sentence_index, sentence, scoreentry);`
run beautify.perl. Consistent formatting for .h & .cpp files git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3899 1f5c12ca-751b-0410-a591-d2e778427230 2011-02-24 15:42:19 +03:00			`scoredata->add(scoreentry, sentence_index);`

support for sparse feature functions (mert support only when using PRO) git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@4184 1f5c12ca-751b-0410-a591-d2e778427230 2011-09-07 20:37:33 +04:00			`// examine first line for name of features`
run beautify.perl. Consistent formatting for .h & .cpp files git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3899 1f5c12ca-751b-0410-a591-d2e778427230 2011-02-24 15:42:19 +03:00			`if (!existsFeatureNames()) {`
Clean up Data::loadnbest(). Add helper functions. 2012-03-07 02:01:28 +04:00			`InitFeatureMap(feature_str);`
run beautify.perl. Consistent formatting for .h & .cpp files git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3899 1f5c12ca-751b-0410-a591-d2e778427230 2011-02-24 15:42:19 +03:00			`}`
Clean up Data::loadnbest(). Add helper functions. 2012-03-07 02:01:28 +04:00			`AddFeatures(feature_str, sentence_index);`
			`}`
			`inp.close();`
			`}`
run beautify.perl. Consistent formatting for .h & .cpp files git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3899 1f5c12ca-751b-0410-a591-d2e778427230 2011-02-24 15:42:19 +03:00
Clean up Data::loadnbest(). Add helper functions. 2012-03-07 02:01:28 +04:00			`void Data::InitFeatureMap(const string& str) {`
			`string buf = str;`
			`string substr;`
			`string features = "";`
			`string tmp_name = "";`
			`size_t tmp_index = 0;`
			`string::size_type loc;`
			`char tmp[64]; // for snprintf();`

			`while (!buf.empty()) {`
			`getNextPound(buf, substr);`

			`// string ending with ":" are skipped, because they are the names of the features`
			`if ((loc = substr.find_last_of(":")) != substr.length()-1) {`
			`snprintf(tmp, sizeof(tmp), "%s_%lu ", tmp_name.c_str(), tmp_index);`
			`features.append(tmp);`

			`tmp_index++;`
			`} else if (substr.find("_") != string::npos) {`
			`// ignore sparse feature name and its value`
			`getNextPound(buf, substr);`
			`} else { // update current feature name`
			`tmp_index = 0;`
			`tmp_name = substr.substr(0, substr.size() - 1);`
			`}`
			`}`
			`featdata->setFeatureMap(features);`
			`}`
reading from textual gzipped file is now possible git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@1786 1f5c12ca-751b-0410-a591-d2e778427230 2008-05-20 18:15:30 +04:00
Clean up Data::loadnbest(). Add helper functions. 2012-03-07 02:01:28 +04:00			`void Data::AddFeatures(const string& str,`
			`const string& sentence_index) {`
			`string::size_type loc;`
			`string buf = str;`
			`string substr;`
			`FeatureStats feature_entry;`
			`feature_entry.reset();`

			`while (!buf.empty()) {`
			`getNextPound(buf, substr);`

			`// no ':' -> feature value that needs to be stored`
			`if ((loc = substr.find_last_of(":")) != substr.length()-1) {`
			`feature_entry.add(ConvertStringToFeatureStatsType(substr));`
			`} else if (substr.find("_") != string::npos) {`
support for sparse feature functions (mert support only when using PRO) git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@4184 1f5c12ca-751b-0410-a591-d2e778427230 2011-09-07 20:37:33 +04:00			`// sparse feature name? store as well`
Clean up Data::loadnbest(). Add helper functions. 2012-03-07 02:01:28 +04:00			`std::string name = substr;`
			`getNextPound(buf, substr);`
			`feature_entry.addSparse(name, atof(substr.c_str()));`
			`_sparse_flag = true;`
run beautify.perl. Consistent formatting for .h & .cpp files git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3899 1f5c12ca-751b-0410-a591-d2e778427230 2011-02-24 15:42:19 +03:00			`}`
			`}`
Clean up Data::loadnbest(). Add helper functions. 2012-03-07 02:01:28 +04:00			`featdata->add(feature_entry, sentence_index);`
Implementation of Cer et al mert regularisation. Use with argument such as --scconfig regtype:min,regwin:3 in extractor and mert. Only tested on toy example so far. git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@1860 1f5c12ca-751b-0410-a591-d2e778427230 2008-06-24 23:27:18 +04:00			`}`

support for sparse feature functions (mert support only when using PRO) git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@4184 1f5c12ca-751b-0410-a591-d2e778427230 2011-09-07 20:37:33 +04:00			`// TODO`
Fix indentation. 2011-11-12 04:24:19 +04:00			`void Data::mergeSparseFeatures() {`
support for sparse feature functions (mert support only when using PRO) git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@4184 1f5c12ca-751b-0410-a591-d2e778427230 2011-09-07 20:37:33 +04:00			`std::cerr << "ERROR: sparse features can only be trained with pairwise ranked optimizer (PRO), not traditional MERT\n";`
			`exit(1);`
			`}`

Implementation of sharding and resampling in mert. git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@4226 1f5c12ca-751b-0410-a591-d2e778427230 2011-09-15 21:45:35 +04:00			`void Data::createShards(size_t shard_count, float shard_size, const string& scorerconfig,`
Use passing objects by const references not passing by their values. 2011-11-14 09:00:47 +04:00			`std::vector<Data>& shards)`
Implementation of sharding and resampling in mert. git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@4226 1f5c12ca-751b-0410-a591-d2e778427230 2011-09-15 21:45:35 +04:00			`{`
Replace assert with CHECK until people learn how to use assert properly 2011-11-18 16:07:41 +04:00			`CHECK(shard_count);`
			`CHECK(shard_size >= 0);`
			`CHECK(shard_size <= 1);`
Implementation of sharding and resampling in mert. git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@4226 1f5c12ca-751b-0410-a591-d2e778427230 2011-09-15 21:45:35 +04:00
			`size_t data_size = scoredata->size();`
Replace assert with CHECK until people learn how to use assert properly 2011-11-18 16:07:41 +04:00			`CHECK(data_size == featdata->size());`
Implementation of sharding and resampling in mert. git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@4226 1f5c12ca-751b-0410-a591-d2e778427230 2011-09-15 21:45:35 +04:00
Use passing objects by const references not passing by their values. 2011-11-14 09:00:47 +04:00			`shard_size *= data_size;`
Implementation of sharding and resampling in mert. git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@4226 1f5c12ca-751b-0410-a591-d2e778427230 2011-09-15 21:45:35 +04:00
			`for (size_t shard_id = 0; shard_id < shard_count; ++shard_id) {`
			`vector<size_t> shard_contents;`
			`if (shard_size == 0) {`
			`//split into roughly equal size shards`
Change casts to C++ style casts, and delete unnecessary casts. 2012-02-01 12:17:58 +04:00			`const size_t shard_start = floor(0.5 + shard_id * static_cast<float>(data_size) / shard_count);`
			`const size_t shard_end = floor(0.5 + (shard_id + 1) * static_cast<float>(data_size) / shard_count);`
Implementation of sharding and resampling in mert. git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@4226 1f5c12ca-751b-0410-a591-d2e778427230 2011-09-15 21:45:35 +04:00			`for (size_t i = shard_start; i < shard_end; ++i) {`
			`shard_contents.push_back(i);`
			`}`
			`} else {`
			`//create shards by randomly sampling`
			`for (size_t i = 0; i < floor(shard_size+0.5); ++i) {`
			`shard_contents.push_back(rand() % data_size);`
			`}`
			`}`
Fix memory leaks in mert. 2011-11-11 15:40:59 +04:00
			`Scorer* scorer = ScorerFactory::getScorer(score_type, scorerconfig);`
Implementation of sharding and resampling in mert. git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@4226 1f5c12ca-751b-0410-a591-d2e778427230 2011-09-15 21:45:35 +04:00
			`shards.push_back(Data(*scorer));`
			`shards.back().score_type = score_type;`
			`shards.back().number_of_scores = number_of_scores;`
			`shards.back()._sparse_flag = _sparse_flag;`
			`for (size_t i = 0; i < shard_contents.size(); ++i) {`
			`shards.back().featdata->add(featdata->get(shard_contents[i]));`
			`shards.back().scoredata->add(scoredata->get(shard_contents[i]));`
			`}`
			`//cerr << endl;`
			`}`
			`}`