2008-05-15 12:35:56 +04:00
|
|
|
/*
 *  Data.h
 *  mert - Minimum Error Rate Training
 *
 *  Created by Nicola Bertoldi on 13/05/08.
 *
 */
|
|
|
|
|
2012-02-20 04:46:08 +04:00
|
|
|
#ifndef MERT_DATA_H_
|
|
|
|
#define MERT_DATA_H_
|
2008-05-15 12:35:56 +04:00
|
|
|
|
|
|
|
#include <vector>
|
2012-03-10 12:47:01 +04:00
|
|
|
#include <boost/shared_ptr.hpp>
|
2008-05-15 12:35:56 +04:00
|
|
|
|
|
|
|
#include "Util.h"
|
|
|
|
#include "FeatureData.h"
|
|
|
|
#include "ScoreData.h"
|
|
|
|
|
2012-06-30 23:23:45 +04:00
|
|
|
namespace MosesTuning
|
|
|
|
{
|
2012-12-06 20:39:22 +04:00
|
|
|
|
2008-05-15 12:35:56 +04:00
|
|
|
class Scorer;
|
|
|
|
|
2012-02-08 21:11:56 +04:00
|
|
|
typedef boost::shared_ptr<ScoreData> ScoreDataHandle;
|
|
|
|
typedef boost::shared_ptr<FeatureData> FeatureDataHandle;
|
|
|
|
|
2012-03-10 12:47:01 +04:00
|
|
|
// NOTE: there is no copy constructor implemented, so only the
|
|
|
|
// compiler synthesised shallow copy is available.
|
2008-05-15 12:35:56 +04:00
|
|
|
class Data
|
|
|
|
{
|
|
|
|
private:
|
2012-03-10 12:12:34 +04:00
|
|
|
Scorer* m_scorer;
|
|
|
|
std::string m_score_type;
|
2012-05-06 00:27:04 +04:00
|
|
|
std::size_t m_num_scores;
|
2012-03-10 12:12:34 +04:00
|
|
|
ScoreDataHandle m_score_data;
|
|
|
|
FeatureDataHandle m_feature_data;
|
2012-05-25 00:11:35 +04:00
|
|
|
SparseVector m_sparse_weights;
|
2011-11-12 17:04:22 +04:00
|
|
|
|
2008-05-15 12:35:56 +04:00
|
|
|
public:
|
2012-05-25 00:11:35 +04:00
|
|
|
explicit Data(Scorer* scorer, const std::string& sparseweightsfile="");
|
2011-02-24 15:42:19 +03:00
|
|
|
|
2012-03-10 12:47:01 +04:00
|
|
|
void clear() {
|
2012-03-10 12:12:34 +04:00
|
|
|
m_score_data->clear();
|
|
|
|
m_feature_data->clear();
|
2011-02-24 15:42:19 +03:00
|
|
|
}
|
|
|
|
|
2012-03-10 12:47:01 +04:00
|
|
|
ScoreDataHandle getScoreData() { return m_score_data; }
|
2011-11-12 04:40:01 +04:00
|
|
|
|
2012-03-10 12:47:01 +04:00
|
|
|
FeatureDataHandle getFeatureData() { return m_feature_data; }
|
2011-02-24 15:42:19 +03:00
|
|
|
|
2012-03-10 12:47:01 +04:00
|
|
|
Scorer* getScorer() { return m_scorer; }
|
2011-09-15 21:45:35 +04:00
|
|
|
|
2012-05-06 00:27:04 +04:00
|
|
|
std::size_t NumberOfFeatures() const {
|
2012-03-10 12:12:34 +04:00
|
|
|
return m_feature_data->NumberOfFeatures();
|
2011-02-24 15:42:19 +03:00
|
|
|
}
|
2011-09-07 20:37:33 +04:00
|
|
|
|
2012-03-10 12:47:01 +04:00
|
|
|
std::string Features() const { return m_feature_data->Features(); }
|
|
|
|
void Features(const std::string &f) { m_feature_data->Features(f); }
|
2011-02-24 15:42:19 +03:00
|
|
|
|
2012-03-10 12:47:01 +04:00
|
|
|
void loadNBest(const std::string &file);
|
2011-02-24 15:42:19 +03:00
|
|
|
|
2012-03-10 12:47:01 +04:00
|
|
|
void load(const std::string &featfile, const std::string &scorefile);
|
2011-02-24 15:42:19 +03:00
|
|
|
|
2012-03-10 12:47:01 +04:00
|
|
|
void save(const std::string &featfile, const std::string &scorefile, bool bin=false);
|
2011-02-24 15:42:19 +03:00
|
|
|
|
2012-03-10 12:47:01 +04:00
|
|
|
//ADDED BY TS
|
|
|
|
void removeDuplicates();
|
|
|
|
//END_ADDED
|
2011-02-24 15:42:19 +03:00
|
|
|
|
2011-11-12 06:26:13 +04:00
|
|
|
inline bool existsFeatureNames() const {
|
2012-03-10 12:12:34 +04:00
|
|
|
return m_feature_data->existsFeatureNames();
|
2011-11-12 04:40:01 +04:00
|
|
|
}
|
2011-02-24 15:42:19 +03:00
|
|
|
|
2012-05-06 00:27:04 +04:00
|
|
|
inline std::string getFeatureName(std::size_t idx) const {
|
2012-03-10 12:12:34 +04:00
|
|
|
return m_feature_data->getFeatureName(idx);
|
2011-11-12 04:40:01 +04:00
|
|
|
}
|
2011-08-03 21:00:17 +04:00
|
|
|
|
2012-05-06 00:27:04 +04:00
|
|
|
inline std::size_t getFeatureIndex(const std::string& name) const {
|
2012-03-10 12:12:34 +04:00
|
|
|
return m_feature_data->getFeatureIndex(name);
|
2011-11-12 04:40:01 +04:00
|
|
|
}
|
2011-08-03 21:00:17 +04:00
|
|
|
|
2011-09-15 21:45:35 +04:00
|
|
|
/**
|
2011-11-12 03:58:23 +04:00
|
|
|
* Create shard_count shards. If shard_size == 0, then the shards are non-overlapping
|
|
|
|
* and exhaust the data. If 0 < shard_size <= 1, then shards are chosen by sampling
|
|
|
|
* the data (with replacement) and shard_size is interpreted as the proportion
|
2011-09-15 21:45:35 +04:00
|
|
|
* of the total size.
|
|
|
|
*/
|
2012-05-06 00:27:04 +04:00
|
|
|
void createShards(std::size_t shard_count, float shard_size, const std::string& scorerconfig,
|
2011-11-12 04:24:19 +04:00
|
|
|
std::vector<Data>& shards);
|
2012-04-04 17:04:51 +04:00
|
|
|
|
|
|
|
// Helper functions for loadnbest();
|
|
|
|
void InitFeatureMap(const std::string& str);
|
|
|
|
void AddFeatures(const std::string& str,
|
2012-12-06 20:39:22 +04:00
|
|
|
int sentence_index);
|
2011-09-15 21:45:35 +04:00
|
|
|
};
|
2008-05-15 12:35:56 +04:00
|
|
|
|
2012-06-30 23:23:45 +04:00
|
|
|
}
|
|
|
|
|
2012-02-20 04:46:08 +04:00
|
|
|
#endif // MERT_DATA_H_
|