2008-05-14 11:57:45 +04:00
|
|
|
/*
|
|
|
|
* FeatureData.h
|
2012-02-20 03:29:53 +04:00
|
|
|
* mert - Minimum Error Rate Training
|
2008-05-14 11:57:45 +04:00
|
|
|
*
|
|
|
|
* Created by Nicola Bertoldi on 13/05/08.
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
|
2012-02-20 04:46:08 +04:00
|
|
|
#ifndef MERT_FEATURE_DATA_H_
|
|
|
|
#define MERT_FEATURE_DATA_H_
|
2008-05-14 11:57:45 +04:00
|
|
|
|
|
|
|
#include <vector>
|
|
|
|
#include <iostream>
|
2011-11-14 10:15:30 +04:00
|
|
|
#include <stdexcept>
|
2014-10-07 15:53:14 +04:00
|
|
|
#include <boost/lexical_cast.hpp>
|
2008-05-14 11:57:45 +04:00
|
|
|
#include "FeatureArray.h"
|
|
|
|
|
2012-06-30 23:23:45 +04:00
|
|
|
namespace MosesTuning
|
|
|
|
{
|
2012-12-06 20:39:22 +04:00
|
|
|
|
2012-06-30 23:23:45 +04:00
|
|
|
|
2008-05-14 11:57:45 +04:00
|
|
|
class FeatureData
|
|
|
|
{
|
|
|
|
private:
|
2012-05-05 20:31:04 +04:00
|
|
|
std::size_t m_num_features;
|
2012-03-10 12:12:34 +04:00
|
|
|
std::string m_features;
|
2012-05-05 20:31:04 +04:00
|
|
|
std::map<std::string, std::size_t> m_feature_name_to_index; // map from name to index of features
|
|
|
|
std::map<std::size_t, std::string> m_index_to_feature_name; // map from index to name of features
|
2012-03-10 12:12:34 +04:00
|
|
|
featdata_t m_array;
|
|
|
|
idx2name m_index_to_array_name; // map from index to name of array
|
|
|
|
name2idx m_array_name_to_index; // map from name to index of array
|
2011-11-14 07:20:04 +04:00
|
|
|
|
2008-05-14 11:57:45 +04:00
|
|
|
public:
|
2011-02-24 15:42:19 +03:00
|
|
|
FeatureData();
|
2011-11-12 04:40:01 +04:00
|
|
|
~FeatureData() {}
|
2011-02-24 15:42:19 +03:00
|
|
|
|
2013-05-29 21:16:15 +04:00
|
|
|
void clear() {
|
|
|
|
m_array.clear();
|
|
|
|
}
|
2011-02-24 15:42:19 +03:00
|
|
|
|
2012-05-25 00:11:35 +04:00
|
|
|
FeatureArray& get(size_t idx) {
|
|
|
|
return m_array.at(idx);
|
2011-02-24 15:42:19 +03:00
|
|
|
}
|
2012-05-25 00:11:35 +04:00
|
|
|
const FeatureArray& get(size_t idx) const {
|
|
|
|
return m_array.at(idx);
|
2011-02-24 15:42:19 +03:00
|
|
|
}
|
|
|
|
|
2012-12-06 20:39:22 +04:00
|
|
|
inline bool exists(int sent_idx) const {
|
|
|
|
return existsInternal(getIndex(sent_idx));
|
2011-02-24 15:42:19 +03:00
|
|
|
}
|
2012-02-01 13:13:00 +04:00
|
|
|
|
2012-12-06 20:39:22 +04:00
|
|
|
inline bool existsInternal(int sent_idx) const {
|
2012-03-10 12:12:34 +04:00
|
|
|
return (sent_idx > -1 && sent_idx < static_cast<int>(m_array.size())) ? true : false;
|
2011-02-24 15:42:19 +03:00
|
|
|
}
|
|
|
|
|
2012-05-05 20:31:04 +04:00
|
|
|
inline FeatureStats& get(std::size_t i, std::size_t j) {
|
2012-03-10 12:12:34 +04:00
|
|
|
return m_array.at(i).get(j);
|
2011-02-24 15:42:19 +03:00
|
|
|
}
|
2012-03-10 12:12:34 +04:00
|
|
|
|
2012-05-05 20:31:04 +04:00
|
|
|
inline const FeatureStats& get(std::size_t i, std::size_t j) const {
|
2012-03-10 12:12:34 +04:00
|
|
|
return m_array.at(i).get(j);
|
2011-02-24 15:42:19 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
void add(FeatureArray& e);
|
2012-12-06 20:39:22 +04:00
|
|
|
void add(FeatureStats& e, int sent_idx);
|
2011-02-24 15:42:19 +03:00
|
|
|
|
2013-05-29 21:16:15 +04:00
|
|
|
std::size_t size() const {
|
|
|
|
return m_array.size();
|
|
|
|
}
|
2012-03-10 12:12:34 +04:00
|
|
|
|
2013-05-29 21:16:15 +04:00
|
|
|
std::size_t NumberOfFeatures() const {
|
|
|
|
return m_num_features;
|
|
|
|
}
|
|
|
|
void NumberOfFeatures(std::size_t v) {
|
|
|
|
m_num_features = v;
|
|
|
|
}
|
2012-03-10 12:12:34 +04:00
|
|
|
|
2013-05-29 21:16:15 +04:00
|
|
|
std::string Features() const {
|
|
|
|
return m_features;
|
|
|
|
}
|
|
|
|
void Features(const std::string& f) {
|
|
|
|
m_features = f;
|
|
|
|
}
|
2011-02-24 15:42:19 +03:00
|
|
|
|
|
|
|
void save(const std::string &file, bool bin=false);
|
2012-03-10 14:04:43 +04:00
|
|
|
void save(std::ostream* os, bool bin=false);
|
2012-03-10 14:27:52 +04:00
|
|
|
void save(bool bin=false);
|
2011-02-24 15:42:19 +03:00
|
|
|
|
2012-05-25 00:11:35 +04:00
|
|
|
void load(std::istream* is, const SparseVector& sparseWeights);
|
2012-01-13 20:52:15 +04:00
|
|
|
void load(const std::string &file, const SparseVector& sparseWeights);
|
2011-02-24 15:42:19 +03:00
|
|
|
|
2011-11-12 06:26:13 +04:00
|
|
|
bool check_consistency() const;
|
2012-03-10 14:27:52 +04:00
|
|
|
|
2011-02-24 15:42:19 +03:00
|
|
|
void setIndex();
|
|
|
|
|
2012-12-06 20:39:22 +04:00
|
|
|
inline int getIndex(int idx) const {
|
2012-03-10 12:12:34 +04:00
|
|
|
name2idx::const_iterator i = m_array_name_to_index.find(idx);
|
|
|
|
if (i != m_array_name_to_index.end())
|
2011-02-24 15:42:19 +03:00
|
|
|
return i->second;
|
|
|
|
else
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
|
2012-12-06 20:39:22 +04:00
|
|
|
inline int getName(std::size_t idx) const {
|
2012-03-10 12:12:34 +04:00
|
|
|
idx2name::const_iterator i = m_index_to_array_name.find(idx);
|
|
|
|
if (i != m_index_to_array_name.end())
|
2014-10-07 15:53:14 +04:00
|
|
|
throw std::runtime_error("there is no entry at index " + boost::lexical_cast<std::string>(idx));
|
2011-02-24 15:42:19 +03:00
|
|
|
return i->second;
|
|
|
|
}
|
|
|
|
|
2011-11-12 06:26:13 +04:00
|
|
|
bool existsFeatureNames() const {
|
2012-03-10 12:12:34 +04:00
|
|
|
return (m_index_to_feature_name.size() > 0) ? true : false;
|
2011-11-12 04:40:01 +04:00
|
|
|
}
|
2011-02-24 15:42:19 +03:00
|
|
|
|
2012-05-05 20:31:04 +04:00
|
|
|
std::string getFeatureName(std::size_t idx) const {
|
2012-03-10 12:12:34 +04:00
|
|
|
if (idx >= m_index_to_feature_name.size())
|
2012-05-10 02:51:05 +04:00
|
|
|
throw std::runtime_error("Error: you required an too big index");
|
2012-05-05 20:31:04 +04:00
|
|
|
std::map<std::size_t, std::string>::const_iterator it = m_index_to_feature_name.find(idx);
|
2012-03-10 12:12:34 +04:00
|
|
|
if (it == m_index_to_feature_name.end()) {
|
2014-10-07 15:53:14 +04:00
|
|
|
throw std::runtime_error("Error: specified id is unknown: " + boost::lexical_cast<std::string>(idx));
|
2011-11-12 06:26:13 +04:00
|
|
|
} else {
|
|
|
|
return it->second;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2012-05-05 20:31:04 +04:00
|
|
|
std::size_t getFeatureIndex(const std::string& name) const {
|
|
|
|
std::map<std::string, std::size_t>::const_iterator it = m_feature_name_to_index.find(name);
|
2012-03-30 13:50:23 +04:00
|
|
|
if (it == m_feature_name_to_index.end()) {
|
|
|
|
std::string msg = "Error: feature " + name + " is unknown. Known features: ";
|
2012-05-10 02:51:05 +04:00
|
|
|
for (std::map<std::string, std::size_t>::const_iterator cit = m_feature_name_to_index.begin();
|
|
|
|
cit != m_feature_name_to_index.end(); cit++) {
|
|
|
|
msg += cit->first;
|
2012-03-30 13:50:23 +04:00
|
|
|
msg += ", ";
|
|
|
|
}
|
|
|
|
|
2012-05-05 20:31:04 +04:00
|
|
|
throw std::runtime_error(msg);
|
2012-03-30 13:50:23 +04:00
|
|
|
}
|
2011-11-12 06:26:13 +04:00
|
|
|
return it->second;
|
2011-11-12 04:40:01 +04:00
|
|
|
}
|
2011-02-24 15:42:19 +03:00
|
|
|
|
2011-11-14 09:00:47 +04:00
|
|
|
void setFeatureMap(const std::string& feat);
|
2012-02-17 02:27:07 +04:00
|
|
|
|
|
|
|
/* For debugging */
|
|
|
|
std::string ToString() const;
|
2008-05-14 11:57:45 +04:00
|
|
|
};
|
|
|
|
|
2012-06-30 23:23:45 +04:00
|
|
|
}
|
|
|
|
|
2012-02-20 04:46:08 +04:00
|
|
|
#endif // MERT_FEATURE_DATA_H_
|