2008-05-14 11:57:45 +04:00
|
|
|
/*
 *  FeatureStats.cpp
 *  mert - Minimum Error Rate Training
 *
 *  Created by Nicola Bertoldi on 13/05/08.
 *
 */
|
|
|
|
|
|
|
|
#include "FeatureStats.h"

#include <cassert>
#include <cmath>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <sstream>
#include <stdexcept>

#include <boost/functional/hash.hpp>

#include "util/murmur_hash.hh"

#include "Util.h"
|
|
|
|
|
2012-05-10 02:51:05 +04:00
|
|
|
using namespace std;
|
|
|
|
|
2013-05-29 21:16:15 +04:00
|
|
|
namespace
{

// Number of elements the default FeatureStats constructor allocates for
// its dense array.
const int kAvailableSize = 8;

} // namespace
|
2008-06-05 11:23:34 +04:00
|
|
|
|
2012-06-30 23:23:45 +04:00
|
|
|
namespace MosesTuning
|
|
|
|
{
|
2013-05-29 21:16:15 +04:00
|
|
|
|
2012-06-30 23:23:45 +04:00
|
|
|
|
2012-03-10 12:12:34 +04:00
|
|
|
// Storage for SparseVector's static lookup tables. Being static members,
// they are shared by every SparseVector instance:
//   m_name_to_id : feature name -> numeric id
//   m_id_to_name : numeric id   -> feature name
SparseVector::name2id_t SparseVector::m_name_to_id;
SparseVector::id2name_t SparseVector::m_id_to_name;
|
2011-10-14 11:40:53 +04:00
|
|
|
|
2013-05-29 21:16:15 +04:00
|
|
|
// Returns the value stored for the feature called `name`.
// Yields 0 when the name has never been registered in the shared
// name table, or when this vector holds no entry for its id.
FeatureStatsType SparseVector::get(const string& name) const
{
  const name2id_t::const_iterator known = m_name_to_id.find(name);
  if (known != m_name_to_id.end()) {
    const size_t id = known->second;
    return get(id);
  }
  return 0;
}
|
|
|
|
|
2013-05-29 21:16:15 +04:00
|
|
|
// Returns the value stored under feature id `id`, or 0 when this vector
// has no entry for that id.
FeatureStatsType SparseVector::get(size_t id) const
{
  const fvector_t::const_iterator entry = m_fvector.find(id);
  if (entry != m_fvector.end()) {
    return entry->second;
  }
  return 0;
}
|
|
|
|
|
2013-05-29 21:16:15 +04:00
|
|
|
void SparseVector::set(const string& name, FeatureStatsType value)
|
|
|
|
{
|
2012-03-10 12:12:34 +04:00
|
|
|
name2id_t::const_iterator name2id_iter = m_name_to_id.find(name);
|
2011-10-14 11:40:53 +04:00
|
|
|
size_t id = 0;
|
2012-03-10 12:12:34 +04:00
|
|
|
if (name2id_iter == m_name_to_id.end()) {
|
|
|
|
id = m_id_to_name.size();
|
|
|
|
m_id_to_name.push_back(name);
|
|
|
|
m_name_to_id[name] = id;
|
2011-10-14 11:40:53 +04:00
|
|
|
} else {
|
|
|
|
id = name2id_iter->second;
|
|
|
|
}
|
2012-03-10 12:12:34 +04:00
|
|
|
m_fvector[id] = value;
|
2011-10-14 11:40:53 +04:00
|
|
|
}
|
|
|
|
|
2015-01-14 14:07:42 +03:00
|
|
|
void SparseVector::set(size_t id, FeatureStatsType value)
|
|
|
|
{
|
2014-07-21 14:04:43 +04:00
|
|
|
assert(m_id_to_name.size() > id);
|
|
|
|
m_fvector[id] = value;
|
|
|
|
}
|
|
|
|
|
2013-05-29 21:16:15 +04:00
|
|
|
// Writes every stored feature to `out` as "<name><sep><value> " (note the
// trailing space after each entry). Entries whose absolute value is below
// 0.00001 are skipped.
void SparseVector::write(ostream& out, const string& sep) const
{
  for (fvector_t::const_iterator i = m_fvector.begin(); i != m_fvector.end(); ++i) {
    // Use the floating-point overload explicitly: an unqualified abs() can
    // resolve to the C int version and truncate the value before comparing.
    if (std::fabs(i->second) < 0.00001) continue;
    // Bind by reference; there is no need to copy the name for each entry.
    const string& name = m_id_to_name[i->first];
    out << name << sep << i->second << " ";
  }
}
|
|
|
|
|
2013-05-29 21:16:15 +04:00
|
|
|
void SparseVector::clear()
|
|
|
|
{
|
2012-03-10 12:12:34 +04:00
|
|
|
m_fvector.clear();
|
2011-10-14 11:40:53 +04:00
|
|
|
}
|
|
|
|
|
2013-05-29 21:16:15 +04:00
|
|
|
void SparseVector::load(const string& file)
|
|
|
|
{
|
2012-01-13 20:52:15 +04:00
|
|
|
ifstream in(file.c_str());
|
|
|
|
if (!in) {
|
|
|
|
throw runtime_error("Failed to open sparse weights file: " + file);
|
|
|
|
}
|
|
|
|
string line;
|
|
|
|
while(getline(in,line)) {
|
|
|
|
if (line[0] == '#') continue;
|
|
|
|
istringstream linestream(line);
|
|
|
|
string name;
|
|
|
|
float value;
|
|
|
|
linestream >> name;
|
|
|
|
linestream >> value;
|
|
|
|
set(name,value);
|
2011-10-14 11:40:53 +04:00
|
|
|
}
|
2012-01-13 20:52:15 +04:00
|
|
|
}
|
|
|
|
|
2014-07-21 14:04:43 +04:00
|
|
|
// Element-wise addition: for every entry of `rhs`, adds its value to the
// value this vector holds for the same id (a missing entry counts as 0).
SparseVector& SparseVector::operator+=(const SparseVector& rhs)
{
  fvector_t::const_iterator it = rhs.m_fvector.begin();
  while (it != rhs.m_fvector.end()) {
    const FeatureStatsType sum = get(it->first) + (it->second);
    m_fvector[it->first] = sum;
    ++it;
  }
  return *this;
}
|
|
|
|
|
2013-05-29 21:16:15 +04:00
|
|
|
// Element-wise subtraction: for every entry of `rhs`, subtracts its value
// from the value this vector holds for the same id (a missing entry
// counts as 0).
SparseVector& SparseVector::operator-=(const SparseVector& rhs)
{
  fvector_t::const_iterator it = rhs.m_fvector.begin();
  while (it != rhs.m_fvector.end()) {
    const FeatureStatsType difference = get(it->first) - (it->second);
    m_fvector[it->first] = difference;
    ++it;
  }
  return *this;
}
|
|
|
|
|
2013-05-29 21:16:15 +04:00
|
|
|
// Dot product of this vector with `rhs`.
// Walks the entries of this vector and looks each id up in `rhs`; ids
// absent from `rhs` contribute 0.
FeatureStatsType SparseVector::inner_product(const SparseVector& rhs) const
{
  FeatureStatsType total = 0.0;
  fvector_t::const_iterator entry = m_fvector.begin();
  while (entry != m_fvector.end()) {
    total += ((entry->second) * (rhs.get(entry->first)));
    ++entry;
  }
  return total;
}
|
|
|
|
|
2013-05-29 21:16:15 +04:00
|
|
|
// Returns a new vector holding lhs - rhs; neither operand is modified.
SparseVector operator-(const SparseVector& lhs, const SparseVector& rhs)
{
  SparseVector difference(lhs);
  difference -= rhs;
  return difference;
}
|
|
|
|
|
2013-05-29 21:16:15 +04:00
|
|
|
// Dot product of two sparse vectors.
// The member inner_product() walks the entries of the object it is called
// on, so the call is dispatched on whichever operand reports the smaller
// size() (rhs on ties).
FeatureStatsType inner_product(const SparseVector& lhs, const SparseVector& rhs)
{
  if (lhs.size() < rhs.size()) {
    return lhs.inner_product(rhs);
  }
  return rhs.inner_product(lhs);
}
|
|
|
|
|
2013-05-29 21:16:15 +04:00
|
|
|
std::vector<std::size_t> SparseVector::feats() const
|
|
|
|
{
|
2012-05-29 21:38:57 +04:00
|
|
|
std::vector<std::size_t> toRet;
|
|
|
|
for(fvector_t::const_iterator iter = m_fvector.begin();
|
|
|
|
iter!=m_fvector.end();
|
|
|
|
iter++) {
|
|
|
|
toRet.push_back(iter->first);
|
|
|
|
}
|
|
|
|
return toRet;
|
|
|
|
}
|
|
|
|
|
2013-05-29 21:16:15 +04:00
|
|
|
std::size_t SparseVector::encode(const std::string& name)
|
|
|
|
{
|
2012-05-29 21:38:57 +04:00
|
|
|
name2id_t::const_iterator name2id_iter = m_name_to_id.find(name);
|
|
|
|
size_t id = 0;
|
|
|
|
if (name2id_iter == m_name_to_id.end()) {
|
|
|
|
id = m_id_to_name.size();
|
|
|
|
m_id_to_name.push_back(name);
|
|
|
|
m_name_to_id[name] = id;
|
|
|
|
} else {
|
|
|
|
id = name2id_iter->second;
|
|
|
|
}
|
|
|
|
return id;
|
|
|
|
}
|
|
|
|
|
2013-05-29 21:16:15 +04:00
|
|
|
std::string SparseVector::decode(std::size_t id)
|
|
|
|
{
|
2012-05-29 21:38:57 +04:00
|
|
|
return m_id_to_name[id];
|
|
|
|
}
|
|
|
|
|
2013-05-29 21:16:15 +04:00
|
|
|
// Two sparse vectors are equal when their id -> value containers compare
// equal.
bool operator==(SparseVector const& item1, SparseVector const& item2)
{
  const SparseVector::fvector_t& left = item1.m_fvector;
  const SparseVector::fvector_t& right = item2.m_fvector;
  return left == right;
}
|
|
|
|
|
2014-07-21 14:04:43 +04:00
|
|
|
|
2013-05-29 21:16:15 +04:00
|
|
|
// Hash of a sparse vector.
// Starting from seed 0, every (id, value) entry is folded in by chaining
// MurmurHashNative over the raw bytes of the id and then of the value.
// The result therefore depends on the container's iteration order.
std::size_t hash_value(SparseVector const& item)
{
  size_t running = 0;
  SparseVector::fvector_t::const_iterator entry = item.m_fvector.begin();
  while (entry != item.m_fvector.end()) {
    running = util::MurmurHashNative(&(entry->first), sizeof(entry->first), running);
    running = util::MurmurHashNative(&(entry->second), sizeof(entry->second), running);
    ++entry;
  }
  return running;
}
|
|
|
|
|
2014-07-21 14:04:43 +04:00
|
|
|
|
2008-05-16 00:32:37 +04:00
|
|
|
// Creates an empty statistics object: no entries, with room for
// kAvailableSize values. The array contents are left uninitialised.
FeatureStats::FeatureStats()
  : m_available_size(kAvailableSize),
    m_entries(0),
    m_array(new FeatureStatsType[m_available_size])
{
}
|
2011-11-12 13:12:07 +04:00
|
|
|
|
|
|
|
// Creates a statistics object holding `size` entries, with capacity for
// exactly `size` values. GetArraySizeWithBytes() bytes of the array are
// zero-filled (presumably the `size` entries; the helper is defined
// elsewhere).
FeatureStats::FeatureStats(const size_t size)
  : m_available_size(size),
    m_entries(size),
    m_array(new FeatureStatsType[m_available_size])
{
  memset(m_array, 0, GetArraySizeWithBytes());
}
|
|
|
|
|
2008-06-05 11:23:34 +04:00
|
|
|
// Releases the dense value array owned by this object.
FeatureStats::~FeatureStats()
{
  delete [] m_array;
}
|
2008-05-14 11:57:45 +04:00
|
|
|
|
2011-11-12 13:12:07 +04:00
|
|
|
void FeatureStats::Copy(const FeatureStats &stats)
|
2008-06-05 11:23:34 +04:00
|
|
|
{
|
2012-03-10 12:12:34 +04:00
|
|
|
m_available_size = stats.available();
|
|
|
|
m_entries = stats.size();
|
|
|
|
m_array = new FeatureStatsType[m_available_size];
|
|
|
|
memcpy(m_array, stats.getArray(), GetArraySizeWithBytes());
|
|
|
|
m_map = stats.getSparse();
|
2011-11-12 04:40:01 +04:00
|
|
|
}
|
2008-05-14 11:57:45 +04:00
|
|
|
|
2011-11-12 13:12:07 +04:00
|
|
|
// Copy constructor: builds a deep copy of `stats` via Copy().
FeatureStats::FeatureStats(const FeatureStats &stats)
{
  Copy(stats);
}
|
2008-05-14 11:57:45 +04:00
|
|
|
|
2011-11-12 13:12:07 +04:00
|
|
|
// Copy assignment: replaces the contents of this object with a deep copy
// of `stats`.
//
// Self-assignment is a no-op. The previous array is released only after
// the copy has succeeded; if Copy() throws, the original array and sizes
// are restored so the object never holds a dangling pointer.
FeatureStats& FeatureStats::operator=(const FeatureStats &stats)
{
  // Without this guard the array would be freed and then read back as the
  // copy source.
  if (this == &stats) {
    return *this;
  }

  const size_t old_available = m_available_size;
  const size_t old_entries = m_entries;
  featstats_t old_array = m_array;

  try {
    Copy(stats);
  } catch (...) {
    // Copy() may have failed before or after installing a new array.
    if (m_array != old_array) {
      delete [] m_array;
    }
    m_array = old_array;
    m_available_size = old_available;
    m_entries = old_entries;
    throw;
  }

  delete [] old_array;
  return *this;
}
|
|
|
|
|
2008-06-05 11:23:34 +04:00
|
|
|
void FeatureStats::expand()
|
|
|
|
{
|
2012-03-10 12:12:34 +04:00
|
|
|
m_available_size *= 2;
|
|
|
|
featstats_t t_ = new FeatureStatsType[m_available_size];
|
|
|
|
memcpy(t_, m_array, GetArraySizeWithBytes());
|
|
|
|
delete [] m_array;
|
|
|
|
m_array = t_;
|
2008-06-05 11:23:34 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
// Appends `v` to the dense array, growing the array first when isfull()
// reports that it is full.
void FeatureStats::add(FeatureStatsType v)
{
  if (isfull()) {
    expand();
  }
  m_array[m_entries] = v;
  ++m_entries;
}
|
|
|
|
|
2011-11-14 09:00:47 +04:00
|
|
|
void FeatureStats::addSparse(const string& name, FeatureStatsType v)
|
2011-09-07 20:37:33 +04:00
|
|
|
{
|
2012-03-10 12:12:34 +04:00
|
|
|
m_map.set(name,v);
|
2011-09-07 20:37:33 +04:00
|
|
|
}
|
|
|
|
|
2012-05-25 00:11:35 +04:00
|
|
|
// Rebuilds this object from a textual feature list.
//
// After reset(), tokens are pulled from `theString` with getNextPound()
// until the string is empty (the string is consumed in the process):
//   - a token without '=' is a regular feature and is appended to the
//     dense array;
//   - a token containing '=' is a sparse feature: the text before the last
//     '=' is the name and the text after it is the value.
//
// If `sparseWeights` is non-empty, the sparse features are then collapsed
// into one extra dense value (their inner product with `sparseWeights`)
// and the sparse map is cleared.
void FeatureStats::set(string &theString, const SparseVector& sparseWeights )
{
  string substring;
  reset();

  while (!theString.empty()) {
    getNextPound(theString, substring);

    // One backward scan both detects a sparse feature and locates the last
    // '=' that separates its name from its value.
    const size_t separator = substring.rfind('=');
    if (separator == string::npos) {
      // regular feature
      add(ConvertStringToFeatureStatsType(substring));
    } else {
      // sparse feature
      addSparse(substring.substr(0, separator),
                atof(substring.substr(separator + 1).c_str()));
    }
  }

  if (sparseWeights.size()) {
    // Merge the sparse features
    FeatureStatsType merged = inner_product(sparseWeights, m_map);
    add(merged);
    m_map.clear();
  }
}
|
|
|
|
|
2012-03-10 14:04:43 +04:00
|
|
|
// Reads GetArraySizeWithBytes() raw bytes from `is` straight into the
// dense array. The stream state is not checked here.
void FeatureStats::loadbin(istream* is)
{
  char* const raw = reinterpret_cast<char*>(m_array);
  const streamsize byte_count = static_cast<streamsize>(GetArraySizeWithBytes());
  is->read(raw, byte_count);
}
|
2008-06-05 11:23:34 +04:00
|
|
|
|
2012-05-25 00:11:35 +04:00
|
|
|
// Reads one line of text from `is` and parses it with set(), merging
// sparse features against `sparseWeights`.
void FeatureStats::loadtxt(istream* is, const SparseVector& sparseWeights)
{
  string text;
  getline(*is, text);
  set(text, sparseWeights);
}
|
|
|
|
|
2012-03-10 14:04:43 +04:00
|
|
|
void FeatureStats::savetxt(const string &file)
|
2008-05-14 11:57:45 +04:00
|
|
|
{
|
2012-03-10 14:04:43 +04:00
|
|
|
ofstream ofs(file.c_str(), ios::out);
|
|
|
|
ostream* os = &ofs;
|
|
|
|
savetxt(os);
|
2008-05-14 11:57:45 +04:00
|
|
|
}
|
|
|
|
|
2012-03-10 14:04:43 +04:00
|
|
|
// Writes this object in text form to `*os` using operator<<.
void FeatureStats::savetxt(ostream* os)
{
  ostream& out = *os;
  out << *this;
}
|
|
|
|
|
2013-05-29 21:16:15 +04:00
|
|
|
void FeatureStats::savetxt()
|
|
|
|
{
|
2012-03-10 14:04:43 +04:00
|
|
|
savetxt(&cout);
|
2008-05-14 11:57:45 +04:00
|
|
|
}
|
|
|
|
|
2012-03-10 14:04:43 +04:00
|
|
|
// Writes GetArraySizeWithBytes() raw bytes of the dense array to `os`.
// The sparse map is not written by this function.
void FeatureStats::savebin(ostream* os)
{
  const char* const raw = reinterpret_cast<char*>(m_array);
  const streamsize byte_count = static_cast<streamsize>(GetArraySizeWithBytes());
  os->write(raw, byte_count);
}
|
2008-05-14 11:57:45 +04:00
|
|
|
|
2011-02-24 15:42:19 +03:00
|
|
|
// Text output: every dense value followed by a single space, then the
// sparse features as produced by SparseVector::write() with an empty
// separator.
ostream& operator<<(ostream& o, const FeatureStats& e)
{
  // print regular features
  const size_t dense_count = e.size();
  for (size_t index = 0; index < dense_count; ++index) {
    o << e.get(index) << " ";
  }

  // sparse features
  e.getSparse().write(o, "");

  return o;
}
|
2011-12-12 17:48:42 +04:00
|
|
|
|
2013-05-29 21:16:15 +04:00
|
|
|
bool operator==(const FeatureStats& f1, const FeatureStats& f2)
|
|
|
|
{
|
2011-12-12 17:48:42 +04:00
|
|
|
size_t size = f1.size();
|
|
|
|
|
|
|
|
if (size != f2.size())
|
|
|
|
return false;
|
|
|
|
|
|
|
|
for (size_t k=0; k < size; k++) {
|
|
|
|
if (f1.get(k) != f2.get(k))
|
|
|
|
return false;
|
|
|
|
}
|
2012-03-10 12:12:34 +04:00
|
|
|
|
2011-12-12 17:48:42 +04:00
|
|
|
return true;
|
|
|
|
}
|
2012-06-30 23:23:45 +04:00
|
|
|
|
|
|
|
}
|