2008-05-14 11:57:45 +04:00
|
|
|
/*
 * FeatureStats.cpp
 * mert - Minimum Error Rate Training
 *
 * Created by Nicola Bertoldi on 13/05/08.
 *
 */
|
|
|
|
|
|
|
|
#include "FeatureStats.h"
|
|
|
|
|
2011-11-14 10:15:30 +04:00
|
|
|
#include <cmath>
|
|
|
|
#include "Util.h"
|
|
|
|
|
2011-11-12 12:30:33 +04:00
|
|
|
namespace {

// Capacity (number of FeatureStatsType slots) that the FeatureStats default
// constructor reserves for the dense feature array.
const int kAvailableSize = 8;

}  // namespace
|
2008-06-05 11:23:34 +04:00
|
|
|
|
2011-10-14 11:40:53 +04:00
|
|
|
// Definitions of SparseVector's static lookup tables. Being static members,
// they are shared by every SparseVector instance: name2id_ maps a feature
// name to its numeric id and id2name_ maps the id back to the name.
SparseVector::name2id_t SparseVector::name2id_;
SparseVector::id2name_t SparseVector::id2name_;
|
|
|
|
|
2011-11-14 09:00:47 +04:00
|
|
|
// Looks a feature up by name.
// Returns 0 when the name is not present in the shared name -> id table;
// otherwise forwards to the id-based overload.
FeatureStatsType SparseVector::get(const string& name) const {
  const name2id_t::const_iterator entry = name2id_.find(name);
  if (entry != name2id_.end()) {
    const size_t id = entry->second;
    return get(id);
  }
  return 0;
}
|
|
|
|
|
|
|
|
// Looks a feature up by numeric id.
// Returns the value stored in this vector, or 0 when the id has no entry.
FeatureStatsType SparseVector::get(size_t id) const {
  const fvector_t::const_iterator entry = fvector_.find(id);
  if (entry != fvector_.end()) {
    return entry->second;
  }
  return 0;
}
|
|
|
|
|
2011-11-14 09:00:47 +04:00
|
|
|
void SparseVector::set(const string& name, FeatureStatsType value) {
|
2011-10-14 11:40:53 +04:00
|
|
|
name2id_t::const_iterator name2id_iter = name2id_.find(name);
|
|
|
|
size_t id = 0;
|
|
|
|
if (name2id_iter == name2id_.end()) {
|
|
|
|
id = id2name_.size();
|
|
|
|
id2name_.push_back(name);
|
|
|
|
name2id_[name] = id;
|
|
|
|
} else {
|
|
|
|
id = name2id_iter->second;
|
|
|
|
}
|
|
|
|
fvector_[id] = value;
|
|
|
|
}
|
|
|
|
|
2011-10-14 18:01:15 +04:00
|
|
|
// Writes every non-negligible entry to `out` as "<name><sep><value> ".
//
// out: destination stream.
// sep: text placed between the feature name and its value.
//
// Entries whose magnitude is below 0.00001 are skipped.
void SparseVector::write(ostream& out, const string& sep) const {
  for (fvector_t::const_iterator i = fvector_.begin(); i != fvector_.end(); ++i) {
    // std::fabs is used on purpose: an unqualified abs() can bind to the C
    // integer abs(int) depending on which headers and using-directives are in
    // effect, which would truncate every |value| < 1 to 0 and drop it here.
    if (std::fabs(i->second) < 0.00001) continue;
    // Bind by reference; the name is only streamed, so no copy is needed.
    const string& name = id2name_[i->first];
    out << name << sep << i->second << " ";
  }
}
|
|
|
|
|
|
|
|
// Drops all values held by this vector. The shared static name/id tables
// are not touched.
void SparseVector::clear() {
  fvector_.clear();
}
|
|
|
|
|
|
|
|
// Element-wise subtraction in place: (*this)[id] -= rhs[id] for every id
// present in either operand. An id missing on one side counts as 0.
SparseVector& SparseVector::operator-=(const SparseVector& rhs) {
  // First pass: ids that already have a value in *this.
  for (fvector_t::iterator mine = fvector_.begin(); mine != fvector_.end(); ++mine) {
    mine->second = mine->second - rhs.get(mine->first);
  }

  // Second pass: ids that only rhs knows about become the negated rhs value.
  for (fvector_t::const_iterator theirs = rhs.fvector_.begin();
       theirs != rhs.fvector_.end(); ++theirs) {
    const bool missing_here = (fvector_.find(theirs->first) == fvector_.end());
    if (missing_here) {
      fvector_[theirs->first] = -(theirs->second);
    }
  }
  return *this;
}
|
|
|
|
|
|
|
|
// Returns lhs - rhs as a new vector; implemented via operator-= on a copy
// of lhs, so neither operand is modified.
SparseVector operator-(const SparseVector& lhs, const SparseVector& rhs) {
  SparseVector difference(lhs);
  difference -= rhs;
  return difference;
}
|
|
|
|
|
2008-05-16 00:32:37 +04:00
|
|
|
// Default constructor: an empty statistics object with room for
// kAvailableSize dense features.
//
// The buffer is value-initialised (the trailing "()") so that it starts
// zeroed, consistent with the sized constructor. The allocation size is
// taken from the constant rather than from available_, so it does not
// depend on the order in which the members are declared in the class.
FeatureStats::FeatureStats()
    : available_(kAvailableSize), entries_(0),
      array_(new FeatureStatsType[kAvailableSize]()) {}
|
|
|
|
|
|
|
|
// Constructs a statistics object holding `size` dense features, all zero.
//
// size: number of dense entries; both the capacity and the entry count are
//       set to this value.
//
// The buffer is zeroed through value-initialisation (the trailing "()")
// instead of a raw memset, and its length is taken from the `size` argument
// rather than from available_, so the allocation does not depend on the
// order in which the members are declared in the class.
FeatureStats::FeatureStats(const size_t size)
    : available_(size), entries_(size),
      array_(new FeatureStatsType[size]())
{
}
|
|
|
|
|
|
|
|
// Constructs a statistics object by parsing `theString` (see set()).
//
// theString: textual feature list; it is handed to set(), which takes it by
//            non-const reference.
//
// The object starts with a real, zeroed buffer of kAvailableSize slots.
// Starting from capacity 0 with a NULL buffer is unsafe: expand() doubles
// the capacity, and 0 * 2 is still 0, so the first add() issued by set()
// would store through a zero-length (or NULL) array.
FeatureStats::FeatureStats(std::string &theString)
    : available_(kAvailableSize), entries_(0),
      array_(new FeatureStatsType[kAvailableSize]())
{
  set(theString);
}
|
2008-06-05 11:23:34 +04:00
|
|
|
|
|
|
|
// Releases the dense feature buffer.
FeatureStats::~FeatureStats()
{
  // delete[] on a null pointer is a no-op, so no explicit guard is needed.
  delete [] array_;
  array_ = NULL;
}
|
2008-05-14 11:57:45 +04:00
|
|
|
|
2011-11-12 13:12:07 +04:00
|
|
|
void FeatureStats::Copy(const FeatureStats &stats)
|
2008-06-05 11:23:34 +04:00
|
|
|
{
|
2011-02-24 15:42:19 +03:00
|
|
|
available_ = stats.available();
|
|
|
|
entries_ = stats.size();
|
|
|
|
array_ = new FeatureStatsType[available_];
|
2011-11-12 12:30:33 +04:00
|
|
|
memcpy(array_, stats.getArray(), GetArraySizeWithBytes());
|
2011-09-07 20:37:33 +04:00
|
|
|
map_ = stats.getSparse();
|
2011-11-12 04:40:01 +04:00
|
|
|
}
|
2008-05-14 11:57:45 +04:00
|
|
|
|
2011-11-12 13:12:07 +04:00
|
|
|
// Copy constructor: deep-copies `stats` through Copy(). The members are
// given placeholder values first; Copy() then overwrites all of them.
FeatureStats::FeatureStats(const FeatureStats &stats)
    : available_(0), entries_(0), array_(NULL)
{
  Copy(stats);
}
|
2008-05-14 11:57:45 +04:00
|
|
|
|
2011-11-12 13:12:07 +04:00
|
|
|
// Copy assignment: replaces the contents of *this with a deep copy of
// `stats` and returns *this.
FeatureStats& FeatureStats::operator=(const FeatureStats &stats)
{
  // Self-assignment must be a no-op: otherwise the buffer would be freed
  // below and Copy() would then read from that freed memory.
  if (this != &stats) {
    delete [] array_;
    // Null the pointer before Copy() allocates: if the allocation throws,
    // the destructor must not delete the old buffer a second time.
    array_ = NULL;
    Copy(stats);
  }
  return *this;
}
|
|
|
|
|
2008-06-05 11:23:34 +04:00
|
|
|
void FeatureStats::expand()
|
|
|
|
{
|
2011-11-12 17:04:22 +04:00
|
|
|
available_ *= 2;
|
2011-02-24 15:42:19 +03:00
|
|
|
featstats_t t_ = new FeatureStatsType[available_];
|
2011-11-12 12:30:33 +04:00
|
|
|
memcpy(t_, array_, GetArraySizeWithBytes());
|
2011-11-11 14:11:10 +04:00
|
|
|
delete [] array_;
|
2011-11-12 17:04:22 +04:00
|
|
|
array_ = t_;
|
2008-06-05 11:23:34 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
// Appends one dense feature value, growing the buffer first when isfull()
// reports that there is no room left.
void FeatureStats::add(FeatureStatsType v)
{
  if (isfull()) {
    expand();
  }
  array_[entries_] = v;
  ++entries_;
}
|
|
|
|
|
2011-11-14 09:00:47 +04:00
|
|
|
void FeatureStats::addSparse(const string& name, FeatureStatsType v)
|
2011-09-07 20:37:33 +04:00
|
|
|
{
|
2011-10-14 11:40:53 +04:00
|
|
|
map_.set(name,v);
|
2011-09-07 20:37:33 +04:00
|
|
|
}
|
|
|
|
|
2008-05-14 11:57:45 +04:00
|
|
|
void FeatureStats::set(std::string &theString)
|
|
|
|
{
|
2008-05-27 20:50:52 +04:00
|
|
|
std::string substring, stringBuf;
|
2011-02-24 15:42:19 +03:00
|
|
|
reset();
|
|
|
|
|
|
|
|
while (!theString.empty()) {
|
|
|
|
getNextPound(theString, substring);
|
2011-09-07 20:37:33 +04:00
|
|
|
// regular feature
|
|
|
|
if (substring.find(":") == string::npos) {
|
2011-11-12 12:30:33 +04:00
|
|
|
add(ConvertStringToFeatureStatsType(substring));
|
2011-09-07 20:37:33 +04:00
|
|
|
}
|
|
|
|
// sparse feature
|
|
|
|
else {
|
|
|
|
size_t separator = substring.find_last_of(":");
|
|
|
|
addSparse(substring.substr(0,separator), atof(substring.substr(separator+1).c_str()) );
|
|
|
|
}
|
2011-02-24 15:42:19 +03:00
|
|
|
}
|
2008-05-14 11:57:45 +04:00
|
|
|
}
|
|
|
|
|
2008-06-05 11:23:34 +04:00
|
|
|
|
|
|
|
void FeatureStats::loadbin(std::ifstream& inFile)
|
|
|
|
{
|
2011-11-12 12:30:33 +04:00
|
|
|
inFile.read((char*) array_, GetArraySizeWithBytes());
|
2011-02-24 15:42:19 +03:00
|
|
|
}
|
2008-06-05 11:23:34 +04:00
|
|
|
|
2008-05-14 11:57:45 +04:00
|
|
|
void FeatureStats::loadtxt(std::ifstream& inFile)
|
|
|
|
{
|
2011-02-24 15:42:19 +03:00
|
|
|
std::string theString;
|
|
|
|
std::getline(inFile, theString);
|
|
|
|
set(theString);
|
2008-05-14 11:57:45 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
void FeatureStats::loadtxt(const std::string &file)
|
|
|
|
{
|
2011-11-12 04:24:19 +04:00
|
|
|
// TRACE_ERR("loading the stats from " << file << std::endl);
|
2008-05-14 11:57:45 +04:00
|
|
|
|
2011-02-24 15:42:19 +03:00
|
|
|
std::ifstream inFile(file.c_str(), std::ios::in); // matches a stream with a file. Opens the file
|
2008-05-14 11:57:45 +04:00
|
|
|
|
2011-02-24 15:42:19 +03:00
|
|
|
loadtxt(inFile);
|
2008-05-14 11:57:45 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
void FeatureStats::savetxt(const std::string &file)
|
|
|
|
{
|
2011-11-12 04:24:19 +04:00
|
|
|
// TRACE_ERR("saving the stats into " << file << std::endl);
|
2008-05-14 11:57:45 +04:00
|
|
|
|
2011-02-24 15:42:19 +03:00
|
|
|
std::ofstream outFile(file.c_str(), std::ios::out); // matches a stream with a file. Opens the file
|
2008-05-14 11:57:45 +04:00
|
|
|
|
2011-02-24 15:42:19 +03:00
|
|
|
savetxt(outFile);
|
2008-05-14 11:57:45 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
void FeatureStats::savetxt(std::ofstream& outFile)
|
|
|
|
{
|
2011-11-12 04:24:19 +04:00
|
|
|
// TRACE_ERR("saving the stats" << std::endl);
|
2011-02-24 15:42:19 +03:00
|
|
|
outFile << *this;
|
2008-05-14 11:57:45 +04:00
|
|
|
}
|
|
|
|
|
2008-06-03 12:56:37 +04:00
|
|
|
void FeatureStats::savebin(std::ofstream& outFile)
|
|
|
|
{
|
2011-11-12 12:30:33 +04:00
|
|
|
outFile.write((char*) array_, GetArraySizeWithBytes());
|
2011-02-24 15:42:19 +03:00
|
|
|
}
|
2008-05-14 11:57:45 +04:00
|
|
|
|
2011-02-24 15:42:19 +03:00
|
|
|
// Text serialisation of a FeatureStats object: every dense feature followed
// by one space, then the sparse features as produced by
// SparseVector::write() with an empty name/value separator.
// NOTE(review): FeatureStats::set() recognises a sparse token by a ':' in
// it, while nothing is inserted between name and value here - presumably
// the stored names are expected to supply it; confirm round-tripping.
ostream& operator<<(ostream& o, const FeatureStats& e)
{
  // dense features
  for (size_t idx = 0; idx < e.size(); ++idx) {
    o << e.get(idx) << " ";
  }

  // sparse features
  e.getSparse().write(o, "");

  return o;
}
|
2011-12-12 17:48:42 +04:00
|
|
|
|
|
|
|
//ADDED_BY_TS
|
|
|
|
// Two FeatureStats compare equal when they hold the same number of dense
// features and every dense value matches exactly. Only the dense values
// are examined; the sparse vector takes no part in the comparison.
bool operator==(const FeatureStats& f1, const FeatureStats& f2) {
  const size_t count = f1.size();
  if (count != f2.size()) {
    return false;
  }

  for (size_t pos = 0; pos < count; ++pos) {
    if (f1.get(pos) != f2.get(pos)) {
      return false;
    }
  }
  return true;
}
|
|
|
|
//END_ADDED
|