2014-01-29 22:37:42 +04:00
|
|
|
#include "DomainFeature.h"
|
|
|
|
#include "ExtractionPhrasePair.h"
|
2012-09-03 10:23:32 +04:00
|
|
|
#include "tables-core.h"
|
|
|
|
#include "InputFileStream.h"
|
Unify tokenize() into util, and unit-test it.
The duplicate definition works fine in environments where the inline
definition becomes a weak symbol in the object file, but if it gets
generated as a regular definition, the duplicate definition causes link
problems.
In most call sites the return value could easily be made const, which
gives both the reader and the compiler a bit more certainty about the code's
intentions. In theory this may help performance, but it's mainly for clarity.
The comments are based on reverse-engineering, and the unit tests are based
on the comments. It's possible that some of what's in there is not essential,
in which case, don't feel bad about changing it!
I left a third identical definition in place, though I updated it with my
changes to avoid creeping divergence, and noted the duplication in a comment.
It would be nice to get rid of this definition as well, but it'd introduce
headers from the main Moses tree into biconcor, which may be against policy.
2015-04-22 05:59:05 +03:00
|
|
|
#include "util/tokenize.hh"
|
2012-09-03 10:23:32 +04:00
|
|
|
|
|
|
|
using namespace std;
|
|
|
|
|
|
|
|
namespace MosesTraining
|
|
|
|
{
|
|
|
|
|
|
|
|
// handling of domain names: load database with sentence-id / domain name info
|
2013-05-29 21:16:15 +04:00
|
|
|
void Domain::load( const std::string &domainFileName )
|
|
|
|
{
|
2012-09-03 10:23:32 +04:00
|
|
|
Moses::InputFileStream fileS( domainFileName );
|
|
|
|
istream *fileP = &fileS;
|
2014-06-08 19:23:14 +04:00
|
|
|
|
2015-01-14 14:07:42 +03:00
|
|
|
string line;
|
2014-06-08 19:23:14 +04:00
|
|
|
while(getline(*fileP, line)) {
|
2012-09-03 10:23:32 +04:00
|
|
|
// read
|
2015-04-22 06:35:18 +03:00
|
|
|
const vector< string > domainSpecLine = util::tokenize( line );
|
2012-09-03 10:23:32 +04:00
|
|
|
int lineNumber;
|
|
|
|
if (domainSpecLine.size() != 2 ||
|
|
|
|
! sscanf(domainSpecLine[0].c_str(), "%d", &lineNumber)) {
|
2014-01-29 22:37:42 +04:00
|
|
|
std::cerr << "ERROR: in domain specification line: '" << line << "'" << endl;
|
2012-09-03 10:23:32 +04:00
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
// store
|
Unify tokenize() into util, and unit-test it.
The duplicate definition works fine in environments where the inline
definition becomes a weak symbol in the object file, but if it gets
generated as a regular definition, the duplicate definition causes link
problems.
In most call sites the return value could easily be made const, which
gives both the reader and the compiler a bit more certainty about the code's
intentions. In theory this may help performance, but it's mainly for clarity.
The comments are based on reverse-engineering, and the unit tests are based
on the comments. It's possible that some of what's in there is not essential,
in which case, don't feel bad about changing it!
I left a third identical definition in place, though I updated it with my
changes to avoid creeping divergence, and noted the duplication in a comment.
It would be nice to get rid of this definition as well, but it'd introduce
headers from the main Moses tree into biconcor, which may be against policy.
2015-04-22 05:59:05 +03:00
|
|
|
const string &name = domainSpecLine[1];
|
2012-09-03 10:23:32 +04:00
|
|
|
spec.push_back( make_pair( lineNumber, name ));
|
|
|
|
if (name2id.find( name ) == name2id.end()) {
|
|
|
|
name2id[ name ] = list.size();
|
|
|
|
list.push_back( name );
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// get domain name based on sentence number
|
2013-05-29 21:16:15 +04:00
|
|
|
string Domain::getDomainOfSentence( int sentenceId ) const
|
|
|
|
{
|
2012-09-03 10:23:32 +04:00
|
|
|
for(size_t i=0; i<spec.size(); i++) {
|
|
|
|
if (sentenceId <= spec[i].first) {
|
|
|
|
return spec[i].second;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return "undefined";
|
|
|
|
}
|
|
|
|
|
2014-01-29 22:37:42 +04:00
|
|
|
// Construct the feature: the property key is fixed to "domain" and the
// domain database is loaded from the given file right away.
//
// domainFile: path handed to Domain::load().
DomainFeature::DomainFeature(const string& domainFile)
  : m_propertyKey("domain")
{
  // read the sentence-id / domain-name specification
  m_domain.load(domainFile);
}
|
|
|
|
|
2014-05-19 17:35:08 +04:00
|
|
|
// Attach the domain of a sentence to a phrase pair.
//
// Looks up the domain name of sentenceId and passes it, together with count,
// to phrasePair.AddProperty() under this feature's property key.
void DomainFeature::addPropertiesToPhrasePair(ExtractionPhrasePair &phrasePair,
    float count,
    int sentenceId) const
{
  std::string domainOfSentence = m_domain.getDomainOfSentence(sentenceId);
  phrasePair.AddProperty(m_propertyKey, domainOfSentence, count);
}
|
|
|
|
|
2013-05-29 21:16:15 +04:00
|
|
|
void DomainFeature::add(const ScoreFeatureContext& context,
|
|
|
|
std::vector<float>& denseValues,
|
|
|
|
std::map<std::string,float>& sparseValues) const
|
2012-11-03 03:30:51 +04:00
|
|
|
{
|
2014-01-29 22:37:42 +04:00
|
|
|
const map<string,float> *domainCount = context.phrasePair.GetProperty(m_propertyKey);
|
|
|
|
assert( domainCount != NULL );
|
2014-05-19 17:35:08 +04:00
|
|
|
add(*domainCount,
|
|
|
|
context.phrasePair.GetCount(),
|
|
|
|
context.maybeLog,
|
2014-01-29 22:37:42 +04:00
|
|
|
denseValues, sparseValues);
|
2012-11-03 03:30:51 +04:00
|
|
|
}
|
|
|
|
|
2014-05-19 17:35:08 +04:00
|
|
|
void SubsetDomainFeature::add(const map<string,float>& domainCount,
|
2014-01-29 22:37:42 +04:00
|
|
|
float count,
|
2013-05-29 21:16:15 +04:00
|
|
|
const MaybeLog& maybeLog,
|
|
|
|
std::vector<float>& denseValues,
|
|
|
|
std::map<std::string,float>& sparseValues) const
|
2012-11-03 03:30:51 +04:00
|
|
|
{
|
2013-05-29 21:16:15 +04:00
|
|
|
if (m_domain.list.size() > 6) {
|
2012-11-03 03:30:51 +04:00
|
|
|
UTIL_THROW_IF(m_domain.list.size() > 6, ScoreFeatureArgumentException,
|
2013-05-29 21:16:15 +04:00
|
|
|
"too many domains for core domain subset features");
|
2012-11-03 03:30:51 +04:00
|
|
|
}
|
|
|
|
size_t bitmap = 0;
|
|
|
|
for(size_t bit = 0; bit < m_domain.list.size(); bit++) {
|
|
|
|
if (domainCount.find( m_domain.list[ bit ] ) != domainCount.end()) {
|
|
|
|
bitmap += 1 << bit;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
for(size_t i = 1; i < (1 << m_domain.list.size()); i++) {
|
|
|
|
denseValues.push_back(maybeLog( (bitmap == i) ? 2.718 : 1 ));
|
2013-05-29 21:16:15 +04:00
|
|
|
}
|
2012-11-03 03:30:51 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
void SparseSubsetDomainFeature::add(const map<string,float>& domainCount,float count,
|
2013-05-29 21:16:15 +04:00
|
|
|
const MaybeLog& maybeLog,
|
|
|
|
std::vector<float>& denseValues,
|
|
|
|
std::map<std::string,float>& sparseValues) const
|
2012-11-03 03:30:51 +04:00
|
|
|
{
|
|
|
|
typedef vector<string>::const_iterator I;
|
|
|
|
ostringstream key;
|
|
|
|
key << "doms";
|
|
|
|
for (I i = m_domain.list.begin(); i != m_domain.list.end(); ++i) {
|
|
|
|
if (domainCount.find(*i) != domainCount.end()) {
|
|
|
|
key << "_" << *i;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
sparseValues[key.str()] = 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
void RatioDomainFeature::add(const map<string,float>& domainCount,float count,
|
2013-05-29 21:16:15 +04:00
|
|
|
const MaybeLog& maybeLog,
|
|
|
|
std::vector<float>& denseValues,
|
|
|
|
std::map<std::string,float>& sparseValues) const
|
2012-11-03 03:30:51 +04:00
|
|
|
{
|
|
|
|
typedef vector< string >::const_iterator I;
|
|
|
|
for (I i = m_domain.list.begin(); i != m_domain.list.end(); i++ ) {
|
|
|
|
map<string,float>::const_iterator dci = domainCount.find(*i);
|
|
|
|
if (dci == domainCount.end() ) {
|
|
|
|
denseValues.push_back(maybeLog( 1 ));
|
|
|
|
} else {
|
|
|
|
denseValues.push_back(maybeLog(exp( dci->second / count ) ));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
void SparseRatioDomainFeature::add(const map<string,float>& domainCount,float count,
|
2013-05-29 21:16:15 +04:00
|
|
|
const MaybeLog& maybeLog,
|
|
|
|
std::vector<float>& denseValues,
|
|
|
|
std::map<std::string,float>& sparseValues) const
|
2012-11-03 03:30:51 +04:00
|
|
|
{
|
|
|
|
typedef map< string, float >::const_iterator I;
|
|
|
|
for (I i=domainCount.begin(); i != domainCount.end(); i++) {
|
|
|
|
sparseValues["domr_" + i->first] = (i->second / count);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
void IndicatorDomainFeature::add(const map<string,float>& domainCount,float count,
|
2013-05-29 21:16:15 +04:00
|
|
|
const MaybeLog& maybeLog,
|
|
|
|
std::vector<float>& denseValues,
|
|
|
|
std::map<std::string,float>& sparseValues) const
|
2012-11-03 03:30:51 +04:00
|
|
|
{
|
|
|
|
typedef vector< string >::const_iterator I;
|
|
|
|
for (I i = m_domain.list.begin(); i != m_domain.list.end(); i++ ) {
|
|
|
|
map<string,float>::const_iterator dci = domainCount.find(*i);
|
|
|
|
if (dci == domainCount.end() ) {
|
|
|
|
denseValues.push_back(maybeLog( 1 ));
|
|
|
|
} else {
|
|
|
|
denseValues.push_back(maybeLog(2.718));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void SparseIndicatorDomainFeature::add(const map<string,float>& domainCount,float count,
|
2013-05-29 21:16:15 +04:00
|
|
|
const MaybeLog& maybeLog,
|
|
|
|
std::vector<float>& denseValues,
|
|
|
|
std::map<std::string,float>& sparseValues) const
|
2012-11-03 03:30:51 +04:00
|
|
|
{
|
|
|
|
typedef map< string, float >::const_iterator I;
|
|
|
|
for (I i=domainCount.begin(); i != domainCount.end(); i++) {
|
2013-05-29 21:16:15 +04:00
|
|
|
sparseValues["dom_" + i->first] = 1;
|
2012-11-03 03:30:51 +04:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2012-09-03 10:23:32 +04:00
|
|
|
}
|
|
|
|
|