// mosesdecoder/phrase-extract/DomainFeature.cpp

#include "DomainFeature.h"
#include "ExtractionPhrasePair.h"
#include "tables-core.h"
#include "InputFileStream.h"
#include "util/tokenize.hh"

using namespace std;

namespace MosesTraining
{

// handling of domain names: load database with sentence-id / domain name info
void Domain::load( const std::string &domainFileName )
{
  Moses::InputFileStream fileS( domainFileName );
  istream *fileP = &fileS;

  string line;
  while(getline(*fileP, line)) {
    // read
    const vector< string > domainSpecLine = util::tokenize( line );
    int lineNumber;
    if (domainSpecLine.size() != 2 ||
        ! sscanf(domainSpecLine[0].c_str(), "%d", &lineNumber)) {
      std::cerr << "ERROR: in domain specification line: '" << line << "'" << endl;
      exit(1);
    }
    // store
    const string &name = domainSpecLine[1];
    spec.push_back( make_pair( lineNumber, name ));
    if (name2id.find( name ) == name2id.end()) {
      name2id[ name ] = list.size();
      list.push_back( name );
    }
  }
}

// get domain name based on sentence number
//
// Walks `spec` from the front and returns the name stored with the first
// entry whose number is not smaller than sentenceId. If no such entry
// exists (including when `spec` is empty), the string "undefined" is
// returned.
string Domain::getDomainOfSentence( int sentenceId ) const
{
  size_t pos = 0;
  // skip every entry whose number lies strictly below the requested id
  while (pos < spec.size() && spec[pos].first < sentenceId) {
    ++pos;
  }
  if (pos == spec.size()) {
    return "undefined";
  }
  return spec[pos].second;
}

// Construct the feature: the property key is fixed to "domain" and the
// sentence-id / domain-name database is loaded from `domainFile` via
// Domain::load.
DomainFeature::DomainFeature(const string& domainFile) : m_propertyKey("domain")
{
  //process domain file
  m_domain.load(domainFile);
}

// Attach the domain of the given sentence to a phrase pair.
// The domain name is looked up from sentenceId and passed, together with
// `count`, to ExtractionPhrasePair::AddProperty under this feature's
// property key.
void DomainFeature::addPropertiesToPhrasePair(ExtractionPhrasePair &phrasePair,
    float count,
    int sentenceId) const
{
  std::string domainName( m_domain.getDomainOfSentence(sentenceId) );
  phrasePair.AddProperty(m_propertyKey, domainName, count);
}

void DomainFeature::add(const ScoreFeatureContext& context,
                        std::vector<float>& denseValues,
                        std::map<std::string,float>& sparseValues)  const
{
  const map<string,float> *domainCount = context.phrasePair.GetProperty(m_propertyKey);
  assert( domainCount != NULL );
  add(*domainCount,
      context.phrasePair.GetCount(),
      context.maybeLog,
      denseValues, sparseValues);
}

// Dense "domain subset" features.
//
// A bitmap is built with bit k set iff the k-th domain of m_domain.list has
// an entry in domainCount. One dense value is then appended for every
// non-empty subset of domains (bitmaps 1 .. 2^n - 1, in increasing order):
// maybeLog(2.718) for the subset that equals the bitmap, maybeLog(1) for all
// others. `count` and `sparseValues` are not used.
//
// Throws ScoreFeatureArgumentException when more than 6 domains are known.
void SubsetDomainFeature::add(const map<string,float>& domainCount,
                              float count,
                              const MaybeLog& maybeLog,
                              std::vector<float>& denseValues,
                              std::map<std::string,float>& sparseValues)  const
{
  // UTIL_THROW_IF already tests its condition; no extra if() wrapper needed.
  UTIL_THROW_IF(m_domain.list.size() > 6, ScoreFeatureArgumentException,
                "too many domains for core domain subset features");

  const size_t numDomains = m_domain.list.size();
  // Do the shifts in size_t so the arithmetic and the loop comparison below
  // stay within a single unsigned type.
  const size_t one = 1;

  size_t bitmap = 0;
  for(size_t bit = 0; bit < numDomains; ++bit) {
    if (domainCount.find( m_domain.list[ bit ] ) != domainCount.end()) {
      bitmap |= one << bit;
    }
  }

  const size_t numSubsets = one << numDomains;
  for(size_t i = 1; i < numSubsets; ++i) {
    // 2.718 is presumably an approximation of e (so the logged value is ~1)
    // -- TODO confirm; kept as-is so feature values do not change.
    denseValues.push_back(maybeLog( (bitmap == i) ? 2.718 : 1 ));
  }
}

void SparseSubsetDomainFeature::add(const map<string,float>& domainCount,float count,
                                    const MaybeLog& maybeLog,
                                    std::vector<float>& denseValues,
                                    std::map<std::string,float>& sparseValues)  const
{
  typedef vector<string>::const_iterator I;
  ostringstream key;
  key << "doms";
  for (I i = m_domain.list.begin(); i != m_domain.list.end(); ++i) {
    if (domainCount.find(*i) != domainCount.end()) {
      key << "_" << *i;
    }
  }
  sparseValues[key.str()] = 1;
}


void RatioDomainFeature::add(const map<string,float>& domainCount,float count,
                             const MaybeLog& maybeLog,
                             std::vector<float>& denseValues,
                             std::map<std::string,float>& sparseValues)  const
{
  typedef vector< string >::const_iterator I;
  for (I i = m_domain.list.begin(); i != m_domain.list.end(); i++ ) {
    map<string,float>::const_iterator dci = domainCount.find(*i);
    if (dci == domainCount.end() ) {
      denseValues.push_back(maybeLog( 1 ));
    } else {
      denseValues.push_back(maybeLog(exp( dci->second / count ) ));
    }
  }
}


void SparseRatioDomainFeature::add(const map<string,float>& domainCount,float count,
                                   const MaybeLog& maybeLog,
                                   std::vector<float>& denseValues,
                                   std::map<std::string,float>& sparseValues)  const
{
  typedef map< string, float >::const_iterator I;
  for (I i=domainCount.begin(); i != domainCount.end(); i++) {
    sparseValues["domr_" + i->first] =  (i->second / count);
  }
}


void IndicatorDomainFeature::add(const map<string,float>& domainCount,float count,
                                 const MaybeLog& maybeLog,
                                 std::vector<float>& denseValues,
                                 std::map<std::string,float>& sparseValues)  const
{
  typedef vector< string >::const_iterator I;
  for (I i = m_domain.list.begin(); i != m_domain.list.end(); i++ ) {
    map<string,float>::const_iterator dci = domainCount.find(*i);
    if (dci == domainCount.end() ) {
      denseValues.push_back(maybeLog( 1 ));
    } else {
      denseValues.push_back(maybeLog(2.718));
    }
  }
}

void SparseIndicatorDomainFeature::add(const map<string,float>& domainCount,float count,
                                       const MaybeLog& maybeLog,
                                       std::vector<float>& denseValues,
                                       std::map<std::string,float>& sparseValues)  const
{
  typedef map< string, float >::const_iterator I;
  for (I i=domainCount.begin(); i != domainCount.end(); i++) {
    sparseValues["dom_" + i->first] = 1;
  }
}

}
new version of the `score` tool which is now capable of dealing with additional properties in an appropriate manner 2014-01-29 22:37:42 +04:00			`#include "DomainFeature.h"`
			`#include "ExtractionPhrasePair.h"`
bug fix to enable pruned search graph output by default 2012-09-03 10:23:32 +04:00			`#include "tables-core.h"`
			`#include "InputFileStream.h"`
Unify tokenize() into util, and unit-test it. The duplicate definition works fine in environments where the inline definition becomes a weak symbol in the object file, but if it gets generated as a regular definition, the duplicate definition causes link problems. In most call sites the return value could easily be made const, which gives both the reader and the compiler a bit more certainty about the code's intentions. In theory this may help performance, but it's mainly for clarity. The comments are based on reverse-engineering, and the unit tests are based on the comments. It's possible that some of what's in there is not essential, in which case, don't feel bad about changing it! I left a third identical definition in place, though I updated it with my changes to avoid creeping divergence, and noted the duplication in a comment. It would be nice to get rid of this definition as well, but it'd introduce headers from the main Moses tree into biconcor, which may be against policy. 2015-04-22 05:59:05 +03:00			`#include "util/tokenize.hh"`
bug fix to enable pruned search graph output by default 2012-09-03 10:23:32 +04:00
			`using namespace std;`

			`namespace MosesTraining`
			`{`

			`// handling of domain names: load database with sentence-id / domain name info`
beautify 2013-05-29 21:16:15 +04:00			`void Domain::load( const std::string &domainFileName )`
			`{`
bug fix to enable pruned search graph output by default 2012-09-03 10:23:32 +04:00			`Moses::InputFileStream fileS( domainFileName );`
			`istream *fileP = &fileS;`
use standard c++ getline instead of old Moses SAFE_GETLINE 2014-06-08 19:23:14 +04:00
beautify 2015-01-14 14:07:42 +03:00			`string line;`
use standard c++ getline instead of old Moses SAFE_GETLINE 2014-06-08 19:23:14 +04:00			`while(getline(*fileP, line)) {`
bug fix to enable pruned search graph output by default 2012-09-03 10:23:32 +04:00			`// read`
Support tokenize(const std::string &) as well. Convenience wrapper: the actual function takes a const char[], but many of the call sites want to pass a string and have to call its c_str() first. 2015-04-22 06:35:18 +03:00			`const vector< string > domainSpecLine = util::tokenize( line );`
bug fix to enable pruned search graph output by default 2012-09-03 10:23:32 +04:00			`int lineNumber;`
			`if (domainSpecLine.size() != 2 \|\|`
			`! sscanf(domainSpecLine[0].c_str(), "%d", &lineNumber)) {`
new version of the `score` tool which is now capable of dealing with additional properties in an appropriate manner 2014-01-29 22:37:42 +04:00			`std::cerr << "ERROR: in domain specification line: '" << line << "'" << endl;`
bug fix to enable pruned search graph output by default 2012-09-03 10:23:32 +04:00			`exit(1);`
			`}`
			`// store`
Unify tokenize() into util, and unit-test it. The duplicate definition works fine in environments where the inline definition becomes a weak symbol in the object file, but if it gets generated as a regular definition, the duplicate definition causes link problems. In most call sites the return value could easily be made const, which gives both the reader and the compiler a bit more certainty about the code's intentions. In theory this may help performance, but it's mainly for clarity. The comments are based on reverse-engineering, and the unit tests are based on the comments. It's possible that some of what's in there is not essential, in which case, don't feel bad about changing it! I left a third identical definition in place, though I updated it with my changes to avoid creeping divergence, and noted the duplication in a comment. It would be nice to get rid of this definition as well, but it'd introduce headers from the main Moses tree into biconcor, which may be against policy. 2015-04-22 05:59:05 +03:00			`const string &name = domainSpecLine[1];`
bug fix to enable pruned search graph output by default 2012-09-03 10:23:32 +04:00			`spec.push_back( make_pair( lineNumber, name ));`
			`if (name2id.find( name ) == name2id.end()) {`
			`name2id[ name ] = list.size();`
			`list.push_back( name );`
			`}`
			`}`
			`}`

			`// get domain name based on sentence number`
beautify 2013-05-29 21:16:15 +04:00			`string Domain::getDomainOfSentence( int sentenceId ) const`
			`{`
bug fix to enable pruned search graph output by default 2012-09-03 10:23:32 +04:00			`for(size_t i=0; i<spec.size(); i++) {`
			`if (sentenceId <= spec[i].first) {`
			`return spec[i].second;`
			`}`
			`}`
			`return "undefined";`
			`}`

new version of the `score` tool which is now capable of dealing with additional properties in an appropriate manner 2014-01-29 22:37:42 +04:00			`DomainFeature::DomainFeature(const string& domainFile) : m_propertyKey("domain")`
Feature function interface for use in scoring 2012-11-03 03:30:51 +04:00			`{`
			`//process domain file`
			`m_domain.load(domainFile);`
			`}`

beautify 2014-05-19 17:35:08 +04:00			`void DomainFeature::addPropertiesToPhrasePair(ExtractionPhrasePair &phrasePair,`
			`float count,`
			`int sentenceId) const`
new version of the `score` tool which is now capable of dealing with additional properties in an appropriate manner 2014-01-29 22:37:42 +04:00			`{`
			`std::string value = m_domain.getDomainOfSentence(sentenceId);`
			`phrasePair.AddProperty(m_propertyKey, value, count);`
			`}`

beautify 2013-05-29 21:16:15 +04:00			`void DomainFeature::add(const ScoreFeatureContext& context,`
			`std::vector<float>& denseValues,`
			`std::map<std::string,float>& sparseValues) const`
Feature function interface for use in scoring 2012-11-03 03:30:51 +04:00			`{`
new version of the `score` tool which is now capable of dealing with additional properties in an appropriate manner 2014-01-29 22:37:42 +04:00			`const map<string,float> *domainCount = context.phrasePair.GetProperty(m_propertyKey);`
			`assert( domainCount != NULL );`
beautify 2014-05-19 17:35:08 +04:00			`add(*domainCount,`
			`context.phrasePair.GetCount(),`
			`context.maybeLog,`
new version of the `score` tool which is now capable of dealing with additional properties in an appropriate manner 2014-01-29 22:37:42 +04:00			`denseValues, sparseValues);`
Feature function interface for use in scoring 2012-11-03 03:30:51 +04:00			`}`

beautify 2014-05-19 17:35:08 +04:00			`void SubsetDomainFeature::add(const map<string,float>& domainCount,`
new version of the `score` tool which is now capable of dealing with additional properties in an appropriate manner 2014-01-29 22:37:42 +04:00			`float count,`
beautify 2013-05-29 21:16:15 +04:00			`const MaybeLog& maybeLog,`
			`std::vector<float>& denseValues,`
			`std::map<std::string,float>& sparseValues) const`
Feature function interface for use in scoring 2012-11-03 03:30:51 +04:00			`{`
beautify 2013-05-29 21:16:15 +04:00			`if (m_domain.list.size() > 6) {`
Feature function interface for use in scoring 2012-11-03 03:30:51 +04:00			`UTIL_THROW_IF(m_domain.list.size() > 6, ScoreFeatureArgumentException,`
beautify 2013-05-29 21:16:15 +04:00			`"too many domains for core domain subset features");`
Feature function interface for use in scoring 2012-11-03 03:30:51 +04:00			`}`
			`size_t bitmap = 0;`
			`for(size_t bit = 0; bit < m_domain.list.size(); bit++) {`
			`if (domainCount.find( m_domain.list[ bit ] ) != domainCount.end()) {`
			`bitmap += 1 << bit;`
			`}`
			`}`
			`for(size_t i = 1; i < (1 << m_domain.list.size()); i++) {`
			`denseValues.push_back(maybeLog( (bitmap == i) ? 2.718 : 1 ));`
beautify 2013-05-29 21:16:15 +04:00			`}`
Feature function interface for use in scoring 2012-11-03 03:30:51 +04:00			`}`

			`void SparseSubsetDomainFeature::add(const map<string,float>& domainCount,float count,`
beautify 2013-05-29 21:16:15 +04:00			`const MaybeLog& maybeLog,`
			`std::vector<float>& denseValues,`
			`std::map<std::string,float>& sparseValues) const`
Feature function interface for use in scoring 2012-11-03 03:30:51 +04:00			`{`
			`typedef vector<string>::const_iterator I;`
			`ostringstream key;`
			`key << "doms";`
			`for (I i = m_domain.list.begin(); i != m_domain.list.end(); ++i) {`
			`if (domainCount.find(*i) != domainCount.end()) {`
			`key << "_" << *i;`
			`}`
			`}`
			`sparseValues[key.str()] = 1;`
			`}`


			`void RatioDomainFeature::add(const map<string,float>& domainCount,float count,`
beautify 2013-05-29 21:16:15 +04:00			`const MaybeLog& maybeLog,`
			`std::vector<float>& denseValues,`
			`std::map<std::string,float>& sparseValues) const`
Feature function interface for use in scoring 2012-11-03 03:30:51 +04:00			`{`
			`typedef vector< string >::const_iterator I;`
			`for (I i = m_domain.list.begin(); i != m_domain.list.end(); i++ ) {`
			`map<string,float>::const_iterator dci = domainCount.find(*i);`
			`if (dci == domainCount.end() ) {`
			`denseValues.push_back(maybeLog( 1 ));`
			`} else {`
			`denseValues.push_back(maybeLog(exp( dci->second / count ) ));`
			`}`
			`}`
			`}`


			`void SparseRatioDomainFeature::add(const map<string,float>& domainCount,float count,`
beautify 2013-05-29 21:16:15 +04:00			`const MaybeLog& maybeLog,`
			`std::vector<float>& denseValues,`
			`std::map<std::string,float>& sparseValues) const`
Feature function interface for use in scoring 2012-11-03 03:30:51 +04:00			`{`
			`typedef map< string, float >::const_iterator I;`
			`for (I i=domainCount.begin(); i != domainCount.end(); i++) {`
			`sparseValues["domr_" + i->first] = (i->second / count);`
			`}`
			`}`


			`void IndicatorDomainFeature::add(const map<string,float>& domainCount,float count,`
beautify 2013-05-29 21:16:15 +04:00			`const MaybeLog& maybeLog,`
			`std::vector<float>& denseValues,`
			`std::map<std::string,float>& sparseValues) const`
Feature function interface for use in scoring 2012-11-03 03:30:51 +04:00			`{`
			`typedef vector< string >::const_iterator I;`
			`for (I i = m_domain.list.begin(); i != m_domain.list.end(); i++ ) {`
			`map<string,float>::const_iterator dci = domainCount.find(*i);`
			`if (dci == domainCount.end() ) {`
			`denseValues.push_back(maybeLog( 1 ));`
			`} else {`
			`denseValues.push_back(maybeLog(2.718));`
			`}`
			`}`
			`}`

			`void SparseIndicatorDomainFeature::add(const map<string,float>& domainCount,float count,`
beautify 2013-05-29 21:16:15 +04:00			`const MaybeLog& maybeLog,`
			`std::vector<float>& denseValues,`
			`std::map<std::string,float>& sparseValues) const`
Feature function interface for use in scoring 2012-11-03 03:30:51 +04:00			`{`
			`typedef map< string, float >::const_iterator I;`
			`for (I i=domainCount.begin(); i != domainCount.end(); i++) {`
beautify 2013-05-29 21:16:15 +04:00			`sparseValues["dom_" + i->first] = 1;`
Feature function interface for use in scoring 2012-11-03 03:30:51 +04:00			`}`
			`}`

bug fix to enable pruned search graph output by default 2012-09-03 10:23:32 +04:00			`}`