mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-29 06:52:34 +03:00
Roll back to just tracking coordinates in mmsapt
(distance wants to be a separate FF)
This commit is contained in:
parent
5c2b8d843c
commit
d29916bbb3
@ -3655,16 +3655,6 @@
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/sapt_pscore_coherence.h</locationURI>
|
||||
</link>
|
||||
<link>
|
||||
<name>TranslationModel/UG/sapt_pscore_dist.h</name>
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/sapt_pscore_dist.h</locationURI>
|
||||
</link>
|
||||
<link>
|
||||
<name>TranslationModel/UG/sapt_pscore_length_ratio.h</name>
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/sapt_pscore_length_ratio.h</locationURI>
|
||||
</link>
|
||||
<link>
|
||||
<name>TranslationModel/UG/sapt_pscore_lex1.h</name>
|
||||
<type>1</type>
|
||||
@ -3710,11 +3700,6 @@
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/sapt_pscore_wordcount.h</locationURI>
|
||||
</link>
|
||||
<link>
|
||||
<name>TranslationModel/UG/sapt_pscore_cumulative_bias.h</name>
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/sapt_pscore_cumulative_bias.h</locationURI>
|
||||
</link>
|
||||
<link>
|
||||
<name>TranslationModel/UG/sim-pe.cc</name>
|
||||
<type>1</type>
|
||||
|
@ -124,8 +124,6 @@
|
||||
<File Name="../../../moses/TranslationModel/UG/sapt_phrase_scorers.h"/>
|
||||
<File Name="../../../moses/TranslationModel/UG/sapt_pscore_base.h"/>
|
||||
<File Name="../../../moses/TranslationModel/UG/sapt_pscore_coherence.h"/>
|
||||
<File Name="../../../moses/TranslationModel/UG/sapt_pscore_dist.h"/>
|
||||
<File Name="../../../moses/TranslationModel/UG/sapt_pscore_length_ratio.h"/>
|
||||
<File Name="../../../moses/TranslationModel/UG/sapt_pscore_lex1.h"/>
|
||||
<File Name="../../../moses/TranslationModel/UG/sapt_pscore_logcnt.h"/>
|
||||
<File Name="../../../moses/TranslationModel/UG/sapt_pscore_pbwd.h"/>
|
||||
@ -135,7 +133,6 @@
|
||||
<File Name="../../../moses/TranslationModel/UG/sapt_pscore_rareness.h"/>
|
||||
<File Name="../../../moses/TranslationModel/UG/sapt_pscore_unaligned.h"/>
|
||||
<File Name="../../../moses/TranslationModel/UG/sapt_pscore_wordcount.h"/>
|
||||
<File Name="../../../moses/TranslationModel/UG/sapt_pscore_cumulative_bias.h"/>
|
||||
<File Name="../../../moses/TranslationModel/UG/sim-pe.cc" ExcludeProjConfig="Debug"/>
|
||||
<File Name="../../../moses/TranslationModel/UG/spe-check-coverage.cc" ExcludeProjConfig="Debug"/>
|
||||
<File Name="../../../moses/TranslationModel/UG/spe-check-coverage2.cc" ExcludeProjConfig="Debug"/>
|
||||
|
@ -68,8 +68,13 @@ public:
|
||||
size_t m_frontSpanCoveredLength;
|
||||
// how many words from the beginning are covered
|
||||
|
||||
// coordinates in some space, populated by xml tag "coord"
|
||||
boost::shared_ptr<std::vector<float> > m_coord;
|
||||
// Coordinates in user-defined spaces, indexed by phrase dictionary pointer
|
||||
// Looking up PD* returns a vector of the input's coordinates in each space
|
||||
// known to the PD, in order (vector of pointers to float vectors). This
|
||||
// allows different models to use different subsets of all named spaces.
|
||||
typedef std::vector<boost::shared_ptr<std::vector<float> > > INCOORD;
|
||||
typedef std::map<PhraseDictionary const*, INCOORD> PD2IC;
|
||||
boost::shared_ptr<PD2IC> m_pd2InputCoord;
|
||||
|
||||
InputType(AllOptions::ptr const& opts, long translationId = 0);
|
||||
virtual ~InputType();
|
||||
|
@ -147,6 +147,14 @@ public:
|
||||
|
||||
void SetParameter(const std::string& key, const std::string& value);
|
||||
|
||||
void AddKnownSpace(const std::string& name) {
|
||||
m_knownSpaces.push_back(name);
|
||||
}
|
||||
|
||||
const std::vector<std::string> &GetKnownSpaces() const {
|
||||
return m_knownSpaces;
|
||||
}
|
||||
|
||||
// LEGACY
|
||||
//! find list of translations that can translate a portion of src. Used by confusion network decoding
|
||||
virtual
|
||||
@ -171,6 +179,9 @@ protected:
|
||||
// cache
|
||||
size_t m_maxCacheSize; // 0 = no caching
|
||||
|
||||
// Named coordinate spaces used by this model, in order (see "coord" XML tag)
|
||||
std::vector<std::string> m_knownSpaces;
|
||||
|
||||
#ifdef WITH_THREADS
|
||||
//reader-writer lock
|
||||
mutable boost::thread_specific_ptr<CacheColl> m_cache;
|
||||
|
@ -20,6 +20,7 @@ namespace sapt
|
||||
for (int i = 0; i <= LRModel::NONE; ++i)
|
||||
ofwd[i] = obwd[i] = 0;
|
||||
my_aln.reserve(1);
|
||||
sids.reset(new std::vector<uint32_t>);
|
||||
}
|
||||
|
||||
jstats::
|
||||
@ -80,9 +81,7 @@ namespace sapt
|
||||
++obwd[bwd_orient];
|
||||
// Record sentence id if requested
|
||||
if (track_sid)
|
||||
{
|
||||
sids.push_back(sid);
|
||||
}
|
||||
sids->push_back(sid);
|
||||
if (docid >= 0)
|
||||
{
|
||||
// while (int(indoc.size()) <= docid) indoc.push_back(0);
|
||||
|
@ -28,7 +28,7 @@ namespace sapt
|
||||
uint32_t obwd[LRModel::NONE+1]; // backward distortion type counts
|
||||
|
||||
public:
|
||||
std::vector<uint32_t> sids; // list of sentence ids in this sample
|
||||
SPTR<std::vector<uint32_t> > sids; // list of sentence ids in this sample
|
||||
std::map<uint32_t,uint32_t> indoc;
|
||||
// std::vector<uint32_t> indoc; // counts origin of samples (for biased sampling)
|
||||
jstats();
|
||||
|
@ -31,8 +31,8 @@ namespace sapt
|
||||
std::vector<unsigned char> aln;
|
||||
float score;
|
||||
bool inverse;
|
||||
std::vector<uint32_t> sids; // list of sampled sentence ids where this
|
||||
// phrase pair was found
|
||||
SPTR<std::vector<uint32_t> > sids; // list of sampled sentence ids where
|
||||
// this phrase pair was found
|
||||
// std::vector<uint32_t> indoc;
|
||||
std::map<uint32_t,uint32_t> indoc;
|
||||
PhrasePair() { };
|
||||
@ -185,6 +185,8 @@ namespace sapt
|
||||
sample2 += o.sample2;
|
||||
cum_bias += o.cum_bias;
|
||||
// todo: add distortion counts
|
||||
if (sids && o.sids)
|
||||
sids->insert(sids->end(), o.sids->begin(), o.sids->end());
|
||||
return *this;
|
||||
}
|
||||
|
||||
|
@ -215,7 +215,6 @@ namespace Moses
|
||||
param.insert(pair<string,string>("coh", "0"));
|
||||
param.insert(pair<string,string>("prov", "0"));
|
||||
param.insert(pair<string,string>("cumb", "0"));
|
||||
param.insert(pair<string,string>("dist", "0"));
|
||||
|
||||
poolCounts = true;
|
||||
|
||||
@ -276,6 +275,37 @@ namespace Moses
|
||||
m = param.find("name");
|
||||
if (m != param.end()) m_name = m->second;
|
||||
|
||||
// Optional coordinates for training corpus
|
||||
// Takes form coord=name1:file1.gz,name2:file2.gz,...
|
||||
// Names should match with XML input (coord tag)
|
||||
param.insert(pair<string,string>("coord","0"));
|
||||
if(param["coord"] != "0")
|
||||
{
|
||||
m_track_coord = true;
|
||||
vector<string> coord_instances = Tokenize(param["coord"], ",");
|
||||
BOOST_FOREACH(std::string instance, coord_instances)
|
||||
{
|
||||
vector<string> toks = Moses::Tokenize(instance, ":");
|
||||
string name = toks[0];
|
||||
string file = toks[1];
|
||||
//TODO: register this space for this model
|
||||
// Load sid coordinates from file
|
||||
m_sid_coord_list.push_back(vector<vector<float> >());
|
||||
vector<vector<float> >& sid_coord = m_sid_coord_list[m_sid_coord_list.size() - 1];
|
||||
//TODO: support extra data for btdyn, here? extra?
|
||||
sid_coord.reserve(btfix->T1->size());
|
||||
string line;
|
||||
cerr << "Loading coordinate lines for space \"" << name << "\" from " << file << endl;
|
||||
iostreams::filtering_istream in;
|
||||
ugdiss::open_input_stream(file, in);
|
||||
while(getline(in, line))
|
||||
{
|
||||
sid_coord.push_back(Scan<float>(Tokenize(line)));
|
||||
}
|
||||
cerr << "Loaded " << sid_coord.size() << " lines" << endl;
|
||||
}
|
||||
}
|
||||
|
||||
// check for unknown parameters
|
||||
vector<string> known_parameters; known_parameters.reserve(50);
|
||||
known_parameters.push_back("L1");
|
||||
@ -291,8 +321,8 @@ namespace Moses
|
||||
known_parameters.push_back("cache");
|
||||
known_parameters.push_back("coh");
|
||||
known_parameters.push_back("config");
|
||||
known_parameters.push_back("coord");
|
||||
known_parameters.push_back("cumb");
|
||||
known_parameters.push_back("dist");
|
||||
known_parameters.push_back("extra");
|
||||
known_parameters.push_back("feature-sets");
|
||||
known_parameters.push_back("input-factor");
|
||||
@ -468,19 +498,6 @@ namespace Moses
|
||||
SPTR<PScoreWC<Token> > ffwcnt(new PScoreWC<Token>("wcnt"));
|
||||
register_ff(ffwcnt,m_active_ff_common);
|
||||
}
|
||||
// Optional distance feature
|
||||
if(param["dist"] != "0")
|
||||
{
|
||||
// Now using sid coordinate list
|
||||
// (to be populated after bitext load)
|
||||
if(m_sid_coord == NULL) {
|
||||
m_sid_coord.reset(new vector<vector<float> >());
|
||||
}
|
||||
// Track sids when sampling bitext
|
||||
m_track_sids = true;
|
||||
SPTR<PScoreDist<Token> > ff(new PScoreDist<Token>(m_sid_coord, param["dist"]));
|
||||
register_ff(ff,m_active_ff_common);
|
||||
}
|
||||
}
|
||||
// cerr << "Features: " << Join("|",m_feature_names) << endl;
|
||||
this->m_numScoreComponents = this->m_feature_names.size();
|
||||
@ -524,28 +541,6 @@ namespace Moses
|
||||
if (m_extra_data.size())
|
||||
load_extra_data(m_extra_data, false);
|
||||
|
||||
// A feature (such as dist) left a note that we need to populate src
|
||||
// sentence coordinates
|
||||
if (m_sid_coord)
|
||||
{
|
||||
// We know the corpus size from the bitext
|
||||
m_sid_coord->reserve(btfix->T1->size());
|
||||
string coordfile = m_bname + L1 + ".coord.gz";
|
||||
string line;
|
||||
cerr << "Loading coordinate lines from " << coordfile << endl;
|
||||
boost::iostreams::filtering_istream in;
|
||||
ugdiss::open_input_stream(coordfile, in);
|
||||
while(getline(in, line))
|
||||
{
|
||||
m_sid_coord->push_back(Scan<float>(Tokenize(line)));
|
||||
}
|
||||
cerr << "Loaded " << m_sid_coord->size() << " lines" << endl;
|
||||
UTIL_THROW_IF2(m_sid_coord->size() != btfix->T1->size(),
|
||||
"Coordinates file size does not match bitext size ("
|
||||
<< m_sid_coord->size() << " != " << btfix->T1->size()
|
||||
<< ")");
|
||||
}
|
||||
|
||||
#if 0
|
||||
// currently not used
|
||||
LexicalPhraseScorer2<Token>::table_t & COOC = calc_lex.scorer.COOC;
|
||||
@ -587,12 +582,12 @@ namespace Moses
|
||||
if (fix)
|
||||
{
|
||||
BOOST_FOREACH(SPTR<pscorer> const& ff, m_active_ff_fix)
|
||||
(*ff)(*btfix, *fix, ttask, &fvals);
|
||||
(*ff)(*btfix, *fix, &fvals);
|
||||
}
|
||||
if (dyn)
|
||||
{
|
||||
BOOST_FOREACH(SPTR<pscorer> const& ff, m_active_ff_dyn)
|
||||
(*ff)(*dynbt, *dyn, ttask, &fvals);
|
||||
(*ff)(*dynbt, *dyn, &fvals);
|
||||
}
|
||||
|
||||
if (fix && dyn) { pool += *dyn; }
|
||||
@ -604,7 +599,7 @@ namespace Moses
|
||||
zilch.raw2 = m.approxOccurrenceCount();
|
||||
pool += zilch;
|
||||
BOOST_FOREACH(SPTR<pscorer> const& ff, m_active_ff_dyn)
|
||||
(*ff)(*dynbt, ff->allowPooling() ? pool : zilch, ttask, &fvals);
|
||||
(*ff)(*dynbt, ff->allowPooling() ? pool : zilch, &fvals);
|
||||
}
|
||||
else if (dyn)
|
||||
{
|
||||
@ -614,17 +609,17 @@ namespace Moses
|
||||
zilch.raw2 = m.approxOccurrenceCount();
|
||||
pool += zilch;
|
||||
BOOST_FOREACH(SPTR<pscorer> const& ff, m_active_ff_fix)
|
||||
(*ff)(*dynbt, ff->allowPooling() ? pool : zilch, ttask, &fvals);
|
||||
(*ff)(*dynbt, ff->allowPooling() ? pool : zilch, &fvals);
|
||||
}
|
||||
if (fix)
|
||||
{
|
||||
BOOST_FOREACH(SPTR<pscorer> const& ff, m_active_ff_common)
|
||||
(*ff)(*btfix, pool, ttask, &fvals);
|
||||
(*ff)(*btfix, pool, &fvals);
|
||||
}
|
||||
else
|
||||
{
|
||||
BOOST_FOREACH(SPTR<pscorer> const& ff, m_active_ff_common)
|
||||
(*ff)(*dynbt, pool, ttask, &fvals);
|
||||
(*ff)(*dynbt, pool, &fvals);
|
||||
}
|
||||
|
||||
TargetPhrase* tp = new TargetPhrase(const_cast<ttasksptr&>(ttask), this);
|
||||
@ -653,6 +648,21 @@ namespace Moses
|
||||
}
|
||||
#endif
|
||||
|
||||
// Track stats for rescoring non-cacheable phrases as needed
|
||||
if (m_track_coord)
|
||||
{
|
||||
cerr << btfix->toString(pool.p1, 0) << " ::: " << btfix->toString(pool.p2, 1) << endl;
|
||||
BOOST_FOREACH(uint32_t const sid, *pool.sids)
|
||||
{
|
||||
BOOST_FOREACH(vector<vector<float> > coord, m_sid_coord_list)
|
||||
{
|
||||
//TODO: store coord[sid] in tp
|
||||
cerr << " : " << Join(" ", coord[sid]);
|
||||
}
|
||||
cerr << endl;
|
||||
}
|
||||
}
|
||||
|
||||
return tp;
|
||||
}
|
||||
|
||||
@ -728,7 +738,7 @@ namespace Moses
|
||||
SPTR<ContextScope> const& scope = ttask->GetScope();
|
||||
SPTR<TPCollCache> cache = scope->get<TPCollCache>(cache_key);
|
||||
if (!cache) cache = m_cache; // no context-specific cache, use global one
|
||||
|
||||
|
||||
ret = cache->get(phrasekey, dyn->revision());
|
||||
// TO DO: we should revise the revision mechanism: we take the
|
||||
// length of the dynamic bitext (in sentences) at the time the PT
|
||||
@ -742,12 +752,12 @@ namespace Moses
|
||||
// std::cerr << ret << " with " << ret->refCount << " references at "
|
||||
// << HERE << std::endl;
|
||||
boost::upgrade_lock<boost::shared_mutex> rlock(ret->lock);
|
||||
if (ret->GetSize()) return ret;
|
||||
if (ret->GetSize()) return ret;
|
||||
|
||||
// new TPC (not found or old one was not up to date)
|
||||
boost::upgrade_to_unique_lock<boost::shared_mutex> wlock(rlock);
|
||||
// maybe another thread did the work while we waited for the lock ?
|
||||
if (ret->GetSize()) return ret;
|
||||
if (ret->GetSize()) return ret;
|
||||
|
||||
// OK: pt entry NOT found or NOT up to date
|
||||
// lookup and expansion could be done in parallel threads,
|
||||
@ -768,7 +778,7 @@ namespace Moses
|
||||
m_min_sample_size,
|
||||
m_default_sample_size,
|
||||
m_sampling_method,
|
||||
m_track_sids);
|
||||
m_track_coord);
|
||||
s();
|
||||
sfix = s.stats();
|
||||
}
|
||||
@ -956,7 +966,7 @@ namespace Moses
|
||||
{
|
||||
BitextSampler<Token> s(btfix, mfix, context->bias,
|
||||
m_min_sample_size, m_default_sample_size,
|
||||
m_sampling_method, m_track_sids);
|
||||
m_sampling_method, m_track_coord);
|
||||
if (*context->cache1->get(pid, s.stats()) == s.stats())
|
||||
m_thread_pool->add(s);
|
||||
}
|
||||
@ -977,7 +987,7 @@ namespace Moses
|
||||
for (size_t i = 0; mdyn.size() == i && i < myphrase.size(); ++i)
|
||||
mdyn.extend(myphrase[i]);
|
||||
// let's assume a uniform bias over the foreground corpus
|
||||
if (mdyn.size() == myphrase.size()) dyn->prep(ttask, mdyn, m_track_sids);
|
||||
if (mdyn.size() == myphrase.size()) dyn->prep(ttask, mdyn, m_track_coord);
|
||||
}
|
||||
return mdyn.size() == myphrase.size();
|
||||
}
|
||||
|
@ -119,9 +119,8 @@ namespace Moses
|
||||
std::vector<SPTR<pscorer > > m_active_ff_common;
|
||||
// activated feature functions (dyn)
|
||||
|
||||
// Coordinates of bitext source sentences for dist feature
|
||||
boost::shared_ptr<std::vector<std::vector<float> > > m_sid_coord;
|
||||
bool m_track_sids; // track sids when sampling bitext?
|
||||
bool m_track_coord = false; // track coordinates? Effectively: track sids when sampling bitext?
|
||||
std::vector<std::vector<std::vector<float> > > m_sid_coord_list;
|
||||
|
||||
void
|
||||
parse_factor_spec(std::vector<FactorType>& flist, std::string const key);
|
||||
|
@ -14,4 +14,3 @@
|
||||
#include "sapt_pscore_phrasecount.h" // phrase count
|
||||
#include "sapt_pscore_wordcount.h" // word count
|
||||
#include "sapt_pscore_cumulative_bias.h" // cumulative bias score
|
||||
#include "sapt_pscore_dist.h" // sample distance score
|
||||
|
@ -27,7 +27,6 @@
|
||||
|
||||
virtual void
|
||||
operator()(Bitext<Token> const& pt, PhrasePair<Token>& pp,
|
||||
ttasksptr const& ttask,
|
||||
std::vector<float> * dest=NULL) const = 0;
|
||||
|
||||
void
|
||||
|
@ -22,7 +22,6 @@ namespace sapt
|
||||
void
|
||||
operator()(Bitext<Token> const& bt,
|
||||
PhrasePair<Token>& pp,
|
||||
ttasksptr const& ttask,
|
||||
std::vector<float> * dest = NULL) const
|
||||
{
|
||||
if (!dest) dest = &pp.fvals;
|
||||
|
@ -29,7 +29,6 @@ namespace sapt {
|
||||
void
|
||||
operator()(Bitext<Token> const& bt,
|
||||
PhrasePair<Token>& pp,
|
||||
ttasksptr const& ttask,
|
||||
std::vector<float> * dest = NULL) const
|
||||
{
|
||||
if (!dest) dest = &pp.fvals;
|
||||
|
@ -1,124 +0,0 @@
|
||||
// -*- c++ -*-
|
||||
//
|
||||
// This scorer measures distance between sentences in an arbitrary N-dimensional
|
||||
// space on the source side. It provides two scores for each phrase pair:
|
||||
// * Distance to input, the average distance between training sentences and the
|
||||
// input sentence (are training points close to test point?)
|
||||
// * Training data consistency, the average distance between training sentences
|
||||
// and their centroid (are training points close to each other?)
|
||||
// Here "training sentences" refers to the subset of sentences sampled from the
|
||||
// suffix array from which the phrase pair can be extracted. The two distances
|
||||
// reported as feature scores are log-transformed.
|
||||
//
|
||||
// This requires pre-computing the coordinates of every source sentence in the
|
||||
// bitext and computing the coordinates of each input sentence at run-time.
|
||||
//
|
||||
// Specify the coordinates of bitext source sentences with a file called
|
||||
// ${CORPUS}.${L1}.coord.gz that contains lines of space-delimited floats:
|
||||
// 0.1 0.5 0.2 ...
|
||||
//
|
||||
// Specify the coordinates of input sentences (InputType m_coord) with XML input
|
||||
// using the coord tag. See www.statmt.org/moses/?n=Advanced.Hybrid#ntoc1 for
|
||||
// turning on XML input:
|
||||
// <coord coord="0.1 0.5 0.2 ..." />
|
||||
//
|
||||
// Activate this feature with "dist=MEASURE" where MEASURE is one of:
|
||||
// euc: Euclidean distance (for spaces)
|
||||
// var: total variation distance (for distributions)
|
||||
|
||||
#pragma once
|
||||
#include "sapt_pscore_base.h"
|
||||
#include "mmsapt.h"
|
||||
|
||||
#include <boost/foreach.hpp>
|
||||
|
||||
namespace sapt
|
||||
{
|
||||
template<typename Token>
|
||||
class
|
||||
PScoreDist : public PhraseScorer<Token>
|
||||
{
|
||||
enum Measure {
|
||||
EuclideanDistance,
|
||||
TotalVariationDistance,
|
||||
};
|
||||
boost::shared_ptr<std::vector<std::vector<float> > > m_sid_coord;
|
||||
Measure m_measure;
|
||||
public:
|
||||
PScoreDist(boost::shared_ptr<std::vector<std::vector<float> > > const& sid_coord,
|
||||
std::string const description)
|
||||
{
|
||||
this->m_index = -1;
|
||||
this->m_num_feats = 2;
|
||||
this->m_feature_names.push_back("dist-" + description + "-i");
|
||||
this->m_feature_names.push_back("dist-" + description + "-c");
|
||||
this->m_sid_coord = sid_coord;
|
||||
if (description == "euc") {
|
||||
this->m_measure = EuclideanDistance;
|
||||
} else if (description == "var") {
|
||||
this->m_measure = TotalVariationDistance;
|
||||
} else {
|
||||
UTIL_THROW2("Unknown specification \""
|
||||
<< description << "\" for dist phrase scorer (one of: euc var)");
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
operator()(Bitext<Token> const& bt,
|
||||
PhrasePair<Token>& pp,
|
||||
ttasksptr const& ttask,
|
||||
std::vector<float> * dest = NULL) const
|
||||
{
|
||||
if (!dest) {
|
||||
dest = &pp.fvals;
|
||||
}
|
||||
// Coordinates of input
|
||||
std::vector<float> const& input = *(ttask->GetSource()->m_coord);
|
||||
// Coordinates of training data centroid
|
||||
std::vector<float> centroid = std::vector<float>((*m_sid_coord)[0].size());
|
||||
BOOST_FOREACH(int const sid, pp.sids) {
|
||||
std::vector<float> const& point = (*m_sid_coord)[sid];
|
||||
for (size_t i = 0; i < centroid.size(); ++i) {
|
||||
centroid[i] += point[i];
|
||||
}
|
||||
}
|
||||
for (size_t i = 0; i < centroid.size(); ++i) {
|
||||
centroid[i] /= pp.sids.size();
|
||||
}
|
||||
// Compute log-average-distance of specified type from the training points
|
||||
// to both the input sentence and training centroid (max distance with
|
||||
// float epsilon to avoid domain error)
|
||||
float input_distance = 0;
|
||||
float centroid_distance = 0;
|
||||
if (m_measure == EuclideanDistance) {
|
||||
BOOST_FOREACH(int const sid, pp.sids) {
|
||||
std::vector<float> const& point = (*m_sid_coord)[sid];
|
||||
float input_point_distance = 0;
|
||||
float centroid_point_distance = 0;
|
||||
for (size_t i = 0; i < input.size(); ++i) {
|
||||
input_point_distance += pow(input[i] - point[i], 2);
|
||||
centroid_point_distance += pow(centroid[i] - point[i], 2);
|
||||
}
|
||||
input_distance += sqrt(input_point_distance);
|
||||
centroid_distance += sqrt(centroid_point_distance);
|
||||
}
|
||||
} else if (m_measure == TotalVariationDistance) {
|
||||
BOOST_FOREACH(int const sid, pp.sids) {
|
||||
std::vector<float> const& point = (*m_sid_coord)[sid];
|
||||
float input_point_distance = 0;
|
||||
float centroid_point_distance = 0;
|
||||
for (size_t i = 0; i < input.size(); ++i) {
|
||||
input_point_distance += std::abs(input[i] - point[i]);
|
||||
centroid_point_distance += std::abs(centroid[i] - point[i]);
|
||||
}
|
||||
input_distance += input_point_distance / 2;
|
||||
centroid_distance += centroid_point_distance / 2;
|
||||
}
|
||||
}
|
||||
input_distance /= pp.sids.size();
|
||||
centroid_distance /= pp.sids.size();
|
||||
(*dest)[this->m_index] = log(std::max(input_distance, Moses::FLOAT_EPSILON));
|
||||
(*dest)[this->m_index + 1] = log(std::max(centroid_distance, Moses::FLOAT_EPSILON));
|
||||
}
|
||||
};
|
||||
}
|
@ -49,7 +49,6 @@ namespace sapt {
|
||||
void
|
||||
operator()(Bitext<Token> const& bt,
|
||||
PhrasePair<Token>& pp,
|
||||
ttasksptr const& ttask,
|
||||
std::vector<float> * dest = NULL) const
|
||||
{
|
||||
if (!dest) dest = &pp.fvals;
|
||||
|
@ -37,7 +37,6 @@ namespace sapt
|
||||
void
|
||||
operator()(Bitext<Token> const& bt,
|
||||
PhrasePair<Token>& pp,
|
||||
ttasksptr const& ttask,
|
||||
std::vector<float> * dest = NULL) const
|
||||
{
|
||||
if (!dest) dest = &pp.fvals;
|
||||
|
@ -38,7 +38,6 @@ namespace sapt {
|
||||
void
|
||||
operator()(Bitext<Token> const& bt,
|
||||
PhrasePair<Token>& pp,
|
||||
ttasksptr const& ttask,
|
||||
std::vector<float> * dest = NULL) const
|
||||
{
|
||||
if (!dest) dest = &pp.fvals;
|
||||
|
@ -39,7 +39,6 @@ namespace sapt
|
||||
void
|
||||
operator()(Bitext<Token> const& bt,
|
||||
PhrasePair<Token>& pp,
|
||||
ttasksptr const& ttask,
|
||||
std::vector<float> * dest = NULL) const
|
||||
{
|
||||
if (!dest) dest = &pp.fvals;
|
||||
|
@ -40,7 +40,6 @@ namespace sapt
|
||||
void
|
||||
operator()(Bitext<Token> const& bt,
|
||||
PhrasePair<Token>& pp,
|
||||
ttasksptr const& ttask,
|
||||
std::vector<float> * dest = NULL) const
|
||||
{
|
||||
if (!dest) dest = &pp.fvals;
|
||||
|
@ -23,7 +23,6 @@ namespace sapt
|
||||
void
|
||||
operator()(Bitext<Token> const& bt,
|
||||
PhrasePair<Token>& pp,
|
||||
ttasksptr const& ttask,
|
||||
std::vector<float> * dest = NULL) const
|
||||
{
|
||||
if (!dest) dest = &pp.fvals;
|
||||
|
@ -29,7 +29,6 @@ namespace sapt {
|
||||
void
|
||||
operator()(Bitext<Token> const& bt,
|
||||
PhrasePair<Token>& pp,
|
||||
ttasksptr const& ttask,
|
||||
std::vector<float> * dest = NULL) const
|
||||
{
|
||||
if (!dest) dest = &pp.fvals;
|
||||
|
@ -27,7 +27,6 @@ namespace sapt {
|
||||
void
|
||||
operator()(Bitext<Token> const& bt,
|
||||
PhrasePair<Token>& pp,
|
||||
ttasksptr const& ttask,
|
||||
std::vector<float> * dest = NULL) const
|
||||
{
|
||||
if (!dest) dest = &pp.fvals;
|
||||
|
@ -38,7 +38,6 @@ namespace sapt
|
||||
void
|
||||
operator()(Bitext<Token> const& bt,
|
||||
PhrasePair<Token>& pp,
|
||||
ttasksptr const& ttask,
|
||||
std::vector<float> * dest = NULL) const
|
||||
{
|
||||
if (!dest) dest = &pp.fvals;
|
||||
|
@ -22,9 +22,8 @@ namespace sapt
|
||||
|
||||
void
|
||||
operator()(Bitext<Token> const& bt,
|
||||
PhrasePair<Token>& pp,
|
||||
ttasksptr const& ttask,
|
||||
std::vector<float> * dest = NULL) const
|
||||
PhrasePair<Token>& pp,
|
||||
std::vector<float> * dest = NULL) const
|
||||
{
|
||||
if (!dest) dest = &pp.fvals;
|
||||
(*dest)[this->m_index] = pp.len2;
|
||||
|
@ -402,12 +402,37 @@ ProcessAndStripXMLTags(AllOptions const& opts, string &line,
|
||||
StaticData::InstanceNonConst().SetAllWeights(allWeights);
|
||||
}
|
||||
|
||||
// coord: coordinate(s) of the input sentence in some space
|
||||
// (one or more floats)
|
||||
// Coord: coordinates of the input sentence in a user-defined space
|
||||
// <coord space="NAME" coord="X Y Z ..." />
|
||||
// where NAME is the name of the space and X Y Z ... are floats. See
|
||||
// PhraseDictionaryBitextSampling (Mmsapt), whose "coord" parameter names
|
||||
// the spaces it tracks, for an example of a consumer of this information.
|
||||
else if (tagName == "coord") {
|
||||
// Parse tag
|
||||
string space = ParseXmlTagAttribute(tagContent, "space");
|
||||
vector<string> toks = Tokenize(ParseXmlTagAttribute(tagContent, "coord"));
|
||||
input.m_coord.reset(new vector<float>());
|
||||
Scan<float>(*(input.m_coord), toks);
|
||||
boost::shared_ptr<vector<float> > coord(new vector<float>);
|
||||
Scan<float>(*coord, toks);
|
||||
// Init if needed
|
||||
if (!input.m_pd2InputCoord) {
|
||||
input.m_pd2InputCoord.reset(new std::map<PhraseDictionary const*, std::vector<boost::shared_ptr<std::vector<float> > > >);
|
||||
}
|
||||
// Scan phrase dictionaries to see which (if any) use this space
|
||||
BOOST_FOREACH(PhraseDictionary const* pd, PhraseDictionary::GetColl()) {
|
||||
const vector<string>& pdKnownSpaces = pd->GetKnownSpaces();
|
||||
for (size_t i = 0; i < pdKnownSpaces.size(); ++i) {
|
||||
// Match
|
||||
if (pdKnownSpaces[i] == space) {
|
||||
// Make sure a slot to store the coordinates exists
|
||||
std::vector<boost::shared_ptr<std::vector<float> > >& inputCoord = (*input.m_pd2InputCoord)[pd];
|
||||
if (inputCoord.size() < i + 1) {
|
||||
inputCoord.resize(i + 1);
|
||||
}
|
||||
// Store
|
||||
inputCoord[i] = coord;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// default: opening tag that specifies translation options
|
||||
|
Loading…
Reference in New Issue
Block a user