Merge pull request #165 from moses-smt/mjdenkowski

Phrase distance feature
Michael Denkowski 2016-08-12 12:20:23 -04:00 committed by GitHub
commit 950c7de458
36 changed files with 435 additions and 59 deletions
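A quick orientation for reviewers: the sketch below shows how the pieces in this diff are meant to fit together at run time. The moses.ini lines and file names are illustrative assumptions, not taken from this commit; only the parameter names (space, measure with choices euc/var, the phrase table's coord=name:file list) and the coord XML tag come from the code changes below.

[feature]
# two dense scores: log distance to the input coordinate, log distance to the phrase's own centroid
PhraseDistanceFeature name=PD0 space=domain measure=euc
# suffix-array phrase table loads one coordinate line per training sentence for space "domain"
PhraseDictionaryBitextSampling name=PT0 ... coord=domain:train.domain-coords.gz

[weight]
PD0= 0.1 0.1

# decoder input (with XML handling enabled): tag each sentence with its coordinate in the same space
<coord space="domain" coord="0.2 1.5" /> the sentence to translate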

View File

@ -1319,6 +1319,16 @@
<name>FF/PhraseBoundaryFeature.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/PhraseBoundaryFeature.h</locationURI>
</link>
<link>
<name>FF/PhraseDistanceFeature.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/PhraseDistanceFeature.cpp</locationURI>
</link>
<link>
<name>FF/PhraseDistanceFeature.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/PhraseDistanceFeature.h</locationURI>
</link>
<link>
<name>FF/PhraseLengthFeature.cpp</name>
@ -3654,7 +3664,7 @@
<name>TranslationModel/UG/sapt_pscore_coherence.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/sapt_pscore_coherence.h</locationURI>
</link>
</link>
<link>
<name>TranslationModel/UG/sapt_pscore_lex1.h</name>
<type>1</type>
@ -3699,7 +3709,7 @@
<name>TranslationModel/UG/sapt_pscore_wordcount.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/sapt_pscore_wordcount.h</locationURI>
</link>
</link>
<link>
<name>TranslationModel/UG/sim-pe.cc</name>
<type>1</type>

View File

@ -30,6 +30,7 @@
#include "moses/FF/TargetBigramFeature.h"
#include "moses/FF/TargetNgramFeature.h"
#include "moses/FF/PhraseBoundaryFeature.h"
#include "moses/FF/PhraseDistanceFeature.h"
#include "moses/FF/PhrasePairFeature.h"
#include "moses/FF/RulePairUnlexicalizedSource.h"
#include "moses/FF/PhraseLengthFeature.h"
@ -252,6 +253,7 @@ FeatureRegistry::FeatureRegistry()
MOSES_FNAME(SourceWordDeletionFeature);
MOSES_FNAME(TargetWordInsertionFeature);
MOSES_FNAME(PhraseBoundaryFeature);
MOSES_FNAME(PhraseDistanceFeature);
MOSES_FNAME(PhraseLengthFeature);
MOSES_FNAME(WordTranslationFeature);
MOSES_FNAME(TargetBigramFeature);

View File

@ -0,0 +1,123 @@
#include "PhraseDistanceFeature.h"
#include <algorithm>
#include <cmath>
#include <vector>
#include <boost/foreach.hpp>
#include "moses/InputType.h"
#include "moses/ScoreComponentCollection.h"
#include "moses/StaticData.h"
#include "util/exception.hh"
using namespace std;
namespace Moses
{
PhraseDistanceFeature::PhraseDistanceFeature(const string &line)
: StatelessFeatureFunction(2, line)
, m_space("")
, m_spaceID(0)
, m_measure(EuclideanDistance)
{
ReadParameters();
}
void PhraseDistanceFeature::EvaluateWithSourceContext(const InputType &input
, const InputPath &inputPath
, const TargetPhrase &targetPhrase
, const StackVec *stackVec
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection *estimatedScores) const
{
vector<float> scores(m_numScoreComponents, 0);
bool broken = false;
// Input coord
map<size_t const, vector<float> >::const_iterator ii;
if (input.m_coordMap) {
ii = input.m_coordMap->find(m_spaceID);
if (ii == input.m_coordMap->end()) {
TRACE_ERR("No coordinates for space " << m_space << " on input (specify with coord XML tag)" << endl);
TRACE_ERR("Scores for " << m_description << " will be incorrect and probably all zeros" << endl);
broken = true;
}
} else {
// No coordinate map at all: do not dereference the null pointer below
TRACE_ERR("No coordinates for space " << m_space << " on input (specify with coord XML tag)" << endl);
TRACE_ERR("Scores for " << m_description << " will be incorrect and probably all zeros" << endl);
broken = true;
}
// Target phrase coord
vector<SPTR<vector<float> > > const* tpp = targetPhrase.GetCoordList(m_spaceID);
if (tpp == NULL) {
TRACE_ERR("No coordinates for space " << m_space << " on target phrase (PhraseDictionary implementation needs to set)" << endl);
TRACE_ERR("Scores for " << m_description << " will be incorrect and probably all zeros" << endl);
broken = true;
}
// Compute scores
if (!broken) {
vector<float> const& inputCoord = ii->second;
vector<SPTR<vector<float> > > const& tpCoord = *tpp;
// Centroid of target phrase instances (from phrase extraction)
vector<float> centroid = vector<float>(inputCoord.size(), 0);
BOOST_FOREACH(SPTR<vector<float> > const coord, tpCoord) {
for (size_t i = 0; i < inputCoord.size(); ++i) {
centroid[i] += (*coord)[i];
}
}
for (size_t i = 0; i < inputCoord.size(); ++i) {
centroid[i] /= tpCoord.size();
}
// Average distance from the target phrase instances to (1) the input and
// (2) the target phrase centroid
float inputDistance = 0;
float centroidDistance = 0;
if (m_measure == EuclideanDistance) {
BOOST_FOREACH(SPTR<vector<float> > const coord, tpCoord) {
float pointInputDistance = 0;
float pointCentroidDistance = 0;
for (size_t i = 0; i < inputCoord.size(); ++i) {
pointInputDistance += pow(inputCoord[i] - (*coord)[i], 2);
pointCentroidDistance += pow(centroid[i] - (*coord)[i], 2);
}
inputDistance += sqrt(pointInputDistance);
centroidDistance += sqrt(pointCentroidDistance);
}
} else if (m_measure == TotalVariationDistance) {
BOOST_FOREACH(SPTR<vector<float> > const coord, tpCoord) {
float pointInputDistance = 0;
float pointCentroidDistance = 0;
for (size_t i = 0; i < inputCoord.size(); ++i) {
pointInputDistance += std::fabs(inputCoord[i] - (*coord)[i]); // fabs: avoid binding to the int abs() overload
pointCentroidDistance += std::fabs(centroid[i] - (*coord)[i]);
}
inputDistance += pointInputDistance / 2;
centroidDistance += pointCentroidDistance / 2;
}
}
inputDistance /= tpCoord.size();
centroidDistance /= tpCoord.size();
// Log transform scores, max with float epsilon to avoid domain error
scores[0] = log(max(inputDistance, Moses::FLOAT_EPSILON));
scores[1] = log(max(centroidDistance, Moses::FLOAT_EPSILON));
}
// Set scores
scoreBreakdown.Assign(this, scores);
return;
}
void PhraseDistanceFeature::SetParameter(const string& key, const string& value)
{
if (key == "space") {
m_space = value;
m_spaceID = StaticData::InstanceNonConst().MapCoordSpace(m_space);
} else if (key == "measure") {
if (value == "euc") {
m_measure = EuclideanDistance;
} else if (value == "var") {
m_measure = TotalVariationDistance;
} else {
UTIL_THROW2("Unknown measure " << value << ", choices: euc var");
}
} else {
StatelessFeatureFunction::SetParameter(key, value);
}
}
} // namespace
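The distance computation above is compact but easy to misread. Here is a minimal standalone sketch of the same math (my own names, no Moses types), for one hypothetical target phrase whose sampled instances carry 2-D coordinates; it reproduces the centroid, the averaged Euclidean distances, and the epsilon-floored log transform:

// phrase_distance_sketch.cc -- illustrative only, mirrors PhraseDistanceFeature's Euclidean path
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
  // Coordinates of the input sentence and of three sampled instances of a target phrase
  std::vector<float> input = {0.2f, 1.5f};
  std::vector<std::vector<float> > instances = {{0.0f, 1.0f}, {0.5f, 2.0f}, {0.1f, 1.2f}};
  const float kEps = 1e-6f;  // stand-in for Moses::FLOAT_EPSILON

  // Centroid of the target phrase instances (from phrase extraction)
  std::vector<float> centroid(input.size(), 0.0f);
  for (const auto& c : instances)
    for (size_t i = 0; i < c.size(); ++i) centroid[i] += c[i];
  for (float& v : centroid) v /= instances.size();

  // Average Euclidean distance from the instances to (1) the input and (2) the centroid
  float inputDist = 0, centroidDist = 0;
  for (const auto& c : instances) {
    float di = 0, dc = 0;
    for (size_t i = 0; i < c.size(); ++i) {
      di += std::pow(input[i] - c[i], 2.0f);
      dc += std::pow(centroid[i] - c[i], 2.0f);
    }
    inputDist += std::sqrt(di);
    centroidDist += std::sqrt(dc);
  }
  inputDist /= instances.size();
  centroidDist /= instances.size();

  // Log transform with an epsilon floor, exactly as the feature does
  std::printf("score[0] = %f\n", std::log(std::max(inputDist, kEps)));
  std::printf("score[1] = %f\n", std::log(std::max(centroidDist, kEps)));
  return 0;
}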

View File

@ -0,0 +1,57 @@
#pragma once
#include "StatelessFeatureFunction.h"
namespace Moses
{
class PhraseDistanceFeature : public StatelessFeatureFunction
{
enum Measure
{
EuclideanDistance,
TotalVariationDistance,
};
public:
PhraseDistanceFeature(const std::string &line);
bool IsUseable(const FactorMask &mask) const {
return true;
}
virtual void EvaluateInIsolation(const Phrase &source
, const TargetPhrase &targetPhrase
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection &estimatedScores) const {
}
void EvaluateWhenApplied(const Hypothesis& hypo,
ScoreComponentCollection* accumulator) const {
}
void EvaluateWhenApplied(const ChartHypothesis &hypo,
ScoreComponentCollection* accumulator) const {
}
void EvaluateWhenApplied(const Syntax::SHyperedge &hyperedge,
ScoreComponentCollection* accumulator) const {
}
void EvaluateWithSourceContext(const InputType &input
, const InputPath &inputPath
, const TargetPhrase &targetPhrase
, const StackVec *stackVec
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection *estimatedScores = NULL) const;
void EvaluateTranslationOptionListWithSourceContext(const InputType &input
, const TranslationOptionList &translationOptionList) const {
}
void SetParameter(const std::string& key, const std::string& value);
protected:
Measure m_measure;
std::string m_space;
size_t m_spaceID;
};
} //namespace

View File

@ -68,6 +68,9 @@ public:
size_t m_frontSpanCoveredLength;
// how many words from the beginning are covered
// Coordinates in user-defined spaces (see "coord" XML tag)
SPTR<std::map<size_t const, std::vector<float> > > m_coordMap;
InputType(AllOptions::ptr const& opts, long translationId = 0);
virtual ~InputType();

View File

@ -154,7 +154,8 @@ aux_interpret_xml(std::string& line, std::vector<size_t> & xmlWalls,
bool OK = ProcessAndStripXMLTags(*m_options, line,
m_xmlOptions,
m_reorderingConstraint,
xmlWalls, placeholders);
xmlWalls, placeholders,
*this);
if (!OK) {
TRACE_ERR("Unable to parse XML in line: " << line);
}

View File

@ -936,4 +936,25 @@ void StaticData::ResetWeights(const std::string &denseWeights, const std::string
}
}
size_t StaticData::GetCoordSpace(string space) const
{
map<string, size_t>::const_iterator m = m_coordSpaceMap.find(space);
if(m == m_coordSpaceMap.end()) {
return 0;
}
return m->second;
}
size_t StaticData::MapCoordSpace(string space)
{
map<string, size_t>::const_iterator m = m_coordSpaceMap.find(space);
if (m != m_coordSpaceMap.end()) {
return m->second;
}
size_t id = m_coordSpaceNextID;
m_coordSpaceNextID += 1;
m_coordSpaceMap[space] = id;
return id;
}
} // namespace
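The two helpers above form a small name-to-ID registry shared by feature functions (which register spaces via MapCoordSpace at load time) and XML input parsing (which looks names up via GetCoordSpace at decode time); IDs start at 1 so that 0 can mean "no model uses this space". A minimal stand-alone sketch of that contract, with names of my own:

// coord_space_registry_sketch.cc -- illustrative stand-in for StaticData's coord space map
#include <cstdio>
#include <map>
#include <string>

static std::map<std::string, size_t> spaces;
static size_t nextID = 1;  // 0 is reserved to mean "unknown space"

size_t MapCoordSpace(const std::string& name) {   // registration side (feature functions, phrase tables)
  auto it = spaces.find(name);
  if (it != spaces.end()) return it->second;
  return spaces[name] = nextID++;
}

size_t GetCoordSpace(const std::string& name) {   // lookup side (coord XML tag)
  auto it = spaces.find(name);
  return it == spaces.end() ? 0 : it->second;
}

int main() {
  std::printf("%zu\n", MapCoordSpace("domain"));   // 1: first registration
  std::printf("%zu\n", MapCoordSpace("domain"));   // 1: same space, same ID
  std::printf("%zu\n", GetCoordSpace("domain"));   // 1
  std::printf("%zu\n", GetCoordSpace("unknown"));  // 0: no model uses this space
}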

View File

@ -60,7 +60,7 @@ class PhraseDictionaryDynamicCacheBased;
typedef std::pair<std::string, float> UnknownLHSEntry;
typedef std::vector<UnknownLHSEntry> UnknownLHSList;
/** Contains global variables and contants.
/** Contains global variables and constants.
* Only 1 object of this class should be instantiated.
* A const object of this class is accessible by any function during decoding by calling StaticData::Instance();
*/
@ -152,6 +152,12 @@ protected:
bool ini_performance_options();
void initialize_features();
// Coordinate space name map for matching spaces across XML input ("coord"
// tag) and feature functions that assign or use coordinates on target phrases
std::map< std::string const, size_t > m_coordSpaceMap;
size_t m_coordSpaceNextID = 1;
public:
//! destructor
@ -394,6 +400,9 @@ public:
return m_requireSortingAfterSourceContext;
}
// Coordinate spaces
size_t GetCoordSpace(std::string space) const;
size_t MapCoordSpace(std::string space);
};
}

View File

@ -125,6 +125,7 @@ TargetPhrase::TargetPhrase(const Phrase &phrase, const PhraseDictionary *pt)
TargetPhrase::TargetPhrase(const TargetPhrase &copy)
: Phrase(copy)
, m_cached_coord(copy.m_cached_coord)
, m_cached_scores(copy.m_cached_scores)
, m_scope(copy.m_scope)
, m_futureScore(copy.m_futureScore)
@ -333,6 +334,31 @@ SetExtraScores(FeatureFunction const* ff,
m_cached_scores[ff] = s;
}
vector<SPTR<vector<float> > > const*
TargetPhrase::
GetCoordList(size_t const spaceID) const
{
if(!m_cached_coord) {
return NULL;
}
CoordCache_t::const_iterator m = m_cached_coord->find(spaceID);
if(m == m_cached_coord->end()) {
return NULL;
}
return &m->second;
}
void
TargetPhrase::
PushCoord(size_t const spaceID,
SPTR<vector<float> > const coord)
{
if (!m_cached_coord) {
m_cached_coord.reset(new CoordCache_t);
}
vector<SPTR<vector<float> > >& coordList = (*m_cached_coord)[spaceID];
coordList.push_back(coord);
}
void TargetPhrase::SetProperties(const StringPiece &str)
{

View File

@ -56,9 +56,13 @@ public:
Scores const* GetExtraScores(FeatureFunction const* ff) const;
void SetExtraScores(FeatureFunction const* ff,boost::shared_ptr<Scores> const& scores);
typedef std::map<size_t const, std::vector<SPTR<std::vector<float> > > > CoordCache_t;
std::vector<SPTR<std::vector<float> > > const* GetCoordList(size_t const spaceID) const;
void PushCoord(size_t const spaceID, SPTR<std::vector<float> > const coord);
private:
ScoreCache_t m_cached_scores;
SPTR<CoordCache_t> m_cached_coord;
WPTR<ContextScope> m_scope;
private:

View File

@ -130,7 +130,6 @@ namespace sapt
mutable pplist_cache_t m_pplist_cache1, m_pplist_cache2;
// caches for unbiased sampling; biased sampling uses the caches that
// are stored locally on the translation task
public:
SPTR<Ttrack<char> > Tx; // word alignments
SPTR<Ttrack<Token> > T1; // token track
@ -164,7 +163,8 @@ namespace sapt
#ifndef NO_MOSES
SPTR<pstats>
prep2(ttasksptr const& ttask, iter const& phrase, int max_sample = -1) const;
prep2(ttasksptr const& ttask, iter const& phrase, bool const track_sids,
int max_sample = -1) const;
#endif
protected:
@ -189,7 +189,7 @@ namespace sapt
SPTR<pstats>
lookup(ttasksptr const& ttask, iter const& phrase, int max_sample = -1) const;
void prep(ttasksptr const& ttask, iter const& phrase) const;
void prep(ttasksptr const& ttask, iter const& phrase, bool const track_sids) const;
#endif
void setDefaultSampleSize(size_t const max_samples);

View File

@ -33,7 +33,8 @@ public:
SPTR<pstats>
add_job(Bitext<Token> const* const theBitext,
typename TSA<Token>::tree_iterator const& phrase,
size_t const max_samples, SPTR<SamplingBias const> const& bias);
size_t const max_samples, SPTR<SamplingBias const> const& bias,
bool const track_sids);
// add_job(Bitext<Token> const* const theBitext,
// typename TSA<Token>::tree_iterator const& phrase,
// size_t const max_samples, SamplingBias const* const bias);
@ -93,13 +94,14 @@ SPTR<pstats> Bitext<Token>
::agenda
::add_job(Bitext<Token> const* const theBitext,
typename TSA<Token>::tree_iterator const& phrase,
size_t const max_samples, SPTR<SamplingBias const> const& bias)
size_t const max_samples, SPTR<SamplingBias const> const& bias,
bool const track_sids)
{
boost::unique_lock<boost::mutex> lk(this->lock);
static boost::posix_time::time_duration nodelay(0,0,0,0);
bool fwd = phrase.root == bt.I1.get();
SPTR<job> j(new job(theBitext, phrase, fwd ? bt.I1 : bt.I2,
max_samples, fwd, bias));
max_samples, fwd, bias, track_sids));
j->stats->register_worker();
joblist.push_back(j);

View File

@ -35,6 +35,8 @@ public:
SPTR<pstats> stats; // stores statistics collected during sampling
SPTR<SamplingBias const> const m_bias; // sentence-level bias for sampling
float bias_total;
bool m_track_sids; // track sentence ids in sample?
bool nextSample(uint64_t & sid, uint64_t & offset); // select next occurrence
int
@ -46,7 +48,7 @@ public:
job(Bitext<Token> const* const theBitext,
typename TSA<Token>::tree_iterator const& m,
SPTR<TSA<Token> > const& r, size_t maxsmpl, bool isfwd,
SPTR<SamplingBias const> const& bias);
SPTR<SamplingBias const> const& bias, bool const track_sids);
~job();
};
@ -66,7 +68,8 @@ Bitext<Token>::agenda::job
::job(Bitext<Token> const* const theBitext,
typename TSA<Token>::tree_iterator const& m,
SPTR<TSA<Token> > const& r, size_t maxsmpl,
bool isfwd, SPTR<SamplingBias const> const& bias)
bool isfwd, SPTR<SamplingBias const> const& bias,
bool const track_sids)
: m_bitext(theBitext)
, rnd(0)
, rnddenom(rnd.max() + 1.)
@ -80,8 +83,9 @@ Bitext<Token>::agenda::job
, len(m.size())
, fwd(isfwd)
, m_bias(bias)
, m_track_sids(track_sids)
{
stats.reset(new pstats());
stats.reset(new pstats(m_track_sids));
stats->raw_cnt = m.approxOccurrenceCount();
bias_total = 0;

View File

@ -90,7 +90,7 @@ Bitext<Token>::agenda
size_t raw2 = b->approxOccurrenceCount();
float bwgt = j->m_bias ? (*j->m_bias)[sid] : 1;
j->stats->add(tpid, sample_weight, bwgt, aln, raw2,
po_fwd, po_bwd, docid);
po_fwd, po_bwd, docid, sid);
bool ok = (i == e2) || b->extend(o[i].id());
UTIL_THROW_IF2(!ok, "Could not extend target phrase.");
}

View File

@ -29,6 +29,7 @@ namespace sapt
my_wcnt = other.wcnt();
my_bcnt = other.bcnt();
my_aln = other.aln();
sids = other.sids;
indoc = other.indoc;
for (int i = 0; i <= LRModel::NONE; i++)
{
@ -56,7 +57,8 @@ namespace sapt
size_t
jstats::
add(float w, float b, std::vector<unsigned char> const& a, uint32_t const cnt2,
uint32_t fwd_orient, uint32_t bwd_orient, int const docid)
uint32_t fwd_orient, uint32_t bwd_orient, int const docid,
uint32_t const sid, bool const track_sid)
{
boost::lock_guard<boost::mutex> lk(this->lock);
my_cnt2 = cnt2;
@ -76,6 +78,13 @@ namespace sapt
}
++ofwd[fwd_orient];
++obwd[bwd_orient];
// Record sentence id if requested
if (track_sid)
{
if (!sids)
sids.reset(new std::vector<uint32_t>);
sids->push_back(sid);
}
if (docid >= 0)
{
// while (int(indoc.size()) <= docid) indoc.push_back(0);
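The track_sids plumbing threaded through the bitext, agenda, pstats and jstats changes boils down to this: when requested, each phrase pair's statistics lazily allocate a sentence-id list and append the id of every sample. A toy stand-in (my own types, not the sapt classes):

// sid_tracking_sketch.cc -- distilled version of the optional sentence-id recording
#include <cstdint>
#include <cstdio>
#include <memory>
#include <vector>

struct JStats {                                        // stand-in for sapt::jstats
  std::shared_ptr<std::vector<uint32_t> > sids;        // allocated lazily, only when tracking
  void add(uint32_t sid, bool track_sid) {
    if (track_sid) {
      if (!sids) sids.reset(new std::vector<uint32_t>);
      sids->push_back(sid);                            // remember where this sample came from
    }
  }
};

int main() {
  JStats js;
  js.add(42, /*track_sid=*/true);
  js.add(7,  /*track_sid=*/true);
  std::printf("%zu sampled sentence ids\n", js.sids ? js.sids->size() : (size_t)0);  // 2
}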

View File

@ -28,6 +28,7 @@ namespace sapt
uint32_t obwd[LRModel::NONE+1]; // backward distortion type counts
public:
SPTR<std::vector<uint32_t> > sids; // list of sentence ids in this sample
std::map<uint32_t,uint32_t> indoc;
// std::vector<uint32_t> indoc; // counts origin of samples (for biased sampling)
jstats();
@ -41,7 +42,8 @@ namespace sapt
size_t
add(float w, float b, std::vector<unsigned char> const& a, uint32_t const cnt2,
uint32_t fwd_orient, uint32_t bwd_orient, int const docid);
uint32_t fwd_orient, uint32_t bwd_orient, int const docid, uint32_t const sid,
bool const track_sid);
void invalidate();
void validate();

View File

@ -30,9 +30,9 @@ lookup(ttasksptr const& ttask, iter const& phrase, int max_sample) const
template<typename Token>
void
Bitext<Token>::
prep(ttasksptr const& ttask, iter const& phrase) const
prep(ttasksptr const& ttask, iter const& phrase, bool const track_sids) const
{
prep2(ttask, phrase, m_default_sample_size);
prep2(ttask, phrase, track_sids, m_default_sample_size);
}
@ -44,7 +44,8 @@ template<typename Token>
SPTR<pstats>
Bitext<Token>
::prep2
( ttasksptr const& ttask, iter const& phrase, int max_sample) const
( ttasksptr const& ttask, iter const& phrase, bool const track_sids,
int max_sample) const
{
if (max_sample < 0) max_sample = m_default_sample_size;
SPTR<SamplingBias> bias;
@ -74,7 +75,7 @@ Bitext<Token>
if (m_num_workers > 1)
ag->add_workers(m_num_workers);
}
ret = ag->add_job(this, phrase, max_sample, bias);
ret = ag->add_job(this, phrase, max_sample, bias, track_sids);
if (cache) cache->set(phrase.getPid(),ret);
UTIL_THROW_IF2(ret == NULL, "Couldn't schedule sampling job.");
return ret;

View File

@ -10,7 +10,7 @@ namespace sapt
#endif
pstats::
pstats() : raw_cnt(0), sample_cnt(0), good(0), sum_pairs(0), in_progress(0)
pstats(bool const track_sids) : raw_cnt(0), sample_cnt(0), good(0), sum_pairs(0), in_progress(0), track_sids(track_sids)
{
for (int i = 0; i <= LRModel::NONE; ++i)
ofwd[i] = obwd[i] = 0;
@ -69,11 +69,11 @@ namespace sapt
std::vector<unsigned char> const& a,
uint32_t const cnt2,
uint32_t fwd_o,
uint32_t bwd_o, int const docid)
uint32_t bwd_o, int const docid, uint32_t const sid)
{
boost::lock_guard<boost::mutex> guard(this->lock);
jstats& entry = this->trg[pid];
size_t ret = entry.add(w, b, a, cnt2, fwd_o, bwd_o, docid);
size_t ret = entry.add(w, b, a, cnt2, fwd_o, bwd_o, docid, sid, track_sids);
if (this->good < entry.rcnt())
{
UTIL_THROW(util::Exception, "more joint counts than good counts:"

View File

@ -35,7 +35,8 @@ namespace sapt
indoc_map_t indoc;
trg_map_t trg;
pstats();
bool track_sids;
pstats(bool const track_sids);
~pstats();
void release();
void register_worker();
@ -49,7 +50,8 @@ namespace sapt
uint32_t const cnt2, // raw target phrase count
uint32_t fwd_o, // fwd. phrase orientation
uint32_t bwd_o, // bwd. phrase orientation
int const docid); // document where sample was found
int const docid, // document where sample was found
uint32_t const sid); // index of sentence where sample was found
void
count_sample(int const docid, // document where sample was found

View File

@ -70,6 +70,7 @@ BitextSampler : public Moses::reference_counter
size_t m_num_occurrences; // estimated number of phrase occurrences in corpus
boost::taus88 m_rnd; // every job has its own pseudo random generator
double m_bias_total;
bool m_track_sids; // track sentence ids in stats?
size_t consider_sample(TokenPosition const& p);
size_t perform_random_sampling();
@ -86,7 +87,8 @@ public:
SPTR<SamplingBias const> const& bias,
size_t const min_samples,
size_t const max_samples,
sampling_method const method);
sampling_method const method,
bool const track_sids);
~BitextSampler();
SPTR<pstats> stats();
bool done() const;
@ -185,7 +187,7 @@ BitextSampler<Token>::
BitextSampler(SPTR<Bitext<Token> const> const& bitext,
typename bitext::iter const& phrase,
SPTR<SamplingBias const> const& bias, size_t const min_samples, size_t const max_samples,
sampling_method const method)
sampling_method const method, bool const track_sids)
: m_bitext(bitext)
, m_plen(phrase.size())
, m_fwd(phrase.root == bitext->I1.get())
@ -201,8 +203,9 @@ BitextSampler(SPTR<Bitext<Token> const> const& bitext,
, m_finished(false)
, m_num_occurrences(phrase.ca())
, m_rnd(0)
, m_track_sids(track_sids)
{
m_stats.reset(new pstats);
m_stats.reset(new pstats(m_track_sids));
m_stats->raw_cnt = phrase.ca();
m_stats->register_worker();
}
@ -332,7 +335,8 @@ consider_sample(TokenPosition const& p)
size_t raw2 = b->approxOccurrenceCount();
size_t evid = m_stats->add(tpid, sample_weight,
m_bias ? (*m_bias)[p.sid] : 1,
aln, raw2, rec.po_fwd, rec.po_bwd, docid);
aln, raw2, rec.po_fwd, rec.po_bwd, docid,
p.sid);
max_evidence = std::max(max_evidence, evid);
bool ok = (i == rec.e2) || b->extend(o[i].id());
UTIL_THROW_IF2(!ok, "Could not extend target phrase.");

View File

@ -31,6 +31,8 @@ namespace sapt
std::vector<unsigned char> aln;
float score;
bool inverse;
SPTR<std::vector<uint32_t> > sids; // list of sampled sentence ids where
// this phrase pair was found
// std::vector<uint32_t> indoc;
std::map<uint32_t,uint32_t> indoc;
PhrasePair() { };
@ -132,6 +134,7 @@ namespace sapt
dbwd[i] = js.dcnt_bwd(po);
}
sids = js.sids;
indoc = js.indoc;
return *this;
}
@ -182,6 +185,8 @@ namespace sapt
sample2 += o.sample2;
cum_bias += o.cum_bias;
// todo: add distortion counts
if (sids && o.sids)
sids->insert(sids->end(), o.sids->begin(), o.sids->end());
return *this;
}
@ -199,6 +204,7 @@ namespace sapt
, aln(o.aln)
, score(o.score)
, inverse(o.inverse)
, sids(o.sids)
, indoc(o.indoc)
{
for (int i = 0; i <= LRModel::NONE; ++i)

View File

@ -275,6 +275,40 @@ namespace Moses
m = param.find("name");
if (m != param.end()) m_name = m->second;
// Optional coordinates for training corpus
// Takes form coord=name1:file1.gz,name2:file2.gz,...
// Names should match with XML input (coord tag)
param.insert(pair<string,string>("coord","0"));
if(param["coord"] != "0")
{
m_track_coord = true;
vector<string> coord_instances = Tokenize(param["coord"], ",");
BOOST_FOREACH(std::string instance, coord_instances)
{
vector<string> toks = Moses::Tokenize(instance, ":");
string space = toks[0];
string file = toks[1];
// Register that this model uses the given space
m_coord_spaces.push_back(StaticData::InstanceNonConst().MapCoordSpace(space));
// Load sid coordinates from file
m_sid_coord_list.push_back(vector<SPTR<vector<float> > >());
vector<SPTR<vector<float> > >& sid_coord = m_sid_coord_list[m_sid_coord_list.size() - 1];
//TODO: support extra data for btdyn, here? extra?
sid_coord.reserve(btfix->T1->size());
string line;
cerr << "Loading coordinate lines for space \"" << space << "\" from " << file << endl;
iostreams::filtering_istream in;
ugdiss::open_input_stream(file, in);
while(getline(in, line))
{
SPTR<vector<float> > coord(new vector<float>);
Scan<float>(*coord, Tokenize(line));
sid_coord.push_back(coord);
}
cerr << "Loaded " << sid_coord.size() << " lines" << endl;
}
}
// check for unknown parameters
vector<string> known_parameters; known_parameters.reserve(50);
known_parameters.push_back("L1");
@ -290,6 +324,7 @@ namespace Moses
known_parameters.push_back("cache");
known_parameters.push_back("coh");
known_parameters.push_back("config");
known_parameters.push_back("coord");
known_parameters.push_back("cumb");
known_parameters.push_back("extra");
known_parameters.push_back("feature-sets");
@ -616,6 +651,29 @@ namespace Moses
}
#endif
// Track coordinates if requested
if (m_track_coord)
{
BOOST_FOREACH(uint32_t const sid, *pool.sids)
{
for(size_t i = 0; i < m_coord_spaces.size(); ++i)
{
tp->PushCoord(m_coord_spaces[i], m_sid_coord_list[i][sid]);
}
}
/*
cerr << btfix->toString(pool.p1, 0) << " ::: " << btfix->toString(pool.p2, 1);
BOOST_FOREACH(size_t id, m_coord_spaces)
{
cerr << " [" << id << "]";
vector<vector<float> const*> const* coordList = tp->GetCoordList(id);
BOOST_FOREACH(vector<float> const* coord, *coordList)
cerr << " : " << Join(" ", *coord);
}
cerr << endl;
*/
}
return tp;
}
@ -691,7 +749,7 @@ namespace Moses
SPTR<ContextScope> const& scope = ttask->GetScope();
SPTR<TPCollCache> cache = scope->get<TPCollCache>(cache_key);
if (!cache) cache = m_cache; // no context-specific cache, use global one
ret = cache->get(phrasekey, dyn->revision());
// TO DO: we should revise the revision mechanism: we take the
// length of the dynamic bitext (in sentences) at the time the PT
@ -705,12 +763,12 @@ namespace Moses
// std::cerr << ret << " with " << ret->refCount << " references at "
// << HERE << std::endl;
boost::upgrade_lock<boost::shared_mutex> rlock(ret->lock);
if (ret->GetSize()) return ret;
if (ret->GetSize()) return ret;
// new TPC (not found or old one was not up to date)
boost::upgrade_to_unique_lock<boost::shared_mutex> wlock(rlock);
// maybe another thread did the work while we waited for the lock ?
if (ret->GetSize()) return ret;
if (ret->GetSize()) return ret;
// OK: pt entry NOT found or NOT up to date
// lookup and expansion could be done in parallel threads,
@ -730,7 +788,8 @@ namespace Moses
BitextSampler<Token> s(btfix, mfix, context->bias,
m_min_sample_size,
m_default_sample_size,
m_sampling_method);
m_sampling_method,
m_track_coord);
s();
sfix = s.stats();
}
@ -918,7 +977,7 @@ namespace Moses
{
BitextSampler<Token> s(btfix, mfix, context->bias,
m_min_sample_size, m_default_sample_size,
m_sampling_method);
m_sampling_method, m_track_coord);
if (*context->cache1->get(pid, s.stats()) == s.stats())
m_thread_pool->add(s);
}
@ -939,7 +998,7 @@ namespace Moses
for (size_t i = 0; mdyn.size() == i && i < myphrase.size(); ++i)
mdyn.extend(myphrase[i]);
// let's assume a uniform bias over the foreground corpus
if (mdyn.size() == myphrase.size()) dyn->prep(ttask, mdyn);
if (mdyn.size() == myphrase.size()) dyn->prep(ttask, mdyn, m_track_coord);
}
return mdyn.size() == myphrase.size();
}
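For context on the coord= option added above: for each named space it expects a plain-text (optionally gzipped) file with one whitespace-separated coordinate vector per sentence of the fixed training bitext, in corpus order; the sentence ids tracked during sampling are then used to attach those vectors to sampled target phrases, where PhraseDistanceFeature can read them. A hypothetical single-space setup (names and values are illustrative):

PhraseDictionaryBitextSampling ... coord=domain:train.domain-coords.gz

train.domain-coords.gz, one line per training sentence:
0.0 1.0
0.5 2.0
0.1 1.2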

View File

@ -119,6 +119,12 @@ namespace Moses
std::vector<SPTR<pscorer > > m_active_ff_common;
// activated feature functions (dyn)
bool m_track_coord = false; // track coordinates? Track sids when sampling
// from bitext, append coords to target phrases
// Space < Sid < sptr sentence coords > >
std::vector<std::vector<SPTR<std::vector<float> > > > m_sid_coord_list;
std::vector<size_t> m_coord_spaces;
void
parse_factor_spec(std::vector<FactorType>& flist, std::string const key);

View File

@ -28,8 +28,8 @@ namespace sapt {
void
operator()(Bitext<Token> const& bt,
PhrasePair<Token>& pp,
std::vector<float> * dest = NULL) const
PhrasePair<Token>& pp,
std::vector<float> * dest = NULL) const
{
if (!dest) dest = &pp.fvals;
(*dest)[this->m_index] = log(std::max(m_floor,pp.cum_bias));

View File

@ -48,8 +48,8 @@ namespace sapt {
void
operator()(Bitext<Token> const& bt,
PhrasePair<Token>& pp,
std::vector<float> * dest = NULL) const
PhrasePair<Token>& pp,
std::vector<float> * dest = NULL) const
{
if (!dest) dest = &pp.fvals;
float p = float(bt.T1->numTokens());

View File

@ -36,8 +36,8 @@ namespace sapt
void
operator()(Bitext<Token> const& bt,
PhrasePair<Token>& pp,
std::vector<float> * dest = NULL) const
PhrasePair<Token>& pp,
std::vector<float> * dest = NULL) const
{
if (!dest) dest = &pp.fvals;
// uint32_t sid1=0,sid2=0,off1=0,off2=0,len1=0,len2=0;

View File

@ -37,8 +37,8 @@ namespace sapt {
void
operator()(Bitext<Token> const& bt,
PhrasePair<Token>& pp,
std::vector<float> * dest = NULL) const
PhrasePair<Token>& pp,
std::vector<float> * dest = NULL) const
{
if (!dest) dest = &pp.fvals;
assert(pp.raw1);

View File

@ -38,8 +38,8 @@ namespace sapt
void
operator()(Bitext<Token> const& bt,
PhrasePair<Token>& pp,
std::vector<float> * dest = NULL) const
PhrasePair<Token>& pp,
std::vector<float> * dest = NULL) const
{
if (!dest) dest = &pp.fvals;
// we use the denominator specification to scale the raw counts on the

View File

@ -38,8 +38,9 @@ namespace sapt
}
void
operator()(Bitext<Token> const& bt, PhrasePair<Token> & pp,
std::vector<float> * dest = NULL) const
operator()(Bitext<Token> const& bt,
PhrasePair<Token>& pp,
std::vector<float> * dest = NULL) const
{
if (!dest) dest = &pp.fvals;
if (pp.joint > pp.good1)

View File

@ -22,8 +22,8 @@ namespace sapt
void
operator()(Bitext<Token> const& bt,
PhrasePair<Token>& pp,
std::vector<float> * dest = NULL) const
PhrasePair<Token>& pp,
std::vector<float> * dest = NULL) const
{
if (!dest) dest = &pp.fvals;
(*dest)[this->m_index] = 1;

View File

@ -28,8 +28,8 @@ namespace sapt {
void
operator()(Bitext<Token> const& bt,
PhrasePair<Token>& pp,
std::vector<float> * dest = NULL) const
PhrasePair<Token>& pp,
std::vector<float> * dest = NULL) const
{
if (!dest) dest = &pp.fvals;
size_t i = this->m_index;

View File

@ -26,8 +26,8 @@ namespace sapt {
void
operator()(Bitext<Token> const& bt,
PhrasePair<Token>& pp,
std::vector<float> * dest = NULL) const
PhrasePair<Token>& pp,
std::vector<float> * dest = NULL) const
{
if (!dest) dest = &pp.fvals;
size_t i = this->m_index;

View File

@ -37,8 +37,8 @@ namespace sapt
void
operator()(Bitext<Token> const& bt,
PhrasePair<Token>& pp,
std::vector<float> * dest = NULL) const
PhrasePair<Token>& pp,
std::vector<float> * dest = NULL) const
{
if (!dest) dest = &pp.fvals;
// uint32_t sid1=0,sid2=0,off1=0,off2=0,len1=0,len2=0;

View File

@ -22,8 +22,8 @@ namespace sapt
void
operator()(Bitext<Token> const& bt,
PhrasePair<Token>& pp,
std::vector<float> * dest = NULL) const
PhrasePair<Token>& pp,
std::vector<float> * dest = NULL) const
{
if (!dest) dest = &pp.fvals;
(*dest)[this->m_index] = pp.len2;

View File

@ -163,7 +163,8 @@ ProcessAndStripXMLTags(AllOptions const& opts, string &line,
vector<XmlOption const*> &res,
ReorderingConstraint &reorderingConstraint,
vector< size_t > &walls,
std::vector< std::pair<size_t, std::string> > &placeholders)
std::vector< std::pair<size_t, std::string> > &placeholders,
InputType &input)
{
//parse XML markup in translation line
@ -401,6 +402,28 @@ ProcessAndStripXMLTags(AllOptions const& opts, string &line,
StaticData::InstanceNonConst().SetAllWeights(allWeights);
}
// Coord: coordinates of the input sentence in a user-defined space
// <coord space="NAME" coord="X Y Z ..." />
// where NAME is the name of the space and X Y Z ... are floats. See
// PhraseDistanceFeature for an example of using this information for
// feature scoring.
else if (tagName == "coord") {
// Parse tag
string space = ParseXmlTagAttribute(tagContent, "space");
vector<string> tok = Tokenize(ParseXmlTagAttribute(tagContent, "coord"));
size_t id = StaticData::Instance().GetCoordSpace(space);
if (!id) {
TRACE_ERR("ERROR: no models use space " << space << ", will be ignored" << endl);
} else {
// Init if needed
if (!input.m_coordMap) {
input.m_coordMap.reset(new map<size_t const, vector<float> >);
}
vector<float>& coord = (*input.m_coordMap)[id];
Scan<float>(coord, tok);
}
}
// default: opening tag that specifies translation options
else {
if (startPos > endPos) {

View File

@ -34,7 +34,8 @@ bool ProcessAndStripXMLTags(AllOptions const& opts,
std::string &line, std::vector<XmlOption const*> &res,
ReorderingConstraint &reorderingConstraint,
std::vector< size_t > &walls,
std::vector< std::pair<size_t, std::string> > &placeholders);
std::vector< std::pair<size_t, std::string> > &placeholders,
InputType &input);
}