mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2025-01-01 16:33:16 +03:00
Merge ../mosesdecoder into perf_moses2
This commit is contained in:
commit
ae654adf83
@ -1319,6 +1319,16 @@
|
||||
<name>FF/PhraseBoundaryFeature.h</name>
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/PhraseBoundaryFeature.h</locationURI>
|
||||
</link>
|
||||
<link>
|
||||
<name>FF/PhraseDistanceFeature.cpp</name>
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/PhraseDistanceFeature.cpp</locationURI>
|
||||
</link>
|
||||
<link>
|
||||
<name>FF/PhraseDistanceFeature.h</name>
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/PhraseDistanceFeature.h</locationURI>
|
||||
</link>
|
||||
<link>
|
||||
<name>FF/PhraseLengthFeature.cpp</name>
|
||||
@ -3654,7 +3664,7 @@
|
||||
<name>TranslationModel/UG/sapt_pscore_coherence.h</name>
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/sapt_pscore_coherence.h</locationURI>
|
||||
</link>
|
||||
</link>
|
||||
<link>
|
||||
<name>TranslationModel/UG/sapt_pscore_lex1.h</name>
|
||||
<type>1</type>
|
||||
@ -3699,7 +3709,7 @@
|
||||
<name>TranslationModel/UG/sapt_pscore_wordcount.h</name>
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/sapt_pscore_wordcount.h</locationURI>
|
||||
</link>
|
||||
</link>
|
||||
<link>
|
||||
<name>TranslationModel/UG/sim-pe.cc</name>
|
||||
<type>1</type>
|
||||
|
@ -30,6 +30,7 @@
|
||||
#include "moses/FF/TargetBigramFeature.h"
|
||||
#include "moses/FF/TargetNgramFeature.h"
|
||||
#include "moses/FF/PhraseBoundaryFeature.h"
|
||||
#include "moses/FF/PhraseDistanceFeature.h"
|
||||
#include "moses/FF/PhrasePairFeature.h"
|
||||
#include "moses/FF/RulePairUnlexicalizedSource.h"
|
||||
#include "moses/FF/PhraseLengthFeature.h"
|
||||
@ -252,6 +253,7 @@ FeatureRegistry::FeatureRegistry()
|
||||
MOSES_FNAME(SourceWordDeletionFeature);
|
||||
MOSES_FNAME(TargetWordInsertionFeature);
|
||||
MOSES_FNAME(PhraseBoundaryFeature);
|
||||
MOSES_FNAME(PhraseDistanceFeature);
|
||||
MOSES_FNAME(PhraseLengthFeature);
|
||||
MOSES_FNAME(WordTranslationFeature);
|
||||
MOSES_FNAME(TargetBigramFeature);
|
||||
|
123
moses/FF/PhraseDistanceFeature.cpp
Normal file
123
moses/FF/PhraseDistanceFeature.cpp
Normal file
@ -0,0 +1,123 @@
|
||||
#include "PhraseDistanceFeature.h"

#include <algorithm>
#include <cmath>
#include <map>
#include <vector>
#include <boost/foreach.hpp>
#include "moses/InputType.h"
#include "moses/ScoreComponentCollection.h"
#include "moses/StaticData.h"
#include "util/exception.hh"
|
||||
|
||||
using namespace std;
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
// Construct the feature from its configuration line.
//
// The base class is asked for 2 score components. Members start out as:
// Euclidean distance, an empty space name, and space ID 0. They are then
// overridden by whatever ReadParameters() hands to SetParameter().
//
// The initializers are listed in the order the members are declared in the
// class (m_measure, m_space, m_spaceID); members are always initialized in
// declaration order, so listing them differently only produces -Wreorder
// warnings and misleads the reader.
PhraseDistanceFeature::PhraseDistanceFeature(const string &line)
  : StatelessFeatureFunction(2, line)
  , m_measure(EuclideanDistance)
  , m_space("")
  , m_spaceID(0)
{
  ReadParameters();
}
|
||||
|
||||
void PhraseDistanceFeature::EvaluateWithSourceContext(const InputType &input
|
||||
, const InputPath &inputPath
|
||||
, const TargetPhrase &targetPhrase
|
||||
, const StackVec *stackVec
|
||||
, ScoreComponentCollection &scoreBreakdown
|
||||
, ScoreComponentCollection *estimatedScores) const
|
||||
{
|
||||
vector<float> scores(m_numScoreComponents, 0);
|
||||
bool broken = false;
|
||||
// Input coord
|
||||
map<size_t const, vector<float> >::const_iterator ii;
|
||||
if (input.m_coordMap) {
|
||||
ii = input.m_coordMap->find(m_spaceID);
|
||||
} else {
|
||||
TRACE_ERR("No coordinates for space " << m_space << " on input (specify with coord XML tag)" << endl);
|
||||
TRACE_ERR("Scores for " << m_description << " will be incorrect and probably all zeros" << endl);
|
||||
broken = true;
|
||||
}
|
||||
if (ii == input.m_coordMap->end()) {
|
||||
TRACE_ERR("No coordinates for space " << m_space << " on input (specify with coord XML tag)" << endl);
|
||||
TRACE_ERR("Scores for " << m_description << " will be incorrect and probably all zeros" << endl);
|
||||
broken = true;
|
||||
}
|
||||
// Target phrase coord
|
||||
vector<SPTR<vector<float> > > const* tpp = targetPhrase.GetCoordList(m_spaceID);
|
||||
if (tpp == NULL) {
|
||||
TRACE_ERR("No coordinates for space " << m_space << " on target phrase (PhraseDictionary implementation needs to set)" << endl);
|
||||
TRACE_ERR("Scores for " << m_description << " will be incorrect and probably all zeros" << endl);
|
||||
broken = true;
|
||||
}
|
||||
// Compute scores
|
||||
if (!broken) {
|
||||
vector<float> const& inputCoord = ii->second;
|
||||
vector<SPTR<vector<float> > > const& tpCoord = *tpp;
|
||||
// Centroid of target phrase instances (from phrase extraction)
|
||||
vector<float> centroid = vector<float>(inputCoord.size(), 0);
|
||||
BOOST_FOREACH(SPTR<vector<float> > const coord, tpCoord) {
|
||||
for (size_t i = 0; i < inputCoord.size(); ++i) {
|
||||
centroid[i] += (*coord)[i];
|
||||
}
|
||||
}
|
||||
for (size_t i = 0; i < inputCoord.size(); ++i) {
|
||||
centroid[i] /= tpCoord.size();
|
||||
}
|
||||
// Average distance from the target phrase instances to (1) the input and
|
||||
// (2) the target phrase centroid
|
||||
float inputDistance = 0;
|
||||
float centroidDistance = 0;
|
||||
if (m_measure == EuclideanDistance) {
|
||||
BOOST_FOREACH(SPTR<vector<float> > const coord, tpCoord) {
|
||||
float pointInputDistance = 0;
|
||||
float pointCentroidDistance = 0;
|
||||
for (size_t i = 0; i < inputCoord.size(); ++i) {
|
||||
pointInputDistance += pow(inputCoord[i] - (*coord)[i], 2);
|
||||
pointCentroidDistance += pow(centroid[i] - (*coord)[i], 2);
|
||||
}
|
||||
inputDistance += sqrt(pointInputDistance);
|
||||
centroidDistance += sqrt(pointCentroidDistance);
|
||||
}
|
||||
} else if (m_measure == TotalVariationDistance) {
|
||||
BOOST_FOREACH(SPTR<vector<float> > const coord, tpCoord) {
|
||||
float pointInputDistance = 0;
|
||||
float pointCentroidDistance = 0;
|
||||
for (size_t i = 0; i < inputCoord.size(); ++i) {
|
||||
pointInputDistance += abs(inputCoord[i] - (*coord)[i]);
|
||||
pointCentroidDistance += abs(centroid[i] - (*coord)[i]);
|
||||
}
|
||||
inputDistance += pointInputDistance / 2;
|
||||
centroidDistance += pointCentroidDistance / 2;
|
||||
}
|
||||
}
|
||||
inputDistance /= tpCoord.size();
|
||||
centroidDistance /= tpCoord.size();
|
||||
// Log transform scores, max with float epsilon to avoid domain error
|
||||
scores[0] = log(max(inputDistance, Moses::FLOAT_EPSILON));
|
||||
scores[1] = log(max(centroidDistance, Moses::FLOAT_EPSILON));
|
||||
}
|
||||
// Set scores
|
||||
scoreBreakdown.Assign(this, scores);
|
||||
return;
|
||||
}
|
||||
|
||||
void PhraseDistanceFeature::SetParameter(const string& key, const string& value)
|
||||
{
|
||||
if (key == "space") {
|
||||
m_space = value;
|
||||
m_spaceID = StaticData::InstanceNonConst().MapCoordSpace(m_space);
|
||||
} else if (key == "measure") {
|
||||
if (value == "euc") {
|
||||
m_measure = EuclideanDistance;
|
||||
} else if (value == "var") {
|
||||
m_measure = TotalVariationDistance;
|
||||
} else {
|
||||
UTIL_THROW2("Unknown measure " << value << ", choices: euc var");
|
||||
}
|
||||
} else {
|
||||
StatelessFeatureFunction::SetParameter(key, value);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace
|
56
moses/FF/PhraseDistanceFeature.h
Normal file
56
moses/FF/PhraseDistanceFeature.h
Normal file
@ -0,0 +1,56 @@
|
||||
#pragma once
|
||||
|
||||
#include "StatelessFeatureFunction.h"
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
|
||||
class PhraseDistanceFeature : public StatelessFeatureFunction
|
||||
{
|
||||
enum Measure {
|
||||
EuclideanDistance,
|
||||
TotalVariationDistance,
|
||||
};
|
||||
|
||||
public:
|
||||
PhraseDistanceFeature(const std::string &line);
|
||||
|
||||
bool IsUseable(const FactorMask &mask) const {
|
||||
return true;
|
||||
}
|
||||
|
||||
virtual void EvaluateInIsolation(const Phrase &source
|
||||
, const TargetPhrase &targetPhrase
|
||||
, ScoreComponentCollection &scoreBreakdown
|
||||
, ScoreComponentCollection &estimatedScores) const {
|
||||
}
|
||||
|
||||
void EvaluateWhenApplied(const Hypothesis& hypo,
|
||||
ScoreComponentCollection* accumulator) const {
|
||||
}
|
||||
void EvaluateWhenApplied(const ChartHypothesis &hypo,
|
||||
ScoreComponentCollection* accumulator) const {
|
||||
}
|
||||
void EvaluateWhenApplied(const Syntax::SHyperedge &hyperedge,
|
||||
ScoreComponentCollection* accumulator) const {
|
||||
}
|
||||
|
||||
void EvaluateWithSourceContext(const InputType &input
|
||||
, const InputPath &inputPath
|
||||
, const TargetPhrase &targetPhrase
|
||||
, const StackVec *stackVec
|
||||
, ScoreComponentCollection &scoreBreakdown
|
||||
, ScoreComponentCollection *estimatedScores = NULL) const;
|
||||
|
||||
void EvaluateTranslationOptionListWithSourceContext(const InputType &input
|
||||
, const TranslationOptionList &translationOptionList) const {
|
||||
}
|
||||
void SetParameter(const std::string& key, const std::string& value);
|
||||
|
||||
protected:
|
||||
Measure m_measure;
|
||||
std::string m_space;
|
||||
size_t m_spaceID;
|
||||
};
|
||||
|
||||
} //namespace
|
@ -68,6 +68,9 @@ public:
|
||||
size_t m_frontSpanCoveredLength;
|
||||
// how many words from the beginning are covered
|
||||
|
||||
// Coordinates in user-defined spaces (see "coord" XML tag)
|
||||
SPTR<std::map<size_t const, std::vector<float> > > m_coordMap;
|
||||
|
||||
InputType(AllOptions::ptr const& opts, long translationId = 0);
|
||||
virtual ~InputType();
|
||||
|
||||
|
@ -154,7 +154,8 @@ aux_interpret_xml(std::string& line, std::vector<size_t> & xmlWalls,
|
||||
bool OK = ProcessAndStripXMLTags(*m_options, line,
|
||||
m_xmlOptions,
|
||||
m_reorderingConstraint,
|
||||
xmlWalls, placeholders);
|
||||
xmlWalls, placeholders,
|
||||
*this);
|
||||
if (!OK) {
|
||||
TRACE_ERR("Unable to parse XML in line: " << line);
|
||||
}
|
||||
|
@ -69,6 +69,7 @@ StaticData::StaticData()
|
||||
, m_requireSortingAfterSourceContext(false)
|
||||
, m_currentWeightSetting("default")
|
||||
, m_treeStructure(NULL)
|
||||
, m_coordSpaceNextID(1)
|
||||
{
|
||||
Phrase::InitializeMemPool();
|
||||
}
|
||||
@ -936,4 +937,25 @@ void StaticData::ResetWeights(const std::string &denseWeights, const std::string
|
||||
}
|
||||
}
|
||||
|
||||
// Look up the ID registered for a coordinate space name.
//
// Returns the ID stored in m_coordSpaceMap for `space`, or 0 when the name
// has not been registered.
//
// The iterator type is spelled with the same key type as the member
// (std::string const): iterators of std::map<std::string, size_t> belong to a
// different instantiation and are not guaranteed to be convertible.
size_t StaticData::GetCoordSpace(string space) const
{
  std::map<std::string const, size_t>::const_iterator m = m_coordSpaceMap.find(space);
  if (m == m_coordSpaceMap.end()) {
    return 0;
  }
  return m->second;
}
|
||||
|
||||
// Return the ID for a coordinate space name, registering the name first if it
// is not known yet.
//
// A new name receives the current value of m_coordSpaceNextID, which is then
// advanced by one. A name that is already registered keeps its ID and the
// counter is left alone.
//
// A single insert() does both the lookup and the registration; the types are
// spelled with the same key type as the member (std::string const) so they
// match the container exactly.
size_t StaticData::MapCoordSpace(string space)
{
  typedef std::map<std::string const, size_t> CoordSpaceMap;
  std::pair<CoordSpaceMap::iterator, bool> const slot =
    m_coordSpaceMap.insert(CoordSpaceMap::value_type(space, m_coordSpaceNextID));
  if (slot.second) {
    // Name was new: the ID just handed out is now taken
    m_coordSpaceNextID += 1;
  }
  return slot.first->second;
}
|
||||
|
||||
} // namespace
|
||||
|
@ -60,7 +60,7 @@ class PhraseDictionaryDynamicCacheBased;
|
||||
typedef std::pair<std::string, float> UnknownLHSEntry;
|
||||
typedef std::vector<UnknownLHSEntry> UnknownLHSList;
|
||||
|
||||
/** Contains global variables and contants.
|
||||
/** Contains global variables and constants.
|
||||
* Only 1 object of this class should be instantiated.
|
||||
* A const object of this class is accessible by any function during decoding by calling StaticData::Instance();
|
||||
*/
|
||||
@ -152,6 +152,12 @@ protected:
|
||||
bool ini_performance_options();
|
||||
|
||||
void initialize_features();
|
||||
|
||||
// Coordinate space name map for matching spaces across XML input ("coord"
|
||||
// tag) and feature functions that assign or use coordinates on target phrases
|
||||
std::map< std::string const, size_t > m_coordSpaceMap;
|
||||
size_t m_coordSpaceNextID;
|
||||
|
||||
public:
|
||||
|
||||
//! destructor
|
||||
@ -394,6 +400,9 @@ public:
|
||||
return m_requireSortingAfterSourceContext;
|
||||
}
|
||||
|
||||
// Coordinate spaces
|
||||
size_t GetCoordSpace(std::string space) const;
|
||||
size_t MapCoordSpace(std::string space);
|
||||
};
|
||||
|
||||
}
|
||||
|
@ -125,6 +125,7 @@ TargetPhrase::TargetPhrase(const Phrase &phrase, const PhraseDictionary *pt)
|
||||
|
||||
TargetPhrase::TargetPhrase(const TargetPhrase ©)
|
||||
: Phrase(copy)
|
||||
, m_cached_coord(copy.m_cached_coord)
|
||||
, m_cached_scores(copy.m_cached_scores)
|
||||
, m_scope(copy.m_scope)
|
||||
, m_futureScore(copy.m_futureScore)
|
||||
@ -333,6 +334,31 @@ SetExtraScores(FeatureFunction const* ff,
|
||||
m_cached_scores[ff] = s;
|
||||
}
|
||||
|
||||
// Return the list of coordinates stored on this target phrase for the given
// space, or NULL when no coordinate cache exists or it has no entry for
// spaceID.
vector<SPTR<vector<float> > > const*
TargetPhrase::
GetCoordList(size_t const spaceID) const
{
  if (m_cached_coord) {
    CoordCache_t::const_iterator hit = m_cached_coord->find(spaceID);
    if (hit != m_cached_coord->end()) {
      return &hit->second;
    }
  }
  return NULL;
}
|
||||
|
||||
void
|
||||
TargetPhrase::
|
||||
PushCoord(size_t const spaceID,
|
||||
SPTR<vector<float> > const coord)
|
||||
{
|
||||
if (!m_cached_coord) {
|
||||
m_cached_coord.reset(new CoordCache_t);
|
||||
}
|
||||
vector<SPTR<vector<float> > >& coordList = (*m_cached_coord)[spaceID];
|
||||
coordList.push_back(coord);
|
||||
}
|
||||
|
||||
void TargetPhrase::SetProperties(const StringPiece &str)
|
||||
{
|
||||
|
@ -56,9 +56,13 @@ public:
|
||||
Scores const* GetExtraScores(FeatureFunction const* ff) const;
|
||||
void SetExtraScores(FeatureFunction const* ff,boost::shared_ptr<Scores> const& scores);
|
||||
|
||||
typedef std::map<size_t const, std::vector<SPTR<std::vector<float> > > > CoordCache_t;
|
||||
std::vector<SPTR<std::vector<float> > > const* GetCoordList(size_t const spaceID) const;
|
||||
void PushCoord(size_t const spaceID, SPTR<std::vector<float> > const coord);
|
||||
|
||||
private:
|
||||
ScoreCache_t m_cached_scores;
|
||||
SPTR<CoordCache_t> m_cached_coord;
|
||||
WPTR<ContextScope> m_scope;
|
||||
|
||||
private:
|
||||
|
@ -130,7 +130,6 @@ namespace sapt
|
||||
mutable pplist_cache_t m_pplist_cache1, m_pplist_cache2;
|
||||
// caches for unbiased sampling; biased sampling uses the caches that
|
||||
// are stored locally on the translation task
|
||||
|
||||
public:
|
||||
SPTR<Ttrack<char> > Tx; // word alignments
|
||||
SPTR<Ttrack<Token> > T1; // token track
|
||||
@ -164,7 +163,8 @@ namespace sapt
|
||||
|
||||
#ifndef NO_MOSES
|
||||
SPTR<pstats>
|
||||
prep2(ttasksptr const& ttask, iter const& phrase, int max_sample = -1) const;
|
||||
prep2(ttasksptr const& ttask, iter const& phrase, bool const track_sids,
|
||||
int max_sample = -1) const;
|
||||
#endif
|
||||
|
||||
protected:
|
||||
@ -189,7 +189,7 @@ namespace sapt
|
||||
SPTR<pstats>
|
||||
lookup(ttasksptr const& ttask, iter const& phrase, int max_sample = -1) const;
|
||||
|
||||
void prep(ttasksptr const& ttask, iter const& phrase) const;
|
||||
void prep(ttasksptr const& ttask, iter const& phrase, bool const track_sids) const;
|
||||
#endif
|
||||
|
||||
void setDefaultSampleSize(size_t const max_samples);
|
||||
|
@ -33,7 +33,8 @@ public:
|
||||
SPTR<pstats>
|
||||
add_job(Bitext<Token> const* const theBitext,
|
||||
typename TSA<Token>::tree_iterator const& phrase,
|
||||
size_t const max_samples, SPTR<SamplingBias const> const& bias);
|
||||
size_t const max_samples, SPTR<SamplingBias const> const& bias,
|
||||
bool const track_sids);
|
||||
// add_job(Bitext<Token> const* const theBitext,
|
||||
// typename TSA<Token>::tree_iterator const& phrase,
|
||||
// size_t const max_samples, SamplingBias const* const bias);
|
||||
@ -93,13 +94,14 @@ SPTR<pstats> Bitext<Token>
|
||||
::agenda
|
||||
::add_job(Bitext<Token> const* const theBitext,
|
||||
typename TSA<Token>::tree_iterator const& phrase,
|
||||
size_t const max_samples, SPTR<SamplingBias const> const& bias)
|
||||
size_t const max_samples, SPTR<SamplingBias const> const& bias,
|
||||
bool const track_sids)
|
||||
{
|
||||
boost::unique_lock<boost::mutex> lk(this->lock);
|
||||
static boost::posix_time::time_duration nodelay(0,0,0,0);
|
||||
bool fwd = phrase.root == bt.I1.get();
|
||||
SPTR<job> j(new job(theBitext, phrase, fwd ? bt.I1 : bt.I2,
|
||||
max_samples, fwd, bias));
|
||||
max_samples, fwd, bias, track_sids));
|
||||
j->stats->register_worker();
|
||||
|
||||
joblist.push_back(j);
|
||||
|
@ -35,6 +35,8 @@ public:
|
||||
SPTR<pstats> stats; // stores statistics collected during sampling
|
||||
SPTR<SamplingBias const> const m_bias; // sentence-level bias for sampling
|
||||
float bias_total;
|
||||
bool m_track_sids; // track sentence ids in sample?
|
||||
|
||||
bool nextSample(uint64_t & sid, uint64_t & offset); // select next occurrence
|
||||
|
||||
int
|
||||
@ -46,7 +48,7 @@ public:
|
||||
job(Bitext<Token> const* const theBitext,
|
||||
typename TSA<Token>::tree_iterator const& m,
|
||||
SPTR<TSA<Token> > const& r, size_t maxsmpl, bool isfwd,
|
||||
SPTR<SamplingBias const> const& bias);
|
||||
SPTR<SamplingBias const> const& bias, bool const track_sids);
|
||||
~job();
|
||||
};
|
||||
|
||||
@ -66,7 +68,8 @@ Bitext<Token>::agenda::job
|
||||
::job(Bitext<Token> const* const theBitext,
|
||||
typename TSA<Token>::tree_iterator const& m,
|
||||
SPTR<TSA<Token> > const& r, size_t maxsmpl,
|
||||
bool isfwd, SPTR<SamplingBias const> const& bias)
|
||||
bool isfwd, SPTR<SamplingBias const> const& bias,
|
||||
bool const track_sids)
|
||||
: m_bitext(theBitext)
|
||||
, rnd(0)
|
||||
, rnddenom(rnd.max() + 1.)
|
||||
@ -80,8 +83,9 @@ Bitext<Token>::agenda::job
|
||||
, len(m.size())
|
||||
, fwd(isfwd)
|
||||
, m_bias(bias)
|
||||
, m_track_sids(track_sids)
|
||||
{
|
||||
stats.reset(new pstats());
|
||||
stats.reset(new pstats(m_track_sids));
|
||||
stats->raw_cnt = m.approxOccurrenceCount();
|
||||
bias_total = 0;
|
||||
|
||||
|
@ -90,7 +90,7 @@ Bitext<Token>::agenda
|
||||
size_t raw2 = b->approxOccurrenceCount();
|
||||
float bwgt = j->m_bias ? (*j->m_bias)[sid] : 1;
|
||||
j->stats->add(tpid, sample_weight, bwgt, aln, raw2,
|
||||
po_fwd, po_bwd, docid);
|
||||
po_fwd, po_bwd, docid, sid);
|
||||
bool ok = (i == e2) || b->extend(o[i].id());
|
||||
UTIL_THROW_IF2(!ok, "Could not extend target phrase.");
|
||||
}
|
||||
|
@ -29,6 +29,7 @@ namespace sapt
|
||||
my_wcnt = other.wcnt();
|
||||
my_bcnt = other.bcnt();
|
||||
my_aln = other.aln();
|
||||
sids = other.sids;
|
||||
indoc = other.indoc;
|
||||
for (int i = 0; i <= LRModel::NONE; i++)
|
||||
{
|
||||
@ -56,7 +57,8 @@ namespace sapt
|
||||
size_t
|
||||
jstats::
|
||||
add(float w, float b, std::vector<unsigned char> const& a, uint32_t const cnt2,
|
||||
uint32_t fwd_orient, uint32_t bwd_orient, int const docid)
|
||||
uint32_t fwd_orient, uint32_t bwd_orient, int const docid,
|
||||
uint32_t const sid, bool const track_sid)
|
||||
{
|
||||
boost::lock_guard<boost::mutex> lk(this->lock);
|
||||
my_cnt2 = cnt2;
|
||||
@ -76,6 +78,13 @@ namespace sapt
|
||||
}
|
||||
++ofwd[fwd_orient];
|
||||
++obwd[bwd_orient];
|
||||
// Record sentence id if requested
|
||||
if (track_sid)
|
||||
{
|
||||
if (!sids)
|
||||
sids.reset(new std::vector<uint32_t>);
|
||||
sids->push_back(sid);
|
||||
}
|
||||
if (docid >= 0)
|
||||
{
|
||||
// while (int(indoc.size()) <= docid) indoc.push_back(0);
|
||||
|
@ -28,6 +28,7 @@ namespace sapt
|
||||
uint32_t obwd[LRModel::NONE+1]; // backward distortion type counts
|
||||
|
||||
public:
|
||||
SPTR<std::vector<uint32_t> > sids; // list of sentence ids in this sample
|
||||
std::map<uint32_t,uint32_t> indoc;
|
||||
// std::vector<uint32_t> indoc; // counts origin of samples (for biased sampling)
|
||||
jstats();
|
||||
@ -41,7 +42,8 @@ namespace sapt
|
||||
|
||||
size_t
|
||||
add(float w, float b, std::vector<unsigned char> const& a, uint32_t const cnt2,
|
||||
uint32_t fwd_orient, uint32_t bwd_orient, int const docid);
|
||||
uint32_t fwd_orient, uint32_t bwd_orient, int const docid, uint32_t const sid,
|
||||
bool const track_sid);
|
||||
|
||||
void invalidate();
|
||||
void validate();
|
||||
|
@ -30,9 +30,9 @@ lookup(ttasksptr const& ttask, iter const& phrase, int max_sample) const
|
||||
template<typename Token>
|
||||
void
|
||||
Bitext<Token>::
|
||||
prep(ttasksptr const& ttask, iter const& phrase) const
|
||||
prep(ttasksptr const& ttask, iter const& phrase, bool const track_sids) const
|
||||
{
|
||||
prep2(ttask, phrase, m_default_sample_size);
|
||||
prep2(ttask, phrase, track_sids, m_default_sample_size);
|
||||
}
|
||||
|
||||
|
||||
@ -44,7 +44,8 @@ template<typename Token>
|
||||
SPTR<pstats>
|
||||
Bitext<Token>
|
||||
::prep2
|
||||
( ttasksptr const& ttask, iter const& phrase, int max_sample) const
|
||||
( ttasksptr const& ttask, iter const& phrase, bool const track_sids,
|
||||
int max_sample) const
|
||||
{
|
||||
if (max_sample < 0) max_sample = m_default_sample_size;
|
||||
SPTR<SamplingBias> bias;
|
||||
@ -74,7 +75,7 @@ Bitext<Token>
|
||||
if (m_num_workers > 1)
|
||||
ag->add_workers(m_num_workers);
|
||||
}
|
||||
ret = ag->add_job(this, phrase, max_sample, bias);
|
||||
ret = ag->add_job(this, phrase, max_sample, bias, track_sids);
|
||||
if (cache) cache->set(phrase.getPid(),ret);
|
||||
UTIL_THROW_IF2(ret == NULL, "Couldn't schedule sampling job.");
|
||||
return ret;
|
||||
|
@ -10,7 +10,7 @@ namespace sapt
|
||||
#endif
|
||||
|
||||
pstats::
|
||||
pstats() : raw_cnt(0), sample_cnt(0), good(0), sum_pairs(0), in_progress(0)
|
||||
pstats(bool const track_sids) : raw_cnt(0), sample_cnt(0), good(0), sum_pairs(0), in_progress(0), track_sids(track_sids)
|
||||
{
|
||||
for (int i = 0; i <= LRModel::NONE; ++i)
|
||||
ofwd[i] = obwd[i] = 0;
|
||||
@ -69,11 +69,11 @@ namespace sapt
|
||||
std::vector<unsigned char> const& a,
|
||||
uint32_t const cnt2,
|
||||
uint32_t fwd_o,
|
||||
uint32_t bwd_o, int const docid)
|
||||
uint32_t bwd_o, int const docid, uint32_t const sid)
|
||||
{
|
||||
boost::lock_guard<boost::mutex> guard(this->lock);
|
||||
jstats& entry = this->trg[pid];
|
||||
size_t ret = entry.add(w, b, a, cnt2, fwd_o, bwd_o, docid);
|
||||
size_t ret = entry.add(w, b, a, cnt2, fwd_o, bwd_o, docid, sid, track_sids);
|
||||
if (this->good < entry.rcnt())
|
||||
{
|
||||
UTIL_THROW(util::Exception, "more joint counts than good counts:"
|
||||
|
@ -35,7 +35,8 @@ namespace sapt
|
||||
|
||||
indoc_map_t indoc;
|
||||
trg_map_t trg;
|
||||
pstats();
|
||||
bool track_sids;
|
||||
pstats(bool const track_sids);
|
||||
~pstats();
|
||||
void release();
|
||||
void register_worker();
|
||||
@ -49,7 +50,8 @@ namespace sapt
|
||||
uint32_t const cnt2, // raw target phrase count
|
||||
uint32_t fwd_o, // fwd. phrase orientation
|
||||
uint32_t bwd_o, // bwd. phrase orientation
|
||||
int const docid); // document where sample was found
|
||||
int const docid, // document where sample was found
|
||||
uint32_t const sid); // index of sentence where sample was found
|
||||
|
||||
void
|
||||
count_sample(int const docid, // document where sample was found
|
||||
|
@ -70,6 +70,7 @@ BitextSampler : public Moses::reference_counter
|
||||
size_t m_num_occurrences; // estimated number of phrase occurrences in corpus
|
||||
boost::taus88 m_rnd; // every job has its own pseudo random generator
|
||||
double m_bias_total;
|
||||
bool m_track_sids; // track sentence ids in stats?
|
||||
|
||||
size_t consider_sample(TokenPosition const& p);
|
||||
size_t perform_random_sampling();
|
||||
@ -86,7 +87,8 @@ public:
|
||||
SPTR<SamplingBias const> const& bias,
|
||||
size_t const min_samples,
|
||||
size_t const max_samples,
|
||||
sampling_method const method);
|
||||
sampling_method const method,
|
||||
bool const track_sids);
|
||||
~BitextSampler();
|
||||
SPTR<pstats> stats();
|
||||
bool done() const;
|
||||
@ -185,7 +187,7 @@ BitextSampler<Token>::
|
||||
BitextSampler(SPTR<Bitext<Token> const> const& bitext,
|
||||
typename bitext::iter const& phrase,
|
||||
SPTR<SamplingBias const> const& bias, size_t const min_samples, size_t const max_samples,
|
||||
sampling_method const method)
|
||||
sampling_method const method, bool const track_sids)
|
||||
: m_bitext(bitext)
|
||||
, m_plen(phrase.size())
|
||||
, m_fwd(phrase.root == bitext->I1.get())
|
||||
@ -201,8 +203,9 @@ BitextSampler(SPTR<Bitext<Token> const> const& bitext,
|
||||
, m_finished(false)
|
||||
, m_num_occurrences(phrase.ca())
|
||||
, m_rnd(0)
|
||||
, m_track_sids(track_sids)
|
||||
{
|
||||
m_stats.reset(new pstats);
|
||||
m_stats.reset(new pstats(m_track_sids));
|
||||
m_stats->raw_cnt = phrase.ca();
|
||||
m_stats->register_worker();
|
||||
}
|
||||
@ -332,7 +335,8 @@ consider_sample(TokenPosition const& p)
|
||||
size_t raw2 = b->approxOccurrenceCount();
|
||||
size_t evid = m_stats->add(tpid, sample_weight,
|
||||
m_bias ? (*m_bias)[p.sid] : 1,
|
||||
aln, raw2, rec.po_fwd, rec.po_bwd, docid);
|
||||
aln, raw2, rec.po_fwd, rec.po_bwd, docid,
|
||||
p.sid);
|
||||
max_evidence = std::max(max_evidence, evid);
|
||||
bool ok = (i == rec.e2) || b->extend(o[i].id());
|
||||
UTIL_THROW_IF2(!ok, "Could not extend target phrase.");
|
||||
|
@ -31,6 +31,8 @@ namespace sapt
|
||||
std::vector<unsigned char> aln;
|
||||
float score;
|
||||
bool inverse;
|
||||
SPTR<std::vector<uint32_t> > sids; // list of sampled sentence ids where
|
||||
// this phrase pair was found
|
||||
// std::vector<uint32_t> indoc;
|
||||
std::map<uint32_t,uint32_t> indoc;
|
||||
PhrasePair() { };
|
||||
@ -132,6 +134,7 @@ namespace sapt
|
||||
dbwd[i] = js.dcnt_bwd(po);
|
||||
}
|
||||
|
||||
sids = js.sids;
|
||||
indoc = js.indoc;
|
||||
return *this;
|
||||
}
|
||||
@ -182,6 +185,8 @@ namespace sapt
|
||||
sample2 += o.sample2;
|
||||
cum_bias += o.cum_bias;
|
||||
// todo: add distortion counts
|
||||
if (sids && o.sids)
|
||||
sids->insert(sids->end(), o.sids->begin(), o.sids->end());
|
||||
return *this;
|
||||
}
|
||||
|
||||
@ -199,6 +204,7 @@ namespace sapt
|
||||
, aln(o.aln)
|
||||
, score(o.score)
|
||||
, inverse(o.inverse)
|
||||
, sids(o.sids)
|
||||
, indoc(o.indoc)
|
||||
{
|
||||
for (int i = 0; i <= LRModel::NONE; ++i)
|
||||
|
@ -275,6 +275,40 @@ namespace Moses
|
||||
m = param.find("name");
|
||||
if (m != param.end()) m_name = m->second;
|
||||
|
||||
// Optional coordinates for training corpus
|
||||
// Takes form coord=name1:file1.gz,name2:file2.gz,...
|
||||
// Names should match with XML input (coord tag)
|
||||
param.insert(pair<string,string>("coord","0"));
|
||||
if(param["coord"] != "0")
|
||||
{
|
||||
m_track_coord = true;
|
||||
vector<string> coord_instances = Tokenize(param["coord"], ",");
|
||||
BOOST_FOREACH(std::string instance, coord_instances)
|
||||
{
|
||||
vector<string> toks = Moses::Tokenize(instance, ":");
|
||||
string space = toks[0];
|
||||
string file = toks[1];
|
||||
// Register that this model uses the given space
|
||||
m_coord_spaces.push_back(StaticData::InstanceNonConst().MapCoordSpace(space));
|
||||
// Load sid coordinates from file
|
||||
m_sid_coord_list.push_back(vector<SPTR<vector<float> > >());
|
||||
vector<SPTR<vector<float> > >& sid_coord = m_sid_coord_list[m_sid_coord_list.size() - 1];
|
||||
//TODO: support extra data for btdyn, here? extra?
|
||||
sid_coord.reserve(btfix->T1->size());
|
||||
string line;
|
||||
cerr << "Loading coordinate lines for space \"" << space << "\" from " << file << endl;
|
||||
iostreams::filtering_istream in;
|
||||
ugdiss::open_input_stream(file, in);
|
||||
while(getline(in, line))
|
||||
{
|
||||
SPTR<vector<float> > coord(new vector<float>);
|
||||
Scan<float>(*coord, Tokenize(line));
|
||||
sid_coord.push_back(coord);
|
||||
}
|
||||
cerr << "Loaded " << sid_coord.size() << " lines" << endl;
|
||||
}
|
||||
}
|
||||
|
||||
// check for unknown parameters
|
||||
vector<string> known_parameters; known_parameters.reserve(50);
|
||||
known_parameters.push_back("L1");
|
||||
@ -290,6 +324,7 @@ namespace Moses
|
||||
known_parameters.push_back("cache");
|
||||
known_parameters.push_back("coh");
|
||||
known_parameters.push_back("config");
|
||||
known_parameters.push_back("coord");
|
||||
known_parameters.push_back("cumb");
|
||||
known_parameters.push_back("extra");
|
||||
known_parameters.push_back("feature-sets");
|
||||
@ -616,6 +651,29 @@ namespace Moses
|
||||
}
|
||||
#endif
|
||||
|
||||
// Track coordinates if requested
|
||||
if (m_track_coord)
|
||||
{
|
||||
BOOST_FOREACH(uint32_t const sid, *pool.sids)
|
||||
{
|
||||
for(size_t i = 0; i < m_coord_spaces.size(); ++i)
|
||||
{
|
||||
tp->PushCoord(m_coord_spaces[i], m_sid_coord_list[i][sid]);
|
||||
}
|
||||
}
|
||||
/*
|
||||
cerr << btfix->toString(pool.p1, 0) << " ::: " << btfix->toString(pool.p2, 1);
|
||||
BOOST_FOREACH(size_t id, m_coord_spaces)
|
||||
{
|
||||
cerr << " [" << id << "]";
|
||||
vector<vector<float> const*> const* coordList = tp->GetCoordList(id);
|
||||
BOOST_FOREACH(vector<float> const* coord, *coordList)
|
||||
cerr << " : " << Join(" ", *coord);
|
||||
}
|
||||
cerr << endl;
|
||||
*/
|
||||
}
|
||||
|
||||
return tp;
|
||||
}
|
||||
|
||||
@ -691,7 +749,7 @@ namespace Moses
|
||||
SPTR<ContextScope> const& scope = ttask->GetScope();
|
||||
SPTR<TPCollCache> cache = scope->get<TPCollCache>(cache_key);
|
||||
if (!cache) cache = m_cache; // no context-specific cache, use global one
|
||||
|
||||
|
||||
ret = cache->get(phrasekey, dyn->revision());
|
||||
// TO DO: we should revise the revision mechanism: we take the
|
||||
// length of the dynamic bitext (in sentences) at the time the PT
|
||||
@ -705,12 +763,12 @@ namespace Moses
|
||||
// std::cerr << ret << " with " << ret->refCount << " references at "
|
||||
// << HERE << std::endl;
|
||||
boost::upgrade_lock<boost::shared_mutex> rlock(ret->lock);
|
||||
if (ret->GetSize()) return ret;
|
||||
if (ret->GetSize()) return ret;
|
||||
|
||||
// new TPC (not found or old one was not up to date)
|
||||
boost::upgrade_to_unique_lock<boost::shared_mutex> wlock(rlock);
|
||||
// maybe another thread did the work while we waited for the lock ?
|
||||
if (ret->GetSize()) return ret;
|
||||
if (ret->GetSize()) return ret;
|
||||
|
||||
// OK: pt entry NOT found or NOT up to date
|
||||
// lookup and expansion could be done in parallel threads,
|
||||
@ -730,7 +788,8 @@ namespace Moses
|
||||
BitextSampler<Token> s(btfix, mfix, context->bias,
|
||||
m_min_sample_size,
|
||||
m_default_sample_size,
|
||||
m_sampling_method);
|
||||
m_sampling_method,
|
||||
m_track_coord);
|
||||
s();
|
||||
sfix = s.stats();
|
||||
}
|
||||
@ -918,7 +977,7 @@ namespace Moses
|
||||
{
|
||||
BitextSampler<Token> s(btfix, mfix, context->bias,
|
||||
m_min_sample_size, m_default_sample_size,
|
||||
m_sampling_method);
|
||||
m_sampling_method, m_track_coord);
|
||||
if (*context->cache1->get(pid, s.stats()) == s.stats())
|
||||
m_thread_pool->add(s);
|
||||
}
|
||||
@ -939,7 +998,7 @@ namespace Moses
|
||||
for (size_t i = 0; mdyn.size() == i && i < myphrase.size(); ++i)
|
||||
mdyn.extend(myphrase[i]);
|
||||
// let's assume a uniform bias over the foreground corpus
|
||||
if (mdyn.size() == myphrase.size()) dyn->prep(ttask, mdyn);
|
||||
if (mdyn.size() == myphrase.size()) dyn->prep(ttask, mdyn, m_track_coord);
|
||||
}
|
||||
return mdyn.size() == myphrase.size();
|
||||
}
|
||||
|
@ -119,6 +119,12 @@ namespace Moses
|
||||
std::vector<SPTR<pscorer > > m_active_ff_common;
|
||||
// activated feature functions (dyn)
|
||||
|
||||
bool m_track_coord = false; // track coordinates? Track sids when sampling
|
||||
// from bitext, append coords to target phrases
|
||||
// Space < Sid < sptr sentence coords > >
|
||||
std::vector<std::vector<SPTR<std::vector<float> > > > m_sid_coord_list;
|
||||
std::vector<size_t> m_coord_spaces;
|
||||
|
||||
void
|
||||
parse_factor_spec(std::vector<FactorType>& flist, std::string const key);
|
||||
|
||||
|
@ -28,8 +28,8 @@ namespace sapt {
|
||||
|
||||
void
|
||||
operator()(Bitext<Token> const& bt,
|
||||
PhrasePair<Token>& pp,
|
||||
std::vector<float> * dest = NULL) const
|
||||
PhrasePair<Token>& pp,
|
||||
std::vector<float> * dest = NULL) const
|
||||
{
|
||||
if (!dest) dest = &pp.fvals;
|
||||
(*dest)[this->m_index] = log(std::max(m_floor,pp.cum_bias));
|
||||
|
@ -48,8 +48,8 @@ namespace sapt {
|
||||
|
||||
void
|
||||
operator()(Bitext<Token> const& bt,
|
||||
PhrasePair<Token>& pp,
|
||||
std::vector<float> * dest = NULL) const
|
||||
PhrasePair<Token>& pp,
|
||||
std::vector<float> * dest = NULL) const
|
||||
{
|
||||
if (!dest) dest = &pp.fvals;
|
||||
float p = float(bt.T1->numTokens());
|
||||
|
@ -36,8 +36,8 @@ namespace sapt
|
||||
|
||||
void
|
||||
operator()(Bitext<Token> const& bt,
|
||||
PhrasePair<Token>& pp,
|
||||
std::vector<float> * dest = NULL) const
|
||||
PhrasePair<Token>& pp,
|
||||
std::vector<float> * dest = NULL) const
|
||||
{
|
||||
if (!dest) dest = &pp.fvals;
|
||||
// uint32_t sid1=0,sid2=0,off1=0,off2=0,len1=0,len2=0;
|
||||
|
@ -37,8 +37,8 @@ namespace sapt {
|
||||
|
||||
void
|
||||
operator()(Bitext<Token> const& bt,
|
||||
PhrasePair<Token>& pp,
|
||||
std::vector<float> * dest = NULL) const
|
||||
PhrasePair<Token>& pp,
|
||||
std::vector<float> * dest = NULL) const
|
||||
{
|
||||
if (!dest) dest = &pp.fvals;
|
||||
assert(pp.raw1);
|
||||
|
@ -38,8 +38,8 @@ namespace sapt
|
||||
|
||||
void
|
||||
operator()(Bitext<Token> const& bt,
|
||||
PhrasePair<Token>& pp,
|
||||
std::vector<float> * dest = NULL) const
|
||||
PhrasePair<Token>& pp,
|
||||
std::vector<float> * dest = NULL) const
|
||||
{
|
||||
if (!dest) dest = &pp.fvals;
|
||||
// we use the denominator specification to scale the raw counts on the
|
||||
|
@ -38,8 +38,9 @@ namespace sapt
|
||||
}
|
||||
|
||||
void
|
||||
operator()(Bitext<Token> const& bt, PhrasePair<Token> & pp,
|
||||
std::vector<float> * dest = NULL) const
|
||||
operator()(Bitext<Token> const& bt,
|
||||
PhrasePair<Token>& pp,
|
||||
std::vector<float> * dest = NULL) const
|
||||
{
|
||||
if (!dest) dest = &pp.fvals;
|
||||
if (pp.joint > pp.good1)
|
||||
|
@ -22,8 +22,8 @@ namespace sapt
|
||||
|
||||
void
|
||||
operator()(Bitext<Token> const& bt,
|
||||
PhrasePair<Token>& pp,
|
||||
std::vector<float> * dest = NULL) const
|
||||
PhrasePair<Token>& pp,
|
||||
std::vector<float> * dest = NULL) const
|
||||
{
|
||||
if (!dest) dest = &pp.fvals;
|
||||
(*dest)[this->m_index] = 1;
|
||||
|
@ -28,8 +28,8 @@ namespace sapt {
|
||||
|
||||
void
|
||||
operator()(Bitext<Token> const& bt,
|
||||
PhrasePair<Token>& pp,
|
||||
std::vector<float> * dest = NULL) const
|
||||
PhrasePair<Token>& pp,
|
||||
std::vector<float> * dest = NULL) const
|
||||
{
|
||||
if (!dest) dest = &pp.fvals;
|
||||
size_t i = this->m_index;
|
||||
|
@ -26,8 +26,8 @@ namespace sapt {
|
||||
|
||||
void
|
||||
operator()(Bitext<Token> const& bt,
|
||||
PhrasePair<Token>& pp,
|
||||
std::vector<float> * dest = NULL) const
|
||||
PhrasePair<Token>& pp,
|
||||
std::vector<float> * dest = NULL) const
|
||||
{
|
||||
if (!dest) dest = &pp.fvals;
|
||||
size_t i = this->m_index;
|
||||
|
@ -37,8 +37,8 @@ namespace sapt
|
||||
|
||||
void
|
||||
operator()(Bitext<Token> const& bt,
|
||||
PhrasePair<Token>& pp,
|
||||
std::vector<float> * dest = NULL) const
|
||||
PhrasePair<Token>& pp,
|
||||
std::vector<float> * dest = NULL) const
|
||||
{
|
||||
if (!dest) dest = &pp.fvals;
|
||||
// uint32_t sid1=0,sid2=0,off1=0,off2=0,len1=0,len2=0;
|
||||
|
@ -22,8 +22,8 @@ namespace sapt
|
||||
|
||||
void
|
||||
operator()(Bitext<Token> const& bt,
|
||||
PhrasePair<Token>& pp,
|
||||
std::vector<float> * dest = NULL) const
|
||||
PhrasePair<Token>& pp,
|
||||
std::vector<float> * dest = NULL) const
|
||||
{
|
||||
if (!dest) dest = &pp.fvals;
|
||||
(*dest)[this->m_index] = pp.len2;
|
||||
|
@ -163,7 +163,8 @@ ProcessAndStripXMLTags(AllOptions const& opts, string &line,
|
||||
vector<XmlOption const*> &res,
|
||||
ReorderingConstraint &reorderingConstraint,
|
||||
vector< size_t > &walls,
|
||||
std::vector< std::pair<size_t, std::string> > &placeholders)
|
||||
std::vector< std::pair<size_t, std::string> > &placeholders,
|
||||
InputType &input)
|
||||
{
|
||||
//parse XML markup in translation line
|
||||
|
||||
@ -401,6 +402,28 @@ ProcessAndStripXMLTags(AllOptions const& opts, string &line,
|
||||
StaticData::InstanceNonConst().SetAllWeights(allWeights);
|
||||
}
|
||||
|
||||
// Coord: coordinates of the input sentence in a user-defined space
|
||||
// <coord space="NAME" coord="X Y Z ..." />
|
||||
// where NAME is the name of the space and X Y Z ... are floats. See
|
||||
// PhraseDistanceFeature for an example of using this information for
|
||||
// feature scoring.
|
||||
else if (tagName == "coord") {
|
||||
// Parse tag
|
||||
string space = ParseXmlTagAttribute(tagContent, "space");
|
||||
vector<string> tok = Tokenize(ParseXmlTagAttribute(tagContent, "coord"));
|
||||
size_t id = StaticData::Instance().GetCoordSpace(space);
|
||||
if (!id) {
|
||||
TRACE_ERR("ERROR: no models use space " << space << ", will be ignored" << endl);
|
||||
} else {
|
||||
// Init if needed
|
||||
if (!input.m_coordMap) {
|
||||
input.m_coordMap.reset(new map<size_t const, vector<float> >);
|
||||
}
|
||||
vector<float>& coord = (*input.m_coordMap)[id];
|
||||
Scan<float>(coord, tok);
|
||||
}
|
||||
}
|
||||
|
||||
// default: opening tag that specifies translation options
|
||||
else {
|
||||
if (startPos > endPos) {
|
||||
|
@ -34,7 +34,8 @@ bool ProcessAndStripXMLTags(AllOptions const& opts,
|
||||
std::string &line, std::vector<XmlOption const*> &res,
|
||||
ReorderingConstraint &reorderingConstraint,
|
||||
std::vector< size_t > &walls,
|
||||
std::vector< std::pair<size_t, std::string> > &placeholders);
|
||||
std::vector< std::pair<size_t, std::string> > &placeholders,
|
||||
InputType &input);
|
||||
|
||||
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user