2008-06-11 14:52:57 +04:00
|
|
|
// $Id$
|
|
|
|
|
|
|
|
/***********************************************************************
|
|
|
|
Moses - factored phrase-based language decoder
|
|
|
|
Copyright (C) 2006 University of Edinburgh
|
|
|
|
|
|
|
|
This library is free software; you can redistribute it and/or
|
|
|
|
modify it under the terms of the GNU Lesser General Public
|
|
|
|
License as published by the Free Software Foundation; either
|
|
|
|
version 2.1 of the License, or (at your option) any later version.
|
|
|
|
|
|
|
|
This library is distributed in the hope that it will be useful,
|
|
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
Lesser General Public License for more details.
|
|
|
|
|
|
|
|
You should have received a copy of the GNU Lesser General Public
|
|
|
|
License along with this library; if not, write to the Free Software
|
|
|
|
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
|
|
***********************************************************************/
|
|
|
|
|
2010-02-24 14:15:44 +03:00
|
|
|
#ifndef moses_ScoreComponentCollection_h
|
|
|
|
#define moses_ScoreComponentCollection_h
|
2008-06-11 14:52:57 +04:00
|
|
|
|
|
|
|
#include <numeric>
|
|
|
|
#include <cassert>
|
2011-08-07 04:58:56 +04:00
|
|
|
#include <sstream>
|
2010-08-10 17:12:00 +04:00
|
|
|
|
2010-09-28 19:13:50 +04:00
|
|
|
#ifdef MPI_ENABLE
|
|
|
|
#include <boost/serialization/access.hpp>
|
|
|
|
#include <boost/serialization/split_member.hpp>
|
|
|
|
#endif
|
|
|
|
|
2010-08-10 17:12:00 +04:00
|
|
|
#include "LMList.h"
|
2008-06-11 14:52:57 +04:00
|
|
|
#include "ScoreProducer.h"
|
2010-09-15 15:30:25 +04:00
|
|
|
#include "FeatureVector.h"
|
2008-06-11 14:52:57 +04:00
|
|
|
#include "TypeDef.h"
|
|
|
|
#include "Util.h"
|
|
|
|
|
2010-09-28 19:13:50 +04:00
|
|
|
|
2008-10-09 03:51:26 +04:00
|
|
|
namespace Moses
|
|
|
|
{
|
|
|
|
|
2010-09-28 19:13:50 +04:00
|
|
|
|
2008-06-11 14:52:57 +04:00
|
|
|
/*** An unweighted collection of scores for a translation or step in a translation.
|
|
|
|
*
|
|
|
|
* In the factored phrase-based models that are implemented by Moses, there is a set of
|
|
|
|
* scores that come from a variety of sources (translation probabilities, language model
|
|
|
|
* probabilities, distortion probabilities, generation probabilities). Furthermore, while
|
|
|
|
* some of these scores may be 0, this number is fixed (and generally quite small, ie, less
|
|
|
|
* than 15), for a given model.
|
|
|
|
*
|
|
|
|
* The values contained in ScoreComponentCollection objects are unweighted scores (log-probs).
|
2011-02-24 16:14:42 +03:00
|
|
|
*
|
2008-06-11 14:52:57 +04:00
|
|
|
* ScoreComponentCollection objects can be added and subtracted, which makes them appropriate
|
|
|
|
* to be the datatype used to return the result of a score computation (in this case they will
|
|
|
|
* have most values set to zero, except for the ones that are results of the individual computation);
|
|
|
|
* this will then be added into the "running total" in the Hypothesis. In fact, for a score
|
|
|
|
* to be tracked in the hypothesis (and thus to participate in the decoding process), a class
|
|
|
|
* representing that score must extend the ScoreProducer abstract base class. For an example
|
|
|
|
* refer to the DistortionScoreProducer class.
|
|
|
|
*/
|
2011-02-24 16:14:42 +03:00
|
|
|
class ScoreComponentCollection
|
|
|
|
{
|
2008-06-11 14:52:57 +04:00
|
|
|
friend std::ostream& operator<<(std::ostream& os, const ScoreComponentCollection& rhs);
|
|
|
|
private:
|
2010-09-14 21:29:39 +04:00
|
|
|
FVector m_scores;
|
2008-06-11 14:52:57 +04:00
|
|
|
|
|
|
|
public:
|
|
|
|
//! Create a new score collection with all values set to 0.0
|
2011-02-24 16:14:42 +03:00
|
|
|
ScoreComponentCollection();
|
2008-06-11 14:52:57 +04:00
|
|
|
|
|
|
|
//! Clone a score collection
|
|
|
|
ScoreComponentCollection(const ScoreComponentCollection& rhs)
|
|
|
|
: m_scores(rhs.m_scores)
|
|
|
|
{}
|
|
|
|
|
2010-10-15 19:19:17 +04:00
|
|
|
/** Load from file */
|
|
|
|
bool Load(const std::string& filename)
|
|
|
|
{
|
|
|
|
return m_scores.load(filename);
|
|
|
|
}
|
|
|
|
|
2011-08-07 04:58:56 +04:00
|
|
|
FVector GetScoresVector() const
|
2010-12-06 18:28:51 +03:00
|
|
|
{
|
|
|
|
return m_scores;
|
|
|
|
}
|
2008-09-24 20:48:23 +04:00
|
|
|
|
2011-02-02 21:03:08 +03:00
|
|
|
size_t Size()
|
|
|
|
{
|
|
|
|
return m_scores.size();
|
|
|
|
}
|
|
|
|
|
2008-06-11 14:52:57 +04:00
|
|
|
//! Set all values to 0.0
|
|
|
|
void ZeroAll()
|
|
|
|
{
|
2010-09-14 21:29:39 +04:00
|
|
|
m_scores.clear();
|
2008-06-11 14:52:57 +04:00
|
|
|
}
|
|
|
|
|
2010-09-17 19:20:55 +04:00
|
|
|
void MultiplyEquals(float scalar);
|
2010-10-28 16:41:33 +04:00
|
|
|
void DivideEquals(float scalar);
|
2010-09-17 19:33:21 +04:00
|
|
|
void MultiplyEquals(const ScoreComponentCollection& rhs);
|
2010-09-17 19:20:55 +04:00
|
|
|
|
2008-06-11 14:52:57 +04:00
|
|
|
//! add the score in rhs
|
|
|
|
void PlusEquals(const ScoreComponentCollection& rhs)
|
|
|
|
{
|
2010-09-14 21:29:39 +04:00
|
|
|
m_scores += rhs.m_scores;
|
2008-06-11 14:52:57 +04:00
|
|
|
}
|
|
|
|
|
2010-12-15 22:48:45 +03:00
|
|
|
void PlusEquals(const FVector& scores)
|
|
|
|
{
|
|
|
|
m_scores += scores;
|
|
|
|
}
|
|
|
|
|
2008-06-11 14:52:57 +04:00
|
|
|
//! subtract the score in rhs
|
|
|
|
void MinusEquals(const ScoreComponentCollection& rhs)
|
|
|
|
{
|
2010-09-14 21:29:39 +04:00
|
|
|
m_scores -= rhs.m_scores;
|
2008-06-11 14:52:57 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2009-02-06 18:43:06 +03:00
|
|
|
//! Add scores from a single ScoreProducer only
|
|
|
|
//! The length of scores must be equal to the number of score components
|
|
|
|
//! produced by sp
|
|
|
|
void PlusEquals(const ScoreProducer* sp, const ScoreComponentCollection& scores)
|
|
|
|
{
|
2010-10-07 02:06:49 +04:00
|
|
|
const std::vector<FName>& names = sp->GetFeatureNames();
|
|
|
|
for (std::vector<FName>::const_iterator i = names.begin();
|
|
|
|
i != names.end(); ++i) {
|
|
|
|
m_scores[*i] += scores.m_scores[*i];
|
|
|
|
}
|
2009-02-06 18:43:06 +03:00
|
|
|
}
|
|
|
|
|
2010-10-07 02:06:49 +04:00
|
|
|
//! Add scores from a single ScoreProducer only
|
|
|
|
//! The length of scores must be equal to the number of score components
|
|
|
|
//! produced by sp
|
|
|
|
void PlusEquals(const ScoreProducer* sp, const std::vector<float>& scores)
|
|
|
|
{
|
|
|
|
const std::vector<FName>& names = sp->GetFeatureNames();
|
|
|
|
assert(names.size() == scores.size());
|
|
|
|
for (size_t i = 0; i < scores.size(); ++i) {
|
2010-10-08 18:28:20 +04:00
|
|
|
m_scores[names[i]] += scores[i];
|
2010-10-07 02:06:49 +04:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2008-06-11 14:52:57 +04:00
|
|
|
//! Special version PlusEquals(ScoreProducer, vector<float>)
|
|
|
|
//! to add the score from a single ScoreProducer that produces
|
|
|
|
//! a single value
|
|
|
|
void PlusEquals(const ScoreProducer* sp, float score)
|
|
|
|
{
|
|
|
|
assert(1 == sp->GetNumScoreComponents());
|
2010-10-07 02:06:49 +04:00
|
|
|
m_scores[sp->GetFeatureNames()[0]] += score;
|
2008-06-11 14:52:57 +04:00
|
|
|
}
|
|
|
|
|
2010-10-15 01:52:35 +04:00
|
|
|
//For features which have an unbounded number of components
|
|
|
|
void PlusEquals(const ScoreProducer*sp, const std::string& name, float score)
|
|
|
|
{
|
|
|
|
assert(sp->GetNumScoreComponents() == ScoreProducer::unlimited);
|
|
|
|
FName fname(sp->GetScoreProducerDescription(),name);
|
|
|
|
m_scores[fname] += score;
|
|
|
|
}
|
|
|
|
|
2008-06-11 14:52:57 +04:00
|
|
|
void Assign(const ScoreProducer* sp, const std::vector<float>& scores)
|
|
|
|
{
|
|
|
|
assert(scores.size() == sp->GetNumScoreComponents());
|
2010-10-07 02:06:49 +04:00
|
|
|
const std::vector<FName>& names = sp->GetFeatureNames();
|
|
|
|
for (size_t i = 0; i < scores.size(); ++i) {
|
|
|
|
m_scores[names[i]] = scores[i];
|
|
|
|
}
|
2008-06-11 14:52:57 +04:00
|
|
|
}
|
|
|
|
|
2010-10-15 01:52:35 +04:00
|
|
|
//! Special version Assign(ScoreProducer, vector<float>)
|
2008-06-11 14:52:57 +04:00
|
|
|
//! to add the score from a single ScoreProducer that produces
|
|
|
|
//! a single value
|
|
|
|
void Assign(const ScoreProducer* sp, float score)
|
|
|
|
{
|
|
|
|
assert(1 == sp->GetNumScoreComponents());
|
2010-10-07 02:06:49 +04:00
|
|
|
m_scores[sp->GetFeatureNames()[0]] = score;
|
2010-09-17 19:20:16 +04:00
|
|
|
}
|
|
|
|
|
2010-10-15 01:52:35 +04:00
|
|
|
//For features which have an unbounded number of components
|
|
|
|
void Assign(const ScoreProducer*sp, const std::string name, float score)
|
|
|
|
{
|
|
|
|
assert(sp->GetNumScoreComponents() == ScoreProducer::unlimited);
|
|
|
|
FName fname(sp->GetScoreProducerDescription(),name);
|
|
|
|
m_scores[fname] = score;
|
|
|
|
}
|
|
|
|
|
2011-06-01 21:26:41 +04:00
|
|
|
// shortcut: setting the value directly using the feature name
|
|
|
|
void Assign(const std::string name, float score)
|
|
|
|
{
|
|
|
|
FName fname(name);
|
|
|
|
m_scores[fname] = score;
|
|
|
|
}
|
2010-09-17 19:44:07 +04:00
|
|
|
|
|
|
|
float InnerProduct(const ScoreComponentCollection& rhs) const
|
|
|
|
{
|
|
|
|
return m_scores.inner_product(rhs.m_scores);
|
|
|
|
}
|
2008-06-11 14:52:57 +04:00
|
|
|
|
|
|
|
float PartialInnerProduct(const ScoreProducer* sp, const std::vector<float>& rhs) const
|
|
|
|
{
|
|
|
|
std::vector<float> lhs = GetScoresForProducer(sp);
|
|
|
|
assert(lhs.size() == rhs.size());
|
|
|
|
return std::inner_product(lhs.begin(), lhs.end(), rhs.begin(), 0.0f);
|
|
|
|
}
|
|
|
|
|
|
|
|
//! return a vector of all the scores associated with a certain ScoreProducer
|
|
|
|
std::vector<float> GetScoresForProducer(const ScoreProducer* sp) const
|
|
|
|
{
|
2011-03-18 18:37:15 +03:00
|
|
|
size_t components = sp->GetNumScoreComponents();
|
|
|
|
if (components == ScoreProducer::unlimited) return std::vector<float>();
|
|
|
|
std::vector<float> res(components);
|
2010-10-07 02:06:49 +04:00
|
|
|
const std::vector<FName>& names = sp->GetFeatureNames();
|
|
|
|
for (size_t i = 0; i < names.size(); ++i) {
|
|
|
|
res[i] = m_scores[names[i]];
|
|
|
|
}
|
|
|
|
return res;
|
2008-06-11 14:52:57 +04:00
|
|
|
}
|
|
|
|
|
2011-08-07 04:58:56 +04:00
|
|
|
//! get subset of scores that belong to a certain sparse ScoreProducer
|
|
|
|
FVector GetVectorForProducer(const ScoreProducer* sp) const
|
|
|
|
{
|
|
|
|
FVector fv;
|
|
|
|
std::string prefix = sp->GetScoreProducerWeightShortName() + FName::SEP;
|
|
|
|
for(FVector::FNVmap::const_iterator i = m_scores.cbegin(); i != m_scores.cend(); i++) {
|
|
|
|
std::stringstream name;
|
|
|
|
name << i->first;
|
|
|
|
if (name.str().substr( 0, prefix.length() ).compare( prefix ) == 0)
|
|
|
|
fv[i->first] = i->second;
|
|
|
|
}
|
|
|
|
return fv;
|
|
|
|
}
|
|
|
|
|
2010-12-16 23:01:36 +03:00
|
|
|
void ApplyLog(size_t baseOfLog) {
|
|
|
|
m_scores.applyLog(baseOfLog);
|
|
|
|
}
|
|
|
|
|
2011-03-08 19:58:02 +03:00
|
|
|
void ThresholdScaling(float maxValue)
|
2011-03-03 19:14:20 +03:00
|
|
|
{
|
2011-03-04 19:15:01 +03:00
|
|
|
// find (smallest) factor for which all weights are <= maxValue
|
|
|
|
// 0.1 / 0.14 = 0.714285714
|
|
|
|
// 0.1 / 0.17 = 0.588235294
|
2011-08-16 15:10:54 +04:00
|
|
|
m_scores.thresholdScale(maxValue);
|
2011-03-03 19:14:20 +03:00
|
|
|
}
|
|
|
|
|
2008-06-11 14:52:57 +04:00
|
|
|
//! if a ScoreProducer produces a single score (for example, a language model score)
|
|
|
|
//! this will return it. If not, this method will throw
|
|
|
|
float GetScoreForProducer(const ScoreProducer* sp) const
|
|
|
|
{
|
2010-10-07 02:06:49 +04:00
|
|
|
assert(sp->GetNumScoreComponents() == 1);
|
|
|
|
return m_scores[sp->GetFeatureNames()[0]];
|
2008-06-11 14:52:57 +04:00
|
|
|
}
|
|
|
|
|
2010-10-15 01:52:35 +04:00
|
|
|
//For features which have an unbounded number of components
|
|
|
|
float GetScoreForProducer
|
|
|
|
(const ScoreProducer* sp, const std::string& name) const
|
|
|
|
{
|
|
|
|
assert(sp->GetNumScoreComponents() == ScoreProducer::unlimited);
|
|
|
|
FName fname(sp->GetScoreProducerDescription(),name);
|
|
|
|
return m_scores[fname];
|
|
|
|
}
|
|
|
|
|
2010-04-08 21:16:10 +04:00
|
|
|
float GetWeightedScore() const;
|
|
|
|
|
2010-10-28 16:41:33 +04:00
|
|
|
void ZeroAllLM(const LMList& lmList);
|
|
|
|
void PlusEqualsAllLM(const LMList& lmList, const ScoreComponentCollection& rhs);
|
2010-09-28 19:13:50 +04:00
|
|
|
void L1Normalise();
|
2011-05-31 19:39:48 +04:00
|
|
|
float GetL1Norm();
|
2011-04-23 16:13:43 +04:00
|
|
|
float GetL2Norm();
|
2010-10-28 16:41:33 +04:00
|
|
|
void Save(std::string filename) {m_scores.save(filename);}
|
2010-09-28 19:13:50 +04:00
|
|
|
|
|
|
|
#ifdef MPI_ENABLE
|
|
|
|
public:
|
|
|
|
friend class boost::serialization::access;
|
|
|
|
|
|
|
|
private:
|
|
|
|
//serialization
|
|
|
|
template<class Archive>
|
|
|
|
void save(Archive &ar, const unsigned int version) const {
|
|
|
|
ar << m_scores;
|
|
|
|
}
|
|
|
|
|
|
|
|
template<class Archive>
|
|
|
|
void load(Archive &ar, const unsigned int version) {
|
|
|
|
ar >> m_scores;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
BOOST_SERIALIZATION_SPLIT_MEMBER()
|
|
|
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
};
|
2010-04-08 21:16:10 +04:00
|
|
|
|
2010-09-28 19:13:50 +04:00
|
|
|
struct SCCPlus {
|
|
|
|
ScoreComponentCollection operator()
|
|
|
|
(const ScoreComponentCollection& lhs,
|
|
|
|
const ScoreComponentCollection& rhs) {
|
|
|
|
ScoreComponentCollection sum(lhs);
|
|
|
|
sum.PlusEquals(rhs);
|
|
|
|
return sum;
|
|
|
|
}
|
2008-06-11 14:52:57 +04:00
|
|
|
};
|
|
|
|
|
2008-10-09 03:51:26 +04:00
|
|
|
}
|
2010-02-24 14:15:44 +03:00
|
|
|
#endif
|