2014-07-21 14:04:43 +04:00
|
|
|
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2014- University of Edinburgh

This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.

This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
|
|
|
|
#ifndef MERT_FOREST_RESCORE_H
|
|
|
|
#define MERT_FOREST_RESCORE_H
|
|
|
|
|
|
|
|
#include <valarray>
|
|
|
|
#include <vector>
|
|
|
|
|
|
|
|
#include <boost/unordered_set.hpp>
|
|
|
|
|
|
|
|
#include "BleuScorer.h"
|
|
|
|
#include "Hypergraph.h"
|
|
|
|
|
2015-01-14 14:07:42 +03:00
|
|
|
namespace MosesTuning
|
|
|
|
{
|
2014-07-21 14:04:43 +04:00
|
|
|
|
|
|
|
// Stream-insertion operator for a WordVec. Only declared here; the
// definition (and therefore the exact output format) lives elsewhere.
std::ostream& operator<<(std::ostream& out, const WordVec& wordVec);
|
|
|
|
|
|
|
|
struct NgramHash : public std::unary_function<const WordVec&, std::size_t> {
|
|
|
|
std::size_t operator()(const WordVec& ngram) const {
|
|
|
|
return util::MurmurHashNative(&(ngram[0]), ngram.size() * sizeof(WordVec::value_type));
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
struct NgramEquals : public std::binary_function<const WordVec&, const WordVec&, bool> {
|
|
|
|
bool operator()(const WordVec& first, const WordVec& second) const {
|
|
|
|
if (first.size() != second.size()) return false;
|
|
|
|
return memcmp(&(first[0]), &(second[0]), first.size() * sizeof(WordVec::value_type)) == 0;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
// Hash map from an n-gram (WordVec) to a size_t value, keyed with NgramHash
// and NgramEquals. Presumably the value is an occurrence count -- confirm
// against the code that fills it.
// NOTE(review): this header includes only <boost/unordered_set.hpp> directly;
// boost::unordered_map is presumably pulled in through another include.
typedef boost::unordered_map<WordVec, size_t, NgramHash, NgramEquals> NgramCounter;
|
|
|
|
|
|
|
|
|
2015-01-14 14:07:42 +03:00
|
|
|
class ReferenceSet
|
|
|
|
{
|
2014-07-21 14:04:43 +04:00
|
|
|
|
|
|
|
|
|
|
|
public:
|
2015-01-14 14:07:42 +03:00
|
|
|
|
2014-07-21 14:04:43 +04:00
|
|
|
void AddLine(size_t sentenceId, const StringPiece& line, Vocab& vocab);
|
|
|
|
|
|
|
|
void Load(const std::vector<std::string>& files, Vocab& vocab);
|
|
|
|
|
|
|
|
size_t NgramMatches(size_t sentenceId, const WordVec&, bool clip) const;
|
|
|
|
|
2015-01-14 14:07:42 +03:00
|
|
|
size_t Length(size_t sentenceId) const {
|
|
|
|
return lengths_[sentenceId];
|
|
|
|
}
|
2014-07-21 14:04:43 +04:00
|
|
|
|
|
|
|
private:
|
|
|
|
//ngrams to (clipped,unclipped) counts
|
|
|
|
typedef boost::unordered_map<WordVec, std::pair<std::size_t,std::size_t>, NgramHash,NgramEquals> NgramMap;
|
|
|
|
std::vector<NgramMap> ngramCounts_;
|
|
|
|
std::vector<size_t> lengths_;
|
|
|
|
|
|
|
|
};
|
|
|
|
|
|
|
|
struct VertexState {
|
|
|
|
VertexState();
|
|
|
|
|
|
|
|
std::vector<FeatureStatsType> bleuStats;
|
|
|
|
WordVec leftContext;
|
|
|
|
WordVec rightContext;
|
|
|
|
size_t targetLength;
|
|
|
|
};
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Used to score an rule (ie edge) when we are applying it.
|
|
|
|
**/
|
2015-01-14 14:07:42 +03:00
|
|
|
class HgBleuScorer
|
|
|
|
{
|
|
|
|
public:
|
|
|
|
HgBleuScorer(const ReferenceSet& references, const Graph& graph, size_t sentenceId, const std::vector<FeatureStatsType>& backgroundBleu):
|
2014-07-21 14:04:43 +04:00
|
|
|
references_(references), sentenceId_(sentenceId), graph_(graph), backgroundBleu_(backgroundBleu),
|
2015-01-14 14:07:42 +03:00
|
|
|
backgroundRefLength_(backgroundBleu[kBleuNgramOrder*2]) {
|
|
|
|
vertexStates_.resize(graph.VertexSize());
|
|
|
|
totalSourceLength_ = graph.GetVertex(graph.VertexSize()-1).SourceCovered();
|
|
|
|
}
|
2014-07-21 14:04:43 +04:00
|
|
|
|
2015-01-14 14:07:42 +03:00
|
|
|
FeatureStatsType Score(const Edge& edge, const Vertex& head, std::vector<FeatureStatsType>& bleuStats) ;
|
2014-07-21 14:04:43 +04:00
|
|
|
|
2015-01-14 14:07:42 +03:00
|
|
|
void UpdateState(const Edge& winnerEdge, size_t vertexId, const std::vector<FeatureStatsType>& bleuStats);
|
2014-07-21 14:04:43 +04:00
|
|
|
|
|
|
|
|
2015-01-14 14:07:42 +03:00
|
|
|
private:
|
|
|
|
const ReferenceSet& references_;
|
|
|
|
std::vector<VertexState> vertexStates_;
|
|
|
|
size_t sentenceId_;
|
|
|
|
size_t totalSourceLength_;
|
|
|
|
const Graph& graph_;
|
|
|
|
std::vector<FeatureStatsType> backgroundBleu_;
|
|
|
|
FeatureStatsType backgroundRefLength_;
|
|
|
|
|
|
|
|
void UpdateMatches(const NgramCounter& counter, std::vector<FeatureStatsType>& bleuStats) const;
|
|
|
|
size_t GetTargetLength(const Edge& edge) const;
|
2014-07-21 14:04:43 +04:00
|
|
|
};
|
|
|
|
|
|
|
|
struct HgHypothesis {
|
|
|
|
SparseVector featureVector;
|
|
|
|
WordVec text;
|
|
|
|
std::vector<FeatureStatsType> bleuStats;
|
|
|
|
};
|
|
|
|
|
|
|
|
// Declared here, defined elsewhere. Takes a hypergraph, feature weights, a
// BLEU weight, the reference set, the sentence id and background BLEU
// statistics, and an HgHypothesis pointer. Presumably searches the graph for
// the best-scoring derivation and writes it to *bestHypo -- confirm against
// the implementation.
void Viterbi(const Graph& graph, const SparseVector& weights, float bleuWeight, const ReferenceSet& references, size_t sentenceId, const std::vector<FeatureStatsType>& backgroundBleu, HgHypothesis* bestHypo);
|
|
|
|
|
|
|
|
};
|
|
|
|
|
|
|
|
#endif
|