mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-26 05:14:36 +03:00
2a66a55c85
Minor overhaul to the bias regime, which allows specifying the bias by document name (as provided in the document map) rather than by sentence in the static parallel corpus.
296 lines
9.1 KiB
C++
296 lines
9.1 KiB
C++
// -*- c++ -*-
|
|
// Sampling phrase table implementation based on memory-mapped suffix arrays.
|
|
// Design and code by Ulrich Germann.
|
|
#pragma once
|
|
|
|
#include <time.h>
|
|
#include <boost/thread.hpp>
|
|
#include <boost/scoped_ptr.hpp>
|
|
|
|
#include "moses/TypeDef.h"
|
|
#include "moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h"
|
|
#include "moses/TranslationModel/UG/generic/sampling/Sampling.h"
|
|
#include "moses/TranslationModel/UG/generic/file_io/ug_stream.h"
|
|
|
|
#include "moses/TranslationModel/UG/mm/ug_mm_ttrack.h"
|
|
#include "moses/TranslationModel/UG/mm/ug_mm_tsa.h"
|
|
#include "moses/TranslationModel/UG/mm/tpt_tokenindex.h"
|
|
#include "moses/TranslationModel/UG/mm/ug_corpus_token.h"
|
|
#include "moses/TranslationModel/UG/mm/ug_typedefs.h"
|
|
#include "moses/TranslationModel/UG/mm/tpt_pickler.h"
|
|
#include "moses/TranslationModel/UG/mm/ug_bitext.h"
|
|
#include "moses/TranslationModel/UG/mm/ug_phrasepair.h"
|
|
#include "moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer2.h"
|
|
|
|
#include "moses/InputFileStream.h"
|
|
#include "moses/FactorTypeSet.h"
|
|
#include "moses/TargetPhrase.h"
|
|
#include <boost/dynamic_bitset.hpp>
|
|
#include "moses/TargetPhraseCollection.h"
|
|
#include "util/usage.hh"
|
|
#include <map>
|
|
|
|
#include "moses/TranslationModel/PhraseDictionary.h"
|
|
#include "sapt_phrase_scorers.h"
|
|
|
|
// TO DO:
|
|
// - make lexical phrase scorer take additions to the "dynamic overlay" into account
|
|
// - switch to pool of sapts, where each sapt has its own provenance feature
|
|
// RESEARCH QUESTION: is this more effective than having multiple phrase tables,
|
|
// each with its own set of features?
|
|
|
|
using namespace std;
|
|
namespace Moses
|
|
{
|
|
using namespace bitext;
|
|
class Mmsapt
|
|
#ifndef NO_MOSES
|
|
: public PhraseDictionary
|
|
#endif
|
|
{
|
|
friend class Alignment;
|
|
map<string,string> param;
|
|
sptr<SamplingBias> m_bias;
|
|
string m_name;
|
|
public:
|
|
typedef L2R_Token<SimpleWordId> Token;
|
|
typedef mmBitext<Token> mmbitext;
|
|
typedef imBitext<Token> imbitext;
|
|
typedef Bitext<Token> bitext;
|
|
typedef TSA<Token> tsa;
|
|
typedef PhraseScorer<Token> pscorer;
|
|
private:
|
|
// vector<sptr<bitext> > shards;
|
|
mmbitext btfix;
|
|
sptr<imbitext> btdyn;
|
|
string bname,extra_data,bias_file;
|
|
string L1;
|
|
string L2;
|
|
float m_lbop_conf; // confidence level for lbop smoothing
|
|
float m_lex_alpha; // alpha paramter (j+a)/(m+a) for lexical smoothing
|
|
// alpha parameter for lexical smoothing (joint+alpha)/(marg + alpha)
|
|
// must be > 0 if dynamic
|
|
size_t m_default_sample_size;
|
|
size_t m_workers; // number of worker threads for sampling the bitexts
|
|
vector<string> m_feature_set_names; // one or more of: standard, datasource
|
|
|
|
|
|
|
|
|
|
// // deprecated!
|
|
// char m_pfwd_denom; // denominator for computation of fwd phrase score:
|
|
// // 'r' - divide by raw count
|
|
// // 's' - divide by sample count
|
|
// // 'g' - devide by number of "good" (i.e. coherent) samples
|
|
// // size_t num_features;
|
|
|
|
size_t input_factor;
|
|
size_t output_factor; // we can actually return entire Tokens!
|
|
|
|
// bool withLogCountFeatures; // add logs of counts as features?
|
|
// bool withCoherence;
|
|
// string m_pfwd_features; // which pfwd functions to use
|
|
// string m_pbwd_features; // which pbwd functions to use
|
|
|
|
// for display for human inspection (ttable dumps):
|
|
vector<string> m_feature_names; // names of features activated
|
|
vector<bool> m_is_logval; // keeps track of which features are log valued
|
|
vector<bool> m_is_integer; // keeps track of which features are integer valued
|
|
|
|
vector<sptr<pscorer > > m_active_ff_fix; // activated feature functions (fix)
|
|
vector<sptr<pscorer > > m_active_ff_dyn; // activated feature functions (dyn)
|
|
vector<sptr<pscorer > > m_active_ff_common; // activated feature functions (dyn)
|
|
|
|
void
|
|
register_ff(sptr<pscorer> const& ff, vector<sptr<pscorer> > & registry);
|
|
|
|
template<typename fftype>
|
|
void
|
|
check_ff(string const ffname,vector<sptr<pscorer> >* registry = NULL);
|
|
// add feature function if specified
|
|
|
|
template<typename fftype>
|
|
void
|
|
check_ff(string const ffname, float const xtra, vector<sptr<pscorer> >* registry = NULL);
|
|
// add feature function if specified
|
|
|
|
void
|
|
add_corpus_specific_features(vector<sptr<pscorer > >& ffvec);
|
|
|
|
// built-in feature functions
|
|
// PScorePfwd<Token> calc_pfwd_fix, calc_pfwd_dyn;
|
|
// PScorePbwd<Token> calc_pbwd_fix, calc_pbwd_dyn;
|
|
// PScoreLex<Token> calc_lex; // this one I'd like to see as an external ff eventually
|
|
// PScorePC<Token> apply_pp; // apply phrase penalty
|
|
// PScoreLogCounts<Token> add_logcounts_fix;
|
|
// PScoreLogCounts<Token> add_logcounts_dyn;
|
|
void init(string const& line);
|
|
mutable boost::mutex lock;
|
|
bool withPbwd;
|
|
bool poolCounts;
|
|
vector<FactorType> ofactor;
|
|
|
|
|
|
public:
|
|
// typedef boost::unordered_map<uint64_t, sptr<TargetPhraseCollection> > tpcoll_cache_t;
|
|
class TargetPhraseCollectionWrapper
|
|
: public TargetPhraseCollection
|
|
{
|
|
public:
|
|
size_t const revision; // time stamp from dynamic bitext
|
|
::uint64_t const key; // phrase key
|
|
uint32_t refCount; // reference count
|
|
#if defined(timespec)
|
|
timespec tstamp; // last use
|
|
#else
|
|
timeval tstamp; // last use
|
|
#endif
|
|
int idx; // position in history heap
|
|
TargetPhraseCollectionWrapper(size_t r, ::uint64_t const k);
|
|
~TargetPhraseCollectionWrapper();
|
|
};
|
|
|
|
private:
|
|
|
|
void read_config_file(string fname, map<string,string>& param);
|
|
|
|
TargetPhraseCollectionWrapper*
|
|
encache(TargetPhraseCollectionWrapper* const ptr) const;
|
|
|
|
void
|
|
decache(TargetPhraseCollectionWrapper* ptr) const;
|
|
|
|
typedef map<typename ::uint64_t, TargetPhraseCollectionWrapper*> tpc_cache_t;
|
|
mutable tpc_cache_t m_cache;
|
|
mutable vector<TargetPhraseCollectionWrapper*> m_history;
|
|
// phrase table feature weights for alignment:
|
|
vector<float> feature_weights;
|
|
|
|
vector<vector<id_type> > wlex21;
|
|
// word translation lexicon (without counts, get these from calc_lex.COOC)
|
|
typedef mm2dTable<id_type,id_type,uint32_t,uint32_t> mm2dtable_t;
|
|
mm2dtable_t COOCraw;
|
|
|
|
TargetPhrase*
|
|
mkTPhrase(Phrase const& src,
|
|
Moses::bitext::PhrasePair<Token>* fix,
|
|
Moses::bitext::PhrasePair<Token>* dyn,
|
|
sptr<Bitext<Token> > const& dynbt) const;
|
|
|
|
// template<typename Token>
|
|
// void
|
|
// expand(typename Bitext<Token>::iter const& m, Bitext<Token> const& bt,
|
|
// pstats const& pstats, vector<PhrasePair<Token> >& dest);
|
|
|
|
#if 0
|
|
TargetPhrase*
|
|
mkTPhrase
|
|
(Phrase const& src,
|
|
Bitext<Token> const& bt,
|
|
Moses::bitext::PhrasePair const& pp
|
|
) const;
|
|
#endif
|
|
void
|
|
process_pstats
|
|
(Phrase const& src,
|
|
::uint64_t const pid1,
|
|
pstats const& stats,
|
|
Bitext<Token> const & bt,
|
|
TargetPhraseCollection* tpcoll
|
|
) const;
|
|
|
|
bool
|
|
pool_pstats
|
|
(Phrase const& src,
|
|
::uint64_t const pid1a, pstats * statsa, Bitext<Token> const & bta,
|
|
::uint64_t const pid1b, pstats const* statsb, Bitext<Token> const & btb,
|
|
TargetPhraseCollection* tpcoll) const;
|
|
|
|
bool
|
|
combine_pstats
|
|
(Phrase const& src,
|
|
::uint64_t const pid1a, pstats* statsa, Bitext<Token> const & bta,
|
|
::uint64_t const pid1b, pstats const* statsb, Bitext<Token> const & btb,
|
|
TargetPhraseCollection* tpcoll) const;
|
|
|
|
void
|
|
load_extra_data(string bname, bool locking);
|
|
|
|
void
|
|
load_bias(string bname);
|
|
|
|
mutable size_t m_tpc_ctr;
|
|
public:
|
|
// Mmsapt(string const& description, string const& line);
|
|
Mmsapt(string const& line);
|
|
|
|
void
|
|
Load();
|
|
|
|
void
|
|
Load(bool with_checks);
|
|
|
|
// returns the prior table limit
|
|
size_t SetTableLimit(size_t limit);
|
|
|
|
string const&
|
|
GetName() const;
|
|
|
|
#ifndef NO_MOSES
|
|
TargetPhraseCollection const*
|
|
GetTargetPhraseCollectionLEGACY(const Phrase& src) const;
|
|
//! Create a sentence-specific manager for SCFG rule lookup.
|
|
ChartRuleLookupManager*
|
|
CreateRuleLookupManager(const ChartParser &, const ChartCellCollectionBase &);
|
|
|
|
ChartRuleLookupManager*
|
|
CreateRuleLookupManager
|
|
(const ChartParser &, const ChartCellCollectionBase &, std::size_t);
|
|
#endif
|
|
|
|
void add(string const& s1, string const& s2, string const& a);
|
|
|
|
// align two new sentences
|
|
sptr<vector<int> >
|
|
align(string const& src, string const& trg) const;
|
|
|
|
void setWeights(vector<float> const& w);
|
|
|
|
void
|
|
CleanUpAfterSentenceProcessing(const InputType& source);
|
|
|
|
void
|
|
InitializeForInput(InputType const& source);
|
|
|
|
void
|
|
Release(TargetPhraseCollection const* tpc) const;
|
|
|
|
bool
|
|
ProvidesPrefixCheck() const;
|
|
|
|
/// return true if prefix /phrase/ exists
|
|
bool
|
|
PrefixExists(Phrase const& phrase, SamplingBias const* const bias) const;
|
|
|
|
bool
|
|
PrefixExists(Phrase const& phrase) const;
|
|
|
|
vector<string> const&
|
|
GetFeatureNames() const;
|
|
|
|
// void
|
|
// ScorePPfix(bitext::PhrasePair& pp) const;
|
|
|
|
bool
|
|
isLogVal(int i) const;
|
|
|
|
bool
|
|
isInteger(int i) const;
|
|
|
|
sptr<DocumentBias>
|
|
setupDocumentBias(map<string,float> const& bias) const;
|
|
private:
|
|
};
|
|
} // end namespace
|
|
|