mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-26 05:14:36 +03:00
46e31a285c
- Bug fixes and conceptual improvements in biased sampling. The sampling now tries to stick to the bias, even when an unsuitable corpus dominates the occurrences.
241 lines
8.4 KiB
C++
241 lines
8.4 KiB
C++
// -*- c++ -*-
|
|
// Sampling phrase table implementation based on memory-mapped suffix arrays.
|
|
// Design and code by Ulrich Germann.
|
|
#pragma once
|
|
|
|
#include <boost/thread.hpp>
|
|
#include <boost/scoped_ptr.hpp>
|
|
|
|
#include "moses/TypeDef.h"
|
|
#include "moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h"
|
|
#include "moses/TranslationModel/UG/generic/sampling/Sampling.h"
|
|
#include "moses/TranslationModel/UG/generic/file_io/ug_stream.h"
|
|
|
|
#include "moses/TranslationModel/UG/mm/ug_mm_ttrack.h"
|
|
#include "moses/TranslationModel/UG/mm/ug_mm_tsa.h"
|
|
#include "moses/TranslationModel/UG/mm/tpt_tokenindex.h"
|
|
#include "moses/TranslationModel/UG/mm/ug_corpus_token.h"
|
|
#include "moses/TranslationModel/UG/mm/ug_typedefs.h"
|
|
#include "moses/TranslationModel/UG/mm/tpt_pickler.h"
|
|
#include "moses/TranslationModel/UG/mm/ug_bitext.h"
|
|
#include "moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer2.h"
|
|
|
|
#include "moses/TranslationModel/UG/TargetPhraseCollectionCache.h"
|
|
|
|
#include "moses/InputFileStream.h"
|
|
#include "moses/FactorTypeSet.h"
|
|
#include "moses/TargetPhrase.h"
|
|
#include <boost/dynamic_bitset.hpp>
|
|
#include "moses/TargetPhraseCollection.h"
|
|
#include "util/usage.hh"
|
|
#include <map>
|
|
|
|
#include "moses/TranslationModel/PhraseDictionary.h"
|
|
#include "sapt_phrase_scorers.h"
|
|
|
|
// TO DO:
|
|
// - make the lexical phrase scorer take additions to the "dynamic overlay" into account
|
|
// - switch to pool of sapts, where each sapt has its own provenance feature
|
|
// RESEARCH QUESTION: is this more effective than having multiple phrase tables,
|
|
// each with its own set of features?
|
|
|
|
namespace Moses
|
|
{
|
|
using namespace bitext;
|
|
class Mmsapt
|
|
#ifndef NO_MOSES
|
|
: public PhraseDictionary
|
|
#endif
|
|
{
|
|
// using namespace std;
|
|
class TPCOllCache;
|
|
friend class Alignment;
|
|
std::map<std::string,std::string> param;
|
|
std::string m_name;
|
|
public:
|
|
typedef L2R_Token<SimpleWordId> Token;
|
|
typedef mmBitext<Token> mmbitext;
|
|
typedef imBitext<Token> imbitext;
|
|
typedef Bitext<Token> bitext;
|
|
typedef TSA<Token> tsa;
|
|
typedef PhraseScorer<Token> pscorer;
|
|
private:
|
|
// vector<sptr<bitext> > shards;
|
|
mmbitext btfix;
|
|
sptr<imbitext> btdyn;
|
|
std::string m_bname, m_extra_data, m_bias_file,m_bias_server;
|
|
std::string L1;
|
|
std::string L2;
|
|
float m_lbop_conf; // confidence level for lbop smoothing
|
|
float m_lex_alpha; // alpha paramter (j+a)/(m+a) for lexical smoothing
|
|
// alpha parameter for lexical smoothing (joint+alpha)/(marg + alpha)
|
|
// must be > 0 if dynamic
|
|
size_t m_default_sample_size;
|
|
size_t m_workers; // number of worker threads for sampling the bitexts
|
|
std::vector<std::string> m_feature_set_names; // one or more of: standard, datasource
|
|
std::string m_bias_logfile;
|
|
boost::scoped_ptr<ofstream> m_bias_logger; // for logging to a file
|
|
ostream* m_bias_log;
|
|
int m_bias_loglevel;
|
|
public:
|
|
void* const cache_key; // for getting cache from ttask
|
|
void* const context_key; // for context scope from ttask
|
|
private:
|
|
boost::shared_ptr<SamplingBias> m_bias; // for global default bias
|
|
boost::shared_ptr<TPCollCache> m_cache; // for global default bias
|
|
size_t m_cache_size; //
|
|
size_t input_factor; //
|
|
size_t output_factor; // we can actually return entire Tokens!
|
|
|
|
// for display for human inspection (ttable dumps):
|
|
std::vector<std::string> m_feature_names; // names of features activated
|
|
std::vector<bool> m_is_logval; // keeps track of which features are log valued
|
|
std::vector<bool> m_is_integer; // keeps track of which features are integer valued
|
|
|
|
std::vector<sptr<pscorer > > m_active_ff_fix; // activated feature functions (fix)
|
|
std::vector<sptr<pscorer > > m_active_ff_dyn; // activated feature functions (dyn)
|
|
std::vector<sptr<pscorer > > m_active_ff_common;
|
|
// activated feature functions (dyn)
|
|
|
|
void
|
|
register_ff(sptr<pscorer> const& ff, std::vector<sptr<pscorer> > & registry);
|
|
|
|
template<typename fftype>
|
|
void
|
|
check_ff(std::string const ffname,std::vector<sptr<pscorer> >* registry = NULL);
|
|
// add feature function if specified
|
|
|
|
template<typename fftype>
|
|
void
|
|
check_ff(std::string const ffname, float const xtra,
|
|
std::vector<sptr<pscorer> >* registry = NULL);
|
|
// add feature function if specified
|
|
|
|
void
|
|
add_corpus_specific_features(std::vector<sptr<pscorer > >& ffvec);
|
|
|
|
// built-in feature functions
|
|
// PScorePfwd<Token> calc_pfwd_fix, calc_pfwd_dyn;
|
|
// PScorePbwd<Token> calc_pbwd_fix, calc_pbwd_dyn;
|
|
// PScoreLex<Token> calc_lex;
|
|
// this one I'd like to see as an external ff eventually
|
|
// PScorePC<Token> apply_pp; // apply phrase penalty
|
|
// PScoreLogCounts<Token> add_logcounts_fix;
|
|
// PScoreLogCounts<Token> add_logcounts_dyn;
|
|
void init(std::string const& line);
|
|
mutable boost::shared_mutex m_lock;
|
|
// mutable boost::shared_mutex m_cache_lock;
|
|
// for more complex operations on the cache
|
|
bool withPbwd;
|
|
bool poolCounts;
|
|
std::vector<FactorType> ofactor;
|
|
|
|
|
|
private:
|
|
|
|
void read_config_file(std::string fname, std::map<std::string,std::string>& param);
|
|
|
|
// phrase table feature weights for alignment:
|
|
std::vector<float> feature_weights;
|
|
|
|
std::vector<std::vector<id_type> > wlex21;
|
|
// word translation lexicon (without counts, get these from calc_lex.COOC)
|
|
typedef mm2dTable<id_type,id_type,uint32_t,uint32_t> mm2dtable_t;
|
|
mm2dtable_t COOCraw;
|
|
|
|
TargetPhrase*
|
|
mkTPhrase(Phrase const& src,
|
|
Moses::bitext::PhrasePair<Token>* fix,
|
|
Moses::bitext::PhrasePair<Token>* dyn,
|
|
sptr<Bitext<Token> > const& dynbt) const;
|
|
|
|
void
|
|
process_pstats
|
|
(Phrase const& src,
|
|
uint64_t const pid1,
|
|
pstats const& stats,
|
|
Bitext<Token> const & bt,
|
|
TargetPhraseCollection* tpcoll
|
|
) const;
|
|
|
|
bool
|
|
pool_pstats
|
|
(Phrase const& src,
|
|
uint64_t const pid1a, pstats * statsa, Bitext<Token> const & bta,
|
|
uint64_t const pid1b, pstats const* statsb, Bitext<Token> const & btb,
|
|
TargetPhraseCollection* tpcoll) const;
|
|
|
|
bool
|
|
combine_pstats
|
|
(Phrase const& src,
|
|
uint64_t const pid1a, pstats* statsa, Bitext<Token> const & bta,
|
|
uint64_t const pid1b, pstats const* statsb, Bitext<Token> const & btb,
|
|
TargetPhraseCollection* tpcoll) const;
|
|
|
|
void load_extra_data(std::string bname, bool locking);
|
|
void load_bias(std::string bname);
|
|
|
|
public:
|
|
// Mmsapt(std::string const& description, std::string const& line);
|
|
Mmsapt(std::string const& line);
|
|
|
|
void Load();
|
|
void Load(bool with_checks);
|
|
size_t SetTableLimit(size_t limit); // returns the prior table limit
|
|
std::string const& GetName() const;
|
|
|
|
#ifndef NO_MOSES
|
|
TargetPhraseCollection const*
|
|
GetTargetPhraseCollectionLEGACY(ttasksptr const& ttask, const Phrase& src) const;
|
|
|
|
TargetPhraseCollection const*
|
|
GetTargetPhraseCollectionLEGACY(const Phrase& src) const;
|
|
|
|
void
|
|
GetTargetPhraseCollectionBatch(ttasksptr const& ttask,
|
|
const InputPathList &inputPathQueue) const;
|
|
|
|
//! Create a sentence-specific manager for SCFG rule lookup.
|
|
ChartRuleLookupManager*
|
|
CreateRuleLookupManager(const ChartParser &, const ChartCellCollectionBase &);
|
|
|
|
ChartRuleLookupManager*
|
|
CreateRuleLookupManager(const ChartParser &, const ChartCellCollectionBase &,
|
|
std::size_t);
|
|
#endif
|
|
|
|
void add(std::string const& s1, std::string const& s2, std::string const& a);
|
|
// add a new sentence pair to the dynamic bitext
|
|
|
|
void setWeights(std::vector<float> const& w);
|
|
|
|
|
|
void Release(ttasksptr const& ttask, TargetPhraseCollection*& tpc) const;
|
|
// some consumer lets me know that *tpc isn't needed any more
|
|
|
|
|
|
bool ProvidesPrefixCheck() const; // return true if prefix /phrase/ check exists
|
|
// bool PrefixExists(Phrase const& phrase, SamplingBias const* const bias) const;
|
|
bool PrefixExists(ttasksptr const& ttask, Phrase const& phrase) const;
|
|
|
|
bool isLogVal(int i) const;
|
|
bool isInteger(int i) const;
|
|
|
|
// task setup and takedown functions
|
|
void InitializeForInput(ttasksptr const& ttask);
|
|
// void CleanUpAfterSentenceProcessing(const InputType& source);
|
|
void CleanUpAfterSentenceProcessing(ttasksptr const& ttask);
|
|
|
|
// align two new sentences
|
|
sptr<std::vector<int> >
|
|
align(std::string const& src, std::string const& trg) const;
|
|
|
|
std::vector<std::string> const&
|
|
GetFeatureNames() const;
|
|
|
|
sptr<DocumentBias>
|
|
setupDocumentBias(std::map<std::string,float> const& bias) const;
|
|
};
|
|
} // end namespace
|
|
|