Mirror of https://github.com/moses-smt/mosesdecoder.git (synced 2024-12-27 05:55:02 +03:00)
adding legacy files
This commit is contained in:
parent 0036d8bb4d
commit a8316c5975
vw/ExtractorConfig.cpp (new file, 51 lines)
@@ -0,0 +1,51 @@
#include "ExtractorConfig.h"
#include "Util.h"

#include <exception>
#include <stdexcept>
#include <algorithm>
#include <set>

using namespace std;
using namespace boost::bimaps;
using namespace Moses;

namespace Classifier
{

void ExtractorConfig::Load(const string &configFile)
{
  try {
    IniReader reader(configFile);
    m_sourceInternal = reader.Get<bool>("features.source-internal", false);
    m_sourceExternal = reader.Get<bool>("features.source-external", false);
    m_targetInternal = reader.Get<bool>("features.target-internal", false);
    m_sourceIndicator = reader.Get<bool>("features.source-indicator", false);
    m_targetIndicator = reader.Get<bool>("features.target-indicator", false);
    m_sourceTargetIndicator = reader.Get<bool>("features.source-target-indicator", false);
    m_STSE = reader.Get<bool>("features.source-target-source-external", false);
    m_paired = reader.Get<bool>("features.paired", false);
    m_bagOfWords = reader.Get<bool>("features.bag-of-words", false);
    m_mostFrequent = reader.Get<bool>("features.most-frequent", false);
    m_binnedScores = reader.Get<bool>("features.binned-scores", false);
    m_sourceTopic = reader.Get<bool>("features.source-topic", false);
    m_phraseFactor = reader.Get<bool>("features.phrase-factor", false);
    m_windowSize = reader.Get<size_t>("features.window-size", 0);

    m_factors = Scan<size_t>(Tokenize(reader.Get<string>("features.factors", ""), ","));
    m_scoreIndexes = Scan<size_t>(Tokenize(reader.Get<string>("features.scores", ""), ","));
    m_scoreBins = Scan<float>(Tokenize(reader.Get<string>("features.score-bins", ""), ","));

    m_vwOptsTrain = reader.Get<string>("vw-options.train", "");
    m_vwOptsPredict = reader.Get<string>("vw-options.predict", "");

    m_normalization = reader.Get<string>("decoder.normalization", "");

    m_isLoaded = true;
  } catch (const runtime_error &err) {
    cerr << "Error loading file " << configFile << ": " << err.what();
    m_isLoaded = false;
  }
}

} // namespace Classifier
vw/ExtractorConfig.h (new file, 60 lines)
@@ -0,0 +1,60 @@
#ifndef moses_ExtractorConfig_h
#define moses_ExtractorConfig_h

#include <vector>
#include <string>
#include <map>
#include <boost/bimap/bimap.hpp>
#include "IniReader.h"

namespace Classifier
{

const size_t FACTOR_FORM = 0; // index of surface forms
const size_t P_E_F_INDEX = 2; // index of P(e|f) score in phrase table

class ExtractorConfig
{
public:
  void Load(const std::string &configFile);
  inline bool GetSourceExternal() const { return m_sourceExternal; }
  inline bool GetSourceInternal() const { return m_sourceInternal; }
  inline bool GetTargetInternal() const { return m_targetInternal; }
  inline bool GetSourceIndicator() const { return m_sourceIndicator; }
  inline bool GetTargetIndicator() const { return m_targetIndicator; }
  inline bool GetSourceTargetIndicator() const { return m_sourceTargetIndicator; }
  inline bool GetSTSE() const { return m_STSE; }
  inline bool GetPhraseFactor() const { return m_phraseFactor; }
  inline bool GetPaired() const { return m_paired; }
  inline bool GetBagOfWords() const { return m_bagOfWords; }
  inline bool GetMostFrequent() const { return m_mostFrequent; }
  inline size_t GetWindowSize() const { return m_windowSize; }
  inline bool GetBinnedScores() const { return m_binnedScores; }
  inline bool GetSourceTopic() const { return m_sourceTopic; }
  inline const std::vector<size_t> &GetFactors() const { return m_factors; }
  inline const std::vector<size_t> &GetScoreIndexes() const { return m_scoreIndexes; }
  inline const std::vector<float> &GetScoreBins() const { return m_scoreBins; }
  inline const std::string &GetVWOptionsTrain() const { return m_vwOptsTrain; }
  inline const std::string &GetVWOptionsPredict() const { return m_vwOptsPredict; }
  inline const std::string &GetNormalization() const { return m_normalization; }

  inline bool IsLoaded() const { return m_isLoaded; }

private:
  // read from configuration
  bool m_paired, m_bagOfWords, m_sourceExternal,
       m_sourceInternal, m_targetInternal, m_mostFrequent,
       m_binnedScores, m_sourceIndicator, m_targetIndicator,
       m_sourceTargetIndicator, m_STSE, m_sourceTopic, m_phraseFactor;
  std::string m_vwOptsPredict, m_vwOptsTrain, m_normalization;
  size_t m_windowSize;
  std::vector<size_t> m_factors, m_scoreIndexes;
  std::vector<float> m_scoreBins;

  // internal variables
  bool m_isLoaded;
};

} // namespace Classifier

#endif // moses_ExtractorConfig_h
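Note (not part of the commit): a minimal usage sketch of the configuration class above, showing how Load() and the getters are meant to be used. The file name "extractor.ini" and the key values shown in the comment are assumptions for illustration; only the key names themselves come from Load().

// Illustrative sketch only, not part of this commit.
// A hypothetical extractor.ini could contain, e.g.:
//   [features]
//   source-internal = 1
//   window-size = 2
//   factors = 0,2
//   [vw-options]
//   train = --quiet
#include "ExtractorConfig.h"
#include <iostream>

int main()
{
  Classifier::ExtractorConfig config;
  config.Load("extractor.ini");   // hypothetical file name
  if (! config.IsLoaded())
    return 1;                     // Load() reported a parse error
  std::cout << "window size: " << config.GetWindowSize() << std::endl;
  return 0;
}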
vw/FeatureConsumer.h (new file, 135 lines)
@@ -0,0 +1,135 @@
#ifndef moses_FeatureConsumer_h
#define moses_FeatureConsumer_h

#include <iostream>
#include <string>
#include <fstream>
#include <sstream>
#include <deque>
#include <vector>

#include <boost/noncopyable.hpp>
#include <boost/thread/condition_variable.hpp>
#include <boost/thread/locks.hpp>
#include <boost/thread/mutex.hpp>
#include <boost/iostreams/filtering_stream.hpp>
#include <boost/iostreams/filter/gzip.hpp>

// #ifdef HAVE_VW
// forward declarations to avoid dependency on VW
struct vw;
class ezexample;
// #endif

namespace Classifier
{

// abstract consumer
class FeatureConsumer
{
public:
  virtual void SetNamespace(char ns, bool shared) = 0;
  virtual void AddFeature(const std::string &name) = 0;
  virtual void AddFeature(const std::string &name, float value) = 0;
  virtual void Train(const std::string &label, float loss) = 0;
  virtual float Predict(const std::string &label) = 0;
  virtual void FinishExample() = 0;
  virtual void Finish() = 0;
};

// consumer that builds VW training files
class VWFileTrainConsumer : public FeatureConsumer
{
public:
  VWFileTrainConsumer(const std::string &outputFile);

  // FeatureConsumer interface implementation
  virtual void SetNamespace(char ns, bool shared);
  virtual void AddFeature(const std::string &name);
  virtual void AddFeature(const std::string &name, float value);
  virtual void FinishExample();
  virtual void Finish();
  virtual void Train(const std::string &label, float loss);
  virtual float Predict(const std::string &label);

private:
  boost::iostreams::filtering_ostream m_bfos;
  std::deque<std::string> m_outputBuffer;

  void WriteBuffer();
  std::string EscapeSpecialChars(const std::string &str);
};

// #ifdef HAVE_VW
// abstract consumer that trains/predicts using VW library interface
class VWLibraryConsumer : public FeatureConsumer, private boost::noncopyable
{
public:
  virtual void SetNamespace(char ns, bool shared);
  virtual void AddFeature(const std::string &name);
  virtual void AddFeature(const std::string &name, float value);
  virtual void FinishExample();
  virtual void Finish();

protected:
  ::vw *m_VWInstance;
  ::ezexample *m_ex;
  // this contains state about which namespaces are shared
  bool m_shared;
  // if true, then the VW instance is owned by an external party and should NOT be
  // deleted at end; if false, then we own the VW instance and must clean up after it.
  bool m_sharedVwInstance;
  int m_index;

  ~VWLibraryConsumer();
};

// train using VW
class VWLibraryTrainConsumer : public VWLibraryConsumer
{
public:
  VWLibraryTrainConsumer(const std::string &modelFile, const std::string &vwOptions);
  virtual void Train(const std::string &label, float loss);
  virtual float Predict(const std::string &label);
  virtual void FinishExample();
};

// predict using VW
class VWLibraryPredictConsumer : public VWLibraryConsumer
{
public:
  VWLibraryPredictConsumer(const std::string &modelFile, const std::string &vwOptions);
  virtual void Train(const std::string &label, float loss);
  virtual float Predict(const std::string &label);

  friend class VWLibraryPredictConsumerFactory;

private:
  VWLibraryPredictConsumer(vw * instance, int index);
};

// object pool of VWLibraryPredictConsumers
class VWLibraryPredictConsumerFactory : private boost::noncopyable
{
public:
  VWLibraryPredictConsumerFactory(const std::string &modelFile, const std::string &vwOptions, const int poolSize);

  VWLibraryPredictConsumer * Acquire();
  void Release(VWLibraryPredictConsumer * fc);

  ~VWLibraryPredictConsumerFactory();

private:
  ::vw *m_VWInstance;
  int m_firstFree;
  std::vector<int> m_nextFree;
  std::vector<VWLibraryPredictConsumer *> m_consumers;
  boost::mutex m_mutex;
  boost::condition_variable m_cond;
};

// #endif // HAVE_VW

} // namespace Classifier

#endif // moses_FeatureConsumer_h
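Note (not part of the commit): a sketch of the intended call sequence on the FeatureConsumer interface, using the file-based consumer declared above. The output file name, feature names, label and loss value are invented for the example.

// Illustrative sketch only, not part of this commit.
#include "FeatureConsumer.h"

void WriteOneTrainingExample()
{
  Classifier::VWFileTrainConsumer consumer("train.vw.gz"); // hypothetical output file

  consumer.SetNamespace('s', true);   // shared (source-side) namespace
  consumer.AddFeature("w^das");
  consumer.AddFeature("w^haus");

  consumer.SetNamespace('t', false);  // one namespace per translation option
  consumer.AddFeature("w^house");
  consumer.Train("1111", 0.0);        // label and loss for this option

  consumer.FinishExample();           // flush the buffered example
  consumer.Finish();                  // close the (optionally gzipped) output stream
}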
vw/FeatureExtractor.cpp (new file, 247 lines)
@@ -0,0 +1,247 @@
#include "FeatureExtractor.h"
#include "Util.h"

#include <algorithm>
#include <set>

using namespace std;
using namespace Moses;

namespace Classifier
{

FeatureExtractor::FeatureExtractor(const ExtractorConfig &config, bool train)
  : m_config(config), m_train(train)
{
  if (! m_config.IsLoaded())
    throw logic_error("configuration file not loaded");
}

map<string, float> FeatureExtractor::GetMaxProb(const vector<Translation> &translations)
{
  map<string, float> maxProbs;
  vector<Translation>::const_iterator it;
  vector<TTableEntry>::const_iterator tableIt;
  for (it = translations.begin(); it != translations.end(); it++) {
    for (tableIt = it->m_ttableScores.begin(); tableIt != it->m_ttableScores.end(); tableIt++) {
      if (tableIt->m_exists) {
        maxProbs[tableIt->m_id] = max(tableIt->m_scores[P_E_F_INDEX], maxProbs[tableIt->m_id]);
      }
    }
  }
  return maxProbs;
}

void FeatureExtractor::GenerateFeatures(FeatureConsumer *fc,
    const ContextType &context,
    size_t spanStart,
    size_t spanEnd,
    const vector<Translation> &translations,
    vector<float> &losses)
{
  fc->SetNamespace('s', true);

  if (m_config.GetSourceExternal()) GenerateContextFeatures(context, spanStart, spanEnd, fc);

  // get words (surface forms) in source phrase
  vector<string> sourceForms(spanEnd - spanStart + 1);
  for (size_t i = spanStart; i <= spanEnd; i++)
    sourceForms[i - spanStart] = context[i][FACTOR_FORM];

  map<string, float> maxProbs;
  if (m_config.GetMostFrequent()) maxProbs = GetMaxProb(translations);

  if (m_config.GetSourceInternal()) GenerateInternalFeatures(sourceForms, fc);
  if (m_config.GetPhraseFactor()) GeneratePhraseFactorFeatures(context, spanStart, spanEnd, fc);
  if (m_config.GetBagOfWords()) GenerateBagOfWordsFeatures(context, spanStart, spanEnd, FACTOR_FORM, fc);

  if (m_config.GetSourceIndicator()) GenerateIndicatorFeature(sourceForms, fc);

  vector<Translation>::const_iterator transIt = translations.begin();
  vector<float>::iterator lossIt = losses.begin();
  for (; transIt != translations.end(); transIt++, lossIt++) {
    assert(lossIt != losses.end());
    fc->SetNamespace('t', false);

    // get words in target phrase
    const vector<string> &targetForms = transIt->translation;

    if (m_config.GetTargetInternal()) GenerateInternalFeatures(targetForms, fc);
    if (m_config.GetPaired()) GeneratePairedFeatures(sourceForms, targetForms, transIt->m_alignment, fc);

    if (m_config.GetMostFrequent()) GenerateMostFrequentFeature(transIt->m_ttableScores, maxProbs, fc);

    if (m_config.GetBinnedScores()) GenerateScoreFeatures(transIt->m_ttableScores, fc);

    // "NOT_IN_" features
    if (m_config.GetBinnedScores() || m_config.GetMostFrequent()) GenerateTTableEntryFeatures(transIt->m_ttableScores, fc);

    if (m_config.GetTargetIndicator()) GenerateIndicatorFeature(targetForms, fc);

    if (m_config.GetSourceTargetIndicator()) GenerateConcatIndicatorFeature(sourceForms, targetForms, fc);

    if (m_config.GetSTSE()) GenerateSTSE(sourceForms, targetForms, context, spanStart, spanEnd, fc);

    if (m_train) {
      fc->Train(SPrint(DUMMY_IDX), *lossIt);
    } else {
      *lossIt = fc->Predict(SPrint(DUMMY_IDX));
    }
  }
  fc->FinishExample();
}

//
// private methods
//

string FeatureExtractor::BuildContextFeature(size_t factor, int index, const string &value)
{
  return "c^" + SPrint(factor) + "_" + SPrint(index) + "_" + value;
}

void FeatureExtractor::GenerateContextFeatures(const ContextType &context,
    size_t spanStart,
    size_t spanEnd,
    FeatureConsumer *fc)
{
  vector<size_t>::const_iterator factIt;
  for (factIt = m_config.GetFactors().begin(); factIt != m_config.GetFactors().end(); factIt++) {
    for (size_t i = 1; i <= m_config.GetWindowSize(); i++) {
      string left = "<s>";
      string right = "</s>";
      if (spanStart >= i)
        left = context[spanStart - i][*factIt];
      fc->AddFeature(BuildContextFeature(*factIt, -i, left));
      if (spanEnd + i < context.size())
        right = context[spanEnd + i][*factIt];
      fc->AddFeature(BuildContextFeature(*factIt, i, right));
    }
  }
}

void FeatureExtractor::GenerateIndicatorFeature(const vector<string> &span, FeatureConsumer *fc)
{
  fc->AddFeature("p^" + Join("_", span));
}

void FeatureExtractor::GenerateConcatIndicatorFeature(const vector<string> &span1, const vector<string> &span2, FeatureConsumer *fc)
{
  fc->AddFeature("p^" + Join("_", span1) + "^" + Join("_", span2));
}

void FeatureExtractor::GenerateSTSE(const vector<string> &span1, const vector<string> &span2,
    const ContextType &context,
    size_t spanStart,
    size_t spanEnd,
    FeatureConsumer *fc)
{
  vector<size_t>::const_iterator factIt;
  for (factIt = m_config.GetFactors().begin(); factIt != m_config.GetFactors().end(); factIt++) {
    for (size_t i = 1; i <= m_config.GetWindowSize(); i++) {
      string left = "<s>";
      string right = "</s>";
      if (spanStart >= i)
        left = context[spanStart - i][*factIt];
      fc->AddFeature("stse^" + Join("_", span1) + "^" + Join("_", span2) + BuildContextFeature(*factIt, -i, left));
      if (spanEnd + i < context.size())
        right = context[spanEnd + i][*factIt];
      fc->AddFeature("stse^" + Join("_", span1) + "^" + Join("_", span2) + BuildContextFeature(*factIt, i, right));
    }
  }
}

void FeatureExtractor::GenerateInternalFeatures(const vector<string> &span, FeatureConsumer *fc)
{
  vector<string>::const_iterator it;
  for (it = span.begin(); it != span.end(); it++) {
    fc->AddFeature("w^" + *it);
  }
}

void FeatureExtractor::GenerateBagOfWordsFeatures(const ContextType &context, size_t spanStart, size_t spanEnd, size_t factorID, FeatureConsumer *fc)
{
  for (size_t i = 0; i < spanStart; i++)
    fc->AddFeature("bow^" + context[i][factorID]);
  for (size_t i = spanEnd + 1; i < context.size(); i++)
    fc->AddFeature("bow^" + context[i][factorID]);
}

void FeatureExtractor::GeneratePhraseFactorFeatures(const ContextType &context, size_t spanStart, size_t spanEnd, FeatureConsumer *fc)
{
  for (size_t i = spanStart; i <= spanEnd; i++) {
    vector<size_t>::const_iterator factIt;
    for (factIt = m_config.GetFactors().begin(); factIt != m_config.GetFactors().end(); factIt++) {
      fc->AddFeature("ibow^" + SPrint(*factIt) + "_" + context[i][*factIt]);
    }
  }
}

void FeatureExtractor::GeneratePairedFeatures(const vector<string> &srcPhrase, const vector<string> &tgtPhrase,
    const AlignmentType &align, FeatureConsumer *fc)
{
  AlignmentType::const_iterator it;
  set<size_t> srcAligned;
  set<size_t> tgtAligned;

  for (it = align.begin(); it != align.end(); it++) {
    fc->AddFeature("pair^" + srcPhrase[it->first] + "^" + tgtPhrase[it->second]);
    srcAligned.insert(it->first);
    tgtAligned.insert(it->second);
  }

  for (size_t i = 0; i < srcPhrase.size(); i++) {
    if (srcAligned.count(i) == 0)
      fc->AddFeature("pair^" + srcPhrase[i] + "^NULL");
  }

  for (size_t i = 0; i < tgtPhrase.size(); i++) {
    if (tgtAligned.count(i) == 0)
      fc->AddFeature("pair^NULL^" + tgtPhrase[i]);
  }
}

void FeatureExtractor::GenerateScoreFeatures(const std::vector<TTableEntry> &ttableScores, FeatureConsumer *fc)
{
  vector<size_t>::const_iterator scoreIt;
  vector<float>::const_iterator binIt;
  vector<TTableEntry>::const_iterator tableIt;
  const vector<size_t> &scoreIDs = m_config.GetScoreIndexes();
  const vector<float> &bins = m_config.GetScoreBins();

  for (tableIt = ttableScores.begin(); tableIt != ttableScores.end(); tableIt++) {
    if (! tableIt->m_exists)
      continue;
    string prefix = ttableScores.size() == 1 ? "" : tableIt->m_id + "_";
    for (scoreIt = scoreIDs.begin(); scoreIt != scoreIDs.end(); scoreIt++) {
      for (binIt = bins.begin(); binIt != bins.end(); binIt++) {
        float logScore = log(tableIt->m_scores[*scoreIt]);
        if (logScore < *binIt || Equals(logScore, *binIt)) {
          fc->AddFeature(prefix + "sc^" + SPrint<size_t>(*scoreIt) + "_" + SPrint(*binIt));
        }
      }
    }
  }
}

void FeatureExtractor::GenerateMostFrequentFeature(const std::vector<TTableEntry> &ttableScores, const map<string, float> &maxProbs, FeatureConsumer *fc)
{
  vector<TTableEntry>::const_iterator it;
  for (it = ttableScores.begin(); it != ttableScores.end(); it++) {
    if (it->m_exists && Equals(it->m_scores[P_E_F_INDEX], maxProbs.find(it->m_id)->second)) {
      string prefix = ttableScores.size() == 1 ? "" : it->m_id + "_";
      fc->AddFeature(prefix + "MOST_FREQUENT");
    }
  }
}

void FeatureExtractor::GenerateTTableEntryFeatures(const std::vector<TTableEntry> &ttableScores, FeatureConsumer *fc)
{
  vector<TTableEntry>::const_iterator it;
  for (it = ttableScores.begin(); it != ttableScores.end(); it++) {
    if (! it->m_exists)
      fc->AddFeature("NOT_IN_" + it->m_id);
  }
}

} // namespace Classifier
vw/FeatureExtractor.h (new file, 88 lines)
@@ -0,0 +1,88 @@
#ifndef moses_FeatureExtractor_h
#define moses_FeatureExtractor_h

#include "FeatureConsumer.h"
#include "ExtractorConfig.h"

#include <vector>
#include <string>
#include <exception>
#include <stdexcept>
#include <map>

namespace Classifier
{

// label index passed to the classifier, this value is not used in our setting
const int DUMMY_IDX = 1111;

// vector of words, each word is a vector of factors
typedef std::vector<std::vector<std::string> > ContextType;

typedef std::multimap<size_t, size_t> AlignmentType;

// In DA scenario, there are multiple phrase tables. This struct
// contains scores for a phrase in one phrase-table.
struct TTableEntry
{
  std::string m_id;            // phrase-table identifier
  bool m_exists;               // does translation exist in this table
  std::vector<float> m_scores; // translation scores (empty if m_exists == false)
};

// One translation (phrase target side).
struct Translation
{
  std::vector<std::string> translation;    // words (surface forms) of translation
  AlignmentType m_alignment;               // phrase-internal word alignment
  std::vector<TTableEntry> m_ttableScores; // phrase scores in each phrase table
};

// extract features
class FeatureExtractor
{
public:
  FeatureExtractor(const ExtractorConfig &config, bool train);

  // Generate features for current source phrase and all its translation options, based on
  // configuration. Calls all auxiliary Generate* methods.
  //
  // In training, reads the &losses parameter and passes them to VW. In prediction, &losses is
  // an output variable where VW scores are written.
  void GenerateFeatures(FeatureConsumer *fc,
      const ContextType &context,
      size_t spanStart,
      size_t spanEnd,
      const std::vector<Translation> &translations,
      std::vector<float> &losses);

private:
  const ExtractorConfig &m_config; // Configuration of features.
  bool m_train;                    // Train or predict.

  // Get the highest probability P(e|f) associated with any of the translation options,
  // separately for each phrase table (string keys are phrase-table IDs).
  std::map<std::string, float> GetMaxProb(const std::vector<Translation> &translations);

  void GenerateContextFeatures(const ContextType &context, size_t spanStart, size_t spanEnd, FeatureConsumer *fc);
  void GeneratePhraseFactorFeatures(const ContextType &context, size_t spanStart, size_t spanEnd, FeatureConsumer *fc);
  void GenerateInternalFeatures(const std::vector<std::string> &span, FeatureConsumer *fc);
  void GenerateIndicatorFeature(const std::vector<std::string> &span, FeatureConsumer *fc);
  void GenerateConcatIndicatorFeature(const std::vector<std::string> &span1, const std::vector<std::string> &span2, FeatureConsumer *fc);
  void GenerateSTSE(const std::vector<std::string> &span1, const std::vector<std::string> &span2, const ContextType &context, size_t spanStart, size_t spanEnd, FeatureConsumer *fc);
  void GenerateBagOfWordsFeatures(const ContextType &context, size_t spanStart, size_t spanEnd, size_t factorID, FeatureConsumer *fc);
  void GeneratePairedFeatures(const std::vector<std::string> &srcPhrase,
      const std::vector<std::string> &tgtPhrase,
      const AlignmentType &align,
      FeatureConsumer *fc);
  void GenerateScoreFeatures(const std::vector<TTableEntry> &ttableScores, FeatureConsumer *fc);
  void GenerateMostFrequentFeature(const std::vector<TTableEntry> &ttableScores,
      const std::map<std::string, float> &maxProbs,
      FeatureConsumer *fc);
  void GenerateTTableEntryFeatures(const std::vector<TTableEntry> &ttableScores, FeatureConsumer *fc);
  std::string BuildContextFeature(size_t factor, int index, const std::string &value);
};

} // namespace Classifier

#endif // moses_FeatureExtractor_h
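Note (not part of the commit): a rough sketch of how a caller might fill in the data structures declared above and run feature extraction for a single source span. The sentence, phrase-table identifier, alignment point and score values are invented for the example; the config is assumed to have been loaded already.

// Illustrative sketch only, not part of this commit.
#include "FeatureExtractor.h"
#include "FeatureConsumer.h"
#include <utility>

void ExtractForOnePhrase(Classifier::FeatureConsumer *fc,
                         const Classifier::ExtractorConfig &config) // must be loaded
{
  using namespace Classifier;

  // context: one factor (the surface form) per word
  ContextType context;
  const char *words[] = { "das", "ist", "ein", "haus" };
  for (size_t i = 0; i < 4; i++)
    context.push_back(std::vector<std::string>(1, words[i]));

  // one translation option for the span [3,3] ("haus")
  Translation t;
  t.translation.push_back("house");
  t.m_alignment.insert(std::make_pair(0, 0)); // word 0 aligned to word 0
  TTableEntry entry;
  entry.m_id = "pt0";                          // made-up phrase-table ID
  entry.m_exists = true;
  entry.m_scores = std::vector<float>(5, 0.5); // made-up scores
  t.m_ttableScores.push_back(entry);

  std::vector<Translation> translations(1, t);
  std::vector<float> losses(1, 0.0f);

  FeatureExtractor extractor(config, true); // training mode: losses are read, not written
  extractor.GenerateFeatures(fc, context, 3, 3, translations, losses);
}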
vw/IniReader.h (new file, 61 lines)
@@ -0,0 +1,61 @@
#ifndef moses_iniReader_h
#define moses_iniReader_h

#include <vector>
#include <algorithm>
#include <string>
#include <fstream>
#include <map>
#include <exception>
#include <stdexcept>

#include <boost/algorithm/string.hpp>
#include <boost/bind.hpp>
#include <boost/algorithm/string/classification.hpp>
#include <boost/lexical_cast.hpp>

// simple reader of .ini files
class IniReader {
public:
  IniReader(const std::string &file)
  {
    std::ifstream inStr(file.c_str());
    if (! inStr.is_open())
      throw std::runtime_error("Failed to open file " + file);

    std::string section = "";
    std::string line;
    while (getline(inStr, line)) {
      if (line.empty() || line[0] == ';' || line[0] == '#') {
        // empty line or comment, do nothing
      } else if (line[0] == '[') {
        // new section
        section = line.substr(1, line.size() - 2);
      } else {
        std::vector<std::string> cols;
        boost::split(cols, line, boost::is_any_of("="));
        std::for_each(cols.begin(), cols.end(),
            boost::bind(&boost::trim<std::string>, _1, std::locale()));
        if (section.empty())
          throw std::runtime_error("Missing section");
        if (cols.size() != 2)
          throw std::runtime_error("Failed to parse line: '" + line + "'");
        std::string key = section + "." + cols[0];
        properties[key] = cols[1];
      }
    }
    inStr.close();
  }

  template <class T>
  T Get(const std::string &key, T defaultValue)
  {
    std::map<std::string, std::string>::const_iterator it = properties.find(key);
    return (it == properties.end()) ? defaultValue : boost::lexical_cast<T>(it->second);
  }

  private:
  std::map<std::string, std::string> properties;
};

#endif // moses_iniReader_h
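Note (not part of the commit): a small sketch of the file format IniReader accepts and of the templated Get() accessor. Keys are looked up as "<section>.<key>"; lines starting with ';' or '#' are comments. The file name and keys below are examples only.

// Illustrative sketch only, not part of this commit.
// A hypothetical settings.ini:
//   ; comment
//   [features]
//   window-size = 2
//   source-internal = 1
#include "IniReader.h"
#include <iostream>

int main()
{
  IniReader reader("settings.ini"); // hypothetical file name
  size_t window = reader.Get<size_t>("features.window-size", 0);
  bool internal = reader.Get<bool>("features.source-internal", false);
  std::cout << window << " " << internal << std::endl;
  return 0;
}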
vw/Jamfile (new file, 14 lines)
@@ -0,0 +1,14 @@
alias headers : : : : <include>. <include>..//..//moses// <include>.. ;

boost 103600 ;

# VW
local with-vw = [ option.get "with-vw" ] ;
if $(with-vw) {
  lib vw : : <search>$(with-vw)/lib ;
  lib allreduce : : <search>$(with-vw)/lib ;
  obj VWLibraryConsumer.o : VWLibraryConsumer.cpp headers : <include>$(with-vw)/library <include>$(with-vw)/vowpalwabbit ;
  alias vw_objects : VWLibraryConsumer.o vw allreduce : : : <library>boost_program_options ;
  lib classifier : [ glob *.cpp : VWLibraryConsumer.cpp ] vw_objects headers ;
  echo "Linking with Vowpal Wabbit" ;
}
vw/VWFileConsumer.cpp (new file, 85 lines)
@@ -0,0 +1,85 @@
#include "FeatureConsumer.h"
#include "Util.h"
#include <stdexcept>
#include <exception>
#include <string>
#include <boost/iostreams/device/file.hpp>

using namespace std;
using namespace Moses;

namespace Classifier
{

VWFileTrainConsumer::VWFileTrainConsumer(const std::string &outputFile)
{
  if (outputFile.size() > 3 && outputFile.substr(outputFile.size() - 3, 3) == ".gz") {
    m_bfos.push(boost::iostreams::gzip_compressor());
  }
  m_bfos.push(boost::iostreams::file_sink(outputFile));
}

void VWFileTrainConsumer::SetNamespace(char ns, bool shared)
{
  if (! m_outputBuffer.empty())
    WriteBuffer();

  if (shared)
    m_outputBuffer.push_back("shared");

  m_outputBuffer.push_back("|" + SPrint(ns));
}

void VWFileTrainConsumer::AddFeature(const std::string &name)
{
  m_outputBuffer.push_back(EscapeSpecialChars(name));
}

void VWFileTrainConsumer::AddFeature(const std::string &name, float value)
{
  m_outputBuffer.push_back(EscapeSpecialChars(name) + ":" + SPrint(value));
}

void VWFileTrainConsumer::FinishExample()
{
  WriteBuffer();
  m_bfos << "\n";
}

void VWFileTrainConsumer::Finish()
{
  //m_os.close();
  close(m_bfos);
}

void VWFileTrainConsumer::Train(const std::string &label, float loss)
{
  m_outputBuffer.push_front(label + ":" + SPrint(loss));
}

float VWFileTrainConsumer::Predict(const std::string &label)
{
  throw logic_error("Trying to predict during training!");
}

//
// private methods
//

void VWFileTrainConsumer::WriteBuffer()
{
  m_bfos << Join(" ", m_outputBuffer.begin(), m_outputBuffer.end()) << "\n";
  m_outputBuffer.clear();
}


std::string VWFileTrainConsumer::EscapeSpecialChars(const std::string &str)
{
  string out;
  out = Replace(str, "|", "_PIPE_");
  out = Replace(out, ":", "_COLON_");
  out = Replace(out, " ", "_");
  return out;
}

} // namespace Classifier
vw/VWLibraryConsumer.cpp (new file, 206 lines)
@@ -0,0 +1,206 @@
#include "FeatureConsumer.h"
#include "vw.h"
#include "Util.h"
#include "ezexample.h"
#include <stdexcept>
#include <exception>
#include <string>

using namespace std;

namespace Classifier
{

//
// VWLibraryPredictConsumerFactory
//

const int EMPTY_LIST = -1;
const int BAD_LIST_POINTER = -2;

VWLibraryPredictConsumerFactory::VWLibraryPredictConsumerFactory(
    const string &modelFile,
    const string &vwOptions,
    const int poolSize)
{
  m_VWInstance = VW::initialize(vwOptions + " -i " + modelFile);

  if (poolSize < 1)
    throw runtime_error("VWLibraryPredictConsumerFactory pool size must be greater than zero!");
  int lastFree = EMPTY_LIST;
  for (int i = 0; i < poolSize; ++i)
  {
    m_consumers.push_back(new VWLibraryPredictConsumer(m_VWInstance, i));
    m_nextFree.push_back(lastFree);
    lastFree = i;
  }
  m_firstFree = lastFree;
}

VWLibraryPredictConsumerFactory::~VWLibraryPredictConsumerFactory()
{
  boost::unique_lock<boost::mutex> lock(m_mutex);
  size_t count = 0;
  int prev = EMPTY_LIST;
  for (int cur = m_firstFree; cur != EMPTY_LIST; cur = m_nextFree[cur])
  {
    if (cur == BAD_LIST_POINTER)
      throw std::runtime_error("VWLibraryPredictConsumerFactory::~VWLibraryPredictConsumerFactory -- bad free list!");
    ++count;
    if (prev == EMPTY_LIST)
      m_firstFree = BAD_LIST_POINTER;
    else
      m_nextFree[prev] = BAD_LIST_POINTER;
    prev = cur;
  }
  if (prev != EMPTY_LIST)
    m_nextFree[prev] = BAD_LIST_POINTER;
  if (count != m_nextFree.size())
    throw std::runtime_error("VWLibraryPredictConsumerFactory::~VWLibraryPredictConsumerFactory -- not all consumers were returned to pool at destruction time!");

  for (size_t s = 0; s < m_consumers.size(); ++s)
  {
    delete m_consumers[s];
    m_consumers[s] = NULL;
  }
  m_consumers.clear();
  VW::finish(*m_VWInstance);
}

VWLibraryPredictConsumer * VWLibraryPredictConsumerFactory::Acquire()
{
  boost::unique_lock<boost::mutex> lock(m_mutex);
  while (m_firstFree == EMPTY_LIST)
    m_cond.wait(lock);

  int free = m_firstFree;
  m_firstFree = m_nextFree[free];
  return m_consumers[free];
}

void VWLibraryPredictConsumerFactory::Release(VWLibraryPredictConsumer * fc)
{
  // use scope block to handle the lock
  {
    boost::unique_lock<boost::mutex> lock(m_mutex);
    int index = fc->m_index;

    if (index < 0 || index >= (int)m_consumers.size())
      throw std::runtime_error("bad index at VWLibraryPredictConsumerFactory::Release");

    if (fc != m_consumers[index])
      throw std::runtime_error("mismatched pointer at VWLibraryPredictConsumerFactory::Release");

    m_nextFree[index] = m_firstFree;
    m_firstFree = index;
  }
  // release the semaphore *AFTER* the lock goes out of scope
  m_cond.notify_one();
}

//
// VWLibraryConsumer
//

void VWLibraryConsumer::SetNamespace(char ns, bool shared)
{
  if (!m_shared) {
    m_ex->remns();
  }

  m_ex->addns(ns);
  m_shared = shared;
}

void VWLibraryConsumer::AddFeature(const string &name)
{
  m_ex->addf(name);
}

void VWLibraryConsumer::AddFeature(const string &name, float value)
{
  m_ex->addf(name, value);
}

void VWLibraryConsumer::FinishExample()
{
  m_shared = true; // avoid removing an empty namespace in next call of SetNamespace
  m_ex->clear_features();
}

void VWLibraryConsumer::Finish()
{
  if (m_sharedVwInstance)
    m_VWInstance = NULL;
  else
    VW::finish(*m_VWInstance);
}

VWLibraryConsumer::~VWLibraryConsumer()
{
  delete m_ex;
  if (!m_sharedVwInstance)
    VW::finish(*m_VWInstance);
}

//
// VWLibraryTrainConsumer
//

VWLibraryTrainConsumer::VWLibraryTrainConsumer(const string &modelFile, const string &vwOptions)
{
  m_shared = true;
  m_VWInstance = VW::initialize(vwOptions + " -f " + modelFile);
  m_sharedVwInstance = false;
  m_ex = new ::ezexample(m_VWInstance, false);
}

void VWLibraryTrainConsumer::Train(const string &label, float loss)
{
  m_ex->set_label(label + Moses::SPrint(loss));
}

void VWLibraryTrainConsumer::FinishExample()
{
  m_ex->finish();
  VWLibraryConsumer::FinishExample();
}

float VWLibraryTrainConsumer::Predict(const string &label)
{
  throw logic_error("Trying to predict during training!");
}

//
// VWLibraryPredictConsumer
//

VWLibraryPredictConsumer::VWLibraryPredictConsumer(const string &modelFile, const string &vwOptions)
{
  m_shared = true;
  m_VWInstance = VW::initialize(vwOptions + " -i " + modelFile);
  m_sharedVwInstance = false;
  m_ex = new ::ezexample(m_VWInstance, false);
}

void VWLibraryPredictConsumer::Train(const string &label, float loss)
{
  throw logic_error("Trying to train during prediction!");
}

float VWLibraryPredictConsumer::Predict(const string &label)
{
  m_ex->set_label(label);
  return m_ex->predict();
}

VWLibraryPredictConsumer::VWLibraryPredictConsumer(vw * instance, int index)
{
  m_VWInstance = instance;
  m_sharedVwInstance = true;
  m_ex = new ::ezexample(m_VWInstance, false);
  m_shared = true;
  m_index = index;
}

} // namespace Classifier
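Note (not part of the commit): a sketch of the object-pool usage implied by the factory above. Each worker borrows a predict consumer, scores one translation option, and returns the consumer so a waiting thread can proceed; it assumes the code was built with VW support. The feature names and label are placeholders.

// Illustrative sketch only, not part of this commit.
#include "FeatureConsumer.h"

float ScoreOneOption(Classifier::VWLibraryPredictConsumerFactory &factory)
{
  // blocks if all pooled consumers are currently in use
  Classifier::VWLibraryPredictConsumer *consumer = factory.Acquire();

  consumer->SetNamespace('s', true);   // shared source-side features
  consumer->AddFeature("w^haus");
  consumer->SetNamespace('t', false);  // features of one translation option
  consumer->AddFeature("w^house");
  float score = consumer->Predict("1111");
  consumer->FinishExample();

  factory.Release(consumer);           // wakes one thread waiting in Acquire()
  return score;
}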