adding legacy files

This commit is contained in:
Ales Tamchyna 2015-01-05 18:17:02 +01:00
parent 0036d8bb4d
commit a8316c5975
9 changed files with 947 additions and 0 deletions

51
vw/ExtractorConfig.cpp Normal file
View File

@@ -0,0 +1,51 @@
#include "ExtractorConfig.h"
#include "Util.h"
#include <exception>
#include <stdexcept>
#include <algorithm>
#include <set>
using namespace std;
using namespace boost::bimaps;
using namespace Moses;
namespace Classifier
{
// Load extractor settings from an INI configuration file.
//
// On success m_isLoaded becomes true. On any error the problem is reported
// to stderr and m_isLoaded becomes false; the object remains usable.
void ExtractorConfig::Load(const string &configFile)
{
  try {
    IniReader reader(configFile);

    // feature switches; everything defaults to "off"
    m_sourceInternal = reader.Get<bool>("features.source-internal", false);
    m_sourceExternal = reader.Get<bool>("features.source-external", false);
    m_targetInternal = reader.Get<bool>("features.target-internal", false);
    m_sourceIndicator = reader.Get<bool>("features.source-indicator", false);
    m_targetIndicator = reader.Get<bool>("features.target-indicator", false);
    m_sourceTargetIndicator = reader.Get<bool>("features.source-target-indicator", false);
    m_STSE = reader.Get<bool>("features.source-target-source-external", false);
    m_paired = reader.Get<bool>("features.paired", false);
    m_bagOfWords = reader.Get<bool>("features.bag-of-words", false);
    m_mostFrequent = reader.Get<bool>("features.most-frequent", false);
    m_binnedScores = reader.Get<bool>("features.binned-scores", false);
    m_sourceTopic = reader.Get<bool>("features.source-topic", false);
    m_phraseFactor = reader.Get<bool>("features.phrase-factor", false);
    m_windowSize = reader.Get<size_t>("features.window-size", 0);

    // comma-separated numeric lists
    m_factors = Scan<size_t>(Tokenize(reader.Get<string>("features.factors", ""), ","));
    m_scoreIndexes = Scan<size_t>(Tokenize(reader.Get<string>("features.scores", ""), ","));
    m_scoreBins = Scan<float>(Tokenize(reader.Get<string>("features.score-bins", ""), ","));

    m_vwOptsTrain = reader.Get<string>("vw-options.train", "");
    m_vwOptsPredict = reader.Get<string>("vw-options.predict", "");
    m_normalization = reader.Get<string>("decoder.normalization", "");
    m_isLoaded = true;
  } catch (const exception &err) {
    // Catch std::exception, not just runtime_error: boost::bad_lexical_cast
    // (thrown by IniReader::Get on malformed values) derives from
    // std::bad_cast and would otherwise escape this handler.
    cerr << "Error loading file " << configFile << ": " << err.what() << endl;
    m_isLoaded = false;
  }
}
} // namespace Classifier

60
vw/ExtractorConfig.h Normal file
View File

@@ -0,0 +1,60 @@
#ifndef moses_ExtractorConfig_h
#define moses_ExtractorConfig_h
#include <vector>
#include <string>
#include <map>
#include <boost/bimap/bimap.hpp>
#include "IniReader.h"
namespace Classifier
{
const size_t FACTOR_FORM = 0; // index of surface forms
const size_t P_E_F_INDEX = 2; // index of P(e|f) score in phrase table

// Holds all feature-extraction settings read from an INI file by Load().
// Callers must check IsLoaded() before using the getters; Load() reports
// errors on stderr instead of throwing.
class ExtractorConfig
{
public:
  // Initialize all switches to "off" so that the getters (and IsLoaded())
  // are well-defined even before Load() is called; previously these members
  // were left uninitialized until Load().
  ExtractorConfig()
    : m_paired(false), m_bagOfWords(false), m_sourceExternal(false),
      m_sourceInternal(false), m_targetInternal(false), m_mostFrequent(false),
      m_binnedScores(false), m_sourceIndicator(false), m_targetIndicator(false),
      m_sourceTargetIndicator(false), m_STSE(false), m_sourceTopic(false),
      m_phraseFactor(false), m_windowSize(0), m_isLoaded(false) {}

  // Parse the given INI file and populate all fields; sets IsLoaded().
  void Load(const std::string &configFile);

  // Feature switches (see Load() for the corresponding INI keys).
  inline bool GetSourceExternal() const { return m_sourceExternal; }
  inline bool GetSourceInternal() const { return m_sourceInternal; }
  inline bool GetTargetInternal() const { return m_targetInternal; }
  inline bool GetSourceIndicator() const { return m_sourceIndicator; }
  inline bool GetTargetIndicator() const { return m_targetIndicator; }
  inline bool GetSourceTargetIndicator() const { return m_sourceTargetIndicator; }
  inline bool GetSTSE() const { return m_STSE; }
  inline bool GetPhraseFactor() const { return m_phraseFactor; }
  inline bool GetPaired() const { return m_paired; }
  inline bool GetBagOfWords() const { return m_bagOfWords; }
  inline bool GetMostFrequent() const { return m_mostFrequent; }
  inline size_t GetWindowSize() const { return m_windowSize; }
  inline bool GetBinnedScores() const { return m_binnedScores; }
  inline bool GetSourceTopic() const { return m_sourceTopic; }
  inline const std::vector<size_t> &GetFactors() const { return m_factors; }
  inline const std::vector<size_t> &GetScoreIndexes() const { return m_scoreIndexes; }
  inline const std::vector<float> &GetScoreBins() const { return m_scoreBins; }
  inline const std::string &GetVWOptionsTrain() const { return m_vwOptsTrain; }
  inline const std::string &GetVWOptionsPredict() const { return m_vwOptsPredict; }
  inline const std::string &GetNormalization() const { return m_normalization; }
  inline bool IsLoaded() const { return m_isLoaded; }

private:
  // read from configuration
  bool m_paired, m_bagOfWords, m_sourceExternal,
       m_sourceInternal, m_targetInternal, m_mostFrequent,
       m_binnedScores, m_sourceIndicator, m_targetIndicator,
       m_sourceTargetIndicator, m_STSE, m_sourceTopic, m_phraseFactor;
  std::string m_vwOptsPredict, m_vwOptsTrain, m_normalization;
  size_t m_windowSize;
  std::vector<size_t> m_factors, m_scoreIndexes;
  std::vector<float> m_scoreBins;

  // internal variables
  bool m_isLoaded; // true only after a successful Load()
};
} // namespace Classifier
#endif // moses_ExtractorConfig_h

135
vw/FeatureConsumer.h Normal file
View File

@@ -0,0 +1,135 @@
#ifndef moses_FeatureConsumer_h
#define moses_FeatureConsumer_h
#include <iostream>
#include <string>
#include <fstream>
#include <sstream>
#include <deque>
#include <vector>
#include <boost/noncopyable.hpp>
#include <boost/thread/condition_variable.hpp>
#include <boost/thread/locks.hpp>
#include <boost/thread/mutex.hpp>
#include <boost/iostreams/filtering_stream.hpp>
#include <boost/iostreams/filter/gzip.hpp>
// #ifdef HAVE_VW
// forward declarations to avoid dependency on VW
struct vw;
class ezexample;
// #endif
namespace Classifier
{
// abstract consumer
// Abstract interface for consumers of classifier features. Implementations
// either write VW-format training files or talk to the VW library directly.
class FeatureConsumer
{
public:
  virtual void SetNamespace(char ns, bool shared) = 0;
  virtual void AddFeature(const std::string &name) = 0;
  virtual void AddFeature(const std::string &name, float value) = 0;
  virtual void Train(const std::string &label, float loss) = 0;
  virtual float Predict(const std::string &label) = 0;
  virtual void FinishExample() = 0;
  virtual void Finish() = 0;

  // Virtual destructor: consumers are handled polymorphically through
  // FeatureConsumer*, and deleting a derived object through a base pointer
  // without a virtual destructor is undefined behavior.
  virtual ~FeatureConsumer() {}
};
// consumer that builds VW training files
class VWFileTrainConsumer : public FeatureConsumer
{
public:
// Opens outputFile for writing; a ".gz" suffix enables gzip compression.
VWFileTrainConsumer(const std::string &outputFile);
// FeatureConsumer interface implementation
virtual void SetNamespace(char ns, bool shared);
virtual void AddFeature(const std::string &name);
virtual void AddFeature(const std::string &name, float value);
virtual void FinishExample();
virtual void Finish();
virtual void Train(const std::string &label, float loss);
// Always throws: this consumer is train-only.
virtual float Predict(const std::string &label);
private:
// Output stream; may have a gzip compressor filter pushed on top.
boost::iostreams::filtering_ostream m_bfos;
// Tokens of the example currently being built (label goes to the front).
std::deque<std::string> m_outputBuffer;
// Flushes m_outputBuffer as one space-separated line.
void WriteBuffer();
// Escapes characters with special meaning in the VW input format.
std::string EscapeSpecialChars(const std::string &str);
};
// #ifdef HAVE_VW
// abstract consumer that trains/predicts using VW library interface
class VWLibraryConsumer : public FeatureConsumer, private boost::noncopyable
{
public:
virtual void SetNamespace(char ns, bool shared);
virtual void AddFeature(const std::string &name);
virtual void AddFeature(const std::string &name, float value);
virtual void FinishExample();
virtual void Finish();
protected:
// Underlying VW model instance (may be shared across consumers, see below).
::vw *m_VWInstance;
// Example currently under construction.
::ezexample *m_ex;
// this contains state about which namespaces are shared
bool m_shared;
// if true, then the VW instance is owned by an external party and should NOT be
// deleted at end; if false, then we own the VW instance and must clean up after it.
bool m_sharedVwInstance;
// Slot index within VWLibraryPredictConsumerFactory's pool (pooled consumers only).
int m_index;
~VWLibraryConsumer();
};
// train using VW
class VWLibraryTrainConsumer : public VWLibraryConsumer
{
public:
VWLibraryTrainConsumer(const std::string &modelFile, const std::string &vwOptions);
virtual void Train(const std::string &label, float loss);
// Always throws: this consumer is train-only.
virtual float Predict(const std::string &label);
virtual void FinishExample();
};
// predict using VW
class VWLibraryPredictConsumer : public VWLibraryConsumer
{
public:
VWLibraryPredictConsumer(const std::string &modelFile, const std::string &vwOptions);
// Always throws: this consumer is predict-only.
virtual void Train(const std::string &label, float loss);
virtual float Predict(const std::string &label);
friend class VWLibraryPredictConsumerFactory;
private:
// Pool-internal constructor: shares an externally owned VW instance.
VWLibraryPredictConsumer(vw * instance, int index);
};
// object pool of VWLibraryPredictConsumers
class VWLibraryPredictConsumerFactory : private boost::noncopyable
{
public:
VWLibraryPredictConsumerFactory(const std::string &modelFile, const std::string &vwOptions, const int poolSize);
// Blocks until a consumer is available, then hands out exclusive use of it.
VWLibraryPredictConsumer * Acquire();
// Returns a consumer obtained from Acquire() back to the pool.
void Release(VWLibraryPredictConsumer * fc);
~VWLibraryPredictConsumerFactory();
private:
// Single VW model shared by all pooled consumers.
::vw *m_VWInstance;
// Intrusive singly-linked free list over consumer indices (-1 = empty).
int m_firstFree;
std::vector<int> m_nextFree;
std::vector<VWLibraryPredictConsumer *> m_consumers;
// Guard m_firstFree/m_nextFree; m_cond signals availability to Acquire().
boost::mutex m_mutex;
boost::condition_variable m_cond;
};
// #endif // HAVE_VW
} // namespace Classifier
#endif // moses_FeatureConsumer_h

247
vw/FeatureExtractor.cpp Normal file
View File

@@ -0,0 +1,247 @@
#include "FeatureExtractor.h"
#include "Util.h"
#include <algorithm>
#include <set>
using namespace std;
using namespace Moses;
namespace Classifier
{
// Construct an extractor over an already loaded configuration.
// @param config feature-extraction settings; must have been Load()-ed
// @param train  true = training mode (losses fed to VW), false = prediction
// @throws std::logic_error if the configuration was not loaded successfully
FeatureExtractor::FeatureExtractor(const ExtractorConfig &config, bool train)
: m_config(config), m_train(train)
{
if (! m_config.IsLoaded())
throw logic_error("configuration file not loaded");
}
// For each phrase table (keyed by its string ID), find the highest
// P(e|f) score over all translation options. Entries whose m_exists flag
// is false are skipped; tables never seen simply get no map entry.
map<string, float> FeatureExtractor::GetMaxProb(const vector<Translation> &translations)
{
  map<string, float> maxProbs;
  for (vector<Translation>::const_iterator transIt = translations.begin();
       transIt != translations.end(); ++transIt) {
    const vector<TTableEntry> &entries = transIt->m_ttableScores;
    for (vector<TTableEntry>::const_iterator entryIt = entries.begin();
         entryIt != entries.end(); ++entryIt) {
      if (! entryIt->m_exists)
        continue;
      // operator[] value-initializes the running maximum to 0 on first access
      float &best = maxProbs[entryIt->m_id];
      float score = entryIt->m_scores[P_E_F_INDEX];
      if (!(score < best))
        best = score;
    }
  }
  return maxProbs;
}
// Generate all configured features for one source span and its translation
// options, and either train on or predict the per-option losses.
//
// The emission order is significant: the shared source namespace 's' is
// filled first, then one target namespace 't' per translation option.
// In training mode, losses[] is read and passed to fc->Train(); in
// prediction mode losses[] is overwritten with fc->Predict() results.
// losses must have at least as many elements as translations (asserted).
void FeatureExtractor::GenerateFeatures(FeatureConsumer *fc,
const ContextType &context,
size_t spanStart,
size_t spanEnd,
const vector<Translation> &translations,
vector<float> &losses)
{
// source-side features go into the shared namespace 's'
fc->SetNamespace('s', true);
if (m_config.GetSourceExternal()) GenerateContextFeatures(context, spanStart, spanEnd, fc);
// get words (surface forms) in source phrase
vector<string> sourceForms(spanEnd - spanStart + 1);
for (size_t i = spanStart; i <= spanEnd; i++)
sourceForms[i - spanStart] = context[i][FACTOR_FORM];
map<string, float> maxProbs;
if (m_config.GetMostFrequent()) maxProbs = GetMaxProb(translations);
if (m_config.GetSourceInternal()) GenerateInternalFeatures(sourceForms, fc);
if (m_config.GetPhraseFactor()) GeneratePhraseFactorFeatures(context, spanStart, spanEnd, fc);
if (m_config.GetBagOfWords()) GenerateBagOfWordsFeatures(context, spanStart, spanEnd, FACTOR_FORM, fc);
if (m_config.GetSourceIndicator()) GenerateIndicatorFeature(sourceForms, fc);
// one target namespace per translation option, walked in lockstep with losses
vector<Translation>::const_iterator transIt = translations.begin();
vector<float>::iterator lossIt = losses.begin();
for (; transIt != translations.end(); transIt++, lossIt++) {
assert(lossIt != losses.end());
fc->SetNamespace('t', false);
// get words in target phrase
const vector<string> &targetForms = transIt->translation;
if (m_config.GetTargetInternal()) GenerateInternalFeatures(targetForms, fc);
if (m_config.GetPaired()) GeneratePairedFeatures(sourceForms, targetForms, transIt->m_alignment, fc);
if (m_config.GetMostFrequent()) GenerateMostFrequentFeature(transIt->m_ttableScores, maxProbs, fc);
if (m_config.GetBinnedScores()) GenerateScoreFeatures(transIt->m_ttableScores, fc);
// "NOT_IN_" features
if (m_config.GetBinnedScores() || m_config.GetMostFrequent()) GenerateTTableEntryFeatures(transIt->m_ttableScores, fc);
if (m_config.GetTargetIndicator()) GenerateIndicatorFeature(targetForms, fc);
if (m_config.GetSourceTargetIndicator()) GenerateConcatIndicatorFeature(sourceForms, targetForms, fc);
if (m_config.GetSTSE()) GenerateSTSE(sourceForms, targetForms, context, spanStart, spanEnd, fc);
if (m_train) {
// DUMMY_IDX: the label value itself is unused in this setting
fc->Train(SPrint(DUMMY_IDX), *lossIt);
} else {
*lossIt = fc->Predict(SPrint(DUMMY_IDX));
}
}
fc->FinishExample();
}
//
// private methods
//
// Compose a context-window feature name: c^<factor>_<position>_<word>,
// where position is negative to the left of the span, positive to the right.
string FeatureExtractor::BuildContextFeature(size_t factor, int index, const string &value)
{
  string feature = "c^";
  feature += SPrint(factor);
  feature += "_";
  feature += SPrint(index);
  feature += "_";
  feature += value;
  return feature;
}
// Emit context-window features for every configured factor: for each offset
// 1..windowSize, one feature for the word to the left of the span and one
// for the word to the right, with <s>/</s> sentinels at sentence boundaries.
void FeatureExtractor::GenerateContextFeatures(const ContextType &context,
  size_t spanStart,
  size_t spanEnd,
  FeatureConsumer *fc)
{
  vector<size_t>::const_iterator factIt;
  for (factIt = m_config.GetFactors().begin(); factIt != m_config.GetFactors().end(); factIt++) {
    for (size_t i = 1; i <= m_config.GetWindowSize(); i++) {
      string left = "<s>";
      string right = "</s>";
      if (spanStart >= i)
        left = context[spanStart - i][*factIt];
      // negate through int: unary minus on the unsigned size_t i would wrap
      // to a huge value whose conversion to the int parameter is
      // implementation-defined (pre-C++20)
      fc->AddFeature(BuildContextFeature(*factIt, -static_cast<int>(i), left));
      if (spanEnd + i < context.size())
        right = context[spanEnd + i][*factIt];
      fc->AddFeature(BuildContextFeature(*factIt, static_cast<int>(i), right));
    }
  }
}
// Emit a single indicator feature for the whole phrase: p^w1_w2_..._wn.
void FeatureExtractor::GenerateIndicatorFeature(const vector<string> &span, FeatureConsumer *fc)
{
fc->AddFeature("p^" + Join("_", span));
}
// Emit one indicator feature for a source/target phrase pair:
// p^src1_..._srcN^tgt1_..._tgtM.
void FeatureExtractor::GenerateConcatIndicatorFeature(const vector<string> &span1, const vector<string> &span2, FeatureConsumer *fc)
{
fc->AddFeature("p^" + Join("_", span1) + "^" + Join("_", span2));
}
// Emit combined source-target-source-external features: the concatenated
// source/target phrase pair conjoined with each context-window feature
// (same windowing and <s>/</s> sentinels as GenerateContextFeatures).
void FeatureExtractor::GenerateSTSE(const vector<string> &span1, const vector<string> &span2,
  const ContextType &context,
  size_t spanStart,
  size_t spanEnd,
  FeatureConsumer *fc)
{
  vector<size_t>::const_iterator factIt;
  for (factIt = m_config.GetFactors().begin(); factIt != m_config.GetFactors().end(); factIt++) {
    for (size_t i = 1; i <= m_config.GetWindowSize(); i++) {
      string left = "<s>";
      string right = "</s>";
      if (spanStart >= i)
        left = context[spanStart - i][*factIt];
      // negate through int: -i on the unsigned size_t would wrap to a huge
      // value whose conversion to int is implementation-defined (pre-C++20)
      fc->AddFeature("stse^" + Join("_", span1) + "^" + Join("_", span2) + BuildContextFeature(*factIt, -static_cast<int>(i), left));
      if (spanEnd + i < context.size())
        right = context[spanEnd + i][*factIt];
      fc->AddFeature("stse^" + Join("_", span1) + "^" + Join("_", span2) + BuildContextFeature(*factIt, static_cast<int>(i), right));
    }
  }
}
// Emit one w^<word> feature per token of the given phrase.
void FeatureExtractor::GenerateInternalFeatures(const vector<string> &span, FeatureConsumer *fc)
{
  for (size_t i = 0; i < span.size(); i++) {
    fc->AddFeature("w^" + span[i]);
  }
}
// Emit bow^<word> for every sentence token OUTSIDE the source span
// [spanStart, spanEnd], using the requested factor.
void FeatureExtractor::GenerateBagOfWordsFeatures(const ContextType &context, size_t spanStart, size_t spanEnd, size_t factorID, FeatureConsumer *fc)
{
  for (size_t i = 0; i < context.size(); i++) {
    if (i >= spanStart && i <= spanEnd)
      continue; // skip the phrase itself
    fc->AddFeature("bow^" + context[i][factorID]);
  }
}
void FeatureExtractor::GeneratePhraseFactorFeatures(const ContextType &context, size_t spanStart, size_t spanEnd, FeatureConsumer *fc)
{
for (size_t i = spanStart; i <= spanEnd; i++) {
vector<size_t>::const_iterator factIt;
for (factIt = m_config.GetFactors().begin(); factIt != m_config.GetFactors().end(); factIt++) {
fc->AddFeature("ibow^" + SPrint(*factIt) + "_" + context[i][*factIt]);
}
}
}
void FeatureExtractor::GeneratePairedFeatures(const vector<string> &srcPhrase, const vector<string> &tgtPhrase,
const AlignmentType &align, FeatureConsumer *fc)
{
AlignmentType::const_iterator it;
set<size_t> srcAligned;
set<size_t> tgtAligned;
for (it = align.begin(); it != align.end(); it++) {
fc->AddFeature("pair^" + srcPhrase[it->first] + "^" + tgtPhrase[it->second]);
srcAligned.insert(it->first);
tgtAligned.insert(it->second);
}
for (size_t i = 0; i < srcPhrase.size(); i++) {
if (srcAligned.count(i) == 0)
fc->AddFeature("pair^" + srcPhrase[i] + "^NULL");
}
for (size_t i = 0; i < tgtPhrase.size(); i++) {
if (tgtAligned.count(i) == 0)
fc->AddFeature("pair^NULL^" + tgtPhrase[i]);
}
}
// Emit binned-score features: for each phrase table and each configured
// score index, fire one feature per bin whose boundary the log-score is
// below (or equal to) -- i.e. a cumulative binning of log scores.
// Table IDs prefix the feature name only when several tables are present.
void FeatureExtractor::GenerateScoreFeatures(const std::vector<TTableEntry> &ttableScores, FeatureConsumer *fc)
{
  vector<size_t>::const_iterator scoreIt;
  vector<float>::const_iterator binIt;
  vector<TTableEntry>::const_iterator tableIt;
  const vector<size_t> &scoreIDs = m_config.GetScoreIndexes();
  const vector<float> &bins = m_config.GetScoreBins();
  for (tableIt = ttableScores.begin(); tableIt != ttableScores.end(); tableIt++) {
    if (! tableIt->m_exists)
      continue;
    string prefix = ttableScores.size() == 1 ? "" : tableIt->m_id + "_";
    for (scoreIt = scoreIDs.begin(); scoreIt != scoreIDs.end(); scoreIt++) {
      // hoisted out of the bin loop: the log-score does not depend on the bin
      // (previously recomputed for every bin boundary)
      float logScore = log(tableIt->m_scores[*scoreIt]);
      for (binIt = bins.begin(); binIt != bins.end(); binIt++) {
        if (logScore < *binIt || Equals(logScore, *binIt)) {
          fc->AddFeature(prefix + "sc^" + SPrint<size_t>(*scoreIt) + "_" + SPrint(*binIt));
        }
      }
    }
  }
}
void FeatureExtractor::GenerateMostFrequentFeature(const std::vector<TTableEntry> &ttableScores, const map<string, float> &maxProbs, FeatureConsumer *fc)
{
vector<TTableEntry>::const_iterator it;
for (it = ttableScores.begin(); it != ttableScores.end(); it++) {
if (it->m_exists && Equals(it->m_scores[P_E_F_INDEX], maxProbs.find(it->m_id)->second)) {
string prefix = ttableScores.size() == 1 ? "" : it->m_id + "_";
fc->AddFeature(prefix + "MOST_FREQUENT");
}
}
}
void FeatureExtractor::GenerateTTableEntryFeatures(const std::vector<TTableEntry> &ttableScores, FeatureConsumer *fc)
{
vector<TTableEntry>::const_iterator it;
for (it = ttableScores.begin(); it != ttableScores.end(); it++) {
if (! it->m_exists)
fc->AddFeature("NOT_IN_" + it->m_id);
}
}
} // namespace Classifier

88
vw/FeatureExtractor.h Normal file
View File

@@ -0,0 +1,88 @@
#ifndef moses_FeatureExtractor_h
#define moses_FeatureExtractor_h
#include "FeatureConsumer.h"
#include "ExtractorConfig.h"
#include <vector>
#include <string>
#include <exception>
#include <stdexcept>
#include <map>
namespace Classifier
{
// label index passed to the classifier, this value is not used in our setting
const int DUMMY_IDX = 1111;
// vector of words, each word is a vector of factors
typedef std::vector<std::vector<std::string> > ContextType;
// phrase-internal alignment: source word index -> target word index
// (multimap: one source word may align to several target words)
typedef std::multimap<size_t, size_t> AlignmentType;
// In DA scenario, there are multiple phrase tables. This struct
// contains scores for a phrase in one phrase-table.
struct TTableEntry
{
std::string m_id; // phrase-table identifier
bool m_exists; // does translation exist in this table
std::vector<float> m_scores; // translation scores (empty if m_exists == false)
};
// One translation (phrase target side).
struct Translation
{
// NOTE(review): member lacks the m_ prefix used elsewhere; renaming would
// break existing callers (e.g. FeatureExtractor.cpp), so it is kept as-is.
std::vector<std::string> translation; // words (surface forms) of translation
AlignmentType m_alignment; // phrase-internal word alignment
std::vector<TTableEntry> m_ttableScores; // phrase scores in each phrase table
};
// extract features
class FeatureExtractor
{
public:
// @param config loaded feature configuration (throws logic_error if not loaded)
// @param train  true = training mode, false = prediction mode
FeatureExtractor(const ExtractorConfig &config, bool train);
// Generate features for current source phrase and all its translation options, based on
// configuration. Calls all auxiliary Generate* methods.
//
// In training, reads the &losses parameter and passes them to VW. In prediction, &losses is
// an output variable where VW scores are written.
void GenerateFeatures(FeatureConsumer *fc,
const ContextType &context,
size_t spanStart,
size_t spanEnd,
const std::vector<Translation> &translations,
std::vector<float> &losses);
private:
const ExtractorConfig &m_config; // Configuration of features.
bool m_train; // Train or predict.
// Get the highest probability P(e|f) associated with any of the translation options,
// separately for each phrase table (string keys are phrase-table IDs).
std::map<std::string, float> GetMaxProb(const std::vector<Translation> &translations);
// Window features around the span (c^<factor>_<pos>_<word>).
void GenerateContextFeatures(const ContextType &context, size_t spanStart, size_t spanEnd, FeatureConsumer *fc);
// Per-factor features for tokens inside the span (ibow^...).
void GeneratePhraseFactorFeatures(const ContextType &context, size_t spanStart, size_t spanEnd, FeatureConsumer *fc);
// One w^<word> feature per token.
void GenerateInternalFeatures(const std::vector<std::string> &span, FeatureConsumer *fc);
// Whole-phrase indicator (p^w1_..._wn).
void GenerateIndicatorFeature(const std::vector<std::string> &span, FeatureConsumer *fc);
// Source+target phrase-pair indicator.
void GenerateConcatIndicatorFeature(const std::vector<std::string> &span1, const std::vector<std::string> &span2, FeatureConsumer *fc);
// Phrase-pair indicator conjoined with context-window features.
void GenerateSTSE(const std::vector<std::string> &span1, const std::vector<std::string> &span2, const ContextType &context, size_t spanStart, size_t spanEnd, FeatureConsumer *fc);
// bow^<word> for all tokens outside the span.
void GenerateBagOfWordsFeatures(const ContextType &context, size_t spanStart, size_t spanEnd, size_t factorID, FeatureConsumer *fc);
// Aligned word-pair features (pair^src^tgt, with NULL for unaligned words).
void GeneratePairedFeatures(const std::vector<std::string> &srcPhrase,
const std::vector<std::string> &tgtPhrase,
const AlignmentType &align,
FeatureConsumer *fc);
// Cumulative binned log-score features (sc^...).
void GenerateScoreFeatures(const std::vector<TTableEntry> &ttableScores, FeatureConsumer *fc);
// MOST_FREQUENT feature for options achieving the per-table max P(e|f).
void GenerateMostFrequentFeature(const std::vector<TTableEntry> &ttableScores,
const std::map<std::string, float> &maxProbs,
FeatureConsumer *fc);
// NOT_IN_<table> features for tables missing this option.
void GenerateTTableEntryFeatures(const std::vector<TTableEntry> &ttableScores, FeatureConsumer *fc);
// Build a c^<factor>_<index>_<value> feature name.
std::string BuildContextFeature(size_t factor, int index, const std::string &value);
};
} // namespace Classifier
#endif // moses_FeatureExtractor_h

61
vw/IniReader.h Normal file
View File

@@ -0,0 +1,61 @@
#ifndef moses_iniReader_h
#define moses_iniReader_h
#include <vector>
#include <algorithm>
#include <string>
#include <fstream>
#include <map>
#include <exception>
#include <stdexcept>
#include <boost/algorithm/string.hpp>
#include <boost/bind.hpp>
#include <boost/algorithm/string/classification.hpp>
#include <boost/lexical_cast.hpp>
// simple reader of .ini files
// simple reader of .ini files
//
// Supports [section] headers, key=value pairs and comment lines starting
// with ';' or '#'. Values are looked up as "<section>.<key>".
class IniReader {
public:
// Parse the whole file eagerly.
// @throws std::runtime_error on I/O failure, a key before any [section],
//   or a line that does not split into exactly key=value
// NOTE(review): lines are not stripped of '\r', so files with Windows line
// endings would keep a trailing CR in values -- confirm inputs are Unix text.
IniReader(const std::string &file)
{
std::ifstream inStr(file.c_str());
if (! inStr.is_open())
throw std::runtime_error("Failed to open file " + file);
std::string section = "";
std::string line;
while (getline(inStr, line)) {
if (line.empty() || line[0] == ';' || line[0] == '#') {
// empty line or comment, do nothing
} else if (line[0] == '[') {
// new section
// assumes the closing ']' is the last character on the line
section = line.substr(1, line.size() - 2);
} else {
// key=value pair; split on '=' and trim whitespace from both parts
std::vector<std::string> cols;
boost::split(cols, line, boost::is_any_of("="));
std::for_each(cols.begin(), cols.end(),
boost::bind(&boost::trim<std::string>, _1, std::locale()));
if (section.empty())
throw std::runtime_error("Missing section");
if (cols.size() != 2)
throw std::runtime_error("Failed to parse line: '" + line + "'");
std::string key = section + "." + cols[0];
properties[key] = cols[1];
}
}
inStr.close();
}
// Return the value stored under "<section>.<key>" converted to T, or
// defaultValue when the key is absent.
// NOTE(review): boost::lexical_cast throws bad_lexical_cast (derived from
// std::bad_cast, NOT std::runtime_error) on malformed values; for bool it
// accepts only "0"/"1", not "true"/"false" -- callers must catch accordingly.
template <class T>
T Get(const std::string &key, T defaultValue)
{
std::map<std::string, std::string>::const_iterator it = properties.find(key);
return (it == properties.end()) ? defaultValue : boost::lexical_cast<T>(it->second);
}
private:
// flattened "<section>.<key>" -> raw string value
std::map<std::string, std::string> properties;
};
#endif // moses_iniReader_h

14
vw/Jamfile Normal file
View File

@@ -0,0 +1,14 @@
# Build rules for the VW classifier code (Boost.Jam).
alias headers : : : : <include>. <include>..//..//moses// <include>.. ;
# NOTE(review): presumably requires Boost >= 1.36.00 -- confirm the enclosing
# Jamroot defines the 'boost' rule and imports the 'option' module used below.
boost 103600 ;
# VW
# Build against Vowpal Wabbit only when --with-vw=<path> is given; the VW
# library consumer is compiled separately so the rest of the code can be
# built without VW headers.
local with-vw = [ option.get "with-vw" ] ;
if $(with-vw) {
lib vw : : <search>$(with-vw)/lib ;
lib allreduce : : <search>$(with-vw)/lib ;
obj VWLibraryConsumer.o : VWLibraryConsumer.cpp headers : <include>$(with-vw)/library <include>$(with-vw)/vowpalwabbit ;
alias vw_objects : VWLibraryConsumer.o vw allreduce : : : <library>boost_program_options ;
lib classifier : [ glob *.cpp : VWLibraryConsumer.cpp ] vw_objects headers ;
echo "Linking with Vowpal Wabbit" ;
}

85
vw/VWFileConsumer.cpp Normal file
View File

@ -0,0 +1,85 @@
#include "FeatureConsumer.h"
#include "Util.h"
#include <stdexcept>
#include <exception>
#include <string>
#include <boost/iostreams/device/file.hpp>
using namespace std;
using namespace Moses;
namespace Classifier
{
// Open the output stream for writing VW-format training examples.
// A ".gz" suffix on the file name enables on-the-fly gzip compression.
VWFileTrainConsumer::VWFileTrainConsumer(const std::string &outputFile)
{
  const string gzSuffix = ".gz";
  bool compress = outputFile.size() > gzSuffix.size()
      && outputFile.compare(outputFile.size() - gzSuffix.size(), gzSuffix.size(), gzSuffix) == 0;
  if (compress) {
    m_bfos.push(boost::iostreams::gzip_compressor());
  }
  m_bfos.push(boost::iostreams::file_sink(outputFile));
}
// Start a new namespace: flush the previous one as a line, optionally
// prefix the VW "shared" keyword, then open the namespace with "|<ns>".
void VWFileTrainConsumer::SetNamespace(char ns, bool shared)
{
if (! m_outputBuffer.empty())
WriteBuffer();
if (shared)
m_outputBuffer.push_back("shared");
m_outputBuffer.push_back("|" + SPrint(ns));
}
// Append a binary feature (implicit value 1) to the current namespace.
void VWFileTrainConsumer::AddFeature(const std::string &name)
{
m_outputBuffer.push_back(EscapeSpecialChars(name));
}
// Append a real-valued feature "name:value" to the current namespace.
void VWFileTrainConsumer::AddFeature(const std::string &name, float value)
{
m_outputBuffer.push_back(EscapeSpecialChars(name) + ":" + SPrint(value));
}
// Flush the pending namespace line and terminate the example with an
// empty line (VW's example separator).
void VWFileTrainConsumer::FinishExample()
{
WriteBuffer();
m_bfos << "\n";
}
// Close the output stream (flushes any gzip filter).
void VWFileTrainConsumer::Finish()
{
//m_os.close();
close(m_bfos);
}
// Record the label and loss for the current example; push_front puts
// "label:loss" at the start of the line, as the VW format requires.
void VWFileTrainConsumer::Train(const std::string &label, float loss)
{
m_outputBuffer.push_front(label + ":" + SPrint(loss));
}
// This consumer only writes training files; prediction is not supported.
float VWFileTrainConsumer::Predict(const std::string &label)
{
throw logic_error("Trying to predict during training!");
}
//
// private methods
//
// Write the buffered tokens as one space-separated line and reset the buffer.
void VWFileTrainConsumer::WriteBuffer()
{
m_bfos << Join(" ", m_outputBuffer.begin(), m_outputBuffer.end()) << "\n";
m_outputBuffer.clear();
}
// Replace characters that have special meaning in the VW input format:
// '|' separates namespaces, ':' separates feature values, ' ' separates tokens.
std::string VWFileTrainConsumer::EscapeSpecialChars(const std::string &str)
{
  return Replace(Replace(Replace(str, "|", "_PIPE_"), ":", "_COLON_"), " ", "_");
}
} // namespace Classifier

206
vw/VWLibraryConsumer.cpp Normal file
View File

@@ -0,0 +1,206 @@
#include "FeatureConsumer.h"
#include "vw.h"
#include "Util.h"
#include "ezexample.h"
#include <stdexcept>
#include <exception>
#include <string>
using namespace std;
namespace Classifier
{
//
// VWLibraryPredictConsumerFactory
//
// free-list sentinels: EMPTY_LIST terminates the list, BAD_LIST_POINTER
// marks entries consumed during destruction-time validation
const int EMPTY_LIST = -1;
const int BAD_LIST_POINTER = -2;

// Create a pool of poolSize predict consumers sharing a single VW model
// instance loaded from modelFile.
// @throws std::runtime_error if poolSize < 1
VWLibraryPredictConsumerFactory::VWLibraryPredictConsumerFactory(
  const string &modelFile,
  const string &vwOptions,
  const int poolSize)
{
  // validate before touching VW: the original initialized the (expensive)
  // VW instance first and leaked it when throwing on a bad pool size
  if (poolSize < 1)
    throw runtime_error("VWLibraryPredictConsumerFactory pool size must be greater than zero!");
  m_VWInstance = VW::initialize(vwOptions + " -i " + modelFile);
  // build the intrusive free list: each slot points at the previously
  // created one, so m_firstFree ends up at the last slot
  int lastFree = EMPTY_LIST;
  for (int i = 0; i < poolSize; ++i)
  {
    m_consumers.push_back(new VWLibraryPredictConsumer(m_VWInstance, i));
    m_nextFree.push_back(lastFree);
    lastFree = i;
  }
  m_firstFree = lastFree;
}
// Destroy the pool. Walks the free list to verify every consumer was
// returned via Release(), poisoning visited links with BAD_LIST_POINTER to
// detect cycles, then deletes all consumers and finishes the VW instance.
// NOTE(review): throws from a destructor on inconsistency -- this
// terminates the program if invoked during stack unwinding; kept as a
// deliberate sanity check in this legacy code.
VWLibraryPredictConsumerFactory::~VWLibraryPredictConsumerFactory()
{
boost::unique_lock<boost::mutex> lock(m_mutex);
size_t count = 0;
int prev = EMPTY_LIST;
for (int cur = m_firstFree; cur != EMPTY_LIST; cur = m_nextFree[cur])
{
if (cur == BAD_LIST_POINTER)
throw std::runtime_error("VWLibraryPredictConsumerFactory::~VWLibraryPredictConsumerFactory -- bad free list!");
++count;
// poison the link we just traversed so a cycle would be detected
if (prev == EMPTY_LIST)
m_firstFree = BAD_LIST_POINTER;
else
m_nextFree[prev] = BAD_LIST_POINTER;
prev = cur;
}
if (prev != EMPTY_LIST)
m_nextFree[prev] = BAD_LIST_POINTER;
if (count != m_nextFree.size())
throw std::runtime_error("VWLibraryPredictConsumerFactory::~VWLibraryPredictConsumerFactory -- not all consumers were returned to pool at destruction time!");
for (size_t s = 0; s < m_consumers.size(); ++s)
{
delete m_consumers[s];
m_consumers[s] = NULL;
}
m_consumers.clear();
VW::finish(*m_VWInstance);
}
// Hand out exclusive use of a pooled consumer; blocks on the condition
// variable until one is available. Pair every Acquire() with a Release().
VWLibraryPredictConsumer * VWLibraryPredictConsumerFactory::Acquire()
{
boost::unique_lock<boost::mutex> lock(m_mutex);
while (m_firstFree == EMPTY_LIST)
m_cond.wait(lock);
// pop the head of the free list
int free = m_firstFree;
m_firstFree = m_nextFree[free];
return m_consumers[free];
}
// Return a consumer obtained from Acquire() to the pool and wake one waiter.
// @throws std::runtime_error if fc was not issued by this factory
void VWLibraryPredictConsumerFactory::Release(VWLibraryPredictConsumer * fc)
{
// use scope block to handle the lock
{
boost::unique_lock<boost::mutex> lock(m_mutex);
int index = fc->m_index;
if (index < 0 || index >= (int)m_consumers.size())
throw std::runtime_error("bad index at VWLibraryPredictConsumerFactory::Release");
if (fc != m_consumers[index])
throw std::runtime_error("mismatched pointer at VWLibraryPredictConsumerFactory::Release");
// push the slot back on the head of the free list
m_nextFree[index] = m_firstFree;
m_firstFree = index;
}
// release the semaphore *AFTER* the lock goes out of scope
m_cond.notify_one();
}
//
// VWLibraryConsumer
//
// Open a new ezexample namespace. The previous namespace is removed first
// unless it was shared (shared namespaces persist across translation
// options within one example).
void VWLibraryConsumer::SetNamespace(char ns, bool shared)
{
if (!m_shared) {
m_ex->remns();
}
m_ex->addns(ns);
m_shared = shared;
}
// Add a binary feature (implicit value 1) to the current namespace.
void VWLibraryConsumer::AddFeature(const string &name)
{
m_ex->addf(name);
}
// Add a real-valued feature to the current namespace.
void VWLibraryConsumer::AddFeature(const string &name, float value)
{
m_ex->addf(name, value);
}
// Reset example state between examples.
void VWLibraryConsumer::FinishExample()
{
m_shared = true; // avoid removing an empty namespace in next call of SetNamespace
m_ex->clear_features();
}
// Release the underlying VW instance. For pooled/shared instances this only
// drops our pointer (the factory owns the instance); for owned instances it
// calls VW::finish exactly once. m_VWInstance is nulled in both cases so the
// destructor cannot release it a second time -- previously calling Finish()
// and then destroying an owning consumer called VW::finish twice.
void VWLibraryConsumer::Finish()
{
  if (!m_sharedVwInstance && m_VWInstance != NULL)
    VW::finish(*m_VWInstance);
  m_VWInstance = NULL;
}

// Delete the in-progress example and release the VW instance if we own it
// and Finish() has not already released it.
VWLibraryConsumer::~VWLibraryConsumer()
{
  delete m_ex;
  if (!m_sharedVwInstance && m_VWInstance != NULL)
    VW::finish(*m_VWInstance);
}
//
// VWLibraryTrainConsumer
//
// Create a training consumer that owns its VW instance; "-f modelFile"
// makes VW write the trained model to modelFile on finish.
VWLibraryTrainConsumer::VWLibraryTrainConsumer(const string &modelFile, const string &vwOptions)
{
m_shared = true;
m_VWInstance = VW::initialize(vwOptions + " -f " + modelFile);
m_sharedVwInstance = false;
m_ex = new ::ezexample(m_VWInstance, false);
}
// Set the label and loss of the current training example.
// Uses the "<label>:<loss>" form, matching what VWFileTrainConsumer::Train
// writes to training files; the original concatenated label and loss with
// no separator, producing e.g. "11110.5" instead of "1111:0.5".
void VWLibraryTrainConsumer::Train(const string &label, float loss)
{
  m_ex->set_label(label + ":" + Moses::SPrint(loss));
}
// Complete the current example: ezexample::finish() hands the example to VW
// (this is where learning happens), then the base class resets state.
void VWLibraryTrainConsumer::FinishExample()
{
m_ex->finish();
VWLibraryConsumer::FinishExample();
}
// This consumer only trains; prediction is not supported.
float VWLibraryTrainConsumer::Predict(const string &label)
{
throw logic_error("Trying to predict during training!");
}
//
// VWLibraryPredictConsumer
//
// Create a stand-alone predict consumer that owns its VW instance;
// "-i modelFile" loads a previously trained model.
VWLibraryPredictConsumer::VWLibraryPredictConsumer(const string &modelFile, const string &vwOptions)
{
m_shared = true;
m_VWInstance = VW::initialize(vwOptions + " -i " + modelFile);
m_sharedVwInstance = false;
m_ex = new ::ezexample(m_VWInstance, false);
}
// This consumer only predicts; training is not supported.
void VWLibraryPredictConsumer::Train(const string &label, float loss)
{
throw logic_error("Trying to train during prediction!");
}
// Score the current example under the given label and return VW's prediction.
float VWLibraryPredictConsumer::Predict(const string &label)
{
m_ex->set_label(label);
return m_ex->predict();
}
// Pool-internal constructor (used by VWLibraryPredictConsumerFactory):
// shares the factory's VW instance, so this consumer must not finish it.
VWLibraryPredictConsumer::VWLibraryPredictConsumer(vw * instance, int index)
{
m_VWInstance = instance;
m_sharedVwInstance = true;
m_ex = new ::ezexample(m_VWInstance, false);
m_shared = true;
m_index = index;
}
} // namespace Classifier