online combination of multiple phrase tables

- creates a virtual phrase table at decoding time based on a vector of component models and a combination algorithm
  - linear interpolation or instance weighting
  - two possible component model types supported so far: 0 (in-memory) or 12 (compact)
  - weights can be set in the config, and overridden at the sentence level through the mosesserver API
  - online optimization (perplexity minimization) using dlib, exposed through an xmlrpc-c call
Rico Sennrich 2013-04-22 13:21:59 +02:00
parent 477f913585
commit 908c006e32
17 changed files with 1886 additions and 12 deletions

View File

@ -0,0 +1,78 @@
# -*- coding: utf-8 -*-
#
# Sample python client. In addition to the basic functionality, it shows how translation model
# weights can be passed to the multimodel phrase table type, and how translation model weights
# can be optimized on a tuning set of phrase pairs.
# translate_concurrent() shows how to use multiple moses server threads.
#

import sys
import gzip
from multiprocessing import Pool

if sys.version_info < (3, 0):
    import xmlrpclib
else:
    import xmlrpc.client as xmlrpclib


def translate(input_object, server, weights=None):
    for line in input_object:
        params = {}
        params['text'] = line
        if weights:
            params['weight-t-multimodel'] = weights
        print(server.translate(params))


def optimize(phrase_pairs, server):
    params = {}
    params['phrase_pairs'] = phrase_pairs
    weights = server.optimize(params)
    sys.stderr.write(str(weights) + '\n')
    return weights


def read_phrase_pairs(input_object):
    pairs = []
    for line in input_object:
        line = line.split(' ||| ')
        pairs.append((line[0], line[1]))
    return pairs


# same functionality as translate(), but using multiple concurrent connections to the server
def translate_concurrent(input_object, url, weights=None, num_processes=8):
    pool = Pool(processes=num_processes)
    text_args = [(line, weights, url) for line in input_object]
    for translated_line in pool.imap(translate_single_line, text_args):
        print(translated_line)


def translate_single_line(args):
    line, weights, url = args
    server = xmlrpclib.ServerProxy(url)
    params = {}
    params['text'] = line
    if weights:
        params['weight-t-multimodel'] = weights
    return server.translate(params)['text']


if __name__ == '__main__':
    url = "http://localhost:8111/RPC2"
    server = xmlrpclib.ServerProxy(url)

    phrase_pairs = read_phrase_pairs(gzip.open('/path/to/moses-regression-tests/models/multimodel/extract.sorted.gz'))
    weights = optimize(phrase_pairs, server)
    translate(sys.stdin, server, weights)
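
The flat weight list returned by optimize() contains one weight per feature and model, laid out feature-major (ret[(iFeature*numModels)+i] in the C++ code further below), and the same flat list can be passed back via weight-t-multimodel. A small, hedged sketch of how a client might inspect it; the model count, feature count and paths are illustrative assumptions:

num_models, num_features = 2, 4  # illustrative: 2 component tables, 4 weighted translation features
server = xmlrpclib.ServerProxy("http://localhost:8111/RPC2")
weights = optimize(read_phrase_pairs(gzip.open('/path/to/tuning/extract.sorted.gz')), server)
# feature-major layout: [f0_m0, f0_m1, f1_m0, f1_m1, ...]
for f in range(num_features):
    sys.stderr.write("feature %d: %s\n" % (f, weights[f * num_models:(f + 1) * num_models]))
# passing only num_models weights instead applies the same weights to every feature
translate(sys.stdin, server, weights)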

View File

@ -11,11 +11,16 @@
#include "moses/Phrase.h"
#include "moses/StaticData.h"
#include "moses/TranslationModel/PhraseDictionaryDynSuffixArray.h"
#include "moses/TranslationModel/PhraseDictionaryMultiModelCounts.h"
#include "moses/TranslationSystem.h"
#include "moses/TreeInput.h"
#include "moses/LMList.h"
#include "moses/LM/ORLM.h"
#ifdef WITH_THREADS
#include <boost/thread.hpp>
#endif
#include <xmlrpc-c/base.hpp>
#include <xmlrpc-c/registry.hpp>
#include <xmlrpc-c/server_abyss.hpp>
@ -133,6 +138,61 @@ public:
}
};
class Optimizer : public xmlrpc_c::method
{
public:
Optimizer() {
// signature and help strings are documentation -- the client
// can query this information with a system.methodSignature and
// system.methodHelp RPC.
this->_signature = "S:S";
this->_help = "Optimizes multi-model translation model";
}
void
execute(xmlrpc_c::paramList const& paramList,
xmlrpc_c::value * const retvalP) {
#ifdef WITH_DLIB
const params_t params = paramList.getStruct(0);
const TranslationSystem& system = getTranslationSystem(params);
const PhraseDictionaryFeature* pdf = system.GetPhraseDictionaries()[0];
PhraseDictionaryMultiModel* pdmm = (PhraseDictionaryMultiModel*) pdf->GetDictionary();
params_t::const_iterator si = params.find("phrase_pairs");
if (si == params.end()) {
throw xmlrpc_c::fault(
"Missing list of phrase pairs",
xmlrpc_c::fault::CODE_PARSE);
}
vector<pair<string, string> > phrase_pairs;
xmlrpc_c::value_array phrase_pairs_array = xmlrpc_c::value_array(si->second);
vector<xmlrpc_c::value> phrasePairValueVector(phrase_pairs_array.vectorValueValue());
for (size_t i=0;i < phrasePairValueVector.size();i++) {
vector<xmlrpc_c::value> phrasePair(xmlrpc_c::value_array(phrasePairValueVector[i]).vectorValueValue());
string L1 = xmlrpc_c::value_string(phrasePair[0]);
string L2 = xmlrpc_c::value_string(phrasePair[1]);
phrase_pairs.push_back(make_pair(L1,L2));
}
vector<float> weight_vector;
weight_vector = pdmm->MinimizePerplexity(phrase_pairs);
vector<xmlrpc_c::value> weight_vector_ret;
for (size_t i=0;i < weight_vector.size();i++) {
weight_vector_ret.push_back(xmlrpc_c::value_double(weight_vector[i]));
}
*retvalP = xmlrpc_c::value_array(weight_vector_ret);
#else
string errmsg = "Error: Perplexity minimization requires dlib (compilation option --with-dlib)";
cerr << errmsg << endl;
*retvalP = xmlrpc_c::value_string(errmsg);
#endif
}
};
class Translator : public xmlrpc_c::method
{
public:
@ -173,12 +233,29 @@ public:
si = params.find("nbest-distinct");
bool nbest_distinct = (si != params.end());
vector<float> multiModelWeights;
si = params.find("weight-t-multimodel");
if (si != params.end()) {
xmlrpc_c::value_array multiModelArray = xmlrpc_c::value_array(si->second);
vector<xmlrpc_c::value> multiModelValueVector(multiModelArray.vectorValueValue());
for (size_t i=0;i < multiModelValueVector.size();i++) {
multiModelWeights.push_back(xmlrpc_c::value_double(multiModelValueVector[i]));
}
}
const StaticData &staticData = StaticData::Instance();
if (addGraphInfo) {
(const_cast<StaticData&>(staticData)).SetOutputSearchGraph(true);
}
if (multiModelWeights.size() > 0) {
staticData.SetTemporaryMultiModelWeightsVector(multiModelWeights);
if (staticData.GetUseTransOptCache()) {
cerr << "Warning: -use-persistent-cache is set to true; sentence-specific weights may be ignored. Disable cache for true results.\n";
}
}
const TranslationSystem& system = getTranslationSystem(params);
stringstream out, graphInfo, transCollOpts;
map<string, xmlrpc_c::value> retData;
@ -425,13 +502,18 @@ int main(int argc, char** argv)
exit(1);
}
//512 MB data limit (512KB is not enough for optimization)
xmlrpc_limit_set(XMLRPC_XML_SIZE_LIMIT_ID, 512*1024*1024);
xmlrpc_c::registry myRegistry;
xmlrpc_c::methodPtr const translator(new Translator);
xmlrpc_c::methodPtr const updater(new Updater);
xmlrpc_c::methodPtr const optimizer(new Optimizer);
myRegistry.addMethod("translate", translator);
myRegistry.addMethod("updater", updater);
myRegistry.addMethod("optimize", optimizer);
xmlrpc_c::serverAbyss myAbyssServer(
myRegistry,
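
For reference, a hedged sketch of the raw XML-RPC structures these methods expect, matching the parsing code above. The URL and all values are placeholders, and optimize is only available when the server is compiled with dlib (the --with-dlib option mentioned in the error message):

import xmlrpclib  # xmlrpc.client on Python 3

proxy = xmlrpclib.ServerProxy("http://localhost:8111/RPC2")

# translate: weight-t-multimodel is an array of doubles
translation = proxy.translate({"text": "ein haus", "weight-t-multimodel": [0.8, 0.2]})

# optimize: phrase_pairs is an array of [source, target] string pairs;
# the result is an array of doubles (one weight per feature and model)
weights = proxy.optimize({"phrase_pairs": [["ein haus", "a house"], ["haus", "house"]]})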

View File

@ -3,7 +3,14 @@ path-constant FACTOR-LOG : bin/factor.log ;
update-if-changed $(FACTOR-LOG) $(max-factors) ;
max-factors = <define>MAX_NUM_FACTORS=$(max-factors) <dependency>$(FACTOR-LOG) ;
alias headers : ../util//kenutil : : : $(max-factors) ;
with-dlib = [ option.get "with-dlib" ] ;
if $(with-dlib) {
dlib = <define>WITH_DLIB <include>$(with-dlib) ;
} else {
dlib = ;
}
alias headers : ../util//kenutil : : : $(max-factors) $(dlib) ;
alias ThreadPool : ThreadPool.cpp ;

View File

@ -100,6 +100,7 @@ Parameter::Parameter()
AddParam("weight-w", "w", "weight for word penalty");
AddParam("weight-u", "u", "weight for unknown word penalty");
AddParam("weight-e", "e", "weight for word deletion");
AddParam("weight-t-multimodel", "tmo", "weights for multi-model mode");
AddParam("weight-file", "wf", "feature weights file. Do *not* put weights for 'core' features in here - they go in moses.ini");
AddParam("output-factors", "list if factors in the output");
AddParam("cache-path", "?");

View File

@ -355,6 +355,8 @@ bool StaticData::LoadData(Parameter *parameter)
m_unknownWordPenaltyProducer = new UnknownWordPenaltyProducer();
SetWeight(m_unknownWordPenaltyProducer, weightUnknownWord);
m_multimodelweights = Scan<float>( m_parameter->GetParam("weight-t-multimodel") );
// reordering constraints
m_maxDistortion = (m_parameter->GetParam("distortion-limit").size() > 0) ?
Scan<int>(m_parameter->GetParam("distortion-limit")[0])

View File

@ -33,6 +33,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include <string>
#ifdef WITH_THREADS
#include <boost/thread.hpp>
#include <boost/thread/mutex.hpp>
#endif
@ -246,7 +247,13 @@ protected:
int m_threadCount;
long m_startTranslationId;
std::vector<float> m_multimodelweights;
#ifdef WITH_THREADS
mutable std::map<boost::thread::id, std::vector<float> > m_multimodelweights_tmp;
#else
mutable std::vector<float> m_multimodelweights_tmp;
#endif
StaticData();
@ -728,6 +735,34 @@ public:
void SetExecPath(const std::string &path);
const std::string &GetBinDirectory() const;
const std::vector<float>* GetMultiModelWeightsVector() const {
return &m_multimodelweights;
}
void SetTemporaryMultiModelWeightsVector(std::vector<float> weights) const {
#ifdef WITH_THREADS
m_multimodelweights_tmp[boost::this_thread::get_id()] = weights;
#else
m_multimodelweights_tmp = weights;
#endif
}
const std::vector<float>* GetTemporaryMultiModelWeightsVector() const {
#ifdef WITH_THREADS
if (m_multimodelweights_tmp.find(boost::this_thread::get_id()) != m_multimodelweights_tmp.end()) {
return &m_multimodelweights_tmp.find(boost::this_thread::get_id())->second;
}
else {
return NULL;
}
#else
return &m_multimodelweights_tmp;
#endif
}
void SetNeedAlignmentInfo(bool needAlignmentInfo) {
m_needAlignmentInfo = needAlignmentInfo;
}
bool NeedAlignmentInfo() const {
return m_needAlignmentInfo; }
const std::string &GetAlignmentOutputFile() const {
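
The temporary weight vector is keyed by thread id so that concurrent mosesserver requests carrying different sentence-level weights do not interfere. A minimal Python analogue of that lookup pattern (an illustrative sketch, not Moses code):

import threading

config_weights = [0.5, 0.5]   # analogue of m_multimodelweights (set via the config)
tmp_weights = {}              # analogue of m_multimodelweights_tmp, keyed by thread id

def set_temporary_weights(weights):
    tmp_weights[threading.get_ident()] = weights

def get_weights():
    # fall back to the config-level weights if no per-sentence weights were set on this thread
    return tmp_weights.get(threading.get_ident(), config_weights)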

View File

@ -433,9 +433,13 @@ TargetPhraseVectorPtr PhraseDecoder::DecodeCollection(
size_t idx = m_multipleScoreTrees ? scores.size() : 0;
float score = m_scoreTrees[idx]->Read(encodedBitStream);
scores.push_back(score);
if(scores.size() == m_numScoreComponent)
{
//PhraseDictionaryMultiModel may use input phrase dictionaries with a different number of features than it is assigned in the log-linear model;
//extra slots are filled with zeroes to prevent spurious errors downstream
if (m_phraseDictionary.GetNumScoreComponentMultiModel() > 0 && m_phraseDictionary.GetNumScoreComponentMultiModel() > m_numScoreComponent) {
scores.resize(m_phraseDictionary.GetNumScoreComponentMultiModel());
}
targetPhrase->SetScore(m_feature, scores, ScoreComponentCollection() /*sparse*/,*m_weight, m_weightWP, *m_languageModels);
if(m_containsAlignmentInfo)

View File

@ -26,6 +26,8 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.h"
#include "moses/TranslationModel/RuleTable/PhraseDictionaryALSuffixArray.h"
#include "moses/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.h"
#include "moses/TranslationModel/PhraseDictionaryMultiModel.h"
#include "moses/TranslationModel/PhraseDictionaryMultiModelCounts.h"
#ifndef WIN32
#include "moses/TranslationModel/PhraseDictionaryDynSuffixArray.h"
@ -76,7 +78,7 @@ PhraseDictionaryFeature::PhraseDictionaryFeature
m_sparsePhraseDictionaryFeature(spdf)
{
if (implementation == Memory || implementation == SCFG || implementation == SuffixArray ||
implementation==Compact || implementation==FuzzyMatch ) {
implementation==Compact || implementation==FuzzyMatch || implementation == MultiModel || implementation == MultiModelCounts) {
m_useThreadSafePhraseDictionary = true;
} else {
m_useThreadSafePhraseDictionary = false;
@ -241,7 +243,43 @@ PhraseDictionary* PhraseDictionaryFeature::LoadPhraseTable(const TranslationSyst
#else
CHECK(false);
#endif
}
} else if (m_implementation == MultiModel ) {
// memory phrase table
VERBOSE(2,"multi-model mode" << std::endl);
if (staticData.GetInputType() != SentenceInput) {
UserMessage::Add("Must use binary phrase table for this input type");
CHECK(false);
}
PhraseDictionaryMultiModel* pd = new PhraseDictionaryMultiModel(GetNumScoreComponents(),this);
bool ret = pd->Load(GetInput(), GetOutput()
, m_config
, weightT
, m_tableLimit
, system->GetLanguageModels()
, system->GetWeightWordPenalty());
CHECK(ret);
return pd;
} else if (m_implementation == MultiModelCounts) {
// memory phrase table
VERBOSE(2,"multi-model mode (count tables)" << std::endl);
if (staticData.GetInputType() != SentenceInput) {
UserMessage::Add("Must use binary phrase table for this input type");
CHECK(false);
}
(const_cast<StaticData&>(staticData)).SetNeedAlignmentInfo(true); //needed for lexical weight computation
PhraseDictionaryMultiModelCounts* pd = new PhraseDictionaryMultiModelCounts(GetNumScoreComponents(),this);
bool ret = pd->Load(GetInput(), GetOutput()
, m_config
, weightT
, m_tableLimit
, system->GetLanguageModels()
, system->GetWeightWordPenalty());
CHECK(ret);
return pd;
}
else {
std::cerr << "Unknown phrase table type " << m_implementation << endl;
CHECK(false);
@ -261,6 +299,16 @@ void PhraseDictionaryFeature::InitDictionary(const TranslationSystem* system)
//Other types will be lazy loaded
}
void PhraseDictionary::SetNumScoreComponentMultiModel(size_t num)
{
m_numScoreComponentMultiModel = num;
}
size_t PhraseDictionary::GetNumScoreComponentMultiModel() const
{
return m_numScoreComponentMultiModel;
}
//Called when we start translating a new sentence
void PhraseDictionaryFeature::InitDictionary(const TranslationSystem* system, const InputType& source)
{
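
Based on the Load() code in PhraseDictionaryMultiModel.cpp further below, the field that normally holds the phrase-table path instead carries the combination mode, followed by the component tables in Implementation:Path form (0 = in-memory, 12 = compact, as noted in the commit message). A hedged moses.ini sketch; the leading implementation id (99), the factor columns and the score count are placeholders only:

[ttable-file]
# implementation in-factors out-factors num-scores mode component-tables (Implementation:Path)
99 0 0 5 interpolate 0:/path/to/model1/phrase-table 12:/path/to/model2/phrase-table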

View File

@ -60,7 +60,7 @@ class PhraseDictionary: public Dictionary
{
public:
PhraseDictionary(size_t numScoreComponent, const PhraseDictionaryFeature* feature):
Dictionary(numScoreComponent), m_tableLimit(0), m_feature(feature) {}
Dictionary(numScoreComponent), m_tableLimit(0), m_feature(feature), m_numScoreComponentMultiModel(0) {}
//! table limit number.
size_t GetTableLimit() const {
return m_tableLimit;
@ -83,9 +83,14 @@ public:
const InputType &,
const ChartCellCollectionBase &) = 0;
//PhraseDictionaryMultiModel may use input phrase dictionaries with a different number of features than it is assigned in the log-linear model
void SetNumScoreComponentMultiModel(size_t num);
size_t GetNumScoreComponentMultiModel() const;
protected:
size_t m_tableLimit;
const PhraseDictionaryFeature* m_feature;
size_t m_numScoreComponentMultiModel;
};
@ -174,7 +179,7 @@ private:
PhraseTableImplementation m_implementation;
const std::vector<std::string> m_config;
SparsePhraseDictionaryFeature* m_sparsePhraseDictionaryFeature;
std::vector<std::string> m_allPaths;
};

View File

@ -122,10 +122,17 @@ bool PhraseDictionaryMemory::Load(const std::vector<FactorType> &input
}
}
if (scv.size() != m_numScoreComponent) {
stringstream strme;
strme << "Size of scoreVector != number (" <<scv.size() << "!=" <<m_numScoreComponent<<") of score components on line " << line_num;
UserMessage::Add(strme.str());
abort();
//PhraseDictionaryMultiModel may use input phrase dictionaries with a different number of features than it is assigned in the log-linear model;
//extra slots are filled with zeroes to prevent spurious errors downstream
if (m_numScoreComponentMultiModel > 0 && scv.size() == m_numScoreComponentMultiModel && m_numScoreComponentMultiModel < m_numScoreComponent) {
scv.resize(m_numScoreComponent);
}
else {
stringstream strme;
strme << "Size of scoreVector != number (" <<scv.size() << "!=" <<m_numScoreComponent<<") of score components on line " << line_num;
UserMessage::Add(strme.str());
abort();
}
}
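
The same padding logic, restated as a short Python sketch (illustrative names; num_pt_scores corresponds to m_numScoreComponentMultiModel):

def pad_scores(scores, num_pt_scores, num_score_component):
    # a component table may carry fewer scores than the log-linear model expects;
    # pad with zeroes instead of aborting
    if 0 < num_pt_scores == len(scores) < num_score_component:
        return scores + [0.0] * (num_score_component - len(scores))
    if len(scores) != num_score_component:
        raise ValueError("size of score vector != number of score components: %d != %d"
                         % (len(scores), num_score_component))
    return scores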

View File

@ -0,0 +1,476 @@
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include "moses/TranslationModel/PhraseDictionaryMultiModel.h"
using namespace std;
namespace Moses
{
PhraseDictionaryMultiModel::PhraseDictionaryMultiModel(size_t numScoreComponent,
PhraseDictionaryFeature* feature): PhraseDictionary(numScoreComponent, feature)
{
m_feature_load = feature;
}
PhraseDictionaryMultiModel::~PhraseDictionaryMultiModel()
{
RemoveAllInColl(m_pd);
}
bool PhraseDictionaryMultiModel::Load(const std::vector<FactorType> &input
, const std::vector<FactorType> &output
, const std::vector<std::string> &config
, const vector<float> &weight
, size_t tableLimit
, const LMList &languageModels
, float weightWP)
{
m_languageModels = &languageModels;
m_weight = weight;
m_weightWP = weightWP;
m_input = input;
m_output = output;
m_tableLimit = tableLimit;
m_mode = config[4];
std::vector<std::string> files(config.begin()+5,config.end());
m_numModels = files.size();
// since the top X target phrases of the final model are not the same as the top X phrases of each component model,
// one could choose a higher value than tableLimit (or 0) here for maximal precision, at the cost of speed.
m_componentTableLimit = tableLimit;
//how many actual scores there are in the phrase tables
//so far, equal to number of log-linear scores, but it is allowed to be smaller (for other combination types)
size_t numPtScores = m_numScoreComponent;
if (m_mode != "interpolate") {
ostringstream msg;
msg << "combination mode unknown: " << m_mode;
throw runtime_error(msg.str());
}
for(size_t i = 0; i < m_numModels; ++i){
std::string impl, file, main_table;
std::string delim = ":";
size_t delim_pos = files[i].find(delim);
if (delim_pos >= files[i].size()) {
UserMessage::Add("Phrase table must be specified in this format: Implementation:Path");
CHECK(false);
}
impl = files[i].substr(0,delim_pos);
file = files[i].substr(delim_pos+1,files[i].size());
PhraseTableImplementation implementation = (PhraseTableImplementation) Scan<int>(impl);
if (implementation == Memory) {
if (!FileExists(file) && FileExists(file + ".gz")) file += ".gz";
PhraseDictionaryMemory* pdm = new PhraseDictionaryMemory(m_numScoreComponent, m_feature_load);
pdm->SetNumScoreComponentMultiModel(numPtScores); //instead of complaining about an unequal number of scores, silently fill the score vector with zeroes
pdm->Load( input, output, file, m_weight, m_componentTableLimit, languageModels, m_weightWP);
m_pd.push_back(pdm);
}
else if (implementation == Compact) {
#ifndef WIN32
PhraseDictionaryCompact* pdc = new PhraseDictionaryCompact(m_numScoreComponent, implementation, m_feature_load);
pdc->SetNumScoreComponentMultiModel(m_numScoreComponent); //for compact models, we need to pass number of log-linear components to correctly resize the score vector
pdc->Load( input, output, file, m_weight, m_componentTableLimit, languageModels, m_weightWP);
m_pd.push_back(pdc);
#else
CHECK(false);
#endif
}
else {
UserMessage::Add("phrase table type unknown to multi-model mode");
CHECK(false);
}
}
return true;
}
const TargetPhraseCollection *PhraseDictionaryMultiModel::GetTargetPhraseCollection(const Phrase& src) const
{
std::vector<std::vector<float> > multimodelweights;
if (m_mode == "interpolate") {
//interpolation of the phrase penalty is skipped, and a fixed value (2.718) is used instead. results will be incorrect if the phrase penalty is not the last feature
size_t numWeights = m_numScoreComponent-1;
multimodelweights = getWeights(numWeights, true);
}
std::map<std::string,multiModelStatistics*>* allStats = new(std::map<std::string,multiModelStatistics*>);
CollectSufficientStatistics(src, allStats);
TargetPhraseCollection *ret;
if (m_mode == "interpolate") {
ret = CreateTargetPhraseCollectionLinearInterpolation(allStats, multimodelweights);
}
ret->NthElement(m_tableLimit); // sort the phrases for pruning later
const_cast<PhraseDictionaryMultiModel*>(this)->CacheForCleanup(ret);
RemoveAllInMap(*allStats);
delete allStats;
return ret;
}
void PhraseDictionaryMultiModel::CollectSufficientStatistics(const Phrase& src, std::map<std::string,multiModelStatistics*>* allStats) const
{
for(size_t i = 0; i < m_numModels; ++i){
TargetPhraseCollection *ret_raw = (TargetPhraseCollection*) m_pd[i]->GetTargetPhraseCollection( src);
if (ret_raw != NULL) {
TargetPhraseCollection::iterator iterTargetPhrase, iterLast;
if (m_componentTableLimit != 0 && ret_raw->GetSize() > m_componentTableLimit) {
iterLast = ret_raw->begin() + m_componentTableLimit;
}
else {
iterLast = ret_raw->end();
}
for (iterTargetPhrase = ret_raw->begin(); iterTargetPhrase != iterLast; ++iterTargetPhrase) {
TargetPhrase * targetPhrase = *iterTargetPhrase;
std::vector<float> raw_scores = targetPhrase->GetScoreBreakdown().GetScoresForProducer(m_feature);
std::string targetString = targetPhrase->GetStringRep(m_output);
if (allStats->find(targetString) == allStats->end()) {
multiModelStatistics * statistics = new multiModelStatistics;
statistics->targetPhrase = new TargetPhrase(*targetPhrase); //make a copy so that we don't overwrite the original phrase table info
Scores scoreVector(m_numScoreComponent);
statistics->p.resize(m_numScoreComponent);
for(size_t j = 0; j < m_numScoreComponent; ++j){
statistics->p[j].resize(m_numModels);
scoreVector[j] = -raw_scores[j];
}
statistics->targetPhrase->SetScore(m_feature, scoreVector, ScoreComponentCollection(), m_weight, m_weightWP, *m_languageModels); // set scores to 0
(*allStats)[targetString] = statistics;
}
multiModelStatistics * statistics = (*allStats)[targetString];
for(size_t j = 0; j < m_numScoreComponent; ++j){
statistics->p[j][i] = UntransformScore(raw_scores[j]);
}
(*allStats)[targetString] = statistics;
}
}
}
}
TargetPhraseCollection* PhraseDictionaryMultiModel::CreateTargetPhraseCollectionLinearInterpolation(std::map<std::string,multiModelStatistics*>* allStats, std::vector<std::vector<float> > &multimodelweights) const
{
TargetPhraseCollection *ret = new TargetPhraseCollection();
for ( std::map< std::string, multiModelStatistics*>::const_iterator iter = allStats->begin(); iter != allStats->end(); ++iter ) {
multiModelStatistics * statistics = iter->second;
Scores scoreVector(m_numScoreComponent);
for(size_t i = 0; i < m_numScoreComponent-1; ++i){
scoreVector[i] = TransformScore(std::inner_product(statistics->p[i].begin(), statistics->p[i].end(), multimodelweights[i].begin(), 0.0));
}
//assuming that last value is phrase penalty
scoreVector[m_numScoreComponent-1] = 1.0;
statistics->targetPhrase->SetScore(m_feature, scoreVector, ScoreComponentCollection(), m_weight, m_weightWP, *m_languageModels);
ret->Add(new TargetPhrase(*statistics->targetPhrase));
}
return ret;
}
//TODO: is it worth caching the results as long as weights don't change?
std::vector<std::vector<float> > PhraseDictionaryMultiModel::getWeights(size_t numWeights, bool normalize) const
{
const std::vector<float>* weights_ptr;
std::vector<float> raw_weights;
const StaticData &staticData = StaticData::Instance();
weights_ptr = staticData.GetTemporaryMultiModelWeightsVector();
//checking weights passed to mosesserver; only valid for this sentence; *don't* raise exception if client weights are malformed
if (weights_ptr == NULL || weights_ptr->size() == 0) {
weights_ptr = staticData.GetMultiModelWeightsVector(); //fall back to weights defined in config
}
else if(weights_ptr->size() != m_numModels && weights_ptr->size() != m_numModels * numWeights) {
//TODO: can we pass error message to client if weights are malformed?
std::stringstream strme;
strme << "Must have either one multimodel weight per model (" << m_numModels << "), or one per weighted feature and model (" << numWeights << "*" << m_numModels << "). You have " << weights_ptr->size() << ". Reverting to weights in config";
UserMessage::Add(strme.str());
weights_ptr = staticData.GetMultiModelWeightsVector(); //fall back to weights defined in config
}
//checking weights defined in config; raise exception if config weights are malformed
if (weights_ptr == NULL || weights_ptr->size() == 0) {
for (size_t i=0;i < m_numModels;i++) {
raw_weights.push_back(1.0/m_numModels); //uniform weights created online
}
}
else if(weights_ptr->size() != m_numModels && weights_ptr->size() != m_numModels * numWeights) {
std::stringstream strme;
strme << "Must have either one multimodel weight per model (" << m_numModels << "), or one per weighted feature and model (" << numWeights << "*" << m_numModels << "). You have " << weights_ptr->size() << ".";
UserMessage::Add(strme.str());
CHECK(false);
}
else {
raw_weights = *weights_ptr;
}
std::vector<std::vector<float> > multimodelweights (numWeights);
for (size_t i=0;i < numWeights;i++) {
std::vector<float> weights_onefeature (m_numModels);
if(raw_weights.size() == m_numModels) {
weights_onefeature = raw_weights;
}
else {
copy ( raw_weights.begin()+i*m_numModels, raw_weights.begin()+(i+1)*m_numModels, weights_onefeature.begin() );
}
if(normalize) {
multimodelweights[i] = normalizeWeights(weights_onefeature);
}
else {
multimodelweights[i] = weights_onefeature;
}
}
return multimodelweights;
}
std::vector<float> PhraseDictionaryMultiModel::normalizeWeights(std::vector<float> &weights) const
{
std::vector<float> ret (m_numModels);
float total = std::accumulate(weights.begin(),weights.end(),0.0);
for (size_t i=0;i < weights.size();i++) {
ret[i] = weights[i]/total;
}
return ret;
}
ChartRuleLookupManager *PhraseDictionaryMultiModel::CreateRuleLookupManager(const InputType&, const ChartCellCollectionBase&)
{
CHECK(false);
return 0;
}
//copied from PhraseDictionaryCompact; free memory allocated to TargetPhraseCollection (and each TargetPhrase) at end of sentence
void PhraseDictionaryMultiModel::CacheForCleanup(TargetPhraseCollection* tpc) {
#ifdef WITH_THREADS
boost::mutex::scoped_lock lock(m_sentenceMutex);
PhraseCache &ref = m_sentenceCache[boost::this_thread::get_id()];
#else
PhraseCache &ref = m_sentenceCache;
#endif
ref.push_back(tpc);
}
void PhraseDictionaryMultiModel::CleanUp(const InputType &source) {
#ifdef WITH_THREADS
boost::mutex::scoped_lock lock(m_sentenceMutex);
PhraseCache &ref = m_sentenceCache[boost::this_thread::get_id()];
#else
PhraseCache &ref = m_sentenceCache;
#endif
for(PhraseCache::iterator it = ref.begin(); it != ref.end(); it++) {
delete *it;
}
PhraseCache temp;
temp.swap(ref);
CleanUpComponentModels(source);
const StaticData &staticData = StaticData::Instance();
std::vector<float> empty_vector;
(const_cast<StaticData&>(staticData)).SetTemporaryMultiModelWeightsVector(empty_vector);
}
void PhraseDictionaryMultiModel::CleanUpComponentModels(const InputType &source) {
for(size_t i = 0; i < m_numModels; ++i){
m_pd[i]->CleanUp(source);
}
}
#ifdef WITH_DLIB
vector<float> PhraseDictionaryMultiModel::MinimizePerplexity(vector<pair<string, string> > &phrase_pair_vector) {
const StaticData &staticData = StaticData::Instance();
const string& factorDelimiter = staticData.GetFactorDelimiter();
map<pair<string, string>, size_t> phrase_pair_map;
for ( vector<pair<string, string> >::const_iterator iter = phrase_pair_vector.begin(); iter != phrase_pair_vector.end(); ++iter ) {
phrase_pair_map[*iter] += 1;
}
vector<multiModelStatisticsOptimization*> optimizerStats;
for ( map<pair<string, string>, size_t>::iterator iter = phrase_pair_map.begin(); iter != phrase_pair_map.end(); ++iter ) {
pair<string, string> phrase_pair = iter->first;
string source_string = phrase_pair.first;
string target_string = phrase_pair.second;
vector<float> fs(m_numModels);
map<string,multiModelStatistics*>* allStats = new(map<string,multiModelStatistics*>);
Phrase sourcePhrase(0);
sourcePhrase.CreateFromString(m_input, source_string, factorDelimiter);
CollectSufficientStatistics(sourcePhrase, allStats); //optimization potential: only call this once per source phrase
//phrase pair not found; leave cache empty
if (allStats->find(target_string) == allStats->end()) {
RemoveAllInMap(*allStats);
delete allStats;
continue;
}
multiModelStatisticsOptimization* targetStatistics = new multiModelStatisticsOptimization();
targetStatistics->targetPhrase = new TargetPhrase(*(*allStats)[target_string]->targetPhrase);
targetStatistics->p = (*allStats)[target_string]->p;
targetStatistics->f = iter->second;
optimizerStats.push_back(targetStatistics);
RemoveAllInMap(*allStats);
delete allStats;
}
Sentence sentence;
CleanUp(sentence); // free memory used by compact phrase tables
size_t numWeights = m_numScoreComponent;
if (m_mode == "interpolate") {
//interpolation of the phrase penalty is skipped, and a fixed value (2.718) is used instead. results will be incorrect if the phrase penalty is not the last feature
numWeights = m_numScoreComponent-1;
}
vector<float> ret (m_numModels*numWeights);
for (size_t iFeature=0; iFeature < numWeights; iFeature++) {
CrossEntropy * ObjectiveFunction = new CrossEntropy(optimizerStats, this, iFeature);
vector<float> weight_vector = Optimize(ObjectiveFunction, m_numModels);
if (m_mode == "interpolate") {
weight_vector = normalizeWeights(weight_vector);
}
cerr << "Weight vector for feature " << iFeature << ": ";
for (size_t i=0; i < m_numModels; i++) {
ret[(iFeature*m_numModels)+i] = weight_vector[i];
cerr << weight_vector[i] << " ";
}
cerr << endl;
delete ObjectiveFunction;
}
RemoveAllInColl(optimizerStats);
return ret;
}
vector<float> PhraseDictionaryMultiModel::Optimize(OptimizationObjective *ObjectiveFunction, size_t numModels) {
dlib::matrix<double,0,1> starting_point;
starting_point.set_size(numModels);
starting_point = 1.0;
try {
dlib::find_min_bobyqa(*ObjectiveFunction,
starting_point,
2*numModels+1, // number of interpolation points
dlib::uniform_matrix<double>(numModels,1, 1e-09), // lower bound constraint
dlib::uniform_matrix<double>(numModels,1, 1e100), // upper bound constraint
1.0, // initial trust region radius
1e-5, // stopping trust region radius
10000 // max number of objective function evaluations
);
}
catch (dlib::bobyqa_failure& e)
{
cerr << e.what() << endl;
}
vector<float> weight_vector (numModels);
for (int i=0; i < starting_point.nr(); i++) {
weight_vector[i] = starting_point(i);
}
cerr << "Cross-entropy: " << (*ObjectiveFunction)(starting_point) << endl;
return weight_vector;
}
double CrossEntropy::operator() ( const dlib::matrix<double,0,1>& arg) const
{
double total = 0.0;
double n = 0.0;
std::vector<float> weight_vector (m_model->m_numModels);
for (int i=0; i < arg.nr(); i++) {
weight_vector[i] = arg(i);
}
if (m_model->m_mode == "interpolate") {
weight_vector = m_model->normalizeWeights(weight_vector);
}
for ( std::vector<multiModelStatisticsOptimization*>::const_iterator iter = m_optimizerStats.begin(); iter != m_optimizerStats.end(); ++iter ) {
multiModelStatisticsOptimization* statistics = *iter;
size_t f = statistics->f;
double score;
score = std::inner_product(statistics->p[m_iFeature].begin(), statistics->p[m_iFeature].end(), weight_vector.begin(), 0.0);
total -= (FloorScore(TransformScore(score))/TransformScore(2))*f;
n += f;
}
return total/n;
}
#endif
} //namespace
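
Two computations from this file restated in Python, under the assumption (consistent with how TransformScore/UntransformScore are used here) that scores are stored as natural-log probabilities: linear interpolation of per-model probabilities into a log score, and the per-feature cross-entropy (in bits) that MinimizePerplexity minimizes. An illustrative sketch, not a translation of the C++:

import math

def interpolate_feature(p_models, weights):
    # p_models[i]: probability of the phrase pair under component model i (0.0 if unseen)
    total = sum(weights)
    weights = [w / total for w in weights]   # "interpolate" mode normalizes the weights
    p = sum(pi * wi for pi, wi in zip(p_models, weights))
    return math.log(max(p, 1e-30))           # floor to avoid log(0), analogous to FloorScore

def cross_entropy(tuning_stats, weights):
    # tuning_stats: list of (per-model probabilities for one feature, frequency in the tuning set)
    total, n = 0.0, 0.0
    for p_models, freq in tuning_stats:
        total -= (interpolate_feature(p_models, weights) / math.log(2)) * freq  # nats -> bits
        n += freq
    return total / n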

View File

@ -0,0 +1,148 @@
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#ifndef moses_PhraseDictionaryMultiModel_h
#define moses_PhraseDictionaryMultiModel_h
#include "moses/TranslationModel/PhraseDictionary.h"
#include "moses/TranslationModel/PhraseDictionaryMemory.h"
#ifndef WIN32
#include "moses/TranslationModel/CompactPT/PhraseDictionaryCompact.h"
#endif
#include <boost/unordered_map.hpp>
#include "moses/StaticData.h"
#include "moses/TargetPhrase.h"
#include "moses/Util.h"
#include "moses/UserMessage.h"
#ifdef WITH_DLIB
#include <dlib/optimization.h>
#endif
namespace Moses
{
struct multiModelStatistics {
TargetPhrase *targetPhrase;
std::vector<std::vector<float> > p;
~multiModelStatistics() {delete targetPhrase;};
};
struct multiModelStatisticsOptimization: multiModelStatistics {
size_t f;
};
class OptimizationObjective;
/** Implementation of a virtual phrase table constructed from multiple component phrase tables.
*/
class PhraseDictionaryMultiModel: public PhraseDictionary
{
#ifdef WITH_DLIB
friend class CrossEntropy;
#endif
public:
PhraseDictionaryMultiModel(size_t m_numScoreComponent, PhraseDictionaryFeature* feature);
~PhraseDictionaryMultiModel();
bool Load(const std::vector<FactorType> &input
, const std::vector<FactorType> &output
, const std::vector<std::string> &files
, const std::vector<float> &weight
, size_t tableLimit
, const LMList &languageModels
, float weightWP);
virtual void CollectSufficientStatistics(const Phrase& src, std::map<std::string,multiModelStatistics*>* allStats) const;
virtual TargetPhraseCollection* CreateTargetPhraseCollectionLinearInterpolation(std::map<std::string,multiModelStatistics*>* allStats, std::vector<std::vector<float> > &multimodelweights) const;
std::vector<std::vector<float> > getWeights(size_t numWeights, bool normalize) const;
std::vector<float> normalizeWeights(std::vector<float> &weights) const;
void CacheForCleanup(TargetPhraseCollection* tpc);
void CleanUp(const InputType &source);
virtual void CleanUpComponentModels(const InputType &source);
#ifdef WITH_DLIB
virtual std::vector<float> MinimizePerplexity(std::vector<std::pair<std::string, std::string> > &phrase_pair_vector);
std::vector<float> Optimize(OptimizationObjective * ObjectiveFunction, size_t numModels);
#endif
// functions below required by base class
virtual const TargetPhraseCollection* GetTargetPhraseCollection(const Phrase& src) const;
virtual void InitializeForInput(InputType const&) {
/* Don't do anything source specific here as this object is shared between threads.*/
}
ChartRuleLookupManager *CreateRuleLookupManager(const InputType&, const ChartCellCollectionBase&);
protected:
std::string m_mode;
std::vector<PhraseDictionary*> m_pd;
std::vector<float> m_weight;
const LMList *m_languageModels;
float m_weightWP;
std::vector<FactorType> m_input;
std::vector<FactorType> m_output;
size_t m_numModels;
size_t m_componentTableLimit;
PhraseDictionaryFeature* m_feature_load;
typedef std::vector<TargetPhraseCollection*> PhraseCache;
#ifdef WITH_THREADS
boost::mutex m_sentenceMutex;
typedef std::map<boost::thread::id, PhraseCache> SentenceCache;
#else
typedef PhraseCache SentenceCache;
#endif
SentenceCache m_sentenceCache;
};
#ifdef WITH_DLIB
class OptimizationObjective
{
public:
virtual double operator() ( const dlib::matrix<double,0,1>& arg) const = 0;
};
class CrossEntropy: public OptimizationObjective
{
public:
CrossEntropy (
std::vector<multiModelStatisticsOptimization*> &optimizerStats,
PhraseDictionaryMultiModel * model,
size_t iFeature
)
{
m_optimizerStats = optimizerStats;
m_model = model;
m_iFeature = iFeature;
}
double operator() ( const dlib::matrix<double,0,1>& arg) const;
protected:
std::vector<multiModelStatisticsOptimization*> m_optimizerStats;
PhraseDictionaryMultiModel * m_model;
size_t m_iFeature;
};
#endif
} // end namespace
#endif
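
Optimize() runs one bound-constrained minimization per weighted feature with dlib's BOBYQA and then normalizes the result in "interpolate" mode. A rough Python analogue of that outer loop, using scipy's L-BFGS-B as a stand-in for BOBYQA (an illustrative sketch under that substitution, not the actual implementation):

import numpy as np
from scipy.optimize import minimize

def minimize_perplexity(objective_per_feature, num_models):
    # objective_per_feature[f]: a function mapping a weight vector to the cross-entropy of feature f
    flat_weights = []
    for objective in objective_per_feature:
        result = minimize(lambda w: objective(list(w)), np.ones(num_models), method="L-BFGS-B",
                          bounds=[(1e-9, None)] * num_models)  # keep the weights positive
        w = result.x / result.x.sum()       # normalize, as in "interpolate" mode
        flat_weights.extend(w.tolist())     # feature-major layout, as returned to the client
    return flat_weights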

View File

@ -0,0 +1,666 @@
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include "moses/TranslationModel/PhraseDictionaryMultiModelCounts.h"
#define LINE_MAX_LENGTH 100000
#include "phrase-extract/SafeGetline.h" // for SAFE_GETLINE()
using namespace std;
// from phrase-extract/tables-core.cpp
vector<string> tokenize( const char* input )
{
vector< string > token;
bool betweenWords = true;
int start=0;
int i=0;
for(; input[i] != '\0'; i++) {
bool isSpace = (input[i] == ' ' || input[i] == '\t');
if (!isSpace && betweenWords) {
start = i;
betweenWords = false;
} else if (isSpace && !betweenWords) {
token.push_back( string( input+start, i-start ) );
betweenWords = true;
}
}
if (!betweenWords)
token.push_back( string( input+start, i-start ) );
return token;
}
namespace Moses
{
PhraseDictionaryMultiModelCounts::PhraseDictionaryMultiModelCounts(size_t numScoreComponent,
PhraseDictionaryFeature* feature): PhraseDictionaryMultiModel(numScoreComponent, feature)
{
m_feature_load = feature;
m_mode = "instance_weighting"; //TODO: set this in config; use m_mode to switch between interpolation and instance weighting
m_combineFunction = InstanceWeighting;
//m_mode = "interpolate";
//m_combineFunction = LinearInterpolationFromCounts;
}
PhraseDictionaryMultiModelCounts::~PhraseDictionaryMultiModelCounts()
{
RemoveAllInColl(m_lexTable_e2f);
RemoveAllInColl(m_lexTable_f2e);
RemoveAllInColl(m_pd);
RemoveAllInColl(m_inverse_pd);
}
bool PhraseDictionaryMultiModelCounts::Load(const vector<FactorType> &input
, const vector<FactorType> &output
, const vector<string> &config
, const vector<float> &weight
, size_t tableLimit
, const LMList &languageModels
, float weightWP)
{
m_languageModels = &languageModels;
m_weight = weight;
m_weightWP = weightWP;
m_input = input;
m_output = output;
m_tableLimit = tableLimit;
m_mode = config[4];
std::vector<std::string> files(config.begin()+5,config.end());
m_numModels = files.size();
if (m_mode == "instance_weighting")
m_combineFunction = InstanceWeighting;
else if (m_mode == "interpolate")
m_combineFunction = LinearInterpolationFromCounts;
else {
ostringstream msg;
msg << "combination mode unknown: " << m_mode;
throw runtime_error(msg.str());
}
for(size_t i = 0; i < m_numModels; ++i){
string impl, file, main_table, target_table, lex_e2f, lex_f2e;
string delim = ":";
size_t delim_pos = files[i].find(delim);
if (delim_pos >= files[i].size()) {
UserMessage::Add("Phrase table must be specified in this format: Implementation:Path");
CHECK(false);
}
impl = files[i].substr(0,delim_pos);
file = files[i].substr(delim_pos+1,files[i].size());
main_table = file + "/count-table";
target_table = file + "/count-table-target";
lex_e2f = file + "/lex.counts.e2f";
lex_f2e = file + "/lex.counts.f2e";
size_t componentTableLimit = 0; // using 0, because we can't trust implemented pruning algorithms with count tables.
PhraseTableImplementation implementation = (PhraseTableImplementation) Scan<int>(impl);
if (implementation == Memory) {
//how many actual scores there are in the phrase tables
size_t numScoresCounts = 3;
size_t numScoresTargetCounts = 1;
if (!FileExists(main_table) && FileExists(main_table + ".gz")) main_table += ".gz";
if (!FileExists(target_table) && FileExists(target_table + ".gz")) target_table += ".gz";
PhraseDictionaryMemory* pdm = new PhraseDictionaryMemory(m_numScoreComponent, m_feature_load);
pdm->SetNumScoreComponentMultiModel(numScoresCounts); //instead of complaining about an unequal number of scores, silently fill the score vector with zeroes
pdm->Load( input, output, main_table, m_weight, componentTableLimit, languageModels, m_weightWP);
m_pd.push_back(pdm);
PhraseDictionaryMemory* pdm_inverse = new PhraseDictionaryMemory(m_numScoreComponent, m_feature_load);
pdm_inverse->SetNumScoreComponentMultiModel(numScoresTargetCounts);
pdm_inverse->Load( input, output, target_table, m_weight, componentTableLimit, languageModels, m_weightWP);
m_inverse_pd.push_back(pdm_inverse);
}
else if (implementation == Compact) {
#ifndef WIN32
PhraseDictionaryCompact* pdc = new PhraseDictionaryCompact(m_numScoreComponent, implementation, m_feature_load);
pdc->SetNumScoreComponentMultiModel(m_numScoreComponent); //for compact models, we need to pass number of log-linear components to correctly resize the score vector
pdc->Load( input, output, main_table, m_weight, componentTableLimit, languageModels, m_weightWP);
m_pd.push_back(pdc);
PhraseDictionaryCompact* pdc_inverse = new PhraseDictionaryCompact(m_numScoreComponent, implementation, m_feature_load);
pdc_inverse->SetNumScoreComponentMultiModel(m_numScoreComponent);
pdc_inverse->Load( input, output, target_table, m_weight, componentTableLimit, languageModels, m_weightWP);
m_inverse_pd.push_back(pdc_inverse);
#else
CHECK(false);
#endif
}
else {
UserMessage::Add("phrase table type unknown to multi-model mode");
CHECK(false);
}
lexicalTable* e2f = new lexicalTable;
LoadLexicalTable(lex_e2f, e2f);
lexicalTable* f2e = new lexicalTable;
LoadLexicalTable(lex_f2e, f2e);
m_lexTable_e2f.push_back(e2f);
m_lexTable_f2e.push_back(f2e);
}
return true;
}
const TargetPhraseCollection *PhraseDictionaryMultiModelCounts::GetTargetPhraseCollection(const Phrase& src) const
{
vector<vector<float> > multimodelweights;
bool normalize;
normalize = (m_mode == "interpolate") ? true : false;
multimodelweights = getWeights(4,normalize);
//source phrase frequency is shared among all phrase pairs
vector<float> fs(m_numModels);
map<string,multiModelCountsStatistics*>* allStats = new(map<string,multiModelCountsStatistics*>);
CollectSufficientStatistics(src, fs, allStats);
TargetPhraseCollection *ret = CreateTargetPhraseCollectionCounts(src, fs, allStats, multimodelweights);
ret->NthElement(m_tableLimit); // sort the phrases for pruning later
const_cast<PhraseDictionaryMultiModelCounts*>(this)->CacheForCleanup(ret);
return ret;
}
void PhraseDictionaryMultiModelCounts::CollectSufficientStatistics(const Phrase& src, vector<float> &fs, map<string,multiModelCountsStatistics*>* allStats) const
//fill fs and allStats with statistics from models
{
for(size_t i = 0; i < m_numModels; ++i){
TargetPhraseCollection *ret_raw = (TargetPhraseCollection*) m_pd[i]->GetTargetPhraseCollection( src);
if (ret_raw != NULL) {
TargetPhraseCollection::iterator iterTargetPhrase;
for (iterTargetPhrase = ret_raw->begin(); iterTargetPhrase != ret_raw->end(); ++iterTargetPhrase) {
TargetPhrase * targetPhrase = *iterTargetPhrase;
vector<float> raw_scores = targetPhrase->GetScoreBreakdown().GetScoresForProducer(m_feature);
string targetString = targetPhrase->GetStringRep(m_output);
if (allStats->find(targetString) == allStats->end()) {
multiModelCountsStatistics * statistics = new multiModelCountsStatistics;
statistics->targetPhrase = new TargetPhrase(*targetPhrase); //make a copy so that we don't overwrite the original phrase table info
statistics->fst.resize(m_numModels);
statistics->ft.resize(m_numModels);
Scores scoreVector(5);
scoreVector[0] = -raw_scores[0];
scoreVector[1] = -raw_scores[1];
scoreVector[2] = -raw_scores[2];
statistics->targetPhrase->SetScore(m_feature, scoreVector, ScoreComponentCollection(), m_weight, m_weightWP, *m_languageModels); // set scores to 0
(*allStats)[targetString] = statistics;
}
multiModelCountsStatistics * statistics = (*allStats)[targetString];
statistics->fst[i] = UntransformScore(raw_scores[0]);
statistics->ft[i] = UntransformScore(raw_scores[1]);
fs[i] = UntransformScore(raw_scores[2]);
(*allStats)[targetString] = statistics;
}
}
}
// get target phrase frequency for models which have not seen the phrase pair
for ( map< string, multiModelCountsStatistics*>::const_iterator iter = allStats->begin(); iter != allStats->end(); ++iter ) {
multiModelCountsStatistics * statistics = iter->second;
for (size_t i = 0; i < m_numModels; ++i) {
if (!statistics->ft[i]) {
statistics->ft[i] = GetTargetCount(static_cast<const Phrase&>(*statistics->targetPhrase), i);
}
}
}
}
TargetPhraseCollection* PhraseDictionaryMultiModelCounts::CreateTargetPhraseCollectionCounts(const Phrase &src, vector<float> &fs, map<string,multiModelCountsStatistics*>* allStats, vector<vector<float> > &multimodelweights) const
{
TargetPhraseCollection *ret = new TargetPhraseCollection();
for ( map< string, multiModelCountsStatistics*>::const_iterator iter = allStats->begin(); iter != allStats->end(); ++iter ) {
multiModelCountsStatistics * statistics = iter->second;
if (statistics->targetPhrase->GetAlignTerm().GetSize() == 0) {
UserMessage::Add(" alignment information empty\ncount-tables need to include alignment information for computation of lexical weights.\nUse --phrase-word-alignment during training; for on-disk tables, also set -alignment-info when creating on-disk tables.");
CHECK(false);
}
try {
pair<vector< set<size_t> >, vector< set<size_t> > > alignment = GetAlignmentsForLexWeights(src, static_cast<const Phrase&>(*statistics->targetPhrase), statistics->targetPhrase->GetAlignTerm());
vector< set<size_t> > alignedToT = alignment.first;
vector< set<size_t> > alignedToS = alignment.second;
double lexst = ComputeWeightedLexicalTranslation(static_cast<const Phrase&>(*statistics->targetPhrase), src, alignedToS, m_lexTable_e2f, multimodelweights[1], m_output, m_input );
double lexts = ComputeWeightedLexicalTranslation(src, static_cast<const Phrase&>(*statistics->targetPhrase), alignedToT, m_lexTable_f2e, multimodelweights[3], m_input, m_output );
Scores scoreVector(5);
scoreVector[0] = FloorScore(TransformScore(m_combineFunction(statistics->fst, statistics->ft, multimodelweights[0])));
scoreVector[1] = FloorScore(TransformScore(lexst));
scoreVector[2] = FloorScore(TransformScore(m_combineFunction(statistics->fst, fs, multimodelweights[2])));
scoreVector[3] = FloorScore(TransformScore(lexts));
scoreVector[4] = FloorScore(TransformScore(2.718));
statistics->targetPhrase->SetScore(m_feature, scoreVector, ScoreComponentCollection(), m_weight, m_weightWP, *m_languageModels);
}
catch (AlignmentException& e) {
continue;
}
ret->Add(new TargetPhrase(*statistics->targetPhrase));
}
RemoveAllInMap(*allStats);
delete allStats;
return ret;
}
float PhraseDictionaryMultiModelCounts::GetTargetCount(const Phrase &target, size_t modelIndex) const {
TargetPhraseCollection *ret_raw = (TargetPhraseCollection*) m_inverse_pd[modelIndex]->GetTargetPhraseCollection(target);
// in inverse mode, we want the first score of the first phrase pair (note: if we were to work with truly symmetric models, it would be the third score)
if (ret_raw != NULL) {
TargetPhrase * targetPhrase = *(ret_raw->begin());
return UntransformScore(targetPhrase->GetScoreBreakdown().GetScoresForProducer(m_feature)[0]);
}
// target phrase unknown
else return 0;
}
pair<PhraseDictionaryMultiModelCounts::AlignVector,PhraseDictionaryMultiModelCounts::AlignVector> PhraseDictionaryMultiModelCounts::GetAlignmentsForLexWeights(const Phrase &phraseS, const Phrase &phraseT, const AlignmentInfo &alignment) const {
size_t tsize = phraseT.GetSize();
size_t ssize = phraseS.GetSize();
AlignVector alignedToT (tsize);
AlignVector alignedToS (ssize);
AlignmentInfo::const_iterator iter;
for (iter = alignment.begin(); iter != alignment.end(); ++iter) {
const pair<size_t,size_t> &alignPair = *iter;
size_t s = alignPair.first;
size_t t = alignPair.second;
if (s >= ssize || t >= tsize) {
cerr << "Error: inconsistent alignment for phrase pair: " << phraseS << " - " << phraseT << endl;
cerr << "phrase pair will be discarded" << endl;
throw AlignmentException();
}
alignedToT[t].insert( s );
alignedToS[s].insert( t );
}
return make_pair(alignedToT,alignedToS);
}
double PhraseDictionaryMultiModelCounts::ComputeWeightedLexicalTranslation( const Phrase &phraseS, const Phrase &phraseT, AlignVector &alignment, const vector<lexicalTable*> &tables, vector<float> &multimodelweights, const vector<FactorType> &input_factors, const vector<FactorType> &output_factors ) const {
// lexical translation probability
double lexScore = 1.0;
string null = "NULL";
// all target words have to be explained
for(size_t ti=0; ti<alignment.size(); ti++) {
const set< size_t > & srcIndices = alignment[ ti ];
Word t_word = phraseT.GetWord(ti);
string ti_str = t_word.GetString(output_factors, false);
if (srcIndices.empty()) {
// explain unaligned word by NULL
lexScore *= GetLexicalProbability( null, ti_str, tables, multimodelweights );
} else {
// go through all the aligned words to compute average
double thisWordScore = 0;
for (set< size_t >::const_iterator si(srcIndices.begin()); si != srcIndices.end(); ++si) {
string s_str = phraseS.GetWord(*si).GetString(input_factors, false);
thisWordScore += GetLexicalProbability( s_str, ti_str, tables, multimodelweights );
}
lexScore *= thisWordScore / srcIndices.size();
}
}
return lexScore;
}
lexicalCache PhraseDictionaryMultiModelCounts::CacheLexicalStatistics( const Phrase &phraseS, const Phrase &phraseT, AlignVector &alignment, const vector<lexicalTable*> &tables, const vector<FactorType> &input_factors, const vector<FactorType> &output_factors ) {
//do all the necessary lexical table lookups and get counts, but don't apply weights yet
string null = "NULL";
lexicalCache ret;
// all target words have to be explained
for(size_t ti=0; ti<alignment.size(); ti++) {
const set< size_t > & srcIndices = alignment[ ti ];
Word t_word = phraseT.GetWord(ti);
string ti_str = t_word.GetString(output_factors, false);
vector<lexicalPair> ti_vector;
if (srcIndices.empty()) {
// explain unaligned word by NULL
vector<float> joint_count (m_numModels);
vector<float> marginals (m_numModels);
FillLexicalCountsJoint(null, ti_str, joint_count, tables);
FillLexicalCountsMarginal(null, marginals, tables);
ti_vector.push_back(make_pair(joint_count, marginals));
} else {
for (set< size_t >::const_iterator si(srcIndices.begin()); si != srcIndices.end(); ++si) {
string s_str = phraseS.GetWord(*si).GetString(input_factors, false);
vector<float> joint_count (m_numModels);
vector<float> marginals (m_numModels);
FillLexicalCountsJoint(s_str, ti_str, joint_count, tables);
FillLexicalCountsMarginal(s_str, marginals, tables);
ti_vector.push_back(make_pair(joint_count, marginals));
}
}
ret.push_back(ti_vector);
}
return ret;
}
double PhraseDictionaryMultiModelCounts::ComputeWeightedLexicalTranslationFromCache( lexicalCache &cache, vector<float> &weights ) const {
// lexical translation probability
double lexScore = 1.0;
for (lexicalCache::const_iterator iter = cache.begin(); iter != cache.end(); ++iter) {
vector<lexicalPair> t_vector = *iter;
double thisWordScore = 0;
for ( vector<lexicalPair>::const_iterator iter2 = t_vector.begin(); iter2 != t_vector.end(); ++iter2) {
vector<float> joint_count = iter2->first;
vector<float> marginal = iter2->second;
thisWordScore += m_combineFunction(joint_count, marginal, weights);
}
lexScore *= thisWordScore / t_vector.size();
}
return lexScore;
}
// get lexical probability for single word alignment pair
double PhraseDictionaryMultiModelCounts::GetLexicalProbability( string &wordS, string &wordT, const vector<lexicalTable*> &tables, vector<float> &multimodelweights ) const {
vector<float> joint_count (m_numModels);
vector<float> marginals (m_numModels);
FillLexicalCountsJoint(wordS, wordT, joint_count, tables);
FillLexicalCountsMarginal(wordS, marginals, tables);
double lexProb = m_combineFunction(joint_count, marginals, multimodelweights);
return lexProb;
}
void PhraseDictionaryMultiModelCounts::FillLexicalCountsJoint(string &wordS, string &wordT, vector<float> &count, const vector<lexicalTable*> &tables) const {
for (size_t i=0;i < m_numModels;i++) {
lexicalMapJoint::iterator joint_s = tables[i]->joint.find( wordS );
if (joint_s == tables[i]->joint.end()) count[i] = 0.0;
else {
lexicalMap::iterator joint_t = joint_s->second.find( wordT );
if (joint_t == joint_s->second.end()) count[i] = 0.0;
else count[i] = joint_t->second;
}
}
}
void PhraseDictionaryMultiModelCounts::FillLexicalCountsMarginal(string &wordS, vector<float> &count, const vector<lexicalTable*> &tables) const {
for (size_t i=0;i < m_numModels;i++) {
lexicalMap::iterator marginal_s = tables[i]->marginal.find( wordS );
if (marginal_s == tables[i]->marginal.end()) count[i] = 0.0;
else count[i] = marginal_s->second;
}
}
void PhraseDictionaryMultiModelCounts::LoadLexicalTable( string &fileName, lexicalTable* ltable) {
cerr << "Loading lexical translation table from " << fileName;
ifstream inFile;
inFile.open(fileName.c_str());
if (inFile.fail()) {
cerr << " - ERROR: could not open file\n";
exit(1);
}
istream *inFileP = &inFile;
char line[LINE_MAX_LENGTH];
int i=0;
while(true) {
i++;
if (i%100000 == 0) cerr << "." << flush;
SAFE_GETLINE((*inFileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
if (inFileP->eof()) break;
vector<string> token = tokenize( line );
if (token.size() != 4) {
cerr << "line " << i << " in " << fileName
<< " has wrong number of tokens, skipping:\n"
<< token.size() << " " << token[0] << " " << line << endl;
continue;
}
double joint = atof( token[2].c_str() );
double marginal = atof( token[3].c_str() );
string wordT = token[0];
string wordS = token[1];
ltable->joint[ wordS ][ wordT ] = joint;
ltable->marginal[ wordS ] = marginal;
}
cerr << endl;
}
void PhraseDictionaryMultiModelCounts::CleanUpComponentModels(const InputType &source) {
for(size_t i = 0; i < m_numModels; ++i){
m_pd[i]->CleanUp(source);
m_inverse_pd[i]->CleanUp(source);
}
}
#ifdef WITH_DLIB
vector<float> PhraseDictionaryMultiModelCounts::MinimizePerplexity(vector<pair<string, string> > &phrase_pair_vector) {
const StaticData &staticData = StaticData::Instance();
const string& factorDelimiter = staticData.GetFactorDelimiter();
map<pair<string, string>, size_t> phrase_pair_map;
for ( vector<pair<string, string> >::const_iterator iter = phrase_pair_vector.begin(); iter != phrase_pair_vector.end(); ++iter ) {
phrase_pair_map[*iter] += 1;
}
vector<multiModelCountsStatisticsOptimization*> optimizerStats;
for ( map<pair<string, string>, size_t>::iterator iter = phrase_pair_map.begin(); iter != phrase_pair_map.end(); ++iter ) {
pair<string, string> phrase_pair = iter->first;
string source_string = phrase_pair.first;
string target_string = phrase_pair.second;
vector<float> fs(m_numModels);
map<string,multiModelCountsStatistics*>* allStats = new(map<string,multiModelCountsStatistics*>);
Phrase sourcePhrase(0);
sourcePhrase.CreateFromString(m_input, source_string, factorDelimiter);
CollectSufficientStatistics(sourcePhrase, fs, allStats); //optimization potential: only call this once per source phrase
//phrase pair not found; leave cache empty
if (allStats->find(target_string) == allStats->end()) {
RemoveAllInMap(*allStats);
delete allStats;
continue;
}
multiModelCountsStatisticsOptimization * targetStatistics = new multiModelCountsStatisticsOptimization();
targetStatistics->targetPhrase = new TargetPhrase(*(*allStats)[target_string]->targetPhrase);
targetStatistics->fs = fs;
targetStatistics->fst = (*allStats)[target_string]->fst;
targetStatistics->ft = (*allStats)[target_string]->ft;
targetStatistics->f = iter->second;
try {
pair<vector< set<size_t> >, vector< set<size_t> > > alignment = GetAlignmentsForLexWeights(sourcePhrase, static_cast<const Phrase&>(*targetStatistics->targetPhrase), targetStatistics->targetPhrase->GetAlignTerm());
targetStatistics->lexCachee2f = CacheLexicalStatistics(static_cast<const Phrase&>(*targetStatistics->targetPhrase), sourcePhrase, alignment.second, m_lexTable_e2f, m_output, m_input );
targetStatistics->lexCachef2e = CacheLexicalStatistics(sourcePhrase, static_cast<const Phrase&>(*targetStatistics->targetPhrase), alignment.first, m_lexTable_f2e, m_input, m_output );
optimizerStats.push_back(targetStatistics);
}
catch (AlignmentException& e) {}
RemoveAllInMap(*allStats);
delete allStats;
}
Sentence sentence;
CleanUp(sentence); // free memory used by compact phrase tables
vector<float> ret (m_numModels*4);
for (size_t iFeature=0; iFeature < 4; iFeature++) {
CrossEntropyCounts * ObjectiveFunction = new CrossEntropyCounts(optimizerStats, this, iFeature);
vector<float> weight_vector = Optimize(ObjectiveFunction, m_numModels);
if (m_mode == "interpolate") {
weight_vector = normalizeWeights(weight_vector);
}
else if (m_mode == "instance_weighting") {
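// for instance weighting, the weights are only defined up to a scaling factor,
// so normalize them by fixing the first model's weight to 1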
float first_value = weight_vector[0];
for (size_t i=0; i < m_numModels; i++) {
weight_vector[i] = weight_vector[i]/first_value;
}
}
cerr << "Weight vector for feature " << iFeature << ": ";
for (size_t i=0; i < m_numModels; i++) {
ret[(iFeature*m_numModels)+i] = weight_vector[i];
cerr << weight_vector[i] << " ";
}
cerr << endl;
delete ObjectiveFunction;
}
RemoveAllInColl(optimizerStats);
return ret;
}
double CrossEntropyCounts::operator() ( const dlib::matrix<double,0,1>& arg) const
{
double total = 0.0;
double n = 0.0;
std::vector<float> weight_vector (m_model->m_numModels);
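// dlib passes the current search point as a column vector; copy it into one weight per component model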
for (int i=0; i < arg.nr(); i++) {
weight_vector[i] = arg(i);
}
if (m_model->m_mode == "interpolate") {
weight_vector = m_model->normalizeWeights(weight_vector);
}
for ( std::vector<multiModelCountsStatisticsOptimization*>::const_iterator iter = m_optimizerStats.begin(); iter != m_optimizerStats.end(); ++iter ) {
multiModelCountsStatisticsOptimization* statistics = *iter;
size_t f = statistics->f;
double score;
if (m_iFeature == 0) {
score = m_model->m_combineFunction(statistics->fst, statistics->ft, weight_vector);
}
else if (m_iFeature == 1) {
score = m_model->ComputeWeightedLexicalTranslationFromCache(statistics->lexCachee2f, weight_vector);
}
else if (m_iFeature == 2) {
score = m_model->m_combineFunction(statistics->fst, statistics->fs, weight_vector);
}
else if (m_iFeature == 3) {
score = m_model->ComputeWeightedLexicalTranslationFromCache(statistics->lexCachef2e, weight_vector);
}
else {
score = 0;
UserMessage::Add("Trying to optimize feature that I don't know. Aborting");
CHECK(false);
}
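// TransformScore() returns a natural logarithm; dividing by TransformScore(2) = ln(2)
// converts it to log2, so 'total' accumulates the negative log2 probability of each
// phrase pair, weighted by its tuning-set frequency f (total/n is the cross-entropy in bits)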
total -= (FloorScore(TransformScore(score))/TransformScore(2))*f;
n += f;
}
return total/n;
}
#endif
// calculate weighted probability based on instance weighting of joint counts and marginal counts
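// weighted conditional probability: (sum_i w_i * joint_i) / (sum_i w_i * marginal_i)
// e.g. with two models, joint counts {3, 1}, marginals {10, 4} and weights {1.0, 0.5}:
// (1.0*3 + 0.5*1) / (1.0*10 + 0.5*4) = 3.5 / 12 ≈ 0.29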
double InstanceWeighting(vector<float> &joint_counts, vector<float> &marginals, vector<float> &multimodelweights) {
double joint_counts_weighted = inner_product(joint_counts.begin(), joint_counts.end(), multimodelweights.begin(), 0.0);
double marginals_weighted = inner_product(marginals.begin(), marginals.end(), multimodelweights.begin(), 0.0);
if (marginals_weighted == 0) {
return 0;
}
else {
return joint_counts_weighted/marginals_weighted;
}
}
// calculate linear interpolation of relative frequency estimates based on joint count and marginal counts
//unused for now; enable in config?
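// interpolated estimate: sum_i w_i * (joint_i / marginal_i), skipping models with a zero marginal
// e.g. with the counts and weights from the example above: 1.0*(3/10) + 0.5*(1/4) = 0.425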
double LinearInterpolationFromCounts(vector<float> &joint_counts, vector<float> &marginals, vector<float> &multimodelweights) {
vector<float> p(marginals.size());
for (size_t i=0;i < marginals.size();i++) {
if (marginals[i] != 0) {
p[i] = joint_counts[i]/marginals[i];
}
}
double p_weighted = inner_product(p.begin(), p.end(), multimodelweights.begin(), 0.0);
return p_weighted;
}
} //namespace

View File

@ -0,0 +1,149 @@
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#ifndef moses_PhraseDictionaryMultiModelCounts_h
#define moses_PhraseDictionaryMultiModelCounts_h
#include "moses/TranslationModel/PhraseDictionaryMultiModel.h"
#include "moses/TranslationModel/PhraseDictionaryMemory.h"
#ifndef WIN32
#include "moses/TranslationModel/CompactPT/PhraseDictionaryCompact.h"
#endif
#include <boost/unordered_map.hpp>
#include "moses/StaticData.h"
#include "moses/TargetPhrase.h"
#include "moses/Util.h"
#include "moses/UserMessage.h"
#include <exception>
extern std::vector<std::string> tokenize( const char*);
namespace Moses
{
typedef boost::unordered_map<std::string, double > lexicalMap;
typedef boost::unordered_map<std::string, lexicalMap > lexicalMapJoint;
typedef std::pair<std::vector<float>, std::vector<float> > lexicalPair;
typedef std::vector<std::vector<lexicalPair> > lexicalCache;
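// lexicalCache: outer vector over the words of one phrase, inner vector over the words aligned to them,
// each entry a (joint counts, marginal counts) pair with one value per component model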
struct multiModelCountsStatistics : multiModelStatistics {
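// fst: joint counts c(s,t) per component model; ft: target-side marginal counts c(t)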
std::vector<float> fst, ft;
};
struct multiModelCountsStatisticsOptimization: multiModelCountsStatistics {
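// fs: source-side marginal counts c(s) per component model; lexCachee2f/lexCachef2e: cached
// lexical counts for both translation directions; f: frequency of the phrase pair in the tuning set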
std::vector<float> fs;
lexicalCache lexCachee2f, lexCachef2e;
size_t f;
};
struct lexicalTable {
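// joint[wordS][wordT] = c(s,t), marginal[wordS] = c(s), as filled by LoadLexicalTable()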
lexicalMapJoint joint;
lexicalMap marginal;
};
double InstanceWeighting(std::vector<float> &joint_counts, std::vector<float> &marginals, std::vector<float> &multimodelweights);
double LinearInterpolationFromCounts(std::vector<float> &joint_counts, std::vector<float> &marginals, std::vector<float> &multimodelweights);
//thrown if alignment information does not match phrase pair (out-of-bound alignment points)
class AlignmentException : public std::runtime_error {
public:
AlignmentException() : std::runtime_error("AlignmentException") { }
};
/** Implementation of a phrase table with raw counts.
*/
class PhraseDictionaryMultiModelCounts: public PhraseDictionaryMultiModel
{
#ifdef WITH_DLIB
friend class CrossEntropyCounts;
#endif
typedef std::vector< std::set<size_t> > AlignVector;
public:
PhraseDictionaryMultiModelCounts(size_t m_numScoreComponent, PhraseDictionaryFeature* feature);
~PhraseDictionaryMultiModelCounts();
bool Load(const std::vector<FactorType> &input
, const std::vector<FactorType> &output
, const std::vector<std::string> &files
, const std::vector<float> &weight
, size_t tableLimit
, const LMList &languageModels
, float weightWP);
TargetPhraseCollection* CreateTargetPhraseCollectionCounts(const Phrase &src, std::vector<float> &fs, std::map<std::string,multiModelCountsStatistics*>* allStats, std::vector<std::vector<float> > &multimodelweights) const;
void CollectSufficientStatistics(const Phrase &src, std::vector<float> &fs, std::map<std::string,multiModelCountsStatistics*>* allStats) const;
float GetTargetCount(const Phrase& target, size_t modelIndex) const;
double GetLexicalProbability( std::string &inner, std::string &outer, const std::vector<lexicalTable*> &tables, std::vector<float> &multimodelweights ) const;
double ComputeWeightedLexicalTranslation( const Phrase &phraseS, const Phrase &phraseT, AlignVector &alignment, const std::vector<lexicalTable*> &tables, std::vector<float> &multimodelweights, const std::vector<FactorType> &input_factors, const std::vector<FactorType> &output_factors ) const;
double ComputeWeightedLexicalTranslationFromCache( std::vector<std::vector<std::pair<std::vector<float>, std::vector<float> > > > &cache, std::vector<float> &weights ) const;
std::pair<PhraseDictionaryMultiModelCounts::AlignVector,PhraseDictionaryMultiModelCounts::AlignVector> GetAlignmentsForLexWeights(const Phrase &phraseS, const Phrase &phraseT, const AlignmentInfo &alignment) const;
std::vector<std::vector<std::pair<std::vector<float>, std::vector<float> > > > CacheLexicalStatistics( const Phrase &phraseS, const Phrase &phraseT, AlignVector &alignment, const std::vector<lexicalTable*> &tables, const std::vector<FactorType> &input_factors, const std::vector<FactorType> &output_factors );
void FillLexicalCountsJoint(std::string &wordS, std::string &wordT, std::vector<float> &count, const std::vector<lexicalTable*> &tables) const;
void FillLexicalCountsMarginal(std::string &wordS, std::vector<float> &count, const std::vector<lexicalTable*> &tables) const;
void LoadLexicalTable( std::string &fileName, lexicalTable* ltable);
const TargetPhraseCollection* GetTargetPhraseCollection(const Phrase& src) const;
void CleanUpComponentModels(const InputType &source);
#ifdef WITH_DLIB
std::vector<float> MinimizePerplexity(std::vector<std::pair<std::string, std::string> > &phrase_pair_vector);
#endif
// functions below required by base class
virtual void InitializeForInput(InputType const&) {
/* Don't do anything source specific here as this object is shared between threads.*/
}
private:
std::vector<PhraseDictionary*> m_inverse_pd;
std::vector<lexicalTable*> m_lexTable_e2f, m_lexTable_f2e;
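// pointer to the count combination function (InstanceWeighting; LinearInterpolationFromCounts is currently unused, cf. the .cpp)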
double (*m_combineFunction) (std::vector<float> &joint_counts, std::vector<float> &marginals, std::vector<float> &multimodelweights);
};
#ifdef WITH_DLIB
class CrossEntropyCounts: public OptimizationObjective
{
public:
CrossEntropyCounts (
std::vector<multiModelCountsStatisticsOptimization*> &optimizerStats,
PhraseDictionaryMultiModelCounts * model,
size_t iFeature
)
{
m_optimizerStats = optimizerStats;
m_model = model;
m_iFeature = iFeature;
}
double operator() ( const dlib::matrix<double,0,1>& arg) const;
private:
std::vector<multiModelCountsStatisticsOptimization*> m_optimizerStats;
PhraseDictionaryMultiModelCounts * m_model;
size_t m_iFeature;
};
#endif
} // end namespace
#endif

View File

@ -140,6 +140,8 @@ enum PhraseTableImplementation {
,FuzzyMatch = 11
,Compact = 12
,Interpolated = 13
,MultiModelCounts = 98
,MultiModel = 99
};
enum InputTypeEnum {

View File

@ -310,7 +310,7 @@ inline float CalcTranslationScore(const std::vector<float> &probVector,
return out.str(); \
} \
//! delete and remove every element of a collection object such as map, set, list etc
//! delete and remove every element of a collection object such as set, list etc
template<class COLL>
void RemoveAllInColl(COLL &coll)
{
@ -320,6 +320,17 @@ void RemoveAllInColl(COLL &coll)
coll.clear();
}
//! delete and remove every element of map
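//! (only the mapped values, iter->second, are deleted; the keys are removed by clear())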
template<class COLL>
void RemoveAllInMap(COLL &coll)
{
for (typename COLL::const_iterator iter = coll.begin() ; iter != coll.end() ; ++iter) {
delete (iter->second);
}
coll.clear();
}
//! x-platform reference to temp folder
std::string GetTempFolder();
//! MD5 hash of a file

View File

@ -0,0 +1,153 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: Rico Sennrich <sennrich [AT] cl.uzh.ch>
# This script creates tables that store phrase pair frequencies rather than probabilities.
# These count tables allow a delayed, online computation of the original phrase translation features.
# The benefit is that component models can be combined quickly, with (nearly) the same result as training a single model on the concatenation of all data;
# results may still differ slightly because of differences in word alignment and rounding errors.
# Each component model can also be given a weight, which is applied to all of its frequencies during the combination.
# Note: the input phrase table must contain alignment information (--phrase-word-alignment in train-model.perl)
# and must be unsmoothed;
# in addition, the online model type requires the lexical count files lex.counts.e2f and lex.counts.f2e in the same folder (--write-lexical-counts in train-model.perl).
from __future__ import unicode_literals
import sys
import os
import gzip
from tempfile import NamedTemporaryFile
from subprocess import Popen, PIPE
if len(sys.argv) < 3 or len(sys.argv) > 4:
sys.stderr.write('Usage: ' + sys.argv[0] + ' in_file out_path [prune_count]\nThis script will create the files out_path/count-table.gz and out_path/count-table-target.gz\n')
exit()
def handle_file(filename,action,fileobj=None,mode='r'):
"""support reading either from stdin, plain file or gzipped file"""
if action == 'open':
if mode == 'r':
mode = 'rb'
if mode == 'rb' and not filename == '-' and not os.path.exists(filename):
if os.path.exists(filename+'.gz'):
filename = filename+'.gz'
else:
sys.stderr.write('Error: unable to open file. ' + filename + ' - aborting.\n')
exit()
if filename.endswith('.gz'):
fileobj = gzip.open(filename,mode)
elif filename == '-':
fileobj = sys.stdin
else:
fileobj = open(filename,mode)
return fileobj
elif action == 'close' and filename != '-':
fileobj.close()
def sort_and_uniq(infile, outfile):
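# byte-sort the unsorted target count file (LC_ALL=C), remove duplicate lines and gzip-compress the result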
cmd = ['sort', infile]
fobj = handle_file(outfile, 'open', mode='w')
sys.stderr.write('Executing: LC_ALL=C ' + ' '.join(cmd) + ' | uniq | gzip -c > ' + outfile + '\n')
p_sort = Popen(cmd, env={'LC_ALL':'C'}, stdout=PIPE)
p_uniq = Popen(['uniq'], stdin = p_sort.stdout, stdout=PIPE)
p_compress = Popen(['gzip', '-c'], stdin = p_uniq.stdout, stdout=fobj)
p_compress.wait()
fobj.close()
def create_count_lines(fobj, countobj, countobj_target, prune=0):
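# each input line is a Moses phrase table entry whose counts field is read as "c(t) c(s) [c(s,t)]";
# the score column is replaced by raw counts "c(s,t) c(t) c(s)", and a separate target count table is written;
# if c(s,t) is missing, it is reconstructed as round(scores[0]*c(t))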
i = 0
original_pos = 0
source = ""
store_lines = set()
for line in fobj:
if not i % 100000:
sys.stderr.write('.')
i += 1
line = line.split(b' ||| ')
current_source = line[0]
scores = line[2].split()
comments = line[4].split()
fs = comments[1]
ft = comments[0]
try:
fst = comments[2]
except IndexError:
fst = str(int(round(float(scores[0])*float(ft)))).encode()
line[2] = b' '.join([fst,ft,fs])
if prune:
if current_source != source:
write_batch(store_lines, countobj, prune)
source = current_source
store_lines = set()
original_pos = 0
store_lines.add((float(fst), original_pos, b' ||| '.join(line)))
original_pos += 1
else:
countobj.write(b' ||| '.join(line))
# target count file
tline = b' ||| '.join([line[1], b'X', ft]) + b' ||| |||\n' # if you use string formatting to make this look nicer, you may break Python 3 compatibility.
countobj_target.write(tline)
if prune:
write_batch(store_lines, countobj, prune)
countobj.close()
countobj_target.close()
def write_batch(store_lines, outfile, prune):
best = sorted(store_lines, reverse=True)[:prune]
for score, original_pos, store_line in sorted(best, key=lambda x: x[1]): # write the pruned entries in their original order
outfile.write(store_line)
if __name__ == '__main__':
if len(sys.argv) == 4:
prune = int(sys.argv[3])
else:
prune = 0
fileobj = handle_file(sys.argv[1],'open')
out_path = sys.argv[2]
count_table_file = gzip.open(os.path.join(out_path,'count-table.gz'), 'w')
count_table_target_file = os.path.join(out_path,'count-table-target.gz')
count_table_target_file_temp = NamedTemporaryFile(delete=False)
try:
sys.stderr.write('Creating temporary file for unsorted target counts file: ' + count_table_target_file_temp.name + '\n')
create_count_lines(fileobj, count_table_file, count_table_target_file_temp, prune)
count_table_target_file_temp.close()
sys.stderr.write('Finished writing, now re-sorting and compressing target count file\n')
sort_and_uniq(count_table_target_file_temp.name, count_table_target_file)
os.remove(count_table_target_file_temp.name)
sys.stderr.write('Done\n')
except BaseException:
os.remove(count_table_target_file_temp.name)
raise