Merge branch 'master' of github.com:moses-smt/mosesdecoder

This commit is contained in:
Ulrich Germann 2013-09-14 11:21:15 +01:00
commit 6ea0bb1f61
20 changed files with 238 additions and 186 deletions

View File

@ -76,6 +76,10 @@ include $(TOP)/jam-files/sanity.jam ;
boost 103600 ;
external-lib z ;
lib dl : : <runtime-link>static:<link>static <runtime-link>shared:<link>shared ;
requirements += <library>dl ;
if ! [ option.get "without-tcmalloc" : : "yes" ] && [ test_library "tcmalloc_minimal" ] {
if [ option.get "full-tcmalloc" : : "yes" ] {
external-lib unwind ;

View File

@ -1066,6 +1066,16 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/DistortionScoreProducer.h</locationURI>
</link>
<link>
<name>FF/ExternalFeature.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/ExternalFeature.cpp</locationURI>
</link>
<link>
<name>FF/ExternalFeature.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/ExternalFeature.h</locationURI>
</link>
<link>
<name>FF/FFState.cpp</name>
<type>1</type>

View File

@ -93,6 +93,9 @@ void ChartManager::ProcessSentence()
m_parser.Create(range, m_translationOptionList);
m_translationOptionList.ApplyThreshold();
const InputPath &inputPath = m_parser.GetInputPath(range);
m_translationOptionList.Evaluate(m_source, inputPath);
// decode
ChartCell &cell = m_hypoStackColl.Get(range);

View File

@ -219,6 +219,11 @@ void ChartParser::CreateInputPaths(const InputType &input)
}
}
/** Convenience overload: fetch the input path for a span given as a WordsRange.
 *  Forwards the range's start and end positions to the
 *  (startPos, endPos) overload and returns its result unchanged.
 */
const InputPath &ChartParser::GetInputPath(WordsRange &range) const
{
  const size_t firstPos = range.GetStartPos();
  const size_t lastPos = range.GetEndPos();
  return GetInputPath(firstPos, lastPos);
}
const InputPath &ChartParser::GetInputPath(size_t startPos, size_t endPos) const
{
size_t offset = endPos - startPos;

View File

@ -66,6 +66,7 @@ public:
long GetTranslationId() const;
size_t GetSize() const;
const InputPath &GetInputPath(size_t startPos, size_t endPos) const;
const InputPath &GetInputPath(WordsRange &range) const;
private:
ChartParserUnknown m_unknown;

View File

@ -13,6 +13,7 @@ class ChartTranslationOption
protected:
const TargetPhrase &m_targetPhrase;
ScoreComponentCollection m_scoreBreakdown;
const InputPath *m_inputPath;
public:
ChartTranslationOption(const TargetPhrase &targetPhrase);
@ -21,6 +22,11 @@ public:
return m_targetPhrase;
}
void SetInputPath(const InputPath *inputPath)
{ m_inputPath = inputPath; }
//! Return the pointer previously stored with SetInputPath().
const InputPath *GetInputPath() const {
  return this->m_inputPath;
}
//! Read-only access to this option's score breakdown.
const ScoreComponentCollection &GetScores() const
{
  return this->m_scoreBreakdown;
}

View File

@ -69,6 +69,7 @@ void ChartTranslationOptions::Evaluate(const InputType &input, const InputPath &
CollType::iterator iter;
for (iter = m_collection.begin(); iter != m_collection.end(); ++iter) {
ChartTranslationOption &transOpt = **iter;
transOpt.SetInputPath(&inputPath);
transOpt.Evaluate(input, inputPath);
}

View File

@ -0,0 +1,73 @@
#include "ExternalFeature.h"
#include <dlfcn.h>
using namespace std;
namespace Moses
{
/**
 * Build a state that owns a private copy of \p data.
 *
 * \param stateSize number of bytes to copy from \p data
 * \param data      source buffer; only read during construction
 *
 * When \p data is NULL or \p stateSize is not positive there is nothing to
 * copy, so the state is left without a payload (m_data == NULL) rather than
 * handing malloc()/memcpy() an invalid size or pointer. An allocation failure
 * aborts, matching the error handling used elsewhere in this file.
 */
ExternalFeatureState::ExternalFeatureState(int stateSize, void *data)
{
  m_stateSize = stateSize;
  m_data = NULL;

  if (data == NULL || stateSize <= 0) {
    return;
  }

  m_data = malloc(stateSize);
  if (m_data == NULL) {
    cerr << "ExternalFeatureState: unable to allocate " << stateSize << " bytes\n";
    abort();
  }
  memcpy(m_data, data, stateSize);
}
/**
 * Open the shared library named by the "path" parameter, look up its
 * "create_ff" factory, instantiate the external feature and cache the size
 * of its state.
 *
 * Any failure (no path, library cannot be opened, factory symbol missing,
 * factory returns NULL) is reported on stderr and terminates via abort().
 */
void ExternalFeature::Load()
{
  // Argument handed to the plugin factory; currently a fixed placeholder.
  string nparam = "testing";

  if (m_path.size() < 1) {
    cerr << "External requires a path to a dynamic library!\n";
    abort();
  }

  lib_handle = dlopen(m_path.c_str(), RTLD_LAZY);
  if (!lib_handle) {
    // dlerror() may return NULL; streaming a NULL char* is undefined behaviour.
    const char *err = dlerror();
    cerr << "dlopen reports: " << (err ? err : "unknown error") << endl;
    cerr << "Did you provide a full path to the dynamic library?\n";
    abort();
  }

  // Discard any stale error so the message printed below belongs to dlsym().
  dlerror();
  CdecFF* (*fn)(const string&) =
    (CdecFF* (*)(const string&))(dlsym(lib_handle, "create_ff"));
  if (!fn) {
    const char *err = dlerror();
    cerr << "dlsym reports: " << (err ? err : "symbol create_ff resolved to NULL") << endl;
    abort();
  }

  ff_ext = (*fn)(nparam);
  if (!ff_ext) {
    // Without this check the StateSize() call below would dereference NULL.
    cerr << "create_ff returned no feature function\n";
    abort();
  }
  m_stateSize = ff_ext->StateSize();
}
/**
 * Destroy the plugin-provided feature object, then unload its library.
 * The object is deleted first because dlclose() may unmap the code the
 * object still needs while it is being destroyed.
 */
ExternalFeature::~ExternalFeature() {
  delete ff_ext;
  // Only close a handle that dlopen() actually produced; a NULL handle is not
  // a valid argument for dlclose().
  if (lib_handle) {
    dlclose(lib_handle);
  }
}
/**
 * Handle one key/value pair from the feature's configuration line.
 * "path" is stored as the location of the shared library; every other key
 * is passed on to StatefulFeatureFunction::SetParameter().
 */
void ExternalFeature::SetParameter(const std::string& key, const std::string& value)
{
  if (key != "path") {
    StatefulFeatureFunction::SetParameter(key, value);
    return;
  }
  m_path = value;
}
/**
 * Phrase-based stateful evaluation hook.
 * Currently a stub: none of the arguments are used and no score is added;
 * it only allocates a payload-less state of the cached size, which the
 * caller receives as a raw pointer.
 */
FFState* ExternalFeature::Evaluate(
  const Hypothesis& cur_hypo,
  const FFState* prev_state,
  ScoreComponentCollection* accumulator) const
{
  FFState *freshState = new ExternalFeatureState(m_stateSize);
  return freshState;
}
/**
 * Chart-decoding stateful evaluation hook.
 * Currently a stub: no score is written to the accumulator; it only
 * allocates a payload-less state of the cached size, which the caller
 * receives as a raw pointer.
 */
FFState* ExternalFeature::EvaluateChart(
  const ChartHypothesis& /* cur_hypo */,
  int /* featureID - used to index the state in the previous hypotheses */,
  ScoreComponentCollection* accumulator) const
{
  FFState *freshState = new ExternalFeatureState(m_stateSize);
  return freshState;
}
}

View File

@ -0,0 +1,93 @@
#pragma once
#include <cstdlib>
#include <cstring>
#include <string>
#include "StatefulFeatureFunction.h"
#include "FFState.h"
namespace Moses
{
class CdecFF;
/**
 * Fixed-size state blob attached to a hypothesis on behalf of an
 * ExternalFeature.
 *
 * The object owns m_data (allocated with malloc, released with free). A state
 * built with the single-argument constructor carries no payload
 * (m_data == NULL).
 */
class ExternalFeatureState : public FFState
{
protected:
  int m_stateSize;  //!< size in bytes requested at construction
  void *m_data;     //!< owned buffer, or NULL when there is no payload

public:
  //! Create a state of the given size without any payload.
  ExternalFeatureState(int stateSize)
    :m_stateSize(stateSize)
    ,m_data(NULL)
  {}

  //! Create a state holding a copy of stateSize bytes read from data.
  ExternalFeatureState(int stateSize, void *data);

  //! Deep copy, so that two states never free the same buffer.
  ExternalFeatureState(const ExternalFeatureState &other)
    :m_stateSize(other.m_stateSize)
    ,m_data(NULL)
  {
    CopyPayloadFrom(other);
  }

  //! Deep-copying assignment; safe under self-assignment.
  ExternalFeatureState &operator=(const ExternalFeatureState &other)
  {
    if (this != &other) {
      free(m_data);
      m_data = NULL;
      m_stateSize = other.m_stateSize;
      CopyPayloadFrom(other);
    }
    return *this;
  }

  ~ExternalFeatureState()
  {
    free(m_data);
  }

  /**
   * Three-way comparison against another ExternalFeatureState.
   *
   * \param other must actually be an ExternalFeatureState (static_cast)
   * \return negative, zero or positive. States of equal size that both carry
   *         a payload are ordered by memcmp over their bytes, as before.
   *         A state without payload sorts before one with payload, two
   *         payload-less states of equal size compare equal, and if sizes
   *         differ only the common prefix is compared, with the shorter
   *         state first on a tie. This keeps memcmp away from NULL pointers
   *         and from reading past the shorter buffer.
   */
  int Compare(const FFState& other) const
  {
    const ExternalFeatureState &otherFF = static_cast<const ExternalFeatureState&>(other);

    const bool mine = HasPayload();
    const bool theirs = otherFF.HasPayload();
    if (mine != theirs) {
      return mine ? 1 : -1;
    }

    if (mine) {
      const int common = (m_stateSize < otherFF.m_stateSize) ? m_stateSize : otherFF.m_stateSize;
      const int ret = memcmp(m_data, otherFF.m_data, common);
      if (ret != 0) {
        return ret;
      }
    }

    if (m_stateSize == otherFF.m_stateSize) {
      return 0;
    }
    return (m_stateSize < otherFF.m_stateSize) ? -1 : 1;
  }

private:
  //! True when there are bytes that may legally be read from m_data.
  bool HasPayload() const
  {
    return m_data != NULL && m_stateSize > 0;
  }

  //! Duplicate other's buffer into this object; expects m_data to be NULL.
  void CopyPayloadFrom(const ExternalFeatureState &other)
  {
    if (!other.HasPayload()) {
      return;
    }
    m_data = malloc(other.m_stateSize);
    if (m_data == NULL) {
      abort();
    }
    memcpy(m_data, other.m_data, other.m_stateSize);
  }
};
// copied from cdec
class ExternalFeature : public StatefulFeatureFunction
{
public:
ExternalFeature(const std::string &line)
:StatefulFeatureFunction("ExternalFeature", line)
{
ReadParameters();
}
~ExternalFeature();
void Load();
bool IsUseable(const FactorMask &mask) const
{ return true; }
void SetParameter(const std::string& key, const std::string& value);
void Evaluate(const Phrase &source
, const TargetPhrase &targetPhrase
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection &estimatedFutureScore) const
{}
void Evaluate(const InputType &input
, const InputPath &inputPath
, ScoreComponentCollection &scoreBreakdown) const
{}
FFState* Evaluate(
const Hypothesis& cur_hypo,
const FFState* prev_state,
ScoreComponentCollection* accumulator) const;
FFState* EvaluateChart(
const ChartHypothesis& /* cur_hypo */,
int /* featureID - used to index the state in the previous hypotheses */,
ScoreComponentCollection* accumulator) const;
virtual const FFState* EmptyHypothesisState(const InputType &input) const
{
return new ExternalFeatureState(m_stateSize);
}
protected:
std::string m_path;
void* lib_handle;
CdecFF *ff_ext;
int m_stateSize;
};
/**
 * Interface implemented by the feature object that an external library's
 * create_ff factory returns.
 *
 * The destructor is virtual because the object is destroyed through a
 * CdecFF pointer; without it, deleting a derived object through the base
 * pointer is undefined behaviour and the derived destructor may never run.
 *
 * NOTE(review): adding a virtual member changes this class's vtable, so
 * external libraries must be built against this declaration - confirm for
 * existing plugins.
 */
class CdecFF
{
public:
  //! Size of the per-hypothesis state this feature needs.
  virtual int StateSize() const = 0;

  virtual ~CdecFF() {}
};
}

View File

@ -30,6 +30,8 @@
#include "moses/FF/PhrasePenalty.h"
#include "moses/FF/OSM-Feature/OpSequenceModel.h"
#include "moses/FF/ControlRecombination.h"
#include "moses/FF/ExternalFeature.h"
#include "moses/FF/SkeletonStatelessFF.h"
#include "moses/FF/SkeletonStatefulFF.h"
@ -142,6 +144,7 @@ FeatureRegistry::FeatureRegistry()
MOSES_FNAME(ControlRecombination);
MOSES_FNAME(SkeletonStatelessFF);
MOSES_FNAME(SkeletonStatefulFF);
MOSES_FNAME(ExternalFeature);
#ifdef HAVE_SYNLM
MOSES_FNAME(SyntacticLanguageModel);

View File

@ -18,6 +18,8 @@ InputPath::InputPath(const Phrase &phrase, const NonTerminalSet &sourceNonTerms,
,m_range(range)
,m_inputScore(inputScore)
{
//cerr << "phrase=" << phrase << " m_inputScore=" << *m_inputScore << endl;
FactorType placeholderFactor = StaticData::Instance().GetPlaceholderFactor().first;
if (placeholderFactor != NOT_FOUND) {
for (size_t pos = 0; pos < m_phrase.GetSize(); ++pos) {

View File

@ -1,6 +1,6 @@
// $Id$
#include <vector>
#include "util/exception.hh"
#include "ScoreComponentCollection.h"
#include "StaticData.h"
@ -30,6 +30,20 @@ void ScorePair::PlusEquals(const StringPiece &key, float value)
}
}
std::ostream& operator<<(std::ostream& os, const ScorePair& rhs)
{
for (size_t i = 0; i < rhs.denseScores.size(); ++i) {
os << rhs.denseScores[i] << ",";
}
std::map<StringPiece, float>::const_iterator iter;
for (iter = rhs.sparseScores.begin(); iter != rhs.sparseScores.end(); ++iter) {
os << iter->first << "=" << iter->second << ",";
}
return os;
}
ScoreComponentCollection::ScoreIndexMap ScoreComponentCollection::s_scoreIndexes;
size_t ScoreComponentCollection::s_denseVectorSize = 0;
@ -206,6 +220,21 @@ void ScoreComponentCollection::Assign(const FeatureFunction* sp, const string li
}
}
void ScoreComponentCollection::Assign(const FeatureFunction* sp, const std::vector<float>& scores) {
IndexPair indexes = GetIndexes(sp);
size_t numScores = indexes.second - indexes.first;
if (scores.size() != numScores) {
UTIL_THROW(util::Exception, "Feature function " << sp->GetScoreProducerDescription() << " specified "
<< numScores << " dense scores or weights. Actually has " << scores.size());
}
for (size_t i = 0; i < scores.size(); ++i) {
m_scores[i + indexes.first] = scores[i];
}
}
void ScoreComponentCollection::InvertDenseFeatures(const FeatureFunction* sp)
{

View File

@ -46,6 +46,8 @@ namespace Moses
*/
struct ScorePair
{
friend std::ostream& operator<<(std::ostream& os, const ScorePair& rhs);
std::vector<float> denseScores;
std::map<StringPiece, float> sparseScores;
@ -262,13 +264,7 @@ public:
m_scores[fname] += score;
}
//! Overwrite the dense scores of one feature function; the number of values
//! must equal the width of the slice returned by GetIndexes(sp).
void Assign(const FeatureFunction* sp, const std::vector<float>& scores) {
  IndexPair indexes = GetIndexes(sp);
  CHECK(scores.size() == indexes.second - indexes.first);

  size_t slot = indexes.first;
  for (std::vector<float>::const_iterator value = scores.begin(); value != scores.end(); ++value, ++slot) {
    m_scores[slot] = *value;
  }
}
void Assign(const FeatureFunction* sp, const std::vector<float>& scores);
//! Special version Assign(ScoreProducer, vector<float>)
//! to add the score from a single ScoreProducer that produces

View File

@ -1,44 +0,0 @@
//
// ExtractedRule.cpp
// extract
//
// Created by Hieu Hoang on 13/09/2011.
// Copyright 2011 __MyCompanyName__. All rights reserved.
//
#include "ExtractedRule.h"
using namespace std;
namespace MosesTraining
{
/**
 * Write the non-terminal span lengths of this rule to an arbitrary stream.
 * The text is first built by the std::ostringstream overload and then
 * copied to \p out.
 */
void ExtractedRule::OutputNTLengths(std::ostream &out) const
{
  ostringstream outString;
  OutputNTLengths(outString);
  // Stream the accumulated text, not the stream object itself: inserting the
  // ostringstream prints a pointer value on old compilers and does not
  // compile on newer ones.
  out << outString.str();
}
/**
 * Append one "sourcePos=sourceLength,targetLength " item per entry of
 * m_ntLengths to \p outString, in the map's key order.
 */
void ExtractedRule::OutputNTLengths(std::ostringstream &outString) const
{
  typedef std::map<size_t, std::pair<size_t, size_t> > NTLengthMap;

  NTLengthMap::const_iterator entry = m_ntLengths.begin();
  while (entry != m_ntLengths.end()) {
    outString << entry->first << "=" << entry->second.first << "," <<entry->second.second << " ";
    ++entry;
  }
}
/**
 * Print a rule as
 *   source ||| target ||| alignment ||| alignmentInv ||| <NT lengths>
 * and return the stream for chaining.
 */
std::ostream& operator<<(std::ostream &out, const ExtractedRule &obj)
{
  out << obj.source << " ||| ";
  out << obj.target << " ||| ";
  out << obj.alignment << " ||| ";
  out << obj.alignmentInv << " ||| ";

  obj.OutputNTLengths(out);

  return out;
}
} // namespace

View File

@ -32,8 +32,6 @@ namespace MosesTraining
// sentence-level collection of rules
class ExtractedRule
{
friend std::ostream& operator<<(std::ostream &, const ExtractedRule &);
public:
std::string source;
std::string target;
@ -54,8 +52,6 @@ public:
float count;
double pcfgScore;
std::map<size_t, std::pair<size_t, size_t> > m_ntLengths;
ExtractedRule(int sT, int eT, int sS, int eS)
: source()
, target()
@ -76,13 +72,6 @@ public:
, count(0)
, pcfgScore(0.0) {
}
//! Record (or overwrite) the source/target span lengths of the non-terminal
//! located at sourcePos.
void SetSpanLength(size_t sourcePos, size_t sourceLength, size_t targetLength)
{
  const std::pair<size_t, size_t> spanLengths(sourceLength, targetLength);
  m_ntLengths[sourcePos] = spanLengths;
}
void OutputNTLengths(std::ostream &out) const;
void OutputNTLengths(std::ostringstream &out) const;
};
}

View File

@ -49,7 +49,6 @@ public:
bool duplicateRules;
bool fractionalCounting;
bool pcfgScore;
bool outputNTLengths;
bool gzOutput;
bool unpairedExtractFormat;
bool conditionOnTargetLhs;
@ -83,7 +82,6 @@ public:
, duplicateRules(true)
, fractionalCounting(true)
, pcfgScore(false)
, outputNTLengths(false)
, gzOutput(false)
, unpairedExtractFormat(false)
, conditionOnTargetLhs(false)

View File

@ -41,7 +41,6 @@ bool lowCountFlag = false;
bool goodTuringFlag = false;
bool kneserNeyFlag = false;
bool logProbFlag = false;
bool outputNTLengths = false;
inline float maybeLogProb( float a )
{
return logProbFlag ? log(a) : a;
@ -62,7 +61,7 @@ int main(int argc, char* argv[])
<< "consolidating direct and indirect rule tables\n";
if (argc < 4) {
cerr << "syntax: consolidate phrase-table.direct phrase-table.indirect phrase-table.consolidated [--Hierarchical] [--OnlyDirect] [--OutputNTLengths] \n";
cerr << "syntax: consolidate phrase-table.direct phrase-table.indirect phrase-table.consolidated [--Hierarchical] [--OnlyDirect] \n";
exit(1);
}
char* &fileNameDirect = argv[1];
@ -119,8 +118,6 @@ int main(int argc, char* argv[])
} else if (strcmp(argv[i],"--LogProb") == 0) {
logProbFlag = true;
cerr << "using log-probabilities\n";
} else if (strcmp(argv[i],"--OutputNTLengths") == 0) {
outputNTLengths = true;
} else {
cerr << "ERROR: unknown option " << argv[i] << endl;
exit(1);
@ -315,10 +312,6 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
// counts, for debugging
fileConsolidated << "||| " << countE << " " << countF << " " << countEF;
if (outputNTLengths) {
fileConsolidated << " ||| " << itemDirect[5];
}
// count bin feature (as a sparse feature)
if (sparseCountBinFeatureFlag ||
directSparseScores.compare("") != 0 ||

View File

@ -129,7 +129,6 @@ int main(int argc, char* argv[])
<< " --GlueGrammar FILE"
<< " | --UnknownWordLabel FILE"
<< " | --OnlyDirect"
<< " | --OutputNTLengths"
<< " | --MaxSpan[" << options.maxSpan << "]"
<< " | --MinHoleTarget[" << options.minHoleTarget << "]"
<< " | --MinHoleSource[" << options.minHoleSource << "]"
@ -262,8 +261,6 @@ int main(int argc, char* argv[])
options.fractionalCounting = false;
} else if (strcmp(argv[i],"--PCFG") == 0) {
options.pcfgScore = true;
} else if (strcmp(argv[i],"--OutputNTLengths") == 0) {
options.outputNTLengths = true;
} else if (strcmp(argv[i],"--UnpairedExtractFormat") == 0) {
options.unpairedExtractFormat = true;
} else if (strcmp(argv[i],"--ConditionOnTargetLHS") == 0) {
@ -663,9 +660,6 @@ void ExtractTask::saveHieroAlignment( int startT, int endT, int startS, int endS
rule.alignment += sourceSymbolIndex + "-" + targetSymbolIndex + " ";
if (!m_options.onlyDirectFlag)
rule.alignmentInv += targetSymbolIndex + "-" + sourceSymbolIndex + " ";
rule.SetSpanLength(hole.GetPos(0), hole.GetSize(0), hole.GetSize(1) ) ;
}
rule.alignment.erase(rule.alignment.size()-1);
@ -1077,9 +1071,6 @@ void ExtractTask::writeRulesToFile()
<< rule->target << " ||| "
<< rule->alignment << " ||| "
<< rule->count << " ||| ";
if (m_options.outputNTLengths) {
rule->OutputNTLengths(out);
}
if (m_options.pcfgScore) {
out << " ||| " << rule->pcfgScore;
}

View File

@ -59,7 +59,6 @@ int negLogProb = 1;
bool lexFlag = true;
bool unalignedFlag = false;
bool unalignedFWFlag = false;
bool outputNTLengths = false;
bool singletonFeature = false;
bool crossedNonTerm = false;
int countOfCounts[COC_MAX+1];
@ -82,9 +81,6 @@ double computeUnalignedPenalty( const PHRASE &, const PHRASE &, const PhraseAlig
set<string> functionWordList;
void loadFunctionWords( const string &fileNameFunctionWords );
double computeUnalignedFWPenalty( const PHRASE &, const PHRASE &, const PhraseAlignment & );
void calcNTLengthProb(const vector< PhraseAlignment* > &phrasePairs
, map<size_t, map<size_t, float> > &sourceProb
, map<size_t, map<size_t, float> > &targetProb);
void printSourcePhrase(const PHRASE &, const PHRASE &, const PhraseAlignment &, ostream &);
void printTargetPhrase(const PHRASE &, const PHRASE &, const PhraseAlignment &, ostream &);
@ -95,7 +91,7 @@ int main(int argc, char* argv[])
ScoreFeatureManager featureManager;
if (argc < 4) {
cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--KneserNey] [--NoWordAlignment] [--UnalignedPenalty] [--UnalignedFunctionWordPenalty function-word-file] [--MinCountHierarchical count] [--OutputNTLengths] [--PCFG] [--UnpairedExtractFormat] [--ConditionOnTargetLHS] [--Singleton] [--CrossedNonTerm] \n";
cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--KneserNey] [--NoWordAlignment] [--UnalignedPenalty] [--UnalignedFunctionWordPenalty function-word-file] [--MinCountHierarchical count] [--PCFG] [--UnpairedExtractFormat] [--ConditionOnTargetLHS] [--Singleton] [--CrossedNonTerm] \n";
cerr << featureManager.usage() << endl;
exit(1);
}
@ -158,8 +154,6 @@ int main(int argc, char* argv[])
minCountHierarchical = atof(argv[++i]);
cerr << "dropping all phrase pairs occurring less than " << minCountHierarchical << " times\n";
minCountHierarchical -= 0.00001; // account for rounding
} else if (strcmp(argv[i],"--OutputNTLengths") == 0) {
outputNTLengths = true;
} else if (strcmp(argv[i],"--Singleton") == 0) {
singletonFeature = true;
cerr << "binary singleton feature\n";
@ -375,87 +369,6 @@ const PhraseAlignment &findBestAlignment(const PhraseAlignmentCollection &phrase
return *bestAlignment;
}
/**
 * Turn per-position length counts into relative frequencies.
 *
 * \param lengths position -> (length -> count)
 * \param total   denominator applied to every count
 * \param probs   output; probs[position][length] is set to count / total
 *                for every (position, length) pair present in \p lengths
 */
void calcNTLengthProb(const map<size_t, map<size_t, size_t> > &lengths
                      , size_t total
                      , map<size_t, map<size_t, float> > &probs)
{
  typedef map<size_t, size_t> CountsByLength;
  typedef map<size_t, CountsByLength> CountsByPos;

  CountsByPos::const_iterator posIt = lengths.begin();
  while (posIt != lengths.end()) {
    const CountsByLength &counts = posIt->second;

    CountsByLength::const_iterator lenIt = counts.begin();
    while (lenIt != counts.end()) {
      probs[posIt->first][lenIt->first] = (float) lenIt->second / (float) total;
      ++lenIt;
    }
    ++posIt;
  }
}
/**
 * Estimate, for each non-terminal position, the distribution of its source
 * and target span lengths over a set of phrase pairs.
 *
 * \param phrasePairs phrase pairs whose GetNTLengths() entries are counted
 * \param sourceProb  output: position -> (source length -> probability)
 * \param targetProb  output: position -> (target length -> probability)
 *
 * Nothing is written when no phrase pair has a non-terminal. The count of
 * the first position is used as the denominator for all positions.
 */
void calcNTLengthProb(const vector< PhraseAlignment* > &phrasePairs
                      , map<size_t, map<size_t, float> > &sourceProb
                      , map<size_t, map<size_t, float> > &targetProb)
{
  typedef std::map<size_t, std::pair<size_t, size_t> > NTLengthMap;

  // position in source phrase -> (length -> count)
  map<size_t, map<size_t, size_t> > sourceLengths, targetLengths;
  // position in source phrase -> number of observations
  map<size_t, size_t> totals;

  for (size_t pairIdx = 0; pairIdx < phrasePairs.size(); ++pairIdx) {
    const NTLengthMap &ntLengths = phrasePairs[pairIdx]->GetNTLengths();

    for (NTLengthMap::const_iterator nt = ntLengths.begin(); nt != ntLengths.end(); ++nt) {
      const size_t sourcePos = nt->first;
      const std::pair<size_t, size_t> &spanLengths = nt->second;

      ++sourceLengths[sourcePos][spanLengths.first];
      ++targetLengths[sourcePos][spanLengths.second];
      ++totals[sourcePos];
    }
  }

  if (totals.empty()) {
    // no non-terminals at all, nothing to estimate
    return;
  }

  size_t total = totals.begin()->second;
  if (totals.size() > 1) {
    // sanity check: the second position was observed as often as the first
    assert(total == (++totals.begin())->second );
  }

  calcNTLengthProb(sourceLengths, total, sourceProb);
  calcNTLengthProb(targetLengths, total, targetProb);
}
/**
 * Write every length probability as "pos|prefix|length=prob " (note the
 * trailing space) to the phrase table stream, in key order.
 *
 * \param phraseTableFile destination stream
 * \param probs           position -> (length -> probability)
 * \param prefix          tag placed between position and length
 */
void outputNTLengthProbs(ostream &phraseTableFile, const map<size_t, map<size_t, float> > &probs, const string &prefix)
{
  typedef map<size_t, float> ProbsByLength;
  typedef map<size_t, ProbsByLength> ProbsByPos;

  ProbsByPos::const_iterator posIt = probs.begin();
  while (posIt != probs.end()) {
    const ProbsByLength &byLength = posIt->second;

    ProbsByLength::const_iterator lenIt = byLength.begin();
    while (lenIt != byLength.end()) {
      phraseTableFile << posIt->first << "|" << prefix << "|" << lenIt->first << "=" << lenIt->second << " ";
      ++lenIt;
    }
    ++posIt;
  }
}
bool calcCrossedNonTerm(size_t sourcePos, size_t targetPos, const std::vector< std::set<size_t> > &alignedToS)
{
for (size_t currSource = 0; currSource < alignedToS.size(); ++currSource) {
@ -664,21 +577,6 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo
if (kneserNeyFlag)
phraseTableFile << " " << distinctCount;
// nt lengths
if (outputNTLengths) {
phraseTableFile << " ||| ";
if (!inverseFlag) {
map<size_t, map<size_t, float> > sourceProb, targetProb;
// 1st sourcePos, 2nd = length, 3rd = prob
calcNTLengthProb(phrasePair, sourceProb, targetProb);
outputNTLengthProbs(phraseTableFile, sourceProb, "S");
outputNTLengthProbs(phraseTableFile, targetProb, "T");
}
}
phraseTableFile << endl;
}

View File

@ -6,7 +6,8 @@
Distortion0= 0.3
UnknownWordPenalty0= 1
WordPenalty0= -1
TranslationModel0= 0.2 0.2 0.2 0.2 0.2
TranslationModel0= 0.2 0.2 0.2 0.2
PhrasePenalty0= 0.2
LexicalReordering0= 0.3 0.3 0.3 0.3 0.3 0.3
LM0= 0.5