Implementation and testing of target bigram feature

git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/branches/mira-mtm5@3624 1f5c12ca-751b-0410-a591-d2e778427230
This commit is contained in:
bhaddow 2010-10-14 21:52:35 +00:00
parent ec902882e0
commit 4126c19f91
11 changed files with 470 additions and 64 deletions

View File

@ -58,9 +58,9 @@ namespace Moses {
//A feature name can either be initialised as a pair of strings,
//which will be concatenated with a SEP between them, or as
//a single string, which will be used as-is.
FName(const std::string root, const std::string name)
explicit FName(const std::string root, const std::string name)
{init(root + SEP + name);}
FName(const std::string& name)
explicit FName(const std::string& name)
{init(name);}
const std::string& name() const;

View File

@ -129,6 +129,14 @@ public:
m_scores[sp->GetFeatureNames()[0]] += score;
}
//For features which have an unbounded number of components
void PlusEquals(const ScoreProducer*sp, const std::string& name, float score)
{
assert(sp->GetNumScoreComponents() == ScoreProducer::unlimited);
FName fname(sp->GetScoreProducerDescription(),name);
m_scores[fname] += score;
}
void Assign(const ScoreProducer* sp, const std::vector<float>& scores)
{
assert(scores.size() == sp->GetNumScoreComponents());
@ -138,7 +146,7 @@ public:
}
}
//! Special version PlusEquals(ScoreProducer, vector<float>)
//! Special version Assign(ScoreProducer, vector<float>)
//! to add the score from a single ScoreProducer that produces
//! a single value
void Assign(const ScoreProducer* sp, float score)
@ -147,6 +155,14 @@ public:
m_scores[sp->GetFeatureNames()[0]] = score;
}
//For features which have an unbounded number of components
//! Overwrites the component called name of the producer sp with score.
//! sp must report ScoreProducer::unlimited as its number of components.
//! The name is taken by const reference, like the other name-based
//! accessors of this class, so no string copy is made per call.
void Assign(const ScoreProducer*sp, const std::string& name, float score)
{
  assert(sp->GetNumScoreComponents() == ScoreProducer::unlimited);
  // The feature key is built from the producer description plus the component name.
  FName fname(sp->GetScoreProducerDescription(),name);
  m_scores[fname] = score;
}
float InnerProduct(const ScoreComponentCollection& rhs) const
{
@ -179,6 +195,15 @@ public:
return m_scores[sp->GetFeatureNames()[0]];
}
//For features which have an unbounded number of components
float GetScoreForProducer
(const ScoreProducer* sp, const std::string& name) const
{
assert(sp->GetNumScoreComponents() == ScoreProducer::unlimited);
FName fname(sp->GetScoreProducerDescription(),name);
return m_scores[fname];
}
float GetWeightedScore() const;
void ZeroAllLM(const LMList& lmList);

View File

@ -11,6 +11,7 @@ namespace Moses
{
multiset<string> ScoreProducer::description_counts;
// Sentinel component count: -1 converted to size_t, i.e. the largest size_t value.
const size_t ScoreProducer::unlimited = static_cast<size_t>(-1);
ScoreProducer::ScoreProducer(const std::string& description)
{

View File

@ -30,10 +30,12 @@ protected:
virtual ~ScoreProducer();
public:
static const size_t unlimited;
//! returns the number of scores that a subclass produces.
//! For example, a language model conventionally produces 1, a translation table some arbitrary number, etc
//! will cause an error if this producer does not have a fixed number
//! of scores (eg sparse features)
//! sparse features return ScoreProducer::unlimited
virtual size_t GetNumScoreComponents() const = 0;
//! returns a string description of this producer

View File

@ -1,5 +1,4 @@
#include "TargetBigramFeature.h"
#include "InputFileStream.h"
#include "Phrase.h"
#include "TargetPhrase.h"
#include "Hypothesis.h"
@ -9,30 +8,39 @@ namespace Moses {
using namespace std;
int TargetBigramState::Compare(const FFState& other) const {
const TargetBigramState& rhs = dynamic_cast<const TargetBigramState&>(other);
return Word::Compare(m_word,rhs.m_word);
}
bool TargetBigramFeature::Load(const std::string &filePath)
{
InputFileStream inFile(filePath);
if (filePath == "*") return true; //allow all
ifstream inFile(filePath.c_str());
if (!inFile)
{
return false;
size_t lineNo = 0;
std::string line;
while (getline(inFile, line)) {
m_wordMap[line] = lineNo++;
}
inFile.Close();
std::string line;
m_vocab.insert(BOS_);
while (getline(inFile, line)) {
m_vocab.insert(line);
}
inFile.close();
return true;
}
size_t TargetBigramFeature::GetNumScoreComponents() const
{
return m_wordMap.size() * m_wordMap.size();
return ScoreProducer::unlimited;
}
string TargetBigramFeature::GetScoreProducerWeightShortName() const
{
return "tbf";
return "dlmb";
}
size_t TargetBigramFeature::GetNumInputScores() const
@ -43,36 +51,37 @@ size_t TargetBigramFeature::GetNumInputScores() const
const FFState* TargetBigramFeature::EmptyHypothesisState(const InputType &/*input*/) const
{
return NULL;
return new TargetBigramState(m_bos);
}
FFState* TargetBigramFeature::Evaluate(const Hypothesis& cur_hypo,
const FFState* prev_state,
ScoreComponentCollection* accumulator) const
{
vector<string> words;
if (cur_hypo.GetPrevHypo() != NULL) {
size_t prevPhraseSize = cur_hypo.GetPrevHypo()->GetCurrTargetPhrase().GetSize();
if (prevPhraseSize > 0) {
words.push_back(cur_hypo.GetPrevHypo()->GetCurrTargetPhrase().GetWord(prevPhraseSize - 1).ToString());
}
const TargetBigramState* tbState = dynamic_cast<const TargetBigramState*>(prev_state);
assert(tbState);
const Phrase& targetPhrase = cur_hypo.GetCurrTargetPhrase();
if (targetPhrase.GetSize() == 0) {
return new TargetBigramState(*tbState);
}
for (size_t i = 0; i < targetPhrase.GetSize(); ++i) {
const Factor* f1 = NULL;
if (i == 0) {
f1 = tbState->GetWord().GetFactor(m_factorType);
} else {
f1 = targetPhrase.GetWord(i-1).GetFactor(m_factorType);
}
const Factor* f2 = targetPhrase.GetWord(i).GetFactor(m_factorType);
const string& w1 = f1->GetString();
const string& w2 = f2->GetString();
if (m_vocab.size() &&
(m_vocab.find(w1) == m_vocab.end() || m_vocab.find(w2) == m_vocab.end())) {
continue;
}
string name(w1 +":"+w2);
accumulator->PlusEquals(this,name,1);
}
return new TargetBigramState(targetPhrase.GetWord(targetPhrase.GetSize()-1));
}
size_t currPhraseSize = cur_hypo.GetCurrTargetPhrase().GetSize();
for (size_t i = 0; i < currPhraseSize; ++i) {
words.push_back(cur_hypo.GetCurrTargetPhrase().GetWord(i).ToString());
}
for (size_t i = 1; i < words.size(); ++i) {
map<string,size_t>::const_iterator first, second;
if ((first = m_wordMap.find(words[i-1])) != m_wordMap.end() &&
(second = m_wordMap.find(words[i])) != m_wordMap.end()) {
cerr << "FIXME" << endl;
assert(0);
//accumulator->Assign(first->second * second->second, 1);
}
}
return NULL;
}
}

View File

@ -1,32 +1,57 @@
#ifndef moses_TargetBigramFeature_h
#define moses_TargetBigramFeature_h
#include "FeatureFunction.h"
#include "FFState.h"
#include <string>
#include <map>
#include "FactorCollection.h"
#include "FeatureFunction.h"
#include "FFState.h"
#include "Word.h"
namespace Moses
{
/** Feature-function state holding a single word.
 */
class TargetBigramState : public FFState {
public:
  //! Stores a copy of word. Explicit, so that a Word is never silently
  //! converted into a state object.
  explicit TargetBigramState(const Word& word): m_word(word) {}
  //! The word held by this state.
  const Word& GetWord() const {return m_word;}
  //! Compares the held word with the one held by other.
  virtual int Compare(const FFState& other) const;
private:
  Word m_word;
};
/** Sets the features of observed bigrams.
*/
class TargetBigramFeature : public StatefulFeatureFunction {
public:
TargetBigramFeature(): StatefulFeatureFunction("TargetBigram"){}
TargetBigramFeature(FactorType factorType = 0):
StatefulFeatureFunction("dlmb"),
m_factorType(factorType)
{
FactorCollection& factorCollection = FactorCollection::Instance();
const Factor* bosFactor =
factorCollection.AddFactor(Output,m_factorType,BOS_);
m_bos.SetFactor(m_factorType,bosFactor);
}
bool Load(const std::string &filePath);
size_t GetNumScoreComponents() const;
std::string GetScoreProducerWeightShortName() const;
size_t GetNumInputScores() const;
Word GetSentenceStartArray() const {return m_bos;}
virtual const FFState* EmptyHypothesisState(const InputType &input) const;
virtual FFState* Evaluate(const Hypothesis& cur_hypo, const FFState* prev_state,
ScoreComponentCollection* accumulator) const;
private:
std::map<std::string, size_t> m_wordMap;
FactorType m_factorType;
Word m_bos;
std::set<std::string> m_vocab;
};
}

View File

@ -1,5 +1,6 @@
bin_PROGRAMS = moses_test
moses_test_SOURCES = MosesTest.cpp \
moses_test_SOURCES = MockHypothesis.cpp \
MosesTest.cpp \
ScoreComponentCollectionTest.cpp \
TargetBigramFeatureTest.cpp

102
unittest/MockHypothesis.cpp Normal file
View File

@ -0,0 +1,102 @@
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2010 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include "MockHypothesis.h"
#include <boost/test/unit_test.hpp>
#include "TranslationOption.h"
using namespace Moses;
using namespace std;
namespace MosesTest {
// Reads sourceSentence into the mock input sentence, creates the initial
// (empty) hypothesis, then extends it once per (alignment, target segment)
// pair so that m_hypothesis ends up pointing at the last hypothesis created.
MockHypothesisGuard::MockHypothesisGuard(
  const string& sourceSentence,
  const vector<Alignment>& alignments,
  const vector<string>& targetSegments)
  : m_emptyTarget(Input),
    m_sentence(Input),
    m_system("mock",&m_wp,&m_uwp,&m_dist),
    m_manager(m_sentence,Normal,&m_system)
{
  // Alignments and target segments are consumed pairwise.
  BOOST_CHECK_EQUAL(alignments.size(), targetSegments.size());

  // Only factor 0 is used, for both the source and the target side.
  std::vector<Moses::FactorType> factorOrder;
  factorOrder.push_back(0);

  const string sourceLine = sourceSentence + "\n";
  stringstream sourceStream(sourceLine);
  m_sentence.Read(sourceStream, factorOrder);

  // Initial empty hypothesis
  m_manager.ResetSentenceStats(m_sentence);
  m_hypothesis = Hypothesis::Create(m_manager, m_sentence, m_emptyTarget);

  // Extend the chain, one hypothesis per segment
  vector<Alignment>::const_iterator alignIt = alignments.begin();
  vector<string>::const_iterator segmentIt = targetSegments.begin();
  while (segmentIt != targetSegments.end() && alignIt != alignments.end()) {
    Hypothesis* parent = m_hypothesis;
    WordsRange sourceRange(alignIt->first, alignIt->second);

    m_targetPhrases.push_back(TargetPhrase());
    m_targetPhrases.back().CreateFromString(factorOrder, *segmentIt, "|");

    // The option is heap-allocated; it is released by the destructor.
    m_toptions.push_back(
      new TranslationOption(sourceRange, m_targetPhrases.back(), m_sentence));
    m_hypothesis = Hypothesis::Create(*parent, *m_toptions.back(), NULL);

    ++segmentIt;
    ++alignIt;
  }
}
// Releases the translation options, then deletes every hypothesis in the
// chain, walking from the newest one back through its predecessors.
MockHypothesisGuard::~MockHypothesisGuard()
{
  RemoveAllInColl(m_toptions);
  while (m_hypothesis != NULL) {
    // Fetch the predecessor before the current hypothesis is destroyed.
    const Hypothesis* predecessor = m_hypothesis->GetPrevHypo();
    delete m_hypothesis;
    m_hypothesis = const_cast<Hypothesis*>(predecessor);
  }
}
// Builds three hypotheses over the same source sentence: one with no target
// segments, one with two segments, and one with four. The segment and
// alignment lists are grown in place, so each later hypothesis extends the
// lists used for the previous one.
HypothesisFixture::HypothesisFixture()
{
  const string sourceText = "je ne sais pas . ";
  vector<string> segments;
  vector<Alignment> spans;

  // No segments at all
  m_empty.reset(new MockHypothesisGuard(sourceText, spans, segments));

  // "i" / "do not"
  segments.push_back("i");
  segments.push_back("do not");
  spans.push_back(Alignment(0,0));
  spans.push_back(Alignment(3,3));
  m_partial.reset(new MockHypothesisGuard(sourceText, spans, segments));

  // ... plus "know" / "."
  segments.push_back("know");
  segments.push_back(".");
  spans.push_back(Alignment(1,2));
  spans.push_back(Alignment(4,4));
  m_full.reset(new MockHypothesisGuard(sourceText, spans, segments));
}
}

83
unittest/MockHypothesis.h Normal file
View File

@ -0,0 +1,83 @@
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2010 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#ifndef _MOCK_HYPOTHESIS_
#define _MOCK_HYPOTHESIS_
#include <memory>
#include <vector>
#include "DummyScoreProducers.h"
#include "Hypothesis.h"
#include "Manager.h"
#include "TranslationSystem.h"
namespace MosesTest {
//
// Construct a hypothesis with arbitrary source and target phrase
// sequences. Useful for testing feature functions.
//
typedef std::pair<size_t,size_t> Alignment; //(first,last) in source
/** Owns a chain of hypotheses together with the objects used to build it.
 *  The destructor deletes the hypotheses and the translation options, so
 *  the object must not be copied: a copy would delete the same pointers
 *  a second time.
 */
class MockHypothesisGuard {
public:
  /** Creates a phrase-based hypothesis.
   */
  MockHypothesisGuard(
    const std::string& sourceSentence,
    const std::vector<Alignment>& alignments,
    const std::vector<std::string>& targetSegments);
  //! The last hypothesis of the chain. Ownership stays with the guard.
  Moses::Hypothesis* operator*() const {return m_hypothesis;}
  /** Destroy the hypothesis chain */
  ~MockHypothesisGuard();
private:
  // Non-copyable: declared but intentionally left undefined.
  MockHypothesisGuard(const MockHypothesisGuard&);
  MockHypothesisGuard& operator=(const MockHypothesisGuard&);

  // Declaration order matters: the constructor's initialiser list passes
  // the addresses of m_wp, m_uwp and m_dist to m_system, and passes
  // m_sentence and m_system to m_manager.
  Moses::TargetPhrase m_emptyTarget;
  Moses::Sentence m_sentence;
  Moses::WordPenaltyProducer m_wp;
  Moses::UnknownWordPenaltyProducer m_uwp;
  Moses::DistortionScoreProducer m_dist;
  Moses::TranslationSystem m_system;
  Moses::Manager m_manager;
  Moses::Hypothesis* m_hypothesis;
  std::vector<Moses::TargetPhrase> m_targetPhrases;
  std::vector<Moses::TranslationOption*> m_toptions;
};
/** Test fixture holding three ready-made mock hypotheses.
 *  The accessors return pointers that remain owned by the fixture; they
 *  are const-qualified because they do not modify it.
 */
class HypothesisFixture {
public:
  HypothesisFixture();
  //! Hypothesis held by m_empty.
  const Moses::Hypothesis* empty() const {return **m_empty;}
  //! Hypothesis held by m_partial.
  const Moses::Hypothesis* partial() const {return **m_partial;}
  //! Hypothesis held by m_full.
  const Moses::Hypothesis* full() const {return **m_full;}
private:
  std::auto_ptr<MockHypothesisGuard> m_empty;
  std::auto_ptr<MockHypothesisGuard> m_partial;
  std::auto_ptr<MockHypothesisGuard> m_full;
};
}
#endif

View File

@ -17,14 +17,16 @@ License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include <stdexcept>
#include <boost/test/unit_test.hpp>
#include <DummyScoreProducers.h>
#include <FeatureFunction.h>
#include <ScoreComponentCollection.h>
#include "DummyScoreProducers.h"
#include "FeatureFunction.h"
#include "ScoreComponentCollection.h"
using namespace Moses;
using namespace std;
BOOST_AUTO_TEST_SUITE(scc)
@ -42,21 +44,28 @@ class MockMultiFeature : public StatelessFeatureFunction {
size_t GetNumScoreComponents() const {return 5;}
};
struct MockProducers {
MockProducers(): single(new MockSingleFeature()),
multi(new MockMultiFeature()) {}
~MockProducers() {delete single; delete multi;}
// Stateless mock producer that reports an unlimited number of score
// components, used to exercise the name-based accessors.
class MockSparseFeature : public StatelessFeatureFunction {
public:
  MockSparseFeature()
    : StatelessFeatureFunction("MockSparse")
  {}

  std::string GetScoreProducerWeightShortName() const
  {
    return "sf";
  }

  size_t GetNumScoreComponents() const
  {
    return ScoreProducer::unlimited;
  }
};
MockSingleFeature* single;
MockMultiFeature* multi;
// Fixture providing one instance of each mock producer. The producers are
// held by value, so they are destroyed together with the fixture.
struct MockProducers {
  MockProducers()
  {}

  MockSingleFeature single;  // producer from MockSingleFeature
  MockMultiFeature multi;    // producer from MockMultiFeature
  MockSparseFeature sparse;  // producer from MockSparseFeature
};
BOOST_FIXTURE_TEST_CASE(ctor, MockProducers)
{
ScoreComponentCollection scc;
BOOST_CHECK_EQUAL(scc.GetScoreForProducer(single),0);
BOOST_CHECK_EQUAL(scc.GetScoreForProducer(&single),0);
float expected[] = {0,0,0,0,0};
std::vector<float> actual= scc.GetScoresForProducer(multi);
std::vector<float> actual= scc.GetScoresForProducer(&multi);
BOOST_CHECK_EQUAL_COLLECTIONS(expected, expected+5, actual.begin(), actual.begin()+5);
}
@ -68,23 +77,32 @@ BOOST_FIXTURE_TEST_CASE(plusequals, MockProducers)
std::vector<float> vec2(arr2,arr2+5);
ScoreComponentCollection scc;
scc.PlusEquals(single, 3.4f);
BOOST_CHECK_EQUAL(scc.GetScoreForProducer(single), 3.4f);
scc.PlusEquals(multi,vec1);
std::vector<float> actual = scc.GetScoresForProducer(multi);
scc.PlusEquals(&single, 3.4f);
BOOST_CHECK_EQUAL(scc.GetScoreForProducer(&single), 3.4f);
scc.PlusEquals(&multi,vec1);
std::vector<float> actual = scc.GetScoresForProducer(&multi);
BOOST_CHECK_EQUAL_COLLECTIONS(vec1.begin(),vec1.end()
,actual.begin(), actual.end());
scc.PlusEquals(multi,vec1);
actual = scc.GetScoresForProducer(multi);
scc.PlusEquals(&multi,vec1);
actual = scc.GetScoresForProducer(&multi);
BOOST_CHECK_EQUAL_COLLECTIONS(vec2.begin(),vec2.end(),
actual.begin(), actual.end());
BOOST_CHECK_EQUAL(scc.GetScoreForProducer(single), 3.4f);
BOOST_CHECK_EQUAL(scc.GetScoreForProducer(&single), 3.4f);
}
BOOST_AUTO_TEST_CASE(test)
BOOST_FIXTURE_TEST_CASE(sparse_feature, MockProducers)
{
BOOST_CHECK(true);
ScoreComponentCollection scc;
scc.Assign(&sparse, "first", 1.3f);
scc.Assign(&sparse, "second", 2.1f);
BOOST_CHECK_EQUAL( scc.GetScoreForProducer(&sparse,"first"), 1.3f);
BOOST_CHECK_EQUAL( scc.GetScoreForProducer(&sparse,"second"), 2.1f);
BOOST_CHECK_EQUAL( scc.GetScoreForProducer(&sparse,"third"), 0.0f);
scc.Assign(&sparse, "first", -1.9f);
BOOST_CHECK_EQUAL( scc.GetScoreForProducer(&sparse,"first"), -1.9f);
scc.PlusEquals(&sparse, "first", -1.9f);
BOOST_CHECK_EQUAL( scc.GetScoreForProducer(&sparse,"first"), -3.8f);
}

View File

@ -17,16 +17,156 @@ License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include <cstdio>
#include <cstdlib>
#include <string>
#include <boost/test/unit_test.hpp>
#include "FactorCollection.h"
#include "Sentence.h"
#include "TargetBigramFeature.h"
#include "Word.h"
#include "MockHypothesis.h"
using namespace std;
using namespace Moses;
namespace MosesTest
{
BOOST_AUTO_TEST_SUITE(target_bigram)
// Builds a Word whose factor 0 is the factor registered for text.
static Word MakeWord(string text) {
  const Factor* factor =
    FactorCollection::Instance().AddFactor(Input, 0, text);
  Word word;
  word.SetFactor(0, factor);
  return word;
}
class VocabFileFixture {
public:
template<class I>
VocabFileFixture(I begin, I end)
{
filename = string(tmpnam(NULL));
ofstream out(filename.c_str());
BOOST_CHECK(out);
for (I i = begin; i != end; ++i)
{
out << *i << endl;
}
out.close();
}
~VocabFileFixture()
{
BOOST_CHECK(!remove(filename.c_str()));
}
string filename;
};
/*
BOOST_AUTO_TEST_CASE(Test2)
{
HypothesisFixture hypos;
cerr << hypos.empty() << ", " << *hypos.empty() << endl;
cerr << hypos.partial() << ", " << *hypos.partial() << endl;
cerr << hypos.full() << ", " << *hypos.full() << endl;
BOOST_CHECK(true);
} */
// A state compares equal to itself and unequal to a state holding a
// different word, in both directions.
BOOST_AUTO_TEST_CASE(state_compare)
{
  Word firstWord = MakeWord("w1");
  Word secondWord = MakeWord("w2");
  TargetBigramState firstState(firstWord);
  TargetBigramState secondState(secondWord);

  // reflexive
  BOOST_CHECK_EQUAL(firstState.Compare(firstState),0);
  BOOST_CHECK_EQUAL(secondState.Compare(secondState),0);

  // different words
  BOOST_CHECK_NE(firstState.Compare(secondState),0);
  BOOST_CHECK_NE(secondState.Compare(firstState),0);
}
// Load() succeeds for a vocabulary file that exists and fails for a path
// that does not.
BOOST_AUTO_TEST_CASE(load)
{
  TargetBigramFeature feature;
  string words[] = {"je", "ne", "pas"};
  VocabFileFixture vocabFile(words, words + 3);

  const bool loadedExisting = feature.Load(vocabFile.filename);
  BOOST_CHECK(loadedExisting);

  const bool loadedMissing = feature.Load("/gweugegyiegy");
  BOOST_CHECK(!loadedMissing);
}
// The feature reports an unlimited number of score components.
BOOST_AUTO_TEST_CASE(score_components)
{
  TargetBigramFeature feature;
  const size_t componentCount = feature.GetNumScoreComponents();
  BOOST_CHECK_EQUAL(componentCount, ScoreProducer::unlimited);
}
// The state for the empty hypothesis exists and compares equal to a state
// built from the feature's sentence-start word.
BOOST_AUTO_TEST_CASE(empty_hypo)
{
  Sentence sentence(Input);
  TargetBigramFeature feature;

  auto_ptr<const FFState> initialState(feature.EmptyHypothesisState(sentence));
  BOOST_CHECK(initialState.get());

  TargetBigramState sentenceStart(feature.GetSentenceStartArray());
  BOOST_CHECK_EQUAL(initialState->Compare(sentenceStart),0);
}
//Test of evaluate() where a vocab is specified
// With a vocabulary of {"i", "do"} loaded, only the bigram whose words are
// both in the vocabulary is scored; the returned state holds "not".
BOOST_AUTO_TEST_CASE(evaluate_vocab)
{
  string words[] = {"i", "do"};
  VocabFileFixture vocabFixture(words, words + 2);
  HypothesisFixture fixture;
  TargetBigramFeature feature;
  BOOST_CHECK(feature.Load(vocabFixture.filename));

  TargetBigramState previous(MakeWord("i"));
  ScoreComponentCollection scores;
  auto_ptr<FFState> resultState(
    feature.Evaluate(*fixture.partial(), &previous, &scores));

  BOOST_CHECK_EQUAL(scores.GetScoreForProducer(&feature, "i:do"),1);
  BOOST_CHECK_EQUAL(scores.GetScoreForProducer(&feature, "do:not"),0);
  BOOST_CHECK(! resultState->Compare(TargetBigramState(MakeWord("not"))));
}
//Test of evaluate() where no vocab file is specified
// With "*" loaded instead of a vocabulary file, both bigrams are scored;
// the returned state holds "not".
BOOST_AUTO_TEST_CASE(evaluate_all)
{
  HypothesisFixture fixture;
  TargetBigramFeature feature;
  BOOST_CHECK(feature.Load("*"));

  TargetBigramState previous(MakeWord("i"));
  ScoreComponentCollection scores;
  auto_ptr<FFState> resultState(
    feature.Evaluate(*fixture.partial(), &previous, &scores));

  BOOST_CHECK_EQUAL(scores.GetScoreForProducer(&feature, "i:do"),1);
  BOOST_CHECK_EQUAL(scores.GetScoreForProducer(&feature, "do:not"),1);
  BOOST_CHECK(! resultState->Compare(TargetBigramState(MakeWord("not"))));
}
// Evaluating the empty hypothesis returns a state that compares equal to
// the state it was given.
BOOST_AUTO_TEST_CASE(evaluate_empty)
{
  HypothesisFixture fixture;
  TargetBigramFeature feature;
  BOOST_CHECK(feature.Load("*"));

  auto_ptr<const FFState> initialState(
    feature.EmptyHypothesisState(Sentence(Input)));
  ScoreComponentCollection scores;
  auto_ptr<const FFState> resultState(
    feature.Evaluate(*fixture.empty(), initialState.get(), &scores));

  BOOST_CHECK(! resultState->Compare(*initialState));
}
BOOST_AUTO_TEST_SUITE_END()
}