- confusion net:
    * more robust read functions
    * correct treatment of epsilons
    * code cleanup
- parameter: fixed check for binary phrase table
- staticData: do not read input phrases in case of binary phrase table


git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@260 1f5c12ca-751b-0410-a591-d2e778427230
This commit is contained in:
zens 2006-07-24 18:33:08 +00:00
parent 90124bd403
commit 422418008e
9 changed files with 124 additions and 53 deletions

View File

@ -13,7 +13,7 @@ default: processPhraseTable
processPhraseTable.o: processPhraseTable.cpp
$(CXX) $(CXXFLAGS) $(INCLUDES) $< -c -o $@
MOSESLIB =$(HOME)/workspace/moses/src/libmoses.a
MOSESLIB =../moses/src/libmoses.a
processPhraseTable: processPhraseTable.o $(MOSESLIB)
$(CXX) $(LDFLAGS) $^ -o $@ $(LIBS)

View File

@ -174,7 +174,7 @@ int main(int argc,char **argv) {
for(size_t i=0;i<pdicts.size();++i)
weights.push_back(std::vector<float>(noScoreComponent,1/(1.0*noScoreComponent)));
while(net.Read(std::cin,factorOrder,cn-1)) {
while(net.ReadF(std::cin,factorOrder,cn-1)) {
net.Print(std::cerr);
GenerateCandidates(net,pdicts,weights,verb);
}

View File

@ -7,49 +7,73 @@
#include "PhraseDictionaryTreeAdaptor.h"
#include "TranslationOptionCollectionConfusionNet.h"
ConfusionNet::ConfusionNet(FactorCollection* p) : InputType(),m_factorCollection(p) {}
ConfusionNet::ConfusionNet(FactorCollection* p)
: InputType(),m_factorCollection(p) {}
// Store the factor collection pointer on this confusion net.
// String2Word() later calls AddFactor() through this pointer, and the
// ReadFormat* functions assert that it is non-null before reading.
void ConfusionNet::SetFactorCollection(FactorCollection *p)
{
  this->m_factorCollection = p;
}
bool ConfusionNet::ReadF(std::istream& in,const std::vector<FactorType>& factorOrder,int format) {
std::cerr<<"cn read with format "<<format<<"\n";
bool ConfusionNet::ReadF(std::istream& in,
const std::vector<FactorType>& factorOrder,
int format)
{
TRACE_ERR("read confusion net with format "<<format<<"\n");
switch(format)
{
case 0: return ReadFormat0(in,factorOrder);
case 1: return ReadFormat1(in,factorOrder);
default:
std::cerr<<"ERROR: unknown format '"<<format<<"' in ConfusionNet::Read\n";
std::cerr<<"ERROR: unknown format '"<<format
<<"' in ConfusionNet::Read\n";
}
return 0;
}
int ConfusionNet::Read(std::istream& in,const std::vector<FactorType>& factorOrder, FactorCollection &factorCollection)
int ConfusionNet::Read(std::istream& in,
const std::vector<FactorType>& factorOrder,
FactorCollection &factorCollection)
{
SetFactorCollection(&factorCollection);
return ReadF(in,factorOrder,0);
}
void ConfusionNet::String2Word(const std::string& s,Word& w,const std::vector<FactorType>& factorOrder) {
void ConfusionNet::String2Word(const std::string& s,Word& w,
const std::vector<FactorType>& factorOrder)
{
std::vector<std::string> factorStrVector = Tokenize(s, "|");
for(size_t i=0;i<factorOrder.size();++i)
w.SetFactor(factorOrder[i],m_factorCollection->AddFactor(Input,factorOrder[i],factorStrVector[i]));
w.SetFactor(factorOrder[i],
m_factorCollection->AddFactor(Input,factorOrder[i],
factorStrVector[i]));
}
bool ConfusionNet::ReadFormat0(std::istream& in,const std::vector<FactorType>& factorOrder) {
bool ConfusionNet::ReadFormat0(std::istream& in,
const std::vector<FactorType>& factorOrder)
{
assert(m_factorCollection);
Clear();
std::string line;
while(getline(in,line)) {
std::istringstream is(line);
std::string word;float costs;
std::string word;double prob;
Column col;
while(is>>word>>costs) {
while(is>>word>>prob) {
Word w;
String2Word(word,w,factorOrder);
col.push_back(std::make_pair(w,costs));
if(prob<0.0)
{
std::cerr<<"WARN: negative prob: "<<prob<<" ->set to 0.0\n";
prob=0.0;
}
else if (prob>1.0)
{
std::cerr<<"WARN: prob > 1.0 : "<<prob<<" -> set to 1.0\n";
prob=1.0;
}
col.push_back(std::make_pair(w,std::max(static_cast<float>(log(prob)),
LOWEST_SCORE)));
}
if(col.size()) {
data.push_back(col);
@ -59,7 +83,9 @@ bool ConfusionNet::ReadFormat0(std::istream& in,const std::vector<FactorType>& f
}
return !data.empty();
}
bool ConfusionNet::ReadFormat1(std::istream& in,const std::vector<FactorType>& factorOrder) {
bool ConfusionNet::ReadFormat1(std::istream& in,
const std::vector<FactorType>& factorOrder)
{
assert(m_factorCollection);
Clear();
std::string line;
@ -110,15 +136,21 @@ std::ostream& operator<<(std::ostream& out,const ConfusionNet& cn)
cn.Print(out);return out;
}
TargetPhraseCollection const* ConfusionNet::CreateTargetPhraseCollection(PhraseDictionaryBase const& d,const WordsRange& r) const
TargetPhraseCollection const* ConfusionNet::
CreateTargetPhraseCollection(PhraseDictionaryBase const& d,
const WordsRange& r) const
{
if(PhraseDictionaryTreeAdaptor const* pdict=dynamic_cast<PhraseDictionaryTreeAdaptor const*>(&d))
if(PhraseDictionaryTreeAdaptor const* pdict=
dynamic_cast<PhraseDictionaryTreeAdaptor const*>(&d))
return pdict->GetTargetPhraseCollection(*this,r);
std::cerr<<"ERROR: wrong phrase dictionary type for confusion net decoding!\n"
"has to be PhraseDictionaryTreeAdaptor\n";
std::cerr<<"ERROR: wrong phrase dictionary type for confusion net decoding!"
" Has to be PhraseDictionaryTreeAdaptor\n";
abort();
}
TranslationOptionCollection* ConfusionNet::CreateTranslationOptionCollection() const
TranslationOptionCollection*
ConfusionNet::CreateTranslationOptionCollection() const
{
return new TranslationOptionCollectionConfusionNet(*this);
}

View File

@ -97,8 +97,15 @@ bool Parameter::Validate()
// do files exist?
// phrase tables
if (ret)
ret = FilesExist("ttable-file", 3);
if (ret)
{
std::vector<std::string> ext;
// standard phrase table extension (i.e. full name has to be specified)
ext.push_back("");
// alternative file extension for binary phrase table format:
ext.push_back(".binphr.idx");
ret = FilesExist("ttable-file", 3,ext);
}
// generation tables
if (ret)
ret = FilesExist("generation-file", 2);
@ -109,7 +116,7 @@ bool Parameter::Validate()
return ret;
}
bool Parameter::FilesExist(const string &paramName, size_t tokenizeIndex)
bool Parameter::FilesExist(const string &paramName, size_t tokenizeIndex,std::vector<std::string> const& extensions)
{
using namespace boost::filesystem;
@ -135,14 +142,21 @@ bool Parameter::FilesExist(const string &paramName, size_t tokenizeIndex)
return false;
}
const string &pathStr = vec[tokenizeIndex];
path filePath(pathStr, native);
if (!exists(filePath))
{
stringstream errorMsg("");
errorMsg << "File " << pathStr << " does not exists";
UserMessage::Add(errorMsg.str());
return false;
}
bool fileFound=0;
for(size_t i=0;i<extensions.size() && !fileFound;++i)
{
path filePath(pathStr+extensions[i], native);
fileFound|=exists(filePath);
}
if(!fileFound)
{
stringstream errorMsg("");
errorMsg << "File " << pathStr << " does not exists";
UserMessage::Add(errorMsg.str());
return false;
}
}
return true;
}

View File

@ -41,7 +41,7 @@ protected:
std::string FindParam(const std::string &paramSwitch, int argc, char* argv[]);
void OverwriteParam(const std::string &paramSwitch, const std::string &paramName, int argc, char* argv[]);
bool ReadConfigFile( std::string filePath );
bool FilesExist(const std::string &paramName, size_t tokenizeIndex);
bool FilesExist(const std::string &paramName, size_t tokenizeIndex,std::vector<std::string> const& fileExtension=std::vector<std::string>(1,""));
bool Validate();
@ -59,5 +59,6 @@ public:
{
return m_setting[paramName];
}
};

View File

@ -5,6 +5,7 @@
#include <sstream>
#include <iostream>
#include <fstream>
#include <ext/hash_map>
#include "PrefixTree.h"
#include "File.h"
@ -29,6 +30,12 @@ typedef std::vector<LabelId> IPhrase;
typedef std::vector<float> Scores;
typedef PrefixTreeF<LabelId,off_t> PTF;
namespace __gnu_cxx {
// Hash functor specialization for std::string, so that std::string can be
// used as the key type of __gnu_cxx::hash_map (see the WordVoc typedef).
// The hash is computed by __stl_hash_string over the string's c_str(),
// i.e. only the characters up to the first NUL contribute.
template <>
struct hash<std::string> {
  size_t operator()(const std::string& key) const
  {
    const char* const raw = key.c_str();
    return __gnu_cxx::__stl_hash_string(raw);
  }
};
}
template<typename A,typename B=std::map<A,LabelId> >
class LVoc {
typedef A Key;
@ -132,7 +139,8 @@ struct PDTimp {
typedef PrefixTreeF<LabelId,off_t> PTF;
typedef FilePtr<PTF> CPT;
typedef std::vector<CPT> Data;
typedef LVoc<std::string> WordVoc;
// typedef LVoc<std::string> WordVoc;
typedef LVoc<std::string,__gnu_cxx::hash_map<std::string,LabelId> > WordVoc;
Data data;
std::vector<off_t> srcOffsets;
@ -215,7 +223,7 @@ struct PDTimp {
PPtr Extend(PPtr p,const std::string& w)
{
assert(p);
if(w.empty()) return p;
if(w.empty() || w==EPSILON) return p;
LabelId wi=sv.index(w);
if(wi==InvalidLabelId) return PPtr();
else if(p.imp->isRoot())
@ -291,6 +299,13 @@ PhraseDictionaryTree::PhraseDictionaryTree(size_t noScoreComponent,
: Dictionary(noScoreComponent),imp(new PDTimp),m_inFactorType(ift),m_outFactorType(oft)
{
imp->m_factorCollection=fc;
if(sizeof(off_t)!=8)
{
std::cerr<<"ERROR: size of type 'off_t' has to be 64 bit!\n"
"use compiler settings '-D_FILE_OFFSET_BITS=64 -D_LARGE_FILES'\n"
" -> abort \n\n";
abort();
}
}
PhraseDictionaryTree::~PhraseDictionaryTree()

View File

@ -35,6 +35,7 @@ struct PDTAimp {
: m_languageModels(0),m_weightWP(0.0),m_factorCollection(0),m_dict(0),
m_obj(p),useCache(1) {}
// convert FactorArray into string
void Factors2String(FactorArray const& w,std::string& s) const
{
for(size_t j=0;j<m_input.size();++j)
@ -44,6 +45,7 @@ struct PDTAimp {
}
}
// free temporary memory
void CleanUp()
{
assert(m_dict);
@ -54,6 +56,7 @@ struct PDTAimp {
m_rangeCache.clear();
}
// add phrase pair till next CleanUp, should be used only for unknowns
void AddEquivPhrase(const Phrase &source, const TargetPhrase &targetPhrase)
{
assert(GetTargetPhraseCollection(source)==0);
@ -70,12 +73,14 @@ struct PDTAimp {
else std::cerr<<"WARNING: you added an already existing phrase!\n";
}
// access with full source phrase
TargetPhraseCollection const*
GetTargetPhraseCollection(Phrase const &src) const
{
assert(m_dict);
if(src.GetSize()==0) return 0;
// look up cache
std::pair<MapSrc2Tgt::iterator,bool> piter;
if(useCache)
{
@ -84,6 +89,8 @@ struct PDTAimp {
}
else if (m_cache.size())
{
// cache is also used for unknowns, so even if the cache is disabled
// there may be entries
MapSrc2Tgt::const_iterator i=m_cache.find(src);
return (i!=m_cache.end() ? i->second : 0);
}

View File

@ -418,30 +418,31 @@ void StaticData::LoadPhraseTables(bool filter
+ PROJECT_NAME + "--"
+ inputFileHash + "--"
+ phraseTableHash + ".txt";
bool filterPhrase;
if (filter)
{
boost::filesystem::path tempFile(hashFilePath, boost::filesystem::native);
if (boost::filesystem::exists(tempFile))
{ // load filtered file instead
filterPhrase = false;
filePath = hashFilePath;
}
else
{ // load original file & create has file
filterPhrase = true;
}
}
else
{ // load original file
filterPhrase = false;
}
TRACE_ERR(filePath << endl);
timer.check("Start loading PhraseTable");
if (!boost::filesystem::exists(filePath+".binphr.idx"))
{
bool filterPhrase;
if (filter)
{
boost::filesystem::path tempFile(hashFilePath, boost::filesystem::native);
if (boost::filesystem::exists(tempFile))
{ // load filtered file instead
filterPhrase = false;
filePath = hashFilePath;
}
else
{ // load original file & create has file
filterPhrase = true;
}
}
else
{ // load original file
filterPhrase = false;
}
TRACE_ERR(filePath << endl);
TRACE_ERR("using standard phrase tables");
PhraseDictionary *pd=new PhraseDictionary(noScoreComponent);
pd->Load(input

View File

@ -29,6 +29,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#define SENTENCE_START "<s>"
#define SENTENCE_END "</s>"
#define UNKNOWN_FACTOR "UNK"
#define EPSILON "*EPS*"
#define NOT_FOUND std::numeric_limits<size_t>::max()
#define MAX_NGRAM_SIZE 20