diff --git a/misc/Makefile b/misc/Makefile index bfe173fcf..a21823f27 100644 --- a/misc/Makefile +++ b/misc/Makefile @@ -13,7 +13,7 @@ default: processPhraseTable processPhraseTable.o: processPhraseTable.cpp $(CXX) $(CXXFLAGS) $(INCLUDES) $< -c -o $@ -MOSESLIB =$(HOME)/workspace/moses/src/libmoses.a +MOSESLIB =../moses/src/libmoses.a processPhraseTable: processPhraseTable.o $(MOSESLIB) $(CXX) $(LDFLAGS) $^ -o $@ $(LIBS) diff --git a/misc/processPhraseTable.cpp b/misc/processPhraseTable.cpp index ec5af4f99..d7becb8a6 100644 --- a/misc/processPhraseTable.cpp +++ b/misc/processPhraseTable.cpp @@ -174,7 +174,7 @@ int main(int argc,char **argv) { for(size_t i=0;i(noScoreComponent,1/(1.0*noScoreComponent))); - while(net.Read(std::cin,factorOrder,cn-1)) { + while(net.ReadF(std::cin,factorOrder,cn-1)) { net.Print(std::cerr); GenerateCandidates(net,pdicts,weights,verb); } diff --git a/moses/src/ConfusionNet.cpp b/moses/src/ConfusionNet.cpp index 235ae87a4..02bc2e7ad 100644 --- a/moses/src/ConfusionNet.cpp +++ b/moses/src/ConfusionNet.cpp @@ -7,49 +7,73 @@ #include "PhraseDictionaryTreeAdaptor.h" #include "TranslationOptionCollectionConfusionNet.h" -ConfusionNet::ConfusionNet(FactorCollection* p) : InputType(),m_factorCollection(p) {} +ConfusionNet::ConfusionNet(FactorCollection* p) + : InputType(),m_factorCollection(p) {} void ConfusionNet::SetFactorCollection(FactorCollection *p) { m_factorCollection=p; } -bool ConfusionNet::ReadF(std::istream& in,const std::vector& factorOrder,int format) { - std::cerr<<"cn read with format "<& factorOrder, + int format) +{ + TRACE_ERR("read confusion net with format "<& factorOrder, FactorCollection &factorCollection) +int ConfusionNet::Read(std::istream& in, + const std::vector& factorOrder, + FactorCollection &factorCollection) { SetFactorCollection(&factorCollection); return ReadF(in,factorOrder,0); } -void ConfusionNet::String2Word(const std::string& s,Word& w,const std::vector& factorOrder) { +void ConfusionNet::String2Word(const std::string& s,Word& w, + const std::vector& factorOrder) +{ std::vector factorStrVector = Tokenize(s, "|"); for(size_t i=0;iAddFactor(Input,factorOrder[i],factorStrVector[i])); + w.SetFactor(factorOrder[i], + m_factorCollection->AddFactor(Input,factorOrder[i], + factorStrVector[i])); } -bool ConfusionNet::ReadFormat0(std::istream& in,const std::vector& factorOrder) { +bool ConfusionNet::ReadFormat0(std::istream& in, + const std::vector& factorOrder) +{ assert(m_factorCollection); Clear(); std::string line; while(getline(in,line)) { std::istringstream is(line); - std::string word;float costs; + std::string word;double prob; Column col; - while(is>>word>>costs) { + while(is>>word>>prob) { Word w; String2Word(word,w,factorOrder); - col.push_back(std::make_pair(w,costs)); + if(prob<0.0) + { + std::cerr<<"WARN: negative prob: "<set to 0.0\n"; + prob=0.0; + } + else if (prob>1.0) + { + std::cerr<<"WARN: prob > 1.0 : "< set to 1.0\n"; + prob=1.0; + } + col.push_back(std::make_pair(w,std::max(static_cast(log(prob)), + LOWEST_SCORE))); } if(col.size()) { data.push_back(col); @@ -59,7 +83,9 @@ bool ConfusionNet::ReadFormat0(std::istream& in,const std::vector& f } return !data.empty(); } -bool ConfusionNet::ReadFormat1(std::istream& in,const std::vector& factorOrder) { +bool ConfusionNet::ReadFormat1(std::istream& in, + const std::vector& factorOrder) +{ assert(m_factorCollection); Clear(); std::string line; @@ -110,15 +136,21 @@ std::ostream& operator<<(std::ostream& out,const ConfusionNet& cn) cn.Print(out);return out; } -TargetPhraseCollection const* ConfusionNet::CreateTargetPhraseCollection(PhraseDictionaryBase const& d,const WordsRange& r) const +TargetPhraseCollection const* ConfusionNet:: +CreateTargetPhraseCollection(PhraseDictionaryBase const& d, + const WordsRange& r) const { - if(PhraseDictionaryTreeAdaptor const* pdict=dynamic_cast(&d)) + if(PhraseDictionaryTreeAdaptor const* pdict= + dynamic_cast(&d)) return pdict->GetTargetPhraseCollection(*this,r); - std::cerr<<"ERROR: wrong phrase dictionary type for confusion net decoding!\n" - "has to be PhraseDictionaryTreeAdaptor\n"; + + std::cerr<<"ERROR: wrong phrase dictionary type for confusion net decoding!" + " Has to be PhraseDictionaryTreeAdaptor\n"; abort(); } -TranslationOptionCollection* ConfusionNet::CreateTranslationOptionCollection() const + +TranslationOptionCollection* +ConfusionNet::CreateTranslationOptionCollection() const { return new TranslationOptionCollectionConfusionNet(*this); } diff --git a/moses/src/Parameter.cpp b/moses/src/Parameter.cpp index 6b5273b3c..2b1473774 100755 --- a/moses/src/Parameter.cpp +++ b/moses/src/Parameter.cpp @@ -97,8 +97,15 @@ bool Parameter::Validate() // do files exist? // phrase tables - if (ret) - ret = FilesExist("ttable-file", 3); + if (ret) + { + std::vector ext; + // standard phrase table extension (i.e. full name has to be specified) + ext.push_back(""); + // alternative file extension for binary phrase table format: + ext.push_back(".binphr.idx"); + ret = FilesExist("ttable-file", 3,ext); + } // generation tables if (ret) ret = FilesExist("generation-file", 2); @@ -109,7 +116,7 @@ bool Parameter::Validate() return ret; } -bool Parameter::FilesExist(const string ¶mName, size_t tokenizeIndex) +bool Parameter::FilesExist(const string ¶mName, size_t tokenizeIndex,std::vector const& extensions) { using namespace boost::filesystem; @@ -135,14 +142,21 @@ bool Parameter::FilesExist(const string ¶mName, size_t tokenizeIndex) return false; } const string &pathStr = vec[tokenizeIndex]; - path filePath(pathStr, native); - if (!exists(filePath)) - { - stringstream errorMsg(""); - errorMsg << "File " << pathStr << " does not exists"; - UserMessage::Add(errorMsg.str()); - return false; - } + + bool fileFound=0; + for(size_t i=0;i const& fileExtension=std::vector(1,"")); bool Validate(); @@ -59,5 +59,6 @@ public: { return m_setting[paramName]; } + }; diff --git a/moses/src/PhraseDictionaryTree.cpp b/moses/src/PhraseDictionaryTree.cpp index c5b448867..5860db893 100644 --- a/moses/src/PhraseDictionaryTree.cpp +++ b/moses/src/PhraseDictionaryTree.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include "PrefixTree.h" #include "File.h" @@ -29,6 +30,12 @@ typedef std::vector IPhrase; typedef std::vector Scores; typedef PrefixTreeF PTF; +namespace __gnu_cxx { + template <> struct hash { + size_t operator()(const std::string& s) const {return __gnu_cxx::__stl_hash_string(s.c_str());} + }; +} + template > class LVoc { typedef A Key; @@ -132,7 +139,8 @@ struct PDTimp { typedef PrefixTreeF PTF; typedef FilePtr CPT; typedef std::vector Data; - typedef LVoc WordVoc; + // typedef LVoc WordVoc; + typedef LVoc > WordVoc; Data data; std::vector srcOffsets; @@ -215,7 +223,7 @@ struct PDTimp { PPtr Extend(PPtr p,const std::string& w) { assert(p); - if(w.empty()) return p; + if(w.empty() || w==EPSILON) return p; LabelId wi=sv.index(w); if(wi==InvalidLabelId) return PPtr(); else if(p.imp->isRoot()) @@ -291,6 +299,13 @@ PhraseDictionaryTree::PhraseDictionaryTree(size_t noScoreComponent, : Dictionary(noScoreComponent),imp(new PDTimp),m_inFactorType(ift),m_outFactorType(oft) { imp->m_factorCollection=fc; + if(sizeof(off_t)!=8) + { + std::cerr<<"ERROR: size of type 'off_t' has to be 64 bit!\n" + "use compiler settings '-D_FILE_OFFSET_BITS=64 -D_LARGE_FILES'\n" + " -> abort \n\n"; + abort(); + } } PhraseDictionaryTree::~PhraseDictionaryTree() diff --git a/moses/src/PhraseDictionaryTreeAdaptor.cpp b/moses/src/PhraseDictionaryTreeAdaptor.cpp index fd514db99..0a78cdfe6 100644 --- a/moses/src/PhraseDictionaryTreeAdaptor.cpp +++ b/moses/src/PhraseDictionaryTreeAdaptor.cpp @@ -35,6 +35,7 @@ struct PDTAimp { : m_languageModels(0),m_weightWP(0.0),m_factorCollection(0),m_dict(0), m_obj(p),useCache(1) {} + // convert FactorArray into string void Factors2String(FactorArray const& w,std::string& s) const { for(size_t j=0;j piter; if(useCache) { @@ -84,6 +89,8 @@ struct PDTAimp { } else if (m_cache.size()) { + // cache is also used for unknowns, so even if the cache is disabled + // there may be entries MapSrc2Tgt::const_iterator i=m_cache.find(src); return (i!=m_cache.end() ? i->second : 0); } diff --git a/moses/src/StaticData.cpp b/moses/src/StaticData.cpp index 7da13e37b..44323a3e4 100755 --- a/moses/src/StaticData.cpp +++ b/moses/src/StaticData.cpp @@ -418,30 +418,31 @@ void StaticData::LoadPhraseTables(bool filter + PROJECT_NAME + "--" + inputFileHash + "--" + phraseTableHash + ".txt"; - bool filterPhrase; - if (filter) - { - boost::filesystem::path tempFile(hashFilePath, boost::filesystem::native); - if (boost::filesystem::exists(tempFile)) - { // load filtered file instead - filterPhrase = false; - filePath = hashFilePath; - } - else - { // load original file & create has file - filterPhrase = true; - } - } - else - { // load original file - filterPhrase = false; - } - TRACE_ERR(filePath << endl); timer.check("Start loading PhraseTable"); - if (!boost::filesystem::exists(filePath+".binphr.idx")) { + bool filterPhrase; + if (filter) + { + boost::filesystem::path tempFile(hashFilePath, boost::filesystem::native); + if (boost::filesystem::exists(tempFile)) + { // load filtered file instead + filterPhrase = false; + filePath = hashFilePath; + } + else + { // load original file & create has file + filterPhrase = true; + } + } + else + { // load original file + filterPhrase = false; + } + TRACE_ERR(filePath << endl); + + TRACE_ERR("using standard phrase tables"); PhraseDictionary *pd=new PhraseDictionary(noScoreComponent); pd->Load(input diff --git a/moses/src/TypeDef.h b/moses/src/TypeDef.h index f1abf17e3..c109f7e12 100755 --- a/moses/src/TypeDef.h +++ b/moses/src/TypeDef.h @@ -29,6 +29,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #define SENTENCE_START "" #define SENTENCE_END "" #define UNKNOWN_FACTOR "UNK" +#define EPSILON "*EPS*" #define NOT_FOUND std::numeric_limits::max() #define MAX_NGRAM_SIZE 20