From 8595b06dce870857f89f8b4389dd79d31b05cc2f Mon Sep 17 00:00:00 2001
From: hieuhoang1972
Date: Fri, 1 Jul 2011 05:40:46 +0000
Subject: [PATCH] rewrite lex prob calc

git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@4058 1f5c12ca-751b-0410-a591-d2e778427230
---
Review notes (text in this position is not part of the commit message):
- The copy this patch was recovered from had lost its line breaks and every
  <...> span. Header names and template arguments below are reconstructed
  from usage and the indentation is re-created; check both against the
  repository. The author's e-mail address could not be recovered, and the
  blob ids on the index lines predate the changes listed here.
- Makefile: extract-lex depends on extract-lex.o instead of on itself.
- main: checks the argument count, the opened streams and the input lengths.
- ExtractLex::Process: the first observation of a pair is counted once.
- Scan: the result is value-initialised. WordCount::GetString returns a
  reference. extract-lex.h includes <string> and <vector>.

 scripts/training/phrase-extract/Makefile      |   5 +
 .../training/phrase-extract/extract-lex.cpp   | 215 ++++++++++++++++++
 scripts/training/phrase-extract/extract-lex.h | 123 ++++++++++++
 3 files changed, 343 insertions(+)
 create mode 100644 scripts/training/phrase-extract/extract-lex.cpp
 create mode 100644 scripts/training/phrase-extract/extract-lex.h

diff --git a/scripts/training/phrase-extract/Makefile b/scripts/training/phrase-extract/Makefile
index 63652fcb6..8ab6fbfbf 100644
--- a/scripts/training/phrase-extract/Makefile
+++ b/scripts/training/phrase-extract/Makefile
@@ -13,6 +13,9 @@ extract: tables-core.o SentenceAlignment.o extract.o
 extract-rules: tables-core.o SentenceAlignment.o SentenceAlignmentWithSyntax.o SyntaxTree.o XmlTree.o HoleCollection.o extract-rules.o
 	$(CXX) $^ -o extract-rules
 
+extract-lex: extract-lex.o
+	$(CXX) $^ -o extract-lex
+
 score: tables-core.o AlignmentPhrase.o score.o PhraseAlignment.o InputFileStream.o
 	$(CXX) $^ -lz -o score
 
@@ -27,3 +30,5 @@ relax-parse: tables-core.o SyntaxTree.o XmlTree.o relax-parse.o
 
 statistics: tables-core.o AlignmentPhrase.o statistics.o
 	$(CXX) $^ -o statistics
+
+
diff --git a/scripts/training/phrase-extract/extract-lex.cpp b/scripts/training/phrase-extract/extract-lex.cpp
new file mode 100644
index 000000000..7e878ae9e
--- /dev/null
+++ b/scripts/training/phrase-extract/extract-lex.cpp
@@ -0,0 +1,215 @@
+#include <iostream>
+#include <fstream>
+#include <cassert>
+#include <vector>
+#include "extract-lex.h"
+
+using namespace std;
+
+// Collects word-pair counts from a word-aligned parallel corpus.
+// Arguments: target file, source file, alignment file, source-to-target
+// output file, target-to-source output file. The three inputs are read in
+// lockstep, one line per sentence.
+int main(int argc, char* argv[])
+{
+  cerr << "Starting...\n";
+
+  // argv[1] .. argv[5] are all read below, so refuse to run without them.
+  if (argc < 6)
+  {
+    cerr << "Usage: " << (argc > 0 ? argv[0] : "extract-lex")
+         << " target source align lex.s2t lex.t2s\n";
+    return 1;
+  }
+
+  char* &filePathTarget = argv[1];
+  char* &filePathSource = argv[2];
+  char* &filePathAlign = argv[3];
+  char* &filePathLexS2T = argv[4];
+  char* &filePathLexT2S = argv[5];
+
+  ifstream streamTarget;
+  ifstream streamSource;
+  ifstream streamAlign;
+  streamTarget.open(filePathTarget);
+  streamSource.open(filePathSource);
+  streamAlign.open(filePathAlign);
+
+  ofstream streamLexS2T;
+  ofstream streamLexT2S;
+  streamLexS2T.open(filePathLexS2T);
+  streamLexT2S.open(filePathLexT2S);
+
+  // A file that failed to open would otherwise be indistinguishable from
+  // empty input or silently discard the results.
+  if (!streamTarget || !streamSource || !streamAlign
+      || !streamLexS2T || !streamLexT2S)
+  {
+    cerr << "Cannot open one of the input or output files\n";
+    return 1;
+  }
+
+  ExtractLex extractSingleton;
+
+  string lineTarget, lineSource, lineAlign;
+  while (getline(streamTarget, lineTarget))
+  {
+    // Tested explicitly rather than with assert() so that the check is
+    // still made when NDEBUG is defined.
+    if (!getline(streamSource, lineSource) || !getline(streamAlign, lineAlign))
+    {
+      cerr << "Source or alignment file is shorter than the target file\n";
+      return 1;
+    }
+
+    vector<string> toksTarget, toksSource, toksAlign;
+    Tokenize(toksTarget, lineTarget);
+    Tokenize(toksSource, lineSource);
+    Tokenize(toksAlign, lineAlign);
+
+    cerr << endl
+         << toksTarget.size() << " " << lineTarget << endl
+         << toksSource.size() << " " << lineSource << endl
+         << toksAlign.size() << " " << lineAlign << endl;
+
+    extractSingleton.Process(toksTarget, toksSource, toksAlign);
+
+  }
+
+  extractSingleton.Output(streamLexS2T, streamLexT2S);
+
+  streamLexS2T.close();
+  streamLexT2S.close();
+
+  cerr << "Finished\n";
+}
+
+// Returns the address of the copy of word stored in m_coll, inserting the
+// word first when it is not there yet.
+const std::string *Vocab::GetOrAdd(const std::string &word)
+{
+  const string *ret = &(*m_coll.insert(word).first);
+  return ret;
+}
+
+// Handles one sentence pair. Every alignment token is split on "-" into two
+// indices: the first into toksSource, the second into toksTarget.
+void ExtractLex::Process(vector<string> &toksTarget, vector<string> &toksSource, vector<string> &toksAlign)
+{
+  vector<string>::const_iterator iterAlign;
+  for (iterAlign = toksAlign.begin(); iterAlign != toksAlign.end(); ++iterAlign)
+  {
+    const string &alignTok = *iterAlign;
+
+    vector<size_t> alignPos;
+    Tokenize(alignPos, alignTok, "-");
+    assert(alignPos.size() == 2);
+    assert(alignPos[0] < toksSource.size());
+    assert(alignPos[1] < toksTarget.size());
+
+    const string &tmpSource = toksSource[ alignPos[0] ];
+    const string &tmpTarget = toksTarget[ alignPos[1] ];
+
+    const string *source = m_vocab.GetOrAdd(tmpSource);
+    const string *target = m_vocab.GetOrAdd(tmpTarget);
+
+    Process(target, source);
+
+  }
+
+}
+
+// Amount by which a count grows for each aligned word pair seen.
+float COUNT_INCR = 1;
+
+// Records one aligned pair in both direction tables.
+void ExtractLex::Process(const std::string *target, const std::string *source)
+{
+  WordCount tmpWCTarget(target, COUNT_INCR);
+  WordCount tmpWCSource(source, COUNT_INCR);
+
+  Process(tmpWCSource, tmpWCTarget, m_collS2T);
+  Process(tmpWCTarget, tmpWCSource, m_collT2S);
+}
+
+// Adds one observation of the pair (in, out) to coll. Both arguments arrive
+// with a count of COUNT_INCR, so an entry that gets inserted here already
+// accounts for this observation; only a pre-existing entry is incremented.
+void ExtractLex::Process(const WordCount &in, const WordCount &out, std::map<WordCount, WordCountColl> &coll)
+{
+  std::map<WordCount, WordCountColl>::iterator iterMap;
+  WordCountColl *wcColl = NULL;
+  iterMap = coll.find(in);
+  if (iterMap == coll.end())
+  {
+    wcColl = &coll[in];
+  }
+  else
+  {
+    const WordCount &wcIn = iterMap->first;
+
+    //cerr << wcIn << endl;
+    wcIn.AddCount(COUNT_INCR);
+    //cerr << wcIn << endl;
+
+    wcColl = &iterMap->second;
+  }
+
+  assert(in.GetCount() == COUNT_INCR);
+  assert(out.GetCount() == COUNT_INCR);
+  assert(wcColl);
+
+  pair<WordCountColl::iterator, bool> iterSet = wcColl->insert(out);
+  if (!iterSet.second)
+  {
+    // out was already stored: bump the stored entry. A fresh entry must not
+    // be bumped, or the first observation of a pair would be counted twice.
+    const WordCount &outWC = *iterSet.first;
+    outWC.AddCount(COUNT_INCR);
+  }
+}
+
+// Writes the source-to-target table to streamLexS2T and the target-to-source
+// table to streamLexT2S.
+void ExtractLex::Output(std::ofstream &streamLexS2T, std::ofstream &streamLexT2S)
+{
+  Output(m_collS2T, streamLexS2T);
+  Output(m_collT2S, streamLexT2S);
+}
+
+// Writes one line per stored pair: the two words, then the count held by the
+// map key, then the count held by the set entry, separated by single spaces.
+void ExtractLex::Output(const std::map<WordCount, WordCountColl> &coll, std::ofstream &outStream)
+{
+  std::map<WordCount, WordCountColl>::const_iterator iterOuter;
+  for (iterOuter = coll.begin(); iterOuter != coll.end(); ++iterOuter)
+  {
+    const WordCount &in = iterOuter->first;
+    const WordCountColl &outColl = iterOuter->second;
+
+    WordCountColl::const_iterator iterInner;
+    for (iterInner = outColl.begin(); iterInner != outColl.end(); ++iterInner)
+    {
+      const WordCount &out = *iterInner;
+      outStream << in.GetString() << " " << out.GetString()
+                << " " << in.GetCount() << " " << out.GetCount()
+                << endl;
+    }
+  }
+}
+
+// Prints obj as "word(count)".
+std::ostream& operator<<(std::ostream &out, const WordCount &obj)
+{
+  out << obj.GetString() << "(" << obj.GetCount() << ")";
+  return out;
+}
+
+// Adds incr to the count and logs the updated object to stderr.
+void WordCount::AddCount(float incr) const
+{
+  m_count += incr;
+  cerr << *this << endl;
+}
+
+
diff --git a/scripts/training/phrase-extract/extract-lex.h b/scripts/training/phrase-extract/extract-lex.h
new file mode 100644
index 000000000..5e186df16
--- /dev/null
+++ b/scripts/training/phrase-extract/extract-lex.h
@@ -0,0 +1,123 @@
+#pragma once
+
+#include <map>
+#include <set>
+#include <sstream>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <vector>
+
+
+//! convert string to variable of type T. Used for reading floats, ints etc. from files.
+//! Yields a value-initialised T when input cannot be parsed.
+template<typename T>
+inline T Scan(const std::string &input)
+{
+  std::stringstream stream(input);
+  T ret = T();
+  stream >> ret;
+  return ret;
+}
+
+
+//! convert every element of input with the single-value Scan above
+template<typename T>
+inline void Scan(std::vector<T> &output, const std::vector< std::string > &input)
+{
+  output.resize(input.size());
+  for (size_t i = 0 ; i < input.size() ; i++)
+  {
+    output[i] = Scan<T>( input[i] );
+  }
+}
+
+
+//! split str at any character of delimiters and append the pieces to output.
+//! Adjacent delimiters count as one separator, so no empty token is produced.
+inline void Tokenize(std::vector<std::string> &output
+                     , const std::string& str
+                     , const std::string& delimiters = " \t")
+{
+  // Skip delimiters at beginning.
+  std::string::size_type lastPos = str.find_first_not_of(delimiters, 0);
+  // Find the delimiter that ends the first token.
+  std::string::size_type pos = str.find_first_of(delimiters, lastPos);
+
+  while (std::string::npos != pos || std::string::npos != lastPos) {
+    // Found a token, add it to the vector.
+    output.push_back(str.substr(lastPos, pos - lastPos));
+    // Skip delimiters.  Note the "not_of"
+    lastPos = str.find_first_not_of(delimiters, pos);
+    // Find the delimiter that ends the next token.
+    pos = str.find_first_of(delimiters, lastPos);
+  }
+}
+
+//! tokenize input as above, then convert every token to T with Scan
+template<typename T>
+inline void Tokenize( std::vector<T> &output
+                      , const std::string &input
+                      , const std::string& delimiters = " \t")
+{
+  std::vector<std::string> stringVector;
+  Tokenize(stringVector, input, delimiters);
+  return Scan<T>(output, stringVector );
+}
+
+//! a word, held as a pointer to a string owned elsewhere, plus a count.
+//! Ordering looks at the pointer only, and the count is mutable so that it
+//! can be updated while the object is a set element or a map key.
+class WordCount
+{
+  friend std::ostream& operator<<(std::ostream&, const WordCount&);
+public:
+  const std::string *m_str;
+  mutable float m_count;
+
+  WordCount(const std::string *str, float count)
+    :m_str(str)
+    ,m_count(count)
+  {}
+
+  void AddCount(float incr) const;
+
+  // Returned by reference so that callers do not copy the word.
+  const std::string &GetString() const
+  { return *m_str; }
+  float GetCount() const
+  { return m_count; }
+
+  //! orders by the address of the string, not by its text
+  inline bool operator<(const WordCount &other) const
+  {
+    return m_str < other.m_str;
+  }
+};
+
+//! stores one copy of every distinct word and hands out pointers to them
+class Vocab
+{
+  std::set<std::string> m_coll;
+public:
+  const std::string *GetOrAdd(const std::string &word);
+};
+
+typedef std::set<WordCount> WordCountColl;
+
+//! accumulates word-pair counts in both directions and writes them out
+class ExtractLex
+{
+  Vocab m_vocab;
+  std::map<WordCount, WordCountColl> m_collS2T, m_collT2S;
+
+  void Process(const std::string *target, const std::string *source);
+  void Process(const WordCount &in, const WordCount &out, std::map<WordCount, WordCountColl> &coll);
+  void Output(const std::map<WordCount, WordCountColl> &coll, std::ofstream &outStream);
+
+public:
+  void Process(std::vector<std::string> &toksTarget, std::vector<std::string> &toksSource, std::vector<std::string> &toksAlign);
+  void Output(std::ofstream &streamLexS2T, std::ofstream &streamLexT2S);
+
+};
+