mirror of https://github.com/moses-smt/mosesdecoder.git (synced 2024-12-28 14:32:38 +03:00)
rewrite lex prob calc
git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@4058 1f5c12ca-751b-0410-a591-d2e778427230
parent 3585d8bae2
commit 8595b06dce
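For orientation, and not part of the diff itself: the new extract-lex tool below collects word-to-word counts from a word-aligned parallel corpus. Counts of this kind are conventionally turned into lexical translation probabilities of roughly the form

\[
w(t \mid s) = \frac{\operatorname{count}(s,t)}{\sum_{t'} \operatorname{count}(s,t')},
\qquad
w(s \mid t) = \frac{\operatorname{count}(s,t)}{\sum_{s'} \operatorname{count}(s',t)}
\]

with one table per direction (the S2T and T2S output streams in the code); the exact normalisation applied downstream is not shown in this commit.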
@@ -13,6 +13,9 @@ extract: tables-core.o SentenceAlignment.o extract.o
 extract-rules: tables-core.o SentenceAlignment.o SentenceAlignmentWithSyntax.o SyntaxTree.o XmlTree.o HoleCollection.o extract-rules.o
 	$(CXX) $^ -o extract-rules
 
+extract-lex: extract-lex.o
+	$(CXX) $^ -o extract-lex
+
 score: tables-core.o AlignmentPhrase.o score.o PhraseAlignment.o InputFileStream.o
 	$(CXX) $^ -lz -o score
 
@@ -27,3 +30,5 @@ relax-parse: tables-core.o SyntaxTree.o XmlTree.o relax-parse.o
 
 statistics: tables-core.o AlignmentPhrase.o statistics.o
 	$(CXX) $^ -o statistics
+
+
scripts/training/phrase-extract/extract-lex.cpp  172  Normal file
@@ -0,0 +1,172 @@
#include <iostream>
#include <fstream>
#include <cassert>
#include <vector>
#include "extract-lex.h"

using namespace std;

int main(int argc, char* argv[])
{
  cerr << "Starting...\n";

  char* &filePathTarget = argv[1];
  char* &filePathSource = argv[2];
  char* &filePathAlign  = argv[3];
  char* &filePathLexS2T = argv[4];
  char* &filePathLexT2S = argv[5];

  ifstream streamTarget;
  ifstream streamSource;
  ifstream streamAlign;
  streamTarget.open(filePathTarget);
  streamSource.open(filePathSource);
  streamAlign.open(filePathAlign);

  ofstream streamLexS2T;
  ofstream streamLexT2S;
  streamLexS2T.open(filePathLexS2T);
  streamLexT2S.open(filePathLexT2S);

  ExtractLex extractSingleton;

  string lineTarget, lineSource, lineAlign;
  while (getline(streamTarget, lineTarget))
  {
    istream &isSource = getline(streamSource, lineSource);
    assert(isSource);
    istream &isAlign = getline(streamAlign, lineAlign);
    assert(isAlign);

    vector<string> toksTarget, toksSource, toksAlign;
    Tokenize(toksTarget, lineTarget);
    Tokenize(toksSource, lineSource);
    Tokenize(toksAlign, lineAlign);

    cerr << endl
         << toksTarget.size() << " " << lineTarget << endl
         << toksSource.size() << " " << lineSource << endl
         << toksAlign.size() << " " << lineAlign << endl;

    extractSingleton.Process(toksTarget, toksSource, toksAlign);
  }

  extractSingleton.Output(streamLexS2T, streamLexT2S);

  streamLexS2T.close();
  streamLexT2S.close();

  cerr << "Finished\n";
}

const std::string *Vocab::GetOrAdd(const std::string &word)
{
  const string *ret = &(*m_coll.insert(word).first);
  return ret;
}

void ExtractLex::Process(vector<string> &toksTarget, vector<string> &toksSource, vector<string> &toksAlign)
{
  vector<string>::const_iterator iterAlign;
  for (iterAlign = toksAlign.begin(); iterAlign != toksAlign.end(); ++iterAlign)
  {
    const string &alignTok = *iterAlign;

    vector<size_t> alignPos;
    Tokenize(alignPos, alignTok, "-");
    assert(alignPos.size() == 2);
    assert(alignPos[0] < toksSource.size());
    assert(alignPos[1] < toksTarget.size());

    const string &tmpSource = toksSource[ alignPos[0] ];
    const string &tmpTarget = toksTarget[ alignPos[1] ];

    const string *source = m_vocab.GetOrAdd(tmpSource);
    const string *target = m_vocab.GetOrAdd(tmpTarget);

    Process(target, source);
  }
}

float COUNT_INCR = 1;

void ExtractLex::Process(const std::string *target, const std::string *source)
{
  WordCount tmpWCTarget(target, COUNT_INCR);
  WordCount tmpWCSource(source, COUNT_INCR);

  Process(tmpWCSource, tmpWCTarget, m_collS2T);
  Process(tmpWCTarget, tmpWCSource, m_collT2S);
}

void ExtractLex::Process(const WordCount &in, const WordCount &out, std::map<WordCount, WordCountColl> &coll)
{
  std::map<WordCount, WordCountColl>::iterator iterMap;
  // s2t
  WordCountColl *wcColl = NULL;
  iterMap = coll.find(in);
  if (iterMap == coll.end())
  {
    wcColl = &coll[in];
  }
  else
  {
    const WordCount &wcIn = iterMap->first;
    //cerr << wcIn << endl;
    wcIn.AddCount(COUNT_INCR);
    //cerr << wcIn << endl;

    wcColl = &iterMap->second;
  }

  assert(in.GetCount() == COUNT_INCR);
  assert(out.GetCount() == COUNT_INCR);
  assert(wcColl);

  pair<WordCountColl::iterator, bool> iterSet = wcColl->insert(out);
  const WordCount &outWC = *iterSet.first;
  outWC.AddCount(COUNT_INCR);
}

void ExtractLex::Output(std::ofstream &streamLexS2T, std::ofstream &streamLexT2S)
{
  Output(m_collS2T, streamLexS2T);
  Output(m_collT2S, streamLexT2S);
}

void ExtractLex::Output(const std::map<WordCount, WordCountColl> &coll, std::ofstream &outStream)
{
  std::map<WordCount, WordCountColl>::const_iterator iterOuter;
  for (iterOuter = coll.begin(); iterOuter != coll.end(); ++iterOuter)
  {
    const WordCount &in = iterOuter->first;
    const WordCountColl &outColl = iterOuter->second;

    WordCountColl::const_iterator iterInner;
    for (iterInner = outColl.begin(); iterInner != outColl.end(); ++iterInner)
    {
      const WordCount &out = *iterInner;
      outStream << in.GetString() << " " << out.GetString()
                << " " << in.GetCount() << " " << out.GetCount()
                << endl;
    }
  }
}

std::ostream& operator<<(std::ostream &out, const WordCount &obj)
{
  out << obj.GetString() << "(" << obj.GetCount() << ")";
  return out;
}

void WordCount::AddCount(float incr) const
{
  m_count += incr;
  cerr << *this << endl;
}
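Based on the argv layout in main above, the tool expects five arguments: the target-side file, the source-side file, the alignment file, and the two output paths for the source-to-target and target-to-source count tables. Each output line has roughly the form "in out count(in) count(in,out)", given how the counts are accumulated above. A hypothetical post-processing sketch (not part of this commit; the file name and variables are invented for illustration) that turns such lines into conditional probabilities p(out|in) = count(in,out) / count(in):

// read-lex-counts.cpp -- illustration only, not part of the commit
#include <fstream>
#include <iostream>
#include <string>

int main(int argc, char* argv[])
{
  if (argc < 2) return 1;
  std::ifstream counts(argv[1]);   // e.g. the S2T file written by ExtractLex::Output
  std::string in, out;
  double countIn, countOut;
  while (counts >> in >> out >> countIn >> countOut)
  {
    // conditional probability of 'out' given 'in'
    std::cout << in << " " << out << " " << countOut / countIn << std::endl;
  }
  return 0;
}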
scripts/training/phrase-extract/extract-lex.h  112  Normal file
@@ -0,0 +1,112 @@
#pragma once

#include <map>
#include <set>
#include <sstream>
#include <fstream>
#include <iostream>
#include <vector>
#include <string>

//! convert string to variable of type T. Used to read floats, ints etc from files
template<typename T>
inline T Scan(const std::string &input)
{
  std::stringstream stream(input);
  T ret;
  stream >> ret;
  return ret;
}

//! speeded-up version of above
template<typename T>
inline void Scan(std::vector<T> &output, const std::vector< std::string > &input)
{
  output.resize(input.size());
  for (size_t i = 0 ; i < input.size() ; i++)
  {
    output[i] = Scan<T>( input[i] );
  }
}

inline void Tokenize(std::vector<std::string> &output
                     , const std::string& str
                     , const std::string& delimiters = " \t")
{
  // Skip delimiters at beginning.
  std::string::size_type lastPos = str.find_first_not_of(delimiters, 0);
  // Find first "non-delimiter".
  std::string::size_type pos = str.find_first_of(delimiters, lastPos);

  while (std::string::npos != pos || std::string::npos != lastPos) {
    // Found a token, add it to the vector.
    output.push_back(str.substr(lastPos, pos - lastPos));
    // Skip delimiters. Note the "not_of"
    lastPos = str.find_first_not_of(delimiters, pos);
    // Find next "non-delimiter"
    pos = str.find_first_of(delimiters, lastPos);
  }
}

// speeded-up version of above
template<typename T>
inline void Tokenize( std::vector<T> &output
                      , const std::string &input
                      , const std::string& delimiters = " \t")
{
  std::vector<std::string> stringVector;
  Tokenize(stringVector, input, delimiters);
  return Scan<T>(output, stringVector );
}

class WordCount
{
  friend std::ostream& operator<<(std::ostream&, const WordCount&);
public:
  const std::string *m_str;
  mutable float m_count;

  WordCount(const std::string *str, float count)
    :m_str(str)
    ,m_count(count)
  {}

  void AddCount(float incr) const;

  const std::string GetString() const
  { return *m_str; }
  const float GetCount() const
  { return m_count; }

  //! transitive comparison used for adding objects into FactorCollection
  inline bool operator<(const WordCount &other) const
  {
    return m_str < other.m_str;
  }
};

class Vocab
{
  std::set<std::string> m_coll;
public:
  const std::string *GetOrAdd(const std::string &word);
};

typedef std::set<WordCount> WordCountColl;

class ExtractLex
{
  Vocab m_vocab;
  std::map<WordCount, WordCountColl> m_collS2T, m_collT2S;

  void Process(const std::string *target, const std::string *source);
  void Process(const WordCount &in, const WordCount &out, std::map<WordCount, WordCountColl> &coll);
  void Output(const std::map<WordCount, WordCountColl> &coll, std::ofstream &outStream);

public:
  void Process(std::vector<std::string> &toksTarget, std::vector<std::string> &toksSource, std::vector<std::string> &toksAlign);
  void Output(std::ofstream &streamLexS2T, std::ofstream &streamLexT2S);
};
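A standalone illustration (not part of the commit) of how the templated Tokenize and Scan helpers above cooperate: an alignment token such as "3-5" is split on '-' and each piece is converted to a size_t, which is exactly the path ExtractLex::Process takes for every alignment point.

// tokenize-demo.cpp -- illustration only; assumes it is compiled next to extract-lex.h
#include <cassert>
#include <string>
#include <vector>
#include "extract-lex.h"

int main()
{
  std::vector<size_t> alignPos;
  // templated Tokenize splits on '-' and Scans each piece into a size_t
  Tokenize(alignPos, "3-5", "-");
  assert(alignPos.size() == 2);
  assert(alignPos[0] == 3 && alignPos[1] == 5);
  return 0;
}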