2011-07-01 09:40:46 +04:00
|
|
|
#include <iostream>
|
|
|
|
#include <fstream>
|
|
|
|
#include <cassert>
|
|
|
|
#include <vector>
|
|
|
|
#include "extract-lex.h"
|
2011-10-11 10:49:19 +04:00
|
|
|
#include "InputFileStream.h"
|
2011-07-01 09:40:46 +04:00
|
|
|
|
|
|
|
using namespace std;
|
2012-07-01 00:39:10 +04:00
|
|
|
using namespace MosesTraining;
|
2011-07-01 09:40:46 +04:00
|
|
|
|
2011-07-01 14:22:55 +04:00
|
|
|
// Increment passed to WordCount::AddCount() for every observed word / word pair.
float COUNT_INCR = 1;
|
|
|
|
|
2013-05-29 21:16:15 +04:00
|
|
|
/**
 * Configure a stream to print floating-point values in fixed notation
 * with 7 digits after the decimal point.
 *
 * @param stream  the output stream whose formatting flags are changed.
 */
void fix(std::ostream& stream)
{
  // Use the floatfield mask so a previously set std::ios::scientific bit is
  // cleared; fixed|scientific together would select hexfloat output.
  stream.setf(std::ios::fixed, std::ios::floatfield);
  stream.precision(7);
}
|
|
|
|
|
2011-07-01 09:40:46 +04:00
|
|
|
int main(int argc, char* argv[])
|
|
|
|
{
|
|
|
|
cerr << "Starting...\n";
|
2013-05-29 21:16:15 +04:00
|
|
|
|
2011-07-18 09:08:26 +04:00
|
|
|
assert(argc == 6);
|
2011-07-01 09:40:46 +04:00
|
|
|
char* &filePathTarget = argv[1];
|
|
|
|
char* &filePathSource = argv[2];
|
|
|
|
char* &filePathAlign = argv[3];
|
|
|
|
char* &filePathLexS2T = argv[4];
|
|
|
|
char* &filePathLexT2S = argv[5];
|
|
|
|
|
2011-10-11 10:49:19 +04:00
|
|
|
Moses::InputFileStream streamTarget(filePathTarget);
|
|
|
|
Moses::InputFileStream streamSource(filePathSource);
|
|
|
|
Moses::InputFileStream streamAlign(filePathAlign);
|
2011-07-01 09:40:46 +04:00
|
|
|
|
|
|
|
ofstream streamLexS2T;
|
|
|
|
ofstream streamLexT2S;
|
|
|
|
streamLexS2T.open(filePathLexS2T);
|
|
|
|
streamLexT2S.open(filePathLexT2S);
|
|
|
|
|
2011-07-07 13:29:03 +04:00
|
|
|
fix(streamLexS2T);
|
|
|
|
fix(streamLexT2S);
|
|
|
|
|
2011-07-01 09:40:46 +04:00
|
|
|
ExtractLex extractSingleton;
|
|
|
|
|
2011-07-01 14:33:04 +04:00
|
|
|
size_t lineCount = 0;
|
2011-07-01 09:40:46 +04:00
|
|
|
string lineTarget, lineSource, lineAlign;
|
2013-05-29 21:16:15 +04:00
|
|
|
while (getline(streamTarget, lineTarget)) {
|
2011-07-01 14:33:04 +04:00
|
|
|
if (lineCount % 10000 == 0)
|
|
|
|
cerr << lineCount << " ";
|
|
|
|
|
2011-07-01 09:40:46 +04:00
|
|
|
istream &isSource = getline(streamSource, lineSource);
|
|
|
|
assert(isSource);
|
|
|
|
istream &isAlign = getline(streamAlign, lineAlign);
|
|
|
|
assert(isAlign);
|
2013-05-29 21:16:15 +04:00
|
|
|
|
2011-07-01 09:40:46 +04:00
|
|
|
vector<string> toksTarget, toksSource, toksAlign;
|
|
|
|
Tokenize(toksTarget, lineTarget);
|
|
|
|
Tokenize(toksSource, lineSource);
|
|
|
|
Tokenize(toksAlign, lineAlign);
|
|
|
|
|
2011-07-01 14:33:04 +04:00
|
|
|
/*
|
2011-07-01 09:40:46 +04:00
|
|
|
cerr << endl
|
|
|
|
<< toksTarget.size() << " " << lineTarget << endl
|
2013-05-29 21:16:15 +04:00
|
|
|
<< toksSource.size() << " " << lineSource << endl
|
2011-07-01 09:40:46 +04:00
|
|
|
<< toksAlign.size() << " " << lineAlign << endl;
|
2011-07-01 14:33:04 +04:00
|
|
|
*/
|
2011-07-01 09:40:46 +04:00
|
|
|
|
2012-01-04 20:29:31 +04:00
|
|
|
extractSingleton.Process(toksTarget, toksSource, toksAlign, lineCount);
|
2013-05-29 21:16:15 +04:00
|
|
|
|
|
|
|
++lineCount;
|
2011-07-01 09:40:46 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
extractSingleton.Output(streamLexS2T, streamLexT2S);
|
|
|
|
|
2011-10-11 10:49:19 +04:00
|
|
|
streamTarget.Close();
|
|
|
|
streamSource.Close();
|
|
|
|
streamAlign.Close();
|
2011-07-01 09:40:46 +04:00
|
|
|
streamLexS2T.close();
|
|
|
|
streamLexT2S.close();
|
|
|
|
|
2011-07-01 14:33:04 +04:00
|
|
|
cerr << "\nFinished\n";
|
2011-07-01 09:40:46 +04:00
|
|
|
}
|
|
|
|
|
2012-07-01 00:39:10 +04:00
|
|
|
namespace MosesTraining
|
|
|
|
{
|
|
|
|
|
2011-07-01 09:40:46 +04:00
|
|
|
const std::string *Vocab::GetOrAdd(const std::string &word)
|
|
|
|
{
|
2013-05-29 21:16:15 +04:00
|
|
|
const string *ret = &(*m_coll.insert(word).first);
|
2011-07-01 09:40:46 +04:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2012-01-04 20:29:31 +04:00
|
|
|
void ExtractLex::Process(vector<string> &toksTarget, vector<string> &toksSource, vector<string> &toksAlign, size_t lineCount)
|
2011-07-01 09:40:46 +04:00
|
|
|
{
|
2011-07-07 13:29:03 +04:00
|
|
|
std::vector<bool> m_sourceAligned(toksSource.size(), false)
|
2013-05-29 21:16:15 +04:00
|
|
|
, m_targetAligned(toksTarget.size(), false);
|
2011-07-07 13:29:03 +04:00
|
|
|
|
2011-07-01 09:40:46 +04:00
|
|
|
vector<string>::const_iterator iterAlign;
|
2013-05-29 21:16:15 +04:00
|
|
|
for (iterAlign = toksAlign.begin(); iterAlign != toksAlign.end(); ++iterAlign) {
|
2011-07-01 09:40:46 +04:00
|
|
|
const string &alignTok = *iterAlign;
|
2013-05-29 21:16:15 +04:00
|
|
|
|
2011-07-01 09:40:46 +04:00
|
|
|
vector<size_t> alignPos;
|
|
|
|
Tokenize(alignPos, alignTok, "-");
|
|
|
|
assert(alignPos.size() == 2);
|
2012-01-04 20:29:31 +04:00
|
|
|
|
2013-05-29 21:16:15 +04:00
|
|
|
if (alignPos[0] >= toksSource.size()) {
|
|
|
|
cerr << "ERROR: alignment over source length. Alignment " << alignPos[0] << " at line " << lineCount << endl;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (alignPos[1] >= toksTarget.size()) {
|
|
|
|
cerr << "ERROR: alignment over target length. Alignment " << alignPos[1] << " at line " << lineCount << endl;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2011-07-01 09:40:46 +04:00
|
|
|
assert(alignPos[0] < toksSource.size());
|
|
|
|
assert(alignPos[1] < toksTarget.size());
|
|
|
|
|
2011-07-07 13:29:03 +04:00
|
|
|
m_sourceAligned[ alignPos[0] ] = true;
|
|
|
|
m_targetAligned[ alignPos[1] ] = true;
|
|
|
|
|
2011-07-01 09:40:46 +04:00
|
|
|
const string &tmpSource = toksSource[ alignPos[0] ];
|
|
|
|
const string &tmpTarget = toksTarget[ alignPos[1] ];
|
2013-05-29 21:16:15 +04:00
|
|
|
|
2011-07-01 09:40:46 +04:00
|
|
|
const string *source = m_vocab.GetOrAdd(tmpSource);
|
|
|
|
const string *target = m_vocab.GetOrAdd(tmpTarget);
|
|
|
|
|
|
|
|
Process(target, source);
|
2013-05-29 21:16:15 +04:00
|
|
|
|
2011-07-01 09:40:46 +04:00
|
|
|
}
|
|
|
|
|
2011-07-07 13:29:03 +04:00
|
|
|
ProcessUnaligned(toksTarget, toksSource, m_sourceAligned, m_targetAligned);
|
2011-07-01 09:40:46 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
void ExtractLex::Process(const std::string *target, const std::string *source)
|
|
|
|
{
|
2011-07-01 14:22:55 +04:00
|
|
|
WordCount &wcS2T = m_collS2T[source];
|
|
|
|
WordCount &wcT2S = m_collT2S[target];
|
2011-07-01 09:40:46 +04:00
|
|
|
|
2011-07-01 14:22:55 +04:00
|
|
|
wcS2T.AddCount(COUNT_INCR);
|
|
|
|
wcT2S.AddCount(COUNT_INCR);
|
|
|
|
|
|
|
|
Process(wcS2T, target);
|
|
|
|
Process(wcT2S, source);
|
2011-07-01 09:40:46 +04:00
|
|
|
}
|
|
|
|
|
2011-07-01 14:22:55 +04:00
|
|
|
/**
 * Add COUNT_INCR to the entry for `out` in the collection nested inside
 * `wcIn`; operator[] creates the entry if it does not exist yet.
 *
 * @param wcIn  the outer word's counter, which holds the nested collection.
 * @param out   the inner word whose count is incremented.
 */
void ExtractLex::Process(WordCount &wcIn, const std::string *out)
{
  wcIn.GetColl()[out].AddCount(COUNT_INCR);
}
|
|
|
|
|
2011-07-07 13:29:03 +04:00
|
|
|
/**
 * Pair every unaligned word with the special "NULL" word.
 *
 * @param toksTarget       tokens of the target sentence.
 * @param toksSource       tokens of the source sentence.
 * @param m_sourceAligned  flag per source position, true if it was aligned.
 * @param m_targetAligned  flag per target position, true if it was aligned.
 */
void ExtractLex::ProcessUnaligned(vector<string> &toksTarget, vector<string> &toksSource
                                  , const std::vector<bool> &m_sourceAligned, const std::vector<bool> &m_targetAligned)
{
  const string *nullWord = m_vocab.GetOrAdd("NULL");

  // Unaligned source words: "NULL" takes the target role.
  for (size_t idx = 0; idx != m_sourceAligned.size(); ++idx) {
    if (m_sourceAligned[idx])
      continue;

    const string *sourceWord = m_vocab.GetOrAdd(toksSource[idx]);
    Process(nullWord, sourceWord);
  }

  // Unaligned target words: "NULL" takes the source role.
  for (size_t idx = 0; idx != m_targetAligned.size(); ++idx) {
    if (m_targetAligned[idx])
      continue;

    const string *targetWord = m_vocab.GetOrAdd(toksTarget[idx]);
    Process(targetWord, nullWord);
  }
}
|
|
|
|
|
2011-07-01 09:40:46 +04:00
|
|
|
void ExtractLex::Output(std::ofstream &streamLexS2T, std::ofstream &streamLexT2S)
|
|
|
|
{
|
|
|
|
Output(m_collS2T, streamLexS2T);
|
|
|
|
Output(m_collT2S, streamLexT2S);
|
|
|
|
}
|
|
|
|
|
2011-07-01 14:22:55 +04:00
|
|
|
void ExtractLex::Output(const std::map<const std::string*, WordCount> &coll, std::ofstream &outStream)
|
2011-07-01 09:40:46 +04:00
|
|
|
{
|
2011-07-01 14:22:55 +04:00
|
|
|
std::map<const std::string*, WordCount>::const_iterator iterOuter;
|
2013-05-29 21:16:15 +04:00
|
|
|
for (iterOuter = coll.begin(); iterOuter != coll.end(); ++iterOuter) {
|
2011-07-01 14:22:55 +04:00
|
|
|
const string &inStr = *iterOuter->first;
|
|
|
|
const WordCount &inWC = iterOuter->second;
|
|
|
|
|
|
|
|
const std::map<const std::string*, WordCount> &outColl = inWC.GetColl();
|
2011-07-01 09:40:46 +04:00
|
|
|
|
2011-07-01 14:22:55 +04:00
|
|
|
std::map<const std::string*, WordCount>::const_iterator iterInner;
|
2013-05-29 21:16:15 +04:00
|
|
|
for (iterInner = outColl.begin(); iterInner != outColl.end(); ++iterInner) {
|
2011-07-01 14:22:55 +04:00
|
|
|
const string &outStr = *iterInner->first;
|
|
|
|
const WordCount &outWC = iterInner->second;
|
|
|
|
|
|
|
|
float prob = outWC.GetCount() / inWC.GetCount();
|
2011-07-07 13:29:03 +04:00
|
|
|
outStream << outStr << " " << inStr << " " << prob << endl;
|
2011-07-01 09:40:46 +04:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
std::ostream& operator<<(std::ostream &out, const WordCount &obj)
|
|
|
|
{
|
2011-07-01 14:22:55 +04:00
|
|
|
out << "(" << obj.GetCount() << ")";
|
2011-07-01 09:40:46 +04:00
|
|
|
return out;
|
|
|
|
}
|
|
|
|
|
2011-07-01 14:22:55 +04:00
|
|
|
void WordCount::AddCount(float incr)
|
2011-07-01 09:40:46 +04:00
|
|
|
{
|
|
|
|
m_count += incr;
|
|
|
|
}
|
|
|
|
|
2012-07-01 00:39:10 +04:00
|
|
|
} // namespace
|
2011-07-01 09:40:46 +04:00
|
|
|
|