2011-07-01 09:40:46 +04:00
|
|
|
#pragma once
|
|
|
|
|
|
|
|
#include <cstddef>
#include <fstream>
#include <iostream>
#include <map>
#include <set>
#include <sstream>
#include <string>
#include <vector>
|
|
|
|
|
2012-06-30 18:43:47 +04:00
|
|
|
namespace MosesTraining
|
|
|
|
{
|
|
|
|
|
2011-07-01 09:40:46 +04:00
|
|
|
/**
 * One node of a nested count table: a floating-point count plus a map of
 * child WordCount entries keyed by word pointer.
 *
 * Keys are `const std::string*`, so the map orders and compares entries by
 * pointer value, not by the text of the string.
 */
class WordCount
{
  /// Stream-output operator; declared here, defined outside this header.
  /// Friendship lets it read the members directly.
  friend std::ostream& operator<<(std::ostream&, const WordCount&);

public:
  /// Count held by this entry; set by the constructors.
  float m_count;

  /// Child entries, keyed by the address of a word string.
  // NOTE(review): WordCount is still an incomplete type on this line. The
  // standard does not list std::map among the containers guaranteed to accept
  // incomplete element types, although mainstream implementations do.
  std::map<const std::string*, WordCount> m_coll;

  /// Creates an entry with a count of zero and no children.
  WordCount()
    : m_count(0) {
  }

  //WordCount(const WordCount &copy);

  /// Creates an entry with the given starting count and no children.
  /// Deliberately left non-explicit so any existing implicit float-to-WordCount
  /// conversions at call sites keep compiling.
  WordCount(float count)
    : m_count(count) {
  }

  /// Declared only; defined outside this header.
  /// Presumably adds @p incr to m_count -- confirm against the definition.
  void AddCount(float incr);

  /// @return mutable access to the child map.
  std::map<const std::string*, WordCount> &GetColl() {
    return m_coll;
  }

  /// @return read-only access to the child map.
  const std::map<const std::string*, WordCount> &GetColl() const {
    return m_coll;
  }

  /// @return the current count, by value.
  float GetCount() const {
    return m_count;
  }
};
|
|
|
|
|
|
|
|
/**
 * Owns a set of distinct word strings and hands out pointers to them.
 *
 * The strings live in a std::set, whose elements keep their addresses when
 * other elements are inserted.
 */
class Vocab
{
  /// Storage for the distinct words.
  std::set<std::string> m_coll;

public:
  /// Declared only; defined outside this header.
  /// Presumably returns the address of the entry in m_coll equal to @p word,
  /// inserting it first if absent -- confirm against the definition.
  const std::string *GetOrAdd(const std::string &word);
};
|
|
|
|
|
|
|
|
class ExtractLex
|
|
|
|
{
|
|
|
|
Vocab m_vocab;
|
2011-07-01 14:22:55 +04:00
|
|
|
std::map<const std::string*, WordCount> m_collS2T, m_collT2S;
|
2011-07-01 09:40:46 +04:00
|
|
|
|
|
|
|
void Process(const std::string *target, const std::string *source);
|
2011-07-01 14:27:47 +04:00
|
|
|
void Process(WordCount &wcIn, const std::string *out);
|
2011-07-07 13:29:03 +04:00
|
|
|
void ProcessUnaligned(std::vector<std::string> &toksTarget, std::vector<std::string> &toksSource
|
|
|
|
, const std::vector<bool> &m_sourceAligned, const std::vector<bool> &m_targetAligned);
|
2011-07-01 14:22:55 +04:00
|
|
|
|
|
|
|
void Output(const std::map<const std::string*, WordCount> &coll, std::ofstream &outStream);
|
2011-07-01 09:40:46 +04:00
|
|
|
|
|
|
|
public:
|
2012-01-04 20:29:31 +04:00
|
|
|
void Process(std::vector<std::string> &toksTarget, std::vector<std::string> &toksSource, std::vector<std::string> &toksAlign, size_t lineCount);
|
2011-07-01 09:40:46 +04:00
|
|
|
void Output(std::ofstream &streamLexS2T, std::ofstream &streamLexT2S);
|
|
|
|
|
|
|
|
};
|
|
|
|
|
2012-06-30 18:43:47 +04:00
|
|
|
} // namespace
|