2010-02-12 14:05:43 +03:00
|
|
|
#include <sstream>
|
|
|
|
#include "vocab.h"
|
|
|
|
|
|
|
|
namespace Moses {
|
|
|
|
|
2010-04-08 18:52:35 +04:00
|
|
|
// Vocab class
|
2010-04-20 18:09:53 +04:00
|
|
|
void Vocab::InitSpecialWords()
|
|
|
|
{
|
|
|
|
m_kBOSWord = InitSpecialWord(BOS_); // BOS_ is a string <s> (defined in ../typedef.h)
|
|
|
|
m_kEOSWord = InitSpecialWord(EOS_); // EOS_ is a string </s> (defined in ../typedef.h)
|
|
|
|
m_kOOVWord = InitSpecialWord(UNKNOWN_FACTOR); // UNKNOWN_FACTOR also defined in ../typedef.h
|
|
|
|
}
|
|
|
|
|
|
|
|
// Creates a special word from its string form.
// The word is built as an Input word (Input is an enum defined in
// ../typedef.h) with a single factor (factor 0) and isNonTerminal=false.
//
// TODO not sure if this will work properly:
//  - word comparison can fail because the last parameter (isNonTerminal)
//    in function CreateFromString may not match properly created words
//  - special word is Input word but what about Output words?
//  - currently Input/Output variable is not stored in class Word, but in the future???
const Word Vocab::InitSpecialWord( const string& word_str)
{
  // store the special word string as the first factor
  FactorList singleFactor;
  singleFactor.push_back(0);

  Word special;
  special.CreateFromString(Input, singleFactor, word_str, false);
  return special;
}
|
|
|
|
|
|
|
|
// get wordID_t index for word represented as string
|
|
|
|
wordID_t Vocab::GetWordID(const std::string& word_str,
|
|
|
|
const FactorDirection& direction, const FactorList& factors, bool isNonTerminal)
|
|
|
|
{
|
|
|
|
// get id for factored string
|
|
|
|
Word word;
|
|
|
|
word.CreateFromString( direction, factors, word_str, isNonTerminal);
|
|
|
|
return GetWordID( word);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Returns the id of `word`, possibly adding it to the vocab.
//  - known word: its stored id is returned
//  - unknown word, vocab closed: m_kOOVWordID is returned, nothing is added
//  - unknown word, vocab open: the word is registered under the id
//    (current number of words + 1) in both lookup tables
wordID_t Vocab::GetWordID(const Word& word)
{
  const bool isKnown = m_words2ids.count(word) != 0;

  if (!isKnown) {
    if (m_closed) {
      return m_kOOVWordID;
    }

    // update lookup tables
    const wordID_t freshId = m_words2ids.size() + 1;
    m_ids2words[freshId] = word;
    m_words2ids[word] = freshId;
  }

  return m_words2ids[word];
}
|
|
|
|
|
2010-04-20 18:09:53 +04:00
|
|
|
// Returns the word stored under `id`; ids that are not in the vocab
// yield the out-of-vocabulary word m_kOOVWord.
Word& Vocab::GetWord(wordID_t id)
{
  if (m_ids2words.count(id) == 0) {
    return m_kOOVWord;
  }
  return m_ids2words[id];
}
|
|
|
|
|
2010-04-20 18:09:53 +04:00
|
|
|
// True when `id` has an entry in the id -> word table.
bool Vocab::InVocab(wordID_t id)
{
  const bool present = (m_ids2words.count(id) != 0);
  return present;
}
|
2010-02-12 14:05:43 +03:00
|
|
|
|
2010-04-20 18:09:53 +04:00
|
|
|
bool Vocab::InVocab(const Word& word)
|
|
|
|
{
|
|
|
|
return m_words2ids.find(word) != m_words2ids.end();
|
2010-04-08 18:52:35 +04:00
|
|
|
}
|
|
|
|
|
2010-04-20 18:09:53 +04:00
|
|
|
bool Vocab::Save(const std::string & vocab_path)
|
|
|
|
{
|
2010-04-08 18:52:35 +04:00
|
|
|
// save vocab as id -> word
|
|
|
|
FileHandler vcbout(vocab_path, std::ios::out);
|
2010-04-20 18:09:53 +04:00
|
|
|
return Save(&vcbout);
|
2010-04-08 18:52:35 +04:00
|
|
|
}
|
2010-04-20 18:09:53 +04:00
|
|
|
|
|
|
|
// Writes the vocab to an already-opened handler.
// Output layout: one line with the number of entries, then one line per
// entry holding the word, a tab and its id. Always returns true.
bool Vocab::Save(FileHandler* vcbout)
{
  FileHandler& out = *vcbout;

  // entry count first ...
  out << m_ids2words.size() << "\n";

  // ... then each vcb entry
  iterate(m_ids2words, entry) {
    out << entry->second << "\t" << entry->first << "\n";
  }

  return true;
}
|
|
|
|
|
2010-04-20 18:09:53 +04:00
|
|
|
bool Vocab::Load(const std::string & vocab_path, const FactorDirection& direction,
|
|
|
|
const FactorList& factors, bool closed)
|
|
|
|
{
|
2010-04-08 18:52:35 +04:00
|
|
|
FileHandler vcbin(vocab_path, std::ios::in);
|
|
|
|
std::cerr << "Loading vocab from " << vocab_path << std::endl;
|
2010-04-20 18:09:53 +04:00
|
|
|
return Load(&vcbin, direction, factors, closed);
|
2010-04-08 18:52:35 +04:00
|
|
|
}
|
2010-04-20 18:09:53 +04:00
|
|
|
|
|
|
|
// Rebuilds both lookup tables from an already-opened vocab stream.
//
// Expected layout: the first line holds the number of entries; each following
// line holds a word string optionally followed by its numeric id (the file may
// be a plain word list). At most the announced number of entries is read.
//
// Every word is created with the given direction and factor list and with
// isNonTerminal=false. Entries without an id (or with id 0) that are not the
// OOV word get the next sequential id, starting from 1.
//
// Returns false if the header line cannot be read, true otherwise. On success
// m_closed is set to `closed`.
bool Vocab::Load(FileHandler* vcbin, const FactorDirection& direction,
                 const FactorList& factors, bool closed)
{
  // load vocab id -> word mapping
  m_words2ids.clear(); // reset mapping
  m_ids2words.clear();

  std::string line, word_str;

  // The header must be read outside of assert(): with NDEBUG defined the
  // assert expression is not evaluated and the line would never be consumed.
  if (!getline(*vcbin, line)) {
    std::cerr << "Failed to read vocab header line." << std::endl;
    return false;
  }

  std::istringstream first(line.c_str());
  uint32_t vcbsize(0);
  first >> vcbsize;

  uint32_t loadedsize = 0;
  while (loadedsize++ < vcbsize && getline(*vcbin, line)) {
    std::istringstream entry(line.c_str());
    entry >> word_str;

    Word word;
    word.CreateFromString( direction, factors, word_str, false); // TODO set correctly isNonTerminal

    // Start from 0 on every line so that a line without an id is seen as
    // "no id" instead of reusing an indeterminate or previous value.
    wordID_t id = 0;
    entry >> id;

    // may be no id (i.e. file may just be a word list)
    if (id == 0 && word != GetkOOVWord()) {
      id = m_ids2words.size() + 1; // assign ids sequentially starting from 1
    }

    assert(m_ids2words.count(id) == 0 && m_words2ids.count(word) == 0);
    m_ids2words[id] = word;
    m_words2ids[word] = id;
  }

  m_closed = closed; // once loaded fix vocab ?
  std::cerr << "Loaded vocab with " << m_ids2words.size() << " words." << std::endl;
  return true;
}
|
2010-04-20 18:09:53 +04:00
|
|
|
void Vocab::PrintVocab()
|
|
|
|
{
|
|
|
|
iterate(m_ids2words, iter)
|
2010-04-08 18:52:35 +04:00
|
|
|
std::cerr << iter->second << "\t" << iter->first << "\n";
|
2010-04-20 18:09:53 +04:00
|
|
|
iterate(m_words2ids, iter)
|
2010-04-08 18:52:35 +04:00
|
|
|
std::cerr << iter->second << "\t" << iter->first << "\n";
|
|
|
|
}
|
2010-02-12 14:05:43 +03:00
|
|
|
|
|
|
|
} //end namespace
|