2008-06-11 14:52:57 +04:00
|
|
|
#include "LexicalReorderingTable.h"
|
|
|
|
#include "InputFileStream.h"
|
|
|
|
//#include "LVoc.h" //need IPhrase
|
|
|
|
|
|
|
|
#include "StaticData.h"
|
|
|
|
#include "PhraseDictionary.h"
|
|
|
|
#include "GenerationDictionary.h"
|
|
|
|
#include "TargetPhrase.h"
|
|
|
|
#include "TargetPhraseCollection.h"
|
|
|
|
|
2012-08-03 14:04:39 +04:00
|
|
|
#ifndef WIN32
|
|
|
|
#include "CompactPT/LexicalReorderingTableCompact.h"
|
|
|
|
#endif
|
|
|
|
|
2008-10-09 03:51:26 +04:00
|
|
|
namespace Moses
|
|
|
|
{
|
2011-02-24 16:14:42 +03:00
|
|
|
/*
|
2008-06-11 14:52:57 +04:00
|
|
|
* local helper functions
|
|
|
|
*/
|
|
|
|
// Strips leading and trailing spaces from a string.
//
// Only the space character ' ' is removed; tabs, newlines and any other
// whitespace are left untouched.
//
// str: text to trim.
// Returns a copy of str without leading/trailing spaces; the result is empty
// when str is empty or consists solely of spaces.
std::string auxClearString(const std::string& str)
{
  // Work in size_type throughout: narrowing str.size() into an int would
  // wrap for very large strings.
  const std::string::size_type first = str.find_first_not_of(' ');
  if(std::string::npos == first) {
    // nothing but spaces (or nothing at all)
    return std::string();
  }
  const std::string::size_type last = str.find_last_not_of(' ');
  return str.substr(first, last - first + 1);
}
|
|
|
|
|
2011-02-24 16:14:42 +03:00
|
|
|
// Appends every element of tail, in order, to the end of head.
//
// head: phrase that is extended in place.
// tail: phrase whose elements are copied; may be the same object as head, in
//       which case head ends up holding its original content twice.
void auxAppend(IPhrase& head, const IPhrase& tail)
{
  // Snapshot the element count up front: when head and tail alias, tail.size()
  // grows with every push_back and a loop bounded by it would never finish.
  const size_t numToAppend = tail.size();
  head.reserve(head.size() + numToAppend);
  for(size_t i = 0; i < numToAppend; ++i) {
    head.push_back(tail[i]);
  }
}
|
2011-02-24 16:14:42 +03:00
|
|
|
/*
|
2008-06-11 14:52:57 +04:00
|
|
|
* functions for LexicalReorderingTable
|
|
|
|
*/
|
|
|
|
|
2011-02-24 16:14:42 +03:00
|
|
|
// Picks the most compact on-disk representation available for filePath and
// returns a newly allocated table backed by it.
//
// Lookup order:
//   1. <filePath>.minlexr      -> LexicalReorderingTableCompact (non-WIN32 only)
//   2. <filePath>.binlexr.idx  -> LexicalReorderingTableTree
//   3. otherwise               -> LexicalReorderingTableMemory (plain text table)
//
// f_factors / e_factors / c_factors are handed unchanged to the chosen table.
// The object is created with new; presumably the caller is responsible for
// deleting it (TODO confirm against callers).
LexicalReorderingTable* LexicalReorderingTable::LoadAvailable(const std::string& filePath, const FactorList& f_factors, const FactorList& e_factors, const FactorList& c_factors)
{
  //decide use Compact or Tree or Memory table
#ifndef WIN32
  // The compact table header is only included when WIN32 is not defined, so
  // the branch that instantiates it must sit behind the same guard.
  if(FileExists(filePath+".minlexr")) {
    //there exists a compact binary version use that
    VERBOSE(2,"Using compact lexical reordering table" << std::endl);
    return new LexicalReorderingTableCompact(filePath+".minlexr", f_factors, e_factors, c_factors);
  }
#endif
  if(FileExists(filePath+".binlexr.idx")) {
    //there exists a binary version use that
    return new LexicalReorderingTableTree(filePath, f_factors, e_factors, c_factors);
  }
  //use plain memory
  return new LexicalReorderingTableMemory(filePath, f_factors, e_factors, c_factors);
}
|
2008-06-11 14:52:57 +04:00
|
|
|
|
2011-02-24 16:14:42 +03:00
|
|
|
/*
|
2008-06-11 14:52:57 +04:00
|
|
|
* functions for LexicalReorderingTableMemory
|
|
|
|
*/
|
2011-02-24 16:14:42 +03:00
|
|
|
// Builds an in-memory table: the factor lists are forwarded to the base class
// and the whole table is then read from filePath via LoadFromFile().
LexicalReorderingTableMemory::LexicalReorderingTableMemory(
  const std::string& filePath,
  const std::vector<FactorType>& f_factors,
  const std::vector<FactorType>& e_factors,
  const std::vector<FactorType>& c_factors)
  : LexicalReorderingTable(f_factors,
                           e_factors,
                           c_factors)
{
  // eager load: the complete table is parsed during construction
  this->LoadFromFile(filePath);
}
|
|
|
|
|
2011-02-24 16:14:42 +03:00
|
|
|
// Nothing to release explicitly: the destructor body is intentionally empty.
LexicalReorderingTableMemory::~LexicalReorderingTableMemory() {}
|
|
|
|
|
|
|
|
std::vector<float> LexicalReorderingTableMemory::GetScore(const Phrase& f,
|
2011-02-24 16:14:42 +03:00
|
|
|
const Phrase& e,
|
|
|
|
const Phrase& c)
|
|
|
|
{
|
2008-06-11 14:52:57 +04:00
|
|
|
//rather complicated because of const can't use []... as [] might enter new things into std::map
|
|
|
|
//also can't have to be careful with words range if c is empty can't use c.GetSize()-1 will underflow and be large
|
|
|
|
TableType::const_iterator r;
|
|
|
|
std::string key;
|
2011-02-24 16:14:42 +03:00
|
|
|
if(0 == c.GetSize()) {
|
|
|
|
key = MakeKey(f,e,c);
|
|
|
|
r = m_Table.find(key);
|
|
|
|
if(m_Table.end() != r) {
|
|
|
|
return r->second;
|
|
|
|
}
|
2008-06-11 14:52:57 +04:00
|
|
|
} else {
|
2011-02-24 16:14:42 +03:00
|
|
|
//right try from large to smaller context
|
|
|
|
for(size_t i = 0; i <= c.GetSize(); ++i) {
|
|
|
|
Phrase sub_c(c.GetSubString(WordsRange(i,c.GetSize()-1)));
|
|
|
|
key = MakeKey(f,e,sub_c);
|
|
|
|
r = m_Table.find(key);
|
|
|
|
if(m_Table.end() != r) {
|
|
|
|
return r->second;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return Scores();
|
2008-06-11 14:52:57 +04:00
|
|
|
}
|
|
|
|
|
2011-02-24 16:14:42 +03:00
|
|
|
void LexicalReorderingTableMemory::DbgDump(std::ostream* out) const
|
|
|
|
{
|
2008-06-11 14:52:57 +04:00
|
|
|
TableType::const_iterator i;
|
2011-02-24 16:14:42 +03:00
|
|
|
for(i = m_Table.begin(); i != m_Table.end(); ++i) {
|
|
|
|
*out << " key: '" << i->first << "' score: ";
|
|
|
|
*out << "(num scores: " << (i->second).size() << ")";
|
|
|
|
for(size_t j = 0; j < (i->second).size(); ++j) {
|
|
|
|
*out << (i->second)[j] << " ";
|
|
|
|
}
|
|
|
|
*out << "\n";
|
2008-06-11 14:52:57 +04:00
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2011-02-24 16:14:42 +03:00
|
|
|
std::string LexicalReorderingTableMemory::MakeKey(const Phrase& f,
|
|
|
|
const Phrase& e,
|
|
|
|
const Phrase& c) const
|
|
|
|
{
|
2008-06-11 14:52:57 +04:00
|
|
|
/*
|
|
|
|
std::string key;
|
|
|
|
if(!m_FactorsF.empty()){
|
|
|
|
key += f.GetStringRep(m_FactorsF);
|
|
|
|
}
|
|
|
|
if(!m_FactorsE.empty()){
|
|
|
|
if(!key.empty()){
|
|
|
|
key += " ||| ";
|
|
|
|
}
|
|
|
|
key += e.GetStringRep(m_FactorsE);
|
|
|
|
}
|
|
|
|
*/
|
|
|
|
return MakeKey(auxClearString(f.GetStringRep(m_FactorsF)),
|
2011-02-24 16:14:42 +03:00
|
|
|
auxClearString(e.GetStringRep(m_FactorsE)),
|
|
|
|
auxClearString(c.GetStringRep(m_FactorsC)));
|
2008-06-11 14:52:57 +04:00
|
|
|
}
|
|
|
|
|
2011-02-24 16:14:42 +03:00
|
|
|
std::string LexicalReorderingTableMemory::MakeKey(const std::string& f,
|
|
|
|
const std::string& e,
|
|
|
|
const std::string& c) const
|
|
|
|
{
|
2008-06-11 14:52:57 +04:00
|
|
|
std::string key;
|
2011-02-24 16:14:42 +03:00
|
|
|
if(!f.empty()) {
|
2008-06-11 14:52:57 +04:00
|
|
|
key += f;
|
|
|
|
}
|
2011-02-24 16:14:42 +03:00
|
|
|
if(!m_FactorsE.empty()) {
|
|
|
|
if(!key.empty()) {
|
2008-06-11 14:52:57 +04:00
|
|
|
key += "|||";
|
|
|
|
}
|
|
|
|
key += e;
|
|
|
|
}
|
2011-02-24 16:14:42 +03:00
|
|
|
if(!m_FactorsC.empty()) {
|
|
|
|
if(!key.empty()) {
|
2008-06-11 14:52:57 +04:00
|
|
|
key += "|||";
|
|
|
|
}
|
|
|
|
key += c;
|
|
|
|
}
|
|
|
|
return key;
|
|
|
|
}
|
|
|
|
|
2011-02-24 16:14:42 +03:00
|
|
|
void LexicalReorderingTableMemory::LoadFromFile(const std::string& filePath)
|
|
|
|
{
|
2008-06-11 14:52:57 +04:00
|
|
|
std::string fileName = filePath;
|
2011-02-24 16:14:42 +03:00
|
|
|
if(!FileExists(fileName) && FileExists(fileName+".gz")) {
|
|
|
|
fileName += ".gz";
|
2008-06-11 14:52:57 +04:00
|
|
|
}
|
|
|
|
InputFileStream file(fileName);
|
|
|
|
std::string line(""), key("");
|
|
|
|
int numScores = -1;
|
|
|
|
std::cerr << "Loading table into memory...";
|
2011-02-24 16:14:42 +03:00
|
|
|
while(!getline(file, line).eof()) {
|
2008-06-11 14:52:57 +04:00
|
|
|
std::vector<std::string> tokens = TokenizeMultiCharSeparator(line, "|||");
|
2011-02-24 16:14:42 +03:00
|
|
|
int t = 0 ;
|
2008-06-11 14:52:57 +04:00
|
|
|
std::string f(""),e(""),c("");
|
2011-02-24 16:14:42 +03:00
|
|
|
|
|
|
|
if(!m_FactorsF.empty()) {
|
2008-06-11 14:52:57 +04:00
|
|
|
//there should be something for f
|
|
|
|
f = auxClearString(tokens.at(t));
|
|
|
|
++t;
|
|
|
|
}
|
2011-02-24 16:14:42 +03:00
|
|
|
if(!m_FactorsE.empty()) {
|
2008-06-11 14:52:57 +04:00
|
|
|
//there should be something for e
|
|
|
|
e = auxClearString(tokens.at(t));
|
|
|
|
++t;
|
|
|
|
}
|
2011-02-24 16:14:42 +03:00
|
|
|
if(!m_FactorsC.empty()) {
|
2008-06-11 14:52:57 +04:00
|
|
|
//there should be something for c
|
|
|
|
c = auxClearString(tokens.at(t));
|
|
|
|
++t;
|
|
|
|
}
|
|
|
|
//last token are the probs
|
|
|
|
std::vector<float> p = Scan<float>(Tokenize(tokens.at(t)));
|
|
|
|
//sanity check: all lines must have equall number of probs
|
2011-02-24 16:14:42 +03:00
|
|
|
if(-1 == numScores) {
|
2008-06-11 14:52:57 +04:00
|
|
|
numScores = (int)p.size(); //set in first line
|
|
|
|
}
|
2011-02-24 16:14:42 +03:00
|
|
|
if((int)p.size() != numScores) {
|
2008-06-11 14:52:57 +04:00
|
|
|
TRACE_ERR( "found inconsistent number of probabilities... found " << p.size() << " expected " << numScores << std::endl);
|
|
|
|
exit(0);
|
|
|
|
}
|
|
|
|
std::transform(p.begin(),p.end(),p.begin(),TransformScore);
|
|
|
|
std::transform(p.begin(),p.end(),p.begin(),FloorScore);
|
|
|
|
//save it all into our map
|
|
|
|
m_Table[MakeKey(f,e,c)] = p;
|
|
|
|
}
|
|
|
|
std::cerr << "done.\n";
|
|
|
|
}
|
|
|
|
|
2011-02-24 16:14:42 +03:00
|
|
|
/*
|
2008-06-11 14:52:57 +04:00
|
|
|
* functions for LexicalReorderingTableTree
|
|
|
|
*/
|
|
|
|
// Builds a tree-backed table: forwards the factor lists to the base class,
// starts with caching switched off, remembers filePath, and immediately loads
// the prefix tree from "<filePath>.binlexr".
LexicalReorderingTableTree::LexicalReorderingTableTree(
  const std::string& filePath,
  const std::vector<FactorType>& f_factors,
  const std::vector<FactorType>& e_factors,
  const std::vector<FactorType>& c_factors)
  : LexicalReorderingTable(f_factors, e_factors, c_factors),
    m_UseCache(false),
    m_FilePath(filePath)
{
  // m_Table takes over the freshly allocated map before it is populated
  m_Table.reset(new PrefixTreeMap());
  const std::string treeFile = m_FilePath+".binlexr";
  m_Table->Read(treeFile);
}
|
|
|
|
|
2011-02-24 16:14:42 +03:00
|
|
|
// Nothing to release explicitly: the destructor body is intentionally empty.
LexicalReorderingTableTree::~LexicalReorderingTableTree() {}
|
|
|
|
|
2011-02-24 16:14:42 +03:00
|
|
|
// Returns the scores stored for (f, e) in context c.
//
// Flow:
//   1. An empty f or e on a side that has factors configured is not a usable
//      key; an empty Scores() is returned.
//   2. If the (f, e) pair is present in m_Cache, the cached candidates are
//      searched for the context.
//   3. Otherwise the candidates are fetched from the prefix tree; when caching
//      is enabled they are stored for later calls.
//
// Returns an empty Scores() when no candidate matches.
Scores LexicalReorderingTableTree::GetScore(const Phrase& f, const Phrase& e, const Phrase& c)
{
  if( (!m_FactorsF.empty() && 0 == f.GetSize())
      || (!m_FactorsE.empty() && 0 == e.GetSize())) {
    //NOTE: no check for c as c might be empty, e.g. start of sentence
    //not a proper key
    // phi: commented out, since e may be empty (drop-unknown)
    //std::cerr << "Not a proper key!\n";
    return Scores();
  }

  CacheType::iterator i;
  if(m_UseCache) {
    // Reserve the slot with an empty candidate list; if the key was already
    // present the insert is a no-op and the existing entry is used.
    std::pair<CacheType::iterator, bool> r = m_Cache.insert(std::make_pair(MakeCacheKey(f,e),Candidates()));
    if(!r.second) {
      return auxFindScoreForContext((r.first)->second, c);
    }
    i = r.first;
  } else if(!m_Cache.empty()) {
    //although we might not be caching now, cache might be non-empty!
    i = m_Cache.find(MakeCacheKey(f,e));
    if(i != m_Cache.end()) {
      return auxFindScoreForContext(i->second, c);
    }
  }

  //not in cache go to file...
  Candidates cands;
  m_Table->GetCandidates(MakeTableKey(f,e), &cands);
  if(cands.empty()) {
    // the empty placeholder inserted above (if any) already records "no candidates"
    return Scores();
  }

  //cache for future use
  // This has to happen before any return below: otherwise the empty
  // placeholder stays in the cache and every later lookup of this key would
  // come back with empty scores.
  if(m_UseCache) {
    i->second = cands;
  }

  if(m_FactorsC.empty()) {
    // without context factors there is exactly one candidate per key
    CHECK(1 == cands.size());
    return cands[0].GetScore(0);
  }
  return auxFindScoreForContext(cands, c);
}
|
|
|
|
|
2011-02-24 16:14:42 +03:00
|
|
|
// Picks, among cands, the scores whose stored context matches `context`.
//
// Without context factors there is at most one candidate; its scores are
// returned, or an empty Scores() if cands is empty.
//
// With context factors the context is converted to vocabulary ids and matched
// against each candidate's phrase 0, starting with the full context and then
// dropping words from the front one at a time, ending with the empty context.
// The first match wins; an empty Scores() is returned when nothing matches.
Scores LexicalReorderingTableTree::auxFindScoreForContext(const Candidates& cands, const Phrase& context)
{
  if(m_FactorsC.empty()) {
    CHECK(cands.size() <= 1);
    return (1 == cands.size())?(cands[0].GetScore(0)):(Scores());
  }

  std::vector<std::string> cvec;
  for(size_t i = 0; i < context.GetSize(); ++i) {
    cvec.push_back(context.GetWord(i).GetString(m_FactorsC, false));
  }
  IPhrase c = m_Table->ConvertPhrase(cvec,TargetVocId);
  IPhrase sub_c;
  IPhrase::iterator start = c.begin();
  for(size_t j = 0; j <= context.GetSize(); ++j) {
    sub_c.assign(start, c.end());
    for(size_t cand = 0; cand < cands.size(); ++cand) {
      // compare in place; no per-candidate copy of the phrase is needed
      if(cands[cand].GetPhrase(0) == sub_c) {
        return cands[cand].GetScore(0);
      }
    }
    // The empty suffix has just been tried: stop here instead of stepping the
    // iterator past end().
    if(start == c.end()) {
      break;
    }
    ++start;
  }
  return Scores();
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
void LexicalReorderingTableTree::DbgDump(std::ostream* pout){
|
2011-02-24 16:14:42 +03:00
|
|
|
std::ostream& out = *pout;
|
|
|
|
//TODO!
|
2008-06-11 14:52:57 +04:00
|
|
|
}
|
|
|
|
*/
|
|
|
|
|
2011-02-24 16:14:42 +03:00
|
|
|
void LexicalReorderingTableTree::InitializeForInput(const InputType& input)
|
|
|
|
{
|
2008-06-11 14:52:57 +04:00
|
|
|
ClearCache();
|
2011-02-24 16:14:42 +03:00
|
|
|
if(ConfusionNet const* cn = dynamic_cast<ConfusionNet const*>(&input)) {
|
2008-06-11 14:52:57 +04:00
|
|
|
Cache(*cn);
|
2011-02-24 16:14:42 +03:00
|
|
|
} else if(Sentence const* s = dynamic_cast<Sentence const*>(&input)) {
|
2008-06-11 14:52:57 +04:00
|
|
|
// Cache(*s); ... this just takes up too much memory, we cache elsewhere
|
|
|
|
DisableCache();
|
|
|
|
}
|
2009-08-07 20:47:54 +04:00
|
|
|
if (!m_Table.get()) {
|
2011-02-24 16:14:42 +03:00
|
|
|
//load thread specific table.
|
2009-08-07 20:47:54 +04:00
|
|
|
m_Table.reset(new PrefixTreeMap());
|
|
|
|
m_Table->Read(m_FilePath+".binlexr");
|
|
|
|
}
|
2008-06-11 14:52:57 +04:00
|
|
|
};
|
2011-02-24 16:14:42 +03:00
|
|
|
|
|
|
|
bool LexicalReorderingTableTree::Create(std::istream& inFile,
|
|
|
|
const std::string& outFileName)
|
|
|
|
{
|
2008-06-11 14:52:57 +04:00
|
|
|
std::string line;
|
2011-02-24 16:14:42 +03:00
|
|
|
//TRACE_ERR("Entering Create...\n");
|
|
|
|
std::string
|
|
|
|
ofn(outFileName+".binlexr.srctree"),
|
|
|
|
oft(outFileName+".binlexr.tgtdata"),
|
|
|
|
ofi(outFileName+".binlexr.idx"),
|
|
|
|
ofsv(outFileName+".binlexr.voc0"),
|
|
|
|
oftv(outFileName+".binlexr.voc1");
|
|
|
|
|
2008-06-11 14:52:57 +04:00
|
|
|
|
|
|
|
FILE *os = fOpen(ofn.c_str(),"wb");
|
|
|
|
FILE *ot = fOpen(oft.c_str(),"wb");
|
|
|
|
|
|
|
|
//TRACE_ERR("opend files....\n");
|
|
|
|
|
|
|
|
typedef PrefixTreeSA<LabelId,OFF_T> PSA;
|
|
|
|
PSA *psa = new PSA;
|
|
|
|
PSA::setDefault(InvalidOffT);
|
|
|
|
WordVoc* voc[3];
|
2011-02-24 16:14:42 +03:00
|
|
|
|
2008-06-11 14:52:57 +04:00
|
|
|
LabelId currFirstWord = InvalidLabelId;
|
|
|
|
IPhrase currKey;
|
|
|
|
|
|
|
|
Candidates cands;
|
|
|
|
std::vector<OFF_T> vo;
|
|
|
|
size_t lnc = 0;
|
|
|
|
size_t numTokens = 0;
|
|
|
|
size_t numKeyTokens = 0;
|
2011-02-24 16:14:42 +03:00
|
|
|
while(getline(inFile, line)) {
|
2008-06-11 14:52:57 +04:00
|
|
|
++lnc;
|
2011-02-24 16:14:42 +03:00
|
|
|
if(0 == lnc % 10000) {
|
|
|
|
TRACE_ERR(".");
|
|
|
|
}
|
2008-06-11 14:52:57 +04:00
|
|
|
IPhrase key;
|
2010-01-28 15:12:57 +03:00
|
|
|
Scores score;
|
2008-06-11 14:52:57 +04:00
|
|
|
|
|
|
|
std::vector<std::string> tokens = TokenizeMultiCharSeparator(line, "|||");
|
|
|
|
std::string w;
|
2011-02-24 16:14:42 +03:00
|
|
|
if(1 == lnc) {
|
|
|
|
//do some init stuff in the first line
|
|
|
|
numTokens = tokens.size();
|
|
|
|
if(tokens.size() == 2) { //f ||| score
|
|
|
|
numKeyTokens = 1;
|
|
|
|
voc[0] = new WordVoc();
|
|
|
|
voc[1] = 0;
|
|
|
|
} else if(3 == tokens.size() || 4 == tokens.size()) { //either f ||| e ||| score or f ||| e ||| c ||| score
|
|
|
|
numKeyTokens = 2;
|
|
|
|
voc[0] = new WordVoc(); //f voc
|
|
|
|
voc[1] = new WordVoc(); //e voc
|
|
|
|
voc[2] = voc[1]; //c & e share voc
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
//sanity check ALL lines must have same number of tokens
|
2011-11-18 16:07:41 +04:00
|
|
|
CHECK(numTokens == tokens.size());
|
2011-02-24 16:14:42 +03:00
|
|
|
}
|
2012-05-10 16:48:51 +04:00
|
|
|
size_t phrase = 0;
|
2011-02-24 16:14:42 +03:00
|
|
|
for(; phrase < numKeyTokens; ++phrase) {
|
2008-06-11 14:52:57 +04:00
|
|
|
//conditioned on more than just f... need |||
|
2011-02-24 16:14:42 +03:00
|
|
|
if(phrase >=1) {
|
|
|
|
key.push_back(PrefixTreeMap::MagicWord);
|
|
|
|
}
|
2008-06-11 14:52:57 +04:00
|
|
|
std::istringstream is(tokens[phrase]);
|
|
|
|
while(is >> w) {
|
2011-02-24 16:14:42 +03:00
|
|
|
key.push_back(voc[phrase]->add(w));
|
2008-06-11 14:52:57 +04:00
|
|
|
}
|
|
|
|
}
|
|
|
|
//collect all non key phrases, i.e. c
|
|
|
|
std::vector<IPhrase> tgt_phrases;
|
|
|
|
tgt_phrases.resize(numTokens - numKeyTokens - 1);
|
2012-05-10 16:48:51 +04:00
|
|
|
for(size_t j = 0; j < tgt_phrases.size(); ++j, ++phrase) {
|
2008-06-11 14:52:57 +04:00
|
|
|
std::istringstream is(tokens[numKeyTokens + j]);
|
|
|
|
while(is >> w) {
|
2011-02-24 16:14:42 +03:00
|
|
|
tgt_phrases[j].push_back(voc[phrase]->add(w));
|
2008-06-11 14:52:57 +04:00
|
|
|
}
|
|
|
|
}
|
|
|
|
//last token is score
|
|
|
|
std::istringstream is(tokens[numTokens-1]);
|
|
|
|
while(is >> w) {
|
|
|
|
score.push_back(atof(w.c_str()));
|
|
|
|
}
|
|
|
|
//transform score now...
|
|
|
|
std::transform(score.begin(),score.end(),score.begin(),TransformScore);
|
|
|
|
std::transform(score.begin(),score.end(),score.begin(),FloorScore);
|
2010-01-28 15:12:57 +03:00
|
|
|
std::vector<Scores> scores;
|
2008-06-11 14:52:57 +04:00
|
|
|
scores.push_back(score);
|
2011-02-24 16:14:42 +03:00
|
|
|
|
2008-06-11 14:52:57 +04:00
|
|
|
if(key.empty()) {
|
|
|
|
TRACE_ERR("WARNING: empty source phrase in line '"<<line<<"'\n");
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
//first time inits
|
2011-02-24 16:14:42 +03:00
|
|
|
if(currFirstWord == InvalidLabelId) {
|
2008-06-11 14:52:57 +04:00
|
|
|
currFirstWord = key[0];
|
|
|
|
}
|
2011-02-24 16:14:42 +03:00
|
|
|
if(currKey.empty()) {
|
2008-06-11 14:52:57 +04:00
|
|
|
currKey = key;
|
|
|
|
//insert key into tree
|
2011-11-18 16:07:41 +04:00
|
|
|
CHECK(psa);
|
2008-06-11 14:52:57 +04:00
|
|
|
PSA::Data& d = psa->insert(key);
|
2011-02-24 16:14:42 +03:00
|
|
|
if(d == InvalidOffT) {
|
|
|
|
d = fTell(ot);
|
2008-06-11 14:52:57 +04:00
|
|
|
} else {
|
2011-02-24 16:14:42 +03:00
|
|
|
TRACE_ERR("ERROR: source phrase already inserted (A)!\nline(" << lnc << "): '" << line << "\n");
|
|
|
|
return false;
|
2008-06-11 14:52:57 +04:00
|
|
|
}
|
|
|
|
}
|
2011-02-24 16:14:42 +03:00
|
|
|
if(currKey != key) {
|
2008-06-11 14:52:57 +04:00
|
|
|
//ok new key
|
|
|
|
currKey = key;
|
|
|
|
//a) write cands for old key
|
|
|
|
cands.writeBin(ot);
|
|
|
|
cands.clear();
|
|
|
|
//b) check if we need to move on to new tree root
|
2011-02-24 16:14:42 +03:00
|
|
|
if(key[0] != currFirstWord) {
|
|
|
|
// write key prefix tree to file and clear
|
|
|
|
PTF pf;
|
|
|
|
if(currFirstWord >= vo.size()) {
|
|
|
|
vo.resize(currFirstWord+1,InvalidOffT);
|
|
|
|
}
|
|
|
|
vo[currFirstWord] = fTell(os);
|
|
|
|
pf.create(*psa, os);
|
|
|
|
// clear
|
|
|
|
delete psa;
|
|
|
|
psa = new PSA;
|
|
|
|
currFirstWord = key[0];
|
2008-06-11 14:52:57 +04:00
|
|
|
}
|
|
|
|
//c) insert key into tree
|
2011-11-18 16:07:41 +04:00
|
|
|
CHECK(psa);
|
2008-06-11 14:52:57 +04:00
|
|
|
PSA::Data& d = psa->insert(key);
|
2011-02-24 16:14:42 +03:00
|
|
|
if(d == InvalidOffT) {
|
|
|
|
d = fTell(ot);
|
2008-06-11 14:52:57 +04:00
|
|
|
} else {
|
2011-02-24 16:14:42 +03:00
|
|
|
TRACE_ERR("ERROR: source phrase already inserted (A)!\nline(" << lnc << "): '" << line << "\n");
|
|
|
|
return false;
|
2008-06-11 14:52:57 +04:00
|
|
|
}
|
|
|
|
}
|
2011-02-24 16:14:42 +03:00
|
|
|
cands.push_back(GenericCandidate(tgt_phrases, scores));
|
2008-06-11 14:52:57 +04:00
|
|
|
}
|
2012-07-16 12:13:33 +04:00
|
|
|
if (lnc == 0) {
|
|
|
|
TRACE_ERR("ERROR: empty lexicalised reordering file\n" << std::endl);
|
|
|
|
return false;
|
|
|
|
}
|
2008-06-11 14:52:57 +04:00
|
|
|
//flush remainders
|
|
|
|
cands.writeBin(ot);
|
|
|
|
cands.clear();
|
|
|
|
//process last currFirstWord
|
|
|
|
PTF pf;
|
|
|
|
if(currFirstWord >= vo.size()) {
|
|
|
|
vo.resize(currFirstWord+1,InvalidOffT);
|
|
|
|
}
|
|
|
|
vo[currFirstWord] = fTell(os);
|
|
|
|
pf.create(*psa,os);
|
|
|
|
delete psa;
|
|
|
|
psa=0;
|
2011-02-24 16:14:42 +03:00
|
|
|
|
2008-06-11 14:52:57 +04:00
|
|
|
fClose(os);
|
|
|
|
fClose(ot);
|
|
|
|
/*
|
|
|
|
std::vector<size_t> inv;
|
|
|
|
for(size_t i = 0; i < vo.size(); ++i){
|
2011-02-24 16:14:42 +03:00
|
|
|
if(vo[i] == InvalidOffT){
|
2008-06-11 14:52:57 +04:00
|
|
|
inv.push_back(i);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if(inv.size()) {
|
|
|
|
TRACE_ERR("WARNING: there are src voc entries with no phrase "
|
2011-02-24 16:14:42 +03:00
|
|
|
"translation: count "<<inv.size()<<"\n"
|
|
|
|
"There exists phrase translations for "<<vo.size()-inv.size()
|
|
|
|
<<" entries\n");
|
2008-06-11 14:52:57 +04:00
|
|
|
}
|
|
|
|
*/
|
|
|
|
FILE *oi = fOpen(ofi.c_str(),"wb");
|
|
|
|
fWriteVector(oi,vo);
|
|
|
|
fClose(oi);
|
2011-02-24 16:14:42 +03:00
|
|
|
|
|
|
|
if(voc[0]) {
|
|
|
|
voc[0]->Write(ofsv);
|
|
|
|
delete voc[0];
|
2008-06-11 14:52:57 +04:00
|
|
|
}
|
2011-02-24 16:14:42 +03:00
|
|
|
if(voc[1]) {
|
|
|
|
voc[1]->Write(oftv);
|
|
|
|
delete voc[1];
|
2008-06-11 14:52:57 +04:00
|
|
|
}
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2011-02-24 16:14:42 +03:00
|
|
|
std::string LexicalReorderingTableTree::MakeCacheKey(const Phrase& f,
|
|
|
|
const Phrase& e) const
|
|
|
|
{
|
2008-06-11 14:52:57 +04:00
|
|
|
std::string key;
|
2011-02-24 16:14:42 +03:00
|
|
|
if(!m_FactorsF.empty()) {
|
2008-06-11 14:52:57 +04:00
|
|
|
key += auxClearString(f.GetStringRep(m_FactorsF));
|
|
|
|
}
|
2011-02-24 16:14:42 +03:00
|
|
|
if(!m_FactorsE.empty()) {
|
|
|
|
if(!key.empty()) {
|
2008-06-11 14:52:57 +04:00
|
|
|
key += "|||";
|
|
|
|
}
|
|
|
|
key += auxClearString(e.GetStringRep(m_FactorsE));
|
|
|
|
}
|
|
|
|
return key;
|
|
|
|
};
|
|
|
|
|
2011-02-24 16:14:42 +03:00
|
|
|
// Builds the id-sequence key used to query the prefix tree for (f, e).
//
// The words of f are converted with the source vocabulary and the words of e
// with the target vocabulary; each side is included only when factors are
// configured for it. PrefixTreeMap::MagicWord is placed between the two parts
// whenever the key is non-empty at the point the e part is added.
IPhrase LexicalReorderingTableTree::MakeTableKey(const Phrase& f,
    const Phrase& e) const
{
  IPhrase key;

  if(!m_FactorsF.empty()) {
    std::vector<std::string> srcWords;
    for(size_t pos = 0; pos < f.GetSize(); ++pos) {
      srcWords.push_back(f.GetWord(pos).GetString(m_FactorsF, false));
    }
    auxAppend(key, m_Table->ConvertPhrase(srcWords, SourceVocId));
  }

  if(!m_FactorsE.empty()) {
    if(!key.empty()) {
      key.push_back(PrefixTreeMap::MagicWord);
    }
    std::vector<std::string> tgtWords;
    for(size_t pos = 0; pos < e.GetSize(); ++pos) {
      tgtWords.push_back(e.GetWord(pos).GetString(m_FactorsE, false));
    }
    auxAppend(key, m_Table->ConvertPhrase(tgtWords,TargetVocId));
  }

  return key;
}
|
|
|
|
|
|
|
|
|
|
|
|
struct State {
|
2011-02-24 16:14:42 +03:00
|
|
|
State(PPimp* t, const std::string& p) : pos(t), path(p) {
|
2008-06-11 14:52:57 +04:00
|
|
|
}
|
|
|
|
PPimp* pos;
|
|
|
|
std::string path;
|
|
|
|
};
|
|
|
|
|
2011-02-24 16:14:42 +03:00
|
|
|
void LexicalReorderingTableTree::auxCacheForSrcPhrase(const Phrase& f)
|
|
|
|
{
|
|
|
|
if(m_FactorsE.empty()) {
|
|
|
|
//f is all of key...
|
|
|
|
Candidates cands;
|
2011-11-21 14:49:26 +04:00
|
|
|
m_Table->GetCandidates(MakeTableKey(f,Phrase(ARRAY_SIZE_INCR)),&cands);
|
|
|
|
m_Cache[MakeCacheKey(f,Phrase(ARRAY_SIZE_INCR))] = cands;
|
2008-06-11 14:52:57 +04:00
|
|
|
} else {
|
2011-02-24 16:14:42 +03:00
|
|
|
ObjectPool<PPimp> pool;
|
|
|
|
PPimp* pPos = m_Table->GetRoot();
|
|
|
|
//1) goto subtree for f
|
2012-05-10 16:48:51 +04:00
|
|
|
for(size_t i = 0; i < f.GetSize() && 0 != pPos && pPos->isValid(); ++i) {
|
2011-02-24 16:14:42 +03:00
|
|
|
/* old code
|
|
|
|
pPos = m_Table.Extend(pPos, auxClearString(f.GetWord(i).ToString(m_FactorsF)), SourceVocId);
|
|
|
|
*/
|
|
|
|
pPos = m_Table->Extend(pPos, f.GetWord(i).GetString(m_FactorsF, false), SourceVocId);
|
|
|
|
}
|
|
|
|
if(0 != pPos && pPos->isValid()) {
|
|
|
|
pPos = m_Table->Extend(pPos, PrefixTreeMap::MagicWord);
|
|
|
|
}
|
|
|
|
if(0 == pPos || !pPos->isValid()) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
//2) explore whole subtree depth first & cache
|
|
|
|
std::string cache_key = auxClearString(f.GetStringRep(m_FactorsF)) + "|||";
|
|
|
|
|
|
|
|
std::vector<State> stack;
|
|
|
|
stack.push_back(State(pool.get(PPimp(pPos->ptr()->getPtr(pPos->idx),0,0)),""));
|
|
|
|
Candidates cands;
|
|
|
|
while(!stack.empty()) {
|
|
|
|
if(stack.back().pos->isValid()) {
|
|
|
|
LabelId w = stack.back().pos->ptr()->getKey(stack.back().pos->idx);
|
|
|
|
std::string next_path = stack.back().path + " " + m_Table->ConvertWord(w,TargetVocId);
|
|
|
|
//cache this
|
|
|
|
m_Table->GetCandidates(*stack.back().pos,&cands);
|
|
|
|
if(!cands.empty()) {
|
|
|
|
m_Cache[cache_key + auxClearString(next_path)] = cands;
|
|
|
|
}
|
|
|
|
cands.clear();
|
|
|
|
PPimp* next_pos = pool.get(PPimp(stack.back().pos->ptr()->getPtr(stack.back().pos->idx),0,0));
|
|
|
|
++stack.back().pos->idx;
|
|
|
|
stack.push_back(State(next_pos,next_path));
|
|
|
|
} else {
|
|
|
|
stack.pop_back();
|
|
|
|
}
|
|
|
|
}
|
2008-06-11 14:52:57 +04:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2011-02-24 16:14:42 +03:00
|
|
|
// Pre-caching hook for confusion-network input: intentionally does nothing.
void LexicalReorderingTableTree::Cache(const ConfusionNet& /*input*/)
{
}
|
|
|
|
|
2011-02-24 16:14:42 +03:00
|
|
|
void LexicalReorderingTableTree::Cache(const Sentence& input)
|
|
|
|
{
|
2008-06-11 14:52:57 +04:00
|
|
|
//only works with sentences...
|
2010-05-11 01:18:47 +04:00
|
|
|
size_t prev_cache_size = m_Cache.size();
|
|
|
|
size_t max_phrase_length = input.GetSize();
|
2011-02-24 16:14:42 +03:00
|
|
|
for(size_t len = 0; len <= max_phrase_length; ++len) {
|
|
|
|
for(size_t start = 0; start+len <= input.GetSize(); ++start) {
|
|
|
|
Phrase f = input.GetSubString(WordsRange(start, start+len));
|
|
|
|
auxCacheForSrcPhrase(f);
|
|
|
|
}
|
2008-06-11 14:52:57 +04:00
|
|
|
}
|
2011-02-24 16:14:42 +03:00
|
|
|
std::cerr << "Cached " << m_Cache.size() - prev_cache_size << " new primary reordering table keys\n";
|
2008-06-11 14:52:57 +04:00
|
|
|
}
|
|
|
|
/*
|
2011-02-24 16:14:42 +03:00
|
|
|
Pre fetching implementation using Phrase and Generation Dictionaries
|
2008-06-11 14:52:57 +04:00
|
|
|
*//*
|
|
|
|
void LexicalReorderingTableTree::Cache(const ConfusionNet& input){
|
|
|
|
typedef TargetPhraseCollection::iterator Iter;
|
|
|
|
typedef TargetPhraseCollection::const_iterator ConstIter;
|
|
|
|
//not implemented for confusion networks...
|
|
|
|
Sentence const* s = dynamic_cast<Sentence const*>(&input);
|
|
|
|
if(!s){
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
int max_phrase_length = input.GetSize();
|
2011-02-24 16:14:42 +03:00
|
|
|
|
2008-06-11 14:52:57 +04:00
|
|
|
std::vector<PhraseDictionaryBase*> PhraseTables = StaticData::Instance()->GetPhraseDictionaries();
|
|
|
|
//new code:
|
|
|
|
//std::vector<PhraseDictionary*> PhraseTables = StaticData::Instance()->GetPhraseDictionaries();
|
|
|
|
std::vector<GenerationDictionary*> GenTables = StaticData::Instance()->GetGenerationDictionaries();
|
2011-02-24 16:14:42 +03:00
|
|
|
for(size_t len = 1; len <= max_phrase_length; ++len){
|
2008-06-11 14:52:57 +04:00
|
|
|
for(size_t start = 0; start+len <= input.GetSize(); ++start){
|
|
|
|
Phrase f = s->GetSubString(WordsRange(start, start+len));
|
|
|
|
//find all translations of f
|
|
|
|
TargetPhraseCollection list;
|
|
|
|
|
|
|
|
for(size_t t = 0; t < PhraseTables.size(); ++t){
|
|
|
|
//if(doIntersect(PhraseTables[t]->GetOutputFactorMask(),FactorMask(m_FactorsE))){
|
|
|
|
//this table gives us something we need
|
2011-02-24 16:14:42 +03:00
|
|
|
|
2008-06-11 14:52:57 +04:00
|
|
|
const TargetPhraseCollection* new_list = PhraseTables[t]->GetTargetPhraseCollection(f);
|
|
|
|
TargetPhraseCollection curr_list;
|
|
|
|
for(ConstIter i = new_list->begin(); i != new_list->end(); ++i){
|
|
|
|
for(Iter j = list.begin(); j != list.end(); ++j){
|
|
|
|
curr_list.Add((*j)->MergeNext(*(*i)));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if(list.IsEmpty()){
|
|
|
|
list = *new_list;
|
|
|
|
} else {
|
|
|
|
list = curr_list;
|
|
|
|
}
|
|
|
|
//}
|
|
|
|
}
|
|
|
|
for(size_t g = 0; g < GenTables.size(); ++g){
|
|
|
|
//if(doIntersect(GenTables[g]->GetOutputFactorMask(),FactorMask(m_FactorsE))){
|
|
|
|
TargetPhraseCollection curr_list;
|
|
|
|
for(Iter j = list.begin(); j != list.end(); ++j){
|
|
|
|
for(size_t w = 0; w < (*j)->GetSize(); ++w){
|
|
|
|
const OutputWordCollection* words = GenTables[g]->FindWord((*j)->GetWord(w));
|
|
|
|
for(OutputWordCollection::const_iterator i = words->begin(); i != words->end(); ++i){
|
|
|
|
TargetPhrase* p = new TargetPhrase(*(*j));
|
|
|
|
Word& pw = p->GetWord(w);
|
|
|
|
pw.Merge(i->first);
|
|
|
|
curr_list.Add(p);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
list = curr_list;
|
|
|
|
//}
|
|
|
|
}
|
|
|
|
//cache for each translation
|
|
|
|
for(Iter e = list.begin(); e < list.end(); ++e){
|
|
|
|
Candidates cands;
|
|
|
|
m_Table.GetCandidates(MakeTableKey(f,*(*e)), &cands);
|
|
|
|
m_Cache.insert(std::make_pair(MakeCacheKey(f,*(*e)),cands));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
};
|
|
|
|
*/
|
2008-10-09 03:51:26 +04:00
|
|
|
|
|
|
|
}
|
|
|
|
|