moses and moses2 both use probingpt lib

This commit is contained in:
Hieu Hoang 2017-02-16 11:30:39 +00:00
parent 07cef43cea
commit a391b84b42
23 changed files with 28 additions and 1647 deletions

View File

@ -2271,9 +2271,14 @@
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/PhraseDictionaryTreeAdaptor.h</locationURI>
</link>
<link>
<name>TranslationModel/ProbingPT</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
<name>TranslationModel/ProbingPT.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT.cpp</locationURI>
</link>
<link>
<name>TranslationModel/ProbingPT.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT.h</locationURI>
</link>
<link>
<name>TranslationModel/RuleTable</name>
@ -3355,106 +3360,6 @@
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
<name>TranslationModel/ProbingPT/Jamfile</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/Jamfile</locationURI>
</link>
<link>
<name>TranslationModel/ProbingPT/ProbingPT.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/ProbingPT.cpp</locationURI>
</link>
<link>
<name>TranslationModel/ProbingPT/ProbingPT.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/ProbingPT.h</locationURI>
</link>
<link>
<name>TranslationModel/ProbingPT/StoreTarget.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/StoreTarget.cpp</locationURI>
</link>
<link>
<name>TranslationModel/ProbingPT/StoreTarget.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/StoreTarget.h</locationURI>
</link>
<link>
<name>TranslationModel/ProbingPT/StoreVocab.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/StoreVocab.cpp</locationURI>
</link>
<link>
<name>TranslationModel/ProbingPT/StoreVocab.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/StoreVocab.h</locationURI>
</link>
<link>
<name>TranslationModel/ProbingPT/bin</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
<name>TranslationModel/ProbingPT/hash.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/hash.cpp</locationURI>
</link>
<link>
<name>TranslationModel/ProbingPT/hash.hh</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/hash.hh</locationURI>
</link>
<link>
<name>TranslationModel/ProbingPT/line_splitter.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/line_splitter.cpp</locationURI>
</link>
<link>
<name>TranslationModel/ProbingPT/line_splitter.hh</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/line_splitter.hh</locationURI>
</link>
<link>
<name>TranslationModel/ProbingPT/probing_hash_utils.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/probing_hash_utils.cpp</locationURI>
</link>
<link>
<name>TranslationModel/ProbingPT/probing_hash_utils.hh</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/probing_hash_utils.hh</locationURI>
</link>
<link>
<name>TranslationModel/ProbingPT/querying.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/querying.cpp</locationURI>
</link>
<link>
<name>TranslationModel/ProbingPT/querying.hh</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/querying.hh</locationURI>
</link>
<link>
<name>TranslationModel/ProbingPT/storing.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/storing.cpp</locationURI>
</link>
<link>
<name>TranslationModel/ProbingPT/storing.hh</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/storing.hh</locationURI>
</link>
<link>
<name>TranslationModel/ProbingPT/vocabid.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/vocabid.cpp</locationURI>
</link>
<link>
<name>TranslationModel/ProbingPT/vocabid.hh</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/vocabid.hh</locationURI>
</link>
<link>
<name>TranslationModel/RuleTable/Loader.h</name>
<type>1</type>
@ -3955,11 +3860,6 @@
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
<name>TranslationModel/ProbingPT/bin/darwin-4.2.1</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
<name>TranslationModel/UG/bin/darwin-4.2.1</name>
<type>2</type>
@ -4515,11 +4415,6 @@
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
<name>TranslationModel/ProbingPT/bin/darwin-4.2.1/release</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
<name>TranslationModel/UG/bin/darwin-4.2.1/release</name>
<type>2</type>
@ -4645,16 +4540,6 @@
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
<name>TranslationModel/ProbingPT/bin/darwin-4.2.1/release/debug-symbols-on</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
<name>TranslationModel/ProbingPT/bin/darwin-4.2.1/release/link-static</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
<name>TranslationModel/UG/bin/darwin-4.2.1/release/debug-symbols-on</name>
<type>2</type>
@ -4700,16 +4585,6 @@
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
<name>TranslationModel/ProbingPT/bin/darwin-4.2.1/release/debug-symbols-on/link-static</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
<name>TranslationModel/ProbingPT/bin/darwin-4.2.1/release/link-static/threading-multi</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
<name>TranslationModel/UG/bin/darwin-4.2.1/release/debug-symbols-on/link-static</name>
<type>2</type>
@ -5350,66 +5225,6 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/CompactPT/bin/darwin-4.2.1/release/link-static/threading-multi/ThrowingFwrite.o</locationURI>
</link>
<link>
<name>TranslationModel/ProbingPT/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
<name>TranslationModel/ProbingPT/bin/darwin-4.2.1/release/link-static/threading-multi/ProbingPT.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/bin/darwin-4.2.1/release/link-static/threading-multi/ProbingPT.o</locationURI>
</link>
<link>
<name>TranslationModel/ProbingPT/bin/darwin-4.2.1/release/link-static/threading-multi/StoreTarget.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/bin/darwin-4.2.1/release/link-static/threading-multi/StoreTarget.o</locationURI>
</link>
<link>
<name>TranslationModel/ProbingPT/bin/darwin-4.2.1/release/link-static/threading-multi/StoreVocab.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/bin/darwin-4.2.1/release/link-static/threading-multi/StoreVocab.o</locationURI>
</link>
<link>
<name>TranslationModel/ProbingPT/bin/darwin-4.2.1/release/link-static/threading-multi/hash.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/bin/darwin-4.2.1/release/link-static/threading-multi/hash.o</locationURI>
</link>
<link>
<name>TranslationModel/ProbingPT/bin/darwin-4.2.1/release/link-static/threading-multi/huffmanish.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/bin/darwin-4.2.1/release/link-static/threading-multi/huffmanish.o</locationURI>
</link>
<link>
<name>TranslationModel/ProbingPT/bin/darwin-4.2.1/release/link-static/threading-multi/line_splitter.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/bin/darwin-4.2.1/release/link-static/threading-multi/line_splitter.o</locationURI>
</link>
<link>
<name>TranslationModel/ProbingPT/bin/darwin-4.2.1/release/link-static/threading-multi/probing_hash_utils.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/bin/darwin-4.2.1/release/link-static/threading-multi/probing_hash_utils.o</locationURI>
</link>
<link>
<name>TranslationModel/ProbingPT/bin/darwin-4.2.1/release/link-static/threading-multi/quering.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/bin/darwin-4.2.1/release/link-static/threading-multi/quering.o</locationURI>
</link>
<link>
<name>TranslationModel/ProbingPT/bin/darwin-4.2.1/release/link-static/threading-multi/querying.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/bin/darwin-4.2.1/release/link-static/threading-multi/querying.o</locationURI>
</link>
<link>
<name>TranslationModel/ProbingPT/bin/darwin-4.2.1/release/link-static/threading-multi/storing.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/bin/darwin-4.2.1/release/link-static/threading-multi/storing.o</locationURI>
</link>
<link>
<name>TranslationModel/ProbingPT/bin/darwin-4.2.1/release/link-static/threading-multi/vocabid.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/bin/darwin-4.2.1/release/link-static/threading-multi/vocabid.o</locationURI>
</link>
<link>
<name>TranslationModel/UG/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi</name>
<type>2</type>
@ -5975,46 +5790,6 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/CompactPT/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/ThrowingFwrite.o</locationURI>
</link>
<link>
<name>TranslationModel/ProbingPT/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/ProbingPT.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/ProbingPT.o</locationURI>
</link>
<link>
<name>TranslationModel/ProbingPT/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/hash.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/hash.o</locationURI>
</link>
<link>
<name>TranslationModel/ProbingPT/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/huffmanish.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/huffmanish.o</locationURI>
</link>
<link>
<name>TranslationModel/ProbingPT/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/line_splitter.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/line_splitter.o</locationURI>
</link>
<link>
<name>TranslationModel/ProbingPT/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/probing_hash_utils.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/probing_hash_utils.o</locationURI>
</link>
<link>
<name>TranslationModel/ProbingPT/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/quering.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/quering.o</locationURI>
</link>
<link>
<name>TranslationModel/ProbingPT/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/storing.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/storing.o</locationURI>
</link>
<link>
<name>TranslationModel/ProbingPT/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/vocabid.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/vocabid.o</locationURI>
</link>
<link>
<name>TranslationModel/UG/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/count-ptable-features</name>
<type>1</type>

View File

@ -122,10 +122,10 @@ vwfiles synlm mmlib mserver headers
FF_Factory.o
LM//LM
TranslationModel/CompactPT//CompactPT
TranslationModel/ProbingPT//ProbingPT
ThreadPool
..//search
../util/double-conversion//double-conversion
../probingpt//probingpt
..//z
../OnDiskPt//OnDiskPt
$(TOP)//boost_filesystem
@ -139,5 +139,5 @@ alias headers-to-install : [ glob-tree *.h ] ;
import testing ;
unit-test moses_test : [ glob *Test.cpp Mock*.cpp FF/*Test.cpp ] ..//boost_filesystem moses headers ..//z ../OnDiskPt//OnDiskPt ..//boost_unit_test_framework ;
unit-test moses_test : [ glob *Test.cpp Mock*.cpp FF/*Test.cpp ] ..//boost_filesystem moses headers ..//z ../OnDiskPt//OnDiskPt ../probingpt//probingpt ..//boost_unit_test_framework ;

View File

@ -5,7 +5,8 @@
#include "moses/TargetPhraseCollection.h"
#include "moses/InputFileStream.h"
#include "moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerSkeleton.h"
#include "querying.hh"
#include "probingpt/querying.hh"
#include "probingpt/probing_hash_utils.hh"
using namespace std;
@ -14,6 +15,7 @@ namespace Moses
ProbingPT::ProbingPT(const std::string &line)
: PhraseDictionary(line,true)
,m_engine(NULL)
,load_method(util::POPULATE_OR_READ)
{
ReadParameters();
@ -31,7 +33,7 @@ void ProbingPT::Load(AllOptions::ptr const& opts)
m_options = opts;
SetFeaturesToApply();
m_engine = new QueryEngine(m_filePath.c_str());
m_engine = new probingpt::QueryEngine(m_filePath.c_str(), load_method);
m_unkId = 456456546456;
@ -256,12 +258,12 @@ TargetPhraseCollection *ProbingPT::CreateTargetPhrases(
TargetPhrase *ProbingPT::CreateTargetPhrase(
const char *&offset) const
{
TargetPhraseInfo *tpInfo = (TargetPhraseInfo*) offset;
probingpt::TargetPhraseInfo *tpInfo = (probingpt::TargetPhraseInfo*) offset;
size_t numRealWords = tpInfo->numWords / m_output.size();
TargetPhrase *tp = new TargetPhrase(this);
offset += sizeof(TargetPhraseInfo);
offset += sizeof(probingpt::TargetPhraseInfo);
// scores
float *scores = (float*) offset;

View File

@ -3,16 +3,20 @@
#include <boost/iostreams/device/mapped_file.hpp>
#include <boost/bimap.hpp>
#include <boost/unordered_map.hpp>
#include "../PhraseDictionary.h"
#include "PhraseDictionary.h"
#include "util/mmap.hh"
namespace probingpt
{
class QueryEngine;
class target_text;
}
namespace Moses
{
class ChartParser;
class ChartCellCollectionBase;
class ChartRuleLookupManager;
class QueryEngine;
class target_text;
class ProbingPT : public PhraseDictionary
{
@ -39,12 +43,13 @@ public:
protected:
QueryEngine *m_engine;
probingpt::QueryEngine *m_engine;
uint64_t m_unkId;
std::vector<uint64_t> m_sourceVocab; // factor id -> pt id
std::vector<const Factor*> m_targetVocab; // pt id -> factor*
std::vector<const AlignmentInfo*> m_aligns;
util::LoadMethod load_method;
boost::iostreams::mapped_file_source file;
const char *data;

View File

@ -1,8 +0,0 @@
local current = "" ;
local includes = ;
fakelib ProbingPT : [ glob *.cpp ] ../..//headers : $(includes) <dependency>$(PT-LOG) : : $(includes) ;
path-constant PT-LOG : bin/pt.log ;
update-if-changed $(PT-LOG) $(current) ;

View File

@ -1,264 +0,0 @@
/*
* StoreTarget.cpp
*
* Created on: 19 Jan 2016
* Author: hieu
*/
#include <boost/foreach.hpp>
#include "StoreTarget.h"
#include "line_splitter.hh"
#include "probing_hash_utils.hh"
#include "moses/OutputFileStream.h"
#include "moses/Util.h"
using namespace std;
namespace Moses
{
/**
 * Open the output files for a binarised table rooted at @p basepath.
 *
 * The target vocabulary is bound to "<basepath>/TargetVocab.dat" and the
 * target phrase collection is created (truncating any existing file) at
 * "<basepath>/TargetColl.dat".
 *
 * Throws a C string literal if the collection file cannot be opened.
 */
StoreTarget::StoreTarget(const std::string &basepath)
  : m_basePath(basepath)
  , m_vocab(basepath + "/TargetVocab.dat")
{
  const std::string targetCollPath = basepath + "/TargetColl.dat";
  const std::ios::openmode openFlags =
    std::ios::out | std::ios::binary | std::ios::ate | std::ios::trunc;

  m_fileTargetColl.open(targetCollPath.c_str(), openFlags);

  const bool opened = m_fileTargetColl.is_open();
  if (!opened) {
    throw "can't create file ";
  }
}
// Closes the target-collection file and writes the target vocabulary to disk.
StoreTarget::~StoreTarget()
{
// Save() empties m_coll, so any rule still queued here was never written out.
assert(m_coll.empty());
m_fileTargetColl.close();
// vocab
m_vocab.Save();
}
uint64_t StoreTarget::Save()
{
uint64_t ret = m_fileTargetColl.tellp();
// save to disk
uint64_t numTP = m_coll.size();
m_fileTargetColl.write((char*) &numTP, sizeof(uint64_t));
for (size_t i = 0; i < m_coll.size(); ++i) {
Save(*m_coll[i]);
}
// clear coll
RemoveAllInColl(m_coll);
m_coll.clear();
// starting position of coll
return ret;
}
/**
 * Write one rule to the target-collection file.
 *
 * Layout written: a raw TargetPhraseInfo header, then every score as a
 * float, then every target vocab id as a uint32_t. Properties are not
 * written yet.
 *
 * @param rule the parsed rule to serialise.
 */
void StoreTarget::Save(const target_text &rule)
{
  // metadata for each tp
  // Value-initialise the header: the struct is dumped byte-for-byte below and
  // its `filler` member is never assigned, so without this the output file
  // would contain indeterminate bytes.
  TargetPhraseInfo tpInfo = TargetPhraseInfo();
  tpInfo.alignTerm = GetAlignId(rule.word_align_term);
  tpInfo.alignNonTerm = GetAlignId(rule.word_align_non_term);
  // NOTE(review): both sizes are narrowed to the header's 16-bit fields.
  tpInfo.numWords = rule.target_phrase.size();
  tpInfo.propLength = rule.property.size();

  m_fileTargetColl.write((char*) &tpInfo, sizeof(TargetPhraseInfo));

  // scores
  for (size_t i = 0; i < rule.prob.size(); ++i) {
    float prob = rule.prob[i];
    m_fileTargetColl.write((char*) &prob, sizeof(prob));
  }

  // tp
  for (size_t i = 0; i < rule.target_phrase.size(); ++i) {
    uint32_t vocabId = rule.target_phrase[i];
    m_fileTargetColl.write((char*) &vocabId, sizeof(vocabId));
  }

  // prop TODO
}
void StoreTarget::SaveAlignment()
{
std::string path = m_basePath + "/Alignments.dat";
OutputFileStream file(path);
BOOST_FOREACH(Alignments::value_type &valPair, m_aligns) {
file << valPair.second << "\t";
const std::vector<size_t> &aligns = valPair.first;
BOOST_FOREACH(size_t align, aligns) {
file << align << " ";
}
file << endl;
}
}
/**
 * Parse one phrase-table line into a target_text rule and queue it in m_coll.
 *
 * @param line     the pre-split fields of the line.
 * @param log_prob when true, scores are stored as FloorScore(log(p)); a result
 *                 of exactly 0 is replaced by a tiny positive value.
 * @param scfg     when true, target words of the form "[...]" are treated as
 *                 non-terminals and their alignments stored separately.
 *
 * The queued rule is owned by m_coll until Save() writes and releases it.
 */
void StoreTarget::Append(const line_text &line, bool log_prob, bool scfg)
{
  target_text *rule = new target_text;

  // target_phrase
  vector<bool> nonTerms; // one flag per target word; only filled when scfg
  util::TokenIter<util::SingleCharacter> it;
  it = util::TokenIter<util::SingleCharacter>(line.target_phrase,
       util::SingleCharacter(' '));
  while (it) {
    StringPiece word = *it;

    if (scfg) {
      // not really sure how to handle factored SCFG and NT
      // Guard against an empty token before indexing its first/last character.
      const bool nonTerm = !word.empty()
                           && word[0] == '['
                           && word[word.size() - 1] == ']';
      nonTerms.push_back(nonTerm);
    }

    // every '|'-separated factor of the word gets its own vocab id
    util::TokenIter<util::SingleCharacter> itFactor;
    itFactor = util::TokenIter<util::SingleCharacter>(word,
               util::SingleCharacter('|'));
    while (itFactor) {
      StringPiece factor = *itFactor;

      string factorStr = factor.as_string();
      uint32_t vocabId = m_vocab.GetVocabId(factorStr);

      rule->target_phrase.push_back(vocabId);

      itFactor++;
    }

    it++;
  }

  // probs
  it = util::TokenIter<util::SingleCharacter>(line.prob,
       util::SingleCharacter(' '));
  while (it) {
    string tok = it->as_string();
    float prob = Scan<float>(tok);

    if (log_prob) {
      prob = FloorScore(log(prob));
      if (prob == 0.0f) prob = 0.0000000001;
    }

    rule->prob.push_back(prob);
    it++;
  }

  // alignment
  it = util::TokenIter<util::SingleCharacter>(line.word_align,
       util::SingleCharacter(' '));
  while (it) {
    string tokPair = Trim(it->as_string());
    if (tokPair.empty()) {
      break;
    }

    vector<size_t> alignPair = Tokenize<size_t>(tokPair, "-");
    assert(alignPair.size() == 2);

    size_t sourcePos = alignPair[0];
    size_t targetPos = alignPair[1];

    // A target position beyond the parsed words cannot be a non-terminal;
    // checking the bound avoids reading past the end of nonTerms.
    const bool nonTerm = scfg
                         && targetPos < nonTerms.size()
                         && nonTerms[targetPos];

    if (nonTerm) {
      rule->word_align_non_term.push_back(sourcePos);
      rule->word_align_non_term.push_back(targetPos);
    } else {
      rule->word_align_term.push_back(sourcePos);
      rule->word_align_term.push_back(targetPos);
    }

    it++;
  }

  // extra scores: lexicalised reordering scores embedded in the property column
  string prop = line.property.as_string();
  AppendLexRO(prop, rule->prob, log_prob);

  // properties
  /*
  for (size_t i = 0; i < prop.size(); ++i) {
    rule->property.push_back(prop[i]);
  }
  */
  m_coll.push_back(rule);
}
uint32_t StoreTarget::GetAlignId(const std::vector<size_t> &align)
{
boost::unordered_map<std::vector<size_t>, uint32_t>::iterator iter =
m_aligns.find(align);
if (iter == m_aligns.end()) {
uint32_t ind = m_aligns.size();
m_aligns[align] = ind;
return ind;
} else {
return iter->second;
}
}
void StoreTarget::AppendLexRO(std::string &prop, std::vector<float> &retvector,
bool log_prob) const
{
size_t startPos = prop.find("{{LexRO ");
if (startPos != string::npos) {
size_t endPos = prop.find("}}", startPos + 8);
string lexProb = prop.substr(startPos + 8, endPos - startPos - 8);
//cerr << "lexProb=" << lexProb << endl;
// append lex probs to pt probs
vector<float> scores = Tokenize<float>(lexProb);
if (log_prob) {
for (size_t i = 0; i < scores.size(); ++i) {
scores[i] = FloorScore(log(scores[i]));
if (scores[i] == 0.0f) scores[i] = 0.0000000001;
}
}
for (size_t i = 0; i < scores.size(); ++i) {
retvector.push_back(scores[i]);
}
// exclude LexRO property from property column
prop = prop.substr(0, startPos)
+ prop.substr(endPos + 2, prop.size() - endPos - 2);
//cerr << "line.property_to_be_binarized=" << line.property_to_be_binarized << "AAAA" << endl;
}
}
} /* namespace Moses */

View File

@ -1,51 +0,0 @@
/*
* StoreTarget.h
*
* Created on: 19 Jan 2016
* Author: hieu
*/
#pragma once
#include <string>
#include <fstream>
#include <vector>
#include <inttypes.h>
#include <boost/unordered_map.hpp>
#include <boost/unordered_set.hpp>
#include "StoreVocab.h"
namespace Moses
{
// Both types are defined with the `struct` key in line_splitter.hh; the
// forward declarations use the same key so the tags match.
struct line_text;
struct target_text;

/**
 * Writes the target side of a binarised phrase table.
 *
 * Rules are queued with Append() and written as one collection by Save().
 * Distinct alignments are numbered on the fly and written by SaveAlignment().
 * The destructor writes the target vocabulary.
 */
class StoreTarget
{
public:
  /// Opens the output files under @p basepath; throws a C string on failure.
  StoreTarget(const std::string &basepath);
  virtual ~StoreTarget();

  /// Writes all queued rules, empties the queue and returns the file offset
  /// at which the written collection starts.
  uint64_t Save();

  /// Writes every registered alignment to "<basepath>/Alignments.dat".
  void SaveAlignment();

  /// Parses @p line into a rule and queues it for the next Save().
  void Append(const line_text &line, bool log_prob, bool scfg);

protected:
  std::string m_basePath;        // directory of the binarised table
  std::fstream m_fileTargetColl; // "<basepath>/TargetColl.dat"
  StoreVocab<uint32_t> m_vocab;  // target factor string -> vocab id

  typedef boost::unordered_map<std::vector<size_t>, uint32_t> Alignments;
  Alignments m_aligns;           // alignment positions -> alignment id

  std::vector<target_text*> m_coll; // rules queued since the last Save()

  /// Returns the id of @p align, assigning a new one if it is unseen.
  uint32_t GetAlignId(const std::vector<size_t> &align);

  /// Writes a single rule: header, scores, then target vocab ids.
  void Save(const target_text &rule);

  /// Moves "{{LexRO ...}}" scores from @p prop into @p retvector.
  void AppendLexRO(std::string &prop, std::vector<float> &retvector,
                   bool log_prob) const;
};
} /* namespace Moses */

View File

@ -1,13 +0,0 @@
/*
* StoreVocab.cpp
*
* Created on: 15 Jun 2016
* Author: hieu
*/
#include <fstream>
#include "StoreVocab.h"
namespace Moses
{
} /* namespace Moses */

View File

@ -1,60 +0,0 @@
/*
* StoreVocab.h
*
* Created on: 15 Jun 2016
* Author: hieu
*/
#pragma once
#include <string>
#include <boost/unordered_map.hpp>
#include "moses/OutputFileStream.h"
#include "moses/Util.h"
namespace Moses
{
template<typename VOCABID>
class StoreVocab
{
protected:
std::string m_path;
typedef boost::unordered_map<std::string, VOCABID> Coll;
Coll m_vocab;
public:
StoreVocab(const std::string &path)
:m_path(path)
{}
virtual ~StoreVocab() {}
VOCABID GetVocabId(const std::string &word) {
typename Coll::iterator iter = m_vocab.find(word);
if (iter == m_vocab.end()) {
VOCABID ind = m_vocab.size() + 1;
m_vocab[word] = ind;
return ind;
} else {
return iter->second;
}
}
void Insert(VOCABID id, const std::string &word) {
m_vocab[word] = id;
}
void Save() {
OutputFileStream strme(m_path);
typename Coll::const_iterator iter;
for (iter = m_vocab.begin(); iter != m_vocab.end(); ++iter) {
strme << iter->first << "\t" << iter->second << std::endl;
}
strme.Close();
}
};
} /* namespace Moses */

View File

@ -1,44 +0,0 @@
#include <iostream>
#include "hash.hh"
using namespace std;
namespace Moses
{
// Hash the bytes of `text` with util::MurmurHashNative.
uint64_t getHash(StringPiece text)
{
  return util::MurmurHashNative(text.data(), text.size());
}
std::vector<uint64_t> getVocabIDs(const StringPiece &textin)
{
//Tokenize
std::vector<uint64_t> output;
util::TokenIter<util::SingleCharacter> itWord(textin, util::SingleCharacter(' '));
while (itWord) {
StringPiece word = *itWord;
uint64_t id = 0;
util::TokenIter<util::SingleCharacter> itFactor(word, util::SingleCharacter('|'));
while (itFactor) {
StringPiece factor = *itFactor;
//cerr << "factor=" << factor << endl;
id += getHash(factor);
itFactor++;
}
output.push_back(id);
itWord++;
}
return output;
}
}

View File

@ -1,17 +0,0 @@
#pragma once
#include "util/string_piece.hh"
#include "util/murmur_hash.hh"
#include "util/string_piece.hh" //Tokenization and work with StringPiece
#include "util/tokenize_piece.hh"
#include <vector>
namespace Moses
{
//Gets the MurmurHash for the given string
uint64_t getHash(StringPiece text);
std::vector<uint64_t> getVocabIDs(const StringPiece &textin);
}

View File

@ -1,103 +0,0 @@
#include "line_splitter.hh"
namespace Moses
{
// Split one phrase-table line on "|||" into its trimmed fields.
//
// Source phrase, target phrase and scores are always read. Word alignment,
// counts, sparse scores and property are read in that order until the line
// runs out; fields that are not present stay default-constructed.
//
// `scfg` is accepted for interface compatibility and currently has no effect.
line_text splitLine(const StringPiece &textin, bool scfg)
{
  (void) scfg;

  typedef util::TokenIter<util::MultiCharacter> FieldIter;

  const char delim[] = "|||";
  line_text output;

  FieldIter field(textin, util::MultiCharacter(delim));

  // the three leading fields
  output.source_phrase = Trim(*field);

  ++field;
  output.target_phrase = Trim(*field);

  ++field;
  output.prob = Trim(*field);

  // the trailing fields, stopping at the first one that is absent
  StringPiece line_text::* const trailing[] = {
    &line_text::word_align,
    &line_text::counts,
    &line_text::sparse_score,
    &line_text::property
  };
  const size_t numTrailing = sizeof(trailing) / sizeof(trailing[0]);

  for (size_t i = 0; i < numTrailing; ++i) {
    ++field;
    if (field == FieldIter::end()) return output;
    output.*trailing[i] = Trim(*field);
  }

  return output;
}
// Parse a word-alignment string such as "0-0 1-2" into a flat byte vector.
//
// Each "a-b" pair contributes two entries, so the caller reads entries 0 and 1
// for the first pair, 2 and 3 for the second, and so on. Values are narrowed
// to unsigned char to save space, as alignment positions are small.
// Returns an empty vector when there are no alignments.
std::vector<unsigned char> splitWordAll1(const StringPiece &textin)
{
  const char delim[] = " ";
  const char delim2[] = "-";
  std::vector<unsigned char> output;

  //Case with no word alignments.
  if (textin.size() == 0) {
    return output;
  }

  //Split on space
  util::TokenIter<util::MultiCharacter> it(textin, util::MultiCharacter(delim));

  //For each int
  while (it) {
    //Split on dash (-)
    util::TokenIter<util::MultiCharacter> itInner(*it,
        util::MultiCharacter(delim2));

    // A StringPiece is not NUL-terminated, so copy each token into a
    // std::string before atoi to keep it from scanning past the token.
    const std::string first(itInner->data(), itInner->size());
    output.push_back((unsigned char) (atoi(first.c_str())));

    itInner++;
    const std::string second(itInner->data(), itInner->size());
    output.push_back((unsigned char) (atoi(second.c_str())));

    it++;
  }

  return output;
}
// Placeholder: currently does nothing and leaves `output` untouched.
void reformatSCFG(line_text &output)
{
}
}

View File

@ -1,57 +0,0 @@
#pragma once
#include "util/string_piece.hh"
#include "util/tokenize_piece.hh"
#include "util/file_piece.hh"
#include <vector>
#include <cstdlib> //atof
#include "util/string_piece.hh" //Tokenization and work with StringPiece
#include "util/tokenize_piece.hh"
#include <vector>
namespace Moses
{
//Struct for holding the raw "|||"-separated fields of one phrase-table line.
//The StringPiece members are assigned by splitLine() from its input text.
struct line_text {
StringPiece source_phrase;
StringPiece target_phrase;
StringPiece prob;
StringPiece word_align;
StringPiece counts;
StringPiece sparse_score;
StringPiece property;
// NOTE(review): not assigned by splitLine(); presumably filled elsewhere -- confirm.
std::string property_to_be_binarized;
};
//Struct for holding the parsed target side of one rule
struct target_text {
std::vector<unsigned int> target_phrase; // vocab ids, one per factor
std::vector<float> prob; // scores
// Alignments are stored flat: source position, then target position, per pair.
std::vector<size_t> word_align_term;
std::vector<size_t> word_align_non_term;
std::vector<char> counts;
std::vector<char> sparse_score;
std::vector<char> property;
// Disabled helper; it still refers to a `word_all1` member that this struct
// does not have.
/*
void Reset()
{
target_phrase.clear();
prob.clear();
word_all1.clear();
counts.clear();
sparse_score.clear();
property.clear();
}
*/
};
//Ask if it's better to have it receive a pointer to a line_text struct
line_text splitLine(const StringPiece &textin, bool scfg);
void reformatSCFG(line_text &output);
std::vector<unsigned char> splitWordAll1(const StringPiece &textin);
}

View File

@ -1,50 +0,0 @@
#include "probing_hash_utils.hh"
namespace Moses
{
// Map the first `size` bytes of `filename` read-only into memory.
//
// Returns the start of the mapping; the caller releases it with
// munmap(ptr, size). On any failure an error is printed with perror() and the
// process exits with EXIT_FAILURE.
char * readTable(const char * filename, size_t size)
{
  const int fd = open(filename, O_RDONLY);
  if (fd == -1) {
    perror("Error opening file for reading");
    exit(EXIT_FAILURE);
  }

  void *mapped = mmap(0, size, PROT_READ, MAP_SHARED, fd, 0);
  if (mapped == MAP_FAILED) {
    // report first so close() cannot overwrite errno
    perror("Error mmapping the file");
    close(fd);
    exit(EXIT_FAILURE);
  }

  // The mapping stays valid after the descriptor is closed; leaving it open
  // leaked one descriptor per call.
  close(fd);

  return static_cast<char *>(mapped);
}
// Write `size` bytes starting at `mem` to `filename` as a binary file,
// replacing any existing content.
void serialize_table(char *mem, size_t size, const std::string &filename)
{
  std::ofstream out;
  out.open(filename.c_str(), std::ios::binary);

  const char *bytes = mem;
  out.write(bytes, size);

  out.close();
}
// Combine the word ids of a source phrase into a single lookup key.
//
// The key is the sum of each id shifted left by its position. This cheap mix
// is used instead of MurmurHash, which was found to be too slow here.
//
// source_phrase: array of `size` word ids.
// Returns 0 for an empty phrase.
uint64_t getKey(const uint64_t source_phrase[], size_t size)
{
  uint64_t key = 0;
  for (size_t i = 0; i < size; i++) {
    // Shifting a 64-bit value by 64 or more is undefined, so keep the shift
    // amount in range. Keys for phrases shorter than 64 words are unchanged.
    // NOTE(review): common 64-bit targets already reduce a variable shift
    // count modulo 64, so longer phrases presumably hash as before -- confirm.
    key += (source_phrase[i] << (i & 63));
  }
  return key;
}
}

View File

@ -1,51 +0,0 @@
#pragma once
#include "util/probing_hash_table.hh"
#include <sys/mman.h>
#include <boost/functional/hash.hpp>
#include <fcntl.h>
#include <fstream>
namespace Moses
{
#define API_VERSION 15
//Hash table entry: the element type of the probing hash table `Table`.
struct Entry {
typedef uint64_t Key;
Key key;
// Accessor pair for the key. NOTE(review): presumably the interface
// util::ProbingHashTable expects of its entry type -- confirm.
Key GetKey() const {
return key;
}
void SetKey(Key to) {
key = to;
}
// Payload stored alongside the key.
uint64_t value;
};
#define NONE std::numeric_limits<uint64_t>::max()
//Define table
typedef util::ProbingHashTable<Entry, boost::hash<uint64_t> > Table;
void serialize_table(char *mem, size_t size, const std::string &filename);
char * readTable(const char * filename, size_t size);
uint64_t getKey(const uint64_t source_phrase[], size_t size);
// Fixed-size header stored in front of every target phrase. It is written to
// the target-collection file as raw bytes (sizeof(TargetPhraseInfo)), so
// changing the members changes the on-disk format.
struct TargetPhraseInfo {
uint32_t alignTerm; // alignment id for terminal alignments
uint32_t alignNonTerm; // alignment id for non-terminal alignments
uint16_t numWords; // number of target vocab ids that follow the scores
uint16_t propLength; // size of the rule's property data
uint16_t filler; // NOTE(review): looks like explicit padding -- confirm
};
}

View File

@ -1,141 +0,0 @@
#include "querying.hh"
#include "util/exception.hh"
using namespace std;
namespace Moses
{
// Load a binarised probing phrase table from the directory `filepath`.
//
// Reads the source vocabulary, the alignment table and the "config" file,
// then memory-maps "probing_hash.dat" as the hash table.
//
// Throws (via UTIL_THROW2 / UTIL_THROW_IF2) if the config file is missing or
// malformed. Terminates the process with EXIT_FAILURE if the stored
// API_VERSION differs from this build's or a required config key is missing.
QueryEngine::QueryEngine(const char * filepath)
{
//Create filepaths
std::string basepath(filepath);
std::string path_to_config = basepath + "/config";
std::string path_to_hashtable = basepath + "/probing_hash.dat";
std::string path_to_source_vocabid = basepath + "/source_vocabids";
std::string alignPath = basepath + "/Alignments.dat";
if (!FileExists(path_to_config)) {
UTIL_THROW2("Binary table doesn't exist is didn't finish binarizing: " << path_to_config);
}
///Source phrase vocabids
read_map(source_vocabids, path_to_source_vocabid.c_str());
// alignments
read_alignments(alignPath);
//Read config file: one "key<TAB>value" pair per line
boost::unordered_map<std::string, std::string> keyValue;
std::ifstream config(path_to_config.c_str());
std::string line;
while (getline(config, line)) {
std::vector<std::string> toks = Tokenize(line, "\t");
UTIL_THROW_IF2(toks.size() != 2, "Wrong config format:" << line);
keyValue[ toks[0] ] = toks[1];
}
bool found;
//Check API version:
int version;
found = Get(keyValue, "API_VERSION", version);
if (!found) {
// A missing version only produces a warning; loading continues.
std::cerr << "Old or corrupted version of ProbingPT. Please rebinarize your phrase tables." << std::endl;
} else if (version != API_VERSION) {
std::cerr << "The ProbingPT API has changed. " << version << "!="
<< API_VERSION << " Please rebinarize your phrase tables." << std::endl;
exit(EXIT_FAILURE);
}
//Get tablesize.
int tablesize;
found = Get(keyValue, "uniq_entries", tablesize);
if (!found) {
std::cerr << "uniq_entries not found" << std::endl;
exit(EXIT_FAILURE);
}
//Number of scores
found = Get(keyValue, "num_scores", num_scores);
if (!found) {
std::cerr << "num_scores not found" << std::endl;
exit(EXIT_FAILURE);
}
//How many scores from lex reordering models
found = Get(keyValue, "num_lex_scores", num_lex_scores);
if (!found) {
std::cerr << "num_lex_scores not found" << std::endl;
exit(EXIT_FAILURE);
}
// have the scores been log() and FloorScore()?
found = Get(keyValue, "log_prob", logProb);
if (!found) {
std::cerr << "logProb not found" << std::endl;
exit(EXIT_FAILURE);
}
config.close();
//Read hashtable
// The byte size is derived from the entry count and a 1.2 multiplier.
// NOTE(review): presumably this must match the value used when the table was built -- confirm.
table_filesize = Table::Size(tablesize, 1.2);
mem = readTable(path_to_hashtable.c_str(), table_filesize);
Table table_init(mem, table_filesize);
table = table_init;
std::cerr << "Initialized successfully! " << std::endl;
}
// Releases the table memory obtained from readTable() in the constructor.
QueryEngine::~QueryEngine()
{
// Unmap the table_filesize bytes that back `table`.
munmap(mem, table_filesize);
}
// Returns the hash-table key for a source phrase of `size` vocabulary ids by
// forwarding to the free function Moses::getKey.
uint64_t QueryEngine::getKey(uint64_t source_phrase[], size_t size) const
{
// A MurmurHashNative-based key was tried and rejected as too slow:
//uint64_t key = util::MurmurHashNative(&source_phrase[0], source_phrase.size());
return Moses::getKey(source_phrase, size);
}
std::pair<bool, uint64_t> QueryEngine::query(uint64_t key)
{
std::pair<bool, uint64_t> ret;
const Entry * entry;
ret.first = table.Find(key, entry);
if (ret.first) {
ret.second = entry->value;
}
return ret;
}
void QueryEngine::read_alignments(const std::string &alignPath)
{
std::ifstream strm(alignPath.c_str());
string line;
while (getline(strm, line)) {
vector<string> toks = Tokenize(line, "\t ");
UTIL_THROW_IF2(toks.size() == 0, "Corrupt alignment file");
uint32_t alignInd = Scan<uint32_t>(toks[0]);
if (alignInd >= alignColl.size()) {
alignColl.resize(alignInd + 1);
}
Alignments &aligns = alignColl[alignInd];
for (size_t i = 1; i < toks.size(); ++i) {
size_t pos = Scan<size_t>(toks[i]);
aligns.push_back(pos);
}
}
}
}

View File

@ -1,66 +0,0 @@
#pragma once
#include <boost/unordered_map.hpp>
#include <sys/stat.h> //For finding size of file
#include "vocabid.hh"
#include <algorithm> //toLower
#include <deque>
#include "probing_hash_utils.hh"
#include "hash.hh" //Includes line splitter
#include "line_splitter.hh"
#include "moses//Util.h"
namespace Moses
{
// Read-only access to a binarized ProbingPT directory: the constructor loads the
// source vocabulary, the alignments, the config values and the hash table;
// query() then looks keys up in that table.
//
// NOTE(review): the destructor munmap()s `mem`, but copy construction and copy
// assignment are not disabled, so copying an instance would unmap the same
// memory twice. Confirm no caller copies this class.
class QueryEngine
{
// Source vocabulary as filled in by read_map(): id -> word string.
std::map<uint64_t, std::string> source_vocabids;
// One alignment set: positions stored as single bytes.
typedef std::vector<unsigned char> Alignments;
// Alignment sets indexed by the number that starts each line of the alignment file.
std::vector<Alignments> alignColl;
// Hash table view over `mem`.
Table table;
char *mem; //Memory for the table, necessary so that we can correctly destroy the object
// Size in bytes of the region at `mem`, as computed by Table::Size().
size_t table_filesize;
// NOTE(review): not set or read anywhere in the visible code.
bool is_reordering;
// Fills alignColl from the file at alignPath.
void read_alignments(const std::string &alignPath);
public:
// Values of the "num_scores", "num_lex_scores" and "log_prob" config keys.
int num_scores;
int num_lex_scores;
bool logProb;
// Loads the table stored in the given directory; see the definition for the files read.
QueryEngine(const char *);
~QueryEngine();
// Returns (found, value) for `key`.
std::pair<bool, uint64_t> query(uint64_t key);
// Read-only view of the source vocabulary.
const std::map<uint64_t, std::string> &getSourceVocab() const {
return source_vocabids;
}
// Read-only view of all alignment sets.
const std::vector<Alignments> &getAlignments() const {
return alignColl;
}
// Key for a source phrase given as `size` vocabulary ids.
uint64_t getKey(uint64_t source_phrase[], size_t size) const;
// Looks `sought` up in `keyValue`. When present, converts the string value to T
// with Scan<T>, stores it in `found` and returns true; otherwise returns false
// and leaves `found` untouched.
template<typename T>
inline bool Get(const boost::unordered_map<std::string, std::string> &keyValue, const std::string &sought, T &found) const {
boost::unordered_map<std::string, std::string>::const_iterator iter = keyValue.find(sought);
if (iter == keyValue.end()) {
return false;
}
const std::string &foundStr = iter->second;
found = Scan<T>(foundStr);
return true;
}
};
}

View File

@ -1,298 +0,0 @@
#include <sys/stat.h>
#include <boost/foreach.hpp>
#include "line_splitter.hh"
#include "storing.hh"
#include "StoreTarget.h"
#include "StoreVocab.h"
#include "moses/Util.h"
#include "moses/InputFileStream.h"
using namespace std;
namespace Moses
{
///////////////////////////////////////////////////////////////////////
// Walks (and extends) the prefix tree along sourcePhrase, starting at word `pos`.
// Only one child is kept per node: when a word not yet among the children
// arrives, the existing children are written to `table` and discarded first.
// The node reached at the end of the phrase is marked done.
void Node::Add(Table &table, const SourcePhrase &sourcePhrase, size_t pos)
{
  if (!(pos < sourcePhrase.size())) {
    // this node was written previously 'cos it has rules
    done = true;
    return;
  }

  const uint64_t vocabId = sourcePhrase[pos];
  Node *next;

  Children::iterator found = m_children.find(vocabId);
  if (found != m_children.end()) {
    next = &found->second;
  } else {
    // Unseen word: flush the other subtrees, then start a fresh child.
    BOOST_FOREACH(Children::value_type &sibling, m_children) {
      sibling.second.Write(table);
    }
    m_children.clear();

    next = &m_children[vocabId];
    assert(!next->done);
    // Child key extends this node's key with the word id shifted by its position.
    next->key = key + (vocabId << pos);
  }

  next->Add(table, sourcePhrase, pos + 1);
}
// Writes this subtree to `table`: children first, then this node.
// A node that is not marked done is inserted with value NONE; a done node is
// skipped here.
void Node::Write(Table &table)
{
  for (Children::iterator it = m_children.begin(); it != m_children.end(); ++it) {
    it->second.Write(table);
  }

  if (done) {
    return;
  }

  Entry placeholder;
  placeholder.value = NONE;
  placeholder.key = key;
  table.Insert(placeholder);
}
///////////////////////////////////////////////////////////////////////
void createProbingPT(const std::string &phrasetable_path,
const std::string &basepath, int num_scores, int num_lex_scores,
bool log_prob, int max_cache_size, bool scfg)
{
std::cerr << "Starting..." << std::endl;
//Get basepath and create directory if missing
mkdir(basepath.c_str(), S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH);
StoreTarget storeTarget(basepath);
//Get uniq lines:
unsigned long uniq_entries = countUniqueSource(phrasetable_path);
//Source phrase vocabids
StoreVocab<uint64_t> sourceVocab(basepath + "/source_vocabids");
//Read the file
util::FilePiece filein(phrasetable_path.c_str());
//Init the probing hash table
size_t size = Table::Size(uniq_entries, 1.2);
char * mem = new char[size];
memset(mem, 0, size);
Table sourceEntries(mem, size);
std::priority_queue<CacheItem*, std::vector<CacheItem*>, CacheItemOrderer> cache;
float totalSourceCount = 0;
//Keep track of the size of each group of target phrases
size_t line_num = 0;
//Read everything and processs
std::string prevSource;
Node sourcePhrases;
sourcePhrases.done = true;
sourcePhrases.key = 0;
while (true) {
try {
//Process line read
line_text line;
line = splitLine(filein.ReadLine(), scfg);
//cerr << "line=" << line.source_phrase << endl;
++line_num;
if (line_num % 1000000 == 0) {
std::cerr << line_num << " " << std::flush;
}
//Add source phrases to vocabularyIDs
add_to_map(sourceVocab, line.source_phrase);
if (prevSource.empty()) {
// 1st line
prevSource = line.source_phrase.as_string();
storeTarget.Append(line, log_prob, scfg);
} else if (prevSource == line.source_phrase) {
//If we still have the same line, just append to it:
storeTarget.Append(line, log_prob, scfg);
} else {
assert(prevSource != line.source_phrase);
//Create a new entry even
// save
uint64_t targetInd = storeTarget.Save();
// next line
storeTarget.Append(line, log_prob, scfg);
//Create an entry for the previous source phrase:
Entry sourceEntry;
sourceEntry.value = targetInd;
//The key is the sum of hashes of individual words bitshifted by their position in the phrase.
//Probably not entirerly correct, but fast and seems to work fine in practise.
std::vector<uint64_t> vocabid_source = getVocabIDs(prevSource);
if (scfg) {
// storing prefixes?
sourcePhrases.Add(sourceEntries, vocabid_source);
}
sourceEntry.key = getKey(vocabid_source);
/*
cerr << "prevSource=" << prevSource << flush
<< " vocabids=" << Debug(vocabid_source) << flush
<< " key=" << sourceEntry.key << endl;
*/
//Put into table
sourceEntries.Insert(sourceEntry);
// update cache - CURRENT source phrase, not prev
if (max_cache_size) {
std::string countStr = line.counts.as_string();
countStr = Trim(countStr);
if (!countStr.empty()) {
std::vector<float> toks = Tokenize<float>(countStr);
//cerr << "CACHE:" << line.source_phrase << " " << countStr << " " << toks[1] << endl;
if (toks.size() >= 2) {
totalSourceCount += toks[1];
// compute key for CURRENT source
std::vector<uint64_t> currVocabidSource = getVocabIDs(line.source_phrase.as_string());
uint64_t currKey = getKey(currVocabidSource);
CacheItem *item = new CacheItem(
Trim(line.source_phrase.as_string()),
currKey,
toks[1]);
cache.push(item);
if (max_cache_size > 0 && cache.size() > max_cache_size) {
cache.pop();
}
}
}
}
//Set prevLine
prevSource = line.source_phrase.as_string();
}
} catch (util::EndOfFileException e) {
std::cerr
<< "Reading phrase table finished, writing remaining files to disk."
<< std::endl;
//After the final entry is constructed we need to add it to the phrase_table
//Create an entry for the previous source phrase:
uint64_t targetInd = storeTarget.Save();
Entry sourceEntry;
sourceEntry.value = targetInd;
//The key is the sum of hashes of individual words. Probably not entirerly correct, but fast
std::vector<uint64_t> vocabid_source = getVocabIDs(prevSource);
sourceEntry.key = getKey(vocabid_source);
//Put into table
sourceEntries.Insert(sourceEntry);
break;
}
}
sourcePhrases.Write(sourceEntries);
storeTarget.SaveAlignment();
serialize_table(mem, size, (basepath + "/probing_hash.dat"));
sourceVocab.Save();
serialize_cache(cache, (basepath + "/cache"), totalSourceCount);
delete[] mem;
//Write configfile
std::ofstream configfile;
configfile.open((basepath + "/config").c_str());
configfile << "API_VERSION\t" << API_VERSION << '\n';
configfile << "uniq_entries\t" << uniq_entries << '\n';
configfile << "num_scores\t" << num_scores << '\n';
configfile << "num_lex_scores\t" << num_lex_scores << '\n';
configfile << "log_prob\t" << log_prob << '\n';
configfile.close();
}
size_t countUniqueSource(const std::string &path)
{
size_t ret = 0;
InputFileStream strme(path);
std::string line, prevSource;
while (std::getline(strme, line)) {
std::vector<std::string> toks = TokenizeMultiCharSeparator(line, "|||");
assert(toks.size() != 0);
if (prevSource != toks[0]) {
prevSource = toks[0];
++ret;
}
}
return ret;
}
// Writes the source-phrase cache to `path` and frees its items.
// The first line is totalSourceCount; every following line is
// "count<TAB>sourceKey<TAB>source". The queue is drained from its top into the
// back of a vector, so items appear in the reverse of pop order.
// On return `cache` is empty and every CacheItem it held has been deleted.
void serialize_cache(
  std::priority_queue<CacheItem*, std::vector<CacheItem*>, CacheItemOrderer> &cache,
  const std::string &path, float totalSourceCount)
{
  std::vector<const CacheItem*> ordered(cache.size());
  for (size_t slot = ordered.size(); !cache.empty(); cache.pop()) {
    ordered[--slot] = cache.top();
  }

  std::ofstream out(path.c_str());
  out << totalSourceCount << std::endl;

  typedef std::vector<const CacheItem*>::const_iterator ItemIter;
  for (ItemIter it = ordered.begin(); it != ordered.end(); ++it) {
    const CacheItem *entry = *it;
    out << entry->count << "\t" << entry->sourceKey << "\t" << entry->source << std::endl;
    delete entry;
  }

  out.close();
}
// Convenience overload: computes the key for a source phrase held in a vector
// by forwarding its data pointer and length to the array-based getKey.
uint64_t getKey(const std::vector<uint64_t> &vocabid_source)
{
return getKey(vocabid_source.data(), vocabid_source.size());
}
// Returns a copy of the first endPos + 1 ids of vocabid_source, i.e. the prefix
// whose last element is vocabid_source[endPos].
// endPos must be a valid index into vocabid_source (checked by assert).
std::vector<uint64_t> CreatePrefix(const std::vector<uint64_t> &vocabid_source, size_t endPos)
{
  assert(endPos < vocabid_source.size());
  const size_t prefixLength = endPos + 1;
  return std::vector<uint64_t>(vocabid_source.begin(),
                               vocabid_source.begin() + prefixLength);
}
}

View File

@ -1,92 +0,0 @@
#pragma once
#include <boost/unordered_set.hpp>
#include <boost/unordered_map.hpp>
#include <cstdio>
#include <sstream>
#include <fstream>
#include <iostream>
#include <string>
#include <queue>
#include <sys/stat.h> //mkdir
#include "hash.hh" //Includes line_splitter
#include "probing_hash_utils.hh"
#include "util/file_piece.hh"
#include "util/file.hh"
#include "vocabid.hh"
namespace Moses
{
typedef std::vector<uint64_t> SourcePhrase;
// Node of the prefix tree built over source phrases while binarizing.
// Add() extends the tree along a phrase; Write() inserts an Entry with value
// NONE into the table for every node that is not marked done.
class Node
{
  // Children keyed by the vocabulary id of the next word.
  typedef boost::unordered_map<uint64_t, Node> Children;
  Children m_children;

public:
  // Hash-table key of the phrase prefix this node stands for.
  uint64_t key;
  // True once the node needs no entry written by Write().
  bool done;

  // key is zero-initialised so a freshly created node never holds an
  // indeterminate value; Add() overwrites it for every child it creates.
  Node()
    :key(0)
    ,done(false)
  {}

  // Inserts sourcePhrase into the tree starting at word index `pos`.
  void Add(Table &table, const SourcePhrase &sourcePhrase, size_t pos = 0);
  // Writes this node's subtree to `table`.
  void Write(Table &table);
};
void createProbingPT(const std::string &phrasetable_path,
const std::string &basepath, int num_scores, int num_lex_scores,
bool log_prob, int max_cache_size, bool scfg);
uint64_t getKey(const std::vector<uint64_t> &source_phrase);
std::vector<uint64_t> CreatePrefix(const std::vector<uint64_t> &vocabid_source, size_t endPos);
// Formats the elements of `vec` for debugging: each element is streamed with
// operator<< and followed by a single space (so the result has a trailing
// space, and is empty for an empty vector).
template<typename T>
std::string Debug(const std::vector<T> &vec)
{
  std::stringstream out;
  typename std::vector<T>::const_iterator it = vec.begin();
  while (it != vec.end()) {
    out << *it << " ";
    ++it;
  }
  return out.str();
}
size_t countUniqueSource(const std::string &path);
// One candidate for the source-phrase cache: the phrase text, its hash-table
// key and the count it was seen with.
class CacheItem
{
public:
  std::string source;
  uint64_t sourceKey;
  float count;

  CacheItem(const std::string &vSource, uint64_t vSourceKey, float vCount)
    : source(vSource), sourceKey(vSourceKey), count(vCount)
  {}

  // Deliberately inverted: an item compares "less" when its count is HIGHER,
  // so a std::priority_queue using this order keeps the lowest count on top.
  bool operator<(const CacheItem &other) const {
    return other.count < count;
  }
};
class CacheItemOrderer
{
public:
bool operator()(const CacheItem* a, const CacheItem* b) const {
return (*a) < (*b);
}
};
void serialize_cache(
std::priority_queue<CacheItem*, std::vector<CacheItem*>, CacheItemOrderer> &cache,
const std::string &path, float totalSourceCount);
}

View File

@ -1,59 +0,0 @@
#include <boost/foreach.hpp>
#include "vocabid.hh"
#include "StoreVocab.h"
#include "moses/Util.h"
namespace Moses
{
// Registers every factor of every word of `textin` in sourceVocab.
// Words are separated by ' ' and the factors inside a word by '|'; each factor
// is inserted under its getHash() value together with its string form.
void add_to_map(StoreVocab<uint64_t> &sourceVocab,
                const StringPiece &textin)
{
  typedef util::TokenIter<util::SingleCharacter> Splitter;

  for (Splitter words(textin, util::SingleCharacter(' ')); words; words++) {
    const StringPiece word = *words;
    for (Splitter factors(word, util::SingleCharacter('|')); factors; factors++) {
      const StringPiece factor = *factors;
      sourceVocab.Insert(getHash(factor), factor.as_string());
    }
  }
}
// Writes `karta` to `filename`, one "id<TAB>string" line per entry, in
// ascending id order (std::map iteration order).
//
// NOTE(review): read_map() parses the second column as the id and the first as
// the string, i.e. the opposite column order, so a file written here does not
// round-trip through read_map(). The format is left unchanged; confirm which
// side is authoritative.
void serialize_map(const std::map<uint64_t, std::string> &karta,
                   const std::string &filename)
{
  std::ofstream os(filename.c_str());

  std::map<uint64_t, std::string>::const_iterator iter;
  for (iter = karta.begin(); iter != karta.end(); ++iter) {
    // '\n' rather than std::endl: same bytes, without a flush per record.
    os << iter->first << '\t' << iter->second << '\n';
  }

  // close() flushes everything that was buffered.
  os.close();
}
// Loads a vocabulary file into `karta`.
// Each line must hold exactly two tab-separated fields, "string<TAB>id"; the
// entry karta[id] = string is stored (a later line with the same id overwrites
// an earlier one). The two-field requirement is checked by assert only.
void read_map(std::map<uint64_t, std::string> &karta, const char* filename)
{
  std::ifstream input(filename);

  for (std::string row; getline(input, row); ) {
    std::vector<std::string> fields = Tokenize(row, "\t");
    assert(fields.size() == 2);

    const uint64_t id = Scan<uint64_t>(fields[1]);
    karta[id] = fields[0];
  }

  //Close the stream after we are done.
  input.close();
}
}

View File

@ -1,29 +0,0 @@
//Serialization
#include <boost/serialization/serialization.hpp>
#include <boost/serialization/map.hpp>
#include <boost/archive/text_iarchive.hpp>
#include <boost/archive/text_oarchive.hpp>
#include <fstream>
#include <iostream>
#include <vector>
#include <map> //Container
#include "hash.hh" //Hash of elements
#include "util/string_piece.hh" //Tokenization and work with StringPiece
#include "util/tokenize_piece.hh"
namespace Moses
{
template<typename VOCABID>
class StoreVocab;
void add_to_map(StoreVocab<uint64_t> &sourceVocab,
const StringPiece &textin);
void serialize_map(const std::map<uint64_t, std::string> &karta,
const std::string &filename);
void read_map(std::map<uint64_t, std::string> &karta, const char* filename);
}

View File

@ -11,6 +11,7 @@ lib probingpt :
vocabid.cpp
OutputFileStream.cpp
InputFileStream.cpp
util.cpp
# ../util/string_piece.cc
# ../util/exception.cc

View File

@ -10,6 +10,7 @@
#include "hash.hh" //Includes line splitter
#include "line_splitter.hh"
#include "moses2/legacy/Util2.h"
#include "util.hh"
namespace probingpt
{
@ -68,7 +69,7 @@ public:
}
const std::string &foundStr = iter->second;
found = Moses2::Scan<T>(foundStr);
found = Scan<T>(foundStr);
return true;
}