This commit is contained in:
Hieu Hoang 2016-10-06 14:00:32 +01:00
commit b7f1b360be
34 changed files with 1547 additions and 1255 deletions

View File

@ -341,3 +341,5 @@ if [ path.exists $(TOP)/dist ] && $(prefix) != dist {
local temp = [ _shell "mkdir -p $(TOP)/bin" ] ;
local temp = [ _shell "rm -f $(TOP)/bin/moses_chart" ] ;
local temp = [ _shell "cd $(TOP)/bin && ln -s moses moses_chart" ] ;
local temp = [ _shell "cd $(TOP)/bin && ln -s CreateProbingPT CreateProbingPT2" ] ;

View File

@ -1,113 +0,0 @@
#include <string>
#include <boost/program_options.hpp>
#include "util/usage.hh"
#include "TranslationModel/ProbingPT/storing.hh"
#include "legacy/InputFileStream.h"
#include "legacy/OutputFileStream.h"
#include "legacy/Util2.h"
using namespace std;
std::string ReformatSCFGFile(const std::string &path);
int main(int argc, char* argv[])
{
string inPath, outPath;
int num_scores = 4;
int num_lex_scores = 0;
bool log_prob = false;
bool scfg = false;
int max_cache_size = 50000;
namespace po = boost::program_options;
po::options_description desc("Options");
desc.add_options()
("help", "Print help messages")
("input-pt", po::value<string>()->required(), "Text pt")
("output-dir", po::value<string>()->required(), "Directory when binary files will be written")
("num-scores", po::value<int>()->default_value(num_scores), "Number of pt scores")
("num-lex-scores", po::value<int>()->default_value(num_lex_scores), "Number of lexicalized reordering scores")
("log-prob", "log (and floor) probabilities before storing")
("max-cache-size", po::value<int>()->default_value(max_cache_size), "Maximum number of high-count source lines to write to cache file. 0=no cache, negative=no limit")
("scfg", "Rules are SCFG in Moses format (ie. with non-terms and LHS")
;
po::variables_map vm;
try {
po::store(po::parse_command_line(argc, argv, desc),
vm); // can throw
/** --help option
*/
if ( vm.count("help")) {
std::cout << desc << std::endl;
return EXIT_SUCCESS;
}
po::notify(vm); // throws on error, so do after help in case
// there are any problems
} catch(po::error& e) {
std::cerr << "ERROR: " << e.what() << std::endl << std::endl;
std::cerr << desc << std::endl;
return EXIT_FAILURE;
}
if (vm.count("input-pt")) inPath = vm["input-pt"].as<string>();
if (vm.count("output-dir")) outPath = vm["output-dir"].as<string>();
if (vm.count("num-scores")) num_scores = vm["num-scores"].as<int>();
if (vm.count("num-lex-scores")) num_lex_scores = vm["num-lex-scores"].as<int>();
if (vm.count("max-cache-size")) max_cache_size = vm["max-cache-size"].as<int>();
if (vm.count("log-prob")) log_prob = true;
if (vm.count("scfg")) scfg = true;
if (scfg) {
inPath = ReformatSCFGFile(inPath);
}
Moses2::createProbingPT(inPath, outPath, num_scores, num_lex_scores, log_prob, max_cache_size, scfg);
//util::PrintUsage(std::cout);
return 0;
}
/**
 * Rewrite an SCFG rule table so the probing build can consume it:
 * drops the last token of each source side (the LHS non-terminal),
 * passes all other |||-separated columns through unchanged, then
 * shell-sorts the result with LC_ALL=C.
 *
 * @param path  input rule table (possibly gzipped; opened via InputFileStream)
 * @return path of the sorted, gzipped, reformatted table
 */
std::string ReformatSCFGFile(const std::string &path)
{
  Moses2::InputFileStream inFile(path);
  string reformattedPath = path + ".reformat.gz";
  Moses2::OutputFileStream outFile(reformattedPath);

  string line;
  while (getline(inFile, line)) {
    vector<string> toks = Moses2::TokenizeMultiCharSeparator(line, "|||");
    assert(toks.size() >= 3);

    // source: copy all but the trailing LHS non-terminal
    vector<string> sourceToks = Moses2::Tokenize(toks[0], " ");
    for (size_t i = 0; i < sourceToks.size() - 1; ++i) {
      outFile << sourceToks[i] << " ";
    }

    // remaining columns are passed through unchanged
    for (size_t i = 1; i < toks.size(); ++i) {
      outFile << "|||" << toks[i];
    }
    outFile << endl;
  }
  inFile.Close();
  outFile.Close();

  string sortedPath = path + ".reformat.sorted.gz";
  // Fix: original had ".tmp " with a trailing space baked into the path,
  // relying on shell tokenisation to strip it. Use a clean path instead.
  string tmpPath = path + ".tmp";
  string cmd = "mkdir " + tmpPath
               + " && gzip -dc " + reformattedPath + " | LC_ALL=C sort -T " + tmpPath + " | gzip -c > " + sortedPath;
  // Report (rather than silently ignore) shell failures; the sorted
  // output would be missing or truncated in that case.
  if (system(cmd.c_str()) != 0) {
    std::cerr << "WARNING: command failed: " << cmd << std::endl;
  }

  cmd = "rm -rf " + tmpPath + " " + reformattedPath;
  if (system(cmd.c_str()) != 0) {
    std::cerr << "WARNING: cleanup command failed: " << cmd << std::endl;
  }

  return sortedPath;
}

View File

@ -72,7 +72,7 @@ alias deps : ../..//z ../..//boost_iostreams ../..//boost_filesystem ../../mose
TranslationModel/ProbingPT/hash.cpp
TranslationModel/ProbingPT/line_splitter.cpp
TranslationModel/ProbingPT/probing_hash_utils.cpp
TranslationModel/ProbingPT/quering.cpp
TranslationModel/ProbingPT/querying.cpp
TranslationModel/ProbingPT/storing.cpp
TranslationModel/ProbingPT/StoreVocab.cpp
TranslationModel/ProbingPT/StoreTarget.cpp
@ -173,11 +173,10 @@ alias deps : ../..//z ../..//boost_iostreams ../..//boost_filesystem ../../mose
deps ;
exe moses2 : Main.cpp moses2_lib ;
exe CreateProbingPT2 : CreateProbingPT2.cpp moses2_lib ;
if [ xmlrpc ] {
echo "Building Moses2" ;
alias programs : moses2 CreateProbingPT2 ;
alias programs : moses2 ;
}
else {
echo "Not building Moses2" ;

View File

@ -230,6 +230,14 @@ public:
//std::cerr << "destroy " << p << " " << n << std::endl;
}
// return address of values
// C++03 allocator requirement: return the address of a mutable reference.
pointer address (reference value) const {
return &value;
}
// C++03 allocator requirement: const overload of address().
const_pointer address (const_reference value) const {
return &value;
}
MemPool &m_pool;
protected:
};

View File

@ -6,7 +6,7 @@
*/
#include <boost/foreach.hpp>
#include "ProbingPT.h"
#include "quering.hh"
#include "querying.hh"
#include "probing_hash_utils.hh"
#include "util/exception.hh"
#include "../../System.h"

View File

@ -1,4 +1,4 @@
#include "quering.hh"
#include "querying.hh"
#include "util/exception.hh"
#include "../../legacy/Util2.h"
@ -12,10 +12,15 @@ QueryEngine::QueryEngine(const char * filepath)
//Create filepaths
std::string basepath(filepath);
std::string path_to_config = basepath + "/config";
std::string path_to_hashtable = basepath + "/probing_hash.dat";
std::string path_to_source_vocabid = basepath + "/source_vocabids";
std::string alignPath = basepath + "/Alignments.dat";
if (!FileExists(path_to_config)) {
UTIL_THROW2("Binary table doesn't exist is didn't finish binarizing: " << path_to_config);
}
///Source phrase vocabids
read_map(source_vocabids, path_to_source_vocabid.c_str());
@ -25,7 +30,7 @@ QueryEngine::QueryEngine(const char * filepath)
//Read config file
boost::unordered_map<std::string, std::string> keyValue;
std::ifstream config((basepath + "/config").c_str());
std::ifstream config(path_to_config.c_str());
std::string line;
while (getline(config, line)) {
std::vector<std::string> toks = Moses2::Tokenize(line, "\t");

View File

@ -1319,7 +1319,7 @@
<name>FF/PhraseBoundaryFeature.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/PhraseBoundaryFeature.h</locationURI>
</link>
</link>
<link>
<name>FF/PhraseDistanceFeature.cpp</name>
<type>1</type>
@ -3340,6 +3340,26 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/ProbingPT.h</locationURI>
</link>
<link>
<name>TranslationModel/ProbingPT/StoreTarget.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/StoreTarget.cpp</locationURI>
</link>
<link>
<name>TranslationModel/ProbingPT/StoreTarget.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/StoreTarget.h</locationURI>
</link>
<link>
<name>TranslationModel/ProbingPT/StoreVocab.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/StoreVocab.cpp</locationURI>
</link>
<link>
<name>TranslationModel/ProbingPT/StoreVocab.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/StoreVocab.h</locationURI>
</link>
<link>
<name>TranslationModel/ProbingPT/hash.cpp</name>
<type>1</type>
@ -3350,16 +3370,6 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/hash.hh</locationURI>
</link>
<link>
<name>TranslationModel/ProbingPT/huffmanish.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/huffmanish.cpp</locationURI>
</link>
<link>
<name>TranslationModel/ProbingPT/huffmanish.hh</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/huffmanish.hh</locationURI>
</link>
<link>
<name>TranslationModel/ProbingPT/line_splitter.cpp</name>
<type>1</type>
@ -3381,14 +3391,14 @@
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/probing_hash_utils.hh</locationURI>
</link>
<link>
<name>TranslationModel/ProbingPT/quering.cpp</name>
<name>TranslationModel/ProbingPT/querying.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/quering.cpp</locationURI>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/querying.cpp</locationURI>
</link>
<link>
<name>TranslationModel/ProbingPT/quering.hh</name>
<name>TranslationModel/ProbingPT/querying.hh</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/quering.hh</locationURI>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/querying.hh</locationURI>
</link>
<link>
<name>TranslationModel/ProbingPT/storing.cpp</name>
@ -3664,7 +3674,7 @@
<name>TranslationModel/UG/sapt_pscore_coherence.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/sapt_pscore_coherence.h</locationURI>
</link>
</link>
<link>
<name>TranslationModel/UG/sapt_pscore_lex1.h</name>
<type>1</type>
@ -3709,7 +3719,7 @@
<name>TranslationModel/UG/sapt_pscore_wordcount.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/sapt_pscore_wordcount.h</locationURI>
</link>
</link>
<link>
<name>TranslationModel/UG/sim-pe.cc</name>
<type>1</type>

View File

@ -1,29 +1,113 @@
#include <string>
#include <boost/program_options.hpp>
#include "util/usage.hh"
#include "moses/TranslationModel/ProbingPT/storing.hh"
#include "moses/InputFileStream.h"
#include "moses/OutputFileStream.h"
#include "moses/Util.h"
using namespace std;
std::string ReformatSCFGFile(const std::string &path);
int main(int argc, char* argv[])
{
string inPath, outPath;
int num_scores = 4;
int num_lex_scores = 0;
bool log_prob = false;
bool scfg = false;
int max_cache_size = 50000;
const char * is_reordering = "false";
namespace po = boost::program_options;
po::options_description desc("Options");
desc.add_options()
("help", "Print help messages")
("input-pt", po::value<string>()->required(), "Text pt")
("output-dir", po::value<string>()->required(), "Directory when binary files will be written")
("num-scores", po::value<int>()->default_value(num_scores), "Number of pt scores")
("num-lex-scores", po::value<int>()->default_value(num_lex_scores), "Number of lexicalized reordering scores")
("log-prob", "log (and floor) probabilities before storing")
("max-cache-size", po::value<int>()->default_value(max_cache_size), "Maximum number of high-count source lines to write to cache file. 0=no cache, negative=no limit")
("scfg", "Rules are SCFG in Moses format (ie. with non-terms and LHS")
if (!(argc == 5 || argc == 4)) {
// Tell the user how to run the program
std::cerr << "Provided " << argc << " arguments, needed 4 or 5." << std::endl;
std::cerr << "Usage: " << argv[0] << " path_to_phrasetable output_dir num_scores is_reordering" << std::endl;
std::cerr << "is_reordering should be either true or false, but it is currently a stub feature." << std::endl;
//std::cerr << "Usage: " << argv[0] << " path_to_phrasetable number_of_uniq_lines output_bin_file output_hash_table output_vocab_id" << std::endl;
return 1;
;
po::variables_map vm;
try {
po::store(po::parse_command_line(argc, argv, desc),
vm); // can throw
/** --help option
*/
if ( vm.count("help")) {
std::cout << desc << std::endl;
return EXIT_SUCCESS;
}
po::notify(vm); // throws on error, so do after help in case
// there are any problems
} catch(po::error& e) {
std::cerr << "ERROR: " << e.what() << std::endl << std::endl;
std::cerr << desc << std::endl;
return EXIT_FAILURE;
}
if (argc == 5) {
is_reordering = argv[4];
if (vm.count("input-pt")) inPath = vm["input-pt"].as<string>();
if (vm.count("output-dir")) outPath = vm["output-dir"].as<string>();
if (vm.count("num-scores")) num_scores = vm["num-scores"].as<int>();
if (vm.count("num-lex-scores")) num_lex_scores = vm["num-lex-scores"].as<int>();
if (vm.count("max-cache-size")) max_cache_size = vm["max-cache-size"].as<int>();
if (vm.count("log-prob")) log_prob = true;
if (vm.count("scfg")) scfg = true;
if (scfg) {
inPath = ReformatSCFGFile(inPath);
}
createProbingPT(argv[1], argv[2], argv[3], is_reordering);
Moses::createProbingPT(inPath, outPath, num_scores, num_lex_scores, log_prob, max_cache_size, scfg);
util::PrintUsage(std::cout);
//util::PrintUsage(std::cout);
return 0;
}
std::string ReformatSCFGFile(const std::string &path)
{
Moses::InputFileStream inFile(path);
string reformattedPath = path + ".reformat.gz";
Moses::OutputFileStream outFile(reformattedPath);
string line;
while (getline(inFile, line)) {
vector<string> toks = Moses::TokenizeMultiCharSeparator(line, "|||");
assert(toks.size() >= 3);
// source
vector<string> sourceToks = Moses::Tokenize(toks[0], " ");
for (size_t i = 0; i < sourceToks.size() - 1; ++i) {
outFile << sourceToks[i] << " ";
}
// other columns
for (size_t i = 1; i < toks.size(); ++i) {
outFile << "|||" << toks[i];
}
outFile << endl;
}
inFile.Close();
outFile.Close();
string sortedPath = path + ".reformat.sorted.gz";
string tmpPath = path + ".tmp ";
string cmd = "mkdir " + tmpPath
+ " && gzip -dc " + reformattedPath + " | LC_ALL=C sort -T " + tmpPath + " | gzip -c > " + sortedPath;
system(cmd.c_str());
cmd = "rm -rf " + tmpPath + " " + reformattedPath;
system(cmd.c_str());
return sortedPath;
}

View File

@ -31,9 +31,9 @@ else {
}
exe CreateProbingPT : CreateProbingPT.cpp ..//boost_filesystem ../moses//moses ;
exe QueryProbingPT : QueryProbingPT.cpp ..//boost_filesystem ../moses//moses ;
#exe QueryProbingPT : QueryProbingPT.cpp ..//boost_filesystem ../moses//moses ;
alias programsProbing : CreateProbingPT QueryProbingPT ;
alias programsProbing : CreateProbingPT ; #QueryProbingPT
exe merge-sorted :
merge-sorted.cc

View File

@ -34,7 +34,7 @@ int main(int argc, char* argv[])
return 1;
}
QueryEngine queries(argv[1]);
Moses::QueryEngine queries(argv[1]);
//Interactive search
std::cout << "Please enter a string to be searched, or exit to exit." << std::endl;

View File

@ -247,6 +247,15 @@ public:
}
}
// Add a raw C array of scores from one FeatureFunction into this
// collection, at that feature's offset within the dense score vector.
// Caller must supply at least sp->GetNumScoreComponents() values in scores[].
void PlusEquals(const FeatureFunction* sp, float scores[])
{
size_t numScores = sp->GetNumScoreComponents();
size_t offset = sp->GetIndex();
for (size_t i = 0; i < numScores; ++i) {
m_scores[i + offset] += scores[i];
}
}
//! Special version PlusEquals(ScoreProducer, vector<float>)
//! to add the score from a single ScoreProducer that produces
//! a single value

View File

@ -3,8 +3,9 @@
#include "moses/StaticData.h"
#include "moses/FactorCollection.h"
#include "moses/TargetPhraseCollection.h"
#include "moses/InputFileStream.h"
#include "moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerSkeleton.h"
#include "quering.hh"
#include "querying.hh"
using namespace std;
@ -34,44 +35,94 @@ void ProbingPT::Load(AllOptions::ptr const& opts)
m_unkId = 456456546456;
FactorCollection &vocab = FactorCollection::Instance();
// source vocab
const std::map<uint64_t, std::string> &sourceVocab = m_engine->getSourceVocab();
const std::map<uint64_t, std::string> &sourceVocab =
m_engine->getSourceVocab();
std::map<uint64_t, std::string>::const_iterator iterSource;
for (iterSource = sourceVocab.begin(); iterSource != sourceVocab.end(); ++iterSource) {
const string &wordStr = iterSource->second;
const Factor *factor = FactorCollection::Instance().AddFactor(wordStr);
for (iterSource = sourceVocab.begin(); iterSource != sourceVocab.end();
++iterSource) {
string wordStr = iterSource->second;
//cerr << "wordStr=" << wordStr << endl;
const Factor *factor = vocab.AddFactor(wordStr);
uint64_t probingId = iterSource->first;
size_t factorId = factor->GetId();
SourceVocabMap::value_type entry(factor, probingId);
m_sourceVocabMap.insert(entry);
if (factorId >= m_sourceVocab.size()) {
m_sourceVocab.resize(factorId + 1, m_unkId);
}
m_sourceVocab[factorId] = probingId;
}
// target vocab
const std::map<unsigned int, std::string> &probingVocab = m_engine->getVocab();
std::map<unsigned int, std::string>::const_iterator iter;
for (iter = probingVocab.begin(); iter != probingVocab.end(); ++iter) {
const string &wordStr = iter->second;
const Factor *factor = FactorCollection::Instance().AddFactor(wordStr);
InputFileStream targetVocabStrme(m_filePath + "/TargetVocab.dat");
string line;
while (getline(targetVocabStrme, line)) {
vector<string> toks = Tokenize(line, "\t");
UTIL_THROW_IF2(toks.size() != 2, string("Incorrect format:") + line + "\n");
unsigned int probingId = iter->first;
//cerr << "wordStr=" << toks[0] << endl;
TargetVocabMap::value_type entry(factor, probingId);
m_vocabMap.insert(entry);
const Factor *factor = vocab.AddFactor(toks[0]);
uint32_t probingId = Scan<uint32_t>(toks[1]);
if (probingId >= m_targetVocab.size()) {
m_targetVocab.resize(probingId + 1);
}
m_targetVocab[probingId] = factor;
}
// alignments
CreateAlignmentMap(m_filePath + "/Alignments.dat");
// memory mapped file to tps
string filePath = m_filePath + "/TargetColl.dat";
file.open(filePath.c_str());
if (!file.is_open()) {
throw "Couldn't open file ";
}
data = file.data();
//size_t size = file.size();
// cache
//CreateCache(system);
}
// Build m_aligns: for each alignment entry held by the probing engine,
// convert its flat byte list into an interned AlignmentInfo object.
// NOTE(review): the `path` parameter is unused here — the data comes from
// m_engine->getAlignments(); confirm whether the parameter is vestigial.
void ProbingPT::CreateAlignmentMap(const std::string path)
{
const std::vector< std::vector<unsigned char> > &probingAlignColl = m_engine->getAlignments();
m_aligns.resize(probingAlignColl.size(), NULL);
for (size_t i = 0; i < probingAlignColl.size(); ++i) {
AlignmentInfo::CollType aligns;
const std::vector<unsigned char> &probingAligns = probingAlignColl[i];
// alignment points are stored flat as pairs: [src0, tgt0, src1, tgt1, ...]
for (size_t j = 0; j < probingAligns.size(); j += 2) {
size_t startPos = probingAligns[j];
size_t endPos = probingAligns[j+1];
//cerr << "startPos=" << startPos << " " << endPos << endl;
aligns.insert(std::pair<size_t,size_t>(startPos, endPos));
}
const AlignmentInfo *align = AlignmentInfoCollection::Instance().Add(aligns);
m_aligns[i] = align;
//cerr << "align=" << align->Debug(system) << endl;
}
}
// Per-input hook: only trims the phrase-table cache; the translation
// task itself is not inspected.
void ProbingPT::InitializeForInput(ttasksptr const& ttask)
{
ReduceCache();
}
void ProbingPT::GetTargetPhraseCollectionBatch(const InputPathList &inputPathQueue) const
{
CacheColl &cache = GetCache();
InputPathList::const_iterator iter;
for (iter = inputPathQueue.begin(); iter != inputPathQueue.end(); ++iter) {
InputPath &inputPath = **iter;
@ -82,132 +133,205 @@ void ProbingPT::GetTargetPhraseCollectionBatch(const InputPathList &inputPathQue
}
TargetPhraseCollection::shared_ptr tpColl = CreateTargetPhrase(sourcePhrase);
// add target phrase to phrase-table cache
size_t hash = hash_value(sourcePhrase);
std::pair<TargetPhraseCollection::shared_ptr , clock_t> value(tpColl, clock());
cache[hash] = value;
inputPath.SetTargetPhrases(*this, tpColl, NULL);
}
}
std::vector<uint64_t> ProbingPT::ConvertToProbingSourcePhrase(const Phrase &sourcePhrase, bool &ok) const
{
size_t size = sourcePhrase.GetSize();
std::vector<uint64_t> ret(size);
for (size_t i = 0; i < size; ++i) {
const Factor *factor = sourcePhrase.GetFactor(i, m_input[0]);
uint64_t probingId = GetSourceProbingId(factor);
if (probingId == m_unkId) {
ok = false;
return ret;
} else {
ret[i] = probingId;
}
}
ok = true;
return ret;
}
TargetPhraseCollection::shared_ptr ProbingPT::CreateTargetPhrase(const Phrase &sourcePhrase) const
{
// create a target phrase from the 1st word of the source, prefix with 'ProbingPT:'
assert(sourcePhrase.GetSize());
TargetPhraseCollection::shared_ptr tpColl;
bool ok;
vector<uint64_t> probingSource = ConvertToProbingSourcePhrase(sourcePhrase, ok);
if (!ok) {
// source phrase contains a word unknown in the pt.
// We know immediately there's no translation for it
return tpColl;
std::pair<bool, uint64_t> keyStruct = GetKey(sourcePhrase);
if (!keyStruct.first) {
return TargetPhraseCollection::shared_ptr();
}
std::pair<bool, std::vector<target_text> > query_result;
//Actual lookup
query_result = m_engine->query(probingSource);
if (query_result.first) {
//m_engine->printTargetInfo(query_result.second);
tpColl.reset(new TargetPhraseCollection());
const std::vector<target_text> &probingTargetPhrases = query_result.second;
for (size_t i = 0; i < probingTargetPhrases.size(); ++i) {
const target_text &probingTargetPhrase = probingTargetPhrases[i];
TargetPhrase *tp = CreateTargetPhrase(sourcePhrase, probingTargetPhrase);
tpColl->Add(tp);
}
tpColl->Prune(true, m_tableLimit);
// check in cache
CachePb::const_iterator iter = m_cachePb.find(keyStruct.second);
if (iter != m_cachePb.end()) {
//cerr << "FOUND IN CACHE " << keyStruct.second << " " << sourcePhrase.Debug(mgr.system) << endl;
TargetPhraseCollection *tps = iter->second;
return TargetPhraseCollection::shared_ptr(tps);
}
return tpColl;
// query pt
TargetPhraseCollection *tps = CreateTargetPhrases(sourcePhrase,
keyStruct.second);
return TargetPhraseCollection::shared_ptr(tps);
}
TargetPhrase *ProbingPT::CreateTargetPhrase(const Phrase &sourcePhrase, const target_text &probingTargetPhrase) const
std::pair<bool, uint64_t> ProbingPT::GetKey(const Phrase &sourcePhrase) const
{
const std::vector<unsigned int> &probingPhrase = probingTargetPhrase.target_phrase;
size_t size = probingPhrase.size();
std::pair<bool, uint64_t> ret;
// create a target phrase from the 1st word of the source, prefix with 'ProbingPT:'
size_t sourceSize = sourcePhrase.GetSize();
assert(sourceSize);
uint64_t probingSource[sourceSize];
GetSourceProbingIds(sourcePhrase, ret.first, probingSource);
if (!ret.first) {
// source phrase contains a word unknown in the pt.
// We know immediately there's no translation for it
}
else {
ret.second = m_engine->getKey(probingSource, sourceSize);
}
return ret;
}
// Translate every word of sourcePhrase into its probing-pt vocab id,
// writing into caller-provided probingSource[] (must hold GetSize() slots).
// Sets ok=false and stops at the first word unknown to the pt.
void ProbingPT::GetSourceProbingIds(const Phrase &sourcePhrase,
bool &ok, uint64_t probingSource[]) const
{
size_t size = sourcePhrase.GetSize();
for (size_t i = 0; i < size; ++i) {
const Word &word = sourcePhrase.GetWord(i);
uint64_t probingId = GetSourceProbingId(word);
if (probingId == m_unkId) {
// unknown word => whole phrase cannot be in the table
ok = false;
return;
}
else {
probingSource[i] = probingId;
}
}
ok = true;
}
// Map a (possibly multi-factor) source word to its probing vocab id by
// summing the per-factor ids from m_sourceVocab (factor id -> pt id).
// Returns m_unkId if any factor id is outside the vocab table.
uint64_t ProbingPT::GetSourceProbingId(const Word &word) const
{
uint64_t ret = 0;
for (size_t i = 0; i < m_input.size(); ++i) {
FactorType factorType = m_input[i];
const Factor *factor = word[factorType];
size_t factorId = factor->GetId();
if (factorId >= m_sourceVocab.size()) {
// factor never seen when the table was binarised
return m_unkId;
}
ret += m_sourceVocab[factorId];
}
return ret;
}
// Look up `key` in the probing hash and, if found, decode the target
// phrases from the memory-mapped TargetColl file. The stored layout is
// [uint64 count][serialized tp]... starting at the returned file offset.
// Returns NULL when the key is absent; otherwise a new, pruned collection
// (caller takes ownership).
TargetPhraseCollection *ProbingPT::CreateTargetPhrases(
const Phrase &sourcePhrase, uint64_t key) const
{
TargetPhraseCollection *tps = NULL;
//Actual lookup
std::pair<bool, uint64_t> query_result; // 1st=found, 2nd=target file offset
query_result = m_engine->query(key);
//cerr << "key2=" << query_result.second << endl;
if (query_result.first) {
// `data` is the mmapped TargetColl.dat; offset walks the record
const char *offset = data + query_result.second;
uint64_t *numTP = (uint64_t*) offset;
tps = new TargetPhraseCollection();
offset += sizeof(uint64_t);
for (size_t i = 0; i < *numTP; ++i) {
// CreateTargetPhrase advances `offset` past the decoded record
TargetPhrase *tp = CreateTargetPhrase(offset);
assert(tp);
tp->EvaluateInIsolation(sourcePhrase, GetFeaturesToApply());
tps->Add(tp);
}
tps->Prune(true, m_tableLimit);
//cerr << *tps << endl;
}
return tps;
}
TargetPhrase *ProbingPT::CreateTargetPhrase(
const char *&offset) const
{
TargetPhraseInfo *tpInfo = (TargetPhraseInfo*) offset;
size_t numRealWords = tpInfo->numWords / m_output.size();
TargetPhrase *tp = new TargetPhrase(this);
offset += sizeof(TargetPhraseInfo);
// scores
float *scores = (float*) offset;
size_t totalNumScores = m_engine->num_scores + m_engine->num_lex_scores;
if (m_engine->logProb) {
// set pt score for rule
tp->GetScoreBreakdown().PlusEquals(this, scores);
// save scores for other FF, eg. lex RO. Just give the offset
/*
if (m_engine->num_lex_scores) {
tp->scoreProperties = scores + m_engine->num_scores;
}
*/
}
else {
// log score 1st
float logScores[totalNumScores];
for (size_t i = 0; i < totalNumScores; ++i) {
logScores[i] = FloorScore(TransformScore(scores[i]));
}
// set pt score for rule
tp->GetScoreBreakdown().PlusEquals(this, logScores);
// save scores for other FF, eg. lex RO.
/*
tp->scoreProperties = pool.Allocate<SCORE>(m_engine->num_lex_scores);
for (size_t i = 0; i < m_engine->num_lex_scores; ++i) {
tp->scoreProperties[i] = logScores[i + m_engine->num_scores];
}
*/
}
offset += sizeof(float) * totalNumScores;
// words
for (size_t i = 0; i < size; ++i) {
uint64_t probingId = probingPhrase[i];
const Factor *factor = GetTargetFactor(probingId);
assert(factor);
for (size_t targetPos = 0; targetPos < numRealWords; ++targetPos) {
Word &word = tp->AddWord();
for (size_t i = 0; i < m_output.size(); ++i) {
FactorType factorType = m_output[i];
Word &word = tp->AddWord();
word.SetFactor(m_output[0], factor);
uint32_t *probingId = (uint32_t*) offset;
const Factor *factor = GetTargetFactor(*probingId);
assert(factor);
word[factorType] = factor;
offset += sizeof(uint32_t);
}
}
// score for this phrase table
vector<float> scores = probingTargetPhrase.prob;
std::transform(scores.begin(), scores.end(), scores.begin(),TransformScore);
tp->GetScoreBreakdown().PlusEquals(this, scores);
// align
uint32_t alignTerm = tpInfo->alignTerm;
//cerr << "alignTerm=" << alignTerm << endl;
UTIL_THROW_IF2(alignTerm >= m_aligns.size(), "Unknown alignInd");
tp->SetAlignTerm(m_aligns[alignTerm]);
// alignment
/*
const std::vector<unsigned char> &alignments = probingTargetPhrase.word_all1;
// properties TODO
AlignmentInfo &aligns = tp->GetAlignTerm();
for (size_t i = 0; i < alignS.size(); i += 2 ) {
aligns.Add((size_t) alignments[i], (size_t) alignments[i+1]);
}
*/
// score of all other ff when this rule is being loaded
tp->EvaluateInIsolation(sourcePhrase, GetFeaturesToApply());
return tp;
}
const Factor *ProbingPT::GetTargetFactor(uint64_t probingId) const
{
TargetVocabMap::right_map::const_iterator iter;
iter = m_vocabMap.right.find(probingId);
if (iter != m_vocabMap.right.end()) {
return iter->second;
} else {
// not in mapping. Must be UNK
return NULL;
}
}
//////////////////////////////////////////////////////////////////
uint64_t ProbingPT::GetSourceProbingId(const Factor *factor) const
{
SourceVocabMap::left_map::const_iterator iter;
iter = m_sourceVocabMap.left.find(factor);
if (iter != m_sourceVocabMap.left.end()) {
return iter->second;
} else {
// not in mapping. Must be UNK
return m_unkId;
}
}
ChartRuleLookupManager *ProbingPT::CreateRuleLookupManager(
const ChartParser &,

View File

@ -1,17 +1,18 @@
#pragma once
#include <boost/iostreams/device/mapped_file.hpp>
#include <boost/bimap.hpp>
#include <boost/unordered_map.hpp>
#include "../PhraseDictionary.h"
class QueryEngine;
class target_text;
namespace Moses
{
class ChartParser;
class ChartCellCollectionBase;
class ChartRuleLookupManager;
class QueryEngine;
class target_text;
class ProbingPT : public PhraseDictionary
{
@ -39,21 +40,42 @@ public:
protected:
QueryEngine *m_engine;
uint64_t m_unkId;
typedef boost::bimap<const Factor *, uint64_t> SourceVocabMap;
mutable SourceVocabMap m_sourceVocabMap;
std::vector<uint64_t> m_sourceVocab; // factor id -> pt id
std::vector<const Factor*> m_targetVocab; // pt id -> factor*
std::vector<const AlignmentInfo*> m_aligns;
typedef boost::bimap<const Factor *, unsigned int> TargetVocabMap;
mutable TargetVocabMap m_vocabMap;
boost::iostreams::mapped_file_source file;
const char *data;
// caching
typedef boost::unordered_map<uint64_t, TargetPhraseCollection*> CachePb;
CachePb m_cachePb;
void CreateAlignmentMap(const std::string path);
TargetPhraseCollection::shared_ptr CreateTargetPhrase(const Phrase &sourcePhrase) const;
TargetPhrase *CreateTargetPhrase(const Phrase &sourcePhrase, const target_text &probingTargetPhrase) const;
const Factor *GetTargetFactor(uint64_t probingId) const;
std::pair<bool, uint64_t> GetKey(const Phrase &sourcePhrase) const;
void GetSourceProbingIds(const Phrase &sourcePhrase, bool &ok,
uint64_t probingSource[]) const;
uint64_t GetSourceProbingId(const Word &word) const;
uint64_t GetSourceProbingId(const Factor *factor) const;
std::vector<uint64_t> ConvertToProbingSourcePhrase(const Phrase &sourcePhrase, bool &ok) const;
TargetPhraseCollection *CreateTargetPhrases(
const Phrase &sourcePhrase, uint64_t key) const;
TargetPhrase *CreateTargetPhrase(
const char *&offset) const;
inline const Factor *GetTargetFactor(uint32_t probingId) const
{
if (probingId >= m_targetVocab.size()) {
return NULL;
}
return m_targetVocab[probingId];
}
uint64_t m_unkId;
};
} // namespace Moses

View File

@ -0,0 +1,266 @@
/*
* StoreTarget.cpp
*
* Created on: 19 Jan 2016
* Author: hieu
*/
#include <boost/foreach.hpp>
#include "StoreTarget.h"
#include "line_splitter.hh"
#include "probing_hash_utils.hh"
#include "moses/OutputFileStream.h"
#include "moses/Util.h"
using namespace std;
namespace Moses
{
// Open the binary target-collection file (TargetColl.dat) under basepath
// for writing, truncating any previous content; the target vocab writer
// is pointed at TargetVocab.dat. Throws a plain C-string on failure.
StoreTarget::StoreTarget(const std::string &basepath)
:m_basePath(basepath)
,m_vocab(basepath + "/TargetVocab.dat")
{
std::string path = basepath + "/TargetColl.dat";
m_fileTargetColl.open(path.c_str(),
std::ios::out | std::ios::binary | std::ios::ate | std::ios::trunc);
if (!m_fileTargetColl.is_open()) {
throw "can't create file ";
}
}
// Close the target-collection file and flush the vocab to disk.
// Asserts that all buffered rules were written out via Save() first.
StoreTarget::~StoreTarget()
{
assert(m_coll.empty());
m_fileTargetColl.close();
// vocab
m_vocab.Save();
}
// Flush all buffered target phrases for the current source phrase as one
// record: [uint64 count][tp]... Frees and clears the buffer afterwards.
// Returns the file offset where the record starts (stored as the value
// in the probing hash).
uint64_t StoreTarget::Save()
{
uint64_t ret = m_fileTargetColl.tellp();

// save to disk
uint64_t numTP = m_coll.size();
m_fileTargetColl.write((char*) &numTP, sizeof(uint64_t));

for (size_t i = 0; i < m_coll.size(); ++i) {
Save(*m_coll[i]);
}

// clear coll
RemoveAllInColl(m_coll);
m_coll.clear();

// starting position of coll
return ret;
}
// Serialise one rule to the target-collection file:
// [TargetPhraseInfo header][float probs...][uint32 vocab ids...].
// Alignments are deduplicated through GetAlignId and only their ids are
// stored in the header. Properties are not yet written (see TODO).
void StoreTarget::Save(const target_text &rule)
{
// metadata for each tp
TargetPhraseInfo tpInfo;
tpInfo.alignTerm = GetAlignId(rule.word_align_term);
tpInfo.alignNonTerm = GetAlignId(rule.word_align_non_term);
tpInfo.numWords = rule.target_phrase.size();
tpInfo.propLength = rule.property.size();

//cerr << "TPInfo=" << sizeof(TPInfo);
m_fileTargetColl.write((char*) &tpInfo, sizeof(TargetPhraseInfo));

// scores
for (size_t i = 0; i < rule.prob.size(); ++i) {
float prob = rule.prob[i];
m_fileTargetColl.write((char*) &prob, sizeof(prob));
}

// tp
for (size_t i = 0; i < rule.target_phrase.size(); ++i) {
uint32_t vocabId = rule.target_phrase[i];
m_fileTargetColl.write((char*) &vocabId, sizeof(vocabId));
}

// prop TODO
}
// Write the alignment dictionary to Alignments.dat, one entry per line:
// "<id>\t<pos> <pos> ...". Iteration order follows the unordered map,
// so line order is unspecified (consumers key on the id column).
void StoreTarget::SaveAlignment()
{
std::string path = m_basePath + "/Alignments.dat";
OutputFileStream file(path);

BOOST_FOREACH(Alignments::value_type &valPair, m_aligns) {
file << valPair.second << "\t";

const std::vector<size_t> &aligns = valPair.first;
BOOST_FOREACH(size_t align, aligns) {
file << align << " ";
}
file << endl;
}
}
// Parse one phrase-table line (already column-split into `line`) into a
// target_text rule and buffer it in m_coll (flushed later by Save()).
//   - target words are split into factors; each factor becomes a vocab id
//   - probs are optionally floored-log-transformed (log_prob)
//   - alignment pairs are routed to term/non-term lists (scfg only)
//   - any {{LexRO ...}} property is folded into the prob vector
void StoreTarget::Append(const line_text &line, bool log_prob, bool scfg)
{
target_text *rule = new target_text;

//cerr << "line.target_phrase=" << line.target_phrase << endl;

// target_phrase
vector<bool> nonTerms;

util::TokenIter<util::SingleCharacter> it;
it = util::TokenIter<util::SingleCharacter>(line.target_phrase,
util::SingleCharacter(' '));
while (it) {
StringPiece word = *it;
//cerr << "word=" << word << endl;

bool nonTerm = false;
if (scfg) {
// not really sure how to handle factored SCFG and NT
// NOTE(review): the inner `scfg &&` is redundant — we are already
// inside `if (scfg)`.
if (scfg && word[0] == '[' && word[word.size() - 1] == ']') {
//cerr << "NON-TERM=" << tok << " " << nonTerms.size() << endl;
nonTerm = true;
}
nonTerms.push_back(nonTerm);
}

// split the word into factors on '|' and map each to a vocab id
util::TokenIter<util::SingleCharacter> itFactor;
itFactor = util::TokenIter<util::SingleCharacter>(word,
util::SingleCharacter('|'));
while (itFactor) {
StringPiece factor = *itFactor;

string factorStr = factor.as_string();
uint32_t vocabId = m_vocab.GetVocabId(factorStr);

rule->target_phrase.push_back(vocabId);

itFactor++;
}

it++;
}

// probs
it = util::TokenIter<util::SingleCharacter>(line.prob,
util::SingleCharacter(' '));
while (it) {
string tok = it->as_string();
float prob = Scan<float>(tok);

if (log_prob) {
// floor the log; an exact 0 is nudged so it stays distinguishable
prob = FloorScore(log(prob));
if (prob == 0.0f) prob = 0.0000000001;
}

rule->prob.push_back(prob);
it++;
}

/*
cerr << "nonTerms=";
for (size_t i = 0; i < nonTerms.size(); ++i) {
cerr << nonTerms[i] << " ";
}
cerr << endl;
*/

// alignment
it = util::TokenIter<util::SingleCharacter>(line.word_align,
util::SingleCharacter(' '));
while (it) {
string tokPair = Trim(it->as_string());
if (tokPair.empty()) {
break;
}

vector<size_t> alignPair = Tokenize<size_t>(tokPair, "-");
assert(alignPair.size() == 2);

bool nonTerm = false;
size_t sourcePos = alignPair[0];
size_t targetPos = alignPair[1];

if (scfg) {
// classify the pair by whether its target word was a non-terminal
nonTerm = nonTerms[targetPos];
}

//cerr << targetPos << "=" << nonTerm << endl;

if (nonTerm) {
rule->word_align_non_term.push_back(sourcePos);
rule->word_align_non_term.push_back(targetPos);
//cerr << (int) rule->word_all1.back() << " ";
}
else {
rule->word_align_term.push_back(sourcePos);
rule->word_align_term.push_back(targetPos);
}

it++;
}

// extra scores
string prop = line.property.as_string();
AppendLexRO(prop, rule->prob, log_prob);

//cerr << "line.property=" << line.property << endl;
//cerr << "prop=" << prop << endl;

// properties
/*
for (size_t i = 0; i < prop.size(); ++i) {
rule->property.push_back(prop[i]);
}
*/
m_coll.push_back(rule);
}
uint32_t StoreTarget::GetAlignId(const std::vector<size_t> &align)
{
boost::unordered_map<std::vector<size_t>, uint32_t>::iterator iter =
m_aligns.find(align);
if (iter == m_aligns.end()) {
uint32_t ind = m_aligns.size();
m_aligns[align] = ind;
return ind;
}
else {
return iter->second;
}
}
/**
 * Extract "{{LexRO s1 s2 ...}}" lexicalized-reordering scores from the
 * property column: the scores are appended to retvector (log'd/floored if
 * log_prob) and the LexRO span is removed from prop.
 *
 * \param prop       property column text; modified in place
 * \param retvector  score vector the LexRO scores are appended to
 * \param log_prob   apply FloorScore(log(...)) to each score
 */
void StoreTarget::AppendLexRO(std::string &prop, std::vector<float> &retvector,
    bool log_prob) const
{
  size_t startPos = prop.find("{{LexRO ");

  if (startPos != string::npos) {
    size_t endPos = prop.find("}}", startPos + 8);
    // Guard against a malformed property with no closing "}}": the original
    // code computed substr(endPos + 2, ...) with endPos == npos and threw.
    if (endPos == string::npos) {
      return;
    }
    string lexProb = prop.substr(startPos + 8, endPos - startPos - 8);

    // append lex probs to pt probs
    vector<float> scores = Tokenize<float>(lexProb);

    if (log_prob) {
      for (size_t i = 0; i < scores.size(); ++i) {
        scores[i] = FloorScore(log(scores[i]));
        // 0 is reserved; nudge exact zeros (log of prob 1.0)
        if (scores[i] == 0.0f) scores[i] = 0.0000000001;
      }
    }

    for (size_t i = 0; i < scores.size(); ++i) {
      retvector.push_back(scores[i]);
    }

    // exclude LexRO property from property column
    prop = prop.substr(0, startPos)
           + prop.substr(endPos + 2, prop.size() - endPos - 2);
  }
}
} /* namespace Moses2 */

View File

@ -0,0 +1,51 @@
/*
* StoreTarget.h
*
* Created on: 19 Jan 2016
* Author: hieu
*/
#pragma once
#include <string>
#include <fstream>
#include <vector>
#include <inttypes.h>
#include <boost/unordered_map.hpp>
#include <boost/unordered_set.hpp>
#include "StoreVocab.h"
namespace Moses
{
class line_text;
class target_text;
// Accumulates binarized target-side rules and writes them, plus the shared
// alignment-id table, to files under m_basePath.
class StoreTarget
{
public:
  StoreTarget(const std::string &basepath);
  virtual ~StoreTarget();

  // Flushes the buffered rules; returns a uint64_t — presumably the file
  // offset of the written collection. TODO confirm in StoreTarget.cpp.
  uint64_t Save();

  // Writes the alignment-vector -> id table to <basePath>/Alignments.dat.
  void SaveAlignment();

  // Parses one text phrase-table rule and buffers it in m_coll.
  void Append(const line_text &line, bool log_prob, bool scfg);

protected:
  std::string m_basePath;            // output directory
  std::fstream m_fileTargetColl;     // binary target-collection output file
  StoreVocab<uint32_t> m_vocab;      // target word/factor -> vocab id

  typedef boost::unordered_map<std::vector<size_t>, uint32_t> Alignments;
  Alignments m_aligns;               // alignment vector -> id (see GetAlignId)

  // Rules buffered by Append(); presumably flushed/freed by Save().
  std::vector<target_text*> m_coll;

  // Returns the id for align, assigning a new one on first sight.
  uint32_t GetAlignId(const std::vector<size_t> &align);

  void Save(const target_text &rule);

  // Moves {{LexRO ...}} scores from prop into retvector.
  void AppendLexRO(std::string &prop, std::vector<float> &retvector,
      bool log_prob) const;
};
} /* namespace Moses2 */

View File

@ -0,0 +1,13 @@
/*
* StoreVocab.cpp
*
* Created on: 15 Jun 2016
* Author: hieu
*/
#include <fstream>
#include "StoreVocab.h"
namespace Moses
{
} /* namespace Moses2 */

View File

@ -0,0 +1,64 @@
/*
* StoreVocab.h
*
* Created on: 15 Jun 2016
* Author: hieu
*/
#pragma once
#include <string>
#include <boost/unordered_map.hpp>
#include "moses/OutputFileStream.h"
#include "moses/Util.h"
namespace Moses
{
template<typename VOCABID>
class StoreVocab
{
protected:
std::string m_path;
typedef boost::unordered_map<std::string, VOCABID> Coll;
Coll m_vocab;
public:
StoreVocab(const std::string &path)
:m_path(path)
{}
virtual ~StoreVocab() {}
VOCABID GetVocabId(const std::string &word)
{
typename Coll::iterator iter = m_vocab.find(word);
if (iter == m_vocab.end()) {
VOCABID ind = m_vocab.size() + 1;
m_vocab[word] = ind;
return ind;
}
else {
return iter->second;
}
}
void Insert(VOCABID id, const std::string &word)
{
m_vocab[word] = id;
}
void Save()
{
OutputFileStream strme(m_path);
typename Coll::const_iterator iter;
for (iter = m_vocab.begin(); iter != m_vocab.end(); ++iter) {
strme << iter->first << "\t" << iter->second << std::endl;
}
strme.Close();
}
};
} /* namespace Moses2 */

View File

@ -1,5 +1,11 @@
#include <iostream>
#include "hash.hh"
using namespace std;
namespace Moses
{
uint64_t getHash(StringPiece text)
{
std::size_t len = text.size();
@ -7,24 +13,32 @@ uint64_t getHash(StringPiece text)
return key;
}
std::vector<uint64_t> getVocabIDs(StringPiece textin)
std::vector<uint64_t> getVocabIDs(const StringPiece &textin)
{
//Tokenize
std::vector<uint64_t> output;
util::TokenIter<util::SingleCharacter> it(textin, util::SingleCharacter(' '));
util::TokenIter<util::SingleCharacter> itWord(textin, util::SingleCharacter(' '));
while(it) {
output.push_back(getHash(*it));
it++;
while (itWord) {
StringPiece word = *itWord;
uint64_t id = 0;
util::TokenIter<util::SingleCharacter> itFactor(word, util::SingleCharacter('|'));
while (itFactor) {
StringPiece factor = *itFactor;
//cerr << "factor=" << factor << endl;
id += getHash(factor);
itFactor++;
}
output.push_back(id);
itWord++;
}
return output;
}
// Hash one (already factor-joined) token to its 64-bit vocabulary id.
uint64_t getVocabID(std::string candidate)
{
  return util::MurmurHashNative(candidate.c_str(), candidate.length());
}
}

View File

@ -6,9 +6,12 @@
#include "util/tokenize_piece.hh"
#include <vector>
namespace Moses
{
//Gets the MurmurmurHash for give string
uint64_t getHash(StringPiece text);
std::vector<uint64_t> getVocabIDs(StringPiece textin);
std::vector<uint64_t> getVocabIDs(const StringPiece &textin);
uint64_t getVocabID(std::string candidate);
}

View File

@ -1,451 +0,0 @@
#include "huffmanish.hh"

#include <cstring>
// Reads the whole phrase table once, counting symbol frequencies (via
// count_elements) and the number of unique source phrases. Assumes the
// input is sorted by source phrase, since uniqueness is detected by
// comparing consecutive lines.
Huffman::Huffman (const char * filepath)
{
  //Read the file
  util::FilePiece filein(filepath);

  //Init uniq_lines to zero;
  uniq_lines = 0;

  line_text prev_line; //Check for unique lines.
  int num_lines = 0 ;

  while (true) {
    line_text new_line;

    num_lines++;

    try {
      //Process line read
      new_line = splitLine(filein.ReadLine());
      count_elements(new_line); //Counts the number of elements, adds new and increments counters.
    } catch (const util::EndOfFileException &) {
      // Catch by const reference: the original caught by value, which
      // copies (and would slice) the exception object.
      std::cerr << "Unique entries counted: ";
      break;
    }

    if (new_line.source_phrase == prev_line.source_phrase) {
      continue;
    } else {
      uniq_lines++;
      prev_line = new_line;
    }
  }

  std::cerr << uniq_lines << std::endl;
}
// Counts one occurrence of every target-phrase word and of the line's
// word-alignment vector. map::operator[] value-initializes a missing
// counter to 0, so a single lookup replaces the original find-then-insert
// double lookup.
void Huffman::count_elements(line_text linein)
{
  //For target phrase: one count per space-separated token.
  util::TokenIter<util::SingleCharacter> it(linein.target_phrase, util::SingleCharacter(' '));
  while (it) {
    target_phrase_words[it->as_string()]++;
    it++;
  }

  //For word allignment 1: count the whole alignment vector as one symbol.
  std::vector<unsigned char> numbers = splitWordAll1(linein.word_align);
  word_all1[numbers]++;
}
// Assigns an integer code to every unique symbol: symbols are sorted by
// descending frequency and numbered from 1 upwards (0 is reserved as the
// field delimiter in the encoded stream).
void Huffman::assign_values()
{
  typedef std::vector<std::pair<std::string, unsigned int> > StrCounts;
  typedef std::vector<std::pair<std::vector<unsigned char>, unsigned int> > VecCounts;

  // Copy both count maps into vectors so they can be sorted by count.
  for (std::map<std::string, unsigned int>::iterator iter = target_phrase_words.begin();
       iter != target_phrase_words.end(); ++iter) {
    target_phrase_words_counts.push_back(*iter);
  }
  std::sort(target_phrase_words_counts.begin(), target_phrase_words_counts.end(), sort_pair());

  for (std::map<std::vector<unsigned char>, unsigned int>::iterator iter = word_all1.begin();
       iter != word_all1.end(); ++iter) {
    word_all1_counts.push_back(*iter);
  }
  std::sort(word_all1_counts.begin(), word_all1_counts.end(), sort_pair_vec());

  // Number the symbols, most frequent first, starting at 1.
  unsigned int code = 1;
  for (StrCounts::iterator iter = target_phrase_words_counts.begin();
       iter != target_phrase_words_counts.end(); ++iter) {
    target_phrase_huffman.insert(std::pair<std::string, unsigned int>(iter->first, code));
    ++code;
  }
  code = 1; //Reset for the alignment symbols.
  for (VecCounts::iterator iter = word_all1_counts.begin();
       iter != word_all1_counts.end(); ++iter) {
    word_all1_huffman.insert(std::pair<std::vector<unsigned char>, unsigned int>(iter->first, code));
    ++code;
  }

  // The count containers are no longer needed once codes are assigned.
  target_phrase_words.clear();
  word_all1.clear();
  target_phrase_words_counts.clear();
  word_all1_counts.clear();

  std::cerr << "Finished generating huffman codes." << std::endl;
}
// Serializes the two decoder lookup maps (id -> target word, id ->
// alignment vector) into dirname, which must already exist. The unused
// "probs" path from the original has been removed — no probabilities
// map is ever written here.
void Huffman::serialize_maps(const char * dirname)
{
  std::string basedir(dirname);
  std::string target_phrase_path(basedir + "/target_phrases");
  std::string word_all1_path(basedir + "/Wall1");

  //Target phrase
  std::ofstream os (target_phrase_path.c_str(), std::ios::binary);
  boost::archive::text_oarchive oarch(os);
  oarch << lookup_target_phrase;
  os.close();

  //Word all1
  std::ofstream os2 (word_all1_path.c_str(), std::ios::binary);
  boost::archive::text_oarchive oarch2(os2);
  oarch2 << lookup_word_all1;
  os2.close();
}
// Huffman-encode a line and then variable-byte encode the result.
std::vector<unsigned char> Huffman::full_encode_line(line_text line)
{
  std::vector<unsigned int> huffman_encoded = encode_line(line);
  return vbyte_encode_line(huffman_encoded);
}
// Huffman-encode one phrase-table line as a stream of unsigned ints with
// the layout: [word codes..] 0 [prob bit patterns..] 0 [alignment code] 0.
// Zero is the field delimiter (codes start at 1; see assign_values).
std::vector<unsigned int> Huffman::encode_line(line_text line)
{
  std::vector<unsigned int> retvector;

  //Get target_phrase first.
  util::TokenIter<util::SingleCharacter> it(line.target_phrase, util::SingleCharacter(' '));
  while (it) {
    // Precondition: every word was seen during count_elements /
    // assign_values; find() on an unknown word dereferences end().
    retvector.push_back(target_phrase_huffman.find(it->as_string())->second);
    it++;
  }
  //Add a zero;
  retvector.push_back(0);

  //Get probabilities. Reinterpreting the float bytes as unsigned int.
  util::TokenIter<util::SingleCharacter> probit(line.prob, util::SingleCharacter(' '));
  while (probit) {
    //Sometimes we have too big floats to handle, so first convert to double
    // NOTE(review): probit->data() points into the full line buffer, not a
    // NUL-terminated token; atof presumably stops at the delimiter that
    // follows the number — confirm.
    double tempnum = atof(probit->data());
    float num = (float)tempnum;
    retvector.push_back(reinterpret_float(&num));
    probit++;
  }
  //Add a zero;
  retvector.push_back(0);

  //Get Word allignments: the whole alignment vector is one huffman symbol.
  retvector.push_back(word_all1_huffman.find(splitWordAll1(line.word_align))->second);
  retvector.push_back(0);

  return retvector;
}
// Builds the decoder-side id -> symbol maps by inverting both huffman
// code maps.
void Huffman::produce_lookups()
{
  std::map<std::string, unsigned int>::iterator strIter;
  for (strIter = target_phrase_huffman.begin(); strIter != target_phrase_huffman.end(); ++strIter) {
    lookup_target_phrase.insert(std::pair<unsigned int, std::string>(strIter->second, strIter->first));
  }

  std::map<std::vector<unsigned char>, unsigned int>::iterator vecIter;
  for (vecIter = word_all1_huffman.begin(); vecIter != word_all1_huffman.end(); ++vecIter) {
    lookup_word_all1.insert(std::pair<unsigned int, std::vector<unsigned char> >(vecIter->second, vecIter->first));
  }
}
// Loads the decoder lookup maps that Huffman::serialize_maps() wrote
// under dirname (which must exist).
HuffmanDecoder::HuffmanDecoder (const char * dirname)
{
  std::string basedir(dirname);
  std::string target_phrase_path(basedir + "/target_phrases");
  std::string word_all1_path(basedir + "/Wall1");

  //Target phrases
  std::ifstream in_target (target_phrase_path.c_str(), std::ios::binary);
  boost::archive::text_iarchive arch_target(in_target);
  arch_target >> lookup_target_phrase;
  in_target.close();

  //Word allignment 1
  std::ifstream in_align (word_all1_path.c_str(), std::ios::binary);
  boost::archive::text_iarchive arch_align(in_align);
  arch_align >> lookup_word_all1;
  in_align.close();
}
// Builds a decoder directly from already-loaded lookup maps; the maps
// are copied in (initializer list instead of default-construct + assign).
HuffmanDecoder::HuffmanDecoder (std::map<unsigned int, std::string> * lookup_target,
                                std::map<unsigned int, std::vector<unsigned char> > * lookup_word1)
  : lookup_target_phrase(*lookup_target)
  , lookup_word_all1(*lookup_word1)
{
}
// Decodes a variable-byte encoded blob that may hold several target
// phrases. Each phrase ends after three zero delimiters (words / scores /
// alignment — see Huffman::encode_line); the state machine below splits
// on those delimiters and hands each complete entry to decode_line().
std::vector<target_text> HuffmanDecoder::full_decode_line (std::vector<unsigned char> lines, int num_scores)
{
  std::vector<target_text> retvector; //All target phrases
  std::vector<unsigned int> decoded_lines = vbyte_decode_line(lines); //All decoded lines
  std::vector<unsigned int>::iterator it = decoded_lines.begin(); //Iterator for them
  std::vector<unsigned int> current_target_phrase; //Current target phrase decoded

  short zero_count = 0; //Count home many zeroes we have met. so far. Every 3 zeroes mean a new target phrase.
  while(it != decoded_lines.end()) {
    if (zero_count == 1) {
      //We are extracting scores. we know how many scores there are so we can push them
      //to the vector. This is done in case any of the scores is 0, because it would mess
      //up the state machine.
      // NOTE(review): this loop advances `it` num_scores times without
      // checking for decoded_lines.end(); malformed input would read past
      // the end — confirm inputs are always well-formed.
      for (int i = 0; i < num_scores; i++) {
        current_target_phrase.push_back(*it);
        it++;
      }
    }

    if (zero_count == 3) {
      //We have finished with this entry, decode it, and add it to the retvector.
      retvector.push_back(decode_line(current_target_phrase, num_scores));
      current_target_phrase.clear(); //Clear the current target phrase and the zero_count
      zero_count = 0; //So that we can reuse them for the next target phrase
    }

    //Add to the next target_phrase, number by number.
    current_target_phrase.push_back(*it);
    if (*it == 0) {
      zero_count++;
    }
    it++; //Go to the next word/symbol
  }
  //Don't forget the last remaining line!
  if (zero_count == 3) {
    //We have finished with this entry, decode it, and add it to the retvector.
    retvector.push_back(decode_line(current_target_phrase, num_scores));
    current_target_phrase.clear(); //Clear the current target phrase and the zero_count
    zero_count = 0; //So that we can reuse them for the next target phrase
  }

  return retvector;
}
// Decodes one huffman-encoded entry (words 0 scores 0 alignment 0) into a
// target_text: words stay as codes, score bit patterns are converted back
// to floats, and the alignment code is looked up in lookup_word_all1.
target_text HuffmanDecoder::decode_line (std::vector<unsigned int> input, int num_scores)
{
  //demo decoder
  target_text ret;
  //Split everything
  std::vector<unsigned int> target_phrase;
  std::vector<unsigned int> probs;
  // Initialized to 0: the original left wAll indeterminate, so a line with
  // an empty alignment field read an uninitialized value below.
  unsigned int wAll = 0;

  //Split the line into the proper arrays
  short num_zeroes = 0;
  int counter = 0;
  while (num_zeroes < 3) {
    unsigned int num = input[counter];
    if (num == 0) {
      num_zeroes++;
    } else if (num_zeroes == 0) {
      target_phrase.push_back(num);
    } else if (num_zeroes == 1) {
      //Push exactly num_scores scores (a score of 0 must not be mistaken
      //for a delimiter).
      for (int i = 0; i < num_scores; i++) {
        probs.push_back(num);
        counter++;
        num = input[counter];
      }
      continue;
    } else if (num_zeroes == 2) {
      wAll = num;
    }
    counter++;
  }

  ret.target_phrase = target_phrase;
  // Precondition: wAll is a known alignment code; find() on a missing key
  // would dereference end().
  ret.word_all1 = lookup_word_all1.find(wAll)->second;

  //Decode probabilities: stored as raw float bit patterns.
  for (std::vector<unsigned int>::iterator it = probs.begin(); it != probs.end(); it++) {
    ret.prob.push_back(reinterpret_uint(&(*it)));
  }

  return ret;
}
// Looks up the target word for a huffman id. Precondition: id must be
// present in lookup_target_phrase — find() on a missing id dereferences
// the end iterator (undefined behaviour).
inline std::string HuffmanDecoder::getTargetWordFromID(unsigned int id)
{
  return lookup_target_phrase.find(id)->second;
}
// Joins the words for a sequence of huffman ids into one string, each
// word followed by a single space (including the last).
std::string HuffmanDecoder::getTargetWordsFromIDs(std::vector<unsigned int> ids)
{
  std::string result;
  for (size_t i = 0; i < ids.size(); ++i) {
    result.append(getTargetWordFromID(ids[i]) + " ");
  }
  return result;
}
// Free-function variant of HuffmanDecoder::getTargetWordFromID, taking the
// lookup map explicitly. Precondition: id present in the map (find() on a
// missing id dereferences end()).
inline std::string getTargetWordFromID(unsigned int id, std::map<unsigned int, std::string> * lookup_target_phrase)
{
  return lookup_target_phrase->find(id)->second;
}
// Free-function variant: joins the words for a sequence of huffman ids
// into one string, each word followed by a single space. The per-id map
// lookup is done inline (precondition: every id is present in the map).
std::string getTargetWordsFromIDs(std::vector<unsigned int> ids, std::map<unsigned int, std::string> * lookup_target_phrase)
{
  std::string result;
  for (size_t i = 0; i < ids.size(); ++i) {
    result.append(lookup_target_phrase->find(ids[i])->second + " ");
  }
  return result;
}
/*Those functions are used to more easily store the floats in the binary phrase table
We convert the float to unsigned int so that it is the same as our other values and we can
apply variable byte encoding on top of it.*/
inline unsigned int reinterpret_float(float * num)
{
  // std::memcpy is the well-defined way to reinterpret an object's bytes;
  // the previous reinterpret_cast pointer pun violated strict aliasing.
  unsigned int converted_num;
  std::memcpy(&converted_num, num, sizeof(converted_num));
  return converted_num;
}
// Inverse of reinterpret_float: recover a float from its stored bit
// pattern. Uses std::memcpy instead of the original strict-aliasing-
// violating reinterpret_cast pun.
inline float reinterpret_uint(unsigned int * num)
{
  float converted_num;
  std::memcpy(&converted_num, num, sizeof(converted_num));
  return converted_num;
}
/*Implements ULEB128 / variable byte encoding: 7 payload bits per byte,
least-significant group first; the high bit of a byte marks "another
byte follows". Produces byte-for-byte the same output as the original
unrolled goto ladder, just written as a plain loop.*/
inline std::vector<unsigned char> vbyte_encode(unsigned int num)
{
  std::vector<unsigned char> byte_vector;

  while (num >= 0x80U) {
    byte_vector.push_back(static_cast<unsigned char>((num & 0x7f) | 0x80));
    num >>= 7;
  }
  //Final byte: high bit clear terminates the number.
  byte_vector.push_back(static_cast<unsigned char>(num));

  return byte_vector;
}
std::vector<unsigned int> vbyte_decode_line(std::vector<unsigned char> line)
{
std::vector<unsigned int> huffman_line;
std::vector<unsigned char> current_num;
for (std::vector<unsigned char>::iterator it = line.begin(); it != line.end(); it++) {
current_num.push_back(*it);
if ((*it >> 7) != 1) {
//We don't have continuation in the next bit
huffman_line.push_back(bytes_to_int(current_num));
current_num.clear();
}
}
return huffman_line;
}
inline unsigned int bytes_to_int(std::vector<unsigned char> number)
{
unsigned int retvalue = 0;
std::vector<unsigned char>::iterator it = number.begin();
unsigned char shift = 0; //By how many bits to shift
while (it != number.end()) {
retvalue |= (*it & 0x7f) << shift;
shift += 7;
it++;
}
return retvalue;
}
std::vector<unsigned char> vbyte_encode_line(std::vector<unsigned int> line)
{
std::vector<unsigned char> retvec;
//For each unsigned int in the line, vbyte encode it and add it to a vector of unsigned chars.
for (std::vector<unsigned int>::iterator it = line.begin(); it != line.end(); it++) {
std::vector<unsigned char> vbyte_encoded = vbyte_encode(*it);
retvec.insert(retvec.end(), vbyte_encoded.begin(), vbyte_encoded.end());
}
return retvec;
}

View File

@ -1,112 +0,0 @@
#pragma once
//Huffman encodes a line and also produces the vocabulary ids
#include "hash.hh"
#include "line_splitter.hh"
#include <cstdio>
#include <fstream>
#include <iostream>
#include <sstream>
#include <boost/serialization/serialization.hpp>
#include <boost/serialization/vector.hpp>
#include <boost/serialization/map.hpp>
#include <boost/archive/text_iarchive.hpp>
#include <boost/archive/text_oarchive.hpp>
//Sorting for the second
// Orders (string, count) pairs by descending count, so the most frequent
// symbols sort first and get the smallest codes.
struct sort_pair {
  bool operator()(const std::pair<std::string, unsigned int> &left, const std::pair<std::string, unsigned int> &right) {
    return right.second < left.second; //Descending: biggest counts first.
  }
};
// Same descending-by-count ordering, for (alignment-vector, count) pairs.
struct sort_pair_vec {
  bool operator()(const std::pair<std::vector<unsigned char>, unsigned int> &left, const std::pair<std::vector<unsigned char>, unsigned int> &right) {
    return right.second < left.second; //Descending: biggest counts first.
  }
};
// Builds frequency-ordered integer codes ("huffman-ish") for the symbols
// of a text phrase table and serializes the decoder lookup maps.
// Typical call order: ctor (counts), assign_values, produce_lookups,
// encode/serialize.
class Huffman
{
  unsigned long uniq_lines; //Unique lines in the file.

  //Containers used when counting the occurence of a given phrase
  std::map<std::string, unsigned int> target_phrase_words;
  std::map<std::vector<unsigned char>, unsigned int> word_all1;

  //Same containers as vectors, for sorting
  std::vector<std::pair<std::string, unsigned int> > target_phrase_words_counts;
  std::vector<std::pair<std::vector<unsigned char>, unsigned int> > word_all1_counts;

  //Huffman maps: symbol -> code
  std::map<std::string, unsigned int> target_phrase_huffman;
  std::map<std::vector<unsigned char>, unsigned int> word_all1_huffman;

  //inverted maps: code -> symbol, for decoding
  std::map<unsigned int, std::string> lookup_target_phrase;
  std::map<unsigned int, std::vector<unsigned char> > lookup_word_all1;

public:
  Huffman (const char *);
  void count_elements (line_text line);
  void assign_values();
  void serialize_maps(const char * dirname);
  void produce_lookups();

  std::vector<unsigned int> encode_line(line_text line);

  //encode line + variable byte ontop
  std::vector<unsigned char> full_encode_line(line_text line);

  //Getters. Return by const reference: the originals returned const
  //values, deep-copying the whole map on every call.
  const std::map<unsigned int, std::string> &get_target_lookup_map() const {
    return lookup_target_phrase;
  }
  const std::map<unsigned int, std::vector<unsigned char> > &get_word_all1_lookup_map() const {
    return lookup_word_all1;
  }

  unsigned long getUniqLines() {
    return uniq_lines;
  }
};
// Decodes the variable-byte + huffman encoded target phrases produced by
// Huffman, using the id -> symbol lookup maps (loaded from disk or passed
// in directly).
class HuffmanDecoder
{
  std::map<unsigned int, std::string> lookup_target_phrase;
  std::map<unsigned int, std::vector<unsigned char> > lookup_word_all1;

public:
  HuffmanDecoder (const char *);
  HuffmanDecoder (std::map<unsigned int, std::string> *, std::map<unsigned int, std::vector<unsigned char> > *);

  //Getters. Return by const reference: the originals returned const
  //values, deep-copying the whole map on every call.
  const std::map<unsigned int, std::string> &get_target_lookup_map() const {
    return lookup_target_phrase;
  }
  const std::map<unsigned int, std::vector<unsigned char> > &get_word_all1_lookup_map() const {
    return lookup_word_all1;
  }

  inline std::string getTargetWordFromID(unsigned int id);

  std::string getTargetWordsFromIDs(std::vector<unsigned int> ids);

  target_text decode_line (std::vector<unsigned int> input, int num_scores);

  //Variable byte decodes a all target phrases contained here and then passes them to decode_line
  std::vector<target_text> full_decode_line (std::vector<unsigned char> lines, int num_scores);
};
//Free-standing variants of the decoder helpers, taking the lookup map explicitly.
std::string getTargetWordsFromIDs(std::vector<unsigned int> ids, std::map<unsigned int, std::string> * lookup_target_phrase);
inline std::string getTargetWordFromID(unsigned int id, std::map<unsigned int, std::string> * lookup_target_phrase);

//Bit-level reinterpretation between float scores and their stored unsigned form.
inline unsigned int reinterpret_float(float * num);
inline float reinterpret_uint(unsigned int * num);

//Variable byte (ULEB128-style) encoding/decoding helpers.
std::vector<unsigned char> vbyte_encode_line(std::vector<unsigned int> line);
inline std::vector<unsigned char> vbyte_encode(unsigned int num);
std::vector<unsigned int> vbyte_decode_line(std::vector<unsigned char> line);
inline unsigned int bytes_to_int(std::vector<unsigned char> number);

View File

@ -1,66 +1,92 @@
#include "line_splitter.hh"
line_text splitLine(StringPiece textin)
namespace Moses
{
const char delim[] = " ||| ";
line_text splitLine(const StringPiece &textin, bool scfg)
{
const char delim[] = "|||";
line_text output;
//Tokenize
util::TokenIter<util::MultiCharacter> it(textin, util::MultiCharacter(delim));
//Get source phrase
output.source_phrase = *it;
output.source_phrase = Trim(*it);
//std::cerr << "output.source_phrase=" << output.source_phrase << "AAAA" << std::endl;
//Get target_phrase
it++;
output.target_phrase = *it;
output.target_phrase = Trim(*it);
//std::cerr << "output.target_phrase=" << output.target_phrase << "AAAA" << std::endl;
if (scfg) {
/*
std::cerr << "output.source_phrase=" << output.source_phrase << std::endl;
std::cerr << "output.target_phrase=" << output.target_phrase << std::endl;
reformatSCFG(output);
std::cerr << "output.source_phrase=" << output.source_phrase << std::endl;
std::cerr << "output.target_phrase=" << output.target_phrase << std::endl;
*/
}
//Get probabilities
it++;
output.prob = *it;
output.prob = Trim(*it);
//std::cerr << "output.prob=" << output.prob << "AAAA" << std::endl;
//Get WordAllignment
it++;
if (it == util::TokenIter<util::MultiCharacter>::end()) return output;
output.word_align = *it;
output.word_align = Trim(*it);
//std::cerr << "output.word_align=" << output.word_align << "AAAA" << std::endl;
//Get count
it++;
if (it == util::TokenIter<util::MultiCharacter>::end()) return output;
output.counts = *it;
output.counts = Trim(*it);
//std::cerr << "output.counts=" << output.counts << "AAAA" << std::endl;
//Get sparse_score
it++;
if (it == util::TokenIter<util::MultiCharacter>::end()) return output;
output.sparse_score = *it;
output.sparse_score = Trim(*it);
//std::cerr << "output.sparse_score=" << output.sparse_score << "AAAA" << std::endl;
//Get property
it++;
if (it == util::TokenIter<util::MultiCharacter>::end()) return output;
output.property = *it;
output.property = Trim(*it);
//std::cerr << "output.property=" << output.property << "AAAA" << std::endl;
return output;
}
std::vector<unsigned char> splitWordAll1(StringPiece textin)
std::vector<unsigned char> splitWordAll1(const StringPiece &textin)
{
const char delim[] = " ";
const char delim2[] = "-";
std::vector<unsigned char> output;
//Case with no word alignments.
if (textin.size() == 0) {
return output;
}
//Split on space
util::TokenIter<util::MultiCharacter> it(textin, util::MultiCharacter(delim));
//For each int
while (it) {
//Split on dash (-)
util::TokenIter<util::MultiCharacter> itInner(*it, util::MultiCharacter(delim2));
util::TokenIter<util::MultiCharacter> itInner(*it,
util::MultiCharacter(delim2));
//Insert the two entries in the vector. User will read entry 0 and 1 to get the first,
//2 and 3 for second etc. Use unsigned char instead of int to save space, as
//word allignments are all very small numbers that fit in a single byte
output.push_back((unsigned char)(atoi(itInner->data())));
output.push_back((unsigned char) (atoi(itInner->data())));
itInner++;
output.push_back((unsigned char)(atoi(itInner->data())));
output.push_back((unsigned char) (atoi(itInner->data())));
it++;
}
@ -68,3 +94,10 @@ std::vector<unsigned char> splitWordAll1(StringPiece textin)
}
//Stub: intended to rewrite SCFG-format source/target phrases before
//binarization. Currently a no-op; the call site in splitLine() is
//commented out.
void reformatSCFG(line_text &output)
{
}
}

View File

@ -9,8 +9,12 @@
#include "util/tokenize_piece.hh"
#include <vector>
namespace Moses
{
//Struct for holding processed line
struct line_text {
struct line_text
{
StringPiece source_phrase;
StringPiece target_phrase;
StringPiece prob;
@ -18,16 +22,38 @@ struct line_text {
StringPiece counts;
StringPiece sparse_score;
StringPiece property;
std::string property_to_be_binarized;
};
//Struct for holding processed line
struct target_text {
struct target_text
{
std::vector<unsigned int> target_phrase;
std::vector<float> prob;
std::vector<unsigned char> word_all1;
std::vector<size_t> word_align_term;
std::vector<size_t> word_align_non_term;
std::vector<char> counts;
std::vector<char> sparse_score;
std::vector<char> property;
/*
void Reset()
{
target_phrase.clear();
prob.clear();
word_all1.clear();
counts.clear();
sparse_score.clear();
property.clear();
}
*/
};
//Ask if it's better to have it receive a pointer to a line_text struct
line_text splitLine(StringPiece textin);
line_text splitLine(const StringPiece &textin, bool scfg);
void reformatSCFG(line_text &output);
std::vector<unsigned char> splitWordAll1(const StringPiece &textin);
}
std::vector<unsigned char> splitWordAll1(StringPiece textin);

View File

@ -1,5 +1,8 @@
#include "probing_hash_utils.hh"
namespace Moses
{
//Read table from disk, return memory map location
char * readTable(const char * filename, size_t size)
{
@ -13,7 +16,7 @@ char * readTable(const char * filename, size_t size)
exit(EXIT_FAILURE);
}
map = (char *)mmap(0, size, PROT_READ, MAP_SHARED, fd, 0);
map = (char *) mmap(0, size, PROT_READ, MAP_SHARED, fd, 0);
if (map == MAP_FAILED) {
close(fd);
@ -24,11 +27,24 @@ char * readTable(const char * filename, size_t size)
return map;
}
void serialize_table(char *mem, size_t size, const char * filename)
void serialize_table(char *mem, size_t size, const std::string &filename)
{
std::ofstream os (filename, std::ios::binary);
os.write((const char*)&mem[0], size);
std::ofstream os(filename.c_str(), std::ios::binary);
os.write((const char*) &mem[0], size);
os.close();
}
}
//Combines a source phrase's vocab ids into a single hash-table key.
//Each id is left-shifted by its position, so reorderings of the same
//ids produce different keys.
uint64_t getKey(const uint64_t source_phrase[], size_t size)
{
  //TOO SLOW
  //uint64_t key = util::MurmurHashNative(&source_phrase[0], source_phrase.size());
  uint64_t key = 0;
  for (size_t pos = 0; pos < size; ++pos) {
    key += source_phrase[pos] << pos;
  }
  return key;
}
}

View File

@ -7,31 +7,49 @@
#include <fcntl.h>
#include <fstream>
namespace Moses
{
#define API_VERSION 15
//Hash table entry
struct Entry {
uint64_t key;
struct Entry
{
typedef uint64_t Key;
unsigned int bytes_toread;
Key key;
uint64_t GetKey() const {
Key GetKey() const
{
return key;
}
void SetKey(uint64_t to) {
void SetKey(Key to)
{
key = to;
}
uint64_t GetValue() const {
return value;
}
uint64_t value;
};
#define NONE std::numeric_limits<uint64_t>::max()
//Define table
typedef util::ProbingHashTable<Entry, boost::hash<uint64_t> > Table;
void serialize_table(char *mem, size_t size, const char * filename);
void serialize_table(char *mem, size_t size, const std::string &filename);
char * readTable(const char * filename, size_t size);
uint64_t getKey(const uint64_t source_phrase[], size_t size);
//Fixed-size per-target-phrase record stored in the binary table.
struct TargetPhraseInfo
{
  uint32_t alignTerm;    // presumably the id of the terminal word-alignment
                         // vector (see StoreTarget::GetAlignId) — confirm
  uint32_t alignNonTerm; // presumably the non-terminal alignment id — confirm
  uint16_t numWords;     // presumably word count of the target phrase — confirm
  uint16_t propLength;   // presumably byte length of the properties blob — confirm
  uint16_t filler;       // explicit padding to keep the record size fixed
};
}

View File

@ -1,198 +0,0 @@
#include "quering.hh"
unsigned char * read_binary_file(const char * filename, size_t filesize)
{
//Get filesize
int fd;
unsigned char * map;
fd = open(filename, O_RDONLY);
if (fd == -1) {
perror("Error opening file for reading");
exit(EXIT_FAILURE);
}
map = (unsigned char *)mmap(0, filesize, PROT_READ, MAP_SHARED, fd, 0);
if (map == MAP_FAILED) {
close(fd);
perror("Error mmapping the file");
exit(EXIT_FAILURE);
}
return map;
}
//Opens a binarized phrase-table directory: loads both vocab maps,
//validates the config file (API version, table size, score count,
//reordering flag) and mmaps the hash table and the binary target file.
QueryEngine::QueryEngine(const char * filepath) : decoder(filepath)
{
  //Create filepaths
  std::string basepath(filepath);
  std::string path_to_hashtable = basepath + "/probing_hash.dat";
  std::string path_to_data_bin = basepath + "/binfile.dat";
  std::string path_to_source_vocabid = basepath + "/source_vocabids";

  ///Source phrase vocabids
  read_map(&source_vocabids, path_to_source_vocabid.c_str());

  //Target phrase vocabIDs
  vocabids = decoder.get_target_lookup_map();

  //Read config file
  std::string line;
  std::ifstream config ((basepath + "/config").c_str());

  //Check API version: refuse tables binarized with a different format.
  getline(config, line);
  if (atoi(line.c_str()) != API_VERSION) {
    std::cerr << "The ProbingPT API has changed, please rebinarize your phrase tables." << std::endl;
    exit(EXIT_FAILURE);
  }

  //Get tablesize.
  getline(config, line);
  int tablesize = atoi(line.c_str());

  //Number of scores
  getline(config, line);
  num_scores = atoi(line.c_str());

  //do we have a reordering table
  getline(config, line);
  std::transform(line.begin(), line.end(), line.begin(), ::tolower); //Get the boolean in lowercase
  is_reordering = false;
  if (line == "true") {
    is_reordering = true;
    std::cerr << "WARNING. REORDERING TABLES NOT SUPPORTED YET." << std::endl;
  }
  config.close();

  //Mmap binary table
  //NOTE(review): the stat() return value is unchecked — a missing
  //binfile.dat leaves filestatus undefined; confirm this cannot happen.
  struct stat filestatus;
  stat(path_to_data_bin.c_str(), &filestatus);
  binary_filesize = filestatus.st_size;
  binary_mmaped = read_binary_file(path_to_data_bin.c_str(), binary_filesize);

  //Read hashtable: Table::Size accounts for the 1.2 load-factor multiplier.
  table_filesize = Table::Size(tablesize, 1.2);
  mem = readTable(path_to_hashtable.c_str(), table_filesize);
  Table table_init(mem, table_filesize);
  table = table_init;

  std::cerr << "Initialized successfully! " << std::endl;
}
//Unmaps both memory-mapped regions (binary target file and hash table).
//NOTE(review): the file descriptors opened in read_binary_file/readTable
//are never closed on the success path — confirm this leak is acceptable
//for the process lifetime.
QueryEngine::~QueryEngine()
{
  //Clear mmap content from memory.
  munmap(binary_mmaped, binary_filesize);
  munmap(mem, table_filesize);
}
std::pair<bool, std::vector<target_text> > QueryEngine::query(std::vector<uint64_t> source_phrase)
{
bool found;
std::vector<target_text> translation_entries;
const Entry * entry;
//TOO SLOW
//uint64_t key = util::MurmurHashNative(&source_phrase[0], source_phrase.size());
uint64_t key = 0;
for (int i = 0; i < source_phrase.size(); i++) {
key += (source_phrase[i] << i);
}
found = table.Find(key, entry);
if (found) {
//The phrase that was searched for was found! We need to get the translation entries.
//We will read the largest entry in bytes and then filter the unnecesarry with functions
//from line_splitter
uint64_t initial_index = entry -> GetValue();
unsigned int bytes_toread = entry -> bytes_toread;
//ASK HIEU FOR MORE EFFICIENT WAY TO DO THIS!
std::vector<unsigned char> encoded_text; //Assign to the vector the relevant portion of the array.
encoded_text.reserve(bytes_toread);
for (int i = 0; i < bytes_toread; i++) {
encoded_text.push_back(binary_mmaped[i+initial_index]);
}
//Get only the translation entries necessary
translation_entries = decoder.full_decode_line(encoded_text, num_scores);
}
std::pair<bool, std::vector<target_text> > output (found, translation_entries);
return output;
}
/** Look up a source phrase given as text.
 *  The phrase is first converted to vocabulary IDs, then queried exactly as
 *  in query(std::vector<uint64_t>).
 *  @param source_phrase whitespace-separated source words
 *  @return pair<found, entries>; entries is empty when found is false
 */
std::pair<bool, std::vector<target_text> > QueryEngine::query(StringPiece source_phrase)
{
  std::vector<target_text> translation_entries;
  const Entry * entry;
  //Convert source phrase to vocabulary IDs
  std::vector<uint64_t> source_phrase_vid = getVocabIDs(source_phrase);
  //TOO SLOW
  //uint64_t key = util::MurmurHashNative(&source_phrase_vid[0], source_phrase_vid.size());
  //Key = sum of word IDs shifted by position (fast, not collision-free);
  //size_t index avoids the signed/unsigned comparison of the old int loop.
  uint64_t key = 0;
  for (size_t i = 0; i < source_phrase_vid.size(); i++) {
    key += (source_phrase_vid[i] << i);
  }
  bool found = table.Find(key, entry);
  if (found) {
    //The phrase was found: locate the encoded byte range for its entries.
    uint64_t initial_index = entry->GetValue();
    unsigned int bytes_toread = entry->bytes_toread;
    //At the end of the file we can't read + largest_entry cause we get a segfault.
    std::cerr << "Entry size is bytes is: " << bytes_toread << std::endl;
    //Copy the relevant slice in one shot (range constructor) instead of
    //pushing one byte at a time.
    std::vector<unsigned char> encoded_text(binary_mmaped + initial_index,
                                            binary_mmaped + initial_index + bytes_toread);
    //Decode only the translation entries we need.
    translation_entries = decoder.full_decode_line(encoded_text, num_scores);
  }
  return std::pair<bool, std::vector<target_text> >(found, translation_entries);
}
void QueryEngine::printTargetInfo(std::vector<target_text> target_phrases)
{
int entries = target_phrases.size();
for (int i = 0; i<entries; i++) {
std::cout << "Entry " << i+1 << " of " << entries << ":" << std::endl;
//Print text
std::cout << getTargetWordsFromIDs(target_phrases[i].target_phrase, &vocabids) << "\t";
//Print probabilities:
for (int j = 0; j<target_phrases[i].prob.size(); j++) {
std::cout << target_phrases[i].prob[j] << " ";
}
std::cout << "\t";
//Print word_all1
for (int j = 0; j<target_phrases[i].word_all1.size(); j++) {
if (j%2 == 0) {
std::cout << (short)target_phrases[i].word_all1[j] << "-";
} else {
std::cout << (short)target_phrases[i].word_all1[j] << " ";
}
}
std::cout << std::endl;
}
}

View File

@ -1,45 +0,0 @@
#pragma once
#include "probing_hash_utils.hh"
#include "huffmanish.hh"
#include "hash.hh" //Includes line splitter
#include <sys/stat.h> //For finding size of file
#include "vocabid.hh"
#include <algorithm> //toLower
#define API_VERSION 3
char * read_binary_file(char * filename);
//Query-side engine for a binarized ProbingPT phrase table: mmaps the
//Huffman-encoded binary phrase file and the probing hash table, and decodes
//matching target entries on lookup.
class QueryEngine
{
unsigned char * binary_mmaped; //The binary phrase table file (mmap'd)
std::map<unsigned int, std::string> vocabids; //target-side id -> word (used by printTargetInfo)
std::map<uint64_t, std::string> source_vocabids; //source word hash -> word
Table table; //the probing hash table
char *mem; //Memory for the table, necessary so that we can correctly destroy the object
HuffmanDecoder decoder; //decodes the encoded target-phrase bytes
size_t binary_filesize; //size of the mmap'd binary file (for munmap)
size_t table_filesize; //size of the table memory (for munmap)
int num_scores; //number of phrase scores per entry (from the config file)
bool is_reordering; //set from config; reordering tables are not supported yet
public:
QueryEngine (const char *);
~QueryEngine();
std::pair<bool, std::vector<target_text> > query(StringPiece source_phrase);
std::pair<bool, std::vector<target_text> > query(std::vector<uint64_t> source_phrase);
void printTargetInfo(std::vector<target_text> target_phrases);
//Target vocabulary comes from the Huffman decoder's lookup map.
const std::map<unsigned int, std::string> getVocab() const {
return decoder.get_target_lookup_map();
}
const std::map<uint64_t, std::string> getSourceVocab() const {
return source_vocabids;
}
};

View File

@ -0,0 +1,142 @@
#include "quering.hh"
#include "util/exception.hh"
using namespace std;
namespace Moses
{
/** Load a binarized ProbingPT phrase table from the directory 'filepath':
 *  read the source vocabulary and alignment files, parse the key/value
 *  config, and load the probing hash table.
 *  Throws if the config file is missing; exits on any missing or
 *  incompatible config key.
 */
QueryEngine::QueryEngine(const char * filepath)
{
  //Create filepaths
  std::string basepath(filepath);
  std::string path_to_config = basepath + "/config";
  std::string path_to_hashtable = basepath + "/probing_hash.dat";
  std::string path_to_source_vocabid = basepath + "/source_vocabids";
  std::string alignPath = basepath + "/Alignments.dat";
  if (!FileExists(path_to_config)) {
    UTIL_THROW2("Binary table doesn't exist or didn't finish binarizing: " << path_to_config);
  }
  ///Source phrase vocabids
  read_map(source_vocabids, path_to_source_vocabid.c_str());
  // alignments
  read_alignments(alignPath);
  //Read config file: one "key<TAB>value" pair per line
  boost::unordered_map<std::string, std::string> keyValue;
  std::ifstream config(path_to_config.c_str());
  std::string line;
  while (getline(config, line)) {
    std::vector<std::string> toks = Tokenize(line, "\t");
    UTIL_THROW_IF2(toks.size() != 2, "Wrong config format:" << line);
    keyValue[ toks[0] ] = toks[1];
  }
  bool found;
  //Check API version:
  int version;
  found = Get(keyValue, "API_VERSION", version);
  if (!found) {
    std::cerr << "Old or corrupted version of ProbingPT. Please rebinarize your phrase tables." << std::endl;
    //Previously fell through and kept running with an unusable table;
    //exit like every other missing-key branch below.
    exit(EXIT_FAILURE);
  } else if (version != API_VERSION) {
    std::cerr << "The ProbingPT API has changed. " << version << "!="
              << API_VERSION << " Please rebinarize your phrase tables." << std::endl;
    exit(EXIT_FAILURE);
  }
  //Get tablesize (number of unique source phrases).
  int tablesize;
  found = Get(keyValue, "uniq_entries", tablesize);
  if (!found) {
    std::cerr << "uniq_entries not found" << std::endl;
    exit(EXIT_FAILURE);
  }
  //Number of phrase-table scores
  found = Get(keyValue, "num_scores", num_scores);
  if (!found) {
    std::cerr << "num_scores not found" << std::endl;
    exit(EXIT_FAILURE);
  }
  //How many scores come from lexicalized reordering models
  found = Get(keyValue, "num_lex_scores", num_lex_scores);
  if (!found) {
    std::cerr << "num_lex_scores not found" << std::endl;
    exit(EXIT_FAILURE);
  }
  // have the scores been log() and FloorScore()?
  found = Get(keyValue, "log_prob", logProb);
  if (!found) {
    std::cerr << "logProb not found" << std::endl;
    exit(EXIT_FAILURE);
  }
  config.close();
  //Read hashtable; 1.2 is the load-factor head-room used at binarization time
  table_filesize = Table::Size(tablesize, 1.2);
  mem = readTable(path_to_hashtable.c_str(), table_filesize);
  Table table_init(mem, table_filesize);
  table = table_init;
  std::cerr << "Initialized successfully! " << std::endl;
}
//Destructor: release the hash-table memory obtained via readTable() in the
//constructor (the munmap call indicates readTable mmaps the file — TODO
//confirm against readTable's implementation).
QueryEngine::~QueryEngine()
{
//Clear mmap content from memory.
munmap(mem, table_filesize);
}
//Compute the hash-table key for a source phrase given as an array of vocab
//IDs. Delegates to the shared Moses::getKey so that the binarizer and the
//query side always agree on the key function.
uint64_t QueryEngine::getKey(uint64_t source_phrase[], size_t size) const
{
//TOO SLOW
//uint64_t key = util::MurmurHashNative(&source_phrase[0], source_phrase.size());
return Moses::getKey(source_phrase, size);
}
/** Probe the hash table with a precomputed key (see getKey()).
 *  @param key phrase key
 *  @return pair<found, value>: value is the entry's stored offset when
 *          found, and 0 otherwise (previously it was left indeterminate
 *          on a miss).
 */
std::pair<bool, uint64_t> QueryEngine::query(uint64_t key)
{
  //Value-initialize so .second is never returned indeterminate.
  std::pair<bool, uint64_t> ret(false, 0);
  const Entry * entry;
  ret.first = table.Find(key, entry);
  if (ret.first) {
    ret.second = entry->value;
  }
  return ret;
}
//Read the alignment file: each line is "<index><TAB or space>pos pos ...".
//Positions are appended to alignColl[index], growing the collection as
//needed; lines with no tokens abort with an exception.
//NOTE(review): positions are stored in a vector<unsigned char>, so a value
//above 255 would be silently truncated — TODO confirm phrase lengths make
//this impossible.
void QueryEngine::read_alignments(const std::string &alignPath)
{
std::ifstream strm(alignPath.c_str());
string line;
while (getline(strm, line)) {
//Split on tabs and spaces (assuming Tokenize treats each char in "\t " as
//a delimiter — TODO confirm against Moses::Tokenize)
vector<string> toks = Tokenize(line, "\t ");
UTIL_THROW_IF2(toks.size() == 0, "Corrupt alignment file");
//First token is the alignment-collection index the entry refers to
uint32_t alignInd = Scan<uint32_t>(toks[0]);
if (alignInd >= alignColl.size()) {
alignColl.resize(alignInd + 1);
}
Alignments &aligns = alignColl[alignInd];
//Remaining tokens are the alignment positions themselves
for (size_t i = 1; i < toks.size(); ++i) {
size_t pos = Scan<size_t>(toks[i]);
aligns.push_back(pos);
}
}
}
}

View File

@ -0,0 +1,65 @@
#pragma once
#include <boost/unordered_map.hpp>
#include <sys/stat.h> //For finding size of file
#include "vocabid.hh"
#include <algorithm> //toLower
#include <deque>
#include "probing_hash_utils.hh"
#include "hash.hh" //Includes line splitter
#include "line_splitter.hh"
#include "moses//Util.h"
namespace Moses
{
//Query-side engine for a binarized ProbingPT phrase table: maps a
//precomputed phrase key to the stored offset of its target entries, and
//exposes the source vocabulary and alignment collections read at load time.
class QueryEngine
{
std::map<uint64_t, std::string> source_vocabids; //source word hash -> word
typedef std::vector<unsigned char> Alignments; //alignment positions of one entry
std::vector<Alignments> alignColl; //indexed by the id in Alignments.dat
Table table; //the probing hash table
char *mem; //Memory for the table, necessary so that we can correctly destroy the object
size_t table_filesize; //size of the table memory (for munmap)
bool is_reordering; //NOTE(review): appears unused in the visible code — TODO confirm
void read_alignments(const std::string &alignPath);
public:
int num_scores; //number of phrase-table scores (from config)
int num_lex_scores; //number of lexicalized-reordering scores (from config)
bool logProb; //true if scores were log()'d and floored at binarization time
QueryEngine(const char *);
~QueryEngine();
std::pair<bool, uint64_t> query(uint64_t key);
const std::map<uint64_t, std::string> &getSourceVocab() const
{ return source_vocabids; }
const std::vector<Alignments> &getAlignments() const
{ return alignColl; }
uint64_t getKey(uint64_t source_phrase[], size_t size) const;
//Fetch a config value by key and Scan<> it into 'found'; returns false
//(leaving 'found' untouched) when the key is absent.
template<typename T>
inline bool Get(const boost::unordered_map<std::string, std::string> &keyValue, const std::string &sought, T &found) const
{
boost::unordered_map<std::string, std::string>::const_iterator iter = keyValue.find(sought);
if (iter == keyValue.end()) {
return false;
}
const std::string &foundStr = iter->second;
found = Scan<T>(foundStr);
return true;
}
};
}

View File

@ -1,161 +1,303 @@
#include <sys/stat.h>
#include <boost/foreach.hpp>
#include "line_splitter.hh"
#include "storing.hh"
#include "StoreTarget.h"
#include "StoreVocab.h"
#include "moses/Util.h"
#include "moses/InputFileStream.h"
BinaryFileWriter::BinaryFileWriter (std::string basepath) : os ((basepath + "/binfile.dat").c_str(), std::ios::binary)
{
binfile.reserve(10000); //Reserve part of the vector to avoid realocation
it = binfile.begin();
dist_from_start = 0; //Initialize variables
extra_counter = 0;
}
using namespace std;
void BinaryFileWriter::write (std::vector<unsigned char> * bytes)
namespace Moses
{
binfile.insert(it, bytes->begin(), bytes->end()); //Insert the bytes
//Keep track of the offsets
it += bytes->size();
dist_from_start = distance(binfile.begin(),it);
//Flush the vector to disk every once in a while so that we don't consume too much ram
if (dist_from_start > 9000) {
flush();
///////////////////////////////////////////////////////////////////////
void Node::Add(Table &table, const SourcePhrase &sourcePhrase, size_t pos)
{
if (pos < sourcePhrase.size()) {
uint64_t vocabId = sourcePhrase[pos];
Node *child;
Children::iterator iter = m_children.find(vocabId);
if (iter == m_children.end()) {
// New node. Write other children then discard them
BOOST_FOREACH(Children::value_type &valPair, m_children) {
Node &otherChild = valPair.second;
otherChild.Write(table);
}
m_children.clear();
// create new node
child = &m_children[vocabId];
assert(!child->done);
child->key = key + (vocabId << pos);
}
else {
child = &iter->second;
}
child->Add(table, sourcePhrase, pos + 1);
}
else {
// this node was written previously 'cos it has rules
done = true;
}
}
void BinaryFileWriter::flush ()
void Node::Write(Table &table)
{
//Cast unsigned char to char before writing...
os.write((char *)&binfile[0], dist_from_start);
//Clear the vector:
binfile.clear();
binfile.reserve(10000);
extra_counter += dist_from_start; //Keep track of the total number of bytes.
it = binfile.begin(); //Reset iterator
dist_from_start = distance(binfile.begin(),it); //Reset dist from start
//cerr << "START write " << done << " " << key << endl;
BOOST_FOREACH(Children::value_type &valPair, m_children) {
Node &child = valPair.second;
child.Write(table);
}
if (!done) {
// save
Entry sourceEntry;
sourceEntry.value = NONE;
sourceEntry.key = key;
//Put into table
table.Insert(sourceEntry);
}
}
BinaryFileWriter::~BinaryFileWriter ()
///////////////////////////////////////////////////////////////////////
void createProbingPT(const std::string &phrasetable_path,
const std::string &basepath, int num_scores, int num_lex_scores,
bool log_prob, int max_cache_size, bool scfg)
{
os.close();
binfile.clear();
}
std::cerr << "Starting..." << std::endl;
void createProbingPT(const char * phrasetable_path, const char * target_path,
const char * num_scores, const char * is_reordering)
{
//Get basepath and create directory if missing
std::string basepath(target_path);
mkdir(basepath.c_str(), S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH);
//Set up huffman and serialize decoder maps.
Huffman huffmanEncoder(phrasetable_path); //initialize
huffmanEncoder.assign_values();
huffmanEncoder.produce_lookups();
huffmanEncoder.serialize_maps(target_path);
StoreTarget storeTarget(basepath);
//Get uniq lines:
unsigned long uniq_entries = huffmanEncoder.getUniqLines();
unsigned long uniq_entries = countUniqueSource(phrasetable_path);
//Source phrase vocabids
std::map<uint64_t, std::string> source_vocabids;
StoreVocab<uint64_t> sourceVocab(basepath + "/source_vocabids");
//Read the file
util::FilePiece filein(phrasetable_path);
util::FilePiece filein(phrasetable_path.c_str());
//Init the probing hash table
size_t size = Table::Size(uniq_entries, 1.2);
char * mem = new char[size];
memset(mem, 0, size);
Table table(mem, size);
Table sourceEntries(mem, size);
BinaryFileWriter binfile(basepath); //Init the binary file writer.
line_text prev_line; //Check if the source phrase of the previous line is the same
std::priority_queue<CacheItem*, std::vector<CacheItem*>, CacheItemOrderer> cache;
float totalSourceCount = 0;
//Keep track of the size of each group of target phrases
uint64_t entrystartidx = 0;
//uint64_t line_num = 0;
size_t line_num = 0;
//Read everything and processs
while(true) {
std::string prevSource;
Node sourcePhrases;
sourcePhrases.done = true;
sourcePhrases.key = 0;
while (true) {
try {
//Process line read
line_text line;
line = splitLine(filein.ReadLine());
line = splitLine(filein.ReadLine(), scfg);
//cerr << "line=" << line.source_phrase << endl;
++line_num;
if (line_num % 1000000 == 0) {
std::cerr << line_num << " " << std::flush;
}
//Add source phrases to vocabularyIDs
add_to_map(&source_vocabids, line.source_phrase);
add_to_map(sourceVocab, line.source_phrase);
if ((binfile.dist_from_start + binfile.extra_counter) == 0) {
prev_line = line; //For the first iteration assume the previous line is
} //The same as this one.
if (line.source_phrase != prev_line.source_phrase) {
if (prevSource.empty()) {
// 1st line
prevSource = line.source_phrase.as_string();
storeTarget.Append(line, log_prob, scfg);
}
else if (prevSource == line.source_phrase) {
//If we still have the same line, just append to it:
storeTarget.Append(line, log_prob, scfg);
}
else {
assert(prevSource != line.source_phrase);
//Create a new entry even
// save
uint64_t targetInd = storeTarget.Save();
// next line
storeTarget.Append(line, log_prob, scfg);
//Create an entry for the previous source phrase:
Entry pesho;
pesho.value = entrystartidx;
Entry sourceEntry;
sourceEntry.value = targetInd;
//The key is the sum of hashes of individual words bitshifted by their position in the phrase.
//Probably not entirerly correct, but fast and seems to work fine in practise.
pesho.key = 0;
std::vector<uint64_t> vocabid_source = getVocabIDs(prev_line.source_phrase);
for (int i = 0; i < vocabid_source.size(); i++) {
pesho.key += (vocabid_source[i] << i);
std::vector<uint64_t> vocabid_source = getVocabIDs(prevSource);
if (scfg) {
// storing prefixes?
sourcePhrases.Add(sourceEntries, vocabid_source);
}
pesho.bytes_toread = binfile.dist_from_start + binfile.extra_counter - entrystartidx;
sourceEntry.key = getKey(vocabid_source);
/*
cerr << "prevSource=" << prevSource << flush
<< " vocabids=" << Debug(vocabid_source) << flush
<< " key=" << sourceEntry.key << endl;
*/
//Put into table
table.Insert(pesho);
sourceEntries.Insert(sourceEntry);
entrystartidx = binfile.dist_from_start + binfile.extra_counter; //Designate start idx for new entry
// update cache - CURRENT source phrase, not prev
if (max_cache_size) {
std::string countStr = line.counts.as_string();
countStr = Trim(countStr);
if (!countStr.empty()) {
std::vector<float> toks = Tokenize<float>(countStr);
//cerr << "CACHE:" << line.source_phrase << " " << countStr << " " << toks[1] << endl;
//Encode a line and write it to disk.
std::vector<unsigned char> encoded_line = huffmanEncoder.full_encode_line(line);
binfile.write(&encoded_line);
if (toks.size() >= 2) {
totalSourceCount += toks[1];
// compute key for CURRENT source
std::vector<uint64_t> currVocabidSource = getVocabIDs(line.source_phrase.as_string());
uint64_t currKey = getKey(currVocabidSource);
CacheItem *item = new CacheItem(
Trim(line.source_phrase.as_string()),
currKey,
toks[1]);
cache.push(item);
if (max_cache_size > 0 && cache.size() > max_cache_size) {
cache.pop();
}
}
}
}
//Set prevLine
prev_line = line;
} else {
//If we still have the same line, just append to it:
std::vector<unsigned char> encoded_line = huffmanEncoder.full_encode_line(line);
binfile.write(&encoded_line);
prevSource = line.source_phrase.as_string();
}
} catch (util::EndOfFileException e) {
std::cerr << "Reading phrase table finished, writing remaining files to disk." << std::endl;
binfile.flush();
}
catch (util::EndOfFileException e) {
std::cerr
<< "Reading phrase table finished, writing remaining files to disk."
<< std::endl;
//After the final entry is constructed we need to add it to the phrase_table
//Create an entry for the previous source phrase:
Entry pesho;
pesho.value = entrystartidx;
uint64_t targetInd = storeTarget.Save();
Entry sourceEntry;
sourceEntry.value = targetInd;
//The key is the sum of hashes of individual words. Probably not entirerly correct, but fast
pesho.key = 0;
std::vector<uint64_t> vocabid_source = getVocabIDs(prev_line.source_phrase);
for (int i = 0; i < vocabid_source.size(); i++) {
pesho.key += (vocabid_source[i] << i);
}
pesho.bytes_toread = binfile.dist_from_start + binfile.extra_counter - entrystartidx;
std::vector<uint64_t> vocabid_source = getVocabIDs(prevSource);
sourceEntry.key = getKey(vocabid_source);
//Put into table
table.Insert(pesho);
sourceEntries.Insert(sourceEntry);
break;
}
}
serialize_table(mem, size, (basepath + "/probing_hash.dat").c_str());
sourcePhrases.Write(sourceEntries);
serialize_map(&source_vocabids, (basepath + "/source_vocabids").c_str());
storeTarget.SaveAlignment();
serialize_table(mem, size, (basepath + "/probing_hash.dat"));
sourceVocab.Save();
serialize_cache(cache, (basepath + "/cache"), totalSourceCount);
delete[] mem;
//Write configfile
std::ofstream configfile;
configfile.open((basepath + "/config").c_str());
configfile << API_VERSION << '\n';
configfile << uniq_entries << '\n';
configfile << num_scores << '\n';
configfile << is_reordering << '\n';
configfile << "API_VERSION\t" << API_VERSION << '\n';
configfile << "uniq_entries\t" << uniq_entries << '\n';
configfile << "num_scores\t" << num_scores << '\n';
configfile << "num_lex_scores\t" << num_lex_scores << '\n';
configfile << "log_prob\t" << log_prob << '\n';
configfile.close();
}
size_t countUniqueSource(const std::string &path)
{
size_t ret = 0;
InputFileStream strme(path);
std::string line, prevSource;
while (std::getline(strme, line)) {
std::vector<std::string> toks = TokenizeMultiCharSeparator(line, "|||");
assert(toks.size() != 0);
if (prevSource != toks[0]) {
prevSource = toks[0];
++ret;
}
}
return ret;
}
void serialize_cache(
std::priority_queue<CacheItem*, std::vector<CacheItem*>, CacheItemOrderer> &cache,
const std::string &path, float totalSourceCount)
{
std::vector<const CacheItem*> vec(cache.size());
size_t ind = cache.size() - 1;
while (!cache.empty()) {
const CacheItem *item = cache.top();
vec[ind] = item;
cache.pop();
--ind;
}
std::ofstream os(path.c_str());
os << totalSourceCount << std::endl;
for (size_t i = 0; i < vec.size(); ++i) {
const CacheItem *item = vec[i];
os << item->count << "\t" << item->sourceKey << "\t" << item->source << std::endl;
delete item;
}
os.close();
}
uint64_t getKey(const std::vector<uint64_t> &vocabid_source)
{
return getKey(vocabid_source.data(), vocabid_source.size());
}
std::vector<uint64_t> CreatePrefix(const std::vector<uint64_t> &vocabid_source, size_t endPos)
{
assert(endPos < vocabid_source.size());
std::vector<uint64_t> ret(endPos + 1);
for (size_t i = 0; i <= endPos; ++i) {
ret[i] = vocabid_source[i];
}
return ret;
}
}

View File

@ -1,36 +1,95 @@
#pragma once
#include <boost/unordered_set.hpp>
#include <boost/unordered_map.hpp>
#include <cstdio>
#include <sstream>
#include <fstream>
#include <iostream>
#include <string>
#include <queue>
#include <sys/stat.h> //mkdir
#include "hash.hh" //Includes line_splitter
#include "probing_hash_utils.hh"
#include "huffmanish.hh"
#include <sys/stat.h> //mkdir
#include "util/file_piece.hh"
#include "util/file.hh"
#include "vocabid.hh"
#define API_VERSION 3
void createProbingPT(const char * phrasetable_path, const char * target_path,
const char * num_scores, const char * is_reordering);
class BinaryFileWriter
namespace Moses
{
std::vector<unsigned char> binfile;
std::vector<unsigned char>::iterator it;
//Output binary
std::ofstream os;
typedef std::vector<uint64_t> SourcePhrase;
class Node
{
typedef boost::unordered_map<uint64_t, Node> Children;
Children m_children;
public:
unsigned int dist_from_start; //Distance from the start of the vector.
uint64_t extra_counter; //After we reset the counter, we still want to keep track of the correct offset, so
uint64_t key;
bool done;
BinaryFileWriter (std::string);
~BinaryFileWriter ();
void write (std::vector<unsigned char> * bytes);
void flush (); //Flush to disk
Node()
:done(false)
{}
void Add(Table &table, const SourcePhrase &sourcePhrase, size_t pos = 0);
void Write(Table &table);
};
void createProbingPT(const std::string &phrasetable_path,
const std::string &basepath, int num_scores, int num_lex_scores,
bool log_prob, int max_cache_size, bool scfg);
uint64_t getKey(const std::vector<uint64_t> &source_phrase);
std::vector<uint64_t> CreatePrefix(const std::vector<uint64_t> &vocabid_source, size_t endPos);
template<typename T>
std::string Debug(const std::vector<T> &vec)
{
std::stringstream strm;
for (size_t i = 0; i < vec.size(); ++i) {
strm << vec[i] << " ";
}
return strm.str();
}
size_t countUniqueSource(const std::string &path);
class CacheItem
{
public:
std::string source;
uint64_t sourceKey;
float count;
CacheItem(const std::string &vSource, uint64_t vSourceKey, float vCount)
:source(vSource)
,sourceKey(vSourceKey)
,count(vCount)
{
}
bool operator<(const CacheItem &other) const
{
return count > other.count;
}
};
class CacheItemOrderer
{
public:
bool operator()(const CacheItem* a, const CacheItem* b) const
{
return (*a) < (*b);
}
};
void serialize_cache(
std::priority_queue<CacheItem*, std::vector<CacheItem*>, CacheItemOrderer> &cache,
const std::string &path, float totalSourceCount);
}

View File

@ -1,32 +1,59 @@
#include <boost/foreach.hpp>
#include "vocabid.hh"
#include "StoreVocab.h"
#include "moses/Util.h"
void add_to_map(std::map<uint64_t, std::string> *karta, StringPiece textin)
namespace Moses
{
void add_to_map(StoreVocab<uint64_t> &sourceVocab,
const StringPiece &textin)
{
//Tokenize
util::TokenIter<util::SingleCharacter> it(textin, util::SingleCharacter(' '));
util::TokenIter<util::SingleCharacter> itWord(textin, util::SingleCharacter(' '));
while(it) {
karta->insert(std::pair<uint64_t, std::string>(getHash(*it), it->as_string()));
it++;
while (itWord) {
StringPiece word = *itWord;
util::TokenIter<util::SingleCharacter> itFactor(word, util::SingleCharacter('|'));
while (itFactor) {
StringPiece factor = *itFactor;
sourceVocab.Insert(getHash(factor), factor.as_string());
itFactor++;
}
itWord++;
}
}
void serialize_map(std::map<uint64_t, std::string> *karta, const char* filename)
void serialize_map(const std::map<uint64_t, std::string> &karta,
const std::string &filename)
{
std::ofstream os (filename, std::ios::binary);
boost::archive::text_oarchive oarch(os);
std::ofstream os(filename.c_str());
std::map<uint64_t, std::string>::const_iterator iter;
for (iter = karta.begin(); iter != karta.end(); ++iter) {
os << iter->first << '\t' << iter->second << std::endl;
}
oarch << *karta; //Serialise map
os.close();
}
void read_map(std::map<uint64_t, std::string> *karta, const char* filename)
void read_map(std::map<uint64_t, std::string> &karta, const char* filename)
{
std::ifstream is (filename, std::ios::binary);
boost::archive::text_iarchive iarch(is);
std::ifstream is(filename);
iarch >> *karta;
std::string line;
while (getline(is, line)) {
std::vector<std::string> toks = Tokenize(line, "\t");
assert(toks.size() == 2);
uint64_t ind = Scan<uint64_t>(toks[1]);
karta[ind] = toks[0];
}
//Close the stream after we are done.
is.close();
}
}

View File

@ -13,8 +13,17 @@
#include "util/string_piece.hh" //Tokenization and work with StringPiece
#include "util/tokenize_piece.hh"
void add_to_map(std::map<uint64_t, std::string> *karta, StringPiece textin);
namespace Moses
{
template<typename VOCABID>
class StoreVocab;
void serialize_map(std::map<uint64_t, std::string> *karta, const char* filename);
void add_to_map(StoreVocab<uint64_t> &sourceVocab,
const StringPiece &textin);
void read_map(std::map<uint64_t, std::string> *karta, const char* filename);
void serialize_map(const std::map<uint64_t, std::string> &karta,
const std::string &filename);
void read_map(std::map<uint64_t, std::string> &karta, const char* filename);
}