mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-10-26 19:37:58 +03:00
merge
This commit is contained in:
commit
b7f1b360be
2
Jamroot
2
Jamroot
@ -341,3 +341,5 @@ if [ path.exists $(TOP)/dist ] && $(prefix) != dist {
|
||||
local temp = [ _shell "mkdir -p $(TOP)/bin" ] ;
|
||||
local temp = [ _shell "rm -f $(TOP)/bin/moses_chart" ] ;
|
||||
local temp = [ _shell "cd $(TOP)/bin && ln -s moses moses_chart" ] ;
|
||||
local temp = [ _shell "cd $(TOP)/bin && ln -s CreateProbingPT CreateProbingPT2" ] ;
|
||||
|
||||
|
@ -1,113 +0,0 @@
|
||||
#include <string>
|
||||
#include <boost/program_options.hpp>
|
||||
#include "util/usage.hh"
|
||||
#include "TranslationModel/ProbingPT/storing.hh"
|
||||
#include "legacy/InputFileStream.h"
|
||||
#include "legacy/OutputFileStream.h"
|
||||
#include "legacy/Util2.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
std::string ReformatSCFGFile(const std::string &path);
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
string inPath, outPath;
|
||||
int num_scores = 4;
|
||||
int num_lex_scores = 0;
|
||||
bool log_prob = false;
|
||||
bool scfg = false;
|
||||
int max_cache_size = 50000;
|
||||
|
||||
namespace po = boost::program_options;
|
||||
po::options_description desc("Options");
|
||||
desc.add_options()
|
||||
("help", "Print help messages")
|
||||
("input-pt", po::value<string>()->required(), "Text pt")
|
||||
("output-dir", po::value<string>()->required(), "Directory when binary files will be written")
|
||||
("num-scores", po::value<int>()->default_value(num_scores), "Number of pt scores")
|
||||
("num-lex-scores", po::value<int>()->default_value(num_lex_scores), "Number of lexicalized reordering scores")
|
||||
("log-prob", "log (and floor) probabilities before storing")
|
||||
("max-cache-size", po::value<int>()->default_value(max_cache_size), "Maximum number of high-count source lines to write to cache file. 0=no cache, negative=no limit")
|
||||
("scfg", "Rules are SCFG in Moses format (ie. with non-terms and LHS")
|
||||
|
||||
;
|
||||
|
||||
po::variables_map vm;
|
||||
try {
|
||||
po::store(po::parse_command_line(argc, argv, desc),
|
||||
vm); // can throw
|
||||
|
||||
/** --help option
|
||||
*/
|
||||
if ( vm.count("help")) {
|
||||
std::cout << desc << std::endl;
|
||||
return EXIT_SUCCESS;
|
||||
}
|
||||
|
||||
po::notify(vm); // throws on error, so do after help in case
|
||||
// there are any problems
|
||||
} catch(po::error& e) {
|
||||
std::cerr << "ERROR: " << e.what() << std::endl << std::endl;
|
||||
std::cerr << desc << std::endl;
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
|
||||
if (vm.count("input-pt")) inPath = vm["input-pt"].as<string>();
|
||||
if (vm.count("output-dir")) outPath = vm["output-dir"].as<string>();
|
||||
if (vm.count("num-scores")) num_scores = vm["num-scores"].as<int>();
|
||||
if (vm.count("num-lex-scores")) num_lex_scores = vm["num-lex-scores"].as<int>();
|
||||
if (vm.count("max-cache-size")) max_cache_size = vm["max-cache-size"].as<int>();
|
||||
if (vm.count("log-prob")) log_prob = true;
|
||||
if (vm.count("scfg")) scfg = true;
|
||||
|
||||
|
||||
if (scfg) {
|
||||
inPath = ReformatSCFGFile(inPath);
|
||||
}
|
||||
|
||||
Moses2::createProbingPT(inPath, outPath, num_scores, num_lex_scores, log_prob, max_cache_size, scfg);
|
||||
|
||||
//util::PrintUsage(std::cout);
|
||||
return 0;
|
||||
}
|
||||
|
||||
std::string ReformatSCFGFile(const std::string &path)
|
||||
{
|
||||
Moses2::InputFileStream inFile(path);
|
||||
string reformattedPath = path + ".reformat.gz";
|
||||
Moses2::OutputFileStream outFile(reformattedPath);
|
||||
|
||||
string line;
|
||||
while (getline(inFile, line)) {
|
||||
vector<string> toks = Moses2::TokenizeMultiCharSeparator(line, "|||");
|
||||
assert(toks.size() >= 3);
|
||||
|
||||
// source
|
||||
vector<string> sourceToks = Moses2::Tokenize(toks[0], " ");
|
||||
for (size_t i = 0; i < sourceToks.size() - 1; ++i) {
|
||||
outFile << sourceToks[i] << " ";
|
||||
}
|
||||
|
||||
// other columns
|
||||
for (size_t i = 1; i < toks.size(); ++i) {
|
||||
outFile << "|||" << toks[i];
|
||||
}
|
||||
outFile << endl;
|
||||
}
|
||||
|
||||
inFile.Close();
|
||||
outFile.Close();
|
||||
|
||||
string sortedPath = path + ".reformat.sorted.gz";
|
||||
string tmpPath = path + ".tmp ";
|
||||
string cmd = "mkdir " + tmpPath
|
||||
+ " && gzip -dc " + reformattedPath + " | LC_ALL=C sort -T " + tmpPath + " | gzip -c > " + sortedPath;
|
||||
system(cmd.c_str());
|
||||
|
||||
cmd = "rm -rf " + tmpPath + " " + reformattedPath;
|
||||
system(cmd.c_str());
|
||||
|
||||
return sortedPath;
|
||||
}
|
||||
|
@ -72,7 +72,7 @@ alias deps : ../..//z ../..//boost_iostreams ../..//boost_filesystem ../../mose
|
||||
TranslationModel/ProbingPT/hash.cpp
|
||||
TranslationModel/ProbingPT/line_splitter.cpp
|
||||
TranslationModel/ProbingPT/probing_hash_utils.cpp
|
||||
TranslationModel/ProbingPT/quering.cpp
|
||||
TranslationModel/ProbingPT/querying.cpp
|
||||
TranslationModel/ProbingPT/storing.cpp
|
||||
TranslationModel/ProbingPT/StoreVocab.cpp
|
||||
TranslationModel/ProbingPT/StoreTarget.cpp
|
||||
@ -173,11 +173,10 @@ alias deps : ../..//z ../..//boost_iostreams ../..//boost_filesystem ../../mose
|
||||
deps ;
|
||||
|
||||
exe moses2 : Main.cpp moses2_lib ;
|
||||
exe CreateProbingPT2 : CreateProbingPT2.cpp moses2_lib ;
|
||||
|
||||
if [ xmlrpc ] {
|
||||
echo "Building Moses2" ;
|
||||
alias programs : moses2 CreateProbingPT2 ;
|
||||
alias programs : moses2 ;
|
||||
}
|
||||
else {
|
||||
echo "Not building Moses2" ;
|
||||
|
@ -230,6 +230,14 @@ public:
|
||||
//std::cerr << "destroy " << p << " " << n << std::endl;
|
||||
}
|
||||
|
||||
// return address of values
|
||||
pointer address (reference value) const {
|
||||
return &value;
|
||||
}
|
||||
const_pointer address (const_reference value) const {
|
||||
return &value;
|
||||
}
|
||||
|
||||
MemPool &m_pool;
|
||||
protected:
|
||||
};
|
||||
|
@ -6,7 +6,7 @@
|
||||
*/
|
||||
#include <boost/foreach.hpp>
|
||||
#include "ProbingPT.h"
|
||||
#include "quering.hh"
|
||||
#include "querying.hh"
|
||||
#include "probing_hash_utils.hh"
|
||||
#include "util/exception.hh"
|
||||
#include "../../System.h"
|
||||
|
@ -1,4 +1,4 @@
|
||||
#include "quering.hh"
|
||||
#include "querying.hh"
|
||||
#include "util/exception.hh"
|
||||
#include "../../legacy/Util2.h"
|
||||
|
||||
@ -12,10 +12,15 @@ QueryEngine::QueryEngine(const char * filepath)
|
||||
|
||||
//Create filepaths
|
||||
std::string basepath(filepath);
|
||||
std::string path_to_config = basepath + "/config";
|
||||
std::string path_to_hashtable = basepath + "/probing_hash.dat";
|
||||
std::string path_to_source_vocabid = basepath + "/source_vocabids";
|
||||
std::string alignPath = basepath + "/Alignments.dat";
|
||||
|
||||
if (!FileExists(path_to_config)) {
|
||||
UTIL_THROW2("Binary table doesn't exist is didn't finish binarizing: " << path_to_config);
|
||||
}
|
||||
|
||||
///Source phrase vocabids
|
||||
read_map(source_vocabids, path_to_source_vocabid.c_str());
|
||||
|
||||
@ -25,7 +30,7 @@ QueryEngine::QueryEngine(const char * filepath)
|
||||
//Read config file
|
||||
boost::unordered_map<std::string, std::string> keyValue;
|
||||
|
||||
std::ifstream config((basepath + "/config").c_str());
|
||||
std::ifstream config(path_to_config.c_str());
|
||||
std::string line;
|
||||
while (getline(config, line)) {
|
||||
std::vector<std::string> toks = Moses2::Tokenize(line, "\t");
|
@ -1319,7 +1319,7 @@
|
||||
<name>FF/PhraseBoundaryFeature.h</name>
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/PhraseBoundaryFeature.h</locationURI>
|
||||
</link>
|
||||
</link>
|
||||
<link>
|
||||
<name>FF/PhraseDistanceFeature.cpp</name>
|
||||
<type>1</type>
|
||||
@ -3340,6 +3340,26 @@
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/ProbingPT.h</locationURI>
|
||||
</link>
|
||||
<link>
|
||||
<name>TranslationModel/ProbingPT/StoreTarget.cpp</name>
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/StoreTarget.cpp</locationURI>
|
||||
</link>
|
||||
<link>
|
||||
<name>TranslationModel/ProbingPT/StoreTarget.h</name>
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/StoreTarget.h</locationURI>
|
||||
</link>
|
||||
<link>
|
||||
<name>TranslationModel/ProbingPT/StoreVocab.cpp</name>
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/StoreVocab.cpp</locationURI>
|
||||
</link>
|
||||
<link>
|
||||
<name>TranslationModel/ProbingPT/StoreVocab.h</name>
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/StoreVocab.h</locationURI>
|
||||
</link>
|
||||
<link>
|
||||
<name>TranslationModel/ProbingPT/hash.cpp</name>
|
||||
<type>1</type>
|
||||
@ -3350,16 +3370,6 @@
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/hash.hh</locationURI>
|
||||
</link>
|
||||
<link>
|
||||
<name>TranslationModel/ProbingPT/huffmanish.cpp</name>
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/huffmanish.cpp</locationURI>
|
||||
</link>
|
||||
<link>
|
||||
<name>TranslationModel/ProbingPT/huffmanish.hh</name>
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/huffmanish.hh</locationURI>
|
||||
</link>
|
||||
<link>
|
||||
<name>TranslationModel/ProbingPT/line_splitter.cpp</name>
|
||||
<type>1</type>
|
||||
@ -3381,14 +3391,14 @@
|
||||
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/probing_hash_utils.hh</locationURI>
|
||||
</link>
|
||||
<link>
|
||||
<name>TranslationModel/ProbingPT/quering.cpp</name>
|
||||
<name>TranslationModel/ProbingPT/querying.cpp</name>
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/quering.cpp</locationURI>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/querying.cpp</locationURI>
|
||||
</link>
|
||||
<link>
|
||||
<name>TranslationModel/ProbingPT/quering.hh</name>
|
||||
<name>TranslationModel/ProbingPT/querying.hh</name>
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/quering.hh</locationURI>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/querying.hh</locationURI>
|
||||
</link>
|
||||
<link>
|
||||
<name>TranslationModel/ProbingPT/storing.cpp</name>
|
||||
@ -3664,7 +3674,7 @@
|
||||
<name>TranslationModel/UG/sapt_pscore_coherence.h</name>
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/sapt_pscore_coherence.h</locationURI>
|
||||
</link>
|
||||
</link>
|
||||
<link>
|
||||
<name>TranslationModel/UG/sapt_pscore_lex1.h</name>
|
||||
<type>1</type>
|
||||
@ -3709,7 +3719,7 @@
|
||||
<name>TranslationModel/UG/sapt_pscore_wordcount.h</name>
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/sapt_pscore_wordcount.h</locationURI>
|
||||
</link>
|
||||
</link>
|
||||
<link>
|
||||
<name>TranslationModel/UG/sim-pe.cc</name>
|
||||
<type>1</type>
|
||||
|
@ -1,29 +1,113 @@
|
||||
#include <string>
|
||||
#include <boost/program_options.hpp>
|
||||
#include "util/usage.hh"
|
||||
#include "moses/TranslationModel/ProbingPT/storing.hh"
|
||||
#include "moses/InputFileStream.h"
|
||||
#include "moses/OutputFileStream.h"
|
||||
#include "moses/Util.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
std::string ReformatSCFGFile(const std::string &path);
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
string inPath, outPath;
|
||||
int num_scores = 4;
|
||||
int num_lex_scores = 0;
|
||||
bool log_prob = false;
|
||||
bool scfg = false;
|
||||
int max_cache_size = 50000;
|
||||
|
||||
const char * is_reordering = "false";
|
||||
namespace po = boost::program_options;
|
||||
po::options_description desc("Options");
|
||||
desc.add_options()
|
||||
("help", "Print help messages")
|
||||
("input-pt", po::value<string>()->required(), "Text pt")
|
||||
("output-dir", po::value<string>()->required(), "Directory when binary files will be written")
|
||||
("num-scores", po::value<int>()->default_value(num_scores), "Number of pt scores")
|
||||
("num-lex-scores", po::value<int>()->default_value(num_lex_scores), "Number of lexicalized reordering scores")
|
||||
("log-prob", "log (and floor) probabilities before storing")
|
||||
("max-cache-size", po::value<int>()->default_value(max_cache_size), "Maximum number of high-count source lines to write to cache file. 0=no cache, negative=no limit")
|
||||
("scfg", "Rules are SCFG in Moses format (ie. with non-terms and LHS")
|
||||
|
||||
if (!(argc == 5 || argc == 4)) {
|
||||
// Tell the user how to run the program
|
||||
std::cerr << "Provided " << argc << " arguments, needed 4 or 5." << std::endl;
|
||||
std::cerr << "Usage: " << argv[0] << " path_to_phrasetable output_dir num_scores is_reordering" << std::endl;
|
||||
std::cerr << "is_reordering should be either true or false, but it is currently a stub feature." << std::endl;
|
||||
//std::cerr << "Usage: " << argv[0] << " path_to_phrasetable number_of_uniq_lines output_bin_file output_hash_table output_vocab_id" << std::endl;
|
||||
return 1;
|
||||
;
|
||||
|
||||
po::variables_map vm;
|
||||
try {
|
||||
po::store(po::parse_command_line(argc, argv, desc),
|
||||
vm); // can throw
|
||||
|
||||
/** --help option
|
||||
*/
|
||||
if ( vm.count("help")) {
|
||||
std::cout << desc << std::endl;
|
||||
return EXIT_SUCCESS;
|
||||
}
|
||||
|
||||
po::notify(vm); // throws on error, so do after help in case
|
||||
// there are any problems
|
||||
} catch(po::error& e) {
|
||||
std::cerr << "ERROR: " << e.what() << std::endl << std::endl;
|
||||
std::cerr << desc << std::endl;
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
|
||||
if (argc == 5) {
|
||||
is_reordering = argv[4];
|
||||
if (vm.count("input-pt")) inPath = vm["input-pt"].as<string>();
|
||||
if (vm.count("output-dir")) outPath = vm["output-dir"].as<string>();
|
||||
if (vm.count("num-scores")) num_scores = vm["num-scores"].as<int>();
|
||||
if (vm.count("num-lex-scores")) num_lex_scores = vm["num-lex-scores"].as<int>();
|
||||
if (vm.count("max-cache-size")) max_cache_size = vm["max-cache-size"].as<int>();
|
||||
if (vm.count("log-prob")) log_prob = true;
|
||||
if (vm.count("scfg")) scfg = true;
|
||||
|
||||
|
||||
if (scfg) {
|
||||
inPath = ReformatSCFGFile(inPath);
|
||||
}
|
||||
|
||||
createProbingPT(argv[1], argv[2], argv[3], is_reordering);
|
||||
Moses::createProbingPT(inPath, outPath, num_scores, num_lex_scores, log_prob, max_cache_size, scfg);
|
||||
|
||||
util::PrintUsage(std::cout);
|
||||
//util::PrintUsage(std::cout);
|
||||
return 0;
|
||||
}
|
||||
|
||||
std::string ReformatSCFGFile(const std::string &path)
|
||||
{
|
||||
Moses::InputFileStream inFile(path);
|
||||
string reformattedPath = path + ".reformat.gz";
|
||||
Moses::OutputFileStream outFile(reformattedPath);
|
||||
|
||||
string line;
|
||||
while (getline(inFile, line)) {
|
||||
vector<string> toks = Moses::TokenizeMultiCharSeparator(line, "|||");
|
||||
assert(toks.size() >= 3);
|
||||
|
||||
// source
|
||||
vector<string> sourceToks = Moses::Tokenize(toks[0], " ");
|
||||
for (size_t i = 0; i < sourceToks.size() - 1; ++i) {
|
||||
outFile << sourceToks[i] << " ";
|
||||
}
|
||||
|
||||
// other columns
|
||||
for (size_t i = 1; i < toks.size(); ++i) {
|
||||
outFile << "|||" << toks[i];
|
||||
}
|
||||
outFile << endl;
|
||||
}
|
||||
|
||||
inFile.Close();
|
||||
outFile.Close();
|
||||
|
||||
string sortedPath = path + ".reformat.sorted.gz";
|
||||
string tmpPath = path + ".tmp ";
|
||||
string cmd = "mkdir " + tmpPath
|
||||
+ " && gzip -dc " + reformattedPath + " | LC_ALL=C sort -T " + tmpPath + " | gzip -c > " + sortedPath;
|
||||
system(cmd.c_str());
|
||||
|
||||
cmd = "rm -rf " + tmpPath + " " + reformattedPath;
|
||||
system(cmd.c_str());
|
||||
|
||||
return sortedPath;
|
||||
}
|
||||
|
||||
|
@ -31,9 +31,9 @@ else {
|
||||
}
|
||||
|
||||
exe CreateProbingPT : CreateProbingPT.cpp ..//boost_filesystem ../moses//moses ;
|
||||
exe QueryProbingPT : QueryProbingPT.cpp ..//boost_filesystem ../moses//moses ;
|
||||
#exe QueryProbingPT : QueryProbingPT.cpp ..//boost_filesystem ../moses//moses ;
|
||||
|
||||
alias programsProbing : CreateProbingPT QueryProbingPT ;
|
||||
alias programsProbing : CreateProbingPT ; #QueryProbingPT
|
||||
|
||||
exe merge-sorted :
|
||||
merge-sorted.cc
|
||||
|
@ -34,7 +34,7 @@ int main(int argc, char* argv[])
|
||||
return 1;
|
||||
}
|
||||
|
||||
QueryEngine queries(argv[1]);
|
||||
Moses::QueryEngine queries(argv[1]);
|
||||
|
||||
//Interactive search
|
||||
std::cout << "Please enter a string to be searched, or exit to exit." << std::endl;
|
||||
|
@ -247,6 +247,15 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
void PlusEquals(const FeatureFunction* sp, float scores[])
|
||||
{
|
||||
size_t numScores = sp->GetNumScoreComponents();
|
||||
size_t offset = sp->GetIndex();
|
||||
for (size_t i = 0; i < numScores; ++i) {
|
||||
m_scores[i + offset] += scores[i];
|
||||
}
|
||||
}
|
||||
|
||||
//! Special version PlusEquals(ScoreProducer, vector<float>)
|
||||
//! to add the score from a single ScoreProducer that produces
|
||||
//! a single value
|
||||
|
@ -3,8 +3,9 @@
|
||||
#include "moses/StaticData.h"
|
||||
#include "moses/FactorCollection.h"
|
||||
#include "moses/TargetPhraseCollection.h"
|
||||
#include "moses/InputFileStream.h"
|
||||
#include "moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerSkeleton.h"
|
||||
#include "quering.hh"
|
||||
#include "querying.hh"
|
||||
|
||||
using namespace std;
|
||||
|
||||
@ -34,44 +35,94 @@ void ProbingPT::Load(AllOptions::ptr const& opts)
|
||||
|
||||
m_unkId = 456456546456;
|
||||
|
||||
FactorCollection &vocab = FactorCollection::Instance();
|
||||
|
||||
// source vocab
|
||||
const std::map<uint64_t, std::string> &sourceVocab = m_engine->getSourceVocab();
|
||||
const std::map<uint64_t, std::string> &sourceVocab =
|
||||
m_engine->getSourceVocab();
|
||||
std::map<uint64_t, std::string>::const_iterator iterSource;
|
||||
for (iterSource = sourceVocab.begin(); iterSource != sourceVocab.end(); ++iterSource) {
|
||||
const string &wordStr = iterSource->second;
|
||||
const Factor *factor = FactorCollection::Instance().AddFactor(wordStr);
|
||||
for (iterSource = sourceVocab.begin(); iterSource != sourceVocab.end();
|
||||
++iterSource) {
|
||||
string wordStr = iterSource->second;
|
||||
//cerr << "wordStr=" << wordStr << endl;
|
||||
|
||||
const Factor *factor = vocab.AddFactor(wordStr);
|
||||
|
||||
uint64_t probingId = iterSource->first;
|
||||
size_t factorId = factor->GetId();
|
||||
|
||||
SourceVocabMap::value_type entry(factor, probingId);
|
||||
m_sourceVocabMap.insert(entry);
|
||||
|
||||
if (factorId >= m_sourceVocab.size()) {
|
||||
m_sourceVocab.resize(factorId + 1, m_unkId);
|
||||
}
|
||||
m_sourceVocab[factorId] = probingId;
|
||||
}
|
||||
|
||||
// target vocab
|
||||
const std::map<unsigned int, std::string> &probingVocab = m_engine->getVocab();
|
||||
std::map<unsigned int, std::string>::const_iterator iter;
|
||||
for (iter = probingVocab.begin(); iter != probingVocab.end(); ++iter) {
|
||||
const string &wordStr = iter->second;
|
||||
const Factor *factor = FactorCollection::Instance().AddFactor(wordStr);
|
||||
InputFileStream targetVocabStrme(m_filePath + "/TargetVocab.dat");
|
||||
string line;
|
||||
while (getline(targetVocabStrme, line)) {
|
||||
vector<string> toks = Tokenize(line, "\t");
|
||||
UTIL_THROW_IF2(toks.size() != 2, string("Incorrect format:") + line + "\n");
|
||||
|
||||
unsigned int probingId = iter->first;
|
||||
//cerr << "wordStr=" << toks[0] << endl;
|
||||
|
||||
TargetVocabMap::value_type entry(factor, probingId);
|
||||
m_vocabMap.insert(entry);
|
||||
const Factor *factor = vocab.AddFactor(toks[0]);
|
||||
uint32_t probingId = Scan<uint32_t>(toks[1]);
|
||||
|
||||
if (probingId >= m_targetVocab.size()) {
|
||||
m_targetVocab.resize(probingId + 1);
|
||||
}
|
||||
|
||||
m_targetVocab[probingId] = factor;
|
||||
}
|
||||
|
||||
// alignments
|
||||
CreateAlignmentMap(m_filePath + "/Alignments.dat");
|
||||
|
||||
// memory mapped file to tps
|
||||
string filePath = m_filePath + "/TargetColl.dat";
|
||||
file.open(filePath.c_str());
|
||||
if (!file.is_open()) {
|
||||
throw "Couldn't open file ";
|
||||
}
|
||||
|
||||
data = file.data();
|
||||
//size_t size = file.size();
|
||||
|
||||
// cache
|
||||
//CreateCache(system);
|
||||
|
||||
}
|
||||
|
||||
void ProbingPT::CreateAlignmentMap(const std::string path)
|
||||
{
|
||||
const std::vector< std::vector<unsigned char> > &probingAlignColl = m_engine->getAlignments();
|
||||
m_aligns.resize(probingAlignColl.size(), NULL);
|
||||
|
||||
for (size_t i = 0; i < probingAlignColl.size(); ++i) {
|
||||
AlignmentInfo::CollType aligns;
|
||||
|
||||
const std::vector<unsigned char> &probingAligns = probingAlignColl[i];
|
||||
for (size_t j = 0; j < probingAligns.size(); j += 2) {
|
||||
size_t startPos = probingAligns[j];
|
||||
size_t endPos = probingAligns[j+1];
|
||||
//cerr << "startPos=" << startPos << " " << endPos << endl;
|
||||
aligns.insert(std::pair<size_t,size_t>(startPos, endPos));
|
||||
}
|
||||
|
||||
const AlignmentInfo *align = AlignmentInfoCollection::Instance().Add(aligns);
|
||||
m_aligns[i] = align;
|
||||
//cerr << "align=" << align->Debug(system) << endl;
|
||||
}
|
||||
}
|
||||
|
||||
void ProbingPT::InitializeForInput(ttasksptr const& ttask)
|
||||
{
|
||||
ReduceCache();
|
||||
|
||||
}
|
||||
|
||||
void ProbingPT::GetTargetPhraseCollectionBatch(const InputPathList &inputPathQueue) const
|
||||
{
|
||||
CacheColl &cache = GetCache();
|
||||
|
||||
InputPathList::const_iterator iter;
|
||||
for (iter = inputPathQueue.begin(); iter != inputPathQueue.end(); ++iter) {
|
||||
InputPath &inputPath = **iter;
|
||||
@ -82,132 +133,205 @@ void ProbingPT::GetTargetPhraseCollectionBatch(const InputPathList &inputPathQue
|
||||
}
|
||||
|
||||
TargetPhraseCollection::shared_ptr tpColl = CreateTargetPhrase(sourcePhrase);
|
||||
|
||||
// add target phrase to phrase-table cache
|
||||
size_t hash = hash_value(sourcePhrase);
|
||||
std::pair<TargetPhraseCollection::shared_ptr , clock_t> value(tpColl, clock());
|
||||
cache[hash] = value;
|
||||
|
||||
inputPath.SetTargetPhrases(*this, tpColl, NULL);
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<uint64_t> ProbingPT::ConvertToProbingSourcePhrase(const Phrase &sourcePhrase, bool &ok) const
|
||||
{
|
||||
size_t size = sourcePhrase.GetSize();
|
||||
std::vector<uint64_t> ret(size);
|
||||
for (size_t i = 0; i < size; ++i) {
|
||||
const Factor *factor = sourcePhrase.GetFactor(i, m_input[0]);
|
||||
uint64_t probingId = GetSourceProbingId(factor);
|
||||
if (probingId == m_unkId) {
|
||||
ok = false;
|
||||
return ret;
|
||||
} else {
|
||||
ret[i] = probingId;
|
||||
}
|
||||
}
|
||||
|
||||
ok = true;
|
||||
return ret;
|
||||
}
|
||||
|
||||
TargetPhraseCollection::shared_ptr ProbingPT::CreateTargetPhrase(const Phrase &sourcePhrase) const
|
||||
{
|
||||
// create a target phrase from the 1st word of the source, prefix with 'ProbingPT:'
|
||||
assert(sourcePhrase.GetSize());
|
||||
|
||||
TargetPhraseCollection::shared_ptr tpColl;
|
||||
bool ok;
|
||||
vector<uint64_t> probingSource = ConvertToProbingSourcePhrase(sourcePhrase, ok);
|
||||
if (!ok) {
|
||||
// source phrase contains a word unknown in the pt.
|
||||
// We know immediately there's no translation for it
|
||||
return tpColl;
|
||||
std::pair<bool, uint64_t> keyStruct = GetKey(sourcePhrase);
|
||||
if (!keyStruct.first) {
|
||||
return TargetPhraseCollection::shared_ptr();
|
||||
}
|
||||
|
||||
std::pair<bool, std::vector<target_text> > query_result;
|
||||
|
||||
//Actual lookup
|
||||
query_result = m_engine->query(probingSource);
|
||||
|
||||
if (query_result.first) {
|
||||
//m_engine->printTargetInfo(query_result.second);
|
||||
tpColl.reset(new TargetPhraseCollection());
|
||||
|
||||
const std::vector<target_text> &probingTargetPhrases = query_result.second;
|
||||
for (size_t i = 0; i < probingTargetPhrases.size(); ++i) {
|
||||
const target_text &probingTargetPhrase = probingTargetPhrases[i];
|
||||
TargetPhrase *tp = CreateTargetPhrase(sourcePhrase, probingTargetPhrase);
|
||||
|
||||
tpColl->Add(tp);
|
||||
}
|
||||
|
||||
tpColl->Prune(true, m_tableLimit);
|
||||
// check in cache
|
||||
CachePb::const_iterator iter = m_cachePb.find(keyStruct.second);
|
||||
if (iter != m_cachePb.end()) {
|
||||
//cerr << "FOUND IN CACHE " << keyStruct.second << " " << sourcePhrase.Debug(mgr.system) << endl;
|
||||
TargetPhraseCollection *tps = iter->second;
|
||||
return TargetPhraseCollection::shared_ptr(tps);
|
||||
}
|
||||
|
||||
return tpColl;
|
||||
// query pt
|
||||
TargetPhraseCollection *tps = CreateTargetPhrases(sourcePhrase,
|
||||
keyStruct.second);
|
||||
return TargetPhraseCollection::shared_ptr(tps);
|
||||
}
|
||||
|
||||
TargetPhrase *ProbingPT::CreateTargetPhrase(const Phrase &sourcePhrase, const target_text &probingTargetPhrase) const
|
||||
std::pair<bool, uint64_t> ProbingPT::GetKey(const Phrase &sourcePhrase) const
|
||||
{
|
||||
const std::vector<unsigned int> &probingPhrase = probingTargetPhrase.target_phrase;
|
||||
size_t size = probingPhrase.size();
|
||||
std::pair<bool, uint64_t> ret;
|
||||
|
||||
// create a target phrase from the 1st word of the source, prefix with 'ProbingPT:'
|
||||
size_t sourceSize = sourcePhrase.GetSize();
|
||||
assert(sourceSize);
|
||||
|
||||
uint64_t probingSource[sourceSize];
|
||||
GetSourceProbingIds(sourcePhrase, ret.first, probingSource);
|
||||
if (!ret.first) {
|
||||
// source phrase contains a word unknown in the pt.
|
||||
// We know immediately there's no translation for it
|
||||
}
|
||||
else {
|
||||
ret.second = m_engine->getKey(probingSource, sourceSize);
|
||||
}
|
||||
|
||||
return ret;
|
||||
|
||||
}
|
||||
|
||||
void ProbingPT::GetSourceProbingIds(const Phrase &sourcePhrase,
|
||||
bool &ok, uint64_t probingSource[]) const
|
||||
{
|
||||
|
||||
size_t size = sourcePhrase.GetSize();
|
||||
for (size_t i = 0; i < size; ++i) {
|
||||
const Word &word = sourcePhrase.GetWord(i);
|
||||
uint64_t probingId = GetSourceProbingId(word);
|
||||
if (probingId == m_unkId) {
|
||||
ok = false;
|
||||
return;
|
||||
}
|
||||
else {
|
||||
probingSource[i] = probingId;
|
||||
}
|
||||
}
|
||||
|
||||
ok = true;
|
||||
}
|
||||
|
||||
uint64_t ProbingPT::GetSourceProbingId(const Word &word) const
|
||||
{
|
||||
uint64_t ret = 0;
|
||||
|
||||
for (size_t i = 0; i < m_input.size(); ++i) {
|
||||
FactorType factorType = m_input[i];
|
||||
const Factor *factor = word[factorType];
|
||||
|
||||
size_t factorId = factor->GetId();
|
||||
if (factorId >= m_sourceVocab.size()) {
|
||||
return m_unkId;
|
||||
}
|
||||
ret += m_sourceVocab[factorId];
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
TargetPhraseCollection *ProbingPT::CreateTargetPhrases(
|
||||
const Phrase &sourcePhrase, uint64_t key) const
|
||||
{
|
||||
TargetPhraseCollection *tps = NULL;
|
||||
|
||||
//Actual lookup
|
||||
std::pair<bool, uint64_t> query_result; // 1st=found, 2nd=target file offset
|
||||
query_result = m_engine->query(key);
|
||||
//cerr << "key2=" << query_result.second << endl;
|
||||
|
||||
if (query_result.first) {
|
||||
const char *offset = data + query_result.second;
|
||||
uint64_t *numTP = (uint64_t*) offset;
|
||||
|
||||
tps = new TargetPhraseCollection();
|
||||
|
||||
offset += sizeof(uint64_t);
|
||||
for (size_t i = 0; i < *numTP; ++i) {
|
||||
TargetPhrase *tp = CreateTargetPhrase(offset);
|
||||
assert(tp);
|
||||
tp->EvaluateInIsolation(sourcePhrase, GetFeaturesToApply());
|
||||
|
||||
tps->Add(tp);
|
||||
|
||||
}
|
||||
|
||||
tps->Prune(true, m_tableLimit);
|
||||
//cerr << *tps << endl;
|
||||
}
|
||||
|
||||
return tps;
|
||||
|
||||
}
|
||||
|
||||
TargetPhrase *ProbingPT::CreateTargetPhrase(
|
||||
const char *&offset) const
|
||||
{
|
||||
TargetPhraseInfo *tpInfo = (TargetPhraseInfo*) offset;
|
||||
size_t numRealWords = tpInfo->numWords / m_output.size();
|
||||
|
||||
TargetPhrase *tp = new TargetPhrase(this);
|
||||
|
||||
offset += sizeof(TargetPhraseInfo);
|
||||
|
||||
// scores
|
||||
float *scores = (float*) offset;
|
||||
|
||||
size_t totalNumScores = m_engine->num_scores + m_engine->num_lex_scores;
|
||||
|
||||
if (m_engine->logProb) {
|
||||
// set pt score for rule
|
||||
tp->GetScoreBreakdown().PlusEquals(this, scores);
|
||||
|
||||
// save scores for other FF, eg. lex RO. Just give the offset
|
||||
/*
|
||||
if (m_engine->num_lex_scores) {
|
||||
tp->scoreProperties = scores + m_engine->num_scores;
|
||||
}
|
||||
*/
|
||||
}
|
||||
else {
|
||||
// log score 1st
|
||||
float logScores[totalNumScores];
|
||||
for (size_t i = 0; i < totalNumScores; ++i) {
|
||||
logScores[i] = FloorScore(TransformScore(scores[i]));
|
||||
}
|
||||
|
||||
// set pt score for rule
|
||||
tp->GetScoreBreakdown().PlusEquals(this, logScores);
|
||||
|
||||
// save scores for other FF, eg. lex RO.
|
||||
/*
|
||||
tp->scoreProperties = pool.Allocate<SCORE>(m_engine->num_lex_scores);
|
||||
for (size_t i = 0; i < m_engine->num_lex_scores; ++i) {
|
||||
tp->scoreProperties[i] = logScores[i + m_engine->num_scores];
|
||||
}
|
||||
*/
|
||||
}
|
||||
|
||||
offset += sizeof(float) * totalNumScores;
|
||||
|
||||
// words
|
||||
for (size_t i = 0; i < size; ++i) {
|
||||
uint64_t probingId = probingPhrase[i];
|
||||
const Factor *factor = GetTargetFactor(probingId);
|
||||
assert(factor);
|
||||
for (size_t targetPos = 0; targetPos < numRealWords; ++targetPos) {
|
||||
Word &word = tp->AddWord();
|
||||
for (size_t i = 0; i < m_output.size(); ++i) {
|
||||
FactorType factorType = m_output[i];
|
||||
|
||||
Word &word = tp->AddWord();
|
||||
word.SetFactor(m_output[0], factor);
|
||||
uint32_t *probingId = (uint32_t*) offset;
|
||||
|
||||
const Factor *factor = GetTargetFactor(*probingId);
|
||||
assert(factor);
|
||||
|
||||
word[factorType] = factor;
|
||||
|
||||
offset += sizeof(uint32_t);
|
||||
}
|
||||
}
|
||||
|
||||
// score for this phrase table
|
||||
vector<float> scores = probingTargetPhrase.prob;
|
||||
std::transform(scores.begin(), scores.end(), scores.begin(),TransformScore);
|
||||
tp->GetScoreBreakdown().PlusEquals(this, scores);
|
||||
// align
|
||||
uint32_t alignTerm = tpInfo->alignTerm;
|
||||
//cerr << "alignTerm=" << alignTerm << endl;
|
||||
UTIL_THROW_IF2(alignTerm >= m_aligns.size(), "Unknown alignInd");
|
||||
tp->SetAlignTerm(m_aligns[alignTerm]);
|
||||
|
||||
// alignment
|
||||
/*
|
||||
const std::vector<unsigned char> &alignments = probingTargetPhrase.word_all1;
|
||||
// properties TODO
|
||||
|
||||
AlignmentInfo &aligns = tp->GetAlignTerm();
|
||||
for (size_t i = 0; i < alignS.size(); i += 2 ) {
|
||||
aligns.Add((size_t) alignments[i], (size_t) alignments[i+1]);
|
||||
}
|
||||
*/
|
||||
|
||||
// score of all other ff when this rule is being loaded
|
||||
tp->EvaluateInIsolation(sourcePhrase, GetFeaturesToApply());
|
||||
return tp;
|
||||
}
|
||||
|
||||
const Factor *ProbingPT::GetTargetFactor(uint64_t probingId) const
|
||||
{
|
||||
TargetVocabMap::right_map::const_iterator iter;
|
||||
iter = m_vocabMap.right.find(probingId);
|
||||
if (iter != m_vocabMap.right.end()) {
|
||||
return iter->second;
|
||||
} else {
|
||||
// not in mapping. Must be UNK
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
//////////////////////////////////////////////////////////////////
|
||||
|
||||
uint64_t ProbingPT::GetSourceProbingId(const Factor *factor) const
|
||||
{
|
||||
SourceVocabMap::left_map::const_iterator iter;
|
||||
iter = m_sourceVocabMap.left.find(factor);
|
||||
if (iter != m_sourceVocabMap.left.end()) {
|
||||
return iter->second;
|
||||
} else {
|
||||
// not in mapping. Must be UNK
|
||||
return m_unkId;
|
||||
}
|
||||
}
|
||||
|
||||
ChartRuleLookupManager *ProbingPT::CreateRuleLookupManager(
|
||||
const ChartParser &,
|
||||
|
@ -1,17 +1,18 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <boost/iostreams/device/mapped_file.hpp>
|
||||
#include <boost/bimap.hpp>
|
||||
#include <boost/unordered_map.hpp>
|
||||
#include "../PhraseDictionary.h"
|
||||
|
||||
class QueryEngine;
|
||||
class target_text;
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
class ChartParser;
|
||||
class ChartCellCollectionBase;
|
||||
class ChartRuleLookupManager;
|
||||
class QueryEngine;
|
||||
class target_text;
|
||||
|
||||
class ProbingPT : public PhraseDictionary
|
||||
{
|
||||
@ -39,21 +40,42 @@ public:
|
||||
|
||||
protected:
|
||||
QueryEngine *m_engine;
|
||||
uint64_t m_unkId;
|
||||
|
||||
typedef boost::bimap<const Factor *, uint64_t> SourceVocabMap;
|
||||
mutable SourceVocabMap m_sourceVocabMap;
|
||||
std::vector<uint64_t> m_sourceVocab; // factor id -> pt id
|
||||
std::vector<const Factor*> m_targetVocab; // pt id -> factor*
|
||||
std::vector<const AlignmentInfo*> m_aligns;
|
||||
|
||||
typedef boost::bimap<const Factor *, unsigned int> TargetVocabMap;
|
||||
mutable TargetVocabMap m_vocabMap;
|
||||
boost::iostreams::mapped_file_source file;
|
||||
const char *data;
|
||||
|
||||
// caching
|
||||
typedef boost::unordered_map<uint64_t, TargetPhraseCollection*> CachePb;
|
||||
CachePb m_cachePb;
|
||||
|
||||
void CreateAlignmentMap(const std::string path);
|
||||
|
||||
TargetPhraseCollection::shared_ptr CreateTargetPhrase(const Phrase &sourcePhrase) const;
|
||||
TargetPhrase *CreateTargetPhrase(const Phrase &sourcePhrase, const target_text &probingTargetPhrase) const;
|
||||
const Factor *GetTargetFactor(uint64_t probingId) const;
|
||||
|
||||
std::pair<bool, uint64_t> GetKey(const Phrase &sourcePhrase) const;
|
||||
void GetSourceProbingIds(const Phrase &sourcePhrase, bool &ok,
|
||||
uint64_t probingSource[]) const;
|
||||
uint64_t GetSourceProbingId(const Word &word) const;
|
||||
uint64_t GetSourceProbingId(const Factor *factor) const;
|
||||
|
||||
std::vector<uint64_t> ConvertToProbingSourcePhrase(const Phrase &sourcePhrase, bool &ok) const;
|
||||
TargetPhraseCollection *CreateTargetPhrases(
|
||||
const Phrase &sourcePhrase, uint64_t key) const;
|
||||
TargetPhrase *CreateTargetPhrase(
|
||||
const char *&offset) const;
|
||||
|
||||
inline const Factor *GetTargetFactor(uint32_t probingId) const
|
||||
{
|
||||
if (probingId >= m_targetVocab.size()) {
|
||||
return NULL;
|
||||
}
|
||||
return m_targetVocab[probingId];
|
||||
}
|
||||
|
||||
uint64_t m_unkId;
|
||||
};
|
||||
|
||||
} // namespace Moses
|
||||
|
266
moses/TranslationModel/ProbingPT/StoreTarget.cpp
Normal file
266
moses/TranslationModel/ProbingPT/StoreTarget.cpp
Normal file
@ -0,0 +1,266 @@
|
||||
/*
|
||||
* StoreTarget.cpp
|
||||
*
|
||||
* Created on: 19 Jan 2016
|
||||
* Author: hieu
|
||||
*/
|
||||
#include <boost/foreach.hpp>
|
||||
#include "StoreTarget.h"
|
||||
#include "line_splitter.hh"
|
||||
#include "probing_hash_utils.hh"
|
||||
#include "moses/OutputFileStream.h"
|
||||
#include "moses/Util.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
|
||||
StoreTarget::StoreTarget(const std::string &basepath)
|
||||
:m_basePath(basepath)
|
||||
,m_vocab(basepath + "/TargetVocab.dat")
|
||||
{
|
||||
std::string path = basepath + "/TargetColl.dat";
|
||||
m_fileTargetColl.open(path.c_str(),
|
||||
std::ios::out | std::ios::binary | std::ios::ate | std::ios::trunc);
|
||||
if (!m_fileTargetColl.is_open()) {
|
||||
throw "can't create file ";
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
StoreTarget::~StoreTarget()
|
||||
{
|
||||
assert(m_coll.empty());
|
||||
m_fileTargetColl.close();
|
||||
|
||||
// vocab
|
||||
m_vocab.Save();
|
||||
}
|
||||
|
||||
uint64_t StoreTarget::Save()
|
||||
{
|
||||
uint64_t ret = m_fileTargetColl.tellp();
|
||||
|
||||
// save to disk
|
||||
uint64_t numTP = m_coll.size();
|
||||
m_fileTargetColl.write((char*) &numTP, sizeof(uint64_t));
|
||||
|
||||
for (size_t i = 0; i < m_coll.size(); ++i) {
|
||||
Save(*m_coll[i]);
|
||||
}
|
||||
|
||||
// clear coll
|
||||
RemoveAllInColl(m_coll);
|
||||
m_coll.clear();
|
||||
|
||||
// starting position of coll
|
||||
return ret;
|
||||
}
|
||||
|
||||
void StoreTarget::Save(const target_text &rule)
|
||||
{
|
||||
// metadata for each tp
|
||||
TargetPhraseInfo tpInfo;
|
||||
tpInfo.alignTerm = GetAlignId(rule.word_align_term);
|
||||
tpInfo.alignNonTerm = GetAlignId(rule.word_align_non_term);
|
||||
tpInfo.numWords = rule.target_phrase.size();
|
||||
tpInfo.propLength = rule.property.size();
|
||||
|
||||
//cerr << "TPInfo=" << sizeof(TPInfo);
|
||||
m_fileTargetColl.write((char*) &tpInfo, sizeof(TargetPhraseInfo));
|
||||
|
||||
// scores
|
||||
for (size_t i = 0; i < rule.prob.size(); ++i) {
|
||||
float prob = rule.prob[i];
|
||||
m_fileTargetColl.write((char*) &prob, sizeof(prob));
|
||||
}
|
||||
|
||||
// tp
|
||||
for (size_t i = 0; i < rule.target_phrase.size(); ++i) {
|
||||
uint32_t vocabId = rule.target_phrase[i];
|
||||
m_fileTargetColl.write((char*) &vocabId, sizeof(vocabId));
|
||||
}
|
||||
|
||||
// prop TODO
|
||||
|
||||
}
|
||||
|
||||
void StoreTarget::SaveAlignment()
|
||||
{
|
||||
std::string path = m_basePath + "/Alignments.dat";
|
||||
OutputFileStream file(path);
|
||||
|
||||
BOOST_FOREACH(Alignments::value_type &valPair, m_aligns) {
|
||||
file << valPair.second << "\t";
|
||||
|
||||
const std::vector<size_t> &aligns = valPair.first;
|
||||
BOOST_FOREACH(size_t align, aligns) {
|
||||
file << align << " ";
|
||||
}
|
||||
file << endl;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
void StoreTarget::Append(const line_text &line, bool log_prob, bool scfg)
|
||||
{
|
||||
target_text *rule = new target_text;
|
||||
//cerr << "line.target_phrase=" << line.target_phrase << endl;
|
||||
|
||||
// target_phrase
|
||||
vector<bool> nonTerms;
|
||||
util::TokenIter<util::SingleCharacter> it;
|
||||
it = util::TokenIter<util::SingleCharacter>(line.target_phrase,
|
||||
util::SingleCharacter(' '));
|
||||
while (it) {
|
||||
StringPiece word = *it;
|
||||
//cerr << "word=" << word << endl;
|
||||
|
||||
bool nonTerm = false;
|
||||
if (scfg) {
|
||||
// not really sure how to handle factored SCFG and NT
|
||||
if (scfg && word[0] == '[' && word[word.size() - 1] == ']') {
|
||||
//cerr << "NON-TERM=" << tok << " " << nonTerms.size() << endl;
|
||||
nonTerm = true;
|
||||
}
|
||||
nonTerms.push_back(nonTerm);
|
||||
}
|
||||
|
||||
util::TokenIter<util::SingleCharacter> itFactor;
|
||||
itFactor = util::TokenIter<util::SingleCharacter>(word,
|
||||
util::SingleCharacter('|'));
|
||||
while (itFactor) {
|
||||
StringPiece factor = *itFactor;
|
||||
|
||||
string factorStr = factor.as_string();
|
||||
uint32_t vocabId = m_vocab.GetVocabId(factorStr);
|
||||
|
||||
rule->target_phrase.push_back(vocabId);
|
||||
|
||||
itFactor++;
|
||||
}
|
||||
|
||||
it++;
|
||||
}
|
||||
|
||||
// probs
|
||||
it = util::TokenIter<util::SingleCharacter>(line.prob,
|
||||
util::SingleCharacter(' '));
|
||||
while (it) {
|
||||
string tok = it->as_string();
|
||||
float prob = Scan<float>(tok);
|
||||
|
||||
if (log_prob) {
|
||||
prob = FloorScore(log(prob));
|
||||
if (prob == 0.0f) prob = 0.0000000001;
|
||||
}
|
||||
|
||||
rule->prob.push_back(prob);
|
||||
it++;
|
||||
}
|
||||
|
||||
/*
|
||||
cerr << "nonTerms=";
|
||||
for (size_t i = 0; i < nonTerms.size(); ++i) {
|
||||
cerr << nonTerms[i] << " ";
|
||||
}
|
||||
cerr << endl;
|
||||
*/
|
||||
|
||||
// alignment
|
||||
it = util::TokenIter<util::SingleCharacter>(line.word_align,
|
||||
util::SingleCharacter(' '));
|
||||
while (it) {
|
||||
string tokPair = Trim(it->as_string());
|
||||
if (tokPair.empty()) {
|
||||
break;
|
||||
}
|
||||
|
||||
vector<size_t> alignPair = Tokenize<size_t>(tokPair, "-");
|
||||
assert(alignPair.size() == 2);
|
||||
|
||||
bool nonTerm = false;
|
||||
size_t sourcePos = alignPair[0];
|
||||
size_t targetPos = alignPair[1];
|
||||
if (scfg) {
|
||||
nonTerm = nonTerms[targetPos];
|
||||
}
|
||||
|
||||
//cerr << targetPos << "=" << nonTerm << endl;
|
||||
|
||||
if (nonTerm) {
|
||||
rule->word_align_non_term.push_back(sourcePos);
|
||||
rule->word_align_non_term.push_back(targetPos);
|
||||
//cerr << (int) rule->word_all1.back() << " ";
|
||||
}
|
||||
else {
|
||||
rule->word_align_term.push_back(sourcePos);
|
||||
rule->word_align_term.push_back(targetPos);
|
||||
}
|
||||
|
||||
it++;
|
||||
}
|
||||
|
||||
// extra scores
|
||||
string prop = line.property.as_string();
|
||||
AppendLexRO(prop, rule->prob, log_prob);
|
||||
|
||||
//cerr << "line.property=" << line.property << endl;
|
||||
//cerr << "prop=" << prop << endl;
|
||||
|
||||
// properties
|
||||
/*
|
||||
for (size_t i = 0; i < prop.size(); ++i) {
|
||||
rule->property.push_back(prop[i]);
|
||||
}
|
||||
*/
|
||||
m_coll.push_back(rule);
|
||||
}
|
||||
|
||||
uint32_t StoreTarget::GetAlignId(const std::vector<size_t> &align)
|
||||
{
|
||||
boost::unordered_map<std::vector<size_t>, uint32_t>::iterator iter =
|
||||
m_aligns.find(align);
|
||||
if (iter == m_aligns.end()) {
|
||||
uint32_t ind = m_aligns.size();
|
||||
m_aligns[align] = ind;
|
||||
return ind;
|
||||
}
|
||||
else {
|
||||
return iter->second;
|
||||
}
|
||||
}
|
||||
|
||||
void StoreTarget::AppendLexRO(std::string &prop, std::vector<float> &retvector,
|
||||
bool log_prob) const
|
||||
{
|
||||
size_t startPos = prop.find("{{LexRO ");
|
||||
|
||||
if (startPos != string::npos) {
|
||||
size_t endPos = prop.find("}}", startPos + 8);
|
||||
string lexProb = prop.substr(startPos + 8, endPos - startPos - 8);
|
||||
//cerr << "lexProb=" << lexProb << endl;
|
||||
|
||||
// append lex probs to pt probs
|
||||
vector<float> scores = Tokenize<float>(lexProb);
|
||||
|
||||
if (log_prob) {
|
||||
for (size_t i = 0; i < scores.size(); ++i) {
|
||||
scores[i] = FloorScore(log(scores[i]));
|
||||
if (scores[i] == 0.0f) scores[i] = 0.0000000001;
|
||||
}
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < scores.size(); ++i) {
|
||||
retvector.push_back(scores[i]);
|
||||
}
|
||||
|
||||
// exclude LexRO property from property column
|
||||
prop = prop.substr(0, startPos)
|
||||
+ prop.substr(endPos + 2, prop.size() - endPos - 2);
|
||||
//cerr << "line.property_to_be_binarized=" << line.property_to_be_binarized << "AAAA" << endl;
|
||||
}
|
||||
}
|
||||
|
||||
} /* namespace Moses2 */
|
51
moses/TranslationModel/ProbingPT/StoreTarget.h
Normal file
51
moses/TranslationModel/ProbingPT/StoreTarget.h
Normal file
@ -0,0 +1,51 @@
|
||||
/*
|
||||
* StoreTarget.h
|
||||
*
|
||||
* Created on: 19 Jan 2016
|
||||
* Author: hieu
|
||||
*/
|
||||
#pragma once
|
||||
#include <string>
|
||||
#include <fstream>
|
||||
#include <vector>
|
||||
#include <inttypes.h>
|
||||
#include <boost/unordered_map.hpp>
|
||||
#include <boost/unordered_set.hpp>
|
||||
#include "StoreVocab.h"
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
|
||||
class line_text;
|
||||
class target_text;
|
||||
|
||||
class StoreTarget
|
||||
{
|
||||
public:
|
||||
StoreTarget(const std::string &basepath);
|
||||
virtual ~StoreTarget();
|
||||
|
||||
uint64_t Save();
|
||||
void SaveAlignment();
|
||||
|
||||
void Append(const line_text &line, bool log_prob, bool scfg);
|
||||
protected:
|
||||
std::string m_basePath;
|
||||
std::fstream m_fileTargetColl;
|
||||
StoreVocab<uint32_t> m_vocab;
|
||||
|
||||
typedef boost::unordered_map<std::vector<size_t>, uint32_t> Alignments;
|
||||
Alignments m_aligns;
|
||||
|
||||
std::vector<target_text*> m_coll;
|
||||
|
||||
uint32_t GetAlignId(const std::vector<size_t> &align);
|
||||
void Save(const target_text &rule);
|
||||
|
||||
void AppendLexRO(std::string &prop, std::vector<float> &retvector,
|
||||
bool log_prob) const;
|
||||
|
||||
};
|
||||
|
||||
} /* namespace Moses2 */
|
||||
|
13
moses/TranslationModel/ProbingPT/StoreVocab.cpp
Normal file
13
moses/TranslationModel/ProbingPT/StoreVocab.cpp
Normal file
@ -0,0 +1,13 @@
|
||||
/*
|
||||
* StoreVocab.cpp
|
||||
*
|
||||
* Created on: 15 Jun 2016
|
||||
* Author: hieu
|
||||
*/
|
||||
#include <fstream>
|
||||
#include "StoreVocab.h"
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
|
||||
} /* namespace Moses2 */
|
64
moses/TranslationModel/ProbingPT/StoreVocab.h
Normal file
64
moses/TranslationModel/ProbingPT/StoreVocab.h
Normal file
@ -0,0 +1,64 @@
|
||||
/*
|
||||
* StoreVocab.h
|
||||
*
|
||||
* Created on: 15 Jun 2016
|
||||
* Author: hieu
|
||||
*/
|
||||
#pragma once
|
||||
#include <string>
|
||||
#include <boost/unordered_map.hpp>
|
||||
#include "moses/OutputFileStream.h"
|
||||
#include "moses/Util.h"
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
|
||||
template<typename VOCABID>
|
||||
class StoreVocab
|
||||
{
|
||||
protected:
|
||||
std::string m_path;
|
||||
|
||||
typedef boost::unordered_map<std::string, VOCABID> Coll;
|
||||
Coll m_vocab;
|
||||
|
||||
public:
|
||||
StoreVocab(const std::string &path)
|
||||
:m_path(path)
|
||||
{}
|
||||
|
||||
virtual ~StoreVocab() {}
|
||||
|
||||
VOCABID GetVocabId(const std::string &word)
|
||||
{
|
||||
typename Coll::iterator iter = m_vocab.find(word);
|
||||
if (iter == m_vocab.end()) {
|
||||
VOCABID ind = m_vocab.size() + 1;
|
||||
m_vocab[word] = ind;
|
||||
return ind;
|
||||
}
|
||||
else {
|
||||
return iter->second;
|
||||
}
|
||||
}
|
||||
|
||||
void Insert(VOCABID id, const std::string &word)
|
||||
{
|
||||
m_vocab[word] = id;
|
||||
}
|
||||
|
||||
void Save()
|
||||
{
|
||||
OutputFileStream strme(m_path);
|
||||
|
||||
typename Coll::const_iterator iter;
|
||||
for (iter = m_vocab.begin(); iter != m_vocab.end(); ++iter) {
|
||||
strme << iter->first << "\t" << iter->second << std::endl;
|
||||
}
|
||||
|
||||
strme.Close();
|
||||
}
|
||||
};
|
||||
|
||||
} /* namespace Moses2 */
|
||||
|
@ -1,5 +1,11 @@
|
||||
#include <iostream>
|
||||
#include "hash.hh"
|
||||
|
||||
using namespace std;
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
|
||||
uint64_t getHash(StringPiece text)
|
||||
{
|
||||
std::size_t len = text.size();
|
||||
@ -7,24 +13,32 @@ uint64_t getHash(StringPiece text)
|
||||
return key;
|
||||
}
|
||||
|
||||
std::vector<uint64_t> getVocabIDs(StringPiece textin)
|
||||
std::vector<uint64_t> getVocabIDs(const StringPiece &textin)
|
||||
{
|
||||
//Tokenize
|
||||
std::vector<uint64_t> output;
|
||||
|
||||
util::TokenIter<util::SingleCharacter> it(textin, util::SingleCharacter(' '));
|
||||
util::TokenIter<util::SingleCharacter> itWord(textin, util::SingleCharacter(' '));
|
||||
|
||||
while(it) {
|
||||
output.push_back(getHash(*it));
|
||||
it++;
|
||||
while (itWord) {
|
||||
StringPiece word = *itWord;
|
||||
uint64_t id = 0;
|
||||
|
||||
util::TokenIter<util::SingleCharacter> itFactor(word, util::SingleCharacter('|'));
|
||||
while (itFactor) {
|
||||
StringPiece factor = *itFactor;
|
||||
//cerr << "factor=" << factor << endl;
|
||||
|
||||
id += getHash(factor);
|
||||
itFactor++;
|
||||
}
|
||||
|
||||
output.push_back(id);
|
||||
itWord++;
|
||||
}
|
||||
|
||||
return output;
|
||||
}
|
||||
|
||||
uint64_t getVocabID(std::string candidate)
|
||||
{
|
||||
std::size_t len = candidate.length();
|
||||
uint64_t key = util::MurmurHashNative(candidate.c_str(), len);
|
||||
return key;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -6,9 +6,12 @@
|
||||
#include "util/tokenize_piece.hh"
|
||||
#include <vector>
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
|
||||
//Gets the MurmurmurHash for give string
|
||||
uint64_t getHash(StringPiece text);
|
||||
|
||||
std::vector<uint64_t> getVocabIDs(StringPiece textin);
|
||||
std::vector<uint64_t> getVocabIDs(const StringPiece &textin);
|
||||
|
||||
uint64_t getVocabID(std::string candidate);
|
||||
}
|
||||
|
@ -1,451 +0,0 @@
|
||||
#include "huffmanish.hh"
|
||||
|
||||
Huffman::Huffman (const char * filepath)
|
||||
{
|
||||
//Read the file
|
||||
util::FilePiece filein(filepath);
|
||||
|
||||
//Init uniq_lines to zero;
|
||||
uniq_lines = 0;
|
||||
|
||||
line_text prev_line; //Check for unique lines.
|
||||
int num_lines = 0 ;
|
||||
|
||||
while (true) {
|
||||
line_text new_line;
|
||||
|
||||
num_lines++;
|
||||
|
||||
try {
|
||||
//Process line read
|
||||
new_line = splitLine(filein.ReadLine());
|
||||
count_elements(new_line); //Counts the number of elements, adds new and increments counters.
|
||||
|
||||
} catch (util::EndOfFileException e) {
|
||||
std::cerr << "Unique entries counted: ";
|
||||
break;
|
||||
}
|
||||
|
||||
if (new_line.source_phrase == prev_line.source_phrase) {
|
||||
continue;
|
||||
} else {
|
||||
uniq_lines++;
|
||||
prev_line = new_line;
|
||||
}
|
||||
}
|
||||
|
||||
std::cerr << uniq_lines << std::endl;
|
||||
}
|
||||
|
||||
void Huffman::count_elements(line_text linein)
|
||||
{
|
||||
//For target phrase:
|
||||
util::TokenIter<util::SingleCharacter> it(linein.target_phrase, util::SingleCharacter(' '));
|
||||
while (it) {
|
||||
//Check if we have that entry
|
||||
std::map<std::string, unsigned int>::iterator mapiter;
|
||||
mapiter = target_phrase_words.find(it->as_string());
|
||||
|
||||
if (mapiter != target_phrase_words.end()) {
|
||||
//If the element is found, increment the count.
|
||||
mapiter->second++;
|
||||
} else {
|
||||
//Else create a new entry;
|
||||
target_phrase_words.insert(std::pair<std::string, unsigned int>(it->as_string(), 1));
|
||||
}
|
||||
it++;
|
||||
}
|
||||
|
||||
//For word allignment 1
|
||||
std::map<std::vector<unsigned char>, unsigned int>::iterator mapiter3;
|
||||
std::vector<unsigned char> numbers = splitWordAll1(linein.word_align);
|
||||
mapiter3 = word_all1.find(numbers);
|
||||
|
||||
if (mapiter3 != word_all1.end()) {
|
||||
//If the element is found, increment the count.
|
||||
mapiter3->second++;
|
||||
} else {
|
||||
//Else create a new entry;
|
||||
word_all1.insert(std::pair<std::vector<unsigned char>, unsigned int>(numbers, 1));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
//Assigns huffman values for each unique element
|
||||
void Huffman::assign_values()
|
||||
{
|
||||
//First create vectors for all maps so that we could sort them later.
|
||||
|
||||
//Create a vector for target phrases
|
||||
for(std::map<std::string, unsigned int>::iterator it = target_phrase_words.begin(); it != target_phrase_words.end(); it++ ) {
|
||||
target_phrase_words_counts.push_back(*it);
|
||||
}
|
||||
//Sort it
|
||||
std::sort(target_phrase_words_counts.begin(), target_phrase_words_counts.end(), sort_pair());
|
||||
|
||||
//Create a vector for word allignments 1
|
||||
for(std::map<std::vector<unsigned char>, unsigned int>::iterator it = word_all1.begin(); it != word_all1.end(); it++ ) {
|
||||
word_all1_counts.push_back(*it);
|
||||
}
|
||||
//Sort it
|
||||
std::sort(word_all1_counts.begin(), word_all1_counts.end(), sort_pair_vec());
|
||||
|
||||
|
||||
//Afterwards we assign a value for each phrase, starting from 1, as zero is reserved for delimiter
|
||||
unsigned int i = 1; //huffman code
|
||||
for(std::vector<std::pair<std::string, unsigned int> >::iterator it = target_phrase_words_counts.begin();
|
||||
it != target_phrase_words_counts.end(); it++) {
|
||||
target_phrase_huffman.insert(std::pair<std::string, unsigned int>(it->first, i));
|
||||
i++; //Go to the next huffman code
|
||||
}
|
||||
|
||||
i = 1; //Reset i for the next map
|
||||
for(std::vector<std::pair<std::vector<unsigned char>, unsigned int> >::iterator it = word_all1_counts.begin();
|
||||
it != word_all1_counts.end(); it++) {
|
||||
word_all1_huffman.insert(std::pair<std::vector<unsigned char>, unsigned int>(it->first, i));
|
||||
i++; //Go to the next huffman code
|
||||
}
|
||||
|
||||
//After lookups are produced, clear some memory usage of objects not needed anymore.
|
||||
target_phrase_words.clear();
|
||||
word_all1.clear();
|
||||
|
||||
target_phrase_words_counts.clear();
|
||||
word_all1_counts.clear();
|
||||
|
||||
std::cerr << "Finished generating huffman codes." << std::endl;
|
||||
|
||||
}
|
||||
|
||||
void Huffman::serialize_maps(const char * dirname)
|
||||
{
|
||||
//Note that directory name should exist.
|
||||
std::string basedir(dirname);
|
||||
std::string target_phrase_path(basedir + "/target_phrases");
|
||||
std::string probabilities_path(basedir + "/probs");
|
||||
std::string word_all1_path(basedir + "/Wall1");
|
||||
|
||||
//Target phrase
|
||||
std::ofstream os (target_phrase_path.c_str(), std::ios::binary);
|
||||
boost::archive::text_oarchive oarch(os);
|
||||
oarch << lookup_target_phrase;
|
||||
os.close();
|
||||
|
||||
//Word all1
|
||||
std::ofstream os2 (word_all1_path.c_str(), std::ios::binary);
|
||||
boost::archive::text_oarchive oarch2(os2);
|
||||
oarch2 << lookup_word_all1;
|
||||
os2.close();
|
||||
}
|
||||
|
||||
std::vector<unsigned char> Huffman::full_encode_line(line_text line)
|
||||
{
|
||||
return vbyte_encode_line((encode_line(line)));
|
||||
}
|
||||
|
||||
std::vector<unsigned int> Huffman::encode_line(line_text line)
|
||||
{
|
||||
std::vector<unsigned int> retvector;
|
||||
|
||||
//Get target_phrase first.
|
||||
util::TokenIter<util::SingleCharacter> it(line.target_phrase, util::SingleCharacter(' '));
|
||||
while (it) {
|
||||
retvector.push_back(target_phrase_huffman.find(it->as_string())->second);
|
||||
it++;
|
||||
}
|
||||
//Add a zero;
|
||||
retvector.push_back(0);
|
||||
|
||||
//Get probabilities. Reinterpreting the float bytes as unsgined int.
|
||||
util::TokenIter<util::SingleCharacter> probit(line.prob, util::SingleCharacter(' '));
|
||||
while (probit) {
|
||||
//Sometimes we have too big floats to handle, so first convert to double
|
||||
double tempnum = atof(probit->data());
|
||||
float num = (float)tempnum;
|
||||
retvector.push_back(reinterpret_float(&num));
|
||||
probit++;
|
||||
}
|
||||
//Add a zero;
|
||||
retvector.push_back(0);
|
||||
|
||||
|
||||
//Get Word allignments
|
||||
retvector.push_back(word_all1_huffman.find(splitWordAll1(line.word_align))->second);
|
||||
retvector.push_back(0);
|
||||
|
||||
return retvector;
|
||||
}
|
||||
|
||||
void Huffman::produce_lookups()
|
||||
{
|
||||
//basically invert every map that we have
|
||||
for(std::map<std::string, unsigned int>::iterator it = target_phrase_huffman.begin(); it != target_phrase_huffman.end(); it++ ) {
|
||||
lookup_target_phrase.insert(std::pair<unsigned int, std::string>(it->second, it->first));
|
||||
}
|
||||
|
||||
for(std::map<std::vector<unsigned char>, unsigned int>::iterator it = word_all1_huffman.begin(); it != word_all1_huffman.end(); it++ ) {
|
||||
lookup_word_all1.insert(std::pair<unsigned int, std::vector<unsigned char> >(it->second, it->first));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
HuffmanDecoder::HuffmanDecoder (const char * dirname)
|
||||
{
|
||||
//Read the maps from disk
|
||||
|
||||
//Note that directory name should exist.
|
||||
std::string basedir(dirname);
|
||||
std::string target_phrase_path(basedir + "/target_phrases");
|
||||
std::string word_all1_path(basedir + "/Wall1");
|
||||
|
||||
//Target phrases
|
||||
std::ifstream is (target_phrase_path.c_str(), std::ios::binary);
|
||||
boost::archive::text_iarchive iarch(is);
|
||||
iarch >> lookup_target_phrase;
|
||||
is.close();
|
||||
|
||||
//Word allignment 1
|
||||
std::ifstream is2 (word_all1_path.c_str(), std::ios::binary);
|
||||
boost::archive::text_iarchive iarch2(is2);
|
||||
iarch2 >> lookup_word_all1;
|
||||
is2.close();
|
||||
|
||||
}
|
||||
|
||||
HuffmanDecoder::HuffmanDecoder (std::map<unsigned int, std::string> * lookup_target,
|
||||
std::map<unsigned int, std::vector<unsigned char> > * lookup_word1)
|
||||
{
|
||||
lookup_target_phrase = *lookup_target;
|
||||
lookup_word_all1 = *lookup_word1;
|
||||
}
|
||||
|
||||
std::vector<target_text> HuffmanDecoder::full_decode_line (std::vector<unsigned char> lines, int num_scores)
|
||||
{
|
||||
std::vector<target_text> retvector; //All target phrases
|
||||
std::vector<unsigned int> decoded_lines = vbyte_decode_line(lines); //All decoded lines
|
||||
std::vector<unsigned int>::iterator it = decoded_lines.begin(); //Iterator for them
|
||||
std::vector<unsigned int> current_target_phrase; //Current target phrase decoded
|
||||
|
||||
short zero_count = 0; //Count home many zeroes we have met. so far. Every 3 zeroes mean a new target phrase.
|
||||
while(it != decoded_lines.end()) {
|
||||
if (zero_count == 1) {
|
||||
//We are extracting scores. we know how many scores there are so we can push them
|
||||
//to the vector. This is done in case any of the scores is 0, because it would mess
|
||||
//up the state machine.
|
||||
for (int i = 0; i < num_scores; i++) {
|
||||
current_target_phrase.push_back(*it);
|
||||
it++;
|
||||
}
|
||||
}
|
||||
|
||||
if (zero_count == 3) {
|
||||
//We have finished with this entry, decode it, and add it to the retvector.
|
||||
retvector.push_back(decode_line(current_target_phrase, num_scores));
|
||||
current_target_phrase.clear(); //Clear the current target phrase and the zero_count
|
||||
zero_count = 0; //So that we can reuse them for the next target phrase
|
||||
}
|
||||
//Add to the next target_phrase, number by number.
|
||||
current_target_phrase.push_back(*it);
|
||||
if (*it == 0) {
|
||||
zero_count++;
|
||||
}
|
||||
it++; //Go to the next word/symbol
|
||||
}
|
||||
//Don't forget the last remaining line!
|
||||
if (zero_count == 3) {
|
||||
//We have finished with this entry, decode it, and add it to the retvector.
|
||||
retvector.push_back(decode_line(current_target_phrase, num_scores));
|
||||
current_target_phrase.clear(); //Clear the current target phrase and the zero_count
|
||||
zero_count = 0; //So that we can reuse them for the next target phrase
|
||||
}
|
||||
|
||||
return retvector;
|
||||
|
||||
}
|
||||
|
||||
target_text HuffmanDecoder::decode_line (std::vector<unsigned int> input, int num_scores)
|
||||
{
|
||||
//demo decoder
|
||||
target_text ret;
|
||||
//Split everything
|
||||
std::vector<unsigned int> target_phrase;
|
||||
std::vector<unsigned int> probs;
|
||||
unsigned int wAll;
|
||||
|
||||
//Split the line into the proper arrays
|
||||
short num_zeroes = 0;
|
||||
int counter = 0;
|
||||
while (num_zeroes < 3) {
|
||||
unsigned int num = input[counter];
|
||||
if (num == 0) {
|
||||
num_zeroes++;
|
||||
} else if (num_zeroes == 0) {
|
||||
target_phrase.push_back(num);
|
||||
} else if (num_zeroes == 1) {
|
||||
//Push exactly num_scores scores
|
||||
for (int i = 0; i < num_scores; i++) {
|
||||
probs.push_back(num);
|
||||
counter++;
|
||||
num = input[counter];
|
||||
}
|
||||
continue;
|
||||
} else if (num_zeroes == 2) {
|
||||
wAll = num;
|
||||
}
|
||||
counter++;
|
||||
}
|
||||
|
||||
ret.target_phrase = target_phrase;
|
||||
ret.word_all1 = lookup_word_all1.find(wAll)->second;
|
||||
|
||||
//Decode probabilities
|
||||
for (std::vector<unsigned int>::iterator it = probs.begin(); it != probs.end(); it++) {
|
||||
ret.prob.push_back(reinterpret_uint(&(*it)));
|
||||
}
|
||||
|
||||
return ret;
|
||||
|
||||
}
|
||||
|
||||
inline std::string HuffmanDecoder::getTargetWordFromID(unsigned int id)
|
||||
{
|
||||
return lookup_target_phrase.find(id)->second;
|
||||
}
|
||||
|
||||
std::string HuffmanDecoder::getTargetWordsFromIDs(std::vector<unsigned int> ids)
|
||||
{
|
||||
std::string returnstring;
|
||||
for (std::vector<unsigned int>::iterator it = ids.begin(); it != ids.end(); it++) {
|
||||
returnstring.append(getTargetWordFromID(*it) + " ");
|
||||
}
|
||||
|
||||
return returnstring;
|
||||
}
|
||||
|
||||
inline std::string getTargetWordFromID(unsigned int id, std::map<unsigned int, std::string> * lookup_target_phrase)
|
||||
{
|
||||
return lookup_target_phrase->find(id)->second;
|
||||
}
|
||||
|
||||
std::string getTargetWordsFromIDs(std::vector<unsigned int> ids, std::map<unsigned int, std::string> * lookup_target_phrase)
|
||||
{
|
||||
std::string returnstring;
|
||||
for (std::vector<unsigned int>::iterator it = ids.begin(); it != ids.end(); it++) {
|
||||
returnstring.append(getTargetWordFromID(*it, lookup_target_phrase) + " ");
|
||||
}
|
||||
|
||||
return returnstring;
|
||||
}
|
||||
|
||||
/*Those functions are used to more easily store the floats in the binary phrase table
|
||||
We convert the float unsinged int so that it is the same as our other values and we can
|
||||
apply variable byte encoding on top of it.*/
|
||||
|
||||
inline unsigned int reinterpret_float(float * num)
|
||||
{
|
||||
unsigned int * converted_num;
|
||||
converted_num = reinterpret_cast<unsigned int *>(num);
|
||||
return *converted_num;
|
||||
}
|
||||
|
||||
inline float reinterpret_uint(unsigned int * num)
|
||||
{
|
||||
float * converted_num;
|
||||
converted_num = reinterpret_cast<float *>(num);
|
||||
return *converted_num;
|
||||
}
|
||||
|
||||
/*Mostly taken from stackoverflow, http://stackoverflow.com/questions/5858646/optimizing-variable-length-encoding
|
||||
and modified in order to return a vector of chars. Implements ULEB128 or variable byte encoding.
|
||||
This is highly optimized version with unrolled loop */
|
||||
inline std::vector<unsigned char> vbyte_encode(unsigned int num)
|
||||
{
|
||||
//Determine how many bytes we are going to take.
|
||||
short size;
|
||||
std::vector<unsigned char> byte_vector;
|
||||
|
||||
if (num < 0x00000080U) {
|
||||
size = 1;
|
||||
byte_vector.reserve(size);
|
||||
goto b1;
|
||||
}
|
||||
if (num < 0x00004000U) {
|
||||
size = 2;
|
||||
byte_vector.reserve(size);
|
||||
goto b2;
|
||||
}
|
||||
if (num < 0x00200000U) {
|
||||
size = 3;
|
||||
byte_vector.reserve(size);
|
||||
goto b3;
|
||||
}
|
||||
if (num < 0x10000000U) {
|
||||
size = 4;
|
||||
byte_vector.reserve(size);
|
||||
goto b4;
|
||||
}
|
||||
size = 5;
|
||||
byte_vector.reserve(size);
|
||||
|
||||
|
||||
//Now proceed with the encoding.
|
||||
byte_vector.push_back((num & 0x7f) | 0x80);
|
||||
num >>= 7;
|
||||
b4:
|
||||
byte_vector.push_back((num & 0x7f) | 0x80);
|
||||
num >>= 7;
|
||||
b3:
|
||||
byte_vector.push_back((num & 0x7f) | 0x80);
|
||||
num >>= 7;
|
||||
b2:
|
||||
byte_vector.push_back((num & 0x7f) | 0x80);
|
||||
num >>= 7;
|
||||
b1:
|
||||
byte_vector.push_back(num);
|
||||
|
||||
return byte_vector;
|
||||
}
|
||||
|
||||
std::vector<unsigned int> vbyte_decode_line(std::vector<unsigned char> line)
|
||||
{
|
||||
std::vector<unsigned int> huffman_line;
|
||||
std::vector<unsigned char> current_num;
|
||||
|
||||
for (std::vector<unsigned char>::iterator it = line.begin(); it != line.end(); it++) {
|
||||
current_num.push_back(*it);
|
||||
if ((*it >> 7) != 1) {
|
||||
//We don't have continuation in the next bit
|
||||
huffman_line.push_back(bytes_to_int(current_num));
|
||||
current_num.clear();
|
||||
}
|
||||
}
|
||||
return huffman_line;
|
||||
}
|
||||
|
||||
inline unsigned int bytes_to_int(std::vector<unsigned char> number)
|
||||
{
|
||||
unsigned int retvalue = 0;
|
||||
std::vector<unsigned char>::iterator it = number.begin();
|
||||
unsigned char shift = 0; //By how many bits to shift
|
||||
|
||||
while (it != number.end()) {
|
||||
retvalue |= (*it & 0x7f) << shift;
|
||||
shift += 7;
|
||||
it++;
|
||||
}
|
||||
|
||||
return retvalue;
|
||||
}
|
||||
|
||||
std::vector<unsigned char> vbyte_encode_line(std::vector<unsigned int> line)
|
||||
{
|
||||
std::vector<unsigned char> retvec;
|
||||
|
||||
//For each unsigned int in the line, vbyte encode it and add it to a vector of unsigned chars.
|
||||
for (std::vector<unsigned int>::iterator it = line.begin(); it != line.end(); it++) {
|
||||
std::vector<unsigned char> vbyte_encoded = vbyte_encode(*it);
|
||||
retvec.insert(retvec.end(), vbyte_encoded.begin(), vbyte_encoded.end());
|
||||
}
|
||||
|
||||
return retvec;
|
||||
}
|
@ -1,112 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
//Huffman encodes a line and also produces the vocabulary ids
|
||||
#include "hash.hh"
|
||||
#include "line_splitter.hh"
|
||||
#include <cstdio>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <boost/serialization/serialization.hpp>
|
||||
#include <boost/serialization/vector.hpp>
|
||||
#include <boost/serialization/map.hpp>
|
||||
#include <boost/archive/text_iarchive.hpp>
|
||||
#include <boost/archive/text_oarchive.hpp>
|
||||
|
||||
//Sorting for the second
|
||||
//Comparator ordering (phrase, count) pairs by descending count, so the most
//frequent phrases sort to the front.
struct sort_pair {
  bool operator()(const std::pair<std::string, unsigned int> &a,
                  const std::pair<std::string, unsigned int> &b) {
    //Descending order: the pair with the larger count compares "less".
    return b.second < a.second;
  }
};
|
||||
|
||||
//Comparator ordering (byte-sequence, count) pairs by descending count, so the
//most frequent word-alignment sequences sort to the front.
struct sort_pair_vec {
  bool operator()(const std::pair<std::vector<unsigned char>, unsigned int> &a,
                  const std::pair<std::vector<unsigned char>, unsigned int> &b) {
    //Descending order: the pair with the larger count compares "less".
    return b.second < a.second;
  }
};
|
||||
|
||||
//Builds the Huffman-style code tables used to compress a phrase table:
//counts element occurrences line by line, assigns integer codes, and can
//serialize the resulting lookup maps to disk for the decoder.
class Huffman
{
  unsigned long uniq_lines; //Unique lines in the file.

  //Containers used when counting the occurence of a given phrase
  std::map<std::string, unsigned int> target_phrase_words;
  std::map<std::vector<unsigned char>, unsigned int> word_all1;

  //Same containers as vectors, for sorting
  std::vector<std::pair<std::string, unsigned int> > target_phrase_words_counts;
  std::vector<std::pair<std::vector<unsigned char>, unsigned int> > word_all1_counts;

  //Huffman maps: element -> assigned integer code
  std::map<std::string, unsigned int> target_phrase_huffman;
  std::map<std::vector<unsigned char>, unsigned int> word_all1_huffman;

  //inverted maps: code -> element, used by the decoder side
  std::map<unsigned int, std::string> lookup_target_phrase;
  std::map<unsigned int, std::vector<unsigned char> > lookup_word_all1;

public:
  //Constructor takes the path of the text phrase table to scan.
  Huffman (const char *);
  //Tally the occurrences of the elements of one parsed line.
  void count_elements (line_text line);
  //Assign integer codes from the gathered counts.
  void assign_values();
  //Write the code/lookup maps into the given directory.
  void serialize_maps(const char * dirname);
  //Build the inverted (code -> element) maps.
  void produce_lookups();

  //Encode one parsed line as a sequence of integer codes.
  std::vector<unsigned int> encode_line(line_text line);

  //encode line + variable byte ontop
  std::vector<unsigned char> full_encode_line(line_text line);

  //Getters
  //NOTE(review): both getters return the map by value (a full copy), not by
  //reference — possibly intentional, but worth confirming with callers.
  const std::map<unsigned int, std::string> get_target_lookup_map() const {
    return lookup_target_phrase;
  }
  const std::map<unsigned int, std::vector<unsigned char> > get_word_all1_lookup_map() const {
    return lookup_word_all1;
  }

  //Number of unique lines counted in the input file.
  unsigned long getUniqLines() {
    return uniq_lines;
  }
};
|
||||
|
||||
//Decodes lines compressed by Huffman: holds the code -> element lookup maps
//and turns encoded byte/int sequences back into target_text records.
class HuffmanDecoder
{
  std::map<unsigned int, std::string> lookup_target_phrase;
  std::map<unsigned int, std::vector<unsigned char> > lookup_word_all1;

public:
  //Construct by loading serialized maps from the given directory/path.
  HuffmanDecoder (const char *);
  //Construct from already-loaded lookup maps.
  HuffmanDecoder (std::map<unsigned int, std::string> *, std::map<unsigned int, std::vector<unsigned char> > *);

  //Getters
  //NOTE(review): both getters return the map by value (a full copy), not by
  //reference — possibly intentional, but worth confirming with callers.
  const std::map<unsigned int, std::string> get_target_lookup_map() const {
    return lookup_target_phrase;
  }
  const std::map<unsigned int, std::vector<unsigned char> > get_word_all1_lookup_map() const {
    return lookup_word_all1;
  }

  //Map a single code back to its target word.
  inline std::string getTargetWordFromID(unsigned int id);

  //Map a sequence of codes back to a space-joined target phrase string.
  std::string getTargetWordsFromIDs(std::vector<unsigned int> ids);

  //Decode one encoded entry into a target_text record; num_scores tells the
  //decoder how many probability values to expect per entry.
  target_text decode_line (std::vector<unsigned int> input, int num_scores);

  //Variable byte decodes a all target phrases contained here and then passes them to decode_line
  std::vector<target_text> full_decode_line (std::vector<unsigned char> lines, int num_scores);
};
|
||||
|
||||
std::string getTargetWordsFromIDs(std::vector<unsigned int> ids, std::map<unsigned int, std::string> * lookup_target_phrase);
|
||||
|
||||
inline std::string getTargetWordFromID(unsigned int id, std::map<unsigned int, std::string> * lookup_target_phrase);
|
||||
|
||||
inline unsigned int reinterpret_float(float * num);
|
||||
|
||||
inline float reinterpret_uint(unsigned int * num);
|
||||
|
||||
std::vector<unsigned char> vbyte_encode_line(std::vector<unsigned int> line);
|
||||
inline std::vector<unsigned char> vbyte_encode(unsigned int num);
|
||||
std::vector<unsigned int> vbyte_decode_line(std::vector<unsigned char> line);
|
||||
inline unsigned int bytes_to_int(std::vector<unsigned char> number);
|
@ -1,66 +1,92 @@
|
||||
#include "line_splitter.hh"
|
||||
|
||||
line_text splitLine(StringPiece textin)
|
||||
namespace Moses
|
||||
{
|
||||
const char delim[] = " ||| ";
|
||||
|
||||
line_text splitLine(const StringPiece &textin, bool scfg)
|
||||
{
|
||||
const char delim[] = "|||";
|
||||
line_text output;
|
||||
|
||||
//Tokenize
|
||||
util::TokenIter<util::MultiCharacter> it(textin, util::MultiCharacter(delim));
|
||||
//Get source phrase
|
||||
output.source_phrase = *it;
|
||||
output.source_phrase = Trim(*it);
|
||||
//std::cerr << "output.source_phrase=" << output.source_phrase << "AAAA" << std::endl;
|
||||
|
||||
//Get target_phrase
|
||||
it++;
|
||||
output.target_phrase = *it;
|
||||
output.target_phrase = Trim(*it);
|
||||
//std::cerr << "output.target_phrase=" << output.target_phrase << "AAAA" << std::endl;
|
||||
|
||||
if (scfg) {
|
||||
/*
|
||||
std::cerr << "output.source_phrase=" << output.source_phrase << std::endl;
|
||||
std::cerr << "output.target_phrase=" << output.target_phrase << std::endl;
|
||||
reformatSCFG(output);
|
||||
std::cerr << "output.source_phrase=" << output.source_phrase << std::endl;
|
||||
std::cerr << "output.target_phrase=" << output.target_phrase << std::endl;
|
||||
*/
|
||||
}
|
||||
|
||||
//Get probabilities
|
||||
it++;
|
||||
output.prob = *it;
|
||||
output.prob = Trim(*it);
|
||||
//std::cerr << "output.prob=" << output.prob << "AAAA" << std::endl;
|
||||
|
||||
//Get WordAllignment
|
||||
it++;
|
||||
if (it == util::TokenIter<util::MultiCharacter>::end()) return output;
|
||||
output.word_align = *it;
|
||||
output.word_align = Trim(*it);
|
||||
//std::cerr << "output.word_align=" << output.word_align << "AAAA" << std::endl;
|
||||
|
||||
//Get count
|
||||
it++;
|
||||
if (it == util::TokenIter<util::MultiCharacter>::end()) return output;
|
||||
output.counts = *it;
|
||||
output.counts = Trim(*it);
|
||||
//std::cerr << "output.counts=" << output.counts << "AAAA" << std::endl;
|
||||
|
||||
//Get sparse_score
|
||||
it++;
|
||||
if (it == util::TokenIter<util::MultiCharacter>::end()) return output;
|
||||
output.sparse_score = *it;
|
||||
output.sparse_score = Trim(*it);
|
||||
//std::cerr << "output.sparse_score=" << output.sparse_score << "AAAA" << std::endl;
|
||||
|
||||
//Get property
|
||||
it++;
|
||||
if (it == util::TokenIter<util::MultiCharacter>::end()) return output;
|
||||
output.property = *it;
|
||||
output.property = Trim(*it);
|
||||
//std::cerr << "output.property=" << output.property << "AAAA" << std::endl;
|
||||
|
||||
return output;
|
||||
}
|
||||
|
||||
std::vector<unsigned char> splitWordAll1(StringPiece textin)
|
||||
std::vector<unsigned char> splitWordAll1(const StringPiece &textin)
|
||||
{
|
||||
const char delim[] = " ";
|
||||
const char delim2[] = "-";
|
||||
std::vector<unsigned char> output;
|
||||
|
||||
//Case with no word alignments.
|
||||
if (textin.size() == 0) {
|
||||
return output;
|
||||
}
|
||||
|
||||
//Split on space
|
||||
util::TokenIter<util::MultiCharacter> it(textin, util::MultiCharacter(delim));
|
||||
|
||||
//For each int
|
||||
while (it) {
|
||||
//Split on dash (-)
|
||||
util::TokenIter<util::MultiCharacter> itInner(*it, util::MultiCharacter(delim2));
|
||||
util::TokenIter<util::MultiCharacter> itInner(*it,
|
||||
util::MultiCharacter(delim2));
|
||||
|
||||
//Insert the two entries in the vector. User will read entry 0 and 1 to get the first,
|
||||
//2 and 3 for second etc. Use unsigned char instead of int to save space, as
|
||||
//word allignments are all very small numbers that fit in a single byte
|
||||
output.push_back((unsigned char)(atoi(itInner->data())));
|
||||
output.push_back((unsigned char) (atoi(itInner->data())));
|
||||
itInner++;
|
||||
output.push_back((unsigned char)(atoi(itInner->data())));
|
||||
output.push_back((unsigned char) (atoi(itInner->data())));
|
||||
it++;
|
||||
}
|
||||
|
||||
@ -68,3 +94,10 @@ std::vector<unsigned char> splitWordAll1(StringPiece textin)
|
||||
|
||||
}
|
||||
|
||||
void reformatSCFG(line_text &output)
|
||||
{
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
@ -9,8 +9,12 @@
|
||||
#include "util/tokenize_piece.hh"
|
||||
#include <vector>
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
|
||||
//Struct for holding processed line
|
||||
struct line_text {
|
||||
struct line_text
|
||||
{
|
||||
StringPiece source_phrase;
|
||||
StringPiece target_phrase;
|
||||
StringPiece prob;
|
||||
@ -18,16 +22,38 @@ struct line_text {
|
||||
StringPiece counts;
|
||||
StringPiece sparse_score;
|
||||
StringPiece property;
|
||||
std::string property_to_be_binarized;
|
||||
};
|
||||
|
||||
//Struct for holding processed line
|
||||
struct target_text {
|
||||
struct target_text
|
||||
{
|
||||
std::vector<unsigned int> target_phrase;
|
||||
std::vector<float> prob;
|
||||
std::vector<unsigned char> word_all1;
|
||||
std::vector<size_t> word_align_term;
|
||||
std::vector<size_t> word_align_non_term;
|
||||
std::vector<char> counts;
|
||||
std::vector<char> sparse_score;
|
||||
std::vector<char> property;
|
||||
|
||||
/*
|
||||
void Reset()
|
||||
{
|
||||
target_phrase.clear();
|
||||
prob.clear();
|
||||
word_all1.clear();
|
||||
counts.clear();
|
||||
sparse_score.clear();
|
||||
property.clear();
|
||||
}
|
||||
*/
|
||||
};
|
||||
|
||||
//Ask if it's better to have it receive a pointer to a line_text struct
|
||||
line_text splitLine(StringPiece textin);
|
||||
line_text splitLine(const StringPiece &textin, bool scfg);
|
||||
void reformatSCFG(line_text &output);
|
||||
|
||||
std::vector<unsigned char> splitWordAll1(const StringPiece &textin);
|
||||
|
||||
}
|
||||
|
||||
std::vector<unsigned char> splitWordAll1(StringPiece textin);
|
||||
|
@ -1,5 +1,8 @@
|
||||
#include "probing_hash_utils.hh"
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
|
||||
//Read table from disk, return memory map location
|
||||
char * readTable(const char * filename, size_t size)
|
||||
{
|
||||
@ -13,7 +16,7 @@ char * readTable(const char * filename, size_t size)
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
map = (char *)mmap(0, size, PROT_READ, MAP_SHARED, fd, 0);
|
||||
map = (char *) mmap(0, size, PROT_READ, MAP_SHARED, fd, 0);
|
||||
|
||||
if (map == MAP_FAILED) {
|
||||
close(fd);
|
||||
@ -24,11 +27,24 @@ char * readTable(const char * filename, size_t size)
|
||||
return map;
|
||||
}
|
||||
|
||||
|
||||
void serialize_table(char *mem, size_t size, const char * filename)
|
||||
void serialize_table(char *mem, size_t size, const std::string &filename)
|
||||
{
|
||||
std::ofstream os (filename, std::ios::binary);
|
||||
os.write((const char*)&mem[0], size);
|
||||
std::ofstream os(filename.c_str(), std::ios::binary);
|
||||
os.write((const char*) &mem[0], size);
|
||||
os.close();
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
//Combine a source phrase's vocab ids into a single probing-hash key.
//Each id is shifted left by its position before being summed; this must
//match the key construction used when the table was built.
uint64_t getKey(const uint64_t source_phrase[], size_t size)
{
  //TOO SLOW
  //uint64_t key = util::MurmurHashNative(&source_phrase[0], source_phrase.size());
  uint64_t combined = 0;
  size_t pos = 0;
  while (pos < size) {
    combined += source_phrase[pos] << pos;
    ++pos;
  }
  return combined;
}
|
||||
|
||||
}
|
||||
|
||||
|
@ -7,31 +7,49 @@
|
||||
#include <fcntl.h>
|
||||
#include <fstream>
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
|
||||
#define API_VERSION 15
|
||||
|
||||
//Hash table entry
|
||||
struct Entry {
|
||||
uint64_t key;
|
||||
struct Entry
|
||||
{
|
||||
typedef uint64_t Key;
|
||||
unsigned int bytes_toread;
|
||||
Key key;
|
||||
|
||||
uint64_t GetKey() const {
|
||||
Key GetKey() const
|
||||
{
|
||||
return key;
|
||||
}
|
||||
|
||||
void SetKey(uint64_t to) {
|
||||
void SetKey(Key to)
|
||||
{
|
||||
key = to;
|
||||
}
|
||||
|
||||
uint64_t GetValue() const {
|
||||
return value;
|
||||
}
|
||||
|
||||
uint64_t value;
|
||||
};
|
||||
|
||||
#define NONE std::numeric_limits<uint64_t>::max()
|
||||
|
||||
//Define table
|
||||
typedef util::ProbingHashTable<Entry, boost::hash<uint64_t> > Table;
|
||||
|
||||
void serialize_table(char *mem, size_t size, const char * filename);
|
||||
void serialize_table(char *mem, size_t size, const std::string &filename);
|
||||
|
||||
char * readTable(const char * filename, size_t size);
|
||||
|
||||
uint64_t getKey(const uint64_t source_phrase[], size_t size);
|
||||
|
||||
//Fixed-size header record stored per target phrase in the binary table.
//NOTE(review): the field meanings below are inferred from the names only —
//confirm against the code that writes these records.
struct TargetPhraseInfo
{
  uint32_t alignTerm;    // presumably an index into the terminal alignment collection — verify
  uint32_t alignNonTerm; // presumably an index into the non-terminal alignment collection — verify
  uint16_t numWords;     // presumably the number of words in the target phrase — verify
  uint16_t propLength;   // presumably the byte length of the property string — verify
  uint16_t filler;       // unused; presumably padding to round out the record size — verify
};
|
||||
|
||||
}
|
||||
|
||||
|
@ -1,198 +0,0 @@
|
||||
#include "quering.hh"
|
||||
|
||||
unsigned char * read_binary_file(const char * filename, size_t filesize)
|
||||
{
|
||||
//Get filesize
|
||||
int fd;
|
||||
unsigned char * map;
|
||||
|
||||
fd = open(filename, O_RDONLY);
|
||||
|
||||
if (fd == -1) {
|
||||
perror("Error opening file for reading");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
map = (unsigned char *)mmap(0, filesize, PROT_READ, MAP_SHARED, fd, 0);
|
||||
if (map == MAP_FAILED) {
|
||||
close(fd);
|
||||
perror("Error mmapping the file");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
return map;
|
||||
}
|
||||
|
||||
QueryEngine::QueryEngine(const char * filepath) : decoder(filepath)
|
||||
{
|
||||
|
||||
//Create filepaths
|
||||
std::string basepath(filepath);
|
||||
std::string path_to_hashtable = basepath + "/probing_hash.dat";
|
||||
std::string path_to_data_bin = basepath + "/binfile.dat";
|
||||
std::string path_to_source_vocabid = basepath + "/source_vocabids";
|
||||
|
||||
///Source phrase vocabids
|
||||
read_map(&source_vocabids, path_to_source_vocabid.c_str());
|
||||
|
||||
//Target phrase vocabIDs
|
||||
vocabids = decoder.get_target_lookup_map();
|
||||
|
||||
//Read config file
|
||||
std::string line;
|
||||
std::ifstream config ((basepath + "/config").c_str());
|
||||
//Check API version:
|
||||
getline(config, line);
|
||||
if (atoi(line.c_str()) != API_VERSION) {
|
||||
std::cerr << "The ProbingPT API has changed, please rebinarize your phrase tables." << std::endl;
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
//Get tablesize.
|
||||
getline(config, line);
|
||||
int tablesize = atoi(line.c_str());
|
||||
//Number of scores
|
||||
getline(config, line);
|
||||
num_scores = atoi(line.c_str());
|
||||
//do we have a reordering table
|
||||
getline(config, line);
|
||||
std::transform(line.begin(), line.end(), line.begin(), ::tolower); //Get the boolean in lowercase
|
||||
is_reordering = false;
|
||||
if (line == "true") {
|
||||
is_reordering = true;
|
||||
std::cerr << "WARNING. REORDERING TABLES NOT SUPPORTED YET." << std::endl;
|
||||
}
|
||||
config.close();
|
||||
|
||||
//Mmap binary table
|
||||
struct stat filestatus;
|
||||
stat(path_to_data_bin.c_str(), &filestatus);
|
||||
binary_filesize = filestatus.st_size;
|
||||
binary_mmaped = read_binary_file(path_to_data_bin.c_str(), binary_filesize);
|
||||
|
||||
//Read hashtable
|
||||
table_filesize = Table::Size(tablesize, 1.2);
|
||||
mem = readTable(path_to_hashtable.c_str(), table_filesize);
|
||||
Table table_init(mem, table_filesize);
|
||||
table = table_init;
|
||||
|
||||
std::cerr << "Initialized successfully! " << std::endl;
|
||||
}
|
||||
|
||||
QueryEngine::~QueryEngine()
|
||||
{
|
||||
//Clear mmap content from memory.
|
||||
munmap(binary_mmaped, binary_filesize);
|
||||
munmap(mem, table_filesize);
|
||||
|
||||
}
|
||||
|
||||
std::pair<bool, std::vector<target_text> > QueryEngine::query(std::vector<uint64_t> source_phrase)
|
||||
{
|
||||
bool found;
|
||||
std::vector<target_text> translation_entries;
|
||||
const Entry * entry;
|
||||
//TOO SLOW
|
||||
//uint64_t key = util::MurmurHashNative(&source_phrase[0], source_phrase.size());
|
||||
uint64_t key = 0;
|
||||
for (int i = 0; i < source_phrase.size(); i++) {
|
||||
key += (source_phrase[i] << i);
|
||||
}
|
||||
|
||||
|
||||
found = table.Find(key, entry);
|
||||
|
||||
if (found) {
|
||||
//The phrase that was searched for was found! We need to get the translation entries.
|
||||
//We will read the largest entry in bytes and then filter the unnecesarry with functions
|
||||
//from line_splitter
|
||||
uint64_t initial_index = entry -> GetValue();
|
||||
unsigned int bytes_toread = entry -> bytes_toread;
|
||||
|
||||
//ASK HIEU FOR MORE EFFICIENT WAY TO DO THIS!
|
||||
std::vector<unsigned char> encoded_text; //Assign to the vector the relevant portion of the array.
|
||||
encoded_text.reserve(bytes_toread);
|
||||
for (int i = 0; i < bytes_toread; i++) {
|
||||
encoded_text.push_back(binary_mmaped[i+initial_index]);
|
||||
}
|
||||
|
||||
//Get only the translation entries necessary
|
||||
translation_entries = decoder.full_decode_line(encoded_text, num_scores);
|
||||
|
||||
}
|
||||
|
||||
std::pair<bool, std::vector<target_text> > output (found, translation_entries);
|
||||
|
||||
return output;
|
||||
|
||||
}
|
||||
|
||||
std::pair<bool, std::vector<target_text> > QueryEngine::query(StringPiece source_phrase)
|
||||
{
|
||||
bool found;
|
||||
std::vector<target_text> translation_entries;
|
||||
const Entry * entry;
|
||||
//Convert source frase to VID
|
||||
std::vector<uint64_t> source_phrase_vid = getVocabIDs(source_phrase);
|
||||
//TOO SLOW
|
||||
//uint64_t key = util::MurmurHashNative(&source_phrase_vid[0], source_phrase_vid.size());
|
||||
uint64_t key = 0;
|
||||
for (int i = 0; i < source_phrase_vid.size(); i++) {
|
||||
key += (source_phrase_vid[i] << i);
|
||||
}
|
||||
|
||||
found = table.Find(key, entry);
|
||||
|
||||
|
||||
if (found) {
|
||||
//The phrase that was searched for was found! We need to get the translation entries.
|
||||
//We will read the largest entry in bytes and then filter the unnecesarry with functions
|
||||
//from line_splitter
|
||||
uint64_t initial_index = entry -> GetValue();
|
||||
unsigned int bytes_toread = entry -> bytes_toread;
|
||||
//At the end of the file we can't readd + largest_entry cause we get a segfault.
|
||||
std::cerr << "Entry size is bytes is: " << bytes_toread << std::endl;
|
||||
|
||||
//ASK HIEU FOR MORE EFFICIENT WAY TO DO THIS!
|
||||
std::vector<unsigned char> encoded_text; //Assign to the vector the relevant portion of the array.
|
||||
encoded_text.reserve(bytes_toread);
|
||||
for (int i = 0; i < bytes_toread; i++) {
|
||||
encoded_text.push_back(binary_mmaped[i+initial_index]);
|
||||
}
|
||||
|
||||
//Get only the translation entries necessary
|
||||
translation_entries = decoder.full_decode_line(encoded_text, num_scores);
|
||||
|
||||
}
|
||||
|
||||
std::pair<bool, std::vector<target_text> > output (found, translation_entries);
|
||||
|
||||
return output;
|
||||
|
||||
}
|
||||
|
||||
void QueryEngine::printTargetInfo(std::vector<target_text> target_phrases)
|
||||
{
|
||||
int entries = target_phrases.size();
|
||||
|
||||
for (int i = 0; i<entries; i++) {
|
||||
std::cout << "Entry " << i+1 << " of " << entries << ":" << std::endl;
|
||||
//Print text
|
||||
std::cout << getTargetWordsFromIDs(target_phrases[i].target_phrase, &vocabids) << "\t";
|
||||
|
||||
//Print probabilities:
|
||||
for (int j = 0; j<target_phrases[i].prob.size(); j++) {
|
||||
std::cout << target_phrases[i].prob[j] << " ";
|
||||
}
|
||||
std::cout << "\t";
|
||||
|
||||
//Print word_all1
|
||||
for (int j = 0; j<target_phrases[i].word_all1.size(); j++) {
|
||||
if (j%2 == 0) {
|
||||
std::cout << (short)target_phrases[i].word_all1[j] << "-";
|
||||
} else {
|
||||
std::cout << (short)target_phrases[i].word_all1[j] << " ";
|
||||
}
|
||||
}
|
||||
std::cout << std::endl;
|
||||
}
|
||||
}
|
@ -1,45 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include "probing_hash_utils.hh"
|
||||
#include "huffmanish.hh"
|
||||
#include "hash.hh" //Includes line splitter
|
||||
#include <sys/stat.h> //For finding size of file
|
||||
#include "vocabid.hh"
|
||||
#include <algorithm> //toLower
|
||||
#define API_VERSION 3
|
||||
|
||||
|
||||
char * read_binary_file(char * filename);
|
||||
|
||||
class QueryEngine
|
||||
{
|
||||
unsigned char * binary_mmaped; //The binari phrase table file
|
||||
std::map<unsigned int, std::string> vocabids;
|
||||
std::map<uint64_t, std::string> source_vocabids;
|
||||
|
||||
Table table;
|
||||
char *mem; //Memory for the table, necessary so that we can correctly destroy the object
|
||||
|
||||
HuffmanDecoder decoder;
|
||||
|
||||
size_t binary_filesize;
|
||||
size_t table_filesize;
|
||||
int num_scores;
|
||||
bool is_reordering;
|
||||
public:
|
||||
QueryEngine (const char *);
|
||||
~QueryEngine();
|
||||
std::pair<bool, std::vector<target_text> > query(StringPiece source_phrase);
|
||||
std::pair<bool, std::vector<target_text> > query(std::vector<uint64_t> source_phrase);
|
||||
void printTargetInfo(std::vector<target_text> target_phrases);
|
||||
const std::map<unsigned int, std::string> getVocab() const {
|
||||
return decoder.get_target_lookup_map();
|
||||
}
|
||||
|
||||
const std::map<uint64_t, std::string> getSourceVocab() const {
|
||||
return source_vocabids;
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
|
142
moses/TranslationModel/ProbingPT/querying.cpp
Normal file
142
moses/TranslationModel/ProbingPT/querying.cpp
Normal file
@ -0,0 +1,142 @@
|
||||
#include "quering.hh"
|
||||
#include "util/exception.hh"
|
||||
|
||||
using namespace std;
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
|
||||
//Open a binarized ProbingPT phrase table rooted at 'filepath': load the
//source vocabulary and alignments, parse the key/value config file, and
//map the probing hash table into memory (released in the destructor).
QueryEngine::QueryEngine(const char * filepath)
{

  //Create filepaths
  std::string basepath(filepath);
  std::string path_to_config = basepath + "/config";
  std::string path_to_hashtable = basepath + "/probing_hash.dat";
  std::string path_to_source_vocabid = basepath + "/source_vocabids";
  std::string alignPath = basepath + "/Alignments.dat";

  //The config file is written last during binarization, so its absence
  //means the table is missing or the binarization did not complete.
  if (!FileExists(path_to_config)) {
    UTIL_THROW2("Binary table doesn't exist is didn't finish binarizing: " << path_to_config);
  }

  ///Source phrase vocabids
  read_map(source_vocabids, path_to_source_vocabid.c_str());

  // alignments
  read_alignments(alignPath);

  //Read config file: one tab-separated "key\tvalue" pair per line.
  boost::unordered_map<std::string, std::string> keyValue;

  std::ifstream config(path_to_config.c_str());
  std::string line;
  while (getline(config, line)) {
    std::vector<std::string> toks = Tokenize(line, "\t");
    UTIL_THROW_IF2(toks.size() != 2, "Wrong config format:" << line);
    keyValue[ toks[0] ] = toks[1];
  }

  bool found;
  //Check API version: a mismatch means the on-disk layout may differ from
  //what this binary expects, so the table must be rebinarized.
  //NOTE(review): a missing API_VERSION only warns, while a mismatched one
  //exits — confirm the missing-key case is intentionally non-fatal.
  int version;
  found = Get(keyValue, "API_VERSION", version);
  if (!found) {
    std::cerr << "Old or corrupted version of ProbingPT. Please rebinarize your phrase tables." << std::endl;
  }
  else if (version != API_VERSION) {
    std::cerr << "The ProbingPT API has changed. " << version << "!="
              << API_VERSION << " Please rebinarize your phrase tables." << std::endl;
    exit(EXIT_FAILURE);
  }

  //Get tablesize (number of unique source phrases the table was sized for).
  int tablesize;
  found = Get(keyValue, "uniq_entries", tablesize);
  if (!found) {
    std::cerr << "uniq_entries not found" << std::endl;
    exit(EXIT_FAILURE);
  }

  //Number of scores stored per phrase pair.
  found = Get(keyValue, "num_scores", num_scores);
  if (!found) {
    std::cerr << "num_scores not found" << std::endl;
    exit(EXIT_FAILURE);
  }

  //How may scores from lex reordering models
  found = Get(keyValue, "num_lex_scores", num_lex_scores);
  if (!found) {
    std::cerr << "num_lex_scores not found" << std::endl;
    exit(EXIT_FAILURE);
  }

  // have the scores been log() and FloorScore()?
  found = Get(keyValue, "log_prob", logProb);
  if (!found) {
    std::cerr << "logProb not found" << std::endl;
    exit(EXIT_FAILURE);
  }

  config.close();

  //Read hashtable: size it with the same 1.2 load factor used at creation
  //time so the mmap'd bytes line up with the Table's expectations.
  table_filesize = Table::Size(tablesize, 1.2);
  mem = readTable(path_to_hashtable.c_str(), table_filesize);
  Table table_init(mem, table_filesize);
  table = table_init;

  std::cerr << "Initialized successfully! " << std::endl;
}
|
||||
|
||||
//Release the memory-mapped probing hash table.
QueryEngine::~QueryEngine()
{
  //Clear mmap content from memory. 'mem' was obtained via readTable() in
  //the constructor and covers table_filesize bytes.
  munmap(mem, table_filesize);

}
|
||||
|
||||
//Build the hash key for a source phrase (given as an array of vocab ids).
//Thin wrapper that delegates to the free function Moses::getKey so callers
//holding a QueryEngine need not name it themselves.
uint64_t QueryEngine::getKey(uint64_t source_phrase[], size_t size) const
{
  //TOO SLOW
  //uint64_t key = util::MurmurHashNative(&source_phrase[0], source_phrase.size());
  return Moses::getKey(source_phrase, size);
}
|
||||
|
||||
std::pair<bool, uint64_t> QueryEngine::query(uint64_t key)
|
||||
{
|
||||
std::pair<bool, uint64_t> ret;
|
||||
|
||||
const Entry * entry;
|
||||
ret.first = table.Find(key, entry);
|
||||
if (ret.first) {
|
||||
ret.second = entry->value;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
//Load the alignment file written at binarization time into alignColl.
//Each line is tab/space separated: the first token is the slot index in
//alignColl, the remaining tokens are alignment positions appended to that
//slot's vector.
//NOTE(review): if the file cannot be opened, getline simply yields nothing
//and alignColl stays empty — confirm a missing file should be silent.
void QueryEngine::read_alignments(const std::string &alignPath)
{
  std::ifstream strm(alignPath.c_str());

  string line;
  while (getline(strm, line)) {
    vector<string> toks = Tokenize(line, "\t ");
    UTIL_THROW_IF2(toks.size() == 0, "Corrupt alignment file");

    //Grow the collection on demand so sparse or out-of-order indices work.
    uint32_t alignInd = Scan<uint32_t>(toks[0]);
    if (alignInd >= alignColl.size()) {
      alignColl.resize(alignInd + 1);
    }

    //Remaining tokens are the alignment positions for this slot.
    Alignments &aligns = alignColl[alignInd];
    for (size_t i = 1; i < toks.size(); ++i) {
      size_t pos = Scan<size_t>(toks[i]);
      aligns.push_back(pos);
    }
  }
}
|
||||
|
||||
}
|
||||
|
65
moses/TranslationModel/ProbingPT/querying.hh
Normal file
65
moses/TranslationModel/ProbingPT/querying.hh
Normal file
@ -0,0 +1,65 @@
|
||||
#pragma once
|
||||
|
||||
#include <boost/unordered_map.hpp>
|
||||
#include <sys/stat.h> //For finding size of file
|
||||
#include "vocabid.hh"
|
||||
#include <algorithm> //toLower
|
||||
#include <deque>
|
||||
#include "probing_hash_utils.hh"
|
||||
#include "hash.hh" //Includes line splitter
|
||||
#include "line_splitter.hh"
|
||||
#include "moses//Util.h"
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
|
||||
//Read-side interface to a binarized ProbingPT phrase table: owns the
//memory-mapped probing hash table, the source vocabulary, and the
//alignment collection loaded by the constructor.
class QueryEngine
{
  //source vocab id -> surface string, loaded from source_vocabids.
  std::map<uint64_t, std::string> source_vocabids;

  //One alignment set is a flat vector of positions.
  typedef std::vector<unsigned char> Alignments;
  std::vector<Alignments> alignColl;

  Table table;
  char *mem; //Memory for the table, necessary so that we can correctly destroy the object

  size_t table_filesize;
  bool is_reordering;

  //Populate alignColl from the Alignments.dat file.
  void read_alignments(const std::string &alignPath);

public:
  //Values parsed from the table's config file (see constructor).
  int num_scores;
  int num_lex_scores;
  bool logProb;

  QueryEngine(const char *);
  ~QueryEngine();

  //Look up a precomputed key; returns (found, stored value).
  std::pair<bool, uint64_t> query(uint64_t key);

  const std::map<uint64_t, std::string> &getSourceVocab() const
  { return source_vocabids; }

  const std::vector<Alignments> &getAlignments() const
  { return alignColl; }

  //Build the hash key for a source phrase given as an array of vocab ids.
  uint64_t getKey(uint64_t source_phrase[], size_t size) const;

  //Fetch config entry 'sought' from the parsed key/value map, converting
  //the string value to T via Scan<>. Returns false (leaving 'found'
  //untouched) when the key is absent.
  template<typename T>
  inline bool Get(const boost::unordered_map<std::string, std::string> &keyValue, const std::string &sought, T &found) const
  {
    boost::unordered_map<std::string, std::string>::const_iterator iter = keyValue.find(sought);
    if (iter == keyValue.end()) {
      return false;
    }

    const std::string &foundStr = iter->second;
    found = Scan<T>(foundStr);
    return true;
  }

};
|
||||
|
||||
}
|
||||
|
@ -1,161 +1,303 @@
|
||||
#include <sys/stat.h>
|
||||
#include <boost/foreach.hpp>
|
||||
#include "line_splitter.hh"
|
||||
#include "storing.hh"
|
||||
#include "StoreTarget.h"
|
||||
#include "StoreVocab.h"
|
||||
#include "moses/Util.h"
|
||||
#include "moses/InputFileStream.h"
|
||||
|
||||
BinaryFileWriter::BinaryFileWriter (std::string basepath) : os ((basepath + "/binfile.dat").c_str(), std::ios::binary)
|
||||
{
|
||||
binfile.reserve(10000); //Reserve part of the vector to avoid realocation
|
||||
it = binfile.begin();
|
||||
dist_from_start = 0; //Initialize variables
|
||||
extra_counter = 0;
|
||||
}
|
||||
using namespace std;
|
||||
|
||||
void BinaryFileWriter::write (std::vector<unsigned char> * bytes)
|
||||
namespace Moses
|
||||
{
|
||||
binfile.insert(it, bytes->begin(), bytes->end()); //Insert the bytes
|
||||
//Keep track of the offsets
|
||||
it += bytes->size();
|
||||
dist_from_start = distance(binfile.begin(),it);
|
||||
//Flush the vector to disk every once in a while so that we don't consume too much ram
|
||||
if (dist_from_start > 9000) {
|
||||
flush();
|
||||
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
void Node::Add(Table &table, const SourcePhrase &sourcePhrase, size_t pos)
|
||||
{
|
||||
if (pos < sourcePhrase.size()) {
|
||||
uint64_t vocabId = sourcePhrase[pos];
|
||||
|
||||
Node *child;
|
||||
Children::iterator iter = m_children.find(vocabId);
|
||||
if (iter == m_children.end()) {
|
||||
// New node. Write other children then discard them
|
||||
BOOST_FOREACH(Children::value_type &valPair, m_children) {
|
||||
Node &otherChild = valPair.second;
|
||||
otherChild.Write(table);
|
||||
}
|
||||
m_children.clear();
|
||||
|
||||
// create new node
|
||||
child = &m_children[vocabId];
|
||||
assert(!child->done);
|
||||
child->key = key + (vocabId << pos);
|
||||
}
|
||||
else {
|
||||
child = &iter->second;
|
||||
}
|
||||
|
||||
child->Add(table, sourcePhrase, pos + 1);
|
||||
}
|
||||
else {
|
||||
// this node was written previously 'cos it has rules
|
||||
done = true;
|
||||
}
|
||||
}
|
||||
|
||||
void BinaryFileWriter::flush ()
|
||||
void Node::Write(Table &table)
|
||||
{
|
||||
//Cast unsigned char to char before writing...
|
||||
os.write((char *)&binfile[0], dist_from_start);
|
||||
//Clear the vector:
|
||||
binfile.clear();
|
||||
binfile.reserve(10000);
|
||||
extra_counter += dist_from_start; //Keep track of the total number of bytes.
|
||||
it = binfile.begin(); //Reset iterator
|
||||
dist_from_start = distance(binfile.begin(),it); //Reset dist from start
|
||||
//cerr << "START write " << done << " " << key << endl;
|
||||
BOOST_FOREACH(Children::value_type &valPair, m_children) {
|
||||
Node &child = valPair.second;
|
||||
child.Write(table);
|
||||
}
|
||||
|
||||
if (!done) {
|
||||
// save
|
||||
Entry sourceEntry;
|
||||
sourceEntry.value = NONE;
|
||||
sourceEntry.key = key;
|
||||
|
||||
//Put into table
|
||||
table.Insert(sourceEntry);
|
||||
}
|
||||
}
|
||||
|
||||
BinaryFileWriter::~BinaryFileWriter ()
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
void createProbingPT(const std::string &phrasetable_path,
|
||||
const std::string &basepath, int num_scores, int num_lex_scores,
|
||||
bool log_prob, int max_cache_size, bool scfg)
|
||||
{
|
||||
os.close();
|
||||
binfile.clear();
|
||||
}
|
||||
std::cerr << "Starting..." << std::endl;
|
||||
|
||||
void createProbingPT(const char * phrasetable_path, const char * target_path,
|
||||
const char * num_scores, const char * is_reordering)
|
||||
{
|
||||
//Get basepath and create directory if missing
|
||||
std::string basepath(target_path);
|
||||
mkdir(basepath.c_str(), S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH);
|
||||
|
||||
//Set up huffman and serialize decoder maps.
|
||||
Huffman huffmanEncoder(phrasetable_path); //initialize
|
||||
huffmanEncoder.assign_values();
|
||||
huffmanEncoder.produce_lookups();
|
||||
huffmanEncoder.serialize_maps(target_path);
|
||||
StoreTarget storeTarget(basepath);
|
||||
|
||||
//Get uniq lines:
|
||||
unsigned long uniq_entries = huffmanEncoder.getUniqLines();
|
||||
unsigned long uniq_entries = countUniqueSource(phrasetable_path);
|
||||
|
||||
//Source phrase vocabids
|
||||
std::map<uint64_t, std::string> source_vocabids;
|
||||
StoreVocab<uint64_t> sourceVocab(basepath + "/source_vocabids");
|
||||
|
||||
//Read the file
|
||||
util::FilePiece filein(phrasetable_path);
|
||||
util::FilePiece filein(phrasetable_path.c_str());
|
||||
|
||||
//Init the probing hash table
|
||||
size_t size = Table::Size(uniq_entries, 1.2);
|
||||
char * mem = new char[size];
|
||||
memset(mem, 0, size);
|
||||
Table table(mem, size);
|
||||
Table sourceEntries(mem, size);
|
||||
|
||||
BinaryFileWriter binfile(basepath); //Init the binary file writer.
|
||||
|
||||
line_text prev_line; //Check if the source phrase of the previous line is the same
|
||||
std::priority_queue<CacheItem*, std::vector<CacheItem*>, CacheItemOrderer> cache;
|
||||
float totalSourceCount = 0;
|
||||
|
||||
//Keep track of the size of each group of target phrases
|
||||
uint64_t entrystartidx = 0;
|
||||
//uint64_t line_num = 0;
|
||||
|
||||
size_t line_num = 0;
|
||||
|
||||
//Read everything and processs
|
||||
while(true) {
|
||||
std::string prevSource;
|
||||
|
||||
Node sourcePhrases;
|
||||
sourcePhrases.done = true;
|
||||
sourcePhrases.key = 0;
|
||||
|
||||
while (true) {
|
||||
try {
|
||||
//Process line read
|
||||
line_text line;
|
||||
line = splitLine(filein.ReadLine());
|
||||
line = splitLine(filein.ReadLine(), scfg);
|
||||
//cerr << "line=" << line.source_phrase << endl;
|
||||
|
||||
++line_num;
|
||||
if (line_num % 1000000 == 0) {
|
||||
std::cerr << line_num << " " << std::flush;
|
||||
}
|
||||
|
||||
//Add source phrases to vocabularyIDs
|
||||
add_to_map(&source_vocabids, line.source_phrase);
|
||||
add_to_map(sourceVocab, line.source_phrase);
|
||||
|
||||
if ((binfile.dist_from_start + binfile.extra_counter) == 0) {
|
||||
prev_line = line; //For the first iteration assume the previous line is
|
||||
} //The same as this one.
|
||||
|
||||
if (line.source_phrase != prev_line.source_phrase) {
|
||||
if (prevSource.empty()) {
|
||||
// 1st line
|
||||
prevSource = line.source_phrase.as_string();
|
||||
storeTarget.Append(line, log_prob, scfg);
|
||||
}
|
||||
else if (prevSource == line.source_phrase) {
|
||||
//If we still have the same line, just append to it:
|
||||
storeTarget.Append(line, log_prob, scfg);
|
||||
}
|
||||
else {
|
||||
assert(prevSource != line.source_phrase);
|
||||
|
||||
//Create a new entry even
|
||||
|
||||
// save
|
||||
uint64_t targetInd = storeTarget.Save();
|
||||
|
||||
// next line
|
||||
storeTarget.Append(line, log_prob, scfg);
|
||||
|
||||
//Create an entry for the previous source phrase:
|
||||
Entry pesho;
|
||||
pesho.value = entrystartidx;
|
||||
Entry sourceEntry;
|
||||
sourceEntry.value = targetInd;
|
||||
//The key is the sum of hashes of individual words bitshifted by their position in the phrase.
|
||||
//Probably not entirerly correct, but fast and seems to work fine in practise.
|
||||
pesho.key = 0;
|
||||
std::vector<uint64_t> vocabid_source = getVocabIDs(prev_line.source_phrase);
|
||||
for (int i = 0; i < vocabid_source.size(); i++) {
|
||||
pesho.key += (vocabid_source[i] << i);
|
||||
std::vector<uint64_t> vocabid_source = getVocabIDs(prevSource);
|
||||
if (scfg) {
|
||||
// storing prefixes?
|
||||
sourcePhrases.Add(sourceEntries, vocabid_source);
|
||||
}
|
||||
pesho.bytes_toread = binfile.dist_from_start + binfile.extra_counter - entrystartidx;
|
||||
sourceEntry.key = getKey(vocabid_source);
|
||||
|
||||
/*
|
||||
cerr << "prevSource=" << prevSource << flush
|
||||
<< " vocabids=" << Debug(vocabid_source) << flush
|
||||
<< " key=" << sourceEntry.key << endl;
|
||||
*/
|
||||
//Put into table
|
||||
table.Insert(pesho);
|
||||
sourceEntries.Insert(sourceEntry);
|
||||
|
||||
entrystartidx = binfile.dist_from_start + binfile.extra_counter; //Designate start idx for new entry
|
||||
// update cache - CURRENT source phrase, not prev
|
||||
if (max_cache_size) {
|
||||
std::string countStr = line.counts.as_string();
|
||||
countStr = Trim(countStr);
|
||||
if (!countStr.empty()) {
|
||||
std::vector<float> toks = Tokenize<float>(countStr);
|
||||
//cerr << "CACHE:" << line.source_phrase << " " << countStr << " " << toks[1] << endl;
|
||||
|
||||
//Encode a line and write it to disk.
|
||||
std::vector<unsigned char> encoded_line = huffmanEncoder.full_encode_line(line);
|
||||
binfile.write(&encoded_line);
|
||||
if (toks.size() >= 2) {
|
||||
totalSourceCount += toks[1];
|
||||
|
||||
// compute key for CURRENT source
|
||||
std::vector<uint64_t> currVocabidSource = getVocabIDs(line.source_phrase.as_string());
|
||||
uint64_t currKey = getKey(currVocabidSource);
|
||||
|
||||
CacheItem *item = new CacheItem(
|
||||
Trim(line.source_phrase.as_string()),
|
||||
currKey,
|
||||
toks[1]);
|
||||
cache.push(item);
|
||||
|
||||
if (max_cache_size > 0 && cache.size() > max_cache_size) {
|
||||
cache.pop();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//Set prevLine
|
||||
prev_line = line;
|
||||
|
||||
} else {
|
||||
//If we still have the same line, just append to it:
|
||||
std::vector<unsigned char> encoded_line = huffmanEncoder.full_encode_line(line);
|
||||
binfile.write(&encoded_line);
|
||||
prevSource = line.source_phrase.as_string();
|
||||
}
|
||||
|
||||
} catch (util::EndOfFileException e) {
|
||||
std::cerr << "Reading phrase table finished, writing remaining files to disk." << std::endl;
|
||||
binfile.flush();
|
||||
}
|
||||
catch (util::EndOfFileException e) {
|
||||
std::cerr
|
||||
<< "Reading phrase table finished, writing remaining files to disk."
|
||||
<< std::endl;
|
||||
|
||||
//After the final entry is constructed we need to add it to the phrase_table
|
||||
//Create an entry for the previous source phrase:
|
||||
Entry pesho;
|
||||
pesho.value = entrystartidx;
|
||||
uint64_t targetInd = storeTarget.Save();
|
||||
|
||||
Entry sourceEntry;
|
||||
sourceEntry.value = targetInd;
|
||||
|
||||
//The key is the sum of hashes of individual words. Probably not entirerly correct, but fast
|
||||
pesho.key = 0;
|
||||
std::vector<uint64_t> vocabid_source = getVocabIDs(prev_line.source_phrase);
|
||||
for (int i = 0; i < vocabid_source.size(); i++) {
|
||||
pesho.key += (vocabid_source[i] << i);
|
||||
}
|
||||
pesho.bytes_toread = binfile.dist_from_start + binfile.extra_counter - entrystartidx;
|
||||
std::vector<uint64_t> vocabid_source = getVocabIDs(prevSource);
|
||||
sourceEntry.key = getKey(vocabid_source);
|
||||
|
||||
//Put into table
|
||||
table.Insert(pesho);
|
||||
sourceEntries.Insert(sourceEntry);
|
||||
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
serialize_table(mem, size, (basepath + "/probing_hash.dat").c_str());
|
||||
sourcePhrases.Write(sourceEntries);
|
||||
|
||||
serialize_map(&source_vocabids, (basepath + "/source_vocabids").c_str());
|
||||
storeTarget.SaveAlignment();
|
||||
|
||||
serialize_table(mem, size, (basepath + "/probing_hash.dat"));
|
||||
|
||||
sourceVocab.Save();
|
||||
|
||||
serialize_cache(cache, (basepath + "/cache"), totalSourceCount);
|
||||
|
||||
delete[] mem;
|
||||
|
||||
//Write configfile
|
||||
std::ofstream configfile;
|
||||
configfile.open((basepath + "/config").c_str());
|
||||
configfile << API_VERSION << '\n';
|
||||
configfile << uniq_entries << '\n';
|
||||
configfile << num_scores << '\n';
|
||||
configfile << is_reordering << '\n';
|
||||
configfile << "API_VERSION\t" << API_VERSION << '\n';
|
||||
configfile << "uniq_entries\t" << uniq_entries << '\n';
|
||||
configfile << "num_scores\t" << num_scores << '\n';
|
||||
configfile << "num_lex_scores\t" << num_lex_scores << '\n';
|
||||
configfile << "log_prob\t" << log_prob << '\n';
|
||||
configfile.close();
|
||||
}
|
||||
|
||||
size_t countUniqueSource(const std::string &path)
|
||||
{
|
||||
size_t ret = 0;
|
||||
InputFileStream strme(path);
|
||||
|
||||
std::string line, prevSource;
|
||||
while (std::getline(strme, line)) {
|
||||
std::vector<std::string> toks = TokenizeMultiCharSeparator(line, "|||");
|
||||
assert(toks.size() != 0);
|
||||
|
||||
if (prevSource != toks[0]) {
|
||||
prevSource = toks[0];
|
||||
++ret;
|
||||
}
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
void serialize_cache(
|
||||
std::priority_queue<CacheItem*, std::vector<CacheItem*>, CacheItemOrderer> &cache,
|
||||
const std::string &path, float totalSourceCount)
|
||||
{
|
||||
std::vector<const CacheItem*> vec(cache.size());
|
||||
|
||||
size_t ind = cache.size() - 1;
|
||||
while (!cache.empty()) {
|
||||
const CacheItem *item = cache.top();
|
||||
vec[ind] = item;
|
||||
cache.pop();
|
||||
--ind;
|
||||
}
|
||||
|
||||
std::ofstream os(path.c_str());
|
||||
|
||||
os << totalSourceCount << std::endl;
|
||||
for (size_t i = 0; i < vec.size(); ++i) {
|
||||
const CacheItem *item = vec[i];
|
||||
os << item->count << "\t" << item->sourceKey << "\t" << item->source << std::endl;
|
||||
delete item;
|
||||
}
|
||||
|
||||
os.close();
|
||||
}
|
||||
|
||||
uint64_t getKey(const std::vector<uint64_t> &vocabid_source)
|
||||
{
|
||||
return getKey(vocabid_source.data(), vocabid_source.size());
|
||||
}
|
||||
|
||||
std::vector<uint64_t> CreatePrefix(const std::vector<uint64_t> &vocabid_source, size_t endPos)
|
||||
{
|
||||
assert(endPos < vocabid_source.size());
|
||||
|
||||
std::vector<uint64_t> ret(endPos + 1);
|
||||
for (size_t i = 0; i <= endPos; ++i) {
|
||||
ret[i] = vocabid_source[i];
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
@ -1,36 +1,95 @@
|
||||
#pragma once
|
||||
|
||||
#include <boost/unordered_set.hpp>
|
||||
#include <boost/unordered_map.hpp>
|
||||
#include <cstdio>
|
||||
#include <sstream>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
#include <queue>
|
||||
#include <sys/stat.h> //mkdir
|
||||
|
||||
#include "hash.hh" //Includes line_splitter
|
||||
#include "probing_hash_utils.hh"
|
||||
#include "huffmanish.hh"
|
||||
#include <sys/stat.h> //mkdir
|
||||
|
||||
#include "util/file_piece.hh"
|
||||
#include "util/file.hh"
|
||||
#include "vocabid.hh"
|
||||
#define API_VERSION 3
|
||||
|
||||
void createProbingPT(const char * phrasetable_path, const char * target_path,
|
||||
const char * num_scores, const char * is_reordering);
|
||||
|
||||
class BinaryFileWriter
|
||||
namespace Moses
|
||||
{
|
||||
std::vector<unsigned char> binfile;
|
||||
std::vector<unsigned char>::iterator it;
|
||||
//Output binary
|
||||
std::ofstream os;
|
||||
typedef std::vector<uint64_t> SourcePhrase;
|
||||
|
||||
|
||||
class Node
|
||||
{
|
||||
typedef boost::unordered_map<uint64_t, Node> Children;
|
||||
Children m_children;
|
||||
|
||||
public:
|
||||
unsigned int dist_from_start; //Distance from the start of the vector.
|
||||
uint64_t extra_counter; //After we reset the counter, we still want to keep track of the correct offset, so
|
||||
uint64_t key;
|
||||
bool done;
|
||||
|
||||
BinaryFileWriter (std::string);
|
||||
~BinaryFileWriter ();
|
||||
void write (std::vector<unsigned char> * bytes);
|
||||
void flush (); //Flush to disk
|
||||
Node()
|
||||
:done(false)
|
||||
{}
|
||||
|
||||
void Add(Table &table, const SourcePhrase &sourcePhrase, size_t pos = 0);
|
||||
void Write(Table &table);
|
||||
};
|
||||
|
||||
|
||||
void createProbingPT(const std::string &phrasetable_path,
|
||||
const std::string &basepath, int num_scores, int num_lex_scores,
|
||||
bool log_prob, int max_cache_size, bool scfg);
|
||||
uint64_t getKey(const std::vector<uint64_t> &source_phrase);
|
||||
|
||||
std::vector<uint64_t> CreatePrefix(const std::vector<uint64_t> &vocabid_source, size_t endPos);
|
||||
|
||||
template<typename T>
|
||||
std::string Debug(const std::vector<T> &vec)
|
||||
{
|
||||
std::stringstream strm;
|
||||
for (size_t i = 0; i < vec.size(); ++i) {
|
||||
strm << vec[i] << " ";
|
||||
}
|
||||
return strm.str();
|
||||
}
|
||||
|
||||
size_t countUniqueSource(const std::string &path);
|
||||
|
||||
class CacheItem
|
||||
{
|
||||
public:
|
||||
std::string source;
|
||||
uint64_t sourceKey;
|
||||
float count;
|
||||
CacheItem(const std::string &vSource, uint64_t vSourceKey, float vCount)
|
||||
:source(vSource)
|
||||
,sourceKey(vSourceKey)
|
||||
,count(vCount)
|
||||
{
|
||||
}
|
||||
|
||||
bool operator<(const CacheItem &other) const
|
||||
{
|
||||
return count > other.count;
|
||||
}
|
||||
};
|
||||
|
||||
class CacheItemOrderer
|
||||
{
|
||||
public:
|
||||
bool operator()(const CacheItem* a, const CacheItem* b) const
|
||||
{
|
||||
return (*a) < (*b);
|
||||
}
|
||||
};
|
||||
|
||||
void serialize_cache(
|
||||
std::priority_queue<CacheItem*, std::vector<CacheItem*>, CacheItemOrderer> &cache,
|
||||
const std::string &path, float totalSourceCount);
|
||||
|
||||
}
|
||||
|
||||
|
@ -1,32 +1,59 @@
|
||||
#include <boost/foreach.hpp>
|
||||
#include "vocabid.hh"
|
||||
#include "StoreVocab.h"
|
||||
#include "moses/Util.h"
|
||||
|
||||
void add_to_map(std::map<uint64_t, std::string> *karta, StringPiece textin)
|
||||
namespace Moses
|
||||
{
|
||||
|
||||
void add_to_map(StoreVocab<uint64_t> &sourceVocab,
|
||||
const StringPiece &textin)
|
||||
{
|
||||
//Tokenize
|
||||
util::TokenIter<util::SingleCharacter> it(textin, util::SingleCharacter(' '));
|
||||
util::TokenIter<util::SingleCharacter> itWord(textin, util::SingleCharacter(' '));
|
||||
|
||||
while(it) {
|
||||
karta->insert(std::pair<uint64_t, std::string>(getHash(*it), it->as_string()));
|
||||
it++;
|
||||
while (itWord) {
|
||||
StringPiece word = *itWord;
|
||||
|
||||
util::TokenIter<util::SingleCharacter> itFactor(word, util::SingleCharacter('|'));
|
||||
while (itFactor) {
|
||||
StringPiece factor = *itFactor;
|
||||
|
||||
sourceVocab.Insert(getHash(factor), factor.as_string());
|
||||
itFactor++;
|
||||
}
|
||||
itWord++;
|
||||
}
|
||||
}
|
||||
|
||||
void serialize_map(std::map<uint64_t, std::string> *karta, const char* filename)
|
||||
void serialize_map(const std::map<uint64_t, std::string> &karta,
|
||||
const std::string &filename)
|
||||
{
|
||||
std::ofstream os (filename, std::ios::binary);
|
||||
boost::archive::text_oarchive oarch(os);
|
||||
std::ofstream os(filename.c_str());
|
||||
|
||||
std::map<uint64_t, std::string>::const_iterator iter;
|
||||
for (iter = karta.begin(); iter != karta.end(); ++iter) {
|
||||
os << iter->first << '\t' << iter->second << std::endl;
|
||||
}
|
||||
|
||||
oarch << *karta; //Serialise map
|
||||
os.close();
|
||||
}
|
||||
|
||||
void read_map(std::map<uint64_t, std::string> *karta, const char* filename)
|
||||
void read_map(std::map<uint64_t, std::string> &karta, const char* filename)
|
||||
{
|
||||
std::ifstream is (filename, std::ios::binary);
|
||||
boost::archive::text_iarchive iarch(is);
|
||||
std::ifstream is(filename);
|
||||
|
||||
iarch >> *karta;
|
||||
std::string line;
|
||||
while (getline(is, line)) {
|
||||
std::vector<std::string> toks = Tokenize(line, "\t");
|
||||
assert(toks.size() == 2);
|
||||
uint64_t ind = Scan<uint64_t>(toks[1]);
|
||||
karta[ind] = toks[0];
|
||||
}
|
||||
|
||||
//Close the stream after we are done.
|
||||
is.close();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
@ -13,8 +13,17 @@
|
||||
#include "util/string_piece.hh" //Tokenization and work with StringPiece
|
||||
#include "util/tokenize_piece.hh"
|
||||
|
||||
void add_to_map(std::map<uint64_t, std::string> *karta, StringPiece textin);
|
||||
namespace Moses
|
||||
{
|
||||
template<typename VOCABID>
|
||||
class StoreVocab;
|
||||
|
||||
void serialize_map(std::map<uint64_t, std::string> *karta, const char* filename);
|
||||
void add_to_map(StoreVocab<uint64_t> &sourceVocab,
|
||||
const StringPiece &textin);
|
||||
|
||||
void read_map(std::map<uint64_t, std::string> *karta, const char* filename);
|
||||
void serialize_map(const std::map<uint64_t, std::string> &karta,
|
||||
const std::string &filename);
|
||||
|
||||
void read_map(std::map<uint64_t, std::string> &karta, const char* filename);
|
||||
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user