update Moses::ProbingPT with Moses2::ProbingPT. Does not compile

Hieu Hoang 2016-10-03 19:02:06 +01:00
parent 34e0ac2672
commit 3a72b4958a
20 changed files with 845 additions and 960 deletions


@@ -1319,7 +1319,7 @@
<name>FF/PhraseBoundaryFeature.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/PhraseBoundaryFeature.h</locationURI>
</link>
<link>
<name>FF/PhraseDistanceFeature.cpp</name>
<type>1</type>
@@ -3340,6 +3340,26 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/ProbingPT.h</locationURI>
</link>
<link>
<name>TranslationModel/ProbingPT/StoreTarget.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/StoreTarget.cpp</locationURI>
</link>
<link>
<name>TranslationModel/ProbingPT/StoreTarget.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/StoreTarget.h</locationURI>
</link>
<link>
<name>TranslationModel/ProbingPT/StoreVocab.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/StoreVocab.cpp</locationURI>
</link>
<link>
<name>TranslationModel/ProbingPT/StoreVocab.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/StoreVocab.h</locationURI>
</link>
<link>
<name>TranslationModel/ProbingPT/hash.cpp</name>
<type>1</type>
@@ -3350,16 +3370,6 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/hash.hh</locationURI>
</link>
<link>
<name>TranslationModel/ProbingPT/huffmanish.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/huffmanish.cpp</locationURI>
</link>
<link>
<name>TranslationModel/ProbingPT/huffmanish.hh</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/huffmanish.hh</locationURI>
</link>
<link>
<name>TranslationModel/ProbingPT/line_splitter.cpp</name>
<type>1</type>
@@ -3664,7 +3674,7 @@
<name>TranslationModel/UG/sapt_pscore_coherence.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/sapt_pscore_coherence.h</locationURI>
</link>
<link>
<name>TranslationModel/UG/sapt_pscore_lex1.h</name>
<type>1</type>
@@ -3709,7 +3719,7 @@
<name>TranslationModel/UG/sapt_pscore_wordcount.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/sapt_pscore_wordcount.h</locationURI>
</link>
<link>
<name>TranslationModel/UG/sim-pe.cc</name>
<type>1</type>


@@ -1,29 +1,113 @@
#include <string>
#include <boost/program_options.hpp>
#include "util/usage.hh"
#include "moses/TranslationModel/ProbingPT/storing.hh"
#include "moses/InputFileStream.h"
#include "moses/OutputFileStream.h"
#include "moses/Util.h"
using namespace std;
std::string ReformatSCFGFile(const std::string &path);
int main(int argc, char* argv[])
{
string inPath, outPath;
int num_scores = 4;
int num_lex_scores = 0;
bool log_prob = false;
bool scfg = false;
int max_cache_size = 50000;
const char * is_reordering = "false";
namespace po = boost::program_options;
po::options_description desc("Options");
desc.add_options()
("help", "Print help messages")
("input-pt", po::value<string>()->required(), "Text pt")
("output-dir", po::value<string>()->required(), "Directory when binary files will be written")
("num-scores", po::value<int>()->default_value(num_scores), "Number of pt scores")
("num-lex-scores", po::value<int>()->default_value(num_lex_scores), "Number of lexicalized reordering scores")
("log-prob", "log (and floor) probabilities before storing")
("max-cache-size", po::value<int>()->default_value(max_cache_size), "Maximum number of high-count source lines to write to cache file. 0=no cache, negative=no limit")
("scfg", "Rules are SCFG in Moses format (ie. with non-terms and LHS")
if (!(argc == 5 || argc == 4)) {
// Tell the user how to run the program
std::cerr << "Provided " << argc << " arguments, needed 4 or 5." << std::endl;
std::cerr << "Usage: " << argv[0] << " path_to_phrasetable output_dir num_scores is_reordering" << std::endl;
std::cerr << "is_reordering should be either true or false, but it is currently a stub feature." << std::endl;
//std::cerr << "Usage: " << argv[0] << " path_to_phrasetable number_of_uniq_lines output_bin_file output_hash_table output_vocab_id" << std::endl;
return 1;
}
po::variables_map vm;
try {
po::store(po::parse_command_line(argc, argv, desc),
vm); // can throw
/** --help option
*/
if ( vm.count("help")) {
std::cout << desc << std::endl;
return EXIT_SUCCESS;
}
po::notify(vm); // throws on error, so do after help in case
// there are any problems
} catch(po::error& e) {
std::cerr << "ERROR: " << e.what() << std::endl << std::endl;
std::cerr << desc << std::endl;
return EXIT_FAILURE;
}
if (argc == 5) {
is_reordering = argv[4];
}
if (vm.count("input-pt")) inPath = vm["input-pt"].as<string>();
if (vm.count("output-dir")) outPath = vm["output-dir"].as<string>();
if (vm.count("num-scores")) num_scores = vm["num-scores"].as<int>();
if (vm.count("num-lex-scores")) num_lex_scores = vm["num-lex-scores"].as<int>();
if (vm.count("max-cache-size")) max_cache_size = vm["max-cache-size"].as<int>();
if (vm.count("log-prob")) log_prob = true;
if (vm.count("scfg")) scfg = true;
if (scfg) {
inPath = ReformatSCFGFile(inPath);
}
createProbingPT(argv[1], argv[2], argv[3], is_reordering);
Moses::createProbingPT(inPath, outPath, num_scores, num_lex_scores, log_prob, max_cache_size, scfg);
util::PrintUsage(std::cout);
//util::PrintUsage(std::cout);
return 0;
}
std::string ReformatSCFGFile(const std::string &path)
{
Moses::InputFileStream inFile(path);
string reformattedPath = path + ".reformat.gz";
Moses::OutputFileStream outFile(reformattedPath);
string line;
while (getline(inFile, line)) {
vector<string> toks = Moses::TokenizeMultiCharSeparator(line, "|||");
assert(toks.size() >= 3);
// source
vector<string> sourceToks = Moses::Tokenize(toks[0], " ");
for (size_t i = 0; i < sourceToks.size() - 1; ++i) {
outFile << sourceToks[i] << " ";
}
// other columns
for (size_t i = 1; i < toks.size(); ++i) {
outFile << "|||" << toks[i];
}
outFile << endl;
}
inFile.Close();
outFile.Close();
string sortedPath = path + ".reformat.sorted.gz";
string tmpPath = path + ".tmp";
string cmd = "mkdir " + tmpPath
+ " && gzip -dc " + reformattedPath + " | LC_ALL=C sort -T " + tmpPath + " | gzip -c > " + sortedPath;
system(cmd.c_str());
cmd = "rm -rf " + tmpPath + " " + reformattedPath;
system(cmd.c_str());
return sortedPath;
}
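As an editorial illustration of what ReformatSCFGFile does (the rule below is invented, but follows the Moses SCFG layout the code assumes): the final source token, the LHS non-terminal, is dropped, and every other column passes through unchanged before the external sort.

// input:  [X][X] hat [X][X] [X] ||| [X][X] has [X][X] [X] ||| 0.4 0.3 ||| 0-0 2-2
// output: [X][X] hat [X][X] ||| [X][X] has [X][X] [X] ||| 0.4 0.3 ||| 0-0 2-2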


@@ -31,9 +31,9 @@ else {
}
exe CreateProbingPT : CreateProbingPT.cpp ..//boost_filesystem ../moses//moses ;
exe QueryProbingPT : QueryProbingPT.cpp ..//boost_filesystem ../moses//moses ;
#exe QueryProbingPT : QueryProbingPT.cpp ..//boost_filesystem ../moses//moses ;
alias programsProbing : CreateProbingPT QueryProbingPT ;
alias programsProbing : CreateProbingPT ; #QueryProbingPT
exe merge-sorted :
merge-sorted.cc


@@ -34,7 +34,7 @@ int main(int argc, char* argv[])
return 1;
}
QueryEngine queries(argv[1]);
Moses::QueryEngine queries(argv[1]);
//Interactive search
std::cout << "Please enter a string to be searched, or exit to exit." << std::endl;


@@ -3,6 +3,7 @@
#include "moses/StaticData.h"
#include "moses/FactorCollection.h"
#include "moses/TargetPhraseCollection.h"
#include "moses/InputFileStream.h"
#include "moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerSkeleton.h"
#include "quering.hh"
@@ -34,44 +35,94 @@ void ProbingPT::Load(AllOptions::ptr const& opts)
m_unkId = 456456546456;
FactorCollection &vocab = FactorCollection::Instance();
// source vocab
const std::map<uint64_t, std::string> &sourceVocab = m_engine->getSourceVocab();
const std::map<uint64_t, std::string> &sourceVocab =
m_engine->getSourceVocab();
std::map<uint64_t, std::string>::const_iterator iterSource;
for (iterSource = sourceVocab.begin(); iterSource != sourceVocab.end(); ++iterSource) {
const string &wordStr = iterSource->second;
const Factor *factor = FactorCollection::Instance().AddFactor(wordStr);
for (iterSource = sourceVocab.begin(); iterSource != sourceVocab.end();
++iterSource) {
string wordStr = iterSource->second;
//cerr << "wordStr=" << wordStr << endl;
const Factor *factor = vocab.AddFactor(wordStr);
uint64_t probingId = iterSource->first;
size_t factorId = factor->GetId();
SourceVocabMap::value_type entry(factor, probingId);
m_sourceVocabMap.insert(entry);
if (factorId >= m_sourceVocab.size()) {
m_sourceVocab.resize(factorId + 1, m_unkId);
}
m_sourceVocab[factorId] = probingId;
}
// target vocab
const std::map<unsigned int, std::string> &probingVocab = m_engine->getVocab();
std::map<unsigned int, std::string>::const_iterator iter;
for (iter = probingVocab.begin(); iter != probingVocab.end(); ++iter) {
const string &wordStr = iter->second;
const Factor *factor = FactorCollection::Instance().AddFactor(wordStr);
InputFileStream targetVocabStrme(m_filePath + "/TargetVocab.dat");
string line;
while (getline(targetVocabStrme, line)) {
vector<string> toks = Tokenize(line, "\t");
UTIL_THROW_IF2(toks.size() != 2, string("Incorrect format:") + line + "\n");
unsigned int probingId = iter->first;
//cerr << "wordStr=" << toks[0] << endl;
TargetVocabMap::value_type entry(factor, probingId);
m_vocabMap.insert(entry);
const Factor *factor = vocab.AddFactor(toks[0]);
uint32_t probingId = Scan<uint32_t>(toks[1]);
if (probingId >= m_targetVocab.size()) {
m_targetVocab.resize(probingId + 1);
}
m_targetVocab[probingId] = factor;
}
// alignments
CreateAlignmentMap(m_filePath + "/Alignments.dat");
// memory mapped file to tps
string filePath = m_filePath + "/TargetColl.dat";
file.open(filePath.c_str());
if (!file.is_open()) {
throw "Couldn't open file ";
}
data = file.data();
//size_t size = file.size();
// cache
//CreateCache(system);
}
void ProbingPT::CreateAlignmentMap(const std::string path)
{
const std::vector< std::vector<unsigned char> > &probingAlignColl = m_engine->getAlignments();
m_aligns.resize(probingAlignColl.size(), NULL);
for (size_t i = 0; i < probingAlignColl.size(); ++i) {
AlignmentInfo::CollType aligns;
const std::vector<unsigned char> &probingAligns = probingAlignColl[i];
for (size_t j = 0; j < probingAligns.size(); j += 2) {
size_t startPos = probingAligns[j];
size_t endPos = probingAligns[j+1];
//cerr << "startPos=" << startPos << " " << endPos << endl;
aligns.insert(std::pair<size_t,size_t>(startPos, endPos));
}
const AlignmentInfo *align = AlignmentInfoCollection::Instance().Add(aligns);
m_aligns[i] = align;
//cerr << "align=" << align->Debug(system) << endl;
}
}
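To make the alignment layout concrete, a hedged sketch (the byte values are invented): each entry of the probing alignment collection is a flat byte array of (source, target) pairs, so {0, 1, 2, 3} decodes to the two alignment points 0-1 and 2-3, exactly as the loop above does.

std::vector<unsigned char> probingAligns;
probingAligns.push_back(0); probingAligns.push_back(1); // point 0-1
probingAligns.push_back(2); probingAligns.push_back(3); // point 2-3
AlignmentInfo::CollType aligns;
for (size_t j = 0; j < probingAligns.size(); j += 2) {
  aligns.insert(std::pair<size_t, size_t>(probingAligns[j], probingAligns[j + 1]));
}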
void ProbingPT::InitializeForInput(ttasksptr const& ttask)
{
ReduceCache();
}
void ProbingPT::GetTargetPhraseCollectionBatch(const InputPathList &inputPathQueue) const
{
CacheColl &cache = GetCache();
InputPathList::const_iterator iter;
for (iter = inputPathQueue.begin(); iter != inputPathQueue.end(); ++iter) {
InputPath &inputPath = **iter;
@@ -82,12 +133,6 @@ void ProbingPT::GetTargetPhraseCollectionBatch(const InputPathList &inputPathQue
}
TargetPhraseCollection::shared_ptr tpColl = CreateTargetPhrase(sourcePhrase);
// add target phrase to phrase-table cache
size_t hash = hash_value(sourcePhrase);
std::pair<TargetPhraseCollection::shared_ptr , clock_t> value(tpColl, clock());
cache[hash] = value;
inputPath.SetTargetPhrases(*this, tpColl, NULL);
}
}


@@ -1,17 +1,17 @@
#pragma once
#include <boost/iostreams/device/mapped_file.hpp>
#include <boost/bimap.hpp>
#include "../PhraseDictionary.h"
class QueryEngine;
class target_text;
namespace Moses
{
class ChartParser;
class ChartCellCollectionBase;
class ChartRuleLookupManager;
class QueryEngine;
class target_text;
class ProbingPT : public PhraseDictionary
{
@@ -39,12 +39,16 @@
protected:
QueryEngine *m_engine;
uint64_t m_unkId;
typedef boost::bimap<const Factor *, uint64_t> SourceVocabMap;
mutable SourceVocabMap m_sourceVocabMap;
std::vector<uint64_t> m_sourceVocab; // factor id -> pt id
std::vector<const Factor*> m_targetVocab; // pt id -> factor*
std::vector<const AlignmentInfo*> m_aligns;
typedef boost::bimap<const Factor *, unsigned int> TargetVocabMap;
mutable TargetVocabMap m_vocabMap;
boost::iostreams::mapped_file_source file;
const char *data;
void CreateAlignmentMap(const std::string path);
TargetPhraseCollection::shared_ptr CreateTargetPhrase(const Phrase &sourcePhrase) const;
TargetPhrase *CreateTargetPhrase(const Phrase &sourcePhrase, const target_text &probingTargetPhrase) const;
@@ -53,7 +57,6 @@ protected:
std::vector<uint64_t> ConvertToProbingSourcePhrase(const Phrase &sourcePhrase, bool &ok) const;
uint64_t m_unkId;
};
} // namespace Moses


@@ -1,5 +1,11 @@
#include <iostream>
#include "hash.hh"
using namespace std;
namespace Moses
{
uint64_t getHash(StringPiece text)
{
std::size_t len = text.size();
@@ -7,24 +13,32 @@ uint64_t getHash(StringPiece text)
return key;
}
std::vector<uint64_t> getVocabIDs(StringPiece textin)
std::vector<uint64_t> getVocabIDs(const StringPiece &textin)
{
//Tokenize
std::vector<uint64_t> output;
util::TokenIter<util::SingleCharacter> it(textin, util::SingleCharacter(' '));
util::TokenIter<util::SingleCharacter> itWord(textin, util::SingleCharacter(' '));
while(it) {
output.push_back(getHash(*it));
it++;
while (itWord) {
StringPiece word = *itWord;
uint64_t id = 0;
util::TokenIter<util::SingleCharacter> itFactor(word, util::SingleCharacter('|'));
while (itFactor) {
StringPiece factor = *itFactor;
//cerr << "factor=" << factor << endl;
id += getHash(factor);
itFactor++;
}
output.push_back(id);
itWord++;
}
return output;
}
uint64_t getVocabID(std::string candidate)
{
std::size_t len = candidate.length();
uint64_t key = util::MurmurHashNative(candidate.c_str(), len);
return key;
}
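A usage sketch of the factored hashing above ("haus|NN" is an invented factored token): a word's vocab id is the sum of the MurmurHash values of its '|'-separated factors, so factored and unfactored tables hash consistently.

std::vector<uint64_t> ids = Moses::getVocabIDs("das haus|NN");
// ids[0] == getHash("das")
// ids[1] == getHash("haus") + getHash("NN")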


@@ -6,9 +6,12 @@
#include "util/tokenize_piece.hh"
#include <vector>
namespace Moses
{
//Gets the MurmurHash for the given string
uint64_t getHash(StringPiece text);
std::vector<uint64_t> getVocabIDs(StringPiece textin);
std::vector<uint64_t> getVocabIDs(const StringPiece &textin);
uint64_t getVocabID(std::string candidate);
}


@@ -1,451 +0,0 @@
#include "huffmanish.hh"
Huffman::Huffman (const char * filepath)
{
//Read the file
util::FilePiece filein(filepath);
//Init uniq_lines to zero;
uniq_lines = 0;
line_text prev_line; //Check for unique lines.
int num_lines = 0 ;
while (true) {
line_text new_line;
num_lines++;
try {
//Process line read
new_line = splitLine(filein.ReadLine());
count_elements(new_line); //Counts the number of elements, adds new ones and increments counters.
} catch (util::EndOfFileException e) {
std::cerr << "Unique entries counted: ";
break;
}
if (new_line.source_phrase == prev_line.source_phrase) {
continue;
} else {
uniq_lines++;
prev_line = new_line;
}
}
std::cerr << uniq_lines << std::endl;
}
void Huffman::count_elements(line_text linein)
{
//For target phrase:
util::TokenIter<util::SingleCharacter> it(linein.target_phrase, util::SingleCharacter(' '));
while (it) {
//Check if we have that entry
std::map<std::string, unsigned int>::iterator mapiter;
mapiter = target_phrase_words.find(it->as_string());
if (mapiter != target_phrase_words.end()) {
//If the element is found, increment the count.
mapiter->second++;
} else {
//Else create a new entry;
target_phrase_words.insert(std::pair<std::string, unsigned int>(it->as_string(), 1));
}
it++;
}
//For word alignment 1
std::map<std::vector<unsigned char>, unsigned int>::iterator mapiter3;
std::vector<unsigned char> numbers = splitWordAll1(linein.word_align);
mapiter3 = word_all1.find(numbers);
if (mapiter3 != word_all1.end()) {
//If the element is found, increment the count.
mapiter3->second++;
} else {
//Else create a new entry;
word_all1.insert(std::pair<std::vector<unsigned char>, unsigned int>(numbers, 1));
}
}
//Assigns huffman values for each unique element
void Huffman::assign_values()
{
//First create vectors for all maps so that we could sort them later.
//Create a vector for target phrases
for(std::map<std::string, unsigned int>::iterator it = target_phrase_words.begin(); it != target_phrase_words.end(); it++ ) {
target_phrase_words_counts.push_back(*it);
}
//Sort it
std::sort(target_phrase_words_counts.begin(), target_phrase_words_counts.end(), sort_pair());
//Create a vector for word alignments 1
for(std::map<std::vector<unsigned char>, unsigned int>::iterator it = word_all1.begin(); it != word_all1.end(); it++ ) {
word_all1_counts.push_back(*it);
}
//Sort it
std::sort(word_all1_counts.begin(), word_all1_counts.end(), sort_pair_vec());
//Afterwards we assign a value for each phrase, starting from 1, as zero is reserved for delimiter
unsigned int i = 1; //huffman code
for(std::vector<std::pair<std::string, unsigned int> >::iterator it = target_phrase_words_counts.begin();
it != target_phrase_words_counts.end(); it++) {
target_phrase_huffman.insert(std::pair<std::string, unsigned int>(it->first, i));
i++; //Go to the next huffman code
}
i = 1; //Reset i for the next map
for(std::vector<std::pair<std::vector<unsigned char>, unsigned int> >::iterator it = word_all1_counts.begin();
it != word_all1_counts.end(); it++) {
word_all1_huffman.insert(std::pair<std::vector<unsigned char>, unsigned int>(it->first, i));
i++; //Go to the next huffman code
}
//After lookups are produced, clear some memory usage of objects not needed anymore.
target_phrase_words.clear();
word_all1.clear();
target_phrase_words_counts.clear();
word_all1_counts.clear();
std::cerr << "Finished generating huffman codes." << std::endl;
}
void Huffman::serialize_maps(const char * dirname)
{
//Note that directory name should exist.
std::string basedir(dirname);
std::string target_phrase_path(basedir + "/target_phrases");
std::string probabilities_path(basedir + "/probs");
std::string word_all1_path(basedir + "/Wall1");
//Target phrase
std::ofstream os (target_phrase_path.c_str(), std::ios::binary);
boost::archive::text_oarchive oarch(os);
oarch << lookup_target_phrase;
os.close();
//Word all1
std::ofstream os2 (word_all1_path.c_str(), std::ios::binary);
boost::archive::text_oarchive oarch2(os2);
oarch2 << lookup_word_all1;
os2.close();
}
std::vector<unsigned char> Huffman::full_encode_line(line_text line)
{
return vbyte_encode_line((encode_line(line)));
}
std::vector<unsigned int> Huffman::encode_line(line_text line)
{
std::vector<unsigned int> retvector;
//Get target_phrase first.
util::TokenIter<util::SingleCharacter> it(line.target_phrase, util::SingleCharacter(' '));
while (it) {
retvector.push_back(target_phrase_huffman.find(it->as_string())->second);
it++;
}
//Add a zero;
retvector.push_back(0);
//Get probabilities. Reinterpreting the float bytes as unsigned int.
util::TokenIter<util::SingleCharacter> probit(line.prob, util::SingleCharacter(' '));
while (probit) {
//Sometimes we have too big floats to handle, so first convert to double
double tempnum = atof(probit->data());
float num = (float)tempnum;
retvector.push_back(reinterpret_float(&num));
probit++;
}
//Add a zero;
retvector.push_back(0);
//Get word alignments
retvector.push_back(word_all1_huffman.find(splitWordAll1(line.word_align))->second);
retvector.push_back(0);
return retvector;
}
void Huffman::produce_lookups()
{
//basically invert every map that we have
for(std::map<std::string, unsigned int>::iterator it = target_phrase_huffman.begin(); it != target_phrase_huffman.end(); it++ ) {
lookup_target_phrase.insert(std::pair<unsigned int, std::string>(it->second, it->first));
}
for(std::map<std::vector<unsigned char>, unsigned int>::iterator it = word_all1_huffman.begin(); it != word_all1_huffman.end(); it++ ) {
lookup_word_all1.insert(std::pair<unsigned int, std::vector<unsigned char> >(it->second, it->first));
}
}
HuffmanDecoder::HuffmanDecoder (const char * dirname)
{
//Read the maps from disk
//Note that directory name should exist.
std::string basedir(dirname);
std::string target_phrase_path(basedir + "/target_phrases");
std::string word_all1_path(basedir + "/Wall1");
//Target phrases
std::ifstream is (target_phrase_path.c_str(), std::ios::binary);
boost::archive::text_iarchive iarch(is);
iarch >> lookup_target_phrase;
is.close();
//Word alignment 1
std::ifstream is2 (word_all1_path.c_str(), std::ios::binary);
boost::archive::text_iarchive iarch2(is2);
iarch2 >> lookup_word_all1;
is2.close();
}
HuffmanDecoder::HuffmanDecoder (std::map<unsigned int, std::string> * lookup_target,
std::map<unsigned int, std::vector<unsigned char> > * lookup_word1)
{
lookup_target_phrase = *lookup_target;
lookup_word_all1 = *lookup_word1;
}
std::vector<target_text> HuffmanDecoder::full_decode_line (std::vector<unsigned char> lines, int num_scores)
{
std::vector<target_text> retvector; //All target phrases
std::vector<unsigned int> decoded_lines = vbyte_decode_line(lines); //All decoded lines
std::vector<unsigned int>::iterator it = decoded_lines.begin(); //Iterator for them
std::vector<unsigned int> current_target_phrase; //Current target phrase decoded
short zero_count = 0; //Count how many zeroes we have met so far. Every 3 zeroes mean a new target phrase.
while(it != decoded_lines.end()) {
if (zero_count == 1) {
//We are extracting scores. we know how many scores there are so we can push them
//to the vector. This is done in case any of the scores is 0, because it would mess
//up the state machine.
for (int i = 0; i < num_scores; i++) {
current_target_phrase.push_back(*it);
it++;
}
}
if (zero_count == 3) {
//We have finished with this entry, decode it, and add it to the retvector.
retvector.push_back(decode_line(current_target_phrase, num_scores));
current_target_phrase.clear(); //Clear the current target phrase and the zero_count
zero_count = 0; //So that we can reuse them for the next target phrase
}
//Add to the next target_phrase, number by number.
current_target_phrase.push_back(*it);
if (*it == 0) {
zero_count++;
}
it++; //Go to the next word/symbol
}
//Don't forget the last remaining line!
if (zero_count == 3) {
//We have finished with this entry, decode it, and add it to the retvector.
retvector.push_back(decode_line(current_target_phrase, num_scores));
current_target_phrase.clear(); //Clear the current target phrase and the zero_count
zero_count = 0; //So that we can reuse them for the next target phrase
}
return retvector;
}
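The zero-delimited record format that full_decode_line walks can be pictured as follows (editorial sketch; w, s and a stand for target-word codes, raw score words and a word-alignment code):

// one encoded target entry, assuming num_scores == 2:
//   w1 w2 0   s1 s2 0   a 0
// after the first zero the score block is read by count rather than by
// delimiter, since a score word may legitimately be 0; the third zero
// closes the entry and resets the state machine.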
target_text HuffmanDecoder::decode_line (std::vector<unsigned int> input, int num_scores)
{
//demo decoder
target_text ret;
//Split everything
std::vector<unsigned int> target_phrase;
std::vector<unsigned int> probs;
unsigned int wAll;
//Split the line into the proper arrays
short num_zeroes = 0;
int counter = 0;
while (num_zeroes < 3) {
unsigned int num = input[counter];
if (num == 0) {
num_zeroes++;
} else if (num_zeroes == 0) {
target_phrase.push_back(num);
} else if (num_zeroes == 1) {
//Push exactly num_scores scores
for (int i = 0; i < num_scores; i++) {
probs.push_back(num);
counter++;
num = input[counter];
}
continue;
} else if (num_zeroes == 2) {
wAll = num;
}
counter++;
}
ret.target_phrase = target_phrase;
ret.word_all1 = lookup_word_all1.find(wAll)->second;
//Decode probabilities
for (std::vector<unsigned int>::iterator it = probs.begin(); it != probs.end(); it++) {
ret.prob.push_back(reinterpret_uint(&(*it)));
}
return ret;
}
inline std::string HuffmanDecoder::getTargetWordFromID(unsigned int id)
{
return lookup_target_phrase.find(id)->second;
}
std::string HuffmanDecoder::getTargetWordsFromIDs(std::vector<unsigned int> ids)
{
std::string returnstring;
for (std::vector<unsigned int>::iterator it = ids.begin(); it != ids.end(); it++) {
returnstring.append(getTargetWordFromID(*it) + " ");
}
return returnstring;
}
inline std::string getTargetWordFromID(unsigned int id, std::map<unsigned int, std::string> * lookup_target_phrase)
{
return lookup_target_phrase->find(id)->second;
}
std::string getTargetWordsFromIDs(std::vector<unsigned int> ids, std::map<unsigned int, std::string> * lookup_target_phrase)
{
std::string returnstring;
for (std::vector<unsigned int>::iterator it = ids.begin(); it != ids.end(); it++) {
returnstring.append(getTargetWordFromID(*it, lookup_target_phrase) + " ");
}
return returnstring;
}
/*Those functions are used to more easily store the floats in the binary phrase table
We convert the float to unsigned int so that it is the same as our other values and we can
apply variable byte encoding on top of it.*/
inline unsigned int reinterpret_float(float * num)
{
unsigned int * converted_num;
converted_num = reinterpret_cast<unsigned int *>(num);
return *converted_num;
}
inline float reinterpret_uint(unsigned int * num)
{
float * converted_num;
converted_num = reinterpret_cast<float *>(num);
return *converted_num;
}
/*Mostly taken from stackoverflow, http://stackoverflow.com/questions/5858646/optimizing-variable-length-encoding
and modified in order to return a vector of chars. Implements ULEB128 or variable byte encoding.
This is a highly optimized version with an unrolled loop */
inline std::vector<unsigned char> vbyte_encode(unsigned int num)
{
//Determine how many bytes we are going to take.
short size;
std::vector<unsigned char> byte_vector;
if (num < 0x00000080U) {
size = 1;
byte_vector.reserve(size);
goto b1;
}
if (num < 0x00004000U) {
size = 2;
byte_vector.reserve(size);
goto b2;
}
if (num < 0x00200000U) {
size = 3;
byte_vector.reserve(size);
goto b3;
}
if (num < 0x10000000U) {
size = 4;
byte_vector.reserve(size);
goto b4;
}
size = 5;
byte_vector.reserve(size);
//Now proceed with the encoding.
byte_vector.push_back((num & 0x7f) | 0x80);
num >>= 7;
b4:
byte_vector.push_back((num & 0x7f) | 0x80);
num >>= 7;
b3:
byte_vector.push_back((num & 0x7f) | 0x80);
num >>= 7;
b2:
byte_vector.push_back((num & 0x7f) | 0x80);
num >>= 7;
b1:
byte_vector.push_back(num);
return byte_vector;
}
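A worked example of the encoder above (editorial, derived directly from the code): 300 needs two 7-bit groups, emitted least-significant group first, with the continuation bit 0x80 set on every byte except the last.

// vbyte_encode(300): 300 = 0b100101100
//   byte 0: (300 & 0x7f) | 0x80 == 0xAC  (low 7 bits, continuation set)
//   byte 1: 300 >> 7 == 0x02             (final byte, no continuation)
// round trip: bytes_to_int({0xAC, 0x02}) == 44 + (2 << 7) == 300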
std::vector<unsigned int> vbyte_decode_line(std::vector<unsigned char> line)
{
std::vector<unsigned int> huffman_line;
std::vector<unsigned char> current_num;
for (std::vector<unsigned char>::iterator it = line.begin(); it != line.end(); it++) {
current_num.push_back(*it);
if ((*it >> 7) != 1) {
//We don't have continuation in the next bit
huffman_line.push_back(bytes_to_int(current_num));
current_num.clear();
}
}
return huffman_line;
}
inline unsigned int bytes_to_int(std::vector<unsigned char> number)
{
unsigned int retvalue = 0;
std::vector<unsigned char>::iterator it = number.begin();
unsigned char shift = 0; //By how many bits to shift
while (it != number.end()) {
retvalue |= (*it & 0x7f) << shift;
shift += 7;
it++;
}
return retvalue;
}
std::vector<unsigned char> vbyte_encode_line(std::vector<unsigned int> line)
{
std::vector<unsigned char> retvec;
//For each unsigned int in the line, vbyte encode it and add it to a vector of unsigned chars.
for (std::vector<unsigned int>::iterator it = line.begin(); it != line.end(); it++) {
std::vector<unsigned char> vbyte_encoded = vbyte_encode(*it);
retvec.insert(retvec.end(), vbyte_encoded.begin(), vbyte_encoded.end());
}
return retvec;
}


@@ -1,112 +0,0 @@
#pragma once
//Huffman encodes a line and also produces the vocabulary ids
#include "hash.hh"
#include "line_splitter.hh"
#include <cstdio>
#include <fstream>
#include <iostream>
#include <sstream>
#include <boost/serialization/serialization.hpp>
#include <boost/serialization/vector.hpp>
#include <boost/serialization/map.hpp>
#include <boost/archive/text_iarchive.hpp>
#include <boost/archive/text_oarchive.hpp>
//Sorting for the second
struct sort_pair {
bool operator()(const std::pair<std::string, unsigned int> &left, const std::pair<std::string, unsigned int> &right) {
return left.second > right.second; //This puts biggest numbers first.
}
};
struct sort_pair_vec {
bool operator()(const std::pair<std::vector<unsigned char>, unsigned int> &left, const std::pair<std::vector<unsigned char>, unsigned int> &right) {
return left.second > right.second; //This puts biggest numbers first.
}
};
class Huffman
{
unsigned long uniq_lines; //Unique lines in the file.
//Containers used when counting the occurrence of a given phrase
std::map<std::string, unsigned int> target_phrase_words;
std::map<std::vector<unsigned char>, unsigned int> word_all1;
//Same containers as vectors, for sorting
std::vector<std::pair<std::string, unsigned int> > target_phrase_words_counts;
std::vector<std::pair<std::vector<unsigned char>, unsigned int> > word_all1_counts;
//Huffman maps
std::map<std::string, unsigned int> target_phrase_huffman;
std::map<std::vector<unsigned char>, unsigned int> word_all1_huffman;
//inverted maps
std::map<unsigned int, std::string> lookup_target_phrase;
std::map<unsigned int, std::vector<unsigned char> > lookup_word_all1;
public:
Huffman (const char *);
void count_elements (line_text line);
void assign_values();
void serialize_maps(const char * dirname);
void produce_lookups();
std::vector<unsigned int> encode_line(line_text line);
//encode line + variable byte on top
std::vector<unsigned char> full_encode_line(line_text line);
//Getters
const std::map<unsigned int, std::string> get_target_lookup_map() const {
return lookup_target_phrase;
}
const std::map<unsigned int, std::vector<unsigned char> > get_word_all1_lookup_map() const {
return lookup_word_all1;
}
unsigned long getUniqLines() {
return uniq_lines;
}
};
class HuffmanDecoder
{
std::map<unsigned int, std::string> lookup_target_phrase;
std::map<unsigned int, std::vector<unsigned char> > lookup_word_all1;
public:
HuffmanDecoder (const char *);
HuffmanDecoder (std::map<unsigned int, std::string> *, std::map<unsigned int, std::vector<unsigned char> > *);
//Getters
const std::map<unsigned int, std::string> get_target_lookup_map() const {
return lookup_target_phrase;
}
const std::map<unsigned int, std::vector<unsigned char> > get_word_all1_lookup_map() const {
return lookup_word_all1;
}
inline std::string getTargetWordFromID(unsigned int id);
std::string getTargetWordsFromIDs(std::vector<unsigned int> ids);
target_text decode_line (std::vector<unsigned int> input, int num_scores);
//Variable byte decodes all target phrases contained here and then passes them to decode_line
std::vector<target_text> full_decode_line (std::vector<unsigned char> lines, int num_scores);
};
std::string getTargetWordsFromIDs(std::vector<unsigned int> ids, std::map<unsigned int, std::string> * lookup_target_phrase);
inline std::string getTargetWordFromID(unsigned int id, std::map<unsigned int, std::string> * lookup_target_phrase);
inline unsigned int reinterpret_float(float * num);
inline float reinterpret_uint(unsigned int * num);
std::vector<unsigned char> vbyte_encode_line(std::vector<unsigned int> line);
inline std::vector<unsigned char> vbyte_encode(unsigned int num);
std::vector<unsigned int> vbyte_decode_line(std::vector<unsigned char> line);
inline unsigned int bytes_to_int(std::vector<unsigned char> number);


@@ -1,66 +1,92 @@
#include "line_splitter.hh"
line_text splitLine(StringPiece textin)
namespace Moses
{
const char delim[] = " ||| ";
line_text splitLine(const StringPiece &textin, bool scfg)
{
const char delim[] = "|||";
line_text output;
//Tokenize
util::TokenIter<util::MultiCharacter> it(textin, util::MultiCharacter(delim));
//Get source phrase
output.source_phrase = *it;
output.source_phrase = Trim(*it);
//std::cerr << "output.source_phrase=" << output.source_phrase << "AAAA" << std::endl;
//Get target_phrase
it++;
output.target_phrase = *it;
output.target_phrase = Trim(*it);
//std::cerr << "output.target_phrase=" << output.target_phrase << "AAAA" << std::endl;
if (scfg) {
/*
std::cerr << "output.source_phrase=" << output.source_phrase << std::endl;
std::cerr << "output.target_phrase=" << output.target_phrase << std::endl;
reformatSCFG(output);
std::cerr << "output.source_phrase=" << output.source_phrase << std::endl;
std::cerr << "output.target_phrase=" << output.target_phrase << std::endl;
*/
}
//Get probabilities
it++;
output.prob = *it;
output.prob = Trim(*it);
//std::cerr << "output.prob=" << output.prob << "AAAA" << std::endl;
//Get WordAllignment
it++;
if (it == util::TokenIter<util::MultiCharacter>::end()) return output;
output.word_align = *it;
output.word_align = Trim(*it);
//std::cerr << "output.word_align=" << output.word_align << "AAAA" << std::endl;
//Get count
it++;
if (it == util::TokenIter<util::MultiCharacter>::end()) return output;
output.counts = *it;
output.counts = Trim(*it);
//std::cerr << "output.counts=" << output.counts << "AAAA" << std::endl;
//Get sparse_score
it++;
if (it == util::TokenIter<util::MultiCharacter>::end()) return output;
output.sparse_score = *it;
output.sparse_score = Trim(*it);
//std::cerr << "output.sparse_score=" << output.sparse_score << "AAAA" << std::endl;
//Get property
it++;
if (it == util::TokenIter<util::MultiCharacter>::end()) return output;
output.property = *it;
output.property = Trim(*it);
//std::cerr << "output.property=" << output.property << "AAAA" << std::endl;
return output;
}
std::vector<unsigned char> splitWordAll1(StringPiece textin)
std::vector<unsigned char> splitWordAll1(const StringPiece &textin)
{
const char delim[] = " ";
const char delim2[] = "-";
std::vector<unsigned char> output;
//Case with no word alignments.
if (textin.size() == 0) {
return output;
}
//Split on space
util::TokenIter<util::MultiCharacter> it(textin, util::MultiCharacter(delim));
//For each int
while (it) {
//Split on dash (-)
util::TokenIter<util::MultiCharacter> itInner(*it, util::MultiCharacter(delim2));
util::TokenIter<util::MultiCharacter> itInner(*it,
util::MultiCharacter(delim2));
//Insert the two entries in the vector. User will read entry 0 and 1 to get the first,
//2 and 3 for second etc. Use unsigned char instead of int to save space, as
//word alignments are all very small numbers that fit in a single byte
output.push_back((unsigned char)(atoi(itInner->data())));
output.push_back((unsigned char) (atoi(itInner->data())));
itInner++;
output.push_back((unsigned char)(atoi(itInner->data())));
output.push_back((unsigned char) (atoi(itInner->data())));
it++;
}
@@ -68,3 +94,10 @@ std::vector<unsigned char> splitWordAll1(StringPiece textin)
}
void reformatSCFG(line_text &output)
{
}
}
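A usage sketch of the splitter above (editorial; the alignment string is invented):

// "0-1 2-0" -> {0, 1, 2, 0}; consumers read the bytes pairwise as 0-1 and 2-0
std::vector<unsigned char> points = Moses::splitWordAll1(StringPiece("0-1 2-0"));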


@@ -9,8 +9,12 @@
#include "util/tokenize_piece.hh"
#include <vector>
namespace Moses
{
//Struct for holding processed line
struct line_text {
struct line_text
{
StringPiece source_phrase;
StringPiece target_phrase;
StringPiece prob;
@@ -18,16 +22,38 @@ struct line_text {
StringPiece counts;
StringPiece sparse_score;
StringPiece property;
std::string property_to_be_binarized;
};
//Struct for holding processed line
struct target_text {
struct target_text
{
std::vector<unsigned int> target_phrase;
std::vector<float> prob;
std::vector<unsigned char> word_all1;
std::vector<size_t> word_align_term;
std::vector<size_t> word_align_non_term;
std::vector<char> counts;
std::vector<char> sparse_score;
std::vector<char> property;
/*
void Reset()
{
target_phrase.clear();
prob.clear();
word_all1.clear();
counts.clear();
sparse_score.clear();
property.clear();
}
*/
};
//Ask if it's better to have it receive a pointer to a line_text struct
line_text splitLine(StringPiece textin);
line_text splitLine(const StringPiece &textin, bool scfg);
void reformatSCFG(line_text &output);
std::vector<unsigned char> splitWordAll1(const StringPiece &textin);
}
std::vector<unsigned char> splitWordAll1(StringPiece textin);


@@ -1,5 +1,8 @@
#include "probing_hash_utils.hh"
namespace Moses
{
//Read table from disk, return memory map location
char * readTable(const char * filename, size_t size)
{
@@ -13,7 +16,7 @@ char * readTable(const char * filename, size_t size)
exit(EXIT_FAILURE);
}
map = (char *)mmap(0, size, PROT_READ, MAP_SHARED, fd, 0);
map = (char *) mmap(0, size, PROT_READ, MAP_SHARED, fd, 0);
if (map == MAP_FAILED) {
close(fd);
@@ -24,11 +27,24 @@ char * readTable(const char * filename, size_t size)
return map;
}
void serialize_table(char *mem, size_t size, const char * filename)
void serialize_table(char *mem, size_t size, const std::string &filename)
{
std::ofstream os (filename, std::ios::binary);
os.write((const char*)&mem[0], size);
std::ofstream os(filename.c_str(), std::ios::binary);
os.write((const char*) &mem[0], size);
os.close();
}
uint64_t getKey(const uint64_t source_phrase[], size_t size)
{
//TOO SLOW
//uint64_t key = util::MurmurHashNative(&source_phrase[0], source_phrase.size());
uint64_t key = 0;
for (size_t i = 0; i < size; i++) {
key += (source_phrase[i] << i);
}
return key;
}
}
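Tying the hashing together, a hedged sketch (the phrase is invented): per-word hashes come from getVocabIDs, and getKey combines them by shifting each hash left by its word position and summing, a scheme the binariser and the query side must both agree on.

std::vector<uint64_t> vocabIds = Moses::getVocabIDs("das haus");
uint64_t key = Moses::getKey(vocabIds.data(), vocabIds.size());
// key == vocabIds[0] + (vocabIds[1] << 1)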


@@ -7,31 +7,49 @@
#include <fcntl.h>
#include <fstream>
namespace Moses
{
#define API_VERSION 15
//Hash table entry
struct Entry {
uint64_t key;
struct Entry
{
typedef uint64_t Key;
unsigned int bytes_toread;
Key key;
uint64_t GetKey() const {
Key GetKey() const
{
return key;
}
void SetKey(uint64_t to) {
void SetKey(Key to)
{
key = to;
}
uint64_t GetValue() const {
return value;
}
uint64_t value;
};
#define NONE std::numeric_limits<uint64_t>::max()
//Define table
typedef util::ProbingHashTable<Entry, boost::hash<uint64_t> > Table;
void serialize_table(char *mem, size_t size, const char * filename);
void serialize_table(char *mem, size_t size, const std::string &filename);
char * readTable(const char * filename, size_t size);
uint64_t getKey(const uint64_t source_phrase[], size_t size);
struct TargetPhraseInfo
{
uint32_t alignTerm;
uint32_t alignNonTerm;
uint16_t numWords;
uint16_t propLength;
uint16_t filler;
};
}
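A minimal life cycle of the probing table, mirroring how the rest of this commit uses it (editorial sketch; the size and key are invented):

size_t size = Moses::Table::Size(1000, 1.2); // room for ~1000 entries, multiplier 1.2
char *mem = new char[size];
memset(mem, 0, size);
Moses::Table table(mem, size);
Moses::Entry entry;
entry.key = 12345;
entry.value = 0; // e.g. index of the entry's target-phrase block
table.Insert(entry);
const Moses::Entry *found;
bool ok = table.Find(12345, found); // ok == true, found->value == 0
delete[] mem;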


@@ -1,73 +1,80 @@
#include "quering.hh"
#include "util/exception.hh"
unsigned char * read_binary_file(const char * filename, size_t filesize)
using namespace std;
namespace Moses
{
//Get filesize
int fd;
unsigned char * map;
fd = open(filename, O_RDONLY);
if (fd == -1) {
perror("Error opening file for reading");
exit(EXIT_FAILURE);
}
map = (unsigned char *)mmap(0, filesize, PROT_READ, MAP_SHARED, fd, 0);
if (map == MAP_FAILED) {
close(fd);
perror("Error mmapping the file");
exit(EXIT_FAILURE);
}
return map;
}
QueryEngine::QueryEngine(const char * filepath) : decoder(filepath)
QueryEngine::QueryEngine(const char * filepath)
{
//Create filepaths
std::string basepath(filepath);
std::string path_to_hashtable = basepath + "/probing_hash.dat";
std::string path_to_data_bin = basepath + "/binfile.dat";
std::string path_to_source_vocabid = basepath + "/source_vocabids";
std::string alignPath = basepath + "/Alignments.dat";
///Source phrase vocabids
read_map(&source_vocabids, path_to_source_vocabid.c_str());
read_map(source_vocabids, path_to_source_vocabid.c_str());
//Target phrase vocabIDs
vocabids = decoder.get_target_lookup_map();
// alignments
read_alignments(alignPath);
//Read config file
boost::unordered_map<std::string, std::string> keyValue;
std::ifstream config((basepath + "/config").c_str());
std::string line;
std::ifstream config ((basepath + "/config").c_str());
while (getline(config, line)) {
std::vector<std::string> toks = Tokenize(line, "\t");
UTIL_THROW_IF2(toks.size() != 2, "Wrong config format:" << line);
keyValue[ toks[0] ] = toks[1];
}
bool found;
//Check API version:
getline(config, line);
if (atoi(line.c_str()) != API_VERSION) {
std::cerr << "The ProbingPT API has changed, please rebinarize your phrase tables." << std::endl;
int version;
found = Get(keyValue, "API_VERSION", version);
if (!found) {
std::cerr << "Old or corrupted version of ProbingPT. Please rebinarize your phrase tables." << std::endl;
}
else if (version != API_VERSION) {
std::cerr << "The ProbingPT API has changed. " << version << "!="
<< API_VERSION << " Please rebinarize your phrase tables." << std::endl;
exit(EXIT_FAILURE);
}
//Get tablesize.
getline(config, line);
int tablesize = atoi(line.c_str());
//Number of scores
getline(config, line);
num_scores = atoi(line.c_str());
//do we have a reordering table
getline(config, line);
std::transform(line.begin(), line.end(), line.begin(), ::tolower); //Get the boolean in lowercase
is_reordering = false;
if (line == "true") {
is_reordering = true;
std::cerr << "WARNING. REORDERING TABLES NOT SUPPORTED YET." << std::endl;
}
config.close();
//Mmap binary table
struct stat filestatus;
stat(path_to_data_bin.c_str(), &filestatus);
binary_filesize = filestatus.st_size;
binary_mmaped = read_binary_file(path_to_data_bin.c_str(), binary_filesize);
//Get tablesize.
int tablesize;
found = Get(keyValue, "uniq_entries", tablesize);
if (!found) {
std::cerr << "uniq_entries not found" << std::endl;
exit(EXIT_FAILURE);
}
//Number of scores
found = Get(keyValue, "num_scores", num_scores);
if (!found) {
std::cerr << "num_scores not found" << std::endl;
exit(EXIT_FAILURE);
}
//How many scores from lex reordering models
found = Get(keyValue, "num_lex_scores", num_lex_scores);
if (!found) {
std::cerr << "num_lex_scores not found" << std::endl;
exit(EXIT_FAILURE);
}
// have the scores been log() and FloorScore()?
found = Get(keyValue, "log_prob", logProb);
if (!found) {
std::cerr << "logProb not found" << std::endl;
exit(EXIT_FAILURE);
}
config.close();
//Read hashtable
table_filesize = Table::Size(tablesize, 1.2);
@@ -81,118 +88,50 @@ QueryEngine::QueryEngine(const char * filepath) : decoder(filepath)
QueryEngine::~QueryEngine()
{
//Clear mmap content from memory.
munmap(binary_mmaped, binary_filesize);
munmap(mem, table_filesize);
}
std::pair<bool, std::vector<target_text> > QueryEngine::query(std::vector<uint64_t> source_phrase)
uint64_t QueryEngine::getKey(uint64_t source_phrase[], size_t size) const
{
bool found;
std::vector<target_text> translation_entries;
const Entry * entry;
//TOO SLOW
//uint64_t key = util::MurmurHashNative(&source_phrase[0], source_phrase.size());
uint64_t key = 0;
for (int i = 0; i < source_phrase.size(); i++) {
key += (source_phrase[i] << i);
}
found = table.Find(key, entry);
if (found) {
//The phrase that was searched for was found! We need to get the translation entries.
//We will read the largest entry in bytes and then filter the unnecessary parts with functions
//from line_splitter
uint64_t initial_index = entry -> GetValue();
unsigned int bytes_toread = entry -> bytes_toread;
//ASK HIEU FOR MORE EFFICIENT WAY TO DO THIS!
std::vector<unsigned char> encoded_text; //Assign to the vector the relevant portion of the array.
encoded_text.reserve(bytes_toread);
for (int i = 0; i < bytes_toread; i++) {
encoded_text.push_back(binary_mmaped[i+initial_index]);
}
//Get only the translation entries necessary
translation_entries = decoder.full_decode_line(encoded_text, num_scores);
}
std::pair<bool, std::vector<target_text> > output (found, translation_entries);
return output;
return Moses::getKey(source_phrase, size);
}
std::pair<bool, std::vector<target_text> > QueryEngine::query(StringPiece source_phrase)
std::pair<bool, uint64_t> QueryEngine::query(uint64_t key)
{
bool found;
std::vector<target_text> translation_entries;
std::pair<bool, uint64_t> ret;
const Entry * entry;
//Convert source frase to VID
std::vector<uint64_t> source_phrase_vid = getVocabIDs(source_phrase);
//TOO SLOW
//uint64_t key = util::MurmurHashNative(&source_phrase_vid[0], source_phrase_vid.size());
uint64_t key = 0;
for (int i = 0; i < source_phrase_vid.size(); i++) {
key += (source_phrase_vid[i] << i);
ret.first = table.Find(key, entry);
if (ret.first) {
ret.second = entry->value;
}
found = table.Find(key, entry);
if (found) {
//The phrase that was searched for was found! We need to get the translation entries.
//We will read the largest entry in bytes and then filter the unnecessary parts with functions
//from line_splitter
uint64_t initial_index = entry -> GetValue();
unsigned int bytes_toread = entry -> bytes_toread;
//At the end of the file we can't read + largest_entry because we get a segfault.
std::cerr << "Entry size in bytes is: " << bytes_toread << std::endl;
//ASK HIEU FOR MORE EFFICIENT WAY TO DO THIS!
std::vector<unsigned char> encoded_text; //Assign to the vector the relevant portion of the array.
encoded_text.reserve(bytes_toread);
for (int i = 0; i < bytes_toread; i++) {
encoded_text.push_back(binary_mmaped[i+initial_index]);
}
//Get only the translation entries necessary
translation_entries = decoder.full_decode_line(encoded_text, num_scores);
}
std::pair<bool, std::vector<target_text> > output (found, translation_entries);
return output;
return ret;
}
void QueryEngine::printTargetInfo(std::vector<target_text> target_phrases)
void QueryEngine::read_alignments(const std::string &alignPath)
{
int entries = target_phrases.size();
std::ifstream strm(alignPath.c_str());
for (int i = 0; i<entries; i++) {
std::cout << "Entry " << i+1 << " of " << entries << ":" << std::endl;
//Print text
std::cout << getTargetWordsFromIDs(target_phrases[i].target_phrase, &vocabids) << "\t";
string line;
while (getline(strm, line)) {
vector<string> toks = Tokenize(line, "\t ");
UTIL_THROW_IF2(toks.size() == 0, "Corrupt alignment file");
//Print probabilities:
for (int j = 0; j<target_phrases[i].prob.size(); j++) {
std::cout << target_phrases[i].prob[j] << " ";
uint32_t alignInd = Scan<uint32_t>(toks[0]);
if (alignInd >= alignColl.size()) {
alignColl.resize(alignInd + 1);
}
std::cout << "\t";
//Print word_all1
for (int j = 0; j<target_phrases[i].word_all1.size(); j++) {
if (j%2 == 0) {
std::cout << (short)target_phrases[i].word_all1[j] << "-";
} else {
std::cout << (short)target_phrases[i].word_all1[j] << " ";
}
Alignments &aligns = alignColl[alignInd];
for (size_t i = 1; i < toks.size(); ++i) {
size_t pos = Scan<size_t>(toks[i]);
aligns.push_back(pos);
}
std::cout << std::endl;
}
}
}
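For reference, the Alignments.dat file parsed by read_alignments above is plain text, one alignment set per line: its index followed by the flattened positions, tab- or space-separated (the values here are illustrative):

0	0 0 1 1
1	0 1 1 0 2 2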


@@ -1,45 +1,65 @@
#pragma once
#include "probing_hash_utils.hh"
#include "huffmanish.hh"
#include "hash.hh" //Includes line splitter
#include <boost/unordered_map.hpp>
#include <sys/stat.h> //For finding size of file
#include "vocabid.hh"
#include <algorithm> //toLower
#define API_VERSION 3
#include <deque>
#include "probing_hash_utils.hh"
#include "hash.hh" //Includes line splitter
#include "line_splitter.hh"
#include "moses//Util.h"
char * read_binary_file(char * filename);
namespace Moses
{
class QueryEngine
{
unsigned char * binary_mmaped; //The binary phrase table file
std::map<unsigned int, std::string> vocabids;
std::map<uint64_t, std::string> source_vocabids;
typedef std::vector<unsigned char> Alignments;
std::vector<Alignments> alignColl;
Table table;
char *mem; //Memory for the table, necessary so that we can correctly destroy the object
HuffmanDecoder decoder;
size_t binary_filesize;
size_t table_filesize;
int num_scores;
bool is_reordering;
public:
QueryEngine (const char *);
~QueryEngine();
std::pair<bool, std::vector<target_text> > query(StringPiece source_phrase);
std::pair<bool, std::vector<target_text> > query(std::vector<uint64_t> source_phrase);
void printTargetInfo(std::vector<target_text> target_phrases);
const std::map<unsigned int, std::string> getVocab() const {
return decoder.get_target_lookup_map();
}
const std::map<uint64_t, std::string> getSourceVocab() const {
return source_vocabids;
void read_alignments(const std::string &alignPath);
public:
int num_scores;
int num_lex_scores;
bool logProb;
QueryEngine(const char *);
~QueryEngine();
std::pair<bool, uint64_t> query(uint64_t key);
const std::map<uint64_t, std::string> &getSourceVocab() const
{ return source_vocabids; }
const std::vector<Alignments> &getAlignments() const
{ return alignColl; }
uint64_t getKey(uint64_t source_phrase[], size_t size) const;
template<typename T>
inline bool Get(const boost::unordered_map<std::string, std::string> &keyValue, const std::string &sought, T &found) const
{
boost::unordered_map<std::string, std::string>::const_iterator iter = keyValue.find(sought);
if (iter == keyValue.end()) {
return false;
}
const std::string &foundStr = iter->second;
found = Scan<T>(foundStr);
return true;
}
};
}


@@ -1,161 +1,303 @@
#include <sys/stat.h>
#include <boost/foreach.hpp>
#include "line_splitter.hh"
#include "storing.hh"
#include "StoreTarget.h"
#include "StoreVocab.h"
#include "moses/Util.h"
#include "moses/InputFileStream.h"
BinaryFileWriter::BinaryFileWriter (std::string basepath) : os ((basepath + "/binfile.dat").c_str(), std::ios::binary)
{
binfile.reserve(10000); //Reserve part of the vector to avoid reallocation
it = binfile.begin();
dist_from_start = 0; //Initialize variables
extra_counter = 0;
}
using namespace std;
void BinaryFileWriter::write (std::vector<unsigned char> * bytes)
namespace Moses
{
binfile.insert(it, bytes->begin(), bytes->end()); //Insert the bytes
//Keep track of the offsets
it += bytes->size();
dist_from_start = distance(binfile.begin(),it);
//Flush the vector to disk every once in a while so that we don't consume too much ram
if (dist_from_start > 9000) {
flush();
///////////////////////////////////////////////////////////////////////
void Node::Add(Table &table, const SourcePhrase &sourcePhrase, size_t pos)
{
if (pos < sourcePhrase.size()) {
uint64_t vocabId = sourcePhrase[pos];
Node *child;
Children::iterator iter = m_children.find(vocabId);
if (iter == m_children.end()) {
// New node. Write other children then discard them
BOOST_FOREACH(Children::value_type &valPair, m_children) {
Node &otherChild = valPair.second;
otherChild.Write(table);
}
m_children.clear();
// create new node
child = &m_children[vocabId];
assert(!child->done);
child->key = key + (vocabId << pos);
}
else {
child = &iter->second;
}
child->Add(table, sourcePhrase, pos + 1);
}
else {
// this node was written previously 'cos it has rules
done = true;
}
}
void BinaryFileWriter::flush ()
void Node::Write(Table &table)
{
//Cast unsigned char to char before writing...
os.write((char *)&binfile[0], dist_from_start);
//Clear the vector:
binfile.clear();
binfile.reserve(10000);
extra_counter += dist_from_start; //Keep track of the total number of bytes.
it = binfile.begin(); //Reset iterator
dist_from_start = distance(binfile.begin(),it); //Reset dist from start
//cerr << "START write " << done << " " << key << endl;
BOOST_FOREACH(Children::value_type &valPair, m_children) {
Node &child = valPair.second;
child.Write(table);
}
if (!done) {
// save
Entry sourceEntry;
sourceEntry.value = NONE;
sourceEntry.key = key;
//Put into table
table.Insert(sourceEntry);
}
}
BinaryFileWriter::~BinaryFileWriter ()
///////////////////////////////////////////////////////////////////////
void createProbingPT(const std::string &phrasetable_path,
const std::string &basepath, int num_scores, int num_lex_scores,
bool log_prob, int max_cache_size, bool scfg)
{
os.close();
binfile.clear();
}
std::cerr << "Starting..." << std::endl;
void createProbingPT(const char * phrasetable_path, const char * target_path,
const char * num_scores, const char * is_reordering)
{
//Get basepath and create directory if missing
std::string basepath(target_path);
mkdir(basepath.c_str(), S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH);
//Set up huffman and serialize decoder maps.
Huffman huffmanEncoder(phrasetable_path); //initialize
huffmanEncoder.assign_values();
huffmanEncoder.produce_lookups();
huffmanEncoder.serialize_maps(target_path);
StoreTarget storeTarget(basepath);
//Get uniq lines:
unsigned long uniq_entries = huffmanEncoder.getUniqLines();
unsigned long uniq_entries = countUniqueSource(phrasetable_path);
//Source phrase vocabids
std::map<uint64_t, std::string> source_vocabids;
StoreVocab<uint64_t> sourceVocab(basepath + "/source_vocabids");
//Read the file
util::FilePiece filein(phrasetable_path);
util::FilePiece filein(phrasetable_path.c_str());
//Init the probing hash table
size_t size = Table::Size(uniq_entries, 1.2);
char * mem = new char[size];
memset(mem, 0, size);
Table table(mem, size);
Table sourceEntries(mem, size);
BinaryFileWriter binfile(basepath); //Init the binary file writer.
line_text prev_line; //Check if the source phrase of the previous line is the same
std::priority_queue<CacheItem*, std::vector<CacheItem*>, CacheItemOrderer> cache;
float totalSourceCount = 0;
//Keep track of the size of each group of target phrases
uint64_t entrystartidx = 0;
//uint64_t line_num = 0;
size_t line_num = 0;
//Read everything and process
while(true) {
std::string prevSource;
Node sourcePhrases;
sourcePhrases.done = true;
sourcePhrases.key = 0;
while (true) {
try {
//Process line read
line_text line;
line = splitLine(filein.ReadLine());
line = splitLine(filein.ReadLine(), scfg);
//cerr << "line=" << line.source_phrase << endl;
++line_num;
if (line_num % 1000000 == 0) {
std::cerr << line_num << " " << std::flush;
}
//Add source phrases to vocabularyIDs
add_to_map(&source_vocabids, line.source_phrase);
add_to_map(sourceVocab, line.source_phrase);
if ((binfile.dist_from_start + binfile.extra_counter) == 0) {
prev_line = line; //For the first iteration assume the previous line is
} //The same as this one.
if (line.source_phrase != prev_line.source_phrase) {
if (prevSource.empty()) {
// 1st line
prevSource = line.source_phrase.as_string();
storeTarget.Append(line, log_prob, scfg);
}
else if (prevSource == line.source_phrase) {
//If we still have the same line, just append to it:
storeTarget.Append(line, log_prob, scfg);
}
else {
assert(prevSource != line.source_phrase);
//Create a new entry even
// save
uint64_t targetInd = storeTarget.Save();
// next line
storeTarget.Append(line, log_prob, scfg);
//Create an entry for the previous source phrase:
Entry pesho;
pesho.value = entrystartidx;
Entry sourceEntry;
sourceEntry.value = targetInd;
//The key is the sum of hashes of individual words bitshifted by their position in the phrase.
//Probably not entirely correct, but fast and seems to work fine in practice.
pesho.key = 0;
std::vector<uint64_t> vocabid_source = getVocabIDs(prev_line.source_phrase);
for (int i = 0; i < vocabid_source.size(); i++) {
pesho.key += (vocabid_source[i] << i);
std::vector<uint64_t> vocabid_source = getVocabIDs(prevSource);
if (scfg) {
// storing prefixes?
sourcePhrases.Add(sourceEntries, vocabid_source);
}
pesho.bytes_toread = binfile.dist_from_start + binfile.extra_counter - entrystartidx;
sourceEntry.key = getKey(vocabid_source);
/*
cerr << "prevSource=" << prevSource << flush
<< " vocabids=" << Debug(vocabid_source) << flush
<< " key=" << sourceEntry.key << endl;
*/
//Put into table
table.Insert(pesho);
sourceEntries.Insert(sourceEntry);
entrystartidx = binfile.dist_from_start + binfile.extra_counter; //Designate start idx for new entry
// update cache - CURRENT source phrase, not prev
if (max_cache_size) {
std::string countStr = line.counts.as_string();
countStr = Trim(countStr);
if (!countStr.empty()) {
std::vector<float> toks = Tokenize<float>(countStr);
//cerr << "CACHE:" << line.source_phrase << " " << countStr << " " << toks[1] << endl;
//Encode a line and write it to disk.
std::vector<unsigned char> encoded_line = huffmanEncoder.full_encode_line(line);
binfile.write(&encoded_line);
if (toks.size() >= 2) {
totalSourceCount += toks[1];
// compute key for CURRENT source
std::vector<uint64_t> currVocabidSource = getVocabIDs(line.source_phrase.as_string());
uint64_t currKey = getKey(currVocabidSource);
CacheItem *item = new CacheItem(
Trim(line.source_phrase.as_string()),
currKey,
toks[1]);
cache.push(item);
if (max_cache_size > 0 && cache.size() > max_cache_size) {
cache.pop();
}
}
}
}
//Set prevLine
prev_line = line;
} else {
//If we still have the same line, just append to it:
std::vector<unsigned char> encoded_line = huffmanEncoder.full_encode_line(line);
binfile.write(&encoded_line);
prevSource = line.source_phrase.as_string();
}
} catch (util::EndOfFileException e) {
std::cerr << "Reading phrase table finished, writing remaining files to disk." << std::endl;
binfile.flush();
}
catch (util::EndOfFileException e) {
std::cerr
<< "Reading phrase table finished, writing remaining files to disk."
<< std::endl;
//After the final entry is constructed we need to add it to the phrase_table
//Create an entry for the previous source phrase:
Entry pesho;
pesho.value = entrystartidx;
uint64_t targetInd = storeTarget.Save();
Entry sourceEntry;
sourceEntry.value = targetInd;
//The key is the sum of hashes of individual words. Probably not entirely correct, but fast
pesho.key = 0;
std::vector<uint64_t> vocabid_source = getVocabIDs(prev_line.source_phrase);
for (int i = 0; i < vocabid_source.size(); i++) {
pesho.key += (vocabid_source[i] << i);
}
pesho.bytes_toread = binfile.dist_from_start + binfile.extra_counter - entrystartidx;
std::vector<uint64_t> vocabid_source = getVocabIDs(prevSource);
sourceEntry.key = getKey(vocabid_source);
//Put into table
table.Insert(pesho);
sourceEntries.Insert(sourceEntry);
break;
}
}
serialize_table(mem, size, (basepath + "/probing_hash.dat").c_str());
sourcePhrases.Write(sourceEntries);
serialize_map(&source_vocabids, (basepath + "/source_vocabids").c_str());
storeTarget.SaveAlignment();
serialize_table(mem, size, (basepath + "/probing_hash.dat"));
sourceVocab.Save();
serialize_cache(cache, (basepath + "/cache"), totalSourceCount);
delete[] mem;
//Write configfile
std::ofstream configfile;
configfile.open((basepath + "/config").c_str());
configfile << API_VERSION << '\n';
configfile << uniq_entries << '\n';
configfile << num_scores << '\n';
configfile << is_reordering << '\n';
configfile << "API_VERSION\t" << API_VERSION << '\n';
configfile << "uniq_entries\t" << uniq_entries << '\n';
configfile << "num_scores\t" << num_scores << '\n';
configfile << "num_lex_scores\t" << num_lex_scores << '\n';
configfile << "log_prob\t" << log_prob << '\n';
configfile.close();
}
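The config file written here, and parsed back by QueryEngine, is now a tab-separated key/value list; an illustrative instance (the values are invented):

API_VERSION	15
uniq_entries	89012
num_scores	4
num_lex_scores	0
log_prob	1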
size_t countUniqueSource(const std::string &path)
{
size_t ret = 0;
InputFileStream strme(path);
std::string line, prevSource;
while (std::getline(strme, line)) {
std::vector<std::string> toks = TokenizeMultiCharSeparator(line, "|||");
assert(toks.size() != 0);
if (prevSource != toks[0]) {
prevSource = toks[0];
++ret;
}
}
return ret;
}
void serialize_cache(
std::priority_queue<CacheItem*, std::vector<CacheItem*>, CacheItemOrderer> &cache,
const std::string &path, float totalSourceCount)
{
std::vector<const CacheItem*> vec(cache.size());
size_t ind = cache.size() - 1;
while (!cache.empty()) {
const CacheItem *item = cache.top();
vec[ind] = item;
cache.pop();
--ind;
}
std::ofstream os(path.c_str());
os << totalSourceCount << std::endl;
for (size_t i = 0; i < vec.size(); ++i) {
const CacheItem *item = vec[i];
os << item->count << "\t" << item->sourceKey << "\t" << item->source << std::endl;
delete item;
}
os.close();
}
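The cache file layout follows directly from the loop above (counts and keys here are invented): the total source count on the first line, then one count TAB sourceKey TAB source line per cached phrase, most frequent first:

1852963
10234	8271636410350271	of the
9876	1927364051021998	in the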
uint64_t getKey(const std::vector<uint64_t> &vocabid_source)
{
return getKey(vocabid_source.data(), vocabid_source.size());
}
std::vector<uint64_t> CreatePrefix(const std::vector<uint64_t> &vocabid_source, size_t endPos)
{
assert(endPos < vocabid_source.size());
std::vector<uint64_t> ret(endPos + 1);
for (size_t i = 0; i <= endPos; ++i) {
ret[i] = vocabid_source[i];
}
return ret;
}
}


@@ -1,36 +1,95 @@
#pragma once
#include <boost/unordered_set.hpp>
#include <boost/unordered_map.hpp>
#include <cstdio>
#include <sstream>
#include <fstream>
#include <iostream>
#include <string>
#include <queue>
#include <sys/stat.h> //mkdir
#include "hash.hh" //Includes line_splitter
#include "probing_hash_utils.hh"
#include "huffmanish.hh"
#include <sys/stat.h> //mkdir
#include "util/file_piece.hh"
#include "util/file.hh"
#include "vocabid.hh"
#define API_VERSION 3
void createProbingPT(const char * phrasetable_path, const char * target_path,
const char * num_scores, const char * is_reordering);
class BinaryFileWriter
namespace Moses
{
std::vector<unsigned char> binfile;
std::vector<unsigned char>::iterator it;
//Output binary
std::ofstream os;
typedef std::vector<uint64_t> SourcePhrase;
class Node
{
typedef boost::unordered_map<uint64_t, Node> Children;
Children m_children;
public:
unsigned int dist_from_start; //Distance from the start of the vector.
uint64_t extra_counter; //After we reset the counter, we still want to keep track of the correct offset, so
uint64_t key;
bool done;
BinaryFileWriter (std::string);
~BinaryFileWriter ();
void write (std::vector<unsigned char> * bytes);
void flush (); //Flush to disk
Node()
:done(false)
{}
void Add(Table &table, const SourcePhrase &sourcePhrase, size_t pos = 0);
void Write(Table &table);
};
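A hedged sketch of how createProbingPT drives this trie (the vocab ids are invented, and table is assumed to be set up as in storing.cpp): source phrases arrive in sorted order, Add() flushes sibling subtrees that can no longer grow, and a final Write() stores any remaining pure prefixes as entries with value NONE, letting SCFG lookup prune dead prefixes early.

Moses::Node root;
root.done = true; // the empty prefix itself needs no table entry
root.key = 0;
uint64_t a[] = {11, 22};     // vocab ids of an invented rule "x y"
uint64_t b[] = {11, 22, 33}; // vocab ids of an invented rule "x y z"
root.Add(table, Moses::SourcePhrase(a, a + 2));
root.Add(table, Moses::SourcePhrase(b, b + 3));
root.Write(table); // writes the pure prefix {11}; the two full phrases were marked done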
void createProbingPT(const std::string &phrasetable_path,
const std::string &basepath, int num_scores, int num_lex_scores,
bool log_prob, int max_cache_size, bool scfg);
uint64_t getKey(const std::vector<uint64_t> &source_phrase);
std::vector<uint64_t> CreatePrefix(const std::vector<uint64_t> &vocabid_source, size_t endPos);
template<typename T>
std::string Debug(const std::vector<T> &vec)
{
std::stringstream strm;
for (size_t i = 0; i < vec.size(); ++i) {
strm << vec[i] << " ";
}
return strm.str();
}
size_t countUniqueSource(const std::string &path);
class CacheItem
{
public:
std::string source;
uint64_t sourceKey;
float count;
CacheItem(const std::string &vSource, uint64_t vSourceKey, float vCount)
:source(vSource)
,sourceKey(vSourceKey)
,count(vCount)
{
}
bool operator<(const CacheItem &other) const
{
return count > other.count;
}
};
class CacheItemOrderer
{
public:
bool operator()(const CacheItem* a, const CacheItem* b) const
{
return (*a) < (*b);
}
};
void serialize_cache(
std::priority_queue<CacheItem*, std::vector<CacheItem*>, CacheItemOrderer> &cache,
const std::string &path, float totalSourceCount);
}


@@ -1,32 +1,59 @@
#include <boost/foreach.hpp>
#include "vocabid.hh"
#include "StoreVocab.h"
#include "moses/Util.h"
void add_to_map(std::map<uint64_t, std::string> *karta, StringPiece textin)
namespace Moses
{
void add_to_map(StoreVocab<uint64_t> &sourceVocab,
const StringPiece &textin)
{
//Tokenize
util::TokenIter<util::SingleCharacter> it(textin, util::SingleCharacter(' '));
util::TokenIter<util::SingleCharacter> itWord(textin, util::SingleCharacter(' '));
while(it) {
karta->insert(std::pair<uint64_t, std::string>(getHash(*it), it->as_string()));
it++;
while (itWord) {
StringPiece word = *itWord;
util::TokenIter<util::SingleCharacter> itFactor(word, util::SingleCharacter('|'));
while (itFactor) {
StringPiece factor = *itFactor;
sourceVocab.Insert(getHash(factor), factor.as_string());
itFactor++;
}
itWord++;
}
}
void serialize_map(std::map<uint64_t, std::string> *karta, const char* filename)
void serialize_map(const std::map<uint64_t, std::string> &karta,
const std::string &filename)
{
std::ofstream os (filename, std::ios::binary);
boost::archive::text_oarchive oarch(os);
std::ofstream os(filename.c_str());
std::map<uint64_t, std::string>::const_iterator iter;
for (iter = karta.begin(); iter != karta.end(); ++iter) {
os << iter->second << '\t' << iter->first << std::endl;
}
oarch << *karta; //Serialise map
os.close();
}
void read_map(std::map<uint64_t, std::string> *karta, const char* filename)
void read_map(std::map<uint64_t, std::string> &karta, const char* filename)
{
std::ifstream is (filename, std::ios::binary);
boost::archive::text_iarchive iarch(is);
std::ifstream is(filename);
iarch >> *karta;
std::string line;
while (getline(is, line)) {
std::vector<std::string> toks = Tokenize(line, "\t");
assert(toks.size() == 2);
uint64_t ind = Scan<uint64_t>(toks[1]);
karta[ind] = toks[0];
}
//Close the stream after we are done.
is.close();
}
}
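The vocabulary file round-tripped by serialize_map and read_map is one word TAB id line per factor, matching what StoreVocab writes for the source side (the hash values are invented):

haus	13971626468756396783
NN	9348566757031768941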


@@ -13,8 +13,17 @@
#include "util/string_piece.hh" //Tokenization and work with StringPiece
#include "util/tokenize_piece.hh"
void add_to_map(std::map<uint64_t, std::string> *karta, StringPiece textin);
namespace Moses
{
template<typename VOCABID>
class StoreVocab;
void serialize_map(std::map<uint64_t, std::string> *karta, const char* filename);
void add_to_map(StoreVocab<uint64_t> &sourceVocab,
const StringPiece &textin);
void read_map(std::map<uint64_t, std::string> *karta, const char* filename);
void serialize_map(const std::map<uint64_t, std::string> &karta,
const std::string &filename);
void read_map(std::map<uint64_t, std::string> &karta, const char* filename);
}