trim Entry struct

Hieu Hoang 2016-01-21 17:55:47 +00:00
parent 7e4cfb6416
commit 23dbfb0f27
6 changed files with 6 additions and 730 deletions

View File

@ -71,7 +71,6 @@ alias deps : ../../..//z ../../..//boost_iostreams ../../..//boost_filesystem .
legacy/Util2.cpp
legacy/ProbingPT/hash.cpp
legacy/ProbingPT/huffmanish.cpp
legacy/ProbingPT/line_splitter.cpp
legacy/ProbingPT/probing_hash_utils.cpp
legacy/ProbingPT/quering.cpp

View File

@ -1,566 +0,0 @@
#include "huffmanish.hh"
namespace Moses2
{
Huffman::Huffman (const char * filepath)
{
//Read the file
util::FilePiece filein(filepath);
//Init uniq_lines to zero;
uniq_lines = 0;
line_text prev_line; //Check for unique lines.
int num_lines = 0 ;
while (true) {
line_text new_line;
num_lines++;
try {
//Process line read
new_line = splitLine(filein.ReadLine());
count_elements(new_line); //Counts the number of elements, adds new and increments counters.
} catch (const util::EndOfFileException &e) {
std::cerr << "Unique entries counted: ";
break;
}
if (new_line.source_phrase == prev_line.source_phrase) {
continue;
} else {
uniq_lines++;
prev_line = new_line;
}
}
std::cerr << uniq_lines << std::endl;
}
void Huffman::count_elements(const line_text &linein)
{
//For target phrase:
util::TokenIter<util::SingleCharacter> it(linein.target_phrase, util::SingleCharacter(' '));
while (it) {
//Check if we have that entry
std::map<std::string, unsigned int>::iterator mapiter;
mapiter = target_phrase_words.find(it->as_string());
if (mapiter != target_phrase_words.end()) {
//If the element is found, increment the count.
mapiter->second++;
} else {
//Else create a new entry;
target_phrase_words.insert(std::pair<std::string, unsigned int>(it->as_string(), 1));
}
it++;
}
//For word alignment 1
std::map<std::vector<unsigned char>, unsigned int>::iterator mapiter3;
std::vector<unsigned char> numbers = splitWordAll1(linein.word_align);
mapiter3 = word_all1.find(numbers);
if (mapiter3 != word_all1.end()) {
//If the element is found, increment the count.
mapiter3->second++;
} else {
//Else create a new entry;
word_all1.insert(std::pair<std::vector<unsigned char>, unsigned int>(numbers, 1));
}
}
//Assigns huffman values for each unique element
void Huffman::assign_values()
{
//First create vectors for all maps so that we could sort them later.
//Create a vector for target phrases
for(std::map<std::string, unsigned int>::iterator it = target_phrase_words.begin(); it != target_phrase_words.end(); it++ ) {
target_phrase_words_counts.push_back(*it);
}
//Sort it
std::sort(target_phrase_words_counts.begin(), target_phrase_words_counts.end(), sort_pair());
//Create a vector for word alignments 1
for(std::map<std::vector<unsigned char>, unsigned int>::iterator it = word_all1.begin(); it != word_all1.end(); it++ ) {
word_all1_counts.push_back(*it);
}
//Sort it
std::sort(word_all1_counts.begin(), word_all1_counts.end(), sort_pair_vec());
//Afterwards we assign a value for each phrase, starting from 1, as zero is reserved for the delimiter
unsigned int i = 1; //huffman code
for(std::vector<std::pair<std::string, unsigned int> >::iterator it = target_phrase_words_counts.begin();
it != target_phrase_words_counts.end(); it++) {
target_phrase_huffman.insert(std::pair<std::string, unsigned int>(it->first, i));
i++; //Go to the next huffman code
}
i = 1; //Reset i for the next map
for(std::vector<std::pair<std::vector<unsigned char>, unsigned int> >::iterator it = word_all1_counts.begin();
it != word_all1_counts.end(); it++) {
word_all1_huffman.insert(std::pair<std::vector<unsigned char>, unsigned int>(it->first, i));
i++; //Go to the next huffman code
}
//After the lookups are produced, free the memory of objects that are no longer needed.
target_phrase_words.clear();
word_all1.clear();
target_phrase_words_counts.clear();
word_all1_counts.clear();
std::cerr << "Finished generating huffman codes." << std::endl;
}
void Huffman::serialize_maps(const char * dirname)
{
//Note that the directory must already exist.
std::string basedir(dirname);
std::string target_phrase_path(basedir + "/target_phrases");
std::string probabilities_path(basedir + "/probs");
std::string word_all1_path(basedir + "/Wall1");
//Target phrase
std::ofstream os (target_phrase_path.c_str(), std::ios::binary);
boost::archive::text_oarchive oarch(os);
oarch << lookup_target_phrase;
os.close();
//Word all1
std::ofstream os2 (word_all1_path.c_str(), std::ios::binary);
boost::archive::text_oarchive oarch2(os2);
oarch2 << lookup_word_all1;
os2.close();
}
std::vector<unsigned char> Huffman::full_encode_line(line_text &line, bool log_prob)
{
return vbyte_encode_line((encode_line(line, log_prob)));
}
//! make sure score doesn't fall below LOWEST_SCORE
inline float FloorScore(float logScore)
{
const float LOWEST_SCORE = -100.0f;
return (std::max)(logScore , LOWEST_SCORE);
}
std::vector<unsigned int> Huffman::encode_line(line_text &line, bool log_prob)
{
std::vector<unsigned int> retvector;
//Get target_phrase first.
util::TokenIter<util::SingleCharacter> it(line.target_phrase, util::SingleCharacter(' '));
while (it) {
retvector.push_back(target_phrase_huffman.find(it->as_string())->second);
it++;
}
//Add a zero;
retvector.push_back(0);
//Get probabilities, reinterpreting the float bytes as an unsigned int.
util::TokenIter<util::SingleCharacter> probit(line.prob, util::SingleCharacter(' '));
while (probit) {
//Some values are too large to handle as float, so parse as a double first
double tempnum = atof(probit->data());
float num = (float)tempnum;
if (log_prob) {
num = FloorScore(log(num));
if (num == 0.0f) num = 0.0000000001;
}
//cerr << "num=" << num << endl;
retvector.push_back(reinterpret_float(&num));
probit++;
}
// append LexRO prob to pt scores
AppendLexRO(line, retvector, log_prob);
//Add a zero;
retvector.push_back(0);
//Get word alignments
retvector.push_back(word_all1_huffman.find(splitWordAll1(line.word_align))->second);
retvector.push_back(0);
//The remaining components might not be present, but add them anyway (reinterpreted as a byte array).
//In the future these should really be made optional to save space.
//Counts
const char* counts = line.counts.data();
size_t counts_size = line.counts.size();
for (size_t i = 0; i < counts_size; i++) {
retvector.push_back(counts[i]);
}
retvector.push_back(0);
//Sparse score
const char* sparse_score = line.sparse_score.data();
size_t sparse_score_size = line.sparse_score.size();
for (size_t i = 0; i < sparse_score_size; i++) {
retvector.push_back(sparse_score[i]);
}
retvector.push_back(0);
//Property
const char* property = line.property_to_be_binarized.data();
size_t property_size = line.property_to_be_binarized.size();
for (size_t i = 0; i < property_size; i++) {
retvector.push_back(property[i]);
}
retvector.push_back(0);
return retvector;
}
void Huffman::AppendLexRO(line_text &line, std::vector<unsigned int> &retvector, bool log_prob) const
{
const StringPiece &origProperty = line.property;
StringPiece::size_type startPos = origProperty.find("{{LexRO ");
if (startPos != StringPiece::npos) {
StringPiece::size_type endPos = origProperty.find("}}", startPos + 8);
StringPiece lexProb = origProperty.substr(startPos + 8, endPos - startPos - 8);
//cerr << "lexProb=" << lexProb << endl;
// append lex probs to pt probs
util::TokenIter<util::SingleCharacter> it(lexProb, util::SingleCharacter(' '));
while (it) {
StringPiece probStr = *it;
//cerr << "\t" << probStr << endl;
double tempnum = atof(probStr.data());
float num = (float)tempnum;
if (log_prob) {
num = FloorScore(log(num));
if (num == 0.0f) num = 0.0000000001;
}
retvector.push_back(reinterpret_float(&num));
// exclude LexRO property from property column
line.property_to_be_binarized = origProperty.substr(0, startPos).as_string()
+ origProperty.substr(endPos + 2, origProperty.size() - endPos - 2).as_string();
//cerr << "line.property_to_be_binarized=" << line.property_to_be_binarized << "AAAA" << endl;
it++;
}
}
}
void Huffman::produce_lookups()
{
//basically invert every map that we have
for(std::map<std::string, unsigned int>::iterator it = target_phrase_huffman.begin(); it != target_phrase_huffman.end(); it++ ) {
lookup_target_phrase.insert(std::pair<unsigned int, std::string>(it->second, it->first));
}
for(std::map<std::vector<unsigned char>, unsigned int>::iterator it = word_all1_huffman.begin(); it != word_all1_huffman.end(); it++ ) {
lookup_word_all1.insert(std::pair<unsigned int, std::vector<unsigned char> >(it->second, it->first));
}
}
HuffmanDecoder::HuffmanDecoder (const char * dirname)
{
//Read the maps from disk
//Note that the directory must already exist.
std::string basedir(dirname);
std::string target_phrase_path(basedir + "/target_phrases");
std::string word_all1_path(basedir + "/Wall1");
//Target phrases
std::ifstream is (target_phrase_path.c_str(), std::ios::binary);
boost::archive::text_iarchive iarch(is);
iarch >> lookup_target_phrase;
is.close();
//Word alignment 1
std::ifstream is2 (word_all1_path.c_str(), std::ios::binary);
boost::archive::text_iarchive iarch2(is2);
iarch2 >> lookup_word_all1;
is2.close();
}
HuffmanDecoder::HuffmanDecoder (const std::map<unsigned int, std::string> &lookup_target,
const std::map<unsigned int, std::vector<unsigned char> > &lookup_word1)
{
lookup_target_phrase = lookup_target;
lookup_word_all1 = lookup_word1;
}
std::vector<target_text*> HuffmanDecoder::full_decode_line (unsigned char lines[],
size_t linesCount,
int num_scores,
int num_lex_scores,
RecycleData &recycler)
{
std::vector<target_text*> retvector; //All target phrases
std::vector<unsigned int> *decoded_lines = vbyte_decode_line(lines, linesCount, recycler); //All decoded lines
std::vector<unsigned int>::iterator it = decoded_lines->begin(); //Iterator for them
std::vector<unsigned int> current_target_phrase; //Current target phrase decoded
short zero_count = 0; //Count how many zeroes we have seen so far. Every 6 zeroes mark the end of one target-phrase entry.
while(it != decoded_lines->end()) {
if (zero_count == 1) {
//We are extracting scores. We know how many scores there are, so we can push them
//to the vector directly. This is done in case any of the scores is 0, which would otherwise mess
//up the state machine.
for (int i = 0; i < num_scores + num_lex_scores; i++) {
current_target_phrase.push_back(*it);
it++;
}
}
if (zero_count == 6) {
//We have finished with this entry, decode it, and add it to the retvector.
retvector.push_back(decode_line(current_target_phrase, num_scores, num_lex_scores, recycler));
current_target_phrase.clear(); //Clear the current target phrase and the zero_count
zero_count = 0; //So that we can reuse them for the next target phrase
}
//Add to the current target phrase, number by number.
current_target_phrase.push_back(*it);
if (*it == 0) {
zero_count++;
}
it++; //Go to the next word/symbol
}
//Don't forget the last remaining line!
if (zero_count == 6) {
//We have finished with this entry, decode it, and add it to the retvector.
retvector.push_back(decode_line(current_target_phrase, num_scores, num_lex_scores, recycler));
current_target_phrase.clear(); //Clear the current target phrase and the zero_count
zero_count = 0; //So that we can reuse them for the next target phrase
}
recycler.huffman_line.push_back(decoded_lines);
return retvector;
}
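As an aid to reading encode_line and full_decode_line, the sketch below (not part of the original file) builds one entry in the zero-delimited layout they share: six 0-terminated fields per target phrase.
#include <vector>
// Toy example of the record layout: six fields, each closed by a 0,
// which is why zero_count == 6 marks the end of one target-phrase entry.
inline std::vector<unsigned int> toy_encoded_entry()
{
  std::vector<unsigned int> v;
  v.push_back(42); v.push_back(7); v.push_back(0); //target-word ids, then delimiter
  v.push_back(0x3f000000u); v.push_back(0);        //score(s) as reinterpreted float bits (0x3f000000 is 0.5f), incl. LexRO, then delimiter
  v.push_back(3); v.push_back(0);                  //word-alignment id, then delimiter
  v.push_back(0);                                  //counts bytes (empty here), delimiter
  v.push_back(0);                                  //sparse-score bytes (empty here), delimiter
  v.push_back(0);                                  //property bytes (empty here), delimiter
  return v;                                        //six zeroes total: one complete entry
}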
target_text *HuffmanDecoder::decode_line (const std::vector<unsigned int> &input,
int num_scores,
int num_lex_scores,
Moses2::RecycleData &recycler)
{
//demo decoder
target_text *ret;
if (recycler.tt.empty()) {
ret = new target_text;
}
else {
ret = recycler.tt.back();
recycler.tt.pop_back();
ret->Reset();
}
ret->prob.reserve(num_scores);
//Split everything
unsigned int wAll = 1;
//Split the line into the proper arrays
short num_zeroes = 0;
int counter = 0;
while (num_zeroes < 6) {
unsigned int num = input[counter];
if (num == 0) {
num_zeroes++;
} else if (num_zeroes == 0) {
ret->target_phrase.push_back(num);
} else if (num_zeroes == 1) {
//Push exactly num_scores + num_lex_scores scores
for (int i = 0; i < num_scores + num_lex_scores; i++) {
float prob = reinterpret_uint(&num);
ret->prob.push_back(prob);
counter++;
num = input[counter];
}
continue;
} else if (num_zeroes == 2) {
wAll = num;
} else if (num_zeroes == 3) {
ret->counts.push_back(static_cast<char>(input[counter]));
} else if (num_zeroes == 4) {
ret->sparse_score.push_back(static_cast<char>(input[counter]));
} else if (num_zeroes == 5) {
ret->property.push_back(static_cast<char>(input[counter]));
}
counter++;
}
ret->word_all1 = lookup_word_all1.find(wAll)->second;
return ret;
}
inline const std::string &HuffmanDecoder::getTargetWordFromID(unsigned int id)
{
return lookup_target_phrase.find(id)->second;
}
std::string HuffmanDecoder::getTargetWordsFromIDs(const std::vector<unsigned int> &ids)
{
std::string returnstring;
for (std::vector<unsigned int>::const_iterator it = ids.begin(); it != ids.end(); it++) {
returnstring.append(getTargetWordFromID(*it) + " ");
}
return returnstring;
}
inline const std::string &getTargetWordFromID(unsigned int id, const std::map<unsigned int, std::string> &lookup_target_phrase)
{
return lookup_target_phrase.find(id)->second;
}
std::string getTargetWordsFromIDs(const std::vector<unsigned int> &ids, const std::map<unsigned int, std::string> &lookup_target_phrase)
{
std::string returnstring;
for (std::vector<unsigned int>::const_iterator it = ids.begin(); it != ids.end(); it++) {
returnstring.append(getTargetWordFromID(*it, lookup_target_phrase) + " ");
}
return returnstring;
}
/*These functions make it easier to store the floats in the binary phrase table.
We reinterpret the float as an unsigned int so that it is handled like our other values and we can
apply variable byte encoding on top of it.*/
inline unsigned int reinterpret_float(float * num)
{
unsigned int * converted_num;
converted_num = reinterpret_cast<unsigned int *>(num);
return *converted_num;
}
inline float reinterpret_uint(unsigned int * num)
{
float * converted_num;
converted_num = reinterpret_cast<float *>(num);
return *converted_num;
}
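The two helpers above reinterpret through a casted pointer, which formally runs into C++ strict-aliasing rules. A memcpy-based equivalent (not part of the original file, and assuming sizeof(float) == sizeof(unsigned int)) would look like this:
#include <cstring>
// Alternative sketch: copy the raw bytes instead of aliasing the pointer.
inline unsigned int float_bits(float num)
{
  unsigned int out;
  std::memcpy(&out, &num, sizeof(out));
  return out;
}
inline float bits_to_float(unsigned int num)
{
  float out;
  std::memcpy(&out, &num, sizeof(out));
  return out;
}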
/*Mostly taken from stackoverflow, http://stackoverflow.com/questions/5858646/optimizing-variable-length-encoding
and modified in order to return a vector of chars. Implements ULEB128, also known as variable byte encoding.
This is a highly optimized version with an unrolled loop. */
inline std::vector<unsigned char> vbyte_encode(unsigned int num)
{
//Determine how many bytes we are going to take.
short size;
std::vector<unsigned char> byte_vector;
if (num < 0x00000080U) {
size = 1;
byte_vector.reserve(size);
goto b1;
}
if (num < 0x00004000U) {
size = 2;
byte_vector.reserve(size);
goto b2;
}
if (num < 0x00200000U) {
size = 3;
byte_vector.reserve(size);
goto b3;
}
if (num < 0x10000000U) {
size = 4;
byte_vector.reserve(size);
goto b4;
}
size = 5;
byte_vector.reserve(size);
//Now proceed with the encoding.
byte_vector.push_back((num & 0x7f) | 0x80);
num >>= 7;
b4:
byte_vector.push_back((num & 0x7f) | 0x80);
num >>= 7;
b3:
byte_vector.push_back((num & 0x7f) | 0x80);
num >>= 7;
b2:
byte_vector.push_back((num & 0x7f) | 0x80);
num >>= 7;
b1:
byte_vector.push_back(num);
return byte_vector;
}
std::vector<unsigned int> *vbyte_decode_line(unsigned char line[], size_t linesSize, RecycleData &recycler)
{
std::vector<unsigned int> *huffman_line;
if (recycler.huffman_line.empty()) {
huffman_line = new std::vector<unsigned int>();
}
else {
huffman_line = recycler.huffman_line.back();
recycler.huffman_line.pop_back();
huffman_line->clear();
}
unsigned char current_num[linesSize];
size_t current_num_ind = 0;
for (size_t i = 0; i < linesSize; ++i) {
unsigned char c = line[i];
current_num[current_num_ind++] = c;
if ((c >> 7) != 1) {
//We don't have continuation in the next bit
huffman_line->push_back(bytes_to_int(current_num, current_num_ind));
current_num_ind = 0;
}
}
return huffman_line;
}
inline unsigned int bytes_to_int(unsigned char number[], size_t numberSize)
{
unsigned int retvalue = 0;
unsigned char shift = 0; //By how many bits to shift
for (size_t i = 0; i < numberSize; ++i) {
unsigned char c = number[i];
retvalue |= (c & 0x7f) << shift;
shift += 7;
}
return retvalue;
}
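A quick round-trip check of the two variable-byte helpers above, vbyte_encode and bytes_to_int (illustrative only, not part of the original file):
#include <cassert>
#include <vector>
inline void vbyte_roundtrip_check(unsigned int value)
{
  std::vector<unsigned char> bytes = vbyte_encode(value); //e.g. 300 encodes to 0xAC 0x02
  unsigned int decoded = bytes_to_int(&bytes[0], bytes.size());
  assert(decoded == value);
}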
std::vector<unsigned char> vbyte_encode_line(const std::vector<unsigned int> &line)
{
std::vector<unsigned char> retvec;
//For each unsigned int in the line, vbyte encode it and add it to a vector of unsigned chars.
for (std::vector<unsigned int>::const_iterator it = line.begin(); it != line.end(); it++) {
std::vector<unsigned char> vbyte_encoded = vbyte_encode(*it);
retvec.insert(retvec.end(), vbyte_encoded.begin(), vbyte_encoded.end());
}
return retvec;
}
}

View File

@ -1,150 +0,0 @@
#pragma once
//Huffman encodes a line and also produces the vocabulary ids
#include "hash.hh"
#include "line_splitter.hh"
#include <cstdio>
#include <fstream>
#include <iostream>
#include <sstream>
#include <deque>
#include <boost/serialization/serialization.hpp>
#include <boost/serialization/vector.hpp>
#include <boost/serialization/map.hpp>
#include <boost/archive/text_iarchive.hpp>
#include <boost/archive/text_oarchive.hpp>
#include <boost/foreach.hpp>
namespace Moses2
{
////////////////////////////////////////////////////////////////
class RecycleData
{
public:
std::deque<target_text*> tt;
std::deque<std::vector<unsigned int>*> huffman_line;
~RecycleData()
{
BOOST_FOREACH (const target_text *obj, tt) {
delete obj;
}
BOOST_FOREACH (const std::vector<unsigned int> *obj, huffman_line) {
delete obj;
}
}
};
////////////////////////////////////////////////////////////////
//Sort pairs by their second element (the count), biggest first
struct sort_pair {
bool operator()(const std::pair<std::string, unsigned int> &left, const std::pair<std::string, unsigned int> &right) {
return left.second > right.second; //This puts biggest numbers first.
}
};
struct sort_pair_vec {
bool operator()(const std::pair<std::vector<unsigned char>, unsigned int> &left, const std::pair<std::vector<unsigned char>, unsigned int> &right) {
return left.second > right.second; //This puts biggest numbers first.
}
};
class Huffman
{
unsigned long uniq_lines; //Unique lines in the file.
//Containers used when counting the occurrence of a given phrase
std::map<std::string, unsigned int> target_phrase_words;
std::map<std::vector<unsigned char>, unsigned int> word_all1;
//Same containers as vectors, for sorting
std::vector<std::pair<std::string, unsigned int> > target_phrase_words_counts;
std::vector<std::pair<std::vector<unsigned char>, unsigned int> > word_all1_counts;
//Huffman maps
std::map<std::string, unsigned int> target_phrase_huffman;
std::map<std::vector<unsigned char>, unsigned int> word_all1_huffman;
//inverted maps
std::map<unsigned int, std::string> lookup_target_phrase;
std::map<unsigned int, std::vector<unsigned char> > lookup_word_all1;
public:
Huffman (const char *);
void count_elements (const line_text &line);
void assign_values();
void serialize_maps(const char * dirname);
void produce_lookups();
std::vector<unsigned int> encode_line(line_text &line, bool log_prob);
//encode line + variable byte encoding on top
std::vector<unsigned char> full_encode_line(line_text &line, bool log_prob);
//Getters
const std::map<unsigned int, std::string> get_target_lookup_map() const {
return lookup_target_phrase;
}
const std::map<unsigned int, std::vector<unsigned char> > get_word_all1_lookup_map() const {
return lookup_word_all1;
}
unsigned long getUniqLines() {
return uniq_lines;
}
void AppendLexRO(line_text &line, std::vector<unsigned int> &retvector, bool log_prob) const;
};
class HuffmanDecoder
{
std::map<unsigned int, std::string> lookup_target_phrase;
std::map<unsigned int, std::vector<unsigned char> > lookup_word_all1;
public:
HuffmanDecoder (const char *);
HuffmanDecoder (const std::map<unsigned int, std::string> &, const std::map<unsigned int, std::vector<unsigned char> > &);
//Getters
const std::map<unsigned int, std::string> &get_target_lookup_map() const {
return lookup_target_phrase;
}
const std::map<unsigned int, std::vector<unsigned char> > &get_word_all1_lookup_map() const {
return lookup_word_all1;
}
inline const std::string &getTargetWordFromID(unsigned int id);
std::string getTargetWordsFromIDs(const std::vector<unsigned int> &ids);
target_text *decode_line (const std::vector<unsigned int> &input,
int num_scores,
int num_lex_scores,
RecycleData &recycler);
//Variable-byte decodes all target phrases contained here and then passes them to decode_line
std::vector<target_text*> full_decode_line (unsigned char lines[],
size_t linesCount,
int num_scores,
int num_lex_scores,
RecycleData &recycler);
};
std::string getTargetWordsFromIDs(const std::vector<unsigned int> &ids, const std::map<unsigned int, std::string> &lookup_target_phrase);
inline const std::string &getTargetWordFromID(unsigned int id, const std::map<unsigned int, std::string> &lookup_target_phrase);
inline unsigned int reinterpret_float(float * num);
inline float reinterpret_uint(unsigned int * num);
std::vector<unsigned char> vbyte_encode_line(const std::vector<unsigned int> &line);
inline std::vector<unsigned char> vbyte_encode(unsigned int num);
std::vector<unsigned int> *vbyte_decode_line(unsigned char line[], size_t linesSize, RecycleData &recycler);
inline unsigned int bytes_to_int(unsigned char number[], size_t numberSize);
}

View File

@ -10,13 +10,12 @@
namespace Moses2
{
#define API_VERSION 8
#define API_VERSION 9
//Hash table entry
struct Entry {
uint64_t key;
typedef uint64_t Key;
unsigned int bytes_toread;
uint64_t GetKey() const {
return key;
@ -26,12 +25,7 @@ struct Entry {
key = to;
}
uint64_t GetValue() const {
return value;
}
uint64_t value;
uint64_t targetInd;
};
//Define table
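For reference, after this commit the hash-table entry reduces to roughly the following. This is a sketch reconstructed from the diff context above and the callers below; member details such as the setter signature may differ slightly from the real header.
#include <stdint.h>
struct Entry {
  typedef uint64_t Key;
  uint64_t key;    //hash of the source phrase
  uint64_t value;  //now carries the target index (previously stored in a separate targetInd field)

  uint64_t GetKey() const {
    return key;
  }
  void SetKey(Key to) {
    key = to;
  }
};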

View File

@ -94,7 +94,9 @@ std::pair<bool, uint64_t> QueryEngine::query(uint64_t key)
const Entry * entry;
ret.first = table.Find(key, entry);
ret.second = entry->targetInd;
if (ret.first) {
ret.second = entry->value;
}
return ret;
}
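With that change, a caller of QueryEngine::query reads the second member only on a hit. A hypothetical usage sketch (queryEngine and sourceKey are placeholder names, not from the source):
std::pair<bool, uint64_t> res = queryEngine.query(sourceKey);
if (res.first) {
  uint64_t targetInd = res.second; //index of this source phrase's block of target phrases
  //... look up the target phrases starting at targetInd ...
}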

View File

@ -42,7 +42,6 @@ void createProbingPT(
float totalSourceCount = 0;
//Keep track of the size of each group of target phrases
uint64_t entrystartidx = 0;
size_t line_num = 0;
//Read everything and process
@ -83,8 +82,7 @@ void createProbingPT(
//Create an entry for the previous source phrase:
Entry pesho;
pesho.value = entrystartidx;
pesho.targetInd = targetInd;
pesho.value = targetInd;
//The key is the sum of hashes of individual words, bitshifted by their position in the phrase.
//Probably not entirely correct, but fast and seems to work fine in practice.
pesho.key = 0;
@ -128,8 +126,7 @@ void createProbingPT(
uint64_t targetInd = storeTarget.Save();
Entry pesho;
pesho.value = entrystartidx;
pesho.targetInd = targetInd;
pesho.value = targetInd;
//The key is the sum of hashes of individual words. Probably not entirely correct, but fast
pesho.key = 0;
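A minimal sketch of the key scheme that comment describes: hash each word and shift the hash by the word's position before summing. std::hash is used here only as a stand-in for whatever hash function the real code uses, so the values will not match the actual table.
#include <functional>
#include <string>
#include <vector>
#include <stdint.h>
inline uint64_t phrase_key(const std::vector<std::string> &words)
{
  uint64_t key = 0;
  std::hash<std::string> hasher;
  for (size_t i = 0; i < words.size(); ++i) {
    key += static_cast<uint64_t>(hasher(words[i])) << i; //position-dependent shift
  }
  return key;
}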