mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-30 15:34:01 +03:00
trim Entry struct
This commit is contained in:
parent
7e4cfb6416
commit
23dbfb0f27
@ -71,7 +71,6 @@ alias deps : ../../..//z ../../..//boost_iostreams ../../..//boost_filesystem .
|
||||
legacy/Util2.cpp
|
||||
|
||||
legacy/ProbingPT/hash.cpp
|
||||
legacy/ProbingPT/huffmanish.cpp
|
||||
legacy/ProbingPT/line_splitter.cpp
|
||||
legacy/ProbingPT/probing_hash_utils.cpp
|
||||
legacy/ProbingPT/quering.cpp
|
||||
|
@ -1,566 +0,0 @@
|
||||
#include "huffmanish.hh"

#include <cstring>
|
||||
|
||||
namespace Moses2
|
||||
{
|
||||
|
||||
//Reads the phrase-table file at filepath and counts symbol occurrences
//plus the number of unique source phrases.  Consecutive lines sharing a
//source phrase count as one unique entry, so the input is assumed to be
//grouped by source phrase.
Huffman::Huffman (const char * filepath)
{
  //Read the file
  util::FilePiece filein(filepath);

  //Init uniq_lines to zero;
  uniq_lines = 0;

  line_text prev_line; //Check for unique lines.
  int num_lines = 0 ;

  while (true) {
    line_text new_line;

    num_lines++;

    try {
      //Process line read
      new_line = splitLine(filein.ReadLine());
      count_elements(new_line); //Counts the number of elements, adds new and increments counters.

    } catch (const util::EndOfFileException &) {
      //Catch by const reference (was by value): avoids copying/slicing the
      //exception object.  End of file is the normal loop exit.
      std::cerr << "Unique entries counted: ";
      break;
    }

    if (new_line.source_phrase == prev_line.source_phrase) {
      continue;
    } else {
      uniq_lines++;
      prev_line = new_line;
    }
  }

  std::cerr << uniq_lines << std::endl;
}
|
||||
|
||||
void Huffman::count_elements(const line_text &linein)
|
||||
{
|
||||
//For target phrase:
|
||||
util::TokenIter<util::SingleCharacter> it(linein.target_phrase, util::SingleCharacter(' '));
|
||||
while (it) {
|
||||
//Check if we have that entry
|
||||
std::map<std::string, unsigned int>::iterator mapiter;
|
||||
mapiter = target_phrase_words.find(it->as_string());
|
||||
|
||||
if (mapiter != target_phrase_words.end()) {
|
||||
//If the element is found, increment the count.
|
||||
mapiter->second++;
|
||||
} else {
|
||||
//Else create a new entry;
|
||||
target_phrase_words.insert(std::pair<std::string, unsigned int>(it->as_string(), 1));
|
||||
}
|
||||
it++;
|
||||
}
|
||||
|
||||
//For word allignment 1
|
||||
std::map<std::vector<unsigned char>, unsigned int>::iterator mapiter3;
|
||||
std::vector<unsigned char> numbers = splitWordAll1(linein.word_align);
|
||||
mapiter3 = word_all1.find(numbers);
|
||||
|
||||
if (mapiter3 != word_all1.end()) {
|
||||
//If the element is found, increment the count.
|
||||
mapiter3->second++;
|
||||
} else {
|
||||
//Else create a new entry;
|
||||
word_all1.insert(std::pair<std::vector<unsigned char>, unsigned int>(numbers, 1));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
//Assigns huffman values for each unique element
|
||||
void Huffman::assign_values()
|
||||
{
|
||||
//First create vectors for all maps so that we could sort them later.
|
||||
|
||||
//Create a vector for target phrases
|
||||
for(std::map<std::string, unsigned int>::iterator it = target_phrase_words.begin(); it != target_phrase_words.end(); it++ ) {
|
||||
target_phrase_words_counts.push_back(*it);
|
||||
}
|
||||
//Sort it
|
||||
std::sort(target_phrase_words_counts.begin(), target_phrase_words_counts.end(), sort_pair());
|
||||
|
||||
//Create a vector for word allignments 1
|
||||
for(std::map<std::vector<unsigned char>, unsigned int>::iterator it = word_all1.begin(); it != word_all1.end(); it++ ) {
|
||||
word_all1_counts.push_back(*it);
|
||||
}
|
||||
//Sort it
|
||||
std::sort(word_all1_counts.begin(), word_all1_counts.end(), sort_pair_vec());
|
||||
|
||||
|
||||
//Afterwards we assign a value for each phrase, starting from 1, as zero is reserved for delimiter
|
||||
unsigned int i = 1; //huffman code
|
||||
for(std::vector<std::pair<std::string, unsigned int> >::iterator it = target_phrase_words_counts.begin();
|
||||
it != target_phrase_words_counts.end(); it++) {
|
||||
target_phrase_huffman.insert(std::pair<std::string, unsigned int>(it->first, i));
|
||||
i++; //Go to the next huffman code
|
||||
}
|
||||
|
||||
i = 1; //Reset i for the next map
|
||||
for(std::vector<std::pair<std::vector<unsigned char>, unsigned int> >::iterator it = word_all1_counts.begin();
|
||||
it != word_all1_counts.end(); it++) {
|
||||
word_all1_huffman.insert(std::pair<std::vector<unsigned char>, unsigned int>(it->first, i));
|
||||
i++; //Go to the next huffman code
|
||||
}
|
||||
|
||||
//After lookups are produced, clear some memory usage of objects not needed anymore.
|
||||
target_phrase_words.clear();
|
||||
word_all1.clear();
|
||||
|
||||
target_phrase_words_counts.clear();
|
||||
word_all1_counts.clear();
|
||||
|
||||
std::cerr << "Finished generating huffman codes." << std::endl;
|
||||
|
||||
}
|
||||
|
||||
//Writes both inverted lookup maps to disk as Boost text archives, one
//file per map, inside the given directory.
void Huffman::serialize_maps(const char * dirname)
{
  //Note that directory name should exist.
  std::string basedir(dirname);
  std::string target_phrase_path(basedir + "/target_phrases");
  std::string probabilities_path(basedir + "/probs"); //NOTE(review): built but never used in this function.
  std::string word_all1_path(basedir + "/Wall1");

  //Target phrase
  std::ofstream os (target_phrase_path.c_str(), std::ios::binary);
  boost::archive::text_oarchive oarch(os);
  oarch << lookup_target_phrase;
  os.close();

  //Word all1
  std::ofstream os2 (word_all1_path.c_str(), std::ios::binary);
  boost::archive::text_oarchive oarch2(os2);
  oarch2 << lookup_word_all1;
  os2.close();
}
|
||||
|
||||
std::vector<unsigned char> Huffman::full_encode_line(line_text &line, bool log_prob)
|
||||
{
|
||||
return vbyte_encode_line((encode_line(line, log_prob)));
|
||||
}
|
||||
|
||||
//! make sure score doesn't fall below LOWEST_SCORE
inline float FloorScore(float logScore)
{
  const float LOWEST_SCORE = -100.0f;
  if (logScore < LOWEST_SCORE) {
    return LOWEST_SCORE;
  }
  return logScore;
}
|
||||
|
||||
//Encodes one phrase-table line as a stream of unsigned ints.  Fields are
//separated by a 0 (which is why huffman codes start at 1):
//  target-phrase word codes, 0,
//  probability float bits (incl. appended LexRO probs), 0,
//  word-alignment code, 0,
//  counts bytes, 0, sparse-score bytes, 0, property bytes, 0.
std::vector<unsigned int> Huffman::encode_line(line_text &line, bool log_prob)
{
  std::vector<unsigned int> retvector;

  //Get target_phrase first.
  util::TokenIter<util::SingleCharacter> it(line.target_phrase, util::SingleCharacter(' '));
  while (it) {
    retvector.push_back(target_phrase_huffman.find(it->as_string())->second);
    it++;
  }
  //Add a zero;
  retvector.push_back(0);

  //Get probabilities. Reinterpreting the float bytes as unsigned int.
  util::TokenIter<util::SingleCharacter> probit(line.prob, util::SingleCharacter(' '));
  while (probit) {
    //Sometimes we have too big floats to handle, so first convert to double
    double tempnum = atof(probit->data());
    float num = (float)tempnum;
    if (log_prob) {
      num = FloorScore(log(num));
      //A score whose bit pattern is exactly 0 would be read as a field
      //delimiter by the decoder, so nudge it to a tiny non-zero value.
      if (num == 0.0f) num = 0.0000000001;
    }
    //cerr << "num=" << num << endl;
    retvector.push_back(reinterpret_float(&num));
    probit++;
  }

  // append LexRO prob to pt scores
  AppendLexRO(line, retvector, log_prob);

  //Add a zero;
  retvector.push_back(0);

  //Get Word allignments
  retvector.push_back(word_all1_huffman.find(splitWordAll1(line.word_align))->second);
  retvector.push_back(0);

  //The rest of the components might not be there, but add them (as reinterpretation to byte arr)
  //In the future we should really make those optional to save space

  //Counts: raw bytes of the counts column, one int per byte.
  const char* counts = line.counts.data();
  size_t counts_size = line.counts.size();
  for (size_t i = 0; i < counts_size; i++) {
    retvector.push_back(counts[i]);
  }
  retvector.push_back(0);

  //Sparse score: raw bytes, one int per byte.
  const char* sparse_score = line.sparse_score.data();
  size_t sparse_score_size = line.sparse_score.size();
  for (size_t i = 0; i < sparse_score_size; i++) {
    retvector.push_back(sparse_score[i]);
  }
  retvector.push_back(0);

  //Property: raw bytes of the property column with the LexRO block removed
  //by AppendLexRO above.
  const char* property = line.property_to_be_binarized.data();
  size_t property_size = line.property_to_be_binarized.size();
  for (size_t i = 0; i < property_size; i++) {
    retvector.push_back(property[i]);
  }
  retvector.push_back(0);

  return retvector;
}
|
||||
|
||||
//If the property column contains a "{{LexRO ...}}" block, appends its
//space-separated probabilities (as float bit patterns) to retvector and
//stores the property text with the block stripped in
//line.property_to_be_binarized.
void Huffman::AppendLexRO(line_text &line, std::vector<unsigned int> &retvector, bool log_prob) const
{
  const StringPiece &origProperty = line.property;
  StringPiece::size_type startPos = origProperty.find("{{LexRO ");

  if (startPos != StringPiece::npos) {
    StringPiece::size_type endPos = origProperty.find("}}", startPos + 8);
    StringPiece lexProb = origProperty.substr(startPos + 8, endPos - startPos - 8);
    //cerr << "lexProb=" << lexProb << endl;

    // append lex probs to pt probs
    util::TokenIter<util::SingleCharacter> it(lexProb, util::SingleCharacter(' '));
    while (it) {
      StringPiece probStr = *it;
      //cerr << "\t" << probStr << endl;

      //Sometimes we have too big floats to handle, so first convert to double
      double tempnum = atof(probStr.data());
      float num = (float)tempnum;
      if (log_prob) {
        num = FloorScore(log(num));
        if (num == 0.0f) num = 0.0000000001;
      }

      retvector.push_back(reinterpret_float(&num));
      it++;
    }

    // exclude LexRO property from property column.
    // Hoisted out of the token loop: it depends only on origProperty,
    // startPos and endPos, which are loop-invariant, so the original
    // rebuilt the same string once per probability token.  This also
    // strips an empty LexRO block, which the loop body never reached.
    line.property_to_be_binarized = origProperty.substr(0, startPos).as_string()
                                    + origProperty.substr(endPos + 2, origProperty.size() - endPos - 2).as_string();
    //cerr << "line.property_to_be_binarized=" << line.property_to_be_binarized << "AAAA" << endl;
  }
}
|
||||
|
||||
void Huffman::produce_lookups()
|
||||
{
|
||||
//basically invert every map that we have
|
||||
for(std::map<std::string, unsigned int>::iterator it = target_phrase_huffman.begin(); it != target_phrase_huffman.end(); it++ ) {
|
||||
lookup_target_phrase.insert(std::pair<unsigned int, std::string>(it->second, it->first));
|
||||
}
|
||||
|
||||
for(std::map<std::vector<unsigned char>, unsigned int>::iterator it = word_all1_huffman.begin(); it != word_all1_huffman.end(); it++ ) {
|
||||
lookup_word_all1.insert(std::pair<unsigned int, std::vector<unsigned char> >(it->second, it->first));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
//Loads both code -> symbol lookup maps from the Boost text archives that
//Huffman::serialize_maps() wrote into the given directory.
HuffmanDecoder::HuffmanDecoder (const char * dirname)
{
  //Read the maps from disk

  //Note that directory name should exist.
  std::string basedir(dirname);
  std::string target_phrase_path(basedir + "/target_phrases");
  std::string word_all1_path(basedir + "/Wall1");

  //Target phrases
  std::ifstream is (target_phrase_path.c_str(), std::ios::binary);
  boost::archive::text_iarchive iarch(is);
  iarch >> lookup_target_phrase;
  is.close();

  //Word allignment 1
  std::ifstream is2 (word_all1_path.c_str(), std::ios::binary);
  boost::archive::text_iarchive iarch2(is2);
  iarch2 >> lookup_word_all1;
  is2.close();
}
|
||||
|
||||
//Constructs a decoder from already-loaded lookup maps (e.g. the maps
//returned by Huffman's getters).  Both maps are copied.
HuffmanDecoder::HuffmanDecoder (const std::map<unsigned int, std::string> &lookup_target,
                                const std::map<unsigned int, std::vector<unsigned char> > &lookup_word1)
  : lookup_target_phrase(lookup_target),
    lookup_word_all1(lookup_word1)
{
  //Member initializer list copy-constructs the maps directly, instead of
  //default-constructing them and then copy-assigning in the body.
}
|
||||
|
||||
//Decodes a variable-byte-encoded buffer holding several concatenated
//entries into target_text objects.  Each entry ends after six zero
//delimiters (see encode_line for the field layout).
std::vector<target_text*> HuffmanDecoder::full_decode_line (unsigned char lines[],
    size_t linesCount,
    int num_scores,
    int num_lex_scores,
    RecycleData &recycler)
{
  std::vector<target_text*> retvector; //All target phrases
  std::vector<unsigned int> *decoded_lines = vbyte_decode_line(lines, linesCount, recycler); //All decoded lines
  std::vector<unsigned int>::iterator it = decoded_lines->begin(); //Iterator for them
  std::vector<unsigned int> current_target_phrase; //Current target phrase decoded

  short zero_count = 0; //Count how many zeroes we have met so far. Every 6 zeroes mean a new target phrase.
  while(it != decoded_lines->end()) {
    if (zero_count == 1) {
      //We are extracting scores. we know how many scores there are so we can push them
      //to the vector. This is done in case any of the scores is 0, because it would mess
      //up the state machine.
      for (int i = 0; i < num_scores + num_lex_scores; i++) {
        current_target_phrase.push_back(*it);
        it++;
      }
    }

    if (zero_count == 6) {
      //We have finished with this entry, decode it, and add it to the retvector.
      retvector.push_back(decode_line(current_target_phrase, num_scores, num_lex_scores, recycler));
      current_target_phrase.clear(); //Clear the current target phrase and the zero_count
      zero_count = 0; //So that we can reuse them for the next target phrase
    }
    //Add to the next target_phrase, number by number.
    current_target_phrase.push_back(*it);
    if (*it == 0) {
      zero_count++;
    }
    it++; //Go to the next word/symbol
  }
  //Don't forget the last remaining line!
  if (zero_count == 6) {
    //We have finished with this entry, decode it, and add it to the retvector.
    retvector.push_back(decode_line(current_target_phrase, num_scores, num_lex_scores, recycler));
    current_target_phrase.clear(); //Clear the current target phrase and the zero_count
    zero_count = 0; //So that we can reuse them for the next target phrase
  }

  //Hand the scratch vector back to the pool for reuse.
  recycler.huffman_line.push_back(decoded_lines);

  return retvector;
}
|
||||
|
||||
//Decodes one zero-delimited entry (as produced by encode_line) into a
//target_text, reusing a pooled object from the recycler when available.
//Caller keeps ownership of the returned pointer (may return it to the pool).
target_text *HuffmanDecoder::decode_line (const std::vector<unsigned int> &input,
    int num_scores,
    int num_lex_scores,
    Moses2::RecycleData &recycler)
{
  //demo decoder
  target_text *ret;
  if (recycler.tt.empty()) {
    ret = new target_text;
  }
  else {
    ret = recycler.tt.back();
    recycler.tt.pop_back();

    ret->Reset();
  }

  ret->prob.reserve(num_scores);
  //Split everything
  unsigned int wAll = 1; //Word-alignment code; starts at 1 — presumably a valid default code, TODO confirm.

  //Split the line into the proper arrays.  num_zeroes tracks which field
  //we are in; each 0 in the input advances to the next field.
  short num_zeroes = 0;
  int counter = 0;
  while (num_zeroes < 6) {
    unsigned int num = input[counter];
    if (num == 0) {
      num_zeroes++;
    } else if (num_zeroes == 0) {
      //Field 0: target-phrase word codes.
      ret->target_phrase.push_back(num);
    } else if (num_zeroes == 1) {
      //Field 1: push exactly num_scores + num_lex_scores scores (float bits),
      //counted explicitly because a score's bit pattern could look like a delimiter.
      for (int i = 0; i < num_scores + num_lex_scores; i++) {
        float prob = reinterpret_uint(&num);
        ret->prob.push_back(prob);

        counter++;
        num = input[counter];
      }
      continue;
    } else if (num_zeroes == 2) {
      //Field 2: the word-alignment code.
      wAll = num;
    } else if (num_zeroes == 3) {
      //Field 3: raw bytes of the counts column.
      ret->counts.push_back(static_cast<char>(input[counter]));
    } else if (num_zeroes == 4) {
      //Field 4: raw bytes of the sparse-score column.
      ret->sparse_score.push_back(static_cast<char>(input[counter]));
    } else if (num_zeroes == 5) {
      //Field 5: raw bytes of the property column.
      ret->property.push_back(static_cast<char>(input[counter]));
    }

    counter++;
  }

  //Resolve the alignment code back to its byte sequence.
  ret->word_all1 = lookup_word_all1.find(wAll)->second;

  return ret;
}
|
||||
|
||||
//Returns the target word for a huffman code.
//NOTE(review): assumes id is present in the map — a missing id would
//dereference the end() iterator (undefined behaviour).
inline const std::string &HuffmanDecoder::getTargetWordFromID(unsigned int id)
{
  return lookup_target_phrase.find(id)->second;
}
|
||||
|
||||
std::string HuffmanDecoder::getTargetWordsFromIDs(const std::vector<unsigned int> &ids)
|
||||
{
|
||||
std::string returnstring;
|
||||
for (std::vector<unsigned int>::const_iterator it = ids.begin(); it != ids.end(); it++) {
|
||||
returnstring.append(getTargetWordFromID(*it) + " ");
|
||||
}
|
||||
|
||||
return returnstring;
|
||||
}
|
||||
|
||||
//Free-function variant: looks up the word for a huffman code in the given
//inverted map.  NOTE(review): assumes id is present; a missing id would
//dereference end().
inline const std::string &getTargetWordFromID(unsigned int id, const std::map<unsigned int, std::string> &lookup_target_phrase)
{
  std::map<unsigned int, std::string>::const_iterator entry = lookup_target_phrase.find(id);
  return entry->second;
}
|
||||
|
||||
//Free-function variant: concatenates the words for all ids, each followed
//by a single space (trailing space kept).
std::string getTargetWordsFromIDs(const std::vector<unsigned int> &ids, const std::map<unsigned int, std::string> &lookup_target_phrase)
{
  std::string returnstring;
  std::vector<unsigned int>::const_iterator id;
  for (id = ids.begin(); id != ids.end(); ++id) {
    //Single-id lookup inlined (was a call to the helper above).
    returnstring.append(lookup_target_phrase.find(*id)->second + " ");
  }

  return returnstring;
}
|
||||
|
||||
/*Those functions are used to more easily store the floats in the binary phrase table
We convert the float to unsigned int so that it is the same as our other values and we can
apply variable byte encoding on top of it.*/

//Returns the raw bit pattern of *num as an unsigned int.
//Uses memcpy instead of the original reinterpret_cast: casting float* to
//unsigned int* and dereferencing violates strict aliasing (undefined
//behaviour); memcpy is the well-defined way to type-pun and optimizing
//compilers emit the same code for it.
inline unsigned int reinterpret_float(float * num)
{
  unsigned int converted_num;
  memcpy(&converted_num, num, sizeof(converted_num));
  return converted_num;
}
|
||||
|
||||
//Inverse of reinterpret_float: reinterprets the bits of *num as a float.
//memcpy replaces the original pointer cast, which violated strict
//aliasing (undefined behaviour).
inline float reinterpret_uint(unsigned int * num)
{
  float converted_num;
  memcpy(&converted_num, num, sizeof(converted_num));
  return converted_num;
}
|
||||
|
||||
/*Mostly taken from stackoverflow, http://stackoverflow.com/questions/5858646/optimizing-variable-length-encoding
and modified in order to return a vector of chars. Implements ULEB128 or variable byte encoding.*/
//Emits 7-bit groups, least significant first; every byte except the last
//has the high (continuation) bit set.  Produces exactly the same bytes as
//the unrolled goto ladder it replaces.
inline std::vector<unsigned char> vbyte_encode(unsigned int num)
{
  std::vector<unsigned char> byte_vector;
  //At most 5 bytes are ever needed for a 32-bit value (ceil(32/7)).
  byte_vector.reserve(5);

  while (num >= 0x80U) {
    byte_vector.push_back(static_cast<unsigned char>((num & 0x7f) | 0x80));
    num >>= 7;
  }
  byte_vector.push_back(static_cast<unsigned char>(num));

  return byte_vector;
}
|
||||
|
||||
std::vector<unsigned int> *vbyte_decode_line(unsigned char line[], size_t linesSize, RecycleData &recycler)
|
||||
{
|
||||
std::vector<unsigned int> *huffman_line;
|
||||
if (recycler.huffman_line.empty()) {
|
||||
huffman_line = new std::vector<unsigned int>();
|
||||
}
|
||||
else {
|
||||
huffman_line = recycler.huffman_line.back();
|
||||
recycler.huffman_line.pop_back();
|
||||
huffman_line->clear();
|
||||
}
|
||||
|
||||
unsigned char current_num[linesSize];
|
||||
|
||||
size_t current_num_ind = 0;
|
||||
for (size_t i = 0; i < linesSize; ++i) {
|
||||
unsigned char c = line[i];
|
||||
current_num[current_num_ind++] = c;
|
||||
if ((c >> 7) != 1) {
|
||||
//We don't have continuation in the next bit
|
||||
huffman_line->push_back(bytes_to_int(current_num, current_num_ind));
|
||||
current_num_ind = 0;
|
||||
}
|
||||
}
|
||||
return huffman_line;
|
||||
}
|
||||
|
||||
//Reassembles an unsigned int from its variable-byte groups: 7 payload
//bits per byte, least significant group first, continuation bits masked.
inline unsigned int bytes_to_int(unsigned char number[], size_t numberSize)
{
  unsigned int retvalue = 0;

  for (size_t i = 0; i < numberSize; ++i) {
    retvalue |= static_cast<unsigned int>(number[i] & 0x7f) << (7 * i);
  }

  return retvalue;
}
|
||||
|
||||
std::vector<unsigned char> vbyte_encode_line(const std::vector<unsigned int> &line)
|
||||
{
|
||||
std::vector<unsigned char> retvec;
|
||||
|
||||
//For each unsigned int in the line, vbyte encode it and add it to a vector of unsigned chars.
|
||||
for (std::vector<unsigned int>::const_iterator it = line.begin(); it != line.end(); it++) {
|
||||
std::vector<unsigned char> vbyte_encoded = vbyte_encode(*it);
|
||||
retvec.insert(retvec.end(), vbyte_encoded.begin(), vbyte_encoded.end());
|
||||
}
|
||||
|
||||
return retvec;
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -1,150 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
//Huffman encodes a line and also produces the vocabulary ids
|
||||
#include "hash.hh"
|
||||
#include "line_splitter.hh"
|
||||
#include <cstdio>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <deque>
|
||||
#include <boost/serialization/serialization.hpp>
|
||||
#include <boost/serialization/vector.hpp>
|
||||
#include <boost/serialization/map.hpp>
|
||||
#include <boost/archive/text_iarchive.hpp>
|
||||
#include <boost/archive/text_oarchive.hpp>
|
||||
#include <boost/foreach.hpp>
|
||||
|
||||
namespace Moses2
|
||||
{
|
||||
////////////////////////////////////////////////////////////////
|
||||
//Pools of previously allocated decode objects, reused across decode
//calls to avoid repeated heap allocation.
class RecycleData
{
public:
  std::deque<target_text*> tt;                          //recycled target_text objects
  std::deque<std::vector<unsigned int>*> huffman_line;  //recycled scratch vectors

  //Frees every pooled object still owned by the pool at destruction.
  ~RecycleData()
  {
    BOOST_FOREACH (const target_text *obj, tt) {
      delete obj;
    }

    BOOST_FOREACH (const std::vector<unsigned int> *obj, huffman_line) {
      delete obj;
    }
  }
};
|
||||
////////////////////////////////////////////////////////////////
|
||||
|
||||
//Sorting for the second
//Orders (word, count) pairs by descending count.
struct sort_pair {
  bool operator()(const std::pair<std::string, unsigned int> &left, const std::pair<std::string, unsigned int> &right) {
    return right.second < left.second; //Biggest counts first.
  }
};
|
||||
|
||||
//Orders (alignment-bytes, count) pairs by descending count.
struct sort_pair_vec {
  bool operator()(const std::pair<std::vector<unsigned char>, unsigned int> &left, const std::pair<std::vector<unsigned char>, unsigned int> &right) {
    return right.second < left.second; //Biggest counts first.
  }
};
|
||||
|
||||
//Builds per-symbol integer codes for a phrase table: count symbol
//frequencies (constructor / count_elements), assign codes by descending
//frequency (assign_values), invert them for decoding (produce_lookups)
//and persist them (serialize_maps).  encode_line then maps one line to
//its code stream.
class Huffman
{
  unsigned long uniq_lines; //Unique lines in the file.

  //Containers used when counting the occurence of a given phrase
  std::map<std::string, unsigned int> target_phrase_words;
  std::map<std::vector<unsigned char>, unsigned int> word_all1;

  //Same containers as vectors, for sorting
  std::vector<std::pair<std::string, unsigned int> > target_phrase_words_counts;
  std::vector<std::pair<std::vector<unsigned char>, unsigned int> > word_all1_counts;

  //Huffman maps: symbol -> code (codes start at 1; 0 is the delimiter)
  std::map<std::string, unsigned int> target_phrase_huffman;
  std::map<std::vector<unsigned char>, unsigned int> word_all1_huffman;

  //inverted maps: code -> symbol, consumed by HuffmanDecoder
  std::map<unsigned int, std::string> lookup_target_phrase;
  std::map<unsigned int, std::vector<unsigned char> > lookup_word_all1;

public:
  //Reads the phrase table at the given path and counts symbol frequencies.
  Huffman (const char *);
  void count_elements (const line_text &line);
  void assign_values();
  void serialize_maps(const char * dirname);
  void produce_lookups();

  std::vector<unsigned int> encode_line(line_text &line, bool log_prob);

  //encode line + variable byte ontop
  std::vector<unsigned char> full_encode_line(line_text &line, bool log_prob);

  //Getters. NOTE(review): both return by value, i.e. a full map copy.
  const std::map<unsigned int, std::string> get_target_lookup_map() const {
    return lookup_target_phrase;
  }
  const std::map<unsigned int, std::vector<unsigned char> > get_word_all1_lookup_map() const {
    return lookup_word_all1;
  }

  unsigned long getUniqLines() {
    return uniq_lines;
  }

  //Appends "{{LexRO ...}}" probabilities from line.property to retvector.
  void AppendLexRO(line_text &line, std::vector<unsigned int> &retvector, bool log_prob) const;

};
|
||||
|
||||
//Decodes code streams produced by Huffman back into target_text objects,
//using the inverted code -> symbol lookup maps.
class HuffmanDecoder
{
  std::map<unsigned int, std::string> lookup_target_phrase;
  std::map<unsigned int, std::vector<unsigned char> > lookup_word_all1;

public:
  //Loads both lookup maps from Boost text archives in the given directory.
  HuffmanDecoder (const char *);
  //Copies already-loaded lookup maps.
  HuffmanDecoder (const std::map<unsigned int, std::string> &, const std::map<unsigned int, std::vector<unsigned char> > &);

  //Getters (by const reference, unlike the encoder's by-value getters)
  const std::map<unsigned int, std::string> &get_target_lookup_map() const {
    return lookup_target_phrase;
  }
  const std::map<unsigned int, std::vector<unsigned char> > &get_word_all1_lookup_map() const {
    return lookup_word_all1;
  }

  //Returns the word for one code; assumes the code is present in the map.
  inline const std::string &getTargetWordFromID(unsigned int id);

  std::string getTargetWordsFromIDs(const std::vector<unsigned int> &ids);

  //Decodes one zero-delimited entry into a (possibly recycled) target_text.
  target_text *decode_line (const std::vector<unsigned int> &input,
                            int num_scores,
                            int num_lex_scores,
                            RecycleData &recycler);

  //Variable byte decodes a all target phrases contained here and then passes them to decode_line
  std::vector<target_text*> full_decode_line (unsigned char lines[],
      size_t linesCount,
      int num_scores,
      int num_lex_scores,
      RecycleData &recycler);
};
|
||||
|
||||
std::string getTargetWordsFromIDs(const std::vector<unsigned int> &ids, const std::map<unsigned int, std::string> &lookup_target_phrase);
|
||||
|
||||
inline const std::string &getTargetWordFromID(unsigned int id, const std::map<unsigned int, std::string> &lookup_target_phrase);
|
||||
|
||||
inline unsigned int reinterpret_float(float * num);
|
||||
|
||||
inline float reinterpret_uint(unsigned int * num);
|
||||
|
||||
std::vector<unsigned char> vbyte_encode_line(const std::vector<unsigned int> &line);
|
||||
inline std::vector<unsigned char> vbyte_encode(unsigned int num);
|
||||
std::vector<unsigned int> *vbyte_decode_line(unsigned char line[], size_t linesSize, RecycleData &recycler);
|
||||
inline unsigned int bytes_to_int(unsigned char number[], size_t numberSize);
|
||||
|
||||
}
|
||||
|
||||
|
@ -10,13 +10,12 @@
|
||||
namespace Moses2
|
||||
{
|
||||
|
||||
#define API_VERSION 8
|
||||
#define API_VERSION 9
|
||||
|
||||
//Hash table entry
|
||||
struct Entry {
|
||||
uint64_t key;
|
||||
typedef uint64_t Key;
|
||||
unsigned int bytes_toread;
|
||||
|
||||
uint64_t GetKey() const {
|
||||
return key;
|
||||
@ -26,12 +25,7 @@ struct Entry {
|
||||
key = to;
|
||||
}
|
||||
|
||||
uint64_t GetValue() const {
|
||||
return value;
|
||||
}
|
||||
|
||||
uint64_t value;
|
||||
uint64_t targetInd;
|
||||
};
|
||||
|
||||
//Define table
|
||||
|
@ -94,7 +94,9 @@ std::pair<bool, uint64_t> QueryEngine::query(uint64_t key)
|
||||
|
||||
const Entry * entry;
|
||||
ret.first = table.Find(key, entry);
|
||||
ret.second = entry->targetInd;
|
||||
if (ret.first) {
|
||||
ret.second = entry->value;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
@ -42,7 +42,6 @@ void createProbingPT(
|
||||
float totalSourceCount = 0;
|
||||
|
||||
//Keep track of the size of each group of target phrases
|
||||
uint64_t entrystartidx = 0;
|
||||
size_t line_num = 0;
|
||||
|
||||
//Read everything and processs
|
||||
@ -83,8 +82,7 @@ void createProbingPT(
|
||||
|
||||
//Create an entry for the previous source phrase:
|
||||
Entry pesho;
|
||||
pesho.value = entrystartidx;
|
||||
pesho.targetInd = targetInd;
|
||||
pesho.value = targetInd;
|
||||
//The key is the sum of hashes of individual words bitshifted by their position in the phrase.
|
||||
//Probably not entirerly correct, but fast and seems to work fine in practise.
|
||||
pesho.key = 0;
|
||||
@ -128,8 +126,7 @@ void createProbingPT(
|
||||
uint64_t targetInd = storeTarget.Save();
|
||||
|
||||
Entry pesho;
|
||||
pesho.value = entrystartidx;
|
||||
pesho.targetInd = targetInd;
|
||||
pesho.value = targetInd;
|
||||
|
||||
//The key is the sum of hashes of individual words. Probably not entirerly correct, but fast
|
||||
pesho.key = 0;
|
||||
|
Loading…
Reference in New Issue
Block a user