#include "huffmanish.hh" Huffman::Huffman (const char * filepath) { //Read the file util::FilePiece filein(filepath); //Init uniq_lines to zero; uniq_lines = 0; line_text prev_line; //Check for unique lines. int num_lines = 0 ; while (true) { line_text new_line; num_lines++; try { //Process line read new_line = splitLine(filein.ReadLine()); count_elements(new_line); //Counts the number of elements, adds new and increments counters. } catch (util::EndOfFileException e) { std::cerr << "Unique entries counted: "; break; } if (new_line.source_phrase == prev_line.source_phrase) { continue; } else { uniq_lines++; prev_line = new_line; } } std::cerr << uniq_lines << std::endl; } void Huffman::count_elements(line_text linein) { //For target phrase: util::TokenIter it(linein.target_phrase, util::SingleCharacter(' ')); while (it) { //Check if we have that entry std::map::iterator mapiter; mapiter = target_phrase_words.find(it->as_string()); if (mapiter != target_phrase_words.end()) { //If the element is found, increment the count. mapiter->second++; } else { //Else create a new entry; target_phrase_words.insert(std::pair(it->as_string(), 1)); } it++; } //For word allignment 1 std::map, unsigned int>::iterator mapiter3; std::vector numbers = splitWordAll1(linein.word_all1); mapiter3 = word_all1.find(numbers); if (mapiter3 != word_all1.end()) { //If the element is found, increment the count. mapiter3->second++; } else { //Else create a new entry; word_all1.insert(std::pair, unsigned int>(numbers, 1)); } } //Assigns huffman values for each unique element void Huffman::assign_values() { //First create vectors for all maps so that we could sort them later. //Create a vector for target phrases for(std::map::iterator it = target_phrase_words.begin(); it != target_phrase_words.end(); it++ ) { target_phrase_words_counts.push_back(*it); } //Sort it std::sort(target_phrase_words_counts.begin(), target_phrase_words_counts.end(), sort_pair()); //Create a vector for word allignments 1 for(std::map, unsigned int>::iterator it = word_all1.begin(); it != word_all1.end(); it++ ) { word_all1_counts.push_back(*it); } //Sort it std::sort(word_all1_counts.begin(), word_all1_counts.end(), sort_pair_vec()); //Afterwards we assign a value for each phrase, starting from 1, as zero is reserved for delimiter unsigned int i = 1; //huffman code for(std::vector >::iterator it = target_phrase_words_counts.begin(); it != target_phrase_words_counts.end(); it++) { target_phrase_huffman.insert(std::pair(it->first, i)); i++; //Go to the next huffman code } i = 1; //Reset i for the next map for(std::vector, unsigned int> >::iterator it = word_all1_counts.begin(); it != word_all1_counts.end(); it++) { word_all1_huffman.insert(std::pair, unsigned int>(it->first, i)); i++; //Go to the next huffman code } //After lookups are produced, clear some memory usage of objects not needed anymore. target_phrase_words.clear(); word_all1.clear(); target_phrase_words_counts.clear(); word_all1_counts.clear(); std::cerr << "Finished generating huffman codes." << std::endl; } void Huffman::serialize_maps(const char * dirname) { //Note that directory name should exist. std::string basedir(dirname); std::string target_phrase_path(basedir + "/target_phrases"); std::string probabilities_path(basedir + "/probs"); std::string word_all1_path(basedir + "/Wall1"); //Target phrase std::ofstream os (target_phrase_path.c_str(), std::ios::binary); boost::archive::text_oarchive oarch(os); oarch << lookup_target_phrase; os.close(); //Word all1 std::ofstream os2 (word_all1_path.c_str(), std::ios::binary); boost::archive::text_oarchive oarch2(os2); oarch2 << lookup_word_all1; os2.close(); } std::vector Huffman::full_encode_line(line_text line) { return vbyte_encode_line((encode_line(line))); } std::vector Huffman::encode_line(line_text line) { std::vector retvector; //Get target_phrase first. util::TokenIter it(line.target_phrase, util::SingleCharacter(' ')); while (it) { retvector.push_back(target_phrase_huffman.find(it->as_string())->second); it++; } //Add a zero; retvector.push_back(0); //Get probabilities. Reinterpreting the float bytes as unsgined int. util::TokenIter probit(line.prob, util::SingleCharacter(' ')); while (probit) { //Sometimes we have too big floats to handle, so first convert to double double tempnum = atof(probit->data()); float num = (float)tempnum; retvector.push_back(reinterpret_float(&num)); probit++; } //Add a zero; retvector.push_back(0); //Get Word allignments retvector.push_back(word_all1_huffman.find(splitWordAll1(line.word_all1))->second); retvector.push_back(0); return retvector; } void Huffman::produce_lookups() { //basically invert every map that we have for(std::map::iterator it = target_phrase_huffman.begin(); it != target_phrase_huffman.end(); it++ ) { lookup_target_phrase.insert(std::pair(it->second, it->first)); } for(std::map, unsigned int>::iterator it = word_all1_huffman.begin(); it != word_all1_huffman.end(); it++ ) { lookup_word_all1.insert(std::pair >(it->second, it->first)); } } HuffmanDecoder::HuffmanDecoder (const char * dirname) { //Read the maps from disk //Note that directory name should exist. std::string basedir(dirname); std::string target_phrase_path(basedir + "/target_phrases"); std::string word_all1_path(basedir + "/Wall1"); //Target phrases std::ifstream is (target_phrase_path.c_str(), std::ios::binary); boost::archive::text_iarchive iarch(is); iarch >> lookup_target_phrase; is.close(); //Word allignment 1 std::ifstream is2 (word_all1_path.c_str(), std::ios::binary); boost::archive::text_iarchive iarch2(is2); iarch2 >> lookup_word_all1; is2.close(); } HuffmanDecoder::HuffmanDecoder (std::map * lookup_target, std::map > * lookup_word1) { lookup_target_phrase = *lookup_target; lookup_word_all1 = *lookup_word1; } std::vector HuffmanDecoder::full_decode_line (std::vector lines, int num_scores) { std::vector retvector; //All target phrases std::vector decoded_lines = vbyte_decode_line(lines); //All decoded lines std::vector::iterator it = decoded_lines.begin(); //Iterator for them std::vector current_target_phrase; //Current target phrase decoded short zero_count = 0; //Count home many zeroes we have met. so far. Every 3 zeroes mean a new target phrase. while(it != decoded_lines.end()) { if (zero_count == 1) { //We are extracting scores. we know how many scores there are so we can push them //to the vector. This is done in case any of the scores is 0, because it would mess //up the state machine. for (int i = 0; i < num_scores; i++) { current_target_phrase.push_back(*it); it++; } } if (zero_count == 3) { //We have finished with this entry, decode it, and add it to the retvector. retvector.push_back(decode_line(current_target_phrase, num_scores)); current_target_phrase.clear(); //Clear the current target phrase and the zero_count zero_count = 0; //So that we can reuse them for the next target phrase } //Add to the next target_phrase, number by number. current_target_phrase.push_back(*it); if (*it == 0) { zero_count++; } it++; //Go to the next word/symbol } //Don't forget the last remaining line! if (zero_count == 3) { //We have finished with this entry, decode it, and add it to the retvector. retvector.push_back(decode_line(current_target_phrase, num_scores)); current_target_phrase.clear(); //Clear the current target phrase and the zero_count zero_count = 0; //So that we can reuse them for the next target phrase } return retvector; } target_text HuffmanDecoder::decode_line (std::vector input, int num_scores) { //demo decoder target_text ret; //Split everything std::vector target_phrase; std::vector probs; unsigned int wAll; //Split the line into the proper arrays short num_zeroes = 0; int counter = 0; while (num_zeroes < 3) { unsigned int num = input[counter]; if (num == 0) { num_zeroes++; } else if (num_zeroes == 0) { target_phrase.push_back(num); } else if (num_zeroes == 1) { //Push exactly num_scores scores for (int i = 0; i < num_scores; i++) { probs.push_back(num); counter++; num = input[counter]; } continue; } else if (num_zeroes == 2) { wAll = num; } counter++; } ret.target_phrase = target_phrase; ret.word_all1 = lookup_word_all1.find(wAll)->second; //Decode probabilities for (std::vector::iterator it = probs.begin(); it != probs.end(); it++) { ret.prob.push_back(reinterpret_uint(&(*it))); } return ret; } inline std::string HuffmanDecoder::getTargetWordFromID(unsigned int id) { return lookup_target_phrase.find(id)->second; } std::string HuffmanDecoder::getTargetWordsFromIDs(std::vector ids) { std::string returnstring; for (std::vector::iterator it = ids.begin(); it != ids.end(); it++) { returnstring.append(getTargetWordFromID(*it) + " "); } return returnstring; } inline std::string getTargetWordFromID(unsigned int id, std::map * lookup_target_phrase) { return lookup_target_phrase->find(id)->second; } std::string getTargetWordsFromIDs(std::vector ids, std::map * lookup_target_phrase) { std::string returnstring; for (std::vector::iterator it = ids.begin(); it != ids.end(); it++) { returnstring.append(getTargetWordFromID(*it, lookup_target_phrase) + " "); } return returnstring; } /*Those functions are used to more easily store the floats in the binary phrase table We convert the float unsinged int so that it is the same as our other values and we can apply variable byte encoding on top of it.*/ inline unsigned int reinterpret_float(float * num) { unsigned int * converted_num; converted_num = reinterpret_cast(num); return *converted_num; } inline float reinterpret_uint(unsigned int * num) { float * converted_num; converted_num = reinterpret_cast(num); return *converted_num; } /*Mostly taken from stackoverflow, http://stackoverflow.com/questions/5858646/optimizing-variable-length-encoding and modified in order to return a vector of chars. Implements ULEB128 or variable byte encoding. This is highly optimized version with unrolled loop */ inline std::vector vbyte_encode(unsigned int num) { //Determine how many bytes we are going to take. short size; std::vector byte_vector; if (num < 0x00000080U) { size = 1; byte_vector.reserve(size); goto b1; } if (num < 0x00004000U) { size = 2; byte_vector.reserve(size); goto b2; } if (num < 0x00200000U) { size = 3; byte_vector.reserve(size); goto b3; } if (num < 0x10000000U) { size = 4; byte_vector.reserve(size); goto b4; } size = 5; byte_vector.reserve(size); //Now proceed with the encoding. byte_vector.push_back((num & 0x7f) | 0x80); num >>= 7; b4: byte_vector.push_back((num & 0x7f) | 0x80); num >>= 7; b3: byte_vector.push_back((num & 0x7f) | 0x80); num >>= 7; b2: byte_vector.push_back((num & 0x7f) | 0x80); num >>= 7; b1: byte_vector.push_back(num); return byte_vector; } std::vector vbyte_decode_line(std::vector line) { std::vector huffman_line; std::vector current_num; for (std::vector::iterator it = line.begin(); it != line.end(); it++) { current_num.push_back(*it); if ((*it >> 7) != 1) { //We don't have continuation in the next bit huffman_line.push_back(bytes_to_int(current_num)); current_num.clear(); } } return huffman_line; } inline unsigned int bytes_to_int(std::vector number) { unsigned int retvalue = 0; std::vector::iterator it = number.begin(); unsigned char shift = 0; //By how many bits to shift while (it != number.end()) { retvalue |= (*it & 0x7f) << shift; shift += 7; it++; } return retvalue; } std::vector vbyte_encode_line(std::vector line) { std::vector retvec; //For each unsigned int in the line, vbyte encode it and add it to a vector of unsigned chars. for (std::vector::iterator it = line.begin(); it != line.end(); it++) { std::vector vbyte_encoded = vbyte_encode(*it); retvec.insert(retvec.end(), vbyte_encoded.begin(), vbyte_encoded.end()); } return retvec; }