#include "storing.hh"

BinaryFileWriter::BinaryFileWriter (std::string basepath) : os ((basepath + "/binfile.dat").c_str(), std::ios::binary) {
  binfile.reserve(10000); //Reserve part of the vector to avoid reallocation
  it = binfile.begin();
  dist_from_start = 0; //Initialize variables
  extra_counter = 0;
}
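
//Note: dist_from_start counts the bytes currently buffered in memory and
//extra_counter the bytes already flushed to disk, so dist_from_start +
//extra_counter is the total number of bytes written so far. createProbingPT
//relies on this sum to record where each source phrase's byte range starts.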
void BinaryFileWriter::write (std::vector<unsigned char> * bytes) {
  //Append at the end of the buffer; inserting at end() stays valid even if
  //the vector reallocates, unlike inserting through a cached iterator.
  binfile.insert(binfile.end(), bytes->begin(), bytes->end()); //Insert the bytes
  //Keep track of the offsets
  it = binfile.end();
  dist_from_start = std::distance(binfile.begin(),it);
  //Flush the vector to disk every once in a while so that we don't consume too much RAM
  if (dist_from_start > 9000) {
    flush();
  }
}

void BinaryFileWriter::flush () {
  //Cast unsigned char to char before writing...
  os.write((char *)&binfile[0], dist_from_start);
  //Clear the vector:
  binfile.clear();
  binfile.reserve(10000);
  extra_counter += dist_from_start; //Keep track of the total number of bytes.
  it = binfile.begin(); //Reset iterator
  dist_from_start = std::distance(binfile.begin(),it); //Reset dist from start
}

BinaryFileWriter::~BinaryFileWriter (){
  os.close();
  binfile.clear();
}
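
//Build a probing phrase table on disk. Reads the plain-text phrase table at
//phrasetable_path and writes into target_path: the Huffman decoder maps,
//binfile.dat (the Huffman-encoded target phrase groups), probing_hash.dat
//(the probing hash table mapping each source phrase key to its byte range
//in binfile.dat), source_vocabids (the source-word vocabulary) and config
//(the number of unique entries).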
void createProbingPT(const char * phrasetable_path, const char * target_path){
  //Get basepath and create directory if missing
  std::string basepath(target_path);
  mkdir(basepath.c_str(), S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH);

  //Set up huffman encoding and serialize the decoder maps.
  Huffman huffmanEncoder(phrasetable_path); //initialize
  huffmanEncoder.assign_values();
  huffmanEncoder.produce_lookups();
  huffmanEncoder.serialize_maps(target_path);

  //Get the number of unique lines:
  unsigned long uniq_entries = huffmanEncoder.getUniqLines();

  //Source phrase vocabids
  std::map<uint64_t, std::string> source_vocabids;

  //Read the file
  util::FilePiece filein(phrasetable_path);

  //Init the probing hash table
  size_t size = Table::Size(uniq_entries, 1.2);
  char * mem = new char[size];
  memset(mem, 0, size);
  Table table(mem, size);

  BinaryFileWriter binfile(basepath); //Init the binary file writer.

  line_text prev_line; //Used to check if the source phrase of the previous line is the same

  //Keep track of the size of each group of target phrases
  uint64_t entrystartidx = 0;
  //uint64_t line_num = 0;

  //Read everything and process
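  //The loop below assumes that lines sharing a source phrase are adjacent in
  //the input (as in a sorted phrase table): each such group is encoded into
  //one contiguous byte range of binfile.dat, and a single Entry recording the
  //range's start offset and length is inserted into the hash table.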
  while(true){
    try {
      //Process line read
      line_text line = splitLine(filein.ReadLine());
      //Add source phrases to vocabularyIDs
      add_to_map(&source_vocabids, line.source_phrase);

      if ((binfile.dist_from_start + binfile.extra_counter) == 0) {
        prev_line = line; //For the first iteration assume the previous line is
      } //the same as this one.

      if (line.source_phrase != prev_line.source_phrase){

        //New source phrase: create an entry for the previous source phrase:
        Entry pesho;
        pesho.value = entrystartidx;
        //The key is the sum of hashes of individual words. Probably not entirely correct, but fast
        pesho.key = 0;
        std::vector<uint64_t> vocabid_source = getVocabIDs(prev_line.source_phrase);
        for (size_t i = 0; i < vocabid_source.size(); i++){
          pesho.key += vocabid_source[i];
        }
        pesho.bytes_toread = binfile.dist_from_start + binfile.extra_counter - entrystartidx;

        //Put into table
        table.Insert(pesho);

        entrystartidx = binfile.dist_from_start + binfile.extra_counter; //Designate start idx for new entry

        //Encode a line and write it to disk.
        std::vector<unsigned char> encoded_line = huffmanEncoder.full_encode_line(line);
        binfile.write(&encoded_line);

        //Set prev_line
        prev_line = line;

      } else {
        //If the source phrase is still the same, just append to the current group:
        std::vector<unsigned char> encoded_line = huffmanEncoder.full_encode_line(line);
        binfile.write(&encoded_line);
      }

    } catch (const util::EndOfFileException &e){
      std::cerr << "Reading phrase table finished, writing remaining files to disk." << std::endl;
      binfile.flush();

      //After the final entry is constructed we need to add it to the phrase table.
      //Create an entry for the last source phrase:
      Entry pesho;
      pesho.value = entrystartidx;
      //The key is the sum of hashes of individual words. Probably not entirely correct, but fast
      pesho.key = 0;
      std::vector<uint64_t> vocabid_source = getVocabIDs(prev_line.source_phrase);
      for (size_t i = 0; i < vocabid_source.size(); i++){
        pesho.key += vocabid_source[i];
      }
      pesho.bytes_toread = binfile.dist_from_start + binfile.extra_counter - entrystartidx;
      //Put into table
      table.Insert(pesho);

      break;
    }
  }

  serialize_table(mem, size, (basepath + "/probing_hash.dat").c_str());

  serialize_map(&source_vocabids, (basepath + "/source_vocabids").c_str());

  delete[] mem;

  //Write configfile
  std::ofstream configfile;
  configfile.open((basepath + "/config").c_str());
  configfile << uniq_entries << '\n';
  configfile.close();
}
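
/* Example driver (a minimal sketch, not part of the original file): link this
   translation unit with the rest of the probing PT sources and invoke
   createProbingPT with a plain-text phrase table and an output directory.

#include "storing.hh"

int main(int argc, char * argv[]) {
  if (argc != 3) {
    std::cerr << "Usage: " << argv[0] << " <phrase_table> <output_dir>" << std::endl;
    return 1;
  }
  createProbingPT(argv[1], argv[2]); //Writes the probing PT files into argv[2]
  return 0;
}
*/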