This commit is contained in:
Hieu Hoang 2016-01-07 12:06:49 +00:00
parent 859705431a
commit 5d1bc4b406
12 changed files with 48 additions and 22 deletions

View File

@ -162,8 +162,11 @@ TargetPhrase *ProbingPT::CreateTargetPhrase(MemPool &pool, const System &system,
// score for this phrase table
SCORE scores[probingTargetPhrase.prob.size()];
std::copy(probingTargetPhrase.prob.begin(), probingTargetPhrase.prob.end(), scores);
if (!m_engine->IsLogProb()) {
std::transform(scores, scores + probingTargetPhrase.prob.size(), scores, TransformScore);
std::transform(scores, scores + probingTargetPhrase.prob.size(), scores, FloorScore);
}
tp->GetScores().PlusEquals(system, *this, scores);
// extra scores

View File

@ -318,7 +318,7 @@ target_text *HuffmanDecoder::decode_line (const std::vector<unsigned int> &input
ret->prob.reserve(num_scores);
//Split everything
unsigned int wAll;
unsigned int wAll = 1;
//Split the line into the proper arrays
short num_zeroes = 0;

View File

@ -10,6 +10,8 @@
namespace Moses2
{
#define API_VERSION 6
//Hash table entry
struct Entry {
uint64_t key;

View File

@ -60,8 +60,8 @@ QueryEngine::QueryEngine(const char * filepath) : decoder(filepath)
getline(config, line);
num_lex_scores = atoi(line.c_str());
// have the scores been log() and FloorScore()?
//getline(config, line);
//logProb = atoi(line.c_str());
getline(config, line);
logProb = atoi(line.c_str());
config.close();

View File

@ -9,8 +9,6 @@
#include "hash.hh" //Includes line splitter
#include "../../Vector.h"
#define API_VERSION 5
namespace Moses2
{
@ -50,6 +48,8 @@ public:
return source_vocabids;
}
//! Accessor for the logProb flag.
//! NOTE(review): presumably this mirrors the log-prob field read from the
//! table's config file — confirm against the constructor.
bool IsLogProb() const
{
return logProb;
}
};
}

View File

@ -10,6 +10,7 @@ int main(int argc, char* argv[])
string inPath, outPath;
int num_scores = 4;
int num_lex_scores = 0;
bool log_prob = false;
namespace po = boost::program_options;
po::options_description desc("Options");
@ -19,6 +20,7 @@ int main(int argc, char* argv[])
("output-dir", po::value<string>()->required(), "Directory when binary files will be written")
("num-scores", po::value<int>()->default_value(num_scores), "Number of pt scores")
("num-lex-scores", po::value<int>()->default_value(num_lex_scores), "Number of lexicalized reordering scores")
("log-prob", "log (and floor) probabilities before storing")
;
@ -46,9 +48,10 @@ int main(int argc, char* argv[])
if (vm.count("output-dir")) outPath = vm["output-dir"].as<string>();
if (vm.count("num-scores")) num_scores = vm["num-scores"].as<int>();
if (vm.count("num-lex-scores")) num_lex_scores = vm["num-lex-scores"].as<int>();
if (vm.count("log-prob")) log_prob = true;
createProbingPT(inPath.c_str(), outPath.c_str(), num_scores, num_lex_scores);
createProbingPT(inPath.c_str(), outPath.c_str(), num_scores, num_lex_scores, log_prob);
util::PrintUsage(std::cout);
return 0;

View File

@ -141,12 +141,19 @@ void Huffman::serialize_maps(const char * dirname)
os2.close();
}
std::vector<unsigned char> Huffman::full_encode_line(line_text &line)
std::vector<unsigned char> Huffman::full_encode_line(line_text &line, bool log_prob)
{
return vbyte_encode_line((encode_line(line)));
return vbyte_encode_line((encode_line(line, log_prob)));
}
std::vector<unsigned int> Huffman::encode_line(line_text &line)
//! Clamp a log-domain score from below: anything under -100 becomes -100.
//! Values at or above the floor (and NaN) are returned unchanged.
inline float FloorScore(float logScore)
{
// Lowest score that is allowed through; lower inputs are replaced by it.
const float kFloor = -100.0f;
if (logScore < kFloor) {
return kFloor;
}
return logScore;
}
std::vector<unsigned int> Huffman::encode_line(line_text &line, bool log_prob)
{
std::vector<unsigned int> retvector;
@ -165,12 +172,17 @@ std::vector<unsigned int> Huffman::encode_line(line_text &line)
//Sometimes we have too big floats to handle, so first convert to double
double tempnum = atof(probit->data());
float num = (float)tempnum;
if (log_prob) {
num = FloorScore(log(num));
if (num == 0.0f) num = 0.0000000001;
}
//cerr << "num=" << num << endl;
retvector.push_back(reinterpret_float(&num));
probit++;
}
// append LexRO prob to pt scores
AppendLexRO(line, retvector);
AppendLexRO(line, retvector, log_prob);
//Add a zero;
retvector.push_back(0);
@ -210,7 +222,7 @@ std::vector<unsigned int> Huffman::encode_line(line_text &line)
return retvector;
}
void Huffman::AppendLexRO(line_text &line, std::vector<unsigned int> &retvector)
void Huffman::AppendLexRO(line_text &line, std::vector<unsigned int> &retvector, bool log_prob)
{
const StringPiece &origProperty = line.property_orig;
StringPiece::size_type startPos = origProperty.find("{{LexRO ");
@ -228,6 +240,11 @@ void Huffman::AppendLexRO(line_text &line, std::vector<unsigned int> &retvector)
double tempnum = atof(probStr.data());
float num = (float)tempnum;
if (log_prob) {
num = FloorScore(log(num));
if (num == 0.0f) num = 0.0000000001;
}
retvector.push_back(reinterpret_float(&num));
// exclude LexRO property from property column

View File

@ -53,10 +53,10 @@ public:
void serialize_maps(const char * dirname);
void produce_lookups();
std::vector<unsigned int> encode_line(line_text &line);
std::vector<unsigned int> encode_line(line_text &line, bool log_prob);
//encode line + variable byte on top
std::vector<unsigned char> full_encode_line(line_text &line);
std::vector<unsigned char> full_encode_line(line_text &line, bool log_prob);
//Getters
const std::map<unsigned int, std::string> get_target_lookup_map() const {
@ -70,7 +70,7 @@ public:
return uniq_lines;
}
void AppendLexRO(line_text &line, std::vector<unsigned int> &retvector);
void AppendLexRO(line_text &line, std::vector<unsigned int> &retvector, bool log_prob);
};

View File

@ -7,6 +7,8 @@
#include <fcntl.h>
#include <fstream>
#define API_VERSION 6
//Hash table entry
struct Entry {

View File

@ -6,7 +6,6 @@
#include <sys/stat.h> //For finding size of file
#include "vocabid.hh"
#include <algorithm> //toLower
#define API_VERSION 5
char * read_binary_file(char * filename);

View File

@ -39,7 +39,7 @@ BinaryFileWriter::~BinaryFileWriter ()
}
void createProbingPT(const char * phrasetable_path, const char * target_path,
int num_scores, int num_lex_scores)
int num_scores, int num_lex_scores, bool log_prob)
{
//Get basepath and create directory if missing
std::string basepath(target_path);
@ -110,7 +110,7 @@ void createProbingPT(const char * phrasetable_path, const char * target_path,
entrystartidx = binfile.dist_from_start + binfile.extra_counter; //Designate start idx for new entry
//Encode a line and write it to disk.
std::vector<unsigned char> encoded_line = huffmanEncoder.full_encode_line(line);
std::vector<unsigned char> encoded_line = huffmanEncoder.full_encode_line(line, log_prob);
binfile.write(&encoded_line);
//Set prevLine
@ -118,7 +118,7 @@ void createProbingPT(const char * phrasetable_path, const char * target_path,
} else {
//If we still have the same line, just append to it:
std::vector<unsigned char> encoded_line = huffmanEncoder.full_encode_line(line);
std::vector<unsigned char> encoded_line = huffmanEncoder.full_encode_line(line, log_prob);
binfile.write(&encoded_line);
}
@ -157,5 +157,6 @@ void createProbingPT(const char * phrasetable_path, const char * target_path,
configfile << uniq_entries << '\n';
configfile << num_scores << '\n';
configfile << num_lex_scores << '\n';
configfile << log_prob << '\n';
configfile.close();
}

View File

@ -12,10 +12,9 @@
#include "util/file_piece.hh"
#include "util/file.hh"
#include "vocabid.hh"
#define API_VERSION 5
void createProbingPT(const char * phrasetable_path, const char * target_path,
int num_scores, int num_lex_scores);
int num_scores, int num_lex_scores, bool log_prob);
class BinaryFileWriter
{