mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-28 14:32:38 +03:00
log prob
This commit is contained in:
parent
859705431a
commit
5d1bc4b406
@ -162,8 +162,11 @@ TargetPhrase *ProbingPT::CreateTargetPhrase(MemPool &pool, const System &system,
|
|||||||
// score for this phrase table
|
// score for this phrase table
|
||||||
SCORE scores[probingTargetPhrase.prob.size()];
|
SCORE scores[probingTargetPhrase.prob.size()];
|
||||||
std::copy(probingTargetPhrase.prob.begin(), probingTargetPhrase.prob.end(), scores);
|
std::copy(probingTargetPhrase.prob.begin(), probingTargetPhrase.prob.end(), scores);
|
||||||
|
|
||||||
|
if (!m_engine->IsLogProb()) {
|
||||||
std::transform(scores, scores + probingTargetPhrase.prob.size(), scores, TransformScore);
|
std::transform(scores, scores + probingTargetPhrase.prob.size(), scores, TransformScore);
|
||||||
std::transform(scores, scores + probingTargetPhrase.prob.size(), scores, FloorScore);
|
std::transform(scores, scores + probingTargetPhrase.prob.size(), scores, FloorScore);
|
||||||
|
}
|
||||||
tp->GetScores().PlusEquals(system, *this, scores);
|
tp->GetScores().PlusEquals(system, *this, scores);
|
||||||
|
|
||||||
// extra scores
|
// extra scores
|
||||||
|
@ -318,7 +318,7 @@ target_text *HuffmanDecoder::decode_line (const std::vector<unsigned int> &input
|
|||||||
|
|
||||||
ret->prob.reserve(num_scores);
|
ret->prob.reserve(num_scores);
|
||||||
//Split everything
|
//Split everything
|
||||||
unsigned int wAll;
|
unsigned int wAll = 1;
|
||||||
|
|
||||||
//Split the line into the proper arrays
|
//Split the line into the proper arrays
|
||||||
short num_zeroes = 0;
|
short num_zeroes = 0;
|
||||||
|
@ -10,6 +10,8 @@
|
|||||||
namespace Moses2
|
namespace Moses2
|
||||||
{
|
{
|
||||||
|
|
||||||
|
#define API_VERSION 6
|
||||||
|
|
||||||
//Hash table entry
|
//Hash table entry
|
||||||
struct Entry {
|
struct Entry {
|
||||||
uint64_t key;
|
uint64_t key;
|
||||||
|
@ -60,8 +60,8 @@ QueryEngine::QueryEngine(const char * filepath) : decoder(filepath)
|
|||||||
getline(config, line);
|
getline(config, line);
|
||||||
num_lex_scores = atoi(line.c_str());
|
num_lex_scores = atoi(line.c_str());
|
||||||
// have the scores been log() and FloorScore()?
|
// have the scores been log() and FloorScore()?
|
||||||
//getline(config, line);
|
getline(config, line);
|
||||||
//logProb = atoi(line.c_str());
|
logProb = atoi(line.c_str());
|
||||||
|
|
||||||
config.close();
|
config.close();
|
||||||
|
|
||||||
|
@ -9,8 +9,6 @@
|
|||||||
#include "hash.hh" //Includes line splitter
|
#include "hash.hh" //Includes line splitter
|
||||||
#include "../../Vector.h"
|
#include "../../Vector.h"
|
||||||
|
|
||||||
#define API_VERSION 5
|
|
||||||
|
|
||||||
namespace Moses2
|
namespace Moses2
|
||||||
{
|
{
|
||||||
|
|
||||||
@ -50,6 +48,8 @@ public:
|
|||||||
return source_vocabids;
|
return source_vocabids;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool IsLogProb() const // accessor: returns the logProb member unchanged (presumably the flag parsed from the table's config file - confirm against the constructor)
|
||||||
|
{ return logProb; }
|
||||||
};
|
};
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -10,6 +10,7 @@ int main(int argc, char* argv[])
|
|||||||
string inPath, outPath;
|
string inPath, outPath;
|
||||||
int num_scores = 4;
|
int num_scores = 4;
|
||||||
int num_lex_scores = 0;
|
int num_lex_scores = 0;
|
||||||
|
bool log_prob = false;
|
||||||
|
|
||||||
namespace po = boost::program_options;
|
namespace po = boost::program_options;
|
||||||
po::options_description desc("Options");
|
po::options_description desc("Options");
|
||||||
@ -19,6 +20,7 @@ int main(int argc, char* argv[])
|
|||||||
("output-dir", po::value<string>()->required(), "Directory when binary files will be written")
|
("output-dir", po::value<string>()->required(), "Directory when binary files will be written")
|
||||||
("num-scores", po::value<int>()->default_value(num_scores), "Number of pt scores")
|
("num-scores", po::value<int>()->default_value(num_scores), "Number of pt scores")
|
||||||
("num-lex-scores", po::value<int>()->default_value(num_lex_scores), "Number of lexicalized reordering scores")
|
("num-lex-scores", po::value<int>()->default_value(num_lex_scores), "Number of lexicalized reordering scores")
|
||||||
|
("log-prob", "log (and floor) probabilities before storing")
|
||||||
|
|
||||||
;
|
;
|
||||||
|
|
||||||
@ -46,9 +48,10 @@ int main(int argc, char* argv[])
|
|||||||
if (vm.count("output-dir")) outPath = vm["output-dir"].as<string>();
|
if (vm.count("output-dir")) outPath = vm["output-dir"].as<string>();
|
||||||
if (vm.count("num-scores")) num_scores = vm["num-scores"].as<int>();
|
if (vm.count("num-scores")) num_scores = vm["num-scores"].as<int>();
|
||||||
if (vm.count("num-lex-scores")) num_lex_scores = vm["num-lex-scores"].as<int>();
|
if (vm.count("num-lex-scores")) num_lex_scores = vm["num-lex-scores"].as<int>();
|
||||||
|
if (vm.count("log-prob")) log_prob = true;
|
||||||
|
|
||||||
|
|
||||||
createProbingPT(inPath.c_str(), outPath.c_str(), num_scores, num_lex_scores);
|
createProbingPT(inPath.c_str(), outPath.c_str(), num_scores, num_lex_scores, log_prob);
|
||||||
|
|
||||||
util::PrintUsage(std::cout);
|
util::PrintUsage(std::cout);
|
||||||
return 0;
|
return 0;
|
||||||
|
@ -141,12 +141,19 @@ void Huffman::serialize_maps(const char * dirname)
|
|||||||
os2.close();
|
os2.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<unsigned char> Huffman::full_encode_line(line_text &line)
|
std::vector<unsigned char> Huffman::full_encode_line(line_text &line, bool log_prob)
|
||||||
{
|
{
|
||||||
return vbyte_encode_line((encode_line(line)));
|
return vbyte_encode_line((encode_line(line, log_prob)));
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<unsigned int> Huffman::encode_line(line_text &line)
|
//! make sure score doesn't fall below LOWEST_SCORE: returns logScore unchanged when it is >= LOWEST_SCORE (-100.0f), otherwise returns LOWEST_SCORE
|
||||||
|
inline float FloorScore(float logScore)
|
||||||
|
{
|
||||||
|
const float LOWEST_SCORE = -100.0f;
|
||||||
|
return (std::max)(logScore , LOWEST_SCORE); // NOTE(review): std::max is parenthesized, presumably so a function-like max macro cannot expand here - confirm
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<unsigned int> Huffman::encode_line(line_text &line, bool log_prob)
|
||||||
{
|
{
|
||||||
std::vector<unsigned int> retvector;
|
std::vector<unsigned int> retvector;
|
||||||
|
|
||||||
@ -165,12 +172,17 @@ std::vector<unsigned int> Huffman::encode_line(line_text &line)
|
|||||||
//Sometimes we have too big floats to handle, so first convert to double
|
//Sometimes we have too big floats to handle, so first convert to double
|
||||||
double tempnum = atof(probit->data());
|
double tempnum = atof(probit->data());
|
||||||
float num = (float)tempnum;
|
float num = (float)tempnum;
|
||||||
|
if (log_prob) {
|
||||||
|
num = FloorScore(log(num));
|
||||||
|
if (num == 0.0f) num = 0.0000000001;
|
||||||
|
}
|
||||||
|
//cerr << "num=" << num << endl;
|
||||||
retvector.push_back(reinterpret_float(&num));
|
retvector.push_back(reinterpret_float(&num));
|
||||||
probit++;
|
probit++;
|
||||||
}
|
}
|
||||||
|
|
||||||
// append LexRO prob to pt scores
|
// append LexRO prob to pt scores
|
||||||
AppendLexRO(line, retvector);
|
AppendLexRO(line, retvector, log_prob);
|
||||||
|
|
||||||
//Add a zero;
|
//Add a zero;
|
||||||
retvector.push_back(0);
|
retvector.push_back(0);
|
||||||
@ -210,7 +222,7 @@ std::vector<unsigned int> Huffman::encode_line(line_text &line)
|
|||||||
return retvector;
|
return retvector;
|
||||||
}
|
}
|
||||||
|
|
||||||
void Huffman::AppendLexRO(line_text &line, std::vector<unsigned int> &retvector)
|
void Huffman::AppendLexRO(line_text &line, std::vector<unsigned int> &retvector, bool log_prob)
|
||||||
{
|
{
|
||||||
const StringPiece &origProperty = line.property_orig;
|
const StringPiece &origProperty = line.property_orig;
|
||||||
StringPiece::size_type startPos = origProperty.find("{{LexRO ");
|
StringPiece::size_type startPos = origProperty.find("{{LexRO ");
|
||||||
@ -228,6 +240,11 @@ void Huffman::AppendLexRO(line_text &line, std::vector<unsigned int> &retvector)
|
|||||||
|
|
||||||
double tempnum = atof(probStr.data());
|
double tempnum = atof(probStr.data());
|
||||||
float num = (float)tempnum;
|
float num = (float)tempnum;
|
||||||
|
if (log_prob) {
|
||||||
|
num = FloorScore(log(num));
|
||||||
|
if (num == 0.0f) num = 0.0000000001;
|
||||||
|
}
|
||||||
|
|
||||||
retvector.push_back(reinterpret_float(&num));
|
retvector.push_back(reinterpret_float(&num));
|
||||||
|
|
||||||
// exclude LexRO property from property column
|
// exclude LexRO property from property column
|
||||||
|
@ -53,10 +53,10 @@ public:
|
|||||||
void serialize_maps(const char * dirname);
|
void serialize_maps(const char * dirname);
|
||||||
void produce_lookups();
|
void produce_lookups();
|
||||||
|
|
||||||
std::vector<unsigned int> encode_line(line_text &line);
|
std::vector<unsigned int> encode_line(line_text &line, bool log_prob);
|
||||||
|
|
||||||
//encode line + variable byte on top
|
//encode line + variable byte on top
|
||||||
std::vector<unsigned char> full_encode_line(line_text &line);
|
std::vector<unsigned char> full_encode_line(line_text &line, bool log_prob);
|
||||||
|
|
||||||
//Getters
|
//Getters
|
||||||
const std::map<unsigned int, std::string> get_target_lookup_map() const {
|
const std::map<unsigned int, std::string> get_target_lookup_map() const {
|
||||||
@ -70,7 +70,7 @@ public:
|
|||||||
return uniq_lines;
|
return uniq_lines;
|
||||||
}
|
}
|
||||||
|
|
||||||
void AppendLexRO(line_text &line, std::vector<unsigned int> &retvector);
|
void AppendLexRO(line_text &line, std::vector<unsigned int> &retvector, bool log_prob);
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -7,6 +7,8 @@
|
|||||||
#include <fcntl.h>
|
#include <fcntl.h>
|
||||||
#include <fstream>
|
#include <fstream>
|
||||||
|
|
||||||
|
#define API_VERSION 6
|
||||||
|
|
||||||
|
|
||||||
//Hash table entry
|
//Hash table entry
|
||||||
struct Entry {
|
struct Entry {
|
||||||
|
@ -6,7 +6,6 @@
|
|||||||
#include <sys/stat.h> //For finding size of file
|
#include <sys/stat.h> //For finding size of file
|
||||||
#include "vocabid.hh"
|
#include "vocabid.hh"
|
||||||
#include <algorithm> //toLower
|
#include <algorithm> //toLower
|
||||||
#define API_VERSION 5
|
|
||||||
|
|
||||||
|
|
||||||
char * read_binary_file(char * filename);
|
char * read_binary_file(char * filename);
|
||||||
|
@ -39,7 +39,7 @@ BinaryFileWriter::~BinaryFileWriter ()
|
|||||||
}
|
}
|
||||||
|
|
||||||
void createProbingPT(const char * phrasetable_path, const char * target_path,
|
void createProbingPT(const char * phrasetable_path, const char * target_path,
|
||||||
int num_scores, int num_lex_scores)
|
int num_scores, int num_lex_scores, bool log_prob)
|
||||||
{
|
{
|
||||||
//Get basepath and create directory if missing
|
//Get basepath and create directory if missing
|
||||||
std::string basepath(target_path);
|
std::string basepath(target_path);
|
||||||
@ -110,7 +110,7 @@ void createProbingPT(const char * phrasetable_path, const char * target_path,
|
|||||||
entrystartidx = binfile.dist_from_start + binfile.extra_counter; //Designate start idx for new entry
|
entrystartidx = binfile.dist_from_start + binfile.extra_counter; //Designate start idx for new entry
|
||||||
|
|
||||||
//Encode a line and write it to disk.
|
//Encode a line and write it to disk.
|
||||||
std::vector<unsigned char> encoded_line = huffmanEncoder.full_encode_line(line);
|
std::vector<unsigned char> encoded_line = huffmanEncoder.full_encode_line(line, log_prob);
|
||||||
binfile.write(&encoded_line);
|
binfile.write(&encoded_line);
|
||||||
|
|
||||||
//Set prevLine
|
//Set prevLine
|
||||||
@ -118,7 +118,7 @@ void createProbingPT(const char * phrasetable_path, const char * target_path,
|
|||||||
|
|
||||||
} else {
|
} else {
|
||||||
//If we still have the same line, just append to it:
|
//If we still have the same line, just append to it:
|
||||||
std::vector<unsigned char> encoded_line = huffmanEncoder.full_encode_line(line);
|
std::vector<unsigned char> encoded_line = huffmanEncoder.full_encode_line(line, log_prob);
|
||||||
binfile.write(&encoded_line);
|
binfile.write(&encoded_line);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -157,5 +157,6 @@ void createProbingPT(const char * phrasetable_path, const char * target_path,
|
|||||||
configfile << uniq_entries << '\n';
|
configfile << uniq_entries << '\n';
|
||||||
configfile << num_scores << '\n';
|
configfile << num_scores << '\n';
|
||||||
configfile << num_lex_scores << '\n';
|
configfile << num_lex_scores << '\n';
|
||||||
|
configfile << log_prob << '\n';
|
||||||
configfile.close();
|
configfile.close();
|
||||||
}
|
}
|
||||||
|
@ -12,10 +12,9 @@
|
|||||||
#include "util/file_piece.hh"
|
#include "util/file_piece.hh"
|
||||||
#include "util/file.hh"
|
#include "util/file.hh"
|
||||||
#include "vocabid.hh"
|
#include "vocabid.hh"
|
||||||
#define API_VERSION 5
|
|
||||||
|
|
||||||
void createProbingPT(const char * phrasetable_path, const char * target_path,
|
void createProbingPT(const char * phrasetable_path, const char * target_path,
|
||||||
int num_scores, int num_lex_scores);
|
int num_scores, int num_lex_scores, bool log_prob);
|
||||||
|
|
||||||
class BinaryFileWriter
|
class BinaryFileWriter
|
||||||
{
|
{
|
||||||
|
Loading…
Reference in New Issue
Block a user