mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-27 05:55:02 +03:00
log prob
This commit is contained in:
parent
859705431a
commit
5d1bc4b406
@ -162,8 +162,11 @@ TargetPhrase *ProbingPT::CreateTargetPhrase(MemPool &pool, const System &system,
|
||||
// score for this phrase table
|
||||
SCORE scores[probingTargetPhrase.prob.size()];
|
||||
std::copy(probingTargetPhrase.prob.begin(), probingTargetPhrase.prob.end(), scores);
|
||||
std::transform(scores, scores + probingTargetPhrase.prob.size(), scores, TransformScore);
|
||||
std::transform(scores, scores + probingTargetPhrase.prob.size(), scores, FloorScore);
|
||||
|
||||
if (!m_engine->IsLogProb()) {
|
||||
std::transform(scores, scores + probingTargetPhrase.prob.size(), scores, TransformScore);
|
||||
std::transform(scores, scores + probingTargetPhrase.prob.size(), scores, FloorScore);
|
||||
}
|
||||
tp->GetScores().PlusEquals(system, *this, scores);
|
||||
|
||||
// extra scores
|
||||
|
@ -318,7 +318,7 @@ target_text *HuffmanDecoder::decode_line (const std::vector<unsigned int> &input
|
||||
|
||||
ret->prob.reserve(num_scores);
|
||||
//Split everything
|
||||
unsigned int wAll;
|
||||
unsigned int wAll = 1;
|
||||
|
||||
//Split the line into the proper arrays
|
||||
short num_zeroes = 0;
|
||||
|
@ -10,6 +10,8 @@
|
||||
namespace Moses2
|
||||
{
|
||||
|
||||
#define API_VERSION 6
|
||||
|
||||
//Hash table entry
|
||||
struct Entry {
|
||||
uint64_t key;
|
||||
|
@ -60,8 +60,8 @@ QueryEngine::QueryEngine(const char * filepath) : decoder(filepath)
|
||||
getline(config, line);
|
||||
num_lex_scores = atoi(line.c_str());
|
||||
// were the stored scores already log()-transformed and floored with FloorScore()?
|
||||
//getline(config, line);
|
||||
//logProb = atoi(line.c_str());
|
||||
getline(config, line);
|
||||
logProb = atoi(line.c_str());
|
||||
|
||||
config.close();
|
||||
|
||||
|
@ -9,8 +9,6 @@
|
||||
#include "hash.hh" //Includes line splitter
|
||||
#include "../../Vector.h"
|
||||
|
||||
#define API_VERSION 5
|
||||
|
||||
namespace Moses2
|
||||
{
|
||||
|
||||
@ -50,6 +48,8 @@ public:
|
||||
return source_vocabids;
|
||||
}
|
||||
|
||||
// Returns the value of the logProb member.
// NOTE(review): presumably this is the flag read from the table's config that says
// the stored scores were already log()'d and floored — confirm against the class definition.
bool IsLogProb() const
|
||||
{ return logProb; }
|
||||
};
|
||||
|
||||
}
|
||||
|
@ -10,6 +10,7 @@ int main(int argc, char* argv[])
|
||||
string inPath, outPath;
|
||||
int num_scores = 4;
|
||||
int num_lex_scores = 0;
|
||||
bool log_prob = false;
|
||||
|
||||
namespace po = boost::program_options;
|
||||
po::options_description desc("Options");
|
||||
@ -19,6 +20,7 @@ int main(int argc, char* argv[])
|
||||
("output-dir", po::value<string>()->required(), "Directory when binary files will be written")
|
||||
("num-scores", po::value<int>()->default_value(num_scores), "Number of pt scores")
|
||||
("num-lex-scores", po::value<int>()->default_value(num_lex_scores), "Number of lexicalized reordering scores")
|
||||
("log-prob", "log (and floor) probabilities before storing")
|
||||
|
||||
;
|
||||
|
||||
@ -46,9 +48,10 @@ int main(int argc, char* argv[])
|
||||
if (vm.count("output-dir")) outPath = vm["output-dir"].as<string>();
|
||||
if (vm.count("num-scores")) num_scores = vm["num-scores"].as<int>();
|
||||
if (vm.count("num-lex-scores")) num_lex_scores = vm["num-lex-scores"].as<int>();
|
||||
if (vm.count("log-prob")) log_prob = true;
|
||||
|
||||
|
||||
createProbingPT(inPath.c_str(), outPath.c_str(), num_scores, num_lex_scores);
|
||||
createProbingPT(inPath.c_str(), outPath.c_str(), num_scores, num_lex_scores, log_prob);
|
||||
|
||||
util::PrintUsage(std::cout);
|
||||
return 0;
|
||||
|
@ -141,12 +141,19 @@ void Huffman::serialize_maps(const char * dirname)
|
||||
os2.close();
|
||||
}
|
||||
|
||||
std::vector<unsigned char> Huffman::full_encode_line(line_text &line)
|
||||
std::vector<unsigned char> Huffman::full_encode_line(line_text &line, bool log_prob)
|
||||
{
|
||||
return vbyte_encode_line((encode_line(line)));
|
||||
return vbyte_encode_line((encode_line(line, log_prob)));
|
||||
}
|
||||
|
||||
std::vector<unsigned int> Huffman::encode_line(line_text &line)
|
||||
//! Clamp a log-domain score so that it never falls below LOWEST_SCORE (-100).
//! \param logScore score to clamp
//! \return logScore if it is greater than LOWEST_SCORE, otherwise LOWEST_SCORE.
//! A NaN input (e.g. the log of a negative number) also yields LOWEST_SCORE;
//! std::max(logScore, LOWEST_SCORE) would hand the NaN back unchanged because
//! every ordered comparison against NaN is false.
inline float FloorScore(float logScore)
{
  const float LOWEST_SCORE = -100.0f;
  // Phrased as a positive test so that NaN takes the floor branch.
  return (logScore > LOWEST_SCORE) ? logScore : LOWEST_SCORE;
}
|
||||
|
||||
std::vector<unsigned int> Huffman::encode_line(line_text &line, bool log_prob)
|
||||
{
|
||||
std::vector<unsigned int> retvector;
|
||||
|
||||
@ -165,12 +172,17 @@ std::vector<unsigned int> Huffman::encode_line(line_text &line)
|
||||
//Some values are too large to parse directly as a float, so parse as a double first and then narrow
|
||||
double tempnum = atof(probit->data());
|
||||
float num = (float)tempnum;
|
||||
if (log_prob) {
|
||||
num = FloorScore(log(num));
|
||||
if (num == 0.0f) num = 0.0000000001;
|
||||
}
|
||||
//cerr << "num=" << num << endl;
|
||||
retvector.push_back(reinterpret_float(&num));
|
||||
probit++;
|
||||
}
|
||||
|
||||
// append LexRO prob to pt scores
|
||||
AppendLexRO(line, retvector);
|
||||
AppendLexRO(line, retvector, log_prob);
|
||||
|
||||
//Add a zero;
|
||||
retvector.push_back(0);
|
||||
@ -210,7 +222,7 @@ std::vector<unsigned int> Huffman::encode_line(line_text &line)
|
||||
return retvector;
|
||||
}
|
||||
|
||||
void Huffman::AppendLexRO(line_text &line, std::vector<unsigned int> &retvector)
|
||||
void Huffman::AppendLexRO(line_text &line, std::vector<unsigned int> &retvector, bool log_prob)
|
||||
{
|
||||
const StringPiece &origProperty = line.property_orig;
|
||||
StringPiece::size_type startPos = origProperty.find("{{LexRO ");
|
||||
@ -228,6 +240,11 @@ void Huffman::AppendLexRO(line_text &line, std::vector<unsigned int> &retvector)
|
||||
|
||||
double tempnum = atof(probStr.data());
|
||||
float num = (float)tempnum;
|
||||
if (log_prob) {
|
||||
num = FloorScore(log(num));
|
||||
if (num == 0.0f) num = 0.0000000001;
|
||||
}
|
||||
|
||||
retvector.push_back(reinterpret_float(&num));
|
||||
|
||||
// exclude LexRO property from property column
|
||||
|
@ -53,10 +53,10 @@ public:
|
||||
void serialize_maps(const char * dirname);
|
||||
void produce_lookups();
|
||||
|
||||
std::vector<unsigned int> encode_line(line_text &line);
|
||||
std::vector<unsigned int> encode_line(line_text &line, bool log_prob);
|
||||
|
||||
//Encode the line, then apply variable-byte encoding on top of the result
|
||||
std::vector<unsigned char> full_encode_line(line_text &line);
|
||||
std::vector<unsigned char> full_encode_line(line_text &line, bool log_prob);
|
||||
|
||||
//Getters
|
||||
const std::map<unsigned int, std::string> get_target_lookup_map() const {
|
||||
@ -70,7 +70,7 @@ public:
|
||||
return uniq_lines;
|
||||
}
|
||||
|
||||
void AppendLexRO(line_text &line, std::vector<unsigned int> &retvector);
|
||||
void AppendLexRO(line_text &line, std::vector<unsigned int> &retvector, bool log_prob);
|
||||
|
||||
};
|
||||
|
||||
|
@ -7,6 +7,8 @@
|
||||
#include <fcntl.h>
|
||||
#include <fstream>
|
||||
|
||||
#define API_VERSION 6
|
||||
|
||||
|
||||
//Hash table entry
|
||||
struct Entry {
|
||||
|
@ -6,7 +6,6 @@
|
||||
#include <sys/stat.h> //For finding size of file
|
||||
#include "vocabid.hh"
|
||||
#include <algorithm> //toLower
|
||||
#define API_VERSION 5
|
||||
|
||||
|
||||
char * read_binary_file(char * filename);
|
||||
|
@ -39,7 +39,7 @@ BinaryFileWriter::~BinaryFileWriter ()
|
||||
}
|
||||
|
||||
void createProbingPT(const char * phrasetable_path, const char * target_path,
|
||||
int num_scores, int num_lex_scores)
|
||||
int num_scores, int num_lex_scores, bool log_prob)
|
||||
{
|
||||
//Get basepath and create directory if missing
|
||||
std::string basepath(target_path);
|
||||
@ -110,7 +110,7 @@ void createProbingPT(const char * phrasetable_path, const char * target_path,
|
||||
entrystartidx = binfile.dist_from_start + binfile.extra_counter; //Designate start idx for new entry
|
||||
|
||||
//Encode a line and write it to disk.
|
||||
std::vector<unsigned char> encoded_line = huffmanEncoder.full_encode_line(line);
|
||||
std::vector<unsigned char> encoded_line = huffmanEncoder.full_encode_line(line, log_prob);
|
||||
binfile.write(&encoded_line);
|
||||
|
||||
//Set prevLine
|
||||
@ -118,7 +118,7 @@ void createProbingPT(const char * phrasetable_path, const char * target_path,
|
||||
|
||||
} else {
|
||||
//If we still have the same line, just append to it:
|
||||
std::vector<unsigned char> encoded_line = huffmanEncoder.full_encode_line(line);
|
||||
std::vector<unsigned char> encoded_line = huffmanEncoder.full_encode_line(line, log_prob);
|
||||
binfile.write(&encoded_line);
|
||||
}
|
||||
|
||||
@ -157,5 +157,6 @@ void createProbingPT(const char * phrasetable_path, const char * target_path,
|
||||
configfile << uniq_entries << '\n';
|
||||
configfile << num_scores << '\n';
|
||||
configfile << num_lex_scores << '\n';
|
||||
configfile << log_prob << '\n';
|
||||
configfile.close();
|
||||
}
|
||||
|
@ -12,10 +12,9 @@
|
||||
#include "util/file_piece.hh"
|
||||
#include "util/file.hh"
|
||||
#include "vocabid.hh"
|
||||
#define API_VERSION 5
|
||||
|
||||
void createProbingPT(const char * phrasetable_path, const char * target_path,
|
||||
int num_scores, int num_lex_scores);
|
||||
int num_scores, int num_lex_scores, bool log_prob);
|
||||
|
||||
class BinaryFileWriter
|
||||
{
|
||||
|
Loading…
Reference in New Issue
Block a user