KenLM c1dba12

- Reject NaNs
- Fix ChartState hashing (unused in Moses)
- Expose CreateOrThrow
- Minor portability improvement in getopt
This commit is contained in:
Kenneth Heafield 2012-03-11 13:47:38 -04:00
parent d5efa27be9
commit 4bcd2c75ca
7 changed files with 38 additions and 29 deletions

View File

@ -112,7 +112,7 @@ inline size_t hash_value(const ChartState &state) {
size_t hashes[2]; size_t hashes[2];
hashes[0] = hash_value(state.left); hashes[0] = hash_value(state.left);
hashes[1] = hash_value(state.right); hashes[1] = hash_value(state.right);
return util::MurmurHashNative(hashes, sizeof(size_t), state.full); return util::MurmurHashNative(hashes, sizeof(size_t) * 2, state.full);
} }
template <class M> class RuleScore { template <class M> class RuleScore {

View File

@ -7,6 +7,7 @@
#include <vector> #include <vector>
#include <ctype.h> #include <ctype.h>
#include <math.h>
#include <string.h> #include <string.h>
#include <stdint.h> #include <stdint.h>
@ -93,7 +94,11 @@ void ReadBackoff(util::FilePiece &in, ProbBackoff &weights) {
case '\t': case '\t':
weights.backoff = in.ReadFloat(); weights.backoff = in.ReadFloat();
if (weights.backoff == ngram::kExtensionBackoff) weights.backoff = ngram::kNoExtensionBackoff; if (weights.backoff == ngram::kExtensionBackoff) weights.backoff = ngram::kNoExtensionBackoff;
if ((in.get() != '\n')) UTIL_THROW(FormatLoadException, "Expected newline after backoff"); {
int float_class = fpclassify(weights.backoff);
UTIL_THROW_IF(float_class == FP_NAN || float_class == FP_INFINITE, FormatLoadException, "Bad backoff " << weights.backoff);
}
UTIL_THROW_IF((in.get() != '\n'), FormatLoadException, "Expected newline after backoff");
break; break;
case '\n': case '\n':
weights.backoff = ngram::kNoExtensionBackoff; weights.backoff = ngram::kNoExtensionBackoff;

View File

@ -10,6 +10,8 @@
#include <iosfwd> #include <iosfwd>
#include <vector> #include <vector>
#include <math.h>
namespace lm { namespace lm {
void ReadARPACounts(util::FilePiece &in, std::vector<uint64_t> &number); void ReadARPACounts(util::FilePiece &in, std::vector<uint64_t> &number);
@ -29,20 +31,26 @@ class PositiveProbWarn {
explicit PositiveProbWarn(WarningAction action) : action_(action) {} explicit PositiveProbWarn(WarningAction action) : action_(action) {}
void Warn(float prob); float ReadProb(util::FilePiece &f) {
float prob = f.ReadFloat();
UTIL_THROW_IF(f.get() != '\t', FormatLoadException, "Expected tab after probability");
UTIL_THROW_IF(isnan(prob), FormatLoadException, "NaN probability");
if (prob > 0.0) {
Warn(prob);
prob = 0.0;
}
return prob;
}
private: private:
void Warn(float prob);
WarningAction action_; WarningAction action_;
}; };
template <class Voc> void Read1Gram(util::FilePiece &f, Voc &vocab, ProbBackoff *unigrams, PositiveProbWarn &warn) { template <class Voc> void Read1Gram(util::FilePiece &f, Voc &vocab, ProbBackoff *unigrams, PositiveProbWarn &warn) {
try { try {
float prob = f.ReadFloat(); float prob = warn.ReadProb(f);
if (prob > 0.0) {
warn.Warn(prob);
prob = 0.0;
}
if (f.get() != '\t') UTIL_THROW(FormatLoadException, "Expected tab after probability");
ProbBackoff &value = unigrams[vocab.Insert(f.ReadDelimited(kARPASpaces))]; ProbBackoff &value = unigrams[vocab.Insert(f.ReadDelimited(kARPASpaces))];
value.prob = prob; value.prob = prob;
ReadBackoff(f, value); ReadBackoff(f, value);
@ -64,11 +72,7 @@ template <class Voc> void Read1Grams(util::FilePiece &f, std::size_t count, Voc
// Returns nothing; a positive log probability is reported through warn and replaced with 0. // Returns nothing; a positive log probability is reported through warn and replaced with 0.
template <class Voc, class Weights> void ReadNGram(util::FilePiece &f, const unsigned char n, const Voc &vocab, WordIndex *const reverse_indices, Weights &weights, PositiveProbWarn &warn) { template <class Voc, class Weights> void ReadNGram(util::FilePiece &f, const unsigned char n, const Voc &vocab, WordIndex *const reverse_indices, Weights &weights, PositiveProbWarn &warn) {
try { try {
weights.prob = f.ReadFloat(); weights.prob = warn.ReadProb(f);
if (weights.prob > 0.0) {
warn.Warn(weights.prob);
weights.prob = 0.0;
}
for (WordIndex *vocab_out = reverse_indices + n - 1; vocab_out >= reverse_indices; --vocab_out) { for (WordIndex *vocab_out = reverse_indices + n - 1; vocab_out >= reverse_indices; --vocab_out) {
*vocab_out = vocab.Index(f.ReadDelimited(kARPASpaces)); *vocab_out = vocab.Index(f.ReadDelimited(kARPASpaces));
} }

View File

@ -42,6 +42,16 @@ int OpenReadOrThrow(const char *name) {
return ret; return ret;
} }
int CreateOrThrow(const char *name) {
int ret;
#if defined(_WIN32) || defined(_WIN64)
UTIL_THROW_IF(-1 == (ret = _open(name, _O_CREAT | _O_TRUNC | _O_RDWR, _S_IREAD | _S_IWRITE)), ErrnoException, "while creating " << name);
#else
UTIL_THROW_IF(-1 == (ret = open(name, O_CREAT | O_TRUNC | O_RDWR, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH)), ErrnoException, "while creating " << name);
#endif
return ret;
}
uint64_t SizeFile(int fd) { uint64_t SizeFile(int fd) {
#if defined(_WIN32) || defined(_WIN64) #if defined(_WIN32) || defined(_WIN64)
__int64 ret = _filelengthi64(fd); __int64 ret = _filelengthi64(fd);

View File

@ -65,7 +65,10 @@ class scoped_FILE {
std::FILE *file_; std::FILE *file_;
}; };
// Open for read only.
int OpenReadOrThrow(const char *name); int OpenReadOrThrow(const char *name);
// Create file if it doesn't exist, truncate if it does. Opened for reading and writing.
int CreateOrThrow(const char *name);
// Return value for SizeFile when it can't size properly. // Return value for SizeFile when it can't size properly.
const uint64_t kBadSize = (uint64_t)-1; const uint64_t kBadSize = (uint64_t)-1;

View File

@ -10,6 +10,7 @@ Code given out at the 1985 UNIFORUM conference in Dallas.
#include "getopt.hh" #include "getopt.hh"
#include <stdio.h> #include <stdio.h>
#include <string.h>
#define NULL 0 #define NULL 0
#define EOF (-1) #define EOF (-1)
@ -74,4 +75,4 @@ char **argv, *opts;
return(c); return(c);
} }
#endif /* __GNUC__ */ #endif /* __GNUC__ */

View File

@ -170,20 +170,6 @@ void *MapZeroedWrite(int fd, std::size_t size) {
return MapOrThrow(size, true, kFileFlags, false, fd, 0); return MapOrThrow(size, true, kFileFlags, false, fd, 0);
} }
namespace {
int CreateOrThrow(const char *name) {
int ret;
#if defined(_WIN32) || defined(_WIN64)
UTIL_THROW_IF(-1 == (ret = _open(name, _O_CREAT | _O_TRUNC | _O_RDWR, _S_IREAD | _S_IWRITE)), ErrnoException, "while creating " << name);
#else
UTIL_THROW_IF(-1 == (ret = open(name, O_CREAT | O_TRUNC | O_RDWR, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH)), ErrnoException, "while creating " << name);
#endif
return ret;
}
} // namespace
void *MapZeroedWrite(const char *name, std::size_t size, scoped_fd &file) { void *MapZeroedWrite(const char *name, std::size_t size, scoped_fd &file) {
file.reset(CreateOrThrow(name)); file.reset(CreateOrThrow(name));
try { try {