KenLM 6b4a1c7940a36026de1d96693ccb6ec0f16de8dc

This commit is contained in:
Kenneth Heafield 2013-06-24 16:05:47 +01:00
parent f3cd72537c
commit 794867c555
23 changed files with 235 additions and 226 deletions

View File

@ -33,6 +33,8 @@ int main(int argc, char *argv[]) {
po::options_description options("Language model building options");
lm::builder::PipelineConfig pipeline;
std::string text, arpa;
options.add_options()
("order,o", po::value<std::size_t>(&pipeline.order)
#if BOOST_VERSION >= 104200
@ -47,18 +49,21 @@ int main(int argc, char *argv[]) {
("vocab_estimate", po::value<lm::WordIndex>(&pipeline.vocab_estimate)->default_value(1000000), "Assume this vocabulary size for purposes of calculating memory in step 1 (corpus count) and pre-sizing the hash table")
("block_count", po::value<std::size_t>(&pipeline.block_count)->default_value(2), "Block count (per order)")
("vocab_file", po::value<std::string>(&pipeline.vocab_file)->default_value(""), "Location to write vocabulary file")
("verbose_header", po::bool_switch(&pipeline.verbose_header), "Add a verbose header to the ARPA file that includes information such as token count, smoothing type, etc.");
("verbose_header", po::bool_switch(&pipeline.verbose_header), "Add a verbose header to the ARPA file that includes information such as token count, smoothing type, etc.")
("text", po::value<std::string>(&text), "Read text from a file instead of stdin")
("arpa", po::value<std::string>(&arpa), "Write ARPA to a file instead of stdout");
if (argc == 1) {
std::cerr <<
"Builds unpruned language models with modified Kneser-Ney smoothing.\n\n"
"Please cite:\n"
"@inproceedings{kenlm,\n"
"author = {Kenneth Heafield},\n"
"title = {{KenLM}: Faster and Smaller Language Model Queries},\n"
"booktitle = {Proceedings of the Sixth Workshop on Statistical Machine Translation},\n"
"month = {July}, year={2011},\n"
"address = {Edinburgh, UK},\n"
"publisher = {Association for Computational Linguistics},\n"
"@inproceedings{Heafield-estimate,\n"
" author = {Kenneth Heafield and Ivan Pouzyrevsky and Jonathan H. Clark and Philipp Koehn},\n"
" title = {Scalable Modified {Kneser-Ney} Language Model Estimation},\n"
" year = {2013},\n"
" month = {8},\n"
" booktitle = {Proceedings of the 51st Annual Meeting of the Association for Computational Linguistics},\n"
" address = {Sofia, Bulgaria},\n"
" url = {http://kheafield.com/professional/edinburgh/estimate\\_paper.pdf},\n"
"}\n\n"
"Provide the corpus on stdin. The ARPA file will be written to stdout. Order of\n"
"the model (-o) is the only mandatory option. As this is an on-disk program,\n"
@ -91,9 +96,17 @@ int main(int argc, char *argv[]) {
initial.adder_out.block_count = 2;
pipeline.read_backoffs = initial.adder_out;
util::scoped_fd in(0), out(1);
if (vm.count("text")) {
in.reset(util::OpenReadOrThrow(text.c_str()));
}
if (vm.count("arpa")) {
out.reset(util::CreateOrThrow(arpa.c_str()));
}
// Read from stdin
try {
lm::builder::Pipeline(pipeline, 0, 1);
lm::builder::Pipeline(pipeline, in.release(), out.release());
} catch (const util::MallocException &e) {
std::cerr << e.what() << std::endl;
std::cerr << "Try rerunning with a more conservative -S setting than " << vm["memory"].as<std::string>() << std::endl;

View File

@ -53,7 +53,7 @@ class NGram {
Payload &Value() { return *reinterpret_cast<Payload *>(end_); }
uint64_t &Count() { return Value().count; }
const uint64_t Count() const { return Value().count; }
uint64_t Count() const { return Value().count; }
std::size_t Order() const { return end_ - begin_; }

View File

@ -304,5 +304,26 @@ template class GenericModel<trie::TrieSearch<SeparatelyQuantize, trie::DontBhiks
template class GenericModel<trie::TrieSearch<SeparatelyQuantize, trie::ArrayBhiksha>, SortedVocabulary>;
} // namespace detail
// Load the model stored in file_name and return it through the virtual base
// class.  The object is created with `new`, so the caller is presumably
// responsible for deleting it.
//
// file_name:  path handed to RecognizeBinary and to the chosen model constructor.
// config:     forwarded unchanged to the chosen model constructor.
// model_type: taken by value and passed to RecognizeBinary before dispatch.
//             NOTE(review): presumably RecognizeBinary overwrites it when the
//             file is a binary model, leaving the caller's value in effect
//             otherwise -- confirm against RecognizeBinary's declaration.
//
// Throws FormatLoadException (via UTIL_THROW) when model_type matches none of
// the cases below.
base::Model *LoadVirtual(const char *file_name, const Config &config, ModelType model_type) {
// The return value of RecognizeBinary is ignored; only model_type is used afterwards.
RecognizeBinary(file_name, model_type);
// One concrete model class per ModelType value.
switch (model_type) {
case PROBING:
return new ProbingModel(file_name, config);
case REST_PROBING:
return new RestProbingModel(file_name, config);
case TRIE:
return new TrieModel(file_name, config);
case QUANT_TRIE:
return new QuantTrieModel(file_name, config);
case ARRAY_TRIE:
return new ArrayTrieModel(file_name, config);
case QUANT_ARRAY_TRIE:
return new QuantArrayTrieModel(file_name, config);
default:
// No case matched: report the numeric model type in the exception message.
UTIL_THROW(FormatLoadException, "Confused by model type " << model_type);
}
}
} // namespace ngram
} // namespace lm

View File

@ -153,6 +153,11 @@ LM_NAME_MODEL(QuantArrayTrieModel, detail::GenericModel<trie::TrieSearch<Separat
typedef ::lm::ngram::ProbingVocabulary Vocabulary;
typedef ProbingModel Model;
/* Autorecognize the file type, load, and return the virtual base class. Don't
* use the virtual base class if you can avoid it. Instead, use the above
* classes as template arguments to your own virtual feature function.*/
base::Model *LoadVirtual(const char *file_name, const Config &config = Config(), ModelType if_arpa = PROBING);
} // namespace ngram
} // namespace lm

View File

@ -54,7 +54,7 @@ template <class Weights> class ActivateUnigram {
Weights *modify_;
};
// Find the lower order entry, inserting blanks along the way as necessary.
// Find the lower order entry, inserting blanks along the way as necessary.
template <class Value> void FindLower(
const std::vector<uint64_t> &keys,
typename Value::Weights &unigram,
@ -64,7 +64,7 @@ template <class Value> void FindLower(
typename Value::ProbingEntry entry;
// Backoff will always be 0.0. We'll get the probability and rest in another pass.
entry.value.backoff = kNoExtensionBackoff;
// Go back and find the longest right-aligned entry, informing it that it extends left. Normally this will match immediately, but sometimes SRI is dumb.
// Go back and find the longest right-aligned entry, informing it that it extends left. Normally this will match immediately, but sometimes SRI is dumb.
for (int lower = keys.size() - 2; ; --lower) {
if (lower == -1) {
between.push_back(&unigram);
@ -77,11 +77,11 @@ template <class Value> void FindLower(
}
}
// Between usually has a single entry, the value to adjust. But sometimes SRI stupidly pruned entries so it has uninitialized blank values to be set here.
// Between usually has a single entry, the value to adjust. But sometimes SRI stupidly pruned entries so it has uninitialized blank values to be set here.
template <class Added, class Build> void AdjustLower(
const Added &added,
const Build &build,
std::vector<typename Build::Value::Weights *> &between,
std::vector<typename Build::Value::Weights *> &between,
const unsigned int n,
const std::vector<WordIndex> &vocab_ids,
typename Build::Value::Weights *unigrams,
@ -93,14 +93,14 @@ template <class Added, class Build> void AdjustLower(
}
typedef util::ProbingHashTable<typename Value::ProbingEntry, util::IdentityHash> Middle;
float prob = -fabs(between.back()->prob);
// Order of the n-gram on which probabilities are based.
// Order of the n-gram on which probabilities are based.
unsigned char basis = n - between.size();
assert(basis != 0);
typename Build::Value::Weights **change = &between.back();
// Skip the basis.
--change;
if (basis == 1) {
// Hallucinate a bigram based on a unigram's backoff and a unigram probability.
// Hallucinate a bigram based on a unigram's backoff and a unigram probability.
float &backoff = unigrams[vocab_ids[1]].backoff;
SetExtension(backoff);
prob += backoff;
@ -128,14 +128,14 @@ template <class Added, class Build> void AdjustLower(
typename std::vector<typename Value::Weights *>::const_iterator i(between.begin());
build.MarkExtends(**i, added);
const typename Value::Weights *longer = *i;
// Everything has probability but is not marked as extending.
// Everything has probability but is not marked as extending.
for (++i; i != between.end(); ++i) {
build.MarkExtends(**i, *longer);
longer = *i;
}
}
// Continue marking lower entries even if they know that they extend left. This is used for upper/lower bounds.
// Continue marking lower entries even if they know that they extend left. This is used for upper/lower bounds.
template <class Build> void MarkLower(
const std::vector<uint64_t> &keys,
const Build &build,
@ -144,15 +144,15 @@ template <class Build> void MarkLower(
int start_order,
const typename Build::Value::Weights &longer) {
if (start_order == 0) return;
typename util::ProbingHashTable<typename Build::Value::ProbingEntry, util::IdentityHash>::MutableIterator iter;
// Hopefully the compiler will realize that if MarkExtends always returns false, it can simplify this code.
// Hopefully the compiler will realize that if MarkExtends always returns false, it can simplify this code.
for (int even_lower = start_order - 2 /* index in middle */; ; --even_lower) {
if (even_lower == -1) {
build.MarkExtends(unigram, longer);
return;
}
middle[even_lower].UnsafeMutableFind(keys[even_lower], iter);
if (!build.MarkExtends(iter->value, longer)) return;
if (!build.MarkExtends(
middle[even_lower].UnsafeMutableMustFind(keys[even_lower])->value,
longer)) return;
}
}
@ -168,7 +168,6 @@ template <class Build, class Activate, class Store> void ReadNGrams(
Store &store,
PositiveProbWarn &warn) {
typedef typename Build::Value Value;
typedef util::ProbingHashTable<typename Value::ProbingEntry, util::IdentityHash> Middle;
assert(n >= 2);
ReadNGramHeader(f, n);
@ -186,7 +185,7 @@ template <class Build, class Activate, class Store> void ReadNGrams(
for (unsigned int h = 1; h < n - 1; ++h) {
keys[h] = detail::CombineWordHash(keys[h-1], vocab_ids[h+1]);
}
// Initially the sign bit is on, indicating it does not extend left. Most already have this but there might be +0.0.
// Initially the sign bit is on, indicating it does not extend left. Most already have this but there might be +0.0.
util::SetSign(entry.value.prob);
entry.key = keys[n-2];
@ -203,7 +202,7 @@ template <class Build, class Activate, class Store> void ReadNGrams(
} // namespace
namespace detail {
template <class Value> uint8_t *HashedSearch<Value>::SetupMemory(uint8_t *start, const std::vector<uint64_t> &counts, const Config &config) {
std::size_t allocated = Unigram::Size(counts[0]);
unigram_ = Unigram(start, counts[0], allocated);

View File

@ -71,7 +71,7 @@ template <class Value> class HashedSearch {
static const bool kDifferentRest = Value::kDifferentRest;
static const unsigned int kVersion = 0;
// TODO: move probing_multiplier here with next binary file format update.
// TODO: move probing_multiplier here with next binary file format update.
static void UpdateConfigFromBinary(int, const std::vector<uint64_t> &, Config &) {}
static uint64_t Size(const std::vector<uint64_t> &counts, const Config &config) {
@ -102,14 +102,9 @@ template <class Value> class HashedSearch {
return ret;
}
#pragma GCC diagnostic ignored "-Wuninitialized"
MiddlePointer Unpack(uint64_t extend_pointer, unsigned char extend_length, Node &node) const {
node = extend_pointer;
typename Middle::ConstIterator found;
bool got = middle_[extend_length - 2].Find(extend_pointer, found);
assert(got);
(void)got;
return MiddlePointer(found->value);
return MiddlePointer(middle_[extend_length - 2].MustFind(extend_pointer)->value);
}
MiddlePointer LookupMiddle(unsigned char order_minus_2, WordIndex word, Node &node, bool &independent_left, uint64_t &extend_pointer) const {
@ -126,14 +121,14 @@ template <class Value> class HashedSearch {
}
LongestPointer LookupLongest(WordIndex word, const Node &node) const {
// Sign bit is always on because longest n-grams do not extend left.
// Sign bit is always on because longest n-grams do not extend left.
typename Longest::ConstIterator found;
if (!longest_.Find(CombineWordHash(node, word), found)) return LongestPointer();
return LongestPointer(found->value.prob);
}
// Generate a node without necessarily checking that it actually exists.
// Optionally return false if it's known to not exist.
// Generate a node without necessarily checking that it actually exists.
// Optionally return false if it's known to not exist.
bool FastMakeNode(const WordIndex *begin, const WordIndex *end, Node &node) const {
assert(begin != end);
node = static_cast<Node>(*begin);
@ -144,7 +139,7 @@ template <class Value> class HashedSearch {
}
private:
// Interpret config's rest cost build policy and pass the right template argument to ApplyBuild.
// Interpret config's rest cost build policy and pass the right template argument to ApplyBuild.
void DispatchBuild(util::FilePiece &f, const std::vector<uint64_t> &counts, const Config &config, const ProbingVocabulary &vocab, PositiveProbWarn &warn);
template <class Build> void ApplyBuild(util::FilePiece &f, const std::vector<uint64_t> &counts, const ProbingVocabulary &vocab, PositiveProbWarn &warn, const Build &build);
@ -153,7 +148,7 @@ template <class Value> class HashedSearch {
public:
Unigram() {}
Unigram(void *start, uint64_t count, std::size_t /*allocated*/) :
Unigram(void *start, uint64_t count, std::size_t /*allocated*/) :
unigram_(static_cast<typename Value::Weights*>(start))
#ifdef DEBUG
, count_(count)

View File

@ -6,6 +6,7 @@
#include "util/string_piece.hh"
#include <string>
#include <string.h>
namespace lm {
namespace base {
@ -119,7 +120,9 @@ class Model {
size_t StateSize() const { return state_size_; }
const void *BeginSentenceMemory() const { return begin_sentence_memory_; }
void BeginSentenceWrite(void *to) const { memcpy(to, begin_sentence_memory_, StateSize()); }
const void *NullContextMemory() const { return null_context_memory_; }
void NullContextWrite(void *to) const { memcpy(to, null_context_memory_, StateSize()); }
// Requires in_state != out_state
virtual float Score(const void *in_state, const WordIndex new_word, void *out_state) const = 0;

View File

@ -30,8 +30,7 @@
#include "utils.h"
namespace double_conversion
{
namespace double_conversion {
enum BignumDtoaMode {
// Return the shortest correct representation.

View File

@ -30,12 +30,10 @@
#include "utils.h"
namespace double_conversion
{
namespace double_conversion {
class Bignum
{
public:
class Bignum {
public:
// 3584 = 128 * 28. We can represent 2^3584 > 10^1000 accurately.
// This bignum can encode much bigger numbers, since it contains an
// exponent.
@ -62,9 +60,7 @@ public:
void MultiplyByUInt32(uint32_t factor);
void MultiplyByUInt64(uint64_t factor);
void MultiplyByPowerOfTen(int exponent);
void Times10() {
return MultiplyByUInt32(10);
}
void Times10() { return MultiplyByUInt32(10); }
// Pseudocode:
// int result = this / other;
// this = this % other;
@ -101,7 +97,7 @@ public:
static bool PlusLess(const Bignum& a, const Bignum& b, const Bignum& c) {
return PlusCompare(a, b, c) < 0;
}
private:
private:
typedef uint32_t Chunk;
typedef uint64_t DoubleChunk;
@ -129,9 +125,7 @@ private:
// shift_amount must be < kBigitSize.
void BigitsShiftLeft(int shift_amount);
// BigitLength includes the "hidden" digits encoded in the exponent.
int BigitLength() const {
return used_digits_ + exponent_;
}
int BigitLength() const { return used_digits_ + exponent_; }
Chunk BigitAt(int index) const;
void SubtractTimes(const Bignum& other, int factor);

View File

@ -30,12 +30,10 @@
#include "diy-fp.h"
namespace double_conversion
{
namespace double_conversion {
class PowersOfTenCache
{
public:
class PowersOfTenCache {
public:
// Not all powers of ten are cached. The decimal exponent of two neighboring
// cached numbers will differ by kDecimalExponentDistance.
@ -47,9 +45,9 @@ public:
// Returns a cached power-of-ten with a binary exponent in the range
// [min_exponent; max_exponent] (boundaries included).
static void GetCachedPowerForBinaryExponentRange(int min_exponent,
int max_exponent,
DiyFp* power,
int* decimal_exponent);
int max_exponent,
DiyFp* power,
int* decimal_exponent);
// Returns a cached power of ten x ~= 10^k such that
// k <= decimal_exponent < k + kCachedPowersDecimalDistance.
@ -57,8 +55,8 @@ public:
// kMinDecimalExponent <= requested_exponent, and
// requested_exponent < kMaxDecimalExponent + kDecimalExponentDistance.
static void GetCachedPowerForDecimalExponent(int requested_exponent,
DiyFp* power,
int* found_exponent);
DiyFp* power,
int* found_exponent);
};
} // namespace double_conversion

View File

@ -30,17 +30,15 @@
#include "utils.h"
namespace double_conversion
{
namespace double_conversion {
// This "Do It Yourself Floating Point" class implements a floating-point number
// with a uint64 significand and an int exponent. Normalized DiyFp numbers will
// have the most significant bit of the significand set.
// Multiplication and Subtraction do not normalize their results.
// DiyFp are not designed to contain special doubles (NaN and Infinity).
class DiyFp
{
public:
class DiyFp {
public:
static const int kSignificandSize = 64;
DiyFp() : f_(0), e_(0) {}
@ -102,21 +100,13 @@ public:
return result;
}
uint64_t f() const {
return f_;
}
int e() const {
return e_;
}
uint64_t f() const { return f_; }
int e() const { return e_; }
void set_f(uint64_t new_value) {
f_ = new_value;
}
void set_e(int new_value) {
e_ = new_value;
}
void set_f(uint64_t new_value) { f_ = new_value; }
void set_e(int new_value) { e_ = new_value; }
private:
private:
static const uint64_t kUint64MSB = UINT64_2PART_C(0x80000000, 00000000);
uint64_t f_;

View File

@ -30,12 +30,10 @@
#include "utils.h"
namespace double_conversion
{
namespace double_conversion {
class DoubleToStringConverter
{
public:
class DoubleToStringConverter {
public:
// When calling ToFixed with a double > 10^kMaxFixedDigitsBeforePoint
// or a requested_digits parameter > kMaxFixedDigitsAfterPoint then the
// function returns false.
@ -114,20 +112,20 @@ public:
int decimal_in_shortest_high,
int max_leading_padding_zeroes_in_precision_mode,
int max_trailing_padding_zeroes_in_precision_mode)
: flags_(flags),
infinity_symbol_(infinity_symbol),
nan_symbol_(nan_symbol),
exponent_character_(exponent_character),
decimal_in_shortest_low_(decimal_in_shortest_low),
decimal_in_shortest_high_(decimal_in_shortest_high),
max_leading_padding_zeroes_in_precision_mode_(
max_leading_padding_zeroes_in_precision_mode),
max_trailing_padding_zeroes_in_precision_mode_(
max_trailing_padding_zeroes_in_precision_mode) {
: flags_(flags),
infinity_symbol_(infinity_symbol),
nan_symbol_(nan_symbol),
exponent_character_(exponent_character),
decimal_in_shortest_low_(decimal_in_shortest_low),
decimal_in_shortest_high_(decimal_in_shortest_high),
max_leading_padding_zeroes_in_precision_mode_(
max_leading_padding_zeroes_in_precision_mode),
max_trailing_padding_zeroes_in_precision_mode_(
max_trailing_padding_zeroes_in_precision_mode) {
// When 'trailing zero after the point' is set, then 'trailing point'
// must be set too.
ASSERT(((flags & EMIT_TRAILING_DECIMAL_POINT) != 0) ||
!((flags & EMIT_TRAILING_ZERO_AFTER_POINT) != 0));
!((flags & EMIT_TRAILING_ZERO_AFTER_POINT) != 0));
}
// Returns a converter following the EcmaScript specification.
@ -343,7 +341,7 @@ public:
int* length,
int* point);
private:
private:
// Implementation for ToShortest and ToShortestSingle.
bool ToShortestIeeeNumber(double value,
StringBuilder* result_builder,
@ -380,9 +378,8 @@ private:
};
class StringToDoubleConverter
{
public:
class StringToDoubleConverter {
public:
// Enumeration for allowing octals and ignoring junk when converting
// strings to numbers.
enum Flags {
@ -491,11 +488,11 @@ public:
double junk_string_value,
const char* infinity_symbol,
const char* nan_symbol)
: flags_(flags),
empty_string_value_(empty_string_value),
junk_string_value_(junk_string_value),
infinity_symbol_(infinity_symbol),
nan_symbol_(nan_symbol) {
: flags_(flags),
empty_string_value_(empty_string_value),
junk_string_value_(junk_string_value),
infinity_symbol_(infinity_symbol),
nan_symbol_(nan_symbol) {
}
// Performs the conversion.
@ -519,7 +516,7 @@ public:
processed_characters_count, false));
}
private:
private:
const int flags_;
const double empty_string_value_;
const double junk_string_value_;

View File

@ -30,8 +30,7 @@
#include "utils.h"
namespace double_conversion
{
namespace double_conversion {
enum FastDtoaMode {
// Computes the shortest representation of the given input. The returned

View File

@ -30,8 +30,7 @@
#include "utils.h"
namespace double_conversion
{
namespace double_conversion {
// Produces digits necessary to print a given number with
// 'fractional_count' digits after the decimal point.

View File

@ -30,31 +30,17 @@
#include "diy-fp.h"
namespace double_conversion
{
namespace double_conversion {
// We assume that doubles and uint64_t have the same endianness.
static uint64_t double_to_uint64(double d)
{
return BitCast<uint64_t>(d);
}
static double uint64_to_double(uint64_t d64)
{
return BitCast<double>(d64);
}
static uint32_t float_to_uint32(float f)
{
return BitCast<uint32_t>(f);
}
static float uint32_to_float(uint32_t d32)
{
return BitCast<float>(d32);
}
static uint64_t double_to_uint64(double d) { return BitCast<uint64_t>(d); }
static double uint64_to_double(uint64_t d64) { return BitCast<double>(d64); }
static uint32_t float_to_uint32(float f) { return BitCast<uint32_t>(f); }
static float uint32_to_float(uint32_t d32) { return BitCast<float>(d32); }
// Helper functions for doubles.
class Double
{
public:
class Double {
public:
static const uint64_t kSignMask = UINT64_2PART_C(0x80000000, 00000000);
static const uint64_t kExponentMask = UINT64_2PART_C(0x7FF00000, 00000000);
static const uint64_t kSignificandMask = UINT64_2PART_C(0x000FFFFF, FFFFFFFF);
@ -127,7 +113,7 @@ public:
uint64_t d64 = AsUint64();
int biased_e =
static_cast<int>((d64 & kExponentMask) >> kPhysicalSignificandSize);
static_cast<int>((d64 & kExponentMask) >> kPhysicalSignificandSize);
return biased_e - kExponentBias;
}
@ -157,13 +143,13 @@ public:
bool IsNan() const {
uint64_t d64 = AsUint64();
return ((d64 & kExponentMask) == kExponentMask) &&
((d64 & kSignificandMask) != 0);
((d64 & kSignificandMask) != 0);
}
bool IsInfinite() const {
uint64_t d64 = AsUint64();
return ((d64 & kExponentMask) == kExponentMask) &&
((d64 & kSignificandMask) == 0);
((d64 & kSignificandMask) == 0);
}
int Sign() const {
@ -211,9 +197,7 @@ public:
return physical_significand_is_zero && (Exponent() != kDenormalExponent);
}
double value() const {
return uint64_to_double(d64_);
}
double value() const { return uint64_to_double(d64_); }
// Returns the significand size for a given order of magnitude.
// If v = f*2^e with 2^p-1 <= f <= 2^p then p+e is v's order of magnitude.
@ -237,7 +221,7 @@ public:
return Double(kNaN).value();
}
private:
private:
static const int kExponentBias = 0x3FF + kPhysicalSignificandSize;
static const int kDenormalExponent = -kExponentBias + 1;
static const int kMaxExponent = 0x7FF - kExponentBias;
@ -270,13 +254,12 @@ private:
biased_exponent = static_cast<uint64_t>(exponent + kExponentBias);
}
return (significand & kSignificandMask) |
(biased_exponent << kPhysicalSignificandSize);
(biased_exponent << kPhysicalSignificandSize);
}
};
class Single
{
public:
class Single {
public:
static const uint32_t kSignMask = 0x80000000;
static const uint32_t kExponentMask = 0x7F800000;
static const uint32_t kSignificandMask = 0x007FFFFF;
@ -306,7 +289,7 @@ public:
uint32_t d32 = AsUint32();
int biased_e =
static_cast<int>((d32 & kExponentMask) >> kPhysicalSignificandSize);
static_cast<int>((d32 & kExponentMask) >> kPhysicalSignificandSize);
return biased_e - kExponentBias;
}
@ -336,13 +319,13 @@ public:
bool IsNan() const {
uint32_t d32 = AsUint32();
return ((d32 & kExponentMask) == kExponentMask) &&
((d32 & kSignificandMask) != 0);
((d32 & kSignificandMask) != 0);
}
bool IsInfinite() const {
uint32_t d32 = AsUint32();
return ((d32 & kExponentMask) == kExponentMask) &&
((d32 & kSignificandMask) == 0);
((d32 & kSignificandMask) == 0);
}
int Sign() const {
@ -390,9 +373,7 @@ public:
return physical_significand_is_zero && (Exponent() != kDenormalExponent);
}
float value() const {
return uint32_to_float(d32_);
}
float value() const { return uint32_to_float(d32_); }
static float Infinity() {
return Single(kInfinity).value();
@ -402,7 +383,7 @@ public:
return Single(kNaN).value();
}
private:
private:
static const int kExponentBias = 0x7F + kPhysicalSignificandSize;
static const int kDenormalExponent = -kExponentBias + 1;
static const int kMaxExponent = 0xFF - kExponentBias;

View File

@ -30,8 +30,7 @@
#include "utils.h"
namespace double_conversion
{
namespace double_conversion {
// The buffer must only contain digits in the range [0-9]. It must not
// contain a dot or a sign. It must not start with '0', and must not be empty.

View File

@ -126,29 +126,25 @@ typedef unsigned __int64 uint64_t;
DISALLOW_COPY_AND_ASSIGN(TypeName)
#endif
namespace double_conversion
{
namespace double_conversion {
static const int kCharSize = sizeof(char);
// Returns the maximum of the two parameters.
template <typename T>
static T Max(T a, T b)
{
static T Max(T a, T b) {
return a < b ? b : a;
}
// Returns the minimum of the two parameters.
template <typename T>
static T Min(T a, T b)
{
static T Min(T a, T b) {
return a < b ? a : b;
}
inline int StrLength(const char* string)
{
inline int StrLength(const char* string) {
size_t length = strlen(string);
ASSERT(length == static_cast<size_t>(static_cast<int>(length)));
return static_cast<int>(length);
@ -156,9 +152,8 @@ inline int StrLength(const char* string)
// This is a simplified version of V8's Vector class.
template <typename T>
class Vector
{
public:
class Vector {
public:
Vector() : start_(NULL), length_(0) {}
Vector(T* data, int length) : start_(data), length_(length) {
ASSERT(length == 0 || (length > 0 && data != NULL));
@ -174,19 +169,13 @@ public:
}
// Returns the length of the vector.
int length() const {
return length_;
}
int length() const { return length_; }
// Returns whether or not the vector is empty.
bool is_empty() const {
return length_ == 0;
}
bool is_empty() const { return length_ == 0; }
// Returns the pointer to the start of the data in the vector.
T* start() const {
return start_;
}
T* start() const { return start_; }
// Access individual vector elements - checks bounds in debug mode.
T& operator[](int index) const {
@ -194,15 +183,11 @@ public:
return start_[index];
}
T& first() {
return start_[0];
}
T& first() { return start_[0]; }
T& last() {
return start_[length_ - 1];
}
T& last() { return start_[length_ - 1]; }
private:
private:
T* start_;
int length_;
};
@ -211,19 +196,14 @@ private:
// Helper class for building result strings in a character buffer. The
// purpose of the class is to use safe operations that checks the
// buffer bounds on all operations in debug mode.
class StringBuilder
{
public:
class StringBuilder {
public:
StringBuilder(char* buffer, int size)
: buffer_(buffer, size), position_(0) { }
: buffer_(buffer, size), position_(0) { }
~StringBuilder() {
if (!is_finalized()) Finalize();
}
~StringBuilder() { if (!is_finalized()) Finalize(); }
int size() const {
return buffer_.length();
}
int size() const { return buffer_.length(); }
// Get the current position in the builder.
int position() const {
@ -232,9 +212,7 @@ public:
}
// Reset the position.
void Reset() {
position_ = 0;
}
void Reset() { position_ = 0; }
// Add a single character to the builder. It is not allowed to add
// 0-characters; use the Finalize() method to terminate the string
@ -284,13 +262,11 @@ public:
return buffer_.start();
}
private:
private:
Vector<char> buffer_;
int position_;
bool is_finalized() const {
return position_ < 0;
}
bool is_finalized() const { return position_ < 0; }
DISALLOW_IMPLICIT_CONSTRUCTORS(StringBuilder);
};
@ -320,11 +296,14 @@ private:
// enough that it can no longer see that you have cast one pointer type to
// another thus avoiding the warning.
template <class Dest, class Source>
inline Dest BitCast(const Source& source)
{
inline Dest BitCast(const Source& source) {
// Compile time assertion: sizeof(Dest) == sizeof(Source)
// A compile error here means your Dest and Source have different sizes.
typedef char VerifySizesAreEqual[sizeof(Dest) == sizeof(Source) ? 1 : -1];
typedef char VerifySizesAreEqual[sizeof(Dest) == sizeof(Source) ? 1 : -1]
#if __GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 8
__attribute__((unused))
#endif
;
Dest dest;
memmove(&dest, &source, sizeof(dest));
@ -332,8 +311,7 @@ inline Dest BitCast(const Source& source)
}
template <class Dest, class Source>
inline Dest BitCast(Source* source)
{
inline Dest BitCast(Source* source) {
return BitCast<Dest>(reinterpret_cast<uintptr_t>(source));
}

View File

@ -116,7 +116,7 @@ std::size_t GuardLarge(std::size_t size) {
// The following operating systems have broken read/write/pread/pwrite that
// only supports up to 2^31.
#if defined(_WIN32) || defined(_WIN64) || defined(__APPLE__) || defined(OS_ANDROID)
return std::min(static_cast<std::size_t>(INT_MAX), size);
return std::min(static_cast<std::size_t>(static_cast<unsigned>(-1)), size);
#else
return size;
#endif
@ -209,7 +209,7 @@ void WriteOrThrow(int fd, const void *data_void, std::size_t size) {
#endif
errno = 0;
do {
ret =
ret =
#if defined(_WIN32) || defined(_WIN64)
_write
#else
@ -229,7 +229,7 @@ void WriteOrThrow(FILE *to, const void *data, std::size_t size) {
}
void FSyncOrThrow(int fd) {
// Apparently windows doesn't have fsync?
// Apparently windows doesn't have fsync?
#if !defined(_WIN32) && !defined(_WIN64)
UTIL_THROW_IF_ARG(-1 == fsync(fd), FDException, (fd), "while syncing");
#endif
@ -248,7 +248,7 @@ template <> struct CheckOffT<8> {
typedef CheckOffT<sizeof(off_t)>::True IgnoredType;
#endif
// Can't we all just get along?
// Can't we all just get along?
void InternalSeek(int fd, int64_t off, int whence) {
if (
#if defined(_WIN32) || defined(_WIN64)
@ -457,9 +457,9 @@ bool TryName(int fd, std::string &out) {
std::ostringstream convert;
convert << fd;
name += convert.str();
struct stat sb;
if (-1 == lstat(name.c_str(), &sb))
if (-1 == lstat(name.c_str(), &sb))
return false;
out.resize(sb.st_size + 1);
ssize_t ret = readlink(name.c_str(), &out[0], sb.st_size + 1);
@ -471,7 +471,7 @@ bool TryName(int fd, std::string &out) {
}
out.resize(ret);
// Don't use the non-file names.
if (!out.empty() && out[0] != '/')
if (!out.empty() && out[0] != '/')
return false;
return true;
#endif

View File

@ -109,9 +109,20 @@ template <class EntryT, class HashT, class EqualT = std::equal_to<typename Entry
if (equal_(got, key)) { out = i; return true; }
if (equal_(got, invalid_)) return false;
if (++i == end_) i = begin_;
}
}
}
// Variant of UnsafeMutableFind for callers that know the key is present.
// Probing starts at bucket hash_(key) % buckets_ and moves forward one entry
// at a time, wrapping from end_ back to begin_, until an entry whose key
// equals `key` is reached; an iterator to that entry is returned.
// Reaching an entry whose key equals invalid_ first trips the assert; when
// asserts are compiled out the loop simply keeps probing.
template <class Key> MutableIterator UnsafeMutableMustFind(const Key key) {
  MutableIterator cursor(begin_ + (hash_(key) % buckets_));
  while (true) {
    Key current(cursor->GetKey());
    if (equal_(current, key)) {
      return cursor;
    }
    // The key is required to exist, so an empty slot means a caller bug.
    assert(!equal_(current, invalid_));
    ++cursor;
    if (cursor == end_) {
      cursor = begin_;  // wrap around to the first bucket
    }
  }
}
template <class Key> bool Find(const Key key, ConstIterator &out) const {
#ifdef DEBUG
assert(initialized_);
@ -124,6 +135,16 @@ template <class EntryT, class HashT, class EqualT = std::equal_to<typename Entry
}
}
// Variant of Find for callers that are sure the key is present.
// Probing starts at bucket hash_(key) % buckets_ and moves forward one entry
// at a time, wrapping from end_ back to begin_, until an entry whose key
// equals `key` is reached; a const iterator to that entry is returned.
// Reaching an entry whose key equals invalid_ first trips the assert; when
// asserts are compiled out the loop simply keeps probing.
template <class Key> ConstIterator MustFind(const Key key) const {
  ConstIterator cursor(begin_ + (hash_(key) % buckets_));
  while (true) {
    Key current(cursor->GetKey());
    if (equal_(current, key)) {
      return cursor;
    }
    // The key is required to exist, so an empty slot means a caller bug.
    assert(!equal_(current, invalid_));
    ++cursor;
    if (cursor == end_) {
      cursor = begin_;  // wrap around to the first bucket
    }
  }
}
void Clear() {
Entry invalid;
invalid.SetKey(invalid_);

View File

@ -6,11 +6,11 @@
/* This is a RandomAccessIterator that uses a proxy to access the underlying
* data. Useful for packing data at bit offsets but still using STL
* algorithms.
* algorithms.
*
* Normally I would use boost::iterator_facade but some people are too lazy to
* install boost and still want to use my language model. It's amazing how
* many operators an iterator has.
* many operators an iterator has.
*
* The Proxy needs to provide:
* class InnerIterator;
@ -22,15 +22,15 @@
* operator<(InnerIterator)
* operator+=(std::ptrdiff_t)
* operator-(InnerIterator)
* and of course whatever Proxy needs to dereference it.
* and of course whatever Proxy needs to dereference it.
*
* It's also a good idea to specialize std::swap for Proxy.
* It's also a good idea to specialize std::swap for Proxy.
*/
namespace util {
template <class Proxy> class ProxyIterator {
private:
// Self.
// Self.
typedef ProxyIterator<Proxy> S;
typedef typename Proxy::InnerIterator InnerIterator;
@ -38,16 +38,21 @@ template <class Proxy> class ProxyIterator {
typedef std::random_access_iterator_tag iterator_category;
typedef typename Proxy::value_type value_type;
typedef std::ptrdiff_t difference_type;
typedef Proxy reference;
typedef Proxy & reference;
typedef Proxy * pointer;
ProxyIterator() {}
// For cast from non const to const.
// For cast from non const to const.
template <class AlternateProxy> ProxyIterator(const ProxyIterator<AlternateProxy> &in) : p_(*in) {}
explicit ProxyIterator(const Proxy &p) : p_(p) {}
// p_'s operator= does value copying, but here we want iterator copying.
// p_'s swap does value swapping, but here we want iterator swapping:
// only what I() returns for each iterator is exchanged, so the two iterators
// trade places without touching the values they refer to.
friend inline void swap(ProxyIterator<Proxy> &first, ProxyIterator<Proxy> &second) {
// Unqualified call on purpose: it resolves to the swap overload found for the
// type returned by I() (presumably Proxy::InnerIterator; I() is defined
// outside this hunk -- TODO confirm).
swap(first.I(), second.I());
}
// p_'s operator= does value copying, but here we want iterator copying.
S &operator=(const S &other) {
I() = other.I();
return *this;
@ -72,8 +77,8 @@ template <class Proxy> class ProxyIterator {
std::ptrdiff_t operator-(const S &other) const { return I() - other.I(); }
Proxy operator*() { return p_; }
const Proxy operator*() const { return p_; }
Proxy &operator*() { return p_; }
const Proxy &operator*() const { return p_; }
Proxy *operator->() { return &p_; }
const Proxy *operator->() const { return &p_; }
Proxy operator[](std::ptrdiff_t amount) const { return *(*this + amount); }

View File

@ -36,6 +36,11 @@ class SizedInnerIterator {
void *Data() { return ptr_; }
std::size_t EntrySize() const { return size_; }
// Exchanges the complete state of two SizedInnerIterators: the entry size and
// the data pointer.  No pointed-to bytes are read or written.
friend inline void swap(SizedInnerIterator &first, SizedInnerIterator &second) {
  // The two members are independent, so the order of the exchanges is irrelevant.
  std::swap(first.size_, second.size_);
  std::swap(first.ptr_, second.ptr_);
}
private:
uint8_t *ptr_;
std::size_t size_;
@ -63,12 +68,22 @@ class SizedProxy {
const void *Data() const { return inner_.Data(); }
void *Data() { return inner_.Data(); }
/**
// TODO: this (deep) swap was recently added. why? if any std heap sort etc
// algs are using swap, that's going to be worse performance than using
// =. i'm not sure why we *want* a deep swap. if C++11 compilers are
// choosing between move constructor and swap, then we'd better implement a
// (deep) move constructor. it may also be that this is moot since i made
// ProxyIterator a reference and added a shallow ProxyIterator swap? (I
// need Ken or someone competent to judge whether that's correct also. -
// let me know at graehl@gmail.com
*/
friend void swap(SizedProxy &first, SizedProxy &second) {
// Value (deep) swap: std::swap_ranges exchanges the entry bytes that the two
// proxies point at, rather than exchanging the proxies' positions.
// The length of the range comes from first's EntrySize() only; this assumes
// second refers to an entry of the same size -- TODO confirm.
std::swap_ranges(
static_cast<char*>(first.inner_.Data()),
static_cast<char*>(first.inner_.Data()) + first.inner_.EntrySize(),
static_cast<char*>(second.inner_.Data()));
static_cast<char*>(first.inner_.Data()),
static_cast<char*>(first.inner_.Data()) + first.inner_.EntrySize(),
static_cast<char*>(second.inner_.Data()));
}
private:
@ -87,7 +102,7 @@ typedef ProxyIterator<SizedProxy> SizedIterator;
// Convenience factory: wraps a raw pointer and an entry size in a SizedProxy
// and returns the SizedIterator built from that proxy.
inline SizedIterator SizedIt(void *ptr, std::size_t size) {
  SizedProxy proxy(ptr, size);
  return SizedIterator(proxy);
}
// Useful wrapper for a comparison function i.e. sort.
// Useful wrapper for a comparison function i.e. sort.
template <class Delegate, class Proxy = SizedProxy> class SizedCompare : public std::binary_function<const Proxy &, const Proxy &, bool> {
public:
explicit SizedCompare(const Delegate &delegate = Delegate()) : delegate_(delegate) {}
@ -106,7 +121,7 @@ template <class Delegate, class Proxy = SizedProxy> class SizedCompare : public
}
const Delegate &GetDelegate() const { return delegate_; }
private:
const Delegate delegate_;
};

View File

@ -122,7 +122,7 @@ class Chain {
threads_.push_back(new Thread(Complete(), kRecycle));
}
Chain &operator>>(const Recycler &recycle) {
Chain &operator>>(const Recycler &) {
// The Recycler argument is left unnamed because its value is never read;
// only its type selects this overload.  The call runs CompleteLoop() and
// returns this Chain so further operator>> calls can be chained.
CompleteLoop();
return *this;
}

View File

@ -3,8 +3,6 @@
#include "util/string_piece.hh"
#include <set>
#include <boost/functional/hash.hpp>
#include <boost/version.hpp>