KenLM 6b4a1c7940a36026de1d96693ccb6ec0f16de8dc

2024-12-26 05:14:36 +03:00 · 2013-06-24 16:05:47 +01:00 · 2013-06-24 16:05:47 +01:00 · 794867c555
commit 794867c555
parent f3cd72537c
23 changed files with 235 additions and 226 deletions
--- a/lm/builder/lmplz_main.cc
+++ b/lm/builder/lmplz_main.cc
@ -33,6 +33,8 @@ int main(int argc, char *argv[]) {
    po::options_description options("Language model building options");
    lm::builder::PipelineConfig pipeline;

+    std::string text, arpa;
+
    options.add_options()
      ("order,o", po::value<std::size_t>(&pipeline.order)
 #if BOOST_VERSION >= 104200
@ -47,18 +49,21 @@ int main(int argc, char *argv[]) {
      ("vocab_estimate", po::value<lm::WordIndex>(&pipeline.vocab_estimate)->default_value(1000000), "Assume this vocabulary size for purposes of calculating memory in step 1 (corpus count) and pre-sizing the hash table")
      ("block_count", po::value<std::size_t>(&pipeline.block_count)->default_value(2), "Block count (per order)")
      ("vocab_file", po::value<std::string>(&pipeline.vocab_file)->default_value(""), "Location to write vocabulary file")
-      ("verbose_header", po::bool_switch(&pipeline.verbose_header), "Add a verbose header to the ARPA file that includes information such as token count, smoothing type, etc.");
+      ("verbose_header", po::bool_switch(&pipeline.verbose_header), "Add a verbose header to the ARPA file that includes information such as token count, smoothing type, etc.")
+      ("text", po::value<std::string>(&text), "Read text from a file instead of stdin")
+      ("arpa", po::value<std::string>(&arpa), "Write ARPA to a file instead of stdout");
    if (argc == 1) {
      std::cerr << 
        "Builds unpruned language models with modified Kneser-Ney smoothing.\n\n"
        "Please cite:\n"
-        "@inproceedings{kenlm,\n"
-        "author    = {Kenneth Heafield},\n"
-        "title     = {{KenLM}: Faster and Smaller Language Model Queries},\n"
-        "booktitle = {Proceedings of the Sixth Workshop on Statistical Machine Translation},\n"
-        "month     = {July}, year={2011},\n"
-        "address   = {Edinburgh, UK},\n"
-        "publisher = {Association for Computational Linguistics},\n"
+        "@inproceedings{Heafield-estimate,\n"
+        "  author = {Kenneth Heafield and Ivan Pouzyrevsky and Jonathan H. Clark and Philipp Koehn},\n"
+        "  title = {Scalable Modified {Kneser-Ney} Language Model Estimation},\n"
+        "  year = {2013},\n"
+        "  month = {8},\n"
+        "  booktitle = {Proceedings of the 51st Annual Meeting of the Association for Computational Linguistics},\n"
+        "  address = {Sofia, Bulgaria},\n"
+        "  url = {http://kheafield.com/professional/edinburgh/estimate\\_paper.pdf},\n"
        "}\n\n"
        "Provide the corpus on stdin.  The ARPA file will be written to stdout.  Order of\n"
        "the model (-o) is the only mandatory option.  As this is an on-disk program,\n"
@ -91,9 +96,17 @@ int main(int argc, char *argv[]) {
    initial.adder_out.block_count = 2;
    pipeline.read_backoffs = initial.adder_out;

+    util::scoped_fd in(0), out(1);
+    if (vm.count("text")) {
+      in.reset(util::OpenReadOrThrow(text.c_str()));
+    }
+    if (vm.count("arpa")) {
+      out.reset(util::CreateOrThrow(arpa.c_str()));
+    }
+
    // Read from stdin
    try {
-      lm::builder::Pipeline(pipeline, 0, 1);
+      lm::builder::Pipeline(pipeline, in.release(), out.release());
    } catch (const util::MallocException &e) {
      std::cerr << e.what() << std::endl;
      std::cerr << "Try rerunning with a more conservative -S setting than " << vm["memory"].as<std::string>() << std::endl;
--- a/lm/builder/ngram.hh
+++ b/lm/builder/ngram.hh
@ -53,7 +53,7 @@ class NGram {
    Payload &Value() { return *reinterpret_cast<Payload *>(end_); }

    uint64_t &Count() { return Value().count; }
-    const uint64_t Count() const { return Value().count; }
+    uint64_t Count() const { return Value().count; }

    std::size_t Order() const { return end_ - begin_; }

--- a/lm/model.cc
+++ b/lm/model.cc
@ -304,5 +304,26 @@ template class GenericModel<trie::TrieSearch<SeparatelyQuantize, trie::DontBhiks
 template class GenericModel<trie::TrieSearch<SeparatelyQuantize, trie::ArrayBhiksha>, SortedVocabulary>;

 } // namespace detail
+
+base::Model *LoadVirtual(const char *file_name, const Config &config, ModelType model_type) {  // Construct the concrete model class selected by model_type and return it as a base::Model pointer.
+  RecognizeBinary(file_name, model_type);  // NOTE(review): presumably overwrites model_type when file_name is a binary model and leaves the caller's value otherwise -- confirm against RecognizeBinary.
+  switch (model_type) {
+    case PROBING:
+      return new ProbingModel(file_name, config);  // Allocated with new and not freed here; presumably the caller deletes it -- TODO confirm.
+    case REST_PROBING:
+      return new RestProbingModel(file_name, config);
+    case TRIE:
+      return new TrieModel(file_name, config);
+    case QUANT_TRIE:
+      return new QuantTrieModel(file_name, config);
+    case ARRAY_TRIE:
+      return new ArrayTrieModel(file_name, config);
+    case QUANT_ARRAY_TRIE:
+      return new QuantArrayTrieModel(file_name, config);
+    default:  // Any model_type without a case above is reported as a format load error.
+      UTIL_THROW(FormatLoadException, "Confused by model type " << model_type);
+  }
+}
+
 } // namespace ngram
 } // namespace lm
--- a/lm/model.hh
+++ b/lm/model.hh
@ -153,6 +153,11 @@ LM_NAME_MODEL(QuantArrayTrieModel, detail::GenericModel<trie::TrieSearch<Separat
 typedef ::lm::ngram::ProbingVocabulary Vocabulary;
 typedef ProbingModel Model;

+/* Autorecognize the file type, load, and return the virtual base class.  Don't
+ * use the virtual base class if you can avoid it.  Instead, use the above
+ * classes as template arguments to your own virtual feature function.*/
+base::Model *LoadVirtual(const char *file_name, const Config &config = Config(), ModelType if_arpa = PROBING);
+
 } // namespace ngram
 } // namespace lm

--- a/lm/search_hashed.cc
+++ b/lm/search_hashed.cc
@ -54,7 +54,7 @@ template <class Weights> class ActivateUnigram {
    Weights *modify_;
 };

-// Find the lower order entry, inserting blanks along the way as necessary.  
+// Find the lower order entry, inserting blanks along the way as necessary.
 template <class Value> void FindLower(
    const std::vector<uint64_t> &keys,
    typename Value::Weights &unigram,
@ -64,7 +64,7 @@ template <class Value> void FindLower(
  typename Value::ProbingEntry entry;
  // Backoff will always be 0.0.  We'll get the probability and rest in another pass.
  entry.value.backoff = kNoExtensionBackoff;
-  // Go back and find the longest right-aligned entry, informing it that it extends left.  Normally this will match immediately, but sometimes SRI is dumb.  
+  // Go back and find the longest right-aligned entry, informing it that it extends left.  Normally this will match immediately, but sometimes SRI is dumb.
  for (int lower = keys.size() - 2; ; --lower) {
    if (lower == -1) {
      between.push_back(&unigram);
@ -77,11 +77,11 @@ template <class Value> void FindLower(
  }
 }

-// Between usually has  single entry, the value to adjust.  But sometimes SRI stupidly pruned entries so it has unitialized blank values to be set here.  
+// Between usually has a single entry, the value to adjust.  But sometimes SRI stupidly pruned entries so it has uninitialized blank values to be set here.
 template <class Added, class Build> void AdjustLower(
    const Added &added,
    const Build &build,
-    std::vector<typename Build::Value::Weights *> &between, 
+    std::vector<typename Build::Value::Weights *> &between,
    const unsigned int n,
    const std::vector<WordIndex> &vocab_ids,
    typename Build::Value::Weights *unigrams,
@ -93,14 +93,14 @@ template <class Added, class Build> void AdjustLower(
  }
  typedef util::ProbingHashTable<typename Value::ProbingEntry, util::IdentityHash> Middle;
  float prob = -fabs(between.back()->prob);
-  // Order of the n-gram on which probabilities are based.  
+  // Order of the n-gram on which probabilities are based.
  unsigned char basis = n - between.size();
  assert(basis != 0);
  typename Build::Value::Weights **change = &between.back();
  // Skip the basis.
  --change;
  if (basis == 1) {
-    // Hallucinate a bigram based on a unigram's backoff and a unigram probability.  
+    // Hallucinate a bigram based on a unigram's backoff and a unigram probability.
    float &backoff = unigrams[vocab_ids[1]].backoff;
    SetExtension(backoff);
    prob += backoff;
@ -128,14 +128,14 @@ template <class Added, class Build> void AdjustLower(
  typename std::vector<typename Value::Weights *>::const_iterator i(between.begin());
  build.MarkExtends(**i, added);
  const typename Value::Weights *longer = *i;
-  // Everything has probability but is not marked as extending.  
+  // Everything has probability but is not marked as extending.
  for (++i; i != between.end(); ++i) {
    build.MarkExtends(**i, *longer);
    longer = *i;
  }
 }

-// Continue marking lower entries even they know that they extend left.  This is used for upper/lower bounds.  
+// Continue marking lower entries even if they know that they extend left.  This is used for upper/lower bounds.
 template <class Build> void MarkLower(
    const std::vector<uint64_t> &keys,
    const Build &build,
@ -144,15 +144,15 @@ template <class Build> void MarkLower(
    int start_order,
    const typename Build::Value::Weights &longer) {
  if (start_order == 0) return;
-  typename util::ProbingHashTable<typename Build::Value::ProbingEntry, util::IdentityHash>::MutableIterator iter;
-  // Hopefully the compiler will realize that if MarkExtends always returns false, it can simplify this code.  
+  // Hopefully the compiler will realize that if MarkExtends always returns false, it can simplify this code.
  for (int even_lower = start_order - 2 /* index in middle */; ; --even_lower) {
    if (even_lower == -1) {
      build.MarkExtends(unigram, longer);
      return;
    }
-    middle[even_lower].UnsafeMutableFind(keys[even_lower], iter);
-    if (!build.MarkExtends(iter->value, longer)) return;
+    if (!build.MarkExtends(
+          middle[even_lower].UnsafeMutableMustFind(keys[even_lower])->value,
+          longer)) return;
  }
 }

@ -168,7 +168,6 @@ template <class Build, class Activate, class Store> void ReadNGrams(
    Store &store,
    PositiveProbWarn &warn) {
  typedef typename Build::Value Value;
-  typedef util::ProbingHashTable<typename Value::ProbingEntry, util::IdentityHash> Middle;
  assert(n >= 2);
  ReadNGramHeader(f, n);

@ -186,7 +185,7 @@ template <class Build, class Activate, class Store> void ReadNGrams(
    for (unsigned int h = 1; h < n - 1; ++h) {
      keys[h] = detail::CombineWordHash(keys[h-1], vocab_ids[h+1]);
    }
-    // Initially the sign bit is on, indicating it does not extend left.  Most already have this but there might +0.0.  
+    // Initially the sign bit is on, indicating it does not extend left.  Most already have this but there might be +0.0.
    util::SetSign(entry.value.prob);
    entry.key = keys[n-2];

@ -203,7 +202,7 @@ template <class Build, class Activate, class Store> void ReadNGrams(

 } // namespace
 namespace detail {
- 
+
 template <class Value> uint8_t *HashedSearch<Value>::SetupMemory(uint8_t *start, const std::vector<uint64_t> &counts, const Config &config) {
  std::size_t allocated = Unigram::Size(counts[0]);
  unigram_ = Unigram(start, counts[0], allocated);
--- a/lm/search_hashed.hh
+++ b/lm/search_hashed.hh
@ -71,7 +71,7 @@ template <class Value> class HashedSearch {
    static const bool kDifferentRest = Value::kDifferentRest;
    static const unsigned int kVersion = 0;

-    // TODO: move probing_multiplier here with next binary file format update.  
+    // TODO: move probing_multiplier here with next binary file format update.
    static void UpdateConfigFromBinary(int, const std::vector<uint64_t> &, Config &) {}

    static uint64_t Size(const std::vector<uint64_t> &counts, const Config &config) {
@ -102,14 +102,9 @@ template <class Value> class HashedSearch {
      return ret;
    }

-#pragma GCC diagnostic ignored "-Wuninitialized"
    MiddlePointer Unpack(uint64_t extend_pointer, unsigned char extend_length, Node &node) const {
      node = extend_pointer;
-      typename Middle::ConstIterator found;
-      bool got = middle_[extend_length - 2].Find(extend_pointer, found);
-      assert(got);
-      (void)got;
-      return MiddlePointer(found->value);
+      return MiddlePointer(middle_[extend_length - 2].MustFind(extend_pointer)->value);
    }

    MiddlePointer LookupMiddle(unsigned char order_minus_2, WordIndex word, Node &node, bool &independent_left, uint64_t &extend_pointer) const {
@ -126,14 +121,14 @@ template <class Value> class HashedSearch {
    }

    LongestPointer LookupLongest(WordIndex word, const Node &node) const {
-      // Sign bit is always on because longest n-grams do not extend left.  
+      // Sign bit is always on because longest n-grams do not extend left.
      typename Longest::ConstIterator found;
      if (!longest_.Find(CombineWordHash(node, word), found)) return LongestPointer();
      return LongestPointer(found->value.prob);
    }

-    // Generate a node without necessarily checking that it actually exists.  
-    // Optionally return false if it's know to not exist.  
+    // Generate a node without necessarily checking that it actually exists.
+    // Optionally return false if it's known to not exist.
    bool FastMakeNode(const WordIndex *begin, const WordIndex *end, Node &node) const {
      assert(begin != end);
      node = static_cast<Node>(*begin);
@ -144,7 +139,7 @@ template <class Value> class HashedSearch {
    }

  private:
-    // Interpret config's rest cost build policy and pass the right template argument to ApplyBuild.  
+    // Interpret config's rest cost build policy and pass the right template argument to ApplyBuild.
    void DispatchBuild(util::FilePiece &f, const std::vector<uint64_t> &counts, const Config &config, const ProbingVocabulary &vocab, PositiveProbWarn &warn);

    template <class Build> void ApplyBuild(util::FilePiece &f, const std::vector<uint64_t> &counts, const ProbingVocabulary &vocab, PositiveProbWarn &warn, const Build &build);
@ -153,7 +148,7 @@ template <class Value> class HashedSearch {
      public:
        Unigram() {}

-        Unigram(void *start, uint64_t count, std::size_t /*allocated*/) : 
+        Unigram(void *start, uint64_t count, std::size_t /*allocated*/) :
          unigram_(static_cast<typename Value::Weights*>(start))
 #ifdef DEBUG
         ,  count_(count)
--- a/lm/virtual_interface.hh
+++ b/lm/virtual_interface.hh
@ -6,6 +6,7 @@
 #include "util/string_piece.hh"

 #include <string>
+#include <string.h>

 namespace lm {
 namespace base {
@ -119,7 +120,9 @@ class Model {

    size_t StateSize() const { return state_size_; }
    const void *BeginSentenceMemory() const { return begin_sentence_memory_; }
+    void BeginSentenceWrite(void *to) const { memcpy(to, begin_sentence_memory_, StateSize()); }
    const void *NullContextMemory() const { return null_context_memory_; }
+    void NullContextWrite(void *to) const { memcpy(to, null_context_memory_, StateSize()); }

    // Requires in_state != out_state
    virtual float Score(const void *in_state, const WordIndex new_word, void *out_state) const = 0;
--- a/util/double-conversion/bignum-dtoa.h
+++ b/util/double-conversion/bignum-dtoa.h
@ -30,8 +30,7 @@

 #include "utils.h"

-namespace double_conversion
-{
+namespace double_conversion {

 enum BignumDtoaMode {
  // Return the shortest correct representation.
--- a/util/double-conversion/bignum.h
+++ b/util/double-conversion/bignum.h
@ -30,12 +30,10 @@

 #include "utils.h"

-namespace double_conversion
-{
+namespace double_conversion {

-class Bignum
-{
-public:
+class Bignum {
+ public:
  // 3584 = 128 * 28. We can represent 2^3584 > 10^1000 accurately.
  // This bignum can encode much bigger numbers, since it contains an
  // exponent.
@ -62,9 +60,7 @@ public:
  void MultiplyByUInt32(uint32_t factor);
  void MultiplyByUInt64(uint64_t factor);
  void MultiplyByPowerOfTen(int exponent);
-  void Times10() {
-    return MultiplyByUInt32(10);
-  }
+  void Times10() { return MultiplyByUInt32(10); }
  // Pseudocode:
  //  int result = this / other;
  //  this = this % other;
@ -101,7 +97,7 @@ public:
  static bool PlusLess(const Bignum& a, const Bignum& b, const Bignum& c) {
    return PlusCompare(a, b, c) < 0;
  }
-private:
+ private:
  typedef uint32_t Chunk;
  typedef uint64_t DoubleChunk;

@ -129,9 +125,7 @@ private:
  // shift_amount must be < kBigitSize.
  void BigitsShiftLeft(int shift_amount);
  // BigitLength includes the "hidden" digits encoded in the exponent.
-  int BigitLength() const {
-    return used_digits_ + exponent_;
-  }
+  int BigitLength() const { return used_digits_ + exponent_; }
  Chunk BigitAt(int index) const;
  void SubtractTimes(const Bignum& other, int factor);

--- a/util/double-conversion/cached-powers.h
+++ b/util/double-conversion/cached-powers.h
@ -30,12 +30,10 @@

 #include "diy-fp.h"

-namespace double_conversion
-{
+namespace double_conversion {

-class PowersOfTenCache
-{
-public:
+class PowersOfTenCache {
+ public:

  // Not all powers of ten are cached. The decimal exponent of two neighboring
  // cached numbers will differ by kDecimalExponentDistance.
@ -47,9 +45,9 @@ public:
  // Returns a cached power-of-ten with a binary exponent in the range
  // [min_exponent; max_exponent] (boundaries included).
  static void GetCachedPowerForBinaryExponentRange(int min_exponent,
-      int max_exponent,
-      DiyFp* power,
-      int* decimal_exponent);
+                                                   int max_exponent,
+                                                   DiyFp* power,
+                                                   int* decimal_exponent);

  // Returns a cached power of ten x ~= 10^k such that
  //   k <= decimal_exponent < k + kCachedPowersDecimalDistance.
@ -57,8 +55,8 @@ public:
  //   kMinDecimalExponent <= requested_exponent, and
  //   requested_exponent < kMaxDecimalExponent + kDecimalExponentDistance.
  static void GetCachedPowerForDecimalExponent(int requested_exponent,
-      DiyFp* power,
-      int* found_exponent);
+                                               DiyFp* power,
+                                               int* found_exponent);
 };

 }  // namespace double_conversion
--- a/util/double-conversion/diy-fp.h
+++ b/util/double-conversion/diy-fp.h
@ -30,17 +30,15 @@

 #include "utils.h"

-namespace double_conversion
-{
+namespace double_conversion {

 // This "Do It Yourself Floating Point" class implements a floating-point number
 // with a uint64 significand and an int exponent. Normalized DiyFp numbers will
 // have the most significant bit of the significand set.
 // Multiplication and Subtraction do not normalize their results.
 // DiyFp are not designed to contain special doubles (NaN and Infinity).
-class DiyFp
-{
-public:
+class DiyFp {
+ public:
  static const int kSignificandSize = 64;

  DiyFp() : f_(0), e_(0) {}
@ -102,21 +100,13 @@ public:
    return result;
  }

-  uint64_t f() const {
-    return f_;
-  }
-  int e() const {
-    return e_;
-  }
+  uint64_t f() const { return f_; }
+  int e() const { return e_; }

-  void set_f(uint64_t new_value) {
-    f_ = new_value;
-  }
-  void set_e(int new_value) {
-    e_ = new_value;
-  }
+  void set_f(uint64_t new_value) { f_ = new_value; }
+  void set_e(int new_value) { e_ = new_value; }

-private:
+ private:
  static const uint64_t kUint64MSB = UINT64_2PART_C(0x80000000, 00000000);

  uint64_t f_;
--- a/util/double-conversion/double-conversion.h
+++ b/util/double-conversion/double-conversion.h
@ -30,12 +30,10 @@

 #include "utils.h"

-namespace double_conversion
-{
+namespace double_conversion {

-class DoubleToStringConverter
-{
-public:
+class DoubleToStringConverter {
+ public:
  // When calling ToFixed with a double > 10^kMaxFixedDigitsBeforePoint
  // or a requested_digits parameter > kMaxFixedDigitsAfterPoint then the
  // function returns false.
@ -114,20 +112,20 @@ public:
                          int decimal_in_shortest_high,
                          int max_leading_padding_zeroes_in_precision_mode,
                          int max_trailing_padding_zeroes_in_precision_mode)
-    : flags_(flags),
-      infinity_symbol_(infinity_symbol),
-      nan_symbol_(nan_symbol),
-      exponent_character_(exponent_character),
-      decimal_in_shortest_low_(decimal_in_shortest_low),
-      decimal_in_shortest_high_(decimal_in_shortest_high),
-      max_leading_padding_zeroes_in_precision_mode_(
-        max_leading_padding_zeroes_in_precision_mode),
-      max_trailing_padding_zeroes_in_precision_mode_(
-        max_trailing_padding_zeroes_in_precision_mode) {
+      : flags_(flags),
+        infinity_symbol_(infinity_symbol),
+        nan_symbol_(nan_symbol),
+        exponent_character_(exponent_character),
+        decimal_in_shortest_low_(decimal_in_shortest_low),
+        decimal_in_shortest_high_(decimal_in_shortest_high),
+        max_leading_padding_zeroes_in_precision_mode_(
+            max_leading_padding_zeroes_in_precision_mode),
+        max_trailing_padding_zeroes_in_precision_mode_(
+            max_trailing_padding_zeroes_in_precision_mode) {
    // When 'trailing zero after the point' is set, then 'trailing point'
    // must be set too.
    ASSERT(((flags & EMIT_TRAILING_DECIMAL_POINT) != 0) ||
-           !((flags & EMIT_TRAILING_ZERO_AFTER_POINT) != 0));
+        !((flags & EMIT_TRAILING_ZERO_AFTER_POINT) != 0));
  }

  // Returns a converter following the EcmaScript specification.
@ -343,7 +341,7 @@ public:
                            int* length,
                            int* point);

-private:
+ private:
  // Implementation for ToShortest and ToShortestSingle.
  bool ToShortestIeeeNumber(double value,
                            StringBuilder* result_builder,
@ -380,9 +378,8 @@ private:
 };


-class StringToDoubleConverter
-{
-public:
+class StringToDoubleConverter {
+ public:
  // Enumeration for allowing octals and ignoring junk when converting
  // strings to numbers.
  enum Flags {
@ -491,11 +488,11 @@ public:
                          double junk_string_value,
                          const char* infinity_symbol,
                          const char* nan_symbol)
-    : flags_(flags),
-      empty_string_value_(empty_string_value),
-      junk_string_value_(junk_string_value),
-      infinity_symbol_(infinity_symbol),
-      nan_symbol_(nan_symbol) {
+      : flags_(flags),
+        empty_string_value_(empty_string_value),
+        junk_string_value_(junk_string_value),
+        infinity_symbol_(infinity_symbol),
+        nan_symbol_(nan_symbol) {
  }

  // Performs the conversion.
@ -519,7 +516,7 @@ public:
                                           processed_characters_count, false));
  }

-private:
+ private:
  const int flags_;
  const double empty_string_value_;
  const double junk_string_value_;
--- a/util/double-conversion/fast-dtoa.h
+++ b/util/double-conversion/fast-dtoa.h
@ -30,8 +30,7 @@

 #include "utils.h"

-namespace double_conversion
-{
+namespace double_conversion {

 enum FastDtoaMode {
  // Computes the shortest representation of the given input. The returned
--- a/util/double-conversion/fixed-dtoa.h
+++ b/util/double-conversion/fixed-dtoa.h
@ -30,8 +30,7 @@

 #include "utils.h"

-namespace double_conversion
-{
+namespace double_conversion {

 // Produces digits necessary to print a given number with
 // 'fractional_count' digits after the decimal point.
--- a/util/double-conversion/ieee.h
+++ b/util/double-conversion/ieee.h
@ -30,31 +30,17 @@

 #include "diy-fp.h"

-namespace double_conversion
-{
+namespace double_conversion {

 // We assume that doubles and uint64_t have the same endianness.
-static uint64_t double_to_uint64(double d)
-{
-  return BitCast<uint64_t>(d);
-}
-static double uint64_to_double(uint64_t d64)
-{
-  return BitCast<double>(d64);
-}
-static uint32_t float_to_uint32(float f)
-{
-  return BitCast<uint32_t>(f);
-}
-static float uint32_to_float(uint32_t d32)
-{
-  return BitCast<float>(d32);
-}
+static uint64_t double_to_uint64(double d) { return BitCast<uint64_t>(d); }
+static double uint64_to_double(uint64_t d64) { return BitCast<double>(d64); }
+static uint32_t float_to_uint32(float f) { return BitCast<uint32_t>(f); }
+static float uint32_to_float(uint32_t d32) { return BitCast<float>(d32); }

 // Helper functions for doubles.
-class Double
-{
-public:
+class Double {
+ public:
  static const uint64_t kSignMask = UINT64_2PART_C(0x80000000, 00000000);
  static const uint64_t kExponentMask = UINT64_2PART_C(0x7FF00000, 00000000);
  static const uint64_t kSignificandMask = UINT64_2PART_C(0x000FFFFF, FFFFFFFF);
@ -127,7 +113,7 @@ public:

    uint64_t d64 = AsUint64();
    int biased_e =
-      static_cast<int>((d64 & kExponentMask) >> kPhysicalSignificandSize);
+        static_cast<int>((d64 & kExponentMask) >> kPhysicalSignificandSize);
    return biased_e - kExponentBias;
  }

@ -157,13 +143,13 @@ public:
  bool IsNan() const {
    uint64_t d64 = AsUint64();
    return ((d64 & kExponentMask) == kExponentMask) &&
-           ((d64 & kSignificandMask) != 0);
+        ((d64 & kSignificandMask) != 0);
  }

  bool IsInfinite() const {
    uint64_t d64 = AsUint64();
    return ((d64 & kExponentMask) == kExponentMask) &&
-           ((d64 & kSignificandMask) == 0);
+        ((d64 & kSignificandMask) == 0);
  }

  int Sign() const {
@ -211,9 +197,7 @@ public:
    return physical_significand_is_zero && (Exponent() != kDenormalExponent);
  }

-  double value() const {
-    return uint64_to_double(d64_);
-  }
+  double value() const { return uint64_to_double(d64_); }

  // Returns the significand size for a given order of magnitude.
  // If v = f*2^e with 2^p-1 <= f <= 2^p then p+e is v's order of magnitude.
@ -237,7 +221,7 @@ public:
    return Double(kNaN).value();
  }

-private:
+ private:
  static const int kExponentBias = 0x3FF + kPhysicalSignificandSize;
  static const int kDenormalExponent = -kExponentBias + 1;
  static const int kMaxExponent = 0x7FF - kExponentBias;
@ -270,13 +254,12 @@ private:
      biased_exponent = static_cast<uint64_t>(exponent + kExponentBias);
    }
    return (significand & kSignificandMask) |
-           (biased_exponent << kPhysicalSignificandSize);
+        (biased_exponent << kPhysicalSignificandSize);
  }
 };

-class Single
-{
-public:
+class Single {
+ public:
  static const uint32_t kSignMask = 0x80000000;
  static const uint32_t kExponentMask = 0x7F800000;
  static const uint32_t kSignificandMask = 0x007FFFFF;
@ -306,7 +289,7 @@ public:

    uint32_t d32 = AsUint32();
    int biased_e =
-      static_cast<int>((d32 & kExponentMask) >> kPhysicalSignificandSize);
+        static_cast<int>((d32 & kExponentMask) >> kPhysicalSignificandSize);
    return biased_e - kExponentBias;
  }

@ -336,13 +319,13 @@ public:
  bool IsNan() const {
    uint32_t d32 = AsUint32();
    return ((d32 & kExponentMask) == kExponentMask) &&
-           ((d32 & kSignificandMask) != 0);
+        ((d32 & kSignificandMask) != 0);
  }

  bool IsInfinite() const {
    uint32_t d32 = AsUint32();
    return ((d32 & kExponentMask) == kExponentMask) &&
-           ((d32 & kSignificandMask) == 0);
+        ((d32 & kSignificandMask) == 0);
  }

  int Sign() const {
@ -390,9 +373,7 @@ public:
    return physical_significand_is_zero && (Exponent() != kDenormalExponent);
  }

-  float value() const {
-    return uint32_to_float(d32_);
-  }
+  float value() const { return uint32_to_float(d32_); }

  static float Infinity() {
    return Single(kInfinity).value();
@ -402,7 +383,7 @@ public:
    return Single(kNaN).value();
  }

-private:
+ private:
  static const int kExponentBias = 0x7F + kPhysicalSignificandSize;
  static const int kDenormalExponent = -kExponentBias + 1;
  static const int kMaxExponent = 0xFF - kExponentBias;
--- a/util/double-conversion/strtod.h
+++ b/util/double-conversion/strtod.h
@ -30,8 +30,7 @@

 #include "utils.h"

-namespace double_conversion
-{
+namespace double_conversion {

 // The buffer must only contain digits in the range [0-9]. It must not
 // contain a dot or a sign. It must not start with '0', and must not be empty.
--- a/util/double-conversion/utils.h
+++ b/util/double-conversion/utils.h
@ -126,29 +126,25 @@ typedef unsigned __int64 uint64_t;
  DISALLOW_COPY_AND_ASSIGN(TypeName)
 #endif

-namespace double_conversion
-{
+namespace double_conversion {

 static const int kCharSize = sizeof(char);

 // Returns the maximum of the two parameters.
 template <typename T>
-static T Max(T a, T b)
-{
+static T Max(T a, T b) {
  return a < b ? b : a;
 }


 // Returns the minimum of the two parameters.
 template <typename T>
-static T Min(T a, T b)
-{
+static T Min(T a, T b) {
  return a < b ? a : b;
 }


-inline int StrLength(const char* string)
-{
+inline int StrLength(const char* string) {
  size_t length = strlen(string);
  ASSERT(length == static_cast<size_t>(static_cast<int>(length)));
  return static_cast<int>(length);
@ -156,9 +152,8 @@ inline int StrLength(const char* string)

 // This is a simplified version of V8's Vector class.
 template <typename T>
-class Vector
-{
-public:
+class Vector {
+ public:
  Vector() : start_(NULL), length_(0) {}
  Vector(T* data, int length) : start_(data), length_(length) {
    ASSERT(length == 0 || (length > 0 && data != NULL));
@ -174,19 +169,13 @@ public:
  }

  // Returns the length of the vector.
-  int length() const {
-    return length_;
-  }
+  int length() const { return length_; }

  // Returns whether or not the vector is empty.
-  bool is_empty() const {
-    return length_ == 0;
-  }
+  bool is_empty() const { return length_ == 0; }

  // Returns the pointer to the start of the data in the vector.
-  T* start() const {
-    return start_;
-  }
+  T* start() const { return start_; }

  // Access individual vector elements - checks bounds in debug mode.
  T& operator[](int index) const {
@ -194,15 +183,11 @@ public:
    return start_[index];
  }

-  T& first() {
-    return start_[0];
-  }
+  T& first() { return start_[0]; }

-  T& last() {
-    return start_[length_ - 1];
-  }
+  T& last() { return start_[length_ - 1]; }

-private:
+ private:
  T* start_;
  int length_;
 };
@ -211,19 +196,14 @@ private:
 // Helper class for building result strings in a character buffer. The
 // purpose of the class is to use safe operations that checks the
 // buffer bounds on all operations in debug mode.
-class StringBuilder
-{
-public:
+class StringBuilder {
+ public:
  StringBuilder(char* buffer, int size)
-    : buffer_(buffer, size), position_(0) { }
+      : buffer_(buffer, size), position_(0) { }

-  ~StringBuilder() {
-    if (!is_finalized()) Finalize();
-  }
+  ~StringBuilder() { if (!is_finalized()) Finalize(); }

-  int size() const {
-    return buffer_.length();
-  }
+  int size() const { return buffer_.length(); }

  // Get the current position in the builder.
  int position() const {
@ -232,9 +212,7 @@ public:
  }

  // Reset the position.
-  void Reset() {
-    position_ = 0;
-  }
+  void Reset() { position_ = 0; }

  // Add a single character to the builder. It is not allowed to add
  // 0-characters; use the Finalize() method to terminate the string
@ -284,13 +262,11 @@ public:
    return buffer_.start();
  }

-private:
+ private:
  Vector<char> buffer_;
  int position_;

-  bool is_finalized() const {
-    return position_ < 0;
-  }
+  bool is_finalized() const { return position_ < 0; }

  DISALLOW_IMPLICIT_CONSTRUCTORS(StringBuilder);
 };
@ -320,11 +296,14 @@ private:
 // enough that it can no longer see that you have cast one pointer type to
 // another thus avoiding the warning.
 template <class Dest, class Source>
-inline Dest BitCast(const Source& source)
-{
+inline Dest BitCast(const Source& source) {
  // Compile time assertion: sizeof(Dest) == sizeof(Source)
  // A compile error here means your Dest and Source have different sizes.
-  typedef char VerifySizesAreEqual[sizeof(Dest) == sizeof(Source) ? 1 : -1];
+  typedef char VerifySizesAreEqual[sizeof(Dest) == sizeof(Source) ? 1 : -1]
+#if __GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 8
+      __attribute__((unused))
+#endif
+      ;

  Dest dest;
  memmove(&dest, &source, sizeof(dest));
@@ -332,8 +311,7 @@ inline Dest BitCast(const Source& source)
 }

 template <class Dest, class Source>
-inline Dest BitCast(Source* source)
-{
+inline Dest BitCast(Source* source) {
  return BitCast<Dest>(reinterpret_cast<uintptr_t>(source));
 }

--- a/util/file.cc
+++ b/util/file.cc
@@ -116,7 +116,7 @@ std::size_t GuardLarge(std::size_t size) {
  // The following operating systems have broken read/write/pread/pwrite that
  // only supports up to 2^31.
 #if defined(_WIN32) || defined(_WIN64) || defined(__APPLE__) || defined(OS_ANDROID)
-  return std::min(static_cast<std::size_t>(INT_MAX), size);
+  return std::min(static_cast<std::size_t>(static_cast<unsigned>(-1)), size);
 #else
  return size;
 #endif
@@ -209,7 +209,7 @@ void WriteOrThrow(int fd, const void *data_void, std::size_t size) {
 #endif
    errno = 0;
    do {
-      ret = 
+      ret =
 #if defined(_WIN32) || defined(_WIN64)
        _write
 #else
@@ -229,7 +229,7 @@ void WriteOrThrow(FILE *to, const void *data, std::size_t size) {
 }

 void FSyncOrThrow(int fd) {
-// Apparently windows doesn't have fsync?  
+// Apparently windows doesn't have fsync?
 #if !defined(_WIN32) && !defined(_WIN64)
  UTIL_THROW_IF_ARG(-1 == fsync(fd), FDException, (fd), "while syncing");
 #endif
@@ -248,7 +248,7 @@ template <> struct CheckOffT<8> {
 typedef CheckOffT<sizeof(off_t)>::True IgnoredType;
 #endif

-// Can't we all just get along?  
+// Can't we all just get along?
 void InternalSeek(int fd, int64_t off, int whence) {
  if (
 #if defined(_WIN32) || defined(_WIN64)
@@ -457,9 +457,9 @@ bool TryName(int fd, std::string &out) {
  std::ostringstream convert;
  convert << fd;
  name += convert.str();
-  
+
  struct stat sb;
-  if (-1 == lstat(name.c_str(), &sb)) 
+  if (-1 == lstat(name.c_str(), &sb))
    return false;
  out.resize(sb.st_size + 1);
  ssize_t ret = readlink(name.c_str(), &out[0], sb.st_size + 1);
@@ -471,7 +471,7 @@ bool TryName(int fd, std::string &out) {
  }
  out.resize(ret);
  // Don't use the non-file names.
-  if (!out.empty() && out[0] != '/') 
+  if (!out.empty() && out[0] != '/')
    return false;
  return true;
 #endif
--- a/util/probing_hash_table.hh
+++ b/util/probing_hash_table.hh
@@ -109,9 +109,20 @@ template <class EntryT, class HashT, class EqualT = std::equal_to<typename Entry
        if (equal_(got, key)) { out = i; return true; }
        if (equal_(got, invalid_)) return false;
        if (++i == end_) i = begin_;
-      }   
+      }
    }

+    // Like UnsafeMutableFind, but the key must be there.
+    template <class Key> MutableIterator UnsafeMutableMustFind(const Key key) {
+       for (MutableIterator i(begin_ + (hash_(key) % buckets_));;) {
+        Key got(i->GetKey());
+        if (equal_(got, key)) { return i; }
+        assert(!equal_(got, invalid_));
+        if (++i == end_) i = begin_;
+      }
+    }
+
+
    template <class Key> bool Find(const Key key, ConstIterator &out) const {
 #ifdef DEBUG
      assert(initialized_);
@@ -124,6 +135,16 @@ template <class EntryT, class HashT, class EqualT = std::equal_to<typename Entry
      }    
    }

+    // Like Find but we're sure it must be there.
+    template <class Key> ConstIterator MustFind(const Key key) const {
+      for (ConstIterator i(begin_ + (hash_(key) % buckets_));;) {
+        Key got(i->GetKey());
+        if (equal_(got, key)) { return i; }
+        assert(!equal_(got, invalid_));
+        if (++i == end_) i = begin_;
+      }
+    }
+
    void Clear() {
      Entry invalid;
      invalid.SetKey(invalid_);
--- a/util/proxy_iterator.hh
+++ b/util/proxy_iterator.hh
@@ -6,11 +6,11 @@

 /* This is a RandomAccessIterator that uses a proxy to access the underlying
 * data.  Useful for packing data at bit offsets but still using STL
- * algorithms.  
+ * algorithms.
 *
 * Normally I would use boost::iterator_facade but some people are too lazy to
 * install boost and still want to use my language model.  It's amazing how
- * many operators an iterator has. 
+ * many operators an iterator has.
 *
 * The Proxy needs to provide:
 *   class InnerIterator;
@@ -22,15 +22,15 @@
 *   operator<(InnerIterator)
 *   operator+=(std::ptrdiff_t)
 *   operator-(InnerIterator)
- * and of course whatever Proxy needs to dereference it.  
+ * and of course whatever Proxy needs to dereference it.
 *
- * It's also a good idea to specialize std::swap for Proxy.  
+ * It's also a good idea to specialize std::swap for Proxy.
 */

 namespace util {
 template <class Proxy> class ProxyIterator {
  private:
-    // Self.  
+    // Self.
    typedef ProxyIterator<Proxy> S;
    typedef typename Proxy::InnerIterator InnerIterator;

@@ -38,16 +38,21 @@ template <class Proxy> class ProxyIterator {
    typedef std::random_access_iterator_tag iterator_category;
    typedef typename Proxy::value_type value_type;
    typedef std::ptrdiff_t difference_type;
-    typedef Proxy reference;
+    typedef Proxy & reference;
    typedef Proxy * pointer;

    ProxyIterator() {}

-    // For cast from non const to const.  
+    // For cast from non const to const.
    template <class AlternateProxy> ProxyIterator(const ProxyIterator<AlternateProxy> &in) : p_(*in) {}
    explicit ProxyIterator(const Proxy &p) : p_(p) {}

-    // p_'s operator= does value copying, but here we want iterator copying.  
+    // p_'s swap does value swapping, but here we want iterator swapping
+    friend inline void swap(ProxyIterator<Proxy> &first, ProxyIterator<Proxy> &second) {
+      swap(first.I(), second.I());
+    }
+
+    // p_'s operator= does value copying, but here we want iterator copying.
    S &operator=(const S &other) {
      I() = other.I();
      return *this;
@@ -72,8 +77,8 @@ template <class Proxy> class ProxyIterator {

    std::ptrdiff_t operator-(const S &other) const { return I() - other.I(); }

-    Proxy operator*() { return p_; }
-    const Proxy operator*() const { return p_; }
+    Proxy &operator*() { return p_; }
+    const Proxy &operator*() const { return p_; }
    Proxy *operator->() { return &p_; }
    const Proxy *operator->() const { return &p_; }
    Proxy operator[](std::ptrdiff_t amount) const { return *(*this + amount); }
--- a/util/sized_iterator.hh
+++ b/util/sized_iterator.hh
@@ -36,6 +36,11 @@ class SizedInnerIterator {
    void *Data() { return ptr_; }
    std::size_t EntrySize() const { return size_; }

+    friend inline void swap(SizedInnerIterator &first, SizedInnerIterator &second) {
+      std::swap(first.ptr_, second.ptr_);
+      std::swap(first.size_, second.size_);
+    }
+
  private:
    uint8_t *ptr_;
    std::size_t size_;
@@ -63,12 +68,22 @@ class SizedProxy {

    const void *Data() const { return inner_.Data(); }
    void *Data() { return inner_.Data(); }
-    
+
+  /**
+     // TODO: this (deep) swap was recently added. why? if any std heap sort etc
+     // algs are using swap, that's going to be worse performance than using
+     // =. i'm not sure why we *want* a deep swap. if C++11 compilers are
+     // choosing between move constructor and swap, then we'd better implement a
+     // (deep) move constructor. it may also be that this is moot since i made
+     // ProxyIterator a reference and added a shallow ProxyIterator swap? (I
+     // need Ken or someone competent to judge whether that's correct also. -
+     // let me know at graehl@gmail.com
+  */
    friend void swap(SizedProxy &first, SizedProxy &second) {
      std::swap_ranges(
-        static_cast<char*>(first.inner_.Data()), 
-        static_cast<char*>(first.inner_.Data()) + first.inner_.EntrySize(),
-        static_cast<char*>(second.inner_.Data()));
+          static_cast<char*>(first.inner_.Data()),
+          static_cast<char*>(first.inner_.Data()) + first.inner_.EntrySize(),
+          static_cast<char*>(second.inner_.Data()));
    }

  private:
@@ -87,7 +102,7 @@ typedef ProxyIterator<SizedProxy> SizedIterator;

 inline SizedIterator SizedIt(void *ptr, std::size_t size) { return SizedIterator(SizedProxy(ptr, size)); }

-// Useful wrapper for a comparison function i.e. sort.  
+// Useful wrapper for a comparison function i.e. sort.
 template <class Delegate, class Proxy = SizedProxy> class SizedCompare : public std::binary_function<const Proxy &, const Proxy &, bool> {
  public:
    explicit SizedCompare(const Delegate &delegate = Delegate()) : delegate_(delegate) {}
@@ -106,7 +121,7 @@ template <class Delegate, class Proxy = SizedProxy> class SizedCompare : public
    }

    const Delegate &GetDelegate() const { return delegate_; }
-    
+
  private:
    const Delegate delegate_;
 };
--- a/util/stream/chain.hh
+++ b/util/stream/chain.hh
@@ -122,7 +122,7 @@ class Chain {
      threads_.push_back(new Thread(Complete(), kRecycle));
    }

-    Chain &operator>>(const Recycler &recycle) {
+    Chain &operator>>(const Recycler &) {
      CompleteLoop();
      return *this;
    }
--- a/util/string_piece_hash.hh
+++ b/util/string_piece_hash.hh
@@ -3,8 +3,6 @@

 #include "util/string_piece.hh"

-#include <set>
-
 #include <boost/functional/hash.hpp>
 #include <boost/version.hpp>