diff --git a/lm/binary_format.cc b/lm/binary_format.cc
index 2b34a778a..802943f57 100644
--- a/lm/binary_format.cc
+++ b/lm/binary_format.cc
@@ -169,8 +169,7 @@ void *BinaryFormat::SetupJustVocab(std::size_t memory_size, uint8_t order) {
   vocab_size_ = memory_size;
   if (!write_mmap_) {
     header_size_ = 0;
-    util::MapAnonymous(memory_size, memory_vocab_);
-    util::AdviseHugePages(memory_vocab_.get(), memory_size);
+    util::HugeMalloc(memory_size, true, memory_vocab_);
     return reinterpret_cast<uint8_t*>(memory_vocab_.get());
   }
   header_size_ = TotalHeaderSize(order);
@@ -181,16 +180,16 @@ void *BinaryFormat::SetupJustVocab(std::size_t memory_size, uint8_t order) {
   switch (write_method_) {
     case Config::WRITE_MMAP:
       mapping_.reset(util::MapZeroedWrite(file_.get(), total), total, util::scoped_memory::MMAP_ALLOCATED);
+      util::AdviseHugePages(vocab_base, total);
       vocab_base = mapping_.get();
       break;
     case Config::WRITE_AFTER:
       util::ResizeOrThrow(file_.get(), 0);
-      util::MapAnonymous(total, memory_vocab_);
+      util::HugeMalloc(total, true, memory_vocab_);
       vocab_base = memory_vocab_.get();
       break;
   }
   strncpy(reinterpret_cast<char*>(vocab_base), kMagicIncomplete, header_size_);
-  util::AdviseHugePages(vocab_base, total);
   return reinterpret_cast<uint8_t*>(vocab_base) + header_size_;
 }
 
@@ -200,7 +199,7 @@ void *BinaryFormat::GrowForSearch(std::size_t memory_size, std::size_t vocab_pad
   std::size_t new_size = header_size_ + vocab_size_ + vocab_pad_ + memory_size;
   vocab_string_offset_ = new_size;
   if (!write_mmap_ || write_method_ == Config::WRITE_AFTER) {
-    util::MapAnonymous(memory_size, memory_search_);
+    util::HugeMalloc(memory_size, true, memory_search_);
     assert(header_size_ == 0 || write_mmap_);
     vocab_base = reinterpret_cast<uint8_t*>(memory_vocab_.get()) + header_size_;
     util::AdviseHugePages(memory_search_.get(), memory_size);
diff --git a/lm/builder/corpus_count.cc b/lm/builder/corpus_count.cc
index 04815d805..0414c2261 100644
--- a/lm/builder/corpus_count.cc
+++ b/lm/builder/corpus_count.cc
@@ -5,7 +5,7 @@
 #include "lm/lm_exception.hh"
 #include "lm/vocab.hh"
 #include "lm/word_index.hh"
-#include "util/fake_ofstream.hh"
+#include "util/file_stream.hh"
 #include "util/file.hh"
 #include "util/file_piece.hh"
 #include "util/murmur_hash.hh"
diff --git a/lm/builder/debug_print.hh b/lm/builder/debug_print.hh
index 193a6892c..4b9f306d8 100644
--- a/lm/builder/debug_print.hh
+++ b/lm/builder/debug_print.hh
@@ -4,21 +4,21 @@
 #include "lm/builder/payload.hh"
 #include "lm/common/print.hh"
 #include "lm/common/ngram_stream.hh"
-#include "util/fake_ofstream.hh"
+#include "util/file_stream.hh"
 #include "util/file.hh"
 
 #include <boost/lexical_cast.hpp>
 
 namespace lm { namespace builder {
 // Not defined, only specialized.
-template <class T> void PrintPayload(util::FakeOFStream &to, const BuildingPayload &payload);
-template <> inline void PrintPayload<uint64_t>(util::FakeOFStream &to, const BuildingPayload &payload) {
+template <class T> void PrintPayload(util::FileStream &to, const BuildingPayload &payload);
+template <> inline void PrintPayload<uint64_t>(util::FileStream &to, const BuildingPayload &payload) {
   to << payload.count;
 }
-template <> inline void PrintPayload<Uninterpolated>(util::FakeOFStream &to, const BuildingPayload &payload) {
+template <> inline void PrintPayload<Uninterpolated>(util::FileStream &to, const BuildingPayload &payload) {
   to << log10(payload.uninterp.prob) << ' ' << log10(payload.uninterp.gamma);
 }
-template <> inline void PrintPayload<ProbBackoff>(util::FakeOFStream &to, const BuildingPayload &payload) {
+template <> inline void PrintPayload<ProbBackoff>(util::FileStream &to, const BuildingPayload &payload) {
   to << payload.complete.prob << ' ' << payload.complete.backoff;
 }
 
@@ -36,7 +36,7 @@ template <class V> class Print {
 
     void Run(const util::stream::ChainPositions &chains) {
       util::scoped_fd fd(to_);
-      util::FakeOFStream out(to_);
+      util::FileStream out(to_);
       NGramStreams<BuildingPayload> streams(chains);
       for (NGramStream<BuildingPayload> *s = streams.begin(); s != streams.end(); ++s) {
         DumpStream(*s, out);
@@ -45,13 +45,13 @@ template <class V> class Print {
 
     void Run(const util::stream::ChainPosition &position) {
       util::scoped_fd fd(to_);
-      util::FakeOFStream out(to_);
+      util::FileStream out(to_);
       NGramStream<BuildingPayload> stream(position);
       DumpStream(stream, out);
     }
 
   private:
-    void DumpStream(NGramStream<BuildingPayload> &stream, util::FakeOFStream &to) {
+    void DumpStream(NGramStream<BuildingPayload> &stream, util::FileStream &to) {
       for (; stream; ++stream) {
         PrintPayload<V>(to, stream->Value());
         for (const WordIndex *w = stream->begin(); w != stream->end(); ++w) {
diff --git a/lm/builder/dump_counts_main.cc b/lm/builder/dump_counts_main.cc
index a4c9478b6..26078d0e7 100644
--- a/lm/builder/dump_counts_main.cc
+++ b/lm/builder/dump_counts_main.cc
@@ -30,7 +30,7 @@ int main(int argc, char *argv[]) {
       UTIL_THROW_IF(*i >= vocab.Size(), util::Exception, "Vocab ID " << *i << " is larger than the vocab file's maximum of " << vocab.Size() << ".  Are you sure you have the right order and vocab file for these counts?");
       std::cout << vocab.Lookup(*i) << ' ';
     }
-    // TODO don't use std::cout because it is slow.  Add fast uint64_t printing support to FakeOFStream.
+    // TODO don't use std::cout because it is slow.  Add fast uint64_t printing support to FileStream.
     std::cout << *reinterpret_cast<const uint64_t*>(words + order) << '\n';
   }
 }
diff --git a/lm/builder/interpolate.cc b/lm/builder/interpolate.cc
index 7dcb3623c..a62ef43d2 100644
--- a/lm/builder/interpolate.cc
+++ b/lm/builder/interpolate.cc
@@ -12,7 +12,6 @@
 #include <iostream>
 #include <cassert>
 #include <cmath>
-#include <iostream>
 
 namespace lm { namespace builder {
 namespace {
diff --git a/lm/builder/output.cc b/lm/builder/output.cc
index c92283ac6..604fa22e6 100644
--- a/lm/builder/output.cc
+++ b/lm/builder/output.cc
@@ -2,7 +2,7 @@
 
 #include "lm/common/model_buffer.hh"
 #include "lm/common/print.hh"
-#include "util/fake_ofstream.hh"
+#include "util/file_stream.hh"
 #include "util/stream/multi_stream.hh"
 
 #include <iostream>
@@ -41,7 +41,7 @@ void Output::Apply(HookType hook_type, util::stream::Chains &chains) {
 
 void PrintHook::Sink(const HeaderInfo &info, int vocab_file, util::stream::Chains &chains) {
   if (verbose_header_) {
-    util::FakeOFStream out(file_.get(), 50);
+    util::FileStream out(file_.get(), 50);
     out << "# Input file: " << info.input_file << '\n';
     out << "# Token count: " << info.token_count << '\n';
     out << "# Smoothing: Modified Kneser-Ney" << '\n';
diff --git a/lm/common/model_buffer.cc b/lm/common/model_buffer.cc
index 431d4ae4c..ae9b08c4f 100644
--- a/lm/common/model_buffer.cc
+++ b/lm/common/model_buffer.cc
@@ -1,6 +1,6 @@
 #include "lm/common/model_buffer.hh"
 #include "util/exception.hh"
-#include "util/fake_ofstream.hh"
+#include "util/file_stream.hh"
 #include "util/file.hh"
 #include "util/file_piece.hh"
 #include "util/stream/io.hh"
@@ -68,7 +68,7 @@ void ModelBuffer::Sink(util::stream::Chains &chains, const std::vector<uint64_t>
   }
   if (keep_buffer_) {
     util::scoped_fd metadata(util::CreateOrThrow((file_base_ + ".kenlm_intermediate").c_str()));
-    util::FakeOFStream meta(metadata.get(), 200);
+    util::FileStream meta(metadata.get(), 200);
     meta << kMetadataHeader << "\nCounts";
     for (std::vector<uint64_t>::const_iterator i = counts_.begin(); i != counts_.end(); ++i) {
       meta << ' ' << *i;
diff --git a/lm/common/print.cc b/lm/common/print.cc
index cd2a80260..518b62f51 100644
--- a/lm/common/print.cc
+++ b/lm/common/print.cc
@@ -1,7 +1,7 @@
 #include "lm/common/print.hh"
 
 #include "lm/common/ngram_stream.hh"
-#include "util/fake_ofstream.hh"
+#include "util/file_stream.hh"
 #include "util/file.hh"
 #include "util/mmap.hh"
 #include "util/scoped.hh"
@@ -24,7 +24,7 @@ VocabReconstitute::VocabReconstitute(int fd) {
 }
 
 namespace {
-template <class Payload> void PrintLead(const VocabReconstitute &vocab, ProxyStream<Payload> &stream, util::FakeOFStream &out) {
+template <class Payload> void PrintLead(const VocabReconstitute &vocab, ProxyStream<Payload> &stream, util::FileStream &out) {
   out << stream->Value().prob << '\t' << vocab.Lookup(*stream->begin());
   for (const WordIndex *i = stream->begin() + 1; i != stream->end(); ++i) {
     out << ' ' << vocab.Lookup(*i);
@@ -34,7 +34,7 @@ template <class Payload> void PrintLead(const VocabReconstitute &vocab, ProxyStr
 
 void PrintARPA::Run(const util::stream::ChainPositions &positions) {
   VocabReconstitute vocab(vocab_fd_);
-  util::FakeOFStream out(out_fd_);
+  util::FileStream out(out_fd_);
   out << "\\data\\\n";
   for (size_t i = 0; i < positions.size(); ++i) {
     out << "ngram " << (i+1) << '=' << counts_[i] << '\n';
diff --git a/lm/filter/arpa_io.cc b/lm/filter/arpa_io.cc
index 92c821aaf..2cae60f9a 100644
--- a/lm/filter/arpa_io.cc
+++ b/lm/filter/arpa_io.cc
@@ -1,5 +1,6 @@
 #include "lm/filter/arpa_io.hh"
 #include "util/file_piece.hh"
+#include "util/string_stream.hh"
 
 #include <iostream>
 #include <ostream>
@@ -22,14 +23,8 @@ ARPAInputException::ARPAInputException(const StringPiece &message, const StringP
 
 ARPAInputException::~ARPAInputException() throw() {}
 
-ARPAOutputException::ARPAOutputException(const char *message, const std::string &file_name) throw() {
-  *this << message << " in file " << file_name;
-}
-
-ARPAOutputException::~ARPAOutputException() throw() {}
-
 // Seeking is the responsibility of the caller.
-void WriteCounts(std::ostream &out, const std::vector<uint64_t> &number) {
+template <class Stream> void WriteCounts(Stream &out, const std::vector<uint64_t> &number) {
   out << "\n\\data\\\n";
   for (unsigned int i = 0; i < number.size(); ++i) {
     out << "ngram " << i+1 << "=" << number[i] << '\n';
@@ -38,9 +33,10 @@ void WriteCounts(std::ostream &out, const std::vector<uint64_t> &number) {
 }
 
 size_t SizeNeededForCounts(const std::vector<uint64_t> &number) {
-  std::ostringstream buf;
-  WriteCounts(buf, number);
-  return buf.tellp();
+  std::string buf;
+  util::StringStream stream(buf);
+  WriteCounts(stream, number);
+  return buf.size();
 }
 
 bool IsEntirelyWhiteSpace(const StringPiece &line) {
@@ -50,44 +46,21 @@ bool IsEntirelyWhiteSpace(const StringPiece &line) {
   return true;
 }
 
-ARPAOutput::ARPAOutput(const char *name, size_t buffer_size) : file_name_(name), buffer_(new char[buffer_size]) {
-  try {
-    file_.exceptions(std::ostream::eofbit | std::ostream::failbit | std::ostream::badbit);
-    if (!file_.rdbuf()->pubsetbuf(buffer_.get(), buffer_size)) {
-      std::cerr << "Warning: could not enlarge buffer for " << name << std::endl;
-      buffer_.reset();
-    }
-    file_.open(name, std::ios::out | std::ios::binary);
-  } catch (const std::ios_base::failure &f) {
-    throw ARPAOutputException("Opening", file_name_);
-  }
-}
+ARPAOutput::ARPAOutput(const char *name, size_t buffer_size) 
+  : file_backing_(util::CreateOrThrow(name)), file_(file_backing_.get(), buffer_size) {}
 
 void ARPAOutput::ReserveForCounts(std::streampos reserve) {
-  try {
-    for (std::streampos i = 0; i < reserve; i += std::streampos(1)) {
-      file_ << '\n';
-    }
-  } catch (const std::ios_base::failure &f) {
-    throw ARPAOutputException("Writing blanks to reserve space for counts to ", file_name_);
+  for (std::streampos i = 0; i < reserve; i += std::streampos(1)) {
+    file_ << '\n';
   }
 }
 
 void ARPAOutput::BeginLength(unsigned int length) {
-  fast_counter_ = 0;
-  try {
-    file_ << '\\' << length << "-grams:" << '\n';
-  } catch (const std::ios_base::failure &f) {
-    throw ARPAOutputException("Writing n-gram header to ", file_name_);
-  }
+  file_ << '\\' << length << "-grams:" << '\n';
 }
 
 void ARPAOutput::EndLength(unsigned int length) {
-  try {
-    file_ << '\n';
-  } catch (const std::ios_base::failure &f) {
-    throw ARPAOutputException("Writing blank at end of count list to ", file_name_);
-  }
+  file_ << '\n';
   if (length > counts_.size()) {
     counts_.resize(length);
   }
@@ -95,14 +68,10 @@ void ARPAOutput::EndLength(unsigned int length) {
 }
 
 void ARPAOutput::Finish() {
-  try {
-    file_ << "\\end\\\n";
-    file_.seekp(0);
-    WriteCounts(file_, counts_);
-    file_ << std::flush;
-  } catch (const std::ios_base::failure &f) {
-    throw ARPAOutputException("Finishing including writing counts at beginning to ", file_name_);
-  }
+  file_ << "\\end\\\n";
+  file_.seekp(0);
+  WriteCounts(file_, counts_);
+  file_.flush();
 }
 
 } // namespace lm
diff --git a/lm/filter/arpa_io.hh b/lm/filter/arpa_io.hh
index 0f6c8be75..7489270db 100644
--- a/lm/filter/arpa_io.hh
+++ b/lm/filter/arpa_io.hh
@@ -4,6 +4,7 @@
  */
 #include "lm/read_arpa.hh"
 #include "util/exception.hh"
+#include "util/file_stream.hh"
 #include "util/string_piece.hh"
 #include "util/tokenize_piece.hh"
 
@@ -28,17 +29,6 @@ class ARPAInputException : public util::Exception {
     virtual ~ARPAInputException() throw();
 };
 
-class ARPAOutputException : public util::ErrnoException {
-  public:
-    ARPAOutputException(const char *prefix, const std::string &file_name) throw();
-    virtual ~ARPAOutputException() throw();
-
-    const std::string &File() const throw() { return file_name_; }
-
-  private:
-    const std::string file_name_;
-};
-
 // Handling for the counts of n-grams at the beginning of ARPA files.
 size_t SizeNeededForCounts(const std::vector<uint64_t> &number);
 
@@ -55,11 +45,7 @@ class ARPAOutput : boost::noncopyable {
     void BeginLength(unsigned int length);
 
     void AddNGram(const StringPiece &line) {
-      try {
-        file_ << line << '\n';
-      } catch (const std::ios_base::failure &f) {
-        throw ARPAOutputException("Writing an n-gram", file_name_);
-      }
+      file_ << line << '\n';
       ++fast_counter_;
     }
 
@@ -76,9 +62,8 @@ class ARPAOutput : boost::noncopyable {
     void Finish();
 
   private:
-    const std::string file_name_;
-    boost::scoped_array<char> buffer_;
-    std::fstream file_;
+    util::scoped_fd file_backing_;
+    util::FileStream file_;
     size_t fast_counter_;
     std::vector<uint64_t> counts_;
 };
diff --git a/lm/filter/count_io.hh b/lm/filter/count_io.hh
index 02eb78baa..1af6676c4 100644
--- a/lm/filter/count_io.hh
+++ b/lm/filter/count_io.hh
@@ -5,7 +5,7 @@
 #include <iostream>
 #include <string>
 
-#include "util/fake_ofstream.hh"
+#include "util/file_stream.hh"
 #include "util/file.hh"
 #include "util/file_piece.hh"
 
@@ -28,7 +28,7 @@ class CountOutput : boost::noncopyable {
     }
 
   private:
-    util::FakeOFStream file_;
+    util::FileStream file_;
 };
 
 class CountBatch {
diff --git a/lm/filter/phrase_table_vocab_main.cc b/lm/filter/phrase_table_vocab_main.cc
index e8a8d0265..9ffa35f93 100644
--- a/lm/filter/phrase_table_vocab_main.cc
+++ b/lm/filter/phrase_table_vocab_main.cc
@@ -1,4 +1,4 @@
-#include "util/fake_ofstream.hh"
+#include "util/file_stream.hh"
 #include "util/file_piece.hh"
 #include "util/murmur_hash.hh"
 #include "util/pool.hh"
@@ -68,7 +68,7 @@ class TargetWords {
     }
 
     void Print() const {
-      util::FakeOFStream out(1);
+      util::FileStream out(1);
       for (std::vector<boost::unordered_set<const char *> >::const_iterator i = vocab_.begin(); i != vocab_.end(); ++i) {
         for (boost::unordered_set<const char *>::const_iterator j = i->begin(); j != i->end(); ++j) {
           out << *j << ' ';
diff --git a/lm/kenlm_benchmark_main.cc b/lm/kenlm_benchmark_main.cc
index 1704ec6c9..c9ee16525 100644
--- a/lm/kenlm_benchmark_main.cc
+++ b/lm/kenlm_benchmark_main.cc
@@ -1,5 +1,5 @@
 #include "lm/model.hh"
-#include "util/fake_ofstream.hh"
+#include "util/file_stream.hh"
 #include "util/file.hh"
 #include "util/file_piece.hh"
 #include "util/usage.hh"
@@ -10,7 +10,7 @@ namespace {
 
 template <class Model, class Width> void ConvertToBytes(const Model &model, int fd_in) {
   util::FilePiece in(fd_in);
-  util::FakeOFStream out(1);
+  util::FileStream out(1);
   Width width;
   StringPiece word;
   const Width end_sentence = (Width)model.GetVocabulary().EndSentence();
@@ -30,10 +30,19 @@ template <class Model, class Width> void QueryFromBytes(const Model &model, int
   const lm::ngram::State *next_state = begin_state;
   Width kEOS = model.GetVocabulary().EndSentence();
   Width buf[4096];
-  float sum = 0.0;
+
+  uint64_t completed = 0;
+  double loaded = util::CPUTime();
+
+  std::cout << "CPU_to_load: " << loaded << std::endl;
+
+  // Numerical precision: batch sums.
+  double total = 0.0;
   while (std::size_t got = util::ReadOrEOF(fd_in, buf, sizeof(buf))) {
+    float sum = 0.0;
     UTIL_THROW_IF2(got % sizeof(Width), "File size not a multiple of vocab id size " << sizeof(Width));
     got /= sizeof(Width);
+    completed += got;
     // Do even stuff first.
     const Width *even_end = buf + (got & ~1);
     // Alternating states
@@ -49,8 +58,13 @@ template <class Model, class Width> void QueryFromBytes(const Model &model, int
       sum += model.FullScore(*next_state, *i, state[2]).prob;
       next_state = (*i++ == kEOS) ? begin_state : &state[2];
     }
+    total += sum;
   }
-  std::cout << "Sum is " << sum << std::endl;
+  double after = util::CPUTime();
+  std::cerr << "Probability sum is " << total << std::endl;
+  std::cout << "Queries: " << completed << std::endl;
+  std::cout << "CPU_excluding_load: " << (after - loaded) << "\nCPU_per_query: " << ((after - loaded) / static_cast<double>(completed)) << std::endl;
+  std::cout << "RSSMax: " << util::RSSMax() << std::endl;
 }
 
 template <class Model, class Width> void DispatchFunction(const Model &model, bool query) {
@@ -62,7 +76,10 @@ template <class Model, class Width> void DispatchFunction(const Model &model, bo
 }
 
 template <class Model> void DispatchWidth(const char *file, bool query) {
-  Model model(file);
+  lm::ngram::Config config;
+  config.load_method = util::READ;
+  std::cerr << "Using load_method = READ." << std::endl;
+  Model model(file, config);
   lm::WordIndex bound = model.GetVocabulary().Bound();
   if (bound <= 256) {
     DispatchFunction<Model, uint8_t>(model, query);
@@ -116,11 +133,10 @@ int main(int argc, char *argv[]) {
       << argv[0] << " vocab $model <$text >$text.vocab\n"
       << "#Ensure files are in RAM.\n"
       << "cat $text.vocab $model >/dev/null\n"
-      << "#Timed query against the model, including loading.\n"
-      << "time " << argv[0] << " query $model <$text.vocab\n";
+      << "#Timed query against the model.\n"
+      << argv[0] << " query $model <$text.vocab\n";
     return 1;
   }
   Dispatch(argv[2], !strcmp(argv[1], "query"));
-  util::PrintUsage(std::cerr);
   return 0;
 }
diff --git a/lm/ngram_query.hh b/lm/ngram_query.hh
index b19c5aa4f..4430841c9 100644
--- a/lm/ngram_query.hh
+++ b/lm/ngram_query.hh
@@ -3,7 +3,7 @@
 
 #include "lm/enumerate_vocab.hh"
 #include "lm/model.hh"
-#include "util/fake_ofstream.hh"
+#include "util/file_stream.hh"
 #include "util/file_piece.hh"
 #include "util/usage.hh"
 
@@ -42,7 +42,7 @@ class QueryPrinter {
     }
 
   private:
-    util::FakeOFStream out_;
+    util::FileStream out_;
     bool print_word_;
     bool print_line_;
     bool print_summary_;
diff --git a/lm/vocab.cc b/lm/vocab.cc
index 5696e60b3..3d83e0452 100644
--- a/lm/vocab.cc
+++ b/lm/vocab.cc
@@ -6,7 +6,7 @@
 #include "lm/config.hh"
 #include "lm/weights.hh"
 #include "util/exception.hh"
-#include "util/fake_ofstream.hh"
+#include "util/file_stream.hh"
 #include "util/file.hh"
 #include "util/joint_sort.hh"
 #include "util/murmur_hash.hh"
@@ -182,7 +182,7 @@ void SortedVocabulary::ComputeRenumbering(WordIndex types, int from_words, int t
   std::sort(entries.begin(), entries.end());
   // Write out new vocab file.
   {
-    util::FakeOFStream out(to_words);
+    util::FileStream out(to_words);
     out << "<unk>" << '\0';
     for (std::vector<RenumberEntry>::const_iterator i = entries.begin(); i != entries.end(); ++i) {
       out << i->str << '\0';
diff --git a/lm/vocab.hh b/lm/vocab.hh
index b42566f23..59740e853 100644
--- a/lm/vocab.hh
+++ b/lm/vocab.hh
@@ -4,7 +4,7 @@
 #include "lm/enumerate_vocab.hh"
 #include "lm/lm_exception.hh"
 #include "lm/virtual_interface.hh"
-#include "util/fake_ofstream.hh"
+#include "util/file_stream.hh"
 #include "util/murmur_hash.hh"
 #include "util/pool.hh"
 #include "util/probing_hash_table.hh"
@@ -44,7 +44,7 @@ class ImmediateWriteWordsWrapper : public EnumerateVocab {
   private:
     EnumerateVocab *inner_;
 
-    util::FakeOFStream stream_;
+    util::FileStream stream_;
 };
 
 // When the binary size isn't known yet.
@@ -225,7 +225,7 @@ class WriteUniqueWords {
     }
 
   private:
-    util::FakeOFStream word_list_;
+    util::FileStream word_list_;
 };
 
 class NoOpUniqueWords {
diff --git a/util/exception.cc b/util/exception.cc
index 588f5eae5..e644d2cb7 100644
--- a/util/exception.cc
+++ b/util/exception.cc
@@ -7,47 +7,41 @@
 #include <cerrno>
 #include <cstring>
 
+#if defined(_WIN32) || defined(_WIN64)
+#include <windows.h>
+#include <io.h>
+#endif
+
 namespace util {
 
 Exception::Exception() throw() {}
 Exception::~Exception() throw() {}
 
-Exception::Exception(const Exception &from) : std::exception() {
-  stream_ << from.stream_.str();
-}
-
-Exception &Exception::operator=(const Exception &from) {
-  stream_ << from.stream_.str();
-  return *this;
-}
-
-const char *Exception::what() const throw() {
-  text_ = stream_.str();
-  return text_.c_str();
-}
-
 void Exception::SetLocation(const char *file, unsigned int line, const char *func, const char *child_name, const char *condition) {
   /* The child class might have set some text, but we want this to come first.
    * Another option would be passing this information to the constructor, but
    * then child classes would have to accept constructor arguments and pass
    * them down.
    */
-  text_ = stream_.str();
-  stream_.str("");
-  stream_ << file << ':' << line;
-  if (func) stream_ << " in " << func << " threw ";
+  std::string old_text;
+  std::swap(old_text, what_);
+  StringStream stream(what_);
+  stream << file << ':' << line;
+  if (func) stream << " in " << func << " threw ";
   if (child_name) {
-    stream_ << child_name;
+    stream << child_name;
   } else {
 #ifdef __GXX_RTTI
-    stream_ << typeid(this).name();
+    stream << typeid(this).name();
 #else
-    stream_ << "an exception";
+    stream << "an exception";
 #endif
   }
-  if (condition) stream_ << " because `" << condition;
-  stream_ << "'.\n";
-  stream_ << text_;
+  if (condition) {
+    stream << " because `" << condition << '\'';
+  }
+  stream << ".\n";
+  stream << old_text;
 }
 
 namespace {
@@ -95,4 +89,17 @@ ErrnoException::~ErrnoException() throw() {}
 OverflowException::OverflowException() throw() {}
 OverflowException::~OverflowException() throw() {}
 
+#if defined(_WIN32) || defined(_WIN64)
+WindowsException::WindowsException() throw() {
+  unsigned int last_error = GetLastError();
+  char error_msg[256] = "";
+  if (!FormatMessageA(FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, NULL, last_error, LANG_NEUTRAL, error_msg, sizeof(error_msg), NULL)) {
+    *this << "Windows error " << GetLastError() << " while formatting Windows error " << last_error << ". ";
+  } else {
+    *this << "Windows error " << last_error << ": " << error_msg;
+  }
+}
+WindowsException::~WindowsException() throw() {}
+#endif
+
 } // namespace util
diff --git a/util/exception.hh b/util/exception.hh
index d67a6f9fb..00207b242 100644
--- a/util/exception.hh
+++ b/util/exception.hh
@@ -1,12 +1,16 @@
 #ifndef UTIL_EXCEPTION_H
 #define UTIL_EXCEPTION_H
 
+#include "util/string_stream.hh"
+
 #include <exception>
 #include <limits>
-#include <sstream>
 #include <string>
 #include <stdint.h>
 
+// TODO(hieu) delete this
+#include <sstream>
+
 namespace util {
 
 template <class Except, class Data> typename Except::template ExceptionTag<Except&>::Identity operator<<(Except &e, const Data &data);
@@ -16,11 +20,7 @@ class Exception : public std::exception {
     Exception() throw();
     virtual ~Exception() throw();
 
-    Exception(const Exception &from);
-    Exception &operator=(const Exception &from);
-
-    // Not threadsafe, but probably doesn't matter.  FWIW, Boost's exception guidance implies that what() isn't threadsafe.
-    const char *what() const throw();
+    const char *what() const throw() { return what_.c_str(); }
 
     // For use by the UTIL_THROW macros.
     void SetLocation(
@@ -38,8 +38,7 @@ class Exception : public std::exception {
       typedef T Identity;
     };
 
-    std::stringstream stream_;
-    mutable std::string text_;
+    std::string what_;
 };
 
 /* This implements the normal operator<< for Exception and all its children.
@@ -47,7 +46,12 @@ class Exception : public std::exception {
  * boost::enable_if.
  */
 template <class Except, class Data> typename Except::template ExceptionTag<Except&>::Identity operator<<(Except &e, const Data &data) {
-  e.stream_ << data;
+  // TODO(hieu): change this to
+  // StringStream(e.what_) << data;
+
+  std::stringstream moses_hack;
+  moses_hack << data;
+  e.what_ += moses_hack.str();
   return e;
 }
 
@@ -149,6 +153,15 @@ inline std::size_t CheckOverflow(uint64_t value) {
   return CheckOverflowInternal<sizeof(std::size_t)>(value);
 }
 
+#if defined(_WIN32) || defined(_WIN64)
+/* Thrown for Windows specific operations. */
+class WindowsException : public Exception {
+  public:
+    WindowsException() throw();
+    ~WindowsException() throw();
+};
+#endif
+
 } // namespace util
 
 #endif // UTIL_EXCEPTION_H
diff --git a/util/fake_ofstream.hh b/util/fake_ofstream.hh
deleted file mode 100644
index d35bf0d83..000000000
--- a/util/fake_ofstream.hh
+++ /dev/null
@@ -1,137 +0,0 @@
-/* Like std::ofstream but without being incredibly slow.  Backed by a raw fd.
- * Supports most of the built-in types except for void* and long double.
- */
-#ifndef UTIL_FAKE_OFSTREAM_H
-#define UTIL_FAKE_OFSTREAM_H
-
-#include "util/file.hh"
-#include "util/float_to_string.hh"
-#include "util/integer_to_string.hh"
-#include "util/scoped.hh"
-#include "util/string_piece.hh"
-
-#include <cassert>
-#include <cstring>
-
-#include <stdint.h>
-
-namespace util {
-class FakeOFStream {
-  public:
-    // Maximum over all ToString operations.
-    // static const std::size_t kMinBuf = 20;
-    // This was causing compile failures in debug, so now 20 is written directly.
-    //
-    // Does not take ownership of out.
-    // Allows default constructor, but must call SetFD.
-    explicit FakeOFStream(int out = -1, std::size_t buffer_size = 1048576)
-      : buf_(util::MallocOrThrow(std::max(buffer_size, (size_t)20))),
-        current_(static_cast<char*>(buf_.get())),
-        end_(current_ + std::max(buffer_size, (size_t)20)),
-        fd_(out) {}
-
-    ~FakeOFStream() {
-      // Could have called Finish already
-      flush();
-    }
-
-    void SetFD(int to) {
-      flush();
-      fd_ = to;
-    }
-
-    FakeOFStream &write(const void *data, std::size_t length) {
-      if (UTIL_LIKELY(current_ + length <= end_)) {
-        std::memcpy(current_, data, length);
-        current_ += length;
-        return *this;
-      }
-      flush();
-      if (current_ + length <= end_) {
-        std::memcpy(current_, data, length);
-        current_ += length;
-      } else {
-        util::WriteOrThrow(fd_, data, length);
-      }
-      return *this;
-    }
-
-    // This also covers std::string and char*
-    FakeOFStream &operator<<(StringPiece str) {
-      return write(str.data(), str.size());
-    }
-
-    // For anything with ToStringBuf<T>::kBytes, define operator<< using ToString.
-    // This includes uint64_t, int64_t, uint32_t, int32_t, uint16_t, int16_t,
-    // float, double
-  private:
-    template <int Arg> struct EnableIfKludge {
-      typedef FakeOFStream type;
-    };
-  public:
-    template <class T> typename EnableIfKludge<ToStringBuf<T>::kBytes>::type &operator<<(const T value) {
-      EnsureRemaining(ToStringBuf<T>::kBytes);
-      current_ = ToString(value, current_);
-      assert(current_ <= end_);
-      return *this;
-    }
-
-    FakeOFStream &operator<<(char c) {
-      EnsureRemaining(1);
-      *current_++ = c;
-      return *this;
-    }
-
-    FakeOFStream &operator<<(unsigned char c) {
-      EnsureRemaining(1);
-      *current_++ = static_cast<char>(c);
-      return *this;
-    }
-
-    /* clang on OS X appears to consider std::size_t aka unsigned long distinct
-     * from uint64_t.  So this function makes clang work.  gcc considers
-     * uint64_t and std::size_t the same (on 64-bit) so this isn't necessary.
-     * But it does no harm since gcc sees it as a specialization of the
-     * EnableIfKludge template.
-     * Also, delegating to *this << static_cast<uint64_t>(value) would loop
-     * indefinitely on gcc.
-     */
-    FakeOFStream &operator<<(std::size_t value) {
-      EnsureRemaining(ToStringBuf<uint64_t>::kBytes);
-      current_ = ToString(static_cast<uint64_t>(value), current_);
-      return *this;
-    }
-
-    // Note this does not sync.
-    void flush() {
-      if (current_ != buf_.get()) {
-        util::WriteOrThrow(fd_, buf_.get(), current_ - (char*)buf_.get());
-        current_ = static_cast<char*>(buf_.get());
-      }
-    }
-
-    // Not necessary, but does assure the data is cleared.
-    void Finish() {
-      flush();
-      buf_.reset();
-      current_ = NULL;
-      util::FSyncOrThrow(fd_);
-    }
-
-  private:
-    void EnsureRemaining(std::size_t amount) {
-      if (UTIL_UNLIKELY(current_ + amount > end_)) {
-        flush();
-        assert(current_ + amount <= end_);
-      }
-    }
-
-    util::scoped_malloc buf_;
-    char *current_, *end_;
-
-    int fd_;
-};
-
-} // namespace
-
-#endif
diff --git a/util/fake_ostream.hh b/util/fake_ostream.hh
new file mode 100644
index 000000000..331742f6a
--- /dev/null
+++ b/util/fake_ostream.hh
@@ -0,0 +1,128 @@
+#ifndef UTIL_FAKE_OSTREAM_H
+#define UTIL_FAKE_OSTREAM_H
+
+#include "util/float_to_string.hh"
+#include "util/integer_to_string.hh"
+#include "util/string_piece.hh"
+
+#include <cassert>
+#include <limits>
+
+#include <stdint.h>
+
+namespace util {
+
+/* Like std::ostream but without being incredibly slow.
+ * Supports most of the built-in types except for long double.
+ * 
+ * The FakeOStream class is intended to be inherited from.  The inherting class
+ * should provide:
+ * public:
+ *   Derived &flush();
+ *   Derived &write(const void *data, std::size_t length);
+ * 
+ * private: or protected:
+ *   friend class FakeOStream;
+ *   char *Ensure(std::size_t amount);
+ *   void AdvanceTo(char *to);
+ *
+ * The Ensure function makes enough space for an in-place write and returns
+ * where to write.  The AdvanceTo function happens after the write, saying how
+ * much was actually written.
+ * 
+ * Precondition:
+ * amount <= kToStringMaxBytes for in-place writes.
+ */
+template <class Derived> class FakeOStream {
+  public:
+    FakeOStream() {}
+
+    // This also covers std::string and char*
+    Derived &operator<<(StringPiece str) {
+      return C().write(str.data(), str.size());
+    }
+
+    // For anything with ToStringBuf<T>::kBytes, define operator<< using ToString.
+    // This includes uint64_t, int64_t, uint32_t, int32_t, uint16_t, int16_t,
+    // float, double
+  private:
+    template <int Arg> struct EnableIfKludge {
+      typedef Derived type;
+    };
+  public:
+    template <class T> typename EnableIfKludge<ToStringBuf<T>::kBytes>::type &operator<<(const T value) {
+      return CallToString(value);
+    }
+
+    /* clang on OS X appears to consider std::size_t aka unsigned long distinct
+     * from uint64_t.  So this function makes clang work.  gcc considers
+     * uint64_t and std::size_t the same (on 64-bit) so this isn't necessary.
+     * But it does no harm since gcc sees it as a specialization of the
+     * EnableIfKludge template.
+     * Also, delegating to *this << static_cast<uint64_t>(value) would loop
+     * indefinitely on gcc.
+     */
+    Derived &operator<<(std::size_t value) { return CoerceToString(value); }
+
+    // union types will map to int, but don't pass the template magic above in gcc.
+    Derived &operator<<(int value) { return CoerceToString(value); }
+
+    // gcc considers these distinct from uint64_t
+    Derived &operator<<(unsigned long long value) { return CoerceToString(value); }
+    Derived &operator<<(signed long long value) { return CoerceToString(value); }
+
+    // Character types that get copied as bytes instead of displayed as integers.
+    Derived &operator<<(char val) { return put(val); }
+    Derived &operator<<(signed char val) { return put(static_cast<char>(val)); }
+    Derived &operator<<(unsigned char val) { return put(static_cast<char>(val)); }
+
+    // This is here to catch all the other pointer types.
+    Derived &operator<<(const void *value) { return CallToString(value); }
+    // This is here because the above line also catches const char*.
+    Derived &operator<<(const char *value) { return *this << StringPiece(value); }
+    Derived &operator<<(char *value) { return *this << StringPiece(value); }
+
+    Derived &put(char val) {
+      char *c = C().Ensure(1);
+      *c = val;
+      C().AdvanceTo(++c);
+      return C();
+    }
+
+    char widen(char val) const { return val; }
+
+  private:
+    // References to derived class for convenience.
+    Derived &C() {
+      return *static_cast<Derived*>(this);
+    }
+
+    const Derived &C() const {
+      return *static_cast<const Derived*>(this);
+    }
+
+    template <class From, unsigned Length = sizeof(From), bool Signed = std::numeric_limits<From>::is_signed> struct Coerce {};
+
+    template <class From> struct Coerce<From, 2, false> { typedef uint16_t To; };
+    template <class From> struct Coerce<From, 4, false> { typedef uint32_t To; };
+    template <class From> struct Coerce<From, 8, false> { typedef uint64_t To; };
+
+    template <class From> struct Coerce<From, 2, true> { typedef int16_t To; };
+    template <class From> struct Coerce<From, 4, true> { typedef int32_t To; };
+    template <class From> struct Coerce<From, 8, true> { typedef int64_t To; };
+
+    template <class From> Derived &CoerceToString(const From value) {
+      return CallToString(static_cast<typename Coerce<From>::To>(value));
+    }
+
+    // This is separate to prevent an infinite loop if the compiler considers
+    // types the same (i.e. gcc std::size_t and uint64_t or uint32_t).
+    template <class T> Derived &CallToString(const T value) {
+      C().AdvanceTo(ToString(value, C().Ensure(ToStringBuf<T>::kBytes)));
+      return C();
+    }
+};
+
+} // namespace
+
+#endif // UTIL_FAKE_OSTREAM_H
diff --git a/util/file.cc b/util/file.cc
index be272f9bc..e8976bc10 100644
--- a/util/file.cc
+++ b/util/file.cc
@@ -147,17 +147,33 @@ std::size_t GuardLarge(std::size_t size) {
 }
 }
 
+#if defined(_WIN32) || defined(_WIN64)
+namespace {
+const std::size_t kMaxDWORD = static_cast<std::size_t>(4294967295UL);
+} // namespace
+#endif
+
 std::size_t PartialRead(int fd, void *to, std::size_t amount) {
 #if defined(_WIN32) || defined(_WIN64)
-  int ret = _read(fd, to, GuardLarge(amount));
+    DWORD ret;
+    HANDLE file_handle = reinterpret_cast<HANDLE>(_get_osfhandle(fd));
+    DWORD larger_size = static_cast<DWORD>(std::min<std::size_t>(kMaxDWORD, amount));
+    DWORD smaller_size = 28672; // Received reports that 31346 worked but higher values did not. This rounds down to the nearest multiple of 4096, the page size. 
+    if (!ReadFile(file_handle, to, larger_size, &ret, NULL))
+    {
+        DWORD last_error = GetLastError();
+        if (last_error != ERROR_NOT_ENOUGH_MEMORY || !ReadFile(file_handle, to, smaller_size, &ret, NULL)) {
+            UTIL_THROW(WindowsException, "Windows error in ReadFile.");
+        }
+    }
 #else
   errno = 0;
   ssize_t ret;
   do {
     ret = read(fd, to, GuardLarge(amount));
   } while (ret == -1 && errno == EINTR);
-#endif
   UTIL_THROW_IF_ARG(ret < 0, FDException, (fd), "while reading " << amount << " bytes");
+#endif
   return static_cast<std::size_t>(ret);
 }
 
@@ -212,12 +228,6 @@ void WriteOrThrow(FILE *to, const void *data, std::size_t size) {
   UTIL_THROW_IF(1 != std::fwrite(data, size, 1, to), ErrnoException, "Short write; requested size " << size);
 }
 
-#if defined(_WIN32) || defined(_WIN64)
-namespace {
-const std::size_t kMaxDWORD = static_cast<std::size_t>(4294967295UL);
-} // namespace
-#endif
-
 void ErsatzPRead(int fd, void *to_void, std::size_t size, uint64_t off) {
   uint8_t *to = static_cast<uint8_t*>(to_void);
   while (size) {
@@ -230,7 +240,7 @@ void ErsatzPRead(int fd, void *to_void, std::size_t size, uint64_t off) {
     memset(&overlapped, 0, sizeof(OVERLAPPED));
     overlapped.Offset = static_cast<DWORD>(off);
     overlapped.OffsetHigh = static_cast<DWORD>(off >> 32);
-    UTIL_THROW_IF(!ReadFile((HANDLE)_get_osfhandle(fd), to, reading, &ret, &overlapped), Exception, "ReadFile failed for offset " << off);
+    UTIL_THROW_IF(!ReadFile((HANDLE)_get_osfhandle(fd), to, reading, &ret, &overlapped), WindowsException, "ReadFile failed for offset " << off);
 #else
     ssize_t ret;
     errno = 0;
diff --git a/util/file_piece.cc b/util/file_piece.cc
index 7017942e6..0a4d3a9da 100644
--- a/util/file_piece.cc
+++ b/util/file_piece.cc
@@ -56,7 +56,7 @@ FilePiece::FilePiece(std::istream &stream, const char *name, std::size_t min_buf
   InitializeNoRead("istream", min_buffer);
 
   fallback_to_read_ = true;
-  data_.reset(MallocOrThrow(default_map_size_), default_map_size_, scoped_memory::MALLOC_ALLOCATED);
+  HugeMalloc(default_map_size_, false, data_);
   position_ = data_.begin();
   position_end_ = position_;
 
@@ -282,7 +282,7 @@ void FilePiece::TransitionToRead() {
   assert(!fallback_to_read_);
   fallback_to_read_ = true;
   data_.reset();
-  data_.reset(MallocOrThrow(default_map_size_), default_map_size_, scoped_memory::MALLOC_ALLOCATED);
+  HugeMalloc(default_map_size_, false, data_);
   position_ = data_.begin();
   position_end_ = position_;
 
@@ -313,8 +313,7 @@ void FilePiece::ReadShift() {
       // Buffer too small.
       std::size_t valid_length = position_end_ - position_;
       default_map_size_ *= 2;
-      data_.call_realloc(default_map_size_);
-      UTIL_THROW_IF(!data_.get(), ErrnoException, "realloc failed for " << default_map_size_);
+      HugeRealloc(default_map_size_, false, data_);
       position_ = data_.begin();
       position_end_ = position_ + valid_length;
     } else {
diff --git a/util/file_piece_test.cc b/util/file_piece_test.cc
index 11e2ab3aa..d03cd312d 100644
--- a/util/file_piece_test.cc
+++ b/util/file_piece_test.cc
@@ -1,7 +1,7 @@
 // Tests might fail if you have creative characters in your path.  Sue me.
 #include "util/file_piece.hh"
 
-#include "util/fake_ofstream.hh"
+#include "util/file_stream.hh"
 #include "util/file.hh"
 #include "util/scoped.hh"
 
@@ -138,7 +138,7 @@ BOOST_AUTO_TEST_CASE(Numbers) {
   scoped_fd file(MakeTemp(FileLocation()));
   const float floating = 3.2;
   {
-    util::FakeOFStream writing(file.get());
+    util::FileStream writing(file.get());
     writing << "94389483984398493890287 " << floating << " 5";
   }
   SeekOrThrow(file.get(), 0);
diff --git a/util/file_stream.hh b/util/file_stream.hh
new file mode 100644
index 000000000..ae9ad5aa7
--- /dev/null
+++ b/util/file_stream.hh
@@ -0,0 +1,89 @@
+/* Like std::ofstream but without being incredibly slow.  Backed by a raw fd.
+ * Supports most of the built-in types except for long double.
+ */
+#ifndef UTIL_FILE_STREAM_H
+#define UTIL_FILE_STREAM_H
+
+#include "util/fake_ostream.hh"
+#include "util/file.hh"
+#include "util/scoped.hh"
+
+#include <cassert>
+#include <cstring>
+
+#include <stdint.h>
+
+namespace util {
+
+class FileStream : public FakeOStream<FileStream> {
+  public:
+    FileStream(int out = -1, std::size_t buffer_size = 8192)
+      : buf_(util::MallocOrThrow(std::max<std::size_t>(buffer_size, kToStringMaxBytes))),
+        current_(static_cast<char*>(buf_.get())),
+        end_(current_ + std::max<std::size_t>(buffer_size, kToStringMaxBytes)),
+        fd_(out) {}
+
+    ~FileStream() {
+      flush();
+    }
+
+    void SetFD(int to) {
+      flush();
+      fd_ = to;
+    }
+
+    FileStream &flush() {
+      if (current_ != buf_.get()) {
+        util::WriteOrThrow(fd_, buf_.get(), current_ - (char*)buf_.get());
+        current_ = static_cast<char*>(buf_.get());
+      }
+      return *this;
+    }
+
+    // For writes of arbitrary size.
+    FileStream &write(const void *data, std::size_t length) {
+      if (UTIL_LIKELY(current_ + length <= end_)) {
+        std::memcpy(current_, data, length);
+        current_ += length;
+        return *this;
+      }
+      flush();
+      if (current_ + length <= end_) {
+        std::memcpy(current_, data, length);
+        current_ += length;
+      } else {
+        util::WriteOrThrow(fd_, data, length);
+      }
+      return *this;
+    }
+
+    FileStream &seekp(uint64_t to) {
+      util::SeekOrThrow(fd_, to);
+      return *this;
+    }
+
+  protected:
+    friend class FakeOStream<FileStream>;
+    // For writes directly to buffer guaranteed to have amount < buffer size.
+    char *Ensure(std::size_t amount) {
+      if (UTIL_UNLIKELY(current_ + amount > end_)) {
+        flush();
+        assert(current_ + amount <= end_);
+      }
+      return current_;
+    }
+
+    void AdvanceTo(char *to) {
+      current_ = to;
+      assert(current_ <= end_);
+    }
+
+  private:
+    util::scoped_malloc buf_;
+    char *current_, *end_;
+    int fd_;
+};
+
+} // namespace
+
+#endif
diff --git a/util/integer_to_string.cc b/util/integer_to_string.cc
index 6b8766119..5150d7973 100644
--- a/util/integer_to_string.cc
+++ b/util/integer_to_string.cc
@@ -1,3 +1,4 @@
+#include <iostream>
 /* Fast integer to string conversion.
 Source: https://github.com/miloyip/itoa-benchmark
 Local modifications:
@@ -637,4 +638,28 @@ char *ToString(uint16_t value, char *to) {
   return ToString((uint32_t)value, to);
 }
 
+// void * to string.  This hasn't been optimized at all really.
+namespace {
+const char kHexDigits[] = "0123456789abcdef";
+} // namespace
+
+char *ToString(const void *v, char *to) {
+  // Apparently it's 0, not 0x0.  
+  if (!v) {
+    *to++ = '0';
+    return to;
+  }
+
+  *to++ = '0';
+  *to++ = 'x';
+  uintptr_t value = reinterpret_cast<uintptr_t>(v);
+  uint8_t shift = sizeof(void*) * 8 - 4;
+  for (; !(value >> shift); shift -= 4) {}
+  for (; ; shift -= 4) {
+    *to++ = kHexDigits[(value >> shift) & 0xf];
+    if (!shift) break;
+  }
+  return to;
+}
+
 } // namespace util
diff --git a/util/integer_to_string.hh b/util/integer_to_string.hh
index 0d975b14e..9ac25bd78 100644
--- a/util/integer_to_string.hh
+++ b/util/integer_to_string.hh
@@ -18,6 +18,8 @@ char *ToString(int64_t value, char *to);
 char *ToString(uint16_t value, char *to);
 char *ToString(int16_t value, char *to);
 
+char *ToString(const void *value, char *to);
+
 inline char *ToString(bool value, char *to) {
   *to++ = '0' + value;
   return to;
@@ -51,6 +53,14 @@ template <> struct ToStringBuf<int64_t> {
   enum { kBytes = 20 };
 };
 
+template <> struct ToStringBuf<const void*> {
+  // Either 18 on 64-bit or 10 on 32-bit.
+  enum { kBytes = sizeof(const void*) * 2 + 2 };
+};
+
+// Maximum over this and float.
+enum { kToStringMaxBytes = 20 };
+
 } // namespace util
 
 #endif // UTIL_INTEGER_TO_STRING_H
diff --git a/util/integer_to_string_test.cc b/util/integer_to_string_test.cc
index ded1ecec7..d090f64a8 100644
--- a/util/integer_to_string_test.cc
+++ b/util/integer_to_string_test.cc
@@ -21,9 +21,9 @@ template <class T> void TestValue(const T value) {
 template <class T> void TestCorners() {
   TestValue(std::numeric_limits<T>::min());
   TestValue(std::numeric_limits<T>::max());
-  TestValue(static_cast<T>(0));
-  TestValue(static_cast<T>(-1));
-  TestValue(static_cast<T>(1));
+  TestValue((T)0);
+  TestValue((T)-1);
+  TestValue((T)1);
 }
 
 BOOST_AUTO_TEST_CASE(Corners) {
@@ -33,6 +33,7 @@ BOOST_AUTO_TEST_CASE(Corners) {
   TestCorners<int16_t>();
   TestCorners<int32_t>();
   TestCorners<int64_t>();
+  TestCorners<const void*>();
 }
 
 template <class T> void TestAll() {
@@ -62,4 +63,14 @@ BOOST_AUTO_TEST_CASE(Tens) {
   Test10s<int32_t>();
 }
 
+BOOST_AUTO_TEST_CASE(Pointers) {
+  for (uintptr_t i = 1; i < std::numeric_limits<uintptr_t>::max() / 10; i *= 10) {
+    TestValue((const void*)i);
+  }
+  for (uintptr_t i = 0; i < 256; ++i) {
+    TestValue((const void*)i);
+    TestValue((const void*)(i + 0xf00));
+  }
+}
+
 }} // namespaces
diff --git a/util/mmap.cc b/util/mmap.cc
index 7dcb57ba3..b70fd7573 100644
--- a/util/mmap.cc
+++ b/util/mmap.cc
@@ -27,7 +27,7 @@
 
 namespace util {
 
-long SizePage() {
+std::size_t SizePage() {
 #if defined(_WIN32) || defined(_WIN64)
   SYSTEM_INFO si;
   GetSystemInfo(&si);
@@ -37,22 +37,6 @@ long SizePage() {
 #endif
 }
 
-void SyncOrThrow(void *start, size_t length) {
-#if defined(_WIN32) || defined(_WIN64)
-  UTIL_THROW_IF(!::FlushViewOfFile(start, length), ErrnoException, "Failed to sync mmap");
-#else
-  UTIL_THROW_IF(length && msync(start, length, MS_SYNC), ErrnoException, "Failed to sync mmap");
-#endif
-}
-
-void UnmapOrThrow(void *start, size_t length) {
-#if defined(_WIN32) || defined(_WIN64)
-  UTIL_THROW_IF(!::UnmapViewOfFile(start), ErrnoException, "Failed to unmap a file");
-#else
-  UTIL_THROW_IF(munmap(start, length), ErrnoException, "munmap failed");
-#endif
-}
-
 scoped_mmap::~scoped_mmap() {
   if (data_ != (void*)-1) {
     try {
@@ -66,14 +50,24 @@ scoped_mmap::~scoped_mmap() {
   }
 }
 
+namespace {
+template <class T> T RoundUpPow2(T value, T mult) {
+  return ((value - 1) & ~(mult - 1)) + mult;
+}
+} // namespace
+
+scoped_memory::scoped_memory(std::size_t size, bool zeroed) : data_(NULL), size_(0), source_(NONE_ALLOCATED) {
+  HugeMalloc(size, zeroed, *this);
+}
+
 void scoped_memory::reset(void *data, std::size_t size, Alloc source) {
   switch(source_) {
+    case MMAP_ROUND_UP_ALLOCATED:
+      scoped_mmap(data_, RoundUpPow2(size_, (std::size_t)SizePage()));
+      break;
     case MMAP_ALLOCATED:
       scoped_mmap(data_, size_);
       break;
-    case ARRAY_ALLOCATED:
-      delete [] reinterpret_cast<char*>(data_);
-      break;
     case MALLOC_ALLOCATED:
       free(data_);
       break;
@@ -85,7 +79,7 @@ void scoped_memory::reset(void *data, std::size_t size, Alloc source) {
   source_ = source;
 }
 
-void scoped_memory::call_realloc(std::size_t size) {
+/*void scoped_memory::call_realloc(std::size_t size) {
   assert(source_ == MALLOC_ALLOCATED || source_ == NONE_ALLOCATED);
   void *new_data = realloc(data_, size);
   if (!new_data) {
@@ -95,7 +89,17 @@ void scoped_memory::call_realloc(std::size_t size) {
     size_ = size;
     source_ = MALLOC_ALLOCATED;
   }
-}
+}*/
+
+const int kFileFlags =
+#if defined(_WIN32) || defined(_WIN64)
+  0 // MapOrThrow ignores flags on windows
+#elif defined(MAP_FILE)
+  MAP_FILE | MAP_SHARED
+#else
+  MAP_SHARED
+#endif
+  ;
 
 void *MapOrThrow(std::size_t size, bool for_write, int flags, bool prefault, int fd, uint64_t offset) {
 #ifdef MAP_POPULATE // Linux specific
@@ -126,15 +130,168 @@ void *MapOrThrow(std::size_t size, bool for_write, int flags, bool prefault, int
   return ret;
 }
 
-const int kFileFlags =
+void SyncOrThrow(void *start, size_t length) {
 #if defined(_WIN32) || defined(_WIN64)
-  0 // MapOrThrow ignores flags on windows
-#elif defined(MAP_FILE)
-  MAP_FILE | MAP_SHARED
+  UTIL_THROW_IF(!::FlushViewOfFile(start, length), ErrnoException, "Failed to sync mmap");
 #else
-  MAP_SHARED
+  UTIL_THROW_IF(length && msync(start, length, MS_SYNC), ErrnoException, "Failed to sync mmap");
 #endif
-  ;
+}
+
+void UnmapOrThrow(void *start, size_t length) {
+#if defined(_WIN32) || defined(_WIN64)
+  UTIL_THROW_IF(!::UnmapViewOfFile(start), ErrnoException, "Failed to unmap a file");
+#else
+  UTIL_THROW_IF(munmap(start, length), ErrnoException, "munmap failed");
+#endif
+}
+
+// Linux huge pages.
+#ifdef __linux__
+
+namespace {
+
+bool AnonymousMap(std::size_t size, int flags, bool populate, util::scoped_memory &to) {
+  if (populate) flags |= MAP_POPULATE;
+  void *ret = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | flags, -1, 0);
+  if (ret == MAP_FAILED) return false;
+  to.reset(ret, size, scoped_memory::MMAP_ALLOCATED);
+  return true;
+}
+
+bool TryHuge(std::size_t size, uint8_t alignment_bits, bool populate, util::scoped_memory &to) {
+  // Don't bother with these cases.
+  if (size < (1ULL << alignment_bits) || (1ULL << alignment_bits) < SizePage())
+    return false;
+
+  // First try: Linux >= 3.8 with manually configured hugetlb pages available.
+#ifdef MAP_HUGE_SHIFT
+  if (AnonymousMap(size, MAP_HUGETLB | (alignment_bits << MAP_HUGE_SHIFT), populate, to))
+    return true;
+#endif
+
+  // Second try: manually configured hugetlb pages exist, but kernel too old to
+  // pick size or not available.  This might pick the wrong size huge pages,
+  // but the sysadmin must have made them available in the first place.
+  if (AnonymousMap(size, MAP_HUGETLB, populate, to))
+    return true;
+
+  // Third try: align to a multiple of the huge page size by overallocating.
+  // I feel bad about doing this, but it's also how posix_memalign is
+  // implemented.  And the memory is virtual.
+
+  // Round up requested size to multiple of page size.  This will allow the pages after to be munmapped.
+  std::size_t size_up = RoundUpPow2(size, SizePage());
+
+  std::size_t ask = size_up + (1 << alignment_bits) - SizePage();
+  // Don't populate because this is asking for more than we will use.
+  scoped_mmap larger(mmap(NULL, ask, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0), ask);
+  if (larger.get() == MAP_FAILED) return false;
+
+  // Throw out pages before the alignment point.
+  uintptr_t base = reinterpret_cast<uintptr_t>(larger.get());
+  // Round up to next multiple of alignment.
+  uintptr_t rounded_up = RoundUpPow2(base, static_cast<uintptr_t>(1) << alignment_bits);
+  if (base != rounded_up) {
+    // If this throws an exception (which it shouldn't) then we want to unmap the whole thing by keeping it in larger.
+    UnmapOrThrow(larger.get(), rounded_up - base);
+    larger.steal();
+    larger.reset(reinterpret_cast<void*>(rounded_up), ask - (rounded_up - base));
+  }
+
+  // Throw out pages after the requested size.
+  assert(larger.size() >= size_up);
+  if (larger.size() > size_up) {
+    // This is where we assume size_up is a multiple of page size.
+    UnmapOrThrow(static_cast<uint8_t*>(larger.get()) + size_up, larger.size() - size_up);
+    larger.reset(larger.steal(), size_up);
+  }
+  madvise(larger.get(), size_up, MADV_HUGEPAGE);
+  to.reset(larger.steal(), size, scoped_memory::MMAP_ROUND_UP_ALLOCATED);
+  return true;
+}
+
+} // namespace
+
+#endif
+
+void HugeMalloc(std::size_t size, bool zeroed, scoped_memory &to) {
+  to.reset();
+#ifdef __linux__
+  // TODO: architectures/page sizes other than 2^21 and 2^30.
+  // Attempt 1 GB pages.
+  // If the user asked for zeroed memory, assume they want it populated.
+  if (size >= (1ULL << 30) && TryHuge(size, 30, zeroed, to))
+    return;
+  // Attempt 2 MB pages.
+  if (size >= (1ULL << 21) && TryHuge(size, 21, zeroed, to))
+    return;
+#endif // __linux__
+  // Non-linux will always do this, as will small allocations on Linux.
+  to.reset(zeroed ? calloc(1, size) : malloc(size), size, scoped_memory::MALLOC_ALLOCATED);
+  UTIL_THROW_IF(!to.get(), ErrnoException, "Failed to allocate " << size << " bytes");
+}
+
+#ifdef __linux__
+const std::size_t kTransitionHuge = std::max<std::size_t>(1ULL << 21, SizePage());
+#endif // __linux__
+
+void HugeRealloc(std::size_t to, bool zero_new, scoped_memory &mem) {
+  if (!to) {
+    mem.reset();
+    return;
+  }
+  std::size_t from_size = mem.size();
+  switch (mem.source()) {
+    case scoped_memory::NONE_ALLOCATED:
+      HugeMalloc(to, zero_new, mem);
+      return;
+#ifdef __linux__
+    case scoped_memory::MMAP_ROUND_UP_ALLOCATED:
+      // for mremap's benefit.
+      from_size = RoundUpPow2(from_size, SizePage());
+    case scoped_memory::MMAP_ALLOCATED:
+      // Downsizing below barrier?
+      if (to <= SizePage()) {
+        scoped_malloc replacement(malloc(to));
+        memcpy(replacement.get(), mem.get(), std::min(to, mem.size()));
+        if (zero_new && to > mem.size())
+          memset(static_cast<uint8_t*>(replacement.get()) + mem.size(), 0, to - mem.size());
+        mem.reset(replacement.release(), to, scoped_memory::MALLOC_ALLOCATED);
+      } else {
+        void *new_addr = mremap(mem.get(), from_size, to, MREMAP_MAYMOVE);
+        UTIL_THROW_IF(!new_addr, ErrnoException, "Failed to mremap from " << from_size << " to " << to);
+        mem.steal();
+        mem.reset(new_addr, to, scoped_memory::MMAP_ALLOCATED);
+      }
+      return;
+#endif // __linux__
+    case scoped_memory::MALLOC_ALLOCATED:
+#ifdef __linux__
+      // Transition larger allocations to huge pages, but don't keep trying if we're still malloc allocated.
+      if (to >= kTransitionHuge && mem.size() < kTransitionHuge) {
+        scoped_memory replacement;
+        HugeMalloc(to, zero_new, replacement);
+        memcpy(replacement.get(), mem.get(), mem.size());
+        // This can't throw.
+        mem.reset(replacement.get(), replacement.size(), replacement.source());
+        replacement.steal();
+        return;
+      }
+#endif // __linux__
+      {
+        void *new_addr = std::realloc(mem.get(), to);
+        UTIL_THROW_IF(!new_addr, ErrnoException, "realloc to " << to << " bytes failed.");
+        if (zero_new && to > mem.size())
+          memset(static_cast<uint8_t*>(new_addr) + mem.size(), 0, to - mem.size());
+        mem.steal();
+        mem.reset(new_addr, to, scoped_memory::MALLOC_ALLOCATED);
+      }
+      return;
+    default:
+      UTIL_THROW(Exception, "HugeRealloc called with type " << mem.source());
+  }
+}
 
 void MapRead(LoadMethod method, int fd, uint64_t offset, std::size_t size, scoped_memory &out) {
   switch (method) {
@@ -151,33 +308,17 @@ void MapRead(LoadMethod method, int fd, uint64_t offset, std::size_t size, scope
     case POPULATE_OR_READ:
 #endif
     case READ:
-      out.reset(MallocOrThrow(size), size, scoped_memory::MALLOC_ALLOCATED);
+      HugeMalloc(size, false, out);
       SeekOrThrow(fd, offset);
       ReadOrThrow(fd, out.get(), size);
       break;
     case PARALLEL_READ:
-      out.reset(MallocOrThrow(size), size, scoped_memory::MALLOC_ALLOCATED);
+      HugeMalloc(size, false, out);
       ParallelRead(fd, out.get(), size, offset);
       break;
   }
 }
 
-// Allocates zeroed memory in to.
-void MapAnonymous(std::size_t size, util::scoped_memory &to) {
-  to.reset();
-#if defined(_WIN32) || defined(_WIN64)
-  to.reset(calloc(1, size), size, scoped_memory::MALLOC_ALLOCATED);
-#else
-  to.reset(MapOrThrow(size, true,
-#  if defined(MAP_ANONYMOUS)
-      MAP_ANONYMOUS | MAP_PRIVATE // Linux
-#  else
-      MAP_ANON | MAP_PRIVATE // BSD
-#  endif
-      , false, -1, 0), size, scoped_memory::MMAP_ALLOCATED);
-#endif
-}
-
 void *MapZeroedWrite(int fd, std::size_t size) {
   ResizeOrThrow(fd, 0);
   ResizeOrThrow(fd, size);
diff --git a/util/mmap.hh b/util/mmap.hh
index 9ac604975..b474dc75b 100644
--- a/util/mmap.hh
+++ b/util/mmap.hh
@@ -12,7 +12,7 @@ namespace util {
 
 class scoped_fd;
 
-long SizePage();
+std::size_t SizePage();
 
 // (void*)-1 is MAP_FAILED; this is done to avoid including the mmap header here.
 class scoped_mmap {
@@ -37,6 +37,13 @@ class scoped_mmap {
       reset((void*)-1, 0);
     }
 
+    void *steal() {
+      void *ret = data_;
+      data_ = (void*)-1;
+      size_ = 0;
+      return ret;
+    }
+
   private:
     void *data_;
     std::size_t size_;
@@ -51,13 +58,21 @@ class scoped_mmap {
  */
 class scoped_memory {
   public:
-    typedef enum {MMAP_ALLOCATED, ARRAY_ALLOCATED, MALLOC_ALLOCATED, NONE_ALLOCATED} Alloc;
+    typedef enum {
+      MMAP_ROUND_UP_ALLOCATED, // The size was rounded up to a multiple of page size.  Do the same before munmap.
+      MMAP_ALLOCATED, // munmap
+      MALLOC_ALLOCATED, // free
+      NONE_ALLOCATED // nothing here!
+    } Alloc;
 
     scoped_memory(void *data, std::size_t size, Alloc source)
       : data_(data), size_(size), source_(source) {}
 
     scoped_memory() : data_(NULL), size_(0), source_(NONE_ALLOCATED) {}
 
+    // Calls HugeMalloc
+    scoped_memory(std::size_t to, bool zero_new);
+
     ~scoped_memory() { reset(); }
 
     void *get() const { return data_; }
@@ -71,9 +86,13 @@ class scoped_memory {
 
     void reset(void *data, std::size_t size, Alloc from);
 
-    // realloc allows the current data to escape hence the need for this call
-    // If realloc fails, destroys the original too and get() returns NULL.
-    void call_realloc(std::size_t to);
+    void *steal() {
+      void *ret = data_;
+      data_ = NULL;
+      size_ = 0;
+      source_ = NONE_ALLOCATED;
+      return ret;
+    }
 
   private:
     void *data_;
@@ -85,6 +104,30 @@ class scoped_memory {
     scoped_memory &operator=(const scoped_memory &);
 };
 
+extern const int kFileFlags;
+
+// Cross-platform, error-checking wrapper for mmap().
+void *MapOrThrow(std::size_t size, bool for_write, int flags, bool prefault, int fd, uint64_t offset = 0);
+
+// msync wrapper
+void SyncOrThrow(void *start, size_t length);
+
+// Cross-platform, error-checking wrapper for munmap().
+void UnmapOrThrow(void *start, size_t length);
+
+// Allocate memory, promising that all/vast majority of it will be used.  Tries
+// hard to use huge pages on Linux.
+// If you want zeroed memory, pass zeroed = true.
+void HugeMalloc(std::size_t size, bool zeroed, scoped_memory &to);
+
+// Reallocates memory ala realloc but with option to zero the new memory.
+// On Linux, the memory can come from anonymous mmap or malloc/calloc.
+// On non-Linux, only malloc/calloc is supported.
+//
+// To summarize, any memory from HugeMalloc or HugeRealloc can be resized with
+// this.
+void HugeRealloc(std::size_t size, bool new_zeroed, scoped_memory &mem);
+
 typedef enum {
   // mmap with no prepopulate
   LAZY,
@@ -98,25 +141,12 @@ typedef enum {
   PARALLEL_READ,
 } LoadMethod;
 
-extern const int kFileFlags;
-
-// Cross-platform, error-checking wrapper for mmap().
-void *MapOrThrow(std::size_t size, bool for_write, int flags, bool prefault, int fd, uint64_t offset = 0);
-
-// Cross-platform, error-checking wrapper for munmap().
-void UnmapOrThrow(void *start, size_t length);
-
 void MapRead(LoadMethod method, int fd, uint64_t offset, std::size_t size, scoped_memory &out);
 
-void MapAnonymous(std::size_t size, scoped_memory &to);
-
 // Open file name with mmap of size bytes, all of which are initially zero.
 void *MapZeroedWrite(int fd, std::size_t size);
 void *MapZeroedWrite(const char *name, std::size_t size, scoped_fd &file);
 
-// msync wrapper
-void SyncOrThrow(void *start, size_t length);
-
 // Forward rolling memory map with no overlap.
 class Rolling {
   public:
diff --git a/util/pool.cc b/util/pool.cc
index e0e4c61f6..246417c16 100644
--- a/util/pool.cc
+++ b/util/pool.cc
@@ -4,6 +4,8 @@
 
 #include <cstdlib>
 
+#include <algorithm>
+
 namespace util {
 
 Pool::Pool() {
diff --git a/util/probing_hash_table.hh b/util/probing_hash_table.hh
index 53fbbe996..d17016a4f 100644
--- a/util/probing_hash_table.hh
+++ b/util/probing_hash_table.hh
@@ -2,7 +2,7 @@
 #define UTIL_PROBING_HASH_TABLE_H
 
 #include "util/exception.hh"
-#include "util/scoped.hh"
+#include "util/mmap.hh"
 
 #include <algorithm>
 #include <cstddef>
@@ -336,9 +336,11 @@ template <class EntryT, class HashT, class EqualT = std::equal_to<typename Entry
     typedef EqualT Equal;
 
     AutoProbing(std::size_t initial_size = 5, const Key &invalid = Key(), const Hash &hash_func = Hash(), const Equal &equal_func = Equal()) :
-      allocated_(Backend::Size(initial_size, 1.5)), mem_(util::MallocOrThrow(allocated_)), backend_(mem_.get(), allocated_, invalid, hash_func, equal_func) {
-      threshold_ = initial_size * 1.2;
-      Clear();
+      allocated_(Backend::Size(initial_size, 1.2)), mem_(allocated_, KeyIsRawZero(invalid)), backend_(mem_.get(), allocated_, invalid, hash_func, equal_func) {
+      threshold_ = std::min<std::size_t>(backend_.buckets_ - 1, backend_.buckets_ * 0.9);
+      if (!KeyIsRawZero(invalid)) {
+        Clear();
+      }
     }
 
     // Assumes that the key is unique.  Multiple insertions won't cause a failure, just inconsistent lookup.
@@ -379,16 +381,23 @@ template <class EntryT, class HashT, class EqualT = std::equal_to<typename Entry
 
   private:
     void DoubleIfNeeded() {
-      if (Size() < threshold_)
+      if (UTIL_LIKELY(Size() < threshold_))
         return;
-      mem_.call_realloc(backend_.DoubleTo());
+      HugeRealloc(backend_.DoubleTo(), KeyIsRawZero(backend_.invalid_), mem_);
       allocated_ = backend_.DoubleTo();
-      backend_.Double(mem_.get());
-      threshold_ *= 2;
+      backend_.Double(mem_.get(), !KeyIsRawZero(backend_.invalid_));
+      threshold_ = std::min<std::size_t>(backend_.buckets_ - 1, backend_.buckets_ * 0.9);
+    }
+
+    bool KeyIsRawZero(const Key &key) {
+      for (const uint8_t *i = reinterpret_cast<const uint8_t*>(&key); i < reinterpret_cast<const uint8_t*>(&key) + sizeof(Key); ++i) {
+        if (*i) return false;
+      }
+      return true;
     }
 
     std::size_t allocated_;
-    util::scoped_malloc mem_;
+    util::scoped_memory mem_;
     Backend backend_;
     std::size_t threshold_;
 };
diff --git a/util/probing_hash_table_benchmark_main.cc b/util/probing_hash_table_benchmark_main.cc
index c5129480f..583d21f5e 100644
--- a/util/probing_hash_table_benchmark_main.cc
+++ b/util/probing_hash_table_benchmark_main.cc
@@ -1,6 +1,6 @@
 #include "util/file.hh"
 #include "util/probing_hash_table.hh"
-#include "util/scoped.hh"
+#include "util/mmap.hh"
 #include "util/usage.hh"
 
 #include <iostream>
@@ -46,11 +46,12 @@ struct PrefetchEntry {
   const Entry *pointer;
 };
 
-const std::size_t kPrefetchSize = 4;
-template <class Table> class PrefetchQueue {
+template <class TableT, unsigned PrefetchSize> class PrefetchQueue {
   public:
+    typedef TableT Table;
+
     explicit PrefetchQueue(Table &table) : table_(table), cur_(0), twiddle_(false) {
-      for (PrefetchEntry *i = entries_; i != entries_ + kPrefetchSize; ++i)
+      for (PrefetchEntry *i = entries_; i != entries_ + PrefetchSize; ++i)
         i->pointer = NULL;
     }
 
@@ -66,7 +67,7 @@ template <class Table> class PrefetchQueue {
 
     bool Drain() {
       if (Cur().pointer) {
-        for (PrefetchEntry *i = &Cur(); i < entries_ + kPrefetchSize; ++i) {
+        for (PrefetchEntry *i = &Cur(); i < entries_ + PrefetchSize; ++i) {
           twiddle_ ^= table_.FindFromIdeal(i->key, i->pointer);
         }
       }
@@ -80,11 +81,11 @@ template <class Table> class PrefetchQueue {
     PrefetchEntry &Cur() { return entries_[cur_]; }
     void Next() {
       ++cur_;
-      cur_ = cur_ % kPrefetchSize;
+      cur_ = cur_ % PrefetchSize;
     }
 
     Table &table_;
-    PrefetchEntry entries_[kPrefetchSize];
+    PrefetchEntry entries_[PrefetchSize];
     std::size_t cur_;
 
     bool twiddle_;
@@ -93,12 +94,23 @@ template <class Table> class PrefetchQueue {
     void operator=(const PrefetchQueue&);
 };
 
-/*template <class Table> class Immediate {
+template <class TableT> class Immediate {
   public:
+    typedef TableT Table;
+
+    explicit Immediate(Table &table) : table_(table), twiddle_(false) {}
+
+    void Add(uint64_t key) {
+      typename Table::ConstIterator it;
+      twiddle_ ^= table_.Find(key, it);
+    }
+
+    bool Drain() const { return twiddle_; }
 
   private:
     Table &table_;
-};*/
+    bool twiddle_;
+};
 
 std::size_t Size(uint64_t entries, float multiplier = 1.5) {
   typedef util::ProbingHashTable<Entry, util::IdentityHash, std::equal_to<Entry::Key>, Power2Mod> Table;
@@ -106,39 +118,54 @@ std::size_t Size(uint64_t entries, float multiplier = 1.5) {
   return Power2Mod::RoundBuckets(Table::Size(entries, multiplier) / sizeof(Entry)) * sizeof(Entry);
 }
 
-template <class Mod> bool Test(URandom &rn, uint64_t entries, const uint64_t *const queries_begin, const uint64_t *const queries_end, float multiplier = 1.5) {
-  typedef util::ProbingHashTable<Entry, util::IdentityHash, std::equal_to<Entry::Key>, Mod> Table;
+template <class Queue> bool Test(URandom &rn, uint64_t entries, const uint64_t *const queries_begin, const uint64_t *const queries_end, bool ordinary_malloc, float multiplier = 1.5) {
   std::size_t size = Size(entries, multiplier);
-  scoped_malloc backing(util::CallocOrThrow(size));
-  Table table(backing.get(), size);
+  scoped_memory backing;
+  if (ordinary_malloc) {
+    backing.reset(util::CallocOrThrow(size), size, scoped_memory::MALLOC_ALLOCATED);
+  } else {
+    util::HugeMalloc(size, true, backing);
+  }
+  typename Queue::Table table(backing.get(), size);
 
-  double start = UserTime();
+  double start = CPUTime();
   for (uint64_t i = 0; i < entries; ++i) {
     Entry entry;
     entry.key = rn.Get();
     table.Insert(entry);
   }
-  double inserted = UserTime() - start;
-  double before_lookup = UserTime();
-  PrefetchQueue<Table> queue(table);
+  double inserted = CPUTime() - start;
+  double before_lookup = CPUTime();
+  Queue queue(table);
   for (const uint64_t *i = queries_begin; i != queries_end; ++i) {
     queue.Add(*i);
-/*    typename Table::ConstIterator it;
-    meaningless ^= table.Find(*i, it);*/
   }
   bool meaningless = queue.Drain();
-  std::cout << entries << ' ' << size << ' ' << (inserted / static_cast<double>(entries)) << ' ' << (UserTime() - before_lookup) / static_cast<double>(queries_end - queries_begin) << '\n';
+  std::cout << ' ' << (inserted / static_cast<double>(entries)) << ' ' << (CPUTime() - before_lookup) / static_cast<double>(queries_end - queries_begin) << std::flush;
   return meaningless;
 }
 
-template <class Mod> bool TestRun(uint64_t lookups = 20000000, float multiplier = 1.5) {
+bool TestRun(uint64_t lookups = 20000000, float multiplier = 1.5) {
   URandom rn;
-  util::scoped_malloc queries(util::CallocOrThrow(lookups * sizeof(uint64_t)));
+  util::scoped_memory queries;
+  HugeMalloc(lookups * sizeof(uint64_t), true, queries);
   rn.Batch(static_cast<uint64_t*>(queries.get()), static_cast<uint64_t*>(queries.get()) + lookups);
   uint64_t physical_mem_limit = util::GuessPhysicalMemory() / 2;
   bool meaningless = true;
   for (uint64_t i = 4; Size(i / multiplier) < physical_mem_limit; i *= 4) {
-    meaningless ^= util::Test<Mod>(rn, i / multiplier, static_cast<const uint64_t*>(queries.get()), static_cast<const uint64_t*>(queries.get()) + lookups, multiplier);
+    std::cout << static_cast<std::size_t>(i / multiplier) << ' ' << Size(i / multiplier);
+    typedef util::ProbingHashTable<Entry, util::IdentityHash, std::equal_to<Entry::Key>, Power2Mod> Table;
+    typedef util::ProbingHashTable<Entry, util::IdentityHash, std::equal_to<Entry::Key>, DivMod> TableDiv;
+    const uint64_t *const queries_begin = static_cast<const uint64_t*>(queries.get());
+    meaningless ^= util::Test<Immediate<TableDiv> >(rn, i / multiplier, queries_begin, queries_begin + lookups, true, multiplier);
+    meaningless ^= util::Test<Immediate<Table> >(rn, i / multiplier, queries_begin, queries_begin + lookups, true, multiplier);
+    meaningless ^= util::Test<PrefetchQueue<Table, 4> >(rn, i / multiplier, queries_begin, queries_begin + lookups, true, multiplier);
+    meaningless ^= util::Test<Immediate<Table> >(rn, i / multiplier, queries_begin, queries_begin + lookups, false, multiplier);
+    meaningless ^= util::Test<PrefetchQueue<Table, 2> >(rn, i / multiplier, queries_begin, queries_begin + lookups, false, multiplier);
+    meaningless ^= util::Test<PrefetchQueue<Table, 4> >(rn, i / multiplier, queries_begin, queries_begin + lookups, false, multiplier);
+    meaningless ^= util::Test<PrefetchQueue<Table, 8> >(rn, i / multiplier, queries_begin, queries_begin + lookups, false, multiplier);
+    meaningless ^= util::Test<PrefetchQueue<Table, 16> >(rn, i / multiplier, queries_begin, queries_begin + lookups, false, multiplier);
+    std::cout << std::endl;
   }
   return meaningless;
 }
@@ -148,9 +175,7 @@ template <class Mod> bool TestRun(uint64_t lookups = 20000000, float multiplier
 
 int main() {
   bool meaningless = false;
-  std::cout << "#Integer division\n";
-  meaningless ^= util::TestRun<util::DivMod>();
-  std::cout << "#Masking\n";
-  meaningless ^= util::TestRun<util::Power2Mod>();
+  std::cout << "#CPU time\n";
+  meaningless ^= util::TestRun();
   std::cerr << "Meaningless: " << meaningless << '\n';
 }
diff --git a/util/scoped.cc b/util/scoped.cc
index 84f4344b7..817aa2424 100644
--- a/util/scoped.cc
+++ b/util/scoped.cc
@@ -27,7 +27,7 @@ void *MallocOrThrow(std::size_t requested) {
 }
 
 void *CallocOrThrow(std::size_t requested) {
-  return InspectAddr(std::calloc(1, requested), requested, "calloc");
+  return InspectAddr(std::calloc(requested, 1), requested, "calloc");
 }
 
 void scoped_malloc::call_realloc(std::size_t requested) {
diff --git a/util/stream/rewindable_stream.cc b/util/stream/rewindable_stream.cc
index 9421b25c3..726e2a72d 100644
--- a/util/stream/rewindable_stream.cc
+++ b/util/stream/rewindable_stream.cc
@@ -1,6 +1,5 @@
 #include "util/stream/rewindable_stream.hh"
 #include "util/pcqueue.hh"
-#include <iostream>
 
 #include <iostream>
 
diff --git a/util/string_stream.hh b/util/string_stream.hh
new file mode 100644
index 000000000..730403d70
--- /dev/null
+++ b/util/string_stream.hh
@@ -0,0 +1,44 @@
+#ifndef UTIL_STRING_STREAM_H
+#define UTIL_STRING_STREAM_H
+
+#include "util/fake_ostream.hh"
+
+#include <cassert>
+#include <string>
+
+namespace util {
+
+class StringStream : public FakeOStream<StringStream> {
+  public:
+    // Semantics: appends to string.  Remember to clear first!
+    explicit StringStream(std::string &out)
+      : out_(out) {}
+
+    StringStream &flush() { return *this; }
+
+    StringStream &write(const void *data, std::size_t length) {
+      out_.append(static_cast<const char*>(data), length);
+      return *this;
+    }
+
+  protected:
+    friend class FakeOStream<StringStream>;
+    char *Ensure(std::size_t amount) {
+      std::size_t current = out_.size();
+      out_.resize(out_.size() + amount);
+      return &out_[current];
+    }
+
+    void AdvanceTo(char *to) {
+      assert(to <= &*out_.end());
+      assert(to >= &*out_.begin());
+      out_.resize(to - &*out_.begin());
+    }
+
+  private:
+    std::string &out_;
+};
+
+} // namespace
+
+#endif // UTIL_STRING_STREAM_H
diff --git a/util/string_stream_test.cc b/util/string_stream_test.cc
new file mode 100644
index 000000000..7d25711c2
--- /dev/null
+++ b/util/string_stream_test.cc
@@ -0,0 +1,57 @@
+#define BOOST_LEXICAL_CAST_ASSUME_C_LOCALE
+#define BOOST_TEST_MODULE FakeOStreamTest
+
+#include "util/string_stream.hh"
+#include <boost/test/unit_test.hpp>
+#include <boost/lexical_cast.hpp>
+
+#include <cstddef>
+#include <limits>
+
+namespace util { namespace {
+
+template <class T> void TestEqual(const T value) {
+  std::string str;
+  StringStream(str) << value;
+  BOOST_CHECK_EQUAL(boost::lexical_cast<std::string>(value), str);
+}
+
+template <class T> void TestCorners() {
+  TestEqual(std::numeric_limits<T>::max());
+  TestEqual(std::numeric_limits<T>::min());
+  TestEqual(static_cast<T>(0));
+  TestEqual(static_cast<T>(-1));
+  TestEqual(static_cast<T>(1));
+}
+
+BOOST_AUTO_TEST_CASE(Integer) {
+  TestCorners<char>();
+  TestCorners<signed char>();
+  TestCorners<unsigned char>();
+
+  TestCorners<short>();
+  TestCorners<signed short>();
+  TestCorners<unsigned short>();
+
+  TestCorners<int>();
+  TestCorners<unsigned int>();
+  TestCorners<signed int>();
+
+  TestCorners<long>();
+  TestCorners<unsigned long>();
+  TestCorners<signed long>();
+
+  TestCorners<long long>();
+  TestCorners<unsigned long long>();
+  TestCorners<signed long long>();
+
+  TestCorners<std::size_t>();
+}
+
+enum TinyEnum { EnumValue };
+
+BOOST_AUTO_TEST_CASE(EnumCase) {
+  TestEqual(EnumValue);
+}
+
+}} // namespaces
diff --git a/util/usage.cc b/util/usage.cc
index 5f66b17d2..7965e9645 100644
--- a/util/usage.cc
+++ b/util/usage.cc
@@ -135,14 +135,26 @@ double WallTime() {
   return Subtract(GetWall(), kRecordStart.Started());
 }
 
-double UserTime() {
-#if !defined(_WIN32) && !defined(_WIN64)
+double CPUTime() {
+#if defined(_WIN32) || defined(_WIN64)
+  return 0.0;
+#else
   struct rusage usage;
   if (getrusage(RUSAGE_SELF, &usage))
     return 0.0;
-  return DoubleSec(usage.ru_utime);
+  return DoubleSec(usage.ru_utime) + DoubleSec(usage.ru_stime);
+#endif
+}
+
+uint64_t RSSMax() {
+#if defined(_WIN32) || defined(_WIN64)
+  return 0;
+#else
+  struct rusage usage;
+  if (getrusage(RUSAGE_SELF, &usage))
+    return 0;
+  return static_cast<uint64_t>(usage.ru_maxrss) * 1024;
 #endif
-  return 0.0;
 }
 
 void PrintUsage(std::ostream &out) {
@@ -274,6 +286,7 @@ template <class Num> uint64_t ParseNum(const std::string &arg) {
     return static_cast<uint64_t>(static_cast<double>(value) * static_cast<double>(mem) / 100.0);
   }
 
+  if (after == "k") after == "K";
   std::string units("bKMGTPEZY");
   std::string::size_type index = units.find(after[0]);
   UTIL_THROW_IF_ARG(index == std::string::npos, SizeParseError, (arg), "the allowed suffixes are " << units << "%.");
diff --git a/util/usage.hh b/util/usage.hh
index dff81b59d..2f1b3e992 100644
--- a/util/usage.hh
+++ b/util/usage.hh
@@ -9,7 +9,11 @@ namespace util {
 // Time in seconds since process started.  Zero on unsupported platforms.
 double WallTime();
 
-double UserTime();
+// User + system time.
+double CPUTime();
+
+// Resident usage in bytes.
+uint64_t RSSMax();
 
 void PrintUsage(std::ostream &to);