KenLM 0e5d259 including read_compressed fix

2024-12-26 13:23:25 +03:00 · 2013-01-04 21:02:47 +00:00 · 2013-01-04 21:02:47 +00:00 · f9ee7ae4b3
commit f9ee7ae4b3
parent 3203f7c92d
14 changed files with 111 additions and 98 deletions
--- a/lm/binary_format.cc
+++ b/lm/binary_format.cc
@ -16,11 +16,11 @@ namespace ngram {
 namespace {
 const char kMagicBeforeVersion[] = "mmap lm http://kheafield.com/code format version";
 const char kMagicBytes[] = "mmap lm http://kheafield.com/code format version 5\n\0";
-// This must be shorter than kMagicBytes and indicates an incomplete binary file (i.e. build failed). 
+// This must be shorter than kMagicBytes and indicates an incomplete binary file (i.e. build failed).
 const char kMagicIncomplete[] = "mmap lm http://kheafield.com/code incomplete\n";
 const long int kMagicVersion = 5;

-// Old binary files built on 32-bit machines have this header.  
+// Old binary files built on 32-bit machines have this header.
 // TODO: eliminate with next binary release.
 struct OldSanity {
  char magic[sizeof(kMagicBytes)];
@ -39,7 +39,7 @@ struct OldSanity {
 };


-// Test values aligned to 8 bytes.    
+// Test values aligned to 8 bytes.
 struct Sanity {
  char magic[ALIGN8(sizeof(kMagicBytes))];
  float zero_f, one_f, minus_half_f;
@ -101,7 +101,7 @@ uint8_t *SetupJustVocab(const Config &config, uint8_t order, std::size_t memory_
 uint8_t *GrowForSearch(const Config &config, std::size_t vocab_pad, std::size_t memory_size, Backing &backing) {
  std::size_t adjusted_vocab = backing.vocab.size() + vocab_pad;
  if (config.write_mmap) {
-    // Grow the file to accomodate the search, using zeros.  
+    // Grow the file to accomodate the search, using zeros.
    try {
      util::ResizeOrThrow(backing.file.get(), adjusted_vocab + memory_size);
    } catch (util::ErrnoException &e) {
@ -114,7 +114,7 @@ uint8_t *GrowForSearch(const Config &config, std::size_t vocab_pad, std::size_t
      return reinterpret_cast<uint8_t*>(backing.search.get());
    }
    // mmap it now.
-    // We're skipping over the header and vocab for the search space mmap.  mmap likes page aligned offsets, so some arithmetic to round the offset down.  
+    // We're skipping over the header and vocab for the search space mmap.  mmap likes page aligned offsets, so some arithmetic to round the offset down.
    std::size_t page_size = util::SizePage();
    std::size_t alignment_cruft = adjusted_vocab % page_size;
    backing.search.reset(util::MapOrThrow(alignment_cruft + memory_size, true, util::kFileFlags, false, backing.file.get(), adjusted_vocab - alignment_cruft), alignment_cruft + memory_size, util::scoped_memory::MMAP_ALLOCATED);
@ -122,7 +122,7 @@ uint8_t *GrowForSearch(const Config &config, std::size_t vocab_pad, std::size_t
  } else {
    util::MapAnonymous(memory_size, backing.search);
    return reinterpret_cast<uint8_t*>(backing.search.get());
-  } 
+  }
 }

 void FinishFile(const Config &config, ModelType model_type, unsigned int search_version, const std::vector<uint64_t> &counts, std::size_t vocab_pad, Backing &backing) {
@ -140,7 +140,7 @@ void FinishFile(const Config &config, ModelType model_type, unsigned int search_
      util::FSyncOrThrow(backing.file.get());
      break;
  }
-  // header and vocab share the same mmap.  The header is written here because we know the counts.  
+  // header and vocab share the same mmap.  The header is written here because we know the counts.
  Parameters params = Parameters();
  params.counts = counts;
  params.fixed.order = counts.size();
@ -160,7 +160,7 @@ namespace detail {
 bool IsBinaryFormat(int fd) {
  const uint64_t size = util::SizeFile(fd);
  if (size == util::kBadSize || (size <= static_cast<uint64_t>(sizeof(Sanity)))) return false;
-  // Try reading the header.  
+  // Try reading the header.
  util::scoped_memory memory;
  try {
    util::MapRead(util::LAZY, fd, 0, sizeof(Sanity), memory);
@ -214,7 +214,7 @@ void SeekPastHeader(int fd, const Parameters &params) {

 uint8_t *SetupBinary(const Config &config, const Parameters &params, uint64_t memory_size, Backing &backing) {
  const uint64_t file_size = util::SizeFile(backing.file.get());
-  // The header is smaller than a page, so we have to map the whole header as well.  
+  // The header is smaller than a page, so we have to map the whole header as well.
  std::size_t total_map = util::CheckOverflow(TotalHeaderSize(params.counts.size()) + memory_size);
  if (file_size != util::kBadSize && static_cast<uint64_t>(file_size) < total_map)
    UTIL_THROW(FormatLoadException, "Binary file has size " << file_size << " but the headers say it should be at least " << total_map);
@ -233,7 +233,8 @@ void ComplainAboutARPA(const Config &config, ModelType model_type) {
  if (config.write_mmap || !config.messages) return;
  if (config.arpa_complain == Config::ALL) {
    *config.messages << "Loading the LM will be faster if you build a binary file." << std::endl;
-  } else if (config.arpa_complain == Config::EXPENSIVE && model_type == TRIE_SORTED) {
+  } else if (config.arpa_complain == Config::EXPENSIVE &&
+             (model_type == TRIE || model_type == QUANT_TRIE || model_type == ARRAY_TRIE || model_type == QUANT_ARRAY_TRIE)) {
    *config.messages << "Building " << kModelNames[model_type] << " from ARPA is expensive.  Save time by building a binary format." << std::endl;
  }
 }
--- a/lm/config.cc
+++ b/lm/config.cc
@ -6,6 +6,7 @@ namespace lm {
 namespace ngram {

 Config::Config() :
+  show_progress(true),
  messages(&std::cerr),
  enumerate_vocab(NULL),
  unknown_missing(COMPLAIN),
--- a/lm/config.hh
+++ b/lm/config.hh
@ -11,46 +11,52 @@
 /* Configuration for ngram model.  Separate header to reduce pollution. */

 namespace lm {
-  
+
 class EnumerateVocab;

 namespace ngram {

 struct Config {
-  // EFFECTIVE FOR BOTH ARPA AND BINARY READS 
+  // EFFECTIVE FOR BOTH ARPA AND BINARY READS
+
+  // (default true) print progress bar to messages
+  bool show_progress;

  // Where to log messages including the progress bar.  Set to NULL for
  // silence.
  std::ostream *messages;

+  std::ostream *ProgressMessages() const {
+    return show_progress ? messages : 0;
+  }
+
  // This will be called with every string in the vocabulary.  See
  // enumerate_vocab.hh for more detail.  Config does not take ownership; you
-  // are still responsible for deleting it (or stack allocating).  
+  // are still responsible for deleting it (or stack allocating).
  EnumerateVocab *enumerate_vocab;


-
  // ONLY EFFECTIVE WHEN READING ARPA

-  // What to do when <unk> isn't in the provided model. 
+  // What to do when <unk> isn't in the provided model.
  WarningAction unknown_missing;
-  // What to do when <s> or </s> is missing from the model. 
-  // If THROW_UP, the exception will be of type util::SpecialWordMissingException.  
+  // What to do when <s> or </s> is missing from the model.
+  // If THROW_UP, the exception will be of type util::SpecialWordMissingException.
  WarningAction sentence_marker_missing;

  // What to do with a positive log probability.  For COMPLAIN and SILENT, map
-  // to 0.  
+  // to 0.
  WarningAction positive_log_probability;

-  // The probability to substitute for <unk> if it's missing from the model.  
+  // The probability to substitute for <unk> if it's missing from the model.
  // No effect if the model has <unk> or unknown_missing == THROW_UP.
  float unknown_missing_logprob;

  // Size multiplier for probing hash table.  Must be > 1.  Space is linear in
  // this.  Time is probing_multiplier / (probing_multiplier - 1).  No effect
-  // for sorted variant.  
+  // for sorted variant.
  // If you find yourself setting this to a low number, consider using the
-  // TrieModel which has lower memory consumption.  
+  // TrieModel which has lower memory consumption.
  float probing_multiplier;

  // Amount of memory to use for building.  The actual memory usage will be
@ -58,10 +64,10 @@ struct Config {
  // models.
  std::size_t building_memory;

-  // Template for temporary directory appropriate for passing to mkdtemp.  
+  // Template for temporary directory appropriate for passing to mkdtemp.
  // The characters XXXXXX are appended before passing to mkdtemp.  Only
  // applies to trie.  If NULL, defaults to write_mmap.  If that's NULL,
-  // defaults to input file name.  
+  // defaults to input file name.
  const char *temporary_directory_prefix;

  // Level of complaining to do when loading from ARPA instead of binary format.
@ -69,49 +75,46 @@ struct Config {
  ARPALoadComplain arpa_complain;

  // While loading an ARPA file, also write out this binary format file.  Set
-  // to NULL to disable.  
+  // to NULL to disable.
  const char *write_mmap;

  enum WriteMethod {
-    WRITE_MMAP, // Map the file directly.  
-    WRITE_AFTER // Write after we're done.  
+    WRITE_MMAP, // Map the file directly.
+    WRITE_AFTER // Write after we're done.
  };
  WriteMethod write_method;

-  // Include the vocab in the binary file?  Only effective if write_mmap != NULL.  
+  // Include the vocab in the binary file?  Only effective if write_mmap != NULL.
  bool include_vocab;


-  // Left rest options.  Only used when the model includes rest costs.  
+  // Left rest options.  Only used when the model includes rest costs.
  enum RestFunction {
    REST_MAX,   // Maximum of any score to the left
-    REST_LOWER, // Use lower-order files given below.  
+    REST_LOWER, // Use lower-order files given below.
  };
  RestFunction rest_function;
-  // Only used for REST_LOWER.  
+  // Only used for REST_LOWER.
  std::vector<std::string> rest_lower_files;


-
  // Quantization options.  Only effective for QuantTrieModel.  One value is
  // reserved for each of prob and backoff, so 2^bits - 1 buckets will be used
-  // to quantize (and one of the remaining backoffs will be 0).  
+  // to quantize (and one of the remaining backoffs will be 0).
  uint8_t prob_bits, backoff_bits;

  // Bhiksha compression (simple form).  Only works with trie.
  uint8_t pointer_bhiksha_bits;

-  
-  
+
  // ONLY EFFECTIVE WHEN READING BINARY
-  
+
  // How to get the giant array into memory: lazy mmap, populate, read etc.
-  // See util/mmap.hh for details of MapMethod.  
+  // See util/mmap.hh for details of MapMethod.
  util::LoadMethod load_method;


-
-  // Set defaults. 
+  // Set defaults.
  Config();
 };

--- a/lm/model.cc
+++ b/lm/model.cc
@ -37,7 +37,7 @@ template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT
 template <class Search, class VocabularyT> GenericModel<Search, VocabularyT>::GenericModel(const char *file, const Config &config) {
  LoadLM(file, config, *this);

-  // g++ prints warnings unless these are fully initialized.  
+  // g++ prints warnings unless these are fully initialized.
  State begin_sentence = State();
  begin_sentence.length = 1;
  begin_sentence.words[0] = vocab_.BeginSentence();
@ -69,8 +69,8 @@ template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT
 }

 template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT>::InitializeFromARPA(const char *file, const Config &config) {
-  // Backing file is the ARPA.  Steal it so we can make the backing file the mmap output if any.  
-  util::FilePiece f(backing_.file.release(), file, config.messages);
+  // Backing file is the ARPA.  Steal it so we can make the backing file the mmap output if any.
+  util::FilePiece f(backing_.file.release(), file, config.ProgressMessages());
  try {
    std::vector<uint64_t> counts;
    // File counts do not include pruned trigrams that extend to quadgrams etc.   These will be fixed by search_.
@ -80,7 +80,7 @@ template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT
    if (config.probing_multiplier <= 1.0) UTIL_THROW(ConfigException, "probing multiplier must be > 1.0");

    std::size_t vocab_size = util::CheckOverflow(VocabularyT::Size(counts[0], config));
-    // Setup the binary file for writing the vocab lookup table.  The search_ is responsible for growing the binary file to its needs.  
+    // Setup the binary file for writing the vocab lookup table.  The search_ is responsible for growing the binary file to its needs.
    vocab_.SetupMemory(SetupJustVocab(config, counts.size(), vocab_size, backing_), vocab_size, counts[0], config);

    if (config.write_mmap) {
@ -95,7 +95,7 @@ template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT

    if (!vocab_.SawUnk()) {
      assert(config.unknown_missing != THROW_UP);
-      // Default probabilities for unknown.  
+      // Default probabilities for unknown.
      search_.UnknownUnigram().backoff = 0.0;
      search_.UnknownUnigram().prob = config.unknown_missing_logprob;
    }
@ -147,7 +147,7 @@ template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search,
 }

 template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT>::GetState(const WordIndex *context_rbegin, const WordIndex *context_rend, State &out_state) const {
-  // Generate a state from context.  
+  // Generate a state from context.
  context_rend = std::min(context_rend, context_rbegin + P::Order() - 1);
  if (context_rend == context_rbegin) {
    out_state.length = 0;
@ -191,7 +191,7 @@ template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search,
    ret.rest = ptr.Rest();
    ret.prob = ptr.Prob();
    ret.extend_left = extend_pointer;
-    // If this function is called, then it does depend on left words.   
+    // If this function is called, then it does depend on left words.
    ret.independent_left = false;
  }
  float subtract_me = ret.rest;
@ -199,7 +199,7 @@ template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search,
  next_use = extend_length;
  ResumeScore(add_rbegin, add_rend, extend_length - 1, node, backoff_out, next_use, ret);
  next_use -= extend_length;
-  // Charge backoffs.  
+  // Charge backoffs.
  for (const float *b = backoff_in + ret.ngram_length - extend_length; b < backoff_in + (add_rend - add_rbegin); ++b) ret.prob += *b;
  ret.prob -= subtract_me;
  ret.rest -= subtract_me;
@ -209,7 +209,7 @@ template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search,
 namespace {
 // Do a paraonoid copy of history, assuming new_word has already been copied
 // (hence the -1).  out_state.length could be zero so I avoided using
-// std::copy.   
+// std::copy.
 void CopyRemainingHistory(const WordIndex *from, State &out_state) {
  WordIndex *out = out_state.words + 1;
  const WordIndex *in_end = from + static_cast<ptrdiff_t>(out_state.length) - 1;
@ -217,10 +217,10 @@ void CopyRemainingHistory(const WordIndex *from, State &out_state) {
 }
 } // namespace

-/* Ugly optimized function.  Produce a score excluding backoff.  
- * The search goes in increasing order of ngram length.  
+/* Ugly optimized function.  Produce a score excluding backoff.
+ * The search goes in increasing order of ngram length.
 * Context goes backward, so context_begin is the word immediately preceeding
- * new_word.  
+ * new_word.
 */
 template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search, VocabularyT>::ScoreExceptBackoff(
    const WordIndex *const context_rbegin,
@ -229,7 +229,7 @@ template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search,
    State &out_state) const {
  assert(new_word < vocab_.Bound());
  FullScoreReturn ret;
-  // ret.ngram_length contains the last known non-blank ngram length.  
+  // ret.ngram_length contains the last known non-blank ngram length.
  ret.ngram_length = 1;

  typename Search::Node node;
@ -238,9 +238,9 @@ template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search,
  ret.prob = uni.Prob();
  ret.rest = uni.Rest();

-  // This is the length of the context that should be used for continuation to the right.  
+  // This is the length of the context that should be used for continuation to the right.
  out_state.length = HasExtension(out_state.backoff[0]) ? 1 : 0;
-  // We'll write the word anyway since it will probably be used and does no harm being there.  
+  // We'll write the word anyway since it will probably be used and does no harm being there.
  out_state.words[0] = new_word;
  if (context_rbegin == context_rend) return ret;

--- a/lm/search_trie.cc
+++ b/lm/search_trie.cc
@ -55,7 +55,7 @@ struct ProbPointer {
  uint64_t index;
 };

-// Array of n-grams and float indices.  
+// Array of n-grams and float indices.
 class BackoffMessages {
  public:
    void Init(std::size_t entry_size) {
@ -100,7 +100,7 @@ class BackoffMessages {
    void Apply(float *const *const base, RecordReader &reader) {
      FinishedAdding();
      if (current_ == allocated_) return;
-      // We'll also use the same buffer to record messages to blanks that they extend.  
+      // We'll also use the same buffer to record messages to blanks that they extend.
      WordIndex *extend_out = reinterpret_cast<WordIndex*>(current_);
      const unsigned char order = (entry_size_ - sizeof(ProbPointer)) / sizeof(WordIndex);
      for (reader.Rewind(); reader && (current_ != allocated_); ) {
@ -109,7 +109,7 @@ class BackoffMessages {
            ++reader;
            break;
          case 1:
-            // Message but nobody to receive it.  Write it down at the beginning of the buffer so we can inform this blank that it extends.  
+            // Message but nobody to receive it.  Write it down at the beginning of the buffer so we can inform this blank that it extends.
            for (const WordIndex *w = reinterpret_cast<const WordIndex *>(current_); w != reinterpret_cast<const WordIndex *>(current_) + order; ++w, ++extend_out) *extend_out = *w;
            current_ += entry_size_;
            break;
@ -126,7 +126,7 @@ class BackoffMessages {
            break;
        }
      }
-      // Now this is a list of blanks that extend right.  
+      // Now this is a list of blanks that extend right.
      entry_size_ = sizeof(WordIndex) * order;
      Resize(sizeof(WordIndex) * (extend_out - (const WordIndex*)backing_.get()));
      current_ = (uint8_t*)backing_.get();
@ -153,7 +153,7 @@ class BackoffMessages {
  private:
    void FinishedAdding() {
      Resize(current_ - (uint8_t*)backing_.get());
-      // Sort requests in same order as files.  
+      // Sort requests in same order as files.
      std::sort(
          util::SizedIterator(util::SizedProxy(backing_.get(), entry_size_)),
          util::SizedIterator(util::SizedProxy(current_, entry_size_)),
@ -220,7 +220,7 @@ class SRISucks {
    }

  private:
-    // This used to be one array.  Then I needed to separate it by order for quantization to work.  
+    // This used to be one array.  Then I needed to separate it by order for quantization to work.
    std::vector<float> values_[KENLM_MAX_ORDER - 1];
    BackoffMessages messages_[KENLM_MAX_ORDER - 1];

@ -253,7 +253,7 @@ class FindBlanks {
      ++counts_.back();
    }

-    // Unigrams wrote one past.  
+    // Unigrams wrote one past.
    void Cleanup() {
      --counts_[0];
    }
@ -270,15 +270,15 @@ class FindBlanks {
    SRISucks &sri_;
 };

-// Phase to actually write n-grams to the trie.  
+// Phase to actually write n-grams to the trie.
 template <class Quant, class Bhiksha> class WriteEntries {
  public:
-    WriteEntries(RecordReader *contexts, const Quant &quant, UnigramValue *unigrams, BitPackedMiddle<Bhiksha> *middle, BitPackedLongest &longest, unsigned char order, SRISucks &sri) : 
+    WriteEntries(RecordReader *contexts, const Quant &quant, UnigramValue *unigrams, BitPackedMiddle<Bhiksha> *middle, BitPackedLongest &longest, unsigned char order, SRISucks &sri) :
      contexts_(contexts),
      quant_(quant),
      unigrams_(unigrams),
      middle_(middle),
-      longest_(longest), 
+      longest_(longest),
      bigram_pack_((order == 2) ? static_cast<BitPacked&>(longest_) : static_cast<BitPacked&>(*middle_)),
      order_(order),
      sri_(sri) {}
@ -328,7 +328,7 @@ struct Gram {

  const WordIndex *begin, *end;

-  // For queue, this is the direction we want.  
+  // For queue, this is the direction we want.
  bool operator<(const Gram &other) const {
    return std::lexicographical_compare(other.begin, other.end, begin, end);
  }
@ -353,7 +353,7 @@ template <class Doing> class BlankManager {
        been_length_ = length;
        return;
      }
-      // There are blanks to insert starting with order blank.  
+      // There are blanks to insert starting with order blank.
      unsigned char blank = cur - to + 1;
      UTIL_THROW_IF(blank == 1, FormatLoadException, "Missing a unigram that appears as context.");
      const float *lower_basis;
@ -363,7 +363,7 @@ template <class Doing> class BlankManager {
        assert(*lower_basis != kBadProb);
        doing_.MiddleBlank(blank, to, based_on, *lower_basis);
        *pre = *cur;
-        // Mark that the probability is a blank so it shouldn't be used as the basis for a later n-gram.  
+        // Mark that the probability is a blank so it shouldn't be used as the basis for a later n-gram.
        basis_[blank - 1] = kBadProb;
      }
      *pre = *cur;
@ -377,7 +377,7 @@ template <class Doing> class BlankManager {
    unsigned char been_length_;

    float basis_[KENLM_MAX_ORDER];
-    
+
    Doing &doing_;
 };

@ -451,7 +451,7 @@ template <class Quant> void TrainProbQuantizer(uint8_t order, uint64_t count, Re
 }

 void PopulateUnigramWeights(FILE *file, WordIndex unigram_count, RecordReader &contexts, UnigramValue *unigrams) {
-  // Fill unigram probabilities.  
+  // Fill unigram probabilities.
  try {
    rewind(file);
    for (WordIndex i = 0; i < unigram_count; ++i) {
@ -486,7 +486,7 @@ template <class Quant, class Bhiksha> void BuildTrie(SortedFiles &files, std::ve
    util::scoped_memory unigrams;
    MapRead(util::POPULATE_OR_READ, unigram_fd.get(), 0, counts[0] * sizeof(ProbBackoff), unigrams);
    FindBlanks finder(counts.size(), reinterpret_cast<const ProbBackoff*>(unigrams.get()), sri);
-    RecursiveInsert(counts.size(), counts[0], inputs, config.messages, "Identifying n-grams omitted by SRI", finder);
+    RecursiveInsert(counts.size(), counts[0], inputs, config.ProgressMessages(), "Identifying n-grams omitted by SRI", finder);
    fixed_counts = finder.Counts();
  }
  unigram_file.reset(util::FDOpenOrThrow(unigram_fd));
@ -504,7 +504,8 @@ template <class Quant, class Bhiksha> void BuildTrie(SortedFiles &files, std::ve
    inputs[i-2].Rewind();
  }
  if (Quant::kTrain) {
-    util::ErsatzProgress progress(std::accumulate(counts.begin() + 1, counts.end(), 0), config.messages, "Quantizing");
+    util::ErsatzProgress progress(std::accumulate(counts.begin() + 1, counts.end(), 0),
+                                  config.ProgressMessages(), "Quantizing");
    for (unsigned char i = 2; i < counts.size(); ++i) {
      TrainQuantizer(i, counts[i-1], sri.Values(i), inputs[i-2], progress, quant);
    }
@ -519,13 +520,13 @@ template <class Quant, class Bhiksha> void BuildTrie(SortedFiles &files, std::ve
  for (unsigned char i = 2; i <= counts.size(); ++i) {
    inputs[i-2].Rewind();
  }
-  // Fill entries except unigram probabilities.  
+  // Fill entries except unigram probabilities.
  {
    WriteEntries<Quant, Bhiksha> writer(contexts, quant, unigrams, out.middle_begin_, out.longest_, counts.size(), sri);
-    RecursiveInsert(counts.size(), counts[0], inputs, config.messages, "Writing trie", writer);
+    RecursiveInsert(counts.size(), counts[0], inputs, config.ProgressMessages(), "Writing trie", writer);
  }

-  // Do not disable this error message or else too little state will be returned.  Both WriteEntries::Middle and returning state based on found n-grams will need to be fixed to handle this situation.   
+  // Do not disable this error message or else too little state will be returned.  Both WriteEntries::Middle and returning state based on found n-grams will need to be fixed to handle this situation.
  for (unsigned char order = 2; order <= counts.size(); ++order) {
    const RecordReader &context = contexts[order - 2];
    if (context) {
@ -541,13 +542,13 @@ template <class Quant, class Bhiksha> void BuildTrie(SortedFiles &files, std::ve
  }

  /* Set ending offsets so the last entry will be sized properly */
-  // Last entry for unigrams was already set.  
+  // Last entry for unigrams was already set.
  if (out.middle_begin_ != out.middle_end_) {
    for (typename TrieSearch<Quant, Bhiksha>::Middle *i = out.middle_begin_; i != out.middle_end_ - 1; ++i) {
      i->FinishedLoading((i+1)->InsertIndex(), config);
    }
    (out.middle_end_ - 1)->FinishedLoading(out.longest_.InsertIndex(), config);
-  }  
+  }
 }

 template <class Quant, class Bhiksha> uint8_t *TrieSearch<Quant, Bhiksha>::SetupMemory(uint8_t *start, const std::vector<uint64_t> &counts, const Config &config) {
@ -595,7 +596,7 @@ template <class Quant, class Bhiksha> void TrieSearch<Quant, Bhiksha>::Initializ
  } else {
    temporary_prefix = file;
  }
-  // At least 1MB sorting memory.  
+  // At least 1MB sorting memory.
  SortedFiles sorted(config, f, counts, std::max<size_t>(config.building_memory, 1048576), temporary_prefix, vocab);

  BuildTrie(sorted, counts, config, *this, quant_, vocab, backing);
--- a/util/ersatz_progress.cc
+++ b/util/ersatz_progress.cc
@ -38,7 +38,7 @@ void ErsatzProgress::Milestone() {
    next_ = std::numeric_limits<uint64_t>::max();
    out_ = NULL;
  } else {
-    next_ = std::max(next_, (stone * complete_) / kWidth);
+    next_ = std::max(next_, ((stone + 1) * complete_ + kWidth - 1) / kWidth);
  }
 }

--- a/util/ersatz_progress.hh
+++ b/util/ersatz_progress.hh
@ -32,7 +32,6 @@ class ErsatzProgress {

    void Set(uint64_t to) {
      if ((current_ = to) >= next_) Milestone();
-      Milestone();
    }

    void Finished() {
--- a/util/file.cc
+++ b/util/file.cc
@ -1,3 +1,5 @@
+#define _LARGEFILE64_SOURCE
+
 #include "util/file.hh"

 #include "util/exception.hh"
@ -91,7 +93,7 @@ void ReadOrThrow(int fd, void *to_void, std::size_t amount) {
  uint8_t *to = static_cast<uint8_t*>(to_void);
  while (amount) {
    std::size_t ret = PartialRead(fd, to, amount);
-    UTIL_THROW_IF(ret == 0, EndOfFileException, "Hit EOF in fd " << fd << " but there should be " << amount << " more bytes to read.");
+    UTIL_THROW_IF(ret == 0, EndOfFileException, " in fd " << fd << " but there should be " << amount << " more bytes to read.");
    amount -= ret;
    to += ret;
  }
@ -141,7 +143,7 @@ void InternalSeek(int fd, int64_t off, int whence) {
  UTIL_THROW_IF((__int64)-1 == _lseeki64(fd, off, whence), ErrnoException, "Windows seek failed");

 #else
-  UTIL_THROW_IF((off_t)-1 == lseek(fd, off, whence), ErrnoException, "Seek failed");
+  UTIL_THROW_IF((off_t)-1 == lseek64(fd, off, whence), ErrnoException, "Seek failed");
 #endif
 }
 } // namespace
--- a/util/file.hh
+++ b/util/file.hh
@ -32,8 +32,6 @@ class scoped_fd {
      return ret;
    }

-    operator bool() { return fd_ != -1; }
-
  private:
    int fd_;

--- a/util/file_piece.hh
+++ b/util/file_piece.hh
@ -23,17 +23,17 @@ class ParseNumberException : public Exception {

 extern const bool kSpaces[256];

-// Memory backing the returned StringPiece may vanish on the next call.  
+// Memory backing the returned StringPiece may vanish on the next call.
 class FilePiece {
  public:
    // 1 MB default.
    explicit FilePiece(const char *file, std::ostream *show_progress = NULL, std::size_t min_buffer = 1048576);
-    // Takes ownership of fd.  name is used for messages.  
+    // Takes ownership of fd.  name is used for messages.
    explicit FilePiece(int fd, const char *name, std::ostream *show_progress = NULL, std::size_t min_buffer = 1048576);

    ~FilePiece();
-     
-    char get() { 
+
+    char get() {
      if (position_ == position_end_) {
        Shift();
        if (at_end_) throw EndOfFileException();
@ -41,14 +41,14 @@ class FilePiece {
      return *(position_++);
    }

-    // Leaves the delimiter, if any, to be returned by get().  Delimiters defined by isspace().  
+    // Leaves the delimiter, if any, to be returned by get().  Delimiters defined by isspace().
    StringPiece ReadDelimited(const bool *delim = kSpaces) {
      SkipSpaces(delim);
      return Consume(FindDelimiterOrEOF(delim));
    }

    // Unlike ReadDelimited, this includes leading spaces and consumes the delimiter.
-    // It is similar to getline in that way.  
+    // It is similar to getline in that way.
    StringPiece ReadLine(char delim = '\n');

    float ReadFloat();
@ -56,7 +56,7 @@ class FilePiece {
    long int ReadLong();
    unsigned long int ReadULong();

-    // Skip spaces defined by isspace.  
+    // Skip spaces defined by isspace.
    void SkipSpaces(const bool *delim = kSpaces) {
      for (; ; ++position_) {
        if (position_ == position_end_) Shift();
@ -69,7 +69,7 @@ class FilePiece {
    }

    const std::string &FileName() const { return file_name_; }
-    
+
  private:
    void Initialize(const char *name, std::ostream *show_progress, std::size_t min_buffer);

--- a/util/have.hh
+++ b/util/have.hh
@ -6,8 +6,8 @@
 //#define HAVE_ICU
 #endif

-#ifndef HAVE_THREADS
-//#define HAVE_THREADS
+#ifndef HAVE_BOOST
+#define HAVE_BOOST
 #endif

 #endif // UTIL_HAVE__
--- a/util/joint_sort.hh
+++ b/util/joint_sort.hh
@ -60,7 +60,7 @@ template <class KeyIter, class ValueIter> class JointProxy {
    JointProxy(const KeyIter &key_iter, const ValueIter &value_iter) : inner_(key_iter, value_iter) {}
    JointProxy(const JointProxy<KeyIter, ValueIter> &other) : inner_(other.inner_) {}

-    operator const value_type() const {
+    operator value_type() const {
      value_type ret;
      ret.key = *inner_.key_;
      ret.value = *inner_.value_;
@ -121,7 +121,7 @@ template <class Proxy, class Less> class LessWrapper : public std::binary_functi

 template <class KeyIter, class ValueIter> class PairedIterator : public ProxyIterator<detail::JointProxy<KeyIter, ValueIter> > {
  public:
-    PairedIterator(const KeyIter &key, const ValueIter &value) : 
+    PairedIterator(const KeyIter &key, const ValueIter &value) :
      ProxyIterator<detail::JointProxy<KeyIter, ValueIter> >(detail::JointProxy<KeyIter, ValueIter>(key, value)) {}
 };

--- a/util/read_compressed.cc
+++ b/util/read_compressed.cc
@ -370,7 +370,7 @@ ReadBase *ReadFactory(int fd, uint64_t &raw_amount) {
      break;
  }
  try {
-    AdvanceOrThrow(fd, -ReadCompressed::kMagicSize);
+    SeekOrThrow(fd, 0);
  } catch (const util::ErrnoException &e) {
    return new UncompressedWithHeader(hold.release(), header, ReadCompressed::kMagicSize);
  }
--- a/util/string_piece.hh
+++ b/util/string_piece.hh
@ -1,6 +1,6 @@
 /* If you use ICU in your program, then compile with -DHAVE_ICU -licui18n.  If
 * you don't use ICU, then this will use the Google implementation from Chrome.
- * This has been modified from the original version to let you choose.  
+ * This has been modified from the original version to let you choose.
 */

 // Copyright 2008, Google Inc.
@ -49,7 +49,11 @@
 #define BASE_STRING_PIECE_H__

 #include "util/have.hh"
+
+#ifdef HAVE_BOOST
 #include <boost/functional/hash/hash.hpp>
+#endif // HAVE_BOOST
+
 #include <cstring>
 #include <iosfwd>
 #include <ostream>
@ -58,9 +62,9 @@
 #include <unicode/stringpiece.h>
 #include <unicode/uversion.h>

-// Old versions of ICU don't define operator== and operator!=.  
+// Old versions of ICU don't define operator== and operator!=.
 #if (U_ICU_VERSION_MAJOR_NUM < 4) || ((U_ICU_VERSION_MAJOR_NUM == 4) && (U_ICU_VERSION_MINOR_NUM < 4))
-#warning You are using an old version of ICU.  Consider upgrading to ICU >= 4.6.  
+#warning You are using an old version of ICU.  Consider upgrading to ICU >= 4.6.
 inline bool operator==(const StringPiece& x, const StringPiece& y) {
  if (x.size() != y.size())
    return false;
@ -252,6 +256,7 @@ inline std::ostream& operator<<(std::ostream& o, const StringPiece& piece) {
  return o.write(piece.data(), static_cast<std::streamsize>(piece.size()));
 }

+#ifdef HAVE_BOOST
 inline size_t hash_value(const StringPiece &str) {
  return boost::hash_range(str.data(), str.data() + str.length());
 }
@ -285,9 +290,12 @@ template <class T> typename T::iterator FindStringPiece(T &t, const StringPiece
  return t.find(key, StringPieceCompatibleHash(), StringPieceCompatibleEquals());
 #endif
 }
+#endif

 #ifdef HAVE_ICU
 U_NAMESPACE_END
+using U_NAMESPACE_QUALIFIER StringPiece;
 #endif

+
 #endif  // BASE_STRING_PIECE_H__