/* This is where the trie is built. It's on-disk. */

#include "lm/search_trie.hh"

#include "lm/bhiksha.hh"
#include "lm/binary_format.hh"
#include "lm/blank.hh"
#include "lm/lm_exception.hh"
#include "lm/max_order.hh"
#include "lm/quantize.hh"
#include "lm/trie.hh"
#include "lm/trie_sort.hh"
#include "lm/vocab.hh"
#include "lm/weights.hh"
#include "lm/word_index.hh"
#include "util/ersatz_progress.hh"
#include "util/proxy_iterator.hh"
#include "util/scoped.hh"
#include "util/sized_iterator.hh"

#include <algorithm>
#include <cstring>
#include <cstdio>
#include <cstdlib>
#include <limits>
#include <new>
#include <numeric>
#include <queue>
#include <sstream>
#include <vector>

#include "util/portability.hh"
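
/* Overview of the build, as implemented below:
 * 1. ARPAToSortedFiles (in trie_sort) has already written one sorted record
 *    file per order, a sorted context file per order, and a unigram file.
 * 2. A first streaming pass (RecursiveInsert over FindBlanks) recounts each
 *    order, including "blank" nodes that must be invented for n-grams that
 *    appear only as the context of a longer n-gram.
 * 3. If the format quantizes, quantizers are trained on all probabilities
 *    and backoffs.
 * 4. A second streaming pass (RecursiveInsert over WriteEntries) packs the
 *    final bit-level trie into the memory map.
 */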
namespace lm {
namespace ngram {
namespace trie {
namespace {

void ReadOrThrow(FILE *from, void *data, size_t size) {
  UTIL_THROW_IF(1 != std::fread(data, size, 1, from), util::ErrnoException, "Short read");
}

int Compare(unsigned char order, const void *first_void, const void *second_void) {
  const WordIndex *first = reinterpret_cast<const WordIndex*>(first_void), *second = reinterpret_cast<const WordIndex*>(second_void);
  const WordIndex *end = first + order;
  for (; first != end; ++first, ++second) {
    if (*first < *second) return -1;
    if (*first > *second) return 1;
  }
  return 0;
}

struct ProbPointer {
  unsigned char array;
  uint64_t index;
};
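
/* Each entry is an n-gram's WordIndex array followed by a ProbPointer
 * naming a float to update: "when this n-gram is reached, add its backoff
 * to base[array][index]".  Entries accumulate unsorted via Add; Apply
 * sorts them to match the on-disk record order and performs a linear
 * merge against the unigram file or a RecordReader.
 */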
// Array of n-grams and float indices.
class BackoffMessages {
  public:
    void Init(std::size_t entry_size) {
      current_ = NULL;
      allocated_ = NULL;
      entry_size_ = entry_size;
    }

    void Add(const WordIndex *to, ProbPointer index) {
      while (current_ + entry_size_ > allocated_) {
        std::size_t allocated_size = allocated_ - (uint8_t*)backing_.get();
        Resize(std::max<std::size_t>(allocated_size * 2, entry_size_));
      }
      memcpy(current_, to, entry_size_ - sizeof(ProbPointer));
      *reinterpret_cast<ProbPointer*>(current_ + entry_size_ - sizeof(ProbPointer)) = index;
      current_ += entry_size_;
    }

    void Apply(float *const *const base, FILE *unigrams) {
      FinishedAdding();
      if (current_ == allocated_) return;
      rewind(unigrams);
      ProbBackoff weights;
      WordIndex unigram = 0;
      ReadOrThrow(unigrams, &weights, sizeof(weights));
      for (; current_ != allocated_; current_ += entry_size_) {
        const WordIndex &cur_word = *reinterpret_cast<const WordIndex*>(current_);
        for (; unigram < cur_word; ++unigram) {
          ReadOrThrow(unigrams, &weights, sizeof(weights));
        }
        if (!HasExtension(weights.backoff)) {
          weights.backoff = kExtensionBackoff;
          UTIL_THROW_IF(fseek(unigrams, -static_cast<long>(sizeof(weights)), SEEK_CUR), util::ErrnoException, "Seeking backwards to denote unigram extension failed.");
          WriteOrThrow(unigrams, &weights, sizeof(weights));
        }
        const ProbPointer &write_to = *reinterpret_cast<const ProbPointer*>(current_ + sizeof(WordIndex));
        base[write_to.array][write_to.index] += weights.backoff;
      }
      backing_.reset();
    }

    void Apply(float *const *const base, RecordReader &reader) {
      FinishedAdding();
      if (current_ == allocated_) return;
      // We'll also use the same buffer to record messages to blanks that they extend.
      WordIndex *extend_out = reinterpret_cast<WordIndex*>(current_);
      const unsigned char order = (entry_size_ - sizeof(ProbPointer)) / sizeof(WordIndex);
      for (reader.Rewind(); reader && (current_ != allocated_); ) {
        switch (Compare(order, reader.Data(), current_)) {
          case -1:
            ++reader;
            break;
          case 1:
            // Message but nobody to receive it. Write it down at the beginning of the buffer so we can inform this blank that it extends.
            for (const WordIndex *w = reinterpret_cast<const WordIndex *>(current_); w != reinterpret_cast<const WordIndex *>(current_) + order; ++w, ++extend_out) *extend_out = *w;
            current_ += entry_size_;
            break;
          case 0: {
            float &backoff = reinterpret_cast<ProbBackoff*>((uint8_t*)reader.Data() + order * sizeof(WordIndex))->backoff;
            if (!HasExtension(backoff)) {
              backoff = kExtensionBackoff;
              reader.Overwrite(&backoff, sizeof(float));
            } else {
              const ProbPointer &write_to = *reinterpret_cast<const ProbPointer*>(current_ + entry_size_ - sizeof(ProbPointer));
              base[write_to.array][write_to.index] += backoff;
            }
            current_ += entry_size_;
            break;
          }
        }
      }
      // Now this is a list of blanks that extend right.
      entry_size_ = sizeof(WordIndex) * order;
      Resize(sizeof(WordIndex) * (extend_out - (const WordIndex*)backing_.get()));
      current_ = (uint8_t*)backing_.get();
    }

    // Call after Apply
    bool Extends(unsigned char order, const WordIndex *words) {
      if (current_ == allocated_) return false;
      assert(order * sizeof(WordIndex) == entry_size_);
      while (true) {
        switch(Compare(order, words, current_)) {
          case 1:
            current_ += entry_size_;
            if (current_ == allocated_) return false;
            break;
          case -1:
            return false;
          case 0:
            return true;
        }
      }
    }

  private:
    void FinishedAdding() {
      Resize(current_ - (uint8_t*)backing_.get());
      // Sort requests in same order as files.
      std::sort(
          util::SizedIterator(util::SizedProxy(backing_.get(), entry_size_)),
          util::SizedIterator(util::SizedProxy(current_, entry_size_)),
          util::SizedCompare<EntryCompare>(EntryCompare((entry_size_ - sizeof(ProbPointer)) / sizeof(WordIndex))));
      current_ = (uint8_t*)backing_.get();
    }

    void Resize(std::size_t to) {
      std::size_t current = current_ - (uint8_t*)backing_.get();
      backing_.call_realloc(to);
      current_ = (uint8_t*)backing_.get() + current;
      allocated_ = (uint8_t*)backing_.get() + to;
    }

    util::scoped_malloc backing_;

    uint8_t *current_, *allocated_;

    std::size_t entry_size_;
};

const float kBadProb = std::numeric_limits<float>::infinity();
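
/* SRI's toolkit can omit n-grams that KenLM's trie still needs as nodes,
 * e.g. an n-gram that appears only as the context of a longer n-gram.
 * This class reconstructs probabilities for those blanks: Send records
 * that a blank's probability starts from prob_basis and asks the n-grams
 * of orders [begin, order) to add in their backoffs; ObtainBackoffs
 * replays those messages against the sorted files; GetBlank hands the
 * finished weights to the writing pass.
 */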
class SRISucks {
  public:
    SRISucks() {
      for (BackoffMessages *i = messages_; i != messages_ + kMaxOrder - 1; ++i)
        i->Init(sizeof(ProbPointer) + sizeof(WordIndex) * (i - messages_ + 1));
    }

    void Send(unsigned char begin, unsigned char order, const WordIndex *to, float prob_basis) {
      assert(prob_basis != kBadProb);
      ProbPointer pointer;
      pointer.array = order - 1;
      pointer.index = values_[order - 1].size();
      for (unsigned char i = begin; i < order; ++i) {
        messages_[i - 1].Add(to, pointer);
      }
      values_[order - 1].push_back(prob_basis);
    }

    void ObtainBackoffs(unsigned char total_order, FILE *unigram_file, RecordReader *reader) {
      for (unsigned char i = 0; i < kMaxOrder - 1; ++i) {
        it_[i] = &*values_[i].begin();
      }
      messages_[0].Apply(it_, unigram_file);
      BackoffMessages *messages = messages_ + 1;
      const RecordReader *end = reader + total_order - 2 /* exclude unigrams and longest order */;
      for (; reader != end; ++messages, ++reader) {
        messages->Apply(it_, *reader);
      }
    }

    ProbBackoff GetBlank(unsigned char total_order, unsigned char order, const WordIndex *indices) {
      assert(order > 1);
      ProbBackoff ret;
      ret.prob = *(it_[order - 1]++);
      ret.backoff = ((order != total_order - 1) && messages_[order - 1].Extends(order, indices)) ? kExtensionBackoff : kNoExtensionBackoff;
      return ret;
    }

    const std::vector<float> &Values(unsigned char order) const {
      return values_[order - 1];
    }

  private:
    // This used to be one array. Then I needed to separate it by order for quantization to work.
    std::vector<float> values_[kMaxOrder - 1];
    BackoffMessages messages_[kMaxOrder - 1];

    float *it_[kMaxOrder - 1];
};

class FindBlanks {
  public:
    FindBlanks(uint64_t *counts, unsigned char order, const ProbBackoff *unigrams, SRISucks &messages)
      : counts_(counts), longest_counts_(counts + order - 1), unigrams_(unigrams), sri_(messages) {}

    float UnigramProb(WordIndex index) const {
      return unigrams_[index].prob;
    }

    void Unigram(WordIndex /*index*/) {
      ++counts_[0];
    }

    void MiddleBlank(const unsigned char order, const WordIndex *indices, unsigned char lower, float prob_basis) {
      sri_.Send(lower, order, indices + 1, prob_basis);
      ++counts_[order - 1];
    }

    void Middle(const unsigned char order, const void * /*data*/) {
      ++counts_[order - 1];
    }

    void Longest(const void * /*data*/) {
      ++*longest_counts_;
    }

    // Unigrams wrote one past.
    void Cleanup() {
      --counts_[0];
    }

  private:
    uint64_t *const counts_, *const longest_counts_;

    const ProbBackoff *unigrams_;

    SRISucks &sri_;
};

// Phase to actually write n-grams to the trie.
template <class Quant, class Bhiksha> class WriteEntries {
  public:
    WriteEntries(RecordReader *contexts, UnigramValue *unigrams, BitPackedMiddle<typename Quant::Middle, Bhiksha> *middle, BitPackedLongest<typename Quant::Longest> &longest, unsigned char order, SRISucks &sri) :
      contexts_(contexts),
      unigrams_(unigrams),
      middle_(middle),
      longest_(longest),
      bigram_pack_((order == 2) ? static_cast<BitPacked&>(longest_) : static_cast<BitPacked&>(*middle_)),
      order_(order),
      sri_(sri) {}

    float UnigramProb(WordIndex index) const { return unigrams_[index].weights.prob; }

    void Unigram(WordIndex word) {
      unigrams_[word].next = bigram_pack_.InsertIndex();
    }

    void MiddleBlank(const unsigned char order, const WordIndex *indices, unsigned char /*lower*/, float /*prob_base*/) {
      ProbBackoff weights = sri_.GetBlank(order_, order, indices);
      middle_[order - 2].Insert(indices[order - 1], weights.prob, weights.backoff);
    }

    void Middle(const unsigned char order, const void *data) {
      RecordReader &context = contexts_[order - 1];
      const WordIndex *words = reinterpret_cast<const WordIndex*>(data);
      ProbBackoff weights = *reinterpret_cast<const ProbBackoff*>(words + order);
      if (context && !memcmp(data, context.Data(), sizeof(WordIndex) * order)) {
        SetExtension(weights.backoff);
        ++context;
      }
      middle_[order - 2].Insert(words[order - 1], weights.prob, weights.backoff);
    }

    void Longest(const void *data) {
      const WordIndex *words = reinterpret_cast<const WordIndex*>(data);
      longest_.Insert(words[order_ - 1], reinterpret_cast<const Prob*>(words + order_)->prob);
    }

    void Cleanup() {}

  private:
    RecordReader *contexts_;
    UnigramValue *const unigrams_;
    BitPackedMiddle<typename Quant::Middle, Bhiksha> *const middle_;
    BitPackedLongest<typename Quant::Longest> &longest_;
    BitPacked &bigram_pack_;
    const unsigned char order_;
    SRISucks &sri_;
};

struct Gram {
  Gram(const WordIndex *in_begin, unsigned char order) : begin(in_begin), end(in_begin + order) {}

  const WordIndex *begin, *end;

  // Reversed comparison: priority_queue pops the largest element, so making
  // "largest" mean lexicographically smallest pops n-grams in sorted order.
  bool operator<(const Gram &other) const {
    return std::lexicographical_compare(other.begin, other.end, begin, end);
  }
};
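
/* Detects missing intermediate n-grams.  Visit receives n-grams in the
 * order RecursiveInsert produces them; by comparing each against the
 * previous one (been_), it finds context words that never appeared as an
 * n-gram in their own right and reports each as Doing::MiddleBlank, along
 * with the nearest lower-order real probability to use as a basis.
 */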
template <class Doing> class BlankManager {
  public:
    BlankManager(unsigned char total_order, Doing &doing) : total_order_(total_order), been_length_(0), doing_(doing) {
      for (float *i = basis_; i != basis_ + kMaxOrder - 1; ++i) *i = kBadProb;
    }

    void Visit(const WordIndex *to, unsigned char length, float prob) {
      basis_[length - 1] = prob;
      unsigned char overlap = std::min<unsigned char>(length - 1, been_length_);
      const WordIndex *cur;
      WordIndex *pre;
      for (cur = to, pre = been_; cur != to + overlap; ++cur, ++pre) {
        if (*pre != *cur) break;
      }
      if (cur == to + length - 1) {
        *pre = *cur;
        been_length_ = length;
        return;
      }
      // There are blanks to insert, starting at order blank.
      unsigned char blank = cur - to + 1;
      UTIL_THROW_IF(blank == 1, FormatLoadException, "Missing a unigram that appears as context.");
      const float *lower_basis;
      for (lower_basis = basis_ + blank - 2; *lower_basis == kBadProb; --lower_basis) {}
      unsigned char based_on = lower_basis - basis_ + 1;
      for (; cur != to + length - 1; ++blank, ++cur, ++pre) {
        assert(*lower_basis != kBadProb);
        doing_.MiddleBlank(blank, to, based_on, *lower_basis);
        *pre = *cur;
        // Mark that the probability is a blank so it shouldn't be used as the basis for a later n-gram.
        basis_[blank - 1] = kBadProb;
      }
      been_length_ = length;
    }

  private:
    const unsigned char total_order_;

    WordIndex been_[kMaxOrder];
    unsigned char been_length_;

    float basis_[kMaxOrder];

    Doing &doing_;
};
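
/* Merge the sorted files for every order in a single streaming pass.  Gram's
 * reversed operator< makes the priority queue pop the lexicographically
 * smallest n-gram next, and a prefix sorts before anything extending it, so
 * each n-gram is visited after all of its prefixes.  Doing is FindBlanks on
 * the counting pass and WriteEntries on the writing pass.
 */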
template <class Doing> void RecursiveInsert(const unsigned char total_order, const WordIndex unigram_count, RecordReader *input, std::ostream *progress_out, const char *message, Doing &doing) {
  util::ErsatzProgress progress(progress_out, message, unigram_count + 1);
  WordIndex unigram = 0;
  std::priority_queue<Gram> grams;
  grams.push(Gram(&unigram, 1));
  for (unsigned char i = 2; i <= total_order; ++i) {
    if (input[i-2]) grams.push(Gram(reinterpret_cast<const WordIndex*>(input[i-2].Data()), i));
  }

  BlankManager<Doing> blank(total_order, doing);

  while (true) {
    Gram top = grams.top();
    grams.pop();
    unsigned char order = top.end - top.begin;
    if (order == 1) {
      blank.Visit(&unigram, 1, doing.UnigramProb(unigram));
      doing.Unigram(unigram);
      progress.Set(unigram);
      if (++unigram == unigram_count + 1) break;
      grams.push(top);
    } else {
      if (order == total_order) {
        blank.Visit(top.begin, order, reinterpret_cast<const Prob*>(top.end)->prob);
        doing.Longest(top.begin);
      } else {
        blank.Visit(top.begin, order, reinterpret_cast<const ProbBackoff*>(top.end)->prob);
        doing.Middle(order, top.begin);
      }
      RecordReader &reader = input[order - 2];
      if (++reader) grams.push(top);
    }
  }
  assert(grams.empty());
  doing.Cleanup();
}

void SanityCheckCounts(const std::vector<uint64_t> &initial, const std::vector<uint64_t> &fixed) {
  if (fixed[0] != initial[0]) UTIL_THROW(util::Exception, "Unigram count should be constant but initial is " << initial[0] << " and recounted is " << fixed[0]);
  if (fixed.back() != initial.back()) UTIL_THROW(util::Exception, "Longest count should be constant but it changed from " << initial.back() << " to " << fixed.back());
  for (unsigned char i = 0; i < initial.size(); ++i) {
    if (fixed[i] < initial[i]) UTIL_THROW(util::Exception, "Counts came out lower than expected. This shouldn't happen");
  }
}
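
/* Quantizer training sees every probability and backoff for an order.
 * `additional` carries the probabilities of blanks (from SRISucks), which
 * will exist in the trie but are absent from the sorted file.  Backoffs of
 * exactly zero are excluded, presumably because +/-0.0 is reserved to mark
 * extension (see blank.hh).
 */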
template <class Quant> void TrainQuantizer(uint8_t order, uint64_t count, const std::vector<float> &additional, RecordReader &reader, util::ErsatzProgress &progress, Quant &quant) {
  std::vector<float> probs(additional), backoffs;
  probs.reserve(count + additional.size());
  backoffs.reserve(count);
  for (reader.Rewind(); reader; ++reader) {
    const ProbBackoff &weights = *reinterpret_cast<const ProbBackoff*>(reinterpret_cast<const uint8_t*>(reader.Data()) + sizeof(WordIndex) * order);
    probs.push_back(weights.prob);
    if (weights.backoff != 0.0) backoffs.push_back(weights.backoff);
    ++progress;
  }
  quant.Train(order, probs, backoffs);
}

template <class Quant> void TrainProbQuantizer(uint8_t order, uint64_t count, RecordReader &reader, util::ErsatzProgress &progress, Quant &quant) {
  std::vector<float> probs;
  probs.reserve(count);
  for (reader.Rewind(); reader; ++reader) {
    const Prob &weights = *reinterpret_cast<const Prob*>(reinterpret_cast<const uint8_t*>(reader.Data()) + sizeof(WordIndex) * order);
    probs.push_back(weights.prob);
    ++progress;
  }
  quant.TrainProb(order, probs);
}

void PopulateUnigramWeights(FILE *file, WordIndex unigram_count, RecordReader &contexts, UnigramValue *unigrams) {
  // Fill unigram probabilities.
  try {
    rewind(file);
    for (WordIndex i = 0; i < unigram_count; ++i) {
      ReadOrThrow(file, &unigrams[i].weights, sizeof(ProbBackoff));
      if (contexts && *reinterpret_cast<const WordIndex*>(contexts.Data()) == i) {
        SetExtension(unigrams[i].weights.backoff);
        ++contexts;
      }
    }
  } catch (util::Exception &e) {
    e << " while re-reading unigram probabilities";
    throw;
  }
}

} // namespace
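
/* Build the trie in phases: open the sorted files (each is unlinked right
 * after opening, so the open handles keep the data alive on POSIX systems),
 * count entries and find blanks, sanity-check the counts, replay backoff
 * messages, grow the memory map, optionally train quantizers, fill unigram
 * weights, write every entry, verify all contexts were consumed, and fix up
 * the ending offsets.
 */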
template <class Quant, class Bhiksha> void BuildTrie(const std::string &file_prefix, std::vector<uint64_t> &counts, const Config &config, TrieSearch<Quant, Bhiksha> &out, Quant &quant, const SortedVocabulary &vocab, Backing &backing) {
  RecordReader inputs[kMaxOrder - 1];
  RecordReader contexts[kMaxOrder - 1];

  for (unsigned char i = 2; i <= counts.size(); ++i) {
    std::stringstream assembled;
    assembled << file_prefix << static_cast<unsigned int>(i) << "_merged";
    inputs[i-2].Init(assembled.str(), i * sizeof(WordIndex) + (i == counts.size() ? sizeof(Prob) : sizeof(ProbBackoff)));
    util::RemoveOrThrow(assembled.str().c_str());
    assembled << kContextSuffix;
    contexts[i-2].Init(assembled.str(), (i-1) * sizeof(WordIndex));
    util::RemoveOrThrow(assembled.str().c_str());
  }

  SRISucks sri;
  std::vector<uint64_t> fixed_counts(counts.size());
  {
    std::string temp(file_prefix); temp += "unigrams";
    util::scoped_fd unigram_file(util::OpenReadOrThrow(temp.c_str()));
    util::scoped_memory unigrams;
    MapRead(util::POPULATE_OR_READ, unigram_file.get(), 0, counts[0] * sizeof(ProbBackoff), unigrams);
    FindBlanks finder(&*fixed_counts.begin(), counts.size(), reinterpret_cast<const ProbBackoff*>(unigrams.get()), sri);
    RecursiveInsert(counts.size(), counts[0], inputs, config.messages, "Identifying n-grams omitted by SRI", finder);
  }
  for (const RecordReader *i = inputs; i != inputs + counts.size() - 2; ++i) {
    if (*i) UTIL_THROW(FormatLoadException, "There's a bug in the trie implementation: the " << (i - inputs + 2) << "-gram table did not complete reading");
  }
  SanityCheckCounts(counts, fixed_counts);
  counts = fixed_counts;

  util::scoped_FILE unigram_file;
  {
    std::string name(file_prefix + "unigrams");
    unigram_file.reset(OpenOrThrow(name.c_str(), "r"));
    util::RemoveOrThrow(name.c_str());
  }
  sri.ObtainBackoffs(counts.size(), unigram_file.get(), inputs);

  out.SetupMemory(GrowForSearch(config, vocab.UnkCountChangePadding(), TrieSearch<Quant, Bhiksha>::Size(fixed_counts, config), backing), fixed_counts, config);

  for (unsigned char i = 2; i <= counts.size(); ++i) {
    inputs[i-2].Rewind();
  }
  if (Quant::kTrain) {
    util::ErsatzProgress progress(config.messages, "Quantizing", std::accumulate(counts.begin() + 1, counts.end(), static_cast<uint64_t>(0)));
    for (unsigned char i = 2; i < counts.size(); ++i) {
      TrainQuantizer(i, counts[i-1], sri.Values(i), inputs[i-2], progress, quant);
    }
    TrainProbQuantizer(counts.size(), counts.back(), inputs[counts.size() - 2], progress, quant);
    quant.FinishedLoading(config);
  }

  UnigramValue *unigrams = out.unigram.Raw();
  PopulateUnigramWeights(unigram_file.get(), counts[0], contexts[0], unigrams);
  unigram_file.reset();

  for (unsigned char i = 2; i <= counts.size(); ++i) {
    inputs[i-2].Rewind();
  }
  // Fill entries except unigram probabilities.
  {
    WriteEntries<Quant, Bhiksha> writer(contexts, unigrams, out.middle_begin_, out.longest, counts.size(), sri);
    RecursiveInsert(counts.size(), counts[0], inputs, config.messages, "Writing trie", writer);
  }

  // Do not disable this error message or else too little state will be returned. Both WriteEntries::Middle and returning state based on found n-grams will need to be fixed to handle this situation.
  for (unsigned char order = 2; order <= counts.size(); ++order) {
    const RecordReader &context = contexts[order - 2];
    if (context) {
      FormatLoadException e;
      e << "A " << static_cast<unsigned int>(order) << "-gram has context";
      const WordIndex *ctx = reinterpret_cast<const WordIndex*>(context.Data());
      for (const WordIndex *i = ctx; i != ctx + order - 1; ++i) {
        e << ' ' << *i;
      }
      e << " so this context must appear in the model as a " << static_cast<unsigned int>(order - 1) << "-gram but it does not";
      throw e;
    }
  }

  /* Set ending offsets so the last entry will be sized properly */
  // Last entry for unigrams was already set.
  if (out.middle_begin_ != out.middle_end_) {
    for (typename TrieSearch<Quant, Bhiksha>::Middle *i = out.middle_begin_; i != out.middle_end_ - 1; ++i) {
      i->FinishedLoading((i+1)->InsertIndex(), config);
    }
    (out.middle_end_ - 1)->FinishedLoading(out.longest.InsertIndex(), config);
  }
}
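
/* One contiguous layout: quantization tables, then the unigram array, then a
 * bit-packed middle array per order, then the longest-order array.  Each
 * middle needs a reference to its successor to delimit its "next" pointers,
 * hence the backwards construction noted below.
 */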
template <class Quant, class Bhiksha> uint8_t *TrieSearch<Quant, Bhiksha>::SetupMemory(uint8_t *start, const std::vector<uint64_t> &counts, const Config &config) {
  quant_.SetupMemory(start, config);
  start += Quant::Size(counts.size(), config);
  unigram.Init(start);
  start += Unigram::Size(counts[0]);
  FreeMiddles();
  middle_begin_ = static_cast<Middle*>(malloc(sizeof(Middle) * (counts.size() - 2)));
  middle_end_ = middle_begin_ + (counts.size() - 2);
  std::vector<uint8_t*> middle_starts(counts.size() - 2);
  for (unsigned char i = 2; i < counts.size(); ++i) {
    middle_starts[i-2] = start;
    start += Middle::Size(Quant::MiddleBits(config), counts[i-1], counts[0], counts[i], config);
  }
  // Crazy backwards thing so we initialize using pointers to ones that have already been initialized
  for (unsigned char i = counts.size() - 1; i >= 2; --i) {
    new (middle_begin_ + i - 2) Middle(
        middle_starts[i-2],
        quant_.Mid(i),
        counts[i-1],
        counts[0],
        counts[i],
        (i == counts.size() - 1) ? static_cast<const BitPacked&>(longest) : static_cast<const BitPacked&>(middle_begin_[i-1]),
        config);
  }
  longest.Init(start, quant_.Long(counts.size()), counts[0]);
  return start + Longest::Size(Quant::LongestBits(config), counts.back(), counts[0]);
}

template <class Quant, class Bhiksha> void TrieSearch<Quant, Bhiksha>::LoadedBinary() {
  unigram.LoadedBinary();
  for (Middle *i = middle_begin_; i != middle_end_; ++i) {
    i->LoadedBinary();
  }
  longest.LoadedBinary();
}

namespace {
bool IsDirectory(const char *path) {
  struct stat info;
  if (0 != stat(path, &info)) return false;
  return S_ISDIR(info.st_mode);
}
} // namespace
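
/* Entry point when building from ARPA: choose where temporary files go (an
 * explicit prefix, else next to the binary output, else next to the input),
 * sort the ARPA into per-order files there, build the trie, and remove the
 * temporary directory.
 */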
template <class Quant, class Bhiksha> void TrieSearch<Quant, Bhiksha>::InitializeFromARPA(const char *file, util::FilePiece &f, std::vector<uint64_t> &counts, const Config &config, SortedVocabulary &vocab, Backing &backing) {
  std::string temporary_directory;
  if (config.temporary_directory_prefix) {
    temporary_directory = config.temporary_directory_prefix;
    if (!temporary_directory.empty() && temporary_directory[temporary_directory.size() - 1] != '/' && IsDirectory(temporary_directory.c_str()))
      temporary_directory += '/';
  } else if (config.write_mmap) {
    temporary_directory = config.write_mmap;
  } else {
    temporary_directory = file;
  }
  // Null on end is kludge to ensure null termination.
  temporary_directory += "_trie_tmp_XXXXXX";
  temporary_directory += '\0';
  if (!mkdtemp(&temporary_directory[0])) {
    UTIL_THROW(util::ErrnoException, "Failed to make a temporary directory based on the name " << temporary_directory.c_str());
  }
  // Chop off null kludge.
  temporary_directory.resize(strlen(temporary_directory.c_str()));
  // Add directory delimiter. Assumes a real operating system.
  temporary_directory += '/';
  // At least 1MB sorting memory.
  ARPAToSortedFiles(config, f, counts, std::max<size_t>(config.building_memory, 1048576), temporary_directory.c_str(), vocab);

  BuildTrie(temporary_directory, counts, config, *this, quant_, vocab, backing);
  if (rmdir(temporary_directory.c_str()) && config.messages) {
    *config.messages << "Failed to delete " << temporary_directory << std::endl;
  }
}

template class TrieSearch<DontQuantize, DontBhiksha>;
template class TrieSearch<DontQuantize, ArrayBhiksha>;
template class TrieSearch<SeparatelyQuantize, DontBhiksha>;
template class TrieSearch<SeparatelyQuantize, ArrayBhiksha>;

} // namespace trie
} // namespace ngram
} // namespace lm