2015-04-30 08:05:11 +03:00
|
|
|
// Step of trie builder: create sorted files.
|
2011-11-11 00:46:59 +04:00
|
|
|
|
2014-06-02 21:28:02 +04:00
|
|
|
#ifndef LM_TRIE_SORT_H
|
|
|
|
#define LM_TRIE_SORT_H
|
2011-09-21 20:06:48 +04:00
|
|
|
|
2012-09-28 18:04:48 +04:00
|
|
|
#include "lm/max_order.hh"
|
2011-09-21 20:06:48 +04:00
|
|
|
#include "lm/word_index.hh"
|
|
|
|
|
|
|
|
#include "util/file.hh"
|
|
|
|
#include "util/scoped.hh"
|
|
|
|
|
|
|
|
#include <cstddef>
|
|
|
|
#include <functional>
|
|
|
|
#include <string>
|
|
|
|
#include <vector>
|
|
|
|
|
2011-11-12 02:39:27 +04:00
|
|
|
#include <stdint.h>
|
2011-09-21 20:06:48 +04:00
|
|
|
|
2011-11-11 00:46:59 +04:00
|
|
|
namespace util {
|
|
|
|
class FilePiece;
|
|
|
|
} // namespace util
|
2011-09-21 20:06:48 +04:00
|
|
|
|
|
|
|
namespace lm {
|
2011-11-11 00:46:59 +04:00
|
|
|
class PositiveProbWarn;
|
2011-09-21 20:06:48 +04:00
|
|
|
namespace ngram {
|
|
|
|
class SortedVocabulary;
|
2012-05-05 08:55:46 +04:00
|
|
|
struct Config;
|
2011-09-21 20:06:48 +04:00
|
|
|
|
|
|
|
namespace trie {
|
|
|
|
|
|
|
|
class EntryCompare : public std::binary_function<const void*, const void*, bool> {
|
|
|
|
public:
|
|
|
|
explicit EntryCompare(unsigned char order) : order_(order) {}
|
|
|
|
|
|
|
|
bool operator()(const void *first_void, const void *second_void) const {
|
|
|
|
const WordIndex *first = static_cast<const WordIndex*>(first_void);
|
|
|
|
const WordIndex *second = static_cast<const WordIndex*>(second_void);
|
|
|
|
const WordIndex *end = first + order_;
|
|
|
|
for (; first != end; ++first, ++second) {
|
|
|
|
if (*first < *second) return true;
|
|
|
|
if (*first > *second) return false;
|
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
private:
|
|
|
|
unsigned char order_;
|
|
|
|
};
|
|
|
|
|
|
|
|
class RecordReader {
|
|
|
|
public:
|
|
|
|
RecordReader() : remains_(true) {}
|
|
|
|
|
2011-11-11 00:46:59 +04:00
|
|
|
void Init(FILE *file, std::size_t entry_size);
|
2011-09-21 20:06:48 +04:00
|
|
|
|
|
|
|
void *Data() { return data_.get(); }
|
|
|
|
const void *Data() const { return data_.get(); }
|
|
|
|
|
|
|
|
RecordReader &operator++() {
|
2011-11-11 00:46:59 +04:00
|
|
|
std::size_t ret = fread(data_.get(), entry_size_, 1, file_);
|
2011-09-21 20:06:48 +04:00
|
|
|
if (!ret) {
|
2011-11-11 00:46:59 +04:00
|
|
|
UTIL_THROW_IF(!feof(file_), util::ErrnoException, "Error reading temporary file");
|
2011-09-21 20:06:48 +04:00
|
|
|
remains_ = false;
|
|
|
|
}
|
|
|
|
return *this;
|
|
|
|
}
|
|
|
|
|
|
|
|
operator bool() const { return remains_; }
|
|
|
|
|
2011-11-11 00:46:59 +04:00
|
|
|
void Rewind();
|
2011-09-21 20:06:48 +04:00
|
|
|
|
|
|
|
std::size_t EntrySize() const { return entry_size_; }
|
|
|
|
|
|
|
|
void Overwrite(const void *start, std::size_t amount);
|
|
|
|
|
|
|
|
private:
|
2011-11-11 00:46:59 +04:00
|
|
|
FILE *file_;
|
|
|
|
|
2011-09-21 20:06:48 +04:00
|
|
|
util::scoped_malloc data_;
|
|
|
|
|
|
|
|
bool remains_;
|
|
|
|
|
|
|
|
std::size_t entry_size_;
|
|
|
|
};
|
|
|
|
|
2011-11-11 00:46:59 +04:00
|
|
|
class SortedFiles {
|
|
|
|
public:
|
|
|
|
// Build from ARPA
|
|
|
|
SortedFiles(const Config &config, util::FilePiece &f, std::vector<uint64_t> &counts, std::size_t buffer, const std::string &file_prefix, SortedVocabulary &vocab);
|
|
|
|
|
|
|
|
int StealUnigram() {
|
|
|
|
return unigram_.release();
|
|
|
|
}
|
|
|
|
|
|
|
|
FILE *Full(unsigned char order) {
|
|
|
|
return full_[order - 2].get();
|
|
|
|
}
|
|
|
|
|
|
|
|
FILE *Context(unsigned char of_order) {
|
|
|
|
return context_[of_order - 2].get();
|
|
|
|
}
|
|
|
|
|
|
|
|
private:
|
2013-01-17 15:58:58 +04:00
|
|
|
void ConvertToSorted(util::FilePiece &f, const SortedVocabulary &vocab, const std::vector<uint64_t> &counts, const std::string &prefix, unsigned char order, PositiveProbWarn &warn, void *mem, std::size_t mem_size);
|
2015-04-30 08:05:11 +03:00
|
|
|
|
2011-11-11 00:46:59 +04:00
|
|
|
util::scoped_fd unigram_;
|
|
|
|
|
2012-08-09 00:22:13 +04:00
|
|
|
util::scoped_FILE full_[KENLM_MAX_ORDER - 1], context_[KENLM_MAX_ORDER - 1];
|
2011-11-11 00:46:59 +04:00
|
|
|
};
|
2011-09-21 20:06:48 +04:00
|
|
|
|
|
|
|
} // namespace trie
|
|
|
|
} // namespace ngram
|
|
|
|
} // namespace lm
|
|
|
|
|
2014-06-02 21:28:02 +04:00
|
|
|
#endif // LM_TRIE_SORT_H
|