mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-25 21:03:22 +03:00
KenLM a590a3a4dadf516a1cff28c8f1c06aa89766f519 including StringStream
TODO: kill istream
This commit is contained in:
parent
82527fc8b2
commit
ea8e19f286
@ -169,8 +169,7 @@ void *BinaryFormat::SetupJustVocab(std::size_t memory_size, uint8_t order) {
|
||||
vocab_size_ = memory_size;
|
||||
if (!write_mmap_) {
|
||||
header_size_ = 0;
|
||||
util::MapAnonymous(memory_size, memory_vocab_);
|
||||
util::AdviseHugePages(memory_vocab_.get(), memory_size);
|
||||
util::HugeMalloc(memory_size, true, memory_vocab_);
|
||||
return reinterpret_cast<uint8_t*>(memory_vocab_.get());
|
||||
}
|
||||
header_size_ = TotalHeaderSize(order);
|
||||
@ -181,16 +180,16 @@ void *BinaryFormat::SetupJustVocab(std::size_t memory_size, uint8_t order) {
|
||||
switch (write_method_) {
|
||||
case Config::WRITE_MMAP:
|
||||
mapping_.reset(util::MapZeroedWrite(file_.get(), total), total, util::scoped_memory::MMAP_ALLOCATED);
|
||||
util::AdviseHugePages(vocab_base, total);
|
||||
vocab_base = mapping_.get();
|
||||
break;
|
||||
case Config::WRITE_AFTER:
|
||||
util::ResizeOrThrow(file_.get(), 0);
|
||||
util::MapAnonymous(total, memory_vocab_);
|
||||
util::HugeMalloc(total, true, memory_vocab_);
|
||||
vocab_base = memory_vocab_.get();
|
||||
break;
|
||||
}
|
||||
strncpy(reinterpret_cast<char*>(vocab_base), kMagicIncomplete, header_size_);
|
||||
util::AdviseHugePages(vocab_base, total);
|
||||
return reinterpret_cast<uint8_t*>(vocab_base) + header_size_;
|
||||
}
|
||||
|
||||
@ -200,7 +199,7 @@ void *BinaryFormat::GrowForSearch(std::size_t memory_size, std::size_t vocab_pad
|
||||
std::size_t new_size = header_size_ + vocab_size_ + vocab_pad_ + memory_size;
|
||||
vocab_string_offset_ = new_size;
|
||||
if (!write_mmap_ || write_method_ == Config::WRITE_AFTER) {
|
||||
util::MapAnonymous(memory_size, memory_search_);
|
||||
util::HugeMalloc(memory_size, true, memory_search_);
|
||||
assert(header_size_ == 0 || write_mmap_);
|
||||
vocab_base = reinterpret_cast<uint8_t*>(memory_vocab_.get()) + header_size_;
|
||||
util::AdviseHugePages(memory_search_.get(), memory_size);
|
||||
|
@ -5,7 +5,7 @@
|
||||
#include "lm/lm_exception.hh"
|
||||
#include "lm/vocab.hh"
|
||||
#include "lm/word_index.hh"
|
||||
#include "util/fake_ofstream.hh"
|
||||
#include "util/file_stream.hh"
|
||||
#include "util/file.hh"
|
||||
#include "util/file_piece.hh"
|
||||
#include "util/murmur_hash.hh"
|
||||
|
@ -4,21 +4,21 @@
|
||||
#include "lm/builder/payload.hh"
|
||||
#include "lm/common/print.hh"
|
||||
#include "lm/common/ngram_stream.hh"
|
||||
#include "util/fake_ofstream.hh"
|
||||
#include "util/file_stream.hh"
|
||||
#include "util/file.hh"
|
||||
|
||||
#include <boost/lexical_cast.hpp>
|
||||
|
||||
namespace lm { namespace builder {
|
||||
// Not defined, only specialized.
|
||||
template <class T> void PrintPayload(util::FakeOFStream &to, const BuildingPayload &payload);
|
||||
template <> inline void PrintPayload<uint64_t>(util::FakeOFStream &to, const BuildingPayload &payload) {
|
||||
template <class T> void PrintPayload(util::FileStream &to, const BuildingPayload &payload);
|
||||
template <> inline void PrintPayload<uint64_t>(util::FileStream &to, const BuildingPayload &payload) {
|
||||
to << payload.count;
|
||||
}
|
||||
template <> inline void PrintPayload<Uninterpolated>(util::FakeOFStream &to, const BuildingPayload &payload) {
|
||||
template <> inline void PrintPayload<Uninterpolated>(util::FileStream &to, const BuildingPayload &payload) {
|
||||
to << log10(payload.uninterp.prob) << ' ' << log10(payload.uninterp.gamma);
|
||||
}
|
||||
template <> inline void PrintPayload<ProbBackoff>(util::FakeOFStream &to, const BuildingPayload &payload) {
|
||||
template <> inline void PrintPayload<ProbBackoff>(util::FileStream &to, const BuildingPayload &payload) {
|
||||
to << payload.complete.prob << ' ' << payload.complete.backoff;
|
||||
}
|
||||
|
||||
@ -36,7 +36,7 @@ template <class V> class Print {
|
||||
|
||||
void Run(const util::stream::ChainPositions &chains) {
|
||||
util::scoped_fd fd(to_);
|
||||
util::FakeOFStream out(to_);
|
||||
util::FileStream out(to_);
|
||||
NGramStreams<BuildingPayload> streams(chains);
|
||||
for (NGramStream<BuildingPayload> *s = streams.begin(); s != streams.end(); ++s) {
|
||||
DumpStream(*s, out);
|
||||
@ -45,13 +45,13 @@ template <class V> class Print {
|
||||
|
||||
void Run(const util::stream::ChainPosition &position) {
|
||||
util::scoped_fd fd(to_);
|
||||
util::FakeOFStream out(to_);
|
||||
util::FileStream out(to_);
|
||||
NGramStream<BuildingPayload> stream(position);
|
||||
DumpStream(stream, out);
|
||||
}
|
||||
|
||||
private:
|
||||
void DumpStream(NGramStream<BuildingPayload> &stream, util::FakeOFStream &to) {
|
||||
void DumpStream(NGramStream<BuildingPayload> &stream, util::FileStream &to) {
|
||||
for (; stream; ++stream) {
|
||||
PrintPayload<V>(to, stream->Value());
|
||||
for (const WordIndex *w = stream->begin(); w != stream->end(); ++w) {
|
||||
|
@ -30,7 +30,7 @@ int main(int argc, char *argv[]) {
|
||||
UTIL_THROW_IF(*i >= vocab.Size(), util::Exception, "Vocab ID " << *i << " is larger than the vocab file's maximum of " << vocab.Size() << ". Are you sure you have the right order and vocab file for these counts?");
|
||||
std::cout << vocab.Lookup(*i) << ' ';
|
||||
}
|
||||
// TODO don't use std::cout because it is slow. Add fast uint64_t printing support to FakeOFStream.
|
||||
// TODO don't use std::cout because it is slow. Add fast uint64_t printing support to FileStream.
|
||||
std::cout << *reinterpret_cast<const uint64_t*>(words + order) << '\n';
|
||||
}
|
||||
}
|
||||
|
@ -12,7 +12,6 @@
|
||||
#include <iostream>
|
||||
#include <cassert>
|
||||
#include <cmath>
|
||||
#include <iostream>
|
||||
|
||||
namespace lm { namespace builder {
|
||||
namespace {
|
||||
|
@ -2,7 +2,7 @@
|
||||
|
||||
#include "lm/common/model_buffer.hh"
|
||||
#include "lm/common/print.hh"
|
||||
#include "util/fake_ofstream.hh"
|
||||
#include "util/file_stream.hh"
|
||||
#include "util/stream/multi_stream.hh"
|
||||
|
||||
#include <iostream>
|
||||
@ -41,7 +41,7 @@ void Output::Apply(HookType hook_type, util::stream::Chains &chains) {
|
||||
|
||||
void PrintHook::Sink(const HeaderInfo &info, int vocab_file, util::stream::Chains &chains) {
|
||||
if (verbose_header_) {
|
||||
util::FakeOFStream out(file_.get(), 50);
|
||||
util::FileStream out(file_.get(), 50);
|
||||
out << "# Input file: " << info.input_file << '\n';
|
||||
out << "# Token count: " << info.token_count << '\n';
|
||||
out << "# Smoothing: Modified Kneser-Ney" << '\n';
|
||||
|
@ -1,6 +1,6 @@
|
||||
#include "lm/common/model_buffer.hh"
|
||||
#include "util/exception.hh"
|
||||
#include "util/fake_ofstream.hh"
|
||||
#include "util/file_stream.hh"
|
||||
#include "util/file.hh"
|
||||
#include "util/file_piece.hh"
|
||||
#include "util/stream/io.hh"
|
||||
@ -68,7 +68,7 @@ void ModelBuffer::Sink(util::stream::Chains &chains, const std::vector<uint64_t>
|
||||
}
|
||||
if (keep_buffer_) {
|
||||
util::scoped_fd metadata(util::CreateOrThrow((file_base_ + ".kenlm_intermediate").c_str()));
|
||||
util::FakeOFStream meta(metadata.get(), 200);
|
||||
util::FileStream meta(metadata.get(), 200);
|
||||
meta << kMetadataHeader << "\nCounts";
|
||||
for (std::vector<uint64_t>::const_iterator i = counts_.begin(); i != counts_.end(); ++i) {
|
||||
meta << ' ' << *i;
|
||||
|
@ -1,7 +1,7 @@
|
||||
#include "lm/common/print.hh"
|
||||
|
||||
#include "lm/common/ngram_stream.hh"
|
||||
#include "util/fake_ofstream.hh"
|
||||
#include "util/file_stream.hh"
|
||||
#include "util/file.hh"
|
||||
#include "util/mmap.hh"
|
||||
#include "util/scoped.hh"
|
||||
@ -24,7 +24,7 @@ VocabReconstitute::VocabReconstitute(int fd) {
|
||||
}
|
||||
|
||||
namespace {
|
||||
template <class Payload> void PrintLead(const VocabReconstitute &vocab, ProxyStream<Payload> &stream, util::FakeOFStream &out) {
|
||||
template <class Payload> void PrintLead(const VocabReconstitute &vocab, ProxyStream<Payload> &stream, util::FileStream &out) {
|
||||
out << stream->Value().prob << '\t' << vocab.Lookup(*stream->begin());
|
||||
for (const WordIndex *i = stream->begin() + 1; i != stream->end(); ++i) {
|
||||
out << ' ' << vocab.Lookup(*i);
|
||||
@ -34,7 +34,7 @@ template <class Payload> void PrintLead(const VocabReconstitute &vocab, ProxyStr
|
||||
|
||||
void PrintARPA::Run(const util::stream::ChainPositions &positions) {
|
||||
VocabReconstitute vocab(vocab_fd_);
|
||||
util::FakeOFStream out(out_fd_);
|
||||
util::FileStream out(out_fd_);
|
||||
out << "\\data\\\n";
|
||||
for (size_t i = 0; i < positions.size(); ++i) {
|
||||
out << "ngram " << (i+1) << '=' << counts_[i] << '\n';
|
||||
|
@ -1,5 +1,6 @@
|
||||
#include "lm/filter/arpa_io.hh"
|
||||
#include "util/file_piece.hh"
|
||||
#include "util/string_stream.hh"
|
||||
|
||||
#include <iostream>
|
||||
#include <ostream>
|
||||
@ -22,14 +23,8 @@ ARPAInputException::ARPAInputException(const StringPiece &message, const StringP
|
||||
|
||||
ARPAInputException::~ARPAInputException() throw() {}
|
||||
|
||||
ARPAOutputException::ARPAOutputException(const char *message, const std::string &file_name) throw() {
|
||||
*this << message << " in file " << file_name;
|
||||
}
|
||||
|
||||
ARPAOutputException::~ARPAOutputException() throw() {}
|
||||
|
||||
// Seeking is the responsibility of the caller.
|
||||
void WriteCounts(std::ostream &out, const std::vector<uint64_t> &number) {
|
||||
template <class Stream> void WriteCounts(Stream &out, const std::vector<uint64_t> &number) {
|
||||
out << "\n\\data\\\n";
|
||||
for (unsigned int i = 0; i < number.size(); ++i) {
|
||||
out << "ngram " << i+1 << "=" << number[i] << '\n';
|
||||
@ -38,9 +33,10 @@ void WriteCounts(std::ostream &out, const std::vector<uint64_t> &number) {
|
||||
}
|
||||
|
||||
size_t SizeNeededForCounts(const std::vector<uint64_t> &number) {
|
||||
std::ostringstream buf;
|
||||
WriteCounts(buf, number);
|
||||
return buf.tellp();
|
||||
std::string buf;
|
||||
util::StringStream stream(buf);
|
||||
WriteCounts(stream, number);
|
||||
return buf.size();
|
||||
}
|
||||
|
||||
bool IsEntirelyWhiteSpace(const StringPiece &line) {
|
||||
@ -50,44 +46,21 @@ bool IsEntirelyWhiteSpace(const StringPiece &line) {
|
||||
return true;
|
||||
}
|
||||
|
||||
ARPAOutput::ARPAOutput(const char *name, size_t buffer_size) : file_name_(name), buffer_(new char[buffer_size]) {
|
||||
try {
|
||||
file_.exceptions(std::ostream::eofbit | std::ostream::failbit | std::ostream::badbit);
|
||||
if (!file_.rdbuf()->pubsetbuf(buffer_.get(), buffer_size)) {
|
||||
std::cerr << "Warning: could not enlarge buffer for " << name << std::endl;
|
||||
buffer_.reset();
|
||||
}
|
||||
file_.open(name, std::ios::out | std::ios::binary);
|
||||
} catch (const std::ios_base::failure &f) {
|
||||
throw ARPAOutputException("Opening", file_name_);
|
||||
}
|
||||
}
|
||||
ARPAOutput::ARPAOutput(const char *name, size_t buffer_size)
|
||||
: file_backing_(util::CreateOrThrow(name)), file_(file_backing_.get(), buffer_size) {}
|
||||
|
||||
void ARPAOutput::ReserveForCounts(std::streampos reserve) {
|
||||
try {
|
||||
for (std::streampos i = 0; i < reserve; i += std::streampos(1)) {
|
||||
file_ << '\n';
|
||||
}
|
||||
} catch (const std::ios_base::failure &f) {
|
||||
throw ARPAOutputException("Writing blanks to reserve space for counts to ", file_name_);
|
||||
for (std::streampos i = 0; i < reserve; i += std::streampos(1)) {
|
||||
file_ << '\n';
|
||||
}
|
||||
}
|
||||
|
||||
void ARPAOutput::BeginLength(unsigned int length) {
|
||||
fast_counter_ = 0;
|
||||
try {
|
||||
file_ << '\\' << length << "-grams:" << '\n';
|
||||
} catch (const std::ios_base::failure &f) {
|
||||
throw ARPAOutputException("Writing n-gram header to ", file_name_);
|
||||
}
|
||||
file_ << '\\' << length << "-grams:" << '\n';
|
||||
}
|
||||
|
||||
void ARPAOutput::EndLength(unsigned int length) {
|
||||
try {
|
||||
file_ << '\n';
|
||||
} catch (const std::ios_base::failure &f) {
|
||||
throw ARPAOutputException("Writing blank at end of count list to ", file_name_);
|
||||
}
|
||||
file_ << '\n';
|
||||
if (length > counts_.size()) {
|
||||
counts_.resize(length);
|
||||
}
|
||||
@ -95,14 +68,10 @@ void ARPAOutput::EndLength(unsigned int length) {
|
||||
}
|
||||
|
||||
void ARPAOutput::Finish() {
|
||||
try {
|
||||
file_ << "\\end\\\n";
|
||||
file_.seekp(0);
|
||||
WriteCounts(file_, counts_);
|
||||
file_ << std::flush;
|
||||
} catch (const std::ios_base::failure &f) {
|
||||
throw ARPAOutputException("Finishing including writing counts at beginning to ", file_name_);
|
||||
}
|
||||
file_ << "\\end\\\n";
|
||||
file_.seekp(0);
|
||||
WriteCounts(file_, counts_);
|
||||
file_.flush();
|
||||
}
|
||||
|
||||
} // namespace lm
|
||||
|
@ -4,6 +4,7 @@
|
||||
*/
|
||||
#include "lm/read_arpa.hh"
|
||||
#include "util/exception.hh"
|
||||
#include "util/file_stream.hh"
|
||||
#include "util/string_piece.hh"
|
||||
#include "util/tokenize_piece.hh"
|
||||
|
||||
@ -28,17 +29,6 @@ class ARPAInputException : public util::Exception {
|
||||
virtual ~ARPAInputException() throw();
|
||||
};
|
||||
|
||||
class ARPAOutputException : public util::ErrnoException {
|
||||
public:
|
||||
ARPAOutputException(const char *prefix, const std::string &file_name) throw();
|
||||
virtual ~ARPAOutputException() throw();
|
||||
|
||||
const std::string &File() const throw() { return file_name_; }
|
||||
|
||||
private:
|
||||
const std::string file_name_;
|
||||
};
|
||||
|
||||
// Handling for the counts of n-grams at the beginning of ARPA files.
|
||||
size_t SizeNeededForCounts(const std::vector<uint64_t> &number);
|
||||
|
||||
@ -55,11 +45,7 @@ class ARPAOutput : boost::noncopyable {
|
||||
void BeginLength(unsigned int length);
|
||||
|
||||
void AddNGram(const StringPiece &line) {
|
||||
try {
|
||||
file_ << line << '\n';
|
||||
} catch (const std::ios_base::failure &f) {
|
||||
throw ARPAOutputException("Writing an n-gram", file_name_);
|
||||
}
|
||||
file_ << line << '\n';
|
||||
++fast_counter_;
|
||||
}
|
||||
|
||||
@ -76,9 +62,8 @@ class ARPAOutput : boost::noncopyable {
|
||||
void Finish();
|
||||
|
||||
private:
|
||||
const std::string file_name_;
|
||||
boost::scoped_array<char> buffer_;
|
||||
std::fstream file_;
|
||||
util::scoped_fd file_backing_;
|
||||
util::FileStream file_;
|
||||
size_t fast_counter_;
|
||||
std::vector<uint64_t> counts_;
|
||||
};
|
||||
|
@ -5,7 +5,7 @@
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
|
||||
#include "util/fake_ofstream.hh"
|
||||
#include "util/file_stream.hh"
|
||||
#include "util/file.hh"
|
||||
#include "util/file_piece.hh"
|
||||
|
||||
@ -28,7 +28,7 @@ class CountOutput : boost::noncopyable {
|
||||
}
|
||||
|
||||
private:
|
||||
util::FakeOFStream file_;
|
||||
util::FileStream file_;
|
||||
};
|
||||
|
||||
class CountBatch {
|
||||
|
@ -1,4 +1,4 @@
|
||||
#include "util/fake_ofstream.hh"
|
||||
#include "util/file_stream.hh"
|
||||
#include "util/file_piece.hh"
|
||||
#include "util/murmur_hash.hh"
|
||||
#include "util/pool.hh"
|
||||
@ -68,7 +68,7 @@ class TargetWords {
|
||||
}
|
||||
|
||||
void Print() const {
|
||||
util::FakeOFStream out(1);
|
||||
util::FileStream out(1);
|
||||
for (std::vector<boost::unordered_set<const char *> >::const_iterator i = vocab_.begin(); i != vocab_.end(); ++i) {
|
||||
for (boost::unordered_set<const char *>::const_iterator j = i->begin(); j != i->end(); ++j) {
|
||||
out << *j << ' ';
|
||||
|
@ -1,5 +1,5 @@
|
||||
#include "lm/model.hh"
|
||||
#include "util/fake_ofstream.hh"
|
||||
#include "util/file_stream.hh"
|
||||
#include "util/file.hh"
|
||||
#include "util/file_piece.hh"
|
||||
#include "util/usage.hh"
|
||||
@ -10,7 +10,7 @@ namespace {
|
||||
|
||||
template <class Model, class Width> void ConvertToBytes(const Model &model, int fd_in) {
|
||||
util::FilePiece in(fd_in);
|
||||
util::FakeOFStream out(1);
|
||||
util::FileStream out(1);
|
||||
Width width;
|
||||
StringPiece word;
|
||||
const Width end_sentence = (Width)model.GetVocabulary().EndSentence();
|
||||
@ -30,10 +30,19 @@ template <class Model, class Width> void QueryFromBytes(const Model &model, int
|
||||
const lm::ngram::State *next_state = begin_state;
|
||||
Width kEOS = model.GetVocabulary().EndSentence();
|
||||
Width buf[4096];
|
||||
float sum = 0.0;
|
||||
|
||||
uint64_t completed = 0;
|
||||
double loaded = util::CPUTime();
|
||||
|
||||
std::cout << "CPU_to_load: " << loaded << std::endl;
|
||||
|
||||
// Numerical precision: batch sums.
|
||||
double total = 0.0;
|
||||
while (std::size_t got = util::ReadOrEOF(fd_in, buf, sizeof(buf))) {
|
||||
float sum = 0.0;
|
||||
UTIL_THROW_IF2(got % sizeof(Width), "File size not a multiple of vocab id size " << sizeof(Width));
|
||||
got /= sizeof(Width);
|
||||
completed += got;
|
||||
// Do even stuff first.
|
||||
const Width *even_end = buf + (got & ~1);
|
||||
// Alternating states
|
||||
@ -49,8 +58,13 @@ template <class Model, class Width> void QueryFromBytes(const Model &model, int
|
||||
sum += model.FullScore(*next_state, *i, state[2]).prob;
|
||||
next_state = (*i++ == kEOS) ? begin_state : &state[2];
|
||||
}
|
||||
total += sum;
|
||||
}
|
||||
std::cout << "Sum is " << sum << std::endl;
|
||||
double after = util::CPUTime();
|
||||
std::cerr << "Probability sum is " << total << std::endl;
|
||||
std::cout << "Queries: " << completed << std::endl;
|
||||
std::cout << "CPU_excluding_load: " << (after - loaded) << "\nCPU_per_query: " << ((after - loaded) / static_cast<double>(completed)) << std::endl;
|
||||
std::cout << "RSSMax: " << util::RSSMax() << std::endl;
|
||||
}
|
||||
|
||||
template <class Model, class Width> void DispatchFunction(const Model &model, bool query) {
|
||||
@ -62,7 +76,10 @@ template <class Model, class Width> void DispatchFunction(const Model &model, bo
|
||||
}
|
||||
|
||||
template <class Model> void DispatchWidth(const char *file, bool query) {
|
||||
Model model(file);
|
||||
lm::ngram::Config config;
|
||||
config.load_method = util::READ;
|
||||
std::cerr << "Using load_method = READ." << std::endl;
|
||||
Model model(file, config);
|
||||
lm::WordIndex bound = model.GetVocabulary().Bound();
|
||||
if (bound <= 256) {
|
||||
DispatchFunction<Model, uint8_t>(model, query);
|
||||
@ -116,11 +133,10 @@ int main(int argc, char *argv[]) {
|
||||
<< argv[0] << " vocab $model <$text >$text.vocab\n"
|
||||
<< "#Ensure files are in RAM.\n"
|
||||
<< "cat $text.vocab $model >/dev/null\n"
|
||||
<< "#Timed query against the model, including loading.\n"
|
||||
<< "time " << argv[0] << " query $model <$text.vocab\n";
|
||||
<< "#Timed query against the model.\n"
|
||||
<< argv[0] << " query $model <$text.vocab\n";
|
||||
return 1;
|
||||
}
|
||||
Dispatch(argv[2], !strcmp(argv[1], "query"));
|
||||
util::PrintUsage(std::cerr);
|
||||
return 0;
|
||||
}
|
||||
|
@ -3,7 +3,7 @@
|
||||
|
||||
#include "lm/enumerate_vocab.hh"
|
||||
#include "lm/model.hh"
|
||||
#include "util/fake_ofstream.hh"
|
||||
#include "util/file_stream.hh"
|
||||
#include "util/file_piece.hh"
|
||||
#include "util/usage.hh"
|
||||
|
||||
@ -42,7 +42,7 @@ class QueryPrinter {
|
||||
}
|
||||
|
||||
private:
|
||||
util::FakeOFStream out_;
|
||||
util::FileStream out_;
|
||||
bool print_word_;
|
||||
bool print_line_;
|
||||
bool print_summary_;
|
||||
|
@ -6,7 +6,7 @@
|
||||
#include "lm/config.hh"
|
||||
#include "lm/weights.hh"
|
||||
#include "util/exception.hh"
|
||||
#include "util/fake_ofstream.hh"
|
||||
#include "util/file_stream.hh"
|
||||
#include "util/file.hh"
|
||||
#include "util/joint_sort.hh"
|
||||
#include "util/murmur_hash.hh"
|
||||
@ -182,7 +182,7 @@ void SortedVocabulary::ComputeRenumbering(WordIndex types, int from_words, int t
|
||||
std::sort(entries.begin(), entries.end());
|
||||
// Write out new vocab file.
|
||||
{
|
||||
util::FakeOFStream out(to_words);
|
||||
util::FileStream out(to_words);
|
||||
out << "<unk>" << '\0';
|
||||
for (std::vector<RenumberEntry>::const_iterator i = entries.begin(); i != entries.end(); ++i) {
|
||||
out << i->str << '\0';
|
||||
|
@ -4,7 +4,7 @@
|
||||
#include "lm/enumerate_vocab.hh"
|
||||
#include "lm/lm_exception.hh"
|
||||
#include "lm/virtual_interface.hh"
|
||||
#include "util/fake_ofstream.hh"
|
||||
#include "util/file_stream.hh"
|
||||
#include "util/murmur_hash.hh"
|
||||
#include "util/pool.hh"
|
||||
#include "util/probing_hash_table.hh"
|
||||
@ -44,7 +44,7 @@ class ImmediateWriteWordsWrapper : public EnumerateVocab {
|
||||
private:
|
||||
EnumerateVocab *inner_;
|
||||
|
||||
util::FakeOFStream stream_;
|
||||
util::FileStream stream_;
|
||||
};
|
||||
|
||||
// When the binary size isn't known yet.
|
||||
@ -225,7 +225,7 @@ class WriteUniqueWords {
|
||||
}
|
||||
|
||||
private:
|
||||
util::FakeOFStream word_list_;
|
||||
util::FileStream word_list_;
|
||||
};
|
||||
|
||||
class NoOpUniqueWords {
|
||||
|
@ -7,47 +7,41 @@
|
||||
#include <cerrno>
|
||||
#include <cstring>
|
||||
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
#include <windows.h>
|
||||
#include <io.h>
|
||||
#endif
|
||||
|
||||
namespace util {
|
||||
|
||||
Exception::Exception() throw() {}
|
||||
Exception::~Exception() throw() {}
|
||||
|
||||
Exception::Exception(const Exception &from) : std::exception() {
|
||||
stream_ << from.stream_.str();
|
||||
}
|
||||
|
||||
Exception &Exception::operator=(const Exception &from) {
|
||||
stream_ << from.stream_.str();
|
||||
return *this;
|
||||
}
|
||||
|
||||
const char *Exception::what() const throw() {
|
||||
text_ = stream_.str();
|
||||
return text_.c_str();
|
||||
}
|
||||
|
||||
void Exception::SetLocation(const char *file, unsigned int line, const char *func, const char *child_name, const char *condition) {
|
||||
/* The child class might have set some text, but we want this to come first.
|
||||
* Another option would be passing this information to the constructor, but
|
||||
* then child classes would have to accept constructor arguments and pass
|
||||
* them down.
|
||||
*/
|
||||
text_ = stream_.str();
|
||||
stream_.str("");
|
||||
stream_ << file << ':' << line;
|
||||
if (func) stream_ << " in " << func << " threw ";
|
||||
std::string old_text;
|
||||
std::swap(old_text, what_);
|
||||
StringStream stream(what_);
|
||||
stream << file << ':' << line;
|
||||
if (func) stream << " in " << func << " threw ";
|
||||
if (child_name) {
|
||||
stream_ << child_name;
|
||||
stream << child_name;
|
||||
} else {
|
||||
#ifdef __GXX_RTTI
|
||||
stream_ << typeid(this).name();
|
||||
stream << typeid(this).name();
|
||||
#else
|
||||
stream_ << "an exception";
|
||||
stream << "an exception";
|
||||
#endif
|
||||
}
|
||||
if (condition) stream_ << " because `" << condition;
|
||||
stream_ << "'.\n";
|
||||
stream_ << text_;
|
||||
if (condition) {
|
||||
stream << " because `" << condition << '\'';
|
||||
}
|
||||
stream << ".\n";
|
||||
stream << old_text;
|
||||
}
|
||||
|
||||
namespace {
|
||||
@ -95,4 +89,17 @@ ErrnoException::~ErrnoException() throw() {}
|
||||
OverflowException::OverflowException() throw() {}
|
||||
OverflowException::~OverflowException() throw() {}
|
||||
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
WindowsException::WindowsException() throw() {
|
||||
unsigned int last_error = GetLastError();
|
||||
char error_msg[256] = "";
|
||||
if (!FormatMessageA(FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, NULL, last_error, LANG_NEUTRAL, error_msg, sizeof(error_msg), NULL)) {
|
||||
*this << "Windows error " << GetLastError() << " while formatting Windows error " << last_error << ". ";
|
||||
} else {
|
||||
*this << "Windows error " << last_error << ": " << error_msg;
|
||||
}
|
||||
}
|
||||
WindowsException::~WindowsException() throw() {}
|
||||
#endif
|
||||
|
||||
} // namespace util
|
||||
|
@ -1,12 +1,16 @@
|
||||
#ifndef UTIL_EXCEPTION_H
|
||||
#define UTIL_EXCEPTION_H
|
||||
|
||||
#include "util/string_stream.hh"
|
||||
|
||||
#include <exception>
|
||||
#include <limits>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <stdint.h>
|
||||
|
||||
// TODO(hieu) delete this
|
||||
#include <sstream>
|
||||
|
||||
namespace util {
|
||||
|
||||
template <class Except, class Data> typename Except::template ExceptionTag<Except&>::Identity operator<<(Except &e, const Data &data);
|
||||
@ -16,11 +20,7 @@ class Exception : public std::exception {
|
||||
Exception() throw();
|
||||
virtual ~Exception() throw();
|
||||
|
||||
Exception(const Exception &from);
|
||||
Exception &operator=(const Exception &from);
|
||||
|
||||
// Not threadsafe, but probably doesn't matter. FWIW, Boost's exception guidance implies that what() isn't threadsafe.
|
||||
const char *what() const throw();
|
||||
const char *what() const throw() { return what_.c_str(); }
|
||||
|
||||
// For use by the UTIL_THROW macros.
|
||||
void SetLocation(
|
||||
@ -38,8 +38,7 @@ class Exception : public std::exception {
|
||||
typedef T Identity;
|
||||
};
|
||||
|
||||
std::stringstream stream_;
|
||||
mutable std::string text_;
|
||||
std::string what_;
|
||||
};
|
||||
|
||||
/* This implements the normal operator<< for Exception and all its children.
|
||||
@ -47,7 +46,12 @@ class Exception : public std::exception {
|
||||
* boost::enable_if.
|
||||
*/
|
||||
template <class Except, class Data> typename Except::template ExceptionTag<Except&>::Identity operator<<(Except &e, const Data &data) {
|
||||
e.stream_ << data;
|
||||
// TODO(hieu): change this to
|
||||
// StringStream(e.what_) << data;
|
||||
|
||||
std::stringstream moses_hack;
|
||||
moses_hack << data;
|
||||
e.what_ += moses_hack.str();
|
||||
return e;
|
||||
}
|
||||
|
||||
@ -149,6 +153,15 @@ inline std::size_t CheckOverflow(uint64_t value) {
|
||||
return CheckOverflowInternal<sizeof(std::size_t)>(value);
|
||||
}
|
||||
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
/* Thrown for Windows specific operations. */
|
||||
class WindowsException : public Exception {
|
||||
public:
|
||||
WindowsException() throw();
|
||||
~WindowsException() throw();
|
||||
};
|
||||
#endif
|
||||
|
||||
} // namespace util
|
||||
|
||||
#endif // UTIL_EXCEPTION_H
|
||||
|
@ -1,137 +0,0 @@
|
||||
/* Like std::ofstream but without being incredibly slow. Backed by a raw fd.
|
||||
* Supports most of the built-in types except for void* and long double.
|
||||
*/
|
||||
#ifndef UTIL_FAKE_OFSTREAM_H
|
||||
#define UTIL_FAKE_OFSTREAM_H
|
||||
|
||||
#include "util/file.hh"
|
||||
#include "util/float_to_string.hh"
|
||||
#include "util/integer_to_string.hh"
|
||||
#include "util/scoped.hh"
|
||||
#include "util/string_piece.hh"
|
||||
|
||||
#include <cassert>
|
||||
#include <cstring>
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
namespace util {
|
||||
class FakeOFStream {
|
||||
public:
|
||||
// Maximum over all ToString operations.
|
||||
// static const std::size_t kMinBuf = 20;
|
||||
// This was causing compile failures in debug, so now 20 is written directly.
|
||||
//
|
||||
// Does not take ownership of out.
|
||||
// Allows default constructor, but must call SetFD.
|
||||
explicit FakeOFStream(int out = -1, std::size_t buffer_size = 1048576)
|
||||
: buf_(util::MallocOrThrow(std::max(buffer_size, (size_t)20))),
|
||||
current_(static_cast<char*>(buf_.get())),
|
||||
end_(current_ + std::max(buffer_size, (size_t)20)),
|
||||
fd_(out) {}
|
||||
|
||||
~FakeOFStream() {
|
||||
// Could have called Finish already
|
||||
flush();
|
||||
}
|
||||
|
||||
void SetFD(int to) {
|
||||
flush();
|
||||
fd_ = to;
|
||||
}
|
||||
|
||||
FakeOFStream &write(const void *data, std::size_t length) {
|
||||
if (UTIL_LIKELY(current_ + length <= end_)) {
|
||||
std::memcpy(current_, data, length);
|
||||
current_ += length;
|
||||
return *this;
|
||||
}
|
||||
flush();
|
||||
if (current_ + length <= end_) {
|
||||
std::memcpy(current_, data, length);
|
||||
current_ += length;
|
||||
} else {
|
||||
util::WriteOrThrow(fd_, data, length);
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
// This also covers std::string and char*
|
||||
FakeOFStream &operator<<(StringPiece str) {
|
||||
return write(str.data(), str.size());
|
||||
}
|
||||
|
||||
// For anything with ToStringBuf<T>::kBytes, define operator<< using ToString.
|
||||
// This includes uint64_t, int64_t, uint32_t, int32_t, uint16_t, int16_t,
|
||||
// float, double
|
||||
private:
|
||||
template <int Arg> struct EnableIfKludge {
|
||||
typedef FakeOFStream type;
|
||||
};
|
||||
public:
|
||||
template <class T> typename EnableIfKludge<ToStringBuf<T>::kBytes>::type &operator<<(const T value) {
|
||||
EnsureRemaining(ToStringBuf<T>::kBytes);
|
||||
current_ = ToString(value, current_);
|
||||
assert(current_ <= end_);
|
||||
return *this;
|
||||
}
|
||||
|
||||
FakeOFStream &operator<<(char c) {
|
||||
EnsureRemaining(1);
|
||||
*current_++ = c;
|
||||
return *this;
|
||||
}
|
||||
|
||||
FakeOFStream &operator<<(unsigned char c) {
|
||||
EnsureRemaining(1);
|
||||
*current_++ = static_cast<char>(c);
|
||||
return *this;
|
||||
}
|
||||
|
||||
/* clang on OS X appears to consider std::size_t aka unsigned long distinct
|
||||
* from uint64_t. So this function makes clang work. gcc considers
|
||||
* uint64_t and std::size_t the same (on 64-bit) so this isn't necessary.
|
||||
* But it does no harm since gcc sees it as a specialization of the
|
||||
* EnableIfKludge template.
|
||||
* Also, delegating to *this << static_cast<uint64_t>(value) would loop
|
||||
* indefinitely on gcc.
|
||||
*/
|
||||
FakeOFStream &operator<<(std::size_t value) {
|
||||
EnsureRemaining(ToStringBuf<uint64_t>::kBytes);
|
||||
current_ = ToString(static_cast<uint64_t>(value), current_);
|
||||
return *this;
|
||||
}
|
||||
|
||||
// Note this does not sync.
|
||||
void flush() {
|
||||
if (current_ != buf_.get()) {
|
||||
util::WriteOrThrow(fd_, buf_.get(), current_ - (char*)buf_.get());
|
||||
current_ = static_cast<char*>(buf_.get());
|
||||
}
|
||||
}
|
||||
|
||||
// Not necessary, but does assure the data is cleared.
|
||||
void Finish() {
|
||||
flush();
|
||||
buf_.reset();
|
||||
current_ = NULL;
|
||||
util::FSyncOrThrow(fd_);
|
||||
}
|
||||
|
||||
private:
|
||||
void EnsureRemaining(std::size_t amount) {
|
||||
if (UTIL_UNLIKELY(current_ + amount > end_)) {
|
||||
flush();
|
||||
assert(current_ + amount <= end_);
|
||||
}
|
||||
}
|
||||
|
||||
util::scoped_malloc buf_;
|
||||
char *current_, *end_;
|
||||
|
||||
int fd_;
|
||||
};
|
||||
|
||||
} // namespace
|
||||
|
||||
#endif
|
128
util/fake_ostream.hh
Normal file
128
util/fake_ostream.hh
Normal file
@ -0,0 +1,128 @@
|
||||
#ifndef UTIL_FAKE_OSTREAM_H
|
||||
#define UTIL_FAKE_OSTREAM_H
|
||||
|
||||
#include "util/float_to_string.hh"
|
||||
#include "util/integer_to_string.hh"
|
||||
#include "util/string_piece.hh"
|
||||
|
||||
#include <cassert>
|
||||
#include <limits>
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
namespace util {
|
||||
|
||||
/* Like std::ostream but without being incredibly slow.
|
||||
* Supports most of the built-in types except for long double.
|
||||
*
|
||||
* The FakeOStream class is intended to be inherited from. The inheriting class
|
||||
* should provide:
|
||||
* public:
|
||||
* Derived &flush();
|
||||
* Derived &write(const void *data, std::size_t length);
|
||||
*
|
||||
* private: or protected:
|
||||
* friend class FakeOStream;
|
||||
* char *Ensure(std::size_t amount);
|
||||
* void AdvanceTo(char *to);
|
||||
*
|
||||
* The Ensure function makes enough space for an in-place write and returns
|
||||
* where to write. The AdvanceTo function happens after the write, saying how
|
||||
* much was actually written.
|
||||
*
|
||||
* Precondition:
|
||||
* amount <= kToStringMaxBytes for in-place writes.
|
||||
*/
|
||||
template <class Derived> class FakeOStream {
|
||||
public:
|
||||
FakeOStream() {}
|
||||
|
||||
// This also covers std::string and char*
|
||||
Derived &operator<<(StringPiece str) {
|
||||
return C().write(str.data(), str.size());
|
||||
}
|
||||
|
||||
// For anything with ToStringBuf<T>::kBytes, define operator<< using ToString.
|
||||
// This includes uint64_t, int64_t, uint32_t, int32_t, uint16_t, int16_t,
|
||||
// float, double
|
||||
private:
|
||||
template <int Arg> struct EnableIfKludge {
|
||||
typedef Derived type;
|
||||
};
|
||||
public:
|
||||
template <class T> typename EnableIfKludge<ToStringBuf<T>::kBytes>::type &operator<<(const T value) {
|
||||
return CallToString(value);
|
||||
}
|
||||
|
||||
/* clang on OS X appears to consider std::size_t aka unsigned long distinct
|
||||
* from uint64_t. So this function makes clang work. gcc considers
|
||||
* uint64_t and std::size_t the same (on 64-bit) so this isn't necessary.
|
||||
* But it does no harm since gcc sees it as a specialization of the
|
||||
* EnableIfKludge template.
|
||||
* Also, delegating to *this << static_cast<uint64_t>(value) would loop
|
||||
* indefinitely on gcc.
|
||||
*/
|
||||
Derived &operator<<(std::size_t value) { return CoerceToString(value); }
|
||||
|
||||
// union types will map to int, but don't pass the template magic above in gcc.
|
||||
Derived &operator<<(int value) { return CoerceToString(value); }
|
||||
|
||||
// gcc considers these distinct from uint64_t
|
||||
Derived &operator<<(unsigned long long value) { return CoerceToString(value); }
|
||||
Derived &operator<<(signed long long value) { return CoerceToString(value); }
|
||||
|
||||
// Character types that get copied as bytes instead of displayed as integers.
|
||||
Derived &operator<<(char val) { return put(val); }
|
||||
Derived &operator<<(signed char val) { return put(static_cast<char>(val)); }
|
||||
Derived &operator<<(unsigned char val) { return put(static_cast<char>(val)); }
|
||||
|
||||
// This is here to catch all the other pointer types.
|
||||
Derived &operator<<(const void *value) { return CallToString(value); }
|
||||
// This is here because the above line also catches const char*.
|
||||
Derived &operator<<(const char *value) { return *this << StringPiece(value); }
|
||||
Derived &operator<<(char *value) { return *this << StringPiece(value); }
|
||||
|
||||
Derived &put(char val) {
|
||||
char *c = C().Ensure(1);
|
||||
*c = val;
|
||||
C().AdvanceTo(++c);
|
||||
return C();
|
||||
}
|
||||
|
||||
char widen(char val) const { return val; }
|
||||
|
||||
private:
|
||||
// References to derived class for convenience.
|
||||
Derived &C() {
|
||||
return *static_cast<Derived*>(this);
|
||||
}
|
||||
|
||||
const Derived &C() const {
|
||||
return *static_cast<const Derived*>(this);
|
||||
}
|
||||
|
||||
template <class From, unsigned Length = sizeof(From), bool Signed = std::numeric_limits<From>::is_signed> struct Coerce {};
|
||||
|
||||
template <class From> struct Coerce<From, 2, false> { typedef uint16_t To; };
|
||||
template <class From> struct Coerce<From, 4, false> { typedef uint32_t To; };
|
||||
template <class From> struct Coerce<From, 8, false> { typedef uint64_t To; };
|
||||
|
||||
template <class From> struct Coerce<From, 2, true> { typedef int16_t To; };
|
||||
template <class From> struct Coerce<From, 4, true> { typedef int32_t To; };
|
||||
template <class From> struct Coerce<From, 8, true> { typedef int64_t To; };
|
||||
|
||||
template <class From> Derived &CoerceToString(const From value) {
|
||||
return CallToString(static_cast<typename Coerce<From>::To>(value));
|
||||
}
|
||||
|
||||
// This is separate to prevent an infinite loop if the compiler considers
|
||||
// types the same (i.e. gcc std::size_t and uint64_t or uint32_t).
|
||||
template <class T> Derived &CallToString(const T value) {
|
||||
C().AdvanceTo(ToString(value, C().Ensure(ToStringBuf<T>::kBytes)));
|
||||
return C();
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace
|
||||
|
||||
#endif // UTIL_FAKE_OSTREAM_H
|
28
util/file.cc
28
util/file.cc
@ -147,17 +147,33 @@ std::size_t GuardLarge(std::size_t size) {
|
||||
}
|
||||
}
|
||||
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
namespace {
|
||||
const std::size_t kMaxDWORD = static_cast<std::size_t>(4294967295UL);
|
||||
} // namespace
|
||||
#endif
|
||||
|
||||
std::size_t PartialRead(int fd, void *to, std::size_t amount) {
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
int ret = _read(fd, to, GuardLarge(amount));
|
||||
DWORD ret;
|
||||
HANDLE file_handle = reinterpret_cast<HANDLE>(_get_osfhandle(fd));
|
||||
DWORD larger_size = static_cast<DWORD>(std::min<std::size_t>(kMaxDWORD, amount));
|
||||
DWORD smaller_size = 28672; // Received reports that 31346 worked but higher values did not. This rounds down to the nearest multiple of 4096, the page size.
|
||||
if (!ReadFile(file_handle, to, larger_size, &ret, NULL))
|
||||
{
|
||||
DWORD last_error = GetLastError();
|
||||
if (last_error != ERROR_NOT_ENOUGH_MEMORY || !ReadFile(file_handle, to, smaller_size, &ret, NULL)) {
|
||||
UTIL_THROW(WindowsException, "Windows error in ReadFile.");
|
||||
}
|
||||
}
|
||||
#else
|
||||
errno = 0;
|
||||
ssize_t ret;
|
||||
do {
|
||||
ret = read(fd, to, GuardLarge(amount));
|
||||
} while (ret == -1 && errno == EINTR);
|
||||
#endif
|
||||
UTIL_THROW_IF_ARG(ret < 0, FDException, (fd), "while reading " << amount << " bytes");
|
||||
#endif
|
||||
return static_cast<std::size_t>(ret);
|
||||
}
|
||||
|
||||
@ -212,12 +228,6 @@ void WriteOrThrow(FILE *to, const void *data, std::size_t size) {
|
||||
UTIL_THROW_IF(1 != std::fwrite(data, size, 1, to), ErrnoException, "Short write; requested size " << size);
|
||||
}
|
||||
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
namespace {
|
||||
const std::size_t kMaxDWORD = static_cast<std::size_t>(4294967295UL);
|
||||
} // namespace
|
||||
#endif
|
||||
|
||||
void ErsatzPRead(int fd, void *to_void, std::size_t size, uint64_t off) {
|
||||
uint8_t *to = static_cast<uint8_t*>(to_void);
|
||||
while (size) {
|
||||
@ -230,7 +240,7 @@ void ErsatzPRead(int fd, void *to_void, std::size_t size, uint64_t off) {
|
||||
memset(&overlapped, 0, sizeof(OVERLAPPED));
|
||||
overlapped.Offset = static_cast<DWORD>(off);
|
||||
overlapped.OffsetHigh = static_cast<DWORD>(off >> 32);
|
||||
UTIL_THROW_IF(!ReadFile((HANDLE)_get_osfhandle(fd), to, reading, &ret, &overlapped), Exception, "ReadFile failed for offset " << off);
|
||||
UTIL_THROW_IF(!ReadFile((HANDLE)_get_osfhandle(fd), to, reading, &ret, &overlapped), WindowsException, "ReadFile failed for offset " << off);
|
||||
#else
|
||||
ssize_t ret;
|
||||
errno = 0;
|
||||
|
@ -56,7 +56,7 @@ FilePiece::FilePiece(std::istream &stream, const char *name, std::size_t min_buf
|
||||
InitializeNoRead("istream", min_buffer);
|
||||
|
||||
fallback_to_read_ = true;
|
||||
data_.reset(MallocOrThrow(default_map_size_), default_map_size_, scoped_memory::MALLOC_ALLOCATED);
|
||||
HugeMalloc(default_map_size_, false, data_);
|
||||
position_ = data_.begin();
|
||||
position_end_ = position_;
|
||||
|
||||
@ -282,7 +282,7 @@ void FilePiece::TransitionToRead() {
|
||||
assert(!fallback_to_read_);
|
||||
fallback_to_read_ = true;
|
||||
data_.reset();
|
||||
data_.reset(MallocOrThrow(default_map_size_), default_map_size_, scoped_memory::MALLOC_ALLOCATED);
|
||||
HugeMalloc(default_map_size_, false, data_);
|
||||
position_ = data_.begin();
|
||||
position_end_ = position_;
|
||||
|
||||
@ -313,8 +313,7 @@ void FilePiece::ReadShift() {
|
||||
// Buffer too small.
|
||||
std::size_t valid_length = position_end_ - position_;
|
||||
default_map_size_ *= 2;
|
||||
data_.call_realloc(default_map_size_);
|
||||
UTIL_THROW_IF(!data_.get(), ErrnoException, "realloc failed for " << default_map_size_);
|
||||
HugeRealloc(default_map_size_, false, data_);
|
||||
position_ = data_.begin();
|
||||
position_end_ = position_ + valid_length;
|
||||
} else {
|
||||
|
@ -1,7 +1,7 @@
|
||||
// Tests might fail if you have creative characters in your path. Sue me.
|
||||
#include "util/file_piece.hh"
|
||||
|
||||
#include "util/fake_ofstream.hh"
|
||||
#include "util/file_stream.hh"
|
||||
#include "util/file.hh"
|
||||
#include "util/scoped.hh"
|
||||
|
||||
@ -138,7 +138,7 @@ BOOST_AUTO_TEST_CASE(Numbers) {
|
||||
scoped_fd file(MakeTemp(FileLocation()));
|
||||
const float floating = 3.2;
|
||||
{
|
||||
util::FakeOFStream writing(file.get());
|
||||
util::FileStream writing(file.get());
|
||||
writing << "94389483984398493890287 " << floating << " 5";
|
||||
}
|
||||
SeekOrThrow(file.get(), 0);
|
||||
|
89
util/file_stream.hh
Normal file
89
util/file_stream.hh
Normal file
@ -0,0 +1,89 @@
|
||||
/* Like std::ofstream but without being incredibly slow. Backed by a raw fd.
|
||||
* Supports most of the built-in types except for long double.
|
||||
*/
|
||||
#ifndef UTIL_FILE_STREAM_H
|
||||
#define UTIL_FILE_STREAM_H
|
||||
|
||||
#include "util/fake_ostream.hh"
|
||||
#include "util/file.hh"
|
||||
#include "util/scoped.hh"
|
||||
|
||||
#include <cassert>
|
||||
#include <cstring>
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
namespace util {
|
||||
|
||||
// Buffered writer on top of a raw file descriptor.  Formatting operators come
// from FakeOStream; this class provides the buffer and the actual writes.
//
// The buffer holds at least kToStringMaxBytes so that any single in-place
// conversion requested through Ensure() fits after a flush.
class FileStream : public FakeOStream<FileStream> {
  public:
    // out: descriptor to write to (may be set later with SetFD).
    // buffer_size: requested buffer capacity in bytes; raised to
    //   kToStringMaxBytes if smaller.
    FileStream(int out = -1, std::size_t buffer_size = 8192)
      : buf_(util::MallocOrThrow(std::max<std::size_t>(buffer_size, kToStringMaxBytes))),
        current_(static_cast<char*>(buf_.get())),
        end_(current_ + std::max<std::size_t>(buffer_size, kToStringMaxBytes)),
        fd_(out) {}

    // Writes out whatever is still buffered.  flush() reports errors by
    // throwing, so call flush() explicitly beforehand if write failures must
    // be handled: an exception leaving a destructor normally terminates the
    // program under C++11 and later.
    ~FileStream() {
      flush();
    }

    // Switch to another descriptor; pending bytes go to the old one first.
    void SetFD(int to) {
      flush();
      fd_ = to;
    }

    // Write all buffered bytes to fd_ and rewind the buffer.
    FileStream &flush() {
      char *const begin = static_cast<char*>(buf_.get());
      if (current_ != begin) {
        util::WriteOrThrow(fd_, begin, current_ - begin);
        current_ = begin;
      }
      return *this;
    }

    // For writes of arbitrary size.  Data that fits is buffered; data larger
    // than the whole buffer is written straight to fd_ after a flush.
    FileStream &write(const void *data, std::size_t length) {
      if (UTIL_LIKELY(length <= Remaining())) {
        std::memcpy(current_, data, length);
        current_ += length;
        return *this;
      }
      flush();
      if (length <= Remaining()) {
        std::memcpy(current_, data, length);
        current_ += length;
      } else {
        util::WriteOrThrow(fd_, data, length);
      }
      return *this;
    }

    // Move the descriptor's position.  Buffered bytes are flushed first so
    // they are written at the position they were produced for, not at `to`.
    FileStream &seekp(uint64_t to) {
      flush();
      util::SeekOrThrow(fd_, to);
      return *this;
    }

  protected:
    friend class FakeOStream<FileStream>;

    // For writes directly to buffer guaranteed to have amount < buffer size.
    // Returns where the caller may write `amount` bytes.
    char *Ensure(std::size_t amount) {
      if (UTIL_UNLIKELY(amount > Remaining())) {
        flush();
        assert(amount <= Remaining());
      }
      return current_;
    }

    // Record that the caller wrote up to (not including) `to`.
    void AdvanceTo(char *to) {
      current_ = to;
      assert(current_ <= end_);
    }

  private:
    // Free bytes left in the buffer.  Comparing lengths against this avoids
    // forming current_ + length, which may point outside the buffer.
    std::size_t Remaining() const {
      return static_cast<std::size_t>(end_ - current_);
    }

    util::scoped_malloc buf_;
    char *current_, *end_;
    int fd_;
};
|
||||
|
||||
} // namespace
|
||||
|
||||
#endif
|
@ -1,3 +1,4 @@
|
||||
#include <iostream>
|
||||
/* Fast integer to string conversion.
|
||||
Source: https://github.com/miloyip/itoa-benchmark
|
||||
Local modifications:
|
||||
@ -637,4 +638,28 @@ char *ToString(uint16_t value, char *to) {
|
||||
return ToString((uint32_t)value, to);
|
||||
}
|
||||
|
||||
// void * to string. This hasn't been optimized at all really.
|
||||
namespace {
// Lookup table from nibble value to lowercase hexadecimal digit.
const char kHexDigits[] = "0123456789abcdef";
} // namespace

// Format a pointer: "0" for a null pointer, otherwise "0x" followed by the
// address in lowercase hex without leading zeros.  Returns one past the last
// character written; no terminating NUL is added.
char *ToString(const void *v, char *to) {
  if (!v) {
    // Null is printed as a bare 0, not 0x0.
    *to = '0';
    return to + 1;
  }

  const uintptr_t bits = reinterpret_cast<uintptr_t>(v);
  *to++ = '0';
  *to++ = 'x';

  // Find the bit offset of the most significant non-zero nibble.  bits is
  // non-zero here, so the search always stops at an offset >= 0.
  int top = static_cast<int>(sizeof(void*) * 8) - 4;
  while (!(bits >> top)) top -= 4;

  // Emit nibbles from that one down to the least significant.
  for (int offset = top; offset >= 0; offset -= 4) {
    *to++ = kHexDigits[(bits >> offset) & 0xf];
  }
  return to;
}
|
||||
|
||||
} // namespace util
|
||||
|
@ -18,6 +18,8 @@ char *ToString(int64_t value, char *to);
|
||||
char *ToString(uint16_t value, char *to);
|
||||
char *ToString(int16_t value, char *to);
|
||||
|
||||
char *ToString(const void *value, char *to);
|
||||
|
||||
inline char *ToString(bool value, char *to) {
|
||||
*to++ = '0' + value;
|
||||
return to;
|
||||
@ -51,6 +53,14 @@ template <> struct ToStringBuf<int64_t> {
|
||||
enum { kBytes = 20 };
|
||||
};
|
||||
|
||||
template <> struct ToStringBuf<const void*> {
|
||||
// Either 18 on 64-bit or 10 on 32-bit.
|
||||
enum { kBytes = sizeof(const void*) * 2 + 2 };
|
||||
};
|
||||
|
||||
// Maximum over this and float.
|
||||
enum { kToStringMaxBytes = 20 };
|
||||
|
||||
} // namespace util
|
||||
|
||||
#endif // UTIL_INTEGER_TO_STRING_H
|
||||
|
@ -21,9 +21,9 @@ template <class T> void TestValue(const T value) {
|
||||
template <class T> void TestCorners() {
|
||||
TestValue(std::numeric_limits<T>::min());
|
||||
TestValue(std::numeric_limits<T>::max());
|
||||
TestValue(static_cast<T>(0));
|
||||
TestValue(static_cast<T>(-1));
|
||||
TestValue(static_cast<T>(1));
|
||||
TestValue((T)0);
|
||||
TestValue((T)-1);
|
||||
TestValue((T)1);
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE(Corners) {
|
||||
@ -33,6 +33,7 @@ BOOST_AUTO_TEST_CASE(Corners) {
|
||||
TestCorners<int16_t>();
|
||||
TestCorners<int32_t>();
|
||||
TestCorners<int64_t>();
|
||||
TestCorners<const void*>();
|
||||
}
|
||||
|
||||
template <class T> void TestAll() {
|
||||
@ -62,4 +63,14 @@ BOOST_AUTO_TEST_CASE(Tens) {
|
||||
Test10s<int32_t>();
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE(Pointers) {
|
||||
for (uintptr_t i = 1; i < std::numeric_limits<uintptr_t>::max() / 10; i *= 10) {
|
||||
TestValue((const void*)i);
|
||||
}
|
||||
for (uintptr_t i = 0; i < 256; ++i) {
|
||||
TestValue((const void*)i);
|
||||
TestValue((const void*)(i + 0xf00));
|
||||
}
|
||||
}
|
||||
|
||||
}} // namespaces
|
||||
|
233
util/mmap.cc
233
util/mmap.cc
@ -27,7 +27,7 @@
|
||||
|
||||
namespace util {
|
||||
|
||||
long SizePage() {
|
||||
std::size_t SizePage() {
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
SYSTEM_INFO si;
|
||||
GetSystemInfo(&si);
|
||||
@ -37,22 +37,6 @@ long SizePage() {
|
||||
#endif
|
||||
}
|
||||
|
||||
void SyncOrThrow(void *start, size_t length) {
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
UTIL_THROW_IF(!::FlushViewOfFile(start, length), ErrnoException, "Failed to sync mmap");
|
||||
#else
|
||||
UTIL_THROW_IF(length && msync(start, length, MS_SYNC), ErrnoException, "Failed to sync mmap");
|
||||
#endif
|
||||
}
|
||||
|
||||
void UnmapOrThrow(void *start, size_t length) {
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
UTIL_THROW_IF(!::UnmapViewOfFile(start), ErrnoException, "Failed to unmap a file");
|
||||
#else
|
||||
UTIL_THROW_IF(munmap(start, length), ErrnoException, "munmap failed");
|
||||
#endif
|
||||
}
|
||||
|
||||
scoped_mmap::~scoped_mmap() {
|
||||
if (data_ != (void*)-1) {
|
||||
try {
|
||||
@ -66,14 +50,24 @@ scoped_mmap::~scoped_mmap() {
|
||||
}
|
||||
}
|
||||
|
||||
namespace {
// Round value up to the nearest multiple of mult.  mult has to be a power of
// two, because the computation clears the low bits with a mask.
template <class T> T RoundUpPow2(T value, T mult) {
  const T low_bits = mult - 1;
  // Largest multiple of mult strictly below value, then one step up.
  const T below = (value - 1) & ~low_bits;
  return below + mult;
}
} // namespace
|
||||
|
||||
scoped_memory::scoped_memory(std::size_t size, bool zeroed) : data_(NULL), size_(0), source_(NONE_ALLOCATED) {
|
||||
HugeMalloc(size, zeroed, *this);
|
||||
}
|
||||
|
||||
void scoped_memory::reset(void *data, std::size_t size, Alloc source) {
|
||||
switch(source_) {
|
||||
case MMAP_ROUND_UP_ALLOCATED:
|
||||
scoped_mmap(data_, RoundUpPow2(size_, (std::size_t)SizePage()));
|
||||
break;
|
||||
case MMAP_ALLOCATED:
|
||||
scoped_mmap(data_, size_);
|
||||
break;
|
||||
case ARRAY_ALLOCATED:
|
||||
delete [] reinterpret_cast<char*>(data_);
|
||||
break;
|
||||
case MALLOC_ALLOCATED:
|
||||
free(data_);
|
||||
break;
|
||||
@ -85,7 +79,7 @@ void scoped_memory::reset(void *data, std::size_t size, Alloc source) {
|
||||
source_ = source;
|
||||
}
|
||||
|
||||
void scoped_memory::call_realloc(std::size_t size) {
|
||||
/*void scoped_memory::call_realloc(std::size_t size) {
|
||||
assert(source_ == MALLOC_ALLOCATED || source_ == NONE_ALLOCATED);
|
||||
void *new_data = realloc(data_, size);
|
||||
if (!new_data) {
|
||||
@ -95,7 +89,17 @@ void scoped_memory::call_realloc(std::size_t size) {
|
||||
size_ = size;
|
||||
source_ = MALLOC_ALLOCATED;
|
||||
}
|
||||
}
|
||||
}*/
|
||||
|
||||
const int kFileFlags =
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
0 // MapOrThrow ignores flags on windows
|
||||
#elif defined(MAP_FILE)
|
||||
MAP_FILE | MAP_SHARED
|
||||
#else
|
||||
MAP_SHARED
|
||||
#endif
|
||||
;
|
||||
|
||||
void *MapOrThrow(std::size_t size, bool for_write, int flags, bool prefault, int fd, uint64_t offset) {
|
||||
#ifdef MAP_POPULATE // Linux specific
|
||||
@ -126,15 +130,168 @@ void *MapOrThrow(std::size_t size, bool for_write, int flags, bool prefault, int
|
||||
return ret;
|
||||
}
|
||||
|
||||
const int kFileFlags =
|
||||
void SyncOrThrow(void *start, size_t length) {
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
0 // MapOrThrow ignores flags on windows
|
||||
#elif defined(MAP_FILE)
|
||||
MAP_FILE | MAP_SHARED
|
||||
UTIL_THROW_IF(!::FlushViewOfFile(start, length), ErrnoException, "Failed to sync mmap");
|
||||
#else
|
||||
MAP_SHARED
|
||||
UTIL_THROW_IF(length && msync(start, length, MS_SYNC), ErrnoException, "Failed to sync mmap");
|
||||
#endif
|
||||
;
|
||||
}
|
||||
|
||||
void UnmapOrThrow(void *start, size_t length) {
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
UTIL_THROW_IF(!::UnmapViewOfFile(start), ErrnoException, "Failed to unmap a file");
|
||||
#else
|
||||
UTIL_THROW_IF(munmap(start, length), ErrnoException, "munmap failed");
|
||||
#endif
|
||||
}
|
||||
|
||||
// Linux huge pages.
|
||||
#ifdef __linux__
|
||||
|
||||
namespace {
|
||||
|
||||
bool AnonymousMap(std::size_t size, int flags, bool populate, util::scoped_memory &to) {
|
||||
if (populate) flags |= MAP_POPULATE;
|
||||
void *ret = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | flags, -1, 0);
|
||||
if (ret == MAP_FAILED) return false;
|
||||
to.reset(ret, size, scoped_memory::MMAP_ALLOCATED);
|
||||
return true;
|
||||
}
|
||||
|
||||
bool TryHuge(std::size_t size, uint8_t alignment_bits, bool populate, util::scoped_memory &to) {
|
||||
// Don't bother with these cases.
|
||||
if (size < (1ULL << alignment_bits) || (1ULL << alignment_bits) < SizePage())
|
||||
return false;
|
||||
|
||||
// First try: Linux >= 3.8 with manually configured hugetlb pages available.
|
||||
#ifdef MAP_HUGE_SHIFT
|
||||
if (AnonymousMap(size, MAP_HUGETLB | (alignment_bits << MAP_HUGE_SHIFT), populate, to))
|
||||
return true;
|
||||
#endif
|
||||
|
||||
// Second try: manually configured hugetlb pages exist, but kernel too old to
|
||||
// pick size or not available. This might pick the wrong size huge pages,
|
||||
// but the sysadmin must have made them available in the first place.
|
||||
if (AnonymousMap(size, MAP_HUGETLB, populate, to))
|
||||
return true;
|
||||
|
||||
// Third try: align to a multiple of the huge page size by overallocating.
|
||||
// I feel bad about doing this, but it's also how posix_memalign is
|
||||
// implemented. And the memory is virtual.
|
||||
|
||||
// Round up requested size to multiple of page size. This will allow the pages after to be munmapped.
|
||||
std::size_t size_up = RoundUpPow2(size, SizePage());
|
||||
|
||||
std::size_t ask = size_up + (1 << alignment_bits) - SizePage();
|
||||
// Don't populate because this is asking for more than we will use.
|
||||
scoped_mmap larger(mmap(NULL, ask, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0), ask);
|
||||
if (larger.get() == MAP_FAILED) return false;
|
||||
|
||||
// Throw out pages before the alignment point.
|
||||
uintptr_t base = reinterpret_cast<uintptr_t>(larger.get());
|
||||
// Round up to next multiple of alignment.
|
||||
uintptr_t rounded_up = RoundUpPow2(base, static_cast<uintptr_t>(1) << alignment_bits);
|
||||
if (base != rounded_up) {
|
||||
// If this throws an exception (which it shouldn't) then we want to unmap the whole thing by keeping it in larger.
|
||||
UnmapOrThrow(larger.get(), rounded_up - base);
|
||||
larger.steal();
|
||||
larger.reset(reinterpret_cast<void*>(rounded_up), ask - (rounded_up - base));
|
||||
}
|
||||
|
||||
// Throw out pages after the requested size.
|
||||
assert(larger.size() >= size_up);
|
||||
if (larger.size() > size_up) {
|
||||
// This is where we assume size_up is a multiple of page size.
|
||||
UnmapOrThrow(static_cast<uint8_t*>(larger.get()) + size_up, larger.size() - size_up);
|
||||
larger.reset(larger.steal(), size_up);
|
||||
}
|
||||
madvise(larger.get(), size_up, MADV_HUGEPAGE);
|
||||
to.reset(larger.steal(), size, scoped_memory::MMAP_ROUND_UP_ALLOCATED);
|
||||
return true;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
#endif
|
||||
|
||||
void HugeMalloc(std::size_t size, bool zeroed, scoped_memory &to) {
|
||||
to.reset();
|
||||
#ifdef __linux__
|
||||
// TODO: architectures/page sizes other than 2^21 and 2^30.
|
||||
// Attempt 1 GB pages.
|
||||
// If the user asked for zeroed memory, assume they want it populated.
|
||||
if (size >= (1ULL << 30) && TryHuge(size, 30, zeroed, to))
|
||||
return;
|
||||
// Attempt 2 MB pages.
|
||||
if (size >= (1ULL << 21) && TryHuge(size, 21, zeroed, to))
|
||||
return;
|
||||
#endif // __linux__
|
||||
// Non-linux will always do this, as will small allocations on Linux.
|
||||
to.reset(zeroed ? calloc(1, size) : malloc(size), size, scoped_memory::MALLOC_ALLOCATED);
|
||||
UTIL_THROW_IF(!to.get(), ErrnoException, "Failed to allocate " << size << " bytes");
|
||||
}
|
||||
|
||||
#ifdef __linux__
|
||||
const std::size_t kTransitionHuge = std::max<std::size_t>(1ULL << 21, SizePage());
|
||||
#endif // __linux__
|
||||
|
||||
void HugeRealloc(std::size_t to, bool zero_new, scoped_memory &mem) {
|
||||
if (!to) {
|
||||
mem.reset();
|
||||
return;
|
||||
}
|
||||
std::size_t from_size = mem.size();
|
||||
switch (mem.source()) {
|
||||
case scoped_memory::NONE_ALLOCATED:
|
||||
HugeMalloc(to, zero_new, mem);
|
||||
return;
|
||||
#ifdef __linux__
|
||||
case scoped_memory::MMAP_ROUND_UP_ALLOCATED:
|
||||
// for mremap's benefit.
|
||||
from_size = RoundUpPow2(from_size, SizePage());
|
||||
case scoped_memory::MMAP_ALLOCATED:
|
||||
// Downsizing below barrier?
|
||||
if (to <= SizePage()) {
|
||||
scoped_malloc replacement(malloc(to));
|
||||
memcpy(replacement.get(), mem.get(), std::min(to, mem.size()));
|
||||
if (zero_new && to > mem.size())
|
||||
memset(static_cast<uint8_t*>(replacement.get()) + mem.size(), 0, to - mem.size());
|
||||
mem.reset(replacement.release(), to, scoped_memory::MALLOC_ALLOCATED);
|
||||
} else {
|
||||
void *new_addr = mremap(mem.get(), from_size, to, MREMAP_MAYMOVE);
|
||||
UTIL_THROW_IF(!new_addr, ErrnoException, "Failed to mremap from " << from_size << " to " << to);
|
||||
mem.steal();
|
||||
mem.reset(new_addr, to, scoped_memory::MMAP_ALLOCATED);
|
||||
}
|
||||
return;
|
||||
#endif // __linux__
|
||||
case scoped_memory::MALLOC_ALLOCATED:
|
||||
#ifdef __linux__
|
||||
// Transition larger allocations to huge pages, but don't keep trying if we're still malloc allocated.
|
||||
if (to >= kTransitionHuge && mem.size() < kTransitionHuge) {
|
||||
scoped_memory replacement;
|
||||
HugeMalloc(to, zero_new, replacement);
|
||||
memcpy(replacement.get(), mem.get(), mem.size());
|
||||
// This can't throw.
|
||||
mem.reset(replacement.get(), replacement.size(), replacement.source());
|
||||
replacement.steal();
|
||||
return;
|
||||
}
|
||||
#endif // __linux__
|
||||
{
|
||||
void *new_addr = std::realloc(mem.get(), to);
|
||||
UTIL_THROW_IF(!new_addr, ErrnoException, "realloc to " << to << " bytes failed.");
|
||||
if (zero_new && to > mem.size())
|
||||
memset(static_cast<uint8_t*>(new_addr) + mem.size(), 0, to - mem.size());
|
||||
mem.steal();
|
||||
mem.reset(new_addr, to, scoped_memory::MALLOC_ALLOCATED);
|
||||
}
|
||||
return;
|
||||
default:
|
||||
UTIL_THROW(Exception, "HugeRealloc called with type " << mem.source());
|
||||
}
|
||||
}
|
||||
|
||||
void MapRead(LoadMethod method, int fd, uint64_t offset, std::size_t size, scoped_memory &out) {
|
||||
switch (method) {
|
||||
@ -151,33 +308,17 @@ void MapRead(LoadMethod method, int fd, uint64_t offset, std::size_t size, scope
|
||||
case POPULATE_OR_READ:
|
||||
#endif
|
||||
case READ:
|
||||
out.reset(MallocOrThrow(size), size, scoped_memory::MALLOC_ALLOCATED);
|
||||
HugeMalloc(size, false, out);
|
||||
SeekOrThrow(fd, offset);
|
||||
ReadOrThrow(fd, out.get(), size);
|
||||
break;
|
||||
case PARALLEL_READ:
|
||||
out.reset(MallocOrThrow(size), size, scoped_memory::MALLOC_ALLOCATED);
|
||||
HugeMalloc(size, false, out);
|
||||
ParallelRead(fd, out.get(), size, offset);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Allocates zeroed memory in to.
|
||||
void MapAnonymous(std::size_t size, util::scoped_memory &to) {
|
||||
to.reset();
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
to.reset(calloc(1, size), size, scoped_memory::MALLOC_ALLOCATED);
|
||||
#else
|
||||
to.reset(MapOrThrow(size, true,
|
||||
# if defined(MAP_ANONYMOUS)
|
||||
MAP_ANONYMOUS | MAP_PRIVATE // Linux
|
||||
# else
|
||||
MAP_ANON | MAP_PRIVATE // BSD
|
||||
# endif
|
||||
, false, -1, 0), size, scoped_memory::MMAP_ALLOCATED);
|
||||
#endif
|
||||
}
|
||||
|
||||
void *MapZeroedWrite(int fd, std::size_t size) {
|
||||
ResizeOrThrow(fd, 0);
|
||||
ResizeOrThrow(fd, size);
|
||||
|
66
util/mmap.hh
66
util/mmap.hh
@ -12,7 +12,7 @@ namespace util {
|
||||
|
||||
class scoped_fd;
|
||||
|
||||
long SizePage();
|
||||
std::size_t SizePage();
|
||||
|
||||
// (void*)-1 is MAP_FAILED; this is done to avoid including the mmap header here.
|
||||
class scoped_mmap {
|
||||
@ -37,6 +37,13 @@ class scoped_mmap {
|
||||
reset((void*)-1, 0);
|
||||
}
|
||||
|
||||
void *steal() {
|
||||
void *ret = data_;
|
||||
data_ = (void*)-1;
|
||||
size_ = 0;
|
||||
return ret;
|
||||
}
|
||||
|
||||
private:
|
||||
void *data_;
|
||||
std::size_t size_;
|
||||
@ -51,13 +58,21 @@ class scoped_mmap {
|
||||
*/
|
||||
class scoped_memory {
|
||||
public:
|
||||
typedef enum {MMAP_ALLOCATED, ARRAY_ALLOCATED, MALLOC_ALLOCATED, NONE_ALLOCATED} Alloc;
|
||||
typedef enum {
|
||||
MMAP_ROUND_UP_ALLOCATED, // The size was rounded up to a multiple of page size. Do the same before munmap.
|
||||
MMAP_ALLOCATED, // munmap
|
||||
MALLOC_ALLOCATED, // free
|
||||
NONE_ALLOCATED // nothing here!
|
||||
} Alloc;
|
||||
|
||||
scoped_memory(void *data, std::size_t size, Alloc source)
|
||||
: data_(data), size_(size), source_(source) {}
|
||||
|
||||
scoped_memory() : data_(NULL), size_(0), source_(NONE_ALLOCATED) {}
|
||||
|
||||
// Calls HugeMalloc
|
||||
scoped_memory(std::size_t to, bool zero_new);
|
||||
|
||||
~scoped_memory() { reset(); }
|
||||
|
||||
void *get() const { return data_; }
|
||||
@ -71,9 +86,13 @@ class scoped_memory {
|
||||
|
||||
void reset(void *data, std::size_t size, Alloc from);
|
||||
|
||||
// realloc allows the current data to escape hence the need for this call
|
||||
// If realloc fails, destroys the original too and get() returns NULL.
|
||||
void call_realloc(std::size_t to);
|
||||
void *steal() {
|
||||
void *ret = data_;
|
||||
data_ = NULL;
|
||||
size_ = 0;
|
||||
source_ = NONE_ALLOCATED;
|
||||
return ret;
|
||||
}
|
||||
|
||||
private:
|
||||
void *data_;
|
||||
@ -85,6 +104,30 @@ class scoped_memory {
|
||||
scoped_memory &operator=(const scoped_memory &);
|
||||
};
|
||||
|
||||
extern const int kFileFlags;
|
||||
|
||||
// Cross-platform, error-checking wrapper for mmap().
|
||||
void *MapOrThrow(std::size_t size, bool for_write, int flags, bool prefault, int fd, uint64_t offset = 0);
|
||||
|
||||
// msync wrapper
|
||||
void SyncOrThrow(void *start, size_t length);
|
||||
|
||||
// Cross-platform, error-checking wrapper for munmap().
|
||||
void UnmapOrThrow(void *start, size_t length);
|
||||
|
||||
// Allocate memory, promising that all/vast majority of it will be used. Tries
|
||||
// hard to use huge pages on Linux.
|
||||
// If you want zeroed memory, pass zeroed = true.
|
||||
void HugeMalloc(std::size_t size, bool zeroed, scoped_memory &to);
|
||||
|
||||
// Reallocates memory ala realloc but with option to zero the new memory.
|
||||
// On Linux, the memory can come from anonymous mmap or malloc/calloc.
|
||||
// On non-Linux, only malloc/calloc is supported.
|
||||
//
|
||||
// To summarize, any memory from HugeMalloc or HugeRealloc can be resized with
|
||||
// this.
|
||||
void HugeRealloc(std::size_t size, bool new_zeroed, scoped_memory &mem);
|
||||
|
||||
typedef enum {
|
||||
// mmap with no prepopulate
|
||||
LAZY,
|
||||
@ -98,25 +141,12 @@ typedef enum {
|
||||
PARALLEL_READ,
|
||||
} LoadMethod;
|
||||
|
||||
extern const int kFileFlags;
|
||||
|
||||
// Cross-platform, error-checking wrapper for mmap().
|
||||
void *MapOrThrow(std::size_t size, bool for_write, int flags, bool prefault, int fd, uint64_t offset = 0);
|
||||
|
||||
// Cross-platform, error-checking wrapper for munmap().
|
||||
void UnmapOrThrow(void *start, size_t length);
|
||||
|
||||
void MapRead(LoadMethod method, int fd, uint64_t offset, std::size_t size, scoped_memory &out);
|
||||
|
||||
void MapAnonymous(std::size_t size, scoped_memory &to);
|
||||
|
||||
// Open file name with mmap of size bytes, all of which are initially zero.
|
||||
void *MapZeroedWrite(int fd, std::size_t size);
|
||||
void *MapZeroedWrite(const char *name, std::size_t size, scoped_fd &file);
|
||||
|
||||
// msync wrapper
|
||||
void SyncOrThrow(void *start, size_t length);
|
||||
|
||||
// Forward rolling memory map with no overlap.
|
||||
class Rolling {
|
||||
public:
|
||||
|
@ -4,6 +4,8 @@
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include <algorithm>
|
||||
|
||||
namespace util {
|
||||
|
||||
Pool::Pool() {
|
||||
|
@ -2,7 +2,7 @@
|
||||
#define UTIL_PROBING_HASH_TABLE_H
|
||||
|
||||
#include "util/exception.hh"
|
||||
#include "util/scoped.hh"
|
||||
#include "util/mmap.hh"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstddef>
|
||||
@ -336,9 +336,11 @@ template <class EntryT, class HashT, class EqualT = std::equal_to<typename Entry
|
||||
typedef EqualT Equal;
|
||||
|
||||
AutoProbing(std::size_t initial_size = 5, const Key &invalid = Key(), const Hash &hash_func = Hash(), const Equal &equal_func = Equal()) :
|
||||
allocated_(Backend::Size(initial_size, 1.5)), mem_(util::MallocOrThrow(allocated_)), backend_(mem_.get(), allocated_, invalid, hash_func, equal_func) {
|
||||
threshold_ = initial_size * 1.2;
|
||||
Clear();
|
||||
allocated_(Backend::Size(initial_size, 1.2)), mem_(allocated_, KeyIsRawZero(invalid)), backend_(mem_.get(), allocated_, invalid, hash_func, equal_func) {
|
||||
threshold_ = std::min<std::size_t>(backend_.buckets_ - 1, backend_.buckets_ * 0.9);
|
||||
if (!KeyIsRawZero(invalid)) {
|
||||
Clear();
|
||||
}
|
||||
}
|
||||
|
||||
// Assumes that the key is unique. Multiple insertions won't cause a failure, just inconsistent lookup.
|
||||
@ -379,16 +381,23 @@ template <class EntryT, class HashT, class EqualT = std::equal_to<typename Entry
|
||||
|
||||
private:
|
||||
void DoubleIfNeeded() {
|
||||
if (Size() < threshold_)
|
||||
if (UTIL_LIKELY(Size() < threshold_))
|
||||
return;
|
||||
mem_.call_realloc(backend_.DoubleTo());
|
||||
HugeRealloc(backend_.DoubleTo(), KeyIsRawZero(backend_.invalid_), mem_);
|
||||
allocated_ = backend_.DoubleTo();
|
||||
backend_.Double(mem_.get());
|
||||
threshold_ *= 2;
|
||||
backend_.Double(mem_.get(), !KeyIsRawZero(backend_.invalid_));
|
||||
threshold_ = std::min<std::size_t>(backend_.buckets_ - 1, backend_.buckets_ * 0.9);
|
||||
}
|
||||
|
||||
bool KeyIsRawZero(const Key &key) {
|
||||
for (const uint8_t *i = reinterpret_cast<const uint8_t*>(&key); i < reinterpret_cast<const uint8_t*>(&key) + sizeof(Key); ++i) {
|
||||
if (*i) return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
std::size_t allocated_;
|
||||
util::scoped_malloc mem_;
|
||||
util::scoped_memory mem_;
|
||||
Backend backend_;
|
||||
std::size_t threshold_;
|
||||
};
|
||||
|
@ -1,6 +1,6 @@
|
||||
#include "util/file.hh"
|
||||
#include "util/probing_hash_table.hh"
|
||||
#include "util/scoped.hh"
|
||||
#include "util/mmap.hh"
|
||||
#include "util/usage.hh"
|
||||
|
||||
#include <iostream>
|
||||
@ -46,11 +46,12 @@ struct PrefetchEntry {
|
||||
const Entry *pointer;
|
||||
};
|
||||
|
||||
const std::size_t kPrefetchSize = 4;
|
||||
template <class Table> class PrefetchQueue {
|
||||
template <class TableT, unsigned PrefetchSize> class PrefetchQueue {
|
||||
public:
|
||||
typedef TableT Table;
|
||||
|
||||
explicit PrefetchQueue(Table &table) : table_(table), cur_(0), twiddle_(false) {
|
||||
for (PrefetchEntry *i = entries_; i != entries_ + kPrefetchSize; ++i)
|
||||
for (PrefetchEntry *i = entries_; i != entries_ + PrefetchSize; ++i)
|
||||
i->pointer = NULL;
|
||||
}
|
||||
|
||||
@ -66,7 +67,7 @@ template <class Table> class PrefetchQueue {
|
||||
|
||||
bool Drain() {
|
||||
if (Cur().pointer) {
|
||||
for (PrefetchEntry *i = &Cur(); i < entries_ + kPrefetchSize; ++i) {
|
||||
for (PrefetchEntry *i = &Cur(); i < entries_ + PrefetchSize; ++i) {
|
||||
twiddle_ ^= table_.FindFromIdeal(i->key, i->pointer);
|
||||
}
|
||||
}
|
||||
@ -80,11 +81,11 @@ template <class Table> class PrefetchQueue {
|
||||
PrefetchEntry &Cur() { return entries_[cur_]; }
|
||||
void Next() {
|
||||
++cur_;
|
||||
cur_ = cur_ % kPrefetchSize;
|
||||
cur_ = cur_ % PrefetchSize;
|
||||
}
|
||||
|
||||
Table &table_;
|
||||
PrefetchEntry entries_[kPrefetchSize];
|
||||
PrefetchEntry entries_[PrefetchSize];
|
||||
std::size_t cur_;
|
||||
|
||||
bool twiddle_;
|
||||
@ -93,12 +94,23 @@ template <class Table> class PrefetchQueue {
|
||||
void operator=(const PrefetchQueue&);
|
||||
};
|
||||
|
||||
/*template <class Table> class Immediate {
|
||||
template <class TableT> class Immediate {
|
||||
public:
|
||||
typedef TableT Table;
|
||||
|
||||
explicit Immediate(Table &table) : table_(table), twiddle_(false) {}
|
||||
|
||||
void Add(uint64_t key) {
|
||||
typename Table::ConstIterator it;
|
||||
twiddle_ ^= table_.Find(key, it);
|
||||
}
|
||||
|
||||
bool Drain() const { return twiddle_; }
|
||||
|
||||
private:
|
||||
Table &table_;
|
||||
};*/
|
||||
bool twiddle_;
|
||||
};
|
||||
|
||||
std::size_t Size(uint64_t entries, float multiplier = 1.5) {
|
||||
typedef util::ProbingHashTable<Entry, util::IdentityHash, std::equal_to<Entry::Key>, Power2Mod> Table;
|
||||
@ -106,39 +118,54 @@ std::size_t Size(uint64_t entries, float multiplier = 1.5) {
|
||||
return Power2Mod::RoundBuckets(Table::Size(entries, multiplier) / sizeof(Entry)) * sizeof(Entry);
|
||||
}
|
||||
|
||||
template <class Mod> bool Test(URandom &rn, uint64_t entries, const uint64_t *const queries_begin, const uint64_t *const queries_end, float multiplier = 1.5) {
|
||||
typedef util::ProbingHashTable<Entry, util::IdentityHash, std::equal_to<Entry::Key>, Mod> Table;
|
||||
template <class Queue> bool Test(URandom &rn, uint64_t entries, const uint64_t *const queries_begin, const uint64_t *const queries_end, bool ordinary_malloc, float multiplier = 1.5) {
|
||||
std::size_t size = Size(entries, multiplier);
|
||||
scoped_malloc backing(util::CallocOrThrow(size));
|
||||
Table table(backing.get(), size);
|
||||
scoped_memory backing;
|
||||
if (ordinary_malloc) {
|
||||
backing.reset(util::CallocOrThrow(size), size, scoped_memory::MALLOC_ALLOCATED);
|
||||
} else {
|
||||
util::HugeMalloc(size, true, backing);
|
||||
}
|
||||
typename Queue::Table table(backing.get(), size);
|
||||
|
||||
double start = UserTime();
|
||||
double start = CPUTime();
|
||||
for (uint64_t i = 0; i < entries; ++i) {
|
||||
Entry entry;
|
||||
entry.key = rn.Get();
|
||||
table.Insert(entry);
|
||||
}
|
||||
double inserted = UserTime() - start;
|
||||
double before_lookup = UserTime();
|
||||
PrefetchQueue<Table> queue(table);
|
||||
double inserted = CPUTime() - start;
|
||||
double before_lookup = CPUTime();
|
||||
Queue queue(table);
|
||||
for (const uint64_t *i = queries_begin; i != queries_end; ++i) {
|
||||
queue.Add(*i);
|
||||
/* typename Table::ConstIterator it;
|
||||
meaningless ^= table.Find(*i, it);*/
|
||||
}
|
||||
bool meaningless = queue.Drain();
|
||||
std::cout << entries << ' ' << size << ' ' << (inserted / static_cast<double>(entries)) << ' ' << (UserTime() - before_lookup) / static_cast<double>(queries_end - queries_begin) << '\n';
|
||||
std::cout << ' ' << (inserted / static_cast<double>(entries)) << ' ' << (CPUTime() - before_lookup) / static_cast<double>(queries_end - queries_begin) << std::flush;
|
||||
return meaningless;
|
||||
}
|
||||
|
||||
template <class Mod> bool TestRun(uint64_t lookups = 20000000, float multiplier = 1.5) {
|
||||
bool TestRun(uint64_t lookups = 20000000, float multiplier = 1.5) {
|
||||
URandom rn;
|
||||
util::scoped_malloc queries(util::CallocOrThrow(lookups * sizeof(uint64_t)));
|
||||
util::scoped_memory queries;
|
||||
HugeMalloc(lookups * sizeof(uint64_t), true, queries);
|
||||
rn.Batch(static_cast<uint64_t*>(queries.get()), static_cast<uint64_t*>(queries.get()) + lookups);
|
||||
uint64_t physical_mem_limit = util::GuessPhysicalMemory() / 2;
|
||||
bool meaningless = true;
|
||||
for (uint64_t i = 4; Size(i / multiplier) < physical_mem_limit; i *= 4) {
|
||||
meaningless ^= util::Test<Mod>(rn, i / multiplier, static_cast<const uint64_t*>(queries.get()), static_cast<const uint64_t*>(queries.get()) + lookups, multiplier);
|
||||
std::cout << static_cast<std::size_t>(i / multiplier) << ' ' << Size(i / multiplier);
|
||||
typedef util::ProbingHashTable<Entry, util::IdentityHash, std::equal_to<Entry::Key>, Power2Mod> Table;
|
||||
typedef util::ProbingHashTable<Entry, util::IdentityHash, std::equal_to<Entry::Key>, DivMod> TableDiv;
|
||||
const uint64_t *const queries_begin = static_cast<const uint64_t*>(queries.get());
|
||||
meaningless ^= util::Test<Immediate<TableDiv> >(rn, i / multiplier, queries_begin, queries_begin + lookups, true, multiplier);
|
||||
meaningless ^= util::Test<Immediate<Table> >(rn, i / multiplier, queries_begin, queries_begin + lookups, true, multiplier);
|
||||
meaningless ^= util::Test<PrefetchQueue<Table, 4> >(rn, i / multiplier, queries_begin, queries_begin + lookups, true, multiplier);
|
||||
meaningless ^= util::Test<Immediate<Table> >(rn, i / multiplier, queries_begin, queries_begin + lookups, false, multiplier);
|
||||
meaningless ^= util::Test<PrefetchQueue<Table, 2> >(rn, i / multiplier, queries_begin, queries_begin + lookups, false, multiplier);
|
||||
meaningless ^= util::Test<PrefetchQueue<Table, 4> >(rn, i / multiplier, queries_begin, queries_begin + lookups, false, multiplier);
|
||||
meaningless ^= util::Test<PrefetchQueue<Table, 8> >(rn, i / multiplier, queries_begin, queries_begin + lookups, false, multiplier);
|
||||
meaningless ^= util::Test<PrefetchQueue<Table, 16> >(rn, i / multiplier, queries_begin, queries_begin + lookups, false, multiplier);
|
||||
std::cout << std::endl;
|
||||
}
|
||||
return meaningless;
|
||||
}
|
||||
@ -148,9 +175,7 @@ template <class Mod> bool TestRun(uint64_t lookups = 20000000, float multiplier
|
||||
|
||||
int main() {
|
||||
bool meaningless = false;
|
||||
std::cout << "#Integer division\n";
|
||||
meaningless ^= util::TestRun<util::DivMod>();
|
||||
std::cout << "#Masking\n";
|
||||
meaningless ^= util::TestRun<util::Power2Mod>();
|
||||
std::cout << "#CPU time\n";
|
||||
meaningless ^= util::TestRun();
|
||||
std::cerr << "Meaningless: " << meaningless << '\n';
|
||||
}
|
||||
|
@ -27,7 +27,7 @@ void *MallocOrThrow(std::size_t requested) {
|
||||
}
|
||||
|
||||
void *CallocOrThrow(std::size_t requested) {
|
||||
return InspectAddr(std::calloc(1, requested), requested, "calloc");
|
||||
return InspectAddr(std::calloc(requested, 1), requested, "calloc");
|
||||
}
|
||||
|
||||
void scoped_malloc::call_realloc(std::size_t requested) {
|
||||
|
@ -1,6 +1,5 @@
|
||||
#include "util/stream/rewindable_stream.hh"
|
||||
#include "util/pcqueue.hh"
|
||||
#include <iostream>
|
||||
|
||||
#include <iostream>
|
||||
|
||||
|
44
util/string_stream.hh
Normal file
44
util/string_stream.hh
Normal file
@ -0,0 +1,44 @@
|
||||
#ifndef UTIL_STRING_STREAM_H
|
||||
#define UTIL_STRING_STREAM_H
|
||||
|
||||
#include "util/fake_ostream.hh"
|
||||
|
||||
#include <cassert>
|
||||
#include <string>
|
||||
|
||||
namespace util {
|
||||
|
||||
// Output stream that appends everything written to it onto a caller-owned
// std::string.  The string is held by reference, so it must outlive the stream.
// Formatting operators come from the FakeOStream<StringStream> base, which is
// granted access to Ensure/AdvanceTo via the friend declaration below.
class StringStream : public FakeOStream<StringStream> {
  public:
    // Semantics: appends to string.  Remember to clear first!
    explicit StringStream(std::string &out)
      : out_(out) {}

    // Nothing is buffered outside the string, so there is nothing to flush.
    StringStream &flush() { return *this; }

    // Appends length raw bytes starting at data to the string.
    StringStream &write(const void *data, std::size_t length) {
      out_.append(static_cast<const char*>(data), length);
      return *this;
    }

  protected:
    friend class FakeOStream<StringStream>;

    // Grows the string by amount bytes and returns a pointer to the first of
    // the newly added bytes.
    // NOTE(review): presumably the base class writes into this space and then
    // calls AdvanceTo with the end of what it actually used -- confirm in
    // util/fake_ostream.hh.
    char *Ensure(std::size_t amount) {
      std::size_t current = out_.size();
      out_.resize(out_.size() + amount);
      return &out_[current];
    }

    // Truncates the string so that it ends immediately before to.  to must
    // point into the string's current buffer (inclusive of one past the end).
    void AdvanceTo(char *to) {
      // Use data() rather than &*begin() / &*end(): dereferencing the
      // past-the-end iterator (or begin() of an empty string) is undefined
      // behaviour and aborts under checked-iterator builds.
      const char *const base = out_.data();
      assert(to <= base + out_.size());
      assert(to >= base);
      out_.resize(to - base);
    }

  private:
    std::string &out_;
};
|
||||
|
||||
} // namespace
|
||||
|
||||
#endif // UTIL_STRING_STREAM_H
|
57
util/string_stream_test.cc
Normal file
57
util/string_stream_test.cc
Normal file
@ -0,0 +1,57 @@
|
||||
#define BOOST_LEXICAL_CAST_ASSUME_C_LOCALE
|
||||
#define BOOST_TEST_MODULE FakeOStreamTest
|
||||
|
||||
#include "util/string_stream.hh"
|
||||
#include <boost/test/unit_test.hpp>
|
||||
#include <boost/lexical_cast.hpp>
|
||||
|
||||
#include <cstddef>
|
||||
#include <limits>
|
||||
|
||||
namespace util { namespace {
|
||||
|
||||
template <class T> void TestEqual(const T value) {
|
||||
std::string str;
|
||||
StringStream(str) << value;
|
||||
BOOST_CHECK_EQUAL(boost::lexical_cast<std::string>(value), str);
|
||||
}
|
||||
|
||||
// Runs TestEqual on the boundary values of T: the largest and smallest
// representable values, then 0, -1 (converted to T) and 1.
template <class T> void TestCorners() {
  typedef std::numeric_limits<T> Limits;
  const T corners[] = {
    Limits::max(),
    Limits::min(),
    static_cast<T>(0),
    static_cast<T>(-1),
    static_cast<T>(1)
  };
  const std::size_t count = sizeof(corners) / sizeof(corners[0]);
  for (std::size_t i = 0; i < count; ++i) {
    TestEqual(corners[i]);
  }
}
|
||||
|
||||
// Checks the corner values of every built-in integer type, spelling out the
// plain, signed and unsigned variants of each width, plus std::size_t.
BOOST_AUTO_TEST_CASE(Integer) {
  // Character-sized types.
  TestCorners<char>();
  TestCorners<signed char>();
  TestCorners<unsigned char>();

  // short
  TestCorners<short>();
  TestCorners<signed short>();
  TestCorners<unsigned short>();

  // int
  TestCorners<int>();
  TestCorners<unsigned int>();
  TestCorners<signed int>();

  // long
  TestCorners<long>();
  TestCorners<unsigned long>();
  TestCorners<signed long>();

  // long long
  TestCorners<long long>();
  TestCorners<unsigned long long>();
  TestCorners<signed long long>();

  // The type used for sizes throughout the library.
  TestCorners<std::size_t>();
}
|
||||
|
||||
enum TinyEnum { EnumValue };
|
||||
|
||||
BOOST_AUTO_TEST_CASE(EnumCase) {
|
||||
TestEqual(EnumValue);
|
||||
}
|
||||
|
||||
}} // namespaces
|
@ -135,14 +135,26 @@ double WallTime() {
|
||||
return Subtract(GetWall(), kRecordStart.Started());
|
||||
}
|
||||
|
||||
double UserTime() {
|
||||
#if !defined(_WIN32) && !defined(_WIN64)
|
||||
double CPUTime() {
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
return 0.0;
|
||||
#else
|
||||
struct rusage usage;
|
||||
if (getrusage(RUSAGE_SELF, &usage))
|
||||
return 0.0;
|
||||
return DoubleSec(usage.ru_utime);
|
||||
return DoubleSec(usage.ru_utime) + DoubleSec(usage.ru_stime);
|
||||
#endif
|
||||
}
|
||||
|
||||
// Peak resident set size of this process, in bytes.  Returns 0 on Windows
// and whenever getrusage reports an error.
uint64_t RSSMax() {
#if defined(_WIN32) || defined(_WIN64)
  return 0;
#else
  struct rusage usage;
  if (getrusage(RUSAGE_SELF, &usage))
    return 0;
#if defined(__APPLE__)
  // macOS already reports ru_maxrss in bytes.
  return static_cast<uint64_t>(usage.ru_maxrss);
#else
  // Linux and the BSDs report ru_maxrss in kilobytes.
  return static_cast<uint64_t>(usage.ru_maxrss) * 1024;
#endif
#endif
}
|
||||
|
||||
void PrintUsage(std::ostream &out) {
|
||||
@ -274,6 +286,7 @@ template <class Num> uint64_t ParseNum(const std::string &arg) {
|
||||
return static_cast<uint64_t>(static_cast<double>(value) * static_cast<double>(mem) / 100.0);
|
||||
}
|
||||
|
||||
// Normalise a lowercase "k" suffix to "K" so the lookup in the units table
// succeeds.  This must be an assignment; a comparison here has no effect.
if (after == "k") after = "K";
|
||||
std::string units("bKMGTPEZY");
|
||||
std::string::size_type index = units.find(after[0]);
|
||||
UTIL_THROW_IF_ARG(index == std::string::npos, SizeParseError, (arg), "the allowed suffixes are " << units << "%.");
|
||||
|
@ -9,7 +9,11 @@ namespace util {
|
||||
// Time in seconds since process started. Zero on unsupported platforms.
|
||||
double WallTime();
|
||||
|
||||
double UserTime();
|
||||
// User + system time.
|
||||
double CPUTime();
|
||||
|
||||
// Resident usage in bytes.
|
||||
uint64_t RSSMax();
|
||||
|
||||
void PrintUsage(std::ostream &to);
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user