Support gzipped ARPA files. Progress bar tweak. Test fixes. Holding off on the big change for now.

git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3643 1f5c12ca-751b-0410-a591-d2e778427230
This commit is contained in:
heafield 2010-10-23 05:21:10 +00:00
parent ecf4b0d368
commit 8d0d44f5cd
8 changed files with 208 additions and 62 deletions

View File

@ -19,9 +19,9 @@ libkenlm_a_SOURCES = \
query_SOURCES = lm/ngram_query.cc
query_DEPENDENCIES = libkenlm.a
query_LDADD = -L$(top_srcdir)/kenlm -lkenlm
query_LDADD = -L$(top_srcdir)/kenlm -lkenlm -lz
build_binary_SOURCES = lm/ngram_build_binary.cc
build_binary_DEPENDENCIES = libkenlm.a
build_binary_LDADD = -L$(top_srcdir)/kenlm -lkenlm
build_binary_LDADD = -L$(top_srcdir)/kenlm -lkenlm -lz

View File

@ -195,7 +195,7 @@ template <class Search, class VocabularyT> GenericModel<Search, VocabularyT>::Ge
} else {
if (config.probing_multiplier <= 1.0) UTIL_THROW(FormatLoadException, "probing multiplier must be > 1.0");
util::FilePiece f(file, mapped_file_.release(), config.messages);
util::FilePiece f(mapped_file_.release(), file, config.messages);
ReadARPACounts(f, counts);
size_t memory_size = Size(counts, config);
char *start;

View File

@ -68,7 +68,7 @@ template <class M> void Stateless(const M &model) {
for (unsigned int i = 0; i < sizeof(words) / sizeof(const char*); ++i) {
indices[i] = model.GetVocabulary().Index(words[i]);
}
FullScoreReturn ret;
HieuShouldRefactorMoses ret;
StatelessTest(indices, indices + 2, 2, -0.484652);
StatelessTest(indices, indices + 3, 3, -0.348837);
StatelessTest(indices, indices + 4, 4, -0.0155266);

View File

@ -13,10 +13,7 @@ ErsatzProgress::ErsatzProgress() : current_(0), next_(std::numeric_limits<std::s
ErsatzProgress::~ErsatzProgress() {
if (!out_) return;
for (; stones_written_ < kWidth; ++stones_written_) {
(*out_) << '*';
}
*out_ << '\n';
Finished();
}
ErsatzProgress::ErsatzProgress(std::ostream *to, const std::string &message, std::size_t complete)
@ -36,6 +33,9 @@ void ErsatzProgress::Milestone() {
for (; stones_written_ < stone; ++stones_written_) {
(*out_) << '*';
}
if (stone == kWidth) {
(*out_) << std::endl;
}
if (current_ >= complete_) {
next_ = std::numeric_limits<std::size_t>::max();

View File

@ -19,7 +19,7 @@ class ErsatzProgress {
~ErsatzProgress();
ErsatzProgress &operator++() {
if (++current_ == next_) Milestone();
if (++current_ >= next_) Milestone();
return *this;
}
@ -33,6 +33,10 @@ class ErsatzProgress {
Milestone();
}
// Marks the progress as complete by handing complete_ to Set().
void Finished() {
Set(complete_);
}
private:
void Milestone();

View File

@ -2,19 +2,23 @@
#include "util/exception.hh"
#include <iostream>
#include <string>
#include <limits>
#include <assert.h>
#include <cstdlib>
#include <ctype.h>
#include <err.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#ifdef USE_ZLIB
#include <zlib.h>
#endif
namespace util {
EndOfFileException::EndOfFileException() throw() {
@ -26,6 +30,13 @@ ParseNumberException::ParseNumberException(StringPiece value) throw() {
*this << "Could not parse \"" << value << "\" into a float";
}
// Builds an exception whose message is zlib's description of the most recent
// error on `file`, followed by " from zlib".
//
// file: the zlib handle (a gzFile) carried as an untyped pointer.
//
// When USE_ZLIB is not defined the message is left empty.
GZException::GZException(void *file) {
#ifdef USE_ZLIB
  // gzerror() requires an output slot for the numeric code; only the text is used here.
  int num;
  // Newer zlib releases declare gzFile as a pointer to a struct, so the untyped
  // pointer has to be converted explicitly; the cast is also valid where gzFile
  // is still a plain void pointer.
  const char *message = gzerror(static_cast<gzFile>(file), &num);
  // Streaming a null const char* is undefined behavior, so substitute a placeholder.
  *this << (message ? message : "unknown error") << " from zlib";
#endif // USE_ZLIB
}
int OpenReadOrThrow(const char *name) {
int ret = open(name, O_RDONLY);
if (ret == -1) UTIL_THROW(ErrnoException, "in open (" << name << ") for reading");
@ -38,42 +49,72 @@ off_t SizeFile(int fd) {
return sb.st_size;
}
FilePiece::FilePiece(const char *name, std::ostream *show_progress, off_t min_buffer) :
FilePiece::FilePiece(const char *name, std::ostream *show_progress, off_t min_buffer) throw (GZException) :
file_(OpenReadOrThrow(name)), total_size_(SizeFile(file_.get())), page_(sysconf(_SC_PAGE_SIZE)),
progress_(total_size_ == kBadSize ? NULL : show_progress, std::string("Reading ") + name, total_size_) {
Initialize(name, show_progress, min_buffer);
}
FilePiece::FilePiece(const char *name, int fd, std::ostream *show_progress, off_t min_buffer) :
FilePiece::FilePiece(int fd, const char *name, std::ostream *show_progress, off_t min_buffer) throw (GZException) :
file_(fd), total_size_(SizeFile(file_.get())), page_(sysconf(_SC_PAGE_SIZE)),
progress_(total_size_ == kBadSize ? NULL : show_progress, std::string("Reading ") + name, total_size_) {
Initialize(name, show_progress, min_buffer);
}
void FilePiece::Initialize(const char *name, std::ostream *show_progress, off_t min_buffer) {
if (total_size_ == kBadSize) {
fallback_to_read_ = true;
if (show_progress)
*show_progress << "File " << name << " isn't normal. Using slower read() instead of mmap(). No progress bar." << std::endl;
} else {
fallback_to_read_ = false;
// Closes the zlib handle if one was opened. A failed close is treated as
// fatal: the process prints a diagnostic and exits with status 1.
//
// Without USE_ZLIB this destructor does nothing itself.
FilePiece::~FilePiece() {
#ifdef USE_ZLIB
  if (gz_file_) {
    // gz_file_ is stored as void*; gzclose() takes a gzFile, which is a distinct
    // pointer type in newer zlib releases, hence the explicit cast.
    if (Z_OK != gzclose(static_cast<gzFile>(gz_file_))) {
      // errx() does not return, so nothing may follow it on this path.
      errx(1, "could not close file %s using zlib", file_name_.c_str());
    }
  }
#endif
}
// Common tail of both constructors.
//
// Resets the buffer bookkeeping, picks the backend (mmap for files whose size
// is known, read() otherwise), loads the first chunk via Shift(), and finally
// switches to the read path if that chunk starts with the gzip magic bytes.
//
// name:          file name, remembered for diagnostics.
// show_progress: optional stream for the "isn't normal" notice; may be NULL.
// min_buffer:    lower bound used to size the buffer.
//
// Throws GZException if the data looks gzipped but USE_ZLIB is not defined.
void FilePiece::Initialize(const char *name, std::ostream *show_progress, off_t min_buffer) throw (GZException) {
#ifdef USE_ZLIB
  gz_file_ = NULL;
#endif
  file_name_ = name;

  // Buffer size in whole pages, never fewer than two.
  default_map_size_ = page_ * std::max<off_t>((min_buffer / page_ + 1), 2);
  position_ = NULL;
  position_end_ = NULL;
  mapped_offset_ = 0;
  at_end_ = false;

  // TransitionToRead() asserts that this flag is still clear.
  fallback_to_read_ = false;
  if (total_size_ == kBadSize) {
    if (show_progress) {
      *show_progress << "File " << name << " isn't normal.  Using slower read() instead of mmap().  No progress bar." << std::endl;
    }
    TransitionToRead();
  }

  Shift();

  // gzip streams begin with the bytes 0x1f 0x8b.
  const bool starts_with_gzip_magic =
      (position_end_ - position_) > 2 &&
      *position_ == 0x1f &&
      static_cast<unsigned char>(*(position_ + 1)) == 0x8b;
  if (!starts_with_gzip_magic) return;

#ifndef USE_ZLIB
  UTIL_THROW(GZException, "Looks like a gzip file but support was not compiled in.");
#endif
  // Already on the read path (non-regular file): nothing left to switch.
  if (fallback_to_read_) return;

  // Clear the end-of-file flag before changing backends.
  at_end_ = false;
  TransitionToRead();
}
float FilePiece::ReadFloat() throw(EndOfFileException, ParseNumberException) {
float FilePiece::ReadFloat() throw(GZException, EndOfFileException, ParseNumberException) {
SkipSpaces();
while (last_space_ < position_) {
if (at_end_) {
// Hallucinate a null off the end of the file.
std::string buffer(position_, position_end_);
char *end;
float ret = strtof(buffer.c_str(), &end);
float ret = std::strtof(buffer.c_str(), &end);
if (buffer.c_str() == end) throw ParseNumberException(buffer);
position_ += end - buffer.c_str();
return ret;
@ -81,20 +122,20 @@ float FilePiece::ReadFloat() throw(EndOfFileException, ParseNumberException) {
Shift();
}
char *end;
float ret = strtof(position_, &end);
float ret = std::strtof(position_, &end);
if (end == position_) throw ParseNumberException(ReadDelimited());
position_ = end;
return ret;
}
void FilePiece::SkipSpaces() throw (EndOfFileException) {
// Advances position_ past any run of whitespace, refilling the buffer through
// Shift() whenever the current window is exhausted.
//
// Returns with position_ on the first non-space byte, or with
// position_ == position_end_ if Shift() produced no further data.
// Exceptions raised by Shift() propagate to the caller.
void FilePiece::SkipSpaces() throw (GZException, EndOfFileException) {
  for (; ; ++position_) {
    if (position_ == position_end_) {
      Shift();
      // Shift() can come back without new bytes (get() in the header guards the
      // same situation); there is nothing valid to inspect in that case.
      if (position_ == position_end_) return;
    }
    // isspace() is only defined for values representable as unsigned char (or
    // EOF); a plain char holding a byte >= 0x80 may be negative.
    if (!isspace(static_cast<unsigned char>(*position_))) return;
  }
}
const char *FilePiece::FindDelimiterOrEOF() throw (EndOfFileException) {
const char *FilePiece::FindDelimiterOrEOF() throw (GZException, EndOfFileException) {
for (const char *i = position_; i <= last_space_; ++i) {
if (isspace(*i)) return i;
}
@ -108,7 +149,7 @@ const char *FilePiece::FindDelimiterOrEOF() throw (EndOfFileException) {
return position_end_;
}
StringPiece FilePiece::ReadLine(char delim) throw (EndOfFileException) {
StringPiece FilePiece::ReadLine(char delim) throw (GZException, EndOfFileException) {
const char *start = position_;
do {
for (const char *i = start; i < position_end_; ++i) {
@ -127,14 +168,16 @@ StringPiece FilePiece::ReadLine(char delim) throw (EndOfFileException) {
return position_;
}
void FilePiece::Shift() throw(EndOfFileException) {
if (at_end_) throw EndOfFileException();
void FilePiece::Shift() throw(GZException, EndOfFileException) {
if (at_end_) {
progress_.Finished();
throw EndOfFileException();
}
off_t desired_begin = position_ - data_.begin() + mapped_offset_;
progress_.Set(desired_begin);
if (!fallback_to_read_) MMapShift(desired_begin);
// Notice an mmap failure might set the fallback.
if (fallback_to_read_) ReadShift(desired_begin);
if (fallback_to_read_) ReadShift();
for (last_space_ = position_end_ - 1; last_space_ >= position_; --last_space_) {
if (isspace(*last_space_)) break;
@ -163,28 +206,43 @@ void FilePiece::MMapShift(off_t desired_begin) throw() {
data_.reset();
data_.reset(mmap(NULL, mapped_size, PROT_READ, MAP_PRIVATE, *file_, mapped_offset), mapped_size, scoped_memory::MMAP_ALLOCATED);
if (data_.get() == MAP_FAILED) {
fallback_to_read_ = true;
if (desired_begin) {
if (((off_t)-1) == lseek(*file_, desired_begin, SEEK_SET)) UTIL_THROW(ErrnoException, "mmap failed even though it worked before. lseek failed too, so using read isn't an option either.");
}
// The mmap was scheduled to end the file, but now we're going to read it.
at_end_ = false;
TransitionToRead();
return;
}
mapped_offset_ = mapped_offset;
position_ = data_.begin() + ignore;
position_end_ = data_.begin() + mapped_size;
progress_.Set(desired_begin);
}
void FilePiece::ReadShift(off_t desired_begin) throw() {
// Switches this FilePiece from the mmap backend to the buffered read backend.
//
// Must be called at most once (asserted). Drops whatever data_ currently
// holds, allocates a malloc'd buffer of default_map_size_ bytes, and leaves
// position_ / position_end_ describing an empty window at its start. With
// USE_ZLIB the descriptor is additionally wrapped in a zlib handle.
//
// Throws ErrnoException if malloc fails and GZException if gzdopen fails.
void FilePiece::TransitionToRead() throw (GZException) {
  assert(!fallback_to_read_);
  fallback_to_read_ = true;

  // Release the previous buffer before allocating the replacement.
  data_.reset();
  void *const fresh_buffer = malloc(default_map_size_);
  data_.reset(fresh_buffer, default_map_size_, scoped_memory::MALLOC_ALLOCATED);
  if (!fresh_buffer) {
    UTIL_THROW(ErrnoException, "malloc failed for " << default_map_size_);
  }

  // Nothing has been read into the new buffer yet.
  position_end_ = position_ = data_.begin();

#ifdef USE_ZLIB
  assert(!gz_file_);
  gz_file_ = gzdopen(file_.get(), "r");
  if (!gz_file_) {
    UTIL_THROW(GZException, "zlib failed to open " << file_name_);
  }
  // The zlib handle owns the descriptor from here on, so file_ must let go of
  // it; the raw fd should not be used for anything else.
  file_.release();
#endif
}
void FilePiece::ReadShift() throw(GZException, EndOfFileException) {
assert(fallback_to_read_);
if (data_.source() != scoped_memory::MALLOC_ALLOCATED) {
// First call.
data_.reset();
data_.reset(malloc(default_map_size_), default_map_size_, scoped_memory::MALLOC_ALLOCATED);
if (!data_.get()) UTIL_THROW(ErrnoException, "malloc failed for " << default_map_size_);
position_ = data_.begin();
position_end_ = position_;
}
// Bytes [data_.begin(), position_) have been consumed.
// Bytes [position_, position_end_) have been read into the buffer.
@ -215,9 +273,26 @@ void FilePiece::ReadShift(off_t desired_begin) throw() {
}
}
ssize_t read_return = read(file_.get(), static_cast<char*>(data_.get()) + already_read, default_map_size_ - already_read);
ssize_t read_return;
#ifdef USE_ZLIB
read_return = gzread(gz_file_, static_cast<char*>(data_.get()) + already_read, default_map_size_ - already_read);
if (read_return == -1) throw GZException(gz_file_);
if (total_size_ != kBadSize) {
z_off_t got_off = gztell(gz_file_);
if (got_off == -1) {
gzclearerr(gz_file_);
} else {
progress_.Set(got_off);
}
}
#else
read_return = read(file_.get(), static_cast<char*>(data_.get()) + already_read, default_map_size_ - already_read);
if (read_return == -1) UTIL_THROW(ErrnoException, "read failed");
if (read_return == 0) at_end_ = true;
progress_.Set(mapped_offset_);
#endif
if (read_return == 0) {
at_end_ = true;
}
position_end_ += read_return;
}

View File

@ -11,6 +11,8 @@
#include <cstddef>
#define USE_ZLIB
namespace util {
class EndOfFileException : public Exception {
@ -25,6 +27,13 @@ class ParseNumberException : public Exception {
~ParseNumberException() throw() {}
};
// Exception type used for zlib-related failures.
class GZException : public Exception {
  public:
    // Empty message; callers append their own text.
    GZException() throw() {}

    // Message is filled in from the zlib handle passed as `file`
    // (defined in the implementation file).
    explicit GZException(void *file);

    ~GZException() throw() {}
};
int OpenReadOrThrow(const char *name);
// Return value for SizeFile when it can't size properly.
@ -34,40 +43,42 @@ off_t SizeFile(int fd);
class FilePiece {
public:
// 32 MB default.
explicit FilePiece(const char *file, std::ostream *show_progress = NULL, off_t min_buffer = 33554432);
explicit FilePiece(const char *file, std::ostream *show_progress = NULL, off_t min_buffer = 33554432) throw(GZException);
// Takes ownership of fd. name is used for messages.
explicit FilePiece(const char *name, int fd, std::ostream *show_progress = NULL, off_t min_buffer = 33554432);
explicit FilePiece(int fd, const char *name, std::ostream *show_progress = NULL, off_t min_buffer = 33554432) throw(GZException);
~FilePiece();
char get() throw(EndOfFileException) {
if (position_ == position_end_) Shift();
char get() throw(GZException, EndOfFileException) {
if (position_ == position_end_) {
Shift();
if (at_end_) throw EndOfFileException();
}
return *(position_++);
}
// Memory backing the returned StringPiece may vanish on the next call.
// Leaves the delimiter, if any, to be returned by get().
StringPiece ReadDelimited() throw(EndOfFileException) {
StringPiece ReadDelimited() throw(GZException, EndOfFileException) {
SkipSpaces();
return Consume(FindDelimiterOrEOF());
}
// Unlike ReadDelimited, this includes leading spaces and consumes the delimiter.
// It is similar to getline in that way.
StringPiece ReadLine(char delim = '\n') throw(EndOfFileException);
StringPiece ReadLine(char delim = '\n') throw(GZException, EndOfFileException);
float ReadFloat() throw(EndOfFileException, ParseNumberException);
float ReadFloat() throw(GZException, EndOfFileException, ParseNumberException);
void SkipSpaces() throw (EndOfFileException);
void SkipSpaces() throw (GZException, EndOfFileException);
off_t Offset() const {
return position_ - data_.begin() + mapped_offset_;
}
// Only for testing.
void ForceFallbackToRead() {
fallback_to_read_ = true;
}
const std::string &FileName() const { return file_name_; }
private:
void Initialize(const char *name, std::ostream *show_progress, off_t min_buffer);
void Initialize(const char *name, std::ostream *show_progress, off_t min_buffer) throw(GZException);
StringPiece Consume(const char *to) {
StringPiece ret(position_, to - position_);
@ -75,12 +86,14 @@ class FilePiece {
return ret;
}
const char *FindDelimiterOrEOF() throw(EndOfFileException);
const char *FindDelimiterOrEOF() throw(EndOfFileException, GZException);
void Shift() throw (EndOfFileException);
void Shift() throw (EndOfFileException, GZException);
// Backends to Shift().
void MMapShift(off_t desired_begin) throw ();
void ReadShift(off_t desired_begin) throw ();
void TransitionToRead() throw (GZException);
void ReadShift() throw (GZException, EndOfFileException);
const char *position_, *last_space_, *position_end_;
@ -98,6 +111,12 @@ class FilePiece {
bool fallback_to_read_;
ErsatzProgress progress_;
std::string file_name_;
#ifdef USE_ZLIB
void *gz_file_;
#endif // USE_ZLIB
};
} // namespace util

View File

@ -1,15 +1,19 @@
#include "util/file_piece.hh"
#include "util/scoped.hh"
#define BOOST_TEST_MODULE FilePieceTest
#include <boost/test/unit_test.hpp>
#include <fstream>
#include <iostream>
#include <stdio.h>
namespace util {
namespace {
/* mmap implementation */
BOOST_AUTO_TEST_CASE(MMapLine) {
BOOST_AUTO_TEST_CASE(MMapReadLine) {
std::fstream ref("file_piece.cc", std::ios::in);
FilePiece test("file_piece.cc", NULL, 1);
std::string ref_line;
@ -20,13 +24,17 @@ BOOST_AUTO_TEST_CASE(MMapLine) {
BOOST_CHECK_EQUAL(ref_line, test_line);
}
}
BOOST_CHECK_THROW(test.get(), EndOfFileException);
}
/* read() implementation */
BOOST_AUTO_TEST_CASE(ReadLine) {
BOOST_AUTO_TEST_CASE(StreamReadLine) {
std::fstream ref("file_piece.cc", std::ios::in);
FilePiece test("file_piece.cc", NULL, 1);
test.ForceFallbackToRead();
scoped_FILE catter(popen("cat file_piece.cc", "r"));
BOOST_REQUIRE(catter.get());
FilePiece test(dup(fileno(catter.get())), "file_piece.cc", NULL, 1);
std::string ref_line;
while (getline(ref, ref_line)) {
StringPiece test_line(test.ReadLine());
@ -35,7 +43,47 @@ BOOST_AUTO_TEST_CASE(ReadLine) {
BOOST_CHECK_EQUAL(ref_line, test_line);
}
}
BOOST_CHECK_THROW(test.get(), EndOfFileException);
}
#ifdef USE_ZLIB
// gzip file
// Reads a gzip-compressed regular file through FilePiece and checks every line
// against the uncompressed original, then expects end-of-file from get().
BOOST_AUTO_TEST_CASE(PlainZipReadLine) {
  std::fstream reference("file_piece.cc", std::ios::in);

  // Produce the compressed fixture next to the source file.
  BOOST_REQUIRE_EQUAL(0, system("gzip <file_piece.cc >file_piece.cc.gz"));
  FilePiece compressed("file_piece.cc.gz", NULL, 1);

  for (std::string expected; getline(reference, expected); ) {
    StringPiece actual(compressed.ReadLine());
    // Original author's note: a bug report was submitted to ICU,
    // http://bugs.icu-project.org/trac/ticket/7924 -- presumably the reason
    // two empty values are never handed to the comparison.
    if (actual.empty() && expected.empty()) continue;
    BOOST_CHECK_EQUAL(expected, actual);
  }

  BOOST_CHECK_THROW(compressed.get(), EndOfFileException);
}
// gzip stream
// Same comparison as the plain gzip test, but the compressed bytes arrive
// through a pipe from a gzip child process rather than from a regular file.
BOOST_AUTO_TEST_CASE(StreamZipReadLine) {
  std::fstream reference("file_piece.cc", std::ios::in);

  scoped_FILE gzip_pipe(popen("gzip <file_piece.cc", "r"));
  BOOST_REQUIRE(gzip_pipe.get());

  // FilePiece takes ownership of the descriptor it is given, so it receives a
  // duplicate and the pipe's own descriptor stays with gzip_pipe.
  FilePiece streamed(dup(fileno(gzip_pipe.get())), "file_piece.cc", NULL, 1);

  for (std::string expected; getline(reference, expected); ) {
    StringPiece actual(streamed.ReadLine());
    // Original author's note: a bug report was submitted to ICU,
    // http://bugs.icu-project.org/trac/ticket/7924 -- presumably the reason
    // two empty values are never handed to the comparison.
    if (actual.empty() && expected.empty()) continue;
    BOOST_CHECK_EQUAL(expected, actual);
  }

  BOOST_CHECK_THROW(streamed.get(), EndOfFileException);
}
#endif
} // namespace
} // namespace util