#include "util/read_compressed.hh"

#include "util/file.hh"
#include "util/have.hh"
#include "util/scoped.hh"

// NOTE(review): the targets of the six standard-library includes below were
// missing from this file (bare `#include` directives).  They are reconstructed
// from the names the file uses (std::min, std::cerr, std::numeric_limits,
// assert, malloc/abort, memcpy/memset/memcmp) -- confirm against history.
#include <algorithm>
#include <iostream>
#include <limits>

#include <cassert>
#include <cstdlib>
#include <cstring>

#ifdef HAVE_ZLIB
#include <zlib.h>
#endif

#ifdef HAVE_BZLIB
#include <bzlib.h>
#endif

#ifdef HAVE_XZLIB
#include <lzma.h>
#endif

namespace util {

// Out-of-line definitions of the (empty) exception constructors/destructors.
CompressedException::CompressedException() throw() {}
CompressedException::~CompressedException() throw() {}

GZException::GZException() throw() {}
GZException::~GZException() throw() {}

BZException::BZException() throw() {}
BZException::~BZException() throw() {}

XZException::XZException() throw() {}
XZException::~XZException() throw() {}

/* Interface implemented by every concrete reader in this file.
 * ReadCompressed::Read forwards to internal_->Read(), passing itself as
 * `thunk` so that a reader can update the raw byte counter or swap itself out
 * for a different reader.  The protected helpers are the only way derived
 * classes reach ReadCompressed's private members.
 */
class ReadBase {
  public:
    virtual ~ReadBase() {}

    /* Read up to `amount` bytes into `to`.
     * Returns the number of bytes written to `to`.
     */
    virtual std::size_t Read(void *to, std::size_t amount, ReadCompressed &thunk) = 0;

  protected:
    /* Install `with` as thunk's reader.
     * NOTE(review): internal_.reset() presumably destroys the previous reader,
     * which is normally the caller itself; callers must not touch their own
     * members after calling this -- confirm against the header.
     */
    static void ReplaceThis(ReadBase *with, ReadCompressed &thunk) {
      thunk.internal_.reset(with);
    }

    /* Reader currently installed in thunk.
     * Static on purpose: it is called right after ReplaceThis(), when the
     * calling object may already be gone, so it must not go through `this`.
     */
    static ReadBase *Current(ReadCompressed &thunk) { return thunk.internal_.get(); }

    // Mutable access to thunk's count of raw bytes consumed from the source.
    static uint64_t &ReadCount(ReadCompressed &thunk) {
      return thunk.raw_amount_;
    }
};

namespace {

// Defined further down; declared here because StreamCompressed calls it when
// a compressed stream ends.
ReadBase *ReadFactory(int fd, uint64_t &raw_amount, const void *already_data, std::size_t already_size, bool require_compressed);

// Completed file that other classes can thunk to.
class Complete : public ReadBase { public: std::size_t Read(void *, std::size_t, ReadCompressed &) { return 0; } }; class Uncompressed : public ReadBase { public: explicit Uncompressed(int fd) : fd_(fd) {} std::size_t Read(void *to, std::size_t amount, ReadCompressed &thunk) { std::size_t got = PartialRead(fd_.get(), to, amount); ReadCount(thunk) += got; return got; } private: scoped_fd fd_; }; class UncompressedWithHeader : public ReadBase { public: UncompressedWithHeader(int fd, const void *already_data, std::size_t already_size) : fd_(fd) { assert(already_size); buf_.reset(malloc(already_size)); if (!buf_.get()) throw std::bad_alloc(); memcpy(buf_.get(), already_data, already_size); remain_ = static_cast(buf_.get()); end_ = remain_ + already_size; } std::size_t Read(void *to, std::size_t amount, ReadCompressed &thunk) { assert(buf_.get()); assert(remain_ != end_); std::size_t sending = std::min(amount, end_ - remain_); memcpy(to, remain_, sending); remain_ += sending; if (remain_ == end_) { ReplaceThis(new Uncompressed(fd_.release()), thunk); } return sending; } private: scoped_malloc buf_; uint8_t *remain_; uint8_t *end_; scoped_fd fd_; }; static const std::size_t kInputBuffer = 16384; template class StreamCompressed : public ReadBase { public: StreamCompressed(int fd, const void *already_data, std::size_t already_size) : file_(fd), in_buffer_(MallocOrThrow(kInputBuffer)), back_(memcpy(in_buffer_.get(), already_data, already_size), already_size) {} std::size_t Read(void *to, std::size_t amount, ReadCompressed &thunk) { if (amount == 0) return 0; back_.SetOutput(to, amount); do { if (!back_.Stream().avail_in) ReadInput(thunk); if (!back_.Process()) { // reached end, at least for the compressed portion. 
std::size_t ret = static_cast(static_cast(back_.Stream().next_out)) - static_cast(to); ReplaceThis(ReadFactory(file_.release(), ReadCount(thunk), back_.Stream().next_in, back_.Stream().avail_in, true), thunk); if (ret) return ret; // We did not read anything this round, so clients might think EOF. Transfer responsibility to the next reader. return Current(thunk)->Read(to, amount, thunk); } } while (back_.Stream().next_out == to); return static_cast(static_cast(back_.Stream().next_out)) - static_cast(to); } private: void ReadInput(ReadCompressed &thunk) { assert(!back_.Stream().avail_in); std::size_t got = ReadOrEOF(file_.get(), in_buffer_.get(), kInputBuffer); back_.SetInput(in_buffer_.get(), got); ReadCount(thunk) += got; } scoped_fd file_; scoped_malloc in_buffer_; Compression back_; }; #ifdef HAVE_ZLIB class GZip { public: GZip(const void *base, std::size_t amount) { SetInput(base, amount); stream_.zalloc = Z_NULL; stream_.zfree = Z_NULL; stream_.opaque = Z_NULL; stream_.msg = NULL; // 32 for zlib and gzip decoding with automatic header detection. // 15 for maximum window size. UTIL_THROW_IF(Z_OK != inflateInit2(&stream_, 32 + 15), GZException, "Failed to initialize zlib."); } ~GZip() { if (Z_OK != inflateEnd(&stream_)) { std::cerr << "zlib could not close properly." << std::endl; abort(); } } void SetOutput(void *to, std::size_t amount) { stream_.next_out = static_cast(to); stream_.avail_out = std::min(std::numeric_limits::max(), amount); } void SetInput(const void *base, std::size_t amount) { assert(amount < static_cast(std::numeric_limits::max())); stream_.next_in = const_cast(static_cast(base)); stream_.avail_in = amount; } const z_stream &Stream() const { return stream_; } bool Process() { int result = inflate(&stream_, 0); switch (result) { case Z_OK: return true; case Z_STREAM_END: return false; case Z_ERRNO: UTIL_THROW(ErrnoException, "zlib error"); default: UTIL_THROW(GZException, "zlib encountered " << (stream_.msg ? 
stream_.msg : "an error ") << " code " << result); } } private: z_stream stream_; }; #endif // HAVE_ZLIB #ifdef HAVE_BZLIB class BZip { public: BZip(const void *base, std::size_t amount) { memset(&stream_, 0, sizeof(stream_)); SetInput(base, amount); HandleError(BZ2_bzDecompressInit(&stream_, 0, 0)); } ~BZip() { try { HandleError(BZ2_bzDecompressEnd(&stream_)); } catch (const std::exception &e) { std::cerr << e.what() << std::endl; abort(); } } bool Process() { int ret = BZ2_bzDecompress(&stream_); if (ret == BZ_STREAM_END) return false; HandleError(ret); return true; } void SetOutput(void *base, std::size_t amount) { stream_.next_out = static_cast(base); stream_.avail_out = std::min(std::numeric_limits::max(), amount); } void SetInput(const void *base, std::size_t amount) { stream_.next_in = const_cast(static_cast(base)); stream_.avail_in = amount; } const bz_stream &Stream() const { return stream_; } private: void HandleError(int value) { switch(value) { case BZ_OK: return; case BZ_CONFIG_ERROR: UTIL_THROW(BZException, "bzip2 seems to be miscompiled."); case BZ_PARAM_ERROR: UTIL_THROW(BZException, "bzip2 Parameter error"); case BZ_DATA_ERROR: UTIL_THROW(BZException, "bzip2 detected a corrupt file"); case BZ_DATA_ERROR_MAGIC: UTIL_THROW(BZException, "bzip2 detected bad magic bytes. 
Perhaps this was not a bzip2 file after all?"); case BZ_MEM_ERROR: throw std::bad_alloc(); default: UTIL_THROW(BZException, "Unknown bzip2 error code " << value); } } bz_stream stream_; }; #endif // HAVE_BZLIB #ifdef HAVE_XZLIB class XZip { public: XZip(const void *base, std::size_t amount) : stream_(), action_(LZMA_RUN) { memset(&stream_, 0, sizeof(stream_)); SetInput(base, amount); HandleError(lzma_stream_decoder(&stream_, UINT64_MAX, 0)); } ~XZip() { lzma_end(&stream_); } void SetOutput(void *base, std::size_t amount) { stream_.next_out = static_cast(base); stream_.avail_out = amount; } void SetInput(const void *base, std::size_t amount) { stream_.next_in = static_cast(base); stream_.avail_in = amount; if (!amount) action_ = LZMA_FINISH; } const lzma_stream &Stream() const { return stream_; } bool Process() { lzma_ret status = lzma_code(&stream_, action_); if (status == LZMA_STREAM_END) return false; HandleError(status); return true; } private: void HandleError(lzma_ret value) { switch (value) { case LZMA_OK: return; case LZMA_MEM_ERROR: throw std::bad_alloc(); case LZMA_FORMAT_ERROR: UTIL_THROW(XZException, "xzlib says file format not recognized"); case LZMA_OPTIONS_ERROR: UTIL_THROW(XZException, "xzlib says unsupported compression options"); case LZMA_DATA_ERROR: UTIL_THROW(XZException, "xzlib says this file is corrupt"); case LZMA_BUF_ERROR: UTIL_THROW(XZException, "xzlib says unexpected end of input"); default: UTIL_THROW(XZException, "unrecognized xzlib error " << value); } } lzma_stream stream_; lzma_action action_; }; #endif // HAVE_XZLIB class IStreamReader : public ReadBase { public: explicit IStreamReader(std::istream &stream) : stream_(stream) {} std::size_t Read(void *to, std::size_t amount, ReadCompressed &thunk) { if (!stream_.read(static_cast(to), amount)) { UTIL_THROW_IF(!stream_.eof(), ErrnoException, "istream error"); amount = stream_.gcount(); } ReadCount(thunk) += amount; return amount; } private: std::istream &stream_; }; enum MagicResult { 
UTIL_UNKNOWN, UTIL_GZIP, UTIL_BZIP, UTIL_XZIP }; MagicResult DetectMagic(const void *from_void, std::size_t length) { const uint8_t *header = static_cast(from_void); if (length >= 2 && header[0] == 0x1f && header[1] == 0x8b) { return UTIL_GZIP; } const uint8_t kBZMagic[3] = {'B', 'Z', 'h'}; if (length >= sizeof(kBZMagic) && !memcmp(header, kBZMagic, sizeof(kBZMagic))) { return UTIL_BZIP; } const uint8_t kXZMagic[6] = { 0xFD, '7', 'z', 'X', 'Z', 0x00 }; if (length >= sizeof(kXZMagic) && !memcmp(header, kXZMagic, sizeof(kXZMagic))) { return UTIL_XZIP; } return UTIL_UNKNOWN; } ReadBase *ReadFactory(int fd, uint64_t &raw_amount, const void *already_data, const std::size_t already_size, bool require_compressed) { scoped_fd hold(fd); std::string header(reinterpret_cast(already_data), already_size); if (header.size() < ReadCompressed::kMagicSize) { std::size_t original = header.size(); header.resize(ReadCompressed::kMagicSize); std::size_t got = ReadOrEOF(fd, &header[original], ReadCompressed::kMagicSize - original); raw_amount += got; header.resize(original + got); } if (header.empty()) { hold.release(); return new Complete(); } switch (DetectMagic(&header[0], header.size())) { case UTIL_GZIP: #ifdef HAVE_ZLIB return new StreamCompressed(hold.release(), header.data(), header.size()); #else UTIL_THROW(CompressedException, "This looks like a gzip file but gzip support was not compiled in."); #endif case UTIL_BZIP: #ifdef HAVE_BZLIB return new StreamCompressed(hold.release(), &header[0], header.size()); #else UTIL_THROW(CompressedException, "This looks like a bzip file (it begins with BZh), but bzip support was not compiled in."); #endif case UTIL_XZIP: #ifdef HAVE_XZLIB return new StreamCompressed(hold.release(), header.data(), header.size()); #else UTIL_THROW(CompressedException, "This looks like an xz file, but xz support was not compiled in."); #endif default: UTIL_THROW_IF(require_compressed, CompressedException, "Uncompressed data detected after a compresssed file. 
This could be supported but usually indicates an error."); return new UncompressedWithHeader(hold.release(), header.data(), header.size()); } } } // namespace bool ReadCompressed::DetectCompressedMagic(const void *from_void) { return DetectMagic(from_void, kMagicSize) != UTIL_UNKNOWN; } ReadCompressed::ReadCompressed(int fd) { Reset(fd); } ReadCompressed::ReadCompressed(std::istream &in) { Reset(in); } ReadCompressed::ReadCompressed() {} ReadCompressed::~ReadCompressed() {} void ReadCompressed::Reset(int fd) { raw_amount_ = 0; internal_.reset(); internal_.reset(ReadFactory(fd, raw_amount_, NULL, 0, false)); } void ReadCompressed::Reset(std::istream &in) { internal_.reset(); internal_.reset(new IStreamReader(in)); } std::size_t ReadCompressed::Read(void *to, std::size_t amount) { return internal_->Read(to, amount, *this); } } // namespace util