2012-10-30 23:33:22 +04:00
# include "util/read_compressed.hh"
# include "util/file.hh"
# include "util/have.hh"
# include "util/scoped.hh"
# include <algorithm>
# include <iostream>
2015-03-28 15:37:48 +03:00
# include <cassert>
# include <climits>
# include <cstdlib>
# include <cstring>
2012-10-30 23:33:22 +04:00
# ifdef HAVE_ZLIB
# include <zlib.h>
# endif
# ifdef HAVE_BZLIB
# include <bzlib.h>
# endif
# ifdef HAVE_XZLIB
# include <lzma.h>
# endif
namespace util {
// Out-of-line constructors and destructors for the exception types raised by
// the readers in this file.  Every one of them has an empty body and carries
// an empty exception specification.

// Generic compressed-input failure.
CompressedException::CompressedException() throw() {}
CompressedException::~CompressedException() throw() {}

// Failure reported while decoding with zlib.
GZException::GZException() throw() {}
GZException::~GZException() throw() {}

// Failure reported while decoding with bzip2.
BZException::BZException() throw() {}
BZException::~BZException() throw() {}

// Failure reported while decoding with liblzma.
XZException::XZException() throw() {}
XZException::~XZException() throw() {}
class ReadBase {
public :
virtual ~ ReadBase ( ) { }
virtual std : : size_t Read ( void * to , std : : size_t amount , ReadCompressed & thunk ) = 0 ;
protected :
static void ReplaceThis ( ReadBase * with , ReadCompressed & thunk ) {
thunk . internal_ . reset ( with ) ;
}
2014-01-30 21:03:01 +04:00
ReadBase * Current ( ReadCompressed & thunk ) { return thunk . internal_ . get ( ) ; }
2012-10-30 23:33:22 +04:00
static uint64_t & ReadCount ( ReadCompressed & thunk ) {
return thunk . raw_amount_ ;
}
} ;
namespace {
2014-01-30 21:03:01 +04:00
// Forward declaration.  The readers defined below call ReadFactory to choose
// the reader for whatever data follows the end of a compressed stream; the
// definition appears after the reader classes.
ReadBase *ReadFactory(
    int fd,
    uint64_t &raw_amount,
    const void *already_data,
    std::size_t already_size,
    bool require_compressed);
2015-04-30 08:05:11 +03:00
// Completed file that other classes can thunk to.
2012-10-30 23:33:22 +04:00
class Complete : public ReadBase {
public :
std : : size_t Read ( void * , std : : size_t , ReadCompressed & ) {
return 0 ;
}
} ;
class Uncompressed : public ReadBase {
public :
explicit Uncompressed ( int fd ) : fd_ ( fd ) { }
std : : size_t Read ( void * to , std : : size_t amount , ReadCompressed & thunk ) {
std : : size_t got = PartialRead ( fd_ . get ( ) , to , amount ) ;
ReadCount ( thunk ) + = got ;
return got ;
}
private :
scoped_fd fd_ ;
} ;
class UncompressedWithHeader : public ReadBase {
public :
2014-01-30 21:03:01 +04:00
UncompressedWithHeader ( int fd , const void * already_data , std : : size_t already_size ) : fd_ ( fd ) {
2012-10-30 23:33:22 +04:00
assert ( already_size ) ;
buf_ . reset ( malloc ( already_size ) ) ;
if ( ! buf_ . get ( ) ) throw std : : bad_alloc ( ) ;
memcpy ( buf_ . get ( ) , already_data , already_size ) ;
remain_ = static_cast < uint8_t * > ( buf_ . get ( ) ) ;
end_ = remain_ + already_size ;
}
std : : size_t Read ( void * to , std : : size_t amount , ReadCompressed & thunk ) {
assert ( buf_ . get ( ) ) ;
2014-01-30 21:03:01 +04:00
assert ( remain_ ! = end_ ) ;
2012-10-30 23:33:22 +04:00
std : : size_t sending = std : : min < std : : size_t > ( amount , end_ - remain_ ) ;
memcpy ( to , remain_ , sending ) ;
remain_ + = sending ;
if ( remain_ = = end_ ) {
ReplaceThis ( new Uncompressed ( fd_ . release ( ) ) , thunk ) ;
}
return sending ;
}
private :
scoped_malloc buf_ ;
uint8_t * remain_ ;
uint8_t * end_ ;
scoped_fd fd_ ;
} ;
2014-01-30 21:03:01 +04:00
static const std : : size_t kInputBuffer = 16384 ;
template < class Compression > class StreamCompressed : public ReadBase {
public :
StreamCompressed ( int fd , const void * already_data , std : : size_t already_size )
: file_ ( fd ) ,
in_buffer_ ( MallocOrThrow ( kInputBuffer ) ) ,
back_ ( memcpy ( in_buffer_ . get ( ) , already_data , already_size ) , already_size ) { }
2015-04-30 08:05:11 +03:00
2014-01-30 21:03:01 +04:00
std : : size_t Read ( void * to , std : : size_t amount , ReadCompressed & thunk ) {
if ( amount = = 0 ) return 0 ;
back_ . SetOutput ( to , amount ) ;
do {
if ( ! back_ . Stream ( ) . avail_in ) ReadInput ( thunk ) ;
if ( ! back_ . Process ( ) ) {
// reached end, at least for the compressed portion.
std : : size_t ret = static_cast < const uint8_t * > ( static_cast < void * > ( back_ . Stream ( ) . next_out ) ) - static_cast < const uint8_t * > ( to ) ;
ReplaceThis ( ReadFactory ( file_ . release ( ) , ReadCount ( thunk ) , back_ . Stream ( ) . next_in , back_ . Stream ( ) . avail_in , true ) , thunk ) ;
if ( ret ) return ret ;
// We did not read anything this round, so clients might think EOF. Transfer responsibility to the next reader.
return Current ( thunk ) - > Read ( to , amount , thunk ) ;
}
} while ( back_ . Stream ( ) . next_out = = to ) ;
return static_cast < const uint8_t * > ( static_cast < void * > ( back_ . Stream ( ) . next_out ) ) - static_cast < const uint8_t * > ( to ) ;
}
2012-10-30 23:33:22 +04:00
private :
2014-01-30 21:03:01 +04:00
void ReadInput ( ReadCompressed & thunk ) {
assert ( ! back_ . Stream ( ) . avail_in ) ;
std : : size_t got = ReadOrEOF ( file_ . get ( ) , in_buffer_ . get ( ) , kInputBuffer ) ;
back_ . SetInput ( in_buffer_ . get ( ) , got ) ;
ReadCount ( thunk ) + = got ;
}
scoped_fd file_ ;
scoped_malloc in_buffer_ ;
Compression back_ ;
} ;
#ifdef HAVE_ZLIB
// zlib backend for StreamCompressed: a thin wrapper around a z_stream that is
// set up for inflation.
class GZip {
  public:
    // Points the stream at amount bytes of input starting at base and
    // initializes inflation.  Throws GZException if inflateInit2 fails.
    GZip(const void *base, std::size_t amount) {
      SetInput(base, amount);
      stream_.zalloc = Z_NULL;
      stream_.zfree = Z_NULL;
      stream_.opaque = Z_NULL;
      stream_.msg = NULL;
      // 32 for zlib and gzip decoding with automatic header detection.
      // 15 for maximum window size.
      const int window_bits = 32 + 15;
      UTIL_THROW_IF(Z_OK != inflateInit2(&stream_, window_bits), GZException, " Failed to initialize zlib. ");
    }

    // Releases zlib state; prints a message and aborts if zlib reports
    // anything other than Z_OK.
    ~GZip() {
      if (inflateEnd(&stream_) == Z_OK) return;
      std::cerr << " zlib could not close properly. " << std::endl;
      abort();
    }

    // Sets where decoded bytes go.  avail_out is capped at the largest value
    // a uInt can hold.
    void SetOutput(void *to, std::size_t amount) {
      const std::size_t cap = std::numeric_limits<uInt>::max();
      stream_.next_out = static_cast<Bytef*>(to);
      stream_.avail_out = (amount < cap) ? amount : cap;
    }

    // Sets the next chunk of compressed input.  amount must be below the
    // largest uInt (asserted).
    void SetInput(const void *base, std::size_t amount) {
      assert(amount < static_cast<std::size_t>(std::numeric_limits<uInt>::max()));
      // zlib's next_in is not const-qualified here, hence the const_cast.
      stream_.next_in = const_cast<Bytef*>(static_cast<const Bytef*>(base));
      stream_.avail_in = amount;
    }

    const z_stream &Stream() const { return stream_; }

    // Runs one inflate step.  Returns true on Z_OK and false on
    // Z_STREAM_END; throws ErrnoException on Z_ERRNO and GZException on any
    // other result.
    bool Process() {
      // Z_NO_FLUSH is zlib's named constant for the flush value 0.
      const int result = inflate(&stream_, Z_NO_FLUSH);
      if (result == Z_OK) return true;
      if (result == Z_STREAM_END) return false;
      if (result == Z_ERRNO) {
        UTIL_THROW(ErrnoException, " zlib error ");
      }
      UTIL_THROW(GZException, " zlib encountered " << (stream_.msg ? stream_.msg : " an error ") << " code " << result);
    }

  private:
    z_stream stream_;
};
#endif // HAVE_ZLIB
#ifdef HAVE_BZLIB
// bzip2 backend for StreamCompressed: a thin wrapper around a bz_stream that
// is set up for decompression.
class BZip {
  public:
    // Zeroes the stream, points it at amount bytes of input starting at base
    // and initializes decompression.  Throws (see HandleError) if
    // BZ2_bzDecompressInit fails.
    BZip(const void *base, std::size_t amount) {
      memset(&stream_, 0, sizeof(stream_));
      SetInput(base, amount);
      HandleError(BZ2_bzDecompressInit(&stream_, 0, 0));
    }

    // Releases bzip2 state; prints the error and aborts if that fails, since
    // a destructor cannot report it any other way.
    ~BZip() {
      try {
        HandleError(BZ2_bzDecompressEnd(&stream_));
      } catch (const std::exception &e) {
        std::cerr << e.what() << std::endl;
        abort();
      }
    }

    // Runs one decompression step.  Returns false once the stream has ended
    // and true otherwise.  Throws via HandleError on bzip2 errors, and throws
    // BZException when the call had no input and produced no output.
    bool Process() {
      const bool had_input = stream_.avail_in != 0;
      const unsigned int room_before = stream_.avail_out;
      const int ret = BZ2_bzDecompress(&stream_);
      if (ret == BZ_STREAM_END) return false;
      HandleError(ret);
      // bzip2 reports BZ_OK even when it has no input to work on.  A call
      // that starts with no input, has room for output and still produces
      // nothing cannot make progress; the stream never reached its end
      // marker, so report truncation instead of letting the caller spin.
      UTIL_THROW_IF(!had_input && room_before != 0 && stream_.avail_out == room_before, BZException, "bzip2 input ended before the end of the compressed stream; the file appears to be truncated.");
      return true;
    }

    // Sets where decoded bytes go.  avail_out is capped at the largest value
    // an unsigned int can hold.
    void SetOutput(void *base, std::size_t amount) {
      stream_.next_out = static_cast<char*>(base);
      stream_.avail_out = std::min<std::size_t>(std::numeric_limits<unsigned int>::max(), amount);
    }

    // Sets the next chunk of compressed input.  amount must fit in the
    // stream's unsigned int avail_in field (asserted, mirroring GZip).
    void SetInput(const void *base, std::size_t amount) {
      assert(amount < static_cast<std::size_t>(std::numeric_limits<unsigned int>::max()));
      // bz_stream's next_in is not const-qualified, hence the const_cast.
      stream_.next_in = const_cast<char*>(static_cast<const char*>(base));
      stream_.avail_in = amount;
    }

    const bz_stream &Stream() const { return stream_; }

  private:
    // Translates a bzip2 return code: returns on BZ_OK, throws std::bad_alloc
    // on BZ_MEM_ERROR and BZException on every other code.
    void HandleError(int value) {
      switch(value) {
        case BZ_OK:
          return;
        case BZ_CONFIG_ERROR:
          UTIL_THROW(BZException, " bzip2 seems to be miscompiled. ");
        case BZ_PARAM_ERROR:
          UTIL_THROW(BZException, " bzip2 Parameter error ");
        case BZ_DATA_ERROR:
          UTIL_THROW(BZException, " bzip2 detected a corrupt file ");
        case BZ_DATA_ERROR_MAGIC:
          UTIL_THROW(BZException, " bzip2 detected bad magic bytes. Perhaps this was not a bzip2 file after all? ");
        case BZ_MEM_ERROR:
          throw std::bad_alloc();
        default:
          UTIL_THROW(BZException, " Unknown bzip2 error code " << value);
      }
    }

    bz_stream stream_;
};
#endif // HAVE_BZLIB
#ifdef HAVE_XZLIB
// liblzma backend for StreamCompressed: a thin wrapper around an lzma_stream
// configured as a stream decoder.
class XZip {
  public:
    // Zeroes the stream, points it at amount bytes of input starting at base
    // and creates the decoder.  Throws (see HandleError) if
    // lzma_stream_decoder fails.
    XZip(const void *base, std::size_t amount)
      : stream_(), action_(LZMA_RUN) {
      memset(&stream_, 0, sizeof(stream_));
      SetInput(base, amount);
      const lzma_ret status = lzma_stream_decoder(&stream_, UINT64_MAX, 0);
      HandleError(status);
    }

    ~XZip() {
      lzma_end(&stream_);
    }

    // Sets where decoded bytes go.
    void SetOutput(void *base, std::size_t amount) {
      stream_.next_out = static_cast<uint8_t*>(base);
      stream_.avail_out = amount;
    }

    // Sets the next chunk of compressed input.  An empty chunk switches the
    // action passed to lzma_code from LZMA_RUN to LZMA_FINISH.
    void SetInput(const void *base, std::size_t amount) {
      stream_.next_in = static_cast<const uint8_t*>(base);
      stream_.avail_in = amount;
      if (amount == 0) action_ = LZMA_FINISH;
    }

    const lzma_stream &Stream() const { return stream_; }

    // Runs one decode step.  Returns false on LZMA_STREAM_END and true on
    // LZMA_OK; any other status throws via HandleError.
    bool Process() {
      const lzma_ret status = lzma_code(&stream_, action_);
      if (status != LZMA_STREAM_END) {
        HandleError(status);
        return true;
      }
      return false;
    }

  private:
    // Translates an lzma_ret: returns on LZMA_OK, throws std::bad_alloc on
    // LZMA_MEM_ERROR and XZException on every other value.
    void HandleError(lzma_ret value) {
      if (value == LZMA_OK) return;
      if (value == LZMA_MEM_ERROR) throw std::bad_alloc();
      if (value == LZMA_FORMAT_ERROR) {
        UTIL_THROW(XZException, " xzlib says file format not recognized ");
      }
      if (value == LZMA_OPTIONS_ERROR) {
        UTIL_THROW(XZException, " xzlib says unsupported compression options ");
      }
      if (value == LZMA_DATA_ERROR) {
        UTIL_THROW(XZException, " xzlib says this file is corrupt ");
      }
      if (value == LZMA_BUF_ERROR) {
        UTIL_THROW(XZException, " xzlib says unexpected end of input ");
      }
      UTIL_THROW(XZException, " unrecognized xzlib error " << value);
    }

    lzma_stream stream_;
    lzma_action action_;
};
#endif // HAVE_XZLIB
2013-01-23 00:23:35 +04:00
class IStreamReader : public ReadBase {
public :
explicit IStreamReader ( std : : istream & stream ) : stream_ ( stream ) { }
std : : size_t Read ( void * to , std : : size_t amount , ReadCompressed & thunk ) {
if ( ! stream_ . read ( static_cast < char * > ( to ) , amount ) ) {
UTIL_THROW_IF ( ! stream_ . eof ( ) , ErrnoException , " istream error " ) ;
amount = stream_ . gcount ( ) ;
}
ReadCount ( thunk ) + = amount ;
return amount ;
}
private :
std : : istream & stream_ ;
} ;
2012-10-30 23:33:22 +04:00
// Formats DetectMagic can report.  UTIL_UNKNOWN means no known magic matched.
enum MagicResult {
  UTIL_UNKNOWN,
  UTIL_GZIP,
  UTIL_BZIP,
  UTIL_XZIP
};

// Compares the first bytes at from_void against the gzip, bzip2 and xz magic
// sequences, in that order, and returns the first format whose magic matches.
// length is the number of readable bytes; a magic longer than length is never
// compared, so short or empty inputs yield UTIL_UNKNOWN.
MagicResult DetectMagic(const void *from_void, std::size_t length) {
  static const uint8_t kGZMagic[2] = {0x1f, 0x8b};
  static const uint8_t kBZMagic[3] = {'B', 'Z', 'h'};
  static const uint8_t kXZMagic[6] = {0xFD, '7', 'z', 'X', 'Z', 0x00};

  struct Signature {
    const uint8_t *bytes;
    std::size_t size;
    MagicResult format;
  };
  const Signature known[] = {
    {kGZMagic, sizeof(kGZMagic), UTIL_GZIP},
    {kBZMagic, sizeof(kBZMagic), UTIL_BZIP},
    {kXZMagic, sizeof(kXZMagic), UTIL_XZIP}
  };

  const std::size_t count = sizeof(known) / sizeof(known[0]);
  for (std::size_t i = 0; i < count; ++i) {
    const Signature &candidate = known[i];
    if (length < candidate.size) continue;
    if (memcmp(from_void, candidate.bytes, candidate.size) == 0) return candidate.format;
  }
  return UTIL_UNKNOWN;
}
2014-01-30 21:03:01 +04:00
// Chooses and allocates the reader for fd.
//   fd                 descriptor to read from; held in a scoped_fd here until
//                      it is released to the reader that is returned.
//   raw_amount         incremented by the bytes read from fd while sniffing.
//   already_data/size  bytes previously read from fd that belong in front of
//                      whatever is read next; may be NULL with size 0.
//   require_compressed when true, data without a known magic is an error.
// Returns a heap-allocated reader the caller takes over: Complete when there
// is no data at all, a StreamCompressed for gzip, bzip2 or xz magic, otherwise
// UncompressedWithHeader.  Throws CompressedException when the detected format
// was not compiled in, or when require_compressed is set and no magic matched.
ReadBase *ReadFactory(int fd, uint64_t &raw_amount, const void *already_data, const std::size_t already_size, bool require_compressed) {
  scoped_fd hold(fd);
  // Callers may pass a NULL pointer together with a zero size.  Constructing
  // a std::string from a null pointer is not permitted by older standards
  // even for a zero count, so only copy when there is something to copy.
  std::string header;
  if (already_size) {
    header.assign(static_cast<const char*>(already_data), already_size);
  }
  if (header.size() < ReadCompressed::kMagicSize) {
    // Top up to kMagicSize bytes so there is enough to sniff the format.
    std::size_t original = header.size();
    header.resize(ReadCompressed::kMagicSize);
    std::size_t got = ReadOrEOF(fd, &header[original], ReadCompressed::kMagicSize - original);
    raw_amount += got;
    // Drop the part of the resize that the read did not fill.
    header.resize(original + got);
  }
  if (header.empty()) {
    // Nothing buffered and nothing left to read.
    return new Complete();
  }
  switch (DetectMagic(header.data(), header.size())) {
    case UTIL_GZIP:
#ifdef HAVE_ZLIB
      return new StreamCompressed<GZip>(hold.release(), header.data(), header.size());
#else
      UTIL_THROW(CompressedException, " This looks like a gzip file but gzip support was not compiled in. ");
#endif
    case UTIL_BZIP:
#ifdef HAVE_BZLIB
      return new StreamCompressed<BZip>(hold.release(), header.data(), header.size());
#else
      UTIL_THROW(CompressedException, " This looks like a bzip file (it begins with BZh), but bzip support was not compiled in. ");
#endif
    case UTIL_XZIP:
#ifdef HAVE_XZLIB
      return new StreamCompressed<XZip>(hold.release(), header.data(), header.size());
#else
      UTIL_THROW(CompressedException, " This looks like an xz file, but xz support was not compiled in. ");
#endif
    default:
      UTIL_THROW_IF(require_compressed, CompressedException, " Uncompressed data detected after a compressed file. This could be supported but usually indicates an error. ");
      return new UncompressedWithHeader(hold.release(), header.data(), header.size());
  }
}
} // namespace
bool ReadCompressed : : DetectCompressedMagic ( const void * from_void ) {
2014-01-30 21:03:01 +04:00
return DetectMagic ( from_void , kMagicSize ) ! = UTIL_UNKNOWN ;
2012-10-30 23:33:22 +04:00
}
// Every constructor starts the raw byte counter at zero so that it never
// holds an indeterminate value, whichever way the object is set up.

// Reads from the file descriptor fd; see Reset(int).
ReadCompressed::ReadCompressed(int fd) : raw_amount_(0) {
  Reset(fd);
}

// Reads from the stream in; see Reset(std::istream &).
ReadCompressed::ReadCompressed(std::istream &in) : raw_amount_(0) {
  Reset(in);
}

// Leaves the object without a reader; call Reset before Read.
ReadCompressed::ReadCompressed() : raw_amount_(0) {}

ReadCompressed::~ReadCompressed() {}
void ReadCompressed : : Reset ( int fd ) {
2014-01-31 03:55:25 +04:00
raw_amount_ = 0 ;
2012-10-30 23:33:22 +04:00
internal_ . reset ( ) ;
2014-01-30 21:03:01 +04:00
internal_ . reset ( ReadFactory ( fd , raw_amount_ , NULL , 0 , false ) ) ;
2012-10-30 23:33:22 +04:00
}
2013-01-23 00:23:35 +04:00
void ReadCompressed : : Reset ( std : : istream & in ) {
internal_ . reset ( ) ;
internal_ . reset ( new IStreamReader ( in ) ) ;
}
2012-10-30 23:33:22 +04:00
std : : size_t ReadCompressed : : Read ( void * to , std : : size_t amount ) {
return internal_ - > Read ( to , amount , * this ) ;
}
2014-07-19 02:54:01 +04:00
std : : size_t ReadCompressed : : ReadOrEOF ( void * const to_in , std : : size_t amount ) {
uint8_t * to = reinterpret_cast < uint8_t * > ( to_in ) ;
while ( amount ) {
std : : size_t got = Read ( to , amount ) ;
if ( ! got ) break ;
to + = got ;
amount - = got ;
}
return to - reinterpret_cast < uint8_t * > ( to_in ) ;
}
2012-10-30 23:33:22 +04:00
} // namespace util