2020-06-08 03:29:51 +03:00
|
|
|
package bzip2
|
|
|
|
|
|
|
|
// https://en.wikipedia.org/wiki/Bzip2
// https://github.com/dsnet/compress/blob/master/doc/bzip2-format.pdf
// TODO: support multiple streams; is it possible to figure out the length of the compressed data, e.g. by looking for the footer magic?
// TODO: handle an empty file (no streams)
|
|
|
|
|
|
|
|
import (
|
|
|
|
"compress/bzip2"
|
2021-11-21 22:13:42 +03:00
|
|
|
"encoding/binary"
|
|
|
|
"hash/crc32"
|
|
|
|
"io"
|
|
|
|
"math/bits"
|
2021-08-17 13:06:32 +03:00
|
|
|
|
|
|
|
"github.com/wader/fq/format"
|
|
|
|
"github.com/wader/fq/format/registry"
|
|
|
|
"github.com/wader/fq/pkg/decode"
|
2021-12-02 00:48:25 +03:00
|
|
|
"github.com/wader/fq/pkg/scalar"
|
2020-06-08 03:29:51 +03:00
|
|
|
)
|
|
|
|
|
2021-11-17 18:46:10 +03:00
|
|
|
// probeGroup is the format group handed to TryFieldReaderRangeFormat when
// decoding the uncompressed data. init declares it as the target of this
// format's format.PROBE dependency (Group: &probeGroup).
var probeGroup decode.Group
|
2020-06-08 03:29:51 +03:00
|
|
|
|
|
|
|
func init() {
|
2021-11-17 18:46:10 +03:00
|
|
|
registry.MustRegister(decode.Format{
|
2020-06-08 03:29:51 +03:00
|
|
|
Name: format.BZIP2,
|
|
|
|
Description: "bzip2 compression",
|
|
|
|
Groups: []string{format.PROBE},
|
2021-11-18 00:14:49 +03:00
|
|
|
DecodeFn: bzip2Decode,
|
2020-06-08 03:29:51 +03:00
|
|
|
Dependencies: []decode.Dependency{
|
2021-11-17 18:46:10 +03:00
|
|
|
{Names: []string{format.PROBE}, Group: &probeGroup},
|
2020-06-08 03:29:51 +03:00
|
|
|
},
|
|
|
|
})
|
|
|
|
}
|
|
|
|
|
2021-11-18 00:14:49 +03:00
|
|
|
// 48-bit magic numbers asserted by bzip2Decode.
const (
	// blockMagic starts a block header. Its hex digits spell the leading
	// decimal digits of pi (3.14159265359).
	blockMagic = 0x31_41_59_26_53_59
	// footerMagic starts the stream footer. Its hex digits spell the leading
	// decimal digits of sqrt(pi) (1.77245385090).
	footerMagic = 0x17_72_45_38_50_90
)
|
|
|
|
|
2021-11-21 22:13:42 +03:00
|
|
|
// bitFlipReader wraps an io.Reader and reverses the bit order of every byte
// read through it (bit 0 becomes bit 7 and so on).
type bitFlipReader struct {
	r io.Reader
}

// Compile-time check that bitFlipReader satisfies io.Reader.
var _ io.Reader = bitFlipReader{}

// Read reads from the wrapped reader into p and bit-reverses the n bytes that
// were actually read. Bytes in p beyond n are left untouched. The count and
// error from the wrapped reader are returned unchanged.
func (bfr bitFlipReader) Read(p []byte) (n int, err error) {
	n, err = bfr.r.Read(p)
	// Only the first n bytes are valid, also when err is non-nil.
	for i := 0; i < n; i++ {
		p[i] = bits.Reverse8(p[i])
	}
	return n, err
}
|
|
|
|
|
2021-11-18 00:14:49 +03:00
|
|
|
func bzip2Decode(d *decode.D, in interface{}) interface{} {
|
2020-06-08 03:29:51 +03:00
|
|
|
// moreStreams := true
|
|
|
|
|
2021-11-05 17:04:26 +03:00
|
|
|
// d.FieldArray("streams", func(d *decode.D) {
|
2020-06-08 03:29:51 +03:00
|
|
|
// for moreStreams {
|
2021-11-05 17:04:26 +03:00
|
|
|
// d.FieldStruct("stream", func(d *decode.D) {
|
2020-06-08 03:29:51 +03:00
|
|
|
|
2021-11-21 22:13:42 +03:00
|
|
|
var blockCRCValue *decode.Value
|
|
|
|
var streamCRCN uint32
|
|
|
|
|
2021-11-05 17:04:26 +03:00
|
|
|
d.FieldUTF8("magic", 2, d.AssertStr("BZ"))
|
2020-06-08 03:29:51 +03:00
|
|
|
d.FieldU8("version")
|
|
|
|
d.FieldU8("hundred_k_blocksize")
|
|
|
|
|
2021-11-05 17:04:26 +03:00
|
|
|
d.FieldStruct("block", func(d *decode.D) {
|
2020-06-08 03:29:51 +03:00
|
|
|
// if d.PeekBits(48) != blockHeaderMagic {
|
|
|
|
// moreStreams = false
|
|
|
|
// return
|
|
|
|
// }
|
2021-12-02 00:48:25 +03:00
|
|
|
d.FieldU48("magic", d.AssertU(blockMagic), scalar.Hex)
|
|
|
|
d.FieldU32("crc", scalar.Hex)
|
2021-11-21 22:13:42 +03:00
|
|
|
blockCRCValue = d.FieldGet("crc")
|
2020-06-08 03:29:51 +03:00
|
|
|
d.FieldU1("randomised")
|
|
|
|
d.FieldU24("origptr")
|
|
|
|
d.FieldU16("syncmapl1")
|
|
|
|
|
|
|
|
d.SeekRel(-16)
|
|
|
|
ranges := 0
|
|
|
|
for i := 0; i < 16; i++ {
|
|
|
|
if d.Bool() {
|
|
|
|
ranges++
|
|
|
|
}
|
|
|
|
}
|
2021-11-05 17:04:26 +03:00
|
|
|
d.FieldRawLen("syncmapl2", int64(ranges)*16)
|
2020-06-08 03:29:51 +03:00
|
|
|
numTrees := d.FieldU3("num_trees")
|
|
|
|
selectorsUsed := d.FieldU15("num_sels")
|
|
|
|
selectorsI := uint64(0)
|
2021-11-05 17:04:26 +03:00
|
|
|
d.FieldArrayLoop("selector_list", func() bool { return selectorsI < selectorsUsed }, func(d *decode.D) {
|
2020-06-08 03:29:51 +03:00
|
|
|
d.FieldU1("selector")
|
|
|
|
selectorsI++
|
|
|
|
})
|
|
|
|
treesI := uint64(0)
|
2021-11-05 17:04:26 +03:00
|
|
|
d.FieldArrayLoop("trees", func() bool { return treesI < numTrees }, func(d *decode.D) {
|
|
|
|
d.FieldUFn("tree", func(d *decode.D) uint64 {
|
2020-06-08 03:29:51 +03:00
|
|
|
l := d.U5()
|
|
|
|
if !d.Bool() {
|
2021-11-05 17:04:26 +03:00
|
|
|
return l
|
2020-06-08 03:29:51 +03:00
|
|
|
}
|
|
|
|
if d.Bool() {
|
|
|
|
l--
|
|
|
|
} else {
|
|
|
|
l++
|
|
|
|
}
|
2021-11-05 17:04:26 +03:00
|
|
|
return l
|
2020-06-08 03:29:51 +03:00
|
|
|
})
|
|
|
|
treesI++
|
|
|
|
})
|
|
|
|
})
|
|
|
|
|
2021-11-18 00:14:49 +03:00
|
|
|
compressedStart := d.Pos()
|
|
|
|
|
2021-11-24 17:25:27 +03:00
|
|
|
readCompressedSize, uncompressedBB, dv, _, _ := d.TryFieldReaderRangeFormat("uncompressed", 0, d.Len(), bzip2.NewReader, probeGroup, nil)
|
|
|
|
if uncompressedBB != nil {
|
|
|
|
if dv == nil {
|
|
|
|
d.FieldRootBitBuf("uncompressed", uncompressedBB)
|
|
|
|
}
|
2020-06-08 03:29:51 +03:00
|
|
|
|
2021-11-24 17:25:27 +03:00
|
|
|
blockCRC32W := crc32.NewIEEE()
|
2021-11-24 23:20:46 +03:00
|
|
|
d.MustCopy(blockCRC32W, bitFlipReader{uncompressedBB.Clone()})
|
2021-11-24 17:25:27 +03:00
|
|
|
blockCRC32N := bits.Reverse32(binary.BigEndian.Uint32(blockCRC32W.Sum(nil)))
|
|
|
|
_ = blockCRCValue.TryScalarFn(d.ValidateU(uint64(blockCRC32N)))
|
|
|
|
streamCRCN = blockCRC32N ^ ((streamCRCN << 1) | (streamCRCN >> 31))
|
|
|
|
|
|
|
|
// HACK: bzip2.NewReader will read from start of whole buffer and then we figure out compressedSize ourself
|
|
|
|
// "It is important to note that none of the fields within a StreamBlock or StreamFooter are necessarily byte-aligned"
|
|
|
|
const footerByteSize = 10
|
|
|
|
compressedSize := (readCompressedSize - compressedStart) - footerByteSize*8
|
|
|
|
for i := 0; i < 8; i++ {
|
|
|
|
d.SeekAbs(compressedStart + compressedSize)
|
|
|
|
if d.PeekBits(48) == footerMagic {
|
|
|
|
break
|
|
|
|
}
|
|
|
|
compressedSize--
|
|
|
|
}
|
|
|
|
d.SeekAbs(compressedStart)
|
2021-11-18 00:14:49 +03:00
|
|
|
|
2021-11-24 17:25:27 +03:00
|
|
|
d.FieldRawLen("compressed", compressedSize)
|
2021-11-18 00:14:49 +03:00
|
|
|
|
2021-11-24 17:25:27 +03:00
|
|
|
d.FieldStruct("footer", func(d *decode.D) {
|
2021-12-02 00:48:25 +03:00
|
|
|
d.FieldU48("magic", d.AssertU(footerMagic), scalar.Hex)
|
2021-11-24 17:25:27 +03:00
|
|
|
// TODO: crc of block crcs
|
2021-12-02 00:48:25 +03:00
|
|
|
d.FieldU32("crc", scalar.Hex, d.ValidateU(uint64(streamCRCN)))
|
2021-11-24 17:25:27 +03:00
|
|
|
d.FieldRawLen("padding", int64(d.ByteAlignBits()))
|
|
|
|
})
|
|
|
|
}
|
2020-06-08 03:29:51 +03:00
|
|
|
|
|
|
|
// moreStreams = false
|
|
|
|
// }
|
|
|
|
// })
|
|
|
|
|
|
|
|
return nil
|
|
|
|
}
|