1
1
mirror of https://github.com/wader/fq.git synced 2024-12-27 07:24:48 +03:00
fq/format/bzip2/bzip2.go

152 lines
3.9 KiB
Go
Raw Normal View History

2020-06-08 03:29:51 +03:00
package bzip2
// https://en.wikipedia.org/wiki/Bzip2
// https://github.com/dsnet/compress/blob/master/doc/bzip2-format.pdf
// TODO: multiple streams, possible to figure out length of compressed? use footer magic?
// TODO: empty file, no streams
import (
"compress/bzip2"
2021-11-21 22:13:42 +03:00
"encoding/binary"
"hash/crc32"
"io"
"math/bits"
"github.com/wader/fq/format"
"github.com/wader/fq/format/registry"
"github.com/wader/fq/pkg/decode"
"github.com/wader/fq/pkg/scalar"
2020-06-08 03:29:51 +03:00
)
// probeGroup is the decode group bound to the format.PROBE dependency declared
// in init (presumably filled in by the registry; confirm). bzip2Decode passes
// it along when decoding the uncompressed data.
var probeGroup decode.Group
2020-06-08 03:29:51 +03:00
func init() {
registry.MustRegister(decode.Format{
2020-06-08 03:29:51 +03:00
Name: format.BZIP2,
Description: "bzip2 compression",
Groups: []string{format.PROBE},
2021-11-18 00:14:49 +03:00
DecodeFn: bzip2Decode,
2020-06-08 03:29:51 +03:00
Dependencies: []decode.Dependency{
{Names: []string{format.PROBE}, Group: &probeGroup},
2020-06-08 03:29:51 +03:00
},
})
}
2021-11-18 00:14:49 +03:00
// Magic numbers expected at the start of a block header and of the stream
// footer. Both are matched against 48 bit fields.
const (
	blockMagic  = 0x31_41_59_26_53_59
	footerMagic = 0x17_72_45_38_50_90
)
2021-11-21 22:13:42 +03:00
// bitFlipReader wraps an io.Reader and reverses the bit order of every byte
// read through it.
type bitFlipReader struct {
	r io.Reader
}

// Read fills p from the wrapped reader and then bit-reverses, in place, the
// bytes that were actually read. The byte count and error reported by the
// wrapped reader are returned unchanged.
func (bfr bitFlipReader) Read(p []byte) (n int, err error) {
	n, err = bfr.r.Read(p)
	filled := p[:n]
	for i, b := range filled {
		filled[i] = bits.Reverse8(b)
	}
	return n, err
}
2021-11-18 00:14:49 +03:00
func bzip2Decode(d *decode.D, in interface{}) interface{} {
2020-06-08 03:29:51 +03:00
// moreStreams := true
// d.FieldArray("streams", func(d *decode.D) {
2020-06-08 03:29:51 +03:00
// for moreStreams {
// d.FieldStruct("stream", func(d *decode.D) {
2020-06-08 03:29:51 +03:00
2021-11-21 22:13:42 +03:00
var blockCRCValue *decode.Value
var streamCRCN uint32
d.FieldUTF8("magic", 2, d.AssertStr("BZ"))
2020-06-08 03:29:51 +03:00
d.FieldU8("version")
d.FieldU8("hundred_k_blocksize")
d.FieldStruct("block", func(d *decode.D) {
2020-06-08 03:29:51 +03:00
// if d.PeekBits(48) != blockHeaderMagic {
// moreStreams = false
// return
// }
d.FieldU48("magic", d.AssertU(blockMagic), scalar.Hex)
d.FieldU32("crc", scalar.Hex)
2021-11-21 22:13:42 +03:00
blockCRCValue = d.FieldGet("crc")
2020-06-08 03:29:51 +03:00
d.FieldU1("randomised")
d.FieldU24("origptr")
d.FieldU16("syncmapl1")
d.SeekRel(-16)
ranges := 0
for i := 0; i < 16; i++ {
if d.Bool() {
ranges++
}
}
d.FieldRawLen("syncmapl2", int64(ranges)*16)
2020-06-08 03:29:51 +03:00
numTrees := d.FieldU3("num_trees")
selectorsUsed := d.FieldU15("num_sels")
selectorsI := uint64(0)
d.FieldArrayLoop("selector_list", func() bool { return selectorsI < selectorsUsed }, func(d *decode.D) {
2020-06-08 03:29:51 +03:00
d.FieldU1("selector")
selectorsI++
})
treesI := uint64(0)
d.FieldArrayLoop("trees", func() bool { return treesI < numTrees }, func(d *decode.D) {
d.FieldUFn("tree", func(d *decode.D) uint64 {
2020-06-08 03:29:51 +03:00
l := d.U5()
if !d.Bool() {
return l
2020-06-08 03:29:51 +03:00
}
if d.Bool() {
l--
} else {
l++
}
return l
2020-06-08 03:29:51 +03:00
})
treesI++
})
})
2021-11-18 00:14:49 +03:00
compressedStart := d.Pos()
readCompressedSize, uncompressedBB, dv, _, _ := d.TryFieldReaderRangeFormat("uncompressed", 0, d.Len(), bzip2.NewReader, probeGroup, nil)
if uncompressedBB != nil {
if dv == nil {
d.FieldRootBitBuf("uncompressed", uncompressedBB)
}
2020-06-08 03:29:51 +03:00
blockCRC32W := crc32.NewIEEE()
d.MustCopy(blockCRC32W, bitFlipReader{uncompressedBB.Clone()})
blockCRC32N := bits.Reverse32(binary.BigEndian.Uint32(blockCRC32W.Sum(nil)))
_ = blockCRCValue.TryScalarFn(d.ValidateU(uint64(blockCRC32N)))
streamCRCN = blockCRC32N ^ ((streamCRCN << 1) | (streamCRCN >> 31))
// HACK: bzip2.NewReader will read from start of whole buffer and then we figure out compressedSize ourself
// "It is important to note that none of the fields within a StreamBlock or StreamFooter are necessarily byte-aligned"
const footerByteSize = 10
compressedSize := (readCompressedSize - compressedStart) - footerByteSize*8
for i := 0; i < 8; i++ {
d.SeekAbs(compressedStart + compressedSize)
if d.PeekBits(48) == footerMagic {
break
}
compressedSize--
}
d.SeekAbs(compressedStart)
2021-11-18 00:14:49 +03:00
d.FieldRawLen("compressed", compressedSize)
2021-11-18 00:14:49 +03:00
d.FieldStruct("footer", func(d *decode.D) {
d.FieldU48("magic", d.AssertU(footerMagic), scalar.Hex)
// TODO: crc of block crcs
d.FieldU32("crc", scalar.Hex, d.ValidateU(uint64(streamCRCN)))
d.FieldRawLen("padding", int64(d.ByteAlignBits()))
})
}
2020-06-08 03:29:51 +03:00
// moreStreams = false
// }
// })
return nil
}