1
1
mirror of https://github.com/wader/fq.git synced 2024-11-27 06:04:47 +03:00
fq/format/bzip2/bzip2.go
2022-07-19 18:33:50 +02:00

153 lines
3.9 KiB
Go

package bzip2
// https://en.wikipedia.org/wiki/Bzip2
// https://github.com/dsnet/compress/blob/master/doc/bzip2-format.pdf
// TODO: multiple streams, possible to figure out length of compressed? use footer magic?
// TODO: empty file, no streams
import (
"compress/bzip2"
"encoding/binary"
"hash/crc32"
"io"
"math/bits"
"github.com/wader/fq/format"
"github.com/wader/fq/pkg/bitio"
"github.com/wader/fq/pkg/decode"
"github.com/wader/fq/pkg/interp"
"github.com/wader/fq/pkg/scalar"
)
var probeGroup decode.Group
func init() {
interp.RegisterFormat(decode.Format{
Name: format.BZIP2,
Description: "bzip2 compression",
Groups: []string{format.PROBE},
DecodeFn: bzip2Decode,
Dependencies: []decode.Dependency{
{Names: []string{format.PROBE}, Group: &probeGroup},
},
})
}
const blockMagic = 0x31_41_59_26_53_59
const footerMagic = 0x17_72_45_38_50_90
type bitFlipReader struct {
r io.Reader
}
func (bfr bitFlipReader) Read(p []byte) (n int, err error) {
n, err = bfr.r.Read(p)
for i := 0; i < n; i++ {
p[i] = bits.Reverse8(p[i])
}
return n, err
}
func bzip2Decode(d *decode.D, _ any) any {
// moreStreams := true
// d.FieldArray("streams", func(d *decode.D) {
// for moreStreams {
// d.FieldStruct("stream", func(d *decode.D) {
var blockCRCValue *decode.Value
var streamCRCN uint32
d.FieldUTF8("magic", 2, d.AssertStr("BZ"))
d.FieldU8("version")
d.FieldU8("hundred_k_blocksize")
d.FieldStruct("block", func(d *decode.D) {
// if d.PeekBits(48) != blockHeaderMagic {
// moreStreams = false
// return
// }
d.FieldU48("magic", d.AssertU(blockMagic), scalar.ActualHex)
d.FieldU32("crc", scalar.ActualHex)
blockCRCValue = d.FieldGet("crc")
d.FieldU1("randomised")
d.FieldU24("origptr")
d.FieldU16("syncmapl1")
d.SeekRel(-16)
ranges := 0
for i := 0; i < 16; i++ {
if d.Bool() {
ranges++
}
}
d.FieldRawLen("syncmapl2", int64(ranges)*16)
numTrees := d.FieldU3("num_trees")
selectorsUsed := d.FieldU15("num_sels")
selectorsI := uint64(0)
d.FieldArrayLoop("selector_list", func() bool { return selectorsI < selectorsUsed }, func(d *decode.D) {
d.FieldU1("selector")
selectorsI++
})
treesI := uint64(0)
d.FieldArrayLoop("trees", func() bool { return treesI < numTrees }, func(d *decode.D) {
d.FieldUFn("tree", func(d *decode.D) uint64 {
l := d.U5()
if !d.Bool() {
return l
}
if d.Bool() {
l--
} else {
l++
}
return l
})
treesI++
})
})
compressedStart := d.Pos()
readCompressedSize, uncompressedBR, dv, _, _ := d.TryFieldReaderRangeFormat("uncompressed", 0, d.Len(), bzip2.NewReader, probeGroup, nil)
if uncompressedBR != nil {
if dv == nil {
d.FieldRootBitBuf("uncompressed", uncompressedBR)
}
blockCRC32W := crc32.NewIEEE()
d.Copy(blockCRC32W, bitFlipReader{bitio.NewIOReader(uncompressedBR)})
blockCRC32N := bits.Reverse32(binary.BigEndian.Uint32(blockCRC32W.Sum(nil)))
_ = blockCRCValue.TryScalarFn(d.ValidateU(uint64(blockCRC32N)))
streamCRCN = blockCRC32N ^ ((streamCRCN << 1) | (streamCRCN >> 31))
// HACK: bzip2.NewReader will read from start of whole buffer and then we figure out compressedSize ourself
// "It is important to note that none of the fields within a StreamBlock or StreamFooter are necessarily byte-aligned"
const footerByteSize = 10
compressedSize := (readCompressedSize - compressedStart) - footerByteSize*8
for i := 0; i < 8; i++ {
d.SeekAbs(compressedStart + compressedSize)
if d.PeekBits(48) == footerMagic {
break
}
compressedSize--
}
d.SeekAbs(compressedStart)
d.FieldRawLen("compressed", compressedSize)
d.FieldStruct("footer", func(d *decode.D) {
d.FieldU48("magic", d.AssertU(footerMagic), scalar.ActualHex)
// TODO: crc of block crcs
d.FieldU32("crc", scalar.ActualHex, d.ValidateU(uint64(streamCRCN)))
d.FieldRawLen("padding", int64(d.ByteAlignBits()))
})
}
// moreStreams = false
// }
// })
return nil
}