1
1
mirror of https://github.com/wader/fq.git synced 2024-10-27 04:09:37 +03:00
fq/format/bzip2/bzip2.go
Mattias Wadman e2eb667091 html: Add to probe group
As decoder now can know they are decoding as part of probing we can now
use some heuristics to see if we should decode as html.
The reason heuristics is needed is that x/html parser will alwaus succeed.

Add lazyre package to help delay compile of RE and make it concurrency safe.
2023-05-11 19:07:18 +02:00

155 lines
3.9 KiB
Go

package bzip2
// https://en.wikipedia.org/wiki/Bzip2
// https://github.com/dsnet/compress/blob/master/doc/bzip2-format.pdf
// TODO: multiple streams, possible to figure out length of compressed? use footer magic?
// TODO: empty file, no streams
import (
"compress/bzip2"
"encoding/binary"
"hash/crc32"
"io"
"math/bits"
"github.com/wader/fq/format"
"github.com/wader/fq/pkg/bitio"
"github.com/wader/fq/pkg/decode"
"github.com/wader/fq/pkg/interp"
"github.com/wader/fq/pkg/scalar"
)
var probeGroup decode.Group
func init() {
interp.RegisterFormat(
format.Bzip2,
&decode.Format{
Description: "bzip2 compression",
Groups: []*decode.Group{format.Probe},
DecodeFn: bzip2Decode,
Dependencies: []decode.Dependency{
{Groups: []*decode.Group{format.Probe}, Out: &probeGroup},
},
})
}
const blockMagic = 0x31_41_59_26_53_59
const footerMagic = 0x17_72_45_38_50_90
type bitFlipReader struct {
r io.Reader
}
func (bfr bitFlipReader) Read(p []byte) (n int, err error) {
n, err = bfr.r.Read(p)
for i := 0; i < n; i++ {
p[i] = bits.Reverse8(p[i])
}
return n, err
}
func bzip2Decode(d *decode.D) any {
// moreStreams := true
// d.FieldArray("streams", func(d *decode.D) {
// for moreStreams {
// d.FieldStruct("stream", func(d *decode.D) {
var blockCRCValue *decode.Value
var streamCRCN uint32
d.FieldUTF8("magic", 2, d.StrAssert("BZ"))
d.FieldU8("version")
d.FieldU8("hundred_k_blocksize")
d.FieldStruct("block", func(d *decode.D) {
// if d.PeekBits(48) != blockHeaderMagic {
// moreStreams = false
// return
// }
d.FieldU48("magic", d.UintAssert(blockMagic), scalar.UintHex)
d.FieldU32("crc", scalar.UintHex)
blockCRCValue = d.FieldGet("crc")
d.FieldU1("randomised")
d.FieldU24("origptr")
d.FieldU16("syncmapl1")
d.SeekRel(-16)
ranges := 0
for i := 0; i < 16; i++ {
if d.Bool() {
ranges++
}
}
d.FieldRawLen("syncmapl2", int64(ranges)*16)
numTrees := d.FieldU3("num_trees")
selectorsUsed := d.FieldU15("num_sels")
selectorsI := uint64(0)
d.FieldArrayLoop("selector_list", func() bool { return selectorsI < selectorsUsed }, func(d *decode.D) {
d.FieldU1("selector")
selectorsI++
})
treesI := uint64(0)
d.FieldArrayLoop("trees", func() bool { return treesI < numTrees }, func(d *decode.D) {
d.FieldUintFn("tree", func(d *decode.D) uint64 {
l := d.U5()
if !d.Bool() {
return l
}
if d.Bool() {
l--
} else {
l++
}
return l
})
treesI++
})
})
compressedStart := d.Pos()
readCompressedSize, uncompressedBR, dv, _, _ :=
d.TryFieldReaderRangeFormat("uncompressed", 0, d.Len(), bzip2.NewReader, &probeGroup, format.Probe_In{})
if uncompressedBR != nil {
if dv == nil {
d.FieldRootBitBuf("uncompressed", uncompressedBR)
}
blockCRC32W := crc32.NewIEEE()
d.Copy(blockCRC32W, bitFlipReader{bitio.NewIOReader(uncompressedBR)})
blockCRC32N := bits.Reverse32(binary.BigEndian.Uint32(blockCRC32W.Sum(nil)))
_ = blockCRCValue.TryUintScalarFn(d.UintValidate(uint64(blockCRC32N)))
streamCRCN = blockCRC32N ^ ((streamCRCN << 1) | (streamCRCN >> 31))
// HACK: bzip2.NewReader will read from start of whole buffer and then we figure out compressedSize ourself
// "It is important to note that none of the fields within a StreamBlock or StreamFooter are necessarily byte-aligned"
const footerByteSize = 10
compressedSize := (readCompressedSize - compressedStart) - footerByteSize*8
for i := 0; i < 8; i++ {
d.SeekAbs(compressedStart + compressedSize)
if d.PeekUintBits(48) == footerMagic {
break
}
compressedSize--
}
d.SeekAbs(compressedStart)
d.FieldRawLen("compressed", compressedSize)
d.FieldStruct("footer", func(d *decode.D) {
d.FieldU48("magic", d.UintAssert(footerMagic), scalar.UintHex)
// TODO: crc of block crcs
d.FieldU32("crc", scalar.UintHex, d.UintValidate(uint64(streamCRCN)))
d.FieldRawLen("padding", int64(d.ByteAlignBits()))
})
}
// moreStreams = false
// }
// })
return nil
}