2021-12-28 22:01:53 +03:00
|
|
|
package avro
|
|
|
|
|
|
|
|
import (
|
2022-02-08 08:03:50 +03:00
|
|
|
"bytes"
|
|
|
|
"compress/flate"
|
2021-12-09 19:15:21 +03:00
|
|
|
"embed"
|
2022-02-10 05:46:12 +03:00
|
|
|
"hash/crc32"
|
|
|
|
|
2022-02-08 08:03:50 +03:00
|
|
|
"github.com/golang/snappy"
|
2021-12-28 22:01:53 +03:00
|
|
|
"github.com/wader/fq/format"
|
2021-12-30 01:12:36 +03:00
|
|
|
"github.com/wader/fq/format/avro/decoders"
|
2021-12-28 22:05:10 +03:00
|
|
|
"github.com/wader/fq/format/avro/schema"
|
2022-02-08 08:03:50 +03:00
|
|
|
"github.com/wader/fq/pkg/bitio"
|
2021-12-28 22:01:53 +03:00
|
|
|
"github.com/wader/fq/pkg/decode"
|
2022-07-16 19:39:57 +03:00
|
|
|
"github.com/wader/fq/pkg/interp"
|
2021-12-28 22:05:10 +03:00
|
|
|
"github.com/wader/fq/pkg/scalar"
|
2021-12-28 22:01:53 +03:00
|
|
|
)
|
|
|
|
|
2021-12-09 19:15:21 +03:00
|
|
|
//go:embed avro_ocf.jq
|
|
|
|
var avroOcfFS embed.FS
|
|
|
|
|
2021-12-28 22:01:53 +03:00
|
|
|
func init() {
|
2022-07-16 19:39:57 +03:00
|
|
|
interp.RegisterFormat(decode.Format{
|
2021-12-28 22:01:53 +03:00
|
|
|
Name: format.AVRO_OCF,
|
|
|
|
Description: "Avro object container file",
|
|
|
|
Groups: []string{format.PROBE},
|
2022-02-08 08:03:50 +03:00
|
|
|
DecodeFn: decodeAvroOCF,
|
2021-12-09 19:15:21 +03:00
|
|
|
Functions: []string{"_help"},
|
|
|
|
Files: avroOcfFS,
|
2021-12-28 22:01:53 +03:00
|
|
|
})
|
|
|
|
}
|
|
|
|
|
2021-12-28 22:05:10 +03:00
|
|
|
type HeaderData struct {
|
2022-01-20 06:00:44 +03:00
|
|
|
Schema schema.SimplifiedSchema
|
2021-12-28 22:05:10 +03:00
|
|
|
Codec string
|
|
|
|
Sync []byte
|
2021-12-28 22:01:53 +03:00
|
|
|
}
|
|
|
|
|
2022-01-20 06:00:44 +03:00
|
|
|
// headerSchemaSpec is the Avro schema (as JSON) used to decode the file header
// record: a string-valued "meta" map followed by a 16 byte fixed "sync" field.
const headerSchemaSpec = `
{
"type": "record",
"name": "org.apache.avro.file.Header",
"fields": [
{"name": "meta", "type": {"type": "map", "values": "string"}},
{"name": "sync", "type": {"type": "fixed", "name": "Sync", "size": 16}}
]
}`
|
|
|
|
|
2021-12-28 22:05:10 +03:00
|
|
|
func decodeHeader(d *decode.D) HeaderData {
|
2022-01-20 06:00:44 +03:00
|
|
|
d.FieldRawLen("magic", 4*8, d.AssertBitBuf([]byte{'O', 'b', 'j', 1}))
|
|
|
|
|
2021-12-28 22:05:10 +03:00
|
|
|
var headerData HeaderData
|
|
|
|
|
2022-01-20 06:00:44 +03:00
|
|
|
headerSchema, err := schema.FromSchemaString(headerSchemaSpec)
|
|
|
|
if err != nil {
|
|
|
|
d.Fatalf("Failed to parse header schema: %v", err)
|
|
|
|
}
|
|
|
|
decodeHeaderFn, err := decoders.DecodeFnForSchema(headerSchema)
|
|
|
|
if err != nil {
|
|
|
|
d.Fatalf("failed to parse header: %v", err)
|
|
|
|
}
|
2021-12-28 22:01:53 +03:00
|
|
|
|
2022-01-20 06:00:44 +03:00
|
|
|
header := decodeHeaderFn("header", d)
|
2022-05-20 16:10:41 +03:00
|
|
|
headerRecord, ok := header.(map[string]any)
|
2022-01-20 06:00:44 +03:00
|
|
|
if !ok {
|
|
|
|
d.Fatalf("header is not a map")
|
|
|
|
}
|
2022-05-20 16:10:41 +03:00
|
|
|
meta, ok := headerRecord["meta"].(map[string]any)
|
2022-01-20 06:00:44 +03:00
|
|
|
if !ok {
|
|
|
|
d.Fatalf("header.meta is not a map")
|
2021-12-28 22:05:10 +03:00
|
|
|
}
|
|
|
|
|
2022-07-23 23:16:56 +03:00
|
|
|
metaSchema, ok := meta["avro.schema"].(string)
|
|
|
|
if !ok {
|
|
|
|
d.Fatalf("missing meta avro.schema")
|
|
|
|
}
|
|
|
|
|
|
|
|
headerData.Schema, err = schema.FromSchemaString(metaSchema)
|
2022-01-20 06:00:44 +03:00
|
|
|
if err != nil {
|
|
|
|
d.Fatalf("failed to parse schema: %v", err)
|
|
|
|
}
|
2022-02-08 08:03:50 +03:00
|
|
|
if codec, ok := meta["avro.codec"]; ok {
|
2022-01-20 06:00:44 +03:00
|
|
|
headerData.Codec, ok = codec.(string)
|
|
|
|
if !ok {
|
|
|
|
d.Fatalf("avro.codec is not a string")
|
|
|
|
}
|
|
|
|
} else {
|
2022-02-08 08:03:50 +03:00
|
|
|
headerData.Codec = "null"
|
2021-12-28 22:05:10 +03:00
|
|
|
}
|
|
|
|
|
2022-01-20 06:00:44 +03:00
|
|
|
headerData.Sync, ok = headerRecord["sync"].([]byte)
|
|
|
|
if !ok {
|
|
|
|
d.Fatalf("header.sync is not a byte array")
|
2021-12-28 22:01:53 +03:00
|
|
|
}
|
2021-12-28 22:05:10 +03:00
|
|
|
return headerData
|
|
|
|
}
|
|
|
|
|
2022-02-10 05:39:31 +03:00
|
|
|
func decodeBlockCodec(d *decode.D, dataSize int64, codec string) *bytes.Buffer {
|
|
|
|
bb := &bytes.Buffer{}
|
|
|
|
if codec == "deflate" {
|
|
|
|
br := d.FieldRawLen("compressed", dataSize*8)
|
2022-06-30 13:13:36 +03:00
|
|
|
d.Copy(bb, flate.NewReader(bitio.NewIOReader(br)))
|
2022-02-10 05:39:31 +03:00
|
|
|
} else if codec == "snappy" {
|
|
|
|
// Everything but last 4 bytes which are the checksum
|
|
|
|
n := dataSize - 4
|
|
|
|
br := d.FieldRawLen("compressed", n*8)
|
2022-02-10 17:55:56 +03:00
|
|
|
|
|
|
|
// This could be simplified to be similar to deflate, however snappy's reader only works for streaming frames,
|
|
|
|
// not block data. See https://github.com/google/snappy/blob/main/framing_format.txt for details.
|
2022-02-10 05:39:31 +03:00
|
|
|
compressed := make([]byte, n)
|
|
|
|
if _, err := bitio.ReadFull(br, compressed, n*8); err != nil {
|
|
|
|
d.Fatalf("failed reading compressed data %v", err)
|
|
|
|
}
|
|
|
|
decompressed, err := snappy.Decode(nil, compressed)
|
|
|
|
if err != nil {
|
|
|
|
d.Fatalf("failed decompressing data: %v", err)
|
|
|
|
}
|
2022-06-30 13:13:36 +03:00
|
|
|
d.Copy(bb, bytes.NewReader(decompressed))
|
2022-02-10 05:39:31 +03:00
|
|
|
|
|
|
|
// Check the checksum
|
|
|
|
crc32W := crc32.NewIEEE()
|
2022-06-30 13:13:36 +03:00
|
|
|
d.Copy(crc32W, bytes.NewReader(bb.Bytes()))
|
2022-05-07 13:46:34 +03:00
|
|
|
d.FieldU32("crc", d.ValidateUBytes(crc32W.Sum(nil)), scalar.ActualHex)
|
2022-02-10 05:39:31 +03:00
|
|
|
} else {
|
|
|
|
// Unknown codec, just dump the compressed data.
|
|
|
|
d.FieldRawLen("compressed", dataSize*8, scalar.Description(codec+" encoded"))
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
return bb
|
|
|
|
}
|
|
|
|
|
2022-07-19 19:33:50 +03:00
|
|
|
func decodeAvroOCF(d *decode.D, _ any) any {
|
2021-12-28 22:05:10 +03:00
|
|
|
header := decodeHeader(d)
|
|
|
|
|
2022-01-20 06:00:44 +03:00
|
|
|
decodeFn, err := decoders.DecodeFnForSchema(header.Schema)
|
2021-12-28 22:05:10 +03:00
|
|
|
if err != nil {
|
|
|
|
d.Fatalf("unable to create codec: %v", err)
|
|
|
|
}
|
|
|
|
|
2021-12-28 22:01:53 +03:00
|
|
|
d.FieldStructArrayLoop("blocks", "block", func() bool { return d.NotEnd() }, func(d *decode.D) {
|
2021-12-30 01:12:36 +03:00
|
|
|
count := d.FieldSFn("count", decoders.VarZigZag)
|
2021-12-28 22:05:10 +03:00
|
|
|
if count <= 0 {
|
|
|
|
return
|
|
|
|
}
|
2021-12-30 01:12:36 +03:00
|
|
|
size := d.FieldSFn("size", decoders.VarZigZag)
|
2022-02-08 08:03:50 +03:00
|
|
|
i := int64(0)
|
|
|
|
|
2022-02-10 05:39:31 +03:00
|
|
|
if header.Codec != "null" {
|
|
|
|
if bb := decodeBlockCodec(d, size, header.Codec); bb != nil {
|
|
|
|
d.FieldArrayRootBitBufFn("data", bitio.NewBitReader(bb.Bytes(), -1), func(d *decode.D) {
|
|
|
|
for ; i < count; i++ {
|
|
|
|
decodeFn("data", d)
|
|
|
|
}
|
|
|
|
})
|
2022-02-08 08:03:50 +03:00
|
|
|
}
|
2022-02-10 05:39:31 +03:00
|
|
|
} else {
|
2021-12-28 22:05:10 +03:00
|
|
|
d.FieldArrayLoop("data", func() bool { return i < count }, func(d *decode.D) {
|
2021-12-30 01:12:36 +03:00
|
|
|
decodeFn("datum", d)
|
2021-12-30 01:46:56 +03:00
|
|
|
i++
|
2021-12-28 22:05:10 +03:00
|
|
|
})
|
|
|
|
}
|
|
|
|
d.FieldRawLen("sync", 16*8, d.AssertBitBuf(header.Sync))
|
2021-12-28 22:01:53 +03:00
|
|
|
})
|
|
|
|
|
|
|
|
return nil
|
|
|
|
}
|