1
1
mirror of https://github.com/wader/fq.git synced 2024-09-11 12:05:39 +03:00

html: Add to probe group

As decoders can now know that they are decoding as part of probing, we can
use some heuristics to decide whether we should decode as html.
The reason heuristics are needed is that the x/html parser will always succeed.

Add lazyre package to help delay compile of RE and make it concurrency safe.
This commit is contained in:
Mattias Wadman 2023-05-08 22:37:50 +02:00
parent f254b16cfc
commit e2eb667091
16 changed files with 1058 additions and 975 deletions

View File

@ -14,6 +14,7 @@
- Optimize `Interp.Options` calls, now called per display. Cache per eval? needs to handle nested evals.
- `<array decode value>[{start: ...: end: ...}]` syntax a bit broken.
- REPL completion might have side effects. Make interp.Function type know and wrap somehow? input, inputs, open, ...
- Rework group arguments so that `{is_probe:true}` is not needed. Look up group name and see if it has an argument somehow?
### TODO and ideas

View File

@ -127,7 +127,7 @@
|`ip_packet` |Group |<sub>`icmp` `icmpv6` `tcp_segment` `udp_datagram`</sub>|
|`link_frame` |Group |<sub>`bsd_loopback_frame` `ether8023_frame` `ipv4_packet` `ipv6_packet` `sll2_packet` `sll_packet`</sub>|
|`mp3_frame_tags` |Group |<sub>`mp3_frame_vbri` `mp3_frame_xing`</sub>|
|`probe` |Group |<sub>`adts` `aiff` `apple_bookmark` `ar` `avi` `avro_ocf` `bitcoin_blkdat` `bplist` `bzip2` `elf` `flac` `gif` `gzip` `jpeg` `json` `jsonl` `macho` `macho_fat` `matroska` `mp3` `mp4` `mpeg_ts` `ogg` `pcap` `pcapng` `png` `tar` `tiff` `toml` `tzif` `wasm` `wav` `webp` `xml` `yaml` `zip`</sub>|
|`probe` |Group |<sub>`adts` `aiff` `apple_bookmark` `ar` `avi` `avro_ocf` `bitcoin_blkdat` `bplist` `bzip2` `elf` `flac` `gif` `gzip` `html` `jpeg` `json` `jsonl` `macho` `macho_fat` `matroska` `mp3` `mp4` `mpeg_ts` `ogg` `pcap` `pcapng` `png` `tar` `tiff` `toml` `tzif` `wasm` `wav` `webp` `xml` `yaml` `zip`</sub>|
|`tcp_stream` |Group |<sub>`dns_tcp` `rtmp` `tls`</sub>|
|`udp_payload` |Group |<sub>`dns`</sub>|

File diff suppressed because it is too large Load Diff

Before

Width:  |  Height:  |  Size: 145 KiB

After

Width:  |  Height:  |  Size: 143 KiB

View File

@ -32,6 +32,7 @@ $ fq -n _registry.groups.probe
"mpeg_ts",
"wav",
"json",
"html",
"jsonl",
"toml",
"xml",

View File

@ -38,7 +38,7 @@ func decodeAr(d *decode.D) any {
}
size := int64(sizeStr.SymUint()) * 8
d.FieldUTF8("ending_characters", 2)
d.FieldFormatOrRawLen("data", size, &probeGroup, nil)
d.FieldFormatOrRawLen("data", size, &probeGroup, format.Probe_In{})
padding := d.AlignBits(16)
if padding > 0 {
d.FieldRawLen("padding", int64(padding))

View File

@ -111,7 +111,7 @@ func bzip2Decode(d *decode.D) any {
compressedStart := d.Pos()
readCompressedSize, uncompressedBR, dv, _, _ :=
d.TryFieldReaderRangeFormat("uncompressed", 0, d.Len(), bzip2.NewReader, &probeGroup, nil)
d.TryFieldReaderRangeFormat("uncompressed", 0, d.Len(), bzip2.NewReader, &probeGroup, format.Probe_In{})
if uncompressedBR != nil {
if dv == nil {
d.FieldRootBitBuf("uncompressed", uncompressedBR)

View File

@ -110,7 +110,7 @@ func gzDecode(d *decode.D) any {
if rFn != nil {
readCompressedSize, uncompressedBR, dv, _, _ :=
d.TryFieldReaderRangeFormat("uncompressed", d.Pos(), d.BitsLeft(), rFn, &probeGroup, nil)
d.TryFieldReaderRangeFormat("uncompressed", d.Pos(), d.BitsLeft(), rFn, &probeGroup, format.Probe_In{})
if uncompressedBR != nil {
if dv == nil {
d.FieldRootBitBuf("uncompressed", uncompressedBR)

View File

@ -76,7 +76,7 @@ func tarDecode(d *decode.D) any {
d.FieldUTF8("prefix", 155, mapTrimSpaceNull)
d.FieldRawLen("header_block_padding", blockPadding(d), d.BitBufIsZero())
d.FieldFormatOrRawLen("data", int64(size), &probeGroup, nil)
d.FieldFormatOrRawLen("data", int64(size), &probeGroup, format.Probe_In{})
d.FieldRawLen("data_block_padding", blockPadding(d), d.BitBufIsZero())
})

View File

@ -5,6 +5,7 @@ import (
"strings"
"github.com/wader/fq/format"
"github.com/wader/fq/internal/lazyre"
"github.com/wader/fq/pkg/bitio"
"github.com/wader/fq/pkg/decode"
"github.com/wader/fq/pkg/interp"
@ -21,6 +22,8 @@ func init() {
format.HTML,
&decode.Format{
Description: "HyperText Markup Language",
ProbeOrder: format.ProbeOrderTextFuzzy,
Groups: []*decode.Group{format.Probe},
DecodeFn: decodeHTML,
DefaultInArg: format.HTML_In{
Seq: false,
@ -193,9 +196,28 @@ func fromHTMLToArray(n *html.Node) any {
return f(n)
}
// htmlMagicRe matches input that, after at most 64 leading printable or
// whitespace ASCII characters, contains "<html" (whitespace is allowed after
// the "<") or "<!DOCTYPE html", compared case-insensitively.
// decodeHTML requires a match when decoding as part of probing, because the
// HTML parser succeeds on any input. The pattern is wrapped in lazyre.RE so it
// is compiled on first use.
var htmlMagicRe = &lazyre.RE{S: `` +
`^` + // anchor to start
`(?i)` + // case insensitive
`[[:graph:][:space:]]{0,64}?` + // 0-64 non-control ASCII lazily to allow comment etc
`(?:` +
`<\s{0,20}html|` + // <html
// or
`<!DOCTYPE\s{1,20}html` + // <!DOCTYPE html
`)`,
}
func decodeHTML(d *decode.D) any {
var hi format.HTML_In
var pi format.Probe_In
d.ArgAs(&hi)
if d.ArgAs(&pi) {
// if probing the input has to start with "<html" or "<!DOCTYPE html" this
// is because the html parser will always succeed so we have to be careful
if d.RE(htmlMagicRe.Must()) == nil {
d.Fatalf("no <html> or <!DOCTYPE html> found")
}
}
br := d.RawLen(d.Len())
var r any

View File

@ -34,29 +34,21 @@ $ fq -o array=true -d html . doctype.xml
]
$ fq . doctype.xml
{
"html": {
"head": {
"title": "aaa"
}
"head": {
"title": "aaa"
}
}
$ fq -o array=true . doctype.xml
[
"html",
"head",
null,
[
[
"head",
null,
[
[
"title",
{
"#text": "aaa"
},
[]
]
]
"title",
{
"#text": "aaa"
},
[]
]
]
]

View File

@ -1,6 +1,4 @@
<!DOCTYPE html>
<html>
<!DOCTYPE bla SYSTEM "" []>
<head>
<title>aaa</title>
</head>
</html>

27
format/xml/testdata/html_probe.fqtest vendored Normal file
View File

@ -0,0 +1,27 @@
/html.html:
<html><a>
$ fq . html.html
{
"html": {
"body": {
"a": ""
},
"head": ""
}
}
/doctype.html:
<!DOCTYPE html>
<a>
$ fq . doctype.html
{
"html": {
"body": {
"a": ""
},
"head": ""
}
}

View File

@ -360,7 +360,7 @@ func zipDecode(d *decode.D) any {
}
if compressionMethod == compressionMethodNone {
d.FieldFormatOrRawLen("uncompressed", compressedSize, &probeGroup, nil)
d.FieldFormatOrRawLen("uncompressed", compressedSize, &probeGroup, format.Probe_In{})
} else {
var rFn func(r io.Reader) io.Reader
if zi.Uncompress {
@ -374,7 +374,7 @@ func zipDecode(d *decode.D) any {
if rFn != nil {
readCompressedSize, uncompressedBR, dv, _, _ :=
d.TryFieldReaderRangeFormat("uncompressed", d.Pos(), compressedLimit, rFn, &probeGroup, nil)
d.TryFieldReaderRangeFormat("uncompressed", d.Pos(), compressedLimit, rFn, &probeGroup, format.Probe_In{})
if dv == nil && uncompressedBR != nil {
d.FieldRootBitBuf("uncompressed", uncompressedBR)
}

30
internal/lazyre/lazyre.go Normal file
View File

@ -0,0 +1,30 @@
// Package lazyre lazily compiles a *regexp.Regexp in a concurrency-safe way.
// Use &lazyre.RE{S: `...`} or call New.
package lazyre
import (
"regexp"
"sync"
)
// RE is a regular expression that is compiled the first time Must is called.
// A value created as &RE{S: `...`} or via New is ready to use. An RE must not
// be copied after first use because it contains a lock.
type RE struct {
	// S is the regular expression source passed to regexp.MustCompile.
	S string

	m  sync.RWMutex   // guards re
	re *regexp.Regexp // compiled form of S, nil until the first successful Must
}

// New creates a new *RE for the expression s. The expression is not compiled
// until Must is called.
func New(s string) *RE {
	return &RE{S: s}
}

// Must returns the compiled regexp, compiling S on the first call. It panics
// (via regexp.MustCompile) if S is not a valid expression; a failed compile is
// not cached, so later calls panic again. The returned *regexp.Regexp can be
// stored away and reused.
func (lr *RE) Must() *regexp.Regexp {
	// Fast path: once compiled, callers only need the shared read lock and do
	// not serialize on each other.
	lr.m.RLock()
	re := lr.re
	lr.m.RUnlock()
	if re != nil {
		return re
	}

	// Slow path: take the write lock and check again, since another goroutine
	// may have compiled the expression between the two lock acquisitions.
	lr.m.Lock()
	defer lr.m.Unlock()
	if lr.re == nil {
		lr.re = regexp.MustCompile(lr.S)
	}
	return lr.re
}

View File

@ -0,0 +1,13 @@
package lazyre_test
import (
"testing"
"github.com/wader/fq/internal/lazyre"
)
// TestMust checks that a lazily created expression compiles on demand and
// that the resulting regexp is usable for matching.
func TestMust(t *testing.T) {
	compiled := lazyre.New("a").Must()
	if matched := compiled.MatchString("a"); !matched {
		t.Fatal("should compile and be non-nil and match a")
	}
}

View File

@ -1261,15 +1261,11 @@ func (d *D) FieldValue(name string, fn func() *Value) *Value {
return v
}
func (d *D) RE(reRef **regexp.Regexp, reStr string) []ranges.Range {
if *reRef == nil {
*reRef = regexp.MustCompile(reStr)
}
func (d *D) RE(re *regexp.Regexp) []ranges.Range {
startPos := d.Pos()
rr := ioex.ByteRuneReader{RS: bitio.NewIOReadSeeker(d.bitBuf)}
locs := (*reRef).FindReaderSubmatchIndex(rr)
locs := re.FindReaderSubmatchIndex(rr)
if locs == nil {
return nil
}
@ -1292,13 +1288,10 @@ func (d *D) RE(reRef **regexp.Regexp, reStr string) []ranges.Range {
return rs
}
func (d *D) FieldRE(reRef **regexp.Regexp, reStr string, mRef *map[string]string, sms ...scalar.StrMapper) {
if *reRef == nil {
*reRef = regexp.MustCompile(reStr)
}
subexpNames := (*reRef).SubexpNames()
func (d *D) FieldRE(re *regexp.Regexp, mRef *map[string]string, sms ...scalar.StrMapper) {
subexpNames := re.SubexpNames()
rs := d.RE(reRef, reStr)
rs := d.RE(re)
for i, r := range rs {
if i == 0 || r.Start == -1 {
continue