html: Add to probe group

As decoders can now know they are decoding as part of probing, we can use some heuristics to see if we should decode as html. The reason heuristics are needed is that the x/html parser will always succeed. Add lazyre package to help delay compilation of REs and make it concurrency safe.
2024-09-11 12:05:39 +03:00 · 2023-05-08 22:37:50 +02:00 · 2023-05-08 22:37:50 +02:00 · e2eb667091
commit e2eb667091
parent f254b16cfc
16 changed files with 1058 additions and 975 deletions
--- a/doc/TODO.md
+++ b/doc/TODO.md
@ -14,6 +14,7 @@
 - Optimize `Interp.Options` calls, now called per display. Cache per eval? needs to handle nested evals.
 - `<array decode value>[{start: ...: end: ...}]` syntax a bit broken.
 - REPL completion might have side effects. Make interp.Function type know and wrap somehow? input, inputs, open, ...
+- Rework group arguments so that `{is_probe:true}` is not needed. Look up group name and see if it has an argument somehow?

 ### TODO and ideas

--- a/doc/formats.md
+++ b/doc/formats.md
@ -127,7 +127,7 @@
 |`ip_packet`                                             |Group                                                                                                        |<sub>`icmp` `icmpv6` `tcp_segment` `udp_datagram`</sub>|
 |`link_frame`                                            |Group                                                                                                        |<sub>`bsd_loopback_frame` `ether8023_frame` `ipv4_packet` `ipv6_packet` `sll2_packet` `sll_packet`</sub>|
 |`mp3_frame_tags`                                        |Group                                                                                                        |<sub>`mp3_frame_vbri` `mp3_frame_xing`</sub>|
-|`probe`                                                 |Group                                                                                                        |<sub>`adts` `aiff` `apple_bookmark` `ar` `avi` `avro_ocf` `bitcoin_blkdat` `bplist` `bzip2` `elf` `flac` `gif` `gzip` `jpeg` `json` `jsonl` `macho` `macho_fat` `matroska` `mp3` `mp4` `mpeg_ts` `ogg` `pcap` `pcapng` `png` `tar` `tiff` `toml` `tzif` `wasm` `wav` `webp` `xml` `yaml` `zip`</sub>|
+|`probe`                                                 |Group                                                                                                        |<sub>`adts` `aiff` `apple_bookmark` `ar` `avi` `avro_ocf` `bitcoin_blkdat` `bplist` `bzip2` `elf` `flac` `gif` `gzip` `html` `jpeg` `json` `jsonl` `macho` `macho_fat` `matroska` `mp3` `mp4` `mpeg_ts` `ogg` `pcap` `pcapng` `png` `tar` `tiff` `toml` `tzif` `wasm` `wav` `webp` `xml` `yaml` `zip`</sub>|
 |`tcp_stream`                                            |Group                                                                                                        |<sub>`dns_tcp` `rtmp` `tls`</sub>|
 |`udp_payload`                                           |Group                                                                                                        |<sub>`dns`</sub>|

--- a/doc/formats.svg
+++ b/doc/formats.svg
--- a/format/all/all.fqtest
+++ b/format/all/all.fqtest
@ -32,6 +32,7 @@ $ fq -n _registry.groups.probe
  "mpeg_ts",
  "wav",
  "json",
+  "html",
  "jsonl",
  "toml",
  "xml",
--- a/format/ar/ar.go
+++ b/format/ar/ar.go
@ -38,7 +38,7 @@ func decodeAr(d *decode.D) any {
 				}
 				size := int64(sizeStr.SymUint()) * 8
 				d.FieldUTF8("ending_characters", 2)
-				d.FieldFormatOrRawLen("data", size, &probeGroup, nil)
+				d.FieldFormatOrRawLen("data", size, &probeGroup, format.Probe_In{})
 				padding := d.AlignBits(16)
 				if padding > 0 {
 					d.FieldRawLen("padding", int64(padding))
--- a/format/bzip2/bzip2.go
+++ b/format/bzip2/bzip2.go
@ -111,7 +111,7 @@ func bzip2Decode(d *decode.D) any {
 	compressedStart := d.Pos()

 	readCompressedSize, uncompressedBR, dv, _, _ :=
-		d.TryFieldReaderRangeFormat("uncompressed", 0, d.Len(), bzip2.NewReader, &probeGroup, nil)
+		d.TryFieldReaderRangeFormat("uncompressed", 0, d.Len(), bzip2.NewReader, &probeGroup, format.Probe_In{})
 	if uncompressedBR != nil {
 		if dv == nil {
 			d.FieldRootBitBuf("uncompressed", uncompressedBR)
--- a/format/gzip/gzip.go
+++ b/format/gzip/gzip.go
@ -110,7 +110,7 @@ func gzDecode(d *decode.D) any {

 	if rFn != nil {
 		readCompressedSize, uncompressedBR, dv, _, _ :=
-			d.TryFieldReaderRangeFormat("uncompressed", d.Pos(), d.BitsLeft(), rFn, &probeGroup, nil)
+			d.TryFieldReaderRangeFormat("uncompressed", d.Pos(), d.BitsLeft(), rFn, &probeGroup, format.Probe_In{})
 		if uncompressedBR != nil {
 			if dv == nil {
 				d.FieldRootBitBuf("uncompressed", uncompressedBR)
--- a/format/tar/tar.go
+++ b/format/tar/tar.go
@ -76,7 +76,7 @@ func tarDecode(d *decode.D) any {
 				d.FieldUTF8("prefix", 155, mapTrimSpaceNull)
 				d.FieldRawLen("header_block_padding", blockPadding(d), d.BitBufIsZero())

-				d.FieldFormatOrRawLen("data", int64(size), &probeGroup, nil)
+				d.FieldFormatOrRawLen("data", int64(size), &probeGroup, format.Probe_In{})

 				d.FieldRawLen("data_block_padding", blockPadding(d), d.BitBufIsZero())
 			})
--- a/format/xml/html.go
+++ b/format/xml/html.go
@ -5,6 +5,7 @@ import (
 	"strings"

 	"github.com/wader/fq/format"
+	"github.com/wader/fq/internal/lazyre"
 	"github.com/wader/fq/pkg/bitio"
 	"github.com/wader/fq/pkg/decode"
 	"github.com/wader/fq/pkg/interp"
@ -21,6 +22,8 @@ func init() {
 		format.HTML,
 		&decode.Format{
 			Description: "HyperText Markup Language",
+			ProbeOrder:  format.ProbeOrderTextFuzzy,
+			Groups:      []*decode.Group{format.Probe},
 			DecodeFn:    decodeHTML,
 			DefaultInArg: format.HTML_In{
 				Seq:             false,
@ -193,9 +196,28 @@ func fromHTMLToArray(n *html.Node) any {
 	return f(n)
 }

+var htmlMagicRe = &lazyre.RE{S: `` + // probe heuristic: input must begin, after a short prefix, with <html or <!DOCTYPE html
+	`^` + // anchor to start of input
+	`(?i)` + // case insensitive
+	`[[:graph:][:space:]]{0,64}?` + // lazily skip 0-64 printable ASCII or whitespace characters, e.g. a leading comment
+	`(?:` + // non-capturing group holding the two alternatives
+	`<\s{0,20}html|` + // "<html", allowing up to 20 whitespace characters after "<"
+	// or
+	`<!DOCTYPE\s{1,20}html` + // "<!DOCTYPE html", with 1-20 whitespace characters in between
+	`)`,
+}
+
 func decodeHTML(d *decode.D) any {
 	var hi format.HTML_In
+	var pi format.Probe_In
 	d.ArgAs(&hi)
+	if d.ArgAs(&pi) {
+		// if probing the input has to start with "<html" or "<!DOCTYPE html" this
+		// is because the html parser will always succeed so we have to be careful
+		if d.RE(htmlMagicRe.Must()) == nil {
+			d.Fatalf("no <html> or <!DOCTYPE html> found")
+		}
+	}

 	br := d.RawLen(d.Len())
 	var r any
--- a/format/xml/testdata/doctype.fqtest
+++ b/format/xml/testdata/doctype.fqtest
@ -34,29 +34,21 @@ $ fq -o array=true -d html . doctype.xml
 ]
 $ fq . doctype.xml
 {
-  "html": {
-    "head": {
-      "title": "aaa"
-    }
+  "head": {
+    "title": "aaa"
  }
 }
 $ fq -o array=true . doctype.xml
 [
-  "html",
+  "head",
  null,
  [
    [
-      "head",
-      null,
-      [
-        [
-          "title",
-          {
-            "#text": "aaa"
-          },
-          []
-        ]
-      ]
+      "title",
+      {
+        "#text": "aaa"
+      },
+      []
    ]
  ]
 ]
--- a/format/xml/testdata/doctype.xml
+++ b/format/xml/testdata/doctype.xml
@ -1,6 +1,4 @@
-<!DOCTYPE html>
-<html>
+<!DOCTYPE bla SYSTEM "" []>
 <head>
 <title>aaa</title>
 </head>
-</html>
--- a/format/xml/testdata/html_probe.fqtest
+++ b/format/xml/testdata/html_probe.fqtest
@ -0,0 +1,27 @@
+/html.html:
+
+    <html><a>
+
+$ fq . html.html
+{
+  "html": {
+    "body": {
+      "a": ""
+    },
+    "head": ""
+  }
+}
+/doctype.html:
+
+    <!DOCTYPE html>
+    <a>
+
+$ fq . doctype.html
+{
+  "html": {
+    "body": {
+      "a": ""
+    },
+    "head": ""
+  }
+}
--- a/format/zip/zip.go
+++ b/format/zip/zip.go
@ -360,7 +360,7 @@ func zipDecode(d *decode.D) any {
 				}

 				if compressionMethod == compressionMethodNone {
-					d.FieldFormatOrRawLen("uncompressed", compressedSize, &probeGroup, nil)
+					d.FieldFormatOrRawLen("uncompressed", compressedSize, &probeGroup, format.Probe_In{})
 				} else {
 					var rFn func(r io.Reader) io.Reader
 					if zi.Uncompress {
@ -374,7 +374,7 @@ func zipDecode(d *decode.D) any {

 					if rFn != nil {
 						readCompressedSize, uncompressedBR, dv, _, _ :=
-							d.TryFieldReaderRangeFormat("uncompressed", d.Pos(), compressedLimit, rFn, &probeGroup, nil)
+							d.TryFieldReaderRangeFormat("uncompressed", d.Pos(), compressedLimit, rFn, &probeGroup, format.Probe_In{})
 						if dv == nil && uncompressedBR != nil {
 							d.FieldRootBitBuf("uncompressed", uncompressedBR)
 						}
--- a/internal/lazyre/lazyre.go
+++ b/internal/lazyre/lazyre.go
@ -0,0 +1,30 @@
+// Package lazyre lazily compiles a *regexp.Regexp in a concurrency-safe way.
+// Use &lazyre.RE{S: `...`} or call New.
+package lazyre
+
+import (
+	"regexp"
+	"sync"
+)
+
+type RE struct { // RE pairs a regexp source string with its lazily compiled form
+	S string // regexp source; compiled by Must on first use
+
+	m  sync.RWMutex   // guards re; NOTE(review): Must only takes the write lock, so a plain sync.Mutex or sync.Once may suffice
+	re *regexp.Regexp // cached result of compiling S; nil until Must has compiled it
+}
+
+// New returns a new *RE for the regexp source s; nothing is compiled until Must is called.
+func New(s string) *RE {
+	return &RE{S: s}
+}
+
+// Must returns the compiled regexp, compiling S with regexp.MustCompile on first use (which panics if S is invalid); the returned *regexp.Regexp can be stored away and reused.
+func (lr *RE) Must() *regexp.Regexp {
+	lr.m.Lock() // hold the lock so concurrent callers do not compile S at the same time
+	defer lr.m.Unlock()
+	if lr.re == nil { // not compiled yet
+		lr.re = regexp.MustCompile(lr.S)
+	}
+	return lr.re
+}
--- a/internal/lazyre/lazyre_test.go
+++ b/internal/lazyre/lazyre_test.go
@ -0,0 +1,13 @@
+package lazyre_test
+
+import (
+	"testing"
+
+	"github.com/wader/fq/internal/lazyre"
+)
+
+func TestMust(t *testing.T) { // checks that a pattern created with New compiles via Must and matches its input
+	if !lazyre.New("a").Must().MatchString("a") {
+		t.Fatal("should compile and be non-nil and match a")
+	}
+}
--- a/pkg/decode/decode.go
+++ b/pkg/decode/decode.go
@ -1261,15 +1261,11 @@ func (d *D) FieldValue(name string, fn func() *Value) *Value {
 	return v
 }

-func (d *D) RE(reRef **regexp.Regexp, reStr string) []ranges.Range {
-	if *reRef == nil {
-		*reRef = regexp.MustCompile(reStr)
-	}
-
+func (d *D) RE(re *regexp.Regexp) []ranges.Range {
 	startPos := d.Pos()

 	rr := ioex.ByteRuneReader{RS: bitio.NewIOReadSeeker(d.bitBuf)}
-	locs := (*reRef).FindReaderSubmatchIndex(rr)
+	locs := re.FindReaderSubmatchIndex(rr)
 	if locs == nil {
 		return nil
 	}
@ -1292,13 +1288,10 @@ func (d *D) RE(reRef **regexp.Regexp, reStr string) []ranges.Range {
 	return rs
 }

-func (d *D) FieldRE(reRef **regexp.Regexp, reStr string, mRef *map[string]string, sms ...scalar.StrMapper) {
-	if *reRef == nil {
-		*reRef = regexp.MustCompile(reStr)
-	}
-	subexpNames := (*reRef).SubexpNames()
+func (d *D) FieldRE(re *regexp.Regexp, mRef *map[string]string, sms ...scalar.StrMapper) {
+	subexpNames := re.SubexpNames()

-	rs := d.RE(reRef, reStr)
+	rs := d.RE(re)
 	for i, r := range rs {
 		if i == 0 || r.Start == -1 {
 			continue