mirror of
https://github.com/wader/fq.git
synced 2024-12-26 23:15:04 +03:00
f4e0137244
Was hard to use when you don't know what indexes things will have. They are still optional for toxml
208 lines
4.0 KiB
Go
208 lines
4.0 KiB
Go
package xml
|
|
|
|
import (
|
|
"embed"
|
|
"strings"
|
|
|
|
"github.com/wader/fq/format"
|
|
"github.com/wader/fq/pkg/bitio"
|
|
"github.com/wader/fq/pkg/decode"
|
|
"github.com/wader/fq/pkg/interp"
|
|
"github.com/wader/fq/pkg/scalar"
|
|
"golang.org/x/net/html"
|
|
)
|
|
|
|
//go:embed html.jq
|
|
var htmlFS embed.FS
|
|
|
|
func init() {
|
|
interp.RegisterFormat(decode.Format{
|
|
Name: format.HTML,
|
|
Description: "HyperText Markup Language",
|
|
DecodeFn: decodeHTML,
|
|
DecodeInArg: format.HTMLIn{
|
|
Seq: false,
|
|
Array: false,
|
|
},
|
|
Functions: []string{"_todisplay"},
|
|
})
|
|
interp.RegisterFS(htmlFS)
|
|
}
|
|
|
|
func fromHTMLToObject(n *html.Node, hi format.HTMLIn) any {
|
|
var f func(n *html.Node, seq int) any
|
|
f = func(n *html.Node, seq int) any {
|
|
attrs := map[string]any{}
|
|
|
|
switch n.Type {
|
|
case html.ElementNode:
|
|
for _, a := range n.Attr {
|
|
attrs["-"+a.Key] = a.Val
|
|
}
|
|
default:
|
|
// skip
|
|
}
|
|
|
|
nNodes := 0
|
|
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
|
if c.Type == html.ElementNode {
|
|
nNodes++
|
|
}
|
|
}
|
|
nSeq := -1
|
|
if nNodes > 1 {
|
|
nSeq = 0
|
|
}
|
|
|
|
var textSb *strings.Builder
|
|
var commentSb *strings.Builder
|
|
|
|
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
|
switch c.Type {
|
|
case html.ElementNode:
|
|
if e, ok := attrs[c.Data]; ok {
|
|
if ea, ok := e.([]any); ok {
|
|
attrs[c.Data] = append(ea, f(c, nSeq))
|
|
} else {
|
|
attrs[c.Data] = []any{e, f(c, nSeq)}
|
|
}
|
|
} else {
|
|
attrs[c.Data] = f(c, nSeq)
|
|
}
|
|
if nNodes > 1 {
|
|
nSeq++
|
|
}
|
|
case html.TextNode:
|
|
if !whitespaceRE.MatchString(c.Data) {
|
|
if textSb == nil {
|
|
textSb = &strings.Builder{}
|
|
}
|
|
textSb.WriteString(c.Data)
|
|
}
|
|
case html.CommentNode:
|
|
if !whitespaceRE.MatchString(c.Data) {
|
|
if commentSb == nil {
|
|
commentSb = &strings.Builder{}
|
|
}
|
|
commentSb.WriteString(c.Data)
|
|
}
|
|
default:
|
|
// skip other nodes
|
|
}
|
|
|
|
if textSb != nil {
|
|
attrs["#text"] = strings.TrimSpace(textSb.String())
|
|
}
|
|
if commentSb != nil {
|
|
attrs["#comment"] = strings.TrimSpace(commentSb.String())
|
|
}
|
|
}
|
|
|
|
if hi.Seq && seq != -1 {
|
|
attrs["#seq"] = seq
|
|
}
|
|
|
|
if len(attrs) == 0 {
|
|
return ""
|
|
} else if len(attrs) == 1 && attrs["#text"] != nil {
|
|
return attrs["#text"]
|
|
}
|
|
|
|
return attrs
|
|
}
|
|
|
|
return f(n, -1)
|
|
}
|
|
|
|
func fromHTMLToArray(n *html.Node) any {
|
|
var f func(n *html.Node) any
|
|
f = func(n *html.Node) any {
|
|
attrs := map[string]any{}
|
|
|
|
switch n.Type {
|
|
case html.ElementNode:
|
|
for _, a := range n.Attr {
|
|
attrs[a.Key] = a.Val
|
|
}
|
|
default:
|
|
// skip
|
|
}
|
|
|
|
nodes := []any{}
|
|
var textSb *strings.Builder
|
|
var commentSb *strings.Builder
|
|
|
|
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
|
switch c.Type {
|
|
case html.ElementNode:
|
|
nodes = append(nodes, f(c))
|
|
case html.TextNode:
|
|
if !whitespaceRE.MatchString(c.Data) {
|
|
if textSb == nil {
|
|
textSb = &strings.Builder{}
|
|
}
|
|
textSb.WriteString(c.Data)
|
|
}
|
|
case html.CommentNode:
|
|
if !whitespaceRE.MatchString(c.Data) {
|
|
if commentSb == nil {
|
|
commentSb = &strings.Builder{}
|
|
}
|
|
commentSb.WriteString(c.Data)
|
|
}
|
|
default:
|
|
// skip other nodes
|
|
}
|
|
}
|
|
|
|
if textSb != nil {
|
|
attrs["#text"] = strings.TrimSpace(textSb.String())
|
|
}
|
|
if commentSb != nil {
|
|
attrs["#comment"] = strings.TrimSpace(commentSb.String())
|
|
}
|
|
|
|
elm := []any{n.Data}
|
|
if len(attrs) > 0 {
|
|
elm = append(elm, attrs)
|
|
} else {
|
|
// make attrs null if there were none, jq allows index into null
|
|
elm = append(elm, nil)
|
|
}
|
|
elm = append(elm, nodes)
|
|
|
|
return elm
|
|
}
|
|
|
|
return f(n.FirstChild)
|
|
}
|
|
|
|
func decodeHTML(d *decode.D, in any) any {
|
|
hi, _ := in.(format.HTMLIn)
|
|
|
|
br := d.RawLen(d.Len())
|
|
var r any
|
|
var err error
|
|
// disabled scripting means parse noscript tags etc
|
|
n, err := html.ParseWithOptions(bitio.NewIOReader(br), html.ParseOptionEnableScripting(false))
|
|
if err != nil {
|
|
d.Fatalf("%s", err)
|
|
}
|
|
|
|
if hi.Array {
|
|
r = fromHTMLToArray(n)
|
|
} else {
|
|
r = fromHTMLToObject(n, hi)
|
|
}
|
|
if err != nil {
|
|
d.Fatalf("%s", err)
|
|
}
|
|
var s scalar.S
|
|
s.Actual = r
|
|
|
|
d.Value.V = &s
|
|
d.Value.Range.Len = d.Len()
|
|
|
|
return nil
|
|
}
|