1
1
mirror of https://github.com/wader/fq.git synced 2025-01-07 14:48:14 +03:00
fq/format/xml/xml.go
Mattias Wadman 9b81d4d3ab decode: More type safe API and split scalar into multiple types
Preparation to make decoder use less memory and API more type safe.
Now each scalar type has its own struct type, so it can store different
things, and this makes it possible to have a scalar interface.
Having separate types will also enable experimenting with decode DSL designs
such as chained methods that are type aware.
2022-12-14 16:23:58 +01:00

519 lines
11 KiB
Go

package xml
// object mode inspired by https://www.xml.com/pub/a/2006/05/31/converting-between-xml-and-json.html
// TODO: keep <?xml>? root #desc?
// TODO: refactor to share more code
// TODO: rewrite ns stack
import (
"bytes"
"embed"
"encoding/xml"
"errors"
"html"
"io"
"regexp"
"strconv"
"strings"
"github.com/wader/fq/format"
"github.com/wader/fq/internal/gojqex"
"github.com/wader/fq/internal/sortex"
"github.com/wader/fq/internal/stringsex"
"github.com/wader/fq/pkg/bitio"
"github.com/wader/fq/pkg/decode"
"github.com/wader/fq/pkg/interp"
"github.com/wader/fq/pkg/scalar"
"golang.org/x/exp/slices"
)
// xmlFS holds the embedded xml.jq and xml.md files; it is passed to
// interp.RegisterFS in init.
//go:embed xml.jq
//go:embed xml.md
var xmlFS embed.FS
func init() {
interp.RegisterFormat(decode.Format{
Name: format.XML,
Description: "Extensible Markup Language",
ProbeOrder: format.ProbeOrderTextFuzzy,
Groups: []string{format.PROBE},
DecodeFn: decodeXML,
DecodeInArg: format.XMLIn{
Seq: false,
Array: false,
AttributePrefix: "@",
},
Functions: []string{"_todisplay"},
})
interp.RegisterFS(xmlFS)
interp.RegisterFunc1("toxml", toXML)
interp.RegisterFunc0("fromxmlentities", func(_ *interp.Interp, c string) any {
return html.UnescapeString(c)
})
interp.RegisterFunc0("toxmlentities", func(_ *interp.Interp, c string) any {
return html.EscapeString(c)
})
}
// whitespaceRE matches input that is empty or consists only of whitespace.
var whitespaceRE = regexp.MustCompile(`^\s*$`)
// xmlNode is the intermediate tree representation of one XML element. It is
// what encoding/xml decodes into (see UnmarshalXML) and what toxml builds and
// then encodes.
type xmlNode struct {
// XMLName is the element name
XMLName xml.Name
// Attrs is the attributes of the element, when decoding it is assigned from
// the start element by UnmarshalXML
Attrs []xml.Attr `xml:",attr"`
// Chardata is the character data of the element
Chardata []byte `xml:",chardata"`
// Comment is the comment content of the element
Comment []byte `xml:",comment"`
// Nodes is the child elements
Nodes []xmlNode `xml:",any"`
}
// UnmarshalXML decodes the element beginning at start into n. The attributes
// are copied straight from the start element and the rest is decoded by
// encoding/xml using the struct tags of xmlNode.
func (n *xmlNode) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error {
	// rawNode has the fields of xmlNode but none of its methods, so decoding
	// into it does not call back into this UnmarshalXML for the same element
	type rawNode xmlNode

	n.Attrs = start.Attr
	raw := (*rawNode)(n)
	return d.DecodeElement(raw, &start)
}
// xmlNS is one namespace declaration: name is the alias used in the document
// (empty for the default namespace) and url is the namespace it stands for.
type xmlNS struct {
	name string
	url  string
}

// TODO: nss pop? attr not stack?
// xmlNNStack is used to undo namespace url resolving, space is url not the "alias" name
type xmlNNStack []xmlNS

// lookup returns the alias to use for name.Space, searching from the most
// recently pushed declaration and backwards. If any matching declaration is a
// default namespace (empty alias) the result is the empty string, otherwise it
// is the alias of the most recently pushed match. No match also gives the
// empty string.
func (nss xmlNNStack) lookup(name xml.Name) string {
	alias := ""
	for i := len(nss) - 1; i >= 0; i-- {
		if nss[i].url != name.Space {
			continue
		}
		if nss[i].name == "" {
			// default namespace wins over any alias found so far
			return ""
		}
		if alias == "" {
			// remember the first (innermost) alias but keep looking for a
			// default namespace declaration
			alias = nss[i].name
		}
	}
	return alias
}

// push returns a new stack with the declaration added last. The receiver is
// copied and not modified, so sibling elements can extend the same parent stack.
func (nss xmlNNStack) push(name string, url string) xmlNNStack {
	extended := make(xmlNNStack, 0, len(nss)+1)
	extended = append(extended, nss...)
	return append(extended, xmlNS{name: name, url: url})
}
// elmName joins a namespace alias and a local name as "space:local". When
// space is empty only the local name is returned.
func elmName(space, local string) string {
	if space != "" {
		return space + ":" + local
	}
	return local
}
// fromXMLToObject converts an element tree into the object representation:
// a map with the root element name as its single key.
//
// Each element becomes a map where attributes are keyed by
// xi.AttributePrefix + name, child elements by their name (repeated names are
// collected into an array), and non-whitespace text and comments are stored
// trimmed under "#text" and "#comment". If xi.Seq is set, elements that have
// siblings also get a "#seq" key with their index among the siblings.
// An element with nothing in it becomes "" and an element with only text
// becomes that text.
func fromXMLToObject(n xmlNode, xi format.XMLIn) any {
	var convert func(n xmlNode, seq int, nss xmlNNStack) (string, any)
	convert = func(n xmlNode, seq int, nss xmlNNStack) (string, any) {
		obj := map[string]any{}

		// first pass: collect namespace declarations so that they are known
		// when the names of this element and its attributes are resolved
		for _, a := range n.Attrs {
			switch {
			case a.Name.Space == "xmlns":
				nss = nss.push(a.Name.Local, a.Value)
			case a.Name.Local == "xmlns":
				// track default namespace
				nss = nss.push("", a.Value)
			}
		}

		// second pass: add all attributes, translating namespace urls back to aliases
		for _, a := range n.Attrs {
			key := a.Name.Local
			if prefix := a.Name.Space; prefix != "" {
				if prefix != "xmlns" {
					prefix = nss.lookup(a.Name)
				}
				key = elmName(prefix, a.Name.Local)
			}
			obj[xi.AttributePrefix+key] = a.Value
		}

		onlyChild := len(n.Nodes) == 1
		for i, child := range n.Nodes {
			// -1 means no #seq is needed as there is nothing to order against
			childSeq := i
			if onlyChild {
				childSeq = -1
			}
			childName, childValue := convert(child, childSeq, nss)

			existing, seen := obj[childName]
			if !seen {
				obj[childName] = childValue
				continue
			}
			// name seen before, turn into or extend an array
			if list, isList := existing.([]any); isList {
				obj[childName] = append(list, childValue)
			} else {
				obj[childName] = []any{existing, childValue}
			}
		}

		if xi.Seq && seq != -1 {
			obj["#seq"] = seq
		}
		if obj["#text"] == nil && !whitespaceRE.Match(n.Chardata) {
			obj["#text"] = strings.TrimSpace(string(n.Chardata))
		}
		if obj["#comment"] == nil && !whitespaceRE.Match(n.Comment) {
			obj["#comment"] = strings.TrimSpace(string(n.Comment))
		}

		prefix := n.XMLName.Space
		if prefix != "" {
			prefix = nss.lookup(n.XMLName)
		}
		name := elmName(prefix, n.XMLName.Local)

		switch {
		case len(obj) == 0:
			return name, ""
		case len(obj) == 1 && obj["#text"] != nil:
			// only text, use the text itself as value
			return name, obj["#text"]
		}
		return name, obj
	}

	rootName, rootValue := convert(n, -1, nil)
	return map[string]any{rootName: rootValue}
}
// fromXMLToArray converts an element tree into the array representation where
// each element is [name, attributes, children].
//
// attributes is a map of attribute names to values, plus trimmed
// non-whitespace text and comment under "#text" and "#comment", or null if
// the map would be empty. children is an array of child elements in the same
// representation.
func fromXMLToArray(n xmlNode) any {
	var convert func(n xmlNode, nss xmlNNStack) []any
	convert = func(n xmlNode, nss xmlNNStack) []any {
		attrs := map[string]any{}

		// first pass: collect namespace declarations so that they are known
		// when the names of this element and its attributes are resolved
		for _, a := range n.Attrs {
			switch {
			case a.Name.Space == "xmlns":
				nss = nss.push(a.Name.Local, a.Value)
			case a.Name.Local == "xmlns":
				// track default namespace
				nss = nss.push("", a.Value)
			}
		}

		// second pass: add all attributes, translating namespace urls back to aliases
		for _, a := range n.Attrs {
			key := a.Name.Local
			if prefix := a.Name.Space; prefix != "" {
				if prefix != "xmlns" {
					prefix = nss.lookup(a.Name)
				}
				key = elmName(prefix, a.Name.Local)
			}
			attrs[key] = a.Value
		}

		if attrs["#text"] == nil && !whitespaceRE.Match(n.Chardata) {
			attrs["#text"] = strings.TrimSpace(string(n.Chardata))
		}
		if attrs["#comment"] == nil && !whitespaceRE.Match(n.Comment) {
			attrs["#comment"] = strings.TrimSpace(string(n.Comment))
		}

		// always a non-nil slice so that an element without children gets an empty array
		children := make([]any, 0, len(n.Nodes))
		for _, child := range n.Nodes {
			children = append(children, convert(child, nss))
		}

		// make attrs null if there were none, jq allows index into null
		var attrsValue any
		if len(attrs) > 0 {
			attrsValue = attrs
		}

		name := elmName(nss.lookup(n.XMLName), n.XMLName.Local)
		return []any{name, attrsValue, children}
	}

	return convert(n, nil)
}
// decodeXML decodes the whole input as one XML document and sets the
// converted result as the value of d.
//
// in is expected to be a format.XMLIn. With Array set the root element is
// converted using fromXMLToArray, otherwise using fromXMLToObject. After the
// root element only whitespace and processing instructions are accepted,
// anything else is a fatal decode error. Always returns nil.
func decodeXML(d *decode.D, in any) any {
	// a failed assertion leaves xi as the zero value of format.XMLIn
	xi, _ := in.(format.XMLIn)

	br := d.RawLen(d.Len())
	xd := xml.NewDecoder(bitio.NewIOReader(br))
	xd.Strict = false

	var n xmlNode
	if err := xd.Decode(&n); err != nil {
		d.Fatalf("%s", err)
	}

	var r any
	if xi.Array {
		r = fromXMLToArray(n)
	} else {
		r = fromXMLToObject(n, xi)
	}

	var s scalar.Any
	s.Actual = r

	switch s.Actual.(type) {
	case map[string]any,
		[]any:
	default:
		d.Fatalf("root not object or array")
	}

	// continue decode to end and make sure there is only things we want to ignore
	for {
		// keep the decode position in sync with the xml decoder, InputOffset
		// is in bytes and SeekAbs is given bits
		d.SeekAbs(xd.InputOffset() * 8)
		t, err := xd.Token()
		if errors.Is(err, io.EOF) {
			break
		}
		if err != nil {
			// report the real tokenizer error instead of letting the nil token
			// end up as "trailing data" below
			d.Fatalf("%s", err)
		}
		switch t := t.(type) {
		case xml.CharData:
			if !whitespaceRE.Match([]byte(t)) {
				d.Fatalf("root element has trailing non-whitespace %q", stringsex.TrimN(string(t), 50, "..."))
			}
			// ignore trailing whitespace
		case xml.ProcInst:
			// ignore trailing process instructions <?elm?>
		case xml.StartElement:
			d.Fatalf("root element has trailing element <%s>", elmName(t.Name.Space, t.Name.Local))
		default:
			d.Fatalf("root element has trailing data")
		}
	}

	d.Value.V = &s
	d.Value.Range.Len = d.Len()

	return nil
}
// xmlNameFromStr returns an xml.Name with s as local name and no namespace.
// s is used as is, a "prefix:" in it is not split off.
func xmlNameFromStr(s string) xml.Name {
	var name xml.Name
	name.Local = s
	return name
}
// xmlNameSort is a less function for xml names: names are ordered by
// namespace first, with names without namespace before all others, and then
// by local name.
func xmlNameSort(a, b xml.Name) bool {
	switch {
	case a.Space == b.Space:
		return a.Local < b.Local
	case a.Space == "":
		return true
	default:
		return a.Space < b.Space
	}
}
// ToXMLOpts is the options argument of toxml.
type ToXMLOpts struct {
// Indent is the number of spaces used per indentation level in the output
Indent int
// AttributePrefix is the key prefix that marks an attribute when converting
// from an object.
// NOTE(review): the default tag is presumably read by the option handling in
// interp to fill in "@" when not given, confirm
AttributePrefix string `default:"@"`
}
// toXMLFromObject encodes c, a value in the object representation, as XML.
//
// A string becomes the text of its element. An object becomes an element
// where the keys are handled as:
//   - "#seq" is the order of the element among its siblings
//   - "#text" is the text of the element
//   - "#comment" is a comment in the element
//   - keys starting with opts.AttributePrefix are attributes, named by the
//     key with the prefix removed
//   - any other key is a child element, or one child element per item if the
//     value is an array; an empty array gives one empty element
//
// Values are expected to already be normalized to strings. Attributes are
// sorted by name and child elements by "#seq" if any child has one, otherwise
// by name. If the top-level object has a single key that is an element it is
// used as root, otherwise everything is wrapped in a <doc> element.
//
// Returns the XML as a string, or an error value if encoding fails.
func toXMLFromObject(c any, opts ToXMLOpts) any {
	var f func(name string, content any) (xmlNode, int, bool)
	f = func(name string, content any) (xmlNode, int, bool) {
		n := xmlNode{
			XMLName: xml.Name{Local: name},
		}

		hasSeq := false
		seq := 0
		orderHasSeq := false
		var orderSeqs []int
		var orderNames []string

		// addChild converts childContent into a child element and records what
		// is needed to order it among its siblings
		addChild := func(childName string, childContent any) {
			nn, nseq, nHasSeq := f(childName, childContent)
			n.Nodes = append(n.Nodes, nn)
			orderNames = append(orderNames, childName)
			orderSeqs = append(orderSeqs, nseq)
			orderHasSeq = orderHasSeq || nHasSeq
		}

		switch cv := content.(type) {
		case string:
			n.Chardata = []byte(cv)
		case map[string]any:
			for k, v := range cv {
				switch {
				case k == "#seq":
					hasSeq = true
					// a non-string or non-numeric #seq counts as 0 instead of panicking
					s, _ := v.(string)
					seq, _ = strconv.Atoi(s)
				case k == "#text":
					s, _ := v.(string)
					n.Chardata = []byte(s)
				case k == "#comment":
					s, _ := v.(string)
					n.Comment = []byte(s)
				case strings.HasPrefix(k, opts.AttributePrefix):
					s, _ := v.(string)
					n.Attrs = append(n.Attrs, xml.Attr{
						// remove the whole prefix, it might not be exactly one byte long
						Name:  xmlNameFromStr(strings.TrimPrefix(k, opts.AttributePrefix)),
						Value: s,
					})
				default:
					switch v := v.(type) {
					case []any:
						if len(v) == 0 {
							// empty array is one empty element
							addChild(k, "")
						} else {
							for _, c := range v {
								addChild(k, c)
							}
						}
					default:
						addChild(k, v)
					}
				}
			}
		}

		// if one #seq was found, assume all have them, otherwise sort by name
		if orderHasSeq {
			sortex.ProxySort(orderSeqs, n.Nodes, func(a, b int) bool { return a < b })
		} else {
			sortex.ProxySort(orderNames, n.Nodes, func(a, b string) bool { return a < b })
		}

		// map iteration order is random, sort to get stable output
		slices.SortFunc(n.Attrs, func(a, b xml.Attr) bool { return xmlNameSort(a.Name, b.Name) })

		return n, seq, hasSeq
	}

	n, _, _ := f("doc", c)
	// unwrap the synthetic doc element when it only wraps a single element
	if len(n.Nodes) == 1 && len(n.Attrs) == 0 && n.Comment == nil && n.Chardata == nil {
		n = n.Nodes[0]
	}

	bb := &bytes.Buffer{}
	e := xml.NewEncoder(bb)
	e.Indent("", strings.Repeat(" ", opts.Indent))
	if err := e.Encode(n); err != nil {
		return err
	}
	if err := e.Flush(); err != nil {
		return err
	}

	return bb.String()
}
// toXMLFromArray encodes c, a value in the array representation, as XML.
//
// ["elm", {attrs}, [children]] -> <elm attrs...>children...</elm>
//
// For each element array the first non-empty string is the name, the first
// object is the attributes and the first array is the children, other items
// are ignored. In the attributes object "#text" and "#comment" are the text
// and comment of the element. Children that are not arrays or that have no
// name are skipped.
//
// Returns the XML as a string, a gojqex.FuncTypeError if c is not an array or
// the root has no name, or an error value if encoding fails.
func toXMLFromArray(c any, opts ToXMLOpts) any {
	var build func(elm []any) (xmlNode, bool)
	build = func(elm []any) (xmlNode, bool) {
		var (
			name     string
			attrs    map[string]any
			children []any
		)

		// only the first value of each kind is used
		for _, part := range elm {
			switch part := part.(type) {
			case string:
				if name == "" {
					name = part
				}
			case map[string]any:
				if attrs == nil {
					attrs = part
				}
			case []any:
				if children == nil {
					children = part
				}
			}
		}

		if name == "" {
			return xmlNode{}, false
		}

		n := xmlNode{XMLName: xmlNameFromStr(name)}

		for key, value := range attrs {
			// non-string values end up as empty strings
			s, _ := value.(string)
			switch key {
			case "#comment":
				n.Comment = []byte(s)
			case "#text":
				n.Chardata = []byte(s)
			default:
				n.Attrs = append(n.Attrs, xml.Attr{Name: xmlNameFromStr(key), Value: s})
			}
		}

		// map iteration order is random, sort to get stable output
		slices.SortFunc(n.Attrs, func(a, b xml.Attr) bool { return xmlNameSort(a.Name, b.Name) })

		for _, child := range children {
			if childElm, isElm := child.([]any); isElm {
				if childNode, ok := build(childElm); ok {
					n.Nodes = append(n.Nodes, childNode)
				}
			}
		}

		return n, true
	}

	rootElm, isArray := c.([]any)
	if !isArray {
		return gojqex.FuncTypeError{Name: "toxml", V: c}
	}
	root, ok := build(rootElm)
	if !ok {
		// TODO: better error
		return gojqex.FuncTypeError{Name: "toxml", V: c}
	}

	out := &bytes.Buffer{}
	enc := xml.NewEncoder(out)
	enc.Indent("", strings.Repeat(" ", opts.Indent))
	if err := enc.Encode(root); err != nil {
		return err
	}
	if err := enc.Flush(); err != nil {
		return err
	}

	return out.String()
}
// toXML implements the toxml function. An object is encoded using
// toXMLFromObject and an array using toXMLFromArray, in both cases after
// being passed through gojqex.NormalizeToStrings. Any other input gives a
// gojqex.FuncTypeError.
func toXML(_ *interp.Interp, c any, opts ToXMLOpts) any {
	if obj, isObject := gojqex.Cast[map[string]any](c); isObject {
		return toXMLFromObject(gojqex.NormalizeToStrings(obj), opts)
	}
	if arr, isArray := gojqex.Cast[[]any](c); isArray {
		return toXMLFromArray(gojqex.NormalizeToStrings(arr), opts)
	}
	return gojqex.FuncTypeError{Name: "toxml", V: c}
}