1
1
mirror of https://github.com/wader/fq.git synced 2024-12-23 13:22:58 +03:00
fq/format/xml/xml.go
Mattias Wadman 56edb59e83 toml,xml: Fail fast on invalid content
encoding/xml and github.com/BurntSushi/toml both reads a lot before detecting
that it can't decode. Now we instead read one UTF-8 and make sure it's valid
xml or toml.

Should speed up probing

Related to #586 bigzero-zip.zip
2023-02-22 16:23:21 +01:00

562 lines
12 KiB
Go

package xml
// object mode inspired by https://www.xml.com/pub/a/2006/05/31/converting-between-xml-and-json.html
// TODO: keep <?xml>? root #desc?
// TODO: refactor to share more code
// TODO: rewrite ns stack
import (
"bufio"
"bytes"
"embed"
"encoding/xml"
"errors"
"fmt"
"html"
"io"
"regexp"
"strconv"
"strings"
"unicode/utf8"
"github.com/wader/fq/format"
"github.com/wader/fq/internal/gojqex"
"github.com/wader/fq/internal/sortex"
"github.com/wader/fq/internal/stringsex"
"github.com/wader/fq/pkg/bitio"
"github.com/wader/fq/pkg/decode"
"github.com/wader/fq/pkg/interp"
"github.com/wader/fq/pkg/scalar"
"golang.org/x/exp/slices"
)
//go:embed xml.jq
//go:embed xml.md
var xmlFS embed.FS
func init() {
interp.RegisterFormat(decode.Format{
Name: format.XML,
Description: "Extensible Markup Language",
ProbeOrder: format.ProbeOrderTextFuzzy,
Groups: []string{format.PROBE},
DecodeFn: decodeXML,
DefaultInArg: format.XMLIn{
Seq: false,
Array: false,
AttributePrefix: "@",
},
Functions: []string{"_todisplay"},
})
interp.RegisterFS(xmlFS)
interp.RegisterFunc1("to_xml", toXML)
interp.RegisterFunc0("from_xmlentities", func(_ *interp.Interp, c string) any {
return html.UnescapeString(c)
})
interp.RegisterFunc0("to_xmlentities", func(_ *interp.Interp, c string) any {
return html.EscapeString(c)
})
}
var whitespaceRE = regexp.MustCompile(`^\s*$`)
type xmlNode struct {
XMLName xml.Name
Attrs []xml.Attr `xml:",attr"`
Chardata []byte `xml:",chardata"`
Comment []byte `xml:",comment"`
Nodes []xmlNode `xml:",any"`
}
func (n *xmlNode) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error {
n.Attrs = start.Attr
type node xmlNode
return d.DecodeElement((*node)(n), &start)
}
type xmlNS struct {
name string
url string
}
// TODO: nss pop? attr not stack?
// xmlNNStack is used to undo namespace url resolving, space is url not the "alias" name
type xmlNNStack []xmlNS
func (nss xmlNNStack) lookup(name xml.Name) string {
var s string
for i := len(nss) - 1; i >= 0; i-- {
ns := nss[i]
if name.Space == ns.url {
// first found or is default namespace
if s == "" || ns.name == "" {
s = ns.name
}
if s == "" {
break
}
}
}
return s
}
func (nss xmlNNStack) push(name string, url string) xmlNNStack {
n := append([]xmlNS{}, nss...)
n = append(n, xmlNS{name: name, url: url})
return xmlNNStack(n)
}
func elmName(space, local string) string {
if space == "" {
return local
}
return space + ":" + local
}
func fromXMLToObject(n xmlNode, xi format.XMLIn) any {
var f func(n xmlNode, seq int, nss xmlNNStack) (string, any)
f = func(n xmlNode, seq int, nss xmlNNStack) (string, any) {
attrs := map[string]any{}
for _, a := range n.Attrs {
local, space := a.Name.Local, a.Name.Space
if space == "xmlns" {
nss = nss.push(local, a.Value)
} else if local == "xmlns" {
// track default namespace
nss = nss.push("", a.Value)
}
}
for _, a := range n.Attrs {
local, space := a.Name.Local, a.Name.Space
name := local
if space != "" {
if space == "xmlns" {
// nop
} else {
space = nss.lookup(a.Name)
}
name = elmName(space, local)
}
attrs[xi.AttributePrefix+name] = a.Value
}
for i, nn := range n.Nodes {
nSeq := i
if len(n.Nodes) == 1 {
nSeq = -1
}
nname, naddrs := f(nn, nSeq, nss)
if e, ok := attrs[nname]; ok {
if ea, ok := e.([]any); ok {
attrs[nname] = append(ea, naddrs)
} else {
attrs[nname] = []any{e, naddrs}
}
} else {
attrs[nname] = naddrs
}
}
if xi.Seq && seq != -1 {
attrs["#seq"] = seq
}
if attrs["#text"] == nil && !whitespaceRE.Match(n.Chardata) {
attrs["#text"] = strings.TrimSpace(string(n.Chardata))
}
if attrs["#comment"] == nil && !whitespaceRE.Match(n.Comment) {
attrs["#comment"] = strings.TrimSpace(string(n.Comment))
}
local, space := n.XMLName.Local, n.XMLName.Space
if space != "" {
space = nss.lookup(n.XMLName)
}
name := elmName(space, local)
if len(attrs) == 0 {
return name, ""
} else if len(attrs) == 1 && attrs["#text"] != nil {
return name, attrs["#text"]
}
return name, attrs
}
name, attrs := f(n, -1, nil)
return map[string]any{name: attrs}
}
func fromXMLToArray(n xmlNode) any {
var f func(n xmlNode, nss xmlNNStack) []any
f = func(n xmlNode, nss xmlNNStack) []any {
attrs := map[string]any{}
for _, a := range n.Attrs {
local, space := a.Name.Local, a.Name.Space
if space == "xmlns" {
nss = nss.push(local, a.Value)
} else if local == "xmlns" {
// track default namespace
nss = nss.push("", a.Value)
}
}
for _, a := range n.Attrs {
local, space := a.Name.Local, a.Name.Space
name := local
if space != "" {
if space == "xmlns" {
// nop
} else {
space = nss.lookup(a.Name)
}
name = elmName(space, local)
}
attrs[name] = a.Value
}
if attrs["#text"] == nil && !whitespaceRE.Match(n.Chardata) {
attrs["#text"] = strings.TrimSpace(string(n.Chardata))
}
if attrs["#comment"] == nil && !whitespaceRE.Match(n.Comment) {
attrs["#comment"] = strings.TrimSpace(string(n.Comment))
}
nodes := []any{}
for _, c := range n.Nodes {
nodes = append(nodes, f(c, nss))
}
name := elmName(nss.lookup(n.XMLName), n.XMLName.Local)
elm := []any{name}
if len(attrs) > 0 {
elm = append(elm, attrs)
} else {
// make attrs null if there were none, jq allows index into null
elm = append(elm, nil)
}
elm = append(elm, nodes)
return elm
}
return f(n, nil)
}
// from golang encoding/xml, copyright 2009 The Go Authors
// the Char production of https://www.xml.com/axml/testaxml.htm,
// Section 2.2 Characters.
func isInCharacterRange(r rune) (inrange bool) {
return r == 0x09 ||
r == 0x0A ||
r == 0x0D ||
r >= 0x20 && r <= 0xD7FF ||
r >= 0xE000 && r <= 0xFFFD ||
r >= 0x10000 && r <= 0x10FFFF
}
func decodeXMLSeekFirstValidRune(br io.ReadSeeker) error {
buf := bufio.NewReader(br)
r, sz, err := buf.ReadRune()
if err != nil {
return err
}
if _, err := br.Seek(0, io.SeekStart); err != nil {
return err
}
if r == utf8.RuneError && sz == 1 {
return fmt.Errorf("invalid UTF-8")
}
if !isInCharacterRange(r) {
return fmt.Errorf("illegal character code %U", r)
}
return nil
}
func decodeXML(d *decode.D) any {
var xi format.XMLIn
d.ArgAs(&xi)
bbr := d.RawLen(d.Len())
var r any
var err error
br := bitio.NewIOReadSeeker(bbr)
// this reimplements same xml rune range validation as ecoding/xml but fails faster
if err := decodeXMLSeekFirstValidRune(br); err != nil {
d.Fatalf("%s", err)
}
xd := xml.NewDecoder(br)
xd.Strict = false
var n xmlNode
if err := xd.Decode(&n); err != nil {
d.Fatalf("%s", err)
}
if xi.Array {
r = fromXMLToArray(n)
} else {
r = fromXMLToObject(n, xi)
}
if err != nil {
d.Fatalf("%s", err)
}
var s scalar.Any
s.Actual = r
switch s.Actual.(type) {
case map[string]any,
[]any:
default:
d.Fatalf("root not object or array")
}
// continue decode to end and make sure there is only things we want to ignore
for {
d.SeekAbs(xd.InputOffset() * 8)
t, err := xd.Token()
if errors.Is(err, io.EOF) {
break
}
switch t := t.(type) {
case xml.CharData:
if !whitespaceRE.Match([]byte(t)) {
d.Fatalf("root element has trailing non-whitespace %q", stringsex.TrimN(string(t), 50, "..."))
}
// ignore trailing whitespace
case xml.ProcInst:
// ignore trailing process instructions <?elm?>
case xml.StartElement:
d.Fatalf("root element has trailing element <%s>", elmName(t.Name.Space, t.Name.Local))
default:
d.Fatalf("root element has trailing data")
}
}
d.Value.V = &s
d.Value.Range.Len = d.Len()
return nil
}
func xmlNameFromStr(s string) xml.Name {
return xml.Name{Local: s}
}
func xmlNameSort(a, b xml.Name) bool {
if a.Space != b.Space {
if a.Space == "" {
return true
}
return a.Space < b.Space
}
return a.Local < b.Local
}
type ToXMLOpts struct {
Indent int
AttributePrefix string `default:"@"`
}
func toXMLFromObject(c any, opts ToXMLOpts) any {
var f func(name string, content any) (xmlNode, int, bool)
f = func(name string, content any) (xmlNode, int, bool) {
n := xmlNode{
XMLName: xml.Name{Local: name},
}
hasSeq := false
seq := 0
orderHasSeq := false
var orderSeqs []int
var orderNames []string
switch v := content.(type) {
case string:
n.Chardata = []byte(v)
case map[string]any:
for k, v := range v {
switch {
case k == "#seq":
hasSeq = true
seq, _ = strconv.Atoi(v.(string))
case k == "#text":
s, _ := v.(string)
n.Chardata = []byte(s)
case k == "#comment":
s, _ := v.(string)
n.Comment = []byte(s)
case strings.HasPrefix(k, opts.AttributePrefix):
s, _ := v.(string)
a := xml.Attr{
Name: xmlNameFromStr(k[1:]),
Value: s,
}
n.Attrs = append(n.Attrs, a)
default:
switch v := v.(type) {
case []any:
if len(v) > 0 {
for _, c := range v {
nn, nseq, nHasSeq := f(k, c)
n.Nodes = append(n.Nodes, nn)
orderNames = append(orderNames, k)
orderSeqs = append(orderSeqs, nseq)
orderHasSeq = orderHasSeq || nHasSeq
}
} else {
nn, nseq, nHasSeq := f(k, "")
n.Nodes = append(n.Nodes, nn)
orderNames = append(orderNames, k)
orderSeqs = append(orderSeqs, nseq)
orderHasSeq = orderHasSeq || nHasSeq
}
default:
nn, nseq, nHasSeq := f(k, v)
n.Nodes = append(n.Nodes, nn)
orderNames = append(orderNames, k)
orderSeqs = append(orderSeqs, nseq)
orderHasSeq = orderHasSeq || nHasSeq
}
}
}
}
// if one #seq was found, assume all have them, otherwise sort by name
if orderHasSeq {
sortex.ProxySort(orderSeqs, n.Nodes, func(a, b int) bool { return a < b })
} else {
sortex.ProxySort(orderNames, n.Nodes, func(a, b string) bool { return a < b })
}
slices.SortFunc(n.Attrs, func(a, b xml.Attr) bool { return xmlNameSort(a.Name, b.Name) })
return n, seq, hasSeq
}
n, _, _ := f("doc", c)
if len(n.Nodes) == 1 && len(n.Attrs) == 0 && n.Comment == nil && n.Chardata == nil {
n = n.Nodes[0]
}
bb := &bytes.Buffer{}
e := xml.NewEncoder(bb)
e.Indent("", strings.Repeat(" ", opts.Indent))
if err := e.Encode(n); err != nil {
return err
}
if err := e.Flush(); err != nil {
return err
}
return bb.String()
}
// ["elm", {attrs}, [children]] -> <elm attrs...>children...</elm>
func toXMLFromArray(c any, opts ToXMLOpts) any {
var f func(elm []any) (xmlNode, bool)
f = func(elm []any) (xmlNode, bool) {
var name string
var attrs map[string]any
var children []any
for _, v := range elm {
switch v := v.(type) {
case string:
if name == "" {
name = v
}
case map[string]any:
if attrs == nil {
attrs = v
}
case []any:
if children == nil {
children = v
}
}
}
if name == "" {
return xmlNode{}, false
}
n := xmlNode{
XMLName: xmlNameFromStr(name),
}
for k, v := range attrs {
switch k {
case "#comment":
s, _ := v.(string)
n.Comment = []byte(s)
case "#text":
s, _ := v.(string)
n.Chardata = []byte(s)
default:
s, _ := v.(string)
n.Attrs = append(n.Attrs, xml.Attr{
Name: xmlNameFromStr(k),
Value: s,
})
}
}
slices.SortFunc(n.Attrs, func(a, b xml.Attr) bool { return xmlNameSort(a.Name, b.Name) })
for _, c := range children {
c, ok := c.([]any)
if !ok {
continue
}
if cn, ok := f(c); ok {
n.Nodes = append(n.Nodes, cn)
}
}
return n, true
}
ca, ok := c.([]any)
if !ok {
return gojqex.FuncTypeError{Name: "to_xml", V: c}
}
n, ok := f(ca)
if !ok {
// TODO: better error
return gojqex.FuncTypeError{Name: "to_xml", V: c}
}
bb := &bytes.Buffer{}
e := xml.NewEncoder(bb)
e.Indent("", strings.Repeat(" ", opts.Indent))
if err := e.Encode(n); err != nil {
return err
}
if err := e.Flush(); err != nil {
return err
}
return bb.String()
}
func toXML(_ *interp.Interp, c any, opts ToXMLOpts) any {
if v, ok := gojqex.Cast[map[string]any](c); ok {
return toXMLFromObject(gojqex.NormalizeToStrings(v), opts)
} else if v, ok := gojqex.Cast[[]any](c); ok {
return toXMLFromArray(gojqex.NormalizeToStrings(v), opts)
}
return gojqex.FuncTypeError{Name: "to_xml", V: c}
}