1
1
mirror of https://github.com/wader/fq.git synced 2024-12-27 15:42:07 +03:00

toml,xml: Fail fast on invalid content

encoding/xml and github.com/BurntSushi/toml both read a lot before detecting
that they can't decode. Now we instead read one UTF-8 rune first and make sure
it's valid XML or TOML.

Should speed up probing

Related to #586 bigzero-zip.zip
This commit is contained in:
Mattias Wadman 2023-02-22 16:16:35 +01:00
parent aaf60ec250
commit 56edb59e83
3 changed files with 80 additions and 7 deletions

View File

@ -63,9 +63,9 @@ true = true
toml: top-level values must be Go maps or structs
----
error at position 0x0: root object has no values
error at position 0x0: EOF
----
$ fq -n '"" | from_toml'
$ fq -n '" " | from_toml'
exitcode: 5
stderr:
error: error at position 0x0: root object has no values
error: error at position 0x1: root object has no values

View File

@ -1,8 +1,12 @@
package toml
import (
"bufio"
"bytes"
"embed"
"fmt"
"io"
"unicode/utf8"
"github.com/BurntSushi/toml"
"github.com/wader/fq/format"
@ -29,11 +33,38 @@ func init() {
interp.RegisterFunc0("to_toml", toTOML)
}
// decodeTOMLSeekFirstValidRune reads the first rune from br, rewinds br to the
// start and reports whether that rune rules out TOML content.
//
// It returns the read error (for example end of input) if no rune could be
// read, the seek error if rewinding failed, and a descriptive error when the
// first rune is an invalid UTF-8 encoding or a null character. A nil return
// means br is positioned at offset 0 and the first rune looked acceptable.
func decodeTOMLSeekFirstValidRune(br io.ReadSeeker) error {
	first, size, readErr := bufio.NewReader(br).ReadRune()
	if readErr != nil {
		return readErr
	}

	// The buffered reader may have consumed more than one rune, so rewind
	// before anyone else reads from br.
	if _, seekErr := br.Seek(0, io.SeekStart); seekErr != nil {
		return seekErr
	}

	switch {
	case first == utf8.RuneError && size == 1:
		// RuneError with width 1 means a decoding failure, not a literal U+FFFD.
		return fmt.Errorf("invalid UTF-8")
	case first == 0:
		return fmt.Errorf("TOML can't contain null bytes")
	}
	return nil
}
func decodeTOML(d *decode.D) any {
br := d.RawLen(d.Len())
bbr := d.RawLen(d.Len())
var r any
if _, err := toml.NewDecoder(bitio.NewIOReader(br)).Decode(&r); err != nil {
br := bitio.NewIOReadSeeker(bbr)
// github.com/BurntSushi/toml currently does a ReadAll, which might be expensive;
// try to find invalid TOML (null bytes etc.) faster and more efficiently
if err := decodeTOMLSeekFirstValidRune(br); err != nil {
d.Fatalf("%s", err)
}
if _, err := toml.NewDecoder(br).Decode(&r); err != nil {
d.Fatalf("%s", err)
}
var s scalar.Any

View File

@ -7,15 +7,18 @@ package xml
// TODO: rewrite ns stack
import (
"bufio"
"bytes"
"embed"
"encoding/xml"
"errors"
"fmt"
"html"
"io"
"regexp"
"strconv"
"strings"
"unicode/utf8"
"github.com/wader/fq/format"
"github.com/wader/fq/internal/gojqex"
@ -247,15 +250,54 @@ func fromXMLToArray(n xmlNode) any {
return f(n, nil)
}
// isInCharacterRange reports whether r is a character allowed in XML.
//
// Taken from golang encoding/xml, copyright 2009 The Go Authors. It follows
// the Char production of https://www.xml.com/axml/testaxml.htm,
// Section 2.2 Characters: tab, line feed, carriage return, and the ranges
// U+0020-U+D7FF, U+E000-U+FFFD and U+10000-U+10FFFF.
func isInCharacterRange(r rune) (inrange bool) {
	switch {
	case r == 0x09, r == 0x0A, r == 0x0D:
		// The only control characters XML permits.
		return true
	case 0x20 <= r && r <= 0xD7FF:
		return true
	case 0xE000 <= r && r <= 0xFFFD:
		return true
	case 0x10000 <= r && r <= 0x10FFFF:
		return true
	default:
		return false
	}
}
// decodeXMLSeekFirstValidRune reads the first rune from br, rewinds br to the
// start and reports whether that rune rules out XML content.
//
// It returns the read error (for example end of input) if no rune could be
// read, the seek error if rewinding failed, and a descriptive error when the
// first rune is an invalid UTF-8 encoding or is rejected by
// isInCharacterRange. A nil return means br is positioned at offset 0 and the
// first rune looked acceptable.
func decodeXMLSeekFirstValidRune(br io.ReadSeeker) error {
	first, size, readErr := bufio.NewReader(br).ReadRune()
	if readErr != nil {
		return readErr
	}

	// The buffered reader may have consumed more than one rune, so rewind
	// before the real decoder reads from br.
	if _, seekErr := br.Seek(0, io.SeekStart); seekErr != nil {
		return seekErr
	}

	switch {
	case first == utf8.RuneError && size == 1:
		// RuneError with width 1 means a decoding failure, not a literal U+FFFD.
		return fmt.Errorf("invalid UTF-8")
	case !isInCharacterRange(first):
		return fmt.Errorf("illegal character code %U", first)
	}
	return nil
}
func decodeXML(d *decode.D) any {
var xi format.XMLIn
d.ArgAs(&xi)
br := d.RawLen(d.Len())
bbr := d.RawLen(d.Len())
var r any
var err error
xd := xml.NewDecoder(bitio.NewIOReader(br))
br := bitio.NewIOReadSeeker(bbr)
// this reimplements the same XML rune range validation as encoding/xml but fails faster
if err := decodeXMLSeekFirstValidRune(br); err != nil {
d.Fatalf("%s", err)
}
xd := xml.NewDecoder(br)
xd.Strict = false
var n xmlNode
if err := xd.Decode(&n); err != nil {