From 56edb59e83761acc8d847d9f08d3ba010d889c0d Mon Sep 17 00:00:00 2001 From: Mattias Wadman Date: Wed, 22 Feb 2023 16:16:35 +0100 Subject: [PATCH] toml,xml: Fail fast on invalid content encoding/xml and github.com/BurntSushi/toml both reads a lot before detecting that it can't decode. Now we instead read one UTF-8 and make sure it's valid xml or toml. Should speed up probing Related to #586 bigzero-zip.zip --- format/toml/testdata/toml.fqtest | 6 ++--- format/toml/toml.go | 35 ++++++++++++++++++++++-- format/xml/xml.go | 46 ++++++++++++++++++++++++++++++-- 3 files changed, 80 insertions(+), 7 deletions(-) diff --git a/format/toml/testdata/toml.fqtest b/format/toml/testdata/toml.fqtest index f637f18b..7aafdc47 100644 --- a/format/toml/testdata/toml.fqtest +++ b/format/toml/testdata/toml.fqtest @@ -63,9 +63,9 @@ true = true toml: top-level values must be Go maps or structs ---- -error at position 0x0: root object has no values +error at position 0x0: EOF ---- -$ fq -n '"" | from_toml' +$ fq -n '" " | from_toml' exitcode: 5 stderr: -error: error at position 0x0: root object has no values +error: error at position 0x1: root object has no values diff --git a/format/toml/toml.go b/format/toml/toml.go index 4944f513..266fd3e3 100644 --- a/format/toml/toml.go +++ b/format/toml/toml.go @@ -1,8 +1,12 @@ package toml import ( + "bufio" "bytes" "embed" + "fmt" + "io" + "unicode/utf8" "github.com/BurntSushi/toml" "github.com/wader/fq/format" @@ -29,11 +33,38 @@ func init() { interp.RegisterFunc0("to_toml", toTOML) } +func decodeTOMLSeekFirstValidRune(br io.ReadSeeker) error { + buf := bufio.NewReader(br) + r, sz, err := buf.ReadRune() + if err != nil { + return err + } + if _, err := br.Seek(0, io.SeekStart); err != nil { + return err + } + if r == utf8.RuneError && sz == 1 { + return fmt.Errorf("invalid UTF-8") + } + if r == 0 { + return fmt.Errorf("TOML can't contain null bytes") + } + + return nil +} + func decodeTOML(d *decode.D) any { - br := d.RawLen(d.Len()) + bbr := d.RawLen(d.Len()) var r any - if _, err := toml.NewDecoder(bitio.NewIOReader(br)).Decode(&r); err != nil { + br := bitio.NewIOReadSeeker(bbr) + + // github.com/BurntSushi/toml currently does a ReadAll which might be expensive + // try find invalid toml (null bytes etc) faster and more efficient + if err := decodeTOMLSeekFirstValidRune(br); err != nil { + d.Fatalf("%s", err) + } + + if _, err := toml.NewDecoder(br).Decode(&r); err != nil { d.Fatalf("%s", err) } var s scalar.Any diff --git a/format/xml/xml.go b/format/xml/xml.go index 7a9d5453..ce82ea3f 100644 --- a/format/xml/xml.go +++ b/format/xml/xml.go @@ -7,15 +7,18 @@ package xml // TODO: rewrite ns stack import ( + "bufio" "bytes" "embed" "encoding/xml" "errors" + "fmt" "html" "io" "regexp" "strconv" "strings" + "unicode/utf8" "github.com/wader/fq/format" "github.com/wader/fq/internal/gojqex" @@ -247,15 +250,54 @@ func fromXMLToArray(n xmlNode) any { return f(n, nil) } +// from golang encoding/xml, copyright 2009 The Go Authors +// the Char production of https://www.xml.com/axml/testaxml.htm, +// Section 2.2 Characters. +func isInCharacterRange(r rune) (inrange bool) { + return r == 0x09 || + r == 0x0A || + r == 0x0D || + r >= 0x20 && r <= 0xD7FF || + r >= 0xE000 && r <= 0xFFFD || + r >= 0x10000 && r <= 0x10FFFF +} + +func decodeXMLSeekFirstValidRune(br io.ReadSeeker) error { + buf := bufio.NewReader(br) + r, sz, err := buf.ReadRune() + if err != nil { + return err + } + if _, err := br.Seek(0, io.SeekStart); err != nil { + return err + } + if r == utf8.RuneError && sz == 1 { + return fmt.Errorf("invalid UTF-8") + } + if !isInCharacterRange(r) { + return fmt.Errorf("illegal character code %U", r) + } + + return nil +} + func decodeXML(d *decode.D) any { var xi format.XMLIn d.ArgAs(&xi) - br := d.RawLen(d.Len()) + bbr := d.RawLen(d.Len()) var r any var err error - xd := xml.NewDecoder(bitio.NewIOReader(br)) + br := bitio.NewIOReadSeeker(bbr) + + // this reimplements same xml rune range validation as ecoding/xml but fails faster + if err := decodeXMLSeekFirstValidRune(br); err != nil { + d.Fatalf("%s", err) + } + + xd := xml.NewDecoder(br) + xd.Strict = false var n xmlNode if err := xd.Decode(&n); err != nil {