Merge pull request #429 from wader/more-format-doc

doc,html,xml: Add more documentation and examples
2024-12-23 13:22:58 +03:00 · 2022-09-20 18:32:15 +02:00 · 2022-09-20 18:32:15 +02:00 · 2e3d71fdcc
commit 2e3d71fdcc
parent 01be59e459 725ab1b17b
5 changed files with 501 additions and 0 deletions
--- a/doc/formats.md
+++ b/doc/formats.md
@ -375,6 +375,71 @@ Decode value as html
 ... | html({array:false,attribute_prefix:"@",seq:false})
 ```

+HTML is decoded in HTML5 mode and will always include `<html>`, `<body>` and `<head>` elements.
+
+See xml format for more examples and how to preserve element order and how to encode to xml.
+
+There is no `tohtml` function, see `toxml` instead.
+
+### Element as object
+
+```sh
+# decode as object is the default
+$ echo '<a href="url">text</a>' | fq -d html
+{
+  "html": {
+    "body": {
+      "a": {
+        "#text": "text",
+        "@href": "url"
+      }
+    },
+    "head": ""
+  }
+}
+```
+
+### Element as array
+
+```sh
+$ echo '<a href="url">text</a>' | fq -d html -o array=true
+[
+  "html",
+  null,
+  [
+    [
+      "head",
+      null,
+      []
+    ],
+    [
+      "body",
+      null,
+      [
+        [
+          "a",
+          {
+            "#text": "text",
+            "href": "url"
+          },
+          []
+        ]
+      ]
+    ]
+  ]
+]
+```
+
+```sh
+# Decode html files to a {file: "title", ...} object
+$ fq -n -d html '[inputs | {key: input_filename, value: .html.head.title?}] | from_entries' *.html
+```
+
+```sh
+# <a> href:s in file
+$ fq -r -o array=true -d html '.. | select(.[0] == "a" and .[1].href)?.[1].href' file.html
+```
+
 ## macho

 Supports decoding vanilla and FAT Mach-O binaries.
@ -560,6 +625,113 @@ Decode value as xml
 ... | xml({array:false,attribute_prefix:"@",seq:false})
 ```

+XML can be decoded and encoded into jq values in two ways, elements as object or array.
+Which variant to choose depends on what you want to do. The object variant might be easier
+to query for a specific value but array might be easier to use to generate xml or to query
+for all elements of some kind etc.
+
+Encoding is done using the `toxml` function and it will figure out which variant is used based on the input value.
+It has two optional options `indent` and `attribute_prefix`.
+
+### Elements as object
+
+Element can have different shapes depending on body text, attributes and children:
+
+- `<a key="value">text</a>` is `{"a":{"#text":"text","@key":"value"}}`, has text (`#text`) and attributes (`@key`)
+- `<a>text</a>` is `{"a":"text"}`
+- `<a><b>text</b></a>` is `{"a":{"b":"text"}}` one child with only text and no attributes
+- `<a><b/><b>text</b></a>` is `{"a":{"b":["","text"]}}` two children with same name end up in an array
+- `<a><b/><b key="value">text</b></a>` is `{"a":{"b":["",{"#text":"text","@key":"value"}]}}`
+
+If there is a `#seq` attribute it encodes the child element order. Use `-o seq=true` to include sequence number when decoding,
+otherwise order might be lost.
+
+```sh
+# decode as object is the default
+$ echo '<a><b/><b>bbb</b><c attr="value">ccc</c></a>' | fq -d xml -o seq=true
+{
+  "a": {
+    "b": [
+      {
+        "#seq": 0
+      },
+      {
+        "#seq": 1,
+        "#text": "bbb"
+      }
+    ],
+    "c": {
+      "#seq": 2,
+      "#text": "ccc",
+      "@attr": "value"
+    }
+  }
+}
+
+# access text of the <c> element
+$ echo '<a><b/><b>bbb</b><c attr="value">ccc</c></a>' | fq '.a.c["#text"]'
+"ccc"
+```
+
+```sh
+# decode to object and encode to xml
+$ echo '<a><b/><b>bbb</b><c attr="value">ccc</c></a>' | fq -r -d xml -o seq=true 'toxml({indent:2})'
+<a>
+  <b></b>
+  <b>bbb</b>
+  <c attr="value">ccc</c>
+</a>
+```
+
+### Elements as array
+
+Elements are arrays of the shape `["name", {"#text": "body text", "attr_name": "attr value"}|null, [<child element>, ...]]`.
+
+```sh
+# decode as array
+$ echo '<a><b/><b>bbb</b><c attr="value">ccc</c></a>' | fq -d xml -o array=true
+[
+  "a",
+  null,
+  [
+    [
+      "b",
+      null,
+      []
+    ],
+    [
+      "b",
+      {
+        "#text": "bbb"
+      },
+      []
+    ],
+    [
+      "c",
+      {
+        "#text": "ccc",
+        "attr": "value"
+      },
+      []
+    ]
+  ]
+]
+```
+
+```sh
+# decode to array and encode to xml
+$ echo '<a><b/><b>bbb</b><c attr="value">ccc</c></a>' | fq -r -d xml -o array=true -o seq=true 'toxml({indent:2})'
+<a>
+  <b></b>
+  <b>bbb</b>
+  <c attr="value">ccc</c>
+</a>
+
+# access text of the <c> element, the object variant above is probably easier to use
+$ echo '<a><b/><b>bbb</b><c attr="value">ccc</c></a>' | fq -o array=true '.[2][2][1]["#text"]'
+"ccc"
+```
+
 ### References
 - [xml.com's Converting Between XML and JSON](https://www.xml.com/pub/a/2006/05/31/converting-between-xml-and-json.html)

--- a/format/all/help.fqtest
+++ b/format/all/help.fqtest
@ -621,6 +621,65 @@ out   $ fq -d html -o array=false -o attribute_prefix="@" -o seq=false . file
 out   # Decode value as html
 out   ... | html({array:false,attribute_prefix:"@",seq:false})
 out 
+out HTML is decoded in HTML5 mode and will always include <html>, <body> and <head> elements.
+out 
+out See xml format for more examples and how to preserve element order and how to encode to xml.
+out 
+out There is no tohtml function, see toxml instead.
+out 
+out # Element as object
+out 
+out   # decode as object is the default
+out   $ echo '<a href="url">text</a>' | fq -d html
+out   {
+out     "html": {
+out       "body": {
+out         "a": {
+out           "#text": "text",
+out           "@href": "url"
+out         }
+out       },
+out       "head": ""
+out     }
+out   }
+out   
+out # Element as array
+out 
+out   $ echo '<a href="url">text</a>' | fq -d html -o array=true
+out   [
+out     "html",
+out     null,
+out     [
+out       [
+out         "head",
+out         null,
+out         []
+out       ],
+out       [
+out         "body",
+out         null,
+out         [
+out           [
+out             "a",
+out             {
+out               "#text": "text",
+out               "href": "url"
+out             },
+out             []
+out           ]
+out         ]
+out       ]
+out     ]
+out   ]
+out   
+out 
+out   # Decode html files to a {file: "title", ...} object
+out   $ fq -n -d html '[inputs | {key: input_filename, value: .html.head.title?}] | from_entries' *.html
+out   
+out 
+out   # <a> href:s in file
+out   $ fq -r -o array=true -d html '.. | select(.[0] == "a" and .[1].href)?.[1].href' file.html
+out   
 "help(icc_profile)"
 out icc_profile: International Color Consortium profile decoder
 out 
@ -1280,6 +1339,104 @@ out   $ fq -d xml -o array=false -o attribute_prefix="@" -o seq=false . file
 out   # Decode value as xml
 out   ... | xml({array:false,attribute_prefix:"@",seq:false})
 out 
+out XML can be decoded and encoded into jq values in two ways, elements as object or array. Which variant to choose depends on what you want to do. The object variant
+out might be easier to query for a specific value but array might be easier to use to generate xml or to query for all elements of some kind etc.
+out 
+out Encoding is done using the toxml function and it will figure out which variant is used based on the input value. It has two optional options indent and
+out attribute_prefix.
+out 
+out # Elements as object
+out Element can have different shapes depending on body text, attributes and children:
+out 
+out - <a key="value">text</a> is {"a":{"#text":"text","@key":"value"}}, has text (#text) and attributes (@key)
+out - <a>text</a> is {"a":"text"}
+out - <a><b>text</b></a> is {"a":{"b":"text"}} one child with only text and no attributes
+out - <a><b/><b>text</b></a> is {"a":{"b":["","text"]}} two children with same name end up in an array
+out - <a><b/><b key="value">text</b></a> is {"a":{"b":["",{"#text":"text","@key":"value"}]}}
+out 
+out If there is a #seq attribute it encodes the child element order. Use -o seq=true to include sequence number when decoding, otherwise order might be lost.
+out 
+out 
+out   # decode as object is the default
+out   $ echo '<a><b/><b>bbb</b><c attr="value">ccc</c></a>' | fq -d xml -o seq=true
+out   {
+out     "a": {
+out       "b": [
+out         {
+out           "#seq": 0
+out         },
+out         {
+out           "#seq": 1,
+out           "#text": "bbb"
+out         }
+out       ],
+out       "c": {
+out         "#seq": 2,
+out         "#text": "ccc",
+out         "@attr": "value"
+out       }
+out     }
+out   }
+out   
+out   # access text of the <c> element
+out   $ echo '<a><b/><b>bbb</b><c attr="value">ccc</c></a>' | fq '.a.c["#text"]'
+out   "ccc"
+out   
+out 
+out   # decode to object and encode to xml
+out   $ echo '<a><b/><b>bbb</b><c attr="value">ccc</c></a>' | fq -r -d xml -o seq=true 'toxml({indent:2})'
+out   <a>
+out     <b></b>
+out     <b>bbb</b>
+out     <c attr="value">ccc</c>
+out   </a>
+out   
+out # Elements as array
+out Elements are arrays of the shape ["name", {"#text": "body text", "attr_name": "attr value"}|null, [<child element>, ...]].
+out 
+out 
+out   # decode as array
+out   $ echo '<a><b/><b>bbb</b><c attr="value">ccc</c></a>' | fq -d xml -o array=true
+out   [
+out     "a",
+out     null,
+out     [
+out       [
+out         "b",
+out         null,
+out         []
+out       ],
+out       [
+out         "b",
+out         {
+out           "#text": "bbb"
+out         },
+out         []
+out       ],
+out       [
+out         "c",
+out         {
+out           "#text": "ccc",
+out           "attr": "value"
+out         },
+out         []
+out       ]
+out     ]
+out   ]
+out   
+out 
+out   # decode to array and encode to xml
+out   $ echo '<a><b/><b>bbb</b><c attr="value">ccc</c></a>' | fq -r -d xml -o array=true -o seq=true 'toxml({indent:2})'
+out   <a>
+out     <b></b>
+out     <b>bbb</b>
+out     <c attr="value">ccc</c>
+out   </a>
+out   
+out   # access text of the <c> element, the object variant above is probably easier to use
+out   $ echo '<a><b/><b>bbb</b><c attr="value">ccc</c></a>' | fq -o array=true '.[2][2][1]["#text"]'
+out   "ccc"
+out   
 out # References
 out - xml.com's Converting Between XML and JSON (https://www.xml.com/pub/a/2006/05/31/converting-between-xml-and-json.html)
 out 
--- a/format/xml/html.go
+++ b/format/xml/html.go
@ -13,6 +13,7 @@ import (
 )

 //go:embed html.jq
+//go:embed html.md
 var htmlFS embed.FS

 func init() {
--- a/format/xml/html.md
+++ b/format/xml/html.md
@ -0,0 +1,64 @@
+HTML is decoded in HTML5 mode and will always include `<html>`, `<body>` and `<head>` elements.
+
+See xml format for more examples and how to preserve element order and how to encode to xml.
+
+There is no `tohtml` function, see `toxml` instead.
+
+### Element as object
+
+```sh
+# decode as object is the default
+$ echo '<a href="url">text</a>' | fq -d html
+{
+  "html": {
+    "body": {
+      "a": {
+        "#text": "text",
+        "@href": "url"
+      }
+    },
+    "head": ""
+  }
+}
+```
+
+### Element as array
+
+```sh
+$ echo '<a href="url">text</a>' | fq -d html -o array=true
+[
+  "html",
+  null,
+  [
+    [
+      "head",
+      null,
+      []
+    ],
+    [
+      "body",
+      null,
+      [
+        [
+          "a",
+          {
+            "#text": "text",
+            "href": "url"
+          },
+          []
+        ]
+      ]
+    ]
+  ]
+]
+```
+
+```sh
+# Decode html files to a {file: "title", ...} object
+$ fq -n -d html '[inputs | {key: input_filename, value: .html.head.title?}] | from_entries' *.html
+```
+
+```sh
+# <a> href:s in file
+$ fq -r -o array=true -d html '.. | select(.[0] == "a" and .[1].href)?.[1].href' file.html
+```
--- a/format/xml/xml.md
+++ b/format/xml/xml.md
@ -1,2 +1,109 @@
+XML can be decoded and encoded into jq values in two ways, elements as object or array.
+Which variant to choose depends on what you want to do. The object variant might be easier
+to query for a specific value but array might be easier to use to generate xml or to query
+for all elements of some kind etc.
+
+Encoding is done using the `toxml` function and it will figure out which variant is used based on the input value.
+It has two optional options `indent` and `attribute_prefix`.
+
+### Elements as object
+
+Element can have different shapes depending on body text, attributes and children:
+
+- `<a key="value">text</a>` is `{"a":{"#text":"text","@key":"value"}}`, has text (`#text`) and attributes (`@key`)
+- `<a>text</a>` is `{"a":"text"}`
+- `<a><b>text</b></a>` is `{"a":{"b":"text"}}` one child with only text and no attributes
+- `<a><b/><b>text</b></a>` is `{"a":{"b":["","text"]}}` two children with same name end up in an array
+- `<a><b/><b key="value">text</b></a>` is `{"a":{"b":["",{"#text":"text","@key":"value"}]}}`
+
+If there is a `#seq` attribute it encodes the child element order. Use `-o seq=true` to include sequence number when decoding,
+otherwise order might be lost.
+
+```sh
+# decode as object is the default
+$ echo '<a><b/><b>bbb</b><c attr="value">ccc</c></a>' | fq -d xml -o seq=true
+{
+  "a": {
+    "b": [
+      {
+        "#seq": 0
+      },
+      {
+        "#seq": 1,
+        "#text": "bbb"
+      }
+    ],
+    "c": {
+      "#seq": 2,
+      "#text": "ccc",
+      "@attr": "value"
+    }
+  }
+}
+
+# access text of the <c> element
+$ echo '<a><b/><b>bbb</b><c attr="value">ccc</c></a>' | fq '.a.c["#text"]'
+"ccc"
+```
+
+```sh
+# decode to object and encode to xml
+$ echo '<a><b/><b>bbb</b><c attr="value">ccc</c></a>' | fq -r -d xml -o seq=true 'toxml({indent:2})'
+<a>
+  <b></b>
+  <b>bbb</b>
+  <c attr="value">ccc</c>
+</a>
+```
+
+### Elements as array
+
+Elements are arrays of the shape `["name", {"#text": "body text", "attr_name": "attr value"}|null, [<child element>, ...]]`.
+
+```sh
+# decode as array
+$ echo '<a><b/><b>bbb</b><c attr="value">ccc</c></a>' | fq -d xml -o array=true
+[
+  "a",
+  null,
+  [
+    [
+      "b",
+      null,
+      []
+    ],
+    [
+      "b",
+      {
+        "#text": "bbb"
+      },
+      []
+    ],
+    [
+      "c",
+      {
+        "#text": "ccc",
+        "attr": "value"
+      },
+      []
+    ]
+  ]
+]
+```
+
+```sh
+# decode to array and encode to xml
+$ echo '<a><b/><b>bbb</b><c attr="value">ccc</c></a>' | fq -r -d xml -o array=true -o seq=true 'toxml({indent:2})'
+<a>
+  <b></b>
+  <b>bbb</b>
+  <c attr="value">ccc</c>
+</a>
+
+# access text of the <c> element, the object variant above is probably easier to use
+$ echo '<a><b/><b>bbb</b><c attr="value">ccc</c></a>' | fq -o array=true '.[2][2][1]["#text"]'
+"ccc"
+```
+
 ### References
 - [xml.com's Converting Between XML and JSON](https://www.xml.com/pub/a/2006/05/31/converting-between-xml-and-json.html)