1
1
mirror of https://github.com/wader/fq.git synced 2024-11-22 07:16:49 +03:00

markdown: Add decoder

This commit is contained in:
Mattias Wadman 2022-09-10 18:09:36 +02:00
parent e98adfaa03
commit 00a50662ea
8 changed files with 694 additions and 0 deletions

View File

@ -28,6 +28,7 @@ import (
_ "github.com/wader/fq/format/jpeg"
_ "github.com/wader/fq/format/json"
_ "github.com/wader/fq/format/macho"
_ "github.com/wader/fq/format/markdown"
_ "github.com/wader/fq/format/math"
_ "github.com/wader/fq/format/matroska"
_ "github.com/wader/fq/format/mp3"

View File

@ -86,6 +86,7 @@ const (
JSONL = "jsonl"
MACHO = "macho"
MACHO_FAT = "macho_fat"
MARKDOWN = "markdown"
MATROSKA = "matroska"
MP3 = "mp3"
MP3_FRAME = "mp3_frame"

319
format/markdown/markdown.go Normal file
View File

@ -0,0 +1,319 @@
package markdown
import (
"embed"
"fmt"
"io/ioutil"
"github.com/gomarkdown/markdown"
"github.com/gomarkdown/markdown/ast"
"github.com/wader/fq/format"
"github.com/wader/fq/pkg/bitio"
"github.com/wader/fq/pkg/decode"
"github.com/wader/fq/pkg/interp"
"github.com/wader/fq/pkg/scalar"
)
// markdownFS holds the embedded markdown.jq file; init registers it with the
// interpreter via interp.RegisterFS.
//
//go:embed markdown.jq
var markdownFS embed.FS
func init() {
interp.RegisterFormat(decode.Format{
Name: format.MARKDOWN,
Description: "Markdown",
DecodeFn: decodeMarkdown,
Functions: []string{"_todisplay"},
})
interp.RegisterFS(markdownFS)
}
// decodeMarkdown reads the whole input, parses it as markdown and stores the
// converted AST as the actual value of a scalar on d.Value. The value's range
// is set to cover the full input length. It panics if reading the input fails
// and always returns nil.
func decodeMarkdown(d *decode.D, _ any) any {
	raw, err := ioutil.ReadAll(bitio.NewIOReader(d.RawLen(d.Len())))
	if err != nil {
		panic(err)
	}

	// Parse with a nil parser argument and convert the resulting tree.
	d.Value.V = &scalar.S{Actual: node(markdown.Parse(raw, nil))}
	d.Value.Range.Len = d.Len()

	return nil
}
// stringSlice converts a slice of strings or byte slices into a []any whose
// elements are plain strings. An empty or nil input yields a nil slice.
func stringSlice[T string | []byte](ss []T) []any {
	var out []any
	for i := range ss {
		out = append(out, string(ss[i]))
	}
	return out
}
// sliceMap applies fn to every element of vs, in order, and returns the
// results as a new slice of the same length. The result is never nil, even
// for empty input.
func sliceMap[F, T any](vs []F, fn func(F) T) []T {
	out := make([]T, 0, len(vs))
	for _, v := range vs {
		out = append(out, fn(v))
	}
	return out
}
// intSlice copies a slice of int-based values into a []any. Elements keep
// their original (possibly named) type. An empty or nil input yields a nil
// slice.
func intSlice[T ~int](ss []T) []any {
	var out []any
	for i := range ss {
		out = append(out, ss[i])
	}
	return out
}
// attr copies the fields of an attribute block into v under the keys "id",
// "attrs" and "classes". A nil attribute leaves v untouched.
func attr(v map[string]any, attr *ast.Attribute) {
	if attr == nil {
		return
	}

	v["id"] = string(attr.ID)

	// NOTE(review): only the ranged values are kept. If Attrs is a map, the
	// keys are dropped and the order is unspecified; confirm against the
	// ast.Attribute definition.
	var attrVals []any
	for _, val := range attr.Attrs {
		attrVals = append(attrVals, string(val))
	}
	v["attrs"] = attrVals

	var classNames []any
	for _, class := range attr.Classes {
		classNames = append(classNames, string(class))
	}
	v["classes"] = classNames
}
// leaf fills v with the fields shared by all leaf nodes: the literal text,
// the given type name and whatever attr adds for the leaf's attribute.
func leaf(v map[string]any, typ string, l ast.Leaf) {
	v["literal"] = string(l.Literal)
	v["type"] = typ

	attr(v, l.Attribute)
}
// container fills v with the fields shared by all container nodes: the given
// type name, the literal text, the converted children and whatever attr adds
// for the container's attribute.
//
// Children that node converts to nil are left out of "children". With no
// remaining children the "children" value is a nil slice.
func container(v map[string]any, typ string, c ast.Container) {
	v["type"] = typ
	v["literal"] = string(c.Literal)

	var cs []any
	for _, n := range c.GetChildren() {
		// Convert each child exactly once. Converting it a second time for the
		// append repeats the whole subtree conversion at every nesting level.
		if cv := node(n); cv != nil {
			cs = append(cs, cv)
		}
	}
	v["children"] = cs

	attr(v, c.Attribute)
}
// listType expands the list flag bitmask t into the names of the flags that
// are set, in a fixed order. It returns a nil slice when no known flag is set.
func listType(t ast.ListType) []any {
	flags := []struct {
		bit  ast.ListType
		name string
	}{
		{ast.ListTypeOrdered, "ordered"},
		{ast.ListTypeDefinition, "definition"},
		{ast.ListTypeTerm, "term"},
		{ast.ListItemContainsBlock, "contains_block"},
		{ast.ListItemBeginningOfList, "beginning_of_list"},
		{ast.ListItemEndOfList, "end_of_list"},
	}

	var vs []any
	for _, f := range flags {
		// Flags are tested with a bitwise AND. A modulo comparison of the form
		// t%bit == bit can never be true, so it would hide every flag.
		if t&f.bit == f.bit {
			vs = append(vs, f.name)
		}
	}
	return vs
}
// node converts a markdown AST node into a JSON-like value.
//
// A text node without attributes becomes its literal string, or nil when the
// literal is empty so that callers can skip it. Every other supported node
// becomes a map[string]any filled by leaf or container plus the node-specific
// fields below. Keys whose value is an empty string are removed before the
// map is returned.
//
// It panics when n is of a type not handled by the switch.
func node(n ast.Node) any {
	v := map[string]any{}

	switch n := n.(type) {
	case *ast.Text:
		if n.Leaf.Attribute == nil {
			if len(n.Leaf.Literal) > 0 {
				return string(n.Leaf.Literal)
			}
			// skip
			return nil
		}
		// Text with attributes can't collapse to a bare string; emit it as a
		// leaf so the literal and attributes are kept instead of an empty map.
		leaf(v, "text", n.Leaf)
	case *ast.Softbreak:
		leaf(v, "softbreak", n.Leaf)
	case *ast.Hardbreak:
		leaf(v, "hardbreak", n.Leaf)
	case *ast.NonBlockingSpace:
		leaf(v, "nbsp", n.Leaf)
	case *ast.Emph:
		container(v, "em", n.Container)
	case *ast.Strong:
		container(v, "strong", n.Container)
	case *ast.Del:
		container(v, "del", n.Container)
	case *ast.BlockQuote:
		container(v, "blockquote", n.Container)
	case *ast.Aside:
		container(v, "aside", n.Container)
	case *ast.Link:
		container(v, "link", n.Container)
		v["destination"] = string(n.Destination)
		v["title"] = string(n.Title)
		v["note_id"] = n.NoteID
		v["deferred_id"] = string(n.DeferredID)
		v["additional_attributes"] = stringSlice(n.AdditionalAttributes)
	case *ast.CrossReference:
		container(v, "cross_reference", n.Container)
		v["destination"] = string(n.Destination)
	case *ast.Citation:
		leaf(v, "citation", n.Leaf)
		v["destination"] = stringSlice(n.Destination)
		// NOTE(review): "type" is assigned three times for citations. The
		// symbolic names below are immediately replaced by the raw values, and
		// both replace the "citation" tag written by leaf. Output is kept as
		// is; confirm which value is intended and whether the citation kinds
		// belong under a separate key.
		v["type"] = sliceMap(n.Type, func(v ast.CitationTypes) string {
			switch v {
			case ast.CitationTypeNone:
				return "none"
			case ast.CitationTypeSuppressed:
				return "suppressed"
			case ast.CitationTypeInformative:
				return "informative"
			case ast.CitationTypeNormative:
				return "normative"
			default:
				return "unknown"
			}
		})
		v["type"] = intSlice(n.Type)
		v["suffix"] = stringSlice(n.Suffix)
	case *ast.Image:
		container(v, "image", n.Container)
		v["destination"] = string(n.Destination)
		v["title"] = string(n.Title)
	case *ast.Code:
		leaf(v, "code", n.Leaf)
	case *ast.CodeBlock:
		leaf(v, "code_block", n.Leaf)
		v["is_fenced"] = n.IsFenced
		v["info"] = string(n.Info)
		// A zero fence character is left out instead of emitting "\x00".
		if n.FenceChar != 0 {
			v["fence_char"] = string(n.FenceChar)
		}
		v["fence_length"] = n.FenceLength
		v["fence_offset"] = n.FenceOffset
	case *ast.Caption:
		container(v, "caption", n.Container)
	case *ast.CaptionFigure:
		container(v, "caption_figure", n.Container)
		v["heading_id"] = n.HeadingID
	case *ast.Document:
		container(v, "document", n.Container)
	case *ast.Paragraph:
		container(v, "paragraph", n.Container)
	case *ast.HTMLSpan:
		leaf(v, "html_span", n.Leaf)
	case *ast.HTMLBlock:
		leaf(v, "html_block", n.Leaf)
	case *ast.Heading:
		container(v, "heading", n.Container)
		v["level"] = n.Level
		v["heading_id"] = n.HeadingID
		v["is_titleblock"] = n.IsTitleblock
		v["is_special"] = n.IsSpecial
	case *ast.HorizontalRule:
		leaf(v, "hr", n.Leaf)
	case *ast.List:
		container(v, "list", n.Container)
		v["list_flags"] = listType(n.ListFlags)
		v["tight"] = n.Tight
		if n.BulletChar != 0 {
			v["bullet_char"] = string(n.BulletChar)
		}
		if n.Delimiter != 0 {
			v["delimiter"] = string(n.Delimiter)
		}
		v["start"] = n.Start
		v["ref_link"] = string(n.RefLink)
		v["is_footnotes_list"] = n.IsFootnotesList
	case *ast.ListItem:
		container(v, "list_item", n.Container)
		v["list_flags"] = listType(n.ListFlags)
		v["tight"] = n.Tight
		if n.BulletChar != 0 {
			v["bullet_char"] = string(n.BulletChar)
		}
		if n.Delimiter != 0 {
			v["delimiter"] = string(n.Delimiter)
		}
		v["ref_link"] = string(n.RefLink)
		v["is_footnotes_list"] = n.IsFootnotesList
	case *ast.Table:
		container(v, "table", n.Container)
	case *ast.TableCell:
		container(v, "table_cell", n.Container)
		v["is_header"] = n.IsHeader
		v["align"] = n.Align.String()
		v["col_span"] = n.ColSpan
	case *ast.TableHeader:
		container(v, "table_header", n.Container)
	case *ast.TableBody:
		container(v, "table_body", n.Container)
	case *ast.TableRow:
		container(v, "table_row", n.Container)
	case *ast.TableFooter:
		container(v, "table_footer", n.Container)
	case *ast.Math:
		leaf(v, "math", n.Leaf)
	case *ast.MathBlock:
		container(v, "math_block", n.Container)
	case *ast.DocumentMatter:
		container(v, "document_matter", n.Container)
		v["matter"] = func(v ast.DocumentMatters) string {
			switch v {
			case ast.DocumentMatterNone:
				return "none"
			case ast.DocumentMatterFront:
				return "front"
			case ast.DocumentMatterMain:
				return "main"
			case ast.DocumentMatterBack:
				return "back"
			default:
				return "unknown"
			}
		}(n.Matter)
	case *ast.Callout:
		leaf(v, "callout", n.Leaf)
		v["id"] = string(n.ID)
	case *ast.Index:
		leaf(v, "index", n.Leaf)
		v["primary"] = n.Primary
		v["item"] = string(n.Item)
		v["subitem"] = string(n.Subitem)
		v["id"] = n.ID
	case *ast.Subscript:
		leaf(v, "subscript", n.Leaf)
	case *ast.Superscript:
		leaf(v, "superscript", n.Leaf)
	case *ast.Footnotes:
		container(v, "footnotes", n.Container)
	default:
		// Format the value itself so the message names the unsupported node
		// type; formatting the node function would only print its signature.
		panic(fmt.Sprintf("unknown node %T", n))
	}

	// Drop keys holding empty strings to keep the output compact. Deleting
	// from a map while ranging over it is safe in Go.
	for k, e := range v {
		if s, ok := e.(string); ok && s == "" {
			delete(v, k)
		}
	}

	return v
}

316
format/markdown/testdata/test.fqtest vendored Normal file
View File

@ -0,0 +1,316 @@
$ fq -d markdown . test.md
{
"children": [
{
"children": [
"Before"
],
"type": "paragraph"
},
{
"children": [
"header 1"
],
"is_special": false,
"is_titleblock": false,
"level": 1,
"type": "heading"
},
{
"children": [
"Paragraph with ",
{
"children": [
"bold"
],
"type": "strong"
},
" and ",
{
"children": [
"italic"
],
"type": "em"
},
"\non\nmultiple\nlines."
],
"type": "paragraph"
},
{
"children": [
{
"children": [
"Some citation"
],
"type": "paragraph"
}
],
"type": "blockquote"
},
{
"children": [
"A footnote",
{
"additional_attributes": [],
"children": [
"^1"
],
"destination": "footnote1",
"note_id": 0,
"type": "link"
},
" and this also",
{
"additional_attributes": [],
"children": [
"^note"
],
"destination": "footnote2",
"note_id": 0,
"type": "link"
}
],
"type": "paragraph"
},
{
"children": [
"header 2"
],
"is_special": false,
"is_titleblock": false,
"level": 2,
"type": "heading"
},
{
"fence_length": 0,
"fence_offset": 0,
"info": "jq",
"is_fenced": true,
"literal": "code\nblock\n",
"type": "code_block"
},
{
"fence_length": 0,
"fence_offset": 0,
"is_fenced": false,
"literal": "also\ncode\n",
"type": "code_block"
},
{
"children": [
"header 3"
],
"is_special": false,
"is_titleblock": false,
"level": 3,
"type": "heading"
},
{
"children": [
"Some text with ",
{
"literal": "code",
"type": "code"
}
],
"type": "paragraph"
},
{
"children": [
"header 4"
],
"is_special": false,
"is_titleblock": false,
"level": 4,
"type": "heading"
},
{
"children": [
"Some text ",
{
"additional_attributes": [],
"children": [
"with a link"
],
"destination": "http://host/path",
"note_id": 0,
"type": "link"
}
],
"type": "paragraph"
},
{
"children": [
"An image ",
{
"children": [
"img alt text"
],
"destination": "path/image.png",
"type": "image"
}
],
"type": "paragraph"
},
{
"children": [
"header 5"
],
"is_special": false,
"is_titleblock": false,
"level": 5,
"type": "heading"
},
{
"children": [
{
"bullet_char": "-",
"children": [
{
"children": [
"list of"
],
"type": "paragraph"
}
],
"delimiter": ".",
"is_footnotes_list": false,
"list_flags": [],
"tight": false,
"type": "list_item"
},
{
"bullet_char": "-",
"children": [
{
"children": [
"things"
],
"type": "paragraph"
}
],
"delimiter": ".",
"is_footnotes_list": false,
"list_flags": [],
"tight": false,
"type": "list_item"
}
],
"delimiter": ".",
"is_footnotes_list": false,
"list_flags": [],
"start": 0,
"tight": true,
"type": "list"
},
{
"children": [
"a table"
],
"type": "paragraph"
},
{
"children": [
{
"children": [
{
"children": [
{
"children": [
"a"
],
"col_span": 0,
"is_header": true,
"type": "table_cell"
},
{
"children": [
"b"
],
"col_span": 0,
"is_header": true,
"type": "table_cell"
},
{
"children": [
"c"
],
"col_span": 0,
"is_header": true,
"type": "table_cell"
}
],
"type": "table_row"
}
],
"type": "table_header"
},
{
"children": [
{
"children": [
{
"children": [
"1"
],
"col_span": 0,
"is_header": false,
"type": "table_cell"
},
{
"children": [
"2"
],
"col_span": 0,
"is_header": false,
"type": "table_cell"
},
{
"children": [
"3"
],
"col_span": 0,
"is_header": false,
"type": "table_cell"
}
],
"type": "table_row"
}
],
"type": "table_body"
}
],
"type": "table"
},
{
"children": [
"header 6"
],
"is_special": false,
"is_titleblock": false,
"level": 6,
"type": "heading"
},
{
"children": [
"Some text with line ",
{
"literal": "<br>",
"type": "html_span"
},
" break and ",
{
"literal": "<b>",
"type": "html_span"
},
"bold",
{
"literal": "</b>",
"type": "html_span"
}
],
"type": "paragraph"
}
],
"type": "document"
}

50
format/markdown/testdata/test.md vendored Normal file
View File

@ -0,0 +1,50 @@
Before
# header 1
Paragraph with **bold** and *italic*
on
multiple
lines.
> Some citation
A footnote[^1] and this also[^note]
## header 2
```jq
code
block
```
also
code
### header 3
Some text with `code`
#### header 4
Some text [with a link](http://host/path)
An image ![img alt text](path/image.png)
##### header 5
- list of
- things
a table
| a | b | c |
| --- | --- | --- |
| 1 | 2 | 3 |
###### header 6
Some text with line <br> break and <b>bold</b>
[^1]: footnote1
[^note]: footnote2

4
go.mod
View File

@ -25,6 +25,10 @@ require (
// bump: gomod-golang-snappy link "Source diff $CURRENT..$LATEST" https://github.com/golang/snappy/compare/v$CURRENT..v$LATEST
github.com/golang/snappy v0.0.4
// has no tags
// go get -d github.com/gomarkdown/markdown@master && go mod tidy
github.com/gomarkdown/markdown v0.0.0-20220627144906-e9a81102ebeb
// has no tags yet
// bump-disabled: gomod-gopacket /github\.com\/gopacket\/gopacket v(.*)/ https://github.com/gopacket/gopacket.git|^1
// bump-disabled: gomod-gopacket command go get -d github.com/gopacket/gopacket@v$LATEST && go mod tidy

2
go.sum
View File

@ -4,6 +4,8 @@ github.com/creasty/defaults v1.6.0 h1:ltuE9cfphUtlrBeomuu8PEyISTXnxqkBIoQfXgv7BS
github.com/creasty/defaults v1.6.0/go.mod h1:iGzKe6pbEHnpMPtfDXZEr0NVxWnPTjb1bbDy08fPzYM=
github.com/golang/snappy v0.0.4 h1:yAGX7huGHXlcLOEtBnF4w7FQwA26wojNCwOYAEhLjQM=
github.com/golang/snappy v0.0.4/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q=
github.com/gomarkdown/markdown v0.0.0-20220627144906-e9a81102ebeb h1:5b/eFaSaKPFG9ygDBaPKkydKU5nFJYk08g9jPIVogMg=
github.com/gomarkdown/markdown v0.0.0-20220627144906-e9a81102ebeb/go.mod h1:JDGcbDT52eL4fju3sZ4TeHGsQwhG9nbDV21aMyhwPoA=
github.com/gopacket/gopacket v0.0.0-20220819214934-ee81b8c880da h1:AAwDU9N39fQNYUtg270aiU6N7U2ZVsGZKiRwsCMsWEo=
github.com/gopacket/gopacket v0.0.0-20220819214934-ee81b8c880da/go.mod h1:DlRRfaM/QjAu2ADqraIure1Eif0HpNL8hmyVQ+qci5Y=
github.com/itchyny/timefmt-go v0.1.3 h1:7M3LGVDsqcd0VZH2U+x393obrzZisp7C0uEe921iRkU=

View File

@ -162,6 +162,7 @@ json JavaScript Object Notation
jsonl JavaScript Object Notation Lines
macho Mach-O macOS executable
macho_fat Fat Mach-O macOS executable (multi-architecture)
markdown Markdown
matroska Matroska file
mp3 MP3 file
mp3_frame MPEG audio layer 3 frame