diff --git a/format/all/all.go b/format/all/all.go
index 0263dbd3..ba1127fa 100644
--- a/format/all/all.go
+++ b/format/all/all.go
@@ -28,6 +28,7 @@ import (
 	_ "github.com/wader/fq/format/jpeg"
 	_ "github.com/wader/fq/format/json"
 	_ "github.com/wader/fq/format/macho"
+	_ "github.com/wader/fq/format/markdown"
 	_ "github.com/wader/fq/format/math"
 	_ "github.com/wader/fq/format/matroska"
 	_ "github.com/wader/fq/format/mp3"
diff --git a/format/format.go b/format/format.go
index e42a5900..2ac54a72 100644
--- a/format/format.go
+++ b/format/format.go
@@ -86,6 +86,7 @@ const (
 	JSONL               = "jsonl"
 	MACHO               = "macho"
 	MACHO_FAT           = "macho_fat"
+	MARKDOWN            = "markdown"
 	MATROSKA            = "matroska"
 	MP3                 = "mp3"
 	MP3_FRAME           = "mp3_frame"
diff --git a/format/markdown/markdown.go b/format/markdown/markdown.go
new file mode 100644
index 00000000..dd921ef0
--- /dev/null
+++ b/format/markdown/markdown.go
@@ -0,0 +1,333 @@
+// Package markdown decodes Markdown documents into a JSON-like tree built
+// from the gomarkdown AST.
+package markdown
+
+import (
+	"embed"
+	"fmt"
+	"io"
+
+	"github.com/gomarkdown/markdown"
+	"github.com/gomarkdown/markdown/ast"
+	"github.com/wader/fq/format"
+	"github.com/wader/fq/pkg/bitio"
+	"github.com/wader/fq/pkg/decode"
+	"github.com/wader/fq/pkg/interp"
+	"github.com/wader/fq/pkg/scalar"
+)
+
+//go:embed markdown.jq
+var markdownFS embed.FS
+
+// init registers the markdown format and the embedded markdown.jq file system
+// with the interpreter.
+func init() {
+	interp.RegisterFormat(decode.Format{
+		Name:        format.MARKDOWN,
+		Description: "Markdown",
+		DecodeFn:    decodeMarkdown,
+		Functions:   []string{"_todisplay"},
+	})
+	interp.RegisterFS(markdownFS)
+}
+
+// decodeMarkdown reads the whole input, parses it with markdown.Parse and
+// stores the converted tree in d.Value, with a range length equal to the
+// input length. It panics if the input cannot be read.
+func decodeMarkdown(d *decode.D, _ any) any {
+	b, err := io.ReadAll(bitio.NewIOReader(d.RawLen(d.Len())))
+	if err != nil {
+		panic(err)
+	}
+
+	var s scalar.S
+	s.Actual = node(markdown.Parse(b, nil))
+	d.Value.V = &s
+	d.Value.Range.Len = d.Len()
+
+	return nil
+}
+
+// stringSlice converts a slice of strings or byte slices into a []any holding
+// strings. The result is nil when ss is empty.
+func stringSlice[T string | []byte](ss []T) []any {
+	var vs []any
+	for _, e := range ss {
+		vs = append(vs, string(e))
+	}
+	return vs
+}
+
+// sliceMap returns a new slice holding fn applied to every element of vs.
+func sliceMap[F, T any](vs []F, fn func(F) T) []T {
+	ts := make([]T, len(vs))
+	for i, v := range vs {
+		ts[i] = fn(v)
+	}
+	return ts
+}
+
+// attr copies the id, attrs and classes of a into v. It does nothing when a
+// is nil.
+func attr(v map[string]any, a *ast.Attribute) {
+	if a == nil {
+		return
+	}
+
+	v["id"] = string(a.ID)
+
+	var as []any
+	for _, e := range a.Attrs {
+		as = append(as, string(e))
+	}
+	v["attrs"] = as
+
+	var cs []any
+	for _, e := range a.Classes {
+		cs = append(cs, string(e))
+	}
+	v["classes"] = cs
+}
+
+// leaf fills v with the type, literal and attributes of a leaf node.
+func leaf(v map[string]any, typ string, l ast.Leaf) {
+	v["type"] = typ
+	v["literal"] = string(l.Literal)
+
+	attr(v, l.Attribute)
+}
+
+// container fills v with the type, literal, converted children and attributes
+// of a container node. Children for which node returns nil are left out.
+func container(v map[string]any, typ string, c ast.Container) {
+	v["type"] = typ
+	v["literal"] = string(c.Literal)
+
+	var cs []any
+	for _, n := range c.GetChildren() {
+		// Convert each child exactly once; a second node call would redo the
+		// whole subtree at every nesting level.
+		if cv := node(n); cv != nil {
+			cs = append(cs, cv)
+		}
+	}
+	v["children"] = cs
+
+	attr(v, c.Attribute)
+}
+
+// listType returns the names of the list flags set in t, in the order of the
+// table below. The result is nil when no flag is set.
+func listType(t ast.ListType) []any {
+	flags := []struct {
+		bit  ast.ListType
+		name string
+	}{
+		{ast.ListTypeOrdered, "ordered"},
+		{ast.ListTypeDefinition, "definition"},
+		{ast.ListTypeTerm, "term"},
+		{ast.ListItemContainsBlock, "contains_block"},
+		{ast.ListItemBeginningOfList, "beginning_of_list"},
+		{ast.ListItemEndOfList, "end_of_list"},
+	}
+
+	var vs []any
+	for _, f := range flags {
+		// The flags are combined as a bit set, so membership is tested with a
+		// mask rather than a remainder.
+		if t&f.bit == f.bit {
+			vs = append(vs, f.name)
+		}
+	}
+
+	return vs
+}
+
+// node converts an AST node into a JSON-like value. Text without attributes
+// becomes a plain string, or nil when it is empty; every other known node
+// becomes a map with a "type" key plus node specific fields, with empty string
+// values removed. It panics on node types it does not know.
+func node(n ast.Node) any {
+	v := map[string]any{}
+
+	switch n := n.(type) {
+	case *ast.Text:
+		if n.Leaf.Attribute == nil {
+			if len(n.Leaf.Literal) > 0 {
+				return string(n.Leaf.Literal)
+			}
+			// skip
+			return nil
+		}
+		// Text that carries attributes is emitted as an object so that neither
+		// the attributes nor the literal are lost.
+		leaf(v, "text", n.Leaf)
+	case *ast.Softbreak:
+		leaf(v, "softbreak", n.Leaf)
+	case *ast.Hardbreak:
+		leaf(v, "hardbreak", n.Leaf)
+	case *ast.NonBlockingSpace:
+		leaf(v, "nbsp", n.Leaf)
+	case *ast.Emph:
+		container(v, "em", n.Container)
+	case *ast.Strong:
+		container(v, "strong", n.Container)
+	case *ast.Del:
+		container(v, "del", n.Container)
+	case *ast.BlockQuote:
+		container(v, "blockquote", n.Container)
+	case *ast.Aside:
+		container(v, "aside", n.Container)
+	case *ast.Link:
+		container(v, "link", n.Container)
+		v["destination"] = string(n.Destination)
+		v["title"] = string(n.Title)
+		v["note_id"] = n.NoteID
+		v["deferred_id"] = string(n.DeferredID)
+		v["additional_attributes"] = stringSlice(n.AdditionalAttributes)
+	case *ast.CrossReference:
+		container(v, "cross_reference", n.Container)
+		v["destination"] = string(n.Destination)
+	case *ast.Citation:
+		leaf(v, "citation", n.Leaf)
+		v["destination"] = stringSlice(n.Destination)
+		// Stored under its own key so the node "type" set by leaf survives.
+		v["citation_types"] = sliceMap(n.Type, func(t ast.CitationTypes) any {
+			switch t {
+			case ast.CitationTypeNone:
+				return "none"
+			case ast.CitationTypeSuppressed:
+				return "suppressed"
+			case ast.CitationTypeInformative:
+				return "informative"
+			case ast.CitationTypeNormative:
+				return "normative"
+			default:
+				return "unknown"
+			}
+		})
+		v["suffix"] = stringSlice(n.Suffix)
+	case *ast.Image:
+		container(v, "image", n.Container)
+		v["destination"] = string(n.Destination)
+		v["title"] = string(n.Title)
+	case *ast.Code:
+		leaf(v, "code", n.Leaf)
+	case *ast.CodeBlock:
+		leaf(v, "code_block", n.Leaf)
+		v["is_fenced"] = n.IsFenced
+		v["info"] = string(n.Info)
+		if n.FenceChar != 0 {
+			v["fence_char"] = string(n.FenceChar)
+		}
+		v["fence_length"] = n.FenceLength
+		v["fence_offset"] = n.FenceOffset
+	case *ast.Caption:
+		container(v, "caption", n.Container)
+	case *ast.CaptionFigure:
+		container(v, "caption_figure", n.Container)
+		v["heading_id"] = n.HeadingID
+	case *ast.Document:
+		container(v, "document", n.Container)
+	case *ast.Paragraph:
+		container(v, "paragraph", n.Container)
+	case *ast.HTMLSpan:
+		leaf(v, "html_span", n.Leaf)
+	case *ast.HTMLBlock:
+		leaf(v, "html_block", n.Leaf)
+	case *ast.Heading:
+		container(v, "heading", n.Container)
+		v["level"] = n.Level
+		v["heading_id"] = n.HeadingID
+		v["is_titleblock"] = n.IsTitleblock
+		v["is_special"] = n.IsSpecial
+	case *ast.HorizontalRule:
+		leaf(v, "hr", n.Leaf)
+	case *ast.List:
+		container(v, "list", n.Container)
+		v["list_flags"] = listType(n.ListFlags)
+		v["tight"] = n.Tight
+		if n.BulletChar != 0 {
+			v["bullet_char"] = string(n.BulletChar)
+		}
+		if n.Delimiter != 0 {
+			v["delimiter"] = string(n.Delimiter)
+		}
+		v["start"] = n.Start
+		v["ref_link"] = string(n.RefLink)
+		v["is_footnotes_list"] = n.IsFootnotesList
+	case *ast.ListItem:
+		container(v, "list_item", n.Container)
+		v["list_flags"] = listType(n.ListFlags)
+		v["tight"] = n.Tight
+		if n.BulletChar != 0 {
+			v["bullet_char"] = string(n.BulletChar)
+		}
+		if n.Delimiter != 0 {
+			v["delimiter"] = string(n.Delimiter)
+		}
+		v["ref_link"] = string(n.RefLink)
+		v["is_footnotes_list"] = n.IsFootnotesList
+	case *ast.Table:
+		container(v, "table", n.Container)
+	case *ast.TableCell:
+		container(v, "table_cell", n.Container)
+		v["is_header"] = n.IsHeader
+		v["align"] = n.Align.String()
+		v["col_span"] = n.ColSpan
+	case *ast.TableHeader:
+		container(v, "table_header", n.Container)
+	case *ast.TableBody:
+		container(v, "table_body", n.Container)
+	case *ast.TableRow:
+		container(v, "table_row", n.Container)
+	case *ast.TableFooter:
+		container(v, "table_footer", n.Container)
+	case *ast.Math:
+		leaf(v, "math", n.Leaf)
+	case *ast.MathBlock:
+		container(v, "math_block", n.Container)
+	case *ast.DocumentMatter:
+		container(v, "document_matter", n.Container)
+		v["matter"] = func(m ast.DocumentMatters) string {
+			switch m {
+			case ast.DocumentMatterNone:
+				return "none"
+			case ast.DocumentMatterFront:
+				return "front"
+			case ast.DocumentMatterMain:
+				return "main"
+			case ast.DocumentMatterBack:
+				return "back"
+			default:
+				return "unknown"
+			}
+		}(n.Matter)
+	case *ast.Callout:
+		leaf(v, "callout", n.Leaf)
+		v["id"] = string(n.ID)
+	case *ast.Index:
+		leaf(v, "index", n.Leaf)
+		v["primary"] = n.Primary
+		v["item"] = string(n.Item)
+		v["subitem"] = string(n.Subitem)
+		v["id"] = n.ID
+	case *ast.Subscript:
+		leaf(v, "subscript", n.Leaf)
+	case *ast.Superscript:
+		leaf(v, "superscript", n.Leaf)
+	case *ast.Footnotes:
+		container(v, "footnotes", n.Container)
+	default:
+		// Report the dynamic type of the offending node.
+		panic(fmt.Sprintf("unknown node %T", n))
+	}
+
+	// Drop keys holding empty strings to keep the output compact.
+	for k, e := range v {
+		if s, ok := e.(string); ok && s == "" {
+			delete(v, k)
+		}
+	}
+
+	return v
+}
diff --git a/format/markdown/testdata/test.fqtest b/format/markdown/testdata/test.fqtest new file mode 100644 index 00000000..a0152ca1 --- /dev/null +++ b/format/markdown/testdata/test.fqtest @@ -0,0 +1,316 @@ +$ fq -d markdown . test.md +{ + "children": [ + { + "children": [ + "Before" + ], + "type": "paragraph" + }, + { + "children": [ + "header 1" + ], + "is_special": false, + "is_titleblock": false, + "level": 1, + "type": "heading" + }, + { + "children": [ + "Paragraph with ", + { + "children": [ + "bold" + ], + "type": "strong" + }, + " and ", + { + "children": [ + "italic" + ], + "type": "em" + }, + "\non\nmultiple\nlines."
+ ], + "type": "paragraph" + }, + { + "children": [ + { + "children": [ + "Some citation" + ], + "type": "paragraph" + } + ], + "type": "blockquote" + }, + { + "children": [ + "A footnote", + { + "additional_attributes": [], + "children": [ + "^1" + ], + "destination": "footnote1", + "note_id": 0, + "type": "link" + }, + " and this also", + { + "additional_attributes": [], + "children": [ + "^note" + ], + "destination": "footnote2", + "note_id": 0, + "type": "link" + } + ], + "type": "paragraph" + }, + { + "children": [ + "header 2" + ], + "is_special": false, + "is_titleblock": false, + "level": 2, + "type": "heading" + }, + { + "fence_length": 0, + "fence_offset": 0, + "info": "jq", + "is_fenced": true, + "literal": "code\nblock\n", + "type": "code_block" + }, + { + "fence_length": 0, + "fence_offset": 0, + "is_fenced": false, + "literal": "also\ncode\n", + "type": "code_block" + }, + { + "children": [ + "header 3" + ], + "is_special": false, + "is_titleblock": false, + "level": 3, + "type": "heading" + }, + { + "children": [ + "Some text with ", + { + "literal": "code", + "type": "code" + } + ], + "type": "paragraph" + }, + { + "children": [ + "header 4" + ], + "is_special": false, + "is_titleblock": false, + "level": 4, + "type": "heading" + }, + { + "children": [ + "Some text ", + { + "additional_attributes": [], + "children": [ + "with a link" + ], + "destination": "http://host/path", + "note_id": 0, + "type": "link" + } + ], + "type": "paragraph" + }, + { + "children": [ + "An image ", + { + "children": [ + "img alt text" + ], + "destination": "path/image.png", + "type": "image" + } + ], + "type": "paragraph" + }, + { + "children": [ + "header 5" + ], + "is_special": false, + "is_titleblock": false, + "level": 5, + "type": "heading" + }, + { + "children": [ + { + "bullet_char": "-", + "children": [ + { + "children": [ + "list of" + ], + "type": "paragraph" + } + ], + "delimiter": ".", + "is_footnotes_list": false, + "list_flags": [], + "tight": false, +
"type": "list_item" + }, + { + "bullet_char": "-", + "children": [ + { + "children": [ + "things" + ], + "type": "paragraph" + } + ], + "delimiter": ".", + "is_footnotes_list": false, + "list_flags": [], + "tight": false, + "type": "list_item" + } + ], + "delimiter": ".", + "is_footnotes_list": false, + "list_flags": [], + "start": 0, + "tight": true, + "type": "list" + }, + { + "children": [ + "a table" + ], + "type": "paragraph" + }, + { + "children": [ + { + "children": [ + { + "children": [ + { + "children": [ + "a" + ], + "col_span": 0, + "is_header": true, + "type": "table_cell" + }, + { + "children": [ + "b" + ], + "col_span": 0, + "is_header": true, + "type": "table_cell" + }, + { + "children": [ + "c" + ], + "col_span": 0, + "is_header": true, + "type": "table_cell" + } + ], + "type": "table_row" + } + ], + "type": "table_header" + }, + { + "children": [ + { + "children": [ + { + "children": [ + "1" + ], + "col_span": 0, + "is_header": false, + "type": "table_cell" + }, + { + "children": [ + "2" + ], + "col_span": 0, + "is_header": false, + "type": "table_cell" + }, + { + "children": [ + "3" + ], + "col_span": 0, + "is_header": false, + "type": "table_cell" + } + ], + "type": "table_row" + } + ], + "type": "table_body" + } + ], + "type": "table" + }, + { + "children": [ + "header 6" + ], + "is_special": false, + "is_titleblock": false, + "level": 6, + "type": "heading" + }, + { + "children": [ + "Some text with line ", + { + "literal": "<br>", + "type": "html_span" + }, + " break and ", + { + "literal": "<b>", + "type": "html_span" + }, + "bold", + { + "literal": "</b>", + "type": "html_span" + } + ], + "type": "paragraph" + } + ], + "type": "document" +} diff --git a/format/markdown/testdata/test.md b/format/markdown/testdata/test.md new file mode 100644 index 00000000..e131e5a9 --- /dev/null +++ b/format/markdown/testdata/test.md @@ -0,0 +1,50 @@ +Before + +# header 1 + +Paragraph with **bold** and *italic* +on +multiple +lines. + +> Some citation + +A footnote[^1] and this also[^note] + +## header 2 + +```jq +code +block +``` + + also + code + +### header 3 + +Some text with `code` + +#### header 4 + +Some text [with a link](http://host/path) + +An image ![img alt text](path/image.png) + +##### header 5 + +- list of +- things + +a table + +| a | b | c | +| --- | --- | --- | +| 1 | 2 | 3 | + +###### header 6 + +Some text with line <br> break and <b>bold</b> + +[^1]: footnote1 +[^note]: footnote2 diff --git a/go.mod b/go.mod index d3b54095..1b565b9f 100644 --- a/go.mod +++ b/go.mod @@ -25,6 +25,10 @@ require ( // bump: gomod-golang-snappy link "Source diff $CURRENT..$LATEST" https://github.com/golang/snappy/compare/v$CURRENT..v$LATEST github.com/golang/snappy v0.0.4 + // has no tags + // go get -d github.com/gomarkdown/markdown@master && go mod tidy + github.com/gomarkdown/markdown v0.0.0-20220627144906-e9a81102ebeb + // has no tags yet // bump-disabled: gomod-gopacket /github\.com\/gopacket\/gopacket v(.*)/ https://github.com/gopacket/gopacket.git|^1 // bump-disabled: gomod-gopacket command go get -d github.com/gopacket/gopacket@v$LATEST && go mod tidy diff --git a/go.sum b/go.sum index cd320c9f..3d064f28 100644 --- a/go.sum +++ b/go.sum @@ -4,6 +4,8 @@ github.com/creasty/defaults v1.6.0 h1:ltuE9cfphUtlrBeomuu8PEyISTXnxqkBIoQfXgv7BS github.com/creasty/defaults v1.6.0/go.mod h1:iGzKe6pbEHnpMPtfDXZEr0NVxWnPTjb1bbDy08fPzYM= github.com/golang/snappy v0.0.4 h1:yAGX7huGHXlcLOEtBnF4w7FQwA26wojNCwOYAEhLjQM= github.com/golang/snappy v0.0.4/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= +github.com/gomarkdown/markdown v0.0.0-20220627144906-e9a81102ebeb h1:5b/eFaSaKPFG9ygDBaPKkydKU5nFJYk08g9jPIVogMg= +github.com/gomarkdown/markdown v0.0.0-20220627144906-e9a81102ebeb/go.mod h1:JDGcbDT52eL4fju3sZ4TeHGsQwhG9nbDV21aMyhwPoA= github.com/gopacket/gopacket v0.0.0-20220819214934-ee81b8c880da h1:AAwDU9N39fQNYUtg270aiU6N7U2ZVsGZKiRwsCMsWEo= github.com/gopacket/gopacket v0.0.0-20220819214934-ee81b8c880da/go.mod h1:DlRRfaM/QjAu2ADqraIure1Eif0HpNL8hmyVQ+qci5Y= github.com/itchyny/timefmt-go v0.1.3 h1:7M3LGVDsqcd0VZH2U+x393obrzZisp7C0uEe921iRkU= diff --git a/pkg/interp/testdata/args.fqtest b/pkg/interp/testdata/args.fqtest index ef3272e5..5e03684e 100644 --- a/pkg/interp/testdata/args.fqtest +++ b/pkg/interp/testdata/args.fqtest @@ -162,6 +162,7 @@ json JavaScript Object Notation jsonl JavaScript Object Notation Lines
macho Mach-O macOS executable macho_fat Fat Mach-O macOS executable (multi-architecture) +markdown Markdown matroska Matroska file mp3 MP3 file mp3_frame MPEG audio layer 3 frame