Merge pull request #659 from pulsar-edit/add-markdown-tree-sitter-grammar

Add the Tree-sitter Markdown grammar
2024-09-11 11:15:24 +03:00 · 2023-08-15 17:13:26 -07:00 · 2023-08-15 17:13:26 -07:00 · 9e0db5b054
commit 9e0db5b054
parent ecaff0e996 e293df1ab1
21 changed files with 484 additions and 19 deletions
--- a/packages/atom-dark-syntax/styles/syntax-legacy/_base.less
+++ b/packages/atom-dark-syntax/styles/syntax-legacy/_base.less
@ -242,12 +242,17 @@
    color: #555;
  }

+  .syntax--punctuation.syntax--definition.syntax--list-item {
+    color: #C6C5FE;
+  }
+
  .syntax--variable.syntax--list,
  .syntax--support.syntax--quote {
    color: #555;
  }

-  .syntax--link .syntax--entity {
+  .syntax--link .syntax--entity,
+  .syntax--meta.syntax--link.syntax--text {
    color: #ddd;
  }

--- a/packages/atom-dark-syntax/styles/syntax/base.less
+++ b/packages/atom-dark-syntax/styles/syntax/base.less
@ -202,11 +202,6 @@
    color: #eee;
  }

-  // - item
-  &.syntax--list {
-    color: #555;
-  }
-
  // > quote
  &.syntax--quote {
    color: #555;
@ -226,6 +221,14 @@
  &.syntax--alt {
    color: #ddd;
  }
+
+  &.syntax--bold {
+    font-weight: bold; // `bold` is a font-weight keyword; it is not a valid font-style value
+  }
+
+  &.syntax--italic {
+    font-style: italic;
+  }
 }

 // /* comment */
--- a/packages/atom-light-syntax/styles/syntax-legacy/_base.less
+++ b/packages/atom-light-syntax/styles/syntax-legacy/_base.less
@ -170,9 +170,14 @@
    color: #888;
  }

-  .syntax--variable.syntax--list {
+  .syntax--variable.syntax--list,
+  .syntax--punctuation.syntax--definition.syntax--list-item {
    color: #888;
  }
+
+  .syntax--meta.syntax--link.syntax--text {
+    color: #555;
+  }
 }

 .syntax--markdown {
--- a/packages/atom-light-syntax/styles/syntax/base.less
+++ b/packages/atom-light-syntax/styles/syntax/base.less
@ -190,6 +190,14 @@
  &.syntax--link {
    color: #888;
  }
+
+  &.syntax--bold {
+    font-weight: bold; // `bold` is a font-weight keyword; it is not a valid font-style value
+  }
+
+  &.syntax--italic {
+    font-style: italic;
+  }
 }

 // /* comment */
--- a/packages/base16-tomorrow-dark-theme/styles/syntax-legacy/_base.less
+++ b/packages/base16-tomorrow-dark-theme/styles/syntax-legacy/_base.less
@ -274,6 +274,20 @@
    -webkit-font-smoothing: auto;
  }

+  .syntax--meta.syntax--link.syntax--text {
+    color: @cyan;
+  }
+
+  .syntax--punctuation.syntax--definition.syntax--list-item {
+    color: @red;
+  }
+
+  .syntax--markup.syntax--heading {
+    .syntax--punctuation {
+      color: inherit;
+    }
+  }
+
  .syntax--link .syntax--entity {
    color: @cyan;
  }
--- a/packages/base16-tomorrow-dark-theme/styles/syntax/base.less
+++ b/packages/base16-tomorrow-dark-theme/styles/syntax/base.less
@ -258,6 +258,13 @@
      color: @gray;
    }
  }
+
+  &.syntax--quote.syntax--blockquote {
+    color: @gray;
+    .syntax--punctuation.syntax--definition.syntax--blockquote {
+      color: @gray;
+    }
+  }
 }

 // /* comment */
--- a/packages/base16-tomorrow-light-theme/styles/syntax-legacy/_base.less
+++ b/packages/base16-tomorrow-light-theme/styles/syntax-legacy/_base.less
@ -274,6 +274,20 @@
    -webkit-font-smoothing: auto;
  }

+  .syntax--meta.syntax--link.syntax--text {
+    color: @cyan;
+  }
+
+  .syntax--punctuation.syntax--definition.syntax--list-item {
+    color: @red;
+  }
+
+  .syntax--markup.syntax--heading {
+    .syntax--punctuation {
+      color: inherit;
+    }
+  }
+
  .syntax--link .syntax--entity {
    color: @cyan;
  }
--- a/packages/base16-tomorrow-light-theme/styles/syntax/base.less
+++ b/packages/base16-tomorrow-light-theme/styles/syntax/base.less
@ -258,6 +258,13 @@
      color: @gray;
    }
  }
+
+  &.syntax--quote.syntax--blockquote {
+    color: @gray;
+    .syntax--punctuation.syntax--definition.syntax--blockquote {
+      color: @gray;
+    }
+  }
 }

 // /* comment */
--- a/packages/language-gfm/grammars/modern-tree-sitter-gfm-with-frontmatter.cson
+++ b/packages/language-gfm/grammars/modern-tree-sitter-gfm-with-frontmatter.cson
@ -0,0 +1,30 @@
+name: 'GitHub Markdown'
+scopeName: 'source.gfm'
+type: 'modern-tree-sitter'
+# Generated from `savetheclocktower/tree-sitter-frontmatter`.
+parser: 'tree-sitter-frontmatter'
+
+treeSitter:
+  grammar: 'tree-sitter/tree-sitter-frontmatter.wasm'
+  highlightsQuery: 'tree-sitter/tree-sitter-frontmatter/highlights.scm'
+
+fileTypes: [
+  'markdown'
+  'md'
+  'mdown'
+  'mdwn'
+  'mkd'
+  'mkdn'
+  'mkdown'
+  'rmd'
+  'ron'
+  'workbook'
+]
+
+firstLineRegex: '^---$'
+
+comments:
+  start: '<!--'
+  end: '-->'
+  blockStart: '<!--'
+  blockEnd: '-->'
--- a/packages/language-gfm/grammars/modern-tree-sitter-gfm.cson
+++ b/packages/language-gfm/grammars/modern-tree-sitter-gfm.cson
@ -0,0 +1,31 @@
+# This grammar doesn't have its own name because it's only meant to be injected.
+scopeName: 'source.gfm.embedded'
+type: 'modern-tree-sitter'
+parser: 'tree-sitter-markdown'
+
+injectionRegex: '(MARKDOWN|markdown|GFM|gfm)$'
+
+treeSitter:
+  grammar: 'tree-sitter/tree-sitter-markdown.wasm'
+  highlightsQuery: 'tree-sitter/tree-sitter-markdown/highlights.scm'
+  foldsQuery: 'tree-sitter/tree-sitter-markdown/folds.scm'
+  tagsQuery: 'tree-sitter/tree-sitter-markdown/tags.scm'
+
+fileTypes: [
+  'markdown'
+  'md'
+  'mdown'
+  'mdwn'
+  'mkd'
+  'mkdn'
+  'mkdown'
+  'rmd'
+  'ron'
+  'workbook'
+]
+
+comments:
+  start: '<!--'
+  end: '-->'
+  blockStart: '<!--'
+  blockEnd: '-->'
--- a/packages/language-gfm/grammars/tree-sitter/tree-sitter-frontmatter.wasm
+++ b/packages/language-gfm/grammars/tree-sitter/tree-sitter-frontmatter.wasm
--- a/packages/language-gfm/grammars/tree-sitter/tree-sitter-frontmatter/highlights.scm
+++ b/packages/language-gfm/grammars/tree-sitter/tree-sitter-frontmatter/highlights.scm
@ -0,0 +1 @@
+(front_matter) @meta.embedded.block.front-matter.gfm
--- a/packages/language-gfm/grammars/tree-sitter/tree-sitter-markdown.wasm
+++ b/packages/language-gfm/grammars/tree-sitter/tree-sitter-markdown.wasm
--- a/packages/language-gfm/grammars/tree-sitter/tree-sitter-markdown/folds.scm
+++ b/packages/language-gfm/grammars/tree-sitter/tree-sitter-markdown/folds.scm
@ -0,0 +1,22 @@
+; TODO: Folds in Markdown files will have to wait until we can add "tags" to
+; divided folds. We want an H1 section to be able to fold up everything until
+; the next H1 in the file, an H2 to fold up everything until the next H2 _or_
+; H1, an H3 to fold up everything until the next H3 _or_ H2 _or_ H1… but this
+; is not currently possible.
+
+; (atx_heading (atx_h1_marker)) @fold.start.h1 @fold.end.h1 @fold.end.h2 @fold.end.h3 @fold.end.h4 @fold.end.h5 @fold.end.h6
+;
+; (atx_heading (atx_h2_marker)) @fold.start.h2 @fold.end.h2 @fold.end.h3 @fold.end.h4 @fold.end.h5 @fold.end.h6
+;
+; (atx_heading (atx_h3_marker)) @fold.start.h3 @fold.end.h3 @fold.end.h4 @fold.end.h5 @fold.end.h6
+;
+; (atx_heading (atx_h4_marker)) @fold.start.h4 @fold.end.h4 @fold.end.h5 @fold.end.h6
+;
+; (atx_heading (atx_h5_marker)) @fold.start.h5 @fold.end.h5 @fold.end.h6
+;
+; (atx_heading (atx_h6_marker)) @fold.start.h6 @fold.end.h6
+;
+; ; [(atx_heading) (setext_heading)] @fold.end @fold.start
+
+((list_item) @fold
+  (#set! fold.endAt endPosition))
--- a/packages/language-gfm/grammars/tree-sitter/tree-sitter-markdown/highlights.scm
+++ b/packages/language-gfm/grammars/tree-sitter/tree-sitter-markdown/highlights.scm
@ -0,0 +1,187 @@
+; HEADINGS
+; ========
+
+(setext_heading
+  (heading_content) @markup.heading.heading-1.gfm
+  (setext_h1_underline) @punctuation.definition.heading-underline.gfm)
+
+(setext_heading
+  (heading_content) @markup.heading.heading-2.gfm
+  (setext_h2_underline) @punctuation.definition.heading-underline.gfm)
+
+(atx_heading
+  (atx_h1_marker) @punctuation.definition.heading.gfm
+  ) @markup.heading.heading-1.gfm
+
+(atx_heading
+  (atx_h2_marker) @punctuation.definition.heading.gfm
+  ) @markup.heading.heading-2.gfm
+
+(atx_heading
+  (atx_h3_marker) @punctuation.definition.heading.gfm
+  ) @markup.heading.heading-3.gfm
+
+(atx_heading
+  (atx_h4_marker) @punctuation.definition.heading.gfm
+  ) @markup.heading.heading-4.gfm
+
+(atx_heading
+  (atx_h5_marker) @punctuation.definition.heading.gfm
+  ) @markup.heading.heading-5.gfm
+
+(atx_heading
+  (atx_h6_marker) @punctuation.definition.heading.gfm
+  ) @markup.heading.heading-6.gfm
+
+
+; SECTIONS
+; ========
+
+(paragraph) @markup.paragraph.gfm
+
+(thematic_break) @punctuation.definition.horizontal-rule.gfm
+
+(block_quote) @markup.quote.blockquote.gfm
+((block_quote) @punctuation.definition.blockquote.gfm
+  (#set! adjust.endAfterFirstMatchOf ">"))
+
+
+; LISTS
+; =====
+
+; `markup.list` gets applied to individual list items, unintuitively. So let's
+; scope the entire list. “Tight” vs “Loose” has to do with whether each `<li>`
+; has one or more implicit `<p>` tags around it.
+
+[(tight_list) (loose_list)] @meta.list.gfm
+
+((list_item
+  (list_marker) @punctuation.definition.list-item.gfm) @markup.list.unnumbered
+  ; Instead of matching bullet or minus or plus, any not-digit here is
+  ; guaranteed to be an unordered list.
+  (#not-match? @punctuation.definition.list-item.gfm "^\\d"))
+
+((list_item
+  (list_marker) @punctuation.definition.list-item.gfm) @markup.list.numbered
+  (#match? @punctuation.definition.list-item.gfm "^\\d"))
+
+((task_list_item
+  (list_marker) @punctuation.definition.list-item.gfm) @markup.list.unnumbered
+  ; Instead of matching bullet or minus or plus, any not-digit here is
+  ; guaranteed to be an unordered list.
+  (#not-match? @punctuation.definition.list-item.gfm "^\\d"))
+
+((task_list_item
+  (list_marker) @punctuation.definition.list-item.gfm) @markup.list.numbered
+  (#match? @punctuation.definition.list-item.gfm "^\\d"))
+
+
+; INLINE/REPLACED
+; ===============
+
+; The text inside []s in anchors/image syntax.
+[(link_text) (image_description)] @string.unquoted.gfm @meta.link.text
+
+(link_label (text) @meta.link.text)
+
+; A URL between ()s in anchor syntax.
+(link_destination) @markup.underline.link.gfm
+((link) @punctuation.definition.begin.link.bracket.round.gfm
+  (#set! adjust.startAndEndAroundFirstMatchOf "(?<=\\])\\("))
+((link) @punctuation.definition.end.link.bracket.round.gfm
+  (#set! adjust.startAndEndAroundFirstMatchOf "\\)$"))
+
+((link) @punctuation.definition.begin.link.bracket.square.gfm
+  (#set! adjust.endAfterFirstMatchOf "^\\["))
+((link) @punctuation.definition.end.link.bracket.square.gfm
+  (#set! adjust.startAndEndAroundFirstMatchOf "\\](?=\\(|\\[)"))
+((link_reference_definition) @punctuation.definition.begin.link.bracket.square.gfm
+  (#set! adjust.endAfterFirstMatchOf "^\\["))
+((link_reference_definition) @punctuation.definition.end.link.bracket.square.gfm
+  (#set! adjust.startAndEndAroundFirstMatchOf "\\](?=:)")) ; in `[label]: url` the `]` is followed by `:`
+((link_reference_definition) @punctuation.separator.link.colon.gfm
+  (#set! adjust.startAndEndAroundFirstMatchOf ":"))
+
+; A URL between <>s in autolink syntax.
+(uri_autolink (text) @markup.underline.link.gfm)
+((uri_autolink) @punctuation.definition.link.begin.bracket.angle.gfm
+  (#set! adjust.endAfterFirstMatchOf "^<"))
+((uri_autolink) @punctuation.definition.link.end.bracket.angle.gfm
+  (#set! adjust.startBeforeFirstMatchOf ">$"))
+
+; A link title: `[foo](http://example.com "Example web site")`
+((link_title) @string.quoted.double.link-title.gfm
+  (#match? @string.quoted.double.link-title.gfm "^\"")
+  (#set! capture.final true))
+
+((link_title) @punctuation.definition.string.begin.gfm
+  (#match? @punctuation.definition.string.begin.gfm "^\"")
+  (#set! adjust.endAfterFirstMatchOf "^\""))
+
+((link_title) @punctuation.definition.string.end.gfm
+  (#match? @punctuation.definition.string.end.gfm "\"$")
+  (#set! adjust.startBeforeFirstMatchOf "\"$"))
+
+; Out of laziness, let's throw all other kinds of link title into the generic
+; bin — they are all delimited _somehow_, right?
+(link_title) @string.quoted.link-title.gfm
+
+; Link labels in `[foo][bar]` syntax, where `bar` is associated with a URL via
+; a subsequent footnote, actually work correctly when one runs "Link: Open" in
+; Pulsar, so these should be treated like links.
+(link_label) @markup.underline.link.link-label.gfm
+
+(image) @meta.image.gfm
+
+
+; CODE BLOCKS
+; ===========
+
+(code_span) @meta.embedded.line.inline-code.gfm @markup.raw.inline.gfm
+(info_string) @storage.modifier.language._TEXT_.gfm
+
+(fenced_code_block) @markup.code.fenced.gfm @meta.embedded.block.fenced-code.gfm
+(indented_code_block) @markup.code.indented.gfm @meta.embedded.block.indented-code.gfm
+
+
+; BOLD/ITALIC/OTHER
+; =================
+
+(emphasis) @markup.italic.gfm
+(strong_emphasis) @markup.bold.gfm
+(strikethrough) @markup.strike.gfm
+
+((emphasis) @punctuation.delimiter.emphasis.begin.gfm
+  (#set! adjust.startAndEndAroundFirstMatchOf "^(\\*|_)"))
+
+((emphasis) @punctuation.delimiter.emphasis.end.gfm
+  (#set! adjust.startAndEndAroundFirstMatchOf "(\\*|_)$"))
+
+((strong_emphasis) @punctuation.delimiter.emphasis.begin.gfm
+  (#set! adjust.startAndEndAroundFirstMatchOf "^(\\*{2}|_{2})"))
+
+((strong_emphasis) @punctuation.delimiter.emphasis.end.gfm
+  (#set! adjust.startAndEndAroundFirstMatchOf "(\\*{2}|_{2})$"))
+
+((strikethrough) @punctuation.delimiter.strike.begin.gfm
+  (#set! adjust.startAndEndAroundFirstMatchOf "^~~"))
+
+((strikethrough) @punctuation.delimiter.strike.end.gfm
+  (#set! adjust.startAndEndAroundFirstMatchOf "~~$")) ; trailing `~~`, so this is the end delimiter
+
+; HTML
+; ====
+
+(html_comment) @comment.block.html
+
+; MISC
+; ====
+
+(table) @markup.other.table.gfm
+(table_header_row (table_cell) @markup.other.table-cell.header.gfm)
+(table_data_row (table_cell) @markup.other.table-cell.data.gfm)
+
+(table_delimiter_row (table_column_alignment) @punctuation.separator.table-row.gfm)
+
+
+(backslash_escape) @constant.character.escape.gfm
--- a/packages/language-gfm/grammars/tree-sitter/tree-sitter-markdown/indents.scm
+++ b/packages/language-gfm/grammars/tree-sitter/tree-sitter-markdown/indents.scm
@ -0,0 +1,3 @@
+; Intentionally empty indents.scm. By and large, indentation level should be
+; manually controlled by the user in Markdown; the best thing we can do is stay
+; out of the user's way.
--- a/packages/language-gfm/grammars/tree-sitter/tree-sitter-markdown/tags.scm
+++ b/packages/language-gfm/grammars/tree-sitter/tree-sitter-markdown/tags.scm
@ -0,0 +1,48 @@
+
+((atx_heading
+  (atx_h1_marker)
+  (heading_content) @name) @definition.heading
+  (#set! symbol.strip "(^\\s*|\\s*$)")
+  (#set! symbol.prepend "· "))
+
+((atx_heading
+  (atx_h2_marker)
+  (heading_content) @name) @definition.heading
+  (#set! symbol.strip "(^\\s*|\\s*$)")
+  (#set! symbol.prepend "·· "))
+
+((atx_heading
+  (atx_h3_marker)
+  (heading_content) @name) @definition.heading
+  (#set! symbol.strip "(^\\s*|\\s*$)")
+  (#set! symbol.prepend "··· "))
+
+((atx_heading
+  (atx_h4_marker)
+  (heading_content) @name) @definition.heading
+  (#set! symbol.strip "(^\\s*|\\s*$)")
+  (#set! symbol.prepend "···· "))
+
+((atx_heading
+  (atx_h5_marker)
+  (heading_content) @name) @definition.heading
+  (#set! symbol.strip "(^\\s*|\\s*$)")
+  (#set! symbol.prepend "····· "))
+
+((atx_heading
+  (atx_h6_marker)
+  (heading_content) @name) @definition.heading
+  (#set! symbol.strip "(^\\s*|\\s*$)")
+  (#set! symbol.prepend "······ "))
+
+((setext_heading
+  (heading_content) @name
+  (setext_h1_underline)) @definition.heading ; the underline is matched as a child of the heading node
+  (#set! symbol.strip "(^\\s*|\\s*$)")
+  (#set! symbol.prepend "· "))
+
+((setext_heading
+  (heading_content) @name
+  (setext_h2_underline)) @definition.heading ; the underline is matched as a child of the heading node
+  (#set! symbol.strip "(^\\s*|\\s*$)")
+  (#set! symbol.prepend "·· "))
--- a/packages/language-gfm/lib/main.js
+++ b/packages/language-gfm/lib/main.js
@ -0,0 +1,89 @@
+// Entry point for `language-gfm`: registers every injection point that the
+// modern Tree-sitter Markdown grammars depend on.
+exports.activate = () => {
+  // Collects the inline HTML tag nodes found beneath a paragraph node. The
+  // type list is rebuilt on each call rather than shared between calls.
+  const findInlineHtmlTags = (paragraph) => {
+    return paragraph.descendantsOfType([
+      'html_open_tag',
+      'html_close_tag',
+      'html_self_closing_tag'
+    ]);
+  };
+
+  // `source.gfm` is parsed by a tiny top-level grammar that only separates
+  // YAML front matter (when present) from the rest of the document. It exists
+  // because the `ikatyang/tree-sitter-markdown` parser, while otherwise a
+  // strong Markdown parser, does not recognize front matter. Should the
+  // `MDeiml/tree-sitter-markdown` parser become more stable, switching to it
+  // would make this extra layer unnecessary.
+
+  // Front matter: its `text` descendants go to the YAML grammar.
+  atom.grammars.addInjectionPoint('source.gfm', {
+    type: 'front_matter',
+    language() {
+      return 'yaml';
+    },
+    content: (node) => node.descendantsOfType('text')
+  });
+
+  // Remainder: the whole node goes to the Markdown grammar. Passing a null
+  // `languageScope` suppresses the injected grammar's `source.gfm.embedded`
+  // scope name, which is still usable as the target of the injections below.
+  atom.grammars.addInjectionPoint('source.gfm', {
+    type: 'remainder',
+    language() {
+      return 'markdown';
+    },
+    content(node) {
+      return node;
+    },
+    languageScope: null
+  });
+
+  // Block-level HTML is handed to the HTML grammar as-is.
+  atom.grammars.addInjectionPoint('source.gfm.embedded', {
+    type: 'html_block',
+    language() {
+      return 'html';
+    },
+    content(node) {
+      return node;
+    },
+    includeChildren: true
+  });
+
+  // Inline HTML inside a paragraph: inject only when the paragraph contains
+  // at least one HTML tag node, and cover only those tag nodes.
+  atom.grammars.addInjectionPoint('source.gfm.embedded', {
+    type: 'paragraph',
+    language(node) {
+      const tags = findInlineHtmlTags(node);
+      return tags.length === 0 ? null : 'html';
+    },
+    content: (node) => findInlineHtmlTags(node),
+    includeChildren: true
+  });
+
+  // Fenced code blocks such as
+  //
+  // ```foo
+  // (code goes here)
+  // ```
+  //
+  // are injected using the info string (`foo`) as the language name, on the
+  // theory that some grammar's `injectionRegex` will match it. Blocks whose
+  // first named child is not an info string get no injection.
+  atom.grammars.addInjectionPoint('source.gfm.embedded', {
+    type: 'fenced_code_block',
+    language: (node) => {
+      const firstChild = node?.firstNamedChild;
+      return firstChild?.type === 'info_string' ? firstChild.text : null;
+    },
+    content: (node) => node.descendantsOfType('code_fence_content'),
+    languageScope(grammar) {
+      return `${grammar.scopeName}.embedded`;
+    },
+    includeChildren: true
+  });
+};
--- a/packages/language-gfm/package.json
+++ b/packages/language-gfm/package.json
@ -1,6 +1,7 @@
 {
  "name": "language-gfm",
  "version": "0.90.8",
+  "main": "lib/main",
  "description": "Syntax highlighting and snippets for GitHub Flavored Markdown (GFM).",
  "repository": "https://github.com/pulsar-edit/pulsar",
  "license": "MIT",
--- a/packages/one-dark-syntax/styles/syntax-legacy/_base.less
+++ b/packages/one-dark-syntax/styles/syntax-legacy/_base.less
@ -129,11 +129,6 @@
      color: @mono-1;
    }

-    &.syntax--heading,
-    &.syntax--identity {
-      color: @hue-2;
-    }
-
    &.syntax--bold {
      color: @hue-6-2;
      font-weight: bold;
@ -299,7 +294,7 @@
    color: @hue-5;

    .syntax--punctuation.syntax--definition.syntax--heading {
-      color: @hue-2;
+      color: @hue-5;
    }
  }

--- a/packages/one-light-syntax/styles/syntax-legacy/_base.less
+++ b/packages/one-light-syntax/styles/syntax-legacy/_base.less
@ -129,11 +129,6 @@
      color: @mono-1;
    }

-    &.syntax--heading,
-    &.syntax--identity {
-      color: @hue-2;
-    }
-
    &.syntax--bold {
      color: @hue-6-2;
      font-weight: bold;
@ -299,7 +294,7 @@
    color: @hue-5;

    .syntax--punctuation.syntax--definition.syntax--heading {
-      color: @hue-2;
+      color: @hue-5;
    }
  }
				`@ -0,0 +1 @@`
				`(front_matter) @meta.embedded.block.front-matter.gfm`