From 9c6c63a2be30252a6207f4aebb5e0f76f746b4c8 Mon Sep 17 00:00:00 2001 From: Timothy DeHerrera Date: Fri, 28 Apr 2023 03:21:34 -0600 Subject: [PATCH] inject language based on file extension & shebang (#3970) * inject language based on file extension Nodes can now be captured with "injection.filename". If this capture contains a valid file extension known to Helix, then the content will be highlighted as that language. * inject language by shebang Nodes can now be captured with "injection.shebang". If this capture contains a valid shebang line known to Helix, then the content will be highlighted as the language the shebang calls for. * add documentation for language injection * nix: fix highlights The `@` is now highlighted properly on either side of the function arg. Also, extending the phases with `buildPhase = prev.buildPhase + ''''` is now highlighted properly. Fix highlighting of `''$` style escapes (requires tree-sitter-nix bump) Fix `inherit` highlighting. * simplify injection_for_match Split out injection pair logic into its own method to make the overall flow easier to follow. Also transform the top-level function into a method on a HighlightConfiguration. 
* markdown: add shebang injection query --- book/src/SUMMARY.md | 1 + book/src/guides/injection.md | 57 +++++++ helix-core/src/syntax.rs | 207 +++++++++++++++--------- helix-term/src/ui/markdown.rs | 6 +- languages.toml | 2 +- runtime/queries/markdown/injections.scm | 4 + runtime/queries/nix/highlights.scm | 5 +- runtime/queries/nix/injections.scm | 18 ++- 8 files changed, 216 insertions(+), 84 deletions(-) create mode 100644 book/src/guides/injection.md diff --git a/book/src/SUMMARY.md b/book/src/SUMMARY.md index 6e780b87f..ba330cf77 100644 --- a/book/src/SUMMARY.md +++ b/book/src/SUMMARY.md @@ -16,3 +16,4 @@ # Summary - [Adding languages](./guides/adding_languages.md) - [Adding textobject queries](./guides/textobject.md) - [Adding indent queries](./guides/indent.md) + - [Adding injection queries](./guides/injection.md) diff --git a/book/src/guides/injection.md b/book/src/guides/injection.md new file mode 100644 index 000000000..18c474cfe --- /dev/null +++ b/book/src/guides/injection.md @@ -0,0 +1,57 @@ +# Adding Injection Queries + +Writing language injection queries allows one to highlight a specific node as a different language. +In addition to the [standard][upstream-docs] language injection options used by tree-sitter, there +are a few Helix-specific extensions that allow for more control. + +An example of a simple query that would highlight all strings as bash in Nix: +```scm +((string_expression (string_fragment) @injection.content) + (#set! injection.language "bash")) +``` + +## Capture Types + +- `@injection.language` (standard): +The captured node may contain the language name used to highlight the node captured by +`@injection.content`. + +- `@injection.content` (standard): +Marks the content to be highlighted as the language captured with `@injection.language` _et al_. + +- `@injection.filename` (extension): +The captured node may contain a filename with a file-extension known to Helix, +highlighting `@injection.content` as that language. 
This uses the language extensions defined in +both the default `languages.toml` distributed with Helix, as well as user-defined languages. + +- `@injection.shebang` (extension): +The captured node may contain a shebang used to choose a language to highlight as. This also uses +the shebangs defined in the default and user `languages.toml`. + +## Settings + +- `injection.combined` (standard): +Indicates that all the matching nodes in the tree should have their content parsed as one +nested document. + +- `injection.language` (standard): +Forces the captured content to be highlighted as the given language. + +- `injection.include-children` (standard): +Indicates that the content node’s entire text should be re-parsed, including the text of its child +nodes. By default, child nodes’ text will be excluded from the injected document. + +- `injection.include-unnamed-children` (extension): +Same as `injection.include-children` but only for unnamed child nodes. + +## Predicates + +- `#eq?` (standard): +The first argument (a capture) must be equal to the second argument +(a capture or a string). + +- `#match?` (standard): +The first argument (a capture) must match the regex given in the +second argument (a string). 
+ +[upstream-docs]: http://tree-sitter.github.io/tree-sitter/syntax-highlighting#language-injection diff --git a/helix-core/src/syntax.rs b/helix-core/src/syntax.rs index c34ea81a3..6514b40f5 100644 --- a/helix-core/src/syntax.rs +++ b/helix-core/src/syntax.rs @@ -662,9 +662,8 @@ pub fn language_config_for_file_name(&self, path: &Path) -> Option Option> { let line = Cow::from(source.line(0)); - static SHEBANG_REGEX: Lazy = Lazy::new(|| { - Regex::new(r"^#!\s*(?:\S*[/\\](?:env\s+(?:\-\S+\s+)*)?)?([^\s\.\d]+)").unwrap() - }); + static SHEBANG_REGEX: Lazy = + Lazy::new(|| Regex::new(&["^", SHEBANG].concat()).unwrap()); let configuration_id = SHEBANG_REGEX .captures(&line) .and_then(|cap| self.language_config_ids_by_shebang.get(&cap[1])); @@ -686,15 +685,14 @@ pub fn language_config_for_language_id(&self, id: &str) -> Option Option> { + /// Unlike language_config_for_language_id, which only returns Some for an exact id, this + /// function will perform a regex match on the given string to find the closest language match. 
+ pub fn language_config_for_name(&self, name: &str) -> Option> { let mut best_match_length = 0; let mut best_match_position = None; for (i, configuration) in self.language_configs.iter().enumerate() { if let Some(injection_regex) = &configuration.injection_regex { - if let Some(mat) = injection_regex.find(string) { + if let Some(mat) = injection_regex.find(name) { let length = mat.end() - mat.start(); if length > best_match_length { best_match_position = Some(i); @@ -704,11 +702,20 @@ pub fn language_configuration_for_injection_string( } } - if let Some(i) = best_match_position { - let configuration = &self.language_configs[i]; - return Some(configuration.clone()); + best_match_position.map(|i| self.language_configs[i].clone()) + } + + pub fn language_configuration_for_injection_string( + &self, + capture: &InjectionLanguageMarker, + ) -> Option> { + match capture { + InjectionLanguageMarker::Name(string) => self.language_config_for_name(string), + InjectionLanguageMarker::Filename(file) => self.language_config_for_file_name(file), + InjectionLanguageMarker::Shebang(shebang) => { + self.language_config_for_language_id(shebang) + } } - None } pub fn language_configs(&self) -> impl Iterator> { @@ -800,7 +807,7 @@ pub fn update( queue.push_back(self.root); let scopes = self.loader.scopes.load(); - let injection_callback = |language: &str| { + let injection_callback = |language: &InjectionLanguageMarker| { self.loader .language_configuration_for_injection_string(language) .and_then(|language_config| language_config.highlight_config(&scopes)) @@ -961,12 +968,9 @@ fn point_sub(a: Point, b: Point) -> Point { ); let mut injections = Vec::new(); for mat in matches { - let (language_name, content_node, included_children) = injection_for_match( - &layer.config, - &layer.config.injections_query, - &mat, - source_slice, - ); + let (injection_capture, content_node, included_children) = layer + .config + .injection_for_match(&layer.config.injections_query, &mat, source_slice); 
// Explicitly remove this match so that none of its other captures will remain // in the stream of captures. @@ -974,9 +978,10 @@ fn point_sub(a: Point, b: Point) -> Point { // If a language is found with the given name, then add a new language layer // to the highlighted document. - if let (Some(language_name), Some(content_node)) = (language_name, content_node) + if let (Some(injection_capture), Some(content_node)) = + (injection_capture, content_node) { - if let Some(config) = (injection_callback)(&language_name) { + if let Some(config) = (injection_callback)(&injection_capture) { let ranges = intersect_ranges(&layer.ranges, &[content_node], included_children); @@ -1001,14 +1006,11 @@ fn point_sub(a: Point, b: Point) -> Point { ); for mat in matches { let entry = &mut injections_by_pattern_index[mat.pattern_index]; - let (language_name, content_node, included_children) = injection_for_match( - &layer.config, - combined_injections_query, - &mat, - source_slice, - ); - if language_name.is_some() { - entry.0 = language_name; + let (injection_capture, content_node, included_children) = layer + .config + .injection_for_match(combined_injections_query, &mat, source_slice); + if injection_capture.is_some() { + entry.0 = injection_capture; } if let Some(content_node) = content_node { entry.1.push(content_node); @@ -1395,6 +1397,8 @@ pub struct HighlightConfiguration { non_local_variable_patterns: Vec, injection_content_capture_index: Option, injection_language_capture_index: Option, + injection_filename_capture_index: Option, + injection_shebang_capture_index: Option, local_scope_capture_index: Option, local_def_capture_index: Option, local_def_value_capture_index: Option, @@ -1538,6 +1542,8 @@ pub fn new( // Store the numeric ids for all of the special captures. 
let mut injection_content_capture_index = None; let mut injection_language_capture_index = None; + let mut injection_filename_capture_index = None; + let mut injection_shebang_capture_index = None; let mut local_def_capture_index = None; let mut local_def_value_capture_index = None; let mut local_ref_capture_index = None; @@ -1558,6 +1564,8 @@ pub fn new( match name.as_str() { "injection.content" => injection_content_capture_index = i, "injection.language" => injection_language_capture_index = i, + "injection.filename" => injection_filename_capture_index = i, + "injection.shebang" => injection_shebang_capture_index = i, _ => {} } } @@ -1573,6 +1581,8 @@ pub fn new( non_local_variable_patterns, injection_content_capture_index, injection_language_capture_index, + injection_filename_capture_index, + injection_shebang_capture_index, local_scope_capture_index, local_def_capture_index, local_def_value_capture_index, @@ -1631,6 +1641,90 @@ pub fn configure(&self, recognized_names: &[String]) { self.highlight_indices.store(Arc::new(indices)); } + + fn injection_pair<'a>( + &self, + query_match: &QueryMatch<'a, 'a>, + source: RopeSlice<'a>, + ) -> (Option>, Option>) { + let mut injection_capture = None; + let mut content_node = None; + + for capture in query_match.captures { + let index = Some(capture.index); + if index == self.injection_language_capture_index { + let name = byte_range_to_str(capture.node.byte_range(), source); + injection_capture = Some(InjectionLanguageMarker::Name(name)); + } else if index == self.injection_filename_capture_index { + let name = byte_range_to_str(capture.node.byte_range(), source); + let path = Path::new(name.as_ref()).to_path_buf(); + injection_capture = Some(InjectionLanguageMarker::Filename(path.into())); + } else if index == self.injection_shebang_capture_index { + let node_slice = source.byte_slice(capture.node.byte_range()); + + // some languages allow space and newlines before the actual string content + // so a shebang could be on 
either the first or second line + let lines = if let Ok(end) = node_slice.try_line_to_byte(2) { + node_slice.byte_slice(..end) + } else { + node_slice + }; + + static SHEBANG_REGEX: Lazy = Lazy::new(|| Regex::new(SHEBANG).unwrap()); + + injection_capture = SHEBANG_REGEX + .captures(&Cow::from(lines)) + .map(|cap| InjectionLanguageMarker::Shebang(cap[1].to_owned())) + } else if index == self.injection_content_capture_index { + content_node = Some(capture.node); + } + } + (injection_capture, content_node) + } + + fn injection_for_match<'a>( + &self, + query: &'a Query, + query_match: &QueryMatch<'a, 'a>, + source: RopeSlice<'a>, + ) -> ( + Option>, + Option>, + IncludedChildren, + ) { + let (mut injection_capture, content_node) = self.injection_pair(query_match, source); + + let mut included_children = IncludedChildren::default(); + for prop in query.property_settings(query_match.pattern_index) { + match prop.key.as_ref() { + // In addition to specifying the language name via the text of a + // captured node, it can also be hard-coded via a `#set!` predicate + // that sets the injection.language key. + "injection.language" if injection_capture.is_none() => { + injection_capture = prop + .value + .as_ref() + .map(|s| InjectionLanguageMarker::Name(s.as_ref().into())); + } + + // By default, injections do not include the *children* of an + // `injection.content` node - only the ranges that belong to the + // node itself. This can be changed using a `#set!` predicate that + // sets the `injection.include-children` key. + "injection.include-children" => included_children = IncludedChildren::All, + + // Some queries might only exclude named children but include unnamed + // children in their `injection.content` node. This can be enabled using + // a `#set!` predicate that sets the `injection.include-unnamed-children` key. 
+ "injection.include-unnamed-children" => { + included_children = IncludedChildren::Unnamed + } + _ => {} + } + } + + (injection_capture, content_node, included_children) + } } impl<'a> HighlightIterLayer<'a> { @@ -2042,56 +2136,15 @@ fn next(&mut self) -> Option { } } -fn injection_for_match<'a>( - config: &HighlightConfiguration, - query: &'a Query, - query_match: &QueryMatch<'a, 'a>, - source: RopeSlice<'a>, -) -> (Option>, Option>, IncludedChildren) { - let content_capture_index = config.injection_content_capture_index; - let language_capture_index = config.injection_language_capture_index; - - let mut language_name = None; - let mut content_node = None; - for capture in query_match.captures { - let index = Some(capture.index); - if index == language_capture_index { - let name = byte_range_to_str(capture.node.byte_range(), source); - language_name = Some(name); - } else if index == content_capture_index { - content_node = Some(capture.node); - } - } - - let mut included_children = IncludedChildren::default(); - for prop in query.property_settings(query_match.pattern_index) { - match prop.key.as_ref() { - // In addition to specifying the language name via the text of a - // captured node, it can also be hard-coded via a `#set!` predicate - // that sets the injection.language key. - "injection.language" => { - if language_name.is_none() { - language_name = prop.value.as_ref().map(|s| s.as_ref().into()) - } - } - - // By default, injections do not include the *children* of an - // `injection.content` node - only the ranges that belong to the - // node itself. This can be changed using a `#set!` predicate that - // sets the `injection.include-children` key. - "injection.include-children" => included_children = IncludedChildren::All, - - // Some queries might only exclude named children but include unnamed - // children in their `injection.content` node. This can be enabled using - // a `#set!` predicate that sets the `injection.include-unnamed-children` key. 
- "injection.include-unnamed-children" => included_children = IncludedChildren::Unnamed, - _ => {} - } - } - - (language_name, content_node, included_children) +#[derive(Debug, Clone)] +pub enum InjectionLanguageMarker<'a> { + Name(Cow<'a, str>), + Filename(Cow<'a, Path>), + Shebang(String), } +const SHEBANG: &str = r"#!\s*(?:\S*[/\\](?:env\s+(?:\-\S+\s+)*)?)?([^\s\.\d]+)"; + pub struct Merge { iter: I, spans: Box)>>, diff --git a/helix-term/src/ui/markdown.rs b/helix-term/src/ui/markdown.rs index 87136992c..fea3de78f 100644 --- a/helix-term/src/ui/markdown.rs +++ b/helix-term/src/ui/markdown.rs @@ -9,7 +9,7 @@ use pulldown_cmark::{CodeBlockKind, Event, HeadingLevel, Options, Parser, Tag}; use helix_core::{ - syntax::{self, HighlightEvent, Syntax}, + syntax::{self, HighlightEvent, InjectionLanguageMarker, Syntax}, Rope, }; use helix_view::{ @@ -47,7 +47,9 @@ pub fn highlighted_code_block<'a>( let rope = Rope::from(text.as_ref()); let syntax = config_loader - .language_configuration_for_injection_string(language) + .language_configuration_for_injection_string(&InjectionLanguageMarker::Name( + language.into(), + )) .and_then(|config| config.highlight_config(theme.scopes())) .map(|config| Syntax::new(&rope, config, Arc::clone(&config_loader))); diff --git a/languages.toml b/languages.toml index 14da46c15..9a0a538a0 100644 --- a/languages.toml +++ b/languages.toml @@ -591,7 +591,7 @@ indent = { tab-width = 2, unit = " " } [[grammar]] name = "nix" -source = { git = "https://github.com/nix-community/tree-sitter-nix", rev = "6b71a810c0acd49b980c50fc79092561f7cee307" } +source = { git = "https://github.com/nix-community/tree-sitter-nix", rev = "1b69cf1fa92366eefbe6863c184e5d2ece5f187d" } [[language]] name = "ruby" diff --git a/runtime/queries/markdown/injections.scm b/runtime/queries/markdown/injections.scm index e88393512..80977459e 100644 --- a/runtime/queries/markdown/injections.scm +++ b/runtime/queries/markdown/injections.scm @@ -1,5 +1,9 @@ ; From 
nvim-treesitter/nvim-treesitter +(fenced_code_block + (code_fence_content) @injection.shebang @injection.content + (#set! injection.include-unnamed-children)) + (fenced_code_block (info_string (language) @injection.language) diff --git a/runtime/queries/nix/highlights.scm b/runtime/queries/nix/highlights.scm index a998aa644..4633e1786 100644 --- a/runtime/queries/nix/highlights.scm +++ b/runtime/queries/nix/highlights.scm @@ -47,8 +47,10 @@ (float_expression) @constant.numeric.float (escape_sequence) @constant.character.escape +(dollar_escape) @constant.character.escape (function_expression + "@"? @punctuation.delimiter universal: (identifier) @variable.parameter "@"? @punctuation.delimiter ) @@ -82,7 +84,8 @@ (binding attrpath: (attrpath attr: (identifier)) @variable.other.member) -(inherit_from attrs: (inherited_attrs attr: (identifier) @variable)) +(inherit_from attrs: (inherited_attrs attr: (identifier) @variable.other.member)) +(inherited_attrs attr: (identifier) @variable) (has_attr_expression expression: (_) diff --git a/runtime/queries/nix/injections.scm b/runtime/queries/nix/injections.scm index 62b48233a..1da63ce08 100644 --- a/runtime/queries/nix/injections.scm +++ b/runtime/queries/nix/injections.scm @@ -10,9 +10,11 @@ ; such as those of stdenv.mkDerivation. ((binding attrpath: (attrpath (identifier) @_path) - expression: (indented_string_expression - (string_fragment) @injection.content)) - (#match? @_path "(^\\w*Phase|(pre|post)\\w*|(.*\\.)?\\w*([sS]cript|[hH]ook)|(.*\\.)?startup)$") + expression: [ + (indented_string_expression (string_fragment) @injection.content) + (binary_expression (indented_string_expression (string_fragment) @injection.content)) + ]) + (#match? @_path "(^\\w*Phase|command|(pre|post)\\w*|(.*\\.)?\\w*([sS]cript|[hH]ook)|(.*\\.)?startup)$") (#set! injection.language "bash") (#set! injection.combined)) @@ -150,3 +152,13 @@ ; (#match? @_func "(^|\\.)writeFSharp(Bin)?$") ; (#set! injection.language "f-sharp") ; (#set! 
injection.combined)) + +((apply_expression + function: (apply_expression function: (_) @_func + argument: (string_expression (string_fragment) @injection.filename)) + argument: (indented_string_expression (string_fragment) @injection.content)) + (#match? @_func "(^|\\.)write(Text|Script(Bin)?)$") + (#set! injection.combined)) + +((indented_string_expression (string_fragment) @injection.shebang @injection.content) + (#set! injection.combined)) \ No newline at end of file