Skip to content

Commit 29eff34

Browse files
committed
Highlight regular expressions with tree-sitter-regex grammar
This grammar is bundled in NixOS by default and seems good enough for Java regular expressions. It is also maintained under the tree-sitter GitHub org, so it is "official". In order to properly identify the #" and closing " characters we have to parse them with the clojure grammar (in case the regex grammar is not available) and again with the regex grammar as part of the actual pattern. This could be avoided if either the clojure grammar captured a node for the inner contents of the regex literal, or the treesit-range-settings supported some kind of offset argument like the Neovim tree-sitter mechanisms do. Should address issue #11. I think that multiple parsers per buffer may be too buggy to use right now. There are situations where no regex will be present in a buffer, but the entire buffer will be highlighted as a regular expression. This functionality probably needs upstream work in Emacs before we can merge this into the main branch of clojure-ts-mode.
1 parent 5125a56 commit 29eff34

File tree

2 files changed

+236
-133
lines changed

2 files changed

+236
-133
lines changed

clojure-ts-mode.el

+235-132
Original file line numberDiff line numberDiff line change
@@ -277,138 +277,224 @@ Only intended for use at development time.")
277277
"defstruct")
278278
line-end))
279279

280-
(defun clojure-ts--font-lock-settings ()
281-
"Return font lock settings suitable for use in `treesit-font-lock-settings'."
282-
(treesit-font-lock-rules
283-
:feature 'string
284-
:language 'clojure
285-
'((str_lit) @font-lock-string-face
286-
(regex_lit) @font-lock-string-face)
280+
(defvar clojure-ts-regex-grammar-git-url
281+
"https://github.com/tree-sitter/tree-sitter-regex.git"
282+
"The URL to install the regex grammar from.")
287283

288-
:feature 'regex
289-
:language 'clojure
290-
:override t
291-
'((regex_lit marker: _ @font-lock-property-face))
292-
293-
:feature 'number
294-
:language 'clojure
295-
'((num_lit) @font-lock-number-face)
296-
297-
:feature 'constant
298-
:language 'clojure
299-
'([(bool_lit) (nil_lit)] @font-lock-constant-face)
300-
301-
:feature 'char
302-
:language 'clojure
303-
'((char_lit) @clojure-ts-character-face)
304-
305-
:feature 'keyword
306-
:language 'clojure
307-
'((kwd_ns) @font-lock-type-face
308-
(kwd_name) @clojure-ts-keyword-face
309-
(kwd_lit
310-
marker: _ @clojure-ts-keyword-face
311-
delimiter: _ :? @default))
312-
313-
:feature 'builtin
314-
:language 'clojure
315-
`(((list_lit :anchor (sym_lit (sym_name) @font-lock-keyword-face))
316-
(:match ,clojure-ts--builtin-symbol-regexp @font-lock-keyword-face))
317-
((sym_name) @font-lock-builtin-face
318-
(:match ,clojure-ts--builtin-dynamic-var-regexp @font-lock-builtin-face)))
319-
320-
:feature 'symbol
321-
:language 'clojure
322-
'((sym_ns) @font-lock-type-face)
323-
324-
;; How does this work for defns nested in other forms, not at the top level?
325-
;; Should I match against the source node to only hit the top level? Can that be expressed?
326-
;; What about valid usages like `(let [closed 1] (defn +closed [n] (+ n closed)))'??
327-
;; No wonder the tree-sitter-clojure grammar only touches syntax, and not semantics
328-
:feature 'definition ;; defn and defn like macros
329-
:language 'clojure
330-
`(((list_lit :anchor (sym_lit (sym_name) @font-lock-keyword-face)
331-
:anchor (sym_lit (sym_name) @font-lock-function-name-face))
332-
(:match ,clojure-ts--definition-keyword-regexp
333-
@font-lock-keyword-face))
334-
((anon_fn_lit
335-
marker: "#" @font-lock-property-face)))
336-
337-
:feature 'variable ;; def, defonce
338-
:language 'clojure
339-
`(((list_lit :anchor (sym_lit (sym_name) @font-lock-keyword-face)
340-
:anchor (sym_lit (sym_name) @font-lock-variable-name-face))
341-
(:match ,clojure-ts--variable-keyword-regexp @font-lock-keyword-face)))
342-
343-
:feature 'type ;; deftype, defmulti, defprotocol, etc
344-
:language 'clojure
345-
`(((list_lit :anchor (sym_lit (sym_name) @font-lock-keyword-face)
346-
:anchor (sym_lit (sym_name) @font-lock-type-face))
347-
(:match ,clojure-ts--type-keyword-regexp @font-lock-keyword-face)))
348-
349-
:feature 'metadata
350-
:language 'clojure
351-
:override t
352-
`((meta_lit marker: "^" @font-lock-property-face)
353-
(meta_lit value: (kwd_lit) @font-lock-property-face) ;; metadata
354-
(meta_lit value: (sym_lit (sym_name) @font-lock-type-face)) ;; typehint
355-
(old_meta_lit marker: "#^" @font-lock-property-face)
356-
(old_meta_lit value: (kwd_lit) @font-lock-property-face) ;; metadata
357-
(old_meta_lit value: (sym_lit (sym_name) @font-lock-type-face))) ;; typehint
358-
359-
:feature 'tagged-literals
360-
:language 'clojure
361-
:override t
362-
'((tagged_or_ctor_lit marker: "#" @font-lock-preprocessor-face
363-
tag: (sym_lit) @font-lock-preprocessor-face))
284+
(defvar clojure-ts-regex-grammar-git-ref
285+
"v0.20.0"
286+
"The branch or tag to use when installing the regex gramar.")
364287

365-
;; TODO, also account for `def'
366-
;; Figure out how to highlight symbols in docstrings.
367-
:feature 'doc
368-
:language 'clojure
369-
:override t
370-
`(((list_lit :anchor (sym_lit) @def_symbol
371-
:anchor (sym_lit) @function_name
372-
:anchor (str_lit) @font-lock-doc-face)
373-
(:match ,clojure-ts--definition-keyword-regexp @def_symbol)))
374-
375-
:feature 'quote
376-
:language 'clojure
377-
'((quoting_lit
378-
marker: _ @font-lock-delimiter-face)
379-
(var_quoting_lit
380-
marker: _ @font-lock-delimiter-face)
381-
(syn_quoting_lit
382-
marker: _ @font-lock-delimiter-face)
383-
(unquoting_lit
384-
marker: _ @font-lock-delimiter-face)
385-
(unquote_splicing_lit
386-
marker: _ @font-lock-delimiter-face)
387-
(var_quoting_lit
388-
marker: _ @font-lock-delimiter-face))
389-
390-
:feature 'bracket
391-
:language 'clojure
392-
'((["(" ")" "[" "]" "{" "}"]) @font-lock-bracket-face
393-
(set_lit :anchor "#" @font-lock-bracket-face))
394-
395-
:feature 'comment
396-
:language 'clojure
288+
(defun clojure-ts-install-regex-grammar ()
289+
"Install the grammar needed by `clojure-ts-mode' for regex literal font-locking."
290+
(interactive)
291+
(add-to-list
292+
'treesit-language-source-alist
293+
`(regex . (,clojure-ts-regex-grammar-git-url ,clojure-ts-regex-grammar-git-ref)))
294+
(treesit-install-language-grammar 'regex))
295+
296+
(defvar clojure-ts--supress-regex-grammar-install-message
297+
nil
298+
"When non-nil, do not show message about installing the regex grammar.")
299+
300+
(defun clojure-ts--notify-regex-grammar-missing ()
301+
"Show the users a one-time message about installing the regex grammar."
302+
(unless clojure-ts--supress-regex-grammar-install-message
303+
(message (concat "To add support for regular expression font locking "
304+
"in clojure-ts-mode "
305+
"run `M-x clojure-ts-install-regex-grammar <RET>`."))
306+
(setq clojure-ts--supress-regex-grammar-install-message t)))
307+
308+
(defun clojure-ts--regex-font-lock-compatibility-ab4eb4b ()
309+
"Font-lock helper to handle breaking changes in different releases of tree-sitter-regex."
310+
;; Prior to ab4eb4b, lookahead_assertion was a named node
311+
;; After ab4eb4b was applied, it was replaced by lookaround_assertion, which includes
312+
;; both lookahead and lookbehind assertions.
313+
(condition-case nil
314+
(progn (treesit-query-capture 'regex '((lookahead_assertion) @capture))
315+
`(lookahead_assertion (["(?" "=" "!" ")"]) @font-lock-regexp-grouping-construct))
316+
(error
317+
`(lookaround_assertion (["(?" "(?<" "=" "!" ")"]) @font-lock-regexp-grouping-construct))))
318+
319+
(defun clojure-ts--regex-font-lock-settings ()
320+
"Return rules for font-locking regular expression literals."
321+
;; We have to gate this behind a check to (treesit-ready-p 'regex)
322+
;; Even if we don't set treesit-range-settings while the grammar is not
323+
;; installed, the font-locking engine still seems to want to evaluate these
324+
;; rules.
325+
(treesit-font-lock-rules
326+
:feature 'regex
327+
:language 'regex
397328
:override t
398-
`((comment) @font-lock-comment-face
399-
(dis_expr
400-
marker: "#_" @font-lock-comment-delimiter-face
401-
value: _ @font-lock-comment-face)
402-
(,(append
403-
'(list_lit :anchor (sym_lit) @font-lock-comment-delimiter-face)
404-
(when clojure-ts-comment-macro-font-lock-body
405-
'(_ :* @font-lock-comment-face)))
406-
(:match "^\\(\\(clojure.core/\\)?comment\\)$" @font-lock-comment-delimiter-face)))
407-
408-
:feature 'deref ;; not part of clojure-mode, but a cool idea?
409-
:language 'clojure
410-
'((derefing_lit
411-
marker: "@" @font-lock-warning-face))))
329+
`(;; This captures the #"" characters that surround a regex in clojure.
330+
;; If we could define offsets in treesit-range-settings
331+
;; this would not be necessary
332+
((pattern (term
333+
:anchor (pattern_character) @font-lock-regexp-face
334+
:anchor (pattern_character) @font-lock-string-face
335+
(pattern_character) @font-lock-string-face :anchor))
336+
(:equal @font-lock-regexp-face "#")
337+
(:equal @font-lock-string-face "\""))
338+
;; Capturing Groups
339+
((anonymous_capturing_group (["(" ")"]) @font-lock-regexp-grouping-construct))
340+
((non_capturing_group (["(?:" ")"]) @font-lock-regexp-grouping-construct))
341+
(,(clojure-ts--regex-font-lock-compatibility-ab4eb4b))
342+
((named_capturing_group (["(?<" ">" ")"]) @font-lock-regexp-grouping-construct))
343+
((group_name) @font-lock-variable-name-face)
344+
;; Character classes
345+
((character_class (["[" "]"]) @font-lock-bracket-face))
346+
((character_class "^" @font-lock-negation-char-face))
347+
((class_range "-" @font-lock-punctuation-face))
348+
;; Quantifiers
349+
([(zero_or_more) (one_or_more) (optional)]) @font-lock-keyword-face
350+
((count_quantifier (["{" "}"]) @font-lock-bracket-face))
351+
((count_quantifier "," @font-lock-punctuation-face))
352+
((count_quantifier (decimal_digits) @font-lock-number-face))
353+
;; Escaping
354+
([(start_assertion) (any_character) (end_assertion)]) @font-lock-keyword-face
355+
([(decimal_escape)
356+
(identity_escape)
357+
(character_class_escape)]) @font-lock-regexp-grouping-backslash
358+
((pattern_character) @font-lock-regexp-face)
359+
([(control_escape) (boundary_assertion)] @font-lock-builtin-face))))
360+
361+
362+
(defun clojure-ts--font-lock-settings (regex-available)
363+
"Return font lock settings suitable for use in `treesit-font-lock-settings'.
364+
When REGEX-AVAILABLE is non-nil, includes regex font-lock rules."
365+
(append
366+
(treesit-font-lock-rules
367+
:feature 'string
368+
:language 'clojure
369+
'((str_lit) @font-lock-string-face
370+
(regex_lit) @font-lock-regexp-face)
371+
372+
:feature 'regex
373+
:language 'clojure
374+
:override t
375+
'((regex_lit marker: "#" @font-lock-regexp-face)))
376+
(when regex-available
377+
(clojure-ts--regex-font-lock-settings))
378+
(treesit-font-lock-rules
379+
:feature 'number
380+
:language 'clojure
381+
'((num_lit) @font-lock-number-face)
382+
383+
:feature 'constant
384+
:language 'clojure
385+
'([(bool_lit) (nil_lit)] @font-lock-constant-face)
386+
387+
:feature 'char
388+
:language 'clojure
389+
'((char_lit) @clojure-ts-character-face)
390+
391+
:feature 'keyword
392+
:language 'clojure
393+
'((kwd_ns) @font-lock-type-face
394+
(kwd_name) @clojure-ts-keyword-face
395+
(kwd_lit
396+
marker: _ @clojure-ts-keyword-face
397+
delimiter: _ :? @default))
398+
399+
:feature 'builtin
400+
:language 'clojure
401+
`(((list_lit :anchor (sym_lit (sym_name) @font-lock-keyword-face))
402+
(:match ,clojure-ts--builtin-symbol-regexp @font-lock-keyword-face))
403+
((sym_name) @font-lock-builtin-face
404+
(:match ,clojure-ts--builtin-dynamic-var-regexp @font-lock-builtin-face)))
405+
406+
:feature 'symbol
407+
:language 'clojure
408+
'((sym_ns) @font-lock-type-face)
409+
410+
;; How does this work for defns nested in other forms, not at the top level?
411+
;; Should I match against the source node to only hit the top level? Can that be expressed?
412+
;; What about valid usages like `(let [closed 1] (defn +closed [n] (+ n closed)))'??
413+
;; No wonder the tree-sitter-clojure grammar only touches syntax, and not semantics
414+
:feature 'definition ;; defn and defn like macros
415+
:language 'clojure
416+
`(((list_lit :anchor (sym_lit (sym_name) @font-lock-keyword-face)
417+
:anchor (sym_lit (sym_name) @font-lock-function-name-face))
418+
(:match ,clojure-ts--definition-keyword-regexp
419+
@font-lock-keyword-face))
420+
((anon_fn_lit
421+
marker: "#" @font-lock-property-face)))
422+
423+
:feature 'variable ;; def, defonce
424+
:language 'clojure
425+
`(((list_lit :anchor (sym_lit (sym_name) @font-lock-keyword-face)
426+
:anchor (sym_lit (sym_name) @font-lock-variable-name-face))
427+
(:match ,clojure-ts--variable-keyword-regexp @font-lock-keyword-face)))
428+
429+
:feature 'type ;; deftype, defmulti, defprotocol, etc
430+
:language 'clojure
431+
`(((list_lit :anchor (sym_lit (sym_name) @font-lock-keyword-face)
432+
:anchor (sym_lit (sym_name) @font-lock-type-face))
433+
(:match ,clojure-ts--type-keyword-regexp @font-lock-keyword-face)))
434+
435+
:feature 'metadata
436+
:language 'clojure
437+
:override t
438+
`((meta_lit marker: "^" @font-lock-property-face)
439+
(meta_lit value: (kwd_lit) @font-lock-property-face) ;; metadata
440+
(meta_lit value: (sym_lit (sym_name) @font-lock-type-face)) ;; typehint
441+
(old_meta_lit marker: "#^" @font-lock-property-face)
442+
(old_meta_lit value: (kwd_lit) @font-lock-property-face) ;; metadata
443+
(old_meta_lit value: (sym_lit (sym_name) @font-lock-type-face))) ;; typehint
444+
445+
:feature 'tagged-literals
446+
:language 'clojure
447+
:override t
448+
'((tagged_or_ctor_lit marker: "#" @font-lock-preprocessor-face
449+
tag: (sym_lit) @font-lock-preprocessor-face))
450+
451+
;; TODO, also account for `def'
452+
;; Figure out how to highlight symbols in docstrings.
453+
:feature 'doc
454+
:language 'clojure
455+
:override t
456+
`(((list_lit :anchor (sym_lit) @def_symbol
457+
:anchor (sym_lit) @function_name
458+
:anchor (str_lit) @font-lock-doc-face)
459+
(:match ,clojure-ts--definition-keyword-regexp @def_symbol)))
460+
461+
:feature 'quote
462+
:language 'clojure
463+
'((quoting_lit
464+
marker: _ @font-lock-delimiter-face)
465+
(var_quoting_lit
466+
marker: _ @font-lock-delimiter-face)
467+
(syn_quoting_lit
468+
marker: _ @font-lock-delimiter-face)
469+
(unquoting_lit
470+
marker: _ @font-lock-delimiter-face)
471+
(unquote_splicing_lit
472+
marker: _ @font-lock-delimiter-face)
473+
(var_quoting_lit
474+
marker: _ @font-lock-delimiter-face))
475+
476+
:feature 'bracket
477+
:language 'clojure
478+
'((["(" ")" "[" "]" "{" "}"]) @font-lock-bracket-face
479+
(set_lit :anchor "#" @font-lock-bracket-face))
480+
481+
:feature 'comment
482+
:language 'clojure
483+
:override t
484+
`((comment) @font-lock-comment-face
485+
(dis_expr
486+
marker: "#_" @font-lock-comment-delimiter-face
487+
value: _ @font-lock-comment-face)
488+
(,(append
489+
'(list_lit :anchor (sym_lit) @font-lock-comment-delimiter-face)
490+
(when clojure-ts-comment-macro-font-lock-body
491+
'(_ :* @font-lock-comment-face)))
492+
(:match "^\\(\\(clojure.core/\\)?comment\\)$" @font-lock-comment-delimiter-face)))
493+
494+
:feature 'deref ;; not part of clojure-mode, but a cool idea?
495+
:language 'clojure
496+
'((derefing_lit
497+
marker: "@" @font-lock-warning-face)))))
412498

413499
;; Node predicates
414500

@@ -597,6 +683,12 @@ See `clojure-ts--standard-definition-node-name' for the implementation used.")
597683
(interactive)
598684
(message "clojure-ts-mode (version %s)" clojure-ts-mode-version))
599685

686+
(defvar clojure-ts--treesit-range-settings
687+
(treesit-range-rules
688+
:embed 'regex
689+
:host 'clojure
690+
'((regex_lit) @capture)))
691+
600692
;;;###autoload
601693
(define-derived-mode clojure-ts-mode prog-mode "Clojure[TS]"
602694
"Major mode for editing Clojure code.
@@ -607,9 +699,20 @@ See `clojure-ts--standard-definition-node-name' for the implementation used.")
607699
(treesit-install-language-grammar 'clojure))
608700
(setq-local comment-start ";")
609701
(when (treesit-ready-p 'clojure)
610-
(treesit-parser-create 'clojure)
611-
(setq-local treesit-font-lock-settings (clojure-ts--font-lock-settings)
612-
treesit-defun-prefer-top-level t
702+
(let ((regex-available (treesit-ready-p
703+
'regex
704+
(or clojure-ts--supress-regex-grammar-install-message
705+
'message))))
706+
;; Configure OPTIONAL regex sub-grammar font locking
707+
(if regex-available
708+
(progn
709+
(treesit-parser-create 'regex)
710+
(setq-local treesit-range-settings clojure-ts--treesit-range-settings))
711+
(treesit-parser-create 'clojure)
712+
(clojure-ts--notify-regex-grammar-missing))
713+
(setq-local treesit-font-lock-settings
714+
(clojure-ts--font-lock-settings regex-available)))
715+
(setq-local treesit-defun-prefer-top-level t
613716
treesit-defun-tactic 'top-level
614717
treesit-defun-type-regexp (rx (or "list_lit" "vec_lit" "map_lit"))
615718
treesit-simple-indent-rules clojure-ts--fixed-indent-rules

test/test.clj

+1-1
Original file line numberDiff line numberDiff line change
@@ -264,7 +264,7 @@ Etiam commodo nulla id risus convallis pharetra. Integer dapibus, eros vitae veh
264264
(println "Hello, World!"))
265265

266266
(binding [*out* nil]
267-
#"regex string"
267+
#"^(?<lookup>abc)[0-9]\b$"
268268
(def #^Typehint x 1)
269269
(def #^:metadata x 1)
270270
(def ^Typehint x 2)

0 commit comments

Comments
 (0)