Skip to content

Commit 36309ff

Browse files
committed
Highlight regular expressions with tree-sitter-regex grammar
This grammar is bundled in NixOS by default and seems good enough for Java regular expressions. It is also maintained under the tree-sitter GitHub org, so it is "official". In order to properly identify the opening #" and closing " characters, we have to parse them with the Clojure grammar (in case the regex grammar is not available) and again with the regex grammar as part of the actual pattern. This could be avoided if either the Clojure grammar captured a node for the inner contents of the regex literal, or treesit-range-settings supported some kind of offset argument like the Neovim tree-sitter mechanisms do. Should address issue #11.
1 parent 5125a56 commit 36309ff

File tree

2 files changed

+223
-132
lines changed

2 files changed

+223
-132
lines changed

clojure-ts-mode.el

+222-131
Original file line numberDiff line numberDiff line change
@@ -277,138 +277,212 @@ Only intended for use at development time.")
277277
"defstruct")
278278
line-end))
279279

280-
(defun clojure-ts--font-lock-settings ()
281-
"Return font lock settings suitable for use in `treesit-font-lock-settings'."
282-
(treesit-font-lock-rules
283-
:feature 'string
284-
:language 'clojure
285-
'((str_lit) @font-lock-string-face
286-
(regex_lit) @font-lock-string-face)
280+
(defvar clojure-ts-regex-grammar-git-url
281+
"https://github.com/tree-sitter/tree-sitter-regex.git"
282+
"The URL to install the regex grammar from.")
287283

288-
:feature 'regex
289-
:language 'clojure
290-
:override t
291-
'((regex_lit marker: _ @font-lock-property-face))
292-
293-
:feature 'number
294-
:language 'clojure
295-
'((num_lit) @font-lock-number-face)
296-
297-
:feature 'constant
298-
:language 'clojure
299-
'([(bool_lit) (nil_lit)] @font-lock-constant-face)
300-
301-
:feature 'char
302-
:language 'clojure
303-
'((char_lit) @clojure-ts-character-face)
304-
305-
:feature 'keyword
306-
:language 'clojure
307-
'((kwd_ns) @font-lock-type-face
308-
(kwd_name) @clojure-ts-keyword-face
309-
(kwd_lit
310-
marker: _ @clojure-ts-keyword-face
311-
delimiter: _ :? @default))
312-
313-
:feature 'builtin
314-
:language 'clojure
315-
`(((list_lit :anchor (sym_lit (sym_name) @font-lock-keyword-face))
316-
(:match ,clojure-ts--builtin-symbol-regexp @font-lock-keyword-face))
317-
((sym_name) @font-lock-builtin-face
318-
(:match ,clojure-ts--builtin-dynamic-var-regexp @font-lock-builtin-face)))
319-
320-
:feature 'symbol
321-
:language 'clojure
322-
'((sym_ns) @font-lock-type-face)
323-
324-
;; How does this work for defns nested in other forms, not at the top level?
325-
;; Should I match against the source node to only hit the top level? Can that be expressed?
326-
;; What about valid usages like `(let [closed 1] (defn +closed [n] (+ n closed)))'??
327-
;; No wonder the tree-sitter-clojure grammar only touches syntax, and not semantics
328-
:feature 'definition ;; defn and defn like macros
329-
:language 'clojure
330-
`(((list_lit :anchor (sym_lit (sym_name) @font-lock-keyword-face)
331-
:anchor (sym_lit (sym_name) @font-lock-function-name-face))
332-
(:match ,clojure-ts--definition-keyword-regexp
333-
@font-lock-keyword-face))
334-
((anon_fn_lit
335-
marker: "#" @font-lock-property-face)))
336-
337-
:feature 'variable ;; def, defonce
338-
:language 'clojure
339-
`(((list_lit :anchor (sym_lit (sym_name) @font-lock-keyword-face)
340-
:anchor (sym_lit (sym_name) @font-lock-variable-name-face))
341-
(:match ,clojure-ts--variable-keyword-regexp @font-lock-keyword-face)))
342-
343-
:feature 'type ;; deftype, defmulti, defprotocol, etc
344-
:language 'clojure
345-
`(((list_lit :anchor (sym_lit (sym_name) @font-lock-keyword-face)
346-
:anchor (sym_lit (sym_name) @font-lock-type-face))
347-
(:match ,clojure-ts--type-keyword-regexp @font-lock-keyword-face)))
348-
349-
:feature 'metadata
350-
:language 'clojure
351-
:override t
352-
`((meta_lit marker: "^" @font-lock-property-face)
353-
(meta_lit value: (kwd_lit) @font-lock-property-face) ;; metadata
354-
(meta_lit value: (sym_lit (sym_name) @font-lock-type-face)) ;; typehint
355-
(old_meta_lit marker: "#^" @font-lock-property-face)
356-
(old_meta_lit value: (kwd_lit) @font-lock-property-face) ;; metadata
357-
(old_meta_lit value: (sym_lit (sym_name) @font-lock-type-face))) ;; typehint
358-
359-
:feature 'tagged-literals
360-
:language 'clojure
361-
:override t
362-
'((tagged_or_ctor_lit marker: "#" @font-lock-preprocessor-face
363-
tag: (sym_lit) @font-lock-preprocessor-face))
284+
(defvar clojure-ts-regex-grammar-git-ref
285+
"v0.20.0"
286+
"The branch or tag to use when installing the regex gramar.")
364287

365-
;; TODO, also account for `def'
366-
;; Figure out how to highlight symbols in docstrings.
367-
:feature 'doc
368-
:language 'clojure
369-
:override t
370-
`(((list_lit :anchor (sym_lit) @def_symbol
371-
:anchor (sym_lit) @function_name
372-
:anchor (str_lit) @font-lock-doc-face)
373-
(:match ,clojure-ts--definition-keyword-regexp @def_symbol)))
374-
375-
:feature 'quote
376-
:language 'clojure
377-
'((quoting_lit
378-
marker: _ @font-lock-delimiter-face)
379-
(var_quoting_lit
380-
marker: _ @font-lock-delimiter-face)
381-
(syn_quoting_lit
382-
marker: _ @font-lock-delimiter-face)
383-
(unquoting_lit
384-
marker: _ @font-lock-delimiter-face)
385-
(unquote_splicing_lit
386-
marker: _ @font-lock-delimiter-face)
387-
(var_quoting_lit
388-
marker: _ @font-lock-delimiter-face))
389-
390-
:feature 'bracket
391-
:language 'clojure
392-
'((["(" ")" "[" "]" "{" "}"]) @font-lock-bracket-face
393-
(set_lit :anchor "#" @font-lock-bracket-face))
394-
395-
:feature 'comment
396-
:language 'clojure
288+
(defun clojure-ts-install-regex-grammar ()
289+
"Install the grammar needed by `clojure-ts-mode' for regex literal font-locking."
290+
(interactive)
291+
(add-to-list
292+
'treesit-language-source-alist
293+
'(regex . (clojure-ts-regex-grammar-git-url clojure-ts-regex-grammar-git-ref)))
294+
(treesit-install-language-grammar 'regex))
295+
296+
(defvar clojure-ts--supress-regex-grammar-install-message
297+
nil
298+
"When non-nil, do not show message about installing the regex grammar.")
299+
300+
(defun clojure-ts--notify-regex-grammar-missing ()
301+
"Show the users a one-time message about installing the regex grammar."
302+
(unless clojure-ts--supress-regex-grammar-install-message
303+
(message (concat "To add support for regular expression font locking "
304+
"in clojure-ts-mode "
305+
"run `M-x clojure-ts-install-regex-grammar <RET>`."))
306+
(setq clojure-ts--supress-regex-grammar-install-message t)))
307+
308+
(defun clojure-ts--regex-font-lock-settings ()
309+
"Return rules for font-locking regular expression literals."
310+
;; We have to gate this behind a check to (treesit-ready-p 'regex)
311+
;; Even if we don't set treesit-range-settings while the grammar is not
312+
;; installed, the font-locking engine still seems to want to evaluate these
313+
;; rules.
314+
(treesit-font-lock-rules
315+
:feature 'regex
316+
:language 'regex
397317
:override t
398-
`((comment) @font-lock-comment-face
399-
(dis_expr
400-
marker: "#_" @font-lock-comment-delimiter-face
401-
value: _ @font-lock-comment-face)
402-
(,(append
403-
'(list_lit :anchor (sym_lit) @font-lock-comment-delimiter-face)
404-
(when clojure-ts-comment-macro-font-lock-body
405-
'(_ :* @font-lock-comment-face)))
406-
(:match "^\\(\\(clojure.core/\\)?comment\\)$" @font-lock-comment-delimiter-face)))
407-
408-
:feature 'deref ;; not part of clojure-mode, but a cool idea?
409-
:language 'clojure
410-
'((derefing_lit
411-
marker: "@" @font-lock-warning-face))))
318+
'(;; This captures the #"" characters that surround a regex in clojure.
319+
;; If we could define offsets in treesit-range-settings
320+
;; this would not be necessary
321+
((pattern (term
322+
:anchor (pattern_character) @font-lock-regexp-face
323+
:anchor (pattern_character) @font-lock-string-face
324+
(pattern_character) @font-lock-string-face :anchor))
325+
(:equal @font-lock-regexp-face "#")
326+
(:equal @font-lock-string-face "\""))
327+
;; Capturing Groups
328+
((anonymous_capturing_group (["(" ")"]) @font-lock-regexp-grouping-construct))
329+
((non_capturing_group (["(?:" ")"]) @font-lock-regexp-grouping-construct))
330+
((lookahead_assertion (["(?" "=" "!" ")"]) @font-lock-regexp-grouping-construct))
331+
((named_capturing_group (["(?<" ">" ")"]) @font-lock-regexp-grouping-construct))
332+
((group_name) @font-lock-variable-name-face)
333+
;; Character classes
334+
((character_class (["[" "]"]) @font-lock-bracket-face))
335+
((character_class "^" @font-lock-negation-char-face))
336+
((class_range "-" @font-lock-punctuation-face))
337+
;; Quantifiers
338+
([(zero_or_more) (one_or_more) (optional)]) @font-lock-keyword-face
339+
((count_quantifier (["{" "}"]) @font-lock-bracket-face))
340+
((count_quantifier "," @font-lock-punctuation-face))
341+
((count_quantifier (decimal_digits) @font-lock-number-face))
342+
;; Escaping
343+
([(start_assertion) (any_character) (end_assertion)]) @font-lock-keyword-face
344+
([(decimal_escape)
345+
(identity_escape)
346+
(character_class_escape)]) @font-lock-regexp-grouping-backslash
347+
((pattern_character) @font-lock-regexp-face)
348+
([(control_escape) (boundary_assertion)] @font-lock-builtin-face))))
349+
350+
(defun clojure-ts--font-lock-settings (regex-available)
351+
"Return font lock settings suitable for use in `treesit-font-lock-settings'.
352+
When REGEX-AVAILABLE is non-nil, includes regex font-lock rules."
353+
(append
354+
(treesit-font-lock-rules
355+
:feature 'string
356+
:language 'clojure
357+
'((str_lit) @font-lock-string-face
358+
(regex_lit) @font-lock-regexp-face)
359+
360+
:feature 'regex
361+
:language 'clojure
362+
:override t
363+
'((regex_lit marker: "#" @font-lock-regexp-face))
364+
365+
:feature 'number
366+
:language 'clojure
367+
'((num_lit) @font-lock-number-face)
368+
369+
:feature 'constant
370+
:language 'clojure
371+
'([(bool_lit) (nil_lit)] @font-lock-constant-face)
372+
373+
:feature 'char
374+
:language 'clojure
375+
'((char_lit) @clojure-ts-character-face)
376+
377+
:feature 'keyword
378+
:language 'clojure
379+
'((kwd_ns) @font-lock-type-face
380+
(kwd_name) @clojure-ts-keyword-face
381+
(kwd_lit
382+
marker: _ @clojure-ts-keyword-face
383+
delimiter: _ :? @default))
384+
385+
:feature 'builtin
386+
:language 'clojure
387+
`(((list_lit :anchor (sym_lit (sym_name) @font-lock-keyword-face))
388+
(:match ,clojure-ts--builtin-symbol-regexp @font-lock-keyword-face))
389+
((sym_name) @font-lock-builtin-face
390+
(:match ,clojure-ts--builtin-dynamic-var-regexp @font-lock-builtin-face)))
391+
392+
:feature 'symbol
393+
:language 'clojure
394+
'((sym_ns) @font-lock-type-face)
395+
396+
;; How does this work for defns nested in other forms, not at the top level?
397+
;; Should I match against the source node to only hit the top level? Can that be expressed?
398+
;; What about valid usages like `(let [closed 1] (defn +closed [n] (+ n closed)))'??
399+
;; No wonder the tree-sitter-clojure grammar only touches syntax, and not semantics
400+
:feature 'definition ;; defn and defn like macros
401+
:language 'clojure
402+
`(((list_lit :anchor (sym_lit (sym_name) @font-lock-keyword-face)
403+
:anchor (sym_lit (sym_name) @font-lock-function-name-face))
404+
(:match ,clojure-ts--definition-keyword-regexp
405+
@font-lock-keyword-face))
406+
((anon_fn_lit
407+
marker: "#" @font-lock-property-face)))
408+
409+
:feature 'variable ;; def, defonce
410+
:language 'clojure
411+
`(((list_lit :anchor (sym_lit (sym_name) @font-lock-keyword-face)
412+
:anchor (sym_lit (sym_name) @font-lock-variable-name-face))
413+
(:match ,clojure-ts--variable-keyword-regexp @font-lock-keyword-face)))
414+
415+
:feature 'type ;; deftype, defmulti, defprotocol, etc
416+
:language 'clojure
417+
`(((list_lit :anchor (sym_lit (sym_name) @font-lock-keyword-face)
418+
:anchor (sym_lit (sym_name) @font-lock-type-face))
419+
(:match ,clojure-ts--type-keyword-regexp @font-lock-keyword-face)))
420+
421+
:feature 'metadata
422+
:language 'clojure
423+
:override t
424+
`((meta_lit marker: "^" @font-lock-property-face)
425+
(meta_lit value: (kwd_lit) @font-lock-property-face) ;; metadata
426+
(meta_lit value: (sym_lit (sym_name) @font-lock-type-face)) ;; typehint
427+
(old_meta_lit marker: "#^" @font-lock-property-face)
428+
(old_meta_lit value: (kwd_lit) @font-lock-property-face) ;; metadata
429+
(old_meta_lit value: (sym_lit (sym_name) @font-lock-type-face))) ;; typehint
430+
431+
:feature 'tagged-literals
432+
:language 'clojure
433+
:override t
434+
'((tagged_or_ctor_lit marker: "#" @font-lock-preprocessor-face
435+
tag: (sym_lit) @font-lock-preprocessor-face))
436+
437+
;; TODO, also account for `def'
438+
;; Figure out how to highlight symbols in docstrings.
439+
:feature 'doc
440+
:language 'clojure
441+
:override t
442+
`(((list_lit :anchor (sym_lit) @def_symbol
443+
:anchor (sym_lit) @function_name
444+
:anchor (str_lit) @font-lock-doc-face)
445+
(:match ,clojure-ts--definition-keyword-regexp @def_symbol)))
446+
447+
:feature 'quote
448+
:language 'clojure
449+
'((quoting_lit
450+
marker: _ @font-lock-delimiter-face)
451+
(var_quoting_lit
452+
marker: _ @font-lock-delimiter-face)
453+
(syn_quoting_lit
454+
marker: _ @font-lock-delimiter-face)
455+
(unquoting_lit
456+
marker: _ @font-lock-delimiter-face)
457+
(unquote_splicing_lit
458+
marker: _ @font-lock-delimiter-face)
459+
(var_quoting_lit
460+
marker: _ @font-lock-delimiter-face))
461+
462+
:feature 'bracket
463+
:language 'clojure
464+
'((["(" ")" "[" "]" "{" "}"]) @font-lock-bracket-face
465+
(set_lit :anchor "#" @font-lock-bracket-face))
466+
467+
:feature 'comment
468+
:language 'clojure
469+
:override t
470+
`((comment) @font-lock-comment-face
471+
(dis_expr
472+
marker: "#_" @font-lock-comment-delimiter-face
473+
value: _ @font-lock-comment-face)
474+
(,(append
475+
'(list_lit :anchor (sym_lit) @font-lock-comment-delimiter-face)
476+
(when clojure-ts-comment-macro-font-lock-body
477+
'(_ :* @font-lock-comment-face)))
478+
(:match "^\\(\\(clojure.core/\\)?comment\\)$" @font-lock-comment-delimiter-face)))
479+
480+
:feature 'deref ;; not part of clojure-mode, but a cool idea?
481+
:language 'clojure
482+
'((derefing_lit
483+
marker: "@" @font-lock-warning-face)))
484+
(when regex-available
485+
(clojure-ts--regex-font-lock-settings))))
412486

413487
;; Node predicates
414488

@@ -597,6 +671,12 @@ See `clojure-ts--standard-definition-node-name' for the implementation used.")
597671
(interactive)
598672
(message "clojure-ts-mode (version %s)" clojure-ts-mode-version))
599673

674+
(defvar clojure-ts--treesit-range-settings
675+
(treesit-range-rules
676+
:embed 'regex
677+
:host 'clojure
678+
'((regex_lit) @capture)))
679+
600680
;;;###autoload
601681
(define-derived-mode clojure-ts-mode prog-mode "Clojure[TS]"
602682
"Major mode for editing Clojure code.
@@ -608,8 +688,19 @@ See `clojure-ts--standard-definition-node-name' for the implementation used.")
608688
(setq-local comment-start ";")
609689
(when (treesit-ready-p 'clojure)
610690
(treesit-parser-create 'clojure)
611-
(setq-local treesit-font-lock-settings (clojure-ts--font-lock-settings)
612-
treesit-defun-prefer-top-level t
691+
(let ((regex-available (treesit-ready-p
692+
'regex
693+
(or clojure-ts--supress-regex-grammar-install-message
694+
'message))))
695+
;; Configure OPTIONAL regex sub-grammar font locking
696+
(if regex-available
697+
(progn
698+
(treesit-parser-create 'regex)
699+
(setq-local treesit-range-settings clojure-ts--treesit-range-settings))
700+
(clojure-ts--notify-regex-grammar-missing))
701+
(setq-local treesit-font-lock-settings
702+
(clojure-ts--font-lock-settings regex-available)))
703+
(setq-local treesit-defun-prefer-top-level t
613704
treesit-defun-tactic 'top-level
614705
treesit-defun-type-regexp (rx (or "list_lit" "vec_lit" "map_lit"))
615706
treesit-simple-indent-rules clojure-ts--fixed-indent-rules

test/test.clj

+1-1
Original file line numberDiff line numberDiff line change
@@ -264,7 +264,7 @@ Etiam commodo nulla id risus convallis pharetra. Integer dapibus, eros vitae veh
264264
(println "Hello, World!"))
265265

266266
(binding [*out* nil]
267-
#"regex string"
267+
#"^(?<lookup>abc)[0-9]\b$"
268268
(def #^Typehint x 1)
269269
(def #^:metadata x 1)
270270
(def ^Typehint x 2)

0 commit comments

Comments
 (0)