Skip to content

Commit 5c0608f

Browse files
committed
Plugins: automatic mentions generator for topics
add support for calculating and displaying mentions of topics in the newsletter based on double-bracket link syntax
1 parent a051a45 commit 5c0608f

File tree

3 files changed

+247
-0
lines changed

3 files changed

+247
-0
lines changed
+239
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,239 @@
1+
# frozen_string_literal: true
2+
# This file is based on code from https://github.com/maximevaillancourt/digital-garden-jekyll-template
3+
# Generators run after Jekyll has made an inventory of the existing content,
4+
# and before the site is generated.
5+
6+
# Newsletter mentions of a topic were historically manually added to each
7+
# topic's page under `optech_mentions`. This enhances the existing logic
8+
# by allowing for automatic mentions using the double-bracket link syntax.
9+
class BidirectionalLinksGenerator < Jekyll::Generator
10+
# Converts double-bracket links (`[[...]]`) on English pages into HTML anchors
# pointing at topic pages, then records which newsletter paragraphs mention
# each topic under that topic's `optech_mentions` data key.
#
# @param site the Jekyll site; this method reads `site.documents`,
#   `site.collections["topics"].docs` and `site.baseurl`
# @return the topic documents (the value of the final `each`)
def generate(site)
  # This is only supported for english
  lang = "en"
  all_pages = site.documents.select { |doc| doc.url.start_with?("/#{lang}/") }
  # pages that contain the double-bracket link syntax `[[]]` are only a subset
  # of all the pages
  pages_with_link_syntax = all_pages.select { |page| page.content.match?(/\[\[.*?\]\]/) }
  # indexed pages are the only pages that newsletters might mention
  indexed_pages = site.collections["topics"].docs

  # Replacement for links that match no topic (\\1 is what was inside the brackets)
  invalid_link_html = <<~HTML.delete("\n")
    <span title='There is no page that matches this link.' class='invalid-link'>
    <span class='invalid-link-brackets'>[[</span>
    \\1
    <span class='invalid-link-brackets'>]]</span></span>
  HTML

  # Convert all Wiki/Roam-style double-bracket link syntax to plain HTML
  # anchor tag elements (<a>)
  pages_with_link_syntax.each do |current_page|
    indexed_pages.each do |topic_page|
      # Filename without extension, tolerant of spaces in place of `_` or `-`.
      # Regexp.escape escapes `-` but not `_`, hence the two different needles.
      # All patterns below are case-insensitive.
      filename_pattern = Regexp.escape(
        File.basename(topic_page.basename, File.extname(topic_page.basename))
      ).gsub('_', '[ _]').gsub('\-', '[ -]')

      topic_title = topic_page.data['title']
      href = "#{site.baseurl}#{topic_page.url}"
      # Block-form substitutions are used below so that backslashes in the
      # label or title are inserted literally instead of being read as
      # back-references.
      link_to = ->(label) { "<a href='#{href}'>#{label}</a>" }

      # Replace double-bracketed links that use topic's filename with the given label
      # [[coin-selection|this is a link to coin selection]] => <a href='/topics/coin-selection'>this is a link to coin selection</a>
      current_page.content.gsub!(/\[\[#{filename_pattern}\|(.+?)(?=\])\]\]/i) do
        link_to.call(Regexp.last_match(1))
      end

      # Title-based forms only make sense when the topic declares a title;
      # without this guard the patterns would match `[[|label]]` and `[[]]`.
      if topic_title
        title_pattern = Regexp.escape(topic_title)

        # Replace double-bracketed links that use topic's title with the given label
        # [[coin selection|this is a link to coin selection]] => <a href='/topics/coin-selection'>this is a link to coin selection</a>
        current_page.content.gsub!(/\[\[#{title_pattern}\|(.+?)(?=\])\]\]/i) do
          link_to.call(Regexp.last_match(1))
        end

        # Replace double-bracketed links that use topic's title, keeping the
        # text exactly as the author typed it
        # [[coin selection]] => <a href='/topics/coin-selection'>coin selection</a>
        # [[Coin selection]] => <a href='/topics/coin-selection'>Coin selection</a>
        current_page.content.gsub!(/\[\[(#{title_pattern})\]\]/i) do
          link_to.call(Regexp.last_match(1))
        end
      end

      # Replace double-bracketed links that use topic's filename with topic's title
      # [[bnb]] => <a href='/topics/bnb'>Branch and Bound (BnB)</a>
      # (falls back to the typed text when the topic has no title)
      current_page.content.gsub!(/\[\[(#{filename_pattern})\]\]/i) do
        link_to.call(topic_title || Regexp.last_match(1))
      end
    end

    # At this point, all remaining double-bracket-wrapped words are
    # pointing to non-existing pages, so let's turn them into disabled
    # links by greying them out and changing the cursor
    current_page.content = current_page.content.gsub(/\[\[([^\]]+)\]\]/i, invalid_link_html)
  end

  # Newsletter mentions
  # =====================
  newsletter_pages = pages_with_link_syntax.select { |doc| doc.url.start_with?("/#{lang}/newsletters/") }
  # Identify page backlinks and add them to each page
  indexed_pages.each do |topic_page|
    # Must be built exactly like the href written above (baseurl included),
    # otherwise no mention is found on sites with a non-empty baseurl.
    target_page_href = "href='#{site.baseurl}#{topic_page.url}'"

    newsletter_pages.each do |newsletter|
      next unless newsletter.content.include?(target_page_href)

      # The newsletter mentions the topic; find the specific paragraphs.
      mentions = get_mentions_of(newsletter, target_page_href)
      topic_page.data["optech_mentions"] ||= [] # Initialize if not already present
      # Note: a page might mention another page more than once
      mentions.each do |mention|
        topic_page.data["optech_mentions"] << {
          "title" => mention["title"],
          "url" => mention["url"]
        }
      end
    end
  end
end
112+
113+
# Finds the title of a list-item paragraph using the global `$title_pattern`.
#
# @param string [String] a paragraph of page content
# @return [Hash] `{"title" => ..., "capture_group" => ...}` where the capture
#   group is the name of the named group that matched, or `{}` when the
#   paragraph does not match or the matched title is empty
def find_title(string)
  ## Find shortest match for **bold**, *italics*, or [markdown][links]
  title_match = string.match($title_pattern)
  return {} unless title_match

  # Only the alternative that actually matched has a non-nil capture.
  capture_group, title = title_match.named_captures.compact.to_a.last
  return {} if title.nil? || title.empty?

  { "title" => title, "capture_group" => capture_group }
end
128+
129+
# Strips the target part of nested markdown links from each title, in place.
#
# Titles are extracted with the same pattern used for generating anchors,
# so a **bold** or *italics* title may still contain nested
# [markdown][links], e.g. "[BIP174][BIP174] discussion and review ongoing".
# For display we keep only the link text: "[text][ref]" and "[text](url)"
# both become "text".
#
# @param title_hierarchy [Array<String, nil>] titles to clean; each string is
#   mutated in place, `nil` entries (no title known) are skipped
# @return [Array<String, nil>] the same array that was passed in
def sanitize_title(title_hierarchy)
  title_hierarchy.each do |title|
    next if title.nil?

    title.gsub!(/\[(.*?)\][(\[].*?[)\]]/, '\1')
  end
end
153+
154+
# Extracts a manually defined anchor (`{:#slug}`) from a paragraph.
#
# Sometimes the `{:#slug}` syntax is used to create anchors in the document,
# and the extracted backlink snippets include it. When one is present it is
# removed from `text` (together with one trailing newline) and its slug is
# returned so it can be used for the mention's link.
# An example of this pattern can be seen in
# `en/newsletter/2019-06-12-newsletter.md`.
#
# @param text [String] paragraph text; mutated in place when an anchor is found
# @return [String, nil] the slug prefixed with `#`, or nil when the paragraph
#   has no manual anchor
def extract_slug_from_manual_anchor(text)
  # Hyphens are accepted in addition to word characters so that ids such as
  # `{:#bitcoin-core-123}` are recognised.
  anchor = text.match(/\{:#([\w-]+)\}/)
  return nil unless anchor

  text.sub!(/\{:#[\w-]+\}\n?/, "") # Remove the {:#slug} syntax and optional trailing newline
  "##{anchor[1]}"
end
169+
170+
# This method searches the content for paragraphs that link to the
171+
# target page and returns these mentions
172+
# Collects every paragraph of `page` that contains `target_page_url`.
#
# The logic here assumes that:
# - each block of text (paragraph) is separated by an empty line
# - primary titles are enclosed in **bold** (or are a [markdown][link])
# - secondary (nested) titles are enclosed in *italics*
#
# @param page a document responding to `content` and `url`
# @param target_page_url [String] the text to look for in each paragraph
#   (the caller passes the full `href='...'` attribute)
# @return [Array<Hash>] one `{"title" => ..., "url" => ...}` hash per matching
#   paragraph; the title is "primary title: secondary title" and the url is
#   the page url followed by the paragraph's anchor, when one can be determined
def get_mentions_of(page, target_page_url)
  matching_paragraphs = []
  # Raw (unsanitized) titles, kept as [primary, secondary]. They are never
  # sanitized in place because title-based slugs are generated from the
  # unsanitized title; mutating them would give later paragraphs under the
  # same heading a different anchor.
  title_stack = []

  page.content.split(/\n\n+/).each do |paragraph|
    # A title might have multiple paragraphs associated with it, and an
    # isolated paragraph cannot see its title, so the titles are tracked here.
    found = find_title(paragraph)
    if found.empty?
      # Paragraph has no title: fall back to the last primary title, so an
      # untitled paragraph following a nested one is not attributed to the
      # (possibly irrelevant) secondary title.
      title_stack = title_stack.first(1)
    elsif %w[bold markdown_link].include?(found["capture_group"])
      # a new primary title resets the hierarchy
      title_stack = [found["title"]]
    elsif found["capture_group"] == "italics"
      # nested title: [**primary title**, *secondary title*]
      title_stack[1] = found["title"]
    end

    next unless paragraph.include?(target_page_url)

    # Prefer a manual anchor; otherwise derive the slug from the raw title.
    # With neither available (mention before any title) the link points at
    # the page itself rather than raising on a nil title.
    slug = extract_slug_from_manual_anchor(paragraph)
    raw_title = title_stack.last
    slug = generate_slug(raw_title) if slug.nil? && raw_title

    # Sanitize copies for display only, after the slug has been generated.
    display_titles = title_stack.compact.map(&:dup)
    sanitize_title(display_titles)

    matching_paragraphs << {
      "title" => display_titles.join(": "),
      "url" => "#{page.url}#{slug}"
    }
  end

  matching_paragraphs
end
239+
end

_plugins/common_utils.rb

+7
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,10 @@
1+
# Regex patterns to match list items and capture their title, which
# is either in **bold**, *italics*, or [markdown][links].
# Each pattern allows an optional trailing colon that is left out of the
# captured title; the named group records which style matched.
$bold = /\*\*(?<bold>.*?):?\*\*/
$italics = /\*(?<italics>.*?):?\*/
$markdown_link = /\[(?<markdown_link>.*?):?\][(\[]/
# A list item ("- ", optionally indented) followed by the first title found
$title_pattern = /^ *- .*?(?:#{$bold}|#{$italics}|#{$markdown_link})/
7+
18
def generate_slug(title)
29
## Remove double-quotes from titles before attempting to slugify
310
title.gsub!('"', '')

_plugins/recap_references_generator.rb

+1
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
# Create the podcast recap references by parsing the referenced newsletter for
77
# podcast reference marks (timestamps)
88
class RecapReferencesGenerator < Jekyll::Generator
9+
priority :high
910
def generate(site)
1011
podcast_pages = site.documents.select { |doc| doc.data["type"] == "podcast"}
1112
podcast_pages.each do |podcast|

0 commit comments

Comments
 (0)