# frozen_string_literal: true
# This file is based on code from https://github.com/maximevaillancourt/digital-garden-jekyll-template
# Generators run after Jekyll has made an inventory of the existing content,
# and before the site is generated.

# Newsletter mentions of a topic were historically added by hand to each
# topic's page under `optech_mentions`. This enhances the existing logic
# by allowing automatic mentions using the double-bracket link syntax.
class BidirectionalLinksGenerator < Jekyll::Generator
  def generate(site)
    # This is only supported for English
    lang = "en"
    all_pages = site.documents.select { |doc| doc.url.start_with?("/#{lang}/") }
    # pages that contain the double-bracket link syntax `[[]]` are only a subset
    # of all the pages
    pages_with_link_syntax = all_pages.select { |page| page.content.match(/\[\[.*?\]\]/) }
    # indexed pages are the only pages that newsletters might mention
    indexed_pages = site.collections["topics"].docs

    # Convert all Wiki/Roam-style double-bracket link syntax to plain HTML
    # anchor tag elements (<a>)
    pages_with_link_syntax.each do |current_page|
      indexed_pages.each do |page_potentially_linked_to|
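        # Build a pattern from the topic's filename: e.g. "coin-selection.md"
        # becomes "Coin[ -]selection", which matches both "coin selection" and
        # "coin-selection" (case-insensitively, via the /i flags below).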
        page_title_regexp_pattern = Regexp.escape(
          File.basename(
            page_potentially_linked_to.basename,
            File.extname(page_potentially_linked_to.basename)
          )
        ).gsub('\_', '[ _]').gsub('\-', '[ -]').capitalize

        title_from_data = title_from_data_escaped = page_potentially_linked_to.data['title']
        if title_from_data
          title_from_data_escaped = Regexp.escape(title_from_data)
        end

        new_href = "#{site.baseurl}#{page_potentially_linked_to.url}"
        title_anchor_tag = "<a href='#{new_href}'>#{title_from_data}</a>"
        anchor_tag = "<a href='#{new_href}'>\\1</a>"
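        # `anchor_tag` keeps the bracketed label (\1) as the link text, while
        # `title_anchor_tag` uses the topic's title as the link text.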

        # Replace double-bracketed links that use topic's filename with the given label
        # [[coin-selection|this is a link to coin selection]] => [this is a link to coin selection](/topics/coin-selection)
        current_page.content.gsub!(
          /\[\[#{page_title_regexp_pattern}\|(.+?)(?=\])\]\]/i,
          anchor_tag
        )

        # Replace double-bracketed links that use topic's title with the given label
        # [[coin selection|this is a link to coin selection]] => [this is a link to coin selection](/topics/coin-selection)
        current_page.content.gsub!(
          /\[\[#{title_from_data_escaped}\|(.+?)(?=\])\]\]/i,
          anchor_tag
        )

        # Replace double-bracketed links that use topic's title
        # [[coin selection]] => [coin selection](/topics/coin-selection)
        # [[Coin selection]] => [Coin selection](/topics/coin-selection)
        current_page.content.gsub!(
          /\[\[(#{title_from_data_escaped})\]\]/i,
          anchor_tag
        )

        # Replace double-bracketed links that use topic's filename with topic's title
        # [[bnb]] => [Branch and Bound (BnB)](/topics/bnb)
        current_page.content.gsub!(
          /\[\[(#{page_title_regexp_pattern})\]\]/i,
          title_anchor_tag
        )
      end

      # At this point, all remaining double-bracket-wrapped words are
      # pointing to non-existing pages, so let's turn them into disabled
      # links by greying them out and changing the cursor
      current_page.content = current_page.content.gsub(
        /\[\[([^\]]+)\]\]/i, # match on the remaining double-bracket links
        <<~HTML.delete("\n") # replace with this HTML (\\1 is what was inside the brackets)
          <span title='There is no page that matches this link.' class='invalid-link'>
            <span class='invalid-link-brackets'>[[</span>
            \\1
            <span class='invalid-link-brackets'>]]</span></span>
        HTML
      )
    end

    # Newsletter mentions
    # =====================
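    # Only newsletters that contained double-bracket links are scanned below;
    # any manually added `optech_mentions` front matter entries are kept, and
    # the automatic mentions are appended to them.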
    newsletter_pages = pages_with_link_syntax.select { |doc| doc.url.start_with?("/#{lang}/newsletters/") }
    # Identify page backlinks and add them to each page
    indexed_pages.each do |current_page|
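      # Double-bracket links were converted to <a href='...'> anchors above,
      # so a mention of this topic can be detected by searching for its href.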
      target_page_href = "href='#{current_page.url}'"

      # Iterate over all pages to find mentions of the current page
      newsletter_pages.each do |page_in_question|
        # Check if the current page is mentioned in the content of the page in question
        if page_in_question.content.include?(target_page_href)
          # The page_in_question mentions the current page, so we now need to
          # find the specific mentions.
          mentions = get_mentions_of(page_in_question, target_page_href)
          current_page.data["optech_mentions"] ||= [] # Initialize if not already present
          # Add the calculated mentions to `optech_mentions`
          # Note: a page might mention another page more than once
          mentions.each do |mention|
            current_page.data["optech_mentions"] << {
              "title" => mention["title"],
              "url" => mention["url"]
            }
          end
        end
      end
    end
  end

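  # Returns { "title" => ..., "capture_group" => ... } for the first title-like
  # match in `string`, or an empty hash when nothing matches. `$title_pattern`
  # is a global regexp defined outside this file; it is expected to provide the
  # named captures `bold`, `italics`, and `markdown_link` (see `sanitize_title`
  # below on mirroring the pattern used by `auto-anchor.rb`).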
  def find_title(string)
    title = capture_group = ""
    # Find the shortest match for **bold**, *italics*, or [markdown][links]
    title_match = string.match($title_pattern)
    title_match&.named_captures&.compact&.each do |key, value|
      capture_group = key # one of {bold, italics, markdown_link}
      title = value
    end

    if title.empty?
      {}
    else
      { "title" => title, "capture_group" => capture_group }
    end
  end

  def sanitize_title(title_hierarchy)
    # This is needed because, for this plugin's logic, we use the same
    # matching pattern for the title as in `auto-anchor.rb` in order to be
    # able to reproduce the slugs/anchors and therefore point to them.
    # This pattern matches the title of the paragraph by finding the shortest
    # match for **bold**, *italics*, or [markdown][links], but the matched
    # **bold** or *italics* might have nested [markdown][links].
    #
    # Note that the nested [markdown][links] actually become part of the slug.
    # For example, in /en/newsletter/2018-06-08/ the title
    # "**[BIP174][BIP174] discussion and review ongoing:**" becomes
    # "#bip174-bip174-discussion-and-review-ongoing".
    #
    # In the case of `auto-anchor.rb` this doesn't matter because the title
    # will "markdownify" and transform into a link. But here we extract the
    # title, therefore we need to remove the second part.
    #
    # We call a title that still has those nested [markdown][links] an
    # unsanitized title. The logic here finds the pattern "[text][text]"
    # or "[text](text)" and removes the second part.
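    # e.g. "[BIP174][BIP174] discussion and review ongoing:"
    #   => "BIP174 discussion and review ongoing:"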
    title_hierarchy.each do |title|
      title.gsub!(/\[(.*?)\][(\[].*?[)\]]/, '\1')
    end
  end

  def extract_slug_from_manual_anchor(text)
    # Sometimes the kramdown `{:#slug}` anchor syntax is used to create manual
    # anchors in the document. Our extracted backlink snippets include those,
    # therefore we need to:
    # - remove the anchor syntax from the result
    # - extract the slug to use it in the generated anchor list link
    # An example of this pattern can be seen in `en/newsletter/2019-06-12-newsletter.md`
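    # e.g. given "Some heading {:#anchor}", this returns "#anchor" and removes
    # the "{:#anchor}" marker (and an optional trailing newline) from `text`.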
    match = text.match(/\{:#(\w+)\}/)
    if match
      slug = "##{match[1]}" # extract the slug
      text.sub!(/\{:#\w+\}\n?/, "") # remove the {:#slug} syntax and optional trailing newline
      slug
    else
      nil
    end
  end

  # This method searches the content for paragraphs that link to the
  # target page and returns these mentions
  def get_mentions_of(page, target_page_url)
    # This is called only when we know that a match exists.
    # The logic here assumes that:
    # - each block of text (paragraph) is separated by an empty line
    # - primary titles are enclosed in **bold**
    # - secondary (nested) titles are enclosed in *italics*
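    #
    # For example, given a paragraph titled "**Primary topic**" followed by one
    # titled "*A nested item*", current_title evolves from ["Primary topic"] to
    # ["Primary topic", "A nested item"], and a mention found under the nested
    # paragraph gets the title "Primary topic: A nested item".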

    content = page.content
    # Split the content into paragraphs
    paragraphs = content.split(/\n\n+/)

    # Create an array of hashes containing:
    # - the associated url
    # - the associated title
    matching_paragraphs = []
    current_title = []

    # Iterate over all paragraphs to find those that match the given url
    paragraphs.each do |p|
      # A title might have multiple paragraphs associated with it.
      # An isolated paragraph snippet cannot access the title, therefore
      # we keep this information to be used in backlinks.
      title = find_title(p)
      if !title.empty?
        # paragraph has a title
        if title["capture_group"] == "bold" || title["capture_group"] == "markdown_link"
          # when a new primary title is found, reset the current_title array
          current_title = [title["title"]]
        elsif title["capture_group"] == "italics"
          # the title is a nested (secondary) title, so assign it as the 2nd
          # element of the array in order to keep the titles' hierarchy:
          # [**primary title**, *secondary title*]
          current_title[1] = title["title"]
        end
      else
        # The paragraph has no title, so fall back to the last primary title.
        # This covers the case where a nested secondary paragraph was seen
        # earlier but the mention is in a later, title-less paragraph (still
        # nested under the primary); without the fallback, the mention's title
        # would also include the secondary title, which might be irrelevant.
        current_title = [current_title[0]]
      end

      # If the current paragraph contains the URL, add it to the matching paragraphs
      if p.include?(target_page_url)
        # generate slug for matching paragraph
        slug = extract_slug_from_manual_anchor(p)
        if slug.nil?
          # no manual anchor has been defined, so generate slug from title
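          # `generate_slug` is defined outside this file; per the notes in
          # `sanitize_title`, the goal is to reproduce the same anchors that
          # `auto-anchor.rb` generates so that the link can point to them.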
          slug = generate_slug(current_title.last)
          # Sanitize the title only after generating the slug, because slugs
          # are historically generated from the unsanitized title.
          sanitize_title(current_title)
        end
        matching_paragraph = {
          # the resulting title for the mention is "primary title: secondary title"
          "title" => current_title.join(": "),
          "url" => "#{page.url}#{slug}"
        }
        matching_paragraphs << matching_paragraph
      end
    end
    # Return the matching paragraphs
    matching_paragraphs
  end
end