From ac898d08ed7ab5cdb52008f9fc411096861c40e4 Mon Sep 17 00:00:00 2001 From: ChrisBAshton Date: Wed, 5 Feb 2025 07:20:28 +0000 Subject: [PATCH 1/2] Support human readable slugs for non-English locales MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR: - No longer sets `slug` to `nil` when the locale is non-English. Every locale is considered "sluggable". - This also means we no longer need the `sluggable_locale?`, `slug_eligible?` methods and ASCII checks. (The latter has caused a number of Zendesk tickets, whereby even English HTML Attachments have been given a content-ID based URL, because they contain a non-ASCII character like a fancy single quote). - `sluggable_string` doesn't appear to have been in use, so it was deleted. (`sluggable_string` is used in the Document class: `friendly_id :sluggable_string`. But the Attachment class uses `friendly_id :title`, so overriding the `sluggable_string` method had no effect). - Adds a test for friendly_id's native "uniqueness" feature (which we've opted into via `use: :scoped, scope: :attachable` on the Attachment class), so in theory we should be protected from clashes in the attachment slug namespace. Therefore the issue alluded to in #5321 (which reverts the last time this was attempted) should not happen again. - Adds a basic safeguard that the slug-generation results in a slug that contains at least 4 a-z or A-Z characters. This is to protect against generating a slug like `-`, which could happen from sluggifying a title like `{chinese}-{chinese}`. - That said, if we somehow DO get into a state whereby an attachment has what we'd call an invalid slug (<4 azAZ chars), we don't want to fix it if the attachment is already live. So we check for `safely_resluggable?`, introduced in #5942. Result: - Most HTML attachments should now have human-readable slugs. Only languages that are non-ASCII and cannot be converted to ASCII, such as Chinese, will continue to fall back to the content-ID based URLs. Next steps: - Now that all Whitehall tables use the UTF8MB4 charset (#9767), we could potentially use CJK character sets natively in slugs - but it may not always show nicely in the browser. Compare `https://example.com/你好` with `https://example.com/%E4%BD%A0%E5%A5%BD`. - The option we'd likely want to go with is conversion to their Pinyin representations, e.g. `https://example.com/ni-hao`, probably using the `pinyin` or `ruby-pinyin` gem. Trello: https://trello.com/c/YGsEG9YV/3418-improve-html-attachment-slug-creation --- app/models/html_attachment.rb | 21 ++-- test/unit/app/models/html_attachment_test.rb | 102 ++++++++++++------- 2 files changed, 70 insertions(+), 53 deletions(-) diff --git a/app/models/html_attachment.rb b/app/models/html_attachment.rb index 0ac27312562..66e0979615e 100644 --- a/app/models/html_attachment.rb +++ b/app/models/html_attachment.rb @@ -7,7 +7,7 @@ class HtmlAttachment < Attachment autosave: true, inverse_of: :html_attachment - before_validation :clear_slug_if_non_english_locale + before_save :ensure_slug_is_valid validates :govspeak_content, presence: true @@ -61,8 +61,6 @@ def url(options = {}) end def should_generate_new_friendly_id? - return false unless sluggable_locale? - slug.nil? || attachable.nil? || safely_resluggable? end @@ -121,17 +119,12 @@ def public_url(options = {}) root + public_path(options) end - def sluggable_locale? - locale.blank? || (locale == "en") - end + # Sense-check that the title converted to ASCII contains more than just hyphens! + def ensure_slug_is_valid + return unless slug.present? && slug.count("A-Za-z") < 4 - def sluggable_string - sluggable_locale? ? title : nil - end - - def clear_slug_if_non_english_locale - if locale_changed? && !sluggable_locale? - self.slug = nil - end + # Force slug to be `nil` - but only if attachment isn't live already. + # A `nil` slug will fall back to using the content_id as the slug. + self.slug = nil if safely_resluggable? end end diff --git a/test/unit/app/models/html_attachment_test.rb b/test/unit/app/models/html_attachment_test.rb index 8b49a2c9227..26ffbb53e80 100644 --- a/test/unit/app/models/html_attachment_test.rb +++ b/test/unit/app/models/html_attachment_test.rb @@ -46,18 +46,6 @@ class HtmlAttachmentTest < ActiveSupport::TestCase assert_equal expected, actual end - test "#url returns absolute path to the draft stack when previewing for non-english locale" do - edition = create(:draft_publication, :with_html_attachment) - attachment = edition.attachments.first - attachment.update!(locale: "fi") - - expected = "https://draft-origin.test.gov.uk/government/publications/" - expected += "#{edition.slug}/#{attachment.content_id}?preview=#{attachment.id}" - actual = attachment.url(preview: true, full_url: true) - - assert_equal expected, actual - end - test "#url returns absolute path to the live site when not previewing" do edition = create(:published_publication, :with_html_attachment) attachment = edition.attachments.first @@ -69,31 +57,12 @@ class HtmlAttachmentTest < ActiveSupport::TestCase assert_equal expected, actual end - test "#url returns absolute path to the live site when not previewing for non-english locale" do - edition = create(:published_publication, :with_html_attachment) - attachment = edition.attachments.first - attachment.update!(locale: "fi") - - expected = "https://www.test.gov.uk/government/publications/" - expected += "#{edition.slug}/#{attachment.content_id}" - actual = attachment.url(full_url: true) - - assert_equal expected, actual - end - test "#url returns relative path by default" do edition = create(:published_publication, :with_html_attachment) attachment = edition.attachments.first assert_equal "/government/publications/#{edition.slug}/#{attachment.slug}", attachment.url end - test "#url returns relative path by default for non-english locale" do - edition = create(:published_publication, :with_html_attachment) - attachment = edition.attachments.first - attachment.update!(locale: "fi") - assert_equal "/government/publications/#{edition.slug}/#{attachment.content_id}", attachment.url - end - test "#url works when an attachment with english locale has special characters in title" do edition = create(:published_publication, :with_html_attachment) attachment = edition.attachments.first @@ -111,7 +80,7 @@ class HtmlAttachmentTest < ActiveSupport::TestCase statistics = create(:published_national_statistics) attachment = statistics.attachments.last attachment.update!(locale: "fi") - assert_equal "/government/statistics/#{statistics.slug}/#{attachment.content_id}", attachment.url + assert_equal "/government/statistics/#{statistics.slug}/#{attachment.slug}", attachment.url end test "#url works with consultation outcomes" do @@ -124,7 +93,7 @@ class HtmlAttachmentTest < ActiveSupport::TestCase consultation = create(:consultation_with_outcome_html_attachment) attachment = consultation.outcome.attachments.first attachment.update!(locale: "fi") - assert_equal "/government/consultations/#{consultation.slug}/outcome/#{attachment.content_id}", attachment.url + assert_equal "/government/consultations/#{consultation.slug}/outcome/#{attachment.slug}", attachment.url end test "#url works with consultation public feedback" do @@ -137,7 +106,7 @@ class HtmlAttachmentTest < ActiveSupport::TestCase consultation = create(:consultation_with_public_feedback_html_attachment) attachment = consultation.public_feedback.attachments.first attachment.update!(locale: "fi") - assert_equal "/government/consultations/#{consultation.slug}/public-feedback/#{attachment.content_id}", attachment.url + assert_equal "/government/consultations/#{consultation.slug}/public-feedback/#{attachment.slug}", attachment.url end test "#url works with call for evidence outcomes" do @@ -232,7 +201,7 @@ class HtmlAttachmentTest < ActiveSupport::TestCase assert_equal "an-html-attachment", second_draft_attachment.slug end - test "slug is not created for non-english attachments" do + test "slug is not created for non-english attachments if conversion to non-ASCII isn't possible" do # Additional attachment to ensure the duplicate detection behaviour isn't triggered create(:html_attachment, locale: "fr") attachment = create(:html_attachment, locale: "ar", title: "المملكة المتحدة والمملكة العربية السعودية") @@ -257,11 +226,66 @@ class HtmlAttachmentTest < ActiveSupport::TestCase assert_equal expected_slug, attachment.to_param end - test "slug is cleared when changing from english to non-english" do - attachment = create(:html_attachment, locale: "en") + test "slug is created for non-english attachments where conversion to ASCII is possible" do + attachment = create(:html_attachment, locale: "cs", title: "toto je náhodný název") - attachment.update!(locale: "fr") - assert attachment.slug.blank? + expected_slug = "toto-je-nahodny-nazev" + assert_equal expected_slug, attachment.slug + assert_equal expected_slug, attachment.to_param + end + + test "slug requires minimum 3 azAZ characters" do + attachment = create(:html_attachment, locale: "cs", title: "tot") + assert_equal nil, attachment.slug + end + + test "slug reverts to nil, if document has never been published, if title is changed such that minimum 3 azAZ characters is not reached" do + attachment = build(:html_attachment, title: "an-html-attachment") + + create(:draft_publication, attachments: [attachment]) + + attachment.title = "foo" + attachment.save! + attachment.reload + + assert_equal nil, attachment.slug + end + + test "even if slug is considered too short NOW, slug on legacy old attachment is not updated when the title is changed if document is published" do + edition = create( + :published_publication, + attachments: [ + build(:html_attachment, title: "an-html-attachment"), + ], + ) + # Bypass validations and callbacks to manually set slug + attachment = edition.attachments.first + attachment.update_column(:slug, "foo") + assert_equal "foo", attachment.slug + + #  now check slug not overridden + draft = edition.create_draft(create(:writer)) + attachment = draft.attachments.first + attachment.title = "a-new-title" + attachment.save! + attachment.reload + assert_equal "foo", attachment.slug + end + + # This behaviour is built into the friendly_id gem + test "slug is made unique if there is an existing clash" do + edition = create( + :published_publication, + attachments: [ + build(:html_attachment, locale: "en", title: "food"), + build(:html_attachment, locale: "cs", title: "food"), + ], + ) + attachment_1 = edition.attachments.first + attachment_2 = edition.attachments.last + + assert_equal "food", attachment_1.slug + assert_equal "food--2", attachment_2.slug end test "#identifier falls back to content_id if no slug available" do From 0609ff5b86b65d8e44cfa3bff79cdef8c3bf4868 Mon Sep 17 00:00:00 2001 From: ChrisBAshton Date: Wed, 5 Feb 2025 14:45:57 +0000 Subject: [PATCH 2/2] Only check `safely_resluggable?` before generating new ID See code review comment: https://github.com/alphagov/whitehall/pull/9892#discussion_r1942515565 `safely_resluggable?` returns `true` for newly created attachments and `false` for attachments that have been copied over from a previous edition. New attachments will start off with a `nil` slug, or in any case a slug that is derived from the attachment's current title. It can be safely re-generated an infinite number of times before the attachment is made live. So we don't need to be checking the value of the slug in this method. --- app/models/html_attachment.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/models/html_attachment.rb b/app/models/html_attachment.rb index 66e0979615e..9bd15ece5b8 100644 --- a/app/models/html_attachment.rb +++ b/app/models/html_attachment.rb @@ -61,7 +61,7 @@ def url(options = {}) end def should_generate_new_friendly_id? - slug.nil? || attachable.nil? || safely_resluggable? + safely_resluggable? end def deep_clone