From e4ebeb8a4271e9a55de5e00e9bae74db2082092c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Damir=20Jeli=C4=87?= Date: Mon, 4 Nov 2024 14:34:05 +0100 Subject: [PATCH] feat(base): Introduce a DisplayName struct This patch introduces a struct that normalizes and sanitizes display names. Display names can be a source of abuse and can contain characters which might make it hard to distinguish one display name from the other. This struct attempts to make it easier to protect against such abuse. Changelog: Introduce a DisplayName struct which normalizes and sanitizes display names. Co-authored-by: Denis Kasak --- Cargo.lock | 18 +- crates/matrix-sdk-base/Cargo.toml | 5 +- .../src/deserialized_responses.rs | 414 +++++++++++++++++- 3 files changed, 433 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 80ca196886a..0d23120497d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1246,6 +1246,17 @@ dependencies = [ "uuid", ] +[[package]] +name = "decancer" +version = "3.2.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a41401dd84c9335e2f5aec7f64057e243585d62622260d41c245919a601ccc9" +dependencies = [ + "lazy_static", + "paste", + "regex", +] + [[package]] name = "delegate-display" version = "2.1.1" @@ -2955,6 +2966,7 @@ dependencies = [ "assign", "async-trait", "bitflags 2.6.0", + "decancer", "eyeball", "eyeball-im", "futures-executor", @@ -2970,10 +2982,12 @@ dependencies = [ "ruma", "serde", "serde_json", + "similar-asserts", "stream_assert", "thiserror", "tokio", "tracing", + "unicode-normalization", "uniffi", "wasm-bindgen-test", ] @@ -5907,9 +5921,9 @@ checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" [[package]] name = "unicode-normalization" -version = "0.1.23" +version = "0.1.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a56d1686db2308d901306f92a263857ef59ea39678a5458e7cb17f01415101f5" +checksum = "5033c97c4262335cded6d6fc3e5c18ab755e1a3dc96376350f3d8e9f009ad956" dependencies = [ "tinyvec", ] diff --git a/crates/matrix-sdk-base/Cargo.toml b/crates/matrix-sdk-base/Cargo.toml index 0c8d575d4b2..d54aa38ff6d 100644 --- a/crates/matrix-sdk-base/Cargo.toml +++ b/crates/matrix-sdk-base/Cargo.toml @@ -50,6 +50,7 @@ assert_matches = { workspace = true, optional = true } assert_matches2 = { workspace = true, optional = true } async-trait = { workspace = true } bitflags = { version = "2.4.0", features = ["serde"] } +decancer = "3.2.4" eyeball = { workspace = true } eyeball-im = { workspace = true } futures-util = { workspace = true } @@ -60,14 +61,15 @@ matrix-sdk-crypto = { workspace = true, optional = true } matrix-sdk-store-encryption = { workspace = true } matrix-sdk-test = { workspace = true, optional = true } once_cell = { workspace = true } +regex = "1.11.0" ruma = { workspace = true, features = ["canonical-json", "unstable-msc3381", "unstable-msc2867", "rand"] } +unicode-normalization = "0.1.24" serde = { workspace = true, features = ["rc"] } serde_json = { workspace = true } tokio = { workspace = true } thiserror = { workspace = true } tracing = { workspace = true } uniffi = { workspace = true, optional = true } -regex = "1.11.1" [dev-dependencies] assert_matches = { workspace = true } @@ -77,6 +79,7 @@ futures-executor = { workspace = true } http = { workspace = true } matrix-sdk-test = { workspace = true } stream_assert = { workspace = true } +similar-asserts = { workspace = true } [target.'cfg(not(target_arch = "wasm32"))'.dev-dependencies] tokio = { workspace = true, features = ["rt-multi-thread", "macros"] } diff --git a/crates/matrix-sdk-base/src/deserialized_responses.rs b/crates/matrix-sdk-base/src/deserialized_responses.rs index 9c73ea64703..47535f1fe63 100644 --- a/crates/matrix-sdk-base/src/deserialized_responses.rs +++ b/crates/matrix-sdk-base/src/deserialized_responses.rs @@ -14,9 +14,11 @@ //! SDK-specific variations of response types from Ruma. -use std::{collections::BTreeMap, fmt, iter}; +use std::{collections::BTreeMap, fmt, hash::Hash, iter}; pub use matrix_sdk_common::deserialized_responses::*; +use once_cell::sync::Lazy; +use regex::Regex; use ruma::{ events::{ room::{ @@ -31,6 +33,7 @@ use ruma::{ EventId, MilliSecondsSinceUnixEpoch, OwnedEventId, OwnedRoomId, OwnedUserId, UserId, }; use serde::Serialize; +use unicode_normalization::UnicodeNormalization; /// A change in ambiguity of room members that an `m.room.member` event /// triggers. @@ -67,6 +70,178 @@ pub struct AmbiguityChanges { pub changes: BTreeMap>, } +static MXID_REGEX: Lazy = Lazy::new(|| { + Regex::new(DisplayName::MXID_PATTERN) + .expect("We should be able to create a regex from our static MXID pattern") +}); +static LEFT_TO_RIGHT_REGEX: Lazy = Lazy::new(|| { + Regex::new(DisplayName::LEFT_TO_RIGHT_PATTERN) + .expect("We should be able to create a regex from our static left-to-right pattern") +}); +static HIDDEN_CHARACTERS_REGEX: Lazy = Lazy::new(|| { + Regex::new(DisplayName::HIDDEN_CHARACTERS_PATTERN) + .expect("We should be able to create a regex from our static hidden characters pattern") +}); + +/// Regex to match `i` characters. +/// +/// This is used to replace an `i` with a lowercase `l`, i.e. to mark "Hello" +/// and "HeIlo" as ambiguous. Decancer will lowercase an `I` for us. +static I_REGEX: Lazy = Lazy::new(|| { + Regex::new("[i]").expect("We should be able to create a regex from our uppercase I pattern") +}); + +/// Regex to match `0` characters. +/// +/// This is used to replace an `0` with a lowercase `o`, i.e. to mark "HellO" +/// and "Hell0" as ambiguous. Decancer will lowercase an `O` for us. +static ZERO_REGEX: Lazy = Lazy::new(|| { + Regex::new("[0]").expect("We should be able to create a regex from our zero pattern") +}); + +/// Regex to match a couple of dot-like characters, also matches an actual dot. +/// +/// This is used to replace a `.` with a `:`, i.e. to mark "@mxid.domain.tld" as +/// ambiguous. +static DOT_REGEX: Lazy = Lazy::new(|| { + Regex::new("[.\u{1d16d}]").expect("We should be able to create a regex from our dot pattern") +}); + +/// A high-level wrapper for strings representing display names. +/// +/// This wrapper provides attempts to determine whether a display name +/// contains characters that could make it ambiguous or easily confused +/// with similar names. +/// +/// +/// # Examples +/// +/// ``` +/// use matrix_sdk_base::deserialized_responses::DisplayName; +/// +/// let display_name = DisplayName::new("๐’ฎ๐’ถ๐’ฝ๐’ถ๐“ˆ๐“‡๐’ถ๐’ฝ๐“๐’ถ"); +/// +/// // The normalized and sanitized string will be returned by DisplayName.as_normalized_str(). +/// assert_eq!(display_name.as_normalized_str(), Some("sahasrahla")); +/// ``` +/// +/// ``` +/// # use matrix_sdk_base::deserialized_responses::DisplayName; +/// let display_name = DisplayName::new("@alice:localhost"); +/// +/// // The display name looks like an MXID, which makes it ambiguous. +/// assert!(display_name.is_inherently_ambiguous()); +/// ``` +#[derive(Debug, Clone, Eq)] +pub struct DisplayName { + raw: String, + decancered: Option, +} + +impl Hash for DisplayName { + fn hash(&self, state: &mut H) { + if let Some(decancered) = &self.decancered { + decancered.hash(state); + } else { + self.raw.hash(state); + } + } +} + +impl PartialEq for DisplayName { + fn eq(&self, other: &Self) -> bool { + match (self.decancered.as_deref(), other.decancered.as_deref()) { + (None, None) => self.raw == other.raw, + (None, Some(_)) | (Some(_), None) => false, + (Some(this), Some(other)) => this == other, + } + } +} + +impl DisplayName { + /// Regex pattern matching an MXID. + const MXID_PATTERN: &str = "@.+[:.].+"; + + /// Regex pattern matching some left-to-right formatting marks: + /// * LTR and RTL marks U+200E and U+200F + /// * LTR/RTL and other directional formatting marks U+202A - U+202F + const LEFT_TO_RIGHT_PATTERN: &str = "[\u{202a}-\u{202f}\u{200e}\u{200f}]"; + + /// Regex pattern matching bunch of unicode control characters and otherwise + /// misleading/invisible characters. + /// + /// This includes: + /// * various width spaces U+2000 - U+200D + /// * Combining characters U+0300 - U+036F + /// * Blank/invisible characters (U2800, U2062-U2063) + /// * Arabic Letter RTL mark U+061C + /// * Zero width no-break space (BOM) U+FEFF + const HIDDEN_CHARACTERS_PATTERN: &str = + "[\u{2000}-\u{200D}\u{300}-\u{036f}\u{2062}-\u{2063}\u{2800}\u{061c}\u{feff}]"; + + /// Creates a new [`DisplayName`] from the given raw string. + /// + /// The raw display name is transformed into a Unicode-normalized form, with + /// common confusable characters removed to reduce ambiguity. + /// + /// **Note**: If removing confusable characters fails, + /// [`DisplayName::is_inherently_ambiguous`] will return `true`, and + /// [`DisplayName::as_normalized_str()`] will return `None. + pub fn new(raw: &str) -> Self { + let normalized = raw.nfd().collect::(); + let replaced = DOT_REGEX.replace_all(&normalized, ":"); + let replaced = HIDDEN_CHARACTERS_REGEX.replace_all(&replaced, ""); + + let decancered = decancer::cure!(&replaced).ok().map(|cured| { + let removed_left_to_right = LEFT_TO_RIGHT_REGEX.replace_all(cured.as_ref(), ""); + let replaced = I_REGEX.replace_all(&removed_left_to_right, "l"); + // We re-run the dot replacement because decancer normalized a lot of weird + // characets into a `.`, it just doesn't do that for /u{1d16d}. + let replaced = DOT_REGEX.replace_all(&replaced, ":"); + let replaced = ZERO_REGEX.replace_all(&replaced, "o"); + + replaced.to_string() + }); + + Self { raw: raw.to_owned(), decancered } + } + + /// Is this display name considered to be ambiguous? + /// + /// If the display name has cancer (i.e. fails normalisation or has a + /// different normalised form) or looks like an MXID, then it's ambiguous. + pub fn is_inherently_ambiguous(&self) -> bool { + // If we look like an MXID or have hidden characters then we're ambiguous. + self.looks_like_an_mxid() || self.has_hidden_characters() || self.decancered.is_none() + } + + /// Returns the underlying raw and and unsanitized string of this + /// [`DisplayName`]. + pub fn as_raw_str(&self) -> &str { + &self.raw + } + + /// Returns the underlying normalized and and sanitized string of this + /// [`DisplayName`]. + /// + /// Returns `None` if normalization failed during construction of this + /// [`DisplayName`]. + pub fn as_normalized_str(&self) -> Option<&str> { + self.decancered.as_deref() + } + + fn has_hidden_characters(&self) -> bool { + HIDDEN_CHARACTERS_REGEX.is_match(&self.raw) + } + + fn looks_like_an_mxid(&self) -> bool { + self.decancered + .as_deref() + .map(|d| MXID_REGEX.is_match(d)) + .unwrap_or_else(|| MXID_REGEX.is_match(&self.raw)) + } +} + /// A deserialized response for the rooms members API call. /// /// [`GET /_matrix/client/r0/rooms/{roomId}/members`](https://spec.matrix.org/v1.5/client-server-api/#get_matrixclientv3roomsroomidmembers) @@ -310,3 +485,240 @@ impl SyncOrStrippedState { } } } + +#[cfg(test)] +mod test { + macro_rules! assert_display_name_eq { + ($left:expr, $right:expr $(, $desc:expr)?) => {{ + let left = crate::deserialized_responses::DisplayName::new($left); + let right = crate::deserialized_responses::DisplayName::new($right); + + similar_asserts::assert_eq!( + left, + right + $(, $desc)? + ); + }}; + } + + macro_rules! assert_display_name_ne { + ($left:expr, $right:expr $(, $desc:expr)?) => {{ + let left = crate::deserialized_responses::DisplayName::new($left); + let right = crate::deserialized_responses::DisplayName::new($right); + + assert_ne!( + left, + right + $(, $desc)? + ); + }}; + } + + macro_rules! assert_ambiguous { + ($name:expr) => { + let name = crate::deserialized_responses::DisplayName::new($name); + + assert!( + name.is_inherently_ambiguous(), + "The display {:?} should be considered amgibuous", + name + ); + }; + } + + macro_rules! assert_not_ambiguous { + ($name:expr) => { + let name = crate::deserialized_responses::DisplayName::new($name); + + assert!( + !name.is_inherently_ambiguous(), + "The display {:?} should not be considered amgibuous", + name + ); + }; + } + + #[test] + fn test_display_name_inherently_ambiguous() { + // These should not be inherently ambiguous, only if another similarly looking + // display name appears should they be considered to be ambiguous. + assert_not_ambiguous!("Alice"); + assert_not_ambiguous!("Carol"); + assert_not_ambiguous!("Car0l"); + assert_not_ambiguous!("Ivan"); + assert_not_ambiguous!("๐’ฎ๐’ถ๐’ฝ๐’ถ๐“ˆ๐“‡๐’ถ๐’ฝ๐“๐’ถ"); + assert_not_ambiguous!("โ“ˆโ“โ“—โ“โ“ขโ“กโ“โ“—โ“›โ“"); + assert_not_ambiguous!("๐Ÿ…‚๐Ÿ„ฐ๐Ÿ„ท๐Ÿ„ฐ๐Ÿ…‚๐Ÿ…๐Ÿ„ฐ๐Ÿ„ท๐Ÿ„ป๐Ÿ„ฐ"); + assert_not_ambiguous!("๏ผณ๏ฝ๏ฝˆ๏ฝ๏ฝ“๏ฝ’๏ฝ๏ฝˆ๏ฝŒ๏ฝ"); + // Left to right is fine, if it's the only one in the room. + assert_not_ambiguous!("\u{202e}alharsahas"); + + // These on the other hand contain invisible chars. + assert_ambiguous!("Saฬดhasrahla"); + assert_ambiguous!("Sahas\u{200D}rahla"); + } + + #[test] + fn test_display_name_equality_capitalization() { + // Display name with different capitalization + assert_display_name_eq!("Alice", "alice"); + } + + #[test] + fn test_display_name_equality_different_names() { + // Different display names + assert_display_name_ne!("Alice", "Carol"); + } + + #[test] + fn test_display_name_equality_capital_l() { + // Different display names + assert_display_name_eq!("Hello", "HeIlo"); + } + + #[test] + fn test_display_name_equality_confusable_zero() { + // Different display names + assert_display_name_eq!("Carol", "Car0l"); + } + + #[test] + fn test_display_name_equality_cyrilic() { + // Display name with scritpure symbols + assert_display_name_eq!("alice", "ะฐlice"); + } + + #[test] + fn test_display_name_equality_scriptures() { + // Display name with scritpure symbols + assert_display_name_eq!("Sahasrahla", "๐’ฎ๐’ถ๐’ฝ๐’ถ๐“ˆ๐“‡๐’ถ๐’ฝ๐“๐’ถ"); + } + + #[test] + fn test_display_name_equality_frakturs() { + // Display name with fraktur symbols + assert_display_name_eq!("Sahasrahla", "๐”–๐”ž๐”ฅ๐”ž๐”ฐ๐”ฏ๐”ž๐”ฅ๐”ฉ๐”ž"); + } + + #[test] + fn test_display_name_equality_circled() { + // Display name with circled symbols + assert_display_name_eq!("Sahasrahla", "โ“ˆโ“โ“—โ“โ“ขโ“กโ“โ“—โ“›โ“"); + } + + #[test] + fn test_display_name_equality_squared() { + // Display name with squared symbols + assert_display_name_eq!("Sahasrahla", "๐Ÿ…‚๐Ÿ„ฐ๐Ÿ„ท๐Ÿ„ฐ๐Ÿ…‚๐Ÿ…๐Ÿ„ฐ๐Ÿ„ท๐Ÿ„ป๐Ÿ„ฐ"); + } + + #[test] + fn test_display_name_equality_big_unicode() { + // Display name with big unicode letters + assert_display_name_eq!("Sahasrahla", "๏ผณ๏ฝ๏ฝˆ๏ฝ๏ฝ“๏ฝ’๏ฝ๏ฝˆ๏ฝŒ๏ฝ"); + } + + #[test] + fn test_display_name_equality_left_to_right() { + // Display name with a left-to-right character + assert_display_name_eq!("Sahasrahla", "\u{202e}alharsahas"); + } + + #[test] + fn test_display_name_equality_diacritical() { + // Display name with a diacritical mark. + assert_display_name_eq!("Sahasrahla", "Saฬดhasrahla"); + } + + #[test] + fn test_display_name_equality_zero_width_joiner() { + // Display name with a zero-width joiner + assert_display_name_eq!("Sahasrahla", "Sahas\u{200B}rahla"); + } + + #[test] + fn test_display_name_equality_zero_width_space() { + // Display name with zero-width space. + assert_display_name_eq!("Sahasrahla", "Sahas\u{200D}rahla"); + } + + #[test] + fn test_display_name_equality_ligatures() { + // Display name with a ligature. + assert_display_name_eq!("ff", "\u{FB00}"); + } + + #[test] + fn test_display_name_confusable_mxid_colon() { + assert_display_name_eq!("@mxid:domain.tld", "@mxid\u{0589}domain.tld"); + assert_display_name_eq!("@mxid:domain.tld", "@mxid\u{05c3}domain.tld"); + assert_display_name_eq!("@mxid:domain.tld", "@mxid\u{0703}domain.tld"); + assert_display_name_eq!("@mxid:domain.tld", "@mxid\u{0a83}domain.tld"); + assert_display_name_eq!("@mxid:domain.tld", "@mxid\u{16ec}domain.tld"); + assert_display_name_eq!("@mxid:domain.tld", "@mxid\u{205a}domain.tld"); + assert_display_name_eq!("@mxid:domain.tld", "@mxid\u{2236}domain.tld"); + assert_display_name_eq!("@mxid:domain.tld", "@mxid\u{fe13}domain.tld"); + assert_display_name_eq!("@mxid:domain.tld", "@mxid\u{fe52}domain.tld"); + assert_display_name_eq!("@mxid:domain.tld", "@mxid\u{fe30}domain.tld"); + assert_display_name_eq!("@mxid:domain.tld", "@mxid\u{ff1a}domain.tld"); + + // Additionally these should be considered to be ambiguous on their own. + assert_ambiguous!("@mxid\u{0589}domain.tld"); + assert_ambiguous!("@mxid\u{05c3}domain.tld"); + assert_ambiguous!("@mxid\u{0703}domain.tld"); + assert_ambiguous!("@mxid\u{0a83}domain.tld"); + assert_ambiguous!("@mxid\u{16ec}domain.tld"); + assert_ambiguous!("@mxid\u{205a}domain.tld"); + assert_ambiguous!("@mxid\u{2236}domain.tld"); + assert_ambiguous!("@mxid\u{fe13}domain.tld"); + assert_ambiguous!("@mxid\u{fe52}domain.tld"); + assert_ambiguous!("@mxid\u{fe30}domain.tld"); + assert_ambiguous!("@mxid\u{ff1a}domain.tld"); + } + + #[test] + fn test_display_name_confusable_mxid_dot() { + assert_display_name_eq!("@mxid:domain.tld", "@mxid:domain\u{0701}tld"); + assert_display_name_eq!("@mxid:domain.tld", "@mxid:domain\u{0702}tld"); + assert_display_name_eq!("@mxid:domain.tld", "@mxid:domain\u{2024}tld"); + assert_display_name_eq!("@mxid:domain.tld", "@mxid:domain\u{fe52}tld"); + assert_display_name_eq!("@mxid:domain.tld", "@mxid:domain\u{ff0e}tld"); + assert_display_name_eq!("@mxid:domain.tld", "@mxid:domain\u{1d16d}tld"); + + // Additionally these should be considered to be ambiguous on their own. + assert_ambiguous!("@mxid:domain\u{0701}tld"); + assert_ambiguous!("@mxid:domain\u{0702}tld"); + assert_ambiguous!("@mxid:domain\u{2024}tld"); + assert_ambiguous!("@mxid:domain\u{fe52}tld"); + assert_ambiguous!("@mxid:domain\u{ff0e}tld"); + assert_ambiguous!("@mxid:domain\u{1d16d}tld"); + } + + #[test] + fn test_display_name_confusable_mxid_replacing_a() { + assert_display_name_eq!("@mxid:domain.tld", "@mxid:dom\u{1d44e}in.tld"); + assert_display_name_eq!("@mxid:domain.tld", "@mxid:dom\u{0430}in.tld"); + + // Additionally these should be considered to be ambiguous on their own. + assert_ambiguous!("@mxid:dom\u{1d44e}in.tld"); + assert_ambiguous!("@mxid:dom\u{0430}in.tld"); + } + + #[test] + fn test_display_name_confusable_mxid_replacing_l() { + assert_display_name_eq!("@mxid:domain.tld", "@mxid:domain.tId"); + assert_display_name_eq!("mxid:domain.tld", "mxid:domain.t\u{217c}d"); + assert_display_name_eq!("mxid:domain.tld", "mxid:domain.t\u{ff4c}d"); + assert_display_name_eq!("mxid:domain.tld", "mxid:domain.t\u{1d5f9}d"); + assert_display_name_eq!("mxid:domain.tld", "mxid:domain.t\u{1d695}d"); + assert_display_name_eq!("mxid:domain.tld", "mxid:domain.t\u{2223}d"); + + // Additionally these should be considered to be ambiguous on their own. + assert_ambiguous!("@mxid:domain.tId"); + assert_ambiguous!("@mxid:domain.t\u{217c}d"); + assert_ambiguous!("@mxid:domain.t\u{ff4c}d"); + assert_ambiguous!("@mxid:domain.t\u{1d5f9}d"); + assert_ambiguous!("@mxid:domain.t\u{1d695}d"); + assert_ambiguous!("@mxid:domain.t\u{2223}d"); + } +}