diff --git a/Cargo.lock b/Cargo.lock index 280f10c7..4187ae73 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -249,6 +249,7 @@ dependencies = [ "atrium-xrpc-client", "http 1.1.0", "ipld-core", + "psl", "regex", "serde", "serde_json", @@ -1357,6 +1358,21 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "psl" +version = "2.1.42" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "71cad2f472a847f48e9b6d4712238b77ce586801c4b1702dc6c026290b25c6ff" +dependencies = [ + "psl-types", +] + +[[package]] +name = "psl-types" +version = "2.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33cb294fe86a74cbcf50d4445b37da762029549ebeea341421c7c70370f86cac" + [[package]] name = "quote" version = "1.0.35" diff --git a/bsky-sdk/Cargo.toml b/bsky-sdk/Cargo.toml index 79a0b66b..fa3f708b 100644 --- a/bsky-sdk/Cargo.toml +++ b/bsky-sdk/Cargo.toml @@ -17,6 +17,7 @@ atrium-api.workspace = true atrium-xrpc-client.workspace = true http.workspace = true ipld-core.workspace = true +psl = { version = "2.1.42", optional = true } regex.workspace = true serde = { workspace = true, features = ["derive"] } serde_json.workspace = true @@ -29,5 +30,5 @@ tokio = { workspace = true, features = ["macros", "rt-multi-thread"] } [features] default = ["rich-text"] -rich-text = ["unicode-segmentation"] +rich-text = ["psl", "unicode-segmentation"] config-toml = ["toml"] diff --git a/bsky-sdk/src/rich_text.rs b/bsky-sdk/src/rich_text.rs index 74ef2e8b..ce8bc75a 100644 --- a/bsky-sdk/src/rich_text.rs +++ b/bsky-sdk/src/rich_text.rs @@ -10,6 +10,8 @@ use detection::{detect_facets, FacetFeaturesItem}; use std::cmp::Ordering; use unicode_segmentation::UnicodeSegmentation; +const PUBLIC_API_ENDPOINT: &str = "https://public.api.bsky.app"; + #[derive(Debug, Clone, PartialEq, Eq)] pub struct RichTextSegment { text: String, @@ -181,11 +183,11 @@ impl RichText { facets.retain(|facet| facet.index.byte_start < facet.index.byte_end); } } - pub async fn 
detect_facets(&mut self, client: impl XrpcClient + Send + Sync) -> Result<()> { + pub async fn detect_facets(mut self, client: impl XrpcClient + Send + Sync) -> Result<Self> { let agent = BskyAgentBuilder::default() .client(client) .config(Config { - endpoint: "https://public.api.bsky.app".into(), + endpoint: PUBLIC_API_ENDPOINT.into(), ..Default::default() }) .build() @@ -224,7 +226,7 @@ impl RichText { } Some(facets) }; - Ok(()) + Ok(self) } } diff --git a/bsky-sdk/src/rich_text/detection.rs b/bsky-sdk/src/rich_text/detection.rs index 1f697b0e..51d8a4cd 100644 --- a/bsky-sdk/src/rich_text/detection.rs +++ b/bsky-sdk/src/rich_text/detection.rs @@ -1,8 +1,11 @@ use atrium_api::app::bsky::richtext::facet::{ByteSlice, Link, Tag}; +use psl; use regex::Regex; use std::sync::OnceLock; static RE_MENTION: OnceLock<Regex> = OnceLock::new(); +static RE_URL: OnceLock<Regex> = OnceLock::new(); +static RE_ENDING_PUNCTUATION: OnceLock<Regex> = OnceLock::new(); #[derive(Debug, Clone, PartialEq, Eq)] pub struct FacetWithoutResolution { @@ -45,5 +48,43 @@ pub fn detect_facets(text: &str) -> Vec<FacetWithoutResolution> { }); } } + // links + { + let re = RE_URL.get_or_init(|| { + Regex::new( + r"(?:^|\s|\()((?:https?:\/\/[\S]+)|(?:(?<domain>[a-z][a-z0-9]*(?:\.[a-z0-9]+)+)[\S]*))", + ) + .expect("invalid regex") + }); + for capture in re.captures_iter(text) { + let m = capture.get(1).expect("invalid capture"); + let mut uri = if let Some(domain) = capture.name("domain") { + if !psl::suffix(domain.as_str().as_bytes()) + .map_or(false, |suffix| suffix.is_known()) + { + continue; + } + format!("https://{}", m.as_str()) + } else { + m.as_str().into() + }; + let mut index = ByteSlice { + byte_end: m.end(), + byte_start: m.start(), + }; + // strip ending punctuation + if RE_ENDING_PUNCTUATION + .get_or_init(|| Regex::new(r"[.,;:!?]$").expect("invalid regex")) + .is_match(&uri) + { + uri.pop(); + index.byte_end -= 1; + } + facets.push(FacetWithoutResolution { + features: vec![FacetFeaturesItem::Link(Box::new(Link { uri }))], + index, + }); + } + } 
facets } diff --git a/bsky-sdk/src/rich_text/tests/detection.rs b/bsky-sdk/src/rich_text/tests/detection.rs index c4097585..d3df6626 100644 --- a/bsky-sdk/src/rich_text/tests/detection.rs +++ b/bsky-sdk/src/rich_text/tests/detection.rs @@ -136,10 +136,169 @@ async fn detect_facets() -> Result<()> { (" end", None), ], ), + ( + "start https://middle.com/foo/bar end", + vec![ + ("start ", None), + ( + "https://middle.com/foo/bar", + Some("https://middle.com/foo/bar"), + ), + (" end", None), + ], + ), + ( + "start https://middle.com/foo/bar?baz=bux end", + vec![ + ("start ", None), + ( + "https://middle.com/foo/bar?baz=bux", + Some("https://middle.com/foo/bar?baz=bux"), + ), + (" end", None), + ], + ), + ( + "start https://middle.com/foo/bar?baz=bux#hash end", + vec![ + ("start ", None), + ( + "https://middle.com/foo/bar?baz=bux#hash", + Some("https://middle.com/foo/bar?baz=bux#hash"), + ), + (" end", None), + ], + ), + ( + "https://start.com/foo/bar?baz=bux#hash middle end", + vec![ + ( + "https://start.com/foo/bar?baz=bux#hash", + Some("https://start.com/foo/bar?baz=bux#hash"), + ), + (" middle end", None), + ], + ), + ( + "start middle https://end.com/foo/bar?baz=bux#hash", + vec![ + ("start middle ", None), + ( + "https://end.com/foo/bar?baz=bux#hash", + Some("https://end.com/foo/bar?baz=bux#hash"), + ), + ], + ), + ( + "https://newline1.com\nhttps://newline2.com", + vec![ + ("https://newline1.com", Some("https://newline1.com")), + ("\n", None), + ("https://newline2.com", Some("https://newline2.com")), + ], + ), + ( + "👨‍👩‍👧‍👧 https://middle.com 👨‍👩‍👧‍👧", + vec![ + ("👨‍👩‍👧‍👧 ", None), + ("https://middle.com", Some("https://middle.com")), + (" 👨‍👩‍👧‍👧", None), + ], + ), + ( + "start middle.com end", + vec![ + ("start ", None), + ("middle.com", Some("https://middle.com")), + (" end", None), + ], + ), + ( + "start middle.com/foo/bar end", + vec![ + ("start ", None), + ("middle.com/foo/bar", 
Some("https://middle.com/foo/bar")), + (" end", None), + ], + ), + ( + "start middle.com/foo/bar?baz=bux end", + vec![ + ("start ", None), + ( + "middle.com/foo/bar?baz=bux", + Some("https://middle.com/foo/bar?baz=bux"), + ), + (" end", None), + ], + ), + ( + "start middle.com/foo/bar?baz=bux#hash end", + vec![ + ("start ", None), + ( + "middle.com/foo/bar?baz=bux#hash", + Some("https://middle.com/foo/bar?baz=bux#hash"), + ), + (" end", None), + ], + ), + ( + "start.com/foo/bar?baz=bux#hash middle end", + vec![ + ( + "start.com/foo/bar?baz=bux#hash", + Some("https://start.com/foo/bar?baz=bux#hash"), + ), + (" middle end", None), + ], + ), + ( + "start middle end.com/foo/bar?baz=bux#hash", + vec![ + ("start middle ", None), + ( + "end.com/foo/bar?baz=bux#hash", + Some("https://end.com/foo/bar?baz=bux#hash"), + ), + ], + ), + ( + "newline1.com\nnewline2.com", + vec![ + ("newline1.com", Some("https://newline1.com")), + ("\n", None), + ("newline2.com", Some("https://newline2.com")), + ], + ), + ( + "a example.com/index.php php link", + vec![ + ("a ", None), + ( + "example.com/index.php", + Some("https://example.com/index.php"), + ), + (" php link", None), + ], + ), + ( + "a trailing bsky.app: colon", + vec![ + ("a trailing ", None), + ("bsky.app", Some("https://bsky.app")), + (": colon", None), + ], + ), + ("not.. a..url ..here", vec![("not.. a..url ..here", None)]), + ("e.g.", vec![("e.g.", None)]), + ("something-cool.jpg", vec![("something-cool.jpg", None)]), + ("website.com.jpg", vec![("website.com.jpg", None)]), + ("e.g./foo", vec![("e.g./foo", None)]), + ("website.com.jpg/foo", vec![("website.com.jpg/foo", None)]), ]; for (input, expected) in test_cases { - let mut rt = RichText::new(input, None); - rt.detect_facets(MockClient).await?; + let rt = RichText::new(input, None).detect_facets(MockClient).await?; assert_eq!( rt.segments() .iter()