Skip to content

Commit

Permalink
Add link detection, add tests
Browse files Browse the repository at this point in the history
  • Loading branch information
sugyan committed Jun 9, 2024
1 parent 03f4c8f commit 35ebaeb
Show file tree
Hide file tree
Showing 5 changed files with 225 additions and 6 deletions.
16 changes: 16 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion bsky-sdk/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ atrium-api.workspace = true
atrium-xrpc-client.workspace = true
http.workspace = true
ipld-core.workspace = true
psl = { version = "2.1.42", optional = true }
regex.workspace = true
serde = { workspace = true, features = ["derive"] }
serde_json.workspace = true
Expand All @@ -29,5 +30,5 @@ tokio = { workspace = true, features = ["macros", "rt-multi-thread"] }

[features]
default = ["rich-text"]
rich-text = ["unicode-segmentation"]
rich-text = ["psl", "unicode-segmentation"]
config-toml = ["toml"]
8 changes: 5 additions & 3 deletions bsky-sdk/src/rich_text.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ use detection::{detect_facets, FacetFeaturesItem};
use std::cmp::Ordering;
use unicode_segmentation::UnicodeSegmentation;

const PUBLIC_API_ENDPOINT: &str = "https://public.api.bsky.app";

#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RichTextSegment {
text: String,
Expand Down Expand Up @@ -181,11 +183,11 @@ impl RichText {
facets.retain(|facet| facet.index.byte_start < facet.index.byte_end);
}
}
pub async fn detect_facets(&mut self, client: impl XrpcClient + Send + Sync) -> Result<()> {
pub async fn detect_facets(mut self, client: impl XrpcClient + Send + Sync) -> Result<Self> {
let agent = BskyAgentBuilder::default()
.client(client)
.config(Config {
endpoint: "https://public.api.bsky.app".into(),
endpoint: PUBLIC_API_ENDPOINT.into(),
..Default::default()
})
.build()
Expand Down Expand Up @@ -224,7 +226,7 @@ impl RichText {
}
Some(facets)
};
Ok(())
Ok(self)
}
}

Expand Down
41 changes: 41 additions & 0 deletions bsky-sdk/src/rich_text/detection.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
use atrium_api::app::bsky::richtext::facet::{ByteSlice, Link, Tag};
use psl;
use regex::Regex;
use std::sync::OnceLock;

static RE_MENTION: OnceLock<Regex> = OnceLock::new();
static RE_URL: OnceLock<Regex> = OnceLock::new();
static RE_ENDING_PUNCTUATION: OnceLock<Regex> = OnceLock::new();

#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FacetWithoutResolution {
Expand Down Expand Up @@ -45,5 +48,43 @@ pub fn detect_facets(text: &str) -> Vec<FacetWithoutResolution> {
});
}
}
// links
// Detects explicit `http://` / `https://` URLs and bare domain names in `text`
// and pushes one link facet per accepted match onto `facets`.
{
// The pattern is compiled on first use and cached in `RE_URL`.
// Capture group 1 is the link text. It must be preceded by the start of the
// text, a whitespace character, or `(`, and is either
//   * `http://` or `https://` followed by non-whitespace characters, or
//   * a dotted lowercase name (captured as `domain`) followed by any further
//     non-whitespace characters (path, query, fragment, ...).
let re = RE_URL.get_or_init(|| {
Regex::new(
r"(?:^|\s|\()((?:https?:\/\/[\S]+)|(?:(?<domain>[a-z][a-z0-9]*(?:\.[a-z0-9]+)+)[\S]*))",
)
.expect("invalid regex")
});
for capture in re.captures_iter(text) {
let m = capture.get(1).expect("invalid capture");
// The `domain` group only participates in the scheme-less alternative.
let mut uri = if let Some(domain) = capture.name("domain") {
// A bare name is only treated as a link when the public suffix list
// reports its suffix as known; otherwise the match is skipped.
if !psl::suffix(domain.as_str().as_bytes())
.map_or(false, |suffix| suffix.is_known())
{
continue;
}
// No scheme was written in the text, so `https://` is prepended.
format!("https://{}", m.as_str())
} else {
m.as_str().into()
};
// The facet spans capture group 1 only, so the leading delimiter
// (whitespace or `(`) is excluded.
let mut index = ByteSlice {
byte_end: m.end(),
byte_start: m.start(),
};
// strip ending punctuation
// At most one trailing character from `.,;:!?` is removed from the URI, and
// the facet is shortened by one byte to match (each of these characters is
// a single byte in UTF-8).
// NOTE(review): the URL pattern accepts `(` as a leading delimiter, but a
// trailing `)` is matched by `\S` and is not stripped here, so "(example.com)"
// appears to yield a link ending in `)` - confirm whether that is intended.
if RE_ENDING_PUNCTUATION
.get_or_init(|| Regex::new(r"[.,;:!?]$").expect("invalid regex"))
.is_match(&uri)
{
uri.pop();
index.byte_end -= 1;
}
facets.push(FacetWithoutResolution {
features: vec![FacetFeaturesItem::Link(Box::new(Link { uri }))],
index,
});
}
}
facets
}
163 changes: 161 additions & 2 deletions bsky-sdk/src/rich_text/tests/detection.rs
Original file line number Diff line number Diff line change
Expand Up @@ -136,10 +136,169 @@ async fn detect_facets() -> Result<()> {
(" end", None),
],
),
(
"start https://middle.com/foo/bar end",
vec![
("start ", None),
(
"https://middle.com/foo/bar",
Some("https://middle.com/foo/bar"),
),
(" end", None),
],
),
(
"start https://middle.com/foo/bar?baz=bux end",
vec![
("start ", None),
(
"https://middle.com/foo/bar?baz=bux",
Some("https://middle.com/foo/bar?baz=bux"),
),
(" end", None),
],
),
(
"start https://middle.com/foo/bar?baz=bux#hash end",
vec![
("start ", None),
(
"https://middle.com/foo/bar?baz=bux#hash",
Some("https://middle.com/foo/bar?baz=bux#hash"),
),
(" end", None),
],
),
(
"https://start.com/foo/bar?baz=bux#hash middle end",
vec![
(
"https://start.com/foo/bar?baz=bux#hash",
Some("https://start.com/foo/bar?baz=bux#hash"),
),
(" middle end", None),
],
),
(
"start middle https://end.com/foo/bar?baz=bux#hash",
vec![
("start middle ", None),
(
"https://end.com/foo/bar?baz=bux#hash",
Some("https://end.com/foo/bar?baz=bux#hash"),
),
],
),
(
"https://newline1.com\nhttps://newline2.com",
vec![
("https://newline1.com", Some("https://newline1.com")),
("\n", None),
("https://newline2.com", Some("https://newline2.com")),
],
),
(
"👨‍👩‍👧‍👧 https://middle.com 👨‍👩‍👧‍👧",
vec![
("👨‍👩‍👧‍👧 ", None),
("https://middle.com", Some("https://middle.com")),
(" 👨‍👩‍👧‍👧", None),
],
),
(
"start middle.com end",
vec![
("start ", None),
("middle.com", Some("https://middle.com")),
(" end", None),
],
),
(
"start middle.com/foo/bar end",
vec![
("start ", None),
("middle.com/foo/bar", Some("https://middle.com/foo/bar")),
(" end", None),
],
),
(
"start middle.com/foo/bar?baz=bux end",
vec![
("start ", None),
(
"middle.com/foo/bar?baz=bux",
Some("https://middle.com/foo/bar?baz=bux"),
),
(" end", None),
],
),
(
"start middle.com/foo/bar?baz=bux#hash end",
vec![
("start ", None),
(
"middle.com/foo/bar?baz=bux#hash",
Some("https://middle.com/foo/bar?baz=bux#hash"),
),
(" end", None),
],
),
(
"start.com/foo/bar?baz=bux#hash middle end",
vec![
(
"start.com/foo/bar?baz=bux#hash",
Some("https://start.com/foo/bar?baz=bux#hash"),
),
(" middle end", None),
],
),
(
"start middle end.com/foo/bar?baz=bux#hash",
vec![
("start middle ", None),
(
"end.com/foo/bar?baz=bux#hash",
Some("https://end.com/foo/bar?baz=bux#hash"),
),
],
),
(
"newline1.com\nnewline2.com",
vec![
("newline1.com", Some("https://newline1.com")),
("\n", None),
("newline2.com", Some("https://newline2.com")),
],
),
(
"a example.com/index.php php link",
vec![
("a ", None),
(
"example.com/index.php",
Some("https://example.com/index.php"),
),
(" php link", None),
],
),
(
"a trailing bsky.app: colon",
vec![
("a trailing ", None),
("bsky.app", Some("https://bsky.app")),
(": colon", None),
],
),
("not.. a..url ..here", vec![("not.. a..url ..here", None)]),
("e.g.", vec![("e.g.", None)]),
("something-cool.jpg", vec![("something-cool.jpg", None)]),
("website.com.jpg", vec![("website.com.jpg", None)]),
("e.g./foo", vec![("e.g./foo", None)]),
("website.com.jpg/foo", vec![("website.com.jpg/foo", None)]),
];
for (input, expected) in test_cases {
let mut rt = RichText::new(input, None);
rt.detect_facets(MockClient).await?;
let rt = RichText::new(input, None).detect_facets(MockClient).await?;
assert_eq!(
rt.segments()
.iter()
Expand Down

0 comments on commit 35ebaeb

Please sign in to comment.