Skip to content

Commit

Permalink
Merge branch 'bsky' into 'master'
Browse files Browse the repository at this point in the history
Bluesky graph collection

See merge request imi-sad/spikexplore!3
  • Loading branch information
naspert authored and GitLab committed Jan 22, 2025
2 parents fcc5ef6 + 9d2a4bb commit 329b9fe
Show file tree
Hide file tree
Showing 12 changed files with 368 additions and 48 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/black.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,5 +11,5 @@ jobs:
with:
options: "--check -l 150"
src: "."
version: "~= 22.12.0"
version: "~= 24.8.0"

8 changes: 5 additions & 3 deletions .github/workflows/main.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.8", "3.9", "3.10"]
python-version: ["3.10", "3.11"]

steps:
- uses: actions/checkout@v2
Expand All @@ -19,8 +19,10 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
python -m pip install pytest
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
python -m pip install .[dev]
- name: Run tests
env:
BSKY_HANDLE: ${{ secrets.BSKY_HANDLE }}
BSKY_PASSWORD: ${{ secrets.BSKY_PASSWORD }}
run: |
python -m pytest tests
5 changes: 2 additions & 3 deletions .gitlab-ci.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
image: python:3.9
image: python:3.11

stages:
- format
Expand All @@ -20,8 +20,7 @@ before_script:
- python -m pip install virtualenv
- virtualenv venv
- source venv/bin/activate
- pip install --upgrade -r requirements.txt
- pip install black pytest
- pip install .[dev]

Formatting:
stage: format
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,3 +24,4 @@ So far this implementation supports:
- ~~Twitter using [v1 API](https://developer.twitter.com/en/docs/twitter-api/api-reference-index) through [Twython](https://twython.readthedocs.io/en/latest/)~~
Twitter API is no longer available unless you pay. Latest version supporting it is v0.0.12.
- Wikipedia using [Mediawiki API](https://www.mediawiki.org/wiki/API:Main_page) through [Wikipedia-API](https://pypi.org/project/Wikipedia-API/)
- Bluesky using [ATProto](https://atproto.blue/en/latest/)
42 changes: 42 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
[build-system]
requires = ["setuptools>=61.2"]
build-backend = "setuptools.build_meta"

[project]
name = "spikexplore"
version = "0.2.0"
authors = [
{name = "Nicolas Aspert", email = "[email protected]"},
{name = "Benjamin Ricaud", email = "[email protected]"},
]
license = {text = "Apache license"}
description = "Graph exploration using inhomogeneous filtered diffusion"
classifiers = [
"Development Status :: 4 - Beta",
"Intended Audience :: Science/Research",
"License :: OSI Approved :: Apache License",
"Operating System :: POSIX :: Linux",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.8",
]
urls = {Homepage = "https://gitlab.switch.ch/imi-sad/spikexplore"}
requires-python = ">=3.8"
dependencies = [
"pandas",
"numpy",
"networkx",
"tqdm",
"atproto",
"wikipedia-api",
"python-louvain",
]
[project.optional-dependencies]
dev = [
"pytest",
"black==24.8.0"
]
[tool.setuptools]
packages = ["spikexplore", "spikexplore.backends"]
script-files = []
include-package-data = false
8 changes: 0 additions & 8 deletions requirements.txt

This file was deleted.

24 changes: 0 additions & 24 deletions setup.py

This file was deleted.

271 changes: 271 additions & 0 deletions spikexplore/backends/bluesky.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,271 @@
from atproto import Client, client_utils
import networkx as nx
import time
import logging
import pandas as pd
from datetime import datetime, timedelta, timezone

from atproto_client.exceptions import BadRequestError

from spikexplore.NodeInfo import NodeInfo
from spikexplore.graph import add_node_attributes, add_edges_attributes

# Module-level logger, named after this module so records can be filtered per backend.
logger = logging.getLogger(__name__)


class BlueskyCredentials:
    """Plain holder for the login data of a Bluesky account."""

    def __init__(self, handle, password):
        """Store the account ``handle`` and its ``password`` unchanged."""
        self.handle, self.password = handle, password


class SkeetsGetter:
    """Fetch posts ("skeets") and profile handles from Bluesky through an atproto client."""

    def __init__(self, credentials, config):
        """Log in to Bluesky and initialise the caches.

        Args:
            credentials: object exposing ``handle`` and ``password`` attributes.
            config: settings object; this class reads ``max_day_old`` and
                ``max_skeets_per_user`` from it.
        """
        self.config = config
        self.bsky_client = Client()
        self.bsky_client.login(credentials.handle, credentials.password)
        # did -> handle, filled by get_profile() and get_user_skeets()
        self.profiles_cache = {}
        # facet kind -> name of the attribute holding the facet value
        self.features_attrs = {"mention": "did", "tag": "tag", "link": "uri"}

    def _filter_old_skeets(self, skeets):
        """Keep only the feed items created within the last ``config.max_day_old`` days.

        Args:
            skeets: iterable of feed items exposing ``post.record["created_at"]``
                as an ISO-8601 string.

        Returns:
            ``skeets`` itself when ``max_day_old`` is falsy, otherwise a new list
            holding the recent items in their original order.
        """
        max_day_old = self.config.max_day_old
        if not max_day_old:
            return skeets

        # Compare timezone-aware datetimes: the timestamps are expressed in UTC, so a
        # naive local "now" would shift the cut-off by the machine's UTC offset.
        days_limit = datetime.now(timezone.utc) - timedelta(days=max_day_old)

        def is_recent(item):
            created = datetime.fromisoformat(item.post.record["created_at"].replace("Z", "+00:00"))
            if created.tzinfo is None:
                # no offset in the string: treat it as UTC like the other timestamps
                created = created.replace(tzinfo=timezone.utc)
            return created >= days_limit

        return [item for item in skeets if is_recent(item)]

    def get_profile(self, did):
        """Return the handle matching ``did``, or ``None`` when it cannot be resolved.

        Successful lookups are stored in ``profiles_cache``. Any error raised by the
        client is logged and turned into ``None`` (best-effort lookup).
        """
        cached = self.profiles_cache.get(did)
        if cached is not None:
            return cached
        try:
            profile = self.bsky_client.get_profile(did)
            if profile is None:
                return None
            handle = profile.handle
        except Exception as e:
            logger.error("Error in getting profile: %s", e)
            return None
        # No `finally: return None` here: it would override the successful return.
        self.profiles_cache[did] = handle
        return handle

    def facet_data(self, skeet, data):
        """Extract the values of one kind of rich-text facet from a post.

        Args:
            skeet: post object; its ``record.facets`` is inspected when present.
            data: facet kind, one of the keys of ``features_attrs``
                (``"mention"``, ``"tag"`` or ``"link"``).

        Returns:
            List of values (did, tag or uri) taken from the first feature of every
            facet whose type is ``app.bsky.richtext.facet#<data>``; empty when the
            post has no record or no facets.
        """
        record = getattr(skeet, "record", None)
        facets = getattr(record, "facets", None)
        if not facets:
            return []
        wanted_type = f"app.bsky.richtext.facet#{data}"
        return [
            getattr(facet.features[0], self.features_attrs[data])
            for facet in facets
            # a facet without features carries nothing to extract
            if facet.features and facet.features[0].py_type == wanted_type
        ]

    @staticmethod
    def _reply_targets(post):
        """Return ``[handle]`` of the author being replied to, or ``[]`` when unavailable.

        Always a list so that it can be concatenated with the list of mentions.
        """
        # NOTE(review): the reply reference may be exposed on the feed item rather than
        # on the post view - confirm against the atproto models.
        reply = getattr(post, "reply", None)
        parent = getattr(reply, "parent", None)
        author = getattr(parent, "author", None)
        handle = getattr(author, "handle", None)
        return [handle] if handle else []

    def _skeet_metadata(self, post):
        """Build the flat metadata dictionary describing one post."""
        author = post.author
        return {
            "user_did": author.did,
            "user": author.handle,
            "name": author.display_name,
            "mentions": self.facet_data(post, "mention"),
            "hashtags": self.facet_data(post, "tag"),
            "links": self.facet_data(post, "link"),
            "repost_count": post.repost_count,
            "favorite_count": post.like_count,
            "reply_to": self._reply_targets(post),
            "created_at": post.record.created_at,
            "account_creation": author.created_at,
        }

    def get_user_skeets(self, username):
        """Collect the recent posts found in the feed of a user.

        Args:
            username: handle or did passed as ``actor`` to ``get_author_feed``.

        Returns:
            Tuple ``(user_skeets, skeets_metadata)``, both keyed by post cid: the
            first maps to the post objects, the second to metadata dictionaries.
            Both are empty when the request fails (the error is logged).
        """
        count = self.config.max_skeets_per_user

        try:
            feed = self.bsky_client.get_author_feed(actor=username, limit=count).feed
            # drop posts older than the configured limit
            recent_items = self._filter_old_skeets(feed)
            user_skeets = {item.post.cid: item.post for item in recent_items}

            # every author seen here saves a later profile lookup
            for post in user_skeets.values():
                self.profiles_cache.setdefault(post.author.did, post.author.handle)

            skeets_metadata = {cid: self._skeet_metadata(post) for cid, post in user_skeets.items()}
            return user_skeets, skeets_metadata
        except BadRequestError as e:
            logger.error(f"Error in getting user skeets: code {e.response.status_code} - {e.response.content.message}")
            return {}, {}
        except Exception as e:
            logger.error(f"Error in getting user skeets: {e}")
            return {}, {}

    def reshape_node_data(self, node_df):
        """Reduce a per-skeet table to one row per user.

        Args:
            node_df: DataFrame indexed by skeet id with at least the columns
                ``user_did``, ``user``, ``name``, ``spikyball_hop`` and
                ``account_creation``.

        Returns:
            DataFrame indexed by ``user`` holding the per-column maximum; the
            former index is kept as the ``last_skeet_id`` column.
        """
        columns = ["user_did", "user", "name", "spikyball_hop", "account_creation"]
        per_skeet = node_df[columns].reset_index()
        return per_skeet.groupby("user").max().rename(columns={"index": "last_skeet_id"})


class BlueskyNetwork:
    """Graph backend exposing Bluesky users as nodes and mentions/replies/reposts as edges."""

    class BlueskyNodeInfo(NodeInfo):
        """Per-node data gathered while exploring: hashtags, links, kept skeets and their metadata."""

        def __init__(self, user_hashtags=None, user_skeets=None, user_links=None, skeets_meta=None):
            """Store the given containers, replacing missing ones by fresh empty objects.

            Args:
                user_hashtags: dict ``user -> {hashtag: count}``.
                user_skeets: dict ``cid -> post``.
                user_links: dict ``user -> {link: count}``.
                skeets_meta: DataFrame of skeet metadata indexed by cid.
            """
            self.user_hashtags = user_hashtags if user_hashtags else {}
            self.user_links = user_links if user_links else {}
            self.user_skeets = user_skeets if user_skeets else {}
            # a fresh frame per instance instead of one default object shared by all
            self.skeets_meta = skeets_meta if skeets_meta is not None else pd.DataFrame()

        def update(self, new_info):
            """Merge the hashtags, skeets and links of ``new_info`` into this object."""
            self.user_hashtags.update(new_info.user_hashtags)
            self.user_skeets.update(new_info.user_skeets)
            # links come from user_links (user_skeets was merged here by mistake)
            self.user_links.update(new_info.user_links)
            # NOTE(review): skeets_meta is not merged, so get_nodes() only reflects the
            # metadata given at construction - confirm this is intended.

        def get_nodes(self):
            """Return the skeet metadata table."""
            return self.skeets_meta

    def __init__(self, credentials, config):
        """Create the underlying ``SkeetsGetter`` (which logs in) and keep ``config``.

        Args:
            credentials: object exposing ``handle`` and ``password``.
            config: settings object; this class reads ``min_mentions``,
                ``users_to_remove`` and ``nb_popular_skeets`` from it.
        """
        self.skeets_getter = SkeetsGetter(credentials, config)
        self.config = config

    def create_node_info(self):
        """Return an empty ``BlueskyNodeInfo``."""
        return self.BlueskyNodeInfo()

    def get_neighbors(self, user):
        """Fetch the skeets of ``user`` and derive node information and outgoing edges.

        Returns:
            Tuple ``(node_info, edges_df)``. When edges exist, ``edges_df`` has
            ``source`` and ``target`` as regular columns. A non-string ``user``
            yields an empty node info and an empty DataFrame.
        """
        if not isinstance(user, str):
            return self.BlueskyNodeInfo(), pd.DataFrame()
        skeets_dic, skeets_meta = self.skeets_getter.get_user_skeets(user)
        edges_df, node_info = self.edges_nodes_from_user(user, skeets_meta, skeets_dic)

        # turn the (source, target) index into regular columns
        if not edges_df.empty:
            edges_df.index.names = ["source", "target"]
            edges_df.reset_index(level=["source", "target"], inplace=True)

        return node_info, edges_df

    def filter(self, node_info, edges_df):
        """Apply the edge filter; ``node_info`` is returned unchanged."""
        edges_df = self.filter_edges(edges_df)
        return node_info, edges_df

    def filter_edges(self, edges_df):
        """Keep the edges whose ``weight`` is at least ``config.min_mentions``."""
        if edges_df.empty:
            return edges_df
        return edges_df[edges_df["weight"] >= self.config.min_mentions]

    def neighbors_list(self, edges_df):
        """Return the ``target`` column of ``edges_df`` as a list."""
        return edges_df["target"].tolist()

    def neighbors_with_weights(self, edges_df):
        """Return a dict mapping every target user to the constant weight 1."""
        if edges_df.empty:
            return {}
        return dict.fromkeys(self.neighbors_list(edges_df), 1)

    ###############################################################
    # Functions for extracting skeet info from the bluesky API
    ###############################################################

    def edges_nodes_from_user(self, user, skeets_meta, skeets_dic):
        """Build the edge table and the node information of ``user``.

        Returns:
            Tuple ``(edges_df, node_info)``.
        """
        edges_df = self.get_edges(user, skeets_meta)
        user_info = self.get_nodes_properties(skeets_meta, skeets_dic)
        return edges_df, user_info

    def did_to_handle(self, did):
        """Resolve a did into a handle through the skeets getter (``None`` on failure)."""
        return self.skeets_getter.get_profile(did)

    def match_usernames(self, meta_df):
        """Replace mentions given as dids by handles and drop the unresolved ones.

        Args:
            meta_df: DataFrame with a ``mentions`` column holding one mention per row.

        Returns:
            New DataFrame (the input is left untouched) without the rows whose
            mention is missing or could not be resolved.
        """

        def resolve(mention):
            # only strings starting with "did:" need a lookup
            if isinstance(mention, str) and mention.startswith("did:"):
                return self.did_to_handle(mention)
            return mention

        resolved = meta_df["mentions"].map(resolve)
        return meta_df.assign(mentions=resolved).dropna(subset=["mentions"])

    @staticmethod
    def _aggregate_edges(df, keys):
        """Group ``df`` by the two ``keys`` columns into (source, target) edges.

        Returns:
            DataFrame indexed by ``(source, target)`` with the columns ``cid`` (list
            of row labels of the group) and ``weight`` (group size), or ``None``
            when there is nothing to aggregate.
        """
        grouped = df.groupby(keys).apply(lambda x: (x.index.tolist(), len(x.index)), include_groups=False)
        if grouped.empty:
            return None
        edges = pd.DataFrame(grouped.tolist(), index=grouped.index).rename(columns={0: "cid", 1: "weight"})
        edges.index.names = ["source", "target"]
        return edges

    def get_edges(self, user, skeets_meta):
        """Build the weighted edges leaving ``user``.

        Two kinds of edges are merged: author -> mentioned or replied-to user, and
        ``user`` -> author of a skeet of the feed written by somebody else (repost).

        Args:
            user: the user whose feed was collected.
            skeets_meta: dict ``cid -> metadata`` as returned by
                ``SkeetsGetter.get_user_skeets``.

        Returns:
            DataFrame indexed by ``(source, target)`` with the columns ``cid`` and
            ``weight``; a plain empty DataFrame when ``skeets_meta`` is empty.
        """
        if not skeets_meta:
            return pd.DataFrame()
        meta_df = pd.DataFrame.from_dict(skeets_meta, orient="index")

        # Create the user -> mention table from the list of skeets; a reply is a kind of mention
        mentions_df = (
            meta_df.drop(columns=["reply_to", "mentions"])
            .assign(full_mentions=meta_df["mentions"] + meta_df["reply_to"])
            .explode("full_mentions")
            # only a missing mention disqualifies a row, not e.g. a missing display name
            .dropna(subset=["full_mentions"])
            .rename(columns={"full_mentions": "mentions"})
        )
        # mentions can be dids so need to translate that first into user handles
        mentions_df = self.match_usernames(mentions_df)
        # Some bots to be removed from the collection, as well as self-mentions
        users_to_remove = self.config.users_to_remove
        unwanted = mentions_df["mentions"].isin(users_to_remove) | mentions_df["mentions"].isin(mentions_df["user"])
        edge_mentions_df = self._aggregate_edges(mentions_df[~unwanted], ["user", "mentions"])

        # Now get reposts: skeets of the feed authored by another user (no self edges)
        repost_df = meta_df[meta_df["user"] != user].assign(source_user=user)
        edge_repost_df = self._aggregate_edges(repost_df, ["source_user", "user"])

        # Only concatenate real edge tables: mixing in a column-shaped empty frame loses
        # the index names and makes the groupby below drop every edge.
        edge_frames = [frame for frame in (edge_mentions_df, edge_repost_df) if frame is not None]
        if not edge_frames:
            no_edges = pd.MultiIndex.from_arrays([[], []], names=["source", "target"])
            return pd.DataFrame(columns=["cid", "weight"], index=no_edges)
        return pd.concat(edge_frames).groupby(["source", "target"]).sum()

    @staticmethod
    def _count_values(meta_df, column):
        """Count the items of the list-valued ``column``, most frequent first."""
        exploded = meta_df.explode(column).dropna(subset=[column])
        return exploded.groupby(column).size().sort_values(ascending=False).to_dict()

    def get_nodes_properties(self, skeets_meta, skeets_dic):
        """Summarise the skeets of a user into a ``BlueskyNodeInfo``.

        Args:
            skeets_meta: dict ``cid -> metadata``.
            skeets_dic: dict ``cid -> post``.

        Returns:
            Node info holding the hashtag and link counts plus the
            ``config.nb_popular_skeets`` most reposted skeets and their metadata.
        """
        if not skeets_meta:
            return self.BlueskyNodeInfo({}, {}, {}, pd.DataFrame())
        nb_popular_skeets = self.config.nb_popular_skeets
        # global properties, most reposted first
        meta_df = pd.DataFrame.from_dict(skeets_meta, orient="index").sort_values("repost_count", ascending=False)
        # hashtags and links statistics
        hashtag_counts = self._count_values(meta_df, "hashtags")
        link_counts = self._count_values(meta_df, "links")
        # NOTE(review): this is the author of the most reposted skeet of the feed, which is
        # presumably not always the explored user - confirm.
        user_name = meta_df["user"].iloc[0]
        # Get most popular skeets of user
        skeets_meta_kept = meta_df.head(nb_popular_skeets)
        skeets_kept = {cid: skeets_dic[cid] for cid in skeets_meta_kept.index.to_list()}
        return self.BlueskyNodeInfo(
            user_hashtags={user_name: hashtag_counts},
            user_skeets=skeets_kept,
            user_links={user_name: link_counts},
            skeets_meta=skeets_meta_kept,
        )

    #####################################################
    ## Utils functions for the graph
    #####################################################

    def add_graph_attributes(self, g, nodes_df, edges_df, nodes_info):
        """Attach edge and node attributes to the graph ``g`` and return it.

        Edge attributes come from ``edges_df`` without the ``cid`` column; node
        attributes come from the per-user reshaping of ``nodes_df`` plus the
        hashtags of ``nodes_info``, stored under ``all_hashtags``.
        """
        g = add_edges_attributes(g, edges_df, drop_cols=["cid"])
        g = add_node_attributes(g, self.skeets_getter.reshape_node_data(nodes_df), attr_dic=nodes_info.user_hashtags, attr_name="all_hashtags")
        return g
Loading

0 comments on commit 329b9fe

Please sign in to comment.