From 7fde7b4ff19724f1ed72407a2882b5ecbffd9af0 Mon Sep 17 00:00:00 2001 From: Nicolas Aspert Date: Thu, 19 Dec 2024 18:19:43 +0100 Subject: [PATCH 01/11] wip collect bluesky data --- spikexplore/backends/bluesky.py | 245 ++++++++++++++++++++++++++++++++ spikexplore/config.py | 7 +- tests/bsky_test.py | 37 +++++ 3 files changed, 285 insertions(+), 4 deletions(-) create mode 100644 spikexplore/backends/bluesky.py create mode 100644 tests/bsky_test.py diff --git a/spikexplore/backends/bluesky.py b/spikexplore/backends/bluesky.py new file mode 100644 index 0000000..6fd2371 --- /dev/null +++ b/spikexplore/backends/bluesky.py @@ -0,0 +1,245 @@ +from atproto import Client, client_utils +import networkx as nx +import time +import logging +import pandas as pd +from datetime import datetime, timedelta +from spikexplore.NodeInfo import NodeInfo +from spikexplore.graph import add_node_attributes, add_edges_attributes + +logger = logging.getLogger(__name__) + +class BlueskyCredentials: + def __init__(self, handle, password): + self.handle = handle + self.password = password + + +class SkeetsGetter: + def __init__(self, credentials, config): + # Instantiate an object + self.config = config + self.bsky_client = Client() + self.bsky_client.login(credentials.handle, credentials.password) + self.profiles_cache = {} + self.features_attrs = {"mention": "did", "tag": "tag", "link": "uri"} + + def _filter_old_skeets(self, skeets): + max_day_old = self.config.max_day_old + if not max_day_old: + return skeets + + days_limit = datetime.now() - timedelta(days=max_day_old) + skeets_filt = filter(lambda t: datetime.strptime(t.post.record["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ") >= days_limit, skeets) + return list(skeets_filt) + + + def get_profile(self, did): + handle = self.profiles_cache.get(did) + if handle is not None: + return handle + + p = self.bsky_client.get_profile(did) + if p is not None: + self.profiles_cache[did] = p.handle + return p.handle + return None + + def facet_data(self, 
skeet, data): + if not hasattr(skeet, "record"): + return [] + if skeet.record.facets is None: + return [] + return [getattr(f.features[0], self.features_attrs[data]) for f in + skeet.record.facets if f.features[0].py_type == f"app.bsky.richtext.facet#{data}"] + + def get_user_skeets(self, username): + # Collect skeets from a username/did + + count = self.config.max_skeets_per_user + + # Test if ok + try: + user_skeets_raw = self.bsky_client.get_author_feed( + actor=username, limit=count + ).feed + # remove old tweets + user_skeets_filt = self._filter_old_skeets(user_skeets_raw) + # make a dictionary + user_skeets = {x.post.cid: x.post for x in user_skeets_filt} + + # update profile cache + for v in user_skeets.items(): + if v[1].author.did not in self.profiles_cache: + self.profiles_cache[v[1].author.did] = v[1].author.handle + + skeets_metadata = map( + lambda x: ( + x[0], + { + "user_did": x[1].author.did, + "user": x[1].author.handle, + "name": x[1].author.display_name, + "mentions": self.facet_data(x[1], "mention"), + "hashtags": self.facet_data(x[1], "tag"), + "links": self.facet_data(x[1], "link"), + "repost_count": x[1].repost_count, + "favorite_count": x[1].like_count, + "created_at": x[1].record.created_at, + "account_creation": x[1].author.created_at, + }, + ), + user_skeets.items(), + ) + return user_skeets, dict(skeets_metadata) + except Exception as e: + logger.error("Error in getting user skeets: ", e) + return {}, {} + + + def reshape_node_data(self, node_df): + # user name user_details mentions hashtags retweet_count favorite_count + # created_at account_creation account_followers account_following account_statuses account_favourites + # account_verified account_default_profile account_default_profile_image spikyball_hop + node_df = node_df[ + [ + "user_did", + "user", + "name", + "spikyball_hop", + "account_creation", + ] + ] + node_df = node_df.reset_index().groupby("user_did").max().rename(columns={"index": "max_tweet_id"}) + return node_df + + 
+class BlueskyNetwork: + class BlueskyNodeInfo(NodeInfo): + def __init__(self, user_hashtags=None, user_skeets=None, user_links=None, skeets_meta=pd.DataFrame()): + self.user_hashtags = user_hashtags if user_hashtags else {} + self.user_links = user_links if user_links else {} + self.user_skeets = user_skeets if user_skeets else {} + self.skeets_meta = skeets_meta + + def update(self, new_info): + self.user_hashtags.update(new_info.user_hashtags) + self.user_skeets.update(new_info.user_skeets) + self.user_links.update(new_info.user_skeets) + + def get_nodes(self): + return self.skeets_meta + + def __init__(self, credentials, config): + self.skeets_getter = SkeetsGetter(credentials, config) + self.config = config + + def create_node_info(self): + return self.BlueskyNodeInfo() + + def get_neighbors(self, user): + if not isinstance(user, str): + return self.BlueskyNodeInfo(), pd.DataFrame() + skeets_dic, skeets_meta = self.skeets_getter.get_user_skeets(user) + edges_df, node_info = self.edges_nodes_from_user(skeets_meta, skeets_dic) + + # replace user and mentions by source and target + if not edges_df.empty: + edges_df.index.names = ["source", "target"] + edges_df.reset_index(level=["source", "target"], inplace=True) + + return node_info, edges_df + + def filter(self, node_info, edges_df): + # filter edges according to node properties + # filter according to edges properties + edges_df = self.filter_edges(edges_df) + return node_info, edges_df + + def filter_edges(self, edges_df): + # filter edges according to their properties + if edges_df.empty: + return edges_df + return edges_df[edges_df["weight"] >= self.config.min_mentions] + + def neighbors_list(self, edges_df): + if edges_df.empty: + return edges_df + users_connected = edges_df["target"].tolist() + return users_connected + + def neighbors_with_weights(self, edges_df): + user_list = self.neighbors_list(edges_df) + return dict.fromkeys(user_list, 1) + + 
############################################################### + # Functions for extracting skeet info from the bluesky API + ############################################################### + + def edges_nodes_from_user(self, skeets_meta, skeets_dic): + # Make an edge and node property dataframes + edges_df = self.get_edges(skeets_meta) + user_info = self.get_nodes_properties(skeets_meta, skeets_dic) + return edges_df, user_info + + def did_to_handle(self, did): + return self.skeets_getter.get_profile(did) + + def match_usernames(self, meta_df): + mask = meta_df['mentions'].str.startswith("did:") + meta_df.loc[mask, 'mentions'] = meta_df.loc[mask, 'mentions'].apply(self.did_to_handle) + + return meta_df.dropna(subset=['mentions']) + + def get_edges(self, skeets_meta): + if not skeets_meta: + return pd.DataFrame() + # Create the user -> mention table with their properties fom the list of tweets of a user + meta_df = pd.DataFrame.from_dict(skeets_meta, orient="index").explode("mentions").dropna() + # Some bots to be removed from the collection + users_to_remove = self.config.users_to_remove + + # mentions can be dids so need to translate that first into user handles + meta_df = self.match_usernames(meta_df) + filtered_meta_df = meta_df[~meta_df["mentions"].isin(users_to_remove) & ~meta_df["mentions"].isin(meta_df["user"])] + + # group by mentions and keep list of tweets for each mention + tmp = filtered_meta_df.groupby(["user", "mentions"]).apply(lambda x: (x.index.tolist(), len(x.index))) + if tmp.empty: + return tmp + edge_df = pd.DataFrame(tmp.tolist(), index=tmp.index).rename(columns={0: "cid", 1: "weight"}) + return edge_df + + def get_nodes_properties(self, skeets_meta, skeets_dic): + if not skeets_meta: + return self.BlueskyNodeInfo({}, {}, {}, pd.DataFrame()) + nb_popular_skeets = self.config.nb_popular_skeets + # global properties + meta_df = pd.DataFrame.from_dict(skeets_meta, orient="index").sort_values("repost_count", ascending=False) + # hashtags 
statistics + ht_df = meta_df.explode("hashtags").dropna() + htgb = ht_df.groupby(["hashtags"]).size() + user_hashtags = pd.DataFrame(htgb).rename(columns={0: "count"}).sort_values("count", ascending=False).to_dict() + links_df = meta_df.explode("links").dropna() + links = links_df.groupby(["links"]).size() + user_links = pd.DataFrame(links).rename(columns={0: "count"}).sort_values("count", ascending=False).to_dict() + user_name = meta_df["user"].iloc[0] + skeets_meta_kept = meta_df.head(nb_popular_skeets) + skeets_kept = {k: skeets_dic[k] for k in skeets_meta_kept.index.to_list()} + # Get most popular tweets of user + return self.BlueskyNodeInfo(user_hashtags={user_name: user_hashtags["count"]}, user_skeets=skeets_kept, + user_links={user_name: user_links["count"]}, skeets_meta=skeets_meta_kept) + + ##################################################### + ## Utils functions for the graph + ##################################################### + + def add_graph_attributes(self, g, nodes_df, edges_df, nodes_info): + g = add_edges_attributes(g, edges_df, drop_cols=["cid", "degree_target", "degree_source"]) + g = add_node_attributes(g, self.skeets_getter.reshape_node_data(nodes_df), attr_dic=nodes_info.user_hashtags, attr_name="all_hashtags") + return g + + + + + diff --git a/spikexplore/config.py b/spikexplore/config.py index b81b56c..d82af23 100644 --- a/spikexplore/config.py +++ b/spikexplore/config.py @@ -32,13 +32,12 @@ def __init__(self, graph, data_collection): @dataclass -class TwitterConfig: +class BlueskyConfig: min_mentions: int = 0 max_day_old: int = 30 - max_tweets_per_user: int = 200 - nb_popular_tweets: int = 10 + max_skeets_per_user: int = 100 + nb_popular_skeets: int = 10 users_to_remove = [] - api_version: int = 1 @dataclass diff --git a/tests/bsky_test.py b/tests/bsky_test.py new file mode 100644 index 0000000..e28a5a9 --- /dev/null +++ b/tests/bsky_test.py @@ -0,0 +1,37 @@ +import os +import unittest +import networkx as nx +from spikexplore import 
graph_explore +from spikexplore.backends.bluesky import BlueskyNetwork, BlueskyCredentials +from spikexplore.config import SamplingConfig, GraphConfig, DataCollectionConfig, BlueskyConfig + + +class BlueskyGraphSampling(unittest.TestCase): + @classmethod + def setUpClass(cls): + bsky_credentials = BlueskyCredentials( + os.getenv("BSKY_HANDLE", ""), os.getenv("BSKY_PASSWORD", "") + ) + + cls.bluesky_config = BlueskyConfig() + cls.bluesky_config.users_to_remove = ["threader_app", "threadreaderapp"] + cls.sampling_backend = BlueskyNetwork(bsky_credentials, cls.bluesky_config) + + graph_config = GraphConfig(min_degree=2, min_weight=1, community_detection=True, min_community_size=2, as_undirected=False) + data_collection_config = DataCollectionConfig( + exploration_depth=2, random_subset_mode="percent", random_subset_size=20, expansion_type="coreball", degree=2, max_nodes_per_hop=100 + ) + cls.sampling_config = SamplingConfig(graph_config, data_collection_config) + cls.initial_nodes = ["github.com", "githubnext.com", "bsky.app", "jay.bsky.team"] + + def test_sampling_coreball(self): + g_sub, _ = graph_explore.explore(self.sampling_backend, self.initial_nodes, self.sampling_config) + self.assertTrue(g_sub.number_of_nodes() > 5) + self.assertTrue(g_sub.number_of_edges() > 10) + communities = nx.get_node_attributes(g_sub, "community") + self.assertGreaterEqual(max(communities.values()), 2) + + def test_empty_graph(self): + g_sub, _ = graph_explore.explore(self.sampling_backend, ["#InvalidUsername"], self.sampling_config) + self.assertTrue(g_sub.number_of_nodes() == 0) + self.assertTrue(g_sub.number_of_edges() == 0) From 0ed6b815fa07197be1e209a8479859e995a4a56f Mon Sep 17 00:00:00 2001 From: Nicolas Aspert Date: Fri, 20 Dec 2024 10:55:26 +0100 Subject: [PATCH 02/11] wip update reqs + black --- .github/workflows/main.yaml | 2 +- .gitlab-ci.yml | 4 ++-- requirements.txt | 3 +-- spikexplore/backends/bluesky.py | 33 ++++++++++++++++----------------- tests/bsky_test.py | 4 +--- 
5 files changed, 21 insertions(+), 25 deletions(-) diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml index af8bb4c..8e60434 100644 --- a/.github/workflows/main.yaml +++ b/.github/workflows/main.yaml @@ -8,7 +8,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.8", "3.9", "3.10"] + python-version: ["3.10", "3.11"] steps: - uses: actions/checkout@v2 diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 6b41809..88abd4a 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,4 +1,4 @@ -image: python:3.9 +image: python:3.11 stages: - format @@ -21,7 +21,7 @@ before_script: - virtualenv venv - source venv/bin/activate - pip install --upgrade -r requirements.txt - - pip install black pytest + - pip install black==2024.8.0 pytest Formatting: stage: format diff --git a/requirements.txt b/requirements.txt index 82fd53e..9bdc472 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,7 +2,6 @@ numpy pandas networkx tqdm -twython wikipedia-api python-louvain -TwitterAPI \ No newline at end of file +atproto diff --git a/spikexplore/backends/bluesky.py b/spikexplore/backends/bluesky.py index 6fd2371..f4e70dc 100644 --- a/spikexplore/backends/bluesky.py +++ b/spikexplore/backends/bluesky.py @@ -9,6 +9,7 @@ logger = logging.getLogger(__name__) + class BlueskyCredentials: def __init__(self, handle, password): self.handle = handle @@ -33,7 +34,6 @@ def _filter_old_skeets(self, skeets): skeets_filt = filter(lambda t: datetime.strptime(t.post.record["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ") >= days_limit, skeets) return list(skeets_filt) - def get_profile(self, did): handle = self.profiles_cache.get(did) if handle is not None: @@ -50,8 +50,11 @@ def facet_data(self, skeet, data): return [] if skeet.record.facets is None: return [] - return [getattr(f.features[0], self.features_attrs[data]) for f in - skeet.record.facets if f.features[0].py_type == f"app.bsky.richtext.facet#{data}"] + return [ + getattr(f.features[0], self.features_attrs[data]) 
+ for f in skeet.record.facets + if f.features[0].py_type == f"app.bsky.richtext.facet#{data}" + ] def get_user_skeets(self, username): # Collect skeets from a username/did @@ -60,9 +63,7 @@ def get_user_skeets(self, username): # Test if ok try: - user_skeets_raw = self.bsky_client.get_author_feed( - actor=username, limit=count - ).feed + user_skeets_raw = self.bsky_client.get_author_feed(actor=username, limit=count).feed # remove old tweets user_skeets_filt = self._filter_old_skeets(user_skeets_raw) # make a dictionary @@ -96,7 +97,6 @@ def get_user_skeets(self, username): logger.error("Error in getting user skeets: ", e) return {}, {} - def reshape_node_data(self, node_df): # user name user_details mentions hashtags retweet_count favorite_count # created_at account_creation account_followers account_following account_statuses account_favourites @@ -186,10 +186,10 @@ def did_to_handle(self, did): return self.skeets_getter.get_profile(did) def match_usernames(self, meta_df): - mask = meta_df['mentions'].str.startswith("did:") - meta_df.loc[mask, 'mentions'] = meta_df.loc[mask, 'mentions'].apply(self.did_to_handle) + mask = meta_df["mentions"].str.startswith("did:") + meta_df.loc[mask, "mentions"] = meta_df.loc[mask, "mentions"].apply(self.did_to_handle) - return meta_df.dropna(subset=['mentions']) + return meta_df.dropna(subset=["mentions"]) def get_edges(self, skeets_meta): if not skeets_meta: @@ -227,8 +227,12 @@ def get_nodes_properties(self, skeets_meta, skeets_dic): skeets_meta_kept = meta_df.head(nb_popular_skeets) skeets_kept = {k: skeets_dic[k] for k in skeets_meta_kept.index.to_list()} # Get most popular tweets of user - return self.BlueskyNodeInfo(user_hashtags={user_name: user_hashtags["count"]}, user_skeets=skeets_kept, - user_links={user_name: user_links["count"]}, skeets_meta=skeets_meta_kept) + return self.BlueskyNodeInfo( + user_hashtags={user_name: user_hashtags["count"]}, + user_skeets=skeets_kept, + user_links={user_name: user_links["count"]}, + 
skeets_meta=skeets_meta_kept, + ) ##################################################### ## Utils functions for the graph @@ -238,8 +242,3 @@ def add_graph_attributes(self, g, nodes_df, edges_df, nodes_info): g = add_edges_attributes(g, edges_df, drop_cols=["cid", "degree_target", "degree_source"]) g = add_node_attributes(g, self.skeets_getter.reshape_node_data(nodes_df), attr_dic=nodes_info.user_hashtags, attr_name="all_hashtags") return g - - - - - diff --git a/tests/bsky_test.py b/tests/bsky_test.py index e28a5a9..b9b3c3f 100644 --- a/tests/bsky_test.py +++ b/tests/bsky_test.py @@ -9,9 +9,7 @@ class BlueskyGraphSampling(unittest.TestCase): @classmethod def setUpClass(cls): - bsky_credentials = BlueskyCredentials( - os.getenv("BSKY_HANDLE", ""), os.getenv("BSKY_PASSWORD", "") - ) + bsky_credentials = BlueskyCredentials(os.getenv("BSKY_HANDLE", ""), os.getenv("BSKY_PASSWORD", "")) cls.bluesky_config = BlueskyConfig() cls.bluesky_config.users_to_remove = ["threader_app", "threadreaderapp"] From c9d5b01455d3bda372014f6324e2c4d52ce28b53 Mon Sep 17 00:00:00 2001 From: Nicolas Aspert Date: Fri, 20 Dec 2024 10:57:26 +0100 Subject: [PATCH 03/11] fix black version for CI --- .gitlab-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 88abd4a..e8553c9 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -21,7 +21,7 @@ before_script: - virtualenv venv - source venv/bin/activate - pip install --upgrade -r requirements.txt - - pip install black==2024.8.0 pytest + - pip install black==24.8.0 pytest Formatting: stage: format From 838b8b2d3ef4fbcc912d952cd4da59cb6c60a8b8 Mon Sep 17 00:00:00 2001 From: Nicolas Aspert Date: Fri, 20 Dec 2024 11:00:45 +0100 Subject: [PATCH 04/11] fix black version for github --- .github/workflows/black.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/black.yaml b/.github/workflows/black.yaml index 8bd0b3a..485dabb 100644 --- 
a/.github/workflows/black.yaml +++ b/.github/workflows/black.yaml @@ -11,5 +11,5 @@ jobs: with: options: "--check -l 150" src: "." - version: "~= 22.12.0" + version: "~= 24.8.0" From 7100c05140619ba92847b06c05072966cbb9fa9b Mon Sep 17 00:00:00 2001 From: Nicolas Aspert Date: Tue, 14 Jan 2025 15:23:32 +0100 Subject: [PATCH 05/11] fix collection for bluesky --- spikexplore/backends/bluesky.py | 73 +++++++++++++++++++++------------ spikexplore/collect_edges.py | 5 ++- spikexplore/graph.py | 4 +- tests/bsky_test.py | 12 +++--- 4 files changed, 59 insertions(+), 35 deletions(-) diff --git a/spikexplore/backends/bluesky.py b/spikexplore/backends/bluesky.py index f4e70dc..4f94928 100644 --- a/spikexplore/backends/bluesky.py +++ b/spikexplore/backends/bluesky.py @@ -3,7 +3,7 @@ import time import logging import pandas as pd -from datetime import datetime, timedelta +from datetime import datetime, timedelta, timezone from spikexplore.NodeInfo import NodeInfo from spikexplore.graph import add_node_attributes, add_edges_attributes @@ -31,19 +31,24 @@ def _filter_old_skeets(self, skeets): return skeets days_limit = datetime.now() - timedelta(days=max_day_old) - skeets_filt = filter(lambda t: datetime.strptime(t.post.record["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ") >= days_limit, skeets) + skeets_filt = filter( + lambda t: datetime.fromisoformat(t.post.record["created_at"].replace("Z", "+00:00")).replace(tzinfo=None) >= days_limit, skeets + ) return list(skeets_filt) def get_profile(self, did): handle = self.profiles_cache.get(did) if handle is not None: return handle - - p = self.bsky_client.get_profile(did) - if p is not None: - self.profiles_cache[did] = p.handle - return p.handle - return None + try: + p = self.bsky_client.get_profile(did) + if p is not None: + self.profiles_cache[did] = p.handle + return p.handle + except Exception as e: + logger.error("Error in getting profile: ", e) + finally: + return None def facet_data(self, skeet, data): if not hasattr(skeet, 
"record"): @@ -86,6 +91,7 @@ def get_user_skeets(self, username): "links": self.facet_data(x[1], "link"), "repost_count": x[1].repost_count, "favorite_count": x[1].like_count, + "reply_to": x[1].reply.parent.author.handle if hasattr(x[1], "reply") else [], "created_at": x[1].record.created_at, "account_creation": x[1].author.created_at, }, @@ -98,9 +104,6 @@ def get_user_skeets(self, username): return {}, {} def reshape_node_data(self, node_df): - # user name user_details mentions hashtags retweet_count favorite_count - # created_at account_creation account_followers account_following account_statuses account_favourites - # account_verified account_default_profile account_default_profile_image spikyball_hop node_df = node_df[ [ "user_did", @@ -110,7 +113,7 @@ def reshape_node_data(self, node_df): "account_creation", ] ] - node_df = node_df.reset_index().groupby("user_did").max().rename(columns={"index": "max_tweet_id"}) + node_df = node_df.reset_index().groupby("user").max().rename(columns={"index": "last_skeet_id"}) return node_df @@ -141,7 +144,7 @@ def get_neighbors(self, user): if not isinstance(user, str): return self.BlueskyNodeInfo(), pd.DataFrame() skeets_dic, skeets_meta = self.skeets_getter.get_user_skeets(user) - edges_df, node_info = self.edges_nodes_from_user(skeets_meta, skeets_dic) + edges_df, node_info = self.edges_nodes_from_user(user, skeets_meta, skeets_dic) # replace user and mentions by source and target if not edges_df.empty: @@ -163,12 +166,12 @@ def filter_edges(self, edges_df): return edges_df[edges_df["weight"] >= self.config.min_mentions] def neighbors_list(self, edges_df): - if edges_df.empty: - return edges_df users_connected = edges_df["target"].tolist() return users_connected def neighbors_with_weights(self, edges_df): + if edges_df.empty: + return {} user_list = self.neighbors_list(edges_df) return dict.fromkeys(user_list, 1) @@ -176,9 +179,9 @@ def neighbors_with_weights(self, edges_df): # Functions for extracting skeet info from 
the bluesky API ############################################################### - def edges_nodes_from_user(self, skeets_meta, skeets_dic): + def edges_nodes_from_user(self, user, skeets_meta, skeets_dic): # Make an edge and node property dataframes - edges_df = self.get_edges(skeets_meta) + edges_df = self.get_edges(user, skeets_meta) user_info = self.get_nodes_properties(skeets_meta, skeets_dic) return edges_df, user_info @@ -191,24 +194,42 @@ def match_usernames(self, meta_df): return meta_df.dropna(subset=["mentions"]) - def get_edges(self, skeets_meta): + def get_edges(self, user, skeets_meta): if not skeets_meta: return pd.DataFrame() # Create the user -> mention table with their properties fom the list of tweets of a user - meta_df = pd.DataFrame.from_dict(skeets_meta, orient="index").explode("mentions").dropna() + mentions_df = pd.DataFrame.from_dict(skeets_meta, orient="index") + mentions_df["full_mentions"] = mentions_df["mentions"] + mentions_df["reply_to"] # a reply is a kind of mention + mentions_df = ( + mentions_df.drop(columns=["reply_to", "mentions"]).explode("full_mentions").dropna().rename(columns={"full_mentions": "mentions"}) + ) # Some bots to be removed from the collection users_to_remove = self.config.users_to_remove # mentions can be dids so need to translate that first into user handles - meta_df = self.match_usernames(meta_df) - filtered_meta_df = meta_df[~meta_df["mentions"].isin(users_to_remove) & ~meta_df["mentions"].isin(meta_df["user"])] + mentions_df = self.match_usernames(mentions_df) + filtered_mentions_df = mentions_df[~mentions_df["mentions"].isin(users_to_remove) & ~mentions_df["mentions"].isin(mentions_df["user"])] # group by mentions and keep list of tweets for each mention - tmp = filtered_meta_df.groupby(["user", "mentions"]).apply(lambda x: (x.index.tolist(), len(x.index))) + tmp = filtered_mentions_df.groupby(["user", "mentions"]).apply(lambda x: (x.index.tolist(), len(x.index)), include_groups=False) + if tmp.empty: + 
edge_mentions_df = pd.DataFrame([], columns=["source", "target", "cid", "weight"]) + else: + edge_mentions_df = pd.DataFrame(tmp.tolist(), index=tmp.index).rename(columns={0: "cid", 1: "weight"}) + edge_mentions_df.index.names = ["source", "target"] + # Now get reposts + repost_df = pd.DataFrame.from_dict(skeets_meta, orient="index") + repost_df["source_user"] = user + # remove self edges + repost_df = repost_df[repost_df["user"] != repost_df["source_user"]] + tmp = repost_df.groupby(["source_user", "user"]).apply(lambda x: (x.index.tolist(), len(x.index)), include_groups=False) if tmp.empty: - return tmp - edge_df = pd.DataFrame(tmp.tolist(), index=tmp.index).rename(columns={0: "cid", 1: "weight"}) - return edge_df + edge_repost_df = pd.DataFrame([], columns=["source", "target", "cid", "weight"]) + else: + edge_repost_df = pd.DataFrame(tmp.tolist(), index=tmp.index).rename(columns={0: "cid", 1: "weight"}) + edge_repost_df.index.names = ["source", "target"] + edges_df = pd.concat([edge_mentions_df, edge_repost_df]).groupby(["source", "target"]).sum() + return edges_df def get_nodes_properties(self, skeets_meta, skeets_dic): if not skeets_meta: @@ -239,6 +260,6 @@ def get_nodes_properties(self, skeets_meta, skeets_dic): ##################################################### def add_graph_attributes(self, g, nodes_df, edges_df, nodes_info): - g = add_edges_attributes(g, edges_df, drop_cols=["cid", "degree_target", "degree_source"]) + g = add_edges_attributes(g, edges_df, drop_cols=["cid"]) g = add_node_attributes(g, self.skeets_getter.reshape_node_data(nodes_df), attr_dic=nodes_info.user_hashtags, attr_name="all_hashtags") return g diff --git a/spikexplore/collect_edges.py b/spikexplore/collect_edges.py index 15a4c02..5ada682 100644 --- a/spikexplore/collect_edges.py +++ b/spikexplore/collect_edges.py @@ -142,7 +142,10 @@ def spiky_ball(initial_node_list, graph_handle, cfg, node_acc=NodeInfo(), progre edges_df_in, edges_df_out = split_edges(edges_df, total_node_list) 
# add edges linking to new nodes - total_edges_df = pd.concat([total_edges_df, edges_df_in, new_edges]) + total_edges_df = pd.concat([total_edges_df, edges_df_in]) + if not new_edges.empty: + total_edges_df = pd.concat([total_edges_df, new_edges.drop(columns=["degree_source", "degree_target"])]) + total_edges_df = total_edges_df.groupby(["source", "target"]).sum().reset_index() total_nodes_df = pd.concat([total_nodes_df, nodes_df]) new_node_list, new_edges = random_subset(edges_df_out, expansion_type, mode=random_subset_mode, mode_value=random_subset_size, coeff=degree) diff --git a/spikexplore/graph.py b/spikexplore/graph.py index 6ff9a8d..19ceb94 100644 --- a/spikexplore/graph.py +++ b/spikexplore/graph.py @@ -134,8 +134,8 @@ def process_hop(graph_handle, node_list, nodes_info_acc): total_nodes_df = pd.concat([total_nodes_df, node_info.get_nodes()]) nodes_info_acc.update(node_info) # add new info - - total_edges_df = pd.concat([total_edges_df, edges_df]) + if not edges_df.empty: + total_edges_df = pd.concat([total_edges_df, edges_df]).groupby(["source", "target"]).sum().reset_index() neighbors_dic = graph_handle.neighbors_with_weights(edges_df) new_node_dic = combine_dicts(new_node_dic, neighbors_dic) diff --git a/tests/bsky_test.py b/tests/bsky_test.py index b9b3c3f..a3f46ae 100644 --- a/tests/bsky_test.py +++ b/tests/bsky_test.py @@ -17,19 +17,19 @@ def setUpClass(cls): graph_config = GraphConfig(min_degree=2, min_weight=1, community_detection=True, min_community_size=2, as_undirected=False) data_collection_config = DataCollectionConfig( - exploration_depth=2, random_subset_mode="percent", random_subset_size=20, expansion_type="coreball", degree=2, max_nodes_per_hop=100 + exploration_depth=2, random_subset_mode="percent", random_subset_size=60, expansion_type="coreball", degree=2, max_nodes_per_hop=100 ) cls.sampling_config = SamplingConfig(graph_config, data_collection_config) - cls.initial_nodes = ["github.com", "githubnext.com", "bsky.app", "jay.bsky.team"] 
+ cls.initial_nodes = ["atproto.com", "bsky.app", "jay.bsky.team", "atprotocol.dev", "freeourfeeds.com"] def test_sampling_coreball(self): g_sub, _ = graph_explore.explore(self.sampling_backend, self.initial_nodes, self.sampling_config) - self.assertTrue(g_sub.number_of_nodes() > 5) - self.assertTrue(g_sub.number_of_edges() > 10) + self.assertGreaterEqual(g_sub.number_of_nodes(), 5) + self.assertGreaterEqual(g_sub.number_of_edges(), 10) communities = nx.get_node_attributes(g_sub, "community") self.assertGreaterEqual(max(communities.values()), 2) def test_empty_graph(self): g_sub, _ = graph_explore.explore(self.sampling_backend, ["#InvalidUsername"], self.sampling_config) - self.assertTrue(g_sub.number_of_nodes() == 0) - self.assertTrue(g_sub.number_of_edges() == 0) + self.assertEqual(g_sub.number_of_nodes(), 0) + self.assertEqual(g_sub.number_of_edges(), 0) From 76349a9cb1ac55be20f3d339ac14b89302883500 Mon Sep 17 00:00:00 2001 From: Nicolas Aspert Date: Tue, 14 Jan 2025 16:56:52 +0100 Subject: [PATCH 06/11] fix empty graph tests --- spikexplore/backends/bluesky.py | 8 +++++++- spikexplore/collect_edges.py | 4 ++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/spikexplore/backends/bluesky.py b/spikexplore/backends/bluesky.py index 4f94928..e892716 100644 --- a/spikexplore/backends/bluesky.py +++ b/spikexplore/backends/bluesky.py @@ -4,6 +4,9 @@ import logging import pandas as pd from datetime import datetime, timedelta, timezone + +from atproto_client.exceptions import BadRequestError + from spikexplore.NodeInfo import NodeInfo from spikexplore.graph import add_node_attributes, add_edges_attributes @@ -99,8 +102,11 @@ def get_user_skeets(self, username): user_skeets.items(), ) return user_skeets, dict(skeets_metadata) + except BadRequestError as e: + logger.error(f"Error in getting user skeets: code {e.response.status_code} - {e.response.content.message}") + return {}, {} except Exception as e: - logger.error("Error in getting user skeets: ", e) + 
logger.error(f"Error in getting user skeets: {e}") return {}, {} def reshape_node_data(self, node_df): diff --git a/spikexplore/collect_edges.py b/spikexplore/collect_edges.py index 5ada682..390f844 100644 --- a/spikexplore/collect_edges.py +++ b/spikexplore/collect_edges.py @@ -134,8 +134,8 @@ def spiky_ball(initial_node_list, graph_handle, cfg, node_acc=NodeInfo(), progre new_edges = remove_edges_with_target_nodes(new_edges, new_node_list) new_node_dic, edges_df, nodes_df, node_acc = process_hop(graph_handle, new_node_list, node_acc) - if nodes_df.empty: - break + if edges_df.empty: + continue nodes_df["spikyball_hop"] = depth # Mark the depth of the spiky ball on the nodes total_node_list = total_node_list + new_node_list From 0f1868e03641baf8db29ac8ed192b656e35de565 Mon Sep 17 00:00:00 2001 From: Nicolas Aspert Date: Wed, 15 Jan 2025 11:01:18 +0100 Subject: [PATCH 07/11] add secrets to github workflow --- .github/workflows/main.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml index 8e60434..ba86035 100644 --- a/.github/workflows/main.yaml +++ b/.github/workflows/main.yaml @@ -22,5 +22,8 @@ jobs: python -m pip install pytest if [ -f requirements.txt ]; then pip install -r requirements.txt; fi - name: Run tests + env: + BSKY_HANDLE: ${{ secrets.BSKY_HANDLE }} + BSKY_PASSWORD: ${{ secrets.BSKY_PASSWORD }} run: | python -m pytest tests From a17ab12490cff6d85fb17260f577db517ae70da4 Mon Sep 17 00:00:00 2001 From: Nicolas Aspert Date: Tue, 21 Jan 2025 15:12:37 +0100 Subject: [PATCH 08/11] switch to pyproject.toml --- pyproject.toml | 38 ++++++++++++++++++++++++++++++++++++++ setup.py | 24 ------------------------ 2 files changed, 38 insertions(+), 24 deletions(-) create mode 100644 pyproject.toml delete mode 100644 setup.py diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..40d24fc --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,38 @@ +[build-system] +requires = 
["setuptools>=61.2"] +build-backend = "setuptools.build_meta" + +[project] +name = "spikexplore" +version = "0.2.0" +authors = [ + {name = "Nicolas Aspert", email = "nicolas.aspert@epfl.ch"}, + {name = "Benjamin Ricaud", email = "benjamin.ricaud@uit.no"}, +] +license = {text = "Apache license"} +description = "Graph exploration using inhomogeneous filtered diffusion" +classifiers = [ + "Development Status :: 4 - Beta", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: Apache License", + "Operating System :: POSIX :: Linux", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.8", +] +urls = {Homepage = "https://gitlab.switch.ch/imi-sad/spikexplore"} +requires-python = ">=3.8" +dependencies = [ + "pandas", + "numpy", + "networkx", + "tqdm", + "atproto", + "wikipedia-api", + "python-louvain", +] + +[tool.setuptools] +packages = ["spikexplore", "spikexplore.backends"] +script-files = [] +include-package-data = false diff --git a/setup.py b/setup.py deleted file mode 100644 index 57387b7..0000000 --- a/setup.py +++ /dev/null @@ -1,24 +0,0 @@ -from setuptools import setup - -setup( - name="spikexplore", - version="0.1.0", - description="Graph exploration using inhomogeneous filtered diffusion", - url="https://github.com/epfl-lts2/spikexplore", - author="Nicolas Aspert, Benjamin Ricaud", - author_email="nicolas.aspert@epfl.ch, benjamin.ricaud@epfl.ch", - license="Apache license", - packages=["spikexplore", "spikexplore.backends"], - scripts=[], - install_requires=["pandas", "numpy", "networkx", "tqdm", "wikipedia-api", "python-louvain"], - python_requires=">=3.8", - classifiers=[ - "Development Status :: 4 - Beta", - "Intended Audience :: Science/Research", - "License :: OSI Approved :: Apache License", - "Operating System :: POSIX :: Linux", - "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.8", - ], 
-) From a3be26a562af3fa10a5799e67a0c60c9ec77e2d1 Mon Sep 17 00:00:00 2001 From: Nicolas Aspert Date: Wed, 22 Jan 2025 09:16:46 +0100 Subject: [PATCH 09/11] remove requirements.txt and use only pyproject.toml for deps --- .gitlab-ci.yml | 3 +-- pyproject.toml | 6 +++++- requirements.txt | 7 ------- 3 files changed, 6 insertions(+), 10 deletions(-) delete mode 100644 requirements.txt diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index e8553c9..9ea706a 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -20,8 +20,7 @@ before_script: - python -m pip install virtualenv - virtualenv venv - source venv/bin/activate - - pip install --upgrade -r requirements.txt - - pip install black==24.8.0 pytest + - pip install .[dev] Formatting: stage: format diff --git a/pyproject.toml b/pyproject.toml index 40d24fc..15e4633 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,7 +31,11 @@ dependencies = [ "wikipedia-api", "python-louvain", ] - +[project.optional-dependencies] +dev = [ + "pytest", + "black==24.8.0" +] [tool.setuptools] packages = ["spikexplore", "spikexplore.backends"] script-files = [] diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 9bdc472..0000000 --- a/requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -numpy -pandas -networkx -tqdm -wikipedia-api -python-louvain -atproto From 23c458c27357658f8b0b8e60f663f2904f869895 Mon Sep 17 00:00:00 2001 From: Nicolas Aspert Date: Wed, 22 Jan 2025 09:21:20 +0100 Subject: [PATCH 10/11] fix github workflow --- .github/workflows/main.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml index ba86035..73e9305 100644 --- a/.github/workflows/main.yaml +++ b/.github/workflows/main.yaml @@ -19,8 +19,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - python -m pip install pytest - if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + python -m pip install .[dev] - name: Run tests 
env: BSKY_HANDLE: ${{ secrets.BSKY_HANDLE }} From 9d2a4bba8e7ae6141ffca3f90942587ba877c13e Mon Sep 17 00:00:00 2001 From: Nicolas Aspert Date: Wed, 22 Jan 2025 09:28:53 +0100 Subject: [PATCH 11/11] update README --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 2001c51..67f0bc9 100644 --- a/README.md +++ b/README.md @@ -24,3 +24,4 @@ So far this implementation supports: - ~~Twitter using [v1 API](https://developer.twitter.com/en/docs/twitter-api/api-reference-index) through [Twython](https://twython.readthedocs.io/en/latest/)~~ Twitter API is no longer available unless you pay. Latest version supporting it is v0.0.12. - Wikipedia using [Mediawiki API](https://www.mediawiki.org/wiki/API:Main_page) through [Wikipedia-API](https://pypi.org/project/Wikipedia-API/) +- Bluesky using the [AT Protocol](https://atproto.com/) through the [atproto Python SDK](https://atproto.blue/en/latest/)