From 7fde7b4ff19724f1ed72407a2882b5ecbffd9af0 Mon Sep 17 00:00:00 2001
From: Nicolas Aspert <nicolas.aspert@epfl.ch>
Date: Thu, 19 Dec 2024 18:19:43 +0100
Subject: [PATCH 01/11] wip collect bluesky data

---
 spikexplore/backends/bluesky.py | 245 ++++++++++++++++++++++++++++++++
 spikexplore/config.py           |   7 +-
 tests/bsky_test.py              |  37 +++++
 3 files changed, 285 insertions(+), 4 deletions(-)
 create mode 100644 spikexplore/backends/bluesky.py
 create mode 100644 tests/bsky_test.py

diff --git a/spikexplore/backends/bluesky.py b/spikexplore/backends/bluesky.py
new file mode 100644
index 0000000..6fd2371
--- /dev/null
+++ b/spikexplore/backends/bluesky.py
@@ -0,0 +1,245 @@
+from atproto import Client, client_utils
+import networkx as nx
+import time
+import logging
+import pandas as pd
+from datetime import datetime, timedelta
+from spikexplore.NodeInfo import NodeInfo
+from spikexplore.graph import add_node_attributes, add_edges_attributes
+
+logger = logging.getLogger(__name__)
+
+class BlueskyCredentials:
+    def __init__(self, handle, password):
+        self.handle = handle
+        self.password = password
+
+
+class SkeetsGetter:
+    def __init__(self, credentials, config):
+        # Instantiate an object
+        self.config = config
+        self.bsky_client = Client()
+        self.bsky_client.login(credentials.handle, credentials.password)
+        self.profiles_cache = {}
+        self.features_attrs = {"mention": "did", "tag": "tag", "link": "uri"}
+
+    def _filter_old_skeets(self, skeets):
+        max_day_old = self.config.max_day_old
+        if not max_day_old:
+            return skeets
+
+        days_limit = datetime.now() - timedelta(days=max_day_old)
+        skeets_filt = filter(lambda t: datetime.strptime(t.post.record["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ") >= days_limit, skeets)
+        return list(skeets_filt)
+
+
+    def get_profile(self, did):
+        handle = self.profiles_cache.get(did)
+        if handle is not None:
+            return handle
+
+        p = self.bsky_client.get_profile(did)
+        if p is not None:
+            self.profiles_cache[did] = p.handle
+            return p.handle
+        return None
+
+    def facet_data(self, skeet, data):
+        if not hasattr(skeet, "record"):
+            return []
+        if skeet.record.facets is None:
+            return []
+        return [getattr(f.features[0], self.features_attrs[data]) for f in
+                skeet.record.facets if f.features[0].py_type == f"app.bsky.richtext.facet#{data}"]
+
+    def get_user_skeets(self, username):
+        # Collect skeets from a username/did
+
+        count = self.config.max_skeets_per_user
+
+        # Test if ok
+        try:
+            user_skeets_raw = self.bsky_client.get_author_feed(
+                actor=username, limit=count
+            ).feed
+            # remove old tweets
+            user_skeets_filt = self._filter_old_skeets(user_skeets_raw)
+            # make a dictionary
+            user_skeets = {x.post.cid: x.post for x in user_skeets_filt}
+
+            # update profile cache
+            for v in user_skeets.items():
+                if v[1].author.did not in self.profiles_cache:
+                    self.profiles_cache[v[1].author.did] = v[1].author.handle
+
+            skeets_metadata = map(
+                lambda x: (
+                    x[0],
+                    {
+                        "user_did": x[1].author.did,
+                        "user": x[1].author.handle,
+                        "name": x[1].author.display_name,
+                        "mentions": self.facet_data(x[1], "mention"),
+                        "hashtags": self.facet_data(x[1], "tag"),
+                        "links": self.facet_data(x[1], "link"),
+                        "repost_count": x[1].repost_count,
+                        "favorite_count": x[1].like_count,
+                        "created_at": x[1].record.created_at,
+                        "account_creation": x[1].author.created_at,
+                    },
+                ),
+                user_skeets.items(),
+            )
+            return user_skeets, dict(skeets_metadata)
+        except Exception as e:
+            logger.error("Error in getting user skeets: ", e)
+            return {}, {}
+
+
+    def reshape_node_data(self, node_df):
+        # user name user_details mentions hashtags retweet_count favorite_count
+        # created_at account_creation account_followers account_following account_statuses account_favourites
+        # account_verified account_default_profile account_default_profile_image spikyball_hop
+        node_df = node_df[
+            [
+                "user_did",
+                "user",
+                "name",
+                "spikyball_hop",
+                "account_creation",
+            ]
+        ]
+        node_df = node_df.reset_index().groupby("user_did").max().rename(columns={"index": "max_tweet_id"})
+        return node_df
+
+
+class BlueskyNetwork:
+    class BlueskyNodeInfo(NodeInfo):
+        def __init__(self, user_hashtags=None, user_skeets=None, user_links=None, skeets_meta=None):
+            self.user_hashtags = user_hashtags if user_hashtags else {}
+            self.user_links = user_links if user_links else {}
+            self.user_skeets = user_skeets if user_skeets else {}
+            self.skeets_meta = skeets_meta if skeets_meta is not None else pd.DataFrame()  # fresh frame per instance, no shared default
+
+        def update(self, new_info):
+            self.user_hashtags.update(new_info.user_hashtags)
+            self.user_skeets.update(new_info.user_skeets)
+            self.user_links.update(new_info.user_links)  # merge links from user_links, not from user_skeets
+
+        def get_nodes(self):
+            return self.skeets_meta
+
+    def __init__(self, credentials, config):
+        self.skeets_getter = SkeetsGetter(credentials, config)
+        self.config = config
+
+    def create_node_info(self):
+        return self.BlueskyNodeInfo()
+
+    def get_neighbors(self, user):
+        if not isinstance(user, str):
+            return self.BlueskyNodeInfo(), pd.DataFrame()
+        skeets_dic, skeets_meta = self.skeets_getter.get_user_skeets(user)
+        edges_df, node_info = self.edges_nodes_from_user(skeets_meta, skeets_dic)
+
+        # replace user and mentions by source and target
+        if not edges_df.empty:
+            edges_df.index.names = ["source", "target"]
+            edges_df.reset_index(level=["source", "target"], inplace=True)
+
+        return node_info, edges_df
+
+    def filter(self, node_info, edges_df):
+        # filter edges according to node properties
+        # filter according to edges properties
+        edges_df = self.filter_edges(edges_df)
+        return node_info, edges_df
+
+    def filter_edges(self, edges_df):
+        # filter edges according to their properties
+        if edges_df.empty:
+            return edges_df
+        return edges_df[edges_df["weight"] >= self.config.min_mentions]
+
+    def neighbors_list(self, edges_df):
+        if edges_df.empty:
+            return edges_df
+        users_connected = edges_df["target"].tolist()
+        return users_connected
+
+    def neighbors_with_weights(self, edges_df):
+        user_list = self.neighbors_list(edges_df)
+        return dict.fromkeys(user_list, 1)
+
+    ###############################################################
+    # Functions for extracting skeet info from the bluesky API
+    ###############################################################
+
+    def edges_nodes_from_user(self, skeets_meta, skeets_dic):
+        # Make an edge and node property dataframes
+        edges_df = self.get_edges(skeets_meta)
+        user_info = self.get_nodes_properties(skeets_meta, skeets_dic)
+        return edges_df, user_info
+
+    def did_to_handle(self, did):
+        return self.skeets_getter.get_profile(did)
+
+    def match_usernames(self, meta_df):
+        mask = meta_df['mentions'].str.startswith("did:")
+        meta_df.loc[mask, 'mentions'] = meta_df.loc[mask, 'mentions'].apply(self.did_to_handle)
+
+        return meta_df.dropna(subset=['mentions'])
+
+    def get_edges(self, skeets_meta):
+        if not skeets_meta:
+            return pd.DataFrame()
+        # Create the user -> mention table with their properties fom the list of tweets of a user
+        meta_df = pd.DataFrame.from_dict(skeets_meta, orient="index").explode("mentions").dropna()
+        # Some bots to be removed from the collection
+        users_to_remove = self.config.users_to_remove
+
+        # mentions can be dids so need to translate that first into user handles
+        meta_df = self.match_usernames(meta_df)
+        filtered_meta_df = meta_df[~meta_df["mentions"].isin(users_to_remove) & ~meta_df["mentions"].isin(meta_df["user"])]
+
+        # group by mentions and keep list of tweets for each mention
+        tmp = filtered_meta_df.groupby(["user", "mentions"]).apply(lambda x: (x.index.tolist(), len(x.index)))
+        if tmp.empty:
+            return tmp
+        edge_df = pd.DataFrame(tmp.tolist(), index=tmp.index).rename(columns={0: "cid", 1: "weight"})
+        return edge_df
+
+    def get_nodes_properties(self, skeets_meta, skeets_dic):
+        if not skeets_meta:
+            return self.BlueskyNodeInfo({}, {}, {}, pd.DataFrame())
+        nb_popular_skeets = self.config.nb_popular_skeets
+        # global properties
+        meta_df = pd.DataFrame.from_dict(skeets_meta, orient="index").sort_values("repost_count", ascending=False)
+        # hashtags statistics
+        ht_df = meta_df.explode("hashtags").dropna()
+        htgb = ht_df.groupby(["hashtags"]).size()
+        user_hashtags = pd.DataFrame(htgb).rename(columns={0: "count"}).sort_values("count", ascending=False).to_dict()
+        links_df = meta_df.explode("links").dropna()
+        links = links_df.groupby(["links"]).size()
+        user_links = pd.DataFrame(links).rename(columns={0: "count"}).sort_values("count", ascending=False).to_dict()
+        user_name = meta_df["user"].iloc[0]
+        skeets_meta_kept = meta_df.head(nb_popular_skeets)
+        skeets_kept = {k: skeets_dic[k] for k in skeets_meta_kept.index.to_list()}
+        # Get most popular tweets of user
+        return self.BlueskyNodeInfo(user_hashtags={user_name: user_hashtags["count"]}, user_skeets=skeets_kept,
+                                    user_links={user_name: user_links["count"]}, skeets_meta=skeets_meta_kept)
+
+    #####################################################
+    ## Utils functions for the graph
+    #####################################################
+
+    def add_graph_attributes(self, g, nodes_df, edges_df, nodes_info):
+        g = add_edges_attributes(g, edges_df, drop_cols=["cid", "degree_target", "degree_source"])
+        g = add_node_attributes(g, self.skeets_getter.reshape_node_data(nodes_df), attr_dic=nodes_info.user_hashtags, attr_name="all_hashtags")
+        return g
+
+
+
+
+
diff --git a/spikexplore/config.py b/spikexplore/config.py
index b81b56c..d82af23 100644
--- a/spikexplore/config.py
+++ b/spikexplore/config.py
@@ -32,13 +32,12 @@ def __init__(self, graph, data_collection):
 
 
 @dataclass
-class TwitterConfig:
+class BlueskyConfig:
     min_mentions: int = 0
     max_day_old: int = 30
-    max_tweets_per_user: int = 200
-    nb_popular_tweets: int = 10
+    max_skeets_per_user: int = 100
+    nb_popular_skeets: int = 10
     users_to_remove = []
-    api_version: int = 1
 
 
 @dataclass
diff --git a/tests/bsky_test.py b/tests/bsky_test.py
new file mode 100644
index 0000000..e28a5a9
--- /dev/null
+++ b/tests/bsky_test.py
@@ -0,0 +1,37 @@
+import os
+import unittest
+import networkx as nx
+from spikexplore import graph_explore
+from spikexplore.backends.bluesky import BlueskyNetwork, BlueskyCredentials
+from spikexplore.config import SamplingConfig, GraphConfig, DataCollectionConfig, BlueskyConfig
+
+
+class BlueskyGraphSampling(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        bsky_credentials = BlueskyCredentials(
+            os.getenv("BSKY_HANDLE", ""), os.getenv("BSKY_PASSWORD", "")
+        )
+
+        cls.bluesky_config = BlueskyConfig()
+        cls.bluesky_config.users_to_remove = ["threader_app", "threadreaderapp"]
+        cls.sampling_backend = BlueskyNetwork(bsky_credentials, cls.bluesky_config)
+
+        graph_config = GraphConfig(min_degree=2, min_weight=1, community_detection=True, min_community_size=2, as_undirected=False)
+        data_collection_config = DataCollectionConfig(
+            exploration_depth=2, random_subset_mode="percent", random_subset_size=20, expansion_type="coreball", degree=2, max_nodes_per_hop=100
+        )
+        cls.sampling_config = SamplingConfig(graph_config, data_collection_config)
+        cls.initial_nodes = ["github.com", "githubnext.com", "bsky.app", "jay.bsky.team"]
+
+    def test_sampling_coreball(self):
+        g_sub, _ = graph_explore.explore(self.sampling_backend, self.initial_nodes, self.sampling_config)
+        self.assertTrue(g_sub.number_of_nodes() > 5)
+        self.assertTrue(g_sub.number_of_edges() > 10)
+        communities = nx.get_node_attributes(g_sub, "community")
+        self.assertGreaterEqual(max(communities.values()), 2)
+
+    def test_empty_graph(self):
+        g_sub, _ = graph_explore.explore(self.sampling_backend, ["#InvalidUsername"], self.sampling_config)
+        self.assertTrue(g_sub.number_of_nodes() == 0)
+        self.assertTrue(g_sub.number_of_edges() == 0)

From 0ed6b815fa07197be1e209a8479859e995a4a56f Mon Sep 17 00:00:00 2001
From: Nicolas Aspert <nicolas.aspert@epfl.ch>
Date: Fri, 20 Dec 2024 10:55:26 +0100
Subject: [PATCH 02/11] wip update reqs + black

---
 .github/workflows/main.yaml     |  2 +-
 .gitlab-ci.yml                  |  4 ++--
 requirements.txt                |  3 +--
 spikexplore/backends/bluesky.py | 33 ++++++++++++++++-----------------
 tests/bsky_test.py              |  4 +---
 5 files changed, 21 insertions(+), 25 deletions(-)

diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml
index af8bb4c..8e60434 100644
--- a/.github/workflows/main.yaml
+++ b/.github/workflows/main.yaml
@@ -8,7 +8,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ["3.8", "3.9", "3.10"]
+        python-version: ["3.10", "3.11"]
 
     steps:
     - uses: actions/checkout@v2
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 6b41809..88abd4a 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -1,4 +1,4 @@
-image: python:3.9
+image: python:3.11
 
 stages:
   - format
@@ -21,7 +21,7 @@ before_script:
   - virtualenv venv
   - source venv/bin/activate
   - pip install --upgrade -r requirements.txt
-  - pip install black pytest
+  - pip install black==2024.8.0 pytest
 
 Formatting:
   stage: format
diff --git a/requirements.txt b/requirements.txt
index 82fd53e..9bdc472 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,7 +2,6 @@ numpy
 pandas
 networkx
 tqdm
-twython
 wikipedia-api
 python-louvain
-TwitterAPI
\ No newline at end of file
+atproto
diff --git a/spikexplore/backends/bluesky.py b/spikexplore/backends/bluesky.py
index 6fd2371..f4e70dc 100644
--- a/spikexplore/backends/bluesky.py
+++ b/spikexplore/backends/bluesky.py
@@ -9,6 +9,7 @@
 
 logger = logging.getLogger(__name__)
 
+
 class BlueskyCredentials:
     def __init__(self, handle, password):
         self.handle = handle
@@ -33,7 +34,6 @@ def _filter_old_skeets(self, skeets):
         skeets_filt = filter(lambda t: datetime.strptime(t.post.record["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ") >= days_limit, skeets)
         return list(skeets_filt)
 
-
     def get_profile(self, did):
         handle = self.profiles_cache.get(did)
         if handle is not None:
@@ -50,8 +50,11 @@ def facet_data(self, skeet, data):
             return []
         if skeet.record.facets is None:
             return []
-        return [getattr(f.features[0], self.features_attrs[data]) for f in
-                skeet.record.facets if f.features[0].py_type == f"app.bsky.richtext.facet#{data}"]
+        return [
+            getattr(f.features[0], self.features_attrs[data])
+            for f in skeet.record.facets
+            if f.features[0].py_type == f"app.bsky.richtext.facet#{data}"
+        ]
 
     def get_user_skeets(self, username):
         # Collect skeets from a username/did
@@ -60,9 +63,7 @@ def get_user_skeets(self, username):
 
         # Test if ok
         try:
-            user_skeets_raw = self.bsky_client.get_author_feed(
-                actor=username, limit=count
-            ).feed
+            user_skeets_raw = self.bsky_client.get_author_feed(actor=username, limit=count).feed
             # remove old tweets
             user_skeets_filt = self._filter_old_skeets(user_skeets_raw)
             # make a dictionary
@@ -96,7 +97,6 @@ def get_user_skeets(self, username):
             logger.error("Error in getting user skeets: ", e)
             return {}, {}
 
-
     def reshape_node_data(self, node_df):
         # user name user_details mentions hashtags retweet_count favorite_count
         # created_at account_creation account_followers account_following account_statuses account_favourites
@@ -186,10 +186,10 @@ def did_to_handle(self, did):
         return self.skeets_getter.get_profile(did)
 
     def match_usernames(self, meta_df):
-        mask = meta_df['mentions'].str.startswith("did:")
-        meta_df.loc[mask, 'mentions'] = meta_df.loc[mask, 'mentions'].apply(self.did_to_handle)
+        mask = meta_df["mentions"].str.startswith("did:")
+        meta_df.loc[mask, "mentions"] = meta_df.loc[mask, "mentions"].apply(self.did_to_handle)
 
-        return meta_df.dropna(subset=['mentions'])
+        return meta_df.dropna(subset=["mentions"])
 
     def get_edges(self, skeets_meta):
         if not skeets_meta:
@@ -227,8 +227,12 @@ def get_nodes_properties(self, skeets_meta, skeets_dic):
         skeets_meta_kept = meta_df.head(nb_popular_skeets)
         skeets_kept = {k: skeets_dic[k] for k in skeets_meta_kept.index.to_list()}
         # Get most popular tweets of user
-        return self.BlueskyNodeInfo(user_hashtags={user_name: user_hashtags["count"]}, user_skeets=skeets_kept,
-                                    user_links={user_name: user_links["count"]}, skeets_meta=skeets_meta_kept)
+        return self.BlueskyNodeInfo(
+            user_hashtags={user_name: user_hashtags["count"]},
+            user_skeets=skeets_kept,
+            user_links={user_name: user_links["count"]},
+            skeets_meta=skeets_meta_kept,
+        )
 
     #####################################################
     ## Utils functions for the graph
@@ -238,8 +242,3 @@ def add_graph_attributes(self, g, nodes_df, edges_df, nodes_info):
         g = add_edges_attributes(g, edges_df, drop_cols=["cid", "degree_target", "degree_source"])
         g = add_node_attributes(g, self.skeets_getter.reshape_node_data(nodes_df), attr_dic=nodes_info.user_hashtags, attr_name="all_hashtags")
         return g
-
-
-
-
-
diff --git a/tests/bsky_test.py b/tests/bsky_test.py
index e28a5a9..b9b3c3f 100644
--- a/tests/bsky_test.py
+++ b/tests/bsky_test.py
@@ -9,9 +9,7 @@
 class BlueskyGraphSampling(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
-        bsky_credentials = BlueskyCredentials(
-            os.getenv("BSKY_HANDLE", ""), os.getenv("BSKY_PASSWORD", "")
-        )
+        bsky_credentials = BlueskyCredentials(os.getenv("BSKY_HANDLE", ""), os.getenv("BSKY_PASSWORD", ""))
 
         cls.bluesky_config = BlueskyConfig()
         cls.bluesky_config.users_to_remove = ["threader_app", "threadreaderapp"]

From c9d5b01455d3bda372014f6324e2c4d52ce28b53 Mon Sep 17 00:00:00 2001
From: Nicolas Aspert <nicolas.aspert@epfl.ch>
Date: Fri, 20 Dec 2024 10:57:26 +0100
Subject: [PATCH 03/11] fix black version for CI

---
 .gitlab-ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 88abd4a..e8553c9 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -21,7 +21,7 @@ before_script:
   - virtualenv venv
   - source venv/bin/activate
   - pip install --upgrade -r requirements.txt
-  - pip install black==2024.8.0 pytest
+  - pip install black==24.8.0 pytest
 
 Formatting:
   stage: format

From 838b8b2d3ef4fbcc912d952cd4da59cb6c60a8b8 Mon Sep 17 00:00:00 2001
From: Nicolas Aspert <nicolas.aspert@epfl.ch>
Date: Fri, 20 Dec 2024 11:00:45 +0100
Subject: [PATCH 04/11] fix black version for github

---
 .github/workflows/black.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/black.yaml b/.github/workflows/black.yaml
index 8bd0b3a..485dabb 100644
--- a/.github/workflows/black.yaml
+++ b/.github/workflows/black.yaml
@@ -11,5 +11,5 @@ jobs:
         with:
           options: "--check -l 150"
           src: "."
-          version: "~= 22.12.0"
+          version: "~= 24.8.0"
 

From 7100c05140619ba92847b06c05072966cbb9fa9b Mon Sep 17 00:00:00 2001
From: Nicolas Aspert <nicolas.aspert@epfl.ch>
Date: Tue, 14 Jan 2025 15:23:32 +0100
Subject: [PATCH 05/11] fix collection for bluesky

---
 spikexplore/backends/bluesky.py | 73 +++++++++++++++++++++------------
 spikexplore/collect_edges.py    |  5 ++-
 spikexplore/graph.py            |  4 +-
 tests/bsky_test.py              | 12 +++---
 4 files changed, 59 insertions(+), 35 deletions(-)

diff --git a/spikexplore/backends/bluesky.py b/spikexplore/backends/bluesky.py
index f4e70dc..4f94928 100644
--- a/spikexplore/backends/bluesky.py
+++ b/spikexplore/backends/bluesky.py
@@ -3,7 +3,7 @@
 import time
 import logging
 import pandas as pd
-from datetime import datetime, timedelta
+from datetime import datetime, timedelta, timezone
 from spikexplore.NodeInfo import NodeInfo
 from spikexplore.graph import add_node_attributes, add_edges_attributes
 
@@ -31,19 +31,24 @@ def _filter_old_skeets(self, skeets):
             return skeets
 
         days_limit = datetime.now() - timedelta(days=max_day_old)
-        skeets_filt = filter(lambda t: datetime.strptime(t.post.record["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ") >= days_limit, skeets)
+        skeets_filt = filter(
+            lambda t: datetime.fromisoformat(t.post.record["created_at"].replace("Z", "+00:00")).replace(tzinfo=None) >= days_limit, skeets
+        )
         return list(skeets_filt)
 
     def get_profile(self, did):
         handle = self.profiles_cache.get(did)
         if handle is not None:
             return handle
-
-        p = self.bsky_client.get_profile(did)
-        if p is not None:
-            self.profiles_cache[did] = p.handle
-            return p.handle
-        return None
+        try:
+            p = self.bsky_client.get_profile(did)
+            if p is not None:
+                self.profiles_cache[did] = p.handle
+                return p.handle
+        except Exception as e:
+            logger.error("Error in getting profile: %s", e)
+        # Plain fall-through return: a `return` inside `finally` would override the handle returned above.
+        return None
 
     def facet_data(self, skeet, data):
         if not hasattr(skeet, "record"):
@@ -86,6 +91,7 @@ def get_user_skeets(self, username):
                         "links": self.facet_data(x[1], "link"),
                         "repost_count": x[1].repost_count,
                         "favorite_count": x[1].like_count,
+                        "reply_to": x[1].reply.parent.author.handle if hasattr(x[1], "reply") else [],
                         "created_at": x[1].record.created_at,
                         "account_creation": x[1].author.created_at,
                     },
@@ -98,9 +104,6 @@ def get_user_skeets(self, username):
             return {}, {}
 
     def reshape_node_data(self, node_df):
-        # user name user_details mentions hashtags retweet_count favorite_count
-        # created_at account_creation account_followers account_following account_statuses account_favourites
-        # account_verified account_default_profile account_default_profile_image spikyball_hop
         node_df = node_df[
             [
                 "user_did",
@@ -110,7 +113,7 @@ def reshape_node_data(self, node_df):
                 "account_creation",
             ]
         ]
-        node_df = node_df.reset_index().groupby("user_did").max().rename(columns={"index": "max_tweet_id"})
+        node_df = node_df.reset_index().groupby("user").max().rename(columns={"index": "last_skeet_id"})
         return node_df
 
 
@@ -141,7 +144,7 @@ def get_neighbors(self, user):
         if not isinstance(user, str):
             return self.BlueskyNodeInfo(), pd.DataFrame()
         skeets_dic, skeets_meta = self.skeets_getter.get_user_skeets(user)
-        edges_df, node_info = self.edges_nodes_from_user(skeets_meta, skeets_dic)
+        edges_df, node_info = self.edges_nodes_from_user(user, skeets_meta, skeets_dic)
 
         # replace user and mentions by source and target
         if not edges_df.empty:
@@ -163,12 +166,12 @@ def filter_edges(self, edges_df):
         return edges_df[edges_df["weight"] >= self.config.min_mentions]
 
     def neighbors_list(self, edges_df):
-        if edges_df.empty:
-            return edges_df
         users_connected = edges_df["target"].tolist()
         return users_connected
 
     def neighbors_with_weights(self, edges_df):
+        if edges_df.empty:
+            return {}
         user_list = self.neighbors_list(edges_df)
         return dict.fromkeys(user_list, 1)
 
@@ -176,9 +179,9 @@ def neighbors_with_weights(self, edges_df):
     # Functions for extracting skeet info from the bluesky API
     ###############################################################
 
-    def edges_nodes_from_user(self, skeets_meta, skeets_dic):
+    def edges_nodes_from_user(self, user, skeets_meta, skeets_dic):
         # Make an edge and node property dataframes
-        edges_df = self.get_edges(skeets_meta)
+        edges_df = self.get_edges(user, skeets_meta)
         user_info = self.get_nodes_properties(skeets_meta, skeets_dic)
         return edges_df, user_info
 
@@ -191,24 +194,42 @@ def match_usernames(self, meta_df):
 
         return meta_df.dropna(subset=["mentions"])
 
-    def get_edges(self, skeets_meta):
+    def get_edges(self, user, skeets_meta):
         if not skeets_meta:
             return pd.DataFrame()
         # Create the user -> mention table with their properties fom the list of tweets of a user
-        meta_df = pd.DataFrame.from_dict(skeets_meta, orient="index").explode("mentions").dropna()
+        mentions_df = pd.DataFrame.from_dict(skeets_meta, orient="index")
+        mentions_df["full_mentions"] = mentions_df["mentions"] + mentions_df["reply_to"]  # a reply is a kind of mention
+        mentions_df = (
+            mentions_df.drop(columns=["reply_to", "mentions"]).explode("full_mentions").dropna().rename(columns={"full_mentions": "mentions"})
+        )
         # Some bots to be removed from the collection
         users_to_remove = self.config.users_to_remove
 
         # mentions can be dids so need to translate that first into user handles
-        meta_df = self.match_usernames(meta_df)
-        filtered_meta_df = meta_df[~meta_df["mentions"].isin(users_to_remove) & ~meta_df["mentions"].isin(meta_df["user"])]
+        mentions_df = self.match_usernames(mentions_df)
+        filtered_mentions_df = mentions_df[~mentions_df["mentions"].isin(users_to_remove) & ~mentions_df["mentions"].isin(mentions_df["user"])]
 
         # group by mentions and keep list of tweets for each mention
-        tmp = filtered_meta_df.groupby(["user", "mentions"]).apply(lambda x: (x.index.tolist(), len(x.index)))
+        tmp = filtered_mentions_df.groupby(["user", "mentions"]).apply(lambda x: (x.index.tolist(), len(x.index)), include_groups=False)
+        if tmp.empty:
+            edge_mentions_df = pd.DataFrame([], columns=["source", "target", "cid", "weight"])
+        else:
+            edge_mentions_df = pd.DataFrame(tmp.tolist(), index=tmp.index).rename(columns={0: "cid", 1: "weight"})
+            edge_mentions_df.index.names = ["source", "target"]
+        # Now get reposts
+        repost_df = pd.DataFrame.from_dict(skeets_meta, orient="index")
+        repost_df["source_user"] = user
+        # remove self edges
+        repost_df = repost_df[repost_df["user"] != repost_df["source_user"]]
+        tmp = repost_df.groupby(["source_user", "user"]).apply(lambda x: (x.index.tolist(), len(x.index)), include_groups=False)
         if tmp.empty:
-            return tmp
-        edge_df = pd.DataFrame(tmp.tolist(), index=tmp.index).rename(columns={0: "cid", 1: "weight"})
-        return edge_df
+            edge_repost_df = pd.DataFrame([], columns=["source", "target", "cid", "weight"])
+        else:
+            edge_repost_df = pd.DataFrame(tmp.tolist(), index=tmp.index).rename(columns={0: "cid", 1: "weight"})
+            edge_repost_df.index.names = ["source", "target"]
+        edges_df = pd.concat([edge_mentions_df, edge_repost_df]).groupby(["source", "target"]).sum()
+        return edges_df
 
     def get_nodes_properties(self, skeets_meta, skeets_dic):
         if not skeets_meta:
@@ -239,6 +260,6 @@ def get_nodes_properties(self, skeets_meta, skeets_dic):
     #####################################################
 
     def add_graph_attributes(self, g, nodes_df, edges_df, nodes_info):
-        g = add_edges_attributes(g, edges_df, drop_cols=["cid", "degree_target", "degree_source"])
+        g = add_edges_attributes(g, edges_df, drop_cols=["cid"])
         g = add_node_attributes(g, self.skeets_getter.reshape_node_data(nodes_df), attr_dic=nodes_info.user_hashtags, attr_name="all_hashtags")
         return g
diff --git a/spikexplore/collect_edges.py b/spikexplore/collect_edges.py
index 15a4c02..5ada682 100644
--- a/spikexplore/collect_edges.py
+++ b/spikexplore/collect_edges.py
@@ -142,7 +142,10 @@ def spiky_ball(initial_node_list, graph_handle, cfg, node_acc=NodeInfo(), progre
         edges_df_in, edges_df_out = split_edges(edges_df, total_node_list)
 
         # add edges linking to new nodes
-        total_edges_df = pd.concat([total_edges_df, edges_df_in, new_edges])
+        total_edges_df = pd.concat([total_edges_df, edges_df_in])
+        if not new_edges.empty:
+            total_edges_df = pd.concat([total_edges_df, new_edges.drop(columns=["degree_source", "degree_target"])])
+        total_edges_df = total_edges_df.groupby(["source", "target"]).sum().reset_index()
         total_nodes_df = pd.concat([total_nodes_df, nodes_df])
 
         new_node_list, new_edges = random_subset(edges_df_out, expansion_type, mode=random_subset_mode, mode_value=random_subset_size, coeff=degree)
diff --git a/spikexplore/graph.py b/spikexplore/graph.py
index 6ff9a8d..19ceb94 100644
--- a/spikexplore/graph.py
+++ b/spikexplore/graph.py
@@ -134,8 +134,8 @@ def process_hop(graph_handle, node_list, nodes_info_acc):
 
         total_nodes_df = pd.concat([total_nodes_df, node_info.get_nodes()])
         nodes_info_acc.update(node_info)  # add new info
-
-        total_edges_df = pd.concat([total_edges_df, edges_df])
+        if not edges_df.empty:
+            total_edges_df = pd.concat([total_edges_df, edges_df]).groupby(["source", "target"]).sum().reset_index()
         neighbors_dic = graph_handle.neighbors_with_weights(edges_df)
         new_node_dic = combine_dicts(new_node_dic, neighbors_dic)
 
diff --git a/tests/bsky_test.py b/tests/bsky_test.py
index b9b3c3f..a3f46ae 100644
--- a/tests/bsky_test.py
+++ b/tests/bsky_test.py
@@ -17,19 +17,19 @@ def setUpClass(cls):
 
         graph_config = GraphConfig(min_degree=2, min_weight=1, community_detection=True, min_community_size=2, as_undirected=False)
         data_collection_config = DataCollectionConfig(
-            exploration_depth=2, random_subset_mode="percent", random_subset_size=20, expansion_type="coreball", degree=2, max_nodes_per_hop=100
+            exploration_depth=2, random_subset_mode="percent", random_subset_size=60, expansion_type="coreball", degree=2, max_nodes_per_hop=100
         )
         cls.sampling_config = SamplingConfig(graph_config, data_collection_config)
-        cls.initial_nodes = ["github.com", "githubnext.com", "bsky.app", "jay.bsky.team"]
+        cls.initial_nodes = ["atproto.com", "bsky.app", "jay.bsky.team", "atprotocol.dev", "freeourfeeds.com"]
 
     def test_sampling_coreball(self):
         g_sub, _ = graph_explore.explore(self.sampling_backend, self.initial_nodes, self.sampling_config)
-        self.assertTrue(g_sub.number_of_nodes() > 5)
-        self.assertTrue(g_sub.number_of_edges() > 10)
+        self.assertGreaterEqual(g_sub.number_of_nodes(), 5)
+        self.assertGreaterEqual(g_sub.number_of_edges(), 10)
         communities = nx.get_node_attributes(g_sub, "community")
         self.assertGreaterEqual(max(communities.values()), 2)
 
     def test_empty_graph(self):
         g_sub, _ = graph_explore.explore(self.sampling_backend, ["#InvalidUsername"], self.sampling_config)
-        self.assertTrue(g_sub.number_of_nodes() == 0)
-        self.assertTrue(g_sub.number_of_edges() == 0)
+        self.assertEqual(g_sub.number_of_nodes(), 0)
+        self.assertEqual(g_sub.number_of_edges(), 0)

From 76349a9cb1ac55be20f3d339ac14b89302883500 Mon Sep 17 00:00:00 2001
From: Nicolas Aspert <nicolas.aspert@epfl.ch>
Date: Tue, 14 Jan 2025 16:56:52 +0100
Subject: [PATCH 06/11] fix empty graph tests

---
 spikexplore/backends/bluesky.py | 8 +++++++-
 spikexplore/collect_edges.py    | 4 ++--
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/spikexplore/backends/bluesky.py b/spikexplore/backends/bluesky.py
index 4f94928..e892716 100644
--- a/spikexplore/backends/bluesky.py
+++ b/spikexplore/backends/bluesky.py
@@ -4,6 +4,9 @@
 import logging
 import pandas as pd
 from datetime import datetime, timedelta, timezone
+
+from atproto_client.exceptions import BadRequestError
+
 from spikexplore.NodeInfo import NodeInfo
 from spikexplore.graph import add_node_attributes, add_edges_attributes
 
@@ -99,8 +102,11 @@ def get_user_skeets(self, username):
                 user_skeets.items(),
             )
             return user_skeets, dict(skeets_metadata)
+        except BadRequestError as e:
+            logger.error(f"Error in getting user skeets: code {e.response.status_code} - {e.response.content.message}")
+            return {}, {}
         except Exception as e:
-            logger.error("Error in getting user skeets: ", e)
+            logger.error(f"Error in getting user skeets: {e}")
             return {}, {}
 
     def reshape_node_data(self, node_df):
diff --git a/spikexplore/collect_edges.py b/spikexplore/collect_edges.py
index 5ada682..390f844 100644
--- a/spikexplore/collect_edges.py
+++ b/spikexplore/collect_edges.py
@@ -134,8 +134,8 @@ def spiky_ball(initial_node_list, graph_handle, cfg, node_acc=NodeInfo(), progre
                 new_edges = remove_edges_with_target_nodes(new_edges, new_node_list)
 
         new_node_dic, edges_df, nodes_df, node_acc = process_hop(graph_handle, new_node_list, node_acc)
-        if nodes_df.empty:
-            break
+        if edges_df.empty:
+            continue
         nodes_df["spikyball_hop"] = depth  # Mark the depth of the spiky ball on the nodes
 
         total_node_list = total_node_list + new_node_list

From 0f1868e03641baf8db29ac8ed192b656e35de565 Mon Sep 17 00:00:00 2001
From: Nicolas Aspert <nicolas.aspert@epfl.ch>
Date: Wed, 15 Jan 2025 11:01:18 +0100
Subject: [PATCH 07/11] add secrets to github workflow

---
 .github/workflows/main.yaml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml
index 8e60434..ba86035 100644
--- a/.github/workflows/main.yaml
+++ b/.github/workflows/main.yaml
@@ -22,5 +22,8 @@ jobs:
         python -m pip install pytest
         if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
     - name: Run tests
+      env:
+        BSKY_HANDLE: ${{ secrets.BSKY_HANDLE }}
+        BSKY_PASSWORD: ${{ secrets.BSKY_PASSWORD }}
       run: |
         python -m pytest tests

From a17ab12490cff6d85fb17260f577db517ae70da4 Mon Sep 17 00:00:00 2001
From: Nicolas Aspert <nicolas.aspert@epfl.ch>
Date: Tue, 21 Jan 2025 15:12:37 +0100
Subject: [PATCH 08/11] switch to pyproject.toml

---
 pyproject.toml | 38 ++++++++++++++++++++++++++++++++++++++
 setup.py       | 24 ------------------------
 2 files changed, 38 insertions(+), 24 deletions(-)
 create mode 100644 pyproject.toml
 delete mode 100644 setup.py

diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..40d24fc
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,38 @@
+[build-system]
+requires = ["setuptools>=61.2"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "spikexplore"
+version = "0.2.0"
+authors = [
+    {name = "Nicolas Aspert", email = "nicolas.aspert@epfl.ch"},
+    {name = "Benjamin Ricaud", email = "benjamin.ricaud@uit.no"},
+]
+license = {text = "Apache license"}
+description = "Graph exploration using inhomogeneous filtered diffusion"
+classifiers = [
+    "Development Status :: 4 - Beta",
+    "Intended Audience :: Science/Research",
+    "License :: OSI Approved :: Apache Software License",
+    "Operating System :: POSIX :: Linux",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.9",
+    "Programming Language :: Python :: 3.8",
+]
+urls = {Homepage = "https://gitlab.switch.ch/imi-sad/spikexplore"}
+requires-python = ">=3.8"
+dependencies = [
+    "pandas",
+    "numpy",
+    "networkx",
+    "tqdm",
+    "atproto",
+    "wikipedia-api",
+    "python-louvain",
+]
+
+[tool.setuptools]
+packages = ["spikexplore", "spikexplore.backends"]
+script-files = []
+include-package-data = false
diff --git a/setup.py b/setup.py
deleted file mode 100644
index 57387b7..0000000
--- a/setup.py
+++ /dev/null
@@ -1,24 +0,0 @@
-from setuptools import setup
-
-setup(
-    name="spikexplore",
-    version="0.1.0",
-    description="Graph exploration using inhomogeneous filtered diffusion",
-    url="https://github.com/epfl-lts2/spikexplore",
-    author="Nicolas Aspert, Benjamin Ricaud",
-    author_email="nicolas.aspert@epfl.ch, benjamin.ricaud@epfl.ch",
-    license="Apache license",
-    packages=["spikexplore", "spikexplore.backends"],
-    scripts=[],
-    install_requires=["pandas", "numpy", "networkx", "tqdm", "wikipedia-api", "python-louvain"],
-    python_requires=">=3.8",
-    classifiers=[
-        "Development Status :: 4 - Beta",
-        "Intended Audience :: Science/Research",
-        "License :: OSI Approved :: Apache License",
-        "Operating System :: POSIX :: Linux",
-        "Programming Language :: Python :: 3.10",
-        "Programming Language :: Python :: 3.9",
-        "Programming Language :: Python :: 3.8",
-    ],
-)

From a3be26a562af3fa10a5799e67a0c60c9ec77e2d1 Mon Sep 17 00:00:00 2001
From: Nicolas Aspert <nicolas.aspert@epfl.ch>
Date: Wed, 22 Jan 2025 09:16:46 +0100
Subject: [PATCH 09/11] remove requirements.txt and use only pyproject.toml for
 deps

---
 .gitlab-ci.yml   | 3 +--
 pyproject.toml   | 6 +++++-
 requirements.txt | 7 -------
 3 files changed, 6 insertions(+), 10 deletions(-)
 delete mode 100644 requirements.txt

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index e8553c9..9ea706a 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -20,8 +20,7 @@ before_script:
   - python -m pip install virtualenv
   - virtualenv venv
   - source venv/bin/activate
-  - pip install --upgrade -r requirements.txt
-  - pip install black==24.8.0 pytest
+  - pip install .[dev]
 
 Formatting:
   stage: format
diff --git a/pyproject.toml b/pyproject.toml
index 40d24fc..15e4633 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -31,7 +31,11 @@ dependencies = [
     "wikipedia-api",
     "python-louvain",
 ]
-
+[project.optional-dependencies]
+dev = [
+    "pytest",
+    "black==24.8.0"
+]
 [tool.setuptools]
 packages = ["spikexplore", "spikexplore.backends"]
 script-files = []
diff --git a/requirements.txt b/requirements.txt
deleted file mode 100644
index 9bdc472..0000000
--- a/requirements.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-numpy
-pandas
-networkx
-tqdm
-wikipedia-api
-python-louvain
-atproto

From 23c458c27357658f8b0b8e60f663f2904f869895 Mon Sep 17 00:00:00 2001
From: Nicolas Aspert <nicolas.aspert@epfl.ch>
Date: Wed, 22 Jan 2025 09:21:20 +0100
Subject: [PATCH 10/11] fix github workflow

---
 .github/workflows/main.yaml | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml
index ba86035..73e9305 100644
--- a/.github/workflows/main.yaml
+++ b/.github/workflows/main.yaml
@@ -19,8 +19,7 @@ jobs:
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip
-        python -m pip install pytest
-        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
+        python -m pip install .[dev]
     - name: Run tests
       env:
         BSKY_HANDLE: ${{ secrets.BSKY_HANDLE }}

From 9d2a4bba8e7ae6141ffca3f90942587ba877c13e Mon Sep 17 00:00:00 2001
From: Nicolas Aspert <nicolas.aspert@epfl.ch>
Date: Wed, 22 Jan 2025 09:28:53 +0100
Subject: [PATCH 11/11] update README

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 2001c51..67f0bc9 100644
--- a/README.md
+++ b/README.md
@@ -24,3 +24,4 @@ So far this implementation supports:
 - ~~Twitter using [v1 API](https://developer.twitter.com/en/docs/twitter-api/api-reference-index) through [Twython](https://twython.readthedocs.io/en/latest/)~~ 
 Twitter API is no longer available unless you pay. Latest version supporting it is v0.0.12. 
 - Wikipedia using [Mediawiki API](https://www.mediawiki.org/wiki/API:Main_page) through [Wikipedia-API](https://pypi.org/project/Wikipedia-API/)
+- Bluesky using the [AT Protocol](https://atproto.com/) through the [atproto Python SDK](https://atproto.blue/en/latest/)