Release 0.2 (#154)
* rms faulty line in build

* Analyser to create a XLSX for Gephi from Twitter scrape (#152)

* rms faulty line in build

* factor out common twint utilities

* gets direct replies through another twint search

* WIP: start preparing graph logic

* WIP: start structuring CSV graph

* actually export CSV

* minor fix

* update requirements.txt

* correct dest_q update

* add download_videos option to Twitter selector

* lint

* proper fix

* correct info.yaml

Co-authored-by: Lachlan Kermode
breezykermo authored May 8, 2020
1 parent 629cba2 commit 670f593
Showing 9 changed files with 335 additions and 20 deletions.
1 change: 0 additions & 1 deletion src/build/core-cpu.start.Dockerfile
@@ -63,7 +63,6 @@ RUN apt-get update --fix-missing
RUN mkdir -p /mtriage
COPY ./scripts /mtriage/scripts
COPY ./src /mtriage/src
-COPY ./credentials /mtriage/credentials
WORKDIR /mtriage

# *********************
1 change: 0 additions & 1 deletion src/build/core-gpu.start.Dockerfile
@@ -63,7 +63,6 @@ RUN apt-get update --fix-missing
RUN mkdir -p /mtriage
COPY ./scripts /mtriage/scripts
COPY ./src /mtriage/src
-COPY ./credentials /mtriage/credentials
WORKDIR /mtriage

# *********************
215 changes: 215 additions & 0 deletions src/lib/analysers/TwintToGephi/core.py
@@ -0,0 +1,215 @@
import os
import json
import twint
import pandas as pd
from pathlib import Path
from lib.common.analyser import Analyser
from lib.common.etypes import Etype
from lib.util.twint import to_serializable, pythonize


from collections import namedtuple
from datetime import datetime


def fmt_timestmap(dstamp, tstamp, tzone):
    ds = datetime.strptime(dstamp, "%Y-%m-%d")
    fmtted_ds = ds.strftime("%m/%d/%y")
    return f"{fmtted_ds} {tstamp}"


TMP = Path("/tmp")
TweetEdge = namedtuple(
    "TweetEdge", "date tweet urls domains hashtags tweet_id inreplyto_id"
)


class CsvGraph:
    node_labels = [
        "Vertex",
        "Followed",
        "Followers",
        "Tweets",
        "Favorites",
        "Description",
        "Location",
        "Web",
        "Time Zone",
        "Joined Twitter Date (UTC)",
    ]
    edge_labels = [
        "Vertex 1",
        "Vertex 2",
        "Width",
        "Relationship",
        "Relationship Date (UTC)",
        "Tweet",
        "URLs in Tweet",
        "Domains in Tweet",
        "Hashtags in Tweet",
        "Tweet Date (UTC)",
        "Twitter Page for Tweet",
        "Imported ID",
        "In-Reply-To Tweet ID",
    ]

    def __init__(self):
        self.nodes = []
        self.edges = []

    def has_node(self, name: str):
        return name in self.nodes

    def add_node(self, name: str):
        if name not in self.nodes:
            self.nodes.append(name)

    def add_edge(self, _from: dict, _to: dict):
        is_reply = _to is not None

        self.add_node(_from["username"])
        if is_reply:
            self.add_node(_to["username"])

        edge = TweetEdge(
            date=fmt_timestmap(
                _from["datestamp"], _from["timestamp"], _from["timezone"]
            ),
            tweet=_from["tweet"],
            urls=_from["urls"],
            domains=[],  # NB: no domains provided in obj
            hashtags=_from["hashtags"],
            tweet_id=_from["id"],
            inreplyto_id=_to["id"] if _to is not None else None,
        )

        self.edges.append(
            [
                _from["username"],
                _to["username"] if is_reply else _from["username"],
                1,  # width defaults to 1
                "Tweet" if not is_reply else "Replies To",  # relationship
                edge.date,  # relationship date
                edge.tweet,
                "- ".join(edge.urls) if isinstance(edge.urls, list) else edge.urls,
                "- ".join(edge.domains)
                if isinstance(edge.domains, list)
                else edge.domains,
                "- ".join(edge.hashtags)
                if isinstance(edge.hashtags, list)
                else edge.hashtags,
                edge.date,  # tweet date
f"https://twitter.com/${_from['username']}/status/${_from['id']}",
                edge.tweet_id,  # the tweet's id
                ""
                if not is_reply
                else edge.inreplyto_id,  # the id of the tweet to which this replies.
            ]
        )

    def to_xlsx(self, path):
        """ Save graph as XLSX file. The default tab will be edges, with an extra tab for nodes. """
        edge_df = pd.DataFrame.from_records(self.edges)
        edge_df.columns = CsvGraph.edge_labels
        node_df = pd.DataFrame.from_records([[x] for x in self.nodes])
        node_df.columns = ["Vertex"]

        writer = pd.ExcelWriter(path, engine="xlsxwriter")
        edge_df.to_excel(writer, sheet_name="Edges")
        node_df.to_excel(writer, sheet_name="Vertices")
        writer.save()


class TwintToGephi(Analyser):
    def pre_analyse(self, _):
        # keeps a record of which user ids have been indexed so that there's no
        # repeated work.
        self.indexed_ids = []
        # usernames (to easily check whether a user exists in the graph or not)
        self.graph = CsvGraph()

    def analyse_element(self, element: Etype.Json, _) -> Etype.Any:
        with open(element.paths[0], "r") as f:
            orig_tweet = json.load(f)
            orig_tweet = pythonize(orig_tweet)

        tweet_with_replies = [orig_tweet]
        reply_count = orig_tweet["replies_count"]
        # retweet_count = orig_tweet["retweets_count"]
        usr = orig_tweet["username"]

        # TODO: get retweets, as they are mentions
        # if retweet_count > 0:
        #     retweets = self.get_all_retweets(usr)

        if reply_count > 0 and usr not in self.indexed_ids:
            # TODO: keep a record so that we don't need to rescrape
            # self.indexed_ids.append(usr)

            all_tweets = self.get_all_tweets_sent_to(usr)
            conv_tweets = [
                tweet
                for tweet in all_tweets
                if tweet["conversation_id"] == orig_tweet["conversation_id"]
            ]
            if len(conv_tweets) > 0:
                tweet_with_replies = tweet_with_replies + conv_tweets
                self.logger(f"{len(conv_tweets)} replies added to tweet {element.id}.")

        output = TMP / f"{element.id}.json"
        with open(output, "w+") as f:
            json.dump(tweet_with_replies, f)

        element.paths = [output]

        return element

    def get_all_retweets(self, username):
        c = twint.Config()
        c.Username = username
        c.Retweets = True
        twint.run.Profile(c)

    def get_all_tweets_sent_to(self, username):
        """ See https://github.com/twintproject/twint/issues/513 """
        c = twint.Config()
        c.To = f"@{username}"
        c.Retweets = True
        c.Since = self.config["uploaded_after"]
        c.Until = self.config["uploaded_before"]
        c.Store_object = True
        self.logger(f"Scraping tweets sent to {username}...")
        twint.run.Search(c)
        results = twint.output.tweets_list
        twint.output.tweets_list = []

        return to_serializable(results)

    def add_to_graph(self, t, inreplyto=None):
        """ Add the relevant rows (for `nodes` and `edges`) to a graph from
        a Twint-formatted tweet (Python dictionary) """
        self.graph.add_node(t["username"])

        self.graph.add_edge(t, inreplyto)

    def post_analyse(self, _):
        # TODO: a kind of hack... should maybe make available as a func, i.e. `self.get_analysed()`
        analysed_els = self.disk.read_elements([self.dest_q])
        for el in analysed_els:
            el_json = el.paths[0]
            with open(el_json) as f:
                tweets = json.load(f)

            initial_tweet = tweets[0]
            self.logger(f"Adding tweet {initial_tweet['id']} to graph...")
            self.add_to_graph(initial_tweet)
            for tweet in tweets[1:]:
                self.logger(f"Adding reply {tweet['id']} to graph...")
                self.add_to_graph(tweet, inreplyto=initial_tweet)

        xlsx_path = TMP / "final.xlsx"
        self.graph.to_xlsx(xlsx_path)
        return Etype.Any("FINAL", xlsx_path)


module = TwintToGephi
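
For reference, here is a minimal sketch of how CsvGraph is populated and exported, following the add_edge/to_xlsx flow above. The two tweet dictionaries and the import path are assumptions made for the example (not values from a real scrape), and it presumes the pandas/xlsxwriter versions pinned by this analyser's requirements.

from lib.analysers.TwintToGephi.core import CsvGraph  # assumed import path for this module

# Hand-written tweet dicts in the shape add_edge expects; the values are invented.
original = {
    "id": 1, "username": "alice", "tweet": "release 0.2 is out",
    "datestamp": "2020-05-08", "timestamp": "12:00:00", "timezone": "UTC",
    "urls": ["https://example.com"], "hashtags": ["#mtriage"],
}
reply = {
    "id": 2, "username": "bob", "tweet": "congrats!",
    "datestamp": "2020-05-08", "timestamp": "12:05:00", "timezone": "UTC",
    "urls": [], "hashtags": [],
}

graph = CsvGraph()
graph.add_edge(original, None)      # standalone tweet: a "Tweet" edge from alice to alice
graph.add_edge(reply, original)     # reply: a "Replies To" edge from bob to alice
graph.to_xlsx("/tmp/example.xlsx")  # one workbook with "Edges" and "Vertices" sheets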
10 changes: 10 additions & 0 deletions src/lib/analysers/TwintToGephi/info.yaml
@@ -0,0 +1,10 @@
desc: Create a single element from Twitter elements, an XLSX workbook whose edge and vertex sheets specify a relational graph for Gephi. Because twint can only surface replies by scraping all tweets sent to a user and then filtering by conversation ID, `uploaded_before` and `uploaded_after` should be provided so that only relevant tweets need to be scraped.
args:
- name: uploaded_before
  desc: Only return tweets before this date.
  required: true
  input: date
- name: uploaded_after
  desc: Only return tweets after this date.
  required: true
  input: date
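
The two date bounds matter because twint cannot fetch a tweet's replies directly: get_all_tweets_sent_to scrapes every tweet addressed to the author within the window, and analyse_element keeps those that share the original tweet's conversation ID. A rough standalone sketch of that scrape-and-filter step, reusing the twint options from the code above; the username, dates, and conversation id are placeholders:

import twint

username = "alice"                       # placeholder author
uploaded_after = "2020-01-01"            # would come from the analyser's config
uploaded_before = "2020-05-08"
conversation_id = "1258400000000000000"  # placeholder id of the tweet being analysed

c = twint.Config()
c.To = f"@{username}"      # all tweets sent to the author...
c.Retweets = True
c.Since = uploaded_after   # ...restricted to the configured date window
c.Until = uploaded_before
c.Store_object = True
twint.run.Search(c)

results = twint.output.tweets_list
twint.output.tweets_list = []  # twint accumulates results globally; reset between searches

# Keep only the tweets that belong to the original tweet's conversation.
replies = [t for t in results if str(t.conversation_id) == conversation_id]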
2 changes: 2 additions & 0 deletions src/lib/analysers/TwintToGephi/requirements.txt
@@ -0,0 +1,2 @@
xlsxwriter
pandas
10 changes: 5 additions & 5 deletions src/lib/common/analyser.py
@@ -111,9 +111,9 @@ def analyse(
            # NB: `super` infra is necessary in case a storage class overwrites
            # the `read_query` method as LocalStorage does.
            og_query = super(type(self.disk), self.disk).read_query(element.query)
-           dest_q = f"{og_query[0]}/{self.name}"
+           self.dest_q = f"{og_query[0]}/{self.name}"

-           self.__attempt_analyse(5, element, dest_q)
+           self.__attempt_analyse(5, element)
        self.disk.delete_local_on_write = False

    @MTModule.phase("post-analyse")
@@ -133,12 +133,12 @@ def __post_analyse(self):
"Some instances of the final element produced via 'post_analyse' failed to save."
)

def __attempt_analyse(self, attempts, element, dest_q):
def __attempt_analyse(self, attempts, element):
try:
new_element = self.analyse_element(element, self.config)
if new_element is None:
return
success = self.disk.write_element(dest_q, new_element)
success = self.disk.write_element(self.dest_q, new_element)
if not success:
raise ElementShouldRetryError("Unsuccessful storage")

@@ -147,7 +147,7 @@ def __attempt_analyse(self, attempts, element, dest_q):
        except ElementShouldRetryError as e:
            self.error_logger(str(e), element)
            if attempts > 1:
-               return self.__attempt_analyse(attempts - 1, element, dest_q)
+               return self.__attempt_analyse(attempts - 1, element)
            else:
                self.error_logger(
                    "failed after maximum retries - skipping element", element
30 changes: 17 additions & 13 deletions src/lib/selectors/Twitter/core.py
@@ -5,6 +5,7 @@
from lib.common.selector import Selector
from lib.common.etypes import Etype, LocalElementsIndex
from lib.common.util import files
+from lib.util.twint import to_serializable

TMP = Path("/tmp")

@@ -26,11 +27,7 @@ def index(self, config):

        twint.run.Search(c)

-       def extract_fields(t):
-           return [t.id, t.datetime, t.tweet, ",".join(t.hashtags), ",".join(t.photos)]

-       tweets = list(map(extract_fields, twint.output.tweets_list))
-       tweets.insert(0, ["id", "datetime", "tweet", "hashtags", "photos"])
+       tweets = to_serializable(twint.output.tweets_list, as_list=True)
        return LocalElementsIndex(tweets)

def retrieve_element(self, element, _):
@@ -40,16 +37,23 @@ def retrieve_element(self, element, _):
            json.dump(element.__dict__, fp)

        # retrieve photos
-       photos = element.photos.split(",")
-       if len(photos) < 1 or photos[0] == "":
-           self.logger(f"{element.id} downloaded.")
-           return Etype.cast(element.id, files(base))
+       if "download_photos" in self.config and self.config.download_photos:
+           photos = element.photos.split(",")
+           if len(photos) < 1 or photos[0] == "":
+               self.logger(f"{element.id} downloaded.")
+               return Etype.cast(element.id, files(base))

+           for url in photos:
+               fname = url.rsplit("/", 1)[-1]
+               urlretrieve(url, base / fname)

+           self.logger(f"{element.id} downloaded (with images).")

-       for url in photos:
-           fname = url.rsplit("/", 1)[-1]
-           urlretrieve(url, base / fname)
+       if "download_videos" in self.config and self.config.download_videos:
+           if hasattr(element, "video") and element.video != "":
+               fname = element.video.rsplit("/", 1)[-1]
+               urlretrieve(element.video, base / fname)

-       self.logger(f"{element.id} downloaded (with images).")
+       self.disk.delete_local_on_write = True
        return Etype.cast(element.id, files(base))

9 changes: 9 additions & 0 deletions src/lib/selectors/Twitter/info.yaml
@@ -12,3 +12,12 @@ args:
  desc: Only return tweets after this date.
  required: true
  input: date
+- name: download_photos
+  required: false
+  desc: set to True if the selector should download photos in tweets. False by default.
+  input: boolean
+- name: download_videos
+  required: false
+  desc: set to True if the selector should download videos in tweets. False by default.
+  input: boolean
