Release 0.2 (#154)
* rms faulty line in build

* Analyser to create a XLSX for Gephi from Twitter scrape (#152)

* rms faulty line in build

* factor out common twint utilities

* gets direct replies through another twint search

* WIP: start preparing graph logic

* WIP: start structuring CSV graph

* actually export CSV

* minor fix

* update requirements.txt

* correct dest_q update

* add download_videos option to Twitter selector

* lint

* proper fix

* correct info.yaml

Co-authored-by: Lachlan Kermode
breezykermo authored May 8, 2020
1 parent 629cba2 commit 670f593
Showing 9 changed files with 335 additions and 20 deletions.
1 change: 0 additions & 1 deletion src/build/core-cpu.start.Dockerfile
@@ -63,7 +63,6 @@ RUN apt-get update --fix-missing
RUN mkdir -p /mtriage
COPY ./scripts /mtriage/scripts
COPY ./src /mtriage/src
-COPY ./credentials /mtriage/credentials
WORKDIR /mtriage

# *********************
1 change: 0 additions & 1 deletion src/build/core-gpu.start.Dockerfile
@@ -63,7 +63,6 @@ RUN apt-get update --fix-missing
RUN mkdir -p /mtriage
COPY ./scripts /mtriage/scripts
COPY ./src /mtriage/src
-COPY ./credentials /mtriage/credentials
WORKDIR /mtriage

# *********************
215 changes: 215 additions & 0 deletions src/lib/analysers/TwintToGephi/core.py
@@ -0,0 +1,215 @@
import os
import json
import twint
import pandas as pd
from pathlib import Path
from lib.common.analyser import Analyser
from lib.common.etypes import Etype
from lib.util.twint import to_serializable, pythonize


from collections import namedtuple
from datetime import datetime


def fmt_timestmap(dstamp, tstamp, tzone):
    ds = datetime.strptime(dstamp, "%Y-%m-%d")
    fmtted_ds = ds.strftime("%m/%d/%y")
    return f"{fmtted_ds} {tstamp}"


TMP = Path("/tmp")
TweetEdge = namedtuple(
    "TweetEdge", "date tweet urls domains hashtags tweet_id inreplyto_id"
)


class CsvGraph:
    node_labels = [
        "Vertex",
        "Followed",
        "Followers",
        "Tweets",
        "Favorites",
        "Description",
        "Location",
        "Web",
        "Time Zone",
        "Joined Twitter Date (UTC)",
    ]
    edge_labels = [
        "Vertex 1",
        "Vertex 2",
        "Width",
        "Relationship",
        "Relationship Date (UTC)",
        "Tweet",
        "URLs in Tweet",
        "Domains in Tweet",
        "Hashtags in Tweet",
        "Tweet Date (UTC)",
        "Twitter Page for Tweet",
        "Imported ID",
        "In-Reply-To Tweet ID",
    ]

    def __init__(self):
        self.nodes = []
        self.edges = []

    def has_node(self, name: str):
        return name in self.nodes

    def add_node(self, name: str):
        if name not in self.nodes:
            self.nodes.append(name)

    def add_edge(self, _from: dict, _to: dict):
        is_reply = _to is not None

        self.add_node(_from["username"])
        if is_reply:
            self.add_node(_to["username"])

        edge = TweetEdge(
            date=fmt_timestmap(
                _from["datestamp"], _from["timestamp"], _from["timezone"]
            ),
            tweet=_from["tweet"],
            urls=_from["urls"],
            domains=[],  # NB: no domains provided in obj
            hashtags=_from["hashtags"],
            tweet_id=_from["id"],
            inreplyto_id=_to["id"] if _to is not None else None,
        )

        self.edges.append(
            [
                _from["username"],
                _to["username"] if is_reply else _from["username"],
                1,  # width defaults to 1
                "Tweet" if not is_reply else "Replies To",  # relationship
                edge.date,  # relationship date
                edge.tweet,
                "- ".join(edge.urls) if isinstance(edge.urls, list) else edge.urls,
                "- ".join(edge.domains)
                if isinstance(edge.domains, list)
                else edge.domains,
                "- ".join(edge.hashtags)
                if isinstance(edge.hashtags, list)
                else edge.hashtags,
                edge.date,  # tweet date
f"https://twitter.com/${_from['username']}/status/${_from['id']}",
                edge.tweet_id,  # the tweet's id
                ""
                if not is_reply
                else edge.inreplyto_id,  # the id of the tweet to which this replies.
            ]
        )

    def to_xlsx(self, path):
        """ Save graph as XLSX file. The default tab will be edges, with an extra tab for nodes. """
        edge_df = pd.DataFrame.from_records(self.edges)
        edge_df.columns = CsvGraph.edge_labels
        node_df = pd.DataFrame.from_records([[x] for x in self.nodes])
        node_df.columns = ["Vertex"]

        writer = pd.ExcelWriter(path, engine="xlsxwriter")
        edge_df.to_excel(writer, sheet_name="Edges")
        node_df.to_excel(writer, sheet_name="Vertices")
        writer.save()


class TwintToGephi(Analyser):
    def pre_analyse(self, _):
        # keeps a record of which user ids have been indexed so that there's no
        # repeated work.
        self.indexed_ids = []
        # usernames (to easily check whether a user exists in the graph or not)
        self.graph = CsvGraph()

    def analyse_element(self, element: Etype.Json, _) -> Etype.Any:
        with open(element.paths[0], "r") as f:
            orig_tweet = json.load(f)
            orig_tweet = pythonize(orig_tweet)

        tweet_with_replies = [orig_tweet]
        reply_count = orig_tweet["replies_count"]
        # retweet_count = orig_tweet["retweets_count"]
        usr = orig_tweet["username"]

        # TODO: get retweets, as they are mentions
        # if retweet_count > 0:
        #     retweets = self.get_all_retweets(usr)

        if reply_count > 0 and usr not in self.indexed_ids:
            # TODO: keep a record so that we don't need to rescrape
            # self.indexed_ids.append(usr)

            all_tweets = self.get_all_tweets_sent_to(usr)
            conv_tweets = [
                tweet
                for tweet in all_tweets
                if tweet["conversation_id"] == orig_tweet["conversation_id"]
            ]
            if len(conv_tweets) > 0:
                tweet_with_replies = tweet_with_replies + conv_tweets
                self.logger(f"{len(conv_tweets)} replies added to tweet {element.id}.")

        output = TMP / f"{element.id}.json"
        with open(output, "w+") as f:
            json.dump(tweet_with_replies, f)

        element.paths = [output]

        return element

    def get_all_retweets(self, username):
        c = twint.Config()
        c.Username = username
        c.Retweets = True
        twint.run.Profile(c)

    def get_all_tweets_sent_to(self, username):
        """ See https://github.com/twintproject/twint/issues/513 """
        c = twint.Config()
        c.To = f"@{username}"
        c.Retweets = True
        c.Since = self.config["uploaded_after"]
        c.Until = self.config["uploaded_before"]
        c.Store_object = True
        self.logger(f"Scraping tweets sent to {username}...")
        twint.run.Search(c)
        results = twint.output.tweets_list
        twint.output.tweets_list = []

        return to_serializable(results)

    def add_to_graph(self, t, inreplyto=None):
        """ Add the relevant rows (for `nodes` and `edges`) to a graph from
        a Twint-formatted tweet (Python dictionary) """
        self.graph.add_node(t["username"])

        self.graph.add_edge(t, inreplyto)

    def post_analyse(self, _):
        # TODO: a kind of hack... should maybe make available as a func, i.e. `self.get_analysed()`
        analysed_els = self.disk.read_elements([self.dest_q])
        for el in analysed_els:
            el_json = el.paths[0]
            with open(el_json) as f:
                tweets = json.load(f)

            initial_tweet = tweets[0]
            self.logger(f"Adding tweet {initial_tweet['id']} to graph...")
            self.add_to_graph(initial_tweet)
            for tweet in tweets[1:]:
                self.logger(f"Adding reply {tweet['id']} to graph...")
                self.add_to_graph(tweet, inreplyto=initial_tweet)

        xlsx_path = TMP / "final.xlsx"
        self.graph.to_xlsx(xlsx_path)
        return Etype.Any("FINAL", xlsx_path)


module = TwintToGephi
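
For reference, here is a minimal sketch of how CsvGraph is populated and exported, following the add_edge/to_xlsx flow above. The two tweet dictionaries and the import path are assumptions made for the example (not values from a real scrape), and it presumes the pandas/xlsxwriter versions pinned by this analyser's requirements.

from lib.analysers.TwintToGephi.core import CsvGraph  # assumed import path for this module

# Hand-written tweet dicts in the shape add_edge expects; the values are invented.
original = {
    "id": 1, "username": "alice", "tweet": "release 0.2 is out",
    "datestamp": "2020-05-08", "timestamp": "12:00:00", "timezone": "UTC",
    "urls": ["https://example.com"], "hashtags": ["#mtriage"],
}
reply = {
    "id": 2, "username": "bob", "tweet": "congrats!",
    "datestamp": "2020-05-08", "timestamp": "12:05:00", "timezone": "UTC",
    "urls": [], "hashtags": [],
}

graph = CsvGraph()
graph.add_edge(original, None)      # standalone tweet: a "Tweet" edge from alice to alice
graph.add_edge(reply, original)     # reply: a "Replies To" edge from bob to alice
graph.to_xlsx("/tmp/example.xlsx")  # one workbook with "Edges" and "Vertices" sheets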
10 changes: 10 additions & 0 deletions src/lib/analysers/TwintToGephi/info.yaml
@@ -0,0 +1,10 @@
desc: Create a single element from Twitter elements, an XLSX workbook whose edge and vertex sheets specify a relational graph for Gephi. Because twint can only surface replies by scraping all tweets sent to a user and then filtering by conversation ID, `uploaded_before` and `uploaded_after` should be provided so that only relevant tweets need to be scraped.
args:
- name: uploaded_before
  desc: Only return tweets before this date.
  required: true
  input: date
- name: uploaded_after
  desc: Only return tweets after this date.
  required: true
  input: date
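
The two date bounds matter because twint cannot fetch a tweet's replies directly: get_all_tweets_sent_to scrapes every tweet addressed to the author within the window, and analyse_element keeps those that share the original tweet's conversation ID. A rough standalone sketch of that scrape-and-filter step, reusing the twint options from the code above; the username, dates, and conversation id are placeholders:

import twint

username = "alice"                       # placeholder author
uploaded_after = "2020-01-01"            # would come from the analyser's config
uploaded_before = "2020-05-08"
conversation_id = "1258400000000000000"  # placeholder id of the tweet being analysed

c = twint.Config()
c.To = f"@{username}"      # all tweets sent to the author...
c.Retweets = True
c.Since = uploaded_after   # ...restricted to the configured date window
c.Until = uploaded_before
c.Store_object = True
twint.run.Search(c)

results = twint.output.tweets_list
twint.output.tweets_list = []  # twint accumulates results globally; reset between searches

# Keep only the tweets that belong to the original tweet's conversation.
replies = [t for t in results if str(t.conversation_id) == conversation_id]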
2 changes: 2 additions & 0 deletions src/lib/analysers/TwintToGephi/requirements.txt
@@ -0,0 +1,2 @@
xlsxwriter
pandas
10 changes: 5 additions & 5 deletions src/lib/common/analyser.py
@@ -111,9 +111,9 @@ def analyse(
            # NB: `super` infra is necessary in case a storage class overwrites
            # the `read_query` method as LocalStorage does.
            og_query = super(type(self.disk), self.disk).read_query(element.query)
-           dest_q = f"{og_query[0]}/{self.name}"
+           self.dest_q = f"{og_query[0]}/{self.name}"

-           self.__attempt_analyse(5, element, dest_q)
+           self.__attempt_analyse(5, element)
        self.disk.delete_local_on_write = False

    @MTModule.phase("post-analyse")
@@ -133,12 +133,12 @@ def __post_analyse(self):
"Some instances of the final element produced via 'post_analyse' failed to save."
)

def __attempt_analyse(self, attempts, element, dest_q):
def __attempt_analyse(self, attempts, element):
try:
new_element = self.analyse_element(element, self.config)
if new_element is None:
return
success = self.disk.write_element(dest_q, new_element)
success = self.disk.write_element(self.dest_q, new_element)
if not success:
raise ElementShouldRetryError("Unsuccessful storage")

@@ -147,7 +147,7 @@ def __attempt_analyse(self, attempts, element, dest_q):
        except ElementShouldRetryError as e:
            self.error_logger(str(e), element)
            if attempts > 1:
-               return self.__attempt_analyse(attempts - 1, element, dest_q)
+               return self.__attempt_analyse(attempts - 1, element)
            else:
                self.error_logger(
                    "failed after maximum retries - skipping element", element
30 changes: 17 additions & 13 deletions src/lib/selectors/Twitter/core.py
@@ -5,6 +5,7 @@
from lib.common.selector import Selector
from lib.common.etypes import Etype, LocalElementsIndex
from lib.common.util import files
+from lib.util.twint import to_serializable

TMP = Path("/tmp")

@@ -26,11 +27,7 @@ def index(self, config):

        twint.run.Search(c)

-       def extract_fields(t):
-           return [t.id, t.datetime, t.tweet, ",".join(t.hashtags), ",".join(t.photos)]

-       tweets = list(map(extract_fields, twint.output.tweets_list))
-       tweets.insert(0, ["id", "datetime", "tweet", "hashtags", "photos"])
+       tweets = to_serializable(twint.output.tweets_list, as_list=True)
        return LocalElementsIndex(tweets)

def retrieve_element(self, element, _):
@@ -40,16 +37,23 @@ def retrieve_element(self, element, _):
            json.dump(element.__dict__, fp)

        # retrieve photos
-       photos = element.photos.split(",")
-       if len(photos) < 1 or photos[0] == "":
-           self.logger(f"{element.id} downloaded.")
-           return Etype.cast(element.id, files(base))
+       if "download_photos" in self.config and self.config.download_photos:
+           photos = element.photos.split(",")
+           if len(photos) < 1 or photos[0] == "":
+               self.logger(f"{element.id} downloaded.")
+               return Etype.cast(element.id, files(base))

+           for url in photos:
+               fname = url.rsplit("/", 1)[-1]
+               urlretrieve(url, base / fname)

+           self.logger(f"{element.id} downloaded (with images).")

-       for url in photos:
-           fname = url.rsplit("/", 1)[-1]
-           urlretrieve(url, base / fname)
+       if "download_videos" in self.config and self.config.download_videos:
+           if hasattr(element, "video") and element.video != "":
+               fname = element.video.rsplit("/", 1)[-1]
+               urlretrieve(element.video, base / fname)

-       self.logger(f"{element.id} downloaded (with images).")
+       self.disk.delete_local_on_write = True
        return Etype.cast(element.id, files(base))

9 changes: 9 additions & 0 deletions src/lib/selectors/Twitter/info.yaml
@@ -12,3 +12,12 @@ args:
  desc: Only return tweets after this date.
  required: true
  input: date
+- name: download_photos
+  required: false
+  desc: set to True if the selector should download photos in tweets. False by default.
+  input: boolean
+- name: download_videos
+  required: false
+  desc: set to True if the selector should download videos in tweets. False by default.
+  input: boolean
