
Commit 670f593

Release 0.2 (#154)
* rms faulty line in build
* Analyser to create a XLSX for Gephi from Twitter scrape (#152)
* rms faulty line in build
* factor out common twint utilities
* gets direct replies through another twint search
* WIP: start preparing graph logic
* WIP: start structuring CSV graph
* actually export CSV
* minor fix
* update requirements.txt
* correct dest_q update
* add download_videos option to Twitter selector
* lint
* proper fix
* correct info.yaml

Co-authored-by: Lachlan Kermode
1 parent 629cba2 commit 670f593

9 files changed: +335 -20 lines


src/build/core-cpu.start.Dockerfile

Lines changed: 0 additions & 1 deletion
@@ -63,7 +63,6 @@ RUN apt-get update --fix-missing
 RUN mkdir -p /mtriage
 COPY ./scripts /mtriage/scripts
 COPY ./src /mtriage/src
-COPY ./credentials /mtriage/credentials
 WORKDIR /mtriage
 
 # *********************

src/build/core-gpu.start.Dockerfile

Lines changed: 0 additions & 1 deletion
@@ -63,7 +63,6 @@ RUN apt-get update --fix-missing
 RUN mkdir -p /mtriage
 COPY ./scripts /mtriage/scripts
 COPY ./src /mtriage/src
-COPY ./credentials /mtriage/credentials
 WORKDIR /mtriage
 
 # *********************
Lines changed: 215 additions & 0 deletions
@@ -0,0 +1,215 @@
import os
import json
import twint
import pandas as pd
from pathlib import Path
from lib.common.analyser import Analyser
from lib.common.etypes import Etype
from lib.util.twint import to_serializable, pythonize


from collections import namedtuple
from datetime import datetime


def fmt_timestmap(dstamp, tstamp, tzone):
    ds = datetime.strptime(dstamp, "%Y-%m-%d")
    fmtted_ds = ds.strftime("%m/%d/%y")
    return f"{fmtted_ds} {tstamp}"


TMP = Path("/tmp")
TweetEdge = namedtuple(
    "TweetEdge", "date tweet urls domains hashtags tweet_id inreplyto_id"
)


class CsvGraph:
    node_labels = [
        "Vertex",
        "Followed",
        "Followers",
        "Tweets",
        "Favorites",
        "Description",
        "Location",
        "Web",
        "Time Zone",
        "Joined Twitter Date (UTC)",
    ]
    edge_labels = [
        "Vertex 1",
        "Vertex 2",
        "Width",
        "Relationship",
        "Relationship Date (UTC)",
        "Tweet",
        "URLs in Tweet",
        "Domains in Tweet",
        "Hashtags in Tweet",
        "Tweet Date (UTC)",
        "Twitter Page for Tweet",
        "Imported ID",
        "In-Reply-To Tweet ID",
    ]

    def __init__(self):
        self.nodes = []
        self.edges = []

    def has_node(self, name: str):
        return name in self.nodes

    def add_node(self, name: str):
        if name not in self.nodes:
            self.nodes.append(name)

    def add_edge(self, _from: dict, _to: dict):
        is_reply = _to is not None

        self.add_node(_from["username"])
        if is_reply:
            self.add_node(_to["username"])

        edge = TweetEdge(
            date=fmt_timestmap(
                _from["datestamp"], _from["timestamp"], _from["timezone"]
            ),
            tweet=_from["tweet"],
            urls=_from["urls"],
            domains=[],  # NB: no domains provided in obj
            hashtags=_from["hashtags"],
            tweet_id=_from["id"],
            inreplyto_id=_to["id"] if _to is not None else None,
        )

        self.edges.append(
            [
                _from["username"],
                _to["username"] if is_reply else _from["username"],
                1,  # width defaults to 1
                "Tweet" if not is_reply else "Replies To",  # relationship
                edge.date,  # relationship date
                edge.tweet,
                "- ".join(edge.urls) if isinstance(edge.urls, list) else edge.urls,
                "- ".join(edge.domains)
                if isinstance(edge.domains, list)
                else edge.domains,
                "- ".join(edge.hashtags)
                if isinstance(edge.hashtags, list)
                else edge.hashtags,
                edge.date,  # tweet date
                f"https://twitter.com/${_from['username']}/status/${_from['id']}",
                edge.tweet_id,  # the tweet's id
                ""
                if not is_reply
                else edge.inreplyto_id,  # the id of the tweet to which this replies.
            ]
        )

    def to_xlsx(self, path):
        """ Save graph as XLSX file. The default tab will be edges, with an extra tab for nodes. """
        edge_df = pd.DataFrame.from_records(self.edges)
        edge_df.columns = CsvGraph.edge_labels
        node_df = pd.DataFrame.from_records([[x] for x in self.nodes])
        node_df.columns = ["Vertex"]

        writer = pd.ExcelWriter(path, engine="xlsxwriter")
        edge_df.to_excel(writer, sheet_name="Edges")
        node_df.to_excel(writer, sheet_name="Vertices")
        writer.save()


class TwintToGephi(Analyser):
    def pre_analyse(self, _):
        # keeps a record of which user ids have been indexed so that there's no
        # repeated work.
        self.indexed_ids = []
        # usernames (to easily check whether a user exists in the graph or not)
        self.graph = CsvGraph()

    def analyse_element(self, element: Etype.Json, _) -> Etype.Any:
        with open(element.paths[0], "r") as f:
            orig_tweet = json.load(f)
        orig_tweet = pythonize(orig_tweet)

        tweet_with_replies = [orig_tweet]
        reply_count = orig_tweet["replies_count"]
        # retweet_count = orig_tweet["retweets_count"]
        usr = orig_tweet["username"]

        # TODO: get retweets, as they are mentions
        # if retweet_count > 0:
        # retweets = self.get_all_retweets(usr)

        if reply_count > 0 and usr not in self.indexed_ids:
            # TODO: keep a record so that we don't need to rescrape
            # self.indexed_ids.append(usr)

            all_tweets = self.get_all_tweets_sent_to(usr)
            conv_tweets = [
                tweet
                for tweet in all_tweets
                if tweet["conversation_id"] == orig_tweet["conversation_id"]
            ]
            if len(conv_tweets) > 0:
                tweet_with_replies = tweet_with_replies + conv_tweets
                self.logger(f"{len(conv_tweets)} replies added to tweet {element.id}.")

        output = TMP / f"{element.id}.json"
        with open(output, "w+") as f:
            json.dump(tweet_with_replies, f)

        element.paths = [output]

        return element

    def get_all_retweets(self, username):
        c = twint.Config()
        c.Username = username
        c.Retweets = True
        twint.run.Profile(c)

    def get_all_tweets_sent_to(self, username):
        """ See https://github.com/twintproject/twint/issues/513 """
        c = twint.Config()
        c.To = f"@{username}"
        c.Retweets = True
        c.Since = self.config["uploaded_after"]
        c.Until = self.config["uploaded_before"]
        c.Store_object = True
        self.logger(f"Scraping tweets sent to {username}...")
        twint.run.Search(c)
        results = twint.output.tweets_list
        twint.output.tweets_list = []

        return to_serializable(results)

    def add_to_graph(self, t, inreplyto=None):
        """ Add the relevant rows (for `nodes` and `edges`) to a graph from
        a Twint-formatted tweet (Python dictionary) """
        self.graph.add_node(t["username"])

        self.graph.add_edge(t, inreplyto)

    def post_analyse(self, _):
        # TODO: a kind of hack... should maybe make available as a func, i.e. `self.get_analysed()`
        analysed_els = self.disk.read_elements([self.dest_q])
        for el in analysed_els:
            el_json = el.paths[0]
            with open(el_json) as f:
                tweets = json.load(f)

            initial_tweet = tweets[0]
            self.logger(f"Adding tweet {initial_tweet['id']} to graph...")
            self.add_to_graph(initial_tweet)
            for tweet in tweets[1:]:
                self.logger(f"Adding reply {tweet['id']} to graph...")
                self.add_to_graph(tweet, inreplyto=initial_tweet)

        xlsx_path = TMP / "final.xlsx"
        self.graph.to_xlsx(xlsx_path)
        return Etype.Any("FINAL", xlsx_path)


module = TwintToGephi
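
The CsvGraph class above depends only on Twint-style tweet dictionaries, so it can be exercised on its own. A minimal sketch, assuming hand-written dicts with the keys that add_edge reads (username, datestamp, timestamp, timezone, tweet, urls, hashtags, id); all values are placeholders:

# Sketch: drive CsvGraph directly with Twint-style dicts (placeholder values).
original = {
    "username": "alice",
    "datestamp": "2020-05-01",
    "timestamp": "12:00:00",
    "timezone": "UTC",
    "tweet": "original tweet",
    "urls": [],
    "hashtags": ["#example"],
    "id": 1,
}
reply = dict(original, username="bob", tweet="a reply", id=2)

graph = CsvGraph()
graph.add_edge(original, None)      # row labelled "Tweet", alice -> alice
graph.add_edge(reply, original)     # row labelled "Replies To", bob -> alice
graph.to_xlsx("/tmp/example.xlsx")  # writes an "Edges" sheet and a "Vertices" sheet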
Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,10 @@
desc: Create a single element from Twitter elements, which contains two CSV files that specify a relational graph. As replies are determined by scraping all tweets in a user's timeline and then filtering by conversation ID, a requirement of twint, `uploaded_before` and `uploaded_after` should be provided so that only relevant tweets need to be scraped.
args:
  - name: uploaded_before
    desc: Only return tweets before this date.
    required: true
    input: date
  - name: uploaded_after
    desc: Only return tweets after this date.
    required: true
    input: date
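
The description above reflects a twint limitation: replies to a single tweet cannot be fetched directly (see the issue linked in get_all_tweets_sent_to), so the analyser scrapes everything sent to the author within the configured date window and keeps only tweets from the same conversation. A minimal sketch of that pattern outside mtriage, with a placeholder username, date window, and conversation id:

# Sketch of the reply-collection approach described above; all values are placeholders.
import twint

c = twint.Config()
c.To = "@some_user"
c.Since = "2020-01-01"   # corresponds to uploaded_after
c.Until = "2020-02-01"   # corresponds to uploaded_before
c.Retweets = True
c.Store_object = True
twint.run.Search(c)

scraped = twint.output.tweets_list
twint.output.tweets_list = []  # reset the module-level buffer between searches

# keep only tweets that belong to the conversation of interest
replies = [t for t in scraped if t.conversation_id == "1234567890"]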
Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
xlsxwriter
pandas
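
These two packages cover the export step in to_xlsx above: pandas builds the sheets and xlsxwriter is the engine that writes them. A one-line sanity check, assuming a writable /tmp path:

# Sanity check that pandas can write XLSX through the xlsxwriter engine (path is a placeholder).
import pandas as pd

pd.DataFrame({"Vertex": ["alice", "bob"]}).to_excel("/tmp/check.xlsx", engine="xlsxwriter")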

src/lib/common/analyser.py

Lines changed: 5 additions & 5 deletions
@@ -111,9 +111,9 @@ def analyse(
         # NB: `super` infra is necessary in case a storage class overwrites
         # the `read_query` method as LocalStorage does.
         og_query = super(type(self.disk), self.disk).read_query(element.query)
-        dest_q = f"{og_query[0]}/{self.name}"
+        self.dest_q = f"{og_query[0]}/{self.name}"
 
-        self.__attempt_analyse(5, element, dest_q)
+        self.__attempt_analyse(5, element)
         self.disk.delete_local_on_write = False
 
     @MTModule.phase("post-analyse")
@@ -133,12 +133,12 @@ def __post_analyse(self):
                 "Some instances of the final element produced via 'post_analyse' failed to save."
             )
 
-    def __attempt_analyse(self, attempts, element, dest_q):
+    def __attempt_analyse(self, attempts, element):
         try:
             new_element = self.analyse_element(element, self.config)
             if new_element is None:
                 return
-            success = self.disk.write_element(dest_q, new_element)
+            success = self.disk.write_element(self.dest_q, new_element)
             if not success:
                 raise ElementShouldRetryError("Unsuccessful storage")
 
@@ -147,7 +147,7 @@ def __attempt_analyse(self, attempts, element, dest_q):
         except ElementShouldRetryError as e:
             self.error_logger(str(e), element)
             if attempts > 1:
-                return self.__attempt_analyse(attempts - 1, element, dest_q)
+                return self.__attempt_analyse(attempts - 1, element)
             else:
                 self.error_logger(
                     "failed after maximum retries - skipping element", element

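Keeping the destination query on self is what lets a subclass reach the already-analysed elements again after the per-element phase; TwintToGephi's post_analyse above relies on exactly this. A minimal sketch of the pattern, with a hypothetical subclass name:

# Sketch: with dest_q stored on self, post_analyse can re-read everything written
# during analyse_element. The class name is hypothetical.
class CollectingAnalyser(Analyser):
    def analyse_element(self, element, config):
        return element  # the base class writes the result to self.dest_q

    def post_analyse(self, _):
        analysed = self.disk.read_elements([self.dest_q])
        self.logger(f"{len(analysed)} analysed elements available for aggregation.")
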
src/lib/selectors/Twitter/core.py

Lines changed: 17 additions & 13 deletions
@@ -5,6 +5,7 @@
 from lib.common.selector import Selector
 from lib.common.etypes import Etype, LocalElementsIndex
 from lib.common.util import files
+from lib.util.twint import to_serializable
 
 TMP = Path("/tmp")
 
@@ -26,11 +27,7 @@ def index(self, config):
 
         twint.run.Search(c)
 
-        def extract_fields(t):
-            return [t.id, t.datetime, t.tweet, ",".join(t.hashtags), ",".join(t.photos)]
-
-        tweets = list(map(extract_fields, twint.output.tweets_list))
-        tweets.insert(0, ["id", "datetime", "tweet", "hashtags", "photos"])
+        tweets = to_serializable(twint.output.tweets_list, as_list=True)
         return LocalElementsIndex(tweets)
 
     def retrieve_element(self, element, _):
@@ -40,16 +37,23 @@ def retrieve_element(self, element, _):
             json.dump(element.__dict__, fp)
 
         # retrieve photos
-        photos = element.photos.split(",")
-        if len(photos) < 1 or photos[0] == "":
-            self.logger(f"{element.id} downloaded.")
-            return Etype.cast(element.id, files(base))
+        if "download_photos" in self.config and self.config.download_photos:
+            photos = element.photos.split(",")
+            if len(photos) < 1 or photos[0] == "":
+                self.logger(f"{element.id} downloaded.")
+                return Etype.cast(element.id, files(base))
+
+            for url in photos:
+                fname = url.rsplit("/", 1)[-1]
+                urlretrieve(url, base / fname)
+
+            self.logger(f"{element.id} downloaded (with images).")
 
-        for url in photos:
-            fname = url.rsplit("/", 1)[-1]
-            urlretrieve(url, base / fname)
+        if "download_videos" in self.config and self.config.download_videos:
+            if hasattr(element, "video") and element.video != "":
+                fname = element.video.rsplit("/", 1)[-1]
+                urlretrieve(element.video, base / fname)
 
-        self.logger(f"{element.id} downloaded (with images).")
         self.disk.delete_local_on_write = True
         return Etype.cast(element.id, files(base))
 

src/lib/selectors/Twitter/info.yaml

Lines changed: 9 additions & 0 deletions
@@ -12,3 +12,12 @@ args:
     desc: Only return tweets after this date.
     required: true
     input: date
+  - name: download_photos
+    required: false
+    desc: set to True if the selector should download photos in tweets. False by default.
+    input: boolean
+  - name: download_videos
+    required: false
+    desc: set to True if the selector should download videos in tweets. False by default.
+    input: boolean
+
