-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathscrape.py
43 lines (35 loc) · 1.5 KB
/
scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
# importing libraries and packages
from datetime import datetime
from itertools import islice
import snscrape.modules.twitter as sntwitter
import pandas as pd
import csv
import sys

# for timing execution
begin_time = datetime.now()

# list to hold scraped tweets (one row per tweet)
tweets_list = []

# reading city,state from script input
# argv[1] should be city csv: one "City, ST" location per line,
# optionally with a trailing comma
locations_stripped = []
with open(sys.argv[1]) as csvfile:
    for line in csvfile:
        location = line.rstrip(",\n")
        # skip blank lines (e.g. a trailing newline at EOF) so we never
        # issue an empty near:"" search
        if location:
            locations_stripped.append(location)

# terms to be scraped for
terms = ['jew', 'joo', '(((', ')))', 'Rothschild', 'George Soros', 'New World Order', 'goyim', 'Holocaust', 'holahoax', 'holohoax', 'Shoah']

# scraping within 30 mi of each city in the locations_stripped list
for location in locations_stripped:
    for term in terms:
        # progress indicator: current (city, term) pair being scraped
        print(location, term)
        # near:"City, ST" must stay quoted because the location contains a comma
        query = f'{term} since:2017-01-01 until:2020-12-31 near:"{location}"within:30mi'
        # cap at 101 tweets per (location, term) pair — same count as the
        # original `if k > 100: break` (indices 0..100 inclusive)
        for tweet in islice(sntwitter.TwitterSearchScraper(query).get_items(), 101):
            tweets_list.append([tweet.id, tweet.date, tweet.content,
                                tweet.user.username, tweet.user.location,
                                tweet.lang, tweet.place])

# saving all scraped tweets to the output csv given as argv[2]
tweets_df = pd.DataFrame(tweets_list, columns=['ID', 'Datetime', 'Text', 'Username', 'Location', 'Language', 'Place'])
tweets_df.to_csv(sys.argv[2])

# calculating execution time
print("execution time", datetime.now() - begin_time)