# reddit_scraper.py
import requests
import csv
import time
from bs4 import BeautifulSoup


class HaikuScraper:
    """
    Scrapes haikus (short Japanese-style poems) from the r/haiku subreddit
    on old.reddit.com, where each post title is the haiku itself.
    """

    def __init__(self, url: str, headers: dict):
        self.url = url
        self.headers = headers

    def make_request(self):
        """Fetch the page at self.url and return it as a parsed BeautifulSoup object."""
        time.sleep(3)  # pause between requests to avoid hammering Reddit
        page = requests.get(self.url, headers=self.headers)
        return BeautifulSoup(page.text, 'html.parser')

    def get_next_page(self, soup: BeautifulSoup):
        """Return the URL of the next listing page, or None on the last page."""
        time.sleep(3)
        next_button = soup.find('span', class_='next-button')
        if next_button is None:  # the last page has no "next" button
            return None
        return next_button.find("a").attrs['href']

    def get_haikus(self, soup: BeautifulSoup):
        """Extract the post titles (the haikus) from a listing page."""
        # Post title links on old.reddit.com listing pages
        return [title.text for title in soup.find_all("a", class_="title may-blank ")]

    def write_haikus_to_csv(self, haikus: list):
        """Append the haikus, one per row, to the output file."""
        # newline='' prevents csv.writer from inserting blank rows on Windows;
        # the with-block closes the file, so no explicit f.close() is needed.
        with open('scraped_haikus_v2.txt', 'a', newline='') as f:
            writer = csv.writer(f)
            for haiku in haikus:
                writer.writerow([haiku])


url = "https://old.reddit.com/r/haiku/"
# Headers to mimic a browser visit
headers = {'User-Agent': 'Mozilla/5.0'}

# Scrape the first page, then follow the "next" button for up to 2500 more pages.
scraper = HaikuScraper(url, headers)
soup = scraper.make_request()
haikus = scraper.get_haikus(soup)
scraper.write_haikus_to_csv(haikus)

counter = 1
while counter <= 2500:
    time.sleep(2)
    link = scraper.get_next_page(soup)
    if link is None:  # no "next" button means we reached the last page
        break
    print(f"Page {counter + 1}. Link {link}.")
    scraper = HaikuScraper(link, headers)
    soup = scraper.make_request()
    haikus = scraper.get_haikus(soup)
    scraper.write_haikus_to_csv(haikus)
    counter += 1
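
# A minimal sketch of reading the scraped output back for a quick sanity check.
# It assumes the loop above has already appended rows to 'scraped_haikus_v2.txt'
# in the current working directory; each row holds one haiku in its first column.
with open('scraped_haikus_v2.txt', newline='') as f:
    for row in csv.reader(f):
        if row:  # skip any blank rows
            print(row[0])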