This repository was archived by the owner on Nov 30, 2022. It is now read-only.

Commit bdbba90

medium articles details scrapper
1 parent c969801 commit bdbba90

4 files changed: +134 -0 lines changed
Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
# Unnecessary Files

app/__pycache__/
Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
# Medium-Articles-Details-Scrapping

This script scrapes details about Medium articles published in a date range in the given publications. The dates are chosen randomly; if no article was published on a chosen date, that date is skipped. The result is a dataframe that can be saved in any format; currently it is saved as CSV.
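Under the hood, each randomly chosen date is turned into a request against the publication's per-day archive page, and a redirect away from that URL is treated as "no articles on this day". A minimal sketch of that check (the publication URL and date below are example values, not the script's defaults):

```python
import requests

# <publication url>/archive/YYYY/MM/DD is the page the scraper parses.
archive_url = "https://medium.com/swlh/archive/2020/03/15"
response = requests.get(archive_url, allow_redirects=True)

# If the final URL no longer starts with the archive URL, the day had no posts and is skipped.
if response.url.startswith(archive_url):
    print("articles found; parse the page")
else:
    print("no articles on this date; skipping")
```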
# Requirements
- numpy
- pandas
- bs4
- requests
# How to run?
- Open run.py and add the dictionary of URLs, the date range, and the number of random dates (see the example below).
- Save the file.
- Run the command: python run.py
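A minimal run.py sketch (assuming the app package from this repository is importable; the publication name and dates are placeholders to replace with your own):

```python
from app import Scrap

# One publication, 10 random dates between the two bounds.
scraper = Scrap(urls_dict={"The Startup": "https://medium.com/swlh"},
                start_date='2020-01-01', end_date='2020-08-01',
                number=10, year=2020)
scraper.scrap()
scraper.dataframe().to_csv('results.csv')
```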
Lines changed: 108 additions & 0 deletions
@@ -0,0 +1,108 @@
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import itertools
import time


class Scrap:
    """
    A scraper to get details about Medium articles published in a date range in a publication by selecting random dates.

    Attributes
    ----------
    urls_dict : dict
        key-value pairs of publication name and link. Example:
        urls_dict={"The Startup": "https://medium.com/swlh"}

    start_date : str
        starting date of the search. Default: 2020-01-01

    end_date : str
        ending date of the search. Default: 2020-08-01

    year : int
        year in which the search has to be done. Default: 2020

    number : int
        number of random dates to pick. Default: 10

    Methods
    -------
    scrap():
        Starts the scraping process.

    dataframe():
        Returns the scraped results as a pandas DataFrame.
    """

    def __init__(self, urls_dict, start_date='2020-01-01', end_date='2020-08-01', number=10, year=2020):
        self.urls = urls_dict
        self.start = pd.to_datetime(start_date)
        self.end = pd.to_datetime(end_date)
        self.n = number
        self.year = year
        self.titles = []
        self.sub_titles = []
        self.article_link = []
        self.claps = []
        self.reading_time = []
        self.responses = []
        self.pubs = []
        self.dates_list = []

    def randDates(self):
        # Draw self.n random Unix timestamps between start and end, then view them as dates.
        start_u = self.start.value // 10**9
        end_u = self.end.value // 10**9

        return pd.DatetimeIndex((10**9 * np.random.randint(start_u, end_u, self.n, dtype=np.int64)).view('M8[ns]')).date

    def scrap(self):
        dates = pd.to_datetime(pd.Series(self.randDates()))
        for i in range(len(dates)):
            month = dates.dt.month[i]
            day = dates.dt.day[i]
            for publication, url in self.urls.items():
                # Medium's per-day archive page: <publication url>/archive/YYYY/MM/DD
                url = url + '/archive/{0}/{1:02d}/{2:02d}'
                print(f'Publication: {publication}, Date: {self.year}-{month}-{day}')
                response = requests.get(url.format(self.year, month, day), allow_redirects=True)
                # A redirect away from the archive URL means no articles were published that day.
                if not response.url.startswith(url.format(self.year, month, day)):
                    continue
                page = response.content
                soup = BeautifulSoup(page, 'html.parser')
                articles = soup.find_all("div", class_="postArticle postArticle--short js-postArticle js-trackPostPresentation js-trackPostScrolls")

                number = len(articles)

                self.titles.append([i.find('h3', class_="graf--title").text if i.find('h3', class_="graf--title") is not None else '' for i in articles])

                self.sub_titles.append([i.find("h4", class_="graf--subtitle").text if i.find("h4", class_="graf--subtitle") is not None else '' for i in articles])

                self.article_link.append([i.find_all('a')[3]['href'].split('?')[0] for i in articles])

                # Clap counts arrive as strings such as '' or '1.2K'; normalise them to integers.
                self.claps.append([0 if (k is None) or (k == '') or (k.split is None) else int(float(k.split('K')[0])*1000) if len(k.split('K')) == 2 else int(float(k.split('K')[0])) for k in [j.text for j in [i.find_all('button')[1] for i in articles]]])

                self.reading_time.append([int(i.find("span", class_="readingTime")['title'].split()[0]) if i.find("span", class_="readingTime") is not None else 0 for i in articles])

                self.responses.append([i.find_all('a')[6].text.split(' ')[0] if (len(i.find_all('a')) == 7) and len(i.find_all('a')[6].text.split(' ')) != 0 else 0 for i in articles])

                self.pubs.append([publication]*number)

                # Repeat the date once per article so every column flattens to the same length.
                self.dates_list.append([f'{self.year}-{month}-{day}']*number)

                time.sleep(0.3)

    def dataframe(self):
        columns = ['Title', 'SubTitle', 'Link', 'Claps', 'Reading_Time', 'Responses', 'Publication', 'Date_Published']
        titles = list(itertools.chain.from_iterable(self.titles))
        sub_titles = list(itertools.chain.from_iterable(self.sub_titles))
        article_link = list(itertools.chain.from_iterable(self.article_link))
        claps = list(itertools.chain.from_iterable(self.claps))
        reading_time = list(itertools.chain.from_iterable(self.reading_time))
        responses = list(itertools.chain.from_iterable(self.responses))
        pubs = list(itertools.chain.from_iterable(self.pubs))
        dates = list(itertools.chain.from_iterable(self.dates_list))

        return pd.DataFrame(zip(titles, sub_titles, article_link, claps, reading_time, responses, pubs, dates), columns=columns)
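The claps expression above folds several cases into one line: an empty or missing count becomes 0, a plain number is kept, and a shortened count such as '1.2K' is expanded to 1200. A standalone sketch of the same logic, written out step by step (parse_claps is a hypothetical helper name, not part of this commit):

```python
def parse_claps(text):
    """Convert Medium's clap string (e.g. '', '87', '1.2K') to an integer."""
    if not text:
        return 0
    if len(text.split('K')) == 2:
        # '1.2K' -> 1200
        return int(float(text.split('K')[0]) * 1000)
    return int(float(text))

assert parse_claps('') == 0
assert parse_claps('87') == 87
assert parse_claps('1.2K') == 1200
```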
Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,10 @@
from app import Scrap


a = Scrap(urls_dict={"Towards Data Science": "https://towardsdatascience.com",
                     "The Startup": "https://medium.com/swlh",
                     },
          number=50,
          start_date='2019-01-01', end_date='2019-08-01', year=2019)
a.scrap()
a.dataframe().to_csv('results.csv')
print(a.dataframe())
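As a quick sanity check after the run (a sketch, assuming results.csv was written to the working directory by the script above):

```python
import pandas as pd

df = pd.read_csv('results.csv', index_col=0)
print(df.shape)                          # (number of scraped articles, 8 columns)
print(df['Publication'].value_counts())  # articles collected per publication
```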
