
Commit 8ec4b27

Merge pull request #125 from kaustubhgupta/dev
Medium Articles Details Scrapper
2 parents 1104372 + 6a1d2ca

5 files changed: +156 −0
@@ -0,0 +1,4 @@
# Unnecessary files

app/__pycache__/
.idea
@@ -0,0 +1,42 @@
# Medium-Articles-Details-Scrapping

This script scrapes details about Medium articles published within a date range in a given publication. The dates are chosen at random; if no article was published on a chosen date, that date is skipped. The result is a DataFrame that can be saved in any format (currently it is saved as CSV). Here is a preview of the terminal:

![](terminal-preview.PNG)

# Requirements

- numpy
- pandas
- bs4
- requests

# How to run?

- Run the command: python run.py

# About the Scrap class

A scraper that collects details about Medium articles published in a date range in a publication by selecting random dates.

Attributes
----------
urls_dict : dict
    Key-value pairs of publication names and links. Example:
    urls_dict={"The Startup": "https://medium.com/swlh"}

start_date : str
    Starting date of the search. Default: 2020-01-01

end_date : str
    Ending date of the search. Default: 2020-08-01

year : int
    Year in which the search is done. Default: 2020

number : int
    Number of random dates to pick. Default: 10

Methods
-------
scrap():
    Initiates the scraping process.

dataframe():
    Returns the results as a pandas DataFrame.
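For readers who want to use the class directly rather than through run.py, here is a minimal usage sketch built from the attributes and methods listed above (the publication dict mirrors the example; the output file name is arbitrary):

```python
from app import Scrap

# Scrape 5 random dates in 2020 from The Startup.
s = Scrap(urls_dict={"The Startup": "https://medium.com/swlh"},
          start_date='2020-01-01', end_date='2020-08-01',
          number=5, year=2020)
s.scrap()                 # fetch and parse the archive pages
df = s.dataframe()        # gather the results as a pandas DataFrame
df.to_csv('results.csv')  # save in any pandas-supported format
```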
@@ -0,0 +1,79 @@
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import itertools
import time


class Scrap:

    def __init__(self, urls_dict, start_date='2020-01-01', end_date='2020-08-01', number=10, year=2020):
        self.urls = urls_dict
        self.start = pd.to_datetime(start_date)
        self.end = pd.to_datetime(end_date)
        self.n = number
        self.year = year
        self.titles = []
        self.sub_titles = []
        self.article_link = []
        self.claps = []
        self.reading_time = []
        self.responses = []
        self.pubs = []
        self.dates_list = []

    def randDates(self):
        # Sample self.n random Unix timestamps (in seconds) between the two
        # bounds, then reinterpret them as datetime64[ns] values.
        start_u = self.start.value // 10**9
        end_u = self.end.value // 10**9
        return pd.DatetimeIndex((10**9 * np.random.randint(start_u, end_u, self.n, dtype=np.int64)).view('M8[ns]')).date

    def scrap(self):
        dates = pd.to_datetime(pd.Series(self.randDates()))
        for i in range(len(dates)):
            month = dates.dt.month[i]
            day = dates.dt.day[i]
            for publication, url in self.urls.items():
                archive_url = url + '/archive/{0}/{1:02d}/{2:02d}'
                print(f'Publication: {publication}, Date: {self.year}-{month}-{day}')
                response = requests.get(archive_url.format(self.year, month, day), allow_redirects=True)
                # Medium redirects when no archive page exists for that exact
                # day, so a changed URL means the date can be skipped.
                if not response.url.startswith(archive_url.format(self.year, month, day)):
                    continue
                soup = BeautifulSoup(response.content, 'html.parser')
                articles = soup.find_all("div", class_="postArticle postArticle--short js-postArticle js-trackPostPresentation js-trackPostScrolls")
                number = len(articles)

                self.titles.append([i.find('h3', class_="graf--title").text if i.find('h3', class_="graf--title") is not None else '' for i in articles])

                self.sub_titles.append([i.find("h4", class_="graf--subtitle").text if i.find("h4", class_="graf--subtitle") is not None else '' for i in articles])

                self.article_link.append([i.find_all('a')[3]['href'].split('?')[0] for i in articles])

                # Claps appear either as a plain count ("734") or with a K
                # suffix ("1.2K"), which is converted to thousands.
                self.claps.append([0 if not k else int(float(k.split('K')[0]) * 1000) if 'K' in k else int(float(k)) for k in [j.text for j in [i.find_all('button')[1] for i in articles]]])

                self.reading_time.append([int(i.find("span", class_="readingTime")['title'].split()[0]) if i.find("span", class_="readingTime") is not None else 0 for i in articles])

                self.responses.append([i.find_all('a')[6].text.split(' ')[0] if len(i.find_all('a')) == 7 else 0 for i in articles])

                self.pubs.append([publication] * number)

                # Repeat the date once per article so every column has the
                # same length when the lists are flattened in dataframe().
                self.dates_list.append([f'{self.year}-{month:02d}-{day:02d}'] * number)

                time.sleep(0.3)

    def dataframe(self):
        columns = ['Title', 'SubTitle', 'Link', 'Claps', 'Reading_Time', 'Responses', 'Publication', 'Date_Published']
        titles = list(itertools.chain.from_iterable(self.titles))
        sub_titles = list(itertools.chain.from_iterable(self.sub_titles))
        article_link = list(itertools.chain.from_iterable(self.article_link))
        claps = list(itertools.chain.from_iterable(self.claps))
        reading_time = list(itertools.chain.from_iterable(self.reading_time))
        responses = list(itertools.chain.from_iterable(self.responses))
        pubs = list(itertools.chain.from_iterable(self.pubs))
        dates = list(itertools.chain.from_iterable(self.dates_list))
        return pd.DataFrame(zip(titles, sub_titles, article_link, claps, reading_time, responses, pubs, dates), columns=columns)
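The date-sampling trick in randDates is worth seeing in isolation: it draws uniform integers in the Unix-timestamp range of the two bounds and reinterprets them as datetimes. A standalone sketch (bounds chosen arbitrarily):

```python
import numpy as np
import pandas as pd

# Convert the bounds to Unix seconds, sample uniformly between them,
# scale back to nanoseconds, and view the integers as datetime64[ns].
start_u = pd.to_datetime('2020-01-01').value // 10**9
end_u = pd.to_datetime('2020-08-01').value // 10**9
stamps = 10**9 * np.random.randint(start_u, end_u, 3, dtype=np.int64)
print(pd.DatetimeIndex(stamps.view('M8[ns]')).date)
# e.g. [datetime.date(2020, 3, 14) datetime.date(2020, 6, 2) ...]
```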
@@ -0,0 +1,31 @@
1+
from app import Scrap
2+
3+
print('-----------------')
4+
pub_name = input('Enter the comma seperated list of publication names(The Startup, Medium ...): ').split(',')
5+
pub_link = input('Enter the comma seperated links of publications (https://medium.com/swlh, https://towardsdatascience.com ...): ').split(',')
6+
7+
if len(pub_name) != len(pub_link):
8+
print('Please Enter links of all publications!')
9+
10+
pub_dict = {i: j for i, j in zip(pub_name, pub_link)}
11+
12+
choice = input("The default information passed is:\nNumber=5\nstart_date='2019-01-01'\nend_date='2019-08-01'\nyear=2019\n\nDo you want to change it? (Y/N): ")
13+
14+
if choice == 'Y':
15+
s_date = input("Enter new start date in format (YYYY-MM-DD): ")
16+
e_date = input("Enter new end date in format (YYYY-MM-DD): ")
17+
new_year = int(input("Enter year: "))
18+
num = int(input("Enter number of random samples: "))
19+
else:
20+
s_date = '2019-01-01'
21+
e_date = '2019-08-01'
22+
new_year = 2020
23+
num = 5
24+
25+
print('Process started ...')
26+
a = Scrap(urls_dict=pub_dict, number=num, start_date=s_date, end_date=e_date, year=new_year)
27+
a.scrap()
28+
a.dataframe().to_csv('results.csv')
29+
print(a.dataframe())
30+
print('-----------------')
31+
print('Process ended... Thanks for using!')
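Since dataframe() returns a plain pandas DataFrame, the results can be re-saved in formats other than CSV, as the README notes. A short sketch (file names are arbitrary; Parquet output needs pyarrow or fastparquet installed):

```python
import pandas as pd

df = pd.read_csv('results.csv', index_col=0)  # reload the scraped results
df.to_json('results.json', orient='records')  # one JSON object per article
df.to_parquet('results.parquet')              # columnar format for analysis
```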