medium articles details scrapper

kaustubhgupta · kaustubhgupta · commit bdbba90902fc · 2020-08-16T20:47:26.000+05:30
diff --git a/Web-Scraping/Medium-Articles-Details-Scrapping/.gitignore b/Web-Scraping/Medium-Articles-Details-Scrapping/.gitignore
@@ -0,0 +1,3 @@
+# Unnecessary Files
+
+app/__pycache__/
diff --git a/Web-Scraping/Medium-Articles-Details-Scrapping/README.md b/Web-Scraping/Medium-Articles-Details-Scrapping/README.md
@@ -0,0 +1,13 @@
+# Medium-Articles-Details-Scrapping
+This script scrapes details about Medium articles published by the given publications within a date range. The dates are chosen at random from that range; if a publication has no articles on a chosen date, that date is skipped. The results are returned as a pandas DataFrame, which can be saved in any format; `run.py` currently saves it as CSV.
+
+# Requirements
+- numpy
+- pandas
+- bs4
+- requests
+
+# How to run?
+- Open `run.py` and set the dictionary of publication URLs, the date range, the year, and the number of random dates.
+- Save the file.
+- Run the command: python run.py
diff --git a/Web-Scraping/Medium-Articles-Details-Scrapping/app/__init__.py b/Web-Scraping/Medium-Articles-Details-Scrapping/app/__init__.py
@@ -0,0 +1,108 @@
+import requests
+from bs4 import BeautifulSoup
+import pandas as pd
+import random
+import numpy as np
+import itertools
+import time
+
+class Scrap:
+    """
+    Scraper that collects details about Medium articles published by one or
+    more publications on randomly selected dates.
+
+    Random dates are drawn between ``start_date`` and ``end_date``, but only
+    their month and day are used: the archive page that is requested is always
+    ``<publication url>/archive/<year>/<month>/<day>`` with ``year`` taken from
+    the ``year`` argument. Keep ``year`` consistent with the date range.
+
+    Attributes
+    ----------
+    urls_dict : dict
+        key-value pairs of the publication name with link. Example:
+        urls_dict={"The Startup":"https://medium.com/swlh"}
+
+    start_date : str
+        starting date of the search (inclusive). Default: 2020-01-01
+
+    end_date : str
+        ending date of the search (exclusive). Default: 2020-08-01
+
+    year : int
+        year used in the archive URL and in the stored date. Default: 2020
+
+    number: int
+        number of random dates you want to pick. Default: 10
+
+    Methods
+    -------
+    randDates():
+        Returns the randomly drawn dates.
+
+    scrap():
+        Scrapping process will be initiated by this method.
+
+    dataframe():
+        Returns the dataframe object.
+
+    """
+
+    # CSS class string that identifies one article card on an archive page.
+    ARTICLE_CLASS = (
+        "postArticle postArticle--short js-postArticle "
+        "js-trackPostPresentation js-trackPostScrolls"
+    )
+    # Seconds to wait for a response before giving up on an archive page.
+    REQUEST_TIMEOUT = 30
+    # Pause in seconds after each processed page.
+    REQUEST_DELAY = 0.3
+
+    def __init__(self, urls_dict, start_date='2020-01-01', end_date='2020-08-01', number=10, year=2020):
+        self.urls = urls_dict
+        self.start = pd.to_datetime(start_date)
+        self.end = pd.to_datetime(end_date)
+        self.n = number
+        self.year = year
+        # Each attribute below holds one inner list per scraped archive page;
+        # inner lists of the same page all have one entry per article.
+        self.titles = []
+        self.sub_titles = []
+        self.article_link = []
+        self.claps = []
+        self.reading_time = []
+        self.responses = []
+        self.pubs = []
+        self.dates_list = []
+
+    def randDates(self):
+        """
+        Draw ``self.n`` random dates in the half-open range [start, end).
+
+        Returns
+        -------
+        numpy.ndarray
+            Array of ``datetime.date`` objects (duplicates are possible).
+
+        Raises
+        ------
+        ValueError
+            If the start date is not strictly before the end date.
+        """
+        if self.start >= self.end:
+            raise ValueError('start_date must be earlier than end_date')
+
+        # Timestamp.value is in nanoseconds; work in whole seconds.
+        start_u = self.start.value // 10**9
+        end_u = self.end.value // 10**9
+        seconds = np.random.randint(start_u, end_u, self.n, dtype=np.int64)
+
+        return pd.to_datetime(seconds, unit='s').date
+
+    @staticmethod
+    def _parse_count(text):
+        """Convert text such as '53', '1.2K' or '5 min read' to an int; 0 if unparsable."""
+        tokens = text.split() if text else []
+        if not tokens:
+            return 0
+
+        token = tokens[0].replace(',', '')
+        multiplier = 1
+        if token[-1:] in ('K', 'k'):
+            token = token[:-1]
+            multiplier = 1000
+
+        try:
+            # round() avoids losing a unit to float error (e.g. 2299.9999 -> 2300).
+            return int(round(float(token) * multiplier))
+        except (ValueError, OverflowError):
+            return 0
+
+    def _store_page(self, articles, publication, date_label):
+        """Extract the details of every article card of one page and append them as aligned lists."""
+        titles, sub_titles, links = [], [], []
+        claps, reading_times, responses = [], [], []
+
+        for article in articles:
+            title = article.find('h3', class_="graf--title")
+            sub_title = article.find("h4", class_="graf--subtitle")
+            reading = article.find("span", class_="readingTime")
+            anchors = article.find_all('a')
+            buttons = article.find_all('button')
+
+            titles.append(title.text if title is not None else '')
+            sub_titles.append(sub_title.text if sub_title is not None else '')
+            # The fourth anchor of a card is the article link; drop the query string.
+            links.append(anchors[3].get('href', '').split('?')[0] if len(anchors) > 3 else '')
+            # The second button of a card carries the clap count.
+            claps.append(self._parse_count(buttons[1].text) if len(buttons) > 1 else 0)
+            reading_times.append(self._parse_count(reading.get('title', '')) if reading is not None else 0)
+            # A seventh anchor is only present when the article has responses.
+            responses.append(self._parse_count(anchors[6].text) if len(anchors) == 7 else 0)
+
+        count = len(titles)
+        self.titles.append(titles)
+        self.sub_titles.append(sub_titles)
+        self.article_link.append(links)
+        self.claps.append(claps)
+        self.reading_time.append(reading_times)
+        self.responses.append(responses)
+        self.pubs.append([publication] * count)
+        # One date per article, so that every column stays the same length.
+        self.dates_list.append([date_label] * count)
+
+    def scrap(self):
+        """
+        Fetch the archive page of every publication for every random date and
+        store the details of the articles found.
+
+        Pages that cannot be fetched, or that redirect away from the requested
+        archive URL (no archive for that day), are skipped.
+        """
+        for date in self.randDates():
+            month, day = date.month, date.day
+            date_label = f'{self.year}-{month}-{day}'
+            for publication, url in self.urls.items():
+                archive_url = f'{url}/archive/{self.year}/{month:02d}/{day:02d}'
+                print(f'Publication: {publication}, Date: {date_label}')
+                try:
+                    response = requests.get(archive_url, allow_redirects=True, timeout=self.REQUEST_TIMEOUT)
+                except requests.RequestException as error:
+                    print(f'Request failed for {archive_url}: {error}')
+                    continue
+                # A redirect to another URL means there is no archive for that day.
+                if not response.url.startswith(archive_url):
+                    continue
+
+                soup = BeautifulSoup(response.content, 'html.parser')
+                articles = soup.find_all("div", class_=self.ARTICLE_CLASS)
+                self._store_page(articles, publication, date_label)
+
+                time.sleep(self.REQUEST_DELAY)
+
+    def dataframe(self):
+        """
+        Build a DataFrame with one row per scraped article.
+
+        Returns
+        -------
+        pandas.DataFrame
+            Columns: Title, SubTitle, Link, Claps, Reading_Time, Responses,
+            Publication, Date_Published.
+        """
+        per_page_columns = {
+            'Title': self.titles,
+            'SubTitle': self.sub_titles,
+            'Link': self.article_link,
+            'Claps': self.claps,
+            'Reading_Time': self.reading_time,
+            'Responses': self.responses,
+            'Publication': self.pubs,
+            'Date_Published': self.dates_list,
+        }
+        # Flatten the per-page lists into one flat list per column.
+        flat_columns = {
+            name: list(itertools.chain.from_iterable(pages))
+            for name, pages in per_page_columns.items()
+        }
+
+        return pd.DataFrame(flat_columns, columns=list(per_page_columns))
diff --git a/Web-Scraping/Medium-Articles-Details-Scrapping/run.py b/Web-Scraping/Medium-Articles-Details-Scrapping/run.py
@@ -0,0 +1,10 @@
+from app import Scrap
+
+
+def main():
+    """Scrape the configured publications and save the results to results.csv."""
+    scraper = Scrap(urls_dict={"Towards Data Science": "https://towardsdatascience.com",
+                               "The Startup": "https://medium.com/swlh",
+                               }, number=50,
+                    start_date='2019-01-01', end_date='2019-08-01', year=2019)
+    scraper.scrap()
+    # Build the DataFrame once and reuse it for the CSV file and the console output.
+    results = scraper.dataframe()
+    results.to_csv('results.csv')
+    print(results)
+
+
+# Only scrape when executed as a script, not when the module is imported.
+if __name__ == '__main__':
+    main()

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+# Unnecessary Files`
	`2`	`+`
	`3`	`+app/__pycache__/`