Skip to content
This repository was archived by the owner on Nov 30, 2022. It is now read-only.

Commit dd64bb4

Browse files
committed
All necessary changes made
1 parent bdbba90 commit dd64bb4

File tree

4 files changed

+58
-35
lines changed

4 files changed

+58
-35
lines changed
Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
11
# Unnecessary Files
22

3-
app/__pycahce__/
3+
app/__pycahce__/
4+
.idea

Web-Scraping/Medium-Articles-Details-Scrapping/README.md

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,3 +11,33 @@ This script will scrap details about medium articles published in a date range i
1111
- Open the run.py to add the dictionary of urls, date range and number of random dates.
1212
- Save the file.
1313
- Run the command: python run.py
14+
15+
# About the Scrap class
16+
A Scrapper to get details about medium articles published in a date range in a Publication by selecting random dates.
17+
18+
Attributes
19+
----------
20+
urls_dict : dict
21+
key-value pairs of the publication name with link. Example:
22+
urls_dict={"The Startup":"https://medium.com/swlh"}
23+
24+
start_date : str
25+
starting date of the search. Default: 2020-01-01
26+
27+
end_date : str
28+
ending date of the search. Default: 2020-08-01
29+
30+
year : int
31+
year in which search has to be done. Default: 2020
32+
33+
number: int
34+
number of random dates you want to pick. Default: 10
35+
36+
Methods
37+
-------
38+
scrap():
39+
Scrapping process will be initiated by this method.
40+
41+
dataframe():
42+
Returns the dataframe object.
43+

Web-Scraping/Medium-Articles-Details-Scrapping/app/__init__.py

Lines changed: 1 addition & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -6,37 +6,8 @@
66
import itertools
77
import time
88

9-
class Scrap:
10-
"""
11-
A Scrapper to get details about medium articles published in a date range in a Publication by selecting random dates.
12-
13-
Attributes
14-
----------
15-
urls_dict : dict
16-
key-value pairs of the publication name with link. Example:
17-
urls_dict={"The Startup":"https://medium.com/swlh"}
18-
19-
start_date : str
20-
starting date of the search. Default: 2020-01-01
21-
22-
end_date : str
23-
ending date of the search. Default: 2020-08-01
24-
25-
year : int
26-
year in which search has to be done. Default: 2020
279

28-
number: int
29-
number of random dates you want to pick. Default: 10
30-
31-
Methods
32-
-------
33-
scrap():
34-
Scrapping process will be initiated by this method.
35-
36-
dataframe():
37-
Returns the dataframe object.
38-
39-
"""
10+
class Scrap:
4011

4112
def __init__(self, urls_dict, start_date='2020-01-01', end_date='2020-08-01', number=10, year=2020):
4213
self.urls = urls_dict
Lines changed: 25 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,31 @@
11
from app import Scrap
22

3+
print('-----------------')
4+
pub_name = input('Enter the comma separated list of publication names(The Startup, Medium ...): ').split(',')
5+
pub_link = input('Enter the comma separated links of publications (https://medium.com/swlh, https://towardsdatascience.com ...): ').split(',')
36

4-
a = Scrap(urls_dict={"Towards Data Science": "https://towardsdatascience.com",
5-
"The Startup":"https://medium.com/swlh",
6-
}, number=50,
7-
start_date='2019-01-01', end_date='2019-08-01',year=2019)
7+
if len(pub_name) != len(pub_link):
8+
print('Please Enter links of all publications!')
9+
10+
pub_dict = {i: j for i, j in zip(pub_name, pub_link)}
11+
12+
choice = input("The default information passed is:\nNumber=5\nstart_date='2019-01-01'\nend_date='2019-08-01'\nyear=2019\n\nDo you want to change it? (Y/N): ")
13+
14+
if choice == 'Y':
15+
s_date = input("Enter new start date in format (YYYY-MM-DD): ")
16+
e_date = input("Enter new end date in format (YYYY-MM-DD): ")
17+
new_year = int(input("Enter year: "))
18+
num = int(input("Enter number of random samples: "))
19+
else:
20+
s_date = '2019-01-01'
21+
e_date = '2019-08-01'
22+
new_year = 2020
23+
num = 5
24+
25+
print('Process started ...')
26+
a = Scrap(urls_dict=pub_dict, number=num, start_date=s_date, end_date=e_date, year=new_year)
827
a.scrap()
928
a.dataframe().to_csv('results.csv')
1029
print(a.dataframe())
30+
print('-----------------')
31+
print('Process ended... Thanks for using!')

0 commit comments

Comments
 (0)