Skip to content
This repository was archived by the owner on Nov 30, 2022. It is now read-only.

Commit 65967e8

Browse files
author
namrun
committed
Updated script
1 parent 63360f9 commit 65967e8

File tree

1 file changed

+26
-16
lines changed

1 file changed

+26
-16
lines changed

Web-Scraping/medium-article-downloader.py

+26-16
Original file line numberDiff line numberDiff line change
@@ -5,32 +5,42 @@
55
import requests
66
from bs4 import BeautifulSoup
77

8-
#The content is written into a text file
98

10-
file = open("Medium_article_content.txt", "w")
9+
def article_download():
    """Download the text of a Medium article into a local file.

    Prompts the user for a Medium article URL, fetches the page with a
    browser-like User-Agent header, parses the HTML with BeautifulSoup,
    and writes every paragraph found under the <article> tag to
    ``Medium_article_content.txt``.
    """
    #The URL of the article is entered here
    page_url = input("Enter the URL of the Medium Article ")

    #A browser-like User-Agent so the site serves the full page instead of
    #blocking the default requests user agent
    headers = {"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:79.0) Gecko/20100101 Firefox/79.0'}

    #BUG FIX: the second positional argument of requests.get() is `params`,
    #not `headers` — passing the dict positionally sent the User-Agent as
    #query-string parameters and never set the header. Use the keyword.
    response = requests.get(page_url, headers=headers)

    #Beautiful soup is a library used for web scraping and parsing the contents of a web page
    #Here a html parser is used to parse through the content embedded in the html tags
    soup = BeautifulSoup(response.text, "html.parser")

    #Use a context manager so the file is closed even if parsing raises
    #(e.g. when soup.find('article') returns None for a non-article page).
    with open("Medium_article_content.txt", "w") as file:
        #The content of the article is stored in the <article> tag
        for line in soup.find('article').find('div'):
            #All the content is essentially stored between <p> tags
            for content in line.find_all('p'):
                #contents are written into a file
                file.write(content.text + '\n')

    print("Content downloaded")


if __name__ == "__main__":
    article_download()

0 commit comments

Comments
 (0)