Skip to content
This repository was archived by the owner on Nov 30, 2022. It is now read-only.

Commit d56a8c1

Browse files
authored
Merge pull request #105 from GudlaArunKumar/ScrappingHackerNews
Added Scraping Hacker news website script
2 parents ab55e64 + eb92b6b commit d56a8c1

File tree

2 files changed

+60
-0
lines changed

2 files changed

+60
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
# Scraping the Hacker News Website
2+
3+
Scrapes the first 2 pages of the Hacker News website so the user can read tech news articles that have more than 100 upvotes, with the help of the Requests and
Beautiful Soup modules. The user can simply click a story link to read the article.
5+
6+
Link to the Hacker News website: https://news.ycombinator.com/
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
'''
Scrapes the first 2 pages of the Hacker News website, which lists lots of
tech news articles, keeping only stories with more than 100 upvotes.
The user can just click a story link to read the article.

Uses the requests module to fetch the pages over HTTP and BeautifulSoup
to parse the returned HTML with the built-in html parser.
Install the requests and beautifulsoup4 packages before executing!
'''

import requests
from bs4 import BeautifulSoup
import pprint  # prints the final output in a pretty manner; built into Python


# Fetch the first and second pages of Hacker News.  A timeout keeps the
# script from hanging forever if the site is slow or unreachable
# (requests.get blocks indefinitely without one).
response1 = requests.get("https://news.ycombinator.com/news", timeout=10)  # first page
response2 = requests.get("https://news.ycombinator.com/news?p=2", timeout=10)  # second page

# Parse the received web data of each page with the html parser.
response1_html_parser = BeautifulSoup(response1.text, 'html.parser')
response2_html_parser = BeautifulSoup(response2.text, 'html.parser')

# Each tech-news story link carries the "storylink" CSS class.
# NOTE(review): Hacker News markup changes over time; if these selectors
# return empty lists, verify the class names against the live site.
linksInPage1 = response1_html_parser.select('.storylink')
linksInPage2 = response2_html_parser.select('.storylink')

# Vote counts live in a nested "score" element inside each "subtext" row.
votesInPage1 = response1_html_parser.select('.subtext')
votesInPage2 = response2_html_parser.select('.subtext')

# Combine the links and votes of both pages into single lists.
mega_link = linksInPage1 + linksInPage2
mega_votes = votesInPage1 + votesInPage2
33+
34+
def sorted_stories_list(hackerNewsList):
    """Return the given stories reordered so the highest-voted come first."""
    def vote_count(story):
        # Sort key: the number of votes on a single story dict.
        return story['votes']
    return sorted(hackerNewsList, key=vote_count, reverse=True)
38+
39+
def create_custom_hackernews(mega_link, mega_votes):
    """Build the list of stories with more than 100 points, highest first.

    mega_link  -- tags for the story links, one per story; each supports
                  .getText() (the title) and .get('href') (the URL).
    mega_votes -- the matching '.subtext' tags; the score, when present,
                  is in a nested '.score' element.

    Returns a list of {'title', 'link', 'votes'} dicts sorted in
    decreasing order of votes.
    """
    hacker_news = []
    # zip() pairs each link with its subtext row and guards against a
    # length mismatch (the original indexed mega_votes by position and
    # could raise IndexError if the lists differed in length).
    for link_tag, subtext_tag in zip(mega_link, mega_votes):
        title = link_tag.getText()          # title of the story (news)
        href = link_tag.get('href', None)   # story URL; None when absent
        # Points are inside class "score"; the element is missing entirely
        # for rows without votes (e.g. job postings), so check first.
        score_tags = subtext_tag.select('.score')
        if score_tags:
            # Score text is "N points" -- or "1 point" in the singular,
            # which the old replace(' points', '') crashed on with
            # ValueError.  Taking the leading number handles both.
            points = int(score_tags[0].getText().split()[0])
            if points > 100:  # keep only stories with more than 100 votes
                hacker_news.append({'title': title, 'link': href, 'votes': points})
    # Highest-voted stories first (same ordering as sorted_stories_list).
    return sorted(hacker_news, key=lambda story: story['votes'], reverse=True)
51+
52+
if __name__ == '__main__':
    # Entry point: build the filtered story list and pretty-print each
    # story's link, title and vote count.
    stories = create_custom_hackernews(mega_link, mega_votes)
    pprint.pprint(stories)

0 commit comments

Comments
 (0)