From 2a35441c26a41a9859373045812cece8f3222743 Mon Sep 17 00:00:00 2001 From: GudlaArunKumar Date: Sun, 16 Aug 2020 11:29:41 +0530 Subject: [PATCH 1/2] Added Scraping Hacker news website script --- Web-Scraping/ScrappingHackerNewsWebsite.py | 54 ++++++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 Web-Scraping/ScrappingHackerNewsWebsite.py diff --git a/Web-Scraping/ScrappingHackerNewsWebsite.py b/Web-Scraping/ScrappingHackerNewsWebsite.py new file mode 100644 index 00000000..feaa8231 --- /dev/null +++ b/Web-Scraping/ScrappingHackerNewsWebsite.py @@ -0,0 +1,54 @@ +''' +Scraping the first 2 pages of the Hacker News website, which offers a lot of tech news (as articles), +and keeping the stories that have more than 100 upvotes. The user can just click on a story link to see the article. +''' + +''' +The program uses the requests module to get web data from the URL and the BeautifulSoup module to parse the web data +as HTML using the html parser. +Install the requests and BeautifulSoup modules before executing! +''' + +import requests +from bs4 import BeautifulSoup +import pprint # built-in Python module that prints the final output in a pretty manner + + +response1 = requests.get("https://news.ycombinator.com/news") #Storing response of first page of website +response2 = requests.get("https://news.ycombinator.com/news?p=2") # Storing response of Second page of website + +response1_html_parser = BeautifulSoup(response1.text,'html.parser') #parsing the received web data by html parser +response2_html_parser = BeautifulSoup(response2.text,'html.parser') + +linksInPage1 = response1_html_parser.select('.storylink') #All links of tech news are included in class "storylink" +linksInPage2 = response2_html_parser.select('.storylink') + +votesInPage1 = response1_html_parser.select('.subtext') #All votes are stored inside subclass "score" of class "subtext" +votesInPage2 = response2_html_parser.select('.subtext') + + +mega_link = linksInPage1 + linksInPage2 # Combining links of both pages +#print(mega_link) 
+mega_votes = votesInPage1 + votesInPage2 + +def sorted_stories_list(hackerNewsList): + """Sorting the list in decreasing order + with respect to votes""" + return sorted(hackerNewsList,key=lambda x:x['votes'],reverse=True) + +def create_custom_hackernews(mega_link,mega_votes): + hackerNews =[] + for index,item in enumerate(mega_link): + title = mega_link[index].getText() #To get title of the story(news) + href = mega_link[index].get('href',None) # To get link of story(news). If no link is present, default is None + vote = mega_votes[index].select('.score') # points are stored inside class "score" of class subtext; if points/votes are not available, then class score won't be present. + if len(vote): #To check if class "score" exists or not + points = int(vote[0].getText().replace(' points', '')) + if points > 100: # To get votes/points more than 100 + hackerNews.append({'title': title, 'link': href,'votes':points}) + + return sorted_stories_list(hackerNews) + +if __name__ == '__main__': + # Prints story link, story title and its votes in a pretty manner + pprint.pprint(create_custom_hackernews(mega_link,mega_votes)) \ No newline at end of file From eb92b6b1be9455fd68cb1702fb07bc7db3deccbf Mon Sep 17 00:00:00 2001 From: GudlaArunKumar Date: Sun, 16 Aug 2020 17:02:32 +0530 Subject: [PATCH 2/2] Added Readme as per PR review --- Web-Scraping/ScrappingHackerNewsWebsite/README.md | 6 ++++++ .../ScrappingHackerNewsWebsite.py | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) create mode 100644 Web-Scraping/ScrappingHackerNewsWebsite/README.md rename Web-Scraping/{ => ScrappingHackerNewsWebsite}/ScrappingHackerNewsWebsite.py (99%) diff --git a/Web-Scraping/ScrappingHackerNewsWebsite/README.md b/Web-Scraping/ScrappingHackerNewsWebsite/README.md new file mode 100644 index 00000000..9d2ced2c --- /dev/null +++ b/Web-Scraping/ScrappingHackerNewsWebsite/README.md @@ -0,0 +1,6 @@ +# Scraping Hacker News Website + +Scraping the first 2 pages of the Hacker News website, wherein the user can 
read tech news (as articles) that has more than 100 upvotes, with the help of the Requests and +Beautiful Soup modules. The user can just click on a story link to see the article. + +Link to the Hacker News website - https://news.ycombinator.com/ \ No newline at end of file diff --git a/Web-Scraping/ScrappingHackerNewsWebsite.py b/Web-Scraping/ScrappingHackerNewsWebsite/ScrappingHackerNewsWebsite.py similarity index 99% rename from Web-Scraping/ScrappingHackerNewsWebsite.py rename to Web-Scraping/ScrappingHackerNewsWebsite/ScrappingHackerNewsWebsite.py index feaa8231..bb66926c 100644 --- a/Web-Scraping/ScrappingHackerNewsWebsite.py +++ b/Web-Scraping/ScrappingHackerNewsWebsite/ScrappingHackerNewsWebsite.py @@ -45,7 +45,7 @@ def create_custom_hackernews(mega_link,mega_votes): if len(vote): #To check if class "score" exists or not points = int(vote[0].getText().replace(' points', '')) if points > 100: # To get votes/points more than 100 - hackerNews.append({'title': title, 'link': href,'votes':points}) + hackerNews.append({'title': title, 'link': href,'votes': points}) return sorted_stories_list(hackerNews)