diff --git a/Web-Scraping/Times_of_india/README.md b/Web-Scraping/Times_of_india/README.md
new file mode 100644
index 00000000..7953491f
--- /dev/null
+++ b/Web-Scraping/Times_of_india/README.md
@@ -0,0 +1,8 @@
+## Scraping Times of India
+
+Scraping Times of India top headlines in four domains: Flash news, News in Bulletin, Entertainment, Latest news,
+using the Requests and Beautiful Soup modules.
+
+Link for Website - "http://timesofindia.indiatimes.com/"
+
+![output](TOI.png)
diff --git a/Web-Scraping/Times_of_india/TOI.png b/Web-Scraping/Times_of_india/TOI.png
new file mode 100644
index 00000000..6ae22644
Binary files /dev/null and b/Web-Scraping/Times_of_india/TOI.png differ
diff --git a/Web-Scraping/Times_of_india/Times_of_india.py b/Web-Scraping/Times_of_india/Times_of_india.py
new file mode 100644
index 00000000..e8a404ce
--- /dev/null
+++ b/Web-Scraping/Times_of_india/Times_of_india.py
@@ -0,0 +1,57 @@
+"""Print the top Times of India headlines from four home-page sections."""
+import datetime
+
+import requests
+from bs4 import BeautifulSoup
+
+url = "http://timesofindia.indiatimes.com/"
+
+# requests waits forever by default, so bound the wait (in seconds).
+REQUEST_TIMEOUT = 10
+
+# (banner, attributes of the container <div>, tag holding each headline)
+SECTIONS = (
+    ("\n\t\t**** Flash news ****", {'id': 'featuredstory'}, 'a'),
+    ("\n\t\t**** News in Bulletin ****", {'class': 'top-story'}, 'li'),
+    ("\n\t\t**** Entertainment ****\t", {'class': 'entrmnt-wdgt-outer'}, 'li'),
+    ("\n\t\t**** Latest News ****\t\n", {'id': 'lateststories'}, 'li'),
+)
+
+
+def fetch_soup(page_url):
+    """Download page_url and return its HTML as a BeautifulSoup tree.
+
+    Raises requests.RequestException on a connection failure, a timeout or
+    an HTTP error status, so an error page is never scraped as news.
+    """
+    response = requests.get(page_url, timeout=REQUEST_TIMEOUT)
+    response.raise_for_status()
+    return BeautifulSoup(response.content, 'html.parser')
+
+
+def print_section(soup, banner, div_attrs, item_tag):
+    """Print banner, then the text of every item_tag inside matching divs."""
+    print(banner)
+    for div in soup.find_all('div', attrs=div_attrs):
+        for item in div.find_all(item_tag):
+            print(item.text)
+
+
+def main():
+    """Fetch the home page and print the date followed by each section."""
+    try:
+        soup = fetch_soup(url)
+    except requests.RequestException as err:
+        # Exit with a short message instead of a traceback.
+        raise SystemExit("Could not fetch {}: {}".format(url, err))
+
+    print("\t!!!** The Times of India **!!!")
+    today = datetime.date.today()
+    print(today.strftime('\tThe date %d, %b %Y'))
+
+    for banner, div_attrs, item_tag in SECTIONS:
+        print_section(soup, banner, div_attrs, item_tag)
+
+
+if __name__ == "__main__":
+    main()