scraper.py
import os
import time
from datetime import datetime

import pandas as pd
from gnews import GNews
from newspaper import Article
from tqdm import tqdm
from duckduckgo_search import ddg_news
from chatgpt_wrapper import ChatGPT
# GNews topic constants this script processes.
TOPICS = ["WORLD", "BUSINESS", "TECHNOLOGY", "ENTERTAINMENT", "SPORTS", "SCIENCE", "HEALTH"]
def df_search_articles_bytopic(topic: str, exclude_websites: list, language='en', period='7d', max_results=10) -> pd.DataFrame:
    # Create a GNews search object and perform the search. Fetch more results than
    # requested so that spares are already loaded if some articles cannot be accessed.
    gnews_obj = GNews(language=language, period=period, max_results=25, exclude_websites=exclude_websites)
    world_news = gnews_obj.get_news_by_topic(topic)
    # Create an empty DataFrame to collect the scraped articles.
    articles = pd.DataFrame(columns=["url", "publisher", "description", "pub_date", "title", "text", "authors", "top_image", "movies"])
    for new_obj in tqdm(world_news[:max_results], total=max_results):
        article = Article(new_obj["url"])
        failed = True
        cnt = 0
        while failed:
            cnt += 1  # look one entry deeper into the spare results on each retry
            try:
                article.download()
                article.parse()
                if len(article.text) == 0:
                    print("can't get text:", article.url)
                    failed = True
                else:
                    data = {"url": new_obj["url"], "publisher": new_obj["publisher"]["href"],
                            "description": new_obj["description"], "pub_date": new_obj["published date"],
                            "title": new_obj["title"], "text": article.text, "authors": article.authors,
                            "top_image": article.top_image, "movies": article.movies}
                    articles.loc[len(articles)] = data
                    failed = False
            except Exception:
                print("failed downloading:", article.url)
                failed = True
            # If this source failed, fall back to a spare result from the end of the list.
            if failed:
                print("looking for another one...")
                if cnt > len(world_news):
                    print("no spares left, skipping this result")
                    break
                new_obj = world_news.pop(-cnt)  # consume the spare so another failing source cannot reuse it
                article = Article(new_obj["url"])
    return articles
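# Example usage (a minimal sketch; the topic and exclusion list mirror the call at the
# bottom of this script, and the CSV path is just illustrative):
#   roots = df_search_articles_bytopic(topic="TECHNOLOGY",
#                                      exclude_websites=["www.youtube.com"],
#                                      period='7d', max_results=5)
#   roots.to_csv("technology_roots.csv", index=False)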
def df_expand_search_articles(root_article: pd.Series, max_results=10, polite_wait=5) -> pd.DataFrame:
    # Search with DuckDuckGo; to get better results, ChatGPT (the module-level `bot`
    # created below) summarizes the root article's description into a search prompt.
    bot.new_conversation()  # restart with a clean chat
    prompt = "Write a search engine text input looking for more articles from multiple sources talking about: " + root_article["description"]
    search_title = bot.consistent_ask(prompt)
    print(search_title)
    # Search articles with DuckDuckGo.
    # Note: ChatGPT wraps its output in double quotes, and passing the quoted text to
    # the duckduckgo library messes with it and returns nothing, so strip the quotes.
    search_title = search_title.replace('"', '')
    result = ddg_news(search_title, region='wt-wt', safesearch='Off', time='w', max_results=max_results)
    # Create an empty DataFrame and seed it with the root article.
    articles = pd.DataFrame(columns=["url", "publisher", "description", "pub_date", "title", "text", "authors", "top_image", "movies"])
    articles.loc[len(articles)] = root_article
    for new_obj in tqdm(result, total=len(result)):
        time.sleep(polite_wait)  # polite wait between requests to ChatGPT, in seconds
        # Skip the result if it is the same article as the root one.
        if new_obj["title"] != root_article["title"]:
            # Use ChatGPT again to filter the search: keep the article only if its title
            # suggests it talks about the topic at hand.
            prompt = f"Only answer explicitly with Yes or No. Does the title for an article: '{new_obj['title']}' suggest to be talking about our search input? Is it the result we want?"
            gpt_judge = bot.consistent_ask(prompt)
            if gpt_judge != "No.":  # treat anything but an explicit "No." as a yes, in case ChatGPT misbehaves
                article = Article(new_obj["url"])
                try:
                    article.download()
                    article.parse()
                    if len(article.text) == 0:
                        print("can't get text:", new_obj["url"])
                    else:
                        data = {"url": new_obj["url"], "publisher": new_obj["source"],
                                "description": new_obj["body"], "pub_date": new_obj["date"],
                                "title": new_obj["title"], "text": article.text, "authors": article.authors,
                                "top_image": article.top_image, "movies": article.movies}
                        articles.loc[len(articles)] = data
                except Exception:
                    print("failed downloading:", new_obj["url"])
    return articles
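# Example usage (a sketch, assuming `roots` is a DataFrame returned by
# df_search_articles_bytopic above and that the module-level `bot` already exists):
#   expanded = df_expand_search_articles(roots.iloc[0], max_results=8)
#   print(expanded[["title", "publisher", "pub_date"]])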
# Archive old articles and prepare folders for the new batch to process.
today = datetime.today().strftime('%Y_%m_%d')
os.system(f"mkdir archive archive/{today}")
os.system(f"mv topic_csv archive/{today}")
os.system(f"mv articles archive/{today}")
# Create a clean folder tree to work in.
folders_paths = ""
for topic in TOPICS:
    folders_paths += " topic_csv/" + topic
os.system("mkdir topic_csv articles" + folders_paths)
bot = ChatGPT()
for topic in TOPICS:
    print(f"Processing {topic}...")
    print("-->Gathering root articles...")
    root_articles = df_search_articles_bytopic(topic=topic, max_results=2, exclude_websites=["www.youtube.com"])
    # For each root article found for this topic, search for more articles on the
    # same story and write the resulting subtree to its own CSV.
    print("-->Scraping expanded articles...")
    for i, article in root_articles.iterrows():
        df_expand_search_articles(article, max_results=8).to_csv(f"topic_csv/{topic}/subtree{i}.csv", index=False)
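# Each topic folder now holds one subtree{i}.csv per root article. A quick sketch for
# loading a whole run back into memory (the glob pattern matches the paths written above):
#   import glob
#   frames = {path: pd.read_csv(path) for path in glob.glob("topic_csv/*/subtree*.csv")}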