-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathnewsScraper.py
144 lines (97 loc) · 4.69 KB
/
newsScraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
from gnews import GNews # needed for google news api
from newspaper import Article # needed for article scraping
from googlenewsdecoder import gnewsdecoder # needed for decoding google news api
import yaml # needed for reading yaml config file
import json # needed for reading json objects from the llm
import os # needed for folder creation
from geopy.geocoders import Nominatim # needed for geolocation
from ollama import chat # needed for LLM
from ollama import ChatResponse # needed for LLM
import time # needed for timestamping
# Only needed for testing
import random # needed for random number generation
# Load the configuration settings (LLM and news-search options).
# YAML is read as UTF-8 explicitly so parsing does not depend on the
# platform's default encoding.
with open("./configs/context.yaml", "r", encoding="utf-8") as ymlfile:
    context = yaml.safe_load(ymlfile)

# Configure the LLM settings.
model_id = context["llm"]["ID"]  # model identifier handed to the LLM chat call
modelContext = context["llm"]["CONTEXT"]  # system prompt text for the LLM
modelJSON = []  # list to hold the JSON output of the LLM

# Create the incidents folder if it doesn't exist. exist_ok=True avoids the
# check-then-create race of testing os.path.exists() first.
os.makedirs("./incidents", exist_ok=True)

# Configure the geolocation object.
geolocator = Nominatim(user_agent="newsAnalyzer")

# Configure the Google News object from the "news" section of the config.
googleNews = GNews(
    language=context["news"]["LANGUAGE"],
    country=context["news"]["COUNTRY"],
    period=context["news"]["LOOKBACK"],
)
def getHistory() -> str:
    """
    Build the history context string from the incidents stored in ./incidents.

    Every ``*.json`` file in the folder is loaded and summarized on one line
    (title, date, summary). Entries that are not regular ``.json`` files are
    ignored; files that cannot be read, parsed, or that lack an expected key
    are skipped with a printed warning instead of aborting the whole run.

    Returns:
        The instruction header followed by one line per stored incident, or
        just the header when there are no incidents (or no incidents folder).
    """
    incidentsDir = "./incidents"
    header = "This is the list of summarized incidents that have been detected so far. If a news story is similar to any of these incidents, it is not unique: \n"

    try:
        # Sorted so the context is deterministic; os.listdir() order is arbitrary.
        fileNames = sorted(os.listdir(incidentsDir))
    except FileNotFoundError:
        return header

    lines = []
    for fileName in fileNames:
        path = os.path.join(incidentsDir, fileName)
        # Ignore stray entries such as .gitkeep, .DS_Store or sub-folders.
        if not fileName.endswith(".json") or not os.path.isfile(path):
            continue
        try:
            with open(path, "r", encoding="utf-8") as incidentFile:
                incident = json.load(incidentFile)
            # NOTE: the 'summery' key (sic) is the spelling used in the stored files.
            lines.append(
                f"Title: {incident['title']}, Date: {incident['date']}, Summery: {incident['summery']} \n"
            )
        except (OSError, ValueError, KeyError, TypeError) as err:
            # ValueError covers json.JSONDecodeError and bad encodings;
            # KeyError/TypeError cover files with an unexpected structure.
            print(f"Skipping unreadable incident file {path}: {err}")

    return header + "".join(lines)
# TODO: Put this in a main function
# Get the history context of all current incidents.
historyContext = getHistory()

# Fetch the stories for the configured topic as a list of dicts.
newsJSON = googleNews.get_news(context["news"]["TOPIC"])
random.shuffle(newsJSON)  # shuffle the list to get a random article for testing
print(f'Found {str(len(newsJSON))} stories')

for story in newsJSON:
    print(f"\nTitle: {story['title']} \nDate: {str(story['published date'])} \nDescription: {story['description']}")
    print("\n")

    # One failing story (network timeout, bad LLM output, ...) must not stop
    # the whole run, so each story is processed inside its own try block.
    try:
        # Decode the Google News redirect URL to get the actual article URL
        # (second argument is the decoder's interval setting).
        decodedURL = gnewsdecoder(story['url'], 5)
        # NOTE(review): the decoder is expected to return a dict with a
        # "status" flag and a "message" on failure — confirm against the
        # installed googlenewsdecoder version.
        if not decodedURL.get("status"):
            print(f"Could not decode URL: {decodedURL.get('message')}")
            continue
        articleURL = decodedURL["decoded_url"]

        # Download and parse the article text.
        article = Article(articleURL)
        article.download()
        article.parse()

        # Ask the LLM whether the article describes a new, unique incident.
        messages = [
            {"role": "system", "content": historyContext},
            {"role": "system", "content": modelContext},
            # Fixed: the separator was the literal "/n" instead of a newline.
            {"role": "user", "content": f"Analyze the following article: {article.title} \n {article.text}"},
        ]
        response: ChatResponse = chat(model_id, messages)
        decision = json.loads(response.message.content)

        # Only proceed when the model explicitly answered true to both
        # questions (JSON booleans decode to Python bool).
        if decision.get('isIncident') is not True or decision.get('isUnique') is not True:
            print("No incident detected")
            continue

        # Get the location of the incident; geocode() yields None when the
        # place name cannot be resolved.
        location = geolocator.geocode(decision['location'])
        if location is None:
            print(f"Incident detected but location could not be geocoded: {decision['location']}")
            continue

        # Make the JSON object representing the incident.
        # NOTE: the 'summery' key (sic) is kept because getHistory() reads it.
        incident = {
            "title": story['title'],
            "summery": story['description'],
            "date": story['published date'],
            "url": articleURL,
            "latitude": location.latitude,
            "longitude": location.longitude,
        }

        # Use the timestamp as the file name for the incident.
        filePath = f"./incidents/{str(time.time())}.json"
        with open(filePath, "w", encoding="utf-8") as file:
            json.dump(incident, file)

        # Add the incident to the history context so later stories in this
        # run are compared against it as well.
        historyContext += f"Title: {story['title']}, Date: {story['published date']}, Summery: {story['description']} \n"
        print(f"Incident detected at {location.latitude}, {location.longitude}")
    except KeyboardInterrupt:
        print("Keyboard interrupt detected, exiting")
        break
    except Exception as err:
        # Report what actually went wrong instead of hiding it behind a
        # bare except; then move on to the next story.
        print(f"Error processing story ({type(err).__name__}: {err}), could be timeout or other issue")