Skip to content

Commit b646b09

Browse files
committed
fix formatting issues
1 parent e33751a commit b646b09

File tree

5 files changed

+85
-657
lines changed

5 files changed

+85
-657
lines changed

extractors.py

Lines changed: 28 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,9 @@
1-
from abc import ABC,abstractmethod
1+
from abc import ABC, abstractmethod
22
import requests
33
from bs4 import BeautifulSoup
44
from summarize import summarizetext
55

66

7-
87
class extractor(ABC):
98
def __init__(self):
109
self.url = ""
@@ -27,51 +26,54 @@ def __init__(self):
2726
def htmlparser(self):
2827
content = self.extracthtml()
2928
# only process articles limited by the limit defined
30-
newsdivs = content.find_all("div", class_="col-md-4 pt-2")[:self.limit]
29+
newsdivs = content.find_all("div", class_="col-md-4 pt-2")[: self.limit]
3130
news = []
3231

3332
for div in newsdivs:
34-
33+
3534
news.append(self.extractinfo(div=div))
36-
35+
3736
return news
3837

39-
def extractinfo(self,div):
38+
def extractinfo(self, div):
4039
try:
41-
soup = BeautifulSoup(str(div).encode('utf-8').decode('ascii', 'ignore'), "html.parser")
42-
40+
soup = BeautifulSoup(
41+
str(div).encode("utf-8").decode("ascii", "ignore"), "html.parser"
42+
)
43+
4344
# Extract the article URL and title
44-
article_link = soup.find('a', href=True)
45-
article_url = article_link['href'] if article_link else None
46-
title_tag = soup.find('h6')
45+
article_link = soup.find("a", href=True)
46+
article_url = article_link["href"] if article_link else None
47+
title_tag = soup.find("h6")
4748
article_title = title_tag.text.strip() if title_tag else None
48-
49+
4950
# Extract the image URL
50-
img_tag = soup.find('img')
51-
image_url = img_tag.get('src') if img_tag else None
51+
img_tag = soup.find("img")
52+
image_url = img_tag.get("src") if img_tag else None
5253

5354
# get article from url
5455
page = BeautifulSoup(requests.get(article_url).content, "html.parser")
5556
article = page.find("div", class_="news_reader")
5657

57-
date_span = page.find('span', class_="greytime2")
58+
date_span = page.find("span", class_="greytime2")
5859
date = date_span.get_text().split(" ")[1]
5960

6061
summary = summarizetext(article.text)
61-
62+
6263
return {
63-
'id':article_url,
64-
'title': article_title,
65-
'summary': summary,
66-
'article_url': article_url,
67-
'publish_time': date,
68-
'image_url': image_url,
69-
'source': self.source
64+
"id": article_url,
65+
"title": article_title,
66+
"summary": summary,
67+
"article_url": article_url,
68+
"publish_time": date,
69+
"image_url": image_url,
70+
"source": self.source,
7071
}
7172
except Exception as e:
7273
print(f"Error parsing HTML snippet: {e}")
7374
return None
74-
75-
if __name__ == '__main__':
75+
76+
77+
if __name__ == "__main__":
7678
test = fijivillage()
77-
print(test.htmlparser())
79+
print(test.htmlparser())

main.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
from api import app
22

3-
if __name__ == '__main__':
3+
if __name__ == "__main__":
44
import uvicorn
55
import nltk
66

7-
nltk.download('punkt_tab')
8-
uvicorn.run("main:app", port=10000, host="0.0.0.0", log_level="info")
7+
nltk.download("punkt_tab")
8+
uvicorn.run("main:app", port=10000, host="0.0.0.0", log_level="info")

models.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,12 @@
33
from beanie import Document, PydanticObjectId, Link
44
from pydantic import Field, BaseModel
55

6+
67
class News(Document):
78
id: str
89
title: str
910
summary: str
1011
article_url: str
1112
image_url: str
12-
publish_time: str
13+
publish_time: str
1314
source: str

0 commit comments

Comments
 (0)