@@ -1,10 +1,9 @@
-from abc import ABC ,abstractmethod
+from abc import ABC, abstractmethod
 import requests
 from bs4 import BeautifulSoup
 from summarize import summarizetext
 
 
-
 class extractor(ABC):
     def __init__(self):
         self.url = ""
@@ -27,51 +26,54 @@ def __init__(self):
     def htmlparser(self):
         content = self.extracthtml()
         # only process articles limited by the limit defined
-        newsdivs = content.find_all("div", class_="col-md-4 pt-2")[:self.limit]
+        newsdivs = content.find_all("div", class_="col-md-4 pt-2")[: self.limit]
         news = []
 
         for div in newsdivs:
-
+
             news.append(self.extractinfo(div=div))
-
+
         return news
 
-    def extractinfo(self,div):
+    def extractinfo(self, div):
         try:
-            soup = BeautifulSoup(str(div).encode('utf-8').decode('ascii', 'ignore'), "html.parser")
-
+            soup = BeautifulSoup(
+                str(div).encode("utf-8").decode("ascii", "ignore"), "html.parser"
+            )
+
             # Extract the article URL and title
-            article_link = soup.find('a', href=True)
-            article_url = article_link['href'] if article_link else None
-            title_tag = soup.find('h6')
+            article_link = soup.find("a", href=True)
+            article_url = article_link["href"] if article_link else None
+            title_tag = soup.find("h6")
             article_title = title_tag.text.strip() if title_tag else None
-
+
             # Extract the image URL
-            img_tag = soup.find('img')
-            image_url = img_tag.get('src') if img_tag else None
+            img_tag = soup.find("img")
+            image_url = img_tag.get("src") if img_tag else None
 
             # get article from url
             page = BeautifulSoup(requests.get(article_url).content, "html.parser")
             article = page.find("div", class_="news_reader")
 
-            date_span = page.find('span', class_="greytime2")
+            date_span = page.find("span", class_="greytime2")
             date = date_span.get_text().split(" ")[1]
 
             summary = summarizetext(article.text)
-
+
             return {
-                'id': article_url,
-                'title': article_title,
-                'summary': summary,
-                'article_url': article_url,
-                'publish_time': date,
-                'image_url': image_url,
-                'source': self.source
+                "id": article_url,
+                "title": article_title,
+                "summary": summary,
+                "article_url": article_url,
+                "publish_time": date,
+                "image_url": image_url,
+                "source": self.source,
             }
         except Exception as e:
             print(f"Error parsing HTML snippet: {e}")
             return None
-
-if __name__ == '__main__':
+
+
+if __name__ == "__main__":
     test = fijivillage()
-    print(test.htmlparser())
+    print(test.htmlparser())
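For context, the abstract extractor base class only does work through a concrete subclass such as fijivillage, which is instantiated in the __main__ block but whose definition is outside the changed hunks. A minimal sketch of how such a subclass could plug into htmlparser() is below; the url, limit, and source values and the extracthtml() body are illustrative assumptions, not the repository's actual implementation.

# Hypothetical subclass, for illustration only; the real fijivillage class
# lives elsewhere in this file and may differ.
class fijivillage(extractor):
    def __init__(self):
        super().__init__()
        self.url = "https://example.com/news"  # assumed placeholder, not the real listing URL
        self.limit = 5                         # assumed cap on articles read by htmlparser()
        self.source = "fijivillage"            # assumed source label copied into each result dict

    def extracthtml(self):
        # Assumed hook: fetch the listing page and return parsed HTML for
        # htmlparser() to split into per-article "col-md-4 pt-2" divs.
        return BeautifulSoup(requests.get(self.url).content, "html.parser")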