leakbase_crawling.py
import requests
import pytz
import json
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from datetime import datetime

token = 'token'        # Slack bot token
channel = 'channel'    # Slack channel ID
file_path = 'leakbase_clean.json'
leakbase_url = 'https://leakbase.io/'
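
# Each record stored in leakbase_clean.json has this shape (illustrative sample values,
# matching the post_data dict built in fetch_leakbase_data below):
# {
#     "title": "Example thread title",
#     "upload time": "2024-01-01 09:00:00",
#     "time_text": "Jan 1, 2024",
#     "author": "someuser",
#     "url": "https://leakbase.io/threads/example.12345/"
# }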

# Send a Slack notification for a new post
def slack_alarm(post):
    text = (f"New post\nTitle: {post['title']}\nUpload time: {post['upload time']}\n"
            f"Author: {post['author']}\nURL: {post['url']}")
    requests.post('https://slack.com/api/chat.postMessage',
                  headers={"Authorization": "Bearer " + token},
                  data={"channel": channel, "text": text},
                  timeout=10)
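
# The resulting Slack message body looks like this (illustrative values):
# New post
# Title: Example thread title
# Upload time: 2024-01-01 09:00:00
# Author: someuser
# URL: https://leakbase.io/threads/example.12345/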

# Write the full post list back to the JSON file
def update_file(posts):
    with open(file_path, 'w', encoding='utf-8') as file:
        json.dump(posts, file, ensure_ascii=False, indent=4)
    print(f'{file_path} updated.')

# Load previously crawled posts from the JSON file
def load_previous_posts(filename):
    try:
        with open(filename, 'r', encoding='utf-8') as file:  # explicit UTF-8 encoding
            return json.load(file)
    except FileNotFoundError:
        return []

# Compare against previous posts, alert on new ones, and update the file
def check_posts(new_posts):
    previous_posts = load_previous_posts(file_path)
    if not previous_posts:
        # First run: alert on everything and seed the file
        for post in new_posts:
            slack_alarm(post)
        update_file(new_posts)
        return
    previous_urls = set(post['url'] for post in previous_posts)
    new_posts_found = [post for post in new_posts if post['url'] not in previous_urls]
    if new_posts_found:
        for post in reversed(new_posts_found):  # alert oldest first so Slack shows posts in chronological order
            slack_alarm(post)
        updated_posts = new_posts_found + previous_posts
        update_file(updated_posts)
    else:
        update_file(previous_posts)
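
# The dedup logic above, illustrated with hypothetical URLs:
#   previous file: [{'url': 'https://leakbase.io/threads/a.1/'}]
#   new crawl:     [{'url': 'https://leakbase.io/threads/b.2/'},
#                   {'url': 'https://leakbase.io/threads/a.1/'}]
#   -> only the b.2 post triggers slack_alarm, and the saved file becomes [b.2 post, a.1 post].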

# Fetch the Leakbase front page and extract post metadata
def fetch_leakbase_data():
    response = requests.get(leakbase_url, timeout=30)
    if response.status_code == 200:
        html = response.text
        soup = BeautifulSoup(html, 'html.parser')
        # Select the post, timestamp, and forum-title elements
        post_elements = soup.select('div._xgtIstatistik-satir--konu')
        time_elements = soup.select('time.structItem-latestDate')
        forum_elements = [a['title'] for a in soup.select('._xgtIstatistik-satir--hucre._xgtIstatistik-satir--forum a[title]')]
        posts = []
        for post_element, time_element, forum in zip(post_elements, time_elements, forum_elements):
            # Author name
            author = post_element.get('data-author', 'no author')
            # Anchor tag holding the post link and title
            title_tag = post_element.find('a', attrs={'data-preview-url': True})
            # Post timestamp: convert UTC to Asia/Seoul time
            datetime_str = time_element.get('datetime')
            if datetime_str:
                utc_datetime = datetime.strptime(datetime_str, '%Y-%m-%dT%H:%M:%S%z')
                korea_datetime_str = utc_datetime.astimezone(pytz.timezone('Asia/Seoul')).strftime("%Y-%m-%d %H:%M:%S")
            else:
                korea_datetime_str = 'no time info'
            time_text = time_element.get_text().strip()
            # Build the absolute post URL (forum is extracted above but not stored yet)
            if title_tag:
                full_url = urljoin(leakbase_url, title_tag['href'])
                title = title_tag.get_text().strip()
                post_data = {
                    'title': title,
                    'upload time': korea_datetime_str,
                    'time_text': time_text,
                    'author': author,
                    'url': full_url
                }
                posts.append(post_data)
        # Check for new posts and handle them
        check_posts(posts)
    else:
        print("Failed to fetch data from Leakbase.")

if __name__ == "__main__":
    fetch_leakbase_data()
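
# The script processes a single snapshot per run, so it can be polled on a schedule.
# Example deployment (hypothetical path and interval) via cron:
# */10 * * * * /usr/bin/python3 /path/to/leakbase_crawling.py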