#!/usr/bin/env python3
# This file is part of android-permissions and licensed under The MIT License (MIT).
"""Worker thread that crawls Google Play pages, discovers new app identifiers,
records them in PostgreSQL, and hands app detail pages to the processor threads."""
import threading
import urllib.request
import urllib.error
import re
import time
import psycopg2
from config import Config
class HttpError(Exception):
    """Raised when a fetched page returns an HTTP error status code."""

    def __init__(self, value):
        self.value = value

    def __str__(self):
        return repr(self.value)
class CrawlerThread(threading.Thread):
    def __init__(self, logger, crawling_queue, crawled_dict, discovered_dict, processing_queue, processed_dict):
        threading.Thread.__init__(self)
        self.logger = logger
        self.crawling_queue = crawling_queue      # URLs waiting to be fetched
        self.crawled_dict = crawled_dict          # URLs already fetched
        self.discovered_dict = discovered_dict    # app identifiers already queued for crawling
        self.processing_queue = processing_queue  # (identifier, url, html) tuples for the processors
        self.processed_dict = processed_dict      # app identifiers already handed to the processors
        self.config = Config()
        self.config.read_config()
        # connect to database
        self.db_conn = psycopg2.connect(host=self.config.db_host,
                                        user=self.config.db_user,
                                        password=self.config.db_password,
                                        database=self.config.db_database)
        self.db_cursor = self.db_conn.cursor()
    def __del__(self):
        self.logger.info("crawler died")
        self.db_conn.close()
    def run(self):
        self.logger.info("crawler spawned")
        while True:
            url = self.crawling_queue.get()
            self.logger.debug("crawling " + url)
            try:
                html = self.fetch_page(url)
            except HttpError as e:
                self.logger.info("HttpError: Code " + e.value + " at " + url)
                if re.match(r'https://play\.google\.com/store/apps/details\?id=[^&"?#<>()]*', url) is not None:
                    # extract the identifier for the (currently disabled) failure bookkeeping below
                    identifier = re.findall(r'/store/apps/details\?id=([^&"?#<>()]*)', url)[0]
                    # self.db_cursor.execute('INSERT INTO "public"."pointsintime" (timestamp, application_id) VALUES (now(), (SELECT id FROM "public"."applications" WHERE identifier = %s))', (identifier,))
                    # self.db_conn.commit()
                # mark the failed URL as done so crawling_queue.join() cannot block forever
                self.crawling_queue.task_done()
                continue
            # add found apps to queue
            identifiers = self.find_identifiers(html)
            for identifier in identifiers:
                self.logger.debug("found " + identifier + " at " + url)
                if identifier not in self.discovered_dict:
                    url_app = self.config.app_url + identifier + "&hl=en"
                    self.crawling_queue.put(url_app)
                    self.discovered_dict[identifier] = url_app
                    # parameterized query instead of string concatenation to avoid SQL injection
                    self.db_cursor.execute('INSERT INTO "public"."applications" (identifier) SELECT %s WHERE NOT EXISTS (SELECT 1 FROM "public"."applications" WHERE identifier = %s);', (identifier, identifier))
                    self.logger.debug("added for visit: " + url_app)
            # commit
            self.db_conn.commit()
            # check if this page is to be processed
            # TODO: use escaped self.config.app_url
            if re.match(r'https://play\.google\.com/store/apps/details\?id=[^&"?#<>()]*', url) is not None:
                identifier = re.findall(r'/store/apps/details\?id=([^&"?#<>()]*)', url)[0]
                if identifier not in self.processed_dict:
                    self.processed_dict[identifier] = url
                    self.logger.debug("added for processing: " + identifier + " from " + url)
                    item = (identifier, url, html)
                    self.processing_queue.put(item)
            self.crawled_dict[url] = "crawled"
            self.crawling_queue.task_done()
            time.sleep(1)  # politeness delay between requests
    def fetch_page(self, url):
        # urllib.request.urlopen follows redirects and raises HTTPError for
        # 4xx/5xx responses; re-raise as our own HttpError so run() can handle it
        try:
            f = urllib.request.urlopen(url)
        except urllib.error.HTTPError as e:
            raise HttpError(str(e.code))
        return f.read().decode('utf-8', 'replace')
    def find_identifiers(self, html):
        # every link to an app detail page carries the package name in its id parameter
        identifiers = re.findall(r'/store/apps/details\?id=([^&"?#<>()]*)', html)
        return list(identifiers)
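
# --- Usage sketch (illustrative; the project's real entry point is not shown here) ---
# A minimal, assumed wiring of CrawlerThread: a logging.Logger, thread-safe
# queues, plain dicts shared across threads, and one seed URL. The thread
# count and the seed identifier below are examples, not values from the
# project; Config and its app_url / db_* fields come from the config module.
if __name__ == '__main__':
    import logging
    import queue

    logging.basicConfig(level=logging.DEBUG)
    logger = logging.getLogger("crawler")

    crawling_queue = queue.Queue()
    processing_queue = queue.Queue()
    crawled_dict, discovered_dict, processed_dict = {}, {}, {}

    # seed the crawl with a single app detail page
    crawling_queue.put("https://play.google.com/store/apps/details?id=com.example.app&hl=en")

    for _ in range(4):  # assumed number of crawler threads
        t = CrawlerThread(logger, crawling_queue, crawled_dict,
                          discovered_dict, processing_queue, processed_dict)
        t.daemon = True  # let the process exit even if crawlers are still blocked
        t.start()

    crawling_queue.join()  # blocks until task_done() was called for every queued URL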