Merge pull request #63 from ClericPy/dev
1.7.2
ClericPy authored May 8, 2020
2 parents f909140 + 9d9c4e9 commit 339a904
Showing 12 changed files with 521 additions and 435 deletions.
2 changes: 1 addition & 1 deletion requirements.txt
@@ -2,7 +2,7 @@ uniparser>=1.4.5
fastapi
uvicorn
databases
torequests>=5.0.3
torequests>=5.0.4
fire
jinja2
aiofiles
2 changes: 1 addition & 1 deletion watchdogs/__init__.py
@@ -3,6 +3,6 @@
from .config import Config
from .main import init_app

__version__ = '1.7.1'
__version__ = '1.7.2'
__all__ = ['Config', 'init_app']
logging.getLogger('watchdogs').addHandler(logging.NullHandler())
13 changes: 9 additions & 4 deletions watchdogs/app.py
@@ -19,7 +19,7 @@

from . import __version__
from .config import md5_checker
from .crawler import crawl_once
from .crawler import crawl_once, find_next_check_time
from .models import Task, query_tasks, tasks
from .settings import (Config, get_host_freq_list, refresh_token, release_app,
set_host_freq, setup_app)
@@ -119,6 +119,7 @@ async def index(request: Request, tag: str = ''):
'custom_links': Config.custom_links,
'callback_workers': Config.callback_handler.workers,
'custom_tabs': Config.custom_tabs,
'work_hours_doc': find_next_check_time.__doc__,
})
init_vars_b64 = b64encode(init_vars_json.encode('u8')).decode('u8')
kwargs['init_vars'] = init_vars_b64
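
The template now also receives find_next_check_time.__doc__ as work_hours_doc, shipped inside the same base64-encoded JSON blob as the other init vars. A round-trip sketch of that encoding, standalone and with a toy payload:

    # Sketch: the base64-over-JSON round trip used for init_vars.
    from base64 import b64decode, b64encode
    from json import dumps, loads

    init_vars_json = dumps({'work_hours_doc': '0, 24 means anytime'})
    init_vars_b64 = b64encode(init_vars_json.encode('u8')).decode('u8')
    # the client reverses the steps to recover the dict
    assert loads(b64decode(init_vars_b64).decode('u8'))['work_hours_doc'] == '0, 24 means anytime'
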
@@ -204,7 +205,7 @@ async def force_crawl(task_name: str):
async def load_tasks(
task_name: Optional[str] = None,
page: int = 1,
page_size: int = 30,
page_size: int = Config.default_page_size,
order_by: str = 'last_change_time',
sort: str = 'desc',
tag: str = '',
@@ -469,8 +470,12 @@ async def post_lite(request: Request,


@app.get("/lite")
async def lite(request: Request, tag: str = '', sign: str = '', page: int = 1):
tasks, has_more = await query_tasks(tag=tag, page=page)
async def lite(request: Request,
tag: str = '',
sign: str = '',
page: int = 1,
page_size: int = Config.default_page_size):
tasks, has_more = await query_tasks(tag=tag, page=page, page_size=page_size)
now = datetime.now()
for task in tasks:
result = loads(task['latest_result'] or '{}')
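
The /lite endpoint now accepts an optional page_size query parameter (defaulting to Config.default_page_size) and passes it through to query_tasks. A minimal sketch of calling it; the host, port, and sign value are assumptions to adjust for your deployment:

    # Sketch: requesting /lite with the new page_size parameter.
    import requests

    resp = requests.get(
        'http://127.0.0.1:9901/lite',  # assumed host/port
        params={'tag': '', 'page': 1, 'page_size': 5, 'sign': '<your-sign>'},
    )
    print(resp.status_code)
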
2 changes: 1 addition & 1 deletion watchdogs/background.py
@@ -6,7 +6,6 @@
async def crawl_chunks(crawl_once):
loop_num = 0
while not Config.is_shutdown:
loop_num += 1
has_more = await crawl_once()
if isinstance(has_more, Exception):
Config.logger.error(f'crawl_once error, {has_more!r}')
@@ -15,6 +14,7 @@ async def crawl_chunks(crawl_once):
f'crawl_once finished, has_more: {has_more}, loop: {loop_num}')
if not has_more:
break
loop_num += 1


async def background_loop(coro_funcs: list = None):
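
Moving loop_num += 1 below the break means the pass that finds no more work is no longer counted, so the logged loop number reflects only passes that will be followed by another chunk. A condensed, standalone sketch of the reordered control flow (fake_crawl_once is a stand-in for the real coroutine):

    # Sketch: the reordered loop in crawl_chunks.
    import asyncio

    async def fake_crawl_once(state={'chunks': 3}):
        # pretend each call crawls one chunk; has_more while chunks remain
        state['chunks'] -= 1
        return state['chunks'] > 0

    async def crawl_chunks_sketch():
        loop_num = 0
        while True:
            has_more = await fake_crawl_once()
            print(f'crawl_once finished, has_more: {has_more}, loop: {loop_num}')
            if not has_more:
                break
            loop_num += 1  # counted only when another pass will run

    asyncio.run(crawl_chunks_sketch())
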
1 change: 1 addition & 0 deletions watchdogs/config.py
@@ -181,6 +181,7 @@ class Config:
custom_tabs: List[Dict] = []
COLLATION: str = None
cookie_max_age = 86400 * 7
default_page_size = 15

@classmethod
def add_custom_tabs(cls, label, url, name=None, desc=None):
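
One subtlety with using Config.default_page_size as a keyword default: Python evaluates default values once, when the def statement runs, so the value is baked into the load_tasks and query_tasks signatures at import time, and later changes to the attribute do not affect them. A tiny standalone illustration of that binding behavior:

    # Default parameter values are evaluated once, at definition time.
    class Config:
        default_page_size = 15

    def load(page_size: int = Config.default_page_size):
        return page_size

    Config.default_page_size = 50   # later override...
    print(load())                   # ...still 15: captured at def time
    print(load(page_size=50))       # pass it explicitly to vary it
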
76 changes: 50 additions & 26 deletions watchdogs/crawler.py
@@ -1,9 +1,11 @@
# -*- coding: utf-8 -*-

from asyncio import ensure_future, wait
from datetime import datetime, timedelta
from json import JSONDecodeError, dumps, loads
from typing import Optional, Tuple

from torequests.utils import ttime
from torequests.utils import timeago, ttime
from uniparser import Crawler, RuleNotFoundError

from .config import Config
@@ -38,8 +40,7 @@ def kwargs(self):


def find_next_check_time(
work_hours: str,
interval: int,
task: Task,
now: Optional[datetime] = None,
) -> Tuple[bool, datetime]:
'''
@@ -63,30 +64,50 @@ def find_next_check_time(
%w==5|20, 24 means every Friday or every day 20:00 ~ 23:59
%w==5|%w==2 means every Friday or Tuesday
%w!=6&%w!=0 means every day except Saturday & Sunday.
5. Set a guaranteed change interval
> If the work_hours string ends with `#` plus `x` (seconds), the next_change_time is checked first.
> In other words, the interval between two changes is trusted to be more than `x` seconds,
> so this task's crawler will not run again before `last_change_time + change_interval`.
%w==5#86400 means every Friday, if there was no change within the last day
0, 24#3600 means every hour, if there was no change within the last hour; after a change, the task will not be crawled again until the interval has passed.
'''
# find the latest hour that fits work_hours; if none exists, return next day 00:00
now = now or datetime.now()
work_hours = task.work_hours or '0, 24'
if '#' in work_hours:
# check if changed
last_change_time = task.last_change_time or datetime.fromtimestamp(0)
# split work_hours and change_interval
work_hours, change_interval_str = work_hours.split('#')
change_interval = int(change_interval_str)
# change interval not satisfied yet; wait for the remaining seconds
next_change_time = last_change_time + timedelta(seconds=change_interval)
if now < next_change_time:
Config.logger.info(
f'Task [{task.name}] changed within the last {timeago(change_interval, accuracy=1, format=1, short_name=1)}.'
)
return False, next_change_time

ok = check_work_time(work_hours, now)
if ok:
# current time is ok, next_check_time is now+interval
next_check_time = now + timedelta(seconds=interval)
return ok, next_check_time
need_crawl = check_work_time(work_hours, now)
if need_crawl:
# current time needs a crawl; next_check_time is now + interval
next_check_time = now + timedelta(seconds=task.interval)
return need_crawl, next_check_time
else:
# current time is not ok
# current time does not need a crawl
next_check_time = now
# time machine to check time fast
# time machine to update next_check_time fast
for _ in range(60):
# check next interval
next_check_time = next_check_time + timedelta(seconds=interval)
_ok = check_work_time(work_hours, next_check_time)
if _ok:
# current is still False, but next_check_time is True
# next interval
next_check_time = next_check_time + timedelta(seconds=task.interval)
_need_crawl = check_work_time(work_hours, next_check_time)
if _need_crawl:
# current time still fails the check, but next_check_time passes
break
return ok, next_check_time
return need_crawl, next_check_time


async def crawl(task):
async def crawl(task: Task):
crawler: Crawler = Config.crawler
logger = Config.logger
logger.info(f'Start crawling: {task.name}')
@@ -103,13 +124,17 @@ async def crawl(task):
else:
if len(crawl_result) == 1:
# chain result for __request__, which fetches a new request
result_list = get_watchdog_result(item=crawl_result.popitem()[1])
if result_list == {'text': 'text not found'}:
formated_result = get_watchdog_result(
item=crawl_result.popitem()[1])
if formated_result == {'text': 'text not found'}:
error = f'{task.name} text not found, crawl result given: {crawl_result}'
logger.error(error)
result_list = None
else:
if not isinstance(result_list, list):
result_list = [result_list]
if isinstance(formated_result, list):
result_list = formated_result
else:
result_list = [formated_result]
# use 'force crawl' on the web UI for more logs
logger.info(f'{task.name} Crawl success: {result_list}'[:150])
else:
@@ -141,12 +166,11 @@ async def _crawl_once(task_name: Optional[str] = None, chunk_size: int = 20):
for _task in fetched_tasks:
task = Task(**dict(_task))
# check work hours
ok, next_check_time = find_next_check_time(task.work_hours or '0, 24',
task.interval, now)
need_crawl, next_check_time = find_next_check_time(task, now)
if task_name:
# always crawl for given task_name
ok = True
if ok:
need_crawl = True
if need_crawl:
t = ensure_future(crawl(task))
# add task_name for logger
setattr(t, 'task_name', task.name)
@@ -160,7 +184,7 @@ async def _crawl_once(task_name: Optional[str] = None, chunk_size: int = 20):
# update task variable for callback
task.__dict__.update(values)
update_values.append(values)
if not ok:
if not need_crawl:
logger.info(
f'Task [{task.name}] is outside work hours, next_check_time reset to {next_check_time}'
)
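
With the signature collapsed to take the whole Task, the `#` change-interval guard can read last_change_time straight off the task. A sketch of the intended contract, using SimpleNamespace as a stand-in for the real Task model (the stand-in and the sample times are assumptions; attribute names follow the diff):

    # Sketch: how the `#` guard in work_hours is meant to behave.
    from datetime import datetime, timedelta
    from types import SimpleNamespace

    now = datetime(2020, 5, 8, 12, 0)
    task = SimpleNamespace(
        name='demo',
        interval=300,                  # seconds between checks
        work_hours='0, 24#3600',       # any hour, but only if unchanged for an hour
        last_change_time=now - timedelta(minutes=10),
    )

    # With the real function: need_crawl, next_time = find_next_check_time(task, now)
    # Here the task changed 10 minutes ago, so the #3600 guard fires:
    # need_crawl is False and next_time is last_change_time + 3600 seconds.
    print(task.last_change_time + timedelta(seconds=3600))  # 2020-05-08 12:50:00
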
2 changes: 1 addition & 1 deletion watchdogs/models.py
@@ -274,7 +274,7 @@ async def query_tasks(
task_name: Optional[str] = None,
task_id: Optional[int] = None,
page: int = 1,
page_size: int = 30,
page_size: int = Config.default_page_size,
order_by: str = 'last_change_time',
sort: str = 'desc',
tag: str = '',
3 changes: 3 additions & 0 deletions watchdogs/static/css/watchdogs.css
@@ -79,6 +79,9 @@ p.custom_links {
.el-popover {
max-width: 50%;
}
.el-message-box.work_hours_doc {
width: 40%;
}
pre {
word-wrap: break-word;
white-space: pre-wrap;
2 changes: 1 addition & 1 deletion watchdogs/static/css/watchdogs.min.css

Some generated files are not rendered by default.
