fix sql error #93

Open
wants to merge 13 commits into base: master from
Changes from all commits
4 changes: 4 additions & 0 deletions .gitignore
@@ -8,3 +8,7 @@ venv
 dist
 build
 SpiderKeeper.db
+dbs/
+eggs/
+*.egg
+*.db
7 changes: 7 additions & 0 deletions Dockerfile
@@ -0,0 +1,7 @@
+FROM python:3.6-alpine
+
+WORKDIR /app
+
+RUN pip install https://github.com/ryanvin/SpiderKeeper/archive/master.zip
+
+EXPOSE 5000
39 changes: 5 additions & 34 deletions SpiderKeeper/app/__init__.py
@@ -1,16 +1,11 @@
 # Import flask and template operators
-import logging
-import traceback
 
-import apscheduler
 from apscheduler.schedulers.background import BackgroundScheduler
 from flask import Flask
-from flask import jsonify
 from flask_basicauth import BasicAuth
 from flask_restful import Api
 from flask_restful_swagger import swagger
 from flask_sqlalchemy import SQLAlchemy
-from werkzeug.exceptions import HTTPException
 
 import SpiderKeeper
 from SpiderKeeper import config
@@ -21,13 +16,7 @@
 app.config.from_object(config)
 
 # Logging
-log = logging.getLogger('werkzeug')
-log.setLevel(logging.ERROR)
-formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
-handler = logging.StreamHandler()
-handler.setFormatter(formatter)
-app.logger.setLevel(app.config.get('LOG_LEVEL', "INFO"))
-app.logger.addHandler(handler)
+app.logger.setLevel(app.config.get('LOG_LEVEL'))
 
 # swagger
 api = swagger.docs(Api(app), apiVersion=SpiderKeeper.__version__, api_spec_url="/api",
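
Note on the simplified logging block: dropping the old "INFO" fallback makes the LOG_LEVEL config key mandatory, and the one-liner relies on Logger.setLevel accepting a level name rather than an integer. A minimal stdlib sketch of both points (the 'demo' logger name is illustrative):

import logging

logger = logging.getLogger('demo')
logger.setLevel('INFO')                    # level names are accepted since Python 3.2
assert logger.level == logging.INFO

# Without the removed "INFO" default, a missing LOG_LEVEL key would pass None:
try:
    logger.setLevel(None)
except TypeError as exc:
    print(exc)                             # "Level not an integer or a valid string: None"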
@@ -44,6 +33,7 @@ def teardown_request(exception):
         db.session.remove()
     db.session.remove()
 
+
 # Define apscheduler
 scheduler = BackgroundScheduler()
 
@@ -57,26 +47,6 @@ class Base(db.Model):
                            onupdate=db.func.current_timestamp())
 
 
-# Sample HTTP error handling
-# @app.errorhandler(404)
-# def not_found(error):
-#     abort(404)
-
-
-@app.errorhandler(Exception)
-def handle_error(e):
-    code = 500
-    if isinstance(e, HTTPException):
-        code = e.code
-    app.logger.error(traceback.print_exc())
-    return jsonify({
-        'code': code,
-        'success': False,
-        'msg': str(e),
-        'data': None
-    })
-
-
 # Build the database:
 from SpiderKeeper.app.spider.model import *
 
@@ -108,7 +78,8 @@ def regist_server():
     from SpiderKeeper.app.schedulers.common import sync_job_execution_status_job, sync_spiders, \
         reload_runnable_spider_job_execution
 
-    scheduler.add_job(sync_job_execution_status_job, 'interval', seconds=5, id='sys_sync_status')
+    scheduler.add_job(sync_job_execution_status_job, 'interval', seconds=5, id='sys_sync_status',
+                      max_instances=3, misfire_grace_time=10)
     scheduler.add_job(sync_spiders, 'interval', seconds=10, id='sys_sync_spiders')
     scheduler.add_job(reload_runnable_spider_job_execution, 'interval', seconds=30, id='sys_reload_job')
 
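The two new keyword arguments are standard APScheduler knobs: max_instances=3 lets up to three overlapping runs of the 5-second status sync coexist instead of being skipped with a "maximum number of running instances reached" warning, and misfire_grace_time=10 still fires a run that wakes up to 10 seconds late rather than discarding it. A self-contained sketch of the same settings (the slow_sync body is a stand-in, not SpiderKeeper code):

import time
from apscheduler.schedulers.background import BackgroundScheduler

def slow_sync():                    # stand-in for sync_job_execution_status_job
    time.sleep(8)                   # deliberately longer than the 5s interval

scheduler = BackgroundScheduler()
scheduler.add_job(slow_sync, 'interval', seconds=5, id='sys_sync_status',
                  max_instances=3, misfire_grace_time=10)
scheduler.start()
time.sleep(30)                      # let a few overlapping runs accumulate
scheduler.shutdown()
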
@@ -119,7 +90,7 @@ def start_scheduler():
 
 def init_basic_auth():
     if not app.config.get('NO_AUTH'):
-        basic_auth = BasicAuth(app)
+        BasicAuth(app)
 
 
 def initialize():
5 changes: 3 additions & 2 deletions SpiderKeeper/app/proxy/contrib/scrapy.py
@@ -1,4 +1,5 @@
-import datetime, time
+import datetime
+import time
 
 import requests
 
@@ -72,7 +73,7 @@ def start_spider(self, project_name, spider_name, arguments):
     def cancel_spider(self, project_name, job_id):
         post_data = dict(project=project_name, job=job_id)
         data = request("post", self._scrapyd_url() + "/cancel.json", data=post_data, return_type="json")
-        return data != None
+        return data is not None
 
     def deploy(self, project_name, file_path):
         with open(file_path, 'rb') as f:
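
data != None is more than a style nit: != dispatches to __eq__/__ne__, which arbitrary objects can override, while `is not` is an identity check that cannot be fooled. A short demonstration:

class AlwaysEqual:
    def __eq__(self, other):
        return True                 # claims equality with everything, even None

obj = AlwaysEqual()
print(obj != None)                  # False: the default __ne__ negates the lying __eq__
print(obj is not None)              # True: identity is unaffected by __eq__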
8 changes: 5 additions & 3 deletions SpiderKeeper/app/schedulers/common.py
@@ -1,7 +1,6 @@
 import threading
 import time
 
-from SpiderKeeper.app import scheduler, app, agent, db
+from SpiderKeeper.app import scheduler, app, agent
 from SpiderKeeper.app.spider.model import Project, JobInstance, SpiderInstance
 
@@ -10,7 +9,7 @@ def sync_job_execution_status_job():
     sync job execution running status
     :return:
     '''
-    for project in Project.query.all():
+    for project in Project.get_valid_project():
         agent.sync_job_status(project)
     app.logger.debug('[sync_job_execution_status]')
 
@@ -34,6 +33,9 @@ def run_spider_job(job_instance_id):
     '''
     try:
         job_instance = JobInstance.find_job_instance_by_id(job_instance_id)
+        if not job_instance:
+            app.logger.error('[run_spider_job] job_instance {} not found'.format(job_instance_id))
+            return
         agent.start_spider(job_instance)
         app.logger.info('[run_spider_job][project:%s][spider_name:%s][job_instance_id:%s]' % (
             job_instance.project_id, job_instance.spider_name, job_instance.id))
15 changes: 14 additions & 1 deletion SpiderKeeper/app/spider/controller.py
@@ -60,6 +60,8 @@ class SpiderCtrl(flask_restful.Resource):
         }])
     def get(self, project_id):
         project = Project.find_project_by_id(project_id)
+        if not project:
+            return [{'error_msg': 'project id {} not found'.format(project_id)}]
         return [spider_instance.to_dict() for spider_instance in
                 SpiderInstance.query.filter_by(project_id=project_id).all()]
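
The guard removes the AttributeError a missing project used to raise, though the endpoint still answers HTTP 200 with an error payload. A hedged, self-contained sketch of the stricter alternative using flask_restful.abort (the in-memory PROJECTS dict stands in for the Project model):

from flask import Flask
from flask_restful import Api, Resource, abort

app = Flask(__name__)
api = Api(app)

PROJECTS = {1: {'id': 1, 'project_name': 'demo'}}    # stand-in for the database

class SpiderCtrl(Resource):
    def get(self, project_id):
        project = PROJECTS.get(project_id)
        if project is None:
            # clients see a real 404 instead of a 200 that describes an error
            abort(404, message='project id {} not found'.format(project_id))
        return [project]

api.add_resource(SpiderCtrl, '/projects/<int:project_id>/spiders')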

@@ -510,6 +512,7 @@ def project_create():
     project_name = request.form['project_name']
     project = Project()
     project.project_name = project_name
+    project.valid = 1
     db.session.add(project)
     db.session.commit()
     return redirect("/project/%s/spider/deploy" % project.id, code=302)
@@ -536,7 +539,6 @@ def job_dashboard(project_id):
 
 @app.route("/project/<project_id>/job/periodic")
 def job_periodic(project_id):
-    project = Project.find_project_by_id(project_id)
     job_instance_list = [job_instance.to_dict() for job_instance in
                          JobInstance.query.filter_by(run_type="periodic", project_id=project_id).all()]
     return render_template("job_periodic.html",
@@ -546,6 +548,8 @@
 @app.route("/project/<project_id>/job/add", methods=['post'])
 def job_add(project_id):
     project = Project.find_project_by_id(project_id)
+    if not project:
+        flash('project_id {} not found'.format(project_id), 'danger')
     job_instance = JobInstance()
     job_instance.spider_name = request.form['spider_name']
     job_instance.project_id = project_id
@@ -589,6 +593,9 @@ def job_stop(project_id, job_exec_id):
 @app.route("/project/<project_id>/jobexecs/<job_exec_id>/log")
 def job_log(project_id, job_exec_id):
     job_execution = JobExecution.query.filter_by(project_id=project_id, id=job_exec_id).first()
+    if not job_execution:
+        return render_template(
+            "job_log.html", log_lines='project id: {}, job exec id: {}'.format(project_id, job_exec_id))
     res = requests.get(agent.log_url(job_execution))
     res.encoding = 'utf8'
     raw = res.text
@@ -628,6 +635,8 @@ def spider_dashboard(project_id):
 @app.route("/project/<project_id>/spider/deploy")
 def spider_deploy(project_id):
     project = Project.find_project_by_id(project_id)
+    if not project:
+        flash('project_id {} not found'.format(project_id), 'danger')
     return render_template("spider_deploy.html")
 
 
@@ -655,12 +664,16 @@ def spider_egg_upload(project_id):
 @app.route("/project/<project_id>/project/stats")
 def project_stats(project_id):
     project = Project.find_project_by_id(project_id)
+    if not project:
+        flash('project_id {} not found'.format(project_id), 'danger')
     run_stats = JobExecution.list_run_stats_by_hours(project_id)
     return render_template("project_stats.html", run_stats=run_stats)
 
 
 @app.route("/project/<project_id>/server/stats")
 def service_stats(project_id):
     project = Project.find_project_by_id(project_id)
+    if not project:
+        flash('project_id {} not found'.format(project_id), 'danger')
     run_stats = JobExecution.list_run_stats_by_hours(project_id)
     return render_template("server_stats.html", run_stats=run_stats)
11 changes: 9 additions & 2 deletions SpiderKeeper/app/spider/model.py
@@ -1,12 +1,15 @@
 import datetime
 
 from sqlalchemy import desc
 
 from SpiderKeeper.app import db, Base
 
 
 class Project(Base):
     __tablename__ = 'sk_project'
 
     project_name = db.Column(db.String(50))
+    valid = db.Column(db.Integer)
 
     @classmethod
     def load_project(cls, project_list):
@@ -26,6 +29,10 @@ def to_dict(self):
             "project_name": self.project_name
         }
 
+    @classmethod
+    def get_valid_project(cls):
+        return cls.query.filter_by(valid=1).all()
+
 
 class SpiderInstance(Base):
     __tablename__ = 'sk_spider'
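
Because valid is a brand-new column, rows created before this PR would hold valid = NULL in an existing database, and filter_by(valid=1) would silently skip them; the repository ships no migration tooling, so the column itself must also be added (or the SQLite file recreated). A hypothetical one-off backfill using the repo's own objects:

# Hypothetical backfill: mark pre-existing projects (valid is NULL) as valid
# so Project.get_valid_project() still returns them.
from SpiderKeeper.app import db
from SpiderKeeper.app.spider.model import Project

Project.query.filter(Project.valid.is_(None)).update(
    {Project.valid: 1}, synchronize_session=False)
db.session.commit()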
@@ -63,7 +70,7 @@ def to_dict(self):
     @classmethod
     def list_spiders(cls, project_id):
         sql_last_runtime = '''
-            select * from (select a.spider_name,b.date_created from sk_job_instance as a
+            select c.spider_name, max(c.date_created) from (select a.spider_name,b.date_created from sk_job_instance as a
             left join sk_job_execution as b
             on a.id = b.job_instance_id
             order by b.date_created desc) as c
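
This hunk is the PR's title fix. The old select * over an ordered subquery only happened to surface the newest date_created per spider on some SQLite builds; once the query's group by c.spider_name (below the visible lines, as the aggregate implies) meets MySQL 5.7+ with ONLY_FULL_GROUP_BY, selecting non-aggregated columns is a hard SQL error. max(c.date_created) states the intent explicitly. A hedged SQLAlchemy equivalent, with the project_id filter assumed from the method's signature:

# Hedged ORM equivalent of the corrected SQL: newest execution time per spider,
# via an explicit max() aggregate instead of relying on subquery row order.
from sqlalchemy import func
from SpiderKeeper.app import db
from SpiderKeeper.app.spider.model import JobInstance, JobExecution

def last_run_times(project_id):
    return (db.session.query(JobInstance.spider_name,
                             func.max(JobExecution.date_created))
            .outerjoin(JobExecution,
                       JobInstance.id == JobExecution.job_instance_id)
            .filter(JobInstance.project_id == project_id)
            .group_by(JobInstance.spider_name)
            .all())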
@@ -201,7 +208,7 @@ def list_jobs(cls, project_id, each_status_limit=100):
         result['COMPLETED'] = [job_execution.to_dict() for job_execution in
                                JobExecution.query.filter(JobExecution.project_id == project_id).filter(
                                    (JobExecution.running_status == SpiderStatus.FINISHED) | (
-                                       JobExecution.running_status == SpiderStatus.CANCELED)).order_by(
+                                            JobExecution.running_status == SpiderStatus.CANCELED)).order_by(
                                    desc(JobExecution.date_modified)).limit(each_status_limit)]
         return result
 
16 changes: 9 additions & 7 deletions SpiderKeeper/config.py
@@ -1,14 +1,15 @@
 # Statement for enabling the development environment
 import os
 
-DEBUG = True
+DEBUG = False
 
 # Define the application directory
 
 
 BASE_DIR = os.path.abspath(os.path.dirname(__file__))
 
-SQLALCHEMY_DATABASE_URI = 'sqlite:///' + os.path.join(os.path.abspath('.'), 'SpiderKeeper.db')
+_DEFAULT_URI = 'sqlite:///' + os.path.join(os.path.abspath('.'), 'SpiderKeeper.db')
+SQLALCHEMY_DATABASE_URI = os.getenv('SK_DATABASE_URI', _DEFAULT_URI)
 SQLALCHEMY_TRACK_MODIFICATIONS = False
 DATABASE_CONNECT_OPTIONS = {}
 
@@ -23,20 +24,21 @@
 
 # Use a secure, unique and absolutely secret key for
 # signing the data.
-CSRF_SESSION_KEY = "secret"
+CSRF_SESSION_KEY = "36a372e9c1988acc04d12ea92f194a2a8f4b6e2c"
 
 # Secret key for signing cookies
-SECRET_KEY = "secret"
+SECRET_KEY = "72e2fc5300409e20fda661734ceb03a3d6d72d50"
 
 # log
 LOG_LEVEL = 'INFO'
 
 # spider services
+_DEFAULT_SERVER = 'http://localhost:6800'
 SERVER_TYPE = 'scrapyd'
-SERVERS = ['http://localhost:6800']
+SERVERS = os.getenv('SK_SERVERS', _DEFAULT_SERVER).split(',')
 
 # basic auth
 NO_AUTH = False
-BASIC_AUTH_USERNAME = 'admin'
-BASIC_AUTH_PASSWORD = 'admin'
+BASIC_AUTH_USERNAME = os.getenv('SK_USERNAME', 'admin')
+BASIC_AUTH_PASSWORD = os.getenv('SK_PASSWORD', 'admin')
 BASIC_AUTH_FORCE = True
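
The config now prefers SK_* environment variables over hard-coded values, which is what makes the new Dockerfile usable without baking settings into the image. Note the bare split(','): whitespace around commas would survive into the server URLs. A small sketch of how the values resolve (SK_* names come from this diff):

import os

os.environ['SK_SERVERS'] = 'http://scrapyd-a:6800,http://scrapyd-b:6800'
servers = os.getenv('SK_SERVERS', 'http://localhost:6800').split(',')
print(servers)     # ['http://scrapyd-a:6800', 'http://scrapyd-b:6800']

username = os.getenv('SK_USERNAME', 'admin')   # unset vars fall back to defaults
print(username)    # 'admin'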
64 changes: 1 addition & 63 deletions SpiderKeeper/run.py
@@ -1,71 +1,9 @@
-import logging
-import os
-from optparse import OptionParser
-
 from SpiderKeeper.app import app, initialize
 
 
 def main():
-    opts, args = parse_opts(app.config)
-    app.config.update(dict(
-        SERVER_TYPE=opts.server_type,
-        SERVERS=opts.servers or app.config.get('SERVERS'),
-        SQLALCHEMY_DATABASE_URI=opts.database_url,
-        BASIC_AUTH_USERNAME=opts.username,
-        BASIC_AUTH_PASSWORD=opts.password,
-        NO_AUTH=opts.no_auth
-    ))
-    if opts.verbose:
-        app.logger.setLevel(logging.DEBUG)
     initialize()
-    app.logger.info("SpiderKeeper startd on %s:%s username:%s/password:%s with %s servers:%s" % (
-        opts.host, opts.port, opts.username, opts.password, opts.server_type, ','.join(app.config.get('SERVERS', []))))
-    app.run(host=opts.host, port=opts.port, use_reloader=False, threaded=True)
-
-
-def parse_opts(config):
-    parser = OptionParser(usage="%prog [options]",
-                          description="Admin ui for spider service")
-    parser.add_option("--host",
-                      help="host, default:0.0.0.0",
-                      dest='host',
-                      default='0.0.0.0')
-    parser.add_option("--port",
-                      help="port, default:5000",
-                      dest='port',
-                      type="int",
-                      default=5000)
-    parser.add_option("--username",
-                      help="basic auth username ,default: %s" % config.get('BASIC_AUTH_USERNAME'),
-                      dest='username',
-                      default=config.get('BASIC_AUTH_USERNAME'))
-    parser.add_option("--password",
-                      help="basic auth password ,default: %s" % config.get('BASIC_AUTH_PASSWORD'),
-                      dest='password',
-                      default=config.get('BASIC_AUTH_PASSWORD'))
-    parser.add_option("--type",
-                      help="access spider server type, default: %s" % config.get('SERVER_TYPE'),
-                      dest='server_type',
-                      default=config.get('SERVER_TYPE'))
-    parser.add_option("--server",
-                      help="servers, default: %s" % config.get('SERVERS'),
-                      dest='servers',
-                      action='append',
-                      default=[])
-    parser.add_option("--database-url",
-                      help='SpiderKeeper metadata database default: %s' % config.get('SQLALCHEMY_DATABASE_URI'),
-                      dest='database_url',
-                      default=config.get('SQLALCHEMY_DATABASE_URI'))
-
-    parser.add_option("--no-auth",
-                      help="disable basic auth",
-                      dest='no_auth',
-                      action='store_true')
-    parser.add_option("-v", "--verbose",
-                      help="log level",
-                      dest='verbose',
-                      action='store_true')
-    return parser.parse_args()
+    app.run(host='0.0.0.0', port=5000, threaded=True)
 
 
 if __name__ == '__main__':
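
With the optparse CLI removed, host, port, servers, and credentials are no longer flags; everything flows through config.py and the SK_* variables above. A hedged launch sketch; the variables must be set before SpiderKeeper is imported, because config.py reads them at import time:

# Hedged launcher: configure via SK_* env vars (names from this PR's config.py)
# before importing SpiderKeeper, since config values are read at import time.
import os

os.environ.setdefault('SK_DATABASE_URI', 'sqlite:////data/SpiderKeeper.db')
os.environ.setdefault('SK_SERVERS', 'http://localhost:6800')

from SpiderKeeper.run import main    # main() calls initialize() then app.run(...)

main()                               # serves on 0.0.0.0:5000, threaded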
11 changes: 0 additions & 11 deletions SpiderKeeper/uwsgi.py

This file was deleted.
