diff --git a/README.md b/README.md
index 52c81f11..af5db14e 100644
--- a/README.md
+++ b/README.md
@@ -80,6 +80,34 @@
 Visit:
 ```
 
+## Feed params
+
+You may specify the following `scrapy` feed params in `config.py`:
+- `FEED_URI` - the path Scrapy uses to store the feed.
+All storage backends (S3, FTP, local filesystem) are supported.
+- `FEED_FORMAT` - the format of the exported file.
+- `EXPORT_URI` - the path the feed can be retrieved from.
+
+`FEED_URI` and `EXPORT_URI` can contain the following params:
+- `%(name)s` - the spider name
+- `%(create_time)s` - the time the job execution started
+- `%(job_id)s` - the job execution id
+- any other params from `Args` set while adding jobs.
+
+If `EXPORT_URI` is not defined, the export URI is equal to `FEED_URI`.
+If `FEED_URI` is also not defined, it is not passed to the spider.
+The same applies to `FEED_FORMAT`.
+
+Example:
+```
+FEED_FORMAT = 'csv'
+FEED_URI = 's3://bucket/%(name)s/%(job_id)s_%(create_time)s.csv'
+EXPORT_URI = 'https://s3.amazonaws.com/bucket/%(name)s/%(job_id)s_%(create_time)s.csv'
+```
+If a job has an export URI (i.e. `FEED_URI` is defined in `config.py`), an `Export` button is displayed on the job detail page.
+
+Note: `boto3` must be installed for uploading to S3.
+
 ## TODO
 - [ ] Job dashboard support filter
 - [x] User Authentication
diff --git a/SpiderKeeper/app/proxy/spiderctrl.py b/SpiderKeeper/app/proxy/spiderctrl.py
index 2e47cfab..4b2d1377 100644
--- a/SpiderKeeper/app/proxy/spiderctrl.py
+++ b/SpiderKeeper/app/proxy/spiderctrl.py
@@ -1,7 +1,7 @@
 import datetime
 import random
-from functools import reduce
+import SpiderKeeper.config as config
 
 from SpiderKeeper.app import db
 from SpiderKeeper.app.spider.model import SpiderStatus, JobExecution, JobInstance, Project, JobPriority
 
@@ -147,16 +147,68 @@
         for i in range(threshold):
             leaders.append(random.choice(candidates))
         for leader in leaders:
-            serviec_job_id = leader.start_spider(project.project_name, spider_name, arguments)
             job_execution = JobExecution()
             job_execution.project_id = job_instance.project_id
-            job_execution.service_job_execution_id = serviec_job_id
             job_execution.job_instance_id = job_instance.id
             job_execution.create_time = datetime.datetime.now()
             job_execution.running_on = leader.server
             db.session.add(job_execution)
             db.session.commit()
+            feed_settings = self.get_feed_params(
+                job_execution,
+                spider_name,
+                arguments
+            )
+            if feed_settings:
+                arguments['setting'] = feed_settings
+
+            service_job_id = leader.start_spider(
+                project.project_name,
+                spider_name,
+                arguments
+            )
+
+            job_execution.service_job_execution_id = service_job_id
+            db.session.commit()
+
+    def get_feed_params(self, job_execution, spider_name, args):
+        """Pass FEED_URI and FEED_FORMAT params to spider settings.
+
+        Save EXPORT_URI to the db as well.
+
+        """
+        custom_settings = []
+        feed_uri, export_uri = self.get_feed_uri(
+            job_execution,
+            spider_name,
+            args
+        )
+        if feed_uri:
+            job_execution.export_uri = export_uri
+            custom_settings.append(
+                'FEED_URI={}'.format(feed_uri))
+        if config.FEED_FORMAT:
+            custom_settings.append(
+                'FEED_FORMAT={}'.format(config.FEED_FORMAT)
+            )
+        return custom_settings
+
+    @staticmethod
+    def get_feed_uri(job_execution, spider_name, args):
+        """Pass params to FEED_URI and EXPORT_URI and return the result."""
+        if not config.FEED_URI:
+            return None, None
+        params = {
+            'name': spider_name,
+            'job_id': job_execution.id,
+            'create_time':
+                job_execution.create_time.strftime('%Y-%m-%d_%H-%M-%S')
+        }
+        params.update({key: value[0] for key, value in args.items()})
+        export_uri = config.EXPORT_URI if config.EXPORT_URI else config.FEED_URI
+        return config.FEED_URI % params, export_uri % params
+
 
     def cancel_spider(self, job_execution):
         job_instance = JobInstance.find_job_instance_by_id(job_execution.job_instance_id)
         project = Project.find_project_by_id(job_instance.project_id)
diff --git a/SpiderKeeper/app/spider/controller.py b/SpiderKeeper/app/spider/controller.py
index 296126b3..152e6763 100644
--- a/SpiderKeeper/app/spider/controller.py
+++ b/SpiderKeeper/app/spider/controller.py
@@ -534,6 +534,18 @@ def job_dashboard(project_id):
     return render_template("job_dashboard.html", job_status=JobExecution.list_jobs(project_id))
 
 
+@app.route("/job/<job_execution_id>/detail")
+def job_detail(job_execution_id):
+    job_execution = JobExecution.query.filter_by(id=job_execution_id).first()
+    job_instance = JobInstance.find_job_instance_by_id(job_execution.job_instance_id) \
+        if job_execution else None
+    return render_template(
+        'job_detail.html',
+        job=job_execution,
+        job_instance=job_instance,
+    )
+
+
 @app.route("/project/<project_id>/job/periodic")
 def job_periodic(project_id):
     project = Project.find_project_by_id(project_id)
diff --git a/SpiderKeeper/app/spider/model.py b/SpiderKeeper/app/spider/model.py
index 5376602b..4e2526cd 100644
--- a/SpiderKeeper/app/spider/model.py
+++ b/SpiderKeeper/app/spider/model.py
@@ -151,13 +151,14 @@ class JobExecution(Base):
     __tablename__ = 'sk_job_execution'
 
     project_id = db.Column(db.INTEGER, nullable=False, index=True)
-    service_job_execution_id = db.Column(db.String(50), nullable=False, index=True)
+    service_job_execution_id = db.Column(db.String(50), index=True)
     job_instance_id = db.Column(db.INTEGER, nullable=False, index=True)
     create_time = db.Column(db.DATETIME)
     start_time = db.Column(db.DATETIME)
     end_time = db.Column(db.DATETIME)
     running_status = db.Column(db.INTEGER, default=SpiderStatus.PENDING)
     running_on = db.Column(db.Text)
+    export_uri = db.Column(db.Text, nullable=True)
 
     def to_dict(self):
         job_instance = JobInstance.query.filter_by(id=self.job_instance_id).first()
diff --git a/SpiderKeeper/app/templates/job_dashboard.html b/SpiderKeeper/app/templates/job_dashboard.html
index 494b93d7..ce47320c 100644
--- a/SpiderKeeper/app/templates/job_dashboard.html
+++ b/SpiderKeeper/app/templates/job_dashboard.html
@@ -37,8 +37,12 @@

Next Jobs

{% for job in job_status.PENDING %} {% if job.job_instance %} - {{ job.job_execution_id }} - {{ job.job_instance_id }} + + + {{ job.job_execution_id }} + + + {{ job.job_instance_id }} {{ job.job_instance.spider_name }} {{ job.job_instance.spider_arguments }} @@ -94,8 +98,12 @@

Running Jobs

{% for job in job_status.RUNNING %} {% if job.job_instance %} - {{ job.job_execution_id }} - {{ job.job_instance_id }} + + + {{ job.job_execution_id }} + + + {{ job.job_instance_id }} {{ job.job_instance.spider_name }} {{ job.job_instance.spider_arguments }} @@ -159,8 +167,12 @@

Completed Jobs

{% for job in job_status.COMPLETED %} {% if job.job_instance %} - {{ job.job_execution_id }} - {{ job.job_instance_id }} + + + {{ job.job_execution_id }} + + + {{ job.job_instance_id }} {{ job.job_instance.spider_name }} {{ job.job_instance.spider_arguments }} diff --git a/SpiderKeeper/app/templates/job_detail.html b/SpiderKeeper/app/templates/job_detail.html new file mode 100644 index 00000000..c6d7de91 --- /dev/null +++ b/SpiderKeeper/app/templates/job_detail.html @@ -0,0 +1,97 @@ +{% extends "base.html" %} +{% block content_header %} +

Job Dashboard

+
    + + +{% if job.running_status == 2 and job.export_uri %} + + Export + +{% endif %} + +
+{% endblock %} +{% block content_body %} + + +
+
+

Job

+
+
+ + + + + + + + + + + + + {% if job %} + + + + + + {% if job_instance.priority == -1 %} + + {% elif job_instance.priority == 0 %} + + {% elif job_instance.priority == 1 %} + + {% elif job_instance.priority == 2 %} + + {% endif %} + + + + + + {% endif %} +
#JobSpiderArgsPriorityRuntimeStartedLogStatus
{{ job.id }}{{ job.job_instance_id }}{{ job_instance.spider_name }}{{ job_instance.spider_arguments }} + + LOW + + NORMAL + + HIGH + + HIGHEST + + {% if job.running_status in [0, 1] %} + {{ timedelta(now,job.start_time) }} + {% else %} + {{ timedelta(job.end_time,job.start_time) }} + {% endif %} + {{ job.start_time }} + Log + + {% if job.running_status == 0 %} + PENDING + {% elif job.running_status == 1 %} + RUNNING + {% elif job.running_status == 2 %} + FINISHED + {% else %} + CANCELED + {% endif %} +
+
+
+{% endblock %}
\ No newline at end of file
diff --git a/SpiderKeeper/config.py b/SpiderKeeper/config.py
index 8f0722d9..21b954af 100644
--- a/SpiderKeeper/config.py
+++ b/SpiderKeeper/config.py
@@ -40,3 +40,8 @@
 BASIC_AUTH_USERNAME = 'admin'
 BASIC_AUTH_PASSWORD = 'admin'
 BASIC_AUTH_FORCE = True
+
+# feed params
+FEED_FORMAT = None
+FEED_URI = None
+EXPORT_URI = None
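
Worked example of the placeholder interpolation described in the Feed params section and implemented by `get_feed_uri`/`get_feed_params` above. This is a standalone sketch: the spider name, job execution id, create time, and the extra `region` arg are illustrative values, and the feed settings are copied from the README example rather than read from `config.py`.

```python
import datetime

# Feed settings copied from the README example (illustrative, not imported from config.py).
FEED_FORMAT = 'csv'
FEED_URI = 's3://bucket/%(name)s/%(job_id)s_%(create_time)s.csv'
EXPORT_URI = 'https://s3.amazonaws.com/bucket/%(name)s/%(job_id)s_%(create_time)s.csv'

# Placeholder values that get_feed_uri would take from the job execution and spider args.
params = {
    'name': 'quotes',   # spider name (illustrative)
    'job_id': 42,       # job execution id (illustrative)
    'create_time': datetime.datetime(2018, 1, 5, 12, 30).strftime('%Y-%m-%d_%H-%M-%S'),
}
params.update({'region': 'us'})  # any extra Args are merged in the same way

feed_uri = FEED_URI % params
export_uri = (EXPORT_URI if EXPORT_URI else FEED_URI) % params

# The strings get_feed_params would collect and store on arguments['setting'].
custom_settings = ['FEED_URI={}'.format(feed_uri), 'FEED_FORMAT={}'.format(FEED_FORMAT)]

print(feed_uri)     # s3://bucket/quotes/42_2018-01-05_12-30-00.csv
print(export_uri)   # https://s3.amazonaws.com/bucket/quotes/42_2018-01-05_12-30-00.csv
print(custom_settings)
```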
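
The settings list built above is attached to the scheduling arguments by `start_spider` (`arguments['setting'] = feed_settings`). Assuming the Scrapyd-backed proxy ultimately forwards those arguments to Scrapyd's `schedule.json` endpoint, which accepts repeated `setting=KEY=VALUE` form fields, the scheduled request body would look roughly like the sketch below; the URL and project/spider names are illustrative, and how the service proxy actually builds the request depends on the Scrapyd client it uses.

```python
from urllib.parse import urlencode

# Illustrative endpoint and names, not taken from a running SpiderKeeper instance.
schedule_url = 'http://localhost:6800/schedule.json'
arguments = {
    'project': 'myproject',
    'spider': 'quotes',
    # The list stored in arguments['setting'] by start_spider().
    'setting': [
        'FEED_URI=s3://bucket/quotes/42_2018-01-05_12-30-00.csv',
        'FEED_FORMAT=csv',
    ],
}

# doseq=True expands the list into one setting=... field per entry.
print(schedule_url)
print(urlencode(arguments, doseq=True))
# project=myproject&spider=quotes&setting=FEED_URI%3D...&setting=FEED_FORMAT%3Dcsv
```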