From 35366be0b7ce9dcf39782af9034f54f3a7c050f0 Mon Sep 17 00:00:00 2001
From: will-horning
Date: Fri, 28 Jul 2017 14:16:21 -0400
Subject: [PATCH 1/2] [capitol-words] ported crec scraper to work as a celery
 task in django

---
 .../capitolweb/capitolweb/__init__.py        |  5 ++
 .../capitolweb/capitolweb/settings.py        | 14 +++-
 .../capitolweb/workers/__init__.py           |  1 +
 .../{scrapers => capitolweb/workers}/crec.py | 47 ++++------
 capitolwords_ng/requirements.txt             | 83 ++++++++++++++---
 capitolwords_ng/run_crec_es_uploader.py      |  4 +-
 6 files changed, 108 insertions(+), 46 deletions(-)
 create mode 100644 capitolwords_ng/capitolweb/workers/__init__.py
 rename capitolwords_ng/{scrapers => capitolweb/workers}/crec.py (86%)

diff --git a/capitolwords_ng/capitolweb/capitolweb/__init__.py b/capitolwords_ng/capitolweb/capitolweb/__init__.py
index e69de29..9e21b83 100644
--- a/capitolwords_ng/capitolweb/capitolweb/__init__.py
+++ b/capitolwords_ng/capitolweb/capitolweb/__init__.py
@@ -0,0 +1,5 @@
+from __future__ import absolute_import, unicode_literals
+
+from .celery import app as celery_app
+
+__all__ = ['celery_app']
\ No newline at end of file
diff --git a/capitolwords_ng/capitolweb/capitolweb/settings.py b/capitolwords_ng/capitolweb/capitolweb/settings.py
index ccb29c1..cbdd15a 100644
--- a/capitolwords_ng/capitolweb/capitolweb/settings.py
+++ b/capitolwords_ng/capitolweb/capitolweb/settings.py
@@ -20,6 +20,8 @@
 DEV_FRONTEND = True
 DEV_FRONTEND_SPA_BASE_URL = 'http://localhost:3000'
 
+CREC_STAGING_BUCKET = 'capitol-words-data'
+CREC_STAGING_S3_KEY_PREFIX = 'crec'
 
 # Build paths inside the project like this: os.path.join(BASE_DIR, ...)
 BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
@@ -49,6 +51,9 @@
     'legislators',
     'rest_framework',
     'rest_framework_swagger',
+    'django_celery_beat',
+    'django_celery_results',
+    'workers',
 ]
 
 MIDDLEWARE = [
@@ -149,4 +154,11 @@
             'level': os.getenv('DJANGO_LOG_LEVEL', 'INFO'),
         },
     },
-}
\ No newline at end of file
+}
+
+CELERY_RESULT_BACKEND = 'django-db'
+CELERY_BROKER_URL = 'amqp://guest:guest@localhost//'
+CELERY_BEAT_SCHEDULER = 'django_celery_beat.schedulers:DatabaseScheduler'
+CELERY_ACCEPT_CONTENT = ['json']
+CELERY_TASK_SERIALIZER = 'json'
+CELERY_RESULT_SERIALIZER = 'json'
diff --git a/capitolwords_ng/capitolweb/workers/__init__.py b/capitolwords_ng/capitolweb/workers/__init__.py
new file mode 100644
index 0000000..b311a46
--- /dev/null
+++ b/capitolwords_ng/capitolweb/workers/__init__.py
@@ -0,0 +1 @@
+from .tasks import scrape_crecs
diff --git a/capitolwords_ng/scrapers/crec.py b/capitolwords_ng/capitolweb/workers/crec.py
similarity index 86%
rename from capitolwords_ng/scrapers/crec.py
rename to capitolwords_ng/capitolweb/workers/crec.py
index 762e8ed..4caef99 100644
--- a/capitolwords_ng/scrapers/crec.py
+++ b/capitolwords_ng/capitolweb/workers/crec.py
@@ -1,40 +1,26 @@
-"""Service for staging unpacked html files from a daily zip of congressional
+"""Stages unpacked html files from a daily zip of congressional
 records retrieved from gpo.gov.
 
-This module can be used either from the command line, or deployed as an AWS
-Lambda function (see :func:``lambda_handler`` for details on lambda execution).
-
-To run locally:
-
-    ::
-
-        python crec_stager.py --s3_bucket=mybukkit
-
 Attributes:
     DEFAULT_LOG_FORMAT (:obj:`str`): A template string for log lines.
     LOGLEVELS (:obj:`dict`): A lookup of loglevel name to the loglevel code.
""" -from __future__ import print_function - import os import sys -import urllib2 import logging import argparse from datetime import datetime from datetime import timedelta from zipfile import ZipFile from collections import defaultdict +from urllib.request import urlopen +from urllib.error import HTTPError import boto3 import requests from botocore.exceptions import ClientError -from cli import setup_logger -from cli import add_logging_options -from cli import CMD_LINE_DATE_FORMAT - def get_dates(start_dt, end_dt=None): if end_dt is None: @@ -73,7 +59,7 @@ def __init__(self, download_dir, s3_bucket, s3_key_prefix): self.download_dir = download_dir self.s3_bucket = s3_bucket self.s3_key_prefix = s3_key_prefix - self.s3 = boto3.client('s3') + self.s3 = boto3.resource('s3') def download_crec_zip(self, date): """Downloads the CREC zip for this date. @@ -84,8 +70,8 @@ def download_crec_zip(self, date): url = date.strftime(self.CREC_ZIP_TEMPLATE) logging.info('Downloading CREC zip from "{0}".'.format(url)) try: - response = urllib2.urlopen(url) - except urllib2.HTTPError as e: + response = urlopen(url) + except HTTPError as e: if e.getcode() == 404: logging.info('No zip found for date {0}'.format(date)) else: @@ -106,8 +92,8 @@ def download_mods_xml(self, date): url = date.strftime(self.MODS_ZIP_TEMPLATE) logging.info('Downloading mods.xml from "{0}".'.format(url)) try: - response = urllib2.urlopen(url) - except urllib2.HTTPError as e: + response = urlopen(url) + except HTTPError as e: if e.getcode() == 404: logging.debug('No mods.xml found for date {0}, at "{1}"'.format( date, url @@ -118,9 +104,9 @@ def download_mods_xml(self, date): return None data = response.read() mods_path = os.path.join(self.download_dir, 'mods.xml') - with open(mods_path, 'w') as f: + with open(mods_path, 'wb') as f: f.write(data) - return mods_path + return str(mods_path) def extract_html_files(self, zip_path): """Unpacks all html files in the zip at the provided path to the value @@ -162,15 +148,12 @@ def upload_to_s3(self, file_path, data_type, date): data_type, os.path.basename(file_path), ) - with open(file_path) as f: - logging.debug( - 'Uploading "{0}" to "s3://{1}/{2}".'.format( - file_path, self.s3_bucket, s3_key - ) - ) - self.s3.put_object( - Body=f, Bucket=self.s3_bucket, Key=s3_key + logging.debug( + 'Uploading "{0}" to "s3://{1}/{2}".'.format( + file_path, self.s3_bucket, s3_key ) + ) + self.s3.Object(self.s3_bucket, s3_key).upload_file(file_path) return s3_key def scrape_files_for_date(self, date): diff --git a/capitolwords_ng/requirements.txt b/capitolwords_ng/requirements.txt index ee21f18..fadc669 100644 --- a/capitolwords_ng/requirements.txt +++ b/capitolwords_ng/requirements.txt @@ -1,16 +1,77 @@ -requests==2.10.0 -boto3==1.2.1 -botocore==1.3.1 -elasticsearch>=5.0.0,<6.0.0 +amqp==2.2.1 +appdirs==1.4.3 +billiard==3.5.0.3 +boto3==1.4.4 +botocore==1.5.89 +cachetools==2.0.0 +celery==4.1.0 +certifi==2017.4.17 +cffi==1.10.0 +chardet==3.0.3 +cld2-cffi==0.1.4 +coreapi==2.3.1 +coreschema==0.0.4 +cymem==1.31.2 +cytoolz==0.8.2 +DateTime==4.2 +decorator==4.1.2 +dill==0.2.6 Django==1.11.3 -elasticsearch-dsl>=5.0.0,<6.0.0 -lxml==3.7.2 +django-celery-beat==1.0.1 +django-celery-results==1.0.1 +django-rest-swagger==2.1.2 djangorestframework==3.6.3 -PyYAML==3.12 -coreapi==2.3.1 -Pygments==2.2.0 +docutils==0.13.1 +elasticsearch==5.4.0 +elasticsearch-dsl==5.3.0 +ftfy==4.4.3 +html5lib==0.999999999 +idna==2.5 +ijson==2.3 +itypes==1.1.0 +Jinja2==2.9.6 +jmespath==0.9.3 +kombu==4.1.0 +lxml==3.7.2 Markdown==2.6.8 
+MarkupSafe==1.0
+murmurhash==0.26.4
+networkx==1.11
+numpy==1.12.1
+openapi-codec==1.3.2
+packaging==16.8
+pathlib==1.0.1
+plac==0.9.6
+preshed==1.0.0
+pycparser==2.18
+pyemd==0.4.4
+Pygments==2.2.0
+pyparsing==2.2.0
+Pyphen==0.9.4
+python-dateutil==2.6.1
+python-Levenshtein==0.12.0
+pytz==2017.2
+PyYAML==3.12
+regex==2017.4.5
+requests==2.10.0
+s3transfer==0.1.10
+scikit-learn==0.18.2
+scipy==0.19.1
+simplejson==3.11.1
+six==1.10.0
 spacy==1.8.2
+SQLAlchemy==1.1.12
+termcolor==1.1.0
 textacy==0.3.4
-django-rest-swagger==2.1.2
-DateTime==4.2
+thinc==6.5.2
+toolz==0.8.2
+tqdm==4.14.0
+ujson==1.35
+Unidecode==0.4.21
+uritemplate==3.0.0
+urllib3==1.21.1
+vine==1.1.4
+wcwidth==0.1.7
+webencodings==0.5.1
+wrapt==1.10.10
+zope.interface==4.4.2
diff --git a/capitolwords_ng/run_crec_es_uploader.py b/capitolwords_ng/run_crec_es_uploader.py
index eacdfea..45ad566 100644
--- a/capitolwords_ng/run_crec_es_uploader.py
+++ b/capitolwords_ng/run_crec_es_uploader.py
@@ -26,7 +26,7 @@
 output_option_group = parser.add_mutually_exclusive_group(required=True)
 output_option_group.add_argument(
     '--to_stdout',
-    help='If true, will not upload to es and instead print to stdout.',
+    help='If set, will print to stdout instead of uploading to Elasticsearch.',
     action='store_true'
 )
 output_option_group.add_argument(
@@ -47,7 +47,7 @@
 )
 parser.add_argument(
     '--source_bucket',
-    help='Location of crec data.',
+    help='Name of the S3 bucket containing crec source data.',
 )
 
 args = parser.parse_args()
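Note for review: patch 1 wires Celery into the Django project (capitolweb/__init__.py imports celery_app from .celery, and workers/__init__.py imports scrape_crecs from .tasks), but neither capitolweb/celery.py nor workers/tasks.py appears anywhere in this series. As a reference point only, a minimal sketch of the missing app module, assuming the standard Celery 4 Django wiring (config_from_object plus autodiscover_tasks) rather than anything confirmed by the patches:

    # capitolweb/capitolweb/celery.py (assumed contents; not part of this patch)
    from __future__ import absolute_import, unicode_literals

    import os

    from celery import Celery

    # Point Celery at the project settings module before creating the app.
    os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'capitolweb.settings')

    app = Celery('capitolweb')

    # Read every CELERY_*-prefixed value added to settings.py in this patch.
    app.config_from_object('django.conf:settings', namespace='CELERY')

    # Discover tasks.py modules in all INSTALLED_APPS, including 'workers'.
    app.autodiscover_tasks()

With the broker URL configured above, a worker can then be started from the capitolweb directory with: celery -A capitolweb worker -l info
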
From a6a63cde426322b17c23ebe0b888a8531d3dfc30 Mon Sep 17 00:00:00 2001
From: will-horning
Date: Fri, 28 Jul 2017 15:46:17 -0400
Subject: [PATCH 2/2] [capitol-words] factored crec scraper config out into
 settings; added support for override arguments from the UI to scrape all
 days within a datetime range

---
 .../capitolweb/capitolweb/settings.py      |  3 +-
 capitolwords_ng/capitolweb/workers/crec.py | 36 +++++++++++--------
 2 files changed, 23 insertions(+), 16 deletions(-)

diff --git a/capitolwords_ng/capitolweb/capitolweb/settings.py b/capitolwords_ng/capitolweb/capitolweb/settings.py
index cbdd15a..952eb32 100644
--- a/capitolwords_ng/capitolweb/capitolweb/settings.py
+++ b/capitolwords_ng/capitolweb/capitolweb/settings.py
@@ -20,8 +20,9 @@
 DEV_FRONTEND = True
 DEV_FRONTEND_SPA_BASE_URL = 'http://localhost:3000'
 
-CREC_STAGING_BUCKET = 'capitol-words-data'
+CREC_STAGING_S3_BUCKET = 'capitol-words-data'
 CREC_STAGING_S3_KEY_PREFIX = 'crec'
+CREC_STAGING_FOLDER = '/tmp'
 
 # Build paths inside the project like this: os.path.join(BASE_DIR, ...)
 BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
diff --git a/capitolwords_ng/capitolweb/workers/crec.py b/capitolwords_ng/capitolweb/workers/crec.py
index 4caef99..864796d 100644
--- a/capitolwords_ng/capitolweb/workers/crec.py
+++ b/capitolwords_ng/capitolweb/workers/crec.py
@@ -160,11 +160,13 @@ def scrape_files_for_date(self, date):
         zip_path = self.download_crec_zip(date)
         mods_path = self.download_mods_xml(date)
         if zip_path is None:
-            logging.info('No zip found for date {0}'.format(date))
-            return None
+            message = 'No zip found for date {0}.'.format(date)
+            logging.info(message)
+            return {'success': True, 'message': message}
         if mods_path is None:
-            logging.info('No mods.xml found for date {0}'.format(date))
-            return None
+            message = 'No mods.xml found for date {0}.'.format(date)
+            logging.info(message)
+            return {'success': True, 'message': message}
         logging.info(
             'Extracting html files from zip to {0}'.format(self.download_dir)
         )
@@ -171,24 +173,26 @@ def scrape_files_for_date(self, date):
         html_file_paths = self.extract_html_files(zip_path)
         try:
             s3_key = self.upload_to_s3(mods_path, 'mods', date)
         except ClientError as e:
-            logging.exception(
-                'Error uploading file {0}, exiting'.format(mods_path, e)
-            )
-            return False
+            message = 'Error uploading file {0}: {1}'.format(mods_path, e)
+            logging.exception(message)
+            return {'success': False, 'message': message}
         logging.info('Uploading {0} html files...'.format(len(html_file_paths)))
         for file_path in html_file_paths:
             try:
                 s3_key = self.upload_to_s3(file_path, 'crec', date)
             except ClientError as e:
-                logging.exception(
-                    'Error uploading file {0}, exiting'.format(file_path, e)
-                )
-                return False
+                message = 'Error uploading file {0}: {1}'.format(file_path, e)
+                logging.exception(message)
+                return {'success': False, 'message': message}
         logging.info('Uploads finished.')
-        return True
+        return {
+            'success': True,
+            'message': '{0} crec html files uploaded.'.format(len(html_file_paths))
+        }
 
-    def scrape_files_in_range(self, start_dt, end_dt):
+    def scrape_files_in_range(self, start_dt, end_dt=None):
+        results = []
         if end_dt is None:
             end_dt = datetime.utcnow()
             end_dt = end_dt.replace(hour=0, minute=0, second=0, microsecond=0)
@@ -197,5 +201,7 @@ def scrape_files_in_range(self, start_dt, end_dt):
         dt = start_dt.replace(hour=0, minute=0, second=0, microsecond=0)
         dates = []
         while dt < end_dt:
-            self.scrape_files_for_date(dt)
+            result = self.scrape_files_for_date(dt)
+            results.append(result)
             dt += timedelta(days=1)
+        return results
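Note for review: workers/tasks.py is likewise absent from the series even though workers/__init__.py imports scrape_crecs from it. A minimal sketch of what that task might look like under the settings introduced here; the scraper's class name is not visible in any hunk, so CRECScraper below is an assumption, and the ISO date strings reflect the json task serializer configured in settings.py (datetime objects cannot cross the broker natively):

    # capitolweb/workers/tasks.py (hypothetical sketch; not part of this patch)
    from datetime import datetime, timedelta

    from celery import shared_task
    from django.conf import settings

    from .crec import CRECScraper  # assumed class name


    @shared_task
    def scrape_crecs(start_date=None, end_date=None):
        """Stage CREC files for each day in [start_date, end_date) into S3.

        Dates arrive as 'YYYY-MM-DD' strings (the task serializer is json);
        with no overrides from the UI, the previous day is scraped.
        """
        end_dt = datetime.strptime(end_date, '%Y-%m-%d') if end_date else datetime.utcnow()
        start_dt = datetime.strptime(start_date, '%Y-%m-%d') if start_date else end_dt - timedelta(days=1)
        scraper = CRECScraper(
            settings.CREC_STAGING_FOLDER,
            settings.CREC_STAGING_S3_BUCKET,
            settings.CREC_STAGING_S3_KEY_PREFIX,
        )
        # scrape_files_in_range returns one result dict per day in the range.
        return scraper.scrape_files_in_range(start_dt, end_dt)

With django_celery_beat's DatabaseScheduler configured in settings.py, a daily PeriodicTask pointing at workers.tasks.scrape_crecs would then replace the old command-line or Lambda trigger.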