From 35366be0b7ce9dcf39782af9034f54f3a7c050f0 Mon Sep 17 00:00:00 2001
From: will-horning
Date: Fri, 28 Jul 2017 14:16:21 -0400
Subject: [PATCH 1/2] [capitol-words] ported crec scraper to work as a celery
 task in django

---
 .../capitolweb/capitolweb/__init__.py        |  5 ++
 .../capitolweb/capitolweb/settings.py        | 14 +++-
 .../capitolweb/workers/__init__.py           |  1 +
 .../{scrapers => capitolweb/workers}/crec.py | 47 ++++------
 capitolwords_ng/requirements.txt             | 83 ++++++++++++++---
 capitolwords_ng/run_crec_es_uploader.py      |  4 +-
 6 files changed, 108 insertions(+), 46 deletions(-)
 create mode 100644 capitolwords_ng/capitolweb/workers/__init__.py
 rename capitolwords_ng/{scrapers => capitolweb/workers}/crec.py (86%)

diff --git a/capitolwords_ng/capitolweb/capitolweb/__init__.py b/capitolwords_ng/capitolweb/capitolweb/__init__.py
index e69de29..9e21b83 100644
--- a/capitolwords_ng/capitolweb/capitolweb/__init__.py
+++ b/capitolwords_ng/capitolweb/capitolweb/__init__.py
@@ -0,0 +1,5 @@
+from __future__ import absolute_import, unicode_literals
+
+from .celery import app as celery_app
+
+__all__ = ['celery_app']
\ No newline at end of file
diff --git a/capitolwords_ng/capitolweb/capitolweb/settings.py b/capitolwords_ng/capitolweb/capitolweb/settings.py
index ccb29c1..cbdd15a 100644
--- a/capitolwords_ng/capitolweb/capitolweb/settings.py
+++ b/capitolwords_ng/capitolweb/capitolweb/settings.py
@@ -20,6 +20,8 @@
 DEV_FRONTEND = True
 DEV_FRONTEND_SPA_BASE_URL = 'http://localhost:3000'
 
+CREC_STAGING_BUCKET = 'capitol-words-data'
+CREC_STAGING_S3_KEY_PREFIX = 'crec'
 
 # Build paths inside the project like this: os.path.join(BASE_DIR, ...)
 BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
@@ -49,6 +51,9 @@
     'legislators',
     'rest_framework',
     'rest_framework_swagger',
+    'django_celery_beat',
+    'django_celery_results',
+    'workers',
 ]
 
 MIDDLEWARE = [
@@ -149,4 +154,11 @@
             'level': os.getenv('DJANGO_LOG_LEVEL', 'INFO'),
         },
     },
-}
\ No newline at end of file
+}
+
+CELERY_RESULT_BACKEND = 'django-db'
+CELERY_BROKER_URL = 'amqp://guest:guest@localhost//'
+CELERY_BEAT_SCHEDULER = 'django_celery_beat.schedulers:DatabaseScheduler'
+CELERY_ACCEPT_CONTENT = ['json']
+CELERY_TASK_SERIALIZER = 'json'
+CELERY_RESULT_SERIALIZER = 'json'
diff --git a/capitolwords_ng/capitolweb/workers/__init__.py b/capitolwords_ng/capitolweb/workers/__init__.py
new file mode 100644
index 0000000..b311a46
--- /dev/null
+++ b/capitolwords_ng/capitolweb/workers/__init__.py
@@ -0,0 +1 @@
+from .tasks import scrape_crecs
diff --git a/capitolwords_ng/scrapers/crec.py b/capitolwords_ng/capitolweb/workers/crec.py
similarity index 86%
rename from capitolwords_ng/scrapers/crec.py
rename to capitolwords_ng/capitolweb/workers/crec.py
index 762e8ed..4caef99 100644
--- a/capitolwords_ng/scrapers/crec.py
+++ b/capitolwords_ng/capitolweb/workers/crec.py
@@ -1,40 +1,26 @@
-"""Service for staging unpacked html files from a daily zip of congressional
+"""Stages unpacked html files from a daily zip of congressional
 records retrieved from gpo.gov.
 
-This module can be used either from the command line, or deployed as an AWS
-Lambda function (see :func:``lambda_handler`` for details on lambda execution).
-
-To run locally:
-
-    ::
-
-        python crec_stager.py --s3_bucket=mybukkit
-
 Attributes:
     DEFAULT_LOG_FORMAT (:obj:`str`): A template string for log lines.
     LOGLEVELS (:obj:`dict`): A lookup of loglevel name to the loglevel code.
""" -from __future__ import print_function - import os import sys -import urllib2 import logging import argparse from datetime import datetime from datetime import timedelta from zipfile import ZipFile from collections import defaultdict +from urllib.request import urlopen +from urllib.error import HTTPError import boto3 import requests from botocore.exceptions import ClientError -from cli import setup_logger -from cli import add_logging_options -from cli import CMD_LINE_DATE_FORMAT - def get_dates(start_dt, end_dt=None): if end_dt is None: @@ -73,7 +59,7 @@ def __init__(self, download_dir, s3_bucket, s3_key_prefix): self.download_dir = download_dir self.s3_bucket = s3_bucket self.s3_key_prefix = s3_key_prefix - self.s3 = boto3.client('s3') + self.s3 = boto3.resource('s3') def download_crec_zip(self, date): """Downloads the CREC zip for this date. @@ -84,8 +70,8 @@ def download_crec_zip(self, date): url = date.strftime(self.CREC_ZIP_TEMPLATE) logging.info('Downloading CREC zip from "{0}".'.format(url)) try: - response = urllib2.urlopen(url) - except urllib2.HTTPError as e: + response = urlopen(url) + except HTTPError as e: if e.getcode() == 404: logging.info('No zip found for date {0}'.format(date)) else: @@ -106,8 +92,8 @@ def download_mods_xml(self, date): url = date.strftime(self.MODS_ZIP_TEMPLATE) logging.info('Downloading mods.xml from "{0}".'.format(url)) try: - response = urllib2.urlopen(url) - except urllib2.HTTPError as e: + response = urlopen(url) + except HTTPError as e: if e.getcode() == 404: logging.debug('No mods.xml found for date {0}, at "{1}"'.format( date, url @@ -118,9 +104,9 @@ def download_mods_xml(self, date): return None data = response.read() mods_path = os.path.join(self.download_dir, 'mods.xml') - with open(mods_path, 'w') as f: + with open(mods_path, 'wb') as f: f.write(data) - return mods_path + return str(mods_path) def extract_html_files(self, zip_path): """Unpacks all html files in the zip at the provided path to the value @@ -162,15 +148,12 @@ def upload_to_s3(self, file_path, data_type, date): data_type, os.path.basename(file_path), ) - with open(file_path) as f: - logging.debug( - 'Uploading "{0}" to "s3://{1}/{2}".'.format( - file_path, self.s3_bucket, s3_key - ) - ) - self.s3.put_object( - Body=f, Bucket=self.s3_bucket, Key=s3_key + logging.debug( + 'Uploading "{0}" to "s3://{1}/{2}".'.format( + file_path, self.s3_bucket, s3_key ) + ) + self.s3.Object(self.s3_bucket, s3_key).upload_file(file_path) return s3_key def scrape_files_for_date(self, date): diff --git a/capitolwords_ng/requirements.txt b/capitolwords_ng/requirements.txt index ee21f18..fadc669 100644 --- a/capitolwords_ng/requirements.txt +++ b/capitolwords_ng/requirements.txt @@ -1,16 +1,77 @@ -requests==2.10.0 -boto3==1.2.1 -botocore==1.3.1 -elasticsearch>=5.0.0,<6.0.0 +amqp==2.2.1 +appdirs==1.4.3 +billiard==3.5.0.3 +boto3==1.4.4 +botocore==1.5.89 +cachetools==2.0.0 +celery==4.1.0 +certifi==2017.4.17 +cffi==1.10.0 +chardet==3.0.3 +cld2-cffi==0.1.4 +coreapi==2.3.1 +coreschema==0.0.4 +cymem==1.31.2 +cytoolz==0.8.2 +DateTime==4.2 +decorator==4.1.2 +dill==0.2.6 Django==1.11.3 -elasticsearch-dsl>=5.0.0,<6.0.0 -lxml==3.7.2 +django-celery-beat==1.0.1 +django-celery-results==1.0.1 +django-rest-swagger==2.1.2 djangorestframework==3.6.3 -PyYAML==3.12 -coreapi==2.3.1 -Pygments==2.2.0 +docutils==0.13.1 +elasticsearch==5.4.0 +elasticsearch-dsl==5.3.0 +ftfy==4.4.3 +html5lib==0.999999999 +idna==2.5 +ijson==2.3 +itypes==1.1.0 +Jinja2==2.9.6 +jmespath==0.9.3 +kombu==4.1.0 +lxml==3.7.2 Markdown==2.6.8 
+MarkupSafe==1.0
+murmurhash==0.26.4
+networkx==1.11
+numpy==1.12.1
+openapi-codec==1.3.2
+packaging==16.8
+pathlib==1.0.1
+plac==0.9.6
+preshed==1.0.0
+pycparser==2.18
+pyemd==0.4.4
+Pygments==2.2.0
+pyparsing==2.2.0
+Pyphen==0.9.4
+python-dateutil==2.6.1
+python-Levenshtein==0.12.0
+pytz==2017.2
+PyYAML==3.12
+regex==2017.4.5
+requests==2.10.0
+s3transfer==0.1.10
+scikit-learn==0.18.2
+scipy==0.19.1
+simplejson==3.11.1
+six==1.10.0
 spacy==1.8.2
+SQLAlchemy==1.1.12
+termcolor==1.1.0
 textacy==0.3.4
-django-rest-swagger==2.1.2
-DateTime==4.2
+thinc==6.5.2
+toolz==0.8.2
+tqdm==4.14.0
+ujson==1.35
+Unidecode==0.4.21
+uritemplate==3.0.0
+urllib3==1.21.1
+vine==1.1.4
+wcwidth==0.1.7
+webencodings==0.5.1
+wrapt==1.10.10
+zope.interface==4.4.2
diff --git a/capitolwords_ng/run_crec_es_uploader.py b/capitolwords_ng/run_crec_es_uploader.py
index eacdfea..45ad566 100644
--- a/capitolwords_ng/run_crec_es_uploader.py
+++ b/capitolwords_ng/run_crec_es_uploader.py
@@ -26,7 +26,7 @@
 output_option_group = parser.add_mutually_exclusive_group(required=True)
 output_option_group.add_argument(
     '--to_stdout',
-    help='If true, will not upload to es and instead print to stdout.',
+    help='If set, will print to stdout instead of uploading to Elasticsearch.',
     action='store_true'
 )
 output_option_group.add_argument(
@@ -47,7 +47,7 @@
 )
 parser.add_argument(
     '--source_bucket',
-    help='Location of crec data.',
+    help='Name of the S3 bucket containing crec source data.',
 )
 
 args = parser.parse_args()
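Note for review: patch 1 wires Celery into the Django project (capitolweb/__init__.py imports celery_app from .celery, and workers/__init__.py imports scrape_crecs from .tasks), but neither capitolweb/celery.py nor workers/tasks.py appears anywhere in this series. As a reference point only, a minimal sketch of the missing app module, assuming the standard Celery 4 Django wiring (config_from_object plus autodiscover_tasks) rather than anything confirmed by the patches:

    # capitolweb/capitolweb/celery.py (assumed contents; not part of this patch)
    from __future__ import absolute_import, unicode_literals

    import os

    from celery import Celery

    # Point Celery at the project settings module before creating the app.
    os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'capitolweb.settings')

    app = Celery('capitolweb')

    # Read every CELERY_*-prefixed value added to settings.py in this patch.
    app.config_from_object('django.conf:settings', namespace='CELERY')

    # Discover tasks.py modules in all INSTALLED_APPS, including 'workers'.
    app.autodiscover_tasks()

With the broker URL configured above, a worker can then be started from the capitolweb directory with: celery -A capitolweb worker -l info
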
From a6a63cde426322b17c23ebe0b888a8531d3dfc30 Mon Sep 17 00:00:00 2001
From: will-horning
Date: Fri, 28 Jul 2017 15:46:17 -0400
Subject: [PATCH 2/2] [capitol-words] factored crec scraper config out into
 settings; added support for override arguments from the UI to scrape all
 days within a datetime range

---
 .../capitolweb/capitolweb/settings.py      |  3 +-
 capitolwords_ng/capitolweb/workers/crec.py | 36 +++++++++++--------
 2 files changed, 23 insertions(+), 16 deletions(-)

diff --git a/capitolwords_ng/capitolweb/capitolweb/settings.py b/capitolwords_ng/capitolweb/capitolweb/settings.py
index cbdd15a..952eb32 100644
--- a/capitolwords_ng/capitolweb/capitolweb/settings.py
+++ b/capitolwords_ng/capitolweb/capitolweb/settings.py
@@ -20,8 +20,9 @@
 DEV_FRONTEND = True
 DEV_FRONTEND_SPA_BASE_URL = 'http://localhost:3000'
 
-CREC_STAGING_BUCKET = 'capitol-words-data'
+CREC_STAGING_S3_BUCKET = 'capitol-words-data'
 CREC_STAGING_S3_KEY_PREFIX = 'crec'
+CREC_STAGING_FOLDER = '/tmp'
 
 # Build paths inside the project like this: os.path.join(BASE_DIR, ...)
 BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
diff --git a/capitolwords_ng/capitolweb/workers/crec.py b/capitolwords_ng/capitolweb/workers/crec.py
index 4caef99..864796d 100644
--- a/capitolwords_ng/capitolweb/workers/crec.py
+++ b/capitolwords_ng/capitolweb/workers/crec.py
@@ -160,11 +160,13 @@ def scrape_files_for_date(self, date):
         zip_path = self.download_crec_zip(date)
         mods_path = self.download_mods_xml(date)
         if zip_path is None:
-            logging.info('No zip found for date {0}'.format(date))
-            return None
+            message = 'No zip found for date {0}.'.format(date)
+            logging.info(message)
+            return {'success': True, 'message': message}
         if mods_path is None:
-            logging.info('No mods.xml found for date {0}'.format(date))
-            return None
+            message = 'No mods.xml found for date {0}.'.format(date)
+            logging.info(message)
+            return {'success': True, 'message': message}
         logging.info(
             'Extracting html files from zip to {0}'.format(self.download_dir)
         )
@@ -171,24 +173,26 @@ def scrape_files_for_date(self, date):
         html_file_paths = self.extract_html_files(zip_path)
         try:
             s3_key = self.upload_to_s3(mods_path, 'mods', date)
         except ClientError as e:
-            logging.exception(
-                'Error uploading file {0}, exiting'.format(mods_path, e)
-            )
-            return False
+            message = 'Error uploading file {0}: {1}'.format(mods_path, e)
+            logging.exception(message)
+            return {'success': False, 'message': message}
         logging.info('Uploading {0} html files...'.format(len(html_file_paths)))
         for file_path in html_file_paths:
             try:
                 s3_key = self.upload_to_s3(file_path, 'crec', date)
             except ClientError as e:
-                logging.exception(
-                    'Error uploading file {0}, exiting'.format(file_path, e)
-                )
-                return False
+                message = 'Error uploading file {0}: {1}'.format(file_path, e)
+                logging.exception(message)
+                return {'success': False, 'message': message}
         logging.info('Uploads finished.')
-        return True
+        return {
+            'success': True,
+            'message': '{0} crec html files uploaded.'.format(len(html_file_paths))
+        }
 
-    def scrape_files_in_range(self, start_dt, end_dt):
+    def scrape_files_in_range(self, start_dt, end_dt=None):
+        results = []
         if end_dt is None:
             end_dt = datetime.utcnow()
             end_dt = end_dt.replace(hour=0, minute=0, second=0, microsecond=0)
@@ -197,5 +201,7 @@ def scrape_files_in_range(self, start_dt, end_dt):
         dt = start_dt.replace(hour=0, minute=0, second=0, microsecond=0)
         dates = []
         while dt < end_dt:
-            self.scrape_files_for_date(dt)
+            result = self.scrape_files_for_date(dt)
+            results.append(result)
             dt += timedelta(days=1)
+        return results
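Note for review: workers/tasks.py is likewise absent from the series even though workers/__init__.py imports scrape_crecs from it. A minimal sketch of what that task might look like under the settings introduced here; the scraper's class name is not visible in any hunk, so CRECScraper below is an assumption, and the ISO date strings reflect the json task serializer configured in settings.py (datetime objects cannot cross the broker natively):

    # capitolweb/workers/tasks.py (hypothetical sketch; not part of this patch)
    from datetime import datetime, timedelta

    from celery import shared_task
    from django.conf import settings

    from .crec import CRECScraper  # assumed class name


    @shared_task
    def scrape_crecs(start_date=None, end_date=None):
        """Stage CREC files for each day in [start_date, end_date) into S3.

        Dates arrive as 'YYYY-MM-DD' strings (the task serializer is json);
        with no overrides from the UI, the previous day is scraped.
        """
        end_dt = datetime.strptime(end_date, '%Y-%m-%d') if end_date else datetime.utcnow()
        start_dt = datetime.strptime(start_date, '%Y-%m-%d') if start_date else end_dt - timedelta(days=1)
        scraper = CRECScraper(
            settings.CREC_STAGING_FOLDER,
            settings.CREC_STAGING_S3_BUCKET,
            settings.CREC_STAGING_S3_KEY_PREFIX,
        )
        # scrape_files_in_range returns one result dict per day in the range.
        return scraper.scrape_files_in_range(start_dt, end_dt)

With django_celery_beat's DatabaseScheduler configured in settings.py, a daily PeriodicTask pointing at workers.tasks.scrape_crecs would then replace the old command-line or Lambda trigger.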