From 4c2a8f8f5345c1db60abd0188bc02f58d2066c1a Mon Sep 17 00:00:00 2001 From: William Horning Date: Thu, 16 Feb 2017 17:00:57 -0500 Subject: [PATCH 1/3] [crec_stager] first commit --- crec_stager/__init__.py | 0 crec_stager/crec_stager.py | 258 +++++++++++++++++++++++++++++++++++++ 2 files changed, 258 insertions(+) create mode 100644 crec_stager/__init__.py create mode 100644 crec_stager/crec_stager.py diff --git a/crec_stager/__init__.py b/crec_stager/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/crec_stager/crec_stager.py b/crec_stager/crec_stager.py new file mode 100644 index 0000000..e6e4cc3 --- /dev/null +++ b/crec_stager/crec_stager.py @@ -0,0 +1,258 @@ +"""Service for staging unpacked html files from a daily zip of congressional +records retrieved from gpo.gov. + +This module can be used either from the command line, or deployed as an AWS +Lambda function (see :func:``lambda_handler`` for details on lambda execution). + +To run locally: + + :: + + python crec_stager.py --s3_bucket=mybukkit + +Attributes: + DEFAULT_LOG_FORMAT (:obj:`str`): A template string for log lines. + LOGLEVELS (:obj:`dict`): A lookup of loglevel name to the loglevel code. +""" + +from __future__ import print_function + +import os +import sys +import urllib2 +import logging +import argparse +from datetime import datetime +from datetime import timedelta +from zipfile import ZipFile + +import boto3 +from botocore.exceptions import ClientError + +DEFAULT_LOG_FORMAT = ' '.join([ + '%(asctime)s', + '%(levelname)s', + 'pid:%(process)d,', + 'file:%(filename)s:%(lineno)d>', + '%(message)s', +]) + + +LOGLEVELS = { + 'CRITICAL': logging.CRITICAL, + 'DEBUG': logging.DEBUG, + 'WARN': logging.WARN, + 'INFO': logging.INFO, + 'ERROR': logging.ERROR, +} + + +class CRECStager(object): + """Downloads the zip for specified date from gpo.gov, unpacks all html files + to disk, then uploads each one to S3. + + Args: + date (:class:`datetime.datetime`): Date of records to download. + zip_download_dir (:obj:`str`): A directory to download and unpack the + CREC zip. + s3_bucket (:obj:`str`): The name of an S3 bucket to stage unpacked html + files in. + s3_key_prefix (:obj:`str`): The prefix is prepended to each html + filename to create the S3 key to upload it to. + + Attributes: + CREC_ZIP_TEMPLATE (:obj:`str`): The endpoint template for a CREC zip. + """ + + CREC_ZIP_TEMPLATE = 'https://www.gpo.gov/fdsys/pkg/CREC-%Y-%m-%d.zip' + + def __init__(self, date, zip_download_dir, s3_bucket, s3_key_prefix): + self.date = date + self.zip_download_dir = zip_download_dir + self.s3_bucket = s3_bucket + self.s3_key_prefix = s3_key_prefix + self.s3 = boto3.client('s3') + + def download_crec_zip(self): + """Downloads the CREC zip for this date. + + Returns: + :obj:`str`: The path to the downloaded zip. + """ + url = self.date.strftime(self.CREC_ZIP_TEMPLATE) + logging.info('Downloading CREC zip from "{0}".'.format(url)) + try: + response = urllib2.urlopen(url) + except urllib2.URLError as e: + if e.getcode() == 404: + logging.debug('No zip found for date {0}'.format(self.date)) + return None + zip_path = os.path.join(self.zip_download_dir, url.split('/')[-1]) + zip_data = response.read() + with open(zip_path, 'wb') as f: + f.write(zip_data) + return zip_path + + def extract_html_files(self, zip_path): + """Unpacks all html files in the zip at the provided path to the value + set in the instance variable ``CRECStager.zip_download_dir``. + + Args: + zip_path (:obj:`str`): Path to the CREC zip file. + + Returns: + :obj:`list` of :obj:`str`: A list of the unpacked html files. + """ + zip_filename = os.path.splitext(os.path.basename(zip_path))[0] + html_prefix = os.path.join(zip_filename, 'html') + html_filenames = [] + with ZipFile(zip_path) as crec_zip: + for f in crec_zip.filelist: + if f.filename.startswith(html_prefix): + html_filenames.append(f.filename) + crec_zip.extract(f, self.zip_download_dir) + return [ + os.path.join(self.zip_download_dir, fname) + for fname in html_filenames + ] + + def upload_to_s3(self, file_path): + """Uploads the file at the provided path to s3. The s3 key is + generated from the date, the original filename, and the s3_key_prefix. + + Args: + file_path (:obj:`str`): Path to html file. + + Returns: + :obj:`str`: The S3 key the file was uploaded to. + """ + s3_key = os.path.join( + self.s3_key_prefix, + self.date.strftime('%Y/%m/%d'), + os.path.basename(file_path), + ) + with open(file_path) as html_file: + logging.debug( + 'Uploading "{0}" to "s3://{1}/{2}".'.format( + file_path, self.s3_bucket, s3_key + ) + ) + self.s3.put_object( + Body=html_file, Bucket=self.s3_bucket, Key=s3_key + ) + return s3_key + + def stage_html_files(self): + """Main entry point to staging process. Downloads the CREC zip for this + date, unpacks all HTML files to disk, then uploads each one to S3. + + Returns: + :obj:`bool`: True if all uploads were successful, False otherwise. + """ + zip_path = self.download_crec_zip() + if zip_path is None: + logging.info('No zip found for date {0}'.format(self.dt)) + return None + logging.info( + 'Extracting html files from zip to {0}'.format(self.zip_download_dir) + ) + html_file_paths = self.extract_html_files(zip_path) + logging.info('Uploading {0} html files...'.format(len(html_file_paths))) + for file_path in html_file_paths: + try: + s3_key = self.upload_to_s3(file_path) + except ClientError as e: + logging.exception( + 'Error uploading .htm file {0}, exiting'.format(file_path, e) + ) + return False + logging.info('Uploads finished.') + return True + + +def lambda_handler(event, context): + """Entry point for AWS Lambda execution. + + In addition to the arguments specified below, this function also gets some + settings from the following environment variables (set through the AWS + console): + LOGLEVEL + loglevel for logging to cloudwatch + ZIP_DOWNLOAD_DIR + what directory to download and unpack CREC zips. Must be under + ``/tmp`` as everything else is write protected in lambda. + S3_TARGET_BUCKET + what s3 bucket to upload unpacked html files to. + + Args: + event (:obj:`dict`): A dictionary containg data from event trigger. + context (:obj:`dict`): Context settings for this lambda job. + """ + logger = logging.getLogger() + logger.setLevel(os.environ.get('LOGLEVEL', 'INFO')) + formatter = logging.Formatter(DEFAULT_LOG_FORMAT) + zip_download_dir = os.environ.get('ZIP_DOWNLOAD_DIR', '/tmp') + s3_bucket = os.environ.get('S3_TARGET_BUCKET') + if not s3_bucket: + raise Exception('No s3 bucket defined in $S3_TARGET_BUCKET.') + s3_key_prefix = os.environ.get('S3_KEY_PREFIX', 'capitolwords/') + crec_stager = CRECStager( + datetime.utcnow() - timedelta(days=1), + zip_download_dir, + s3_bucket, + s3_key_prefix + ) + crec_stager.stage_html_files() + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--date', + help='Date to retrieve records for, format is YYYY-MM-DD.', + type=lambda d: datetime.strptime(d, CMD_LINE_DATE_FORMAT), + ) + parser.add_argument( + '--s3_bucket', + help='Bucket to upload html files to.', + default='use-this-bucket-to-test-your-bullshit', + ) + parser.add_argument( + '--s3_key_prefix', + help='Key prefix for the html files staged in S3.', + default='capitolwords/', + ) + parser.add_argument( + '--zip_download_dir', + help='Directory to write the zip and extracted files to.', + default='/tmp' + ) + parser.add_argument( + '--loglevel', + help='Log level, one of INFO, ERROR, WARN, DEBUG or CRITICAL.', + default='INFO', + ) + args = parser.parse_args() + loglevel = LOGLEVELS.get(args.loglevel.upper()) + if loglevel is None: + loglevel = LOGLEVELS['INFO'] + logger = logging.getLogger() + logger.setLevel(loglevel) + formatter = logging.Formatter(DEFAULT_LOG_FORMAT) + console_handler = logging.StreamHandler(stream=sys.stdout) + console_handler.setLevel(loglevel) + console_handler.setFormatter(formatter) + logger.addHandler(console_handler) + if args.date: + dt = args.date + else: + dt = datetime.utcnow() - timedelta(days=1) + if not os.path.exists(args.zip_download_dir): + os.makedirs(args.zip_download_dir) + crec_stager = CRECStager( + dt, + zip_download_dir, + s3_bucket, + s3_key_prefix + ) + crec_stager.stage_html_files() From ec2ea4f5422f52148be3f0e4c1a13517ff441398 Mon Sep 17 00:00:00 2001 From: William Horning Date: Fri, 17 Feb 2017 15:55:24 -0500 Subject: [PATCH 2/3] [crec_stager] added support for downloading mods.xml file --- crec_stager/crec_stager.py | 93 ++++++++++++++++++++++++++------------ 1 file changed, 65 insertions(+), 28 deletions(-) diff --git a/crec_stager/crec_stager.py b/crec_stager/crec_stager.py index e6e4cc3..5e2fa9e 100644 --- a/crec_stager/crec_stager.py +++ b/crec_stager/crec_stager.py @@ -29,6 +29,7 @@ import boto3 from botocore.exceptions import ClientError + DEFAULT_LOG_FORMAT = ' '.join([ '%(asctime)s', '%(levelname)s', @@ -53,7 +54,7 @@ class CRECStager(object): Args: date (:class:`datetime.datetime`): Date of records to download. - zip_download_dir (:obj:`str`): A directory to download and unpack the + download_dir (:obj:`str`): A directory to download and unpack the CREC zip. s3_bucket (:obj:`str`): The name of an S3 bucket to stage unpacked html files in. @@ -65,10 +66,11 @@ class CRECStager(object): """ CREC_ZIP_TEMPLATE = 'https://www.gpo.gov/fdsys/pkg/CREC-%Y-%m-%d.zip' + MODS_ZIP_TEMPLATE = 'https://www.gpo.gov/fdsys/pkg/CREC-%Y-%m-%d/mods.xml' - def __init__(self, date, zip_download_dir, s3_bucket, s3_key_prefix): + def __init__(self, date, download_dir, s3_bucket, s3_key_prefix): self.date = date - self.zip_download_dir = zip_download_dir + self.download_dir = download_dir self.s3_bucket = s3_bucket self.s3_key_prefix = s3_key_prefix self.s3 = boto3.client('s3') @@ -87,15 +89,38 @@ def download_crec_zip(self): if e.getcode() == 404: logging.debug('No zip found for date {0}'.format(self.date)) return None - zip_path = os.path.join(self.zip_download_dir, url.split('/')[-1]) + zip_path = os.path.join(self.download_dir, url.split('/')[-1]) zip_data = response.read() with open(zip_path, 'wb') as f: f.write(zip_data) return zip_path + def download_mods_xml(self): + """Downloads the mods.xml metadata file for this date to download_dir. + + Returns: + :obj:`str`: Path to the downloaded mods.xml file. + """ + url = self.date.strftime(self.MODS_ZIP_TEMPLATE) + logging.info('Downloading mods.xml from "{0}".'.format(url)) + try: + response = urllib2.urlopen(url) + except urllib2.URLError as e: + if e.getcode() == 404: + logging.debug('No mods.xml found for date {0}, at "{1}"'.format( + self.date, url + ) + ) + return None + data = response.read() + mods_path = os.path.join(self.download_dir, 'mods.xml') + with open(mods_path, 'w') as f: + f.write(data) + return mods_path + def extract_html_files(self, zip_path): """Unpacks all html files in the zip at the provided path to the value - set in the instance variable ``CRECStager.zip_download_dir``. + set in the instance variable ``CRECStager.download_dir``. Args: zip_path (:obj:`str`): Path to the CREC zip file. @@ -110,18 +135,19 @@ def extract_html_files(self, zip_path): for f in crec_zip.filelist: if f.filename.startswith(html_prefix): html_filenames.append(f.filename) - crec_zip.extract(f, self.zip_download_dir) + crec_zip.extract(f, self.download_dir) return [ - os.path.join(self.zip_download_dir, fname) + os.path.join(self.download_dir, fname) for fname in html_filenames ] - def upload_to_s3(self, file_path): + def upload_to_s3(self, file_path, data_type): """Uploads the file at the provided path to s3. The s3 key is generated from the date, the original filename, and the s3_key_prefix. Args: file_path (:obj:`str`): Path to html file. + data_type (:obj:`str`): One of "crec" or "mods", used in s3 key. Returns: :obj:`str`: The S3 key the file was uploaded to. @@ -129,43 +155,53 @@ def upload_to_s3(self, file_path): s3_key = os.path.join( self.s3_key_prefix, self.date.strftime('%Y/%m/%d'), + data_type, os.path.basename(file_path), ) - with open(file_path) as html_file: + with open(file_path) as f: logging.debug( 'Uploading "{0}" to "s3://{1}/{2}".'.format( file_path, self.s3_bucket, s3_key ) ) self.s3.put_object( - Body=html_file, Bucket=self.s3_bucket, Key=s3_key + Body=f, Bucket=self.s3_bucket, Key=s3_key ) return s3_key - def stage_html_files(self): + def stage_files(self): """Main entry point to staging process. Downloads the CREC zip for this - date, unpacks all HTML files to disk, then uploads each one to S3. + date, unpacks all HTML files to disk, downloads the mods.xml metadata + file, and uploads that and the unpacked HTML files. Returns: :obj:`bool`: True if all uploads were successful, False otherwise. """ zip_path = self.download_crec_zip() + mods_path = self.download_mods_xml() if zip_path is None: logging.info('No zip found for date {0}'.format(self.dt)) return None logging.info( - 'Extracting html files from zip to {0}'.format(self.zip_download_dir) + 'Extracting html files from zip to {0}'.format(self.download_dir) ) html_file_paths = self.extract_html_files(zip_path) logging.info('Uploading {0} html files...'.format(len(html_file_paths))) for file_path in html_file_paths: try: - s3_key = self.upload_to_s3(file_path) + s3_key = self.upload_to_s3(file_path, 'crec') except ClientError as e: logging.exception( - 'Error uploading .htm file {0}, exiting'.format(file_path, e) + 'Error uploading file {0}, exiting'.format(file_path, e) ) return False + try: + s3_key = self.upload_to_s3(mods_path, 'mods') + except ClientError as e: + logging.exception( + 'Error uploading file {0}, exiting'.format(mods_path, e) + ) + return False logging.info('Uploads finished.') return True @@ -178,9 +214,10 @@ def lambda_handler(event, context): console): LOGLEVEL loglevel for logging to cloudwatch - ZIP_DOWNLOAD_DIR - what directory to download and unpack CREC zips. Must be under - ``/tmp`` as everything else is write protected in lambda. + DOWNLOAD_DIR + what directory to download and unpack CREC zips and the mods.xml + file. Must be under ``/tmp`` when running in lambda as everything + else is write protected. S3_TARGET_BUCKET what s3 bucket to upload unpacked html files to. @@ -191,18 +228,18 @@ def lambda_handler(event, context): logger = logging.getLogger() logger.setLevel(os.environ.get('LOGLEVEL', 'INFO')) formatter = logging.Formatter(DEFAULT_LOG_FORMAT) - zip_download_dir = os.environ.get('ZIP_DOWNLOAD_DIR', '/tmp') + download_dir = os.environ.get('DOWNLOAD_DIR', '/tmp') s3_bucket = os.environ.get('S3_TARGET_BUCKET') if not s3_bucket: raise Exception('No s3 bucket defined in $S3_TARGET_BUCKET.') s3_key_prefix = os.environ.get('S3_KEY_PREFIX', 'capitolwords/') crec_stager = CRECStager( datetime.utcnow() - timedelta(days=1), - zip_download_dir, + download_dir, s3_bucket, s3_key_prefix ) - crec_stager.stage_html_files() + crec_stager.stage_files() if __name__ == '__main__': @@ -223,7 +260,7 @@ def lambda_handler(event, context): default='capitolwords/', ) parser.add_argument( - '--zip_download_dir', + '--download_dir', help='Directory to write the zip and extracted files to.', default='/tmp' ) @@ -247,12 +284,12 @@ def lambda_handler(event, context): dt = args.date else: dt = datetime.utcnow() - timedelta(days=1) - if not os.path.exists(args.zip_download_dir): - os.makedirs(args.zip_download_dir) + if not os.path.exists(args.download_dir): + os.makedirs(args.download_dir) crec_stager = CRECStager( dt, - zip_download_dir, - s3_bucket, - s3_key_prefix + args.download_dir, + args.s3_bucket, + args.s3_key_prefix ) - crec_stager.stage_html_files() + crec_stager.stage_files() From 4e363101bf715fddcd72dbd682f0d1b7c0ac12e0 Mon Sep 17 00:00:00 2001 From: William Horning Date: Mon, 20 Feb 2017 20:54:44 -0500 Subject: [PATCH 3/3] [crec_parser] mods.xml loader --- crec_stager/crec_stager.py | 28 +++++--- parser/new_parser.py | 128 +++++++++++++++++++++++++++++++++++++ 2 files changed, 147 insertions(+), 9 deletions(-) create mode 100644 parser/new_parser.py diff --git a/crec_stager/crec_stager.py b/crec_stager/crec_stager.py index 5e2fa9e..620d9d1 100644 --- a/crec_stager/crec_stager.py +++ b/crec_stager/crec_stager.py @@ -180,12 +180,22 @@ def stage_files(self): zip_path = self.download_crec_zip() mods_path = self.download_mods_xml() if zip_path is None: - logging.info('No zip found for date {0}'.format(self.dt)) + logging.info('No zip found for date {0}'.format(self.date)) + return None + if mods_path is None: + logging.info('No mods.xml found for date {0}'.format(self.date)) return None logging.info( 'Extracting html files from zip to {0}'.format(self.download_dir) ) html_file_paths = self.extract_html_files(zip_path) + try: + s3_key = self.upload_to_s3(mods_path, 'mods') + except ClientError as e: + logging.exception( + 'Error uploading file {0}, exiting'.format(mods_path, e) + ) + return False logging.info('Uploading {0} html files...'.format(len(html_file_paths))) for file_path in html_file_paths: try: @@ -195,13 +205,6 @@ def stage_files(self): 'Error uploading file {0}, exiting'.format(file_path, e) ) return False - try: - s3_key = self.upload_to_s3(mods_path, 'mods') - except ClientError as e: - logging.exception( - 'Error uploading file {0}, exiting'.format(mods_path, e) - ) - return False logging.info('Uploads finished.') return True @@ -220,6 +223,8 @@ def lambda_handler(event, context): else is write protected. S3_TARGET_BUCKET what s3 bucket to upload unpacked html files to. + DATE + what day to look for crec data for. Args: event (:obj:`dict`): A dictionary containg data from event trigger. @@ -229,12 +234,17 @@ def lambda_handler(event, context): logger.setLevel(os.environ.get('LOGLEVEL', 'INFO')) formatter = logging.Formatter(DEFAULT_LOG_FORMAT) download_dir = os.environ.get('DOWNLOAD_DIR', '/tmp') + date_str = os.environ.get('DATE', None) + if date_str is None: + date = datetime.utcnow() - timedelta(days=1) + else: + date = datetime.strptime(date_str, '%Y-%m-%d') s3_bucket = os.environ.get('S3_TARGET_BUCKET') if not s3_bucket: raise Exception('No s3 bucket defined in $S3_TARGET_BUCKET.') s3_key_prefix = os.environ.get('S3_KEY_PREFIX', 'capitolwords/') crec_stager = CRECStager( - datetime.utcnow() - timedelta(days=1), + date, download_dir, s3_bucket, s3_key_prefix diff --git a/parser/new_parser.py b/parser/new_parser.py new file mode 100644 index 0000000..0d9985b --- /dev/null +++ b/parser/new_parser.py @@ -0,0 +1,128 @@ +from __future__ import print_function + +import argparse +import logging +from datetime import datetime +from datetime import timedelta + +import boto3 +import xmltodict +from botocore.exceptions import ClientError + + +CMD_LINE_DATE_FORMAT = '%Y-%m-%d' + + +def lambda_handler(event, context): + pass + + +class CRECParser(object): + + MODS_S3_KEY_BASE_TEMPLATE = '{prefix}/%Y/%m/%d/mods/mods.xml' + + def __init__(self, s3_bucket, s3_prefix='capitolwords'): + self.s3_bucket = s3_bucket + self.s3_prefix = s3_prefix + self.mods_s3_key_template = self.MODS_S3_KEY_BASE_TEMPLATE.format( + prefix=self.s3_prefix + ) + self.s3 = boto3.client('s3') + self.mods = None + self.crec = None + + def load_mods_from_s3(self, dt=None): + if dt is None: + dt = datetime.utcnow() - timedelta(days=1) + mods_s3_key = dt.strftime(self.mods_s3_key_template) + logging.info('Reading mods.xml file from "{0}".'.format(mods_s3_key)) + response = self.s3.get_object( + Bucket=self.s3_bucket, + Key=mods_s3_key + ) + self.mods = xmltodict.parse(response['Body'].read())['mods'] + return self.mods + + def load_mods_from_disk(self, filepath): + doc = None + with open(filepath) as f: + raw_data = f.read() + doc = xmltodict.parse(raw_data) + self.mods = doc['mods'] + return self.mods + + def load_crec_from_s3(self, crec_s3_key): + response = self.s3.get_object( + Bucket=self.s3_bucket, + Key=crec_s3_key, + ) + self.crec = response['Body'].read() + return self.crec + + def load_crec_from_disk(self, crec_path): + with open(crec_path) as f: + self.crec = f.read() + return self.crec + + def get_crec_description(self, crec_id): + relateds = [] + if self.mods is None: + raise Exception('Mods file must be loaded first.') + for related_item in self.mods['relatedItem']: + if related_item['@ID'] == crec_id: + relateds.append(related_item) + return relateds + +def main(): + parser = CRECParser( + 'use-this-bucket-to-test-your-bullshit', + 'capitolwords', + ) + dt = datetime(2017, 2, 15) + parser.load_mods_from_s3(dt=dt) + crec_id = 'id-CREC-2017-02-15-pt1-PgD160' + data = parser.get_crec_description(crec_id) + return data + +if __name__ == '__main__': + parser = CRECParser( + 'use-this-bucket-to-test-your-bullshit', + 'capitolwords', + ) + dt = datetime(2017, 2, 15) + parser.load_mods_from_s3(dt=dt) + +# capitolwords/2017/02/15/CREC-2017-02-15-pt1-PgD160.htm + + # parser = argparse.ArgumentParser() + # parser.add_argument( + # '--mods_path', + # help='S3 key or local file path to the mods.xml file for this date.', + # ) + # parser.add_argument( + # '--date', + # help='Use the mods.xml file in S3 for this date.', + # type=lambda d: datetime.strptime(d, CMD_LINE_DATE_FORMAT), + # ) + # parser.add_argument( + # '--crec_path', + # help='S3 key or local file path to crec .html file to parse.', + # ) + # parser.add_argument( + # '--pg_host', + # help='Hostname for postgres database.', + # default='localhost', + # ) + # parser.add_argument( + # '--pg_port', + # help='Hostname for postgres database.', + # default=5432 + # ) + # parser.add_argument( + # '--pg_user', + # help='Postgres user name.' + # ) + # parser.add_argument( + # '--pg_password', + # help='Postgres password.' + # )