diff --git a/crec_stager/__init__.py b/crec_stager/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/crec_stager/crec_stager.py b/crec_stager/crec_stager.py
new file mode 100644
index 0000000..620d9d1
--- /dev/null
+++ b/crec_stager/crec_stager.py
@@ -0,0 +1,305 @@
+"""Service for staging unpacked html files from a daily zip of congressional
+records retrieved from gpo.gov.
+
+This module can be used either from the command line or deployed as an AWS
+Lambda function (see :func:`lambda_handler` for details on lambda execution).
+
+To run locally:
+
+    ::
+
+        python crec_stager.py --s3_bucket=mybukkit
+
+Attributes:
+    DEFAULT_LOG_FORMAT (:obj:`str`): A template string for log lines.
+    LOGLEVELS (:obj:`dict`): A lookup of loglevel name to the loglevel code.
+    CMD_LINE_DATE_FORMAT (:obj:`str`): strptime format for the --date argument
+        and the DATE environment variable.
+"""
+
+from __future__ import print_function
+
+import os
+import sys
+import urllib2
+import logging
+import argparse
+from datetime import datetime
+from datetime import timedelta
+from zipfile import ZipFile
+
+import boto3
+from botocore.exceptions import ClientError
+
+
+DEFAULT_LOG_FORMAT = ' '.join([
+    '%(asctime)s',
+    '%(levelname)s',
+    'pid:%(process)d,',
+    'file:%(filename)s:%(lineno)d>',
+    '%(message)s',
+])
+
+
+LOGLEVELS = {
+    'CRITICAL': logging.CRITICAL,
+    'DEBUG': logging.DEBUG,
+    'WARN': logging.WARN,
+    'INFO': logging.INFO,
+    'ERROR': logging.ERROR,
+}
+
+
+# Date format used by the --date argument and the DATE environment variable.
+CMD_LINE_DATE_FORMAT = '%Y-%m-%d'
+
+
+class CRECStager(object):
+    """Downloads the CREC zip for the specified date from gpo.gov, unpacks all
+    html files to disk, then uploads each one to S3.
+
+    Args:
+        date (:class:`datetime.datetime`): Date of records to download.
+        download_dir (:obj:`str`): A directory to download and unpack the
+            CREC zip into.
+        s3_bucket (:obj:`str`): The name of an S3 bucket to stage unpacked html
+            files in.
+        s3_key_prefix (:obj:`str`): A prefix prepended to each html filename to
+            create the S3 key to upload it to.
+
+    Attributes:
+        CREC_ZIP_TEMPLATE (:obj:`str`): The endpoint template for a CREC zip.
+        MODS_ZIP_TEMPLATE (:obj:`str`): The endpoint template for the mods.xml
+            metadata file.
+    """
+
+    CREC_ZIP_TEMPLATE = 'https://www.gpo.gov/fdsys/pkg/CREC-%Y-%m-%d.zip'
+    MODS_ZIP_TEMPLATE = 'https://www.gpo.gov/fdsys/pkg/CREC-%Y-%m-%d/mods.xml'
+
+    def __init__(self, date, download_dir, s3_bucket, s3_key_prefix):
+        self.date = date
+        self.download_dir = download_dir
+        self.s3_bucket = s3_bucket
+        self.s3_key_prefix = s3_key_prefix
+        self.s3 = boto3.client('s3')
+
+    def download_crec_zip(self):
+        """Downloads the CREC zip for this date.
+
+        Returns:
+            :obj:`str`: The path to the downloaded zip, or None if no zip was
+                found for this date.
+        """
+        url = self.date.strftime(self.CREC_ZIP_TEMPLATE)
+        logging.info('Downloading CREC zip from "{0}".'.format(url))
+        try:
+            response = urllib2.urlopen(url)
+        except urllib2.HTTPError as e:
+            if e.code == 404:
+                logging.debug('No zip found for date {0}'.format(self.date))
+                return None
+            raise
+        zip_path = os.path.join(self.download_dir, url.split('/')[-1])
+        zip_data = response.read()
+        with open(zip_path, 'wb') as f:
+            f.write(zip_data)
+        return zip_path
+
+    def download_mods_xml(self):
+        """Downloads the mods.xml metadata file for this date to download_dir.
+
+        Returns:
+            :obj:`str`: Path to the downloaded mods.xml file, or None if no
+                mods.xml was found for this date.
+ """ + url = self.date.strftime(self.MODS_ZIP_TEMPLATE) + logging.info('Downloading mods.xml from "{0}".'.format(url)) + try: + response = urllib2.urlopen(url) + except urllib2.URLError as e: + if e.getcode() == 404: + logging.debug('No mods.xml found for date {0}, at "{1}"'.format( + self.date, url + ) + ) + return None + data = response.read() + mods_path = os.path.join(self.download_dir, 'mods.xml') + with open(mods_path, 'w') as f: + f.write(data) + return mods_path + + def extract_html_files(self, zip_path): + """Unpacks all html files in the zip at the provided path to the value + set in the instance variable ``CRECStager.download_dir``. + + Args: + zip_path (:obj:`str`): Path to the CREC zip file. + + Returns: + :obj:`list` of :obj:`str`: A list of the unpacked html files. + """ + zip_filename = os.path.splitext(os.path.basename(zip_path))[0] + html_prefix = os.path.join(zip_filename, 'html') + html_filenames = [] + with ZipFile(zip_path) as crec_zip: + for f in crec_zip.filelist: + if f.filename.startswith(html_prefix): + html_filenames.append(f.filename) + crec_zip.extract(f, self.download_dir) + return [ + os.path.join(self.download_dir, fname) + for fname in html_filenames + ] + + def upload_to_s3(self, file_path, data_type): + """Uploads the file at the provided path to s3. The s3 key is + generated from the date, the original filename, and the s3_key_prefix. + + Args: + file_path (:obj:`str`): Path to html file. + data_type (:obj:`str`): One of "crec" or "mods", used in s3 key. + + Returns: + :obj:`str`: The S3 key the file was uploaded to. + """ + s3_key = os.path.join( + self.s3_key_prefix, + self.date.strftime('%Y/%m/%d'), + data_type, + os.path.basename(file_path), + ) + with open(file_path) as f: + logging.debug( + 'Uploading "{0}" to "s3://{1}/{2}".'.format( + file_path, self.s3_bucket, s3_key + ) + ) + self.s3.put_object( + Body=f, Bucket=self.s3_bucket, Key=s3_key + ) + return s3_key + + def stage_files(self): + """Main entry point to staging process. Downloads the CREC zip for this + date, unpacks all HTML files to disk, downloads the mods.xml metadata + file, and uploads that and the unpacked HTML files. + + Returns: + :obj:`bool`: True if all uploads were successful, False otherwise. + """ + zip_path = self.download_crec_zip() + mods_path = self.download_mods_xml() + if zip_path is None: + logging.info('No zip found for date {0}'.format(self.date)) + return None + if mods_path is None: + logging.info('No mods.xml found for date {0}'.format(self.date)) + return None + logging.info( + 'Extracting html files from zip to {0}'.format(self.download_dir) + ) + html_file_paths = self.extract_html_files(zip_path) + try: + s3_key = self.upload_to_s3(mods_path, 'mods') + except ClientError as e: + logging.exception( + 'Error uploading file {0}, exiting'.format(mods_path, e) + ) + return False + logging.info('Uploading {0} html files...'.format(len(html_file_paths))) + for file_path in html_file_paths: + try: + s3_key = self.upload_to_s3(file_path, 'crec') + except ClientError as e: + logging.exception( + 'Error uploading file {0}, exiting'.format(file_path, e) + ) + return False + logging.info('Uploads finished.') + return True + + +def lambda_handler(event, context): + """Entry point for AWS Lambda execution. 
+
+    In addition to the arguments specified below, this function also gets some
+    settings from the following environment variables (set through the AWS
+    console):
+
+        LOGLEVEL
+            Loglevel for logging to CloudWatch.
+        DOWNLOAD_DIR
+            Directory to download and unpack CREC zips and the mods.xml file
+            into. Must be under ``/tmp`` when running in Lambda, as everything
+            else is write protected.
+        S3_TARGET_BUCKET
+            S3 bucket to upload unpacked html files to.
+        S3_KEY_PREFIX
+            Key prefix for the files staged in S3, defaults to "capitolwords/".
+        DATE
+            Date to look for CREC data for, in YYYY-MM-DD format. Defaults to
+            yesterday (UTC).
+
+    Args:
+        event (:obj:`dict`): A dictionary containing data from the event
+            trigger.
+        context (:obj:`dict`): Context settings for this lambda job.
+    """
+    logger = logging.getLogger()
+    logger.setLevel(os.environ.get('LOGLEVEL', 'INFO'))
+    formatter = logging.Formatter(DEFAULT_LOG_FORMAT)
+    for handler in logger.handlers:
+        handler.setFormatter(formatter)
+    download_dir = os.environ.get('DOWNLOAD_DIR', '/tmp')
+    date_str = os.environ.get('DATE', None)
+    if date_str is None:
+        date = datetime.utcnow() - timedelta(days=1)
+    else:
+        date = datetime.strptime(date_str, CMD_LINE_DATE_FORMAT)
+    s3_bucket = os.environ.get('S3_TARGET_BUCKET')
+    if not s3_bucket:
+        raise Exception('No s3 bucket defined in $S3_TARGET_BUCKET.')
+    s3_key_prefix = os.environ.get('S3_KEY_PREFIX', 'capitolwords/')
+    crec_stager = CRECStager(
+        date,
+        download_dir,
+        s3_bucket,
+        s3_key_prefix
+    )
+    crec_stager.stage_files()
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        '--date',
+        help='Date to retrieve records for, format is YYYY-MM-DD.',
+        type=lambda d: datetime.strptime(d, CMD_LINE_DATE_FORMAT),
+    )
+    parser.add_argument(
+        '--s3_bucket',
+        help='Bucket to upload html files to.',
+        default='use-this-bucket-to-test-your-bullshit',
+    )
+    parser.add_argument(
+        '--s3_key_prefix',
+        help='Key prefix for the html files staged in S3.',
+        default='capitolwords/',
+    )
+    parser.add_argument(
+        '--download_dir',
+        help='Directory to write the zip and extracted files to.',
+        default='/tmp'
+    )
+    parser.add_argument(
+        '--loglevel',
+        help='Log level, one of INFO, ERROR, WARN, DEBUG or CRITICAL.',
+        default='INFO',
+    )
+    args = parser.parse_args()
+    loglevel = LOGLEVELS.get(args.loglevel.upper())
+    if loglevel is None:
+        loglevel = LOGLEVELS['INFO']
+    logger = logging.getLogger()
+    logger.setLevel(loglevel)
+    formatter = logging.Formatter(DEFAULT_LOG_FORMAT)
+    console_handler = logging.StreamHandler(stream=sys.stdout)
+    console_handler.setLevel(loglevel)
+    console_handler.setFormatter(formatter)
+    logger.addHandler(console_handler)
+    if args.date:
+        dt = args.date
+    else:
+        dt = datetime.utcnow() - timedelta(days=1)
+    if not os.path.exists(args.download_dir):
+        os.makedirs(args.download_dir)
+    crec_stager = CRECStager(
+        dt,
+        args.download_dir,
+        args.s3_bucket,
+        args.s3_key_prefix
+    )
+    crec_stager.stage_files()
diff --git a/parser/new_parser.py b/parser/new_parser.py
new file mode 100644
index 0000000..0d9985b
--- /dev/null
+++ b/parser/new_parser.py
@@ -0,0 +1,128 @@
+from __future__ import print_function
+
+import argparse
+import logging
+from datetime import datetime
+from datetime import timedelta
+
+import boto3
+import xmltodict
+from botocore.exceptions import ClientError
+
+
+CMD_LINE_DATE_FORMAT = '%Y-%m-%d'
+
+
+def lambda_handler(event, context):
+    pass
+
+
+class CRECParser(object):
+    """Loads a day's CREC html files and mods.xml metadata from S3 or from
+    local disk and looks up entries in the parsed metadata."""
+
+    MODS_S3_KEY_BASE_TEMPLATE = '{prefix}/%Y/%m/%d/mods/mods.xml'
+
+    def __init__(self, s3_bucket, s3_prefix='capitolwords'):
+        self.s3_bucket = s3_bucket
+        self.s3_prefix = s3_prefix
+        self.mods_s3_key_template = self.MODS_S3_KEY_BASE_TEMPLATE.format(
+            prefix=self.s3_prefix
+        )
+        self.s3 = boto3.client('s3')
+        self.mods = None
+        self.crec = None
+
+    def load_mods_from_s3(self, dt=None):
+        if dt is None:
+            dt = datetime.utcnow() - timedelta(days=1)
+        mods_s3_key = dt.strftime(self.mods_s3_key_template)
+        logging.info('Reading mods.xml file from "{0}".'.format(mods_s3_key))
+        response = self.s3.get_object(
+            Bucket=self.s3_bucket,
+            Key=mods_s3_key
+        )
+        self.mods = xmltodict.parse(response['Body'].read())['mods']
+        return self.mods
+
+    def load_mods_from_disk(self, filepath):
+        with open(filepath) as f:
+            raw_data = f.read()
+        doc = xmltodict.parse(raw_data)
+        self.mods = doc['mods']
+        return self.mods
+
+    def load_crec_from_s3(self, crec_s3_key):
+        response = self.s3.get_object(
+            Bucket=self.s3_bucket,
+            Key=crec_s3_key,
+        )
+        self.crec = response['Body'].read()
+        return self.crec
+
+    def load_crec_from_disk(self, crec_path):
+        with open(crec_path) as f:
+            self.crec = f.read()
+        return self.crec
+
+    def get_crec_description(self, crec_id):
+        # Returns the relatedItem entries in the loaded mods metadata whose
+        # @ID matches crec_id.
+        if self.mods is None:
+            raise Exception('Mods file must be loaded first.')
+        relateds = []
+        for related_item in self.mods['relatedItem']:
+            if related_item['@ID'] == crec_id:
+                relateds.append(related_item)
+        return relateds
+
+
+def main():
+    parser = CRECParser(
+        'use-this-bucket-to-test-your-bullshit',
+        'capitolwords',
+    )
+    dt = datetime(2017, 2, 15)
+    parser.load_mods_from_s3(dt=dt)
+    crec_id = 'id-CREC-2017-02-15-pt1-PgD160'
+    data = parser.get_crec_description(crec_id)
+    return data
+
+
+if __name__ == '__main__':
+    parser = CRECParser(
+        'use-this-bucket-to-test-your-bullshit',
+        'capitolwords',
+    )
+    dt = datetime(2017, 2, 15)
+    parser.load_mods_from_s3(dt=dt)
+
+    # capitolwords/2017/02/15/CREC-2017-02-15-pt1-PgD160.htm
+
+    # parser = argparse.ArgumentParser()
+    # parser.add_argument(
+    #     '--mods_path',
+    #     help='S3 key or local file path to the mods.xml file for this date.',
+    # )
+    # parser.add_argument(
+    #     '--date',
+    #     help='Use the mods.xml file in S3 for this date.',
+    #     type=lambda d: datetime.strptime(d, CMD_LINE_DATE_FORMAT),
+    # )
+    # parser.add_argument(
+    #     '--crec_path',
+    #     help='S3 key or local file path to crec .html file to parse.',
+    # )
+    # parser.add_argument(
+    #     '--pg_host',
+    #     help='Hostname for postgres database.',
+    #     default='localhost',
+    # )
+    # parser.add_argument(
+    #     '--pg_port',
+    #     help='Port for postgres database.',
+    #     default=5432
+    # )
+    # parser.add_argument(
+    #     '--pg_user',
+    #     help='Postgres user name.'
+    # )
+    # parser.add_argument(
+    #     '--pg_password',
+    #     help='Postgres password.'
+    # )
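
A rough sketch of how the two modules in this patch fit together when run
locally (an illustration only, not part of the patch: the bucket name is a
placeholder, AWS credentials with access to it are assumed to be configured,
and both modules are assumed to be importable from the repo root):

    from datetime import datetime

    from crec_stager.crec_stager import CRECStager
    from parser.new_parser import CRECParser

    dt = datetime(2017, 2, 15)

    # Stage the day's CREC html files and mods.xml metadata into S3.
    stager = CRECStager(dt, '/tmp', 'my-staging-bucket', 'capitolwords/')
    stager.stage_files()

    # Read the staged mods.xml back out of S3 and look up one granule's
    # metadata entry by its CREC id.
    crec_parser = CRECParser('my-staging-bucket', 'capitolwords')
    crec_parser.load_mods_from_s3(dt=dt)
    print(crec_parser.get_crec_description('id-CREC-2017-02-15-pt1-PgD160'))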