From 4c2a8f8f5345c1db60abd0188bc02f58d2066c1a Mon Sep 17 00:00:00 2001
From: William Horning <will.horning@chartbeat.com>
Date: Thu, 16 Feb 2017 17:00:57 -0500
Subject: [PATCH 1/3] [crec_stager] first commit

---
 crec_stager/__init__.py    |   0
 crec_stager/crec_stager.py | 258 +++++++++++++++++++++++++++++++++++++
 2 files changed, 258 insertions(+)
 create mode 100644 crec_stager/__init__.py
 create mode 100644 crec_stager/crec_stager.py

diff --git a/crec_stager/__init__.py b/crec_stager/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/crec_stager/crec_stager.py b/crec_stager/crec_stager.py
new file mode 100644
index 0000000..e6e4cc3
--- /dev/null
+++ b/crec_stager/crec_stager.py
@@ -0,0 +1,258 @@
+"""Service for staging unpacked html files from a daily zip of congressional
+records retrieved from gpo.gov.
+
+This module can be used either from the command line, or deployed as an AWS
+Lambda function (see :func:``lambda_handler`` for details on lambda execution).
+
+To run locally:
+
+    ::
+
+        python crec_stager.py --s3_bucket=mybukkit
+
+Attributes:
+    DEFAULT_LOG_FORMAT (:obj:`str`): A template string for log lines.
+    LOGLEVELS (:obj:`dict`): A lookup of loglevel name to the loglevel code.
+"""
+
+from __future__ import print_function
+
+import os
+import sys
+import urllib2
+import logging
+import argparse
+from datetime import datetime
+from datetime import timedelta
+from zipfile import ZipFile
+
+import boto3
+from botocore.exceptions import ClientError
+
+DEFAULT_LOG_FORMAT = ' '.join([
+    '%(asctime)s',
+    '%(levelname)s',
+    'pid:%(process)d,',
+    'file:%(filename)s:%(lineno)d>',
+    '%(message)s',
+])
+
+
+LOGLEVELS = {
+    'CRITICAL': logging.CRITICAL,
+    'DEBUG': logging.DEBUG,
+    'WARN': logging.WARN,
+    'INFO': logging.INFO,
+    'ERROR': logging.ERROR,
+}
+
+
+class CRECStager(object):
+    """Downloads the zip for specified date from gpo.gov, unpacks all html files
+    to disk, then uploads each one to S3.
+
+    Args:
+        date (:class:`datetime.datetime`): Date of records to download.
+        zip_download_dir (:obj:`str`): A directory to download and unpack the
+            CREC zip.
+        s3_bucket (:obj:`str`): The name of an S3 bucket to stage unpacked html
+            files in.
+        s3_key_prefix (:obj:`str`): The prefix is prepended to each html
+            filename to create the S3 key to upload it to.
+
+    Attributes:
+        CREC_ZIP_TEMPLATE (:obj:`str`): The endpoint template for a CREC zip.
+    """
+
+    CREC_ZIP_TEMPLATE = 'https://www.gpo.gov/fdsys/pkg/CREC-%Y-%m-%d.zip'
+
+    def __init__(self, date, zip_download_dir, s3_bucket, s3_key_prefix):
+        self.date = date
+        self.zip_download_dir = zip_download_dir
+        self.s3_bucket = s3_bucket
+        self.s3_key_prefix = s3_key_prefix
+        self.s3 = boto3.client('s3')
+
+    def download_crec_zip(self):
+        """Downloads the CREC zip for this date.
+
+        Returns:
+            :obj:`str`: The path to the downloaded zip.
+        """
+        url = self.date.strftime(self.CREC_ZIP_TEMPLATE)
+        logging.info('Downloading CREC zip from "{0}".'.format(url))
+        try:
+            response = urllib2.urlopen(url)
+        except urllib2.URLError as e:
+            if e.getcode() == 404:
+                logging.debug('No zip found for date {0}'.format(self.date))
+                return None
+        zip_path = os.path.join(self.zip_download_dir, url.split('/')[-1])
+        zip_data = response.read()
+        with open(zip_path, 'wb') as f:
+            f.write(zip_data)
+        return zip_path
+
+    def extract_html_files(self, zip_path):
+        """Unpacks all html files in the zip at the provided path to the value
+        set in the instance variable ``CRECStager.zip_download_dir``.
+
+        Args:
+            zip_path (:obj:`str`): Path to the CREC zip file.
+
+        Returns:
+            :obj:`list` of :obj:`str`: A list of the unpacked html files.
+        """
+        zip_filename = os.path.splitext(os.path.basename(zip_path))[0]
+        html_prefix = os.path.join(zip_filename, 'html')
+        html_filenames = []
+        with ZipFile(zip_path) as crec_zip:
+            for f in crec_zip.filelist:
+                if f.filename.startswith(html_prefix):
+                    html_filenames.append(f.filename)
+                    crec_zip.extract(f, self.zip_download_dir)
+        return [
+            os.path.join(self.zip_download_dir, fname)
+            for fname in html_filenames
+        ]
+
+    def upload_to_s3(self, file_path):
+        """Uploads the file at the provided path to s3. The s3 key is
+        generated from the date, the original filename, and the s3_key_prefix.
+
+        Args:
+            file_path (:obj:`str`): Path to html file.
+
+        Returns:
+            :obj:`str`: The S3 key the file was uploaded to.
+        """
+        s3_key = os.path.join(
+            self.s3_key_prefix,
+            self.date.strftime('%Y/%m/%d'),
+            os.path.basename(file_path),
+        )
+        with open(file_path) as html_file:
+            logging.debug(
+                'Uploading "{0}" to "s3://{1}/{2}".'.format(
+                    file_path, self.s3_bucket, s3_key
+                )
+            )
+            self.s3.put_object(
+                Body=html_file, Bucket=self.s3_bucket, Key=s3_key
+            )
+        return s3_key
+
+    def stage_html_files(self):
+        """Main entry point to staging process. Downloads the CREC zip for this
+        date, unpacks all HTML files to disk, then uploads each one to S3.
+
+        Returns:
+            :obj:`bool`: True if all uploads were successful, False otherwise.
+        """
+        zip_path = self.download_crec_zip()
+        if zip_path is None:
+            logging.info('No zip found for date {0}'.format(self.dt))
+            return None
+        logging.info(
+            'Extracting html files from zip to {0}'.format(self.zip_download_dir)
+        )
+        html_file_paths = self.extract_html_files(zip_path)
+        logging.info('Uploading {0} html files...'.format(len(html_file_paths)))
+        for file_path in html_file_paths:
+            try:
+                s3_key = self.upload_to_s3(file_path)
+            except ClientError as e:
+                logging.exception(
+                    'Error uploading .htm file {0}, exiting'.format(file_path, e)
+                )
+                return False
+        logging.info('Uploads finished.')
+        return True
+
+
+def lambda_handler(event, context):
+    """Entry point for AWS Lambda execution.
+
+    In addition to the arguments specified below, this function also gets some
+    settings from the following environment variables (set through the AWS
+    console):
+        LOGLEVEL
+            loglevel for logging to cloudwatch
+        ZIP_DOWNLOAD_DIR
+            what directory to download and unpack CREC zips. Must be under
+            ``/tmp`` as everything else is write protected in lambda.
+        S3_TARGET_BUCKET
+            what s3 bucket to upload unpacked html files to.
+
+    Args:
+        event (:obj:`dict`): A dictionary containg data from event trigger.
+        context (:obj:`dict`): Context settings for this lambda job.
+    """
+    logger = logging.getLogger()
+    logger.setLevel(os.environ.get('LOGLEVEL', 'INFO'))
+    formatter = logging.Formatter(DEFAULT_LOG_FORMAT)
+    zip_download_dir = os.environ.get('ZIP_DOWNLOAD_DIR', '/tmp')
+    s3_bucket = os.environ.get('S3_TARGET_BUCKET')
+    if not s3_bucket:
+        raise Exception('No s3 bucket defined in $S3_TARGET_BUCKET.')
+    s3_key_prefix = os.environ.get('S3_KEY_PREFIX', 'capitolwords/')
+    crec_stager = CRECStager(
+        datetime.utcnow() - timedelta(days=1),
+        zip_download_dir,
+        s3_bucket,
+        s3_key_prefix
+    )
+    crec_stager.stage_html_files()
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        '--date',
+        help='Date to retrieve records for, format is YYYY-MM-DD.',
+        type=lambda d: datetime.strptime(d, '%Y-%m-%d'),
+    )
+    parser.add_argument(
+        '--s3_bucket',
+        help='Bucket to upload html files to.',
+        default='use-this-bucket-to-test-your-bullshit',
+    )
+    parser.add_argument(
+        '--s3_key_prefix',
+        help='Key prefix for the html files staged in S3.',
+        default='capitolwords/',
+    )
+    parser.add_argument(
+        '--zip_download_dir',
+        help='Directory to write the zip and extracted files to.',
+        default='/tmp'
+    )
+    parser.add_argument(
+        '--loglevel',
+        help='Log level, one of INFO, ERROR, WARN, DEBUG or CRITICAL.',
+        default='INFO',
+    )
+    args = parser.parse_args()
+    loglevel = LOGLEVELS.get(args.loglevel.upper())
+    if loglevel is None:
+        loglevel = LOGLEVELS['INFO']
+    logger = logging.getLogger()
+    logger.setLevel(loglevel)
+    formatter = logging.Formatter(DEFAULT_LOG_FORMAT)
+    console_handler = logging.StreamHandler(stream=sys.stdout)
+    console_handler.setLevel(loglevel)
+    console_handler.setFormatter(formatter)
+    logger.addHandler(console_handler)
+    if args.date:
+        dt = args.date
+    else:
+        dt = datetime.utcnow() - timedelta(days=1)
+    if not os.path.exists(args.zip_download_dir):
+        os.makedirs(args.zip_download_dir)
+    crec_stager = CRECStager(
+        dt,
+        zip_download_dir,
+        s3_bucket,
+        s3_key_prefix
+    )
+    crec_stager.stage_html_files()

From ec2ea4f5422f52148be3f0e4c1a13517ff441398 Mon Sep 17 00:00:00 2001
From: William Horning <will.horning@chartbeat.com>
Date: Fri, 17 Feb 2017 15:55:24 -0500
Subject: [PATCH 2/3] [crec_stager] added support for downloading mods.xml file

---
 crec_stager/crec_stager.py | 93 ++++++++++++++++++++++++++------------
 1 file changed, 65 insertions(+), 28 deletions(-)

diff --git a/crec_stager/crec_stager.py b/crec_stager/crec_stager.py
index e6e4cc3..5e2fa9e 100644
--- a/crec_stager/crec_stager.py
+++ b/crec_stager/crec_stager.py
@@ -29,6 +29,7 @@
 import boto3
 from botocore.exceptions import ClientError
 
+
 DEFAULT_LOG_FORMAT = ' '.join([
     '%(asctime)s',
     '%(levelname)s',
@@ -53,7 +54,7 @@ class CRECStager(object):
 
     Args:
         date (:class:`datetime.datetime`): Date of records to download.
-        zip_download_dir (:obj:`str`): A directory to download and unpack the
+        download_dir (:obj:`str`): A directory to download and unpack the
             CREC zip.
         s3_bucket (:obj:`str`): The name of an S3 bucket to stage unpacked html
             files in.
@@ -65,10 +66,11 @@ class CRECStager(object):
     """
 
     CREC_ZIP_TEMPLATE = 'https://www.gpo.gov/fdsys/pkg/CREC-%Y-%m-%d.zip'
+    MODS_ZIP_TEMPLATE = 'https://www.gpo.gov/fdsys/pkg/CREC-%Y-%m-%d/mods.xml'
 
-    def __init__(self, date, zip_download_dir, s3_bucket, s3_key_prefix):
+    def __init__(self, date, download_dir, s3_bucket, s3_key_prefix):
         self.date = date
-        self.zip_download_dir = zip_download_dir
+        self.download_dir = download_dir
         self.s3_bucket = s3_bucket
         self.s3_key_prefix = s3_key_prefix
         self.s3 = boto3.client('s3')
@@ -87,15 +89,38 @@ def download_crec_zip(self):
             if e.getcode() == 404:
                 logging.debug('No zip found for date {0}'.format(self.date))
                 return None
-        zip_path = os.path.join(self.zip_download_dir, url.split('/')[-1])
+        zip_path = os.path.join(self.download_dir, url.split('/')[-1])
         zip_data = response.read()
         with open(zip_path, 'wb') as f:
             f.write(zip_data)
         return zip_path
 
+    def download_mods_xml(self):
+        """Downloads the mods.xml metadata file for this date to download_dir.
+
+        Returns:
+            :obj:`str`: Path to the downloaded mods.xml file, or None on a 404.
+        """
+        url = self.date.strftime(self.MODS_ZIP_TEMPLATE)
+        logging.info('Downloading mods.xml from "{0}".'.format(url))
+        try:
+            response = urllib2.urlopen(url)
+        except urllib2.HTTPError as e:
+            if e.code != 404:
+                raise  # only a missing file is expected; surface the rest
+            logging.debug(
+                'No mods.xml found for date {0}, at "{1}"'.format(self.date, url)
+            )
+            return None
+        data = response.read()
+        mods_path = os.path.join(self.download_dir, 'mods.xml')
+        with open(mods_path, 'wb') as f:
+            f.write(data)
+        return mods_path
+
     def extract_html_files(self, zip_path):
         """Unpacks all html files in the zip at the provided path to the value
-        set in the instance variable ``CRECStager.zip_download_dir``.
+        set in the instance variable ``CRECStager.download_dir``.
 
         Args:
             zip_path (:obj:`str`): Path to the CREC zip file.
@@ -110,18 +135,19 @@ def extract_html_files(self, zip_path):
             for f in crec_zip.filelist:
                 if f.filename.startswith(html_prefix):
                     html_filenames.append(f.filename)
-                    crec_zip.extract(f, self.zip_download_dir)
+                    crec_zip.extract(f, self.download_dir)
         return [
-            os.path.join(self.zip_download_dir, fname)
+            os.path.join(self.download_dir, fname)
             for fname in html_filenames
         ]
 
-    def upload_to_s3(self, file_path):
+    def upload_to_s3(self, file_path, data_type):
         """Uploads the file at the provided path to s3. The s3 key is
         generated from the date, the original filename, and the s3_key_prefix.
 
         Args:
             file_path (:obj:`str`): Path to html file.
+            data_type (:obj:`str`): One of "crec" or "mods", used in s3 key.
 
         Returns:
             :obj:`str`: The S3 key the file was uploaded to.
@@ -129,43 +155,53 @@ def upload_to_s3(self, file_path):
         s3_key = os.path.join(
             self.s3_key_prefix,
             self.date.strftime('%Y/%m/%d'),
+            data_type,
             os.path.basename(file_path),
         )
-        with open(file_path) as html_file:
+        with open(file_path) as f:
             logging.debug(
                 'Uploading "{0}" to "s3://{1}/{2}".'.format(
                     file_path, self.s3_bucket, s3_key
                 )
             )
             self.s3.put_object(
-                Body=html_file, Bucket=self.s3_bucket, Key=s3_key
+                Body=f, Bucket=self.s3_bucket, Key=s3_key
             )
         return s3_key
 
-    def stage_html_files(self):
+    def stage_files(self):
         """Main entry point to staging process. Downloads the CREC zip for this
-        date, unpacks all HTML files to disk, then uploads each one to S3.
+        date, unpacks all HTML files to disk, downloads the mods.xml metadata
+        file, and uploads that and the unpacked HTML files.
 
         Returns:
             :obj:`bool`: True if all uploads were successful, False otherwise.
         """
         zip_path = self.download_crec_zip()
+        mods_path = self.download_mods_xml()
         if zip_path is None:
             logging.info('No zip found for date {0}'.format(self.dt))
             return None
         logging.info(
-            'Extracting html files from zip to {0}'.format(self.zip_download_dir)
+            'Extracting html files from zip to {0}'.format(self.download_dir)
         )
         html_file_paths = self.extract_html_files(zip_path)
         logging.info('Uploading {0} html files...'.format(len(html_file_paths)))
         for file_path in html_file_paths:
             try:
-                s3_key = self.upload_to_s3(file_path)
+                s3_key = self.upload_to_s3(file_path, 'crec')
             except ClientError as e:
                 logging.exception(
-                    'Error uploading .htm file {0}, exiting'.format(file_path, e)
+                    'Error uploading file {0}, exiting'.format(file_path, e)
                 )
                 return False
+        try:
+            s3_key = self.upload_to_s3(mods_path, 'mods')
+        except ClientError as e:
+            logging.exception(
+                'Error uploading file {0}, exiting'.format(mods_path, e)
+            )
+            return False
         logging.info('Uploads finished.')
         return True
 
@@ -178,9 +214,10 @@ def lambda_handler(event, context):
     console):
         LOGLEVEL
             loglevel for logging to cloudwatch
-        ZIP_DOWNLOAD_DIR
-            what directory to download and unpack CREC zips. Must be under
-            ``/tmp`` as everything else is write protected in lambda.
+        DOWNLOAD_DIR
+            what directory to download and unpack CREC zips and the mods.xml
+            file. Must be under ``/tmp`` when running in lambda as everything
+            else is write protected.
         S3_TARGET_BUCKET
             what s3 bucket to upload unpacked html files to.
 
@@ -191,18 +228,18 @@ def lambda_handler(event, context):
     logger = logging.getLogger()
     logger.setLevel(os.environ.get('LOGLEVEL', 'INFO'))
     formatter = logging.Formatter(DEFAULT_LOG_FORMAT)
-    zip_download_dir = os.environ.get('ZIP_DOWNLOAD_DIR', '/tmp')
+    download_dir = os.environ.get('DOWNLOAD_DIR', '/tmp')
     s3_bucket = os.environ.get('S3_TARGET_BUCKET')
     if not s3_bucket:
         raise Exception('No s3 bucket defined in $S3_TARGET_BUCKET.')
     s3_key_prefix = os.environ.get('S3_KEY_PREFIX', 'capitolwords/')
     crec_stager = CRECStager(
         datetime.utcnow() - timedelta(days=1),
-        zip_download_dir,
+        download_dir,
         s3_bucket,
         s3_key_prefix
     )
-    crec_stager.stage_html_files()
+    crec_stager.stage_files()
 
 
 if __name__ == '__main__':
@@ -223,7 +260,7 @@ def lambda_handler(event, context):
         default='capitolwords/',
     )
     parser.add_argument(
-        '--zip_download_dir',
+        '--download_dir',
         help='Directory to write the zip and extracted files to.',
         default='/tmp'
     )
@@ -247,12 +284,12 @@ def lambda_handler(event, context):
         dt = args.date
     else:
         dt = datetime.utcnow() - timedelta(days=1)
-    if not os.path.exists(args.zip_download_dir):
-        os.makedirs(args.zip_download_dir)
+    if not os.path.exists(args.download_dir):
+        os.makedirs(args.download_dir)
     crec_stager = CRECStager(
         dt,
-        zip_download_dir,
-        s3_bucket,
-        s3_key_prefix
+        args.download_dir,
+        args.s3_bucket,
+        args.s3_key_prefix
     )
-    crec_stager.stage_html_files()
+    crec_stager.stage_files()

From 4e363101bf715fddcd72dbd682f0d1b7c0ac12e0 Mon Sep 17 00:00:00 2001
From: William Horning <will.horning@chartbeat.com>
Date: Mon, 20 Feb 2017 20:54:44 -0500
Subject: [PATCH 3/3] [crec_parser] mods.xml loader

---
 crec_stager/crec_stager.py |  28 +++++---
 parser/new_parser.py       | 128 +++++++++++++++++++++++++++++++++++++
 2 files changed, 147 insertions(+), 9 deletions(-)
 create mode 100644 parser/new_parser.py

diff --git a/crec_stager/crec_stager.py b/crec_stager/crec_stager.py
index 5e2fa9e..620d9d1 100644
--- a/crec_stager/crec_stager.py
+++ b/crec_stager/crec_stager.py
@@ -180,12 +180,22 @@ def stage_files(self):
         zip_path = self.download_crec_zip()
         mods_path = self.download_mods_xml()
         if zip_path is None:
-            logging.info('No zip found for date {0}'.format(self.dt))
+            logging.info('No zip found for date {0}'.format(self.date))
+            return None
+        if mods_path is None:
+            logging.info('No mods.xml found for date {0}'.format(self.date))
             return None
         logging.info(
             'Extracting html files from zip to {0}'.format(self.download_dir)
         )
         html_file_paths = self.extract_html_files(zip_path)
+        try:
+            s3_key = self.upload_to_s3(mods_path, 'mods')
+        except ClientError as e:
+            logging.exception(
+                'Error uploading file {0}, exiting'.format(mods_path, e)
+            )
+            return False
         logging.info('Uploading {0} html files...'.format(len(html_file_paths)))
         for file_path in html_file_paths:
             try:
@@ -195,13 +205,6 @@ def stage_files(self):
                     'Error uploading file {0}, exiting'.format(file_path, e)
                 )
                 return False
-        try:
-            s3_key = self.upload_to_s3(mods_path, 'mods')
-        except ClientError as e:
-            logging.exception(
-                'Error uploading file {0}, exiting'.format(mods_path, e)
-            )
-            return False
         logging.info('Uploads finished.')
         return True
 
@@ -220,6 +223,8 @@ def lambda_handler(event, context):
             else is write protected.
         S3_TARGET_BUCKET
             what s3 bucket to upload unpacked html files to.
+        DATE
+            date to stage, as YYYY-MM-DD; defaults to yesterday (UTC).
 
     Args:
         event (:obj:`dict`): A dictionary containg data from event trigger.
@@ -229,12 +234,17 @@ def lambda_handler(event, context):
     logger.setLevel(os.environ.get('LOGLEVEL', 'INFO'))
     formatter = logging.Formatter(DEFAULT_LOG_FORMAT)
     download_dir = os.environ.get('DOWNLOAD_DIR', '/tmp')
+    date_str = os.environ.get('DATE', None)
+    if date_str is None:
+        date = datetime.utcnow() - timedelta(days=1)
+    else:
+        date = datetime.strptime(date_str, '%Y-%m-%d')
     s3_bucket = os.environ.get('S3_TARGET_BUCKET')
     if not s3_bucket:
         raise Exception('No s3 bucket defined in $S3_TARGET_BUCKET.')
     s3_key_prefix = os.environ.get('S3_KEY_PREFIX', 'capitolwords/')
     crec_stager = CRECStager(
-        datetime.utcnow() - timedelta(days=1),
+        date,
         download_dir,
         s3_bucket,
         s3_key_prefix
diff --git a/parser/new_parser.py b/parser/new_parser.py
new file mode 100644
index 0000000..0d9985b
--- /dev/null
+++ b/parser/new_parser.py
@@ -0,0 +1,128 @@
+from __future__ import print_function
+
+import argparse
+import logging
+from datetime import datetime
+from datetime import timedelta
+
+import boto3
+import xmltodict
+from botocore.exceptions import ClientError
+
+
+CMD_LINE_DATE_FORMAT = '%Y-%m-%d'
+
+
+def lambda_handler(event, context):
+    pass
+
+
+class CRECParser(object):
+
+    MODS_S3_KEY_BASE_TEMPLATE = '{prefix}/%Y/%m/%d/mods/mods.xml'
+
+    def __init__(self, s3_bucket, s3_prefix='capitolwords'):
+        self.s3_bucket = s3_bucket
+        self.s3_prefix = s3_prefix
+        self.mods_s3_key_template = self.MODS_S3_KEY_BASE_TEMPLATE.format(
+            prefix=self.s3_prefix
+        )
+        self.s3 = boto3.client('s3')
+        self.mods = None
+        self.crec = None
+
+    def load_mods_from_s3(self, dt=None):
+        if dt is None:
+            dt = datetime.utcnow() - timedelta(days=1)
+        mods_s3_key = dt.strftime(self.mods_s3_key_template)
+        logging.info('Reading mods.xml file from "{0}".'.format(mods_s3_key))
+        response = self.s3.get_object(
+            Bucket=self.s3_bucket,
+            Key=mods_s3_key
+        )
+        self.mods = xmltodict.parse(response['Body'].read())['mods']
+        return self.mods
+
+    def load_mods_from_disk(self, filepath):
+        doc = None
+        with open(filepath) as f:
+            raw_data = f.read()
+            doc = xmltodict.parse(raw_data)
+        self.mods = doc['mods']
+        return self.mods
+
+    def load_crec_from_s3(self, crec_s3_key):
+        response = self.s3.get_object(
+            Bucket=self.s3_bucket,
+            Key=crec_s3_key,
+        )
+        self.crec = response['Body'].read()
+        return self.crec
+
+    def load_crec_from_disk(self, crec_path):
+        with open(crec_path) as f:
+            self.crec = f.read()
+        return self.crec
+
+    def get_crec_description(self, crec_id):
+        """Return the mods ``relatedItem`` entries whose ``@ID`` is crec_id."""
+        if self.mods is None:
+            raise Exception('Mods file must be loaded first.')
+        items = self.mods.get('relatedItem') or []
+        if isinstance(items, dict):  # xmltodict yields a dict for a lone child
+            items = [items]
+        return [item for item in items if item.get('@ID') == crec_id]
+
+def main():
+    parser = CRECParser(
+        'use-this-bucket-to-test-your-bullshit',
+        'capitolwords',
+    )
+    dt = datetime(2017, 2, 15)
+    parser.load_mods_from_s3(dt=dt)
+    crec_id = 'id-CREC-2017-02-15-pt1-PgD160'
+    data = parser.get_crec_description(crec_id)
+    return data
+
+if __name__ == '__main__':
+    parser = CRECParser(
+        'use-this-bucket-to-test-your-bullshit',
+        'capitolwords',
+    )
+    dt = datetime(2017, 2, 15)
+    parser.load_mods_from_s3(dt=dt)
+
+# capitolwords/2017/02/15/CREC-2017-02-15-pt1-PgD160.htm
+
+    # parser = argparse.ArgumentParser()
+    # parser.add_argument(
+    #     '--mods_path',
+    #     help='S3 key or local file path to the mods.xml file for this date.',
+    # )
+    # parser.add_argument(
+    #     '--date',
+    #     help='Use the mods.xml file in S3 for this date.',
+    #     type=lambda d: datetime.strptime(d, CMD_LINE_DATE_FORMAT),
+    # )
+    # parser.add_argument(
+    #     '--crec_path',
+    #     help='S3 key or local file path to crec .html file to parse.',
+    # )
+    # parser.add_argument(
+    #     '--pg_host',
+    #     help='Hostname for postgres database.',
+    #     default='localhost',
+    # )
+    # parser.add_argument(
+    #     '--pg_port',
+    #     help='Port for postgres database.',
+    #     default=5432
+    # )
+    # parser.add_argument(
+    #     '--pg_user',
+    #     help='Postgres user name.'
+    # )
+    # parser.add_argument(
+    #     '--pg_password',
+    #     help='Postgres password.'
+    # )