Skip to content

Commit e71392b

Browse files
authored
Dev/issue 41 (#42)
* replace logging.warn (deprecated) with logging.warning
* cosmetic changes/PEP-8
* Fix #41
* retain backwards compatibility
* retain backwards compatibility (for 11 more days lol)
* increment version number
* SSOT for version
1 parent ad6b222 commit e71392b

5 files changed

Lines changed: 46 additions & 31 deletions

File tree

congressionalrecord/__init__.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1,4 @@
1-
__version__ = '2.0.1'
1+
import pkg_resources # part of setuptools
2+
VERSION = pkg_resources.require("congressionalrecord")[0].version
3+
4+
__version__ = VERSION

congressionalrecord/govinfo/cr_parser.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
import logging
1111
import itertools
1212

13+
1314
class ParseCRDir(object):
1415

1516
def gen_dir_metadata(self):
@@ -25,7 +26,8 @@ def __init__(self, abspath, **kwargs):
2526
self.mods_path = os.path.join(self.cr_dir,'mods.xml')
2627
self.html_path = os.path.join(self.cr_dir,'html')
2728
self.gen_dir_metadata()
28-
29+
30+
2931
class ParseCRFile(object):
3032
# Some regex
3133
re_time = r'^CREC-(?P<year>[0-9]{4})-(?P<month>[0-9]{2})-(?P<day>[0-9]{2})-.*'
@@ -204,7 +206,7 @@ def gen_file_metadata(self):
204206
if matchobj:
205207
self.doc_title, self.cr_vol, self.cr_num = matchobj.group('title','vol','num')
206208
else:
207-
logging.warn('{0} yields no title, vol, num'.format(
209+
logging.warning('{0} yields no title, vol, num'.format(
208210
self.access_path))
209211
self.doc_title, self.cr_vol, self.cr_num = \
210212
'None','Unknown','Unknown'
@@ -343,7 +345,7 @@ def write_page(self):
343345
itemno += 1
344346
the_content.append(item)
345347
except Exception as e:
346-
logging.warn('{0}'.format(e))
348+
logging.warning('{0}'.format(e))
347349
break
348350

349351
self.crdoc['content'] = the_content

congressionalrecord/govinfo/downloader.py

Lines changed: 29 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,23 @@
11
from __future__ import absolute_import
2+
import os
3+
import json
4+
import logging
5+
import pkg_resources # part of setuptools
6+
import certifi
7+
import urllib3.contrib.pyopenssl
28
#import requests
39
from builtins import str
410
from builtins import object
5-
import certifi
6-
import urllib3.contrib.pyopenssl
7-
urllib3.contrib.pyopenssl.inject_into_urllib3()
811
from urllib3 import PoolManager, Retry, Timeout
9-
import os
1012
from datetime import datetime, date, timedelta
13+
from io import BytesIO
1114
from time import sleep
12-
from zipfile import ZipFile
15+
from zipfile import ZipFile, BadZipfile
1316
from .cr_parser import ParseCRDir, ParseCRFile
14-
import json
1517
from pyelasticsearch import ElasticSearch, bulk_chunks
16-
import logging
18+
19+
urllib3.contrib.pyopenssl.inject_into_urllib3()
20+
VERSION = pkg_resources.require("congressionalrecord")[0].version
1721

1822

1923
class Downloader(object):
@@ -59,7 +63,6 @@ def bulkdownload(self, start, parse=True, **kwargs):
5963
logging.warning('Unexpected condition in bulkdownloader')
6064
day += timedelta(days=1)
6165

62-
6366
def __init__(self, start, **kwargs):
6467
"""
6568
Invoke a Downloader object to get data from
@@ -140,12 +143,10 @@ def __init__(self, start, **kwargs):
140143
return None
141144

142145

143-
144-
145-
146146
class downloadRequest(object):
147147

148-
user_agent = {'user-agent': 'congressional-record 0.0.1 (https://github.com/unitedstates/congressional-record)'}
148+
user_agent = {'user-agent':
149+
'congressional-record {} (https://github.com/unitedstates/congressional-record)'.format(VERSION)}
149150
its_today = datetime.strftime(datetime.today(), '%Y-%m-%d %H:%M')
150151
timeout = Timeout(connect=2.0, read=10.0)
151152
retry = Retry(total=3, backoff_factor=300)
@@ -162,19 +163,26 @@ def __init__(self, url, filename):
162163
r = self.http.request('GET', url)
163164
logging.debug('Request headers received with code {}'.format(r.status))
164165
if r.status == 404:
165-
logging.warn('Received 404, not retrying request.')
166+
logging.warning('Received 404, not retrying request.')
166167
self.status = 404
167168
elif r.status == 200 and r.data:
168-
logging.info('Considering request successful.')
169-
self.binary_content = r.data
170-
self.status = True
169+
logging.info('Considering download request successful.')
170+
logging.info('Sniff sniff: Does this smell like a ZIP file?')
171+
with BytesIO(r.data) as thepackage:
172+
try:
173+
isazip = ZipFile(thepackage)
174+
self.binary_content = r.data
175+
self.status = True
176+
except BadZipfile:
177+
logging.warning('File {} is not a valid ZIP file (BadZipFile)'.format(url))
178+
self.status = False
171179
else:
172-
logging.warn('Unexpected condition, not continuing:\
180+
logging.warning('Unexpected condition, not continuing:\
173181
{}'.format(r.status))
174182
except urllib3.exceptions.MaxRetryError as ce:
175-
logging.warn('Error: %s - Aborting download' % ce)
183+
logging.warning('Error: %s - Aborting download' % ce)
176184
if self.status == False:
177-
logging.warn('Failed to download file {}'.format(url))
185+
logging.warning('Failed to download file {}'.format(url))
178186
elif self.status == 404:
179187
logging.info('downloadRequester skipping file that returned 404.')
180188
elif self.binary_content:
@@ -199,7 +207,7 @@ def download_day(self, day, outpath):
199207
the_download = downloadRequest(the_url, the_filename)
200208
self.status = the_download.status
201209
if self.status == False:
202-
logging.warn("fdsysDL received report that download for {} did not complete.".format(day))
210+
logging.warning("fdsysDL received report that download for {} did not complete.".format(day))
203211
elif self.status == 404:
204212
logging.warning('fdsysDL received 404 report for {}.'.format(day))
205213
else:
@@ -213,6 +221,7 @@ def __init__(self, day, **kwargs):
213221
self.outpath = 'output'
214222
self.download_day(day, self.outpath)
215223

224+
216225
class GovInfoExtract(object):
217226

218227
def __init__(self, day, **kwargs):

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
setup(
44
name='congressionalrecord',
5-
version='2.0.1',
5+
version='2.0.2',
66
description='Parse the U.S. Congressional Record from GovInfo.',
77
url='https://github.com/unitedstates/congressional-record',
88
author='Nick Judd',

tests/test_downloader.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,18 +8,19 @@
88
import re
99
import logging
1010

11-
logging.basicConfig(filename='tests.log',level=logging.DEBUG)
11+
logging.basicConfig(filename='tests.log', level=logging.DEBUG)
12+
1213

1314
class testDownloader(unittest.TestCase):
1415

1516
def test_handle_404(self):
16-
download = dl.Downloader('2015-07-19',do_mode='json')
17-
self.assertEqual(download.status,'downloadFailure')
17+
download = dl.Downloader('2015-07-19', do_mode='json')
18+
self.assertEqual(download.status, 'downloadFailure')
1819

1920
def test_handle_existing(self):
2021
download = dl.Downloader('2005-07-20',do_mode='json')
21-
self.assertIn(download.status,['extractedFilesdeletedZip','existingFiles'])
22+
self.assertIn(download.status, ['extractedFilesdeletedZip', 'existingFiles'])
2223

2324
def test_handle_empty(self):
24-
download = dl.Downloader('2017-01-02',do_mode='json')
25-
self.assertEqual(download.status,'downloadFailure')
25+
download = dl.Downloader('2017-01-02', do_mode='json')
26+
self.assertEqual(download.status, 'downloadFailure')

0 commit comments

Comments
 (0)