11from __future__ import absolute_import
2+ import os
3+ import json
4+ import logging
5+ import pkg_resources # part of setuptools
6+ import certifi
7+ import urllib3 .contrib .pyopenssl
28#import requests
39from builtins import str
410from builtins import object
5- import certifi
6- import urllib3 .contrib .pyopenssl
7- urllib3 .contrib .pyopenssl .inject_into_urllib3 ()
811from urllib3 import PoolManager , Retry , Timeout
9- import os
1012from datetime import datetime , date , timedelta
13+ from io import BytesIO
1114from time import sleep
12- from zipfile import ZipFile
15+ from zipfile import ZipFile , BadZipfile
1316from .cr_parser import ParseCRDir , ParseCRFile
14- import json
1517from pyelasticsearch import ElasticSearch , bulk_chunks
16- import logging
18+
19+ urllib3 .contrib .pyopenssl .inject_into_urllib3 ()
20+ VERSION = pkg_resources .require ("congressionalrecord" )[0 ].version
1721
1822
1923class Downloader (object ):
@@ -59,7 +63,6 @@ def bulkdownload(self, start, parse=True, **kwargs):
5963 logging .warning ('Unexpected condition in bulkdownloader' )
6064 day += timedelta (days = 1 )
6165
62-
6366 def __init__ (self , start , ** kwargs ):
6467 """
6568 Invoke a Downloader object to get data from
@@ -140,12 +143,10 @@ def __init__(self, start, **kwargs):
140143 return None
141144
142145
143-
144-
145-
146146class downloadRequest (object ):
147147
148- user_agent = {'user-agent' : 'congressional-record 0.0.1 (https://github.com/unitedstates/congressional-record)' }
148+ user_agent = {'user-agent' :
149+ 'congressional-record {} (https://github.com/unitedstates/congressional-record)' .format (VERSION )}
149150 its_today = datetime .strftime (datetime .today (), '%Y-%m-%d %H:%M' )
150151 timeout = Timeout (connect = 2.0 , read = 10.0 )
151152 retry = Retry (total = 3 , backoff_factor = 300 )
@@ -162,19 +163,26 @@ def __init__(self, url, filename):
162163 r = self .http .request ('GET' , url )
163164 logging .debug ('Request headers received with code {}' .format (r .status ))
164165 if r .status == 404 :
165- logging .warn ('Received 404, not retrying request.' )
166+ logging .warning ('Received 404, not retrying request.' )
166167 self .status = 404
167168 elif r .status == 200 and r .data :
168- logging .info ('Considering request successful.' )
169- self .binary_content = r .data
170- self .status = True
169+ logging .info ('Considering download request successful.' )
170+ logging .info ('Sniff sniff: Does this smell like a ZIP file?' )
171+ with BytesIO (r .data ) as thepackage :
172+ try :
173+ isazip = ZipFile (thepackage )
174+ self .binary_content = r .data
175+ self .status = True
176+ except BadZipfile :
177+ logging .warning ('File {} is not a valid ZIP file (BadZipFile)' .format (url ))
178+ self .status = False
171179 else :
172- logging .warn ('Unexpected condition, not continuing:\
180+ logging .warning ('Unexpected condition, not continuing:\
173181 {}' .format (r .status ))
174182 except urllib3 .exceptions .MaxRetryError as ce :
175- logging .warn ('Error: %s - Aborting download' % ce )
183+ logging .warning ('Error: %s - Aborting download' % ce )
176184 if self .status == False :
177- logging .warn ('Failed to download file {}' .format (url ))
185+ logging .warning ('Failed to download file {}' .format (url ))
178186 elif self .status == 404 :
179187 logging .info ('downloadRequester skipping file that returned 404.' )
180188 elif self .binary_content :
@@ -199,7 +207,7 @@ def download_day(self, day, outpath):
199207 the_download = downloadRequest (the_url , the_filename )
200208 self .status = the_download .status
201209 if self .status == False :
202- logging .warn ("fdsysDL received report that download for {} did not complete." .format (day ))
210+ logging .warning ("fdsysDL received report that download for {} did not complete." .format (day ))
203211 elif self .status == 404 :
204212 logging .warning ('fdsysDL received 404 report for {}.' .format (day ))
205213 else :
@@ -213,6 +221,7 @@ def __init__(self, day, **kwargs):
213221 self .outpath = 'output'
214222 self .download_day (day , self .outpath )
215223
224+
216225class GovInfoExtract (object ):
217226
218227 def __init__ (self , day , ** kwargs ):
0 commit comments