From 46d9d06361d9764eac438be77c0b4289e550c398 Mon Sep 17 00:00:00 2001 From: root Date: Fri, 26 Feb 2016 18:25:04 -0500 Subject: [PATCH 01/11] GovernmentAttic --- inspectors/governmentattic.py | 246 ++++++++++++++++++++++++++++++++++ inspectors/osc.py | 2 +- 2 files changed, 247 insertions(+), 1 deletion(-) create mode 100644 inspectors/governmentattic.py diff --git a/inspectors/governmentattic.py b/inspectors/governmentattic.py new file mode 100644 index 00000000..cf70278b --- /dev/null +++ b/inspectors/governmentattic.py @@ -0,0 +1,246 @@ +#!/usr/bin/env python + +import datetime +import logging +import os +import re +from urllib.parse import urljoin + +from utils import utils, inspector + +""" +This file is different in that it doesn't scrape an IG's list of public documents, but rather scrapes the largest public +repository of otherwise-secret IG reports that were obtained under FOIA -- governmentattic.org. + +This has the advantage of being a repeatable process that will sweep in PDFs that are otherwise collected +only via manual processes, thanks to the FOIA efforts of the people that run governmentattic.org + +As you might expect, IG reports that can only be obtained via FOIA can be more interesting +and juicy than the ones the government chooses to publish online itself. + +Not all of GovAttic's documents are IG reports. By default, this script pulls in only IG reports, but there is a flag, +IG_REPORTS_ONLY, which can simply be set to False to start pulling in all GovernmentAttic documents. +Additionally, it ignores IG reports that don't map to an IG that oversight.garden already keeps track of. 
+ +FYI, the Internet Archive backed up governmentattic.org in 2014: +https://archive.org/details/governmentattic.org?sort=-publicdate + +-Luke Rosiak +""" + + +IG_REPORTS_ONLY = True + + +def remove_non_ascii(text): + return ''.join(i for i in text if ord(i)<128) + +# +archive = 1930 +#govattic page structure isn't based on year + +# options: +# standard since/year options for a year range to fetch from. +# +# Notes for IG's web team: +# GovernmentAttic's website seems to be hand-coded HTML with no CSS classes, but seems to be updated in a consistent enough way. + +#Landing page where GovAttic breaks government agencies down into several groups +CATEGORIES_URL = 'http://www.governmentattic.org/DocumentsCat.html' + +#The below maps GovAttic's agency descriptors with inspectors-general's slugs. +#It takes the format: ga_category,ga_agency,ig_short,ig_url,ig_slug +#A GovAttic record will be ignored if it doesn't map to an IG that is in this repo. +#This mapping was hand-coded based on this file: +#https://raw.githubusercontent.com/konklone/oversight.garden/master/config/inspectors.json + +GOVATTIC_MAPPING = """Department of Defense Documents,Department of Defense (DoD),Department of Defense,http://www.dodig.mil/,dod +Department of Defense Documents,Office of the Inspector General (OIG),Department of Defense,http://www.dodig.mil/,dod +Department of Defense Documents,Defense Commissary Agency (DeCA),Department of Defense,http://www.dodig.mil/,dod +Department of Defense Documents,Defense Intelligence Agency (DIA),Defense Intelligence Agency,http://www.dia.mil/About/OfficeoftheInspectorGeneral.aspx,dia +Department of Defense Documents,Defense Threat Reduction Agency (DTRA),Department of Defense,http://www.dodig.mil/,dod +Department of Defense Documents,National,Department of Defense,http://www.dodig.mil/,dod +Department of Defense Documents,United States Air Force,Air Force,http://www.af.mil/InspectorGeneralComplaints.aspx,airforce +Department of Defense Documents,United 
States Army,Army,https://www.daig.pentagon.mil/,army +Department of Defense Documents,United States Navy,Navy,http://www.secnav.navy.mil/ig/Pages/Home.aspx,navy +Department of Justice Documents,Department of Justice (DOJ),Department of Justice,https://oig.justice.gov/,doj +Department of Justice Documents,Office of the Inspector General,Department of Justice,https://oig.justice.gov/,doj +Executive Branch Departments A-M,Department of Agriculture (USDA),Department of Agriculture,http://www.usda.gov/oig/,agriculture +Executive Branch Departments A-M,Department of Commerce (DOC),Department of Commerce,https://www.oig.doc.gov/Pages/default.aspx,commerce +Executive Branch Departments A-M,Department of Education (ED),Department of Education,https://www2.ed.gov/about/offices/list/oig/index.html,education +Executive Branch Departments A-M,Department of Energy (DOE),Department of Energy,http://energy.gov/ig/office-inspector-general,energy +Executive Branch Departments A-M,Department of Heath and Human Services (DHHS),Department of Health and Human Services,http://oig.hhs.gov/,hhs +Executive Branch Departments A-M,Department of Homeland Security (DHS),Department of Homeland Security,https://www.oig.dhs.gov/,dhs +Executive Branch Departments A-M,United States Secret Service (USSS),Department of Defense,http://www.dodig.mil/,dod +Executive Branch Departments A-M,Department of Housing and Urban Development (HUD),Department of Housing and Urban Development,https://www.hudoig.gov/,hud +Executive Branch Departments A-M,Department of the Interior (DOI),Department of the Interior,https://www.doioig.gov/,interior +Executive Branch Departments A-M,Department of Labor (DOL),Department of Labor,https://www.oig.dol.gov/,labor +Executive Branch Departments N-Z,Department of State,Department of State,https://oig.state.gov/,state +Executive Branch Departments N-Z,Department of Transportation (DOT),Department of Transportation,https://www.oig.dot.gov/,dot +Executive Branch Departments 
N-Z,Federal Aviation Administration (FAA),Department of Transportation,https://www.oig.dot.gov/,dot +Executive Branch Departments N-Z,Department of the Treasury,Department of the Treasury,http://www.treasury.gov/about/organizational-structure/ig/,treasury +Executive Branch Departments N-Z,Bureau of Engraving and Printing (BEP),Department of the Treasury,http://www.treasury.gov/about/organizational-structure/ig/,treasury +Executive Branch Departments N-Z,Treasury Inspector General for Tax Administration (TIGTA),Treasury IG for Tax Administration,https://www.treasury.gov/tigta/,tigta +Executive Branch Departments N-Z,Department of Veterans Affairs (VA),Department of Veterans Affairs,http://www.va.gov/oig,va +White House Offices,Office of the Director of National Intelligence (ODNI),,, +Legislative Agencies,Architect of the Capitol (AOC),Architect of the Capitol,http://www.aoc.gov/oig/office-inspector-general,architect +Legislative Agencies,Government Accountability Office (GAO),Government Accountability Office,http://www.gao.gov/about/workforce/ig.html,gao +Legislative Agencies,Library of Congress (LOC),Library of Congress,https://www.loc.gov/about/office-of-the-inspector-general/,loc +Independent Federal Agencies A-M,The,,, +Independent Federal Agencies A-M,Central Intelligence Agency (CIA),Central Intelligence Agency,https://www.cia.gov/offices-of-cia/inspector-general,cia +Independent Federal Agencies A-M,Commodity Futures Trading Commission (CFTC),Commodity Futures Trading Commission,http://www.cftc.gov/About/OfficeoftheInspectorGeneral/index.htm,cftc +Independent Federal Agencies A-M,Consumer Product Safety Commission (CPSC),Consumer Product Safety Commission,https://www.cpsc.gov/en/About-CPSC/Inspector-General/,cpsc +Independent Federal Agencies A-M,Corporation for National and Community Service (CNCS),Corporation for National and Community Service,http://www.cncsoig.gov/,cncs +Independent Federal Agencies A-M,Council of Inspectors General on Integrity and 
Efficiency (CIGIE),Council of Inspectors General on Integrity and Efficiency (CIGIE),https://www.ignet.gov/,cigie +Independent Federal Agencies A-M,The Denali Commission,Denali Commission,http://oig.denali.gov/,denali +Independent Federal Agencies A-M,Environmental Protection Agency (EPA),Environmental Protection Agency,http://www.epa.gov/oig,epa +Independent Federal Agencies A-M,Equal Employment Opportunity Commission (EEOC),Equal Employment Opportunity Commission,https://oig.eeoc.gov/,eeoc +Independent Federal Agencies A-M,Export-Import Bank of the United States (Ex-Im Bank),Export-Import Bank,http://www.exim.gov/about/oig,exim +Independent Federal Agencies A-M,Federal Communications Commission (FCC),Federal Communications Commission,https://www.fcc.gov/office-inspector-general,fcc +Independent Federal Agencies A-M,Federal Deposit Insurance Corporation (FDIC),Federal Deposit Insurance Corporation,https://www.fdicig.gov/,fdic +Independent Federal Agencies A-M,Federal Election Commission,Federal Election Commission,http://www.fec.gov/fecig/fecig.shtml,fec +Independent Federal Agencies A-M,Federal Housing Finance Agency (FHFA),Federal Housing Finance Agency,http://fhfaoig.gov/,fhfa +Independent Federal Agencies A-M,Federal Labor Relations Authority (FLRA),Federal Labor Relations Authority,https://www.flra.gov/OIG,flra +Independent Federal Agencies A-M,Federal Reserve System,Federal Reserve/CFPB,https://oig.federalreserve.gov/,fed +Independent Federal Agencies A-M,Federal Trade Commission (FTC),Federal Trade Commission,https://www.ftc.gov/about-ftc/office-inspector-general,ftc +Independent Federal Agencies A-M,General Services Administration (GSA),General Services Administration,https://www.gsaig.gov/,gsa +Independent Federal Agencies N-Z,National Archives and Records Administration (NARA),National Archives,https://www.archives.gov/oig/,archives +Independent Federal Agencies N-Z,National Aeronautics and Space Administration (NASA),NASA,https://oig.nasa.gov/,nasa 
+Independent Federal Agencies N-Z,National Credit Union Administration (NCUA),National Credit Union Administration,http://www.ncua.gov/about/Leadership/Pages/page_oig.aspx,ncua +Independent Federal Agencies N-Z,National Endowment for the Humanities (NEH),National Endowment for the Humanities,http://www.neh.gov/about/oig,neh +Independent Federal Agencies N-Z,National Labor Relations Board (NLRB),National Labor Relations Board,https://www.nlrb.gov/who-we-are/inspector-general,nlrb +Independent Federal Agencies N-Z,National Railroad Passenger Corporation (AMTRAK),Amtrak,https://www.amtrakoig.gov/,amtrak +Independent Federal Agencies N-Z,National Science Foundation (NSF),National Science Foundation,https://www.nsf.gov/oig/,nsf +Independent Federal Agencies N-Z,Nuclear Regulatory Commission (NRC),Nuclear Regulatory Commission,http://www.nrc.gov/insp-gen.html,nrc +Independent Federal Agencies N-Z,Office of Personnel Management (OPM),Office of Personnel Management,https://www.opm.gov/our-inspector-general/,opm +Independent Federal Agencies N-Z,Office of the Special Inspector General for Afghanistan,Special IG for Afghanistan Reconstruction,https://www.sigar.mil/,sigar +Independent Federal Agencies N-Z,Office of the Special Inspector General for Iraq Reconstruction (SIGIR),Special IG for Iraq Reconstruction,http://www.sigir.mil/,sigir +Independent Federal Agencies N-Z,Overseas Private Investment Corporation (OPIC),,, +Independent Federal Agencies N-Z,The Peace Corps,Peace Corps,http://www.peacecorps.gov/about/inspgen/,peacecorps +Independent Federal Agencies N-Z,The Railroad Retirement Board,Railroad Retirement Board,http://www.rrb.gov/oig/,rrb +Independent Federal Agencies N-Z,Securities and Exchange Commission (SEC),Securities and Exchange Commission,http://www.sec.gov/oig,sec +Independent Federal Agencies N-Z,Special Inspector General for the Troubled Asset Relief Program (SIGTARP),Special IG for TARP,https://www.sigtarp.gov/Pages/home.aspx,sigtarp +Independent Federal 
Agencies N-Z,Small Business Administration (SBA),Small Business Administration,https://www.sba.gov/office-of-inspector-general,sba +Independent Federal Agencies N-Z,Social Security Administration (SSA),Social Security Administration,http://oig.ssa.gov,ssa +Independent Federal Agencies N-Z,Tennessee Valley Authority (TVA),Tennessee Valley Authority,http://oig.tva.gov/,tva +Independent Federal Agencies N-Z,US Agency for International Development (USAID),U.S. Agency for International Development,https://oig.usaid.gov/,usaid +Independent Federal Agencies N-Z,US Postal Service (USPS),U.S. Postal Service,https://uspsoig.gov/,usps +Government Corporations,Legal Services Corporation,Legal Services Corporation,https://www.oig.lsc.gov/,lsc +Government Corporations,Corporation for National and Community Service (CNCS),Corporation for National and Community Service,http://www.cncsoig.gov/,cncs +Government Corporations,Pension Benefit Guaranty Corporation (PBGC),Pension Benefit Guaranty Corporation,http://oig.pbgc.gov/,pbgc +State Records / Miscellaneous Records / Interagency Records,Records of State/CITY Agencies,,, +State Records / Miscellaneous Records / Interagency Records,Miscellaneous Records,,, +State Records / Miscellaneous Records / Interagency Records,Smithsonian Institution (SI),Smithsonian Institute,http://www.si.edu/OIG,smithsonian""" + +#store this as tuples above for ease of editing, but turn it into a dict for use. +GOVATTIC_MAPPING_DICT = {} +for line in GOVATTIC_MAPPING.splitlines(): + (ga_category,ga_agency,ig_short,ig_url,ig_slug) = line.strip().split(',') + GOVATTIC_MAPPING_DICT[(ga_category,ga_agency)] = (ig_short,ig_url,ig_slug) + +def remove_linebreaks(s): + #lots of weird tabs, etc. inside HTML strings. 
would replace all at once, but since utils.beautifulsoup_from_url + #is taking the html straight to soup, we'll do it individually for the fields we need + return remove_non_ascii(s.replace('\n','').replace('\t','').replace('\r','')) #.encode('ascii','ignore') + +def run(options): + year_range = inspector.year_range(options, archive) + + #loop through sections (Executive Branch Departments A-M, etc) + category_doc = utils.beautifulsoup_from_url(CATEGORIES_URL) + category_links = category_doc.findAll('a') + for category_link in category_links: + #these are the detail pages with lots of PDFs. they are grouped according to agency name + category_name = remove_linebreaks(category_link.text).strip() + doc = utils.beautifulsoup_from_url(category_link['href']) + agency = '' + for result in doc.findAll('p'): + if result.font and result.font.get('color')=="#993333": + #this is an agency name + agency = remove_linebreaks(result.font.text).strip() + else: + #this is a report from that agency + report = report_from(result, category_name, agency, year_range) + if report: + inspector.save_report(report) + + +#all dates successfully parse as of this writing, but it's a hand-coded HTML site, +#so it's possible one may not have a valid date in the future. use a set-in-time +#default as a fallback so we get a consistent year-based slug for it. +DEFAULT_DATE = datetime.datetime(2015,11,1,1,1,1) + +# extract a dict of details that are ready for inspector.save_report(). +def report_from(result, category_name, agency, year_range): + + #ignore if it's not in our agency string->slug mapping or if it's in our mapping and has null instead of a slug. + #that means it doesn't come from an agency whose IG we track; it may be a document from a + #local government, etc. 
+ if (category_name,agency) not in GOVATTIC_MAPPING_DICT or GOVATTIC_MAPPING_DICT[(category_name,agency)][-1]=='': + return + (ig_short,ig_url,ig_slug) = GOVATTIC_MAPPING_DICT[(category_name,agency)] + + a = result.find('a') + if not a: + if result.p and result.p.font and result.p.font.find('a'): + a = result.p.font.find('a') + if not a: + #there's no link, so this must just be some explanatory text, such as the footer + return + report_url = a['href'] + + text = remove_linebreaks(result.text) + r = re.compile('\[.*\s(\d{2})-+(\w{3,12})-+(\d{4})') + datematch = r.search(text) + published_on = None + datestring = None + if datematch: + datestring = '-'.join(datematch.groups()) #'01-Mar-2015 + try: + published_on = datetime.datetime.strptime(datestring, '%d-%b-%Y') + except: + published_on = None + if not published_on: + try: + published_on = datetime.datetime.strptime(datestring, '%d-%B-%Y') + except: + published_on = None + if not published_on: + logging.debug("[%s] Can't parse date %s, using default date." % (report_url,datestring)) + published_on = DEFAULT_DATE + + title = remove_linebreaks(a.text).strip() + + if published_on.year not in year_range: + logging.debug("[%s] Skipping, not in requested range." % report_url) + return + + #ignore documents that are interesting FOIAs but are not IG reports. + #if you want to scrape IG and agency documents, set IG_REPORTS_ONLY=False + if IG_REPORTS_ONLY and 'OIG' not in title and 'inspector general' not in title.lower(): + logging.debug("[%s] Skipping, not an IG report." % title) + return + + #these will be stored in folders with documents scraped by the official IG scrapers, so + #use the governmentattic url as slug to assure no conflict. 
+ report_id = inspector.slugify(report_url.replace('http://www.','')) + + + report = { + 'inspector': ig_slug, # Store these with their natively-scraped counterparts, not in a govattic-specific place + 'inspector_url': ig_url, + 'agency': ig_slug, # Agency and IG slug will be the same + 'agency_name': ig_short, # Take short name of the IG as the agency name. I think this should work. + 'report_id': report_id, + 'url': report_url, + 'title': title, + 'type': 'FOIA - GovernmentAttic.org', # Type of report (default to 'other') + 'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d") #date published to GovAttic, not released by IG + } + + return report + +utils.run(run) if (__name__ == "__main__") else None + + + diff --git a/inspectors/osc.py b/inspectors/osc.py index fc8a26d4..3ec2f466 100644 --- a/inspectors/osc.py +++ b/inspectors/osc.py @@ -180,7 +180,7 @@ def report_from(result, year, year_range,OUTCOME_CODES): def make_report_id(url): - return url.replace('/PublicFiles/','').replace('/publicfiles/','').replace('/','-').replace('.pdf','') + return inspector.slugify(url.replace('/PublicFiles/','').replace('/publicfiles/','').replace('.pdf','')) def get_extra_descrip(pdf_link): #takes a beautifulsoup object representing a PDF and tries to find the blurb of text that may be right above it. 
From bc55d3e1b1c49d2277089c3dc401c8a726a2ea2d Mon Sep 17 00:00:00 2001 From: root Date: Fri, 26 Feb 2016 18:30:42 -0500 Subject: [PATCH 02/11] GovernmentAttic --- inspectors/governmentattic.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/inspectors/governmentattic.py b/inspectors/governmentattic.py index cf70278b..0b118c52 100644 --- a/inspectors/governmentattic.py +++ b/inspectors/governmentattic.py @@ -2,9 +2,7 @@ import datetime import logging -import os import re -from urllib.parse import urljoin from utils import utils, inspector From 6fc58d385f8621af8519edd3c94907bc913f5bae Mon Sep 17 00:00:00 2001 From: David Cook Date: Tue, 5 Apr 2016 18:13:02 -0500 Subject: [PATCH 03/11] [governmentattic] chmod a+x --- inspectors/governmentattic.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 inspectors/governmentattic.py diff --git a/inspectors/governmentattic.py b/inspectors/governmentattic.py old mode 100644 new mode 100755 From 2da6d41fe7a6282178982367f420047ff6d096be Mon Sep 17 00:00:00 2001 From: David Cook Date: Tue, 5 Apr 2016 18:26:35 -0500 Subject: [PATCH 04/11] [governmentattic] Fix mojibake governmentattic.org doesn't provide an encoding in HTTP headers, and requests can't parse <meta> tags inside the body of a document. Thus, we explicitly decode these pages as utf-8 with a special case for governmentattic.org. --- inspectors/governmentattic.py | 5 +---- inspectors/utils/utils.py | 6 ++++++ 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/inspectors/governmentattic.py b/inspectors/governmentattic.py index 0b118c52..ef8235b0 100755 --- a/inspectors/governmentattic.py +++ b/inspectors/governmentattic.py @@ -30,9 +30,6 @@ IG_REPORTS_ONLY = True -def remove_non_ascii(text): - return ''.join(i for i in text if ord(i)<128) - # archive = 1930 #govattic page structure isn't based on year @@ -139,7 +136,7 @@ def remove_non_ascii(text): def remove_linebreaks(s): #lots of weird tabs, etc. inside HTML strings. 
would replace all at once, but since utils.beautifulsoup_from_url #is taking the html straight to soup, we'll do it individually for the fields we need - return remove_non_ascii(s.replace('\n','').replace('\t','').replace('\r','')) #.encode('ascii','ignore') + return inspector.sanitize(s.replace('\n','').replace('\t','').replace('\r','')) def run(options): year_range = inspector.year_range(options, archive) diff --git a/inspectors/utils/utils.py b/inspectors/utils/utils.py index 76ec05b9..6ab3e5fc 100644 --- a/inspectors/utils/utils.py +++ b/inspectors/utils/utils.py @@ -263,6 +263,12 @@ def download(url, destination=None, options=None): log_http_error(e, url) return None + # Special case handling for governmentattic.org: + # These pages are served without an encoding in the HTTP headers, + # and with utf-8 specified in a <meta> tag inside the document. + if url.startswith("http://www.governmentattic.org"): + response.encoding = "utf-8" + body = response.text if not isinstance(body, str): raise ValueError("Content not decoded.") From e1273a6ce64807326244a68da5aa31a1d0f302b4 Mon Sep 17 00:00:00 2001 From: David Cook Date: Tue, 5 Apr 2016 18:34:10 -0500 Subject: [PATCH 05/11] [governmentattic] Log missing dates, no default --- inspectors/governmentattic.py | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/inspectors/governmentattic.py b/inspectors/governmentattic.py index ef8235b0..c39a3194 100755 --- a/inspectors/governmentattic.py +++ b/inspectors/governmentattic.py @@ -160,11 +160,6 @@ def run(options): inspector.save_report(report) -#all dates successfully parse as of this writing, but it's a hand-coded HTML site, -#so it's possible one may not have a valid date in the future. use a set-in-time -#default as a fallback so we get a consistent year-based slug for it. -DEFAULT_DATE = datetime.datetime(2015,11,1,1,1,1) - # extract a dict of details that are ready for inspector.save_report(). 
def report_from(result, category_name, agency, year_range): @@ -184,6 +179,11 @@ def report_from(result, category_name, agency, year_range): return report_url = a['href'] + #these will be stored in folders with documents scraped by the official IG scrapers, so + #use the governmentattic url as slug to assure no conflict. + report_id = inspector.slugify(report_url.replace('http://www.','')) + + title = remove_linebreaks(a.text).strip() text = remove_linebreaks(result.text) r = re.compile('\[.*\s(\d{2})-+(\w{3,12})-+(\d{4})') datematch = r.search(text) @@ -201,11 +201,9 @@ def report_from(result, category_name, agency, year_range): except: published_on = None if not published_on: - logging.debug("[%s] Can't parse date %s, using default date." % (report_url,datestring)) - published_on = DEFAULT_DATE + inspector.log_no_date(report_id, title, report_url) + return - title = remove_linebreaks(a.text).strip() - if published_on.year not in year_range: logging.debug("[%s] Skipping, not in requested range." % report_url) return @@ -216,11 +214,6 @@ def report_from(result, category_name, agency, year_range): logging.debug("[%s] Skipping, not an IG report." % title) return - #these will be stored in folders with documents scraped by the official IG scrapers, so - #use the governmentattic url as slug to assure no conflict. 
- report_id = inspector.slugify(report_url.replace('http://www.','')) - - report = { 'inspector': ig_slug, # Store these with their natively-scraped counterparts, not in a govattic-specific place 'inspector_url': ig_url, From 5b2bf53f389cb68670124fa1c229b0871f1ed614 Mon Sep 17 00:00:00 2001 From: David Cook Date: Tue, 5 Apr 2016 18:35:51 -0500 Subject: [PATCH 06/11] [governmentattic] Handle "Sept" in dates --- inspectors/governmentattic.py | 1 + 1 file changed, 1 insertion(+) diff --git a/inspectors/governmentattic.py b/inspectors/governmentattic.py index c39a3194..5d8436b9 100755 --- a/inspectors/governmentattic.py +++ b/inspectors/governmentattic.py @@ -191,6 +191,7 @@ def report_from(result, category_name, agency, year_range): datestring = None if datematch: datestring = '-'.join(datematch.groups()) #'01-Mar-2015 + datestring = datestring.replace("Sept", "Sep") try: published_on = datetime.datetime.strptime(datestring, '%d-%b-%Y') except: From d68ba3c3deb7b366656960f3c107baa02af3a503 Mon Sep 17 00:00:00 2001 From: David Cook Date: Tue, 5 Apr 2016 18:37:38 -0500 Subject: [PATCH 07/11] [governmentattic] Handle extra hyphen before date --- inspectors/governmentattic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inspectors/governmentattic.py b/inspectors/governmentattic.py index 5d8436b9..0c3529db 100755 --- a/inspectors/governmentattic.py +++ b/inspectors/governmentattic.py @@ -185,7 +185,7 @@ def report_from(result, category_name, agency, year_range): title = remove_linebreaks(a.text).strip() text = remove_linebreaks(result.text) - r = re.compile('\[.*\s(\d{2})-+(\w{3,12})-+(\d{4})') + r = re.compile('\[.*\s-?(\d{2})-+(\w{3,12})-+(\d{4})') datematch = r.search(text) published_on = None datestring = None From 5b87108499f7a8fb29648dabcea41b0eb196433a Mon Sep 17 00:00:00 2001 From: David Cook Date: Tue, 5 Apr 2016 18:40:27 -0500 Subject: [PATCH 08/11] [governmentattic] Handle [11-Aug-2013] etc. 
--- inspectors/governmentattic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inspectors/governmentattic.py b/inspectors/governmentattic.py index 0c3529db..4d87e0dd 100755 --- a/inspectors/governmentattic.py +++ b/inspectors/governmentattic.py @@ -185,7 +185,7 @@ def report_from(result, category_name, agency, year_range): title = remove_linebreaks(a.text).strip() text = remove_linebreaks(result.text) - r = re.compile('\[.*\s-?(\d{2})-+(\w{3,12})-+(\d{4})') + r = re.compile('\[(?:.*\s-?|)(\d{2})-+(\w{3,12})-+(\d{4})') datematch = r.search(text) published_on = None datestring = None From 97eb04dcb9eb51d0c60ae17d0d0331c32681397c Mon Sep 17 00:00:00 2001 From: David Cook Date: Tue, 5 Apr 2016 18:41:31 -0500 Subject: [PATCH 09/11] [governmentattic] Don't break [16-September-2009] --- inspectors/governmentattic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inspectors/governmentattic.py b/inspectors/governmentattic.py index 4d87e0dd..280f9a27 100755 --- a/inspectors/governmentattic.py +++ b/inspectors/governmentattic.py @@ -191,7 +191,7 @@ def report_from(result, category_name, agency, year_range): datestring = None if datematch: datestring = '-'.join(datematch.groups()) #'01-Mar-2015 - datestring = datestring.replace("Sept", "Sep") + datestring = datestring.replace("-Sept-", "-Sep-") try: published_on = datetime.datetime.strptime(datestring, '%d-%b-%Y') except: From 9c707bf29849c0f357da445552508f69ece36bc7 Mon Sep 17 00:00:00 2001 From: David Cook Date: Tue, 5 Apr 2016 18:50:56 -0500 Subject: [PATCH 10/11] [governmentattic] Remove dead code --- inspectors/governmentattic.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/inspectors/governmentattic.py b/inspectors/governmentattic.py index 280f9a27..d333271b 100755 --- a/inspectors/governmentattic.py +++ b/inspectors/governmentattic.py @@ -171,9 +171,6 @@ def report_from(result, category_name, agency, year_range): (ig_short,ig_url,ig_slug) = 
GOVATTIC_MAPPING_DICT[(category_name,agency)] a = result.find('a') - if not a: - if result.p and result.p.font and result.p.font.find('a'): - a = result.p.font.find('a') if not a: #there's no link, so this must just be some explanatory text, such as the footer return From 99351908a7395e460261e4f5715d3ace76574367 Mon Sep 17 00:00:00 2001 From: David Cook Date: Tue, 5 Apr 2016 18:53:08 -0500 Subject: [PATCH 11/11] [governmentattic] Add to safe.yml --- safe.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/safe.yml b/safe.yml index b692b106..b375f9d6 100644 --- a/safe.yml +++ b/safe.yml @@ -108,6 +108,9 @@ # Government Accountability Office - gao +# Governmentattic.org +- governmentattic + # Government Printing Office - gpo