From 46d9d06361d9764eac438be77c0b4289e550c398 Mon Sep 17 00:00:00 2001
From: root <root@ytubuntu.org>
Date: Fri, 26 Feb 2016 18:25:04 -0500
Subject: [PATCH 01/11] GovernmentAttic

---
 inspectors/governmentattic.py | 246 ++++++++++++++++++++++++++++++++++
 inspectors/osc.py             |   2 +-
 2 files changed, 247 insertions(+), 1 deletion(-)
 create mode 100644 inspectors/governmentattic.py

diff --git a/inspectors/governmentattic.py b/inspectors/governmentattic.py
new file mode 100644
index 00000000..cf70278b
--- /dev/null
+++ b/inspectors/governmentattic.py
@@ -0,0 +1,246 @@
+#!/usr/bin/env python
+
+import datetime
+import logging
+import os
+import re
+from urllib.parse import urljoin
+
+from utils import utils, inspector
+
+"""
+This file is different in that it doesn't scrape an IG's list of public documents, but rather scrapes the largest public 
+repository of otherwise-secret IG reports that were obtained under FOIA -- governmentattic.org.
+
+This has the advantage of being a repeatable process that will sweep in PDFs that are otherwise collected
+only via manual processes, thanks to the FOIA efforts of the people who run governmentattic.org.
+
+As you might expect, IG reports that can only be obtained via FOIA can be more interesting
+and juicy than the ones the government chooses to publish online itself.
+
+Not all of GovAttic's documents are IG reports. By default, this script pulls in only IG reports, but there is a flag,
+IG_REPORTS_ONLY, which can simply be set to False to start pulling in all GovernmentAttic documents.
+Additionally, it ignores IG reports that don't map to an IG that oversight.garden already keeps track of.
+
+FYI, the Internet Archive backed up governmentattic.org in 2014:
+https://archive.org/details/governmentattic.org?sort=-publicdate
+
+-Luke Rosiak
+"""
+
+
+IG_REPORTS_ONLY = True
+
+
+def remove_non_ascii(text):
+    return ''.join(i for i in text if ord(i)<128)
+
+# <oig_url>
+archive = 1930
+#govattic page structure isn't based on year
+
+# options:
+#   standard since/year options for a year range to fetch from.
+#
+# Notes for IG's web team:
+# GovernmentAttic's website seems to be hand-coded HTML with no CSS classes, but seems to be updated in a consistent enough way.
+
+#Landing page where GovAttic breaks government agencies down into several groups
+CATEGORIES_URL = 'http://www.governmentattic.org/DocumentsCat.html'
+
+#The mapping below ties GovAttic's agency descriptors to the inspectors-general slugs used in this repo.
+#Each line takes the format: ga_category,ga_agency,ig_short,ig_url,ig_slug
+#A GovAttic record will be ignored if it doesn't map to an IG that is in this repo (no matching line, or a line with an empty ig_slug).
+#This mapping was hand-coded based on this file:
+#https://raw.githubusercontent.com/konklone/oversight.garden/master/config/inspectors.json
+
+GOVATTIC_MAPPING = """Department of Defense Documents,Department of Defense (DoD),Department of Defense,http://www.dodig.mil/,dod
+Department of Defense Documents,Office of the Inspector General (OIG),Department of Defense,http://www.dodig.mil/,dod
+Department of Defense Documents,Defense Commissary Agency (DeCA),Department of Defense,http://www.dodig.mil/,dod
+Department of Defense Documents,Defense Intelligence Agency (DIA),Defense Intelligence Agency,http://www.dia.mil/About/OfficeoftheInspectorGeneral.aspx,dia
+Department of Defense Documents,Defense Threat Reduction Agency (DTRA),Department of Defense,http://www.dodig.mil/,dod
+Department of Defense Documents,National,Department of Defense,http://www.dodig.mil/,dod
+Department of Defense Documents,United States Air Force,Air Force,http://www.af.mil/InspectorGeneralComplaints.aspx,airforce
+Department of Defense Documents,United States Army,Army,https://www.daig.pentagon.mil/,army
+Department of Defense Documents,United States Navy,Navy,http://www.secnav.navy.mil/ig/Pages/Home.aspx,navy
+Department of Justice Documents,Department of Justice (DOJ),Department of Justice,https://oig.justice.gov/,doj
+Department of Justice Documents,Office of the Inspector General,Department of Justice,https://oig.justice.gov/,doj
+Executive Branch Departments A-M,Department of Agriculture (USDA),Department of Agriculture,http://www.usda.gov/oig/,agriculture
+Executive Branch Departments A-M,Department of Commerce (DOC),Department of Commerce,https://www.oig.doc.gov/Pages/default.aspx,commerce
+Executive Branch Departments A-M,Department of Education (ED),Department of Education,https://www2.ed.gov/about/offices/list/oig/index.html,education
+Executive Branch Departments A-M,Department of Energy (DOE),Department of Energy,http://energy.gov/ig/office-inspector-general,energy
+Executive Branch Departments A-M,Department of Heath and Human Services (DHHS),Department of Health and Human Services,http://oig.hhs.gov/,hhs
+Executive Branch Departments A-M,Department of Homeland Security (DHS),Department of Homeland Security,https://www.oig.dhs.gov/,dhs
+Executive Branch Departments A-M,United States Secret Service (USSS),Department of Defense,http://www.dodig.mil/,dod
+Executive Branch Departments A-M,Department of Housing and Urban Development (HUD),Department of Housing and Urban Development,https://www.hudoig.gov/,hud
+Executive Branch Departments A-M,Department of the Interior (DOI),Department of the Interior,https://www.doioig.gov/,interior
+Executive Branch Departments A-M,Department of Labor (DOL),Department of Labor,https://www.oig.dol.gov/,labor
+Executive Branch Departments N-Z,Department of State,Department of State,https://oig.state.gov/,state
+Executive Branch Departments N-Z,Department of Transportation (DOT),Department of Transportation,https://www.oig.dot.gov/,dot
+Executive Branch Departments N-Z,Federal Aviation Administration (FAA),Department of Transportation,https://www.oig.dot.gov/,dot
+Executive Branch Departments N-Z,Department of the Treasury,Department of the Treasury,http://www.treasury.gov/about/organizational-structure/ig/,treasury
+Executive Branch Departments N-Z,Bureau of Engraving and Printing (BEP),Department of the Treasury,http://www.treasury.gov/about/organizational-structure/ig/,treasury
+Executive Branch Departments N-Z,Treasury Inspector General for Tax Administration (TIGTA),Treasury IG for Tax Administration,https://www.treasury.gov/tigta/,tigta
+Executive Branch Departments N-Z,Department of Veterans Affairs (VA),Department of Veterans Affairs,http://www.va.gov/oig,va
+White House Offices,Office of the Director of National Intelligence (ODNI),,,
+Legislative Agencies,Architect of the Capitol (AOC),Architect of the Capitol,http://www.aoc.gov/oig/office-inspector-general,architect
+Legislative Agencies,Government Accountability Office (GAO),Government Accountability Office,http://www.gao.gov/about/workforce/ig.html,gao
+Legislative Agencies,Library of Congress (LOC),Library of Congress,https://www.loc.gov/about/office-of-the-inspector-general/,loc
+Independent Federal Agencies A-M,The,,,
+Independent Federal Agencies A-M,Central Intelligence Agency (CIA),Central Intelligence Agency,https://www.cia.gov/offices-of-cia/inspector-general,cia
+Independent Federal Agencies A-M,Commodity Futures Trading Commission (CFTC),Commodity Futures Trading Commission,http://www.cftc.gov/About/OfficeoftheInspectorGeneral/index.htm,cftc
+Independent Federal Agencies A-M,Consumer Product Safety Commission (CPSC),Consumer Product Safety Commission,https://www.cpsc.gov/en/About-CPSC/Inspector-General/,cpsc
+Independent Federal Agencies A-M,Corporation for National and Community Service (CNCS),Corporation for National and Community Service,http://www.cncsoig.gov/,cncs
+Independent Federal Agencies A-M,Council of Inspectors General on Integrity and Efficiency (CIGIE),Council of Inspectors General on Integrity and Efficiency (CIGIE),https://www.ignet.gov/,cigie
+Independent Federal Agencies A-M,The Denali Commission,Denali Commission,http://oig.denali.gov/,denali
+Independent Federal Agencies A-M,Environmental Protection Agency (EPA),Environmental Protection Agency,http://www.epa.gov/oig,epa
+Independent Federal Agencies A-M,Equal Employment Opportunity Commission (EEOC),Equal Employment Opportunity Commission,https://oig.eeoc.gov/,eeoc
+Independent Federal Agencies A-M,Export-Import Bank of the United States (Ex-Im Bank),Export-Import Bank,http://www.exim.gov/about/oig,exim
+Independent Federal Agencies A-M,Federal Communications Commission (FCC),Federal Communications Commission,https://www.fcc.gov/office-inspector-general,fcc
+Independent Federal Agencies A-M,Federal Deposit Insurance Corporation (FDIC),Federal Deposit Insurance Corporation,https://www.fdicig.gov/,fdic
+Independent Federal Agencies A-M,Federal Election Commission,Federal Election Commission,http://www.fec.gov/fecig/fecig.shtml,fec
+Independent Federal Agencies A-M,Federal Housing Finance Agency (FHFA),Federal Housing Finance Agency,http://fhfaoig.gov/,fhfa
+Independent Federal Agencies A-M,Federal Labor Relations Authority (FLRA),Federal Labor Relations Authority,https://www.flra.gov/OIG,flra
+Independent Federal Agencies A-M,Federal Reserve System,Federal Reserve/CFPB,https://oig.federalreserve.gov/,fed
+Independent Federal Agencies A-M,Federal Trade Commission (FTC),Federal Trade Commission,https://www.ftc.gov/about-ftc/office-inspector-general,ftc
+Independent Federal Agencies A-M,General Services Administration (GSA),General Services Administration,https://www.gsaig.gov/,gsa
+Independent Federal Agencies N-Z,National Archives and Records Administration (NARA),National Archives,https://www.archives.gov/oig/,archives
+Independent Federal Agencies N-Z,National Aeronautics and Space Administration (NASA),NASA,https://oig.nasa.gov/,nasa
+Independent Federal Agencies N-Z,National Credit Union Administration (NCUA),National Credit Union Administration,http://www.ncua.gov/about/Leadership/Pages/page_oig.aspx,ncua
+Independent Federal Agencies N-Z,National Endowment for the Humanities (NEH),National Endowment for the Humanities,http://www.neh.gov/about/oig,neh
+Independent Federal Agencies N-Z,National Labor Relations Board (NLRB),National Labor Relations Board,https://www.nlrb.gov/who-we-are/inspector-general,nlrb
+Independent Federal Agencies N-Z,National Railroad Passenger Corporation (AMTRAK),Amtrak,https://www.amtrakoig.gov/,amtrak
+Independent Federal Agencies N-Z,National Science Foundation (NSF),National Science Foundation,https://www.nsf.gov/oig/,nsf
+Independent Federal Agencies N-Z,Nuclear Regulatory Commission (NRC),Nuclear Regulatory Commission,http://www.nrc.gov/insp-gen.html,nrc
+Independent Federal Agencies N-Z,Office of Personnel Management (OPM),Office of Personnel Management,https://www.opm.gov/our-inspector-general/,opm
+Independent Federal Agencies N-Z,Office of the Special Inspector General for Afghanistan,Special IG for Afghanistan Reconstruction,https://www.sigar.mil/,sigar
+Independent Federal Agencies N-Z,Office of the Special Inspector General for Iraq Reconstruction (SIGIR),Special IG for Iraq Reconstruction,http://www.sigir.mil/,sigir
+Independent Federal Agencies N-Z,Overseas Private Investment Corporation (OPIC),,,
+Independent Federal Agencies N-Z,The Peace Corps,Peace Corps,http://www.peacecorps.gov/about/inspgen/,peacecorps
+Independent Federal Agencies N-Z,The Railroad Retirement Board,Railroad Retirement Board,http://www.rrb.gov/oig/,rrb
+Independent Federal Agencies N-Z,Securities and Exchange Commission (SEC),Securities and Exchange Commission,http://www.sec.gov/oig,sec
+Independent Federal Agencies N-Z,Special Inspector General for the Troubled Asset Relief Program (SIGTARP),Special IG for TARP,https://www.sigtarp.gov/Pages/home.aspx,sigtarp
+Independent Federal Agencies N-Z,Small Business Administration (SBA),Small Business Administration,https://www.sba.gov/office-of-inspector-general,sba
+Independent Federal Agencies N-Z,Social Security Administration (SSA),Social Security Administration,http://oig.ssa.gov,ssa
+Independent Federal Agencies N-Z,Tennessee Valley Authority (TVA),Tennessee Valley Authority,http://oig.tva.gov/,tva
+Independent Federal Agencies N-Z,US Agency for International Development (USAID),U.S. Agency for International Development,https://oig.usaid.gov/,usaid
+Independent Federal Agencies N-Z,US Postal Service (USPS),U.S. Postal Service,https://uspsoig.gov/,usps
+Government Corporations,Legal Services Corporation,Legal Services Corporation,https://www.oig.lsc.gov/,lsc
+Government Corporations,Corporation for National and Community Service (CNCS),Corporation for National and Community Service,http://www.cncsoig.gov/,cncs
+Government Corporations,Pension Benefit Guaranty Corporation (PBGC),Pension Benefit Guaranty Corporation,http://oig.pbgc.gov/,pbgc
+State Records / Miscellaneous Records / Interagency Records,Records of State/CITY Agencies,,,
+State Records / Miscellaneous Records / Interagency Records,Miscellaneous Records,,,
+State Records / Miscellaneous Records / Interagency Records,Smithsonian Institution (SI),Smithsonian Institute,http://www.si.edu/OIG,smithsonian"""
+
+#the mapping is kept above as comma-separated lines for ease of editing, but is turned into a dict keyed on (ga_category, ga_agency) for use.
+GOVATTIC_MAPPING_DICT = {}
+for line in GOVATTIC_MAPPING.splitlines():
+  (ga_category,ga_agency,ig_short,ig_url,ig_slug) = line.strip().split(',')
+  GOVATTIC_MAPPING_DICT[(ga_category,ga_agency)] = (ig_short,ig_url,ig_slug)
+
+def remove_linebreaks(s):
+  #lots of weird tabs, etc. inside HTML strings. would replace all at once, but since utils.beautifulsoup_from_url
+  #is taking the html straight to soup, we'll do it individually for the fields we need
+  return remove_non_ascii(s.replace('\n','').replace('\t','').replace('\r',''))  #.encode('ascii','ignore')
+
+def run(options):
+  year_range = inspector.year_range(options, archive)
+
+  #loop through sections (Executive Branch Departments A-M, etc)
+  category_doc = utils.beautifulsoup_from_url(CATEGORIES_URL)
+  category_links = category_doc.findAll('a')
+  for category_link in category_links:
+    #these are the detail pages with lots of PDFs. they are grouped according to agency name
+    category_name = remove_linebreaks(category_link.text).strip()
+    doc = utils.beautifulsoup_from_url(category_link['href'])
+    agency = ''
+    for result in doc.findAll('p'):
+      if result.font and result.font.get('color')=="#993333": 
+        #this is an agency name
+        agency = remove_linebreaks(result.font.text).strip()
+      else:
+        #this is a report from that agency
+        report = report_from(result, category_name, agency, year_range)
+        if report:
+          inspector.save_report(report)
+        
+
+#all dates successfully parse as of this writing, but it's a hand-coded HTML site, 
+#so it's possible one may not have a valid date in the future. use a set-in-time
+#default as a fallback so we get a consistent year-based slug for it.
+DEFAULT_DATE = datetime.datetime(2015,11,1,1,1,1)
+
+# extract a dict of details that are ready for inspector.save_report().
+def report_from(result, category_name, agency, year_range):
+
+  #ignore if it's not in our agency string->slug mapping, or if it's in our mapping but has an empty slug.
+  #either case means it doesn't come from an agency whose IG we track; it may be a document from a
+  #local government, etc.
+  if (category_name,agency) not in GOVATTIC_MAPPING_DICT or GOVATTIC_MAPPING_DICT[(category_name,agency)][-1]=='':
+    return
+  (ig_short,ig_url,ig_slug) = GOVATTIC_MAPPING_DICT[(category_name,agency)]
+
+  a = result.find('a')
+  if not a:
+    if result.p and result.p.font and result.p.font.find('a'):
+      a = result.p.font.find('a')
+  if not a: 
+    #there's no link, so this must just be some explanatory text, such as the footer
+    return
+  report_url = a['href']
+
+  text = remove_linebreaks(result.text)
+  r = re.compile('\[.*\s(\d{2})-+(\w{3,12})-+(\d{4})')
+  datematch = r.search(text)
+  published_on = None
+  datestring = None
+  if datematch:
+    datestring = '-'.join(datematch.groups()) #'01-Mar-2015
+    try:
+      published_on = datetime.datetime.strptime(datestring, '%d-%b-%Y')
+    except:    
+      published_on = None    
+    if not published_on:    
+      try:
+        published_on = datetime.datetime.strptime(datestring, '%d-%B-%Y')
+      except:
+        published_on = None
+  if not published_on:
+    logging.debug("[%s] Can't parse date %s, using default date." % (report_url,datestring))
+    published_on = DEFAULT_DATE
+
+  title = remove_linebreaks(a.text).strip()
+  
+  if published_on.year not in year_range:
+    logging.debug("[%s] Skipping, not in requested range." % report_url)
+    return
+
+  #ignore documents that are interesting FOIAs but are not IG reports.
+  #if you want to scrape IG and agency documents, set IG_REPORTS_ONLY=False
+  if IG_REPORTS_ONLY and 'OIG' not in title and 'inspector general' not in title.lower():
+    logging.debug("[%s] Skipping, not an IG report." % title)
+    return
+
+  #these will be stored in folders with documents scraped by the official IG scrapers, so
+  #use the governmentattic url as slug to assure no conflict.
+  report_id = inspector.slugify(report_url.replace('http://www.',''))
+
+
+  report = {
+    'inspector': ig_slug,     # Store these with their natively-scraped counterparts, not in a govattic-specific place
+    'inspector_url': ig_url,  
+    'agency': ig_slug,        # Agency and IG slug will be the same
+    'agency_name': ig_short,  # Take short name of the IG as the agency name. I think this should work.
+    'report_id': report_id,  
+    'url': report_url,  
+    'title': title,  
+    'type': 'FOIA - GovernmentAttic.org', # Type of report (default to 'other')
+    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d") #date published to GovAttic, not released by IG  
+  }
+
+  return report
+
+utils.run(run) if (__name__ == "__main__") else None
+
+
+
diff --git a/inspectors/osc.py b/inspectors/osc.py
index fc8a26d4..3ec2f466 100644
--- a/inspectors/osc.py
+++ b/inspectors/osc.py
@@ -180,7 +180,7 @@ def report_from(result, year, year_range,OUTCOME_CODES):
 
 
 def make_report_id(url):
-  return url.replace('/PublicFiles/','').replace('/publicfiles/','').replace('/','-').replace('.pdf','')
+  return inspector.slugify(url.replace('/PublicFiles/','').replace('/publicfiles/','').replace('.pdf',''))
 
 def get_extra_descrip(pdf_link):
   #takes a beautifulsoup object representing a PDF and tries to find the blurb of text that may be right above it.

From bc55d3e1b1c49d2277089c3dc401c8a726a2ea2d Mon Sep 17 00:00:00 2001
From: root <root@ytubuntu.org>
Date: Fri, 26 Feb 2016 18:30:42 -0500
Subject: [PATCH 02/11] GovernmentAttic

---
 inspectors/governmentattic.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/inspectors/governmentattic.py b/inspectors/governmentattic.py
index cf70278b..0b118c52 100644
--- a/inspectors/governmentattic.py
+++ b/inspectors/governmentattic.py
@@ -2,9 +2,7 @@
 
 import datetime
 import logging
-import os
 import re
-from urllib.parse import urljoin
 
 from utils import utils, inspector
 

From 6fc58d385f8621af8519edd3c94907bc913f5bae Mon Sep 17 00:00:00 2001
From: David Cook <divergentdave@gmail.com>
Date: Tue, 5 Apr 2016 18:13:02 -0500
Subject: [PATCH 03/11] [governmentattic] chmod a+x

---
 inspectors/governmentattic.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 mode change 100644 => 100755 inspectors/governmentattic.py

diff --git a/inspectors/governmentattic.py b/inspectors/governmentattic.py
old mode 100644
new mode 100755

From 2da6d41fe7a6282178982367f420047ff6d096be Mon Sep 17 00:00:00 2001
From: David Cook <divergentdave@gmail.com>
Date: Tue, 5 Apr 2016 18:26:35 -0500
Subject: [PATCH 04/11] [governmentattic] Fix mojibake

governmentattic.org doesn't provide an encoding in HTTP headers, and
requests can't parse <meta> tags inside the body of a document. Thus,
we explicitly decode these pages as utf-8 with a special case for
governmentattic.org.
---
 inspectors/governmentattic.py | 5 +----
 inspectors/utils/utils.py     | 6 ++++++
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/inspectors/governmentattic.py b/inspectors/governmentattic.py
index 0b118c52..ef8235b0 100755
--- a/inspectors/governmentattic.py
+++ b/inspectors/governmentattic.py
@@ -30,9 +30,6 @@
 IG_REPORTS_ONLY = True
 
 
-def remove_non_ascii(text):
-    return ''.join(i for i in text if ord(i)<128)
-
 # <oig_url>
 archive = 1930
 #govattic page structure isn't based on year
@@ -139,7 +136,7 @@ def remove_non_ascii(text):
 def remove_linebreaks(s):
   #lots of weird tabs, etc. inside HTML strings. would replace all at once, but since utils.beautifulsoup_from_url
   #is taking the html straight to soup, we'll do it individually for the fields we need
-  return remove_non_ascii(s.replace('\n','').replace('\t','').replace('\r',''))  #.encode('ascii','ignore')
+  return inspector.sanitize(s.replace('\n','').replace('\t','').replace('\r',''))
 
 def run(options):
   year_range = inspector.year_range(options, archive)
diff --git a/inspectors/utils/utils.py b/inspectors/utils/utils.py
index 76ec05b9..6ab3e5fc 100644
--- a/inspectors/utils/utils.py
+++ b/inspectors/utils/utils.py
@@ -263,6 +263,12 @@ def download(url, destination=None, options=None):
         log_http_error(e, url)
         return None
 
+      # Special case handling for governmentattic.org:
+      # These pages are served without an encoding in the HTTP headers,
+      # and with utf-8 specified in a <meta> tag inside the document.
+      if url.startswith("http://www.governmentattic.org"):
+        response.encoding = "utf-8"
+
       body = response.text
       if not isinstance(body, str): raise ValueError("Content not decoded.")
 

From e1273a6ce64807326244a68da5aa31a1d0f302b4 Mon Sep 17 00:00:00 2001
From: David Cook <divergentdave@gmail.com>
Date: Tue, 5 Apr 2016 18:34:10 -0500
Subject: [PATCH 05/11] [governmentattic] Log missing dates, no default

---
 inspectors/governmentattic.py | 21 +++++++--------------
 1 file changed, 7 insertions(+), 14 deletions(-)

diff --git a/inspectors/governmentattic.py b/inspectors/governmentattic.py
index ef8235b0..c39a3194 100755
--- a/inspectors/governmentattic.py
+++ b/inspectors/governmentattic.py
@@ -160,11 +160,6 @@ def run(options):
           inspector.save_report(report)
         
 
-#all dates successfully parse as of this writing, but it's a hand-coded HTML site, 
-#so it's possible one may not have a valid date in the future. use a set-in-time
-#default as a fallback so we get a consistent year-based slug for it.
-DEFAULT_DATE = datetime.datetime(2015,11,1,1,1,1)
-
 # extract a dict of details that are ready for inspector.save_report().
 def report_from(result, category_name, agency, year_range):
 
@@ -184,6 +179,11 @@ def report_from(result, category_name, agency, year_range):
     return
   report_url = a['href']
 
+  #these will be stored in folders with documents scraped by the official IG scrapers, so
+  #use the governmentattic url as slug to assure no conflict.
+  report_id = inspector.slugify(report_url.replace('http://www.',''))
+
+  title = remove_linebreaks(a.text).strip()
   text = remove_linebreaks(result.text)
   r = re.compile('\[.*\s(\d{2})-+(\w{3,12})-+(\d{4})')
   datematch = r.search(text)
@@ -201,11 +201,9 @@ def report_from(result, category_name, agency, year_range):
       except:
         published_on = None
   if not published_on:
-    logging.debug("[%s] Can't parse date %s, using default date." % (report_url,datestring))
-    published_on = DEFAULT_DATE
+    inspector.log_no_date(report_id, title, report_url)
+    return
 
-  title = remove_linebreaks(a.text).strip()
-  
   if published_on.year not in year_range:
     logging.debug("[%s] Skipping, not in requested range." % report_url)
     return
@@ -216,11 +214,6 @@ def report_from(result, category_name, agency, year_range):
     logging.debug("[%s] Skipping, not an IG report." % title)
     return
 
-  #these will be stored in folders with documents scraped by the official IG scrapers, so
-  #use the governmentattic url as slug to assure no conflict.
-  report_id = inspector.slugify(report_url.replace('http://www.',''))
-
-
   report = {
     'inspector': ig_slug,     # Store these with their natively-scraped counterparts, not in a govattic-specific place
     'inspector_url': ig_url,  

From 5b2bf53f389cb68670124fa1c229b0871f1ed614 Mon Sep 17 00:00:00 2001
From: David Cook <divergentdave@gmail.com>
Date: Tue, 5 Apr 2016 18:35:51 -0500
Subject: [PATCH 06/11] [governmentattic] Handle "Sept" in dates

---
 inspectors/governmentattic.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/inspectors/governmentattic.py b/inspectors/governmentattic.py
index c39a3194..5d8436b9 100755
--- a/inspectors/governmentattic.py
+++ b/inspectors/governmentattic.py
@@ -191,6 +191,7 @@ def report_from(result, category_name, agency, year_range):
   datestring = None
   if datematch:
     datestring = '-'.join(datematch.groups()) #'01-Mar-2015
+    datestring = datestring.replace("Sept", "Sep")
     try:
       published_on = datetime.datetime.strptime(datestring, '%d-%b-%Y')
     except:    

From d68ba3c3deb7b366656960f3c107baa02af3a503 Mon Sep 17 00:00:00 2001
From: David Cook <divergentdave@gmail.com>
Date: Tue, 5 Apr 2016 18:37:38 -0500
Subject: [PATCH 07/11] [governmentattic] Handle extra hyphen before date

---
 inspectors/governmentattic.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/inspectors/governmentattic.py b/inspectors/governmentattic.py
index 5d8436b9..0c3529db 100755
--- a/inspectors/governmentattic.py
+++ b/inspectors/governmentattic.py
@@ -185,7 +185,7 @@ def report_from(result, category_name, agency, year_range):
 
   title = remove_linebreaks(a.text).strip()
   text = remove_linebreaks(result.text)
-  r = re.compile('\[.*\s(\d{2})-+(\w{3,12})-+(\d{4})')
+  r = re.compile('\[.*\s-?(\d{2})-+(\w{3,12})-+(\d{4})')
   datematch = r.search(text)
   published_on = None
   datestring = None

From 5b87108499f7a8fb29648dabcea41b0eb196433a Mon Sep 17 00:00:00 2001
From: David Cook <divergentdave@gmail.com>
Date: Tue, 5 Apr 2016 18:40:27 -0500
Subject: [PATCH 08/11] [governmentattic] Handle [11-Aug-2013] etc.

---
 inspectors/governmentattic.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/inspectors/governmentattic.py b/inspectors/governmentattic.py
index 0c3529db..4d87e0dd 100755
--- a/inspectors/governmentattic.py
+++ b/inspectors/governmentattic.py
@@ -185,7 +185,7 @@ def report_from(result, category_name, agency, year_range):
 
   title = remove_linebreaks(a.text).strip()
   text = remove_linebreaks(result.text)
-  r = re.compile('\[.*\s-?(\d{2})-+(\w{3,12})-+(\d{4})')
+  r = re.compile('\[(?:.*\s-?|)(\d{2})-+(\w{3,12})-+(\d{4})')
   datematch = r.search(text)
   published_on = None
   datestring = None

From 97eb04dcb9eb51d0c60ae17d0d0331c32681397c Mon Sep 17 00:00:00 2001
From: David Cook <divergentdave@gmail.com>
Date: Tue, 5 Apr 2016 18:41:31 -0500
Subject: [PATCH 09/11] [governmentattic] Don't break [16-September-2009]

---
 inspectors/governmentattic.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/inspectors/governmentattic.py b/inspectors/governmentattic.py
index 4d87e0dd..280f9a27 100755
--- a/inspectors/governmentattic.py
+++ b/inspectors/governmentattic.py
@@ -191,7 +191,7 @@ def report_from(result, category_name, agency, year_range):
   datestring = None
   if datematch:
     datestring = '-'.join(datematch.groups()) #'01-Mar-2015
-    datestring = datestring.replace("Sept", "Sep")
+    datestring = datestring.replace("-Sept-", "-Sep-")
     try:
       published_on = datetime.datetime.strptime(datestring, '%d-%b-%Y')
     except:    

From 9c707bf29849c0f357da445552508f69ece36bc7 Mon Sep 17 00:00:00 2001
From: David Cook <divergentdave@gmail.com>
Date: Tue, 5 Apr 2016 18:50:56 -0500
Subject: [PATCH 10/11] [governmentattic] Remove dead code

---
 inspectors/governmentattic.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/inspectors/governmentattic.py b/inspectors/governmentattic.py
index 280f9a27..d333271b 100755
--- a/inspectors/governmentattic.py
+++ b/inspectors/governmentattic.py
@@ -171,9 +171,6 @@ def report_from(result, category_name, agency, year_range):
   (ig_short,ig_url,ig_slug) = GOVATTIC_MAPPING_DICT[(category_name,agency)]
 
   a = result.find('a')
-  if not a:
-    if result.p and result.p.font and result.p.font.find('a'):
-      a = result.p.font.find('a')
   if not a: 
     #there's no link, so this must just be some explanatory text, such as the footer
     return

From 99351908a7395e460261e4f5715d3ace76574367 Mon Sep 17 00:00:00 2001
From: David Cook <divergentdave@gmail.com>
Date: Tue, 5 Apr 2016 18:53:08 -0500
Subject: [PATCH 11/11] [governmentattic] Add to safe.yml

---
 safe.yml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/safe.yml b/safe.yml
index b692b106..b375f9d6 100644
--- a/safe.yml
+++ b/safe.yml
@@ -108,6 +108,9 @@
 # Government Accountability Office
 - gao
 
+# Governmentattic.org
+- governmentattic
+
 # Government Printing Office
 - gpo