Add NREL Siting Lab dataset archiver #585

Open · wants to merge 2 commits into main
120 changes: 120 additions & 0 deletions src/pudl_archiver/archivers/nrel/siting.py
@@ -0,0 +1,120 @@
"""Download data from the NREL siting lab data."""

import re

from pudl_archiver.archivers.classes import (
AbstractDatasetArchiver,
ArchiveAwaitable,
ResourceInfo,
retry_async,
)
from pudl_archiver.frictionless import ZipLayout


class NrelSitingArchiver(AbstractDatasetArchiver):
"""NREL Siting Lab Data archiver."""

name: str = "nrelsiting"
base_url: str = "https://data.openei.org/siting_lab"

async def get_resources(self) -> ArchiveAwaitable:
"""Using data IDs, iterate and download all NREL Siting Lab files."""
# The links on the table are hidden through Javascript. However,
# the IDs are exposed on this JS file, which links each dataset ID to an image.
# Rather than using Selenium, we can use this file to identify the links for all
# datasets hosted through the siting lab.
url = "https://data.openei.org/api"
data = {
"action": "getSubmissionStatistics",
"format": "json",
"s": "siting_lab",
}
response = await retry_async(
self.session.post, args=[url], kwargs={"data": data}
)
data_dict = await response.json()

self.logger.info(
f"Downloading data for {data_dict['numSubmissions']} datasets. {data_dict['numFiles']} files ({data_dict['sizeOfFiles'] / 1e-9} GB)."
)
for item in data_dict["submissions"]:
yield self.get_siting_resources(item)

async def download_nrel_data(self, dataset_id: str, dataset_link: str) -> set:
"""For a given NREL dataset link, grab all PDFs and data links from the page."""
# There are many file types here, so we match using the more general link pattern
# e.g., https://data.openei.org/files/6121/nexrad_4km.tif
# We also grab the PDF files, which are hosted on a different part of the
# NREL website. E.g., https://www.nrel.gov/docs/fy24osti/87843.pdf
download_links = set()

data_pattern = re.compile(rf"files\/{dataset_id}\/")
pdf_data_pattern = re.compile(r"docs\/[\w\/]*.pdf$")

# Get data
data_download_links = await self.get_hyperlinks(dataset_link, data_pattern)
for link in data_download_links:
full_link = f"https://data.openei.org{link}"
download_links.add(full_link)

# Get PDFs
pdf_download_links = await self.get_hyperlinks(dataset_link, pdf_data_pattern)
download_links.update(pdf_download_links)
return download_links

async def get_siting_resources(self, dataset_dict: dict[str, str | int | list]):
"""Download all files for a siting resource."""
dataset_id = dataset_dict["xdrId"]

dataset_link = f"https://data.openei.org/submissions/{dataset_id}"
self.logger.info(f"Downloading files from {dataset_link}")

# Create zipfile name from dataset name
title = dataset_dict["submissionName"]
dataset_name = title.lower().strip()
dataset_name = re.sub(
r"([^a-zA-Z0-9 ])", "", dataset_name
) # Drop all non-space special characters
dataset_name = dataset_name.replace(" ", "-")

zip_path = self.download_directory / f"nrelsiting-{dataset_name}.zip"
data_paths_in_archive = set()

# First, get all the links from the page itself
data_links = await self.download_nrel_data(
dataset_id=dataset_id, dataset_link=dataset_link
)

# A few datasets have an additional linked data page:
# e.g., https://data.openei.org/submissions/1932
additional_datasets_pattern = re.compile(r"\/submissions\/\d{4}")
links = await self.get_hyperlinks(dataset_link, additional_datasets_pattern)

# For each additional dataset linked, iterate through the same process
for link in links:
additional_dataset_id = link.split("/")[-1]
additional_data_paths_in_archive = await self.download_nrel_data(
dataset_id=additional_dataset_id, dataset_link=link
)
data_links.update(additional_data_paths_in_archive)

# For each link we've collected, download it and add it to the zipfile
for link in set(data_links): # Use set to handle duplication
filename = link.split("/")[-1]
# This file shows up in multiple datasets,
# causing collision when they run concurrently. Rename it
# to avoid this problem.
if filename == "87843.pdf":
filename = f"{dataset_name}-technical-report.pdf"

self.logger.info(f"Downloading {link} to {filename} for {zip_path}.")
await self.download_add_to_archive_and_unlink(
url=link, filename=filename, zip_path=zip_path
)
data_paths_in_archive.add(filename)

return ResourceInfo(
local_path=zip_path,
partitions={"data_set": dataset_name},
layout=ZipLayout(file_paths=data_paths_in_archive),
)
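
Since the whole archiver hangs off a single OpenEI API call, it can help to inspect that response by hand before running it. Below is a minimal standalone sketch, assuming the response carries the numSubmissions, numFiles, sizeOfFiles, and submissions keys that get_resources() consumes (field names are taken from the code above; requests is used here purely for illustration):

import requests

# Query the same endpoint get_resources() posts to, with the same form fields.
resp = requests.post(
    "https://data.openei.org/api",
    data={"action": "getSubmissionStatistics", "format": "json", "s": "siting_lab"},
    timeout=30,
)
resp.raise_for_status()
stats = resp.json()

print(f"{stats['numSubmissions']} datasets, {stats['numFiles']} files")
print(f"{stats['sizeOfFiles'] / 1e9:.2f} GB total")  # assumes sizeOfFiles is in bytes
for submission in stats["submissions"][:3]:
    # xdrId and submissionName are the two fields get_siting_resources() reads.
    print(submission["xdrId"], submission["submissionName"])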
28 changes: 28 additions & 0 deletions src/pudl_archiver/metadata/sources.py
@@ -414,4 +414,32 @@
"license_pudl": LICENSES["cc-by-4.0"],
"contributors": [CONTRIBUTORS["catalyst-cooperative"]],
},
"nrelsiting": {
"title": "NREL Renewable Energy Siting Lab Data",
"path": "https://data.openei.org/siting_lab",
"description": (
"This repository contains all data produced by the NREL Renewable Energy Siting Lab. "
"The Siting Lab offers information on solar energy siting regulations and zoning ordinances, "
"as well as supply curve data. Documentation particular to each dataset can be found in the "
"relevant dataset zipfile."
),
"working_partitions": {},
"keywords": sorted(
{
"nrel",
"siting",
"supply curves",
"pv",
"solar",
"wind",
"ordinances",
"setbacks",
"nexrad",
"moratoriums",
}
),
"license_raw": LICENSES["us-govt"],
"license_pudl": LICENSES["cc-by-4.0"],
"contributors": [CONTRIBUTORS["catalyst-cooperative"]],
},
}
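
Because the data_set partition value is derived from the submission title, a worked example of the slug logic in get_siting_resources() may help reviewers sanity-check partition names. A minimal sketch reproducing the sanitization steps from the diff (the sample title here is hypothetical, not a real Siting Lab submission):

import re

def siting_dataset_slug(title: str) -> str:
    """Mirror the zipfile/partition naming used in get_siting_resources()."""
    name = title.lower().strip()
    name = re.sub(r"[^a-zA-Z0-9 ]", "", name)  # drop non-space special characters
    return name.replace(" ", "-")

print(siting_dataset_slug("Solar Siting Ordinances (2024)"))
# -> solar-siting-ordinances-2024
# i.e., the archive would be written to nrelsiting-solar-siting-ordinances-2024.zip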