Skip to content

Commit

Permalink
Add USGS USWTDB data (#539)
Browse files Browse the repository at this point in the history
* add archiver for USGS USWTDB
* rename
* add to select year supported datasets
* clarify extracting date parts
* [pre-commit.ci] auto fixes from pre-commit.com hooks
For more information, see https://pre-commit.ci
* Log remote URL and local download paths.
* Construct valid ZIP file download URLs
* Don't fail on date discontinuities
* Add Zenodo DOIs for USGS US Wind Turbine DB
* Add USGS US Wind Turbine DB to run-archiver workflow.
* Use concept DOIs not v1.0 for USGS US Wind Turbine DB.
* update docstring and improve logging

---------

Co-authored-by: Marianne Hoogeveen <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Zane Selvans <[email protected]>
Co-authored-by: E. Belfer <[email protected]>
  • Loading branch information
5 people authored Feb 6, 2025
1 parent 22783e3 commit fc7950f
Show file tree
Hide file tree
Showing 5 changed files with 67 additions and 15 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/run-archiver.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ on:
inputs:
datasets:
description: 'Comma-separated list of datasets to archive (e.g., "ferc2","ferc6").'
default: '"doeiraec","doelead","eia176","eia191","eia757a","eia860","eia860m","eia861","eia923","eia930","eiaaeo","eiacbecs","eiamecs","eianems","eiawater","eia_bulk_elec","epacamd_eia","epacems","epaegrid","epamats","epapcap","ferc1","ferc2","ferc6","ferc60","ferc714","gridpathratoolkit","mshamines","nrelatb","phmsagas","usgsuspvdb","vcerare"'
default: '"doeiraec","doelead","eia176","eia191","eia757a","eia860","eia860m","eia861","eia923","eia930","eiaaeo","eiacbecs","eiamecs","eianems","eiawater","eia_bulk_elec","epacamd_eia","epacems","epaegrid","epamats","epapcap","ferc1","ferc2","ferc6","ferc60","ferc714","gridpathratoolkit","mshamines","nrelatb","phmsagas","usgsuspvdb","usgsuswtdb","vcerare"'
required: true
type: string
create_github_issue:
Expand All @@ -26,7 +26,7 @@ jobs:
strategy:
matrix:
# Note that we can't pass global env variables to the matrix, so we manually reproduce the list of datasets here.
dataset: ${{ fromJSON(format('[{0}]', inputs.datasets || '"doeiraec","doelead","eia176","eia191","eia757a","eia860","eia860m","eia861","eia923","eia930","eiaaeo","eiacbecs","eiamecs","eianems","eiawater","eia_bulk_elec","epacamd_eia","epacems","epaegrid","epamats","epapcap","ferc1","ferc2","ferc6","ferc60","ferc714","gridpathratoolkit","mshamines","nrelatb","phmsagas","usgsuspvdb","vcerare"' )) }}
dataset: ${{ fromJSON(format('[{0}]', inputs.datasets || '"doeiraec","doelead","eia176","eia191","eia757a","eia860","eia860m","eia861","eia923","eia930","eiaaeo","eiacbecs","eiamecs","eianems","eiawater","eia_bulk_elec","epacamd_eia","epacems","epaegrid","epamats","epapcap","ferc1","ferc2","ferc6","ferc60","ferc714","gridpathratoolkit","mshamines","nrelatb","phmsagas","usgsuspvdb","usgsuswtdb","vcerare"' )) }}
fail-fast: false
runs-on: ubuntu-latest
permissions:
Expand Down
2 changes: 1 addition & 1 deletion src/pudl_archiver/archivers/classes.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,7 +261,7 @@ async def get_hyperlinks(
filter_pattern: typing.Pattern | None = None,
verify: bool = True,
headers: dict | None = None,
) -> list[str]:
) -> dict[str, str]:
"""Return all hyperlinks from a specific web page.
This is a helper function to perform very basic web-scraping functionality.
Expand Down
53 changes: 53 additions & 0 deletions src/pudl_archiver/archivers/usgsuswtdb.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
"""Download USGS USWTDB data."""

import re
from urllib.parse import urlparse

from pudl_archiver.archivers.classes import (
AbstractDatasetArchiver,
ArchiveAwaitable,
ResourceInfo,
)

BASE_URL = "https://www.sciencebase.gov/catalog/item/5e99a01082ce172707f6fd2a"


class UsgsUswtdbArchiver(AbstractDatasetArchiver):
    """USGS USWTDB archiver.

    Data is published almost quarterly (with some extra publications), so monthly
    continuous data is not expected.
    """

    name = "usgsuswtdb"
    # Releases are irregular, so gaps in the monthly sequence are expected.
    fail_on_data_continuity = False

    async def get_resources(self) -> ArchiveAwaitable:
        """Download USWTDB resources."""
        # Filenames look like uswtdb_v7_1_20240101.zip or uswtdb_v7_0_1_20231128.zip;
        # group 4 captures the YYYYMMDD publication date.
        zip_name_pattern = re.compile(r"uswtdb_v(\d+)_(\d+)(?:_(\d+))?_(\d{8})\.zip")
        self.logger.info(f"Searching {BASE_URL} for hyperlinks matching {zip_name_pattern}")
        hyperlinks = await self.get_hyperlinks(BASE_URL, zip_name_pattern)
        for href, link_name in hyperlinks.items():
            self.logger.debug(f"Found link: {href}, name: {link_name}")
            match = zip_name_pattern.search(link_name)
            if match is None:
                continue

            # Slice the YYYYMMDD stamp into a YYYY-MM partition label.
            stamp = match.group(4)
            pub_year = stamp[:4]
            year_month = f"{pub_year}-{stamp[4:6]}"
            if self.valid_year(int(pub_year)):
                yield self.get_year_month_resource(href, year_month)

    async def get_year_month_resource(self, link: str, year_month: str) -> ResourceInfo:
        """Download zip file."""
        # Hyperlinks scraped from the page are site-relative paths, so rebuild a
        # full URL from the base URL's scheme and host.
        base_parts = urlparse(BASE_URL)
        url = f"{base_parts.scheme}://{base_parts.netloc}{link}"
        download_path = self.download_directory / f"usgsuswtdb-{year_month}.zip"
        self.logger.debug(f"Attempting to download {url} to {download_path}")
        await self.download_zipfile(url, download_path)

        return ResourceInfo(
            local_path=download_path, partitions={"year_month": year_month}
        )
20 changes: 8 additions & 12 deletions src/pudl_archiver/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,8 @@ def parse_main(args=None):
nargs="*",
help="Years to download data for. Supported datasets: censusdp1tract, censuspep, "
"eia176, eia191, eia757a, eia860, eia860m, eia861, eia923, eia930, eia_bulk_elec, "
"eiaaeo, eiamecs, eiawater, epacamd_eia, epacems, epaegrid,ferc1, ferc2, ferc6, "
"ferc60, ferc714, mshamines, nrelatb, phmsagas",
"eiaaeo, eiamecs, eiawater, epacamd_eia, epacems, epaegrid, ferc1, ferc2, ferc6, "
"ferc60, ferc714, mshamines, nrelatb, phmsagas, usgsuswtdb",
type=int,
)
parser.add_argument(
Expand Down Expand Up @@ -68,16 +68,12 @@ def parse_main(args=None):
action="store_true",
help="Automatically publish a deposition, rather than requiring manual review before publishing.",
)
(
parser.add_argument(
"--deposition-path",
help=(
"Configurable base path used by `fsspec` depositor. Expects paths in `fsspec` compatible "
"format like: 'file://local/path/to/folder' or file:///absolute/path/to/folder or "
"gs://path/to/gcs_bucket"
),
default=None,
),
parser.add_argument(
"--deposition-path",
help="Configurable base path used by `fsspec` depositor. Expects paths in `fsspec` compatible "
"format like: 'file://local/path/to/folder' or file:///absolute/path/to/folder or "
"gs://path/to/gcs_bucket",
default=None,
)
parser.add_argument(
"--refresh-metadata",
Expand Down
3 changes: 3 additions & 0 deletions src/pudl_archiver/package_data/zenodo_doi.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,9 @@ phmsagas:
usgsuspvdb:
production_doi: 10.5281/zenodo.14736285
sandbox_doi: 10.5072/zenodo.157776
usgsuswtdb:
production_doi: 10.5281/zenodo.14783214
sandbox_doi: 10.5072/zenodo.161235
vcerare:
production_doi: 10.5281/zenodo.13937522
sandbox_doi: 10.5072/zenodo.118136

0 comments on commit fc7950f

Please sign in to comment.