From fc7950f75e7f3b556aa1649bc622c440146c3349 Mon Sep 17 00:00:00 2001
From: Marianne Hoogeveen
Date: Wed, 5 Feb 2025 22:13:05 -0500
Subject: [PATCH] Add USGS USWTDB data (#539)

* add archiver for USGS USWTDB

* rename

* add to select year supported datasets

* clarify extracting date parts

* [pre-commit.ci] auto fixes from pre-commit.com hooks

For more information, see https://pre-commit.ci

* Log remote URL and local download paths.

* Construct valid ZIP file download URLs

* Don't fail on date discontinuities

* Add Zenodo DOIs for USGS US Wind Turbine DB

* Add USGS US Wind Turbine DB to run-archiver workflow.

* Use concept DOIs not v1.0 for USGS US Wind Turbine DB.

* update docstring and improve logging

---------

Co-authored-by: Marianne Hoogeveen
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Zane Selvans
Co-authored-by: E. Belfer <37471869+e-belfer@users.noreply.github.com>
---
 .github/workflows/run-archiver.yml        |  4 +-
 src/pudl_archiver/archivers/classes.py    |  2 +-
 src/pudl_archiver/archivers/usgsuswtdb.py | 53 +++++++++++++++++++
 src/pudl_archiver/cli.py                  | 20 +++----
 .../package_data/zenodo_doi.yaml          |  3 ++
 5 files changed, 67 insertions(+), 15 deletions(-)
 create mode 100644 src/pudl_archiver/archivers/usgsuswtdb.py

diff --git a/.github/workflows/run-archiver.yml b/.github/workflows/run-archiver.yml
index ecf8e5e5..a5e2e621 100644
--- a/.github/workflows/run-archiver.yml
+++ b/.github/workflows/run-archiver.yml
@@ -6,7 +6,7 @@ on:
     inputs:
       datasets:
         description: 'Comma-separated list of datasets to archive (e.g., "ferc2","ferc6").'
-        default: '"doeiraec","doelead","eia176","eia191","eia757a","eia860","eia860m","eia861","eia923","eia930","eiaaeo","eiacbecs","eiamecs","eianems","eiawater","eia_bulk_elec","epacamd_eia","epacems","epaegrid","epamats","epapcap","ferc1","ferc2","ferc6","ferc60","ferc714","gridpathratoolkit","mshamines","nrelatb","phmsagas","usgsuspvdb","vcerare"'
+        default: '"doeiraec","doelead","eia176","eia191","eia757a","eia860","eia860m","eia861","eia923","eia930","eiaaeo","eiacbecs","eiamecs","eianems","eiawater","eia_bulk_elec","epacamd_eia","epacems","epaegrid","epamats","epapcap","ferc1","ferc2","ferc6","ferc60","ferc714","gridpathratoolkit","mshamines","nrelatb","phmsagas","usgsuspvdb","usgsuswtdb","vcerare"'
         required: true
         type: string
       create_github_issue:
@@ -26,7 +26,7 @@ jobs:
     strategy:
       matrix:
         # Note that we can't pass global env variables to the matrix, so we manually reproduce the list of datasets here.
-        dataset: ${{ fromJSON(format('[{0}]', inputs.datasets || '"doeiraec","doelead","eia176","eia191","eia757a","eia860","eia860m","eia861","eia923","eia930","eiaaeo","eiacbecs","eiamecs","eianems","eiawater","eia_bulk_elec","epacamd_eia","epacems","epaegrid","epamats","epapcap","ferc1","ferc2","ferc6","ferc60","ferc714","gridpathratoolkit","mshamines","nrelatb","phmsagas","usgsuspvdb","vcerare"' )) }}
+        dataset: ${{ fromJSON(format('[{0}]', inputs.datasets || '"doeiraec","doelead","eia176","eia191","eia757a","eia860","eia860m","eia861","eia923","eia930","eiaaeo","eiacbecs","eiamecs","eianems","eiawater","eia_bulk_elec","epacamd_eia","epacems","epaegrid","epamats","epapcap","ferc1","ferc2","ferc6","ferc60","ferc714","gridpathratoolkit","mshamines","nrelatb","phmsagas","usgsuspvdb","usgsuswtdb","vcerare"' )) }}
       fail-fast: false
     runs-on: ubuntu-latest
     permissions:
diff --git a/src/pudl_archiver/archivers/classes.py b/src/pudl_archiver/archivers/classes.py
index 4041e635..512a0230 100644
--- a/src/pudl_archiver/archivers/classes.py
+++ b/src/pudl_archiver/archivers/classes.py
@@ -261,7 +261,7 @@ async def get_hyperlinks(
         filter_pattern: typing.Pattern | None = None,
         verify: bool = True,
         headers: dict | None = None,
-    ) -> list[str]:
+    ) -> dict[str, str]:
         """Return all hyperlinks from a specific web page.

         This is a helper function to perform very basic web-scraping functionality.
diff --git a/src/pudl_archiver/archivers/usgsuswtdb.py b/src/pudl_archiver/archivers/usgsuswtdb.py
new file mode 100644
index 00000000..8810324e
--- /dev/null
+++ b/src/pudl_archiver/archivers/usgsuswtdb.py
@@ -0,0 +1,53 @@
+"""Download USGS USWTDB data."""
+
+import re
+from urllib.parse import urlparse
+
+from pudl_archiver.archivers.classes import (
+    AbstractDatasetArchiver,
+    ArchiveAwaitable,
+    ResourceInfo,
+)
+
+BASE_URL = "https://www.sciencebase.gov/catalog/item/5e99a01082ce172707f6fd2a"
+
+
+class UsgsUswtdbArchiver(AbstractDatasetArchiver):
+    """USGS USWTDB archiver.
+
+    Data is published roughly quarterly (with occasional extra publications), so
+    continuous monthly data is not expected.
+ """ + + name = "usgsuswtdb" + fail_on_data_continuity = False + + async def get_resources(self) -> ArchiveAwaitable: + """Download USWTDB resources.""" + link_pattern = re.compile(r"uswtdb_v(\d+)_(\d+)(?:_(\d+))?_(\d{8})\.zip") + self.logger.info(f"Searching {BASE_URL} for hyperlinks matching {link_pattern}") + data_links = await self.get_hyperlinks(BASE_URL, link_pattern) + for link, name in data_links.items(): + self.logger.debug(f"Found link: {link}, name: {name}") + matches = link_pattern.search(name) + if not matches: + continue + + date = matches.group(4) + year, month = date[:4], date[4:6] + year_month = f"{year}-{month}" + if self.valid_year(int(year)): + yield self.get_year_month_resource(link, year_month) + + async def get_year_month_resource(self, link: str, year_month: str) -> ResourceInfo: + """Download zip file.""" + # Append hyperlink to base URL to get URL of file + parsed_url = urlparse(BASE_URL) + url = f"{parsed_url.scheme}://{parsed_url.netloc}{link}" + download_path = self.download_directory / f"usgsuswtdb-{year_month}.zip" + self.logger.debug(f"Attempting to download {url} to {download_path}") + await self.download_zipfile(url, download_path) + + return ResourceInfo( + local_path=download_path, partitions={"year_month": year_month} + ) diff --git a/src/pudl_archiver/cli.py b/src/pudl_archiver/cli.py index 9812d688..0611ae69 100644 --- a/src/pudl_archiver/cli.py +++ b/src/pudl_archiver/cli.py @@ -34,8 +34,8 @@ def parse_main(args=None): nargs="*", help="Years to download data for. Supported datasets: censusdp1tract, censuspep, " "eia176, eia191, eia757a, eia860, eia860m, eia861, eia923, eia930, eia_bulk_elec, " - "eiaaeo, eiamecs, eiawater, epacamd_eia, epacems, epaegrid,ferc1, ferc2, ferc6, " - "ferc60, ferc714, mshamines, nrelatb, phmsagas", + "eiaaeo, eiamecs, eiawater, epacamd_eia, epacems, epaegrid, ferc1, ferc2, ferc6, " + "ferc60, ferc714, mshamines, nrelatb, phmsagas, usgsuswtdb", type=int, ) parser.add_argument( @@ -68,16 +68,12 @@ def parse_main(args=None): action="store_true", help="Automatically publish a deposition, rather than requiring manual review before publishing.", ) - ( - parser.add_argument( - "--deposition-path", - help=( - "Configurable base path used by `fsspec` depositor. Expects paths in `fsspec` compatible " - "format like: 'file://local/path/to/folder' or file:///absolute/path/to/folder or " - "gs://path/to/gcs_bucket" - ), - default=None, - ), + parser.add_argument( + "--deposition-path", + help="Configurable base path used by `fsspec` depositor. Expects paths in `fsspec` compatible " + "format like: 'file://local/path/to/folder' or file:///absolute/path/to/folder or " + "gs://path/to/gcs_bucket", + default=None, ) parser.add_argument( "--refresh-metadata", diff --git a/src/pudl_archiver/package_data/zenodo_doi.yaml b/src/pudl_archiver/package_data/zenodo_doi.yaml index 1fc7552a..8cedd6ba 100644 --- a/src/pudl_archiver/package_data/zenodo_doi.yaml +++ b/src/pudl_archiver/package_data/zenodo_doi.yaml @@ -97,6 +97,9 @@ phmsagas: usgsuspvdb: production_doi: 10.5281/zenodo.14736285 sandbox_doi: 10.5072/zenodo.157776 +usgsuswtdb: + production_doi: 10.5281/zenodo.14783214 + sandbox_doi: 10.5072/zenodo.161235 vcerare: production_doi: 10.5281/zenodo.13937522 sandbox_doi: 10.5072/zenodo.118136