Skip to content

Commit

Permalink
Add USGS USWTDB data (#539)
Browse files Browse the repository at this point in the history
* add archiver for USGS USWTDB
* rename
* add to select year supported datasets
* clarify extracting date parts
* [pre-commit.ci] auto fixes from pre-commit.com hooks
For more information, see https://pre-commit.ci
* Log remote URL and local download paths.
* Construct valid ZIP file download URLs
* Don't fail on date discontinuities
* Add Zenodo DOIs for USGS US Wind Turbine DB
* Add USGS US Wind Turbine DB to run-archiver workflow.
* Use concept DOIs not v1.0 for USGS US Wind Turbine DB.
* update docstring and improve logging

---------

Co-authored-by: Marianne Hoogeveen <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Zane Selvans <[email protected]>
Co-authored-by: E. Belfer <[email protected]>
  • Loading branch information
5 people authored Feb 6, 2025
1 parent 22783e3 commit fc7950f
Show file tree
Hide file tree
Showing 5 changed files with 67 additions and 15 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/run-archiver.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ on:
inputs:
datasets:
description: 'Comma-separated list of datasets to archive (e.g., "ferc2","ferc6").'
default: '"doeiraec","doelead","eia176","eia191","eia757a","eia860","eia860m","eia861","eia923","eia930","eiaaeo","eiacbecs","eiamecs","eianems","eiawater","eia_bulk_elec","epacamd_eia","epacems","epaegrid","epamats","epapcap","ferc1","ferc2","ferc6","ferc60","ferc714","gridpathratoolkit","mshamines","nrelatb","phmsagas","usgsuspvdb","vcerare"'
default: '"doeiraec","doelead","eia176","eia191","eia757a","eia860","eia860m","eia861","eia923","eia930","eiaaeo","eiacbecs","eiamecs","eianems","eiawater","eia_bulk_elec","epacamd_eia","epacems","epaegrid","epamats","epapcap","ferc1","ferc2","ferc6","ferc60","ferc714","gridpathratoolkit","mshamines","nrelatb","phmsagas","usgsuspvdb","usgsuswtdb","vcerare"'
required: true
type: string
create_github_issue:
Expand All @@ -26,7 +26,7 @@ jobs:
strategy:
matrix:
# Note that we can't pass global env variables to the matrix, so we manually reproduce the list of datasets here.
dataset: ${{ fromJSON(format('[{0}]', inputs.datasets || '"doeiraec","doelead","eia176","eia191","eia757a","eia860","eia860m","eia861","eia923","eia930","eiaaeo","eiacbecs","eiamecs","eianems","eiawater","eia_bulk_elec","epacamd_eia","epacems","epaegrid","epamats","epapcap","ferc1","ferc2","ferc6","ferc60","ferc714","gridpathratoolkit","mshamines","nrelatb","phmsagas","usgsuspvdb","vcerare"' )) }}
dataset: ${{ fromJSON(format('[{0}]', inputs.datasets || '"doeiraec","doelead","eia176","eia191","eia757a","eia860","eia860m","eia861","eia923","eia930","eiaaeo","eiacbecs","eiamecs","eianems","eiawater","eia_bulk_elec","epacamd_eia","epacems","epaegrid","epamats","epapcap","ferc1","ferc2","ferc6","ferc60","ferc714","gridpathratoolkit","mshamines","nrelatb","phmsagas","usgsuspvdb","usgsuswtdb","vcerare"' )) }}
fail-fast: false
runs-on: ubuntu-latest
permissions:
Expand Down
2 changes: 1 addition & 1 deletion src/pudl_archiver/archivers/classes.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,7 +261,7 @@ async def get_hyperlinks(
filter_pattern: typing.Pattern | None = None,
verify: bool = True,
headers: dict | None = None,
) -> list[str]:
) -> dict[str, str]:
"""Return all hyperlinks from a specific web page.
This is a helper function to perform very basic web-scraping functionality.
Expand Down
53 changes: 53 additions & 0 deletions src/pudl_archiver/archivers/usgsuswtdb.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
"""Download USGS USWTDB data."""

import re
from urllib.parse import urlparse

from pudl_archiver.archivers.classes import (
AbstractDatasetArchiver,
ArchiveAwaitable,
ResourceInfo,
)

BASE_URL = "https://www.sciencebase.gov/catalog/item/5e99a01082ce172707f6fd2a"


class UsgsUswtdbArchiver(AbstractDatasetArchiver):
    """USGS USWTDB archiver.

    Data is published almost quarterly (with some extra publications), so monthly
    continuous data is not expected.
    """

    name = "usgsuswtdb"
    # Releases are irregular, so gaps in the monthly sequence are expected.
    fail_on_data_continuity = False

    async def get_resources(self) -> ArchiveAwaitable:
        """Download USWTDB resources."""
        # Filenames look like uswtdb_v7_1_20240101.zip or uswtdb_v7_0_1_20231128.zip;
        # group 4 captures the YYYYMMDD publication date.
        zip_name_pattern = re.compile(r"uswtdb_v(\d+)_(\d+)(?:_(\d+))?_(\d{8})\.zip")
        self.logger.info(f"Searching {BASE_URL} for hyperlinks matching {zip_name_pattern}")
        hyperlinks = await self.get_hyperlinks(BASE_URL, zip_name_pattern)
        for href, link_name in hyperlinks.items():
            self.logger.debug(f"Found link: {href}, name: {link_name}")
            match = zip_name_pattern.search(link_name)
            if match is None:
                continue

            # Slice the YYYYMMDD stamp into a YYYY-MM partition label.
            stamp = match.group(4)
            pub_year = stamp[:4]
            year_month = f"{pub_year}-{stamp[4:6]}"
            if self.valid_year(int(pub_year)):
                yield self.get_year_month_resource(href, year_month)

    async def get_year_month_resource(self, link: str, year_month: str) -> ResourceInfo:
        """Download zip file."""
        # Hyperlinks scraped from the page are site-relative paths, so rebuild a
        # full URL from the base URL's scheme and host.
        base_parts = urlparse(BASE_URL)
        url = f"{base_parts.scheme}://{base_parts.netloc}{link}"
        download_path = self.download_directory / f"usgsuswtdb-{year_month}.zip"
        self.logger.debug(f"Attempting to download {url} to {download_path}")
        await self.download_zipfile(url, download_path)

        return ResourceInfo(
            local_path=download_path, partitions={"year_month": year_month}
        )
20 changes: 8 additions & 12 deletions src/pudl_archiver/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,8 @@ def parse_main(args=None):
nargs="*",
help="Years to download data for. Supported datasets: censusdp1tract, censuspep, "
"eia176, eia191, eia757a, eia860, eia860m, eia861, eia923, eia930, eia_bulk_elec, "
"eiaaeo, eiamecs, eiawater, epacamd_eia, epacems, epaegrid,ferc1, ferc2, ferc6, "
"ferc60, ferc714, mshamines, nrelatb, phmsagas",
"eiaaeo, eiamecs, eiawater, epacamd_eia, epacems, epaegrid, ferc1, ferc2, ferc6, "
"ferc60, ferc714, mshamines, nrelatb, phmsagas, usgsuswtdb",
type=int,
)
parser.add_argument(
Expand Down Expand Up @@ -68,16 +68,12 @@ def parse_main(args=None):
action="store_true",
help="Automatically publish a deposition, rather than requiring manual review before publishing.",
)
(
parser.add_argument(
"--deposition-path",
help=(
"Configurable base path used by `fsspec` depositor. Expects paths in `fsspec` compatible "
"format like: 'file://local/path/to/folder' or file:///absolute/path/to/folder or "
"gs://path/to/gcs_bucket"
),
default=None,
),
parser.add_argument(
"--deposition-path",
help="Configurable base path used by `fsspec` depositor. Expects paths in `fsspec` compatible "
"format like: 'file://local/path/to/folder' or file:///absolute/path/to/folder or "
"gs://path/to/gcs_bucket",
default=None,
)
parser.add_argument(
"--refresh-metadata",
Expand Down
3 changes: 3 additions & 0 deletions src/pudl_archiver/package_data/zenodo_doi.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,9 @@ phmsagas:
usgsuspvdb:
production_doi: 10.5281/zenodo.14736285
sandbox_doi: 10.5072/zenodo.157776
usgsuswtdb:
production_doi: 10.5281/zenodo.14783214
sandbox_doi: 10.5072/zenodo.161235
vcerare:
production_doi: 10.5281/zenodo.13937522
sandbox_doi: 10.5072/zenodo.118136

0 comments on commit fc7950f

Please sign in to comment.