Issue/get india data #127

Open
wants to merge 16 commits into base: main
Changes from 14 commits
125 changes: 125 additions & 0 deletions examples/get_india_data.ipynb
@@ -0,0 +1,125 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Get metadata for all systems in a country \n",
"\n",
"#### (excluding AU, US, NL, UK and DE) \n",
"\n",
"- Makes a request to the Get Country System Service - World.\n",
"- For more information on this route, [see pvoutput.org API documentation](https://pvoutput.org/help/data_services.html#get-country-system-service). \n",
"\n",
"- The request being run by this notebook looks like this: ```https://pvoutput.org/data/r2/getcountrysystem.jsp?c={country_code}&from={start_id_range}&to={end_id_range}```, where ```country_code``` is the code for the country found in the docs and ```from``` and ```to``` are a range queried for system ids. \n",
"\n",
"- NB: The maximum range of the \"from\" and \"to\" parameters is 20000.\n",
"\n",
"**2023-11-06**"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Setup"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"import pandas as pd\n",
"\n",
"sys.path.append(\"..\")\n",
"from pvoutput.pvoutput import PVOutput"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# set up pvoutput environment variables \n",
"# API_KEY is the key provided via the pvoutput.org account used\n",
"# SYSTEM_ID is the system_id connected to your account on pvoutput.org\n",
"api_key = \"API_KEY\"\n",
"system_id = \"SYSTEM_ID\"\n",
"data_service_url = \"https://pvoutput.org\"\n",
"country_code=\"COUNTRY_CODE\" # get from pvoutput.org, e.g. \"in\" for India\n",
"pv = PVOutput(api_key=api_key, system_id=system_id, data_service_url=data_service_url)\n"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"# set up a directory locally to store data \n",
"CACHE_DIR = \"../examples/pv_data\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Create csv file with metadata from country systems. \n",
"\n",
"Could be useful to check the Get Country System Service [documentation](https://pvoutput.org/help/data_services.html#get-country-system-service) for data specifications."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# set up a list of ranges to check for system ids and metadat\n",
"start_id_range = list(range(1, 120000, 20000))\n",
"end_id_range = list(range(20000, 140000, 20000))\n",
"\n",
"# get system metadata for a country\n",
"def get_system_metadata_by_country(start_id_range=start_id_range, end_id_range=end_id_range):\n",
" frames = []\n",
" i=0\n",
" while i < len(start_id_range) and i < len(end_id_range):\n",
" df = pv.get_metadata_for_country(country_code={country_code}, start_id_range=start_id_range[i], end_id_range=end_id_range[i], use_data_service=True)\n",
" frames.append(df)\n",
" i+=1\n",
" df= pd.concat(frames)\n",
" df.to_csv(f\"{CACHE_DIR}/example.csv\", index=False)\n",
" return df\n",
"\n",
"get_system_metadata_by_country(start_id_range=start_id_range, end_id_range=end_id_range)\n",
"\n",
"\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "pvoutput-venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
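For orientation, the route this notebook wraps can also be exercised directly. A minimal `requests` sketch (the header names follow pvoutput.org's documented API convention; the key and system id are placeholders):

```python
import requests

API_KEY = "API_KEY"      # placeholder: your pvoutput.org API key
SYSTEM_ID = "SYSTEM_ID"  # placeholder: your pvoutput.org system id

# One chunk of the id space; the route caps each from/to span at 20000 ids.
params = {"c": "in", "from": 1, "to": 20000}
headers = {"X-Pvoutput-Apikey": API_KEY, "X-Pvoutput-SystemId": SYSTEM_ID}

response = requests.get(
    "https://pvoutput.org/data/r2/getcountrysystem.jsp",
    params=params,
    headers=headers,
    timeout=30,
)
response.raise_for_status()
print(response.text.splitlines()[:3])  # CSV rows, one system per line
```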
1 change: 1 addition & 0 deletions pvoutput/mapscraper.py
@@ -350,6 +350,7 @@ def clean_soup(soup):
"""Function to clean scraped soup object.

Note that the downloaded soup could change over time.

Args:
soup: bs4.BeautifulSoup

140 changes: 121 additions & 19 deletions pvoutput/pvoutput.py
Expand Up @@ -277,7 +277,7 @@ def get_status(

# add timezone
if timezone is not None:
pv_system_status = pv_system_status.tz_localize(timezone).tz_convert("UTC")
pv_system_status = pv_system_status.tz_convert(timezone).tz_convert("UTC")

return pv_system_status
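The `tz_localize` → `tz_convert` swaps in this file assume the timestamps are already tz-aware. A standalone pandas sketch of the difference (illustrative data only):

```python
import pandas as pd

# tz-aware index, as returned by a tz-aware data source
aware = pd.date_range("2023-11-06", periods=3, freq="5min", tz="Asia/Kolkata")
print(aware.tz_convert("UTC"))  # re-express an aware index in another zone

# tz_localize attaches a zone to a *naive* index; calling it on an
# already-aware index raises TypeError, so aware data needs tz_convert.
naive = aware.tz_localize(None)
print(naive.tz_localize("Asia/Kolkata"))
```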

@@ -340,7 +340,6 @@ def get_system_status(

pv_system_status = []
for pv_system_status_text in pv_systems_status_text:

try:
one_pv_system_status = process_system_status(
pv_system_status_text=pv_system_status_text, date=date
@@ -360,7 +359,7 @@ def get_system_status(
if timezone is not None:
pv_system_status["datetime"] = (
pd.DatetimeIndex(pv_system_status["datetime"])
.tz_localize(timezone)
.tz_convert(timezone)
.tz_convert("UTC")
)

@@ -416,10 +415,17 @@ def get_batch_status(
for retry in range(max_retries):
try:
pv_system_status_text = self._api_query(
service="getbatchstatus", api_params=api_params, use_data_service=True, **kwargs
service="getbatchstatus",
api_params=api_params,
use_data_service=True,
**kwargs,
)
except NoStatusFound:
_LOG.info("system_id %d: No status found for date_to %s", pv_system_id, date_to)
_LOG.info(
"system_id %d: No status found for date_to %s",
pv_system_id,
date_to,
)
pv_system_status_text = ""
break

@@ -517,6 +523,79 @@ def get_metadata(self, pv_system_id: int, **kwargs) -> pd.Series:
pv_metadata.name = pv_system_id
return pv_metadata

def get_metadata_for_country(
self, country_code: str, start_id_range: int, end_id_range: int, **kwargs
) -> pd.DataFrame:
"""Get metadata for a single PV system.

Args:
country_code: str, country code as listed in the pvoutput.org docs, e.g. "in" for India.
start_id_range: int, first system id in the range to query.
end_id_range: int, last system id in the range to query.

Returns:
pd.DataFrame with one row per system and columns:
system_id,
system_size_W,
postcode,
num_panels,
panel_power_W,
num_inverters,
inverter_capacity_W,
orientation,
array_tilt_degrees,
shade,
install_date,
latitude,
longitude,
status_interval_minutes,
secondary_num_panels,
secondary_panel_capacity_W_each,
secondary_orientation,
secondary_array_tilt_degrees
"""
pv_metadata_text = self._api_query(
service="getcountrysystem",
api_params={
"c": country_code, # Provide data about secondary array, if present.
"from": start_id_range,
"to": end_id_range,
},
**kwargs,
)

_LOG.debug(f"getting metadata for {country_code} for {start_id_range} to {end_id_range}")
print(
f"Getting metadata for country code: {country_code} for {start_id_range} to {end_id_range}"
)

pv_metadata_for_country = pd.read_csv(
StringIO(pv_metadata_text),
lineterminator="\n",
names=[
"system_id",
"system_size_W",
"postcode",
"num_panels",
"panel_power_W",
"num_inverters",
"inverter_capacity_W",
"orientation",
"array_tilt_degrees",
"shade",
"install_date",
"latitude",
"longitude",
"status_interval_minutes",
"secondary_num_panels",
"secondary_panel_capacity_W_each",
"secondary_orientation",
"secondary_array_tilt_degrees",
],
parse_dates=["install_date"],
)
return pv_metadata_for_country

def get_statistic(
self,
pv_system_id: int,
@@ -720,10 +799,13 @@ def download_multiple_systems_to_disk(
output_filename, pv_system_id, start_date, end_date
)

# How much data is actually available?
date_ranges_to_download = self._filter_date_range(
output_filename, pv_system_id, date_ranges_to_download, min_data_availability
)
# # How much data is actually available?
Contributor:

Might be good to put this back in, and add a comment saying: "if you don't have access to the XXX API route, then comment this bit out." Or something like that.
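A sketch of what that could look like (comment wording hypothetical; "XXX" is left as in the comment above):

```python
# How much data is actually available?
# NOTE: if you don't have access to the XXX API route, comment this block out.
date_ranges_to_download = self._filter_date_range(
    output_filename,
    pv_system_id,
    date_ranges_to_download,
    min_data_availability,
)
```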

# date_ranges_to_download = self._filter_date_range(
# output_filename,
# pv_system_id,
# date_ranges_to_download,
# min_data_availability,
# )

if not date_ranges_to_download:
_LOG.info("system_id %d: No data left to download :)", pv_system_id)
@@ -861,7 +943,11 @@ def _filter_date_range(
return new_date_ranges

def _download_multiple_using_get_batch_status(
self, output_filename, pv_system_id, date_ranges_to_download, timezone: Optional[str] = None
self,
output_filename,
pv_system_id,
date_ranges_to_download,
timezone: Optional[str] = None,
):
years = merge_date_ranges_to_years(date_ranges_to_download)
dates_to = [year.end_date for year in years]
@@ -875,7 +961,11 @@ def _download_multiple_using_get_batch_status(
sort_and_de_dupe_pv_system(store, pv_system_id)

def _download_multiple_using_get_status(
self, output_filename, pv_system_id, date_ranges_to_download, timezone: Optional[str] = None
self,
output_filename,
pv_system_id,
date_ranges_to_download,
timezone: Optional[str] = None,
):
for date_range in date_ranges_to_download:
dates = date_range.date_range()
@@ -904,7 +994,9 @@ def _download_multiple_worker(
timeseries = self.get_batch_status(pv_system_id, date_to=date_to_load)
if timeseries.empty:
_LOG.info(
"system_id %d: Got empty timeseries back for %s", pv_system_id, date_to_load
"system_id %d: Got empty timeseries back for %s",
pv_system_id,
date_to_load,
)
if use_get_status:
_append_missing_date_range(
@@ -924,8 +1016,8 @@ def _download_multiple_worker(
)
else:
total_rows += len(timeseries)
_LOG.info(f'Adding timezone {timezone} to {total_rows} rows')
timeseries = timeseries.tz_localize(timezone)
_LOG.info(f"Adding timezone {timezone} to {total_rows} rows")
timeseries = timeseries.tz_convert(timezone)
_LOG.info(
"system_id: %d: %d rows retrieved: %s to %s",
pv_system_id,
Expand All @@ -946,7 +1038,8 @@ def _download_multiple_worker(
timeseries["datetime_of_API_request"] = datetime_of_api_request
timeseries["query_date"] = pd.Timestamp(date_to_load)
key = system_id_to_hdf_key(pv_system_id)
with pd.HDFStore(output_filename, mode="a", complevel=9) as store:
print(key)
with pd.HDFStore(output_filename, mode="a") as store:
with warnings.catch_warnings():
warnings.simplefilter("ignore", tables.NaturalNameWarning)
store.append(key=key, value=timeseries, data_columns=True)
@@ -1049,7 +1142,10 @@ def _set_rate_limit_params(self, headers):
setattr(self, param_name, header_value)

self.rate_limit_reset_time = pd.Timestamp.utcfromtimestamp(self.rate_limit_reset_time)
self.rate_limit_reset_time = self.rate_limit_reset_time.tz_localize("utc")
if self.rate_limit_reset_time.tzinfo is None:
self.rate_limit_reset_time = self.rate_limit_reset_time.tz_localize("utc")
else:
self.rate_limit_reset_time = self.rate_limit_reset_time.tz_convert("utc")

_LOG.debug("%s", self.rate_limit_info())

@@ -1191,9 +1287,12 @@ def check_pv_system_status(pv_system_status: pd.DataFrame, requested_date: date)


def _append_missing_date_range(
output_filename, pv_system_id, missing_start_date, missing_end_date, datetime_of_api_request
output_filename,
pv_system_id,
missing_start_date,
missing_end_date,
datetime_of_api_request,
):

data = {
"missing_start_date_PV_localtime": pd.Timestamp(missing_start_date),
"missing_end_date_PV_localtime": pd.Timestamp(missing_end_date),
@@ -1258,7 +1357,10 @@ def _convert_consecutive_dates_to_date_ranges(missing_dates):

end_date = missing_dates[-1]
new_missing.append(
{"missing_start_date_PV_localtime": start_date, "missing_end_date_PV_localtime": end_date}
{
"missing_start_date_PV_localtime": start_date,
"missing_end_date_PV_localtime": end_date,
}
)

return pd.DataFrame(new_missing)
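For reference, a short usage sketch of the `get_metadata_for_country` method added above (country code, id ranges, and output path are illustrative; the 20000-id chunking mirrors the example notebook):

```python
import pandas as pd

from pvoutput.pvoutput import PVOutput

pv = PVOutput(
    api_key="API_KEY",      # placeholder credentials
    system_id="SYSTEM_ID",
    data_service_url="https://pvoutput.org",
)

# The route caps each request at a 20000-id span, so walk the id space
# in 20000-id chunks and concatenate the per-chunk frames.
frames = [
    pv.get_metadata_for_country(
        country_code="in",
        start_id_range=start,
        end_id_range=start + 19999,
        use_data_service=True,
    )
    for start in range(1, 120001, 20000)
]
metadata = pd.concat(frames, ignore_index=True)
metadata.to_csv("pv_data/india_metadata.csv", index=False)  # hypothetical path
```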