Issue/get india data #127

Open
wants to merge 16 commits into base: main
Changes from 14 commits
125 changes: 125 additions & 0 deletions examples/get_india_data.ipynb
@@ -0,0 +1,125 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Get metadata for all systems in a country \n",
"\n",
"#### (excluding AU, US, NL, UK and DE) \n",
"\n",
"- Makes a request to the Get Country System Service - World.\n",
"- For more information on this route, [see pvoutput.org API documentation](https://pvoutput.org/help/data_services.html#get-country-system-service). \n",
"\n",
"- The request being run by this notebook looks like this: ```https://pvoutput.org/data/r2/getcountrysystem.jsp?c={country_code}&from={start_id_range}&to={end_id_range}```, where ```country_code``` is the code for the country found in the docs and ```from``` and ```to``` are a range queried for system ids. \n",
"\n",
"- NB: The maximum range of the \"from\" and \"to\" parameters is 20000.\n",
"\n",
"**2023-11-06**"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Setup"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"import pandas as pd\n",
"\n",
"sys.path.append(\"..\")\n",
"from pvoutput.pvoutput import PVOutput"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# set up pvoutput environment variables \n",
"# API_KEY is the key provided via the pvoutput.org account used\n",
"# SYSTEM_ID is the system_id connected to your account on pvoutput.org\n",
"api_key = \"API_KEY\"\n",
"system_id = \"SYSTEM_ID\"\n",
"data_service_url = \"https://pvoutput.org\"\n",
"country_code=\"COUNTRY_CODE\" # get from pvoutput.org, e.g. \"in\" for India\n",
"pv = PVOutput(api_key=api_key, system_id=system_id, data_service_url=data_service_url)\n"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"# set up a directory locally to store data \n",
"CACHE_DIR = \"../examples/pv_data\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Create csv file with metadata from country systems. \n",
"\n",
"Could be useful to check the Get Country System Service [documentation](https://pvoutput.org/help/data_services.html#get-country-system-service) for data specifications."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# set up a list of ranges to check for system ids and metadat\n",
"start_id_range = list(range(1, 120000, 20000))\n",
"end_id_range = list(range(20000, 140000, 20000))\n",
"\n",
"# get system metadata for a country\n",
"def get_system_metadata_by_country(start_id_range=start_id_range, end_id_range=end_id_range):\n",
" frames = []\n",
" i=0\n",
" while i < len(start_id_range) and i < len(end_id_range):\n",
" df = pv.get_metadata_for_country(country_code={country_code}, start_id_range=start_id_range[i], end_id_range=end_id_range[i], use_data_service=True)\n",
" frames.append(df)\n",
" i+=1\n",
" df= pd.concat(frames)\n",
" df.to_csv(f\"{CACHE_DIR}/example.csv\", index=False)\n",
" return df\n",
"\n",
"get_system_metadata_by_country(start_id_range=start_id_range, end_id_range=end_id_range)\n",
"\n",
"\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "pvoutput-venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
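For orientation, the route this notebook wraps can also be exercised directly. A minimal `requests` sketch (the header names follow pvoutput.org's documented API convention; the key and system id are placeholders):

```python
import requests

API_KEY = "API_KEY"      # placeholder: your pvoutput.org API key
SYSTEM_ID = "SYSTEM_ID"  # placeholder: your pvoutput.org system id

# One chunk of the id space; the route caps each from/to span at 20000 ids.
params = {"c": "in", "from": 1, "to": 20000}
headers = {"X-Pvoutput-Apikey": API_KEY, "X-Pvoutput-SystemId": SYSTEM_ID}

response = requests.get(
    "https://pvoutput.org/data/r2/getcountrysystem.jsp",
    params=params,
    headers=headers,
    timeout=30,
)
response.raise_for_status()
print(response.text.splitlines()[:3])  # CSV rows, one system per line
```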
1 change: 1 addition & 0 deletions pvoutput/mapscraper.py
@@ -350,6 +350,7 @@ def clean_soup(soup):
"""Function to clean scraped soup object.

Note that the downloaded soup could change over time.

Args:
soup: bs4.BeautifulSoup

140 changes: 121 additions & 19 deletions pvoutput/pvoutput.py
Expand Up @@ -277,7 +277,7 @@ def get_status(

# add timezone
if timezone is not None:
pv_system_status = pv_system_status.tz_localize(timezone).tz_convert("UTC")
pv_system_status = pv_system_status.tz_convert(timezone).tz_convert("UTC")

return pv_system_status
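The `tz_localize` → `tz_convert` swaps in this file assume the timestamps are already tz-aware. A standalone pandas sketch of the difference (illustrative data only):

```python
import pandas as pd

# tz-aware index, as returned by a tz-aware data source
aware = pd.date_range("2023-11-06", periods=3, freq="5min", tz="Asia/Kolkata")
print(aware.tz_convert("UTC"))  # re-express an aware index in another zone

# tz_localize attaches a zone to a *naive* index; calling it on an
# already-aware index raises TypeError, so aware data needs tz_convert.
naive = aware.tz_localize(None)
print(naive.tz_localize("Asia/Kolkata"))
```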

@@ -340,7 +340,6 @@ def get_system_status(

pv_system_status = []
for pv_system_status_text in pv_systems_status_text:

try:
one_pv_system_status = process_system_status(
pv_system_status_text=pv_system_status_text, date=date
@@ -360,7 +359,7 @@ def get_system_status(
if timezone is not None:
pv_system_status["datetime"] = (
pd.DatetimeIndex(pv_system_status["datetime"])
.tz_localize(timezone)
.tz_convert(timezone)
.tz_convert("UTC")
)

@@ -416,10 +415,17 @@ def get_batch_status(
for retry in range(max_retries):
try:
pv_system_status_text = self._api_query(
service="getbatchstatus", api_params=api_params, use_data_service=True, **kwargs
service="getbatchstatus",
api_params=api_params,
use_data_service=True,
**kwargs,
)
except NoStatusFound:
_LOG.info("system_id %d: No status found for date_to %s", pv_system_id, date_to)
_LOG.info(
"system_id %d: No status found for date_to %s",
pv_system_id,
date_to,
)
pv_system_status_text = ""
break

@@ -517,6 +523,79 @@ def get_metadata(self, pv_system_id: int, **kwargs) -> pd.Series:
pv_metadata.name = pv_system_id
return pv_metadata

def get_metadata_for_country(
self, country_code: str, start_id_range: int, end_id_range: int, **kwargs
) -> pd.DataFrame:
"""Get metadata for a single PV system.

Args:
country_code: str, country code as listed in the pvoutput.org docs, e.g. "in" for India.
start_id_range: int, first system id in the range to query.
end_id_range: int, last system id in the range to query.

Returns:
pd.DataFrame with one row per system and columns:
system_id,
system_size_W,
postcode,
num_panels,
panel_power_W,
num_inverters,
inverter_capacity_W,
orientation,
array_tilt_degrees,
shade,
install_date,
latitude,
longitude,
status_interval_minutes,
secondary_num_panels,
secondary_panel_capacity_W_each,
secondary_orientation,
secondary_array_tilt_degrees
"""
pv_metadata_text = self._api_query(
service="getcountrysystem",
api_params={
"c": country_code, # Provide data about secondary array, if present.
"from": start_id_range,
"to": end_id_range,
},
**kwargs,
)

_LOG.debug(f"getting metadata for {country_code} for {start_id_range} to {end_id_range}")
print(
f"Getting metadata for country code: {country_code} for {start_id_range} to {end_id_range}"
)

pv_metadata_for_country = pd.read_csv(
StringIO(pv_metadata_text),
lineterminator="\n",
names=[
"system_id",
"system_size_W",
"postcode",
"num_panels",
"panel_power_W",
"num_inverters",
"inverter_capacity_W",
"orientation",
"array_tilt_degrees",
"shade",
"install_date",
"latitude",
"longitude",
"status_interval_minutes",
"secondary_num_panels",
"secondary_panel_capacity_W_each",
"secondary_orientation",
"secondary_array_tilt_degrees",
],
parse_dates=["install_date"],
)
return pv_metadata_for_country

def get_statistic(
self,
pv_system_id: int,
@@ -720,10 +799,13 @@ def download_multiple_systems_to_disk(
output_filename, pv_system_id, start_date, end_date
)

# How much data is actually available?
date_ranges_to_download = self._filter_date_range(
output_filename, pv_system_id, date_ranges_to_download, min_data_availability
)
# # How much data is actually available?
Contributor:

Might be good to put this back in, and add a comment saying: "if you don't have access to the XXX API route, then comment this bit out." Or something like that.
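A sketch of what that could look like (comment wording hypothetical; "XXX" is left as in the comment above):

```python
# How much data is actually available?
# NOTE: if you don't have access to the XXX API route, comment this block out.
date_ranges_to_download = self._filter_date_range(
    output_filename,
    pv_system_id,
    date_ranges_to_download,
    min_data_availability,
)
```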

# date_ranges_to_download = self._filter_date_range(
# output_filename,
# pv_system_id,
# date_ranges_to_download,
# min_data_availability,
# )

if not date_ranges_to_download:
_LOG.info("system_id %d: No data left to download :)", pv_system_id)
@@ -861,7 +943,11 @@ def _filter_date_range(
return new_date_ranges

def _download_multiple_using_get_batch_status(
self, output_filename, pv_system_id, date_ranges_to_download, timezone: Optional[str] = None
self,
output_filename,
pv_system_id,
date_ranges_to_download,
timezone: Optional[str] = None,
):
years = merge_date_ranges_to_years(date_ranges_to_download)
dates_to = [year.end_date for year in years]
@@ -875,7 +961,11 @@ def _download_multiple_using_get_batch_status(
sort_and_de_dupe_pv_system(store, pv_system_id)

def _download_multiple_using_get_status(
self, output_filename, pv_system_id, date_ranges_to_download, timezone: Optional[str] = None
self,
output_filename,
pv_system_id,
date_ranges_to_download,
timezone: Optional[str] = None,
):
for date_range in date_ranges_to_download:
dates = date_range.date_range()
@@ -904,7 +994,9 @@ def _download_multiple_worker(
timeseries = self.get_batch_status(pv_system_id, date_to=date_to_load)
if timeseries.empty:
_LOG.info(
"system_id %d: Got empty timeseries back for %s", pv_system_id, date_to_load
"system_id %d: Got empty timeseries back for %s",
pv_system_id,
date_to_load,
)
if use_get_status:
_append_missing_date_range(
@@ -924,8 +1016,8 @@ def _download_multiple_worker(
)
else:
total_rows += len(timeseries)
_LOG.info(f'Adding timezone {timezone} to {total_rows} rows')
timeseries = timeseries.tz_localize(timezone)
_LOG.info(f"Adding timezone {timezone} to {total_rows} rows")
timeseries = timeseries.tz_convert(timezone)
_LOG.info(
"system_id: %d: %d rows retrieved: %s to %s",
pv_system_id,
Expand All @@ -946,7 +1038,8 @@ def _download_multiple_worker(
timeseries["datetime_of_API_request"] = datetime_of_api_request
timeseries["query_date"] = pd.Timestamp(date_to_load)
key = system_id_to_hdf_key(pv_system_id)
with pd.HDFStore(output_filename, mode="a", complevel=9) as store:
print(key)
with pd.HDFStore(output_filename, mode="a") as store:
with warnings.catch_warnings():
warnings.simplefilter("ignore", tables.NaturalNameWarning)
store.append(key=key, value=timeseries, data_columns=True)
@@ -1049,7 +1142,10 @@ def _set_rate_limit_params(self, headers):
setattr(self, param_name, header_value)

self.rate_limit_reset_time = pd.Timestamp.utcfromtimestamp(self.rate_limit_reset_time)
self.rate_limit_reset_time = self.rate_limit_reset_time.tz_localize("utc")
if self.rate_limit_reset_time.tzinfo is None:
self.rate_limit_reset_time = self.rate_limit_reset_time.tz_localize("utc")
else:
self.rate_limit_reset_time = self.rate_limit_reset_time.tz_convert("utc")

_LOG.debug("%s", self.rate_limit_info())

@@ -1191,9 +1287,12 @@ def check_pv_system_status(pv_system_status: pd.DataFrame, requested_date: date)


def _append_missing_date_range(
output_filename, pv_system_id, missing_start_date, missing_end_date, datetime_of_api_request
output_filename,
pv_system_id,
missing_start_date,
missing_end_date,
datetime_of_api_request,
):

data = {
"missing_start_date_PV_localtime": pd.Timestamp(missing_start_date),
"missing_end_date_PV_localtime": pd.Timestamp(missing_end_date),
@@ -1258,7 +1357,10 @@ def _convert_consecutive_dates_to_date_ranges(missing_dates):

end_date = missing_dates[-1]
new_missing.append(
{"missing_start_date_PV_localtime": start_date, "missing_end_date_PV_localtime": end_date}
{
"missing_start_date_PV_localtime": start_date,
"missing_end_date_PV_localtime": end_date,
}
)

return pd.DataFrame(new_missing)
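For reference, a short usage sketch of the `get_metadata_for_country` method added above (country code, id ranges, and output path are illustrative; the 20000-id chunking mirrors the example notebook):

```python
import pandas as pd

from pvoutput.pvoutput import PVOutput

pv = PVOutput(
    api_key="API_KEY",      # placeholder credentials
    system_id="SYSTEM_ID",
    data_service_url="https://pvoutput.org",
)

# The route caps each request at a 20000-id span, so walk the id space
# in 20000-id chunks and concatenate the per-chunk frames.
frames = [
    pv.get_metadata_for_country(
        country_code="in",
        start_id_range=start,
        end_id_range=start + 19999,
        use_data_service=True,
    )
    for start in range(1, 120001, 20000)
]
metadata = pd.concat(frames, ignore_index=True)
metadata.to_csv("pv_data/india_metadata.csv", index=False)  # hypothetical path
```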