Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Shapefile/Raster to Patch Container/Webservice #116

Open
clorton opened this issue Jan 29, 2025 · 3 comments
Open

Shapefile/Raster to Patch Container/Webservice #116

clorton opened this issue Jan 29, 2025 · 3 comments

Comments

@clorton
Copy link
Contributor

clorton commented Jan 29, 2025

GDAL is a hassle to install and configure to support Python based access to GeoTIFF files. We should build a container configured correctly and expose a webservice to take shapefiles and overlay them on population rasters to provide patch information with populations.

@jonathanhhb
Copy link
Collaborator

Here's a script I cooked up which seems to get a population file and a shapefile for a country (in this case NGA) from publicly available reliable sources.

import requests
import rasterio
import pandas as pd
from rasterio.mask import mask
from shapely.geometry import shape
import geopandas as gpd
import os
import zipfile

# Define the population data URL for Nigeria (WorldPop)
population_url = "https://data.worldpop.org/GIS/Population/Global_2000_2020_Constrained/2020/maxar_v1/NGA/nga_ppp_2020_UNadj_constrained.tif"
local_population_filename = "nga_ppp_2020_UNadj_constrained.tif"

# Define the URL to download the Nigeria Admin Level 2 shapefile (HDX source)
shapefile_url = "https://data.humdata.org/dataset/81ac1d38-f603-4a98-804d-325c658599a3/resource/aa69f07b-ed8e-456a-9233-b20674730be6/download/nga_adm_osgof_20190417.zip"
local_shapefile_zip = "nga_adm_osgof_20190417.zip"
local_shapefile_dir = "nga_shapefile"

# Download the population data file if it doesn't exist
def download_population_data(url, filename):
    if not os.path.exists(filename):
        print(f"Downloading population data from {url}...")
        response = requests.get(url, stream=True)
        with open(filename, "wb") as f:
            for chunk in response.iter_content(1024):
                f.write(chunk)
        print(f"Downloaded population data to {filename}.")
    else:
        print(f"Population data already downloaded: {filename}")

# Download and extract the shapefile if it doesn't exist
def download_shapefile(url, zip_filename, extract_dir):
    if not os.path.exists(extract_dir):
        print(f"Downloading shapefile from {url}...")
        response = requests.get(url, stream=True)
        with open(zip_filename, "wb") as f:
            for chunk in response.iter_content(1024):
                f.write(chunk)
        print(f"Downloaded shapefile to {zip_filename}.")
        
        # Extract the zip file
        with zipfile.ZipFile(zip_filename, "r") as zip_ref:
            zip_ref.extractall(extract_dir)
        print(f"Shapefile extracted to {extract_dir}.")
    else:
        print(f"Shapefile already downloaded and extracted to {extract_dir}.")

# Load the shapefile for Admin Level 2 boundaries (using GeoPandas)
def load_shapefile(shapefile_dir):
    shapefile_path = os.path.join(shapefile_dir, "nga_admbnda_adm2_osgof_20190417.shp")
    gdf = gpd.read_file(shapefile_path)
    return gdf

# Extract population data for Admin Level 2 regions from the .tif file
def extract_population_data(population_filename, shapefile_gdf):
    population_data = []

    # Open the raster file (population data)
    with rasterio.open(population_filename) as src:
        nodata_value = src.nodata  # Get the NoData value from the raster
        
        for _, region in shapefile_gdf.iterrows():
            # Get the geometry of the current region
            region_geometry = region['geometry']
            region_shape = [shape(region_geometry)]

            # Mask the raster with the geometry of the region
            out_image, out_transform = mask(src, region_shape, crop=True)

            # Mask the NoData values and set them to zero
            if nodata_value is not None:
                out_image[out_image == nodata_value] = 0

            # Sum the population in the region
            population = out_image.sum()

            # Extract region-specific attributes
            region_id = region['ADM2_PCODE']  # Using ADM2_PCODE as the region ID
            region_name = region['ADM2_EN']   # Using ADM2_EN as the region name

            # Store the result
            population_data.append({
                'region_id': region_id,
                'region_name': region_name,
                'population': population
            })
    
    return population_data

# Save population data to a CSV file
def save_population_to_csv(population_data, output_file):
    df = pd.DataFrame(population_data)
    df.to_csv(output_file, index=False)
    print(f"Population data saved to {output_file}.")

# Main function to orchestrate the process
def main():
    # Step 1: Download population data (if not already downloaded)
    download_population_data(population_url, local_population_filename)
    
    # Step 2: Download and extract the Admin Level 2 shapefile (if not already done)
    download_shapefile(shapefile_url, local_shapefile_zip, local_shapefile_dir)
    
    # Step 3: Load the Admin Level 2 shapefile for Nigeria
    shapefile_gdf = load_shapefile(local_shapefile_dir)
    
    # Step 4: Extract population data for each Admin Level 2 region
    population_data = extract_population_data(local_population_filename, shapefile_gdf)
    
    # Step 5: Save the population data to a CSV file
    output_csv = "nigeria_population_admin_level_2.csv"
    save_population_to_csv(population_data, output_csv)

# Run the script
if __name__ == "__main__":
    main()

@jonathanhhb
Copy link
Collaborator

I cooked this up yesterday afternoon:

https://deepnote.com/app/idm/LASER-095fb38b-bc23-4447-ab35-79eb252c9bd3

The app (which may run very slowly) is really just eye candy to show the code working. This seems to work for lots of countries for admin level 1 and 2. The code itself gives you a csv file with the population for each region.

Chris & I thought something like this would be very useful as a front-end for LASER.

@clorton
Copy link
Contributor Author

clorton commented Jan 30, 2025

  1. shebang trick to use uv to run the script in a virtual environment with the necessary packages (chmod +x script.py).
  2. tqdm to give some feedback about progress since downloads can take seconds to minutes.
#!/usr/bin/env -S uv run --script
# /// script
# requires-python = ">=3.12"
# dependencies = [
#   "geopandas",
#   "pandas",
#   "rasterio",
#   "requests",
#   "tqdm",
# ]
# ///

import requests
import rasterio
import pandas as pd
from rasterio.mask import mask
from shapely.geometry import shape
import geopandas as gpd
import os
import zipfile
from tqdm import tqdm

# Define the population data URL for Nigeria (WorldPop)
population_url = "https://data.worldpop.org/GIS/Population/Global_2000_2020_Constrained/2020/maxar_v1/NGA/nga_ppp_2020_UNadj_constrained.tif"
local_population_filename = "nga_ppp_2020_UNadj_constrained.tif"

# Define the URL to download the Nigeria Admin Level 2 shapefile (HDX source)
shapefile_url = "https://data.humdata.org/dataset/81ac1d38-f603-4a98-804d-325c658599a3/resource/aa69f07b-ed8e-456a-9233-b20674730be6/download/nga_adm_osgof_20190417.zip"
local_shapefile_zip = "nga_adm_osgof_20190417.zip"
local_shapefile_dir = "nga_shapefile"

# Download the population data file if it doesn't exist
def download_population_data(url, filename):
    if not os.path.exists(filename):
        print(f"Downloading population data from {url}...")
        response = requests.get(url, stream=True)
        with open(filename, "wb") as f:
            for chunk in tqdm(response.iter_content(1024)):
                f.write(chunk)
        print(f"Downloaded population data to {filename}.")
    else:
        print(f"Population data already downloaded: {filename}")

# Download and extract the shapefile if it doesn't exist
def download_shapefile(url, zip_filename, extract_dir):
    if not os.path.exists(extract_dir):
        print(f"Downloading shapefile from {url}...")
        response = requests.get(url, stream=True)
        with open(zip_filename, "wb") as f:
            for chunk in tqdm(response.iter_content(1024)):
                f.write(chunk)
        print(f"Downloaded shapefile to {zip_filename}.")
        
        # Extract the zip file
        with zipfile.ZipFile(zip_filename, "r") as zip_ref:
            zip_ref.extractall(extract_dir)
        print(f"Shapefile extracted to {extract_dir}.")
    else:
        print(f"Shapefile already downloaded and extracted to {extract_dir}.")

# Load the shapefile for Admin Level 2 boundaries (using GeoPandas)
def load_shapefile(shapefile_dir):
    shapefile_path = os.path.join(shapefile_dir, "nga_admbnda_adm2_osgof_20190417.shp")
    gdf = gpd.read_file(shapefile_path)
    return gdf

# Extract population data for Admin Level 2 regions from the .tif file
def extract_population_data(population_filename, shapefile_gdf):
    population_data = []

    # Open the raster file (population data)
    with rasterio.open(population_filename) as src:
        nodata_value = src.nodata  # Get the NoData value from the raster
        
        for _, region in tqdm(shapefile_gdf.iterrows()):
            # Get the geometry of the current region
            region_geometry = region['geometry']
            region_shape = [shape(region_geometry)]

            # Mask the raster with the geometry of the region
            out_image, out_transform = mask(src, region_shape, crop=True)

            # Mask the NoData values and set them to zero
            if nodata_value is not None:
                out_image[out_image == nodata_value] = 0

            # Sum the population in the region
            population = out_image.sum()

            # Extract region-specific attributes
            region_id = region['ADM2_PCODE']  # Using ADM2_PCODE as the region ID
            region_name = region['ADM2_EN']   # Using ADM2_EN as the region name

            # Store the result
            population_data.append({
                'region_id': region_id,
                'region_name': region_name,
                'population': population
            })
    
    return population_data

# Save population data to a CSV file
def save_population_to_csv(population_data, output_file):
    df = pd.DataFrame(population_data)
    df.to_csv(output_file, index=False)
    print(f"Population data saved to {output_file}.")

# Main function to orchestrate the process
def main():
    # Step 1: Download population data (if not already downloaded)
    download_population_data(population_url, local_population_filename)
    
    # Step 2: Download and extract the Admin Level 2 shapefile (if not already done)
    download_shapefile(shapefile_url, local_shapefile_zip, local_shapefile_dir)
    
    # Step 3: Load the Admin Level 2 shapefile for Nigeria
    shapefile_gdf = load_shapefile(local_shapefile_dir)
    
    # Step 4: Extract population data for each Admin Level 2 region
    population_data = extract_population_data(local_population_filename, shapefile_gdf)
    
    # Step 5: Save the population data to a CSV file
    output_csv = "nigeria_population_admin_level_2.csv"
    save_population_to_csv(population_data, output_csv)

# Run the script
if __name__ == "__main__":
    main()

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants