Skip to content

Commit

Permalink
update main
Browse files Browse the repository at this point in the history
  • Loading branch information
mdsage1 committed Mar 20, 2024
1 parent 122c79d commit 21a2906
Showing 1 changed file with 94 additions and 9 deletions.
103 changes: 94 additions & 9 deletions apps/openchallenges/edam-etl/src/main.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,101 @@
"""Extract, transform and load EDAM concepts"""

import pandas as pd
import requests
from os import getenv
from typing import Optional

# Read the configuration from environment variables.
#   OC_DB_URL     - presumably the OpenChallenges database URL; it is only
#                   echoed below, nothing in this file connects to it yet.
#   OC_DB_VERSION - EDAM release version; main() uses it to build the download
#                   URL and the local CSV file name.
# NOTE(review): OC_DB_URL may embed credentials - confirm it is safe to print.
OC_DB_URL = getenv("OC_DB_URL")
VERSION = getenv("OC_DB_VERSION")
print(f"EDAM Version: {VERSION}")
print(f"OC DB URL: {OC_DB_URL}")

# TODO: push the EDAM concepts to the OC challenge service DB (not implemented).


def download_edam_csv(url: str, version: str) -> Optional[bool]:
    """Download the EDAM concepts CSV and save it as ``EDAM_{version}.csv``.

    The file is written to the current working directory. Request failures
    are reported on stdout instead of being raised.

    Args:
        url: Location of the EDAM CSV file to fetch.
        version: EDAM release version, used only to name the local file.

    Returns:
        ``True`` when the file was downloaded and written, ``None`` when the
        HTTP request failed.
    """
    print("Downloading the EDAM concepts from GitHub (CSV file)...")
    try:
        # Without a timeout, requests waits forever on an unresponsive server.
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # Raise an exception for bad response
    except requests.RequestException as e:
        print(f"Error downloading EDAM concepts: {e}")
        return None
    with open(f"EDAM_{version}.csv", "wb") as f:
        f.write(response.content)
    print("EDAM concepts downloaded successfully.")
    return True


def transform_to_dataframe(version: str) -> Optional[pd.DataFrame]:
    """Load the downloaded EDAM CSV into a DataFrame with ids starting from 1.

    Reads ``EDAM_{version}.csv`` from the current working directory, keeps
    only the "Class ID" and "Preferred Label" columns, renames them to
    ``class_id`` and ``preferred_label``, and adds an ``id`` column that
    numbers the rows starting from 1.

    Args:
        version: EDAM release version used in the CSV file name.

    Returns:
        The transformed DataFrame, or ``None`` when the file is missing or
        cannot be processed (the error is printed rather than raised).
    """
    print("Processing the EDAM concepts...")
    try:
        df = (
            pd.read_csv(f"EDAM_{version}.csv", usecols=["Class ID", "Preferred Label"])
            .rename(
                columns={"Class ID": "class_id", "Preferred Label": "preferred_label"}
            )
            .assign(id=lambda x: x.reset_index(drop=True).index + 1)
        )
    except FileNotFoundError:
        # Report the file that was actually requested (the ``version``
        # argument), not the module-level VERSION constant.
        print(f"File EDAM_{version}.csv not found.")
        return None
    except Exception as e:
        # Deliberately broad: any read/parse problem is reported and signalled
        # to the caller as ``None`` instead of aborting the script.
        print(f"Error processing EDAM concepts: {e}")
        return None
    print("EDAM concepts processed successfully.")
    return df


def print_info_statistics(df: Optional[pd.DataFrame]) -> None:
    """Print summary information about the transformed EDAM concepts.

    Prints the row count, the column names and ``df.describe()``; prints
    "No data available." when ``df`` is ``None``.

    Args:
        df: DataFrame returned by ``transform_to_dataframe``, or ``None`` if
            the transformation failed.
    """
    if df is None:
        print("No data available.")
        return
    print(f"Number of Concepts Transformed: {len(df)}")
    print(f"Column names: {df.columns.tolist()}")
    print("Statistics:")
    # Show floats without decimal places. The option is scoped to this print
    # so the global pandas display settings are left untouched afterwards.
    with pd.option_context("display.float_format", "{:.0f}".format):
        print(df.describe())


def main() -> None:
    """Main function to execute the preceding functions in order.

    Downloads the EDAM CSV for the configured version, transforms it into a
    DataFrame and prints summary statistics.
    """
    if not VERSION:
        # An unset version would otherwise yield a URL ending in EDAM_None.csv.
        print("OC_DB_VERSION is not set; cannot select an EDAM release.")
        return
    url: str = (
        f"https://github.com/edamontology/edamontology/raw/main/releases/EDAM_{VERSION}.csv"
    )
    download_edam_csv(url, VERSION)
    df: Optional[pd.DataFrame] = transform_to_dataframe(VERSION)
    print_info_statistics(df)


if __name__ == "__main__":
    main()

0 comments on commit 21a2906

Please sign in to comment.