Implemented filter on Dataframe for flower data processing. (#485)
KFilippopolitis authored Jul 9, 2024
1 parent 325476c commit f673763
Showing 4 changed files with 449 additions and 12 deletions.
47 changes: 47 additions & 0 deletions exareme2/algorithms/flower/df_filter.py
@@ -0,0 +1,47 @@
import pandas as pd


def apply_filter(df, filter_rule):
condition = filter_rule.get("condition", "AND")
rules = filter_rule.get("rules", [])

if condition == "AND":
filtered_df = df
for rule in rules:
filtered_df = apply_single_rule(filtered_df, rule)
return filtered_df
elif condition == "OR":
filtered_dfs = []
for rule in rules:
filtered_dfs.append(apply_single_rule(df, rule))
return pd.concat(filtered_dfs).drop_duplicates().reset_index(drop=True)


def apply_single_rule(df, rule):
if "condition" in rule:
return apply_filter(df, rule)
else:
id = rule["id"]
operator = rule["operator"]
value = rule.get("value", None)

operator_functions = {
"equal": lambda df, id, value: df[df[id] == value],
"not_equal": lambda df, id, value: df[df[id] != value],
"greater": lambda df, id, value: df[df[id] > value],
"less": lambda df, id, value: df[df[id] < value],
"greater_or_equal": lambda df, id, value: df[df[id] >= value],
"less_or_equal": lambda df, id, value: df[df[id] <= value],
"between": lambda df, id, value: df[df[id].between(value[0], value[1])],
"not_between": lambda df, id, value: df[
~df[id].between(value[0], value[1])
],
"is_not_null": lambda df, id, value: df[df[id].notnull()],
"is_null": lambda df, id, value: df[df[id].isnull()],
"in": lambda df, id, value: df[df[id].isin(value)],
}

if operator in operator_functions:
return operator_functions[operator](df, id, value)
else:
raise ValueError(f"Unsupported operator: {operator}")
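
For orientation, a minimal usage sketch of apply_filter (not part of the commit). The sample DataFrame and column names are illustrative assumptions; nested rules recurse from apply_single_rule back into apply_filter:

import pandas as pd

from exareme2.algorithms.flower.df_filter import apply_filter

# Illustrative data only; the columns are assumptions for this example.
sample = pd.DataFrame(
    {"age": [34, 58, 71], "gender": ["F", "M", "F"], "lefthippocampus": [2.9, 3.4, 3.1]}
)

# Keep rows where lefthippocampus > 3.0 AND (gender == "F" OR age >= 60).
rule = {
    "condition": "AND",
    "rules": [
        {"id": "lefthippocampus", "operator": "greater", "value": 3.0},
        {
            "condition": "OR",
            "rules": [
                {"id": "gender", "operator": "equal", "value": "F"},
                {"id": "age", "operator": "greater_or_equal", "value": 60},
            ],
        },
    ],
}

filtered = apply_filter(sample, rule)  # only the row with age 71 survives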
25 changes: 15 additions & 10 deletions exareme2/algorithms/flower/inputdata_preprocessing.py
@@ -9,7 +9,8 @@
from flwr.common.logger import FLOWER_LOGGER
from pydantic import BaseModel
from sklearn import preprocessing
from sklearn.impute import SimpleImputer

from exareme2.algorithms.flower.df_filter import apply_filter

# Constants for project directories and environment configurations
CONTROLLER_IP = os.getenv("CONTROLLER_IP", "127.0.0.1")
@@ -28,15 +29,24 @@ class Inputdata(BaseModel):
x: Optional[List[str]]


def apply_inputdata(df: pd.DataFrame, inputdata: Inputdata) -> pd.DataFrame:
if inputdata.filters:
df = apply_filter(df, inputdata.filters)
df = df[df["dataset"].isin(inputdata.datasets)]
columns = inputdata.x + inputdata.y
df = df[columns]
df = df.dropna(subset=columns)
return df


def fetch_client_data(inputdata) -> pd.DataFrame:
FLOWER_LOGGER.error(f"BROOO {os.getenv('CSV_PATHS')}")
dataframes = [
pd.read_csv(f"{os.getenv('DATA_PATH')}{csv_path}")
for csv_path in os.getenv("CSV_PATHS").split(",")
]
df = pd.concat(dataframes, ignore_index=True)
df = df[df["dataset"].isin(inputdata.datasets)]
return df[inputdata.x + inputdata.y]
return apply_inputdata(df, inputdata)


def fetch_server_data(inputdata) -> pd.DataFrame:
@@ -50,8 +60,7 @@ def fetch_server_data(inputdata) -> pd.DataFrame:
if (data_folder / f"{dataset}.csv").exists()
]
df = pd.concat(dataframes, ignore_index=True)
df = df[df["dataset"].isin(inputdata.datasets)]
return df[inputdata.x + inputdata.y]
return apply_inputdata(df, inputdata)


def preprocess_data(inputdata, full_data):
@@ -63,16 +72,12 @@ def preprocess_data(inputdata, full_data):
features = full_data[inputdata.x] # This should be a DataFrame
target = full_data[inputdata.y].values.ravel() # Flatten the array if it's 2D

# Impute missing values for features
imputer = SimpleImputer(strategy="most_frequent")
features_imputed = imputer.fit_transform(features)

# Encode target variable
label_encoder = preprocessing.LabelEncoder()
label_encoder.fit(get_enumerations(inputdata.data_model, inputdata.y[0]))
y_train = label_encoder.transform(target)

return features_imputed, y_train
return features, y_train


def error_handling(error):
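
A hedged sketch of how the new apply_inputdata helper is expected to be called (not part of the commit). The constructor call is an assumption: only the x, y, data_model, datasets, and filters fields of Inputdata are visible in this diff, and the raw DataFrame below stands in for the CSVs read from DATA_PATH/CSV_PATHS. With SimpleImputer removed from preprocess_data, rows with missing values in the selected x/y columns are now dropped by apply_inputdata rather than imputed:

import pandas as pd

from exareme2.algorithms.flower.inputdata_preprocessing import Inputdata, apply_inputdata

# Illustrative raw data; in the real flow this comes from the configured CSV paths.
raw_df = pd.DataFrame(
    {
        "dataset": ["ppmi0", "ppmi0", "ppmi1"],
        "gender": ["F", "M", None],
        "lefthippocampus": [3.4, 3.0, 3.6],
    }
)

# Assumed constructor call; field names follow the attribute usages shown in this diff.
inputdata = Inputdata(
    data_model="dementia:0.1",
    datasets=["ppmi0", "ppmi1"],
    y=["gender"],
    x=["lefthippocampus"],
    filters={
        "condition": "AND",
        "rules": [{"id": "lefthippocampus", "operator": "greater", "value": 3.2}],
    },
)

df = apply_inputdata(raw_df, inputdata)
# The filter keeps rows with lefthippocampus > 3.2, the dataset check keeps ppmi0/ppmi1,
# column selection keeps x + y, and dropna removes the row with the missing gender.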
@@ -23,6 +23,49 @@ def test_logistic_regression(get_algorithm_result):
}
input["type"] = "flower"
algorithm_result = get_algorithm_result("logistic_regression", input)
assert {"accuracy": 0.3813682678311499} == algorithm_result or algorithm_result == {
"accuracy": 0.61863173216885
assert algorithm_result == {"accuracy": 0.6180758017492711} or algorithm_result == {
"accuracy": 0.3819241982507289
}


def test_logistic_regression_with_filters(get_algorithm_result):
input = {
"inputdata": {
"y": ["gender"],
"x": ["lefthippocampus"],
"data_model": "dementia:0.1",
"datasets": [
"ppmi0",
"ppmi1",
"ppmi2",
"ppmi3",
"ppmi4",
"ppmi5",
"ppmi6",
"ppmi7",
"ppmi8",
"ppmi9",
],
"filters": {
"condition": "AND",
"rules": [
{
"id": "lefthippocampus",
"field": "lefthippocampus",
"type": "double",
"input": "number",
"operator": "greater",
"value": 3.2,
}
],
"valid": True,
},
},
"parameters": None,
"test_case_num": 99,
}
input["type"] = "flower"
algorithm_result = get_algorithm_result("logistic_regression", input)
assert algorithm_result == {"accuracy": 0.7755681818181818} or algorithm_result == {
"accuracy": 0.22443181818181818
}
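
For reference, apply_filter only reads the condition/rules structure and, within each rule, the id, operator, and value keys; the field, type, input, and valid keys in the query-builder payload above are simply ignored. A minimal, hedged check (DataFrame contents are illustrative, not from the repository's test data):

import pandas as pd

from exareme2.algorithms.flower.df_filter import apply_filter

df = pd.DataFrame({"lefthippocampus": [2.9, 3.4], "gender": ["F", "M"]})
filters = {
    "condition": "AND",
    "rules": [
        {
            "id": "lefthippocampus",
            "field": "lefthippocampus",  # ignored by apply_filter
            "type": "double",            # ignored
            "input": "number",           # ignored
            "operator": "greater",
            "value": 3.2,
        }
    ],
    "valid": True,  # ignored
}
assert apply_filter(df, filters)["gender"].tolist() == ["M"]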