Skip to content

Commit 556ef59

Browse files
committed
refactor clean step
1 parent 6276cc5 commit 556ef59

File tree

1 file changed

+92
-78
lines changed

1 file changed

+92
-78
lines changed

pipeline_lib/core/steps/clean.py

Lines changed: 92 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -28,93 +28,107 @@ def execute(self, data: DataContainer) -> DataContainer:
2828

2929
df = data.raw
3030

31-
if self.fill_missing:
32-
for column, fill_value in self.fill_missing.items():
33-
if column in df.columns:
34-
df[column].fillna(fill_value, inplace=True)
35-
self.logger.info(
36-
f"Filled missing values in column '{column}' with {fill_value}"
37-
)
38-
else:
39-
self.logger.warning(f"Column '{column}' not found in the DataFrame")
40-
4131
if self.remove_outliers:
42-
for column, method in self.remove_outliers.items():
43-
if column in df.columns:
44-
if method == "clip":
45-
q1 = df[column].quantile(0.25)
46-
q3 = df[column].quantile(0.75)
47-
iqr = q3 - q1
48-
lower_bound = q1 - (1.5 * iqr)
49-
upper_bound = q3 + (1.5 * iqr)
50-
df[column] = df[column].clip(lower=lower_bound, upper=upper_bound)
51-
self.logger.info(f"Clipped outliers in column '{column}'")
52-
elif method == "drop":
53-
q1 = df[column].quantile(0.25)
54-
q3 = df[column].quantile(0.75)
55-
iqr = q3 - q1
56-
lower_bound = q1 - (1.5 * iqr)
57-
upper_bound = q3 + (1.5 * iqr)
58-
outliers = (df[column] < lower_bound) | (df[column] > upper_bound)
59-
df = df[~outliers]
60-
self.logger.info(f"Dropped outliers in column '{column}'")
61-
else:
62-
self.logger.warning(f"Unsupported outlier removal method '{method}'")
63-
else:
64-
self.logger.warning(f"Column '{column}' not found in the DataFrame")
32+
df = self._remove_outliers(df)
33+
34+
if self.fill_missing:
35+
df = self._fill_missing(df)
6536

6637
if self.convert_dtypes:
67-
for column, dtype in self.convert_dtypes.items():
68-
if column in df.columns:
69-
df[column] = df[column].astype(dtype)
70-
self.logger.info(f"Converted column '{column}' to {dtype}")
71-
else:
72-
self.logger.warning(f"Column '{column}' not found in the DataFrame")
38+
df = self._convert_dtypes(df)
7339

7440
if self.drop_na_columns:
75-
for column in self.drop_na_columns:
76-
if column in df.columns:
77-
initial_rows = len(df)
78-
df.dropna(subset=[column], inplace=True)
79-
dropped_rows = initial_rows - len(df)
80-
self.logger.info(
81-
f"Dropped {dropped_rows} rows with None values in column '{column}'"
82-
)
83-
else:
84-
self.logger.warning(f"Column '{column}' not found in the DataFrame")
41+
df = self._drop_na_columns(df)
8542

8643
if self.drop_ids:
87-
for column, ids in self.drop_ids.items():
88-
if column in df.columns:
89-
initial_rows = len(df)
90-
initial_ids = set(df[column].unique())
91-
92-
dropped_ids = set(ids) & initial_ids
93-
not_found_ids = set(ids) - initial_ids
94-
95-
if dropped_ids:
96-
df = df.loc[~df[column].isin(dropped_ids)].copy()
97-
dropped_rows = initial_rows - len(df)
98-
percentage_dropped = (
99-
dropped_rows / initial_rows
100-
) * 100 # Calculate the percentage of rows dropped
101-
self.logger.info(
102-
f"Dropped {dropped_rows} rows ({percentage_dropped:.2f}%) with IDs"
103-
f" {list(dropped_ids)} in column '{column}'"
104-
)
105-
else:
106-
self.logger.info(
107-
f"No rows dropped for IDs {list(ids)} in column '{column}'"
108-
)
109-
110-
if not_found_ids:
111-
self.logger.warning(
112-
f"IDs {list(not_found_ids)} not found in column '{column}'"
113-
)
114-
else:
115-
self.logger.warning(f"Column '{column}' not found in the DataFrame")
44+
df = self._drop_ids(df)
11645

11746
data.clean = df
11847
data.flow = df
11948

12049
return data
50+
51+
def _remove_outliers(self, df):
    """Clip or drop IQR outliers for each column in ``self.remove_outliers``.

    ``self.remove_outliers`` maps a column name to a method, either
    ``"clip"`` or ``"drop"``. For each configured column the bounds are
    ``q1 - 1.5 * iqr`` and ``q3 + 1.5 * iqr`` where q1/q3 are the 0.25 and
    0.75 quantiles of that column at the time it is processed.

    * ``"clip"`` limits the column's values to the bounds; the column is
      assigned back onto the frame that was passed in.
    * ``"drop"`` removes rows whose value lies strictly outside the bounds
      and continues with an independent copy of the remaining rows.

    Columns missing from the frame and unsupported methods are logged as
    warnings and skipped.

    Args:
        df: DataFrame to process.

    Returns:
        The processed DataFrame (a new object if any rows were dropped).
    """
    for column, method in self.remove_outliers.items():
        if column not in df.columns:
            self.logger.warning(f"Column '{column}' not found in the DataFrame")
            continue
        if method not in ("clip", "drop"):
            # Checked before touching the data so an unsupported method never
            # triggers a quantile computation on the column.
            self.logger.warning(f"Unsupported outlier removal method '{method}'")
            continue

        # Bounds are shared by both methods; compute them once.
        q1 = df[column].quantile(0.25)
        q3 = df[column].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - (1.5 * iqr)
        upper_bound = q3 + (1.5 * iqr)

        if method == "clip":
            df[column] = df[column].clip(lower=lower_bound, upper=upper_bound)
            self.logger.info(f"Clipped outliers in column '{column}'")
        else:
            outliers = (df[column] < lower_bound) | (df[column] > upper_bound)
            # .copy() detaches the filtered frame from its parent so later
            # column assignments do not raise SettingWithCopyWarning.
            df = df[~outliers].copy()
            self.logger.info(f"Dropped outliers in column '{column}'")
    return df
76+
77+
def _fill_missing(self, df):
    """Fill missing values in the columns listed in ``self.fill_missing``.

    ``self.fill_missing`` maps a column name to the value used to replace
    missing entries in that column. Columns that are not present in the
    frame are logged as warnings and skipped.

    Args:
        df: DataFrame to process; filled columns are assigned back onto it.

    Returns:
        The same DataFrame with the configured columns filled.
    """
    for column, fill_value in self.fill_missing.items():
        if column not in df.columns:
            self.logger.warning(f"Column '{column}' not found in the DataFrame")
            continue
        # Assign the result back instead of calling fillna(inplace=True) on
        # the column selection: the chained in-place form is deprecated and
        # does not update the frame when pandas Copy-on-Write is active.
        df[column] = df[column].fillna(fill_value)
        self.logger.info(f"Filled missing values in column '{column}' with {fill_value}")
    return df
85+
86+
def _convert_dtypes(self, df):
    """Cast the columns listed in ``self.convert_dtypes`` to their target dtype.

    ``self.convert_dtypes`` maps a column name to the dtype handed to
    ``Series.astype``. Each converted column is assigned back onto ``df``;
    a column that is absent from the frame only produces a warning.

    Args:
        df: DataFrame to process.

    Returns:
        The same DataFrame object with the conversions applied.
    """
    for name, target_dtype in self.convert_dtypes.items():
        if name not in df.columns:
            self.logger.warning(f"Column '{name}' not found in the DataFrame")
            continue
        converted = df[name].astype(target_dtype)
        df[name] = converted
        self.logger.info(f"Converted column '{name}' to {target_dtype}")
    return df
94+
95+
def _drop_na_columns(self, df):
    """Drop rows that have a missing value in any of ``self.drop_na_columns``.

    Columns are processed one at a time so the number of rows removed for
    each column can be logged. Columns that are not present in the frame
    are logged as warnings and skipped.

    Args:
        df: DataFrame to process. It is not modified; rows are removed from
            a new frame.

    Returns:
        A DataFrame without the rows that had missing values in the
        configured columns (``df`` itself when no configured column exists).
    """
    for column in self.drop_na_columns:
        if column not in df.columns:
            self.logger.warning(f"Column '{column}' not found in the DataFrame")
            continue
        initial_rows = len(df)
        # Rebind rather than dropna(inplace=True): the in-place form deletes
        # rows from the caller's frame, unlike the other row-dropping helpers
        # in this step, which return a new frame.
        df = df.dropna(subset=[column])
        dropped_rows = initial_rows - len(df)
        self.logger.info(
            f"Dropped {dropped_rows} rows with None values in column '{column}'"
        )
    return df
107+
108+
def _drop_ids(self, df):
    """Remove rows whose value in a configured column is one of the listed IDs.

    ``self.drop_ids`` maps a column name to a collection of IDs. For every
    configured column that exists, rows holding any of the requested IDs
    are removed and the number and percentage of removed rows is logged.
    Requested IDs that do not occur in the column are reported with a
    warning, as are configured columns that are absent from the frame.

    Args:
        df: DataFrame to process.

    Returns:
        A filtered copy when at least one row was removed, otherwise the
        frame that was passed in.
    """
    for column, ids in self.drop_ids.items():
        if column not in df.columns:
            self.logger.warning(f"Column '{column}' not found in the DataFrame")
            continue

        rows_before = len(df)
        requested = set(ids)
        present = set(df[column].unique())
        matched = requested & present
        missing = requested - present

        if not matched:
            self.logger.info(f"No rows dropped for IDs {list(ids)} in column '{column}'")
        else:
            keep_mask = ~df[column].isin(matched)
            df = df.loc[keep_mask].copy()
            removed = rows_before - len(df)
            # Share of the rows present before this column was filtered.
            removed_pct = (removed / rows_before) * 100
            self.logger.info(
                f"Dropped {removed} rows ({removed_pct:.2f}%) with IDs"
                f" {list(matched)} in column '{column}'"
            )

        if missing:
            self.logger.warning(f"IDs {list(missing)} not found in column '{column}'")
    return df

0 commit comments

Comments
 (0)