@@ -28,93 +28,107 @@ def execute(self, data: DataContainer) -> DataContainer:
28
28
29
29
df = data .raw
30
30
31
- if self .fill_missing :
32
- for column , fill_value in self .fill_missing .items ():
33
- if column in df .columns :
34
- df [column ].fillna (fill_value , inplace = True )
35
- self .logger .info (
36
- f"Filled missing values in column '{ column } ' with { fill_value } "
37
- )
38
- else :
39
- self .logger .warning (f"Column '{ column } ' not found in the DataFrame" )
40
-
41
31
if self .remove_outliers :
42
- for column , method in self .remove_outliers .items ():
43
- if column in df .columns :
44
- if method == "clip" :
45
- q1 = df [column ].quantile (0.25 )
46
- q3 = df [column ].quantile (0.75 )
47
- iqr = q3 - q1
48
- lower_bound = q1 - (1.5 * iqr )
49
- upper_bound = q3 + (1.5 * iqr )
50
- df [column ] = df [column ].clip (lower = lower_bound , upper = upper_bound )
51
- self .logger .info (f"Clipped outliers in column '{ column } '" )
52
- elif method == "drop" :
53
- q1 = df [column ].quantile (0.25 )
54
- q3 = df [column ].quantile (0.75 )
55
- iqr = q3 - q1
56
- lower_bound = q1 - (1.5 * iqr )
57
- upper_bound = q3 + (1.5 * iqr )
58
- outliers = (df [column ] < lower_bound ) | (df [column ] > upper_bound )
59
- df = df [~ outliers ]
60
- self .logger .info (f"Dropped outliers in column '{ column } '" )
61
- else :
62
- self .logger .warning (f"Unsupported outlier removal method '{ method } '" )
63
- else :
64
- self .logger .warning (f"Column '{ column } ' not found in the DataFrame" )
32
+ df = self ._remove_outliers (df )
33
+
34
+ if self .fill_missing :
35
+ df = self ._fill_missing (df )
65
36
66
37
if self .convert_dtypes :
67
- for column , dtype in self .convert_dtypes .items ():
68
- if column in df .columns :
69
- df [column ] = df [column ].astype (dtype )
70
- self .logger .info (f"Converted column '{ column } ' to { dtype } " )
71
- else :
72
- self .logger .warning (f"Column '{ column } ' not found in the DataFrame" )
38
+ df = self ._convert_dtypes (df )
73
39
74
40
if self .drop_na_columns :
75
- for column in self .drop_na_columns :
76
- if column in df .columns :
77
- initial_rows = len (df )
78
- df .dropna (subset = [column ], inplace = True )
79
- dropped_rows = initial_rows - len (df )
80
- self .logger .info (
81
- f"Dropped { dropped_rows } rows with None values in column '{ column } '"
82
- )
83
- else :
84
- self .logger .warning (f"Column '{ column } ' not found in the DataFrame" )
41
+ df = self ._drop_na_columns (df )
85
42
86
43
if self .drop_ids :
87
- for column , ids in self .drop_ids .items ():
88
- if column in df .columns :
89
- initial_rows = len (df )
90
- initial_ids = set (df [column ].unique ())
91
-
92
- dropped_ids = set (ids ) & initial_ids
93
- not_found_ids = set (ids ) - initial_ids
94
-
95
- if dropped_ids :
96
- df = df .loc [~ df [column ].isin (dropped_ids )].copy ()
97
- dropped_rows = initial_rows - len (df )
98
- percentage_dropped = (
99
- dropped_rows / initial_rows
100
- ) * 100 # Calculate the percentage of rows dropped
101
- self .logger .info (
102
- f"Dropped { dropped_rows } rows ({ percentage_dropped :.2f} %) with IDs"
103
- f" { list (dropped_ids )} in column '{ column } '"
104
- )
105
- else :
106
- self .logger .info (
107
- f"No rows dropped for IDs { list (ids )} in column '{ column } '"
108
- )
109
-
110
- if not_found_ids :
111
- self .logger .warning (
112
- f"IDs { list (not_found_ids )} not found in column '{ column } '"
113
- )
114
- else :
115
- self .logger .warning (f"Column '{ column } ' not found in the DataFrame" )
44
+ df = self ._drop_ids (df )
116
45
117
46
data .clean = df
118
47
data .flow = df
119
48
120
49
return data
50
+
51
def _remove_outliers(self, df):
    """Clip or drop IQR outliers for the columns listed in ``self.remove_outliers``.

    ``self.remove_outliers`` maps a column name to a method, ``"clip"`` or
    ``"drop"``. For each column the bounds are ``q1 - 1.5 * iqr`` and
    ``q3 + 1.5 * iqr``, where ``q1``/``q3`` are the 0.25 and 0.75 quantiles.
    Columns missing from the frame and unsupported methods are logged as
    warnings and skipped.

    Args:
        df: DataFrame to process. ``"clip"`` assigns the clipped column back
            into the frame it is applied to; ``"drop"`` continues with an
            independent copy that excludes the outlier rows.

    Returns:
        The processed DataFrame.
    """
    for column, method in self.remove_outliers.items():
        if column not in df.columns:
            self.logger.warning(f"Column '{ column } ' not found in the DataFrame")
            continue
        if method not in ("clip", "drop"):
            self.logger.warning(f"Unsupported outlier removal method '{ method } '")
            continue

        # Both methods share the same IQR fences, so compute them once.
        q1 = df[column].quantile(0.25)
        q3 = df[column].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - (1.5 * iqr)
        upper_bound = q3 + (1.5 * iqr)

        if method == "clip":
            df[column] = df[column].clip(lower=lower_bound, upper=upper_bound)
            self.logger.info(f"Clipped outliers in column '{ column } '")
        else:
            outliers = (df[column] < lower_bound) | (df[column] > upper_bound)
            # Take an explicit copy so later column assignments operate on an
            # independent frame instead of a slice of the previous one.
            df = df.loc[~outliers].copy()
            self.logger.info(f"Dropped outliers in column '{ column } '")
    return df
76
+
77
def _fill_missing(self, df):
    """Fill missing values in the columns listed in ``self.fill_missing``.

    ``self.fill_missing`` maps a column name to the value used to replace
    missing entries in that column. Columns that are not present in the
    frame are logged as warnings and skipped.

    Args:
        df: DataFrame to process; filled columns are assigned back into it.

    Returns:
        The same DataFrame with the configured columns filled.
    """
    for column, fill_value in self.fill_missing.items():
        if column not in df.columns:
            self.logger.warning(f"Column '{ column } ' not found in the DataFrame")
            continue
        # Assign the result back instead of calling fillna(inplace=True) on
        # the selected column: the in-place call acts on a temporary object
        # and does not update the frame when Copy-on-Write is enabled.
        df[column] = df[column].fillna(fill_value)
        self.logger.info(f"Filled missing values in column '{ column } ' with { fill_value } ")
    return df
85
+
86
def _convert_dtypes(self, df):
    """Cast the columns listed in ``self.convert_dtypes`` to their target dtypes.

    ``self.convert_dtypes`` maps a column name to the dtype passed to
    ``Series.astype``. A column that is absent from the frame produces a
    warning and is otherwise ignored.

    Args:
        df: DataFrame to process; converted columns are assigned back into it.

    Returns:
        The same DataFrame with the configured columns converted.
    """
    for column, dtype in self.convert_dtypes.items():
        # Guard clause: nothing to convert when the column is absent.
        if column not in df.columns:
            self.logger.warning(f"Column '{ column } ' not found in the DataFrame")
            continue

        converted = df[column].astype(dtype)
        df[column] = converted
        self.logger.info(f"Converted column '{ column } ' to { dtype } ")

    return df
94
+
95
def _drop_na_columns(self, df):
    """Drop rows that have a missing value in any column of ``self.drop_na_columns``.

    Columns are processed one at a time so the number of rows removed can be
    logged per column. Columns that are not present in the frame are logged
    as warnings and skipped.

    Args:
        df: DataFrame to process. It is not modified; rows are removed from
            a new frame that is returned.

    Returns:
        A DataFrame without the rows that had missing values in the
        configured columns.
    """
    for column in self.drop_na_columns:
        if column not in df.columns:
            self.logger.warning(f"Column '{ column } ' not found in the DataFrame")
            continue
        initial_rows = len(df)
        # Rebind instead of dropna(inplace=True): the in-place form deletes
        # rows from the frame the caller passed in, and warns when that frame
        # is itself a slice of another one.
        df = df.dropna(subset=[column])
        dropped_rows = initial_rows - len(df)
        self.logger.info(
            f"Dropped { dropped_rows } rows with None values in column '{ column } '"
        )
    return df
107
+
108
def _drop_ids(self, df):
    """Remove rows whose value in a configured column is one of the listed IDs.

    ``self.drop_ids`` maps a column name to a collection of IDs. For every
    column present in the frame, rows whose value is among the IDs are
    removed and the number and percentage of removed rows are logged. IDs
    that do not occur in the column, and columns that are not in the frame,
    are reported as warnings.

    Args:
        df: DataFrame to process.

    Returns:
        A filtered copy when at least one row was removed; otherwise the
        frame that was passed in.
    """
    for column, ids in self.drop_ids.items():
        if column not in df.columns:
            self.logger.warning(f"Column '{ column } ' not found in the DataFrame")
            continue

        initial_rows = len(df)
        initial_ids = set(df[column].unique())

        # Split the requested IDs into those that occur in the column and
        # those that do not.
        dropped_ids = set(ids) & initial_ids
        not_found_ids = set(ids) - initial_ids

        if not dropped_ids:
            self.logger.info(f"No rows dropped for IDs { list (ids )} in column '{ column } '")
        else:
            keep_mask = ~df[column].isin(dropped_ids)
            df = df.loc[keep_mask].copy()
            dropped_rows = initial_rows - len(df)
            # Share of the rows present before this column was processed.
            percentage_dropped = (dropped_rows / initial_rows) * 100
            self.logger.info(
                f"Dropped { dropped_rows } rows ({ percentage_dropped :.2f} %) with IDs"
                f" { list (dropped_ids )} in column '{ column } '"
            )

        if not_found_ids:
            self.logger.warning(f"IDs { list (not_found_ids )} not found in column '{ column } '")

    return df
0 commit comments