from pathlib import Path
import re
import sys
import warnings

# Third party libraries alphabetic order of main package.
from pandas import concat, DataFrame, merge, read_csv
import yaml
@@ -72,7 +73,8 @@ def check_required_keys_metrics(qc_settings):
72
73
def select_metrics(filename, input_files):
    """Select the input files whose path matches the given filename pattern.

    Args:
        filename: Regex fragment; files matching ``.*<filename>`` are selected.
        input_files: Iterable of file path strings to search.

    Returns:
        List of matching file paths, or None (after emitting a UserWarning)
        when no file matches the pattern.
    """
    pattern = re.compile(f".*{filename}")
    metrics = [input_file for input_file in input_files if pattern.match(input_file)]
    if not metrics:
        # A missing metric file is not fatal here; the caller decides how to proceed.
        warnings.warn(UserWarning(f"No input file provided with filename pattern {filename}"))
        return None
    return metrics
77
79
78
80
@@ -84,8 +86,10 @@ def get_columns_to_report(qc_report_cols, qc_metric_cols, qc_col):
84
86
raise TypeError (f"{ qc_report_cols } not string, list or '@all'" )
85
87
elif not_existing_cols :
86
88
raise ValueError (f"Some column names provided as report_cols do not exists: { not_existing_cols } " )
87
- qc_report_cols = list (map (lambda x : x .replace (qc_col , "qc_value" ), qc_report_cols )) # rename qc_col with qc_value
88
- qc_report_cols .insert (0 , "qc_title" ) # add column qc_title
89
+ # Rename qc_col with qc_value
90
+ qc_report_cols = list (map (lambda x : x .replace (qc_col , "qc_value" ), qc_report_cols ))
91
+ # Add column qc_title
92
+ qc_report_cols .insert (0 , "qc_title" )
89
93
return qc_report_cols
90
94
91
95
@@ -128,16 +132,17 @@ def add_failed_samples_metric(qc_metric, failed_rows, report_cols, sample_cols):
128
132
# A single qc metric could have multiple sample columns
129
133
# If a qc check fails for a 'multiple sample check', each individual sample is flagged as "failed"
130
134
for sample_col in sample_cols :
131
- qc_metric_out = qc_metric_out .append (
135
+ qc_metric_out = concat ([
136
+ qc_metric_out ,
132
137
(
133
138
qc_metric
134
139
.rename (columns = {sample_col : "sample" })
135
140
.loc [failed_rows , qc_metric_out .columns .to_list ()]
136
141
.groupby (["sample" , "qc_check" , "qc_status" ], dropna = False )
137
- .agg (lambda val : ';' .join (val .astype (str ))) # Or .agg(lambda val: val.to_list())
142
+ .agg (lambda val : ';' .join (val .astype (str )))
138
143
.reset_index ()
139
144
)
140
- )
145
+ ] )
141
146
# Drop failed samples current metric
142
147
for sample_col in sample_cols :
143
148
drop_index = qc_metric [qc_metric [sample_col ].isin (set (failed_samples ))].index
@@ -149,13 +154,14 @@ def add_failed_samples_metric(qc_metric, failed_rows, report_cols, sample_cols):
149
154
def add_passed_samples_metric(qc_metric, qc_metric_out, sample_cols):
    """Append the (passed) sample rows of qc_metric to the judged output table.

    For each sample column the metric table is renamed so that column becomes
    'sample', restricted to the output columns, and concatenated onto
    qc_metric_out.

    Args:
        qc_metric: Metric DataFrame still carrying per-sample columns.
        qc_metric_out: Accumulated output DataFrame (defines the column subset).
        sample_cols: Names of the sample-identifier columns in qc_metric.

    Returns:
        qc_metric_out extended with the samples, sorted and de-duplicated.
    """
    # Add passed samples to output
    for sample_col in sample_cols:
        renamed = qc_metric.rename(columns={sample_col: "sample"})
        qc_metric_out = concat([qc_metric_out, renamed.loc[:, qc_metric_out.columns]])
    # In case 'multiple sample qc check', output could contain duplicate rows
    # for individual samples used in multiple comparisons.
    return qc_metric_out.sort_values(by=["qc_check", "qc_status"]).drop_duplicates(keep="first")
@@ -169,36 +175,67 @@ def create_and_write_output(qc_output, output_path, output_prefix):
169
175
qc_output .to_csv (output_path + output_prefix + "_summary.csv" , index = False , header = True )
170
176
171
177
178
def read_and_judge_metrics(qc, metrics):
    """Read each metric file, judge it against a single qc setting and join the results.

    Args:
        qc: Settings mapping for one metric (keys include title, qc_col,
            operator, threshold, report_cols, sample_cols; optional comment).
        metrics: List of metric file paths matching this qc's filename pattern.

    Returns:
        DataFrame with one row per sample and title-suffixed qc columns, or
        None when ``metrics`` is empty.
    """
    # Explicit sentinel instead of probing locals() for 'output'; also makes
    # the empty-metrics case return None instead of raising UnboundLocalError.
    output = None
    for qc_file in metrics:
        qc_metric_raw = read_csv(qc_file, comment=qc.get("comment", None), delimiter="\t", quotechar='"')
        report_cols = get_columns_to_report(qc["report_cols"], qc_metric_raw.columns.to_list(), qc["qc_col"])
        qc_metric_edit = add_and_rename_columns(qc_metric_raw, qc["title"], qc["qc_col"], qc["operator"], qc["threshold"])
        failed_rows = get_failed_rows(qc_metric_edit, "qc_value", qc["operator"], qc["threshold"])
        qc_metric_subset, qc_metric_judged = add_failed_samples_metric(
            qc_metric_edit, failed_rows, report_cols, qc["sample_cols"]
        )
        qc_metric_judged = add_passed_samples_metric(qc_metric_subset, qc_metric_judged, qc["sample_cols"])
        # Rename columns: suffix each column with the metric title, but keep the join key 'sample'.
        suffix = f"_{qc['title'].lower()}"
        qc_judged_renamed = qc_metric_judged.add_suffix(suffix).rename(columns={f"sample{suffix}": "sample"})
        # Concatenate/merge metric output
        if output is None:  # First file for this metric
            output = qc_judged_renamed
        else:
            # Check for duplicate sampleIDs before merge.
            is_duplicate_sample = qc_judged_renamed["sample"].isin(output["sample"]).any()
            # Merging on all columns appends rows; fully identical rows collapse into one.
            output = merge(output, qc_judged_renamed, on=output.columns.tolist(), how="outer")
            if is_duplicate_sample:
                dup_sampleIDs = output[output['sample'].duplicated()]['sample'].to_list()
                # Duplicate sampleIDs with different column values
                if output["sample"].nunique() != output.shape[0]:
                    # Warning to parse all qc values / samples.
                    msg = f"Different qc values for duplicated sample IDs in input: {dup_sampleIDs}"
                # Duplicate sampleIDs same column values
                else:
                    msg = f"Sample IDs occur multiple times in input: {dup_sampleIDs}"
                warnings.warn(UserWarning(msg))
    return output
211
+
212
+
172
213
def check_qc(input_files, settings, output_path, output_prefix):
    """Judge all configured qc metrics against the input files and write a summary.

    Args:
        input_files: List of candidate metric file paths.
        settings: Path to the YAML qc settings file.
        output_path: Directory prefix for the summary output.
        output_prefix: Filename prefix for the summary output.

    Raises:
        ValueError: when no input file matches any metric pattern, or when
            duplicated samples with different values are found.
    """
    # A single qc metric file can be used multiple times, by defining a metric section for each check in the qc settings.
    qc_settings = read_yaml(settings)
    check_required_keys_metrics(qc_settings)
    duplicated_sample_file = []
    # Explicit sentinels instead of probing locals() for 'merged_out' / 'metric_out'.
    merged_out = None
    any_metric_processed = False
    for qc_metric_settings in qc_settings["metrics"]:
        check_allowed_operators(qc_metric_settings["operator"])
        metric_files = select_metrics(qc_metric_settings["filename"], input_files)
        if not metric_files:
            # select_metrics already warned; skip this metric section.
            continue
        # Join multiple metrices files into single table
        metric_out = read_and_judge_metrics(qc_metric_settings, metric_files)
        any_metric_processed = True
        if metric_out.duplicated(subset="sample").any():
            # Conflicting duplicates: collect the pattern and report after the loop.
            duplicated_sample_file.append(qc_metric_settings["filename"])
            continue
        if merged_out is None:
            merged_out = metric_out
        else:
            # Join all metrics output to single table.
            merged_out = merge(merged_out, metric_out, on="sample", how="outer")

    if not any_metric_processed:
        raise ValueError("No input files found to match any qc metric pattern.")
    if duplicated_sample_file:
        raise ValueError(f"Duplicated samples with different values found in files matching {duplicated_sample_file}.")
    create_and_write_output(merged_out, output_path, output_prefix)
202
239
203
240
204
241
if __name__ == "__main__" :
0 commit comments