
Commit d6673e2

Merge pull request #19 from UMCUGenetics/release/v2.0.2
Release/v2.0.2
2 parents: e6e852d + 1195e93

20 files changed: +227 −56 lines

BAF/IGV.nf

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@ process IGV {
     shell = ['/bin/bash', '-eo', 'pipefail']

     input:
-        tuple(output_name, path(vcf_files), path(vcf_idx_files))
+        tuple(val(output_name), path(vcf_files), path(vcf_idx_files))

     output:
         path("${output_name}_baf.igv", emit: BAF_IGV_files)

CheckQC/check_qc.py

Lines changed: 72 additions & 35 deletions
@@ -6,9 +6,10 @@
 from pathlib import Path
 import re
 import sys
+import warnings

 # Third party libraries alphabetic order of main package.
-from pandas import DataFrame, merge, read_csv
+from pandas import concat, DataFrame, merge, read_csv
 import yaml


@@ -72,7 +73,8 @@ def check_required_keys_metrics(qc_settings):
 def select_metrics(filename, input_files):
     metrics = list(filter(re.compile(f".*{filename}").match, input_files))
     if not metrics:
-        raise ValueError(f"No input file provided with filename pattern {filename}")
+        warnings.warn(UserWarning(f"No input file provided with filename pattern {filename}"))
+        return None
     return metrics


@@ -84,8 +86,10 @@ def get_columns_to_report(qc_report_cols, qc_metric_cols, qc_col):
         raise TypeError(f"{qc_report_cols} not string, list or '@all'")
     elif not_existing_cols:
         raise ValueError(f"Some column names provided as report_cols do not exists: {not_existing_cols}")
-    qc_report_cols = list(map(lambda x: x.replace(qc_col, "qc_value"), qc_report_cols))  # rename qc_col with qc_value
-    qc_report_cols.insert(0, "qc_title")  # add column qc_title
+    # Rename qc_col with qc_value
+    qc_report_cols = list(map(lambda x: x.replace(qc_col, "qc_value"), qc_report_cols))
+    # Add column qc_title
+    qc_report_cols.insert(0, "qc_title")
     return qc_report_cols


@@ -128,16 +132,17 @@ def add_failed_samples_metric(qc_metric, failed_rows, report_cols, sample_cols):
     # A single qc metric could have multiple sample columns
     # If a qc check fails for a 'multiple sample check', each individual sample is flagged as "failed"
     for sample_col in sample_cols:
-        qc_metric_out = qc_metric_out.append(
+        qc_metric_out = concat([
+            qc_metric_out,
             (
                 qc_metric
                 .rename(columns={sample_col: "sample"})
                 .loc[failed_rows, qc_metric_out.columns.to_list()]
                 .groupby(["sample", "qc_check", "qc_status"], dropna=False)
-                .agg(lambda val: ';'.join(val.astype(str)))  # Or .agg(lambda val: val.to_list())
+                .agg(lambda val: ';'.join(val.astype(str)))
                 .reset_index()
             )
-        )
+        ])
     # Drop failed samples current metric
     for sample_col in sample_cols:
         drop_index = qc_metric[qc_metric[sample_col].isin(set(failed_samples))].index
@@ -149,13 +154,14 @@ def add_failed_samples_metric(qc_metric, failed_rows, report_cols, sample_cols):
 def add_passed_samples_metric(qc_metric, qc_metric_out, sample_cols):
     # Add passed samples to output
     for sample_col in sample_cols:
-        qc_metric_out = qc_metric_out.append(
+        qc_metric_out = concat([
+            qc_metric_out,
             (
                 qc_metric
                 .rename(columns={sample_col: "sample"})
                 .loc[:, qc_metric_out.columns]
             )
-        )
+        ])
     # In case 'multiple sample qc check',
     # output could contain duplicate rows for individual samples used in multiple comparisons.
     return qc_metric_out.sort_values(by=["qc_check", "qc_status"]).drop_duplicates(keep="first")
@@ -169,36 +175,67 @@ def create_and_write_output(qc_output, output_path, output_prefix):
     qc_output.to_csv(output_path + output_prefix + "_summary.csv", index=False, header=True)


+def read_and_judge_metrics(qc, metrics):
+    for qc_file in metrics:
+        qc_metric_raw = read_csv(qc_file, comment=qc.get("comment", None), delimiter="\t", quotechar='"')
+        report_cols = get_columns_to_report(qc["report_cols"], qc_metric_raw.columns.to_list(), qc["qc_col"])
+        qc_metric_edit = add_and_rename_columns(qc_metric_raw, qc["title"], qc["qc_col"], qc["operator"], qc["threshold"])
+        failed_rows = get_failed_rows(qc_metric_edit, "qc_value", qc["operator"], qc["threshold"])
+        qc_metric_subset, qc_metric_judged = add_failed_samples_metric(
+            qc_metric_edit, failed_rows, report_cols, qc["sample_cols"]
+        )
+        qc_metric_judged = add_passed_samples_metric(qc_metric_subset, qc_metric_judged, qc["sample_cols"])
+        # Rename columns
+        suffix = f"_{qc['title'].lower()}"
+        qc_judged_renamed = qc_metric_judged.add_suffix(suffix).rename(columns={f"sample{suffix}": "sample"})
+        # Concatenate/merge metric output
+        if "output" not in locals():  # First time
+            output = qc_judged_renamed
+        else:
+            is_duplicate_sample = False
+            # Check for duplicate sampleIDs before merge.
+            if any(qc_judged_renamed["sample"].isin(output["sample"])):
+                is_duplicate_sample = True
+            output = merge(output, qc_judged_renamed, on=output.columns.tolist(), how="outer")
+            if is_duplicate_sample:
+                dup_sampleIDs = output[output['sample'].duplicated()]['sample'].to_list()
+                # Duplicate sampleIDs with different column values
+                if output["sample"].nunique() != output.shape[0]:
+                    # Warning to parse all qc values / samples.
+                    msg = f"Different qc values for duplicated sample IDs in input: {dup_sampleIDs}"
+                # Duplicate sampleIDs same column values
+                else:
+                    msg = f"Sample IDs occur multiple times in input: {dup_sampleIDs}"
+                warnings.warn(UserWarning(msg))
+    return output
+
+
 def check_qc(input_files, settings, output_path, output_prefix):
     # A single qc metric file can be used multiple times, by defining a metric section for each check in the qc settings.
     qc_settings = read_yaml(settings)
     check_required_keys_metrics(qc_settings)
-    for qc in qc_settings["metrics"]:
-        check_allowed_operators(qc["operator"])
-        metrics = select_metrics(qc["filename"], input_files)
-        for qc_file in metrics:
-            qc_metric_raw = read_csv(qc_file, comment=qc.get("comment", None), delimiter="\t", quotechar='"')
-            report_cols = get_columns_to_report(qc["report_cols"], qc_metric_raw.columns.to_list(), qc["qc_col"])
-            qc_metric_edit = add_and_rename_columns(qc_metric_raw, qc["title"], qc["qc_col"], qc["operator"], qc["threshold"])
-            failed_rows = get_failed_rows(qc_metric_edit, "qc_value", qc["operator"], qc["threshold"])
-            qc_metric_subset, qc_metric_judged = add_failed_samples_metric(
-                qc_metric_edit, failed_rows, report_cols, qc["sample_cols"]
-            )
-            qc_metric_judged = add_passed_samples_metric(qc_metric_subset, qc_metric_judged, qc["sample_cols"])
-            # Rename columns
-            suffix = f"_{qc['title'].lower()}"
-            qc_judged_renamed = qc_metric_judged.add_suffix(suffix).rename(columns={f"sample{suffix}": "sample"})
-            # Concatenate/merge metric output
-            try:
-                output = merge(output, qc_judged_renamed, on="sample", how="outer")
-            except NameError:  # First time:
-                output = merge(
-                    DataFrame(qc_metric_judged['sample'], columns=["sample"]),
-                    qc_judged_renamed,
-                    on="sample",
-                    how="outer"
-                )
-    create_and_write_output(output, output_path, output_prefix)
+    duplicated_sample_file = []
+    for qc_metric_settings in qc_settings["metrics"]:
+        check_allowed_operators(qc_metric_settings["operator"])
+        metric_files = select_metrics(qc_metric_settings["filename"], input_files)
+        if not metric_files:
+            continue
+        # Join multiple metrices files into single table
+        metric_out = read_and_judge_metrics(qc_metric_settings, metric_files)
+        if any(metric_out.duplicated(subset="sample")):
+            duplicated_sample_file.append(qc_metric_settings["filename"])
+            continue
+        if "merged_out" not in locals():
+            merged_out = metric_out
+        else:
+            # Join all metrics output to single table.
+            merged_out = merge(merged_out, metric_out, on="sample", how="outer")
+
+    if "metric_out" not in locals():
+        raise ValueError("No input files found to match any qc metric pattern.")
+    if duplicated_sample_file:
+        raise ValueError(f"Duplicated samples with different values found in files matching {duplicated_sample_file}.")
+    create_and_write_output(merged_out, output_path, output_prefix)


 if __name__ == "__main__":
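The concat changes in add_failed_samples_metric and add_passed_samples_metric go hand in hand with the pandas pin in requirements.txt below: DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, with pandas.concat as the replacement. A minimal standalone sketch of the migration pattern, using made-up data rather than code from this repository:

import pandas as pd

qc_metric_out = pd.DataFrame({"sample": ["sample1"], "qc_status": ["PASS"]})
new_rows = pd.DataFrame({"sample": ["sample2"], "qc_status": ["FAIL"]})

# pandas < 2.0 allowed: qc_metric_out = qc_metric_out.append(new_rows)
# pandas >= 2.0: collect the frames in a list and concatenate.
qc_metric_out = pd.concat([qc_metric_out, new_rows], ignore_index=True)
print(qc_metric_out)  # two rows: sample1/PASS and sample2/FAIL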

CheckQC/requirements.txt

Lines changed: 4 additions & 2 deletions
@@ -1,10 +1,12 @@
-pandas==1.3.3
+pandas==2.1.4
 pytest==6.2.5
 pytest-cov==3.0.0
+pytest-datadir==1.5.0
 pytest-datafiles==2.0
+pytest-dataset==0.3.2
 pytest-flake8==1.0.7
 pytest-mock==3.8.2
 pytest-raises==0.11
 pytest-reqs==0.2.1
 pytest-unordered==0.5.2
-PyYAML==5.4.1
+PyYAML==6.0.1
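These pins line up with the code changes above: pandas 2.1.4 is a post-2.0 release, which is what forces the append-to-concat rewrite in check_qc.py, and the new pytest-datadir and pytest-dataset plugins presumably supply the datadir, dataset, and dataset_class fixtures used by the new tests below.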

CheckQC/test_check_qc.py

Lines changed: 101 additions & 5 deletions
@@ -21,6 +21,11 @@ def setup_test_path(tmp_path_factory):
     return test_tmp_path


+@pytest.fixture(scope="class")
+def mock_settings(class_mocker, dataset_class):
+    return class_mocker.patch("check_qc.read_yaml", return_value=dataset_class["settings_single_metric"])
+
+
 class TestNonEmptyExistingPath():
     def test_existing_dir(self, setup_test_path):
         file_or_dir = check_qc.non_empty_existing_path(setup_test_path)
@@ -99,11 +104,12 @@ def test_select_metric(self, input_files, expected):
         assert metrics == expected

     def test_no_match(self):
-        with pytest.raises(ValueError) as match_error:
-            check_qc.select_metrics("test", ["fake1.txt", "fake2.txt"])
-        error_val = str(match_error.value)
-        assert "No input file provided with filename pattern" in error_val
-        assert "test" in error_val
+        with pytest.warns(UserWarning) as match_warning:
+            return_val = check_qc.select_metrics("test", ["fake1.txt", "fake2.txt"])
+        warn_msg = match_warning[0].message.args[0]
+        assert "No input file provided with filename pattern" in warn_msg
+        assert "test" in warn_msg
+        assert not return_val


 class TestGetColumnsToReport():
@@ -252,3 +258,93 @@ def test_create_and_write_output(self, setup_test_path, exp_summary, qc_output):
         out = read_csv(expected_output)
         assert "qc_summary" in out.columns.to_list()
         assert out["qc_summary"].values == exp_summary
+
+
+class TestGetOutputMetrics():
+    @pytest.mark.parametrize("data_in,nr_rows", [
+        # single sample
+        (["sample1_fake_check.txt"], 1),
+        # multiple single samples
+        (["sample1_fake_check.txt", "sample2_fake_check.txt"], 2),
+        # single multi samples
+        (["240101_fake_check.txt"], 2),
+        # multiple multi samples
+        (["240101_fake_check.txt", "240102_fake_check.txt"], 4),
+        # multi and single sample
+        (["sample1_fake_check.txt", "240101_fake_check.txt"], 3),
+    ])
+    def test_input_ok(self, data_in, nr_rows, dataset, datadir):
+        datadir_files = [f"{datadir}/{filename}" for filename in data_in]
+        df_output = check_qc.read_and_judge_metrics(dataset["settings_single_metric"]["metrics"][0], datadir_files)
+        assert not df_output.empty
+        observed_cols = df_output.columns.to_list()
+        assert df_output.shape[0] == nr_rows  # shape results in tuple with no. rows and no. cols
+        assert len(observed_cols) == 5
+        assert observed_cols == ['sample', 'qc_check_fc', 'qc_status_fc', 'qc_msg_fc', 'qc_value_fc']
+
+    @pytest.mark.parametrize("data_in,nr_rows,exp_warn_msg", [
+        # single sample duplicate
+        (["sample1_fake_check.txt"]*2, 1, "Sample IDs occur multiple times in input:"),
+        # single multi samples duplicate
+        (["240101_fake_check.txt"]*2, 2, "Sample IDs occur multiple times in input:"),
+        # multiple multi samples, duplicate samples
+        (["240101_fake_check.txt", "240101_v2_fake_check.txt"], 4, "Different qc values for duplicated sample IDs in input:"),
+    ])
+    def test_input_warn(self, data_in, nr_rows, exp_warn_msg, dataset, datadir):
+        datadir_files = [f"{datadir}/{filename}" for filename in data_in]
+        with pytest.warns(UserWarning) as match_warning:
+            df_output = check_qc.read_and_judge_metrics(dataset["settings_single_metric"]["metrics"][0], datadir_files)
+        warn_msg = match_warning[0].message.args[0]
+        assert exp_warn_msg in warn_msg
+        assert not df_output.empty
+        observed_cols = df_output.columns.to_list()
+        assert df_output.shape[0] == nr_rows  # Shape: tuple with no. rows and no. cols
+        assert len(observed_cols) == 5
+        assert observed_cols == ['sample', 'qc_check_fc', 'qc_status_fc', 'qc_msg_fc', 'qc_value_fc']
+
+
+class TestCheckQc():
+    @pytest.mark.parametrize("settings,data_in,exp_shape", [
+        # single metric, single sample input
+        ("settings_single_metric", ["sample1_fake_check.txt"], (1, 5)),
+        # two metrics, single sample input
+        ("settings_two_metrics", ["sample1_fake_check.txt"], (1, 9)),
+        # single metric, multiple samples input
+        ("settings_single_metric", ["240101_fake_check.txt"], (2, 5)),
+        ("settings_single_metric", ["240101_fake_check.txt", "240102_fake_check.txt"], (4, 5)),
+        # two metrics, multiple sample input
+        ("settings_two_metrics", ["240101_fake_check.txt", "240102_fake_check.txt"], (4, 9)),
+        # two metric, multi and single sample input
+        ("settings_two_metrics", ["sample1_fake_check.txt", "240101_fake_check.txt"], (3, 9)),
+    ])
+    def test_ok(self, settings, data_in, exp_shape, datadir, dataset, mocker):
+        datadir_files = [f"{datadir}/{filename}" for filename in data_in]
+        mocker.patch("check_qc.read_yaml", return_value=dataset[settings])
+        mock_write_output = mocker.patch("check_qc.create_and_write_output")
+        check_qc.check_qc(input_files=datadir_files, settings="", output_path="", output_prefix="")
+        mock_write_output.assert_called_once()
+        # Shape: tuple with no. rows and no. cols
+        assert mock_write_output.call_args[0][0].shape == exp_shape
+        mock_write_output.reset_mock()
+
+    def test_no_match_input_error(self, mocker, mock_settings):
+        mock_select_metrics = mocker.patch("check_qc.select_metrics", return_value=None)
+        mock_get_output = mocker.patch("check_qc.read_and_judge_metrics")
+        with pytest.raises(ValueError) as no_match_error:
+            check_qc.check_qc(input_files=[], settings="", output_path="", output_prefix="")
+        mock_select_metrics.assert_called_once()
+        assert not mock_get_output.called
+        assert "No input files found to match any qc metric pattern." == str(no_match_error.value)
+        mock_settings.reset_mock()
+
+    def test_duplicate_samples_error(self, datadir, mocker, mock_settings):
+        mock_pandas_merge = mocker.patch("pandas.merge")
+        with pytest.raises(ValueError) as duplicate_error:
+            check_qc.check_qc(input_files=[f"{datadir}/240101_fake_check.txt", f"{datadir}/240101_v2_fake_check.txt"],
+                              settings="", output_path="", output_prefix="")
+        assert "Duplicated samples with different values found in files matching" in str(duplicate_error.value)
+        assert "fake_check.txt" in str(duplicate_error.value)
+        assert not mock_pandas_merge.called
+        mock_settings.reset_mock()
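The two warning messages asserted in these tests come from read_and_judge_metrics, which outer-merges each file's judged table on all of its columns: fully identical duplicate rows collapse into one, while rows with conflicting qc values both survive, and the conflict case is then detected as nunique() != row count. A small self-contained sketch of that distinction (illustrative data only, not repository code):

import pandas as pd

first = pd.DataFrame({"sample": ["sample3"], "qc_value_fc": [1]})
identical = pd.DataFrame({"sample": ["sample3"], "qc_value_fc": [1]})
conflicting = pd.DataFrame({"sample": ["sample3"], "qc_value_fc": [0]})

# Identical rows collapse to one -> "Sample IDs occur multiple times" warning.
merged_same = pd.merge(first, identical, on=first.columns.tolist(), how="outer")
assert merged_same["sample"].nunique() == merged_same.shape[0]

# Conflicting values keep both rows -> "Different qc values" warning.
merged_diff = pd.merge(first, conflicting, on=first.columns.tolist(), how="outer")
assert merged_diff["sample"].nunique() != merged_diff.shape[0]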
Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
+sample_id	value
+sample3	1
+sample4	1
Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
+sample_id	value
+sample3	0
+sample4	0
Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
+sample_id	value
+sample5	1
+sample6	1
Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
+sample_id	value
+sample1	1
Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
+sample_id	value
+sample2	1
Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,8 @@
+metrics:
+  - filename: ".*fake_check.txt$"
+    qc_col: "value"
+    threshold: 0
+    operator: ">"
+    report_cols: ["sample_id", "value"]
+    sample_cols: ["sample_id"]
+    title: "FC"
Lines changed: 15 additions & 0 deletions
@@ -0,0 +1,15 @@
+metrics:
+  - filename: ".*fake_check.txt$"
+    qc_col: "value"
+    threshold: 0
+    operator: ">"
+    report_cols: ["sample_id", "value"]
+    sample_cols: ["sample_id"]
+    title: "FC"
+  - filename: ".*fake_check.txt$"
+    qc_col: "value"
+    threshold: 0
+    operator: ">"
+    report_cols: ["sample_id", "value"]
+    sample_cols: ["sample_id"]
+    title: "FC2"

ClarityEpp/SampleIndications.nf

Lines changed: 1 addition & 1 deletion
@@ -10,7 +10,7 @@ process SampleIndications {
         val(sample_id)

     output:
-        tuple(sample_id, stdout)
+        tuple(val(sample_id), stdout)

     script:
         """

ExomeDepth/CallCNV.nf

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@ process CallCNV {
     shell = ['/bin/bash', '-eo', 'pipefail']

     input:
-        tuple(analysis_id, sample_id, path(bam_file), path(bai_file))
+        tuple(val(analysis_id), val(sample_id), path(bam_file), path(bai_file))

     output:
         path("*.log", emit: ED_log)

ExomeDepth/GetRefset.nf

Lines changed: 2 additions & 2 deletions
@@ -7,10 +7,10 @@ process GetRefset {
     cache = false

     input:
-        tuple(sample_id, path(bam_file))
+        tuple(val(sample_id), path(bam_file))

     output:
-        tuple(sample_id, stdout)
+        tuple(val(sample_id), stdout)

     script:
         """
