Skip to content

Commit c884823

Browse files
Add tx2gene module (formerly local to rnaseq) (nf-core#4854)
* Add test data link * Add tx2gene * Fix linting, update snapshots * path -> directory * Correct conda setup * Correct conda setup * Update test data reference * Apply minor suggestions from code review Co-authored-by: Adam Talbot <12817534+adamrtalbot@users.noreply.github.com> * Error out at point of failure in determining transcript attribute --------- Co-authored-by: Adam Talbot <12817534+adamrtalbot@users.noreply.github.com>
1 parent efbf86b commit c884823

File tree

8 files changed

+471
-0
lines changed

8 files changed

+471
-0
lines changed
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
---
# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json
# Conda environment for the custom_tx2gene module.
name: "custom_tx2gene"
# Channels are listed in priority order.
channels:
  - conda-forge
  - bioconda
  - defaults
# The module's script only needs a Python interpreter; no extra packages are listed.
dependencies:
  - python=3.9.5
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
// Build a transcript-to-gene mapping table from a GTF, cross-checked against
// the transcript identifiers found in Salmon/Kallisto quantification output.
process CUSTOM_TX2GENE {
    tag "$meta.id"
    label 'process_single'

    conda "${moduleDir}/environment.yml"
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://depot.galaxyproject.org/singularity/python:3.9--1' :
        'biocontainers/python:3.9--1' }"

    input:
    tuple val(meta), path(gtf)
    // Staged under the fixed name "quants" because the script template reads
    // the quantification results from a directory with exactly that name.
    tuple val(meta2), path ("quants")
    val quant_type
    val id
    val extra

    output:
    tuple val(meta), path("*.tx2gene.tsv"), emit: tx2gene
    path "versions.yml"                   , emit: versions

    when:
    task.ext.when == null || task.ext.when

    script:
    template 'tx2gene.py'

    stub:
    // Mirror the real script, which names its output after task.ext.prefix
    // and only falls back to meta.id when no prefix is configured.
    def prefix = task.ext.prefix ?: "${meta.id}"
    """
    touch ${prefix}.tx2gene.tsv

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        python: \$(python --version | sed 's/Python //g')
    END_VERSIONS
    """
}
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
---
2+
# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json
3+
name: "custom_tx2gene"
4+
description: Make a transcript/gene mapping from a GTF and cross-reference with transcript quantifications.
5+
keywords:
6+
- gene
7+
- gtf
8+
- pseudoalignment
9+
- transcript
10+
tools:
11+
- "custom":
12+
description: |
13+
"Custom module to create a transcript to gene mapping from a GTF and
14+
check it against transcript quantifications"
15+
tool_dev_url: "https://github.com/nf-core/modules/blob/master/modules/nf-core/custom/tx2gene/main.nf"
16+
licence: ["MIT"]
17+
18+
input:
19+
- meta:
20+
type: map
21+
description: |
22+
Groovy Map containing reference information related to the GTF file
23+
e.g. `[ id:'yeast' ]`
24+
- gtf:
25+
type: file
26+
description: An annotation file of the reference genome in GTF format
27+
pattern: "*.gtf"
28+
- meta2:
29+
type: map
30+
description: |
31+
Groovy Map containing information related to the experiment as a whole
32+
e.g. `[ id:'SRP123456' ]`
33+
- quants:
34+
type: directory
35+
description: Path to a directory with subdirectories corresponding to
36+
sample-wise runs of Salmon or Kallisto
37+
- quant_type:
38+
type: string
39+
description: Quantification type, 'kallisto' or 'salmon'
40+
- id:
41+
type: string
42+
description: Gene ID attribute in the GTF file (default= gene_id)
43+
- extra:
44+
type: string
45+
description: Extra gene attribute in the GTF file (default= gene_name)
46+
47+
output:
48+
- meta:
49+
type: map
50+
description: |
51+
Groovy Map containing reference information related to the GTF file
52+
e.g. `[ id:'yeast' ]`
53+
- tx2gene:
54+
type: file
55+
description: A transcript/gene mapping table in TSV format
56+
pattern: "*.tx2gene.tsv"
57+
- versions:
58+
type: file
59+
description: File containing software versions
60+
pattern: "versions.yml"
61+
62+
authors:
63+
- "@pinin4fjords"
64+
maintainers:
65+
- "@pinin4fjords"
Lines changed: 184 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,184 @@
1+
#!/usr/bin/env python3
2+
3+
# Written by Lorena Pantano with subsequent reworking by Jonathan Manning. Released under the MIT license.
4+
5+
import logging
6+
import argparse
7+
import glob
8+
import os
9+
import platform
10+
import re
11+
from collections import Counter, defaultdict, OrderedDict
12+
from collections.abc import Set
13+
from typing import Dict
14+
15+
# Configure logging
# Every record is prefixed with the logger name, a timestamp and the level.
logging.basicConfig(format="%(name)s - %(asctime)s %(levelname)s: %(message)s")
# Module-level logger shared by all functions in this script.
logger = logging.getLogger(__name__)
# INFO is enabled so the attribute-discovery messages are emitted.
logger.setLevel(logging.INFO)
19+
20+
def format_yaml_like(data: dict, indent: int = 0) -> str:
    """Render a (possibly nested) dictionary as YAML-like text.

    Each key is written on its own line, prefixed by one space per
    indentation level. Nested dictionaries are rendered recursively one
    level deeper; any other value is written after the key using its
    string form.

    Args:
        data (dict): The dictionary to render.
        indent (int): The indentation level of the keys of ``data``.

    Returns:
        str: The rendered text, or an empty string for an empty dictionary.
    """
    pad = " " * indent
    chunks = []
    for name, entry in data.items():
        if not isinstance(entry, dict):
            # Leaf value: key and value share one line.
            chunks.append(f"{pad}{name}: {entry}\\n")
            continue
        # Nested mapping: emit the key, then its children one level deeper.
        nested = format_yaml_like(entry, indent + 1)
        chunks.append(f"{pad}{name}:\\n{nested}")
    return "".join(chunks)
38+
39+
def read_top_transcripts(quant_dir: str, file_pattern: str) -> Set[str]:
    """
    Read the first 100 transcript identifiers from a quantification file.

    The file is looked up one directory level below ``quant_dir``. When
    several files match, the first one in sorted path order is used so the
    choice is reproducible. The first line is treated as a header and
    skipped; the identifier is the first whitespace-separated field of each
    following line. Blank lines are ignored.

    Parameters:
    quant_dir (str): Directory whose sub-directories hold quantification files.
    file_pattern (str): File name (or glob pattern) of the quantification file.

    Returns:
    set: Transcript identifiers taken from file lines 2 to 101.

    Raises:
    FileNotFoundError: If no file matches the pattern.
    """
    # Sort so the same file is picked regardless of directory listing order.
    matches = sorted(glob.glob(os.path.join(quant_dir, "*", file_pattern)))
    if not matches:
        # Only a genuinely missing file is reported as such; parsing problems
        # are no longer disguised as a missing file.
        logger.error("No quantification files found.")
        raise FileNotFoundError("Quantification file not found.")

    transcripts = set()
    with open(matches[0], "r") as file_handle:
        for i, line in enumerate(file_handle):
            if i == 0:
                # Header row
                continue
            if i > 100:
                # Enough rows sampled; no need to scan the rest of the file
                break
            fields = line.split()
            if fields:
                transcripts.add(fields[0])
    return transcripts
60+
61+
62+
def discover_transcript_attribute(gtf_file: str, transcripts: Set[str]) -> str:
    """
    Discover the attribute in the GTF that corresponds to transcripts, prioritizing 'transcript_id'.

    Every attribute whose value is one of the given transcripts receives a
    vote for each GTF line on which that happens. 'transcript_id' wins
    whenever it has (or ties for) the highest vote count; otherwise the most
    frequently matching attribute is returned.

    Parameters:
    gtf_file (str): Path to the GTF file.
    transcripts (Set[str]): A set of transcripts to match in the GTF file.

    Returns:
    str: The attribute name that corresponds to transcripts in the GTF file.

    Raises:
    ValueError: If no attribute value in the GTF matches any transcript.
    """
    # Compile once instead of re-resolving the pattern for every GTF line.
    # Matches key "value"; pairs, where the closing quote is not escaped.
    attribute_pattern = re.compile(r'(\\S+) "(.*?)(?<!\\\\)";')

    votes = Counter()
    with open(gtf_file) as inh:
        # Read GTF file, skipping header lines
        for line in filter(lambda x: not x.startswith("#"), inh):
            cols = line.split("\\t")

            # The attributes are the ninth column of a GTF record
            attributes_str = cols[8]
            attributes = dict(attribute_pattern.findall(attributes_str))

            votes.update(key for key, value in attributes.items() if value in transcripts)

    if not votes:
        # Fail here with a clear message rather than with an IndexError
        # from the empty vote table further down.
        logger.error("No attribute in GTF matching transcripts")
        raise ValueError("No attribute in GTF matching transcripts")

    # Check if 'transcript_id' is among the attributes with the highest votes
    if "transcript_id" in votes and votes["transcript_id"] == max(votes.values()):
        logger.info("Attribute 'transcript_id' corresponds to transcripts.")
        return "transcript_id"

    # If 'transcript_id' isn't the highest, determine the most common attribute that matches the transcripts
    attribute, _ = votes.most_common(1)[0]
    logger.info(f"Attribute '{attribute}' corresponds to transcripts.")
    return attribute
99+
100+
101+
def _split_attribute_fields(text: str) -> list[str]:
    """
    Split an attributes string on semicolons that are outside double quotes.

    Falls back to a plain split on every semicolon when no quoted value
    contains a semicolon, or when the quotes are unbalanced and the quoted
    regions therefore cannot be determined reliably.
    """
    segments = text.split('"')
    # After splitting on quotes, odd-indexed segments are the quoted values.
    # An even segment count means an odd number of quote characters.
    if len(segments) % 2 == 0 or not any(";" in quoted for quoted in segments[1::2]):
        return text.split(";")

    fields = []
    current = ""
    for index, segment in enumerate(segments):
        if index % 2:
            # Quoted value: keep verbatim, including any semicolons
            current += '"' + segment + '"'
            continue
        head, *tail = segment.split(";")
        current += head
        for piece in tail:
            # Each unquoted semicolon closes the field collected so far
            fields.append(current)
            current = piece
    fields.append(current)
    return fields


def parse_attributes(attributes_text: str) -> Dict[str, str]:
    """
    Parse the attributes column of a GTF file.

    Attributes are separated by semicolons; a semicolon inside a
    double-quoted value is treated as part of that value. Each attribute is
    split at its first space into key and value, and all double quotes are
    removed from the value. Entries without a space are ignored. If a key
    occurs more than once, the last value wins.

    :param attributes_text: The attributes column as a string.
    :return: A dictionary of the attributes, in order of first appearance.
    """
    attr_dict = OrderedDict()

    # Iterate over each attribute pair
    for attribute in _split_attribute_fields(attributes_text.strip()):
        # Split the attribute into key and value, ensuring there are two parts
        parts = attribute.strip().split(" ", 1)
        if len(parts) == 2:
            key, value = parts
            # Remove any double quotes from the value
            attr_dict[key] = value.replace('"', "")

    return attr_dict
123+
124+
125+
def map_transcripts_to_gene(
    quant_type: str, gtf_file: str, quant_dir: str, gene_id: str, extra_id_field: str, output_file: str
) -> bool:
    """
    Write a tab-separated transcript / gene / extra-identifier table from a GTF.

    The GTF attribute holding transcript identifiers is detected by comparing
    attribute values with transcripts sampled from the quantification output.
    One row is written for each distinct (transcript, gene) pair, in the
    order the pairs first appear in the GTF. GTF records lacking either the
    gene or the transcript attribute are skipped.

    Parameters:
    quant_type (str): Quantification method; 'salmon' reads 'quant.sf', anything else reads 'abundance.tsv'.
    gtf_file (str): Path to the GTF file.
    quant_dir (str): Directory where quantification files are located.
    gene_id (str): The gene ID attribute in the GTF file.
    extra_id_field (str): Additional attribute for the third column; the gene ID is used when a record lacks it.
    output_file (str): The output file path.

    Returns:
    bool: Always True once the table has been written.
    """
    # Pick the quantification file name that goes with the quantifier
    if quant_type == "salmon":
        quant_file_name = "quant.sf"
    else:
        quant_file_name = "abundance.tsv"

    sampled_transcripts = read_top_transcripts(quant_dir, quant_file_name)
    transcript_attribute = discover_transcript_attribute(gtf_file, sampled_transcripts)

    # (transcript, gene) pairs already written, used to suppress duplicates
    written_pairs = set()

    with open(gtf_file) as inh, open(output_file, "w") as output_handle:
        for line in inh:
            # Skip header / comment lines
            if line.startswith("#"):
                continue

            fields = parse_attributes(line.split("\\t")[8])
            if gene_id not in fields or transcript_attribute not in fields:
                continue

            transcript = fields[transcript_attribute]
            gene = fields[gene_id]
            if (transcript, gene) in written_pairs:
                continue

            # Fall back to the gene ID when the extra attribute is absent
            extra_id = fields.get(extra_id_field, gene)
            output_handle.write(f"{transcript}\\t{gene}\\t{extra_id}\\n")
            written_pairs.add((transcript, gene))

    return True
168+
169+
170+
# Main function to parse arguments and call the mapping function
171+
if __name__ == "__main__":
    # NOTE(review): the dollar-prefixed placeholders in this section are not
    # Python; main.nf runs this file as a template, so they are presumably
    # substituted with task values before Python sees the script - confirm.
    # An unset prefix is expected to arrive as the literal text "null", in
    # which case the sample id is used to name the output instead.
    prefix = (
        "$task.ext.prefix"
        if "$task.ext.prefix" != "null"
        else f"$meta.id"
    )
    # 'quants' is a fixed directory name that must match the name under which
    # the quantification results are staged for this task.
    # map_transcripts_to_gene as written always returns True, so this error
    # branch only fires if that function is changed to report failure.
    if not map_transcripts_to_gene('$quant_type', '$gtf', 'quants', '$id', '$extra', f"{prefix}.tx2gene.tsv"):
        logger.error("Failed to map transcripts to genes.")

    # Write the versions
    # Records the running Python version under the process name.
    versions_this_module = {}
    versions_this_module["${task.process}"] = {"python": platform.python_version()}
    with open("versions.yml", "w") as f:
        f.write(format_yaml_like(versions_this_module))
Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
// nf-test suite for CUSTOM_TX2GENE: one real run and one stub run, both fed
// with an untarred Kallisto results directory and a yeast GTF.
nextflow_process {

    name "Test Process CUSTOM_TX2GENE"
    script "../main.nf"
    process "CUSTOM_TX2GENE"

    tag "modules"
    tag "modules_nfcore"
    tag "custom"
    tag "custom/tx2gene"
    tag "untar"

    test("saccharomyces_cerevisiae - gtf") {

        setup {

            // Unpack the Kallisto results archive to obtain the quants directory
            run("UNTAR") {
                script "../../../untar/main.nf"
                process {
                    """
                    input[0] = [
                        [ id:'test'], // meta map
                        file(params.test_data['saccharomyces_cerevisiae']['genome']['kallisto_results'], checkIfExists: true)
                    ]
                    """
                }
            }
        }

        when {
            process {
                """
                input[0] = [
                    [ id:'test'], // meta map
                    file(params.test_data['saccharomyces_cerevisiae']['genome']['genome_gfp_gtf'], checkIfExists: true)
                ]
                input[1] = UNTAR.out.untar
                input[2] = 'kallisto'
                input[3] = 'gene_id'
                input[4] = 'gene_name'
                """
            }
        }

        then {
            // The 'versions' snapshot now captures the versions channel rather
            // than tx2gene a second time; any stored 'versions' snapshot entry
            // has to be regenerated after this change.
            assertAll(
                { assert process.success },
                { assert snapshot(process.out.tx2gene).match('tx2gene') },
                { assert snapshot(process.out.versions).match('versions') }
            )
        }

    }

    test("saccharomyces_cerevisiae - gtf - stub") {

        options "-stub"

        setup {

            // Unpack the Kallisto results archive to obtain the quants directory
            run("UNTAR") {
                script "../../../untar/main.nf"
                process {
                    """
                    input[0] = [
                        [ id:'test'], // meta map
                        file(params.test_data['saccharomyces_cerevisiae']['genome']['kallisto_results'], checkIfExists: true)
                    ]
                    """
                }
            }
        }

        when {
            process {
                """
                input[0] = [
                    [ id:'test'], // meta map
                    file(params.test_data['saccharomyces_cerevisiae']['genome']['genome_gfp_gtf'], checkIfExists: true)
                ]
                input[1] = UNTAR.out.untar
                input[2] = 'kallisto'
                input[3] = 'gene_id'
                input[4] = 'gene_name'
                """
            }
        }

        then {
            // As above: snapshot the versions channel, not tx2gene twice;
            // any stored 'versions - stub' snapshot entry has to be regenerated.
            assertAll(
                { assert process.success },
                { assert snapshot(process.out.tx2gene).match('tx2gene - stub') },
                { assert snapshot(process.out.versions).match('versions - stub') }
            )
        }
    }
}

0 commit comments

Comments
 (0)