Skip to content

Commit c22729b

Browse files
VinzentRischgregcaporaso
authored andcommitted
ENH: Added action normalize that can normalize a FeatureTable[Frequency] with common methods. (qiime2#312)
* added normalized * typemap properties and plugin setup * lint and add rnanorm to metayaml
1 parent 8e6ea08 commit c22729b

File tree

6 files changed

+297
-6
lines changed

6 files changed

+297
-6
lines changed

ci/recipe/meta.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ requirements:
3131
- qiime2 {{ qiime2_epoch }}.*
3232
- q2templates {{ qiime2_epoch }}.*
3333
- q2-types {{ qiime2_epoch }}.*
34+
- rnanorm
3435

3536
test:
3637
requires:

q2_feature_table/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
# The full license is in the file LICENSE, distributed with this software.
77
# ----------------------------------------------------------------------------
88

9-
from ._normalize import rarefy
9+
from ._normalize import rarefy, normalize
1010
from ._subsample_ids import subsample_ids
1111
from ._transform import (presence_absence, relative_frequency, transpose)
1212
from ._summarize import (summarize, tabulate_seqs, tabulate_sample_frequencies,
@@ -31,4 +31,4 @@
3131
'filter_seqs', 'subsample_ids', 'rename_ids',
3232
'filter_features_conditionally', 'split',
3333
'tabulate_feature_frequencies', 'tabulate_sample_frequencies',
34-
'summarize_plus']
34+
'summarize_plus', 'normalize']

q2_feature_table/_normalize.py

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,12 @@
77
# ----------------------------------------------------------------------------
88
import biom
99

10+
import os
11+
12+
import pandas as pd
13+
from q2_types.feature_data import SequenceCharacteristicsDirectoryFormat
14+
from rnanorm import CPM, CTF, CUF, FPKM, TMM, TPM, UQ
15+
1016

1117
def rarefy(table: biom.Table,
1218
sampling_depth: int,
@@ -28,3 +34,95 @@ def rarefy(table: biom.Table,
2834
'shallow enough sampling depth.')
2935

3036
return table
37+
38+
39+
def normalize(
40+
table: pd.DataFrame,
41+
method: str,
42+
m_trim: float = None,
43+
a_trim: float = None,
44+
gene_length: SequenceCharacteristicsDirectoryFormat = None,
45+
) -> pd.DataFrame:
46+
# Validate parameter combinations and set trim parameters
47+
m_trim, a_trim = _validate_parameters(
48+
method, m_trim, a_trim, gene_length)
49+
50+
# Process gene_lengths input and define methods that need gene_lengths
51+
# input
52+
if method in ["tpm", "fpkm"]:
53+
lengths = _convert_lengths(table, gene_length)
54+
55+
methods = {
56+
"tpm": TPM(gene_lengths=lengths),
57+
"fpkm": FPKM(gene_lengths=lengths),
58+
}
59+
60+
# Define remaining methods that don't need gene_lengths input
61+
else:
62+
methods = {
63+
"tmm": TMM(m_trim=m_trim, a_trim=a_trim),
64+
"ctf": CTF(m_trim=m_trim, a_trim=a_trim),
65+
"uq": UQ(),
66+
"cuf": CUF(),
67+
"cpm": CPM(),
68+
}
69+
70+
# Run normalization method on frequency table
71+
normalized = methods[method].set_output(
72+
transform="pandas").fit_transform(table)
73+
normalized.index.name = "sample_id"
74+
75+
return normalized
76+
77+
78+
def _validate_parameters(method, m_trim, a_trim, gene_length):
79+
# Raise Error if gene-length is missing when using methods TPM or FPKM
80+
if method in ["tpm", "fpkm"] and not gene_length:
81+
raise ValueError("gene-length input is missing.")
82+
83+
# Raise Error if gene-length is given when using methods TMM, UQ, CUF,
84+
# CPM or CTF
85+
if method in ["tmm", "uq", "cuf", "ctf", "cpm"] and gene_length:
86+
raise ValueError(
87+
"gene-length input can only be used with FPKM and TPM methods."
88+
)
89+
90+
# Raise Error if m_trim or a_trim are given when not using methods
91+
# TMM or CTF
92+
if ((method not in ["tmm", "ctf"])
93+
and (m_trim is not None or a_trim is not None)):
94+
raise ValueError(
95+
"Parameters m-trim and a-trim can only be used with "
96+
"methods TMM and CTF."
97+
)
98+
99+
# Set m_trim and a_trim to their default values for methods
100+
# TMM and CTF
101+
if method in ["tmm", "ctf"]:
102+
m_trim = 0.3 if m_trim is None else m_trim
103+
a_trim = 0.05 if a_trim is None else a_trim
104+
105+
return m_trim, a_trim
106+
107+
108+
def _convert_lengths(table, gene_length):
109+
# Read in table from sequence_characteristics.tsv as a pd.Series
110+
lengths = pd.read_csv(
111+
os.path.join(gene_length.path, "sequence_characteristics.tsv"),
112+
sep="\t",
113+
header=None,
114+
names=["index", "values"],
115+
index_col="index",
116+
skiprows=1,
117+
).squeeze("columns")
118+
119+
# Check if all gene IDs that are present in the table are also
120+
# present in the lengths
121+
if not set(table.columns).issubset(set(lengths.index)):
122+
only_in_counts = set(table.columns) - set(lengths.index)
123+
raise ValueError(
124+
f"There are genes present in the FeatureTable that are "
125+
f"not present in the gene-length input. Missing lengths "
126+
f"for genes: {only_in_counts}"
127+
)
128+
return lengths

q2_feature_table/citations.bib

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,3 +43,12 @@ @article{NCBI-BLAST
4343
year = {2008},
4444
doi = {10.1093/nar/gkn201},
4545
}
46+
47+
@misc{Zmrzlikar_RNAnorm_RNA-seq_data_2023,
48+
author = {Zmrzlikar, Jure and Žganec, Matjaž and Ausec, Luka and Štajdohar, Miha},
49+
month = jul,
50+
title = {{RNAnorm: RNA-seq data normalization in Python}},
51+
url = {https://github.com/genialis/RNAnorm},
52+
version = {2.1.0},
53+
year = {2023}
54+
}

q2_feature_table/plugin_setup.py

Lines changed: 78 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,16 +5,17 @@
55
#
66
# The full license is in the file LICENSE, distributed with this software.
77
# ----------------------------------------------------------------------------
8-
8+
from qiime2.core.type import Properties
99
from qiime2.plugin import (Plugin, Int, Float, Range, Metadata, Str, Bool,
1010
Choices, MetadataColumn, Categorical, List,
1111
Citations, TypeMatch, TypeMap, Collection,
1212
Visualization)
1313

1414
from q2_types.feature_table import (
15-
FeatureTable, Frequency, RelativeFrequency, PresenceAbsence, Composition)
15+
FeatureTable, Frequency, RelativeFrequency, PresenceAbsence, Composition,
16+
Normalized)
1617
from q2_types.feature_data import (
17-
FeatureData, Sequence, Taxonomy, AlignedSequence)
18+
FeatureData, Sequence, Taxonomy, AlignedSequence, SequenceCharacteristics)
1819
from q2_types.metadata import ImmutableMetadata
1920

2021
import q2_feature_table
@@ -727,3 +728,77 @@
727728
"Tabulate sample and feature frequencies.",
728729
examples={'feature_table_summarize_plus': ex.feature_table_summarize_plus}
729730
)
731+
732+
P_method, T_normalized_table = TypeMap(
733+
{
734+
Str
735+
% Choices("tpm"): (
736+
FeatureTable[Normalized % Properties("TPM")],
737+
),
738+
Str
739+
% Choices("fpkm"): (
740+
FeatureTable[Normalized % Properties("FPKM")],
741+
),
742+
Str
743+
% Choices("tmm"): (
744+
FeatureTable[Normalized % Properties("TMM")],
745+
),
746+
Str
747+
% Choices("uq"): (
748+
FeatureTable[Normalized % Properties("UQ")],
749+
),
750+
Str
751+
% Choices("cuf"): (
752+
FeatureTable[Normalized % Properties("CUF")],
753+
),
754+
Str
755+
% Choices("ctf"): (
756+
FeatureTable[Normalized % Properties("CTF")],
757+
),
758+
Str
759+
% Choices("cpm"): (
760+
FeatureTable[Normalized % Properties("CPM")],
761+
),
762+
}
763+
)
764+
765+
plugin.methods.register_function(
766+
function=q2_feature_table.normalize,
767+
inputs={
768+
"table": FeatureTable[Frequency],
769+
"gene_length": FeatureData[
770+
SequenceCharacteristics % Properties("length")],
771+
},
772+
parameters={
773+
"method": P_method,
774+
"m_trim": Float % Range(
775+
0, 1, inclusive_start=True, inclusive_end=True
776+
),
777+
"a_trim": Float % Range(
778+
0, 1, inclusive_start=True, inclusive_end=True
779+
),
780+
},
781+
outputs=[("normalized_table", T_normalized_table)],
782+
input_descriptions={
783+
"table": "Feature table with gene counts.",
784+
"gene_length": "Gene lengths of all genes in the feature table.",
785+
},
786+
parameter_descriptions={
787+
"method": "Specify the normalization method to be used. Use FPKM or "
788+
"TPM for within comparisons and TMM, UQ, CUF or CTF for between "
789+
"sample camparisons. Check https://www.genialis.com/wp-content/uploads"
790+
"/2023/12/2023-Normalizing-RNA-seq-data-in-Python-with-RNAnorm.pdf "
791+
"for more information on the methods.",
792+
"m_trim": "Two sided cutoff for M-values. Can only be used for "
793+
"methods TMM and CTF. (default = 0.3)",
794+
"a_trim": "Two sided cutoff for A-values. Can only be used for "
795+
"methods TMM and CTF. (default = 0.05)",
796+
},
797+
output_descriptions={
798+
"normalized_table": "Feature table normalized with specified method."
799+
},
800+
name="Normalize FeatureTable",
801+
description="Normalize FeatureTable by gene length, library size and "
802+
"composition with common methods for RNA-seq.",
803+
citations=[citations["Zmrzlikar_RNAnorm_RNA-seq_data_2023"]],
804+
)

q2_feature_table/tests/test_normalize.py

Lines changed: 109 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,14 +5,20 @@
55
#
66
# The full license is in the file LICENSE, distributed with this software.
77
# ----------------------------------------------------------------------------
8-
8+
import os
99
from unittest import TestCase, main
10+
from unittest.mock import MagicMock, patch
1011

1112
import numpy as np
1213
import numpy.testing as npt
14+
import pandas as pd
1315
from biom.table import Table
16+
from pandas._testing import assert_series_equal
17+
from q2_types.feature_data import SequenceCharacteristicsDirectoryFormat
1418

1519
from q2_feature_table import rarefy
20+
from q2_feature_table._normalize import (_validate_parameters,
21+
_convert_lengths, normalize)
1622

1723

1824
class RarefyTests(TestCase):
@@ -78,5 +84,107 @@ def test_rarefy_depth_error(self):
7884
rarefy(t, 50)
7985

8086

87+
class NormalizeTests(TestCase):
88+
89+
@classmethod
90+
def setUpClass(cls):
91+
cls.lengths = pd.Series(
92+
{
93+
"ARO1": 1356.0,
94+
"ARO2": 1173.0,
95+
},
96+
name="values",
97+
)
98+
cls.lengths.index.name = "index"
99+
cls.table = pd.DataFrame({
100+
'ID': ['sample1', 'sample2'],
101+
'ARO1': [2.0, 2.0],
102+
'ARO2': [0.0, 0.0]
103+
}).set_index('ID')
104+
105+
def test_validate_parameters_uq_with_m_a_trim(self):
106+
# Test Error raised if gene-length is given with UQ method
107+
with self.assertRaisesRegex(
108+
ValueError,
109+
"Parameters m-trim and a-trim can only "
110+
"be used with methods TMM and CTF.",
111+
):
112+
_validate_parameters("uq", 0.2, 0.05, None)
113+
114+
def test_validate_parameters_tpm_missing_gene_length(self):
115+
# Test Error raised if gene-length is missing with TPM method
116+
with self.assertRaisesRegex(
117+
ValueError, "gene-length input is missing."):
118+
_validate_parameters("tpm", None, None, None)
119+
120+
def test_validate_parameters_tmm_gene_length(self):
121+
# Test Error raised if gene-length is given with TMM method
122+
with self.assertRaisesRegex(
123+
ValueError,
124+
"gene-length input can only be used with FPKM and "
125+
"TPM methods."
126+
):
127+
_validate_parameters(
128+
"tmm", None, None, gene_length=MagicMock())
129+
130+
def test_validate_parameters_default_m_a_trim(self):
131+
# Test if m_trim and a_trim get set to default values if None
132+
m_trim, a_trim = _validate_parameters("tmm", None, None, None)
133+
self.assertEqual(m_trim, 0.3)
134+
self.assertEqual(a_trim, 0.05)
135+
136+
def test_validate_parameters_m_a_trim(self):
137+
# Test if m_trim and a_trim are not modified if not None
138+
m_trim, a_trim = _validate_parameters("tmm", 0.1, 0.06, None)
139+
self.assertEqual(m_trim, 0.1)
140+
self.assertEqual(a_trim, 0.06)
141+
142+
def test_convert_lengths_gene_length(self):
143+
# Test _convert_lengths
144+
gene_length = SequenceCharacteristicsDirectoryFormat()
145+
with open(os.path.join(
146+
str(gene_length), "sequence_characteristics.tsv"),
147+
'w') as file:
148+
file.write("id\tlength\nARO1\t1356.0\nARO2\t1173.0")
149+
150+
obs = _convert_lengths(self.table, gene_length=gene_length)
151+
assert_series_equal(obs, self.lengths)
152+
153+
def test_convert_lengths_short_gene_length(self):
154+
# Test Error raised if gene-length is missing genes
155+
gene_length = SequenceCharacteristicsDirectoryFormat()
156+
with open(os.path.join(
157+
str(gene_length),
158+
"sequence_characteristics.tsv"), 'w') as file:
159+
file.write("id\tlength\nARO1\t1356.0")
160+
with self.assertRaisesRegex(
161+
ValueError,
162+
"There are genes present in the FeatureTable that are "
163+
"not present in the gene-length input. Missing lengths "
164+
"for genes: {'ARO2'}",
165+
):
166+
_convert_lengths(self.table, gene_length=gene_length)
167+
168+
@patch("q2_feature_table._normalize.TPM")
169+
def test_tpm_fpkm_with_valid_inputs(self, mock_tpm):
170+
# Test valid inputs for TPM method
171+
gene_length = SequenceCharacteristicsDirectoryFormat()
172+
with open(os.path.join(
173+
str(gene_length), "sequence_characteristics.tsv"),
174+
'w') as file:
175+
file.write("id\tlength\nARO1\t1356.0\nARO2\t1173.0")
176+
normalize(table=self.table, gene_length=gene_length, method="tpm")
177+
178+
@patch("q2_feature_table._normalize.TMM")
179+
def test_tmm_uq_cuf_ctf_with_valid_inputs(self, mock_tmm):
180+
# Test valid inputs for TMM method
181+
gene_length = SequenceCharacteristicsDirectoryFormat()
182+
with open(os.path.join(
183+
str(gene_length), "sequence_characteristics.tsv"),
184+
'w') as file:
185+
file.write("id\tlength\nARO1\t1356.0\nARO2\t1173.0")
186+
normalize(table=self.table, method="tmm", a_trim=0.06, m_trim=0.4)
187+
188+
81189
if __name__ == "__main__":
82190
main()

0 commit comments

Comments
 (0)