2
2
# Compute the signature score based on the geometric mean of the target gene expression
3
3
# and split cells in 2 groups (high/low) using this signature score.
4
4
5
- options(show.error.messages = FALSE ,
6
- error = function () {
7
- cat(geterrmessage(), file = stderr())
8
- q(" no" , 1 , FALSE )
9
- }
5
+ options(
6
+ show.error.messages = FALSE ,
7
+ error = function () {
8
+ cat(geterrmessage(), file = stderr())
9
+ q(" no" , 1 , FALSE )
10
+ }
10
11
)
11
12
loc <- Sys.setlocale(" LC_MESSAGES" , " en_US.UTF-8" )
12
13
warnings()
@@ -18,93 +19,95 @@ library(gridExtra)
18
19
19
20
# Arguments
20
21
option_list <- list (
21
- make_option(
22
- " --input" ,
23
- default = NA ,
24
- type = " character" ,
25
- help = " Input file that contains log2(CPM +1) values"
26
- ),
27
- make_option(
28
- " --sep" ,
29
- default = " \t " ,
30
- type = " character" ,
31
- help = " File separator [default : '%default' ]"
32
- ),
33
- make_option(
34
- " --colnames" ,
35
- default = TRUE ,
36
- type = " logical" ,
37
- help = " Consider first line as header ? [default : '%default' ]"
38
- ),
39
- make_option(
40
- " --genes" ,
41
- default = NA ,
42
- type = " character" ,
43
- help = " List of genes comma separated"
44
- ),
45
- make_option(
46
- " --percentile_threshold" ,
47
- default = 20 ,
48
- type = " integer" ,
49
- help = " detection threshold to keep a gene in signature set [default : '%default' ]"
50
- ),
51
- make_option(
52
- " --output" ,
53
- default = " ./output.tab" ,
54
- type = " character" ,
55
- help = " Output path [default : '%default' ]"
56
- ),
57
- make_option(
58
- " --stats" ,
59
- default = " ./statistics.tab" ,
60
- type = " character" ,
61
- help = " statistics path [default : '%default' ]"
62
- ),
63
- make_option(
64
- " --correlations" ,
65
- default = " ./correlations.tab" ,
66
- type = " character" ,
67
- help = " Correlations between signature genes [default : '%default' ]"
68
- ),
69
- make_option(
70
- " --covariances" ,
71
- default = " ./statistics.tab" ,
72
- type = " character" ,
73
- help = " Covariances between signature genes [default : '%default' ]"
74
- ),
75
- make_option(
76
- " --pdf" ,
77
- default = " ~/output.pdf" ,
78
- type = " character" ,
79
- help = " pdf path [default : '%default' ]"
80
- )
22
+ make_option(
23
+ " --input" ,
24
+ default = NA ,
25
+ type = " character" ,
26
+ help = " Input file that contains log2(CPM +1) values"
27
+ ),
28
+ make_option(
29
+ " --sep" ,
30
+ default = " \t " ,
31
+ type = " character" ,
32
+ help = " File separator [default : '%default' ]"
33
+ ),
34
+ make_option(
35
+ " --colnames" ,
36
+ default = TRUE ,
37
+ type = " logical" ,
38
+ help = " Consider first line as header ? [default : '%default' ]"
39
+ ),
40
+ make_option(
41
+ " --genes" ,
42
+ default = NA ,
43
+ type = " character" ,
44
+ help = " List of genes comma separated"
45
+ ),
46
+ make_option(
47
+ " --percentile_threshold" ,
48
+ default = 20 ,
49
+ type = " integer" ,
50
+ help = " detection threshold to keep a gene in signature set [default : '%default' ]"
51
+ ),
52
+ make_option(
53
+ " --output" ,
54
+ default = " ./output.tab" ,
55
+ type = " character" ,
56
+ help = " Output path [default : '%default' ]"
57
+ ),
58
+ make_option(
59
+ " --stats" ,
60
+ default = " ./statistics.tab" ,
61
+ type = " character" ,
62
+ help = " statistics path [default : '%default' ]"
63
+ ),
64
+ make_option(
65
+ " --correlations" ,
66
+ default = " ./correlations.tab" ,
67
+ type = " character" ,
68
+ help = " Correlations between signature genes [default : '%default' ]"
69
+ ),
70
+ make_option(
71
+ " --covariances" ,
72
+ default = " ./statistics.tab" ,
73
+ type = " character" ,
74
+ help = " Covariances between signature genes [default : '%default' ]"
75
+ ),
76
+ make_option(
77
+ " --pdf" ,
78
+ default = " ~/output.pdf" ,
79
+ type = " character" ,
80
+ help = " pdf path [default : '%default' ]"
81
+ )
81
82
)
82
83
83
84
opt <- parse_args(OptionParser(option_list = option_list ),
84
- args = commandArgs(trailingOnly = TRUE ))
85
+ args = commandArgs(trailingOnly = TRUE )
86
+ )
85
87
86
88
if (opt $ sep == " tab" ) {
87
- opt $ sep <- " \t "
89
+ opt $ sep <- " \t "
88
90
}
89
91
if (opt $ sep == " comma" ) {
90
- opt $ sep <- " ,"
92
+ opt $ sep <- " ,"
91
93
}
92
94
93
95
# Take input data
94
96
data.counts <- read.table(
95
- opt $ input ,
96
- h = opt $ colnames ,
97
- row.names = 1 ,
98
- sep = opt $ sep ,
99
- check.names = FALSE
97
+ opt $ input ,
98
+ h = opt $ colnames ,
99
+ row.names = 1 ,
100
+ sep = opt $ sep ,
101
+ check.names = FALSE
100
102
)
101
103
102
104
# Get vector of target genes
103
105
genes <- unlist(strsplit(opt $ genes , " ," ))
104
106
105
107
if (length(unique(genes %in% rownames(data.counts ))) == 1 ) {
106
- if (unique(genes %in% rownames(data.counts )) == FALSE )
107
- stop(" None of these genes are in your dataset: " , opt $ genes )
108
+ if (unique(genes %in% rownames(data.counts )) == FALSE ) {
109
+ stop(" None of these genes are in your dataset: " , opt $ genes )
110
+ }
108
111
}
109
112
110
113
logical_genes <- rownames(data.counts ) %in% genes
@@ -116,10 +119,11 @@ signature.counts <- subset(data.counts, logical_genes)
116
119
signature.covariances <- as.data.frame(cov(t(signature.counts )))
117
120
signature.covariances <- cbind(gene = rownames(signature.covariances ), signature.covariances )
118
121
write.table(signature.covariances ,
119
- file = opt $ covariances ,
120
- quote = FALSE ,
121
- row.names = FALSE ,
122
- sep = " \t " )
122
+ file = opt $ covariances ,
123
+ quote = FALSE ,
124
+ row.names = FALSE ,
125
+ sep = " \t "
126
+ )
123
127
124
128
# compute signature.correlations
125
129
signature.correlations <- as.data.frame(cor(t(signature.counts )))
@@ -128,15 +132,15 @@ write.table(signature.correlations, file = opt$correlations, quote = FALSE, row.
128
132
129
133
# # Descriptive Statistics Function
130
134
descriptive_stats <- function (InputData ) {
131
- SummaryData <- data.frame (
132
- mean = rowMeans(InputData ),
133
- SD = apply(InputData , 1 , sd ),
134
- Variance = apply(InputData , 1 , var ),
135
- Percentage_Detection = apply(InputData , 1 , function (x , y = InputData ) {
136
- (sum(x != 0 ) / ncol(y )) * 100
137
- })
138
- )
139
- return (SummaryData )
135
+ SummaryData <- data.frame (
136
+ mean = rowMeans(InputData ),
137
+ SD = apply(InputData , 1 , sd ),
138
+ Variance = apply(InputData , 1 , var ),
139
+ Percentage_Detection = apply(InputData , 1 , function (x , y = InputData ) {
140
+ (sum(x != 0 ) / ncol(y )) * 100
141
+ })
142
+ )
143
+ return (SummaryData )
140
144
}
141
145
142
146
signature_stats <- descriptive_stats(signature.counts )
@@ -146,21 +150,21 @@ kept_genes <- signature_stats$Percentage_Detection >= opt$percentile_threshold
146
150
147
151
# Add warnings
148
152
if (length(unique(kept_genes )) > 1 ) {
149
- cat(
150
- " WARNINGS ! Following genes were removed from further analysis due to low gene expression :" ,
151
- paste(paste(rownames(signature.counts )[! kept_genes ], round(signature_stats $ Percentage_Detection [! kept_genes ], 2 ), sep = " : " ), collapse = " , " ),
152
- " \n "
153
- )
154
- } else {
155
- if (unique(kept_genes ) == FALSE ) {
156
- stop(
157
- " None of these genes are detected in " ,
158
- opt $ percent ,
159
- " % of your cells: " ,
160
- paste(rownames(signature_stats ), collapse = " , " ),
161
- " . You can be less stringent thanks to --percent parameter."
153
+ cat(
154
+ " WARNINGS ! Following genes were removed from further analysis due to low gene expression :" ,
155
+ paste(paste(rownames(signature.counts )[! kept_genes ], round(signature_stats $ Percentage_Detection [! kept_genes ], 2 ), sep = " : " ), collapse = " , " ),
156
+ " \n "
162
157
)
163
- }
158
+ } else {
159
+ if (unique(kept_genes ) == FALSE ) {
160
+ stop(
161
+ " None of these genes are detected in " ,
162
+ opt $ percent ,
163
+ " % of your cells: " ,
164
+ paste(rownames(signature_stats ), collapse = " , " ),
165
+ " . You can be less stringent thanks to --percent parameter."
166
+ )
167
+ }
164
168
}
165
169
166
170
# Remove genes poorly detected in the dataset
@@ -173,54 +177,63 @@ signature.counts[signature.counts == 0] <- 1
173
177
score <- apply(signature.counts , 2 , geometric.mean ) # geometric.mean requires psych
174
178
175
179
# Add results in signature_output
176
- signature_output <- data.frame (cell = names(score ),
177
- score = score ,
178
- rate = ifelse(score > mean(score ), " HIGH" , " LOW" ),
179
- nGenes = colSums(data.counts != 0 ),
180
- total_counts = colSums(data.counts ))
180
+ signature_output <- data.frame (
181
+ cell = names(score ),
182
+ score = score ,
183
+ rate = ifelse(score > mean(score ), " HIGH" , " LOW" ),
184
+ nGenes = colSums(data.counts != 0 ),
185
+ total_counts = colSums(data.counts )
186
+ )
181
187
182
188
# statistics of input genes, signature genes first lines
183
- statistics.counts <- rbind(subset(data.counts , logical_genes ),
184
- subset(data.counts , ! logical_genes ))
189
+ statistics.counts <- rbind(
190
+ subset(data.counts , logical_genes ),
191
+ subset(data.counts , ! logical_genes )
192
+ )
185
193
statistics <- descriptive_stats(statistics.counts )
186
194
statistics <- cbind(gene = rownames(statistics ), statistics )
187
195
188
196
189
197
190
198
# Re-arrange score matrix for plots
191
- score <- data.frame (score = score ,
192
- order = rank(score , ties.method = " first" ),
193
- signature = signature_output $ rate ,
194
- stringsAsFactors = FALSE )
199
+ score <- data.frame (
200
+ score = score ,
201
+ order = rank(score , ties.method = " first" ),
202
+ signature = signature_output $ rate ,
203
+ stringsAsFactors = FALSE
204
+ )
195
205
196
206
pdf(file = opt $ pdf )
197
207
myplot <- ggplot(signature_output , aes(x = rate , y = score )) +
198
- geom_violin(aes(fill = rate ), alpha = .5 , trim = FALSE , show.legend = FALSE , cex = 0.5 ) +
199
- geom_abline(slope = 0 , intercept = mean(score $ score ), lwd = 0.5 , color = " red" ) +
200
- scale_fill_manual(values = c(" #ff0000" , " #08661e" )) +
201
- geom_jitter(size = 0.2 ) + labs(y = " Score" , x = " Rate" ) +
202
- annotate(" text" , x = 0.55 , y = mean(score $ score ), cex = 3 , vjust = 1.5 ,
203
- color = " black" , label = mean(score $ score ), parse = TRUE ) +
204
- labs(title = " Violin plots of Cell signature scores" )
208
+ geom_violin(aes(fill = rate ), alpha = .5 , trim = FALSE , show.legend = FALSE , cex = 0.5 ) +
209
+ geom_abline(slope = 0 , intercept = mean(score $ score ), lwd = 0.5 , color = " red" ) +
210
+ scale_fill_manual(values = c(" #ff0000" , " #08661e" )) +
211
+ geom_jitter(size = 0.2 ) +
212
+ labs(y = " Score" , x = " Rate" ) +
213
+ annotate(" text" ,
214
+ x = 0.55 , y = mean(score $ score ), cex = 3 , vjust = 1.5 ,
215
+ color = " black" , label = mean(score $ score ), parse = TRUE
216
+ ) +
217
+ labs(title = " Violin plots of Cell signature scores" )
205
218
206
219
print(myplot )
207
220
dev.off()
208
221
209
222
# Save file
210
223
write.table(
211
- signature_output ,
212
- opt $ output ,
213
- sep = " \t " ,
214
- quote = FALSE ,
215
- col.names = TRUE ,
216
- row.names = FALSE
224
+ signature_output ,
225
+ opt $ output ,
226
+ sep = " \t " ,
227
+ quote = FALSE ,
228
+ col.names = TRUE ,
229
+ row.names = FALSE
217
230
)
218
231
219
232
write.table(
220
- statistics ,
221
- opt $ stats ,
222
- sep = " \t " ,
223
- quote = FALSE ,
224
- col.names = TRUE ,
225
- row.names = FALSE
233
+ statistics ,
234
+ opt $ stats ,
235
+ sep = " \t " ,
236
+ quote = FALSE ,
237
+ col.names = TRUE ,
238
+ row.names = FALSE
226
239
)
0 commit comments