22# Compute the signature score based on the geometric mean of the target gene expression
33# and split cells in 2 groups (high/low) using this signature score.
44
5- options(show.error.messages = FALSE ,
6- error = function () {
7- cat(geterrmessage(), file = stderr())
8- q(" no" , 1 , FALSE )
9- }
5+ options(
6+ show.error.messages = FALSE ,
7+ error = function () {
8+ cat(geterrmessage(), file = stderr())
9+ q(" no" , 1 , FALSE )
10+ }
1011)
1112loc <- Sys.setlocale(" LC_MESSAGES" , " en_US.UTF-8" )
1213warnings()
@@ -18,93 +19,95 @@ library(gridExtra)
1819
1920# Arguments
2021option_list <- list (
21- make_option(
22- " --input" ,
23- default = NA ,
24- type = " character" ,
25- help = " Input file that contains log2(CPM +1) values"
26- ),
27- make_option(
28- " --sep" ,
29- default = " \t " ,
30- type = " character" ,
31- help = " File separator [default : '%default' ]"
32- ),
33- make_option(
34- " --colnames" ,
35- default = TRUE ,
36- type = " logical" ,
37- help = " Consider first line as header ? [default : '%default' ]"
38- ),
39- make_option(
40- " --genes" ,
41- default = NA ,
42- type = " character" ,
43- help = " List of genes comma separated"
44- ),
45- make_option(
46- " --percentile_threshold" ,
47- default = 20 ,
48- type = " integer" ,
49- help = " detection threshold to keep a gene in signature set [default : '%default' ]"
50- ),
51- make_option(
52- " --output" ,
53- default = " ./output.tab" ,
54- type = " character" ,
55- help = " Output path [default : '%default' ]"
56- ),
57- make_option(
58- " --stats" ,
59- default = " ./statistics.tab" ,
60- type = " character" ,
61- help = " statistics path [default : '%default' ]"
62- ),
63- make_option(
64- " --correlations" ,
65- default = " ./correlations.tab" ,
66- type = " character" ,
67- help = " Correlations between signature genes [default : '%default' ]"
68- ),
69- make_option(
70- " --covariances" ,
71- default = " ./statistics.tab" ,
72- type = " character" ,
73- help = " Covariances between signature genes [default : '%default' ]"
74- ),
75- make_option(
76- " --pdf" ,
77- default = " ~/output.pdf" ,
78- type = " character" ,
79- help = " pdf path [default : '%default' ]"
80- )
22+ make_option(
23+ " --input" ,
24+ default = NA ,
25+ type = " character" ,
26+ help = " Input file that contains log2(CPM +1) values"
27+ ),
28+ make_option(
29+ " --sep" ,
30+ default = " \t " ,
31+ type = " character" ,
32+ help = " File separator [default : '%default' ]"
33+ ),
34+ make_option(
35+ " --colnames" ,
36+ default = TRUE ,
37+ type = " logical" ,
38+ help = " Consider first line as header ? [default : '%default' ]"
39+ ),
40+ make_option(
41+ " --genes" ,
42+ default = NA ,
43+ type = " character" ,
44+ help = " List of genes comma separated"
45+ ),
46+ make_option(
47+ " --percentile_threshold" ,
48+ default = 20 ,
49+ type = " integer" ,
50+ help = " detection threshold to keep a gene in signature set [default : '%default' ]"
51+ ),
52+ make_option(
53+ " --output" ,
54+ default = " ./output.tab" ,
55+ type = " character" ,
56+ help = " Output path [default : '%default' ]"
57+ ),
58+ make_option(
59+ " --stats" ,
60+ default = " ./statistics.tab" ,
61+ type = " character" ,
62+ help = " statistics path [default : '%default' ]"
63+ ),
64+ make_option(
65+ " --correlations" ,
66+ default = " ./correlations.tab" ,
67+ type = " character" ,
68+ help = " Correlations between signature genes [default : '%default' ]"
69+ ),
70+ make_option(
71+ " --covariances" ,
72+ default = " ./statistics.tab" ,
73+ type = " character" ,
74+ help = " Covariances between signature genes [default : '%default' ]"
75+ ),
76+ make_option(
77+ " --pdf" ,
78+ default = " ~/output.pdf" ,
79+ type = " character" ,
80+ help = " pdf path [default : '%default' ]"
81+ )
8182)
8283
8384opt <- parse_args(OptionParser(option_list = option_list ),
84- args = commandArgs(trailingOnly = TRUE ))
85+ args = commandArgs(trailingOnly = TRUE )
86+ )
8587
8688if (opt $ sep == " tab" ) {
87- opt $ sep <- " \t "
89+ opt $ sep <- " \t "
8890}
8991if (opt $ sep == " comma" ) {
90- opt $ sep <- " ,"
92+ opt $ sep <- " ,"
9193}
9294
9395# Take input data
9496data.counts <- read.table(
95- opt $ input ,
96- h = opt $ colnames ,
97- row.names = 1 ,
98- sep = opt $ sep ,
99- check.names = FALSE
97+ opt $ input ,
98+ h = opt $ colnames ,
99+ row.names = 1 ,
100+ sep = opt $ sep ,
101+ check.names = FALSE
100102)
101103
102104# Get vector of target genes
103105genes <- unlist(strsplit(opt $ genes , " ," ))
104106
105107if (length(unique(genes %in% rownames(data.counts ))) == 1 ) {
106- if (unique(genes %in% rownames(data.counts )) == FALSE )
107- stop(" None of these genes are in your dataset: " , opt $ genes )
108+ if (unique(genes %in% rownames(data.counts )) == FALSE ) {
109+ stop(" None of these genes are in your dataset: " , opt $ genes )
110+ }
108111}
109112
110113logical_genes <- rownames(data.counts ) %in% genes
@@ -116,10 +119,11 @@ signature.counts <- subset(data.counts, logical_genes)
116119signature.covariances <- as.data.frame(cov(t(signature.counts )))
117120signature.covariances <- cbind(gene = rownames(signature.covariances ), signature.covariances )
118121write.table(signature.covariances ,
119- file = opt $ covariances ,
120- quote = FALSE ,
121- row.names = FALSE ,
122- sep = " \t " )
122+ file = opt $ covariances ,
123+ quote = FALSE ,
124+ row.names = FALSE ,
125+ sep = " \t "
126+ )
123127
124128# compute signature.correlations
125129signature.correlations <- as.data.frame(cor(t(signature.counts )))
@@ -128,15 +132,15 @@ write.table(signature.correlations, file = opt$correlations, quote = FALSE, row.
128132
129133# # Descriptive Statistics Function
130134descriptive_stats <- function (InputData ) {
131- SummaryData <- data.frame (
132- mean = rowMeans(InputData ),
133- SD = apply(InputData , 1 , sd ),
134- Variance = apply(InputData , 1 , var ),
135- Percentage_Detection = apply(InputData , 1 , function (x , y = InputData ) {
136- (sum(x != 0 ) / ncol(y )) * 100
137- })
138- )
139- return (SummaryData )
135+ SummaryData <- data.frame (
136+ mean = rowMeans(InputData ),
137+ SD = apply(InputData , 1 , sd ),
138+ Variance = apply(InputData , 1 , var ),
139+ Percentage_Detection = apply(InputData , 1 , function (x , y = InputData ) {
140+ (sum(x != 0 ) / ncol(y )) * 100
141+ })
142+ )
143+ return (SummaryData )
140144}
141145
142146signature_stats <- descriptive_stats(signature.counts )
@@ -146,21 +150,21 @@ kept_genes <- signature_stats$Percentage_Detection >= opt$percentile_threshold
146150
147151# Add warnings
148152if (length(unique(kept_genes )) > 1 ) {
149- cat(
150- " WARNINGS ! Following genes were removed from further analysis due to low gene expression :" ,
151- paste(paste(rownames(signature.counts )[! kept_genes ], round(signature_stats $ Percentage_Detection [! kept_genes ], 2 ), sep = " : " ), collapse = " , " ),
152- " \n "
153- )
154- } else {
155- if (unique(kept_genes ) == FALSE ) {
156- stop(
157- " None of these genes are detected in " ,
158- opt $ percent ,
159- " % of your cells: " ,
160- paste(rownames(signature_stats ), collapse = " , " ),
161- " . You can be less stringent thanks to --percent parameter."
153+ cat(
154+ " WARNINGS ! Following genes were removed from further analysis due to low gene expression :" ,
155+ paste(paste(rownames(signature.counts )[! kept_genes ], round(signature_stats $ Percentage_Detection [! kept_genes ], 2 ), sep = " : " ), collapse = " , " ),
156+ " \n "
162157 )
163- }
158+ } else {
159+ if (unique(kept_genes ) == FALSE ) {
160+ stop(
161+ " None of these genes are detected in " ,
162+ opt $ percent ,
163+ " % of your cells: " ,
164+ paste(rownames(signature_stats ), collapse = " , " ),
165+ " . You can be less stringent thanks to --percent parameter."
166+ )
167+ }
164168}
165169
166170# Remove genes poorly detected in the dataset
@@ -173,54 +177,63 @@ signature.counts[signature.counts == 0] <- 1
173177score <- apply(signature.counts , 2 , geometric.mean ) # geometric.mean requires psych
174178
175179# Add results in signature_output
176- signature_output <- data.frame (cell = names(score ),
177- score = score ,
178- rate = ifelse(score > mean(score ), " HIGH" , " LOW" ),
179- nGenes = colSums(data.counts != 0 ),
180- total_counts = colSums(data.counts ))
180+ signature_output <- data.frame (
181+ cell = names(score ),
182+ score = score ,
183+ rate = ifelse(score > mean(score ), " HIGH" , " LOW" ),
184+ nGenes = colSums(data.counts != 0 ),
185+ total_counts = colSums(data.counts )
186+ )
181187
182188# statistics of input genes, signature genes first lines
183- statistics.counts <- rbind(subset(data.counts , logical_genes ),
184- subset(data.counts , ! logical_genes ))
189+ statistics.counts <- rbind(
190+ subset(data.counts , logical_genes ),
191+ subset(data.counts , ! logical_genes )
192+ )
185193statistics <- descriptive_stats(statistics.counts )
186194statistics <- cbind(gene = rownames(statistics ), statistics )
187195
188196
189197
190198# Re-arrange score matrix for plots
191- score <- data.frame (score = score ,
192- order = rank(score , ties.method = " first" ),
193- signature = signature_output $ rate ,
194- stringsAsFactors = FALSE )
199+ score <- data.frame (
200+ score = score ,
201+ order = rank(score , ties.method = " first" ),
202+ signature = signature_output $ rate ,
203+ stringsAsFactors = FALSE
204+ )
195205
196206pdf(file = opt $ pdf )
197207myplot <- ggplot(signature_output , aes(x = rate , y = score )) +
198- geom_violin(aes(fill = rate ), alpha = .5 , trim = FALSE , show.legend = FALSE , cex = 0.5 ) +
199- geom_abline(slope = 0 , intercept = mean(score $ score ), lwd = 0.5 , color = " red" ) +
200- scale_fill_manual(values = c(" #ff0000" , " #08661e" )) +
201- geom_jitter(size = 0.2 ) + labs(y = " Score" , x = " Rate" ) +
202- annotate(" text" , x = 0.55 , y = mean(score $ score ), cex = 3 , vjust = 1.5 ,
203- color = " black" , label = mean(score $ score ), parse = TRUE ) +
204- labs(title = " Violin plots of Cell signature scores" )
208+ geom_violin(aes(fill = rate ), alpha = .5 , trim = FALSE , show.legend = FALSE , cex = 0.5 ) +
209+ geom_abline(slope = 0 , intercept = mean(score $ score ), lwd = 0.5 , color = " red" ) +
210+ scale_fill_manual(values = c(" #ff0000" , " #08661e" )) +
211+ geom_jitter(size = 0.2 ) +
212+ labs(y = " Score" , x = " Rate" ) +
213+ annotate(" text" ,
214+ x = 0.55 , y = mean(score $ score ), cex = 3 , vjust = 1.5 ,
215+ color = " black" , label = mean(score $ score ), parse = TRUE
216+ ) +
217+ labs(title = " Violin plots of Cell signature scores" )
205218
206219print(myplot )
207220dev.off()
208221
209222# Save file
210223write.table(
211- signature_output ,
212- opt $ output ,
213- sep = " \t " ,
214- quote = FALSE ,
215- col.names = TRUE ,
216- row.names = FALSE
224+ signature_output ,
225+ opt $ output ,
226+ sep = " \t " ,
227+ quote = FALSE ,
228+ col.names = TRUE ,
229+ row.names = FALSE
217230)
218231
219232write.table(
220- statistics ,
221- opt $ stats ,
222- sep = " \t " ,
223- quote = FALSE ,
224- col.names = TRUE ,
225- row.names = FALSE
233+ statistics ,
234+ opt $ stats ,
235+ sep = " \t " ,
236+ quote = FALSE ,
237+ col.names = TRUE ,
238+ row.names = FALSE
226239)
0 commit comments