-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcounters.R
99 lines (85 loc) · 3.04 KB
/
counters.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
# find the excel file with the list of experiments
# count the patterns per experiment
folder.project <- "/Users/radek/reactor/rnaoutliers"
source(paste(folder.project, "config.R", sep="/"))
# get hairpin data
hairpinSeqs <- readRNAStringSet(file=hairpin.path, format="fasta")
# get only C. Elegans data
cElegHairpings <- hairpinSeqs[grep("^cel-", names(hairpinSeqs))]
#setwd("D:/devel/data/small_RNA")
#file <- "B470002.1.fq"
# get counts for all files within a given condition
getCounts <- function(files, occurences, path) {
condCounts <- list()
top_m <- NULL
for (file in files) {
fq <- readFastq(path, pattern = file)
top = tables(sread(fq), n = length(unique(sread(fq))))
condCounts[[file]] <- top$top
row_m <- matrix(top$top, nrow=length(top$top), ncol=1)
colnames(row_m) <- file
rownames(row_m) <- names(top$top)
if (is.null(top_m)) {
top_m <- row_m
} else {
top_m <- cbindX(top_m, row_m)
}
}
# filter out all sequences that has less than mean number of occurences
top_m[rowSums(top_m) > occurences,])
for (file in files) {
fq <- readFastq(path, pattern = file)
top = tables(sread(fq), n = length(unique(sread(fq))))
experimentSeqs = top$top[top$top >= occurences]
vcount <- rep(0, length(experimentSeqs))
names(vcount) <- names(experimentSeqs)
for (key in 1:length(experimentSeqs)) {
seq <- names(vcount)[key]
vcount[key] <- sum(vcountPattern(RNAString(DNAString(seq)),
cElegHairpings, max.mismatch=2))
if (key %% 1000 == 0) {
print(key)
flush.console()
}
}
vcount
}
}
# for every condition
conditions <- unique(samples$condition)
# for testing use just one condition
condition <- "N2C"
for (condition in conditions) {
files <- as.character(samples[samples$condition == condition,4])
condCounts <- list()
for (file in files) {
#files <- list.files(folder.clean)
#for(file in files) {
vcount <- getCounts(file, occurences = 5, folder.clean)
condCounts[[file]] <- vcount
# TODO: only discard counters if they have less than 5 occurences
# across the conditions
#vcount.positive <- vcount[vcount > 0]
#dput(vcount.positive, paste(folder.counters, file, sep=""))
}
}
files <- list.files(folder.counters)
count.m <- NULL
for (file in files) {
vcount.positive <- dget(paste(folder.counters, file, sep=""))
row.m <- matrix(vcount.positive, nrow=length(vcount.positive), ncol=1)
colnames(row.m) <- file
rownames(row.m) <- names(vcount.positive)
if (is.null(count.m)) {
count.m <- row.m
} else {
count.m <- cbindX(count.m, row.m)
}
}
count.m <- na.omit(count.m)
# normalize table counts using PreprocessCore library (see normalize.quantiles)
# is this correct?
count.q <- normalize.quantiles(count.m)
rownames(count.q) <- rownames(count.m)
colnames(count.q) <- colnames(count.m)
#