generated from carpentries/workshop-template
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathday_1-1.r
344 lines (223 loc) · 7.75 KB
/
day_1-1.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
### ###
### Software Carpentry Nov 2022 ###
### DAY 1 ###
### Intro to R, Simulation ###
### ###
##########################################################x
# 1 basics in R (Studio) #####
##########################################################x
# Besides statistics, R can be used as a plain calculator.
# Type these expressions directly into the CONSOLE pane; each result is printed.
1 + 1
log(5) # natural logarithm (base e) by default
exp(6) ^ (4*7)
sum(1, 4, 67, 9)
### Logical operators in R ###
# (`1.`, `2.` and `9.` below are ordinary numeric literals; the trailing dot changes nothing)
1 == 1. # returns TRUE
1 == 2. # returns FALSE
1 != 2. # returns TRUE (1 and 2 are NOT equal)
4 < 9. # returns TRUE
# Longer sequences of calculations are
# often easier to write in the EDITOR window of RStudio
1 + 5 * (12 - log(5))
# assigning a value to a variable and using it in subsequent calculations
x <- 3
y <- 1 + 5 * ( 12 - log(x))
### vectorisation ###
# operations are applied element by element to whole vectors
x <- 1:10 # x is a vector: 1 2 3 4 5 6 7 8 9 10
y <- x^2 # returns a vector: 1 4 9 16 25 36 49 64 81 100
plot(x, y)
x <- 1:5
y <- c(1, 2, 3, 4, 5)
x == y # element-wise comparison: TRUE TRUE TRUE TRUE TRUE
# built-in functions for sequences and repetitions:
x <- seq(1, 5, by=0.1) # returns 1, 1.1, 1.2, 1.3, ... , 4.9, 5
x <- rep(1, 5) # returns 1 1 1 1 1
x <- c(3,5,6,8,10)
y <- mean(x) # "mean" is a built-in function that calculates the mean of a vector of numbers
### getting help ###
# both lines open the help page for mean()
?mean
help(mean)
# informative names: give variables a meaningful name, e.g.:
mean_boys_height <- mean(c(160, 176, 170))
# Valid variable names
MaxLength <- 1 # CamelCase
celsius2kelvin <- 1 # digits are allowed, just not as the first character
min_height <- 1 # snake_case
max.height <- 1 # dot notation
# Invalid names (kept commented out because they do not work):
# _age <- 1        (a name cannot start with an underscore)
# min-length <- 1  (the hyphen is read as a minus sign)
# 2widths <- 1     (a name cannot start with a digit)
### Managing Environment ###
# in the RStudio ENVIRONMENT pane you can check which variables are assigned, and their values
# list the names of the objects in the current environment
ls()
# a name starting with a dot (e.g. ".xyz <- 666") creates a variable that is hidden:
# ls() does not list it by default
.hidden_variable <- 1
# remove single items (variables, data frames, etc.) from the environment
rm(x)
# wipe your environment: removes everything ls() lists (hidden dot-variables are therefore kept);
# a character vector of specific names can be passed to `list` instead
rm(list=ls())
### packages ###
# The R community provides a large number of add-on packages and functions.
# RStudio's "Packages" pane shows which packages are installed and which are loaded.
# Install a package (needed once per R installation) ...
install.packages("NAME_OF_PACKAGE")
# ... then attach it in every session that uses it.
# library() is the function that loads packages; load() restores objects
# from a saved .RData file and cannot be used for this.
library("NAME_OF_PACKAGE")
# for this workshop, we need:
install.packages(c("ggplot2", "dplyr", "tidyr", "gapminder"))
# "ggplot2", "dplyr", "tidyr" are part of the larger "tidyverse" collection
### File management / working directories ###
setwd("../") # move one folder up
getwd() # get your current working directory
setwd("C:/Users/my_user_name/folder") # set working directory (replace with a path on your machine)
# The downloaded file is stored in a folder "data" inside the working directory.
# Create that folder first if it is missing; download.file() cannot write
# into a directory that does not exist.
if (!dir.exists("data")) {
  dir.create("data")
}
download.file(
  url = "https://raw.githubusercontent.com/swcarpentry/r-novice-gapminder/gh-pages/_episodes_rmd/data/gapminder_data.csv",
  destfile = "data/gapminder_data.csv"
)
### creating data frames ###
# three cats, one row each: coat colour (text), weight (number), likes_string (1 = yes, 0 = no)
cats <- data.frame( coat = c("calico", "black", "tabby"),
weight = c(2.1, 5, 3.2),
likes_string = c(1, 0, 1))
# save the data frame as a CSV file (without a column of row names)
write.csv(cats, file = "data/feline-data.csv", row.names = FALSE)
# remove the data frame from the environment
rm(cats)
# load the cats data back in from the file
cats <- read.csv(file = "data/feline-data.csv")
### manipulating/analysing cats data ###
cats$weight <- cats$weight * 1000 # use weight in g, not kg
### Data Types ###
str(cats) # compact overview of the columns and their types
typeof(cats) # returns "list"
class(cats) # returns "data.frame"
typeof(cats$coat) # returns "character" (R >= 4.0; older versions read text columns as factors)
typeof(cats$weight) # returns "double"
# 'double' stores floating-point numbers (vs integers)
typeof(1) # returns "double"
typeof(1L) # returns "integer" (the L suffix marks an integer literal)
# note: cats$likes_string was stored as 1/0 (yes/no), but we can coerce it to TRUE/FALSE:
is.logical(cats$likes_string) # FALSE
sum(cats$likes_string) # returns 2
cats$likes_string <- as.logical(cats$likes_string)
sum(cats$likes_string) # still returns 2, because TRUE counts as 1 and FALSE as 0
# calculations that mix incompatible types do not work, though.
# The next line fails on purpose; when the file is run via source(), execution stops here.
cats$weight + cats$coat
# returns:
# Error in cats$weight + cats$coat :
# non-numeric argument to binary operator
typeof(cats$coat)
typeof(cats$weight)
# convert a variable stored as character into a factor:
cats$coat <- as.factor(cats$coat)
# and convert it back to character
cats$coat <- as.character(cats$coat)
##########################################################x
# 2 functions in R ######
##########################################################x
# Skeleton showing how a function is defined: `x` is its parameter and the
# code between the braces is its body.
# As written, the body contains only comments, so calling my_function() returns NULL.
my_function <- function(x) {
# perform actions on parameter x here, e.g. x + 100
# then hand back the result, as the last expression or explicitly via return()
}
# Example: converting between temperature scales.
#
# fahr_to_kelvin() converts temperatures from degrees Fahrenheit to Kelvin.
#   temp     numeric vector of temperatures in degrees Fahrenheit
#   min_kel  offset added after rescaling; defaults to 273.15
# The computation is vectorised, so several temperatures can be converted at once.
fahr_to_kelvin <- function(temp, min_kel = 273.15) {
  degrees_per_fahr <- (5 / 9)
  rescaled <- (temp - 32) * degrees_per_fahr
  rescaled + min_kel
}
fahr_to_kelvin(100) # returns 310.9278
fahr_to_kelvin(0) # returns 255.3722
fahr_to_kelvin(temp = 32) # 273.15
fahr_to_kelvin(temp = c(0, 32, 100)) # returns 255.3722 273.1500 310.9278
# kelvin_to_celsius() converts temperatures from Kelvin to degrees Celsius
# by subtracting 273.15.
#   temp  numeric vector of temperatures in Kelvin
kelvin_to_celsius <- function(temp) {
  kelvin_offset <- 273.15
  temp - kelvin_offset
}
kelvin_to_celsius(3497)
# fahr_to_celsius() converts degrees Fahrenheit to degrees Celsius by chaining
# the two converters: Fahrenheit -> Kelvin -> Celsius.
#   temp_farh  numeric vector of temperatures in degrees Fahrenheit
fahr_to_celsius <- function(temp_farh) {
  kelvin_to_celsius(temp = fahr_to_kelvin(temp = temp_farh))
}
fahr_to_celsius(100)
##########################################################x
# 3 using R for generative simulations ######
##########################################################x
# built-in constants: the lower-case and upper-case letters of the alphabet
letters
LETTERS
# sample() takes a random sample of size n from the elements of x, either with or without replacement
n <- 100
draws <- sample(LETTERS, n, replace = TRUE)
barplot(table(draws)) # check which letter got sampled how often
abline(h = n/26, lty = 2) # dashed line at n/26, the count expected per letter if all 26 are equally likely
set.seed(2)
# set.seed() fixes the state of the random number generator, which guarantees
# that the same random values are produced every time the code is run;
# it is used to create reproducible results.
# Note: it only affects draws made after this call, not the sample taken above.
# sample_letters() draws `n` upper-case letters at random (with replacement)
# and returns a table counting how often each drawn letter occurred.
#   n  number of letters to draw
sample_letters <- function(n) {
  # naming the argument keeps "draws" as the label of the table's dimension
  table(draws = sample(LETTERS, size = n, replace = TRUE))
}
# Pitfall: sample(15, 1) is not the same as sample("15", 1). A single number
# x >= 1 is interpreted as the range 1:x, so sample(15, 1) draws from 1:15.
#
# sample_safe() avoids the problem by sampling positions of `x` and returning
# the elements at those positions.
#   x    vector to draw from
#   ...  further arguments passed on to sample.int() (e.g. size, replace, prob)
sample_safe <- function(x, ...) {
  positions <- sample.int(length(x), ...)
  x[positions]
}
# sample 30 numbers from 1:10 with different relative weights:
# `prob` makes larger numbers proportionally more likely to be drawn
sample(1:10, 30, replace=TRUE, prob=1:10)
# sampling from a normal distribution:
rnorm(10) # 10 draws; defaults are mean = 0 and sd = 1
hist(rnorm(10000000)) # histogram of ten million draws
mean(rnorm(1000, 5)) # the second argument is the mean, so the result is close to 5
hist(rnorm(1000, 5))
my_sample <- rnorm (10000, mean = 10, sd = 4)
hist(my_sample)
# simulated values with mean 170 and standard deviation 40
heights <- rnorm(10000, mean = 170, sd = 40)
hist(heights)
# two non-correlated samples, drawn independently of each other:
n <- 1000
y <- rnorm(n)
x <- rnorm(n)
plot(x, y)
cor(x,y) # correlation lies between -1 and 1, where 0 means 'no' correlation
# a null relationship has correlation near 0
m1 <- lm( y ~ x) # linear regression of y on x
summary(m1)
# creating a fixed correlation between two variables
r <- 0.9 # target correlation
x <- rnorm(1000)
# each y is centred on r * x with residual sd sqrt(1 - r^2); this keeps the
# overall variance of y at r^2 + (1 - r^2) = 1 and gives cor(x, y) close to r
y <- rnorm(1000, mean = x * r, sd = sqrt(1 - r^2))
plot(x,y)
cor(x,y)
# creating correlated samples using a custom function
#
# correlator() simulates two variables whose correlation is (approximately) r.
#   r  target correlation; must be a single number between -1 and 1
#   n  number of observations to simulate (default 1000)
# Returns a data frame with `n` rows and the columns `x` and `y`.
# Stops with an error if `r` is not a single numeric value within [-1, 1].
correlator <- function(r, n = 1000) {
  # A vector-valued r would be recycled silently by rnorm(), so insist on a
  # single numeric value before checking its range.
  stopifnot(
    is.numeric(r),
    length(r) == 1L,
    r >= -1,
    r <= 1
  )
  x <- rnorm(n)
  # y is centred on r * x; the residual sd sqrt(1 - r^2) keeps var(y) at 1
  y <- rnorm(n, mean = r * x, sd = sqrt(1 - r^2))
  data.frame(x = x, y = y)
}
# using the correlator function to create a small data set (40 observations)
# with a target correlation of 0
df <- correlator(0, n = 40)
cor(df$x, df$y) # the sample correlation; compare it with the target of 0
plot(df$x, df$y)
m2 <- lm(y ~ x, data = df) # linear regression of y on x
summary(m2)
abline(m2) # add the fitted regression line to the scatter plot
### sample sizes and false positive ###
# create a regression dataset with different r values (e.g. negative) and play around with n
n_obs <- 200
r <- -0.5
# 100 independent random predictors; only column 50 is used to generate y
x <- matrix(rnorm(n_obs * 100), nrow = n_obs, ncol = 100)
y <- rnorm(n_obs, mean = r * x[, 50], sd = sqrt(1 - r^2))
m3 <- lm(y ~ x)
summary(m3)
# identify effects above a threshold
effect_threshold <- 0.016
# Drop the intercept (first coefficient) so that the positions returned by
# which() correspond directly to columns of x.
slopes <- coef(m3)[-1]
big_effects <- which(abs(slopes) >= effect_threshold)
# drop = FALSE keeps a matrix even if only one column is selected
x2 <- x[, big_effects, drop = FALSE]
m3_2 <- lm(y ~ x2)
summary(m3_2)
# Fresh random predictors that were not used to generate y: refit using the
# same column positions that were selected above.
x_new <- matrix(rnorm(n_obs * 100), nrow = n_obs, ncol = 100)
x2_new <- x_new[, big_effects, drop = FALSE]
m3_preregistered <- lm(y ~ x2_new)
summary(m3_preregistered)