CornellCAC
diff --git a/‎20_minutes_to_R.Rmd
+537 b/‎20_minutes_to_R.Rmd
+537
diff --git a/‎20_minutes_to_R.nb.html
+3,086 b/‎20_minutes_to_R.nb.html
+3,086
diff --git a/‎CAC SCU R Basics.pptx
1.22 MB b/‎CAC SCU R Basics.pptx
1.22 MB
diff --git a/‎R-basics.Rmd
+222 b/‎R-basics.Rmd
+222
@@ -0,0 +1,222 @@
+---
+title: "R Basics"
+output: html_document
+---
+
+# R Studio Interface
+
+Posit (formerly RStudio, a Public Benefit Corporation) publishes helpful and extremely detailed cheat sheets, e.g. the RStudio IDE cheat sheet: <https://posit.co/wp-content/uploads/2022/10/rstudio-ide-1.pdf>
+
+1.  **Notice:** Working Directory at top of Console
+2.  **Demo:** Start a new R notebook
+3.  **Demo:** Use Packages tab to install a package (tidyverse, titanic, gmodels)
+
+```{r message=FALSE, warning=FALSE}
+# One-time setup: uncomment (remove the leading "# ") to install the package.
+# install.packages("tidyverse")
+
+# library() stops with an error if the package is missing, whereas require()
+# only returns FALSE and lets later chunks fail with confusing messages.
+library(tidyverse)
+```
+
+## Data import
+
+-   **Demo:** Import Dataset Wizard in Upper Tab Pane: Environment
+    -   nutrient.txt (fixed width format) - use base
+
+    -   registration_times.csv (can set some datatypes on import)
+
+### Output generated by Base wizard for Nutrient.txt
+
+```{r paged.print=FALSE}
+# Read nutrient.txt with read.table(). The path is relative to this document
+# (as with registration_times.csv), so the chunk runs on any machine that has
+# the data file next to the .Rmd rather than only in one user's home folder.
+n_df <- read.table("nutrient.txt", quote = "\"", comment.char = "")
+head(n_df)
+names(n_df) # Column names are not great
+```
+
+```{r paged.print=FALSE}
+# Swap the column names for a vector of descriptive ones
+colnames(n_df) <- c(
+  "caseID", "calcium", "iron",
+  "protein", "vitA", "vitC"
+)
+head(n_df)
+str(n_df)
+```
+
+### Output from readr import wizard:
+
+```{r}
+# The format tokens accepted by col_datetime() (e.g. %Y, %m, %H) are documented
+# on readr's parse_datetime help page. parse_date_time is a different function
+# from lubridate with its own `orders` syntax.
+?readr::parse_datetime
+```
+
+```{r}
+# code from import wizard
+# library() errors immediately if readr is not installed; require() would only
+# return FALSE and the read_csv() call below would then fail less clearly.
+library(readr)
+registration_times <- read_csv(
+  "registration_times.csv",
+  col_types = cols(
+    # Parse the timestamp column as a date-time while importing
+    `Registration Time` = col_datetime(format = "%Y-%m-%d %H:%M:%S")
+  )
+)
+```
+
+```{r paged.print=FALSE}
+# Overview of every column, followed by the first few rows
+registration_times %>% summary()
+registration_times %>% head()
+```
+
+```{r}
+# The "org" variable might be better represented as a factor;
+# list its distinct values first:
+unique(registration_times[["org"]])
+```
+
+```{r}
+# Store `org` as a factor with an explicit level order.
+# (Any value not listed in `levels` would become NA.)
+registration_times$org <- factor(
+  registration_times$org,
+  levels = c("wcm", "cu", "other")
+)
+
+# While we are at it, let's rename the first column from `Registration Time` to just `time`:
+names(registration_times)[1] <- "time"
+
+head(registration_times)
+```
+
+## Describing Data
+
+### Numeric data
+
+```{r paged.print=FALSE}
+# Per-column summary statistics for the nutrient data frame
+n_df %>% summary()
+```
+
+```{r}
+# Base R approach using the apply family (see also sapply, lapply).
+# apply() converts the data frame to a matrix first, so this relies on every
+# column of n_df being numeric.
+apply(X = n_df, MARGIN = 2, FUN = mean) # MARGIN = 2 applies the function by column
+apply(X = n_df, MARGIN = 2, FUN = sd)
+```
+
+```{r}
+# Histogram of calcium intake, split into 50 bins
+gg <- ggplot(n_df, aes(x = calcium)) +
+  geom_histogram(bins = 50) +
+  ggtitle("Distribution of Calcium Intake")
+gg
+
+```
+
+```{r}
+# Visual Description
+# library() fails loudly if ggplot2 is unavailable; require() would only
+# return FALSE and the plot call would then fail with a less helpful error.
+library(ggplot2)
+
+# Scatterplot with calcium on the x axis and iron on the y axis
+gg <- ggplot(n_df, aes(x = calcium, y = iron)) +
+  geom_point() +
+  ggtitle("Scatterplot of Iron and Calcium Intake")
+gg
+```
+
+### Categorical Data
+
+```{r}
+# Load the titanic package and copy its training data into `df`.
+# library() errors if the package is not installed, unlike require(), which
+# only returns FALSE.
+library(titanic)
+df <- titanic_train
+str(df)
+head(df)
+```
+
+Again, data types are not as precise as they could be.
+
+`Sex`, `Survived`, and `Pclass` are stored as character, int, and int, but all three are really categorical variables and are better represented as factors.
+
+```{r}
+# Use dplyr verbs with the "pipe" operator `%>%`
+# (un-piped equivalent: head(select(df, Sex, Survived, Pclass)))
+df %>%
+  select(Sex, Survived, Pclass) %>%
+  head()
+df %>%
+  select(Sex, Survived, Pclass) %>%
+  summary()
+```
+
+```{r}
+# Less-than-ideal data types lead to less-than-ideal summaries:
+# before conversion to a factor, the counts are labelled only by the raw codes
+table(df[["Survived"]])
+```
+
+```{r}
+# Recode the three categorical columns as factors
+df$Sex <- factor(df$Sex, levels = c("male", "female"))
+# Attach readable labels to the 0/1 survival codes
+df$Survived <- factor(df$Survived, levels = c(0, 1), labels = c("No", "Yes"))
+# Passenger class is stored as an ordered factor (1 < 2 < 3)
+df$Pclass <- factor(df$Pclass, levels = c(1, 2, 3), ordered = TRUE)
+
+# Check the summary now:
+df %>%
+  select(Sex, Survived, Pclass) %>%
+  summary()
+```
+
+Check for missing data:
+
+```{r}
+# Total number of rows, then the number of NA values in each column.
+# Note: is.na() only flags NA; empty strings in character columns are not counted.
+nrow(df)
+df %>%
+  is.na() %>%
+  colSums()
+```
+
+```{r}
+# One-way frequency tables for single variables
+table(df[["Sex"]])
+table(df[["Survived"]])
+```
+
+#### Table and Prop.table
+
+```{r}
+# Two-way contingency table of Sex by Survived; dnn labels the two dimensions
+sex_surv <- table(df$Sex, df$Survived, dnn = c("Sex", "Survived"))
+sex_surv
+addmargins(sex_surv) # same table with row and column totals appended
+writeLines("")
+
+prop.table(sex_surv, margin = 1) # margin = 1: proportions within each row
+prop.table(sex_surv, margin = 2) # margin = 2: proportions within each column
+prop.table(sex_surv)             # no margin: proportions of the table total
+
+# Row proportions again, rounded to two decimal places
+round(prop.table(sex_surv, margin = 1), digits = 2)
+```
+
+#### CrossTable (gmodels package)
+
+```{r}
+# gmodels package gives output more like SPSS/SAS/STATA
+# (install it first from the Packages tab or with install.packages("gmodels")).
+# library() errors if the package is missing; require() would only return
+# FALSE and CrossTable() would then fail with "could not find function".
+library(gmodels)
+CrossTable(df$Sex, df$Survived, digits = 2, expected = TRUE, chisq = TRUE)
+```
+
+#### Xtabs
+
+```{r}
+# The formula interface refers to columns by name, so list them first:
+colnames(df)
+```
+
+```{r}
+# Three-way cross-tabulation built from a one-sided formula
+surv_class_sex <- xtabs(~ Survived + Pclass + Sex, data = df)
+surv_class_sex
+# ftable() flattens the three-way table into a single two-dimensional layout
+ftable(surv_class_sex)
+```
+
+#### Dplyr
+
+```{r paged.print=FALSE}
+# Survival counts and rates within each class/sex combination.
+# .groups = "drop" discards the grouping that summarize() would otherwise keep
+# (and silences its regrouping message). The final ungroup() returns a plain
+# tibble, so any verb added later is not silently applied per group.
+df %>%
+  group_by(Pclass, Sex, Survived) %>%
+  summarize(n = n(), .groups = "drop") %>%
+  group_by(Pclass, Sex) %>%
+  mutate(Rate = n / sum(n)) %>% # share of each outcome within its class/sex group
+  # filter(Survived == "Yes") %>% # uncomment to keep only the survivors' rows
+  ungroup()
+```
+
+```{r paged.print=FALSE}
+# Mean age by sex: without na.rm, any group containing a missing Age yields NA
+df %>%
+  group_by(Sex) %>%
+  summarize(age = mean(Age))
+# Removing the missing ages first gives a usable mean
+df %>%
+  group_by(Sex) %>%
+  summarize(age = mean(Age, na.rm = TRUE))
+```
+
+#### Regression model
+
+(Note: proper model fitting and interpretation is beyond the scope of this tutorial)
+
+```{r}
+# Logistic regression of survival on sex, passenger class and age.
+# Pclass was converted to an ordered factor earlier in this document, so with
+# R's default contrast settings its terms are reported as polynomial contrasts.
+m1 <- glm(
+  Survived ~ Sex + Pclass + Age,
+  family = "binomial",
+  data = df
+)
+summary(m1)
+```