Merge pull request #269 from naupaka/main

JasonJWilliamsNY · web-flow · commit 46d1774cc80a · 2024-04-10T11:41:08.000-04:00
Address #120 by adding short description of `$` when it is first used
diff --git a/episodes/03-basics-factors-dataframes.Rmd b/episodes/03-basics-factors-dataframes.Rmd
@@ -151,7 +151,7 @@ for `read.table("file.csv", sep = ",")`. You can see in the help
 documentation that there are several additional variations of 
 `read.table`, such as `read.csv2` to read tables separated by `;` 
 and `read.delim` to read in tables separated by `\t` (tabs). If you know how your table is separated, you can use one of the provided short cuts, 
-but case you run into an unconventional separator you are now equipt with the knowledge to define it in the `sep = ` arugument of `read.table`!
+but in case you run into an unconventional separator you are now equipped with the knowledge to define it in the `sep = ` argument of `read.table`!
 
 
 ::::::::::::::::::::::::::::::::::::::::::::::::::
@@ -214,7 +214,7 @@ new data frame using the `data.frame()` function.
 ```{r, purl=FALSE}
 ## put the first three columns and the sixth column of variants into a new data frame called subset
 
-subset<-data.frame(variants[,c(1:3,6)])
+subset <- data.frame(variants[, c(1:3, 6)])
 ```
 
 Now, let's use the `str()` (structure) function to look a little more closely
@@ -239,12 +239,13 @@ Ok, thats a lot up unpack! Some things to notice.
 Factors are the final major data structure we will introduce in our R genomics
 lessons. Factors can be thought of as vectors which are specialized for
 categorical data. Given R's specialization for statistics, this makes sense since
-categorial and continuous variables are usually treated differently. Sometimes
+categorical and continuous variables are usually treated differently. Sometimes
 you may want to have data treated as a factor, but in other cases, this may be
 undesirable.
 
-Let's see the value of treating some of which are categorical in nature as
-factors. Let's take a look at just the alternate alleles
+Let's explore the value of treating some vectors that are categorical in nature as
+factors. To do this we'll take a look at just the alternate alleles. We can use the `$` operator 
+to access or extract a column by its name in data frames (or to extract objects within named lists).
 
 ```{r, purl=FALSE}
 ## extract the "ALT" column to a new object
@@ -259,11 +260,11 @@ head(alt_alleles)
 ```
 
 There are 801 alleles (one for each row). To simplify, let's look at just the
-single-nuleotide alleles (SNPs). We can use some of the vector indexing skills
+single-nucleotide alleles (SNPs). We can use some of the vector indexing skills
 from the last episode.
 
 ```{r, purl=FALSE}
-snps <- c(alt_alleles[alt_alleles=="A"],
+snps <- c(alt_alleles[alt_alleles == "A"],
   alt_alleles[alt_alleles=="T"],
   alt_alleles[alt_alleles=="G"],
   alt_alleles[alt_alleles=="C"])
@@ -442,19 +443,19 @@ l. `variants[variants$REF == "A",]`
 a.
 
 ```{r}
-variants[1,1]
+variants[1, 1]
 ```
 
 b.
 
 ```{r}
-variants[2,4]
+variants[2, 4]
 ```
 
 c.
 
 ```{r}
-variants[801,29]
+variants[801, 29]
 ```
 
 d.
@@ -476,23 +477,23 @@ head(variants[-1, ])
 f.
 
 ```{r}
-variants[1:4,1]
+variants[1:4, 1]
 ```
 
 g.
 
 ```{r}
-variants[1:10,c("REF","ALT")]
+variants[1:10, c("REF", "ALT")]
 ```
 
 h.
 
 ```{r, echo=TRUE, eval=FALSE}
-variants[,c("sample_id")]
+variants[, c("sample_id")]
 ```
 
 ```{r, echo=FALSE, eval=TRUE}
-head(variants[,c("sample_id")])
+head(variants[, c("sample_id")])
 ```
 
 i.
@@ -520,11 +521,11 @@ head(variants$sample_id)
 l.
 
 ```{r, echo=TRUE, eval=FALSE}
-variants[variants$REF == "A",]
+variants[variants$REF == "A", ]
 ```
 
 ```{r, echo=FALSE, eval=TRUE}
-head(variants[variants$REF == "A",])
+head(variants[variants$REF == "A", ])
 ```
 
 :::::::::::::::::::::::::
@@ -547,7 +548,7 @@ them to a new object name:
 ```{r, purl=FALSE}
 # create a new data frame containing only observations from SRR2584863
 
-SRR2584863_variants <- variants[variants$sample_id == "SRR2584863",]
+SRR2584863_variants <- variants[variants$sample_id == "SRR2584863", ]
 
 # check the dimension of the data frame
 
@@ -842,7 +843,7 @@ H) Save the edited Ecoli\_metadata data frame as "exercise\_solution.csv" in you
 dim(Ecoli_metadata)
 levels(as.factor(Ecoli_metadata$cit))
 table(as.factor(Ecoli_metadata$cit))
-Ecoli_metadata[7,7]
+Ecoli_metadata[7, 7]
 median(Ecoli_metadata$genome_size)
 colnames(Ecoli_metadata)[colnames(Ecoli_metadata) == "sample"] <- "sample_id"
 Ecoli_metadata$genome_size_bp <- Ecoli_metadata$genome_size * 1000000