Skip to content

Commit 5413258

Browse files
committed
Adding completed metagenomics code
1 parent c0d1070 commit 5413258

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

57 files changed

+108942
-0
lines changed

.DS_Store

0 Bytes
Binary file not shown.

Completed Code/Metagenomics/.DS_Store

6 KB
Binary file not shown.

Completed Code/Metagenomics/Data/Fall_Allegheny_1.txt

+2,225
Large diffs are not rendered by default.

Completed Code/Metagenomics/Data/Fall_Allegheny_2.txt

+1,737
Large diffs are not rendered by default.

Completed Code/Metagenomics/Data/Fall_Allegheny_3.txt

+1,521
Large diffs are not rendered by default.

Completed Code/Metagenomics/Data/Fall_Braddock_1.txt

+2,118
Large diffs are not rendered by default.

Completed Code/Metagenomics/Data/Fall_Braddock_2.txt

+2,918
Large diffs are not rendered by default.

Completed Code/Metagenomics/Data/Fall_Braddock_3.txt

+2,341
Large diffs are not rendered by default.

Completed Code/Metagenomics/Data/Fall_Control_1.txt

+2,724
Large diffs are not rendered by default.

Completed Code/Metagenomics/Data/Fall_Control_2.txt

+2,415
Large diffs are not rendered by default.

Completed Code/Metagenomics/Data/Fall_Control_3.txt

+3,267
Large diffs are not rendered by default.

Completed Code/Metagenomics/Data/Fall_Monongahela_1.txt

+2,139
Large diffs are not rendered by default.

Completed Code/Metagenomics/Data/Fall_Monongahela_2.txt

+2,701
Large diffs are not rendered by default.

Completed Code/Metagenomics/Data/Fall_Monongahela_3.txt

+2,160
Large diffs are not rendered by default.

Completed Code/Metagenomics/Data/Fall_Neville_1.txt

+1,753
Large diffs are not rendered by default.

Completed Code/Metagenomics/Data/Fall_Neville_2.txt

+1,767
Large diffs are not rendered by default.

Completed Code/Metagenomics/Data/Fall_Neville_3.txt

+2,040
Large diffs are not rendered by default.

Completed Code/Metagenomics/Data/Fall_Sharpsburg_1.txt

+1,234
Large diffs are not rendered by default.

Completed Code/Metagenomics/Data/Fall_Sharpsburg_2.txt

+2,375
Large diffs are not rendered by default.

Completed Code/Metagenomics/Data/Fall_Sharpsburg_3.txt

+2,088
Large diffs are not rendered by default.

Completed Code/Metagenomics/Data/Fall_Sharpsburg_4.txt

+1,897
Large diffs are not rendered by default.

Completed Code/Metagenomics/Data/Spring_Braddock_1.txt

+5,254
Large diffs are not rendered by default.

Completed Code/Metagenomics/Data/Spring_Braddock_2.txt

+3,413
Large diffs are not rendered by default.

Completed Code/Metagenomics/Data/Spring_Control_1.txt

+617
Large diffs are not rendered by default.

Completed Code/Metagenomics/Data/Spring_Monongahela_1.txt

+3,533
Large diffs are not rendered by default.

Completed Code/Metagenomics/Data/Spring_Monongahela_2.txt

+5,572
Large diffs are not rendered by default.

Completed Code/Metagenomics/Data/Spring_Neville_1.txt

+2,443
Large diffs are not rendered by default.

Completed Code/Metagenomics/Data/Spring_Sharpsburg_1.txt

+2,484
Large diffs are not rendered by default.

Completed Code/Metagenomics/Data/Summer_Allegheny_1.txt

+1,790
Large diffs are not rendered by default.

Completed Code/Metagenomics/Data/Summer_Braddock_1.txt

+2,770
Large diffs are not rendered by default.

Completed Code/Metagenomics/Data/Summer_Braddock_2.txt

+2,094
Large diffs are not rendered by default.

Completed Code/Metagenomics/Data/Summer_Monongahela_1.txt

+2,437
Large diffs are not rendered by default.

Completed Code/Metagenomics/Data/Summer_Neville_1.txt

+2,552
Large diffs are not rendered by default.

Completed Code/Metagenomics/Data/Summer_Neville_2.txt

+2,781
Large diffs are not rendered by default.

Completed Code/Metagenomics/Data/Summer_Sharpsburg_1.txt

+2,748
Large diffs are not rendered by default.

Completed Code/Metagenomics/Data/Summer_Sharpsburg_2.txt

+3,290
Large diffs are not rendered by default.

Completed Code/Metagenomics/Data/Winter_Allegheny_2.txt

+1,944
Large diffs are not rendered by default.

Completed Code/Metagenomics/Data/Winter_Allegheny_3.txt

+2,307
Large diffs are not rendered by default.

Completed Code/Metagenomics/Data/Winter_Braddock_1.txt

+523
Large diffs are not rendered by default.

Completed Code/Metagenomics/Data/Winter_Braddock_2.txt

+1,142
Large diffs are not rendered by default.

Completed Code/Metagenomics/Data/Winter_Braddock_3.txt

+1,420
Large diffs are not rendered by default.

Completed Code/Metagenomics/Data/Winter_Control_2.txt

+1,039
Large diffs are not rendered by default.

Completed Code/Metagenomics/Data/Winter_Monongahela_1.txt

+1,162
Large diffs are not rendered by default.

Completed Code/Metagenomics/Data/Winter_Monongahela_2.txt

+1,480
Large diffs are not rendered by default.

Completed Code/Metagenomics/Data/Winter_Monongahela_3.txt

+1,299
Large diffs are not rendered by default.

Completed Code/Metagenomics/Data/Winter_Neville_1.txt

+2,631
Large diffs are not rendered by default.

Completed Code/Metagenomics/Data/Winter_Neville_2.txt

+1,650
Large diffs are not rendered by default.

Completed Code/Metagenomics/Data/Winter_Neville_3.txt

+1,630
Large diffs are not rendered by default.

Completed Code/Metagenomics/Data/Winter_Sharpsburg_1.txt

+1,497
Large diffs are not rendered by default.

Completed Code/Metagenomics/Data/Winter_Sharpsburg_2.txt

+1,439
Large diffs are not rendered by default.

Completed Code/Metagenomics/Data/Winter_Sharpsburg_3.txt

+2,221
Large diffs are not rendered by default.

Completed Code/Metagenomics/DataViz.R

+91
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
# This R script takes a distance matrix between genomics samples as input.
2+
# It produces two plots. First, a heat map of the distances in the samples. Second, a PCoA plot
3+
# colored by the season.
4+
5+
# Import needed libraries. Please install these in R beforehand using install.packages("package_name")
6+
7+
library(ggcorrplot)
8+
library(reshape)
9+
library(stringr)
10+
library(ggplot2)
11+
library(ape) #ape library to compute PCoA of our matrix
12+
13+
# Now set working directory. This should be wherever the files are stored and is the only line that the user needs to edit.
14+
15+
# EDIT THIS LINE
16+
setwd("/Users/phillipcompeau/go/src/Metagenomics_Final")
17+
18+
# Part 1: generate box plots of evenness by season using Simpson's index.
19+
20+
# first, read in the table
21+
22+
simpsonTable <- read.csv(file="Matrices/SimpsonMatrix.csv")
23+
24+
simpsonColumns <- data.frame(simpsonTable)
25+
26+
# Add column to our data frame to represent the season
27+
Season <- sub("\\_.*", "", simpsonColumns$Sample) # parse out just the season from sample name
28+
cbind(simpsonColumns, Season) # adding column
29+
30+
# Now we plot box plots where we have evenness lumped by season.
31+
ggplot(simpsonColumns, aes(x=Season, y=SimpsonsIndex, fill=Season)) + geom_boxplot() +ggtitle("Simpson's Index over season")
32+
ggsave("Plots/SimpsonsBoxPlots.png")
33+
34+
35+
# Part 2: generate a heat map of the distance values
36+
37+
# Read in the file and process the table.
38+
table <- read.csv(file="Matrices/BetaDiversityMatrix.csv")
39+
40+
# trim the first column out because it only contains names
41+
table <- table[-c(1)]
42+
43+
matrix <- as.matrix(table)
44+
45+
# the following code is just all the technical stuff to build a heatmap out of the distance matrix.
46+
co=melt(matrix)
47+
head(co)
48+
ggplot(co, aes(X1, X2)) + # x and y axes => Var1 and Var2
49+
geom_tile(aes(fill = value)) + # background colours are mapped according to the value column
50+
scale_fill_gradient2(low = "#6D9EC1",
51+
mid = "white",
52+
high = "#E46726",
53+
midpoint = 0.5, limit= c(0,1.0)) +
54+
theme(panel.grid.major.x=element_blank(), #no gridlines
55+
panel.grid.minor.x=element_blank(),
56+
panel.grid.major.y=element_blank(),
57+
panel.grid.minor.y=element_blank(),
58+
panel.background=element_rect(fill="white"), # background=white
59+
axis.text.x = element_text(angle=90, hjust = 1,vjust=1,size = 8,face = "bold"),
60+
plot.title = element_text(size=20,face="bold"),
61+
axis.text.y = element_text(size = 8,face = "bold")) +
62+
ggtitle("Distance Heat Map") +
63+
theme(legend.title=element_text(face="bold", size=14)) +
64+
scale_x_discrete(name="") +
65+
scale_y_discrete(name="")
66+
ggsave("Plots/HeatMap.png")
67+
68+
# Part 3: generate a PCoA plot of the data and color-code by season.
69+
70+
# Read in the file and process the table.
71+
table2 <- read.csv(file="Matrices/BetaDiversityMatrix.csv")
72+
73+
#trim the first column out because it only contains names
74+
table2 <- table2[-c(1)]
75+
table2 <- table2[, -50] # trim out weird extra column at end of the matrix file
76+
77+
matrix2 <- as.matrix(table2)
78+
79+
pcoa_data <- pcoa(matrix2, correction="none", rn=NULL)
80+
pcoa_vectors <- data.frame(pcoa_data$vectors)
81+
# columns contains a vector for each point after PCoA tries to assign data points to vectors to preserve distances between points.
82+
83+
colnames(table2)
84+
85+
# Add column to our data frame to represent the season
86+
Season2 <- sub("\\_.*", "", colnames(table2)) # parse out just the season from sample name
87+
cbind(pcoa_vectors, Season2) # adding column
88+
89+
# Now, plot the data, colored by season.
90+
ggplot(pcoa_vectors, aes(x=Axis.1, y=Axis.2, color=Season2)) + geom_point()
91+
ggsave("Plots/PCoA.png")
+79
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
package main
2+
3+
import "sort"
4+
5+
// BetaDiversityMatrix takes a map of frequency maps along with a distance metric
6+
// ("Bray-Curtis" or "Jaccard") as input.
7+
// It returns a slice of strings corresponding to the sorted names of the keys
8+
// in the map, along with a matrix of distances whose (i,j)-th
9+
// element is the distance between the i-th and j-th samples using the input metric.
10+
func BetaDiversityMatrix(allMaps map[string](map[string]int), distMetric string) ([]string, [][]float64) {
11+
//first, grab all the sample IDs, put them in a list, and sort them
12+
numSamples := len(allMaps)
13+
14+
sampleNames := make([]string, 0)
15+
16+
//i := 0
17+
//get the sample IDs by ranging through allMaps
18+
for sampleName := range allMaps {
19+
sampleNames = append(sampleNames, sampleName)
20+
//sampleNames[i] = sampleName
21+
//i++
22+
}
23+
24+
//sort the sample IDs
25+
sort.Strings(sampleNames)
26+
27+
//build the beta diversity matrix
28+
mtx := InitializeSquareMatrix(numSamples)
29+
30+
//diagonal is good to go because these entries should be 0.0
31+
32+
//range through the matrix and set mtx[i][j] equal to beta diversity distance between sampleNames[i] and sampleNames[j]
33+
34+
for i := range mtx {
35+
for j := i + 1; j < len(mtx[i]); j++ {
36+
//grab the two maps we need
37+
freqMap1 := allMaps[sampleNames[i]]
38+
freqMap2 := allMaps[sampleNames[j]]
39+
var d float64 // will store distance
40+
//which metric we using?
41+
if distMetric == "Bray-Curtis" {
42+
d = BrayCurtisDistance(freqMap1, freqMap2)
43+
} else if distMetric == "Jaccard" {
44+
d = JaccardDistance(freqMap1, freqMap2)
45+
} else {
46+
panic("Error: invalid distance metric given.")
47+
}
48+
mtx[i][j] = d
49+
//also enter the corresponding symmetric value
50+
mtx[j][i] = d
51+
}
52+
}
53+
return sampleNames, mtx
54+
}
55+
56+
// SimpsonsMap takes an array of frequency maps as input. It returns a
57+
// map mapping each sample name to its Simpson's index.
58+
func SimpsonsMap(allMaps map[string](map[string]int)) map[string]float64 {
59+
s := make(map[string]float64)
60+
61+
for sampleName, freqMap := range allMaps {
62+
s[sampleName] = SimpsonsIndex(freqMap)
63+
}
64+
65+
return s
66+
}
67+
68+
// RichnessMap takes a map of frequency maps as input. It returns a map
69+
// whose values are the richness of each sample.
70+
func RichnessMap(allMaps map[string](map[string]int)) map[string]int {
71+
r := make(map[string]int)
72+
73+
//range over sample IDs and set associated richness value
74+
for sampleID, freqMap := range allMaps {
75+
r[sampleID] = Richness(freqMap)
76+
}
77+
78+
return r
79+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,162 @@
1+
package main
2+
3+
// BrayCurtisDistance takes two frequency maps and returns the Bray-Curtis
4+
// distance between them.
5+
func BrayCurtisDistance(map1 map[string]int, map2 map[string]int) float64 {
6+
c := SumOfMinima(map1, map2)
7+
s1 := SampleTotal(map1)
8+
s2 := SampleTotal(map2)
9+
10+
//don't allow the situation in which we have zero richness.
11+
if s1 == 0 || s2 == 0 {
12+
panic("Error: sample given to BrayCurtisDistance() has no positive values.")
13+
}
14+
15+
av := Average(float64(s1), float64(s2))
16+
return 1 - (float64(c) / av)
17+
}
18+
19+
// JaccardDistance takes two frequency maps and returns the Jaccard
20+
// distance between them.
21+
func JaccardDistance(map1 map[string]int, map2 map[string]int) float64 {
22+
c := SumOfMinima(map1, map2)
23+
t := SumOfMaxima(map1, map2)
24+
j := 1 - (float64(c) / float64(t))
25+
return j
26+
}
27+
28+
// SumOfMaxima takes two frequency maps as input.
29+
// It identifies the keys that are shared by the two frequency maps
30+
// and returns the sum of the corresponding values. (a.k.a. "union")
31+
// SumOfMaxima takes two frequency maps as input.
32+
// It identifies the keys that are shared by the two frequency maps
33+
// and returns the sum of the corresponding values. (a.k.a. "union")
34+
func SumOfMaxima(map1 map[string]int, map2 map[string]int) int {
35+
c := 0
36+
37+
for key := range map2 {
38+
_, exists := map1[key]
39+
if exists {
40+
c += Max2(map1[key], map2[key])
41+
} else {
42+
c += map2[key]
43+
}
44+
}
45+
for key := range map1 {
46+
_, exists := map2[key]
47+
if !exists {
48+
c += map1[key]
49+
}
50+
}
51+
52+
// panic if c is equal to zero since we don't want 0/0
53+
if c == 0 {
54+
panic("Error: no species common to two maps given to SumOfMaxima")
55+
}
56+
57+
return c
58+
}
59+
60+
// Max2 takes two integers and returns their maximum.
61+
func Max2(n1, n2 int) int {
62+
if n1 < n2 {
63+
return n2
64+
}
65+
return n1
66+
}
67+
68+
// Richness takes a frequency map. It returns the richness of the frequency map
69+
// (i.e., the number of keys in the map corresponding to nonzero values.)
70+
func Richness(sample map[string]int) int {
71+
c := 0
72+
73+
for _, val := range sample {
74+
if val < 0 {
75+
panic("Error: negative value in frequency map given to Richness()")
76+
}
77+
c++
78+
}
79+
80+
return c
81+
}
82+
83+
// SimpsonsIndex takes a frequency map and returns a decimal corresponding to Simpson's index:
84+
// the sum of (n/N)^2 where n is the number of individuals found for a given string/species
85+
// and N is the total number of individuals. The sum is over all species present.
86+
func SimpsonsIndex(sample map[string]int) float64 {
87+
total := SampleTotal(sample)
88+
simpson := 0.0
89+
90+
if total == 0 {
91+
panic("Error: Empty frequency map given to SimpsonsIndex()!")
92+
}
93+
94+
for _, val := range sample {
95+
x := float64(val) / float64(total)
96+
simpson += x * x
97+
}
98+
return simpson
99+
}
100+
101+
// InitializeSquareMatrix takes an integer n and returns an nxn slice of
102+
// floats initialized to 0.0.
103+
func InitializeSquareMatrix(n int) [][]float64 {
104+
mtx := make([][]float64, n)
105+
106+
for i := range mtx {
107+
mtx[i] = make([]float64, n)
108+
}
109+
return mtx
110+
}
111+
112+
// FrequencyMap forms the frequency map of a collection of input patterns.
113+
// Input: one collection of strings patterns
114+
// Output: a frequency map of strings to their # of counts in patterns
115+
func FrequencyMap(patterns []string) map[string]int {
116+
freqMap := make(map[string]int)
117+
for _, val := range patterns {
118+
freqMap[val]++
119+
}
120+
return freqMap
121+
}
122+
123+
// Average takes two floats and returns their average.
124+
func Average(x, y float64) float64 {
125+
return (x + y) / 2.0
126+
}
127+
128+
// SampleTotal takes a frequency map as input.
129+
// It returns the sum of the counts for each string in a sample.
130+
func SampleTotal(freqMap map[string]int) int {
131+
total := 0
132+
for _, val := range freqMap {
133+
if val > 0 {
134+
total += val
135+
}
136+
}
137+
return total
138+
}
139+
140+
// SumOfMinima takes two frequency maps as input.
141+
// It identifies the keys that are shared by the two frequency maps
142+
// and returns the sum of the corresponding values.
143+
func SumOfMinima(map1 map[string]int, map2 map[string]int) int {
144+
c := 0
145+
146+
for key := range map1 {
147+
_, exists := map2[key]
148+
if exists {
149+
c += Min2(map1[key], map2[key])
150+
}
151+
}
152+
153+
return c
154+
}
155+
156+
// Min2 takes two integers and returns their minimum.
157+
func Min2(n1, n2 int) int {
158+
if n1 < n2 {
159+
return n1
160+
}
161+
return n2
162+
}

Completed Code/Metagenomics/io.go

+17
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
package main
2+
3+
// ReadSamplesFromDirectory has the following input and output
4+
// Input: a collection of filenames. Each file has one string on each line
5+
// Output: a map whose keys are the sample names and whose values are the frequency map at that site.
6+
7+
// ReadFrequencyMapFromFile
8+
// Input: name of file which contains one string on each line
9+
// Output: the frequency map of strings in the file.
10+
11+
//WriteBetaDiversityMatrixToFile writes a distance matrix to a file,
12+
//with sample names labeling the initial row and column.
13+
//Input: a distance matrix, a collection of sample names corresponding to the distance matrix, and a file name.
14+
//Output: (no output)
15+
16+
//WriteSimpsonsMapToFile writes a given map of Simpson indices
17+
//to a file corresponding to an input filename.

Completed Code/Metagenomics/main.go

+11
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
package main
2+
3+
import (
4+
"fmt"
5+
)
6+
7+
func main() {
8+
fmt.Println("Metagenomics!")
9+
10+
// fill in your code here.
11+
}

Starter Code/.DS_Store

0 Bytes
Binary file not shown.

0 commit comments

Comments
 (0)