Litzky2017.Rmd

---
title: "Rmd Skeleton"
author: "The methylation in ART meta-analysis group"
date: "`r Sys.Date()`"
output:
  html_document:
    toc: true
    fig_width: 7
    fig_height: 7
    theme: cosmo
---

## Introduction


## Loading packages and functions

These packages will help us to perform vital steps such as normalisation, filtering, 
differential analysis, etc, and provide information about the array probe annotations.


These functions provide shortcuts to help with charts and other analysis. They will
eventually be shoved into another Rscript or package but can stay here for now.

```{r,packages}
# Project helper script; the functions called later in this document
# (e.g. dm_analysis, myscree, run_mitch_1d) are presumably defined there
# -- TODO confirm.
source("meth_functions.R")

# Annotation
ann450k <- getAnnotation(IlluminaHumanMethylation450kanno.ilmn12.hg19)

# Reduced annotation table holding only the columns used downstream.
myann <- data.frame(
  ann450k[, c(
    "UCSC_RefGene_Name",
    "Regulatory_Feature_Group",
    "Islands_Name",
    "Relation_to_Island"
  )]
)

# Row indices of probes whose regulatory feature group mentions "Prom".
promoters <- which(grepl("Prom", myann$Regulatory_Feature_Group))
```


## Data import

TODO- add explanation

```{r,data}
# NOTE(review): hard-coded, machine-specific working directory. Every relative
# path in this chunk is resolved against it, so it is kept unchanged here;
# consider moving it to a document parameter.
setwd("/mnt/mnorris/ART_methylation")

# showWarnings = FALSE: the directory already exists on every run but the first.
dir.create("GSE75248", showWarnings = FALSE)
ARRAY_SAMPLESHEET <- "GSE75248/GSE75248_series_matrix.txt.gz"
# only download it if it is not present on the system
if (!file.exists(ARRAY_SAMPLESHEET)) {
  # The remote file is already gzip-compressed and getGEO() accepts gzipped
  # series matrix files, so it is stored under its real name. (Previously it
  # was saved as "*.gz.gz" and gunzipped, leaving plain text in a file whose
  # name still ended in ".gz".)
  download.file(
    "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE75nnn/GSE75248/matrix/GSE75248_series_matrix.txt.gz",
    destfile = ARRAY_SAMPLESHEET,
    mode = "wb"
  )
}

# Reading in GEO series matrix (same file as above, no duplicated absolute path)
gse <- getGEO(filename = ARRAY_SAMPLESHEET)


ARRAY_DATA <- "GSE75248/GSE75248_RAW.tar"
# only download it if it is not present on the system
if (!dir.exists("GSE75248/IDAT")) {
  dir.create("GSE75248/IDAT")
  # mode = "wb": the URL does not end in a known binary extension, so binary
  # transfer has to be requested explicitly for the tar archive.
  download.file(
    "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE75248&format=file",
    destfile = ARRAY_DATA,
    mode = "wb"
  )
  untar(tarfile = ARRAY_DATA, exdir = "GSE75248/IDAT")
}


baseDir <- "./GSE75248"
targets <- pData(phenoData(gse))


# Derive one basename per array by stripping the channel suffix from the
# IDAT file names found below the data directory.
mylist <- list.files("./GSE75248", pattern = "GSM", recursive = TRUE)
mylist <- gsub("_Red.idat.gz", "", mylist)
mylist <- gsub("_Grn.idat.gz", "", mylist)
mybase <- unique(mylist)

# Attach basenames to samples by GSM accession rather than by position, so a
# different file-listing order or a missing IDAT pair cannot silently pair a
# sample with the wrong array.
# assumes IDAT file names start with "<GSM accession>_" -- TODO confirm
gsm_ids <- sub("_.*$", "", basename(mybase))
idx <- match(as.character(targets$geo_accession), gsm_ids)
if (anyNA(idx)) {
  stop(
    "No IDAT files found for: ",
    paste(targets$geo_accession[is.na(idx)], collapse = ", "),
    call. = FALSE
  )
}
targets$Basename <- paste("GSE75248/", mybase[idx], sep = "")
rgSet <- read.metharray.exp(targets = targets)

```


## Normalisation

```{r,norm,fig.cap="Figure 1. Normalisation of bead-array data with SWAN."}
# Raw preprocessing of the RGChannelSet, followed by SWAN normalisation.
mSet <- preprocessRaw(rgSet)

# SWAN draws a random subset of probes, so the seed is fixed to make the
# normalised values (and everything derived from them) reproducible.
set.seed(42)
mSetSw <- SWAN(mSet, verbose = FALSE)

# Side-by-side density plots of the first sample, before and after SWAN.
par(mfrow = c(1, 2), cex = 0.8)
densityByProbeType(mSet[, 1], main = "Raw")
densityByProbeType(mSetSw[, 1], main = "SWAN")

```

## Filter probes

Here we are running parallel analyses, both including and excluding sex chromosomes.

```{r,filterprobes}
# include sex chromosomes
detP <- detectionP(rgSet)
# detP is computed from rgSet whereas mSetSw has been through preprocessing and
# SWAN; align the rows by probe name before building the mask so it cannot be
# applied to the wrong probes if the row order differs.
detP <- detP[match(featureNames(mSetSw), rownames(detP)), ]
# Keep probes with detection p-value < 0.01 in every sample.
keep <- rowSums(detP < 0.01) == ncol(detP)
mSetSw <- mSetSw[keep, ]

# exclude SNP probes
mSetSw <- mapToGenome(mSetSw)
mSetSw_nosnp <- dropLociWithSnps(mSetSw)
dim(mSetSw)
dim(mSetSw_nosnp)
mSetSw <- mSetSw_nosnp

# exclude sex chromosomes
keep <- !(featureNames(mSetSw) %in% ann450k$Name[ann450k$chr %in% c("chrX", "chrY")])
mSetFlt <- mSetSw[keep, ]

# TODO: exclude cross reactive probes Chen (2013) -- not implemented yet
```

## Extracting Beta and M-values

```{r,beta_m_vals}
# --- Sex chromosomes included (mSetSw) ---
beta <- getBeta(mSetSw)
meth <- getMeth(mSetSw)
unmeth <- getUnmeth(mSetSw)
# M-value with an offset of 100 added to both signal intensities
Mval <- log2((meth + 100) / (unmeth + 100))

# --- Sex chromosomes excluded (mSetFlt) ---
beta_flt <- getBeta(mSetFlt)
meth <- getMeth(mSetFlt)
unmeth <- getUnmeth(mSetFlt)
Mval_flt <- log2((meth + 100) / (unmeth + 100))

```

## Gender Check
```{r,gender}
# Probe IDs annotated to chrX and chrY
cgx <- rownames(ann450k)[ann450k$chr %in% "chrX"]
cgy <- rownames(ann450k)[ann450k$chr %in% "chrY"]

# M-values (sex chromosomes included) restricted to those probes
mvx <- Mval[rownames(Mval) %in% cgx, ]
mvy <- Mval[rownames(Mval) %in% cgy, ]

# Per-sample mean M-value on chrY against chrX, then each on its own
plot(colMeans(mvy), colMeans(mvx))
plot(colMeans(mvy))
plot(colMeans(mvx))

# Probes listed under both chromosomes
intersect(cgx, cgy)
```
## MDS analysis


```{r,scree,fig.cap="Figure 2. Screeplot shows contribution of top principal components to the overall variation in the experiment."}
# Scree plots stacked vertically: with and without sex chromosome probes
par(mfrow = c(2, 1))
myscree(Mval, main = "incl sex chr")
myscree(Mval_flt, main = "excl sex chr")

```


```{r,mds1,fig.width = 8 ,fig.height = 8,fig.cap="Figure 3. MDS plot coloured by IVF status."}
# One colour per IVF status level
sample_groups <- factor(targets$`ivf:ch1`)
colour_palette <- brewer.pal(n = length(levels(sample_groups)), name = "Paired")
colours <- colour_palette[as.integer(sample_groups)]

# Blank panel used only to carry the legend
plot(1, axes = FALSE, xlab = "", ylab = "", main = "MDS by IVF status")
legend(
  "center",
  legend = levels(sample_groups),
  pch = 16,
  cex = 1.2,
  col = colour_palette
)

# The MDS objects are kept so later chunks can re-plot them with other colours
mydist <- plotMDS(
  Mval,
  labels = targets$geo_accession,
  col = colours,
  main = "sex chromosomes included"
)
mydist_flt <- plotMDS(
  Mval_flt,
  labels = targets$geo_accession,
  col = colours,
  main = "sex chromosomes excluded"
)
```


```{r,mds2,fig.width = 8 ,fig.height = 8, fig.cap="Figure 4. MDS plot coloured by sex."}
# One colour per infant sex level
sample_groups <- factor(targets$`infant gender:ch1`)
colour_palette <- brewer.pal(n = length(levels(sample_groups)), name = "Paired")
colours <- colour_palette[as.integer(sample_groups)]

# Blank panel used only to carry the legend
plot(1, axes = FALSE, xlab = "", ylab = "", main = "MDS by sex")
legend(
  "center",
  legend = levels(sample_groups),
  pch = 16,
  cex = 1.2,
  col = colour_palette
)

# Re-plot the MDS objects computed in the mds1 chunk with the new colouring
plotMDS(
  mydist,
  labels = targets$geo_accession,
  col = colours,
  main = "sex chromosomes included"
)
plotMDS(
  mydist_flt,
  labels = targets$geo_accession,
  col = colours,
  main = "sex chromosomes excluded"
)
```

For completeness, we show the MDS by ART and sex (Figure 5). It confirms the trends seen above where the 
samples are split on the x axis by sex and on the y axis by ART classification.

When sex chromosomes are removed, it is clear that ART classification is the dominant source of variance.

```{r,mds3,fig.width = 8 ,fig.height = 8, fig.cap="Figure 5. MDS plot coloured by ART classification and sex."}
# Combined label "<ivf status>_<sex>" for every sample
targets$IVFgender <- paste(targets$`ivf:ch1`, targets$`infant gender:ch1`, sep = "_")

# One colour per combined level
sample_groups <- factor(targets$IVFgender)
colour_palette <- brewer.pal(n = length(levels(sample_groups)), name = "Paired")
colours <- colour_palette[as.integer(sample_groups)]

# Blank panel used only to carry the legend
plot(1, axes = FALSE, xlab = "", ylab = "", main = "MDS by ART and sex")
legend(
  "center",
  legend = levels(sample_groups),
  pch = 16,
  cex = 1.2,
  col = colour_palette
)

# Re-plot the MDS objects computed in the mds1 chunk with the new colouring
plotMDS(
  mydist,
  labels = targets$geo_accession,
  col = colours,
  main = "sex chromosomes included"
)
plotMDS(
  mydist_flt,
  labels = targets$geo_accession,
  col = colours,
  main = "sex chromosomes excluded"
)
```

To ascertain whether technical factors like batch effects account for variance on the two main PCs, we
create an MDS by array chip (Figure 6). There appears to be no relationship between array chip number
and the top two PCs in the two MDS plots shown below.

```{r,mds4,fig.width = 8 ,fig.height = 8, fig.cap="Figure 6. MDS plot coloured by array chip."}

```


## Differential analysis

* __NAT vs IVF__: 

* __NAT vs SUB__: .

* __SUB vs IVF__:


The differential analysis is centred around limma to identify differentially methylated probes. 
TopConfects was also run to obtain the probes with the largest confident effect (topconfect).
There are five outputs below:

1. Volcano plot (limma result).

2. Beeswarm plot (top probes by limma p-value).

3. Heatmap (top probes by limma p-value).

4. Beeswarm plot (top probes by topconfect ranking).

5. Heatmap (top probes by topconfect ranking). (TODO)

### 1 NAT vs IVF

```{r,nat_vs_ivf_incl}
# include sex chromosomes

# Samples with a known IVF status and infant sex, restricted to non-subfertile.
# NOTE(review): the "excl" chunk below does not apply the subfertility filter;
# confirm which sample selection is intended for this contrast.
samplesheet <- subset(targets, `ivf:ch1` == "No" | `ivf:ch1` == "Yes")
samplesheet <- subset(samplesheet, `infant gender:ch1` == "Male" | `infant gender:ch1` == "Female")
samplesheet <- subset(samplesheet, `subfertility:ch1` == "No")
# Basename is "GSE75248/<subdir>/<array id>"; keep only the third path component.
samplesheet$Basename <- vapply(strsplit(samplesheet$Basename, "/"), "[[", character(1), 3)

groups <- factor(samplesheet$`ivf:ch1`, levels = c("No", "Yes"))
sex <- factor(samplesheet$`infant gender:ch1`, levels = c("Male", "Female"))

# Use Mval (sex chromosome probes retained) so this run really is the
# "included" analysis; it previously received Mval_flt, the matrix with
# chrX/chrY probes removed.
top_nat_vs_ivf_inc <- dm_analysis(samplesheet = samplesheet,
    sex = sex, groups = groups, mx = Mval, name = "top_nat_vs_ivf_inc",
    myann = myann, beta = beta)


```

```{r,nat_vs_ivf_excl}
# exclude sex chromosomes
samplesheet <- subset(targets, `ivf:ch1` == "No" | `ivf:ch1` == "Yes")
samplesheet <- subset(samplesheet, `infant gender:ch1` == "Male" | `infant gender:ch1` == "Female")
# Basename is "GSE75248/<subdir>/<array id>"; keep only the third path component.
samplesheet$Basename <- vapply(strsplit(samplesheet$Basename, "/"), "[[", character(1), 3)

groups <- factor(samplesheet$`ivf:ch1`, levels = c("No", "Yes"))
sex <- factor(samplesheet$`infant gender:ch1`, levels = c("Male", "Female"))

top_nat_vs_ivf_exc <- dm_analysis(samplesheet = samplesheet,
    sex = sex, groups = groups, mx = Mval_flt, name = "top_nat_vs_ivf_exc",
    myann = myann, beta = beta)

# Allele
top_nat_vs_ivf_exc$dma$unmeth <- "T"
top_nat_vs_ivf_exc$dma$meth <- "C"
# Per-coefficient standard error from the fit: sqrt(s2.post) * stdev.unscaled
top_nat_vs_ivf_exc$fit$SE <- sqrt(top_nat_vs_ivf_exc$fit$s2.post) * top_nat_vs_ivf_exc$fit$stdev.unscaled


# Extract required columns from dma
top_nat_vs_ivf_metal <- top_nat_vs_ivf_exc$dma[, c("Row.names", "meth", "unmeth", "AveExpr", "P.Value")]
head(top_nat_vs_ivf_metal)

# Convert fit outputs to dataframes
fitCE <- as.data.frame(top_nat_vs_ivf_exc$fit$coefficients)
fitCE$Row.names <- row.names(fitCE)
fitCE <- fitCE[, c("Row.names", "groupsYes")]
names(fitCE)[2] <- "coefficient"


fitSE <- as.data.frame(top_nat_vs_ivf_exc$fit$SE)
fitSE$Row.names <- row.names(fitSE)
fitSE <- fitSE[, c("Row.names", "groupsYes")]
names(fitSE)[2] <- "SE"

# Merge Datasets (key stated explicitly; it is the only shared column)
top_nat_vs_ivf_metal <- merge(top_nat_vs_ivf_metal, fitCE, by = "Row.names")
top_nat_vs_ivf_metal <- merge(top_nat_vs_ivf_metal, fitSE, by = "Row.names")

# Number of effective participants
controls <- samplesheet[samplesheet$`ivf:ch1` == "No", ]
cases <- samplesheet[samplesheet$`ivf:ch1` == "Yes", ]
ctrl <- nrow(controls)
cses <- nrow(cases)
Neff <- 4 / ((1 / cses) + (1 / ctrl))

top_nat_vs_ivf_metal$N <- Neff

# Output for Meta-analysis
# Create the output directory first; write.table() fails if it is missing.
dir.create("NATvsIVF", showWarnings = FALSE)
write.table(top_nat_vs_ivf_metal, file = "NATvsIVF/litzky_top_nat_vs_ivf_metal.tsv", sep = "\t", quote = FALSE, row.names = FALSE)


head(top_nat_vs_ivf_exc$dma)
head(top_nat_vs_ivf_exc$dmr)
head(top_nat_vs_ivf_metal)

```


### 2 NAT vs SUB

```{r,nat_vs_sub_excl}
# exclude sex chromosomes
samplesheet <- subset(targets, `subfertility:ch1` == "No" | `subfertility:ch1` == "Yes")
samplesheet <- subset(samplesheet, `infant gender:ch1` == "Male" | `infant gender:ch1` == "Female")
# Basename is "GSE75248/<subdir>/<array id>"; keep only the third path component.
samplesheet$Basename <- vapply(strsplit(samplesheet$Basename, "/"), "[[", character(1), 3)

groups <- factor(samplesheet$`subfertility:ch1`, levels = c("No", "Yes"))
sex <- factor(samplesheet$`infant gender:ch1`, levels = c("Male", "Female"))

top_nat_vs_sub_exc <- dm_analysis(samplesheet = samplesheet,
    sex = sex, groups = groups, mx = Mval_flt, name = "top_nat_vs_sub_exc",
    myann = myann, beta = beta)

# Allele
top_nat_vs_sub_exc$dma$unmeth <- "T"
top_nat_vs_sub_exc$dma$meth <- "C"
# Per-coefficient standard error from the fit: sqrt(s2.post) * stdev.unscaled
top_nat_vs_sub_exc$fit$SE <- sqrt(top_nat_vs_sub_exc$fit$s2.post) * top_nat_vs_sub_exc$fit$stdev.unscaled


# Extract required columns from dma
top_nat_vs_sub_metal <- top_nat_vs_sub_exc$dma[, c("Row.names", "meth", "unmeth", "AveExpr", "P.Value")]
head(top_nat_vs_sub_metal)

# Convert fit outputs to dataframes
fitCE <- as.data.frame(top_nat_vs_sub_exc$fit$coefficients)
fitCE$Row.names <- row.names(fitCE)
fitCE <- fitCE[, c("Row.names", "groupsYes")]
names(fitCE)[2] <- "coefficient"


fitSE <- as.data.frame(top_nat_vs_sub_exc$fit$SE)
fitSE$Row.names <- row.names(fitSE)
fitSE <- fitSE[, c("Row.names", "groupsYes")]
names(fitSE)[2] <- "SE"

# Merge Datasets (key stated explicitly; it is the only shared column)
top_nat_vs_sub_metal <- merge(top_nat_vs_sub_metal, fitCE, by = "Row.names")
top_nat_vs_sub_metal <- merge(top_nat_vs_sub_metal, fitSE, by = "Row.names")

# Number of effective participants
controls <- samplesheet[samplesheet$`subfertility:ch1` == "No", ]
cases <- samplesheet[samplesheet$`subfertility:ch1` == "Yes", ]
ctrl <- nrow(controls)
cses <- nrow(cases)
Neff <- 4 / ((1 / cses) + (1 / ctrl))

top_nat_vs_sub_metal$N <- Neff

# Output for Meta-analysis
# Create the output directory first; write.table() fails if it is missing.
dir.create("NATvsHypo", showWarnings = FALSE)
write.table(top_nat_vs_sub_metal, file = "NATvsHypo/litzky_top_nat_vs_sub_metal.tsv", sep = "\t", quote = FALSE, row.names = FALSE)


head(top_nat_vs_sub_exc$dma)
head(top_nat_vs_sub_exc$dmr)
head(top_nat_vs_sub_metal)
```

### 3 SUB vs IVF

```{r,sub_vs_IVF_excl}
# exclude sex chromosomes
# Combined label "<subfertility>_<ivf>"; the contrast is restricted to
# subfertile samples and compares non-IVF ("Yes_No") with IVF ("Yes_Yes").
targets$subivf <- paste(targets$`subfertility:ch1`, targets$`ivf:ch1`, sep = "_")
samplesheet <- subset(targets, subivf == "Yes_No" | subivf == "Yes_Yes")
# Same sex filter as the other contrasts, so the sex factor below has no NA.
samplesheet <- subset(samplesheet, `infant gender:ch1` == "Male" | `infant gender:ch1` == "Female")
# Basename is "GSE75248/<subdir>/<array id>"; keep only the third path component.
samplesheet$Basename <- vapply(strsplit(samplesheet$Basename, "/"), "[[", character(1), 3)

groups <- factor(samplesheet$subivf, levels = c("Yes_No", "Yes_Yes"))
sex <- factor(samplesheet$`infant gender:ch1`, levels = c("Male", "Female"))

top_sub_vs_ivf_exc <- dm_analysis(samplesheet = samplesheet,
    sex = sex, groups = groups, mx = Mval_flt, name = "top_sub_vs_ivf_exc",
    myann = myann, beta = beta)


# Allele
top_sub_vs_ivf_exc$dma$unmeth <- "T"
top_sub_vs_ivf_exc$dma$meth <- "C"
# Per-coefficient standard error from the fit: sqrt(s2.post) * stdev.unscaled
top_sub_vs_ivf_exc$fit$SE <- sqrt(top_sub_vs_ivf_exc$fit$s2.post) * top_sub_vs_ivf_exc$fit$stdev.unscaled


# Extract required columns from dma
top_sub_vs_ivf_metal <- top_sub_vs_ivf_exc$dma[, c("Row.names", "meth", "unmeth", "AveExpr", "P.Value")]
head(top_sub_vs_ivf_metal)

# Convert fit outputs to dataframes
fitCE <- as.data.frame(top_sub_vs_ivf_exc$fit$coefficients)
fitCE$Row.names <- row.names(fitCE)
fitCE <- fitCE[, c("Row.names", "groupsYes_Yes")]
names(fitCE)[2] <- "coefficient"


fitSE <- as.data.frame(top_sub_vs_ivf_exc$fit$SE)
fitSE$Row.names <- row.names(fitSE)
fitSE <- fitSE[, c("Row.names", "groupsYes_Yes")]
names(fitSE)[2] <- "SE"

# Merge Datasets (key stated explicitly; it is the only shared column)
top_sub_vs_ivf_metal <- merge(top_sub_vs_ivf_metal, fitCE, by = "Row.names")
top_sub_vs_ivf_metal <- merge(top_sub_vs_ivf_metal, fitSE, by = "Row.names")

# Number of effective participants
# The column is `ivf:ch1` (lower case). The previous `IVF:ch1` does not exist,
# which produced zero cases and controls and therefore N = 0 in the output.
controls <- samplesheet[samplesheet$`ivf:ch1` == "No", ]
cases <- samplesheet[samplesheet$`ivf:ch1` == "Yes", ]
ctrl <- nrow(controls)
cses <- nrow(cases)
Neff <- 4 / ((1 / cses) + (1 / ctrl))

top_sub_vs_ivf_metal$N <- Neff

# Output for Meta-analysis
dir.create("SUBvsIVF", showWarnings = FALSE)
# NOTE(review): the file name contains "top_top", unlike the other contrasts;
# left unchanged in case downstream scripts depend on it.
write.table(top_sub_vs_ivf_metal, file = "SUBvsIVF/litzky_top_top_sub_vs_ivf_metal.tsv", sep = "\t", quote = FALSE, row.names = FALSE)


# Show the results of this contrast (previously printed the NAT vs SUB objects)
head(top_sub_vs_ivf_exc$dma)
head(top_sub_vs_ivf_exc$dmr)
head(top_sub_vs_ivf_metal)

```

## Venn diagrams of the differential methylated probes for each contrast

```{r,venn1,fig.cap="Comparison of probes altered by fresh and frozen IVF procedures"}

```


```{r,venn2,fig.cap="Comparison of probes altered by IUI, fresh and frozen IVF procedures"}

```

## Spearman correlations of each contrast


```{r,spearman}
# Build a signed significance score for one contrast.
#
# x: a list whose first element is a data frame with columns `logFC`,
#    `adj.P.Val` and `Row.names`.
#
# Returns a data frame with column `score`, equal to
# sign(logFC) * -log10(adj.P.Val), and column `rn` holding the Row.names
# values. Strongly significant probes get a large magnitude and the sign follows
# the direction of change. The earlier form, sign(logFC) / log10(adj.P.Val),
# flipped the sign and pushed the most significant probes towards zero.
myranks <- function(x) {
  xx <- x[[1]]
  xx$score <- sign(xx$logFC) * -log10(xx$adj.P.Val)
  y <- xx[, "score", drop = FALSE]
  y$rn <- xx$Row.names
  y
}

# Contrasts computed in this document.
mycontrasts <- list(
  "top_nat_vs_ivf_exc" = top_nat_vs_ivf_exc,
  "top_nat_vs_sub_exc" = top_nat_vs_sub_exc,
  "top_sub_vs_ivf_exc" = top_sub_vs_ivf_exc
)
# `nat_vs_subivf_excl` is not created anywhere in this document; include it
# only when it is available so the chunk does not abort on a missing object.
if (exists("nat_vs_subivf_excl")) {
  mycontrasts[["nat_vs_subivf_exc"]] <- nat_vs_subivf_excl
}

# One score column per contrast, joined on the probe identifier
myrnks <- lapply(X = mycontrasts, FUN = myranks)
df <- join_all(myrnks, by = "rn")
rownames(df) <- df$rn
df$rn <- NULL
colnames(df) <- names(mycontrasts)
head(df)

mycors <- cor(df, method = "spearman")

my_palette <- colorRampPalette(c("darkred", "red", "orange", "yellow", "white"))(n = 25)
# `margins` spelled out in full rather than relying on partial matching
heatmap.2(mycors, scale = "none", margins = c(10, 10), cexRow = 0.8, trace = "none", cexCol = 0.8,
    col = my_palette, main = "Spearman correlations")

mycors

```

## Genomic compartment analysis


```{r,compartment}
save.image("litzky2017.Rdata")

head(top_nat_vs_ivf_exc$dma)
dma <- top_nat_vs_ivf_exc$dma
head(dma)
up <- subset(dma, logFC > 0 & adj.P.Val < 0.05)
dn <- subset(dma, logFC < 0 & adj.P.Val < 0.05)

# Tabulate regulatory feature groups for all / up / down probes.
# The up and down subsets are counted on the categories observed in the full
# table, so the three tables always have the same length and order, even when a
# subset is empty or lacks a category. Without this, data.frame() below fails
# or pairs counts with the wrong category.
all <- table(unique(dma)$Regulatory_Feature_Group)
rfg_levels <- names(all)
up <- table(factor(unique(up)$Regulatory_Feature_Group, levels = rfg_levels))
dn <- table(factor(unique(dn)$Regulatory_Feature_Group, levels = rfg_levels))

# Columns are (category, count) pairs for all, up and dn; keep the counts only
xx <- data.frame(all, up, dn)
rownames(xx) <- xx[, 1]
# presumably the first category is the empty string (probes with no regulatory
# feature group) -- TODO confirm
rownames(xx)[1] <- "None"
xx[, c(1, 3, 5)] <- NULL
colnames(xx) <- c("all", "up", "dn")
xx <- rbind(xx, "Total" = colSums(xx))
xx
xx[, 1:2]

save.image("litzky2017.Rdata")
```

## Enrichment analysis

```{r,genesets}
library(mitch)

# gene sets
# Fetch the current Reactome pathway archive, unpack it next to the document
# and import the resulting GMT file.
download.file(
  "https://reactome.org/download/current/ReactomePathways.gmt.zip",
  destfile = "ReactomePathways.gmt.zip"
)
unzip("ReactomePathways.gmt.zip", overwrite = TRUE)
genesets <- gmt_import("ReactomePathways.gmt")
```

# Run 1D enrichment analysis

One dimensional enrichment analysis with REACTOME gene sets using average promoter methylation t-statistic.

```{r,1danalysis}
# NAT vs IVF
top_nat_vs_ivf_mitch <- run_mitch_1d(
  dma = top_nat_vs_ivf_exc$dma,
  name = "top_nat_vs_ivf_mitch"
)
head(top_nat_vs_ivf_mitch, 50)

# NAT vs SUB
top_nat_vs_sub_mitch <- run_mitch_1d(
  dma = top_nat_vs_sub_exc$dma,
  name = "top_nat_vs_sub_mitch"
)
head(top_nat_vs_sub_mitch, 50)

# SUB vs IVF
top_sub_vs_ivf_mitch <- run_mitch_1d(
  dma = top_sub_vs_ivf_exc$dma,
  name = "top_sub_vs_ivf_mitch"
)
head(top_sub_vs_ivf_mitch, 50)
```

# Run multi-dimensional enrichment analysis

```{r,mitch}


```


## References


## Session info

```{r,sessioninfo}
# Record the R version and the attached/loaded package versions for this run.
sessionInfo()

```