Skip to content

Commit 06baf28

Browse files
committed
extensive rewrite of README
1 parent 779356a commit 06baf28

File tree

7 files changed

+336
-266
lines changed

7 files changed

+336
-266
lines changed

DESCRIPTION

+1
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ Encoding: UTF-8
2222
Imports:
2323
ape (>= 5.3),
2424
digest (>= 0.6.25),
25+
igraph (>= 1.3),
2526
jsonlite (>= 1.7.2),
2627
phangorn (>= 0.7-70),
2728
phytools (>= 1.0-3)

R/graph-clustering.R

+12-10
Original file line numberDiff line numberDiff line change
@@ -12,24 +12,26 @@ require(igraph)
1212
#' @param setID: A numeric identifier for this cluster set.
1313
#' @param time.var: character, column name for discrete time variable to fit
1414
#' a model of edge density decay with time (optional)
15+
#' @param adjusted: logical, passed to fit.decay(), only used with time.var
1516
#' @return data.frame, known cases annotated with cluster ID and growth
1617
#' @export
17-
component.cluster <- function(obj, dist.thresh, setID=0, time.var=NA) {
18+
component.cluster <- function(obj, dist.thresh, setID=0, time.var=NA,
19+
adjusted=TRUE) {
1820
# Filter edges above the distance threshold
1921
filtered.edges <- obj$edge.info[obj$edge.info$Distance <= dist.thresh, ]
2022

2123
# unconnected vertices will be induced by maximum numeric vertex ID of edgelist
22-
g <- graph_from_edgelist(as.matrix(filtered.edges[c("ID1", "ID2")]),
24+
g <- igraph::graph_from_edgelist(as.matrix(filtered.edges[c("ID1", "ID2")]),
2325
directed=FALSE)
2426

2527
# append vertices with numeric IDs above maximum ID in edgelist
26-
if (length(V(g)) < nrow(obj$seq.info)) {
27-
orphans <- seq(max(V(g))+1, nrow(obj$seq.info))
28-
g <- add_vertices(g, length(orphans))
28+
if (length(igraph::V(g)) < nrow(obj$seq.info)) {
29+
orphans <- seq(max(igraph::V(g))+1, nrow(obj$seq.info))
30+
g <- igraph::add_vertices(g, length(orphans))
2931
}
3032

3133
# extract connected components from graph
32-
comps <- components(g)
34+
comps <- igraph::components(g)
3335

3436
# label sequences with cluster indices
3537
obj$seq.info$Cluster <- comps$membership
@@ -46,15 +48,15 @@ component.cluster <- function(obj, dist.thresh, setID=0, time.var=NA) {
4648
cluster.set$New <- NULL # should be all FALSE
4749
cluster.set <- cluster.set[order(ClusterID),]
4850

49-
# fit edge probability decay model (e.g., fit.decay="colyear")
51+
# fit edge probability decay model
5052
if (!is.na(time.var)) {
5153
if (!is.element(time.var, names(obj$seq.info)))
5254
stop(time.var, "is not a variable in obj$seq.info!")
5355
# fit binomial model to bipartite graph
54-
weights <- fit.decay(
55-
obj, times=obj$seq.info[[time.var]],
56+
weights <- fit.decay(obj, times=obj$seq.info[[time.var]],
5657
dist.thresh=dist.thresh, adjusted=adjusted)
57-
cluster.set$Weight <- split(weights[!obj$seq.info$New], obj$seq.info$Cluster[!obj$seq.info$New])
58+
cluster.set$Weight <- split(weights[!obj$seq.info$New],
59+
obj$seq.info$Cluster[!obj$seq.info$New])
5860
}
5961

6062
# Attach growth info and set ID

R/tree-clustering.R

+4-3
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ step.cluster <- function(obj, branch.thresh, boot.thresh, setID=0,
3131

3232
# assign cluster memberships for all nodes in tree (not include new tips)
3333
phy <- assign.sstrees(obj, branch.thresh, boot.thresh)
34+
ntips <- ape::Ntip(phy)
3435

3536
# build a data table of known cases (i.e., not new cases)
3637
seq.cols <- colnames(phy$seq.info)
@@ -43,8 +44,7 @@ step.cluster <- function(obj, branch.thresh, boot.thresh, setID=0,
4344

4445
# collect descendants for each known case to calculate cluster sizes
4546
des <- sapply(
46-
split(phy$node.info$Descendants[1:Ntip(phy)],
47-
phy$node.info$Cluster[1:Ntip(phy)]),
47+
split(phy$node.info$Descendants[1:ntips], phy$node.info$Cluster[1:ntips]),
4848
function(x) unique(unlist(x))
4949
)
5050
cluster.set[, "Descendants" := des]
@@ -140,7 +140,8 @@ assign.sstrees <- function(phy, branch.thresh, boot.thresh, debug=FALSE) {
140140

141141
# cluster assignments for tips only (including "new" sequences)
142142
phy$seq.info$Cluster <- 0
143-
phy$seq.info$Cluster[!phy$seq.info$New] <- phy$node.info$Cluster[1:Ntip(phy)]
143+
ntips <- ape::Ntip(phy)
144+
phy$seq.info$Cluster[!phy$seq.info$New] <- phy$node.info$Cluster[1:ntips]
144145

145146
phy$growth.info[, "Cluster" := phy$node.info[
146147
phy$growth.info$NeighbourNode, Cluster]

R/tree-setup.R

+1-1
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ require(data.table)
1717
#' @export
1818
import.tree <- function(phy, seq.info=data.table(), keep_root=FALSE, quiet=FALSE) {
1919
# Midpoint root for consistency and resolve multichotomies
20-
if (is.rooted(phy)) {
20+
if (ape::is.rooted(phy)) {
2121
if (!keep_root) {
2222
cat(paste("Re-rooting tree at midpoint. To retain original root, re-run",
2323
"with keep_root=TRUE."))

README.Rmd

+71-83
Original file line numberDiff line numberDiff line change
@@ -110,9 +110,9 @@ Conventionally, we interpret each connected component of the graph as a cluster.
110110
A connected component is a group of nodes such that (1) every node can be reached from another node through a path of edges, and (1) there are no edges to nodes outside of the group.
111111
Varying the threshold for edges yields different sets of clusters.
112112

113-
In the following example, we start by reading in a sequence alignment, extracting metadata from the sequence labels, and identifying a subset of new sequences:
113+
In the following example, we start by reading in a sequence alignment (a published set of anonymized HIV-1 sequences from Canada), extracting metadata from the sequence labels, and identifying a subset of new sequences:
114114

115-
```{r}
115+
```{r message=FALSE}
116116
require(clustuneR)
117117
seqs <- ape::read.FASTA("data/na.fasta", type="DNA")
118118
@@ -127,128 +127,123 @@ seq.info$colyear <- year(seq.info$coldate)
127127
which.new <- which(seq.info$colyear == max(seq.info$colyear))
128128
```
129129

130-
You may already have these metadata in the form of a tabular data set (*i.e.*, a CSV file), in which case you can simply load these metadata as a data frame.
130+
> You may already have these metadata in the form of a tabular data set (*i.e.*, a CSV file), in which case you can simply load these metadata as a data frame.
131131
132132
Next, we need to load a list of edges, where each row specifies two node labels and a distance.
133133
These data can be generated from a sequence alignment using the program [TN93](https://github.com/veg/tn93).
134+
The resulting output file is enormous (\>34MB), so we do not include it in this package!
134135

135136
```{r}
136137
# load genetic distances (run `tn93 -t 1 -o na.tn93.csv na.fasta`)
137138
edge.info <- read.csv("data/na.tn93.csv")
138-
139139
obj <- read.edges(edge.info, seq.info, which.new)
140140
141141
# generate cluster sets under varying parameter settings
142142
cutoffs <- seq(0, 0.04, length.out=50)
143-
param.list <- lapply(1:50, function(i) {
144-
list(dist.thresh=cutoffs[i], setID=i, time.var="colyear")
143+
param.list <- lapply(cutoffs, function(x) {
144+
list(dist.thresh=x, time.var="colyear")
145145
})
146146
cluster.sets <- multi.cluster(obj, param.list, component.cluster)
147+
```
147148

148-
res <- fit.analysis(cluster.sets, predictive.models=p.models,
149-
predictor.transformations=p.trans)
150-
AICs <- get.AIC(res)
151-
delta.AIC <- AICs$TimeModelAIC - AICs$NullModelAIC
149+
By specifying a `time.var` argument in `param.list`, we are fitting a model to the distribution of sample collection years to predict edges between cases.
150+
For a more detailed explanation of this method, please refer to the vignettes.
152151

153-
cutoffs <- sapply(param.list, function(x) x$dist.thresh)
154-
par(mar=c(5,5,1,1))
155-
plot(cutoffs, delta.AIC, type='l', col='cadetblue', lwd=2)
156-
abline(h=0, lty=2)
152+
The last step of the analysis is to fit regression models to the distribution of new cases among clusters.
153+
154+
```{r}
155+
ptrans <- list("Weight"=sum)
156+
pmods <- list(
157+
"NullModel"=function(x) glm(Growth~Size, data=x, family="poisson"),
158+
"AltModel"=function(x) glm(Growth~Weight, data=x, family="poisson")
159+
)
160+
res <- fit.analysis(cluster.sets, models=pmods, transforms=ptrans)
161+
gaic <- get.AIC(res, param.list)
157162
```
158163

159-
### Building a tree
164+
Here, `gaic` is a data frame that stores the key result of our analysis - the AIC values associated with the two models under varying clustering thresholds.
165+
The optimal TN93 distance cutoff is identified by the greatest difference between the AICs of the alternative and null models, which we can visualize as a plot:
160166

161-
We start with a multiple sequence alignment of sequences that are labelled with sample collection dates.
162-
An example of anonymized public domain HIV-1 sequences from a study based in northern Alberta (Canada) is provided in `data/na.fasta`.
163-
First, we use an R script to exclude the sequences collected in the most recent year:
167+
```{r fig.width=4, fig.height=4}
168+
par(mar=c(5,5,1,1))
169+
plot(cutoffs, gaic$AltModel - gaic$NullModel, type='l',
170+
lwd=2, col='cadetblue',
171+
xlab="TN93 distance cutoffs", ylab="delta-AIC")
172+
abline(h=0, lty=2)
173+
```
164174

165-
```{r message=FALSE}
166-
require(clustuneR)
167-
require(ape)
168-
require(lubridate)
175+
### Tree-based clustering
169176

170-
setwd("~/git/clustuneR")
171-
seqs <- ape::read.FASTA("data/na.fasta", type="DNA")
177+
A phylogenetic tree is a hypothesis about how different populations are related by their common ancestors.
178+
In the context of molecular epidemiology, the ancestral nodes in a tree relating different infections can approximate transmission events in the past.
179+
Thus, a cluster of sequences connected by short branches in the tree may represent an outbreak.
172180

173-
# parse sequence headers (alternatively import from another file)
174-
seq.info <- parse.headers(names(seqs), sep="_", var.names=c('accession', 'coldate', 'subtype'),
175-
var.transformations=c(as.character, as.Date, as.factor))
181+
As above, we start the same set of anonymized HIV-1 sequences in `data/na.fasta`.
182+
First, we generate a new alignment excluding any sequences collected in the most recent year:
176183

177-
max.year <- max(year(seq.info$coldate))
178-
old.seqs <- seqs[year(seq.info$coldate) < max.year]
179-
write.FASTA(old.seqs, file="data/na-old.fasta")
184+
```{r eval=FALSE}
185+
ape::write.FASTA(seqs[-which.new], file="data/na-old.fasta")
180186
```
181187

182-
Next, we use IQ-TREE to reconstruct a maximum likelihood tree relating the "old" sequences:
188+
Next, we use a maximum likelihood program such as [IQ-TREE](http://www.iqtree.org/) to reconstruct a tree relating these "old" sequences:
183189

184190
``` console
185191
iqtree -bb 1000 -m GTR -nstop 200 -s na-old.fasta
186192
```
187193

188-
Note we've specified the generalized time reversible model of nucleotide substitution to bypass the model selection stage.
189-
Even so, this is a time-consuming step - to speed things up, we've provided IQ-TREE output files at `data/na.nwk` and `data/na.log`.
194+
Note we've requested a specific model of nucleotide substitution (GTR) to bypass the model selection stage of this program.
195+
Even so, this is a time-consuming step - to speed things up, we've provided these IQ-TREE output files at `data/na.nwk` and `data/na.log`.
190196

191-
### Grafting new sequences
197+
> **clustuneR** uses a program (`pplacer`) that can work with the outputs of IQ-TREE, [FastTree](http://www.microbesonline.org/fasttree/) and [RAxML](https://cme.h-its.org/exelixis/web/software/raxml/).
198+
> You'll have to specify which ML tree reconstruction program you used in the next step.
192199
193-
Next, we import both the sequence alignment and the ML tree into R.
194-
We will use `clustuneR` to graft the sequences from the most recent year using the program `pplacer` and the output files from IQ-TREE.
200+
Assuming you've kept R running, our next step is to import the ML tree into R.
201+
(If you quit R, you'll have to repeat the previous steps to import the alignment and parse headers.) We can then use `pplacer` to use maximum likelihood to graft the "new" sequences onto this tree.
195202

196-
```{r warning=FALSE}
203+
```{r warning=FALSE, eval=FALSE}
197204
phy <- ape::read.tree("data/na.nwk")
198-
199-
# use pplacer to graft new sequences onto old tree
200-
phy.extend <- extend.tree(phy, seq.info, seqs, mc.cores=4, log.file="data/na.log")
205+
phy <- import.tree(phy, seq.info)
206+
phy.extend <- extend.tree(phy, seqs, log.file="data/na.log")
201207
```
202208

203-
### Finding the optimal threshold
209+
```{r echo=FALSE}
210+
load("data/tree-example.RData")
211+
```
204212

205-
Next, we want to configure `clustuneR` to fit two Poisson regression models to the distribution of new cases among clusters, for a range of genetic distance thresholds:
213+
We can reuse the `cutoffs` vector from the previous example to configure a new parameter list for generating different sets of clusters.
214+
In this case, we have two criteria: (1) a threshold for the total branch length from each tip to the root of a subtree, and (2) the bootstrap support for the subtree:
206215

207216
```{r}
208-
# generate cluster sets under varying parameter settings
209-
param.list <- lapply(seq(0.001, 0.04, 0.001), function(x) list(branch.thresh=x, boot.thresh=0.95))
210-
cluster.sets <- multi.cluster(step.cluster, param.list)
217+
param.list <- lapply(cutoffs, function(x) list(branch.thresh=x, boot.thresh=0.95))
218+
cluster.sets <- multi.cluster(phy.extend, param.list, step.cluster)
219+
```
220+
221+
We also need to specify two different regression models to fit to these sets of clusters.
222+
Unlike our graph clustering example, we are going to simply add the mean sample collection date of sequences in each cluster as a second model term:
211223

212-
# configure Poisson regression models
224+
```{r}
213225
p.models = list(
214-
"NullModel" = function(x){
215-
glm(Growth~Size, data=x, family="poisson")
216-
},
217-
"TimeModel" = function(x){
218-
glm(Growth~Size+coldate, data=x, family="poisson")
219-
}
220-
)
221-
p.trans = list( # average sample collection dates across nodes in each cluster
222-
"coldate" = function(x){mean(x)}
226+
"NullModel"=function(x) glm(Growth~Size, data=x, family="poisson"),
227+
"TimeModel"=function(x) glm(Growth~Size+coldate, data=x, family="poisson")
223228
)
224-
225-
res <- fit.analysis(cluster.sets, predictive.models=p.models,
226-
predictor.transformations=p.trans)
227-
AICs <- get.AIC(res)
228-
delta.AIC <- AICs$TimeModelAIC - AICs$NullModelAIC
229+
# average sample collection dates across nodes in each cluster
230+
p.trans = list("coldate"=mean)
231+
res <- fit.analysis(cluster.sets, models=p.models, transforms=p.trans)
232+
gaic <- get.AIC(res, param.list)
229233
```
230234

231-
We can visualize the difference in AICs between models as a function of the distance threshold:
235+
Finally, we can plot the difference in AIC between the models to select an optimal branch threshold:
232236

233-
```{r}
234-
cutoffs <- sapply(param.list, function(x) x$branch.thresh)
237+
```{r fig.width=4, fig.height=4}
235238
par(mar=c(5,5,1,1))
236-
plot(cutoffs, delta.AIC, type='l', col='cadetblue', lwd=2)
239+
plot(cutoffs, gaic$TimeModel - gaic$NullModel, type='l',
240+
lwd=2, col='cadetblue', xlab="Branch threshold", ylab="delta-AIC")
237241
abline(h=0, lty=2)
238242
```
239243

240-
### Explanation
241-
242-
The optimal distance threshold is associated with the lowest value of `delta.AIC`.
243-
We expect that adding information on sample collection dates should improve our ability to predict where the next infections will occur.
244-
However, this improvement will depend on how we have partitioned the database of known infections into clusters.
245-
If every known infection is merged into a single giant cluster, then there is no meaningful way to predict where new cases will occur, since there is no variation for a sample of one cluster.
246-
If every infection each becomes a cluster of one, then there will be excessive information loss due to random variation in sampling dates.
247-
At the threshold that minimizes `delta.AIC`, the known infections are partitioned into clusters in such a way that minimizes the information loss associated with incorporating sample dates into the predictive model.
248-
249244
## References
250245

251-
If you use `clustuneR` for your work, please cite one of the following references:
246+
If you use **clustuneR** for your work, please cite one of the following references:
252247

253248
- Chato C, Kalish ML, Poon AF. Public health in genetic spaces: a statistical framework to optimize cluster-based outbreak detection.
254249
Virus evolution.
@@ -262,13 +257,6 @@ This package includes the binaries for pplacer and guppy (<https://matsen.fhcrc.
262257

263258
- Matsen FA, Kodner RB, Armbrust EV. pplacer: linear time maximum-likelihood and Bayesian phylogenetic placement of sequences onto a fixed reference tree. BMC bioinformatics. 2010 Dec;11(1):1-6.
264259

265-
As an example, this package includes a subset of a larger published HIV-1 *pol* sequence data set.
266-
These sequences were originally published in a study by Vrancken *et al.* (2017) and publicly accessible in the GenBank database under the PopSet accession `1033910942`.
267-
268-
- Benson DA, Karsch-Mizrachi I, Lipman DJ, Ostell J, Rapp BA, Wheeler DL. GenBank.
269-
Nucleic acids research.
270-
2000 Jan 1;28(1):15-8.
260+
This package includes some anonymized HIV-1 sequences that were placed in the public domain in association with the following publication:
271261

272-
- Vrancken B, Adachi D, Benedet M, Singh A, Read R, Shafran S, Taylor GD, Simmonds K, Sikora C, Lemey P, Charlton CL. The multi-faceted dynamics of HIV-1 transmission in Northern Alberta: A combined analysis of virus genetic and public health data.
273-
Infection, Genetics and Evolution.
274-
2017 Aug 1;52:100-5.
262+
- Vrancken B, Adachi D, Benedet M, Singh A, Read R, Shafran S, Taylor GD, Simmonds K, Sikora C, Lemey P, Charlton CL. The multi-faceted dynamics of HIV-1 transmission in Northern Alberta: A combined analysis of virus genetic and public health data. Infection, Genetics and Evolution. 2017 Aug 1;52:100-5.

0 commit comments

Comments
 (0)