PoonLab
diff --git a/‎.Rhistory
Lines changed: 28 additions & 510 deletions b/‎.Rhistory
Lines changed: 28 additions & 510 deletions
diff --git a/‎DESCRIPTION
Lines changed: 17 additions & 12 deletions b/‎DESCRIPTION
Lines changed: 17 additions & 12 deletions
diff --git a/‎R/analysis.R
Lines changed: 39 additions & 25 deletions b/‎R/analysis.R
Lines changed: 39 additions & 25 deletions
diff --git a/‎R/data.R
Lines changed: 69 additions & 44 deletions b/‎R/data.R
Lines changed: 69 additions & 44 deletions
diff --git a/‎R/generate.data.R
Lines changed: 2 additions & 2 deletions b/‎R/generate.data.R
Lines changed: 2 additions & 2 deletions
diff --git a/‎R/graph.clustering.R
Lines changed: 10 additions & 8 deletions b/‎R/graph.clustering.R
Lines changed: 10 additions & 8 deletions
@@ -1,17 +1,22 @@
-Package: MountainPlot
+Package: clustuneR
 Title: Molecular clustering implementation and predictive optimization
-Version: 0.0.0.9000
+Version: 1.0
 Authors@R: 
-  person(given = "Connor", family = "Chato", role = c("aut", "cre"), email = "[email protected]")
-Description: This package can take identify clusters from the ape package's 
-  implementation of tree and sequence data. Multiple common definitions of sequence
-  based clusters are implemented as functions, with a cluster neatly standardized 
-  as a row in a data.table object. This also offers some ability to run and test 
-  predictive models on clustered data sets, tracking the effect of known variables 
-  within clusters (ex. time) on outcomes such as cluster growth over time. Several
-  built in functions handle the measurement and definition of cluster growth for
-  this purpose. Optimal clusters(built under a certain set of parameters) can be
-  identified based on predictive model performance.
+    c(person("Connor", "Chato", email = "[email protected]",
+              role = c("aut", "cre")),
+    person("Art", "Poon", role="ths"))
+Description: clustuneR builds clusters from inputted sequence alignments and/or 
+    phylogenetic trees, allowing users to choose between multiple cluster-building 
+    algorithms implemented in the package and tune clustering parameters to produce 
+    informative clusters. The package also takes in meta-data associated with sequences 
+    such as a known collection date or subtype/variant classification. Cluster-level 
+    characteristics, such as the range of collection dates or the most common 
+    subtype/variant within a cluster can also be identified from these. 
+    If a subset of sequences are specified as "New", then clustuneR simulates cluster 
+    growth by building clusters in two stages: first clusters are built from sequences 
+    which are not specified as new, then the new sequences are added to clusters. 
+    Predictive models can then be tested on cluster-level attributes and validated 
+    with growth outcomes, to measure how informative a cluster set is.
 License: `use_gpl3_license()`
 Encoding: UTF-8
 Imports: 
 
@@ -1,14 +1,19 @@
 #' Multiple clusters from a parameter set
 #'
-#' Runs a given clustering method over a range of parameters values to output
-#' a range of cluster sets corresponding to different
-#'
-#' @param cluster.method: A given clustering function such as step.cluster() which produces a set of clusters
-#' @param param.list: A named list of parameter sets, which can act as inputs to cluster.method.
-#' @param rangeID: A unique identifier for the set of rows generated by this run.
-#' If this output is bound to other cluster ranges in a larger analysis, this can disambiguate
+#' Runs a given clustering method (a passed function) over a range of parameter 
+#' values (a list, each entry a named list of parameters for the function).
+#' Collects the data into a single data table with multiple cluster set IDs 
+#' indicating the parameter set used to define clusters and a unique cluster range ID.
+#' 
+#' @param cluster.method: A given clustering function such as step.cluster() which 
+#' produces a set of clusters.
+#' @param param.list: A list, each entry a named list of parameter sets which can 
+#' act as inputs to the cluster.method. These include values such as trees and graphs, 
+#' as well as  criteria for clustering such as boot.thresh or dist.thresh.
+#' @param rangeID: A unique identifier for the set of rows generated by this run
+#' if this output is bound to other cluster ranges in a larger analysis.
 #' @param mc.cores: A parallel option to increase run speed.
-#' @return A data.table with parameter sets and cluster IDs specified
+#' @return A data.table of clusters. Multiple cluster sets are collected into a range.
 #' @export
 #' @example examples/multi.cluster_ex.R
 multi.cluster <- function(cluster.method, param.list, mc.cores = 1, rangeID = 0) {
@@ -28,24 +33,31 @@ multi.cluster <- function(cluster.method, param.list, mc.cores = 1, rangeID = 0)
 
 #' Predictive analysis on clusters
 #'
-#' Fits a predictive model of some outcome (by default, cluster growth) to sets of cluster data.
-#' These fits are recorded for each use of the predictive model on a given cluster set
+#' Fits a predictive model of some outcome (by default, cluster growth) to some 
+#' cluster-level variable (by default, cluster size). This fit is done for each 
+#' cluster set. Multiple models can be inputted as a named list of functions taking 
+#' in cluster data (see example).
 #'
-#' @param cluster.data: Inputted set(s) of clusters May or may not be sorted into ranges
+#' @param cluster.data: Inputted set(s) of clusters. Possibly multiple ranges
 #' @param mc.cores: A parallel option to increase run speed
-#' @param predictor.transformations: A named list of transformation functions for each predictor variable.
-#' This name should correspond to a column from the cluster.data, which will be taken as input for the function.
-#' for example list("CollectionDate"=mean), would change the collection date column to a vector of means
-#' instead of a list collection date vectors
-#' @param predictive.models: A named list of functions, each of which applies a model to inputted data (x). See default null for example.
-#' @return A data.table of analysis results. Several important summary values such as null and full AIC are proposed here.
+#' @param predictor.transformations: A named list of transformation functions for 
+#' each predictor variable (ex. list("Data"=sum)). Because clustered meta data takes 
+#' the form of a list, these functions are often necessary to obtain a single, 
+#' cluster-level variable.
+#' @param predictive.models: A named list of functions, each of which applies a 
+#' model to inputted cluster data (x). The default is a "NullModel" example, in which
+#' Growth is predicted only by cluster size.
+#' @return A data.table of analysis results. Model fits are stored as entries in 
+#' the rows of a data.table. The column specifying setID is retained, as is the 
+#' range ID and the parameters used to create the cluster.
 #' @export
 #' @example examples/fit.analysis_ex.R
 fit.analysis <- function(cluster.data, mc.cores = 1, predictor.transformations = list(),
                          predictive.models = list(
                            "NullModel" = function(x){
                              glm(Size~Growth, data=x, family="poisson")
-                             })) {
+                            })) {
+  
   # Check inputs
   predictors <- names(predictor.transformations)
   mod.names <- names(predictive.models)
@@ -69,7 +81,6 @@ fit.analysis <- function(cluster.data, mc.cores = 1, predictor.transformations =
     })]
   }
 
-
   # Obtain fit data for each cluster set
   cluster.analysis <- dplyr::bind_rows(
     parallel::mclapply(setIDs, function(id) {
@@ -88,20 +99,23 @@ fit.analysis <- function(cluster.data, mc.cores = 1, predictor.transformations =
 
 #'Get AIC values from an analysis
 #'
-#'Takes a cluster.analysis and extracts AIC values from columns containing model fits.
-#'Model fit columns are automatically identified
+#'Takes a cluster.analysis and extracts AIC values from columns containing model 
+#'fits. Fit columns are automatically identified
 #'
-#'@param cluster.analysis: A data.table from some predictive growth model analysis generated by fit.analysis()
-#'@return The AIC data for all columns containing fit objects
+#'@param cluster.analysis: A data.table from some predictive growth model analysis 
+#'generated by fit.analysis()
+#'@return The AIC data for all columns containing fit objects. The column specifying 
+#'setID is retained
 #'@export
 #'@example examples/get.AIC_ex.R
 get.AIC <- function(cluster.analysis){
 
   #Identify models
-  which.models <- sapply(cluster.analysis[1,], function(x){any(attr(x[[1]], "class")%in%c("lm", "glm"))})
+  which.models <- sapply(cluster.analysis[1,], 
+                         function(x){any(attr(x[[1]], "class")%in%c("lm", "glm"))})
   which.models <- which(which.models)
   if(length(which.models)==0) {
-    stop("No models in the data set provided")
+    stop("No fits in the data set provided")
   }
   model.fits <- cluster.analysis[,.SD, .SDcols = which.models]
 
 
@@ -1,49 +1,62 @@
-#'An alignment of HIV1, subtype B sequences
+#' An alignment of HIV1, subtype B sequences
 #'
-#'A dataset containing 10 HIV1, subtype B polymerase sequences collected in Northern Alberta Canada.
-#'This a 10 sequence sample from popset# 1033910942 on NCBI's genbank Archive
+#' A dataset containing 10 HIV1, subtype B polymerase sequences collected in Northern 
+#' Alberta, Canada. This is a 10-sequence sample from popset# 1033910942 on NCBI's GenBank 
+#' archive. The sequence headers also include meta-data for sequences.
 #'
-#' @format An ape DNA object: 10 DNA sequences in binary format stored in a list. All sequences of same length: 1017 
+#' @format An ape DNA object: 10 DNA sequences in binary format stored in a list. 
+#' All sequences of same length: 1017 
 #' @source \url{ https://www.ncbi.nlm.nih.gov/popset?DbFrom=nuccore&Cmd=Link&LinkName=nuccore_popset&IdsFromResult=1033912042 }
 "alignment.ex"
 
-#'An example set of sequence meta.data corresponding to alignment.ex
+#' An example set of sequence meta.data corresponding to alignment.ex
 #'
-#'A dataset describing 10 different HIV1 pol sequences collected in Northern Alberta Canada.
+#' Built from alignment.ex, the example 10-sequence alignment, using pull.headers. 
+#' The date of each sequence's collection, its GenBank unique accession ID, and 
+#' sequence subtype are referenced within the header.
 #'
-#' @format A data.table object with 9 variables:
+#' @format A data.table object with 4 variables:
 #' \describe{
 #' \item{ID}{Accession IDs (characters) of sequences}
 #' \item{CollectionDate}{Collection date of sequences. Full dates given as yyyy-mm-dd}
-#' \item{Subtype}{Subtypes (factors) within a cluster}
-#' \item{Header}{The original headers from the alignement. This matches meta data to sequences}
+#' \item{Subtype}{Subtypes (factors) of sequences}
+#' \item{Header}{The original headers from the alignment. This matches meta 
+#' data to sequences in the original alignment}
 #' }
 "seq.info.ex"
 
-#'An example set of clusters, built using component.cluster
+#' An example set of clusters, built using component.cluster
 #'
-#'A dataset describing 5 different clusters. Their member headers are listed, as well as the growth they experienced 
-#'(ie. the number of new sequences forming clusters with old sequences.). See component.cluster for further information on 
-#'how these were assigned based on graph.ex as an input
+#' A dataset describing 5 different clusters. The headers (from alignment.ex) and 
+#' associated meta data (from seq.info.ex) of cluster members are captured, as well
+#' as several cluster-level traits, such as growth and size. See component.cluster 
+#' for further information on how these were assigned based on graph.ex as an input.
 #'
 #' @format A data.table object with 9 variables:
 #' \describe{
-#' \item{ClusterID}{ The unique identifier number for this cluster. A numberic}
-#' \item{ID}{A list of vectors, each containing the accession IDs (characters) of sequences within a cluster}
-#' \item{CollectionDate}{A list of vectors, each containing the collection date of sequences within a cluster}
-#' \item{Subtype}{A list of vectors, each containing the subtypes (factors) within a cluster}
-#' \item{Header}{A list of vectors, each containing the original headers from the alignement used to build this set of clusters}
-#' \item{Size}{The original size of this cluster before being updated with new cases. This simply the number of sequences within the cluster}
+#' \item{ClusterID}{ The unique identifier number for this cluster. A numeric}
+#' \item{ID}{A list of vectors, each containing the accession IDs (characters) 
+#' of sequences within a cluster}
+#' \item{CollectionDate}{A list of vectors, each containing the collection date 
+#' of sequences within a cluster}
+#' \item{Subtype}{A list of vectors, each containing the subtypes (factors) within 
+#' a cluster}
+#' \item{Header}{A list of vectors, each containing the original headers from the 
+#' alignment used to build this set of clusters}
+#' \item{Size}{The original size of this cluster before being updated with new cases. 
+#' This is simply the number of sequences within the cluster}
 #' \item{Growth}{The growth of the cluster after new cases are added}
-#' \item{DistThresh}{The pairwise distance threshold used to create this complete set of clusters. Corresponds to a setID as an input parameter}
+#' \item{DistThresh}{The pairwise distance threshold used to create this complete 
+#' set of clusters. Corresponds to a setID as an input parameter}
 #' \item{SetID}{The unique identifier for this set of clusters. A numeric}
 #' }
 "cluster.ex"
 
-#'An example graph, built based on pairwise TN93 distances
+#' An example graph, built based on pairwise TN93 distances
 #'
-#'This implementation of a graph is a list, describing a set of sequences and the distances between them. 
-#'See create.graph for more information on how this graph was created using alignment.ex as input
+#' This implementation of a graph is a list, describing a set of sequences and the 
+#' distances between them. See create.graph for more information on how this graph 
+#' was created using alignment.ex as input
 #'
 #' @format A list of 3 variables
 #' \describe{
@@ -56,39 +69,51 @@
 #' }
 "graph.ex"
 
-#'A tree built based on a subset of alignment.ex
+#' A tree built based on a subset of alignment.ex
 #'
-#'This is a maximum likelyhood tree built using IQ-TREE with model selection and 1000 parametric bootstraps.
-#'The log information for this tree is stored in data/IQTREE_log_ex.txt. A subset of six older sequences 
-#'(collected before January 1st 2012) from alignment.ex was used to construct this tree
+#' A maximum likelihood tree built using IQ-TREE with model selection and 1000 
+#' parametric bootstraps. The log information for this tree is stored in data/IQTREE_log_ex.txt. 
+#' A subset of six older sequences (collected before January 1st 2012) from alignment.ex 
+#' was used to construct this tree.
 #'
 #'
-#' @format An unrooted, phylogenetic tree with 6 tips and 4 internal nodes. Node labels represent certainty
-#' See ape's implementation of phylogenetic tree objects for information about tags within this object
+#' @format An unrooted, phylogenetic tree with 6 tips and 4 internal nodes. 
+#' Node labels represent certainty. See ape's implementation of phylogenetic tree 
+#' objects for information about tags within this object
 "old.tree.ex"
 
-#'A tree built from alignment.ex
+#' A tree built from alignment.ex
 #'
-#'This is a maximum likelihood tree built using IQ-TREE with automatic model selection and 1000 parametric bootstraps.
+#' This is a maximum likelihood tree built using IQ-TREE with automatic model 
+#' selection and 1000 parametric bootstraps. In contrast to old.tree.ex, this is a 
+#' complete tree containing all sequences in alignment.ex.
 #'
-#' @format An unrooted, phylogenetic tree with 10 tips and 8 internal nodes. Node labels represent certainty
-#' See ape's implementation of phylogenetic tree objects for information about tags within this object
+#' @format An unrooted, phylogenetic tree with 10 tips and 8 internal nodes. Node 
+#' labels represent certainty. See ape's implementation of phylogenetic tree objects 
+#' for information about tags within this object.
 "full.tree.ex"
 
-#'An extension of an ape tree object which can be used to create clusters
+#' An extension of an ape tree object which can be used to create clusters
 #'
-#'This is a maximum likelihood tree built using IQ-TREE with automatic model selection and 1000 parametric bootstraps.
-#'Additional functions within tree.setup.R were used to annotate information useful for clustering
+#' An extension of old.tree.ex, a maximum likelihood tree built using IQ-TREE with automatic 
+#' model selection and 1000 parametric bootstraps. Growth information and additional 
+#' information useful for cluster identification were added by extend.tree. 
 #'
-#' @format A , phylogenetic tree with 6 tips and 4 internal nodes. Node labels represent certainty
-#' See ape's implementation of phylogenetic tree objects for information about tags within this object.
-#' In addition, there are 4 new objects created by functions within tree.setup.R
+#' @format A phylogenetic tree with 6 tips and 4 internal nodes. Node labels represent 
+#' certainty. See ape's implementation of phylogenetic tree objects for information 
+#' about tags within this object. In addition, there are 4 new objects created by 
+#' functions within tree.setup.R
 #' \describe{
 #' \item{seq.info}{ See seq.info.ex, a data.table containing sequence meta data}
-#' \item{node.info}{ Grouping of the meta.data present in seq.info assigned to various nodes in the tree, 
-#' coupled with information important to clustering, such as mean divergence from root, or node certainty }
-#' \item{path.info}{ Information regarding the path of edges from tips to the root of the tree. 
-#' This is also necessary for some clustering algorithms, specifically step.cluster}
-#' \item{growth.info}{ a data.table pairing new sequences, to a single node in the tree based on placements assigned by guppy and pplacer.}
+#' \item{node.info}{ Grouping of the meta.data present in seq.info assigned to 
+#' various nodes in the tree, coupled with information important to clustering, 
+#' such as mean divergence from root, or node certainty }
+#' \item{path.info}{ Information regarding the path of edges from tips to the root 
+#' of the tree. This is also necessary for some clustering algorithms, specifically 
+#' step.cluster}
+#' \item{growth.info}{ A data.table pairing new sequences to a single node in the 
+#' tree based on placements assigned by guppy and pplacer. The certainty of this placement, 
+#' terminal branch length, neighbour, and branch length from new internal node to 
+#' new neighbour are described}
 #' }
 "extended.tree.ex"
@@ -1,7 +1,7 @@
 #'Generate data found in /data folder
 #'
-#'This is partially intended as example use code, however may also act as secondary,
-#'informal testing in the development cycle and as a tool to update data quickly if required.
+#'This is partially intended as example use code; however, it may also act as informal 
+#'testing in the development cycle and as a tool to update data quickly if required.
 generate.all <- function() {
   generate.seq.info()
   generate.graph()
 
@@ -1,16 +1,18 @@
 #' Create clusters based on the components of a graph
 #'
-#' This uses a homogenization algorithm to identify disconnected components in a graph.
-#' Edges are filtered away using a distance threshold to result in components.
+#' Edges are filtered away using a distance threshold to break up the completely 
+#' connected graph such that only similar edges remain.
 #'
-#' @param g: The input graph, annotated with vertex and edge information
-#' @param dist.thresh: The maximum distance defining which edges are filtered
-#' @param setID: If several different parameter ranges are used, the setID can identify them
-#' @return A data table which represents cluster information. This includes growth info
-#' Because data.tables are being used, this prevents original values being reassigned via pointer
+#' @param g: The input graph, annotated with vertex, edge, and growth resolution
+#' information
+#' @param dist.thresh: The maximum distance defining which edges are filtered.
+#' A higher distance threshold implies a larger average cluster size
+#' @param setID: A numeric identifier for this cluster set.
+#' @return A set of clusters as a data.table. See example cluster.ex object 
+#' documentation for an example of clustered sequence data + meta data
 #' @export
 #' @example examples/component.cluster_ex.R
-component.cluster <- function(g, dist.thresh = 0.007, setID = 0) {
+component.cluster <- function(g, dist.thresh = 0, setID = 0) {
 
   # Filter edges above the distance threshold and prepare for component finding algorithm
   # All edges from a new sequence are filtered except for their "growth-resolved" edge