library(dplyr)
library(allelematch)
library(readxl)
library(readODS)
# Assumed source of the %<-% destructuring operator used further down
library(zeallot)

#' Import and format data
#' @description To be replaced by the user choosing the relevant columns
#'
#' @param file The path to the file to be imported
#' @param index_column The name or index of the column containing the indexes
#' @param additional_data A vector with the names or indexes of the columns that contain the date, north, east, gender, and any preset ids, in that order.
#' @param locus_names A vector with the names or indexes of the columns that contain the genotypes
#'
#' @return A table with the relevant columns from the file
#' @export
#'
#' @examples
#' \dontrun{
#' data <- import_data("raw_data.csv", index_column, additional_data, locus_names)
#' }
import_data <- function(file, index_column, additional_data, locus_names) {
  # Read the data from the file depending on the file type
  if (endsWith(file, ".xls") || endsWith(file, ".xlsx")) {
    raw_data <- readxl::read_excel(path = file, sheet = 1, na = c("NA"), col_names = TRUE)
  } else if (endsWith(file, ".ods")) {
    raw_data <- readODS::read_ods(path = file, col_names = TRUE, na = "NA")
  } else {
    raw_data <- read.table(file = file, header = TRUE, sep = ",", na.strings = c("NA"))
  }
  # Select only the columns we are interested in
  data <- raw_data %>%
    select(all_of(index_column), all_of(as.vector(unlist(additional_data))), all_of(locus_names))

  # Rename all columns to the names we know (index, north, east, gender etc.) instead of the column names from the file
  colnames(data) <- c("index", names(additional_data), locus_names)
  # Make the rows indexable by index
  rownames(data) <- data$index
  # Return the table
  data
}
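
# Illustrative usage, kept inside if (FALSE) so it never runs when this file is sourced.
# The file name and column names are placeholders, not names the package requires;
# the names given to additional_data become the new column names, and create_search_data()
# specifically looks for one named "preset_ind".
if (FALSE) {
  data <- import_data(
    file = "samples.csv",
    index_column = "sample_id",
    additional_data = c(date = "collection_date", north = "lat", east = "lon",
                        gender = "sex", preset_ind = "known_individual"),
    locus_names = c("locus1a", "locus1b", "locus2a", "locus2b")
  )
}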

#' Convert the Data to an Allelematch Dataset
#' @description To be rewritten to take a table with only index and locus data, split in the \code{\link{import_data}} function.
#' For now it ignores the data that is irrelevant for this step; it can be connected back by index later. A wrapper for the
#' \code{\link{amDataset}} function.
#'
#' @param data The relevant data that has been imported
#' @param ignore_columns A vector with the names or indexes of the columns that are to be ignored by the 'allelematch' package. These are the date, north, east, gender and preset individual, in that order. TODO: Remove the preset individual, it should be used earlier.
#'
#' @return An allelematch dataset with the relevant index and locus columns
#' @export
#'
#' @examples
#' \dontrun{
#' am_data <- create_allelematch_dataset(data, ignore_columns)
#' }
create_allelematch_dataset <- function(data, ignore_columns) {
  # Create the allelematch dataset
  am_data <- allelematch::amDataset(data, indexColumn = "index", ignoreColumn = as.vector(unlist(ignore_columns)), missingCode = "000")
  # Return the allelematch dataset
  am_data
}
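
# Illustrative usage, kept inside if (FALSE) so it never runs when this file is sourced.
# The ignored column names are placeholders matching the example above; the index column
# must literally be named "index" (import_data() renames it to that), and missing
# genotypes are expected to be coded "000" to match missingCode.
if (FALSE) {
  am_data <- create_allelematch_dataset(
    data = data,
    ignore_columns = c("date", "north", "east", "gender", "preset_ind")
  )
}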

#' Load Data and Group Into Individuals
#' @description TODO: Keep the multimatch data
#'
#' @param data A data frame with index, all meta-data and locus columns
#' @param am_data An allelematch dataset, converted from the data containing the index and locus columns
#' @param allele_mismatch A value for how many allele mismatches are to be allowed and still count as a match
#'
#' @return A list with the search_data, which is a data frame with $index, $multilocus, $individ_id and $override_id, a vector with the indexes of the samples that matched multiple individuals, and a list with the $index and $multilocus of the samples that were unclassified.
#' @export
#'
#' @examples
#' \dontrun{
#' c(search_data, multiple_matches, unclassified) %<-% create_search_data(data, am_data, allele_mismatch)
#' }
create_search_data <- function(data, am_data, allele_mismatch) {
  # Group the samples together to form individuals
  am_unique <- allelematch::amUnique(am_data, alleleMismatch = allele_mismatch)

  ind <- 0
  # If an override id is specified, get the largest numeric override id and count onwards from that
  if (!is.null(data$preset_ind)) {
    # Convert everything to a number
    preset_num <- as.numeric(data$preset_ind)
    # Remove everything that is not a number, so our generated ind cannot accidentally assume the same value
    preset_num <- preset_num[!is.na(preset_num)]
    # Start right after the biggest one found; if all override ids were NA or not numbers, keep counting from zero as usual
    if (length(preset_num) > 0) {
      ind <- max(preset_num) + 1
    }
  }

  # Go through the data and create a large data.frame with all the previous samples, adding a column for the individ_id
  search_data <- data.frame(index = character(), multilocus = character(), individ_id = character())

  for (pair in am_unique$pairwise) {
    multilocus_combined <- apply(pair$match$multilocus, 1, combine_multilocus)

    search_data <- rbind(search_data, list(index = pair$match$index, multilocus = multilocus_combined, individ_id = rep(as.character(ind), length(pair$match$index))))
    ind <- ind + 1
  }

  # Add an empty column for the override data, to be read from the file or created by the user later
  search_data <- cbind(search_data, list(override_id = rep(NA, length(search_data$index))))

  # If an override_id column is specified, write it to the new column
  if (!is.null(data$preset_ind)) {
    search_data$override_id <- data[search_data$index, "preset_ind"]
  }

  # The multiple matches that have been handled by the user previously and are now in the file
  multiple_matches_filter <- duplicated(search_data$index) & duplicated(search_data$override_id) & !is.na(search_data$override_id)

  multiple_matches <- am_unique$multipleMatches$index
  # Only keep the multiple match indexes that have not been handled yet
  multiple_matches <- multiple_matches[!(multiple_matches %in% search_data$index[multiple_matches_filter])]

  search_data <- search_data[!multiple_matches_filter, ]

  # Return the search_data (index, multilocus and the ids that group the samples together),
  # the indexes that matched multiple individuals, and the unclassified samples
  list(search_data, multiple_matches, list(index = am_unique$unclassified$index, multilocus = am_unique$unclassified$multilocus))
}
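
# Illustrative usage, kept inside if (FALSE) so it never runs when this file is sourced.
# allele_mismatch = 2 is only an example value; the profile plot below can help pick a suitable one.
if (FALSE) {
  c(search_data, multiple_matches, unclassified) %<-%
    create_search_data(data = data, am_data = am_data, allele_mismatch = 2)
  # search_data now holds one row per sample with index, multilocus, individ_id and override_id
}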

#' Combine Multiple Loci and Assure Constant Width
#'
#' @param locus A vector with all the loci in string format.
#'
#' @return A single long string with all loci combined, padded with zeros so that every locus is three characters long.
#' @export
#'
#' @examples
#' \dontrun{
#' multilocus <- c("182", "180", "152", "152")
#' multilocus_combined <- combine_multilocus(multilocus)
#' }
#' \dontrun{
#' multilocus_combined <- apply(multilocus_matrix, 1, combine_multilocus)
#' }
combine_multilocus <- function(locus) {
  # Convert to numbers, add leading zeroes if needed to reach length 3, and collapse into one space-separated string
  locus %>%
    as.numeric() %>%
    formatC(width = 3, flag = "0", format = "d") %>%
    paste(collapse = " ")
}
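
# Illustrative behaviour, kept inside if (FALSE) so it never runs when this file is sourced;
# the values are made up to show the zero padding and the space separation.
if (FALSE) {
  combine_multilocus(c("182", "7", "152", "152"))  # "182 007 152 152"
}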

#' Simplify the ID Getting Process
#' @description Return the override id if there is one, otherwise return the individ_id
#'
#' @param row The row (or rows) of search_data to get the id for
#'
#' @return The relevant id
#' @export
#'
#' @examples
#' \dontrun{
#' # Get the id for the sample at the 17th row
#' id <- get_id(search_data[17, ])
#' }
get_id <- function(row) {
  # Read the override id first
  id <- row$override_id
  # If the override id is NA, use the regular id instead
  id[is.na(id)] <- row$individ_id[is.na(id)]
  id
}
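
# Illustrative behaviour, kept inside if (FALSE) so it never runs when this file is sourced;
# the ids are made-up values.
if (FALSE) {
  rows <- data.frame(individ_id = c("3", "4"), override_id = c(NA, "B31"))
  get_id(rows)  # "3" "B31": the override id wins whenever it is present
}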

#' Generate Allelematch Profile Plot
#'
#' @param am_dataset An allelematch dataset to examine the optimal mismatch value for
#'
#' @return The plot data that shows the user what the program thinks is the optimal mismatch value
#' @export
#'
#' @examples
#' \dontrun{
#' library(shiny)
#'
#' output$plot <- renderPlot({
#'   generate_allelematch_profile_plot(am_data)
#' })
#' }
generate_allelematch_profile_plot <- function(am_dataset) {
  # Generate the profile plot
  allelematch::amUniqueProfile(am_dataset, doPlot = TRUE)
}

#' Handle Multimatch By User
#' @description Give a sample that matched multiple individuals the id specified by the user, and remove it from the list of unhandled multiple matches.
#'
#' @param search_data The search data, a data frame with indexes, multilocus genotypes and ids
#' @param multiple_matches A vector with the indexes that currently match multiple ids in the search_data
#' @param multimatch_index The index of the sample that is to get a specified id
#' @param new_id The new id that the specified sample and all entries in the same group will get
#'
#' @return A list with the updated search_data and multiple_matches
#' @export
#'
#' @examples
#' \dontrun{
#' c(search_data, multiple_matches) %<-%
#'   handle_multimatch(search_data, multiple_matches, "SEP123", "B31")
#' }
handle_multimatch <- function(search_data, multiple_matches, multimatch_index, new_id) {
  # Remove the sample from the list of samples that have multiple matches, it has been corrected by the user
  multiple_matches <- multiple_matches[multiple_matches != multimatch_index]
  # Remove all duplicates of the specific sample; the order doesn't matter, as the override id is the important parameter for the id.
  # We can remove the one with the correct id and set the override id of another without problem
  search_data <- search_data[!(search_data$index == multimatch_index & duplicated(search_data$index)), ]
  # Set the override id of the sample (now no duplicates, only one left) to the id specified by the user
  search_data$override_id[search_data$index == multimatch_index] <- new_id
  # Give every sample in the same group an override id as well; maybe not necessary, but it ensures that the order generated by
  # allelematch cannot change and thereby place the user "controlled" sample in a different group
  search_data$override_id[get_id(search_data) == new_id & !(search_data$index %in% multiple_matches)] <- new_id

  # Return the updated data
  list(search_data, multiple_matches)
}
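
# Illustrative usage, kept inside if (FALSE) so it never runs when this file is sourced.
# "B31" is a placeholder id; the first unhandled multiple match is resolved to it.
if (FALSE) {
  c(search_data, multiple_matches) %<-%
    handle_multimatch(search_data, multiple_matches, multiple_matches[1], "B31")
}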

#' Match New Data Against the Existing Samples
#'
#' @param data The old data, containing all samples that have already been handled, or been chosen not to be handled
#' @param new_data The new data, the meta-data for the new samples, containing the same columns as the old data: index, locus, date, gender, etc.
#' @param additional_data_columns The names of the columns containing the meta-data, so that \code{\link{create_allelematch_dataset}} is able to create a new am_dataset
#' @param allele_mismatch The mismatch (number of alleles that are allowed to differ between individuals) to use for this grouping; the user could be interested in being more strict or more loose, up to them
#'
#' @return Returns only the new data grouped into search_data format, which of the new samples matched multiple individuals, and which of them were unclassified
#' @export
#'
#' @examples
#' \dontrun{
#' c(new_search_data, new_multiple_matches, new_unclassified) %<-%
#'   match_new_data(data, new_data, additional_data_columns, allele_mismatch)
#' }
match_new_data <- function(data, new_data, additional_data_columns, allele_mismatch) {
  # Combine the data under each other, creating one big data.frame
  combined_data <- rbind(data, new_data)
  # Create an amDataset to be able to run the grouping on all of the data
  am_data <- create_allelematch_dataset(data = combined_data, ignore_columns = additional_data_columns)
  # Get the combined data for all samples, every index and group, including which fitted into multiple individuals and which were unclassified
  c(combined_search_data, combined_multiple_matches, combined_unclassified) %<-% create_search_data(data = combined_data,
                                                                                                    am_data = am_data, allele_mismatch = allele_mismatch)
  # Return the gathered data, but filter it to only keep the entries whose indexes are not in the "old" data. Thereby only the new data
  # is passed on, grouped with individ_id, together with whether they matched multiple individuals and whether they were unclassified
  new_unclassified <- !(combined_unclassified$index %in% data$index)
  list(combined_search_data[!combined_search_data$index %in% data$index, ],
       combined_multiple_matches[!combined_multiple_matches %in% data$index],
       list(index = combined_unclassified$index[new_unclassified],
            multilocus = combined_unclassified$multilocus[new_unclassified, , drop = FALSE]))
}
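
# End-to-end sketch, kept inside if (FALSE) so it never runs when this file is sourced.
# File names, column names and the mismatch value are placeholders and follow the same
# assumed naming as the earlier examples.
if (FALSE) {
  new_data <- import_data(
    file = "new_samples.csv",
    index_column = "sample_id",
    additional_data = c(date = "collection_date", north = "lat", east = "lon",
                        gender = "sex", preset_ind = "known_individual"),
    locus_names = c("locus1a", "locus1b", "locus2a", "locus2b")
  )
  c(new_search_data, new_multiple_matches, new_unclassified) %<-%
    match_new_data(data, new_data,
                   additional_data_columns = c("date", "north", "east", "gender", "preset_ind"),
                   allele_mismatch = 2)
}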