Skip to content

Commit 4ffdcd5

Browse files
authored
Merge pull request #6 from CGI-NRM/develop
Base Functionality
2 parents 19fc41e + 9ff0461 commit 4ffdcd5

16 files changed

+1725
-0
lines changed

.gitignore

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,3 +37,10 @@ vignettes/*.pdf
3737

3838
# R Environment Variables
3939
.Renviron
40+
41+
# Lock files
42+
.~lock*
43+
44+
# R Project
45+
# *.Rproj
46+
.Rbuildignore

DESCRIPTION

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
Package: GenotypeCheck
2+
Version: 0.0.0.9000
3+
Title: Identify New Locus Data With Database Of Individuals
4+
Description: Backend code to categorize sample data into groups of individuals and allow user to clean up the misses. Then allow to match new sample data to the existing individuals. Also the frontend code in the form of a shiny app that allows for intuitive use.
5+
Authors@R: person("Elias", "Lundell", email = "[email protected]", role=c("aut", "cre"))
6+
License: GPL-3 + file LICENSE
7+
Encoding: UTF-8
8+
LazyData: true
9+
Roxygen: list(markdown = TRUE)
10+
RoxygenNote: 7.1.0
11+
Imports:
12+
allelematch,
13+
dplyr,
14+
DT,
15+
leaflet,
16+
readxl,
17+
readODS,
18+
rgdal,
19+
shiny,
20+
zeallot

GenotypeCheck.Rproj

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
Version: 1.0
2+
3+
RestoreWorkspace: No
4+
SaveWorkspace: No
5+
AlwaysSaveHistory: Default
6+
7+
EnableCodeIndexing: Yes
8+
UseSpacesForTab: Yes
9+
NumSpacesForTab: 2
10+
Encoding: UTF-8
11+
12+
RnwWeave: Sweave
13+
LaTeX: pdfLaTeX
14+
15+
AutoAppendNewline: Yes
16+
StripTrailingWhitespace: Yes
17+
18+
BuildType: Package
19+
PackageUseDevtools: Yes
20+
PackageInstallArgs: --no-multiarch --with-keep.source
21+
PackageCheckArgs: --as-cran
22+
PackageRoxygenize: rd,collate,namespace,vignette

NAMESPACE

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# Generated by roxygen2: do not edit by hand
2+
3+
export(combine_multilocus)
4+
export(create_allelematch_dataset)
5+
export(create_search_data)
6+
export(generate_allelemtach_profile_plot)
7+
export(get_id)
8+
export(handle_multimatch)
9+
export(import_data)
10+
export(match_new_data)

R/app.R

Lines changed: 394 additions & 0 deletions
Large diffs are not rendered by default.

R/import_data.R

Lines changed: 251 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,251 @@
1+
library(dplyr)
2+
library(allelematch)
3+
library(readxl)
4+
library(readODS)
5+
6+
#' Import and format data
7+
#' @description To be replaced by the user chosing the relevant columns
8+
#'
9+
#' @param file The path to the file to be imported
10+
#' @param index_column The name or index of the column containing the indexes
11+
#' @param additional_data A vector with the names or indexes to the columns that contain the date, north, east, gender, and any preset id:s in that order.
12+
#' @param locus_names A vector with the name or indexes to the columns that contain the genotypes
13+
#'
14+
#' @return A table with the relevant columns from the file
15+
#' @export
16+
#'
17+
#' @examples
18+
#' \dontrun{
19+
#' data <- import_data("raw_data.csv")
20+
#' }
21+
import_data <- function(file, index_column, additional_data, locus_names) {
22+
# Read the data from the file depending on the file type
23+
if (endsWith(file, ".xls") | endsWith(file, ".xlsx")) {
24+
raw_data <- readxl::read_excel(path = file, sheet = 1, na = c("NA"), col_names = TRUE)
25+
} else if (endsWith(file, "ods")) {
26+
raw_data <- readODS::read_ods(path = file, col_names = TRUE, na = "NA")
27+
} else {
28+
raw_data <- read.table(file = file, header = TRUE, sep = ",", na.strings = c("NA"))
29+
}
30+
# Select only the columns we are intressted in
31+
data <- raw_data %>%
32+
select(all_of(index_column), as.vector(unlist(additional_data)), all_of(locus_names))
33+
34+
# Rename all column to be the names we know (index, north, south, gender etc) insted of the colmn names from the file
35+
colnames(data) <- c("index", names(additional_data), locus_names)
36+
# Make the rows indexable by index
37+
rownames(data) <- data$index
38+
# Return the table
39+
data
40+
}
41+
42+
43+
#' Convert The Data to a Allelematch Dataset
44+
#' @description To be rewritten to take a table with only index and locus data, split in the \code{\link{import_data}} function.
45+
#' Now ignores the irrelevant data for this step, can be connected by their index later. A wrapper for the
46+
#' \code{\link{amDataset}} function.
47+
#'
48+
#' @param data The relevant data that has been imported
49+
#' @param ignore_columns A vector with the names or indexes to the columns that are to be ignored by the 'allelematch' packet. These are the date, nord, east, gender and preset individual, in that order. TODO: Remove the preset individual, it should be used earlier.
50+
#'
51+
#' @return A allelematch dataset with the relevant index and locus column
52+
#' @export
53+
#'
54+
#' @examples
55+
#' \dontrun{
56+
#' am_data <- create_allelematch_dataset(data)
57+
#' }
58+
create_allelematch_dataset <- function(data, ignore_columns) {
59+
# Create the allelematch dataset
60+
am_data <- allelematch::amDataset(data, indexColumn = "index", ignoreColumn = as.vector(unlist(ignore_columns)), missingCode = "000")
61+
# Retrun the allelematch dataset
62+
am_data
63+
}
64+
65+
#' Load Data and Group Into Individuals
66+
#' @description TODO: Keep the multimatch data
67+
#'
68+
#' @param data A dateframe with index, all meta-data and locus
69+
#' @param am_data An allelematch dataset, converted from the data containing the index and locus
70+
#' @param allele_mismatch A value for how many allele mismatchs are to be allowed and still count like a match
71+
#'
72+
#' @return A list with the search_data, which is a list with $index, $multilocus, and $individ_id, a list with the $index of the samples that matched multiple individuals, and a list with the $index and $multilocus of the samples were unclassified.
73+
#' @export
74+
#'
75+
#' @examples
76+
#' \dontrun{
77+
#' search_data <- create_search_data("data.csv")
78+
#' }
79+
#create_search_data <- function(file_path, index_column, additional_data_columns, locus_columns, allele_mismatch) {
80+
create_search_data <- function(data, am_data, allele_mismatch) {
81+
# Group the samples together to form individuals
82+
am_unique <- allelematch::amUnique(am_data, alleleMismatch = allele_mismatch)
83+
84+
ind <- 0
85+
# If a override id is specified, get the largest override id and add ind from that
86+
if (!is.null(data$preset_ind)) {
87+
# Convert everything to a number
88+
preset_num <- as.numeric(data[,"preset_ind"])
89+
# Remove everything that is not a number, out generated ind cannot accidentaly assume the same value
90+
preset_num <- preset_num[!is.na(preset_num)]
91+
# Start right after the biggest one found
92+
ind_biggest <- max(preset_num) + 1
93+
# If all override_ids were NA or not numbers, we can count from zero like usual
94+
if (is.na(ind_biggest)) {
95+
ind_biggest <- 0
96+
}
97+
}
98+
99+
# Go through the data and create a large data.frame with all the prevoius samples, adding a column for the individ_id
100+
search_data <- data.frame(index = character(), multilocus = character(), individ_id = character())
101+
102+
for (pair in am_unique$pairwise) {
103+
multilocus_combined <- apply(pair$match$multilocus, 1, combine_multilocus)
104+
105+
search_data <- rbind(search_data, list(index = pair$match$index, multilocus = multilocus_combined, individ_id = rep(as.character(ind), length(pair$match$index))))
106+
ind <- ind + 1
107+
}
108+
109+
# Add a empty column for the override data to be read from the file or created by the user later
110+
search_data <- cbind(search_data, list(override_id = rep(NA, length(search_data$index))))
111+
112+
# If a override_id column is specified, write it to the new column
113+
if (!is.null(data$preset_ind)) {
114+
search_data$override_id <- data[search_data$index,"preset_ind"]
115+
}
116+
117+
# The multiple matches that have been handled by the user previously and is now in the file
118+
multiple_matches_filter <- duplicated(search_data$index) & duplicated(search_data$override_id) & !is.na(search_data$override_id)
119+
120+
multiple_matches <- am_unique$multipleMatches$index
121+
# Only keep the multiple matches indexes that have not been handled
122+
multiple_matches <- multiple_matches[!(multiple_matches %in% search_data$index[multiple_matches_filter])]
123+
124+
search_data <- search_data[!multiple_matches_filter,]
125+
126+
# Return all we want, the data (meta-data), search_data (index, multilocus and the id to group them together)
127+
list(search_data, multiple_matches, list(index = am_unique$unclassified$index, multilocus = am_unique$unclassified$multilocus))
128+
}
129+
130+
#' Combine Multiple Locus and Assure Constant Width
131+
#'
132+
#' @param locus A vector with all the locus in string format.
133+
#'
134+
#' @return A long string with all locus combined, padding zeros making ever locus three characters long.
135+
#' @export
136+
#'
137+
#' @examples
138+
#' \dontrun{
139+
#' multilocus <- c("182", "180", "152", "152")
140+
#' multilocus_combined <- combine_multilocus(multilocus)
141+
#' }
142+
#' \dontrun{
143+
#' multilocus_combined <- apply(multilocus_matrix, 1, combine_multilocus)
144+
#' }
145+
combine_multilocus <- function(locus) {
146+
# Convert it to a number, add leading 0es if needed to reach length 3 and paste with collapse to create a long string
147+
locus %>%
148+
as.numeric() %>%
149+
formatC(width = 3, flag = "0", format = "d") %>%
150+
paste0(collapse = "", sep = " ")
151+
}
152+
153+
#' Simplify the ID getting process
154+
#' @description Return the override id if there is one, otherwise return the individ_id
155+
#'
156+
#' @param row The row for which the id wishes to be taken
157+
#'
158+
#' @return The relevant id
159+
#' @export
160+
#'
161+
#' @examples
162+
#' \dontrun{
163+
#' # Get the id for the sample at the 17th row
164+
#' id <- get_id(search_data[[17,]])
165+
#' }
166+
get_id <- function(row) {
167+
# read the override id first
168+
id <- row$override_id
169+
# if the override id is NA, use the regular id instead
170+
id[is.na(id)] <- row$individ_id[is.na(id)]
171+
id
172+
}
173+
174+
#' Generate Allelematch Profile Plot
175+
#'
176+
#' @param am_dataset A allelematch dataset to examine the optimal mismatch value for
177+
#'
178+
#' @return The plotdata to show the user what the program thinks is the optimal mismatch value
179+
#' @export
180+
#'
181+
#' @examples
182+
#' \dontrun{
183+
#' library(shiny)
184+
#'
185+
#' output$plot <- renderPlot({
186+
#' generate_allelematch_plot(am_data)
187+
#' })
188+
#' }
189+
generate_allelemtach_profile_plot <- function(am_dataset) {
190+
# Generate the plot
191+
allelematch::amUniqueProfile(am_dataset, doPlot = TRUE)
192+
}
193+
194+
#' Handle Multimatch By User
195+
#' @description This function takes
196+
#'
197+
#' @param search_data The search data, a dateframe with indexes, locuses and ids
198+
#' @param multiple_matches A list with the indexes that currently matches to multiples ids in the search_data
199+
#' @param multimatch_index The index of the sample that is to get a specified id
200+
#' @param new_id The new id that the specified id and all entries in the same group will get
201+
#'
202+
#' @return A list with the updated search_data and multiple_matches
203+
#' @export
204+
#'
205+
#' @examples
206+
#' \dontrun{
207+
#' c(search_data, multiple_match) %<-% handle_multimatch
208+
#' (search_data, multiple_matches, "SEP123", "B31")
209+
#' }
210+
handle_multimatch <- function(search_data, multiple_matches, multimatch_index, new_id) {
211+
# Remove the sample from the list of samples that have multiple matches, it has been corrected by the user
212+
multiple_matches <- multiple_matches[multiple_matches != multimatch_index]
213+
# Remove all duplicated of the specific sample, the order doesnt matter as override id is the important parameter for the id
214+
# We can remove the one with the correct id and set the override id of another without problem
215+
search_data <- search_data[!(search_data$index == multimatch_index & duplicated(search_data$index)),]
216+
# Set the sample (now no duplicates, only one left) override id to be the id specified by the user
217+
search_data$override_id[search_data$index == multimatch_index] <- new_id
218+
# Set every sample that is in the same group to have a override id, maybe not necessary but to ensure the order generated by
219+
# allelelmatch doesnt change and would therefor place the user "controlled" one in a then different group
220+
search_data$override_id[get_id(search_data) == new_id & !(search_data$index %in% multiple_matches)] <- new_id
221+
222+
# Return the updated data
223+
list(search_data, multiple_matches)
224+
}
225+
226+
#' Title
227+
#'
228+
#' @param data The old data, containing all samples that have already been handeled, or been choosen not to be handleh
229+
#' @param new_data The new data, the meta-data for the new sample, containing the same columns as the old data, index, locus, date, gender, etc
230+
#' @param additional_data_columns The names of the columns containing the meta-data for the \code{\link{create_allelematch_dataset}} to be able to create a new am_dataset
231+
#' @param allele_mismatch The mismatch (numbers of alleles that are allowed to differ between individuals) to use for this grouping, user could be intrested in being more strict or more loose, up to them
232+
#'
233+
#' @return Returns only the new data grouped into search_data format, which of them that matched multiple and if some of them were unclassified
234+
#' @export
235+
#'
236+
#' @examples
237+
match_new_data <- function(data, new_data, additional_data_columns, allele_mismatch) {
238+
# Combine the data under each other, create a big data.frame
239+
print(head(data))
240+
print(head(new_data))
241+
combined_data <- rbind(data, new_data)
242+
# Create an amDataset to be able to run the grouping on all of the data
243+
am_data <- create_allelematch_dataset(data = combined_data, ignore_columns = additional_data_columns)
244+
# Get the combined data for all samples, every index and group, including which fitted into multiple and which were unclassified
245+
c(combined_search_data, combined_multiple_matches, combined_unclassified) %<-% create_search_data(data = combined_data,
246+
am_data = am_data, allele_mismatch = allele_mismatch)
247+
# Return the data gathered but filter to only keep the entries whos indexes are not in the "old" data. Thereby only passing on the new data but sorted with individ_id, if they matched multiple and if they were unclassified
248+
list(combined_search_data[!combined_search_data$index %in% data$index,],
249+
combined_multiple_matches[!combined_multiple_matches %in% data$index],
250+
combined_unclassified[!combined_unclassified$index %in% data$index])
251+
}

0 commit comments

Comments
 (0)