.Rhistory

# Some correlations.
# cor() returns NA as soon as one observation is missing, so incomplete
# pairs are dropped explicitly through `use`.
cor(activities_tidy$vo2max, activities_tidy$Avg.Cadence,
    use = "pairwise.complete.obs")
# Avg.Pace is imported with col_time(); cor() only accepts numeric input,
# so the pace is converted to a plain number first.
cor(activities_tidy$vo2max, as.numeric(activities_tidy$Avg.Pace),
    use = "pairwise.complete.obs")
cor(activities_tidy$vo2max, activities_tidy$Avg.Cadence,
    use = "pairwise.complete.obs")
cor(activities_tidy$vo2max, activities_tidy$Avg.HR,
    use = "pairwise.complete.obs")

# vo2max against average heart rate, cycling excluded
activities_tidy %>%
  filter(Activity.Type != "cycling") %>%
  ggplot(mapping = aes(Avg.HR, vo2max)) +
  geom_point() +
  geom_smooth()

# vo2max against average cadence, cycling excluded
activities_tidy %>%
  filter(Activity.Type != "cycling") %>%
  ggplot(mapping = aes(Avg.Cadence, vo2max)) +
  geom_point() +
  geom_smooth()

cor(activities_tidy$vo2max, activities_tidy$Avg.Cadence,
    use = "pairwise.complete.obs")

# We can use a correlogram to look at all numeric columns at once.
# vapply() guarantees a logical vector whatever the input looks like.
nums <- vapply(activities_tidy, is.numeric, logical(1))
# Pairwise deletion keeps every pair of columns that has data; without
# `use`, any column containing a missing value yields NA correlations.
corr <- round(cor(activities_tidy[, nums], use = "pairwise.complete.obs"), 1)
ggcorrplot(corr, lab = TRUE, lab_size = 3, method = "circle",
           colors = c("tomato2", "white", "springgreen3"))
# Plot: ground contact time per activity date, points coloured by cadence
activities_tidy %>%
  ggplot(mapping = aes(x = Date, y = Avg.Ground.Contact.Time)) +
  geom_point(mapping = aes(color = Avg.Cadence)) +
  geom_smooth()

# Plot: VO2 max over time
# VO2 max is the maximum volume of oxygen (in millilitres) you can consume
# per minute per kilogram of body weight at max performance.
activities_tidy %>%
  ggplot(mapping = aes(x = Date, y = vo2max)) +
  geom_point() +
  geom_smooth()

# Plot: maximum heart rate over time
activities_tidy %>%
  ggplot(mapping = aes(x = Date, y = Max.HR)) +
  geom_point() +
  geom_smooth()

# Plot: vertical oscillation over time
activities_tidy %>%
  ggplot(mapping = aes(x = Date, y = Avg.Vertical.Oscillation)) +
  geom_point() +
  geom_smooth()
# Correlogram of whatever `corr` currently holds.
ggcorrplot(corr, lab = TRUE, lab_size = 3, method = "circle",
           colors = c("tomato2", "white", "springgreen3"))

# Without `use`, cor() propagates missing values and returns NA.
cor(activities_tidy$vo2max, activities_tidy$Avg.Cadence)
# cor() has no `na.rm` argument; missing values are handled through `use`.
cor(activities_tidy$vo2max, activities_tidy$Avg.Cadence, use = "complete.obs")

# Casewise deletion: only rows complete in every numeric column are used.
corr <- round(cor(activities_tidy[, nums], use = "complete.obs"), 1)
ggcorrplot(corr, lab = TRUE, lab_size = 3, method = "circle",
           colors = c("tomato2", "white", "springgreen3"))
View(corr)

# Pairwise deletion: each pair of columns uses every row where both exist.
cor(activities_tidy$vo2max, activities_tidy$Avg.Cadence,
    use = "pairwise.complete.obs")
corr <- round(cor(activities_tidy[, nums], use = "pairwise.complete.obs"), 1)
ggcorrplot(corr, lab = TRUE, lab_size = 3, method = "circle",
           colors = c("tomato2", "white", "springgreen3"))

# Correlation p-values. cor_pmat() runs the tests on the observations
# themselves, so it is given the numeric columns rather than the rounded
# correlation matrix.
cor_pmat(activities_tidy[, nums])

# Same correlogram, with variables reordered by hierarchical clustering.
ggcorrplot(corr, hc.order = TRUE, lab = TRUE, lab_size = 3, method = "circle",
           colors = c("tomato2", "white", "springgreen3"))
corr <- round(cor(activities_tidy[, nums], use = "pairwise.complete.obs"), 1)
ggcorrplot(corr, hc.order = TRUE, lab = TRUE, lab_size = 3, method = "circle",
           colors = c("tomato2", "white", "springgreen3"))
ggcorrplot(corr, lab = TRUE, lab_size = 3, method = "circle",
           colors = c("tomato2", "white", "springgreen3"))
View(activities_tidy)

# review the structure (first five rows only, to keep the output short)
str(activities_raw[1:5, ])

# frequency distribution of vo2max per activity type, cycling excluded;
# y is mapped to density instead of count
activities_tidy %>%
  filter(Activity.Type != "cycling") %>%
  ggplot(aes(vo2max, y = ..density.., color = Activity.Type)) +
  geom_freqpoly()

View(activities_raw)

# load libraries and prerequisites
library(tidyverse)
library(ggcorrplot)

# summary statistics for columns 5 to 10
# (the column index goes after the comma; `[5:10, ]` would pick rows 5-10)
# fivenum(activities_raw$`Avg Ground Contact Time`)
summary(activities_raw[, 5:10])
# Import the activity export. "Time" is forced to character on import;
# otherwise it would be read as a time column and values would be lost.
# The string "--" marks a missing value.
activities_raw <- read_csv(
  "Activities_Master.csv",
  col_types = cols(
    Date = col_datetime(format = "%d/%m/%Y %H:%M"),
    Time = col_character(),
    `Avg Pace` = col_time(format = "%M:%S")
  ),
  na = "--"
)

# some useful readr functions:
# spec(activities_raw)      # review parsed column formats
# problems(activities_raw)  # review problems during parsing
# activities_raw            # review the final "tibble"

# make the field names syntactically valid (spaces become dots)
names(activities_raw) <- make.names(names(activities_raw))

# some basic information (rows and columns)
dim(activities_raw)

# summary statistics for columns 5 to 10
# fivenum(activities_raw$`Avg Ground Contact Time`)
summary(activities_raw[, 5:10])

# remove blank columns: keep the first 20 only
activities_raw <- activities_raw %>%
  select(1:20)

# review the structure (first five rows only)
str(activities_raw[1:5, ])

# the first rows of data
head(activities_raw)

# similarly, the last rows of data
tail(activities_raw)

# counts per activity type - table with means?
table(activities_raw$Activity.Type) # I guess i'm not a cyclist

# check for blanks and n/a
# missing data as % of dataset (think where that's referenced)

# Split "Time" into its component fields, converted to integers where possible
activities_tidy <- activities_raw %>%
  separate(Time, into = c("minute", "second", "centisecond"), convert = TRUE)

activities_tidy <- activities_tidy %>%
  mutate(
    # Total minutes: a first field below 5 is treated as hours, so the
    # first two fields are combined as hours * 60 + minutes; otherwise the
    # first field alone is taken as the minute count.
    # NOTE(review): the remaining field is ignored in both branches -
    # confirm that dropping it is intended.
    tot_mins = ifelse(minute < 5, minute * 60 + second, minute),
    # elevation gain as a categorical variable, "high" above 250
    gain = ifelse(Elev.Gain > 250, "high", "low"),
    # bin the timestamps to weeks and months:
    # https://www.r-bloggers.com/plot-weekly-or-monthly-totals-in-r/
    week = as.Date(cut(Date, breaks = "week")),
    month = as.Date(cut(Date, breaks = "month")),
    # metres per minute
    mpm = (Distance * 1000) / tot_mins,
    # VO2 from metres per minute
    vo2 = -4.6 + 0.182 * mpm + 0.000104 * mpm^2,
    # % of VO2 max from the average / maximum heart-rate ratio
    percentVO2 = (Avg.HR / Max.HR - 0.37) / 0.64,
    # VO2 max
    vo2max = vo2 / percentVO2
  )

# Gauge (not implemented yet)
#activities_tidy$cadence_gauge <- activities_tidy %>%
#        if (`Avg Cadence` > 183) {return("purple")}

View(activities_tidy)
# check for blanks and n/a: inspect the pace column
x0 <- activities_raw$Avg.Pace
class(x0)
str(x0)
summary(x0)
View(activities_raw)

# check for blanks and n/a: inspect the cadence column
x0 <- activities_raw$Avg.Cadence
class(x0)
str(x0)
summary(x0)
mean(is.na(x0))              # share of missing cadence values

is.na(activities_raw)        # logical matrix, TRUE where a cell is missing
mean(is.na(activities_raw))  # share of missing cells in the whole table

# different ways of subsetting and what they do to the dimensions
activities_raw[1]
dim(activities_raw[1])
dim(activities_raw[, 1])
dim(activities_raw[1, ])
dim(activities_raw)
x <- dim(activities_raw)
x
x[1]

# total number of cells (rows * columns)
dim(activities_raw)[1] * dim(activities_raw)[2]
# load libraries and prerequisites
library(tidyverse)
library(ggcorrplot)

# Import the activity export. "Time" is forced to character on import;
# otherwise it would be read as a time column and values would be lost.
# The string "--" marks a missing value.
activities_raw <- read_csv(
  "Activities_Master.csv",
  col_types = cols(
    Date = col_datetime(format = "%d/%m/%Y %H:%M"),
    Time = col_character(),
    `Avg Pace` = col_time(format = "%M:%S")
  ),
  na = "--"
)

# some useful readr functions:
# spec(activities_raw)      # review parsed column formats
# problems(activities_raw)  # review problems during parsing
# activities_raw            # review the final "tibble"

# make the field names syntactically valid (spaces become dots)
names(activities_raw) <- make.names(names(activities_raw))

# some basic information (rows and columns)
dim(activities_raw)

# summary statistics for columns 5 to 10
# fivenum(activities_raw$`Avg Ground Contact Time`)
summary(activities_raw[, 5:10])

# remove blank columns: keep the first 20 only
activities_raw <- activities_raw %>%
  select(1:20)

# review the structure (first five rows only)
str(activities_raw[1:5, ])

# the first rows of data
head(activities_raw)

# similarly, the last rows of data
tail(activities_raw)

# counts per activity type - table with means?
table(activities_raw$Activity.Type) # I guess i'm not a cyclist
# check for blanks and n/a
# x0 <- activities_raw$Avg.Cadence
# class(x0)
# str(x0)
# summary(x0) # we can see we have some NA's
# mean(is.na(x0)) # is that missing data a problem?
# mean(is.na(activities_raw))

# Rows with at least one missing value. filter() selects rows and needs one
# logical value per row; rowSums() over the is.na() matrix provides that.
filter(activities_raw, rowSums(is.na(activities_raw)) > 0)

# Columns with at least one missing value. select() does not accept a
# logical vector, so plain `[` indexing is used for the column mask.
activities_raw[, colSums(is.na(activities_raw)) > 0]

# Rows where the average cadence is missing.
filter(activities_raw, is.na(Avg.Cadence))
# counts per activity type - table with means?
table(activities_raw$Activity.Type) # I guess i'm not a cyclist

# Split "Time" into its component fields, converted to integers where possible
activities_tidy <- activities_raw %>%
  separate(Time, into = c("minute", "second", "centisecond"), convert = TRUE)

activities_tidy <- activities_tidy %>%
  mutate(
    # Total minutes: a first field below 5 is treated as hours, so the
    # first two fields are combined as hours * 60 + minutes; otherwise the
    # first field alone is taken as the minute count.
    # NOTE(review): the remaining field is ignored in both branches -
    # confirm that dropping it is intended.
    tot_mins = ifelse(minute < 5, minute * 60 + second, minute),
    # elevation gain as a categorical variable, "high" above 250
    gain = ifelse(Elev.Gain > 250, "high", "low"),
    # bin the timestamps to weeks and months:
    # https://www.r-bloggers.com/plot-weekly-or-monthly-totals-in-r/
    week = as.Date(cut(Date, breaks = "week")),
    month = as.Date(cut(Date, breaks = "month")),
    # metres per minute
    mpm = (Distance * 1000) / tot_mins,
    # VO2 from metres per minute
    vo2 = -4.6 + 0.182 * mpm + 0.000104 * mpm^2,
    # % of VO2 max from the average / maximum heart-rate ratio
    percentVO2 = (Avg.HR / Max.HR - 0.37) / 0.64,
    # VO2 max
    vo2max = vo2 / percentVO2
  )

# Gauge (not implemented yet)
#activities_tidy$cadence_gauge <- activities_tidy %>%
#        if (`Avg Cadence` > 183) {return("purple")}

# check for blanks and n/a
# x0 <- activities_raw$Avg.Cadence
# class(x0)
# str(x0)
# summary(x0) # we can see we have some NA's
# mean(is.na(x0)) # is that missing data a problem?
# mean(is.na(activities_raw))

# rows of the raw table without a recorded cadence
activities_raw %>%
  filter(is.na(Avg.Cadence))

# the same rows, taken from the tidied table
activities_tidy %>%
  filter(is.na(Avg.Cadence))

View(activities_tidy)
# outliers
# sd() returns NA if any value is missing, so missing vo2max values are
# dropped explicitly.
sd(activities_tidy$vo2max, na.rm = TRUE)

# We can use a correlogram to look at all numeric columns at once.
# vapply() guarantees a logical vector whatever the input looks like.
nums <- vapply(activities_tidy, is.numeric, logical(1))
corr <- round(cor(activities_tidy[, nums], use = "pairwise.complete.obs"), 1)
ggcorrplot(corr, lab = TRUE, lab_size = 3, method = "circle",
           colors = c("tomato2", "white", "springgreen3"))

# scatter plot for all numeric variables
pairs(activities_tidy[, nums])

# help lookup for a ggplot2-style scatter-plot matrix
?plotmatrix
library(ggplot2)

# load libraries and prerequisites
library(tidyverse)
library(ggcorrplot)

# frequency distribution of log(vo2max) per activity type, cycling excluded
activities_tidy %>%
  filter(Activity.Type != "cycling") %>%
  # note: y has been set to density instead of count
  ggplot(aes(log(vo2max), y = ..density.., color = Activity.Type)) +
  geom_freqpoly()
# Import the activity export. "Time" is forced to character on import;
# otherwise it would be read as a time column and values would be lost.
# The string "--" marks a missing value.
activities_raw <- read_csv(
  "Activities_Master.csv",
  col_types = cols(
    Date = col_datetime(format = "%d/%m/%Y %H:%M"),
    Time = col_character(),
    `Avg Pace` = col_time(format = "%M:%S")
  ),
  na = "--"
)

# some useful readr functions:
# spec(activities_raw)      # review parsed column formats
# problems(activities_raw)  # review problems during parsing
# activities_raw            # review the final "tibble"

# make the field names syntactically valid (spaces become dots)
names(activities_raw) <- make.names(names(activities_raw))

# some basic information (rows and columns)
dim(activities_raw)

# summary statistics for columns 5 to 10
# fivenum(activities_raw$`Avg Ground Contact Time`)
summary(activities_raw[, 5:10])

# remove blank columns: keep the first 20 only
activities_raw <- activities_raw %>%
  select(1:20)

# review the structure (first five rows only)
str(activities_raw[1:5, ])

# the first rows of data
head(activities_raw)

# similarly, the last rows of data
tail(activities_raw)

# counts per activity type - table with means?
table(activities_raw$Activity.Type) # I guess i'm not a cyclist

# Split "Time" into its component fields, converted to integers where possible
activities_tidy <- activities_raw %>%
  separate(Time, into = c("minute", "second", "centisecond"), convert = TRUE)

activities_tidy <- activities_tidy %>%
  mutate(
    # Total minutes: a first field below 5 is treated as hours, so the
    # first two fields are combined as hours * 60 + minutes; otherwise the
    # first field alone is taken as the minute count.
    # NOTE(review): the remaining field is ignored in both branches -
    # confirm that dropping it is intended.
    tot_mins = ifelse(minute < 5, minute * 60 + second, minute),
    # elevation gain as a categorical variable, "high" above 250
    gain = ifelse(Elev.Gain > 250, "high", "low"),
    # bin the timestamps to weeks and months:
    # https://www.r-bloggers.com/plot-weekly-or-monthly-totals-in-r/
    week = as.Date(cut(Date, breaks = "week")),
    month = as.Date(cut(Date, breaks = "month")),
    # metres per minute
    mpm = (Distance * 1000) / tot_mins,
    # VO2 from metres per minute
    vo2 = -4.6 + 0.182 * mpm + 0.000104 * mpm^2,
    # % of VO2 max from the average / maximum heart-rate ratio
    percentVO2 = (Avg.HR / Max.HR - 0.37) / 0.64,
    # VO2 max
    vo2max = vo2 / percentVO2
  )

# Gauge (not implemented yet)
#activities_tidy$cadence_gauge <- activities_tidy %>%
#        if (`Avg Cadence` > 183) {return("purple")}

# check for blanks and n/a
# x0 <- activities_raw$Avg.Cadence
# class(x0)
# str(x0)
# summary(x0) # we can see we have some NA's
# mean(is.na(x0)) # is that missing data a problem?
# mean(is.na(activities_raw))

# rows of the tidied table without a recorded cadence
activities_tidy %>%
  filter(is.na(Avg.Cadence))
# outliers
# normalize VO2 Max
# standard deviation ranges
# sd(activities_tidy$vo2max)

# frequency distribution of log(vo2max) per activity type, cycling excluded
activities_tidy %>%
  filter(Activity.Type != "cycling") %>%
  # note: y has been set to density instead of count
  ggplot(aes(log(vo2max), y = ..density.., color = Activity.Type)) +
  geom_freqpoly()

# frequency distribution of log(vo2max), running only
activities_tidy %>%
  filter(Activity.Type == "running") %>%
  # note: y has been set to density instead of count
  ggplot(aes(log(vo2max), y = ..density.., color = Activity.Type)) +
  geom_freqpoly()

# frequency distribution of untransformed vo2max, running only.
# All aesthetics belong inside aes(), and the chain must end in a layer
# rather than a trailing `+`.
activities_tidy %>%
  filter(Activity.Type == "running") %>%
  # note: y has been set to density instead of count
  ggplot(aes(vo2max, y = ..density.., color = Activity.Type)) +
  geom_freqpoly()
# Frequency distributions of vo2max, one activity type at a time.
# Unless stated otherwise y is mapped to density instead of count.

# running, log scale
activities_tidy %>%
  filter(Activity.Type == "running") %>%
  ggplot(aes(x = log(vo2max), y = ..density.., color = Activity.Type)) +
  geom_freqpoly()

# "track running", log scale
activities_tidy %>%
  filter(Activity.Type == "track running") %>%
  ggplot(aes(x = log(vo2max), y = ..density.., color = Activity.Type)) +
  geom_freqpoly()

# "trail running", log scale
activities_tidy %>%
  filter(Activity.Type == "trail running") %>%
  ggplot(aes(x = log(vo2max), y = ..density.., color = Activity.Type)) +
  geom_freqpoly()

# "track_running" (underscore spelling), log scale
activities_tidy %>%
  filter(Activity.Type == "track_running") %>%
  ggplot(aes(x = log(vo2max), y = ..density.., color = Activity.Type)) +
  geom_freqpoly()

# running, log scale
activities_tidy %>%
  filter(Activity.Type == "running") %>%
  ggplot(aes(x = log(vo2max), y = ..density.., color = Activity.Type)) +
  geom_freqpoly()

# running, log scale, y left at the default
activities_tidy %>%
  filter(Activity.Type == "running") %>%
  ggplot(aes(x = log(vo2max), color = Activity.Type)) +
  geom_freqpoly()

# running, log scale, density again
activities_tidy %>%
  filter(Activity.Type == "running") %>%
  ggplot(aes(x = log(vo2max), y = ..density.., color = Activity.Type)) +
  geom_freqpoly()

# running, log scale, bin width 10
activities_tidy %>%
  filter(Activity.Type == "running") %>%
  ggplot(aes(x = log(vo2max), y = ..density.., color = Activity.Type)) +
  geom_freqpoly(binwidth = 10)

# running, log scale, bin width 1
activities_tidy %>%
  filter(Activity.Type == "running") %>%
  ggplot(aes(x = log(vo2max), y = ..density.., color = Activity.Type)) +
  geom_freqpoly(binwidth = 1)

# running, log scale, bin width 30
activities_tidy %>%
  filter(Activity.Type == "running") %>%
  ggplot(aes(x = log(vo2max), y = ..density.., color = Activity.Type)) +
  geom_freqpoly(binwidth = 30)

# running, log scale, default bins
activities_tidy %>%
  filter(Activity.Type == "running") %>%
  ggplot(aes(x = log(vo2max), y = ..density.., color = Activity.Type)) +
  geom_freqpoly()

# running, log scale, bin width 0.1
activities_tidy %>%
  filter(Activity.Type == "running") %>%
  ggplot(aes(x = log(vo2max), y = ..density.., color = Activity.Type)) +
  geom_freqpoly(binwidth = 0.1)

# running, untransformed vo2max, bin width 0.1
activities_tidy %>%
  filter(Activity.Type == "running") %>%
  ggplot(aes(x = vo2max, y = ..density.., color = Activity.Type)) +
  geom_freqpoly(binwidth = 0.1)

# running, log scale, bin width 0.1
activities_tidy %>%
  filter(Activity.Type == "running") %>%
  ggplot(aes(x = log(vo2max), y = ..density.., color = Activity.Type)) +
  geom_freqpoly(binwidth = 0.1)
# Print the computed VO2 max values. vo2max is a column of activities_tidy,
# not a standalone object, so it has to be reached through the data frame.
activities_tidy$vo2max

# frequency distribution of untransformed vo2max, running only
activities_tidy %>%
  filter(Activity.Type == "running") %>%
  # note: y has been set to density instead of count
  ggplot(aes(vo2max, y = ..density.., color = Activity.Type)) +
  geom_freqpoly(binwidth = 0.1)

# frequency distribution of log(vo2max), running only
activities_tidy %>%
  filter(Activity.Type == "running") %>%
  # note: y has been set to density instead of count
  ggplot(aes(log(vo2max), y = ..density.., color = Activity.Type)) +
  geom_freqpoly(binwidth = 0.1)

# vo2max against average pace, cycling excluded
activities_tidy %>%
  filter(Activity.Type != "cycling") %>%
  ggplot(mapping = aes(Avg.Pace, vo2max)) +
  geom_point() +
  geom_smooth()

# We can use a correlogram to look at all numeric columns at once.
# vapply() guarantees a logical vector whatever the input looks like.
nums <- vapply(activities_tidy, is.numeric, logical(1))
corr <- round(cor(activities_tidy[, nums], use = "pairwise.complete.obs"), 1)
ggcorrplot(corr, lab = TRUE, lab_size = 3, method = "circle",
           colors = c("tomato2", "white", "springgreen3"))

# scatter plot for all numeric variables
pairs(activities_tidy[, nums])
# load libraries and prerequisites
library(tidyverse)
library(ggcorrplot)

# Import the activity export. "Time" is forced to character on import;
# otherwise it would be read as a time column and values would be lost.
# The string "--" marks a missing value.
activities_raw <- read_csv(
  "Activities_Master.csv",
  col_types = cols(
    Date = col_datetime(format = "%d/%m/%Y %H:%M"),
    Time = col_character(),
    `Avg Pace` = col_time(format = "%M:%S")
  ),
  na = "--"
)

# some useful readr functions:
# spec(activities_raw)      # review parsed column formats
# problems(activities_raw)  # review problems during parsing
# activities_raw            # review the final "tibble"

# make the field names syntactically valid (spaces become dots)
names(activities_raw) <- make.names(names(activities_raw))

View(activities_raw)