diff --git a/code/session_1_template.R b/code/session_1_template.R deleted file mode 100644 index 6a16123..0000000 --- a/code/session_1_template.R +++ /dev/null @@ -1,203 +0,0 @@ -## Programming for Professional Research Using R -- Session 1 - - ### Session Description - - # Basic R Use – Introduction to R and RStudio, How to import/export data - # Using Basic Tidyverse Functions – how to subset, mutate, summarize, and reshape data - # Using R in a Collaborative Setting – Introduction to downloading and publishing data using - # GitHub - - ### Resources - - # For those who are fully new to R, I strongly recommend: Thomas Mock, “A Gentle Introduction - # to Tidy Statistics in R” – blog post - # (https://themockup.blog/posts/2018-12-10-a-gentle-guide-to-tidy-statistics-inr/) and video - # (https://www.rstudio.com/resources/webinars/a-gentleintroduction-to-tidy-statistics-in-r/). - # We won’t have much time to review the basics of R programming during the first session. - - # Dominic Royé, “A very short introduction to Tidyverse” - # (https://dominicroye.github.io/en/2020/a-very-short-introduction-totidyverse/). Blog post - # covering the basics of Tidyverse use in R. - - # tidyr, “Pivoting” (https://tidyr.tidyverse.org/articles/pivot.html). Vignette explaining how - # to reshape datasets using pivot_longer and pivot_longr. - - # Hadley Wickham, “dplyr 1.0.0: working across columns” - # (https://www.tidyverse.org/blog/2020/04/dplyr-1-0-0-colwise/). Explains the basics for - # flexible column-wise operations using across in R. - - ### Session Objectives - - # Learn how to: - # Import data in an efficient and reproducible manner - # Filter, mutate, group, and summarize data using Tidyverse functions - # Reshape data using Tidyverse functions - - # Be introduced to: - # Code and data collaboration -- GitHub and data project reproducibility - - # Practice the above! - - ## 1. Setup ---- - - ### Packages Random - -# NOTE -- Unlike using library(), the 'pacman::p_load()' function installs the package if it is -# already not present in the user's R environment. - - if(!require(pacman)) install.packages("pacman") - - pacman::p_load(tidyverse, data.table, janitor, usethis) - - ## 2. Import Data ---- - -# This is session 1 so we're going to import the data from Dropbox: - - if(!(file.exists("data/final/wvs_values_norms_data.csv"))) { # Checks whether the data has been - # downloaded already - usethis::use_zip( - "https://www.dropbox.com/scl/fo/vnxjbqyq1g9z368coh1vq/h?rlkey=zc66o2ll7613b5e9ipp915ynk&dl=1" - ) - - } - - norms_values_data <- data.table::fread( # Other options are base R's read.csv() and data.table::fread() - "data/final/wvs_values_norms_data.csv", na.strings = "" - ) - - ## 3. Data 'Wrangling' ---- - -# Goal -- Determine what European countries feel is important in their life -> create a dataset -# summarizing each European country's opinion on different life subjects - - ### Step 1 -- Subset dataset to keep only European countries ---- - -# This dataset doesn't have a "continent" variable, so I will have to create one - -# Check the names of the variables in my dataset - - norms_values_data %>% - names() - -# Check which countries are in my dataset - - norms_values_data %>% - janitor::tabyl(B_COUNTRY_ALPHA) - -# Use ISO codes to distinguish European countries - - european_iso_codes <- c( # This creates a character list - "AND", "CYP", "DEU", "GRC", "RUS", "SRB", "TUR", "UKR" - ) - - norms_values_data <- norms_values_data %>% - dplyr::mutate( - european = dplyr::case_when( # This creates a dummy variable - B_COUNTRY_ALPHA %in% european_iso_codes ~ 1, - TRUE ~ 0 - ) - ) - -# Check that it worked - - norms_values_data %>% - janitor::tabyl(european, B_COUNTRY_ALPHA) - -# Now subset using filter() - - european_data <- norms_values_data %>% - dplyr::filter(european == 1) - - ### Step 2 -- Select relevant variables ---- - -# We want to look at what people find important in life. Those are questions Q1-Q6. - -# So we keep those questions, as well as D_INTERVIEW (unique ID, always keep) and B_COUNTRY_ALPHA - - european_data <- european_data %>% - dplyr::select( - D_INTERVIEW, B_COUNTRY_ALPHA, dplyr::matches("^Q0[1-6]") - # matches() allows us to select multiple variables at once using a common string in - # their name - ) - - ### Step 3 -- Clean variables ---- - -# Will explore this further in next week's session -- need to encode answers as "NA" to continue -# using numerically - - european_data <- european_data %>% - dplyr::mutate( - dplyr::across( # Across is magic and allows you to modify multiple variables at once - matches("^Q0[1-6]"), # Same as for select() above. IMPORTANT -- all variables need - # to be of the same 'type' (e.g. character, numeric) - ~ case_when( # across() basically loops over every relevant variable. '.x' refers to - # each variable being treated - .x %in% c(-1, -2, -4, -5) ~ NA_integer_, - # Key to understand what NA_ to use.Here the possible answers will be 1, 2, 3, - # 4 so those are integers - TRUE ~ .x # We've created NAs and are leaving the other - # values alone - ) - ) - ) - - ### Step 4 -- Summarize variables at the country level ---- - -# We want a dataset where each observation (row) is a country, not a household. To do this, we use -# group_by() and summarize() - - european_country_data <- european_data %>% # New observation level so new dataset - dplyr::group_by(B_COUNTRY_ALPHA) %>% # We're telling R at which level to do the grouping - dplyr::summarize( # Summarize aggregated values based on what we instruct it to do. If we - # didn't use group_by(), it would summarize to one single value. Here, - # it will output one value per country. - dplyr::across( # Using across() again to summarize multiple variables at once - matches("^Q0[1-6]"), - ~ mean(.x, na.rm = TRUE) # Removing NAs and taking the mean of each variable - ) - ) %>% - dplyr::ungroup() # Always remember to do this! Otherwise your future code will do weird - # things - - ### Step 5 -- Create a question-level dataset ---- - -# The above is useful. But what if we want to sum up countries' values for all of these questions, -# or look at each country's 'average' enthusiasm, it'll be easier with a 'long' dataset than a -# 'wide' one. - - european_country_data_long <- european_country_data %>% - tidyr::pivot_longer( - cols = matches("^Q0[1-6]"), # Variables whose data we want to be in a single, - # 'long' variable - names_to = "topic", # Creates a variable named 'topic' that saves the variable names - values_to = "score" # Creates a 'long' variable named 'score' that holds all of the - # original values - ) %>% - dplyr::mutate( # I don't like how 'topic' has more information than necessary - topic = stringr::str_replace( # stringr is the best package for string manipulation - topic, - "^Q0[1-6]_life_", - "" - ) - ) - -# Check that it worked - - european_country_data_long %>% janitor::tabyl(topic) # It did! - -# Now for example I can look at average 'enthusiasm' by country - - average_country_enthusiasm <- european_country_data_long %>% - dplyr::group_by(B_COUNTRY_ALPHA) %>% - dplyr::summarize( - average_score = mean(score, na.rm = TRUE) - ) %>% - dplyr::ungroup() %>% - dplyr::arrange(average_score) # Order them from highest to lowest enthusiasm (NOTE -- - # smaller number means more enthusiasm) - -# check what we created: - - average_country_enthusiasm %>% head() - \ No newline at end of file diff --git a/code/session_2_template.R b/code/session_2_template.R index 4ae0c68..c91baa5 100644 --- a/code/session_2_template.R +++ b/code/session_2_template.R @@ -2,44 +2,57 @@ ### Session Description - # (Continuing from last week) Using Basic Tidyverse Functions – how to subset, mutate, summarize, - # and reshape data - # Data Cleaning -- Simple checks and manipulations to clean raw data - # Introduction to "tidy" datasets and how to get to them + # Basic R Use – Introduction to R and RStudio, How to import/export data + # Using Basic Tidyverse Functions – how to subset, mutate, summarize, and reshape data + # Using R in a Collaborative Setting – Introduction to downloading and publishing data using + # GitHub ### Resources - # DIME, “Data Cleaning” (https://dimewiki.worldbank.org/Data_Cleaning). Instructions on how to clean - # raw household data for use in a development setting. + # For those who are fully new to R, I strongly recommend: Thomas Mock, “A Gentle Introduction + # to Tidy Statistics in R” – blog post + # (https://themockup.blog/posts/2018-12-10-a-gentle-guide-to-tidy-statistics-inr/) and video + # (https://www.rstudio.com/resources/webinars/a-gentleintroduction-to-tidy-statistics-in-r/). + # We won’t have much time to review the basics of R programming during the first session. - # Wickham and Grolemund, R for Data Science Chapter 12 – Tidy Data (https://r4ds.had.co.nz/tidy-data.html). - # How to structure (“tidy”) your dataset for flexible use in data analysis. + # Dominic Royé, “A very short introduction to Tidyverse” + # (https://dominicroye.github.io/en/2020/a-very-short-introduction-totidyverse/). Blog post + # covering the basics of Tidyverse use in R. + + # tidyr, “Pivoting” (https://tidyr.tidyverse.org/articles/pivot.html). Vignette explaining how + # to reshape datasets using pivot_longer and pivot_longr. + + # Hadley Wickham, “dplyr 1.0.0: working across columns” + # (https://www.tidyverse.org/blog/2020/04/dplyr-1-0-0-colwise/). Explains the basics for + # flexible column-wise operations using across in R. ### Session Objectives # Learn how to: + # Import data in an efficient and reproducible manner # Filter, mutate, group, and summarize data using Tidyverse functions # Reshape data using Tidyverse functions - # Check for duplicates and encode missing values # Be introduced to: - # "Tidy" datasets and how to create them using pivot_longer() and pivot_wider() + # Code and data collaboration -- GitHub and data project reproducibility # Practice the above! ## 1. Setup ---- - ### Packages + ### Packages Random -# NOTE -- Unlike using library(), the 'pacman::p_load()' function installs the package if it is already -# not present in the user's R environment. +# NOTE -- Unlike using library(), the 'pacman::p_load()' function installs the package if it is +# already not present in the user's R environment. - if(!require(pacman)) install.packages("pacman") + if(!require(pacman)) install.packages("pacman") - pacman::p_load(tidyverse, data.table, janitor) + pacman::p_load(tidyverse, data.table, janitor, usethis) ## 2. Import Data ---- +# This is session 1 so we're going to import the data from Dropbox: + if(!(file.exists("data/final/wvs_values_norms_data.csv"))) { # Checks whether the data has been # downloaded already usethis::use_zip( @@ -48,32 +61,30 @@ } - norms_values_data <- data.table::fread( - "data/final/wvs_values_norms_data.csv", - na.strings = "" - + norms_values_data <- read.csv( # Other options are base R's read.csv() and data.table::fread() + "data/final/wvs_values_norms_data.csv", na.strings = "" ) - ## 3. SESSION 1 -- Data 'Wrangling' ---- + ## 3. Data 'Wrangling' ---- - # Goal -- Determine what European countries feel is important in their life -> create a dataset - # summarizing each European country's opinion on different life subjects +# Goal -- Determine what European countries feel is important in their life -> create a dataset +# summarizing each European country's opinion on different life subjects - ### Step 1 -- Subset dataset to keep only European countries ---- + ### Step 1 -- Subset dataset to keep only European countries ---- - # This dataset doesn't have a "continent" variable, so I will have to create one +# This dataset doesn't have a "continent" variable, so I will have to create one - # Check the names of the variables in my dataset +# Check the names of the variables in my dataset norms_values_data %>% names() - # Check which countries are in my dataset +# Check which countries are in my dataset norms_values_data %>% janitor::tabyl(B_COUNTRY_ALPHA) - # Use ISO codes to distinguish European countries +# Use ISO codes to distinguish European countries european_iso_codes <- c( # This creates a character list "AND", "CYP", "DEU", "GRC", "RUS", "SRB", "TUR", "UKR" @@ -87,21 +98,21 @@ ) ) - # Check that it worked +# Check that it worked norms_values_data %>% janitor::tabyl(european, B_COUNTRY_ALPHA) - # Now subset using filter() +# Now subset using filter() european_data <- norms_values_data %>% dplyr::filter(european == 1) - ### Step 2 -- Select relevant variables ---- + ### Step 2 -- Select relevant variables ---- - # We want to look at what people find important in life. Those are questions Q1-Q6. +# We want to look at what people find important in life. Those are questions Q1-Q6. - # So we keep those questions, as well as D_INTERVIEW (unique ID, always keep) and B_COUNTRY_ALPHA +# So we keep those questions, as well as D_INTERVIEW (unique ID, always keep) and B_COUNTRY_ALPHA european_data <- european_data %>% dplyr::select( @@ -110,58 +121,58 @@ # their name ) - ### Step 3 -- Clean variables ---- + ### Step 3 -- Clean variables ---- - # Will explore this further in next week's session -- need to encode answers as "NA" to continue - # using numerically +# Will explore this further in next week's session -- need to encode answers as "NA" to continue +# using numerically european_data <- european_data %>% dplyr::mutate( dplyr::across( # Across is magic and allows you to modify multiple variables at once matches("^Q0[1-6]"), # Same as for select() above. IMPORTANT -- all variables need - # to be of the same 'type' (e.g. character, numeric) + # to be of the same 'type' (e.g. character, numeric) ~ case_when( # across() basically loops over every relevant variable. '.x' refers to - # each variable being treated + # each variable being treated .x %in% c(-1, -2, -4, -5) ~ NA_integer_, # Key to understand what NA_ to use.Here the possible answers will be 1, 2, 3, # 4 so those are integers TRUE ~ .x # We've created NAs and are leaving the other - # values alone + # values alone ) ) ) - ### Step 4 -- Summarize variables at the country level ---- + ### Step 4 -- Summarize variables at the country level ---- - # We want a dataset where each observation (row) is a country, not a household. To do this, we use - # group_by() and summarize() +# We want a dataset where each observation (row) is a country, not a household. To do this, we use +# group_by() and summarize() european_country_data <- european_data %>% # New observation level so new dataset dplyr::group_by(B_COUNTRY_ALPHA) %>% # We're telling R at which level to do the grouping dplyr::summarize( # Summarize aggregated values based on what we instruct it to do. If we - # didn't use group_by(), it would summarize to one single value. Here, - # it will output one value per country. + # didn't use group_by(), it would summarize to one single value. Here, + # it will output one value per country. dplyr::across( # Using across() again to summarize multiple variables at once matches("^Q0[1-6]"), ~ mean(.x, na.rm = TRUE) # Removing NAs and taking the mean of each variable ) ) %>% dplyr::ungroup() # Always remember to do this! Otherwise your future code will do weird - # things + # things - ### Step 5 -- Create a question-level dataset ---- + ### Step 5 -- Create a question-level dataset ---- - # The above is useful. But what if we want to sum up countries' values for all of these questions, - # or look at each country's 'average' enthusiasm, it'll be easier with a 'long' dataset than a - # 'wide' one. +# The above is useful. But what if we want to sum up countries' values for all of these questions, +# or look at each country's 'average' enthusiasm, it'll be easier with a 'long' dataset than a +# 'wide' one. - european_country_data_wide <- european_country_data %>% + european_country_data_long <- european_country_data %>% tidyr::pivot_longer( cols = matches("^Q0[1-6]"), # Variables whose data we want to be in a single, - # 'long' variable + # 'long' variable names_to = "topic", # Creates a variable named 'topic' that saves the variable names values_to = "score" # Creates a 'long' variable named 'score' that holds all of the - # original values + # original values ) %>% dplyr::mutate( # I don't like how 'topic' has more information than necessary topic = stringr::str_replace( # stringr is the best package for string manipulation @@ -171,244 +182,22 @@ ) ) - # Check that it worked +# Check that it worked - european_country_data_wide %>% janitor::tabyl(topic) # It did! + european_country_data_long %>% janitor::tabyl(topic) # It did! - # Now for example I can look at average 'enthusiasm' by country +# Now for example I can look at average 'enthusiasm' by country - average_country_enthusiasm <- european_country_data_wide %>% + average_country_enthusiasm <- european_country_data_long %>% dplyr::group_by(B_COUNTRY_ALPHA) %>% dplyr::summarize( average_score = mean(score, na.rm = TRUE) ) %>% dplyr::ungroup() %>% dplyr::arrange(average_score) # Order them from highest to lowest enthusiasm (NOTE -- - # smaller number means more enthusiasm) - - # check what we created: - - average_country_enthusiasm %>% head() + # smaller number means more enthusiasm) - ## 4. SESSION 2 -- Data Cleaning/Tidy Data ---- - -## This will show you some basic tasks that you can do with raw data to remove -## possible errors prior to analysis: - - ## Check for duplicate observations -> i.e. rows should be uniquely - ## identified - ## Encode missing values -> answers such as "don't know" or "refused to - ## respond" shouldn't be negative but NA - ## Check and confirm numerical outliers (not covered here) - - ### Step 1 -- Check for duplicate observations ---- - -# In the norms_values_data dataset, the unique identifier for each observation -# is "D_INTERVIEW". It should be the case that each ID is only used once in -# the dataset. To check for duplicates, this is the preferred method: - - duplicates <- norms_values_data %>% - group_by(D_INTERVIEW) %>% - # Grouping at the ID level, we'll be able to check how many times it - # shows up - dplyr::summarize( - num = n() - # n() counts the number of instances of each ID! Just run up to - # summarize() and add %>% head() to see what it does - ) %>% - filter(num > 1) - -# 36 duplicates, uh oh. They all seem to be similar in "order" so I'll check in -# which countries they occurred: - - duplicates <- duplicates %>% - left_join( # left_join() allows us to 'merge' datasets, in this case - # merging norms_values_data into the duplicates dataset to - # add countries - norms_values_data %>% - select(D_INTERVIEW, B_COUNTRY_ALPHA) %>% - # Need to have a common variable (D_INTERVIEW) between - # norms_values_data and duplicates, and we only want to add - # country name - distinct() # Need to do this BECAUSE of the duplicate IDs - ) - -# NOTE -- "left_join" means that only the observations from the first (left) -# dataset (based on value of D_INTERVIEW), here "duplicates", are kept. We -# could have used "right_join" if we wanted to only keep the observations from -# the second (right) dataset, here "norms_values_data", "full_join" if we -# wanted to keep both, or "inner_join" if we only wanted to keep observations -# that appear in both datasets - -# Check countries: - - duplicates %>% - janitor::tabyl(B_COUNTRY_ALPHA) # Uh oh, Mongolia! - -# There is an issue with the Mongolian IDs. Usually we would have to contact -# the data collection team to determine what the problem is. In this case we -# can't do that, so... Just remove all observations from Mongolia because this -# is strange - - norms_values_data <- norms_values_data %>% - filter(B_COUNTRY_ALPHA != "MNG") - - ### Step 2 -- Encode missing values ---- - -# Going to focus on the "child" variables here (Q07-Q17). Extract those first: - -# 'Easy' selection method would be this: - - child_data <- norms_values_data %>% - select( - D_INTERVIEW, B_COUNTRY_ALPHA, Q07_child_manners, - Q08_child_independence, Q09_child_hard_work, - Q10_child_responsibility, Q11_child_imagination, Q12_child_tolerance, - Q13_child_thrift, Q14_child_determined, Q15_child_faith, - Q16_child_unselfish, Q17_child_obedient # Ugh - ) - - child_data <- norms_values_data %>% - select( - D_INTERVIEW, B_COUNTRY_ALPHA, - matches("^Q(0[7-9]|1[0-7])") # See how much quicker this is? - ) +# check what we created: -# Check it worked: - - child_data %>% names() - -# We explore what possible values these variables can take. For example: - - child_data %>% - tabyl(Q07_child_manners) - -# These negative values don't seem great. If we look at the codebook, we see -# that they're different versions of not receiving a quantifiable answer (e.g. -# don't know, refused to respond). We need to encode those as NA - -# Annoying way to do this: - - child_data_sucky_mutate <- child_data %>% - mutate( - Q07_child_manners = case_when( - Q07_child_manners < 0 ~ NA_integer_, - # Notice that they're always negative so this is an easy - # simplification - TRUE ~ Q07_child_manners - ), - Q08_child_independence = case_when( - Q08_child_independence < 0 ~ NA_integer_, - TRUE ~ Q08_child_independence - ), - Q09_child_hard_work = case_when( - Q09_child_hard_work < 0 ~ NA_integer_, - TRUE ~ Q09_child_hard_work - ), - Q10_child_responsibility = case_when( - Q10_child_responsibility < 0 ~ NA_integer_, - TRUE ~ Q10_child_responsibility - ), - Q11_child_imagination = case_when( - Q11_child_imagination < 0 ~ NA_integer_, - TRUE ~ Q11_child_imagination - ), - Q12_child_tolerance = case_when( - Q12_child_tolerance < 0 ~ NA_integer_, - TRUE ~ Q12_child_tolerance - ), - Q13_child_thrift = case_when( - Q13_child_thrift < 0 ~ NA_integer_, - TRUE ~ Q13_child_thrift - ), - Q14_child_determined = case_when( - Q14_child_determined < 0 ~ NA_integer_, - TRUE ~ Q14_child_determined - ), - Q15_child_faith = case_when( - Q15_child_faith < 0 ~ NA_integer_, - TRUE ~ Q15_child_faith - ), - Q16_child_unselfish = case_when( - Q16_child_unselfish < 0 ~ NA_integer_, - TRUE ~ Q16_child_unselfish - ), - Q17_child_obedient = case_when( - Q17_child_obedient < 0 ~ NA_integer_, - TRUE ~ Q17_child_obedient - ) - ) - -# That sucked. Look how much easier the below is: - - child_data <- child_data %>% - mutate( - across( - Q07_child_manners:Q17_child_obedient, - ~ case_when( - .x < 0 ~ NA_integer_, - TRUE ~ .x - ) - ) - ) # Same result!!! You can compare child_data and child_data_sucky_mutate - # to see that - -# Note -- in this dataset, '1' is that the subject was mentioned, '2' that the -# subject wasn't. This is akin to 'yes' and 'no' which are traditionally -# encoded respectively as '1' and '0'. So: - - child_data <- child_data %>% - mutate( - across( - Q07_child_manners:Q17_child_obedient, - ~ case_when( - .x == 2 ~ 0, - TRUE ~ as.numeric(.x) # Note that we need to change class - # from integer to numeric because - # of the '0' - ) - ) - ) - - ### Step 3 -- Creating a 'tidy' version of this ---- - -# We want a dataset that doesn't have: - - # One variable spread across multiple columns, or - # One observation spread across multiple rows - -# In this case, it seems that the answers to the question of what is important -# to teach to a child is spread across multiple variables, one for each answer -# basically. This would make it difficult to e.g. check what the -# most popular 5 answers are. Solution: pivot_longer - - child_data_long <- child_data %>% - # Note -- good to create a new dataset when modifying the level of - # observation - pivot_longer( - cols = Q07_child_manners:Q17_child_obedient, - names_to = "child_quality", - names_prefix = "Q[0-9]{2}_child_", # Allows us to only keep what - # follows the variable prefix - values_to = "child_quality_value" # For now - ) - -# Check what it looks like: - - child_data_long %>% head() - -# Small changes to look nicer - - child_data_long <- child_data_long %>% - mutate( - child_quality = str_replace_all(child_quality, "_", " "), - child_quality = str_to_title(child_quality) - ) - - child_data_long %>% - tabyl(child_quality) # Looks good! - -# Challenge: Try to determine what the five most popular answers to this -# question were. Hint: "arrange(-var)" orders a dataset from the largest -# value of 'var' to the small value of 'var'. + average_country_enthusiasm %>% head() \ No newline at end of file diff --git a/code/session_3_template.R b/code/session_3_template.R index 1f7e70c..2f14221 100644 --- a/code/session_3_template.R +++ b/code/session_3_template.R @@ -2,37 +2,28 @@ ### Session Description - # Exporting Summary/Regression Tables – How to produce: - # (i) regression tables and - # (ii) descriptive summary tables for academic or policy audiences - - # Introduction to R Data Visualization – How to produce beautiful and informative visualizations: - # scatter plots, density plots, and more + # (Continuing from last week) Using Basic Tidyverse Functions – how to subset, mutate, summarize, + # and reshape data + # Data Cleaning -- Simple checks and manipulations to clean raw data + # Introduction to "tidy" datasets and how to get to them ### Resources - # Tables - - # Marek Hlavac, “stargazer: beautiful LATEX, HTML and ASCII tables from R statistical output” - # (https://cran.rproject.org/web/packages/stargazer/vignettes/stargazer.pdf). Vignette for the - # stargazer package, main tool to export regression tables to LateX - - # Thomas Mock, “gt - a (G)rammar of (T)ables” - # (https://themockup.blog/posts/2020-05-16-gt-a-grammer-of-tables/). - # Introduction to the gt package, a more flexible instrument to export tables in PNG, PDF, or HTML formats. + # DIME, “Data Cleaning” (https://dimewiki.worldbank.org/Data_Cleaning). Instructions on how to clean + # raw household data for use in a development setting. - # Plots - - # Alicia Horsch, “A quick introduction to ggplot2” - # (https://towardsdatascience.com/a-quick-introduction-to-ggplot2-d406f83bb9c9). Introduction to the - # ggplot2 package, the main instrument for plot creation in R. + # Wickham and Grolemund, R for Data Science Chapter 12 – Tidy Data (https://r4ds.had.co.nz/tidy-data.html). + # How to structure (“tidy”) your dataset for flexible use in data analysis. ### Session Objectives # Learn how to: - # Create simple academic-standard regression output tables using the stargazer package - # Create flexible and easy-to-read tables of any dataset using the gt package - # Create a scatter plot, density plot, and bar chart using the ggplot2 package + # Filter, mutate, group, and summarize data using Tidyverse functions + # Reshape data using Tidyverse functions + # Check for duplicates and encode missing values + + # Be introduced to: + # "Tidy" datasets and how to create them using pivot_longer() and pivot_wider() # Practice the above! @@ -43,11 +34,9 @@ # NOTE -- Unlike using library(), the 'pacman::p_load()' function installs the package if it is already # not present in the user's R environment. - options(scipen=999) - if(!require(pacman)) install.packages("pacman") - pacman::p_load(tidyverse, data.table, janitor, stargazer, huxtable, gt, paletteer) + pacman::p_load(tidyverse, data.table, janitor) ## 2. Import Data ---- @@ -59,19 +48,19 @@ } - norms_values_data <- data.table::fread( # Other options are base R's read.csv() and readr::read_csv(), but - # data.table::fread() is considered to be the fastest - "data/final/wvs_values_norms_data.csv", na.strings = "" - ) - - ## Country Continent Data - - country_continent_data <- data.table::fread( - "data/raw//country_continent.csv", na.strings = "" + norms_values_data <- data.table::fread( + "data/final/wvs_values_norms_data.csv", + na.strings = "" + ) ## 3. SESSION 1 -- Data 'Wrangling' ---- + # Goal -- Determine what European countries feel is important in their life -> create a dataset + # summarizing each European country's opinion on different life subjects + + ### Step 1 -- Subset dataset to keep only European countries ---- + # This dataset doesn't have a "continent" variable, so I will have to create one # Check the names of the variables in my dataset @@ -200,11 +189,11 @@ # check what we created: average_country_enthusiasm %>% head() - + ## 4. SESSION 2 -- Data Cleaning/Tidy Data ---- - ## This will show you some basic tasks that you can do with raw data to remove - ## possible errors prior to analysis: +## This will show you some basic tasks that you can do with raw data to remove +## possible errors prior to analysis: ## Check for duplicate observations -> i.e. rows should be uniquely ## identified @@ -212,11 +201,11 @@ ## respond" shouldn't be negative but NA ## Check and confirm numerical outliers (not covered here) - ### Step 1 -- Check for duplicate observations ---- + ### Step 1 -- Check for duplicate observations ---- - # In the norms_values_data dataset, the unique identifier for each observation - # is "D_INTERVIEW". It should be the case that each ID is only used once in - # the dataset. To check for duplicates, this is the preferred method: +# In the norms_values_data dataset, the unique identifier for each observation +# is "D_INTERVIEW". It should be the case that each ID is only used once in +# the dataset. To check for duplicates, this is the preferred method: duplicates <- norms_values_data %>% group_by(D_INTERVIEW) %>% @@ -229,13 +218,13 @@ ) %>% filter(num > 1) - # 36 duplicates, uh oh. They all seem to be similar in "order" so I'll check in - # which countries they occurred: +# 36 duplicates, uh oh. They all seem to be similar in "order" so I'll check in +# which countries they occurred: duplicates <- duplicates %>% left_join( # left_join() allows us to 'merge' datasets, in this case - # merging norms_values_data into the duplicates dataset to - # add countries + # merging norms_values_data into the duplicates dataset to + # add countries norms_values_data %>% select(D_INTERVIEW, B_COUNTRY_ALPHA) %>% # Need to have a common variable (D_INTERVIEW) between @@ -244,31 +233,31 @@ distinct() # Need to do this BECAUSE of the duplicate IDs ) - # NOTE -- "left_join" means that only the observations from the first (left) - # dataset (based on value of D_INTERVIEW), here "duplicates", are kept. We - # could have used "right_join" if we wanted to only keep the observations from - # the second (right) dataset, here "norms_values_data", "full_join" if we - # wanted to keep both, or "inner_join" if we only wanted to keep observations - # that appear in both datasets +# NOTE -- "left_join" means that only the observations from the first (left) +# dataset (based on value of D_INTERVIEW), here "duplicates", are kept. We +# could have used "right_join" if we wanted to only keep the observations from +# the second (right) dataset, here "norms_values_data", "full_join" if we +# wanted to keep both, or "inner_join" if we only wanted to keep observations +# that appear in both datasets - # Check countries: +# Check countries: duplicates %>% janitor::tabyl(B_COUNTRY_ALPHA) # Uh oh, Mongolia! - # There is an issue with the Mongolian IDs. Usually we would have to contact - # the data collection team to determine what the problem is. In this case we - # can't do that, so... Just remove all observations from Mongolia because this - # is strange +# There is an issue with the Mongolian IDs. Usually we would have to contact +# the data collection team to determine what the problem is. In this case we +# can't do that, so... Just remove all observations from Mongolia because this +# is strange norms_values_data <- norms_values_data %>% filter(B_COUNTRY_ALPHA != "MNG") + + ### Step 2 -- Encode missing values ---- - ### Step 2 -- Encode missing values ---- - - # Going to focus on the "child" variables here (Q07-Q17). Extract those first: +# Going to focus on the "child" variables here (Q07-Q17). Extract those first: - # 'Easy' selection method would be this: +# 'Easy' selection method would be this: child_data <- norms_values_data %>% select( @@ -285,20 +274,20 @@ matches("^Q(0[7-9]|1[0-7])") # See how much quicker this is? ) - # Check it worked: +# Check it worked: child_data %>% names() - # We explore what possible values these variables can take. For example: +# We explore what possible values these variables can take. For example: child_data %>% tabyl(Q07_child_manners) + +# These negative values don't seem great. If we look at the codebook, we see +# that they're different versions of not receiving a quantifiable answer (e.g. +# don't know, refused to respond). We need to encode those as NA - # These negative values don't seem great. If we look at the codebook, we see - # that they're different versions of not receiving a quantifiable answer (e.g. - # don't know, refused to respond). We need to encode those as NA - - # Annoying way to do this: +# Annoying way to do this: child_data_sucky_mutate <- child_data %>% mutate( @@ -350,7 +339,7 @@ ) ) - # That sucked. Look how much easier the below is: +# That sucked. Look how much easier the below is: child_data <- child_data %>% mutate( @@ -362,11 +351,11 @@ ) ) ) # Same result!!! You can compare child_data and child_data_sucky_mutate - # to see that + # to see that - # Note -- in this dataset, '1' is that the subject was mentioned, '2' that the - # subject wasn't. This is akin to 'yes' and 'no' which are traditionally - # encoded respectively as '1' and '0'. So: +# Note -- in this dataset, '1' is that the subject was mentioned, '2' that the +# subject wasn't. This is akin to 'yes' and 'no' which are traditionally +# encoded respectively as '1' and '0'. So: child_data <- child_data %>% mutate( @@ -375,40 +364,40 @@ ~ case_when( .x == 2 ~ 0, TRUE ~ as.numeric(.x) # Note that we need to change class - # from integer to numeric because - # of the '0' + # from integer to numeric because + # of the '0' ) ) ) - ### Step 3 -- Creating a 'tidy' version of this ---- + ### Step 3 -- Creating a 'tidy' version of this ---- - # We want a dataset that doesn't have: +# We want a dataset that doesn't have: # One variable spread across multiple columns, or # One observation spread across multiple rows - # In this case, it seems that the answers to the question of what is important - # to teach to a child is spread across multiple variables, one for each answer - # basically. This would make it difficult to e.g. check what the - # most popular 5 answers are. Solution: pivot_longer +# In this case, it seems that the answers to the question of what is important +# to teach to a child is spread across multiple variables, one for each answer +# basically. This would make it difficult to e.g. check what the +# most popular 5 answers are. Solution: pivot_longer child_data_long <- child_data %>% - # Note -- good to create a new dataset when modifying the level of - # observation + # Note -- good to create a new dataset when modifying the level of + # observation pivot_longer( cols = Q07_child_manners:Q17_child_obedient, names_to = "child_quality", names_prefix = "Q[0-9]{2}_child_", # Allows us to only keep what - # follows the variable prefix + # follows the variable prefix values_to = "child_quality_value" # For now ) - # Check what it looks like: +# Check what it looks like: child_data_long %>% head() - # Small changes to look nicer +# Small changes to look nicer child_data_long <- child_data_long %>% mutate( @@ -419,383 +408,7 @@ child_data_long %>% tabyl(child_quality) # Looks good! - # Challenge: Try to determine what the five most popular answers to this - # question were. Hint: "arrange(-var)" orders a dataset from the largest - # value of 'var' to the small value of 'var'. Solution below. - - popular_answers <- child_data_long %>% - group_by(child_quality) %>% - dplyr::summarize( - num = sum(child_quality_value, na.rm = TRUE) # Notice that "sum" works because of coding (yes = 1, no = 0) - ) %>% - arrange(desc(num)) # Manners, Responsibility, Tolerance, Hard Work, Independence, Faith - - ## 5. Session 3 -- Data Visualization using Tables and Graphs ---- - - ### Step 1 -- Simple Regression Table ---- - -# Purpose -- I want to accomplish two tasks: - - # Assess the relationship between a respondent's relationship with their parents - # (Q27_agree_parents_proud) and what they think is important for their child (Q07-Q17) - # -> Regression analysis and table production - -# We've already cleaned Q07-Q17, check Q27 - - norms_values_data %>% tabyl(Q27_agree_parents_proud) - -# Properly encode the missing values - - norms_values_data <- norms_values_data %>% - mutate( - Q27_agree_parents_proud = case_when( - Q27_agree_parents_proud < 0 ~ NA_real_, - Q27_agree_parents_proud == 4 ~ 1, # 4 is 'strongly disagree' and 1 is 'strongly agree'. - # I like bigger = better - Q27_agree_parents_proud == 3 ~ 2, - Q27_agree_parents_proud == 2 ~ 3, - Q27_agree_parents_proud == 1 ~ 4 - ) - ) - - parent_child_dataset <- norms_values_data %>% - select( - D_INTERVIEW, B_COUNTRY_ALPHA, - Q07_child_manners:Q17_child_obedient, Q27_agree_parents_proud - ) - - parent_child_regression <- lm( - data = parent_child_dataset, - formula = Q27_agree_parents_proud ~ Q08_child_independence + Q09_child_hard_work + - Q10_child_responsibility + Q11_child_imagination + Q12_child_tolerance + Q13_child_thrift + - Q14_child_determined + Q15_child_faith + Q16_child_unselfish + Q17_child_obedient - ) - -# Look at what the results of the regression are - - parent_child_regression %>% summary() - -# Lots of interesting relationships here! But we want to present this data easily to other people. - -# Stargazer outputs a simple LateX script - - parent_child_sg <- parent_child_regression %>% - stargazer() -# Looks super ugly -- you'd want to add labels to replace your variable names in the table. -# stargazer() is very customizable, just use help(stargazer) to see how to add labels - -# You can then save to LateX using the writeLines() function - - writeLines(parent_child_sg, "output/regression_table_sg.tex") - -# You can then either import the .tex file into a software like Overleaf, or use the -# pdflatex() function from the tinytex package to export to PDF. - -# Huxtable transforms a regression output into a table described in 'LateX' script. - - parent_child_names <- c( - "Independence" = "Q08_child_independence", - "Hard Work" = "Q09_child_hard_work", - "Responsibility" = "Q10_child_responsibility", - "Imagination" = "Q11_child_imagination", - "Tolerance" = "Q12_child_tolerance", - "Thriftiness" = "Q13_child_thrift", - "Determination" = "Q14_child_determined", - "Faith" = "Q15_child_faith", - "Selflessness" = "Q16_child_unselfish", - "Obedience" = "Q17_child_obedient" - ) - - parent_child_hux <- parent_child_regression %>% - huxtable::huxreg( - coefs = parent_child_names - ) - -# Some saving options: - - huxtable::quick_latex( - parent_child_hux, file = "output/regression_table.tex" - ) - - quick_pdf( - parent_child_hux, file = "output/regression_table.pdf" - ) - - quick_html( - parent_child_hux, file = "output/regression_table.html" - ) - - ### Step 2 -- Descriptive Statistics Table ---- - - # I want to observe how people on different continents think about politics and religion as - # parts of their life -> descriptive statistics table and various plots - -# First check if our variables are okay - - norms_values_data %>% tabyl(Q04_life_politics) - - norms_values_data %>% tabyl(Q06_life_religion) - - politics_religion_dataset <- norms_values_data %>% - select( - D_INTERVIEW, B_COUNTRY_ALPHA, - Q04_life_politics, Q06_life_religion - ) %>% - mutate( - across( - Q04_life_politics:Q06_life_religion, - ~ case_when( - .x < 0 ~ NA_real_, - .x == 4 ~ 1, # 4 is 'strongly disagree' and 1 is 'strongly agree'. - # I like bigger = better - .x == 3 ~ 2, - .x == 2 ~ 3, - .x == 1 ~ 4 - ) - ) - ) %>% - # We want to visualize the relationship between the politics and religion variables, but there - # are 83,000 observations (too many). So aggregate at the country level - group_by(B_COUNTRY_ALPHA) %>% - dplyr::summarize( - across( - Q04_life_politics:Q06_life_religion, - ~ mean(.x, na.rm = TRUE) - ) - ) %>% - ungroup() %>% - # Add continent data to the politics/religion dataset to compare continent statistics - left_join( - country_continent_data, - by = c("B_COUNTRY_ALPHA" = "country") - ) %>% - select( - country_long, continent, everything() - ) %>% - arrange(continent, country_long) - -# Use the gt() package to create a descriptive statistics table out of this. Using gt() is a very -# iterative process, I'd recommend just trying out the basic one shown below and then adding -# components one by one. - - simple_desc_gt_table <- politics_religion_dataset %>% - select(-B_COUNTRY_ALPHA) %>% # Don't need it - gt() - -# Looks pretty rough. Time to make it look nicer: - - politics_religion_gt_table <- politics_religion_dataset %>% - select(-B_COUNTRY_ALPHA) %>% # Don't need it - mutate( # Too many digits in our numeric variables - across( - Q04_life_politics:Q06_life_religion, - ~ round(.x, digits = 3) - ) - ) %>% - group_by(continent) %>% # See what this does - gt() %>% - cols_label( # Lets you assign names to columns - country_long = "Country", - Q04_life_politics = "Politics", - Q06_life_religion = "Religion" - ) %>% - tab_header( # Add title/subtitle - title = "World Values Survey", - subtitle = "Importance in Life -- Politics vs. Religion" - ) %>% - data_color( # Adding some color scales to make the numbers easier to parse - columns = Q04_life_politics, - colors = scales::col_numeric( - palette = as.character(paletteer::paletteer_d("ggsci::red_material", n = 5)), - domain = NULL - ) - ) %>% - data_color( # Adding some color scales to make the numbers easier to parse - columns = Q06_life_religion, - colors = scales::col_numeric( - palette = as.character(paletteer::paletteer_d("ggsci::blue_material", n = 5)), - domain = NULL - ) - ) - -# Could do a lot more but this is enough for now. Save: - - gtsave( - politics_religion_gt_table, - "output/politics_religion_gt.png" - ) - - ### Step 3 -- Descriptive Statistics Plot(s) ---- - -# We're going to compare politics and religion across continents using different plot formats in the -# ggplot2 package. - -# KEY -- Once you use the function ggplot(), REPLACE %>% with + - -# First -- Let's use a simple density plot to assess the distribution of both variables - - politics_religion_density_plot <- politics_religion_dataset %>% - ggplot() + - geom_density( - aes( # aesthetics -- variables always go inside of this - x = Q04_life_politics # geom_density only requires an 'x' - ), - color = "red" - ) + - geom_density( - aes( # aesthetics -- variables always go inside of this - x = Q06_life_religion # geom_density only requires an 'x' - ), - color = "blue" - ) + - xlab("Importance in Life (1 - 4)") - -# No legend though. Solution: Use pivot_longer so that each variable is a group - - politics_religion_dataset_long <- politics_religion_dataset %>% - pivot_longer( - cols = c(Q04_life_politics, Q06_life_religion), - names_to = "variable", - values_to = "life_importance" - ) %>% - mutate( # So that it looks good in the plot - variable = case_when( - variable == "Q04_life_politics" ~ "Politics", - variable == "Q06_life_religion" ~ "Religion" - ) - ) - - politics_religion_density_plot2 <- politics_religion_dataset_long %>% - ggplot() + - geom_density( - aes( - x = life_importance, color = variable - ) - ) + - xlab("Importance in Life (1-4)") + - theme_minimal() + # Always looks better - theme( - legend.position = "bottom", # I prefer this - plot.background = element_rect(color = "white") # Without this the graph's background is - # transparent - ) - -# Second -- Bar chart, grouping by continent - - politics_religion_bar_chart <- politics_religion_dataset_long %>% - mutate( # South and North America are too long strings - continent = case_when( - continent == "South America" ~ "South\nAmerica", # Line break - continent == "North America" ~ "North\nAmerica", - TRUE ~ continent - ) - ) %>% - group_by(continent, variable) %>% - dplyr::summarize( - life_importance = mean(life_importance, na.rm = TRUE) - ) %>% - ungroup() %>% - ggplot() + - geom_bar( - aes( - x = continent, y = life_importance, fill = variable - ), - position = "dodge", stat = "identity" - ) + - xlab("Continent") + - ylab("Importance in Life (1-4)") + - scale_fill_discrete(name = "") + # An easy way to get rid of the legend title - theme_minimal() + # Always looks better - theme( - legend.position = "bottom", - plot.background = element_rect(color = "white") # Without this the graph's background is - # transparent - ) - -# Finally -- Scatter plot of all countries - -# Scatter plots allow us to visualize data in two dimensions, i.e. along two variables. -# Here it's politics and religion's importance in life - - politics_religion_scatter_plot <- politics_religion_dataset %>% - ggplot() + - geom_point( - aes( - x = Q04_life_politics, y = Q06_life_religion, color = continent - ) - ) + - xlab("Importance of Politics") + - ylab("Importance of Religion") + - scale_x_continuous( # Make sure scale is 1-4 - limits = c(1, 4) - ) + - scale_y_continuous( - limits = c(1, 4) - ) + - theme_minimal() + - theme( - legend.position = "bottom", - plot.background = element_rect(color = "white") # Without this the graph's background is - # transparent - ) - -# Fun alternative -- replace points with country abbreviation - - politics_religion_scatter_plot2 <- politics_religion_dataset %>% - ggplot() + - geom_text( - aes( - x = Q04_life_politics, y = Q06_life_religion, - label = B_COUNTRY_ALPHA, color = continent - ), - check_overlap = TRUE - ) + - xlab("Importance of Politics") + - ylab("Importance of Religion") + - scale_x_continuous( # Make sure scale is 1-4 - limits = c(1, 4) - ) + - scale_y_continuous( - limits = c(1, 4) - ) + - theme_minimal() + - theme( - legend.position = "bottom", - plot.background = element_rect(color = "white") # Without this the graph's background is - # transparent - ) - -# Save everything important - - ggsave( - "output/politics_religion_density.png", - politics_religion_density_plot2 - ) - - ggsave( - "output/politics_religion_bar.png", - politics_religion_bar_chart - ) - - ggsave( - "output/politics_religion_scatter.png", - politics_religion_scatter_plot - ) - - ggsave( - "output/politics_religion_scatter2.png", - politics_religion_scatter_plot2 - ) - -# Challenges for today: - - # Using child_data_long, create a gt() table showing the five most important child qualities - # for a given country, along with either the number or %age of respondents in each country who - # called it important (hint: you'll have to use filter() and summarize() for this) - - # Then create a density plot showing the distribution of importance of these five child qualities - # across countries (i.e. summarize at the country level then use ggplot() and geom_density()) - - # Create a scatter plot showing an interesting comparison between two child qualities across - # countries/continents +# Challenge: Try to determine what the five most popular answers to this +# question were. Hint: "arrange(-var)" orders a dataset from the largest +# value of 'var' to the small value of 'var'. \ No newline at end of file diff --git a/code/session_4_template.R b/code/session_4_template.R new file mode 100644 index 0000000..8fd6311 --- /dev/null +++ b/code/session_4_template.R @@ -0,0 +1,801 @@ +## Programming for Professional Research Using R -- Session 4 + + ### Session Description + + # Exporting Summary/Regression Tables – How to produce: + # (i) regression tables and + # (ii) descriptive summary tables for academic or policy audiences + + # Introduction to R Data Visualization – How to produce beautiful and informative visualizations: + # scatter plots, density plots, and more + + ### Resources + + # Tables + + # Marek Hlavac, “stargazer: beautiful LATEX, HTML and ASCII tables from R statistical output” + # (https://cran.rproject.org/web/packages/stargazer/vignettes/stargazer.pdf). Vignette for the + # stargazer package, main tool to export regression tables to LateX + + # Thomas Mock, “gt - a (G)rammar of (T)ables” + # (https://themockup.blog/posts/2020-05-16-gt-a-grammer-of-tables/). + # Introduction to the gt package, a more flexible instrument to export tables in PNG, PDF, or HTML formats. + + # Plots + + # Alicia Horsch, “A quick introduction to ggplot2” + # (https://towardsdatascience.com/a-quick-introduction-to-ggplot2-d406f83bb9c9). Introduction to the + # ggplot2 package, the main instrument for plot creation in R. + + ### Session Objectives + + # Learn how to: + # Create simple academic-standard regression output tables using the stargazer package + # Create flexible and easy-to-read tables of any dataset using the gt package + # Create a scatter plot, density plot, and bar chart using the ggplot2 package + + # Practice the above! + + ## 1. Setup ---- + + ### Packages + +# NOTE -- Unlike using library(), the 'pacman::p_load()' function installs the package if it is already +# not present in the user's R environment. + + options(scipen=999) + + if(!require(pacman)) install.packages("pacman") + + pacman::p_load(tidyverse, data.table, janitor, stargazer, huxtable, gt, paletteer) + + ## 2. Import Data ---- + + if(!(file.exists("data/final/wvs_values_norms_data.csv"))) { # Checks whether the data has been + # downloaded already + usethis::use_zip( + "https://www.dropbox.com/scl/fo/vnxjbqyq1g9z368coh1vq/h?rlkey=zc66o2ll7613b5e9ipp915ynk&dl=1" + ) + + } + + norms_values_data <- data.table::fread( # Other options are base R's read.csv() and readr::read_csv(), but + # data.table::fread() is considered to be the fastest + "data/final/wvs_values_norms_data.csv", na.strings = "" + ) + + ## Country Continent Data + + country_continent_data <- data.table::fread( + "data/raw//country_continent.csv", na.strings = "" + ) + + ## 3. SESSION 1 -- Data 'Wrangling' ---- + + # This dataset doesn't have a "continent" variable, so I will have to create one + + # Check the names of the variables in my dataset + + norms_values_data %>% + names() + + # Check which countries are in my dataset + + norms_values_data %>% + janitor::tabyl(B_COUNTRY_ALPHA) + + # Use ISO codes to distinguish European countries + + european_iso_codes <- c( # This creates a character list + "AND", "CYP", "DEU", "GRC", "RUS", "SRB", "TUR", "UKR" + ) + + norms_values_data <- norms_values_data %>% + dplyr::mutate( + european = dplyr::case_when( # This creates a dummy variable + B_COUNTRY_ALPHA %in% european_iso_codes ~ 1, + TRUE ~ 0 + ) + ) + + # Check that it worked + + norms_values_data %>% + janitor::tabyl(european, B_COUNTRY_ALPHA) + + # Now subset using filter() + + european_data <- norms_values_data %>% + dplyr::filter(european == 1) + + ### Step 2 -- Select relevant variables ---- + + # We want to look at what people find important in life. Those are questions Q1-Q6. + + # So we keep those questions, as well as D_INTERVIEW (unique ID, always keep) and B_COUNTRY_ALPHA + + european_data <- european_data %>% + dplyr::select( + D_INTERVIEW, B_COUNTRY_ALPHA, dplyr::matches("^Q0[1-6]") + # matches() allows us to select multiple variables at once using a common string in + # their name + ) + + ### Step 3 -- Clean variables ---- + + # Will explore this further in next week's session -- need to encode answers as "NA" to continue + # using numerically + + european_data <- european_data %>% + dplyr::mutate( + dplyr::across( # Across is magic and allows you to modify multiple variables at once + matches("^Q0[1-6]"), # Same as for select() above. IMPORTANT -- all variables need + # to be of the same 'type' (e.g. character, numeric) + ~ case_when( # across() basically loops over every relevant variable. '.x' refers to + # each variable being treated + .x %in% c(-1, -2, -4, -5) ~ NA_integer_, + # Key to understand what NA_ to use.Here the possible answers will be 1, 2, 3, + # 4 so those are integers + TRUE ~ .x # We've created NAs and are leaving the other + # values alone + ) + ) + ) + + ### Step 4 -- Summarize variables at the country level ---- + + # We want a dataset where each observation (row) is a country, not a household. To do this, we use + # group_by() and summarize() + + european_country_data <- european_data %>% # New observation level so new dataset + dplyr::group_by(B_COUNTRY_ALPHA) %>% # We're telling R at which level to do the grouping + dplyr::summarize( # Summarize aggregated values based on what we instruct it to do. If we + # didn't use group_by(), it would summarize to one single value. Here, + # it will output one value per country. + dplyr::across( # Using across() again to summarize multiple variables at once + matches("^Q0[1-6]"), + ~ mean(.x, na.rm = TRUE) # Removing NAs and taking the mean of each variable + ) + ) %>% + dplyr::ungroup() # Always remember to do this! Otherwise your future code will do weird + # things + + ### Step 5 -- Create a question-level dataset ---- + + # The above is useful. But what if we want to sum up countries' values for all of these questions, + # or look at each country's 'average' enthusiasm, it'll be easier with a 'long' dataset than a + # 'wide' one. + + european_country_data_wide <- european_country_data %>% + tidyr::pivot_longer( + cols = matches("^Q0[1-6]"), # Variables whose data we want to be in a single, + # 'long' variable + names_to = "topic", # Creates a variable named 'topic' that saves the variable names + values_to = "score" # Creates a 'long' variable named 'score' that holds all of the + # original values + ) %>% + dplyr::mutate( # I don't like how 'topic' has more information than necessary + topic = stringr::str_replace( # stringr is the best package for string manipulation + topic, + "^Q0[1-6]_life_", + "" + ) + ) + + # Check that it worked + + european_country_data_wide %>% janitor::tabyl(topic) # It did! + + # Now for example I can look at average 'enthusiasm' by country + + average_country_enthusiasm <- european_country_data_wide %>% + dplyr::group_by(B_COUNTRY_ALPHA) %>% + dplyr::summarize( + average_score = mean(score, na.rm = TRUE) + ) %>% + dplyr::ungroup() %>% + dplyr::arrange(average_score) # Order them from highest to lowest enthusiasm (NOTE -- + # smaller number means more enthusiasm) + + # check what we created: + + average_country_enthusiasm %>% head() + + ## 4. SESSION 2 -- Data Cleaning/Tidy Data ---- + + ## This will show you some basic tasks that you can do with raw data to remove + ## possible errors prior to analysis: + + ## Check for duplicate observations -> i.e. rows should be uniquely + ## identified + ## Encode missing values -> answers such as "don't know" or "refused to + ## respond" shouldn't be negative but NA + ## Check and confirm numerical outliers (not covered here) + + ### Step 1 -- Check for duplicate observations ---- + + # In the norms_values_data dataset, the unique identifier for each observation + # is "D_INTERVIEW". It should be the case that each ID is only used once in + # the dataset. To check for duplicates, this is the preferred method: + + duplicates <- norms_values_data %>% + group_by(D_INTERVIEW) %>% + # Grouping at the ID level, we'll be able to check how many times it + # shows up + dplyr::summarize( + num = n() + # n() counts the number of instances of each ID! Just run up to + # summarize() and add %>% head() to see what it does + ) %>% + filter(num > 1) + + # 36 duplicates, uh oh. They all seem to be similar in "order" so I'll check in + # which countries they occurred: + + duplicates <- duplicates %>% + left_join( # left_join() allows us to 'merge' datasets, in this case + # merging norms_values_data into the duplicates dataset to + # add countries + norms_values_data %>% + select(D_INTERVIEW, B_COUNTRY_ALPHA) %>% + # Need to have a common variable (D_INTERVIEW) between + # norms_values_data and duplicates, and we only want to add + # country name + distinct() # Need to do this BECAUSE of the duplicate IDs + ) + + # NOTE -- "left_join" means that only the observations from the first (left) + # dataset (based on value of D_INTERVIEW), here "duplicates", are kept. We + # could have used "right_join" if we wanted to only keep the observations from + # the second (right) dataset, here "norms_values_data", "full_join" if we + # wanted to keep both, or "inner_join" if we only wanted to keep observations + # that appear in both datasets + + # Check countries: + + duplicates %>% + janitor::tabyl(B_COUNTRY_ALPHA) # Uh oh, Mongolia! + + # There is an issue with the Mongolian IDs. Usually we would have to contact + # the data collection team to determine what the problem is. In this case we + # can't do that, so... Just remove all observations from Mongolia because this + # is strange + + norms_values_data <- norms_values_data %>% + filter(B_COUNTRY_ALPHA != "MNG") + + ### Step 2 -- Encode missing values ---- + + # Going to focus on the "child" variables here (Q07-Q17). Extract those first: + + # 'Easy' selection method would be this: + + child_data <- norms_values_data %>% + select( + D_INTERVIEW, B_COUNTRY_ALPHA, Q07_child_manners, + Q08_child_independence, Q09_child_hard_work, + Q10_child_responsibility, Q11_child_imagination, Q12_child_tolerance, + Q13_child_thrift, Q14_child_determined, Q15_child_faith, + Q16_child_unselfish, Q17_child_obedient # Ugh + ) + + child_data <- norms_values_data %>% + select( + D_INTERVIEW, B_COUNTRY_ALPHA, + matches("^Q(0[7-9]|1[0-7])") # See how much quicker this is? + ) + + # Check it worked: + + child_data %>% names() + + # We explore what possible values these variables can take. For example: + + child_data %>% + tabyl(Q07_child_manners) + + # These negative values don't seem great. If we look at the codebook, we see + # that they're different versions of not receiving a quantifiable answer (e.g. + # don't know, refused to respond). We need to encode those as NA + + # Annoying way to do this: + + child_data_sucky_mutate <- child_data %>% + mutate( + Q07_child_manners = case_when( + Q07_child_manners < 0 ~ NA_integer_, + # Notice that they're always negative so this is an easy + # simplification + TRUE ~ Q07_child_manners + ), + Q08_child_independence = case_when( + Q08_child_independence < 0 ~ NA_integer_, + TRUE ~ Q08_child_independence + ), + Q09_child_hard_work = case_when( + Q09_child_hard_work < 0 ~ NA_integer_, + TRUE ~ Q09_child_hard_work + ), + Q10_child_responsibility = case_when( + Q10_child_responsibility < 0 ~ NA_integer_, + TRUE ~ Q10_child_responsibility + ), + Q11_child_imagination = case_when( + Q11_child_imagination < 0 ~ NA_integer_, + TRUE ~ Q11_child_imagination + ), + Q12_child_tolerance = case_when( + Q12_child_tolerance < 0 ~ NA_integer_, + TRUE ~ Q12_child_tolerance + ), + Q13_child_thrift = case_when( + Q13_child_thrift < 0 ~ NA_integer_, + TRUE ~ Q13_child_thrift + ), + Q14_child_determined = case_when( + Q14_child_determined < 0 ~ NA_integer_, + TRUE ~ Q14_child_determined + ), + Q15_child_faith = case_when( + Q15_child_faith < 0 ~ NA_integer_, + TRUE ~ Q15_child_faith + ), + Q16_child_unselfish = case_when( + Q16_child_unselfish < 0 ~ NA_integer_, + TRUE ~ Q16_child_unselfish + ), + Q17_child_obedient = case_when( + Q17_child_obedient < 0 ~ NA_integer_, + TRUE ~ Q17_child_obedient + ) + ) + + # That sucked. Look how much easier the below is: + + child_data <- child_data %>% + mutate( + across( + Q07_child_manners:Q17_child_obedient, + ~ case_when( + .x < 0 ~ NA_integer_, + TRUE ~ .x + ) + ) + ) # Same result!!! You can compare child_data and child_data_sucky_mutate + # to see that + + # Note -- in this dataset, '1' is that the subject was mentioned, '2' that the + # subject wasn't. This is akin to 'yes' and 'no' which are traditionally + # encoded respectively as '1' and '0'. So: + + child_data <- child_data %>% + mutate( + across( + Q07_child_manners:Q17_child_obedient, + ~ case_when( + .x == 2 ~ 0, + TRUE ~ as.numeric(.x) # Note that we need to change class + # from integer to numeric because + # of the '0' + ) + ) + ) + + ### Step 3 -- Creating a 'tidy' version of this ---- + + # We want a dataset that doesn't have: + + # One variable spread across multiple columns, or + # One observation spread across multiple rows + + # In this case, it seems that the answers to the question of what is important + # to teach to a child is spread across multiple variables, one for each answer + # basically. This would make it difficult to e.g. check what the + # most popular 5 answers are. Solution: pivot_longer + + child_data_long <- child_data %>% + # Note -- good to create a new dataset when modifying the level of + # observation + pivot_longer( + cols = Q07_child_manners:Q17_child_obedient, + names_to = "child_quality", + names_prefix = "Q[0-9]{2}_child_", # Allows us to only keep what + # follows the variable prefix + values_to = "child_quality_value" # For now + ) + + # Check what it looks like: + + child_data_long %>% head() + + # Small changes to look nicer + + child_data_long <- child_data_long %>% + mutate( + child_quality = str_replace_all(child_quality, "_", " "), + child_quality = str_to_title(child_quality) + ) + + child_data_long %>% + tabyl(child_quality) # Looks good! + + # Challenge: Try to determine what the five most popular answers to this + # question were. Hint: "arrange(-var)" orders a dataset from the largest + # value of 'var' to the small value of 'var'. Solution below. + + popular_answers <- child_data_long %>% + group_by(child_quality) %>% + dplyr::summarize( + num = sum(child_quality_value, na.rm = TRUE) # Notice that "sum" works because of coding (yes = 1, no = 0) + ) %>% + arrange(desc(num)) # Manners, Responsibility, Tolerance, Hard Work, Independence, Faith + + ## 5. Session 3 -- Data Visualization using Tables and Graphs ---- + + ### Step 1 -- Simple Regression Table ---- + +# Purpose -- I want to accomplish two tasks: + + # Assess the relationship between a respondent's relationship with their parents + # (Q27_agree_parents_proud) and what they think is important for their child (Q07-Q17) + # -> Regression analysis and table production + +# We've already cleaned Q07-Q17, check Q27 + + norms_values_data %>% tabyl(Q27_agree_parents_proud) + +# Properly encode the missing values + + norms_values_data <- norms_values_data %>% + mutate( + Q27_agree_parents_proud = case_when( + Q27_agree_parents_proud < 0 ~ NA_real_, + Q27_agree_parents_proud == 4 ~ 1, # 4 is 'strongly disagree' and 1 is 'strongly agree'. + # I like bigger = better + Q27_agree_parents_proud == 3 ~ 2, + Q27_agree_parents_proud == 2 ~ 3, + Q27_agree_parents_proud == 1 ~ 4 + ) + ) + + parent_child_dataset <- norms_values_data %>% + select( + D_INTERVIEW, B_COUNTRY_ALPHA, + Q07_child_manners:Q17_child_obedient, Q27_agree_parents_proud + ) + + parent_child_regression <- lm( + data = parent_child_dataset, + formula = Q27_agree_parents_proud ~ Q08_child_independence + Q09_child_hard_work + + Q10_child_responsibility + Q11_child_imagination + Q12_child_tolerance + Q13_child_thrift + + Q14_child_determined + Q15_child_faith + Q16_child_unselfish + Q17_child_obedient + ) + +# Look at what the results of the regression are + + parent_child_regression %>% summary() + +# Lots of interesting relationships here! But we want to present this data easily to other people. + +# Stargazer outputs a simple LateX script + + parent_child_sg <- parent_child_regression %>% + stargazer() +# Looks super ugly -- you'd want to add labels to replace your variable names in the table. +# stargazer() is very customizable, just use help(stargazer) to see how to add labels + +# You can then save to LateX using the writeLines() function + + writeLines(parent_child_sg, "output/regression_table_sg.tex") + +# You can then either import the .tex file into a software like Overleaf, or use the +# pdflatex() function from the tinytex package to export to PDF. + +# Huxtable transforms a regression output into a table described in 'LateX' script. + + parent_child_names <- c( + "Independence" = "Q08_child_independence", + "Hard Work" = "Q09_child_hard_work", + "Responsibility" = "Q10_child_responsibility", + "Imagination" = "Q11_child_imagination", + "Tolerance" = "Q12_child_tolerance", + "Thriftiness" = "Q13_child_thrift", + "Determination" = "Q14_child_determined", + "Faith" = "Q15_child_faith", + "Selflessness" = "Q16_child_unselfish", + "Obedience" = "Q17_child_obedient" + ) + + parent_child_hux <- parent_child_regression %>% + huxtable::huxreg( + coefs = parent_child_names + ) + +# Some saving options: + + huxtable::quick_latex( + parent_child_hux, file = "output/regression_table.tex" + ) + + quick_pdf( + parent_child_hux, file = "output/regression_table.pdf" + ) + + quick_html( + parent_child_hux, file = "output/regression_table.html" + ) + + ### Step 2 -- Descriptive Statistics Table ---- + + # I want to observe how people on different continents think about politics and religion as + # parts of their life -> descriptive statistics table and various plots + +# First check if our variables are okay + + norms_values_data %>% tabyl(Q04_life_politics) + + norms_values_data %>% tabyl(Q06_life_religion) + + politics_religion_dataset <- norms_values_data %>% + select( + D_INTERVIEW, B_COUNTRY_ALPHA, + Q04_life_politics, Q06_life_religion + ) %>% + mutate( + across( + Q04_life_politics:Q06_life_religion, + ~ case_when( + .x < 0 ~ NA_real_, + .x == 4 ~ 1, # 4 is 'strongly disagree' and 1 is 'strongly agree'. + # I like bigger = better + .x == 3 ~ 2, + .x == 2 ~ 3, + .x == 1 ~ 4 + ) + ) + ) %>% + # We want to visualize the relationship between the politics and religion variables, but there + # are 83,000 observations (too many). So aggregate at the country level + group_by(B_COUNTRY_ALPHA) %>% + dplyr::summarize( + across( + Q04_life_politics:Q06_life_religion, + ~ mean(.x, na.rm = TRUE) + ) + ) %>% + ungroup() %>% + # Add continent data to the politics/religion dataset to compare continent statistics + left_join( + country_continent_data, + by = c("B_COUNTRY_ALPHA" = "country") + ) %>% + select( + country_long, continent, everything() + ) %>% + arrange(continent, country_long) + +# Use the gt() package to create a descriptive statistics table out of this. Using gt() is a very +# iterative process, I'd recommend just trying out the basic one shown below and then adding +# components one by one. + + simple_desc_gt_table <- politics_religion_dataset %>% + select(-B_COUNTRY_ALPHA) %>% # Don't need it + gt() + +# Looks pretty rough. Time to make it look nicer: + + politics_religion_gt_table <- politics_religion_dataset %>% + select(-B_COUNTRY_ALPHA) %>% # Don't need it + mutate( # Too many digits in our numeric variables + across( + Q04_life_politics:Q06_life_religion, + ~ round(.x, digits = 3) + ) + ) %>% + group_by(continent) %>% # See what this does + gt() %>% + cols_label( # Lets you assign names to columns + country_long = "Country", + Q04_life_politics = "Politics", + Q06_life_religion = "Religion" + ) %>% + tab_header( # Add title/subtitle + title = "World Values Survey", + subtitle = "Importance in Life -- Politics vs. Religion" + ) %>% + data_color( # Adding some color scales to make the numbers easier to parse + columns = Q04_life_politics, + colors = scales::col_numeric( + palette = as.character(paletteer::paletteer_d("ggsci::red_material", n = 5)), + domain = NULL + ) + ) %>% + data_color( # Adding some color scales to make the numbers easier to parse + columns = Q06_life_religion, + colors = scales::col_numeric( + palette = as.character(paletteer::paletteer_d("ggsci::blue_material", n = 5)), + domain = NULL + ) + ) + +# Could do a lot more but this is enough for now. Save: + + gtsave( + politics_religion_gt_table, + "output/politics_religion_gt.png" + ) + + ### Step 3 -- Descriptive Statistics Plot(s) ---- + +# We're going to compare politics and religion across continents using different plot formats in the +# ggplot2 package. + +# KEY -- Once you use the function ggplot(), REPLACE %>% with + + +# First -- Let's use a simple density plot to assess the distribution of both variables + + politics_religion_density_plot <- politics_religion_dataset %>% + ggplot() + + geom_density( + aes( # aesthetics -- variables always go inside of this + x = Q04_life_politics # geom_density only requires an 'x' + ), + color = "red" + ) + + geom_density( + aes( # aesthetics -- variables always go inside of this + x = Q06_life_religion # geom_density only requires an 'x' + ), + color = "blue" + ) + + xlab("Importance in Life (1 - 4)") + +# No legend though. Solution: Use pivot_longer so that each variable is a group + + politics_religion_dataset_long <- politics_religion_dataset %>% + pivot_longer( + cols = c(Q04_life_politics, Q06_life_religion), + names_to = "variable", + values_to = "life_importance" + ) %>% + mutate( # So that it looks good in the plot + variable = case_when( + variable == "Q04_life_politics" ~ "Politics", + variable == "Q06_life_religion" ~ "Religion" + ) + ) + + politics_religion_density_plot2 <- politics_religion_dataset_long %>% + ggplot() + + geom_density( + aes( + x = life_importance, color = variable + ) + ) + + xlab("Importance in Life (1-4)") + + theme_minimal() + # Always looks better + theme( + legend.position = "bottom", # I prefer this + plot.background = element_rect(color = "white") # Without this the graph's background is + # transparent + ) + +# Second -- Bar chart, grouping by continent + + politics_religion_bar_chart <- politics_religion_dataset_long %>% + mutate( # South and North America are too long strings + continent = case_when( + continent == "South America" ~ "South\nAmerica", # Line break + continent == "North America" ~ "North\nAmerica", + TRUE ~ continent + ) + ) %>% + group_by(continent, variable) %>% + dplyr::summarize( + life_importance = mean(life_importance, na.rm = TRUE) + ) %>% + ungroup() %>% + ggplot() + + geom_bar( + aes( + x = continent, y = life_importance, fill = variable + ), + position = "dodge", stat = "identity" + ) + + xlab("Continent") + + ylab("Importance in Life (1-4)") + + scale_fill_discrete(name = "") + # An easy way to get rid of the legend title + theme_minimal() + # Always looks better + theme( + legend.position = "bottom", + plot.background = element_rect(color = "white") # Without this the graph's background is + # transparent + ) + +# Finally -- Scatter plot of all countries + +# Scatter plots allow us to visualize data in two dimensions, i.e. along two variables. +# Here it's politics and religion's importance in life + + politics_religion_scatter_plot <- politics_religion_dataset %>% + ggplot() + + geom_point( + aes( + x = Q04_life_politics, y = Q06_life_religion, color = continent + ) + ) + + xlab("Importance of Politics") + + ylab("Importance of Religion") + + scale_x_continuous( # Make sure scale is 1-4 + limits = c(1, 4) + ) + + scale_y_continuous( + limits = c(1, 4) + ) + + theme_minimal() + + theme( + legend.position = "bottom", + plot.background = element_rect(color = "white") # Without this the graph's background is + # transparent + ) + +# Fun alternative -- replace points with country abbreviation + + politics_religion_scatter_plot2 <- politics_religion_dataset %>% + ggplot() + + geom_text( + aes( + x = Q04_life_politics, y = Q06_life_religion, + label = B_COUNTRY_ALPHA, color = continent + ), + check_overlap = TRUE + ) + + xlab("Importance of Politics") + + ylab("Importance of Religion") + + scale_x_continuous( # Make sure scale is 1-4 + limits = c(1, 4) + ) + + scale_y_continuous( + limits = c(1, 4) + ) + + theme_minimal() + + theme( + legend.position = "bottom", + plot.background = element_rect(color = "white") # Without this the graph's background is + # transparent + ) + +# Save everything important + + ggsave( + "output/politics_religion_density.png", + politics_religion_density_plot2 + ) + + ggsave( + "output/politics_religion_bar.png", + politics_religion_bar_chart + ) + + ggsave( + "output/politics_religion_scatter.png", + politics_religion_scatter_plot + ) + + ggsave( + "output/politics_religion_scatter2.png", + politics_religion_scatter_plot2 + ) + +# Challenges for today: + + # Using child_data_long, create a gt() table showing the five most important child qualities + # for a given country, along with either the number or %age of respondents in each country who + # called it important (hint: you'll have to use filter() and summarize() for this) + + # Then create a density plot showing the distribution of importance of these five child qualities + # across countries (i.e. summarize at the country level then use ggplot() and geom_density()) + + # Create a scatter plot showing an interesting comparison between two child qualities across + # countries/continents + \ No newline at end of file diff --git a/slides/session_1/pics/code_not_running.png b/slides/session_1/pics/code_not_running.png new file mode 100644 index 0000000..be6086e Binary files /dev/null and b/slides/session_1/pics/code_not_running.png differ diff --git a/slides/session_1/pics/data_not_loading1.png b/slides/session_1/pics/data_not_loading1.png new file mode 100644 index 0000000..e3309e6 Binary files /dev/null and b/slides/session_1/pics/data_not_loading1.png differ diff --git a/slides/session_1/pics/data_not_loading2.png b/slides/session_1/pics/data_not_loading2.png new file mode 100644 index 0000000..4615dc9 Binary files /dev/null and b/slides/session_1/pics/data_not_loading2.png differ diff --git a/slides/session_1/pics/function_not_found.png b/slides/session_1/pics/function_not_found.png new file mode 100644 index 0000000..d3af667 Binary files /dev/null and b/slides/session_1/pics/function_not_found.png differ diff --git a/slides/session_1/pics/object_not_found.png b/slides/session_1/pics/object_not_found.png new file mode 100644 index 0000000..2f044f3 Binary files /dev/null and b/slides/session_1/pics/object_not_found.png differ diff --git a/slides/session_1/session_1.Rmd b/slides/session_1/session_1.Rmd index 0c1f218..527a66c 100644 --- a/slides/session_1/session_1.Rmd +++ b/slides/session_1/session_1.Rmd @@ -335,8 +335,6 @@ x[x %in% c(1, 2, 5)] # Elements in the set 1, 2, 5 --- - - ## R Basics .panelset[ @@ -390,6 +388,39 @@ In practice, we visualize data frames as **tables** where each vector is a **col --- +## R Basics + +NOTE — R works in a manner that allows to write a specific function over multiple lines. This is called a **code chunk**. + +This: +```{r} +a <- mean(c(seq(1, 4, by = 0.5))) + +a +``` + +Is the same as this: +```{r} +a <- mean( + c( + seq(1, 4, by = 0.5) + ) +) + +a + +``` + +--- + +## R Basics + +You can click anywhere in the code chunk and click "run" or Cmd+Enter (Mac)/Ctrl+Enter (Windows), and the whole chunk will run. + +KEY — If you only select a portion of the code chunk and run that, then R will identify the chunk as unfinished and refuse to let you do anything else until you've "completed" it. + +--- + class: center, middle # Coding Set Up @@ -478,12 +509,108 @@ norms_values_data <- data.table::fread( --- -## Avoiding Basic Issues in RStudio +## Basic Issues in RStudio +### Data Not Loading + +```{r, echo = FALSE, out.width = "90%", fig.align = "center"} -- Package not loading -- Data not loaded -- Data frame not found -- Variable not found +knitr::include_graphics( + "pics/data_not_loading1.png" +) +``` + + +```{r, echo = FALSE, out.width = "90%", fig.align = "center"} + +knitr::include_graphics( + "pics/data_not_loading2.png" +) +``` + +--- + +## Basic Issues in RStudio +### Data Not Loading + +Check: +- Working directory — `getwd()` or check the top of the RStudio console. +- File path — are there any typos? Is your file where you expect it to be? + +Solutions: +- If you were provided with an `.Rproj` file alongside your script, make sure that you opened the project. +- Modify the working directory using `setwd()` or correct the file path if need be! + +--- + +## Basic Issues in RStudio +### Function Not Found + +```{r, echo = FALSE, out.width = "100%", fig.align = "center"} + +knitr::include_graphics( + "pics/function_not_found.png" +) +``` + +--- + +## Basic Issues in RStudio +### Function Not Found + +Check: +- From which package the function comes. You can do this using `??FUNCTION NAME` (e.g. `??tabyl`) or through a Google search. +- That the package is (1) installed in your environment and (2) loaded. Having the package installed isn't sufficient! + +Solutions: +- If the package isn't installed, use `install.packages("tabyl")`. If the package isn't loaded, use `library(tabyl)` or `pacman::p_load(tabyl)`. + +--- + +## Basic Issues in RStudio +### Code Not Running + +```{r, echo = FALSE, out.width = "100%", fig.align = "center"} + +knitr::include_graphics( + "pics/code_not_running.png" +) +``` + +--- + +## Basic Issues in RStudio +### Code Not Running + +Check: +- That you didn't miss a parenthesis (`)`) or bracket (`}`) in your code! This is the most common reason. +- If you missed it, the console will show a `+` at the start of the console line instead of the expected `>`. + +Solutions: +- Type gibberish and/or the missing parenthesis/bracket until the `>` reappears. More likely, you'll have to rerun the code chunk to make sure it works! + +--- + +## Basic Issues in RStudio +### Object Not Found + +```{r, echo = FALSE, out.width = "100%", fig.align = "center"} + +knitr::include_graphics( + "pics/object_not_found.png" +) +``` + +--- + +## Basic Issues in RStudio +### Object Not Found + +Check: +- Whether the object exists in your environment. More likely than not, you either misspelt the name of the object, or you skipped the code that creates it (remember that a script is like a recipe and steps can't be skipped!). + +Solutions: +- Backtrack in your code and run the chunk that creates the object. +- If a typo is at fault, correct the typo. --- @@ -555,12 +682,6 @@ Syllabus: **https://mfiorina.github.io/sais_r_course/syllabus/r_course_syllabus. Thomas Mock, “A Gentle Introduction to Tidy Statistics in R” (**[blog post](https://themockup.blog/posts/2018-12-10-a-gentle-guide-to-tidy-statistics-in-r/)** and **[video](https://www.rstudio.com/resources/webinars/a-gentle-introduction-to-tidy-statistics-in-r/)**) -Dominic Royé, **[“A very short introduction to Tidyverse”](https://dominicroye.github.io/en/2020/a-very-short-introduction-to-tidyverse/)** - -tidyr, **[“Pivoting”](https://tidyr.tidyverse.org/articles/pivot.html)** - -Hadley Wickham, **[“dplyr 1.0.0: working across columns”](https://www.tidyverse.org/blog/2020/04/dplyr-1-0-0-colwise/)** - Hadley Wickham & Garrett Grolemund, **[R for Data Science](https://r4ds.had.co.nz/)** RStudio, **[RStudio Cheatsheets](https://www.rstudio.com/resources/cheatsheets/)** diff --git a/slides/session_1/session_1.html b/slides/session_1/session_1.html index af0961a..bfa74c4 100644 --- a/slides/session_1/session_1.html +++ b/slides/session_1/session_1.html @@ -209,6 +209,48 @@ ## R Basics +NOTE — R works in a manner that allows to write a specific function over multiple lines. This is called a **code chunk**. + +This: + +```r +a <- mean(c(seq(1, 4, by = 0.5))) + +a +``` + +``` +## [1] 2.5 +``` + +Is the same as this: + +```r +a <- mean( + c( + seq(1, 4, by = 0.5) + ) +) + +a +``` + +``` +## [1] 2.5 +``` + +--- + +## R Basics + +You can click anywhere in the code chunk and click "run" or Cmd+Enter (Mac)/Ctrl+Enter (Windows), and the whole chunk will run. + +KEY — If you only select a portion of the code chunk and run that, then R will identify the chunk as unfinished and refuse to let you do anything else until you've "completed" it. + +--- +## R Basics + + .panelset[ .panel[.panel-name[Scalars] @@ -296,8 +338,6 @@ --- - - ## R Basics .panelset[ @@ -362,6 +402,8 @@ .panel[.panel-name[Installing Packages] +Packages are groups of user-created functions that help us accomplish tasks that would be harder/impossible using base R functions. + ### Easy @@ -384,6 +426,8 @@ .panel[.panel-name[Setting up File Paths] +File paths help R identify where the files you want to use are located. + You want your code to be **reproducible** and **easy to use by other people** Simple solution: Create an `.rproj` file that people can open to access your R environment @@ -440,12 +484,83 @@ --- -## Avoiding Basic Issues in RStudio +## Basic Issues in RStudio +### Data Not Loading + +<img src="pics/data_not_loading1.png" width="90%" style="display: block; margin: auto;" /> + + +<img src="pics/data_not_loading2.png" width="90%" style="display: block; margin: auto;" /> + +--- + +## Basic Issues in RStudio +### Data Not Loading + +Check: +- Working directory — `getwd()` or check the top of the RStudio console. +- File path — are there any typos? Is your file where you expect it to be? + +Solutions: +- If you were provided with an `.Rproj` file alongside your script, make sure that you opened the project. +- Modify the working directory using `setwd()` or correct the file path if need be! + +--- + +## Basic Issues in RStudio +### Function Not Found + +<img src="pics/function_not_found.png" width="100%" style="display: block; margin: auto;" /> + +--- + +## Basic Issues in RStudio +### Function Not Found + +Check: +- From which package the function comes. You can do this using `??FUNCTION NAME` (e.g. `??tabyl`) or through a Google search. +- That the package is (1) installed in your environment and (2) loaded. Having the package installed isn't sufficient! + +Solutions: +- If the package isn't installed, use `install.packages("tabyl")`. If the package isn't loaded, use `library(tabyl)` or `pacman::p_load(tabyl)`. + +--- + +## Basic Issues in RStudio +### Code Not Running + +<img src="pics/code_not_running.png" width="100%" style="display: block; margin: auto;" /> + +--- + +## Basic Issues in RStudio +### Code Not Running + +Check: +- That you didn't miss a parenthesis (`)`) or bracket (`}`) in your code! This is the most common reason. +- If you missed it, the console will show a `+` at the start of the console line instead of the expected `>`. + +Solutions: +- Type gibberish and/or the missing parenthesis/bracket until the `>` reappears. More likely, you'll have to rerun the code chunk to make sure it works! + +--- + +## Basic Issues in RStudio +### Object Not Found + +<img src="pics/object_not_found.png" width="100%" style="display: block; margin: auto;" /> + +--- + +## Basic Issues in RStudio +### Object Not Found + +Check: +- Whether the object exists in your environment. More likely than not, you either misspelt the name of the object, or you skipped the code that creates it (remember that a script is like a recipe and steps can't be skipped!). -- Package not loading -- Data not loaded -- Data frame not found -- Variable not found +Solutions: +- Backtrack in your code and run the chunk that creates the object. +- If a typo is at fault, correct the typo. ---