-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmerging_hgarb_files.R
163 lines (126 loc) · 5.38 KB
/
merging_hgarb_files.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
library(dplyr)
library(readxl)
library(rio)
library(here)
library(janitor)
library(tidyr)
library(stringr)
source("reason_fieldnames_bydocuments.R")
source("ppd.R")
##########Plan collected each year##########
year_2021 <- list.files("data/2023_May7", pattern = "_2021.xlsx") %>% str_remove("_2021.xlsx")
year_2022 <- list.files("data/2023_May7", pattern = "_2022.xlsx") %>% str_remove("_2022.xlsx")
year_2023 <- list.files("data/2023_May7", pattern = "_2023.xlsx") %>% str_remove("_2023.xlsx")
# check plan collected in one year but not in other year
setdiff(year_2021, year_2022)
setdiff(year_2022, year_2021)
setdiff(year_2023, year_2022)
########Function to read and process data#############
import_and_slice <- function(filename) {
read_and_process_sheet <- function(sheet) {
data <- read_excel(filename, sheet = sheet, skip = 1) %>%
clean_names() %>%
mutate(fye = as.numeric(fye)) %>%
filter(fye >= 2000 & fye <= 2023)
max_year <- max(data$fye, na.rm = TRUE)
end_index <- which(data$fye == max_year)[1]
sliced_data <- data[1:end_index, ]
return(sliced_data)
}
## sheet 1
gasb = read_and_process_sheet("GASB 68") %>%
# Differentiate some cols from sheet gasb & acfr
setnames(
old = c("adec_amt", "adec_paid_amt", "adec_missed"),
new = c(paste0(c("adec_amt", "adec_paid_amt", "adec_missed"), "_from_gasb")))
### sheet 2
valuation = read_and_process_sheet("Actuarial Valuation") %>%
# sheet 2 vs sheet 3
setnames(
old = c("actuarial_return", "market_return","ava" , "aal","ual", "funded_ratio_old", "payroll"),
new = c(paste0(c("actuarial_return", "market_return","ava" , "aal", "ual", "funded_ratio_old", "payroll"), "_from_valuation")))
## sheet 3
acfr <- read_and_process_sheet("CAFR") %>%
# valuation vs sheet acfrs
setnames(old = c("actuarial_return", "market_return", "ava", "aal", "ual", "funded_ratio_old","payroll"),
new = c(paste0(c("actuarial_return", "market_return", "ava", "aal", "ual", "funded_ratio_old",
"payroll"), "_from_acfr"))) %>%
#gasb vs. sheet acfr
setnames(
old = c("adec_amt", "adec_paid_amt", "adec_missed"),
new = c(paste0(c("adec_amt", "adec_paid_amt", "adec_missed"), "_from_acfr")))
# extract plan_id
plan_id <- import(filename, sheet = "Database Input") %>%
select(2) %>% colnames()
result <- gasb %>%
full_join(valuation, by = c("full_name", "fye")) %>%
full_join(acfr, by = c("full_name", "fye")) %>%
arrange(fye) %>%
#adding plan_id
mutate(plan_id = plan_id)
return(result)
}
# test the function
filename <- "data/2023_May7/Texas_TRS_updated_2023.xlsx"
result <- import_and_slice(filename)
####### Loop thorough year 2023#######
filelist_2023 <- list.files("data/2023_May7", pattern = "_2023.xlsx")
df_2023 <- data.frame()
df_exception_23 <- data.frame(filename = character(), error_message = character(),
stringsAsFactors = FALSE)
for (filename in filelist_2023) {
filename = paste0("data/2023_May7/", filename)
result <- tryCatch({
plan <- import_and_slice(filename)
df_2023 <- rbind(df_2023, plan) %>% distinct()
NULL # No error occurred
}, error = function(e) {
list(filename = filename, error_message = e$message)
})
# If an error occurred, add to df_exception
if (!is.null(result)) {
df_exception_23 <- rbind(df_exception_23, result)
}
}
#########Loop through 2022##########
# plans already have 2023 update
intersection22_23 <- Reduce(intersect, list(year_2023, year_2022))
# filter out plans ends _2022 but already have 2023 update
filelist_2022 <- setdiff(list.files("data/2023_May7", pattern = "_2022.xlsx"),
paste0(intersection22_23, "_2022.xlsx")) # these already have 2023 update
df_2022 <- data.frame()
df_exception_22 <- data.frame(filename = character(), error_message = character(),
stringsAsFactors = FALSE)
for (filename in filelist_2022) {
filename = paste0("data/2023_May7/", filename)
result <- tryCatch({
plan <- import_and_slice(filename)
df_2022 <- rbind(df_2022, plan) %>% distinct()
NULL # No error occurred
}, error = function(e) {
list(filename = filename, error_message = e$message)
})
# If an error occurred, add to df_exception
if (!is.null(result)) {
df_exception_22 <- rbind(df_exception_22, result)
}
}
# test the function with previously error file
filename <- "data/2023_May7/Arizona_PSPRS_Tier 3_updated_2023.xlsx"
result <- import_and_slice(filename)
######Exceptions############
# Delaware has extra column -x5
df_exception_23
delaware <- import_and_slice("data/2023_May7/Delaware_PERS_NSPPP_updated_2023.xlsx") %>%
select(-x5)
#Michigan_JRS_updated_2022.xlsx has this: "...12" and doesn't have this: "Tot_Total_Amt"
michigan <- import_and_slice("data/2023_May7/Michigan_JRS_updated_2023.xlsx") %>%
rename(tot_total_amt = x12)
exceptions <- rbind(delaware, michigan)
#dealing with other errors
df_2023_correction <- df_2023 %>%
mutate(full_name = ifelse(plan_id == "6060", "Arizona - Public Safety Personnel Retirement System Tier 3", full_name))
#######Result#########
all_plans_2023 <- rbind(df_2022, df_2023_correction, exceptions) %>%
mutate(plan_id = str_remove(plan_id, "\\.0"))
#write.csv(all_plans_2023, "output/all_plans_2023_hgarb.csv")