
# Function definitions ----------------------------------------------------

# Read all "Land XX ..." sheets of one workbook and stack them into a single
# data frame with a harmonised column layout.
#
# Args:
#   path: Path to one Excel workbook. A four-digit year must appear in the
#         file name (or, failing that, somewhere else in the path); it
#         selects the year-specific layout fixes applied below.
#
# Returns:
#   A data frame with the columns year, state, key_reg, key_mun, muni, pop,
#   levy_prop_a, levy_prop_b and levy_lbt. year, pop and the levy_* columns
#   are numeric (cells that cannot be parsed become NA, with the usual
#   coercion warning); all other columns are character.
#
# Errors if no year can be found, if the workbook has no matching sheet, or
# if a sheet has fewer columns than the layout requires.
process_excel_file <- function(path) {
  # Year of the file, used to deal with layout inconsistencies. Look at the
  # file name first so that digits in a directory name cannot shadow it.
  yr <- as.numeric(str_extract(basename(path), "\\d{4}"))
  if (is.na(yr)) {
    yr <- as.numeric(str_extract(path, "\\d{4}"))
  }
  if (is.na(yr)) {
    stop("Cannot determine a four-digit year from path: ", path, call. = FALSE)
  }
  print(yr) # progress indicator

  # Sheets of interest: names starting with "Land XX "
  sheets <- grep("^Land \\d{2} .", excel_sheets(path), value = TRUE)
  if (length(sheets) == 0) {
    stop("No sheet named 'Land XX ...' found in ", path, call. = FALSE)
  }

  # Target column names (the layout of most files)
  clnms <- c("year", "state", "key_reg", "key_mun", "muni", "pop",
             "levy_prop_a", "levy_prop_b", "levy_lbt")

  # Number of leading rows to skip; differs before/after 2008
  n_skip <- if (yr < 2008) 4 else 5

  # Raw columns needed before the year-specific fix: 2003 gains one column,
  # 2009 loses one, every other year is used as-is.
  extra_cols <- if (yr == 2003) -1L else if (yr == 2009) 1L else 0L
  min_cols <- length(clnms) + extra_cols

  # One data frame per sheet
  dt_list <- map(sheets, \(sheet) {
    # col_types = "text" makes every column character, so no further
    # coercion is needed before stacking.
    raw <- read_excel(path, sheet = sheet, skip = n_skip,
                      col_names = FALSE, col_types = "text")
    if (ncol(raw) < min_cols) {
      stop(sprintf("Sheet '%s' in %s has %d columns, expected at least %d.",
                   sheet, path, ncol(raw), min_cols), call. = FALSE)
    }

    if (yr == 2003) {
      # Insert an empty placeholder as third column (becomes key_reg);
      # presumably the 2003 layout has no such column.
      raw <- cbind(raw[, 1:2], tibble(VAR = rep("", nrow(raw))), raw[3:ncol(raw)])
    } else if (yr %in% 2006:2007) {
      # Columns 2 and 3 are swapped relative to the target layout
      raw <- raw[, c(1, 3, 2, 4:ncol(raw))]
    } else if (yr == 2009) {
      # Drop the fifth column, which has no counterpart in the target layout
      raw <- raw[, c(1:4, 6:ncol(raw))]
    }

    # Keep only the leading columns that map onto clnms
    raw <- raw[, seq_along(clnms)]
    names(raw) <- clnms
    raw
  })

  out_dt <- bind_rows(dt_list)
  mutate(out_dt,
         across(.cols = c(year, pop, starts_with("levy")), .fns = as.numeric))
}

# Load data ---------------------------------------------------------------

# Directory holding the yearly levy-rate workbooks
path_levies <- file.path("Data", "Hebesätze")

# Only consider Excel workbooks (.xls / .xlsx). Anything else in the folder
# must not be renamed or parsed. Names starting with "~$" are the lock files
# Excel creates next to an open workbook.
filenames <- list.files(path_levies, pattern = "\\.xlsx?$", ignore.case = TRUE)
filenames <- filenames[!startsWith(filenames, "~$")]

if (length(filenames) == 0) {
  stop("No Excel workbooks found in ", path_levies, call. = FALSE)
}
if (!all(str_detect(filenames, "\\d{4}"))) {
  stop("No four-digit year in file name(s): ",
       paste(filenames[!str_detect(filenames, "\\d{4}")], collapse = ", "),
       call. = FALSE)
}

# Canonical name: "Hebesätze Ausgabe <year><extension>", keeping the original
# extension (including its case).
new_filenames <- paste0(
  "Hebesätze Ausgabe ",
  str_extract(filenames, "\\d{4}"),
  str_extract(filenames, regex("\\.xlsx?$", ignore_case = TRUE))
)

# Two sources mapping to the same target would overwrite each other on rename.
if (anyDuplicated(new_filenames) > 0) {
  stop("Several workbooks map to the same target name: ",
       paste(unique(new_filenames[duplicated(new_filenames)]), collapse = ", "),
       call. = FALSE)
}

# Rename only the files whose name actually changes and verify the result.
# local() keeps the helper variables out of the global environment.
local({
  changed <- filenames != new_filenames
  if (any(changed)) {
    ok <- file.rename(file.path(path_levies, filenames[changed]),
                      file.path(path_levies, new_filenames[changed]))
    if (!all(ok)) {
      stop("Could not rename: ",
           paste(filenames[changed][!ok], collapse = ", "), call. = FALSE)
    }
  }
})

paths <- file.path(path_levies, new_filenames)

# Parse every workbook and stack the results
levy_data <- map(paths, process_excel_file) |> bind_rows()

# Drop rows without a numeric year (year is NA after the numeric coercion in
# process_excel_file), then left-pad 4- and 5-character municipality keys
# with zeros to 6 characters. Keys of any other length are left untouched.
levy_data <- levy_data |>
  filter(!is.na(year)) |>
  mutate(key_mun_new = case_when(
    str_length(key_mun) == 4 ~ paste0("00", key_mun),
    str_length(key_mun) == 5 ~ paste0("0", key_mun),
    .default = key_mun
  ))

# Diagnostic: per state, count rows up to 2016 whose padded key is still not
# 6 characters long and whose key does not start with the state code.
levy_data |>
  filter(str_length(key_mun_new) != 6 &
           year <= 2016 &
           substr(state, 1, 2) != substr(key_mun, 1, 2)) |>
  count(state)

# Build the final `ags` key:
#   * key_reg gets one leading "0" when shorter than 12 characters,
#   * state gets one leading "0" when shorter than 2 characters,
#   * 2003 rows: state followed by the padded municipality key
#     (key_reg is only an empty placeholder in that year),
#   * all other years: characters 1-5 and 10-12 of key_reg.
# mutate() evaluates its arguments in order, so `ags` sees the padded
# key_reg and state.
levy_data <- levy_data |>
  mutate(
    key_reg = ifelse(str_length(key_reg) < 12, paste0("0", key_reg), key_reg),
    state = ifelse(str_length(state) < 2, paste0("0", state), state),
    ags = ifelse(year == 2003,
                 paste0(state, key_mun_new),
                 paste0(substr(key_reg, 1, 5), substr(key_reg, 10, 12)))
  ) |>
  relocate(ags, year)

# One row per (ags, year). unique() on a data frame has no `by` argument
# (that is data.table syntax), so the argument was silently ignored and only
# fully identical rows were removed. distinct() keeps the first occurrence
# in read order.
levy_data <- levy_data |>
  distinct(ags, year, .keep_all = TRUE) |>
  arrange(ags, year) |>
  select(-key_mun_new)

write_parquet(levy_data, sink = "Data/levy_data.parquet")

# Clean up the objects created by this section
rm(levy_data, path_levies, filenames, new_filenames, paths)
gc()
