
# Write data --------------------------------------------------------------

# Locate the municipality workbooks and keep only those dated 2000 or later.
path_municipalities <- file.path("Data", "AGS")

# The extension is anchored so that names which merely contain ".xlsx"
# (e.g. "x.xlsx.bak") are not picked up. Names starting with "~$" are the
# owner/lock files Excel writes next to an open workbook; they are not
# readable workbooks, so they are dropped as well.
files <- list.files(path_municipalities, pattern = "\\.xlsx$")
files <- files[!startsWith(files, "~$")]
files

# The first run of digits in each file name is parsed as a day-month-year date.
file_dates <- str_extract(files, "\\d+") |> dmy()

# which() discards entries whose date could not be parsed (NA) instead of
# carrying NA into the subset.
files <- files[which(year(file_dates) >= 2000)]

# Full list of column names used when reading the workbooks. The reading code
# selects names from this vector by position, so the order must not change.
var_names <- c(
  # positions 1-2
  "record_type", "text_indicator",
  # positions 3-7: key components
  "state", "district", "county", "community", "municipality",
  # positions 8-13
  "name", "area", "population", "male", "female", "pop_km2",
  # positions 14-15
  "zip_code", "flag",
  # positions 16-17: coordinates
  "long", "lat",
  # positions 18-21
  "commute_key", "commute_desc", "urban_key", "urban_desc"
)

# Read sheet 2 of every selected workbook into a data frame with harmonised
# column names. The column layout differs between files, so the names are
# picked per file from `var_names`, based on the file's year and on the number
# of columns seen in its first rows. Returns one data frame per file, each with
# a `year` column, numeric `long`/`lat` columns and no `flag` column.
municipalities <- lapply(files, \(file_name) {
  file_path <- file.path(path_municipalities, file_name)

  # The first run of digits in the file name is parsed as a day-month-year date.
  yr <- str_extract(file_name, "\\d+") |> dmy() |> year()
  print(yr)

  # Probe only the first rows to find out how many columns the sheet exposes.
  n_cols <- read_excel(
    path = file_path,
    sheet = 2,
    skip = 6,
    n_max = 10,
    col_names = FALSE
  ) |>
    ncol()
  print(n_cols)

  if (yr %in% c(2007, 2008)) {
    # These years are read without `pop_km2` (position 13); it is added below.
    clnms <- var_names[c(1:12, 14:17)]
  } else if (n_cols %in% c(15, 17)) {
    clnms <- var_names[seq_len(n_cols)]
  } else if (n_cols == 16) {
    # Same as the 17-column layout but without `flag` (position 15).
    clnms <- var_names[c(1:14, 16:17)]
  } else {
    # Fallback: every name except `flag`.
    clnms <- var_names[c(1:14, 16:length(var_names))]
  }

  df <- read_excel(
    path = file_path,
    sheet = 2,
    skip = 6,
    col_names = clnms
  )

  if (yr %in% c(2007, 2008)) {
    # Insert an empty `pop_km2` after column 12 so the layout matches the
    # other years.
    df <- cbind(
      df[, 1:12],
      pop_km2 = rep(NA_real_, nrow(df)),
      df[, 13:ncol(df)]
    )
  }
  df$flag <- NULL
  df$year <- yr

  # Guarantee numeric coordinate columns. Each column is checked on its own,
  # and by name rather than via `$`, which warns on tibbles and partially
  # matches on plain data frames. Character values use a decimal comma.
  for (coord in c("long", "lat")) {
    if (!coord %in% names(df)) {
      df[[coord]] <- rep(NA_real_, nrow(df))
    } else if (is.character(df[[coord]])) {
      df[[coord]] <- as.numeric(sub(",", ".", df[[coord]], fixed = TRUE))
    }
  }

  df
})


# Stack the per-file data frames, keep only rows that carry both a
# municipality and a district code, and build the `ags` key by concatenating
# the state, district, county and municipality codes. `ags` and `year` are
# moved to the front.
muni_data <- bind_rows(municipalities) |>
  filter(!is.na(municipality), !is.na(district)) |>
  mutate(ags = paste0(state, district, county, municipality)) |>
  relocate(ags, year)

# Persist the combined table.
write_parquet(muni_data, sink = "Data/ags_zip_code.parquet")

# Drop this section's intermediates from the workspace and release memory.
rm(
  path_municipalities,
  files,
  file_dates,
  var_names,
  municipalities,
  muni_data
)
gc()