
# Starting point ---------------------------------------

# The cleaned financials are the starting point of the panel.
dt <- read_parquet(file = "Data/financials_cleaned.parquet")

# Guarantee a data.table before the by-reference calls below: setkey() only
# accepts a data.table, and read_parquet() may hand back a plain
# data.frame/tibble depending on how the file was written (TODO confirm).
# setDT() converts in place and is harmless if `dt` already is a data.table.
setDT(dt)

# Shorten the two verbose column names.
setnames(
  dt,
  old = c("number_of_employees", "operating_revenue_turnover"),
  new = c("num_emp", "turnover")
)

# Key (and thereby sort) the panel by `bvd_id_number` and `fyear`.
setkey(dt, bvd_id_number, fyear)


# Add address data ---------------------------------------------------

addresses <- read_parquet("Data/addresses_ags.parquet")

# `:=` requires a data.table; convert in place so the steps below do not depend
# on the class read_parquet() happens to return (no-op for a data.table).
setDT(addresses)

# Count the address records per firm.
addresses[, n := .N, by = bvd_id_number]

# Focus on single-establishment firms: keep firms with exactly one address
# record, then drop the helper column again.
addresses <- addresses[n == 1L]
addresses[, n := NULL]

# Left join: every financials row is kept; firms without a (unique) address
# get NA in the address columns.
dt <- merge(dt, addresses, by = "bvd_id_number", all.x = TRUE)

# Add municipality data ---------------------------------------------------

municipalities <- read_parquet("Data/municipality_panel.parquet")
setDT(municipalities, key = c("ags", "year"))

# Keep the first row per `ags`-`year` combination so the join below cannot
# multiply firm-year rows.
municipalities <- unique(municipalities, by = c("ags", "year"))

# Inner join on `ags` and year: firm-years without a matching municipality
# row are dropped.
dt <- merge(
  dt, municipalities,
  by.x = c("ags", "fyear"),
  by.y = c("ags", "year"),
  all = FALSE
)


# Add survey data ---------------------------------------------------------

df_joint <- read_dta("Data/survey_data_prepped_with_weights.dta")
setDT(df_joint, key = "BvDID")

# May only combine data if linking agreement is given: only survey rows with
# cdat3 == 1 enter the merge. The join is a full outer join, so firms that
# appear in only one of the two sources are kept; columns present in both
# inputs get the suffixes "_financials" / "_survey".
dt <- merge(
  dt, df_joint[cdat3 == 1],
  by.x = "bvd_id_number",
  by.y = "BvDID",
  all.x = TRUE,
  all.y = TRUE,
  suffixes = c("_financials", "_survey")
)

# Drop rows that carry no firm identifier.
dt <- dt[!is.na(bvd_id_number)]

# Remove any secondary indices and restore the firm-year key before saving.
setindex(dt, NULL)
setkey(dt, bvd_id_number, fyear)

write_parquet(dt, "Data/final_iab_panel.parquet")


# Make variable names Stata-compatible.
#
# Stata variable names may be at most 32 characters long, so every name is
# truncated to that length. Truncation can make two distinct names identical
# (e.g. long names that differ only in a trailing "_financials" / "_survey"
# suffix), so later duplicates get a numeric tag ("_2", "_3", ...) appended,
# with the stem shortened so the result still fits into 32 characters.
# Names that are already unique after truncation are returned unchanged.
#
# names_vector: character vector of column names.
# Returns: character vector of the same length, each element at most 32
#          characters long and (for non-NA input) unique.
clean_stata_names <- function(names_vector) {
  max_len <- 32L
  cleaned <- substr(names_vector, 1L, max_len)

  for (i in seq_along(cleaned)) {
    stem <- cleaned[i]
    earlier <- cleaned[seq_len(i - 1L)]
    k <- 1L
    # The first occurrence keeps its name; only later collisions are renamed.
    while (!is.na(cleaned[i]) && cleaned[i] %in% earlier) {
      k <- k + 1L
      tag <- paste0("_", k)
      cleaned[i] <- paste0(substr(stem, 1L, max_len - nchar(tag)), tag)
    }
  }

  cleaned
}

# Rename all columns through clean_stata_names() before the Stata export.
names(dt) <- clean_stata_names(names(dt))

# Prepare the 2019 cross-section:
#   * factors become character columns,
#   * integer64 columns become doubles (for Stata),
#   * `participant` is 1 where survey information (cdat3) is present, else 0.
dt <- dt |>
  mutate(across(where(is.factor), as.character)) |>
  mutate(across(where(is.integer64), as.numeric)) |>
  mutate(participant = ifelse(is.na(cdat3), 0, 1)) |>
  filter(fyear == 2019)

# Write the result as a Stata .dta file.
write_dta(dt, "Data/final_iab_panel_2019.dta")

# Free the large intermediate objects.
rm(dt, addresses, municipalities, df_joint)
gc()