
# Functions ---------------------------------------------------------------

# Build a LaTeX correspondence (confusion) table for two categorical variables.
#
# `x` and `y` are cross-tabulated as cell proportions rounded to two decimals.
# Rows (levels of `x`) are labelled "Survey"; columns (levels of `y`) are
# grouped under an "Orbis" header. The generated footnote reports
#   * the number of complete (x, y) pairs,
#   * the sum of the diagonal cells (taken over the rounded proportions, so it
#     agrees with the printed cells),
#   * Cohen's Kappa and the bounds of its 95% confidence interval.
#
# Arguments:
#   x, y  Vectors to cross-tabulate.
#   cap   Caption handed to kbl().
#   lbl   Label handed to kbl().
#   note  Optional text placed in front of the generated footnote.
#
# The finished kable object is printed and returned.
check_correspondence <- function(x, y, cap = "", lbl = "", note = NULL) {
  shares <- round(prop.table(table(x, y)), 2)
  diag_total <- sum(diag(shares))
  # Elements 1-3 are used below as estimate, lower and upper bound
  kappa <- sprintf("%.2f", CohenKappa(x, y, conf.level = 0.95))
  n_obs <- sum(complete.cases(x, y))

  # Long table (columns x, y, n) reshaped to one row per level of x
  wide <- rename(
    pivot_wider(as_tibble(shares), id_cols = x, names_from = y, values_from = n),
    Survey = x
  )

  fnote <- str_c(
    "This table shows the degree of correspondence in size between the survey responses and Orbis financial data. Results are based on ",
    format(n_obs, big.mark = ","),
    " observations. The diagonal elements sum to ",
    diag_total,
    ". Cohen's Kappa is ",
    kappa[1],
    ", with 95% confidence interval [",
    kappa[2],
    ", ",
    kappa[3],
    "]."
  )
  if (!is.null(note)) {
    fnote <- str_c(note, fnote, sep = " ")
  }

  # One blank header cell above the row labels, "Orbis" above all other columns
  header <- c(" " = 1, "Orbis" = (ncol(wide) - 1))

  latex_tbl <- kbl(wide,
    format = "latex", booktabs = TRUE, escape = FALSE,
    caption = cap, label = lbl
  )
  latex_tbl <- add_header_above(latex_tbl, header)
  latex_tbl <- footnote(latex_tbl, fnote, threeparttable = TRUE)

  print(latex_tbl)
  latex_tbl
}


# Summarise a sample as one row: restriction label, number of distinct firms
# and number of observations (rows).
#
# Arguments:
#   df           Data frame holding the current sample.
#   firm_id      Name of the column that identifies a firm.
#   restriction  Label describing the restriction that produced `df`.
#   ov           Optional overview from earlier calls; when supplied, the new
#                row is appended to it with bind_rows().
#
# Returns a data frame with columns Restriction, Firms and Observations.
sample_overview <- function(df, firm_id = "bvd_id_number", restriction = "", ov = NULL) {
  current <- data.frame(
    Restriction = restriction,
    Firms = length(unique(df[[firm_id]])),
    Observations = nrow(df)
  )

  # First step of the log: nothing to append to
  if (is.null(ov)) {
    return(current)
  }

  bind_rows(ov, current)
}


# Load Data ---------------------------------------------------------------

# Open the panel; rows are only pulled into memory by collect() below
df <- open_dataset("Data/final_iab_panel.parquet")

# Restrict to firms with a data link agreement (cdat3 == 1).
# NOTE(review): the sections below use data.table syntax (`:=`, setkey) on
# df_dp, so the collected object is presumably a data.table -- confirm.
df_dp <- collect(filter(df, cdat3 == 1))


# Start the log of firms and firm-years per sample restriction
overview <- sample_overview(
  df_dp,
  restriction = "Firms with Linking Agreement"
)


# Validation firm characteristics -----------------------------------------

# Convert the data frame's columns with as_factor()
# (presumably haven::as_factor, turning labelled columns into factors --
# TODO confirm).
df_dp <- as_factor(df_dp)

# Bin the Orbis financials (turnover, num_emp_financials) into categories
# whose labels match the ones used for rev_cat / num_emp_cat in the coarse
# mapping further below.
#
# case_when() takes the first matching condition, so every upper bound also
# implies "above the previous bound". Missing inputs match no condition and
# fall through to `.default = NA`.
df_dp[, `:=`(
  rev_cat_orbis = factor(
    case_when(
      turnover < 50000 ~ "Less than 50.000 EUR",
      turnover <= 100000 ~ "50.000 - 100.000 EUR",
      # Upper bound is 350,000 as stated in the label (was 300,000, which
      # pushed turnover in (300,000; 350,000] into the next bracket)
      turnover <= 350000 ~ "100.001 - 350.000 EUR",
      turnover <= 700000 ~ "350.001 - 700.000 EUR",
      turnover <= 2000000 ~ "700.001 - 2.000.000 EUR",
      turnover <= 6000000 ~ "2.000.001 - 6.000.000 EUR",
      turnover <= 8000000 ~ "6.000.001 - 8.000.000 EUR",
      turnover <= 10000000 ~ "8.000.001 - 10.000.000 EUR",
      turnover <= 12000000 ~ "10.000.001 - 12.000.000 EUR",
      turnover <= 20000000 ~ "12.000.001 - 20.000.000 EUR",
      turnover <= 40000000 ~ "20.000.001 - 40.000.000 EUR",
      turnover <= 50000000 ~ "40.000.001 - 50.000.000 EUR",
      turnover <= 60000000 ~ "50.000.001 - 60.000.000 EUR",
      turnover > 60000000 ~ "More than 60.000.000 EUR",
      .default = NA
    ),
    levels = c(
      "Less than 50.000 EUR",
      "50.000 - 100.000 EUR",
      "100.001 - 350.000 EUR",
      "350.001 - 700.000 EUR",
      "700.001 - 2.000.000 EUR",
      "2.000.001 - 6.000.000 EUR",
      "6.000.001 - 8.000.000 EUR",
      "8.000.001 - 10.000.000 EUR",
      "10.000.001 - 12.000.000 EUR",
      "12.000.001 - 20.000.000 EUR",
      "20.000.001 - 40.000.000 EUR",
      "40.000.001 - 50.000.000 EUR",
      "50.000.001 - 60.000.000 EUR",
      "More than 60.000.000 EUR"
    ),
    ordered = TRUE
  ),
  # Unlike rev_cat_orbis, this factor is not created as ordered
  num_emp_cat_orbis = factor(
    case_when(
      num_emp_financials == 0 ~ "No employees",
      num_emp_financials <= 5 ~ "1-5",
      num_emp_financials <= 9 ~ "6-9",
      num_emp_financials <= 19 ~ "10-19",
      num_emp_financials <= 49 ~ "20-49",
      num_emp_financials <= 249 ~ "50-249",
      num_emp_financials <= 499 ~ "250-499",
      num_emp_financials <= 999 ~ "500-999",
      num_emp_financials >= 1000 ~ "More than 1000",
      .default = NA
    ),
    levels = c(
      "No employees",
      "1-5",
      "6-9",
      "10-19",
      "20-49",
      "50-249",
      "250-499",
      "500-999",
      "More than 1000"
    )
  )
)]

# Collapse the detailed size categories into four coarse groups. The same
# mapping is applied to the survey variables (num_emp_cat, rev_cat) and to
# their Orbis counterparts (num_emp_cat_orbis, rev_cat_orbis), so the
# resulting ordered factors share identical levels. Values that match none of
# the listed categories (including NA) become NA.

# Employees: survey
df_dp[, num_emp_coarse := factor(
  case_when(
    num_emp_cat %in% c("No employees", "1-5", "6-9") ~ "0-9",
    num_emp_cat %in% c("10-19", "20-49") ~ "10-49",
    num_emp_cat %in% c("50-249") ~ "50-249",
    num_emp_cat %in% c("250-499", "500-999", "More than 1000") ~ "> 250"
  ),
  levels = c("0-9", "10-49", "50-249", "> 250"),
  ordered = TRUE
)]

# Employees: Orbis
df_dp[, num_emp_coarse_orbis := factor(
  case_when(
    num_emp_cat_orbis %in% c("No employees", "1-5", "6-9") ~ "0-9",
    num_emp_cat_orbis %in% c("10-19", "20-49") ~ "10-49",
    num_emp_cat_orbis %in% c("50-249") ~ "50-249",
    num_emp_cat_orbis %in% c("250-499", "500-999", "More than 1000") ~ "> 250"
  ),
  levels = c("0-9", "10-49", "50-249", "> 250"),
  ordered = TRUE
)]

# Revenue: survey
df_dp[, rev_cat_coarse := factor(
  case_when(
    rev_cat %in% c(
      "Less than 50.000 EUR",
      "50.000 - 100.000 EUR",
      "100.001 - 350.000 EUR",
      "350.001 - 700.000 EUR",
      "700.001 - 2.000.000 EUR"
    ) ~ "EUR 0–2 Mio.",
    rev_cat %in% c(
      "2.000.001 - 6.000.000 EUR",
      "6.000.001 - 8.000.000 EUR",
      "8.000.001 - 10.000.000 EUR"
    ) ~ "EUR 2–10 Mio.",
    rev_cat %in% c(
      "10.000.001 - 12.000.000 EUR",
      "12.000.001 - 20.000.000 EUR",
      "20.000.001 - 40.000.000 EUR",
      "40.000.001 - 50.000.000 EUR"
    ) ~ "EUR 10–50 Mio.",
    rev_cat %in% c(
      "50.000.001 - 60.000.000 EUR",
      "More than 60.000.000 EUR"
    ) ~ "> EUR 50 Mio."
  ),
  levels = c("EUR 0–2 Mio.", "EUR 2–10 Mio.", "EUR 10–50 Mio.", "> EUR 50 Mio."),
  ordered = TRUE
)]

# Revenue: Orbis
df_dp[, rev_cat_coarse_orbis := factor(
  case_when(
    rev_cat_orbis %in% c(
      "Less than 50.000 EUR",
      "50.000 - 100.000 EUR",
      "100.001 - 350.000 EUR",
      "350.001 - 700.000 EUR",
      "700.001 - 2.000.000 EUR"
    ) ~ "EUR 0–2 Mio.",
    rev_cat_orbis %in% c(
      "2.000.001 - 6.000.000 EUR",
      "6.000.001 - 8.000.000 EUR",
      "8.000.001 - 10.000.000 EUR"
    ) ~ "EUR 2–10 Mio.",
    rev_cat_orbis %in% c(
      "10.000.001 - 12.000.000 EUR",
      "12.000.001 - 20.000.000 EUR",
      "20.000.001 - 40.000.000 EUR",
      "40.000.001 - 50.000.000 EUR"
    ) ~ "EUR 10–50 Mio.",
    rev_cat_orbis %in% c(
      "50.000.001 - 60.000.000 EUR",
      "More than 60.000.000 EUR"
    ) ~ "> EUR 50 Mio."
  ),
  levels = c("EUR 0–2 Mio.", "EUR 2–10 Mio.", "EUR 10–50 Mio.", "> EUR 50 Mio."),
  ordered = TRUE
)]


# The survey/Orbis comparison is run on financial year 2019 only
df_check <- subset(df_dp, fyear == 2019)

# Number of employees: survey vs. Orbis
save_kable(
  check_correspondence(
    df_check$num_emp_coarse,
    df_check$num_emp_coarse_orbis,
    cap = "Correspondence Number of Employees.",
    lbl = "correspondence-emp"
  ),
  file = "Tables/validation_num_emp.tex"
)

# Revenue: survey vs. Orbis
save_kable(
  check_correspondence(
    df_check$rev_cat_coarse,
    df_check$rev_cat_coarse_orbis,
    cap = "Correspondence Revenue.",
    lbl = "correspondence-rev"
  ),
  file = "Tables/validation_rev.tex"
)


# Compute outcome variables -----------------------------------------------

# There are gaps in our panel. This has to be respected when looking at
# first differences.

# Simplest approach: Only keep observations that have a prior financial year.
#
# Within each firm (rows ordered by year through the key) every observation is
# compared with the row directly before it:
#   valid_prev  TRUE if that row is exactly the previous financial year,
#               FALSE if there is a gap, NA for a firm's first observation.
#   d_emp       relative change in employees versus that row.
# Both are computed on the full panel BEFORE any rows are dropped. Taking the
# lag after the filter would pair an observation with whichever retained row
# precedes it, i.e. compute changes across gaps and return NA for each firm's
# first retained year even though its predecessor was observed.

setkey(df_dp, bvd_id_number, fyear)

df_dp[, `:=`(
  valid_prev = shift(fyear, type = "lag") == (fyear - 1),
  d_emp = (num_emp_financials - shift(num_emp_financials, type = "lag")) /
    shift(num_emp_financials, type = "lag")
), by = .(bvd_id_number)]

df_dp[, table(valid_prev, useNA = "ifany")]

# Rows with valid_prev FALSE or NA are dropped here, so every remaining d_emp
# is a change between two consecutive financial years.
df_dp <- subset(df_dp, valid_prev == TRUE)
df_dp[, valid_prev := NULL]


# Outcome: Change in employment, winsorized at the 10th and 90th percentile
# of the retained sample. A previous-year value of 0 yields Inf or NaN in
# d_emp; this is not treated separately.
df_dp[, d_emp := Winsorize(
  d_emp,
  val = quantile(d_emp, probs = c(0.1, 0.90), na.rm = TRUE)
)]

# Indicators for a positive, a negative and any change in employment
df_dp[, `:=`(
  pos_emp = d_emp > 0,
  neg_emp = d_emp < 0,
  ch_emp = abs(d_emp) > 0
)]


# Compute independent variables -------------------------------------------

# Survey-based indicators for an employment response.
#
# A response counts as "indicated" when `jobs` exceeds `cutoff`
# (presumably `jobs` is the weight the respondent put on the employment
# channel -- TODO confirm).

cutoff <- 0.1

# Defined only for firms with a hypothetical tax increase, NA otherwise
df_dp[, neg_emp_survey := ifelse(treat_sign == "Increase", jobs > cutoff, NA)]

# Defined only for firms with a hypothetical tax decrease, NA otherwise
df_dp[, pos_emp_survey := ifelse(treat_sign == "Decrease", jobs > cutoff, NA)]

# Defined irrespective of the direction of the hypothetical change
df_dp[, emp_survey := jobs > cutoff]

# Cross-tabs of realized vs. indicated response where the actual tax change
# has the same sign as the hypothetical one
df_dp[
  lbt_change_sign == treat_sign & treat_sign == "Decrease",
  table(pos_emp, emp_survey)
]
df_dp[
  lbt_change_sign == treat_sign & treat_sign == "Increase",
  table(neg_emp, emp_survey)
]


# Descriptives ------------------------------------------------------------

# Log firms and firm-years after each restriction. Every call appends one row
# to `overview`.

# Sample after requiring a prior financial year
overview <- sample_overview(
  df_dp,
  restriction = "More than one financial year",
  ov = overview
)

# Additional restrictions

# Any tax change (only counted; the subset itself is not reused)
overview <- sample_overview(
  df_dp[lbt_change_sign != "No change"],
  restriction = "Any tax change",
  ov = overview
)

# Hypothetical treatment == actual treatment (taken from df_dp again)
rest_sample <- df_dp[lbt_change_sign == treat_sign]
overview <- sample_overview(
  rest_sample,
  restriction = "Equal signs of treatment",
  ov = overview
)

# Non-missing employees
rest_sample <- rest_sample[!is.na(d_emp)]
overview <- sample_overview(
  rest_sample,
  restriction = "Non-missing employees",
  ov = overview
)

# Final increases
inc_sample <- rest_sample[treat_sign == "Increase"]
overview <- sample_overview(
  inc_sample,
  restriction = "Final increases",
  ov = overview
)

# Final decreases
dec_sample <- rest_sample[treat_sign == "Decrease"]
overview <- sample_overview(
  dec_sample,
  restriction = "Final decreases",
  ov = overview
)

# Write the sample selection log as a LaTeX table
save_kable(
  footnote(
    kbl(
      overview,
      caption = "Sample Selection LBT Validation.",
      linesep = "",
      label = "validation-lbt",
      format = "latex",
      booktabs = TRUE
    ),
    "This table illustrates the sample selection process for 
               the LBT change validation exercise.",
    threeparttable = TRUE
  ),
  "Tables/validation_lbt.tex"
)

# Tax changes by financial year (with margins) for observations that have a
# non-missing employment change and matching actual/hypothetical signs.
# local() keeps the intermediate objects out of the global environment.
local({
  counts <- df_dp[
    !is.na(d_emp) & treat_sign == lbt_change_sign,
    addmargins(table(lbt_change_sign, fyear))
  ]

  # One row per year, one column per value of lbt_change_sign
  by_year <- as_tibble(counts)
  by_year <- pivot_wider(by_year, names_from = lbt_change_sign, values_from = n)
  by_year <- rename(by_year, Year = fyear)

  latex_tbl <- kbl(
    by_year,
    format = "latex",
    linesep = "",
    booktabs = TRUE,
    caption = "Overview LBT Changes.",
    format.args = list(big.mark = ",")
  )
  latex_tbl <- footnote(
    latex_tbl,
    "This table shows the distribution of LBT increases and decreases for the firms in our sample which we are able to link to external data sources.",
    threeparttable = TRUE
  )

  save_kable(latex_tbl, "Tables/overview_tax_changes.tex", keep_tex = TRUE)
})

# Analysis ----------------------------------------------------------------

# Regress realized change indicators on the corresponding survey indicator.
#
# `outcome` is the variable stem. The formulas are built as
#   pos_<outcome> ~ <outcome>_survey   and   neg_<outcome> ~ <outcome>_survey.
# The data are read from `df_dp` in the enclosing (global) environment.
#
# Prints a model summary and returns an unnamed list of three feols fits, all
# with vcov = "HC1":
#   1. pos_<outcome>, observations with an actual and hypothetical decrease
#   2. neg_<outcome>, observations with an actual and hypothetical increase
#   3. same as 2, restricted to fyear >= 2018
conduct_analysis <- function(outcome) {
  survey_var <- str_c(outcome, "_survey")
  pos_fml <- as.formula(str_c("pos_", outcome, " ~ ", survey_var))
  neg_fml <- as.formula(str_c("neg_", outcome, " ~ ", survey_var))

  fits <- list(
    # Indicator for increase in outcome regressed on indicator for saying so
    feols(
      pos_fml,
      data = subset(df_dp, lbt_change_sign == "Decrease" & treat_sign == "Decrease"),
      vcov = "HC1"
    ),
    # Indicator for decrease in outcome regressed on indicator for saying so
    feols(
      neg_fml,
      data = subset(df_dp, lbt_change_sign == "Increase" & treat_sign == "Increase"),
      vcov = "HC1"
    ),
    # Same regression as the previous one, financial years from 2018 onwards
    feols(
      neg_fml,
      data = subset(df_dp, lbt_change_sign == "Increase" & treat_sign == "Increase" & fyear >= 2018),
      vcov = "HC1"
    )
  )

  print(msummary(fits, gof_map = c("nobs", "adj.r.squared")))

  fits
}

# Run the employment regressions and label the three models
reg_res <- list(jobs = conduct_analysis("emp"))
names(reg_res$jobs) <- c(
  "Positive Employment Change",
  "Negative Employment Change",
  "Negative Employment Change"
)

# Sample means of the realized indicators in the two matched samples.
# NOTE(review): extra_rows is built but never handed to msummary() below
# (no add_rows argument) -- confirm whether it was meant to be included.
extra_rows <- tibble::tribble(
  ~" ", ~"Decrease", ~"Increase",
  "Sample Means",
  df_dp[treat_sign == "Decrease" & lbt_change_sign == "Decrease", mean(pos_emp, na.rm = TRUE)],
  df_dp[treat_sign == "Increase" & lbt_change_sign == "Increase", mean(neg_emp, na.rm = TRUE)]
)

# Regression table written to disk; only the intercept and the survey
# indicator are shown
msummary(
  reg_res$jobs,
  gof_map = c("nobs", "adj.r.squared"),
  coef_map = c("(Intercept)" = "Intercept", "emp_surveyTRUE" = "Indicated in Survey"),
  title = "Firm-level Association Hypothetical vs. Realized Behavior. \\label{tab:hypo-vs-real}",
  output = "Tables/hypo_vs_real.tex",
  escape = FALSE
)

# Drop the objects created by this script and trigger garbage collection
rm(
  df, df_dp, df_check, cutoff, overview,
  rest_sample, inc_sample, dec_sample, reg_res, extra_rows
)
gc()