

# Regression analysis ---------------------------------------------------------
 

# The function "conduct_analysis" performs weighted regression analyses on a dataset (dt), 
# filters and subsets data based on specified weights, 
# and computes models for various outcomes using interaction terms.
# 
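# The following parameters can be specified:
#
# - dt: Dataset used in the regression analysis.
# - applied_weight: Label of the weighting scheme; it is also used in the
#   names of the output folders and files. Documented values:
#   "unweighted"  
#   "weight_all"   
# - cntrls: Optional character string with additional right-hand-side terms
#   (control variables) that is appended to the regression formula with "+".
#   NULL (the default) estimates the models without controls.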
# 
# The function generates and saves multiple visualizations, including regression coefficients, 
# asymmetry effects, magnitude effects, and heterogeneity across subgroups 
# (e.g., sectors, legal forms, and size categories). The results are organized into 
# structured outputs, including regression models, covariance matrices, and formatted data 
# for further analysis. Additionally, the function saves all outputs and plots in designated folders.


conduct_analysis <- function(dt, applied_weight = "unweighted", cntrls = NULL){
  
  cat("Working on \nweight: ", applied_weight, "\ncontrols:",
      if(is.null(cntrls)) "No\n" else "Yes\n", sep = "")
  
  # Sets the regression weights. If applied_weight is "unweighted", a new column
  # weight is added to the data frame with all values set to 1, which assigns
  # equal weight to all observations. For any other value of applied_weight, the
  # weight column is copied from the column weight_web of the data frame.
  # NOTE(review): the value of applied_weight itself is not used to select the
  # weight column -- confirm that weight_web is the intended source for
  # "weight_all".
  
  if(applied_weight == "unweighted"){
    dt$weight = 1
  } else{
    dt$weight = dt$weight_web
  }

  # Initializes an empty list named results, which is intended to store multiple 
  # objects or data structures (e.g., regression models, covariance matrices, 
  # formatted data, or plots) generated later in the function.
  results = list()


  # Constructs the output folders for figures and tables by combining "Figures"
  # or "Tables", applied_weight, and "no_controls" / "controls" (depending on
  # whether cntrls is NULL).
  # For example, applied_weight = "unweighted" without controls results
  # in path_figures = "Figures/unweighted/no_controls".
  path_figures = file.path("Figures", applied_weight, if(is.null(cntrls)) "no_controls" else "controls")
  path_tables = file.path("Tables", applied_weight, if(is.null(cntrls)) "no_controls" else "controls")
  

  # Stores results of multiple Ordinary Least Squares regressions for the following
  # outcome margins (dep. variables): 
  # wages, jobs, payout, reserves, prices, investment, tax, other
  # (debt is not part of the formula below).
  # The respective dependent variable is regressed on the following independent variables:
  # - treat_mag*treat_sign: Includes treat_mag (1%, 10%, 25%), treat_sign (tax increase/decrease), 
  #                         and their interaction term (treat_mag:treat_sign) as predictors. 
  # - economic sector, legal form, respondent (CEO, other, not known), 
  #   size_rev (size category based on employees/revenues), 
  #   impact_on_revenue (percentage changed in revenues compared to beginning of 2020), 
  #   impact_on_net_income (percentage changed in net income compared to beginning of 2020), 
  # 
  # Regressions apply weights for the regressions based on the weight parameter specified. 
  reg_form <- "c(wages, jobs, payout, reserves, prices, investment,
                  tax, other) ~ treat_mag*treat_sign"
  if (!is.null(cntrls)){
    reg_form <- str_c(reg_form, "+", cntrls)
  }
  reg_form <- as.formula(reg_form)
  
  mods <- feols(reg_form,
                data = dt, weights = ~weight, vcov = "hetero")
  
  
  # Applies the vcov() function to each regression model in the list mods.
  # vcov(x) extracts the variance-covariance matrix of the regression coefficients 
  # for each model in mods. This results in a data frame with the variance-covariance 
  # information for all regression models in mods. 
  covs = map(mods, \(x) vcov(x)) |> 
    map(as_tibble) |> 
    map(\(x) {
      x$coefficient = names(x)
      return(x)
    }) |> 
    bind_rows(.id = "outcome") |>
    pivot_longer(cols = -c(outcome, coefficient), names_to = "coefficient_2", values_to = "covariance") |> 
    mutate(outcome = str_remove(outcome, "lhs: "))
 

  # Processes the regression models stored in the list mods, tidies their outputs, 
  # combines them into a single data set.  
  reg_data <- map(mods, \(x)tidy(x)) |> bind_rows(.id = "outcome") |> 
    mutate(outcome = str_remove(outcome, "lhs\\: "))
  
  
  # Stores key results from the regression analysis into a structured list within the results object:
  # - Creates a new entry named "main_spec" in the results list.
  # - Stores the regression models (mods) in the "models" entry of the "main_spec" list.
  # - Stores the covariance matrices (covs) in the "covariances" entry of the "main_spec" list.
  # - Stores the regression results (reg_data) in the "est_data" entry of the "main_spec" list.
  results[["main_spec"]] <- list(
    models = mods,
    covariances = covs,
    est_data = reg_data
  )

  
  
    
  # Asymmetry ---------------------------------------------------------------
  
  
  # Processes the tidy regression results (reg_data) to analyze asymmetry in the effects 
  # of tax increases and decreases. It creates a new data frame (reg_data_asym) that 
  # organizes and reshapes the regression estimates and standard errors 
  # for "increase" and "decrease" terms.
  # - Re-codes the term column: 
  #   - "decrease" is assigned to the term (Intercept) (representing the baseline)
  #   - "increase" is assigned to the term treat_signIncrease (representing the effect of a tax increase)
  # - Filters the data to keep only rows where term is either "increase" or "decrease"                                                   
  # - Reshapes the data into a wide format, where separate columns are created for 
  #   "increase" and "decrease" estimates and their standard errors.
  reg_data_asym <- reg_data |> 
    mutate(term = fct_recode(term,
                             "decrease" ="(Intercept)",
                             "increase" = "treat_signIncrease")) |> 
    filter(term %in% c("increase", "decrease")) |> 
    pivot_wider(id_cols = outcome, names_from = term,
                values_from = estimate:std.error) 
  
  
  
  # Processes the covariance data (covs) to extract specific covariance values 
  # related to the interaction between the (Intercept) (tax decrease) and treat_signIncrease 
  # (tax increase) coefficients for each outcome. This isolates the covariance values that 
  # measure the relationship between the (Intercept) and treat_signIncrease terms 
  # in the regression models. These covariances are necessary for calculating 
  # combined standard errors or testing asymmetry between the effects of 
  # tax increases and decreases.
  covs_asym <- covs |> 
    filter(coefficient == "(Intercept)" & coefficient_2 == "treat_signIncrease") |> 
    select(outcome, covariance)
  
  
  # Combines regression data (reg_data_asym) with covariance information (covs_asym) 
  # to compute new estimates, standard errors, and confidence intervals for the 
  # combined effects of tax increases and decreases. 
  # Then, the following steps are performed:
  #   - Computes the combined effect of a tax increase as the sum of estimate_decrease 
  #     (baseline effect) and estimate_increase.
  #   - Computes the standard error for the combined effect:
  #     sqrt(std.error_decrease^2 + std.error_increase^2 + 2*covariance))
  #   - Splits column names into two components: .value (e.g., estimate, std.error) 
  #     and term (e.g., increase, decrease). Resulting long format has rows for each 
  #     combination of outcome and term (e.g., "increase" or "decrease").
  #   - Adds confidence interval bounds for each term (estimate +/- 1.96 * std.error).  
  reg_data_asym <- inner_join(reg_data_asym,covs_asym)|> 
    mutate(estimate_increase = estimate_decrease + estimate_increase,
           std.error_increase = sqrt(std.error_decrease^2 + std.error_increase^2 + 2*covariance)) |> 
    select(-covariance) |> 
    pivot_longer(cols = -outcome, names_to = c(".value", "term"), names_sep = "_") |> 
    mutate(conf.low = estimate - 1.96 * std.error,
           conf.high = estimate + 1.96 * std.error)
  
  
  # Code creates a bar plot with error bars to visualize the effects of tax increases 
  # and decreases on the respective outcomes, based on the data in reg_data_asym.
  # It presents the magnitude of the effects and the confidence intervals.
  reg_data_asym |> 
    mutate(outcome = factor(outcome,
                            levels = outcomes, labels = outcome_labels),
           term = str_to_title(term)) |> 
    ggplot(aes(x=outcome, y=estimate)) +
    geom_col(mapping = aes(fill=term), position=position_dodge2(width = 0.9, preserve="single")) +
    geom_pointrange(mapping=aes(ymin = conf.low, ymax = conf.high, group=term),
                    position=position_dodge2(width=0.9, preserve="single")) +
    #geom_label(aes(label=sprintf("%.2f", round(estimate,2))), position=position_dodge2(width = 0.9, preserve="single")) +
    scale_fill_brewer(palette = "Set1") + 
    labs(
      x = "Category",
      y = "Incidence (Percent)",
      fill = "Sign of tax change"
    ) +
    theme_bw() +
    theme(axis.text.x = element_text(hjust = 1, angle = 45))
  ggsave(filename=paste0(path_figures,"/sign_", applied_weight,".pdf"), width = fig_width, height = fig_height,
         units = "cm")
  
  
  
  # Partial effects:
  
  
  # This code calculates the marginal effects of treat_sign (e.g., tax increase vs tax decrease) 
  # for each outcome when the treatment magnitude is "Low". 
  # It organizes these results, annotates them with statistical significance and adjusted p-values
  # (i.e., adjusts the p-values (p.value) using the Benjamini-Yekutieli (BY) method 
  # to control the false discovery rate (FDR).)
  margin = lapply(mods, avg_slopes, variable = "treat_sign", newdata=subset(treat_mag=="Low")) |> 
    bind_rows(.id = "outcome") |> 
    mutate(outcome = str_remove(outcome, "lhs: "),
           outcome = factor(outcome, 
                            levels = outcomes, 
                            labels=outcome_labels),
           outcome = fct_rev(outcome), 
           q.value = p.adjust(p.value, method = "BY"),
           significant = ifelse(q.value < 0.05, "Yes", "No"),
           term = str_to_title(str_replace(term, "\\_", " ")),
           contrast = str_to_title(contrast),
           margin = str_c(term, contrast, sep = ": ")) 
  
  
  # Stores the partial asymmetry analysis results (calculated marginal effects) within the results list
  # under "partial_asymmetry".
  results[["main_spec"]]$partial_asymmetry = margin
  
  # Plots margins_sign_<applied_weight>.pdf:
  # Creates a point plot to visualize the marginal effects stored in the margin data 
  # frame, showing the estimates, their statistical significance (i.e., open circles 
  # represent non-significant effects, solid circles represent significant effects), 
  # and grouping them by the specific outcome categories. The marginal effects display 
  # the impact of treat_sign (e.g., tax increase vs. tax decrease) on each outcome 
  # when the treatment magnitude is "Low". The file name includes the applied weight.
  margin |> 
    ggplot() +
    geom_point(aes(outcome, estimate, shape = significant)) +
    geom_hline(yintercept = 0, linetype = "dashed") +
    scale_shape_manual(values = c("No"=1, "Yes"=16)) +
    scale_color_brewer(palette="Set1") +
    coord_flip() +
    labs(y = "Estimate",
         x = "Outcome",
         shape = "Significant",
         color = "Sign") +
    facet_wrap(~margin) +
    theme_bw() + 
    theme(axis.text.y = element_text(hjust=0))
  ggsave(paste0(path_figures, "/margins_sign_", applied_weight,".pdf"), width = 1.5* fig_width, height = fig_height,
         units = "cm")

  
  
  
  
    
  # Magnitude ------------------------------------------------------

  
  # Uses the regression data (reg_data) to extract and classify coefficients related 
  # to the treatment magnitude (treat_mag) and treatment sign (treat_sign). 
  # It creates new columns to categorize the magnitude of the treatment (Low, Medium, High) and 
  # whether it corresponds to a tax increase or decrease. 
  reg_mag <- reg_data |> 
    filter(str_detect(term, "treat_mag") | term == "(Intercept)" | term == "treat_signIncrease") |> 
    mutate(magnitude = case_when(
      str_detect(term, "(Intercept)|(^treat_signIncrease)") ~ "Low",
      str_detect(term, fixed("Medium", ignore_case = T)) ~ "Medium",
      str_detect(term, fixed("High", ignore_case = T)) ~ "High"),
      sign = ifelse(str_detect(term, fixed("Increase", ignore_case = T)),
                    "Increase",
                    "Decrease")
    ) 
  

  
  # Creates a list, coef_list, that organizes regression coefficients into categories 
  # based on the treatment magnitude (Low, Medium, High) and treatment sign (Increase, Decrease).
  # The coefficients specified in each category define which terms are used for 
  # further analysis, such as calculating combined effects.
  coef_list = list(
    Decrease_Low = c("(Intercept)"),
    Decrease_Medium = c("(Intercept)", "treat_magMedium"),
    Decrease_High = c("(Intercept)", "treat_magHigh"),
    Increase_Low = c("(Intercept)", "treat_signIncrease"),
    Increase_Medium = c("(Intercept)", "treat_signIncrease", "treat_magMedium", "treat_magMedium:treat_signIncrease"),
    Increase_High = c("(Intercept)", "treat_signIncrease", "treat_magHigh", "treat_magHigh:treat_signIncrease")
  )


  
  
  
  # The function combined_inference calculates combined regression estimates and 
  # their confidence intervals for different treatment categories (coef_list) and outcomes and plots it. 
  # It uses the regression data (reg_data) and variance-covariance matrix (covs).
  # In particular, it does the following:
  #   - Computes the total effect of multiple coefficients for each treatment category and outcome.
  #   - Incorporates the variance-covariance structure to calculate accurate standard errors 
  #     and confidence intervals for each treatment and outcome.
  #   - The resulting data frame is structured for easy use in plots or tables.  
  combined_inference <- function(coef_list, outcomes, rdata = reg_data, cvs = covs){
    # Computes, for every combination of treatment category and outcome, the sum
    # of the listed coefficients together with its standard error and a 95%
    # confidence interval.
    #
    # Arguments:
    # - coef_list: named list; each element is a character vector of coefficient
    #              names whose sum defines one treatment category.
    # - outcomes:  character vector of outcome names as they appear in the
    #              outcome column of rdata / cvs.
    # - rdata:     tidy coefficient table (columns outcome, term, estimate).
    #              Defaults to reg_data from the enclosing function.
    # - cvs:       long-format covariance table (columns outcome, coefficient,
    #              coefficient_2, covariance). Defaults to covs from the
    #              enclosing function.
    #
    # Returns a tibble with the columns treatment (factor, levels in the order of
    # names(coef_list)), estimate, std.error, outcome (factor labelled with
    # outcome_labels, which is taken from the enclosing scope), conf.low and
    # conf.high.
    treatment_levels <- names(coef_list)

    # stringsAsFactors = FALSE keeps the outcome names as plain character values,
    # so they are compared with the character outcome columns like with like.
    grd <- expand.grid(coefs = coef_list, outcome = outcomes,
                       stringsAsFactors = FALSE)

    plot_data <- map2(grd$coefs, grd$outcome, \(x, y){
      # A coefficient that is requested but absent from the model would silently
      # drop out of the sum and yield a misleading estimate, so report it.
      missing_terms <- setdiff(x, rdata$term[rdata$outcome == y])
      if (length(missing_terms) > 0) {
        warning("combined_inference: coefficient(s) ",
                paste(missing_terms, collapse = ", "),
                " not found for outcome '", y,
                "'; they are omitted from the combined estimate.",
                call. = FALSE)
      }

      estimate <- filter(rdata, term %in% x & outcome == y) |>
        summarize(estimate = sum(estimate)) |>
        pull(estimate)

      # Variance of a sum of coefficients = sum over all entries of the
      # corresponding covariance sub-matrix. The long table holds both (i, j)
      # and (j, i), so every covariance enters twice, as required.
      cv <- filter(cvs, coefficient %in% x & coefficient_2 %in% x & outcome == y) |>
        summarize(covariance = sum(covariance)) |>
        pull(covariance)

      tibble(estimate = estimate,
             std.error = sqrt(cv),
             outcome = y)
    }) |>
      bind_rows(.id = "treatment") |>
      mutate(treatment = factor(treatment, levels = treatment_levels)) |>
      mutate(outcome = factor(outcome,
                              levels = outcomes, labels = outcome_labels)) |>
      mutate(conf.low = estimate - 1.96*std.error,
             conf.high = estimate + 1.96*std.error)
    return(plot_data)
  }

  

  
  # Calls the combined_inference function, which calculates combined estimates, 
  # standard errors, and confidence intervals for the coefficients in coef_list 
  # across the specified outcomes. It separates treatment categories into 
  # distinct components (sign and magnitude) and ensures that the magnitude column 
  # is an ordered factor for subsequent plotting or modeling.
  mag_plot_data = combined_inference(coef_list, outcomes) |> 
    separate_wider_delim(col=treatment, delim="_", names=c("sign", "magnitude")) |> 
    mutate(magnitude = factor(magnitude, levels = c("Low", "Medium", "High")))

  
  # Assigns the processed data stored in mag_plot_data to a specific sub-element 
  # ("combined_inference") within the "main_spec" section of the results list. 
  results[["main_spec"]]$combined_inference = mag_plot_data
  

  # Plots magnitude_Decrease/Increase/included_modes_applied_weight.pdf:
  # Creates and saves bar plots for the combined effects of treatment magnitude 
  # (Low, Medium, High) on each outcome, separately for "Decrease" and "Increase" treatment signs.
  # Saved file also includes information on mode and weights used. 
  for (s in c("Decrease", "Increase")){
    p <- mag_plot_data |> 
      filter(sign == s) |> 
      ggplot(aes(outcome, estimate, fill = magnitude)) +
      geom_col(position=position_dodge2(width = 0.9, preserve = "single")) +
      geom_pointrange(aes(ymin = conf.low, ymax = conf.high),
                      position=position_dodge2(width = 0.9, preserve = "single")) +
      scale_fill_brewer(palette="Set1") +
      labs(x="Outcome",
           y="Estimate + 95% confidence bounds",
           fill = "Size of Tax Change") +
      theme_bw() +
      theme(axis.text.x= element_text(angle = 45, hjust = 1))
    ggsave(filename = paste0(path_figures, "/magnitude_", s, "_", applied_weight,".pdf"), 
           plot = p, width = fig_width, height = fig_height, units = "cm")
  }
  
  
  
  
  # Partial effects 
  
  # Wrt. reference
  
  # Calculates and processes marginal effects for the variable treat_mag across 
  # the multiple regression models in mods. The Marginal effects represent the 
  # change in the dependent variable for a unit change in treat_mag (levels: 1%/10%/25%), averaged over 
  # the observations in the data set. Moreover,  p-values (p.value) are adjusted using the 
  # Benjamini-Yekutieli (BY) method to control the false discovery rate (FDR) for 
  # multiple hypothesis testing. Further, a new column (significant) is created, indicating whether 
  # the adjusted p-value (q.value) is below 0.05. 
  margin = lapply(mods, avg_slopes, variable = "treat_mag") |> 
    bind_rows(.id = "outcome") |> 
    mutate(outcome = str_remove(outcome, "lhs: "),
           outcome = factor(outcome, 
                            levels = outcomes, 
                            labels=outcome_labels),
           outcome = fct_rev(outcome), 
           q.value = p.adjust(p.value, method = "BY"),
           significant = ifelse(q.value < 0.05, "Yes", "No"),
           term = str_to_title(str_replace(term, "\\_", " ")),
           contrast = str_to_title(contrast),
           margin = str_c(term, contrast, sep = ": ")) 

  
  # Assigns the processed marginal effects data stored in margin to the  
  # sub-element "partial_magnitude" within the "main_spec" section of the results list.
  results[["main_spec"]]$partial_magnitude = margin
  

  
  # Plot margins_magnitude_<applied_weight>.pdf
  # Creates a point plot to visualize the marginal effects (margin) of treatment magnitudes 
  # (treat_mag) for different outcomes. The plot also highlights whether these effects 
  # are statistically significant and organizes them into facets by margin.
  # These facets compare the following two marginal effects:
  #   - High   vs. Low
  #   - Medium vs. Low
  # The file name includes the applied weight. 
  margin |> 
    ggplot() +
    geom_point(aes(outcome, estimate, shape = significant)) +
    geom_hline(yintercept = 0, linetype = "dashed") +
    scale_shape_manual(values = c("No"=1, "Yes"=16)) +
    scale_color_brewer(palette="Set1") +
    coord_flip() +
    labs(y = "Estimate",
         x = "Outcome",
         shape = "Significant",
         color = "Sign") +
    facet_wrap(~margin) +
    theme_bw() + 
    theme(axis.text.y = element_text(hjust=0))
  ggsave(paste0(path_figures, "/margins_magnitude_", applied_weight,".pdf")
         , width = 1.5* fig_width, height = fig_height, units = "cm")

  
  
  
    
  # Pairwise 25-1 vs 10-1
  
  # Calculates pairwise marginal effects for the variable treat_mag using the 
  # "pairwise" hypothesis. In particular, it compares the effects of "25%-1%" vs. "10%-1%" 
  # levels in treat_mag.
  # Moreover, the code processes the results to annotate outcomes, 
  # adjust p-values, classify significance, and format labels 
  # for further analysis or visualization.
  margin = lapply(mods, avg_slopes, variable = "treat_mag", hypothesis="pairwise") |>
    bind_rows(.id = "outcome") |>
    mutate(outcome = str_remove(outcome, "lhs: "),
           outcome = factor(outcome,
                            levels = outcomes,
                            labels=outcome_labels),
           outcome = fct_rev(outcome),
           q.value = p.adjust(p.value, method = "BY"),
           significant = ifelse(q.value < 0.05, "Yes", "No"),
           term = str_to_title(str_replace(term, "\\_", " ")))

  
  # Assigns the processed pairwise marginal effects data (margin) to the sub-element 
  # "partial_magnitude_pairwise" within the "main_spec" section of the results list.  
  results[["main_spec"]]$partial_magnitude_pairwise = margin

  
  
  # Plot margins_magnitude_pairwise_<applied_weight>.pdf
  # Visualizes the pairwise marginal effects of treatment magnitudes (treat_mag) on different outcomes,
  # i.e., (High/25% - Low/1%)-(Medium/10% - Low/1%).
  # Highlights statistical significance of the effects using point shapes 
  # (Yes for significant, No for non-significant).
  # The file name includes the applied weight.  
  margin |>
    ggplot() +
    geom_point(aes(outcome, estimate, shape = significant)) +
    geom_hline(yintercept = 0, linetype = "dashed") +
    scale_shape_manual(values = c("No"=1, "Yes"=16)) +
    scale_color_brewer(palette="Set1") +
    coord_flip() +
    labs(y = "Estimate",
         x = "Outcome",
         shape = "Significant",
         color = "Sign") +
    facet_wrap(~term) +
    theme_bw() +
    theme(axis.text.y = element_text(hjust=0))
  ggsave(paste0(path_figures, "/margins_magnitude_pairwise_", applied_weight,".pdf"),
         width = 1.5* fig_width, height = fig_height, units = "cm")
  

  
  
  
    
  # Combination -------------------------------------------------------------
  
  
  # Plot main_pointrange_included_modes_applied_weight.pdf
  # Creates a line plot with error bars to visualize the relationship between 
  # treatment magnitude-sign combinations and their estimated effects on outcomes.
  # Saved file also includes information on mode and weights used.
  mag_plot_data |> 
    mutate(treatment = case_when(
      sign == "Decrease" & magnitude == "High" ~ -25,
      sign == "Decrease" & magnitude == "Medium" ~ -10,
      sign == "Decrease" & magnitude == "Low" ~ -1,
      sign == "Increase" & magnitude == "Low" ~ 1,
      sign == "Increase" & magnitude == "Medium" ~ 10,
      sign == "Increase" & magnitude == "High" ~ 25,
    )) |> 
    ggplot(aes(treatment,estimate)) +
    geom_line() +
    geom_vline(xintercept = 0, linetype = "dashed") +
    geom_pointrange(aes(ymin=conf.low, ymax = conf.high)) +
    labs(x = "Tax burden change (Percentage points)",
         y = "Share of tax burden change (Percent)") +
    scale_x_continuous(breaks=seq(-25,25,5)) +
    facet_wrap(~outcome) +
    theme_bw()
  ggsave(paste0(path_figures, "/main_pointrange_", applied_weight,".pdf"), 
         width = fig_width, height = fig_height, units = "cm")
  
  # In the case of controls, produce alternative via the margins command
  # We want a comparable plot that is not shifted by the change in reference
  # category
  
  
  
  
    
  # Heterogeneity -----------------------------------------------------------

  # Fits multiple ordinary least squares (OLS) regression models for each respective 
  # outcome variable, testing for heterogeneity regarding the tax change sign and 
  # firm characteristics. The following variables are used:
  #   
  # - Dependent variables - categories/outcomes/margins: 
  #   wages, jobs, payout, reserves, prices, investment, tax, other
  #   (debt is not part of the formulas below)
  # 
  # - Independent variables:
  #    - treat_sign (tax increase, tax decrease)
  #    - economic sector, legal form, respondent (CEO, other, not known),
  #      size_rev (size category based on employees/revenues),
  #      impact_on_revenue (percentage changed in revenues compared to beginning of 2020),
  #      impact_on_net_income (percentage changed in net income compared to beginning of 2020),
  #    - treat_sign*(sector + legal + size_rev + respondent + 
  #                    impact_on_revenue + impact_on_net_income):
  #      Interaction terms allow the effect of treat_sign to vary across firm-level 
  #      characteristics or external conditions, enabling the analysis of heterogeneous effects.
  #   
  # Parameter "weights" indicates if and which weights are used.    
  
  
  # The function setup_coeflist creates a named list of coefficients, organized by 
  # specified labels. Each list element includes relevant coefficients for the label, 
  # along with baseline terms like "(Intercept)" and "treat_signIncrease".
  setup_coeflist <- function(labels, rdata){
    # Builds one coefficient vector per label: the two baseline terms followed by
    # every term in rdata$term that matches the label (the label is used as a
    # regular expression by str_subset). Duplicates are removed.
    # The returned list is named by the labels with their leading run of
    # lower-case letters, digits and underscores stripped
    # (e.g. "size_revMicro" becomes "Micro").
    baseline_terms <- c("(Intercept)", "treat_signIncrease")
    per_label <- vector("list", length(labels))
    for (i in seq_along(labels)) {
      matching_terms <- str_subset(rdata$term, labels[[i]])
      per_label[[i]] <- unique(c(baseline_terms, matching_terms))
    }
    names(per_label) <- str_remove(labels, "^[a-z0-9\\_]+")
    per_label
  }

  
  
  # Plot heterogeneity_included_modes_applied_weight_characteristic_increase/decrease.pdf:
  # This function, plot_heterogeneity, creates a bar plot with error bars to 
  # visualize heterogeneity in estimated effects across different firm characteristics, 
  # outcomes, and tax changes (increases and decreases).
  plot_heterogeneity <- function(dt, fill_lab, sign){
    # Draws dodged bars with point ranges (conf.low to conf.high) of the
    # estimates per outcome, filled by treatment, saves the figure as a PDF in
    # path_figures and returns the ggplot object.
    #
    # - dt:       data with the columns outcome, estimate, treatment, conf.low
    #             and conf.high.
    # - fill_lab: legend title; its lower-cased form, with whitespace replaced
    #             by "_", is also part of the file name.
    # - sign:     label of the tax-change sign; lower-cased in the file name.
    #
    # path_figures, applied_weight, fig_width and fig_height are taken from the
    # enclosing scope.
    dodge_bars <- position_dodge2(width = 0.9, preserve = "single")
    dodge_ranges <- position_dodge2(width = 0.9, preserve = "single")

    p <- ggplot(dt, aes(outcome, estimate, fill = treatment)) +
      geom_col(position = dodge_bars) +
      geom_pointrange(aes(ymin = conf.low, ymax = conf.high),
                      position = dodge_ranges) +
      scale_fill_brewer(palette = "Set1") +
      labs(x = "Outcome",
           y = "Estimate + 95% confidence bounds",
           fill = fill_lab) +
      theme_bw() +
      theme(axis.text.x = element_text(angle = 45, hjust = 1))

    characteristic <- str_to_lower(str_replace_all(fill_lab, "\\s", "_"))
    out_file <- paste0(path_figures, "/heterogeneity_", applied_weight, "_",
                       characteristic,
                       "_",
                       str_to_lower(sign),
                       ".pdf")
    ggsave(filename = out_file,
           plot = p, width = fig_width, height = fig_height,
           units = "cm")
    p
  }

 
  # The function do_heterogeneity_analysis performs subgroup-specific heterogeneity 
  # analysis, calculates combined inference statistics, and generates visualizations. 
  # It handles both tax increase and decrease effects, returning a structured 
  # list of results and plots for further use.
  do_heterogeneity_analysis <- function(include_cntrls = FALSE, ...){
    # Runs the heterogeneity analysis for every subgroup definition passed via
    # `...` and returns the estimates, plots and models in a nested list.
    #
    # Arguments:
    # - include_cntrls: single TRUE/FALSE. If TRUE, one set of models
    #                   treat_sign*(cntrls) is estimated once and reused for all
    #                   subgroups; if FALSE, a separate set of models
    #                   treat_sign*(<split variable>) is estimated per subgroup.
    # - ...:            named character vectors. The name is the display name of
    #                   the subgroup (e.g. "Sector"); the values are patterns
    #                   identifying the coefficient names of its categories
    #                   (e.g. "sectorManufacturing").
    #
    # Returns a list with one element per subgroup name, each holding
    #   $Increase / $Decrease: list(het = <combined estimates>, coef_plot = <plot>)
    #   $mod:      the models used for the subgroup
    #   $reg_data: the tidy coefficient table used for the subgroup
    # and, if include_cntrls is TRUE, additionally a top-level element $mod.
    #
    # dt, cntrls, outcomes, outcome_labels and plot_heterogeneity are taken from
    # the enclosing scope.
    stopifnot(is.logical(include_cntrls),
              length(include_cntrls) == 1L,
              !is.na(include_cntrls))

    # Sums the coefficients of each treatment category per outcome and derives
    # the standard error from the matching block of the covariance table.
    combined_inference <- function(coef_list, outcomes, rdata, cvs){
      treatment_levels <- names(coef_list)
      grd <- expand.grid(coefs = coef_list, outcome = outcomes,
                         stringsAsFactors = FALSE)
      plot_data <- map2(grd$coefs, grd$outcome, \(x, y){
        estimate <- filter(rdata, term %in% x & outcome == y) |>
          summarize(estimate = sum(estimate)) |>
          pull(estimate)
        cv <- filter(cvs, coefficient %in% x & coefficient_2 %in% x & outcome == y) |>
          summarize(covariance = sum(covariance)) |>
          pull(covariance)
        tibble(estimate = estimate,
               std.error = sqrt(cv),
               outcome = y)
      }) |>
        bind_rows(.id = "treatment") |>
        mutate(treatment = factor(treatment, levels = treatment_levels)) |>
        mutate(outcome = factor(outcome,
                                levels = outcomes, labels = outcome_labels)) |>
        mutate(conf.low = estimate - 1.96*std.error,
               conf.high = estimate + 1.96*std.error)
      return(plot_data)
    }

    # Estimates the multi-outcome models treat_sign*(rhs) and returns the models
    # together with the long-format covariance table and the tidy coefficient
    # table (outcome names cleaned of the "lhs: " prefix).
    fit_interacted <- function(rhs){
      fml <- as.formula(str_c("c(wages, jobs, payout, reserves, prices, investment,
                      tax, other) ~ treat_sign*(", rhs, ")"))
      fitted_mods <- feols(fml, data = dt, weights = ~weight, vcov = "hetero")

      cov_long <- map(fitted_mods, \(m) vcov(m)) |>
        map(as_tibble) |>
        map(\(m) {
          # Row i of the covariance matrix belongs to the i-th column name.
          m$coefficient <- names(m)
          m
        }) |>
        bind_rows(.id = "outcome") |>
        pivot_longer(cols = -c(outcome, coefficient),
                     names_to = "coefficient_2", values_to = "covariance") |>
        mutate(outcome = str_remove(outcome, "lhs: "))

      est <- map(fitted_mods, \(m) tidy(m)) |>
        bind_rows(.id = "outcome") |>
        mutate(outcome = str_remove(outcome, "lhs\\: "))

      list(mods = fitted_mods, covs = cov_long, reg_data = est)
    }

    splits <- list(...)
    result <- list()

    if (include_cntrls){
      # With controls the same models serve every subgroup.
      fit <- fit_interacted(cntrls)
      result$mod <- fit$mods
    }

    for (n in seq_along(splits)){

      if (!include_cntrls){
        # The split variable is the lower-case prefix of the first category
        # pattern (e.g. "size_rev" from "size_revMicro"); if there is no such
        # prefix, the patterns themselves are used as regressors.
        split_var <- str_extract(splits[[n]][[1]], "^[a-z\\_]+")
        if (is.na(split_var)){
          split_var <- str_c(splits[[n]], collapse = "+")
        }
        fit <- fit_interacted(split_var)
      }

      name_split <- names(splits)[n]

      # Per category: baseline terms plus all terms matching the category
      # pattern. A category without matching terms (the reference level) is
      # represented by the baseline terms alone.
      coef_list <- lapply(splits[[n]], \(x) {
        unique(c("(Intercept)", "treat_signIncrease",
                 unique(str_subset(fit$reg_data$term, x))))
      })
      names(coef_list) <- str_remove(splits[[n]], "^[a-z0-9\\_]+")

      for (sign in c("Increase", "Decrease")){
        if (sign != "Increase"){
          # For tax decreases all terms involving "Increase" are dropped.
          coef_list_sign <- lapply(coef_list, \(x) str_subset(x, "Increase", negate = TRUE))
        } else{
          coef_list_sign <- coef_list
        }

        heterogeneity <- combined_inference(coef_list_sign, outcomes,
                                            fit$reg_data, fit$covs)

        # plot_heterogeneity also saves the figure to disk.
        coef_plot <- plot_heterogeneity(heterogeneity, name_split, sign)

        result[[name_split]][[sign]] <- list(het = heterogeneity, coef_plot = coef_plot)
      }

      result[[name_split]]$mod <- fit$mods
      result[[name_split]]$reg_data <- fit$reg_data
    }
    return(result)
  }


    
  # Initializes an empty list named heterogeneity within the results object.  
  results[["heterogeneity"]] = list()

  
  # Runs the do_heterogeneity_analysis function to perform heterogeneity analysis 
  # for multiple subgroup categories and assigns the resulting outputs 
  # (including regression data and plots) to the models_plots key within the 
  # results[["heterogeneity"]] list. It produces a nested list of results and plots.
  results[["heterogeneity"]]$models_plots = do_heterogeneity_analysis(include_cntrls = if (is.null(cntrls)) F else T,
    `Size Category`=c("size_revMicro", "size_revSmall", "size_revMedium", "size_revLarge"),
    `Legal Form`=c("legalCorporation", "legalPartnership", "legalSoleProprietor"),
    Sector = c("sectorManufacturing", "sectorConstruction", "sectorTrade", "sectorServices", "sectorOther"),
    `Respondent` = c("respondentCeo","respondentOther", "respondentUnknown"),
    `Revenue Impact` = c('impact_on_revenueSubstantial impact on revenues', 'impact_on_revenueNo substantial impact on revenues'),
    `Profit Impact` = c('impact_on_net_incomeSubstantial impact on profits', 'impact_on_net_incomeNo substantial impact on profits')
  )

  
  # Retrieves combined inference data for tax increases from all subgroups. 
  # Combines results (outcomes, estimates, confidence intervals) into a single tidy data frame for easier analysis.  
  # Adds a sign column to distinguish tax increase results from tax decrease results in subsequent steps.
  # results_data_increase = lapply(results[["heterogeneity"]][["models_plots"]], `[[`, 1) |> 
  #   lapply(`[[`, 1)|> bind_rows(.id="split") |> mutate(sign = "Increase")
  # 
  # # Does the same thing for tax decreases.
  # results_data_decrease =lapply(results[["heterogeneity"]][["models_plots"]], `[[`, 2) |> 
  #   lapply(`[[`, 1)|> bind_rows(.id="split") |> mutate(sign="Decrease")
  # 
  # 
  # # Merges the data frames for tax increases and decreases into a single data frame (results_data).
  # results_data = bind_rows(results_data_increase, results_data_decrease)
  # 
  # 
  # # Assigns the combined data from results_data to a new element named est_data 
  # # within the heterogeneity section of the results object.
  # results[["heterogeneity"]]$est_data = results_data

  # Case distinction with/without controls for marginal effects: with controls, 
  # we only have a single model. Without, we have one model for each sample split
  
  if (!is.null(cntrls)){
    # Computes the average marginal effects of treat_sign for scenarios where treat_sign == "Increase"
    # and combines results for all outcomes into a single data frame.
    margin_inc = lapply(results$heterogeneity$models_plots$mod, avg_slopes, newdata=subset(dt, treat_sign == "Increase")) |> bind_rows(.id= "outcome")
    # Adds a label for easy identification.
    margin_inc$sign = "Increase"
    
    
    # Computes the average marginal effects of treat_sign for scenarios where treat_sign == "Decrease"
    # and combines results for all outcomes into a single data frame.
    margin_dec = lapply(results$heterogeneity$models_plots$mod, avg_slopes, newdata=subset(dt, treat_sign == "Decrease")) |> bind_rows(.id= "outcome")
    # Adds a label for easy identification.
    margin_dec$sign = "Decrease"
    
    # Combines two data frames, margin_inc (marginal effects for tax increases) and 
    # margin_dec (marginal effects for tax decreases), into a single data frame called margin_all.
    margin_all = bind_rows(margin_inc, margin_dec)

    
    # Adjusts p-values for multiple hypothesis testing using the Benjamini–Yekutieli method,
    # separately within each term. Adds a new column q.value to the margin_all data frame
    # to store the adjusted p-values.
    margin_all <- margin_all |> 
      group_by(term) |> 
      mutate(q.value = p.adjust(p.value, method = "BY"))
    
    # Labels each result as "Yes" (significant) or "No" (not significant) based on the 
    # adjusted p-values and a significance threshold of 0.05.
    # Adds a new column significant to the margin_all data frame to save this 
    # information.
    margin_all <- mutate(margin_all,
                         significant = ifelse(q.value < 0.05, "Yes", "No"))
    
    
    # Processes the margin_all data frame to filter, clean, and reformat its contents 
    # for further analysis. Following steps are done:
    #   - Filters the data frame to exclude rows where the column std.error (standard error) is NA (missing).
    #   - Make sure that outcome column labelled cleanly.
    #   - Converts the cleaned outcome column into a factor.
    #   - Reverses the order of the factor levels in the outcome column using fct_rev.
    #   - Reformatted term and contrast: Improved readability with title-cased labels.
    margin_all = margin_all |> 
      filter(!is.na(std.error)) |>
      mutate(outcome = factor(str_remove(outcome, "lhs\\: "), levels = outcomes, labels=outcome_labels)) |> 
      mutate(outcome = fct_rev(outcome)) |> 
      mutate(term = str_to_title(str_replace_all(term, "\\_", " ")),
             contrast = str_to_title(contrast)) |> 
      mutate(margin = str_c(term, contrast, sep = ": "))
    
    
    # Assigns the processed data frame margin_all to the margins element within 
    # the heterogeneity section of the results object.
    results[["heterogeneity"]]$margins = margin_all
    
    
    # Saves the results object to a file in R’s binary format (.Rdata). 
    # Stores the file in a structured location (specified by path_tables)
    save(results, file = paste0(path_tables, "/result_list.Rdata"))
    
    
    # Defines a scale for the y-axis in a plot using the scale_y_continuous function 
    # from the ggplot2 package.
    scale_margins = scale_y_continuous(limits=c(-0.2, 0.2), breaks = seq(-1, 1, 0.05))
    
    # Plot heterogeneity_margins_included_modes_applied_weight_trm.pdf: 
    # The loop iteratively creates and saves scatter plots for each unique 
    # firm characteristic in the margin_all data frame. Each plot visualizes the 
    # estimated marginal effects across outcomes and between different realizations 
    # regarding the respective firm characteristic, highlighting the significance 
    # of effects and differentiating between tax increases and decreases.
    for (trm in unique(margin_all$term)){
      margin_all |> 
        filter(term %in% trm) |>
        ggplot() +
        geom_point(aes(outcome, estimate, shape = significant, color = sign)) +
        geom_hline(yintercept = 0, linetype = "dashed") +
        geom_label_repel(mapping = aes(x = outcome, y = estimate, label = sprintf("%.2f",round(estimate, 2)))) +
        scale_shape_manual(values = c("No"=1, "Yes"=16)) +
        scale_color_brewer(palette="Set1") +
        coord_flip() +
        labs(y = "Estimate",
             x = "Outcome",
             shape = "Significant",
             color = "Sign") +
        facet_wrap(~margin) +
        scale_margins +
        theme_bw() + 
        theme(axis.text.y = element_text(hjust=0)) -> p
      ggsave(paste0(path_figures, "/heterogeneity_margins_", applied_weight, "_", trm,".pdf"), plot=p, width = 1.5* fig_width, height = fig_height,
             units = "cm")
      plot(p)
    }
  } else{
    # Sequence along the distinct regression models
    margin_all = tibble() 
    
    for (i in names(results$heterogeneity$models_plots)) {
      split = results$heterogeneity$models_plots[[i]]
      
      margin = tibble()
      
      for (sgn in c("Increase", "Decrease")){
        mods = split$mod
        
        tmp = lapply(mods, avg_slopes, newdata = subset(dt, treat_sign == sgn)) |> 
          bind_rows(.id = "outcome") |> 
          filter(term != "treat_sign")
        tmp$sign = sgn
        
        margin = bind_rows(margin, tmp)
      }
      
      margin <- margin |> 
        filter(!is.na(std.error)) |>
        mutate(outcome = factor(str_remove(outcome, "lhs\\: "), levels = outcomes, labels=outcome_labels)) |> 
        mutate(outcome = fct_rev(outcome)) |> 
        mutate(term = str_to_title(str_replace_all(term, "\\_", " ")),
               contrast = str_to_title(contrast)) |> 
        mutate(margin = str_c(term, contrast, sep = ": "))
      
      margin <- mutate(margin,
                           q.value = p.adjust(p.value, method = "BY"))
      
      # Labels each result as "Yes" (significant) or "No" (not significant) based on the 
      # adjusted p-values and a significance threshold of 0.05.
      # Adds a new column significant to the margin data frame of the current split 
      # to save this information.
      margin <- mutate(margin,
                           significant = ifelse(q.value < 0.05, "Yes", "No"))
      
      # Defines a scale for the y-axis in a plot using the scale_y_continuous function 
      # from the ggplot2 package.
      scale_margins = scale_y_continuous(limits=c(-0.2, 0.2), breaks = seq(-1, 1, 0.05))
      
      margin |>
        ggplot() +
        geom_point(aes(outcome, estimate, shape = significant, color = sign)) +
        geom_hline(yintercept = 0, linetype = "dashed") +
        geom_label_repel(mapping = aes(x = outcome, y = estimate, label = sprintf("%.2f",round(estimate, 2)))) +
        scale_shape_manual(values = c("No"=1, "Yes"=16)) +
        scale_color_brewer(palette="Set1") +
        coord_flip() +
        labs(y = "Estimate",
             x = "Outcome",
             shape = "Significant",
             color = "Sign") +
        facet_wrap(~margin) +
        scale_margins +
        theme_bw() + 
        theme(axis.text.y = element_text(hjust=0)) -> p
      ggsave(paste0(path_figures, "/heterogeneity_margins_", applied_weight, "_", str_replace_all(str_to_lower(i), " ", "_")
,".pdf"), plot=p, width = 1.5* fig_width, height = fig_height,
             units = "cm")
      margin_all <- bind_rows(margin_all, margin)
    }
    
    results[["heterogeneity"]]$margins = margin_all
    
    
    # Saves the results object to a file in R’s binary format (.Rdata). 
    # Stores the file in a structured location (specified by path_tables)
    save(results, file = paste0(path_tables, "/result_list.Rdata"))
  }
  
}


# Run configurations for conduct_analysis, one row per call:
#   - id:       running number of the specification.
#   - weight:   value passed as "applied_weight".
#   - controls: value passed as "cntrls"; a list column holding either NULL
#               (no controls) or the control set ctrls.
# Rows alternate between "without controls" and "with controls" within each
# weighting scheme.
specs <- tibble(
  id = c(1, 2, 3, 4),
  weight = rep(c("unweighted", "weight_web"), each = 2),
  controls = rep(list(NULL, ctrls), times = 2)
)


# Builds one figure folder path per row of specs. Each path combines the
# weighting scheme with a flag for whether controls are used, e.g.
# "Figures/unweighted/no_controls" or "Figures/weight_web/controls".
# vapply guarantees a character vector regardless of the number of rows.
folder_structure <- str_c(
  "Figures/", specs$weight, "/",
  vapply(
    specs$controls,
    \(x) if (is.null(x)) "no_controls" else "controls",
    character(1)
  )
)
# Creates every folder that does not exist yet, including missing parents.
for (folder in folder_structure) {
  if (!dir.exists(folder)) {
    dir.create(folder, recursive = TRUE)
  }
}


# Builds one table folder path per row of specs. Each path combines the
# weighting scheme with a flag for whether controls are used, e.g.
# "Tables/unweighted/no_controls" or "Tables/weight_web/controls".
# From here on folder_structure refers to these table folders.
# vapply guarantees a character vector regardless of the number of rows.
folder_structure <- str_c(
  "Tables/", specs$weight, "/",
  vapply(
    specs$controls,
    \(x) if (is.null(x)) "no_controls" else "controls",
    character(1)
  )
)
# Creates every folder that does not exist yet, including missing parents.
for (folder in folder_structure) {
  if (!dir.exists(folder)) {
    dir.create(folder, recursive = TRUE)
  }
}


# Runs conduct_analysis once per row of specs, passing that row's weight name
# as applied_weight and its control set (or NULL) as cntrls. The calls are made
# for their side effects (saved figures and result lists), so walk2 is used
# instead of map2 and no list of return values is produced.
walk2(
  specs$weight,
  specs$controls,
  \(x, y) conduct_analysis(df, x, y)
)

# Regression Tables -------------------------------------------------------


# The following code automates the processing, summarization, and export of results stored in the result_list.Rdata 
# files located in the directories defined by folder_structure for tables.

# Collects the combined-inference data (df_comb) of every specification, keyed
# by a cleaned version of its table folder (e.g. "unweighted_no_controls").
overview <- list()

# Loops over the table folders of all specifications. Each iteration loads the
# stored results and writes the main regression table, the level-estimate
# table and the heterogeneity tables into that folder.
for (path_tables in folder_structure) {
  # Loads the results object from the result_list.Rdata file in the current folder.
  load(file = file.path(path_tables, "result_list.Rdata"))

  # Assigns descriptive names for the outcome variables (outcome_labels) to the
  # regression models stored in results$main_spec$models.
  names(results$main_spec$models) <- outcome_labels

  # The folder name encodes whether the specification includes controls.
  with_controls <- !str_detect(path_tables, "no_controls")

  # Prepares an additional table row named "Controls" that shows "Yes" or "No"
  # under every outcome, and fixes its position in the table.
  extra_rows <- matrix(
    rep(if (with_controls) "Yes" else "No", length(outcome_labels)),
    nrow = 1
  )
  colnames(extra_rows) <- outcome_labels
  extra_rows <- cbind(" " = "Controls", as_tibble(extra_rows))
  attr(extra_rows, "position") <- 13

  # Generates the main regression table:
  #   - coef_map: maps coefficient names to descriptive labels (and selects them).
  #   - gof_map: reports the number of observations and the adjusted R squared.
  #   - add_rows: adds the "Controls" row.
  #   - title and notes: table caption (with LaTeX label) and explanatory note.
  #   - output: LaTeX file main_spec.tex, or main_spec_controls.tex for
  #     specifications with controls, in the current folder.
  flnm <- file.path(
    path_tables,
    str_c("main_spec", if (with_controls) "_controls" else "", ".tex")
  )
  modelsummary(results$main_spec$models,
               coef_map = c(
                 "(Intercept)" = "Constant",
                 "treat_signIncrease" = "Increase",
                 "treat_magMedium" = "Medium Change",
                 "treat_magHigh" = "Large Change",
                 "treat_magMedium:treat_signIncrease" = "Increase x Medium Change",
                 "treat_magHigh:treat_signIncrease" = "Increase x Large Change"
               ),
               gof_map = c("nobs", "adj.r.squared"),
               add_rows = extra_rows,
               fmt = 3,
               title = "Asymmetry and Magnitude Effects \\label{tab:asym_mag}",
               notes = paste("This table shows coefficient estimates and",
                             "robust standard errors for our main specification."),
               output = flnm, escape = FALSE)

  ### Preparing and Saving Level Estimates

  # Extracts the combined_inference data from the results object and stores it
  # in overview under the cleaned folder name.
  df_comb <- results$main_spec$combined_inference
  overview[[str_remove(path_tables, "Tables/") |> str_replace("/", "_")]] <- df_comb

  # Converts column names in df_comb to title case for the table header.
  names(df_comb) <- str_to_title(names(df_comb))

  # Transforms the data:
  #   - Selects Sign, Magnitude, Estimate, Std.error and Outcome.
  #   - Combines Estimate and Std.error into one Display string,
  #     e.g. "0.1234 (0.0123)".
  #   - Spreads Display into one column per magnitude level.
  tab_data <- df_comb |>
    select(Sign, Magnitude, Estimate, Std.error, Outcome) |>
    mutate(Display = str_c(round(Estimate, 4), " (", round(Std.error, 4), ")", sep = "")) |>
    pivot_wider(id_cols = c(Outcome, Sign), names_from = c(Magnitude),
                names_sep = " ", values_from = Display)

  # Containers for the per-model tests: asymmetry (Increase vs. Decrease at
  # each magnitude) and magnitude (joint test of the magnitude terms per sign).
  tests_asymmetry <- vector(mode = "list", length = length(results$main_spec$models))
  tests_magnitude <- vector(mode = "list", length = length(results$main_spec$models))

  # Iterates over the regression models and runs the tests with linearHypothesis.
  for (i in seq_along(tests_asymmetry)) {
    model <- results$main_spec$models[[i]]
    coefs <- coef(model)
    coefnames <- names(coefs)

    # Restriction vectors: each marks the coefficients whose sum is the
    # Increase-minus-Decrease difference at the given magnitude.
    low_dif <- as.integer(coefnames == "treat_signIncrease")
    med_dif <- low_dif +
      as.integer(coefnames == "treat_magMedium:treat_signIncrease")
    high_dif <- low_dif +
      as.integer(coefnames == "treat_magHigh:treat_signIncrease")

    # One F test per magnitude; row 2 of each result holds the test statistics.
    asymmetry_rows <- lapply(list(low_dif, med_dif, high_dif), \(restriction) {
      linearHypothesis(
        model,
        hypothesis.matrix = matrix(restriction, nrow = 1),
        test = "F"
      )[2, ]
    })

    # Keeps only the formatted point estimate of each difference, one column
    # per magnitude. The stars column is computed but not part of the display.
    tests_asymmetry[[i]] <- bind_rows(asymmetry_rows) |>
      as_tibble() |>
      mutate(estimate = c(sum(coefs[as.logical(low_dif)]),
                          sum(coefs[as.logical(med_dif)]),
                          sum(coefs[as.logical(high_dif)])) |> round(3),
             mag = c("Low", "Medium", "High")) |>
      rename(p.value = `Pr(>F)`) |>
      mutate(stars = case_when(
        p.value < 0.01 ~ "***",
        p.value < 0.05 ~ "**",
        p.value < 0.1 ~ "*",
        .default = ""
      ),
      display = sprintf(estimate, fmt = "%.3f")) |>
      select(mag, display) |>
      pivot_wider(names_from = mag, values_from = display)

    # Joint F tests that the Medium and High magnitude effects are both zero,
    # first for decreases (main effects only), then for increases (main effects
    # plus interactions). Rows without an F value are dropped; `F` below is the
    # column of that name, not the logical constant.
    tests_magnitude[[i]] <- bind_rows(
      linearHypothesis(model,
                       hypothesis.matrix = matrix(c(as.integer(coefnames == "treat_magMedium"),
                                                    as.integer(coefnames == "treat_magHigh")
                       ), nrow = 2, byrow = TRUE), test = "F"
      ) |> as_tibble() |> mutate(Sign = "Decrease"),
      linearHypothesis(model,
                       hypothesis.matrix = matrix(c(as.integer(coefnames == "treat_magMedium") + as.integer(coefnames == "treat_magMedium:treat_signIncrease"),
                                                    as.integer(coefnames == "treat_magHigh") + as.integer(coefnames == "treat_magHigh:treat_signIncrease")
                       ), nrow = 2, byrow = TRUE), test = "F"
      ) |> as_tibble() |> mutate(Sign = "Increase")
    ) |> filter(!is.na(F)) |> select(-c(Res.Df, Df))
  }

  # Magnitude tests: stacks the per-outcome results and keeps only the rounded
  # F statistic, which is appended column-wise to the level estimates.
  # NOTE(review): cbind relies on tab_data and tests_magnitude having the same
  # row order (outcome, then Decrease/Increase) - confirm against df_comb.
  names(tests_magnitude) <- outcome_labels
  tests_magnitude <- bind_rows(tests_magnitude, .id = "Outcome")
  tests_magnitude <- mutate(tests_magnitude,
                            stars = case_when(
                              `Pr(>F)` < 0.01 ~ "***",
                              `Pr(>F)` < 0.05 ~ "**",
                              `Pr(>F)` < 0.1 ~ "*",
                              .default = ""
                            ),
                            `F-statistic` = as.character(round(`F`, 2))) |>
    select(`F-statistic`)

  tab_data <- cbind(tab_data, tests_magnitude)

  # Asymmetry tests: labels the results by outcome, stacks them and marks them
  # as "Difference" rows without an F statistic.
  names(tests_asymmetry) <- outcome_labels
  tests_asymmetry <- tests_asymmetry |> bind_rows(.id = "Outcome")
  tests_asymmetry$Sign <- "Difference"
  tests_asymmetry$`F-statistic` <- ""

  # Combines the differences with the level estimates and orders the rows as
  # Decrease, Increase, Difference within each outcome.
  tab_data <- tests_asymmetry |>
    bind_rows(tab_data) |>
    mutate(Sign = factor(Sign, levels = c("Decrease", "Increase", "Difference")),
           Outcome = factor(Outcome, levels = outcome_labels)) |>
    arrange(Outcome, Sign) |>
    relocate(Sign, .after = Outcome)

  # Styling and saving the table:
  #   - tt: prepares the table with a caption and notes.
  #   - group_tt: groups rows by Outcome.
  #   - style_tt: bolds the group rows and inserts horizontal lines.
  #   - save_tt: saves the table as main_spec_levels.tex, or
  #     main_spec_levels_controls.tex for specifications with controls.
  flnm <- file.path(
    path_tables,
    str_c("main_spec_levels", if (with_controls) "_controls" else "", ".tex")
  )

  tab_data |>
    mutate(`F-statistic` = replace_na(`F-statistic`, "")) |>
    select(-Outcome) |>
    tt(caption = "Level Estimates for Treatment Combinations.\\label{tab:main-levels}",
       notes = paste0("This table shows the incidence level estimates for the different treatment ",
                      "combinations estimated from Equation (1)."), escape = FALSE) |>
    group_tt(i = as.character(tab_data$Outcome)) |>
    style_tt(i = seq(1, (nrow(tab_data) + length(outcome_labels)), 4),
             bold = TRUE) |>
    style_tt(i = seq(3, (nrow(tab_data) + length(outcome_labels)), 4), line = "b") |>
    save_tt(output = flnm, overwrite = TRUE)

  # Creating Heterogeneity Tables

  # Skips the heterogeneity tables for folders whose path contains "Phone".
  if (str_detect(path_tables, "Phone")) next

  if (!with_controls) {
    # Without controls there is one set of models per sample split.
    het_models <- results$heterogeneity$models_plots

    for (het in names(het_models)) {
      print(het)
      reg_mods <- het_models[[het]]$mod

      names(reg_mods) <- outcome_labels

      # Coefficient names, dropping the first two entries returned by coef().
      coef_nms <- names(coef(reg_mods))[-c(1:2)]

      # Display labels: strips the lower-case variable prefixes and writes
      # interactions as "A x B".
      coef_lbls <- coef_nms |>
        str_remove_all(pattern = "[a-z]+\\_[a-z]+") |>
        str_replace(pattern = ":", " x ")
      names(coef_lbls) <- coef_nms

      modelsummary(reg_mods,
                   coef_map = coef_lbls,
                   gof_map = c("nobs", "adj.r.squared"),
                   title = str_c("Treatment Effect Heterogeneity ", het,
                                 "\\label{tab:het-", str_to_lower(het) |>
                                   str_replace(" ", "-"), "}"),
                   escape = FALSE,
                   notes = str_c("This Table shows heterogeneous treatment effects for the split by ", str_to_lower(het), ". Robust standard errors are given in parentheses."),
                   output = file.path(path_tables, str_c("het_",
                                                         str_to_lower(het) |>
                                                           str_replace(" ", "_"),
                                                         ".tex")))
    }
  } else {
    # With controls a single set of models covers all splits; one table is
    # written per variable prefix found among the coefficient names.
    het_models <- results$heterogeneity$models_plots$mod

    het_splits <- str_extract(names(coef(het_models))[-c(1, 2)], pattern = "(\\(Intercept\\))|([a-z\\_]+)") |>
      str_subset(pattern = "treat_sign", negate = TRUE) |> unique()

    for (split in het_splits) {
      print(split)

      # Readable name of the current split for title and notes. Previously
      # these used `het`, which is not set in this branch and only held a
      # stale value from an earlier no-controls iteration. Underscores are
      # replaced because the title is written to LaTeX unescaped.
      split_label <- str_to_title(str_replace_all(split, "_", " "))

      coef_nms <- names(coef(het_models))[-c(1, 2)] |>
        str_subset(split)

      # Display labels: strips the lower-case variable prefixes and writes
      # interactions as "Increase x <level>".
      coef_lbls <- coef_nms |>
        str_remove_all(pattern = "[a-z]+\\_[a-z]*\\_*[a-z]*\\_*[a-z]*\\_*[a-z]+") |>
        str_replace(pattern = ":", " x ") |>
        str_replace(pattern = "Increase x [a-z]+", "Increase x ") |>
        str_remove(pattern = "^[a-z]+")

      names(coef_lbls) <- coef_nms

      msummary(het_models, coef_map = coef_lbls,
               stars = c('***' = 0.01, '**' = 0.05, '*' = 0.1),
               gof_map = c("nobs", "adj.r.squared"),
               title = str_c("Treatment Effect Heterogeneity ", split_label,
                             "\\label{tab:het-", str_replace_all(split, "_", "-"), "}"),
               escape = FALSE,
               notes = str_c("This Table shows heterogeneous treatment effects for the split by ", str_to_lower(split_label), ". Robust standard errors are given in parentheses."),
               output = file.path(path_tables, str_c("het_",
                                                     str_to_lower(split) |>
                                                       str_replace(" ", "_"),
                                                     ".tex")))
    }
  }
}



# Validation Check Weights --------------------------------------------


# Combines the combined-inference data stored in overview into one data frame,
# overview_data, used to compare the unweighted and the weighted results.
#
# Steps:
#   - Stacks all entries of overview; the entry name becomes the column spec.
#   - Keeps only the specifications without controls.
#   - Turns the technical names into readable labels: "weight_<name>" becomes
#     "weighted", the "_no_controls" suffix is dropped and the result is
#     title-cased (e.g. "Unweighted", "Weighted").
overview_data <- bind_rows(overview, .id = "spec") |>
  filter(str_detect(spec, "no_controls")) |>
  mutate(
    spec = str_replace(spec, "weight_[a-z]+", "weighted"),
    spec = str_remove(spec, "_no_controls"),
    spec = str_to_title(spec)
  )

# Averages estimate over the specifications for every combination of sign,
# magnitude and outcome, ignoring missing values. The result stays grouped by
# sign and magnitude; .groups states this default explicitly and silences the
# informational message.
overview_means <- overview_data |>
  group_by(sign, magnitude, outcome) |>
  summarize(estimate = mean(estimate, na.rm = TRUE), .groups = "drop_last")



# Reloads two stored result lists and pulls the combined_inference data out of
# results$main_spec. Each load() replaces the results object in the workspace.

# First entry of folder_structure (first row of specs: unweighted, no controls).
load(file.path(folder_structure[1], "result_list.Rdata"))
main_spec <- results$main_spec$combined_inference

# Third entry of folder_structure (third row of specs: weight_web, no controls).
load(file.path(folder_structure[3], "result_list.Rdata"))
other_spec <- results$main_spec$combined_inference


# Matches the two sets of estimates on sign, magnitude and outcome and computes
# their difference (main_spec minus other_spec), rounded to two decimals.
# The column suffixes "_web" and "_phone" only tell the two inputs apart:
# "_web" marks main_spec and "_phone" marks other_spec.
# diff classifies the rounded difference as "Larger", "Smaller" or "Equal".
differences <- main_spec |>
  select(sign, magnitude, estimate, outcome) |>
  inner_join(
    select(other_spec, sign, magnitude, estimate, outcome),
    by = join_by(sign, magnitude, outcome),
    suffix = c("_web", "_phone")
  ) |>
  mutate(
    estimate = round(estimate_web - estimate_phone, 2),
    diff = case_when(
      estimate > 0 ~ "Larger",
      estimate < 0 ~ "Smaller",
      estimate == 0 ~ "Equal"
    )
  )



# Plot comparison_specifications.pdf:
# Compares the estimates of the specifications in overview_data for every
# treatment combination (tax change magnitude x tax change sign), with one
# panel per outcome:
#   - Dodged columns and point ranges (estimate +/- 1.96 standard errors),
#     filled by specification.
#   - A blue line through the main_spec estimates and a red line through the
#     other_spec estimates.
#   - A label at y = -0.05 showing the difference between the two, coloured by
#     its size (green at zero, red towards either extreme).
# The colour scale is symmetric around zero, bounded by the largest absolute
# difference.
comparison_plot <- ggplot(overview_data, aes(x = interaction(magnitude, sign, sep = " Tax "), y = estimate)) +
  geom_col(aes(fill = spec), position = position_dodge2(width = 0.9)) +
  geom_line(data = main_spec, aes(group = 1), linewidth = 1.5, color = "blue", alpha = 0.5) +
  geom_line(data = other_spec, aes(group = 1), linewidth = 1.5, color = "red", alpha = 0.5) +
  geom_pointrange(aes(
    fill = spec,
    ymin = estimate - 1.96 * std.error,
    ymax = estimate + 1.96 * std.error),
    position = position_dodge2(width = 0.9), alpha = 0.9) +
  #geom_line(data=overview_means,aes(group=1), color="black", linewidth=1.5) +
  geom_label(data = differences, mapping = aes(y = -0.05,
                                               label = sprintf(round(estimate, 2), fmt = "%.2f"),
                                               color = estimate), alpha = 1) +
  scale_color_gradient2(
    limits = c(-1, 1) * max(abs(differences$estimate)),
    low = "red",
    high = "red",
    mid = "green") +
  #scale_color_manual(values=c("Larger"="blue", "Smaller"="red", "Equal"="darkgreen")) +

  scale_fill_brewer(palette = "Set1") +
  xlim("High Tax Decrease", "Medium Tax Decrease", "Low Tax Decrease",
       "Low Tax Increase", "Medium Tax Increase", "High Tax Increase") +
  facet_wrap(~outcome) +
  labs(x = "Treatment",
       fill = "Specification",
       color = "Deviation") +
  theme_bw() +
  theme(axis.text.x = element_text(hjust = 1, angle = 45, size = 12),
        axis.text.y = element_text(size = 12))

# Shows the plot and saves exactly this object, instead of relying on whatever
# plot ggplot2 last recorded.
print(comparison_plot)
ggsave(filename = "Figures/comparison_specifications.pdf",
       plot = comparison_plot,
       width = 2 * fig_width,
       height = 1.5 * fig_height,
       units = "cm")


# Removes the helper objects of this section from the workspace and triggers
# garbage collection.
rm(folder_structure, overview, extra_rows, flnm, overview_data, overview_means,
   differences, comparison_plot)
gc()