

# Load data ---------------------------------------------------------------


# Load the data used in the analysis. It comes from the second wave of the
# German Business Panel; as_factor() is applied to the imported Stata file.
#
# Weights cannot be computed for industries where the marginal distributions
# needed for calibration are unavailable, so observations without a web weight
# are excluded.
df <- as_factor(read_dta("Data/survey_data_prepped_with_weights.dta"))
df <- filter(df, !is.na(weight_web))


# Balance tests ----------------------------------------------------------


# In the following, we conduct balancing tests across our six different treatment 
# groups for selected firm characteristics. These tests aim to assess whether 
# the treatment groups are comparable in terms of key baseline characteristics, 
# which is essential for validating if the randomization process was successful. 
# By confirming balance across groups, we ensure that any observed effects can 
# be more confidently attributed to the treatments rather than pre-existing 
# differences among groups.
# 
# Treatment groups:
#   - 1% tax increase
#   - 10% tax increase
#   - 25% tax increase
#   - 1% tax decrease
#   - 10% tax decrease
#   - 25% tax decrease
# 
# Firm characteristics:
#   - Revenues (annual company revenue in 2019)
#   - Number of employees (full-time employees subject to social security)
#   - Revenue impact (percentage change compared to the beginning of 2020)
#   - Profit impact (percentage change compared to the beginning of 2020)
#   - Legal forms (corporations, partnerships, sole proprietorships)
#   - Firm size (Micro, Small, Medium, Large)
#   - Economic Sector (Manufacturing, Construction, Trade, Services, Other)


# Distinct, non-missing values of `tax_exp`, i.e. the observed treatment arms.
treatments <- df$tax_exp |>
  unique() |>
  na.omit()


# Cross product of the treatment arms with themselves: every ordered pair,
# including each arm paired with itself. The balance tests below are run once
# per row of this data frame.
treatment_combinations <- expand.grid(Group_1 = treatments, Group_2 = treatments)



# Build `nested_data`, one row per treatment arm. Rows with a missing `tax_exp`
# are dropped and only the variables used in the balance tests are kept: the
# columns prefixed `sector_`, `legal_`, `size_rev_` and `respondent_`, plus
# `rev`, `num_emp`, `revenue_impact` and `net_income_impact`. The observations
# of each arm are then stored as a data frame in the list-column `data`.
nested_data <- df |>
  filter(!is.na(tax_exp)) |>   # Keep rows with non-missing values in `tax_exp`
  select(
    tax_exp,
    starts_with("sector_"), starts_with("legal_"),
    starts_with("size_rev_"), starts_with("respondent_"),
    rev, num_emp, revenue_impact, net_income_impact
  ) |>
  nest(.by = tax_exp) # Nest the data by `tax_exp`




# Run a two-sided Wilcoxon rank-sum test for every balance variable and every
# row of `treatment_combinations` (first arm vs. second arm). The outcome is
# stored in the list-column `results`: one named list of test objects per row,
# named after the balance variables.
treatment_combinations$results <- map2(
  treatment_combinations$Group_1,
  treatment_combinations$Group_2,
  \(arm_1, arm_2) {
    # Fetch the nested observations that belong to a single treatment arm.
    arm_rows <- \(arm) {
      matched <- nested_data |>
        filter(tax_exp == arm) |>
        pull(data)
      matched[[1]]
    }
    x <- arm_rows(arm_1)
    y <- arm_rows(arm_2)

    # The names `x`, `y` and `i` are kept inside the call on purpose:
    # wilcox.test() stores the deparsed argument expressions in `data.name`.
    seq_along(x) |>
      setNames(names(x)) |>
      lapply(\(i) wilcox.test(pull(x, i), pull(y, i), alternative = "two.sided"))
  }
)



# Turn the list of test objects into a long table with one row per treatment
# pair and balance variable. For every element of `results`, the variable name
# is taken from the list names and the p-value is read directly from the test
# object. Reading `$p.value` directly avoids flattening the test objects to
# character strings and recovering names and numbers by pattern matching,
# which would truncate variable names containing a dot and round the p-values.
# The `results` column is dropped afterwards and `p_values` is unnested.
treatment_combinations <- treatment_combinations |>
  mutate(p_values = map(results, \(tests) {
    tibble(
      variable = names(tests),
      p.value = unname(map_dbl(tests, \(test) test$p.value))
    )
  })) |>
  select(-results) |>
  unnest(p_values)



# Add `q.value`: the p-values adjusted for multiple comparisons with the
# Benjamini-Yekutieli (BY) procedure, applied to all rows at once.
# NOTE(review): the adjusted family contains every row of the table, i.e. also
# each arm compared with itself and both orderings of each pair - confirm that
# this is the intended number of tests.
treatment_combinations$q.value <- p.adjust(
  treatment_combinations$p.value,
  method = "BY"
)



# Replace the raw column names in `variable` by display labels and fix the
# order of the factor levels, which controls the display order in outputs.
# The lookup vector maps display label (name) to raw variable name (value);
# its order is the level order.
treatment_combinations$variable <- local({
  display_labels <- c(
    "Revenue" = "rev",
    "Number of Employees" = "num_emp",
    "Revenue Impact" = "revenue_impact",
    "Profit Impact" = "net_income_impact",
    "Corporation" = "legal_Corporation",
    "Partnership" = "legal_Partnership",
    "Sole Proprietor" = "legal_SoleProprietor",
    "Microenterprise" = "size_rev_Microenterprise",
    "Small" = "size_rev_Small",
    "Medium" = "size_rev_Medium",
    "Large" = "size_rev_Large",
    "Manufacturing" = "sector_Manufacturing",
    "Construction" = "sector_Construction",
    "Trade" = "sector_Trade",
    "Services" = "sector_Services",
    "Other" = "sector_Other",
    "CEO" = "respondent_CEO",
    "Other position" = "respondent_Other",
    "Unknown position" = "respondent_Unknown"
  )

  treatment_combinations$variable |>
    fct_recode(!!!display_labels) |>
    factor(levels = names(display_labels))
})




# Plot "balance_tests.pdf":
# Shows the p-value of every pairwise Wilcoxon test (treatment pair on one
# axis, one point per firm characteristic) together with a dashed reference
# line at 0.05. The caption reports the share of plotted tests with a p-value
# of at most 5%.
#
# The share is computed on the same rows that are plotted, i.e. without the
# comparisons of a treatment arm with itself. Including those rows would
# inflate the denominator and understate the share. Missing p-values are
# excluded from both numerator and denominator.
plotted_tests <- treatment_combinations |>
  filter(Group_1 != Group_2)

share_significant <- mean(plotted_tests$p.value <= 0.05, na.rm = TRUE)

plotted_tests |>
  ggplot() +
  geom_point(aes(interaction(Group_1, Group_2, sep = " x "), y = p.value, color = variable),
             position = position_jitter(),
             size = 2) +
  geom_hline(yintercept = 0.05, linetype = "dashed") +
  scale_color_viridis_d() +
  scale_y_continuous(breaks = seq(0, 1, 0.1)) +
  theme_bw() +
  coord_flip() +
  labs(x = "Test",
       y = "P Value",
       color = "Variable",
       caption = paste("#Significant / #Total = ", round(share_significant, 4)))
ggsave(filename = "Figures/balance_tests.pdf", width = 32, height = 24, units = "cm")

# Remove the helper objects that were only needed for this figure.
rm(plotted_tests, share_significant)



# Check:
# Each firm characteristic should appear once per treatment group combination,
# i.e. 36 times (the combinations of a group with itself are included).
count(treatment_combinations, variable)


# Descriptive statistics -------------------------------------------------------

## Summary statistics ----


# Number of non-missing values in `x`, returned as a character string with
# commas as thousands separators. Used as a summary function in the
# descriptive table `desc_tt` below.
N <- function(x) {
  n_observed <- sum(!is.na(x))
  format(n_observed, big.mark = ",")
}

# 10th percentile of `x`, for use as a summary function in descriptive tables.
#
# x      Numeric vector.
# fmt    Unused; kept so the signature stays compatible with existing calls.
# na.rm  Remove missing values before computing the quantile? Default TRUE.
# ...    Ignored.
#
# Returns the value produced by stats::quantile(), a length-one numeric vector
# named "10%".
P10 <- function(x, fmt = NULL, na.rm = TRUE, ...) {
  # `probs` is spelled out in full; the abbreviation `prob` only works through
  # partial argument matching.
  stats::quantile(x, probs = 0.1, na.rm = na.rm)
}

# 90th percentile of `x`, for use as a summary function in descriptive tables.
#
# x      Numeric vector.
# fmt    Unused; kept so the signature stays compatible with existing calls.
# na.rm  Remove missing values before computing the quantile? Default TRUE.
# ...    Ignored.
#
# Returns the value produced by stats::quantile(), a length-one numeric vector
# named "90%".
P90 <- function(x, fmt = NULL, na.rm = TRUE, ...) {
  # `probs` is spelled out in full; the abbreviation `prob` only works through
  # partial argument matching.
  stats::quantile(x, probs = 0.9, na.rm = na.rm)
}


# desc_tt is the descriptive statistics table for selected firm and respondent
# characteristics (revenue, number of employees, legal form, sector and the
# CEO indicator). For every variable it reports the number of non-missing
# observations, mean, standard deviation, 10th percentile, median and 90th
# percentile. The table is written to "Tables/descriptives.md" and
# "Tables/descriptives.tex".
#
# `output = "tinytable"` is an argument of datasummary(), not of format(): it
# requests a tinytable object, which is what save_tt() writes to disk below.
# The magrittr pipe is used for the last step because the formula refers to
# the piped data through the `.` placeholder.
desc_tt <- df |>
  select(
    rev, num_emp,
    legal_Corporation, legal_SoleProprietor, legal_Partnership,
    sector_Manufacturing, sector_Construction, sector_Trade, sector_Services, sector_Other, respondent_CEO
  ) |>
  rename(
    Revenue = rev,
    `Num. Emp.` = num_emp,
    Corporation = legal_Corporation,
    Partnership = legal_Partnership,
    `Sole Prop.` = legal_SoleProprietor,
    Manufacturing = sector_Manufacturing,
    Construction = sector_Construction,
    Trade = sector_Trade,
    Services = sector_Services,
    Other = sector_Other,
    CEO = respondent_CEO
  ) %>%
  datasummary(formula = All(.) ~ N + Mean + SD + P10 + Median + P90,
              fmt = \(x) format(x, digits = 2, big.mark = ",", scientific = FALSE),
              output = "tinytable",
              title = "Descriptive Statistics \\label{tab:descriptives}")

save_tt(desc_tt, "Tables/descriptives.md", overwrite = TRUE)
save_tt(desc_tt, "Tables/descriptives.tex", overwrite = TRUE)

## Range of shares ---------------------------------------------------------

# Table with averages and sample share information for every outcome,
# reported separately by treatment sign and written to
# "Tables/share_table.tex".
#
# Per outcome and treatment sign the table contains: the number of non-missing
# observations, the unweighted mean, the mean weighted by `weight_web`, and the
# shares of observations equal to 0, equal to 1, and neither 0 nor +/-1.
# The shares use n(), i.e. all rows of the treatment-sign group including rows
# with a missing outcome, as the denominator.
# NOTE(review): `s1` counts values equal to 1 while `s01` excludes values with
# an absolute value of 1, so an outcome of -1 would be counted in none of the
# three shares - confirm that the outcomes cannot be negative.
#
# `outcomes` is a character vector defined elsewhere; all_of() makes explicit
# that it is an external vector of column names rather than a column of `df`.
df |>
  group_by(treat_sign) |>
  summarize(across(all_of(outcomes), list(
    N = ~sum(!is.na(.x)),
    mean = ~mean(.x, na.rm = TRUE),
    meanw = ~weighted.mean(.x, weight_web, na.rm = TRUE),
    s0 = ~sum(.x == 0, na.rm = TRUE) / n(),
    s1 = ~sum(.x == 1, na.rm = TRUE) / n(),
    s01 = ~sum(.x != 0 & abs(.x) != 1, na.rm = TRUE) / n()
  ), .names = "{.col}-{.fn}")) |>
  mutate(across(.cols = -treat_sign, .fns = ~round(.x, digits = 3))) |>
  # Column names have the form "<outcome>-<statistic>"; split them again.
  pivot_longer(cols = -treat_sign, names_to = c("Outcome", "Statistic"), names_sep = "-") |>
  pivot_wider(id_cols = c(Outcome, treat_sign), names_from = "Statistic") |>
  mutate(Outcome = factor(Outcome, levels = outcomes, labels = outcome_labels)) |>
  arrange(Outcome, treat_sign) |>
  kbl(col.names = c("Outcome", "Treat Sign", "Obs", "Unwghtd.", "Weighted", "$s_{im} = 0$",
                    "$s_{im}= 1$","$s_{im} \\in (0,1)$"),
      escape = FALSE, format = "latex", booktabs = TRUE, centering = TRUE,
      caption = "Descriptive Statistics Incidence Shares \\label{tab:desc-shares}") |>
  collapse_rows(columns = 1:2, latex_hline = "major") |>
  add_header_above(c(" " = 3, "Mean" = 2, "Sample Percentages" = 3)) |>
  kable_styling(position = 'center', latex_options = "hold_position") |>
  save_kable(file = "Tables/share_table.tex")


## Reasons for investment change ----

# `reason_investment` takes its value from `tax10` for firms in the "Decrease"
# arm and from `tax21` otherwise. `reason_investment_cat` bins it into three
# ordered categories (<= 0.25, <= 0.75, <= 1); anything else becomes NA.
df <- df |>
  mutate(
    reason_investment = ifelse(treat_sign == "Decrease", tax10, tax21),
    reason_investment_cat = factor(
      case_when(
        reason_investment <= 0.25 ~ "Capital Restrictions",
        reason_investment <= 0.75 ~ "Equally important",
        reason_investment <= 1 ~ "Profitability of Investment",
        .default = NA_character_
      ),
      levels = c("Capital Restrictions", "Equally important", "Profitability of Investment")
    )
  )

# Bar chart of the category counts, side by side for the two tax-change signs.
# Observations without a category are left out.
df |>
  filter(!is.na(reason_investment_cat)) |>
  ggplot(aes(reason_investment_cat, fill = treat_sign)) +
  geom_bar(position = position_dodge()) +
  scale_fill_brewer(palette = "Set1") +
  labs(x = "Reason for change in investment",
       y = "Count",
       fill = "Tax Change") +
  theme_bw()
ggsave(filename = "Figures/reasons_investment.pdf", width = fig_width, height = fig_height, units = "cm")


# Responses Over Time -----------------------------------------------------

# data.table copy of the survey data with the month (labelled) and the year of
# `RecordedDate` added, plus an unordered copy of the month factor for use as
# a regressor.
df_time <- as.data.table(df)

df_time[, `:=`(
  mon = lubridate::month(RecordedDate, label = TRUE, locale = "en"),
  yr = lubridate::year(RecordedDate)
)]
df_time[, mon_unordered := factor(mon, ordered = FALSE)]

# Number of responses per month, with the months ordered from November to June.
mon_obs <- table(mon = factor(df_time$mon, levels = c("Nov", "Dec",
                                                      "Jan", "Feb",
                                                      "Mar", "Apr",
                                                      "May", "Jun")))

# Responses recorded from November to January ...
sum(mon_obs[c("Nov", "Dec", "Jan")])
# [1] 5713
# ... and from February to June.
sum(mon_obs[c("Feb", "Mar", "Apr", "May", "Jun")])
# [1] 1015


# Estimate, separately for tax increases and tax decreases, the mean of every
# outcome by response month and collect the estimates of both signs in
# `all_res` (used for the combined figure after the loop). Diagnostics are
# printed along the way and one faceted plot per sign is drawn.
all_res = tibble()

# NOTE: the loop variable `sign` masks base::sign() in this environment and,
# like `outcome`, remains defined after the loop.
for (sign in c("Increase", "Decrease")){
  # Month-level estimates of all outcomes for the current sign.
  time_test = tibble()
  
  for (outcome in outcomes){
    print(outcome)
    # Regression on the month dummies without intercept ("+0"): every
    # coefficient is the mean of the outcome in the respective month.
    frml = as.formula(str_c(outcome, "~mon_unordered+0"))
    tmp = lm_robust(frml, data=subset(df_time, treat_sign==sign)) |> 
      tidy() 
    time_test = rbind(time_test, tmp)
    # Same regression with an intercept and without the June responses; only
    # its F statistic is printed.
    diff_mod = lm_robust(as.formula(str_c(outcome, "~mon_unordered")), data=subset(df_time, treat_sign==sign & mon_unordered != "Jun"))
    print(diff_mod$fstatistic)
  }
  
  # Smallest BY-adjusted p-value across all month coefficients of this sign.
  # NOTE(review): these p-values come from the no-intercept models, so they
  # refer to each monthly mean being different from zero, not to differences
  # between months; the "(Intercept)" filter has no effect because these
  # models contain no intercept term - confirm this is the intended check.
  p.adjust(time_test$p.value[time_test$term != "(Intercept)"], method="BY") |> min() |> print()
  
  # Strip the regressor prefix so that `term` holds the month label, and order
  # the months from November to June.
  time_test$term <- str_remove(time_test$term, "mon_unordered")
  time_test$term <- factor(time_test$term, levels = c("Nov", "Dec",
                                                      "Jan", "Feb",
                                                      "Mar", "Apr",
                                                      "May", "Jun"))
  # Assumes the tidy() output carries an `outcome` column with the name of the
  # dependent variable - TODO confirm.
  time_test$outcome <- factor(time_test$outcome, levels=outcomes,
                              labels=outcome_labels)
  print(time_test)
  # Monthly means with confidence intervals, one panel per outcome; June is
  # not shown.
  p <- ggplot(filter(time_test, term != "Jun"), 
              aes(x = term,
                  y = estimate,
                  group=outcome)) +
    geom_pointrange(mapping=aes(ymin=conf.low,
                                ymax=conf.high)) +
    geom_line() +
    facet_wrap(~outcome) +
    labs(x="Month", y="Mean") +
    theme_bw()
  plot(p)
  
  # Tag the estimates with the treatment sign before stacking them.
  time_test$sign=sign
  
  all_res = rbind(all_res, time_test)
}

# Figure "responses_over_time.pdf": estimated monthly means with confidence
# intervals, one panel per outcome and one coloured line per treatment sign.
# The June estimates are not shown.
all_res |>
  filter(term != "Jun") |>
  ggplot(aes(x = term, y = estimate, color = sign, group = sign)) +
  geom_pointrange(aes(ymin = conf.low, ymax = conf.high), size = 0.2) +
  geom_line() +
  facet_wrap(~outcome) +
  scale_color_brewer(palette = "Set1") +
  labs(x = "Month", y = "Mean", color = "Sign") +
  theme_bw()
ggsave(filename = "Figures/responses_over_time.pdf", width = fig_width, height = fig_height, units = "cm")

# Drop the intermediate objects of this script that are listed below and
# trigger a garbage collection.
rm(list = c(
  "treatments", "treatment_combinations", "desc_tt",
  "df_time", "mon_obs", "all_res"
))
gc()