
# Function definitions ----------------------------------------------------

#' Replace survey missing-value codes with NA
#'
#' The codes -9996, -9997, -9998 and -9999 are treated as missing.
#'
#' @param x A vector, typically a single data-frame column.
#' @return `x` with all coded values set to `NA`. For factors the coded
#'   levels (and any other unused levels) are dropped as well. Vectors that
#'   are neither numeric, character nor factor are returned unchanged.
replace_missing_codes <- function(x) {
  missing_codes <- -c(9996:9999)
  if (is.numeric(x)) {
    x[x %in% missing_codes] <- NA_real_
  } else if (is.character(x)) {
    x[x %in% as.character(missing_codes)] <- NA_character_
  } else if (is.factor(x)) {
    # Blank out the coded observations explicitly and only then drop the
    # now-unused levels. This does not rely on fdroplevels() honouring an
    # `exclude` argument.
    x[x %in% as.character(missing_codes)] <- NA
    x <- fdroplevels(x)
  }
  x
}

# Load Web data ----------------------------------------------------------------

# Read the Stata export, pass it through as_factor() and replace the numeric
# treatment codes 1-6 in tax_exp by readable labels (size and direction of
# the tax change).
df <- read_dta("Data/Welle2_Safehouse_C01_only_2022-09-23.dta") |>
  as_factor() |>
  mutate(
    tax_exp = case_when(
      tax_exp == 1 ~ "1% lower",
      tax_exp == 2 ~ "10% lower",
      tax_exp == 3 ~ "25% lower",
      tax_exp == 4 ~ "1% higher",
      tax_exp == 5 ~ "10% higher",
      tax_exp == 6 ~ "25% higher"
    )
  )


# Replace missing codes ---------------------------------------------------

# Run replace_missing_codes() over every column of the survey data
df <- mutate(df, across(everything(), replace_missing_codes))


# Data wrangling ----------------------------------------------------

# Turn the numeric category codes of revenue (ccgic3) and employment (ccgic6)
# into labelled, ordered factors. Codes outside the listed range become NA.
df <- local({
  # Revenue categories; position k is the label of code k in ccgic3
  revenue_labels <- c(
    "Less than 50.000 EUR",
    "50.000 - 100.000 EUR",
    "100.001 - 350.000 EUR",
    "350.001 - 700.000 EUR",
    "700.001 - 2.000.000 EUR",
    "2.000.001 - 6.000.000 EUR",
    "6.000.001 - 8.000.000 EUR",
    "8.000.001 - 10.000.000 EUR",
    "10.000.001 - 12.000.000 EUR",
    "12.000.001 - 20.000.000 EUR",
    "20.000.001 - 40.000.000 EUR",
    "40.000.001 - 50.000.000 EUR",
    "50.000.001 - 60.000.000 EUR",
    "More than 60.000.000 EUR"
  )
  # Employment categories; position k is the label of code k in ccgic6
  employee_labels <- c(
    "No employees",
    "1-5",
    "6-9",
    "10-19",
    "20-49",
    "50-249",
    "250-499",
    "500-999",
    "More than 1000"
  )

  df %>%
    mutate(
      # Ordered factor so that plots show the categories in ascending order
      ccgic3 = factor(
        revenue_labels[match(ccgic3, seq_along(revenue_labels))],
        levels = revenue_labels,
        ordered = TRUE
      ),
      ccgic6 = factor(
        as.character(ccgic6),
        levels = as.character(seq_along(employee_labels)),
        labels = employee_labels,
        ordered = TRUE
      )
    )
})

# Divide the tax allocation items by 100 so that shares lie in [0, 1]
df <- df %>%
  mutate(across(
    c(tax1:tax9, tax10, tax11:tax19, tax20, tax21, tax4, tax14),
    \(pct) pct / 100
  ))

# Female respondent dummy from ccgic11: "Frau"/"frau" -> 1, "Herr" -> 0,
# anything else (including missing) -> NA
df <- df %>%
  mutate(
    female = case_when(
      ccgic11 %in% c("Frau", "frau") ~ 1L,
      ccgic11 == "Herr" ~ 0L,
      TRUE ~ NA_integer_
    )
  )

# Broader firm-size classes based on the SME definition:
#   size_emp - class implied by the number of employees (ccgic6)
#   size_rev - class implied by revenue (ccgic3)
#   size     - combined class (the balance sheet total is not used)
df <- df %>%
  mutate(
    size_emp = case_when(
      ccgic6 %in% c("No employees", "1-5", "6-9") ~ "Microenterprise",
      ccgic6 %in% c("10-19", "20-49") ~ "Small",
      ccgic6 %in% "50-249" ~ "Medium",
      ccgic6 %in% c("250-499", "500-999", "More than 1000") ~ "Large",
      TRUE ~ NA_character_
    ),
    size_emp = factor(
      size_emp,
      levels = c("Microenterprise", "Small", "Medium", "Large")
    )
  ) %>%
  mutate(
    size_rev = case_when(
      ccgic3 %in% c(
        "Less than 50.000 EUR",
        "50.000 - 100.000 EUR",
        "100.001 - 350.000 EUR",
        "350.001 - 700.000 EUR",
        "700.001 - 2.000.000 EUR"
      ) ~ "Microenterprise",
      ccgic3 %in% c(
        "2.000.001 - 6.000.000 EUR",
        "6.000.001 - 8.000.000 EUR",
        "8.000.001 - 10.000.000 EUR",
        "10.000.001 - 12.000.000 EUR",
        "12.000.001 - 20.000.000 EUR"
      ) ~ "Small",
      ccgic3 %in% c(
        "20.000.001 - 40.000.000 EUR",
        "40.000.001 - 50.000.000 EUR"
      ) ~ "Medium",
      ccgic3 %in% c(
        "50.000.001 - 60.000.000 EUR",
        "More than 60.000.000 EUR"
      ) ~ "Large",
      TRUE ~ NA_character_
    ),
    size_rev = factor(
      size_rev,
      levels = c("Microenterprise", "Small", "Medium", "Large")
    )
  ) %>%
  mutate(
    # The two classes are combined only when they are equal or adjacent (the
    # larger one wins); all other combinations are left missing.
    size = case_when(
      size_emp == "Microenterprise" & size_rev == "Microenterprise" ~ "Microenterprise",
      size_emp %in% c("Microenterprise", "Small") & size_rev == "Small" ~ "Small",
      size_emp == "Small" & size_rev %in% c("Microenterprise", "Small") ~ "Small",
      size_emp %in% c("Small", "Medium") & size_rev == "Medium" ~ "Medium",
      size_emp == "Medium" & size_rev %in% c("Small", "Medium") ~ "Medium",
      size_emp %in% c("Medium", "Large") & size_rev == "Large" ~ "Large",
      size_emp == "Large" & size_rev %in% c("Medium", "Large") ~ "Large",
      TRUE ~ NA_character_
    ),
    size = factor(
      size,
      levels = c("Microenterprise", "Small", "Medium", "Large")
    )
  )


# Group the detailed legal forms in ccgic1 into three broad categories;
# legal forms that are not listed become NA.
df <- df %>%
  mutate(
    legal = case_when(
      ccgic1 == "Einzelunternehmen" ~ "Sole Proprietor",
      ccgic1 %in% c("AG", "GmbH", "UG", "KGaA", "SE") ~ "Corporation",
      ccgic1 %in% c("GbR", "KG", "oHG", "GmbH & Co. KG", "PartG") ~ "Partnership",
      TRUE ~ NA_character_
    ),
    # "Corporation" is the first (reference) level
    legal = factor(
      legal,
      levels = c("Corporation", "Sole Proprietor", "Partnership")
    )
  )

# Derive industry variables from industry_WZ08:
#   industry_WZ08_1_letter  - first character of the code, as a factor
#   industry_WZ08_1_english - English label for that letter
#   industry_nace           - characters 2 to 5 of the code
df <- df %>%
  mutate(
    # Codes starting with "-" are treated as missing
    industry_WZ08 = ifelse(substr(industry_WZ08, 1, 1) == "-", NA, industry_WZ08),
    industry_WZ08_1_letter = factor(substr(industry_WZ08, 1, 1)),
    industry_WZ08_1_english = fct_recode(
      industry_WZ08_1_letter,
      "Agriculture" = "A",
      "Mining" = "B",
      "Manufacturing" = "C",
      "Electricity" = "D",
      "Water" = "E",
      "Construction" = "F",
      "Retail" = "G",
      "Transportation" = "H",
      "Accomodation" = "I",
      "Communication" = "J",
      "Finance and insurance" = "K",
      "Real estate" = "L",
      "Professional Services" = "M",
      "Admin and support services" = "N",
      "Public administration and defence" = "O",
      "Education" = "P",
      "Health and social" = "Q",
      "Art, recreation" = "R",
      "Other services" = "S",
      # Recoding to NULL removes these levels (observations become NA)
      NULL = "T",
      NULL = "",
      NULL = "-"
    ),
    industry_nace = substr(industry_WZ08, 2, 5)
  )

# Merge the payout items: tax3 is folded into tax4 and tax13 into tax14.
# Within each pair the first item wins whenever it holds a non-missing,
# non-negative value; the source columns tax3 and tax13 are dropped afterwards.
df <- df %>%
  mutate(
    tax4 = case_when(
      !is.na(tax3) & tax3 >= 0 ~ tax3,
      !is.na(tax4) & tax4 >= 0 ~ tax4,
      TRUE ~ NA_real_
    ),
    tax14 = case_when(
      !is.na(tax13) & tax13 >= 0 ~ tax13,
      !is.na(tax14) & tax14 >= 0 ~ tax14,
      TRUE ~ NA_real_
    )
  ) %>%
  select(-tax3, -tax13)

# Add a debt-repayment category for the tax decrease treatment.
# If the free-text explanation of the "other" item (tax9_text) mentions debt
# repayment, the share reported in tax9 is moved to taxD_debt. Remaining
# respondents in a "lower" arm with a valid tax9 get taxD_debt = 0; everyone
# else is left NA.
df <- df %>%
  mutate(
    taxD_debt = case_when(
      str_detect(
        tax9_text,
        stringr::regex(
          "(Verbindlichkeit)|(Rückzahlung)|(Abbau)|(Schulden)|(Fremdkapital)|(Kredit)|(Darlehen)|(verbindlichkeit)|(kredit)|(rückzahlung)|(tilgung)|(schui?lden)|(fremd)|(darlehen)",
          ignore_case = TRUE
        )
      ) ~ tax9,
      str_detect(tax_exp, "lower") & tax9 >= 0 ~ 0
    ),
    # Set reclassified observations to zero. The NA guard matters: without it
    # ifelse() returns NA wherever taxD_debt is NA and would overwrite a valid
    # tax9 value of rows that were never reclassified.
    tax9 = ifelse(!is.na(taxD_debt) & taxD_debt > 0, 0, tax9)
  )

# Industry Categories

# Sector variable that groups the 1-digit industries into broad categories as
# in Fuest et al. (2018). Every industry that is not explicitly listed -
# including a missing industry - is assigned to "Other".
df <- df %>%
  mutate(
    # Lump the service industries into a single level first
    sector = fct_collapse(
      industry_WZ08_1_english,
      Services = c(
        "Accomodation",
        "Finance and insurance",
        "Professional Services",
        "Admin and support services",
        "Other services"
      )
    ),
    sector = case_when(
      sector == "Manufacturing" ~ "Manufacturing",
      sector == "Construction" ~ "Construction",
      sector == "Retail" ~ "Trade",
      sector == "Services" ~ "Services",
      TRUE ~ "Other"
    ),
    sector = factor(
      sector,
      levels = c("Other", "Manufacturing", "Construction", "Trade", "Services")
    )
  )

df <- df %>%
  mutate(
    # Collapse cos1 into three categories (<= 2, <= 7, <= 10); "Medium" is
    # the first factor level
    cos1_cat = case_when(
      cos1 <= 2 ~ "Easy",
      cos1 <= 7 ~ "Medium",
      cos1 <= 10 ~ "Hard"
    ),
    cos1_cat = factor(cos1_cat, levels = c("Medium", "Easy", "Hard")),
    # Keep only the date part of the recording timestamp
    RecordedDate = as_date(ymd_hms(RecordedDate)),
    # Difference between the date in cun1 and the recording date
    cun1_d = ymd(cun1) - RecordedDate,
    # Relabel long factor levels
    cun17 = fct_recode(
      cun17,
      "very low" = "sehr gering: künftige Umsätze können sehr genau vorhergesagt werden",
      "low" = "gering: künftige Umsätze können genau vorhergesagt werden",
      "medium" = "mittel: künftige Umsätze können ungefähr vorhergesagt werden",
      "high" = "hoch: künftige Umsätze sind schwierig vorherzusagen",
      "very high" = "sehr hoch: künftige Umsätze sind sehr schwierig vorherzusagen"
    ),
    # Use "medium" as base category in regressions
    cun17 = fct_relevel(cun17, "medium")
  )

# Coarser uncertainty measure: merge the five cun17 levels into low / medium /
# high, with "Medium" as the first factor level
df <- df |>
  mutate(
    uncertainty = fct_collapse(
      cun17,
      low = c("very low", "low"),
      medium = "medium",
      high = c("high", "very high")
    ),
    uncertainty = factor(
      uncertainty,
      levels = c("medium", "low", "high"),
      labels = c("Medium", "Low", "High")
    )
  )

# Categorical version of the net income impact (ccd6): four intervals of equal
# width, stored as a factor with levels 1-4
df <- df |>
  mutate(
    ccd6_cat4 = factor(cut_interval(ccd6, n = 4, labels = FALSE), levels = 1:4)
  )


# Create joint outcomes ---------------------------------------------------

# Each outcome takes its value from the tax-decrease item when the respondent
# was in a "lower" arm and from the matching tax-increase item when in a
# "higher" arm.
df <- local({
  # Select `if_lower` or `if_higher` depending on the treatment label;
  # NA when the label matches neither direction.
  pick_by_arm <- function(arm, if_lower, if_higher) {
    case_when(
      str_detect(arm, "lower") ~ if_lower,
      str_detect(arm, "higher") ~ if_higher,
      TRUE ~ NA_real_
    )
  }

  df %>%
    mutate(
      wages      = pick_by_arm(tax_exp, tax1, tax11),
      jobs       = pick_by_arm(tax_exp, tax2, tax12),
      payout     = pick_by_arm(tax_exp, tax4, tax14),
      reserves   = pick_by_arm(tax_exp, tax5, tax15),
      prices     = pick_by_arm(tax_exp, tax6, tax16),
      investment = pick_by_arm(tax_exp, tax7, tax17),
      tax        = pick_by_arm(tax_exp, tax8, tax18),
      other      = pick_by_arm(tax_exp, tax9, tax19),
      debt       = pick_by_arm(tax_exp, taxD_debt, tax20)
    )
})


# Create distinct treatment indicators ------------------------------------

# Split the treatment label into its direction (treat_sign) and its
# magnitude (treat_mag: 1% = Low, 10% = Medium, 25% = High)
df <- df |>
  mutate(
    treat_sign = ifelse(
      str_detect(tax_exp, regex("higher", ignore_case = TRUE)),
      "Increase",
      "Decrease"
    ),
    treat_mag = factor(
      str_extract(tax_exp, "\\d+\\%"),
      levels = c("1%", "10%", "25%"),
      labels = c("Low", "Medium", "High"),
      ordered = TRUE
    )
  )



# Adjust Categories for Design Flaw ---------------------------------------

# The debt category is dissolved: answers given for this margin are added to
# the "other" category, which alleviates the asymmetry in our margins.
# NOTE(review): `other` becomes NA whenever `debt` is NA - confirm that this
# is intended.
df <- df |>
  mutate(other = other + debt) |>
  select(-debt)

# Missing Categories ------------------------------------------------------

#' Word frequencies of free-text answers
#'
#' Drops missing answers and missing-value codes (-9996 to -9999), lower-cases
#' the text, splits it at whitespace and strips every character that is
#' neither a letter, whitespace nor "&".
#'
#' @param x Character vector of answers.
#' @return A `table` with the number of occurrences of each remaining word.
get_freq_exp <- function(x) {
  missing_codes <- as.character(-c(9996:9999))
  answers <- x[!is.na(x) & !(x %in% missing_codes)]
  words <- unlist(str_split(str_to_lower(answers), "\\s"))
  words <- str_remove_all(words, "[^\\p{L}\\s\\&]+")
  # Tokens that consisted only of removed characters are now empty
  x <- words[words != ""]
  table(x)
}

## Decreases ----

# Free-text answers given for the "other" item of the tax-decrease arms.
# Missing values, missing-value codes, placeholder answers and answers without
# any letter are removed.
# The text is lower-cased before the placeholder filter so that "XX" or "K.A."
# are caught as well. `%in%` compares literally (no regex), hence "k.a." is
# written without escapes.
tax9_dt <- tibble(answer = df$tax9_text) |>
  filter(!(answer %in% as.character(-c(9996:9999))) & !is.na(answer)) |>
  mutate(answer = str_to_lower(answer)) |>
  filter(!(answer %in% c("k.a.", "xx", "?"))) |>
  filter(str_detect(answer, "\\p{L}+"))



# Assign each answer to a category by keyword. case_when() uses the first
# matching pattern, so the order of the rules matters.
tax9_dt <- tax9_dt |>
  mutate(reason = case_when(
    str_detect(answer, "(verbindlichkeit)|(kredit)|(rückzahlung)|(tilgung)|(schui?lden)|(fremd)|(darlehen)") ~ "Debt repayment",
    str_detect(answer, "(forschung)|(entwicklung)|(f&e)|(r&d)|(innov)") ~ "Research & development",
    # "(waren)" and "(ren(i|o)?vier)" are separate alternatives; without the
    # "|" between them only the concatenation of both would match.
    str_detect(answer, "(expansion)|(digital)|(lagerbestand)|(umwelt)|(büroimmobilien)|(neubau)|(reparaturen)|(maßnahmen)|(investi)|(einkauf)|(waren)|(ren(i|o)?vier)") ~ "Investment",
    str_detect(answer, "(fortbildung)|(qualifi)|(mitarbeit)|(arbeitspl)|(kurzarbeit)") ~ "Employee Training & Retention",
    str_detect(answer, "(privatentnahme)|(versaufen)|(altersvorsorge)|(charity)|(rentenvorsorge)|(arbeitsbelastung)|(spende)|(urlaub)|(lebh?ens(er)?haltung)|(neue autos)|(privat)|(rente)|(unternehmer)|(entnahme)") ~ "Distribution to Shareholders",
    str_detect(answer, "(rücklage)|(sparkonto)|(reserven)") ~ "Retained earnings",
    str_detect(answer, "(marketing)|(vertrieb)|(werbung)") ~ "Marketing & sales",
    str_detect(answer, "(ge(w|s)inn)|(insolv)|(nicht)|(schließ)|(negativ)") ~ "No profits",
    str_detect(answer, "(liquidit)|(rechnung)|(begleichung)|(cash)|(laufend)") ~ "Short-term liquidity demands",
    str_detect(answer, "(markt)|(märkte)|(neu)|(zusätzlich)|(sortiment)|(angebotserweit)") ~ "New markets",
    str_detect(answer, "(kein)") ~ "No source",
    .default = "Other"
  ))

table(tax9_dt$reason)

# Bar chart of the category shares, most frequent category on top.
# `prop` is a proportion in [0, 1]; it is multiplied by 100 so that the axis
# matches its "Percent" label.
ggplot(tax9_dt) +
  geom_bar(aes(fct_rev(fct_infreq(reason)), y = after_stat(100 * prop), group = 1)) +
  labs(x = "Category",
       y = "Share (Percent)") +
  theme_bw() +
  coord_flip()
ggsave(filename = "Figures/missing_categories_decrease.pdf",
       width = fig_width,
       height = fig_height,
       units = "cm")

## Increases ----

# Free-text answers given for the "other" item of the tax-increase arms,
# lower-cased. Missing values, missing-value codes, answers containing a
# digit, the placeholder "x" and answers without any letter are removed.
tax19_dt <- tibble(answer = df$tax19_text) |>
  mutate(answer = str_to_lower(answer)) |>
  filter(
    !(answer %in% as.character(-c(9996:9999))),
    !is.na(answer),
    !str_detect(answer, "\\d")
  ) |>
  filter(!(answer %in% "x")) |>
  filter(str_detect(answer, "\\p{L}+"))


# Assign each answer to a category by keyword. case_when() uses the first
# matching pattern, so the order of the rules matters.
tax19_dt <- mutate(
  tax19_dt,
  reason = case_when(
    str_detect(answer, "(schließ)|(schliess)|(aufgabe)|(ende)|(weiter)|(beendigung)|(liquid)|(auflös)|(geschloss)") ~ "Closure/liquidation",
    str_detect(answer, "(werbung)|(fahrzeuge)|(verkauf)|(rationali)|(andere betrieb)|(reduktion)") ~ "Divestment",
    str_detect(answer, "(kosten)|(einsparung)") ~ "Cost reduction",
    str_detect(answer, "(privatein)|(eigen)") ~ "New capital from owner",
    str_detect(answer, "(produktivität)|(optim)|(prozess)|(automat)|(inn?ovation)|(effizient)") ~ "Higher efficiency",
    str_detect(answer, "(ausland)|(auslager)|(verlager)|(verleg)|(auswandern)") ~ "Move",
    str_detect(answer, "(gewinn)") ~ "No profit",
    str_detect(answer, "(weiß)|(situation)|(kein)|(k\\.)") ~ "Don't know",
    .default = "Other"
  )
)

# Bar chart of the category shares, most frequent category on top.
# `prop` is a proportion in [0, 1]; it is multiplied by 100 so that the axis
# matches its "Percent" label.
ggplot(tax19_dt) +
  geom_bar(aes(fct_rev(fct_infreq(reason)), y = after_stat(100 * prop), group = 1)) +
  labs(x = "Category",
       y = "Share (Percent)") +
  theme_bw() +
  coord_flip()
ggsave(filename = "Figures/missing_categories_increase.pdf",
       width = fig_width,
       height = fig_height,
       units = "cm")


# Survey completion -------------------------------------------------------

# Histogram of survey completion. The dashed line at 90 marks the inclusion
# threshold; the shaded areas show which part of the distribution is kept
# (green) and which is dropped (red).
ggplot(df, aes(x = Progress)) +
  geom_histogram(binwidth = 1) +
  geom_vline(xintercept = 90, linetype = "dashed") +
  annotate("rect",
           xmin = 90, xmax = Inf, ymin = 0, ymax = Inf,
           fill = "darkgreen", alpha = 0.3) +
  annotate("rect",
           xmin = -Inf, xmax = 90, ymin = 0, ymax = Inf,
           fill = "darkred", alpha = 0.3) +
  annotate("label",
           x = c(45, 97.5), y = rep(10000, 2),
           label = c("Omitted", "Included"),
           size = 6, fontface = "bold") +
  scale_x_continuous(breaks = seq(0, 100, 10)) +
  scale_y_continuous(breaks = seq(0, 10000, 1000),
                     labels = function(n) format(n, big.mark = ",")) +
  labs(x = "Survey completion (Percentage points)",
       y = "Count") +
  theme_bw()
ggsave(filename = "Figures/progress_report.pdf",
       width = fig_width,
       height = fig_height,
       units = "cm")

# Subset to final sample --------------------------------------------------

# Keep respondents who completed at least 90 percent of the survey.
# The respondent's position in the company (ccgic14) has many missing values,
# so these are kept as a separate "Unknown" category rather than dropped.
df <- df |>
  filter(Progress >= 90) |>
  mutate(
    respondent = ifelse(
      str_detect(ccgic14, "(Inhaber)|(Geschäftsführer)|(Vorstand)|(Prokurist)"),
      "CEO",
      "Other"
    ),
    respondent = ifelse(is.na(respondent), "Unknown", respondent),
    respondent = factor(respondent, levels = c("CEO", "Other", "Unknown"))
  )


# Keep only the variables used in the analysis
df <- df |>
  select(
    BvDID, RecordedDate, ccgic2, ccgic3, ccgic5, ccgic6, respondent,
    tax_exp, treat_sign, treat_mag,
    size_rev, legal, sector, industry_WZ08_1_letter,
    ccd1, ccd6, weight, ccm2, ccm4, ccm8, chi1, chi2, uncertainty,
    wages, jobs, payout, reserves, prices, investment, tax, other,
    cdat3, tax10, tax21
  )

# Keep only rows in which all outcome variables and all control variables
# are non-missing
df <- df |>
  filter(
    if_all(
      c(wages, jobs, payout, reserves, prices, investment, tax, other),
      \(v) !is.na(v)
    ),
    if_all(
      c(sector, legal, respondent, ccgic3, ccgic6, ccd1, ccd6),
      \(v) !is.na(v)
    )
  )


# Give the survey items descriptive names
df <- df |>
  rename(
    rev = ccgic2,
    rev_cat = ccgic3,
    num_emp = ccgic5,
    num_emp_cat = ccgic6,
    revenue_impact = ccd1,
    net_income_impact = ccd6
  )

# Remove the blank from the level name (presumably so that the dummy column
# created below gets a name without a space)
df <- df |>
  mutate(legal = fct_recode(legal, "SoleProprietor" = "Sole Proprietor"))

# One dummy column per level of the categorical controls
df <- df |>
  dummy_cols(
    select_columns = c("legal", "size_rev", "sector", "respondent"),
    ignore_na = TRUE
  )

# Median splits of the reported impact on revenue and on net income: values
# strictly below the sample median are labelled "Substantial impact", all
# others "No substantial impact" (first factor level).
df <- df |>
  mutate(
    impact_on_revenue = ifelse(
      revenue_impact < median(revenue_impact, na.rm = TRUE),
      "Substantial impact on revenues",
      "No substantial impact on revenues"
    ),
    impact_on_revenue = factor(
      impact_on_revenue,
      levels = c("No substantial impact on revenues",
                 "Substantial impact on revenues")
    ),
    impact_on_net_income = ifelse(
      net_income_impact < median(net_income_impact, na.rm = TRUE),
      "Substantial impact on profits",
      "No substantial impact on profits"
    ),
    impact_on_net_income = factor(
      impact_on_net_income,
      levels = c("No substantial impact on profits",
                 "Substantial impact on profits")
    )
  )

# Write final survey dataset ----------------------------------------------

# Store the prepared data set as Parquet ...
write_parquet(df, "Data/survey_data_prepped.parquet")

# ... and as a Stata file
write_dta(df, "Data/survey_data_prepped.dta")

# Remove the objects created by this script and trigger garbage collection
rm(df, tax9_dt, tax19_dt)
gc()