---
title: "Legitimation Amid Dependence (1960-2010): A Dataset on the Determinants of Authoritarian Legitimation Strategies - DATA WRANGLING SCRIPT"
author: "Farah Aly"
date: "2024-05-30"
output: html_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

```{r libraries}
# vdemdata is distributed via GitHub only. It (and devtools, which performs the
# GitHub install) is installed only when missing, so that knitting the document
# does not re-install packages on every run and does not fail in
# non-interactive sessions that have no CRAN mirror configured.
if (!requireNamespace("vdemdata", quietly = TRUE)) {
  if (!requireNamespace("devtools", quietly = TRUE)) {
    install.packages("devtools")
  }
  devtools::install_github("vdeminstitute/vdemdata")
}
library(vdemdata)

# NOTE: the workspace is deliberately not cleared with rm(list = ls()); knitting
# already starts from a fresh environment, and wiping the global environment of
# an interactive session is a destructive side effect.

# CRAN packages used by this script, attached in this order (the order
# determines which package wins when two export the same function name).
p_required <- c("tidyr", "dplyr", "tidyverse", "haven", "readxl",
                "countrycode", "lmtest", "car", "clarify", "data.table",
                "stargazer", "ggbreak", "ggtext", "scales", "ggfortify")

p_to_install <- setdiff(p_required, rownames(installed.packages()))
if (length(p_to_install) > 0) {
  install.packages(p_to_install)
}

# library() stops with an error when a package cannot be attached; require()
# would only return FALSE and let the script fail later in a confusing place.
invisible(lapply(p_required, library, character.only = TRUE))

rm(p_required, p_to_install)
```

```{r loading and preprocessing GWF}
## Data retrievable at https://sites.psu.edu/dictators/
GWF <- read_dta("GWFtscs.dta")

# Converting gwf_casename into a factor; this matters for upcoming analyses
# such as including regime fixed effects in models.
GWF$gwf_casename <- as.factor(GWF$gwf_casename)
```

```{r loading and preprocessing VDEM}
vdemdata <- vdem

# Filtering the V-Dem data to the years 1901-2022 and keeping only the
# identifiers and the legitimation-related variables.
vdem_legit <- vdemdata %>%
  dplyr::filter(year > 1900 & year < 2023) %>%
  dplyr::select(country_id, country_name, year, e_regionpol,
                v2exl_legitlead, v2exl_legitperf, v2exl_legitratio,
                v2exl_legitideol,
                v2exl_legitideolcr_0, v2exl_legitideolcr_1,
                v2exl_legitideolcr_2, v2exl_legitideolcr_3,
                v2exl_legitideolcr_4,
                v2exl_legitlead_osp, v2exl_legitperf_osp,
                v2exl_legitratio_osp, v2exl_legitideol_osp)

# Replacement table (V-Dem name = GWF name) used to match the GWF name system.
country_replacements_vdem <- c(
  "Dominican Republic" = "Dominican Rep",
  "German Democratic Republic" = "Germany East",
  "Guinea-Bissau" = "Guinea Bissau",
  "Central African Republic" = "Cen African Rep",
  "Republic of the Congo" = "Congo-Brz",
  "Democratic Republic of the Congo" = "Congo/Zaire",
  "South Yemen" = "Yemen South",
  "North Korea" = "Korea North",
  "South Korea" = "Korea South",
  "Republic of Vietnam" = "Vietnam South",
  "Czechia" = "Czechoslovakia",
  "The Gambia" = "Gambia",
  "Eswatini" = "Swaziland",
  "Burma/Myanmar" = "Myanmar"
)

# Replacing the specified names; names without an entry are left untouched.
vdem_legit <- vdem_legit %>%
  dplyr::mutate(
    country_name = ifelse(country_name %in% names(country_replacements_vdem),
                          country_replacements_vdem[country_name],
                          country_name)
  )

# Further modification: "Russia" is renamed to "Soviet Union" for the years
# 1922-1991 to match the GWF dataset.
vdem_legit <- vdem_legit %>%
  dplyr::mutate(
    country_name = ifelse(country_name == "Russia" & year >= 1922 & year <= 1991,
                          "Soviet Union",
                          country_name)
  )

merged_GWF_VDEM <- left_join(GWF, vdem_legit,
                             by = c("gwf_country" = "country_name", "year"))

class(merged_GWF_VDEM$gwf_casename)
merged_GWF_VDEM$e_regionpol <- as.factor(merged_GWF_VDEM$e_regionpol)
```

```{r DONT RUN: FBIC -> creating FBICselected_dataset and FBIC_robustness_data}
##### NOTE: The original dataset is approximately 3.6 GB in size and is
##### therefore very hard to work with. It is retrievable from this link:
##### https://korbel.du.edu/pardee/content/formal-bilateral-influence-capacity.
##### This chunk includes the steps I took to create (1) a dataset with the
##### relevant variables for the main analysis and (2) a dataset with the
##### relevant variables for the robustness checks. From this point onwards, I
##### do not use the main dataset due to its size.

#FBICdata <- read.csv("/Users/farahraafat/Downloads/BACH/data/Diplometrics_FBIC Index and inputs 1960-2022 final update_20231114.csv")
#colnames(FBICdata)

#FIBCselected_data <- FBICdata %>%
#  select(countrya, countryb, year, dependence, bandwidth, fbic)
#write.csv(FIBCselected_data, "FBICselected_dataset.csv", row.names = FALSE)

#FBIC_robustness_data <- FBICdata %>%
#  select(countrya, countryb, year, dependence, bandwidth, fbic, dependence_nonnorm, economicdependence, securitydependence)
#write.csv(FBIC_robustness_data, "FBIC_robustness_data.csv", row.names = FALSE)
```

```{r creating netdependence & merging with others}
FBICselected_data <- read.csv("FBICselected_dataset.csv")

## To create my IV, net dependence, I subtract the extent to which the world is
## dependent on country X from the extent to which country X is dependent on
## the world:
##   Net Dependence = Country X's Dependence on the World (xonworld)
##                    - World's Dependence on Country X (worldonx)
##
## The extent to which 'countryb' depends on 'countrya' is the value of
## 'dependence'. The extent to which 'countrya' has an influence capacity over
## 'countryb' is the value of 'fbic'. So country a is the 'influence sending'
## country, and country b is the 'influence receiving' country.
##
## So if we group by countryb and year and sum the columns of interest, we get
## the extent to which countryb is dependent on the world, or in other words
## the influence capacity (FBIC) of the world on countryb. Since bandwidth is
## symmetrical, it doesn't matter if we group by countrya or countryb.
xonworld_grouped_data <- FBICselected_data %>%
  dplyr::summarise(
    dependence_xonworld = sum(dependence),
    total_bandwidth = sum(bandwidth),
    fbic_xonworld = sum(fbic),
    .by = NULL,
    .groups = "drop"
  ) %>%
  # placeholder overwritten below; see grouped computation
  identity()
xonworld_grouped_data <- FBICselected_data %>%
  group_by(countryb, year) %>%
  dplyr::summarise(
    dependence_xonworld = sum(dependence),
    total_bandwidth = sum(bandwidth),
    fbic_xonworld = sum(fbic),
    .groups = "drop"
  )

## Here I group by countrya and year because I want to calculate the extent to
## which the world is dependent on countrya, or to what extent countrya has an
## influence capacity (FBIC) on the world.
worldonx_grouped_data <- FBICselected_data %>%
  group_by(countrya, year) %>%
  dplyr::summarise(
    dependence_worldonx = sum(dependence),
    total_bandwidth = sum(bandwidth),
    fbic_worldonx = sum(fbic),
    .groups = "drop"
  )

# Both summaries carry a total_bandwidth column, so the join suffixes them as
# total_bandwidth.x (receiving side) and total_bandwidth.y (sending side).
netdependence <- left_join(xonworld_grouped_data, worldonx_grouped_data,
                           by = c("countryb" = "countrya", "year"))

## Now I create the net dependence variable by subtracting the created scores
## from each other. Note that netfbic is oriented the other way round
## (world-on-X side minus X-on-world side of the fbic sums).
netdependence <- netdependence %>%
  dplyr::mutate(netdependence = dependence_xonworld - dependence_worldonx,
                netfbic = fbic_worldonx - fbic_xonworld)
## Note: if net dependence is positive then the country is more dependent on
## the outside world than the outside world is dependent on it, which means the
## outside world has more capacity to leverage the relationship.
## Matching the names with the GWF name system (FBIC name = GWF name).
country_replacements_FBIC <- c(
  "Dominican Republic" = "Dominican Rep",
  "Guinea-Bissau" = "Guinea Bissau",
  "Cote d'Ivoire" = "Ivory Coast",
  "Central African Republic" = "Cen African Rep",
  "Congo-Brazzaville" = "Congo-Brz",
  "Congo-Kinshasa" = "Congo/Zaire",
  "Yemen Arab Republic" = "Yemen",
  "Yemen-People's Democratic Republic of" = "Yemen South",
  "Korea-Democratic People's Republic of" = "Korea North",
  "Korea-Republic of" = "Korea South",
  "Vietnam-Democratic Republic of" = "Vietnam",
  "Vietnam-Republic of" = "Vietnam South",
  "East Germany" = "Germany East",
  "Serbia and Montenegro" = "Serbia"
)

## Replacing the country names; names without an entry are left untouched.
netdependence <- netdependence %>%
  dplyr::mutate(
    countryb = ifelse(countryb %in% names(country_replacements_FBIC),
                      country_replacements_FBIC[countryb],
                      countryb)
  )
# NOTE(review): the other sources in this script rename "Russia" to
# "Soviet Union" for 1922-1991 before merging with GWF; no such step is applied
# to the FBIC names here. Confirm how FBIC labels the USSR, otherwise those
# country-years may not match.

merged_FBIC_GWF_VDEM <- left_join(merged_GWF_VDEM, netdependence,
                                  by = c("gwf_country" = "countryb", "year"))

# Creating variables to capture annual changes in dependence/influence
# capacity, indicating whether they've risen or fallen compared to the
# preceding observation of the same country.
# The lag is taken within each country and ordered by year. Without the
# grouping, the first row of every country would be differenced against the
# last row of whichever country happens to precede it in the data.
# NOTE(review): if a country has gaps in its observed years, the difference
# still spans the gap; confirm whether such rows should be set to NA.
merged_FBIC_GWF_VDEM <- merged_FBIC_GWF_VDEM %>%
  dplyr::group_by(gwf_country) %>%
  dplyr::mutate(
    total_fbic_diff = fbic_worldonx -
      dplyr::lag(fbic_worldonx, order_by = year),
    dependence_diff = dependence_xonworld -
      dplyr::lag(dependence_xonworld, order_by = year),
    # total_bandwidth.y is the bandwidth sum from the right-hand table of the
    # earlier net-dependence join (suffix added by that join).
    bandwidth_diff = total_bandwidth.y -
      dplyr::lag(total_bandwidth.y, order_by = year),
    netdependence_diff = netdependence -
      dplyr::lag(netdependence, order_by = year)
  ) %>%
  dplyr::ungroup()
```

```{r creating the DV measuring the transition in legitimation from ideology towards pragmatic approaches}
merged_FBIC_GWF_VDEM <- merged_FBIC_GWF_VDEM %>%
  dplyr::mutate(
    # Average of "v2exl_legitlead", "v2exl_legitperf", and "v2exl_legitratio",
    # representing a non-ideological legitimation measure (not relevant for the
    # analysis since I do not include personalist legitimation in my theory,
    # but maybe relevant for future studies).
    legit_nonideological = (v2exl_legitlead + v2exl_legitperf + v2exl_legitratio)/3,
    legit_pragmatic = (v2exl_legitperf + v2exl_legitratio)/2
  )
## legit_pragmatic (computed above) is the average of the performance
## legitimation and rational-legal legitimation scores, representing a
## pragmatic legitimation measure.

## I do the same for the original scale (_osp) variables.
merged_FBIC_GWF_VDEM <- merged_FBIC_GWF_VDEM %>%
  dplyr::mutate(
    legit_nonideological_osp = (v2exl_legitlead_osp + v2exl_legitperf_osp + v2exl_legitratio_osp)/3,
    legit_pragmatic_osp = (v2exl_legitperf_osp + v2exl_legitratio_osp)/2
  )

## Computing the difference in all the legitimation variables between
## consecutive observations of the same country.
## The lag is taken within each country and ordered by year; an ungrouped lag
## would difference the first row of each country against the last row of the
## preceding country in the data.
## NOTE(review): where a country has gaps in its observed years, the difference
## spans the gap; confirm whether such rows should be NA instead.
merged_FBIC_GWF_VDEM <- merged_FBIC_GWF_VDEM %>%
  dplyr::group_by(gwf_country) %>%
  dplyr::mutate(
    legitideol_diff = v2exl_legitideol -
      dplyr::lag(v2exl_legitideol, order_by = year),
    nonideological_diff = legit_nonideological -
      dplyr::lag(legit_nonideological, order_by = year),
    pragmatic_diff = legit_pragmatic -
      dplyr::lag(legit_pragmatic, order_by = year),
    legitideol_diff_osp = v2exl_legitideol_osp -
      dplyr::lag(v2exl_legitideol_osp, order_by = year),
    nonideological_diff_osp = legit_nonideological_osp -
      dplyr::lag(legit_nonideological_osp, order_by = year),
    pragmatic_diff_osp = legit_pragmatic_osp -
      dplyr::lag(legit_pragmatic_osp, order_by = year)
  ) %>%
  dplyr::ungroup()

## Creating binary variables that capture negative change in ideology and
## positive change in pragmatic legitimation // not relevant for the final
## analysis, but still very interesting.
merged_FBIC_GWF_VDEM <- merged_FBIC_GWF_VDEM %>%
  dplyr::mutate(
    shift_ideo_to_prag = ifelse(legitideol_diff < 0 & pragmatic_diff > 0, 1, 0)
  )
table(merged_FBIC_GWF_VDEM$shift_ideo_to_prag)

## Same, but using the nonideological legitimation measure.
merged_FBIC_GWF_VDEM <- merged_FBIC_GWF_VDEM %>%
  dplyr::mutate(
    shift_ideo_to_nonideo = ifelse(legitideol_diff < 0 & nonideological_diff > 0, 1, 0)
  )
table(merged_FBIC_GWF_VDEM$shift_ideo_to_nonideo)

### HERE I CALCULATE MY MAIN DEPENDENT VARIABLE OF INTEREST:
### "extent_shift_ideo_to_prag". By subtracting the difference in ideological
### legitimation from the difference in pragmatic legitimation, I account for
### scenarios that are excluded from the binary coding. If this variable is
### positive it can mean three things: either that pragmatic legitimation is
### increasing and ideology is decreasing, or that both are increasing but
### pragmatic legitimation is increasing at a faster pace, or that both are
### decreasing but pragmatic legitimation is decreasing at a slower rate. The
### latter two scenarios, where there is a relative shift rather than an
### absolute shift, were not captured in the binary coding.
merged_FBIC_GWF_VDEM <- merged_FBIC_GWF_VDEM %>%
  dplyr::mutate(
    extent_shift_ideo_to_prag = pragmatic_diff - legitideol_diff,
    extent_shift_ideo_to_nonideo_osp = nonideological_diff_osp - legitideol_diff_osp,
    extent_shift_ideo_to_prag_osp = pragmatic_diff_osp - legitideol_diff_osp,
    # The _osp non-ideological variable used to be assigned twice while its
    # counterpart on the model-estimate scale was never created; it is added
    # here (appended last so existing column positions are unchanged).
    extent_shift_ideo_to_nonideo = nonideological_diff - legitideol_diff
  )

negative_cases <- merged_FBIC_GWF_VDEM %>%
  dplyr::filter(extent_shift_ideo_to_prag < 0) %>%
  nrow()
negative_cases

positive_cases <- merged_FBIC_GWF_VDEM %>%
  dplyr::filter(extent_shift_ideo_to_prag > 0) %>%
  nrow()
positive_cases ### those are the cases where there is a pivot away from ideology towards pragmatic legitimation
```

```{r adding the first control variable: GDP}
## Source: The Maddison Project Database, retrievable at
## https://www.rug.nl/ggdc/historicaldevelopment/maddison/releases/maddison-project-database-2020?lang=en
GDP <- read_excel("mpd2020.xlsx", sheet = "Full data")

## Matching the names to GWF (Maddison name = GWF name).
country_replacements_GDP <- c(
  "Dominican Republic" = "Dominican Rep",
  "Venezuela (Bolivarian Republic of)" = "Venezuela",
  "Bolivia (Plurinational State of)" = "Bolivia",
  "Former Yugoslavia" = "Yugoslavia",
  "Russian Federation" = "Russia",
  "Guinea-Bissau" = "Guinea Bissau",
  "Côte d'Ivoire" = "Ivory Coast",
  "Central African Republic" = "Cen African Rep",
  "Congo" = "Congo-Brz",
  "D.R. of the Congo" = "Congo/Zaire",
  "U.R. of Tanzania: Mainland" = "Tanzania",
  "Iran (Islamic Republic of)" = "Iran",
  "Syrian Arab Republic" = "Syria",
  "Taiwan, Province of China" = "Taiwan",
  "D.P.R. of Korea" = "Korea North",
  "Republic of Korea" = "Korea South",
  "Lao People's DR" = "Laos",
  "Viet Nam" = "Vietnam",
  "Sudan (Former)" = "Sudan"
)

GDP <- GDP %>%
  dplyr::mutate(
    country = ifelse(country %in% names(country_replacements_GDP),
                     country_replacements_GDP[country],
                     country)
  )

# "Russia" is renamed to "Soviet Union" for 1922-1991 to match GWF.
GDP <- GDP %>%
  dplyr::mutate(
    country = ifelse(country == "Russia" & year >= 1922 & year <= 1991,
                     "Soviet Union",
                     country)
  )

merged_GDP_FBIC_GWF_VDEM <- left_join(merged_FBIC_GWF_VDEM, GDP,
                                      by = c("gwf_country" = "country", "year"))

# Change in GDP per capita relative to the previous observation of the same
# country (lag taken within country and ordered by year, so values are never
# differenced across two different countries).
merged_GDP_FBIC_GWF_VDEM <- merged_GDP_FBIC_GWF_VDEM %>%
  dplyr::group_by(gwf_country) %>%
  dplyr::mutate(gdppc_diff = gdppc - dplyr::lag(gdppc, order_by = year)) %>%
  dplyr::ungroup()
```

```{r conflict}
# Source: UCDP/PRIO Armed Conflict Dataset version 23.1, retrievable at
# https://ucdp.uu.se/downloads/index.html#armedconflict
conflict <- read_excel("UcdpPrioConflict_v23_1.xlsx", sheet = "Blad1")

## Filtering to interstate and internationalized intrastate conflict.
conflict <- conflict %>%
  dplyr::filter(type_of_conflict == 2 | type_of_conflict == 4) %>%
  dplyr::select(side_a, side_b, year)

## Splitting rows containing multiple parties on one side into their
## individual components.
conflict <- conflict %>%
  separate_rows(side_a, sep = ",") %>%
  separate_rows(side_b, sep = ",")

## Matching the names with the GWF dataset.
## Splitting on "," leaves a leading space on every party after the first
## (e.g. " Government of X"). The names are therefore trimmed here, before the
## replacement lookup below; otherwise those parties never match a key in the
## replacement table.
conflict <- conflict %>%
  dplyr::mutate(
    side_a = trimws(str_replace(side_a, "Government of ", "")),
    side_b = trimws(str_replace(side_b, "Government of ", ""))
  )

# Replacement table (conflict-data name = GWF name).
# NOTE(review): keys that contain a comma cannot match any value at this
# point, because the party strings were split on "," above. Verify against the
# names actually present in the file.
country_replacements_conflict <- c(
  "South Korea" = "Korea South",
  "North Korea" = "Korea North",
  "Belarus (Byelorussia)" = "Belarus",
  "Burkina Faso (Upper Volta)" = "Burkina Faso",
  "Cambodia (Kampuchea)" = "Cambodia",
  "Central African Republic" = "Cen African Rep",
  "Congo" = "Congo-Brz",
  "Congo, Democratic Republic of (Zaire)" = "Congo/Zaire",
  "DR Congo (Zaire)" = "Congo/Zaire",
  "Dominican Republic" = "Dominican Rep",
  "German Democratic Republic" = "Germany East",
  "Guinea-Bissau" = "Guinea Bissau",
  "Iran (Persia)" = "Iran",
  "Cote D’Ivoire" = "Ivory Coast",
  "Kyrgyz Republic" = "Kyrgyzstan",
  "Madagascar (Malagasy)" = "Madagascar",
  "Myanmar (Burma)" = "Myanmar",
  "Rumania" = "Romania",
  "Russia (Soviet Union)" = "Russia",
  "Tanzania/Tanganyika" = "Tanzania",
  "Turkey/Ottoman Empire" = "Turkey",
  "Vietnam, Democratic Republic of" = "Vietnam",
  "Vietnam, Republic of" = "Vietnam South",
  "Yemen (Arab Republic of Yemen)" = "Yemen",
  "Yemen (North Yemen)" = "Yemen",
  "Yemen, People's Republic of" = "Yemen South",
  "Yugoslavia (Serbia)" = "Yugoslavia",
  "Zimbabwe (Rhodesia)" = "Zimbabwe"
)

conflict <- conflict %>%
  dplyr::mutate(
    side_a = ifelse(side_a %in% names(country_replacements_conflict),
                    country_replacements_conflict[side_a],
                    side_a)
  )
conflict <- conflict %>%
  dplyr::mutate(
    side_b = ifelse(side_b %in% names(country_replacements_conflict),
                    country_replacements_conflict[side_b],
                    side_b)
  )

## All cases where Russia is in the side_a column are after the collapse of the
## Soviet Union, and all cases where Russia is in the side_b column are within
## the time period of the Soviet Union, hence why I only change Russia to
## Soviet Union in the side_b column.
conflict <- conflict %>%
  dplyr::mutate(side_b = str_replace(side_b, "Russia", "Soviet Union"))

### In preparation for merging with the other datasets and creating a dummy
### variable that serves my analysis, I duplicate all cases into mirrors of
### themselves and then combine the original dataframe with its mirrored
### duplicate. This is because I am interested in whether any country was
### involved in interstate or internationalized intrastate war in year X, but I
### do not distinguish between their involvement on side_a or side_b.
# Mirror image of the conflict table: the two party columns swap their values.
conflict_dup <- conflict
conflict_dup[c("side_a", "side_b")] <- conflict[c("side_b", "side_a")]

# Stack the original rows and their mirrors, then sort by the party columns.
combined_conflict <- bind_rows(conflict, conflict_dup)
combined_conflict <- arrange(combined_conflict, side_a, side_b)
rownames(combined_conflict) <- NULL

# Strip surrounding whitespace from both party columns and store year as double
# so it is comparable with the year column of the panel it is joined to.
combined_conflict <- combined_conflict %>%
  mutate(across(c(side_a, side_b), trimws))
combined_conflict$year <- as.double(combined_conflict$year)

## Keep a single row per side_a and year (the first one in the sorted table).
cleaned_conflict <- combined_conflict %>%
  group_by(side_a, year) %>%
  slice_head(n = 1) %>%
  ungroup()

merged_conflict <- left_join(
  merged_GDP_FBIC_GWF_VDEM,
  cleaned_conflict,
  by = c("gwf_country" = "side_a", "year")
)

## Dummy variable: 1 when the country-year found a match in the conflict table
## (side_b is filled in), 0 when it did not (side_b is NA after the join).
merged_conflict$int_conflict <- as.numeric(!is.na(merged_conflict$side_b))
```

```{r elections & mass mobilization & government effectiveness}
## Extracting election variables from the vdem dataset which indicate whether
## there were elections in country X in year Y. Multiple types of elections are
## included, such as legislative, presidential, and constituent assembly
## elections.
# Columns to pull from V-Dem: the ten election-type indicators plus the keys.
v2eltype_cols <- c("v2eltype_0", "v2eltype_1", "v2eltype_2", "v2eltype_3",
                   "v2eltype_4", "v2eltype_5", "v2eltype_6", "v2eltype_7",
                   "v2eltype_8", "v2eltype_9", "country_name", "year")
v2eltype <- vdemdata[, v2eltype_cols]

### Creating a dummy variable that distinguishes between years where there were
### no elections and years where there were *any* elections, regardless of
### their type: it is 1 when at least one v2eltype_* column is non-missing.
v2eltype <- v2eltype %>%
  dplyr::mutate(
    election_yes_or_no = ifelse(
      rowSums(!is.na(dplyr::select(., starts_with("v2eltype_")))) > 0,
      1,
      0
    )
  )

election_yes_or_no <- v2eltype %>%
  dplyr::select("country_name", "year", "election_yes_or_no")

# Same name harmonisation as for the V-Dem legitimation variables.
election_yes_or_no <- election_yes_or_no %>%
  dplyr::mutate(
    country_name = ifelse(country_name %in% names(country_replacements_vdem),
                          country_replacements_vdem[country_name],
                          country_name)
  )
election_yes_or_no <- election_yes_or_no %>%
  dplyr::mutate(
    country_name = ifelse(country_name == "Russia" & year >= 1922 & year <= 1991,
                          "Soviet Union",
                          country_name)
  )

merged_elections <- left_join(merged_conflict, election_yes_or_no,
                              by = c("gwf_country" = "country_name", "year"))

#### MASS MOBILIZATION
# Same procedure as with the vdem election variable.
v2cagenmob_cols <- c("v2cagenmob", "country_name", "year")
v2cagenmob <- vdemdata[, v2cagenmob_cols]

v2cagenmob <- v2cagenmob %>%
  dplyr::mutate(
    country_name = ifelse(country_name %in% names(country_replacements_vdem),
                          country_replacements_vdem[country_name],
                          country_name)
  )
v2cagenmob <- v2cagenmob %>%
  dplyr::mutate(
    country_name = ifelse(country_name == "Russia" & year >= 1922 & year <= 1991,
                          "Soviet Union",
                          country_name)
  )

merged_mob <- left_join(merged_elections, v2cagenmob,
                        by = c("gwf_country" = "country_name", "year"))
```

```{r adding leaders}
## Source: The Archigos data set, retrievable at
## https://www.rochester.edu/college/faculty/hgoemans/data.htm
leaders <- read_dta("Archigos_4.1_stata14.dta")

### Using the "ccode" variable to derive country names with the countrycode
### package, and taking the first four characters of "enddate" as the year.
leaders$country_name <- countrycode(leaders$ccode, "cown", "country.name")
leaders$year <- as.numeric(substr(leaders$enddate, 1, 4))

### Creating a binary variable indicating whether the leader's exit reason was
### natural death.
leaders$natural_death <- ifelse(leaders$exit == "Natural Death", 1, 0)

leaders <- leaders %>%
  dplyr::select(country_name, year, natural_death)

## Accounting for the cases where there are two (or more) leadership changes in
## one year: the country-year is coded 1 if ANY of that year's exits was a
## natural death. Taking only the first row would drop a natural death whenever
## it is not the first exit listed for that year. The result is returned
## ungrouped so later steps do not silently operate per group.
leaders <- leaders %>%
  group_by(country_name, year) %>%
  dplyr::summarise(
    natural_death = if (all(is.na(natural_death))) {
      NA_real_
    } else {
      max(natural_death, na.rm = TRUE)
    },
    .groups = "drop"
  )

# Replacement table (leader-data name = GWF name).
# NOTE(review): the keys must equal the names produced by countrycode() above;
# several look like they follow another source's naming scheme (e.g.
# "Vietnam-Republic of"). Check them against unique(leaders$country_name).
country_replacements_leaders <- c(
  "Dominican Republic" = "Dominican Rep",
  "Guinea-Bissau" = "Guinea Bissau",
  "Central African Republic" = "Cen African Rep",
  "Congo - Brazzaville" = "Congo-Brz",
  "Congo - Kinshasa" = "Congo/Zaire",
  "Yemen Arab Republic" = "Yemen",
  "Yemen People's Republic" = "Yemen South",
  "Vietnam-Republic of" = "Vietnam South",
  "Vietnam-Democratic Republic of" = "Vietnam",
  "Côte d’Ivoire" = "Ivory Coast",
  "North Korea" = "Korea North",
  "South Korea" = "Korea South"
)

leaders <- leaders %>%
  dplyr::mutate(
    country_name = ifelse(country_name %in% names(country_replacements_leaders),
                          country_replacements_leaders[country_name],
                          country_name)
  )
leaders <- leaders %>%
  dplyr::mutate(
    country_name = ifelse(country_name == "Russia" & year >= 1922 & year <= 1991,
                          "Soviet Union",
                          country_name)
  )

merged_death <- left_join(merged_mob, leaders,
                          by = c("gwf_country" = "country_name", "year"))
#### The years that didn't match correctly are not relevant for my analysis
#### because they are beyond the temporal range of my other datasets.

# naturaldeath: 1 only when a matched leader exit was a natural death; 0 both
# for other exits and for country-years without any matched exit.
merged_death$naturaldeath <- ifelse(
  is.na(merged_death$natural_death) | merged_death$natural_death == 0, 0, 1
)

## Creating a new dummy variable that indicates whether there was a leadership
## change (irrespective of its type) in country x in year y or not: 1 when the
## country-year matched a row of the leader table, 0 otherwise.
merged_death$leaderchange <- ifelse(is.na(merged_death$natural_death), 0, 1)
```

```{r starting in 1960}
### I restrict my dataset to begin from the year 1960, aligning with the
### commencement of FBIC data on dependence.
data1960 <- merged_death %>%
  dplyr::filter(year > 1959)
```

```{r normalization}
### Rescaling the dependent variable to the range [new_min, new_max] (min-max
### scaling) to simplify the interpretation of its coefficient, which is to be
### calculated in the following steps.
min_extent <- min(data1960$extent_shift_ideo_to_prag, na.rm = TRUE)
max_extent <- max(data1960$extent_shift_ideo_to_prag, na.rm = TRUE)
new_min <- 0
new_max <- 100

data1960$normalized_extent <-
  ((data1960$extent_shift_ideo_to_prag - min_extent) /
     (max_extent - min_extent)) * (new_max - new_min) + new_min

write.csv(data1960, "legitimation_dependence_1960_2010.csv", row.names = FALSE)

# Quick check that the assembled data can be fed to a linear model containing
# the main predictor, the controls, the regime-case factor and year.
model_test <- lm(
  normalized_extent ~ netdependence_diff + gdppc_diff + int_conflict +
    election_yes_or_no + v2cagenmob + leaderchange + gwf_casename + year,
  data = data1960
)
#summary(model_test)
```