
# Preface -----------------------------------------------------------------

# Orbis Financial Data for universe of German firms


# Function Definitions ----------------------------------------------------

# Parse closing dates that arrive with mixed precision.
#
# x: vector coercible to character. The layout is chosen by string length:
#    8 characters -> parsed with ymd();
#    6 characters -> parsed with ym() (first day of that month);
#    4 characters -> 31 December of that year.
# Returns a Date vector of the same length as x. NA inputs and strings of any
# other length yield NA.
format_date <- function(x) {
  x <- as.character(x)
  n_char <- str_length(x)
  parsed <- rep(as.Date(NA), length(x))

  # Hand each parser only the strings of its own length, so lubridate does not
  # emit "failed to parse" warnings for values that belong to another format.
  # `%in%` (rather than `==`) maps NA lengths to FALSE.
  is_ymd <- n_char %in% 8L
  is_ym <- n_char %in% 6L
  is_year <- n_char %in% 4L

  if (any(is_ymd)) {
    parsed[is_ymd] <- ymd(x[is_ymd])
  }
  if (any(is_ym)) {
    parsed[is_ym] <- ym(x[is_ym])
  }
  if (any(is_year)) {
    parsed[is_year] <- make_date(
      year = as.integer(x[is_year]),
      month = 12,
      day = 31
    )
  }

  parsed
}


# Load Data ---------------------------------------------------------------

# Open the raw tab-separated export once, without a schema; this first pass is
# only used to read the header names that are normalised right below.
# Quoting is disabled so quote characters in the data are treated literally.
financials <- open_dataset(
  sources = 'Data/Industry-Global_financials_and_ratios-EUR.txt',
  format = 'tsv',
  parse_options = CsvParseOptions$create(delimiter = "\t", quoting = FALSE)
)

# Normalise the header names of the opened dataset: lower-case, strip
# parentheses and slashes, trim surrounding whitespace, and turn the remaining
# spaces into underscores.
col_names <- names(financials) |>
  str_to_lower() |>
  str_replace_all("[()/]", "") |>
  str_trim() |>
  str_replace_all(fixed(" "), "_")

# Re-open the raw export with an explicit schema and write the German subset to
# parquet.
#
# * Every column is declared as string; the numeric conversion is done later in
#   the script, after the needed columns have been selected.
# * `column_names` supplies the normalised names, so the file's own header row
#   is skipped with `skip_rows = 1`.
# * Rows are kept when the first two characters of `bvd_id_number` are "DE".
financials <- open_dataset(
  sources = 'Data/Industry-Global_financials_and_ratios-EUR.txt',
  format = 'tsv',
  schema = schema(
    field('bvd_id_number', string()),
    field('consolidation_code', string()),
    field('filing_type', string()),
    field('closing_date', string()),
    field('number_of_months', string()),
    field('audit_status', string()),
    field('accounting_practice', string()),
    field('source_for_publicly_quoted_companies', string()),
    field('original_units', string()),
    field('original_currency', string()),
    field('exchange_rate_from_original_currency', string()),
    field('fixed_assets', string()),
    field('intangible_fixed_assets', string()),
    field('tangible_fixed_assets', string()),
    field('other_fixed_assets', string()),
    field('current_assets', string()),
    field('stock', string()),
    field('debtors', string()),
    field('other_current_assets', string()),
    field('cash_&_cash_equivalent', string()),
    field('total_assets', string()),
    field('shareholders_funds', string()),
    field('capital', string()),
    field('other_shareholders_funds', string()),
    field('non-current_liabilities', string()),
    field('long_term_debt', string()),
    field('other_non-current_liabilities', string()),
    field('provisions', string()),
    field('current_liabilities', string()),
    field('loans', string()),
    field('creditors', string()),
    field('other_current_liabilities', string()),
    field('total_shareh._funds_&_liab.', string()),
    field('working_capital', string()),
    field('net_current_assets', string()),
    field('enterprise_value', string()),
    field('number_of_employees', string()),
    field('operating_revenue_turnover', string()),
    field('sales', string()),
    field('costs_of_goods_sold', string()),
    field('gross_profit', string()),
    field('other_operating_expenses', string()),
    field('operating_pl_[=ebit]', string()),
    field('financial_revenue', string()),
    field('financial_expenses', string()),
    field('financial_pl', string()),
    field('pl_before_tax', string()),
    field('taxation', string()),
    field('pl_after_tax', string()),
    field('extr._and_other_revenue', string()),
    field('extr._and_other_expenses', string()),
    field('extr._and_other_pl', string()),
    field('pl_for_period_[=net_income]', string()),
    field('export_revenue', string()),
    field('material_costs', string()),
    field('costs_of_employees', string()),
    field('depreciation_&_amortization', string()),
    field('interest_paid', string()),
    field('research_&_development_expenses', string()),
    field('cash_flow', string()),
    field('added_value', string()),
    field('ebitda', string()),
    field("roe_using_pl_before_tax_%", string()),
    field("roce_using_pl_before_tax_%", string()),
    field("roa_using_pl_before_tax_%", string()),
    field("roe_using_net_income_%", string()),
    field("roce_using_net_income_%", string()),
    field("roa_using_net_income_%", string()),
    field("profit_margin_%", string()),
    field("gross_margin_%", string()),
    field("ebitda_margin_%", string()),
    field("ebit_margin_%", string()),
    field("cash_flow__operating_revenue_%", string()),
    field("enterprise_value__ebitda_x", string()),
    field("market_cap__cash_flow_from_operations_x", string()),
    field("net_assets_turnover_x", string()),
    field("interest_cover_x", string()),
    field("stock_turnover_x", string()),
    field("collection_period_days", string()),
    field("credit_period_days", string()),
    field("export_revenue__operating_revenue_%", string()),
    field("r&d_expenses__operating_revenue_%", string()),
    field("current_ratio_x", string()),
    field("liquidity_ratio_x", string()),
    field("shareholders_liquidity_ratio_x", string()),
    field("solvency_ratio_asset_based_%", string()),
    field("solvency_ratio_liability_based_%", string()),
    field("gearing_%", string()),
    field("profit_per_employee_th", string()),
    field("operating_revenue_per_employee_th", string()),
    field("costs_of_employees__operating_revenue_%", string()),
    field("average_cost_of_employee_th", string()),
    field("shareholders_funds_per_employee_th", string()),
    field("working_capital_per_employee_th", string()),
    field("total_assets_per_employee_th", string()),
    field("estimated_operating_revenue", string()),
    field("estimated_employees", string()),
    field("operating_revenue_original_range_value", string()),
    field("employees__original_range_value", string())
  ),
  parse_options = CsvParseOptions$create(delimiter = "\t", quoting = FALSE),
  read_options = CsvReadOptions$create(
    column_names = col_names,
    skip_rows = 1
  ),
  # NOTE(review): UTF-8 validation is switched off, presumably because the
  # export contains bytes that are not valid UTF-8 -- confirm.
  convert_options = CsvConvertOptions$create(check_utf8 = FALSE)
) %>%
  filter(substr(bvd_id_number, 1, 2) == 'DE') %>%
  write_parquet(sink = 'Data/orbis_financials.parquet')


# Load Selected Variables -------------------------------------------------

# Read back only the columns needed downstream from the parquet file:
# firm id, consolidation code, closing date, and five financial variables.
financials <- collect(
  select(
    open_dataset(sources = 'Data/orbis_financials.parquet'),
    bvd_id_number,
    consolidation_code,
    closing_date,
    total_assets,
    number_of_employees,
    operating_revenue_turnover,
    taxation,
    costs_of_employees
  )
)

# Convert the collected data frame to a data.table by reference.
setDT(financials)

# Clean data --------------------------------------------------------------

# Keep only the consolidation codes of interest. Filtering first means the
# conversions and the sort below touch fewer rows; every later step is either
# row-wise or a stable sort, so the final table is unchanged.
# NOTE(review): U1/U2/LF presumably denote unconsolidated and
# limited-financials accounts -- confirm against the Orbis documentation.
financials <- financials[consolidation_code %in% c("U1", "U2", "LF")]

# Everything that is not an identifier/descriptor column is converted from
# string to numeric. Values that are not parseable as numbers become NA (with
# a coercion warning).
numeric_vars <- setdiff(
  names(financials),
  c(
    'bvd_id_number',
    'consolidation_code',
    'filing_type',
    'closing_date',
    'audit_status',
    'accounting_practice',
    'original_units'
  )
)

financials[, (numeric_vars) := lapply(.SD, as.numeric), .SDcols = numeric_vars]

# Parsed closing date (Date); the raw string column is kept alongside it.
financials[, closing_date_fmt := format_date(closing_date)]

# Fiscal year: closing dates in January-June are assigned to the previous
# calendar year, July-December to the current one.
financials[, fyear := ifelse(month(closing_date_fmt) <= 6,
                             year(closing_date_fmt) - 1,
                             year(closing_date_fmt))]

# Sort once by firm, fiscal year and closing date. A separate setkey() before
# this sort would only repeat the work on a prefix of the same columns.
setorder(financials, bvd_id_number, fyear, closing_date_fmt)

# Drop remaining duplicates (0.3 %)
# unique() keeps the first row per firm-year, i.e. the earliest closing date
# after the sort above.
# NOTE(review): if the latest statement per fiscal year is wanted instead,
# the sort on closing_date_fmt must be descending -- confirm intent.
financials <- unique(financials, by = c("bvd_id_number", "fyear"))

# Key the table and move the key columns to the front.
setkey(financials, bvd_id_number, fyear)
setcolorder(financials, c("bvd_id_number", "fyear"))

write_parquet(financials, sink = 'Data/financials_cleaned.parquet')

# Release the memory held by the intermediate objects.
rm(financials, numeric_vars)
gc()