# Trigger a garbage collection before the script's heavy steps start
gc()


# Preface -----------------------------------------------------------------

# Goal: Assign each firm (identified via BvD-ID) one or more applicable 
# collection rates for the Local Business tax

# LBT rate: basic federal rate x collection rate

# Collection rate is determined on the municipality (Gemeinde) level
# -> high variation in applicable rates across Germany

# STEPS PERFORMED
# 1. Find the location of each establishment of the company
# 2. Determine in which municipality the establishment is situated -> Municipality identifier
# 3. Merge the collection rate to the firm via the municipality identifier

# Geocode zip code + city -------------------------------------------------

# Load the address list that is to be geocoded
addresses_all <- read_parquet(file = "Data/addresses_all.parquet")

# Geocoding is a time-intensive task: R talks to the OpenStreetMap server via
# an API, and posting too many requests in a short time gets you blocked.
# The geocoding function therefore waits 1 sec per request
# --> geocoding 4.8 million addresses would take about 55 days (!)

# Only the rough location is needed to determine the municipality, so cut the
# run-time by geocoding each distinct postcode/city/country combination once.
# `as.data.table()` guarantees that the data.table `.()` column selection works
# even when `read_parquet()` hands back a plain data frame / tibble.
unique_addresses <- unique(
  as.data.table(addresses_all)[, .(Postcode, City, Country)]
)

# Send the distinct combinations to the geocoder
# ... Come back the next day (~13h run-time)
unique_addresses_gc <- geocode(
  unique_addresses,
  city = City,
  country = Country,
  postalcode = Postcode,
  method = "osm",
  verbose = TRUE
)

# Store the result so that the later sections can start from the saved file
write_parquet(unique_addresses_gc, sink = "Data/unique_addresses_gc.parquet")

# Check validity of coordinates -------------------------------------------

# Start from the stored geocoding result
unique_addresses_gc <- read_parquet(file = "Data/unique_addresses_gc.parquet")

# Keep only rows that have both a latitude and a longitude
unique_addresses_gc <- filter(
  unique_addresses_gc,
  !is.na(lat),
  !is.na(long)
)

# Turn the coordinate columns into point geometries.
# crs = coordinate reference system: coordinates are always relative to some
# reference point and several systems exist. What matters most is that all
# spatial data in the analysis share one system -> crs 4326 is commonly used
address_sf <- unique_addresses_gc |>
  st_as_sf(coords = c("long", "lat"), crs = 4326)

# Visual sanity check: plot all geocoded points
ggplot(address_sf) +
  geom_sf()


# Load Municipality Shapefile ---------------------------------------------

# Location of the municipality-level shapefile
path_shapefile <- "Data/geo_daten/vg250_12-31.gk3.shape.ebenen/vg250_ebenen_1231/VG250_GEM.shp"

# The shapefile holds the municipality borders, each stored as a
# multi-polygon (Vieleck)
muni_shp <- sf::read_sf(dsn = path_shapefile)

# Inspect the reference system the file comes with, then bring the borders
# into the same reference system as the geocoded points
st_crs(muni_shp)
muni_shp <- muni_shp |>
  st_transform(crs = 4326)


# Spatial merge -----------------------------------------------------------

# Spatial merge: for every point (location of a company) look up the surface
# (area of a municipality) it intersects with. Only the municipality key AGS
# is taken over from the shapefile.
address_merged <- address_sf |>
  st_join(
    y = muni_shp |> select(AGS),
    join = st_intersects
  )
# Each address now carries the key of its municipality

# Merge AGS to BvD-ID -----------------------------------------------------

# Next step: match addresses back to firm identifiers, so that each firm has 
# a set of mapped municipality identifiers
# Reload the full address list (one row per address record of a firm)
addresses_all <- read_parquet(file = "Data/addresses_all.parquet")

# Attach the municipality key to every address.
# - The geometry column is dropped from the lookup before joining: it is not
#   needed once AGS is known, so there is no reason to carry it through the
#   join on the full address list.
# - The join keys are spelled out instead of relying on whichever column names
#   the two tables happen to share.
# NOTE(review): if a point intersected more than one polygon in the spatial
# join, the lookup holds several rows for that key and the matching address
# rows are repeated here -- confirm that this is intended.
addresses_ags <- left_join(
  addresses_all,
  sf::st_drop_geometry(address_merged),
  by = c("Postcode", "City", "Country")
)

# Continue as a data.table keyed by the firm identifier
addresses_ags <- as.data.table(addresses_ags)
setkey(addresses_ags, `BvD ID number`)

# Normalise column names: lower case, whitespace replaced by underscores
# (e.g. `BvD ID number` becomes `bvd_id_number`)
col_names <- str_to_lower(str_replace_all(names(addresses_ags), "\\s", "_"))
setnames(addresses_ags, new = col_names)

# Drop the country columns from the output
addresses_ags$country <- NULL
addresses_ags$country_iso_code <- NULL

# Save the address-to-municipality mapping
write_parquet(addresses_ags, sink = "Data/addresses_ags.parquet")

# Remove every object created by this script and release the memory again
rm(
  list = c(
    "addresses_all",
    "unique_addresses",
    "unique_addresses_gc",
    "address_sf",
    "path_shapefile",
    "muni_shp",
    "address_merged",
    "addresses_ags",
    "col_names"
  )
)
gc()