Regression

library(wiesbaden)

Warning: package 'wiesbaden' was built under R version 4.1.2

library(tidyverse)

── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──

✔ ggplot2 3.3.5     ✔ purrr   0.3.4
✔ tibble  3.1.5     ✔ dplyr   1.0.7
✔ tidyr   1.1.4     ✔ stringr 1.4.0
✔ readr   2.0.2     ✔ forcats 0.5.1

── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()

Wiesbaden package to access data from the Genesis database from Statistisches Bundesamt (located in Wiesbaden).

The Credentials to access the API are saved already, test login follows here.

test_login(genesis=c(db="regio")) #test login

[1] "Sie wurden erfolgreich an- und abgemeldet."

wahl_lohn <- readRDS("data/wahl_lohn.Rds")

Simple Regression

\[ \text{afd}_k = \beta_0 + \beta_1 \times \text{min}_k + \epsilon \]

\(\text{afd}_k\) AfD Vote Share in Landkreis \(k\)
\(\text{min}_k\) Share of Minimum Wage Receivers in Landkreis \(k\)
\(\beta_0\) Intercept
\(\beta_1\) Slope
\(\epsilon\) Error

model_basic <- lm(
    afd_prozent ~ lohn_prozent, 
    data = wahl_lohn
  )
summary(model_basic)


Call:
lm(formula = afd_prozent ~ lohn_prozent, data = wahl_lohn)

Residuals:
      Min        1Q    Median        3Q       Max 
-0.113618 -0.023871 -0.004185  0.021197  0.119255 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept)  -0.026312   0.005823  -4.519 8.21e-06 ***
lohn_prozent  0.714426   0.028455  25.107  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.03638 on 398 degrees of freedom
Multiple R-squared:  0.613, Adjusted R-squared:  0.612 
F-statistic: 630.4 on 1 and 398 DF,  p-value: < 2.2e-16

Ost West

länder <- wahl_lohn$land %>% unique()
ost_names <- c("BB", "BE", "MV", "SN", "ST", "TH")

wahl_lohn <- wahl_lohn %>%
  mutate(
    ost = ifelse(land %in% ost_names, 1, 0)
  )

Dummy for ost

model_ost <- lm(
    afd_prozent ~ lohn_prozent + ost, 
    data = wahl_lohn
  )
summary(model_ost)


Call:
lm(formula = afd_prozent ~ lohn_prozent + ost, data = wahl_lohn)

Residuals:
      Min        1Q    Median        3Q       Max 
-0.094829 -0.019186 -0.001373  0.016590  0.098788 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)  0.039565   0.006889   5.743 1.85e-08 ***
lohn_prozent 0.291299   0.039392   7.395 8.48e-13 ***
ost          0.086182   0.006420  13.425  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.03021 on 397 degrees of freedom
Multiple R-squared:  0.7338,    Adjusted R-squared:  0.7325 
F-statistic: 547.2 on 2 and 397 DF,  p-value: < 2.2e-16

Unemployment Rate

from Regionalstatistik 13211-02-05-4

d <- retrieve_datalist(tableseries="13211*", genesis=c(db="regio"))
subset(d, grepl("Kreise", description))

data <- retrieve_data(tablename="13211KJ009", genesis=c(db="regio"))
head(data)

arbeit_data <- data %>% 
  filter(JAHR == 2021) %>%
  select(kreis = KREISE, arbeitslosenquote = ERWP10_val) %>%
  # remove rows with more than 5 digits in kreis
  mutate(arbeitslosenquote = arbeitslosenquote / 100) %>%
  filter(nchar(kreis) == 5)

Merge

wahl_lohn <- wahl_lohn %>%
  left_join(arbeit_data, by = "kreis")

model_arbeit <- lm(
    afd_prozent ~ lohn_prozent + ost+ arbeitslosenquote, 
    data = wahl_lohn
  )
summary(model_arbeit)


Call:
lm(formula = afd_prozent ~ lohn_prozent + ost + arbeitslosenquote, 
    data = wahl_lohn)

Residuals:
      Min        1Q    Median        3Q       Max 
-0.090252 -0.018952 -0.001568  0.015493  0.100010 

Coefficients:
                   Estimate Std. Error t value Pr(>|t|)    
(Intercept)        0.050509   0.007319   6.902 2.05e-11 ***
lohn_prozent       0.308381   0.038939   7.920 2.43e-14 ***
ost                0.087937   0.006322  13.910  < 2e-16 ***
arbeitslosenquote -0.279143   0.071078  -3.927 0.000101 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.02968 on 396 degrees of freedom
Multiple R-squared:  0.7438,    Adjusted R-squared:  0.7419 
F-statistic: 383.2 on 3 and 396 DF,  p-value: < 2.2e-16

GDP per Capita

Regionalstatistik 82111

d <- retrieve_datalist(tableseries="82111*", genesis=c(db="regio"))
subset(d, grepl("Kreise", description))

data <- retrieve_data(tablename="82111KJ008", genesis=c(db="regio"))
head(data)

gdp_data <- data %>% 
  filter(JAHR == 2021) %>%
  select(kreis = KREISE, gdp = BIP804_val) %>%
  # remove rows with more than 5 digits in kreis
  filter(nchar(kreis) == 5)

gdp_data %>%
  filter(!kreis %in% wahl_lohn$kreis) %>%
  select(kreis) %>%
  arrange(kreis)

16056 (Eisenach) not present in Wahl_Lohn Data…, because Eisenach Kreis doesnt exist anymore, it is now Wartburgkreis (16063)

Merge

wahl_lohn <- wahl_lohn %>%
  left_join(gdp_data, by = "kreis")

Model

model_gdp <- lm(
    afd_prozent ~ lohn_prozent + ost+ arbeitslosenquote + log(gdp), 
    data = wahl_lohn
  )
summary(model_gdp)


Call:
lm(formula = afd_prozent ~ lohn_prozent + ost + arbeitslosenquote + 
    log(gdp), data = wahl_lohn)

Residuals:
      Min        1Q    Median        3Q       Max 
-0.089117 -0.018857 -0.001454  0.015734  0.100196 

Coefficients:
                   Estimate Std. Error t value Pr(>|t|)    
(Intercept)        0.071078   0.059875   1.187 0.235898    
lohn_prozent       0.301457   0.043815   6.880 2.35e-11 ***
ost                0.088321   0.006425  13.746  < 2e-16 ***
arbeitslosenquote -0.274819   0.072245  -3.804 0.000165 ***
log(gdp)          -0.001852   0.005349  -0.346 0.729429    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.02971 on 395 degrees of freedom
Multiple R-squared:  0.7439,    Adjusted R-squared:  0.7413 
F-statistic: 286.8 on 4 and 395 DF,  p-value: < 2.2e-16

Avg. Age

d <- retrieve_datalist(tableseries="12411*", genesis=c(db="regio"))
subset(d, grepl("Kreise", description))

data <- retrieve_data(tablename="12411KJ019", genesis=c(db="regio"))
head(data)

age_data <- data %>% 
  filter(STAG == "31.12.2021") %>%
  select(kreis = KREISE, 
         age = BEV519_val #avg age
         ) %>%
  # remove rows with more than 5 digits in kreis
  filter(nchar(kreis) == 5)

Merge

wahl_lohn <- wahl_lohn %>%
  left_join(age_data, by = "kreis")

model_age <- lm(
    afd_prozent ~ lohn_prozent + ost+ arbeitslosenquote + log(gdp) + age, 
    data = wahl_lohn
  )
summary(model_age)


Call:
lm(formula = afd_prozent ~ lohn_prozent + ost + arbeitslosenquote + 
    log(gdp) + age, data = wahl_lohn)

Residuals:
      Min        1Q    Median        3Q       Max 
-0.072729 -0.019526 -0.002674  0.017335  0.097038 

Coefficients:
                   Estimate Std. Error t value Pr(>|t|)    
(Intercept)       -0.331786   0.091478  -3.627 0.000324 ***
lohn_prozent       0.226691   0.044195   5.129 4.57e-07 ***
ost                0.081481   0.006302  12.929  < 2e-16 ***
arbeitslosenquote -0.243973   0.069766  -3.497 0.000524 ***
log(gdp)           0.010820   0.005614   1.927 0.054652 .  
age                0.006261   0.001104   5.672 2.74e-08 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.0286 on 394 degrees of freedom
Multiple R-squared:  0.7632,    Adjusted R-squared:  0.7602 
F-statistic:   254 on 5 and 394 DF,  p-value: < 2.2e-16

Population Density

Regionalstatistik 99910

d <- retrieve_datalist(tableseries="99910*", genesis=c(db="regio"))
subset(d, grepl("Bevölkerung", description))

data <- retrieve_data(tablename="99910KJA02", genesis=c(db="regio"))
head(data)

pop_data <- data %>% 
  filter(JAHR == 2021) %>%
  select(kreis = KREISE, 
         pop = AI0201_val, #population density je km2
         foreigners = AI0208_val #foreigners rate in %
         ) %>%
  mutate(foreigners = foreigners / 100) %>%
  # remove rows with more than 5 digits in kreis
  filter(nchar(kreis) == 5)

Merge

wahl_lohn <- wahl_lohn %>%
  left_join(pop_data, by = "kreis")

model_pop <- lm(
    afd_prozent ~ lohn_prozent + ost+ arbeitslosenquote + log(gdp) +age + log(pop), 
    data = wahl_lohn
  )
summary(model_pop)


Call:
lm(formula = afd_prozent ~ lohn_prozent + ost + arbeitslosenquote + 
    log(gdp) + age + log(pop), data = wahl_lohn)

Residuals:
      Min        1Q    Median        3Q       Max 
-0.070578 -0.018897 -0.002703  0.017855  0.098898 

Coefficients:
                   Estimate Std. Error t value Pr(>|t|)    
(Intercept)       -0.315423   0.092243  -3.419 0.000693 ***
lohn_prozent       0.219617   0.044483   4.937 1.17e-06 ***
ost                0.081794   0.006301  12.981  < 2e-16 ***
arbeitslosenquote -0.162693   0.093287  -1.744 0.081941 .  
log(gdp)           0.013461   0.005959   2.259 0.024447 *  
age                0.005591   0.001216   4.600 5.71e-06 ***
log(pop)          -0.002990   0.002281  -1.311 0.190642    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.02858 on 393 degrees of freedom
Multiple R-squared:  0.7642,    Adjusted R-squared:  0.7606 
F-statistic: 212.3 on 6 and 393 DF,  p-value: < 2.2e-16

Choosing the best Model

via Stepwise Regression, explained here

not well regarded in Econometrics, because it is prone to overfitting! just for self reference, that lohn_prozent is chosen and therefore valid

step(model_basic, direction = "both", scope = ~ lohn_prozent + ost + foreigners+ arbeitslosenquote + log(gdp) + age, trace=0)


Call:
lm(formula = afd_prozent ~ lohn_prozent + ost + age + arbeitslosenquote + 
    foreigners, data = wahl_lohn)

Coefficients:
      (Intercept)       lohn_prozent                ost                age  
        -0.326616           0.218156           0.088402           0.008381  
arbeitslosenquote         foreigners  
        -0.385274           0.180668

Seems, best model is: “afd_prozent ~ lohn_prozent + ost + age + arbeitslosenquote + foreigners”

Interlude: Linke

model_linke <- lm(
    linke_prozent ~ lohn_prozent,
    data = wahl_lohn
  )

model_linke_2 <- lm(
    linke_prozent ~ lohn_prozent + ost,
    data = wahl_lohn
  )

model_linke_3 <- lm(
    linke_prozent ~ lohn_prozent + ost + log(gdp),
    data = wahl_lohn
  )

summary(model_linke_3)


Call:
lm(formula = linke_prozent ~ lohn_prozent + ost + log(gdp), data = wahl_lohn)

Residuals:
      Min        1Q    Median        3Q       Max 
-0.035639 -0.008126 -0.001951  0.005206  0.052785 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept)  -0.117659   0.027711  -4.246 2.72e-05 ***
lohn_prozent -0.002847   0.020338  -0.140    0.889    
ost           0.068507   0.003049  22.471  < 2e-16 ***
log(gdp)      0.014297   0.002457   5.819 1.23e-08 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.01385 on 395 degrees of freedom
  (1 observation deleted due to missingness)
Multiple R-squared:  0.7754,    Adjusted R-squared:  0.7737 
F-statistic: 454.6 on 3 and 395 DF,  p-value: < 2.2e-16

=> not significant in all variants, low R2 in basic variant

Save Data

#save all Models
save(
  model_basic,
  model_ost,
  model_arbeit, 
  model_gdp, 
  model_age,
  model_pop, 
  file = "data/models.RData")

saveRDS(wahl_lohn, "data/wahl_lohn_mod.RDS")