6 Clusteranalyse
library(tidyverse)
library(GGally)
library(ggpubr)
6.1 Datensatz Tidyr
<- tibble(USArrests) arrests
6.1.1 Umformung
<- arrests %>%
arrests mutate(state = rownames(USArrests))
6.2 Deskription
6.2.1 kurze Analyse
%>%
arrests summarise(
avg_murder = mean(Murder),
sd_murder = sd(Murder),
avg_assault = mean(Assault),
sd_assault = sd(Assault),
avg_pop = mean(UrbanPop),
sd_pop = sd(UrbanPop),
avg_rape = mean(Rape),
sd_rape = sd(Rape)
)
## # A tibble: 1 × 8
## avg_murder sd_murder avg_assault sd_assault avg_pop
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 7.79 4.36 171. 83.3 65.5
## # … with 3 more variables: sd_pop <dbl>,
## # avg_rape <dbl>, sd_rape <dbl>
6.2.2 Zusammenhänge
%>%
arrests ggpairs(columns = c("Murder", "Assault", "UrbanPop", "Rape"))
## plot: [1,1] [=>------------------------] 6% est: 0s
## plot: [1,2] [==>-----------------------] 12% est: 0s
## plot: [1,3] [====>---------------------] 19% est: 0s
## plot: [1,4] [=====>--------------------] 25% est: 0s
## plot: [2,1] [=======>------------------] 31% est: 0s
## plot: [2,2] [=========>----------------] 38% est: 0s
## plot: [2,3] [==========>---------------] 44% est: 0s
## plot: [2,4] [============>-------------] 50% est: 0s
## plot: [3,1] [==============>-----------] 56% est: 0s
## plot: [3,2] [===============>----------] 62% est: 0s
## plot: [3,3] [=================>--------] 69% est: 0s
## plot: [3,4] [===================>------] 75% est: 0s
## plot: [4,1] [====================>-----] 81% est: 0s
## plot: [4,2] [======================>---] 88% est: 0s
## plot: [4,3] [=======================>--] 94% est: 0s
## plot: [4,4] [==========================]100% est: 0s
6.3 hierarchsiche Clusteranalyse
wichtig: - Standardisierung der Variablen - nur für metrisch skalierte Variablen
Vorbereitung
# Standardisierung
<- scale(arrests[,-5]) #alle außer die states
st_arrest
# Distanzmatrix (euklidisches Maß)
<- dist(st_arrest) eu_dist
Clusterisierung
<- hclust(eu_dist)
h_eu_compl $labels <- arrests$state h_eu_compl
Darstellung mit Dendrogrammen
plot(h_eu_compl, hang = -1)
=> vermutlich beste Clusteranzahl = 4
6.4 partitionierende Clusteranalyse
<- kmeans(st_arrest[,1:2], centers = 4, nstart = 50)
k2
%>%
arrests mutate(cluster = k2$cluster) %>%
ggplot(aes(Assault, Murder, color = factor(cluster))) +
geom_point() +
geom_text(aes(label = state), hjust = -0.1)
6.5 Aufgabenblatt
Biathlon Datensatz
importe:
library(tidyverse)
library(GGally)
library(ggpubr)
Datensatz einlesen
load("data/biathlon3.RData")
head(df.biathlon3,1) %>% t()
## 1
## nation "FRA"
## gender "M"
## competition "I"
## type "W"
## total.time "2667.9"
## course.lap.1 "486.6"
## course.lap.2 "482.8"
## course.lap.3 "481.9"
## course.lap.4 "484.6"
## course.lap.5 "480.7"
## course.total "2416.6"
## shoot.times.1 "26"
## shoot.times.2 "23"
## shoot.times.3 "36"
## shoot.times.4 "32"
## shoot.times.total "117"
## fails.1 "0"
## fails.2 "0"
## fails.3 "0"
## fails.4 "1"
## fails.total "1"
a) Betrachtung des Datensatzes
library(pastecs)
%>%
df.biathlon3 ::select(course.lap.1:shoot.times.total) %>%
dplyrstat.desc(basic=F) %>%
t() %>%
as.data.frame()
## median mean SE.mean
## course.lap.1 415.2 414.78749 1.23440998
## course.lap.2 427.1 428.50839 1.27495791
## course.lap.3 425.3 431.12730 1.31816930
## course.lap.4 390.9 421.09404 2.26134803
## course.lap.5 380.9 408.46061 2.16532519
## course.total 1556.7 1701.83214 8.32654983
## shoot.times.1 31.0 31.86822 0.08935101
## shoot.times.2 29.8 30.48364 0.09726272
## shoot.times.3 29.5 30.35905 0.14662694
## shoot.times.4 27.6 28.36341 0.13297238
## shoot.times.total 95.0 92.60725 0.53834769
## CI.mean.0.95 var std.dev
## course.lap.1 2.4202103 5503.85004 74.187937
## course.lap.2 2.4997094 5871.36979 76.624864
## course.lap.3 2.5844306 6276.10397 79.221865
## course.lap.4 4.4350467 9516.58623 97.552992
## course.lap.5 4.2467229 8725.54638 93.410633
## course.total 16.3252098 250425.13259 500.424952
## shoot.times.1 0.1751835 28.83677 5.369988
## shoot.times.2 0.1906953 34.16965 5.845481
## shoot.times.3 0.2875707 40.01049 6.325385
## shoot.times.4 0.2607908 32.90556 5.736337
## shoot.times.total 1.0554959 1046.82348 32.354652
## coef.var
## course.lap.1 0.1788577
## course.lap.2 0.1788177
## course.lap.3 0.1837552
## course.lap.4 0.2316656
## course.lap.5 0.2286895
## course.total 0.2940507
## shoot.times.1 0.1685061
## shoot.times.2 0.1917580
## shoot.times.3 0.2083525
## shoot.times.4 0.2022443
## shoot.times.total 0.3493749
$type <- factor(df.biathlon3$type)
df.biathlon3$competition <- factor(df.biathlon3$competition)
df.biathlon3summary(df.biathlon3$competition)
## I M P S
## 589 360 912 1751
summary(df.biathlon3$type)
## O W
## 525 3087
%>%
df.biathlon3 ggplot(aes(x= course.total, y= shoot.times.total,color=competition)) +
geom_point()
# facet_wrap(~gender)
b)
library(patchwork)
<- df.biathlon3 %>% dplyr::select(course.lap.1:course.lap.5)
df_laps <- df.biathlon3 %>% dplyr::select(shoot.times.1:shoot.times.4)
df_shoots <- dplyr::select(df.biathlon3, fails.1:fails.4)
df_fails
<- ggplot(stack(df_laps), aes(x = ind, y = values)) +
p1 geom_boxplot()
<- ggplot(stack(df_shoots), aes(x = ind, y = values)) +
p2 geom_boxplot()
/ p2 p1
## Warning: Removed 3502 rows containing non-finite values
## (stat_boxplot).
## Warning: Removed 3502 rows containing non-finite values
## (stat_boxplot).
ggplot(df.biathlon3, aes(x=fails.total)) +
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value
## with `binwidth`.
c) auwahl, standardisierung
<- df.biathlon3 %>%
st_df ::select(course.total, shoot.times.total, fails.total) %>%
dplyrscale()
# mutate(comp = competition)
plot(hclust(dist(st_df)))
<- kmeans(st_df[,1:2], centers = 4, nstart = 50)
k
<- ggplot(df.biathlon3, aes(course.total, shoot.times.total)) +
p1 geom_point(aes(color=factor(k$cluster))) +
theme(legend.position = "none") +
labs(title = "Cluster")
<- ggplot(df.biathlon3, aes(course.total, shoot.times.total)) +
p2 geom_point(aes(color=competition)) +
theme(legend.position = "none") +
labs(title = "Comp")
+ p2 p1