8 Maschinelles Lernen / Data Mining
=> Lösen von Klassifikationsproblemen
8.1 Import all packages
8.2 Datensatz einlesen
# Preview the first six rows of the built-in iris data set
head(x = iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 5.1 3.5 1.4 0.2
## 2 4.9 3.0 1.4 0.2
## 3 4.7 3.2 1.3 0.2
## 4 4.6 3.1 1.5 0.2
## 5 5.0 3.6 1.4 0.2
## 6 5.4 3.9 1.7 0.4
## Species
## 1 setosa
## 2 setosa
## 3 setosa
## 4 setosa
## 5 setosa
## 6 setosa
# Standardize the four numeric iris columns (z-scores),
# then re-attach the species label as a regular column.
st_iris <- as.data.frame(scale(iris[, -5]))
st_iris <- tibble(st_iris) %>%
  add_column(Species = iris$Species)
8.2.1 Beschreibung
## Sepal.Length Sepal.Width Petal.Length
## median 5.80000000 3.00000000 4.3500000
## mean 5.84333333 3.05733333 3.7580000
## SE.mean 0.06761132 0.03558833 0.1441360
## CI.mean.0.95 0.13360085 0.07032302 0.2848146
## var 0.68569351 0.18997942 3.1162779
## std.dev 0.82806613 0.43586628 1.7652982
## coef.var 0.14171126 0.14256420 0.4697441
## Petal.Width Species
## median 1.30000000 NA
## mean 1.19933333 NA
## SE.mean 0.06223645 NA
## CI.mean.0.95 0.12298004 NA
## var 0.58100626 NA
## std.dev 0.76223767 NA
## coef.var 0.63555114 NA
8.2.2 Visualisieren
## plot: [1,1] [=>------------------------] 6% est: 0s
## plot: [1,2] [==>-----------------------] 12% est: 1s
## plot: [1,3] [====>---------------------] 19% est: 1s
## plot: [1,4] [=====>--------------------] 25% est: 1s
## plot: [2,1] [=======>------------------] 31% est: 1s
## plot: [2,2] [=========>----------------] 38% est: 1s
## plot: [2,3] [==========>---------------] 44% est: 0s
## plot: [2,4] [============>-------------] 50% est: 0s
## plot: [3,1] [==============>-----------] 56% est: 0s
## plot: [3,2] [===============>----------] 62% est: 0s
## plot: [3,3] [=================>--------] 69% est: 0s
## plot: [3,4] [===================>------] 75% est: 0s
## plot: [4,1] [====================>-----] 81% est: 0s
## plot: [4,2] [======================>---] 88% est: 0s
## plot: [4,3] [=======================>--] 94% est: 0s
## plot: [4,4] [==========================]100% est: 0s
8.3 Lineare Diskriminanzanalyse
8.3.1 Test / Train
# Random train/test split of the standardized iris data.
# NOTE: the original comment said "75 % fuers testen", but the 75% sample
# is used as the TRAINING set below; the remaining 25% is the test set.
train <- sample(seq_len(nrow(st_iris)), 0.75 * nrow(st_iris)) # 75% of rows for training
train_set <- st_iris[train, ]
test_set  <- st_iris[-train, ]
8.3.2 Modelltrainierung
# Fit a linear discriminant analysis model on all predictors,
# then inspect its performance on the training data.
lda_train <- lda(Species ~ ., data = train_set)
train_pred <- predict(lda_train, train_set)$class
# Confusion matrix: predicted class vs. true species (training data)
table(train_pred, train_set$Species)
##
## train_pred setosa versicolor virginica
## setosa 36 0 0
## versicolor 0 36 1
## virginica 0 2 37
8.3.3 Bewertung anhand der Testdaten
# Evaluate the LDA model on the held-out test data.
test_pred <- predict(lda_train, test_set)$class
# Confusion matrix: predicted class vs. true species (test data)
table(test_pred, test_set$Species)
##
## test_pred setosa versicolor virginica
## setosa 14 0 0
## versicolor 0 12 0
## virginica 0 0 12
8.4 Random Forest Methode
8.4.1 Bildung mehrerer Entscheidungsbäume
# Rebuild the train/test split (same recipe as for LDA: 75% training)
train <- sample(seq_len(nrow(st_iris)), 0.75 * nrow(st_iris))
train_set <- st_iris[train, ]
test_set  <- st_iris[-train, ]

# Fit a random forest classifier on all predictors.
rf1 <- randomForest(
  Species ~ .,
  data = train_set,
  ntree = 100,        # number of trees in the forest
  mtry = 2,           # predictors sampled at each split
  importance = TRUE,  # keep variable-importance measures
  proximity = TRUE    # keep the proximity matrix
)
rf1
##
## Call:
## randomForest(formula = Species ~ ., data = train_set, ntree = 100, mtry = 2, importance = T, proximity = T)
## Type of random forest: classification
## Number of trees: 100
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 8.04%
## Confusion matrix:
## setosa versicolor virginica class.error
## setosa 36 0 0 0.0000000
## versicolor 0 31 4 0.1142857
## virginica 0 5 36 0.1219512
# Random forest predictions on the training data
# (column 5 = Species is excluded from the feature matrix).
train_predict <- predict(rf1, train_set[, -5], type = "response")
table(train_predict, train_set$Species)
##
## train_predict setosa versicolor virginica
## setosa 36 0 0
## versicolor 0 35 0
## virginica 0 0 41
# Random forest predictions on the held-out test data.
test_predict <- predict(rf1, test_set[, -5], type = "response")
table(test_predict, test_set$Species)
##
## test_predict setosa versicolor virginica
## setosa 14 0 0
## versicolor 0 15 1
## virginica 0 0 8
8.5 Neuronales Netzwerk
8.5.1 Standardisierung und Aufteilung der Daten
# Recode the response as a 0-based numeric class label and convert the
# whole data set to a matrix (input format for the neural network).
st_iris[, 5] <- as.numeric(iris$Species) - 1
st_iris <- as.matrix(st_iris)

# Feature matrices only; column 5 holds the class label.
train_set <- st_iris[train, -5]
test_set  <- st_iris[-train, -5]
8.5.2 Trainieren des Neuronalen Netzwerks
8.5.3 Evaluation
# Evaluate the trained network on the test data.
# NOTE(review): the target variable name was garbled by extraction
# ("test_settarget") — confirm against the (missing) training section.
score <- dl %>%
  evaluate(test_set, test_settarget, batch_size = 128)
print(score)
## loss accuracy
## 0.03793639 1.00000000
# class <- predict_classes(dl, test_set) # deprecated in newer keras versions
# NOTE: `class` shadows the base function class() for the rest of the script.
class <- dl %>% predict(test_set) %>% k_argmax()
class_numeric <- as.numeric(class)

# Confusion matrix: predicted class index vs. true (0-based) label
table(as.numeric(class), st_iris[-train, 5])
##
## 0 1 2
## 0 14 0 0
## 1 0 15 0
## 2 0 0 9
8.6 H20
Internetverbundenes Paket, basierend auf Java!!
ich werde es nicht herunterladen, deswegen nur den ersten Block mitgeschrieben
# Initialize a local H2O cluster (requires Java; not executed in this document).
localH20 <- h2o.init(
  ip = "localhost",
  port = 54321,
  startH2O = TRUE,    # allow starting a local H2O instance
  nthreads = -1,      # use all available CPU cores
  max_mem_size = "2G"
)
8.7 Aufgabenblatt
library(tidyverse)
library(GGally)
library(pastecs)
load("data/biathlon4.RData")
load("data/biathlon3.RData")
a) Betrachten Sie zunächst alle Variablen im Datensatz und analysieren Sie diese hinsichtlich Ihrer Lage und Streuung. Lassen sich irgendwelche Auffälligkeiten feststellen? Wenn ja sollten Sie überlegen, wie Sie diese bereinigen bzw. beseitigen könnten.
# Location and spread of the relevant variables (pastecs::stat.desc);
# `type` is a factor, so its statistics come out as NA.
test %>%
  dplyr::select(c(total.time, shoot.times.total, fails.total, type)) %>%
  stat.desc(basic = FALSE)
## total.time shoot.times.total fails.total
## median 1.808850e+03 92.0000000 2.00000000
## mean 1.893898e+03 95.6011581 2.81648027
## SE.mean 7.487072e+00 0.3059346 0.01727304
## CI.mean.0.95 1.467577e+01 0.5996773 0.03385772
## var 7.217802e+05 1204.2060904 3.84165770
## std.dev 8.495765e+02 34.7016727 1.96001472
## coef.var 4.485861e-01 0.3629838 0.69590927
## type
## median NA
## mean NA
## SE.mean NA
## CI.mean.0.95 NA
## var NA
## std.dev NA
## coef.var NA
# Pairwise scatter plots, densities and correlations of selected variables.
test %>%
  dplyr::select(c(total.time, shoot.times.total, tot.climb)) %>%
  ggpairs()
## Warning in ggally_statistic(data = data, mapping =
## mapping, na.rm = na.rm, : Removed 10 rows containing
## missing values
## Warning in ggally_statistic(data = data, mapping =
## mapping, na.rm = na.rm, : Removed 57 rows containing
## missing values
## Warning: Removed 10 rows containing missing values
## (geom_point).
## Warning: Removed 10 rows containing non-finite values
## (stat_density).
## Warning in ggally_statistic(data = data, mapping =
## mapping, na.rm = na.rm, : Removed 67 rows containing
## missing values
## Warning: Removed 57 rows containing missing values
## (geom_point).
## Warning: Removed 67 rows containing missing values
## (geom_point).
## Warning: Removed 57 rows containing non-finite values
## (stat_density).
# Total time vs. total shooting time, colored by race type, one panel per gender.
test %>%
  ggplot(aes(x = total.time, y = shoot.times.total, color = type)) +
  geom_point() +
  facet_wrap(~gender)
## Warning: Removed 10 rows containing missing values
## (geom_point).
Komisch: beim letzten Graphen ist eine kleine Gruppe mit niedrigen Zeiten erkennbar, unabhängig vom Rest des Feldes
vielleicht abhängig von Teamsachen
# Flag team entries: a digit in the nation name is taken as a sign of a
# team result rather than an individual one.
test$is_team <- grepl("[0-9]", test$nation)
test %>%
  ggplot(aes(x = total.time, y = shoot.times.total, color = is_team)) +
  geom_point() +
  facet_grid(~gender)
## Warning: Removed 10 rows containing missing values
## (geom_point).
funktioniert so ein bisschen, nicht komplett, aber egal.
8.7.1 Filtern + Standardisieren des Datensatzes
Welche Variable wollen wir erkennen lassen?
# List all column names of the biathlon test data
colnames(test)
## [1] "nation" "gender"
## [3] "type" "total.time"
## [5] "course.lap.1" "course.lap.2"
## [7] "course.lap.3" "course.lap.4"
## [9] "course.lap.5" "course.total"
## [11] "shoot.times.1" "shoot.times.2"
## [13] "shoot.times.3" "shoot.times.4"
## [15] "shoot.times.total" "fails.1"
## [17] "fails.2" "fails.3"
## [19] "fails.4" "fails.total"
## [21] "max.climb" "tot.climb"
## [23] "height.diff" "is_team"
Extrahierung des Datensatzes
# Standardize (z-score) the selected predictors of the test data.
st_test <- test %>%
  dplyr::select(total.time, course.total, shoot.times.total, height.diff) %>%
  scale()

# Standardize the same predictors of the training data.
st_train <- train %>%
  dplyr::select(total.time, course.total, shoot.times.total, height.diff) %>%
  scale()