8 Maschinelles Lernen / Data Mining

=> Lösen von Klassifikationsproblemen

8.1 Import all packages

8.2 Datensatz einlesen

# Preview the first six rows of the built-in iris data set.
head(iris, n = 6L)

##   Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1          5.1         3.5          1.4         0.2
## 2          4.9         3.0          1.4         0.2
## 3          4.7         3.2          1.3         0.2
## 4          4.6         3.1          1.5         0.2
## 5          5.0         3.6          1.4         0.2
## 6          5.4         3.9          1.7         0.4
##   Species
## 1  setosa
## 2  setosa
## 3  setosa
## 4  setosa
## 5  setosa
## 6  setosa

# Standardise the four measurement columns of iris (column 5, Species, is
# excluded from scaling) and re-attach the unchanged species labels.
st_iris <- iris[, -5] %>%
  scale() %>%
  as.data.frame() %>%
  tibble() %>%
  add_column(Species = iris$Species)

8.2.1 Beschreibung

##              Sepal.Length Sepal.Width Petal.Length
## median         5.80000000  3.00000000    4.3500000
## mean           5.84333333  3.05733333    3.7580000
## SE.mean        0.06761132  0.03558833    0.1441360
## CI.mean.0.95   0.13360085  0.07032302    0.2848146
## var            0.68569351  0.18997942    3.1162779
## std.dev        0.82806613  0.43586628    1.7652982
## coef.var       0.14171126  0.14256420    0.4697441
##              Petal.Width Species
## median        1.30000000      NA
## mean          1.19933333      NA
## SE.mean       0.06223645      NA
## CI.mean.0.95  0.12298004      NA
## var           0.58100626      NA
## std.dev       0.76223767      NA
## coef.var      0.63555114      NA

8.2.2 Visualisieren

## plot: [1,1] [=>------------------------] 6% est: 0s
## plot: [1,2] [==>-----------------------] 12% est: 1s
## plot: [1,3] [====>---------------------] 19% est: 1s
## plot: [1,4] [=====>--------------------] 25% est: 1s
## plot: [2,1] [=======>------------------] 31% est: 1s
## plot: [2,2] [=========>----------------] 38% est: 1s
## plot: [2,3] [==========>---------------] 44% est: 0s
## plot: [2,4] [============>-------------] 50% est: 0s
## plot: [3,1] [==============>-----------] 56% est: 0s
## plot: [3,2] [===============>----------] 62% est: 0s
## plot: [3,3] [=================>--------] 69% est: 0s
## plot: [3,4] [===================>------] 75% est: 0s
## plot: [4,1] [====================>-----] 81% est: 0s
## plot: [4,2] [======================>---] 88% est: 0s
## plot: [4,3] [=======================>--] 94% est: 0s
## plot: [4,4] [==========================]100% est: 0s

8.3 Lineare Diskriminanzanalyse

8.3.1 Test / Train

# Draw a random 75 % of the row indices for TRAINING; the remaining rows
# form the test set. floor() makes the truncation of a fractional sample
# size explicit (sample() would otherwise truncate it silently).
# NOTE(review): no set.seed() call is visible before this line, so the split
# presumably differs between runs — confirm whether that is intended.
train <- sample(seq_len(nrow(st_iris)), floor(0.75 * nrow(st_iris)))
train_set <- st_iris[train, ]
test_set <- st_iris[-train, ]

8.3.2 Modelltrainierung

# Fit a linear discriminant model of Species on all other columns of the
# training rows.
lda_train <- lda(Species ~ ., data = train_set)

# Confusion table on the training rows themselves:
# predicted class in rows, true species in columns.
train_pred <- predict(object = lda_train, newdata = train_set)$class
table(train_pred, train_set$Species)

##             
## train_pred   setosa versicolor virginica
##   setosa         36          0         0
##   versicolor      0         36         1
##   virginica       0          2        37

8.3.3 Bewertung anhand der Testdaten

# Confusion table on the held-out rows:
# predicted class in rows, true species in columns.
test_pred <- predict(object = lda_train, newdata = test_set)$class
table(test_pred, test_set$Species)

##             
## test_pred    setosa versicolor virginica
##   setosa         14          0         0
##   versicolor      0         12         0
##   virginica       0          0        12

8.4 Random Forest Methode

8.4.1 Bildung mehrerer Entscheidungsbäume

# Repeat the train/test split: a fresh random 75 % of the row indices is
# used for TRAINING, the remaining rows for testing. floor() makes the
# truncation of a fractional sample size explicit.
# NOTE(review): no set.seed() call is visible before this line, so the split
# presumably differs between runs — confirm whether that is intended.
train <- sample(seq_len(nrow(st_iris)), floor(0.75 * nrow(st_iris)))
train_set <- st_iris[train, ]
test_set <- st_iris[-train, ]


# Random forest classifier for Species on the training rows.
# TRUE is spelled out because the shorthand T is an ordinary variable that
# can be reassigned.
rf1 <- randomForest(
  Species ~ .,
  data = train_set,
  ntree = 100,       # number of trees grown
  mtry = 2,          # variables tried at each split
  importance = TRUE, # also assess the importance of the predictors
  proximity = TRUE   # also compute the proximity measure among rows
)

# Print the fitted forest (OOB error estimate and confusion matrix).
rf1

## 
## Call:
##  randomForest(formula = Species ~ ., data = train_set, ntree = 100,      mtry = 2, importance = T, proximity = T) 
##                Type of random forest: classification
##                      Number of trees: 100
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 8.04%
## Confusion matrix:
##            setosa versicolor virginica class.error
## setosa         36          0         0   0.0000000
## versicolor      0         31         4   0.1142857
## virginica       0          5        36   0.1219512

# Predicted classes for the training rows (column 5, Species, removed from
# the predictors), cross-tabulated against the true species.
train_predict <- predict(
  object = rf1,
  newdata = train_set[, -5],
  type = "response"
)
table(train_predict, train_set$Species)

##              
## train_predict setosa versicolor virginica
##    setosa         36          0         0
##    versicolor      0         35         0
##    virginica       0          0        41

# Predicted classes for the held-out rows (column 5, Species, removed from
# the predictors), cross-tabulated against the true species.
test_predict <- predict(
  object = rf1,
  newdata = test_set[, -5],
  type = "response"
)
table(test_predict, test_set$Species)

##             
## test_predict setosa versicolor virginica
##   setosa         14          0         0
##   versicolor      0         15         1
##   virginica       0          0         8

8.5 Neuronales Netzwerk

8.5.1 Standardisierung und Aufteilung der Daten

# Replace column 5 (Species) with zero-based numeric class codes
# (factor level index minus 1), then turn the whole table into a matrix.
st_iris[,5] <- as.numeric(iris$Species) -1
st_iris <- as.matrix(st_iris)

# Feature matrices only: column 5 (the class code) is dropped. The row
# indices in `train` are reused from the most recent split.
train_set <- st_iris[train,-5]
test_set <- st_iris[-train,-5]

8.5.2 Trainieren des Neuronalen Netzwerks

8.5.3 Evaluation

# Evaluate the model `dl` on the held-out feature matrix and print the
# resulting score.
# NOTE(review): neither `dl` nor `test_settarget` is defined in the visible
# code (the training chunk is not shown) — confirm that `test_settarget` is
# the intended label object and not a garbled name.
score <- dl %>% 
  evaluate(test_set, test_settarget, batch_size = 128)
print(score)

##       loss   accuracy 
## 0.03793639 1.00000000

# Deprecated alternative: class <- predict_classes(dl, test_set)
# Take the arg-max over the model's predictions to get one class index per
# test row.
class <- k_argmax(predict(dl, test_set))
class_numeric <- as.numeric(class)

# Predicted class codes (rows) vs. true class codes from column 5 (columns).
table(as.numeric(class), st_iris[-train, 5])

##    
##      0  1  2
##   0 14  0  0
##   1  0 15  0
##   2  0  0  9

8.6 H2O

Internetverbundenes Paket, basierend auf Java!

Ich werde es nicht herunterladen, deswegen habe ich nur den ersten Block mitgeschrieben.

# Initialise the connection to an H2O instance on this machine.
# Uses `<-` for assignment and TRUE instead of the reassignable shorthand T.
localH20 <- h2o.init(
  ip = "localhost",
  port = 54321,
  startH2O = TRUE,     # start a local instance if none is running yet
  nthreads = -1,       # -1: use all available CPUs (per the h2o.init docs)
  max_mem_size = "2G"
)

8.7 Aufgabenblatt

library(tidyverse)
library(GGally)
library(pastecs)

# Load the saved biathlon workspaces into the current environment.
# NOTE(review): the object names stored in these files are not visible here;
# later code uses `test` and `train`, presumably supplied by these files —
# confirm.
load("data/biathlon4.RData")
load("data/biathlon3.RData")

a) Betrachten Sie zunächst alle Variablen im Datensatz und analysieren Sie diese hinsichtlich ihrer Lage und Streuung. Lassen sich irgendwelche Auffälligkeiten feststellen? Wenn ja, sollten Sie überlegen, wie Sie diese bereinigen bzw. beseitigen könnten.

# Location and spread statistics for the selected columns. basic = FALSE
# leaves out the basic count/range statistics; the non-numeric `type`
# column yields NA throughout. FALSE is spelled out because the shorthand F
# is an ordinary variable that can be reassigned.
test %>%
  dplyr::select(total.time, shoot.times.total, fails.total, type) %>%
  stat.desc(basic = FALSE)

##                total.time shoot.times.total fails.total
## median       1.808850e+03        92.0000000  2.00000000
## mean         1.893898e+03        95.6011581  2.81648027
## SE.mean      7.487072e+00         0.3059346  0.01727304
## CI.mean.0.95 1.467577e+01         0.5996773  0.03385772
## var          7.217802e+05      1204.2060904  3.84165770
## std.dev      8.495765e+02        34.7016727  1.96001472
## coef.var     4.485861e-01         0.3629838  0.69590927
##              type
## median         NA
## mean           NA
## SE.mean        NA
## CI.mean.0.95   NA
## var            NA
## std.dev        NA
## coef.var       NA

# Pairwise plot matrix of total time, total shooting time and total climb.
ggpairs(
  dplyr::select(test, total.time, shoot.times.total, tot.climb)
)

## Warning in ggally_statistic(data = data, mapping =
## mapping, na.rm = na.rm, : Removed 10 rows containing
## missing values

## Warning in ggally_statistic(data = data, mapping =
## mapping, na.rm = na.rm, : Removed 57 rows containing
## missing values

## Warning: Removed 10 rows containing missing values
## (geom_point).

## Warning: Removed 10 rows containing non-finite values
## (stat_density).

## Warning in ggally_statistic(data = data, mapping =
## mapping, na.rm = na.rm, : Removed 67 rows containing
## missing values

## Warning: Removed 57 rows containing missing values
## (geom_point).

## Warning: Removed 67 rows containing missing values
## (geom_point).

## Warning: Removed 57 rows containing non-finite values
## (stat_density).

# Total time against total shooting time, coloured by race type,
# with one panel per gender.
ggplot(test, aes(x = total.time, y = shoot.times.total, colour = type)) +
  geom_point() +
  facet_wrap(~gender)

## Warning: Removed 10 rows containing missing values
## (geom_point).

Komisch: Im letzten Graphen ist eine kleine Gruppe mit niedrigen Zeiten erkennbar, unabhängig vom Rest des Feldes.

Vielleicht hängt das mit Teamwettbewerben zusammen.

# Flag rows whose nation label contains a digit, taken here as a sign of a
# team entry, then redraw the scatter plot coloured by that flag.
test[["is_team"]] <- grepl("[0-9]", test[["nation"]])
ggplot(test, aes(x = total.time, y = shoot.times.total, colour = is_team)) +
  geom_point() +
  facet_grid(~gender)

## Warning: Removed 10 rows containing missing values
## (geom_point).

funktioniert so ein bisschen, nicht komplett, aber egal.

8.7.1 Filtern + Standardisieren des Datensatzes

Welche Variable wollen wir erkennen lassen?

# List the column names of the biathlon data frame.
names(test)

##  [1] "nation"            "gender"           
##  [3] "type"              "total.time"       
##  [5] "course.lap.1"      "course.lap.2"     
##  [7] "course.lap.3"      "course.lap.4"     
##  [9] "course.lap.5"      "course.total"     
## [11] "shoot.times.1"     "shoot.times.2"    
## [13] "shoot.times.3"     "shoot.times.4"    
## [15] "shoot.times.total" "fails.1"          
## [17] "fails.2"           "fails.3"          
## [19] "fails.4"           "fails.total"      
## [21] "max.climb"         "tot.climb"        
## [23] "height.diff"       "is_team"

Extraktion des Datensatzes

# Standardise the four numeric features.
# The training set is scaled with its own column means and standard
# deviations. The test set is then scaled with the TRAINING parameters, so
# both matrices live on the same scale and no test-set statistics are used
# (previously each set was standardised independently).
# NOTE(review): assumes `train` is a data frame at this point, presumably
# supplied by the load() calls — confirm.
st_train <- train %>%
  dplyr::select(total.time, course.total, shoot.times.total, height.diff) %>%
  scale()

st_test <- test %>%
  dplyr::select(total.time, course.total, shoot.times.total, height.diff) %>%
  scale(
    center = attr(st_train, "scaled:center"),
    scale = attr(st_train, "scaled:scale")
  )