2 Der Tidyverse Approach
Import -> Tidy -> Transform / Visualise / Model -> Communicate
2.1 readr for import
library("readr")
Paket zum importieren verschiedener Formate Import der Dateien (nur mit working directory in “Einführungs” Folder)
#normal CSV
<- read_csv("data/flights.csv")
flights_csv #STATA
<- read_dta("data/flights.dta")
flights_dta #SPSS
<- read_sav("data/flights.sav") flights_sav
load("data/flights.RData")
=> verschiedene praktische Sachen, alles auch im cheatsheet
2.2 working with tidyr
library("tidyr")
different functions to wrangle datasets and clean them
gather
: gather columns into key value pairs
#pipe the table into a function
%>%
table4a gather("1999","2000", key="year", value="cases")
## # A tibble: 6 × 3
## country year cases
## <chr> <chr> <int>
## 1 Afghanistan 1999 745
## 2 Brazil 1999 37737
## 3 China 1999 212258
## 4 Afghanistan 2000 2666
## 5 Brazil 2000 80488
## 6 China 2000 213766
spread
: spread key value pair across columns
#pipe the table into a function
%>%
table2 spread(key="type", value="count", convert = T)
## # A tibble: 6 × 4
## country year cases population
## <chr> <int> <int> <int>
## 1 Afghanistan 1999 745 19987071
## 2 Afghanistan 2000 2666 20595360
## 3 Brazil 1999 37737 172006362
## 4 Brazil 2000 80488 174504898
## 5 China 1999 212258 1272915272
## 6 China 2000 213766 1280428583
separate
: turn single character column into multiple columns
%>%
table3 separate(rate, into = c("cases", "population"), convert = T)
## # A tibble: 6 × 4
## country year cases population
## <chr> <int> <int> <int>
## 1 Afghanistan 1999 745 19987071
## 2 Afghanistan 2000 2666 20595360
## 3 Brazil 1999 37737 172006362
## 4 Brazil 2000 80488 174504898
## 5 China 1999 212258 1272915272
## 6 China 2000 213766 1280428583
unite
: unit columns into one
%>%
table5 unite("century", "year", col = "year", sep="")
## # A tibble: 6 × 3
## country year rate
## <chr> <chr> <chr>
## 1 Afghanistan 1999 745/19987071
## 2 Afghanistan 2000 2666/20595360
## 3 Brazil 1999 37737/172006362
## 4 Brazil 2000 80488/174504898
## 5 China 1999 212258/1272915272
## 6 China 2000 213766/1280428583
2.3 dplyr
Datentransformation mit dplyr
library("dplyr")
library("nycflights13")
Einladen des Datensatzes filter
: einzelne Zeilen aus dem Datensatz raúsziehen
%>%
flights filter(month == 1, day == 1)
## # A tibble: 842 × 19
## year month day dep_time sched_dep_time dep_delay
## <int> <int> <int> <int> <int> <dbl>
## 1 2013 1 1 517 515 2
## 2 2013 1 1 533 529 4
## 3 2013 1 1 542 540 2
## 4 2013 1 1 544 545 -1
## 5 2013 1 1 554 600 -6
## 6 2013 1 1 554 558 -4
## 7 2013 1 1 555 600 -5
## 8 2013 1 1 557 600 -3
## 9 2013 1 1 557 600 -3
## 10 2013 1 1 558 600 -2
## # … with 832 more rows, and 13 more variables:
## # arr_time <int>, sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
%>%
flights filter(arr_delay >= 60)
## # A tibble: 28,317 × 19
## year month day dep_time sched_dep_time dep_delay
## <int> <int> <int> <int> <int> <dbl>
## 1 2013 1 1 811 630 101
## 2 2013 1 1 848 1835 853
## 3 2013 1 1 957 733 144
## 4 2013 1 1 1114 900 134
## 5 2013 1 1 1120 944 96
## 6 2013 1 1 1255 1200 55
## 7 2013 1 1 1301 1150 71
## 8 2013 1 1 1337 1220 77
## 9 2013 1 1 1342 1320 22
## 10 2013 1 1 1356 1259 57
## # … with 28,307 more rows, and 13 more variables:
## # arr_time <int>, sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
arrange
: ordnen von Daten
%>%
flights arrange(dep_time)
## # A tibble: 336,776 × 19
## year month day dep_time sched_dep_time dep_delay
## <int> <int> <int> <int> <int> <dbl>
## 1 2013 1 13 1 2249 72
## 2 2013 1 31 1 2100 181
## 3 2013 11 13 1 2359 2
## 4 2013 12 16 1 2359 2
## 5 2013 12 20 1 2359 2
## 6 2013 12 26 1 2359 2
## 7 2013 12 30 1 2359 2
## 8 2013 2 11 1 2100 181
## 9 2013 2 24 1 2245 76
## 10 2013 3 8 1 2355 6
## # … with 336,766 more rows, and 13 more variables:
## # arr_time <int>, sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
select
: einzelne spalten
%>%
flights ::select(year : day) dplyr
## # A tibble: 336,776 × 3
## year month day
## <int> <int> <int>
## 1 2013 1 1
## 2 2013 1 1
## 3 2013 1 1
## 4 2013 1 1
## 5 2013 1 1
## 6 2013 1 1
## 7 2013 1 1
## 8 2013 1 1
## 9 2013 1 1
## 10 2013 1 1
## # … with 336,766 more rows
rename
: Umbenennung von Spalten namen
::rename(flights, arriving = arr_time) dplyr
## # A tibble: 336,776 × 19
## year month day dep_time sched_dep_time dep_delay
## <int> <int> <int> <int> <int> <dbl>
## 1 2013 1 1 517 515 2
## 2 2013 1 1 533 529 4
## 3 2013 1 1 542 540 2
## 4 2013 1 1 544 545 -1
## 5 2013 1 1 554 600 -6
## 6 2013 1 1 554 558 -4
## 7 2013 1 1 555 600 -5
## 8 2013 1 1 557 600 -3
## 9 2013 1 1 557 600 -3
## 10 2013 1 1 558 600 -2
## # … with 336,766 more rows, and 13 more variables:
## # arriving <int>, sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
mutate
: eurezugt neue Variable aus alten
%>%
flights ::select(distance,air_time) %>%
dplyr::mutate(speed = distance/air_time * 60) dplyr
## # A tibble: 336,776 × 3
## distance air_time speed
## <dbl> <dbl> <dbl>
## 1 1400 227 370.
## 2 1416 227 374.
## 3 1089 160 408.
## 4 1576 183 517.
## 5 762 116 394.
## 6 719 150 288.
## 7 1065 158 404.
## 8 229 53 259.
## 9 944 140 405.
## 10 733 138 319.
## # … with 336,766 more rows
summarize
und count
%>%
flights ::summarize(delay_average = mean(dep_delay, na.rm = T) ) dplyr
## # A tibble: 1 × 1
## delay_average
## <dbl>
## 1 12.6
groupby
: Gruppieren von Daten
%>%
flights group_by(carrier) %>%
summarize(del_avg = mean(dep_delay, na.rm = T),
del_sd= sd(dep_delay, na.rm = T),
del_median = median(dep_delay, na.rm = T)
)
## # A tibble: 16 × 4
## carrier del_avg del_sd del_median
## <chr> <dbl> <dbl> <dbl>
## 1 9E 16.7 45.9 -2
## 2 AA 8.59 37.4 -3
## 3 AS 5.80 31.4 -3
## 4 B6 13.0 38.5 -1
## 5 DL 9.26 39.7 -2
## 6 EV 20.0 46.6 -1
## 7 F9 20.2 58.4 0.5
## 8 FL 18.7 52.7 1
## 9 HA 4.90 74.1 -4
## 10 MQ 10.6 39.2 -3
## 11 OO 12.6 43.1 -6
## 12 UA 12.1 35.7 0
## 13 US 3.78 28.1 -4
## 14 VX 12.9 44.8 0
## 15 WN 17.7 43.3 1
## 16 YV 19.0 49.2 -2
join
: zusammenfügen verschiedener Datensätze
%>%
flights ::select(carrier, dep_time) %>%
dplyrleft_join(airlines, by = "carrier")
## # A tibble: 336,776 × 3
## carrier dep_time name
## <chr> <int> <chr>
## 1 UA 517 United Air Lines Inc.
## 2 UA 533 United Air Lines Inc.
## 3 AA 542 American Airlines Inc.
## 4 B6 544 JetBlue Airways
## 5 DL 554 Delta Air Lines Inc.
## 6 UA 554 United Air Lines Inc.
## 7 B6 555 JetBlue Airways
## 8 EV 557 ExpressJet Airlines Inc.
## 9 B6 557 JetBlue Airways
## 10 AA 558 American Airlines Inc.
## # … with 336,766 more rows
2.4 Aufgabenblatt
Starwars - Datensatz einladen
2.4.1 tidyr-arbeit:
load("data/starwars.RData")
#1
%>%
starwars2 ::gather("feminine", "masculine", , key = "gender", value = "mass", na.rm = T) tidyr
## # A tibble: 58 × 6
## name height hair_color `<NA>` gender mass
## <chr> <int> <chr> <dbl> <chr> <dbl>
## 1 Adi Gallia 184 none NA femini… 50
## 2 Ayla Secura 178 none NA femini… 55
## 3 Barriss Offee 166 black NA femini… 50
## 4 Beru Whitesun… 165 brown NA femini… 75
## 5 Leia Organa 150 brown NA femini… 49
## 6 Luminara Undu… 170 black NA femini… 56.2
## 7 Padmé Amidala 165 brown NA femini… 45
## 8 Shaak Ti 178 none NA femini… 57
## 9 Zam Wesell 168 blonde NA femini… 55
## 10 Ackbar 180 none NA mascul… 83
## # … with 48 more rows
#2 !Not finished
::unite(starwars2, "feminine", "masculine", col = "gender" , na.rm = T) tidyr
## # A tibble: 87 × 5
## name height hair_color gender `<NA>`
## <chr> <int> <chr> <chr> <dbl>
## 1 Ackbar 180 none "83" NA
## 2 Adi Gallia 184 none "50" NA
## 3 Anakin Skywalker 188 blond "84" NA
## 4 Arvel Crynyd NA brown "" NA
## 5 Ayla Secura 178 none "55" NA
## 6 Bail Prestor Organa 191 black "" NA
## 7 Barriss Offee 166 black "50" NA
## 8 BB8 NA none "" NA
## 9 Ben Quadinaros 163 none "65" NA
## 10 Beru Whitesun lars 165 brown "75" NA
## # … with 77 more rows
#3
::spread(starwars3, key=body_measure, value = body_values, convert = T) tidyr
## # A tibble: 87 × 5
## name hair_color gender height mass
## <chr> <chr> <chr> <int> <dbl>
## 1 Ackbar none masculine 180 83
## 2 Adi Gallia none feminine 184 50
## 3 Anakin Skywalker blond masculine 188 84
## 4 Arvel Crynyd brown masculine NA NA
## 5 Ayla Secura none feminine 178 55
## 6 Bail Prestor Organa black masculine 191 NA
## 7 Barriss Offee black feminine 166 50
## 8 BB8 none masculine NA NA
## 9 Ben Quadinaros none masculine 163 65
## 10 Beru Whitesun lars brown feminine 165 75
## # … with 77 more rows
#4 problems with doulbes and integers
::separate(starwars4, mass_height, into=c("mass", "height"),convert = T) tidyr
## Warning: Expected 2 pieces. Additional pieces discarded
## in 2 rows [21, 61].
## # A tibble: 87 × 5
## name hair_color gender mass height
## <chr> <chr> <chr> <int> <int>
## 1 Luke Skywalker blond mascul… 77 172
## 2 C-3PO <NA> mascul… 75 167
## 3 R2-D2 <NA> mascul… 32 96
## 4 Darth Vader none mascul… 136 202
## 5 Leia Organa brown femini… 49 150
## 6 Owen Lars brown, grey mascul… 120 178
## 7 Beru Whitesun lars brown femini… 75 165
## 8 R5-D4 <NA> mascul… 32 97
## 9 Biggs Darklighter black mascul… 84 183
## 10 Obi-Wan Kenobi auburn, white mascul… 77 182
## # … with 77 more rows
#5
::unite(starwars5, hair_color1, hair_color2, col = "haircolor", na.rm = T, sep="-") tidyr
## # A tibble: 87 × 5
## name height mass haircolor gender
## <chr> <int> <dbl> <chr> <chr>
## 1 Luke Skywalker 172 77 "blond" mascu…
## 2 C-3PO 167 75 "" mascu…
## 3 R2-D2 96 32 "" mascu…
## 4 Darth Vader 202 136 "none" mascu…
## 5 Leia Organa 150 49 "brown" femin…
## 6 Owen Lars 178 120 "brown-grey" mascu…
## 7 Beru Whitesun lars 165 75 "brown" femin…
## 8 R5-D4 97 32 "" mascu…
## 9 Biggs Darklighter 183 84 "black" mascu…
## 10 Obi-Wan Kenobi 182 77 "auburn-white" mascu…
## # … with 77 more rows
2.4.2 dplyr-arbeit:
#1
%>%
starwars filter(mass > 75, mass < 100, height > 180, height < 190) %>%
count(eye_color)
## # A tibble: 5 × 2
## eye_color n
## <chr> <int>
## 1 black 1
## 2 blue 1
## 3 blue-gray 1
## 4 brown 6
## 5 yellow 1
#2
%>%
starwars filter(is.na(height) | is.na(height))
## # A tibble: 6 × 16
## name height mass hair_color skin_color eye_color
## <chr> <int> <dbl> <chr> <chr> <chr>
## 1 Arvel Cr… NA NA brown fair brown
## 2 Finn NA NA black dark dark
## 3 Rey NA NA brown light hazel
## 4 Poe Dame… NA NA brown light brown
## 5 BB8 NA NA none none black
## 6 Captain … NA NA unknown unknown unknown
## # … with 10 more variables: birth_year <dbl>,
## # sex <chr>, gender <chr>, homeworld <chr>,
## # species <chr>, films <list>, vehicles <list>,
## # starships <list>, star_string <chr>,
## # films_low <chr>
#3
::arrange(starwars, -desc(mass)) dplyr
## # A tibble: 87 × 16
## name height mass hair_color skin_color eye_color
## <chr> <int> <dbl> <chr> <chr> <chr>
## 1 Ratts T… 79 15 none grey, blue unknown
## 2 Yoda 66 17 white green brown
## 3 Wicket … 88 20 brown brown brown
## 4 R2-D2 96 32 <NA> white, bl… red
## 5 R5-D4 97 32 <NA> white, red red
## 6 Sebulba 112 40 none grey, red orange
## 7 Dud Bolt 94 45 none blue, grey yellow
## 8 Padmé A… 165 45 brown light brown
## 9 Wat Tam… 193 48 none green, gr… unknown
## 10 Sly Moo… 178 48 none pale white
## # … with 77 more rows, and 10 more variables:
## # birth_year <dbl>, sex <chr>, gender <chr>,
## # homeworld <chr>, species <chr>, films <list>,
## # vehicles <list>, starships <list>,
## # star_string <chr>, films_low <chr>
::arrange(starwars, desc(height)) dplyr
## # A tibble: 87 × 16
## name height mass hair_color skin_color eye_color
## <chr> <int> <dbl> <chr> <chr> <chr>
## 1 Yarael… 264 NA none white yellow
## 2 Tarfful 234 136 brown brown blue
## 3 Lama Su 229 88 none grey black
## 4 Chewba… 228 112 brown unknown blue
## 5 Roos T… 224 82 none grey orange
## 6 Grievo… 216 159 none brown, whi… green, y…
## 7 Taun We 213 NA none grey black
## 8 Rugor … 206 NA none green orange
## 9 Tion M… 206 80 none grey black
## 10 Darth … 202 136 none white yellow
## # … with 77 more rows, and 10 more variables:
## # birth_year <dbl>, sex <chr>, gender <chr>,
## # homeworld <chr>, species <chr>, films <list>,
## # vehicles <list>, starships <list>,
## # star_string <chr>, films_low <chr>
#Darth Vader height = 202
#4
%>%
starwars ::filter(species != "Droid") %>%
dplyr::select(name:mass) %>%
dplyr::mutate(bmi = mass/((height/100)**2)) dplyr
## # A tibble: 77 × 4
## name height mass bmi
## <chr> <int> <dbl> <dbl>
## 1 Luke Skywalker 172 77 26.0
## 2 Darth Vader 202 136 33.3
## 3 Leia Organa 150 49 21.8
## 4 Owen Lars 178 120 37.9
## 5 Beru Whitesun lars 165 75 27.5
## 6 Biggs Darklighter 183 84 25.1
## 7 Obi-Wan Kenobi 182 77 23.2
## 8 Anakin Skywalker 188 84 23.8
## 9 Wilhuff Tarkin 180 NA NA
## 10 Chewbacca 228 112 21.5
## # … with 67 more rows
#5
%>%
starwars ::rename("größe" = "height" ) %>%
dplyrrename("gewicht"= "mass") %>%
rename("haarfarbe"= "hair_color" ) %>%
rename("hautfarbe"= "skin_color")
## # A tibble: 87 × 16
## name größe gewicht haarfarbe hautfarbe eye_color
## <chr> <int> <dbl> <chr> <chr> <chr>
## 1 Luke Sk… 172 77 blond fair blue
## 2 C-3PO 167 75 <NA> gold yellow
## 3 R2-D2 96 32 <NA> white, b… red
## 4 Darth V… 202 136 none white yellow
## 5 Leia Or… 150 49 brown light brown
## 6 Owen La… 178 120 brown, gr… light blue
## 7 Beru Wh… 165 75 brown light blue
## 8 R5-D4 97 32 <NA> white, r… red
## 9 Biggs D… 183 84 black light brown
## 10 Obi-Wan… 182 77 auburn, w… fair blue-gray
## # … with 77 more rows, and 10 more variables:
## # birth_year <dbl>, sex <chr>, gender <chr>,
## # homeworld <chr>, species <chr>, films <list>,
## # vehicles <list>, starships <list>,
## # star_string <chr>, films_low <chr>
#6
%>%
starwars ::filter(species != "Droid") %>%
dplyr::filter(!is.na(height) , !is.na(mass)) %>%
dplyr::select(name:mass) %>%
dplyr::mutate(bmi = mass/((height/100)**2)) %>%
dplyr::summarize(
dplyrmass_mean = mean(mass),
mass_median = median(mass),
mass_sd = sd(mass),
height_mean = mean(height),
height_median = median(height),
height_sd = sd(height),
bmi_mean = mean(bmi),
bmi_median = median(bmi),
bmi_sd = sd(bmi),
)
## # A tibble: 1 × 9
## mass_mean mass_median mass_sd height_mean
## <dbl> <dbl> <dbl> <dbl>
## 1 100. 79 177. 176.
## # … with 5 more variables: height_median <dbl>,
## # height_sd <dbl>, bmi_mean <dbl>, bmi_median <dbl>,
## # bmi_sd <dbl>
#7
%>%
starwars ::filter(!is.na(height) , !is.na(mass)) %>%
dplyr::select(name:mass, species) %>%
dplyr::mutate(bmi = mass/((height/100)**2)) %>%
dplyr::group_by(species) %>%
dplyr::summarize(
dplyrmass_mean = mean(mass),
mass_median = median(mass),
mass_sd = sd(mass),
height_mean = mean(height),
height_median = median(height),
height_sd = sd(height),
bmi_mean = mean(bmi),
bmi_median = median(bmi),
bmi_sd = sd(bmi),
)
## # A tibble: 32 × 10
## species mass_mean mass_median mass_sd height_mean
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Aleena 15 15 NA 79
## 2 Besalisk 102 102 NA 198
## 3 Cerean 82 82 NA 198
## 4 Clawdite 55 55 NA 168
## 5 Droid 69.8 53.5 51.0 140
## 6 Dug 40 40 NA 112
## 7 Ewok 20 20 NA 88
## 8 Geonosian 80 80 NA 183
## 9 Gungan 74 74 11.3 210
## 10 Human 82.8 79 19.4 180.
## # … with 22 more rows, and 5 more variables:
## # height_median <dbl>, height_sd <dbl>,
## # bmi_mean <dbl>, bmi_median <dbl>, bmi_sd <dbl>
#8
%>%
starwars ::filter(!is.na(height) , !is.na(mass)) %>%
dplyr::select(name:mass, gender, species) %>%
dplyr::mutate(bmi = mass/((height/100)**2)) %>%
dplyr::group_by(gender, species) %>%
dplyr::summarize(
dplyrmass_mean = mean(mass),
mass_median = median(mass),
mass_sd = sd(mass),
height_mean = mean(height),
height_median = median(height),
height_sd = sd(height),
bmi_mean = mean(bmi),
bmi_median = median(bmi),
bmi_sd = sd(bmi),
)
## `summarise()` has grouped output by 'gender'. You can override using the `.groups` argument.
## # A tibble: 33 × 11
## # Groups: gender [3]
## gender species mass_mean mass_median mass_sd
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 feminine Clawdite 55 55 NA
## 2 feminine Human 56.3 49 16.3
## 3 feminine Mirialan 53.1 53.1 4.38
## 4 feminine Tholothian 50 50 NA
## 5 feminine Togruta 57 57 NA
## 6 feminine Twi'lek 55 55 NA
## 7 masculine Aleena 15 15 NA
## 8 masculine Besalisk 102 102 NA
## 9 masculine Cerean 82 82 NA
## 10 masculine Droid 69.8 53.5 51.0
## # … with 23 more rows, and 6 more variables:
## # height_mean <dbl>, height_median <dbl>,
## # height_sd <dbl>, bmi_mean <dbl>, bmi_median <dbl>,
## # bmi_sd <dbl>
#9
count(starwars, gender)
## # A tibble: 3 × 2
## gender n
## <chr> <int>
## 1 feminine 17
## 2 masculine 66
## 3 <NA> 4
#10
::left_join(starwars6,starwars7, on = "name") dplyr
## Joining, by = "name"
## # A tibble: 87 × 14
## name height mass birth_year hair_color skin_color
## <chr> <int> <dbl> <dbl> <chr> <chr>
## 1 Luke S… 172 77 19 blond fair
## 2 C-3PO 167 75 112 <NA> gold
## 3 R2-D2 96 32 33 <NA> white, bl…
## 4 Darth … 202 136 41.9 none white
## 5 Leia O… 150 49 19 brown light
## 6 Owen L… 178 120 52 brown, gr… light
## 7 Beru W… 165 75 47 brown light
## 8 R5-D4 97 32 NA <NA> white, red
## 9 Biggs … 183 84 24 black light
## 10 Obi-Wa… 182 77 57 auburn, w… fair
## # … with 77 more rows, and 8 more variables:
## # eye_color <chr>, sex <chr>, gender <chr>,
## # homeworld <chr>, species <chr>, films <list>,
## # vehicles <list>, starships <list>
yeehaw, fertig