3 tidyverse mit stringr, forcats, lubridate
library("tidyverse")
3.1 stringr-Paket
paar Beispielstrings kreieren
<- "Hello"
string1 <- "Bye"
string2
<- "\u03BC" # mathematisches Zeichen mu
mu <- c(string1,string2, mu)
char_vector char_vector
## [1] "Hello" "Bye" "μ"
3.1.1 str_length()
Nutzen für Länge von Strings
str_length(char_vector)
## [1] 5 3 1
3.1.2 str_c()
Kombinieren von Strings
str_c(string1,string2, sep = "|")
## [1] "Hello|Bye"
3.1.3 str_sub()
zum Subsetten
<- c("apple", "banana", "pears")
x str_sub(x,start= 1,end= 3)
## [1] "app" "ban" "pea"
#überschreiben der 2- 4 buchstaben als Großbuchstaben
str_sub(x,start= 2,end= 4) <- str_to_upper(str_sub(x,2,4))
x
## [1] "aPPLe" "bANAna" "pEARs"
3.1.4 str_sort()
Sortieren
str_sort(x, locale = "en", decreasing = T )
## [1] "pEARs" "bANAna" "aPPLe"
3.1.5 str_view()
Regex Muster erkennen und markieren
str_view(x, pattern = "A")
wichtigste RegExs im stringr
cheatsheet !
3.1.6 str_detect()
RegEx Muster herausfiltern
str_detect(words, "^y")] # alle wörter die mit m anfangen words[
## [1] "year" "yes" "yesterday" "yet"
## [5] "you" "young"
sum(str_detect(words, "[aeiou]$")) # wörter mit vokal am ende
## [1] 271
!str_detect(words, "[aeiou]")] #wörter ohne vokal words[
## [1] "by" "dry" "fly" "mrs" "try" "why"
3.1.7 str_count()
mean(str_count(words, "[aeiou]")) #durschnitt vokale pro wort
## [1] 1.991837
3.1.8 beispiel
<- tibble(word = words, i = seq_along(word))
df head(df)
## # A tibble: 6 × 2
## word i
## <chr> <int>
## 1 a 1
## 2 able 2
## 3 about 3
## 4 absolute 4
## 5 accept 5
## 6 account 6
%>%
df mutate(vokale = str_count(word, "[aeiou]"), konsonanten = str_count(word, "[^aeiou]"))
## # A tibble: 980 × 4
## word i vokale konsonanten
## <chr> <int> <int> <int>
## 1 a 1 1 0
## 2 able 2 2 2
## 3 about 3 3 2
## 4 absolute 4 4 4
## 5 accept 5 2 4
## 6 account 6 3 4
## 7 achieve 7 4 3
## 8 across 8 2 4
## 9 act 9 1 2
## 10 active 10 3 3
## # … with 970 more rows
3.1.9 regex() funktion
statt patterns sind auch regex-matches mithilfe der regex()
funktion möglich
<- c("Banana", "banana", "BANANA")
ban %>%
ban str_view(regex("banana"))
3.2 forcats
nützliches Paket zur Arbeit mit Faktoren
# library(forcats)
<- c("Dez", "Jan", "Feb", "Apr")
string sort(string)
## [1] "Apr" "Dez" "Feb" "Jan"
merke: hier wird nach Alphabet sortiert
<- c(
monate_level "Jan", "Feb", "Mär", "Apr", "Jun", "Jul", "Aug", "Sep", "Okt", "Nov", "Dez"
)
3.2.1 factor()
Kodieren mit Faktoren
<- factor(string, monate_level)
faktor sort(faktor) # sortierung der Monate nach Faktoren
## [1] Jan Feb Apr Dez
## 11 Levels: Jan Feb Mär Apr Jun Jul Aug Sep Okt ... Dez
merke: zur Sortierung nach Level factor benutzen
3.2.2 Beispiel Einsatz
# Datensatz gss_cat
## # A tibble: 21,483 × 9
## year marital age race rincome partyid relig denom
## <int> <fct> <int> <fct> <fct> <fct> <fct> <fct>
## 1 2000 Never … 26 White $8000 … Ind,ne… Prot… Sout…
## 2 2000 Divorc… 48 White $8000 … Not st… Prot… Bapt…
## 3 2000 Widowed 67 White Not ap… Indepe… Prot… No d…
## 4 2000 Never … 39 White Not ap… Ind,ne… Orth… Not …
## 5 2000 Divorc… 25 White Not ap… Not st… None Not …
## 6 2000 Married 25 White $20000… Strong… Prot… Sout…
## 7 2000 Never … 36 White $25000… Not st… Chri… Not …
## 8 2000 Divorc… 44 White $7000 … Ind,ne… Prot… Luth…
## 9 2000 Married 44 White $25000… Not st… Prot… Other
## 10 2000 Married 47 White $25000… Strong… Prot… Sout…
## # … with 21,473 more rows, and 1 more variable:
## # tvhours <int>
sortieren nach religion
<- gss_cat %>%
relig group_by(relig) %>%
summarize(
avg_age = mean(age, na.rm=T),
avg_tv = mean(tvhours, na.rm = T),
n = n(),
)
ggplot(relig, aes(avg_tv, relig)) +
geom_point()
merke: plot ist unübersichtlich, deswegen jetzt anwendung factors
ggplot(relig, aes(avg_tv, fct_reorder(relig, avg_tv))) +
geom_point()
=> jetzt isses schön sortiert mit fct_reorder
möglicher Einsatz auch von fct_recode
zur Umbenennung von Faktoren
%>%
gss_cat mutate(race = fct_recode(race,
"Anders"= "Other",
"Weiß" = "White",
"Schwarz" = "Black"
))
## # A tibble: 21,483 × 9
## year marital age race rincome partyid relig denom
## <int> <fct> <int> <fct> <fct> <fct> <fct> <fct>
## 1 2000 Never … 26 Weiß $8000 … Ind,ne… Prot… Sout…
## 2 2000 Divorc… 48 Weiß $8000 … Not st… Prot… Bapt…
## 3 2000 Widowed 67 Weiß Not ap… Indepe… Prot… No d…
## 4 2000 Never … 39 Weiß Not ap… Ind,ne… Orth… Not …
## 5 2000 Divorc… 25 Weiß Not ap… Not st… None Not …
## 6 2000 Married 25 Weiß $20000… Strong… Prot… Sout…
## 7 2000 Never … 36 Weiß $25000… Not st… Chri… Not …
## 8 2000 Divorc… 44 Weiß $7000 … Ind,ne… Prot… Luth…
## 9 2000 Married 44 Weiß $25000… Not st… Prot… Other
## 10 2000 Married 47 Weiß $25000… Strong… Prot… Sout…
## # … with 21,473 more rows, and 1 more variable:
## # tvhours <int>
weitere wichtige Funktionen: - fct_collapse
zum Zusammenfassen von Merkmalen - fct_lump
auch Zusammenfassen irgendwie
3.3 lubridate
Paket zum Zeiten und Daten(Kalender)-Wrangeln
Arten von Zeit: time, date, datetime
now()
## [1] "2022-02-03 18:20:36 CET"
today()
## [1] "2022-02-03"
library(lubridate)
Wege, dates/times zu generieren:
- Strings
- individuelle Datetime komponenten
- aus vorhandenen Komponenten
3.3.1 1.ymd(), mdy(), dmy_hms()
aus string mit lubridate
dmy("14th September 2024")
## [1] "2024-09-14"
ymd("2021, December, 21")
## [1] "2021-12-21"
mdy(122108)
## [1] "2008-12-21"
ymd_hms("21 September 08 13:54:33")
## [1] "2021-09-08 13:54:33 UTC"
3.3.2 2. make_date()
library(nycflights13)
head(flights)
## # A tibble: 6 × 19
## year month day dep_time sched_dep_time dep_delay
## <int> <int> <int> <int> <int> <dbl>
## 1 2013 1 1 517 515 2
## 2 2013 1 1 533 529 4
## 3 2013 1 1 542 540 2
## 4 2013 1 1 544 545 -1
## 5 2013 1 1 554 600 -6
## 6 2013 1 1 554 558 -4
## # … with 13 more variables: arr_time <int>,
## # sched_arr_time <int>, arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>,
## # distance <dbl>, hour <dbl>, minute <dbl>,
## # time_hour <dttm>
%>%
flights ::select(year,month, day, hour, minute) %>%
dplyrmutate(
departure = make_datetime(year, month, day, hour, minute),
date = make_date(year,month,day)
)
## # A tibble: 336,776 × 7
## year month day hour minute departure
## <int> <int> <int> <dbl> <dbl> <dttm>
## 1 2013 1 1 5 15 2013-01-01 05:15:00
## 2 2013 1 1 5 29 2013-01-01 05:29:00
## 3 2013 1 1 5 40 2013-01-01 05:40:00
## 4 2013 1 1 5 45 2013-01-01 05:45:00
## 5 2013 1 1 6 0 2013-01-01 06:00:00
## 6 2013 1 1 5 58 2013-01-01 05:58:00
## 7 2013 1 1 6 0 2013-01-01 06:00:00
## 8 2013 1 1 6 0 2013-01-01 06:00:00
## 9 2013 1 1 6 0 2013-01-01 06:00:00
## 10 2013 1 1 6 0 2013-01-01 06:00:00
## # … with 336,766 more rows, and 1 more variable:
## # date <date>
3.3.3 3.aus vorhandenene datetimes
today()
## [1] "2022-02-03"
as_datetime(today())
## [1] "2022-02-03 UTC"
weiter fancy sachen
<- ymd_hms("20200914 151623")
datetime month(datetime) #ausgabe monat
## [1] 9
#ausgabe wochentag mit Parameter Montag als Anfang und Ausgabe des Namens
wday(datetime, week_start = getOption("lubridate.week.start" , 1), label = T) #ein montag
## [1] Mon
## 7 Levels: Mon < Tue < Wed < Thu < Fri < ... < Sun
yday(datetime) #258 tag des jahres
## [1] 258
3.4 Aufgabenblatt 3
3.4.1 Aufgabe 1
load("data/starwars.RData")
%>%
starwars ::select(hair_color, skin_color, eye_color) %>%
dplyrsummarize(
h_length = mean(str_length(hair_color), na.rm = T),
s_length = mean(str_length(skin_color), na.rm = T),
e_length = mean(str_length(eye_color), na.rm = T),
h_sd= sd(str_length(hair_color), na.rm = T),
s_sd = sd(str_length(skin_color), na.rm = T),
e_sd = sd(str_length(eye_color), na.rm = T),
)
## # A tibble: 1 × 6
## h_length s_length e_length h_sd s_sd e_sd
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 4.84 5.97 5.10 1.54 3.47 1.40
%>%
starwars mutate(hair_color = str_c("hair_color", hair_color, sep = ":"))
## # A tibble: 87 × 16
## name height mass hair_color skin_color eye_color
## <chr> <int> <dbl> <chr> <chr> <chr>
## 1 Luke S… 172 77 hair_color… fair blue
## 2 C-3PO 167 75 <NA> gold yellow
## 3 R2-D2 96 32 <NA> white, bl… red
## 4 Darth … 202 136 hair_color… white yellow
## 5 Leia O… 150 49 hair_color… light brown
## 6 Owen L… 178 120 hair_color… light blue
## 7 Beru W… 165 75 hair_color… light blue
## 8 R5-D4 97 32 <NA> white, red red
## 9 Biggs … 183 84 hair_color… light brown
## 10 Obi-Wa… 182 77 hair_color… fair blue-gray
## # … with 77 more rows, and 10 more variables:
## # birth_year <dbl>, sex <chr>, gender <chr>,
## # homeworld <chr>, species <chr>, films <list>,
## # vehicles <list>, starships <list>,
## # star_string <chr>, films_low <chr>
%>%
starwars unite(hair_color, eye_color, sep = " and ", col="combined")
## # A tibble: 87 × 15
## name height mass combined skin_color birth_year
## <chr> <int> <dbl> <chr> <chr> <dbl>
## 1 Luke S… 172 77 blond and… fair 19
## 2 C-3PO 167 75 NA and ye… gold 112
## 3 R2-D2 96 32 NA and red white, bl… 33
## 4 Darth … 202 136 none and … white 41.9
## 5 Leia O… 150 49 brown and… light 19
## 6 Owen L… 178 120 brown, gr… light 52
## 7 Beru W… 165 75 brown and… light 47
## 8 R5-D4 97 32 NA and red white, red NA
## 9 Biggs … 183 84 black and… light 24
## 10 Obi-Wa… 182 77 auburn, w… fair 57
## # … with 77 more rows, and 9 more variables: sex <chr>,
## # gender <chr>, homeworld <chr>, species <chr>,
## # films <list>, vehicles <list>, starships <list>,
## # star_string <chr>, films_low <chr>
#oder
%>%
starwars mutate(combined2 = str_c(hair_color,eye_color, sep = " and "))
## # A tibble: 87 × 17
## name height mass hair_color skin_color eye_color
## <chr> <int> <dbl> <chr> <chr> <chr>
## 1 Luke Sk… 172 77 blond fair blue
## 2 C-3PO 167 75 <NA> gold yellow
## 3 R2-D2 96 32 <NA> white, bl… red
## 4 Darth V… 202 136 none white yellow
## 5 Leia Or… 150 49 brown light brown
## 6 Owen La… 178 120 brown, gr… light blue
## 7 Beru Wh… 165 75 brown light blue
## 8 R5-D4 97 32 <NA> white, red red
## 9 Biggs D… 183 84 black light brown
## 10 Obi-Wan… 182 77 auburn, w… fair blue-gray
## # … with 77 more rows, and 11 more variables:
## # birth_year <dbl>, sex <chr>, gender <chr>,
## # homeworld <chr>, species <chr>, films <list>,
## # vehicles <list>, starships <list>,
## # star_string <chr>, films_low <chr>, combined2 <chr>
Option2 macht wenn in einer Zelle NA ist, automatisch das Ergebnis NA, Option1 verbindet das miteinander
%>%
starwars mutate(short_hair = str_sub(hair_color, end = 2))
## # A tibble: 87 × 17
## name height mass hair_color skin_color eye_color
## <chr> <int> <dbl> <chr> <chr> <chr>
## 1 Luke Sk… 172 77 blond fair blue
## 2 C-3PO 167 75 <NA> gold yellow
## 3 R2-D2 96 32 <NA> white, bl… red
## 4 Darth V… 202 136 none white yellow
## 5 Leia Or… 150 49 brown light brown
## 6 Owen La… 178 120 brown, gr… light blue
## 7 Beru Wh… 165 75 brown light blue
## 8 R5-D4 97 32 <NA> white, red red
## 9 Biggs D… 183 84 black light brown
## 10 Obi-Wan… 182 77 auburn, w… fair blue-gray
## # … with 77 more rows, and 11 more variables:
## # birth_year <dbl>, sex <chr>, gender <chr>,
## # homeworld <chr>, species <chr>, films <list>,
## # vehicles <list>, starships <list>,
## # star_string <chr>, films_low <chr>,
## # short_hair <chr>
<- starwars %>%
starwars mutate(
star_string = str_c(
sep = ","),
name, hair_color, eye_color, skin_color, homeworld, films, vehicles, starships, films_low = str_to_lower(films)
)
## Warning in stri_c(..., sep = sep, collapse = collapse,
## ignore_null = TRUE): argument is not an atomic vector;
## coercing
## Warning in stri_c(..., sep = sep, collapse = collapse,
## ignore_null = TRUE): argument is not an atomic vector;
## coercing
## Warning in stri_c(..., sep = sep, collapse = collapse,
## ignore_null = TRUE): argument is not an atomic vector;
## coercing
## Warning in stri_trans_tolower(string, locale = locale):
## argument is not an atomic vector; coercing
%>%
starwars mutate(films_low = str_remove(films_low, "c") )
## # A tibble: 87 × 16
## name height mass hair_color skin_color eye_color
## <chr> <int> <dbl> <chr> <chr> <chr>
## 1 Luke Sk… 172 77 blond fair blue
## 2 C-3PO 167 75 <NA> gold yellow
## 3 R2-D2 96 32 <NA> white, bl… red
## 4 Darth V… 202 136 none white yellow
## 5 Leia Or… 150 49 brown light brown
## 6 Owen La… 178 120 brown, gr… light blue
## 7 Beru Wh… 165 75 brown light blue
## 8 R5-D4 97 32 <NA> white, red red
## 9 Biggs D… 183 84 black light brown
## 10 Obi-Wan… 182 77 auburn, w… fair blue-gray
## # … with 77 more rows, and 10 more variables:
## # birth_year <dbl>, sex <chr>, gender <chr>,
## # homeworld <chr>, species <chr>, films <list>,
## # vehicles <list>, starships <list>,
## # star_string <chr>, films_low <chr>
str_detect(starwars$name, "[:0-9]"), ] starwars[
## # A tibble: 6 × 16
## name height mass hair_color skin_color eye_color
## <chr> <int> <dbl> <chr> <chr> <chr>
## 1 C-3PO 167 75 <NA> gold yellow
## 2 R2-D2 96 32 <NA> white, blue red
## 3 R5-D4 97 32 <NA> white, red red
## 4 IG-88 200 140 none metal red
## 5 R4-P17 96 NA none silver, red red, blue
## 6 BB8 NA NA none none black
## # … with 10 more variables: birth_year <dbl>,
## # sex <chr>, gender <chr>, homeworld <chr>,
## # species <chr>, films <list>, vehicles <list>,
## # starships <list>, star_string <chr>,
## # films_low <chr>
sum(str_detect(starwars$homeworld, "[ment]$"), na.rm = T)
## [1] 28
str_detect(starwars$starships, 'Millennium\\sFalcon'), ] starwars[
## Warning in stri_detect_regex(string, pattern, negate =
## negate, opts_regex = opts(pattern)): argument is not an
## atomic vector; coercing
## # A tibble: 4 × 16
## name height mass hair_color skin_color eye_color
## <chr> <int> <dbl> <chr> <chr> <chr>
## 1 Chewbacca 228 112 brown unknown blue
## 2 Han Solo 180 80 brown fair brown
## 3 Lando Ca… 177 79 black dark brown
## 4 Nien Nunb 160 68 none grey black
## # … with 10 more variables: birth_year <dbl>,
## # sex <chr>, gender <chr>, homeworld <chr>,
## # species <chr>, films <list>, vehicles <list>,
## # starships <list>, star_string <chr>,
## # films_low <chr>
3.4.2 Aufgabe 2
<- as.factor(starwars$hair_color)
hair_factor <- tibble(hair_color = hair_factor)
hair_factor ggplot(hair_factor, aes(hair_color)) + geom_bar() #plot original
<- fct_infreq(hair_factor$hair_color)
freq ggplot(hair_factor, aes(freq)) + geom_bar()
levels(hair_factor)
## NULL
%>%
hair_factor mutate(hair_color = fct_collapse(hair_color, brown = c("auburn", "auburn, grey", "auburn, white"))) %>%
count(hair_color)
## # A tibble: 10 × 2
## hair_color n
## <fct> <int>
## 1 brown 21
## 2 black 13
## 3 blond 3
## 4 blonde 1
## 5 brown, grey 1
## 6 grey 1
## 7 none 37
## 8 unknown 1
## 9 white 4
## 10 <NA> 5
<- fct_infreq(factor(starwars$eye_color))
freq2 <- fct_lump(freq2, 6) #erste sechs als eigene , alles andere als "Other" zusammenfassen
freq2 fct_count(freq2)
## # A tibble: 7 × 2
## f n
## <fct> <int>
## 1 brown 21
## 2 blue 19
## 3 yellow 11
## 4 black 10
## 5 orange 8
## 6 red 5
## 7 Other 13
3.4.3 Aufgabe 3
#install.packages("devtools")
library(devtools)
#install_github("Ram-N/weatherData")
library(weatherData)
<- NewYork2013
newyork library(lubridate)
$Time <- as_datetime(newyork$Time)
newyork$date <- date(newyork$Time)
newyork$year <- year(newyork$Time)
newyork$month <- month(newyork$Time)
newyork$day <- day(newyork$Time)
newyork$week <- week(newyork$Time)
newyork$year_day <- yday(newyork$Time)
newyork$week_day <- wday(newyork$Time)
newyork$hour <- hour(newyork$Time)
newyork322,] newyork[
## Time Temperature date year
## 322 2013-01-13 01:47:00 42.8 2013-01-13 2013
## month day week year_day week_day hour
## 322 1 13 2 13 1 1
<- newyork %>% filter(month == 1, hour > 1)
filtered_newyork ggplot(filtered_newyork, aes(x = week_day, y = Temperature, colour = factor(week))) + geom_line()