2017-06-28
Manipulation de données avec {dplyr} dans le tidyverse
Aww @hadleywickham wants to change "hadleyverse" to "tidyverse" – consistent, uniform interface so packages work together #useR2016
— Hilary Parker (@hspter) June 29, 2016
Toutes les familles heureuses se ressemblent, mais chaque famille malheureuse l'est à sa façon.
Like families, tidy datasets are all alike but every messy dataset is messy in its own way
age <- c(25, 45, 31, 10) sexe <- c( "homme", "homme", "femme", "homme") df <- data.frame(age = age, sexe = sexe) df
age sexe 1 25 homme 2 45 homme 3 31 femme 4 10 homme
"80 % des effets sont le produit de 20 % des causes"
…afin d'être plus opérationnel

library(magrittr)
verbe2( verbe1( sujet, complement ), ... )
devient :
sujet %>% verbe1( complement ) %>% verbe2( ... )
head(df)
age sexe 1 25 homme 2 45 homme 3 31 femme 4 10 homme
devient :
df %>% head()
age sexe 1 25 homme 2 45 homme 3 31 femme 4 10 homme
voire…
df %>% head
age sexe 1 25 homme 2 45 homme 3 31 femme 4 10 homme
bowl() puis, append() puis, beat() puis, shape() puis, bake() puis, cool() et enfin, enjoy() !
x1 <- bowl(rep("flour", 2), "yeast", "water", "milk", "oil")
x2 <- append(x1, "flour", until = "soft")
x3 <- beat(x2, duration = "3mins")
x4 <- shape(x3, as = "balls", style = "slightly-flat")
x5 <- bake(x4, degrees = 200, duration = "15mins")
x6 <- cool(x5, buns, duration = "5mins")
enjoy(x6)
enjoy(cool(bake(shape(beat(append(bowl(rep("flour", 2), "yeast", "water", "milk", "oil"),
"flour", until = "soft"), duration = "3mins"), as = "balls", style = "slightly-flat"),
degrees = 200, duration = "15mins"), duration = "5mins"))
bowl(rep("flour", 2), "yeast", "water", "milk", "oil") %>%
append("flour", until = "soft") %>%
beat(duration = "3mins") %>%
shape(as = "balls", style = "slightly-flat") %>%
bake(degrees = 200, duration = "15mins") %>%
cool(buns, duration = "5mins") %>%
enjoy()
%>% comme "ensuite"tibbleParce que les data frame c'est has-been…
Parce que c'est le format choisi par Hadley comme output de ses fonctions read_ de {readr} et {readxl}
chargement du package :
library(tibble)
age <- c(25, 45, 31, 10) sexe <- c( "homme", "homme", "femme", "homme") df <- data.frame(age = age, sexe = sexe) # sexe devient `factor` df
age sexe 1 25 homme 2 45 homme 3 31 femme 4 10 homme
df <- data.frame(age = age, sexe = sexe, stringsAsFactors = FALSE) df
age sexe 1 25 homme 2 45 homme 3 31 femme 4 10 homme
df_tbl <- tibble(age = age, sexe = sexe) df_tbl
# A tibble: 4 x 2
age sexe
<dbl> <chr>
1 25 homme
2 45 homme
3 31 femme
4 10 homme
x <- 1:10
names(x) <- paste0("var", 1:10)
x_tbl <- as_tibble(x)
x_tbl
# A tibble: 10 x 1 value * <int> 1 1 2 2 3 3 4 4 5 5 6 6 7 7 8 8 9 9 10 10
x_enframe <- enframe(x, name = "nom", value = "valeur") x_enframe
# A tibble: 10 x 2
nom valeur
<chr> <int>
1 var1 1
2 var2 2
3 var3 3
4 var4 4
5 var5 5
6 var6 6
7 var7 7
8 var8 8
9 var9 9
10 var10 10
rownames(x_enframe)
[1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10"
tribble()…df_tbl <- tribble( ~age, ~sexe, 25 , "homme", 45 , "homme", 34 , "femme", 10 , "homme" ) df_tbl
# A tibble: 4 x 2
age sexe
<dbl> <chr>
1 25 homme
2 45 homme
3 34 femme
4 10 homme
as.tibble()as_tibble(df)
# A tibble: 4 x 2
age sexe
<dbl> <chr>
1 25 homme
2 45 homme
3 31 femme
4 10 homme
options(tibble.print_max = 5) options(tibble.print_min = 2) df_tbl
# A tibble: 4 x 2
age sexe
<dbl> <chr>
1 25 homme
2 45 homme
3 34 femme
4 10 homme
df[ ,"sexe"]
[1] "homme" "homme" "femme" "homme"
df[ ,"sexe", drop = FALSE]
sexe 1 homme 2 homme 3 femme 4 homme
df_tbl[ , "sexe"]
# A tibble: 4 x 1 sexe <chr> 1 homme 2 homme 3 femme 4 homme
as.data.frame()as.data.frame(df_tbl)
age sexe 1 25 homme 2 45 homme 3 34 femme 4 10 homme
df_tbl %>% as.data.frame()
age sexe 1 25 homme 2 45 homme 3 34 femme 4 10 homme
starwarslibrary(dplyr)
packageVersion("dplyr") # 0.7.1
[1] '0.7.1'
data(starwars) starwars
# A tibble: 87 x 13
name height mass hair_color skin_color eye_color birth_year
<chr> <int> <dbl> <chr> <chr> <chr> <dbl>
1 Luke Skywalker 172 77 blond fair blue 19
2 C-3PO 167 75 <NA> gold yellow 112
# ... with 85 more rows, and 6 more variables: gender <chr>,
# homeworld <chr>, species <chr>, films <list>, vehicles <list>,
# starships <list>
glimpse(starwars)
Observations: 87 Variables: 13 $ name <chr> "Luke Skywalker", "C-3PO", "R2-D2", "Darth Vader", ... $ height <int> 172, 167, 96, 202, 150, 178, 165, 97, 183, 182, 188... $ mass <dbl> 77.0, 75.0, 32.0, 136.0, 49.0, 120.0, 75.0, 32.0, 8... $ hair_color <chr> "blond", NA, NA, "none", "brown", "brown, grey", "b... $ skin_color <chr> "fair", "gold", "white, blue", "white", "light", "l... $ eye_color <chr> "blue", "yellow", "red", "yellow", "brown", "blue",... $ birth_year <dbl> 19.0, 112.0, 33.0, 41.9, 19.0, 52.0, 47.0, NA, 24.0... $ gender <chr> "male", NA, NA, "male", "female", "male", "female",... $ homeworld <chr> "Tatooine", "Tatooine", "Naboo", "Tatooine", "Alder... $ species <chr> "Human", "Droid", "Droid", "Human", "Human", "Human... $ films <list> [<"Revenge of the Sith", "Return of the Jedi", "Th... $ vehicles <list> [<"Snowspeeder", "Imperial Speeder Bike">, <>, <>,... $ starships <list> [<"X-wing", "Imperial shuttle">, <>, <>, "TIE Adva...
{dplyr}Pour manipuler les observations (lignes) :
arrange()filter()Pour manipuler les variables (colonnes) :
select()mutate()arrange() remanie les observationsdesc()starwars %>% arrange(height) starwars %>% arrange(desc(height), mass) starwars %>% arrange(desc(height), mass) %>% tail
starwars %>% arrange(desc(height), mass) %>% tail(3)
# A tibble: 3 x 13
name height mass hair_color skin_color eye_color birth_year
<chr> <int> <dbl> <chr> <chr> <chr> <dbl>
1 Poe Dameron NA NA brown light brown NA
2 BB8 NA NA none none black NA
3 Captain Phasma NA NA unknown unknown unknown NA
# ... with 6 more variables: gender <chr>, homeworld <chr>, species <chr>,
# films <list>, vehicles <list>, starships <list>
filter() réduit la hauteur du dataset>, <,<=, >=, %in%, &, | , !…starwars %>% filter(height > 150) starwars %>% filter(height > 150, height < 200)
starwars %>% filter(height > 150 | species == "Human" )
# A tibble: 75 x 13
name height mass hair_color skin_color eye_color birth_year
<chr> <int> <dbl> <chr> <chr> <chr> <dbl>
1 Luke Skywalker 172 77 blond fair blue 19
2 C-3PO 167 75 <NA> gold yellow 112
# ... with 73 more rows, and 6 more variables: gender <chr>,
# homeworld <chr>, species <chr>, films <list>, vehicles <list>,
# starships <list>
starwars %>% filter(height > 150 | species == "Human") %>% arrange(desc(mass))
# A tibble: 75 x 13
name height mass hair_color skin_color
<chr> <int> <dbl> <chr> <chr>
1 Jabba Desilijic Tiure 175 1358 <NA> green-tan, brown
2 Grievous 216 159 none brown, white
# ... with 73 more rows, and 8 more variables: eye_color <chr>,
# birth_year <dbl>, gender <chr>, homeworld <chr>, species <chr>,
# films <list>, vehicles <list>, starships <list>
%in%starwars %>% filter(species == "Droid" | species == "Human" |species == "Wookie")
starwars %>% filter(species %in% c("Droid", "Human", "Wookie"))
# A tibble: 40 x 13
name height mass hair_color skin_color eye_color birth_year
<chr> <int> <dbl> <chr> <chr> <chr> <dbl>
1 Luke Skywalker 172 77 blond fair blue 19
2 C-3PO 167 75 <NA> gold yellow 112
# ... with 38 more rows, and 6 more variables: gender <chr>,
# homeworld <chr>, species <chr>, films <list>, vehicles <list>,
# starships <list>
is.na() et !is.na()starwars %>% filter(!is.na(height)) %>% arrange(height) %>% tail
# A tibble: 6 x 13
name height mass hair_color skin_color eye_color
<chr> <int> <dbl> <chr> <chr> <chr>
1 Grievous 216 159 none brown, white green, yellow
2 Roos Tarpals 224 82 none grey orange
# ... with 4 more rows, and 7 more variables: birth_year <dbl>,
# gender <chr>, homeworld <chr>, species <chr>, films <list>,
# vehicles <list>, starships <list>
starwars %>% filter(!is.na(height) & !is.na(mass))
# A tibble: 59 x 13
name height mass hair_color skin_color eye_color birth_year
<chr> <int> <dbl> <chr> <chr> <chr> <dbl>
1 Luke Skywalker 172 77 blond fair blue 19
2 C-3PO 167 75 <NA> gold yellow 112
# ... with 57 more rows, and 6 more variables: gender <chr>,
# homeworld <chr>, species <chr>, films <list>, vehicles <list>,
# starships <list>
prenomsdistinct()… dédoublonne un dataset (équivalent de unique())
starwars %>% distinct()
# A tibble: 87 x 13
name height mass hair_color skin_color eye_color birth_year
<chr> <int> <dbl> <chr> <chr> <chr> <dbl>
1 Luke Skywalker 172 77 blond fair blue 19
2 C-3PO 167 75 <NA> gold yellow 112
# ... with 85 more rows, and 6 more variables: gender <chr>,
# homeworld <chr>, species <chr>, films <list>, vehicles <list>,
# starships <list>
starwars %>% distinct(hair_color)
# A tibble: 13 x 1
hair_color
<chr>
1 blond
2 <NA>
# ... with 11 more rows
starwars %>% distinct(hair_color,.keep_all = TRUE)
# A tibble: 13 x 13
name height mass hair_color skin_color eye_color birth_year
<chr> <int> <dbl> <chr> <chr> <chr> <dbl>
1 Luke Skywalker 172 77 blond fair blue 19
2 C-3PO 167 75 <NA> gold yellow 112
# ... with 11 more rows, and 6 more variables: gender <chr>,
# homeworld <chr>, species <chr>, films <list>, vehicles <list>,
# starships <list>
sample_n()…échantillonnent dans un jeu de données
starwars %>% sample_n(size = 60)
# A tibble: 60 x 13
name height mass hair_color skin_color eye_color birth_year
<chr> <int> <dbl> <chr> <chr> <chr> <dbl>
1 Padmé Amidala 165 45 brown light brown 46
2 Ackbar 180 83 none brown mottle orange 41
# ... with 58 more rows, and 6 more variables: gender <chr>,
# homeworld <chr>, species <chr>, films <list>, vehicles <list>,
# starships <list>
starwars %>% sample_n(size = 60) %>% dim
[1] 60 13
sample_frac()starwars %>% sample_frac(size = 0.7)
# A tibble: 61 x 13
name height mass hair_color skin_color eye_color birth_year
<chr> <int> <dbl> <chr> <chr> <chr> <dbl>
1 Gregar Typho 185 85 black dark brown NA
2 Mace Windu 188 84 none dark brown 72
# ... with 59 more rows, and 6 more variables: gender <chr>,
# homeworld <chr>, species <chr>, films <list>, vehicles <list>,
# starships <list>
starwars %>% sample_frac(size = 0.7) %>% dim
[1] 61 13
slice()…sélectionne des observations particulières sur la base d'indice(s)
starwars %>% slice(25:32)
# A tibble: 8 x 13
name height mass hair_color skin_color eye_color birth_year gender
<chr> <int> <dbl> <chr> <chr> <chr> <dbl> <chr>
1 Lobot 175 79 none light blue 37 male
2 Ackbar 180 83 none brown mottle orange 41 male
# ... with 6 more rows, and 5 more variables: homeworld <chr>,
# species <chr>, films <list>, vehicles <list>, starships <list>
starwars %>% slice(which(mass %in% range(mass, na.rm = TRUE)))
# A tibble: 2 x 13
name height mass hair_color skin_color eye_color
<chr> <int> <dbl> <chr> <chr> <chr>
1 Jabba Desilijic Tiure 175 1358 <NA> green-tan, brown orange
2 Ratts Tyerell 79 15 none grey, blue unknown
# ... with 7 more variables: birth_year <dbl>, gender <chr>,
# homeworld <chr>, species <chr>, films <list>, vehicles <list>,
# starships <list>
top_n()…trie et filtre les n premières observations
starwars %>% top_n(5, mass)
# A tibble: 5 x 13
name height mass hair_color skin_color
<chr> <int> <dbl> <chr> <chr>
1 Darth Vader 202 136 none white
2 Jabba Desilijic Tiure 175 1358 <NA> green-tan, brown
3 IG-88 200 140 none metal
4 Grievous 216 159 none brown, white
5 Tarfful 234 136 brown brown
# ... with 8 more variables: eye_color <chr>, birth_year <dbl>,
# gender <chr>, homeworld <chr>, species <chr>, films <list>,
# vehicles <list>, starships <list>
prenomsselect()starwars %>% select(name, gender, homeworld, species)
# A tibble: 87 x 4
name gender homeworld species
<chr> <chr> <chr> <chr>
1 Luke Skywalker male Tatooine Human
2 C-3PO <NA> Tatooine Droid
# ... with 85 more rows
-starts_with()ends_with()contains()matches()one_of():starwars %>% select(starts_with("h"))
# A tibble: 87 x 3 height hair_color homeworld <int> <chr> <chr> 1 172 blond Tatooine 2 167 <NA> Tatooine # ... with 85 more rows
starwars %>% select(matches("._."))
# A tibble: 87 x 4
hair_color skin_color eye_color birth_year
<chr> <chr> <chr> <dbl>
1 blond fair blue 19
2 <NA> gold yellow 112
# ... with 85 more rows
starwars %>% select(hair_color:birth_year)
# A tibble: 87 x 4
hair_color skin_color eye_color birth_year
<chr> <chr> <chr> <dbl>
1 blond fair blue 19
2 <NA> gold yellow 112
# ... with 85 more rows
starwars %>% select(-(films:starships)) # tous sauf les variables listes
# A tibble: 87 x 10
name height mass hair_color skin_color eye_color birth_year
<chr> <int> <dbl> <chr> <chr> <chr> <dbl>
1 Luke Skywalker 172 77 blond fair blue 19
2 C-3PO 167 75 <NA> gold yellow 112
# ... with 85 more rows, and 3 more variables: gender <chr>,
# homeworld <chr>, species <chr>
starwars %>% select(contains("color"))
# A tibble: 87 x 3
hair_color skin_color eye_color
<chr> <chr> <chr>
1 blond fair blue
2 <NA> gold yellow
# ... with 85 more rows
starwars %>% select(one_of("species"))
# A tibble: 87 x 1
species
<chr>
1 Human
2 Droid
# ... with 85 more rows
pull() permet de maîtriser les outputsstarwars %>% select(one_of("species")) %>% pull()
[1] "Human" "Droid" "Droid" "Human"
[5] "Human" "Human" "Human" "Droid"
[9] "Human" "Human" "Human" "Human"
[13] "Wookiee" "Human" "Rodian" "Hutt"
[17] "Human" "Human" "Yoda's species" "Human"
[21] "Human" "Droid" "Trandoshan" "Human"
[25] "Human" "Mon Calamari" "Human" "Human"
[29] "Ewok" "Sullustan" "Human" "Neimodian"
[33] "Human" "Gungan" "Gungan" "Gungan"
[37] NA "Toydarian" "Dug" NA
[41] "Human" "Zabrak" "Twi'lek" "Twi'lek"
[45] "Vulptereen" "Xexto" "Toong" "Human"
[49] "Cerean" "Nautolan" "Zabrak" "Tholothian"
[53] "Iktotchi" "Quermian" "Kel Dor" "Chagrian"
[57] "Human" "Human" "Human" "Geonosian"
[61] "Mirialan" "Mirialan" "Human" "Human"
[65] "Human" "Human" "Clawdite" "Besalisk"
[69] "Kaminoan" "Kaminoan" "Human" "Aleena"
[73] NA "Skakoan" "Muun"
[ reached getOption("max.print") -- omitted 12 entries ]
select_if()Sur la base d'un prédicat :
starwars %>% select_if(is.numeric)
# A tibble: 87 x 3 height mass birth_year <int> <dbl> <dbl> 1 172 77 19 2 167 75 112 # ... with 85 more rows
mutate() pour créer de nouvelles variablesmutate() modifie le jeu de données en largeur pour y adjoindre des colonnes
starwars %>% mutate(imc = mass/(height/100)**2) %>% select(name,imc)
# A tibble: 87 x 2
name imc
<chr> <dbl>
1 Luke Skywalker 26.02758
2 C-3PO 26.89232
# ... with 85 more rows
starwars %>%
mutate(imc = mass/(height/100)**2,
imc = round(imc,1)) %>%
select(name,imc)
# A tibble: 87 x 2
name imc
<chr> <dbl>
1 Luke Skywalker 26.0
2 C-3PO 26.9
# ... with 85 more rows
case_when() comme "helper" de mutate()Pour éviter d'imbriquer les ifelse():
starwars %>% mutate(imc = mass/(height/100)**2,
imc = round(imc,1),
imc_classe = case_when(
imc < 16.5 ~ "dénutrition",
imc >= 16.5 & imc < 18.5 ~ "maigreur",
imc >= 18.5 & imc < 25 ~ "normal",
imc >= 25 & imc < 30 ~ "surpoids",
imc >= 30 & imc < 35 ~ "obésité modérée",
imc >= 35 & imc < 40 ~ "obésité sévère",
imc >= 40 ~ "obésité morbide"
)) %>% select(name,imc, imc_classe)
# A tibble: 87 x 3
name imc imc_classe
<chr> <dbl> <chr>
1 Luke Skywalker 26.0 surpoids
2 C-3PO 26.9 surpoids
# ... with 85 more rows
mutate() comme armes de "construction massive"
Afin d'appliquer des traitements en masse :
mutate_all()mutate_at()mutate_if()mutate_if()starwars %>% mutate_if(is.numeric, scale)
# A tibble: 87 x 13
name height mass hair_color skin_color eye_color
<chr> <dbl> <dbl> <chr> <chr> <chr>
1 Luke Skywalker -0.06781696 -0.1198643 blond fair blue
2 C-3PO -0.21161731 -0.1316667 <NA> gold yellow
# ... with 85 more rows, and 7 more variables: birth_year <dbl>,
# gender <chr>, homeworld <chr>, species <chr>, films <list>,
# vehicles <list>, starships <list>
starwars %>% mutate_if(is.numeric, funs(scale))
# A tibble: 87 x 13
name height mass hair_color skin_color eye_color
<chr> <dbl> <dbl> <chr> <chr> <chr>
1 Luke Skywalker -0.06781696 -0.1198643 blond fair blue
2 C-3PO -0.21161731 -0.1316667 <NA> gold yellow
# ... with 85 more rows, and 7 more variables: birth_year <dbl>,
# gender <chr>, homeworld <chr>, species <chr>, films <list>,
# vehicles <list>, starships <list>
starwars %>%
mutate_if(is.numeric, funs(scale = scale)) %>%
select(ends_with("scale"))
# A tibble: 87 x 3
height_scale mass_scale birth_year_scale
<dbl> <dbl> <dbl>
1 -0.06781696 -0.1198643 -0.4432379
2 -0.21161731 -0.1316667 0.1579589
# ... with 85 more rows
" "starwars %>%
mutate_at(c("gender","homeworld","species"), as.factor) %>%
select(gender:species)
# A tibble: 87 x 3 gender homeworld species <fctr> <fctr> <fctr> 1 male Tatooine Human 2 NA Tatooine Droid # ... with 85 more rows
équivaut à :
starwars %>% mutate_at(vars(gender:species), as.factor) %>% select(gender:species)
# A tibble: 87 x 3 gender homeworld species <fctr> <fctr> <fctr> 1 male Tatooine Human 2 NA Tatooine Droid # ... with 85 more rows
lag(), lead()cumsum(), cumprod()…dense_rank(), min_rank()…+, -, *, et aussi..>, <,<=, >=ifelse(), case_when(), coalesce(), na_if()…summarise()mean(), median()n() qui ne prend JAMAIS de paramètresvar(), sd(), IQR()quantile(), min(), max()summarise()starwars %>% summarise(taille_moyenne = mean(!is.na(height)),
taille_variance = var(!is.na(height)),
effectif = n(),
nombre_manquantes = sum(is.na(height)),
effectif_calcul = effectif - nombre_manquantes)
# A tibble: 1 x 5
taille_moyenne taille_variance effectif nombre_manquantes
<dbl> <dbl> <int> <int>
1 0.9310345 0.06495589 87 6
# ... with 1 more variables: effectif_calcul <int>
starwars %>% summarise_if(is.numeric, funs(moyenne = mean(!is.na(.)),
variance = var(!is.na(.)),
effectif = n()))
# A tibble: 1 x 9
height_moyenne mass_moyenne birth_year_moyenne height_variance
<dbl> <dbl> <dbl> <dbl>
1 0.9310345 0.6781609 0.4942529 0.06495589
# ... with 5 more variables: mass_variance <dbl>,
# birth_year_variance <dbl>, height_effectif <int>, mass_effectif <int>,
# birth_year_effectif <int>
starwars %>%
mutate_if(is.character,as.factor) %>%
summarise_at(c("gender","species"), nlevels)
# A tibble: 1 x 2 gender species <int> <int> 1 4 37
group_by()Le complément circonstanciel est une fonction, un groupe de mots qui indique les circonstances dans lesquelles se réalise l'action du verbe
… c'est la combinaison de group_by() + summarise()
starwars %>% group_by(species) %>%
summarise(masse_moyenne = mean(mass,na.rm=TRUE),
taille_moyenne = mean(height,na.rm=TRUE))
# A tibble: 38 x 3
species masse_moyenne taille_moyenne
<chr> <dbl> <dbl>
1 Aleena 15.00 79.0000
2 Besalisk 102.00 198.0000
3 Cerean 82.00 198.0000
4 Chagrian NaN 196.0000
5 Clawdite 55.00 168.0000
6 Droid 69.75 140.0000
7 Dug 40.00 112.0000
8 Ewok 20.00 88.0000
9 Geonosian 80.00 183.0000
10 Gungan 74.00 208.6667
# ... with 28 more rows
starwars %>% group_by(species) %>%
summarise(masse_moyenne = mean(!is.na(mass)),
taille_moyenne = mean(!is.na(height)),
effectif_calcul = n()) %>%
filter(effectif_calcul > 1) %>%
arrange(desc(effectif_calcul))
# A tibble: 9 x 4
species masse_moyenne taille_moyenne effectif_calcul
<chr> <dbl> <dbl> <int>
1 Human 0.6285714 0.8857143 35
2 Droid 0.8000000 0.8000000 5
# ... with 7 more rows
count() et tally()starwars %>% group_by(gender) %>% tally()
# A tibble: 5 x 2
gender n
<chr> <int>
1 female 19
2 hermaphrodite 1
3 male 62
4 none 2
5 <NA> 3
équivaut à :
starwars %>% count(gender)
# A tibble: 5 x 2
gender n
<chr> <int>
1 female 19
2 hermaphrodite 1
3 male 62
4 none 2
5 <NA> 3
prenomsbind_cols() pour remplacer cbind()left_join(), right_join(), inner_join(), full_join(), anti_join()…apprentissage et validationapprentisage <- starwars %>% select(-(films:starships)) %>% sample_frac(0.7) validation <- starwars %>% select(-(films:starships))%>% anti_join(apprentisage) dim(validation)
[1] 26 10
dim(apprentisage)
[1] 61 10
bind_rows() pour remplacer rbind() : nous épargne les galères de facteurs
intersect(x,y) : observations présentes en x et en y
setdiff(x,y) : observations présentes en x et absentes en y
union() : observations présentes en x ou y (union_all() conserve les doublons)
prenoms
La recette pour nettoyer ses données :
library(tidyr)
gatherexperience <- data.frame(id = c(1,2,3,4),
sexe = c("F","M","M","F"),
contrôle = c(7.9,6.3,9.5,11.5),
traitement = c(12.3,10.6,13.1,13.4))
experience %>%
gather(key = condition, value = glycémie,contrôle:traitement)
id sexe condition glycémie 1 1 F contrôle 7.9 2 2 M contrôle 6.3 3 3 M contrôle 9.5 4 4 F contrôle 11.5 5 1 F traitement 12.3 6 2 M traitement 10.6 7 3 M traitement 13.1 8 4 F traitement 13.4
spreadtemp_europ <- data.frame(
Ville = c(rep("Amsterdam",3),rep("Lisbonne",3)),
Mois = c(rep(c("Janvier", "Février", "Mars"),2)),
Température = c(2.9,2.5,5.7, 10.5,11.3, 12.8))
temp_europ
Ville Mois Température 1 Amsterdam Janvier 2.9 2 Amsterdam Février 2.5 3 Amsterdam Mars 5.7 4 Lisbonne Janvier 10.5 5 Lisbonne Février 11.3 6 Lisbonne Mars 12.8
temp_europ %>% spread(key = Mois, value = Température)
Ville Février Janvier Mars 1 Amsterdam 2.5 2.9 5.7 2 Lisbonne 11.3 10.5 12.8
starwars %>% group_by(gender, species) %>% tally() %>% spread(key = species, value = n, fill = 0)
# A tibble: 5 x 39
# Groups: gender [5]
gender Aleena Besalisk Cerean Chagrian Clawdite Droid Dug Ewok
* <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 female 0 0 0 0 1 0 0 0
2 hermaphrodite 0 0 0 0 0 0 0 0
3 male 1 1 1 1 0 0 1 1
4 none 0 0 0 0 0 2 0 0
5 <NA> 0 0 0 0 0 3 0 0
# ... with 30 more variables: Geonosian <dbl>, Gungan <dbl>, Human <dbl>,
# Hutt <dbl>, Iktotchi <dbl>, Kaleesh <dbl>, Kaminoan <dbl>, `Kel
# Dor` <dbl>, Mirialan <dbl>, `Mon Calamari` <dbl>, Muun <dbl>,
# Nautolan <dbl>, Neimodian <dbl>, `Pau'an` <dbl>, Quermian <dbl>,
# Rodian <dbl>, Skakoan <dbl>, Sullustan <dbl>, Tholothian <dbl>,
# Togruta <dbl>, Toong <dbl>, Toydarian <dbl>, Trandoshan <dbl>,
# `Twi'lek` <dbl>, Vulptereen <dbl>, Wookiee <dbl>, Xexto <dbl>, `Yoda's
# species` <dbl>, Zabrak <dbl>, `<NA>` <dbl>
prenomsfill()permet de compléter les NAdataset <- data.frame(id = 1:6,annee = c("2014",NA,NA,"2015",NA,NA))
dataset
id annee 1 1 2014 2 2 <NA> 3 3 <NA> 4 4 2015 5 5 <NA> 6 6 <NA>
dataset %>% fill(annee)
id annee 1 1 2014 2 2 2014 3 3 2014 4 4 2015 5 5 2015 6 6 2015
separate() permet de séparer une colonne en plusieurs colonnesdataset <- data.frame(id = 1:6, tension = c("12/8","12/7","14/4","18/10","13/8","12/8"))
dataset
id tension 1 1 12/8 2 2 12/7 3 3 14/4 4 4 18/10 5 5 13/8 6 6 12/8
dataset %>%
separate(tension,into = c("PAS","PAD"),sep = "/",remove = TRUE)
id PAS PAD 1 1 12 8 2 2 12 7 3 3 14 4 4 4 18 10 5 5 13 8 6 6 12 8
complete() ajoute les combinaisons manquantesdataset <- data.frame(id = 1:6,
annee = c("2012","2012","2013","2014","2014","2014"),
mois = month.abb[c(1,3,2,1,2,3)],
temperature = c(12,11,15,16,18,10))
dataset
id annee mois temperature 1 1 2012 Jan 12 2 2 2012 Mar 11 3 3 2013 Feb 15 4 4 2014 Jan 16 5 5 2014 Feb 18 6 6 2014 Mar 10
dataset %>% complete(annee, mois)
# A tibble: 9 x 4 annee mois id temperature <fctr> <fctr> <int> <dbl> 1 2012 Feb NA NA 2 2012 Jan 1 12 3 2012 Mar 2 11 4 2013 Feb 3 15 5 2013 Jan NA NA 6 2013 Mar NA NA 7 2014 Feb 5 18 8 2014 Jan 4 16 9 2014 Mar 6 10
Autres packages utiles du tidyverse :
{readr} : l'import réussi des données
{purrr} : programmation fonctionnelle
{lubridate} : manipulation des dates
{forcats} : manipulation des facteurs
{roxygen2} et {devtools} : mettre le tout en package