2017-06-28
Manipulation de données avec {dplyr} dans le tidyverse
Aww @hadleywickham wants to change "hadleyverse" to "tidyverse" – consistent, uniform interface so packages work together #useR2016
— Hilary Parker (@hspter) June 29, 2016
Toutes les familles heureuses se ressemblent, mais chaque famille malheureuse l'est à sa façon.
Like families, tidy datasets are all alike but every messy dataset is messy in its own way
age <- c(25, 45, 31, 10) sexe <- c( "homme", "homme", "femme", "homme") df <- data.frame(age = age, sexe = sexe) df
age sexe 1 25 homme 2 45 homme 3 31 femme 4 10 homme
"80 % des effets sont le produit de 20 % des causes"
…afin d'être plus opérationnel
library(magrittr)
verbe2( verbe1( sujet, complement ), ... )
devient :
sujet %>% verbe1( complement ) %>% verbe2( ... )
head(df)
age sexe 1 25 homme 2 45 homme 3 31 femme 4 10 homme
devient :
df %>% head()
age sexe 1 25 homme 2 45 homme 3 31 femme 4 10 homme
voire…
df %>% head
age sexe 1 25 homme 2 45 homme 3 31 femme 4 10 homme
bowl()
) puis,append()
) puis,beat()
) puis,shape()
) puis,bake()
) puis,cool()
) et enfin,enjoy()
) !x1 <- bowl(rep("flour", 2), "yeast", "water", "milk", "oil") x2 <- append(x1, "flour", until = "soft") x3 <- beat(x2, duration = "3mins") x4 <- shape(x3, as = "balls", style = "slightly-flat") x5 <- bake(x4, degrees = 200, duration = "15mins") x6 <- cool(x5, buns, duration = "5mins") enjoy(x6)
enjoy(cool(bake(shape(beat(append(bowl(rep("flour", 2), "yeast", "water", "milk", "oil"), "flour", until = "soft"), duration = "3mins"), as = "balls", style = "slightly-flat"), degrees = 200, duration = "15mins"), duration = "5mins"))
bowl(rep("flour", 2), "yeast", "water", "milk", "oil") %>% append("flour", until = "soft") %>% beat(duration = "3mins") %>% shape(as = "balls", style = "slightly-flat") %>% bake(degrees = 200, duration = "15mins") %>% cool(buns, duration = "5mins") %>% enjoy()
%>%
comme "ensuite"
tibble
Parce que les data frame c'est has-been…
Parce que c'est le format choisi par Hadley comme output de ses fonctions read_
de {readr}
et {readxl}
chargement du package :
library(tibble)
age <- c(25, 45, 31, 10) sexe <- c( "homme", "homme", "femme", "homme") df <- data.frame(age = age, sexe = sexe) # sexe devient `factor` df
age sexe 1 25 homme 2 45 homme 3 31 femme 4 10 homme
df <- data.frame(age = age, sexe = sexe, stringsAsFactors = FALSE) df
age sexe 1 25 homme 2 45 homme 3 31 femme 4 10 homme
df_tbl <- tibble(age = age, sexe = sexe) df_tbl
# A tibble: 4 x 2 age sexe <dbl> <chr> 1 25 homme 2 45 homme 3 31 femme 4 10 homme
x <- 1:10 names(x) <- paste0("var", 1:10) x_tbl <- as_tibble(x) x_tbl
# A tibble: 10 x 1 value * <int> 1 1 2 2 3 3 4 4 5 5 6 6 7 7 8 8 9 9 10 10
x_enframe <- enframe(x, name = "nom", value = "valeur") x_enframe
# A tibble: 10 x 2 nom valeur <chr> <int> 1 var1 1 2 var2 2 3 var3 3 4 var4 4 5 var5 5 6 var6 6 7 var7 7 8 var8 8 9 var9 9 10 var10 10
rownames(x_enframe)
[1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10"
tribble()
…df_tbl <- tribble( ~age, ~sexe, 25 , "homme", 45 , "homme", 34 , "femme", 10 , "homme" ) df_tbl
# A tibble: 4 x 2 age sexe <dbl> <chr> 1 25 homme 2 45 homme 3 34 femme 4 10 homme
as_tibble()
as_tibble(df)
# A tibble: 4 x 2 age sexe <dbl> <chr> 1 25 homme 2 45 homme 3 31 femme 4 10 homme
options(tibble.print_max = 5) options(tibble.print_min = 2) df_tbl
# A tibble: 4 x 2 age sexe <dbl> <chr> 1 25 homme 2 45 homme 3 34 femme 4 10 homme
df[ ,"sexe"]
[1] "homme" "homme" "femme" "homme"
df[ ,"sexe", drop = FALSE]
sexe 1 homme 2 homme 3 femme 4 homme
df_tbl[ , "sexe"]
# A tibble: 4 x 1 sexe <chr> 1 homme 2 homme 3 femme 4 homme
as.data.frame()
as.data.frame(df_tbl)
age sexe 1 25 homme 2 45 homme 3 34 femme 4 10 homme
df_tbl %>% as.data.frame()
age sexe 1 25 homme 2 45 homme 3 34 femme 4 10 homme
starwars
library(dplyr) packageVersion("dplyr") # 0.7.1
[1] '0.7.1'
data(starwars) starwars
# A tibble: 87 x 13 name height mass hair_color skin_color eye_color birth_year <chr> <int> <dbl> <chr> <chr> <chr> <dbl> 1 Luke Skywalker 172 77 blond fair blue 19 2 C-3PO 167 75 <NA> gold yellow 112 # ... with 85 more rows, and 6 more variables: gender <chr>, # homeworld <chr>, species <chr>, films <list>, vehicles <list>, # starships <list>
glimpse(starwars)
Observations: 87 Variables: 13 $ name <chr> "Luke Skywalker", "C-3PO", "R2-D2", "Darth Vader", ... $ height <int> 172, 167, 96, 202, 150, 178, 165, 97, 183, 182, 188... $ mass <dbl> 77.0, 75.0, 32.0, 136.0, 49.0, 120.0, 75.0, 32.0, 8... $ hair_color <chr> "blond", NA, NA, "none", "brown", "brown, grey", "b... $ skin_color <chr> "fair", "gold", "white, blue", "white", "light", "l... $ eye_color <chr> "blue", "yellow", "red", "yellow", "brown", "blue",... $ birth_year <dbl> 19.0, 112.0, 33.0, 41.9, 19.0, 52.0, 47.0, NA, 24.0... $ gender <chr> "male", NA, NA, "male", "female", "male", "female",... $ homeworld <chr> "Tatooine", "Tatooine", "Naboo", "Tatooine", "Alder... $ species <chr> "Human", "Droid", "Droid", "Human", "Human", "Human... $ films <list> [<"Revenge of the Sith", "Return of the Jedi", "Th... $ vehicles <list> [<"Snowspeeder", "Imperial Speeder Bike">, <>, <>,... $ starships <list> [<"X-wing", "Imperial shuttle">, <>, <>, "TIE Adva...
{dplyr}
Pour manipuler les observations (lignes) :
arrange()
filter()
Pour manipuler les variables (colonnes) :
select()
mutate()
arrange()
remanie les observations
desc()
starwars %>% arrange(height) starwars %>% arrange(desc(height), mass) starwars %>% arrange(desc(height), mass) %>% tail
starwars %>% arrange(desc(height), mass) %>% tail(3)
# A tibble: 3 x 13 name height mass hair_color skin_color eye_color birth_year <chr> <int> <dbl> <chr> <chr> <chr> <dbl> 1 Poe Dameron NA NA brown light brown NA 2 BB8 NA NA none none black NA 3 Captain Phasma NA NA unknown unknown unknown NA # ... with 6 more variables: gender <chr>, homeworld <chr>, species <chr>, # films <list>, vehicles <list>, starships <list>
filter()
réduit la hauteur du dataset
>
, <
,<=
, >=
, %in%
, &
, |
, !
…starwars %>% filter(height > 150) starwars %>% filter(height > 150, height < 200)
starwars %>% filter(height > 150 | species == "Human" )
# A tibble: 75 x 13 name height mass hair_color skin_color eye_color birth_year <chr> <int> <dbl> <chr> <chr> <chr> <dbl> 1 Luke Skywalker 172 77 blond fair blue 19 2 C-3PO 167 75 <NA> gold yellow 112 # ... with 73 more rows, and 6 more variables: gender <chr>, # homeworld <chr>, species <chr>, films <list>, vehicles <list>, # starships <list>
starwars %>% filter(height > 150 | species == "Human") %>% arrange(desc(mass))
# A tibble: 75 x 13 name height mass hair_color skin_color <chr> <int> <dbl> <chr> <chr> 1 Jabba Desilijic Tiure 175 1358 <NA> green-tan, brown 2 Grievous 216 159 none brown, white # ... with 73 more rows, and 8 more variables: eye_color <chr>, # birth_year <dbl>, gender <chr>, homeworld <chr>, species <chr>, # films <list>, vehicles <list>, starships <list>
%in%
starwars %>% filter(species == "Droid" | species == "Human" |species == "Wookie")
starwars %>% filter(species %in% c("Droid", "Human", "Wookie"))
# A tibble: 40 x 13 name height mass hair_color skin_color eye_color birth_year <chr> <int> <dbl> <chr> <chr> <chr> <dbl> 1 Luke Skywalker 172 77 blond fair blue 19 2 C-3PO 167 75 <NA> gold yellow 112 # ... with 38 more rows, and 6 more variables: gender <chr>, # homeworld <chr>, species <chr>, films <list>, vehicles <list>, # starships <list>
is.na()
et !is.na()
starwars %>% filter(!is.na(height)) %>% arrange(height) %>% tail
# A tibble: 6 x 13 name height mass hair_color skin_color eye_color <chr> <int> <dbl> <chr> <chr> <chr> 1 Grievous 216 159 none brown, white green, yellow 2 Roos Tarpals 224 82 none grey orange # ... with 4 more rows, and 7 more variables: birth_year <dbl>, # gender <chr>, homeworld <chr>, species <chr>, films <list>, # vehicles <list>, starships <list>
starwars %>% filter(!is.na(height) & !is.na(mass))
# A tibble: 59 x 13 name height mass hair_color skin_color eye_color birth_year <chr> <int> <dbl> <chr> <chr> <chr> <dbl> 1 Luke Skywalker 172 77 blond fair blue 19 2 C-3PO 167 75 <NA> gold yellow 112 # ... with 57 more rows, and 6 more variables: gender <chr>, # homeworld <chr>, species <chr>, films <list>, vehicles <list>, # starships <list>
prenoms
distinct()
… dédoublonne un dataset (équivalent de unique()
)
starwars %>% distinct()
# A tibble: 87 x 13 name height mass hair_color skin_color eye_color birth_year <chr> <int> <dbl> <chr> <chr> <chr> <dbl> 1 Luke Skywalker 172 77 blond fair blue 19 2 C-3PO 167 75 <NA> gold yellow 112 # ... with 85 more rows, and 6 more variables: gender <chr>, # homeworld <chr>, species <chr>, films <list>, vehicles <list>, # starships <list>
starwars %>% distinct(hair_color)
# A tibble: 13 x 1 hair_color <chr> 1 blond 2 <NA> # ... with 11 more rows
starwars %>% distinct(hair_color,.keep_all = TRUE)
# A tibble: 13 x 13 name height mass hair_color skin_color eye_color birth_year <chr> <int> <dbl> <chr> <chr> <chr> <dbl> 1 Luke Skywalker 172 77 blond fair blue 19 2 C-3PO 167 75 <NA> gold yellow 112 # ... with 11 more rows, and 6 more variables: gender <chr>, # homeworld <chr>, species <chr>, films <list>, vehicles <list>, # starships <list>
sample_n()
…échantillonnent dans un jeu de données
starwars %>% sample_n(size = 60)
# A tibble: 60 x 13 name height mass hair_color skin_color eye_color birth_year <chr> <int> <dbl> <chr> <chr> <chr> <dbl> 1 Padmé Amidala 165 45 brown light brown 46 2 Ackbar 180 83 none brown mottle orange 41 # ... with 58 more rows, and 6 more variables: gender <chr>, # homeworld <chr>, species <chr>, films <list>, vehicles <list>, # starships <list>
starwars %>% sample_n(size = 60) %>% dim
[1] 60 13
sample_frac()
starwars %>% sample_frac(size = 0.7)
# A tibble: 61 x 13 name height mass hair_color skin_color eye_color birth_year <chr> <int> <dbl> <chr> <chr> <chr> <dbl> 1 Gregar Typho 185 85 black dark brown NA 2 Mace Windu 188 84 none dark brown 72 # ... with 59 more rows, and 6 more variables: gender <chr>, # homeworld <chr>, species <chr>, films <list>, vehicles <list>, # starships <list>
starwars %>% sample_frac(size = 0.7) %>% dim
[1] 61 13
slice()
…sélectionne des observations particulières sur la base d'indice(s)
starwars %>% slice(25:32)
# A tibble: 8 x 13 name height mass hair_color skin_color eye_color birth_year gender <chr> <int> <dbl> <chr> <chr> <chr> <dbl> <chr> 1 Lobot 175 79 none light blue 37 male 2 Ackbar 180 83 none brown mottle orange 41 male # ... with 6 more rows, and 5 more variables: homeworld <chr>, # species <chr>, films <list>, vehicles <list>, starships <list>
starwars %>% slice(which(mass %in% range(mass, na.rm = TRUE)))
# A tibble: 2 x 13 name height mass hair_color skin_color eye_color <chr> <int> <dbl> <chr> <chr> <chr> 1 Jabba Desilijic Tiure 175 1358 <NA> green-tan, brown orange 2 Ratts Tyerell 79 15 none grey, blue unknown # ... with 7 more variables: birth_year <dbl>, gender <chr>, # homeworld <chr>, species <chr>, films <list>, vehicles <list>, # starships <list>
top_n()
…trie et filtre les n premières observations
starwars %>% top_n(5, mass)
# A tibble: 5 x 13 name height mass hair_color skin_color <chr> <int> <dbl> <chr> <chr> 1 Darth Vader 202 136 none white 2 Jabba Desilijic Tiure 175 1358 <NA> green-tan, brown 3 IG-88 200 140 none metal 4 Grievous 216 159 none brown, white 5 Tarfful 234 136 brown brown # ... with 8 more variables: eye_color <chr>, birth_year <dbl>, # gender <chr>, homeworld <chr>, species <chr>, films <list>, # vehicles <list>, starships <list>
prenoms
select()
starwars %>% select(name, gender, homeworld, species)
# A tibble: 87 x 4 name gender homeworld species <chr> <chr> <chr> <chr> 1 Luke Skywalker male Tatooine Human 2 C-3PO <NA> Tatooine Droid # ... with 85 more rows
-
starts_with()
ends_with()
contains()
matches()
one_of()
:
starwars %>% select(starts_with("h"))
# A tibble: 87 x 3 height hair_color homeworld <int> <chr> <chr> 1 172 blond Tatooine 2 167 <NA> Tatooine # ... with 85 more rows
starwars %>% select(matches("._."))
# A tibble: 87 x 4 hair_color skin_color eye_color birth_year <chr> <chr> <chr> <dbl> 1 blond fair blue 19 2 <NA> gold yellow 112 # ... with 85 more rows
starwars %>% select(hair_color:birth_year)
# A tibble: 87 x 4 hair_color skin_color eye_color birth_year <chr> <chr> <chr> <dbl> 1 blond fair blue 19 2 <NA> gold yellow 112 # ... with 85 more rows
starwars %>% select(-(films:starships)) # tous sauf les variables listes
# A tibble: 87 x 10 name height mass hair_color skin_color eye_color birth_year <chr> <int> <dbl> <chr> <chr> <chr> <dbl> 1 Luke Skywalker 172 77 blond fair blue 19 2 C-3PO 167 75 <NA> gold yellow 112 # ... with 85 more rows, and 3 more variables: gender <chr>, # homeworld <chr>, species <chr>
starwars %>% select(contains("color"))
# A tibble: 87 x 3 hair_color skin_color eye_color <chr> <chr> <chr> 1 blond fair blue 2 <NA> gold yellow # ... with 85 more rows
starwars %>% select(one_of("species"))
# A tibble: 87 x 1 species <chr> 1 Human 2 Droid # ... with 85 more rows
pull()
permet de maîtriser les outputs
starwars %>% select(one_of("species")) %>% pull()
[1] "Human" "Droid" "Droid" "Human" [5] "Human" "Human" "Human" "Droid" [9] "Human" "Human" "Human" "Human" [13] "Wookiee" "Human" "Rodian" "Hutt" [17] "Human" "Human" "Yoda's species" "Human" [21] "Human" "Droid" "Trandoshan" "Human" [25] "Human" "Mon Calamari" "Human" "Human" [29] "Ewok" "Sullustan" "Human" "Neimodian" [33] "Human" "Gungan" "Gungan" "Gungan" [37] NA "Toydarian" "Dug" NA [41] "Human" "Zabrak" "Twi'lek" "Twi'lek" [45] "Vulptereen" "Xexto" "Toong" "Human" [49] "Cerean" "Nautolan" "Zabrak" "Tholothian" [53] "Iktotchi" "Quermian" "Kel Dor" "Chagrian" [57] "Human" "Human" "Human" "Geonosian" [61] "Mirialan" "Mirialan" "Human" "Human" [65] "Human" "Human" "Clawdite" "Besalisk" [69] "Kaminoan" "Kaminoan" "Human" "Aleena" [73] NA "Skakoan" "Muun" [ reached getOption("max.print") -- omitted 12 entries ]
select_if()
Sur la base d'un prédicat :
starwars %>% select_if(is.numeric)
# A tibble: 87 x 3 height mass birth_year <int> <dbl> <dbl> 1 172 77 19 2 167 75 112 # ... with 85 more rows
mutate()
pour créer de nouvelles variables
mutate()
modifie le jeu de données en largeur pour y adjoindre des colonnes
starwars %>% mutate(imc = mass/(height/100)**2) %>% select(name,imc)
# A tibble: 87 x 2 name imc <chr> <dbl> 1 Luke Skywalker 26.02758 2 C-3PO 26.89232 # ... with 85 more rows
starwars %>% mutate(imc = mass/(height/100)**2, imc = round(imc,1)) %>% select(name,imc)
# A tibble: 87 x 2 name imc <chr> <dbl> 1 Luke Skywalker 26.0 2 C-3PO 26.9 # ... with 85 more rows
case_when()
comme "helper" de mutate()
Pour éviter d'imbriquer les ifelse()
:
starwars %>% mutate(imc = mass/(height/100)**2, imc = round(imc,1), imc_classe = case_when( imc < 16.5 ~ "dénutrition", imc >= 16.5 & imc < 18.5 ~ "maigreur", imc >= 18.5 & imc < 25 ~ "normal", imc >= 25 & imc < 30 ~ "surpoids", imc >= 30 & imc < 35 ~ "obésité modérée", imc >= 35 & imc < 40 ~ "obésité sévère", imc >= 40 ~ "obésité morbide" )) %>% select(name,imc, imc_classe)
# A tibble: 87 x 3 name imc imc_classe <chr> <dbl> <chr> 1 Luke Skywalker 26.0 surpoids 2 C-3PO 26.9 surpoids # ... with 85 more rows
mutate()
comme armes de "construction massive"
Afin d'appliquer des traitements en masse :
mutate_all()
mutate_at()
mutate_if()
mutate_if()
starwars %>% mutate_if(is.numeric, scale)
# A tibble: 87 x 13 name height mass hair_color skin_color eye_color <chr> <dbl> <dbl> <chr> <chr> <chr> 1 Luke Skywalker -0.06781696 -0.1198643 blond fair blue 2 C-3PO -0.21161731 -0.1316667 <NA> gold yellow # ... with 85 more rows, and 7 more variables: birth_year <dbl>, # gender <chr>, homeworld <chr>, species <chr>, films <list>, # vehicles <list>, starships <list>
starwars %>% mutate_if(is.numeric, funs(scale))
# A tibble: 87 x 13 name height mass hair_color skin_color eye_color <chr> <dbl> <dbl> <chr> <chr> <chr> 1 Luke Skywalker -0.06781696 -0.1198643 blond fair blue 2 C-3PO -0.21161731 -0.1316667 <NA> gold yellow # ... with 85 more rows, and 7 more variables: birth_year <dbl>, # gender <chr>, homeworld <chr>, species <chr>, films <list>, # vehicles <list>, starships <list>
starwars %>% mutate_if(is.numeric, funs(scale = scale)) %>% select(ends_with("scale"))
# A tibble: 87 x 3 height_scale mass_scale birth_year_scale <dbl> <dbl> <dbl> 1 -0.06781696 -0.1198643 -0.4432379 2 -0.21161731 -0.1316667 0.1579589 # ... with 85 more rows
" "
starwars %>% mutate_at(c("gender","homeworld","species"), as.factor) %>% select(gender:species)
# A tibble: 87 x 3 gender homeworld species <fctr> <fctr> <fctr> 1 male Tatooine Human 2 NA Tatooine Droid # ... with 85 more rows
équivaut à :
starwars %>% mutate_at(vars(gender:species), as.factor) %>% select(gender:species)
# A tibble: 87 x 3 gender homeworld species <fctr> <fctr> <fctr> 1 male Tatooine Human 2 NA Tatooine Droid # ... with 85 more rows
lag()
, lead()
cumsum()
, cumprod()
…dense_rank()
, min_rank()
…+
, -
, *
, et aussi..>
, <
,<=
, >=
ifelse()
, case_when()
, coalesce()
, na_if()
…summarise()
mean()
, median()
n()
qui ne prend JAMAIS de paramètres
var()
, sd()
, IQR()
quantile()
, min()
, max()
summarise()
starwars %>% summarise(taille_moyenne = mean(!is.na(height)), taille_variance = var(!is.na(height)), effectif = n(), nombre_manquantes = sum(is.na(height)), effectif_calcul = effectif - nombre_manquantes)
# A tibble: 1 x 5 taille_moyenne taille_variance effectif nombre_manquantes <dbl> <dbl> <int> <int> 1 0.9310345 0.06495589 87 6 # ... with 1 more variables: effectif_calcul <int>
starwars %>% summarise_if(is.numeric, funs(moyenne = mean(!is.na(.)), variance = var(!is.na(.)), effectif = n()))
# A tibble: 1 x 9 height_moyenne mass_moyenne birth_year_moyenne height_variance <dbl> <dbl> <dbl> <dbl> 1 0.9310345 0.6781609 0.4942529 0.06495589 # ... with 5 more variables: mass_variance <dbl>, # birth_year_variance <dbl>, height_effectif <int>, mass_effectif <int>, # birth_year_effectif <int>
starwars %>% mutate_if(is.character,as.factor) %>% summarise_at(c("gender","species"), nlevels)
# A tibble: 1 x 2 gender species <int> <int> 1 4 37
group_by()
Le complément circonstanciel est une fonction, un groupe de mots qui indique les circonstances dans lesquelles se réalise l'action du verbe
… c'est la combinaison de group_by()
+ summarise()
starwars %>% group_by(species) %>% summarise(masse_moyenne = mean(mass,na.rm=TRUE), taille_moyenne = mean(height,na.rm=TRUE))
# A tibble: 38 x 3 species masse_moyenne taille_moyenne <chr> <dbl> <dbl> 1 Aleena 15.00 79.0000 2 Besalisk 102.00 198.0000 3 Cerean 82.00 198.0000 4 Chagrian NaN 196.0000 5 Clawdite 55.00 168.0000 6 Droid 69.75 140.0000 7 Dug 40.00 112.0000 8 Ewok 20.00 88.0000 9 Geonosian 80.00 183.0000 10 Gungan 74.00 208.6667 # ... with 28 more rows
starwars %>% group_by(species) %>% summarise(masse_moyenne = mean(!is.na(mass)), taille_moyenne = mean(!is.na(height)), effectif_calcul = n()) %>% filter(effectif_calcul > 1) %>% arrange(desc(effectif_calcul))
# A tibble: 9 x 4 species masse_moyenne taille_moyenne effectif_calcul <chr> <dbl> <dbl> <int> 1 Human 0.6285714 0.8857143 35 2 Droid 0.8000000 0.8000000 5 # ... with 7 more rows
count()
et tally()
starwars %>% group_by(gender) %>% tally()
# A tibble: 5 x 2 gender n <chr> <int> 1 female 19 2 hermaphrodite 1 3 male 62 4 none 2 5 <NA> 3
équivaut à :
starwars %>% count(gender)
# A tibble: 5 x 2 gender n <chr> <int> 1 female 19 2 hermaphrodite 1 3 male 62 4 none 2 5 <NA> 3
prenoms
bind_cols()
pour remplacer cbind()
left_join()
, right_join()
, inner_join()
, full_join()
, anti_join()
…apprentissage
et validation
apprentisage <- starwars %>% select(-(films:starships)) %>% sample_frac(0.7) validation <- starwars %>% select(-(films:starships))%>% anti_join(apprentisage) dim(validation)
[1] 26 10
dim(apprentisage)
[1] 61 10
bind_rows()
pour remplacer rbind()
: nous épargne les galères de facteurs
intersect(x,y)
: observations présentes en x et en y
setdiff(x,y)
: observations présentes en x et absentes en y
union()
: observations présentes en x ou y (union_all()
conserve les doublons)
prenoms
La recette pour nettoyer ses données :
library(tidyr)
gather
experience <- data.frame(id = c(1,2,3,4), sexe = c("F","M","M","F"), contrôle = c(7.9,6.3,9.5,11.5), traitement = c(12.3,10.6,13.1,13.4)) experience %>% gather(key = condition, value = glycémie,contrôle:traitement)
id sexe condition glycémie 1 1 F contrôle 7.9 2 2 M contrôle 6.3 3 3 M contrôle 9.5 4 4 F contrôle 11.5 5 1 F traitement 12.3 6 2 M traitement 10.6 7 3 M traitement 13.1 8 4 F traitement 13.4
spread
temp_europ <- data.frame( Ville = c(rep("Amsterdam",3),rep("Lisbonne",3)), Mois = c(rep(c("Janvier", "Février", "Mars"),2)), Température = c(2.9,2.5,5.7, 10.5,11.3, 12.8)) temp_europ
Ville Mois Température 1 Amsterdam Janvier 2.9 2 Amsterdam Février 2.5 3 Amsterdam Mars 5.7 4 Lisbonne Janvier 10.5 5 Lisbonne Février 11.3 6 Lisbonne Mars 12.8
temp_europ %>% spread(key = Mois, value = Température)
Ville Février Janvier Mars 1 Amsterdam 2.5 2.9 5.7 2 Lisbonne 11.3 10.5 12.8
starwars %>% group_by(gender, species) %>% tally() %>% spread(key = species, value = n, fill = 0)
# A tibble: 5 x 39 # Groups: gender [5] gender Aleena Besalisk Cerean Chagrian Clawdite Droid Dug Ewok * <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> 1 female 0 0 0 0 1 0 0 0 2 hermaphrodite 0 0 0 0 0 0 0 0 3 male 1 1 1 1 0 0 1 1 4 none 0 0 0 0 0 2 0 0 5 <NA> 0 0 0 0 0 3 0 0 # ... with 30 more variables: Geonosian <dbl>, Gungan <dbl>, Human <dbl>, # Hutt <dbl>, Iktotchi <dbl>, Kaleesh <dbl>, Kaminoan <dbl>, `Kel # Dor` <dbl>, Mirialan <dbl>, `Mon Calamari` <dbl>, Muun <dbl>, # Nautolan <dbl>, Neimodian <dbl>, `Pau'an` <dbl>, Quermian <dbl>, # Rodian <dbl>, Skakoan <dbl>, Sullustan <dbl>, Tholothian <dbl>, # Togruta <dbl>, Toong <dbl>, Toydarian <dbl>, Trandoshan <dbl>, # `Twi'lek` <dbl>, Vulptereen <dbl>, Wookiee <dbl>, Xexto <dbl>, `Yoda's # species` <dbl>, Zabrak <dbl>, `<NA>` <dbl>
prenoms
fill()
permet de compléter les NA
dataset <- data.frame(id = 1:6,annee = c("2014",NA,NA,"2015",NA,NA)) dataset
id annee 1 1 2014 2 2 <NA> 3 3 <NA> 4 4 2015 5 5 <NA> 6 6 <NA>
dataset %>% fill(annee)
id annee 1 1 2014 2 2 2014 3 3 2014 4 4 2015 5 5 2015 6 6 2015
separate()
permet de séparer une colonne en plusieurs colonnes
dataset <- data.frame(id = 1:6, tension = c("12/8","12/7","14/4","18/10","13/8","12/8")) dataset
id tension 1 1 12/8 2 2 12/7 3 3 14/4 4 4 18/10 5 5 13/8 6 6 12/8
dataset %>% separate(tension,into = c("PAS","PAD"),sep = "/",remove = TRUE)
id PAS PAD 1 1 12 8 2 2 12 7 3 3 14 4 4 4 18 10 5 5 13 8 6 6 12 8
complete()
ajoute les combinaisons manquantes
dataset <- data.frame(id = 1:6, annee = c("2012","2012","2013","2014","2014","2014"), mois = month.abb[c(1,3,2,1,2,3)], temperature = c(12,11,15,16,18,10)) dataset
id annee mois temperature 1 1 2012 Jan 12 2 2 2012 Mar 11 3 3 2013 Feb 15 4 4 2014 Jan 16 5 5 2014 Feb 18 6 6 2014 Mar 10
dataset %>% complete(annee, mois)
# A tibble: 9 x 4 annee mois id temperature <fctr> <fctr> <int> <dbl> 1 2012 Feb NA NA 2 2012 Jan 1 12 3 2012 Mar 2 11 4 2013 Feb 3 15 5 2013 Jan NA NA 6 2013 Mar NA NA 7 2014 Feb 5 18 8 2014 Jan 4 16 9 2014 Mar 6 10
Autres packages utiles du tidyverse :
{readr}
: l'import réussi des données
{purrr}
: programmation fonctionnelle
{lubridate}
: manipulation des dates
{forcats}
: manipulation des facteurs
{roxygen2}
et devtools : mettre le tout en package