+ - 0:00:00
Notes for current slide
Notes for next slide

INPUT - OUTPUT Workshop R & RStudio

Deel 5

Enkele typische statistische analyses in R

Sven De Maeyer

03/05/2022

1 / 21

Overzicht

2 / 21

1. Correlatie

3 / 21

De cor( ) functie

Let op voor NA's!!

library(palmerpenguins)
data("penguins")
cor(penguins$flipper_length_mm,
penguins$body_mass_g)
## [1] NA
cor(penguins$flipper_length_mm,
penguins$body_mass_g,
use = "complete.obs")
## [1] 0.8712018
4 / 21

Statistische test met cor.test( ) functie

cor.test(penguins$flipper_length_mm,
penguins$body_mass_g)
##
## Pearson's product-moment correlation
##
## data: penguins$flipper_length_mm and penguins$body_mass_g
## t = 32.722, df = 340, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.843041 0.894599
## sample estimates:
## cor
## 0.8712018
5 / 21

broom pakket

  • Pakket in tidyverse

  • Output van verschillende statistische methoden op een gelijkaardige wijze omzetten naar tidy format!

tidy(): alle belangrijke output als tabel meegeven

library(broom)
C <- cor.test(penguins$flipper_length_mm,
penguins$body_mass_g)
tidy(C)
## # A tibble: 1 × 8
## estimate statistic p.value parameter conf.low conf.high method alternative
## <dbl> <dbl> <dbl> <int> <dbl> <dbl> <chr> <chr>
## 1 0.871 32.7 4.37e-107 340 0.843 0.895 Pearson… two.sided
6 / 21

Correlaties tussen meerdere variabelen

penguins %>%
select(
bill_depth_mm,
bill_length_mm,
flipper_length_mm,
body_mass_g
) %>%
cor(use = "pairwise.complete.obs")
## bill_depth_mm bill_length_mm flipper_length_mm body_mass_g
## bill_depth_mm 1.0000000 -0.2350529 -0.5838512 -0.4719156
## bill_length_mm -0.2350529 1.0000000 0.6561813 0.5951098
## flipper_length_mm -0.5838512 0.6561813 1.0000000 0.8712018
## body_mass_g -0.4719156 0.5951098 0.8712018 1.0000000
7 / 21

Correlaties tussen meerdere variabelen visueel

Pakket: GGally

Functie: ggpairs()

Resultaat: een ggplot object...

library(GGally)
penguins %>%
select(
species,
bill_depth_mm,
bill_length_mm,
flipper_length_mm,
body_mass_g
) %>%
ggpairs(
columns = c(
"flipper_length_mm", "body_mass_g",
"bill_length_mm", "bill_depth_mm")
)

8 / 21

Correlaties tussen meerdere variabelen visueel MET KLEUR!

penguins %>%
select(
species,
bill_depth_mm,
bill_length_mm,
flipper_length_mm,
body_mass_g
) %>%
ggpairs(
aes(color = species),
columns = c(
"flipper_length_mm", "body_mass_g",
"bill_length_mm", "bill_depth_mm")) +
scale_colour_manual(
values = c("darkorange","purple","cyan4")) +
scale_fill_manual(
values = c("darkorange","purple","cyan4"))

9 / 21

2. Lineaire regressie

10 / 21

lm()

Model1 <- lm(flipper_length_mm ~ body_mass_g,
data = penguins)
11 / 21

Model resultaten bekijken met summary()

summary(Model1)
##
## Call:
## lm(formula = flipper_length_mm ~ body_mass_g, data = penguins)
##
## Residuals:
## Min 1Q Median 3Q Max
## -23.7626 -4.9138 0.9891 5.1166 16.6392
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.367e+02 1.997e+00 68.47 <2e-16 ***
## body_mass_g 1.528e-02 4.668e-04 32.72 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.913 on 340 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.759, Adjusted R-squared: 0.7583
## F-statistic: 1071 on 1 and 340 DF, p-value: < 2.2e-16
12 / 21

broom pakket

Functie: tidy()

Resultaat: tidy dataset met informatie over schattingen

tidy(Model1,
conf.int = TRUE,
conf.level = .90)
## # A tibble: 2 × 7
## term estimate std.error statistic p.value conf.low conf.high
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 137. 2.00 68.5 5.71e-201 133. 140.
## 2 body_mass_g 0.0153 0.000467 32.7 4.37e-107 0.0145 0.0160
13 / 21

broom pakket

Functie: glance()

Resultaat: tidy dataset met model fit gegevens

glance(Model1)
## # A tibble: 1 × 12
## r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.759 0.758 6.91 1071. 4.37e-107 1 -1146. 2297. 2309.
## # … with 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
14 / 21

broom pakket

Functie: augment()

Resultaat: aan de originele data extra informatie toevoegen op basis van het model (geschatte waarden; residuelen; ...)

augment(Model1)
## # A tibble: 342 × 9
## .rownames flipper_length_mm body_mass_g .fitted .resid .hat .sigma .cooksd
## <chr> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 181 3750 194. -13.0 0.00385 6.89 6.88e-3
## 2 2 186 3800 195. -8.78 0.00366 6.91 2.97e-3
## 3 3 195 3250 186. 8.62 0.00705 6.91 5.57e-3
## 4 5 193 3450 189. 3.57 0.00550 6.92 7.41e-4
## 5 6 190 3650 192. -2.49 0.00431 6.92 2.81e-4
## 6 7 181 3625 192. -11.1 0.00444 6.90 5.78e-3
## 7 8 195 4675 208. -13.1 0.00395 6.89 7.19e-3
## 8 9 193 3475 190. 3.19 0.00533 6.92 5.73e-4
## 9 10 190 4250 202. -11.7 0.00293 6.89 4.19e-3
## 10 11 186 3300 187. -1.14 0.00663 6.92 9.14e-5
## # … with 332 more rows, and 1 more variable: .std.resid <dbl>
15 / 21

broom pakket

Functie: augment() + geom_histogram()

Resultaat: Residuelen normaal verdeeld?

augment(Model1) %>%
select(.resid) %>%
ggplot(
aes(
x = .resid
)
) +
geom_histogram() +
theme_minimal() +
labs(
title = "Model1",
subtitle = "Verdeling van de residuelen"
) +
theme(plot.title.position = "plot")

16 / 21

broom pakket

Functie: augment() + geom_histogram()

Resultaat: Residuelen normaal verdeeld?

augment(Model1) %>%
select(.fitted, .std.resid) %>%
ggplot(
aes(
x = .fitted,
y = .std.resid
)
) +
geom_point() +
theme_minimal() +
labs(
title = "Model1",
subtitle = "Fitted values vs residuals"
) + geom_hline(yintercept = 0) +
theme(plot.title.position = "plot")

17 / 21

broom pakket

Functie: augment() + geom_histogram()

Resultaat: Residuelen normaal verdeeld?

augment(Model1) %>%
select(.fitted, .std.resid) %>%
ggplot(
aes(
x = .fitted,
y = .std.resid
)
) +
geom_point() +
theme_minimal() +
labs(
title = "Model1",
subtitle = "Fitted values vs residuals"
) + geom_hline(yintercept = 0) +
theme(plot.title.position = "plot")

18 / 21

Lineair model met meerdere voorspellers

Model2 <- lm(
flipper_length_mm ~ body_mass_g + sex + species,
data = penguins
)
tidy(Model2)
## # A tibble: 5 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 165. 3.18 51.7 1.01e-159
## 2 body_mass_g 0.00655 0.000931 7.04 1.15e- 11
## 3 sexmale 2.48 0.854 2.90 3.97e- 3
## 4 speciesChinstrap 5.54 0.785 7.06 9.92e- 12
## 5 speciesGentoo 18.0 1.44 12.5 1.46e- 29
19 / 21

Modellen vergelijken

Model fit informatie van meerdere modellen op een rijtje zetten:

M1_info <- glance(Model1) %>% select(r.squared, AIC, BIC)
M2_info <- glance(Model2) %>% select(r.squared, AIC, BIC)
M1_info %>% rbind(M2_info)
## # A tibble: 2 × 3
## r.squared AIC BIC
## <dbl> <dbl> <dbl>
## 1 0.759 2297. 2309.
## 2 0.856 2068. 2091.
20 / 21

Model visualiseren

augment(Model2) %>%
ggplot() +
geom_point(
aes(x = body_mass_g,
y = flipper_length_mm,
color = sex),
alpha = .6
) +
geom_line(
aes(
x = body_mass_g,
y = .fitted,
color = sex,
),
size = 1.5
) +
facet_wrap(.~species) +
theme_minimal()

21 / 21

Overzicht

2 / 21
Paused

Help

Keyboard shortcuts

, , Pg Up, k Go to previous slide
, , Pg Dn, Space, j Go to next slide
Home Go to first slide
End Go to last slide
Number + Return Go to specific slide
b / m / f Toggle blackout / mirrored / fullscreen mode
c Clone slideshow
p Toggle presenter mode
t Restart the presentation timer
?, h Toggle this help
Esc Back to slideshow