Deel 5
Enkele typische statistische analyses in R
03/05/2022
cor( )
functieLet op voor NA's!!
library(palmerpenguins)data("penguins")cor(penguins$flipper_length_mm, penguins$body_mass_g)
## [1] NA
cor(penguins$flipper_length_mm, penguins$body_mass_g, use = "complete.obs")
## [1] 0.8712018
cor.test( )
functiecor.test(penguins$flipper_length_mm, penguins$body_mass_g)
## ## Pearson's product-moment correlation## ## data: penguins$flipper_length_mm and penguins$body_mass_g## t = 32.722, df = 340, p-value < 2.2e-16## alternative hypothesis: true correlation is not equal to 0## 95 percent confidence interval:## 0.843041 0.894599## sample estimates:## cor ## 0.8712018
broom
pakketPakket in tidyverse
Output van verschillende statistische methoden op een gelijkaardige wijze omzetten naar tidy format!
tidy()
: alle belangrijke output als tabel meegeven
library(broom)C <- cor.test(penguins$flipper_length_mm, penguins$body_mass_g)tidy(C)
## # A tibble: 1 × 8## estimate statistic p.value parameter conf.low conf.high method alternative## <dbl> <dbl> <dbl> <int> <dbl> <dbl> <chr> <chr> ## 1 0.871 32.7 4.37e-107 340 0.843 0.895 Pearson… two.sided
penguins %>% select( bill_depth_mm, bill_length_mm, flipper_length_mm, body_mass_g ) %>% cor(use = "pairwise.complete.obs")
## bill_depth_mm bill_length_mm flipper_length_mm body_mass_g## bill_depth_mm 1.0000000 -0.2350529 -0.5838512 -0.4719156## bill_length_mm -0.2350529 1.0000000 0.6561813 0.5951098## flipper_length_mm -0.5838512 0.6561813 1.0000000 0.8712018## body_mass_g -0.4719156 0.5951098 0.8712018 1.0000000
Pakket: GGally
Functie: ggpairs()
Resultaat: een ggplot object...
library(GGally)penguins %>% select( species, bill_depth_mm, bill_length_mm, flipper_length_mm, body_mass_g ) %>% ggpairs( columns = c( "flipper_length_mm", "body_mass_g", "bill_length_mm", "bill_depth_mm") )
penguins %>% select( species, bill_depth_mm, bill_length_mm, flipper_length_mm, body_mass_g ) %>% ggpairs( aes(color = species), columns = c( "flipper_length_mm", "body_mass_g", "bill_length_mm", "bill_depth_mm")) + scale_colour_manual( values = c("darkorange","purple","cyan4")) + scale_fill_manual( values = c("darkorange","purple","cyan4"))
lm()
Model1 <- lm(flipper_length_mm ~ body_mass_g, data = penguins)
summary()
summary(Model1)
## ## Call:## lm(formula = flipper_length_mm ~ body_mass_g, data = penguins)## ## Residuals:## Min 1Q Median 3Q Max ## -23.7626 -4.9138 0.9891 5.1166 16.6392 ## ## Coefficients:## Estimate Std. Error t value Pr(>|t|) ## (Intercept) 1.367e+02 1.997e+00 68.47 <2e-16 ***## body_mass_g 1.528e-02 4.668e-04 32.72 <2e-16 ***## ---## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1## ## Residual standard error: 6.913 on 340 degrees of freedom## (2 observations deleted due to missingness)## Multiple R-squared: 0.759, Adjusted R-squared: 0.7583 ## F-statistic: 1071 on 1 and 340 DF, p-value: < 2.2e-16
broom
pakketFunctie: tidy()
Resultaat: tidy dataset met informatie over schattingen
tidy(Model1, conf.int = TRUE, conf.level = .90)
## # A tibble: 2 × 7## term estimate std.error statistic p.value conf.low conf.high## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>## 1 (Intercept) 137. 2.00 68.5 5.71e-201 133. 140. ## 2 body_mass_g 0.0153 0.000467 32.7 4.37e-107 0.0145 0.0160
broom
pakketFunctie: glance()
Resultaat: tidy dataset met model fit gegevens
glance(Model1)
## # A tibble: 1 × 12## r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>## 1 0.759 0.758 6.91 1071. 4.37e-107 1 -1146. 2297. 2309.## # … with 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
broom
pakketFunctie: augment()
Resultaat: aan de originele data extra informatie toevoegen op basis van het model (geschatte waarden; residuelen; ...)
augment(Model1)
## # A tibble: 342 × 9## .rownames flipper_length_mm body_mass_g .fitted .resid .hat .sigma .cooksd## <chr> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl>## 1 1 181 3750 194. -13.0 0.00385 6.89 6.88e-3## 2 2 186 3800 195. -8.78 0.00366 6.91 2.97e-3## 3 3 195 3250 186. 8.62 0.00705 6.91 5.57e-3## 4 5 193 3450 189. 3.57 0.00550 6.92 7.41e-4## 5 6 190 3650 192. -2.49 0.00431 6.92 2.81e-4## 6 7 181 3625 192. -11.1 0.00444 6.90 5.78e-3## 7 8 195 4675 208. -13.1 0.00395 6.89 7.19e-3## 8 9 193 3475 190. 3.19 0.00533 6.92 5.73e-4## 9 10 190 4250 202. -11.7 0.00293 6.89 4.19e-3## 10 11 186 3300 187. -1.14 0.00663 6.92 9.14e-5## # … with 332 more rows, and 1 more variable: .std.resid <dbl>
broom
pakketFunctie: augment()
+ geom_histogram()
Resultaat: Residuelen normaal verdeeld?
augment(Model1) %>% select(.resid) %>% ggplot( aes( x = .resid ) ) + geom_histogram() + theme_minimal() + labs( title = "Model1", subtitle = "Verdeling van de residuelen" ) + theme(plot.title.position = "plot")
broom
pakketFunctie: augment()
+ geom_histogram()
Resultaat: Residuelen normaal verdeeld?
augment(Model1) %>% select(.fitted, .std.resid) %>% ggplot( aes( x = .fitted, y = .std.resid ) ) + geom_point() + theme_minimal() + labs( title = "Model1", subtitle = "Fitted values vs residuals" ) + geom_hline(yintercept = 0) + theme(plot.title.position = "plot")
broom
pakketFunctie: augment()
+ geom_histogram()
Resultaat: Residuelen normaal verdeeld?
augment(Model1) %>% select(.fitted, .std.resid) %>% ggplot( aes( x = .fitted, y = .std.resid ) ) + geom_point() + theme_minimal() + labs( title = "Model1", subtitle = "Fitted values vs residuals" ) + geom_hline(yintercept = 0) + theme(plot.title.position = "plot")
Model2 <- lm( flipper_length_mm ~ body_mass_g + sex + species, data = penguins)tidy(Model2)
## # A tibble: 5 × 5## term estimate std.error statistic p.value## <chr> <dbl> <dbl> <dbl> <dbl>## 1 (Intercept) 165. 3.18 51.7 1.01e-159## 2 body_mass_g 0.00655 0.000931 7.04 1.15e- 11## 3 sexmale 2.48 0.854 2.90 3.97e- 3## 4 speciesChinstrap 5.54 0.785 7.06 9.92e- 12## 5 speciesGentoo 18.0 1.44 12.5 1.46e- 29
Model fit informatie van meerdere modellen op een rijtje zetten:
M1_info <- glance(Model1) %>% select(r.squared, AIC, BIC)M2_info <- glance(Model2) %>% select(r.squared, AIC, BIC)M1_info %>% rbind(M2_info)
## # A tibble: 2 × 3## r.squared AIC BIC## <dbl> <dbl> <dbl>## 1 0.759 2297. 2309.## 2 0.856 2068. 2091.
augment(Model2) %>% ggplot() + geom_point( aes(x = body_mass_g, y = flipper_length_mm, color = sex), alpha = .6 ) + geom_line( aes( x = body_mass_g, y = .fitted, color = sex, ), size = 1.5 ) + facet_wrap(.~species) + theme_minimal()
Keyboard shortcuts
↑, ←, Pg Up, k | Go to previous slide |
↓, →, Pg Dn, Space, j | Go to next slide |
Home | Go to first slide |
End | Go to last slide |
Number + Return | Go to specific slide |
b / m / f | Toggle blackout / mirrored / fullscreen mode |
c | Clone slideshow |
p | Toggle presenter mode |
t | Restart the presentation timer |
?, h | Toggle this help |
Esc | Back to slideshow |