8 Descriptive statistics and correlations
8.1 Descriptives for all variables
The function summary(data_frame)
returns descriptive statistics for all variables in a dataset. For numeric variables, the minimum, maximum, quartiles, median, and mean values are returned, for factors the frequencies of the factor levels. In addition, the number of missing values for both variable types is displayed.
We first import our sample data set and assign the variables their correct scale level (i.e., defining factors as factors). Then we save the data set as an .Rdata
file in our data folder, so that we can load it next time without having to do all the import work again.
library(tidyverse)
library(haven)
# Import the SPSS data file with haven; variables that carry value labels
# arrive as labelled columns (shown as <dbl+lbl> in the tibble print-out)
adolescents <- read_sav("data/beispieldaten.sav")
# Look at the data set (printing the tibble shows the first rows and the column types)
adolescents
#> # A tibble: 286 x 98
#> ID westost geschlecht alter swk1 swk2 swk3 swk4 swk5 swk6 swk7 swk8
#> <dbl> <dbl+l> <dbl+lbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 0 [Wes… 1 [weibli… 13 2 5 6 4 7 5 5 6
#> 2 2 0 [Wes… 0 [männli… 14 4 6 4 4 6 5 5 6
#> 3 10 0 [Wes… 1 [weibli… 14 5 4 5 6 5 7 7 6
#> 4 11 0 [Wes… 1 [weibli… 14 3 6 6 5 7 7 6 6
#> 5 12 0 [Wes… 1 [weibli… 14 5 6 6 5 7 6 5 5
#> 6 14 0 [Wes… 0 [männli… 14 3 5 5 4 5 7 5 6
#> # … with 280 more rows, and 86 more variables: swk9 <dbl>, swk10 <dbl>,
#> # swk11 <dbl>, swk12 <dbl>, swk13 <dbl>, swk14 <dbl>, swk15 <dbl>,
#> # swk16 <dbl>, swk17 <dbl>, swk18 <dbl>, swk19 <dbl>, swk20 <dbl>,
#> # swk21 <dbl>, swk22 <dbl>, swk23 <dbl>, swk24 <dbl>, swk25 <dbl>,
#> # swk26 <dbl>, swk27 <dbl>, swk28 <dbl>, swk29 <dbl>, swk30 <dbl>,
#> # swk31 <dbl>, swk32 <dbl>, swk33 <dbl>, swk34 <dbl>, swk35 <dbl>,
#> # swk36 <dbl>, unteltern1 <dbl>, unteltern2 <dbl>, unteltern3 <dbl>,
#> # unteltern4 <dbl>, unteltern5 <dbl>, unteltern6 <dbl>, untfreunde1 <dbl>,
#> # untfreunde2 <dbl>, untfreunde3 <dbl>, untfreunde4 <dbl>, untfreunde5 <dbl>,
#> # untfreunde6 <dbl>, leben1 <dbl>, leben2 <dbl>, leben3 <dbl>, leben4 <dbl>,
#> # leben5 <dbl>, leben6 <dbl>, leben7 <dbl>, leben8 <dbl>, leben9 <dbl>,
#> # leben10 <dbl>, leben11 <dbl>, stress1 <dbl>, stress2 <dbl>, stress3 <dbl>,
#> # stress4 <dbl>, stress5 <dbl>, stress6 <dbl>, stress7 <dbl>, stress8 <dbl>,
#> # stress9 <dbl>, stress10 <dbl>, stress11 <dbl>, stress12 <dbl>,
#> # Deutschnote <dbl>, Mathenote <dbl+lbl>, Fremdsprachenote <dbl+lbl>,
#> # Gesamtnote <dbl>, bildung_vater <dbl+lbl>, bildung_mutter <dbl+lbl>,
#> # bildung_vater_binaer <dbl+lbl>, bildung_mutter_binaer <dbl+lbl>,
#> # swk_neueslernen <dbl>, swk_lernregulation <dbl>, swk_motivation <dbl>,
#> # swk_durchsetzung <dbl>, swk_sozialkomp <dbl>, swk_beziehung <dbl>,
#> # unt_eltern <dbl>, unt_freunde <dbl>, leben_selbst <dbl>,
#> # leben_familie <dbl>, leben_schule <dbl>, leben_freunde <dbl>,
#> # leben_gesamt <dbl>, stress_somatisch <dbl>, stress_psychisch <dbl>
# Define factors in a single mutate() step:
# - ID is a plain numeric identifier and is turned into a factor with as.factor()
# - the labelled SPSS variables are converted with as_factor() so that the
#   factor levels are taken from the value labels (levels = "default")
adolescents <- adolescents %>%
  mutate(
    ID = as.factor(ID),
    westost = as_factor(westost, levels = "default"),
    geschlecht = as_factor(geschlecht, levels = "default"),
    bildung_vater = as_factor(bildung_vater, levels = "default"),
    bildung_mutter = as_factor(bildung_mutter, levels = "default"),
    bildung_vater_binaer = as_factor(bildung_vater_binaer, levels = "default"),
    bildung_mutter_binaer = as_factor(bildung_mutter_binaer, levels = "default")
  )
# The original level labels of `bildung_vater` and `bildung_mutter` are long,
# so both factors get the same set of short level names (new = "old").
adolescents <- adolescents %>%
  mutate(
    bildung_vater = fct_recode(
      bildung_vater,
      Hauptschule = "Hauptschulabschluss oder niedriger",
      Realschule = "Realschulabschluss (mittlere Reife)",
      Abitur = "Fachabitur, Abitur",
      Hochschule = "Fachhochschulabschluss, Universitätsabschluss"
    ),
    bildung_mutter = fct_recode(
      bildung_mutter,
      Hauptschule = "Hauptschulabschluss oder niedriger",
      Realschule = "Realschulabschluss (mittlere Reife)",
      Abitur = "Fachabitur, Abitur",
      Hochschule = "Fachhochschulabschluss, Universitätsabschluss"
    )
  )
# Save as Rdata file, so that the prepared data set can be loaded next time
# without repeating the import and the factor definitions
save(adolescents, file = "data/adolescents.Rdata")
For showing the use of the summary()
function we first get rid of the single item variables (keeping scale variables only) to avoid too much output:
# Drop the single-item variables (the column range swk1 ... stress12) so that
# summary() only has to report the remaining variables
adolescents_scales <- adolescents %>%
  select(-(swk1:stress12))
summary(adolescents_scales)
#> ID westost geschlecht alter SES
#> 1 : 1 West:143 männlich:152 Min. :13.0 Min. :1.000
#> 2 : 1 Ost :143 weiblich:134 1st Qu.:14.0 1st Qu.:3.000
#> 10 : 1 Median :15.0 Median :3.000
#> 11 : 1 Mean :14.7 Mean :3.391
#> 12 : 1 3rd Qu.:15.0 3rd Qu.:4.000
#> 14 : 1 Max. :17.0 Max. :5.000
#> (Other):280 NA's :2
#> Deutsch Mathe Fremdsprache Gesamtnote
#> Min. :2.000 Min. :2.000 Min. :2.000 Min. :3.000
#> 1st Qu.:4.000 1st Qu.:4.000 1st Qu.:4.000 1st Qu.:4.000
#> Median :4.000 Median :4.000 Median :4.000 Median :4.500
#> Mean :4.261 Mean :4.173 Mean :4.309 Mean :4.462
#> 3rd Qu.:5.000 3rd Qu.:5.000 3rd Qu.:5.000 3rd Qu.:5.000
#> Max. :6.000 Max. :6.000 Max. :6.000 Max. :6.000
#> NA's :3 NA's :2 NA's :11 NA's :6
#> bildung_vater bildung_mutter bildung_vater_binaer
#> Hauptschule:41 Hauptschule:43 niedrig:135
#> Realschule :94 Realschule :95 hoch :134
#> Abitur :41 Abitur :50 NA's : 17
#> Hochschule :93 Hochschule :84
#> NA's :17 NA's :14
#>
#>
#> bildung_mutter_binaer swk_neueslernen swk_lernregulation swk_motivation
#> niedrig:138 Min. :2.429 Min. :2.875 Min. :3.200
#> hoch :134 1st Qu.:4.571 1st Qu.:4.750 1st Qu.:5.000
#> NA's : 14 Median :5.143 Median :5.250 Median :5.400
#> Mean :5.098 Mean :5.246 Mean :5.447
#> 3rd Qu.:5.571 3rd Qu.:5.875 3rd Qu.:6.000
#> Max. :7.000 Max. :7.000 Max. :7.000
#>
#> swk_durchsetzung swk_sozialkomp swk_beziehung unt_eltern
#> Min. :1.000 Min. :1.571 Min. :3.333 Min. :1.500
#> 1st Qu.:4.667 1st Qu.:4.857 1st Qu.:5.167 1st Qu.:5.333
#> Median :5.333 Median :5.381 Median :5.667 Median :6.167
#> Mean :5.285 Mean :5.280 Mean :5.616 Mean :5.872
#> 3rd Qu.:6.000 3rd Qu.:5.857 3rd Qu.:6.167 3rd Qu.:6.667
#> Max. :7.000 Max. :7.000 Max. :7.000 Max. :7.000
#> NA's :1 NA's :1
#> unt_freunde leben_selbst leben_familie leben_schule
#> Min. :2.000 Min. :1.000 Min. :1.000 Min. :1.333
#> 1st Qu.:5.500 1st Qu.:5.000 1st Qu.:5.333 1st Qu.:4.000
#> Median :6.000 Median :5.500 Median :6.000 Median :4.667
#> Mean :5.928 Mean :5.433 Mean :5.760 Mean :4.670
#> 3rd Qu.:6.667 3rd Qu.:6.000 3rd Qu.:6.667 3rd Qu.:5.333
#> Max. :7.000 Max. :7.000 Max. :7.000 Max. :7.000
#> NA's :1 NA's :1 NA's :2 NA's :1
#> leben_freunde leben_gesamt stress_somatisch stress_psychisch
#> Min. :2.000 Min. :2.909 Min. :1.000 Min. :1.000
#> 1st Qu.:5.500 1st Qu.:5.000 1st Qu.:2.167 1st Qu.:2.167
#> Median :6.000 Median :5.455 Median :3.000 Median :2.917
#> Mean :5.898 Mean :5.417 Mean :3.020 Mean :2.989
#> 3rd Qu.:6.500 3rd Qu.:5.909 3rd Qu.:3.667 3rd Qu.:3.667
#> Max. :7.000 Max. :6.909 Max. :7.000 Max. :6.833
#> NA's :1 NA's :1 NA's :3 NA's :2
#> leben_dich
#> Min. :0.0000
#> 1st Qu.:0.0000
#> Median :1.0000
#> Mean :0.5298
#> 3rd Qu.:1.0000
#> Max. :1.0000
#> NA's :1
8.2 Descriptives for categorical data
8.2.1 Frequencies and contingency tables with table()
Using table()
we get the frequencies of factor levels/groups. This is particularly useful when we are interested in the combined frequencies of several factors as in a contingency table.
# Frequencies for a single factor (absolute counts per factor level;
# missing values are not tabulated by default)
table(adolescents$bildung_vater)
#>
#> Hauptschule Realschule Abitur Hochschule
#> 41 94 41 93
# Contingency table of two factors: the first argument (father's education)
# defines the rows, the second (mother's education) the columns
table(
  adolescents$bildung_vater,
  adolescents$bildung_mutter
)
#>
#> Hauptschule Realschule Abitur Hochschule
#> Hauptschule 23 12 2 3
#> Realschule 12 55 17 10
#> Abitur 5 11 21 4
#> Hochschule 3 16 9 65
With prop.table()
we obtain proportions:
# Cell proportions relative to the grand total (they add up to 1 over the table)
table(adolescents$bildung_vater, adolescents$bildung_mutter) %>%
  prop.table() %>%
  round(2)
#>
#> Hauptschule Realschule Abitur Hochschule
#> Hauptschule 0.09 0.04 0.01 0.01
#> Realschule 0.04 0.21 0.06 0.04
#> Abitur 0.02 0.04 0.08 0.01
#> Hochschule 0.01 0.06 0.03 0.24
# The same as percentages. %>% binds more tightly than *, so the whole
# pipeline is evaluated first and its result is multiplied by 100; the
# parentheses only make that order explicit.
100 * (
  table(adolescents$bildung_vater, adolescents$bildung_mutter) %>%
    prop.table() %>%
    round(4)
)
#>
#> Hauptschule Realschule Abitur Hochschule
#> Hauptschule 8.58 4.48 0.75 1.12
#> Realschule 4.48 20.52 6.34 3.73
#> Abitur 1.87 4.10 7.84 1.49
#> Hochschule 1.12 5.97 3.36 24.25
In most cases we are not interested in the total proportions (which add up to 1 or 100% across the entire table), but in the row-conditional or column-conditional proportions. The former are obtained using the argument margin = 1
, the latter with margin = 2
in the prop.table()
function. Since there is no further argument we can also directly specify the value of the argument:
prop.table(1)
for row-conditional proportions
prop.table(2)
for column-conditional proportions
prop.table()
uses as first argument a table()
object. The unnamed margin argument can thus only be used in combination with the pipe %>%
operator.
# Row-conditional percentages (margin = 1): every row adds up to 100
100 * (
  table(adolescents$bildung_vater, adolescents$bildung_mutter) %>%
    prop.table(1) %>%
    round(4)
)
#>
#> Hauptschule Realschule Abitur Hochschule
#> Hauptschule 57.50 30.00 5.00 7.50
#> Realschule 12.77 58.51 18.09 10.64
#> Abitur 12.20 26.83 51.22 9.76
#> Hochschule 3.23 17.20 9.68 69.89
# Column-conditional percentages (margin = 2): every column adds up to 100
100 * (
  table(adolescents$bildung_vater, adolescents$bildung_mutter) %>%
    prop.table(2) %>%
    round(4)
)
#>
#> Hauptschule Realschule Abitur Hochschule
#> Hauptschule 53.49 12.77 4.08 3.66
#> Realschule 27.91 58.51 34.69 12.20
#> Abitur 11.63 11.70 42.86 4.88
#> Hochschule 6.98 17.02 18.37 79.27
8.2.2 Mode
There is no separate function for calculating the mode (category with the highest frequency) of a categorical variable in R. Either read the mode from the frequency table or help yourself with:
names(which.max(table(adolescents$bildung_vater)))
8.2.3 Descriptive statistics for categorical data with jmv
jamovi offers great functionality for statistical data analysis. It also offers a connection to R via the jmv
package. When running the jamovi program in syntax mode the provided syntax can be copied directly into R, and only the data argument has to be changed from its default. For our example data = data
is changed to data = adolescents
.
# Install jmv only if it is not available yet, so that re-running the script
# does not download and reinstall the package every time
if (!requireNamespace("jmv", quietly = TRUE)) {
  install.packages("jmv")
}
Let’s get some descriptive statistics for our variables bildung_vater
and bildung_mutter
using the jmv
function descriptives()
:
library(jmv)
# Descriptives for the two education factors; freq = TRUE additionally
# requests a frequency table for each of them
descriptives(
  data = adolescents,
  vars = vars(bildung_vater, bildung_mutter),
  freq = TRUE
)
#>
#> DESCRIPTIVES
#>
#> Descriptives
#> ──────────────────────────────────────────────
#> bildung_vater bildung_mutter
#> ──────────────────────────────────────────────
#> N 269 272
#> Missing 17 14
#> Mean
#> Median
#> Minimum
#> Maximum
#> ──────────────────────────────────────────────
#>
#>
#> FREQUENCIES
#>
#> Frequencies of bildung_vater
#> ───────────────────────────────────────────────────────
#> Levels Counts % of Total Cumulative %
#> ───────────────────────────────────────────────────────
#> Hauptschule 41 15.2 15.2
#> Realschule 94 34.9 50.2
#> Abitur 41 15.2 65.4
#> Hochschule 93 34.6 100.0
#> ───────────────────────────────────────────────────────
#>
#>
#> Frequencies of bildung_mutter
#> ───────────────────────────────────────────────────────
#> Levels Counts % of Total Cumulative %
#> ───────────────────────────────────────────────────────
#> Hauptschule 43 15.8 15.8
#> Realschule 95 34.9 50.7
#> Abitur 50 18.4 69.1
#> Hochschule 84 30.9 100.0
#> ───────────────────────────────────────────────────────
Contingency tables can be obtained using contTables()
. This function comes with a \(\chi^2\) test of independence and has a number of further options. Here, we include the effect size Cramér’s V as well as Goodman and Kruskal’s \(\gamma\) (a measure of association for ordinal categorical variables).
# Contingency table of father's by mother's education with association
# measures and all three kinds of percentages
contTables(
  formula = ~ bildung_vater:bildung_mutter,
  data = adolescents,
  phiCra = TRUE, # Phi and Cramer's V (Phi only for 2x2 tables)
  gamma = TRUE,  # Goodman-Kruskal's gamma
  pcRow = TRUE,  # row percentages
  pcCol = TRUE,  # column percentages
  pcTot = TRUE   # total percentages
)
#>
#> CONTINGENCY TABLES
#>
#> Contingency Tables
#> ──────────────────────────────────────────────────────────────────────────────────────────────────
#> bildung_vater Hauptschule Realschule Abitur Hochschule Total
#> ──────────────────────────────────────────────────────────────────────────────────────────────────
#> Hauptschule Observed 23 12 2 3 40
#> % within row 57.5 30.0 5.0 7.5 100.0
#> % within column 53.5 12.8 4.1 3.7 14.9
#> % of total 8.6 4.5 0.7 1.1 14.9
#>
#> Realschule Observed 12 55 17 10 94
#> % within row 12.8 58.5 18.1 10.6 100.0
#> % within column 27.9 58.5 34.7 12.2 35.1
#> % of total 4.5 20.5 6.3 3.7 35.1
#>
#> Abitur Observed 5 11 21 4 41
#> % within row 12.2 26.8 51.2 9.8 100.0
#> % within column 11.6 11.7 42.9 4.9 15.3
#> % of total 1.9 4.1 7.8 1.5 15.3
#>
#> Hochschule Observed 3 16 9 65 93
#> % within row 3.2 17.2 9.7 69.9 100.0
#> % within column 7.0 17.0 18.4 79.3 34.7
#> % of total 1.1 6.0 3.4 24.3 34.7
#>
#> Total Observed 43 94 49 82 268
#> % within row 16.0 35.1 18.3 30.6 100.0
#> % within column 100.0 100.0 100.0 100.0 100.0
#> % of total 16.0 35.1 18.3 30.6 100.0
#> ──────────────────────────────────────────────────────────────────────────────────────────────────
#>
#>
#> χ² Tests
#> ───────────────────────────────
#> Value df p
#> ───────────────────────────────
#> χ² 182 9 < .001
#> N 268
#> ───────────────────────────────
#>
#>
#> Nominal
#> ────────────────────────────
#> Value
#> ────────────────────────────
#> Phi-coefficient NaN
#> Cramer's V 0.475
#> ────────────────────────────
#>
#>
#> Gamma
#> ─────────────────────────────────────────────
#> Gamma Standard Error Lower Upper
#> ─────────────────────────────────────────────
#> 0.693 0.0489 0.597 0.789
#> ─────────────────────────────────────────────
Reduced version with only row percentages:
# Reduced table: no chi-square test, no observed counts, row percentages only
contTables(
  formula = ~ bildung_vater:bildung_mutter,
  data = adolescents,
  chiSq = FALSE,
  obs = FALSE,
  pcRow = TRUE
)
#>
#> CONTINGENCY TABLES
#>
#> Contingency Tables
#> ───────────────────────────────────────────────────────────────────────────────────────────────
#> bildung_vater Hauptschule Realschule Abitur Hochschule Total
#> ───────────────────────────────────────────────────────────────────────────────────────────────
#> Hauptschule % within row 57.5 30.0 5.0 7.5 100.0
#> Realschule % within row 12.8 58.5 18.1 10.6 100.0
#> Abitur % within row 12.2 26.8 51.2 9.8 100.0
#> Hochschule % within row 3.2 17.2 9.7 69.9 100.0
#> Total % within row 16.0 35.1 18.3 30.6 100.0
#> ───────────────────────────────────────────────────────────────────────────────────────────────
#>
#>
#> χ² Tests
#> ──────────────
#> Value
#> ──────────────
#> N 268
#> ──────────────
8.3 Descriptives for ordinal and metric data
8.3.1 Descriptive statistics using base R functions
We already know most of the functions needed for describing the distributional characteristics of ordinal and metric variables:
mean(x, na.rm = FALSE) Arithmetic mean
sd(x, na.rm = FALSE) (Sample) Standard Deviation
var(x, na.rm = FALSE) (Sample) Variance
median(x) Median
quantile(x, probs, type) Quantile of x. probs: vector with probabilities
min(x) Minimum value of x
max(x) Maximum value of x
range(x) x_min and x_max
Examples
Let’s create a vector with sixteen numbers between 1 and 9 and sort it:
# Sixteen example values between 1 and 9, stored in ascending order
x <- sort(c(4, 7, 9, 2, 3, 6, 6, 4, 1, 2, 8, 5, 5, 7, 3, 2))
x
#> [1] 1 2 2 2 3 3 4 4 5 5 6 6 7 7 8 9
By the way, quantile()
has nine different methods (type = 1-9)
to compute the quantiles of a distribution (help('quantile'))
. The default method is type = 7
while the (simpler) method taught in most textbooks is type = 2
:
The tapply()
function can be used to display selected descriptive statistics for the combined levels of several factors. This may be useful for the presentation of means in two-factorial ANOVAs. The generic form is: tapply(df$var1, list(factor1 = df$factor1, factor2 = df$factor2), statistic, na.rm = TRUE)
.
statistic
stands for the desired statistic (only one at a time), e.g. mean
, median
, sum
, sd
.
Let’s display the means of the variable unt_eltern
(perceived support by parents) for the combined factors geschlecht
and westost
:
# Mean of unt_eltern for every combination of geschlecht (rows) and
# westost (columns); the list names become the dimension labels
with(
  adolescents,
  tapply(
    unt_eltern,
    list(Geschlecht = geschlecht, Region = westost),
    mean,
    na.rm = TRUE
  )
) %>%
  round(2)
#> Region
#> Geschlecht West Ost
#> männlich 5.79 6.13
#> weiblich 5.74 5.83
We can do the same using dplyr
:
# Cell means of unt_eltern for each geschlecht x westost combination;
# na.rm = TRUE ignores missing values of unt_eltern only
adolescents %>%
  group_by(geschlecht, westost) %>%
  summarise(mean = mean(unt_eltern, na.rm = TRUE))
#> # A tibble: 4 x 3
#> # Groups: geschlecht [2]
#> geschlecht westost mean
#> <fct> <fct> <dbl>
#> 1 männlich West 5.79
#> 2 männlich Ost 6.13
#> 3 weiblich West 5.74
#> 4 weiblich Ost 5.83
# Caution: drop_na() without arguments removes every case that has a missing
# value in ANY variable of the data set, not only in unt_eltern. The means
# below are therefore based on fewer cases. To avoid this, first create a
# data set that contains only the relevant variables!
adolescents %>%
  group_by(geschlecht, westost) %>%
  drop_na() %>%
  summarise(mean = mean(unt_eltern))
#> # A tibble: 4 x 3
#> # Groups: geschlecht [2]
#> geschlecht westost mean
#> <fct> <fct> <dbl>
#> 1 männlich West 5.71
#> 2 männlich Ost 6.08
#> 3 weiblich West 5.70
#> 4 weiblich Ost 5.83
8.3.2 Descriptive statistics using jmv
In jamovi (jmv
) we use descriptives()
again. Let’s look at unt_eltern
and get descriptive statistics (and plots) for boys and girls separately:
# Descriptives (and plots) of unt_eltern, split by geschlecht
descriptives(
  formula = unt_eltern ~ geschlecht,
  data = adolescents,
  # plots
  hist = TRUE,
  box = TRUE,
  dot = TRUE,
  # statistics
  median = FALSE,
  sd = TRUE,
  variance = TRUE,
  se = TRUE,
  skew = TRUE,
  kurt = TRUE,
  quart = TRUE
)
#>
#> DESCRIPTIVES
#>
#> Descriptives
#> ───────────────────────────────────────────────────
#> geschlecht unt_eltern
#> ───────────────────────────────────────────────────
#> N männlich 151
#> weiblich 134
#> Missing männlich 1
#> weiblich 0
#> Mean männlich 5.94
#> weiblich 5.79
#> Std. error mean männlich 0.0707
#> weiblich 0.0970
#> Standard deviation männlich 0.869
#> weiblich 1.12
#> Variance männlich 0.755
#> weiblich 1.26
#> Minimum männlich 3.33
#> weiblich 1.50
#> Maximum männlich 7.00
#> weiblich 7.00
#> Skewness männlich -1.20
#> weiblich -1.33
#> Std. error skewness männlich 0.197
#> weiblich 0.209
#> Kurtosis männlich 1.09
#> weiblich 2.02
#> Std. error kurtosis männlich 0.392
#> weiblich 0.416
#> 25th percentile männlich 5.50
#> weiblich 5.17
#> 50th percentile männlich 6.17
#> weiblich 6.17
#> 75th percentile männlich 6.50
#> weiblich 6.67
#> ───────────────────────────────────────────────────
Let’s look at the perceived support from friends (unt_freunde
) for comparison, and add a second factor:
# Descriptives of unt_freunde, split by geschlecht and westost
descriptives(
  formula = unt_freunde ~ geschlecht:westost,
  data = adolescents,
  # switch off statistics that are shown by default
  min = FALSE,
  max = FALSE,
  missing = FALSE,
  # additional statistic and plots
  sd = TRUE,
  bar = TRUE,
  box = TRUE
)
#>
#> DESCRIPTIVES
#>
#> Descriptives
#> ──────────────────────────────────────────────────────────────
#> geschlecht westost unt_freunde
#> ──────────────────────────────────────────────────────────────
#> N männlich West 83
#> Ost 69
#> weiblich West 59
#> Ost 74
#> Mean männlich West 5.49
#> Ost 5.70
#> weiblich West 6.27
#> Ost 6.36
#> Median männlich West 5.67
#> Ost 5.67
#> weiblich West 6.50
#> Ost 6.50
#> Standard deviation männlich West 1.13
#> Ost 0.831
#> weiblich West 0.926
#> Ost 0.557
#> ──────────────────────────────────────────────────────────────
8.3.3 Descriptive statistics for metric variables using the psych
package
# Install psych only if it is not available yet, so that re-running the script
# does not download and reinstall the package every time
if (!requireNamespace("psych", quietly = TRUE)) {
  install.packages("psych")
}
The psych
package provides some nice functionality for data analysis in psychology. For example, it has a useful function for descriptive statistics for numerical variables: describe()
. The generic form is: describe(x, na.rm = TRUE, interp = FALSE, skew = TRUE, ranges = TRUE, trim = 0.1, type = 3, check = TRUE)
.
x
stands for the data frame or the variable to be analyzed (df$variable
). The defaults are:
interp = FALSE
refers to the definition of the median (interp = TRUE
averages adjacent values for an even n)
skew = TRUE
displays skewness, kurtosis and the trimmed mean
ranges = TRUE
displays min, max, the range, the median, and the mad (median absolute deviation, an alternative measure of dispersion)
trim = 0.1
refers to the proportion of the distribution that is trimmed at the lower and upper ends for the trimmed mean (default trimming is 10% on both sides, thus the trimmed mean is computed from the middle 80% of the data)
type = 3
refers to the method of computing skewness and kurtosis (more here: ?psych::describe
)
check = TRUE
refers to checking for non-numeric variables in the dataset (for which describe
has no use); if check = FALSE
and there are non-numeric variables, an error message is displayed.
library(psych)
# Full set of descriptive statistics for the two support scales
adolescents %>%
  select(unt_eltern, unt_freunde) %>%
  describe()
#> vars n mean sd median trimmed mad min max range skew kurtosis
#> unt_eltern 1 285 5.87 1.00 6.17 6.01 0.74 1.5 7 5.5 -1.35 2.05
#> unt_freunde 2 285 5.93 0.96 6.00 6.06 0.99 2.0 7 5.0 -1.25 1.85
#> se
#> unt_eltern 0.06
#> unt_freunde 0.06
# Only n, mean, sd, and se: switching off both `skew` and `ranges` reduces
# the output to these columns. The argument is spelled out as `ranges`
# (as in the generic form of describe()) instead of relying on partial
# argument matching of `range`.
adolescents %>%
  select(unt_eltern, unt_freunde) %>%
  describe(skew = FALSE, ranges = FALSE)
#> vars n mean sd se
#> unt_eltern 1 285 5.87 1.00 0.06
#> unt_freunde 2 285 5.93 0.96 0.06
8.4 Correlation analysis
8.4.1 Traditional ways of doing correlation analysis in R
The generic form for calculating a correlation matrix using base R is as follows: cor(df[,c("var1", "var2", "var3")], method = c("pearson", "kendall", "spearman"), use = "complete.obs")
.
We can thus compute Pearson correlations (default) or - for ordinal data - Kendall’s \(\tau\) or Spearman’s \(\rho\) rank correlations.
Let’s try it using four scales measuring adolescents’ academic as well as social self-efficacy:
# Pearson correlation matrix of four self-efficacy scales, based on the
# cases that are complete on all four variables
adolescents %>%
  select(
    swk_neueslernen,
    swk_motivation,
    swk_durchsetzung,
    swk_sozialkomp
  ) %>%
  cor(method = "pearson", use = "complete.obs") %>%
  round(3)
#> swk_neueslernen swk_motivation swk_durchsetzung swk_sozialkomp
#> swk_neueslernen 1.000 0.640 0.351 0.363
#> swk_motivation 0.640 1.000 0.415 0.407
#> swk_durchsetzung 0.351 0.415 1.000 0.592
#> swk_sozialkomp 0.363 0.407 0.592 1.000
For covariances instead of correlations use cov()
with the same arguments as cor()
:
# Covariance matrix of the same four scales (variances on the diagonal)
adolescents %>%
  select(
    swk_neueslernen,
    swk_motivation,
    swk_durchsetzung,
    swk_sozialkomp
  ) %>%
  cov(method = "pearson", use = "complete.obs") %>%
  round(3)
#> swk_neueslernen swk_motivation swk_durchsetzung swk_sozialkomp
#> swk_neueslernen 0.595 0.380 0.266 0.220
#> swk_motivation 0.380 0.591 0.314 0.246
#> swk_durchsetzung 0.266 0.314 0.970 0.458
#> swk_sozialkomp 0.220 0.246 0.458 0.617
A good way to display a scatterplot matrix for a number of variables is the function scatterplotMatrix
from the car
package.
In addition to displaying all combinations of bivariate scatterplots it is possible to get different univariate plots in the diagonal: diagonal = list(method = c("density", "boxplot", "histogram", "oned", "qqplot", "none"))
. In addition, smooth = TRUE
(default) adds a non-linear LOWESS line (LOcally WEighted Scatterplot Smoother) to assess the (non-)linearity of the correlation.
# Install car only if it is not available yet (avoids reinstalling the
# package on every run of the script), then attach it
if (!requireNamespace("car", quietly = TRUE)) {
  install.packages("car")
}
library(car)
# Scatterplot matrix of the four self-efficacy scales with histograms in
# the diagonal, a smoother and a red regression line in every panel
scatterplotMatrix(
  ~ swk_neueslernen + swk_motivation + swk_durchsetzung + swk_sozialkomp,
  smooth = TRUE,                         # default
  diagonal = list(method = "histogram"), # if TRUE defaults to "density"
  data = adolescents,
  regLine = list(col = "red"),           # if omitted defaults to point colour
  use = "complete.obs"                   # default; alternative: "pairwise.complete.obs"
)
The function cor.test()
returns a significance test for one specific correlation.
The generic form is: cor.test(x, y, alternative = c("two.sided", "less", "greater"), method = c("pearson", "kendall", "spearman"), conf.level = 0.95)
.
Thus, in addition to the standard two-sided hypothesis test (alternative = "two.sided")
, directional alternative hypotheses are possible: (alternative = "less")
for a hypothesized negative correlation (“less than 0”) and (alternative = "greater")
for a hypothesized positive correlation (“greater than 0”).
# Significance test for a single Pearson correlation with a directional
# alternative hypothesis: "greater" tests for a positive correlation
cor.test(adolescents$swk_neueslernen, adolescents$swk_sozialkomp,
alternative = "greater",
method = "pearson")
#>
#> Pearson's product-moment correlation
#>
#> data: adolescents$swk_neueslernen and adolescents$swk_sozialkomp
#> t = 6.5528, df = 284, p-value = 1.325e-10
#> alternative hypothesis: true correlation is greater than 0
#> 95 percent confidence interval:
#> 0.2746381 1.0000000
#> sample estimates:
#> cor
#> 0.3624033
The function corr.test()
from the psych
package includes \(p\)-values for a correlation matrix. However, no directional alternative hypotheses are possible. Note that the two-sided \(p\)-values are in the lower triangular matrix and the \(p\)-values adjusted according to Holm are in the upper triangular matrix. The Holm procedure corresponds to a sequential alpha correction that is less conservative than the Bonferroni procedure, but nevertheless controls the family-wise error. The entries in the diagonal belong to the correlation of each variable with itself (\(r = 1\)); they are always displayed as 0 and carry no information.
Let’s try it for the four self-efficacy scales, but use only the first 25 subjects (rows) to make sure we obtain differences between the adjusted and unadjusted \(p\)-values (because with the full sample all correlations are significant at \(p<.001\)).
library(psych)
# Correlation matrix with p-values for the four self-efficacy scales,
# restricted to the first 25 rows of the data set
adolescents[1:25,] %>%
select(swk_neueslernen, swk_motivation, swk_durchsetzung, swk_sozialkomp) %>%
corr.test()
#> Call:corr.test(x = .)
#> Correlation matrix
#> swk_neueslernen swk_motivation swk_durchsetzung swk_sozialkomp
#> swk_neueslernen 1.00 0.57 0.22 0.19
#> swk_motivation 0.57 1.00 0.52 0.50
#> swk_durchsetzung 0.22 0.52 1.00 0.90
#> swk_sozialkomp 0.19 0.50 0.90 1.00
#> Sample Size
#> [1] 25
#> Probability values (Entries above the diagonal are adjusted for multiple tests.)
#> swk_neueslernen swk_motivation swk_durchsetzung swk_sozialkomp
#> swk_neueslernen 0.00 0.02 0.57 0.57
#> swk_motivation 0.00 0.00 0.03 0.03
#> swk_durchsetzung 0.29 0.01 0.00 0.00
#> swk_sozialkomp 0.36 0.01 0.00 0.00
#>
#> To see confidence intervals of the correlations, print with the short=FALSE option
8.4.2 Partial correlations with RcmdrMisc
Partial correlations are obtained using partial.cor(df[, c("var1", "var2", "var3")], tests = TRUE)
from the RcmdrMisc
package. Important: each bivariate correlation is partialled for all other variables in the supplied data frame!
# Install RcmdrMisc only if it is not available yet (avoids reinstalling
# the package on every run of the script), then attach it
if (!requireNamespace("RcmdrMisc", quietly = TRUE)) {
  install.packages("RcmdrMisc")
}
library(RcmdrMisc)
# Ordinary (zero-order) correlation matrix of the first 50 rows, as a
# reference for the partial correlations
adolescents[1:50, c("Gesamtnote", "swk_neueslernen", "swk_motivation",
                    "swk_sozialkomp")] %>%
  cor(use = "complete.obs") %>%
  round(3)
#> Gesamtnote swk_neueslernen swk_motivation swk_sozialkomp
#> Gesamtnote 1.000 0.396 0.292 0.139
#> swk_neueslernen 0.396 1.000 0.499 0.300
#> swk_motivation 0.292 0.499 1.000 0.600
#> swk_sozialkomp 0.139 0.300 0.600 1.000
# First-order partial correlations: with three variables, each pairwise
# correlation is controlled for the one remaining variable
partial.cor(
  adolescents[1:50, c("Gesamtnote", "swk_neueslernen", "swk_motivation")],
  tests = TRUE
)
#>
#> Partial correlations:
#> Gesamtnote swk_neueslernen swk_motivation
#> Gesamtnote 0.00000 0.30214 0.11811
#> swk_neueslernen 0.30214 0.00000 0.43686
#> swk_motivation 0.11811 0.43686 0.00000
#>
#> Number of observations: 48
#>
#> Pairwise two-sided p-values:
#> Gesamtnote swk_neueslernen swk_motivation
#> Gesamtnote 0.0390 0.4291
#> swk_neueslernen 0.0390 0.0021
#> swk_motivation 0.4291 0.0021
#>
#> Adjusted p-values (Holm's method)
#> Gesamtnote swk_neueslernen swk_motivation
#> Gesamtnote 0.0780 0.4291
#> swk_neueslernen 0.0780 0.0064
#> swk_motivation 0.4291 0.0064
# Second-order partial correlations: with four variables, each pairwise
# correlation is controlled for the two remaining variables
partial.cor(
  adolescents[1:50, c("Gesamtnote", "swk_neueslernen", "swk_motivation",
                      "swk_sozialkomp")],
  tests = TRUE
)
#>
#> Partial correlations:
#> Gesamtnote swk_neueslernen swk_motivation swk_sozialkomp
#> Gesamtnote 0.00000 0.30249 0.12582 -0.04944
#> swk_neueslernen 0.30249 0.00000 0.35754 0.01537
#> swk_motivation 0.12582 0.35754 0.00000 0.54607
#> swk_sozialkomp -0.04944 0.01537 0.54607 0.00000
#>
#> Number of observations: 48
#>
#> Pairwise two-sided p-values:
#> Gesamtnote swk_neueslernen swk_motivation swk_sozialkomp
#> Gesamtnote 0.0410 0.4047 0.7442
#> swk_neueslernen 0.0410 0.0147 0.9193
#> swk_motivation 0.4047 0.0147 <.0001
#> swk_sozialkomp 0.7442 0.9193 <.0001
#>
#> Adjusted p-values (Holm's method)
#> Gesamtnote swk_neueslernen swk_motivation swk_sozialkomp
#> Gesamtnote 0.1641 1.0000 1.0000
#> swk_neueslernen 0.1641 0.0735 1.0000
#> swk_motivation 1.0000 0.0735 0.0005
#> swk_sozialkomp 1.0000 1.0000 0.0005
8.4.3 Correlation analysis with jmv
Almost all we have done above with the more traditional R functions we can also do in jmv
using the corrMatrix()
function. An overview of the available arguments is provided by ?jmv::corrMatrix
:
corrMatrix(data, vars, pearson = TRUE, spearman = FALSE, kendall = FALSE,
sig = TRUE, flag = FALSE, n = FALSE, ci = FALSE, ciWidth = 95, plots = FALSE,
plotDens = FALSE, plotStats = FALSE, hypothesis = "corr")
Arguments
data the data as a data frame
vars a vector of strings naming the variables to correlate in data
pearson TRUE (default) or FALSE, provide Pearson's R
spearman TRUE or FALSE (default), provide Spearman's rho
kendall TRUE or FALSE (default), provide Kendall's tau-b
sig TRUE (default) or FALSE, provide significance levels
flag TRUE or FALSE (default), flag significant correlations
n TRUE or FALSE (default), provide the number of cases
ci TRUE or FALSE (default), provide confidence intervals
ciWidth a number between 50 and 99.9 (default: 95), the width of CI
plots TRUE or FALSE (default), provide a correlation matrix plot
plotDens TRUE or FALSE (default), provide densities in the correlation matrix plot
plotStats TRUE or FALSE (default), provide statistics in the correlation matrix plot
hypothesis one of 'corr' (default), 'pos', 'neg' specifying the alternative hypothesis;
correlated, correlated positively, correlated negatively respectively.
Let’s try some options with the four self-efficacy variables:
library(jmv)

# Rank-based correlations (Spearman's rho and Kendall's tau-b) instead of
# Pearson's r, with the number of cases per pair; the alternative hypothesis
# is a positive correlation.
corrMatrix(
  data = adolescents,
  vars = vars(
    swk_neueslernen,
    swk_motivation,
    swk_durchsetzung,
    swk_sozialkomp
  ),
  pearson = FALSE,
  spearman = TRUE,
  kendall = TRUE,
  n = TRUE,
  hypothesis = "pos"
)
#>
#> CORRELATION MATRIX
#>
#> Correlation Matrix
#> ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────
#> swk_neueslernen swk_motivation swk_durchsetzung swk_sozialkomp
#> ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────
#> swk_neueslernen Spearman's rho —
#> p-value —
#> Kendall's Tau B —
#> p-value —
#> N —
#>
#> swk_motivation Spearman's rho 0.619 —
#> p-value < .001 —
#> Kendall's Tau B 0.471 —
#> p-value < .001 —
#> N 286 —
#>
#> swk_durchsetzung Spearman's rho 0.377 0.463 —
#> p-value < .001 < .001 —
#> Kendall's Tau B 0.278 0.347 —
#> p-value < .001 < .001 —
#> N 285 285 —
#>
#> swk_sozialkomp Spearman's rho 0.378 0.413 0.539 —
#> p-value < .001 < .001 < .001 —
#> Kendall's Tau B 0.272 0.304 0.411 —
#> p-value < .001 < .001 < .001 —
#> N 286 286 285 —
#> ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────
#> Note. Hₐ is positive correlation
8.5 Reliability analysis
8.5.1 Reliability analysis using the psych
package
For the reliability analysis we create a reduced data set containing only the items for the scale swk_neueslernen
: swk1, swk7, swk14, swk16, swk26, swk27, and swk31.
# Reduced data set holding only the seven items of the swk_neueslernen scale
swk_neueslernen <- adolescents %>%
  select(swk1, swk7, swk14, swk16, swk26, swk27, swk31)

# Print the reduced data set to check the selection
swk_neueslernen
#> # A tibble: 286 x 7
#> swk1 swk7 swk14 swk16 swk26 swk27 swk31
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 2 5 5 4 6 6 4
#> 2 4 5 5 7 5 4 5
#> 3 5 7 3 7 7 3 3
#> 4 3 6 4 6 4 4 5
#> 5 5 5 4 5 7 6 NA
#> 6 3 5 5 7 4 5 5
#> # … with 280 more rows
Cronbach’s alpha
Cronbach’s \(\alpha\) can be computed using alpha()
:
# Reliability analysis (Cronbach's alpha) for the seven swk_neueslernen items;
# no keys are passed, i.e. no item is reverse-scored.
psych::alpha(swk_neueslernen)
#>
#> Reliability analysis
#> Call: psych::alpha(x = swk_neueslernen)
#>
#> raw_alpha std.alpha G6(smc) average_r S/N ase mean sd median_r
#> 0.63 0.63 0.65 0.19 1.7 0.033 5.1 0.77 0.18
#>
#> lower alpha upper 95% confidence boundaries
#> 0.56 0.63 0.7
#>
#> Reliability if an item is dropped:
#> raw_alpha std.alpha G6(smc) average_r S/N alpha se var.r med.r
#> swk1 0.59 0.59 0.60 0.19 1.4 0.037 0.022 0.168
#> swk7 0.58 0.58 0.57 0.19 1.4 0.039 0.013 0.192
#> swk14 0.57 0.56 0.57 0.18 1.3 0.039 0.022 0.097
#> swk16 0.64 0.64 0.65 0.23 1.8 0.032 0.020 0.192
#> swk26 0.61 0.61 0.63 0.21 1.6 0.035 0.024 0.180
#> swk27 0.58 0.58 0.59 0.19 1.4 0.039 0.021 0.168
#> swk31 0.56 0.57 0.56 0.18 1.3 0.040 0.015 0.180
#>
#> Item statistics
#> n raw.r std.r r.cor r.drop mean sd
#> swk1 286 0.53 0.56 0.45 0.34 5.0 1.2
#> swk7 284 0.61 0.58 0.52 0.39 5.0 1.5
#> swk14 283 0.61 0.62 0.54 0.41 4.9 1.4
#> swk16 285 0.43 0.43 0.24 0.18 5.8 1.4
#> swk26 285 0.47 0.50 0.34 0.27 5.3 1.2
#> swk27 281 0.61 0.58 0.48 0.38 4.9 1.5
#> swk31 283 0.63 0.61 0.56 0.43 4.7 1.4
#>
#> Non missing response frequency for each item
#> 1 2 3 4 5 6 7 miss
#> swk1 0.01 0.01 0.07 0.20 0.33 0.30 0.08 0.00
#> swk7 0.02 0.04 0.12 0.13 0.29 0.24 0.16 0.01
#> swk14 0.03 0.05 0.07 0.19 0.26 0.32 0.10 0.01
#> swk16 0.02 0.02 0.03 0.09 0.15 0.32 0.36 0.00
#> swk26 0.01 0.01 0.05 0.14 0.31 0.33 0.15 0.00
#> swk27 0.04 0.03 0.09 0.21 0.23 0.26 0.15 0.02
#> swk31 0.03 0.03 0.13 0.22 0.26 0.23 0.10 0.01
Explanation of the output:
raw_alpha # (Normal) Alpha based on covariances
std.alpha # Standardized Alpha based on correlations
G6(smc) # Guttman's Lambda 6 reliability
average_r # Average inter-item-correlation
S/N # Signal-to-noise ratio
ase/alpha se # Standard error of alpha
median_r # Median interitem correlation
alpha.drop # Statistics in case the respective item is deleted from the scale
item.stats # Item-total correlations (for raw and corrected scores)
raw.r # Correlation of each item with the total score, not corrected for item overlap
std.r # Correlation of each item with the total score (not corrected for item overlap) if the items were all standardized
r.cor # Item whole correlation corrected for item overlap and scale reliability
r.drop # Item whole correlation for this item against the scale without this item
Optionally, a “key vector” can be defined. This is useful when some items have been formulated in the opposite direction and therefore should actually be recoded before doing the reliability analysis. The “key vector” indicates which items should be recoded (here: none!)
The following command is (erroneously) specified with recoding keys on variables number 2 and 5:
# Deliberately wrong, for demonstration only: the -1 entries in keys
# reverse-score the 2nd and 5th items (swk7 and swk26), although no item of
# this scale needs recoding.
psych::alpha(swk_neueslernen, keys = c(1, -1, 1, 1, -1, 1, 1))
#>
#> Reliability analysis
#> Call: psych::alpha(x = swk_neueslernen, keys = c(1, -1, 1, 1, -1, 1,
#> 1))
#>
#> raw_alpha std.alpha G6(smc) average_r S/N ase mean sd median_r
#> -0.033 -0.014 0.22 -0.002 -0.014 0.09 4.4 0.51 0.029
#>
#> lower alpha upper 95% confidence boundaries
#> -0.21 -0.03 0.14
#>
#> Reliability if an item is dropped:
#> raw_alpha std.alpha G6(smc) average_r S/N alpha se var.r med.r
#> swk1 -0.320 -0.338 -0.014 -0.0439 -0.252 0.117 0.060 -0.045
#> swk7- 0.345 0.316 0.389 0.0716 0.463 0.057 0.045 0.083
#> swk14 -0.425 -0.377 -0.060 -0.0478 -0.274 0.129 0.054 -0.045
#> swk16 -0.256 -0.225 0.106 -0.0315 -0.184 0.111 0.075 -0.083
#> swk26- 0.190 0.241 0.388 0.0502 0.317 0.072 0.068 0.083
#> swk27 -0.041 -0.011 0.204 -0.0018 -0.011 0.093 0.059 -0.045
#> swk31 -0.082 -0.069 0.123 -0.0109 -0.065 0.096 0.049 -0.045
#>
#> Item statistics
#> n raw.r std.r r.cor r.drop mean sd
#> swk1 286 0.582 0.615 0.67 0.29902 5.0 1.2
#> swk7- 284 -0.027 -0.045 -0.62 -0.41024 3.0 1.5
#> swk14 283 0.646 0.637 0.76 0.31564 4.9 1.4
#> swk16 285 0.549 0.544 0.42 0.18961 5.8 1.4
#> swk26- 285 0.031 0.078 -0.54 -0.29163 2.7 1.2
#> swk27 281 0.407 0.375 0.17 -0.00063 4.9 1.5
#> swk31 283 0.431 0.426 0.38 0.03805 4.7 1.4
#>
#> Non missing response frequency for each item
#> 1 2 3 4 5 6 7 miss
#> swk1 0.01 0.01 0.07 0.20 0.33 0.30 0.08 0.00
#> swk7 0.02 0.04 0.12 0.13 0.29 0.24 0.16 0.01
#> swk14 0.03 0.05 0.07 0.19 0.26 0.32 0.10 0.01
#> swk16 0.02 0.02 0.03 0.09 0.15 0.32 0.36 0.00
#> swk26 0.01 0.01 0.05 0.14 0.31 0.33 0.15 0.00
#> swk27 0.04 0.03 0.09 0.21 0.23 0.26 0.15 0.02
#> swk31 0.03 0.03 0.13 0.22 0.26 0.23 0.10 0.01
Via the summary()
function it is also possible to obtain only alpha. To do this, the result of the analysis must be saved in an object first:
# Store the result of the reliability analysis so it can be summarised below.
alpha_neueslernen <- psych::alpha(swk_neueslernen)
# Summarise the stored alpha object (not the raw item data) to print only the
# scale-level reliability statistics.
summary(alpha_neueslernen)
#>
#> Reliability analysis
#> raw_alpha std.alpha G6(smc) average_r S/N ase mean sd median_r
#> 0.63 0.63 0.65 0.19 1.7 0.033 5.1 0.77 0.18
# Alternatively, pipe the item data through alpha() straight into summary(),
# without storing the intermediate result:
swk_neueslernen %>%
  psych::alpha() %>%
  summary()
#>
#> Reliability analysis
#> raw_alpha std.alpha G6(smc) average_r S/N ase mean sd median_r
#> 0.63 0.63 0.65 0.19 1.7 0.033 5.1 0.77 0.18
8.5.2 Reliability analysis with jmv
A lot of the psych::alpha()
functionality is available in jmv
as well, plus McDonald’s \(\omega\) - the correct reliability measure for \(\tau\)-congeneric variables, i.e. variables with a measurement model that allows different slope/loading parameters. (To be fair: there is also a function in the psych
package for omega).
# Scale- and item-level reliability statistics (Cronbach's alpha and
# McDonald's omega) plus item-rest correlations.
# Deliberately mis-specified for demonstration: revItems reverse-scores swk1
# and swk26, although no item of this scale needs to be reversed.
jmv::reliability(
  data = adolescents,
  vars = vars(swk1, swk7, swk14, swk16, swk26, swk27, swk31),
  omegaScale = TRUE,
  alphaItems = TRUE,
  omegaItems = TRUE,
  itemRestCor = TRUE,
  revItems = vars(swk1, swk26)
)
#>
#> RELIABILITY ANALYSIS
#>
#> Scale Reliability Statistics
#> ─────────────────────────────────────────
#> Cronbach's α McDonald's ω
#> ─────────────────────────────────────────
#> scale 0.254 0.330
#> ─────────────────────────────────────────
#> Note. items 'swk1' and 'swk26'
#> correlate negatively with the total
#> scale and probably should be
#> reversed
#>
#>
#> Item Reliability Statistics
#> ────────────────────────────────────────────────────────────────────
#> item-rest correlation Cronbach's α McDonald's ω
#> ────────────────────────────────────────────────────────────────────
#> swk1 ᵃ -0.2904 0.42954 0.483
#> swk7 0.4269 -0.04970 0.116
#> swk14 0.0817 0.24063 0.344
#> swk16 0.0845 0.23831 0.340
#> swk26 ᵃ -0.2099 0.39386 0.453
#> swk27 0.2701 0.09027 0.215
#> swk31 0.3931 -0.00373 0.139
#> ────────────────────────────────────────────────────────────────────
#> ᵃ reverse scaled item
Oops… we mistakenly reversed some items again! Now let's run the analysis correctly, with an additional bivariate correlation heatmap of the items (corPlot = TRUE
):
# Same reliability analysis without reversed items; corPlot = TRUE
# additionally requests a correlation heatmap of the items.
jmv::reliability(
  data = adolescents,
  vars = vars(swk1, swk7, swk14, swk16, swk26, swk27, swk31),
  omegaScale = TRUE,
  corPlot = TRUE,
  alphaItems = TRUE,
  omegaItems = TRUE,
  itemRestCor = TRUE
)
#>
#> RELIABILITY ANALYSIS
#>
#> Scale Reliability Statistics
#> ─────────────────────────────────────────
#> Cronbach's α McDonald's ω
#> ─────────────────────────────────────────
#> scale 0.637 0.643
#> ─────────────────────────────────────────
#>
#>
#> Item Reliability Statistics
#> ──────────────────────────────────────────────────────────────────
#> item-rest correlation Cronbach's α McDonald's ω
#> ──────────────────────────────────────────────────────────────────
#> swk1 0.352 0.601 0.615
#> swk7 0.390 0.586 0.601
#> swk14 0.418 0.577 0.597
#> swk16 0.195 0.648 0.650
#> swk26 0.271 0.622 0.634
#> swk27 0.385 0.588 0.598
#> swk31 0.431 0.573 0.589
#> ──────────────────────────────────────────────────────────────────
In the Holm procedure (Bonferroni-Holm method), the smallest \(p\)-value is multiplied by the number of tests (\(m = p(p-1)/2\); \(p\) = number of variables in the correlation matrix), the second smallest by the number of tests minus 1 (\(m - 1\)), and so on, down to the largest \(p\)-value, which is no longer corrected (i.e., multiplied by \(m-m+1 = 1\)). However, a \(p\)-value is only multiplied by its corresponding factor if the resulting adjusted \(p\)-value is not smaller than the previously adjusted (smaller) \(p\)-value. If it would be smaller, the test is assigned the same adjusted \(p\)-value as its predecessor. Adjusted \(p\)-values that would exceed 1 are set to 1. This prevents a smaller effect from getting a smaller \(p\)-value after the adjustment than a larger effect, and thereby prevents a smaller effect from reaching significance while the larger effect does not.