7 Descriptive Statistics

7.1 Summary statistics of all variables in a dataset

The function summary(data_frame) returns descriptive statistics for all variables in a dataset. For numeric variables it returns the minimum, the maximum, the quartiles, the median, and the mean; for factors, the frequencies of the factor levels. In addition, the number of missing values is displayed for both variable types.

We first import our sample data set and assign each variable its scale level (i.e., we define factors as factors). Afterwards, we remove all individual items from the data set since they are not needed for now. This reduced data set, which we call westost_scales, is then saved as an .Rdata file in our data folder, so that next time we can load it without having to repeat the import work.

library(tidyverse)
library(haven)
westost <- read_sav("data/Beispieldatensatz.sav")
westost$ID <- as.factor(westost$ID)
westost$westost <- as_factor(westost$westost)
westost$geschlecht <- as_factor(westost$geschlecht)
westost$SES <- as_factor(westost$SES, ordered = TRUE)
westost$Deutsch <- as_factor(westost$Deutsch, ordered = TRUE)
westost$Mathe <- as_factor(westost$Mathe, ordered = TRUE)
westost$Fremdspr <- as_factor(westost$Fremdspr, ordered = TRUE)
westost$bildung_vater <- as_factor(westost$bildung_vater)
westost$bildung_mutter <- as_factor(westost$bildung_mutter)
westost$bildung_vater_b <- as_factor(westost$bildung_vater_b)
westost$bildung_mutter_b <- as_factor(westost$bildung_mutter_b)
save(westost, file = "data/westost.Rdata")

westost_scales <- westost %>% select(-c(T1_1:T7_12))

save(westost_scales, file = "data/westost_scales.Rdata")
load(file = "data/westost_scales.Rdata")
westost_scales
#> # A tibble: 286 x 33
#>   ID    westost geschlecht alter SES   Deutsch Mathe Fremdspr Schnitt
#>   <fct> <fct>   <fct>      <dbl> <ord> <ord>   <ord> <ord>      <dbl>
#> 1 2     West    männlich      14 unte… befrie… befr… ausreic…       4
#> 2 14    West    männlich      14 durc… befrie… befr… ausreic…       4
#> 3 15    West    männlich      14 durc… befrie… befr… befried…       4
#> 4 17    West    männlich      15 durc… befrie… befr… ausreic…       4
#> 5 18    West    männlich      14 durc… gut     gut   befried…       4
#> 6 19    West    männlich      15 über… befrie… befr… ausreic…       4
#> # … with 280 more rows, and 24 more variables: bildung_vater <fct>,
#> #   bildung_mutter <fct>, bildung_vater_b <fct>, bildung_mutter_b <fct>,
#> #   swk_akad <dbl>, swk_selbstreg <dbl>, swk_durch <dbl>,
#> #   swk_motselbst <dbl>, swk_sozharm <dbl>, swk_bez <dbl>,
#> #   unt_eltern <dbl>, unt_freunde <dbl>, sch_leistung <dbl>,
#> #   sch_bez <dbl>, sch_regeln <dbl>, ges_gerecht <dbl>, ges_nepot <dbl>,
#> #   leben_selbst <dbl>, leben_fam <dbl>, leben_schule <dbl>,
#> #   leben_freunde <dbl>, leben_gesamt <dbl>, stress_somat <dbl>,
#> #   stress_psych <dbl>

With summary() we can now get a descriptive overview of the dataset:

summary(westost_scales)
#>        ID      westost       geschlecht      alter     
#>  1      :  1   West:143   männlich:152   Min.   :13.0  
#>  2      :  1   Ost :143   weiblich:134   1st Qu.:14.0  
#>  10     :  1                             Median :15.0  
#>  11     :  1                             Mean   :14.7  
#>  12     :  1                             3rd Qu.:15.0  
#>  14     :  1                             Max.   :17.0  
#>  (Other):280                                           
#>                     SES              Deutsch             Mathe    
#>  sehr niedrig         :  1   ungenügend  :  0   ungenügend  :  0  
#>  unterdurchschnittlich:  4   mangelhaft  :  3   mangelhaft  : 11  
#>  durchschnittlich     :186   ausreichend : 52   ausreichend : 55  
#>  überdurchschnittlich : 69   befriedigend:112   befriedigend:111  
#>  sehr hoch            : 24   gut         :100   gut         : 88  
#>  NA's                 :  2   sehr gut    : 16   sehr gut    : 19  
#>                              NA's        :  3   NA's        :  2  
#>          Fremdspr      Schnitt     
#>  ungenügend  :  0   Min.   :3.000  
#>  mangelhaft  : 10   1st Qu.:4.000  
#>  ausreichend : 39   Median :4.500  
#>  befriedigend:103   Mean   :4.462  
#>  gut         :102   3rd Qu.:5.000  
#>  sehr gut    : 21   Max.   :6.000  
#>  NA's        : 11   NA's   :6      
#>                                        bildung_vater
#>  Hauptschulabschluss oder niedriger           :41   
#>  Realschulabschluss (mittlere Reife)          :94   
#>  Fachabitur, Abitur                           :41   
#>  Fachhochschulabschluss, Universitätsabschluss:93   
#>  NA's                                         :17   
#>                                                     
#>                                                     
#>                                        bildung_mutter bildung_vater_b
#>  Hauptschulabschluss oder niedriger           :43     niedrig:135    
#>  Realschulabschluss (mittlere Reife)          :95     hoch   :134    
#>  Fachabitur, Abitur                           :50     NA's   : 17    
#>  Fachhochschulabschluss, Universitätsabschluss:84                    
#>  NA's                                         :14                    
#>                                                                      
#>                                                                      
#>  bildung_mutter_b    swk_akad     swk_selbstreg     swk_durch    
#>  niedrig:138      Min.   :2.429   Min.   :2.875   Min.   :1.000  
#>  hoch   :134      1st Qu.:4.571   1st Qu.:4.750   1st Qu.:4.667  
#>  NA's   : 14      Median :5.143   Median :5.250   Median :5.333  
#>                   Mean   :5.098   Mean   :5.246   Mean   :5.285  
#>                   3rd Qu.:5.571   3rd Qu.:5.875   3rd Qu.:6.000  
#>                   Max.   :7.000   Max.   :7.000   Max.   :7.000  
#>                                                   NA's   :1      
#>  swk_motselbst    swk_sozharm       swk_bez        unt_eltern   
#>  Min.   :3.200   Min.   :1.571   Min.   :3.333   Min.   :1.500  
#>  1st Qu.:5.000   1st Qu.:4.857   1st Qu.:5.167   1st Qu.:5.333  
#>  Median :5.400   Median :5.381   Median :5.667   Median :6.167  
#>  Mean   :5.447   Mean   :5.280   Mean   :5.616   Mean   :5.872  
#>  3rd Qu.:6.000   3rd Qu.:5.857   3rd Qu.:6.167   3rd Qu.:6.667  
#>  Max.   :7.000   Max.   :7.000   Max.   :7.000   Max.   :7.000  
#>                                                  NA's   :1      
#>   unt_freunde     sch_leistung      sch_bez        sch_regeln   
#>  Min.   :2.000   Min.   :1.667   Min.   :1.000   Min.   :1.750  
#>  1st Qu.:5.500   1st Qu.:3.667   1st Qu.:3.163   1st Qu.:4.000  
#>  Median :6.000   Median :4.333   Median :3.750   Median :4.925  
#>  Mean   :5.928   Mean   :4.326   Mean   :3.820   Mean   :4.875  
#>  3rd Qu.:6.667   3rd Qu.:5.000   3rd Qu.:4.500   3rd Qu.:5.750  
#>  Max.   :7.000   Max.   :7.000   Max.   :7.000   Max.   :7.000  
#>  NA's   :1       NA's   :2                                      
#>   ges_gerecht      ges_nepot      leben_selbst     leben_fam    
#>  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
#>  1st Qu.:3.500   1st Qu.:3.600   1st Qu.:5.000   1st Qu.:5.333  
#>  Median :4.167   Median :4.400   Median :5.500   Median :6.000  
#>  Mean   :4.160   Mean   :4.423   Mean   :5.433   Mean   :5.760  
#>  3rd Qu.:5.000   3rd Qu.:5.200   3rd Qu.:6.000   3rd Qu.:6.667  
#>  Max.   :6.667   Max.   :7.000   Max.   :7.000   Max.   :7.000  
#>                  NA's   :3       NA's   :1       NA's   :2      
#>   leben_schule   leben_freunde    leben_gesamt    stress_somat  
#>  Min.   :1.333   Min.   :2.000   Min.   :2.909   Min.   :1.000  
#>  1st Qu.:4.000   1st Qu.:5.500   1st Qu.:5.000   1st Qu.:2.167  
#>  Median :4.667   Median :6.000   Median :5.455   Median :3.000  
#>  Mean   :4.670   Mean   :5.898   Mean   :5.417   Mean   :3.020  
#>  3rd Qu.:5.333   3rd Qu.:6.500   3rd Qu.:5.909   3rd Qu.:3.667  
#>  Max.   :7.000   Max.   :7.000   Max.   :6.909   Max.   :7.000  
#>  NA's   :1       NA's   :1       NA's   :1       NA's   :3      
#>   stress_psych  
#>  Min.   :1.000  
#>  1st Qu.:2.167  
#>  Median :2.917  
#>  Mean   :2.989  
#>  3rd Qu.:3.667  
#>  Max.   :6.833  
#>  NA's   :2

7.2 Descriptive statistics for nominal variables

7.2.1 Frequencies with table()

Using table() we can display the frequencies of the categories (factor levels) of a factor. This is particularly useful if we are interested in the joint frequencies of several factors, as in a cross-tabulation.

table(westost_scales$bildung_vater)
#> 
#>            Hauptschulabschluss oder niedriger 
#>                                            41 
#>           Realschulabschluss (mittlere Reife) 
#>                                            94 
#>                            Fachabitur, Abitur 
#>                                            41 
#> Fachhochschulabschluss, Universitätsabschluss 
#>                                            93
table(westost_scales$bildung_vater_b, westost_scales$bildung_mutter_b)
#>          
#>           niedrig hoch
#>   niedrig     102   32
#>   hoch         35   99
table(westost_scales$westost, westost_scales$bildung_mutter)
#>       
#>        Hauptschulabschluss oder niedriger
#>   West                                 43
#>   Ost                                   0
#>       
#>        Realschulabschluss (mittlere Reife) Fachabitur, Abitur
#>   West                                  52                 20
#>   Ost                                   43                 30
#>       
#>        Fachhochschulabschluss, Universitätsabschluss
#>   West                                            26
#>   Ost                                             58

Let’s generate some frequency plots with ggplot2:

p <- westost_scales %>% 
    select(bildung_vater) %>% 
    drop_na() %>% 
    ggplot(aes(x = bildung_vater))
p + geom_bar(fill = "darkblue") +
    xlab("Education of the Father") +
    ylab("Frequency")
    

p1 <- westost_scales %>% 
    select(bildung_mutter_b, westost) %>% 
    drop_na() %>% 
    ggplot(aes(x = bildung_mutter_b, fill = westost))
p1 + geom_bar(position = "dodge") +
    xlab("Education of the Mother (binary)") +
    ylab("Frequency") +
    theme_classic() +
    scale_fill_manual(values = c("#000000", "#E69F00"))

With prop.table() we can get proportions (relative frequencies):

table(westost_scales$bildung_vater_b, westost_scales$bildung_mutter_b) %>% prop.table() %>% round(2)
#>          
#>           niedrig hoch
#>   niedrig    0.38 0.12
#>   hoch       0.13 0.37

# or in percentages

100*table(westost_scales$bildung_vater_b, westost_scales$bildung_mutter_b) %>% prop.table() %>% round(4)
#>          
#>           niedrig  hoch
#>   niedrig   38.06 11.94
#>   hoch      13.06 36.94

In most cases we are not interested in the total proportions (which add up to 1, or 100%, across the entire table), but in the row-conditional or column-conditional relative frequencies. The former are obtained using margin = 1 in the prop.table() function, the latter with margin = 2. Since margin is the only further argument, we can also specify its value directly, i.e. prop.table(1) for row-conditional proportions and prop.table(2) for column-conditional proportions.

prop.table() expects a contingency table, as produced by table(), as its first argument. When we use the pipe %>%, the table is passed into this first slot automatically; that is why the margin value can appear as the only (unnamed) argument, as shown above.
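
Without the pipe, the same row-conditional proportions are obtained by passing the table as the first argument and the margin as the second (named here for clarity):

prop.table(table(westost_scales$geschlecht, westost_scales$westost), margin = 1) %>% 
    round(2)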

table(westost_scales$geschlecht, westost_scales$westost)
#>           
#>            West Ost
#>   männlich   83  69
#>   weiblich   60  74

# Relative frequencies (entire table)
table(westost_scales$geschlecht, westost_scales$westost) %>% prop.table() %>% round(2)
#>           
#>            West  Ost
#>   männlich 0.29 0.24
#>   weiblich 0.21 0.26

# Row-conditional relative frequencies
table(westost_scales$geschlecht, westost_scales$westost) %>% prop.table(1) %>% round(2)
#>           
#>            West  Ost
#>   männlich 0.55 0.45
#>   weiblich 0.45 0.55

# Column-conditional relative frequencies
table(westost_scales$geschlecht, westost_scales$westost) %>% prop.table(2) %>% round(2)
#>           
#>            West  Ost
#>   männlich 0.58 0.48
#>   weiblich 0.42 0.52

7.2.2 Mode

There is no separate base R function for calculating the mode of a nominal variable. Either you read the mode (the category with the highest frequency) from the frequency table (or the frequency plot), or you can help yourself with:

names(which.max(table(westost_scales$bildung_vater)))
#> [1] "Realschulabschluss (mittlere Reife)"

7.3 Descriptive statistics for ordinal and metric Variables

We already know most of the functions needed for describing the distributional characteristics of ordinal and metric variables from the earlier chapter on the R language.

mean(x, na.rm = FALSE)    Arithmetic mean
sd(x)                     (Sample) Standard Deviation 
var(x)                    (Sample) Variance

median(x)                 Median
quantile(x, probs, type)  Quantile of x.  probs: vector with probabilities
min(x)                    Minimum value of x
max(x)                    Maximum value of x
range(x)                  x_min and x_max
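
Note that most of these functions return NA as soon as the variable contains missing values; setting na.rm = TRUE removes the missing values before the computation:

mean(c(1, 2, NA))                # NA
mean(c(1, 2, NA), na.rm = TRUE)  # 1.5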

Examples

Let’s create a vector with 16 numbers between 1 and 9 and sort it:

x <- c(4, 7, 9, 2, 3, 6, 6, 4, 1, 2, 8, 5, 5, 7, 3, 2)
n <- length(x)
x <- sort(x)
x
#>  [1] 1 2 2 2 3 3 4 4 5 5 6 6 7 7 8 9
median(x)
#> [1] 4.5
quantile(x, c(.25, .75))
#>  25%  75% 
#> 2.75 6.25

The median = 4.5 corresponds to our expectations, as it is the mean of the 8th and 9th sorted values. The quartiles are \(Q_1=2.75\) and \(Q_3=6.25\). This is probably not what you’ve learned in your statistics course (at least not with us…). We have learned that if \(n\cdot p\) is not an integer, then the quantile is the \([(n\cdot p) + 1]\)th value of the distribution; on the other hand, if \(n\cdot p\) is an integer, the respective quantile is the mean of the \((n\cdot p)\)th and the \([(n\cdot p) + 1]\)th value. Since 16 can be divided by 4, both \(n\cdot 0.25\) and \(n\cdot 0.75\) are integers. Thus, the first quartile should be \(Q_1=\frac {x_{n \cdot 0.25}+x_{n \cdot 0.25+1}}{2} = \frac {x_4+x_5}{2}= \frac {2+3}{2}= 2.5\) and the third quartile should be \(Q_3=\frac {x_{n \cdot 0.75}+x_{n \cdot 0.75+1}}{2} = \frac {x_{12}+x_{13}}{2}= \frac {6 + 7}{2}= 6.5\). But that’s not what we obtained here…

The short and simple answer to this riddle is that quantile() has nine different methods (type = 1-9) to compute the quantiles of a distribution (help('quantile')). The default method is type = 7, but the method we prefer is type = 2. Therefore:

quantile(x, c(.25, .75), type = 2)
#> 25% 75% 
#> 2.5 6.5
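
To convince ourselves that type = 2 really corresponds to the rule described above, we can implement the rule directly. A minimal sketch (the function textbook_quantile is our own; it assumes a complete numeric vector):

textbook_quantile <- function(x, p) {
    x <- sort(x)
    k <- length(x) * p
    if (k %% 1 == 0) {
        (x[k] + x[k + 1]) / 2  # n*p is an integer: mean of the adjacent values
    } else {
        x[ceiling(k)]          # n*p is not an integer: the [(n*p)+1]th value
    }
}
textbook_quantile(x, .25)  # 2.5
textbook_quantile(x, .75)  # 6.5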

Which method is used by summary()?

summary(x)
#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#>   1.000   2.750   4.500   4.625   6.250   9.000

Surprise! summary() also uses the default, type = 7.
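
In newer versions of R, summary() itself accepts a quantile.type argument (see ?summary), so, depending on your installation, the preferred method can be requested directly:

summary(x, quantile.type = 2)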

7.3.1 Descriptive summary of selected numerical variables

install.packages("RcmdrMisc")

Descriptive statistics for metric variables can also be obtained with the numSummary() function from the RcmdrMisc package.

The generic form of this function is: numSummary(df[, c("var1", "var2")], groups = df$factor, statistics = c("mean", "sd", "quantiles"), quantiles = c(0, .25, .5, .75, 1)).

Additional statistics options are: se(mean) (standard error of the mean = \(\frac {sd}{\sqrt{n}}\)), cv (coefficient of variation = SD/mean), skewness, and kurtosis.

Example: Let us calculate descriptive statistics for the variables “Support by parents” (unt_eltern) and “Support by friends” (unt_freunde): mean, standard deviation, standard error of the mean, skewness, and kurtosis, as well as the 10% and 90% quantiles. Using the groups argument we can do this separately for the levels of the factor geschlecht.

library(RcmdrMisc)
numSummary(westost_scales[,c("unt_eltern", "unt_freunde")], 
           groups = westost_scales$geschlecht, 
           statistics = c("mean", "sd", "se(mean)", "skewness", 
                          "kurtosis", "quantiles"), 
           quantiles = c(.1,.9))
#> 
#> Variable: unt_eltern 
#>             mean        sd   se(mean)  skewness kurtosis      10%      90%
#> männlich 5.94415 0.8687628 0.07069892 -1.204500 1.088983 4.666667 6.833333
#> weiblich 5.79005 1.1232414 0.09703328 -1.332349 2.016779 4.333333 6.833333
#>            n NA
#> männlich 151  1
#> weiblich 134  0
#> 
#> Variable: unt_freunde 
#>              mean        sd   se(mean)   skewness  kurtosis      10%
#> männlich 5.584868 1.0055152 0.08155806 -0.8628965 0.8376967 4.333333
#> weiblich 6.319549 0.7416894 0.06431263 -2.2457703 8.4513611 5.500000
#>               90%   n NA
#> männlich 6.833333 152  0
#> weiblich 7.000000 133  1
p1 <- westost_scales %>% 
    select(unt_eltern, geschlecht) %>%
    drop_na() %>% 
    ggplot(aes(x = unt_eltern, fill = geschlecht, y = ..density..))

p1 + geom_histogram(binwidth = 0.5, 
                   position = "identity",
                   alpha = 0.5) +
    scale_fill_manual(values = c("#000000", "#E69F00")) +
    xlab("Support by Parents")

p2 <- westost_scales %>% 
    select(unt_freunde, geschlecht) %>%
    drop_na() %>% 
    ggplot(aes(x = unt_freunde, fill = geschlecht, y = ..density..))

p2 + geom_histogram(binwidth = 0.5, 
                   position = "identity",
                   alpha = 0.5) +
    scale_fill_manual(values = c("#000000", "#E69F00"))+
    xlab("Support by Friends")

7.3.2 Table with statistics (tapply())

The tapply() function can be used to display selected descriptive statistics for the combined levels of several factors. This is useful, for example, for presenting cell means in two-factorial ANOVAs. The generic form is: tapply(df$var1, list(factor1 = df$factor1, factor2 = df$factor2), statistic, na.rm = TRUE).

statistic has to be replaced by the desired statistic (only one at a time), e.g. mean, median, sum, sd.

Let’s display the means of the variable “Support by Parents” for the combined factors geschlecht and westost:

tapply(westost_scales$unt_eltern, 
       list(Geschlecht = westost_scales$geschlecht, Region = westost_scales$westost), 
       mean, na.rm = TRUE) %>%
    round(2)
#>           Region
#> Geschlecht West  Ost
#>   männlich 5.79 6.13
#>   weiblich 5.74 5.83

We did the same already using dplyr:

westost_scales %>% 
    group_by(geschlecht, westost) %>% 
    summarise(mean = mean(unt_eltern, na.rm = TRUE))
#> # A tibble: 4 x 3
#> # Groups:   geschlecht [2]
#>   geschlecht westost  mean
#>   <fct>      <fct>   <dbl>
#> 1 männlich   West     5.79
#> 2 männlich   Ost      6.13
#> 3 weiblich   West     5.74
#> 4 weiblich   Ost      5.83

# Caution: If we use drop_na() to remove missing values, all cases with missings 
# anywhere in the dataset will be removed. In this case, it is necessary to define 
# a new dataset including only the relevant variables first!

westost_scales %>% 
    group_by(geschlecht, westost) %>% 
    drop_na() %>% 
    summarise(mean = mean(unt_eltern))
#> # A tibble: 4 x 3
#> # Groups:   geschlecht [2]
#>   geschlecht westost  mean
#>   <fct>      <fct>   <dbl>
#> 1 männlich   West     5.78
#> 2 männlich   Ost      6.12
#> 3 weiblich   West     5.71
#> 4 weiblich   Ost      5.73
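
A sketch of the safer approach: select the relevant variables first, so that drop_na() only removes cases with missing values on these variables. This should reproduce the na.rm = TRUE results from above:

westost_scales %>% 
    select(geschlecht, westost, unt_eltern) %>% 
    drop_na() %>% 
    group_by(geschlecht, westost) %>% 
    summarise(mean = mean(unt_eltern))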

7.3.3 Descriptive statistics for metric variables using the psych package

install.packages("psych")

The psych package is often used for data analysis in psychology. It has a useful function for descriptive statistics for numerical variables: describe(). The generic form is: describe(x, na.rm = TRUE, interp = FALSE, skew = TRUE, ranges = TRUE, trim = .1, type = 3, check = TRUE).

x stands for the data frame or the variable to be analyzed (df$variable). The defaults are:

* interp = FALSE refers to the definition of the median (interp = TRUE uses our method of averaging adjacent values for an even n)
* skew = TRUE displays skewness, kurtosis, and the trimmed mean
* ranges = TRUE displays the range
* trim = .1 is the proportion of the distribution trimmed at the lower and at the upper end for the trimmed mean (the default of 10% on each side means the trimmed mean is computed from the middle 80% of the data)
* type = 3 refers to the method of computing skewness and kurtosis (more here: ?psych::describe)
* check = TRUE checks for non-numeric variables in the dataset (for which describe() has no use); with check = FALSE, an error message is displayed if non-numeric variables exist

library(psych)
describe(westost_scales)
#>                   vars   n   mean    sd median trimmed    mad   min    max
#> ID*                  1 286 143.50 82.71 143.50  143.50 106.01  1.00 286.00
#> westost*             2 286   1.50  0.50   1.50    1.50   0.74  1.00   2.00
#> geschlecht*          3 286   1.47  0.50   1.00    1.46   0.00  1.00   2.00
#> alter                4 286  14.70  0.76  15.00   14.65   1.48 13.00  17.00
#> SES*                 5 284   3.39  0.68   3.00    3.29   0.00  1.00   5.00
#> Deutsch*             6 283   4.26  0.86   4.00    4.27   1.48  2.00   6.00
#> Mathe*               7 284   4.17  0.95   4.00    4.18   1.48  2.00   6.00
#> Fremdspr*            8 275   4.31  0.93   4.00    4.33   1.48  2.00   6.00
#> Schnitt              9 280   4.46  0.69   4.50    4.51   0.74  3.00   6.00
#> bildung_vater*      10 269   2.69  1.10   2.00    2.74   1.48  1.00   4.00
#> bildung_mutter*     11 272   2.64  1.08   2.00    2.68   1.48  1.00   4.00
#> bildung_vater_b*    12 269   1.50  0.50   1.00    1.50   0.00  1.00   2.00
#> bildung_mutter_b*   13 272   1.49  0.50   1.00    1.49   0.00  1.00   2.00
#> swk_akad            14 286   5.10  0.77   5.14    5.11   0.85  2.43   7.00
#> swk_selbstreg       15 286   5.25  0.81   5.25    5.28   0.93  2.88   7.00
#> swk_durch           16 285   5.28  0.99   5.33    5.35   0.99  1.00   7.00
#> swk_motselbst       17 286   5.45  0.77   5.40    5.48   0.89  3.20   7.00
#> swk_sozharm         18 286   5.28  0.78   5.38    5.30   0.78  1.57   7.00
#> swk_bez             19 286   5.62  0.75   5.67    5.66   0.74  3.33   7.00
#> unt_eltern          20 285   5.87  1.00   6.17    6.01   0.74  1.50   7.00
#> unt_freunde         21 285   5.93  0.96   6.00    6.06   0.99  2.00   7.00
#> sch_leistung        22 284   4.33  1.20   4.33    4.33   0.99  1.67   7.00
#> sch_bez             23 286   3.82  1.12   3.75    3.82   1.00  1.00   7.00
#> sch_regeln          24 286   4.88  1.16   4.93    4.88   1.30  1.75   7.00
#> ges_gerecht         25 286   4.16  1.03   4.17    4.19   0.99  1.00   6.67
#> ges_nepot           26 283   4.42  1.09   4.40    4.43   1.19  1.00   7.00
#> leben_selbst        27 285   5.43  1.10   5.50    5.54   0.74  1.00   7.00
#> leben_fam           28 284   5.76  1.01   6.00    5.88   0.99  1.00   7.00
#> leben_schule        29 285   4.67  1.06   4.67    4.73   0.99  1.33   7.00
#> leben_freunde       30 285   5.90  0.81   6.00    5.95   0.74  2.00   7.00
#> leben_gesamt        31 285   5.42  0.69   5.45    5.46   0.67  2.91   6.91
#> stress_somat        32 283   3.02  1.19   3.00    2.93   0.99  1.00   7.00
#> stress_psych        33 284   2.99  1.20   2.92    2.91   1.11  1.00   6.83
#>                    range  skew kurtosis   se
#> ID*               285.00  0.00    -1.21 4.89
#> westost*            1.00  0.00    -2.01 0.03
#> geschlecht*         1.00  0.13    -1.99 0.03
#> alter               4.00  0.36    -0.36 0.04
#> SES*                4.00  0.91     0.73 0.04
#> Deutsch*            4.00 -0.09    -0.50 0.05
#> Mathe*              4.00 -0.15    -0.38 0.06
#> Fremdspr*           4.00 -0.35    -0.15 0.06
#> Schnitt             3.00 -0.41    -0.43 0.04
#> bildung_vater*      3.00 -0.06    -1.41 0.07
#> bildung_mutter*     3.00 -0.01    -1.34 0.07
#> bildung_vater_b*    1.00  0.01    -2.01 0.03
#> bildung_mutter_b*   1.00  0.03    -2.01 0.03
#> swk_akad            4.57 -0.16     0.02 0.05
#> swk_selbstreg       4.12 -0.35    -0.31 0.05
#> swk_durch           6.00 -0.80     1.45 0.06
#> swk_motselbst       3.80 -0.37    -0.32 0.05
#> swk_sozharm         5.43 -0.55     1.18 0.05
#> swk_bez             3.67 -0.48    -0.23 0.04
#> unt_eltern          5.50 -1.35     2.05 0.06
#> unt_freunde         5.00 -1.25     1.85 0.06
#> sch_leistung        5.33 -0.01    -0.37 0.07
#> sch_bez             6.00 -0.04    -0.05 0.07
#> sch_regeln          5.25 -0.10    -0.64 0.07
#> ges_gerecht         5.67 -0.26    -0.27 0.06
#> ges_nepot           6.00 -0.08    -0.10 0.06
#> leben_selbst        6.00 -1.28     2.41 0.07
#> leben_fam           6.00 -1.27     2.27 0.06
#> leben_schule        5.67 -0.48     0.11 0.06
#> leben_freunde       5.00 -1.16     3.10 0.05
#> leben_gesamt        4.00 -0.79     1.14 0.04
#> stress_somat        6.00  0.81     0.99 0.07
#> stress_psych        5.83  0.65     0.42 0.07
describe(westost_scales[, c("unt_eltern", "unt_freunde")])
#>             vars   n mean   sd median trimmed  mad min max range  skew
#> unt_eltern     1 285 5.87 1.00   6.17    6.01 0.74 1.5   7   5.5 -1.35
#> unt_freunde    2 285 5.93 0.96   6.00    6.06 0.99 2.0   7   5.0 -1.25
#>             kurtosis   se
#> unt_eltern      2.05 0.06
#> unt_freunde     1.85 0.06
describe(westost_scales[, c("unt_eltern", "unt_freunde")], trim = 0.2)
#>             vars   n mean   sd median trimmed  mad min max range  skew
#> unt_eltern     1 285 5.87 1.00   6.17    6.08 0.74 1.5   7   5.5 -1.35
#> unt_freunde    2 285 5.93 0.96   6.00    6.09 0.99 2.0   7   5.0 -1.25
#>             kurtosis   se
#> unt_eltern      2.05 0.06
#> unt_freunde     1.85 0.06
describe(westost_scales[, c("unt_eltern", "unt_freunde")], skew = FALSE, 
         range = FALSE)
#>             vars   n mean   sd   se
#> unt_eltern     1 285 5.87 1.00 0.06
#> unt_freunde    2 285 5.93 0.96 0.06

mad stands for the median absolute deviation (the median of the absolute deviations from the median, scaled by the constant 1.4826 by default; a robust alternative measure of dispersion). The variables recognized as factors are flagged with a * in the table; their (mostly useless) statistics are displayed in spite of the detection. We are mainly interested in the variables unt_eltern and unt_freunde. With trim = 0.2 we request a trimmed mean for which 20% of the values at the lower end and 20% of the values at the upper end of the distribution are removed. Finally, with skew = FALSE, range = FALSE we request descriptive statistics without skewness and kurtosis as well as without median, trimmed, mad, min, max, and range.
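
For example, the mad of 0.74 reported for unt_eltern should be reproducible with base R’s mad() function (which applies the scaling constant 1.4826 by default):

mad(westost_scales$unt_eltern, na.rm = TRUE)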

7.4 Descriptive statistics with jmv

The package jmv is the R package for the fabulous new statistics program jamovi. Everything that can be done in jamovi can also be done directly in R. When jamovi is run in syntax mode it is even possible to copy-paste the generated R code directly into an R markdown document like this one.

For the example, let’s first get a dataset swk including all self-efficacy scales + the gender variable:

swk <- westost_scales %>%
    select(starts_with("swk"), "geschlecht")
names(swk)
#> [1] "swk_akad"      "swk_selbstreg" "swk_durch"     "swk_motselbst"
#> [5] "swk_sozharm"   "swk_bez"       "geschlecht"

Descriptive statistics can be obtained with descriptives().

library(jmv)
descriptives(
    data = swk,
    vars = c(
        "swk_akad",
        "swk_selbstreg",
        "swk_motselbst",
        "swk_durch",
        "swk_sozharm",
        "swk_bez"),
    splitBy = "geschlecht")
#> 
#>  DESCRIPTIVES
#> 
#>  Descriptives                                                                                                   
#>  ────────────────────────────────────────────────────────────────────────────────────────────────────────────── 
#>               geschlecht    swk_akad    swk_selbstreg    swk_motselbst    swk_durch    swk_sozharm    swk_bez   
#>  ────────────────────────────────────────────────────────────────────────────────────────────────────────────── 
#>    N          männlich           152              152              152          151            152        152   
#>               weiblich           134              134              134          134            134        134   
#>    Missing    männlich             0                0                0            1              0          0   
#>               weiblich             0                0                0            0              0          0   
#>    Mean       männlich          5.14             5.16             5.43         5.48           5.30       5.55   
#>               weiblich          5.05             5.34             5.46         5.06           5.26       5.69   
#>    Median     männlich          5.21             5.25             5.40         5.67           5.29       5.67   
#>               weiblich          5.14             5.38             5.60         5.33           5.43       5.83   
#>    Minimum    männlich          3.29             2.88             3.20         3.00           3.86       3.33   
#>               weiblich          2.43             3.43             3.50         1.00           1.57       3.83   
#>    Maximum    männlich          7.00             6.88             7.00         7.00           7.00       7.00   
#>               weiblich          7.00             7.00             7.00         7.00           6.86       7.00   
#>  ──────────────────────────────────────────────────────────────────────────────────────────────────────────────

With ?jmv::descriptives we get an overview of all the options:

descriptives(data, vars, splitBy = NULL, freq = FALSE, hist = FALSE,
  dens = FALSE, bar = FALSE, barCounts = FALSE, box = FALSE,
  violin = FALSE, dot = FALSE, dotType = "jitter", n = TRUE,
  missing = TRUE, mean = TRUE, median = TRUE, mode = FALSE,
  sum = FALSE, sd = FALSE, variance = FALSE, range = FALSE,
  min = TRUE, max = TRUE, se = FALSE, skew = FALSE, kurt = FALSE,
  quart = FALSE, pcEqGr = FALSE, pcNEqGr = 4)

Arguments

data        the data as a data frame
vars        a vector of strings naming the variables of interest in data
splitBy     a vector of strings naming the variables used to split vars
freq        TRUE or FALSE (default), provide frequency tables (nominal, ordinal 
            variables only)
hist        TRUE or FALSE (default), provide histograms (continuous variables only)
dens        TRUE or FALSE (default), provide density plots (continuous variables only)
bar         TRUE or FALSE (default), provide bar plots (nominal, ordinal variables only)
barCounts   TRUE or FALSE (default), add counts to the bar plots
box         TRUE or FALSE (default), provide box plots (continuous variables only)
violin      TRUE or FALSE (default), provide violin plots (continuous variables only)
dot         TRUE or FALSE (default), provide dot plots (continuous variables only)

n           TRUE (default) or FALSE, provide the sample size
missing     TRUE (default) or FALSE, provide the number of missing values
mean        TRUE (default) or FALSE, provide the mean
median      TRUE (default) or FALSE, provide the median
mode        TRUE or FALSE (default), provide the mode
sum         TRUE or FALSE (default), provide the sum
sd          TRUE or FALSE (default), provide the standard deviation
variance    TRUE or FALSE (default), provide the variance
range       TRUE or FALSE (default), provide the range
min         TRUE (default) or FALSE, provide the minimum
max         TRUE (default) or FALSE, provide the maximum
se          TRUE or FALSE (default), provide the standard error
skew        TRUE or FALSE (default), provide the skewness
kurt        TRUE or FALSE (default), provide the kurtosis
quart       TRUE or FALSE (default), provide quartiles
pcEqGr      TRUE or FALSE (default), provide quantiles
pcNEqGr     an integer (default: 4) specifying the number of equal groups

Let’s try some of these:

descriptives(
    data = swk,
    vars = c(
        "geschlecht",
        "swk_akad",
        "swk_selbstreg",
        "swk_motselbst"),
    freq = TRUE,
    hist = TRUE,
    dens = TRUE,
    bar = TRUE,
    barCounts = TRUE,
    box = TRUE,
    sd = TRUE,
    variance = TRUE,
    range = TRUE,
    se = TRUE,
    skew = TRUE,
    kurt = TRUE,
    quart = TRUE,
    pcEqGr = TRUE,
    pcNEqGr = 10
)
#> 
#>  DESCRIPTIVES
#> 
#>  Descriptives                                                                        
#>  ─────────────────────────────────────────────────────────────────────────────────── 
#>                           geschlecht    swk_akad    swk_selbstreg    swk_motselbst   
#>  ─────────────────────────────────────────────────────────────────────────────────── 
#>    N                             286         286              286              286   
#>    Missing                         0           0                0                0   
#>    Mean                                     5.10             5.25             5.45   
#>    Std. error mean                        0.0455           0.0476           0.0454   
#>    Median                                   5.14             5.25             5.40   
#>    Standard deviation                      0.770            0.805            0.768   
#>    Variance                                0.593            0.648            0.590   
#>    Range                                    4.57             4.12             3.80   
#>    Minimum                                  2.43             2.88             3.20   
#>    Maximum                                  7.00             7.00             7.00   
#>    Skewness                               -0.161           -0.359           -0.374   
#>    Std. error skewness                     0.144            0.144            0.144   
#>    Kurtosis                               0.0615           -0.279           -0.288   
#>    Std. error kurtosis                     0.287            0.287            0.287   
#>    25th percentile                          4.57             4.75             5.00   
#>    50th percentile                          5.14             5.25             5.40   
#>    75th percentile                          5.57             5.88             6.00   
#>    10th percentile                          4.14             4.12             4.40   
#>    20th percentile                          4.43             4.62             4.80   
#>    30th percentile                          4.66             4.88             5.00   
#>    40th percentile                          4.86             5.12             5.33   
#>    50th percentile                          5.14             5.25             5.40   
#>    60th percentile                          5.33             5.50             5.80   
#>    70th percentile                          5.43             5.75             6.00   
#>    80th percentile                          5.71             6.00             6.20   
#>    90th percentile                          6.07             6.25             6.40   
#>  ─────────────────────────────────────────────────────────────────────────────────── 
#> 
#> 
#>  FREQUENCIES
#> 
#>  Frequencies of geschlecht                            
#>  ──────────────────────────────────────────────────── 
#>    Levels      Counts    % of Total    Cumulative %   
#>  ──────────────────────────────────────────────────── 
#>    männlich       152          53.1            53.1   
#>    weiblich       134          46.9           100.0   
#>  ────────────────────────────────────────────────────

Some of the functionality of the jamovi program comes in modules. The functions in these modules are not included in the jmv package, but in additional packages like scatr.

A scatter plot can be obtained with scatr::scat().

library(scatr)
scat(
    data = swk,
    x = "swk_akad",
    y = "swk_selbstreg",
    group = "geschlecht",
    line = "linear")

An overview of the available arguments is provided by ?scatr::scat:

scat(data, x, y, group = NULL, marg = "none", line = "none", se = FALSE)
Arguments

data    the data as a data frame
x       a string naming the variable from data that contains the x coordinates 
        of the points in the plot, variable must be numeric
y       a string naming the variable from data that contains the y coordinates 
        of the points in the plot, variable must be numeric
group   a string naming the variable from data that represents the grouping variable
marg    none (default), dens, or box, provide respectively no plots, density plots,
        or box plots on the axes
line    none (default), linear, or smooth, provide respectively no regression line, 
        a linear regression line, or a smoothed regression line
se      TRUE or FALSE (default), show the standard error for the regression line

Let’s try some of these:

library(scatr)
scat(
    data = swk,
    x = "swk_akad",
    y = "swk_selbstreg",
    group = "geschlecht",
    marg = "dens",
    line = "smooth",
    se = TRUE)

7.5 Correlations

7.5.1 Traditional ways of doing correlations in R

The generic form for calculating a (Pearson) correlation matrix using base R is as follows: cor(df[, c("var1", "var2", "var3")], method = c("pearson", "kendall", "spearman"), use = "complete.obs"). We can thus compute Pearson correlations (the default) or, for ordinal data, Kendall’s tau or Spearman rank correlations.

swk_cor <- swk %>%
    select(-geschlecht) %>% 
    cor(method = "pearson", use = "complete.obs") %>%
    round(3)
swk_cor
#>               swk_akad swk_selbstreg swk_durch swk_motselbst swk_sozharm
#> swk_akad         1.000         0.562     0.351         0.640       0.363
#> swk_selbstreg    0.562         1.000     0.318         0.724       0.342
#> swk_durch        0.351         0.318     1.000         0.415       0.592
#> swk_motselbst    0.640         0.724     0.415         1.000       0.407
#> swk_sozharm      0.363         0.342     0.592         0.407       1.000
#> swk_bez          0.392         0.466     0.327         0.415       0.493
#>               swk_bez
#> swk_akad        0.392
#> swk_selbstreg   0.466
#> swk_durch       0.327
#> swk_motselbst   0.415
#> swk_sozharm     0.493
#> swk_bez         1.000

If we want covariances instead of correlations we can use cov() with the same arguments as cor(): cov(df[, c("var1", "var2", "var3")], method = c("pearson", "kendall", "spearman"), use = "complete.obs").
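
For example, a covariance matrix of the self-efficacy scales (a sketch analogous to the correlation matrix above; output omitted):

swk %>%
    select(-geschlecht) %>% 
    cov(use = "complete.obs") %>%
    round(3)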

A good way to display a scatterplot matrix for a number of variables is the function scatterplotMatrix from the car package. In addition to displaying all combinations of bivariate scatterplots it is possible to get different univariate plots in the diagonal: diagonal = c("density","boxplot","histogram","oned","qqplot","none"). Similar to scat() above, smooth = TRUE can be used to display a non-linear LOWESS line (LOcally WEighted Scatterplot Smoother) to assess the (non-)linearity of the correlation.

library(car)
scatterplotMatrix(~ swk_akad + swk_selbstreg + swk_sozharm +  swk_durch,
                  smooth = TRUE,
                  diagonal = "density", 
                  data = swk)
#> Warning in applyDefaults(diagonal, defaults = list(method =
#> "adaptiveDensity"), : unnamed diag arguments, will be ignored

(The warning stems from newer versions of car, in which the diagonal argument expects a list, e.g. diagonal = list(method = "density"); a bare string is ignored.)

The function cor.test() can do a significance test for correlations, but only for one pair of variables at a time. The generic form is: cor.test(x, y, alternative = c("two.sided", "less", "greater"), method = c("pearson", "kendall", "spearman"), conf.level = 0.95). In addition to the non-directional alternative hypothesis (alternative = "two.sided"), we can also test directional alternative hypotheses: alternative = "less" for a hypothesized negative correlation (“less than 0”) and alternative = "greater" for a hypothesized positive correlation (“greater than 0”).

cor.test(swk$swk_akad, swk$swk_sozharm, alternative = "greater", method = "pearson")
#> 
#>  Pearson's product-moment correlation
#> 
#> data:  swk$swk_akad and swk$swk_sozharm
#> t = 6.5528, df = 284, p-value = 1.325e-10
#> alternative hypothesis: true correlation is greater than 0
#> 95 percent confidence interval:
#>  0.2746381 1.0000000
#> sample estimates:
#>       cor 
#> 0.3624033
p <- swk %>% 
    select(swk_akad, swk_sozharm) %>%
    drop_na() %>% 
    ggplot(mapping = aes(x = swk_akad, 
                         y = swk_sozharm))

p + geom_jitter(size = 3, alpha = 0.6) +
    stat_smooth(method = "lm", se = TRUE) +
    theme_classic()

The function corr.test() from the psych package returns p-values for a whole correlation matrix. However, no directional alternative hypotheses are possible here. Matrices of correlations, sample sizes, and p-values are displayed. With the p-value matrix, note that the unadjusted (two-sided) p-values are in the lower triangle and the p-values adjusted according to Holm are in the upper triangle. The Holm procedure corresponds to a sequential alpha correction that is less conservative than the Bonferroni procedure but nevertheless controls the family-wise error rate. The diagonal of the p-value matrix contains zeros, since each variable correlates perfectly with itself.

In the Holm procedure (Bonferroni-Holm method), the smallest p-value is multiplied by the number of tests \(m = k(k-1)/2\) (with k = number of variables in the correlation matrix), the second smallest by \(m-1\), and so on, up to the largest p-value, which is no longer corrected (i.e., it is multiplied by \(m-m+1 = 1\)). However, a p-value is only multiplied by the corresponding number of tests if the resulting adjusted p-value is not smaller than the previously adjusted (smaller) p-value; otherwise the test is assigned the same adjusted p-value as its predecessor. This prevents a smaller effect from obtaining a smaller adjusted p-value than a larger effect, thereby preventing a smaller effect from reaching significance while the larger effect does not.
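
Base R implements this procedure in p.adjust(). A minimal illustration with three made-up p-values (three tests, so the multipliers are 3, 2, and 1):

pvals <- c(0.001, 0.02, 0.04)
p.adjust(pvals, method = "holm")
#> [1] 0.003 0.040 0.040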

library(psych)
swk[1:25,] %>% select(starts_with("swk")) %>% 
    corr.test()
#> Call:corr.test(x = .)
#> Correlation matrix 
#>               swk_akad swk_selbstreg swk_durch swk_motselbst swk_sozharm
#> swk_akad          1.00          0.38      0.66          0.45        0.55
#> swk_selbstreg     0.38          1.00      0.47          0.70        0.37
#> swk_durch         0.66          0.47      1.00          0.41        0.72
#> swk_motselbst     0.45          0.70      0.41          1.00        0.49
#> swk_sozharm       0.55          0.37      0.72          0.49        1.00
#> swk_bez           0.31          0.56      0.34          0.66        0.66
#>               swk_bez
#> swk_akad         0.31
#> swk_selbstreg    0.56
#> swk_durch        0.34
#> swk_motselbst    0.66
#> swk_sozharm      0.66
#> swk_bez          1.00
#> Sample Size 
#> [1] 25
#> Probability values (Entries above the diagonal are adjusted for multiple tests.) 
#>               swk_akad swk_selbstreg swk_durch swk_motselbst swk_sozharm
#> swk_akad          0.00          0.23      0.00          0.14        0.04
#> swk_selbstreg     0.06          0.00      0.12          0.00        0.23
#> swk_durch         0.00          0.02      0.00          0.21        0.00
#> swk_motselbst     0.02          0.00      0.04          0.00        0.11
#> swk_sozharm       0.00          0.07      0.00          0.01        0.00
#> swk_bez           0.13          0.00      0.09          0.00        0.00
#>               swk_bez
#> swk_akad         0.23
#> swk_selbstreg    0.04
#> swk_durch        0.23
#> swk_motselbst    0.00
#> swk_sozharm      0.00
#> swk_bez          0.00
#> 
#>  To see confidence intervals of the correlations, print with the short=FALSE option

7.5.2 Partial correlation

Partial correlations can be obtained with partial.cor(df[, c("var1", "var2", "var3")], use = "complete.obs") from the RcmdrMisc package. Important: each bivariate partial correlation is adjusted for all other variables in the data frame!

# Normal correlation matrix for comparison (zero-order correlations)
cor(westost_scales[,c("swk_akad", "swk_selbstreg", "Schnitt", "leben_schule")],
    use = "complete.obs")
#>                swk_akad swk_selbstreg   Schnitt leben_schule
#> swk_akad      1.0000000     0.5603931 0.4168517    0.3902735
#> swk_selbstreg 0.5603931     1.0000000 0.3824271    0.4896185
#> Schnitt       0.4168517     0.3824271 1.0000000    0.4754183
#> leben_schule  0.3902735     0.4896185 0.4754183    1.0000000

# Partial correlation matrix with three variables (first-order partial correlations)
partial.cor(westost_scales[,c("swk_akad", "swk_selbstreg", "Schnitt")],
            use = "complete.obs")
#> 
#>  Partial correlations:
#>               swk_akad swk_selbstreg Schnitt
#> swk_akad       0.00000       0.47894 0.26472
#> swk_selbstreg  0.47894       0.00000 0.19889
#> Schnitt        0.26472       0.19889 0.00000
#> 
#>  Number of observations: 280

# Partial correlation matrix with four variables (second-order partial correlations)
partial.cor(westost_scales[,c("swk_akad", "swk_selbstreg",
                              "Schnitt", "leben_schule")], 
            use = "complete.obs")
#> 
#>  Partial correlations:
#>               swk_akad swk_selbstreg Schnitt leben_schule
#> swk_akad       0.00000       0.43015 0.22486      0.07308
#> swk_selbstreg  0.43015       0.00000 0.07488      0.30949
#> Schnitt        0.22486       0.07488 0.00000      0.33116
#> leben_schule   0.07308       0.30949 0.33116      0.00000
#> 
#>  Number of observations: 279
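
As a check on the logic: a first-order partial correlation is simply the correlation between the residuals of the two variables after each has been regressed on the control variable. A sketch (this should reproduce the 0.47894 from above):

d <- na.omit(westost_scales[, c("swk_akad", "swk_selbstreg", "Schnitt")])
cor(resid(lm(swk_akad ~ Schnitt, data = d)),
    resid(lm(swk_selbstreg ~ Schnitt, data = d)))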

7.5.3 Correlation analysis with jmv

Almost everything we have done above with the more traditional R functions can also be done in jmv, using the corrMatrix() function:

library(jmv)
corrMatrix(
    data = swk,
    vars = c(
        "swk_akad",
        "swk_selbstreg",
        "swk_motselbst",
        "swk_durch",
        "swk_sozharm",
        "swk_bez"))
#> 
#>  CORRELATION MATRIX
#> 
#>  Correlation Matrix                                                                                                    
#>  ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────── 
#>                                    swk_akad    swk_selbstreg    swk_motselbst    swk_durch    swk_sozharm    swk_bez   
#>  ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────── 
#>    swk_akad         Pearson's r           —            0.562            0.641        0.351          0.362      0.391   
#>                     p-value               —           < .001           < .001       < .001         < .001     < .001   
#>                                                                                                                        
#>    swk_selbstreg    Pearson's r                            —            0.723        0.318          0.341      0.464   
#>                     p-value                                —           < .001       < .001         < .001     < .001   
#>                                                                                                                        
#>    swk_motselbst    Pearson's r                                             —        0.415          0.407      0.414   
#>                     p-value                                                 —       < .001         < .001     < .001   
#>                                                                                                                        
#>    swk_durch        Pearson's r                                                          —          0.592      0.327   
#>                     p-value                                                              —         < .001     < .001   
#>                                                                                                                        
#>    swk_sozharm      Pearson's r                                                                         —      0.493   
#>                     p-value                                                                             —     < .001   
#>                                                                                                                        
#>    swk_bez          Pearson's r                                                                                    —   
#>                     p-value                                                                                        —   
#>  ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────

An overview of the available arguments is provided by ?jmv::corrMatrix:

corrMatrix(data, vars, pearson = TRUE, spearman = FALSE, kendall = FALSE,
  sig = TRUE, flag = FALSE, ci = FALSE, ciWidth = 95, plots = FALSE,
  plotDens = FALSE, plotStats = FALSE, hypothesis = "corr")

Arguments

data        the data as a data frame
vars        a vector of strings naming the variables to correlate in data
pearson     TRUE (default) or FALSE, provide Pearson's R
spearman    TRUE or FALSE (default), provide Spearman's rho
kendall     TRUE or FALSE (default), provide Kendall's tau-b
sig         TRUE (default) or FALSE, provide significance levels
flag        TRUE or FALSE (default), flag significant correlations
ci          TRUE or FALSE (default), provide confidence intervals
ciWidth     a number between 50 and 99.9 (default: 95), the width of CI
plots       TRUE or FALSE (default), provide a correlation matrix plot
plotDens    TRUE or FALSE (default), provide densities in the correlation matrix plot
plotStats   TRUE or FALSE (default), provide statistics in the correlation matrix plot
hypothesis  one of 'corr' (default), 'pos', 'neg', specifying the alternative
            hypothesis: correlated, positively correlated, or negatively
            correlated, respectively

Let’s try some options with only the three academic self-efficacy variables:

corrMatrix(
    data = swk,
    vars = c(
        "swk_akad",
        "swk_selbstreg",
        "swk_motselbst"),
    kendall = TRUE,
    sig = FALSE,
    flag = TRUE,
    ci = TRUE)
#> 
#>  CORRELATION MATRIX
#> 
#>  Correlation Matrix                                                                 
#>  ────────────────────────────────────────────────────────────────────────────────── 
#>                                        swk_akad    swk_selbstreg    swk_motselbst   
#>  ────────────────────────────────────────────────────────────────────────────────── 
#>    swk_akad         Pearson's r               —            0.562            0.641   
#>                     95% CI Upper              —            0.636            0.704   
#>                     95% CI Lower              —            0.477            0.567   
#>                     Kendall's Tau B           —            0.425            0.471   
#>                                                                                     
#>    swk_selbstreg    Pearson's r                                —            0.723   
#>                     95% CI Upper                               —            0.774   
#>                     95% CI Lower                               —            0.663   
#>                     Kendall's Tau B                            —            0.569   
#>                                                                                     
#>    swk_motselbst    Pearson's r                                                 —   
#>                     95% CI Upper                                                —   
#>                     95% CI Lower                                                —   
#>                     Kendall's Tau B                                             —   
#>  ────────────────────────────────────────────────────────────────────────────────── 
#>    Note. * p < .05, ** p < .01, *** p < .001

7.6 Reliability analysis

7.6.1 Reliability analysis using the psych package

For the reliability analysis we will need the data set containing all individual items (westost). Then we create two reduced data sets: one for the scale “Self-Efficacy: Academic Achievement” (swkakad), including items T1_1, T1_7, T1_14, T1_16, T1_26, T1_27, and T1_31, and another for the scale “Self-Efficacy: Social Harmony” (swksozharm), including items T1_2, T1_8, T1_15, T1_23, T1_29, T1_32, and T1_35.

library(psych)
load(file = "data/westost.Rdata")
names(westost)
#>   [1] "ID"               "westost"          "geschlecht"      
#>   [4] "alter"            "T1_1"             "T1_2"            
#>   [7] "T1_3"             "T1_4"             "T1_5"            
#>  [10] "T1_6"             "T1_7"             "T1_8"            
#>  [13] "T1_9"             "T1_10"            "T1_11"           
#>  [16] "T1_12"            "T1_13"            "T1_14"           
#>  [19] "T1_15"            "T1_16"            "T1_17"           
#>  [22] "T1_18"            "T1_19"            "T1_20"           
#>  [25] "T1_21"            "T1_22"            "T1_23"           
#>  [28] "T1_24"            "T1_25"            "T1_26"           
#>  [31] "T1_27"            "T1_28"            "T1_29"           
#>  [34] "T1_30"            "T1_31"            "T1_32"           
#>  [37] "T1_33"            "T1_34"            "T1_35"           
#>  [40] "T1_36"            "T2_1"             "T2_2"            
#>  [43] "T2_3"             "T2_4"             "T2_5"            
#>  [46] "T2_6"             "T3_1"             "T3_2"            
#>  [49] "T3_3"             "T3_4"             "T3_5"            
#>  [52] "T3_6"             "T4_1"             "T4_2"            
#>  [55] "T4_3"             "T4_4"             "T4_5"            
#>  [58] "T4_6"             "T4_7"             "T4_8"            
#>  [61] "T4_9"             "T4_10"            "T4_11"           
#>  [64] "T5_1"             "T5_2"             "T5_3"            
#>  [67] "T5_4"             "T5_5"             "T5_6"            
#>  [70] "T5_7"             "T5_8"             "T5_9"            
#>  [73] "T5_10"            "T5_11"            "T6_1"            
#>  [76] "T6_2"             "T6_3"             "T6_4"            
#>  [79] "T6_5"             "T6_6"             "T6_7"            
#>  [82] "T6_8"             "T6_9"             "T6_10"           
#>  [85] "T6_11"            "T7_1"             "T7_2"            
#>  [88] "T7_3"             "T7_4"             "T7_5"            
#>  [91] "T7_6"             "T7_7"             "T7_8"            
#>  [94] "T7_9"             "T7_10"            "T7_11"           
#>  [97] "T7_12"            "SES"              "Deutsch"         
#> [100] "Mathe"            "Fremdspr"         "Schnitt"         
#> [103] "bildung_vater"    "bildung_mutter"   "bildung_vater_b" 
#> [106] "bildung_mutter_b" "swk_akad"         "swk_selbstreg"   
#> [109] "swk_durch"        "swk_motselbst"    "swk_sozharm"     
#> [112] "swk_bez"          "unt_eltern"       "unt_freunde"     
#> [115] "sch_leistung"     "sch_bez"          "sch_regeln"      
#> [118] "ges_gerecht"      "ges_nepot"        "leben_selbst"    
#> [121] "leben_fam"        "leben_schule"     "leben_freunde"   
#> [124] "leben_gesamt"     "stress_somat"     "stress_psych"

swkakad <- westost %>% 
    select(one_of(c("T1_1", "T1_7", "T1_14", "T1_16", 
                    "T1_26", "T1_27", "T1_31")))
swkakad
#> # A tibble: 286 x 7
#>    T1_1  T1_7 T1_14 T1_16 T1_26 T1_27 T1_31
#>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1     4     5     5     7     5     4     5
#> 2     3     5     5     7     4     5     5
#> 3     4     1     2     6     6     5     4
#> 4     6     5     6     7     7     6     6
#> 5     5     5     4     6     6     4     6
#> 6     6     5     6     7     7     3     4
#> # … with 280 more rows


swksozharm <- westost %>% 
    select(one_of(c("T1_2", "T1_8", "T1_15", "T1_23", 
                    "T1_29", "T1_32", "T1_35")))
swksozharm
#> # A tibble: 286 x 7
#>    T1_2  T1_8 T1_15 T1_23 T1_29 T1_32 T1_35
#>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1     6     6     5     5     5     6     6
#> 2     5     6     4     3     4     6     5
#> 3     3     6     4     3     3     5     7
#> 4     6     6     6     6     6     6     6
#> 5     3     4     3     4     4     6     5
#> 6     6     6     6     5     7     7     7
#> # … with 280 more rows

Cronbach’s alpha
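
For reference: for \(k\) items with item variances \(s_i^2\) and total-score variance \(s_t^2\), Cronbach’s alpha is defined as \(\alpha = \frac{k}{k-1}\left(1 - \frac{\sum_{i=1}^{k} s_i^2}{s_t^2}\right)\).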

The internal consistency (Cronbach’s Alpha) can be computed using the function alpha():

alpha(swkakad)
#> 
#> Reliability analysis   
#> Call: alpha(x = swkakad)
#> 
#>   raw_alpha std.alpha G6(smc) average_r S/N   ase mean   sd median_r
#>       0.63      0.63    0.65      0.19 1.7 0.033  5.1 0.77     0.18
#> 
#>  lower alpha upper     95% confidence boundaries
#> 0.56 0.63 0.7 
#> 
#>  Reliability if an item is dropped:
#>       raw_alpha std.alpha G6(smc) average_r S/N alpha se var.r med.r
#> T1_1       0.59      0.59    0.60      0.19 1.4    0.037 0.022 0.168
#> T1_7       0.58      0.58    0.57      0.19 1.4    0.039 0.013 0.192
#> T1_14      0.57      0.56    0.57      0.18 1.3    0.039 0.022 0.097
#> T1_16      0.64      0.64    0.65      0.23 1.8    0.032 0.020 0.192
#> T1_26      0.61      0.61    0.63      0.21 1.6    0.035 0.024 0.180
#> T1_27      0.58      0.58    0.59      0.19 1.4    0.039 0.021 0.168
#> T1_31      0.56      0.57    0.56      0.18 1.3    0.040 0.015 0.180
#> 
#>  Item statistics 
#>         n raw.r std.r r.cor r.drop mean  sd
#> T1_1  286  0.53  0.56  0.45   0.34  5.0 1.2
#> T1_7  284  0.61  0.58  0.52   0.39  5.0 1.5
#> T1_14 283  0.61  0.62  0.54   0.41  4.9 1.4
#> T1_16 285  0.43  0.43  0.24   0.18  5.8 1.4
#> T1_26 285  0.47  0.50  0.34   0.27  5.3 1.2
#> T1_27 281  0.61  0.58  0.48   0.38  4.9 1.5
#> T1_31 283  0.63  0.61  0.56   0.43  4.7 1.4
#> 
#> Non missing response frequency for each item
#>          1    2    3    4    5    6    7 miss
#> T1_1  0.01 0.01 0.07 0.20 0.33 0.30 0.08 0.00
#> T1_7  0.02 0.04 0.12 0.13 0.29 0.24 0.16 0.01
#> T1_14 0.03 0.05 0.07 0.19 0.26 0.32 0.10 0.01
#> T1_16 0.02 0.02 0.03 0.09 0.15 0.32 0.36 0.00
#> T1_26 0.01 0.01 0.05 0.14 0.31 0.33 0.15 0.00
#> T1_27 0.04 0.03 0.09 0.21 0.23 0.26 0.15 0.02
#> T1_31 0.03 0.03 0.13 0.22 0.26 0.23 0.10 0.01

Optionally, a “key vector” can be supplied. This is particularly useful when some items are worded in the opposite direction and therefore have to be recoded (reverse-scored) before the reliability analysis. The key vector indicates which items are to be recoded (here: none, so every entry is 1).

alpha(swkakad, keys = c(1, 1, 1, 1, 1, 1, 1))
#> 
#> Reliability analysis   
#> Call: alpha(x = swkakad, keys = c(1, 1, 1, 1, 1, 1, 1))
#> 
#>   raw_alpha std.alpha G6(smc) average_r S/N   ase mean   sd median_r
#>       0.63      0.63    0.65      0.19 1.7 0.033  5.1 0.77     0.18
#> 
#>  lower alpha upper     95% confidence boundaries
#> 0.56 0.63 0.7 
#> 
#>  Reliability if an item is dropped:
#>       raw_alpha std.alpha G6(smc) average_r S/N alpha se var.r med.r
#> T1_1       0.59      0.59    0.60      0.19 1.4    0.037 0.022 0.168
#> T1_7       0.58      0.58    0.57      0.19 1.4    0.039 0.013 0.192
#> T1_14      0.57      0.56    0.57      0.18 1.3    0.039 0.022 0.097
#> T1_16      0.64      0.64    0.65      0.23 1.8    0.032 0.020 0.192
#> T1_26      0.61      0.61    0.63      0.21 1.6    0.035 0.024 0.180
#> T1_27      0.58      0.58    0.59      0.19 1.4    0.039 0.021 0.168
#> T1_31      0.56      0.57    0.56      0.18 1.3    0.040 0.015 0.180
#> 
#>  Item statistics 
#>         n raw.r std.r r.cor r.drop mean  sd
#> T1_1  286  0.53  0.56  0.45   0.34  5.0 1.2
#> T1_7  284  0.61  0.58  0.52   0.39  5.0 1.5
#> T1_14 283  0.61  0.62  0.54   0.41  4.9 1.4
#> T1_16 285  0.43  0.43  0.24   0.18  5.8 1.4
#> T1_26 285  0.47  0.50  0.34   0.27  5.3 1.2
#> T1_27 281  0.61  0.58  0.48   0.38  4.9 1.5
#> T1_31 283  0.63  0.61  0.56   0.43  4.7 1.4
#> 
#> Non missing response frequency for each item
#>          1    2    3    4    5    6    7 miss
#> T1_1  0.01 0.01 0.07 0.20 0.33 0.30 0.08 0.00
#> T1_7  0.02 0.04 0.12 0.13 0.29 0.24 0.16 0.01
#> T1_14 0.03 0.05 0.07 0.19 0.26 0.32 0.10 0.01
#> T1_16 0.02 0.02 0.03 0.09 0.15 0.32 0.36 0.00
#> T1_26 0.01 0.01 0.05 0.14 0.31 0.33 0.15 0.00
#> T1_27 0.04 0.03 0.09 0.21 0.23 0.26 0.15 0.02
#> T1_31 0.03 0.03 0.13 0.22 0.26 0.23 0.10 0.01

In the following command, recoding keys are (erroneously) specified for items 2 and 5 (i.e., T1_7 and T1_26):

alpha(swkakad, keys = c(1, -1, 1, 1, -1, 1, 1))
#> 
#> Reliability analysis   
#> Call: alpha(x = swkakad, keys = c(1, -1, 1, 1, -1, 1, 1))
#> 
#>   raw_alpha std.alpha G6(smc) average_r S/N   ase mean   sd median_r
#>       0.63      0.63    0.65      0.19 1.7 0.033  5.1 0.77     0.18
#> 
#>  lower alpha upper     95% confidence boundaries
#> 0.56 0.63 0.7 
#> 
#>  Reliability if an item is dropped:
#>       raw_alpha std.alpha G6(smc) average_r S/N alpha se var.r med.r
#> T1_1       0.59      0.59    0.60      0.19 1.4    0.037 0.022 0.168
#> T1_7       0.58      0.58    0.57      0.19 1.4    0.039 0.013 0.192
#> T1_14      0.57      0.56    0.57      0.18 1.3    0.039 0.022 0.097
#> T1_16      0.64      0.64    0.65      0.23 1.8    0.032 0.020 0.192
#> T1_26      0.61      0.61    0.63      0.21 1.6    0.035 0.024 0.180
#> T1_27      0.58      0.58    0.59      0.19 1.4    0.039 0.021 0.168
#> T1_31      0.56      0.57    0.56      0.18 1.3    0.040 0.015 0.180
#> 
#>  Item statistics 
#>         n raw.r std.r r.cor r.drop mean  sd
#> T1_1  286  0.53  0.56  0.45   0.34  5.0 1.2
#> T1_7  284  0.61  0.58  0.52   0.39  5.0 1.5
#> T1_14 283  0.61  0.62  0.54   0.41  4.9 1.4
#> T1_16 285  0.43  0.43  0.24   0.18  5.8 1.4
#> T1_26 285  0.47  0.50  0.34   0.27  5.3 1.2
#> T1_27 281  0.61  0.58  0.48   0.38  4.9 1.5
#> T1_31 283  0.63  0.61  0.56   0.43  4.7 1.4
#> 
#> Non missing response frequency for each item
#>          1    2    3    4    5    6    7 miss
#> T1_1  0.01 0.01 0.07 0.20 0.33 0.30 0.08 0.00
#> T1_7  0.02 0.04 0.12 0.13 0.29 0.24 0.16 0.01
#> T1_14 0.03 0.05 0.07 0.19 0.26 0.32 0.10 0.01
#> T1_16 0.02 0.02 0.03 0.09 0.15 0.32 0.36 0.00
#> T1_26 0.01 0.01 0.05 0.14 0.31 0.33 0.15 0.00
#> T1_27 0.04 0.03 0.09 0.21 0.23 0.26 0.15 0.02
#> T1_31 0.03 0.03 0.13 0.22 0.26 0.23 0.10 0.01
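
Note that the output is identical to the run without keys: the numeric key vector had no effect here. In current versions of the psych package, items to be reversed are instead given by name (reversed items are then flagged with a trailing “-” in the item statistics), or detected automatically. A sketch of both variants, purely for illustration, since none of the swkakad items actually needs recoding:

# hypothetical: reverse-score T1_7 and T1_26 by naming them in keys
alpha(swkakad, keys = c("T1_7", "T1_26"))

# let alpha() find and reverse items that correlate negatively with the total score
alpha(swkakad, check.keys = TRUE)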

Explanation of the output:

raw_alpha   # "raw" alpha, based on item covariances
std.alpha   # standardized alpha, based on item correlations
G6(smc)     # Guttman's lambda 6 (an alternative reliability measure)
average_r   # average inter-item correlation
mean        # scale mean (mean across all items)
sd          # scale standard deviation
alpha.drop  # statistics if the respective item is dropped from the scale
item.stats  # item-total correlations (raw and corrected)
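
As a quick plausibility check, std.alpha can be reproduced from average_r with the Spearman-Brown formula. A minimal sketch, using the rounded values reported above:

k <- 7         # number of items in swkakad
r_bar <- 0.19  # average inter-item correlation (average_r)
k * r_bar / (1 + (k - 1) * r_bar)
#> [1] 0.6214953

This matches the reported std.alpha of 0.63 up to the rounding of average_r.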
We repeat the reliability analysis for the social harmony scale:

alpha(swksozharm)
#> 
#> Reliability analysis   
#> Call: alpha(x = swksozharm)
#> 
#>   raw_alpha std.alpha G6(smc) average_r S/N   ase mean   sd median_r
#>        0.8      0.82    0.81      0.39 4.5 0.018  5.3 0.78     0.36
#> 
#>  lower alpha upper     95% confidence boundaries
#> 0.77 0.8 0.84 
#> 
#>  Reliability if an item is dropped:
#>       raw_alpha std.alpha G6(smc) average_r S/N alpha se var.r med.r
#> T1_2       0.76      0.78    0.76      0.37 3.5    0.022 0.011  0.32
#> T1_8       0.76      0.78    0.76      0.37 3.5    0.022 0.012  0.32
#> T1_15      0.79      0.80    0.79      0.41 4.1    0.020 0.017  0.37
#> T1_23      0.82      0.82    0.81      0.44 4.6    0.016 0.011  0.39
#> T1_29      0.78      0.80    0.79      0.40 4.1    0.020 0.018  0.35
#> T1_32      0.77      0.78    0.77      0.37 3.5    0.022 0.013  0.35
#> T1_35      0.77      0.78    0.76      0.37 3.6    0.022 0.011  0.36
#> 
#>  Item statistics 
#>         n raw.r std.r r.cor r.drop mean   sd
#> T1_2  285  0.75  0.75  0.71   0.61  5.2 1.27
#> T1_8  281  0.75  0.76  0.71   0.63  5.5 1.10
#> T1_15 284  0.64  0.64  0.54   0.49  4.7 1.10
#> T1_23 278  0.60  0.55  0.42   0.37  4.5 1.47
#> T1_29 286  0.65  0.65  0.56   0.51  5.1 1.10
#> T1_32 281  0.73  0.75  0.70   0.62  5.8 0.99
#> T1_35 281  0.72  0.75  0.71   0.62  6.1 0.96
#> 
#> Non missing response frequency for each item
#>          1    2    3    4    5    6    7 miss
#> T1_2  0.01 0.01 0.07 0.20 0.27 0.27 0.17 0.00
#> T1_8  0.00 0.01 0.02 0.15 0.26 0.39 0.17 0.02
#> T1_15 0.01 0.01 0.07 0.30 0.38 0.17 0.05 0.01
#> T1_23 0.04 0.06 0.12 0.24 0.27 0.20 0.08 0.03
#> T1_29 0.00 0.01 0.06 0.20 0.36 0.27 0.10 0.00
#> T1_32 0.01 0.00 0.01 0.05 0.25 0.44 0.23 0.02
#> T1_35 0.00 0.00 0.01 0.04 0.18 0.39 0.38 0.02

Via the summary() function it is also possible to obtain only the scale-level reliability statistics. To do this, the result of the analysis must first be saved in an object:

sozharm_rel <- alpha(swksozharm)
summary(sozharm_rel)
#> 
#> Reliability analysis   
#>  raw_alpha std.alpha G6(smc) average_r S/N   ase mean   sd median_r
#>        0.8      0.82    0.81      0.39 4.5 0.018  5.3 0.78     0.36

# also possible:
swksozharm %>% 
    alpha() %>% 
    summary()
#> 
#> Reliability analysis   
#>  raw_alpha std.alpha G6(smc) average_r S/N   ase mean   sd median_r
#>        0.8      0.82    0.81      0.39 4.5 0.018  5.3 0.78     0.36
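
If only the numeric coefficient is needed, for example to compile a table of reliabilities, it can be extracted from the saved result object; the scale-level statistics are stored in its total element (a sketch relying on psych's result structure):

sozharm_rel$total$raw_alpha  # the raw alpha as a plain number (about 0.80)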

7.6.2 Reliability analysis with jmv

The reliability() function from the jmv package mirrors jamovi’s Reliability Analysis. In addition to Cronbach’s α it reports McDonald’s ω (omegaScale = TRUE); here we also request scale and item means and standard deviations, item-rest correlations, and a correlation plot (not shown below) for the academic self-efficacy items:

library(jmv)
reliability(
    data = westost,
    vars = c(
        "T1_1",
        "T1_7",
        "T1_14",
        "T1_16",
        "T1_26",
        "T1_27",
        "T1_31"),
    omegaScale = TRUE,
    meanScale = TRUE,
    sdScale = TRUE,
    corPlot = TRUE,
    alphaItems = TRUE,
    omegaItems = TRUE,
    meanItems = TRUE,
    sdItems = TRUE,
    itemRestCor = TRUE)
#> 
#>  RELIABILITY ANALYSIS
#> 
#>  Scale Reliability Statistics                               
#>  ────────────────────────────────────────────────────────── 
#>             mean    sd       Cronbach's α    McDonald's ω   
#>  ────────────────────────────────────────────────────────── 
#>    scale    5.10    0.777           0.637           0.643   
#>  ────────────────────────────────────────────────────────── 
#> 
#> 
#>  Item Reliability Statistics                                                        
#>  ────────────────────────────────────────────────────────────────────────────────── 
#>             mean    sd      item-rest correlation    Cronbach's α    McDonald's ω   
#>  ────────────────────────────────────────────────────────────────────────────────── 
#>    T1_1     5.05    1.18                    0.352           0.601           0.615   
#>    T1_7     4.99    1.49                    0.390           0.586           0.601   
#>    T1_14    4.94    1.44                    0.418           0.577           0.597   
#>    T1_16    5.75    1.41                    0.195           0.648           0.650   
#>    T1_26    5.30    1.20                    0.271           0.622           0.634   
#>    T1_27    4.92    1.51                    0.385           0.588           0.598   
#>    T1_31    4.75    1.42                    0.431           0.573           0.589   
#>  ──────────────────────────────────────────────────────────────────────────────────

Analogous to the erroneous key vector above, in the following call item T1_2 is (wrongly) declared as reverse-scored via the revItems argument. This time the misspecification does have an effect, and the output warns us about it:

library(jmv)
reliability(
    data = westost,
    vars = c(
        "T1_2",
        "T1_8",
        "T1_15",
        "T1_23",
        "T1_29",
        "T1_32",
        "T1_35"),
    omegaScale = TRUE,
    meanScale = TRUE,
    sdScale = TRUE,
    corPlot = TRUE,
    alphaItems = TRUE,
    omegaItems = TRUE,
    meanItems = TRUE,
    sdItems = TRUE,
    itemRestCor = TRUE,
    revItems = "T1_2")
#> 
#>  RELIABILITY ANALYSIS
#> 
#>  Scale Reliability Statistics                               
#>  ────────────────────────────────────────────────────────── 
#>             mean    sd       Cronbach's α    McDonald's ω   
#>  ────────────────────────────────────────────────────────── 
#>    scale    4.93    0.568           0.475           0.676   
#>  ────────────────────────────────────────────────────────── 
#>    Note. item 'T1_2' correlates negatively with the
#>    total scale and probably should be reversed
#> 
#> 
#>  Item Reliability Statistics                                                           
#>  ───────────────────────────────────────────────────────────────────────────────────── 
#>               mean    sd       item-rest correlation    Cronbach's α    McDonald's ω   
#>  ───────────────────────────────────────────────────────────────────────────────────── 
#>    T1_2  ᵃ    2.80    1.262                   -0.606           0.762           0.783   
#>    T1_8       5.50    1.098                    0.467           0.323           0.579   
#>    T1_15      4.72    1.117                    0.441           0.334           0.625   
#>    T1_23      4.55    1.487                    0.386           0.339           0.649   
#>    T1_29      5.12    1.084                    0.450           0.333           0.621   
#>    T1_32      5.77    1.000                    0.485           0.328           0.579   
#>    T1_35      6.08    0.963                    0.451           0.348           0.589   
#>  ───────────────────────────────────────────────────────────────────────────────────── 
#>    ᵃ reverse scaled item
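
The warning and the negative item-rest correlation of the (now reversed) item T1_2 show that it was recoded although it did not need to be; Cronbach’s α accordingly drops from 0.80 to 0.48. Removing "T1_2" from revItems restores the correct analysis.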