8 Descriptive statistics and correlations

8.1 Descriptives for all variables

The function summary(data_frame) returns descriptive statistics for all variables in a dataset. For numeric variables, the minimum, maximum, quartiles, median, and mean values are returned, for factors the frequencies of the factor levels. In addition, the number of missing values for both variable types is displayed.

We first import our sample data set and assign the variables their correct scale level (i.e., defining factors as factors). Then we save the data set as an .Rdata file in our data folder, so that we can load it next time without having to do all the import work again.

library(tidyverse)
library(haven)
adolescents <- read_sav("data/beispieldaten.sav")
# Look at the data set
adolescents
#> # A tibble: 286 x 98
#>      ID westost geschlecht alter  swk1  swk2  swk3  swk4  swk5  swk6  swk7  swk8
#>   <dbl> <dbl+l>  <dbl+lbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1     1 0 [Wes… 1 [weibli…    13     2     5     6     4     7     5     5     6
#> 2     2 0 [Wes… 0 [männli…    14     4     6     4     4     6     5     5     6
#> 3    10 0 [Wes… 1 [weibli…    14     5     4     5     6     5     7     7     6
#> 4    11 0 [Wes… 1 [weibli…    14     3     6     6     5     7     7     6     6
#> 5    12 0 [Wes… 1 [weibli…    14     5     6     6     5     7     6     5     5
#> 6    14 0 [Wes… 0 [männli…    14     3     5     5     4     5     7     5     6
#> # … with 280 more rows, and 86 more variables: swk9 <dbl>, swk10 <dbl>,
#> #   swk11 <dbl>, swk12 <dbl>, swk13 <dbl>, swk14 <dbl>, swk15 <dbl>,
#> #   swk16 <dbl>, swk17 <dbl>, swk18 <dbl>, swk19 <dbl>, swk20 <dbl>,
#> #   swk21 <dbl>, swk22 <dbl>, swk23 <dbl>, swk24 <dbl>, swk25 <dbl>,
#> #   swk26 <dbl>, swk27 <dbl>, swk28 <dbl>, swk29 <dbl>, swk30 <dbl>,
#> #   swk31 <dbl>, swk32 <dbl>, swk33 <dbl>, swk34 <dbl>, swk35 <dbl>,
#> #   swk36 <dbl>, unteltern1 <dbl>, unteltern2 <dbl>, unteltern3 <dbl>,
#> #   unteltern4 <dbl>, unteltern5 <dbl>, unteltern6 <dbl>, untfreunde1 <dbl>,
#> #   untfreunde2 <dbl>, untfreunde3 <dbl>, untfreunde4 <dbl>, untfreunde5 <dbl>,
#> #   untfreunde6 <dbl>, leben1 <dbl>, leben2 <dbl>, leben3 <dbl>, leben4 <dbl>,
#> #   leben5 <dbl>, leben6 <dbl>, leben7 <dbl>, leben8 <dbl>, leben9 <dbl>,
#> #   leben10 <dbl>, leben11 <dbl>, stress1 <dbl>, stress2 <dbl>, stress3 <dbl>,
#> #   stress4 <dbl>, stress5 <dbl>, stress6 <dbl>, stress7 <dbl>, stress8 <dbl>,
#> #   stress9 <dbl>, stress10 <dbl>, stress11 <dbl>, stress12 <dbl>,
#> #   Deutschnote <dbl>, Mathenote <dbl+lbl>, Fremdsprachenote <dbl+lbl>,
#> #   Gesamtnote <dbl>, bildung_vater <dbl+lbl>, bildung_mutter <dbl+lbl>,
#> #   bildung_vater_binaer <dbl+lbl>, bildung_mutter_binaer <dbl+lbl>,
#> #   swk_neueslernen <dbl>, swk_lernregulation <dbl>, swk_motivation <dbl>,
#> #   swk_durchsetzung <dbl>, swk_sozialkomp <dbl>, swk_beziehung <dbl>,
#> #   unt_eltern <dbl>, unt_freunde <dbl>, leben_selbst <dbl>,
#> #   leben_familie <dbl>, leben_schule <dbl>, leben_freunde <dbl>,
#> #   leben_gesamt <dbl>, stress_somatisch <dbl>, stress_psychisch <dbl>

# Define factors
adolescents$ID <- as.factor(adolescents$ID)
adolescents$westost <- as_factor(adolescents$westost, levels = "default")
adolescents$geschlecht <- as_factor(adolescents$geschlecht, levels = "default")
adolescents$bildung_vater <- as_factor(adolescents$bildung_vater, levels = "default")
adolescents$bildung_mutter <- as_factor(adolescents$bildung_mutter, levels = "default")
adolescents$bildung_vater_binaer <- as_factor(adolescents$bildung_vater_binaer, levels = "default")
adolescents$bildung_mutter_binaer <- as_factor(adolescents$bildung_mutter_binaer, levels = "default")


# The factor levels of `bildung_vater` and `bildung_mutter` are quite long. Let's rename them: 
adolescents <- adolescents %>% 
  mutate(bildung_vater = fct_recode(bildung_vater,
                                    Hauptschule = "Hauptschulabschluss oder niedriger",
                                    Realschule = "Realschulabschluss (mittlere Reife)",
                                    Abitur = "Fachabitur, Abitur",
                                    Hochschule = "Fachhochschulabschluss, Universitätsabschluss"),
         bildung_mutter = fct_recode(bildung_mutter,
                                    Hauptschule = "Hauptschulabschluss oder niedriger",
                                    Realschule = "Realschulabschluss (mittlere Reife)",
                                    Abitur = "Fachabitur, Abitur",
                                    Hochschule = "Fachhochschulabschluss, Universitätsabschluss"))

# Save as Rdata file
save(adolescents, file = "data/adolescents.Rdata")

For showing the use of the summary() function we first get rid of the single item variables (keeping scale variables only) to avoid too much output:

adolescents_scales <- adolescents %>% select (-c(swk1:stress12))

summary(adolescents_scales)
#>        ID      westost       geschlecht      alter           SES       
#>  1      :  1   West:143   männlich:152   Min.   :13.0   Min.   :1.000  
#>  2      :  1   Ost :143   weiblich:134   1st Qu.:14.0   1st Qu.:3.000  
#>  10     :  1                             Median :15.0   Median :3.000  
#>  11     :  1                             Mean   :14.7   Mean   :3.391  
#>  12     :  1                             3rd Qu.:15.0   3rd Qu.:4.000  
#>  14     :  1                             Max.   :17.0   Max.   :5.000  
#>  (Other):280                                            NA's   :2      
#>     Deutsch          Mathe        Fremdsprache     Gesamtnote   
#>  Min.   :2.000   Min.   :2.000   Min.   :2.000   Min.   :3.000  
#>  1st Qu.:4.000   1st Qu.:4.000   1st Qu.:4.000   1st Qu.:4.000  
#>  Median :4.000   Median :4.000   Median :4.000   Median :4.500  
#>  Mean   :4.261   Mean   :4.173   Mean   :4.309   Mean   :4.462  
#>  3rd Qu.:5.000   3rd Qu.:5.000   3rd Qu.:5.000   3rd Qu.:5.000  
#>  Max.   :6.000   Max.   :6.000   Max.   :6.000   Max.   :6.000  
#>  NA's   :3       NA's   :2       NA's   :11      NA's   :6      
#>      bildung_vater     bildung_mutter bildung_vater_binaer
#>  Hauptschule:41    Hauptschule:43     niedrig:135         
#>  Realschule :94    Realschule :95     hoch   :134         
#>  Abitur     :41    Abitur     :50     NA's   : 17         
#>  Hochschule :93    Hochschule :84                         
#>  NA's       :17    NA's       :14                         
#>                                                           
#>                                                           
#>  bildung_mutter_binaer swk_neueslernen swk_lernregulation swk_motivation 
#>  niedrig:138           Min.   :2.429   Min.   :2.875      Min.   :3.200  
#>  hoch   :134           1st Qu.:4.571   1st Qu.:4.750      1st Qu.:5.000  
#>  NA's   : 14           Median :5.143   Median :5.250      Median :5.400  
#>                        Mean   :5.098   Mean   :5.246      Mean   :5.447  
#>                        3rd Qu.:5.571   3rd Qu.:5.875      3rd Qu.:6.000  
#>                        Max.   :7.000   Max.   :7.000      Max.   :7.000  
#>                                                                          
#>  swk_durchsetzung swk_sozialkomp  swk_beziehung     unt_eltern   
#>  Min.   :1.000    Min.   :1.571   Min.   :3.333   Min.   :1.500  
#>  1st Qu.:4.667    1st Qu.:4.857   1st Qu.:5.167   1st Qu.:5.333  
#>  Median :5.333    Median :5.381   Median :5.667   Median :6.167  
#>  Mean   :5.285    Mean   :5.280   Mean   :5.616   Mean   :5.872  
#>  3rd Qu.:6.000    3rd Qu.:5.857   3rd Qu.:6.167   3rd Qu.:6.667  
#>  Max.   :7.000    Max.   :7.000   Max.   :7.000   Max.   :7.000  
#>  NA's   :1                                        NA's   :1      
#>   unt_freunde     leben_selbst   leben_familie    leben_schule  
#>  Min.   :2.000   Min.   :1.000   Min.   :1.000   Min.   :1.333  
#>  1st Qu.:5.500   1st Qu.:5.000   1st Qu.:5.333   1st Qu.:4.000  
#>  Median :6.000   Median :5.500   Median :6.000   Median :4.667  
#>  Mean   :5.928   Mean   :5.433   Mean   :5.760   Mean   :4.670  
#>  3rd Qu.:6.667   3rd Qu.:6.000   3rd Qu.:6.667   3rd Qu.:5.333  
#>  Max.   :7.000   Max.   :7.000   Max.   :7.000   Max.   :7.000  
#>  NA's   :1       NA's   :1       NA's   :2       NA's   :1      
#>  leben_freunde    leben_gesamt   stress_somatisch stress_psychisch
#>  Min.   :2.000   Min.   :2.909   Min.   :1.000    Min.   :1.000   
#>  1st Qu.:5.500   1st Qu.:5.000   1st Qu.:2.167    1st Qu.:2.167   
#>  Median :6.000   Median :5.455   Median :3.000    Median :2.917   
#>  Mean   :5.898   Mean   :5.417   Mean   :3.020    Mean   :2.989   
#>  3rd Qu.:6.500   3rd Qu.:5.909   3rd Qu.:3.667    3rd Qu.:3.667   
#>  Max.   :7.000   Max.   :6.909   Max.   :7.000    Max.   :6.833   
#>  NA's   :1       NA's   :1       NA's   :3        NA's   :2       
#>    leben_dich    
#>  Min.   :0.0000  
#>  1st Qu.:0.0000  
#>  Median :1.0000  
#>  Mean   :0.5298  
#>  3rd Qu.:1.0000  
#>  Max.   :1.0000  
#>  NA's   :1

8.2 Descriptives for categorical data

8.2.1 Frequencies and contingency tables with `table()`

Using table() we get the frequencies of factor levels/groups. This is particularly useful when we are interested in the combined frequencies of several factors as in a contingency table.

# Frequencies for a single factor
table(adolescents$bildung_vater)
#> 
#> Hauptschule  Realschule      Abitur  Hochschule 
#>          41          94          41          93

# contingency table of two factors
table(adolescents$bildung_vater, adolescents$bildung_mutter)
#>              
#>               Hauptschule Realschule Abitur Hochschule
#>   Hauptschule          23         12      2          3
#>   Realschule           12         55     17         10
#>   Abitur                5         11     21          4
#>   Hochschule            3         16      9         65

With prop.table() we obtain proportions:

table(adolescents$bildung_vater, adolescents$bildung_mutter) %>% prop.table() %>% round(2)
#>              
#>               Hauptschule Realschule Abitur Hochschule
#>   Hauptschule        0.09       0.04   0.01       0.01
#>   Realschule         0.04       0.21   0.06       0.04
#>   Abitur             0.02       0.04   0.08       0.01
#>   Hochschule         0.01       0.06   0.03       0.24

# or in percentages
100*table(adolescents$bildung_vater, adolescents$bildung_mutter) %>% prop.table() %>% round(4)
#>              
#>               Hauptschule Realschule Abitur Hochschule
#>   Hauptschule        8.58       4.48   0.75       1.12
#>   Realschule         4.48      20.52   6.34       3.73
#>   Abitur             1.87       4.10   7.84       1.49
#>   Hochschule         1.12       5.97   3.36      24.25

In most cases we are not interested in the total proportions (which add up to 1 or 100% across the entire table), but in the row-conditional or column-conditional proportions. The former are obtained using the argument margin = 1, the latter with margin = 2 in the prop.table() function. Since there is no further argument we can also directly specify the value of the argument:

prop.table(1) for row-conditional proportions prop.table(2) for column-conditional proportions

prop.table() uses as first argument a table() object. The unnamed margin argument can thus only be used in combination with the pipe %>% operator.

# Row-wise percentages
100*table(adolescents$bildung_vater, adolescents$bildung_mutter) %>% prop.table(1) %>% round(4)
#>              
#>               Hauptschule Realschule Abitur Hochschule
#>   Hauptschule       57.50      30.00   5.00       7.50
#>   Realschule        12.77      58.51  18.09      10.64
#>   Abitur            12.20      26.83  51.22       9.76
#>   Hochschule         3.23      17.20   9.68      69.89

# Column-wise percentages
100*table(adolescents$bildung_vater, adolescents$bildung_mutter) %>% prop.table(2) %>% round(4)
#>              
#>               Hauptschule Realschule Abitur Hochschule
#>   Hauptschule       53.49      12.77   4.08       3.66
#>   Realschule        27.91      58.51  34.69      12.20
#>   Abitur            11.63      11.70  42.86       4.88
#>   Hochschule         6.98      17.02  18.37      79.27

8.2.2 Mode

There is no separate function for calculating the mode (category with the highest frequency) of a categorical variable in R. Either read the mode from the frequency table or help yourself with:

names(which.max(table(adolescents$bildung_vater)))
#> [1] "Realschule"

8.2.3 Descriptive statistics for categorical data with `jmv`

jamovi offers great functionality for statistical data analysis. It also offers a connection to R via the jmv package. When running the jamovi program in syntax mode the provided syntax can be copied directly into R, and only the data argument has to be changed from its default. For our example data = data is changed to data = adolescents.

install.packages("jmv")

Let’s get some descriptive statistics for our variables bildung_vater und bildung_mutter using the jmv function descriptives():

library(jmv)
descriptives(
  data = adolescents,
  vars = vars(bildung_vater, bildung_mutter),
  freq = TRUE)
#> 
#>  DESCRIPTIVES
#> 
#>  Descriptives                                   
#>  ────────────────────────────────────────────── 
#>               bildung_vater    bildung_mutter   
#>  ────────────────────────────────────────────── 
#>    N                    269               272   
#>    Missing               17                14   
#>    Mean                                         
#>    Median                                       
#>    Minimum                                      
#>    Maximum                                      
#>  ────────────────────────────────────────────── 
#> 
#> 
#>  FREQUENCIES
#> 
#>  Frequencies of bildung_vater                            
#>  ─────────────────────────────────────────────────────── 
#>    Levels         Counts    % of Total    Cumulative %   
#>  ─────────────────────────────────────────────────────── 
#>    Hauptschule        41          15.2            15.2   
#>    Realschule         94          34.9            50.2   
#>    Abitur             41          15.2            65.4   
#>    Hochschule         93          34.6           100.0   
#>  ─────────────────────────────────────────────────────── 
#> 
#> 
#>  Frequencies of bildung_mutter                           
#>  ─────────────────────────────────────────────────────── 
#>    Levels         Counts    % of Total    Cumulative %   
#>  ─────────────────────────────────────────────────────── 
#>    Hauptschule        43          15.8            15.8   
#>    Realschule         95          34.9            50.7   
#>    Abitur             50          18.4            69.1   
#>    Hochschule         84          30.9           100.0   
#>  ───────────────────────────────────────────────────────

Contingency tables can be obtained using contTables(). This function comes with a $\chi^2$-Test for independence and has a number of further options. Here, we include the effect size Cramer’s V as well as Goodman-Kruskals $\gamma$ (measure of association for categorical ordinal variables).

contTables(
  formula = ~ bildung_vater:bildung_mutter,
  data = adolescents,
  phiCra = TRUE, # Phi and Cramer's V (Phi only for 2x2 tables)
  gamma = TRUE, # Goodman-Kruskals Gamma
  pcRow = TRUE, # row percentages
  pcCol = TRUE, # column percentages
  pcTot = TRUE) # total percentages
#> 
#>  CONTINGENCY TABLES
#> 
#>  Contingency Tables                                                                                 
#>  ────────────────────────────────────────────────────────────────────────────────────────────────── 
#>    bildung_vater                       Hauptschule    Realschule    Abitur    Hochschule    Total   
#>  ────────────────────────────────────────────────────────────────────────────────────────────────── 
#>    Hauptschule      Observed                    23            12         2             3       40   
#>                     % within row              57.5          30.0       5.0           7.5    100.0   
#>                     % within column           53.5          12.8       4.1           3.7     14.9   
#>                     % of total                 8.6           4.5       0.7           1.1     14.9   
#>                                                                                                     
#>    Realschule       Observed                    12            55        17            10       94   
#>                     % within row              12.8          58.5      18.1          10.6    100.0   
#>                     % within column           27.9          58.5      34.7          12.2     35.1   
#>                     % of total                 4.5          20.5       6.3           3.7     35.1   
#>                                                                                                     
#>    Abitur           Observed                     5            11        21             4       41   
#>                     % within row              12.2          26.8      51.2           9.8    100.0   
#>                     % within column           11.6          11.7      42.9           4.9     15.3   
#>                     % of total                 1.9           4.1       7.8           1.5     15.3   
#>                                                                                                     
#>    Hochschule       Observed                     3            16         9            65       93   
#>                     % within row               3.2          17.2       9.7          69.9    100.0   
#>                     % within column            7.0          17.0      18.4          79.3     34.7   
#>                     % of total                 1.1           6.0       3.4          24.3     34.7   
#>                                                                                                     
#>    Total            Observed                    43            94        49            82      268   
#>                     % within row              16.0          35.1      18.3          30.6    100.0   
#>                     % within column          100.0         100.0     100.0         100.0    100.0   
#>                     % of total                16.0          35.1      18.3          30.6    100.0   
#>  ────────────────────────────────────────────────────────────────────────────────────────────────── 
#> 
#> 
#>  χ² Tests                        
#>  ─────────────────────────────── 
#>          Value    df    p        
#>  ─────────────────────────────── 
#>    χ²      182     9    < .001   
#>    N       268                   
#>  ─────────────────────────────── 
#> 
#> 
#>  Nominal                      
#>  ──────────────────────────── 
#>                       Value   
#>  ──────────────────────────── 
#>    Phi-coefficient      NaN   
#>    Cramer's V         0.475   
#>  ──────────────────────────── 
#> 
#> 
#>  Gamma                                         
#>  ───────────────────────────────────────────── 
#>    Gamma    Standard Error    Lower    Upper   
#>  ───────────────────────────────────────────── 
#>    0.693            0.0489    0.597    0.789   
#>  ─────────────────────────────────────────────

Reduced version with only row percentages:

contTables(
    formula = ~ bildung_vater:bildung_mutter,
    data = adolescents,
    chiSq = FALSE,
    obs = FALSE,
    pcRow = TRUE)
#> 
#>  CONTINGENCY TABLES
#> 
#>  Contingency Tables                                                                              
#>  ─────────────────────────────────────────────────────────────────────────────────────────────── 
#>    bildung_vater                    Hauptschule    Realschule    Abitur    Hochschule    Total   
#>  ─────────────────────────────────────────────────────────────────────────────────────────────── 
#>    Hauptschule      % within row           57.5          30.0       5.0           7.5    100.0   
#>    Realschule       % within row           12.8          58.5      18.1          10.6    100.0   
#>    Abitur           % within row           12.2          26.8      51.2           9.8    100.0   
#>    Hochschule       % within row            3.2          17.2       9.7          69.9    100.0   
#>    Total            % within row           16.0          35.1      18.3          30.6    100.0   
#>  ─────────────────────────────────────────────────────────────────────────────────────────────── 
#> 
#> 
#>  χ² Tests       
#>  ────────────── 
#>         Value   
#>  ────────────── 
#>    N      268   
#>  ──────────────

8.3 Descriptives for ordinal and metric data

8.3.1 Descriptive statistics using base R functions

We already know most of the functions needed for describing the distributional characteristics of ordinal and metric variables:

mean(x, na.rm = FALSE)    Arithmetic mean
sd(x, na.rm = FALSE)       (Sample) Standard Deviation 
var(x, na.rm = FALSE)     (Sample) Variance

median(x)                  Median
quantile(x, probs, type)  Quantile of x.  probs: vector with probabilities
min(x)                    Minimum value of x
max(x)                    Maximum value of x
range(x)                   x_min and x_max

Examples

Let’s create a vector with sixteen numbers between 1 and 9 and sort it:

x <- c(4, 7, 9, 2, 3, 6, 6, 4, 1, 2, 8, 5, 5, 7, 3, 2)
x <- sort(x)
x
#>  [1] 1 2 2 2 3 3 4 4 5 5 6 6 7 7 8 9

median(x)
#> [1] 4.5
quantile(x, c(.25, .75))
#>  25%  75% 
#> 2.75 6.25

By the way, quantile() has nine different methods (type = 1-9) to compute the quantiles of a distribution (help('quantile')). The default method is type = 7 while the (simpler) method tought in most textbooks is type = 2:

quantile(x, c(.25, .75), type = 2)
#> 25% 75% 
#> 2.5 6.5

The tapply() function can be used to display selected descriptive statistics for the combined levels of several factors. This may be useful for the presentation of means in 2-factorial ANOVAS. The generic form is: tapply(df$var1, list(factor1 = df$factor1, factor2 = df$factor2), statistic, na.rm = TRUE).

statistic stands for the desired statistic (only one at a time), e.g. mean, median, sum, sd.

Let’s display the means of the variable unt_eltern (perceived support by parents) for the combined factors geschlecht and westost:

tapply(adolescents$unt_eltern, 
       list(Geschlecht = adolescents$geschlecht, Region = adolescents$westost), 
       mean, na.rm = TRUE) %>%
    round(2)
#>           Region
#> Geschlecht West  Ost
#>   männlich 5.79 6.13
#>   weiblich 5.74 5.83

We can do the same using dplyr:

adolescents %>% 
    group_by(geschlecht, westost) %>% 
    summarise(mean = mean(unt_eltern, na.rm = TRUE))
#> # A tibble: 4 x 3
#> # Groups:   geschlecht [2]
#>   geschlecht westost  mean
#>   <fct>      <fct>   <dbl>
#> 1 männlich   West     5.79
#> 2 männlich   Ost      6.13
#> 3 weiblich   West     5.74
#> 4 weiblich   Ost      5.83

# Caution: If we use drop_na() to remove missing values, all cases with missings 
# anywhere in the dataset will be removed. In this case first define a new dataset 
# including only the relevant variables!

adolescents %>% 
    group_by(geschlecht, westost) %>% 
    drop_na() %>% 
    summarise(mean = mean(unt_eltern))
#> # A tibble: 4 x 3
#> # Groups:   geschlecht [2]
#>   geschlecht westost  mean
#>   <fct>      <fct>   <dbl>
#> 1 männlich   West     5.71
#> 2 männlich   Ost      6.08
#> 3 weiblich   West     5.70
#> 4 weiblich   Ost      5.83

8.3.2 Descriptive statistics using `jmv`

In jamovi (jmv) we use descriptives() again. Let’s look at unt_eltern and get descriptive statistics (and plots) for boys and girls separately:

descriptives(
  formula = unt_eltern ~ geschlecht,
  data = adolescents,
  hist = TRUE,
  box = TRUE,
  dot = TRUE,
  median = FALSE,
  sd = TRUE,
  variance = TRUE,
  se = TRUE,
  skew = TRUE,
  kurt = TRUE,
  quart = TRUE)
#> 
#>  DESCRIPTIVES
#> 
#>  Descriptives                                        
#>  ─────────────────────────────────────────────────── 
#>                           geschlecht    unt_eltern   
#>  ─────────────────────────────────────────────────── 
#>    N                      männlich             151   
#>                           weiblich             134   
#>    Missing                männlich               1   
#>                           weiblich               0   
#>    Mean                   männlich            5.94   
#>                           weiblich            5.79   
#>    Std. error mean        männlich          0.0707   
#>                           weiblich          0.0970   
#>    Standard deviation     männlich           0.869   
#>                           weiblich            1.12   
#>    Variance               männlich           0.755   
#>                           weiblich            1.26   
#>    Minimum                männlich            3.33   
#>                           weiblich            1.50   
#>    Maximum                männlich            7.00   
#>                           weiblich            7.00   
#>    Skewness               männlich           -1.20   
#>                           weiblich           -1.33   
#>    Std. error skewness    männlich           0.197   
#>                           weiblich           0.209   
#>    Kurtosis               männlich            1.09   
#>                           weiblich            2.02   
#>    Std. error kurtosis    männlich           0.392   
#>                           weiblich           0.416   
#>    25th percentile        männlich            5.50   
#>                           weiblich            5.17   
#>    50th percentile        männlich            6.17   
#>                           weiblich            6.17   
#>    75th percentile        männlich            6.50   
#>                           weiblich            6.67   
#>  ───────────────────────────────────────────────────

Let’s look at the perceived support from friends (unt_freunde) for comparison, and add a second factor:

descriptives(
  formula = unt_freunde ~ geschlecht:westost,
  data = adolescents,
  min = FALSE,
  max = FALSE,
  missing = FALSE,
  sd = TRUE,
  bar = TRUE,
  box = TRUE)
#> 
#>  DESCRIPTIVES
#> 
#>  Descriptives                                                   
#>  ────────────────────────────────────────────────────────────── 
#>                          geschlecht    westost    unt_freunde   
#>  ────────────────────────────────────────────────────────────── 
#>    N                     männlich      West                83   
#>                                        Ost                 69   
#>                          weiblich      West                59   
#>                                        Ost                 74   
#>    Mean                  männlich      West              5.49   
#>                                        Ost               5.70   
#>                          weiblich      West              6.27   
#>                                        Ost               6.36   
#>    Median                männlich      West              5.67   
#>                                        Ost               5.67   
#>                          weiblich      West              6.50   
#>                                        Ost               6.50   
#>    Standard deviation    männlich      West              1.13   
#>                                        Ost              0.831   
#>                          weiblich      West             0.926   
#>                                        Ost              0.557   
#>  ──────────────────────────────────────────────────────────────

8.3.3 Descriptive statistics for metric variables using the `psych` package

install.packages("psych")

The psych package provides some nice functionality for data analysis in psychology. For example, it has a useful function for descriptive statistics for numerical variables: describe(). The generic form is: describe(x, na.rm = TRUE, interp = FALSE, skew = TRUE, ranges = TRUE, trim = 0.1, type = 3, check = TRUE).

x stands for the data frame or the variable to be analyzed (df$variable). The defaults are:

interp = FALSE refers to the definition of the median (interp = TRUE averages adjacent values for an even n)
skew = TRUE displays skewness, kurtosis and the trimmed mean
ranges = TRUE displays min, max, the range, the median, and the mad (median average deviation, an alternative measure of dispersion)
trim = 0.1 refers to the proportion of the distribution that is trimmed at the lower and upper ends for the trimmed mean (default trimming is 10% on both sides, thus the trimmed mean is computed from the middle 80% of the data)
type = 3 refers to the method of computing skewness and kurtosis (more here: ?psych::describe)
check = TRUE refers to checking for non-numeric variables in the dataset (for which describe has no use); if check = FALSE and there are non-numeric variables, an error message is displayed.

library(psych)
adolescents %>% 
  select(unt_eltern, unt_freunde) %>% 
  describe()
#>             vars   n mean   sd median trimmed  mad min max range  skew kurtosis
#> unt_eltern     1 285 5.87 1.00   6.17    6.01 0.74 1.5   7   5.5 -1.35     2.05
#> unt_freunde    2 285 5.93 0.96   6.00    6.06 0.99 2.0   7   5.0 -1.25     1.85
#>               se
#> unt_eltern  0.06
#> unt_freunde 0.06

# Only mean, sd, and se 
adolescents %>% 
  select(unt_eltern, unt_freunde) %>% 
  describe(skew = FALSE, range = FALSE)
#>             vars   n mean   sd   se
#> unt_eltern     1 285 5.87 1.00 0.06
#> unt_freunde    2 285 5.93 0.96 0.06

8.4 Correlation analysis

8.4.1 Traditional ways of doing correlation analysis in R

The generic form for calculating a correlation matrix using base R is as follows: cor(df[,c("var1", "var2", "var3")], methdod = c("pearson", "kendall", "spearman"), use = "complete.obs"). We can thus compute Pearson correlations (default) or - for ordinal data - Kendall’s $\tau$ or Spearman’s $\rho$ rank correlations.

Let’s try it using four scales measuring adolescents’ academic as well as social self-efficacy:

adolescents %>%
    select(swk_neueslernen, swk_motivation, swk_durchsetzung, swk_sozialkomp) %>% 
    cor(method = "pearson", use = "complete.obs") %>%
    round(3)
#>                  swk_neueslernen swk_motivation swk_durchsetzung swk_sozialkomp
#> swk_neueslernen            1.000          0.640            0.351          0.363
#> swk_motivation             0.640          1.000            0.415          0.407
#> swk_durchsetzung           0.351          0.415            1.000          0.592
#> swk_sozialkomp             0.363          0.407            0.592          1.000

For covariances instead of correlations use cov() with the same arguments as cor():

adolescents %>%
    select(swk_neueslernen, swk_motivation, swk_durchsetzung, swk_sozialkomp) %>% 
    cov(method = "pearson", use = "complete.obs") %>%
    round(3)
#>                  swk_neueslernen swk_motivation swk_durchsetzung swk_sozialkomp
#> swk_neueslernen            0.595          0.380            0.266          0.220
#> swk_motivation             0.380          0.591            0.314          0.246
#> swk_durchsetzung           0.266          0.314            0.970          0.458
#> swk_sozialkomp             0.220          0.246            0.458          0.617

A good way to display a scatterplot matrix for a number of variables is the function scatterplotMatrix from the car package. In addition to displaying all combinations of bivariate scatterplots it is possible to get different univariate plots in the diagonal: diagonal = list(method = c("density", "boxplot", "histogram", "oned", "qqplot", "none")). In addition, smooth = TRUE (default) adds a non-linear LOWESS line (LOcally WEighted Scatterplot Smoother) to assess the (non-)linearity of the correlation.

install.packages("car")

library(car)
scatterplotMatrix(~ swk_neueslernen + swk_motivation + 
                    swk_durchsetzung + swk_sozialkomp, 
                  smooth = TRUE, # default
                  diagonal = list(method = "histogram"), # if TRUE defaults to "density"
                  data = adolescents,
                  regLine = list(col = "red"), # if omitted defaults to point colour
                  use = "complete.obs") # default; alternative: "pairwise.complete.obs"

The function cor.test() returns a significance test for one specific correlation. The generic form is: cor.test (x, y, alternative = c ("two.sided", "less", "greater"), method = c ("pearson", "kendall", "spearman"), conf. level = 0.95).

Thus, in addition to the standard two-sided hypothesis test (alternative = "two.sided"), directional alternative hypotheses are possible: (alternative = "less") for a hypothesized negative correlation (“less than 0”) and (alternative = "greater") for a hypothesized positive correlation (“greater than 0”).

cor.test(adolescents$swk_neueslernen, adolescents$swk_sozialkomp, 
         alternative = "greater",
         method = "pearson")
#> 
#>  Pearson's product-moment correlation
#> 
#> data:  adolescents$swk_neueslernen and adolescents$swk_sozialkomp
#> t = 6.5528, df = 284, p-value = 1.325e-10
#> alternative hypothesis: true correlation is greater than 0
#> 95 percent confidence interval:
#>  0.2746381 1.0000000
#> sample estimates:
#>       cor 
#> 0.3624033

The function corr.test() from the psych package includes $p$-values for a correlation matrix. However, no directional alternative hypotheses are possible. Note that the two-sided $p$-values are in the lower triangular matrix and the $p$-values adjusted according to Holm are in the upper triangular matrix. The Holm procedure corresponds to a sequential alpha correction that is less conservative than the Bonferroni procedure, but nevertheless controls the family-wise error. In the diagonals $p$-values for the variances of the variables are displayed.

In the Holm procedure (Bonferroni-Holm method), the smallest $p$-value is multiplied by the number of tests ($p(p-1)/2$; $p$ = number of variables in the correlation matrix), the second smallest by the number of tests minus 1, etc. until the largest $p$-value that is no longer corrected (i.e., multiplied by $p-p+1 = 1$). However, the $p$-value is only multiplied by the corresponding number of tests if the resulting adjusted $p$-value is not smaller than the previously adjusted (smaller) $p$-value. If this is the case, the test is assigned the same $p$-value as its predecessor. This prevents that a smaller effect gets a smaller $p$-value after the adjustment than a larger effect, thereby preventing that a smaller effect may reach significance while the larger effect does not.

Let’s try it for the four self-efficacy scales, but use only the first 25 subjects (lines) to make sure we obtain differences between the adjusted and unadjusted $p$-values (because with the full sample all correlations are significant beyond $p<.001$)

library(psych)
adolescents[1:25,] %>% 
  select(swk_neueslernen, swk_motivation, swk_durchsetzung, swk_sozialkomp) %>% 
    corr.test()
#> Call:corr.test(x = .)
#> Correlation matrix 
#>                  swk_neueslernen swk_motivation swk_durchsetzung swk_sozialkomp
#> swk_neueslernen             1.00           0.57             0.22           0.19
#> swk_motivation              0.57           1.00             0.52           0.50
#> swk_durchsetzung            0.22           0.52             1.00           0.90
#> swk_sozialkomp              0.19           0.50             0.90           1.00
#> Sample Size 
#> [1] 25
#> Probability values (Entries above the diagonal are adjusted for multiple tests.) 
#>                  swk_neueslernen swk_motivation swk_durchsetzung swk_sozialkomp
#> swk_neueslernen             0.00           0.02             0.57           0.57
#> swk_motivation              0.00           0.00             0.03           0.03
#> swk_durchsetzung            0.29           0.01             0.00           0.00
#> swk_sozialkomp              0.36           0.01             0.00           0.00
#> 
#>  To see confidence intervals of the correlations, print with the short=FALSE option

8.4.2 Partial correlations with `RcmdrMisc`

Partial correlations are obtained using partial.cor(df[, c("var1", "var2", "var3")], tests = TRUE) from the RcmdrMisc package. Important: Partialling for the corresponding bivariate correlation is done for all other variables in the data set!

install.packages("RcmdrMisc")

library(RcmdrMisc)
# Normal correlation matrix for comparison (zero-order correlations)
cor(adolescents[1:50,c("Gesamtnote", "swk_neueslernen", "swk_motivation",
                       "swk_sozialkomp")],
    use = "complete.obs") %>% round(3)
#>                 Gesamtnote swk_neueslernen swk_motivation swk_sozialkomp
#> Gesamtnote           1.000           0.396          0.292          0.139
#> swk_neueslernen      0.396           1.000          0.499          0.300
#> swk_motivation       0.292           0.499          1.000          0.600
#> swk_sozialkomp       0.139           0.300          0.600          1.000

# Partial correlation matrix with three variables (first-order partial correlations)
partial.cor(adolescents[1:50,c("Gesamtnote", "swk_neueslernen", "swk_motivation")],
            tests = TRUE)
#> 
#>  Partial correlations:
#>                 Gesamtnote swk_neueslernen swk_motivation
#> Gesamtnote         0.00000         0.30214        0.11811
#> swk_neueslernen    0.30214         0.00000        0.43686
#> swk_motivation     0.11811         0.43686        0.00000
#> 
#>  Number of observations: 48 
#> 
#>  Pairwise two-sided p-values:
#>                 Gesamtnote swk_neueslernen swk_motivation
#> Gesamtnote                 0.0390          0.4291        
#> swk_neueslernen 0.0390                     0.0021        
#> swk_motivation  0.4291     0.0021                        
#> 
#>  Adjusted p-values (Holm's method)
#>                 Gesamtnote swk_neueslernen swk_motivation
#> Gesamtnote                 0.0780          0.4291        
#> swk_neueslernen 0.0780                     0.0064        
#> swk_motivation  0.4291     0.0064

# Partial correlation matrix with four variables (second-order partial correlations)
partial.cor(adolescents[1:50,c("Gesamtnote", "swk_neueslernen", "swk_motivation",
                               "swk_sozialkomp")], 
            tests = TRUE)
#> 
#>  Partial correlations:
#>                 Gesamtnote swk_neueslernen swk_motivation swk_sozialkomp
#> Gesamtnote         0.00000         0.30249        0.12582       -0.04944
#> swk_neueslernen    0.30249         0.00000        0.35754        0.01537
#> swk_motivation     0.12582         0.35754        0.00000        0.54607
#> swk_sozialkomp    -0.04944         0.01537        0.54607        0.00000
#> 
#>  Number of observations: 48 
#> 
#>  Pairwise two-sided p-values:
#>                 Gesamtnote swk_neueslernen swk_motivation swk_sozialkomp
#> Gesamtnote                 0.0410          0.4047         0.7442        
#> swk_neueslernen 0.0410                     0.0147         0.9193        
#> swk_motivation  0.4047     0.0147                         <.0001        
#> swk_sozialkomp  0.7442     0.9193          <.0001                       
#> 
#>  Adjusted p-values (Holm's method)
#>                 Gesamtnote swk_neueslernen swk_motivation swk_sozialkomp
#> Gesamtnote                 0.1641          1.0000         1.0000        
#> swk_neueslernen 0.1641                     0.0735         1.0000        
#> swk_motivation  1.0000     0.0735                         0.0005        
#> swk_sozialkomp  1.0000     1.0000          0.0005

8.4.3 Correlation analysis with `jmv`

Almost all we have done above with the more traditional R functions we can also do in jmv using the corrMatrix() function. An overview of the available arguments is provided by ?jmv::corrMatrix:

corrMatrix(data, vars, pearson = TRUE, spearman = FALSE, kendall = FALSE,
  sig = TRUE, flag = FALSE, ci = FALSE, ciWidth = 95, plots = FALSE,
  plotDens = FALSE, plotStats = FALSE, hypothesis = "corr")

Arguments

data        the data as a data frame
vars         a vector of strings naming the variables to correlate in data
pearson     TRUE (default) or FALSE, provide Pearson's R
spearman    TRUE or FALSE (default), provide Spearman's rho
kendall     TRUE or FALSE (default), provide Kendall's tau-b
sig         TRUE (default) or FALSE, provide significance levels
flag         TRUE or FALSE (default), flag significant correlations
ci          TRUE or FALSE (default), provide confidence intervals
ciWidth     a number between 50 and 99.9 (default: 95), the width of CI
plots        TRUE or FALSE (default), provide a correlation matrix plot
plotDens    TRUE or FALSE (default), provide densities in the correlation matrix plot
plotStats    TRUE or FALSE (default), provide statistics in the correlation matrix plot
hypothesis  one of 'corr' (default), 'pos', 'neg' specifying the alernative hypothesis; 
            correlated, correlated positively, correlated negatively respectively.

Let’s try some options with the four self-efficacy variables:

library(jmv)
corrMatrix(
  data = adolescents,
  vars = vars(swk_neueslernen, 
              swk_motivation, 
              swk_durchsetzung, 
              swk_sozialkomp),
  pearson = FALSE,
  spearman = TRUE,
  kendall = TRUE,
  n = TRUE,
  hypothesis = "pos")
#> 
#>  CORRELATION MATRIX
#> 
#>  Correlation Matrix                                                                                                 
#>  ────────────────────────────────────────────────────────────────────────────────────────────────────────────────── 
#>                                           swk_neueslernen    swk_motivation    swk_durchsetzung    swk_sozialkomp   
#>  ────────────────────────────────────────────────────────────────────────────────────────────────────────────────── 
#>    swk_neueslernen     Spearman's rho                   —                                                           
#>                        p-value                          —                                                           
#>                        Kendall's Tau B                  —                                                           
#>                        p-value                          —                                                           
#>                        N                                —                                                           
#>                                                                                                                     
#>    swk_motivation      Spearman's rho               0.619                 —                                         
#>                        p-value                     < .001                 —                                         
#>                        Kendall's Tau B              0.471                 —                                         
#>                        p-value                     < .001                 —                                         
#>                        N                              286                 —                                         
#>                                                                                                                     
#>    swk_durchsetzung    Spearman's rho               0.377             0.463                   —                     
#>                        p-value                     < .001            < .001                   —                     
#>                        Kendall's Tau B              0.278             0.347                   —                     
#>                        p-value                     < .001            < .001                   —                     
#>                        N                              285               285                   —                     
#>                                                                                                                     
#>    swk_sozialkomp      Spearman's rho               0.378             0.413               0.539                 —   
#>                        p-value                     < .001            < .001              < .001                 —   
#>                        Kendall's Tau B              0.272             0.304               0.411                 —   
#>                        p-value                     < .001            < .001              < .001                 —   
#>                        N                              286               286                 285                 —   
#>  ────────────────────────────────────────────────────────────────────────────────────────────────────────────────── 
#>    Note. Hₐ is positive correlation

8.5 Reliability analysis

8.5.1 Reliability analysis using the `psych` package

For the reliability analysis we create a reduced data set containing only the items for the scale swk_neueslernen: swk1, swk7, swk14, swk16, swk26, swk27, and swk31.

swk_neueslernen <- adolescents %>% 
    select("swk1", "swk7", "swk14", "swk16", "swk26", "swk27", "swk31")

swk_neueslernen
#> # A tibble: 286 x 7
#>    swk1  swk7 swk14 swk16 swk26 swk27 swk31
#>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1     2     5     5     4     6     6     4
#> 2     4     5     5     7     5     4     5
#> 3     5     7     3     7     7     3     3
#> 4     3     6     4     6     4     4     5
#> 5     5     5     4     5     7     6    NA
#> 6     3     5     5     7     4     5     5
#> # … with 280 more rows

Cronbach’s alpha

Cronbach’s $\alpha$ can be computed using alpha():

psych::alpha(swk_neueslernen)
#> 
#> Reliability analysis   
#> Call: psych::alpha(x = swk_neueslernen)
#> 
#>   raw_alpha std.alpha G6(smc) average_r S/N   ase mean   sd median_r
#>       0.63      0.63    0.65      0.19 1.7 0.033  5.1 0.77     0.18
#> 
#>  lower alpha upper     95% confidence boundaries
#> 0.56 0.63 0.7 
#> 
#>  Reliability if an item is dropped:
#>       raw_alpha std.alpha G6(smc) average_r S/N alpha se var.r med.r
#> swk1       0.59      0.59    0.60      0.19 1.4    0.037 0.022 0.168
#> swk7       0.58      0.58    0.57      0.19 1.4    0.039 0.013 0.192
#> swk14      0.57      0.56    0.57      0.18 1.3    0.039 0.022 0.097
#> swk16      0.64      0.64    0.65      0.23 1.8    0.032 0.020 0.192
#> swk26      0.61      0.61    0.63      0.21 1.6    0.035 0.024 0.180
#> swk27      0.58      0.58    0.59      0.19 1.4    0.039 0.021 0.168
#> swk31      0.56      0.57    0.56      0.18 1.3    0.040 0.015 0.180
#> 
#>  Item statistics 
#>         n raw.r std.r r.cor r.drop mean  sd
#> swk1  286  0.53  0.56  0.45   0.34  5.0 1.2
#> swk7  284  0.61  0.58  0.52   0.39  5.0 1.5
#> swk14 283  0.61  0.62  0.54   0.41  4.9 1.4
#> swk16 285  0.43  0.43  0.24   0.18  5.8 1.4
#> swk26 285  0.47  0.50  0.34   0.27  5.3 1.2
#> swk27 281  0.61  0.58  0.48   0.38  4.9 1.5
#> swk31 283  0.63  0.61  0.56   0.43  4.7 1.4
#> 
#> Non missing response frequency for each item
#>          1    2    3    4    5    6    7 miss
#> swk1  0.01 0.01 0.07 0.20 0.33 0.30 0.08 0.00
#> swk7  0.02 0.04 0.12 0.13 0.29 0.24 0.16 0.01
#> swk14 0.03 0.05 0.07 0.19 0.26 0.32 0.10 0.01
#> swk16 0.02 0.02 0.03 0.09 0.15 0.32 0.36 0.00
#> swk26 0.01 0.01 0.05 0.14 0.31 0.33 0.15 0.00
#> swk27 0.04 0.03 0.09 0.21 0.23 0.26 0.15 0.02
#> swk31 0.03 0.03 0.13 0.22 0.26 0.23 0.10 0.01

Explanation of the output:

raw_alpha      # (Normal) Alpha based on covariances
std.alpha    # Standardized Alpha based on correlations
G6(smc)      # Guttman's Lambda 6 reliability
average_r    # Average inter-item-correlation
S/N          # Signal-to-noise ratio
ase/alpha se # Standard error of alpha
median_r       # Median interitem correlation
alpha.drop   # Statistics in case the respective item is deleted from the scale
item.stats   # Item-total correlations (for raw and corrected scores) 

raw.r          # Correlation of each item with the total score, not corrected for item overlap
std.r          # Correlation of each item with the total score (not corrected for item overlap)                if the items were all standardized
r.cor          # Item whole correlation corrected for item overlap and scale reliability
r.drop       # Item whole correlation for this item against the scale without this item

Optionally, a “key vector” can be defined. This is useful when some items have been formulated in the opposite direction and therefore should actually be recoded before doing the reliability analysis. The “key vector” indicates which items should be recoded (here: none!)

The following command is (erroneously) specified with recoding keys on variables number 2 and 5:

psych::alpha(swk_neueslernen, keys = c(1, -1, 1, 1, -1, 1, 1))
#> 
#> Reliability analysis   
#> Call: psych::alpha(x = swk_neueslernen, keys = c(1, -1, 1, 1, -1, 1, 
#>     1))
#> 
#>   raw_alpha std.alpha G6(smc) average_r    S/N  ase mean   sd median_r
#>     -0.033    -0.014    0.22    -0.002 -0.014 0.09  4.4 0.51    0.029
#> 
#>  lower alpha upper     95% confidence boundaries
#> -0.21 -0.03 0.14 
#> 
#>  Reliability if an item is dropped:
#>        raw_alpha std.alpha G6(smc) average_r    S/N alpha se var.r  med.r
#> swk1      -0.320    -0.338  -0.014   -0.0439 -0.252    0.117 0.060 -0.045
#> swk7-      0.345     0.316   0.389    0.0716  0.463    0.057 0.045  0.083
#> swk14     -0.425    -0.377  -0.060   -0.0478 -0.274    0.129 0.054 -0.045
#> swk16     -0.256    -0.225   0.106   -0.0315 -0.184    0.111 0.075 -0.083
#> swk26-     0.190     0.241   0.388    0.0502  0.317    0.072 0.068  0.083
#> swk27     -0.041    -0.011   0.204   -0.0018 -0.011    0.093 0.059 -0.045
#> swk31     -0.082    -0.069   0.123   -0.0109 -0.065    0.096 0.049 -0.045
#> 
#>  Item statistics 
#>          n  raw.r  std.r r.cor   r.drop mean  sd
#> swk1   286  0.582  0.615  0.67  0.29902  5.0 1.2
#> swk7-  284 -0.027 -0.045 -0.62 -0.41024  3.0 1.5
#> swk14  283  0.646  0.637  0.76  0.31564  4.9 1.4
#> swk16  285  0.549  0.544  0.42  0.18961  5.8 1.4
#> swk26- 285  0.031  0.078 -0.54 -0.29163  2.7 1.2
#> swk27  281  0.407  0.375  0.17 -0.00063  4.9 1.5
#> swk31  283  0.431  0.426  0.38  0.03805  4.7 1.4
#> 
#> Non missing response frequency for each item
#>          1    2    3    4    5    6    7 miss
#> swk1  0.01 0.01 0.07 0.20 0.33 0.30 0.08 0.00
#> swk7  0.02 0.04 0.12 0.13 0.29 0.24 0.16 0.01
#> swk14 0.03 0.05 0.07 0.19 0.26 0.32 0.10 0.01
#> swk16 0.02 0.02 0.03 0.09 0.15 0.32 0.36 0.00
#> swk26 0.01 0.01 0.05 0.14 0.31 0.33 0.15 0.00
#> swk27 0.04 0.03 0.09 0.21 0.23 0.26 0.15 0.02
#> swk31 0.03 0.03 0.13 0.22 0.26 0.23 0.10 0.01

Via the summary() function it is also possible obtain only alpha. To do this, the result of the analysis must be saved in an object first:

alpha_neueslernen <- psych::alpha(swk_neueslernen)
summary(swk_neueslernen)
#>       swk1            swk7           swk14           swk16      
#>  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
#>  1st Qu.:4.000   1st Qu.:4.000   1st Qu.:4.000   1st Qu.:5.000  
#>  Median :5.000   Median :5.000   Median :5.000   Median :6.000  
#>  Mean   :5.045   Mean   :4.972   Mean   :4.936   Mean   :5.751  
#>  3rd Qu.:6.000   3rd Qu.:6.000   3rd Qu.:6.000   3rd Qu.:7.000  
#>  Max.   :7.000   Max.   :7.000   Max.   :7.000   Max.   :7.000  
#>                  NA's   :2       NA's   :3       NA's   :1      
#>      swk26           swk27           swk31      
#>  Min.   :1.000   Min.   :1.000   Min.   :1.000  
#>  1st Qu.:5.000   1st Qu.:4.000   1st Qu.:4.000  
#>  Median :5.000   Median :5.000   Median :5.000  
#>  Mean   :5.312   Mean   :4.936   Mean   :4.724  
#>  3rd Qu.:6.000   3rd Qu.:6.000   3rd Qu.:6.000  
#>  Max.   :7.000   Max.   :7.000   Max.   :7.000  
#>  NA's   :1       NA's   :5       NA's   :3

# alternatively:
swk_neueslernen %>% 
    psych::alpha() %>% 
    summary()
#> 
#> Reliability analysis   
#>  raw_alpha std.alpha G6(smc) average_r S/N   ase mean   sd median_r
#>       0.63      0.63    0.65      0.19 1.7 0.033  5.1 0.77     0.18

8.5.2 Reliability analysis with `jmv`

A lot of the psych::alpha() functionality is available in jmv as well, plus McDonald’s $\omega$ - the correct reliability measure for $\tau$-congeneric variables, i.e. variables with a measurement model that allows different slope/loading parameters. (To be fair: there is also a function in the psych package for omega).

jmv::reliability(
    data = adolescents,
    vars = vars(swk1, swk7, swk14, swk16, swk26, swk27, swk31),
    omegaScale = TRUE,
    alphaItems = TRUE,
    omegaItems = TRUE,
    itemRestCor = TRUE,
    revItems = vars(swk1, swk26))
#> 
#>  RELIABILITY ANALYSIS
#> 
#>  Scale Reliability Statistics              
#>  ───────────────────────────────────────── 
#>             Cronbach's α    McDonald's ω   
#>  ───────────────────────────────────────── 
#>    scale           0.254           0.330   
#>  ───────────────────────────────────────── 
#>    Note. items 'swk1' and 'swk26'
#>    correlate negatively with the total
#>    scale and probably should be
#>    reversed
#> 
#> 
#>  Item Reliability Statistics                                          
#>  ──────────────────────────────────────────────────────────────────── 
#>               item-rest correlation    Cronbach's α    McDonald's ω   
#>  ──────────────────────────────────────────────────────────────────── 
#>    swk1  ᵃ                  -0.2904         0.42954           0.483   
#>    swk7                      0.4269        -0.04970           0.116   
#>    swk14                     0.0817         0.24063           0.344   
#>    swk16                     0.0845         0.23831           0.340   
#>    swk26 ᵃ                  -0.2099         0.39386           0.453   
#>    swk27                     0.2701         0.09027           0.215   
#>    swk31                     0.3931        -0.00373           0.139   
#>  ──────────────────────────────────────────────────────────────────── 
#>    ᵃ reverse scaled item

Oops… we mistakenly reversed some items again! Now correctly and with an additional bivariate correlation heatmap of the items (corPlot = TRUE):

jmv::reliability(
    data = adolescents,
    vars = vars(swk1, swk7, swk14, swk16, swk26, swk27, swk31),
    omegaScale = TRUE,
    corPlot = TRUE,
    alphaItems = TRUE,
    omegaItems = TRUE,
    itemRestCor = TRUE)
#> 
#>  RELIABILITY ANALYSIS
#> 
#>  Scale Reliability Statistics              
#>  ───────────────────────────────────────── 
#>             Cronbach's α    McDonald's ω   
#>  ───────────────────────────────────────── 
#>    scale           0.637           0.643   
#>  ───────────────────────────────────────── 
#> 
#> 
#>  Item Reliability Statistics                                        
#>  ────────────────────────────────────────────────────────────────── 
#>             item-rest correlation    Cronbach's α    McDonald's ω   
#>  ────────────────────────────────────────────────────────────────── 
#>    swk1                     0.352           0.601           0.615   
#>    swk7                     0.390           0.586           0.601   
#>    swk14                    0.418           0.577           0.597   
#>    swk16                    0.195           0.648           0.650   
#>    swk26                    0.271           0.622           0.634   
#>    swk27                    0.385           0.588           0.598   
#>    swk31                    0.431           0.573           0.589   
#>  ──────────────────────────────────────────────────────────────────