Diagnostics

mixed

Author

Sungkyun Cho

Published

December 20, 2024

Load libraries
library(haven)
library(psych)
library(tidyverse)
library(lavaan)
library(semTools)
library(manymome)

Non-normality

MVN: An R Package for Assessing Multivariate Normality

Load data
library(covsim)
set.seed(1)

sigma.target <- matrix(c(1, 0.4, 0.3, 0.4, 1, 0.4, 0.3, 0.4, 1), 3)
ig.sample <- rIG(N = 10^3, sigma.target = sigma.target, reps = 1, skewness = c(0, sqrt(8), 0), excesskurtosis = c(0, 12, 6))

df <- ig.sample |> as.data.frame() |> as_tibble()
df |> print()
# A tibble: 1,000 × 3
      X1     X2     X3
   <dbl>  <dbl>  <dbl>
1 -0.626 -0.609 -0.472
2  0.184 -0.125  0.799
3 -0.836 -0.749 -0.773
4  1.60   2.95   0.975
5  0.330 -0.268  0.693
6 -0.820  1.22   1.27 
# ℹ 994 more rows
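
As a quick sanity check (my addition), the sample skewness and excess kurtosis can be compared against the targets specified above, using psych::describe() since psych is already loaded:
psych::describe(df)[, c("skew", "kurtosis")]  # targets: skewness 0, 2.83, 0; excess kurtosis 0, 12, 6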

Multivariate test

  • mardia for Mardia’s test,
  • hz for Henze-Zirkler’s test, # default
  • royston for Royston’s test,
  • dh for Doornik-Hansen’s test,
  • energy for E-statistic
library(MVN)
mvn(df) |> print()
$multivariateNormality
           Test       HZ p value MVN
1 Henze-Zirkler 26.31374       0  NO

$univariateNormality
              Test  Variable Statistic   p value Normality
1 Anderson-Darling    X1        0.2219  0.8297      YES   
2 Anderson-Darling    X2       62.9376  <0.001      NO    
3 Anderson-Darling    X3        4.1598  <0.001      NO    

$Descriptives
      n         Mean   Std.Dev      Median       Min      Max       25th      75th
X1 1000 -0.011648142 1.0349158 -0.03532423 -3.008049 3.810277 -0.6973732 0.6884280
X2 1000  0.027678496 1.0566535 -0.18501542 -1.580517 8.663489 -0.5357336 0.2325938
X3 1000  0.009367604 0.9815131 -0.01418352 -4.190712 4.747516 -0.5385744 0.5710483
          Skew     Kurtosis
X1 -0.01913836 -0.007768915
X2  2.96485086 13.333058070
X3  0.29143698  2.195906716
# multivariate normality
mvn(data = df, mvnTest = "hz")$multivariateNormality |> print()
           Test       HZ p value MVN
1 Henze-Zirkler 26.31374       0  NO
# multivariate normality plot
results <- mvn(data = df, multivariatePlot = "qq")

Univariate test

  • Shapiro-Wilk (SW),
  • Cramer-von Mises (CVM),
  • Lilliefors (Lillie),
  • Shapiro-Francia (SF),
  • Anderson-Darling (AD) # default
mvn(data = df, univariateTest = "AD")$univariateNormality |> print()
              Test  Variable Statistic   p value Normality
1 Anderson-Darling    X1        0.2219  0.8297      YES   
2 Anderson-Darling    X2       62.9376  <0.001      NO    
3 Anderson-Darling    X3        4.1598  <0.001      NO    
# univariate normality histogram
results <- mvn(data = df, univariatePlot = "histogram")

# univariate normality scatterplot
results <- mvn(data = df, univariatePlot = "scatter")

Outliers

# outliers
results <- mvn(data = df, mvnTest = "hz", multivariateOutlierMethod = "adj")

Homework Dataset

hw_mean <- haven::read_sav("data/chap 19 latent means/Homework means.sav")
hw_mean <- hw_mean |> 
  select(-ethnic, -Female, hw10 = f1s36a2, hw12 = f2s25f2)

set.seed(123)
hw_mean <- hw_mean |> sample_n(300)
mvn(data = hw_mean)
$multivariateNormality
           Test       HZ      p value MVN
1 Henze-Zirkler 1.014463 2.875198e-07  NO

$univariateNormality
               Test Variable Statistic p value Normality
1  Anderson-Darling bytxrstd    2.0702  <0.001        NO
2  Anderson-Darling bytxmstd    3.1087  <0.001        NO
3  Anderson-Darling bytxsstd    0.5534  0.1522       YES
4  Anderson-Darling bytxhstd    1.6028   4e-04        NO
5  Anderson-Darling parocc     17.0342  <0.001        NO
6  Anderson-Darling hw10        9.8312  <0.001        NO
7  Anderson-Darling hw12        5.7268  <0.001        NO
8  Anderson-Darling eng92       1.4401   0.001        NO
9  Anderson-Darling math92      1.1281  0.0058        NO
10 Anderson-Darling sci92       0.7714  0.0445        NO
11 Anderson-Darling soc92       1.0731   0.008        NO
12 Anderson-Darling byfaminc    3.2795  <0.001        NO
13 Anderson-Darling bypared    10.0582  <0.001        NO

$Descriptives
           n      Mean   Std.Dev Median    Min    Max    25th    75th        Skew   Kurtosis
bytxrstd 247 52.252498  9.729559 52.744 29.459 67.499 44.2160 61.0010 -0.21638586 -0.9412619
bytxmstd 247 52.921267  9.785079 52.593 32.049 70.118 44.5670 60.9505  0.01656429 -1.1789082
bytxsstd 247 52.323486 10.027881 52.525 29.079 73.829 45.2340 59.2450  0.03041495 -0.6364867
bytxhstd 247 52.560567  9.909501 53.781 25.166 69.508 45.8075 60.7700 -0.38347369 -0.5212263
parocc   247 54.081619 20.713100 61.320  7.320 81.870 49.7000 66.1800 -0.82741669 -0.5561309
hw10     247  2.627530  1.708102  2.000  0.000  7.000  1.0000  4.0000  0.74801463 -0.2616338
hw12     247  3.210526  1.901062  3.000  0.000  8.000  2.0000  4.0000  0.59291685 -0.1371959
eng92    247  6.594130  2.344748  7.000  0.500 11.120  4.9000  8.3300 -0.27431510 -0.6372800
math92   247  5.943846  2.397474  6.000  0.500 11.120  4.2500  7.7500 -0.12080923 -0.7570884
sci92    247  6.252267  2.468523  6.500  0.670 11.330  4.5000  8.0000 -0.14739430 -0.7185309
soc92    247  6.708988  2.473552  6.830  0.000 11.000  5.0700  8.6450 -0.32097156 -0.5374977
byfaminc 247 10.121457  2.438142 10.000  3.000 15.000  8.0000 12.0000 -0.38695907 -0.1149796
bypared  247  3.283401  1.316194  3.000  1.000  6.000  3.0000  4.0000  0.40848247 -0.3096812
# multivariate normality
hw_mean2 <- hw_mean |> 
  na.omit()  # qqplot does not handle missing data
results <- mvn(data = hw_mean2, multivariatePlot = "qq")

# univariate normality
results <- mvn(data = hw_mean, univariatePlot = "histogram")

results <- mvn(data = hw_mean, univariatePlot = "box")
Warning message in uniPlot(data, type = univariatePlot):
“Box-Plots are based on standardized values (centered and scaled).”

# outliers
results <- mvn(data = hw_mean, mvnTest = "hz", multivariateOutlierMethod = "adj")  # show options

The flagged outliers

# outliers
results <- mvn(data = hw_mean, mvnTest = "hz", multivariateOutlierMethod = "adj", showOutliers = TRUE, showNewData = TRUE)  # show options

results$multivariateOutliers |> as_tibble() |> print()
# A tibble: 62 × 3
  Observation `Mahalanobis Distance` Outlier
  <chr>                        <dbl> <chr>  
1 1                             94.0 TRUE   
2 2                             88.9 TRUE   
3 3                             86.8 TRUE   
4 4                             85.5 TRUE   
5 5                             83.8 TRUE   
6 6                             73.0 TRUE   
# ℹ 56 more rows

The data set with the outliers removed

results$newData
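
For example (my addition), the normality tests could be re-run on the trimmed data to see how much the flagged cases were driving the multivariate non-normality:
mvn(data = results$newData, mvnTest = "hz")$multivariateNormality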

Dealing with Non-normality

The main approaches are:
  • Robust ML
  • Bootstrapping
  • Distribution-free methods: WLS (fully weighted least squares)

The robust ML approach
ML parameter estimates are generally robust to non-normality, but the standard errors (and the chi-square test statistic) can be distorted.
The following robust versions of ML provide corrections for the standard errors and test statistics.

  1. Option “MLM” is for complete data sets only and generates the mean-adjusted Satorra-Bentler scaled chi-square.
  2. Option “MLR” can be applied in complete or incomplete data sets, and it generates a mean adjusted chi-square based on the Yuan-Bentler T2 statistic. Because this method accommodates missing data, it is the most flexible option listed here.
  3. Option “MLMV” is for complete data sets only and computes a mean- and variance-adjusted scaled chi-square.
  4. Option “MLMVS” generates a mean- and variance adjusted chi-square with a correction for heteroscedasticity by Satterthwaite (1941). Model degrees of freedom are estimated, and the method is for complete data sets.

Source: pp. 137, 163, Kline, R. B. (2023). Principles and Practice of Structural Equation Modeling (5e)

Robust ML

When the data contain missing values: estimator = "MLR"

  • can be combined with FIML (Full Information Maximum Likelihood) for the missing values (missing = "FIML")
  • ‘Huber-White’ robust standard errors

When there are no missing values: estimator = "MLM", "MLMV", or "MLMVS" - missing cases are dropped listwise

hw_model <- "
  famback =~ parocc + byfaminc + bypared
  prevach =~ bytxrstd + bytxmstd + bytxsstd + bytxhstd
  hw =~ hw12 + hw10
  grades =~ eng92 + math92 + sci92 + soc92

  bytxrstd ~~ eng92
  bytxmstd ~~ math92
  bytxsstd ~~ sci92
  bytxhstd ~~ soc92

  prevach ~ famback
  grades ~ prevach + hw
  hw ~ prevach + famback
"
sem_fit_robust <- sem(hw_model, data = hw_mean, estimator = "MLR")
summary(sem_fit_robust, standardized = TRUE, fit.measures = TRUE) |> print()
lavaan 0.6-19 ended normally after 141 iterations

  Estimator                                         ML
  Optimization method                           NLMINB
  Number of model parameters                        35

                                                  Used       Total
  Number of observations                           247         300

Model Test User Model:
                                              Standard      Scaled
  Test Statistic                                66.999      67.084
  Degrees of freedom                                56          56
  P-value (Chi-square)                           0.149       0.147
  Scaling correction factor                                  0.999
    Yuan-Bentler correction (Mplus variant)                       

Model Test Baseline Model:

  Test statistic                              1996.735    1887.529
  Degrees of freedom                                78          78
  P-value                                        0.000       0.000
  Scaling correction factor                                  1.058

User Model versus Baseline Model:

  Comparative Fit Index (CFI)                    0.994       0.994
  Tucker-Lewis Index (TLI)                       0.992       0.991
                                                                  
  Robust Comparative Fit Index (CFI)                         0.994
  Robust Tucker-Lewis Index (TLI)                            0.992

Loglikelihood and Information Criteria:

  Loglikelihood user model (H0)              -8046.960   -8046.960
  Scaling correction factor                                  1.030
      for the MLR correction                                      
  Loglikelihood unrestricted model (H1)             NA          NA
  Scaling correction factor                                  1.011
      for the MLR correction                                      
                                                                  
  Akaike (AIC)                               16163.920   16163.920
  Bayesian (BIC)                             16286.748   16286.748
  Sample-size adjusted Bayesian (SABIC)      16175.799   16175.799

Root Mean Square Error of Approximation:

  RMSEA                                          0.028       0.028
  90 Percent confidence interval - lower         0.000       0.000
  90 Percent confidence interval - upper         0.051       0.051
  P-value H_0: RMSEA <= 0.050                    0.942       0.942
  P-value H_0: RMSEA >= 0.080                    0.000       0.000
                                                                  
  Robust RMSEA                                               0.028
  90 Percent confidence interval - lower                     0.000
  90 Percent confidence interval - upper                     0.051
  P-value H_0: Robust RMSEA <= 0.050                         0.942
  P-value H_0: Robust RMSEA >= 0.080                         0.000

Standardized Root Mean Square Residual:

  SRMR                                           0.028       0.028

Parameter Estimates:

  Standard errors                             Sandwich
  Information bread                           Observed
  Observed information based on                Hessian

Latent Variables:
                   Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
  famback =~                                                            
    parocc            1.000                              15.208    0.736
    byfaminc          0.127    0.010   13.060    0.000    1.931    0.794
    bypared           0.074    0.006   12.172    0.000    1.125    0.856
  prevach =~                                                            
    bytxrstd          1.000                               8.238    0.848
    bytxmstd          1.010    0.057   17.831    0.000    8.320    0.852
    bytxsstd          1.022    0.063   16.195    0.000    8.422    0.841
    bytxhstd          1.003    0.060   16.634    0.000    8.261    0.837
  hw =~                                                                 
    hw12              1.000                               1.221    0.644
    hw10              1.045    0.165    6.351    0.000    1.276    0.749
  grades =~                                                             
    eng92             1.000                               2.093    0.895
    math92            0.845    0.057   14.749    0.000    1.769    0.742
    sci92             0.995    0.050   19.888    0.000    2.082    0.845
    soc92             1.057    0.044   23.791    0.000    2.213    0.895

Regressions:
                   Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
  prevach ~                                                             
    famback           0.320    0.038    8.384    0.000    0.591    0.591
  grades ~                                                              
    prevach           0.109    0.021    5.192    0.000    0.431    0.431
    hw                0.562    0.168    3.341    0.001    0.328    0.328
  hw ~                                                                  
    prevach           0.055    0.017    3.275    0.001    0.368    0.368
    famback           0.018    0.009    1.995    0.046    0.225    0.225

Covariances:
                   Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
 .bytxrstd ~~                                                           
   .eng92             0.079    0.523    0.151    0.880    0.079    0.015
 .bytxmstd ~~                                                           
   .math92            2.081    0.570    3.651    0.000    2.081    0.255
 .bytxsstd ~~                                                           
   .sci92             0.249    0.694    0.360    0.719    0.249    0.035
 .bytxhstd ~~                                                           
   .soc92             0.424    0.645    0.657    0.511    0.424    0.071

Variances:
                   Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
   .parocc          196.008   21.642    9.057    0.000  196.008    0.459
   .byfaminc          2.191    0.297    7.375    0.000    2.191    0.370
   .bypared           0.460    0.081    5.710    0.000    0.460    0.267
   .bytxrstd         26.465    4.083    6.482    0.000   26.465    0.281
   .bytxmstd         26.038    2.975    8.753    0.000   26.038    0.273
   .bytxsstd         29.285    3.609    8.114    0.000   29.285    0.292
   .bytxhstd         29.231    3.951    7.398    0.000   29.231    0.300
   .hw12              2.107    0.273    7.723    0.000    2.107    0.585
   .hw10              1.277    0.269    4.737    0.000    1.277    0.439
   .eng92             1.092    0.155    7.060    0.000    1.092    0.200
   .math92            2.550    0.254   10.026    0.000    2.550    0.449
   .sci92             1.742    0.205    8.511    0.000    1.742    0.287
   .soc92             1.221    0.195    6.254    0.000    1.221    0.200
    famback         231.288   32.809    7.049    0.000    1.000    1.000
   .prevach          44.162    5.703    7.743    0.000    0.651    0.651
   .hw                1.068    0.280    3.810    0.000    0.716    0.716
   .grades            2.476    0.350    7.079    0.000    0.565    0.565
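
As a quick cross-check (my addition, not part of the original analysis), the same model can be refit with one of the complete-data robust estimators listed above and the scaled/robust fit indices compared via fitMeasures():
sem_fit_mlm <- sem(hw_model, data = hw_mean, estimator = "MLM")  # listwise deletion of missing cases
fitMeasures(sem_fit_mlm,
            c("chisq.scaled", "df.scaled", "pvalue.scaled",
              "cfi.robust", "rmsea.robust", "srmr"))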

Bootstrapping

  • Hard to use when the sample is small (n < 100).
  • The standard errors (se) of the ML parameter estimates are estimated via the bootstrap
    • naive bootstrap: se = "bootstrap"
  • For the chi-square test, the Bollen-Stine bootstrap: test = "bootstrap"
    • the data are first transformed so that the model fits them perfectly, and the bootstrap is run on the transformed data
    • that is, in the transformed data \(\mathbf{S} = \hat{\Sigma}(\hat{\theta})\) (the shape of the distribution is not changed; see the formula below)
    • if there are missing data, use the specialized function bsBootMiss() from the semTools package
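
One common way to write this transformation (my addition; Bollen & Stine, 1992): with the mean-centered data matrix \(X_c\), the transformed data are \(Z = X_c\,\mathbf{S}^{-1/2}\,\hat{\Sigma}(\hat{\theta})^{1/2}\), so that the sample covariance matrix of \(Z\) equals \(\hat{\Sigma}(\hat{\theta})\) exactly.
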
Bootstrap standard errors

If “boot” or “bootstrap”, bootstrap standard errors are computed using standard bootstrapping (unless Bollen-Stine bootstrapping is requested for the test statistic; in this case bootstrap standard errors are computed using model-based bootstrapping).

Standard errors for the parameter estimates can be computed with an ordinary (naive) bootstrap via se = "bootstrap".

sem_fit_boot <- sem(
  hw_model,
  data = hw_mean,
  se = "bootstrap", # standard errors
  missing = "FIML", # missing data handling
  bootstrap = 1000, # number of bootstrap samples; default
  iseed = 1234 # for reproducibility
)
summary(sem_fit_boot, standardized = TRUE) |> print()
lavaan 0.6-19 ended normally after 175 iterations

  Estimator                                         ML
  Optimization method                           NLMINB
  Number of model parameters                        48

  Number of observations                           300
  Number of missing patterns                        18

Model Test User Model:
                                                      
  Test statistic                                68.247
  Degrees of freedom                                56
  P-value (Chi-square)                           0.126

Parameter Estimates:

  Standard errors                            Bootstrap
  Number of requested bootstrap draws             1000
  Number of successful bootstrap draws            1000

Latent Variables:
                   Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
  famback =~                                                            
    parocc            1.000                              16.913    0.774
    byfaminc          0.131    0.008   15.586    0.000    2.211    0.808
    bypared           0.066    0.005   13.718    0.000    1.108    0.829
  prevach =~                                                            
    bytxrstd          1.000                               8.630    0.858
    bytxmstd          0.980    0.050   19.448    0.000    8.457    0.856
    bytxsstd          0.977    0.055   17.824    0.000    8.430    0.847
    bytxhstd          0.965    0.053   18.245    0.000    8.330    0.844
  hw =~                                                                 
    hw12              1.000                               1.276    0.669
    hw10              0.963    0.157    6.145    0.000    1.228    0.714
  grades =~                                                             
    eng92             1.000                               2.466    0.915
    math92            0.853    0.044   19.443    0.000    2.105    0.798
    sci92             0.959    0.043   22.426    0.000    2.366    0.870
    soc92             1.044    0.034   30.901    0.000    2.574    0.917

Regressions:
                   Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
  prevach ~                                                             
    famback           0.304    0.034    8.864    0.000    0.596    0.596
  grades ~                                                              
    prevach           0.130    0.024    5.503    0.000    0.453    0.453
    hw                0.667    0.225    2.969    0.003    0.345    0.345
  hw ~                                                                  
    prevach           0.044    0.016    2.822    0.005    0.300    0.300
    famback           0.022    0.008    2.760    0.006    0.295    0.295

Covariances:
                   Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
 .bytxrstd ~~                                                           
   .eng92             0.232    0.501    0.463    0.643    0.232    0.042
 .bytxmstd ~~                                                           
   .math92            1.899    0.514    3.692    0.000    1.899    0.234
 .bytxsstd ~~                                                           
   .sci92             0.260    0.610    0.427    0.669    0.260    0.037
 .bytxhstd ~~                                                           
   .soc92             0.204    0.600    0.339    0.734    0.204    0.034

Intercepts:
                   Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
   .parocc           52.406    1.257   41.697    0.000   52.406    2.398
   .byfaminc          9.844    0.159   61.949    0.000    9.844    3.599
   .bypared           3.216    0.076   42.258    0.000    3.216    2.407
   .bytxrstd         51.337    0.595   86.307    0.000   51.337    5.105
   .bytxmstd         52.177    0.580   90.024    0.000   52.177    5.280
   .bytxsstd         51.627    0.592   87.252    0.000   51.627    5.189
   .bytxhstd         51.796    0.585   88.541    0.000   51.796    5.245
   .hw12              3.115    0.113   27.661    0.000    3.115    1.634
   .hw10              2.536    0.104   24.465    0.000    2.536    1.474
   .eng92             6.130    0.154   39.751    0.000    6.130    2.276
   .math92            5.534    0.153   36.256    0.000    5.534    2.100
   .sci92             5.846    0.156   37.569    0.000    5.846    2.149
   .soc92             6.283    0.163   38.516    0.000    6.283    2.238

Variances:
                   Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
   .parocc          191.591   19.789    9.682    0.000  191.591    0.401
   .byfaminc          2.592    0.363    7.134    0.000    2.592    0.346
   .bypared           0.557    0.074    7.567    0.000    0.557    0.312
   .bytxrstd         26.646    3.843    6.933    0.000   26.646    0.264
   .bytxmstd         26.119    2.859    9.136    0.000   26.119    0.267
   .bytxsstd         27.924    3.202    8.721    0.000   27.924    0.282
   .bytxhstd         28.116    3.551    7.917    0.000   28.116    0.288
   .hw12              2.008    0.301    6.667    0.000    2.008    0.552
   .hw10              1.453    0.284    5.124    0.000    1.453    0.491
   .eng92             1.175    0.145    8.101    0.000    1.175    0.162
   .math92            2.519    0.226   11.162    0.000    2.519    0.363
   .sci92             1.800    0.190    9.487    0.000    1.800    0.243
   .soc92             1.257    0.181    6.950    0.000    1.257    0.159
    famback         286.051   33.754    8.474    0.000    1.000    1.000
   .prevach          48.024    5.646    8.506    0.000    0.645    0.645
   .hw                1.168    0.330    3.544    0.000    0.718    0.718
   .grades            3.202    0.378    8.481    0.000    0.526    0.526
Bootstrap Confidence Intervals

Since the ci values reported by lavaan's standardizedSolution() are reportedly not percentile bootstrap confidence intervals, using the following function from the semhelpinghands package is recommended instead:

library(semhelpinghands)
standardizedSolution_boot_ci(sem_fit_boot)
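
For example (my addition), the standardized estimates with their bootstrap percentile CIs can be restricted to the structural paths by filtering on the op column:
standardizedSolution_boot_ci(sem_fit_boot) |> 
  filter(op == "~") |> 
  print()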

For the chi-square test, the Bollen-Stine bootstrap is applied with test = "bootstrap":

sem_fit_boot <- sem(
  hw_model,
  data = hw_mean,
  test = "bootstrap", # model test
  bootstrap = 1000, # number of bootstrap samples; default
  iseed = 1234 # for reproducibility
)
summary(sem_fit_boot, standardized = TRUE, fit.measures = TRUE, estimates = FALSE) |> print()
lavaan 0.6-19 ended normally after 141 iterations

  Estimator                                         ML
  Optimization method                           NLMINB
  Number of model parameters                        35

                                                  Used       Total
  Number of observations                           247         300

Model Test User Model:
                                                      
  Test statistic                                66.999
  Degrees of freedom                                56
  P-value (Chi-square)                           0.149
                                                      
  Test statistic                                66.999
  Degrees of freedom                                56
  P-value (Bollen-Stine bootstrap)               0.202

Model Test Baseline Model:

  Test statistic                              1996.735
  Degrees of freedom                                78
  P-value                                        0.000

User Model versus Baseline Model:

  Comparative Fit Index (CFI)                    0.994
  Tucker-Lewis Index (TLI)                       0.992

Loglikelihood and Information Criteria:

  Loglikelihood user model (H0)              -8046.960
  Loglikelihood unrestricted model (H1)             NA
                                                      
  Akaike (AIC)                               16163.920
  Bayesian (BIC)                             16286.748
  Sample-size adjusted Bayesian (SABIC)      16175.799

Root Mean Square Error of Approximation:

  RMSEA                                          0.028
  90 Percent confidence interval - lower         0.000
  90 Percent confidence interval - upper         0.051
  P-value H_0: RMSEA <= 0.050                    0.942
  P-value H_0: RMSEA >= 0.080                    0.000

Standardized Root Mean Square Residual:

  SRMR                                           0.028

When there are missing data, use the specialized bsBootMiss() function from semTools:

library(semTools)
fit <- sem(
  hw_model, 
  data = hw_mean, 
  meanstructure = TRUE, 
  missing = "FIML" # missing data handling
)
bsboot <- bsBootMiss(fit, nBoot = 1000, seed = 1234)
summary(bsboot) |> print()
  |==================================================| 100%
Time elapsed to transform the data:
Time difference of 0.220299 secs

Time elapsed to fit the model to 1000 bootstrapped samples:
Time difference of 6.22466 mins

Mean of Theoretical Distribution = DF = 56 
Variance of Theoretical Distribution = 2*DF = 112 

Mean of Bootstrap Distribution = 59.32232 
Variance of Bootstrap Distribution = 164.2552 

Chi-Squared = 68.24697
Degrees of Freedom = 56
Theoretical p value = 0.1262701
    i.e., pchisq(68.24697, df = 56, lower.tail = FALSE)

Bootstrapped p value = 0.217

hist(bsboot, breaks = 50, legendArgs = list(x = "topleft"))

Distribution-free methods

WLS (fully weighted least squares); estimator = "WLS"
also called the ADF (Asymptotically Distribution-Free) estimator; it has drawn negative assessments (p. 141, Kline, R. B. (2023))

sem_fit <- sem(hw_model, data = hw_mean, estimator = "WLS")
summary(sem_fit, standardized = TRUE, fit.measures = TRUE) |> print()
lavaan 0.6-19 ended normally after 229 iterations

  Estimator                                        WLS
  Optimization method                           NLMINB
  Number of model parameters                        35

                                                  Used       Total
  Number of observations                           247         300

Model Test User Model:
                                                      
  Test statistic                                90.176
  Degrees of freedom                                56
  P-value (Chi-square)                           0.003

Model Test Baseline Model:

  Test statistic                               733.895
  Degrees of freedom                                78
  P-value                                        0.000

User Model versus Baseline Model:

  Comparative Fit Index (CFI)                    0.948
  Tucker-Lewis Index (TLI)                       0.927

Root Mean Square Error of Approximation:

  RMSEA                                          0.050
  90 Percent confidence interval - lower         0.030
  90 Percent confidence interval - upper         0.068
  P-value H_0: RMSEA <= 0.050                    0.485
  P-value H_0: RMSEA >= 0.080                    0.003

Standardized Root Mean Square Residual:

  SRMR                                           0.040

Parameter Estimates:

  Standard errors                             Standard
  Information                                 Expected
  Information saturated (h1) model        Unstructured

Latent Variables:
                   Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
  famback =~                                                            
    parocc            1.000                              15.912    0.780
    byfaminc          0.121    0.008   15.623    0.000    1.923    0.788
    bypared           0.074    0.005   15.043    0.000    1.170    0.875
  prevach =~                                                            
    bytxrstd          1.000                               8.547    0.877
    bytxmstd          1.004    0.044   22.584    0.000    8.585    0.869
    bytxsstd          0.967    0.047   20.609    0.000    8.262    0.859
    bytxhstd          0.985    0.044   22.480    0.000    8.417    0.862
  hw =~                                                                 
    hw12              1.000                               1.092    0.600
    hw10              1.206    0.186    6.498    0.000    1.317    0.810
  grades =~                                                             
    eng92             1.000                               2.109    0.923
    math92            0.827    0.048   17.197    0.000    1.744    0.725
    sci92             0.980    0.042   23.289    0.000    2.066    0.855
    soc92             1.038    0.040   26.001    0.000    2.189    0.889

Regressions:
                   Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
  prevach ~                                                             
    famback           0.286    0.029    9.966    0.000    0.532    0.532
  grades ~                                                              
    prevach           0.110    0.016    6.713    0.000    0.447    0.447
    hw                0.616    0.156    3.959    0.000    0.319    0.319
  hw ~                                                                  
    prevach           0.048    0.013    3.708    0.000    0.375    0.375
    famback           0.012    0.006    2.162    0.031    0.179    0.179

Covariances:
                   Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
 .bytxrstd ~~                                                           
   .eng92            -0.304    0.437   -0.694    0.487   -0.304   -0.074
 .bytxmstd ~~                                                           
   .math92            2.510    0.456    5.501    0.000    2.510    0.310
 .bytxsstd ~~                                                           
   .sci92            -0.656    0.513   -1.277    0.202   -0.656   -0.106
 .bytxhstd ~~                                                           
   .soc92             0.177    0.520    0.340    0.734    0.177    0.032

Variances:
                   Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
   .parocc          163.471   17.751    9.209    0.000  163.471    0.392
   .byfaminc          2.258    0.240    9.422    0.000    2.258    0.379
   .bypared           0.418    0.071    5.892    0.000    0.418    0.234
   .bytxrstd         21.834    3.150    6.932    0.000   21.834    0.230
   .bytxmstd         23.842    2.478    9.623    0.000   23.842    0.244
   .bytxsstd         24.190    2.637    9.173    0.000   24.190    0.262
   .bytxhstd         24.578    3.007    8.173    0.000   24.578    0.258
   .hw12              2.117    0.217    9.760    0.000    2.117    0.640
   .hw10              0.910    0.255    3.566    0.000    0.910    0.344
   .eng92             0.770    0.125    6.178    0.000    0.770    0.148
   .math92            2.747    0.211   12.997    0.000    2.747    0.475
   .sci92             1.571    0.155   10.148    0.000    1.571    0.269
   .soc92             1.272    0.163    7.802    0.000    1.272    0.210
    famback         253.186   28.024    9.035    0.000    1.000    1.000
   .prevach          52.345    4.815   10.872    0.000    0.717    0.717
   .hw                0.902    0.201    4.483    0.000    0.756    0.756
   .grades            2.509    0.293    8.569    0.000    0.564    0.564

Missing Data

Types of missing data

  • MCAR: Missing Completely At Random; the missingness is unrelated to any variable at all; considered very rare
  • MAR: Missing At Random; the pattern of missingness is related to other variables but not to the values of the variable that is missing; it can therefore be accounted for by those other variables
  • MNAR: Missing Not At Random; the missingness is related to the value of the variable itself;
    • e.g., people with high levels of depression being reluctant to report their depressive symptoms,
    • people with low/high incomes being reluctant to report their income, etc.

Handling missing data

  • Listwise deletion / Pairwise deletion
  • Full Information Maximum Likelihood (FIML): uses all of the data, including cases with missing values
  • Imputation (filling in the missing values)
    • Multivariate Imputation by Chained Equations (MICE) is generally regarded as the best-performing approach
    • see the mice package and its reference book
    • convenient when the estimator is not ML-based
  • FIML and MICE usually yield similar results.
  • There are also machine-learning-based methods: random forest, k-nearest neighbors, …

Visualizing missing data: the naniar package; Missing Data Visualisations, Getting Started with naniar

Load data
hw_mean <- haven::read_sav("data/chap 19 latent means/Homework means.sav")
hw_mean <- hw_mean |> 
  rename(hw10 = f1s36a2, hw12 = f2s25f2)
  
set.seed(123)
hw_mean <- hw_mean |> sample_n(300)
skimr::skim(hw_mean) |> print()
── Data Summary ────────────────────────
                           Values 
Name                       hw_mean
Number of rows             300    
Number of columns          15     
_______________________           
Column type frequency:            
  numeric                  15     
________________________          
Group variables            None   

── Variable type: numeric ────────────────────────────────────────────────────────────────
   skim_variable n_missing complete_rate   mean     sd    p0   p25   p50   p75 p100 hist 
 1 bytxrstd              6         0.98  51.3   10.0   29.5  43.1  50.7  59.2  67.5 ▃▆▇▇▇
 2 bytxmstd              6         0.98  52.2    9.87  32.0  44.3  51.4  60.7  70.1 ▃▇▆▆▆
 3 bytxsstd              7         0.977 51.6    9.93  29.1  44.7  51.2  58.1  73.8 ▂▆▇▆▂
 4 bytxhstd              7         0.977 51.8    9.86  25.2  45.2  52.1  60.8  69.5 ▁▅▇▇▇
 5 parocc                2         0.993 52.5   21.9    7.32 27.4  61.3  66.2  81.9 ▃▁▁▇▂
 6 hw10                 15         0.95   2.60   1.70   0     1     2     4     7   ▇▇▆▂▂
 7 ethnic                4         0.987  0.784  0.412  0     1     1     1     1   ▂▁▁▁▇
 8 hw12                 28         0.907  3.19   1.89   0     2     3     4     8   ▂▇▂▃▁
 9 eng92                 4         0.987  6.16   2.68   0     4.22  6.5   8.25 11.2 ▂▅▅▇▃
10 math92                4         0.987  5.56   2.63   0     3.5   5.5   7.5  11.1 ▃▆▇▇▃
11 sci92                 4         0.987  5.87   2.71   0     3.91  6     8    11.3 ▃▆▇▇▃
12 soc92                 4         0.987  6.31   2.80   0     4.55  6.33  8.57 11   ▂▅▇▇▆
13 Female                0         1      0.503  0.501  0     0     1     1     1   ▇▁▁▁▇
14 byfaminc             10         0.967  9.82   2.73   2     8    10    12    15   ▁▂▃▇▂
15 bypared               1         0.997  3.21   1.34   1     2     3     4     6   ▅▇▂▂▂
library(naniar)
# summarize missingness by variable
gg_miss_var(hw_mean, facet = ethnic)  # by ethnic group

# inspect individual missing values and their clustering pattern
vis_miss(hw_mean, cluster = TRUE)

# patterns of variables that tend to be missing together
gg_miss_upset(hw_mean, nsets = n_var_miss(hw_mean))
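
A rough global check of the MCAR assumption (my addition) is Little's test, available in naniar as mcar_test(); a small p-value argues against MCAR. (If the haven value labels cause trouble here, strip them first with labelled::remove_labels().)
naniar::mcar_test(hw_mean)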

Patterns of missingness

Suspecting that students with lower achievement may tend not to report the time they spent on homework,
we add indicator variables for missingness on three of the variables, as below, and look at their correlations with the other variables (sizable correlations with observed variables argue against MCAR):

# add a variable to the dataset that indicates the missingness
hw_mean <- hw_mean |> 
  mutate(
    missing_reading = is.na(bytxrstd),
    missing_hw12 = is.na(hw12),
    missing_hw10 = is.na(hw10)
  )
psych::lowerCor(hw_mean)
                 bytxr bytxm bytxs bytxh parcc hw10  ethnc hw12  eng92 mth92 sci92 soc92
bytxrstd          1.00                                                                  
bytxmstd          0.73  1.00                                                            
bytxsstd          0.71  0.73  1.00                                                      
bytxhstd          0.73  0.71  0.74  1.00                                                
parocc            0.37  0.41  0.32  0.35  1.00                                          
hw10              0.25  0.29  0.26  0.30  0.21  1.00                                    
ethnic            0.28  0.28  0.30  0.31  0.29  0.13  1.00                              
hw12              0.30  0.29  0.24  0.23  0.21  0.47  0.00  1.00                        
eng92             0.53  0.47  0.43  0.42  0.37  0.34  0.27  0.30  1.00                  
math92            0.48  0.50  0.38  0.37  0.29  0.25  0.20  0.26  0.71  1.00            
sci92             0.50  0.47  0.42  0.38  0.31  0.28  0.25  0.30  0.79  0.74  1.00      
soc92             0.54  0.49  0.45  0.44  0.35  0.36  0.24  0.33  0.85  0.71  0.79  1.00
Female           -0.14  0.01  0.09  0.09 -0.04 -0.05 -0.05 -0.01 -0.23 -0.04 -0.08 -0.14
byfaminc          0.38  0.43  0.40  0.39  0.63  0.23  0.36  0.21  0.36  0.28  0.32  0.38
bypared           0.44  0.43  0.44  0.39  0.64  0.28  0.30  0.26  0.39  0.34  0.36  0.37
missing_reading*    NA    NA    NA    NA -0.01  0.01 -0.04 -0.02  0.05  0.01  0.01  0.03
missing_hw12*    -0.25 -0.19 -0.20 -0.19 -0.24 -0.02 -0.12    NA -0.39 -0.32 -0.34 -0.36
missing_hw10*    -0.21 -0.12 -0.09 -0.11 -0.15    NA -0.10 -0.08 -0.40 -0.31 -0.32 -0.40
                 Femal byfmn byprd mss_* m_12* m_10*
Female            1.00                              
byfaminc          0.13  1.00                        
bypared           0.01  0.66  1.00                  
missing_reading* -0.05 -0.03  0.01  1.00            
missing_hw12*     0.04 -0.28 -0.11 -0.05  1.00      
missing_hw10*     0.04 -0.24 -0.12 -0.03  0.40  1.00
ggplot(hw_mean, aes(x = missing_hw12, y = eng92)) +
  geom_violin() +
  geom_jitter(alpha=.3)

ggplot(hw_mean, aes(x = hw12, y = eng92)) +
 geom_miss_point(jitter = 0.1)

FIML

Full Information Maximum Likelihood

  • uses all observations rather than discarding cases with missing values
  • for each case, the log-likelihood is computed from only the variables observed for that case (using the corresponding block of the model-implied mean vector and covariance matrix), and these casewise contributions are summed (see the toy sketch below)
  • can be inappropriate under MNAR (missing not at random)
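
A toy sketch of this idea (my own illustration, not lavaan's internal code; assumes the mvtnorm package is installed): evaluate the multivariate-normal log-density of each case over its observed variables only, using the matching block of a mean vector mu and covariance matrix sigma, then sum over cases. The model-implied moments could, for instance, be taken from lavInspect(fit, "mean.ov") and lavInspect(fit, "cov.ov").
library(mvtnorm)

fiml_loglik <- function(data, mu, sigma) {
  # sum of casewise log-likelihoods, each over the variables observed for that case
  sum(apply(as.matrix(data), 1, function(row) {
    obs <- !is.na(row)  # which variables this case was observed on
    dmvnorm(row[obs], mean = mu[obs], sigma = sigma[obs, obs, drop = FALSE], log = TRUE)
  }))
}
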
Load data
hw_mean <- haven::read_sav("data/chap 19 latent means/Homework means.sav")
hw_mean <- hw_mean |> 
  rename(hw10 = f1s36a2, hw12 = f2s25f2) |>
  labelled::remove_labels()

set.seed(123)
hw_mean <- hw_mean |> sample_n(300)
# extract only the rows that contain at least one NA
hw_mean |> 
  filter(rowSums(is.na(hw_mean)) > 0) |> print()
# A tibble: 55 × 15
  bytxrstd bytxmstd bytxsstd bytxhstd parocc  hw10 ethnic  hw12 eng92 math92 sci92 soc92
     <dbl>    <dbl>    <dbl>    <dbl>  <dbl> <dbl>  <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl>
1     46.4     55.2     46.7     42.4   NA      NA      0     2  1.29   0.4   0.4   0.67
2     46.2     50.2     51.1     61.0   15.6     0      1    NA  3.33  NA     0     0   
3     48.4     45.4     62.4     48.8   66.2     1      1     6  1.8    1.5   3.25  3.78
4     59.1     44.7     NA       NA     56.3     3      0     4  8.6    7.25 10.2   9   
5     35.8     32.6     42.5     35.3   56.3    NA      1     1  0.56   3     1.37  0.75
6     37.0     44.4     48.8     52.1   20.3    NA      1     0  0.78   0     1.75  1.29
# ℹ 49 more rows
# ℹ 3 more variables: Female <dbl>, byfaminc <dbl>, bypared <dbl>
hw_model <- "
  famback =~ parocc + byfaminc + bypared
  prevach =~ bytxrstd + bytxmstd + bytxsstd + bytxhstd
  hw =~ hw12 + hw10
  grades =~ eng92 + math92 + sci92 + soc92

  bytxrstd ~~ eng92
  bytxmstd ~~ math92
  bytxsstd ~~ sci92
  bytxhstd ~~ soc92

  prevach ~ famback
  grades ~ prevach + hw
  hw ~ prevach + famback
"
sem_fit <- sem(hw_model,
  data = hw_mean,
  estimator = "MLR",  # robust ML for non-normal data
  missing = "FIML"  # FIML for missing data; "MLR"일 때 자동으로 FIML로 설정됨
)
summary(sem_fit, standardized = TRUE, fit.measures = TRUE) |> print()
lavaan 0.6-19 ended normally after 175 iterations

  Estimator                                         ML
  Optimization method                           NLMINB
  Number of model parameters                        48

  Number of observations                           300
  Number of missing patterns                        18

Model Test User Model:
                                              Standard      Scaled
  Test Statistic                                68.247      67.485
  Degrees of freedom                                56          56
  P-value (Chi-square)                           0.126       0.140
  Scaling correction factor                                  1.011
    Yuan-Bentler correction (Mplus variant)                       

Model Test Baseline Model:

  Test statistic                              2543.132    2388.205
  Degrees of freedom                                78          78
  P-value                                        0.000       0.000
  Scaling correction factor                                  1.065

User Model versus Baseline Model:

  Comparative Fit Index (CFI)                    0.995       0.995
  Tucker-Lewis Index (TLI)                       0.993       0.993
                                                                  
  Robust Comparative Fit Index (CFI)                         0.995
  Robust Tucker-Lewis Index (TLI)                            0.993

Loglikelihood and Information Criteria:

  Loglikelihood user model (H0)              -9645.718   -9645.718
  Scaling correction factor                                  1.013
      for the MLR correction                                      
  Loglikelihood unrestricted model (H1)      -9611.594   -9611.594
  Scaling correction factor                                  1.012
      for the MLR correction                                      
                                                                  
  Akaike (AIC)                               19387.435   19387.435
  Bayesian (BIC)                             19565.217   19565.217
  Sample-size adjusted Bayesian (SABIC)      19412.989   19412.989

Root Mean Square Error of Approximation:

  RMSEA                                          0.027       0.026
  90 Percent confidence interval - lower         0.000       0.000
  90 Percent confidence interval - upper         0.047       0.046
  P-value H_0: RMSEA <= 0.050                    0.973       0.977
  P-value H_0: RMSEA >= 0.080                    0.000       0.000
                                                                  
  Robust RMSEA                                               0.027
  90 Percent confidence interval - lower                     0.000
  90 Percent confidence interval - upper                     0.048
  P-value H_0: Robust RMSEA <= 0.050                         0.965
  P-value H_0: Robust RMSEA >= 0.080                         0.000

Standardized Root Mean Square Residual:

  SRMR                                           0.030       0.030

Parameter Estimates:

  Standard errors                             Sandwich
  Information bread                           Observed
  Observed information based on                Hessian

Latent Variables:
                   Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
  famback =~                                                            
    parocc            1.000                              16.913    0.774
    byfaminc          0.131    0.008   15.828    0.000    2.211    0.808
    bypared           0.066    0.005   13.734    0.000    1.108    0.829
  prevach =~                                                            
    bytxrstd          1.000                               8.630    0.858
    bytxmstd          0.980    0.049   19.853    0.000    8.457    0.856
    bytxsstd          0.977    0.053   18.505    0.000    8.430    0.847
    bytxhstd          0.965    0.052   18.668    0.000    8.330    0.844
  hw =~                                                                 
    hw12              1.000                               1.276    0.669
    hw10              0.963    0.148    6.496    0.000    1.228    0.714
  grades =~                                                             
    eng92             1.000                               2.466    0.915
    math92            0.853    0.043   19.977    0.000    2.105    0.798
    sci92             0.959    0.042   23.062    0.000    2.366    0.870
    soc92             1.044    0.034   31.117    0.000    2.574    0.917

Regressions:
                   Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
  prevach ~                                                             
    famback           0.304    0.033    9.233    0.000    0.596    0.596
  grades ~                                                              
    prevach           0.130    0.022    5.923    0.000    0.453    0.453
    hw                0.667    0.194    3.445    0.001    0.345    0.345
  hw ~                                                                  
    prevach           0.044    0.016    2.844    0.004    0.300    0.300
    famback           0.022    0.008    2.749    0.006    0.295    0.295

Covariances:
                   Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
 .bytxrstd ~~                                                           
   .eng92             0.232    0.503    0.462    0.644    0.232    0.042
 .bytxmstd ~~                                                           
   .math92            1.899    0.517    3.670    0.000    1.899    0.234
 .bytxsstd ~~                                                           
   .sci92             0.260    0.611    0.426    0.670    0.260    0.037
 .bytxhstd ~~                                                           
   .soc92             0.204    0.581    0.350    0.726    0.204    0.034

Intercepts:
                   Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
   .parocc           52.406    1.267   41.367    0.000   52.406    2.398
   .byfaminc          9.844    0.159   61.966    0.000    9.844    3.599
   .bypared           3.216    0.077   41.634    0.000    3.216    2.407
   .bytxrstd         51.337    0.584   87.842    0.000   51.337    5.105
   .bytxmstd         52.177    0.575   90.798    0.000   52.177    5.280
   .bytxsstd         51.627    0.579   89.200    0.000   51.627    5.189
   .bytxhstd         51.796    0.575   90.130    0.000   51.796    5.245
   .hw12              3.115    0.116   26.946    0.000    3.115    1.634
   .hw10              2.536    0.101   25.024    0.000    2.536    1.474
   .eng92             6.130    0.156   39.189    0.000    6.130    2.276
   .math92            5.534    0.153   36.156    0.000    5.534    2.100
   .sci92             5.846    0.158   37.113    0.000    5.846    2.149
   .soc92             6.283    0.163   38.656    0.000    6.283    2.238

Variances:
                   Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
   .parocc          191.591   20.296    9.440    0.000  191.591    0.401
   .byfaminc          2.592    0.347    7.467    0.000    2.592    0.346
   .bypared           0.557    0.074    7.510    0.000    0.557    0.312
   .bytxrstd         26.646    3.670    7.261    0.000   26.646    0.264
   .bytxmstd         26.119    2.726    9.581    0.000   26.119    0.267
   .bytxsstd         27.924    3.161    8.834    0.000   27.924    0.282
   .bytxhstd         28.116    3.469    8.105    0.000   28.116    0.288
   .hw12              2.008    0.287    7.004    0.000    2.008    0.552
   .hw10              1.453    0.273    5.330    0.000    1.453    0.491
   .eng92             1.175    0.147    7.971    0.000    1.175    0.162
   .math92            2.519    0.230   10.975    0.000    2.519    0.363
   .sci92             1.800    0.192    9.397    0.000    1.800    0.243
   .soc92             1.257    0.183    6.879    0.000    1.257    0.159
    famback         286.051   32.782    8.726    0.000    1.000    1.000
   .prevach          48.024    5.785    8.302    0.000    0.645    0.645
   .hw                1.168    0.305    3.828    0.000    0.718    0.718
   .grades            3.202    0.386    8.302    0.000    0.526    0.526

Imputation

  • Use sem.mi(), the lavaan wrapper with built-in multiple imputation provided by semTools
    • MICE generates several data sets with the missing values imputed from regression models; the model is fit to each data set and the results are pooled across imputations (Rubin's rules; see the formulas below)
  • Alternatively, machine-learning algorithms can fill in the missing values directly to create a single completed data set, which is then analyzed.
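
For reference (my addition), Rubin's (1987) rules pool the \(m\) imputation-specific estimates as \(\bar{\theta} = \frac{1}{m}\sum_{j=1}^{m}\hat{\theta}_j\), with total sampling variance \(T = \bar{U} + \left(1 + \frac{1}{m}\right)B\), where \(\bar{U}\) is the average within-imputation variance and \(B = \frac{1}{m-1}\sum_{j=1}^{m}(\hat{\theta}_j - \bar{\theta})^2\) is the between-imputation variance.
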
Load data
hw_mean <- haven::read_sav("data/chap 19 latent means/Homework means.sav")
hw_mean <- hw_mean |> 
  rename(hw10 = f1s36a2, hw12 = f2s25f2) |> 
  labelled::remove_labels()
set.seed(123)
hw_mean <- hw_mean |> sample_n(300)
hw_model <- "
  famback =~ parocc + byfaminc + bypared
  prevach =~ bytxrstd + bytxmstd + bytxsstd + bytxhstd
  hw =~ hw12 + hw10
  grades =~ eng92 + math92 + sci92 + soc92

  bytxrstd ~~ eng92
  bytxmstd ~~ math92
  bytxsstd ~~ sci92
  bytxhstd ~~ soc92

  prevach ~ famback
  grades ~ prevach + hw
  hw ~ prevach + famback
"
pred_mat <- mice::quickpred(hw_mean, mincor = 0.25)  # use only predictors correlated at r > 0.25
sem_fit <- sem.mi(hw_model,
                  data = hw_mean,
                  estimator = "MLM",
                  miPackage = "mice", m = 10, seed = 123,  # mice 패키지 사용
                  miArgs = list(predictorMatrix = pred_mat)
)
summary(sem_fit, standardized = TRUE, fit.measures = TRUE)
lavaan.mi object based on 10 imputed data sets. 
See class?lavaan.mi help page for available methods. 

Convergence information:
The model converged on 10 imputed data sets 

Rubin's (1987) rules were used to pool point and SE estimates across 10 imputed data sets, and to calculate degrees of freedom for each parameter's t test and CI.

Model Test User Model:

                                              Standard      Scaled
  Test statistic                                66.798      66.063
  Degrees of freedom                                56          56
  P-value                                        0.153       0.168
  Average scaling correction factor                          1.011

Model Test Baseline Model:

  Test statistic                              2483.002    2485.266
  Degrees of freedom                                78          78
  P-value                                        0.000       0.000
  Scaling correction factor                                  0.999

User Model versus Baseline Model:

  Comparative Fit Index (CFI)                    0.996       0.996
  Tucker-Lewis Index (TLI)                       0.994       0.994
                                                                  
  Robust Comparative Fit Index (CFI)                         0.996
  Robust Tucker-Lewis Index (TLI)                            1.000

Root Mean Square Error of Approximation:

  RMSEA                                          0.025       0.024
  Confidence interval - lower                    0.000       0.000
  Confidence interval - upper                    0.046       0.000
  P-value H_0: RMSEA <= 0.05                     0.979       0.982
                                                                  
  Robust RMSEA                                               0.025
  Confidence interval - lower                                0.000
  Confidence interval - upper                                0.046

Standardized Root Mean Square Residual:

  SRMR                                           0.032       0.032
A lavaan.parameterEstimates: 39 x 12
lhs op rhs exo est se t df pvalue std.lv std.all label
<chr> <chr> <chr> <int> <lvn.vctr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
famback =~ parocc 0 1.00000000 0.000000000 NA NA NA 16.9496168 0.77384914
famback =~ byfaminc 0 0.13121803 0.008381035 15.6565414 Inf 2.997617e-55 2.2240953 0.81112072
famback =~ bypared 0 0.06524749 0.004744518 13.7521842 Inf 4.941480e-43 1.1059200 0.82771626
prevach =~ bytxrstd 0 1.00000000 0.000000000 NA NA NA 8.5903647 0.85607162
prevach =~ bytxmstd 0 0.98568829 0.049618611 19.8652939 Inf 8.127497e-88 8.4674219 0.85759578
prevach =~ bytxsstd 0 0.98072154 0.051524979 19.0339045 3273.3990 1.072947e-76 8.4247557 0.84712100
prevach =~ bytxhstd 0 0.96983846 0.051803967 18.7213163 2329.1752 5.767646e-73 8.3312661 0.84313210
hw =~ hw12 0 1.00000000 0.000000000 NA NA NA 1.2995948 0.68696647
hw =~ hw10 0 0.92385023 0.138226158 6.6836136 476.4311 6.531768e-11 1.2006310 0.70818514
grades =~ eng92 0 1.00000000 0.000000000 NA NA NA 2.4578334 0.91456192
grades =~ math92 0 0.85572904 0.042354997 20.2037326 5076.3862 2.281209e-87 2.1032395 0.79788580
grades =~ sci92 0 0.96374024 0.041243002 23.3673639 Inf 9.179699e-121 2.3687130 0.87133356
grades =~ soc92 0 1.04663090 0.036653908 28.5544150 4510.0989 5.415972e-165 2.5724444 0.91650259
bytxrstd ~~ eng92 0 0.20905253 0.517532041 0.4039412 Inf 6.862559e-01 0.2090525 0.03708374
bytxmstd ~~ math92 0 1.92330898 0.522066220 3.6840326 7901.0927 2.311070e-04 1.9233090 0.23835047
bytxsstd ~~ sci92 0 0.18129467 0.612794899 0.2958489 9042.6888 7.673523e-01 0.1812947 0.02571671
bytxhstd ~~ soc92 0 0.20708355 0.588586907 0.3518317 Inf 7.249645e-01 0.2070836 0.03471214
prevach ~ famback 0 0.30285173 0.031864614 9.5043275 2720.9412 4.261562e-21 0.5975556 0.59755562
grades ~ prevach 0 0.13345819 0.019806063 6.7382494 675.5309 3.436673e-11 0.4664492 0.46644925
grades ~ hw 0 0.61750612 0.161308353 3.8281100 418.0890 1.488829e-04 0.3265102 0.32651023
hw ~ prevach 0 0.04578762 0.015843193 2.8900500 4097.4572 3.872039e-03 0.3026577 0.30265768
hw ~ famback 0 0.02072202 0.007854752 2.6381509 1204.1109 8.443422e-03 0.2702614 0.27026141
parocc ~~ parocc 0 192.45185043 20.479141566 9.3974569 Inf 5.589774e-21 192.4518504 0.40115751
byfaminc ~~ byfaminc 0 2.57197952 0.339150592 7.5835914 6823.9857 3.807425e-14 2.5719795 0.34208318
bypared ~~ bypared 0 0.56213098 0.072652460 7.7372601 6435.6708 1.171931e-14 0.5621310 0.31488580
bytxrstd ~~ bytxrstd 0 26.89949774 3.649960535 7.3698051 4997.6575 1.989032e-13 26.8994977 0.26714138
bytxmstd ~~ bytxmstd 0 25.78761692 2.710712342 9.5132252 Inf 1.848411e-21 25.7876169 0.26452948
bytxsstd ~~ bytxsstd 0 27.92974234 3.170372671 8.8096086 Inf 1.255836e-18 27.9297423 0.28238602
bytxhstd ~~ bytxhstd 0 28.23067795 3.475072847 8.1237658 Inf 4.519374e-16 28.2306780 0.28912826
hw12 ~~ hw12 0 1.88991456 0.285407615 6.6218085 4492.9683 3.965259e-11 1.8899146 0.52807707
hw10 ~~ hw10 0 1.43274138 0.240963868 5.9458764 584.2203 4.733964e-09 1.4327414 0.49847381
eng92 ~~ eng92 0 1.18140710 0.154246192 7.6592302 8832.9652 2.067767e-14 1.1814071 0.16357650
math92 ~~ math92 0 2.52496228 0.228097781 11.0696486 4517.9871 4.036156e-28 2.5249623 0.36337825
sci92 ~~ sci92 0 1.77939551 0.188242981 9.4526527 3875.5226 5.548686e-21 1.7793955 0.24077782
soc92 ~~ soc92 0 1.26068628 0.190860744 6.6052676 Inf 3.968001e-11 1.2606863 0.16002300
famback ~~ famback 0 287.28951040 33.319558160 8.6222485 Inf 6.565204e-18 1.0000000 1.00000000
prevach ~~ prevach 0 47.44441120 5.781226901 8.2066336 Inf 2.274763e-16 0.6429273 0.64292728
hw ~~ hw 0 1.24576869 0.298623041 4.1717099 2183.9853 3.141502e-05 0.7376010 0.73760096
grades ~~ grades 0 3.22848941 0.382616586 8.4379233 Inf 3.230255e-17 0.5344345 0.53443448


Using a data set whose missing values have been filled in by kNN (k-nearest neighbors) imputation:

hw_mean_imp <- VIM::kNN(hw_mean, k = 5) # kNN imputation
sem_fit <- sem(hw_model,
  data = hw_mean_imp,
  estimator = "MLMVS"
)
summary(sem_fit, fit.measures = TRUE, standardized = TRUE) |> print()
lavaan 0.6-19 ended normally after 141 iterations

  Estimator                                         ML
  Optimization method                           NLMINB
  Number of model parameters                        35

  Number of observations                           300

Model Test User Model:
                                              Standard      Scaled
  Test Statistic                                62.574      46.436
  Degrees of freedom                                56      42.363
  P-value (Chi-square)                           0.254       0.308
  Scaling correction factor                                  1.348
    mean and variance adjusted correction                         

Model Test Baseline Model:

  Test statistic                              2607.768     303.657
  Degrees of freedom                                78       9.014
  P-value                                        0.000       0.000
  Scaling correction factor                                  8.588

User Model versus Baseline Model:

  Comparative Fit Index (CFI)                    0.997       0.986
  Tucker-Lewis Index (TLI)                       0.996       0.997

Loglikelihood and Information Criteria:

  Loglikelihood user model (H0)              -9861.267   -9861.267
  Loglikelihood unrestricted model (H1)      -9829.980   -9829.980
                                                                  
  Akaike (AIC)                               19792.534   19792.534
  Bayesian (BIC)                             19922.167   19922.167
  Sample-size adjusted Bayesian (SABIC)      19811.167   19811.167

Root Mean Square Error of Approximation:

  RMSEA                                          0.020       0.018
  90 Percent confidence interval - lower         0.000       0.000
  90 Percent confidence interval - upper         0.042       0.041
  P-value H_0: RMSEA <= 0.050                    0.991       0.993
  P-value H_0: RMSEA >= 0.080                    0.000       0.000

Standardized Root Mean Square Residual:

  SRMR                                           0.029       0.029

Parameter Estimates:

  Standard errors                           Robust.sem
  Information                                 Expected
  Information saturated (h1) model          Structured

Latent Variables:
                   Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
  famback =~                                                            
    parocc            1.000                              16.819    0.770
    byfaminc          0.130    0.008   15.666    0.000    2.194    0.810
    bypared           0.065    0.005   13.767    0.000    1.101    0.825
  prevach =~                                                            
    bytxrstd          1.000                               8.606    0.859
    bytxmstd          0.982    0.048   20.520    0.000    8.451    0.857
    bytxsstd          0.977    0.050   19.553    0.000    8.405    0.849
    bytxhstd          0.971    0.050   19.365    0.000    8.354    0.846
  hw =~                                                                 
    hw12              1.000                               1.300    0.693
    hw10              0.856    0.122    6.995    0.000    1.114    0.665
  grades =~                                                             
    eng92             1.000                               2.428    0.908
    math92            0.859    0.043   19.830    0.000    2.084    0.792
    sci92             0.968    0.042   23.301    0.000    2.349    0.869
    soc92             1.055    0.038   27.759    0.000    2.561    0.915

Regressions:
                   Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
  prevach ~                                                             
    famback           0.303    0.030    9.964    0.000    0.592    0.592
  grades ~                                                              
    prevach           0.124    0.020    6.093    0.000    0.439    0.439
    hw                0.649    0.166    3.912    0.000    0.348    0.348
  hw ~                                                                  
    prevach           0.049    0.015    3.199    0.001    0.323    0.323
    famback           0.024    0.008    3.054    0.002    0.307    0.307

Covariances:
                   Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
 .bytxrstd ~~                                                           
   .eng92             0.261    0.506    0.516    0.606    0.261    0.046
 .bytxmstd ~~                                                           
   .math92            1.951    0.506    3.854    0.000    1.951    0.238
 .bytxsstd ~~                                                           
   .sci92             0.214    0.596    0.360    0.719    0.214    0.031
 .bytxhstd ~~                                                           
   .soc92             0.250    0.575    0.434    0.664    0.250    0.042

Variances:
                   Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
   .parocc          193.835   20.125    9.632    0.000  193.835    0.407
   .byfaminc          2.517    0.325    7.748    0.000    2.517    0.343
   .bypared           0.570    0.070    8.107    0.000    0.570    0.320
   .bytxrstd         26.309    3.520    7.474    0.000   26.309    0.262
   .bytxmstd         25.862    2.691    9.612    0.000   25.862    0.266
   .bytxsstd         27.376    3.071    8.915    0.000   27.376    0.279
   .bytxhstd         27.760    3.364    8.251    0.000   27.760    0.285
   .hw12              1.826    0.274    6.656    0.000    1.826    0.519
   .hw10              1.561    0.229    6.806    0.000    1.561    0.557
   .eng92             1.251    0.155    8.056    0.000    1.251    0.175
   .math92            2.590    0.237   10.925    0.000    2.590    0.373
   .sci92             1.787    0.185    9.679    0.000    1.787    0.245
   .soc92             1.273    0.190    6.706    0.000    1.273    0.163
    famback         282.885   32.385    8.735    0.000    1.000    1.000
   .prevach          48.061    5.582    8.609    0.000    0.649    0.649
   .hw                1.156    0.282    4.103    0.000    0.684    0.684
   .grades            3.138    0.366    8.576    0.000    0.532    0.532

Linearity

trendlines <- function(data, mapping, ...){
  ggplot(data = data, mapping = mapping) + 
  geom_smooth(method = loess, se = FALSE, color = "orange", ...)
}
hw_mean |> select(-Female, -ethnic) |> 
  GGally::ggpairs(lower = list(continuous = trendlines))

library(car)
mod <- lm(eng92 ~ hw10, data=hw_mean)
crPlots(mod)

mod <- lm(eng92 ~ parocc, data=hw_mean)
crPlots(mod)