Diagnostics

mixed

Author

Sungkyun Cho

Published

December 20, 2024

Load libraries
library(haven)
library(psych)
library(tidyverse)
library(lavaan)
library(semTools)
library(manymome)

Non-normality

MVN: An R Package for Assessing Multivariate Normality

Load data
library(covsim)
set.seed(1)

sigma.target <- matrix(c(1, 0.4, 0.3, 0.4, 1, 0.4, 0.3, 0.4, 1), 3)
ig.sample <- rIG(N = 10^3, sigma.target = sigma.target, reps = 1, skewness = c(0, sqrt(8), 0), excesskurtosis = c(0, 12, 6))

df <- ig.sample |> as.data.frame() |> as_tibble()
df |> print()
# A tibble: 1,000 × 3
      X1     X2     X3
   <dbl>  <dbl>  <dbl>
1 -0.626 -0.609 -0.472
2  0.184 -0.125  0.799
3 -0.836 -0.749 -0.773
4  1.60   2.95   0.975
5  0.330 -0.268  0.693
6 -0.820  1.22   1.27 
# ℹ 994 more rows
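
As a quick sanity check (my addition), the sample skewness and excess kurtosis can be compared against the targets specified above, using psych::describe() since psych is already loaded:
psych::describe(df)[, c("skew", "kurtosis")]  # targets: skewness 0, 2.83, 0; excess kurtosis 0, 12, 6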

Multivariate test

  • mardia for Mardia’s test,
  • hz for Henze-Zirkler’s test, # default
  • royston for Royston’s test,
  • dh for Doornik-Hansen’s test,
  • energy for E-statistic
library(MVN)
mvn(df) |> print()
$multivariateNormality
           Test       HZ p value MVN
1 Henze-Zirkler 26.31374       0  NO

$univariateNormality
              Test  Variable Statistic   p value Normality
1 Anderson-Darling    X1        0.2219  0.8297      YES   
2 Anderson-Darling    X2       62.9376  <0.001      NO    
3 Anderson-Darling    X3        4.1598  <0.001      NO    

$Descriptives
      n         Mean   Std.Dev      Median       Min      Max       25th      75th
X1 1000 -0.011648142 1.0349158 -0.03532423 -3.008049 3.810277 -0.6973732 0.6884280
X2 1000  0.027678496 1.0566535 -0.18501542 -1.580517 8.663489 -0.5357336 0.2325938
X3 1000  0.009367604 0.9815131 -0.01418352 -4.190712 4.747516 -0.5385744 0.5710483
          Skew     Kurtosis
X1 -0.01913836 -0.007768915
X2  2.96485086 13.333058070
X3  0.29143698  2.195906716
# multivariate normality
mvn(data = df, mvnTest = "hz")$multivariateNormality |> print()
           Test       HZ p value MVN
1 Henze-Zirkler 26.31374       0  NO
# multivariate normality plot
results <- mvn(data = df, multivariatePlot = "qq")

Univariate test

  • Shapiro-Wilk (SW),
  • Cramer-von Mises (CVM),
  • Lilliefors (Lillie),
  • Shapiro-Francia (SF),
  • Anderson-Darling (AD) # default
mvn(data = df, univariateTest = "AD")$univariateNormality |> print()
              Test  Variable Statistic   p value Normality
1 Anderson-Darling    X1        0.2219  0.8297      YES   
2 Anderson-Darling    X2       62.9376  <0.001      NO    
3 Anderson-Darling    X3        4.1598  <0.001      NO    
# univariate normality histogram
results <- mvn(data = df, univariatePlot = "histogram")

# univariate normality scatterplot
results <- mvn(data = df, univariatePlot = "scatter")

Outliers

# outliers
results <- mvn(data = df, mvnTest = "hz", multivariateOutlierMethod = "adj")

Homework Dataset

hw_mean <- haven::read_sav("data/chap 19 latent means/Homework means.sav")
hw_mean <- hw_mean |> 
  select(-ethnic, -Female, hw10 = f1s36a2, hw12 = f2s25f2)

set.seed(123)
hw_mean <- hw_mean |> sample_n(300)
mvn(data = hw_mean)
$multivariateNormality
           Test       HZ      p value MVN
1 Henze-Zirkler 1.014463 2.875198e-07  NO

$univariateNormality
               Test Variable Statistic p value Normality
1  Anderson-Darling bytxrstd    2.0702  <0.001        NO
2  Anderson-Darling bytxmstd    3.1087  <0.001        NO
3  Anderson-Darling bytxsstd    0.5534  0.1522       YES
4  Anderson-Darling bytxhstd    1.6028   4e-04        NO
5  Anderson-Darling parocc     17.0342  <0.001        NO
6  Anderson-Darling hw10        9.8312  <0.001        NO
7  Anderson-Darling hw12        5.7268  <0.001        NO
8  Anderson-Darling eng92       1.4401   0.001        NO
9  Anderson-Darling math92      1.1281  0.0058        NO
10 Anderson-Darling sci92       0.7714  0.0445        NO
11 Anderson-Darling soc92       1.0731   0.008        NO
12 Anderson-Darling byfaminc    3.2795  <0.001        NO
13 Anderson-Darling bypared    10.0582  <0.001        NO

$Descriptives
           n      Mean   Std.Dev Median    Min    Max    25th    75th        Skew   Kurtosis
bytxrstd 247 52.252498  9.729559 52.744 29.459 67.499 44.2160 61.0010 -0.21638586 -0.9412619
bytxmstd 247 52.921267  9.785079 52.593 32.049 70.118 44.5670 60.9505  0.01656429 -1.1789082
bytxsstd 247 52.323486 10.027881 52.525 29.079 73.829 45.2340 59.2450  0.03041495 -0.6364867
bytxhstd 247 52.560567  9.909501 53.781 25.166 69.508 45.8075 60.7700 -0.38347369 -0.5212263
parocc   247 54.081619 20.713100 61.320  7.320 81.870 49.7000 66.1800 -0.82741669 -0.5561309
hw10     247  2.627530  1.708102  2.000  0.000  7.000  1.0000  4.0000  0.74801463 -0.2616338
hw12     247  3.210526  1.901062  3.000  0.000  8.000  2.0000  4.0000  0.59291685 -0.1371959
eng92    247  6.594130  2.344748  7.000  0.500 11.120  4.9000  8.3300 -0.27431510 -0.6372800
math92   247  5.943846  2.397474  6.000  0.500 11.120  4.2500  7.7500 -0.12080923 -0.7570884
sci92    247  6.252267  2.468523  6.500  0.670 11.330  4.5000  8.0000 -0.14739430 -0.7185309
soc92    247  6.708988  2.473552  6.830  0.000 11.000  5.0700  8.6450 -0.32097156 -0.5374977
byfaminc 247 10.121457  2.438142 10.000  3.000 15.000  8.0000 12.0000 -0.38695907 -0.1149796
bypared  247  3.283401  1.316194  3.000  1.000  6.000  3.0000  4.0000  0.40848247 -0.3096812
# multivariate normality
hw_mean2 <- hw_mean |> 
  na.omit()  # qqplot does not handle missing data
results <- mvn(data = hw_mean2, multivariatePlot = "qq")

# univariate normality
results <- mvn(data = hw_mean, univariatePlot = "histogram")

results <- mvn(data = hw_mean, univariatePlot = "box")
Warning message in uniPlot(data, type = univariatePlot):
“Box-Plots are based on standardized values (centered and scaled).”

# outliers
results <- mvn(data = hw_mean, mvnTest = "hz", multivariateOutlierMethod = "adj")  # show options

The flagged outliers

# outliers
results <- mvn(data = hw_mean, mvnTest = "hz", multivariateOutlierMethod = "adj", showOutliers = TRUE, showNewData = TRUE)  # show options

results$multivariateOutliers |> as_tibble() |> print()
# A tibble: 62 × 3
  Observation `Mahalanobis Distance` Outlier
  <chr>                        <dbl> <chr>  
1 1                             94.0 TRUE   
2 2                             88.9 TRUE   
3 3                             86.8 TRUE   
4 4                             85.5 TRUE   
5 5                             83.8 TRUE   
6 6                             73.0 TRUE   
# ℹ 56 more rows

The data set with the outliers removed

results$newData
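
For example (my addition), the normality tests could be re-run on the trimmed data to see how much the flagged cases were driving the multivariate non-normality:
mvn(data = results$newData, mvnTest = "hz")$multivariateNormality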

Dealing with Non-normality

The main approaches are:
  • Robust ML
  • Bootstrapping
  • Distribution-free methods: WLS (fully weighted least squares)

The robust ML approach
ML parameter estimates are generally robust to non-normality, but the standard errors (and the chi-square test statistic) can be distorted.
The following robust versions of ML provide corrections for the standard errors and test statistics.

  1. Option “MLM” is for complete data sets only and generates the mean-adjusted Satorra-Bentler scaled chi-square.
  2. Option “MLR” can be applied in complete or incomplete data sets, and it generates a mean adjusted chi-square based on the Yuan-Bentler T2 statistic. Because this method accommodates missing data, it is the most flexible option listed here.
  3. Option “MLMV” is for complete data sets only and computes a mean- and variance-adjusted scaled chi-square.
  4. Option “MLMVS” generates a mean- and variance adjusted chi-square with a correction for heteroscedasticity by Satterthwaite (1941). Model degrees of freedom are estimated, and the method is for complete data sets.

Source: pp. 137, 163, Kline, R. B. (2023). Principles and Practice of Structural Equation Modeling (5e)

Robust ML

When the data contain missing values: estimator = "MLR"

  • can be combined with FIML (Full Information Maximum Likelihood) for the missing values (missing = "FIML")
  • ‘Huber-White’ robust standard errors

When there are no missing values: estimator = "MLM", "MLMV", or "MLMVS" - missing cases are dropped listwise

hw_model <- "
  famback =~ parocc + byfaminc + bypared
  prevach =~ bytxrstd + bytxmstd + bytxsstd + bytxhstd
  hw =~ hw12 + hw10
  grades =~ eng92 + math92 + sci92 + soc92

  bytxrstd ~~ eng92
  bytxmstd ~~ math92
  bytxsstd ~~ sci92
  bytxhstd ~~ soc92

  prevach ~ famback
  grades ~ prevach + hw
  hw ~ prevach + famback
"
sem_fit_robust <- sem(hw_model, data = hw_mean, estimator = "MLR")
summary(sem_fit_robust, standardized = TRUE, fit.measures = TRUE) |> print()
lavaan 0.6-19 ended normally after 141 iterations

  Estimator                                         ML
  Optimization method                           NLMINB
  Number of model parameters                        35

                                                  Used       Total
  Number of observations                           247         300

Model Test User Model:
                                              Standard      Scaled
  Test Statistic                                66.999      67.084
  Degrees of freedom                                56          56
  P-value (Chi-square)                           0.149       0.147
  Scaling correction factor                                  0.999
    Yuan-Bentler correction (Mplus variant)                       

Model Test Baseline Model:

  Test statistic                              1996.735    1887.529
  Degrees of freedom                                78          78
  P-value                                        0.000       0.000
  Scaling correction factor                                  1.058

User Model versus Baseline Model:

  Comparative Fit Index (CFI)                    0.994       0.994
  Tucker-Lewis Index (TLI)                       0.992       0.991
                                                                  
  Robust Comparative Fit Index (CFI)                         0.994
  Robust Tucker-Lewis Index (TLI)                            0.992

Loglikelihood and Information Criteria:

  Loglikelihood user model (H0)              -8046.960   -8046.960
  Scaling correction factor                                  1.030
      for the MLR correction                                      
  Loglikelihood unrestricted model (H1)             NA          NA
  Scaling correction factor                                  1.011
      for the MLR correction                                      
                                                                  
  Akaike (AIC)                               16163.920   16163.920
  Bayesian (BIC)                             16286.748   16286.748
  Sample-size adjusted Bayesian (SABIC)      16175.799   16175.799

Root Mean Square Error of Approximation:

  RMSEA                                          0.028       0.028
  90 Percent confidence interval - lower         0.000       0.000
  90 Percent confidence interval - upper         0.051       0.051
  P-value H_0: RMSEA <= 0.050                    0.942       0.942
  P-value H_0: RMSEA >= 0.080                    0.000       0.000
                                                                  
  Robust RMSEA                                               0.028
  90 Percent confidence interval - lower                     0.000
  90 Percent confidence interval - upper                     0.051
  P-value H_0: Robust RMSEA <= 0.050                         0.942
  P-value H_0: Robust RMSEA >= 0.080                         0.000

Standardized Root Mean Square Residual:

  SRMR                                           0.028       0.028

Parameter Estimates:

  Standard errors                             Sandwich
  Information bread                           Observed
  Observed information based on                Hessian

Latent Variables:
                   Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
  famback =~                                                            
    parocc            1.000                              15.208    0.736
    byfaminc          0.127    0.010   13.060    0.000    1.931    0.794
    bypared           0.074    0.006   12.172    0.000    1.125    0.856
  prevach =~                                                            
    bytxrstd          1.000                               8.238    0.848
    bytxmstd          1.010    0.057   17.831    0.000    8.320    0.852
    bytxsstd          1.022    0.063   16.195    0.000    8.422    0.841
    bytxhstd          1.003    0.060   16.634    0.000    8.261    0.837
  hw =~                                                                 
    hw12              1.000                               1.221    0.644
    hw10              1.045    0.165    6.351    0.000    1.276    0.749
  grades =~                                                             
    eng92             1.000                               2.093    0.895
    math92            0.845    0.057   14.749    0.000    1.769    0.742
    sci92             0.995    0.050   19.888    0.000    2.082    0.845
    soc92             1.057    0.044   23.791    0.000    2.213    0.895

Regressions:
                   Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
  prevach ~                                                             
    famback           0.320    0.038    8.384    0.000    0.591    0.591
  grades ~                                                              
    prevach           0.109    0.021    5.192    0.000    0.431    0.431
    hw                0.562    0.168    3.341    0.001    0.328    0.328
  hw ~                                                                  
    prevach           0.055    0.017    3.275    0.001    0.368    0.368
    famback           0.018    0.009    1.995    0.046    0.225    0.225

Covariances:
                   Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
 .bytxrstd ~~                                                           
   .eng92             0.079    0.523    0.151    0.880    0.079    0.015
 .bytxmstd ~~                                                           
   .math92            2.081    0.570    3.651    0.000    2.081    0.255
 .bytxsstd ~~                                                           
   .sci92             0.249    0.694    0.360    0.719    0.249    0.035
 .bytxhstd ~~                                                           
   .soc92             0.424    0.645    0.657    0.511    0.424    0.071

Variances:
                   Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
   .parocc          196.008   21.642    9.057    0.000  196.008    0.459
   .byfaminc          2.191    0.297    7.375    0.000    2.191    0.370
   .bypared           0.460    0.081    5.710    0.000    0.460    0.267
   .bytxrstd         26.465    4.083    6.482    0.000   26.465    0.281
   .bytxmstd         26.038    2.975    8.753    0.000   26.038    0.273
   .bytxsstd         29.285    3.609    8.114    0.000   29.285    0.292
   .bytxhstd         29.231    3.951    7.398    0.000   29.231    0.300
   .hw12              2.107    0.273    7.723    0.000    2.107    0.585
   .hw10              1.277    0.269    4.737    0.000    1.277    0.439
   .eng92             1.092    0.155    7.060    0.000    1.092    0.200
   .math92            2.550    0.254   10.026    0.000    2.550    0.449
   .sci92             1.742    0.205    8.511    0.000    1.742    0.287
   .soc92             1.221    0.195    6.254    0.000    1.221    0.200
    famback         231.288   32.809    7.049    0.000    1.000    1.000
   .prevach          44.162    5.703    7.743    0.000    0.651    0.651
   .hw                1.068    0.280    3.810    0.000    0.716    0.716
   .grades            2.476    0.350    7.079    0.000    0.565    0.565
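
As a quick cross-check (my addition, not part of the original analysis), the same model can be refit with one of the complete-data robust estimators listed above and the scaled/robust fit indices compared via fitMeasures():
sem_fit_mlm <- sem(hw_model, data = hw_mean, estimator = "MLM")  # listwise deletion of missing cases
fitMeasures(sem_fit_mlm,
            c("chisq.scaled", "df.scaled", "pvalue.scaled",
              "cfi.robust", "rmsea.robust", "srmr"))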

Bootstrapping

  • Hard to use when the sample is small (n < 100).
  • The standard errors (se) of the ML parameter estimates are estimated via the bootstrap
    • naive bootstrap: se = "bootstrap"
  • For the chi-square test, the Bollen-Stine bootstrap: test = "bootstrap"
    • the data are first transformed so that the model fits them perfectly, and the bootstrap is run on the transformed data
    • that is, in the transformed data \(\mathbf{S} = \hat{\Sigma}(\hat{\theta})\) (the shape of the distribution is not changed; see the formula below)
    • if there are missing data, use the specialized function bsBootMiss() from the semTools package
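
One common way to write this transformation (my addition; Bollen & Stine, 1992): with the mean-centered data matrix \(X_c\), the transformed data are \(Z = X_c\,\mathbf{S}^{-1/2}\,\hat{\Sigma}(\hat{\theta})^{1/2}\), so that the sample covariance matrix of \(Z\) equals \(\hat{\Sigma}(\hat{\theta})\) exactly.
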
Bootstrap standard errors

If “boot” or “bootstrap”, bootstrap standard errors are computed using standard bootstrapping (unless Bollen-Stine bootstrapping is requested for the test statistic; in this case bootstrap standard errors are computed using model-based bootstrapping).

Standard errors for the parameter estimates can be computed with an ordinary (naive) bootstrap via se = "bootstrap".

sem_fit_boot <- sem(
  hw_model,
  data = hw_mean,
  se = "bootstrap", # standard errors
  missing = "FIML", # missing data handling
  bootstrap = 1000, # number of bootstrap samples; default
  iseed = 1234 # for reproducibility
)
summary(sem_fit_boot, standardized = TRUE) |> print()
lavaan 0.6-19 ended normally after 175 iterations

  Estimator                                         ML
  Optimization method                           NLMINB
  Number of model parameters                        48

  Number of observations                           300
  Number of missing patterns                        18

Model Test User Model:
                                                      
  Test statistic                                68.247
  Degrees of freedom                                56
  P-value (Chi-square)                           0.126

Parameter Estimates:

  Standard errors                            Bootstrap
  Number of requested bootstrap draws             1000
  Number of successful bootstrap draws            1000

Latent Variables:
                   Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
  famback =~                                                            
    parocc            1.000                              16.913    0.774
    byfaminc          0.131    0.008   15.586    0.000    2.211    0.808
    bypared           0.066    0.005   13.718    0.000    1.108    0.829
  prevach =~                                                            
    bytxrstd          1.000                               8.630    0.858
    bytxmstd          0.980    0.050   19.448    0.000    8.457    0.856
    bytxsstd          0.977    0.055   17.824    0.000    8.430    0.847
    bytxhstd          0.965    0.053   18.245    0.000    8.330    0.844
  hw =~                                                                 
    hw12              1.000                               1.276    0.669
    hw10              0.963    0.157    6.145    0.000    1.228    0.714
  grades =~                                                             
    eng92             1.000                               2.466    0.915
    math92            0.853    0.044   19.443    0.000    2.105    0.798
    sci92             0.959    0.043   22.426    0.000    2.366    0.870
    soc92             1.044    0.034   30.901    0.000    2.574    0.917

Regressions:
                   Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
  prevach ~                                                             
    famback           0.304    0.034    8.864    0.000    0.596    0.596
  grades ~                                                              
    prevach           0.130    0.024    5.503    0.000    0.453    0.453
    hw                0.667    0.225    2.969    0.003    0.345    0.345
  hw ~                                                                  
    prevach           0.044    0.016    2.822    0.005    0.300    0.300
    famback           0.022    0.008    2.760    0.006    0.295    0.295

Covariances:
                   Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
 .bytxrstd ~~                                                           
   .eng92             0.232    0.501    0.463    0.643    0.232    0.042
 .bytxmstd ~~                                                           
   .math92            1.899    0.514    3.692    0.000    1.899    0.234
 .bytxsstd ~~                                                           
   .sci92             0.260    0.610    0.427    0.669    0.260    0.037
 .bytxhstd ~~                                                           
   .soc92             0.204    0.600    0.339    0.734    0.204    0.034

Intercepts:
                   Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
   .parocc           52.406    1.257   41.697    0.000   52.406    2.398
   .byfaminc          9.844    0.159   61.949    0.000    9.844    3.599
   .bypared           3.216    0.076   42.258    0.000    3.216    2.407
   .bytxrstd         51.337    0.595   86.307    0.000   51.337    5.105
   .bytxmstd         52.177    0.580   90.024    0.000   52.177    5.280
   .bytxsstd         51.627    0.592   87.252    0.000   51.627    5.189
   .bytxhstd         51.796    0.585   88.541    0.000   51.796    5.245
   .hw12              3.115    0.113   27.661    0.000    3.115    1.634
   .hw10              2.536    0.104   24.465    0.000    2.536    1.474
   .eng92             6.130    0.154   39.751    0.000    6.130    2.276
   .math92            5.534    0.153   36.256    0.000    5.534    2.100
   .sci92             5.846    0.156   37.569    0.000    5.846    2.149
   .soc92             6.283    0.163   38.516    0.000    6.283    2.238

Variances:
                   Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
   .parocc          191.591   19.789    9.682    0.000  191.591    0.401
   .byfaminc          2.592    0.363    7.134    0.000    2.592    0.346
   .bypared           0.557    0.074    7.567    0.000    0.557    0.312
   .bytxrstd         26.646    3.843    6.933    0.000   26.646    0.264
   .bytxmstd         26.119    2.859    9.136    0.000   26.119    0.267
   .bytxsstd         27.924    3.202    8.721    0.000   27.924    0.282
   .bytxhstd         28.116    3.551    7.917    0.000   28.116    0.288
   .hw12              2.008    0.301    6.667    0.000    2.008    0.552
   .hw10              1.453    0.284    5.124    0.000    1.453    0.491
   .eng92             1.175    0.145    8.101    0.000    1.175    0.162
   .math92            2.519    0.226   11.162    0.000    2.519    0.363
   .sci92             1.800    0.190    9.487    0.000    1.800    0.243
   .soc92             1.257    0.181    6.950    0.000    1.257    0.159
    famback         286.051   33.754    8.474    0.000    1.000    1.000
   .prevach          48.024    5.646    8.506    0.000    0.645    0.645
   .hw                1.168    0.330    3.544    0.000    0.718    0.718
   .grades            3.202    0.378    8.481    0.000    0.526    0.526
Bootstrap Confidence Intervals

Since the ci values reported by lavaan's standardizedSolution() are reportedly not percentile bootstrap confidence intervals, using the following function from the semhelpinghands package is recommended instead:

library(semhelpinghands)
standardizedSolution_boot_ci(sem_fit_boot)
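
For example (my addition), the standardized estimates with their bootstrap percentile CIs can be restricted to the structural paths by filtering on the op column:
standardizedSolution_boot_ci(sem_fit_boot) |> 
  filter(op == "~") |> 
  print()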

For the chi-square test, the Bollen-Stine bootstrap is applied with test = "bootstrap":

sem_fit_boot <- sem(
  hw_model,
  data = hw_mean,
  test = "bootstrap", # model test
  bootstrap = 1000, # number of bootstrap samples; default
  iseed = 1234 # for reproducibility
)
summary(sem_fit_boot, standardized = TRUE, fit.measures = TRUE, estimates = FALSE) |> print()
lavaan 0.6-19 ended normally after 141 iterations

  Estimator                                         ML
  Optimization method                           NLMINB
  Number of model parameters                        35

                                                  Used       Total
  Number of observations                           247         300

Model Test User Model:
                                                      
  Test statistic                                66.999
  Degrees of freedom                                56
  P-value (Chi-square)                           0.149
                                                      
  Test statistic                                66.999
  Degrees of freedom                                56
  P-value (Bollen-Stine bootstrap)               0.202

Model Test Baseline Model:

  Test statistic                              1996.735
  Degrees of freedom                                78
  P-value                                        0.000

User Model versus Baseline Model:

  Comparative Fit Index (CFI)                    0.994
  Tucker-Lewis Index (TLI)                       0.992

Loglikelihood and Information Criteria:

  Loglikelihood user model (H0)              -8046.960
  Loglikelihood unrestricted model (H1)             NA
                                                      
  Akaike (AIC)                               16163.920
  Bayesian (BIC)                             16286.748
  Sample-size adjusted Bayesian (SABIC)      16175.799

Root Mean Square Error of Approximation:

  RMSEA                                          0.028
  90 Percent confidence interval - lower         0.000
  90 Percent confidence interval - upper         0.051
  P-value H_0: RMSEA <= 0.050                    0.942
  P-value H_0: RMSEA >= 0.080                    0.000

Standardized Root Mean Square Residual:

  SRMR                                           0.028

When there are missing data, use the specialized bsBootMiss() function from semTools:

library(semTools)
fit <- sem(
  hw_model, 
  data = hw_mean, 
  meanstructure = TRUE, 
  missing = "FIML" # missing data handling
)
bsboot <- bsBootMiss(fit, nBoot = 1000, seed = 1234)
summary(bsboot) |> print()
  |==================================================| 100%
Time elapsed to transform the data:
Time difference of 0.220299 secs

Time elapsed to fit the model to 1000 bootstrapped samples:
Time difference of 6.22466 mins

Mean of Theoretical Distribution = DF = 56 
Variance of Theoretical Distribution = 2*DF = 112 

Mean of Bootstrap Distribution = 59.32232 
Variance of Bootstrap Distribution = 164.2552 

Chi-Squared = 68.24697
Degrees of Freedom = 56
Theoretical p value = 0.1262701
    i.e., pchisq(68.24697, df = 56, lower.tail = FALSE)

Bootstrapped p value = 0.217

hist(bsboot, breaks = 50, legendArgs = list(x = "topleft"))

Distribution-free methods

WLS (fully weighted least squares); estimator = "WLS"
also called the ADF (Asymptotically Distribution-Free) estimator; it has drawn negative assessments (p. 141, Kline, R. B. (2023))

sem_fit <- sem(hw_model, data = hw_mean, estimator = "WLS")
summary(sem_fit, standardized = TRUE, fit.measures = TRUE) |> print()
lavaan 0.6-19 ended normally after 229 iterations

  Estimator                                        WLS
  Optimization method                           NLMINB
  Number of model parameters                        35

                                                  Used       Total
  Number of observations                           247         300

Model Test User Model:
                                                      
  Test statistic                                90.176
  Degrees of freedom                                56
  P-value (Chi-square)                           0.003

Model Test Baseline Model:

  Test statistic                               733.895
  Degrees of freedom                                78
  P-value                                        0.000

User Model versus Baseline Model:

  Comparative Fit Index (CFI)                    0.948
  Tucker-Lewis Index (TLI)                       0.927

Root Mean Square Error of Approximation:

  RMSEA                                          0.050
  90 Percent confidence interval - lower         0.030
  90 Percent confidence interval - upper         0.068
  P-value H_0: RMSEA <= 0.050                    0.485
  P-value H_0: RMSEA >= 0.080                    0.003

Standardized Root Mean Square Residual:

  SRMR                                           0.040

Parameter Estimates:

  Standard errors                             Standard
  Information                                 Expected
  Information saturated (h1) model        Unstructured

Latent Variables:
                   Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
  famback =~                                                            
    parocc            1.000                              15.912    0.780
    byfaminc          0.121    0.008   15.623    0.000    1.923    0.788
    bypared           0.074    0.005   15.043    0.000    1.170    0.875
  prevach =~                                                            
    bytxrstd          1.000                               8.547    0.877
    bytxmstd          1.004    0.044   22.584    0.000    8.585    0.869
    bytxsstd          0.967    0.047   20.609    0.000    8.262    0.859
    bytxhstd          0.985    0.044   22.480    0.000    8.417    0.862
  hw =~                                                                 
    hw12              1.000                               1.092    0.600
    hw10              1.206    0.186    6.498    0.000    1.317    0.810
  grades =~                                                             
    eng92             1.000                               2.109    0.923
    math92            0.827    0.048   17.197    0.000    1.744    0.725
    sci92             0.980    0.042   23.289    0.000    2.066    0.855
    soc92             1.038    0.040   26.001    0.000    2.189    0.889

Regressions:
                   Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
  prevach ~                                                             
    famback           0.286    0.029    9.966    0.000    0.532    0.532
  grades ~                                                              
    prevach           0.110    0.016    6.713    0.000    0.447    0.447
    hw                0.616    0.156    3.959    0.000    0.319    0.319
  hw ~                                                                  
    prevach           0.048    0.013    3.708    0.000    0.375    0.375
    famback           0.012    0.006    2.162    0.031    0.179    0.179

Covariances:
                   Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
 .bytxrstd ~~                                                           
   .eng92            -0.304    0.437   -0.694    0.487   -0.304   -0.074
 .bytxmstd ~~                                                           
   .math92            2.510    0.456    5.501    0.000    2.510    0.310
 .bytxsstd ~~                                                           
   .sci92            -0.656    0.513   -1.277    0.202   -0.656   -0.106
 .bytxhstd ~~                                                           
   .soc92             0.177    0.520    0.340    0.734    0.177    0.032

Variances:
                   Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
   .parocc          163.471   17.751    9.209    0.000  163.471    0.392
   .byfaminc          2.258    0.240    9.422    0.000    2.258    0.379
   .bypared           0.418    0.071    5.892    0.000    0.418    0.234
   .bytxrstd         21.834    3.150    6.932    0.000   21.834    0.230
   .bytxmstd         23.842    2.478    9.623    0.000   23.842    0.244
   .bytxsstd         24.190    2.637    9.173    0.000   24.190    0.262
   .bytxhstd         24.578    3.007    8.173    0.000   24.578    0.258
   .hw12              2.117    0.217    9.760    0.000    2.117    0.640
   .hw10              0.910    0.255    3.566    0.000    0.910    0.344
   .eng92             0.770    0.125    6.178    0.000    0.770    0.148
   .math92            2.747    0.211   12.997    0.000    2.747    0.475
   .sci92             1.571    0.155   10.148    0.000    1.571    0.269
   .soc92             1.272    0.163    7.802    0.000    1.272    0.210
    famback         253.186   28.024    9.035    0.000    1.000    1.000
   .prevach          52.345    4.815   10.872    0.000    0.717    0.717
   .hw                0.902    0.201    4.483    0.000    0.756    0.756
   .grades            2.509    0.293    8.569    0.000    0.564    0.564

Missing Data

Types of missing data

  • MCAR: Missing Completely At Random; the missingness is unrelated to any variable at all; considered very rare
  • MAR: Missing At Random; the pattern of missingness is related to other variables but not to the values of the variable that is missing; it can therefore be accounted for by those other variables
  • MNAR: Missing Not At Random; the missingness is related to the value of the variable itself;
    • e.g., people with high levels of depression being reluctant to report their depressive symptoms,
    • people with low/high incomes being reluctant to report their income, etc.

Handling missing data

  • Listwise deletion / Pairwise deletion
  • Full Information Maximum Likelihood (FIML): uses all of the data, including cases with missing values
  • Imputation (filling in the missing values)
    • Multivariate Imputation by Chained Equations (MICE) is generally regarded as the best-performing approach
    • see the mice package and its reference book
    • convenient when the estimator is not ML-based
  • FIML and MICE usually yield similar results.
  • There are also machine-learning-based methods: random forest, k-nearest neighbors, …

Visualizing missing data: the naniar package; Missing Data Visualisations, Getting Started with naniar

Load data
hw_mean <- haven::read_sav("data/chap 19 latent means/Homework means.sav")
hw_mean <- hw_mean |> 
  rename(hw10 = f1s36a2, hw12 = f2s25f2)
  
set.seed(123)
hw_mean <- hw_mean |> sample_n(300)
skimr::skim(hw_mean) |> print()
── Data Summary ────────────────────────
                           Values 
Name                       hw_mean
Number of rows             300    
Number of columns          15     
_______________________           
Column type frequency:            
  numeric                  15     
________________________          
Group variables            None   

── Variable type: numeric ────────────────────────────────────────────────────────────────
   skim_variable n_missing complete_rate   mean     sd    p0   p25   p50   p75 p100 hist 
 1 bytxrstd              6         0.98  51.3   10.0   29.5  43.1  50.7  59.2  67.5 ▃▆▇▇▇
 2 bytxmstd              6         0.98  52.2    9.87  32.0  44.3  51.4  60.7  70.1 ▃▇▆▆▆
 3 bytxsstd              7         0.977 51.6    9.93  29.1  44.7  51.2  58.1  73.8 ▂▆▇▆▂
 4 bytxhstd              7         0.977 51.8    9.86  25.2  45.2  52.1  60.8  69.5 ▁▅▇▇▇
 5 parocc                2         0.993 52.5   21.9    7.32 27.4  61.3  66.2  81.9 ▃▁▁▇▂
 6 hw10                 15         0.95   2.60   1.70   0     1     2     4     7   ▇▇▆▂▂
 7 ethnic                4         0.987  0.784  0.412  0     1     1     1     1   ▂▁▁▁▇
 8 hw12                 28         0.907  3.19   1.89   0     2     3     4     8   ▂▇▂▃▁
 9 eng92                 4         0.987  6.16   2.68   0     4.22  6.5   8.25 11.2 ▂▅▅▇▃
10 math92                4         0.987  5.56   2.63   0     3.5   5.5   7.5  11.1 ▃▆▇▇▃
11 sci92                 4         0.987  5.87   2.71   0     3.91  6     8    11.3 ▃▆▇▇▃
12 soc92                 4         0.987  6.31   2.80   0     4.55  6.33  8.57 11   ▂▅▇▇▆
13 Female                0         1      0.503  0.501  0     0     1     1     1   ▇▁▁▁▇
14 byfaminc             10         0.967  9.82   2.73   2     8    10    12    15   ▁▂▃▇▂
15 bypared               1         0.997  3.21   1.34   1     2     3     4     6   ▅▇▂▂▂
library(naniar)
# summarize missingness by variable
gg_miss_var(hw_mean, facet = ethnic)  # by ethnic group

# inspect individual missing values and their clustering pattern
vis_miss(hw_mean, cluster = TRUE)

# patterns of variables that tend to be missing together
gg_miss_upset(hw_mean, nsets = n_var_miss(hw_mean))
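
A rough global check of the MCAR assumption (my addition) is Little's test, available in naniar as mcar_test(); a small p-value argues against MCAR. (If the haven value labels cause trouble here, strip them first with labelled::remove_labels().)
naniar::mcar_test(hw_mean)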

Patterns of missingness

Suspecting that students with lower achievement may tend not to report the time they spent on homework,
we add indicator variables for missingness on three of the variables, as below, and look at their correlations with the other variables (sizable correlations with observed variables argue against MCAR):

# add a variable to the dataset that indicates the missingness
hw_mean <- hw_mean |> 
  mutate(
    missing_reading = is.na(bytxrstd),
    missing_hw12 = is.na(hw12),
    missing_hw10 = is.na(hw10)
  )
psych::lowerCor(hw_mean)
                 bytxr bytxm bytxs bytxh parcc hw10  ethnc hw12  eng92 mth92 sci92 soc92
bytxrstd          1.00                                                                  
bytxmstd          0.73  1.00                                                            
bytxsstd          0.71  0.73  1.00                                                      
bytxhstd          0.73  0.71  0.74  1.00                                                
parocc            0.37  0.41  0.32  0.35  1.00                                          
hw10              0.25  0.29  0.26  0.30  0.21  1.00                                    
ethnic            0.28  0.28  0.30  0.31  0.29  0.13  1.00                              
hw12              0.30  0.29  0.24  0.23  0.21  0.47  0.00  1.00                        
eng92             0.53  0.47  0.43  0.42  0.37  0.34  0.27  0.30  1.00                  
math92            0.48  0.50  0.38  0.37  0.29  0.25  0.20  0.26  0.71  1.00            
sci92             0.50  0.47  0.42  0.38  0.31  0.28  0.25  0.30  0.79  0.74  1.00      
soc92             0.54  0.49  0.45  0.44  0.35  0.36  0.24  0.33  0.85  0.71  0.79  1.00
Female           -0.14  0.01  0.09  0.09 -0.04 -0.05 -0.05 -0.01 -0.23 -0.04 -0.08 -0.14
byfaminc          0.38  0.43  0.40  0.39  0.63  0.23  0.36  0.21  0.36  0.28  0.32  0.38
bypared           0.44  0.43  0.44  0.39  0.64  0.28  0.30  0.26  0.39  0.34  0.36  0.37
missing_reading*    NA    NA    NA    NA -0.01  0.01 -0.04 -0.02  0.05  0.01  0.01  0.03
missing_hw12*    -0.25 -0.19 -0.20 -0.19 -0.24 -0.02 -0.12    NA -0.39 -0.32 -0.34 -0.36
missing_hw10*    -0.21 -0.12 -0.09 -0.11 -0.15    NA -0.10 -0.08 -0.40 -0.31 -0.32 -0.40
                 Femal byfmn byprd mss_* m_12* m_10*
Female            1.00                              
byfaminc          0.13  1.00                        
bypared           0.01  0.66  1.00                  
missing_reading* -0.05 -0.03  0.01  1.00            
missing_hw12*     0.04 -0.28 -0.11 -0.05  1.00      
missing_hw10*     0.04 -0.24 -0.12 -0.03  0.40  1.00
ggplot(hw_mean, aes(x = missing_hw12, y = eng92)) +
  geom_violin() +
  geom_jitter(alpha=.3)

ggplot(hw_mean, aes(x = hw12, y = eng92)) +
 geom_miss_point(jitter = 0.1)

FIML

Full Information Maximum Likelihood

  • uses all observations rather than discarding cases with missing values
  • for each case, the log-likelihood is computed from only the variables observed for that case (using the corresponding block of the model-implied mean vector and covariance matrix), and these casewise contributions are summed (see the toy sketch below)
  • can be inappropriate under MNAR (missing not at random)
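
A toy sketch of this idea (my own illustration, not lavaan's internal code; assumes the mvtnorm package is installed): evaluate the multivariate-normal log-density of each case over its observed variables only, using the matching block of a mean vector mu and covariance matrix sigma, then sum over cases. The model-implied moments could, for instance, be taken from lavInspect(fit, "mean.ov") and lavInspect(fit, "cov.ov").
library(mvtnorm)

fiml_loglik <- function(data, mu, sigma) {
  # sum of casewise log-likelihoods, each over the variables observed for that case
  sum(apply(as.matrix(data), 1, function(row) {
    obs <- !is.na(row)  # which variables this case was observed on
    dmvnorm(row[obs], mean = mu[obs], sigma = sigma[obs, obs, drop = FALSE], log = TRUE)
  }))
}
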
Load data
hw_mean <- haven::read_sav("data/chap 19 latent means/Homework means.sav")
hw_mean <- hw_mean |> 
  rename(hw10 = f1s36a2, hw12 = f2s25f2) |>
  labelled::remove_labels()

set.seed(123)
hw_mean <- hw_mean |> sample_n(300)
# extract only the rows that contain at least one NA
hw_mean |> 
  filter(rowSums(is.na(hw_mean)) > 0) |> print()
# A tibble: 55 × 15
  bytxrstd bytxmstd bytxsstd bytxhstd parocc  hw10 ethnic  hw12 eng92 math92 sci92 soc92
     <dbl>    <dbl>    <dbl>    <dbl>  <dbl> <dbl>  <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl>
1     46.4     55.2     46.7     42.4   NA      NA      0     2  1.29   0.4   0.4   0.67
2     46.2     50.2     51.1     61.0   15.6     0      1    NA  3.33  NA     0     0   
3     48.4     45.4     62.4     48.8   66.2     1      1     6  1.8    1.5   3.25  3.78
4     59.1     44.7     NA       NA     56.3     3      0     4  8.6    7.25 10.2   9   
5     35.8     32.6     42.5     35.3   56.3    NA      1     1  0.56   3     1.37  0.75
6     37.0     44.4     48.8     52.1   20.3    NA      1     0  0.78   0     1.75  1.29
# ℹ 49 more rows
# ℹ 3 more variables: Female <dbl>, byfaminc <dbl>, bypared <dbl>
hw_model <- "
  famback =~ parocc + byfaminc + bypared
  prevach =~ bytxrstd + bytxmstd + bytxsstd + bytxhstd
  hw =~ hw12 + hw10
  grades =~ eng92 + math92 + sci92 + soc92

  bytxrstd ~~ eng92
  bytxmstd ~~ math92
  bytxsstd ~~ sci92
  bytxhstd ~~ soc92

  prevach ~ famback
  grades ~ prevach + hw
  hw ~ prevach + famback
"
sem_fit <- sem(hw_model,
  data = hw_mean,
  estimator = "MLR",  # robust ML for non-normal data
  missing = "FIML"  # FIML for missing data; "MLR"일 때 자동으로 FIML로 설정됨
)
summary(sem_fit, standardized = TRUE, fit.measures = TRUE) |> print()
lavaan 0.6-19 ended normally after 175 iterations

  Estimator                                         ML
  Optimization method                           NLMINB
  Number of model parameters                        48

  Number of observations                           300
  Number of missing patterns                        18

Model Test User Model:
                                              Standard      Scaled
  Test Statistic                                68.247      67.485
  Degrees of freedom                                56          56
  P-value (Chi-square)                           0.126       0.140
  Scaling correction factor                                  1.011
    Yuan-Bentler correction (Mplus variant)                       

Model Test Baseline Model:

  Test statistic                              2543.132    2388.205
  Degrees of freedom                                78          78
  P-value                                        0.000       0.000
  Scaling correction factor                                  1.065

User Model versus Baseline Model:

  Comparative Fit Index (CFI)                    0.995       0.995
  Tucker-Lewis Index (TLI)                       0.993       0.993
                                                                  
  Robust Comparative Fit Index (CFI)                         0.995
  Robust Tucker-Lewis Index (TLI)                            0.993

Loglikelihood and Information Criteria:

  Loglikelihood user model (H0)              -9645.718   -9645.718
  Scaling correction factor                                  1.013
      for the MLR correction                                      
  Loglikelihood unrestricted model (H1)      -9611.594   -9611.594
  Scaling correction factor                                  1.012
      for the MLR correction                                      
                                                                  
  Akaike (AIC)                               19387.435   19387.435
  Bayesian (BIC)                             19565.217   19565.217
  Sample-size adjusted Bayesian (SABIC)      19412.989   19412.989

Root Mean Square Error of Approximation:

  RMSEA                                          0.027       0.026
  90 Percent confidence interval - lower         0.000       0.000
  90 Percent confidence interval - upper         0.047       0.046
  P-value H_0: RMSEA <= 0.050                    0.973       0.977
  P-value H_0: RMSEA >= 0.080                    0.000       0.000
                                                                  
  Robust RMSEA                                               0.027
  90 Percent confidence interval - lower                     0.000
  90 Percent confidence interval - upper                     0.048
  P-value H_0: Robust RMSEA <= 0.050                         0.965
  P-value H_0: Robust RMSEA >= 0.080                         0.000

Standardized Root Mean Square Residual:

  SRMR                                           0.030       0.030

Parameter Estimates:

  Standard errors                             Sandwich
  Information bread                           Observed
  Observed information based on                Hessian

Latent Variables:
                   Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
  famback =~                                                            
    parocc            1.000                              16.913    0.774
    byfaminc          0.131    0.008   15.828    0.000    2.211    0.808
    bypared           0.066    0.005   13.734    0.000    1.108    0.829
  prevach =~                                                            
    bytxrstd          1.000                               8.630    0.858
    bytxmstd          0.980    0.049   19.853    0.000    8.457    0.856
    bytxsstd          0.977    0.053   18.505    0.000    8.430    0.847
    bytxhstd          0.965    0.052   18.668    0.000    8.330    0.844
  hw =~                                                                 
    hw12              1.000                               1.276    0.669
    hw10              0.963    0.148    6.496    0.000    1.228    0.714
  grades =~                                                             
    eng92             1.000                               2.466    0.915
    math92            0.853    0.043   19.977    0.000    2.105    0.798
    sci92             0.959    0.042   23.062    0.000    2.366    0.870
    soc92             1.044    0.034   31.117    0.000    2.574    0.917

Regressions:
                   Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
  prevach ~                                                             
    famback           0.304    0.033    9.233    0.000    0.596    0.596
  grades ~                                                              
    prevach           0.130    0.022    5.923    0.000    0.453    0.453
    hw                0.667    0.194    3.445    0.001    0.345    0.345
  hw ~                                                                  
    prevach           0.044    0.016    2.844    0.004    0.300    0.300
    famback           0.022    0.008    2.749    0.006    0.295    0.295

Covariances:
                   Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
 .bytxrstd ~~                                                           
   .eng92             0.232    0.503    0.462    0.644    0.232    0.042
 .bytxmstd ~~                                                           
   .math92            1.899    0.517    3.670    0.000    1.899    0.234
 .bytxsstd ~~                                                           
   .sci92             0.260    0.611    0.426    0.670    0.260    0.037
 .bytxhstd ~~                                                           
   .soc92             0.204    0.581    0.350    0.726    0.204    0.034

Intercepts:
                   Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
   .parocc           52.406    1.267   41.367    0.000   52.406    2.398
   .byfaminc          9.844    0.159   61.966    0.000    9.844    3.599
   .bypared           3.216    0.077   41.634    0.000    3.216    2.407
   .bytxrstd         51.337    0.584   87.842    0.000   51.337    5.105
   .bytxmstd         52.177    0.575   90.798    0.000   52.177    5.280
   .bytxsstd         51.627    0.579   89.200    0.000   51.627    5.189
   .bytxhstd         51.796    0.575   90.130    0.000   51.796    5.245
   .hw12              3.115    0.116   26.946    0.000    3.115    1.634
   .hw10              2.536    0.101   25.024    0.000    2.536    1.474
   .eng92             6.130    0.156   39.189    0.000    6.130    2.276
   .math92            5.534    0.153   36.156    0.000    5.534    2.100
   .sci92             5.846    0.158   37.113    0.000    5.846    2.149
   .soc92             6.283    0.163   38.656    0.000    6.283    2.238

Variances:
                   Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
   .parocc          191.591   20.296    9.440    0.000  191.591    0.401
   .byfaminc          2.592    0.347    7.467    0.000    2.592    0.346
   .bypared           0.557    0.074    7.510    0.000    0.557    0.312
   .bytxrstd         26.646    3.670    7.261    0.000   26.646    0.264
   .bytxmstd         26.119    2.726    9.581    0.000   26.119    0.267
   .bytxsstd         27.924    3.161    8.834    0.000   27.924    0.282
   .bytxhstd         28.116    3.469    8.105    0.000   28.116    0.288
   .hw12              2.008    0.287    7.004    0.000    2.008    0.552
   .hw10              1.453    0.273    5.330    0.000    1.453    0.491
   .eng92             1.175    0.147    7.971    0.000    1.175    0.162
   .math92            2.519    0.230   10.975    0.000    2.519    0.363
   .sci92             1.800    0.192    9.397    0.000    1.800    0.243
   .soc92             1.257    0.183    6.879    0.000    1.257    0.159
    famback         286.051   32.782    8.726    0.000    1.000    1.000
   .prevach          48.024    5.785    8.302    0.000    0.645    0.645
   .hw                1.168    0.305    3.828    0.000    0.718    0.718
   .grades            3.202    0.386    8.302    0.000    0.526    0.526

Imputation

  • Use sem.mi(), the lavaan wrapper with built-in multiple imputation provided by semTools
    • MICE generates several data sets with the missing values imputed from regression models; the model is fit to each data set and the results are pooled across imputations (Rubin's rules; see the formulas below)
  • Alternatively, machine-learning algorithms can fill in the missing values directly to create a single completed data set, which is then analyzed.
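
For reference (my addition), Rubin's (1987) rules pool the \(m\) imputation-specific estimates as \(\bar{\theta} = \frac{1}{m}\sum_{j=1}^{m}\hat{\theta}_j\), with total sampling variance \(T = \bar{U} + \left(1 + \frac{1}{m}\right)B\), where \(\bar{U}\) is the average within-imputation variance and \(B = \frac{1}{m-1}\sum_{j=1}^{m}(\hat{\theta}_j - \bar{\theta})^2\) is the between-imputation variance.
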
Load data
hw_mean <- haven::read_sav("data/chap 19 latent means/Homework means.sav")
hw_mean <- hw_mean |> 
  rename(hw10 = f1s36a2, hw12 = f2s25f2) |> 
  labelled::remove_labels()
set.seed(123)
hw_mean <- hw_mean |> sample_n(300)
hw_model <- "
  famback =~ parocc + byfaminc + bypared
  prevach =~ bytxrstd + bytxmstd + bytxsstd + bytxhstd
  hw =~ hw12 + hw10
  grades =~ eng92 + math92 + sci92 + soc92

  bytxrstd ~~ eng92
  bytxmstd ~~ math92
  bytxsstd ~~ sci92
  bytxhstd ~~ soc92

  prevach ~ famback
  grades ~ prevach + hw
  hw ~ prevach + famback
"
pred_mat <- mice::quickpred(hw_mean, mincor = 0.25)  # use only predictors correlated at r > 0.25
sem_fit <- sem.mi(hw_model,
                  data = hw_mean,
                  estimator = "MLM",
                  miPackage = "mice", m = 10, seed = 123,  # mice 패키지 사용
                  miArgs = list(predictorMatrix = pred_mat)
)
summary(sem_fit, standardized = TRUE, fit.measures = TRUE)
lavaan.mi object based on 10 imputed data sets. 
See class?lavaan.mi help page for available methods. 

Convergence information:
The model converged on 10 imputed data sets 

Rubin's (1987) rules were used to pool point and SE estimates across 10 imputed data sets, and to calculate degrees of freedom for each parameter's t test and CI.

Model Test User Model:

                                              Standard      Scaled
  Test statistic                                66.798      66.063
  Degrees of freedom                                56          56
  P-value                                        0.153       0.168
  Average scaling correction factor                          1.011

Model Test Baseline Model:

  Test statistic                              2483.002    2485.266
  Degrees of freedom                                78          78
  P-value                                        0.000       0.000
  Scaling correction factor                                  0.999

User Model versus Baseline Model:

  Comparative Fit Index (CFI)                    0.996       0.996
  Tucker-Lewis Index (TLI)                       0.994       0.994
                                                                  
  Robust Comparative Fit Index (CFI)                         0.996
  Robust Tucker-Lewis Index (TLI)                            1.000

Root Mean Square Error of Approximation:

  RMSEA                                          0.025       0.024
  Confidence interval - lower                    0.000       0.000
  Confidence interval - upper                    0.046       0.000
  P-value H_0: RMSEA <= 0.05                     0.979       0.982
                                                                  
  Robust RMSEA                                               0.025
  Confidence interval - lower                                0.000
  Confidence interval - upper                                0.046

Standardized Root Mean Square Residual:

  SRMR                                           0.032       0.032
A lavaan.parameterEstimates: 39 x 12
lhs op rhs exo est se t df pvalue std.lv std.all label
<chr> <chr> <chr> <int> <lvn.vctr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
famback =~ parocc 0 1.00000000 0.000000000 NA NA NA 16.9496168 0.77384914
famback =~ byfaminc 0 0.13121803 0.008381035 15.6565414 Inf 2.997617e-55 2.2240953 0.81112072
famback =~ bypared 0 0.06524749 0.004744518 13.7521842 Inf 4.941480e-43 1.1059200 0.82771626
prevach =~ bytxrstd 0 1.00000000 0.000000000 NA NA NA 8.5903647 0.85607162
prevach =~ bytxmstd 0 0.98568829 0.049618611 19.8652939 Inf 8.127497e-88 8.4674219 0.85759578
prevach =~ bytxsstd 0 0.98072154 0.051524979 19.0339045 3273.3990 1.072947e-76 8.4247557 0.84712100
prevach =~ bytxhstd 0 0.96983846 0.051803967 18.7213163 2329.1752 5.767646e-73 8.3312661 0.84313210
hw =~ hw12 0 1.00000000 0.000000000 NA NA NA 1.2995948 0.68696647
hw =~ hw10 0 0.92385023 0.138226158 6.6836136 476.4311 6.531768e-11 1.2006310 0.70818514
grades =~ eng92 0 1.00000000 0.000000000 NA NA NA 2.4578334 0.91456192
grades =~ math92 0 0.85572904 0.042354997 20.2037326 5076.3862 2.281209e-87 2.1032395 0.79788580
grades =~ sci92 0 0.96374024 0.041243002 23.3673639 Inf 9.179699e-121 2.3687130 0.87133356
grades =~ soc92 0 1.04663090 0.036653908 28.5544150 4510.0989 5.415972e-165 2.5724444 0.91650259
bytxrstd ~~ eng92 0 0.20905253 0.517532041 0.4039412 Inf 6.862559e-01 0.2090525 0.03708374
bytxmstd ~~ math92 0 1.92330898 0.522066220 3.6840326 7901.0927 2.311070e-04 1.9233090 0.23835047
bytxsstd ~~ sci92 0 0.18129467 0.612794899 0.2958489 9042.6888 7.673523e-01 0.1812947 0.02571671
bytxhstd ~~ soc92 0 0.20708355 0.588586907 0.3518317 Inf 7.249645e-01 0.2070836 0.03471214
prevach ~ famback 0 0.30285173 0.031864614 9.5043275 2720.9412 4.261562e-21 0.5975556 0.59755562
grades ~ prevach 0 0.13345819 0.019806063 6.7382494 675.5309 3.436673e-11 0.4664492 0.46644925
grades ~ hw 0 0.61750612 0.161308353 3.8281100 418.0890 1.488829e-04 0.3265102 0.32651023
hw ~ prevach 0 0.04578762 0.015843193 2.8900500 4097.4572 3.872039e-03 0.3026577 0.30265768
hw ~ famback 0 0.02072202 0.007854752 2.6381509 1204.1109 8.443422e-03 0.2702614 0.27026141
parocc ~~ parocc 0 192.45185043 20.479141566 9.3974569 Inf 5.589774e-21 192.4518504 0.40115751
byfaminc ~~ byfaminc 0 2.57197952 0.339150592 7.5835914 6823.9857 3.807425e-14 2.5719795 0.34208318
bypared ~~ bypared 0 0.56213098 0.072652460 7.7372601 6435.6708 1.171931e-14 0.5621310 0.31488580
bytxrstd ~~ bytxrstd 0 26.89949774 3.649960535 7.3698051 4997.6575 1.989032e-13 26.8994977 0.26714138
bytxmstd ~~ bytxmstd 0 25.78761692 2.710712342 9.5132252 Inf 1.848411e-21 25.7876169 0.26452948
bytxsstd ~~ bytxsstd 0 27.92974234 3.170372671 8.8096086 Inf 1.255836e-18 27.9297423 0.28238602
bytxhstd ~~ bytxhstd 0 28.23067795 3.475072847 8.1237658 Inf 4.519374e-16 28.2306780 0.28912826
hw12 ~~ hw12 0 1.88991456 0.285407615 6.6218085 4492.9683 3.965259e-11 1.8899146 0.52807707
hw10 ~~ hw10 0 1.43274138 0.240963868 5.9458764 584.2203 4.733964e-09 1.4327414 0.49847381
eng92 ~~ eng92 0 1.18140710 0.154246192 7.6592302 8832.9652 2.067767e-14 1.1814071 0.16357650
math92 ~~ math92 0 2.52496228 0.228097781 11.0696486 4517.9871 4.036156e-28 2.5249623 0.36337825
sci92 ~~ sci92 0 1.77939551 0.188242981 9.4526527 3875.5226 5.548686e-21 1.7793955 0.24077782
soc92 ~~ soc92 0 1.26068628 0.190860744 6.6052676 Inf 3.968001e-11 1.2606863 0.16002300
famback ~~ famback 0 287.28951040 33.319558160 8.6222485 Inf 6.565204e-18 1.0000000 1.00000000
prevach ~~ prevach 0 47.44441120 5.781226901 8.2066336 Inf 2.274763e-16 0.6429273 0.64292728
hw ~~ hw 0 1.24576869 0.298623041 4.1717099 2183.9853 3.141502e-05 0.7376010 0.73760096
grades ~~ grades 0 3.22848941 0.382616586 8.4379233 Inf 3.230255e-17 0.5344345 0.53443448


Using a data set whose missing values have been filled in by kNN (k-nearest neighbors) imputation:

hw_mean_imp <- VIM::kNN(hw_mean, k = 5) # kNN imputation
sem_fit <- sem(hw_model,
  data = hw_mean_imp,
  estimator = "MLMVS"
)
summary(sem_fit, fit.measures = TRUE, standardized = TRUE) |> print()
lavaan 0.6-19 ended normally after 141 iterations

  Estimator                                         ML
  Optimization method                           NLMINB
  Number of model parameters                        35

  Number of observations                           300

Model Test User Model:
                                              Standard      Scaled
  Test Statistic                                62.574      46.436
  Degrees of freedom                                56      42.363
  P-value (Chi-square)                           0.254       0.308
  Scaling correction factor                                  1.348
    mean and variance adjusted correction                         

Model Test Baseline Model:

  Test statistic                              2607.768     303.657
  Degrees of freedom                                78       9.014
  P-value                                        0.000       0.000
  Scaling correction factor                                  8.588

User Model versus Baseline Model:

  Comparative Fit Index (CFI)                    0.997       0.986
  Tucker-Lewis Index (TLI)                       0.996       0.997

Loglikelihood and Information Criteria:

  Loglikelihood user model (H0)              -9861.267   -9861.267
  Loglikelihood unrestricted model (H1)      -9829.980   -9829.980
                                                                  
  Akaike (AIC)                               19792.534   19792.534
  Bayesian (BIC)                             19922.167   19922.167
  Sample-size adjusted Bayesian (SABIC)      19811.167   19811.167

Root Mean Square Error of Approximation:

  RMSEA                                          0.020       0.018
  90 Percent confidence interval - lower         0.000       0.000
  90 Percent confidence interval - upper         0.042       0.041
  P-value H_0: RMSEA <= 0.050                    0.991       0.993
  P-value H_0: RMSEA >= 0.080                    0.000       0.000

Standardized Root Mean Square Residual:

  SRMR                                           0.029       0.029

Parameter Estimates:

  Standard errors                           Robust.sem
  Information                                 Expected
  Information saturated (h1) model          Structured

Latent Variables:
                   Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
  famback =~                                                            
    parocc            1.000                              16.819    0.770
    byfaminc          0.130    0.008   15.666    0.000    2.194    0.810
    bypared           0.065    0.005   13.767    0.000    1.101    0.825
  prevach =~                                                            
    bytxrstd          1.000                               8.606    0.859
    bytxmstd          0.982    0.048   20.520    0.000    8.451    0.857
    bytxsstd          0.977    0.050   19.553    0.000    8.405    0.849
    bytxhstd          0.971    0.050   19.365    0.000    8.354    0.846
  hw =~                                                                 
    hw12              1.000                               1.300    0.693
    hw10              0.856    0.122    6.995    0.000    1.114    0.665
  grades =~                                                             
    eng92             1.000                               2.428    0.908
    math92            0.859    0.043   19.830    0.000    2.084    0.792
    sci92             0.968    0.042   23.301    0.000    2.349    0.869
    soc92             1.055    0.038   27.759    0.000    2.561    0.915

Regressions:
                   Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
  prevach ~                                                             
    famback           0.303    0.030    9.964    0.000    0.592    0.592
  grades ~                                                              
    prevach           0.124    0.020    6.093    0.000    0.439    0.439
    hw                0.649    0.166    3.912    0.000    0.348    0.348
  hw ~                                                                  
    prevach           0.049    0.015    3.199    0.001    0.323    0.323
    famback           0.024    0.008    3.054    0.002    0.307    0.307

Covariances:
                   Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
 .bytxrstd ~~                                                           
   .eng92             0.261    0.506    0.516    0.606    0.261    0.046
 .bytxmstd ~~                                                           
   .math92            1.951    0.506    3.854    0.000    1.951    0.238
 .bytxsstd ~~                                                           
   .sci92             0.214    0.596    0.360    0.719    0.214    0.031
 .bytxhstd ~~                                                           
   .soc92             0.250    0.575    0.434    0.664    0.250    0.042

Variances:
                   Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
   .parocc          193.835   20.125    9.632    0.000  193.835    0.407
   .byfaminc          2.517    0.325    7.748    0.000    2.517    0.343
   .bypared           0.570    0.070    8.107    0.000    0.570    0.320
   .bytxrstd         26.309    3.520    7.474    0.000   26.309    0.262
   .bytxmstd         25.862    2.691    9.612    0.000   25.862    0.266
   .bytxsstd         27.376    3.071    8.915    0.000   27.376    0.279
   .bytxhstd         27.760    3.364    8.251    0.000   27.760    0.285
   .hw12              1.826    0.274    6.656    0.000    1.826    0.519
   .hw10              1.561    0.229    6.806    0.000    1.561    0.557
   .eng92             1.251    0.155    8.056    0.000    1.251    0.175
   .math92            2.590    0.237   10.925    0.000    2.590    0.373
   .sci92             1.787    0.185    9.679    0.000    1.787    0.245
   .soc92             1.273    0.190    6.706    0.000    1.273    0.163
    famback         282.885   32.385    8.735    0.000    1.000    1.000
   .prevach          48.061    5.582    8.609    0.000    0.649    0.649
   .hw                1.156    0.282    4.103    0.000    0.684    0.684
   .grades            3.138    0.366    8.576    0.000    0.532    0.532

Linearity

trendlines <- function(data, mapping, ...){
  ggplot(data = data, mapping = mapping) + 
  geom_smooth(method = loess, se = FALSE, color = "orange", ...)
}
hw_mean |> select(-Female, -ethnic) |> 
  GGally::ggpairs(lower = list(continuous = trendlines))

library(car)
mod <- lm(eng92 ~ hw10, data=hw_mean)
crPlots(mod)

mod <- lm(eng92 ~ parocc, data=hw_mean)
crPlots(mod)