Alex J. Bowers
525 West 120th, Box 67
New York, New York 10027-6696
Bowers@tc.edu
This project was supported through a generous grant from the National Science Foundation (NSF IIS-1546653): http://www.nsf.gov/awardsearch/showAward?AWD_ID=1546653
Any opinions, findings, and conclusions or recommendations are those of the author and do not necessarily reflect the views of funding agencies
I load the library pROC used to calculate and compare AUC and the library OptimalCutpoints used to calculate optimal cutpoints of largest AUC. I load the data with education outcome variables, such as dropout and college enrollment, as well as predictors, such as math t score and suspension. Finally, I select math t score as a predictor of high school dropout after comparing the AUC’s of reading t score and math t score.
# Load pROC (ROC curves, AUC, DeLong tests) and OptimalCutpoints
# (AUC-based optimal cutoff selection).
library(pROC)
library(OptimalCutpoints) #cutoff point

# Read the analysis data set (one row per ELS:2002 student).
# read.csv() already returns a data.frame, so the original as.data.frame()
# wrapper was redundant and has been dropped.
# NOTE(review): absolute, user-specific path — prefer a relative path so the
# script is portable across machines.
ROC <- read.csv(file = '/Users/zhouxiaoliang/Downloads/ROC.csv')

# Dichotomize the 2002 math t score: 1 if below the 44.065 cutoff
# (first-quartile flag), 0 otherwise. NA scores stay NA.
#Math t score
ROC$BYTXMSTD_01 <- ifelse(ROC$BYTXMSTD < 44.065, 1, 0)
# Same dichotomization for the 2002 reading t score at 43.64.
#Reading t score
ROC$BYTXRSTD_01 <- ifelse(ROC$BYTXRSTD < 43.64, 1, 0)

# Plot ROC curves for the two dichotomized scores as predictors of ever
# dropping out (F2EVERDO), then compare their AUCs with DeLong's test.
#compare AUC for math and reading scores
roc1 <- roc(ROC$F2EVERDO, ROC$BYTXMSTD_01, legacy.axes = TRUE, asp = FALSE,
            plot = TRUE, grid = FALSE, lty = 1, xaxs = "i", yaxs = "i",
            main = "ROC curves for predicting dropout: Comparison")
roc2 <- roc(ROC$F2EVERDO, ROC$BYTXRSTD_01,
            plot = TRUE, add = TRUE, percent = roc1$percent, lty = 2)
legend("bottomright",
       legend = c("Math t score (2002)","Reading t score (2002)"),
       lty = c(1,2), lwd = 2)
# DeLong's test for two correlated ROC curves; the recorded output below
# shows math with the larger AUC (0.647 vs 0.636).
roc.test(roc1, roc2, method="delong") #select math t score with bigger AUC
##
## DeLong's test for two correlated ROC curves
##
## data: roc1 and roc2
## Z = 1.7462, p-value = 0.08078
## alternative hypothesis: true difference in AUC is not equal to 0
## sample estimates:
## AUC of roc1 AUC of roc2
## 0.6472583 0.6363775
I calculate the statistics for the outcome variables and predictors used in the paper. The statistics appear in Appendix A: Variable Descriptive Statistics and Labels.
# High school dropout with dichotomous predictors
# Pattern for each variable below: drop NAs, then print mean (= proportion
# for 0/1 flags), SD, and non-missing N. The '## [1] ...' lines are the
# recorded output of those prints.
mean(ROC$F2EVERDO); sd(ROC$F2EVERDO); length(ROC$F2EVERDO) # Dropout
## [1] 0.1129839
## [1] 0.3165829
## [1] 16197
BYP52E=na.omit(ROC$BYP52E_01); mean(BYP52E); sd(BYP52E); length(BYP52E); # Absent
## [1] 0.1295784
## [1] 0.3358527
## [1] 12286
BYS24F=na.omit(ROC$BYS24F_01); mean(BYS24F); sd(BYS24F); length(BYS24F); # Suspension
## [1] 0.08068527
## [1] 0.2723606
## [1] 14476
BYP51=na.omit(ROC$BYP51); mean(BYP51); sd(BYP51); length(BYP51) # Misbehavior
## [1] 0.07248936
## [1] 0.2593069
## [1] 12457
flag_00=na.omit(ROC$flag_00); mean(flag_00); sd(flag_00); length(flag_00) # One or more flags
## [1] 0.3708094
## [1] 0.4830364
## [1] 16197
flag_01=na.omit(ROC$flag_01); mean(flag_01); sd(flag_01); length(flag_01) # Any one flag
## [1] 0.4446912
## [1] 0.4969499
## [1] 13506
flag_02=na.omit(ROC$flag_02); mean(flag_02); sd(flag_02); length(flag_02) # Any two flags
## [1] 0.1241418
## [1] 0.3297566
## [1] 12381
flag_03=na.omit(ROC$flag_03); mean(flag_03); sd(flag_03); length(flag_03) # Any three flags
## [1] 0.02458958
## [1] 0.1548762
## [1] 13949
flag_04=na.omit(ROC$flag_04); mean(flag_04); sd(flag_04); length(flag_04) # Any four flags
## [1] 0.003530167
## [1] 0.05931215
## [1] 15580
# Min/max also reported for count-valued variables.
BYS42=na.omit(ROC$BYS42); mean(BYS42); sd(BYS42); length(BYS42); min(BYS42); max(BYS42) # Extrac. activities (2002)
## [1] 4.773294
## [1] 5.699724
## [1] 14446
## [1] 0
## [1] 21
BYTXMSTD_01=na.omit(ROC$BYTXMSTD_01); mean(BYTXMSTD_01); sd(BYTXMSTD_01); length(BYTXMSTD_01); # 1st quartile Math t score (2002)
## [1] 0.25
## [1] 0.4330263
## [1] 15892
# High school dropout with continuous predictors
BYTXRSTD=na.omit(ROC$BYTXRSTD); mean(BYTXRSTD); sd(BYTXRSTD); length(BYTXRSTD) # Reading t score
## [1] 50.52618
## [1] 9.88539
## [1] 15892
BYTXMSTD=na.omit(ROC$BYTXMSTD); mean(BYTXMSTD); sd(BYTXMSTD); length(BYTXMSTD) # Math t score (2002)
## [1] 50.71016
## [1] 9.912398
## [1] 15892
# College enrollment
F2PS0601=na.omit(ROC$F2PS0601); mean(F2PS0601); sd(F2PS0601); length(F2PS0601); # College enrollment
## [1] 0.5456189
## [1] 0.4979383
## [1] 10511
F1RGPP2=na.omit(ROC$F1RGPP2); mean(F1RGPP2); sd(F1RGPP2); length(F1RGPP2);min(F1RGPP2); max(F1RGPP2) # GPA
## [1] 3.911868
## [1] 1.542702
## [1] 14796
## [1] 0
## [1] 6
F1S27=na.omit(ROC$F1S27); mean(F1S27); sd(F1S27); length(F1S27) # Extrac. activities (2004)
## [1] 3.181909
## [1] 1.8975
## [1] 14073
BYS33A=na.omit(ROC$BYS33A); mean(BYS33A); sd(BYS33A); length(BYS33A) # AP
## [1] 0.1823497
## [1] 0.3861459
## [1] 14368
# Postsecondary STEM degree
F3TZSTEM1CRED=na.omit(ROC$F3TZSTEM1CRED); mean(F3TZSTEM1CRED); sd(F3TZSTEM1CRED); length(F3TZSTEM1CRED); # Postsecondary STEM degree
## [1] 0.166955
## [1] 0.3729626
## [1] 6936
F3TZSTEM1TOT=na.omit(ROC$F3TZSTEM1TOT); mean(F3TZSTEM1TOT); sd(F3TZSTEM1TOT); length(F3TZSTEM1TOT); min(F3TZSTEM1TOT); max(F3TZSTEM1TOT) # Number of STEM Courses
## [1] 9.257192
## [1] 10.39643
## [1] 11540
## [1] 0
## [1] 56
F3TZSTEM2GPA=na.omit(ROC$F3TZSTEM2GPA); mean(F3TZSTEM2GPA); sd(F3TZSTEM2GPA); length(F3TZSTEM2GPA); min(F3TZSTEM2GPA); max(F3TZSTEM2GPA) # STEM course GPA
## [1] 2.585565
## [1] 0.9375899
## [1] 10755
## [1] 0
## [1] 4
# NOTE(review): F1S27 is recomputed here (min/max added, length dropped);
# this duplicates the "Extrac. activities (2004)" calculation above.
F1S27=na.omit(ROC$F1S27); mean(F1S27); sd(F1S27); min(F1S27); max(F1S27) #Extracurricular activities (2004)
## [1] 3.181909
## [1] 1.8975
## [1] 1
## [1] 8
# Hard/soft STEM career
STEMH=na.omit(ROC$STEMH); mean(STEMH); sd(STEMH); length(STEMH); # Hard STEM career
## [1] 0.06322288
## [1] 0.2433729
## [1] 12796
STEMS=na.omit(ROC$STEMS); mean(STEMS); sd(STEMS); length(STEMS); # Soft STEM career
## [1] 0.07799312
## [1] 0.2681712
## [1] 12796
I replicate Balfanz et al. (2007) research on predictors of dropout with ELS:2002 data. The curves in black are for composite flags and those in gray are for raw flags. This analysis examines the accuracy of Balfanz et al.’s dichotomous predictors of high school dropout. Results appear in Figure 2A.
# Figure 2A. ROC for predicting dropout, with Balfanz et al.'s flags as predictors
# Gray curves: individual raw flags; default-color curves: composite flags.
# All curves share the first plot's axes via percent = roc1$percent.
roc1 <- roc(ROC$F2EVERDO, ROC$BYP52E_01, legacy.axes = TRUE, asp = FALSE,
            main = "ROC curves for predicting dropout, with Balfanz et al.'s flags as predictors",
            plot = TRUE, grid = FALSE, lty = 5, xaxs = "i", yaxs = "i",
            col = 'gray')
roc2 <- roc(ROC$F2EVERDO, ROC$BYTXMSTD_01, plot = TRUE, add = TRUE,
            percent = roc1$percent, lty = 2, col = 'gray')
roc3 <- roc(ROC$F2EVERDO, ROC$BYS24F_01, plot = TRUE, add = TRUE,
            percent = roc1$percent, lty = 3, col = 'gray')
roc4 <- roc(ROC$F2EVERDO, ROC$BYP51, plot = TRUE, add = TRUE,
            percent = roc1$percent, lty = 4, col = 'gray')
roc5 <- roc(ROC$F2EVERDO, ROC$flag_00, plot = TRUE, add = TRUE,
            percent = roc1$percent, lty = 6)
roc6 <- roc(ROC$F2EVERDO, ROC$flag_01, plot = TRUE, add = TRUE,
            percent = roc1$percent, lty = 1)
roc7 <- roc(ROC$F2EVERDO, ROC$flag_02, plot = TRUE, add = TRUE,
            percent = roc1$percent, lty = 18)
roc8 <- roc(ROC$F2EVERDO, ROC$flag_03, plot = TRUE, add = TRUE,
            percent = roc1$percent, lty = 8)
roc9 <- roc(ROC$F2EVERDO, ROC$flag_04, plot = TRUE, add = TRUE,
            percent = roc1$percent, lty = 9)
legend("bottomright",
       legend = c("Absent","Math t score (2002)","Suspension","Misbehavior",
                  "One or more flags","Any one flag","Any two flags",
                  "Any three flags","Any four flags"),
       lty = c(5,2,3,4,6,1,18,8,9), lwd = 2)
# Auto-print each fitted roc object (call, case/control counts, AUC).
roc1; roc2; roc3; roc4; roc5; roc6; roc7; roc8; roc9;
##
## Call:
## roc.default(response = ROC$F2EVERDO, predictor = ROC$BYP52E_01, plot = TRUE, legacy.axes = TRUE, asp = FALSE, main = "ROC curves for predicting dropout, with Balfanz et al.'s flags as predictors", grid = FALSE, lty = 5, xaxs = "i", yaxs = "i", col = "gray")
##
## Data: ROC$BYP52E_01 in 11108 controls (ROC$F2EVERDO 0) < 1178 cases (ROC$F2EVERDO 1).
## Area under the curve: 0.6434
##
## Call:
## roc.default(response = ROC$F2EVERDO, predictor = ROC$BYTXMSTD_01, percent = roc1$percent, plot = TRUE, add = TRUE, lty = 2, col = "gray")
##
## Data: ROC$BYTXMSTD_01 in 14111 controls (ROC$F2EVERDO 0) < 1781 cases (ROC$F2EVERDO 1).
## Area under the curve: 0.6473
##
## Call:
## roc.default(response = ROC$F2EVERDO, predictor = ROC$BYS24F_01, percent = roc1$percent, plot = TRUE, add = TRUE, lty = 3, col = "gray")
##
## Data: ROC$BYS24F_01 in 12929 controls (ROC$F2EVERDO 0) < 1547 cases (ROC$F2EVERDO 1).
## Area under the curve: 0.5837
##
## Call:
## roc.default(response = ROC$F2EVERDO, predictor = ROC$BYP51, percent = roc1$percent, plot = TRUE, add = TRUE, lty = 4, col = "gray")
##
## Data: ROC$BYP51 in 11259 controls (ROC$F2EVERDO 0) < 1198 cases (ROC$F2EVERDO 1).
## Area under the curve: 0.5934
##
## Call:
## roc.default(response = ROC$F2EVERDO, predictor = ROC$flag_00, percent = roc1$percent, plot = TRUE, add = TRUE, lty = 6)
##
## Data: ROC$flag_00 in 14367 controls (ROC$F2EVERDO 0) < 1830 cases (ROC$F2EVERDO 1).
## Area under the curve: 0.7
##
## Call:
## roc.default(response = ROC$F2EVERDO, predictor = ROC$flag_01, percent = roc1$percent, plot = TRUE, add = TRUE, lty = 1)
##
## Data: ROC$flag_01 in 11943 controls (ROC$F2EVERDO 0) < 1563 cases (ROC$F2EVERDO 1).
## Area under the curve: 0.729
##
## Call:
## roc.default(response = ROC$F2EVERDO, predictor = ROC$flag_02, percent = roc1$percent, plot = TRUE, add = TRUE, lty = 18)
##
## Data: ROC$flag_02 in 11180 controls (ROC$F2EVERDO 0) < 1201 cases (ROC$F2EVERDO 1).
## Area under the curve: 0.6848
##
## Call:
## roc.default(response = ROC$F2EVERDO, predictor = ROC$flag_03, percent = roc1$percent, plot = TRUE, add = TRUE, lty = 8)
##
## Data: ROC$flag_03 in 12630 controls (ROC$F2EVERDO 0) < 1319 cases (ROC$F2EVERDO 1).
## Area under the curve: 0.5505
##
## Call:
## roc.default(response = ROC$F2EVERDO, predictor = ROC$flag_04, percent = roc1$percent, plot = TRUE, add = TRUE, lty = 9)
##
## Data: ROC$flag_04 in 13910 controls (ROC$F2EVERDO 0) < 1670 cases (ROC$F2EVERDO 1).
## Area under the curve: 0.5088
I perform pairwise significance tests on predictors used above. This analysis shows that we can tell whether the AUC’s of two dichotomous predictors of dropout are significantly different, which means that AUC can help researchers to pick predictors in terms of accuracy. Results appear in Appendix B: Significance of AUC Difference for Balfanz et al. (2007) Dropout Predictors.
# DeLong: absent (roc1) vs math-score flag (roc2); recorded Z = -0.97, p = .33 (ns).
roc.test(roc1, roc2, method="delong")
##
## DeLong's test for two correlated ROC curves
##
## data: roc1 and roc2
## Z = -0.97336, p-value = 0.3304
## alternative hypothesis: true difference in AUC is not equal to 0
## sample estimates:
## AUC of roc1 AUC of roc2
## 0.6440119 0.6541790
# DeLong: absent (roc1) vs suspension (roc3); recorded Z = 5.64, p ~ 1.7e-08.
roc.test(roc1, roc3, method="delong")
##
## DeLong's test for two correlated ROC curves
##
## data: roc1 and roc3
## Z = 5.6366, p-value = 1.734e-08
## alternative hypothesis: true difference in AUC is not equal to 0
## sample estimates:
## AUC of roc1 AUC of roc2
## 0.6390450 0.5849602
# DeLong: absent (roc1) vs misbehavior (roc4); recorded Z = 5.63, p ~ 1.8e-08.
roc.test(roc1, roc4, method="delong")
##
## DeLong's test for two correlated ROC curves
##
## data: roc1 and roc4
## Z = 5.6336, p-value = 1.765e-08
## alternative hypothesis: true difference in AUC is not equal to 0
## sample estimates:
## AUC of roc1 AUC of roc2
## 0.6440290 0.5941789
# DeLong: absent (roc1) vs one-or-more flags (roc5); recorded Z = -12.87, p < 2.2e-16.
roc.test(roc1, roc5, method="delong")
##
## DeLong's test for two correlated ROC curves
##
## data: roc1 and roc5
## Z = -12.868, p-value < 2.2e-16
## alternative hypothesis: true difference in AUC is not equal to 0
## sample estimates:
## AUC of roc1 AUC of roc2
## 0.6433530 0.7383564
# DeLong: absent (roc1) vs any one flag (roc6); recorded Z = -12.59, p < 2.2e-16.
roc.test(roc1, roc6, method="delong")
##
## DeLong's test for two correlated ROC curves
##
## data: roc1 and roc6
## Z = -12.59, p-value < 2.2e-16
## alternative hypothesis: true difference in AUC is not equal to 0
## sample estimates:
## AUC of roc1 AUC of roc2
## 0.6443835 0.7384469
# DeLong: absent (roc1) vs any two flags (roc7); recorded Z = -3.69, p = .00022.
roc.test(roc1, roc7, method="delong")
##
## DeLong's test for two correlated ROC curves
##
## data: roc1 and roc7
## Z = -3.6897, p-value = 0.0002246
## alternative hypothesis: true difference in AUC is not equal to 0
## sample estimates:
## AUC of roc1 AUC of roc2
## 0.6459628 0.6719543
# DeLong: absent (roc1) vs any three flags (roc8); recorded Z = 11.15, p < 2.2e-16.
roc.test(roc1, roc8, method="delong")
##
## DeLong's test for two correlated ROC curves
##
## data: roc1 and roc8
## Z = 11.145, p-value < 2.2e-16
## alternative hypothesis: true difference in AUC is not equal to 0
## sample estimates:
## AUC of roc1 AUC of roc2
## 0.6390668 0.5584752
# DeLong: absent (roc1) vs any four flags (roc9); recorded Z = 17.90, p < 2.2e-16.
roc.test(roc1, roc9, method="delong")
##
## DeLong's test for two correlated ROC curves
##
## data: roc1 and roc9
## Z = 17.897, p-value < 2.2e-16
## alternative hypothesis: true difference in AUC is not equal to 0
## sample estimates:
## AUC of roc1 AUC of roc2
## 0.6406746 0.5126746
# DeLong: math-score flag (roc2) vs suspension (roc3); recorded Z = 8.63, p < 2.2e-16.
roc.test(roc2, roc3, method="delong")
##
## DeLong's test for two correlated ROC curves
##
## data: roc2 and roc3
## Z = 8.6277, p-value < 2.2e-16
## alternative hypothesis: true difference in AUC is not equal to 0
## sample estimates:
## AUC of roc1 AUC of roc2
## 0.6553283 0.5836591
# DeLong: math-score flag (roc2) vs misbehavior (roc4); recorded Z = 6.46, p ~ 1.1e-10.
roc.test(roc2, roc4, method="delong")
##
## DeLong's test for two correlated ROC curves
##
## data: roc2 and roc4
## Z = 6.4587, p-value = 1.056e-10
## alternative hypothesis: true difference in AUC is not equal to 0
## sample estimates:
## AUC of roc1 AUC of roc2
## 0.6559904 0.5932166
# DeLong: math-score flag (roc2) vs one-or-more flags (roc5); recorded Z = -11.12, p < 2.2e-16.
roc.test(roc2, roc5, method="delong")
##
## DeLong's test for two correlated ROC curves
##
## data: roc2 and roc5
## Z = -11.121, p-value < 2.2e-16
## alternative hypothesis: true difference in AUC is not equal to 0
## sample estimates:
## AUC of roc1 AUC of roc2
## 0.6472583 0.7023817
# DeLong: math-score flag (roc2) vs any one flag (roc6); recorded Z = -11.10, p < 2.2e-16.
roc.test(roc2, roc6, method="delong")
##
## DeLong's test for two correlated ROC curves
##
## data: roc2 and roc6
## Z = -11.095, p-value < 2.2e-16
## alternative hypothesis: true difference in AUC is not equal to 0
## sample estimates:
## AUC of roc1 AUC of roc2
## 0.6698584 0.7332065
# DeLong: math-score flag (roc2) vs any two flags (roc7); recorded Z = -1.70, p = .089 (ns).
roc.test(roc2, roc7, method="delong")
##
## DeLong's test for two correlated ROC curves
##
## data: roc2 and roc7
## Z = -1.7022, p-value = 0.08873
## alternative hypothesis: true difference in AUC is not equal to 0
## sample estimates:
## AUC of roc1 AUC of roc2
## 0.6697738 0.6828854
# DeLong: math-score flag (roc2) vs any three flags (roc8); recorded Z = 11.99, p < 2.2e-16.
roc.test(roc2, roc8, method="delong")
##
## DeLong's test for two correlated ROC curves
##
## data: roc2 and roc8
## Z = 11.986, p-value < 2.2e-16
## alternative hypothesis: true difference in AUC is not equal to 0
## sample estimates:
## AUC of roc1 AUC of roc2
## 0.6358832 0.5500515
# DeLong: math-score flag (roc2) vs any four flags (roc9); recorded Z = 19.81, p < 2.2e-16.
roc.test(roc2, roc9, method="delong")
##
## DeLong's test for two correlated ROC curves
##
## data: roc2 and roc9
## Z = 19.814, p-value < 2.2e-16
## alternative hypothesis: true difference in AUC is not equal to 0
## sample estimates:
## AUC of roc1 AUC of roc2
## 0.6348865 0.5088618
# DeLong: suspension (roc3) vs misbehavior (roc4); recorded Z = -0.36, p = .72 (ns).
roc.test(roc3, roc4, method="delong")
##
## DeLong's test for two correlated ROC curves
##
## data: roc3 and roc4
## Z = -0.3594, p-value = 0.7193
## alternative hypothesis: true difference in AUC is not equal to 0
## sample estimates:
## AUC of roc1 AUC of roc2
## 0.5851569 0.5880785
# DeLong: suspension (roc3) vs one-or-more flags (roc5); recorded Z = -19.36, p < 2.2e-16.
roc.test(roc3, roc5, method="delong")
##
## DeLong's test for two correlated ROC curves
##
## data: roc3 and roc5
## Z = -19.36, p-value < 2.2e-16
## alternative hypothesis: true difference in AUC is not equal to 0
## sample estimates:
## AUC of roc1 AUC of roc2
## 0.5836591 0.7122007
# DeLong: suspension (roc3) vs any one flag (roc6); recorded Z = -20.24, p < 2.2e-16.
roc.test(roc3, roc6, method="delong")
##
## DeLong's test for two correlated ROC curves
##
## data: roc3 and roc6
## Z = -20.236, p-value < 2.2e-16
## alternative hypothesis: true difference in AUC is not equal to 0
## sample estimates:
## AUC of roc1 AUC of roc2
## 0.5928714 0.7341106
# DeLong: suspension (roc3) vs any two flags (roc7); recorded Z = -10.85, p < 2.2e-16.
roc.test(roc3, roc7, method="delong")
##
## DeLong's test for two correlated ROC curves
##
## data: roc3 and roc7
## Z = -10.846, p-value < 2.2e-16
## alternative hypothesis: true difference in AUC is not equal to 0
## sample estimates:
## AUC of roc1 AUC of roc2
## 0.6037795 0.6777058
# DeLong: suspension (roc3) vs any three flags (roc8); recorded Z = 4.43, p ~ 9.5e-06.
roc.test(roc3, roc8, method="delong")
##
## DeLong's test for two correlated ROC curves
##
## data: roc3 and roc8
## Z = 4.4293, p-value = 9.455e-06
## alternative hypothesis: true difference in AUC is not equal to 0
## sample estimates:
## AUC of roc1 AUC of roc2
## 0.5730985 0.5495565
# DeLong: suspension (roc3) vs any four flags (roc9); recorded Z = 12.03, p < 2.2e-16.
roc.test(roc3, roc9, method="delong")
##
## DeLong's test for two correlated ROC curves
##
## data: roc3 and roc9
## Z = 12.027, p-value < 2.2e-16
## alternative hypothesis: true difference in AUC is not equal to 0
## sample estimates:
## AUC of roc1 AUC of roc2
## 0.5703277 0.5099136
# DeLong: misbehavior (roc4) vs one-or-more flags (roc5); recorded Z = -19.34, p < 2.2e-16.
roc.test(roc4, roc5, method="delong")
##
## DeLong's test for two correlated ROC curves
##
## data: roc4 and roc5
## Z = -19.344, p-value < 2.2e-16
## alternative hypothesis: true difference in AUC is not equal to 0
## sample estimates:
## AUC of roc1 AUC of roc2
## 0.5933506 0.7381237
# DeLong: misbehavior (roc4) vs any one flag (roc6); recorded Z = -19.02, p < 2.2e-16.
roc.test(roc4, roc6, method="delong")
##
## DeLong's test for two correlated ROC curves
##
## data: roc4 and roc6
## Z = -19.018, p-value < 2.2e-16
## alternative hypothesis: true difference in AUC is not equal to 0
## sample estimates:
## AUC of roc1 AUC of roc2
## 0.5942275 0.7378884
# DeLong: misbehavior (roc4) vs any two flags (roc7); recorded Z = -10.75, p < 2.2e-16.
roc.test(roc4, roc7, method="delong")
##
## DeLong's test for two correlated ROC curves
##
## data: roc4 and roc7
## Z = -10.754, p-value < 2.2e-16
## alternative hypothesis: true difference in AUC is not equal to 0
## sample estimates:
## AUC of roc1 AUC of roc2
## 0.5976528 0.6735882
# DeLong: misbehavior (roc4) vs any three flags (roc8); recorded Z = 5.53, p ~ 3.1e-08.
roc.test(roc4, roc8, method="delong")
##
## DeLong's test for two correlated ROC curves
##
## data: roc4 and roc8
## Z = 5.534, p-value = 3.129e-08
## alternative hypothesis: true difference in AUC is not equal to 0
## sample estimates:
## AUC of roc1 AUC of roc2
## 0.5872837 0.5578098
# DeLong: misbehavior (roc4) vs any four flags (roc9); recorded Z = 13.01, p < 2.2e-16.
roc.test(roc4, roc9, method="delong")
##
## DeLong's test for two correlated ROC curves
##
## data: roc4 and roc9
## Z = 13.01, p-value < 2.2e-16
## alternative hypothesis: true difference in AUC is not equal to 0
## sample estimates:
## AUC of roc1 AUC of roc2
## 0.5902130 0.5124571
# DeLong: one-or-more flags (roc5) vs any one flag (roc6); recorded Z = 0, p = 1
# (identical AUCs on the shared non-missing cases).
roc.test(roc5, roc6, method="delong")
##
## DeLong's test for two correlated ROC curves
##
## data: roc5 and roc6
## Z = 0, p-value = 1
## alternative hypothesis: true difference in AUC is not equal to 0
## sample estimates:
## AUC of roc1 AUC of roc2
## 0.7289771 0.7289771
# DeLong: one-or-more flags (roc5) vs any two flags (roc7); recorded Z = 8.11, p ~ 5.2e-16.
roc.test(roc5, roc7, method="delong")
##
## DeLong's test for two correlated ROC curves
##
## data: roc5 and roc7
## Z = 8.1069, p-value = 5.191e-16
## alternative hypothesis: true difference in AUC is not equal to 0
## sample estimates:
## AUC of roc1 AUC of roc2
## 0.7419772 0.6848345
# DeLong: one-or-more flags (roc5) vs any three flags (roc8); recorded Z = 22.04, p < 2.2e-16.
roc.test(roc5, roc8, method="delong")
##
## DeLong's test for two correlated ROC curves
##
## data: roc5 and roc8
## Z = 22.036, p-value < 2.2e-16
## alternative hypothesis: true difference in AUC is not equal to 0
## sample estimates:
## AUC of roc1 AUC of roc2
## 0.7067209 0.5504767
# DeLong: one-or-more flags (roc5) vs any four flags (roc9); recorded Z = 31.38, p < 2.2e-16.
roc.test(roc5, roc9, method="delong")
##
## DeLong's test for two correlated ROC curves
##
## data: roc5 and roc9
## Z = 31.377, p-value < 2.2e-16
## alternative hypothesis: true difference in AUC is not equal to 0
## sample estimates:
## AUC of roc1 AUC of roc2
## 0.6979752 0.5087541
# DeLong: any one flag (roc6) vs any two flags (roc7); recorded Z = 7.65, p ~ 2.0e-14.
roc.test(roc6, roc7, method="delong")
##
## DeLong's test for two correlated ROC curves
##
## data: roc6 and roc7
## Z = 7.6501, p-value = 2.008e-14
## alternative hypothesis: true difference in AUC is not equal to 0
## sample estimates:
## AUC of roc1 AUC of roc2
## 0.7416755 0.6868757
# DeLong: any one flag (roc6) vs any three flags (roc8); recorded Z = 24.16, p < 2.2e-16.
roc.test(roc6, roc8, method="delong")
##
## DeLong's test for two correlated ROC curves
##
## data: roc6 and roc8
## Z = 24.16, p-value < 2.2e-16
## alternative hypothesis: true difference in AUC is not equal to 0
## sample estimates:
## AUC of roc1 AUC of roc2
## 0.7361305 0.5582342
# DeLong: any one flag (roc6) vs any four flags (roc9); recorded Z = 38.94, p < 2.2e-16.
roc.test(roc6, roc9, method="delong")
##
## DeLong's test for two correlated ROC curves
##
## data: roc6 and roc9
## Z = 38.944, p-value < 2.2e-16
## alternative hypothesis: true difference in AUC is not equal to 0
## sample estimates:
## AUC of roc1 AUC of roc2
## 0.7325055 0.5104029
# DeLong: any two flags (roc7) vs any three flags (roc8); recorded Z = 15.18, p < 2.2e-16.
roc.test(roc7, roc8, method="delong")
##
## DeLong's test for two correlated ROC curves
##
## data: roc7 and roc8
## Z = 15.183, p-value < 2.2e-16
## alternative hypothesis: true difference in AUC is not equal to 0
## sample estimates:
## AUC of roc1 AUC of roc2
## 0.6641474 0.5615573
# DeLong: any two flags (roc7) vs any four flags (roc9); recorded Z = 21.32, p < 2.2e-16.
roc.test(roc7, roc9, method="delong")
##
## DeLong's test for two correlated ROC curves
##
## data: roc7 and roc9
## Z = 21.319, p-value < 2.2e-16
## alternative hypothesis: true difference in AUC is not equal to 0
## sample estimates:
## AUC of roc1 AUC of roc2
## 0.6708728 0.5132080
# DeLong: any three flags (roc8) vs any four flags (roc9); recorded Z = 9.24, p < 2.2e-16.
roc.test(roc8, roc9, method="delong")
##
## DeLong's test for two correlated ROC curves
##
## data: roc8 and roc9
## Z = 9.237, p-value < 2.2e-16
## alternative hypothesis: true difference in AUC is not equal to 0
## sample estimates:
## AUC of roc1 AUC of roc2
## 0.5470717 0.5113213
I calculate the AUC’s of continuous variables to predict dropout with ELS:2002 data. This analysis examines whether AUC performs well for continuous variables. Results appear in Figure 2B.
# Figure 2B. ROC for predicting dropout, with continuous predictors
# All curves overlaid on the first plot's axes via percent = roc1$percent.
roc1 <- roc(ROC$F2EVERDO, ROC$BYTXRSTD, legacy.axes = TRUE, asp = FALSE,
            main = "ROC curves for predicting dropout, with continuous predictors",
            plot = TRUE, grid = FALSE, lty = 1, xaxs = "i", yaxs = "i")
roc2 <- roc(ROC$F2EVERDO, ROC$BYTXMSTD, plot = TRUE, add = TRUE,
            percent = roc1$percent, lty = 2)
roc3 <- roc(ROC$F2EVERDO, ROC$BYS42, plot = TRUE, add = TRUE,
            percent = roc1$percent, lty = 3)
roc4 <- roc(ROC$F2EVERDO, ROC$BYP52E, plot = TRUE, add = TRUE,
            percent = roc1$percent, lty = 4)
roc5 <- roc(ROC$F2EVERDO, ROC$BYS24F, plot = TRUE, add = TRUE,
            percent = roc1$percent, lty = 5)
legend("bottomright",
       legend = c("Reading t score (2002)","Math t score (2002)",
                  "Extracurricular activities (2002)","Absent","Suspension"),
       lty = c(1,2,3,4,5), lwd = 2)
# Auto-print each fitted roc object (call, case/control counts, AUC).
roc1; roc2; roc3; roc4; roc5;
##
## Call:
## roc.default(response = ROC$F2EVERDO, predictor = ROC$BYTXRSTD, plot = TRUE, legacy.axes = TRUE, asp = FALSE, main = "ROC curves for predicting dropout, with continuous predictors", grid = FALSE, lty = 1, xaxs = "i", yaxs = "i")
##
## Data: ROC$BYTXRSTD in 14111 controls (ROC$F2EVERDO 0) > 1781 cases (ROC$F2EVERDO 1).
## Area under the curve: 0.706
##
## Call:
## roc.default(response = ROC$F2EVERDO, predictor = ROC$BYTXMSTD, percent = roc1$percent, plot = TRUE, add = TRUE, lty = 2)
##
## Data: ROC$BYTXMSTD in 14111 controls (ROC$F2EVERDO 0) > 1781 cases (ROC$F2EVERDO 1).
## Area under the curve: 0.7217
##
## Call:
## roc.default(response = ROC$F2EVERDO, predictor = ROC$BYS42, percent = roc1$percent, plot = TRUE, add = TRUE, lty = 3)
##
## Data: ROC$BYS42 in 12935 controls (ROC$F2EVERDO 0) > 1511 cases (ROC$F2EVERDO 1).
## Area under the curve: 0.6718
##
## Call:
## roc.default(response = ROC$F2EVERDO, predictor = ROC$BYP52E, percent = roc1$percent, plot = TRUE, add = TRUE, lty = 4)
##
## Data: ROC$BYP52E in 11108 controls (ROC$F2EVERDO 0) < 1178 cases (ROC$F2EVERDO 1).
## Area under the curve: 0.647
##
## Call:
## roc.default(response = ROC$F2EVERDO, predictor = ROC$BYS24F, percent = roc1$percent, plot = TRUE, add = TRUE, lty = 5)
##
## Data: ROC$BYS24F in 12929 controls (ROC$F2EVERDO 0) < 1547 cases (ROC$F2EVERDO 1).
## Area under the curve: 0.5844
I perform pairwise significance tests on continuous predictors used above. This analysis shows that we can tell whether the AUC’s of two continuous predictors of dropout are significantly different, which means that AUC can help researchers to pick continuous predictors in terms of accuracy. Results appear in Table 1: Significance of AUC Difference for Predictors of Continuous Dropout.
# DeLong: reading t score (roc1) vs math t score (roc2); recorded Z = 3.22, p = .0013.
roc.test(roc1, roc2, method="delong")
##
## DeLong's test for two correlated ROC curves
##
## data: roc1 and roc2
## Z = 3.2199, p-value = 0.001282
## alternative hypothesis: true difference in AUC is not equal to 0
## sample estimates:
## AUC of roc1 AUC of roc2
## 0.7060195 0.7216835
# DeLong: reading t score (roc1) vs extracurricular activities (roc3); recorded Z = -3.64, p = .00027.
roc.test(roc1, roc3, method="delong")
##
## DeLong's test for two correlated ROC curves
##
## data: roc1 and roc3
## Z = -3.6406, p-value = 0.000272
## alternative hypothesis: true difference in AUC is not equal to 0
## sample estimates:
## AUC of roc1 AUC of roc2
## 0.7050345 0.6718251
# DeLong: reading t score (roc1) vs absences (roc4); recorded Z = -33.72, p < 2.2e-16.
roc.test(roc1, roc4, method="delong")
##
## DeLong's test for two correlated ROC curves
##
## data: roc1 and roc4
## Z = -33.716, p-value < 2.2e-16
## alternative hypothesis: true difference in AUC is not equal to 0
## sample estimates:
## AUC of roc1 AUC of roc2
## 0.7158917 0.6476846
# DeLong: reading t score (roc1) vs suspensions (roc5); recorded Z = -32.11, p < 2.2e-16.
roc.test(roc1, roc5, method="delong")
##
## DeLong's test for two correlated ROC curves
##
## data: roc1 and roc5
## Z = -32.105, p-value < 2.2e-16
## alternative hypothesis: true difference in AUC is not equal to 0
## sample estimates:
## AUC of roc1 AUC of roc2
## 0.7051809 0.5844164
# DeLong: math t score (roc2) vs extracurricular activities (roc3); recorded Z = -5.74, p ~ 9.3e-09.
roc.test(roc2, roc3, method="delong")
##
## DeLong's test for two correlated ROC curves
##
## data: roc2 and roc3
## Z = -5.7437, p-value = 9.263e-09
## alternative hypothesis: true difference in AUC is not equal to 0
## sample estimates:
## AUC of roc1 AUC of roc2
## 0.7221498 0.6718251
# DeLong: math t score (roc2) vs absences (roc4); recorded Z = -34.19, p < 2.2e-16.
roc.test(roc2, roc4, method="delong")
##
## DeLong's test for two correlated ROC curves
##
## data: roc2 and roc4
## Z = -34.194, p-value < 2.2e-16
## alternative hypothesis: true difference in AUC is not equal to 0
## sample estimates:
## AUC of roc1 AUC of roc2
## 0.7292418 0.6476846
# DeLong: math t score (roc2) vs suspensions (roc5); recorded Z = -34.84, p < 2.2e-16.
roc.test(roc2, roc5, method="delong")
##
## DeLong's test for two correlated ROC curves
##
## data: roc2 and roc5
## Z = -34.84, p-value < 2.2e-16
## alternative hypothesis: true difference in AUC is not equal to 0
## sample estimates:
## AUC of roc1 AUC of roc2
## 0.7230061 0.5844164
# DeLong: extracurricular activities (roc3) vs absences (roc4); recorded Z = -28.76, p < 2.2e-16.
roc.test(roc3, roc4, method="delong")
##
## DeLong's test for two correlated ROC curves
##
## data: roc3 and roc4
## Z = -28.758, p-value < 2.2e-16
## alternative hypothesis: true difference in AUC is not equal to 0
## sample estimates:
## AUC of roc1 AUC of roc2
## 0.6796658 0.6501814
# DeLong: extracurricular activities (roc3) vs suspensions (roc5); recorded Z = -28.18, p < 2.2e-16.
roc.test(roc3, roc5, method="delong")
##
## DeLong's test for two correlated ROC curves
##
## data: roc3 and roc5
## Z = -28.183, p-value < 2.2e-16
## alternative hypothesis: true difference in AUC is not equal to 0
## sample estimates:
## AUC of roc1 AUC of roc2
## 0.6739673 0.5848016
# DeLong: absences (roc4) vs suspensions (roc5); recorded Z = 5.85, p ~ 5.0e-09.
roc.test(roc4, roc5, method="delong")
##
## DeLong's test for two correlated ROC curves
##
## data: roc4 and roc5
## Z = 5.846, p-value = 5.036e-09
## alternative hypothesis: true difference in AUC is not equal to 0
## sample estimates:
## AUC of roc1 AUC of roc2
## 0.6424408 0.5856693
I calculate the AUC’s of variables to predict college enrollment with ELS:2002 data. This analysis examines whether AUC performs well for college enrollment. Results appear in Figure 3A.
# Figure 3A. ROC Curves for Predicting College Enrollment
roc1 <- roc(ROC$F2PS0601, ROC$F1RGPP2, legacy.axes = TRUE, asp = FALSE,
            main = "ROC Curves for Predicting College Enrollment",
            plot = TRUE, grid = FALSE, lty = 1, xaxs = "i", yaxs = "i")
roc2 <- roc(ROC$F2PS0601, ROC$BYS33A, plot = TRUE, add = TRUE,
            percent = roc1$percent, lty = 2)
roc3 <- roc(ROC$F2PS0601, ROC$F1S27, plot = TRUE, add = TRUE,
            percent = roc1$percent, lty = 3)
legend("bottomright",
       legend = c("GPA","AP","Extracurricular activities (2004)"),
       lty = c(1,2,3), lwd = 2)
# Auto-print the fitted roc objects (call, case/control counts, AUC).
roc1; roc2; roc3;
##
## Call:
## roc.default(response = ROC$F2PS0601, predictor = ROC$F1RGPP2, plot = TRUE, legacy.axes = TRUE, asp = FALSE, main = "ROC Curves for Predicting College Enrollment", grid = FALSE, lty = 1, xaxs = "i", yaxs = "i")
##
## Data: ROC$F1RGPP2 in 4417 controls (ROC$F2PS0601 0) < 5357 cases (ROC$F2PS0601 1).
## Area under the curve: 0.7669
##
## Call:
## roc.default(response = ROC$F2PS0601, predictor = ROC$BYS33A, percent = roc1$percent, plot = TRUE, add = TRUE, lty = 2)
##
## Data: ROC$BYS33A in 4320 controls (ROC$F2PS0601 0) < 5206 cases (ROC$F2PS0601 1).
## Area under the curve: 0.5651
##
## Call:
## roc.default(response = ROC$F2PS0601, predictor = ROC$F1S27, percent = roc1$percent, plot = TRUE, add = TRUE, lty = 3)
##
## Data: ROC$F1S27 in 4392 controls (ROC$F2PS0601 0) < 5562 cases (ROC$F2PS0601 1).
## Area under the curve: 0.6424
I perform pairwise significance tests on predictors used above. This analysis shows that we can tell whether the AUC’s of two predictors of college enrollment are significantly different, which means that AUC can help researchers to pick predictors in terms of accuracy. Results appear in Table 2: Significance of AUC Difference for College Enrollment Predictors and STEM Career Predictors.
# DeLong: GPA (roc1) vs AP (roc2); recorded Z = 32.68, p < 2.2e-16.
roc.test(roc1, roc2, method="delong")
##
## DeLong's test for two correlated ROC curves
##
## data: roc1 and roc2
## Z = 32.679, p-value < 2.2e-16
## alternative hypothesis: true difference in AUC is not equal to 0
## sample estimates:
## AUC of roc1 AUC of roc2
## 0.7626899 0.5656768
# DeLong: GPA (roc1) vs extracurricular activities 2004 (roc3); recorded Z = 17.07, p < 2.2e-16.
roc.test(roc1, roc3, method="delong")
##
## DeLong's test for two correlated ROC curves
##
## data: roc1 and roc3
## Z = 17.07, p-value < 2.2e-16
## alternative hypothesis: true difference in AUC is not equal to 0
## sample estimates:
## AUC of roc1 AUC of roc2
## 0.7636363 0.6446794
# DeLong: AP (roc2) vs extracurricular activities 2004 (roc3); recorded Z = -10.94, p < 2.2e-16.
roc.test(roc2, roc3, method="delong")
##
## DeLong's test for two correlated ROC curves
##
## data: roc2 and roc3
## Z = -10.939, p-value < 2.2e-16
## alternative hypothesis: true difference in AUC is not equal to 0
## sample estimates:
## AUC of roc1 AUC of roc2
## 0.5647999 0.6407896
I calculate and compare the AUC’s of continuous variables to predict postsecondary STEM degree with ELS:2002 data. This analysis examines whether AUC performs well for continuous variables. Results appear in Figure 3B.
#Figure 3B. ROC Curves for Predicting Postsecondary STEM Degree
roc1 <- roc(ROC$F3TZSTEM1CRED, ROC$F3TZSTEM1TOT, legacy.axes = TRUE,
            asp = FALSE,
            main = "ROC Curves for Predicting Postsecondary STEM Degree",
            plot = TRUE, grid = FALSE, lty = 1, xaxs = "i", yaxs = "i")
roc2 <- roc(ROC$F3TZSTEM1CRED, ROC$F3TZSTEM2GPA, plot = TRUE, add = TRUE,
            percent = roc1$percent, lty = 2)
roc3 <- roc(ROC$F3TZSTEM1CRED, ROC$BYTXMSTD, plot = TRUE, add = TRUE,
            percent = roc1$percent, lty = 3)
legend("bottomright",
       legend = c("Number of STEM Courses","STEM course GPA","Math t score (2002)"),
       lty = c(1,2,3), lwd = 2)
# Auto-print the fitted roc objects (call, case/control counts, AUC).
roc1; roc2; roc3;
##
## Call:
## roc.default(response = ROC$F3TZSTEM1CRED, predictor = ROC$F3TZSTEM1TOT, plot = TRUE, legacy.axes = TRUE, asp = FALSE, main = "ROC Curves for Predicting Postsecondary STEM Degree", grid = FALSE, lty = 1, xaxs = "i", yaxs = "i")
##
## Data: ROC$F3TZSTEM1TOT in 5734 controls (ROC$F3TZSTEM1CRED 0) < 1157 cases (ROC$F3TZSTEM1CRED 1).
## Area under the curve: 0.9568
##
## Call:
## roc.default(response = ROC$F3TZSTEM1CRED, predictor = ROC$F3TZSTEM2GPA, percent = roc1$percent, plot = TRUE, add = TRUE, lty = 2)
##
## Data: ROC$F3TZSTEM2GPA in 5577 controls (ROC$F3TZSTEM1CRED 0) < 1152 cases (ROC$F3TZSTEM1CRED 1).
## Area under the curve: 0.5801
##
## Call:
## roc.default(response = ROC$F3TZSTEM1CRED, predictor = ROC$BYTXMSTD, percent = roc1$percent, plot = TRUE, add = TRUE, lty = 3)
##
## Data: ROC$BYTXMSTD in 5722 controls (ROC$F3TZSTEM1CRED 0) < 1141 cases (ROC$F3TZSTEM1CRED 1).
## Area under the curve: 0.6715
I calculate the optimal number of STEM courses taken to receive a postsecondary STEM degree, considering that number of STEM courses has a high AUC of 0.957. This analysis shows that it is possible to obtain an optimal value for continuous predictors of an education outcome.
# Cost-benefit (CB) optimal cutoff for number of STEM courses (F3TZSTEM1TOT)
# as a classifier of postsecondary STEM degree (F3TZSTEM1CRED).
# NOTE(review): tag.healthy = 1 marks degree earners as the "healthy" class;
# the recorded summary below reports AUC 0.043 (= 1 - 0.957) and a degenerate
# cutoff of 0, which suggests the orientation is inverted — confirm whether
# tag.healthy = 0 was intended.
optimal.cutpoint.CB = optimal.cutpoints(X = F3TZSTEM1TOT ~ F3TZSTEM1CRED, tag.healthy = 1,
methods = "CB", data = ROC, pop.prev = NULL, control = control.cutpoints(), ci.fit = FALSE, conf.level = 0.95, trace = FALSE)
summary(optimal.cutpoint.CB)
##
## Call:
## optimal.cutpoints.formula(X = F3TZSTEM1TOT ~ F3TZSTEM1CRED, tag.healthy = 1,
## methods = "CB", data = ROC, pop.prev = NULL, control = control.cutpoints(),
## ci.fit = FALSE, conf.level = 0.95, trace = FALSE)
##
## Area under the ROC curve (AUC): 0.043 (0.036, 0.05)
##
## CRITERION: CB
## Number of optimal cutoffs: 1
##
## Estimate
## cutoff 0.0000000
## Se 1.0000000
## Sp 0.0000000
## PPV 0.8320998
## NPV NaN
## DLR.Positive 1.0000000
## DLR.Negative NaN
## FP 1157.0000000
## FN 0.0000000
## Optimal criterion 0.2017789
I perform pairwise significance tests on predictors used above. This analysis shows that we can tell whether the AUC’s of two continuous predictors of postsecondary STEM degree are significantly different, which means that AUC can help researchers to pick predictors in terms of accuracy. Results appear in Table 2: Significance of AUC Difference for College Enrollment Predictors and STEM Career Predictors.
# DeLong: number of STEM courses (roc1) vs STEM course GPA (roc2); recorded Z = 39.74, p < 2.2e-16.
roc.test(roc1, roc2, method="delong")
##
## DeLong's test for two correlated ROC curves
##
## data: roc1 and roc2
## Z = 39.743, p-value < 2.2e-16
## alternative hypothesis: true difference in AUC is not equal to 0
## sample estimates:
## AUC of roc1 AUC of roc2
## 0.9588816 0.5801084
# DeLong: number of STEM courses (roc1) vs math t score (roc3); recorded Z = 32.09, p < 2.2e-16.
roc.test(roc1, roc3, method="delong")
##
## DeLong's test for two correlated ROC curves
##
## data: roc1 and roc3
## Z = 32.086, p-value < 2.2e-16
## alternative hypothesis: true difference in AUC is not equal to 0
## sample estimates:
## AUC of roc1 AUC of roc2
## 0.9564770 0.6721962
# DeLong: STEM course GPA (roc2) vs math t score (roc3); recorded Z = -8.65, p < 2.2e-16.
roc.test(roc2, roc3, method="delong")
##
## DeLong's test for two correlated ROC curves
##
## data: roc2 and roc3
## Z = -8.6547, p-value < 2.2e-16
## alternative hypothesis: true difference in AUC is not equal to 0
## sample estimates:
## AUC of roc1 AUC of roc2
## 0.5775748 0.6696404
I calculate the AUC’s of continuous variables to predict hard STEM with ELS:2002 data. This analysis examines whether AUC performs well for continuous variables. Results appear in Figure 4A.
# Figure 4A. ROC Curves for Predicting Hard STEM Career
# Fit and overlay ROC curves for three continuous predictors of a hard
# STEM career (STEMH): number of STEM courses, STEM course GPA, and the
# 2002 math t score. roc1 is drawn first; roc2/roc3 are added to the
# same plot (add = TRUE) and distinguished by line type (lty).
roc1 <- roc(ROC$STEMH, ROC$F3TZSTEM1TOT, legacy.axes = TRUE, asp = FALSE,
            main = "ROC Curves for Predicting Hard STEM Career",
            plot = TRUE, grid = FALSE, lty = 1, xaxs = "i", yaxs = "i")
roc2 <- roc(ROC$STEMH, ROC$F3TZSTEM2GPA,
            plot = TRUE, add = TRUE, percent = roc1$percent, lty = 2)
roc3 <- roc(ROC$STEMH, ROC$BYTXMSTD,
            plot = TRUE, add = TRUE, percent = roc1$percent, lty = 3)
legend("bottomright",
       legend = c("Number of STEM Courses", "STEM course GPA", "Math t score (2002)"),
       lty = c(1, 2, 3), lwd = 2)
# Auto-print each ROC object to report its AUC (output echoed below).
roc1
roc2
roc3
##
## Call:
## roc.default(response = ROC$STEMH, predictor = ROC$F3TZSTEM1TOT, plot = TRUE, legacy.axes = TRUE, asp = FALSE, main = "ROC Curves for Predicting Hard STEM Career", grid = FALSE, lty = 1, xaxs = "i", yaxs = "i")
##
## Data: ROC$F3TZSTEM1TOT in 9518 controls (ROC$STEMH 0) < 750 cases (ROC$STEMH 1).
## Area under the curve: 0.7989
##
## Call:
## roc.default(response = ROC$STEMH, predictor = ROC$F3TZSTEM2GPA, percent = roc1$percent, plot = TRUE, add = TRUE, lty = 2)
##
## Data: ROC$F3TZSTEM2GPA in 8855 controls (ROC$STEMH 0) < 741 cases (ROC$STEMH 1).
## Area under the curve: 0.5929
##
## Call:
## roc.default(response = ROC$STEMH, predictor = ROC$BYTXMSTD, percent = roc1$percent, plot = TRUE, add = TRUE, lty = 3)
##
## Data: ROC$BYTXMSTD in 11821 controls (ROC$STEMH 0) < 798 cases (ROC$STEMH 1).
## Area under the curve: 0.7115
I perform pairwise significance tests on predictors used above. This analysis shows that we can tell whether the AUCs of two continuous predictors of hard STEM are significantly different, which means that AUC can help researchers to pick predictors in terms of accuracy. Results appear in Table 3: Significance of AUC Difference for Soft and Hard STEM Career Predictors.
# Table 3 (hard STEM): pairwise DeLong tests comparing the AUCs of the
# three hard-STEM ROC objects fitted above (roc1, roc2, roc3).
# NOTE(review): pROC appears to label the sample estimates
# "AUC of roc1" / "AUC of roc2" even for the roc3 comparisons -- confirm
# against the pROC roc.test() documentation.
roc.test(roc1, roc2, method="delong")
##
## DeLong's test for two correlated ROC curves
##
## data: roc1 and roc2
## Z = 15.055, p-value < 2.2e-16
## alternative hypothesis: true difference in AUC is not equal to 0
## sample estimates:
## AUC of roc1 AUC of roc2
## 0.7919822 0.5929249
# roc1 vs roc3
roc.test(roc1, roc3, method="delong")
##
## DeLong's test for two correlated ROC curves
##
## data: roc1 and roc3
## Z = 8.9427, p-value < 2.2e-16
## alternative hypothesis: true difference in AUC is not equal to 0
## sample estimates:
## AUC of roc1 AUC of roc2
## 0.7995714 0.6908471
# roc2 vs roc3
roc.test(roc2, roc3, method="delong")
##
## DeLong's test for two correlated ROC curves
##
## data: roc2 and roc3
## Z = -7.4012, p-value = 1.35e-13
## alternative hypothesis: true difference in AUC is not equal to 0
## sample estimates:
## AUC of roc1 AUC of roc2
## 0.5935556 0.6801902
I calculate the AUCs of continuous variables to predict soft STEM with ELS:2002 data. This analysis examines whether AUC performs well for continuous variables. Results appear in Figure 4B.
# Figure 4B. ROC Curves for Predicting Soft STEM Career
# Fit and overlay ROC curves for three continuous predictors of a soft
# STEM career (STEMS): number of STEM courses, STEM course GPA, and the
# 2002 math t score. roc1 is drawn first; roc2/roc3 are added to the
# same plot (add = TRUE) and distinguished by line type (lty).
roc1 <- roc(ROC$STEMS, ROC$F3TZSTEM1TOT, legacy.axes = TRUE, asp = FALSE,
            main = "ROC Curves for Predicting Soft STEM Career",
            plot = TRUE, grid = FALSE, lty = 1, xaxs = "i", yaxs = "i")
roc2 <- roc(ROC$STEMS, ROC$F3TZSTEM2GPA,
            plot = TRUE, add = TRUE, percent = roc1$percent, lty = 2)
roc3 <- roc(ROC$STEMS, ROC$BYTXMSTD,
            plot = TRUE, add = TRUE, percent = roc1$percent, lty = 3)
legend("bottomright",
       legend = c("Number of STEM Courses", "STEM course GPA", "Math t score (2002)"),
       lty = c(1, 2, 3), lwd = 2)
# Auto-print each ROC object to report its AUC (output echoed below).
roc1
roc2
roc3
##
## Call:
## roc.default(response = ROC$STEMS, predictor = ROC$F3TZSTEM1TOT, plot = TRUE, legacy.axes = TRUE, asp = FALSE, main = "ROC Curves for Predicting Soft STEM Career", grid = FALSE, lty = 1, xaxs = "i", yaxs = "i")
##
## Data: ROC$F3TZSTEM1TOT in 9341 controls (ROC$STEMS 0) < 927 cases (ROC$STEMS 1).
## Area under the curve: 0.6926
##
## Call:
## roc.default(response = ROC$STEMS, predictor = ROC$F3TZSTEM2GPA, percent = roc1$percent, plot = TRUE, add = TRUE, lty = 2)
##
## Data: ROC$F3TZSTEM2GPA in 8684 controls (ROC$STEMS 0) < 912 cases (ROC$STEMS 1).
## Area under the curve: 0.6727
##
## Call:
## roc.default(response = ROC$STEMS, predictor = ROC$BYTXMSTD, percent = roc1$percent, plot = TRUE, add = TRUE, lty = 3)
##
## Data: ROC$BYTXMSTD in 11632 controls (ROC$STEMS 0) < 987 cases (ROC$STEMS 1).
## Area under the curve: 0.6106
I perform pairwise significance tests on predictors used above. This analysis shows that we can tell whether the AUCs of two continuous predictors of soft STEM career are significantly different, which means that AUC can help researchers to pick predictors in terms of accuracy. Results appear in Table 3: Significance of AUC Difference for Soft and Hard STEM Career Predictors.
# Table 3 (soft STEM): pairwise DeLong tests comparing the AUCs of the
# three soft-STEM ROC objects fitted above (roc1, roc2, roc3). Note the
# roc1-vs-roc2 difference is not significant here (p = 0.3974).
# NOTE(review): pROC appears to label the sample estimates
# "AUC of roc1" / "AUC of roc2" even for the roc3 comparisons -- confirm
# against the pROC roc.test() documentation.
roc.test(roc1, roc2, method="delong")
##
## DeLong's test for two correlated ROC curves
##
## data: roc1 and roc2
## Z = 0.84633, p-value = 0.3974
## alternative hypothesis: true difference in AUC is not equal to 0
## sample estimates:
## AUC of roc1 AUC of roc2
## 0.6824838 0.6727173
# roc1 vs roc3
roc.test(roc1, roc3, method="delong")
##
## DeLong's test for two correlated ROC curves
##
## data: roc1 and roc3
## Z = 10.523, p-value < 2.2e-16
## alternative hypothesis: true difference in AUC is not equal to 0
## sample estimates:
## AUC of roc1 AUC of roc2
## 0.6913418 0.5726086
# roc2 vs roc3
roc.test(roc2, roc3, method="delong")
##
## DeLong's test for two correlated ROC curves
##
## data: roc2 and roc3
## Z = 11.111, p-value < 2.2e-16
## alternative hypothesis: true difference in AUC is not equal to 0
## sample estimates:
## AUC of roc1 AUC of roc2
## 0.6724861 0.5598639
References:
Balfanz, R., Herzog, L., & Mac Iver, D. J. (2007). Preventing student disengagement and keeping students on the graduation path in urban middle-grades schools: Early identification and effective interventions. Educational Psychologist, 42(4), 223-235. http://www.tandfonline.com/doi/pdf/10.1080/00461520701621079?needAccess=true. DOI: 10.1080/00461520701621079
López-Ratón, M., Rodríguez-Álvarez, M. X., Cadarso-Suárez, C., & Gude-Sampedro, F. (2014). OptimalCutpoints: an R package for selecting optimal cutpoints in diagnostic tests. Journal of Statistical Software, 61(8), 1-36.
Robin, X., Turck, N., Hainard, A., Tiberti, N., Lisacek, F., Sanchez, J. C., & Müller, M. (2011). pROC: an open-source package for R and S+ to analyze and compare ROC curves. BMC Bioinformatics, 12(1), 77. DOI: 10.1186/1471-2105-12-77