Metafile

Authors:

Alex J. Bowers, Xiaoliang Zhou
Teachers College, Columbia University
www.columbia.edu/
www.tc.columbia.edu/
www.tc.columbia.edu/organization-and-leadership/education-leadership/

Correspondence:

Alex J. Bowers
525 West 120th, Box 67
New York, New York 10027-6696
Bowers@tc.edu

NSF grant:

This project was supported through a generous grant from the National Science Foundation (NSF IIS-1546653): http://www.nsf.gov/awardsearch/showAward?AWD_ID=1546653
Any opinions, findings, and conclusions or recommendations are those of the authors and do not necessarily reflect the views of the funding agency.

Libraries and data loading

I load the pROC library, which is used to calculate and compare AUCs, and the OptimalCutpoints library, which is used to calculate the optimal cutpoints associated with the largest AUC. I load the data, which contain education outcome variables, such as dropout and college enrollment, as well as predictors, such as math t score and suspension. Finally, I select the math t score as a predictor of high school dropout after comparing the AUCs of the reading t score and the math t score.

library(pROC)
library(OptimalCutpoints) #cutoff point

# Load the analysis data. The CSV location can be overridden with the
# ROC_CSV_PATH environment variable; when it is unset, the original path is
# used, so existing runs behave as before.
roc_csv_path <- Sys.getenv(
  "ROC_CSV_PATH",
  unset = "/Users/zhouxiaoliang/Downloads/ROC.csv"
)
if (!file.exists(roc_csv_path)) {
  stop("ROC data file not found: ", roc_csv_path, call. = FALSE)
}
# read.csv() already returns a data frame, so no as.data.frame() wrapper is
# needed.
ROC <- read.csv(file = roc_csv_path)

# Dichotomise the t scores: 1 when the score is below the cutoff, 0 otherwise
# (a missing score stays NA).
# Math t score
ROC$BYTXMSTD_01 <- ifelse(ROC$BYTXMSTD < 44.065, 1, 0)
# Reading t score
ROC$BYTXRSTD_01 <- ifelse(ROC$BYTXRSTD < 43.64, 1, 0)

# Compare AUC for math and reading scores.
# roc1 opens the plot (math, solid line); roc2 is overlaid on the same axes
# (reading, dashed line) and reuses roc1's percent setting.
roc1 <- roc(ROC$F2EVERDO, ROC$BYTXMSTD_01, legacy.axes = TRUE, asp = FALSE,
            plot = TRUE, grid = FALSE, lty = 1, xaxs = "i", yaxs = "i",
            main = "ROC curves for predicting dropout: Comparison")
roc2 <- roc(ROC$F2EVERDO, ROC$BYTXRSTD_01,
            plot = TRUE, add = TRUE, percent = roc1$percent, lty = 2)
legend("bottomright",
       legend = c("Math t score (2002)", "Reading t score (2002)"),
       lty = c(1, 2), lwd = 2)

# DeLong's test for the difference between the two AUCs.
roc.test(roc1, roc2, method = "delong") # select math t score with bigger AUC
## 
##  DeLong's test for two correlated ROC curves
## 
## data:  roc1 and roc2
## Z = 1.7462, p-value = 0.08078
## alternative hypothesis: true difference in AUC is not equal to 0
## sample estimates:
## AUC of roc1 AUC of roc2 
##   0.6472583   0.6363775

Descriptive statistics for outcome variables and predictors

I calculate the statistics for the outcome variables and predictors used in the paper. The statistics appear in Appendix A: Variable Descriptive Statistics and Labels.

# High school dropout with dichotomous predictors.
# For each variable: mean, standard deviation, and number of cases. Predictors
# have missing values dropped with na.omit() first; the dropout outcome is
# summarised as-is. Lines beginning with "##" are the recorded console output
# of the statement directly above them.

# Dropout
mean(ROC$F2EVERDO)
## [1] 0.1129839
sd(ROC$F2EVERDO)
## [1] 0.3165829
length(ROC$F2EVERDO)
## [1] 16197

# Absent
BYP52E <- na.omit(ROC$BYP52E_01)
mean(BYP52E)
## [1] 0.1295784
sd(BYP52E)
## [1] 0.3358527
length(BYP52E)
## [1] 12286

# Suspension
BYS24F <- na.omit(ROC$BYS24F_01)
mean(BYS24F)
## [1] 0.08068527
sd(BYS24F)
## [1] 0.2723606
length(BYS24F)
## [1] 14476

# Misbehavior
BYP51 <- na.omit(ROC$BYP51)
mean(BYP51)
## [1] 0.07248936
sd(BYP51)
## [1] 0.2593069
length(BYP51)
## [1] 12457

# One or more flags
flag_00 <- na.omit(ROC$flag_00)
mean(flag_00)
## [1] 0.3708094
sd(flag_00)
## [1] 0.4830364
length(flag_00)
## [1] 16197

# Any one flag
flag_01 <- na.omit(ROC$flag_01)
mean(flag_01)
## [1] 0.4446912
sd(flag_01)
## [1] 0.4969499
length(flag_01)
## [1] 13506

# Any two flags
flag_02 <- na.omit(ROC$flag_02)
mean(flag_02)
## [1] 0.1241418
sd(flag_02)
## [1] 0.3297566
length(flag_02)
## [1] 12381

# Any three flags
flag_03 <- na.omit(ROC$flag_03)
mean(flag_03)
## [1] 0.02458958
sd(flag_03)
## [1] 0.1548762
length(flag_03)
## [1] 13949

# Any four flags
flag_04 <- na.omit(ROC$flag_04)
mean(flag_04)
## [1] 0.003530167
sd(flag_04)
## [1] 0.05931215
length(flag_04)
## [1] 15580

# Extracurricular activities (2002); the range is reported as well
BYS42 <- na.omit(ROC$BYS42)
mean(BYS42)
## [1] 4.773294
sd(BYS42)
## [1] 5.699724
length(BYS42)
## [1] 14446
min(BYS42)
## [1] 0
max(BYS42)
## [1] 21

# Lowest quartile of math t score (2002), as a 0/1 indicator
BYTXMSTD_01 <- na.omit(ROC$BYTXMSTD_01)
mean(BYTXMSTD_01)
## [1] 0.25
sd(BYTXMSTD_01)
## [1] 0.4330263
length(BYTXMSTD_01)
## [1] 15892
# High school dropout with continuous predictors.
# Mean, standard deviation, and number of non-missing cases for each t score.
# Lines beginning with "##" are the recorded console output of the statement
# directly above them.

# Reading t score
BYTXRSTD <- na.omit(ROC$BYTXRSTD)
mean(BYTXRSTD)
## [1] 50.52618
sd(BYTXRSTD)
## [1] 9.88539
length(BYTXRSTD)
## [1] 15892

# Math t score (2002)
BYTXMSTD <- na.omit(ROC$BYTXMSTD)
mean(BYTXMSTD)
## [1] 50.71016
sd(BYTXMSTD)
## [1] 9.912398
length(BYTXMSTD)
## [1] 15892
# College enrollment.
# Mean, standard deviation, and number of non-missing cases for the outcome and
# its predictors (plus the range for GPA). Lines beginning with "##" are the
# recorded console output of the statement directly above them.

# College enrollment
F2PS0601 <- na.omit(ROC$F2PS0601)
mean(F2PS0601)
## [1] 0.5456189
sd(F2PS0601)
## [1] 0.4979383
length(F2PS0601)
## [1] 10511

# GPA
F1RGPP2 <- na.omit(ROC$F1RGPP2)
mean(F1RGPP2)
## [1] 3.911868
sd(F1RGPP2)
## [1] 1.542702
length(F1RGPP2)
## [1] 14796
min(F1RGPP2)
## [1] 0
max(F1RGPP2)
## [1] 6

# Extracurricular activities (2004)
F1S27 <- na.omit(ROC$F1S27)
mean(F1S27)
## [1] 3.181909
sd(F1S27)
## [1] 1.8975
length(F1S27)
## [1] 14073

# AP
BYS33A <- na.omit(ROC$BYS33A)
mean(BYS33A)
## [1] 0.1823497
sd(BYS33A)
## [1] 0.3861459
length(BYS33A)
## [1] 14368
# Postsecondary STEM degree.
# Summary statistics after dropping missing values with na.omit(). Lines
# beginning with "##" are the recorded console output of the statement directly
# above them.

# Postsecondary STEM degree
F3TZSTEM1CRED <- na.omit(ROC$F3TZSTEM1CRED)
mean(F3TZSTEM1CRED)
## [1] 0.166955
sd(F3TZSTEM1CRED)
## [1] 0.3729626
length(F3TZSTEM1CRED)
## [1] 6936

# Number of STEM courses
F3TZSTEM1TOT <- na.omit(ROC$F3TZSTEM1TOT)
mean(F3TZSTEM1TOT)
## [1] 9.257192
sd(F3TZSTEM1TOT)
## [1] 10.39643
length(F3TZSTEM1TOT)
## [1] 11540
min(F3TZSTEM1TOT)
## [1] 0
max(F3TZSTEM1TOT)
## [1] 56

# STEM course GPA
F3TZSTEM2GPA <- na.omit(ROC$F3TZSTEM2GPA)
mean(F3TZSTEM2GPA)
## [1] 2.585565
sd(F3TZSTEM2GPA)
## [1] 0.9375899
length(F3TZSTEM2GPA)
## [1] 10755
min(F3TZSTEM2GPA)
## [1] 0
max(F3TZSTEM2GPA)
## [1] 4

# Extracurricular activities (2004); here the range is reported instead of N
F1S27 <- na.omit(ROC$F1S27)
mean(F1S27)
## [1] 3.181909
sd(F1S27)
## [1] 1.8975
min(F1S27)
## [1] 1
max(F1S27)
## [1] 8
# Hard/soft STEM career.
# Mean, standard deviation, and number of non-missing cases for each outcome.
# Lines beginning with "##" are the recorded console output of the statement
# directly above them.

# Hard STEM career
STEMH <- na.omit(ROC$STEMH)
mean(STEMH)
## [1] 0.06322288
sd(STEMH)
## [1] 0.2433729
length(STEMH)
## [1] 12796

# Soft STEM career
STEMS <- na.omit(ROC$STEMS)
mean(STEMS)
## [1] 0.07799312
sd(STEMS)
## [1] 0.2681712
length(STEMS)
## [1] 12796

ROC Curves for Predicting Dropout

I replicate Balfanz et al.’s (2007) research on predictors of dropout using ELS:2002 data. The curves in black are for composite flags and those in gray are for raw flags. This analysis examines the accuracy of Balfanz et al.’s dichotomous predictors of high school dropout. Results appear in Figure 2A.

# Figure 2A. ROC for predicting dropout, with Balfanz et al.'s flags as predictors
#
# roc1 opens the plot; roc2-roc9 are overlaid on the same axes and reuse roc1's
# percent setting. Raw flags (roc1-roc4) are drawn in gray; composite flags
# (roc5-roc9) use the default plotting colour.
#
# Line types are restricted to the documented integer values 1-6. Larger
# integers are not distinct patterns: the graphics engine wraps them around
# (18 renders like 6, 8 like 2, 9 like 3), which previously made
# "Any two flags" indistinguishable from "One or more flags". Within each
# colour group every curve now has its own line type.
roc1 <- roc(ROC$F2EVERDO, ROC$BYP52E_01, legacy.axes = TRUE, asp = FALSE,
            main = "ROC curves for predicting dropout, with Balfanz et al.'s flags as predictors",
            plot = TRUE, grid = FALSE, lty = 5, xaxs = "i", yaxs = "i",
            col = "gray")
roc2 <- roc(ROC$F2EVERDO, ROC$BYTXMSTD_01,
            plot = TRUE, add = TRUE, percent = roc1$percent, lty = 2,
            col = "gray")
roc3 <- roc(ROC$F2EVERDO, ROC$BYS24F_01,
            plot = TRUE, add = TRUE, percent = roc1$percent, lty = 3,
            col = "gray")
roc4 <- roc(ROC$F2EVERDO, ROC$BYP51,
            plot = TRUE, add = TRUE, percent = roc1$percent, lty = 4,
            col = "gray")
roc5 <- roc(ROC$F2EVERDO, ROC$flag_00,
            plot = TRUE, add = TRUE, percent = roc1$percent, lty = 6)
roc6 <- roc(ROC$F2EVERDO, ROC$flag_01,
            plot = TRUE, add = TRUE, percent = roc1$percent, lty = 1)
# Was lty = 18 (rendered as twodash, same as roc5); dotdash is unused among the
# black curves.
roc7 <- roc(ROC$F2EVERDO, ROC$flag_02,
            plot = TRUE, add = TRUE, percent = roc1$percent, lty = 4)
# Was lty = 8; 2 (dashed) is the pattern that value actually produced.
roc8 <- roc(ROC$F2EVERDO, ROC$flag_03,
            plot = TRUE, add = TRUE, percent = roc1$percent, lty = 2)
# Was lty = 9; 3 (dotted) is the pattern that value actually produced.
roc9 <- roc(ROC$F2EVERDO, ROC$flag_04,
            plot = TRUE, add = TRUE, percent = roc1$percent, lty = 3)
# The legend carries the colour as well as the line type, so gray and black
# curves that share a line type get different keys.
legend("bottomright",
       legend = c("Absent", "Math t score (2002)", "Suspension", "Misbehavior",
                  "One or more flags", "Any one flag", "Any two flags",
                  "Any three flags", "Any four flags"),
       lty = c(5, 2, 3, 4, 6, 1, 4, 2, 3),
       col = c(rep("gray", 4), rep("black", 5)),
       lwd = 2)