Descriptive Statistics for Key Variables

This section presents tables summarizing important variables in our data. The variables for academic performance will be used to predict success in college-level courses and the demographic variables will be used to examine how our predictive models perform for different groups of students.

There are three sets of tables. The first set uses the sample of students who enrolled in college-level English. The second set uses the sample of students who enrolled in college-level math. These two samples are the ones used for the predictive analyses. The third set uses all students in the data; this sample was not used in the predictive analyses.

A Note on Missing Data

In general, missing values were imputed with a value of zero, and an additional 0/1 binary variable was created, which flagged cases that were imputed. For example, if student A had no missing values, but student B had a missing value for high school GPA, the imputed GPA variable would be 0 and the binary flag would be 1 for student B. When including variables in the predictive models, both the imputed variable and the accompanying binary flag were included in the models.

The variables that capture high school course-taking behavior do not contain any missing values, because the 0 value captures both students who did not take the course and students who did not pass the course. In most cases, the high school courses identified in the data were required math courses.

Among Students who Enrolled in College-Level English

Academic Performance

# Academic-performance table for the college-level English sample: the
# statistics for vars1 joined (by variable name) to the "Mean" columns for
# vars1miss, which are relabelled as "Percent Missing".
# NOTE(review): desc_table() is defined elsewhere; the renames and patterns
# below assume its columns are named "<institution>_<statistic>" -- confirm.
# NOTE(review): vars1miss presumably lists the 0/1 missing-value flags, which
# is why their mean is reported as a percent missing -- confirm.
left_join(desc_table(data_eng_cl, vars1, 1, TRUE),
          desc_table(data_eng_cl, vars1miss, 0, TRUE) %>%
            select(name, contains("Mean")) %>%
            rename_with(~str_replace_all(., "_Mean", "_Percent Missing")),
          by = c("name")) %>%
  rename("Category" = name,
         "Measure" = group) %>%
  select(Measure,
         Category,
         # "ASU" and "SAU" are prefixes of "ASUMID" and "SAUTECH", so a plain
         # starts_with() would also pull in the other institution's columns
         # and the order listed here would not be enforced. Anchor the match
         # to the full institution name instead.
         matches("^ASU(_|$)"),
         starts_with("ASUMID"),
         starts_with("CCCUA"),
         starts_with("NWACC"),
         matches("^SAU(_|$)"),
         starts_with("SAUTECH"),
         starts_with("All")) %>%
  # One header row per Measure, then split "<institution>_<statistic>" column
  # names into a two-level header.
  as_grouped_data(groups = "Measure") %>%
  flextable() %>%
  separate_header()

Demographic Background

# Demographic table for the college-level English sample: the "Mean" columns
# for vars2 stacked on top of the "Mean" columns for vars2miss.
# NOTE(review): desc_table() is defined elsewhere; the code below assumes its
# columns are named "<institution>_<statistic>" -- confirm.
bind_rows(desc_table(data_eng_cl, vars2, 1, TRUE) %>%
            select(name, group, contains("Mean")),
          desc_table(data_eng_cl, vars2miss, 0, TRUE) %>%
            select(name, group, contains("Mean"))) %>%
  rename("Category" = name,
         "Subgroup" = group) %>%
  # Only the mean is shown, so drop the suffix and leave the institution name.
  rename_with(~str_remove_all(., "_Mean")) %>%
  arrange(Subgroup) %>%
  select(Subgroup,
         Category,
         # "ASU" and "SAU" are prefixes of "ASUMID" and "SAUTECH", so a plain
         # starts_with() would also pull in the other institution's column
         # and the order listed here would not be enforced. Anchor the match
         # to the full institution name instead.
         matches("^ASU(_|$)"),
         starts_with("ASUMID"),
         starts_with("CCCUA"),
         starts_with("NWACC"),
         matches("^SAU(_|$)"),
         starts_with("SAUTECH"),
         starts_with("All")) %>%
  # One header row per Subgroup.
  as_grouped_data(groups = "Subgroup") %>%
  flextable()

Among Students who Enrolled in College-Level Math

Academic Performance

# Academic-performance table for the college-level math sample: the
# statistics for vars1 joined (by variable name) to the "Mean" columns for
# vars1miss, which are relabelled as "Percent Missing".
# NOTE(review): desc_table() is defined elsewhere; the renames and patterns
# below assume its columns are named "<institution>_<statistic>" -- confirm.
# NOTE(review): vars1miss presumably lists the 0/1 missing-value flags, which
# is why their mean is reported as a percent missing -- confirm.
left_join(desc_table(data_math_cl, vars1, 1, TRUE),
          desc_table(data_math_cl, vars1miss, 0, TRUE) %>%
            select(name, contains("Mean")) %>%
            rename_with(~str_replace_all(., "_Mean", "_Percent Missing")),
          by = c("name")) %>%
  rename("Category" = name,
         "Measure" = group) %>%
  select(Measure,
         Category,
         # "ASU" and "SAU" are prefixes of "ASUMID" and "SAUTECH", so a plain
         # starts_with() would also pull in the other institution's columns
         # and the order listed here would not be enforced. Anchor the match
         # to the full institution name instead.
         matches("^ASU(_|$)"),
         starts_with("ASUMID"),
         starts_with("CCCUA"),
         starts_with("NWACC"),
         matches("^SAU(_|$)"),
         starts_with("SAUTECH"),
         starts_with("All")) %>%
  # One header row per Measure, then split "<institution>_<statistic>" column
  # names into a two-level header.
  as_grouped_data(groups = "Measure") %>%
  flextable() %>%
  separate_header()

Demographic Background

# Demographic table for the college-level math sample: the "Mean" columns
# for vars2 stacked on top of the "Mean" columns for vars2miss.
# NOTE(review): desc_table() is defined elsewhere; the code below assumes its
# columns are named "<institution>_<statistic>" -- confirm.
bind_rows(desc_table(data_math_cl, vars2, 1, TRUE) %>%
            select(name, group, contains("Mean")),
          desc_table(data_math_cl, vars2miss, 0, TRUE) %>%
            select(name, group, contains("Mean"))) %>%
  rename("Category" = name,
         "Subgroup" = group) %>%
  # Only the mean is shown, so drop the suffix and leave the institution name.
  rename_with(~str_remove_all(., "_Mean")) %>%
  arrange(Subgroup) %>%
  select(Subgroup,
         Category,
         # "ASU" and "SAU" are prefixes of "ASUMID" and "SAUTECH", so a plain
         # starts_with() would also pull in the other institution's column
         # and the order listed here would not be enforced. Anchor the match
         # to the full institution name instead.
         matches("^ASU(_|$)"),
         starts_with("ASUMID"),
         starts_with("CCCUA"),
         starts_with("NWACC"),
         matches("^SAU(_|$)"),
         starts_with("SAUTECH"),
         starts_with("All")) %>%
  # One header row per Subgroup.
  as_grouped_data(groups = "Subgroup") %>%
  flextable()

Among All Students

Academic Performance

# Academic-performance table for all students in data2: the statistics for
# vars1 joined (by variable name) to the "Mean" columns for vars1miss, which
# are relabelled as "Percent Missing".
# NOTE(review): desc_table() is defined elsewhere; the renames and patterns
# below assume its columns are named "<institution>_<statistic>" -- confirm.
# NOTE(review): vars1miss presumably lists the 0/1 missing-value flags, which
# is why their mean is reported as a percent missing -- confirm.
left_join(desc_table(data2, vars1, 1, TRUE),
          desc_table(data2, vars1miss, 0, TRUE) %>%
            select(name, contains("Mean")) %>%
            rename_with(~str_replace_all(., "_Mean", "_Percent Missing")),
          by = c("name")) %>%
  rename("Category" = name,
         "Measure" = group) %>%
  select(Measure,
         Category,
         # "ASU" and "SAU" are prefixes of "ASUMID" and "SAUTECH", so a plain
         # starts_with() would also pull in the other institution's columns
         # and the order listed here would not be enforced. Anchor the match
         # to the full institution name instead.
         matches("^ASU(_|$)"),
         starts_with("ASUMID"),
         starts_with("CCCUA"),
         starts_with("NWACC"),
         matches("^SAU(_|$)"),
         starts_with("SAUTECH"),
         starts_with("All")) %>%
  # One header row per Measure, then split "<institution>_<statistic>" column
  # names into a two-level header.
  as_grouped_data(groups = "Measure") %>%
  flextable() %>%
  separate_header()

Demographic Background

# Demographic table for all students in data2: the "Mean" columns for vars2
# stacked on top of the "Mean" columns for vars2miss.
# NOTE(review): desc_table() is defined elsewhere; the code below assumes its
# columns are named "<institution>_<statistic>" -- confirm.
bind_rows(desc_table(data2, vars2, 1, TRUE) %>%
            select(name, group, contains("Mean")),
          desc_table(data2, vars2miss, 0, TRUE) %>%
            select(name, group, contains("Mean"))) %>%
  rename("Category" = name,
         "Subgroup" = group) %>%
  # Only the mean is shown, so drop the suffix and leave the institution name.
  rename_with(~str_remove_all(., "_Mean")) %>%
  arrange(Subgroup) %>%
  select(Subgroup,
         Category,
         # "ASU" and "SAU" are prefixes of "ASUMID" and "SAUTECH", so a plain
         # starts_with() would also pull in the other institution's column
         # and the order listed here would not be enforced. Anchor the match
         # to the full institution name instead.
         matches("^ASU(_|$)"),
         starts_with("ASUMID"),
         starts_with("CCCUA"),
         starts_with("NWACC"),
         matches("^SAU(_|$)"),
         starts_with("SAUTECH"),
         starts_with("All")) %>%
  # One header row per Subgroup.
  as_grouped_data(groups = "Subgroup") %>%
  flextable()

Correlations

Correlation is a statistical measure that captures the extent to which two variables are linearly related. It is useful for describing simple relationships among our data, but does not tell us anything about how well these variables may predict success in college-level courses.

The correlation coefficient (which is reported in the figures below) quantifies the strength of the relationship between any two variables. The correlation coefficient ranges from -1 to 1, and the closer it is to zero, the weaker the relationship. A positive coefficient indicates a positive correlation, meaning both variables tend to increase together. Conversely, a negative coefficient indicates a negative correlation, where one variable tends to increase when the other variable decreases.

ACT & High School GPA

# Pearson correlations among the ACT columns and high school GPA.
# Keep the columns starting with "act", excluding those ending in "m0" or
# "miss", and label each one by its upper-cased name without the "act_" prefix.
# NOTE(review): the "m0"/"miss" columns are presumably the zero-imputed copies
# and missing-value flags -- confirm.
act <- data2 %>%
  select(starts_with("act") & !ends_with(c("m0", "miss"))) %>%
  rename_with(~toupper(str_replace_all(., "act_", "")))

act_matrix <- as.matrix(act)

# Append GPA as the last column so it appears in the correlation matrix.
act_gpa_matrix <- cbind(act_matrix, GPA = as_vector(data2$gpa_final))
act_gpa_corr   <- Hmisc::rcorr(x = act_gpa_matrix, type = "pearson")
# Suppress coefficients based on fewer than 10 paired observations.
act_gpa_corr$r[act_gpa_corr$n < 10] <- NA

# Upper triangle only, no diagonal, coefficients printed in each cell.
corrplot(act_gpa_corr$r,
         method = "circle",
         type   = "upper",
         diag   = FALSE,
         addCoef.col = "black",
         cl.pos = "n",
         tl.col = "black",
         tl.srt = 45,
         mar = c(0, 0, 5, 0),
         title = "Pearson Correlation for ACT tests and High School GPA")

Assessment Data from ADE & High School GPA

# Pearson correlations among the assessment columns and high school GPA.
# Keep the columns starting with "grade", excluding those ending in "m0" or
# "miss", then turn each name into a readable label: strip "grade" and
# "_score", upper-case the rest, expand the subject suffixes, and spell out
# the grade level.
# NOTE(review): the "m0"/"miss" columns are presumably the zero-imputed copies
# and missing-value flags -- confirm.
tests <- data2 %>%
  select(starts_with("grade") & !ends_with(c("m0", "miss"))) %>%
  rename_with(~toupper(str_replace_all(., "grade|_score", ""))) %>%
  rename_with(~str_replace_all(., "_M", " Math")) %>%
  rename_with(~str_replace_all(., "_S", " Science")) %>%
  rename_with(~str_replace_all(., "_RLA", " Reading")) %>%
  rename_with(~str_replace_all(., "9", "9th Grade")) %>%
  rename_with(~str_replace_all(., "10", "10th Grade"))

tests_matrix <- as.matrix(tests)

# Append GPA as the last column so it appears in the correlation matrix.
tests_gpa_matrix <- cbind(tests_matrix, GPA = as_vector(data2$gpa_final))
tests_gpa_corr   <- Hmisc::rcorr(x = tests_gpa_matrix, type = "pearson")
# Suppress coefficients based on fewer than 10 paired observations.
tests_gpa_corr$r[tests_gpa_corr$n < 10] <- NA

# Upper triangle only, no diagonal, coefficients printed in each cell.
corrplot(tests_gpa_corr$r,
         method = "circle",
         type   = "upper",
         diag   = FALSE,
         addCoef.col = "black",
         cl.pos = "n",
         tl.col = "black",
         tl.srt = 45,
         mar = c(0, 0, 5, 0),
         title = "Pearson Correlation for ADE tests and High School GPA")

Other Metrics & High School GPA

# Pearson correlations among class rank, attendance, and high school GPA.
# Labels are the upper-cased column names with underscores turned into spaces.
other <- data2 %>%
  select(class_rank, avg_days_present, avg_days_absent) %>%
  rename_with(~toupper(str_replace_all(., "_", " ")))

other_matrix <- as.matrix(other)

# Append GPA as the last column so it appears in the correlation matrix.
other_gpa_matrix <- cbind(other_matrix, GPA = as_vector(data2$gpa_final))
other_gpa_corr   <- Hmisc::rcorr(x = other_gpa_matrix, type = "pearson")
# Suppress coefficients based on fewer than 10 paired observations.
other_gpa_corr$r[other_gpa_corr$n < 10] <- NA

# Upper triangle only, no diagonal, coefficients printed in each cell.
corrplot(other_gpa_corr$r,
         method = "circle",
         type   = "upper",
         diag   = FALSE,
         addCoef.col = "black",
         cl.pos = "n",
         tl.col = "black",
         tl.srt = 45,
         mar = c(0, 0, 5, 0),
         title = "Pearson Correlation for Various High School Metrics and GPA")