Descriptive Statistics for Key Variables

This section presents tables summarizing important variables in our data. The variables for academic performance will be used to predict success in college-level courses and the demographic variables will be used to examine how our predictive models perform for different groups of students.

There are three sets of tables. The first set uses the sample of students who enrolled in college-level English. The second set uses the sample of students who enrolled in college-level math. These two samples are the ones used for the predictive analyses. The third set uses all students in the data; this full sample was not used in the predictive analyses.

A Note on Missing Data

In general, missing values were imputed with a value of zero, and an additional 0/1 binary variable was created, which flagged cases that were imputed. For example, if student A had no missing values, but student B had a missing value for high school GPA, the imputed GPA variable would be 0 and the binary flag would be 1 for student B. When including variables in the predictive models, both the imputed variable and the accompanying binary flag were included in the models.

The variables that capture high school course-taking behavior do not contain any missing values, because a value of 0 captures both students who did not take the course and students who did not pass the course. In most cases, the high school courses identified in the data were required math courses.

Among Students who Enrolled in College-Level English

Academic Performance
Demographic Background

# College-level English sample: summary statistics for the `vars1` variables
# (presumably the "Academic Performance" table -- confirm against vars1),
# with one "Percent Missing" column per site taken from the `vars1miss`
# variables.
left_join(
  desc_table(data_eng_cl, vars1, 1, TRUE),
  desc_table(data_eng_cl, vars1miss, 0, TRUE) %>%
    select(name, contains("Mean")) %>%
    # relabel the mean of each missing-data variable as "Percent Missing"
    rename_with(~ str_replace_all(., "_Mean", "_Percent Missing")),
  by = c("name")
) %>%
  rename("Category" = name,
         "Measure" = group) %>%
  # Order the columns site by site so that each site's "Percent Missing"
  # column sits next to its other statistics. The ASU and SAU prefixes are
  # anchored with a regex: starts_with("ASU") would also grab the ASUMID
  # columns (and starts_with("SAU") the SAUTECH ones), which interleaves the
  # two sites because the joined columns are appended after the originals.
  select(Measure,
         Category,
         matches("^ASU(_|$)"),
         starts_with("ASUMID"),
         starts_with("CCCUA"),
         starts_with("NWACC"),
         matches("^SAU(_|$)"),
         starts_with("SAUTECH"),
         starts_with("All")) %>%
  as_grouped_data(groups = "Measure") %>%
  flextable() %>%
  # split "<site>_<statistic>" column names into a two-row header
  separate_header()

# College-level English sample: per-site means for the `vars2` variables
# (presumably the "Demographic Background" table -- confirm against vars2),
# stacked with the means of the `vars2miss` variables.
bind_rows(
  desc_table(data_eng_cl, vars2, 1, TRUE) %>%
    select(name, group, contains("Mean")),
  desc_table(data_eng_cl, vars2miss, 0, TRUE) %>%
    select(name, group, contains("Mean"))
) %>%
  rename("Category" = name,
         "Subgroup" = group) %>%
  # only means are shown, so drop the "_Mean" suffix and leave the site name
  rename_with(~ str_remove_all(., "_Mean")) %>%
  arrange(Subgroup) %>%
  # Put the site columns in a fixed order. The ASU and SAU prefixes are
  # anchored with a regex: starts_with("ASU") would also match ASUMID (and
  # starts_with("SAU") would match SAUTECH), leaving the relative order of
  # those sites to whatever order desc_table() happened to return.
  select(Subgroup,
         Category,
         matches("^ASU(_|$)"),
         starts_with("ASUMID"),
         starts_with("CCCUA"),
         starts_with("NWACC"),
         matches("^SAU(_|$)"),
         starts_with("SAUTECH"),
         starts_with("All")) %>%
  as_grouped_data(groups = "Subgroup") %>%
  flextable()

Among Students who Enrolled in College-Level Math

Academic Performance
Demographic Background

# College-level math sample: summary statistics for the `vars1` variables
# (presumably the "Academic Performance" table -- confirm against vars1),
# with one "Percent Missing" column per site taken from the `vars1miss`
# variables.
left_join(
  desc_table(data_math_cl, vars1, 1, TRUE),
  desc_table(data_math_cl, vars1miss, 0, TRUE) %>%
    select(name, contains("Mean")) %>%
    # relabel the mean of each missing-data variable as "Percent Missing"
    rename_with(~ str_replace_all(., "_Mean", "_Percent Missing")),
  by = c("name")
) %>%
  rename("Category" = name,
         "Measure" = group) %>%
  # Order the columns site by site so that each site's "Percent Missing"
  # column sits next to its other statistics. The ASU and SAU prefixes are
  # anchored with a regex: starts_with("ASU") would also grab the ASUMID
  # columns (and starts_with("SAU") the SAUTECH ones), which interleaves the
  # two sites because the joined columns are appended after the originals.
  select(Measure,
         Category,
         matches("^ASU(_|$)"),
         starts_with("ASUMID"),
         starts_with("CCCUA"),
         starts_with("NWACC"),
         matches("^SAU(_|$)"),
         starts_with("SAUTECH"),
         starts_with("All")) %>%
  as_grouped_data(groups = "Measure") %>%
  flextable() %>%
  # split "<site>_<statistic>" column names into a two-row header
  separate_header()

# College-level math sample: per-site means for the `vars2` variables
# (presumably the "Demographic Background" table -- confirm against vars2),
# stacked with the means of the `vars2miss` variables.
bind_rows(
  desc_table(data_math_cl, vars2, 1, TRUE) %>%
    select(name, group, contains("Mean")),
  desc_table(data_math_cl, vars2miss, 0, TRUE) %>%
    select(name, group, contains("Mean"))
) %>%
  rename("Category" = name,
         "Subgroup" = group) %>%
  # only means are shown, so drop the "_Mean" suffix and leave the site name
  rename_with(~ str_remove_all(., "_Mean")) %>%
  arrange(Subgroup) %>%
  # Put the site columns in a fixed order. The ASU and SAU prefixes are
  # anchored with a regex: starts_with("ASU") would also match ASUMID (and
  # starts_with("SAU") would match SAUTECH), leaving the relative order of
  # those sites to whatever order desc_table() happened to return.
  select(Subgroup,
         Category,
         matches("^ASU(_|$)"),
         starts_with("ASUMID"),
         starts_with("CCCUA"),
         starts_with("NWACC"),
         matches("^SAU(_|$)"),
         starts_with("SAUTECH"),
         starts_with("All")) %>%
  as_grouped_data(groups = "Subgroup") %>%
  flextable()

# `data2` sample (presumably all students, i.e. the third set of tables
# described in the introduction -- confirm): summary statistics for the
# `vars1` variables, with one "Percent Missing" column per site taken from
# the `vars1miss` variables.
left_join(
  desc_table(data2, vars1, 1, TRUE),
  desc_table(data2, vars1miss, 0, TRUE) %>%
    select(name, contains("Mean")) %>%
    # relabel the mean of each missing-data variable as "Percent Missing"
    rename_with(~ str_replace_all(., "_Mean", "_Percent Missing")),
  by = c("name")
) %>%
  rename("Category" = name,
         "Measure" = group) %>%
  # Order the columns site by site so that each site's "Percent Missing"
  # column sits next to its other statistics. The ASU and SAU prefixes are
  # anchored with a regex: starts_with("ASU") would also grab the ASUMID
  # columns (and starts_with("SAU") the SAUTECH ones), which interleaves the
  # two sites because the joined columns are appended after the originals.
  select(Measure,
         Category,
         matches("^ASU(_|$)"),
         starts_with("ASUMID"),
         starts_with("CCCUA"),
         starts_with("NWACC"),
         matches("^SAU(_|$)"),
         starts_with("SAUTECH"),
         starts_with("All")) %>%
  as_grouped_data(groups = "Measure") %>%
  flextable() %>%
  # split "<site>_<statistic>" column names into a two-row header
  separate_header()

# `data2` sample (presumably all students, i.e. the third set of tables
# described in the introduction -- confirm): per-site means for the `vars2`
# variables, stacked with the means of the `vars2miss` variables.
bind_rows(
  desc_table(data2, vars2, 1, TRUE) %>%
    select(name, group, contains("Mean")),
  desc_table(data2, vars2miss, 0, TRUE) %>%
    select(name, group, contains("Mean"))
) %>%
  rename("Category" = name,
         "Subgroup" = group) %>%
  # only means are shown, so drop the "_Mean" suffix and leave the site name
  rename_with(~ str_remove_all(., "_Mean")) %>%
  arrange(Subgroup) %>%
  # Put the site columns in a fixed order. The ASU and SAU prefixes are
  # anchored with a regex: starts_with("ASU") would also match ASUMID (and
  # starts_with("SAU") would match SAUTECH), leaving the relative order of
  # those sites to whatever order desc_table() happened to return.
  select(Subgroup,
         Category,
         matches("^ASU(_|$)"),
         starts_with("ASUMID"),
         starts_with("CCCUA"),
         starts_with("NWACC"),
         matches("^SAU(_|$)"),
         starts_with("SAUTECH"),
         starts_with("All")) %>%
  as_grouped_data(groups = "Subgroup") %>%
  flextable()

Trends Over Time

The graph below summarizes enrollment and completion for college-level math and English courses. Please note that the percentages were calculated using the full sample of students across the entire time period as the denominator. Because that denominator is larger than the number of students present in any single year, the enrollment and completion rates for each year would be higher than they appear in the graph.

# Enrollment / completion trend graph: reshape the term-level course
# variables to long form, decode each variable name into a term date, an
# outcome type and a subject, keep fall terms only, and attach a per-group
# rate before handing the data to data_graph().
data2 %>%
  # identifiers plus the variables whose names contain "tvcrat" or "tvcret",
  # end in "c", and do not contain "oth" ...
  select(
    site,
    sample_id,
    contains(c("tvcrat", "tvcret")) & ends_with("c") & !contains("oth")
  ) %>%
  # ... minus any of those whose name starts with "c"
  select(-starts_with("c")) %>%

  # wide -> long: one row per student and course variable
  pivot_longer(cols = -c(site, sample_id)) %>%

  # decode the term from the variable name: the first letter gives the month
  # used for the term date (f = fall; w/s/u are presumably winter, spring and
  # summer) and characters 2-3 hold the two-digit year
  mutate(
    Mon  = unname(c(w = 1, s = 2, u = 6, f = 8)[str_sub(name, 1, 1)]),
    Year = 2000 + as.numeric(str_sub(name, 2, 3)),
    Term = as.Date(paste(Year, Mon, "1", sep = "-"), format = "%Y-%m-%d")
  ) %>%

  # decode the outcome ("crat" = enrolled, "cret" = completed) and the subject
  mutate(
    Type = if_else(str_extract(name, "cr[ae]t") == "cret", "Completed", "Enrolled"),
    Subj = if_else(str_extract(name, "math|eng") == "eng", "English", "Math"),
    Type_fct = factor(Type, levels = c("Enrolled", "Completed")),
    Subj_fct = factor(Subj, levels = c("English", "Math"))
  ) %>%

  # fall semesters only
  filter(str_sub(name, 1, 1) == "f") %>%

  # rate within each site x term x outcome x subject cell; every row of a
  # cell carries the same Percent value because this is mutate(), not
  # summarise(), and the data stay grouped when passed on
  group_by(site, Term, Type, Subj) %>%
  mutate(Percent = sum(value) / n()) %>%

  data_graph() + ylim(c(0, 50))

Correlations

Correlation is a statistical measure that captures the extent to which two variables are linearly related. It is useful for describing simple relationships among our data, but does not tell us anything about how well these variables may predict success in college-level courses.

The correlation coefficient (which is reported in the figures below) quantifies the strength of the relationship between any two variables. The correlation coefficient ranges from -1 to 1, and the closer it is to zero, the weaker the relationship. A positive coefficient indicates a positive correlation, meaning both variables tend to increase together. Conversely, a negative coefficient indicates a negative correlation, where one variable tends to increase when the other variable decreases.

# Pearson correlations among the ACT variables and high school GPA.

# ACT columns, excluding those whose names end in "m0" or "miss"; the "act_"
# prefix is stripped and the remainder upper-cased for the plot labels.
act <- data2 %>%
  select(starts_with("act") & !ends_with(c("m0", "miss"))) %>%
  rename_with(~ toupper(str_replace_all(., "act_", "")))

act_matrix <- as.matrix(act)

# append GPA as the last column so it appears alongside the ACT variables
act_gpa_matrix <- cbind(act_matrix, GPA = as_vector(data2$gpa_final))
act_gpa_corr   <- Hmisc::rcorr(x = act_gpa_matrix, type = "pearson")

# blank out coefficients based on fewer than 10 paired observations
act_gpa_corr$r[act_gpa_corr$n < 10] <- NA

corrplot(act_gpa_corr$r,
         method = "circle",
         type   = "upper",
         diag   = FALSE,
         addCoef.col = "black",
         cl.pos = "n",
         tl.col = "black",
         tl.srt = 45,
         mar = c(0, 0, 5, 0),
         title = "Pearson Correlation for ACT tests and High School GPA")

# Pearson correlations among the "grade*" test-score variables and high
# school GPA.

# Test-score columns, excluding those whose names end in "m0" or "miss".
# The names are turned into plot labels step by step: drop "grade" and
# "_score", upper-case, expand the subject suffixes, then expand the grade
# numbers. The order of the replacements matters, so it is kept as is.
tests <- data2 %>%
  select(starts_with("grade") & !ends_with(c("m0", "miss"))) %>%
  rename_with(~ toupper(str_replace_all(., "grade|_score", ""))) %>%
  rename_with(~ str_replace_all(., "_M", " Math")) %>%
  rename_with(~ str_replace_all(., "_S", " Science")) %>%
  rename_with(~ str_replace_all(., "_RLA", " Reading")) %>%
  rename_with(~ str_replace_all(., "9", "9th Grade")) %>%
  rename_with(~ str_replace_all(., "10", "10th Grade"))

tests_matrix <- as.matrix(tests)

# append GPA as the last column so it appears alongside the test scores
tests_gpa_matrix <- cbind(tests_matrix, GPA = as_vector(data2$gpa_final))
tests_gpa_corr   <- Hmisc::rcorr(x = tests_gpa_matrix, type = "pearson")

# blank out coefficients based on fewer than 10 paired observations
tests_gpa_corr$r[tests_gpa_corr$n < 10] <- NA

corrplot(tests_gpa_corr$r,
         method = "circle",
         type   = "upper",
         diag   = FALSE,
         addCoef.col = "black",
         cl.pos = "n",
         tl.col = "black",
         tl.srt = 45,
         mar = c(0, 0, 5, 0),
         title = "Pearson Correlation for ADE tests and High School GPA")

# Pearson correlations among class rank, attendance measures and high school
# GPA.

# Underscores become spaces and the names are upper-cased for the plot labels.
other <- data2 %>%
  select(class_rank, avg_days_present, avg_days_absent) %>%
  rename_with(~ toupper(str_replace_all(., "_", " ")))

other_matrix <- as.matrix(other)

# append GPA as the last column so it appears alongside the other metrics
other_gpa_matrix <- cbind(other_matrix, GPA = as_vector(data2$gpa_final))
other_gpa_corr   <- Hmisc::rcorr(x = other_gpa_matrix, type = "pearson")

# blank out coefficients based on fewer than 10 paired observations
other_gpa_corr$r[other_gpa_corr$n < 10] <- NA

corrplot(other_gpa_corr$r,
         method = "circle",
         type   = "upper",
         diag   = FALSE,
         addCoef.col = "black",
         cl.pos = "n",
         tl.col = "black",
         tl.srt = 45,
         mar = c(0, 0, 5, 0),
         title = "Pearson Correlation for Various High School Metrics and GPA")

CAPR MMA Descriptive Analyses - Arkansas

Dorota Rizik & Dan Cullinan

19 September, 2022