COVID-19 Impact on NYC School Enrollment

Appendix to report

load packages

library(readr)
library(tidyr)
library(dplyr)


Attaching package: 'dplyr'

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

library(snakecase)
library(stringr)
library(lubridate)


Attaching package: 'lubridate'

The following objects are masked from 'package:base':

    date, intersect, setdiff, union

library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ forcats 1.0.0     ✔ purrr   1.0.2
✔ ggplot2 3.4.3     ✔ tibble  3.2.1

── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Data cleaning

# importing initial datasets 
nyc_schools_13_to_18 <- read_csv("2013_-_2018_Demographic_Snapshot_School_20231024.csv")

Rows: 8972 Columns: 39
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (4): DBN, School Name, Year, Economic Need Index
dbl (35): Total Enrollment, Grade PK (Half Day & Full Day), Grade K, Grade 1...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

nyc_schools_17_to_22 <- read_csv("2017-18__-_2021-22_Demographic_Snapshot_20231020.csv")

Rows: 9251 Columns: 44
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (6): DBN, School Name, Year, # Poverty, % Poverty, Economic Need Index
dbl (38): Total Enrollment, Grade 3K, Grade PK (Half Day & Full Day), Grade ...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# deleting all rows with the years "2017-18" in nyc_schools_13_to_18
# information is already contained in nyc_schools_17_to_22
nyc_schools_13_to_18_fix <- nyc_schools_13_to_18 |> 
  filter(Year != "2017-18")

In nyc_schools_13_to_18, “# Poverty” and “% Poverty” are both in double type. In nyc_schools_17_to_22, both are in character strings because there are schools with above 95% poverty, which is written as “Above 95%” in both the “# Poverty” and “% Poverty” and columns.

The chunk below turns the “# Poverty” and “% Poverty” columns in into character strings in nyc_schools_13_to_18 so the two dataset can be merged

nyc_schools_13_to_18_fix <- nyc_schools_13_to_18_fix |> 
  mutate(`% Poverty` = as.character(`% Poverty`), 
         `# Poverty` = as.character(`# Poverty`))

# creating a joined dataset 
schools <- 
  # joining the two data sets.
  bind_rows(nyc_schools_13_to_18_fix, nyc_schools_17_to_22) |> 
  # turn values in Year column from YYYY-YY to YYYY. E.g. 2013-24 to 2013 
  mutate("Year" = substr(Year, 1, 4)) |> 
  arrange(DBN)

# turning all column names to snakecase
colnames(schools) <- tolower(gsub(" ", "_", colnames(schools)))

# deleting  all '#' in column names to avoid confusing R
colnames(schools) <- gsub("#_", "", colnames(schools))

# turning all "%" in column names to "percent" to avoid confusing R
colnames(schools) <- gsub("%", "percent", colnames(schools))

# organizing columns so they make sense 
schools <- schools |> 
  # deleting grade_3k because most schools don't have information on it. 
  select(-grade_3k) |> 
  # putting all the race and ethnicity columns next to each other 
  select(1:`percent_white`, 
         `multi-racial`, 
         `percent_multi-racial`, 
         `native_american`, 
         `percent_native_american`, 
         `missing_race/ethnicity_data`, 
         `percent_missing_race/ethnicity_data`,           
         everything())

# cleaning column content structure and type 
schools <- schools |> 
  # turning all "No Data" to N/A in economic_need_index column 
  mutate(economic_need_index = ifelse(economic_need_index == "No Data", NA, 
                                      economic_need_index)) |> 
  # turning percentage and character values to numeric 
  mutate(`percent_poverty` = as.numeric(gsub("%", "", `percent_poverty`))) |> 
  mutate(`economic_need_index` = 
           as.numeric(gsub("%", "", `economic_need_index`))) |> 
  mutate(`poverty` = as.numeric(`poverty`)) |> 
  mutate(`year` = as.numeric(`year`))

Warning: There was 1 warning in `mutate()`.
ℹ In argument: `percent_poverty = as.numeric(gsub("%", "", percent_poverty))`.
Caused by warning:
! NAs introduced by coercion

Warning: There was 1 warning in `mutate()`.
ℹ In argument: `economic_need_index = as.numeric(gsub("%", "",
  economic_need_index))`.
Caused by warning:
! NAs introduced by coercion

Warning: There was 1 warning in `mutate()`.
ℹ In argument: `poverty = as.numeric(poverty)`.
Caused by warning:
! NAs introduced by coercion

# writing the final dataset into a csv file 
#write.csv(schools, "schools.csv", row.names=FALSE)

glimpse(schools)

Rows: 16,379
Columns: 45
$ dbn                                              <chr> "01M015", "01M015", "…
$ school_name                                      <chr> "P.S. 015 Roberto Cle…
$ year                                             <dbl> 2013, 2014, 2015, 201…
$ total_enrollment                                 <dbl> 190, 183, 176, 178, 1…
$ `grade_pk_(half_day_&_full_day)`                 <dbl> 26, 18, 14, 17, 17, 1…
$ grade_k                                          <dbl> 39, 27, 32, 28, 28, 2…
$ grade_1                                          <dbl> 39, 47, 33, 33, 32, 3…
$ grade_2                                          <dbl> 21, 31, 39, 27, 33, 3…
$ grade_3                                          <dbl> 16, 19, 23, 31, 23, 3…
$ grade_4                                          <dbl> 26, 17, 17, 24, 31, 2…
$ grade_5                                          <dbl> 23, 24, 18, 18, 26, 2…
$ grade_6                                          <dbl> 0, 0, 0, 0, 0, 0, 0, …
$ grade_7                                          <dbl> 0, 0, 0, 0, 0, 0, 0, …
$ grade_8                                          <dbl> 0, 0, 0, 0, 0, 0, 0, …
$ grade_9                                          <dbl> 0, 0, 0, 0, 0, 0, 0, …
$ grade_10                                         <dbl> 0, 0, 0, 0, 0, 0, 0, …
$ grade_11                                         <dbl> 0, 0, 0, 0, 0, 0, 0, …
$ grade_12                                         <dbl> 0, 0, 0, 0, 0, 0, 0, …
$ female                                           <dbl> 93, 84, 83, 83, 99, 8…
$ percent_female                                   <dbl> 48.900, 45.900, 47.20…
$ male                                             <dbl> 97, 99, 93, 95, 91, 8…
$ percent_male                                     <dbl> 51.100, 54.100, 52.80…
$ asian                                            <dbl> 9, 8, 9, 14, 20, 24, …
$ percent_asian                                    <dbl> 4.700, 4.400, 5.100, …
$ black                                            <dbl> 72, 65, 57, 51, 52, 4…
$ percent_black                                    <dbl> 37.900, 35.500, 32.40…
$ hispanic                                         <dbl> 104, 107, 105, 105, 1…
$ percent_hispanic                                 <dbl> 54.700, 58.500, 59.70…
$ multiple_race_categories_not_represented         <dbl> 2, 1, 3, 4, NA, NA, N…
$ percent_multiple_race_categories_not_represented <dbl> 1.1, 0.5, 1.7, 2.2, N…
$ white                                            <dbl> 3, 2, 2, 4, 6, 6, 9, …
$ percent_white                                    <dbl> 1.600, 1.100, 1.100, …
$ `multi-racial`                                   <dbl> NA, NA, NA, NA, 1, 0,…
$ `percent_multi-racial`                           <dbl> NA, NA, NA, NA, 0.005…
$ native_american                                  <dbl> NA, NA, NA, NA, 1, 1,…
$ percent_native_american                          <dbl> NA, NA, NA, NA, 0.005…
$ `missing_race/ethnicity_data`                    <dbl> NA, NA, NA, NA, 0, 0,…
$ `percent_missing_race/ethnicity_data`            <dbl> NA, NA, NA, NA, 0.000…
$ students_with_disabilities                       <dbl> 65, 64, 60, 51, 49, 3…
$ percent_students_with_disabilities               <dbl> 34.200, 35.000, 34.10…
$ english_language_learners                        <dbl> 19, 17, 16, 12, 8, 8,…
$ percent_english_language_learners                <dbl> 10.000, 9.300, 9.100,…
$ poverty                                          <dbl> 171, 169, 149, 152, 1…
$ percent_poverty                                  <dbl> 90.0, 92.3, 84.7, 85.…
$ economic_need_index                              <dbl> NA, 93.5, 89.6, 89.2,…

Other appendicies (as necessary)